-rw-r--r--  Documentation/ABI/stable/sysfs-bus-mhi | 10
-rw-r--r--  Documentation/ABI/stable/sysfs-class-infiniband | 64
-rw-r--r--  Documentation/ABI/stable/sysfs-class-tpm | 2
-rw-r--r--  Documentation/ABI/stable/sysfs-devices | 7
-rw-r--r--  Documentation/ABI/stable/sysfs-devices-node | 45
-rw-r--r--  Documentation/ABI/stable/sysfs-devices-system-cpu | 19
-rw-r--r--  Documentation/ABI/stable/sysfs-driver-dma-idxd | 150
-rw-r--r--  Documentation/ABI/stable/sysfs-driver-firmware-zynqmp | 141
-rw-r--r--  Documentation/ABI/stable/sysfs-driver-mlxreg-io | 475
-rw-r--r--  Documentation/ABI/stable/sysfs-driver-speakup | 9
-rw-r--r--  Documentation/ABI/stable/sysfs-hypervisor-xen | 13
-rw-r--r--  Documentation/ABI/stable/sysfs-module | 25
-rw-r--r--  Documentation/ABI/testing/configfs-usb-gadget | 13
-rw-r--r--  Documentation/ABI/testing/configfs-usb-gadget-mass-storage | 10
-rw-r--r--  Documentation/ABI/testing/configfs-usb-gadget-uac1 | 43
-rw-r--r--  Documentation/ABI/testing/configfs-usb-gadget-uac2 | 48
-rw-r--r--  Documentation/ABI/testing/configfs-usb-gadget-uvc | 66
-rw-r--r--  Documentation/ABI/testing/debugfs-cros-ec | 22
-rw-r--r--  Documentation/ABI/testing/debugfs-cxl | 35
-rw-r--r--  Documentation/ABI/testing/debugfs-dell-wmi-ddv | 21
-rw-r--r--  Documentation/ABI/testing/debugfs-driver-dcc | 127
-rw-r--r--  Documentation/ABI/testing/debugfs-driver-habanalabs | 112
-rw-r--r--  Documentation/ABI/testing/debugfs-hisi-hpre | 192
-rw-r--r--  Documentation/ABI/testing/debugfs-hisi-sec | 160
-rw-r--r--  Documentation/ABI/testing/debugfs-hisi-zip | 160
-rw-r--r--  Documentation/ABI/testing/debugfs-scmi | 70
-rw-r--r--  Documentation/ABI/testing/debugfs-scmi-raw | 117
-rw-r--r--  Documentation/ABI/testing/evm | 5
-rw-r--r--  Documentation/ABI/testing/ima_policy | 57
-rw-r--r--  Documentation/ABI/testing/procfs-smaps_rollup | 1
-rw-r--r--  Documentation/ABI/testing/pstore | 3
-rw-r--r--  Documentation/ABI/testing/securityfs-secrets-coco | 51
-rw-r--r--  Documentation/ABI/testing/sysfs-amd-pmc | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-amd-pmf | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-ata | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-block | 330
-rw-r--r--  Documentation/ABI/testing/sysfs-block-zram | 14
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-bcma | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-cdx | 56
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coreboot | 45
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-etm3x | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-coresight-devices-ultra_smb | 31
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-counter | 105
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-css | 15
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-cxl | 376
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-event_source-devices-caps | 18
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu | 37
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-fcoe | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-fsi-devices-sbefifo | 10
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio | 220
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-adc-ad4130 | 46
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-adc-ad7280a | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-adc-max11410 | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-bno055 | 81
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746 | 11
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-chemical-sunrise-co2 | 38
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-dac-ltc2688 | 86
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-filter-admv8818 | 16
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1013 | 38
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1014 | 23
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-proximity | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-scd30 | 34
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-sx9324 | 29
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856 | 31
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-thermocouple | 18
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-iio-vf610 | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-mdio | 9
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-nvdimm | 49
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-papr-pmem | 12
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-pci | 78
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-pci-drivers-xhci_hcd | 52
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-peci | 16
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-platform | 12
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro | 325
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-platform-devices-occ-hwmon | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-rapidio | 32
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-soundwire-master | 20
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-soundwire-slave | 62
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor | 6
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-surface_aggregator-tabletsw | 57
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-thunderbolt | 14
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-usb | 340
-rw-r--r--  Documentation/ABI/testing/sysfs-bus-vdpa | 57
-rw-r--r--  Documentation/ABI/testing/sysfs-class-bdi | 96
-rw-r--r--  Documentation/ABI/testing/sysfs-class-cxl | 19
-rw-r--r--  Documentation/ABI/testing/sysfs-class-devfreq-event | 12
-rw-r--r--  Documentation/ABI/testing/sysfs-class-extcon | 12
-rw-r--r--  Documentation/ABI/testing/sysfs-class-fc | 27
-rw-r--r--  Documentation/ABI/testing/sysfs-class-firmware | 77
-rw-r--r--  Documentation/ABI/testing/sysfs-class-firmware-attributes | 87
-rw-r--r--  Documentation/ABI/testing/sysfs-class-gnss | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-class-hwmon | 958
-rw-r--r--  Documentation/ABI/testing/sysfs-class-mei | 18
-rw-r--r--  Documentation/ABI/testing/sysfs-class-mic | 24
-rw-r--r--  Documentation/ABI/testing/sysfs-class-mux | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-class-net-peak_usb | 19
-rw-r--r--  Documentation/ABI/testing/sysfs-class-power | 46
-rw-r--r--  Documentation/ABI/testing/sysfs-class-power-rt9467 | 19
-rw-r--r--  Documentation/ABI/testing/sysfs-class-power-rt9471 | 32
-rw-r--r--  Documentation/ABI/testing/sysfs-class-pwm | 22
-rw-r--r--  Documentation/ABI/testing/sysfs-class-rapidio | 4
-rw-r--r--  Documentation/ABI/testing/sysfs-class-rc | 14
-rw-r--r--  Documentation/ABI/testing/sysfs-class-rc-nuvoton | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-class-regulator | 81
-rw-r--r--  Documentation/ABI/testing/sysfs-class-rtrs-client | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-class-rtrs-server | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-class-thermal | 259
-rw-r--r--  Documentation/ABI/testing/sysfs-class-typec | 10
-rw-r--r--  Documentation/ABI/testing/sysfs-class-usb_power_delivery | 249
-rw-r--r--  Documentation/ABI/testing/sysfs-class-uwb_rc | 26
-rw-r--r--  Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc | 10
-rw-r--r--  Documentation/ABI/testing/sysfs-class-vduse | 33
-rw-r--r--  Documentation/ABI/testing/sysfs-class-watchdog | 13
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-hisi_ptt | 61
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-mapping | 30
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-physical_location | 42
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-platform-ACPI-TAD | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-platform-dock | 10
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-platform-soc-ipa | 62
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-power | 36
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-removable | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-soc | 14
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-state_synced | 5
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-system-cpu | 98
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-vfio-dev | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing | 27
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-bd9571mwv-regulator | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-ccp | 87
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-chromeos-acpi | 137
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-eud | 9
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-habanalabs | 40
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon | 77
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-intel-m10-bmc | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update | 61
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-intel_sdsi | 90
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-qat | 49
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-typec-displayport | 15
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-uacce | 18
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-ufs | 192
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-xen-blkback | 6
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-xen-blkfront | 4
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-xilinx-tmr-manager | 16
-rw-r--r--  Documentation/ABI/testing/sysfs-driver-zynqmp-fpga | 73
-rw-r--r--  Documentation/ABI/testing/sysfs-firmware-efi-esrt | 16
-rw-r--r--  Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info | 29
-rw-r--r--  Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg | 5
-rw-r--r--  Documentation/ABI/testing/sysfs-fs-erofs | 18
-rw-r--r--  Documentation/ABI/testing/sysfs-fs-f2fs | 274
-rw-r--r--  Documentation/ABI/testing/sysfs-fs-ubifs | 35
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-address_bits | 10
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-cpu_byteorder | 12
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-iommu_groups | 1
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-livepatch | 8
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-damon | 346
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-ksm | 10
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers | 25
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-oops_count | 6
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-slab | 119
-rw-r--r--  Documentation/ABI/testing/sysfs-kernel-warn_count | 6
-rw-r--r--  Documentation/ABI/testing/sysfs-mce | 97
-rw-r--r--  Documentation/ABI/testing/sysfs-module | 7
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-asus-wmi | 41
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-brcmstb-memc | 15
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-chipidea-usb2 | 6
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-dell-privacy-wmi | 60
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv | 7
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-dptf | 4
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-intel-ifs | 52
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-intel-pmc | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-lg-laptop | 1
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-mellanox-bootctl | 7
-rw-r--r--  Documentation/ABI/testing/sysfs-platform-sst-atom | 2
-rw-r--r--  Documentation/ABI/testing/sysfs-power | 31
-rw-r--r--  Documentation/ABI/testing/sysfs-ptp | 30
-rw-r--r--  Documentation/ABI/testing/sysfs-secvar | 75
-rw-r--r--  Documentation/ABI/testing/sysfs-timecard | 288
-rw-r--r--  Documentation/ABI/testing/sysfs-tty | 32
-rw-r--r--  Documentation/COPYING-logo | 13
-rw-r--r--  Documentation/Kconfig | 33
-rw-r--r--  Documentation/Makefile | 26
-rw-r--r--  Documentation/PCI/endpoint/index.rst | 2
-rw-r--r--  Documentation/PCI/endpoint/pci-vntb-function.rst | 129
-rw-r--r--  Documentation/PCI/endpoint/pci-vntb-howto.rst | 167
-rw-r--r--  Documentation/PCI/index.rst | 6
-rw-r--r--  Documentation/PCI/msi-howto.rst | 10
-rw-r--r--  Documentation/PCI/pci-error-recovery.rst | 8
-rw-r--r--  Documentation/PCI/pci-iov-howto.rst | 7
-rw-r--r--  Documentation/PCI/pci.rst | 16
-rw-r--r--  Documentation/PCI/sysfs-pci.rst | 2
-rw-r--r--  Documentation/RCU/Design/Data-Structures/Data-Structures.rst | 2
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst | 8
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel0.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel1.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel2.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel3.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel4.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel5.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel6.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel7.svg | 4
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Funnel8.svg | 4
-rw-r--r--  Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst | 71
-rw-r--r--  Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg | 36
-rw-r--r--  Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg | 62
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.rst | 48
-rw-r--r--  Documentation/RCU/NMI-RCU.rst | 4
-rw-r--r--  Documentation/RCU/RTFP.txt | 10
-rw-r--r--  Documentation/RCU/UP.rst | 17
-rw-r--r--  Documentation/RCU/arrayRCU.rst | 165
-rw-r--r--  Documentation/RCU/checklist.rst | 258
-rw-r--r--  Documentation/RCU/index.rst | 1
-rw-r--r--  Documentation/RCU/listRCU.rst | 174
-rw-r--r--  Documentation/RCU/lockdep.rst | 19
-rw-r--r--  Documentation/RCU/rcu.rst | 19
-rw-r--r--  Documentation/RCU/rcu_dereference.rst | 35
-rw-r--r--  Documentation/RCU/rcubarrier.rst | 357
-rw-r--r--  Documentation/RCU/rculist_nulls.rst | 111
-rw-r--r--  Documentation/RCU/stallwarn.rst | 176
-rw-r--r--  Documentation/RCU/torture.rst | 91
-rw-r--r--  Documentation/RCU/whatisRCU.rst | 334
-rw-r--r--  Documentation/accel/index.rst | 18
-rw-r--r--  Documentation/accel/introduction.rst | 110
-rw-r--r--  Documentation/accel/qaic/aic100.rst | 510
-rw-r--r--  Documentation/accel/qaic/index.rst | 13
-rw-r--r--  Documentation/accel/qaic/qaic.rst | 170
-rw-r--r--  Documentation/accounting/delay-accounting.rst | 71
-rw-r--r--  Documentation/accounting/psi.rst | 16
-rw-r--r--  Documentation/admin-guide/README.rst | 121
-rw-r--r--  Documentation/admin-guide/acpi/cppc_sysfs.rst | 2
-rw-r--r--  Documentation/admin-guide/acpi/dsdt-override.rst | 13
-rw-r--r--  Documentation/admin-guide/acpi/fan_performance_states.rst | 28
-rw-r--r--  Documentation/admin-guide/acpi/index.rst | 1
-rw-r--r--  Documentation/admin-guide/bcache.rst | 2
-rw-r--r--  Documentation/admin-guide/blockdev/drbd/figures.rst | 4
-rw-r--r--  Documentation/admin-guide/blockdev/drbd/node-states-8.dot | 13
-rw-r--r--  Documentation/admin-guide/blockdev/drbd/peer-states-8.dot | 8
-rw-r--r--  Documentation/admin-guide/blockdev/index.rst | 6
-rw-r--r--  Documentation/admin-guide/blockdev/nbd.rst | 2
-rw-r--r--  Documentation/admin-guide/blockdev/paride.rst | 388
-rw-r--r--  Documentation/admin-guide/blockdev/zram.rst | 133
-rw-r--r--  Documentation/admin-guide/bootconfig.rst | 37
-rw-r--r--  Documentation/admin-guide/cgroup-v1/blkio-controller.rst | 2
-rw-r--r--  Documentation/admin-guide/cgroup-v1/cgroups.rst | 2
-rw-r--r--  Documentation/admin-guide/cgroup-v1/cpusets.rst | 2
-rw-r--r--  Documentation/admin-guide/cgroup-v1/hugetlb.rst | 4
-rw-r--r--  Documentation/admin-guide/cgroup-v1/memcg_test.rst | 2
-rw-r--r--  Documentation/admin-guide/cgroup-v1/memory.rst | 313
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 325
-rw-r--r--  Documentation/admin-guide/cifs/usage.rst | 13
-rw-r--r--  Documentation/admin-guide/cputopology.rst | 29
-rw-r--r--  Documentation/admin-guide/device-mapper/cache-policies.rst | 2
-rw-r--r--  Documentation/admin-guide/device-mapper/dm-ebs.rst | 2
-rw-r--r--  Documentation/admin-guide/device-mapper/dm-flakey.rst | 4
-rw-r--r--  Documentation/admin-guide/device-mapper/dm-init.rst | 8
-rw-r--r--  Documentation/admin-guide/device-mapper/dm-zoned.rst | 2
-rw-r--r--  Documentation/admin-guide/device-mapper/unstriped.rst | 10
-rw-r--r--  Documentation/admin-guide/device-mapper/verity.rst | 4
-rw-r--r--  Documentation/admin-guide/device-mapper/writecache.rst | 18
-rw-r--r--  Documentation/admin-guide/devices.rst | 7
-rw-r--r--  Documentation/admin-guide/devices.txt | 15
-rw-r--r--  Documentation/admin-guide/dynamic-debug-howto.rst | 263
-rw-r--r--  Documentation/admin-guide/efi-stub.rst | 4
-rw-r--r--  Documentation/admin-guide/ext4.rst | 3
-rw-r--r--  Documentation/admin-guide/filesystem-monitoring.rst | 78
-rw-r--r--  Documentation/admin-guide/gpio/gpio-sim.rst | 134
-rw-r--r--  Documentation/admin-guide/gpio/index.rst | 1
-rw-r--r--  Documentation/admin-guide/gpio/sysfs.rst | 2
-rw-r--r--  Documentation/admin-guide/hw-vuln/core-scheduling.rst | 5
-rw-r--r--  Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst | 91
-rw-r--r--  Documentation/admin-guide/hw-vuln/index.rst | 2
-rw-r--r--  Documentation/admin-guide/hw-vuln/mds.rst | 6
-rw-r--r--  Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst | 260
-rw-r--r--  Documentation/admin-guide/hw-vuln/spectre.rst | 141
-rw-r--r--  Documentation/admin-guide/hw-vuln/tsx_async_abort.rst | 2
-rw-r--r--  Documentation/admin-guide/hw_random.rst | 6
-rw-r--r--  Documentation/admin-guide/index.rst | 16
-rw-r--r--  Documentation/admin-guide/iostats.rst | 6
-rw-r--r--  Documentation/admin-guide/kdump/gdbmacros.txt | 2
-rw-r--r--  Documentation/admin-guide/kdump/kdump.rst | 10
-rw-r--r--  Documentation/admin-guide/kdump/vmcoreinfo.rst | 45
-rw-r--r--  Documentation/admin-guide/kernel-parameters.rst | 21
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 1834
-rw-r--r--  Documentation/admin-guide/kernel-per-CPU-kthreads.rst | 4
-rw-r--r--  Documentation/admin-guide/laptops/lg-laptop.rst | 2
-rw-r--r--  Documentation/admin-guide/laptops/thinkpad-acpi.rst | 14
-rw-r--r--  Documentation/admin-guide/md.rst | 2
-rw-r--r--  Documentation/admin-guide/media/bttv.rst | 2
-rw-r--r--  Documentation/admin-guide/media/building.rst | 2
-rw-r--r--  Documentation/admin-guide/media/cec-drivers.rst | 10
-rw-r--r--  Documentation/admin-guide/media/cec.rst | 380
-rw-r--r--  Documentation/admin-guide/media/cpia2.rst | 145
-rw-r--r--  Documentation/admin-guide/media/davinci-vpbe.rst | 65
-rw-r--r--  Documentation/admin-guide/media/dvb-drivers.rst | 1
-rw-r--r--  Documentation/admin-guide/media/fimc.rst | 2
-rw-r--r--  Documentation/admin-guide/media/i2c-cardlist.rst | 18
-rw-r--r--  Documentation/admin-guide/media/imx7.rst | 62
-rw-r--r--  Documentation/admin-guide/media/index.rst | 3
-rw-r--r--  Documentation/admin-guide/media/ipu3.rst | 14
-rw-r--r--  Documentation/admin-guide/media/ivtv.rst | 2
-rw-r--r--  Documentation/admin-guide/media/meye.rst | 93
-rw-r--r--  Documentation/admin-guide/media/omap3isp.rst | 2
-rw-r--r--  Documentation/admin-guide/media/omap4_camera.rst | 2
-rw-r--r--  Documentation/admin-guide/media/other-usb-cardlist.rst | 14
-rw-r--r--  Documentation/admin-guide/media/pci-cardlist.rst | 1
-rw-r--r--  Documentation/admin-guide/media/platform-cardlist.rst | 3
-rw-r--r--  Documentation/admin-guide/media/pulse8-cec.rst | 13
-rw-r--r--  Documentation/admin-guide/media/si476x.rst | 2
-rw-r--r--  Documentation/admin-guide/media/tm6000-cardlist.rst | 83
-rw-r--r--  Documentation/admin-guide/media/usb-cardlist.rst | 7
-rw-r--r--  Documentation/admin-guide/media/v4l-drivers.rst | 4
-rw-r--r--  Documentation/admin-guide/media/vimc.dot | 18
-rw-r--r--  Documentation/admin-guide/media/vimc.rst | 46
-rw-r--r--  Documentation/admin-guide/media/visl.rst | 175
-rw-r--r--  Documentation/admin-guide/media/vivid.rst | 20
-rw-r--r--  Documentation/admin-guide/media/zr364xx.rst | 102
-rw-r--r--  Documentation/admin-guide/mm/cma_debugfs.rst | 10
-rw-r--r--  Documentation/admin-guide/mm/concepts.rst | 13
-rw-r--r--  Documentation/admin-guide/mm/damon/index.rst | 10
-rw-r--r--  Documentation/admin-guide/mm/damon/lru_sort.rst | 294
-rw-r--r--  Documentation/admin-guide/mm/damon/reclaim.rst | 274
-rw-r--r--  Documentation/admin-guide/mm/damon/start.rst | 141
-rw-r--r--  Documentation/admin-guide/mm/damon/usage.rst | 754
-rw-r--r--  Documentation/admin-guide/mm/hugetlbpage.rst | 60
-rw-r--r--  Documentation/admin-guide/mm/idle_page_tracking.rst | 9
-rw-r--r--  Documentation/admin-guide/mm/index.rst | 7
-rw-r--r--  Documentation/admin-guide/mm/ksm.rst | 61
-rw-r--r--  Documentation/admin-guide/mm/memory-hotplug.rst | 149
-rw-r--r--  Documentation/admin-guide/mm/multigen_lru.rst | 162
-rw-r--r--  Documentation/admin-guide/mm/numa_memory_policy.rst | 22
-rw-r--r--  Documentation/admin-guide/mm/numaperf.rst | 8
-rw-r--r--  Documentation/admin-guide/mm/pagemap.rst | 96
-rw-r--r--  Documentation/admin-guide/mm/shrinker_debugfs.rst | 133
-rw-r--r--  Documentation/admin-guide/mm/soft-dirty.rst | 2
-rw-r--r--  Documentation/admin-guide/mm/swap_numa.rst (renamed from Documentation/vm/swap_numa.rst) | 2
-rw-r--r--  Documentation/admin-guide/mm/transhuge.rst | 18
-rw-r--r--  Documentation/admin-guide/mm/userfaultfd.rst | 68
-rw-r--r--  Documentation/admin-guide/mm/zswap.rst (renamed from Documentation/vm/zswap.rst) | 36
-rw-r--r--  Documentation/admin-guide/nfs/nfs-client.rst | 15
-rw-r--r--  Documentation/admin-guide/perf/alibaba_pmu.rst | 100
-rw-r--r--  Documentation/admin-guide/perf/hisi-pcie-pmu.rst | 130
-rw-r--r--  Documentation/admin-guide/perf/hns3-pmu.rst | 136
-rw-r--r--  Documentation/admin-guide/perf/index.rst | 5
-rw-r--r--  Documentation/admin-guide/perf/meson-ddr-pmu.rst | 70
-rw-r--r--  Documentation/admin-guide/perf/nvidia-pmu.rst | 299
-rw-r--r--  Documentation/admin-guide/pm/amd-pstate.rst | 720
-rw-r--r--  Documentation/admin-guide/pm/cpuidle.rst | 15
-rw-r--r--  Documentation/admin-guide/pm/intel-speed-select.rst | 22
-rw-r--r--  Documentation/admin-guide/pm/intel_pstate.rst | 4
-rw-r--r--  Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst | 60
-rw-r--r--  Documentation/admin-guide/pm/working-state.rst | 2
-rw-r--r--  Documentation/admin-guide/quickly-build-trimmed-linux.rst | 1092
-rw-r--r--  Documentation/admin-guide/ramoops.rst | 2
-rw-r--r--  Documentation/admin-guide/ras.rst | 2
-rw-r--r--  Documentation/admin-guide/reporting-issues.rst | 73
-rw-r--r--  Documentation/admin-guide/reporting-regressions.rst | 451
-rw-r--r--  Documentation/admin-guide/serial-console.rst | 36
-rw-r--r--  Documentation/admin-guide/spkguide.txt | 6
-rw-r--r--  Documentation/admin-guide/syscall-user-dispatch.rst | 4
-rw-r--r--  Documentation/admin-guide/sysctl/fs.rst | 240
-rw-r--r--  Documentation/admin-guide/sysctl/kernel.rst | 226
-rw-r--r--  Documentation/admin-guide/sysctl/net.rst | 68
-rw-r--r--  Documentation/admin-guide/sysctl/vm.rst | 57
-rw-r--r--  Documentation/admin-guide/sysrq.rst | 2
-rw-r--r--  Documentation/admin-guide/tainted-kernels.rst | 7
-rw-r--r--  Documentation/admin-guide/thermal/index.rst | 8
-rw-r--r--  Documentation/admin-guide/thermal/intel_powerclamp.rst (renamed from Documentation/driver-api/thermal/intel_powerclamp.rst) | 37
-rw-r--r--  Documentation/admin-guide/unicode.rst | 9
-rw-r--r--  Documentation/admin-guide/workload-tracing.rst | 606
-rw-r--r--  Documentation/admin-guide/xfs.rst | 9
-rw-r--r--  Documentation/arch.rst | 26
-rw-r--r--  Documentation/arch/arc/arc.rst | 85
-rw-r--r--  Documentation/arch/arc/features.rst | 3
-rw-r--r--  Documentation/arch/arc/index.rst | 17
-rw-r--r--  Documentation/arch/ia64/aliasing.rst (renamed from Documentation/ia64/aliasing.rst) | 2
-rw-r--r--  Documentation/arch/ia64/efirtc.rst (renamed from Documentation/ia64/efirtc.rst) | 0
-rw-r--r--  Documentation/arch/ia64/err_inject.rst (renamed from Documentation/ia64/err_inject.rst) | 0
-rw-r--r--  Documentation/arch/ia64/features.rst (renamed from Documentation/ia64/features.rst) | 0
-rw-r--r--  Documentation/arch/ia64/fsys.rst (renamed from Documentation/ia64/fsys.rst) | 0
-rw-r--r--  Documentation/arch/ia64/ia64.rst (renamed from Documentation/ia64/ia64.rst) | 0
-rw-r--r--  Documentation/arch/ia64/index.rst (renamed from Documentation/ia64/index.rst) | 0
-rw-r--r--  Documentation/arch/ia64/irq-redir.rst (renamed from Documentation/ia64/irq-redir.rst) | 0
-rw-r--r--  Documentation/arch/ia64/mca.rst (renamed from Documentation/ia64/mca.rst) | 0
-rw-r--r--  Documentation/arch/ia64/serial.rst (renamed from Documentation/ia64/serial.rst) | 0
-rw-r--r--  Documentation/arch/index.rst | 28
-rw-r--r--  Documentation/arch/m68k/buddha-driver.rst (renamed from Documentation/m68k/buddha-driver.rst) | 0
-rw-r--r--  Documentation/arch/m68k/features.rst (renamed from Documentation/m68k/features.rst) | 0
-rw-r--r--  Documentation/arch/m68k/index.rst (renamed from Documentation/m68k/index.rst) | 0
-rw-r--r--  Documentation/arch/m68k/kernel-options.rst (renamed from Documentation/m68k/kernel-options.rst) | 4
-rw-r--r--  Documentation/arch/nios2/features.rst (renamed from Documentation/nios2/features.rst) | 0
-rw-r--r--  Documentation/arch/nios2/index.rst (renamed from Documentation/nios2/index.rst) | 0
-rw-r--r--  Documentation/arch/nios2/nios2.rst (renamed from Documentation/nios2/nios2.rst) | 0
-rw-r--r--  Documentation/arch/openrisc/features.rst (renamed from Documentation/openrisc/features.rst) | 0
-rw-r--r--  Documentation/arch/openrisc/index.rst (renamed from Documentation/openrisc/index.rst) | 0
-rw-r--r--  Documentation/arch/openrisc/openrisc_port.rst (renamed from Documentation/openrisc/openrisc_port.rst) | 0
-rw-r--r--  Documentation/arch/openrisc/todo.rst (renamed from Documentation/openrisc/todo.rst) | 0
-rw-r--r--  Documentation/arch/parisc/debugging.rst (renamed from Documentation/parisc/debugging.rst) | 0
-rw-r--r--  Documentation/arch/parisc/features.rst (renamed from Documentation/parisc/features.rst) | 0
-rw-r--r--  Documentation/arch/parisc/index.rst (renamed from Documentation/parisc/index.rst) | 0
-rw-r--r--  Documentation/arch/parisc/registers.rst (renamed from Documentation/parisc/registers.rst) | 0
-rw-r--r--  Documentation/arch/sh/booting.rst (renamed from Documentation/sh/booting.rst) | 0
-rw-r--r--  Documentation/arch/sh/features.rst (renamed from Documentation/sh/features.rst) | 0
-rw-r--r--  Documentation/arch/sh/index.rst (renamed from Documentation/sh/index.rst) | 0
-rw-r--r--  Documentation/arch/sh/new-machine.rst (renamed from Documentation/sh/new-machine.rst) | 0
-rw-r--r--  Documentation/arch/sh/register-banks.rst (renamed from Documentation/sh/register-banks.rst) | 0
-rw-r--r--  Documentation/arch/sparc/adi.rst (renamed from Documentation/sparc/adi.rst) | 4
-rw-r--r--  Documentation/arch/sparc/console.rst (renamed from Documentation/sparc/console.rst) | 0
-rw-r--r--  Documentation/arch/sparc/features.rst (renamed from Documentation/sparc/features.rst) | 0
-rw-r--r--  Documentation/arch/sparc/index.rst (renamed from Documentation/sparc/index.rst) | 0
-rw-r--r--  Documentation/arch/sparc/oradax/dax-hv-api.txt (renamed from Documentation/sparc/oradax/dax-hv-api.txt) | 44
-rw-r--r--  Documentation/arch/sparc/oradax/oracle-dax.rst (renamed from Documentation/sparc/oradax/oracle-dax.rst) | 0
-rw-r--r--  Documentation/arch/x86/amd-memory-encryption.rst | 133
-rw-r--r--  Documentation/arch/x86/amd_hsmp.rst | 86
-rw-r--r--  Documentation/arch/x86/boot.rst (renamed from Documentation/x86/boot.rst) | 5
-rw-r--r--  Documentation/arch/x86/booting-dt.rst (renamed from Documentation/x86/booting-dt.rst) | 2
-rw-r--r--  Documentation/arch/x86/buslock.rst (renamed from Documentation/x86/buslock.rst) | 10
-rw-r--r--  Documentation/arch/x86/cpuinfo.rst (renamed from Documentation/x86/cpuinfo.rst) | 5
-rw-r--r--  Documentation/arch/x86/earlyprintk.rst (renamed from Documentation/x86/earlyprintk.rst) | 0
-rw-r--r--  Documentation/arch/x86/elf_auxvec.rst (renamed from Documentation/x86/elf_auxvec.rst) | 0
-rw-r--r--  Documentation/arch/x86/entry_64.rst (renamed from Documentation/x86/entry_64.rst) | 6
-rw-r--r--  Documentation/arch/x86/exception-tables.rst (renamed from Documentation/x86/exception-tables.rst) | 23
-rw-r--r--  Documentation/arch/x86/features.rst (renamed from Documentation/x86/features.rst) | 0
-rw-r--r--  Documentation/arch/x86/i386/IO-APIC.rst (renamed from Documentation/x86/i386/IO-APIC.rst) | 0
-rw-r--r--  Documentation/arch/x86/i386/index.rst (renamed from Documentation/x86/i386/index.rst) | 0
-rw-r--r--  Documentation/arch/x86/ifs.rst | 2
-rw-r--r--  Documentation/arch/x86/index.rst | 44
-rw-r--r--  Documentation/arch/x86/intel-hfi.rst | 72
-rw-r--r--  Documentation/arch/x86/intel_txt.rst (renamed from Documentation/x86/intel_txt.rst) | 0
-rw-r--r--  Documentation/arch/x86/iommu.rst | 151
-rw-r--r--  Documentation/arch/x86/kernel-stacks.rst (renamed from Documentation/x86/kernel-stacks.rst) | 2
-rw-r--r--  Documentation/arch/x86/mds.rst (renamed from Documentation/x86/mds.rst) | 0
-rw-r--r--  Documentation/arch/x86/microcode.rst | 240
-rw-r--r--  Documentation/arch/x86/mtrr.rst (renamed from Documentation/x86/mtrr.rst) | 2
-rw-r--r--  Documentation/arch/x86/orc-unwinder.rst (renamed from Documentation/x86/orc-unwinder.rst) | 6
-rw-r--r--  Documentation/arch/x86/pat.rst (renamed from Documentation/x86/pat.rst) | 0
-rw-r--r--  Documentation/arch/x86/pti.rst (renamed from Documentation/x86/pti.rst) | 0
-rw-r--r--  Documentation/arch/x86/resctrl.rst (renamed from Documentation/x86/resctrl.rst) | 165
-rw-r--r--  Documentation/arch/x86/sgx.rst (renamed from Documentation/x86/sgx.rst) | 64
-rw-r--r--  Documentation/arch/x86/sva.rst (renamed from Documentation/x86/sva.rst) | 53
-rw-r--r--  Documentation/arch/x86/tdx.rst | 261
-rw-r--r--  Documentation/arch/x86/tlb.rst (renamed from Documentation/x86/tlb.rst) | 0
-rw-r--r--  Documentation/arch/x86/topology.rst (renamed from Documentation/x86/topology.rst) | 0
-rw-r--r--  Documentation/arch/x86/tsx_async_abort.rst (renamed from Documentation/x86/tsx_async_abort.rst) | 0
-rw-r--r--  Documentation/arch/x86/usb-legacy-support.rst (renamed from Documentation/x86/usb-legacy-support.rst) | 0
-rw-r--r--  Documentation/arch/x86/x86_64/5level-paging.rst (renamed from Documentation/x86/x86_64/5level-paging.rst) | 2
-rw-r--r--  Documentation/arch/x86/x86_64/boot-options.rst (renamed from Documentation/x86/x86_64/boot-options.rst) | 44
-rw-r--r--  Documentation/arch/x86/x86_64/cpu-hotplug-spec.rst (renamed from Documentation/x86/x86_64/cpu-hotplug-spec.rst) | 0
-rw-r--r--  Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst (renamed from Documentation/x86/x86_64/fake-numa-for-cpusets.rst) | 2
-rw-r--r--  Documentation/arch/x86/x86_64/fsgs.rst (renamed from Documentation/x86/x86_64/fsgs.rst) | 0
-rw-r--r--  Documentation/arch/x86/x86_64/index.rst (renamed from Documentation/x86/x86_64/index.rst) | 0
-rw-r--r--  Documentation/arch/x86/x86_64/machinecheck.rst | 33
-rw-r--r--  Documentation/arch/x86/x86_64/mm.rst (renamed from Documentation/x86/x86_64/mm.rst) | 2
-rw-r--r--  Documentation/arch/x86/x86_64/uefi.rst (renamed from Documentation/x86/x86_64/uefi.rst) | 2
-rw-r--r--  Documentation/arch/x86/xstate.rst | 174
-rw-r--r--  Documentation/arch/x86/zero-page.rst (renamed from Documentation/x86/zero-page.rst) | 2
-rw-r--r--  Documentation/arch/xtensa/atomctl.rst (renamed from Documentation/xtensa/atomctl.rst) | 0
-rw-r--r--  Documentation/arch/xtensa/booting.rst (renamed from Documentation/xtensa/booting.rst) | 0
-rw-r--r--  Documentation/arch/xtensa/features.rst (renamed from Documentation/xtensa/features.rst) | 0
-rw-r--r--  Documentation/arch/xtensa/index.rst (renamed from Documentation/xtensa/index.rst) | 0
-rw-r--r--  Documentation/arch/xtensa/mmu.rst (renamed from Documentation/xtensa/mmu.rst) | 0
-rw-r--r--  Documentation/arm/google/chromebook-boot-flow.rst | 69
-rw-r--r--  Documentation/arm/index.rst | 8
-rw-r--r--  Documentation/arm/marvell.rst | 88
-rw-r--r--  Documentation/arm/microchip.rst | 20
-rw-r--r--  Documentation/arm/samsung-s3c24xx/cpufreq.rst | 76
-rw-r--r--  Documentation/arm/samsung-s3c24xx/eb2410itx.rst | 59
-rw-r--r--  Documentation/arm/samsung-s3c24xx/gpio.rst | 172
-rw-r--r--  Documentation/arm/samsung-s3c24xx/h1940.rst | 41
-rw-r--r--  Documentation/arm/samsung-s3c24xx/index.rst | 20
-rw-r--r--  Documentation/arm/samsung-s3c24xx/nand.rst | 30
-rw-r--r--  Documentation/arm/samsung-s3c24xx/overview.rst | 311
-rw-r--r--  Documentation/arm/samsung-s3c24xx/s3c2412.rst | 121
-rw-r--r--  Documentation/arm/samsung-s3c24xx/s3c2413.rst | 22
-rw-r--r--  Documentation/arm/samsung-s3c24xx/smdk2440.rst | 57
-rw-r--r--  Documentation/arm/samsung-s3c24xx/suspend.rst | 137
-rw-r--r--  Documentation/arm/samsung-s3c24xx/usb-host.rst | 91
-rw-r--r--  Documentation/arm/samsung/gpio.rst | 8
-rw-r--r--  Documentation/arm/samsung/overview.rst | 13
-rw-r--r--  Documentation/arm/sti/overview.rst | 10
-rw-r--r--  Documentation/arm/sti/stih415-overview.rst | 14
-rw-r--r--  Documentation/arm/sti/stih416-overview.rst | 13
-rw-r--r--  Documentation/arm/stm32/stm32-dma-mdma-chaining.rst | 415
-rw-r--r--  Documentation/arm/stm32/stm32mp13-overview.rst | 37
-rw-r--r--  Documentation/arm/stm32/stm32mp151-overview.rst | 36
-rw-r--r--  Documentation/arm/tcm.rst | 2
-rw-r--r--  Documentation/arm/uefi.rst | 4
-rw-r--r--  Documentation/arm64/acpi_object_usage.rst | 2
-rw-r--r--  Documentation/arm64/booting.rst | 55
-rw-r--r--  Documentation/arm64/cpu-feature-registers.rst | 69
-rw-r--r--  Documentation/arm64/elf_hwcaps.rst | 95
-rw-r--r--  Documentation/arm64/index.rst | 1
-rw-r--r--  Documentation/arm64/memory-tagging-extension.rst | 54
-rw-r--r--  Documentation/arm64/memory.rst | 10
-rw-r--r--  Documentation/arm64/perf.rst | 78
-rw-r--r--  Documentation/arm64/pointer-authentication.rst | 9
-rw-r--r--  Documentation/arm64/silicon-errata.rst | 50
-rw-r--r--  Documentation/arm64/sme.rst | 468
-rw-r--r--  Documentation/arm64/sve.rst | 99
-rw-r--r--  Documentation/arm64/tagged-address-abi.rst | 2
-rw-r--r--  Documentation/atomic_bitops.txt | 10
-rw-r--r--  Documentation/atomic_t.txt | 2
-rw-r--r--  Documentation/block/biodoc.rst | 1164
-rw-r--r--  Documentation/block/capability.rst | 10
-rw-r--r--  Documentation/block/index.rst | 5
-rw-r--r--  Documentation/block/inline-encryption.rst | 452
-rw-r--r--  Documentation/block/null_blk.rst | 22
-rw-r--r--  Documentation/block/queue-sysfs.rst | 289
-rw-r--r--  Documentation/block/request.rst | 99
-rw-r--r--  Documentation/block/ublk.rst | 326
-rw-r--r--  Documentation/bpf/bpf_design_QA.rst | 89
-rw-r--r--  Documentation/bpf/bpf_devel_QA.rst | 61
-rw-r--r--  Documentation/bpf/bpf_iterators.rst | 485
-rw-r--r--  Documentation/bpf/bpf_licensing.rst | 92
-rw-r--r--  Documentation/bpf/bpf_lsm.rst | 143
-rw-r--r--  Documentation/bpf/bpf_prog_run.rst | 117
-rw-r--r--  Documentation/bpf/btf.rst | 183
-rw-r--r--  Documentation/bpf/clang-notes.rst | 36
-rw-r--r--  Documentation/bpf/classic_vs_extended.rst | 376
-rw-r--r--  Documentation/bpf/cpumasks.rst | 383
-rw-r--r--  Documentation/bpf/faq.rst | 11
-rw-r--r--  Documentation/bpf/graph_ds_impl.rst | 267
-rw-r--r--  Documentation/bpf/helpers.rst | 7
-rw-r--r--  Documentation/bpf/index.rst | 109
-rw-r--r--  Documentation/bpf/instruction-set.rst | 475
-rw-r--r--  Documentation/bpf/kfuncs.rst | 609
-rw-r--r--  Documentation/bpf/libbpf/index.rst | 29
-rw-r--r--  Documentation/bpf/libbpf/libbpf_naming_convention.rst | 57
-rw-r--r--  Documentation/bpf/libbpf/libbpf_overview.rst | 228
-rw-r--r--  Documentation/bpf/libbpf/program_types.rst | 203
-rw-r--r--  Documentation/bpf/linux-notes.rst | 83
-rw-r--r--  Documentation/bpf/map_array.rst | 262
-rw-r--r--  Documentation/bpf/map_bloom_filter.rst | 174
-rw-r--r--  Documentation/bpf/map_cgroup_storage.rst | 4
-rw-r--r--  Documentation/bpf/map_cgrp_storage.rst | 109
-rw-r--r--  Documentation/bpf/map_cpumap.rst | 177
-rw-r--r--  Documentation/bpf/map_devmap.rst | 238
-rw-r--r--  Documentation/bpf/map_hash.rst | 208
-rw-r--r--  Documentation/bpf/map_lpm_trie.rst | 197
-rw-r--r--  Documentation/bpf/map_of_maps.rst | 130
-rw-r--r--  Documentation/bpf/map_queue_stack.rst | 146
-rw-r--r--  Documentation/bpf/map_sk_storage.rst | 159
-rw-r--r--  Documentation/bpf/map_sockmap.rst | 498
-rw-r--r--  Documentation/bpf/map_xskmap.rst | 192
-rw-r--r--  Documentation/bpf/maps.rst | 82
-rw-r--r--  Documentation/bpf/other.rst | 10
-rw-r--r--  Documentation/bpf/prog_lsm.rst | 143
-rw-r--r--  Documentation/bpf/programs.rst | 12
-rw-r--r--  Documentation/bpf/redirect.rst | 81
-rw-r--r--  Documentation/bpf/ringbuf.rst | 4
-rw-r--r--  Documentation/bpf/syscall_api.rst | 11
-rw-r--r--  Documentation/bpf/test_debug.rst | 9
-rw-r--r--  Documentation/bpf/verifier.rst | 824
-rw-r--r--  Documentation/cdrom/cdrom-standard.rst | 21
-rw-r--r--  Documentation/cdrom/ide-cd.rst | 538
-rw-r--r--  Documentation/cdrom/index.rst | 1
-rw-r--r--  Documentation/cdrom/packet-writing.rst | 4
-rw-r--r--  Documentation/conf.py | 426
-rw-r--r--  Documentation/core-api/asm-annotations.rst (renamed from Documentation/asm-annotations.rst) | 20
-rw-r--r--  Documentation/core-api/bus-virt-phys-mapping.rst | 220
-rw-r--r--  Documentation/core-api/cachetlb.rst | 6
-rw-r--r--  Documentation/core-api/cpu_hotplug.rst | 2
-rw-r--r--  Documentation/core-api/dma-api-howto.rst | 16
-rw-r--r--  Documentation/core-api/dma-api.rst | 14
-rw-r--r--  Documentation/core-api/entry.rst | 279
-rw-r--r--  Documentation/core-api/idr.rst | 3
-rw-r--r--  Documentation/core-api/index.rst | 19
-rw-r--r--  Documentation/core-api/irq/irq-domain.rst | 5
-rw-r--r--  Documentation/core-api/kernel-api.rst | 42
-rw-r--r--  Documentation/core-api/kobject.rst | 16
-rw-r--r--  Documentation/core-api/local_ops.rst | 2
-rw-r--r--  Documentation/core-api/maple_tree.rst | 217
-rw-r--r--  Documentation/core-api/memory-allocation.rst | 17
-rw-r--r--  Documentation/core-api/memory-hotplug.rst | 3
-rw-r--r--  Documentation/core-api/mm-api.rst | 35
-rw-r--r--  Documentation/core-api/netlink.rst | 101
-rw-r--r--  Documentation/core-api/packing.rst | 2
-rw-r--r--  Documentation/core-api/padata.rst | 2
-rw-r--r--  Documentation/core-api/pin_user_pages.rst | 31
-rw-r--r--  Documentation/core-api/printk-formats.rst | 28
-rw-r--r--  Documentation/core-api/printk-index.rst | 137
-rw-r--r--  Documentation/core-api/protection-keys.rst | 44
-rw-r--r--  Documentation/core-api/symbol-namespaces.rst | 4
-rw-r--r--  Documentation/core-api/timekeeping.rst | 1
-rw-r--r--  Documentation/core-api/watch_queue.rst (renamed from Documentation/watch_queue.rst) | 0
-rw-r--r--  Documentation/core-api/workqueue.rst | 25
-rw-r--r--  Documentation/core-api/wrappers/atomic_bitops.rst | 18
-rw-r--r--  Documentation/core-api/wrappers/atomic_t.rst | 19
-rw-r--r--  Documentation/core-api/wrappers/memory-barriers.rst | 18
-rw-r--r--  Documentation/core-api/xarray.rst | 14
-rw-r--r--  Documentation/cpu-freq/core.rst | 6
-rw-r--r--  Documentation/cpu-freq/cpu-drivers.rst | 3
-rw-r--r--  Documentation/cpu-freq/index.rst | 15
-rw-r--r--  Documentation/crypto/crypto_engine.rst | 4
-rw-r--r--  Documentation/crypto/devel-algos.rst | 2
-rw-r--r--  Documentation/crypto/index.rst | 6
-rw-r--r--  Documentation/crypto/userspace-if.rst | 15
-rw-r--r--  Documentation/dev-tools/checkpatch.rst | 88
-rw-r--r--  Documentation/dev-tools/coccinelle.rst | 10
-rw-r--r--  Documentation/dev-tools/gdb-kernel-debugging.rst | 4
-rw-r--r--  Documentation/dev-tools/index.rst | 2
-rw-r--r--  Documentation/dev-tools/kasan.rst | 248
-rw-r--r--  Documentation/dev-tools/kcov.rst | 174
-rw-r--r--  Documentation/dev-tools/kcsan.rst | 76
-rw-r--r--  Documentation/dev-tools/kfence.rst | 35
-rw-r--r--  Documentation/dev-tools/kgdb.rst | 6
-rw-r--r--  Documentation/dev-tools/kmemleak.rst | 3
-rw-r--r--  Documentation/dev-tools/kmsan.rst | 428
-rw-r--r--  Documentation/dev-tools/kselftest.rst | 46
-rw-r--r--  Documentation/dev-tools/ktap.rst | 311
-rw-r--r--  Documentation/dev-tools/kunit/api/functionredirection.rst | 162
-rw-r--r--  Documentation/dev-tools/kunit/api/index.rst | 17
-rw-r--r--  Documentation/dev-tools/kunit/api/resource.rst | 13
-rw-r--r--  Documentation/dev-tools/kunit/api/test.rst | 3
-rw-r--r--  Documentation/dev-tools/kunit/architecture.rst | 196
-rw-r--r--  Documentation/dev-tools/kunit/faq.rst | 79
-rw-r--r--  Documentation/dev-tools/kunit/index.rst | 172
-rw-r--r--  Documentation/dev-tools/kunit/kunit-tool.rst | 232
-rw-r--r--  Documentation/dev-tools/kunit/kunit_suitememorydiagram.svg | 81
-rw-r--r--  Documentation/dev-tools/kunit/run_manual.rst | 57
-rw-r--r--  Documentation/dev-tools/kunit/run_wrapper.rst | 323
-rw-r--r--  Documentation/dev-tools/kunit/running_tips.rst | 17
-rw-r--r--  Documentation/dev-tools/kunit/start.rst | 250
-rw-r--r--  Documentation/dev-tools/kunit/style.rst | 105
-rw-r--r--  Documentation/dev-tools/kunit/tips.rst | 190
-rw-r--r--  Documentation/dev-tools/kunit/usage.rst | 628
-rw-r--r--  Documentation/dev-tools/sparse.rst | 2
-rw-r--r--  Documentation/dev-tools/testing-overview.rst | 63
-rw-r--r--  Documentation/devicetree/bindings/.gitignore | 5
-rw-r--r--  Documentation/devicetree/bindings/.yamllint | 2
-rw-r--r--  Documentation/devicetree/bindings/Makefile | 66
-rw-r--r--  Documentation/devicetree/bindings/arm/actions.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/airoha.yaml | 28
-rw-r--r--  Documentation/devicetree/bindings/arm/altera.yaml | 57
-rw-r--r--  Documentation/devicetree/bindings/arm/amazon,al.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/amlogic.yaml | 33
-rw-r--r--  Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml | 6
-rw-r--r--  Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-mx-secbus2.yaml | 4
-rw-r--r--  Documentation/devicetree/bindings/arm/apple.yaml | 56
-rw-r--r--  Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml | 135
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,cci-400.yaml | 211
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-catu.yaml | 104
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-cpu-debug.yaml | 81
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-cti.yaml | 334
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-dynamic-funnel.yaml | 129
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-dynamic-replicator.yaml | 129
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-etb10.yaml | 95
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-etm.yaml | 159
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-static-funnel.yaml | 93
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-static-replicator.yaml | 94
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-stm.yaml | 104
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-tmc.yaml | 137
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,coresight-tpiu.yaml | 94
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,corstone1000.yaml | 45
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,embedded-trace-extension.yaml | 77
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,integrator.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,realview.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,trace-buffer-extension.yaml | 49
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,versatile-sysreg.yaml | 35
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,versatile.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/arm,vexpress-juno.yaml | 49
-rw-r--r--  Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt | 27
-rw-r--r--  Documentation/devicetree/bindings/arm/aspeed/aspeed,sbc.yaml | 37
-rw-r--r--  Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml | 92
-rw-r--r--  Documentation/devicetree/bindings/arm/atmel-at91.yaml | 53
-rw-r--r--  Documentation/devicetree/bindings/arm/atmel-sysregs.txt | 15
-rw-r--r--  Documentation/devicetree/bindings/arm/axxia.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/bcm2835.yaml | 4
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcm11351.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcm21664.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcm23550.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.yaml | 21
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcm4908.yaml | 41
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcm63138.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml | 151
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,brcmstb.txt | 11
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,cygnus.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,hr2.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,ns2.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,nsp.yaml | 67
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,stingray.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/brcm,vulcan-soc.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/bcm/raspberrypi,bcm2835-firmware.yaml | 5
-rw-r--r--  Documentation/devicetree/bindings/arm/bitmain.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/calxeda.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/cci-control-port.yaml | 38
-rw-r--r--  Documentation/devicetree/bindings/arm/cci.txt | 224
-rw-r--r--  Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt | 49
-rw-r--r--  Documentation/devicetree/bindings/arm/coresight-cti.yaml | 332
-rw-r--r--  Documentation/devicetree/bindings/arm/coresight.txt | 397
-rw-r--r--  Documentation/devicetree/bindings/arm/cpus.yaml | 46
-rw-r--r--  Documentation/devicetree/bindings/arm/digicolor.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/ete.yaml | 75
-rw-r--r--  Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.yaml | 11
-rw-r--r--  Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.txt | 20
-rw-r--r--  Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.yaml | 46
-rw-r--r--  Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt | 19
-rw-r--r--  Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt | 19
-rw-r--r--  Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt | 270
-rw-r--r--  Documentation/devicetree/bindings/arm/fsl.yaml | 519
-rw-r--r--  Documentation/devicetree/bindings/arm/fw-cfg.txt | 38
-rw-r--r--  Documentation/devicetree/bindings/arm/hisilicon/controller/hip04-bootwrapper.yaml | 5
-rw-r--r--  Documentation/devicetree/bindings/arm/hisilicon/hisilicon.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/hpe,gxp.yaml | 27
-rw-r--r--  Documentation/devicetree/bindings/arm/idle-states.yaml | 661
-rw-r--r--  Documentation/devicetree/bindings/arm/intel,keembay.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/intel,socfpga.yaml | 27
-rw-r--r--  Documentation/devicetree/bindings/arm/intel-ixp4xx.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/keystone/ti,k3-sci-common.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/keystone/ti,sci.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/linux,dummy-virt.yaml | 20
-rw-r--r--  Documentation/devicetree/bindings/arm/marvell/ap80x-system-controller.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt | 26
-rw-r--r--  Documentation/devicetree/bindings/arm/marvell/armada-37xx.yaml | 47
-rw-r--r--  Documentation/devicetree/bindings/arm/marvell/armada-7k-8k.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/marvell/cp110-system-controller.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/marvell/marvell,ac5.yaml | 32
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek.yaml | 68
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt | 34
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt | 41
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.yaml | 84
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml | 59
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-pcie-mirror.yaml | 42
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml | 104
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7986-wed-pcie.yaml | 43
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-clock.yaml | 56
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-sys-clock.yaml | 57
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-clock.yaml | 12
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-sys-clock.yaml | 7
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-clock.yaml | 238
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-sys-clock.yaml | 76
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml | 7
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt | 23
-rw-r--r--  Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt | 34
-rw-r--r--  Documentation/devicetree/bindings/arm/microchip,sparx5.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/moxart.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/mrvl/mrvl.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/msm/qcom,idle-state.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt | 49
-rw-r--r--  Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt | 44
-rw-r--r--  Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml | 60
-rw-r--r--  Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/mstar/mstar.yaml | 6
-rw-r--r--  Documentation/devicetree/bindings/arm/npcm/npcm.yaml | 9
-rw-r--r--  Documentation/devicetree/bindings/arm/npcm/nuvoton,gcr.yaml | 50
-rw-r--r--  Documentation/devicetree/bindings/arm/nvidia,tegra194-ccplex.yaml | 8
-rw-r--r--  Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/omap/omap.txt | 3
-rw-r--r--  Documentation/devicetree/bindings/arm/omap/prcm.txt | 7
-rw-r--r--  Documentation/devicetree/bindings/arm/oxnas.txt | 14
-rw-r--r--  Documentation/devicetree/bindings/arm/pmu.yaml | 14
-rw-r--r--  Documentation/devicetree/bindings/arm/psci.yaml | 13
-rw-r--r--  Documentation/devicetree/bindings/arm/qcom,coresight-tpda.yaml | 129
-rw-r--r--  Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml | 93
-rw-r--r--  Documentation/devicetree/bindings/arm/qcom-soc.yaml | 66
-rw-r--r--  Documentation/devicetree/bindings/arm/qcom.yaml | 876
-rw-r--r--  Documentation/devicetree/bindings/arm/rda.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/realtek.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/renesas.yaml | 353
-rw-r--r--  Documentation/devicetree/bindings/arm/rockchip.yaml | 263
-rw-r--r--  Documentation/devicetree/bindings/arm/rockchip/pmu.yaml | 10
-rw-r--r--  Documentation/devicetree/bindings/arm/samsung/exynos-chipid.yaml | 40
-rw-r--r--  Documentation/devicetree/bindings/arm/samsung/pmu.yaml | 128
-rw-r--r--  Documentation/devicetree/bindings/arm/samsung/samsung-boards.yaml | 21
-rw-r--r--  Documentation/devicetree/bindings/arm/samsung/samsung-soc.yaml | 40
-rw-r--r--  Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/socionext/uniphier.yaml | 8
-rw-r--r--  Documentation/devicetree/bindings/arm/sp810.txt | 46
-rw-r--r--  Documentation/devicetree/bindings/arm/sp810.yaml | 80
-rw-r--r--  Documentation/devicetree/bindings/arm/spe-pmu.txt | 20
-rw-r--r--  Documentation/devicetree/bindings/arm/spear.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/sprd/sprd.yaml | 7
-rw-r--r--  Documentation/devicetree/bindings/arm/sti.yaml | 4
-rw-r--r--  Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml | 6
-rw-r--r--  Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml | 8
-rw-r--r--  Documentation/devicetree/bindings/arm/stm32/stm32.yaml | 86
-rw-r--r--  Documentation/devicetree/bindings/arm/sunplus,sp7021.yaml | 29
-rw-r--r--  Documentation/devicetree/bindings/arm/sunxi.yaml | 45
-rw-r--r--  Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml | 102
-rw-r--r--  Documentation/devicetree/bindings/arm/sunxi/allwinner,sun6i-a31-cpuconfig.yaml | 38
-rw-r--r--  Documentation/devicetree/bindings/arm/sunxi/allwinner,sun9i-a80-prcm.yaml | 33
-rw-r--r--  Documentation/devicetree/bindings/arm/swir.txt | 12
-rw-r--r--  Documentation/devicetree/bindings/arm/syna.txt | 4
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra.yaml | 59
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml | 50
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt | 133
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.yaml | 198
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-axi2apb.yaml | 40
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-cbb.yaml | 97
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.yaml | 61
-rw-r--r--  Documentation/devicetree/bindings/arm/tegra/nvidia,tegra234-cbb.yaml | 74
-rw-r--r--  Documentation/devicetree/bindings/arm/tesla.yaml | 27
-rw-r--r--  Documentation/devicetree/bindings/arm/ti/k3.yaml | 67
-rw-r--r--  Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/toshiba.yaml | 3
-rw-r--r--  Documentation/devicetree/bindings/arm/trbe.yaml | 49
-rw-r--r--  Documentation/devicetree/bindings/arm/ux500.yaml | 37
-rw-r--r--  Documentation/devicetree/bindings/arm/versatile-sysreg.txt | 10
-rw-r--r--  Documentation/devicetree/bindings/arm/vexpress-config.yaml | 285
-rw-r--r--  Documentation/devicetree/bindings/arm/vexpress-sysreg.txt | 103
-rw-r--r--  Documentation/devicetree/bindings/arm/vexpress-sysreg.yaml | 96
-rw-r--r--  Documentation/devicetree/bindings/arm/vt8500.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/arm/xen.txt | 14
-rw-r--r--  Documentation/devicetree/bindings/arm/xilinx.yaml | 19
-rw-r--r--  Documentation/devicetree/bindings/ata/ahci-ceva.txt | 63
-rw-r--r--  Documentation/devicetree/bindings/ata/ahci-common.yaml | 123
-rw-r--r--  Documentation/devicetree/bindings/ata/ahci-platform.txt | 79
-rw-r--r--  Documentation/devicetree/bindings/ata/ahci-platform.yaml | 176
-rw-r--r--  Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/ata/ata-generic.yaml | 58
-rw-r--r--  Documentation/devicetree/bindings/ata/baikal,bt1-ahci.yaml | 115
-rw-r--r--  Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt | 45
-rw-r--r--  Documentation/devicetree/bindings/ata/brcm,sata-brcm.yaml | 87
-rw-r--r--  Documentation/devicetree/bindings/ata/ceva,ahci-1v84.yaml | 189
-rw-r--r--  Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.txt | 55
-rw-r--r--  Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.yaml | 107
-rw-r--r--  Documentation/devicetree/bindings/ata/intel,ixp4xx-compact-flash.yaml | 1
-rw-r--r--  Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml | 5
-rw-r--r--  Documentation/devicetree/bindings/ata/sata-common.yaml | 17
-rw-r--r--  Documentation/devicetree/bindings/ata/sata_highbank.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/ata/snps,dwc-ahci-common.yaml | 102
-rw-r--r--  Documentation/devicetree/bindings/ata/snps,dwc-ahci.yaml | 75
-rw-r--r--  Documentation/devicetree/bindings/auxdisplay/holtek,ht16k33.yaml | 36
-rw-r--r--  Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml | 8
-rw-r--r--  Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml | 3
-rw-r--r--  Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml | 37
-rw-r--r--  Documentation/devicetree/bindings/bus/brcm,gisb-arb.txt | 34
-rw-r--r--  Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml | 66
-rw-r--r--  Documentation/devicetree/bindings/bus/fsl,imx8qxp-pixel-link-msi-bus.yaml | 232
-rw-r--r--  Documentation/devicetree/bindings/bus/fsl,spba-bus.yaml | 68
-rw-r--r--  Documentation/devicetree/bindings/bus/imx-weim.txt | 5
-rw-r--r--  Documentation/devicetree/bindings/bus/intel,ixp4xx-expansion-bus-controller.yaml | 168
-rw-r--r--  Documentation/devicetree/bindings/bus/microsoft,vmbus.yaml | 54
-rw-r--r--  Documentation/devicetree/bindings/bus/nvidia,tegra210-aconnect.yaml | 5
-rw-r--r--  Documentation/devicetree/bindings/bus/palmbus.yaml | 80
-rw-r--r--  Documentation/devicetree/bindings/bus/qcom,ssc-block-bus.yaml | 144
-rw-r--r--  Documentation/devicetree/bindings/bus/ti-sysc.txt | 139
-rw-r--r--  Documentation/devicetree/bindings/bus/ti-sysc.yaml | 215
-rw-r--r--  Documentation/devicetree/bindings/bus/xlnx,versal-net-cdx.yaml | 82
-rw-r--r--  Documentation/devicetree/bindings/cache/baikal,bt1-l2-ctl.yaml (renamed from Documentation/devicetree/bindings/memory-controllers/baikal,bt1-l2-ctl.yaml) | 2
-rw-r--r--  Documentation/devicetree/bindings/cache/freescale-l2cache.txt | 55
-rw-r--r--  Documentation/devicetree/bindings/cache/l2c2x0.yaml (renamed from Documentation/devicetree/bindings/arm/l2c2x0.yaml) | 2
-rw-r--r--  Documentation/devicetree/bindings/cache/marvell,feroceon-cache.txt (renamed from Documentation/devicetree/bindings/arm/mrvl/feroceon.txt) | 0
-rw-r--r--  Documentation/devicetree/bindings/cache/marvell,tauros2-cache.txt (renamed from Documentation/devicetree/bindings/arm/mrvl/tauros2.txt) | 0
-rw-r--r--  Documentation/devicetree/bindings/cache/qcom,llcc.yaml | 168
-rw-r--r--  Documentation/devicetree/bindings/cache/sifive,ccache0.yaml | 170
-rw-r--r--  Documentation/devicetree/bindings/cache/socionext,uniphier-system-cache.yaml (renamed from Documentation/devicetree/bindings/arm/socionext/socionext,uniphier-system-cache.yaml) | 3
-rw-r--r--  Documentation/devicetree/bindings/chosen.txt | 137
-rw-r--r--  Documentation/devicetree/bindings/chrome/google,cros-ec-typec.yaml | 18
-rw-r--r--  Documentation/devicetree/bindings/chrome/google,cros-kbd-led-backlight.yaml | 36
-rw-r--r--  Documentation/devicetree/bindings/clock/adi,axi-clkgen.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml | 58
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml | 6
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml | 7
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-ahb-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-apb0-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-cpus-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-de-clks.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-gt-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-mmc-config-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-pll4-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-clks.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-mod-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-phy-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/amlogic,meson8-ddr-clkc.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/apple,nco.yaml | 63
-rw-r--r--  Documentation/devicetree/bindings/clock/arm,syscon-icst.yaml | 9
-rw-r--r--  Documentation/devicetree/bindings/clock/bitmain,bm1880-clk.yaml | 12
-rw-r--r--  Documentation/devicetree/bindings/clock/brcm,bcm2711-dvp.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/brcm,bcm63268-timer-clocks.yaml | 40
-rw-r--r--  Documentation/devicetree/bindings/clock/calxeda.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/canaan,k210-clk.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/cirrus,cs2000-cp.yaml | 90
-rw-r--r--  Documentation/devicetree/bindings/clock/clock-bindings.txt | 188
-rw-r--r--  Documentation/devicetree/bindings/clock/cs2000-cp.txt | 22
-rw-r--r--  Documentation/devicetree/bindings/clock/efm32-clock.txt | 11
-rw-r--r--  Documentation/devicetree/bindings/clock/exynos5260-clock.txt | 190
-rw-r--r--  Documentation/devicetree/bindings/clock/exynos5410-clock.txt | 50
-rw-r--r--  Documentation/devicetree/bindings/clock/exynos5433-clock.txt | 507
-rw-r--r--  Documentation/devicetree/bindings/clock/exynos7-clock.txt | 108
-rw-r--r--  Documentation/devicetree/bindings/clock/fixed-clock.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/fixed-factor-clock.yaml | 3
-rw-r--r--  Documentation/devicetree/bindings/clock/fixed-mmio-clock.txt | 24
-rw-r--r--  Documentation/devicetree/bindings/clock/fixed-mmio-clock.yaml | 47
-rw-r--r--  Documentation/devicetree/bindings/clock/fsl,imx8m-anatop.yaml | 51
-rw-r--r--  Documentation/devicetree/bindings/clock/fsl,plldig.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/fsl,sai-clock.yaml | 2
-rw-r--r--  Documentation/devicetree/bindings/clock/fsl,scu-clk.yaml | 43
-rw-r--r--  Documentation/devicetree/bindings/clock/gpio-gate-clock.txt | 21
-rw-r--r--  Documentation/devicetree/bindings/clock/gpio-gate-clock.yaml | 42
-rw-r--r--  Documentation/devicetree/bindings/clock/idt,versaclock5.yaml | 18
-rw-r--r--  Documentation/devicetree/bindings/clock/imx1-clock.yaml | 11
-rw-r--r--  Documentation/devicetree/bindings/clock/imx21-clock.yaml | 11
-rw-r--r--  Documentation/devicetree/bindings/clock/imx23-clock.yaml | 11
-rw-r--r--  Documentation/devicetree/bindings/clock/imx25-clock.yaml | 10
-rw-r--r--  Documentation/devicetree/bindings/clock/imx27-clock.yaml | 11
-rw-r--r--  Documentation/devicetree/bindings/clock/imx28-clock.yaml | 11
-rw-r--r--  Documentation/devicetree/bindings/clock/imx31-clock.yaml | 10
-rw-r--r--  Documentation/devicetree/bindings/clock/imx35-clock.yaml | 10
-rw-r--r--Documentation/devicetree/bindings/clock/imx5-clock.yaml11
-rw-r--r--Documentation/devicetree/bindings/clock/imx6q-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/imx6sl-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/imx6sll-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/imx6sx-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/imx6ul-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/imx7d-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/imx7ulp-pcc-clock.yaml13
-rw-r--r--Documentation/devicetree/bindings/clock/imx7ulp-scg-clock.yaml13
-rw-r--r--Documentation/devicetree/bindings/clock/imx8m-clock.yaml8
-rw-r--r--Documentation/devicetree/bindings/clock/imx8mp-audiomix.yaml79
-rw-r--r--Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml13
-rw-r--r--Documentation/devicetree/bindings/clock/imx8ulp-cgc-clock.yaml43
-rw-r--r--Documentation/devicetree/bindings/clock/imx8ulp-pcc-clock.yaml50
-rw-r--r--Documentation/devicetree/bindings/clock/imx93-clock.yaml62
-rw-r--r--Documentation/devicetree/bindings/clock/imxrt1050-clock.yaml59
-rw-r--r--Documentation/devicetree/bindings/clock/ingenic,cgu.yaml6
-rw-r--r--Documentation/devicetree/bindings/clock/intc_stratix10.txt20
-rw-r--r--Documentation/devicetree/bindings/clock/intel,agilex.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/intel,cgu-lgm.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/intel,easic-n5x.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/intel,stratix10.yaml35
-rw-r--r--Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml45
-rw-r--r--Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml63
-rw-r--r--Documentation/devicetree/bindings/clock/marvell,armada-3700-uart-clock.yaml59
-rw-r--r--Documentation/devicetree/bindings/clock/maxim,max77686.txt4
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,apmixedsys.yaml63
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt6795-clock.yaml66
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt6795-sys-clock.yaml54
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt7621-sysc.yaml14
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt8186-fhctl.yaml58
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml71
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml55
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt8365-clock.yaml42
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,mt8365-sys-clock.yaml47
-rw-r--r--Documentation/devicetree/bindings/clock/mediatek,topckgen.yaml63
-rw-r--r--Documentation/devicetree/bindings/clock/microchip,lan966x-gck.yaml60
-rw-r--r--Documentation/devicetree/bindings/clock/microchip,mpfs-ccc.yaml80
-rw-r--r--Documentation/devicetree/bindings/clock/microchip,mpfs-clkcfg.yaml80
-rw-r--r--Documentation/devicetree/bindings/clock/milbeaut-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/mstar,msc313-cpupll.yaml45
-rw-r--r--Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml49
-rw-r--r--Documentation/devicetree/bindings/clock/nvidia,tegra124-car.yaml7
-rw-r--r--Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt2
-rw-r--r--Documentation/devicetree/bindings/clock/nvidia,tegra20-car.yaml42
-rw-r--r--Documentation/devicetree/bindings/clock/pwm-clock.txt26
-rw-r--r--Documentation/devicetree/bindings/clock/pwm-clock.yaml45
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,a53pll.yaml10
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,a7pll.yaml4
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,aoncc-sm8250.yaml11
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,audiocc-sm8250.yaml7
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,camcc-sm8250.yaml26
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,camcc.txt18
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,dispcc-sc8280xp.yaml97
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,dispcc-sm6125.yaml86
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml86
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml27
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-apq8064.yaml84
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-apq8084.yaml86
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-ipq4019.yaml53
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-ipq8064.yaml81
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-ipq8074.yaml37
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8660.yaml54
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml62
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8916.yaml66
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8974.yaml61
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8976.yaml83
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml56
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8996.yaml48
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-msm8998.yaml58
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-other.yaml50
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-qcm2290.yaml54
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-qcs404.yaml47
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sc7180.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sc7280.yaml30
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sc8180x.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sc8280xp.yaml121
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sdm660.yaml61
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sdm845.yaml92
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sdx55.yaml37
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sdx65.yaml62
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm6115.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm8150.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm8250.yaml34
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm8350.yaml32
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc-sm8450.yaml71
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gcc.yaml61
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gpucc-sdm660.yaml4
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gpucc.yaml26
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml53
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml61
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,kpss-acc-v1.yaml72
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,kpss-gcc.yaml88
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,lcc.txt22
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,lcc.yaml86
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,lpasscc.txt26
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,mmcc.yaml290
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,msm8996-apcc.yaml17
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,msm8996-cbf.yaml53
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,msm8998-gpucc.yaml8
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,q6sstopcc.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,qcm2290-dispcc.yaml87
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,qdu1000-gcc.yaml51
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,rpmcc.txt62
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,rpmcc.yaml160
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,rpmhcc.yaml9
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sa8775p-gcc.yaml84
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7180-camcc.yaml9
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7180-dispcc.yaml8
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7180-lpasscorecc.yaml9
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7180-mss.yaml7
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7280-camcc.yaml71
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7280-dispcc.yaml8
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscc.yaml72
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscorecc.yaml192
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sdm845-camcc.yaml65
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sdm845-dispcc.yaml8
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sdm845-lpasscc.yaml47
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6115-dispcc.yaml69
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml58
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml64
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml49
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml54
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml51
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml60
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml52
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml81
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm8450-dispcc.yaml97
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm8550-dispcc.yaml105
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm8550-gcc.yaml62
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml55
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.txt59
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.yaml71
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,videocc.yaml90
-rw-r--r--Documentation/devicetree/bindings/clock/qoriq-clock.txt1
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,9series.yaml103
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,cpg-div6-clock.yaml12
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml6
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt24
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,h8s2678-pll-clock.txt23
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.yaml13
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,rcar-usb2-clock-sel.yaml6
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,rzg2l-cpg.yaml23
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,versaclock7.yaml64
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,px30-cru.txt70
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,px30-cru.yaml119
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt56
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.yaml72
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.txt58
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.yaml76
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt61
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.yaml78
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt58
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.yaml74
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt67
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.yaml85
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.txt60
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.yaml76
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.txt61
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.yaml78
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3399-cru.yaml35
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3568-cru.yaml15
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rk3588-cru.yaml71
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt59
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.yaml75
-rw-r--r--Documentation/devicetree/bindings/clock/rockchip,rv1126-cru.yaml62
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos-audss-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos-ext-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos4412-isp-clock.yaml3
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos5260-clock.yaml382
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos5410-clock.yaml66
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos5433-clock.yaml524
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos7-clock.yaml272
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml193
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml311
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,exynosautov9-clock.yaml263
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s2mps11.txt49
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s2mps11.yaml44
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s3c2410-clock.txt49
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s3c2412-clock.txt49
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s3c2443-clock.txt55
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s5pv210-audss-clock.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt77
-rw-r--r--Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.yaml79
-rw-r--r--Documentation/devicetree/bindings/clock/sifive/fu540-prci.yaml1
-rw-r--r--Documentation/devicetree/bindings/clock/sifive/fu740-prci.yaml4
-rw-r--r--Documentation/devicetree/bindings/clock/silabs,si5351.txt2
-rw-r--r--Documentation/devicetree/bindings/clock/skyworks,si521xx.yaml59
-rw-r--r--Documentation/devicetree/bindings/clock/socionext,uniphier-clock.yaml45
-rw-r--r--Documentation/devicetree/bindings/clock/sprd,sc9863a-clk.yaml6
-rw-r--r--Documentation/devicetree/bindings/clock/sprd,ums512-clk.yaml71
-rw-r--r--Documentation/devicetree/bindings/clock/st,stm32mp1-rcc.yaml48
-rw-r--r--Documentation/devicetree/bindings/clock/st/st,flexgen.txt2
-rw-r--r--Documentation/devicetree/bindings/clock/starfive,jh7100-audclk.yaml57
-rw-r--r--Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml56
-rw-r--r--Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml107
-rw-r--r--Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml104
-rw-r--r--Documentation/devicetree/bindings/clock/stericsson,u8500-clks.yaml178
-rw-r--r--Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml52
-rw-r--r--Documentation/devicetree/bindings/clock/tesla,fsd-clock.yaml198
-rw-r--r--Documentation/devicetree/bindings/clock/ti,am654-ehrpwm-tbclk.yaml1
-rw-r--r--Documentation/devicetree/bindings/clock/ti,cdce925.txt53
-rw-r--r--Documentation/devicetree/bindings/clock/ti,cdce925.yaml103
-rw-r--r--Documentation/devicetree/bindings/clock/ti,lmk04832.yaml4
-rw-r--r--Documentation/devicetree/bindings/clock/ti,sci-clk.yaml2
-rw-r--r--Documentation/devicetree/bindings/clock/ti-clkctrl.txt4
-rw-r--r--Documentation/devicetree/bindings/clock/ti/clockdomain.txt3
-rw-r--r--Documentation/devicetree/bindings/clock/ti/composite.txt3
-rw-r--r--Documentation/devicetree/bindings/clock/ti/davinci/pll.txt2
-rw-r--r--Documentation/devicetree/bindings/clock/ti/dra7-atl.txt2
-rw-r--r--Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt1
-rw-r--r--Documentation/devicetree/bindings/clock/ti/gate.txt3
-rw-r--r--Documentation/devicetree/bindings/clock/ti/interface.txt3
-rw-r--r--Documentation/devicetree/bindings/clock/ti/mux.txt1
-rw-r--r--Documentation/devicetree/bindings/clock/ti/ti,clksel.yaml51
-rw-r--r--Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml57
-rw-r--r--Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml52
-rw-r--r--Documentation/devicetree/bindings/clock/ux500.txt64
-rw-r--r--Documentation/devicetree/bindings/clock/xlnx,clocking-wizard.yaml77
-rw-r--r--Documentation/devicetree/bindings/connector/usb-connector.yaml155
-rw-r--r--Documentation/devicetree/bindings/counter/ti,am62-ecap-capture.yaml61
-rw-r--r--Documentation/devicetree/bindings/cpu/cpu-capacity.txt (renamed from Documentation/devicetree/bindings/arm/cpu-capacity.txt)8
-rw-r--r--Documentation/devicetree/bindings/cpu/idle-states.yaml855
-rw-r--r--Documentation/devicetree/bindings/cpufreq/apple,cluster-cpufreq.yaml117
-rw-r--r--Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt2
-rw-r--r--Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek-hw.yaml2
-rw-r--r--Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt7
-rw-r--r--Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt172
-rw-r--r--Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml362
-rw-r--r--Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml204
-rw-r--r--Documentation/devicetree/bindings/crypto/allwinner,sun4i-a10-crypto.yaml12
-rw-r--r--Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml34
-rw-r--r--Documentation/devicetree/bindings/crypto/aspeed,ast2500-hace.yaml53
-rw-r--r--Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml49
-rw-r--r--Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-aes.yaml66
-rw-r--r--Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-sha.yaml60
-rw-r--r--Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-tdes.yaml64
-rw-r--r--Documentation/devicetree/bindings/crypto/atmel-crypto.txt68
-rw-r--r--Documentation/devicetree/bindings/crypto/fsl,sec-v4.0-mon.yaml156
-rw-r--r--Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml266
-rw-r--r--Documentation/devicetree/bindings/crypto/fsl-sec4.txt553
-rw-r--r--Documentation/devicetree/bindings/crypto/intel,ixp4xx-crypto.yaml15
-rw-r--r--Documentation/devicetree/bindings/crypto/intel,keembay-ocs-aes.yaml2
-rw-r--r--Documentation/devicetree/bindings/crypto/intel,keembay-ocs-ecc.yaml47
-rw-r--r--Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml2
-rw-r--r--Documentation/devicetree/bindings/crypto/qcom,inline-crypto-engine.yaml42
-rw-r--r--Documentation/devicetree/bindings/crypto/qcom,prng.txt19
-rw-r--r--Documentation/devicetree/bindings/crypto/qcom,prng.yaml43
-rw-r--r--Documentation/devicetree/bindings/crypto/qcom-qce.txt25
-rw-r--r--Documentation/devicetree/bindings/crypto/qcom-qce.yaml123
-rw-r--r--Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml127
-rw-r--r--Documentation/devicetree/bindings/crypto/rockchip-crypto.txt28
-rw-r--r--Documentation/devicetree/bindings/crypto/samsung-slimsss.yaml1
-rw-r--r--Documentation/devicetree/bindings/crypto/st,stm32-crc.yaml4
-rw-r--r--Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml23
-rw-r--r--Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml27
-rw-r--r--Documentation/devicetree/bindings/crypto/ti,sa2ul.yaml18
-rw-r--r--Documentation/devicetree/bindings/crypto/xlnx,zynqmp-aes.yaml2
-rw-r--r--Documentation/devicetree/bindings/ddr/lpddr2-timings.txt52
-rw-r--r--Documentation/devicetree/bindings/ddr/lpddr2.txt102
-rw-r--r--Documentation/devicetree/bindings/ddr/lpddr3-timings.txt58
-rw-r--r--Documentation/devicetree/bindings/ddr/lpddr3.txt106
-rw-r--r--Documentation/devicetree/bindings/devfreq/event/samsung,exynos-nocp.yaml2
-rw-r--r--Documentation/devicetree/bindings/devfreq/event/samsung,exynos-ppmu.yaml2
-rw-r--r--Documentation/devicetree/bindings/devfreq/exynos-bus.txt488
-rw-r--r--Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt212
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-backend.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml6
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-frontend.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun4i-a10-hdmi.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tv-encoder.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun6i-a31-drc.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml32
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-de2-mixer.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-dw-hdmi.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-hdmi-phy.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun8i-r40-tcon-top.yaml135
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun9i-a80-deu.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/amlogic,meson-dw-hdmi.yaml12
-rw-r--r--Documentation/devicetree/bindings/display/amlogic,meson-vpu.yaml12
-rw-r--r--Documentation/devicetree/bindings/display/arm,hdlcd.txt79
-rw-r--r--Documentation/devicetree/bindings/display/arm,hdlcd.yaml89
-rw-r--r--Documentation/devicetree/bindings/display/arm,komeda.txt78
-rw-r--r--Documentation/devicetree/bindings/display/arm,komeda.yaml131
-rw-r--r--Documentation/devicetree/bindings/display/arm,malidp.txt68
-rw-r--r--Documentation/devicetree/bindings/display/arm,malidp.yaml119
-rw-r--r--Documentation/devicetree/bindings/display/arm,pl11x.txt110
-rw-r--r--Documentation/devicetree/bindings/display/arm,pl11x.yaml170
-rw-r--r--Documentation/devicetree/bindings/display/atmel,lcdc.txt1
-rw-r--r--Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/brcm,bcm2835-dsi0.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/brcm,bcm2835-hdmi.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/brcm,bcm2835-v3d.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/brcm,bcm2835-vec.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/bridge/adi,adv7511.yaml19
-rw-r--r--Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml14
-rw-r--r--Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml91
-rw-r--r--Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/bridge/analogix,dp.yaml63
-rw-r--r--Documentation/devicetree/bindings/display/bridge/analogix_dp.txt51
-rw-r--r--Documentation/devicetree/bindings/display/bridge/anx6345.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/bridge/cdns,dsi.txt112
-rw-r--r--Documentation/devicetree/bindings/display/bridge/cdns,dsi.yaml180
-rw-r--r--Documentation/devicetree/bindings/display/bridge/cdns,mhdp8546.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/bridge/chipone,icn6211.yaml28
-rw-r--r--Documentation/devicetree/bindings/display/bridge/chrontel,ch7033.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-ldb.yaml173
-rw-r--r--Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-combiner.yaml144
-rw-r--r--Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-link.yaml144
-rw-r--r--Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pxl2dpi.yaml108
-rw-r--r--Documentation/devicetree/bindings/display/bridge/fsl,ldb.yaml119
-rw-r--r--Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ingenic,jz4780-hdmi.yaml81
-rw-r--r--Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml70
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ite,it66121.yaml11
-rw-r--r--Documentation/devicetree/bindings/display/bridge/lontium,lt8912b.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/bridge/lontium,lt9211.yaml117
-rw-r--r--Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml54
-rw-r--r--Documentation/devicetree/bindings/display/bridge/nxp,ptn3460.yaml106
-rw-r--r--Documentation/devicetree/bindings/display/bridge/nxp,tda998x.yaml109
-rw-r--r--Documentation/devicetree/bindings/display/bridge/parade,ps8622.yaml115
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ps8622.txt31
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ps8640.yaml25
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ptn3460.txt39
-rw-r--r--Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml119
-rw-r--r--Documentation/devicetree/bindings/display/bridge/renesas,dsi.yaml183
-rw-r--r--Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/bridge/samsung,mipi-dsim.yaml255
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sii902x.txt78
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sii9234.txt49
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sil,sii8620.yaml108
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sil,sii9022.yaml131
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sil,sii9234.yaml110
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sil-sii8620.txt33
-rw-r--r--Documentation/devicetree/bindings/display/bridge/snps,dw-mipi-dsi.yaml18
-rw-r--r--Documentation/devicetree/bindings/display/bridge/synopsys,dw-hdmi.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/bridge/tda998x.txt54
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ti,dlpc3433.yaml117
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml6
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358762.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt35
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.yaml89
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt54
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.yaml174
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358768.yaml10
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/dp-aux-bus.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/dsi-controller.yaml18
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos-mic.txt51
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt60
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos7-decon.txt65
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos_dp.txt2
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt90
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt64
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos_hdmiddc.txt15
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos_hdmiphy.txt15
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos_mixer.txt26
-rw-r--r--Documentation/devicetree/bindings/display/exynos/samsung-fimd.txt107
-rw-r--r--Documentation/devicetree/bindings/display/fsl,lcdif.yaml43
-rw-r--r--Documentation/devicetree/bindings/display/ilitek,ili9341.txt27
-rw-r--r--Documentation/devicetree/bindings/display/ilitek,ili9486.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt57
-rw-r--r--Documentation/devicetree/bindings/display/imx/fsl,imx-lcdc.yaml146
-rw-r--r--Documentation/devicetree/bindings/display/imx/nxp,imx8mq-dcss.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/ingenic,ipu.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/ingenic,lcd.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/intel,keembay-display.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/lvds.yaml (renamed from Documentation/devicetree/bindings/display/panel/lvds.yaml)35
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml90
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml85
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,cec.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml93
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,disp.txt219
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml86
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml116
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml16
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml80
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.txt56
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.yaml116
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml182
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml86
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi-ddc.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi.yaml9
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,mdp-rdma.yaml88
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml103
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml58
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml91
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml103
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml83
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml117
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml65
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml68
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml86
-rw-r--r--Documentation/devicetree/bindings/display/msm/dp-controller.yaml103
-rw-r--r--Documentation/devicetree/bindings/display/msm/dpu-common.yaml56
-rw-r--r--Documentation/devicetree/bindings/display/msm/dpu-sc7180.yaml228
-rw-r--r--Documentation/devicetree/bindings/display/msm/dpu-sdm845.yaml212
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-controller-main.yaml300
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml37
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-phy-14nm.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-phy-20nm.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-phy-28nm.yaml7
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-phy-7nm.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi-phy-common.yaml9
-rw-r--r--Documentation/devicetree/bindings/display/msm/edp.txt56
-rw-r--r--Documentation/devicetree/bindings/display/msm/gmu.yaml172
-rw-r--r--Documentation/devicetree/bindings/display/msm/gpu.txt157
-rw-r--r--Documentation/devicetree/bindings/display/msm/gpu.yaml294
-rw-r--r--Documentation/devicetree/bindings/display/msm/hdmi.txt99
-rw-r--r--Documentation/devicetree/bindings/display/msm/hdmi.yaml232
-rw-r--r--Documentation/devicetree/bindings/display/msm/mdp4.txt114
-rw-r--r--Documentation/devicetree/bindings/display/msm/mdp4.yaml124
-rw-r--r--Documentation/devicetree/bindings/display/msm/mdp5.txt160
-rw-r--r--Documentation/devicetree/bindings/display/msm/mdss-common.yaml90
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml156
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,mdss.yaml211
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,msm8998-dpu.yaml101
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,msm8998-mdss.yaml272
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,qcm2290-dpu.yaml90
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,qcm2290-mdss.yaml200
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sc7180-dpu.yaml101
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sc7180-mdss.yaml308
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sc7280-dpu.yaml105
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sc7280-mdss.yaml427
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-dpu.yaml122
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-mdss.yaml151
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sdm845-dpu.yaml96
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sdm845-mdss.yaml280
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm6115-dpu.yaml93
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm6115-mdss.yaml187
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8150-dpu.yaml92
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml332
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8250-dpu.yaml99
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8250-mdss.yaml334
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8350-dpu.yaml120
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8350-mdss.yaml223
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8450-dpu.yaml139
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8450-mdss.yaml345
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8550-dpu.yaml133
-rw-r--r--Documentation/devicetree/bindings/display/msm/qcom,sm8550-mdss.yaml333
-rw-r--r--Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/advantech,idk-1110wr.yaml19
-rw-r--r--Documentation/devicetree/bindings/display/panel/arm,rtsm-display.yaml27
-rw-r--r--Documentation/devicetree/bindings/display/panel/arm,versatile-tft-panel.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/panel/auo,a030jtn01.yaml60
-rw-r--r--Documentation/devicetree/bindings/display/panel/boe,bf060y8m-aj0.yaml81
-rw-r--r--Documentation/devicetree/bindings/display/panel/boe,tv101wum-nl6.yaml10
-rw-r--r--Documentation/devicetree/bindings/display/panel/display-timings.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/ebbg,ft8719.yaml74
-rw-r--r--Documentation/devicetree/bindings/display/panel/elida,kd35t133.yaml9
-rw-r--r--Documentation/devicetree/bindings/display/panel/feiyang,fy07024di26a30d.yaml9
-rw-r--r--Documentation/devicetree/bindings/display/panel/focaltech,gpt3.yaml56
-rw-r--r--Documentation/devicetree/bindings/display/panel/himax,hx8394.yaml76
-rw-r--r--Documentation/devicetree/bindings/display/panel/ilitek,ili9163.yaml70
-rw-r--r--Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml51
-rw-r--r--Documentation/devicetree/bindings/display/panel/ilitek,ili9881c.yaml6
-rw-r--r--Documentation/devicetree/bindings/display/panel/innolux,ee101ia-01d.yaml23
-rw-r--r--Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.yaml43
-rw-r--r--Documentation/devicetree/bindings/display/panel/jadard,jd9365da-h3.yaml72
-rw-r--r--Documentation/devicetree/bindings/display/panel/jdi,lt070me05000.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml7
-rw-r--r--Documentation/devicetree/bindings/display/panel/leadtek,ltk035c5444t.yaml61
-rw-r--r--Documentation/devicetree/bindings/display/panel/leadtek,ltk050h3146w.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/lg,lg4573.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/panel/mitsubishi,aa104xd12.yaml19
-rw-r--r--Documentation/devicetree/bindings/display/panel/mitsubishi,aa121td01.yaml19
-rw-r--r--Documentation/devicetree/bindings/display/panel/nec,nl8048hl11.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/newvision,nv3051d.yaml63
-rw-r--r--Documentation/devicetree/bindings/display/panel/novatek,nt35950.yaml106
-rw-r--r--Documentation/devicetree/bindings/display/panel/novatek,nt36523.yaml85
-rw-r--r--Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/panel/olimex,lcd-olinuxino.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/orisetech,otm8009a.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-edp.yaml188
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-lvds.yaml57
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-mipi-dbi-spi.yaml134
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-simple-dsi.yaml26
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-simple.yaml29
-rw-r--r--Documentation/devicetree/bindings/display/panel/panel-timing.yaml95
-rw-r--r--Documentation/devicetree/bindings/display/panel/raydium,rm67191.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/raydium,rm68200.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/ronbo,rb070d30.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,ams495qa01.yaml57
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml10
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,lms380kf01.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,lms397kf04.yaml3
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,s6d27a1.yaml98
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,s6e88a0-ams452ef01.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,s6e8aa0.yaml1
-rw-r--r--Documentation/devicetree/bindings/display/panel/seiko,43wvf1g.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/panel/sgd,gktw70sdae4se.yaml21
-rw-r--r--Documentation/devicetree/bindings/display/panel/sharp,lq101r1sx01.yaml7
-rw-r--r--Documentation/devicetree/bindings/display/panel/sharp,ls060t1sx01.yaml56
-rw-r--r--Documentation/devicetree/bindings/display/panel/sitronix,st7701.yaml14
-rw-r--r--Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml12
-rw-r--r--Documentation/devicetree/bindings/display/panel/sony,acx424akp.yaml11
-rw-r--r--Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml82
-rw-r--r--Documentation/devicetree/bindings/display/panel/sony,tulip-truly-nt35521.yaml72
-rw-r--r--Documentation/devicetree/bindings/display/panel/tpo,td.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/panel/tpo,tpg110.yaml1
-rw-r--r--Documentation/devicetree/bindings/display/panel/visionox,rm69299.yaml27
-rw-r--r--Documentation/devicetree/bindings/display/panel/visionox,vtdr6130.yaml63
-rw-r--r--Documentation/devicetree/bindings/display/panel/xinpeng,xpp055c272.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/renesas,du.yaml88
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/analogix_dp-rockchip.txt98
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt93
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml103
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip,dw-hdmi.yaml46
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip,dw-mipi-dsi.yaml166
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip,lvds.yaml170
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml8
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip-drm.yaml2
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip-lvds.txt92
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip-vop2.yaml146
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi-ddc.yaml41
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi.yaml226
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,exynos-mixer.yaml142
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-decon.yaml145
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-mic.yaml93
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,exynos7-decon.yaml119
-rw-r--r--Documentation/devicetree/bindings/display/samsung/samsung,fimd.yaml197
-rw-r--r--Documentation/devicetree/bindings/display/simple-framebuffer.yaml43
-rw-r--r--Documentation/devicetree/bindings/display/sitronix,st7735r.yaml9
-rw-r--r--Documentation/devicetree/bindings/display/solomon,ssd1307fb.yaml99
-rw-r--r--Documentation/devicetree/bindings/display/sprd/sprd,display-subsystem.yaml65
-rw-r--r--Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dpu.yaml77
-rw-r--r--Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dsi-host.yaml88
-rw-r--r--Documentation/devicetree/bindings/display/st,stm32-dsi.yaml24
-rw-r--r--Documentation/devicetree/bindings/display/st,stm32-ltdc.yaml5
-rw-r--r--Documentation/devicetree/bindings/display/ste,mcde.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.txt41
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.yaml74
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-dpaux.yaml151
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-sor.yaml197
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-vic.yaml72
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dc.yaml85
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-display.yaml308
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dsi-padctl.yaml45
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dc.yaml182
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dsi.yaml158
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-epp.yaml69
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr2d.yaml73
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr3d.yaml213
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-hdmi.yaml125
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt622
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.yaml430
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-isp.yaml67
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-mpe.yaml70
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-tvo.yaml57
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-vi.yaml162
-rw-r--r--Documentation/devicetree/bindings/display/tegra/nvidia,tegra210-csi.yaml52
-rw-r--r--Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml7
-rw-r--r--Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml4
-rw-r--r--Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt4
-rw-r--r--Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml71
-rw-r--r--Documentation/devicetree/bindings/display/xylon,logicvc-display.yaml301
-rw-r--r--Documentation/devicetree/bindings/dma/allwinner,sun4i-a10-dma.yaml4
-rw-r--r--Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml15
-rw-r--r--Documentation/devicetree/bindings/dma/allwinner,sun6i-a31-dma.yaml4
-rw-r--r--Documentation/devicetree/bindings/dma/altr,msgdma.yaml4
-rw-r--r--Documentation/devicetree/bindings/dma/apple,admac.yaml91
-rw-r--r--Documentation/devicetree/bindings/dma/arm,pl330.yaml92
-rw-r--r--Documentation/devicetree/bindings/dma/arm-pl08x.yaml6
-rw-r--r--Documentation/devicetree/bindings/dma/arm-pl330.txt49
-rw-r--r--Documentation/devicetree/bindings/dma/dma-common.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/dma-controller.yaml12
-rw-r--r--Documentation/devicetree/bindings/dma/dma-router.yaml6
-rw-r--r--Documentation/devicetree/bindings/dma/fsl,edma.yaml155
-rw-r--r--Documentation/devicetree/bindings/dma/fsl,imx-sdma.yaml149
-rw-r--r--Documentation/devicetree/bindings/dma/fsl,mxs-dma.yaml80
-rw-r--r--Documentation/devicetree/bindings/dma/fsl-edma.txt111
-rw-r--r--Documentation/devicetree/bindings/dma/fsl-imx-dma.txt8
-rw-r--r--Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt118
-rw-r--r--Documentation/devicetree/bindings/dma/fsl-mxs-dma.txt60
-rw-r--r--Documentation/devicetree/bindings/dma/ingenic,dma.yaml47
-rw-r--r--Documentation/devicetree/bindings/dma/intel,ldma.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/mediatek,uart-dma.yaml124
-rw-r--r--Documentation/devicetree/bindings/dma/mmp-dma.txt10
-rw-r--r--Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt4
-rw-r--r--Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt56
-rw-r--r--Documentation/devicetree/bindings/dma/nvidia,tegra186-gpc-dma.yaml117
-rw-r--r--Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml6
-rw-r--r--Documentation/devicetree/bindings/dma/owl-dma.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/qcom,adm.yaml99
-rw-r--r--Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml100
-rw-r--r--Documentation/devicetree/bindings/dma/qcom,gpi.yaml29
-rw-r--r--Documentation/devicetree/bindings/dma/qcom_adm.txt61
-rw-r--r--Documentation/devicetree/bindings/dma/qcom_bam_dma.txt50
-rw-r--r--Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml10
-rw-r--r--Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml20
-rw-r--r--Documentation/devicetree/bindings/dma/renesas,rzn1-dmamux.yaml51
-rw-r--r--Documentation/devicetree/bindings/dma/renesas,usb-dmac.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml29
-rw-r--r--Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml10
-rw-r--r--Documentation/devicetree/bindings/dma/snps,dw-axi-dmac.yaml70
-rw-r--r--Documentation/devicetree/bindings/dma/socionext,uniphier-mio-dmac.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/socionext,uniphier-xdmac.yaml2
-rw-r--r--Documentation/devicetree/bindings/dma/sprd-dma.txt7
-rw-r--r--Documentation/devicetree/bindings/dma/st,stm32-dma.yaml6
-rw-r--r--Documentation/devicetree/bindings/dma/st,stm32-dmamux.yaml9
-rw-r--r--Documentation/devicetree/bindings/dma/st,stm32-mdma.yaml7
-rw-r--r--Documentation/devicetree/bindings/dma/ste-dma40.txt138
-rw-r--r--Documentation/devicetree/bindings/dma/stericsson,dma40.yaml159
-rw-r--r--Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt2
-rw-r--r--Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml78
-rw-r--r--Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml3
-rw-r--r--Documentation/devicetree/bindings/dma/ti/k3-udma.yaml13
-rw-r--r--Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt6
-rw-r--r--Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dma-1.0.yaml85
-rw-r--r--Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml4
-rw-r--r--Documentation/devicetree/bindings/dma/xilinx/zynqmp_dma.txt26
-rw-r--r--Documentation/devicetree/bindings/dsp/fsl,dsp.yaml124
-rw-r--r--Documentation/devicetree/bindings/dsp/mediatek,mt8186-dsp.yaml93
-rw-r--r--Documentation/devicetree/bindings/dsp/mediatek,mt8195-dsp.yaml105
-rw-r--r--Documentation/devicetree/bindings/dvfs/performance-domain.yaml15
-rw-r--r--Documentation/devicetree/bindings/edac/dmc-520.yaml2
-rw-r--r--Documentation/devicetree/bindings/eeprom/at24.txt1
-rw-r--r--Documentation/devicetree/bindings/eeprom/at24.yaml34
-rw-r--r--Documentation/devicetree/bindings/eeprom/at25.yaml8
-rw-r--r--Documentation/devicetree/bindings/eeprom/microchip,93lc46b.yaml70
-rw-r--r--Documentation/devicetree/bindings/example-schema.yaml46
-rw-r--r--Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml4
-rw-r--r--Documentation/devicetree/bindings/extcon/extcon-usbc-tusb320.yaml6
-rw-r--r--Documentation/devicetree/bindings/extcon/maxim,max77843.yaml40
-rw-r--r--Documentation/devicetree/bindings/extcon/siliconmitus,sm5502-muic.yaml5
-rw-r--r--Documentation/devicetree/bindings/firmware/amlogic,meson-gxbb-sm.yaml39
-rw-r--r--Documentation/devicetree/bindings/firmware/arm,scmi.yaml178
-rw-r--r--Documentation/devicetree/bindings/firmware/arm,scpi.yaml8
-rw-r--r--Documentation/devicetree/bindings/firmware/fsl,scu.yaml215
-rw-r--r--Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml35
-rw-r--r--Documentation/devicetree/bindings/firmware/meson/meson_sm.txt15
-rw-r--r--Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.txt107
-rw-r--r--Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.yaml186
-rw-r--r--Documentation/devicetree/bindings/firmware/qcom,scm.txt52
-rw-r--r--Documentation/devicetree/bindings/firmware/qcom,scm.yaml212
-rw-r--r--Documentation/devicetree/bindings/firmware/qemu,fw-cfg-mmio.yaml54
-rw-r--r--Documentation/devicetree/bindings/fpga/fpga-region.txt2
-rw-r--r--Documentation/devicetree/bindings/fpga/lattice,sysconfig.yaml81
-rw-r--r--Documentation/devicetree/bindings/fpga/microchip,mpf-spi-fpga-mgr.yaml45
-rw-r--r--Documentation/devicetree/bindings/fpga/xilinx-pr-decoupler.txt54
-rw-r--r--Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt51
-rw-r--r--Documentation/devicetree/bindings/fpga/xilinx-zynq-fpga-mgr.yaml2
-rw-r--r--Documentation/devicetree/bindings/fpga/xlnx,fpga-slave-serial.yaml80
-rw-r--r--Documentation/devicetree/bindings/fpga/xlnx,pr-decoupler.yaml64
-rw-r--r--Documentation/devicetree/bindings/fpga/xlnx,zynqmp-pcap-fpga.yaml2
-rw-r--r--Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.txt42
-rw-r--r--Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.yaml88
-rw-r--r--Documentation/devicetree/bindings/gnss/brcm,bcm4751.yaml69
-rw-r--r--Documentation/devicetree/bindings/gnss/gnss-common.yaml55
-rw-r--r--Documentation/devicetree/bindings/gnss/gnss.txt37
-rw-r--r--Documentation/devicetree/bindings/gnss/mediatek.txt35
-rw-r--r--Documentation/devicetree/bindings/gnss/mediatek.yaml59
-rw-r--r--Documentation/devicetree/bindings/gnss/sirfstar.txt46
-rw-r--r--Documentation/devicetree/bindings/gnss/sirfstar.yaml76
-rw-r--r--Documentation/devicetree/bindings/gnss/u-blox,neo-6m.yaml58
-rw-r--r--Documentation/devicetree/bindings/gnss/u-blox.txt45
-rw-r--r--Documentation/devicetree/bindings/gpio/airoha,en7523-gpio.yaml66
-rw-r--r--Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt83
-rw-r--r--Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml104
-rw-r--r--Documentation/devicetree/bindings/gpio/delta,tn48m-gpio.yaml39
-rw-r--r--Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml8
-rw-r--r--Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt27
-rw-r--r--Documentation/devicetree/bindings/gpio/faraday,ftgpio010.yaml65
-rw-r--r--Documentation/devicetree/bindings/gpio/fcs,fxl6408.yaml58
-rw-r--r--Documentation/devicetree/bindings/gpio/fsl,imx8qxp-sc-gpio.yaml39
-rw-r--r--Documentation/devicetree/bindings/gpio/fsl-imx-gpio.yaml2
-rw-r--r--Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.txt20
-rw-r--r--Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.yaml50
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-altera.txt5
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-axp209.txt75
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-consumer-common.yaml64
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-davinci.yaml2
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-eic-sprd.txt97
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-latch.yaml94
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-mvebu.txt93
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-mvebu.yaml146
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml4
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-pca95xx.yaml114
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-pisosr.txt2
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-samsung.txt41
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-sprd.txt28
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-stmpe.txt3
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-tpic2810.txt16
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-tpic2810.yaml51
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-vf610.yaml5
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-xilinx.txt48
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-xlp.txt49
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-zynq.yaml61
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio.txt43
-rw-r--r--Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml56
-rw-r--r--Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml126
-rw-r--r--Documentation/devicetree/bindings/gpio/loongson,ls1x-gpio.yaml49
-rw-r--r--Documentation/devicetree/bindings/gpio/microchip,mpfs-gpio.yaml97
-rw-r--r--Documentation/devicetree/bindings/gpio/mstar,msc313-gpio.yaml4
-rw-r--r--Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.txt165
-rw-r--r--Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml214
-rw-r--r--Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.txt40
-rw-r--r--Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.yaml110
-rw-r--r--Documentation/devicetree/bindings/gpio/nxp,pcf8575.yaml4
-rw-r--r--Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml34
-rw-r--r--Documentation/devicetree/bindings/gpio/renesas,rcar-gpio.yaml6
-rw-r--r--Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml4
-rw-r--r--Documentation/devicetree/bindings/gpio/sifive,gpio.yaml10
-rw-r--r--Documentation/devicetree/bindings/gpio/socionext,uniphier-gpio.yaml17
-rw-r--r--Documentation/devicetree/bindings/gpio/sprd,gpio-eic.yaml124
-rw-r--r--Documentation/devicetree/bindings/gpio/sprd,gpio.yaml75
-rw-r--r--Documentation/devicetree/bindings/gpio/ti,omap-gpio.yaml2
-rw-r--r--Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml1
-rw-r--r--Documentation/devicetree/bindings/gpio/x-powers,axp209-gpio.yaml62
-rw-r--r--Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml154
-rw-r--r--Documentation/devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml43
-rw-r--r--Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml169
-rw-r--r--Documentation/devicetree/bindings/gpu/arm,mali-midgard.yaml3
-rw-r--r--Documentation/devicetree/bindings/gpu/arm,mali-utgard.yaml3
-rw-r--r--Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml3
-rw-r--r--Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvdec.yaml106
-rw-r--r--Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvenc.yaml135
-rw-r--r--Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvjpg.yaml94
-rw-r--r--Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra234-nvdec.yaml156
-rw-r--r--Documentation/devicetree/bindings/gpu/samsung-rotator.yaml1
-rw-r--r--Documentation/devicetree/bindings/gpu/vivante,gc.yaml2
-rw-r--r--Documentation/devicetree/bindings/h8300/cpu.txt13
-rw-r--r--Documentation/devicetree/bindings/hwinfo/loongson,ls2k-chipid.yaml38
-rw-r--r--Documentation/devicetree/bindings/hwinfo/renesas,prr.yaml (renamed from Documentation/devicetree/bindings/arm/renesas,prr.yaml)4
-rw-r--r--Documentation/devicetree/bindings/hwinfo/samsung,exynos-chipid.yaml41
-rw-r--r--Documentation/devicetree/bindings/hwinfo/samsung,s5pv210-chipid.yaml30
-rw-r--r--Documentation/devicetree/bindings/hwinfo/ti,k3-socinfo.yaml40
-rw-r--r--Documentation/devicetree/bindings/hwlock/allwinner,sun6i-a31-hwspinlock.yaml5
-rw-r--r--Documentation/devicetree/bindings/hwlock/qcom-hwspinlock.yaml29
-rw-r--r--Documentation/devicetree/bindings/hwlock/st,stm32-hwspinlock.yaml5
-rw-r--r--Documentation/devicetree/bindings/hwlock/ti,omap-hwspinlock.yaml33
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml13
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,adm1266.yaml6
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml68
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,axi-fan-control.yaml22
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,ltc2945.yaml49
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,ltc2947.yaml20
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,ltc2992.yaml29
-rw-r--r--Documentation/devicetree/bindings/hwmon/adi,max31760.yaml42
-rw-r--r--Documentation/devicetree/bindings/hwmon/adt7475.yaml25
-rw-r--r--Documentation/devicetree/bindings/hwmon/amd,sbrmi.yaml6
-rw-r--r--Documentation/devicetree/bindings/hwmon/amd,sbtsi.yaml6
-rw-r--r--Documentation/devicetree/bindings/hwmon/dps650ab.txt11
-rw-r--r--Documentation/devicetree/bindings/hwmon/hih6130.txt12
-rw-r--r--Documentation/devicetree/bindings/hwmon/hpe,gxp-fan-ctrl.yaml45
-rw-r--r--Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt26
-rw-r--r--Documentation/devicetree/bindings/hwmon/ibm,occ-hwmon.yaml39
-rw-r--r--Documentation/devicetree/bindings/hwmon/ibm,p8-occ-hwmon.txt (renamed from Documentation/devicetree/bindings/i2c/ibm,p8-occ-hwmon.txt)0
-rw-r--r--Documentation/devicetree/bindings/hwmon/iio-hwmon.yaml37
-rw-r--r--Documentation/devicetree/bindings/hwmon/jc42.txt46
-rw-r--r--Documentation/devicetree/bindings/hwmon/jedec,jc42.yaml78
-rw-r--r--Documentation/devicetree/bindings/hwmon/lltc,ltc4151.yaml41
-rw-r--r--Documentation/devicetree/bindings/hwmon/lm70.txt22
-rw-r--r--Documentation/devicetree/bindings/hwmon/lm75.yaml1
-rw-r--r--Documentation/devicetree/bindings/hwmon/lm90.txt51
-rw-r--r--Documentation/devicetree/bindings/hwmon/ltc4151.txt18
-rw-r--r--Documentation/devicetree/bindings/hwmon/mcp3021.txt21
-rw-r--r--Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml53
-rw-r--r--Documentation/devicetree/bindings/hwmon/microchip,mcp3021.yaml43
-rw-r--r--Documentation/devicetree/bindings/hwmon/microchip,sparx5-temp.yaml4
-rw-r--r--Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml99
-rw-r--r--Documentation/devicetree/bindings/hwmon/national,lm90.yaml227
-rw-r--r--Documentation/devicetree/bindings/hwmon/ntc-thermistor.yaml141
-rw-r--r--Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt44
-rw-r--r--Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml57
-rw-r--r--Documentation/devicetree/bindings/hwmon/nuvoton,nct7802.yaml145
-rw-r--r--Documentation/devicetree/bindings/hwmon/nxp,mc34vr500.yaml36
-rw-r--r--Documentation/devicetree/bindings/hwmon/pmbus/ti,lm25066.yaml54
-rw-r--r--Documentation/devicetree/bindings/hwmon/pwm-fan.txt68
-rw-r--r--Documentation/devicetree/bindings/hwmon/pwm-fan.yaml97
-rw-r--r--Documentation/devicetree/bindings/hwmon/sensirion,sht15.yaml43
-rw-r--r--Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml8
-rw-r--r--Documentation/devicetree/bindings/hwmon/sht15.txt19
-rw-r--r--Documentation/devicetree/bindings/hwmon/starfive,jh71x0-temp.yaml70
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml27
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml47
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp108.yaml50
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml104
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp421.yaml109
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp464.yaml113
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml22
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tps23861.yaml16
-rw-r--r--Documentation/devicetree/bindings/hwmon/tmp108.txt18
-rw-r--r--Documentation/devicetree/bindings/hwmon/vexpress.txt2
-rw-r--r--Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml4
-rw-r--r--Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml6
-rw-r--r--Documentation/devicetree/bindings/i2c/apple,i2c.yaml67
-rw-r--r--Documentation/devicetree/bindings/i2c/arm,i2c-versatile.yaml29
-rw-r--r--Documentation/devicetree/bindings/i2c/aspeed,i2c.yaml5
-rw-r--r--Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml146
-rw-r--r--Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt22
-rw-r--r--Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.yaml54
-rw-r--r--Documentation/devicetree/bindings/i2c/brcm,kona-i2c.txt35
-rw-r--r--Documentation/devicetree/bindings/i2c/brcm,kona-i2c.yaml59
-rw-r--r--Documentation/devicetree/bindings/i2c/cdns,i2c-r1p10.yaml18
-rw-r--r--Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml4
-rw-r--r--Documentation/devicetree/bindings/i2c/hisilicon,ascend910-i2c.yaml73
-rw-r--r--Documentation/devicetree/bindings/i2c/hpe,gxp-i2c.yaml59
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-at91.txt82
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-efm32.txt33
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-exynos5.txt53
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-exynos5.yaml133
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-gate.yaml3
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-gpio.yaml28
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-imx-lpi2c.yaml27
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-imx.yaml22
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mpc.yaml5
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mt65xx.txt51
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mt65xx.yaml129
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mt7621.txt25
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mux-gpio.txt80
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mux-gpio.yaml104
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mux-gpmux.yaml1
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt93
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.yaml103
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-nomadik.txt23
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-ocores.txt78
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-pxa.yaml2
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-qcom-cci.txt93
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-rk3x.yaml3
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-s3c2410.txt58
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-st.txt41
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-synquacer.txt29
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-versatile.txt10
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-xlp9xx.txt22
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c.txt4
-rw-r--r--Documentation/devicetree/bindings/i2c/ingenic,i2c.yaml8
-rw-r--r--Documentation/devicetree/bindings/i2c/loongson,ls2x-i2c.yaml51
-rw-r--r--Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml26
-rw-r--r--Documentation/devicetree/bindings/i2c/mediatek,mt7621-i2c.yaml61
-rw-r--r--Documentation/devicetree/bindings/i2c/mellanox,i2c-mlxbf.yaml77
-rw-r--r--Documentation/devicetree/bindings/i2c/microchip,corei2c.yaml56
-rw-r--r--Documentation/devicetree/bindings/i2c/nuvoton,npcm7xx-i2c.yaml27
-rw-r--r--Documentation/devicetree/bindings/i2c/nvidia,tegra186-bpmp-i2c.txt42
-rw-r--r--Documentation/devicetree/bindings/i2c/nvidia,tegra186-bpmp-i2c.yaml45
-rw-r--r--Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.txt87
-rw-r--r--Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml192
-rw-r--r--Documentation/devicetree/bindings/i2c/opencores,i2c-ocores.yaml113
-rw-r--r--Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml277
-rw-r--r--Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml146
-rw-r--r--Documentation/devicetree/bindings/i2c/qcom,i2c-qup.txt40
-rw-r--r--Documentation/devicetree/bindings/i2c/qcom,i2c-qup.yaml89
-rw-r--r--Documentation/devicetree/bindings/i2c/renesas,rcar-i2c.yaml9
-rw-r--r--Documentation/devicetree/bindings/i2c/renesas,riic.yaml7
-rw-r--r--Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml80
-rw-r--r--Documentation/devicetree/bindings/i2c/samsung,s3c2410-i2c.yaml164
-rw-r--r--Documentation/devicetree/bindings/i2c/socionext,synquacer-i2c.yaml58
-rw-r--r--Documentation/devicetree/bindings/i2c/socionext,uniphier-fi2c.yaml3
-rw-r--r--Documentation/devicetree/bindings/i2c/socionext,uniphier-i2c.yaml3
-rw-r--r--Documentation/devicetree/bindings/i2c/st,nomadik-i2c.yaml115
-rw-r--r--Documentation/devicetree/bindings/i2c/st,sti-i2c.yaml71
-rw-r--r--Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml41
-rw-r--r--Documentation/devicetree/bindings/i2c/ti,omap4-i2c.yaml2
-rw-r--r--Documentation/devicetree/bindings/i2c/xlnx,xps-iic-2.00.a.yaml17
-rw-r--r--Documentation/devicetree/bindings/i3c/aspeed,ast2600-i3c.yaml72
-rw-r--r--Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt43
-rw-r--r--Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml60
-rw-r--r--Documentation/devicetree/bindings/i3c/i3c.yaml2
-rw-r--r--Documentation/devicetree/bindings/i3c/mipi-i3c-hci.yaml2
-rw-r--r--Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt41
-rw-r--r--Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml52
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adis16201.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adis16240.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl313.yaml91
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml13
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl355.yaml91
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl367.yaml80
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml55
-rw-r--r--Documentation/devicetree/bindings/iio/accel/bosch,bma220.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/accel/bosch,bma255.yaml5
-rw-r--r--Documentation/devicetree/bindings/iio/accel/bosch,bmi088.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/accel/fsl,mma7455.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/accel/kionix,kx022a.yaml65
-rw-r--r--Documentation/devicetree/bindings/iio/accel/kionix,kxcjk1013.yaml5
-rw-r--r--Documentation/devicetree/bindings/iio/accel/kionix,kxsd9.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/accel/memsensing,msa311.yaml52
-rw-r--r--Documentation/devicetree/bindings/iio/accel/murata,sca3300.yaml6
-rw-r--r--Documentation/devicetree/bindings/iio/accel/nxp,fxls8962af.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adc.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad4130.yaml262
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7091r5.yaml10
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml49
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7280a.yaml78
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7291.yaml1
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7292.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7298.yaml6
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7476.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7606.yaml50
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7768-1.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7923.yaml38
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7949.yaml56
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad799x.yaml73
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad9467.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,axi-adc.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,max11410.yaml177
-rw-r--r--Documentation/devicetree/bindings/iio/adc/allwinner,sun8i-a33-ths.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/aspeed,ast2600-adc.yaml93
-rw-r--r--Documentation/devicetree/bindings/iio/adc/atmel,sama5d2-adc.yaml5
-rw-r--r--Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml4
-rw-r--r--Documentation/devicetree/bindings/iio/adc/cirrus,ep9301-adc.yaml47
-rw-r--r--Documentation/devicetree/bindings/iio/adc/fsl,vf610-adc.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/adc/holt,hi8435.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ingenic,adc.yaml22
-rw-r--r--Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/adc/lltc,ltc2497.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max1027.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max11100.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max1118.yaml26
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max11205.yaml69
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max1238.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max1241.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/adc/maxim,max1363.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/mediatek,mt2701-auxadc.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/microchip,mcp3201.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/adc/microchip,mcp3911.yaml16
-rw-r--r--Documentation/devicetree/bindings/iio/adc/motorola,cpcap-adc.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/nuvoton,npcm750-adc.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/nxp,imx8qxp-adc.yaml78
-rw-r--r--Documentation/devicetree/bindings/iio/adc/nxp,imx93-adc.yaml81
-rw-r--r--Documentation/devicetree/bindings/iio/adc/nxp,lpc1850-adc.yaml4
-rw-r--r--Documentation/devicetree/bindings/iio/adc/qcom,pm8018-adc.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/qcom,spmi-iadc.yaml12
-rw-r--r--Documentation/devicetree/bindings/iio/adc/qcom,spmi-rradc.yaml51
-rw-r--r--Documentation/devicetree/bindings/iio/adc/qcom,spmi-vadc.yaml107
-rw-r--r--Documentation/devicetree/bindings/iio/adc/renesas,rcar-gyroadc.yaml60
-rw-r--r--Documentation/devicetree/bindings/iio/adc/renesas,rzg2l-adc.yaml33
-rw-r--r--Documentation/devicetree/bindings/iio/adc/richtek,rtq6056.yaml56
-rw-r--r--Documentation/devicetree/bindings/iio/adc/rockchip-saradc.yaml1
-rw-r--r--Documentation/devicetree/bindings/iio/adc/samsung,exynos-adc.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/adc/sigma-delta-modulator.yaml4
-rw-r--r--Documentation/devicetree/bindings/iio/adc/sprd,sc2720-adc.yaml62
-rw-r--r--Documentation/devicetree/bindings/iio/adc/st,stm32-adc.yaml186
-rw-r--r--Documentation/devicetree/bindings/iio/adc/st,stm32-dfsdm-adc.yaml18
-rw-r--r--Documentation/devicetree/bindings/iio/adc/st,stmpe-adc.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc081c.yaml55
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc0832.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc084s021.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc108s102.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc12138.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc128s052.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,adc161s626.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads1015.yaml16
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads1100.yaml46
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads124s08.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads131e08.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads7924.yaml110
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads8344.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,ads8688.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,am3359-adc.yaml75
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,lmp92064.yaml70
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,palmas-gpadc.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,tlc4541.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/adc/ti,tsc2046.yaml42
-rw-r--r--Documentation/devicetree/bindings/iio/adc/x-powers,axp209-adc.yaml5
-rw-r--r--Documentation/devicetree/bindings/iio/adc/xlnx,zynqmp-ams.yaml236
-rw-r--r--Documentation/devicetree/bindings/iio/addac/adi,ad74115.yaml373
-rw-r--r--Documentation/devicetree/bindings/iio/addac/adi,ad74413r.yaml172
-rw-r--r--Documentation/devicetree/bindings/iio/afe/temperature-sense-rtd.yaml101
-rw-r--r--Documentation/devicetree/bindings/iio/afe/temperature-transducer.yaml114
-rw-r--r--Documentation/devicetree/bindings/iio/amplifiers/adi,ada4250.yaml51
-rw-r--r--Documentation/devicetree/bindings/iio/amplifiers/adi,hmc425a.yaml1
-rw-r--r--Documentation/devicetree/bindings/iio/chemical/senseair,sunrise.yaml55
-rw-r--r--Documentation/devicetree/bindings/iio/chemical/sensirion,scd4x.yaml46
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad3552r.yaml218
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5064.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5360.yaml13
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5380.yaml10
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5421.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5449.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5624r.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5686.yaml10
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5755.yaml10
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5758.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5761.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5764.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5766.yaml28
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5770r.yaml101
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad5791.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad7293.yaml61
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ad8801.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/adi,ltc2688.yaml147
-rw-r--r--Documentation/devicetree/bindings/iio/dac/lltc,ltc1660.yaml6
-rw-r--r--Documentation/devicetree/bindings/iio/dac/lltc,ltc2632.yaml20
-rw-r--r--Documentation/devicetree/bindings/iio/dac/maxim,max5522.yaml49
-rw-r--r--Documentation/devicetree/bindings/iio/dac/microchip,mcp4922.yaml10
-rw-r--r--Documentation/devicetree/bindings/iio/dac/nxp,lpc1850-dac.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/dac/st,stm32-dac.yaml8
-rw-r--r--Documentation/devicetree/bindings/iio/dac/ti,dac082s085.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/dac/ti,dac5571.yaml3
-rw-r--r--Documentation/devicetree/bindings/iio/dac/ti,dac7311.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/dac/ti,dac7612.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/filter/adi,admv8818.yaml66
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adf4371.yaml19
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adi,adf4350.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adi,adf4377.yaml92
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adi,admv1013.yaml94
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adi,admv1014.yaml137
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adi,admv4420.yaml59
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adi,adrf6780.yaml134
-rw-r--r--Documentation/devicetree/bindings/iio/gyroscope/adi,adxrs290.yaml19
-rw-r--r--Documentation/devicetree/bindings/iio/gyroscope/bosch,bmg160.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/gyroscope/invensense,mpu3050.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/gyroscope/nxp,fxas21002c.yaml35
-rw-r--r--Documentation/devicetree/bindings/iio/health/ti,afe4403.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/health/ti,afe4404.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/humidity/dht11.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml16
-rw-r--r--Documentation/devicetree/bindings/iio/imu/adi,adis16460.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/imu/adi,adis16475.yaml27
-rw-r--r--Documentation/devicetree/bindings/iio/imu/adi,adis16480.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/imu/bosch,bmi160.yaml39
-rw-r--r--Documentation/devicetree/bindings/iio/imu/bosch,bno055.yaml59
-rw-r--r--Documentation/devicetree/bindings/iio/imu/invensense,icm42600.yaml41
-rw-r--r--Documentation/devicetree/bindings/iio/imu/invensense,mpu6050.yaml41
-rw-r--r--Documentation/devicetree/bindings/iio/imu/nxp,fxos8700.yaml35
-rw-r--r--Documentation/devicetree/bindings/iio/imu/st,lsm6dsx.yaml70
-rw-r--r--Documentation/devicetree/bindings/iio/light/liteon,ltr501.yaml58
-rw-r--r--Documentation/devicetree/bindings/iio/light/liteon,ltrf216a.yaml49
-rw-r--r--Documentation/devicetree/bindings/iio/light/rohm,bu27034.yaml46
-rw-r--r--Documentation/devicetree/bindings/iio/light/stk33xx.yaml6
-rw-r--r--Documentation/devicetree/bindings/iio/magnetometer/asahi-kasei,ak8975.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/magnetometer/ti,tmag5273.yaml75
-rw-r--r--Documentation/devicetree/bindings/iio/magnetometer/yamaha,yas530.yaml18
-rw-r--r--Documentation/devicetree/bindings/iio/multiplexer/io-channel-mux.yaml15
-rw-r--r--Documentation/devicetree/bindings/iio/potentiometer/adi,ad5272.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/potentiometer/microchip,mcp41010.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/potentiometer/microchip,mcp4131.yaml11
-rw-r--r--Documentation/devicetree/bindings/iio/pressure/asc,dlhl60d.yaml4
-rw-r--r--Documentation/devicetree/bindings/iio/pressure/bmp085.yaml30
-rw-r--r--Documentation/devicetree/bindings/iio/pressure/meas,ms5611.yaml6
-rw-r--r--Documentation/devicetree/bindings/iio/pressure/murata,zpa2326.yaml5
-rw-r--r--Documentation/devicetree/bindings/iio/proximity/ams,as3935.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/proximity/semtech,sx9324.yaml200
-rw-r--r--Documentation/devicetree/bindings/iio/proximity/semtech,sx9360.yaml98
-rw-r--r--Documentation/devicetree/bindings/iio/proximity/st,vl53l0x.yaml5
-rw-r--r--Documentation/devicetree/bindings/iio/resolver/adi,ad2s90.yaml7
-rw-r--r--Documentation/devicetree/bindings/iio/samsung,sensorhub-rinato.yaml9
-rw-r--r--Documentation/devicetree/bindings/iio/st,st-sensors.yaml14
-rw-r--r--Documentation/devicetree/bindings/iio/temperature/adi,ltc2983.yaml528
-rw-r--r--Documentation/devicetree/bindings/iio/temperature/maxim,max31855k.yaml4
-rw-r--r--Documentation/devicetree/bindings/iio/temperature/maxim,max31856.yaml6
-rw-r--r--Documentation/devicetree/bindings/iio/temperature/maxim,max31865.yaml54
-rw-r--r--Documentation/devicetree/bindings/iio/temperature/melexis,mlx90632.yaml4
-rw-r--r--Documentation/devicetree/bindings/iio/temperature/ti,tmp117.yaml14
-rw-r--r--Documentation/devicetree/bindings/input/adc-joystick.yaml20
-rw-r--r--Documentation/devicetree/bindings/input/adc-keys.txt67
-rw-r--r--Documentation/devicetree/bindings/input/adc-keys.yaml103
-rw-r--r--Documentation/devicetree/bindings/input/adi,adp5588.yaml111
-rw-r--r--Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml33
-rw-r--r--Documentation/devicetree/bindings/input/ariel-pwrbutton.yaml1
-rw-r--r--Documentation/devicetree/bindings/input/azoteq,iqs7222.yaml950
-rw-r--r--Documentation/devicetree/bindings/input/cap11xx.txt78
-rw-r--r--Documentation/devicetree/bindings/input/cypress-sf.yaml61
-rw-r--r--Documentation/devicetree/bindings/input/da9062-onkey.txt2
-rw-r--r--Documentation/devicetree/bindings/input/elan,ekth3000.yaml81
-rw-r--r--Documentation/devicetree/bindings/input/elan,ekth6915.yaml65
-rw-r--r--Documentation/devicetree/bindings/input/elan_i2c.txt44
-rw-r--r--Documentation/devicetree/bindings/input/fsl,mpr121-touchkey.yaml4
-rw-r--r--Documentation/devicetree/bindings/input/fsl,scu-key.yaml40
-rw-r--r--Documentation/devicetree/bindings/input/goodix,gt7375p.yaml12
-rw-r--r--Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml38
-rw-r--r--Documentation/devicetree/bindings/input/gpio-beeper.txt13
-rw-r--r--Documentation/devicetree/bindings/input/gpio-beeper.yaml33
-rw-r--r--Documentation/devicetree/bindings/input/gpio-keys.yaml173
-rw-r--r--Documentation/devicetree/bindings/input/hid-over-i2c.txt44
-rw-r--r--Documentation/devicetree/bindings/input/hid-over-i2c.yaml83
-rw-r--r--Documentation/devicetree/bindings/input/ibm,op-panel.yaml50
-rw-r--r--Documentation/devicetree/bindings/input/ilitek,ili2xxx.txt27
-rw-r--r--Documentation/devicetree/bindings/input/imx-keypad.yaml4
-rw-r--r--Documentation/devicetree/bindings/input/input.yaml26
-rw-r--r--Documentation/devicetree/bindings/input/iqs269a.yaml17
-rw-r--r--Documentation/devicetree/bindings/input/iqs626a.yaml107
-rw-r--r--Documentation/devicetree/bindings/input/iqs62x-keys.yaml9
-rw-r--r--Documentation/devicetree/bindings/input/matrix-keymap.yaml4
-rw-r--r--Documentation/devicetree/bindings/input/max77650-onkey.yaml8
-rw-r--r--Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml83
-rw-r--r--Documentation/devicetree/bindings/input/mediatek,pmic-keys.yaml115
-rw-r--r--Documentation/devicetree/bindings/input/microchip,cap11xx.yaml152
-rw-r--r--Documentation/devicetree/bindings/input/mtk-pmic-keys.txt43
-rw-r--r--Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml66
-rw-r--r--Documentation/devicetree/bindings/input/pwm-beeper.txt24
-rw-r--r--Documentation/devicetree/bindings/input/pwm-beeper.yaml41
-rw-r--r--Documentation/devicetree/bindings/input/pwm-vibrator.txt66
-rw-r--r--Documentation/devicetree/bindings/input/pwm-vibrator.yaml57
-rw-r--r--Documentation/devicetree/bindings/input/qcom,pm8921-pwrkey.yaml75
-rw-r--r--Documentation/devicetree/bindings/input/qcom,pm8xxx-pwrkey.txt46
-rw-r--r--Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt23
-rw-r--r--Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.yaml38
-rw-r--r--Documentation/devicetree/bindings/input/regulator-haptic.yaml6
-rw-r--r--Documentation/devicetree/bindings/input/snvs-pwrkey.txt1
-rw-r--r--Documentation/devicetree/bindings/input/sprd,sc27xx-vibrator.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/ti,drv260x.txt50
-rw-r--r--Documentation/devicetree/bindings/input/ti,drv260x.yaml109
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/auo_pixcir_ts.txt6
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/chipone,icn8318.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt16
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/cypress,cy8ctma140.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/cypress,cy8ctma340.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/cypress,tt21000.yaml106
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml10
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/egalax-ts.txt4
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/ektf2127.txt2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/elan,elants_i2c.yaml14
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/goodix.yaml3
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/himax,hx83112b.yaml63
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/hynitron,cstxxx.yaml65
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml7
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/imagis,ist3038c.yaml74
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml10
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/pixcir,pixcir_ts.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/silead,gsl1680.yaml91
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt44
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/st,stmfts.txt41
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/st,stmfts.yaml72
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/stmpe.txt3
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/ti,am3359-tsc.yaml76
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/ti,tsc2005.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/ti-tsc-adc.txt91
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/touchscreen.yaml2
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/zinitix,bt400.yaml115
-rw-r--r--Documentation/devicetree/bindings/input/touchscreen/zinitix.txt40
-rw-r--r--Documentation/devicetree/bindings/interconnect/fsl,imx8m-noc.yaml11
-rw-r--r--Documentation/devicetree/bindings/interconnect/mediatek,cci.yaml142
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,bcm-voter.yaml8
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,msm8998-bwmon.yaml124
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,osm-l3.yaml27
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,qcm2290.yaml137
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,qdu1000-rpmh.yaml70
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,rpm.yaml214
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,rpmh-common.yaml43
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,rpmh.yaml49
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sa8775p-rpmh.yaml50
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sc7280-rpmh.yaml71
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sc8280xp-rpmh.yaml49
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sdm660.yaml185
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sm6350-rpmh.yaml82
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sm8450-rpmh.yaml124
-rw-r--r--Documentation/devicetree/bindings/interconnect/qcom,sm8550-rpmh.yaml139
-rw-r--r--Documentation/devicetree/bindings/interconnect/samsung,exynos-bus.yaml317
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.yaml4
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/allwinner,sun4i-a10-ic.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt1
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/andestech,ativic32.txt19
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/apple,aic.yaml33
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/apple,aic2.yaml143
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml22
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/arm,gic.yaml9
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,bcm3380-l2-intc.txt39
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,bcm7038-l1-intc.txt61
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,bcm7038-l1-intc.yaml91
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,bcm7120-l2-intc.txt88
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,bcm7120-l2-intc.yaml152
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt31
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.yaml72
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/fsl,intmux.yaml3
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/fsl,irqsteer.yaml4
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/fsl,ls-extirq.txt53
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/fsl,ls-extirq.yaml118
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/fsl,mu-msi.yaml99
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/ingenic,intc.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/intel,ce4100-ioapic.txt26
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/intel,ce4100-ioapic.yaml60
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/intel,ce4100-lapic.yaml71
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/intel,ixp4xx-interrupt.yaml4
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,cpu-interrupt-controller.yaml34
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,htpic.yaml4
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,htvec.yaml4
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,liointc.yaml8
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,ls1x-intc.txt24
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,ls1x-intc.yaml51
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,pch-msi.yaml10
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/loongson,pch-pic.yaml6
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt33
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/mediatek,mtk-cirq.yaml68
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt1
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/microchip,eic.yaml73
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/mrvl,intc.yaml12
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/mscc,ocelot-icpu-intr.yaml4
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/msi-controller.yaml46
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/mti,cpu-interrupt-controller.yaml46
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/nuvoton,wpcm450-aic.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/qcom,mpm.yaml96
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt76
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.yaml93
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/rda,8810pl-intc.txt61
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/rda,8810pl-intc.yaml43
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml62
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/renesas,h8300h-intc.txt22
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/renesas,h8s-intc.txt22
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/renesas,irqc.yaml5
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/renesas,rzg2l-irqc.yaml134
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/samsung,exynos4210-combiner.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/samsung,s3c24xx-irq.txt53
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml106
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/socionext,synquacer-exiu.txt31
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/socionext,synquacer-exiu.yaml53
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.yaml1
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/st,sti-irq-syscfg.txt9
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/st,stm32-exti.yaml7
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/sunplus,sp7021-intc.yaml62
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml8
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.yaml3
-rw-r--r--Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml2
-rw-r--r--Documentation/devicetree/bindings/iommu/apple,dart.yaml8
-rw-r--r--Documentation/devicetree/bindings/iommu/apple,sart.yaml56
-rw-r--r--Documentation/devicetree/bindings/iommu/arm,smmu-v3.yaml11
-rw-r--r--Documentation/devicetree/bindings/iommu/arm,smmu.yaml295
-rw-r--r--Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml63
-rw-r--r--Documentation/devicetree/bindings/iommu/qcom,iommu.txt121
-rw-r--r--Documentation/devicetree/bindings/iommu/qcom,iommu.yaml113
-rw-r--r--Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.yaml38
-rw-r--r--Documentation/devicetree/bindings/iommu/samsung,sysmmu.yaml11
-rw-r--r--Documentation/devicetree/bindings/iommu/xen,grant-dma.yaml39
-rw-r--r--Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt3
-rw-r--r--Documentation/devicetree/bindings/ipmi/ipmi-ipmb.yaml67
-rw-r--r--Documentation/devicetree/bindings/ipmi/ipmi-smic.yaml2
-rw-r--r--Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt5
-rw-r--r--Documentation/devicetree/bindings/ipmi/ssif-bmc.yaml38
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/common.yaml2
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/gpio-backlight.yaml4
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/kinetic,ktz8866.yaml76
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/led-backlight.yaml6
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml2
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/mediatek,mt6370-backlight.yaml121
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/pwm-backlight.yaml4
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/qcom-wled.yaml15
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/richtek,rt4831-backlight.yaml5
-rw-r--r--Documentation/devicetree/bindings/leds/common.yaml83
-rw-r--r--Documentation/devicetree/bindings/leds/cznic,turris-omnia-leds.yaml7
-rw-r--r--Documentation/devicetree/bindings/leds/irled/gpio-ir-tx.txt14
-rw-r--r--Documentation/devicetree/bindings/leds/irled/gpio-ir-tx.yaml36
-rw-r--r--Documentation/devicetree/bindings/leds/irled/ir-spi-led.yaml61
-rw-r--r--Documentation/devicetree/bindings/leds/irled/pwm-ir-tx.txt13
-rw-r--r--Documentation/devicetree/bindings/leds/irled/pwm-ir-tx.yaml34
-rw-r--r--Documentation/devicetree/bindings/leds/irled/spi-ir-led.txt29
-rw-r--r--Documentation/devicetree/bindings/leds/issi,is31fl319x.yaml194
-rw-r--r--Documentation/devicetree/bindings/leds/kinetic,ktd2692.yaml87
-rw-r--r--Documentation/devicetree/bindings/leds/leds-aat1290.txt77
-rw-r--r--Documentation/devicetree/bindings/leds/leds-aw2013.yaml3
-rw-r--r--Documentation/devicetree/bindings/leds/leds-bcm63138.yaml95
-rw-r--r--Documentation/devicetree/bindings/leds/leds-bcm6328.txt319
-rw-r--r--Documentation/devicetree/bindings/leds/leds-bcm6328.yaml404
-rw-r--r--Documentation/devicetree/bindings/leds/leds-class-multicolor.yaml34
-rw-r--r--Documentation/devicetree/bindings/leds/leds-gpio.yaml2
-rw-r--r--Documentation/devicetree/bindings/leds/leds-is31fl319x.txt61
-rw-r--r--Documentation/devicetree/bindings/leds/leds-ktd2692.txt50
-rw-r--r--Documentation/devicetree/bindings/leds/leds-lgm.yaml10
-rw-r--r--Documentation/devicetree/bindings/leds/leds-lp50xx.yaml127
-rw-r--r--Documentation/devicetree/bindings/leds/leds-lp55xx.yaml269
-rw-r--r--Documentation/devicetree/bindings/leds/leds-max77650.yaml9
-rw-r--r--Documentation/devicetree/bindings/leds/leds-mt6323.txt2
-rw-r--r--Documentation/devicetree/bindings/leds/leds-mt6360.yaml192
-rw-r--r--Documentation/devicetree/bindings/leds/leds-pca9532.txt49
-rw-r--r--Documentation/devicetree/bindings/leds/leds-pm8058.txt67
-rw-r--r--Documentation/devicetree/bindings/leds/leds-pwm-multicolor.yaml88
-rw-r--r--Documentation/devicetree/bindings/leds/leds-pwm.yaml2
-rw-r--r--Documentation/devicetree/bindings/leds/leds-qcom-lpg.yaml188
-rw-r--r--Documentation/devicetree/bindings/leds/leds-rt4505.yaml3
-rw-r--r--Documentation/devicetree/bindings/leds/leds-sgm3140.yaml5
-rw-r--r--Documentation/devicetree/bindings/leds/maxim,max77693.yaml105
-rw-r--r--Documentation/devicetree/bindings/leds/mediatek,mt6370-flashlight.yaml41
-rw-r--r--Documentation/devicetree/bindings/leds/mediatek,mt6370-indicator.yaml80
-rw-r--r--Documentation/devicetree/bindings/leds/nxp,pca953x.yaml90
-rw-r--r--Documentation/devicetree/bindings/leds/qcom,pm8058-led.yaml57
-rw-r--r--Documentation/devicetree/bindings/leds/qcom,spmi-flash-led.yaml117
-rw-r--r--Documentation/devicetree/bindings/leds/register-bit-led.txt94
-rw-r--r--Documentation/devicetree/bindings/leds/register-bit-led.yaml95
-rw-r--r--Documentation/devicetree/bindings/leds/regulator-led.yaml55
-rw-r--r--Documentation/devicetree/bindings/leds/rohm,bd2606mvv.yaml81
-rw-r--r--Documentation/devicetree/bindings/leds/rohm,bd71828-leds.yaml18
-rw-r--r--Documentation/devicetree/bindings/leds/skyworks,aat1290.yaml95
-rw-r--r--Documentation/devicetree/bindings/leds/ti,tca6507.yaml4
-rw-r--r--Documentation/devicetree/bindings/mailbox/amlogic,meson-gxbb-mhu.yaml7
-rw-r--r--Documentation/devicetree/bindings/mailbox/apple,mailbox.yaml88
-rw-r--r--Documentation/devicetree/bindings/mailbox/arm,mhu.yaml1
-rw-r--r--Documentation/devicetree/bindings/mailbox/fsl,mu.yaml42
-rw-r--r--Documentation/devicetree/bindings/mailbox/mediatek,gce-mailbox.yaml90
-rw-r--r--Documentation/devicetree/bindings/mailbox/microchip,mpfs-mailbox.yaml54
-rw-r--r--Documentation/devicetree/bindings/mailbox/microchip,polarfire-soc-mailbox.yaml47
-rw-r--r--Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml51
-rw-r--r--Documentation/devicetree/bindings/mailbox/mtk-gce.txt80
-rw-r--r--Documentation/devicetree/bindings/mailbox/nvidia,tegra186-hsp.txt72
-rw-r--r--Documentation/devicetree/bindings/mailbox/nvidia,tegra186-hsp.yaml123
-rw-r--r--Documentation/devicetree/bindings/mailbox/qcom,apcs-kpss-global.yaml156
-rw-r--r--Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml37
-rw-r--r--Documentation/devicetree/bindings/mailbox/sprd-mailbox.yaml6
-rw-r--r--Documentation/devicetree/bindings/mailbox/st,sti-mailbox.yaml53
-rw-r--r--Documentation/devicetree/bindings/mailbox/st,stm32-ipcc.yaml21
-rw-r--r--Documentation/devicetree/bindings/mailbox/sti-mailbox.txt51
-rw-r--r--Documentation/devicetree/bindings/mailbox/ti,omap-mailbox.yaml9
-rw-r--r--Documentation/devicetree/bindings/mailbox/xlnx,zynqmp-ipi-mailbox.txt127
-rw-r--r--Documentation/devicetree/bindings/mailbox/xlnx,zynqmp-ipi-mailbox.yaml141
-rw-r--r--Documentation/devicetree/bindings/media/allegro,al5e.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun4i-a10-ir.yaml5
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun4i-a10-video-engine.yaml7
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun50i-h6-vpu-g2.yaml69
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml64
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml101
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun6i-a31-mipi-csi2.yaml137
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun8i-a83t-de2-rotate.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun8i-a83t-mipi-csi2.yaml125
-rw-r--r--Documentation/devicetree/bindings/media/allwinner,sun8i-h3-deinterlace.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/amlogic,axg-ge2d.yaml6
-rw-r--r--Documentation/devicetree/bindings/media/amlogic,gx-vdec.yaml6
-rw-r--r--Documentation/devicetree/bindings/media/amlogic,meson-ir-tx.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/amlogic,meson6-ir.yaml47
-rw-r--r--Documentation/devicetree/bindings/media/amphion,vpu.yaml180
-rw-r--r--Documentation/devicetree/bindings/media/atmel,isc.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/cec-gpio.txt42
-rw-r--r--Documentation/devicetree/bindings/media/cec.txt8
-rw-r--r--Documentation/devicetree/bindings/media/cec/amlogic,meson-gx-ao-cec.yaml (renamed from Documentation/devicetree/bindings/media/amlogic,meson-gx-ao-cec.yaml)13
-rw-r--r--Documentation/devicetree/bindings/media/cec/cec-common.yaml28
-rw-r--r--Documentation/devicetree/bindings/media/cec/cec-gpio.yaml74
-rw-r--r--Documentation/devicetree/bindings/media/cec/nvidia,tegra114-cec.yaml58
-rw-r--r--Documentation/devicetree/bindings/media/cec/samsung,s5p-cec.yaml66
-rw-r--r--Documentation/devicetree/bindings/media/cec/st,stih-cec.yaml66
-rw-r--r--Documentation/devicetree/bindings/media/cec/st,stm32-cec.yaml (renamed from Documentation/devicetree/bindings/media/st,stm32-cec.yaml)7
-rw-r--r--Documentation/devicetree/bindings/media/coda.yaml1
-rw-r--r--Documentation/devicetree/bindings/media/exynos-fimc-lite.txt16
-rw-r--r--Documentation/devicetree/bindings/media/exynos-jpeg-codec.txt16
-rw-r--r--Documentation/devicetree/bindings/media/exynos4-fimc-is.txt50
-rw-r--r--Documentation/devicetree/bindings/media/exynos5-gsc.txt38
-rw-r--r--Documentation/devicetree/bindings/media/fsl,imx6ull-pxp.yaml88
-rw-r--r--Documentation/devicetree/bindings/media/fsl-pxp.txt26
-rw-r--r--Documentation/devicetree/bindings/media/gpio-ir-receiver.txt20
-rw-r--r--Documentation/devicetree/bindings/media/gpio-ir-receiver.yaml43
-rw-r--r--Documentation/devicetree/bindings/media/i2c/adv748x.txt116
-rw-r--r--Documentation/devicetree/bindings/media/i2c/adv748x.yaml212
-rw-r--r--Documentation/devicetree/bindings/media/i2c/adv7604.yaml16
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ak7375.txt8
-rw-r--r--Documentation/devicetree/bindings/media/i2c/aptina,mt9p031.yaml109
-rw-r--r--Documentation/devicetree/bindings/media/i2c/aptina,mt9v111.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/i2c/asahi-kasei,ak7375.yaml52
-rw-r--r--Documentation/devicetree/bindings/media/i2c/chrontel,ch7322.yaml15
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.txt9
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9714.yaml47
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9768.yaml8
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt9
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.yaml41
-rw-r--r--Documentation/devicetree/bindings/media/i2c/hynix,hi846.yaml124
-rw-r--r--Documentation/devicetree/bindings/media/i2c/imx219.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/i2c/imx258.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/i2c/imx290.txt57
-rw-r--r--Documentation/devicetree/bindings/media/i2c/isil,isl79987.yaml113
-rw-r--r--Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml315
-rw-r--r--Documentation/devicetree/bindings/media/i2c/maxim,max96712.yaml111
-rw-r--r--Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml5
-rw-r--r--Documentation/devicetree/bindings/media/i2c/mt9p031.txt40
-rw-r--r--Documentation/devicetree/bindings/media/i2c/onnn,ar0521.yaml112
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ov2685.txt41
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ov5640.txt92
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ov5645.txt54
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ov8856.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov02a10.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov2685.yaml102
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov4689.yaml134
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov5640.yaml154
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov5645.yaml104
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov5648.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov5670.yaml93
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov5675.yaml122
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov5693.yaml124
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov772x.yaml5
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov8858.yaml106
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov8865.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/i2c/ovti,ov9282.yaml16
-rw-r--r--Documentation/devicetree/bindings/media/i2c/rda,rda5807.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/i2c/samsung,s5k5baf.yaml101
-rw-r--r--Documentation/devicetree/bindings/media/i2c/samsung,s5k6a3.yaml98
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx290.yaml140
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx296.yaml106
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx334.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx335.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml15
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx415.yaml122
-rw-r--r--Documentation/devicetree/bindings/media/i2c/st,st-mipid02.txt82
-rw-r--r--Documentation/devicetree/bindings/media/i2c/st,st-mipid02.yaml176
-rw-r--r--Documentation/devicetree/bindings/media/i2c/st,st-vgxy61.yaml113
-rw-r--r--Documentation/devicetree/bindings/media/i2c/toshiba,tc358746.yaml178
-rw-r--r--Documentation/devicetree/bindings/media/marvell,mmp2-ccic.yaml5
-rw-r--r--Documentation/devicetree/bindings/media/mediatek,mdp3-rdma.yaml95
-rw-r--r--Documentation/devicetree/bindings/media/mediatek,mdp3-rsz.yaml77
-rw-r--r--Documentation/devicetree/bindings/media/mediatek,mdp3-wrot.yaml80
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek,mt8195-jpegdec.yaml  161
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek,mt8195-jpegenc.yaml  140
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek,vcodec-decoder.yaml  162
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek,vcodec-encoder.yaml  185
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml  268
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt  38
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.yaml  81
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek-jpeg-encoder.txt  35
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek-jpeg-encoder.yaml  74
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek-mdp.txt  8
-rw-r--r--  Documentation/devicetree/bindings/media/mediatek-vcodec.txt  129
-rw-r--r--  Documentation/devicetree/bindings/media/meson-ir.txt  20
-rw-r--r--  Documentation/devicetree/bindings/media/microchip,csi2dc.yaml  199
-rw-r--r--  Documentation/devicetree/bindings/media/microchip,sama5d4-vdec.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/media/microchip,xisc.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/media/nvidia,tegra-vde.txt  64
-rw-r--r--  Documentation/devicetree/bindings/media/nvidia,tegra-vde.yaml  119
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,dw100.yaml  69
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx-mipi-csi2.yaml  219
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx7-csi.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml  226
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx8-isi.yaml  173
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx8-jpeg.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml  22
-rw-r--r--  Documentation/devicetree/bindings/media/nxp,imx8mq-vpu.yaml  72
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,msm8916-camss.yaml  14
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,msm8916-venus.yaml  85
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,msm8996-camss.yaml  24
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,msm8996-venus.yaml  145
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sc7180-venus.yaml  96
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sc7280-venus.yaml  138
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sdm660-camss.yaml  31
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sdm660-venus.yaml  159
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sdm845-camss.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sdm845-venus-v2.yaml  107
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sdm845-venus.yaml  103
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sm8250-camss.yaml  463
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,sm8250-venus.yaml  121
-rw-r--r--  Documentation/devicetree/bindings/media/qcom,venus-common.yaml  73
-rw-r--r--  Documentation/devicetree/bindings/media/rc.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,ceu.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,csi2.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,fcp.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,imr.txt  31
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,imr.yaml  67
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,isp.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,jpu.txt  25
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,jpu.yaml  65
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,rzg2l-cru.yaml  157
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,rzg2l-csi2.yaml  149
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,vin.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/media/renesas,vsp1.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/media/rockchip,rk3568-vepu.yaml  69
-rw-r--r--  Documentation/devicetree/bindings/media/rockchip,vdec.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/media/rockchip-isp1.yaml  156
-rw-r--r--  Documentation/devicetree/bindings/media/rockchip-vpu.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/media/s5p-cec.txt  36
-rw-r--r--  Documentation/devicetree/bindings/media/s5p-mfc.txt  10
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,exynos4210-csis.yaml  170
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,exynos4210-fimc.yaml  152
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,exynos4212-fimc-is.yaml  220
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,exynos4212-fimc-lite.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,exynos5250-gsc.yaml  109
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,fimc.yaml  279
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,s5c73m3.yaml  165
-rw-r--r--  Documentation/devicetree/bindings/media/samsung,s5pv210-jpeg.yaml  123
-rw-r--r--  Documentation/devicetree/bindings/media/samsung-fimc.txt  209
-rw-r--r--  Documentation/devicetree/bindings/media/samsung-mipi-csis.txt  81
-rw-r--r--  Documentation/devicetree/bindings/media/samsung-s5c73m3.txt  97
-rw-r--r--  Documentation/devicetree/bindings/media/samsung-s5k5baf.txt  58
-rw-r--r--  Documentation/devicetree/bindings/media/samsung-s5k6a3.txt  33
-rw-r--r--  Documentation/devicetree/bindings/media/si470x.txt  26
-rw-r--r--  Documentation/devicetree/bindings/media/silabs,si470x.yaml  48
-rw-r--r--  Documentation/devicetree/bindings/media/st,stm32-dcmi.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/media/st,stm32-dma2d.yaml  71
-rw-r--r--  Documentation/devicetree/bindings/media/stih-cec.txt  27
-rw-r--r--  Documentation/devicetree/bindings/media/tegra-cec.txt  27
-rw-r--r--  Documentation/devicetree/bindings/media/ti,cal.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/media/ti,vpe.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/media/video-interface-devices.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/media/video-interfaces.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/arm,pl353-smc.yaml  130
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/arm,pl35x-smc.yaml  156
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/brcm,brcmstb-memc-ddr.yaml  52
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/brcm,dpfe-cpu.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/calxeda-ddr-ctrlr.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/canaan,k210-sram.yaml  52
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr-channel.yaml  146
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr-props.yaml  74
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr2-timings.yaml  135
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr2.yaml  204
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr3-timings.yaml  157
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr3.yaml  243
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr4.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ddr/jedec,lpddr5.yaml  46
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/exynos-srom.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/fsl/ddr.txt  29
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ddr.yaml  77
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ifc.yaml  113
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/fsl/ifc.txt  82
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/fsl/imx8m-ddrc.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ingenic,nemc-peripherals.yaml  46
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ingenic,nemc.yaml  36
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/intel,ixp4xx-expansion-bus-controller.yaml  107
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/intel,ixp4xx-expansion-peripheral-props.yaml  80
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/marvell,mvebu-sdram-controller.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/mc-peripheral-props.yaml  39
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/mediatek,mt7621-memc.yaml  32
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml  72
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml  32
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/nvidia,tegra186-mc.yaml  165
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/nvidia,tegra20-emc.yaml  23
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt  157
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/qca,ath79-ddr-controller.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/renesas,dbsc.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/renesas,h8300-bsc.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/renesas,rpc-if.yaml  67
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml  384
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/samsung,exynos5422-dmc.yaml  12
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/snps,dw-umctl2-ddrc.yaml  118
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/st,stm32-fmc2-ebi-props.yaml  144
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/st,stm32-fmc2-ebi.yaml  143
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/synopsys,ddrc-ecc.yaml  73
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ti,da8xx-ddrctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ti,gpmc-child.yaml  252
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/ti,gpmc.yaml  190
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/xlnx,zynq-ddrc-a05.yaml  38
-rw-r--r--  Documentation/devicetree/bindings/memory-controllers/xlnx,zynqmp-ocmc-1.0.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/mfd/ab8500.txt  282
-rw-r--r--  Documentation/devicetree/bindings/mfd/ac100.txt  50
-rw-r--r--  Documentation/devicetree/bindings/mfd/actions,atc260x.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/allwinner,sun4i-a10-ts.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/allwinner,sun6i-a31-prcm.yaml  42
-rw-r--r--  Documentation/devicetree/bindings/mfd/allwinner,sun8i-a23-prcm.yaml  12
-rw-r--r--  Documentation/devicetree/bindings/mfd/ampere,smpro.yaml  42
-rw-r--r--  Documentation/devicetree/bindings/mfd/aspeed,ast2x00-scu.yaml  110
-rw-r--r--  Documentation/devicetree/bindings/mfd/aspeed-lpc.txt  157
-rw-r--r--  Documentation/devicetree/bindings/mfd/aspeed-lpc.yaml  199
-rw-r--r--  Documentation/devicetree/bindings/mfd/aspeed-scu.txt  48
-rw-r--r--  Documentation/devicetree/bindings/mfd/atmel-flexcom.txt  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/atmel-usart.txt  98
-rw-r--r--  Documentation/devicetree/bindings/mfd/axp20x.txt  273
-rw-r--r--  Documentation/devicetree/bindings/mfd/bd9571mwv.txt  69
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,bcm6318-gpio-sysctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,bcm63268-gpio-sysctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,bcm6328-gpio-sysctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,bcm6358-gpio-sysctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,bcm6362-gpio-sysctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,bcm6368-gpio-sysctl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,cru.yaml  32
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,misc.yaml  60
-rw-r--r--  Documentation/devicetree/bindings/mfd/brcm,twd.yaml  69
-rw-r--r--  Documentation/devicetree/bindings/mfd/canaan,k210-sysctl.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/mfd/cirrus,lochnagar.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/mfd/cirrus,madera.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/da9062.txt  13
-rw-r--r--  Documentation/devicetree/bindings/mfd/da9063.txt  111
-rw-r--r--  Documentation/devicetree/bindings/mfd/delta,tn48m-cpld.yaml  90
-rw-r--r--  Documentation/devicetree/bindings/mfd/dlg,da9063.yaml  146
-rw-r--r--  Documentation/devicetree/bindings/mfd/ene-kb3930.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/ene-kb930.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/mfd/fsl,imx8qxp-csr.yaml  192
-rw-r--r--  Documentation/devicetree/bindings/mfd/gateworks-gsc.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/mfd/google,cros-ec.yaml  190
-rw-r--r--  Documentation/devicetree/bindings/mfd/hisilicon,hi6421-spmi-pmic.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/khadas,mcu.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/max14577.txt  147
-rw-r--r--  Documentation/devicetree/bindings/mfd/max77650.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/max77686.txt  26
-rw-r--r--  Documentation/devicetree/bindings/mfd/max77693.txt  194
-rw-r--r--  Documentation/devicetree/bindings/mfd/max77802.txt  25
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max14577.yaml  196
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max5970.yaml  151
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max77686.yaml  132
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max77693.yaml  143
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max77714.yaml  68
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max77802.yaml  194
-rw-r--r--  Documentation/devicetree/bindings/mfd/maxim,max77843.yaml  145
-rw-r--r--  Documentation/devicetree/bindings/mfd/mediatek,mt6357.yaml  112
-rw-r--r--  Documentation/devicetree/bindings/mfd/mediatek,mt6360.yaml  247
-rw-r--r--  Documentation/devicetree/bindings/mfd/mediatek,mt6370.yaml  282
-rw-r--r--  Documentation/devicetree/bindings/mfd/mediatek,mt8195-scpsys.yaml  68
-rw-r--r--  Documentation/devicetree/bindings/mfd/mps,mp2629.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/mscc,ocelot.yaml  169
-rw-r--r--  Documentation/devicetree/bindings/mfd/mt6397.txt  13
-rw-r--r--  Documentation/devicetree/bindings/mfd/nxp,bbnsm.yaml  101
-rw-r--r--  Documentation/devicetree/bindings/mfd/omap-usb-host.txt  8
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom,pm8008.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom,spmi-pmic.txt  84
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom,spmi-pmic.yaml  327
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom,tcsr.txt  22
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom,tcsr.yaml  62
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom-pm8xxx.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/mfd/qcom-rpm.txt  283
-rw-r--r--  Documentation/devicetree/bindings/mfd/richtek,rt5120.yaml  178
-rw-r--r--  Documentation/devicetree/bindings/mfd/rk808.txt  465
-rw-r--r--  Documentation/devicetree/bindings/mfd/rockchip,rk805.yaml  219
-rw-r--r--  Documentation/devicetree/bindings/mfd/rockchip,rk808.yaml  257
-rw-r--r--  Documentation/devicetree/bindings/mfd/rockchip,rk809.yaml  284
-rw-r--r--  Documentation/devicetree/bindings/mfd/rockchip,rk817.yaml  384
-rw-r--r--  Documentation/devicetree/bindings/mfd/rockchip,rk818.yaml  282
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt  102
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd71815-pmic.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd71828-pmic.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd71847-pmic.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd9571mwv.yaml  127
-rw-r--r--  Documentation/devicetree/bindings/mfd/rohm,bd9576-pmic.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt  72
-rw-r--r--  Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.yaml  117
-rw-r--r--  Documentation/devicetree/bindings/mfd/samsung,s2mpa01.yaml  91
-rw-r--r--  Documentation/devicetree/bindings/mfd/samsung,s2mps11.yaml  267
-rw-r--r--  Documentation/devicetree/bindings/mfd/samsung,s5m8767.yaml  307
-rw-r--r--  Documentation/devicetree/bindings/mfd/samsung,sec-core.txt  86
-rw-r--r--  Documentation/devicetree/bindings/mfd/silergy,sy7636a.yaml  83
-rw-r--r--  Documentation/devicetree/bindings/mfd/sprd,ums512-glbreg.yaml  71
-rw-r--r--  Documentation/devicetree/bindings/mfd/st,stm32-lptimer.yaml  36
-rw-r--r--  Documentation/devicetree/bindings/mfd/st,stm32-timers.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/mfd/st,stmfx.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/mfd/st,stpmic1.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mfd/stericsson,ab8500.yaml  522
-rw-r--r--  Documentation/devicetree/bindings/mfd/stericsson,db8500-prcmu.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/mfd/syscon.yaml  52
-rw-r--r--  Documentation/devicetree/bindings/mfd/ti,am3359-tscadc.yaml  87
-rw-r--r--  Documentation/devicetree/bindings/mfd/ti,j721e-system-controller.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/mfd/ti,nspire-misc.yaml  51
-rw-r--r--  Documentation/devicetree/bindings/mfd/ti,tps65086.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/mfd/wlf,arizona.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/mfd/x-powers,ac100.yaml  116
-rw-r--r--  Documentation/devicetree/bindings/mfd/x-powers,axp152.yaml  412
-rw-r--r--  Documentation/devicetree/bindings/mfd/xylon,logicvc.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/mips/brcm/brcm,bmips.txt  8
-rw-r--r--  Documentation/devicetree/bindings/mips/brcm/soc.yaml  96
-rw-r--r--  Documentation/devicetree/bindings/mips/cpu_irq.txt  47
-rw-r--r--  Documentation/devicetree/bindings/mips/cpus.yaml  115
-rw-r--r--  Documentation/devicetree/bindings/mips/ingenic/devices.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mips/ingenic/ingenic,cpu.yaml  69
-rw-r--r--  Documentation/devicetree/bindings/mips/lantiq/lantiq,dma-xway.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mips/lantiq/rcu.txt  2
-rw-r--r--  Documentation/devicetree/bindings/mips/loongson/devices.yaml  14
-rw-r--r--  Documentation/devicetree/bindings/mips/loongson/ls2k-reset.yaml  38
-rw-r--r--  Documentation/devicetree/bindings/mips/ralink.txt  32
-rw-r--r--  Documentation/devicetree/bindings/mips/ralink.yaml  87
-rw-r--r--  Documentation/devicetree/bindings/mips/realtek-rtl.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/misc/eeprom-93xx46.yaml  70
-rw-r--r--  Documentation/devicetree/bindings/misc/idt,89hpesx.yaml  72
-rw-r--r--  Documentation/devicetree/bindings/misc/idt_89hpesx.txt  44
-rw-r--r--  Documentation/devicetree/bindings/misc/nvidia,tegra186-misc.txt  14
-rw-r--r--  Documentation/devicetree/bindings/misc/nvidia,tegra186-misc.yaml  43
-rw-r--r--  Documentation/devicetree/bindings/misc/nvidia,tegra20-apbmisc.txt  17
-rw-r--r--  Documentation/devicetree/bindings/misc/nvidia,tegra20-apbmisc.yaml  51
-rw-r--r--  Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/misc/qcom,fastrpc.txt  78
-rw-r--r--  Documentation/devicetree/bindings/misc/qcom,fastrpc.yaml  144
-rw-r--r--  Documentation/devicetree/bindings/misc/qemu,vcpu-stall-detector.yaml  51
-rw-r--r--  Documentation/devicetree/bindings/misc/xlnx,tmr-inject.yaml  47
-rw-r--r--  Documentation/devicetree/bindings/misc/xlnx,tmr-manager.yaml  47
-rw-r--r--  Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml  11
-rw-r--r--  Documentation/devicetree/bindings/mmc/amlogic,meson-gx-mmc.yaml  76
-rw-r--r--  Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt  39
-rw-r--r--  Documentation/devicetree/bindings/mmc/amlogic,meson-mx-sdhc.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mmc/arasan,sdhci.yaml  17
-rw-r--r--  Documentation/devicetree/bindings/mmc/arm,pl18x.yaml  29
-rw-r--r--  Documentation/devicetree/bindings/mmc/aspeed,sdhci.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.txt  53
-rw-r--r--  Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml  118
-rw-r--r--  Documentation/devicetree/bindings/mmc/cdns,sdhci.yaml  54
-rw-r--r--  Documentation/devicetree/bindings/mmc/exynos-dw-mshc.txt  92
-rw-r--r--  Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/mmc/fsl-imx-mmc.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mmc/fujitsu,sdhci-fujitsu.yaml  66
-rw-r--r--  Documentation/devicetree/bindings/mmc/img-dw-mshc.txt  28
-rw-r--r--  Documentation/devicetree/bindings/mmc/ingenic,mmc.yaml  41
-rw-r--r--  Documentation/devicetree/bindings/mmc/litex,mmc.yaml  78
-rw-r--r--  Documentation/devicetree/bindings/mmc/marvell,dove-sdhci.yaml  44
-rw-r--r--  Documentation/devicetree/bindings/mmc/marvell,orion-sdio.yaml  44
-rw-r--r--  Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt  173
-rw-r--r--  Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml  277
-rw-r--r--  Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-card.txt  30
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-card.yaml  48
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-controller.yaml  27
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-pwrseq-emmc.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-pwrseq-sd8787.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-spi-slot.txt  29
-rw-r--r--  Documentation/devicetree/bindings/mmc/mmc-spi-slot.yaml  75
-rw-r--r--  Documentation/devicetree/bindings/mmc/mtk-sd.yaml  243
-rw-r--r--  Documentation/devicetree/bindings/mmc/mxs-mmc.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt  143
-rw-r--r--  Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.yaml  313
-rw-r--r--  Documentation/devicetree/bindings/mmc/orion-sdio.txt  16
-rw-r--r--  Documentation/devicetree/bindings/mmc/owl-mmc.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mmc/renesas,mmcif.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml  74
-rw-r--r--  Documentation/devicetree/bindings/mmc/rockchip-dw-mshc.yaml  10
-rw-r--r--  Documentation/devicetree/bindings/mmc/samsung,exynos-dw-mshc.yaml  160
-rw-r--r--  Documentation/devicetree/bindings/mmc/samsung,s3c6410-sdhci.yaml  81
-rw-r--r--  Documentation/devicetree/bindings/mmc/samsung,s3cmci.txt  42
-rw-r--r--  Documentation/devicetree/bindings/mmc/samsung-sdhci.txt  32
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-am654.yaml  76
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-common.yaml  32
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-dove.txt  14
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-fujitsu.txt  32
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-msm.txt  120
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-msm.yaml  259
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-omap.txt  9
-rw-r--r--  Documentation/devicetree/bindings/mmc/sdhci-pxa.yaml  19
-rw-r--r--  Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml  14
-rw-r--r--  Documentation/devicetree/bindings/mmc/socfpga-dw-mshc.txt  23
-rw-r--r--  Documentation/devicetree/bindings/mmc/socionext,uniphier-sd.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/mmc/starfive,jh7110-mmc.yaml  77
-rw-r--r--  Documentation/devicetree/bindings/mmc/sunplus,mmc.yaml  61
-rw-r--r--  Documentation/devicetree/bindings/mmc/synopsys-dw-mshc-common.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mmc/synopsys-dw-mshc.yaml  39
-rw-r--r--  Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml  38
-rw-r--r--  Documentation/devicetree/bindings/mtd/amlogic,meson-nand.txt  60
-rw-r--r--  Documentation/devicetree/bindings/mtd/amlogic,meson-nand.yaml  93
-rw-r--r--  Documentation/devicetree/bindings/mtd/arasan,nand-controller.yaml  11
-rw-r--r--  Documentation/devicetree/bindings/mtd/arm,pl353-nand-r2p1.yaml  37
-rw-r--r--  Documentation/devicetree/bindings/mtd/aspeed-smc.txt  51
-rw-r--r--  Documentation/devicetree/bindings/mtd/atmel-nand.txt  6
-rw-r--r--  Documentation/devicetree/bindings/mtd/brcm,brcmnand.yaml  96
-rw-r--r--  Documentation/devicetree/bindings/mtd/common.txt  1
-rw-r--r--  Documentation/devicetree/bindings/mtd/cortina,gemini-flash.txt  24
-rw-r--r--  Documentation/devicetree/bindings/mtd/denali,nand.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/mtd/elm.txt  16
-rw-r--r--  Documentation/devicetree/bindings/mtd/gpmc-nand.txt  147
-rw-r--r--  Documentation/devicetree/bindings/mtd/gpmc-nor.txt  98
-rw-r--r--  Documentation/devicetree/bindings/mtd/gpmc-onenand.txt  48
-rw-r--r--  Documentation/devicetree/bindings/mtd/gpmi-nand.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt  2
-rw-r--r--  Documentation/devicetree/bindings/mtd/ingenic,nand.yaml  151
-rw-r--r--  Documentation/devicetree/bindings/mtd/intel,lgm-ebunand.yaml  91
-rw-r--r--  Documentation/devicetree/bindings/mtd/intel,lgm-nand.yaml  99
-rw-r--r--  Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml  39
-rw-r--r--  Documentation/devicetree/bindings/mtd/lpc32xx-mlc.txt  2
-rw-r--r--  Documentation/devicetree/bindings/mtd/lpc32xx-slc.txt  2
-rw-r--r--  Documentation/devicetree/bindings/mtd/mediatek,mtk-nfc.yaml  155
-rw-r--r--  Documentation/devicetree/bindings/mtd/mediatek,nand-ecc-engine.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/mtd/microchip,mchp48l640.yaml  25
-rw-r--r--  Documentation/devicetree/bindings/mtd/mtd-physmap.yaml  26
-rw-r--r--  Documentation/devicetree/bindings/mtd/mtd.yaml  27
-rw-r--r--  Documentation/devicetree/bindings/mtd/mtk-nand.txt  176
-rw-r--r--  Documentation/devicetree/bindings/mtd/mxc-nand.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/mtd/mxicy,nand-ecc-engine.yaml  77
-rw-r--r--  Documentation/devicetree/bindings/mtd/nand-chip.yaml  74
-rw-r--r--  Documentation/devicetree/bindings/mtd/nand-controller.yaml  77
-rw-r--r--  Documentation/devicetree/bindings/mtd/partition.txt  33
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/arm,arm-firmware-suite.txt  17
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/arm,arm-firmware-suite.yaml  30
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/brcm,bcm4908-partitions.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/brcm,bcm947xx-cfe-partitions.txt  42
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/brcm,bcm947xx-cfe-partitions.yaml  50
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml  49
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/linksys,ns-partitions.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/nvmem-cells.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/partition.yaml  25
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/partitions.yaml  41
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/qcom,smem-part.yaml  31
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/redboot-fis.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/tplink,safeloader-partitions.yaml  57
-rw-r--r--  Documentation/devicetree/bindings/mtd/partitions/u-boot.yaml  56
-rw-r--r--  Documentation/devicetree/bindings/mtd/qcom,nandc.yaml  158
-rw-r--r--  Documentation/devicetree/bindings/mtd/renesas-nandc.yaml  66
-rw-r--r--  Documentation/devicetree/bindings/mtd/rockchip,nand-controller.yaml  9
-rw-r--r--  Documentation/devicetree/bindings/mtd/spi-nand.txt  5
-rw-r--r--  Documentation/devicetree/bindings/mtd/spi-nand.yaml  28
-rw-r--r--  Documentation/devicetree/bindings/mtd/st,stm32-fmc2-nand.yaml  53
-rw-r--r--  Documentation/devicetree/bindings/mtd/ti,am654-hbmc.yaml  36
-rw-r--r--  Documentation/devicetree/bindings/mtd/ti,elm.yaml  72
-rw-r--r--  Documentation/devicetree/bindings/mtd/ti,gpmc-nand.yaml  129
-rw-r--r--  Documentation/devicetree/bindings/mtd/ti,gpmc-onenand.yaml  84
-rw-r--r--  Documentation/devicetree/bindings/mux/gpio-mux.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/mux/mux-consumer.yaml  23
-rw-r--r--  Documentation/devicetree/bindings/mux/mux-controller.yaml  28
-rw-r--r--  Documentation/devicetree/bindings/mux/reg-mux.yaml  11
-rw-r--r--  Documentation/devicetree/bindings/nds32/andestech-boards  40
-rw-r--r--  Documentation/devicetree/bindings/nds32/atl2c.txt  28
-rw-r--r--  Documentation/devicetree/bindings/nds32/cpus.txt  38
-rw-r--r--  Documentation/devicetree/bindings/net/actions,owl-emac.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/net/adi,adin.yaml  18
-rw-r--r--  Documentation/devicetree/bindings/net/adi,adin1110.yaml  81
-rw-r--r--  Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/net/altera_tse.txt  113
-rw-r--r--  Documentation/devicetree/bindings/net/altr,tse.yaml  168
-rw-r--r--  Documentation/devicetree/bindings/net/amlogic,g12a-mdio-mux.yaml  80
-rw-r--r--  Documentation/devicetree/bindings/net/amlogic,gxl-mdio-mux.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/net/asix,ax88178.yaml  70
-rw-r--r--  Documentation/devicetree/bindings/net/asix,ax88796c.yaml  74
-rw-r--r--  Documentation/devicetree/bindings/net/aspeed,ast2600-mdio.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/net/bluetooth.txt  5
-rw-r--r--  Documentation/devicetree/bindings/net/bluetooth/bluetooth-controller.yaml  29
-rw-r--r--  Documentation/devicetree/bindings/net/bluetooth/brcm,bcm4377-bluetooth.yaml  81
-rw-r--r--  Documentation/devicetree/bindings/net/bluetooth/nxp,88w8987-bt.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/net/bluetooth/qualcomm-bluetooth.yaml (renamed from Documentation/devicetree/bindings/net/qualcomm-bluetooth.yaml)  23
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,amac.txt  30
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,amac.yaml  88
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,bcm6368-mdio-mux.yaml  26
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,bcmgenet.txt  124
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,bcmgenet.yaml  143
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.txt  62
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,mdio-mux-iproc.yaml  80
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,systemport.txt  38
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,systemport.yaml  86
-rw-r--r--  Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/net/can/allwinner,sun4i-a10-can.yaml  29
-rw-r--r--  Documentation/devicetree/bindings/net/can/bosch,c_can.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/net/can/bosch,m_can.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/net/can/can-controller.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/net/can/can-transceiver.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/net/can/ctu,ctucanfd.yaml  66
-rw-r--r--  Documentation/devicetree/bindings/net/can/fsl,flexcan.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml  24
-rw-r--r--  Documentation/devicetree/bindings/net/can/microchip,mpfs-can.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/net/can/nxp,sja1000.yaml  136
-rw-r--r--  Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml  151
-rw-r--r--  Documentation/devicetree/bindings/net/can/sja1000.txt  58
-rw-r--r--  Documentation/devicetree/bindings/net/can/st,stm32-bxcan.yaml  85
-rw-r--r--  Documentation/devicetree/bindings/net/can/tcan4x5x.txt  2
-rw-r--r--  Documentation/devicetree/bindings/net/can/xilinx,can.yaml  161
-rw-r--r--  Documentation/devicetree/bindings/net/can/xilinx_can.txt  61
-rw-r--r--  Documentation/devicetree/bindings/net/cdns,macb.yaml  214
-rw-r--r--  Documentation/devicetree/bindings/net/cortina,gemini-ethernet.txt  92
-rw-r--r--  Documentation/devicetree/bindings/net/cortina,gemini-ethernet.yaml  138
-rw-r--r--  Documentation/devicetree/bindings/net/cpsw.txt  2
-rw-r--r--  Documentation/devicetree/bindings/net/davicom,dm9051.yaml  62
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/ar9331.txt  1
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/arrow,xrs700x.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/brcm,b53.yaml  123
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/brcm,sf2.yaml  27
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/dsa-port.yaml  81
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/dsa.yaml  93
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/hirschmann,hellcreek.yaml  15
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/lan9303.txt  2
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt  1
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/mediatek,mt7530.yaml  812
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/microchip,ksz.yaml  17
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/microchip,lan937x.yaml  192
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/mscc,ocelot.yaml  260
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/mt7530.txt  327
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/nxp,sja1105.yaml  54
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/ocelot.txt  213
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/qca8k.txt  215
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/qca8k.yaml  320
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/realtek-smi.txt  153
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/realtek.yaml  386
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/renesas,rzn1-a5psw.yaml  158
-rw-r--r--  Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt  2
-rw-r--r--  Documentation/devicetree/bindings/net/emac_rockchip.txt  52
-rw-r--r--  Documentation/devicetree/bindings/net/engleder,tsnep.yaml  118
-rw-r--r--  Documentation/devicetree/bindings/net/ethernet-controller.yaml  229
-rw-r--r--  Documentation/devicetree/bindings/net/ethernet-phy.yaml  68
-rw-r--r--  Documentation/devicetree/bindings/net/ethernet-switch-port.yaml  26
-rw-r--r--  Documentation/devicetree/bindings/net/ethernet-switch.yaml  66
-rw-r--r--  Documentation/devicetree/bindings/net/fsl,fec.yaml  34
-rw-r--r--  Documentation/devicetree/bindings/net/fsl,fman-dtsec.yaml  172
-rw-r--r--  Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/net/fsl-fman.txt  164
-rw-r--r--  Documentation/devicetree/bindings/net/gpmc-eth.txt  97
-rw-r--r--  Documentation/devicetree/bindings/net/ingenic,mac.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/net/intel,dwmac-plat.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/net/intel,ixp46x-ptp-timer.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/net/intel,ixp4xx-ethernet.yaml  31
-rw-r--r--  Documentation/devicetree/bindings/net/intel,ixp4xx-hss.yaml  123
-rw-r--r--  Documentation/devicetree/bindings/net/lantiq,etop-xway.yaml  68
-rw-r--r--  Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt  21
-rw-r--r--  Documentation/devicetree/bindings/net/lantiq,xrx200-net.yaml  58
-rw-r--r--  Documentation/devicetree/bindings/net/litex,liteeth.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/net/macb.txt  56
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,dfx-server.yaml  62
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,mvusb.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,orion-mdio.yaml  82
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,pp2.yaml  305
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,prestera.txt  81
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,prestera.yaml  91
-rw-r--r--  Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt  1
-rw-r--r--  Documentation/devicetree/bindings/net/marvell-bluetooth.txt  25
-rw-r--r--  Documentation/devicetree/bindings/net/marvell-bluetooth.yaml  49
-rw-r--r--  Documentation/devicetree/bindings/net/marvell-orion-mdio.txt  54
-rw-r--r--  Documentation/devicetree/bindings/net/marvell-pp2.txt  141
-rw-r--r--  Documentation/devicetree/bindings/net/maxlinear,gpy2xx.yaml  47
-rw-r--r--  Documentation/devicetree/bindings/net/mctp-i2c-controller.yaml  92
-rw-r--r--  Documentation/devicetree/bindings/net/mdio-gpio.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/net/mdio-mux-meson-g12a.txt  48
-rw-r--r--  Documentation/devicetree/bindings/net/mdio-mux.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/net/mdio.yaml  10
-rw-r--r--  Documentation/devicetree/bindings/net/mediatek,mt7620-gsw.txt  24
-rw-r--r--  Documentation/devicetree/bindings/net/mediatek,net.yaml  489
-rw-r--r--  Documentation/devicetree/bindings/net/mediatek,star-emac.yaml  24
-rw-r--r--  Documentation/devicetree/bindings/net/mediatek-dwmac.txt  91
-rw-r--r--  Documentation/devicetree/bindings/net/mediatek-dwmac.yaml  184
-rw-r--r--  Documentation/devicetree/bindings/net/mediatek-net.txt  98
-rw-r--r--  Documentation/devicetree/bindings/net/micrel,ks8851.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/net/micrel-ksz90x1.txt  1
-rw-r--r--  Documentation/devicetree/bindings/net/micrel.txt  10
-rw-r--r--  Documentation/devicetree/bindings/net/microchip,lan95xx.yaml  65
-rw-r--r--  Documentation/devicetree/bindings/net/microchip,lan966x-switch.yaml  171
-rw-r--r--  Documentation/devicetree/bindings/net/microchip,sparx5-switch.yaml  42
-rw-r--r--  Documentation/devicetree/bindings/net/motorcomm,yt8xxx.yaml  117
-rw-r--r--  Documentation/devicetree/bindings/net/mscc,miim.yaml  61
-rw-r--r--  Documentation/devicetree/bindings/net/mscc,vsc7514-switch.yaml  233
-rw-r--r--  Documentation/devicetree/bindings/net/mscc-miim.txt  26
-rw-r--r--  Documentation/devicetree/bindings/net/mscc-ocelot.txt  83
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/marvell,nci.yaml  170
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/nfcmrvl.txt  84
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/nxp,nci.yaml  62
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/nxp,pn532.yaml  65
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/nxp,pn544.yaml  58
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/nxp-nci.txt  33
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/pn532.txt  46
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/pn544.txt  33
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/samsung,s3fwrn5.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st,st-nci.yaml  105
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st,st21nfca.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st,st95hf.yaml  58
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st-nci-i2c.txt  38
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st-nci-spi.txt  36
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st21nfca.txt  37
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/st95hf.txt  45
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/ti,trf7970a.yaml  99
-rw-r--r--  Documentation/devicetree/bindings/net/nfc/trf7970a.txt  43
-rw-r--r--  Documentation/devicetree/bindings/net/nvidia,tegra234-mgbe.yaml  162
-rw-r--r--  Documentation/devicetree/bindings/net/nxp,dwmac-imx.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/net/nxp,tja11xx.yaml  17
-rw-r--r--  Documentation/devicetree/bindings/net/oxnas-dwmac.txt  3
-rw-r--r--  Documentation/devicetree/bindings/net/pcs/fsl,lynx-pcs.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/net/pcs/mediatek,sgmiisys.yaml  55
-rw-r--r--  Documentation/devicetree/bindings/net/pcs/renesas,rzn1-miic.yaml  171
-rw-r--r--  Documentation/devicetree/bindings/net/pse-pd/podl-pse-regulator.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml  33
-rw-r--r--  Documentation/devicetree/bindings/net/qca,ar71xx.yaml  17
-rw-r--r--  Documentation/devicetree/bindings/net/qca,ar803x.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/net/qcom,bam-dmux.yaml  92
-rw-r--r--  Documentation/devicetree/bindings/net/qcom,ethqos.txt  64
-rw-r--r--  Documentation/devicetree/bindings/net/qcom,ethqos.yaml  111
-rw-r--r--  Documentation/devicetree/bindings/net/qcom,ipa.yaml  103
-rw-r--r--  Documentation/devicetree/bindings/net/qcom,ipq4019-mdio.yaml  48
-rw-r--r--  Documentation/devicetree/bindings/net/qcom,ipq8064-mdio.yaml  9
-rw-r--r--  Documentation/devicetree/bindings/net/qcom-emac.txt  2
-rw-r--r--  Documentation/devicetree/bindings/net/ralink,rt2880-net.txt  59
-rw-r--r--  Documentation/devicetree/bindings/net/ralink,rt3050-esw.txt  30
-rw-r--r--  Documentation/devicetree/bindings/net/realtek-bluetooth.yaml  25
-rw-r--r--  Documentation/devicetree/bindings/net/renesas,ether.yaml  17
-rw-r--r--  Documentation/devicetree/bindings/net/renesas,etheravb.yaml  98
-rw-r--r--  Documentation/devicetree/bindings/net/renesas,r8a779f0-ether-switch.yaml  262
-rw-r--r--  Documentation/devicetree/bindings/net/rfkill-gpio.yaml  51
-rw-r--r--  Documentation/devicetree/bindings/net/rockchip,emac.yaml  115
-rw-r--r--  Documentation/devicetree/bindings/net/rockchip-dwmac.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/net/sff,sfp.txt  85
-rw-r--r--  Documentation/devicetree/bindings/net/sff,sfp.yaml  143
-rw-r--r--  Documentation/devicetree/bindings/net/smsc,lan91c111.yaml  61
-rw-r--r--  Documentation/devicetree/bindings/net/smsc-lan91c111.txt  17
-rw-r--r--  Documentation/devicetree/bindings/net/snps,dwmac.yaml  468
-rw-r--r--  Documentation/devicetree/bindings/net/socionext,synquacer-netsec.yaml  73
-rw-r--r--  Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/net/socionext-netsec.txt  56
-rw-r--r--  Documentation/devicetree/bindings/net/starfive,jh7110-dwmac.yaml  144
-rw-r--r--  Documentation/devicetree/bindings/net/sti-dwmac.txt  3
-rw-r--r--  Documentation/devicetree/bindings/net/stm32-dwmac.yaml  26
-rw-r--r--  Documentation/devicetree/bindings/net/sunplus,sp7021-emac.yaml  143
-rw-r--r--  Documentation/devicetree/bindings/net/ti,bluetooth.yaml  92
-rw-r--r--  Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml  19
-rw-r--r--  Documentation/devicetree/bindings/net/ti,davinci-mdio.yaml  11
-rw-r--r--  Documentation/devicetree/bindings/net/ti,dp83822.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/net/ti,dp83867.yaml  26
-rw-r--r--  Documentation/devicetree/bindings/net/ti,dp83869.yaml  10
-rw-r--r--  Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml  72
-rw-r--r--  Documentation/devicetree/bindings/net/ti,k3-am654-cpts.yaml  18
-rw-r--r--  Documentation/devicetree/bindings/net/ti-bluetooth.txt  60
-rw-r--r--  Documentation/devicetree/bindings/net/toshiba,visconti-dwmac.yaml  9
-rw-r--r--  Documentation/devicetree/bindings/net/vertexcom-mse102x.yaml  71
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/brcm,bcm4329-fmac.yaml  49
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/esp,esp8089.txt  30
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/esp,esp8089.yaml  43
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/ieee80211.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/marvell-8xxx.txt  4
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/mediatek,mt76.yaml  76
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/microchip,wilc1000.yaml  28
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/qca,ath9k.txt  48
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/qca,ath9k.yaml  90
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt  215
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml  358
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml  58
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml  415
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/silabs,wfx.yaml  132
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt  57
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/ti,wlcore.txt  45
-rw-r--r--  Documentation/devicetree/bindings/net/wireless/ti,wlcore.yaml  142
-rw-r--r--  Documentation/devicetree/bindings/net/xilinx_axienet.txt  10
-rw-r--r--  Documentation/devicetree/bindings/net/xlnx,emaclite.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/numa.txt  46
-rw-r--r--  Documentation/devicetree/bindings/nvme/apple,nvme-ans.yaml  113
-rw-r--r--  Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/nvmem/amlogic,meson-gxbb-efuse.yaml  57
-rw-r--r--  Documentation/devicetree/bindings/nvmem/amlogic,meson6-efuse.yaml  57
-rw-r--r--  Documentation/devicetree/bindings/nvmem/amlogic-efuse.txt  48
-rw-r--r--  Documentation/devicetree/bindings/nvmem/amlogic-meson-mx-efuse.txt  22
-rw-r--r--  Documentation/devicetree/bindings/nvmem/apple,efuses.yaml  50
-rw-r--r--  Documentation/devicetree/bindings/nvmem/brcm,nvram.yaml  30
-rw-r--r--  Documentation/devicetree/bindings/nvmem/fsl,layerscape-sfp.yaml  62
-rw-r--r--  Documentation/devicetree/bindings/nvmem/fsl,scu-ocotp.yaml  56
-rw-r--r--  Documentation/devicetree/bindings/nvmem/imx-iim.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/nvmem/imx-ocotp.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/nvmem/ingenic,jz4780-efuse.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/nvmem/layouts/kontron,sl28-vpd.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/nvmem/layouts/nvmem-layout.yaml  34
-rw-r--r--  Documentation/devicetree/bindings/nvmem/layouts/onie,tlv-layout.yaml  147
-rw-r--r--  Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml  90
-rw-r--r--  Documentation/devicetree/bindings/nvmem/microchip,lan9662-otpc.yaml  45
-rw-r--r--  Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml  50
-rw-r--r--  Documentation/devicetree/bindings/nvmem/mtk-efuse.txt  41
-rw-r--r--  Documentation/devicetree/bindings/nvmem/mxs-ocotp.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/nvmem/nintendo-otp.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/nvmem/nvmem-consumer.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/nvmem/nvmem.yaml  30
-rw-r--r--  Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/nvmem/qcom,spmi-sdam.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/nvmem/rmem.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/nvmem/rockchip-efuse.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/nvmem/snvs-lpgpr.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/nvmem/socionext,uniphier-efuse.yaml  105
-rw-r--r--  Documentation/devicetree/bindings/nvmem/st,stm32-romem.yaml  10
-rw-r--r--  Documentation/devicetree/bindings/nvmem/sunplus,sp7021-ocotp.yaml  84
-rw-r--r--  Documentation/devicetree/bindings/nvmem/u-boot,env.yaml  101
-rw-r--r--  Documentation/devicetree/bindings/opp/allwinner,sun50i-h6-operating-points.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/opp/opp-v1.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/opp/opp-v2-base.yaml  39
-rw-r--r--  Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml  281
-rw-r--r--  Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/opp/opp-v2.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt  796
-rw-r--r--  Documentation/devicetree/bindings/opp/qcom-opp.txt  19
-rw-r--r--  Documentation/devicetree/bindings/pci/amlogic,axg-pcie.yaml  134
-rw-r--r--  Documentation/devicetree/bindings/pci/amlogic,meson-pcie.txt  70
-rw-r--r--  Documentation/devicetree/bindings/pci/apple,pcie.yaml  175
-rw-r--r--  Documentation/devicetree/bindings/pci/baikal,bt1-pcie.yaml  168
-rw-r--r--  Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt  133
-rw-r--r--  Documentation/devicetree/bindings/pci/brcm,iproc-pcie.yaml  184
-rw-r--r--  Documentation/devicetree/bindings/pci/brcm,stb-pcie.yaml  33
-rw-r--r--  Documentation/devicetree/bindings/pci/cdns,cdns-pcie-ep.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/pci/cdns,cdns-pcie-host.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/cdns-pcie-ep.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/pci/cdns-pcie-host.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/pci/cdns-pcie.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/pci/fsl,imx6q-pcie-common.yaml  270
-rw-r--r--  Documentation/devicetree/bindings/pci/fsl,imx6q-pcie-ep.yaml  123
-rw-r--r--  Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.yaml  173
-rw-r--r--  Documentation/devicetree/bindings/pci/hisilicon,kirin-pcie.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/pci/host-generic-pci.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/pci/intel,keembay-pcie-ep.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/pci/intel,keembay-pcie.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/pci/layerscape-pci.txt  65
-rw-r--r--  Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml  145
-rw-r--r--  Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml  88
-rw-r--r--  Documentation/devicetree/bindings/pci/mediatek-pcie.txt  1
-rw-r--r--  Documentation/devicetree/bindings/pci/microchip,pcie-host.yaml  52
-rw-r--r--  Documentation/devicetree/bindings/pci/mvebu-pci.txt  16
-rw-r--r--  Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie-ep.yaml  319
-rw-r--r--  Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt  245
-rw-r--r--  Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.yaml  380
-rw-r--r--  Documentation/devicetree/bindings/pci/pci-ep.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/pci-rcar-gen2.txt  84
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml  204
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie.txt  376
-rw-r--r--  Documentation/devicetree/bindings/pci/qcom,pcie.yaml  953
-rw-r--r--  Documentation/devicetree/bindings/pci/rcar-pci-ep.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/pci/renesas,pci-rcar-gen2.yaml  186
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip,rk3399-pcie-common.yaml  69
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip,rk3399-pcie-ep.yaml  68
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip,rk3399-pcie.yaml  132
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml  135
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip-pcie-ep.txt  62
-rw-r--r--  Documentation/devicetree/bindings/pci/rockchip-pcie-host.txt  135
-rw-r--r--  Documentation/devicetree/bindings/pci/samsung,exynos-pcie.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/pci/snps,dw-pcie-common.yaml  266
-rw-r--r--  Documentation/devicetree/bindings/pci/snps,dw-pcie-ep.yaml  212
-rw-r--r--  Documentation/devicetree/bindings/pci/snps,dw-pcie.yaml  264
-rw-r--r--  Documentation/devicetree/bindings/pci/socionext,uniphier-pcie-ep.yaml  80
-rw-r--r--  Documentation/devicetree/bindings/pci/socionext,uniphier-pcie.yaml  117
-rw-r--r--  Documentation/devicetree/bindings/pci/ti,am65-pci-ep.yaml  10
-rw-r--r--  Documentation/devicetree/bindings/pci/ti,am65-pci-host.yaml  20
-rw-r--r--  Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml  13
-rw-r--r--  Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/pci/toshiba,visconti-pcie.yaml  9
-rw-r--r--  Documentation/devicetree/bindings/pci/uniphier-pcie.txt  82
-rw-r--r--  Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt  73
-rw-r--r--  Documentation/devicetree/bindings/pci/xilinx-pcie.txt  88
-rw-r--r--  Documentation/devicetree/bindings/pci/xilinx-versal-cpm.yaml  49
-rw-r--r--  Documentation/devicetree/bindings/pci/xlnx,axi-pcie-host.yaml  88
-rw-r--r--  Documentation/devicetree/bindings/pci/xlnx,nwl-pcie.yaml  149
-rw-r--r--  Documentation/devicetree/bindings/peci/peci-aspeed.yaml  72
-rw-r--r--  Documentation/devicetree/bindings/peci/peci-controller.yaml  33
-rw-r--r--  Documentation/devicetree/bindings/perf/amlogic,g12-ddr-pmu.yaml  54
-rw-r--r--  Documentation/devicetree/bindings/perf/arm,ccn.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/perf/arm,cmn.yaml  23
-rw-r--r--  Documentation/devicetree/bindings/perf/arm,dsu-pmu.yaml  44
-rw-r--r--  Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml  70
-rw-r--r--  Documentation/devicetree/bindings/perf/arm-ccn.txt  23
-rw-r--r--  Documentation/devicetree/bindings/perf/marvell-cn10k-ddr.yaml  37
-rw-r--r--  Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml  63
-rw-r--r--  Documentation/devicetree/bindings/perf/nds32v3-pmu.txt  17
-rw-r--r--  Documentation/devicetree/bindings/perf/riscv,pmu.yaml  160
-rw-r--r--  Documentation/devicetree/bindings/perf/spe-pmu.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun4i-a10-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun50i-a64-usb-phy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun50i-h6-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun50i-h6-usb3-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun5i-a13-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml  25
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun8i-a23-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun8i-a83t-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun8i-h3-usb-phy.yaml  28
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun8i-r40-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun8i-v3s-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,sun9i-a80-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/allwinner,suniv-f1c100s-usb-phy.yaml  83
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,axg-mipi-dphy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,g12a-mipi-dphy-analog.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,g12a-usb2-phy.yaml  78
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,g12a-usb3-pcie-phy.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson-axg-mipi-pcie-analog.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson-axg-pcie.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson-g12a-usb2-phy.yaml  78
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson-g12a-usb3-pcie-phy.yaml  59
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson-gxl-usb2-phy.yaml  56
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson8-hdmi-tx-phy.yaml  65
-rw-r--r--  Documentation/devicetree/bindings/phy/amlogic,meson8b-usb2-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/bcm-ns-usb2-phy.yaml  25
-rw-r--r--  Documentation/devicetree/bindings/phy/brcm,bcm63xx-usbh-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/brcm,cygnus-pcie-phy.txt  47
-rw-r--r--  Documentation/devicetree/bindings/phy/brcm,cygnus-pcie-phy.yaml  77
-rw-r--r--  Documentation/devicetree/bindings/phy/brcm,mdio-mux-bus-pci.txt  27
-rw-r--r--  Documentation/devicetree/bindings/phy/brcm,ns2-pcie-phy.yaml  41
-rw-r--r--  Documentation/devicetree/bindings/phy/brcm,sata-phy.yaml  14
-rw-r--r--  Documentation/devicetree/bindings/phy/calxeda-combophy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/cdns,dphy-rx.yaml  42
-rw-r--r--  Documentation/devicetree/bindings/phy/cdns,dphy.txt  20
-rw-r--r--  Documentation/devicetree/bindings/phy/cdns,dphy.yaml  57
-rw-r--r--  Documentation/devicetree/bindings/phy/cdns,salvo-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/fsl,imx8-pcie-phy.yaml  102
-rw-r--r--  Documentation/devicetree/bindings/phy/fsl,imx8mq-usb-phy.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/phy/fsl,imx8qm-lvds-phy.yaml  61
-rw-r--r--  Documentation/devicetree/bindings/phy/fsl,lynx-28g.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/phy/hisilicon,hi3660-usb3.yaml  29
-rw-r--r--  Documentation/devicetree/bindings/phy/hisilicon,hi3670-usb3.yaml  35
-rw-r--r--  Documentation/devicetree/bindings/phy/ingenic,phy-usb.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/intel,combo-phy.yaml  8
-rw-r--r--  Documentation/devicetree/bindings/phy/intel,keembay-phy-usb.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/intel,lgm-emmc-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/intel,lgm-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/lantiq,vrx200-pcie-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/marvell,armada-3700-utmi-phy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/marvell,armada-cp110-utmi-phy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/marvell,mmp3-hsic-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/marvell,mmp3-usb-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,dsi-phy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,hdmi-phy.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,mt7621-pci-phy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,pcie-phy.yaml  75
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,tphy.yaml  36
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,ufs-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/mediatek,xsphy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/meson-gxl-usb2-phy.txt  21
-rw-r--r--  Documentation/devicetree/bindings/phy/microchip,lan966x-serdes.yaml  59
-rw-r--r--  Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt  29
-rw-r--r--  Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.yaml  107
-rw-r--r--  Documentation/devicetree/bindings/phy/mscc,vsc7514-serdes.yaml  56
-rw-r--r--  Documentation/devicetree/bindings/phy/mxs-usb-phy.txt  5
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt  779
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.yaml  654
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra186-xusb-padctl.yaml  544
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra194-xusb-padctl.yaml  632
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra20-usb-phy.txt  74
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra20-usb-phy.yaml  373
-rw-r--r--  Documentation/devicetree/bindings/phy/nvidia,tegra210-xusb-padctl.yaml  786
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-cadence-sierra.yaml  23
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-cadence-torrent.yaml  16
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-ocelot-serdes.txt  43
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.yaml  151
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-rockchip-naneng-combphy.yaml  110
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-stih407-usb.txt  2
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-stm32-usbphyc.yaml  140
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt  28
-rw-r--r--  Documentation/devicetree/bindings/phy/phy-tegra194-p2u.yaml  53
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,edp-phy.yaml  77
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,hdmi-phy-other.yaml  125
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,hdmi-phy-qmp.yaml  94
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,ipq8074-qmp-pcie-phy.yaml  299
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,msm8996-qmp-pcie-phy.yaml  189
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,msm8996-qmp-ufs-phy.yaml  244
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,msm8996-qmp-usb3-phy.yaml  394
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,pcie2-phy.yaml  86
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,qmp-phy.yaml  453
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,qmp-usb3-dp-phy.yaml  213
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,qusb2-phy.yaml  176
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,sc7180-qmp-usb3-dp-phy.yaml  276
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml  216
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-ufs-phy.yaml  113
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb3-uni-phy.yaml  102
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-usb43dp-phy.yaml  103
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,snps-eusb2-phy.yaml  79
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,snps-eusb2-repeater.yaml  52
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-hs-28nm.yaml  5
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-hs-phy.txt  84
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-hs-phy.yaml  111
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-hsic-phy.txt  65
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-hsic-phy.yaml  67
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-snps-femto-v2.yaml  122
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom,usb-ss.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt  42
-rw-r--r--  Documentation/devicetree/bindings/phy/qcom-usb-ipq4019-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/rcar-gen2-phy.txt  112
-rw-r--r--  Documentation/devicetree/bindings/phy/renesas,r8a779f0-ether-serdes.yaml  54
-rw-r--r--  Documentation/devicetree/bindings/phy/renesas,rcar-gen2-usb-phy.yaml  123
-rw-r--r--  Documentation/devicetree/bindings/phy/renesas,usb2-phy.yaml  4
-rw-r--r--  Documentation/devicetree/bindings/phy/renesas,usb3-phy.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip,inno-usb2phy.yaml  188
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip,pcie3-phy.yaml  80
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip,px30-dsi-dphy.yaml  1
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip,rk3288-dp-phy.yaml  41
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip-dp-phy.txt  26
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip-inno-csi-dphy.yaml  3
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip-mipi-dphy-rx0.yaml  2
-rw-r--r--  Documentation/devicetree/bindings/phy/rockchip-usb-phy.yaml  11
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,dp-video-phy.yaml  40
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,exynos-hdmi-phy.yaml  43
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,exynos-pcie-phy.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,exynos5250-sata-phy.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,mipi-video-phy.yaml  112
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,ufs-phy.yaml  64
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,usb2-phy.yaml  102
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung,usb3-drd-phy.yaml  126
-rw-r--r--  Documentation/devicetree/bindings/phy/samsung-phy.txt  210
-rw-r--r--  Documentation/devicetree/bindings/phy/socionext,uniphier-ahci-phy.yaml  108
-rw-r--r--  Documentation/devicetree/bindings/phy/socionext,uniphier-pcie-phy.yaml  48
-rw-r--r--  Documentation/devicetree/bindings/phy/socionext,uniphier-usb2-phy.yaml  44
-rw-r--r--  Documentation/devicetree/bindings/phy/socionext,uniphier-usb3hs-phy.yaml  119
-rw-r--r--  Documentation/devicetree/bindings/phy/socionext,uniphier-usb3ss-phy.yaml  125
-rw-r--r--  Documentation/devicetree/bindings/phy/sunplus,sp7021-usb2-phy.yaml  73
-rw-r--r--  Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml  6
-rw-r--r--  Documentation/devicetree/bindings/phy/ti,phy-am654-serdes.yaml  7
-rw-r--r--  Documentation/devicetree/bindings/phy/ti,phy-gmii-sel.yaml  73
-rw-r--r--  Documentation/devicetree/bindings/phy/ti,phy-j721e-wiz.yaml  54
-rw-r--r--  Documentation/devicetree/bindings/phy/ti,tcan104x-can.yaml  12
-rw-r--r--  Documentation/devicetree/bindings/phy/transmit-amplitude.yaml  103
-rw-r--r--  Documentation/devicetree/bindings/phy/xlnx,zynqmp-psgtr.yaml  4
-rw-r--r--Documentation/devicetree/bindings/pinctrl/actions,s500-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/allwinner,sun4i-a10-pinctrl.yaml37
-rw-r--r--Documentation/devicetree/bindings/pinctrl/amlogic,meson-pinctrl-a1.yaml67
-rw-r--r--Documentation/devicetree/bindings/pinctrl/amlogic,meson-pinctrl-common.yaml57
-rw-r--r--Documentation/devicetree/bindings/pinctrl/amlogic,meson-pinctrl-g12a-aobus.yaml68
-rw-r--r--Documentation/devicetree/bindings/pinctrl/amlogic,meson-pinctrl-g12a-periphs.yaml72
-rw-r--r--Documentation/devicetree/bindings/pinctrl/amlogic,meson8-pinctrl-aobus.yaml76
-rw-r--r--Documentation/devicetree/bindings/pinctrl/amlogic,meson8-pinctrl-cbus.yaml78
-rw-r--r--Documentation/devicetree/bindings/pinctrl/apple,pinctrl.yaml21
-rw-r--r--Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml13
-rw-r--r--Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml72
-rw-r--r--Documentation/devicetree/bindings/pinctrl/aspeed,ast2600-pinctrl.yaml19
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm11351-pinctrl.txt2
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm4908-pinctrl.yaml73
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm6318-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm63268-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm6328-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm6358-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm6362-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm6368-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,ns-pinmux.yaml34
-rw-r--r--Documentation/devicetree/bindings/pinctrl/canaan,k210-fpioa.yaml15
-rw-r--r--Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml12
-rw-r--r--Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml25
-rw-r--r--Documentation/devicetree/bindings/pinctrl/cypress,cy8c95x0.yaml134
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx7d-pinctrl.txt87
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx7d-pinctrl.yaml113
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8m-pinctrl.yaml90
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.yaml81
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.yaml81
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8mp-pinctrl.yaml81
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8mq-pinctrl.yaml81
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8ulp-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx93-pinctrl.yaml85
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imxrt1050.yaml79
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imxrt1170.yaml77
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,scu-pinctrl.yaml74
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ingenic,pinctrl.yaml10
-rw-r--r--Documentation/devicetree/bindings/pinctrl/intel,lgm-io.yaml5
-rw-r--r--Documentation/devicetree/bindings/pinctrl/intel,pinctrl-keembay.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/lantiq,pinctrl-xway.txt35
-rw-r--r--Documentation/devicetree/bindings/pinctrl/loongson,ls2k-pinctrl.yaml123
-rw-r--r--Documentation/devicetree/bindings/pinctrl/marvell,ac5-pinctrl.yaml73
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt65xx-pinctrl.yaml52
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt6779-pinctrl.yaml127
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt6795-pinctrl.yaml228
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt6797-pinctrl.yaml173
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt7620-pinctrl.yaml298
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt7621-pinctrl.yaml261
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt7622-pinctrl.yaml43
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt76x8-pinctrl.yaml450
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt7981-pinctrl.yaml480
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt7986-pinctrl.yaml462
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt8183-pinctrl.yaml49
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt8186-pinctrl.yaml275
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt8188-pinctrl.yaml232
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt8192-pinctrl.yaml184
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt8195-pinctrl.yaml286
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mediatek,mt8365-pinctrl.yaml230
-rw-r--r--Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt93
-rw-r--r--Documentation/devicetree/bindings/pinctrl/microchip,sparx5-sgpio.yaml9
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mscc,ocelot-pinctrl.txt41
-rw-r--r--Documentation/devicetree/bindings/pinctrl/mscc,ocelot-pinctrl.yaml116
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nuvoton,wpcm450-pinctrl.yaml161
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra-pinmux-common.yaml178
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra114-pinmux.txt131
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra114-pinmux.yaml155
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-dpaux-padctl.txt59
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-pinmux.txt153
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra124-pinmux.yaml176
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt107
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.yaml284
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra20-pinmux.txt143
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra20-pinmux.yaml112
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra210-pinmux.txt166
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra210-pinmux.yaml142
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra30-pinmux.txt144
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra30-pinmux.yaml176
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nxp,s32g2-siul2-pinctrl.yaml123
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pincfg-node.yaml17
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt41
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-mt8192.yaml155
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-mt8195.yaml148
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-rk805.txt2
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-sx150x.txt72
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl.yaml45
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinmux-node.yaml4
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,ipq5332-tlmm.yaml125
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,ipq6018-pinctrl.yaml116
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt181
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.yaml128
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,ipq9574-tlmm.yaml130
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,lpass-lpi-pinctrl.yaml130
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-pinctrl.yaml133
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,mdm9607-tlmm.yaml124
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt161
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.yaml112
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8226-pinctrl.yaml107
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8660-pinctrl.txt96
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8660-pinctrl.yaml117
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8909-tlmm.yaml144
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt195
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.yaml159
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8953-pinctrl.yaml102
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt190
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.yaml157
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8974-pinctrl.txt121
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8974-pinctrl.yaml172
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8976-pinctrl.txt183
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8976-pinctrl.yaml129
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt186
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.yaml155
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt208
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.yaml175
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt202
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.yaml164
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.yaml330
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,pmic-mpp.txt187
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,pmic-mpp.yaml190
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,qcm2290-tlmm.yaml138
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt199
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.yaml169
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,qdu1000-tlmm.yaml125
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sa8775p-tlmm.yaml129
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc7180-pinctrl.txt187
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc7180-pinctrl.yaml151
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc7280-lpass-lpi-pinctrl.yaml139
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc7280-pinctrl.yaml86
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-pinctrl.yaml152
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc8180x-tlmm.yaml144
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-lpass-lpi-pinctrl.yaml155
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sc8280xp-tlmm.yaml153
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm630-pinctrl.yaml181
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt191
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm670-tlmm.yaml119
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt176
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.yaml162
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdx55-pinctrl.yaml100
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdx65-tlmm.yaml159
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm6115-pinctrl.yaml179
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm6115-tlmm.yaml144
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm6125-pinctrl.yaml132
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm6125-tlmm.yaml143
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm6350-tlmm.yaml156
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm6375-tlmm.yaml148
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm7150-tlmm.yaml162
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt190
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.yaml166
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8250-lpass-lpi-pinctrl.yaml161
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8250-pinctrl.yaml192
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8350-pinctrl.yaml145
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8350-tlmm.yaml143
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8450-lpass-lpi-pinctrl.yaml164
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8450-tlmm.yaml142
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8550-lpass-lpi-pinctrl.yaml150
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8550-tlmm.yaml154
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,tlmm-common.yaml34
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ralink,rt2880-pinctrl.yaml141
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ralink,rt2880-pinmux.yaml64
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ralink,rt305x-pinctrl.yaml206
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ralink,rt3352-pinctrl.yaml243
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ralink,rt3883-pinctrl.yaml261
-rw-r--r--Documentation/devicetree/bindings/pinctrl/ralink,rt5350-pinctrl.yaml206
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,pfc.yaml5
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,rza1-ports.yaml4
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,rza2-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,rzg2l-pinctrl.yaml39
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,rzg2l-poeg.yaml86
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,rzn1-pinctrl.yaml5
-rw-r--r--Documentation/devicetree/bindings/pinctrl/renesas,rzv2m-pinctrl.yaml170
-rw-r--r--Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt114
-rw-r--r--Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.yaml188
-rw-r--r--Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-gpio-bank.yaml52
-rw-r--r--Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-pins-cfg.yaml80
-rw-r--r--Documentation/devicetree/bindings/pinctrl/samsung,pinctrl-wakeup-interrupt.yaml106
-rw-r--r--Documentation/devicetree/bindings/pinctrl/samsung,pinctrl.yaml387
-rw-r--r--Documentation/devicetree/bindings/pinctrl/samsung-pinctrl.txt381
-rw-r--r--Documentation/devicetree/bindings/pinctrl/semtech,sx1501q.yaml208
-rw-r--r--Documentation/devicetree/bindings/pinctrl/socionext,uniphier-pinctrl.yaml62
-rw-r--r--Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml47
-rw-r--r--Documentation/devicetree/bindings/pinctrl/starfive,jh7100-pinctrl.yaml307
-rw-r--r--Documentation/devicetree/bindings/pinctrl/starfive,jh7110-aon-pinctrl.yaml124
-rw-r--r--Documentation/devicetree/bindings/pinctrl/starfive,jh7110-sys-pinctrl.yaml142
-rw-r--r--Documentation/devicetree/bindings/pinctrl/sunplus,sp7021-pinctrl.yaml377
-rw-r--r--Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml12
-rw-r--r--Documentation/devicetree/bindings/pinctrl/xlnx,zynq-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/xlnx,zynqmp-pinctrl.yaml3
-rw-r--r--Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml54
-rw-r--r--Documentation/devicetree/bindings/power/amlogic,meson-ee-pwrc.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/amlogic,meson-gx-pwrc.txt4
-rw-r--r--Documentation/devicetree/bindings/power/amlogic,meson-sec-pwrc.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/apple,pmgr-pwrstate.yaml80
-rw-r--r--Documentation/devicetree/bindings/power/avs/qcom,cpr.txt130
-rw-r--r--Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml161
-rw-r--r--Documentation/devicetree/bindings/power/brcm,bcm63xx-power.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/domain-idle-state.yaml12
-rw-r--r--Documentation/devicetree/bindings/power/fsl,imx-gpc.yaml33
-rw-r--r--Documentation/devicetree/bindings/power/fsl,imx-gpcv2.yaml21
-rw-r--r--Documentation/devicetree/bindings/power/fsl,scu-pd.yaml41
-rw-r--r--Documentation/devicetree/bindings/power/mediatek,power-controller.yaml144
-rw-r--r--Documentation/devicetree/bindings/power/power-domain.yaml3
-rw-r--r--Documentation/devicetree/bindings/power/qcom,kpss-acc-v2.yaml42
-rw-r--r--Documentation/devicetree/bindings/power/qcom,rpmpd.yaml17
-rw-r--r--Documentation/devicetree/bindings/power/renesas,apmu.yaml7
-rw-r--r--Documentation/devicetree/bindings/power/renesas,rcar-sysc.yaml21
-rw-r--r--Documentation/devicetree/bindings/power/reset/gpio-poweroff.txt41
-rw-r--r--Documentation/devicetree/bindings/power/reset/gpio-poweroff.yaml59
-rw-r--r--Documentation/devicetree/bindings/power/reset/gpio-restart.txt54
-rw-r--r--Documentation/devicetree/bindings/power/reset/gpio-restart.yaml79
-rw-r--r--Documentation/devicetree/bindings/power/reset/msm-poweroff.txt17
-rw-r--r--Documentation/devicetree/bindings/power/reset/qcom,pon.yaml58
-rw-r--r--Documentation/devicetree/bindings/power/reset/qcom,pshold.yaml35
-rw-r--r--Documentation/devicetree/bindings/power/reset/regulator-poweroff.yaml2
-rw-r--r--Documentation/devicetree/bindings/power/reset/restart-handler.yaml30
-rw-r--r--Documentation/devicetree/bindings/power/reset/syscon-reboot.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/reset/xlnx,zynqmp-power.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/rockchip,power-controller.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/rockchip-io-domain.yaml30
-rw-r--r--Documentation/devicetree/bindings/power/starfive,jh7110-pmu.yaml45
-rw-r--r--Documentation/devicetree/bindings/power/supply/active-semi,act8945a-charger.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/adc-battery.yaml70
-rw-r--r--Documentation/devicetree/bindings/power/supply/battery.yaml7
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq2415x.yaml9
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq24190.yaml10
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq24257.yaml10
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq24735.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq2515x.yaml9
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq256xx.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq25890.yaml24
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq25980.yaml11
-rw-r--r--Documentation/devicetree/bindings/power/supply/bq27xxx.yaml21
-rw-r--r--Documentation/devicetree/bindings/power/supply/charger-manager.yaml2
-rw-r--r--Documentation/devicetree/bindings/power/supply/cpcap-battery.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/cpcap-charger.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/cw2015_battery.yaml7
-rw-r--r--Documentation/devicetree/bindings/power/supply/dlg,da9150-charger.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/dlg,da9150-fuel-gauge.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/ingenic,battery.yaml10
-rw-r--r--Documentation/devicetree/bindings/power/supply/isp1704.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/lego,ev3-battery.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/lltc,lt3651-charger.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/lltc,ltc294x.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/supply/ltc4162-l.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,ds2760.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max14577.yaml84
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max14656.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max17040.yaml14
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max17042.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max77693.yaml70
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max77976.yaml44
-rw-r--r--Documentation/devicetree/bindings/power/supply/maxim,max8903.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/mediatek,mt6370-charger.yaml96
-rw-r--r--Documentation/devicetree/bindings/power/supply/mt6360_charger.yaml2
-rw-r--r--Documentation/devicetree/bindings/power/supply/nokia,n900-battery.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/olpc-battery.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/power-supply.yaml13
-rw-r--r--Documentation/devicetree/bindings/power/supply/qcom,pm8941-charger.yaml13
-rw-r--r--Documentation/devicetree/bindings/power/supply/qcom,pm8941-coincell.yaml20
-rw-r--r--Documentation/devicetree/bindings/power/supply/richtek,rt5033-battery.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/richtek,rt9455.yaml8
-rw-r--r--Documentation/devicetree/bindings/power/supply/richtek,rt9467.yaml82
-rw-r--r--Documentation/devicetree/bindings/power/supply/richtek,rt9471.yaml73
-rw-r--r--Documentation/devicetree/bindings/power/supply/rohm,bd99954.yaml3
-rw-r--r--Documentation/devicetree/bindings/power/supply/samsung,battery.yaml56
-rw-r--r--Documentation/devicetree/bindings/power/supply/sbs,sbs-manager.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/sc2731-charger.yaml7
-rw-r--r--Documentation/devicetree/bindings/power/supply/sc27xx-fg.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/stericsson,ab8500-btemp.yaml14
-rw-r--r--Documentation/devicetree/bindings/power/supply/stericsson,ab8500-chargalg.yaml14
-rw-r--r--Documentation/devicetree/bindings/power/supply/stericsson,ab8500-charger.yaml14
-rw-r--r--Documentation/devicetree/bindings/power/supply/stericsson,ab8500-fg.yaml19
-rw-r--r--Documentation/devicetree/bindings/power/supply/summit,smb347-charger.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/ti,lp8727.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/tps65090-charger.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/tps65217-charger.yaml6
-rw-r--r--Documentation/devicetree/bindings/power/supply/twl4030-charger.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/x-powers,axp20x-ac-power-supply.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/x-powers,axp20x-battery-power-supply.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/supply/x-powers,axp20x-usb-power-supply.yaml4
-rw-r--r--Documentation/devicetree/bindings/power/wakeup-source.txt13
-rw-r--r--Documentation/devicetree/bindings/powerpc/fsl/cache_sram.txt20
-rw-r--r--Documentation/devicetree/bindings/powerpc/fsl/cpus.txt2
-rw-r--r--Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt61
-rw-r--r--Documentation/devicetree/bindings/powerpc/fsl/mpc5200.txt2
-rw-r--r--Documentation/devicetree/bindings/powerpc/nintendo/wii.txt10
-rw-r--r--Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt2
-rw-r--r--Documentation/devicetree/bindings/ptp/ptp-idt82p33.yaml2
-rw-r--r--Documentation/devicetree/bindings/ptp/ptp-idtcm.yaml2
-rw-r--r--Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml51
-rw-r--r--Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml51
-rw-r--r--Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml47
-rw-r--r--Documentation/devicetree/bindings/pwm/atmel-pwm.txt35
-rw-r--r--Documentation/devicetree/bindings/pwm/brcm,bcm7038-pwm.txt20
-rw-r--r--Documentation/devicetree/bindings/pwm/brcm,bcm7038-pwm.yaml42
-rw-r--r--Documentation/devicetree/bindings/pwm/clk-pwm.yaml46
-rw-r--r--Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml15
-rw-r--r--Documentation/devicetree/bindings/pwm/imx-pwm.yaml4
-rw-r--r--Documentation/devicetree/bindings/pwm/imx-tpm-pwm.yaml4
-rw-r--r--Documentation/devicetree/bindings/pwm/intel,keembay-pwm.yaml3
-rw-r--r--Documentation/devicetree/bindings/pwm/intel,lgm-pwm.yaml3
-rw-r--r--Documentation/devicetree/bindings/pwm/iqs620a-pwm.yaml4
-rw-r--r--Documentation/devicetree/bindings/pwm/mediatek,mt2712-pwm.yaml94
-rw-r--r--Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml76
-rw-r--r--Documentation/devicetree/bindings/pwm/microchip,corepwm.yaml83
-rw-r--r--Documentation/devicetree/bindings/pwm/mxs-pwm.yaml4
-rw-r--r--Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt77
-rw-r--r--Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.yaml95
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-amlogic.yaml70
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-mediatek.txt48
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-meson.txt29
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt44
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-omap-dmtimer.txt2
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml76
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-samsung.yaml1
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-sifive.yaml6
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-tiecap.yaml1
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-tiehrpwm.yaml1
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm.yaml2
-rw-r--r--Documentation/devicetree/bindings/pwm/renesas,pwm-rcar.yaml27
-rw-r--r--Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.yaml4
-rw-r--r--Documentation/devicetree/bindings/pwm/snps,dw-apb-timers-pwm2.yaml68
-rw-r--r--Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml42
-rw-r--r--Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml4
-rw-r--r--Documentation/devicetree/bindings/regulator/act8865-regulator.txt117
-rw-r--r--Documentation/devicetree/bindings/regulator/act8945a-regulator.txt113
-rw-r--r--Documentation/devicetree/bindings/regulator/active-semi,act8600.yaml139
-rw-r--r--Documentation/devicetree/bindings/regulator/active-semi,act8846.yaml205
-rw-r--r--Documentation/devicetree/bindings/regulator/active-semi,act8865.yaml158
-rw-r--r--Documentation/devicetree/bindings/regulator/active-semi,act8945a.yaml258
-rw-r--r--Documentation/devicetree/bindings/regulator/anatop-regulator.yaml22
-rw-r--r--Documentation/devicetree/bindings/regulator/dlg,da9121.yaml79
-rw-r--r--Documentation/devicetree/bindings/regulator/fan53555.txt24
-rw-r--r--Documentation/devicetree/bindings/regulator/fcs,fan53555.yaml73
-rw-r--r--Documentation/devicetree/bindings/regulator/fixed-regulator.yaml43
-rw-r--r--Documentation/devicetree/bindings/regulator/google,cros-ec-regulator.yaml5
-rw-r--r--Documentation/devicetree/bindings/regulator/gpio-regulator.yaml5
-rw-r--r--Documentation/devicetree/bindings/regulator/max77650-regulator.yaml3
-rw-r--r--Documentation/devicetree/bindings/regulator/max77686.txt71
-rw-r--r--Documentation/devicetree/bindings/regulator/max77802.txt111
-rw-r--r--Documentation/devicetree/bindings/regulator/max8660.yaml10
-rw-r--r--Documentation/devicetree/bindings/regulator/max8893.yaml2
-rw-r--r--Documentation/devicetree/bindings/regulator/max8952.txt52
-rw-r--r--Documentation/devicetree/bindings/regulator/max8973-regulator.txt52
-rw-r--r--Documentation/devicetree/bindings/regulator/max8997-regulator.txt145
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max14577.yaml78
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max20086.yaml106
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max20411.yaml58
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max77686.yaml83
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max77693.yaml60
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max77802.yaml86
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max77843.yaml65
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max8952.yaml109
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max8973.yaml141
-rw-r--r--Documentation/devicetree/bindings/regulator/maxim,max8997.yaml446
-rw-r--r--Documentation/devicetree/bindings/regulator/mediatek,mt6331-regulator.yaml273
-rw-r--r--Documentation/devicetree/bindings/regulator/mediatek,mt6332-regulator.yaml112
-rw-r--r--Documentation/devicetree/bindings/regulator/mediatek,mt6357-regulator.yaml294
-rw-r--r--Documentation/devicetree/bindings/regulator/mps,mp5416.yaml5
-rw-r--r--Documentation/devicetree/bindings/regulator/mps,mp886x.yaml2
-rw-r--r--Documentation/devicetree/bindings/regulator/mps,mpq7920.yaml6
-rw-r--r--Documentation/devicetree/bindings/regulator/mps,mpq7932.yaml68
-rw-r--r--Documentation/devicetree/bindings/regulator/mt6315-regulator.yaml9
-rw-r--r--Documentation/devicetree/bindings/regulator/mt6358-regulator.txt22
-rw-r--r--Documentation/devicetree/bindings/regulator/mt6359-regulator.yaml16
-rw-r--r--Documentation/devicetree/bindings/regulator/mt6360-regulator.yaml16
-rw-r--r--Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml34
-rw-r--r--Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml23
-rw-r--r--Documentation/devicetree/bindings/regulator/pfuze100.yaml12
-rw-r--r--Documentation/devicetree/bindings/regulator/pwm-regulator.txt92
-rw-r--r--Documentation/devicetree/bindings/regulator/pwm-regulator.yaml126
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,rpm-regulator.yaml128
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml362
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml36
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt346
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.yaml354
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,usb-vbus-regulator.yaml4
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom-labibb-regulator.yaml18
-rw-r--r--Documentation/devicetree/bindings/regulator/raspberrypi,7inch-touchscreen-panel-regulator.yaml2
-rw-r--r--Documentation/devicetree/bindings/regulator/regulator-output.yaml39
-rw-r--r--Documentation/devicetree/bindings/regulator/regulator.yaml25
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt4801-regulator.yaml22
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt4803.yaml68
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt5190a-regulator.yaml141
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt5739.yaml72
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt5759-regulator.yaml90
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt6190.yaml79
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt6245-regulator.yaml8
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rtmv20-regulator.yaml2
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd70528-regulator.txt68
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71815-regulator.yaml11
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71828-regulator.yaml30
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71837-regulator.yaml14
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd71847-regulator.yaml14
-rw-r--r--Documentation/devicetree/bindings/regulator/rohm,bd9576-regulator.yaml6
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mpa01.txt79
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mpa01.yaml62
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mps11.txt102
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mps11.yaml44
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mps13.yaml44
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mps14.yaml61
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mps15.yaml44
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s2mpu02.yaml44
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s5m8767.txt145
-rw-r--r--Documentation/devicetree/bindings/regulator/samsung,s5m8767.yaml100
-rw-r--r--Documentation/devicetree/bindings/regulator/silergy,sy8106a.yaml52
-rw-r--r--Documentation/devicetree/bindings/regulator/siliconmitus,sm5703-regulator.yaml49
-rw-r--r--Documentation/devicetree/bindings/regulator/socionext,uniphier-regulator.yaml80
-rw-r--r--Documentation/devicetree/bindings/regulator/st,stm32-booster.yaml8
-rw-r--r--Documentation/devicetree/bindings/regulator/st,stm32-vrefbuf.yaml7
-rw-r--r--Documentation/devicetree/bindings/regulator/st,stm32mp1-pwr-reg.yaml4
-rw-r--r--Documentation/devicetree/bindings/regulator/sy8106a-regulator.txt23
-rw-r--r--Documentation/devicetree/bindings/regulator/ti,tps62360.yaml98
-rw-r--r--Documentation/devicetree/bindings/regulator/ti,tps62864.yaml63
-rw-r--r--Documentation/devicetree/bindings/regulator/ti,tps65219.yaml173
-rw-r--r--Documentation/devicetree/bindings/regulator/tps62360-regulator.txt44
-rw-r--r--Documentation/devicetree/bindings/regulator/vexpress.txt2
-rw-r--r--Documentation/devicetree/bindings/regulator/vqmmc-ipq4019-regulator.yaml2
-rw-r--r--Documentation/devicetree/bindings/regulator/wlf,arizona.yaml6
-rw-r--r--Documentation/devicetree/bindings/remoteproc/amlogic,meson-mx-ao-arc.yaml87
-rw-r--r--Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml31
-rw-r--r--Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml8
-rw-r--r--Documentation/devicetree/bindings/remoteproc/mtk,scp.txt36
-rw-r--r--Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml124
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml394
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,glink-edge.yaml100
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,glink-rpm-edge.yaml99
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt140
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,msm8916-mss-pil.yaml291
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,msm8996-mss-pil.yaml393
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,pas-common.yaml89
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,pil-info.yaml4
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt179
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,qcs404-cdsp-pil.yaml160
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,qcs404-pas.yaml94
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc7180-mss-pil.yaml247
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc7180-pas.yaml133
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc7280-adsp-pil.yaml195
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc7280-mss-pil.yaml268
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc7280-wpss-pil.yaml207
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc8180x-pas.yaml95
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sc8280xp-pas.yaml147
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sdm845-adsp-pil.yaml159
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sdx55-pas.yaml109
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sm6115-pas.yaml143
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sm6350-pas.yaml167
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sm8150-pas.yaml174
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sm8350-pas.yaml182
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,sm8550-pas.yaml178
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,smd-edge.yaml117
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.txt177
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.yaml294
-rw-r--r--Documentation/devicetree/bindings/remoteproc/renesas,rcar-rproc.yaml65
-rw-r--r--Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml57
-rw-r--r--Documentation/devicetree/bindings/remoteproc/ti,k3-dsp-rproc.yaml20
-rw-r--r--Documentation/devicetree/bindings/remoteproc/ti,k3-r5f-rproc.yaml97
-rw-r--r--Documentation/devicetree/bindings/remoteproc/ti,omap-remoteproc.yaml19
-rw-r--r--Documentation/devicetree/bindings/remoteproc/ti,pru-consumer.yaml60
-rw-r--r--Documentation/devicetree/bindings/remoteproc/ti,pru-rproc.yaml5
-rw-r--r--Documentation/devicetree/bindings/remoteproc/xlnx,zynqmp-r5fss.yaml135
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/framebuffer.yaml52
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/google,open-dice.yaml46
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/memory-region.yaml40
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/nvidia,tegra210-emc-table.yaml31
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/phram.yaml47
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/qcom,cmd-db.txt37
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/qcom,cmd-db.yaml46
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/qcom,rmtfs-mem.txt51
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/qcom,rmtfs-mem.yaml55
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/ramoops.txt66
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/ramoops.yaml144
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt172
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/reserved-memory.yaml181
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/shared-dma-pool.yaml97
-rw-r--r--Documentation/devicetree/bindings/reset/allwinner,sun6i-a31-clock-reset.yaml2
-rw-r--r--Documentation/devicetree/bindings/reset/altr,rst-mgr.yaml47
-rw-r--r--Documentation/devicetree/bindings/reset/amlogic,meson-axg-audio-arb.txt22
-rw-r--r--Documentation/devicetree/bindings/reset/amlogic,meson-axg-audio-arb.yaml56
-rw-r--r--Documentation/devicetree/bindings/reset/amlogic,meson-reset.yaml7
-rw-r--r--Documentation/devicetree/bindings/reset/ath79-reset.txt20
-rw-r--r--Documentation/devicetree/bindings/reset/atmel,at91sam9260-reset.yaml68
-rw-r--r--Documentation/devicetree/bindings/reset/berlin,reset.txt23
-rw-r--r--Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt18
-rw-r--r--Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.yaml36
-rw-r--r--Documentation/devicetree/bindings/reset/brcm,bcm6345-reset.yaml4
-rw-r--r--Documentation/devicetree/bindings/reset/brcm,bcm7216-pcie-sata-rescal.yaml4
-rw-r--r--Documentation/devicetree/bindings/reset/brcm,brcmstb-reset.txt27
-rw-r--r--Documentation/devicetree/bindings/reset/brcm,brcmstb-reset.yaml48
-rw-r--r--Documentation/devicetree/bindings/reset/canaan,k210-rst.yaml2
-rw-r--r--Documentation/devicetree/bindings/reset/delta,tn48m-reset.yaml35
-rw-r--r--Documentation/devicetree/bindings/reset/hisilicon,hi3660-reset.yaml5
-rw-r--r--Documentation/devicetree/bindings/reset/hisilicon,hi6220-reset.txt37
-rw-r--r--Documentation/devicetree/bindings/reset/lantiq,reset.txt30
-rw-r--r--Documentation/devicetree/bindings/reset/lantiq,reset.yaml49
-rw-r--r--Documentation/devicetree/bindings/reset/marvell,berlin2-reset.yaml38
-rw-r--r--Documentation/devicetree/bindings/reset/microchip,rst.yaml11
-rw-r--r--Documentation/devicetree/bindings/reset/nuvoton,npcm-reset.txt32
-rw-r--r--Documentation/devicetree/bindings/reset/nuvoton,npcm750-reset.yaml58
-rw-r--r--Documentation/devicetree/bindings/reset/qca,ar7100-reset.yaml40
-rw-r--r--Documentation/devicetree/bindings/reset/qcom,aoss-reset.yaml2
-rw-r--r--Documentation/devicetree/bindings/reset/qcom,pdc-global.yaml2
-rw-r--r--Documentation/devicetree/bindings/reset/renesas,rst.yaml6
-rw-r--r--Documentation/devicetree/bindings/reset/renesas,rzg2l-usbphy-ctrl.yaml4
-rw-r--r--Documentation/devicetree/bindings/reset/snps,axs10x-reset.txt33
-rw-r--r--Documentation/devicetree/bindings/reset/snps,axs10x-reset.yaml48
-rw-r--r--Documentation/devicetree/bindings/reset/socfpga-reset.txt16
-rw-r--r--Documentation/devicetree/bindings/reset/socionext,uniphier-glue-reset.yaml76
-rw-r--r--Documentation/devicetree/bindings/reset/socionext,uniphier-reset.yaml58
-rw-r--r--Documentation/devicetree/bindings/reset/st,sti-picophyreset.txt42
-rw-r--r--Documentation/devicetree/bindings/reset/st,sti-powerdown.txt45
-rw-r--r--Documentation/devicetree/bindings/reset/st,stih407-picophyreset.yaml47
-rw-r--r--Documentation/devicetree/bindings/reset/st,stih407-powerdown.yaml49
-rw-r--r--Documentation/devicetree/bindings/reset/starfive,jh7100-reset.yaml38
-rw-r--r--Documentation/devicetree/bindings/reset/sunplus,reset.yaml38
-rw-r--r--Documentation/devicetree/bindings/reset/ti,sci-reset.yaml2
-rw-r--r--Documentation/devicetree/bindings/reset/ti,tps380x-reset.yaml49
-rw-r--r--Documentation/devicetree/bindings/riscv/cpus.yaml60
-rw-r--r--Documentation/devicetree/bindings/riscv/microchip.yaml25
-rw-r--r--Documentation/devicetree/bindings/riscv/sifive-l2-cache.yaml124
-rw-r--r--Documentation/devicetree/bindings/riscv/starfive.yaml10
-rw-r--r--Documentation/devicetree/bindings/riscv/sunxi.yaml74
-rw-r--r--Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml6
-rw-r--r--Documentation/devicetree/bindings/rng/apm,rng.txt17
-rw-r--r--Documentation/devicetree/bindings/rng/apm,x-gene-rng.yaml47
-rw-r--r--Documentation/devicetree/bindings/rng/atmel,at91-trng.yaml51
-rw-r--r--Documentation/devicetree/bindings/rng/atmel-trng.txt16
-rw-r--r--Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt16
-rw-r--r--Documentation/devicetree/bindings/rng/brcm,iproc-rng200.yaml30
-rw-r--r--Documentation/devicetree/bindings/rng/ingenic,rng.yaml2
-rw-r--r--Documentation/devicetree/bindings/rng/ingenic,trng.yaml4
-rw-r--r--Documentation/devicetree/bindings/rng/intel,ixp46x-rng.yaml3
-rw-r--r--Documentation/devicetree/bindings/rng/ks-sa-rng.txt21
-rw-r--r--Documentation/devicetree/bindings/rng/mtk-rng.yaml4
-rw-r--r--Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.txt12
-rw-r--r--Documentation/devicetree/bindings/rng/nuvoton,npcm-rng.yaml37
-rw-r--r--Documentation/devicetree/bindings/rng/omap3_rom_rng.txt27
-rw-r--r--Documentation/devicetree/bindings/rng/omap_rng.txt38
-rw-r--r--Documentation/devicetree/bindings/rng/omap_rng.yaml81
-rw-r--r--Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml2
-rw-r--r--Documentation/devicetree/bindings/rng/silex-insight,ba431-rng.yaml2
-rw-r--r--Documentation/devicetree/bindings/rng/st,rng.txt15
-rw-r--r--Documentation/devicetree/bindings/rng/st,rng.yaml35
-rw-r--r--Documentation/devicetree/bindings/rng/st,stm32-rng.yaml5
-rw-r--r--Documentation/devicetree/bindings/rng/starfive,jh7110-trng.yaml55
-rw-r--r--Documentation/devicetree/bindings/rng/ti,keystone-rng.yaml50
-rw-r--r--Documentation/devicetree/bindings/rng/ti,omap-rom-rng.yaml41
-rw-r--r--Documentation/devicetree/bindings/rng/timeriomem_rng.txt25
-rw-r--r--Documentation/devicetree/bindings/rng/timeriomem_rng.yaml48
-rw-r--r--Documentation/devicetree/bindings/rng/xiphera,xip8001b-trng.yaml2
-rw-r--r--Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml4
-rw-r--r--Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml88
-rw-r--r--Documentation/devicetree/bindings/rtc/amlogic,meson-vrtc.yaml44
-rw-r--r--Documentation/devicetree/bindings/rtc/amlogic,meson6-rtc.yaml62
-rw-r--r--Documentation/devicetree/bindings/rtc/atmel,at91rm9200-rtc.yaml4
-rw-r--r--Documentation/devicetree/bindings/rtc/atmel,at91sam9-rtc.txt25
-rw-r--r--Documentation/devicetree/bindings/rtc/atmel,at91sam9260-rtt.yaml69
-rw-r--r--Documentation/devicetree/bindings/rtc/brcm,brcmstb-waketimer.txt20
-rw-r--r--Documentation/devicetree/bindings/rtc/brcm,brcmstb-waketimer.yaml51
-rw-r--r--Documentation/devicetree/bindings/rtc/epson,rx8900.yaml3
-rw-r--r--Documentation/devicetree/bindings/rtc/faraday,ftrtc010.yaml4
-rw-r--r--Documentation/devicetree/bindings/rtc/fsl,scu-rtc.yaml31
-rw-r--r--Documentation/devicetree/bindings/rtc/haoyu,hym8563.txt30
-rw-r--r--Documentation/devicetree/bindings/rtc/haoyu,hym8563.yaml56
-rw-r--r--Documentation/devicetree/bindings/rtc/ingenic,rtc.yaml33
-rw-r--r--Documentation/devicetree/bindings/rtc/microchip,mfps-rtc.yaml67
-rw-r--r--Documentation/devicetree/bindings/rtc/microcrystal,rv3028.yaml54
-rw-r--r--Documentation/devicetree/bindings/rtc/microcrystal,rv3032.yaml5
-rw-r--r--Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt12
-rw-r--r--Documentation/devicetree/bindings/rtc/mstar,msc313-rtc.yaml49
-rw-r--r--Documentation/devicetree/bindings/rtc/nuvoton,nct3018y.yaml45
-rw-r--r--Documentation/devicetree/bindings/rtc/nvidia,tegra20-rtc.txt24
-rw-r--r--Documentation/devicetree/bindings/rtc/nvidia,tegra20-rtc.yaml61
-rw-r--r--Documentation/devicetree/bindings/rtc/nxp,pcf2127.yaml7
-rw-r--r--Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt22
-rw-r--r--Documentation/devicetree/bindings/rtc/nxp,pcf85063.yaml92
-rw-r--r--Documentation/devicetree/bindings/rtc/nxp,pcf85363.yaml60
-rw-r--r--Documentation/devicetree/bindings/rtc/nxp,pcf8563.yaml2
-rw-r--r--Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml41
-rw-r--r--Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml70
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-ds1307.txt52
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-ds1307.yaml102
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-m41t80.txt39
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-meson-vrtc.txt22
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-meson.txt35
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-mt6397.txt2
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-mxc.yaml2
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc-mxc_v2.yaml2
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc.txt1
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc.yaml2
-rw-r--r--Documentation/devicetree/bindings/rtc/sa1100-rtc.yaml6
-rw-r--r--Documentation/devicetree/bindings/rtc/snvs-rtc.txt1
-rw-r--r--Documentation/devicetree/bindings/rtc/st,m41t80.yaml73
-rw-r--r--Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml7
-rw-r--r--Documentation/devicetree/bindings/rtc/sunplus,sp7021-rtc.yaml56
-rw-r--r--Documentation/devicetree/bindings/rtc/ti,k3-rtc.yaml62
-rw-r--r--Documentation/devicetree/bindings/rtc/trivial-rtc.yaml10
-rw-r--r--Documentation/devicetree/bindings/rtc/xlnx,zynqmp-rtc.yaml12
-rw-r--r--Documentation/devicetree/bindings/serial/8250.yaml16
-rw-r--r--Documentation/devicetree/bindings/serial/8250_omap.yaml27
-rw-r--r--Documentation/devicetree/bindings/serial/amlogic,meson-uart.yaml32
-rw-r--r--Documentation/devicetree/bindings/serial/atmel,at91-usart.yaml190
-rw-r--r--Documentation/devicetree/bindings/serial/brcm,bcm6345-uart.txt36
-rw-r--r--Documentation/devicetree/bindings/serial/brcm,bcm6345-uart.yaml47
-rw-r--r--Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml2
-rw-r--r--Documentation/devicetree/bindings/serial/cdns,uart.yaml29
-rw-r--r--Documentation/devicetree/bindings/serial/efm32-uart.txt20
-rw-r--r--Documentation/devicetree/bindings/serial/fsl,s32-linflexuart.txt22
-rw-r--r--Documentation/devicetree/bindings/serial/fsl,s32-linflexuart.yaml48
-rw-r--r--Documentation/devicetree/bindings/serial/fsl-imx-uart.yaml38
-rw-r--r--Documentation/devicetree/bindings/serial/fsl-lpuart.yaml22
-rw-r--r--Documentation/devicetree/bindings/serial/fsl-mxs-auart.yaml2
-rw-r--r--Documentation/devicetree/bindings/serial/ingenic,uart.yaml8
-rw-r--r--Documentation/devicetree/bindings/serial/mediatek,uart.yaml122
-rw-r--r--Documentation/devicetree/bindings/serial/mtk-uart.txt58
-rw-r--r--Documentation/devicetree/bindings/serial/mvebu-uart.txt9
-rw-r--r--Documentation/devicetree/bindings/serial/nvidia,tegra194-tcu.txt35
-rw-r--r--Documentation/devicetree/bindings/serial/nvidia,tegra194-tcu.yaml61
-rw-r--r--Documentation/devicetree/bindings/serial/pl011.yaml10
-rw-r--r--Documentation/devicetree/bindings/serial/qcom,msm-uart.txt25
-rw-r--r--Documentation/devicetree/bindings/serial/qcom,msm-uart.yaml56
-rw-r--r--Documentation/devicetree/bindings/serial/qcom,msm-uartdm.txt81
-rw-r--r--Documentation/devicetree/bindings/serial/qcom,msm-uartdm.yaml112
-rw-r--r--Documentation/devicetree/bindings/serial/qcom,serial-geni-qcom.yaml86
-rw-r--r--Documentation/devicetree/bindings/serial/rda,8810pl-uart.txt17
-rw-r--r--Documentation/devicetree/bindings/serial/rda,8810pl-uart.yaml46
-rw-r--r--Documentation/devicetree/bindings/serial/renesas,em-uart.yaml51
-rw-r--r--Documentation/devicetree/bindings/serial/renesas,hscif.yaml40
-rw-r--r--Documentation/devicetree/bindings/serial/renesas,sci.yaml58
-rw-r--r--Documentation/devicetree/bindings/serial/renesas,scif.yaml49
-rw-r--r--Documentation/devicetree/bindings/serial/renesas,scifa.yaml26
-rw-r--r--Documentation/devicetree/bindings/serial/renesas,scifb.yaml16
-rw-r--r--Documentation/devicetree/bindings/serial/rs485.yaml17
-rw-r--r--Documentation/devicetree/bindings/serial/samsung_uart.yaml18
-rw-r--r--Documentation/devicetree/bindings/serial/serial.yaml26
-rw-r--r--Documentation/devicetree/bindings/serial/sifive-serial.yaml8
-rw-r--r--Documentation/devicetree/bindings/serial/snps-dw-apb-uart.yaml16
-rw-r--r--Documentation/devicetree/bindings/serial/socionext,uniphier-uart.yaml5
-rw-r--r--Documentation/devicetree/bindings/serial/sprd-uart.yaml7
-rw-r--r--Documentation/devicetree/bindings/serial/st,stm32-uart.yaml11
-rw-r--r--Documentation/devicetree/bindings/serial/sunplus,sp7021-uart.yaml56
-rw-r--r--Documentation/devicetree/bindings/serial/xlnx,opb-uartlite.txt23
-rw-r--r--Documentation/devicetree/bindings/serial/xlnx,opb-uartlite.yaml88
-rw-r--r--Documentation/devicetree/bindings/serio/allwinner,sun4i-a10-ps2.yaml2
-rw-r--r--Documentation/devicetree/bindings/serio/arm,pl050.yaml67
-rw-r--r--Documentation/devicetree/bindings/serio/ps2-gpio.txt23
-rw-r--r--Documentation/devicetree/bindings/serio/ps2-gpio.yaml64
-rw-r--r--Documentation/devicetree/bindings/slimbus/bus.txt60
-rw-r--r--Documentation/devicetree/bindings/slimbus/qcom,slim-ngd.yaml120
-rw-r--r--Documentation/devicetree/bindings/slimbus/qcom,slim.yaml86
-rw-r--r--Documentation/devicetree/bindings/slimbus/slim-ngd-qcom-ctrl.txt84
-rw-r--r--Documentation/devicetree/bindings/slimbus/slim-qcom-ctrl.txt39
-rw-r--r--Documentation/devicetree/bindings/slimbus/slimbus.yaml95
-rw-r--r--Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.yaml7
-rw-r--r--Documentation/devicetree/bindings/soc/amlogic/amlogic,meson-gx-clk-measure.yaml40
-rw-r--r--Documentation/devicetree/bindings/soc/amlogic/clk-measure.txt21
-rw-r--r--Documentation/devicetree/bindings/soc/aspeed/uart-routing.yaml56
-rw-r--r--Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-pm.txt46
-rw-r--r--Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-pm.yaml86
-rw-r--r--Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.txt17
-rw-r--r--Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.yaml53
-rw-r--r--Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-scc-qmc.yaml162
-rw-r--r--Documentation/devicetree/bindings/soc/fsl/cpm_qe/fsl,cpm1-tsa.yaml205
-rw-r--r--Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-dcfg.yaml68
-rw-r--r--Documentation/devicetree/bindings/soc/fsl/fsl,layerscape-scfg.yaml58
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx-iomuxc-gpr.yaml57
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mm-disp-blk-ctrl.yaml94
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mm-vpu-blk-ctrl.yaml164
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mn-disp-blk-ctrl.yaml97
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mp-hdmi-blk-ctrl.yaml93
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mp-hsio-blk-ctrl.yaml92
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mp-media-blk-ctrl.yaml169
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx8mq-vpu-blk-ctrl.yaml71
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx93-media-blk-ctrl.yaml80
-rw-r--r--Documentation/devicetree/bindings/soc/imx/fsl,imx93-src.yaml97
-rw-r--r--Documentation/devicetree/bindings/soc/intel/intel,hps-copy-engine.yaml51
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/devapc.yaml5
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/mediatek,ccorr.yaml68
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/mediatek,mt7986-wo-ccif.yaml51
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/mediatek,mutex.yaml122
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/mediatek,pwrap.yaml147
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/mediatek,wdma.yaml81
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/mtk-svs.yaml91
-rw-r--r--Documentation/devicetree/bindings/soc/mediatek/pwrap.txt70
-rw-r--r--Documentation/devicetree/bindings/soc/microchip/atmel,at91rm9200-tcb.yaml7
-rw-r--r--Documentation/devicetree/bindings/soc/microchip/microchip,mpfs-sys-controller.yaml40
-rw-r--r--Documentation/devicetree/bindings/soc/microchip/microchip,polarfire-soc-sys-controller.yaml35
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.yaml20
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,apr-services.yaml53
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt134
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,apr.yaml211
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,dcc.yaml44
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,eud.yaml77
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml150
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt94
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,gsbi.txt87
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,gsbi.yaml132
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,msm8976-ramp-controller.yaml36
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,pmic-glink.yaml97
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,rpm.yaml101
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,rpmh-rsc.yaml267
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smd-rpm.yaml84
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smd.txt98
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smd.yaml62
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smem.yaml40
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smp2p.txt110
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smp2p.yaml145
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smsm.txt104
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,smsm.yaml138
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,spm.yaml85
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,wcnss.txt131
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,wcnss.yaml134
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom-stats.yaml53
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/rpmh-rsc.txt137
-rw-r--r--Documentation/devicetree/bindings/soc/renesas/renesas,r9a09g011-sys.yaml43
-rw-r--r--Documentation/devicetree/bindings/soc/renesas/renesas,rzg2l-sysc.yaml (renamed from Documentation/devicetree/bindings/power/renesas,rzg2l-sysc.yaml)12
-rw-r--r--Documentation/devicetree/bindings/soc/renesas/renesas,rzv2m-pwc.yaml56
-rw-r--r--Documentation/devicetree/bindings/soc/renesas/renesas.yaml479
-rw-r--r--Documentation/devicetree/bindings/soc/rockchip/grf.yaml33
-rw-r--r--Documentation/devicetree/bindings/soc/samsung/exynos-pmu.yaml195
-rw-r--r--Documentation/devicetree/bindings/soc/samsung/exynos-usi.yaml167
-rw-r--r--Documentation/devicetree/bindings/soc/samsung/samsung,exynos-sysreg.yaml87
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-adamv.yaml50
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-ahci-glue.yaml77
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-dwc3-glue.yaml106
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-mioctrl.yaml65
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-perictrl.yaml64
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-sdctrl.yaml61
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-soc-glue-debug.yaml68
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-soc-glue.yaml114
-rw-r--r--Documentation/devicetree/bindings/soc/socionext/socionext,uniphier-sysctrl.yaml104
-rw-r--r--Documentation/devicetree/bindings/soc/ti/k3-ringacc.yaml13
-rw-r--r--Documentation/devicetree/bindings/soc/ti/k3-socinfo.yaml40
-rw-r--r--Documentation/devicetree/bindings/soc/ti/sci-pm-domain.yaml2
-rw-r--r--Documentation/devicetree/bindings/soc/ti/ti,pruss.yaml23
-rw-r--r--Documentation/devicetree/bindings/soc/ti/wkup-m3-ipc.yaml175
-rw-r--r--Documentation/devicetree/bindings/soc/ti/wkup_m3_ipc.txt57
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau1372.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau17x1.txt32
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau17x1.yaml52
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau1977.yaml8
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau7002.txt19
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau7002.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/adi,adau7118.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/adi,max98363.yaml60
-rw-r--r--Documentation/devicetree/bindings/sound/adi,max98396.yaml141
-rw-r--r--Documentation/devicetree/bindings/sound/ak4375.yaml60
-rw-r--r--Documentation/devicetree/bindings/sound/ak4458.txt28
-rw-r--r--Documentation/devicetree/bindings/sound/ak4613.yaml14
-rw-r--r--Documentation/devicetree/bindings/sound/ak4642.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/ak5558.txt24
-rw-r--r--Documentation/devicetree/bindings/sound/alc5632.txt43
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-codec.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml14
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun50i-a64-codec-analog.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun50i-h6-dmic.yaml87
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun8i-a23-codec-analog.yaml2
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun8i-a33-codec.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,aiu.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-fifo.txt34
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-fifo.yaml112
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-pdm.txt29
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-pdm.yaml82
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-sound-card.txt124
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-sound-card.yaml183
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-spdifin.txt27
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-spdifin.yaml86
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-spdifout.txt25
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-spdifout.yaml79
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt36
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.yaml88
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-tdm-iface.txt22
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-tdm-iface.yaml55
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,g12a-toacodec.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,gx-sound-card.yaml9
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,t9015.yaml11
-rw-r--r--Documentation/devicetree/bindings/sound/apple,mca.yaml135
-rw-r--r--Documentation/devicetree/bindings/sound/arm,pl041.yaml62
-rw-r--r--Documentation/devicetree/bindings/sound/arndale.txt25
-rw-r--r--Documentation/devicetree/bindings/sound/asahi-kasei,ak4458.yaml73
-rw-r--r--Documentation/devicetree/bindings/sound/asahi-kasei,ak5558.yaml48
-rw-r--r--Documentation/devicetree/bindings/sound/atmel,sama5d2-classd.yaml100
-rw-r--r--Documentation/devicetree/bindings/sound/atmel,sama5d2-i2s.yaml85
-rw-r--r--Documentation/devicetree/bindings/sound/atmel,sama5d2-pdmic.yaml98
-rw-r--r--Documentation/devicetree/bindings/sound/atmel-classd.txt55
-rw-r--r--Documentation/devicetree/bindings/sound/atmel-i2s.txt46
-rw-r--r--Documentation/devicetree/bindings/sound/atmel-pdmic.txt55
-rw-r--r--Documentation/devicetree/bindings/sound/atmel-sam9x5-wm8731-audio.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/audio-graph-card.yaml2
-rw-r--r--Documentation/devicetree/bindings/sound/audio-graph-card2.yaml60
-rw-r--r--Documentation/devicetree/bindings/sound/audio-graph-port.yaml103
-rw-r--r--Documentation/devicetree/bindings/sound/audio-graph.yaml13
-rw-r--r--Documentation/devicetree/bindings/sound/awinic,aw8738.yaml54
-rw-r--r--Documentation/devicetree/bindings/sound/awinic,aw88395.yaml53
-rw-r--r--Documentation/devicetree/bindings/sound/bt-sco.txt13
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,cs35l41.yaml209
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,cs35l45.yaml156
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,cs42l42.yaml226
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,cs42l51.yaml9
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,ep9301-i2s.yaml66
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,lochnagar.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/cirrus,madera.yaml3
-rw-r--r--Documentation/devicetree/bindings/sound/component-common.yaml21
-rw-r--r--Documentation/devicetree/bindings/sound/cs42l42.txt114
-rw-r--r--Documentation/devicetree/bindings/sound/da9055.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/dai-common.yaml18
-rw-r--r--Documentation/devicetree/bindings/sound/dai-params.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt86
-rw-r--r--Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml202
-rw-r--r--Documentation/devicetree/bindings/sound/designware-i2s.txt35
-rw-r--r--Documentation/devicetree/bindings/sound/dmic-codec.yaml55
-rw-r--r--Documentation/devicetree/bindings/sound/dmic.txt22
-rw-r--r--Documentation/devicetree/bindings/sound/everest,es8316.yaml11
-rw-r--r--Documentation/devicetree/bindings/sound/everest,es8326.yaml116
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,micfil.txt32
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,micfil.yaml86
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,mqs.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,qmc-audio.yaml117
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,rpmsg.yaml36
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,sai.yaml203
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,spdif.yaml4
-rw-r--r--Documentation/devicetree/bindings/sound/fsl,xcvr.yaml1
-rw-r--r--Documentation/devicetree/bindings/sound/fsl-asoc-card.txt3
-rw-r--r--Documentation/devicetree/bindings/sound/fsl-sai.txt84
-rw-r--r--Documentation/devicetree/bindings/sound/google,cros-ec-codec.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/google,sc7180-trogdor.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/google,sc7280-herobrine.yaml192
-rw-r--r--Documentation/devicetree/bindings/sound/imx-audio-card.yaml9
-rw-r--r--Documentation/devicetree/bindings/sound/imx-audio-hdmi.yaml3
-rw-r--r--Documentation/devicetree/bindings/sound/infineon,peb2466.yaml91
-rw-r--r--Documentation/devicetree/bindings/sound/ingenic,aic.yaml19
-rw-r--r--Documentation/devicetree/bindings/sound/ingenic,codec.yaml9
-rw-r--r--Documentation/devicetree/bindings/sound/intel,keembay-i2s.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/irondevice,sma1303.yaml48
-rw-r--r--Documentation/devicetree/bindings/sound/linux,bt-sco.yaml41
-rw-r--r--Documentation/devicetree/bindings/sound/linux,spdif-dit.yaml37
-rw-r--r--Documentation/devicetree/bindings/sound/marvell,mmp-sspa.yaml8
-rw-r--r--Documentation/devicetree/bindings/sound/max98090.txt59
-rw-r--r--Documentation/devicetree/bindings/sound/max98095.txt22
-rw-r--r--Documentation/devicetree/bindings/sound/max98357a.txt28
-rw-r--r--Documentation/devicetree/bindings/sound/max98371.txt17
-rw-r--r--Documentation/devicetree/bindings/sound/max98504.txt44
-rw-r--r--Documentation/devicetree/bindings/sound/max9867.txt17
-rw-r--r--Documentation/devicetree/bindings/sound/max9892x.txt3
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max9759.txt18
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max9759.yaml45
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98090.yaml84
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98095.yaml54
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98357a.yaml52
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98371.yaml42
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98390.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98504.yaml86
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max98520.yaml35
-rw-r--r--Documentation/devicetree/bindings/sound/maxim,max9867.yaml60
-rw-r--r--Documentation/devicetree/bindings/sound/mchp,i2s-mcc.yaml108
-rw-r--r--Documentation/devicetree/bindings/sound/mchp,spdifrx.yaml73
-rw-r--r--Documentation/devicetree/bindings/sound/mchp,spdiftx.yaml75
-rw-r--r--Documentation/devicetree/bindings/sound/mediatek,mt8188-afe.yaml208
-rw-r--r--Documentation/devicetree/bindings/sound/mediatek,mt8188-mt6359.yaml97
-rw-r--r--Documentation/devicetree/bindings/sound/microchip,sama7g5-i2smcc.yaml110
-rw-r--r--Documentation/devicetree/bindings/sound/microchip,sama7g5-pdmc.yaml109
-rw-r--r--Documentation/devicetree/bindings/sound/microchip,sama7g5-spdifrx.yaml73
-rw-r--r--Documentation/devicetree/bindings/sound/microchip,sama7g5-spdiftx.yaml78
-rw-r--r--Documentation/devicetree/bindings/sound/mt6358.txt4
-rw-r--r--Documentation/devicetree/bindings/sound/mt6359.yaml2
-rw-r--r--Documentation/devicetree/bindings/sound/mt8186-afe-pcm.yaml175
-rw-r--r--Documentation/devicetree/bindings/sound/mt8186-mt6366-da7219-max98357.yaml85
-rw-r--r--Documentation/devicetree/bindings/sound/mt8186-mt6366-rt1019-rt5682s.yaml98
-rw-r--r--Documentation/devicetree/bindings/sound/mt8192-afe-pcm.yaml100
-rw-r--r--Documentation/devicetree/bindings/sound/mt8192-mt6359-rt1015-rt5682.yaml39
-rw-r--r--Documentation/devicetree/bindings/sound/mt8195-afe-pcm.yaml20
-rw-r--r--Documentation/devicetree/bindings/sound/mt8195-mt6359-rt1019-rt5682.yaml47
-rw-r--r--Documentation/devicetree/bindings/sound/mt8195-mt6359.yaml64
-rw-r--r--Documentation/devicetree/bindings/sound/mvebu-audio.txt14
-rw-r--r--Documentation/devicetree/bindings/sound/name-prefix.txt24
-rw-r--r--Documentation/devicetree/bindings/sound/nau8315.txt6
-rw-r--r--Documentation/devicetree/bindings/sound/nau8821.txt55
-rw-r--r--Documentation/devicetree/bindings/sound/nau8822.txt16
-rw-r--r--Documentation/devicetree/bindings/sound/nau8825.txt6
-rw-r--r--Documentation/devicetree/bindings/sound/nuvoton,nau8822.yaml46
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-alc5632.txt48
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-alc5632.yaml74
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-common.yaml87
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-graph-card.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max9808x.yaml90
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max98090.txt53
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-max98090.yaml97
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-rt5631.yaml85
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-rt5640.txt52
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-rt5640.yaml84
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-rt5677.txt67
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-rt5677.yaml100
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-sgtl5000.txt42
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-sgtl5000.yaml67
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-trimslice.txt21
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-trimslice.yaml33
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-wm8753.txt40
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-wm8753.yaml79
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-wm8903.txt62
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-wm8903.yaml93
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-wm9712.txt60
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra-audio-wm9712.yaml76
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra186-asrc.yaml81
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra186-dspk.yaml15
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra20-i2s.txt30
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra20-i2s.yaml77
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra20-spdif.yaml88
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-admaif.yaml6
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-adx.yaml77
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-ahub.yaml35
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-amx.yaml79
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-dmic.yaml12
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-i2s.yaml12
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-mbdrc.yaml47
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-mixer.yaml75
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-mvc.yaml77
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-ope.yaml87
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-peq.yaml48
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra210-sfc.yaml74
-rw-r--r--Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/nxp,tfa989x.yaml51
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,apq8016-sbc.txt96
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,lpass-cpu.yaml139
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,lpass-rx-macro.yaml88
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,lpass-tx-macro.yaml88
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,lpass-va-macro.yaml101
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,lpass-wsa-macro.yaml87
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6adm-routing.yaml39
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6adm.txt39
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6adm.yaml51
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6afe.txt201
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6afe.yaml68
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6apm-dai.yaml34
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6apm-lpass-dais.yaml35
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6apm.yaml68
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6asm-dais.yaml96
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6asm.txt70
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6asm.yaml68
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6core.txt21
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6core.yaml39
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6dsp-lpass-clocks.yaml41
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6dsp-lpass-ports.yaml164
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,q6prm.yaml50
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,sdm845.txt91
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,sm8250.yaml183
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wcd9335.txt123
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wcd9335.yaml156
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wcd934x.yaml68
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wcd938x-sdw.yaml2
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wcd938x.yaml14
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wsa881x.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/qcom,wsa883x.yaml81
-rw-r--r--Documentation/devicetree/bindings/sound/realtek,alc5632.yaml63
-rw-r--r--Documentation/devicetree/bindings/sound/realtek,rt1015p.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/realtek,rt5682s.yaml150
-rw-r--r--Documentation/devicetree/bindings/sound/renesas,fsi.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/renesas,idt821034.yaml75
-rw-r--r--Documentation/devicetree/bindings/sound/renesas,rsnd.yaml225
-rw-r--r--Documentation/devicetree/bindings/sound/renesas,rz-ssi.yaml32
-rw-r--r--Documentation/devicetree/bindings/sound/richtek,rt9120.yaml62
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip,i2s-tdm.yaml192
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip,pdm.txt46
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip,pdm.yaml123
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip,rk3328-codec.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip-i2s.yaml17
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip-spdif.yaml23
-rw-r--r--Documentation/devicetree/bindings/sound/rohm,bd28623.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/rt5640.txt3
-rw-r--r--Documentation/devicetree/bindings/sound/rt5659.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/rt5682.txt22
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,aries-wm8994.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,arndale.yaml45
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,midas-audio.yaml4
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,odroid.yaml13
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,smdk-wm8994.txt14
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,smdk5250.yaml38
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,snow.yaml76
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,tm2-audio.txt42
-rw-r--r--Documentation/devicetree/bindings/sound/samsung,tm2.yaml80
-rw-r--r--Documentation/devicetree/bindings/sound/samsung-i2s.yaml22
-rw-r--r--Documentation/devicetree/bindings/sound/serial-midi.yaml51
-rw-r--r--Documentation/devicetree/bindings/sound/sgtl5000.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/simple-amplifier.txt17
-rw-r--r--Documentation/devicetree/bindings/sound/simple-audio-amplifier.yaml45
-rw-r--r--Documentation/devicetree/bindings/sound/simple-audio-mux.yaml9
-rw-r--r--Documentation/devicetree/bindings/sound/simple-card.yaml99
-rw-r--r--Documentation/devicetree/bindings/sound/snow.txt31
-rw-r--r--Documentation/devicetree/bindings/sound/snps,designware-i2s.yaml94
-rw-r--r--Documentation/devicetree/bindings/sound/socionext,uniphier-aio.yaml29
-rw-r--r--Documentation/devicetree/bindings/sound/socionext,uniphier-evea.yaml11
-rw-r--r--Documentation/devicetree/bindings/sound/sound-dai.yaml20
-rw-r--r--Documentation/devicetree/bindings/sound/spdif-transmitter.txt10
-rw-r--r--Documentation/devicetree/bindings/sound/st,stm32-i2s.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/st,stm32-sai.yaml24
-rw-r--r--Documentation/devicetree/bindings/sound/st,stm32-spdifrx.yaml7
-rw-r--r--Documentation/devicetree/bindings/sound/tas2562.yaml10
-rw-r--r--Documentation/devicetree/bindings/sound/tas2764.yaml76
-rw-r--r--Documentation/devicetree/bindings/sound/tas2770.yaml8
-rw-r--r--Documentation/devicetree/bindings/sound/tas27xx.yaml80
-rw-r--r--Documentation/devicetree/bindings/sound/tas571x.txt1
-rw-r--r--Documentation/devicetree/bindings/sound/tas5720.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/tas5805m.yaml56
-rw-r--r--Documentation/devicetree/bindings/sound/test-component.yaml33
-rw-r--r--Documentation/devicetree/bindings/sound/ti,j721e-cpb-audio.yaml2
-rw-r--r--Documentation/devicetree/bindings/sound/ti,pcm3168a.txt56
-rw-r--r--Documentation/devicetree/bindings/sound/ti,pcm3168a.yaml107
-rw-r--r--Documentation/devicetree/bindings/sound/ti,src4xxx.yaml48
-rw-r--r--Documentation/devicetree/bindings/sound/ti,tlv320adc3xxx.yaml140
-rw-r--r--Documentation/devicetree/bindings/sound/ti,tlv320aic3x.yaml165
-rw-r--r--Documentation/devicetree/bindings/sound/ti,ts3a227e.yaml94
-rw-r--r--Documentation/devicetree/bindings/sound/tlv320adcx140.yaml85
-rw-r--r--Documentation/devicetree/bindings/sound/tlv320aic31xx.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/tlv320aic3x.txt97
-rw-r--r--Documentation/devicetree/bindings/sound/ts3a227e.txt30
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,arizona.yaml3
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8510.yaml41
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8523.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8524.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8580.yaml42
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8711.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8728.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8731.yaml99
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8737.yaml40
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8753.yaml62
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8903.yaml116
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8940.yaml60
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8960.yaml88
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8961.yaml43
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8962.yaml124
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8978.yaml61
-rw-r--r--Documentation/devicetree/bindings/sound/wlf,wm8994.yaml194
-rw-r--r--Documentation/devicetree/bindings/sound/wm8510.txt18
-rw-r--r--Documentation/devicetree/bindings/sound/wm8523.txt16
-rw-r--r--Documentation/devicetree/bindings/sound/wm8524.txt16
-rw-r--r--Documentation/devicetree/bindings/sound/wm8580.txt16
-rw-r--r--Documentation/devicetree/bindings/sound/wm8711.txt18
-rw-r--r--Documentation/devicetree/bindings/sound/wm8728.txt18
-rw-r--r--Documentation/devicetree/bindings/sound/wm8731.txt27
-rw-r--r--Documentation/devicetree/bindings/sound/wm8737.txt18
-rw-r--r--Documentation/devicetree/bindings/sound/wm8753.txt40
-rw-r--r--Documentation/devicetree/bindings/sound/wm8903.txt82
-rw-r--r--Documentation/devicetree/bindings/sound/wm8960.txt42
-rw-r--r--Documentation/devicetree/bindings/sound/wm8962.txt43
-rw-r--r--Documentation/devicetree/bindings/sound/wm8994.txt112
-rw-r--r--Documentation/devicetree/bindings/sound/zl38060.yaml7
-rw-r--r--Documentation/devicetree/bindings/soundwire/qcom,sdw.txt188
-rw-r--r--Documentation/devicetree/bindings/soundwire/qcom,soundwire.yaml271
-rw-r--r--Documentation/devicetree/bindings/soundwire/soundwire-controller.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml5
-rw-r--r--Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml6
-rw-r--r--Documentation/devicetree/bindings/spi/amlogic,a1-spifc.yaml41
-rw-r--r--Documentation/devicetree/bindings/spi/amlogic,meson-gx-spicc.yaml108
-rw-r--r--Documentation/devicetree/bindings/spi/amlogic,meson6-spifc.yaml31
-rw-r--r--Documentation/devicetree/bindings/spi/aspeed,ast2600-fmc.yaml82
-rw-r--r--Documentation/devicetree/bindings/spi/atmel,at91rm9200-spi.yaml85
-rw-r--r--Documentation/devicetree/bindings/spi/atmel,quadspi.yaml99
-rw-r--r--Documentation/devicetree/bindings/spi/atmel-quadspi.txt37
-rw-r--r--Documentation/devicetree/bindings/spi/brcm,bcm63xx-hsspi.yaml134
-rw-r--r--Documentation/devicetree/bindings/spi/brcm,spi-bcm-qspi.yaml156
-rw-r--r--Documentation/devicetree/bindings/spi/cdns,qspi-nor-peripheral-props.yaml42
-rw-r--r--Documentation/devicetree/bindings/spi/cdns,qspi-nor.yaml121
-rw-r--r--Documentation/devicetree/bindings/spi/cdns,xspi.yaml77
-rw-r--r--Documentation/devicetree/bindings/spi/efm32-spi.txt39
-rw-r--r--Documentation/devicetree/bindings/spi/fsl,spi-fsl-qspi.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/fsl-imx-cspi.yaml6
-rw-r--r--Documentation/devicetree/bindings/spi/hpe,gxp-spifi.yaml56
-rw-r--r--Documentation/devicetree/bindings/spi/ingenic,spi.yaml75
-rw-r--r--Documentation/devicetree/bindings/spi/marvell,mmp2-ssp.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/mediatek,spi-mt65xx.yaml113
-rw-r--r--Documentation/devicetree/bindings/spi/mediatek,spi-mtk-nor.yaml32
-rw-r--r--Documentation/devicetree/bindings/spi/mediatek,spi-mtk-snfi.yaml124
-rw-r--r--Documentation/devicetree/bindings/spi/mediatek,spi-slave-mt27xx.yaml58
-rw-r--r--Documentation/devicetree/bindings/spi/microchip,mpfs-spi.yaml58
-rw-r--r--Documentation/devicetree/bindings/spi/mikrotik,rb4xx-spi.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/mxicy,mx25f0a-spi.yaml65
-rw-r--r--Documentation/devicetree/bindings/spi/mxs-spi.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/nuvoton,npcm-fiu.txt15
-rw-r--r--Documentation/devicetree/bindings/spi/nuvoton,npcm-pspi.txt3
-rw-r--r--Documentation/devicetree/bindings/spi/nuvoton,wpcm450-fiu.yaml66
-rw-r--r--Documentation/devicetree/bindings/spi/nvidia,tegra210-quad-peripheral-props.yaml32
-rw-r--r--Documentation/devicetree/bindings/spi/nvidia,tegra210-quad.yaml69
-rw-r--r--Documentation/devicetree/bindings/spi/omap-spi.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/qcom,spi-geni-qcom.txt39
-rw-r--r--Documentation/devicetree/bindings/spi/qcom,spi-geni-qcom.yaml118
-rw-r--r--Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml23
-rw-r--r--Documentation/devicetree/bindings/spi/qcom,spi-qup.txt103
-rw-r--r--Documentation/devicetree/bindings/spi/qcom,spi-qup.yaml81
-rw-r--r--Documentation/devicetree/bindings/spi/ralink,mt7621-spi.yaml61
-rw-r--r--Documentation/devicetree/bindings/spi/realtek,rtl-spi.yaml2
-rw-r--r--Documentation/devicetree/bindings/spi/renesas,hspi.yaml1
-rw-r--r--Documentation/devicetree/bindings/spi/renesas,rspi.yaml30
-rw-r--r--Documentation/devicetree/bindings/spi/renesas,sh-msiof.yaml39
-rw-r--r--Documentation/devicetree/bindings/spi/samsung,spi-peripheral-props.yaml33
-rw-r--r--Documentation/devicetree/bindings/spi/samsung,spi.yaml192
-rw-r--r--Documentation/devicetree/bindings/spi/snps,dw-apb-ssi.yaml48
-rw-r--r--Documentation/devicetree/bindings/spi/socionext,f-ospi.yaml57
-rw-r--r--Documentation/devicetree/bindings/spi/socionext,synquacer-spi.yaml73
-rw-r--r--Documentation/devicetree/bindings/spi/spi-bcm63xx-hsspi.txt33
-rw-r--r--Documentation/devicetree/bindings/spi/spi-cadence.yaml11
-rw-r--r--Documentation/devicetree/bindings/spi/spi-controller.yaml63
-rw-r--r--Documentation/devicetree/bindings/spi/spi-davinci.txt2
-rw-r--r--Documentation/devicetree/bindings/spi/spi-fsl-lpspi.yaml33
-rw-r--r--Documentation/devicetree/bindings/spi/spi-gpio.yaml6
-rw-r--r--Documentation/devicetree/bindings/spi/spi-mt65xx.txt68
-rw-r--r--Documentation/devicetree/bindings/spi/spi-mt7621.txt26
-rw-r--r--Documentation/devicetree/bindings/spi/spi-mux.yaml3
-rw-r--r--Documentation/devicetree/bindings/spi/spi-mxic.txt34
-rw-r--r--Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt44
-rw-r--r--Documentation/devicetree/bindings/spi/spi-nxp-fspi.yaml87
-rw-r--r--Documentation/devicetree/bindings/spi/spi-peripheral-props.yaml122
-rw-r--r--Documentation/devicetree/bindings/spi/spi-pl022.yaml27
-rw-r--r--Documentation/devicetree/bindings/spi/spi-rockchip.yaml8
-rw-r--r--Documentation/devicetree/bindings/spi/spi-samsung.txt122
-rw-r--r--Documentation/devicetree/bindings/spi/spi-sifive.yaml6
-rw-r--r--Documentation/devicetree/bindings/spi/spi-slave-mt27xx.txt33
-rw-r--r--Documentation/devicetree/bindings/spi/spi-st-ssc.txt40
-rw-r--r--Documentation/devicetree/bindings/spi/spi-sunplus-sp7021.yaml78
-rw-r--r--Documentation/devicetree/bindings/spi/spi-synquacer.txt27
-rw-r--r--Documentation/devicetree/bindings/spi/spi-xilinx.yaml4
-rw-r--r--Documentation/devicetree/bindings/spi/spi-xlp.txt38
-rw-r--r--Documentation/devicetree/bindings/spi/spi-zynqmp-qspi.yaml15
-rw-r--r--Documentation/devicetree/bindings/spi/spi_atmel.txt36
-rw-r--r--Documentation/devicetree/bindings/spi/sprd,spi-adi.yaml7
-rw-r--r--Documentation/devicetree/bindings/spi/st,ssc-spi.yaml61
-rw-r--r--Documentation/devicetree/bindings/spi/st,stm32-qspi.yaml8
-rw-r--r--Documentation/devicetree/bindings/spi/st,stm32-spi.yaml31
-rw-r--r--Documentation/devicetree/bindings/spi/xlnx,zynq-qspi.yaml2
-rw-r--r--Documentation/devicetree/bindings/spmi/mtk,spmi-mtk-pmif.yaml76
-rw-r--r--Documentation/devicetree/bindings/spmi/qcom,spmi-pmic-arb.txt65
-rw-r--r--Documentation/devicetree/bindings/spmi/qcom,spmi-pmic-arb.yaml127
-rw-r--r--Documentation/devicetree/bindings/spmi/spmi.yaml3
-rw-r--r--Documentation/devicetree/bindings/sram/allwinner,sun4i-a10-system-control.yaml94
-rw-r--r--Documentation/devicetree/bindings/sram/qcom,imem.yaml78
-rw-r--r--Documentation/devicetree/bindings/sram/qcom,ocmem.yaml11
-rw-r--r--Documentation/devicetree/bindings/sram/sram.yaml20
-rw-r--r--Documentation/devicetree/bindings/submitting-patches.rst3
-rw-r--r--Documentation/devicetree/bindings/thermal/allwinner,sun8i-a83t-ths.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/amlogic,thermal.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/brcm,avs-ro-thermal.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/brcm,avs-tmon.txt23
-rw-r--r--Documentation/devicetree/bindings/thermal/brcm,avs-tmon.yaml56
-rw-r--r--Documentation/devicetree/bindings/thermal/exynos-thermal.txt106
-rw-r--r--Documentation/devicetree/bindings/thermal/fsl,scu-thermal.yaml38
-rw-r--r--Documentation/devicetree/bindings/thermal/generic-adc-thermal.yaml84
-rw-r--r--Documentation/devicetree/bindings/thermal/imx-thermal.yaml20
-rw-r--r--Documentation/devicetree/bindings/thermal/imx8mm-thermal.yaml9
-rw-r--r--Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml142
-rw-r--r--Documentation/devicetree/bindings/thermal/mediatek-thermal.txt3
-rw-r--r--Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt2
-rw-r--r--Documentation/devicetree/bindings/thermal/nvidia,tegra186-bpmp-thermal.txt33
-rw-r--r--Documentation/devicetree/bindings/thermal/nvidia,tegra186-bpmp-thermal.yaml42
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom,spmi-temp-alarm.yaml85
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-lmh.yaml4
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm-hc.yaml149
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm5.yaml116
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt51
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-tsens.yaml244
-rw-r--r--Documentation/devicetree/bindings/thermal/qoriq-thermal.yaml4
-rw-r--r--Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml38
-rw-r--r--Documentation/devicetree/bindings/thermal/rcar-thermal.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/rockchip-thermal.yaml24
-rw-r--r--Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml78
-rw-r--r--Documentation/devicetree/bindings/thermal/samsung,exynos-thermal.yaml184
-rw-r--r--Documentation/devicetree/bindings/thermal/socionext,uniphier-thermal.yaml16
-rw-r--r--Documentation/devicetree/bindings/thermal/sprd-thermal.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/st,stm32-thermal.yaml4
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal-cooling-devices.yaml12
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal-generic-adc.txt95
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal-idle.yaml160
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal-sensor.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal-zones.yaml13
-rw-r--r--Documentation/devicetree/bindings/thermal/ti,am654-thermal.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml96
-rw-r--r--Documentation/devicetree/bindings/timer/allwinner,sun4i-a10-timer.yaml3
-rw-r--r--Documentation/devicetree/bindings/timer/allwinner,sun5i-a13-hstimer.yaml2
-rw-r--r--Documentation/devicetree/bindings/timer/amlogic,meson6-timer.txt22
-rw-r--r--Documentation/devicetree/bindings/timer/amlogic,meson6-timer.yaml54
-rw-r--r--Documentation/devicetree/bindings/timer/andestech,atcpit100-timer.txt33
-rw-r--r--Documentation/devicetree/bindings/timer/arm,arch_timer.yaml11
-rw-r--r--Documentation/devicetree/bindings/timer/arm,arch_timer_mmio.yaml3
-rw-r--r--Documentation/devicetree/bindings/timer/arm,armv7m-systick.txt26
-rw-r--r--Documentation/devicetree/bindings/timer/arm,armv7m-systick.yaml54
-rw-r--r--Documentation/devicetree/bindings/timer/brcm,bcmbca-timer.yaml40
-rw-r--r--Documentation/devicetree/bindings/timer/cdns,ttc.yaml6
-rw-r--r--Documentation/devicetree/bindings/timer/fsl,imxgpt.yaml2
-rw-r--r--Documentation/devicetree/bindings/timer/hpe,gxp-timer.yaml47
-rw-r--r--Documentation/devicetree/bindings/timer/ingenic,sysost.yaml4
-rw-r--r--Documentation/devicetree/bindings/timer/ingenic,tcu.yaml13
-rw-r--r--Documentation/devicetree/bindings/timer/intel,ixp4xx-timer.yaml4
-rw-r--r--Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt10
-rw-r--r--Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml2
-rw-r--r--Documentation/devicetree/bindings/timer/mstar,msc313e-timer.yaml46
-rw-r--r--Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.txt21
-rw-r--r--Documentation/devicetree/bindings/timer/nuvoton,npcm7xx-timer.yaml54
-rw-r--r--Documentation/devicetree/bindings/timer/nvidia,tegra-timer.yaml150
-rw-r--r--Documentation/devicetree/bindings/timer/nvidia,tegra186-timer.yaml109
-rw-r--r--Documentation/devicetree/bindings/timer/nvidia,tegra20-timer.txt24
-rw-r--r--Documentation/devicetree/bindings/timer/nvidia,tegra210-timer.txt36
-rw-r--r--Documentation/devicetree/bindings/timer/nvidia,tegra30-timer.txt28
-rw-r--r--Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml4
-rw-r--r--Documentation/devicetree/bindings/timer/nxp,tpm-timer.yaml6
-rw-r--r--Documentation/devicetree/bindings/timer/qcom,msm-timer.txt47
-rw-r--r--Documentation/devicetree/bindings/timer/rda,8810pl-timer.txt20
-rw-r--r--Documentation/devicetree/bindings/timer/rda,8810pl-timer.yaml47
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,16bit-timer.txt25
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,8bit-timer.txt25
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,cmt.yaml18
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,ostm.yaml24
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,rz-mtu3.yaml302
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,tmu.yaml2
-rw-r--r--Documentation/devicetree/bindings/timer/riscv,timer.yaml52
-rw-r--r--Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml7
-rw-r--r--Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml98
-rw-r--r--Documentation/devicetree/bindings/timer/sifive,clint.yaml35
-rw-r--r--Documentation/devicetree/bindings/timer/st,nomadik-mtu.yaml58
-rw-r--r--Documentation/devicetree/bindings/timer/st,stm32-timer.yaml5
-rw-r--r--Documentation/devicetree/bindings/timer/ti,timer-dm.yaml159
-rw-r--r--Documentation/devicetree/bindings/timer/ti,timer.txt44
-rw-r--r--Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml92
-rw-r--r--Documentation/devicetree/bindings/timestamp/hardware-timestamps-common.yaml29
-rw-r--r--Documentation/devicetree/bindings/timestamp/hte-consumer.yaml39
-rw-r--r--Documentation/devicetree/bindings/timestamp/nvidia,tegra194-hte.yaml140
-rw-r--r--Documentation/devicetree/bindings/trivial-devices.yaml108
-rw-r--r--Documentation/devicetree/bindings/ufs/cdns,ufshc.txt32
-rw-r--r--Documentation/devicetree/bindings/ufs/cdns,ufshc.yaml73
-rw-r--r--Documentation/devicetree/bindings/ufs/hisilicon,ufs.yaml90
-rw-r--r--Documentation/devicetree/bindings/ufs/mediatek,ufs.yaml67
-rw-r--r--Documentation/devicetree/bindings/ufs/qcom,ufs.yaml253
-rw-r--r--Documentation/devicetree/bindings/ufs/renesas,ufs.yaml61
-rw-r--r--Documentation/devicetree/bindings/ufs/samsung,exynos-ufs.yaml26
-rw-r--r--Documentation/devicetree/bindings/ufs/snps,tc-dwc-g210.yaml51
-rw-r--r--Documentation/devicetree/bindings/ufs/sprd,ums9620-ufs.yaml79
-rw-r--r--Documentation/devicetree/bindings/ufs/tc-dwc-g210-pltfrm.txt26
-rw-r--r--Documentation/devicetree/bindings/ufs/ti,j721e-ufs.yaml7
-rw-r--r--Documentation/devicetree/bindings/ufs/ufs-common.yaml82
-rw-r--r--Documentation/devicetree/bindings/ufs/ufs-hisi.txt42
-rw-r--r--Documentation/devicetree/bindings/ufs/ufs-mediatek.txt45
-rw-r--r--Documentation/devicetree/bindings/ufs/ufs-qcom.txt63
-rw-r--r--Documentation/devicetree/bindings/ufs/ufshcd-pltfrm.txt89
-rw-r--r--Documentation/devicetree/bindings/usb/allwinner,sun4i-a10-musb.yaml13
-rw-r--r--Documentation/devicetree/bindings/usb/am33xx-usb.txt7
-rw-r--r--Documentation/devicetree/bindings/usb/amlogic,meson-g12a-usb-ctrl.yaml7
-rw-r--r--Documentation/devicetree/bindings/usb/analogix,anx7411.yaml83
-rw-r--r--Documentation/devicetree/bindings/usb/aspeed,ast2600-udc.yaml52
-rw-r--r--Documentation/devicetree/bindings/usb/aspeed,usb-vhub.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/atmel-usb.txt7
-rw-r--r--Documentation/devicetree/bindings/usb/brcm,bcm3384-usb.txt11
-rw-r--r--Documentation/devicetree/bindings/usb/brcm,bcm7445-ehci.yaml4
-rw-r--r--Documentation/devicetree/bindings/usb/brcm,bdc.txt29
-rw-r--r--Documentation/devicetree/bindings/usb/brcm,bdc.yaml50
-rw-r--r--Documentation/devicetree/bindings/usb/brcm,usb-pinmap.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/cdns,usb3.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/ci-hdrc-usb2.txt158
-rw-r--r--Documentation/devicetree/bindings/usb/ci-hdrc-usb2.yaml448
-rw-r--r--Documentation/devicetree/bindings/usb/cypress,cypd4226.yaml98
-rw-r--r--Documentation/devicetree/bindings/usb/da8xx-usb.txt5
-rw-r--r--Documentation/devicetree/bindings/usb/dwc2.yaml63
-rw-r--r--Documentation/devicetree/bindings/usb/dwc3-cavium.txt2
-rw-r--r--Documentation/devicetree/bindings/usb/dwc3-st.txt2
-rw-r--r--Documentation/devicetree/bindings/usb/dwc3-xilinx.txt56
-rw-r--r--Documentation/devicetree/bindings/usb/dwc3-xilinx.yaml135
-rw-r--r--Documentation/devicetree/bindings/usb/ehci-omap.txt31
-rw-r--r--Documentation/devicetree/bindings/usb/ehci-orion.txt22
-rw-r--r--Documentation/devicetree/bindings/usb/ehci-st.txt2
-rw-r--r--Documentation/devicetree/bindings/usb/exynos-usb.txt115
-rw-r--r--Documentation/devicetree/bindings/usb/faraday,fotg210.txt35
-rw-r--r--Documentation/devicetree/bindings/usb/faraday,fotg210.yaml78
-rw-r--r--Documentation/devicetree/bindings/usb/fcs,fsa4480.yaml72
-rw-r--r--Documentation/devicetree/bindings/usb/fcs,fusb302.txt34
-rw-r--r--Documentation/devicetree/bindings/usb/fcs,fusb302.yaml67
-rw-r--r--Documentation/devicetree/bindings/usb/fsl,imx8mp-dwc3.yaml37
-rw-r--r--Documentation/devicetree/bindings/usb/fsl,imx8mq-dwc3.yaml48
-rw-r--r--Documentation/devicetree/bindings/usb/fsl,usbmisc.yaml68
-rw-r--r--Documentation/devicetree/bindings/usb/generic-ehci.yaml22
-rw-r--r--Documentation/devicetree/bindings/usb/generic-ohci.yaml42
-rw-r--r--Documentation/devicetree/bindings/usb/generic-xhci.yaml4
-rw-r--r--Documentation/devicetree/bindings/usb/genesys,gl850g.yaml49
-rw-r--r--Documentation/devicetree/bindings/usb/gpio-sbu-mux.yaml110
-rw-r--r--Documentation/devicetree/bindings/usb/ingenic,musb.yaml4
-rw-r--r--Documentation/devicetree/bindings/usb/intel,keembay-dwc3.yaml3
-rw-r--r--Documentation/devicetree/bindings/usb/marvell,pxau2o-ehci.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/maxim,max33359.yaml8
-rw-r--r--Documentation/devicetree/bindings/usb/maxim,max3420-udc.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/mediatek,mt6360-tcpc.yaml13
-rw-r--r--Documentation/devicetree/bindings/usb/mediatek,mt6370-tcpc.yaml36
-rw-r--r--Documentation/devicetree/bindings/usb/mediatek,mtk-xhci.yaml28
-rw-r--r--Documentation/devicetree/bindings/usb/mediatek,mtu3.yaml25
-rw-r--r--Documentation/devicetree/bindings/usb/mediatek,musb.yaml8
-rw-r--r--Documentation/devicetree/bindings/usb/microchip,mpfs-musb.yaml59
-rw-r--r--Documentation/devicetree/bindings/usb/npcm7xx-usb.txt18
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra-xudc.yaml36
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra124-xusb.txt132
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra124-xusb.yaml200
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra186-xusb.yaml171
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra194-xusb.yaml175
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra210-xusb.yaml195
-rw-r--r--Documentation/devicetree/bindings/usb/nvidia,tegra234-xusb.yaml159
-rw-r--r--Documentation/devicetree/bindings/usb/nxp,isp1760.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/nxp,ptn5110.yaml72
-rw-r--r--Documentation/devicetree/bindings/usb/ohci-nxp.txt24
-rw-r--r--Documentation/devicetree/bindings/usb/ohci-omap3.txt15
-rw-r--r--Documentation/devicetree/bindings/usb/ohci-st.txt2
-rw-r--r--Documentation/devicetree/bindings/usb/pxa-usb.txt2
-rw-r--r--Documentation/devicetree/bindings/usb/qcom,dwc3.yaml402
-rw-r--r--Documentation/devicetree/bindings/usb/realtek,rts5411.yaml14
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,rzn1-usbf.yaml68
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,rzv2m-usb3drd.yaml129
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml41
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,usb3-peri.yaml63
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,usbhs.yaml9
-rw-r--r--Documentation/devicetree/bindings/usb/richtek,rt1711h.yaml100
-rw-r--r--Documentation/devicetree/bindings/usb/richtek,rt1719.yaml85
-rw-r--r--Documentation/devicetree/bindings/usb/rockchip,dwc3.yaml12
-rw-r--r--Documentation/devicetree/bindings/usb/rockchip,rk3399-dwc3.yaml115
-rw-r--r--Documentation/devicetree/bindings/usb/samsung,exynos-dwc3.yaml129
-rw-r--r--Documentation/devicetree/bindings/usb/samsung,exynos-usb2.yaml107
-rw-r--r--Documentation/devicetree/bindings/usb/smsc,usb3503.yaml159
-rw-r--r--Documentation/devicetree/bindings/usb/snps,dwc3.yaml95
-rw-r--r--Documentation/devicetree/bindings/usb/spear-usb.txt35
-rw-r--r--Documentation/devicetree/bindings/usb/st,stusb160x.yaml26
-rw-r--r--Documentation/devicetree/bindings/usb/st,typec-stm32g0.yaml91
-rw-r--r--Documentation/devicetree/bindings/usb/ti,am62-usb.yaml103
-rw-r--r--Documentation/devicetree/bindings/usb/ti,hd3ss3220.yaml3
-rw-r--r--Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml14
-rw-r--r--Documentation/devicetree/bindings/usb/ti,keystone-dwc3.yaml12
-rw-r--r--Documentation/devicetree/bindings/usb/ti,tps6598x.yaml17
-rw-r--r--Documentation/devicetree/bindings/usb/ti,usb8041.yaml67
-rw-r--r--Documentation/devicetree/bindings/usb/typec-tcpci.txt49
-rw-r--r--Documentation/devicetree/bindings/usb/udc-xilinx.txt18
-rw-r--r--Documentation/devicetree/bindings/usb/usb-device.yaml3
-rw-r--r--Documentation/devicetree/bindings/usb/usb-drd.yaml3
-rw-r--r--Documentation/devicetree/bindings/usb/usb-hcd.yaml4
-rw-r--r--Documentation/devicetree/bindings/usb/usb-nop-xceiv.yaml10
-rw-r--r--Documentation/devicetree/bindings/usb/usb-xhci.yaml4
-rw-r--r--Documentation/devicetree/bindings/usb/usb.yaml2
-rw-r--r--Documentation/devicetree/bindings/usb/usb251xb.txt89
-rw-r--r--Documentation/devicetree/bindings/usb/usb251xb.yaml271
-rw-r--r--Documentation/devicetree/bindings/usb/usb3503.txt39
-rw-r--r--Documentation/devicetree/bindings/usb/usbmisc-imx.txt18
-rw-r--r--Documentation/devicetree/bindings/usb/vialab,vl817.yaml71
-rw-r--r--Documentation/devicetree/bindings/usb/willsemi,wusb3801.yaml76
-rw-r--r--Documentation/devicetree/bindings/usb/xlnx,usb2.yaml47
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.yaml272
-rw-r--r--Documentation/devicetree/bindings/virtio/iommu.txt66
-rw-r--r--Documentation/devicetree/bindings/virtio/mmio.yaml6
-rw-r--r--Documentation/devicetree/bindings/virtio/pci-iommu.yaml101
-rw-r--r--Documentation/devicetree/bindings/virtio/virtio-device.yaml2
-rw-r--r--Documentation/devicetree/bindings/w1/maxim,ds2482.yaml44
-rw-r--r--Documentation/devicetree/bindings/w1/w1-gpio.txt27
-rw-r--r--Documentation/devicetree/bindings/w1/w1-gpio.yaml43
-rw-r--r--Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml31
-rw-r--r--Documentation/devicetree/bindings/watchdog/alphascale,asm9260-wdt.yaml70
-rw-r--r--Documentation/devicetree/bindings/watchdog/alphascale-asm9260.txt35
-rw-r--r--Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml12
-rw-r--r--Documentation/devicetree/bindings/watchdog/amlogic,meson6-wdt.yaml50
-rw-r--r--Documentation/devicetree/bindings/watchdog/apple,wdt.yaml53
-rw-r--r--Documentation/devicetree/bindings/watchdog/arm,sbsa-gwdt.yaml1
-rw-r--r--Documentation/devicetree/bindings/watchdog/arm,sp805.yaml1
-rw-r--r--Documentation/devicetree/bindings/watchdog/arm,twd-wdt.yaml6
-rw-r--r--Documentation/devicetree/bindings/watchdog/arm-smc-wdt.yaml9
-rw-r--r--Documentation/devicetree/bindings/watchdog/atmel,at91sam9-wdt.yaml127
-rw-r--r--Documentation/devicetree/bindings/watchdog/atmel,sama5d4-wdt.yaml19
-rw-r--r--Documentation/devicetree/bindings/watchdog/atmel-wdt.txt51
-rw-r--r--Documentation/devicetree/bindings/watchdog/brcm,bcm7038-wdt.txt19
-rw-r--r--Documentation/devicetree/bindings/watchdog/brcm,bcm7038-wdt.yaml43
-rw-r--r--Documentation/devicetree/bindings/watchdog/da9062-wdt.txt6
-rw-r--r--Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt22
-rw-r--r--Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml67
-rw-r--r--Documentation/devicetree/bindings/watchdog/fsl,scu-wdt.yaml34
-rw-r--r--Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.yaml37
-rw-r--r--Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml22
-rw-r--r--Documentation/devicetree/bindings/watchdog/gpio-wdt.txt28
-rw-r--r--Documentation/devicetree/bindings/watchdog/linux,wdt-gpio.yaml68
-rw-r--r--Documentation/devicetree/bindings/watchdog/maxim,max63xx.yaml3
-rw-r--r--Documentation/devicetree/bindings/watchdog/mediatek,mt7621-wdt.yaml40
-rw-r--r--Documentation/devicetree/bindings/watchdog/mediatek,mtk-wdt.yaml88
-rw-r--r--Documentation/devicetree/bindings/watchdog/meson-wdt.txt21
-rw-r--r--Documentation/devicetree/bindings/watchdog/mstar,msc313e-wdt.yaml2
-rw-r--r--Documentation/devicetree/bindings/watchdog/mt7621-wdt.txt12
-rw-r--r--Documentation/devicetree/bindings/watchdog/mtk-wdt.txt40
-rw-r--r--Documentation/devicetree/bindings/watchdog/nuvoton,npcm-wdt.txt3
-rw-r--r--Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt26
-rw-r--r--Documentation/devicetree/bindings/watchdog/qcom,pm8916-wdt.txt28
-rw-r--r--Documentation/devicetree/bindings/watchdog/qcom,pm8916-wdt.yaml51
-rw-r--r--Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml134
-rw-r--r--Documentation/devicetree/bindings/watchdog/ralink,rt2880-wdt.yaml46
-rw-r--r--Documentation/devicetree/bindings/watchdog/realtek,otto-wdt.yaml90
-rw-r--r--Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml130
-rw-r--r--Documentation/devicetree/bindings/watchdog/rt2880-wdt.txt18
-rw-r--r--Documentation/devicetree/bindings/watchdog/samsung-wdt.yaml51
-rw-r--r--Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml36
-rw-r--r--Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml14
-rw-r--r--Documentation/devicetree/bindings/watchdog/st,stm32-iwdg.yaml18
-rw-r--r--Documentation/devicetree/bindings/watchdog/starfive,jh7100-wdt.yaml71
-rw-r--r--Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml47
-rw-r--r--Documentation/devicetree/bindings/watchdog/ti,rti-wdt.yaml8
-rw-r--r--Documentation/devicetree/bindings/watchdog/toshiba,visconti-wdt.yaml16
-rw-r--r--Documentation/devicetree/bindings/watchdog/watchdog.yaml9
-rw-r--r--Documentation/devicetree/bindings/watchdog/xlnx,xps-timebase-wdt.yaml68
-rw-r--r--Documentation/devicetree/bindings/writing-bindings.rst29
-rw-r--r--Documentation/devicetree/bindings/writing-schema.rst63
-rw-r--r--Documentation/devicetree/of_unittest.rst27
-rw-r--r--Documentation/devicetree/overlay-notes.rst30
-rw-r--r--Documentation/doc-guide/contributing.rst5
-rw-r--r--Documentation/doc-guide/kernel-doc.rst7
-rw-r--r--Documentation/doc-guide/sphinx.rst120
-rw-r--r--Documentation/dontdiff 2
-rw-r--r--Documentation/driver-api/aperture.rst13
-rw-r--r--Documentation/driver-api/auxiliary_bus.rst236
-rw-r--r--Documentation/driver-api/basics.rst3
-rw-r--r--Documentation/driver-api/clk.rst5
-rw-r--r--Documentation/driver-api/cxl/memory-devices.rst327
-rw-r--r--Documentation/driver-api/device-io.rst11
-rw-r--r--Documentation/driver-api/dma-buf.rst49
-rw-r--r--Documentation/driver-api/dmaengine/client.rst2
-rw-r--r--Documentation/driver-api/dmaengine/dmatest.rst19
-rw-r--r--Documentation/driver-api/dmaengine/provider.rst21
-rw-r--r--Documentation/driver-api/driver-model/bus.rst4
-rw-r--r--Documentation/driver-api/driver-model/devres.rst39
-rw-r--r--Documentation/driver-api/eisa.rst2
-rw-r--r--Documentation/driver-api/firewire.rst4
-rw-r--r--Documentation/driver-api/firmware/core.rst1
-rw-r--r--Documentation/driver-api/firmware/firmware-usage-guidelines.rst44
-rw-r--r--Documentation/driver-api/firmware/fw_search_path.rst9
-rw-r--r--Documentation/driver-api/firmware/fw_upload.rst127
-rw-r--r--Documentation/driver-api/firmware/index.rst1
-rw-r--r--Documentation/driver-api/firmware/other_interfaces.rst6
-rw-r--r--Documentation/driver-api/fpga/fpga-bridge.rst6
-rw-r--r--Documentation/driver-api/fpga/fpga-mgr.rst65
-rw-r--r--Documentation/driver-api/fpga/fpga-region.rst12
-rw-r--r--Documentation/driver-api/generic-counter.rst371
-rw-r--r--Documentation/driver-api/gpio/board.rst23
-rw-r--r--Documentation/driver-api/gpio/consumer.rst8
-rw-r--r--Documentation/driver-api/gpio/driver.rst193
-rw-r--r--Documentation/driver-api/gpio/intro.rst6
-rw-r--r--Documentation/driver-api/gpio/legacy.rst45
-rw-r--r--Documentation/driver-api/gpio/using-gpio.rst2
-rw-r--r--Documentation/driver-api/hsi.rst4
-rw-r--r--Documentation/driver-api/hte/hte.rst79
-rw-r--r--Documentation/driver-api/hte/index.rst22
-rw-r--r--Documentation/driver-api/hte/tegra-hte.rst47
-rw-r--r--Documentation/driver-api/index.rst13
-rw-r--r--Documentation/driver-api/io-mapping.rst4
-rw-r--r--Documentation/driver-api/ipmi.rst64
-rw-r--r--Documentation/driver-api/isa.rst2
-rw-r--r--Documentation/driver-api/libata.rst11
-rw-r--r--Documentation/driver-api/md/md-cluster.rst2
-rw-r--r--Documentation/driver-api/md/raid5-cache.rst2
-rw-r--r--Documentation/driver-api/media/cec-core.rst13
-rw-r--r--Documentation/driver-api/media/drivers/ccs/ccs.rst22
-rw-r--r--Documentation/driver-api/media/drivers/cpia2_devel.rst56
-rw-r--r--Documentation/driver-api/media/drivers/davinci-vpbe-devel.rst39
-rw-r--r--Documentation/driver-api/media/drivers/fimc-devel.rst14
-rw-r--r--Documentation/driver-api/media/drivers/index.rst3
-rw-r--r--Documentation/driver-api/media/drivers/pxa_camera.rst2
-rw-r--r--Documentation/driver-api/media/drivers/rkisp1.rst43
-rw-r--r--Documentation/driver-api/media/drivers/vidtv.rst2
-rw-r--r--Documentation/driver-api/media/dtv-demux.rst2
-rw-r--r--Documentation/driver-api/media/maintainer-entry-profile.rst2
-rw-r--r--Documentation/driver-api/media/mc-core.rst43
-rw-r--r--Documentation/driver-api/media/v4l2-event.rst2
-rw-r--r--Documentation/driver-api/media/v4l2-subdev.rst99
-rw-r--r--Documentation/driver-api/mei/nfc.rst2
-rw-r--r--Documentation/driver-api/miscellaneous.rst5
-rw-r--r--Documentation/driver-api/mmc/mmc-tools.rst4
-rw-r--r--Documentation/driver-api/mtd/index.rst2
-rw-r--r--Documentation/driver-api/mtd/intel-spi.rst90
-rw-r--r--Documentation/driver-api/mtd/spi-intel.rst90
-rw-r--r--Documentation/driver-api/mtd/spi-nor.rst3
-rw-r--r--Documentation/driver-api/nfc/nfc-hci.rst2
-rw-r--r--Documentation/driver-api/nvdimm/nvdimm.rst406
-rw-r--r--Documentation/driver-api/nvdimm/security.rst2
-rw-r--r--Documentation/driver-api/nvmem.rst43
-rw-r--r--Documentation/driver-api/pci/pci.rst2
-rw-r--r--Documentation/driver-api/phy/phy.rst49
-rw-r--r--Documentation/driver-api/pin-control.rst510
-rw-r--r--Documentation/driver-api/pldmfw/index.rst2
-rw-r--r--Documentation/driver-api/pwm.rst20
-rw-r--r--Documentation/driver-api/serial/driver.rst485
-rw-r--r--Documentation/driver-api/serial/index.rst3
-rw-r--r--Documentation/driver-api/serial/n_gsm.rst100
-rw-r--r--Documentation/driver-api/serial/serial-rs485.rst58
-rw-r--r--Documentation/driver-api/serial/tty.rst328
-rw-r--r--Documentation/driver-api/spi.rst4
-rw-r--r--Documentation/driver-api/surface_aggregator/client.rst18
-rw-r--r--Documentation/driver-api/surface_aggregator/ssh.rst38
-rw-r--r--Documentation/driver-api/thermal/index.rst2
-rw-r--r--Documentation/driver-api/thermal/intel_dptf.rst317
-rw-r--r--Documentation/driver-api/thermal/sysfs-api.rst265
-rw-r--r--Documentation/driver-api/tty/index.rst73
-rw-r--r--Documentation/driver-api/tty/moxa-smartio.rst (renamed from Documentation/driver-api/serial/moxa-smartio.rst)0
-rw-r--r--Documentation/driver-api/tty/n_gsm.rst192
-rw-r--r--Documentation/driver-api/tty/n_tty.rst22
-rw-r--r--Documentation/driver-api/tty/tty_buffer.rst46
-rw-r--r--Documentation/driver-api/tty/tty_driver.rst128
-rw-r--r--Documentation/driver-api/tty/tty_internals.rst31
-rw-r--r--Documentation/driver-api/tty/tty_ldisc.rst85
-rw-r--r--Documentation/driver-api/tty/tty_port.rst70
-rw-r--r--Documentation/driver-api/tty/tty_struct.rst81
-rw-r--r--Documentation/driver-api/usb/dwc3.rst2
-rw-r--r--Documentation/driver-api/usb/usb3-debug-port.rst2
-rw-r--r--Documentation/driver-api/usb/writing_usb_driver.rst45
-rw-r--r--Documentation/driver-api/vfio-mediated-device.rst169
-rw-r--r--Documentation/driver-api/vfio-pci-device-specific-driver-acceptance.rst35
-rw-r--r--Documentation/driver-api/vfio.rst84
-rw-r--r--Documentation/driver-api/virtio/index.rst11
-rw-r--r--Documentation/driver-api/virtio/virtio.rst145
-rw-r--r--Documentation/driver-api/virtio/writing_virtio_drivers.rst197
-rw-r--r--Documentation/driver-api/vme.rst4
-rw-r--r--Documentation/fault-injection/fault-injection.rst104
-rw-r--r--Documentation/fault-injection/notifier-error-inject.rst4
-rw-r--r--Documentation/fb/modedb.rst10
-rw-r--r--Documentation/fb/udlfb.rst23
-rw-r--r--Documentation/features/core/cBPF-JIT/arch-support.txt3
-rw-r--r--Documentation/features/core/eBPF-JIT/arch-support.txt3
-rw-r--r--Documentation/features/core/generic-idle-thread/arch-support.txt3
-rw-r--r--Documentation/features/core/jump-labels/arch-support.txt5
-rw-r--r--Documentation/features/core/thread-info-in-task/arch-support.txt7
-rw-r--r--Documentation/features/core/tracehook/arch-support.txt3
-rw-r--r--Documentation/features/debug/KASAN/arch-support.txt5
-rw-r--r--Documentation/features/debug/debug-vm-pgtable/arch-support.txt7
-rw-r--r--Documentation/features/debug/gcov-profile-all/arch-support.txt7
-rw-r--r--Documentation/features/debug/kcov/arch-support.txt5
-rw-r--r--Documentation/features/debug/kgdb/arch-support.txt3
-rw-r--r--Documentation/features/debug/kmemleak/arch-support.txt3
-rw-r--r--Documentation/features/debug/kprobes-on-ftrace/arch-support.txt3
-rw-r--r--Documentation/features/debug/kprobes/arch-support.txt3
-rw-r--r--Documentation/features/debug/kretprobes/arch-support.txt3
-rw-r--r--Documentation/features/debug/optprobes/arch-support.txt3
-rw-r--r--Documentation/features/debug/stackprotector/arch-support.txt3
-rw-r--r--Documentation/features/debug/uprobes/arch-support.txt3
-rw-r--r--Documentation/features/debug/user-ret-profiler/arch-support.txt3
-rw-r--r--Documentation/features/io/dma-contiguous/arch-support.txt3
-rw-r--r--Documentation/features/locking/cmpxchg-local/arch-support.txt3
-rw-r--r--Documentation/features/locking/lockdep/arch-support.txt3
-rw-r--r--Documentation/features/locking/queued-rwlocks/arch-support.txt5
-rw-r--r--Documentation/features/locking/queued-spinlocks/arch-support.txt5
-rw-r--r--Documentation/features/perf/kprobes-event/arch-support.txt5
-rw-r--r--Documentation/features/perf/perf-regs/arch-support.txt3
-rw-r--r--Documentation/features/perf/perf-stackdump/arch-support.txt3
-rw-r--r--Documentation/features/sched/membarrier-sync-core/arch-support.txt7
-rw-r--r--Documentation/features/sched/numa-balancing/arch-support.txt3
-rwxr-xr-xDocumentation/features/scripts/features-refresh.sh2
-rw-r--r--Documentation/features/seccomp/seccomp-filter/arch-support.txt5
-rw-r--r--Documentation/features/time/arch-tick-broadcast/arch-support.txt3
-rw-r--r--Documentation/features/time/clockevents/arch-support.txt3
-rw-r--r--Documentation/features/time/context-tracking/arch-support.txt11
-rw-r--r--Documentation/features/time/irq-time-acct/arch-support.txt3
-rw-r--r--Documentation/features/time/virt-cpuacct/arch-support.txt5
-rw-r--r--Documentation/features/vm/ELF-ASLR/arch-support.txt3
-rw-r--r--Documentation/features/vm/PG_uncached/arch-support.txt3
-rw-r--r--Documentation/features/vm/THP/arch-support.txt3
-rw-r--r--Documentation/features/vm/TLB/arch-support.txt3
-rw-r--r--Documentation/features/vm/huge-vmap/arch-support.txt5
-rw-r--r--Documentation/features/vm/ioremap_prot/arch-support.txt5
-rw-r--r--Documentation/features/vm/pte_special/arch-support.txt5
-rw-r--r--Documentation/filesystems/9p.rst52
-rw-r--r--Documentation/filesystems/autofs.rst2
-rw-r--r--Documentation/filesystems/btrfs.rst16
-rw-r--r--Documentation/filesystems/caching/backend-api.rst850
-rw-r--r--Documentation/filesystems/caching/cachefiles.rst184
-rw-r--r--Documentation/filesystems/caching/fscache.rst525
-rw-r--r--Documentation/filesystems/caching/index.rst4
-rw-r--r--Documentation/filesystems/caching/netfs-api.rst1136
-rw-r--r--Documentation/filesystems/caching/object.rst313
-rw-r--r--Documentation/filesystems/caching/operations.rst210
-rw-r--r--Documentation/filesystems/ceph.rst26
-rw-r--r--Documentation/filesystems/cifs/ksmbd.rst56
-rw-r--r--Documentation/filesystems/configfs.rst48
-rw-r--r--Documentation/filesystems/dax.rst24
-rw-r--r--Documentation/filesystems/debugfs.rst8
-rw-r--r--Documentation/filesystems/erofs.rst102
-rw-r--r--Documentation/filesystems/ext2.rst2
-rw-r--r--Documentation/filesystems/ext4/attributes.rst68
-rw-r--r--Documentation/filesystems/ext4/bigalloc.rst2
-rw-r--r--Documentation/filesystems/ext4/bitmaps.rst6
-rw-r--r--Documentation/filesystems/ext4/blockgroup.rst36
-rw-r--r--Documentation/filesystems/ext4/blockmap.rst2
-rw-r--r--Documentation/filesystems/ext4/blocks.rst2
-rw-r--r--Documentation/filesystems/ext4/checksums.rst26
-rw-r--r--Documentation/filesystems/ext4/directory.rst166
-rw-r--r--Documentation/filesystems/ext4/eainode.rst10
-rw-r--r--Documentation/filesystems/ext4/group_descr.rst126
-rw-r--r--Documentation/filesystems/ext4/ifork.rst60
-rw-r--r--Documentation/filesystems/ext4/inlinedata.rst8
-rw-r--r--Documentation/filesystems/ext4/inodes.rst306
-rw-r--r--Documentation/filesystems/ext4/journal.rst214
-rw-r--r--Documentation/filesystems/ext4/mmp.rst36
-rw-r--r--Documentation/filesystems/ext4/orphan.rst44
-rw-r--r--Documentation/filesystems/ext4/overview.rst2
-rw-r--r--Documentation/filesystems/ext4/special_inodes.rst8
-rw-r--r--Documentation/filesystems/ext4/super.rst556
-rw-r--r--Documentation/filesystems/f2fs.rst130
-rw-r--r--Documentation/filesystems/fscrypt.rst143
-rw-r--r--Documentation/filesystems/fsverity.rst180
-rw-r--r--Documentation/filesystems/fuse.rst29
-rw-r--r--Documentation/filesystems/idmappings.rst257
-rw-r--r--Documentation/filesystems/index.rst2
-rw-r--r--Documentation/filesystems/locking.rst163
-rw-r--r--Documentation/filesystems/locks.rst17
-rw-r--r--Documentation/filesystems/mount_api.rst13
-rw-r--r--Documentation/filesystems/netfs_library.rst266
-rw-r--r--Documentation/filesystems/nfs/client-identifier.rst216
-rw-r--r--Documentation/filesystems/nfs/index.rst3
-rw-r--r--Documentation/filesystems/nfs/reexport.rst113
-rw-r--r--Documentation/filesystems/ntfs3.rst30
-rw-r--r--Documentation/filesystems/overlayfs.rst2
-rw-r--r--Documentation/filesystems/porting.rst41
-rw-r--r--Documentation/filesystems/proc.rst370
-rw-r--r--Documentation/filesystems/qnx6.rst2
-rw-r--r--Documentation/filesystems/spufs/spufs.rst2
-rw-r--r--Documentation/filesystems/sysfs.rst47
-rw-r--r--Documentation/filesystems/tmpfs.rst66
-rw-r--r--Documentation/filesystems/ubifs.rst2
-rw-r--r--Documentation/filesystems/vfs.rst368
-rw-r--r--Documentation/filesystems/xfs-delayed-logging-design.rst371
-rw-r--r--Documentation/filesystems/xfs-online-fsck-design.rst5315
-rw-r--r--Documentation/filesystems/xfs-self-describing-metadata.rst1
-rw-r--r--Documentation/filesystems/zonefs.rst52
-rw-r--r--Documentation/firmware-guide/acpi/DSD-properties-rules.rst11
-rw-r--r--Documentation/firmware-guide/acpi/acpi-lid.rst2
-rw-r--r--Documentation/firmware-guide/acpi/apei/einj.rst21
-rw-r--r--Documentation/firmware-guide/acpi/chromeos-acpi-device.rst363
-rw-r--r--Documentation/firmware-guide/acpi/dsd/data-node-references.rst28
-rw-r--r--Documentation/firmware-guide/acpi/dsd/graph.rst40
-rw-r--r--Documentation/firmware-guide/acpi/dsd/leds.rst40
-rw-r--r--Documentation/firmware-guide/acpi/dsd/phy.rst28
-rw-r--r--Documentation/firmware-guide/acpi/enumeration.rst188
-rw-r--r--Documentation/firmware-guide/acpi/gpio-properties.rst61
-rw-r--r--Documentation/firmware-guide/acpi/index.rst2
-rw-r--r--Documentation/firmware-guide/acpi/namespace.rst2
-rw-r--r--Documentation/firmware-guide/acpi/non-d0-probe.rst78
-rw-r--r--Documentation/firmware-guide/acpi/osi.rst27
-rw-r--r--Documentation/fpga/dfl.rst124
-rw-r--r--Documentation/gpu/amdgpu-dc.rst74
-rw-r--r--Documentation/gpu/amdgpu.rst324
-rw-r--r--Documentation/gpu/amdgpu/amdgpu-glossary.rst123
-rw-r--r--Documentation/gpu/amdgpu/apu-asic-info-table.csv10
-rw-r--r--Documentation/gpu/amdgpu/dgpu-asic-info-table.csv26
-rw-r--r--Documentation/gpu/amdgpu/display/config_example.svg414
-rw-r--r--Documentation/gpu/amdgpu/display/dc-debug.rst77
-rw-r--r--Documentation/gpu/amdgpu/display/dc-glossary.rst237
-rw-r--r--Documentation/gpu/amdgpu/display/dc_pipeline_overview.svg1125
-rw-r--r--Documentation/gpu/amdgpu/display/dcn-overview.rst230
-rw-r--r--Documentation/gpu/amdgpu/display/dcn2_cm_drm_current.svg1370
-rw-r--r--Documentation/gpu/amdgpu/display/dcn3_cm_drm_current.svg1529
-rw-r--r--Documentation/gpu/amdgpu/display/display-manager.rst183
-rw-r--r--Documentation/gpu/amdgpu/display/global_sync_vblank.svg485
-rw-r--r--Documentation/gpu/amdgpu/display/index.rst32
-rw-r--r--Documentation/gpu/amdgpu/display/mpo-cursor.svg435
-rw-r--r--Documentation/gpu/amdgpu/display/mpo-overview.rst242
-rw-r--r--Documentation/gpu/amdgpu/display/multi-display-hdcp-mpo-less-pipe-ex.svg220
-rw-r--r--Documentation/gpu/amdgpu/display/multi-display-hdcp-mpo.svg171
-rw-r--r--Documentation/gpu/amdgpu/display/pipeline_4k_no_split.svg958
-rw-r--r--Documentation/gpu/amdgpu/display/pipeline_4k_split.svg1062
-rw-r--r--Documentation/gpu/amdgpu/display/single-display-mpo-multi-video.svg339
-rw-r--r--Documentation/gpu/amdgpu/display/single-display-mpo.svg266
-rw-r--r--Documentation/gpu/amdgpu/driver-core.rst182
-rw-r--r--Documentation/gpu/amdgpu/driver-misc.rst129
-rw-r--r--Documentation/gpu/amdgpu/index.rst17
-rw-r--r--Documentation/gpu/amdgpu/module-parameters.rst7
-rw-r--r--Documentation/gpu/amdgpu/ras.rst62
-rw-r--r--Documentation/gpu/amdgpu/thermal.rst122
-rw-r--r--Documentation/gpu/amdgpu/xgmi.rst5
-rw-r--r--Documentation/gpu/drivers.rst3
-rw-r--r--Documentation/gpu/drm-internals.rst38
-rw-r--r--Documentation/gpu/drm-kms-helpers.rst88
-rw-r--r--Documentation/gpu/drm-kms.rst16
-rw-r--r--Documentation/gpu/drm-mm.rst111
-rw-r--r--Documentation/gpu/drm-uapi.rst16
-rw-r--r--Documentation/gpu/drm-usage-stats.rst132
-rw-r--r--Documentation/gpu/i915.rst74
-rw-r--r--Documentation/gpu/index.rst7
-rw-r--r--Documentation/gpu/introduction.rst60
-rw-r--r--Documentation/gpu/rfc/i915_parallel_execbuf.h122
-rw-r--r--Documentation/gpu/rfc/i915_scheduler.rst4
-rw-r--r--Documentation/gpu/rfc/i915_small_bar.h189
-rw-r--r--Documentation/gpu/rfc/i915_small_bar.rst47
-rw-r--r--Documentation/gpu/rfc/i915_vm_bind.h291
-rw-r--r--Documentation/gpu/rfc/i915_vm_bind.rst245
-rw-r--r--Documentation/gpu/rfc/index.rst8
-rw-r--r--Documentation/gpu/todo.rst266
-rw-r--r--Documentation/gpu/vc4.rst19
-rw-r--r--Documentation/gpu/vgaarbiter.rst2
-rw-r--r--Documentation/gpu/vkms.rst15
-rw-r--r--Documentation/hid/hid-alps.rst2
-rw-r--r--Documentation/hid/hid-bpf.rst522
-rw-r--r--Documentation/hid/hiddev.rst2
-rw-r--r--Documentation/hid/hidraw.rst2
-rw-r--r--Documentation/hid/index.rst1
-rw-r--r--Documentation/hid/intel-ish-hid.rst6
-rw-r--r--Documentation/hwmon/acbel-fsg032.rst80
-rw-r--r--Documentation/hwmon/aht10.rst2
-rw-r--r--Documentation/hwmon/aquacomputer_d5next.rst89
-rw-r--r--Documentation/hwmon/aspeed-pwm-tacho.rst2
-rw-r--r--Documentation/hwmon/asus_ec_sensors.rst66
-rw-r--r--Documentation/hwmon/asus_wmi_sensors.rst78
-rw-r--r--Documentation/hwmon/corsair-psu.rst11
-rw-r--r--Documentation/hwmon/dell-smm-hwmon.rst195
-rw-r--r--Documentation/hwmon/emc2305.rst37
-rw-r--r--Documentation/hwmon/ftsteutates.rst15
-rw-r--r--Documentation/hwmon/gsc-hwmon.rst6
-rw-r--r--Documentation/hwmon/gxp-fan-ctrl.rst28
-rw-r--r--Documentation/hwmon/hwmon-kernel-api.rst84
-rw-r--r--Documentation/hwmon/ina238.rst56
-rw-r--r--Documentation/hwmon/index.rst31
-rw-r--r--Documentation/hwmon/ir38064.rst28
-rw-r--r--Documentation/hwmon/it87.rst47
-rw-r--r--Documentation/hwmon/lan966x.rst40
-rw-r--r--Documentation/hwmon/lm25066.rst2
-rw-r--r--Documentation/hwmon/lm70.rst7
-rw-r--r--Documentation/hwmon/lm90.rst243
-rw-r--r--Documentation/hwmon/lt7182s.rst92
-rw-r--r--Documentation/hwmon/ltc2978.rst2
-rw-r--r--Documentation/hwmon/max16601.rst19
-rw-r--r--Documentation/hwmon/max31760.rst77
-rw-r--r--Documentation/hwmon/max31790.rst1
-rw-r--r--Documentation/hwmon/max6620.rst46
-rw-r--r--Documentation/hwmon/max6639.rst2
-rw-r--r--Documentation/hwmon/max6697.rst2
-rw-r--r--Documentation/hwmon/mc34vr500.rst32
-rw-r--r--Documentation/hwmon/menf21bmc.rst2
-rw-r--r--Documentation/hwmon/mp5023.rst84
-rw-r--r--Documentation/hwmon/nzxt-smart2.rst62
-rw-r--r--Documentation/hwmon/oxp-sensors.rst49
-rw-r--r--Documentation/hwmon/peci-cputemp.rst90
-rw-r--r--Documentation/hwmon/peci-dimmtemp.rst57
-rw-r--r--Documentation/hwmon/pli1209bc.rst75
-rw-r--r--Documentation/hwmon/pmbus-core.rst11
-rw-r--r--Documentation/hwmon/pwm-fan.rst12
-rw-r--r--Documentation/hwmon/sch5627.rst4
-rw-r--r--Documentation/hwmon/sfctemp.rst33
-rw-r--r--Documentation/hwmon/sht4x.rst2
-rw-r--r--Documentation/hwmon/smm665.rst2
-rw-r--r--Documentation/hwmon/smpro-hwmon.rst102
-rw-r--r--Documentation/hwmon/stpddc60.rst2
-rw-r--r--Documentation/hwmon/submitting-patches.rst3
-rw-r--r--Documentation/hwmon/sy7636a-hwmon.rst26
-rw-r--r--Documentation/hwmon/sysfs-interface.rst600
-rw-r--r--Documentation/hwmon/tmp401.rst15
-rw-r--r--Documentation/hwmon/tmp421.rst10
-rw-r--r--Documentation/hwmon/tmp464.rst73
-rw-r--r--Documentation/hwmon/tps546d24.rst35
-rw-r--r--Documentation/hwmon/vexpress.rst2
-rw-r--r--Documentation/hwmon/via686a.rst2
-rw-r--r--Documentation/hwmon/xdpe12284.rst12
-rw-r--r--Documentation/hwmon/xdpe152c4.rst118
-rw-r--r--Documentation/i2c/busses/i2c-i801.rst2
-rw-r--r--Documentation/i2c/busses/i2c-piix4.rst13
-rw-r--r--Documentation/i2c/dev-interface.rst2
-rw-r--r--Documentation/i2c/gpio-fault-injection.rst2
-rw-r--r--Documentation/i2c/i2c-protocol.rst11
-rw-r--r--Documentation/i2c/i2c-sysfs.rst24
-rw-r--r--Documentation/i2c/i2c-topology.rst214
-rw-r--r--Documentation/i2c/instantiating-devices.rst16
-rw-r--r--Documentation/i2c/slave-interface.rst15
-rw-r--r--Documentation/i2c/smbus-protocol.rst22
-rw-r--r--Documentation/i2c/summary.rst8
-rw-r--r--Documentation/i2c/writing-clients.rst19
-rw-r--r--Documentation/ide/ChangeLog.ide-cd.1994-2004268
-rw-r--r--Documentation/ide/ChangeLog.ide-floppy.1996-200263
-rw-r--r--Documentation/ide/ChangeLog.ide-tape.1995-2002257
-rw-r--r--Documentation/ide/changelogs.rst17
-rw-r--r--Documentation/ide/ide-tape.rst68
-rw-r--r--Documentation/ide/ide.rst265
-rw-r--r--Documentation/ide/index.rst21
-rw-r--r--Documentation/ide/warm-plug-howto.rst18
-rw-r--r--Documentation/iio/bno055.rst51
-rw-r--r--Documentation/iio/index.rst2
-rw-r--r--Documentation/images/COPYING-logo21
-rw-r--r--Documentation/images/logo.gif (renamed from Documentation/logo.gif)bin16335 -> 16335 bytes
-rw-r--r--Documentation/images/logo.svg2040
-rw-r--r--Documentation/index.rst166
-rw-r--r--Documentation/input/devices/atarikbd.rst4
-rw-r--r--Documentation/input/devices/ntrig.rst2
-rw-r--r--Documentation/input/event-codes.rst12
-rw-r--r--Documentation/input/gamepad.rst6
-rw-r--r--Documentation/input/index.rst6
-rw-r--r--Documentation/input/input-programming.rst6
-rw-r--r--Documentation/input/joydev/joystick.rst1
-rw-r--r--Documentation/isdn/interface_capi.rst2
-rw-r--r--Documentation/isdn/m_isdn.rst2
-rw-r--r--Documentation/kbuild/Kconfig.recursion-issue-022
-rw-r--r--Documentation/kbuild/gcc-plugins.rst47
-rw-r--r--Documentation/kbuild/kbuild.rst37
-rw-r--r--Documentation/kbuild/kconfig-language.rst14
-rw-r--r--Documentation/kbuild/llvm.rst60
-rw-r--r--Documentation/kbuild/makefiles.rst2201
-rw-r--r--Documentation/kbuild/reproducible-builds.rst18
-rw-r--r--Documentation/kernel-hacking/false-sharing.rst206
-rw-r--r--Documentation/kernel-hacking/hacking.rst41
-rw-r--r--Documentation/kernel-hacking/index.rst1
-rw-r--r--Documentation/kernel-hacking/locking.rst42
-rw-r--r--Documentation/leds/index.rst2
-rw-r--r--Documentation/leds/leds-mt6370-rgb.rst64
-rw-r--r--Documentation/leds/leds-qcom-lpg.rst78
-rw-r--r--Documentation/leds/ledtrig-oneshot.rst2
-rw-r--r--Documentation/leds/well-known-leds.txt44
-rw-r--r--Documentation/litmus-tests/README2
-rw-r--r--Documentation/litmus-tests/locking/DCL-broken.litmus54
-rw-r--r--Documentation/litmus-tests/locking/DCL-fixed.litmus55
-rw-r--r--Documentation/litmus-tests/locking/RM-broken.litmus41
-rw-r--r--Documentation/litmus-tests/locking/RM-fixed.litmus41
-rw-r--r--Documentation/livepatch/api.rst30
-rw-r--r--Documentation/livepatch/index.rst1
-rw-r--r--Documentation/livepatch/module-elf-format.rst41
-rw-r--r--Documentation/livepatch/reliable-stacktrace.rst2
-rw-r--r--Documentation/livepatch/shadow-vars.rst4
-rw-r--r--Documentation/livepatch/system-state.rst4
-rw-r--r--Documentation/locking/locktorture.rst4
-rw-r--r--Documentation/locking/locktypes.rst14
-rw-r--r--Documentation/locking/seqlock.rst2
-rw-r--r--Documentation/locking/ww-mutex-design.rst2
-rw-r--r--Documentation/loongarch/booting.rst42
-rw-r--r--Documentation/loongarch/features.rst3
-rw-r--r--Documentation/loongarch/index.rst22
-rw-r--r--Documentation/loongarch/introduction.rst390
-rw-r--r--Documentation/loongarch/irq-chip-model.rst160
-rw-r--r--Documentation/maintainer/index.rst1
-rw-r--r--Documentation/maintainer/maintainer-entry-profile.rst2
-rw-r--r--Documentation/maintainer/messy-diffstat.rst96
-rw-r--r--Documentation/maintainer/pull-requests.rst2
-rw-r--r--Documentation/maintainer/rebasing-and-merging.rst6
-rw-r--r--Documentation/memory-barriers.txt208
-rw-r--r--Documentation/misc-devices/index.rst1
-rw-r--r--Documentation/misc-devices/oxsemi-tornado.rst131
-rw-r--r--Documentation/mm/active_mm.rst95
-rw-r--r--Documentation/mm/arch_pgtable_helpers.rst (renamed from Documentation/vm/arch_pgtable_helpers.rst)36
-rw-r--r--Documentation/mm/balance.rst100
-rw-r--r--Documentation/mm/bootmem.rst5
-rw-r--r--Documentation/mm/damon/api.rst (renamed from Documentation/vm/damon/api.rst)0
-rw-r--r--Documentation/mm/damon/design.rst176
-rw-r--r--Documentation/mm/damon/faq.rst50
-rw-r--r--Documentation/mm/damon/index.rst35
-rw-r--r--Documentation/mm/damon/maintainer-profile.rst62
-rw-r--r--Documentation/mm/free_page_reporting.rst38
-rw-r--r--Documentation/mm/frontswap.rst264
-rw-r--r--Documentation/mm/highmem.rst209
-rw-r--r--Documentation/mm/hmm.rst450
-rw-r--r--Documentation/mm/hugetlbfs_reserv.rst595
-rw-r--r--Documentation/mm/hwpoison.rst182
-rw-r--r--Documentation/mm/index.rst69
-rw-r--r--Documentation/mm/ksm.rst85
-rw-r--r--Documentation/mm/memory-model.rst175
-rw-r--r--Documentation/mm/mmu_notifier.rst97
-rw-r--r--Documentation/mm/multigen_lru.rst269
-rw-r--r--Documentation/mm/numa.rst148
-rw-r--r--Documentation/mm/oom.rst5
-rw-r--r--Documentation/mm/overcommit-accounting.rst86
-rw-r--r--Documentation/mm/page_allocation.rst5
-rw-r--r--Documentation/mm/page_cache.rst5
-rw-r--r--Documentation/mm/page_frags.rst43
-rw-r--r--Documentation/mm/page_migration.rst193
-rw-r--r--Documentation/mm/page_owner.rst187
-rw-r--r--Documentation/mm/page_reclaim.rst5
-rw-r--r--Documentation/mm/page_table_check.rst54
-rw-r--r--Documentation/mm/page_tables.rst5
-rw-r--r--Documentation/mm/physical_memory.rst371
-rw-r--r--Documentation/mm/process_addrs.rst5
-rw-r--r--Documentation/mm/remap_file_pages.rst31
-rw-r--r--Documentation/mm/shmfs.rst5
-rw-r--r--Documentation/mm/slab.rst5
-rw-r--r--Documentation/mm/slub.rst (renamed from Documentation/vm/slub.rst)79
-rw-r--r--Documentation/mm/split_page_table_lock.rst98
-rw-r--r--Documentation/mm/swap.rst5
-rw-r--r--Documentation/mm/transhuge.rst (renamed from Documentation/vm/transhuge.rst)42
-rw-r--r--Documentation/mm/unevictable-lru.rst559
-rw-r--r--Documentation/mm/vmalloc.rst5
-rw-r--r--Documentation/mm/vmalloced-kernel-stacks.rst153
-rw-r--r--Documentation/mm/vmemmap_dedup.rst249
-rw-r--r--Documentation/mm/z3fold.rst28
-rw-r--r--Documentation/mm/zsmalloc.rst265
-rw-r--r--Documentation/netlink/genetlink-c.yaml331
-rw-r--r--Documentation/netlink/genetlink-legacy.yaml377
-rw-r--r--Documentation/netlink/genetlink.yaml299
-rw-r--r--Documentation/netlink/specs/devlink.yaml198
-rw-r--r--Documentation/netlink/specs/ethtool.yaml1646
-rw-r--r--Documentation/netlink/specs/fou.yaml132
-rw-r--r--Documentation/netlink/specs/handshake.yaml124
-rw-r--r--Documentation/netlink/specs/netdev.yaml101
-rw-r--r--Documentation/netlink/specs/ovs_datapath.yaml153
-rw-r--r--Documentation/netlink/specs/ovs_vport.yaml139
-rw-r--r--Documentation/networking/af_xdp.rst4
-rw-r--r--Documentation/networking/arcnet-hardware.rst2
-rw-r--r--Documentation/networking/batman-adv.rst2
-rw-r--r--Documentation/networking/bonding.rst72
-rw-r--r--Documentation/networking/bridge.rst2
-rw-r--r--Documentation/networking/can.rst37
-rw-r--r--Documentation/networking/can_ucan_protocol.rst2
-rw-r--r--Documentation/networking/cdc_mbim.rst2
-rw-r--r--Documentation/networking/decnet.rst243
-rw-r--r--Documentation/networking/device_drivers/appletalk/index.rst1
-rw-r--r--Documentation/networking/device_drivers/appletalk/ltpc.rst144
-rw-r--r--Documentation/networking/device_drivers/atm/iphase.rst2
-rw-r--r--Documentation/networking/device_drivers/can/can327.rst331
-rw-r--r--Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst638
-rw-r--r--Documentation/networking/device_drivers/can/ctu/fsm_txt_buffer_user.svg151
-rw-r--r--Documentation/networking/device_drivers/can/freescale/flexcan.rst54
-rw-r--r--Documentation/networking/device_drivers/can/index.rst22
-rw-r--r--Documentation/networking/device_drivers/ethernet/3com/vortex.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/amazon/ena.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/amd/pds_core.rst139
-rw-r--r--Documentation/networking/device_drivers/ethernet/aquantia/atlantic.rst6
-rw-r--r--Documentation/networking/device_drivers/ethernet/dec/de4x5.rst189
-rw-r--r--Documentation/networking/device_drivers/ethernet/freescale/dpaa2/mac-phy-support.rst11
-rw-r--r--Documentation/networking/device_drivers/ethernet/freescale/dpaa2/overview.rst1
-rw-r--r--Documentation/networking/device_drivers/ethernet/index.rst9
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/e100.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/e1000.rst9
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/e1000e.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/fm10k.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/i40e.rst11
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/iavf.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/ice.rst20
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/igb.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/igbvf.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/ixgb.rst468
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/ixgbe.rst23
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/ixgbevf.rst7
-rw-r--r--Documentation/networking/device_drivers/ethernet/marvell/octeon_ep.rst36
-rw-r--r--Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst702
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst1276
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst292
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/index.rst26
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/kconfig.rst168
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst239
-rw-r--r--Documentation/networking/device_drivers/ethernet/mellanox/mlx5/tracepoints.rst229
-rw-r--r--Documentation/networking/device_drivers/ethernet/neterion/vxge.rst115
-rw-r--r--Documentation/networking/device_drivers/ethernet/netronome/nfp.rst165
-rw-r--r--Documentation/networking/device_drivers/ethernet/pensando/ionic.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/ti/am65_nuss_cpsw_switchdev.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/ti/cpsw_switchdev.rst2
-rw-r--r--Documentation/networking/device_drivers/ethernet/wangxun/ngbe.rst14
-rw-r--r--Documentation/networking/device_drivers/ethernet/wangxun/txgbe.rst20
-rw-r--r--Documentation/networking/device_drivers/index.rst2
-rw-r--r--Documentation/networking/device_drivers/wan/index.rst18
-rw-r--r--Documentation/networking/device_drivers/wan/z8530book.rst256
-rw-r--r--Documentation/networking/device_drivers/wwan/index.rst1
-rw-r--r--Documentation/networking/device_drivers/wwan/iosm.rst2
-rw-r--r--Documentation/networking/device_drivers/wwan/t7xx.rst120
-rw-r--r--Documentation/networking/devlink/bnxt.rst2
-rw-r--r--Documentation/networking/devlink/devlink-health.rst23
-rw-r--r--Documentation/networking/devlink/devlink-info.rst5
-rw-r--r--Documentation/networking/devlink/devlink-linecard.rst122
-rw-r--r--Documentation/networking/devlink/devlink-params.rst15
-rw-r--r--Documentation/networking/devlink/devlink-port.rst168
-rw-r--r--Documentation/networking/devlink/devlink-region.rst17
-rw-r--r--Documentation/networking/devlink/devlink-selftests.rst38
-rw-r--r--Documentation/networking/devlink/devlink-trap.rst13
-rw-r--r--Documentation/networking/devlink/etas_es58x.rst36
-rw-r--r--Documentation/networking/devlink/ice.rst216
-rw-r--r--Documentation/networking/devlink/index.rst20
-rw-r--r--Documentation/networking/devlink/iosm.rst162
-rw-r--r--Documentation/networking/devlink/mlx5.rst41
-rw-r--r--Documentation/networking/devlink/mlxsw.rst24
-rw-r--r--Documentation/networking/devlink/netdevsim.rst4
-rw-r--r--Documentation/networking/devlink/octeontx2.rst42
-rw-r--r--Documentation/networking/devlink/prestera.rst2
-rw-r--r--Documentation/networking/devlink/sfc.rst57
-rw-r--r--Documentation/networking/driver.rst156
-rw-r--r--Documentation/networking/dsa/configuration.rst98
-rw-r--r--Documentation/networking/dsa/dsa.rst480
-rw-r--r--Documentation/networking/dsa/sja1105.rst27
-rw-r--r--Documentation/networking/ethtool-netlink.rst518
-rw-r--r--Documentation/networking/filter.rst1038
-rw-r--r--Documentation/networking/generic_netlink.rst2
-rw-r--r--Documentation/networking/gtp.rst2
-rw-r--r--Documentation/networking/ieee802154.rst2
-rw-r--r--Documentation/networking/index.rst17
-rw-r--r--Documentation/networking/ip-sysctl.rst343
-rw-r--r--Documentation/networking/ipvlan.rst4
-rw-r--r--Documentation/networking/ipvs-sysctl.rst34
-rw-r--r--Documentation/networking/j1939.rst2
-rw-r--r--Documentation/networking/l2tp.rst2
-rw-r--r--Documentation/networking/mctp.rst117
-rw-r--r--Documentation/networking/mptcp-sysctl.rst17
-rw-r--r--Documentation/networking/msg_zerocopy.rst8
-rw-r--r--Documentation/networking/napi.rst254
-rw-r--r--Documentation/networking/net_failover.rst111
-rw-r--r--Documentation/networking/netconsole.rst2
-rw-r--r--Documentation/networking/netdev-FAQ.rst263
-rw-r--r--Documentation/networking/nf_conntrack-sysctl.rst43
-rw-r--r--Documentation/networking/page_pool.rst63
-rw-r--r--Documentation/networking/phonet.rst2
-rw-r--r--Documentation/networking/phy.rst24
-rw-r--r--Documentation/networking/rds.rst2
-rw-r--r--Documentation/networking/regulatory.rst4
-rw-r--r--Documentation/networking/representors.rst259
-rw-r--r--Documentation/networking/rxrpc.rst34
-rw-r--r--Documentation/networking/sfp-phylink.rst6
-rw-r--r--Documentation/networking/skbuff.rst37
-rw-r--r--Documentation/networking/smc-sysctl.rst61
-rw-r--r--Documentation/networking/snmp_counter.rst4
-rw-r--r--Documentation/networking/statistics.rst1
-rw-r--r--Documentation/networking/switchdev.rst3
-rw-r--r--Documentation/networking/sysfs-tagging.rst2
-rw-r--r--Documentation/networking/tc-queue-filters.rst37
-rw-r--r--Documentation/networking/timestamping.rst42
-rw-r--r--Documentation/networking/tls-handshake.rst217
-rw-r--r--Documentation/networking/tls.rst47
-rw-r--r--Documentation/networking/x25-iface.rst3
-rw-r--r--Documentation/networking/xdp-rx-metadata.rst113
-rw-r--r--Documentation/networking/xfrm_device.rst64
-rw-r--r--Documentation/nvme/feature-and-quirk-policy.rst77
-rw-r--r--Documentation/peci/index.rst16
-rw-r--r--Documentation/peci/peci.rst51
-rw-r--r--Documentation/power/energy-model.rst99
-rw-r--r--Documentation/power/opp.rst14
-rw-r--r--Documentation/power/pci.rst2
-rw-r--r--Documentation/power/power_supply_class.rst4
-rw-r--r--Documentation/power/regulator/consumer.rst2
-rw-r--r--Documentation/power/runtime_pm.rst14
-rw-r--r--Documentation/power/suspend-and-interrupts.rst2
-rw-r--r--Documentation/powerpc/cpu_families.rst13
-rw-r--r--Documentation/powerpc/dawr-power9.rst26
-rw-r--r--Documentation/powerpc/elf_hwcaps.rst231
-rw-r--r--Documentation/powerpc/index.rst1
-rw-r--r--Documentation/powerpc/isa-versions.rst22
-rw-r--r--Documentation/powerpc/kasan.txt58
-rw-r--r--Documentation/process/2.Process.rst15
-rw-r--r--Documentation/process/3.Early-stage.rst9
-rw-r--r--Documentation/process/5.Posting.rst53
-rw-r--r--Documentation/process/8.Conclusion.rst16
-rw-r--r--Documentation/process/applying-patches.rst28
-rw-r--r--Documentation/process/botching-up-ioctls.rst2
-rw-r--r--Documentation/process/changes.rst84
-rw-r--r--Documentation/process/code-of-conduct-interpretation.rst26
-rw-r--r--Documentation/process/coding-style.rst103
-rw-r--r--Documentation/process/contribution-maturity-model.rst109
-rw-r--r--Documentation/process/deprecated.rst62
-rw-r--r--Documentation/process/email-clients.rst89
-rw-r--r--Documentation/process/embargoed-hardware-issues.rst13
-rw-r--r--Documentation/process/handling-regressions.rst746
-rw-r--r--Documentation/process/howto.rst10
-rw-r--r--Documentation/process/index.rst15
-rw-r--r--Documentation/process/kernel-docs.rst503
-rw-r--r--Documentation/process/magic-number.rst70
-rw-r--r--Documentation/process/maintainer-handbooks.rst19
-rw-r--r--Documentation/process/maintainer-netdev.rst396
-rw-r--r--Documentation/process/maintainer-pgp-guide.rst374
-rw-r--r--Documentation/process/maintainer-tip.rst799
-rw-r--r--Documentation/process/programming-language.rst30
-rw-r--r--Documentation/process/researcher-guidelines.rst143
-rw-r--r--Documentation/process/security-bugs.rst (renamed from Documentation/admin-guide/security-bugs.rst)0
-rw-r--r--Documentation/process/stable-kernel-rules.rst40
-rw-r--r--Documentation/process/submitting-drivers.rst194
-rw-r--r--Documentation/process/submitting-patches.rst122
-rw-r--r--Documentation/riscv/hwprobe.rst86
-rw-r--r--Documentation/riscv/index.rst3
-rw-r--r--Documentation/riscv/patch-acceptance.rst22
-rw-r--r--Documentation/riscv/pmu.rst255
-rw-r--r--Documentation/riscv/uabi.rst48
-rw-r--r--Documentation/riscv/vm-layout.rst84
-rw-r--r--Documentation/rust/arch-support.rst21
-rw-r--r--Documentation/rust/coding-guidelines.rst216
-rw-r--r--Documentation/rust/general-information.rst79
-rw-r--r--Documentation/rust/index.rst22
-rw-r--r--Documentation/rust/quick-start.rst232
-rw-r--r--Documentation/s390/index.rst1
-rw-r--r--Documentation/s390/pci.rst4
-rw-r--r--Documentation/s390/vfio-ap-locking.rst115
-rw-r--r--Documentation/s390/vfio-ap.rst501
-rw-r--r--Documentation/s390/vfio-ccw.rst8
-rw-r--r--Documentation/scheduler/index.rst9
-rw-r--r--Documentation/scheduler/sched-arch.rst2
-rw-r--r--Documentation/scheduler/sched-bwc.rst85
-rw-r--r--Documentation/scheduler/sched-capacity.rst4
-rw-r--r--Documentation/scheduler/sched-debug.rst54
-rw-r--r--Documentation/scheduler/sched-design-CFS.rst2
-rw-r--r--Documentation/scheduler/sched-domains.rst8
-rw-r--r--Documentation/scheduler/sched-stats.rst8
-rw-r--r--Documentation/scheduler/sched-util-clamp.rst741
-rw-r--r--Documentation/scheduler/schedutil.rst173
-rw-r--r--Documentation/scheduler/schedutil.txt169
-rw-r--r--Documentation/scsi/ChangeLog.lpfc38
-rw-r--r--Documentation/scsi/ChangeLog.megaraid8
-rw-r--r--Documentation/scsi/ChangeLog.megaraid_sas4
-rw-r--r--Documentation/scsi/ChangeLog.ncr53c8xx16
-rw-r--r--Documentation/scsi/ChangeLog.sym53c8xx14
-rw-r--r--Documentation/scsi/ChangeLog.sym53c8xx_210
-rw-r--r--Documentation/scsi/index.rst6
-rw-r--r--Documentation/scsi/libsas.rst2
-rw-r--r--Documentation/scsi/ncr53c8xx.rst4
-rw-r--r--Documentation/scsi/scsi_eh.rst25
-rw-r--r--Documentation/scsi/scsi_mid_low_api.rst4
-rw-r--r--Documentation/scsi/sym53c8xx_2.rst2
-rw-r--r--Documentation/scsi/tcm_qla2xxx.rst2
-rw-r--r--Documentation/scsi/ufs.rst85
-rw-r--r--Documentation/security/IMA-templates.rst11
-rw-r--r--Documentation/security/SCTP.rst69
-rw-r--r--Documentation/security/index.rst1
-rw-r--r--Documentation/security/keys/core.rst2
-rw-r--r--Documentation/security/keys/trusted-encrypted.rst86
-rw-r--r--Documentation/security/landlock.rst45
-rw-r--r--Documentation/security/lsm-development.rst6
-rw-r--r--Documentation/security/lsm.rst2
-rw-r--r--Documentation/security/secrets/coco.rst103
-rw-r--r--Documentation/security/secrets/index.rst9
-rw-r--r--Documentation/security/self-protection.rst3
-rw-r--r--Documentation/security/siphash.rst44
-rw-r--r--Documentation/sound/alsa-configuration.rst39
-rw-r--r--Documentation/sound/cards/audigy-mixer.rst29
-rw-r--r--Documentation/sound/cards/maya44.rst2
-rw-r--r--Documentation/sound/cards/sb-live-mixer.rst19
-rw-r--r--Documentation/sound/designs/control-names.rst2
-rw-r--r--Documentation/sound/designs/jack-controls.rst2
-rw-r--r--Documentation/sound/designs/seq-oss.rst2
-rw-r--r--Documentation/sound/hd-audio/index.rst1
-rw-r--r--Documentation/sound/hd-audio/intel-multi-link.rst312
-rw-r--r--Documentation/sound/hd-audio/models.rst8
-rw-r--r--Documentation/sound/hd-audio/notes.rst8
-rw-r--r--Documentation/sound/index.rst8
-rw-r--r--Documentation/sound/kernel-api/writing-an-alsa-driver.rst1121
-rw-r--r--Documentation/sound/soc/codec.rst10
-rw-r--r--Documentation/sound/soc/dai.rst2
-rw-r--r--Documentation/sound/soc/platform.rst2
-rw-r--r--Documentation/sphinx-static/custom.css75
-rw-r--r--Documentation/sphinx-static/theme_overrides.css16
-rw-r--r--Documentation/sphinx-static/theme_rtd_colors.css37
-rw-r--r--Documentation/sphinx/automarkup.py81
-rw-r--r--Documentation/sphinx/kernel_abi.py8
-rw-r--r--Documentation/sphinx/kernel_feat.py22
-rwxr-xr-xDocumentation/sphinx/kernel_include.py3
-rw-r--r--Documentation/sphinx/kerneldoc-preamble.sty236
-rw-r--r--Documentation/sphinx/kerneldoc.py2
-rw-r--r--Documentation/sphinx/kfigure.py140
-rw-r--r--Documentation/sphinx/load_config.py6
-rw-r--r--Documentation/sphinx/requirements.txt3
-rw-r--r--Documentation/sphinx/templates/kernel-toc.html16
-rw-r--r--Documentation/spi/pxa2xx.rst44
-rw-r--r--Documentation/spi/spi-lm70llp.rst2
-rw-r--r--Documentation/spi/spi-summary.rst33
-rw-r--r--Documentation/spi/spidev.rst58
-rw-r--r--Documentation/staging/index.rst42
-rw-r--r--Documentation/staging/remoteproc.rst3
-rw-r--r--Documentation/staging/static-keys.rst3
-rw-r--r--Documentation/staging/tee.rst87
-rw-r--r--Documentation/subsystem-apis.rst59
-rw-r--r--Documentation/target/tcmu-design.rst2
-rw-r--r--Documentation/timers/hrtimers.rst21
-rw-r--r--Documentation/timers/no_hz.rst18
-rw-r--r--Documentation/tools/index.rst21
-rw-r--r--Documentation/tools/rtla/Makefile53
-rw-r--r--Documentation/tools/rtla/common_appendix.rst13
-rw-r--r--Documentation/tools/rtla/common_hist_options.rst23
-rw-r--r--Documentation/tools/rtla/common_options.rst47
-rw-r--r--Documentation/tools/rtla/common_osnoise_description.rst8
-rw-r--r--Documentation/tools/rtla/common_osnoise_options.rst27
-rw-r--r--Documentation/tools/rtla/common_timerlat_aa.rst14
-rw-r--r--Documentation/tools/rtla/common_timerlat_description.rst10
-rw-r--r--Documentation/tools/rtla/common_timerlat_options.rst28
-rw-r--r--Documentation/tools/rtla/common_top_options.rst3
-rw-r--r--Documentation/tools/rtla/index.rst27
-rw-r--r--Documentation/tools/rtla/rtla-hwnoise.rst107
-rw-r--r--Documentation/tools/rtla/rtla-osnoise-hist.rst66
-rw-r--r--Documentation/tools/rtla/rtla-osnoise-top.rst61
-rw-r--r--Documentation/tools/rtla/rtla-osnoise.rst59
-rw-r--r--Documentation/tools/rtla/rtla-timerlat-hist.rst106
-rw-r--r--Documentation/tools/rtla/rtla-timerlat-top.rst127
-rw-r--r--Documentation/tools/rtla/rtla-timerlat.rst57
-rw-r--r--Documentation/tools/rtla/rtla.rst48
-rw-r--r--Documentation/tools/rv/Makefile52
-rw-r--r--Documentation/tools/rv/common_appendix.rst16
-rw-r--r--Documentation/tools/rv/common_ikm.rst21
-rw-r--r--Documentation/tools/rv/index.rst24
-rw-r--r--Documentation/tools/rv/rv-list.rst43
-rw-r--r--Documentation/tools/rv/rv-mon-wip.rst44
-rw-r--r--Documentation/tools/rv/rv-mon-wwnr.rst43
-rw-r--r--Documentation/tools/rv/rv-mon.rst55
-rw-r--r--Documentation/tools/rv/rv.rst63
-rw-r--r--Documentation/trace/boottime-trace.rst4
-rw-r--r--Documentation/trace/coresight/coresight-config.rst78
-rw-r--r--Documentation/trace/coresight/coresight-cpu-debug.rst3
-rw-r--r--Documentation/trace/coresight/coresight-etm4x-reference.rst31
-rw-r--r--Documentation/trace/coresight/coresight-perf.rst158
-rw-r--r--Documentation/trace/coresight/coresight-tpda.rst52
-rw-r--r--Documentation/trace/coresight/coresight-tpdm.rst45
-rw-r--r--Documentation/trace/coresight/coresight.rst58
-rw-r--r--Documentation/trace/coresight/ultrasoc-smb.rst83
-rw-r--r--Documentation/trace/events-msr.rst4
-rw-r--r--Documentation/trace/events-nmi.rst6
-rw-r--r--Documentation/trace/events.rst109
-rw-r--r--Documentation/trace/fprobe.rst182
-rw-r--r--Documentation/trace/ftrace.rst66
-rw-r--r--Documentation/trace/hisi-ptt.rst298
-rw-r--r--Documentation/trace/histogram-design.rst12
-rw-r--r--Documentation/trace/histogram.rst445
-rw-r--r--Documentation/trace/index.rst4
-rw-r--r--Documentation/trace/kprobes.rst9
-rw-r--r--Documentation/trace/kprobetrace.rst67
-rw-r--r--Documentation/trace/mmiotrace.rst20
-rw-r--r--Documentation/trace/osnoise-tracer.rst36
-rw-r--r--Documentation/trace/postprocess/trace-pagealloc-postprocess.pl4
-rw-r--r--Documentation/trace/postprocess/trace-vmscan-postprocess.pl4
-rw-r--r--Documentation/trace/rv/da_monitor_instrumentation.rst171
-rw-r--r--Documentation/trace/rv/da_monitor_synthesis.rst147
-rw-r--r--Documentation/trace/rv/deterministic_automata.rst184
-rw-r--r--Documentation/trace/rv/index.rst14
-rw-r--r--Documentation/trace/rv/monitor_wip.rst55
-rw-r--r--Documentation/trace/rv/monitor_wwnr.rst45
-rw-r--r--Documentation/trace/rv/runtime-verification.rst231
-rw-r--r--Documentation/trace/timerlat-tracer.rst27
-rw-r--r--Documentation/trace/tracepoint-analysis.rst8
-rw-r--r--Documentation/trace/uprobetracer.rst30
-rw-r--r--Documentation/trace/user_events.rst265
-rw-r--r--Documentation/translations/conf.py12
-rw-r--r--Documentation/translations/index.rst1
-rw-r--r--Documentation/translations/it_IT/admin-guide/README.rst2
-rw-r--r--Documentation/translations/it_IT/admin-guide/security-bugs.rst2
-rw-r--r--Documentation/translations/it_IT/core-api/symbol-namespaces.rst9
-rw-r--r--Documentation/translations/it_IT/devicetree/bindings/submitting-patches.rst11
-rw-r--r--Documentation/translations/it_IT/doc-guide/kernel-doc.rst4
-rw-r--r--Documentation/translations/it_IT/doc-guide/parse-headers.rst5
-rw-r--r--Documentation/translations/it_IT/doc-guide/sphinx.rst55
-rw-r--r--Documentation/translations/it_IT/index.rst124
-rw-r--r--Documentation/translations/it_IT/kernel-hacking/hacking.rst29
-rw-r--r--Documentation/translations/it_IT/kernel-hacking/locking.rst51
-rw-r--r--Documentation/translations/it_IT/maintainer/configure-git.rst10
-rw-r--r--Documentation/translations/it_IT/networking/netdev-FAQ.rst2
-rw-r--r--Documentation/translations/it_IT/process/2.Process.rst15
-rw-r--r--Documentation/translations/it_IT/process/3.Early-stage.rst17
-rw-r--r--Documentation/translations/it_IT/process/5.Posting.rst45
-rw-r--r--Documentation/translations/it_IT/process/7.AdvancedTopics.rst8
-rw-r--r--Documentation/translations/it_IT/process/8.Conclusion.rst5
-rw-r--r--Documentation/translations/it_IT/process/botching-up-ioctls.rst249
-rw-r--r--Documentation/translations/it_IT/process/changes.rst42
-rw-r--r--Documentation/translations/it_IT/process/clang-format.rst2
-rw-r--r--Documentation/translations/it_IT/process/coding-style.rst48
-rw-r--r--Documentation/translations/it_IT/process/deprecated.rst53
-rw-r--r--Documentation/translations/it_IT/process/email-clients.rst94
-rw-r--r--Documentation/translations/it_IT/process/howto.rst7
-rw-r--r--Documentation/translations/it_IT/process/index.rst4
-rw-r--r--Documentation/translations/it_IT/process/kernel-docs.rst4
-rw-r--r--Documentation/translations/it_IT/process/magic-number.rst71
-rw-r--r--Documentation/translations/it_IT/process/maintainer-handbooks.rst24
-rw-r--r--Documentation/translations/it_IT/process/maintainer-pgp-guide.rst356
-rw-r--r--Documentation/translations/it_IT/process/maintainer-tip.rst10
-rw-r--r--Documentation/translations/it_IT/process/maintainers.rst13
-rw-r--r--Documentation/translations/it_IT/process/programming-language.rst29
-rw-r--r--Documentation/translations/it_IT/process/stable-kernel-rules.rst48
-rw-r--r--Documentation/translations/it_IT/process/submitting-drivers.rst16
-rw-r--r--Documentation/translations/it_IT/process/submitting-patches.rst105
-rw-r--r--Documentation/translations/it_IT/process/volatile-considered-harmful.rst4
-rw-r--r--Documentation/translations/ja_JP/SubmittingPatches56
-rw-r--r--Documentation/translations/ja_JP/howto.rst118
-rw-r--r--Documentation/translations/ja_JP/index.rst6
-rw-r--r--Documentation/translations/ko_KR/howto.rst8
-rw-r--r--Documentation/translations/ko_KR/index.rst5
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt157
-rw-r--r--Documentation/translations/sp_SP/disclaimer-sp.rst6
-rw-r--r--Documentation/translations/sp_SP/howto.rst617
-rw-r--r--Documentation/translations/sp_SP/index.rst81
-rw-r--r--Documentation/translations/sp_SP/memory-barriers.txt3134
-rw-r--r--Documentation/translations/sp_SP/process/adding-syscalls.rst632
-rw-r--r--Documentation/translations/sp_SP/process/code-of-conduct.rst97
-rw-r--r--Documentation/translations/sp_SP/process/coding-style.rst1315
-rw-r--r--Documentation/translations/sp_SP/process/deprecated.rst381
-rw-r--r--Documentation/translations/sp_SP/process/email-clients.rst374
-rw-r--r--Documentation/translations/sp_SP/process/index.rst22
-rw-r--r--Documentation/translations/sp_SP/process/kernel-docs.rst187
-rw-r--r--Documentation/translations/sp_SP/process/kernel-enforcement-statement.rst174
-rw-r--r--Documentation/translations/sp_SP/process/magic-number.rst89
-rw-r--r--Documentation/translations/sp_SP/process/programming-language.rst53
-rw-r--r--Documentation/translations/sp_SP/process/submitting-patches.rst894
-rw-r--r--Documentation/translations/sp_SP/wrappers/memory-barriers.rst19
-rw-r--r--Documentation/translations/zh_CN/IRQ.txt39
-rw-r--r--Documentation/translations/zh_CN/PCI/acpi-info.rst139
-rw-r--r--Documentation/translations/zh_CN/PCI/index.rst34
-rw-r--r--Documentation/translations/zh_CN/PCI/msi-howto.rst244
-rw-r--r--Documentation/translations/zh_CN/PCI/pci-iov-howto.rst169
-rw-r--r--Documentation/translations/zh_CN/PCI/pci.rst514
-rw-r--r--Documentation/translations/zh_CN/PCI/pciebus-howto.rst192
-rw-r--r--Documentation/translations/zh_CN/PCI/sysfs-pci.rst126
-rw-r--r--Documentation/translations/zh_CN/accounting/delay-accounting.rst112
-rw-r--r--Documentation/translations/zh_CN/accounting/index.rst4
-rw-r--r--Documentation/translations/zh_CN/accounting/taskstats.rst145
-rw-r--r--Documentation/translations/zh_CN/admin-guide/README.rst112
-rw-r--r--Documentation/translations/zh_CN/admin-guide/bootconfig.rst293
-rw-r--r--Documentation/translations/zh_CN/admin-guide/cputopology.rst96
-rw-r--r--Documentation/translations/zh_CN/admin-guide/index.rst128
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst29
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/lru_sort.rst263
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst228
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/start.rst124
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst591
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/index.rst49
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/ksm.rst198
-rw-r--r--Documentation/translations/zh_CN/admin-guide/reporting-issues.rst125
-rw-r--r--Documentation/translations/zh_CN/admin-guide/reporting-regressions.rst370
-rw-r--r--Documentation/translations/zh_CN/admin-guide/security-bugs.rst2
-rw-r--r--Documentation/translations/zh_CN/admin-guide/sysrq.rst280
-rw-r--r--Documentation/translations/zh_CN/arch/index.rst29
-rw-r--r--Documentation/translations/zh_CN/arch/openrisc/index.rst32
-rw-r--r--Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst (renamed from Documentation/translations/zh_CN/openrisc/openrisc_port.rst)4
-rw-r--r--Documentation/translations/zh_CN/arch/openrisc/todo.rst (renamed from Documentation/translations/zh_CN/openrisc/todo.rst)4
-rw-r--r--Documentation/translations/zh_CN/arch/parisc/debugging.rst (renamed from Documentation/translations/zh_CN/parisc/debugging.rst)4
-rw-r--r--Documentation/translations/zh_CN/arch/parisc/index.rst31
-rw-r--r--Documentation/translations/zh_CN/arch/parisc/registers.rst (renamed from Documentation/translations/zh_CN/parisc/registers.rst)4
-rw-r--r--Documentation/translations/zh_CN/core-api/assoc_array.rst473
-rw-r--r--Documentation/translations/zh_CN/core-api/boot-time-mm.rst49
-rw-r--r--Documentation/translations/zh_CN/core-api/cachetlb.rst6
-rw-r--r--Documentation/translations/zh_CN/core-api/circular-buffers.rst210
-rw-r--r--Documentation/translations/zh_CN/core-api/cpu_hotplug.rst435
-rw-r--r--Documentation/translations/zh_CN/core-api/errseq.rst145
-rw-r--r--Documentation/translations/zh_CN/core-api/genalloc.rst109
-rw-r--r--Documentation/translations/zh_CN/core-api/generic-radix-tree.rst23
-rw-r--r--Documentation/translations/zh_CN/core-api/gfp_mask-from-fs-io.rst66
-rw-r--r--Documentation/translations/zh_CN/core-api/idr.rst80
-rw-r--r--Documentation/translations/zh_CN/core-api/index.rst28
-rw-r--r--Documentation/translations/zh_CN/core-api/irq/irq-affinity.rst2
-rw-r--r--Documentation/translations/zh_CN/core-api/irq/irq-domain.rst22
-rw-r--r--Documentation/translations/zh_CN/core-api/kernel-api.rst19
-rw-r--r--Documentation/translations/zh_CN/core-api/kobject.rst12
-rw-r--r--Documentation/translations/zh_CN/core-api/kref.rst311
-rw-r--r--Documentation/translations/zh_CN/core-api/local_ops.rst2
-rw-r--r--Documentation/translations/zh_CN/core-api/memory-allocation.rst138
-rw-r--r--Documentation/translations/zh_CN/core-api/memory-hotplug.rst6
-rw-r--r--Documentation/translations/zh_CN/core-api/mm-api.rst131
-rw-r--r--Documentation/translations/zh_CN/core-api/packing.rst160
-rw-r--r--Documentation/translations/zh_CN/core-api/printk-basics.rst3
-rw-r--r--Documentation/translations/zh_CN/core-api/printk-formats.rst3
-rw-r--r--Documentation/translations/zh_CN/core-api/rbtree.rst391
-rw-r--r--Documentation/translations/zh_CN/core-api/symbol-namespaces.rst2
-rw-r--r--Documentation/translations/zh_CN/core-api/this_cpu_ops.rst285
-rw-r--r--Documentation/translations/zh_CN/core-api/unaligned-memory-access.rst229
-rw-r--r--Documentation/translations/zh_CN/core-api/watch_queue.rst313
-rw-r--r--Documentation/translations/zh_CN/core-api/workqueue.rst25
-rw-r--r--Documentation/translations/zh_CN/core-api/xarray.rst373
-rw-r--r--Documentation/translations/zh_CN/cpu-freq/core.rst24
-rw-r--r--Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst141
-rw-r--r--Documentation/translations/zh_CN/cpu-freq/cpufreq-stats.rst45
-rw-r--r--Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst167
-rw-r--r--Documentation/translations/zh_CN/dev-tools/index.rst4
-rw-r--r--Documentation/translations/zh_CN/dev-tools/kasan.rst163
-rw-r--r--Documentation/translations/zh_CN/dev-tools/sparse.rst110
-rw-r--r--Documentation/translations/zh_CN/dev-tools/testing-overview.rst52
-rw-r--r--Documentation/translations/zh_CN/devicetree/changesets.rst37
-rw-r--r--Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst31
-rw-r--r--Documentation/translations/zh_CN/devicetree/index.rst45
-rw-r--r--Documentation/translations/zh_CN/devicetree/kernel-api.rst58
-rw-r--r--Documentation/translations/zh_CN/devicetree/of_unittest.rst189
-rw-r--r--Documentation/translations/zh_CN/devicetree/overlay-notes.rst140
-rw-r--r--Documentation/translations/zh_CN/devicetree/usage-model.rst330
-rw-r--r--Documentation/translations/zh_CN/doc-guide/index.rst2
-rw-r--r--Documentation/translations/zh_CN/doc-guide/kernel-doc.rst2
-rw-r--r--Documentation/translations/zh_CN/doc-guide/sphinx.rst21
-rw-r--r--Documentation/translations/zh_CN/driver-api/gpio/index.rst69
-rw-r--r--Documentation/translations/zh_CN/driver-api/gpio/legacy.rst659
-rw-r--r--Documentation/translations/zh_CN/driver-api/index.rst132
-rw-r--r--Documentation/translations/zh_CN/driver-api/io_ordering.rst60
-rw-r--r--Documentation/translations/zh_CN/filesystems/sysfs.txt4
-rw-r--r--Documentation/translations/zh_CN/glossary.rst36
-rw-r--r--Documentation/translations/zh_CN/gpio.txt650
-rw-r--r--Documentation/translations/zh_CN/iio/iio_configfs.rst12
-rw-r--r--Documentation/translations/zh_CN/index.rst186
-rw-r--r--Documentation/translations/zh_CN/io_ordering.txt67
-rw-r--r--Documentation/translations/zh_CN/kernel-hacking/hacking.rst25
-rw-r--r--Documentation/translations/zh_CN/locking/index.rst43
-rw-r--r--Documentation/translations/zh_CN/locking/mutex-design.rst145
-rw-r--r--Documentation/translations/zh_CN/locking/spinlocks.rst149
-rw-r--r--Documentation/translations/zh_CN/loongarch/booting.rst48
-rw-r--r--Documentation/translations/zh_CN/loongarch/features.rst8
-rw-r--r--Documentation/translations/zh_CN/loongarch/index.rst27
-rw-r--r--Documentation/translations/zh_CN/loongarch/introduction.rst353
-rw-r--r--Documentation/translations/zh_CN/loongarch/irq-chip-model.rst157
-rw-r--r--Documentation/translations/zh_CN/maintainer/pull-requests.rst2
-rw-r--r--Documentation/translations/zh_CN/mm/active_mm.rst85
-rw-r--r--Documentation/translations/zh_CN/mm/balance.rst81
-rw-r--r--Documentation/translations/zh_CN/mm/damon/api.rst32
-rw-r--r--Documentation/translations/zh_CN/mm/damon/design.rst140
-rw-r--r--Documentation/translations/zh_CN/mm/damon/faq.rst48
-rw-r--r--Documentation/translations/zh_CN/mm/damon/index.rst32
-rw-r--r--Documentation/translations/zh_CN/mm/free_page_reporting.rst38
-rw-r--r--Documentation/translations/zh_CN/mm/frontswap.rst196
-rw-r--r--Documentation/translations/zh_CN/mm/highmem.rst151
-rw-r--r--Documentation/translations/zh_CN/mm/hmm.rst361
-rw-r--r--Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst437
-rw-r--r--Documentation/translations/zh_CN/mm/hwpoison.rst166
-rw-r--r--Documentation/translations/zh_CN/mm/index.rst69
-rw-r--r--Documentation/translations/zh_CN/mm/ksm.rst70
-rw-r--r--Documentation/translations/zh_CN/mm/memory-model.rst135
-rw-r--r--Documentation/translations/zh_CN/mm/mmu_notifier.rst97
-rw-r--r--Documentation/translations/zh_CN/mm/numa.rst101
-rw-r--r--Documentation/translations/zh_CN/mm/overcommit-accounting.rst86
-rw-r--r--Documentation/translations/zh_CN/mm/page_frags.rst38
-rw-r--r--Documentation/translations/zh_CN/mm/page_migration.rst228
-rw-r--r--Documentation/translations/zh_CN/mm/page_owner.rst170
-rw-r--r--Documentation/translations/zh_CN/mm/page_table_check.rst56
-rw-r--r--Documentation/translations/zh_CN/mm/remap_file_pages.rst32
-rw-r--r--Documentation/translations/zh_CN/mm/split_page_table_lock.rst96
-rw-r--r--Documentation/translations/zh_CN/mm/vmalloced-kernel-stacks.rst133
-rw-r--r--Documentation/translations/zh_CN/mm/z3fold.rst31
-rw-r--r--Documentation/translations/zh_CN/mm/zsmalloc.rst78
-rw-r--r--Documentation/translations/zh_CN/oops-tracing.txt212
-rw-r--r--Documentation/translations/zh_CN/openrisc/index.rst32
-rw-r--r--Documentation/translations/zh_CN/parisc/index.rst31
-rw-r--r--Documentation/translations/zh_CN/peci/index.rst26
-rw-r--r--Documentation/translations/zh_CN/peci/peci.rst54
-rw-r--r--Documentation/translations/zh_CN/power/energy-model.rst210
-rw-r--r--Documentation/translations/zh_CN/power/index.rst56
-rw-r--r--Documentation/translations/zh_CN/power/opp.rst341
-rw-r--r--Documentation/translations/zh_CN/process/5.Posting.rst11
-rw-r--r--Documentation/translations/zh_CN/process/8.Conclusion.rst1
-rw-r--r--Documentation/translations/zh_CN/process/coding-style.rst274
-rw-r--r--Documentation/translations/zh_CN/process/email-clients.rst265
-rw-r--r--Documentation/translations/zh_CN/process/embargoed-hardware-issues.rst2
-rw-r--r--Documentation/translations/zh_CN/process/howto.rst21
-rw-r--r--Documentation/translations/zh_CN/process/index.rst2
-rw-r--r--Documentation/translations/zh_CN/process/magic-number.rst73
-rw-r--r--Documentation/translations/zh_CN/process/management-style.rst4
-rw-r--r--Documentation/translations/zh_CN/process/programming-language.rst3
-rw-r--r--Documentation/translations/zh_CN/process/submit-checklist.rst84
-rw-r--r--Documentation/translations/zh_CN/process/submitting-drivers.rst160
-rw-r--r--Documentation/translations/zh_CN/process/submitting-patches.rst747
-rw-r--r--Documentation/translations/zh_CN/riscv/index.rst2
-rw-r--r--Documentation/translations/zh_CN/riscv/pmu.rst235
-rw-r--r--Documentation/translations/zh_CN/riscv/vm-layout.rst104
-rw-r--r--Documentation/translations/zh_CN/rust/arch-support.rst23
-rw-r--r--Documentation/translations/zh_CN/rust/coding-guidelines.rst192
-rw-r--r--Documentation/translations/zh_CN/rust/general-information.rst75
-rw-r--r--Documentation/translations/zh_CN/rust/index.rst28
-rw-r--r--Documentation/translations/zh_CN/rust/quick-start.rst211
-rw-r--r--Documentation/translations/zh_CN/scheduler/completion.rst256
-rw-r--r--Documentation/translations/zh_CN/scheduler/index.rst45
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-arch.rst74
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-bwc.rst204
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-capacity.rst390
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-debug.rst51
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst205
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-domains.rst72
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-energy.rst351
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-nice-design.rst99
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-stats.rst156
-rw-r--r--Documentation/translations/zh_CN/scheduler/schedutil.rst165
-rw-r--r--Documentation/translations/zh_CN/sparse.txt91
-rw-r--r--Documentation/translations/zh_CN/staging/index.rst26
-rw-r--r--Documentation/translations/zh_CN/staging/xz.rst100
-rw-r--r--Documentation/translations/zh_CN/userspace-api/accelerators/ocxl.rst168
-rw-r--r--Documentation/translations/zh_CN/userspace-api/ebpf/index.rst22
-rw-r--r--Documentation/translations/zh_CN/userspace-api/ebpf/syscall.rst29
-rw-r--r--Documentation/translations/zh_CN/userspace-api/futex2.rst80
-rw-r--r--Documentation/translations/zh_CN/userspace-api/index.rst50
-rw-r--r--Documentation/translations/zh_CN/userspace-api/no_new_privs.rst57
-rw-r--r--Documentation/translations/zh_CN/userspace-api/seccomp_filter.rst293
-rw-r--r--Documentation/translations/zh_CN/userspace-api/sysfs-platform_profile.rst40
-rw-r--r--Documentation/translations/zh_TW/admin-guide/security-bugs.rst2
-rw-r--r--Documentation/translations/zh_TW/filesystems/sysfs.txt4
-rw-r--r--Documentation/translations/zh_TW/gpio.txt35
-rw-r--r--Documentation/translations/zh_TW/index.rst16
-rw-r--r--Documentation/translations/zh_TW/oops-tracing.txt212
-rw-r--r--Documentation/translations/zh_TW/process/5.Posting.rst3
-rw-r--r--Documentation/translations/zh_TW/process/8.Conclusion.rst1
-rw-r--r--Documentation/translations/zh_TW/process/embargoed-hardware-issues.rst2
-rw-r--r--Documentation/translations/zh_TW/process/howto.rst7
-rw-r--r--Documentation/translations/zh_TW/process/index.rst1
-rw-r--r--Documentation/translations/zh_TW/process/magic-number.rst74
-rw-r--r--Documentation/translations/zh_TW/process/programming-language.rst3
-rw-r--r--Documentation/translations/zh_TW/process/submitting-drivers.rst164
-rw-r--r--Documentation/translations/zh_TW/process/submitting-patches.rst32
-rw-r--r--Documentation/usb/CREDITS6
-rw-r--r--Documentation/usb/chipidea.rst19
-rw-r--r--Documentation/usb/functionfs.rst2
-rw-r--r--Documentation/usb/gadget-testing.rst23
-rw-r--r--Documentation/usb/gadget_configfs.rst10
-rw-r--r--Documentation/usb/gadget_multi.rst2
-rw-r--r--Documentation/usb/gadget_uvc.rst380
-rw-r--r--Documentation/usb/index.rst1
-rw-r--r--Documentation/usb/mass-storage.rst11
-rw-r--r--Documentation/usb/usbip_protocol.rst13
-rw-r--r--Documentation/usb/usbmon.rst2
-rw-r--r--Documentation/userspace-api/ELF.rst34
-rw-r--r--Documentation/userspace-api/futex2.rst86
-rw-r--r--Documentation/userspace-api/index.rst4
-rw-r--r--Documentation/userspace-api/ioctl/cdrom.rst119
-rw-r--r--Documentation/userspace-api/ioctl/ioctl-number.rst17
-rw-r--r--Documentation/userspace-api/iommufd.rst223
-rw-r--r--Documentation/userspace-api/landlock.rst238
-rw-r--r--Documentation/userspace-api/media/Makefile3
-rw-r--r--Documentation/userspace-api/media/cec.h.rst.exceptions2
-rw-r--r--Documentation/userspace-api/media/cec/cec-ioc-receive.rst49
-rw-r--r--Documentation/userspace-api/media/cec/cec-pin-error-inj.rst2
-rw-r--r--Documentation/userspace-api/media/drivers/aspeed-video.rst65
-rw-r--r--Documentation/userspace-api/media/drivers/cx2341x-uapi.rst8
-rw-r--r--Documentation/userspace-api/media/drivers/dw100.rst84
-rw-r--r--Documentation/userspace-api/media/drivers/hantro.rst19
-rw-r--r--Documentation/userspace-api/media/drivers/index.rst5
-rw-r--r--Documentation/userspace-api/media/drivers/meye-uapi.rst53
-rw-r--r--Documentation/userspace-api/media/drivers/st-vgxy61.rst25
-rw-r--r--Documentation/userspace-api/media/drivers/uvcvideo.rst2
-rw-r--r--Documentation/userspace-api/media/dvb/fe_property_parameters.rst25
-rw-r--r--Documentation/userspace-api/media/frontend.h.rst.exceptions28
-rw-r--r--Documentation/userspace-api/media/lirc.h.rst.exceptions8
-rw-r--r--Documentation/userspace-api/media/mediactl/media-controller-model.rst6
-rw-r--r--Documentation/userspace-api/media/mediactl/media-types.rst17
-rw-r--r--Documentation/userspace-api/media/rc/lirc-dev-intro.rst19
-rw-r--r--Documentation/userspace-api/media/rc/lirc-func.rst1
-rw-r--r--Documentation/userspace-api/media/rc/lirc-get-features.rst18
-rw-r--r--Documentation/userspace-api/media/rc/lirc-set-rec-timeout-reports.rst49
-rw-r--r--Documentation/userspace-api/media/rc/lirc-set-wideband-receiver.rst2
-rw-r--r--Documentation/userspace-api/media/rc/rc-protos.rst2
-rw-r--r--Documentation/userspace-api/media/rc/rc-tables.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/async.rst9
-rw-r--r--Documentation/userspace-api/media/v4l/biblio.rst10
-rw-r--r--Documentation/userspace-api/media/v4l/buffer.rst51
-rw-r--r--Documentation/userspace-api/media/v4l/capture.c.rst52
-rw-r--r--Documentation/userspace-api/media/v4l/control.rst13
-rw-r--r--Documentation/userspace-api/media/v4l/dev-decoder.rst26
-rw-r--r--Documentation/userspace-api/media/v4l/dev-overlay.rst10
-rw-r--r--Documentation/userspace-api/media/v4l/dev-raw-vbi.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/dev-sdr.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/dev-sliced-vbi.rst4
-rw-r--r--Documentation/userspace-api/media/v4l/dev-subdev.rst166
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-camera.rst8
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst1498
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst745
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-image-source.rst20
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-jpeg.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/hist-v4l2.rst6
-rw-r--r--Documentation/userspace-api/media/v4l/io.rst4
-rw-r--r--Documentation/userspace-api/media/v4l/libv4l-introduction.rst6
-rw-r--r--Documentation/userspace-api/media/v4l/mmap.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-compressed.rst47
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-packed-yuv.rst97
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-reserved.rst66
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-rgb.rst235
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst35
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst256
-rw-r--r--Documentation/userspace-api/media/v4l/subdev-formats.rst306
-rw-r--r--Documentation/userspace-api/media/v4l/user-func.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/v4l2grab.c.rst10
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst7
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-cropcap.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-dqevent.rst5
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-g-ctrl.rst3
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst41
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-g-fbuf.rst52
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-qbuf.rst2
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-querycap.rst3
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst26
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst16
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-streamon.rst3
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-enum-frame-interval.rst5
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-enum-frame-size.rst49
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-enum-mbus-code.rst44
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-g-client-cap.rst83
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-g-crop.rst5
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-g-fmt.rst5
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-g-frame-interval.rst5
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-g-routing.rst147
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-subdev-g-selection.rst5
-rw-r--r--Documentation/userspace-api/media/videodev2.h.rst.exceptions14
-rw-r--r--Documentation/userspace-api/netlink/c-code-gen.rst107
-rw-r--r--Documentation/userspace-api/netlink/genetlink-legacy.rst260
-rw-r--r--Documentation/userspace-api/netlink/index.rst18
-rw-r--r--Documentation/userspace-api/netlink/intro-specs.rst80
-rw-r--r--Documentation/userspace-api/netlink/intro.rst681
-rw-r--r--Documentation/userspace-api/netlink/specs.rst445
-rw-r--r--Documentation/userspace-api/seccomp_filter.rst10
-rw-r--r--Documentation/userspace-api/sysfs-platform_profile.rst2
-rw-r--r--Documentation/virt/coco/sev-guest.rst161
-rw-r--r--Documentation/virt/coco/tdx-guest.rst52
-rw-r--r--Documentation/virt/hyperv/clocks.rst73
-rw-r--r--Documentation/virt/hyperv/index.rst12
-rw-r--r--Documentation/virt/hyperv/overview.rst207
-rw-r--r--Documentation/virt/hyperv/vmbus.rst303
-rw-r--r--Documentation/virt/index.rst9
-rw-r--r--Documentation/virt/kvm/amd-memory-encryption.rst439
-rw-r--r--Documentation/virt/kvm/api.rst1931
-rw-r--r--Documentation/virt/kvm/arm/hyp-abi.rst11
-rw-r--r--Documentation/virt/kvm/arm/hypercalls.rst138
-rw-r--r--Documentation/virt/kvm/arm/index.rst2
-rw-r--r--Documentation/virt/kvm/arm/psci.rst77
-rw-r--r--Documentation/virt/kvm/arm/pvtime.rst14
-rw-r--r--Documentation/virt/kvm/devices/arm-vgic-its.rst5
-rw-r--r--Documentation/virt/kvm/devices/vcpu.rst108
-rw-r--r--Documentation/virt/kvm/devices/vfio.rst5
-rw-r--r--Documentation/virt/kvm/devices/vm.rst86
-rw-r--r--Documentation/virt/kvm/devices/xics.rst2
-rw-r--r--Documentation/virt/kvm/devices/xive.rst2
-rw-r--r--Documentation/virt/kvm/halt-polling.rst13
-rw-r--r--Documentation/virt/kvm/hypercalls.rst192
-rw-r--r--Documentation/virt/kvm/index.rst27
-rw-r--r--Documentation/virt/kvm/locking.rst84
-rw-r--r--Documentation/virt/kvm/s390/index.rst13
-rw-r--r--Documentation/virt/kvm/s390/s390-diag.rst (renamed from Documentation/virt/kvm/s390-diag.rst)0
-rw-r--r--Documentation/virt/kvm/s390/s390-pv-boot.rst (renamed from Documentation/virt/kvm/s390-pv-boot.rst)2
-rw-r--r--Documentation/virt/kvm/s390/s390-pv-dump.rst64
-rw-r--r--Documentation/virt/kvm/s390/s390-pv.rst (renamed from Documentation/virt/kvm/s390-pv.rst)0
-rw-r--r--Documentation/virt/kvm/vcpu-requests.rst43
-rw-r--r--Documentation/virt/kvm/x86/amd-memory-encryption.rst446
-rw-r--r--Documentation/virt/kvm/x86/cpuid.rst (renamed from Documentation/virt/kvm/cpuid.rst)0
-rw-r--r--Documentation/virt/kvm/x86/errata.rst50
-rw-r--r--Documentation/virt/kvm/x86/hypercalls.rst192
-rw-r--r--Documentation/virt/kvm/x86/index.rst18
-rw-r--r--Documentation/virt/kvm/x86/mmu.rst (renamed from Documentation/virt/kvm/mmu.rst)14
-rw-r--r--Documentation/virt/kvm/x86/msr.rst (renamed from Documentation/virt/kvm/msr.rst)0
-rw-r--r--Documentation/virt/kvm/x86/nested-vmx.rst (renamed from Documentation/virt/kvm/nested-vmx.rst)0
-rw-r--r--Documentation/virt/kvm/x86/running-nested-guests.rst (renamed from Documentation/virt/kvm/running-nested-guests.rst)4
-rw-r--r--Documentation/virt/kvm/x86/timekeeping.rst (renamed from Documentation/virt/kvm/timekeeping.rst)0
-rw-r--r--Documentation/virt/ne_overview.rst21
-rw-r--r--Documentation/virt/uml/user_mode_linux_howto_v2.rst145
-rw-r--r--Documentation/vm/.gitignore3
-rw-r--r--Documentation/vm/active_mm.rst91
-rw-r--r--Documentation/vm/balance.rst102
-rw-r--r--Documentation/vm/cleancache.rst296
-rw-r--r--Documentation/vm/damon/design.rst166
-rw-r--r--Documentation/vm/damon/faq.rst51
-rw-r--r--Documentation/vm/damon/index.rst30
-rw-r--r--Documentation/vm/free_page_reporting.rst40
-rw-r--r--Documentation/vm/frontswap.rst293
-rw-r--r--Documentation/vm/highmem.rst147
-rw-r--r--Documentation/vm/hmm.rst452
-rw-r--r--Documentation/vm/hugetlbfs_reserv.rst596
-rw-r--r--Documentation/vm/hwpoison.rst185
-rw-r--r--Documentation/vm/index.rst56
-rw-r--r--Documentation/vm/ksm.rst87
-rw-r--r--Documentation/vm/memory-model.rst177
-rw-r--r--Documentation/vm/mmu_notifier.rst99
-rw-r--r--Documentation/vm/numa.rst150
-rw-r--r--Documentation/vm/overcommit-accounting.rst87
-rw-r--r--Documentation/vm/page_frags.rst45
-rw-r--r--Documentation/vm/page_migration.rst288
-rw-r--r--Documentation/vm/page_owner.rst89
-rw-r--r--Documentation/vm/remap_file_pages.rst33
-rw-r--r--Documentation/vm/split_page_table_lock.rst100
-rw-r--r--Documentation/vm/unevictable-lru.rst605
-rw-r--r--Documentation/vm/z3fold.rst30
-rw-r--r--Documentation/vm/zsmalloc.rst82
-rw-r--r--Documentation/w1/masters/ds2490.rst2
-rw-r--r--Documentation/w1/masters/w1-gpio.rst2
-rw-r--r--Documentation/w1/slaves/w1_therm.rst9
-rw-r--r--Documentation/w1/w1-generic.rst2
-rw-r--r--Documentation/watchdog/hpwdt.rst8
-rw-r--r--Documentation/watchdog/index.rst6
-rw-r--r--Documentation/watchdog/watchdog-parameters.rst12
-rw-r--r--Documentation/x86/amd-memory-encryption.rst97
-rw-r--r--Documentation/x86/index.rst39
-rw-r--r--Documentation/x86/intel-iommu.rst115
-rw-r--r--Documentation/x86/microcode.rst142
-rw-r--r--Documentation/x86/x86_64/machinecheck.rst85
5839 files changed, 333520 insertions, 105690 deletions
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
new file mode 100644
index 000000000000..fe7e45e17bc7
--- /dev/null
+++ b/Documentation/ABI/obsolete/o2cb
@@ -0,0 +1,11 @@
+What: /sys/o2cb
+Date: Dec 2005
+KernelVersion: 2.6.16
+Contact: ocfs2-devel@oss.oracle.com
+Description: Ocfs2-tools looks at 'interface-revision' for versioning
+ information. Each logmask/ file controls a set of debug prints
+ and can be written into with the strings "allow", "deny", or
+ "off". Reading the file returns the current state.
+ Was renamed to /sys/fs/o2cb/
+Users: ocfs2-tools. It's sufficient to mail proposed changes to
+ ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/obsolete/procfs-i8k b/Documentation/ABI/obsolete/procfs-i8k
new file mode 100644
index 000000000000..32df4d5bdd15
--- /dev/null
+++ b/Documentation/ABI/obsolete/procfs-i8k
@@ -0,0 +1,10 @@
+What: /proc/i8k
+Date: November 2001
+KernelVersion: 2.4.14
+Contact: Pali Rohár <pali@kernel.org>
+Description: Legacy interface for getting/setting sensor information like
+ fan speed, temperature, serial number, hotkey status, etc.,
+ on Dell laptops.
+ Since the driver is now using the standard hwmon sysfs interface,
+ the procfs interface is deprecated.
+Users: https://github.com/vitorafsr/i8kutils
diff --git a/Documentation/ABI/obsolete/sysfs-bus-iio b/Documentation/ABI/obsolete/sysfs-bus-iio
index c9531bb64816..b64394b0b374 100644
--- a/Documentation/ABI/obsolete/sysfs-bus-iio
+++ b/Documentation/ABI/obsolete/sysfs-bus-iio
@@ -6,6 +6,7 @@ Description:
Since Kernel 5.11, multiple buffers are supported.
so, it is better to use, instead:
+
/sys/bus/iio/devices/iio:deviceX/bufferY/length
What: /sys/bus/iio/devices/iio:deviceX/buffer/enable
@@ -17,6 +18,7 @@ Description:
Since Kernel 5.11, multiple buffers are supported.
so, it is better to use, instead:
+
/sys/bus/iio/devices/iio:deviceX/bufferY/enable
What: /sys/bus/iio/devices/iio:deviceX/scan_elements
@@ -165,6 +167,7 @@ Description:
Since Kernel 5.11, multiple buffers are supported.
so, it is better to use, instead:
+
/sys/bus/iio/devices/iio:deviceX/bufferY/watermark
What: /sys/bus/iio/devices/iio:deviceX/buffer/data_available
@@ -179,4 +182,5 @@ Description:
Since Kernel 5.11, multiple buffers are supported.
so, it is better to use, instead:
+
/sys/bus/iio/devices/iio:deviceX/bufferY/data_available
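For scripts migrating off the deprecated attributes, a minimal sketch (the device name iio:device0 and the helper are illustrative, not part of the ABI) that prefers the per-buffer path and falls back to the legacy one on pre-5.11 kernels::

    import os

    def read_buffer_length(dev="/sys/bus/iio/devices/iio:device0"):
        # Prefer the multi-buffer ABI (kernel >= 5.11); fall back to the
        # deprecated single-buffer path on older kernels.
        for attr in (os.path.join(dev, "buffer0", "length"),
                     os.path.join(dev, "buffer", "length")):
            try:
                with open(attr) as f:
                    return int(f.read())
            except FileNotFoundError:
                continue
        raise FileNotFoundError("no IIO buffer length attribute found")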
diff --git a/Documentation/ABI/obsolete/sysfs-class-dax b/Documentation/ABI/obsolete/sysfs-class-dax
deleted file mode 100644
index 5bcce27458e3..000000000000
--- a/Documentation/ABI/obsolete/sysfs-class-dax
+++ /dev/null
@@ -1,22 +0,0 @@
-What: /sys/class/dax/
-Date: May, 2016
-KernelVersion: v4.7
-Contact: nvdimm@lists.linux.dev
-Description: Device DAX is the device-centric analogue of Filesystem
- DAX (CONFIG_FS_DAX). It allows memory ranges to be
- allocated and mapped without need of an intervening file
- system. Device DAX is strict, precise and predictable.
- Specifically this interface:
-
- 1. Guarantees fault granularity with respect to a given
- page size (pte, pmd, or pud) set at configuration time.
-
- 2. Enforces deterministic behavior by being strict about
- what fault scenarios are supported.
-
- The /sys/class/dax/ interface enumerates all the
- device-dax instances in the system. The ABI is
- deprecated and will be removed after 2020. It is
- replaced with the DAX bus interface /sys/bus/dax/ where
- device-dax instances can be found under
- /sys/bus/dax/devices/
diff --git a/Documentation/ABI/removed/sysfs-mce b/Documentation/ABI/removed/sysfs-mce
new file mode 100644
index 000000000000..ef5dd2a80918
--- /dev/null
+++ b/Documentation/ABI/removed/sysfs-mce
@@ -0,0 +1,37 @@
+What: /sys/devices/system/machinecheck/machinecheckX/tolerant
+Contact: Borislav Petkov <bp@suse.de>
+Date: Dec, 2021
+Description:
+ Unused and obsolete after the advent of recoverable machine
+ checks (see the last sentence below), which have been present
+ since 2010 (Nehalem).
+
+ Original description:
+
+ The entries appear for each CPU, but they are truly shared
+ between all CPUs.
+
+ Tolerance level. When a machine check exception occurs for a
+ non-corrected machine check, the kernel can take different
+ actions.
+
+ Since machine check exceptions can happen any time it is
+ sometimes risky for the kernel to kill a process because it
+ defies normal kernel locking rules. The tolerance level
+ configures how hard the kernel tries to recover even at some
+ risk of deadlock. Higher tolerant values trade potentially
+ better uptime with the risk of a crash or even corruption
+ (for tolerant >= 3).
+
+ == ===========================================================
+ 0 always panic on uncorrected errors, log corrected errors
+ 1 panic or SIGBUS on uncorrected errors, log corrected errors
+ 2 SIGBUS or log uncorrected errors, log corrected errors
+ 3 never panic or SIGBUS, log all errors (for testing only)
+ == ===========================================================
+
+ Default: 1
+
+ Note this only makes a difference if the CPU allows recovery
+ from a machine check exception. Current x86 CPUs generally
+ do not.
diff --git a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot b/Documentation/ABI/removed/sysfs-selinux-checkreqprot
index ed6b52ca210f..f599a0a87e8b 100644
--- a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
+++ b/Documentation/ABI/removed/sysfs-selinux-checkreqprot
@@ -4,6 +4,9 @@ KernelVersion: 2.6.12-rc2 (predates git)
Contact: selinux@vger.kernel.org
Description:
+ REMOVAL UPDATE: The SELinux checkreqprot functionality was removed in
+ March 2023; the original deprecation notice is shown below.
+
The selinuxfs "checkreqprot" node allows SELinux to be configured
to check the protection requested by userspace for mmap/mprotect
calls instead of the actual protection applied by the kernel.
diff --git a/Documentation/ABI/obsolete/sysfs-selinux-disable b/Documentation/ABI/removed/sysfs-selinux-disable
index c340278e3cf8..cb783c64cab3 100644
--- a/Documentation/ABI/obsolete/sysfs-selinux-disable
+++ b/Documentation/ABI/removed/sysfs-selinux-disable
@@ -4,6 +4,9 @@ KernelVersion: 2.6.12-rc2 (predates git)
Contact: selinux@vger.kernel.org
Description:
+ REMOVAL UPDATE: The SELinux runtime disable functionality was removed
+ in March 2023; the original deprecation notice is shown below.
+
The selinuxfs "disable" node allows SELinux to be disabled at runtime
prior to a policy being loaded into the kernel. If disabled via this
mechanism, SELinux will remain disabled until the system is rebooted.
diff --git a/Documentation/ABI/stable/o2cb b/Documentation/ABI/stable/o2cb
index 5eb1545e0b8d..b62a967f01a0 100644
--- a/Documentation/ABI/stable/o2cb
+++ b/Documentation/ABI/stable/o2cb
@@ -1,4 +1,4 @@
-What: /sys/fs/o2cb/ (was /sys/o2cb)
+What: /sys/fs/o2cb/
Date: Dec 2005
KernelVersion: 2.6.16
Contact: ocfs2-devel@oss.oracle.com
diff --git a/Documentation/ABI/stable/sysfs-acpi-pmprofile b/Documentation/ABI/stable/sysfs-acpi-pmprofile
index 2d6314f0e4e4..cd55e421d921 100644
--- a/Documentation/ABI/stable/sysfs-acpi-pmprofile
+++ b/Documentation/ABI/stable/sysfs-acpi-pmprofile
@@ -2,16 +2,17 @@ What: /sys/firmware/acpi/pm_profile
Date: 03-Nov-2011
KernelVersion: v3.2
Contact: linux-acpi@vger.kernel.org
-Description: The ACPI pm_profile sysfs interface exports the platform
- power management (and performance) requirement expectations
- as provided by BIOS. The integer value is directly passed as
- retrieved from the FADT ACPI table.
+Description: The ACPI pm_profile sysfs interface exposes the preferred
+ power management (and performance) profile of the platform
+ as provided in the ACPI FADT Preferred_PM_Profile field.
-Values: For possible values see ACPI specification:
- 5.2.9 Fixed ACPI Description Table (FADT)
- Field: Preferred_PM_Profile
+ The integer value is directly passed as retrieved from the FADT.
- Currently these values are defined by spec:
+Values: For the possible values refer to the Preferred_PM_Profile field
+ definition in Table 5.9 "FADT Format", Section 5.2.9 "Fixed ACPI
+ Description Table (FADT)" of the ACPI specification.
+
+ As of ACPI 6.5, the following values are defined:
== =================
0 Unspecified
@@ -22,5 +23,6 @@ Values: For possible values see ACPI specification:
5 SOHO Server
6 Appliance PC
7 Performance Server
- >7 Reserved
+ 8 Tablet
+ >8 Reserved
== =================
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
new file mode 100644
index 000000000000..c57e5b7cb532
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-block
@@ -0,0 +1,737 @@
+What: /sys/block/<disk>/alignment_offset
+Date: April 2009
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Storage devices may report a physical block size that is
+ bigger than the logical block size (for instance a drive
+ with 4KB physical sectors exposing 512-byte logical
+ blocks to the operating system). This parameter
+ indicates how many bytes the beginning of the device is
+ offset from the disk's natural alignment.
+
+
+What: /sys/block/<disk>/discard_alignment
+Date: May 2011
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Devices that support discard functionality may
+ internally allocate space in units that are bigger than
+ the exported logical block size. The discard_alignment
+ parameter indicates how many bytes the beginning of the
+ device is offset from the internal allocation unit's
+ natural alignment.
+
+
+What: /sys/block/<disk>/diskseq
+Date: February 2021
+Contact: Matteo Croce <mcroce@microsoft.com>
+Description:
+ The /sys/block/<disk>/diskseq file reports the disk
+ sequence number, which is a monotonically increasing
+ number assigned to every drive.
+ Some devices, like the loop device, refresh this number
+ every time the backing file is changed.
+ The value type is 64 bit unsigned.
+
+
+What: /sys/block/<disk>/inflight
+Date: October 2009
+Contact: Jens Axboe <axboe@kernel.dk>, Nikanth Karthikesan <knikanth@suse.de>
+Description:
+ Reports the number of I/O requests currently in progress
+ (pending / in flight) in a device driver. This can be less
+ than the number of requests queued in the block device queue.
+ The report contains 2 fields: one for read requests
+ and one for write requests.
+ The value type is unsigned int.
+ Cf. Documentation/block/stat.rst which contains a single value for
+ requests in flight.
+ This is related to /sys/block/<disk>/queue/nr_requests
+ and for SCSI device also its queue_depth.
+
+
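As a sketch of the layout described above (the disk name sda and the helper name are assumptions, not part of the ABI), the two fields split cleanly into read and write counts::

    def read_inflight(disk="sda"):
        # The file holds two whitespace-separated unsigned ints:
        # requests in flight for reads, then for writes.
        with open(f"/sys/block/{disk}/inflight") as f:
            reads, writes = (int(x) for x in f.read().split())
        return reads, writes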
+What: /sys/block/<disk>/integrity/device_is_integrity_capable
+Date: July 2014
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Indicates whether a storage device is capable of storing
+ integrity metadata. Set if the device is T10 PI-capable.
+
+
+What: /sys/block/<disk>/integrity/format
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Metadata format for integrity capable block device.
+ E.g. T10-DIF-TYPE1-CRC.
+
+
+What: /sys/block/<disk>/integrity/protection_interval_bytes
+Date: July 2015
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Describes the number of data bytes which are protected
+ by one integrity tuple. Typically the device's logical
+ block size.
+
+
+What: /sys/block/<disk>/integrity/read_verify
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Indicates whether the block layer should verify the
+ integrity of read requests serviced by devices that
+ support sending integrity metadata.
+
+
+What: /sys/block/<disk>/integrity/tag_size
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Number of bytes of integrity tag space available per
+ 512 bytes of data.
+
+
+What: /sys/block/<disk>/integrity/write_generate
+Date: June 2008
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Indicates whether the block layer should automatically
+ generate checksums for write requests bound for
+ devices that support receiving integrity metadata.
+
+
+What: /sys/block/<disk>/<partition>/alignment_offset
+Date: April 2009
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Storage devices may report a physical block size that is
+ bigger than the logical block size (for instance a drive
+ with 4KB physical sectors exposing 512-byte logical
+ blocks to the operating system). This parameter
+ indicates how many bytes the beginning of the partition
+ is offset from the disk's natural alignment.
+
+
+What: /sys/block/<disk>/<partition>/discard_alignment
+Date: May 2011
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Devices that support discard functionality may
+ internally allocate space in units that are bigger than
+ the exported logical block size. The discard_alignment
+ parameter indicates how many bytes the beginning of the
+ partition is offset from the internal allocation unit's
+ natural alignment.
+
+
+What: /sys/block/<disk>/<partition>/stat
+Date: February 2008
+Contact: Jerome Marchand <jmarchan@redhat.com>
+Description:
+ The /sys/block/<disk>/<partition>/stat files display the
+ I/O statistics of partition <partition>. The format is the
+ same as the format of /sys/block/<disk>/stat.
+
+
+What: /sys/block/<disk>/queue/add_random
+Date: June 2010
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This file allows turning off the disk entropy contribution.
+ The default value of this file is '1' (on).
+
+
+What: /sys/block/<disk>/queue/chunk_sectors
+Date: September 2016
+Contact: Hannes Reinecke <hare@suse.com>
+Description:
+ [RO] chunk_sectors has different meaning depending on the type
+ of the disk. For a RAID device (dm-raid), chunk_sectors
+ indicates the size in 512B sectors of the RAID volume stripe
+ segment. For a zoned block device, either host-aware or
+ host-managed, chunk_sectors indicates the size in 512B sectors
+ of the zones of the device, with the eventual exception of the
+ last zone of the device which may be smaller.
+
+
+What: /sys/block/<disk>/queue/crypto/
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ The presence of this subdirectory of /sys/block/<disk>/queue/
+ indicates that the device supports inline encryption. This
+ subdirectory contains files which describe the inline encryption
+ capabilities of the device. For more information about inline
+ encryption, refer to Documentation/block/inline-encryption.rst.
+
+
+What: /sys/block/<disk>/queue/crypto/max_dun_bits
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file shows the maximum length, in bits, of data unit
+ numbers accepted by the device in inline encryption requests.
+
+
+What: /sys/block/<disk>/queue/crypto/modes/<mode>
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] For each crypto mode (i.e., encryption/decryption
+ algorithm) the device supports with inline encryption, a file
+ will exist at this location. It will contain a hexadecimal
+ number that is a bitmask of the supported data unit sizes, in
+ bytes, for that crypto mode.
+
+ Currently, the crypto modes that may be supported are:
+
+ * AES-256-XTS
+ * AES-128-CBC-ESSIV
+ * Adiantum
+
+ For example, if a device supports AES-256-XTS inline encryption
+ with data unit sizes of 512 and 4096 bytes, the file
+ /sys/block/<disk>/queue/crypto/modes/AES-256-XTS will exist and
+ will contain "0x1200".
+
+
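To make the bitmask encoding concrete, a small sketch decoding the example value quoted above (the helper is illustrative)::

    def supported_data_unit_sizes(mask):
        # Bit i set in the mask means a data unit size of 2**i bytes is
        # supported; 0x1200 has bits 9 and 12 set -> 512 and 4096 bytes.
        return [1 << i for i in range(mask.bit_length()) if mask & (1 << i)]

    assert supported_data_unit_sizes(0x1200) == [512, 4096]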
+What: /sys/block/<disk>/queue/crypto/num_keyslots
+Date: February 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file shows the number of keyslots the device has for
+ use with inline encryption.
+
+
+What: /sys/block/<disk>/queue/dax
+Date: June 2016
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file indicates whether the device supports Direct
+ Access (DAX), used by CPU-addressable storage to bypass the
+ pagecache. It shows '1' if true, '0' if not.
+
+
+What: /sys/block/<disk>/queue/discard_granularity
+Date: May 2011
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] Devices that support discard functionality may internally
+ allocate space using units that are bigger than the logical
+ block size. The discard_granularity parameter indicates the size
+ of the internal allocation unit in bytes if reported by the
+ device. Otherwise the discard_granularity will be set to match
+ the device's physical block size. A discard_granularity of 0
+ means that the device does not support discard functionality.
+
+
+What: /sys/block/<disk>/queue/discard_max_bytes
+Date: May 2011
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RW] While discard_max_hw_bytes is the hardware limit for the
+ device, this setting is the software limit. Some devices exhibit
+ large latencies when large discards are issued; setting this
+ value lower will make Linux issue smaller discards and
+ potentially help reduce the latencies induced by large discard
+ operations.
+
+
+What: /sys/block/<disk>/queue/discard_max_hw_bytes
+Date: July 2015
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Devices that support discard functionality may have
+ internal limits on the number of bytes that can be trimmed or
+ unmapped in a single operation. The `discard_max_hw_bytes`
+ parameter is set by the device driver to the maximum number of
+ bytes that can be discarded in a single operation. Discard
+ requests issued to the device must not exceed this limit. A
+ `discard_max_hw_bytes` value of 0 means that the device does not
+ support discard functionality.
+
+
+What: /sys/block/<disk>/queue/discard_zeroes_data
+Date: May 2011
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] Will always return 0. Don't rely on any specific behavior
+ for discards, and don't read this file.
+
+
+What: /sys/block/<disk>/queue/dma_alignment
+Date: May 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ Reports the alignment that user space addresses must have to be
+ used for raw block device access with O_DIRECT and other driver
+ specific passthrough mechanisms.
+
+
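A sketch of how an O_DIRECT reader might honour this mask (the disk name, buffer strategy, and sizes are assumptions; Linux-only)::

    import mmap, os

    def direct_read(disk="sda", nbytes=4096):
        with open(f"/sys/block/{disk}/queue/dma_alignment") as f:
            mask = int(f.read())  # addresses must be aligned to mask + 1
        # An anonymous mmap is page aligned, which satisfies any mask
        # smaller than the page size (the common case).
        assert mmap.PAGESIZE % (mask + 1) == 0
        buf = mmap.mmap(-1, nbytes)
        fd = os.open(f"/dev/{disk}", os.O_RDONLY | os.O_DIRECT)
        try:
            os.readv(fd, [buf])  # nbytes is kept a logical-block multiple
            return bytes(buf)
        finally:
            os.close(fd)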
+What: /sys/block/<disk>/queue/fua
+Date: May 2018
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Whether or not the block driver supports the FUA flag for
+ write requests. FUA stands for Force Unit Access. If the FUA
+ flag is set that means that write requests must bypass the
+ volatile cache of the storage device.
+
+
+What: /sys/block/<disk>/queue/hw_sector_size
+Date: January 2008
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This is the hardware sector size of the device, in bytes.
+
+
+What: /sys/block/<disk>/queue/independent_access_ranges/
+Date: October 2021
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] The presence of this sub-directory of the
+ /sys/block/<disk>/queue/ directory indicates that the device is
+ capable of executing requests targeting different sector ranges
+ in parallel. For instance, single LUN multi-actuator hard-disks
+ will have an independent_access_ranges directory if the device
+ correctly advertises the sector ranges of its actuators.
+
+ The independent_access_ranges directory contains one directory
+ per access range, with each range described using the sector
+ (RO) attribute file to indicate the first sector of the range
+ and the nr_sectors (RO) attribute file to indicate the total
+ number of sectors in the range starting from the first sector of
+ the range. For example, a dual-actuator hard-disk will have the
+ following independent_access_ranges entries::
+
+ $ tree /sys/block/<disk>/queue/independent_access_ranges/
+ /sys/block/<disk>/queue/independent_access_ranges/
+ |-- 0
+ | |-- nr_sectors
+ | `-- sector
+ `-- 1
+ |-- nr_sectors
+ `-- sector
+
+ The sector and nr_sectors attributes use 512B sector unit,
+ regardless of the actual block size of the device. Independent
+ access ranges do not overlap and include all sectors within the
+ device capacity. The access ranges are numbered in increasing
+ order of the range start sector, that is, the sector attribute
+ of range 0 always has the value 0.
+
+
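A sketch iterating the layout shown in the tree above (the disk name and helper are illustrative)::

    import os

    def access_ranges(disk="sda"):
        # Yield (start_sector, nr_sectors) per range, in 512B units.
        base = f"/sys/block/{disk}/queue/independent_access_ranges"
        for entry in sorted(os.listdir(base), key=int):
            with open(os.path.join(base, entry, "sector")) as f:
                start = int(f.read())
            with open(os.path.join(base, entry, "nr_sectors")) as f:
                count = int(f.read())
            yield start, count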
+What: /sys/block/<disk>/queue/io_poll
+Date: November 2015
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] When read, this file shows whether polling is enabled (1)
+ or disabled (0). Writing '0' to this file will disable polling
+ for this device. Writing any non-zero value will enable this
+ feature.
+
+
+What: /sys/block/<disk>/queue/io_poll_delay
+Date: November 2016
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This was used to control what kind of polling will be
+ performed. It is now fixed to -1, which is classic polling.
+ In this mode, the CPU will repeatedly ask for completions
+ without giving up any time.
+ <deprecated>
+
+
+What: /sys/block/<disk>/queue/io_timeout
+Date: November 2018
+Contact: Weiping Zhang <zhangweiping@didiglobal.com>
+Description:
+ [RW] io_timeout is the request timeout in milliseconds. If a
+ request does not complete in this time then the block driver
+ timeout handler is invoked. That timeout handler can decide to
+ retry the request, to fail it or to start a device recovery
+ strategy.
+
+
+What: /sys/block/<disk>/queue/iostats
+Date: January 2009
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This file is used to control (on/off) the iostats
+ accounting of the disk.
+
+
+What: /sys/block/<disk>/queue/logical_block_size
+Date: May 2009
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] This is the smallest unit the storage device can address.
+ It is typically 512 bytes.
+
+
+What: /sys/block/<disk>/queue/max_active_zones
+Date: July 2020
+Contact: Niklas Cassel <niklas.cassel@wdc.com>
+Description:
+ [RO] For zoned block devices (zoned attribute indicating
+ "host-managed" or "host-aware"), the sum of zones belonging to
+ any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED,
+ is limited by this value. If this value is 0, there is no limit.
+
+ If the host attempts to exceed this limit, the driver should
+ report this error with BLK_STS_ZONE_ACTIVE_RESOURCE, which user
+ space may see as the EOVERFLOW errno.
+
+
+What: /sys/block/<disk>/queue/max_discard_segments
+Date: February 2017
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] The maximum number of DMA scatter/gather entries in a
+ discard request.
+
+
+What: /sys/block/<disk>/queue/max_hw_sectors_kb
+Date: September 2004
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This is the maximum number of kilobytes supported in a
+ single data transfer.
+
+
+What: /sys/block/<disk>/queue/max_integrity_segments
+Date: September 2010
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Maximum number of elements in a DMA scatter/gather list
+ with integrity data that will be submitted by the block layer
+ core to the associated block driver.
+
+
+What: /sys/block/<disk>/queue/max_open_zones
+Date: July 2020
+Contact: Niklas Cassel <niklas.cassel@wdc.com>
+Description:
+ [RO] For zoned block devices (zoned attribute indicating
+ "host-managed" or "host-aware"), the sum of zones belonging to
+ any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, is
+ limited by this value. If this value is 0, there is no limit.
+
+
+What: /sys/block/<disk>/queue/max_sectors_kb
+Date: September 2004
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This is the maximum number of kilobytes that the block
+ layer will allow for a filesystem request. Must be smaller than
+ or equal to the maximum size allowed by the hardware. Write 0
+ to use default kernel settings.
+
+
+What: /sys/block/<disk>/queue/max_segment_size
+Date: March 2010
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Maximum size in bytes of a single element in a DMA
+ scatter/gather list.
+
+
+What: /sys/block/<disk>/queue/max_segments
+Date: March 2010
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Maximum number of elements in a DMA scatter/gather list
+ that is submitted to the associated block driver.
+
+
+What: /sys/block/<disk>/queue/minimum_io_size
+Date: April 2009
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] Storage devices may report a granularity or preferred
+ minimum I/O size which is the smallest request the device can
+ perform without incurring a performance penalty. For disk
+ drives this is often the physical block size. For RAID arrays
+ it is often the stripe chunk size. A properly aligned multiple
+ of minimum_io_size is the preferred request size for workloads
+ where a high number of I/O operations is desired.
+
+
+What: /sys/block/<disk>/queue/nomerges
+Date: January 2010
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] Standard I/O elevator operations include attempts to merge
+ contiguous I/Os. For known random I/O loads these attempts will
+ always fail and result in extra cycles being spent in the
+ kernel. This allows one to turn off this behavior in one of two
+ ways: When set to 1, complex merge checks are disabled, but the
+ simple one-shot merges with the previous I/O request are
+ enabled. When set to 2, all merge tries are disabled. The
+ default value is 0 - which enables all types of merge tries.
+
+
+What: /sys/block/<disk>/queue/nr_requests
+Date: July 2003
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This controls how many requests may be allocated in the
+ block layer for read or write requests. Note that the total
+ allocated number may be twice this amount, since it applies only
+ to reads or writes (not the accumulated sum).
+
+ To avoid priority inversion through request starvation, a
+ request queue maintains a separate request pool per each cgroup
+ when CONFIG_BLK_CGROUP is enabled, and this parameter applies to
+ each such per-block-cgroup request pool. IOW, if there are N
+ block cgroups, each request queue may have up to N request
+ pools, each independently regulated by nr_requests.
+
+
+What: /sys/block/<disk>/queue/nr_zones
+Date: November 2018
+Contact: Damien Le Moal <damien.lemoal@wdc.com>
+Description:
+ [RO] nr_zones indicates the total number of zones of a zoned
+ block device ("host-aware" or "host-managed" zone model). For
+ regular block devices, the value is always 0.
+
+
+What: /sys/block/<disk>/queue/optimal_io_size
+Date: April 2009
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] Storage devices may report an optimal I/O size, which is
+ the device's preferred unit for sustained I/O. This is rarely
+ reported for disk drives. For RAID arrays it is usually the
+ stripe width or the internal track size. A properly aligned
+ multiple of optimal_io_size is the preferred request size for
+ workloads where sustained throughput is desired. If no optimal
+ I/O size is reported this file contains 0.
+
+
+What: /sys/block/<disk>/queue/physical_block_size
+Date: May 2009
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] This is the smallest unit a physical storage device can
+ write atomically. It is usually the same as the logical block
+ size but may be bigger. One example is SATA drives with 4KB
+ sectors that expose a 512-byte logical block size to the
+ operating system. For stacked block devices the
+ physical_block_size variable contains the maximum
+ physical_block_size of the component devices.
+
+
+What: /sys/block/<disk>/queue/read_ahead_kb
+Date: May 2004
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] Maximum number of kilobytes to read-ahead for filesystems
+ on this block device.
+
+
+What: /sys/block/<disk>/queue/rotational
+Date: January 2009
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This file is used to indicate whether the device is of
+ rotational or non-rotational type.
+
+
+What: /sys/block/<disk>/queue/rq_affinity
+Date: September 2008
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] If this option is '1', the block layer will migrate request
+ completions to the cpu "group" that originally submitted the
+ request. For some workloads this provides a significant
+ reduction in CPU cycles due to caching effects.
+
+ For storage configurations that need to maximize distribution of
+ completion processing, setting this option to '2' forces the
+ completion to run on the requesting cpu (bypassing the "group"
+ aggregation logic).
+
+
+What: /sys/block/<disk>/queue/scheduler
+Date: October 2004
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] When read, this file will display the current and available
+ IO schedulers for this block device. The currently active IO
+ scheduler will be enclosed in [] brackets. Writing an IO
+ scheduler name to this file will switch control of this block
+ device to that new IO scheduler. Note that writing an IO
+ scheduler name to this file will attempt to load that IO
+ scheduler module, if it isn't already present in the system.
+
+
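A sketch of reading the bracketed active entry and switching schedulers (the names and the disk are assumptions; writing requires root)::

    def current_scheduler(disk="sda"):
        # "none [mq-deadline] kyber" -> "mq-deadline"
        with open(f"/sys/block/{disk}/queue/scheduler") as f:
            names = f.read().split()
        return next(n[1:-1] for n in names if n.startswith("["))

    def set_scheduler(disk, name):
        # Writing a scheduler name switches to it, loading the module
        # if needed.
        with open(f"/sys/block/{disk}/queue/scheduler", "w") as f:
            f.write(name)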
+What: /sys/block/<disk>/queue/stable_writes
+Date: September 2020
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This file will contain '1' if memory must not be modified
+ while it is being used in a write request to this device. When
+ this is the case and the kernel is performing writeback of a
+ page, the kernel will wait for writeback to complete before
+ allowing the page to be modified again, rather than allowing
+ immediate modification as is normally the case. This
+ restriction arises when the device accesses the memory multiple
+ times where the same data must be seen every time -- for
+ example, once to calculate a checksum and once to actually write
+ the data. If no such restriction exists, this file will contain
+ '0'. This file is writable for testing purposes.
+
+
+What: /sys/block/<disk>/queue/throttle_sample_time
+Date: March 2017
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This is the time window over which blk-throttle samples
+ data, in milliseconds. blk-throttle makes decisions based on
+ these samples. A lower time means cgroups have smoother
+ throughput, but higher CPU overhead. This exists only when
+ CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
+
+
+What: /sys/block/<disk>/queue/virt_boundary_mask
+Date: April 2021
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file shows the I/O segment memory alignment mask for
+ the block device. I/O requests to this device will be split
+ between segments wherever either the memory address of the end
+ of the previous segment or the memory address of the beginning
+ of the current segment is not aligned to virt_boundary_mask + 1
+ bytes.
+
+
+What: /sys/block/<disk>/queue/wbt_lat_usec
+Date: November 2016
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] If the device is registered for writeback throttling, then
+ this file shows the target minimum read latency. If this latency
+ is exceeded in a given window of time (see wb_window_usec), then
+ the writeback throttling will start scaling back writes. Writing
+ a value of '0' to this file disables the feature. Writing a
+ value of '-1' to this file resets the value to the default
+ setting.
+
+
+What: /sys/block/<disk>/queue/write_cache
+Date: April 2016
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] When read, this file will display whether the device has
+ write back caching enabled or not. It will return "write back"
+ for the former case, and "write through" for the latter. Writing
+ to this file can change the kernel's view of the device, but it
+ doesn't alter the device state. This means that it might not be
+ safe to toggle the setting from "write back" to "write through",
+ since that will also eliminate cache flushes issued by the
+ kernel.
+
+
+What: /sys/block/<disk>/queue/write_same_max_bytes
+Date: January 2012
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ [RO] Some devices support a write same operation in which a
+ single data block can be written to a range of several
+ contiguous blocks on storage. This can be used to wipe areas on
+ disk or to initialize drives in a RAID configuration.
+ write_same_max_bytes indicates how many bytes can be written in
+ a single write same command. If write_same_max_bytes is 0, write
+ same is not supported by the device.
+
+
+What: /sys/block/<disk>/queue/write_zeroes_max_bytes
+Date: November 2016
+Contact: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Description:
+ [RO] Some devices support a write zeroes operation in which a
+ single request can be issued to zero out a range of contiguous
+ blocks on storage without carrying any payload in the request.
+ This can be used to optimize writing zeroes to the device.
+ write_zeroes_max_bytes indicates how many bytes can be written
+ in a single write zeroes command. If write_zeroes_max_bytes is
+ 0, write zeroes is not supported by the device.
+
+
+What: /sys/block/<disk>/queue/zone_append_max_bytes
+Date: May 2020
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This is the maximum number of bytes that can be written to
+ a sequential zone of a zoned block device using a zone append
+ write operation (REQ_OP_ZONE_APPEND). This value is always 0 for
+ regular block devices.
+
+
+What: /sys/block/<disk>/queue/zone_write_granularity
+Date: January 2021
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This indicates the alignment constraint, in bytes, for
+ write operations in sequential zones of zoned block devices
+ (devices with a zoned attribute that reports "host-managed" or
+ "host-aware"). This value is always 0 for regular block devices.
+
+
+What: /sys/block/<disk>/queue/zoned
+Date: September 2016
+Contact: Damien Le Moal <damien.lemoal@wdc.com>
+Description:
+ [RO] zoned indicates if the device is a zoned block device and
+ the zone model of the device if it is indeed zoned. The
+ possible values indicated by zoned are "none" for regular block
+ devices and "host-aware" or "host-managed" for zoned block
+ devices. The characteristics of host-aware and host-managed
+ zoned block devices are described in the ZBC (Zoned Block
+ Commands) and ZAC (Zoned Device ATA Command Set) standards.
+ These standards also define the "drive-managed" zone model.
+ However, since drive-managed zoned block devices do not support
+ zone commands, they will be treated as regular block devices and
+ zoned will report "none".
+
+
+What: /sys/block/<disk>/hidden
+Date: March 2023
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] The block device is hidden. It doesn't produce events and
+ can't be opened from userspace or via blkdev_get*.
+ Used for the underlying components of multipath devices.
+
+
+What: /sys/block/<disk>/stat
+Date: February 2008
+Contact: Jerome Marchand <jmarchan@redhat.com>
+Description:
+ The /sys/block/<disk>/stat file displays the I/O
+ statistics of disk <disk>. It contains 17 fields:
+
+ == ==============================================
+ 1 reads completed successfully
+ 2 reads merged
+ 3 sectors read
+ 4 time spent reading (ms)
+ 5 writes completed
+ 6 writes merged
+ 7 sectors written
+ 8 time spent writing (ms)
+ 9 I/Os currently in progress
+ 10 time spent doing I/Os (ms)
+ 11 weighted time spent doing I/Os (ms)
+ 12 discards completed
+ 13 discards merged
+ 14 sectors discarded
+ 15 time spent discarding (ms)
+ 16 flush requests completed
+ 17 time spent flushing (ms)
+ == ==============================================
+
+ For more details refer to Documentation/admin-guide/iostats.rst
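A sketch parsing the fields in order (the disk name and the descriptive labels below are the editor's, not kernel names; older kernels expose fewer fields, which zip() tolerates)::

    FIELDS = (
        "reads_completed", "reads_merged", "sectors_read", "read_time_ms",
        "writes_completed", "writes_merged", "sectors_written",
        "write_time_ms", "in_flight", "io_time_ms", "weighted_io_time_ms",
        "discards_completed", "discards_merged", "sectors_discarded",
        "discard_time_ms", "flushes_completed", "flush_time_ms",
    )

    def disk_stats(disk="sda"):
        with open(f"/sys/block/{disk}/stat") as f:
            return dict(zip(FIELDS, (int(x) for x in f.read().split())))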
diff --git a/Documentation/ABI/stable/sysfs-bus-mhi b/Documentation/ABI/stable/sysfs-bus-mhi
index ecfe7662f8d0..96ccc3385a2b 100644
--- a/Documentation/ABI/stable/sysfs-bus-mhi
+++ b/Documentation/ABI/stable/sysfs-bus-mhi
@@ -19,3 +19,13 @@ Description: The file holds the OEM PK Hash value of the endpoint device
read without having the device power on at least once, the file
will read all 0's.
Users: Any userspace application or clients interested in device info.
+
+What: /sys/bus/mhi/devices/.../soc_reset
+Date: April 2022
+KernelVersion: 5.19
+Contact: mhi@lists.linux.dev
+Description: Initiates a SoC reset on the MHI controller. A SoC reset is
+ a reset of last resort, and will require a complete re-init.
+ This can be useful as a method of recovery if the device is
+ non-responsive, or as a means of loading new firmware as a
+ system administration task.
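A sketch of triggering the reset from a management script (the device name mhi0 is hypothetical, and it is assumed that any written value initiates the reset)::

    def soc_reset(device="mhi0"):
        with open(f"/sys/bus/mhi/devices/{device}/soc_reset", "w") as f:
            f.write("1")  # assumption: any write triggers the reset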
diff --git a/Documentation/ABI/stable/sysfs-class-infiniband b/Documentation/ABI/stable/sysfs-class-infiniband
index 9b1bdfa43354..ebf08c604336 100644
--- a/Documentation/ABI/stable/sysfs-class-infiniband
+++ b/Documentation/ABI/stable/sysfs-class-infiniband
@@ -232,10 +232,10 @@ Description: The RoCE type of the associated GID resides at index <gid-index>.
or "RoCE v2" for RoCE v2 based GIDs.
-What: /sys/class/infiniband_mad/umadN/ibdev
-What: /sys/class/infiniband_mad/umadN/port
-What: /sys/class/infiniband_mad/issmN/ibdev
-What: /sys/class/infiniband_mad/issmN/port
+What: /sys/class/infiniband_mad/umad<N>/ibdev
+What: /sys/class/infiniband_mad/umad<N>/port
+What: /sys/class/infiniband_mad/issm<N>/ibdev
+What: /sys/class/infiniband_mad/issm<N>/port
Date: Apr, 2005
KernelVersion: v2.6.12
Contact: linux-rdma@vger.kernel.org
@@ -261,8 +261,8 @@ Description:
userspace ABI compatibility of umad & issm devices.
-What: /sys/class/infiniband_verbs/uverbsN/ibdev
-What: /sys/class/infiniband_verbs/uverbsN/abi_version
+What: /sys/class/infiniband_verbs/uverbs<N>/ibdev
+What: /sys/class/infiniband_verbs/uverbs<N>/abi_version
Date: Sept, 2005
KernelVersion: v2.6.14
Contact: linux-rdma@vger.kernel.org
@@ -471,7 +471,7 @@ Description:
=============== ======================================================
-What: /sys/class/infiniband/qibX/ports/N/sl2vl/[0-15]
+What: /sys/class/infiniband/qibX/ports/<N>/sl2vl/[0-15]
Date: May, 2010
KernelVersion: v2.6.35
Contact: linux-rdma@vger.kernel.org
@@ -480,8 +480,8 @@ Description:
the Service Level (SL). Listing the SL files returns the Virtual
Lane (VL) as programmed by the SL.
-What: /sys/class/infiniband/qibX/ports/N/CCMgtA/cc_settings_bin
-What: /sys/class/infiniband/qibX/ports/N/CCMgtA/cc_table_bin
+What: /sys/class/infiniband/qibX/ports/<N>/CCMgtA/cc_settings_bin
+What: /sys/class/infiniband/qibX/ports/<N>/CCMgtA/cc_table_bin
Date: May, 2010
KernelVersion: v2.6.35
Contact: linux-rdma@vger.kernel.org
@@ -499,11 +499,11 @@ Description:
delay.
=============== ================================================
-What: /sys/class/infiniband/qibX/ports/N/linkstate/loopback
-What: /sys/class/infiniband/qibX/ports/N/linkstate/led_override
-What: /sys/class/infiniband/qibX/ports/N/linkstate/hrtbt_enable
-What: /sys/class/infiniband/qibX/ports/N/linkstate/status
-What: /sys/class/infiniband/qibX/ports/N/linkstate/status_str
+What: /sys/class/infiniband/qibX/ports/<N>/linkstate/loopback
+What: /sys/class/infiniband/qibX/ports/<N>/linkstate/led_override
+What: /sys/class/infiniband/qibX/ports/<N>/linkstate/hrtbt_enable
+What: /sys/class/infiniband/qibX/ports/<N>/linkstate/status
+What: /sys/class/infiniband/qibX/ports/<N>/linkstate/status_str
Date: May, 2010
KernelVersion: v2.6.35
Contact: linux-rdma@vger.kernel.org
@@ -523,16 +523,16 @@ Description:
"Fatal_Hardware_Error".
=============== ===============================================
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/rc_resends
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/seq_naks
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/rdma_seq
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/rnr_naks
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/other_naks
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/rc_timeouts
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/look_pkts
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/pkt_drops
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/dma_wait
-What: /sys/class/infiniband/qibX/ports/N/diag_counters/unaligned
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/rc_resends
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/seq_naks
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/rdma_seq
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/rnr_naks
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/other_naks
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/rc_timeouts
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/look_pkts
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/pkt_drops
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/dma_wait
+What: /sys/class/infiniband/qibX/ports/<N>/diag_counters/unaligned
Date: May, 2010
KernelVersion: v2.6.35
Contact: linux-rdma@vger.kernel.org
@@ -650,9 +650,9 @@ Description:
=============== =============================================
-What: /sys/class/infiniband/hfi1_X/ports/N/CCMgtA/cc_settings_bin
-What: /sys/class/infiniband/hfi1_X/ports/N/CCMgtA/cc_table_bin
-What: /sys/class/infiniband/hfi1_X/ports/N/CCMgtA/cc_prescan
+What: /sys/class/infiniband/hfi1_X/ports/<N>/CCMgtA/cc_settings_bin
+What: /sys/class/infiniband/hfi1_X/ports/<N>/CCMgtA/cc_table_bin
+What: /sys/class/infiniband/hfi1_X/ports/<N>/CCMgtA/cc_prescan
Date: May, 2016
KernelVersion: v4.6
Contact: linux-rdma@vger.kernel.org
@@ -675,9 +675,9 @@ Description:
disable.
=============== ================================================
-What: /sys/class/infiniband/hfi1_X/ports/N/sc2vl/[0-31]
-What: /sys/class/infiniband/hfi1_X/ports/N/sl2sc/[0-31]
-What: /sys/class/infiniband/hfi1_X/ports/N/vl2mtu/[0-15]
+What: /sys/class/infiniband/hfi1_X/ports/<N>/sc2vl/[0-31]
+What: /sys/class/infiniband/hfi1_X/ports/<N>/sl2sc/[0-31]
+What: /sys/class/infiniband/hfi1_X/ports/<N>/vl2mtu/[0-15]
Date: May, 2016
KernelVersion: v4.6
Contact: linux-rdma@vger.kernel.org
@@ -691,8 +691,8 @@ Description:
=============== ===================================================
-What: /sys/class/infiniband/hfi1_X/sdma_N/cpu_list
-What: /sys/class/infiniband/hfi1_X/sdma_N/vl
+What: /sys/class/infiniband/hfi1_X/sdma_<N>/cpu_list
+What: /sys/class/infiniband/hfi1_X/sdma_<N>/vl
Date: Sept, 2016
KernelVersion: v4.8
Contact: linux-rdma@vger.kernel.org
diff --git a/Documentation/ABI/stable/sysfs-class-tpm b/Documentation/ABI/stable/sysfs-class-tpm
index d897ecb9615f..411d5895bed4 100644
--- a/Documentation/ABI/stable/sysfs-class-tpm
+++ b/Documentation/ABI/stable/sysfs-class-tpm
@@ -195,7 +195,7 @@ Description: The "tpm_version_major" property shows the TCG spec major version
2
-What: /sys/class/tpm/tpmX/pcr-H/N
+What: /sys/class/tpm/tpmX/pcr-<H>/<N>
Date: March 2021
KernelVersion: 5.12
Contact: linux-integrity@vger.kernel.org
diff --git a/Documentation/ABI/stable/sysfs-devices b/Documentation/ABI/stable/sysfs-devices
index 42bf1eab5677..98a8ef99ac5f 100644
--- a/Documentation/ABI/stable/sysfs-devices
+++ b/Documentation/ABI/stable/sysfs-devices
@@ -23,3 +23,10 @@ Contact: Device Tree mailing list <devicetree@vger.kernel.org>
Description:
If CONFIG_OF is enabled, then this file is present. When
read, it returns the full name of the device node.
+
+What: /sys/devices/*/dev
+Date: Jun 2006
+Contact: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Description:
+ Major and minor numbers of the character device corresponding
+ to the device (in <major>:<minor> format).
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index 484fc04bcc25..402af4b2b905 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -176,3 +176,48 @@ Contact: Keith Busch <keith.busch@intel.com>
Description:
The cache write policy: 0 for write-back, 1 for write-through,
other or unknown.
+
+What: /sys/devices/system/node/nodeX/x86/sgx_total_bytes
+Date: November 2021
+Contact: Jarkko Sakkinen <jarkko@kernel.org>
+Description:
+ The total amount of SGX physical memory in bytes.
+
+What: /sys/devices/system/node/nodeX/memory_failure/total
+Date: January 2023
+Contact: Jiaqi Yan <jiaqiyan@google.com>
+Description:
+ The total number of raw poisoned pages (pages containing
+ corrupted data due to memory errors) on a NUMA node.
+
+What: /sys/devices/system/node/nodeX/memory_failure/ignored
+Date: January 2023
+Contact: Jiaqi Yan <jiaqiyan@google.com>
+Description:
+ Of the raw poisoned pages on a NUMA node, how many pages are
+ ignored by the memory error recovery attempt, usually because
+ support for this type of page is unavailable and the kernel
+ gives up on recovery.
+
+What: /sys/devices/system/node/nodeX/memory_failure/failed
+Date: January 2023
+Contact: Jiaqi Yan <jiaqiyan@google.com>
+Description:
+ Of the raw poisoned pages on a NUMA node, how many pages are
+ failed by the memory error recovery attempt. This usually means
+ a key recovery operation failed.
+
+What: /sys/devices/system/node/nodeX/memory_failure/delayed
+Date: January 2023
+Contact: Jiaqi Yan <jiaqiyan@google.com>
+Description:
+ Of the raw poisoned pages on a NUMA node, how many pages are
+ delayed by the memory error recovery attempt. Delayed poisoned
+ pages will usually be retried by the kernel.
+
+What: /sys/devices/system/node/nodeX/memory_failure/recovered
+Date: January 2023
+Contact: Jiaqi Yan <jiaqiyan@google.com>
+Description:
+ Of the raw poisoned pages on a NUMA node, how many pages are
+ recovered by the memory error recovery attempt.
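A sketch aggregating these per-node counters system-wide (the helper name is illustrative)::

    import glob, os

    def memory_failure_totals():
        totals = {}
        for node in glob.glob("/sys/devices/system/node/node*/memory_failure"):
            for name in ("total", "ignored", "failed", "delayed", "recovered"):
                with open(os.path.join(node, name)) as f:
                    totals[name] = totals.get(name, 0) + int(f.read())
        return totals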
diff --git a/Documentation/ABI/stable/sysfs-devices-system-cpu b/Documentation/ABI/stable/sysfs-devices-system-cpu
index 516dafea03eb..902392d7eddf 100644
--- a/Documentation/ABI/stable/sysfs-devices-system-cpu
+++ b/Documentation/ABI/stable/sysfs-devices-system-cpu
@@ -42,6 +42,12 @@ Description: the CPU core ID of cpuX. Typically it is the hardware platform's
architecture and platform dependent.
Values: integer
+What: /sys/devices/system/cpu/cpuX/topology/cluster_id
+Description: the cluster ID of cpuX. Typically it is the hardware platform's
+ identifier (rather than the kernel's). The actual value is
+ architecture and platform dependent.
+Values: integer
+
What: /sys/devices/system/cpu/cpuX/topology/book_id
Description: the book ID of cpuX. Typically it is the hardware platform's
identifier (rather than the kernel's). The actual value is
@@ -80,11 +86,24 @@ What: /sys/devices/system/cpu/cpuX/topology/die_cpus
Description: internal kernel map of CPUs within the same die.
Values: hexadecimal bitmask.
+What: /sys/devices/system/cpu/cpuX/topology/ppin
+Description: per-socket protected processor inventory number
+Values: hexadecimal.
+
What: /sys/devices/system/cpu/cpuX/topology/die_cpus_list
Description: human-readable list of CPUs within the same die.
The format is like 0-3, 8-11, 14,17.
Values: decimal list.
+What: /sys/devices/system/cpu/cpuX/topology/cluster_cpus
+Description: internal kernel map of CPUs within the same cluster.
+Values: hexadecimal bitmask.
+
+What: /sys/devices/system/cpu/cpuX/topology/cluster_cpus_list
+Description: human-readable list of CPUs within the same cluster.
+ The format is like 0-3, 8-11, 14,17.
+Values: decimal list.
+
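A sketch expanding the "decimal list" format quoted above, e.g. "0-3, 8-11, 14,17" (the helper is illustrative)::

    def parse_cpu_list(text):
        # Expand "0-3, 8-11, 14,17" into individual CPU numbers.
        cpus = []
        for part in text.replace(" ", "").split(","):
            if not part:
                continue
            if "-" in part:
                lo, hi = part.split("-")
                cpus.extend(range(int(lo), int(hi) + 1))
            else:
                cpus.append(int(part))
        return cpus

    assert parse_cpu_list("0-3, 8-11, 14,17") == [0, 1, 2, 3, 8, 9, 10, 11, 14, 17]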
What: /sys/devices/system/cpu/cpuX/topology/book_siblings
Description: internal kernel map of cpuX's hardware threads within the same
book_id. it's only used on s390.
diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
index df4afbccf037..534b7a3d59fc 100644
--- a/Documentation/ABI/stable/sysfs-driver-dma-idxd
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -22,6 +22,7 @@ Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The largest number of work descriptors in a batch.
+ It's not visible when the device does not support batch.
What: /sys/bus/dsa/devices/dsa<m>/max_work_queues_size
Date: Oct 25, 2019
@@ -41,14 +42,16 @@ KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The maximum number of groups that can be created under this device.
-What: /sys/bus/dsa/devices/dsa<m>/max_tokens
-Date: Oct 25, 2019
-KernelVersion: 5.6.0
+What: /sys/bus/dsa/devices/dsa<m>/max_read_buffers
+Date: Dec 10, 2021
+KernelVersion: 5.17.0
Contact: dmaengine@vger.kernel.org
-Description: The total number of bandwidth tokens supported by this device.
- The bandwidth tokens represent resources within the DSA
+Description: The total number of read buffers supported by this device.
+ The read buffers represent resources within the DSA
implementation, and these resources are allocated by engines to
- support operations.
+ support operations. See DSA spec v1.2 9.2.4 Total Read Buffers.
+ It's not visible when the device does not support Read Buffer
+ allocation control.
What: /sys/bus/dsa/devices/dsa<m>/max_transfer_size
Date: Oct 25, 2019
@@ -115,13 +118,15 @@ KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: To indicate if this device is configurable or not.
-What: /sys/bus/dsa/devices/dsa<m>/token_limit
-Date: Oct 25, 2019
-KernelVersion: 5.6.0
+What: /sys/bus/dsa/devices/dsa<m>/read_buffer_limit
+Date: Dec 10, 2021
+KernelVersion: 5.17.0
Contact: dmaengine@vger.kernel.org
-Description: The maximum number of bandwidth tokens that may be in use at
+Description: The maximum number of read buffers that may be in use at
one time by operations that access low bandwidth memory in the
- device.
+ device. See DSA spec v1.2 9.2.8 GENCFG on Global Read Buffer Limit.
+ It's not visible when the device does not support Read Buffer
+ allocation control.
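+
+		A hedged usage sketch (dsa0 and the limit value are assumed
+		examples):
+
+		# echo 8 > /sys/bus/dsa/devices/dsa0/read_buffer_limit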
What: /sys/bus/dsa/devices/dsa<m>/cmd_status
Date: Aug 28, 2020
@@ -131,6 +136,22 @@ Description: The last executed device administrative command's status/error.
Also last configuration error overloaded.
Writing to it will clear the status.
+What: /sys/bus/dsa/devices/dsa<m>/iaa_cap
+Date: Sept 14, 2022
+KernelVersion: 6.0.0
+Contact: dmaengine@vger.kernel.org
+Description: IAA (IAX) capability mask. Exported to user space for application
+ consumption. This attribute should only be visible on IAA devices
+ that are version 2 or later.
+
+What: /sys/bus/dsa/devices/dsa<m>/event_log_size
+Date: Sept 14, 2022
+KernelVersion: 6.4.0
+Contact: dmaengine@vger.kernel.org
+Description:	The event log size to be configured. The default is 64 entries,
+		which occupy 4KB if each event log entry is 64 bytes. It's
+		visible only on platforms that support the capability.
+
What: /sys/bus/dsa/devices/wq<m>.<n>/block_on_fault
Date: Oct 27, 2020
KernelVersion: 5.11.0
@@ -205,6 +226,7 @@ KernelVersion: 5.10.0
Contact: dmaengine@vger.kernel.org
Description: The max batch size for this workqueue. Cannot exceed device
max batch size. Configurable parameter.
+		It's not visible when the device does not support batching.
What: /sys/bus/dsa/devices/wq<m>.<n>/ats_disable
Date: Nov 13, 2020
@@ -213,6 +235,16 @@ Contact: dmaengine@vger.kernel.org
Description: Indicate whether ATS disable is turned on for the workqueue.
0 indicates ATS is on, and 1 indicates ATS is off for the workqueue.
+What: /sys/bus/dsa/devices/wq<m>.<n>/prs_disable
+Date: Sept 14, 2022
+KernelVersion: 6.4.0
+Contact: dmaengine@vger.kernel.org
+Description: Controls whether PRS disable is turned on for the workqueue.
+ 0 indicates PRS is on, and 1 indicates PRS is off for the
+		workqueue. This option overrides the block_on_fault attribute
+ if set. It's visible only on platforms that support the
+ capability.
+
What: /sys/bus/dsa/devices/wq<m>.<n>/occupancy
Date May 25, 2021
KernelVersion: 5.14.0
@@ -220,8 +252,104 @@ Contact: dmaengine@vger.kernel.org
Description: Show the current number of entries in this WQ if WQ Occupancy
Support bit WQ capabilities is 1.
+What: /sys/bus/dsa/devices/wq<m>.<n>/enqcmds_retries
+Date:		Oct 29, 2021
+KernelVersion: 5.17.0
+Contact: dmaengine@vger.kernel.org
+Description:	Indicates the number of retries for an enqcmds submission on a
+		shared wq. The maximum value that can be set is capped at 64.
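+
+		A hedged usage sketch (wq0.0 and the retry count are assumed
+		examples):
+
+		# echo 32 > /sys/bus/dsa/devices/wq0.0/enqcmds_retries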
+
+What: /sys/bus/dsa/devices/wq<m>.<n>/op_config
+Date: Sept 14, 2022
+KernelVersion: 6.0.0
+Contact: dmaengine@vger.kernel.org
+Description:	Shows the operation capability bits displayed in bitmap format,
+		as presented by the %*pb printk() output format specifier.
+		The attribute can be written when the WQ is disabled in
+		order to configure the WQ to accept specific bits that
+		correlate to the operations allowed. It's visible only
+		on platforms that support the capability.
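+
+		A hedged usage sketch (wq0.0 and the bitmap value are assumed
+		examples; the WQ must be disabled before writing):
+
+		# cat /sys/bus/dsa/devices/wq0.0/op_config
+		00000000,00000000,00000000,0000007b
+		# echo 0000007b > /sys/bus/dsa/devices/wq0.0/op_config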
+
What: /sys/bus/dsa/devices/engine<m>.<n>/group_id
Date: Oct 25, 2019
KernelVersion: 5.6.0
Contact: dmaengine@vger.kernel.org
Description: The group that this engine belongs to.
+
+What: /sys/bus/dsa/devices/group<m>.<n>/use_read_buffer_limit
+Date: Dec 10, 2021
+KernelVersion: 5.17.0
+Contact: dmaengine@vger.kernel.org
+Description: Enable the use of global read buffer limit for the group. See DSA
+ spec v1.2 9.2.18 GRPCFG Use Global Read Buffer Limit.
+ It's not visible when the device does not support Read Buffer
+ allocation control.
+
+What: /sys/bus/dsa/devices/group<m>.<n>/read_buffers_allowed
+Date: Dec 10, 2021
+KernelVersion: 5.17.0
+Contact: dmaengine@vger.kernel.org
+Description:	Indicates the maximum number of read buffers that may be in use
+		at one time
+ by all engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read
+ Buffers Allowed.
+ It's not visible when the device does not support Read Buffer
+ allocation control.
+
+What: /sys/bus/dsa/devices/group<m>.<n>/read_buffers_reserved
+Date: Dec 10, 2021
+KernelVersion: 5.17.0
+Contact: dmaengine@vger.kernel.org
+Description: Indicates the number of Read Buffers reserved for the use of
+ engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read Buffers
+ Reserved.
+ It's not visible when the device does not support Read Buffer
+ allocation control.
+
+What: /sys/bus/dsa/devices/group<m>.<n>/desc_progress_limit
+Date: Sept 14, 2022
+KernelVersion: 6.0.0
+Contact: dmaengine@vger.kernel.org
+Description: Allows control of the number of work descriptors that can be
+ concurrently processed by an engine in the group as a fraction
+ of the Maximum Work Descriptors in Progress value specified in
+ the ENGCAP register. The acceptable values are 0 (default),
+ 1 (1/2 of max value), 2 (1/4 of the max value), and 3 (1/8 of
+ the max value). It's visible only on platforms that support
+ the capability.
+
+What: /sys/bus/dsa/devices/group<m>.<n>/batch_progress_limit
+Date: Sept 14, 2022
+KernelVersion: 6.0.0
+Contact: dmaengine@vger.kernel.org
+Description: Allows control of the number of batch descriptors that can be
+ concurrently processed by an engine in the group as a fraction
+ of the Maximum Batch Descriptors in Progress value specified in
+ the ENGCAP register. The acceptable values are 0 (default),
+ 1 (1/2 of max value), 2 (1/4 of the max value), and 3 (1/8 of
+ the max value). It's visible only on platforms that support
+ the capability.
+
+What: /sys/bus/dsa/devices/wq<m>.<n>/dsa<x>\!wq<m>.<n>/file<y>/cr_faults
+Date: Sept 14, 2022
+KernelVersion: 6.4.0
+Contact: dmaengine@vger.kernel.org
+Description: Show the number of Completion Record (CR) faults this application
+ has caused.
+
+What: /sys/bus/dsa/devices/wq<m>.<n>/dsa<x>\!wq<m>.<n>/file<y>/cr_fault_failures
+Date: Sept 14, 2022
+KernelVersion: 6.4.0
+Contact: dmaengine@vger.kernel.org
+Description:	Show the number of Completion Record (CR) fault failures that this
+		application has caused. The failure counter is incremented when the
+		driver cannot fault in the address for the CR. Typically this is
+		caused by a bad address programmed in the submitted descriptor, or
+		by a malicious submitter using a bad CR address on purpose.
+
+What: /sys/bus/dsa/devices/wq<m>.<n>/dsa<x>\!wq<m>.<n>/file<y>/pid
+Date: Sept 14, 2022
+KernelVersion: 6.4.0
+Contact: dmaengine@vger.kernel.org
+Description:	Show the process ID of the application that opened the file. This
+		is helpful information for a monitoring daemon that wants to kill
+		the application that opened the file.
diff --git a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
index f5724bb5b462..c3fec3c835af 100644
--- a/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
+++ b/Documentation/ABI/stable/sysfs-driver-firmware-zynqmp
@@ -113,3 +113,144 @@ Description:
# echo 0 > /sys/devices/platform/firmware\:zynqmp-firmware/health_status
Users: Xilinx
+
+What: /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "Ronak Jain" <ronak.jain@xilinx.com>
+Description:
+		This sysfs interface allows the user to configure features at
+		runtime. The user can enable or disable features running in the
+		firmware, and can configure the parameters of those features at
+		runtime. The supported features are over temperature and
+		external watchdog. Here, the external watchdog is completely
+		different from /dev/watchdog: the external watchdog runs on the
+		firmware and is used to monitor the health of the firmware, not
+		of the APU (Linux). Also, the external watchdog is interfaced
+		outside of the ZynqMP SoC.
+
+		The supported config IDs for the feature configuration are:
+ 1. PM_FEATURE_OVERTEMP_STATUS = 1, the user can enable or
+ disable the over temperature feature.
+ 2. PM_FEATURE_OVERTEMP_VALUE = 2, the user can configure the
+		over temperature limit in degrees Celsius.
+ 3. PM_FEATURE_EXTWDT_STATUS = 3, the user can enable or disable
+ the external watchdog feature.
+		4. PM_FEATURE_EXTWDT_VALUE = 4, the user can configure the
+		external watchdog timer interval.
+
+ Usage:
+
+ Select over temperature config ID to enable/disable feature
+ # echo 1 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+
+		Check whether the over temperature config ID is selected or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ The expected result is 1.
+
+ Select over temperature config ID to configure OT limit
+ # echo 2 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+
+		Check whether the over temperature config ID is selected or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ The expected result is 2.
+
+ Select external watchdog config ID to enable/disable feature
+ # echo 3 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+
+		Check whether the external watchdog config ID is selected or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ The expected result is 3.
+
+ Select external watchdog config ID to configure time interval
+ # echo 4 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+
+		Check whether the external watchdog config ID is selected or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ The expected result is 4.
+
+Users: Xilinx
+
+What: /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "Ronak Jain" <ronak.jain@xilinx.com>
+Description:
+		This sysfs interface allows the user to configure features at
+		runtime. The user can enable or disable features running in the
+		firmware, and can configure the parameters of those features at
+		runtime. The supported features are over temperature and
+		external watchdog. Here, the external watchdog is completely
+		different from /dev/watchdog: the external watchdog runs on the
+		firmware and is used to monitor the health of the firmware, not
+		of the APU (Linux). Also, the external watchdog is interfaced
+		outside of the ZynqMP SoC.
+
+		By default the features are disabled in the firmware. The user
+		can enable a feature by selecting the appropriate config ID and
+		then writing the desired value.
+
+		The default limit for over temperature is 90 degrees Celsius.
+		The default timer interval for the external watchdog is 570 ms.
+
+		The supported config IDs for the feature configuration are:
+ 1. PM_FEATURE_OVERTEMP_STATUS = 1, the user can enable or
+ disable the over temperature feature.
+ 2. PM_FEATURE_OVERTEMP_VALUE = 2, the user can configure the
+		over temperature limit in degrees Celsius.
+ 3. PM_FEATURE_EXTWDT_STATUS = 3, the user can enable or disable
+ the external watchdog feature.
+		4. PM_FEATURE_EXTWDT_VALUE = 4, the user can configure the
+		external watchdog timer interval.
+
+ Usage:
+
+ Enable over temperature feature
+ # echo 1 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ # echo 1 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+
+ Check whether the over temperature feature is enabled or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+ The expected result is 1.
+
+ Disable over temperature feature
+ # echo 1 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ # echo 0 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+
+ Check whether the over temperature feature is disabled or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+ The expected result is 0.
+
+ Configure over temperature limit to 50 Degree Celsius
+ # echo 2 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ # echo 50 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+
+ Check whether the over temperature limit is configured or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+ The expected result is 50.
+
+ Enable external watchdog feature
+ # echo 3 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ # echo 1 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+
+ Check whether the external watchdog feature is enabled or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+ The expected result is 1.
+
+ Disable external watchdog feature
+ # echo 3 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ # echo 0 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+
+ Check whether the external watchdog feature is disabled or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+ The expected result is 0.
+
+ Configure external watchdog timer interval to 500ms
+ # echo 4 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_id
+ # echo 500 > /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+
+ Check whether the external watchdog timer interval is configured or not
+ # cat /sys/devices/platform/firmware\:zynqmp-firmware/feature_config_value
+ The expected result is 500.
+
+Users: Xilinx
diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
index b2553df2e786..60953903d007 100644
--- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io
+++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
@@ -1,7 +1,7 @@
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_health
Date: June 2018
KernelVersion: 4.19
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file shows ASIC health status. The possible values are:
0 - health failed, 2 - health OK, 3 - ASIC in booting state.
@@ -11,7 +11,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld1_version
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld2_version
Date: June 2018
KernelVersion: 4.19
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show with which CPLD versions have been burned
on carrier and switch boards.
@@ -20,7 +20,7 @@ Description: These files show with which CPLD versions have been burned
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/fan_dir
Date: December 2018
KernelVersion: 5.0
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file shows the system fans direction:
forward direction - relevant bit is set 0;
reversed direction - relevant bit is set 1.
@@ -30,7 +30,7 @@ Description: This file shows the system fans direction:
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld3_version
Date: November 2018
KernelVersion: 5.0
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show with which CPLD versions have been burned
on LED or Gearbox board.
@@ -39,7 +39,7 @@ Description: These files show with which CPLD versions have been burned
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
Date: November 2018
KernelVersion: 5.0
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files enable and disable the access to the JTAG domain.
By default access to the JTAG domain is disabled.
@@ -48,7 +48,7 @@ Description: These files enable and disable the access to the JTAG domain.
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/select_iio
Date: June 2018
KernelVersion: 4.19
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file allows iio devices selection.
Attribute select_iio can be written with 0 or with 1. It
@@ -62,7 +62,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/psu1_on
/sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pwr_down
Date: June 2018
KernelVersion: 4.19
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files allow asserting system power cycling, switching
power supply units on and off and system's main power domain
shutdown.
@@ -89,7 +89,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_short_pb
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sw_reset
Date: June 2018
KernelVersion: 4.19
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show the system reset cause, as following: power
auxiliary outage or power refresh, ASIC thermal shutdown, halt,
hotswap, watchdog, firmware reset, long press power button,
@@ -106,7 +106,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_system
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_voltmon_upgrade_fail
Date: November 2018
KernelVersion: 5.0
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show the system reset cause, as following: ComEx
power fail, reset from ComEx, system platform reset, reset
due to voltage monitor devices upgrade failure,
@@ -119,7 +119,7 @@ Description: These files show the system reset cause, as following: ComEx
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld4_version
Date: November 2018
KernelVersion: 5.0
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show with which CPLD versions have been burned
on LED board.
@@ -133,7 +133,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd
Date: June 2019
KernelVersion: 5.3
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show the system reset cause, as following:
COMEX thermal shutdown; wathchdog power off or reset was derived
by one of the next components: COMEX, switch board or by Small Form
@@ -148,7 +148,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config1
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config2
Date: January 2020
KernelVersion: 5.6
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show system static topology identification
like system's static I2C topology, number and type of FPGA
devices within the system and so on.
@@ -161,7 +161,7 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_soc
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sw_pwr_off
Date: January 2020
KernelVersion: 5.6
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show the system reset causes, as following: reset
due to AC power failure, reset invoked from software by
assertion reset signal through CPLD. reset caused by signal
@@ -173,7 +173,7 @@ Description: These files show the system reset causes, as following: reset
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pcie_asic_reset_dis
Date: January 2020
KernelVersion: 5.6
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file allows to retain ASIC up during PCIe root complex
reset, when attribute is set 1.
@@ -182,7 +182,7 @@ Description: This file allows to retain ASIC up during PCIe root complex
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/vpd_wp
Date: January 2020
KernelVersion: 5.6
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file allows to overwrite system VPD hardware write
protection when attribute is set 1.
@@ -191,7 +191,7 @@ Description: This file allows to overwrite system VPD hardware write
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/voltreg_update_status
Date: January 2020
KernelVersion: 5.6
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file exposes the configuration update status of burnable
voltage regulator devices. The status values are as following:
0 - OK; 1 - CRC failure; 2 = I2C failure; 3 - in progress.
@@ -201,7 +201,7 @@ Description: This file exposes the configuration update status of burnable
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/ufm_version
Date: January 2020
KernelVersion: 5.6
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: This file exposes the firmware version of burnable voltage
regulator devices.
@@ -217,9 +217,448 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld3_version_min
What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld4_version_min
Date: July 2020
KernelVersion: 5.9
-Contact: Vadim Pasternak <vadimpmellanox.com>
+Contact: Vadim Pasternak <vadimp@nvidia.com>
Description: These files show with which CPLD part numbers and minor
versions have been burned CPLD devices equipped on a
system.
The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/bios_active_image
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/bios_auth_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/bios_upgrade_fail
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description: The files represent BIOS statuses:
+
+		bios_active_image: location of the currently active BIOS image:
+		0: Top, 1: Bottom.
+		The reported value should correspond to the value expected by
+		the OS when BIOS safe mode is 0. This bit is related to the
+		Intel top-swap feature of DualBios on the same flash.
+
+		bios_auth_fail: BIOS upgrade failed because the provided BIOS
+		image was not signed correctly.
+
+		bios_upgrade_fail: BIOS upgrade failed for some other reason,
+		not authentication. For example, due to a physical SPI flash
+		problem.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc1_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc2_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc3_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc4_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc5_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc6_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc7_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc8_enable
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files control the line card enable state.
+		Expected behavior:
+		When lc{n}_enable is written 1, the related line card is released
+		from the reset state; when written 0, it is held in the reset state.
+
+ The files are read/write.
+
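+		A hedged usage sketch (lc1 is an example line card; the hwmon*
+		instance must be resolved on the target system):
+
+		# echo 1 > /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc1_enable
+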
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc1_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc2_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc3_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc4_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc5_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc6_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc7_pwr
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc8_pwr
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files switch line card power on and off.
+		Expected behavior:
+		When lc{n}_pwr is written 1, the related line card is powered
+		on; when written 0, it is powered off.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc1_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc2_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc3_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc4_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc5_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc6_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc7_rst_mask
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/lc8_rst_mask
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files clear the line card reset bit enforced by the ASIC
+		when the ASIC sets it due to some abnormal behavior.
+		Expected behavior:
+		When lc{n}_rst_mask is written 1, the related line card reset bit
+		is cleared; when written 0, there is no effect.
+
+ The files are write only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/os_started
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file, when written 1, indicates to the programmable devices
+		that the OS is taking control over them.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pm_mgmt_en
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file assigns power management control ownership.
+		When power management control is provided by hardware, the
+		hardware will automatically power off one or more previously
+		powered line cards if the system power budget becomes
+		insufficient, for example when some of the power units lose
+		their power good state.
+		When pm_mgmt_en is written 1, power management control by
+		software is enabled; when written 0, power management is
+		controlled by hardware.
+		Note that for any setting of the pm_mgmt_en attribute, hardware
+		will not allow powering on a new line card if the system power
+		budget is insufficient.
+		Likewise, if software tries to power on several line cards at
+		once, hardware will power on line cards only while the system
+		has enough power budget.
+		Default is 0.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/psu3_on
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/psu4_on
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files switch power supply units on and off.
+		Expected behavior:
+		When psu3_on or psu4_on is written 1, the related unit will be
+		disconnected from the power source; when written 0, it is connected.
+
+ The files are write only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/shutdown_unlock
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file allows unlocking the ASIC after a thermal shutdown
+		event. When a system thermal shutdown is enforced by the ASIC,
+		the ASIC gets locked and will not be available after the next
+		system boot.
+		Software can decide to unlock it by setting this attribute to
+		1 and then performing a system power cycle by setting the
+		pwr_cycle attribute to 1 (power cycle of the main power domain).
+		Before setting shutdown_unlock to 1 it is recommended to
+		validate that the system reboot cause is reset_asic_thermal or
+		reset_thermal_spc_or_pciesw.
+		If shutdown_unlock is not set to 1, the only way to release the
+		ASIC from locking is a full system power cycle through the
+		external power distribution unit.
+		Default is 1.
+
+ The file is read/write.
+
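+		A hedged sketch of the unlock sequence described above (the
+		hwmon* instance must be resolved on the target system):
+
+		# echo 1 > /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/shutdown_unlock
+		# echo 1 > /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pwr_cycle
+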
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/cpld1_pn
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/cpld1_version
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/cpld1_version_min
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files show the CPLD major and minor versions and part
+		number that have been burned into the CPLD device on the line
+		card.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/fpga1_pn
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/fpga1_version
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/fpga1_version_min
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files show the FPGA major and minor versions and part
+		number that have been burned into the FPGA device on the line
+		card.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/vpd_wp
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file allows overriding the line card VPD hardware write
+		protection mode. When the attribute is set to 1, write
+		protection is disabled; when 0, it is enabled.
+		Default is 0.
+		If the system is in locked-down mode, writing this file will not
+		be allowed.
+		The purpose of this file is to allow line card VPD burning
+		during the production flow.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/reset_aux_pwr_or_ref
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/reset_dc_dc_pwr_fail
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/reset_fpga_not_done
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/reset_from_chassis
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/reset_line_card
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/reset_pwr_off_from_chassis
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files show the line card reset cause, as follows: power
+		auxiliary outage or power refresh, DC-to-DC power failure, FPGA
+		reset failed, line card reset failed, power off from chassis.
+		A value of 1 in a file means this is the reset cause, 0
+		otherwise. Only one of the above causes can be 1 at a time,
+		representing only the last reset cause.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/cpld_upgrade_en
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/fpga_upgrade_en
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files allow CPLD and FPGA burning. A value of 1 in a file
+		means burning is enabled, 0 otherwise.
+		If the system is in locked-down mode, writing these files will
+		not be allowed.
+		The purpose of these files is to allow line card CPLD and FPGA
+		upgrade through the JTAG daisy-chain.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/qsfp_pwr_en
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/pwr_en
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files allow powering on/off all QSFP ports and the whole
+		line card. The attributes are set to 1 for power on, 0 for
+		power off.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/agb_spi_burn_en
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/fpga_spi_burn_en
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files allow gearbox and FPGA SPI flash burning.
+		The attributes are set to 1 to enable burning, 0 to disable it.
+		If the system is in locked-down mode, writing these files will
+		not be allowed.
+		The purpose of these files is to allow line card gearbox and
+		FPGA burning during the production flow.
+
+		The files are read/write.
+
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/max_power
+What: /sys/devices/platform/mlxplat/i2c_mlxcpld.*/i2c-*/i2c-*/i2c-*/*-0032/mlxreg-io.*/hwmon/hwmon*/config
+Date: October 2021
+KernelVersion: 5.16
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files provide the maximum power required for line card
+		feeding and the line card configuration ID.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/phy_reset
+Date: May 2022
+KernelVersion: 5.19
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file allows resetting the PHY 88E1548 by setting the
+		attribute to 0 in case of some abnormal PHY behavior.
+		Expected behavior:
+		When phy_reset is written 1, all PHY 88E1548 devices are
+		released from the reset state; when written 0, they are held
+		in the reset state.
+
+		The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/mac_reset
+Date: May 2022
+KernelVersion: 5.19
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file allows resetting the ASIC MT52132 by setting the
+		attribute to 0 in case of some abnormal ASIC behavior.
+		Expected behavior:
+		When mac_reset is written 1, the ASIC MT52132 is released
+		from the reset state; when written 0, it is held in the reset
+		state.
+
+		The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/qsfp_pwr_good
+Date: May 2022
+KernelVersion: 5.19
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file shows the QSFP ports power status. The value is set
+		to 0 when at least one QSFP port is plugged in, and to 1 when
+		no QSFP ports are plugged in.
+		The possible values are:
+		0 - Power good, 1 - Not power good.
+
+		The file is read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic2_health
+Date: July 2022
+KernelVersion: 5.20
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file shows the 2nd ASIC health status. The possible values
+		are: 0 - health failed, 2 - health OK, 3 - ASIC in booting state.
+
+ The file is read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_reset
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic2_reset
+Date: July 2022
+KernelVersion: 5.20
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files allow resetting each of the ASICs by writing 1.
+
+ The files are write only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/comm_chnl_ready
+Date: July 2022
+KernelVersion: 5.20
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file is used to indicate to the remote end (for example, a
+		BMC) that the system host CPU is ready to send telemetry data
+		to it. For indication, the file should be written 1.
+
+ The file is write only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config3
+Date: January 2020
+KernelVersion: 5.6
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file indicates the COMe module hardware configuration.
+		The value is pushed by hardware through GPIO pins.
+		The purpose is to expose minor BOM changes for the same system SKU.
+
+ The file is read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_pwr_converter_fail
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file shows the system reset cause due to a power converter
+		device failure.
+		A value of 1 in the file means this is the reset cause, 0 otherwise.
+
+ The file is read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_ap_reset
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_ap_reset
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files monitor the status of the External Root of Trust
+		(EROT) processor's RESET output to the Application Processor (AP).
+		By reading a file, it can be determined whether the EROT has
+		invalidated or revoked the AP firmware, in which case it will
+		hold the AP in RESET until valid firmware is loaded. This
+		protects the AP from running unauthorized firmware. In the
+		normal flow, the AP reset should be released after the EROT
+		validates the integrity of the FW, and this should happen as
+		quickly as possible so that the AP boots before the CPU starts
+		to communicate with each ASIC.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_recovery
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_recovery
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_reset
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_reset
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files perform the External Root of Trust (EROT) recovery
+		sequence after an EROT device failure.
+		The EROT devices protect ASICs from unauthorized access, and in
+		the normal flow their reset should be released with system
+		power, at the earliest power-up stage, so that the EROTs can
+		begin the boot and authentication process before the CPU starts
+		to communicate with the ASICs.
+		Issuing a reset to the EROT while asserting the recovery signal
+		causes the EROT Application Processor to enter recovery mode so
+		that the EROT FW can be updated/recovered.
+		For reset/recovery, the related file should be toggled 1/0.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot1_wp
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/erot2_wp
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files allow access to the External Root of Trust (EROT)
+		for the reset and recovery sequence after an EROT device failure.
+		Default is 0 (programming disabled).
+		If the system is in locked-down mode, writing these files will
+		not be allowed.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/spi_chnl_select
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file allows SPI chip selection for External Root of Trust
+		(EROT) device out-of-band recovery.
+		The file can be written with 0 or 1; it selects which EROT can
+		be accessed through the SPI device.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_pg_fail
+Date: February 2023
+KernelVersion: 6.3
+Contact:	Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file shows the ASIC Power Good status.
+		A value of 1 in the file means ASIC Power Good failed, 0 otherwise.
+
+ The file is read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd1_boot_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd2_boot_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd_fail
+Date: February 2023
+KernelVersion: 6.3
+Contact:	Vadim Pasternak <vadimp@nvidia.com>
+Description:	These files are related to the clock board status in the system.
+		- clk_brd1_boot_fail: warning that the 1st clock board failed to boot from CI.
+		- clk_brd2_boot_fail: warning that the 2nd clock board failed to boot from CI.
+		- clk_brd_fail: error indicating a common clock board boot failure.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/clk_brd_prog_en
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description: This file enables programming of clock boards.
+ Default is 0 (programming disabled).
+		If the system is in locked-down mode, writing this file will not be allowed.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pwr_converter_prog_en
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description: This file enables programming of power converters.
+ Default is 0 (programming disabled).
+		If the system is in locked-down mode, writing this file will not be allowed.
+
+ The file is read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_ac_ok_fail
+Date: February 2023
+KernelVersion: 6.3
+Contact: Vadim Pasternak <vadimp@nvidia.com>
+Description:	This file shows the system reset cause due to AC power failure.
+		A value of 1 in the file means this is the reset cause, 0 otherwise.
+
+ The file is read only.
diff --git a/Documentation/ABI/stable/sysfs-driver-speakup b/Documentation/ABI/stable/sysfs-driver-speakup
index dc2a6ba1674b..bcb6831aa114 100644
--- a/Documentation/ABI/stable/sysfs-driver-speakup
+++ b/Documentation/ABI/stable/sysfs-driver-speakup
@@ -35,6 +35,15 @@ Description: This controls cursor delay when using arrow keys. When a
characters. Set this to a higher value to adjust for the delay
and better synchronisation between cursor position and speech.
+What: /sys/accessibility/speakup/cur_phonetic
+KernelVersion: 6.2
+Contact: speakup@linux-speakup.org
+Description:	This allows speakup to speak letters phonetically when arrowing
+		through a word letter by letter. This doesn't affect the spelling
+		when typing the characters. When cur_phonetic=1, speakup will
+		speak characters phonetically when arrowing over a letter. When
+		cur_phonetic=0, speakup will speak letters as normal.
+
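+		A hedged usage sketch:
+
+		# echo 1 > /sys/accessibility/speakup/cur_phonetic
+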
What: /sys/accessibility/speakup/delimiters
KernelVersion: 2.6
Contact: speakup@linux-speakup.org
diff --git a/Documentation/ABI/stable/sysfs-hypervisor-xen b/Documentation/ABI/stable/sysfs-hypervisor-xen
index 748593c64568..be9ca9981bb1 100644
--- a/Documentation/ABI/stable/sysfs-hypervisor-xen
+++ b/Documentation/ABI/stable/sysfs-hypervisor-xen
@@ -120,3 +120,16 @@ Contact: xen-devel@lists.xenproject.org
Description: If running under Xen:
The Xen version is in the format <major>.<minor><extra>
This is the <minor> part of it.
+
+What: /sys/hypervisor/start_flags/*
+Date: March 2023
+KernelVersion: 6.3.0
+Contact: xen-devel@lists.xenproject.org
+Description: If running under Xen:
+ All bits in Xen's start-flags are represented as
+ boolean files, returning '1' if set, '0' otherwise.
+ This takes the place of the defunct /proc/xen/capabilities,
+ which would contain "control_d" on dom0, and be empty
+ otherwise. This flag is now exposed as "initdomain" in
+ addition to the "privileged" flag; all other possible flags
+ are accessible as "unknownXX".
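+
+		A hedged read sketch (returns 1 on dom0):
+
+		# cat /sys/hypervisor/start_flags/initdomain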
diff --git a/Documentation/ABI/stable/sysfs-module b/Documentation/ABI/stable/sysfs-module
index 6272ae5fb366..41b1f16e8795 100644
--- a/Documentation/ABI/stable/sysfs-module
+++ b/Documentation/ABI/stable/sysfs-module
@@ -1,8 +1,7 @@
-What: /sys/module
-Description:
- The /sys/module tree consists of the following structure:
+The /sys/module tree consists of the following structure:
- /sys/module/MODULENAME
+What: /sys/module/<MODULENAME>
+Description:
The name of the module that is in the kernel. This
module name will always show up if the module is loaded as a
dynamic module. If it is built directly into the kernel, it
@@ -12,7 +11,8 @@ Description:
Note: The conditions of creation in the built-in case are not
by design and may be removed in the future.
- /sys/module/MODULENAME/parameters
+What: /sys/module/<MODULENAME>/parameters
+Description:
This directory contains individual files that are each
individual parameters of the module that are able to be
changed at runtime. See the individual module
@@ -25,10 +25,23 @@ Description:
individual driver documentation for details as to the
stability of the different parameters.
- /sys/module/MODULENAME/refcnt
+What: /sys/module/<MODULENAME>/refcnt
+Description:
If the module is able to be unloaded from the kernel, this file
will contain the current reference count of the module.
Note: If the module is built into the kernel, or if the
CONFIG_MODULE_UNLOAD kernel configuration value is not enabled,
this file will not be present.
+
+What: /sys/module/<MODULENAME>/srcversion
+Date: Jun 2005
+Description:
+ If the module source has MODULE_VERSION, this file will contain
+ the checksum of the source code.
+
+What: /sys/module/<MODULENAME>/version
+Date: Jun 2005
+Description:
+ If the module source has MODULE_VERSION, this file will contain
+ the version of the source code.
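+
+		A hedged usage sketch (substitute the name of a loaded module
+		that defines MODULE_VERSION for <MODULENAME>):
+
+		# cat /sys/module/<MODULENAME>/version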
diff --git a/Documentation/ABI/testing/configfs-usb-gadget b/Documentation/ABI/testing/configfs-usb-gadget
index b7943aa7e997..a8bb896def54 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget
+++ b/Documentation/ABI/testing/configfs-usb-gadget
@@ -143,3 +143,16 @@ Description:
qw_sign an identifier to be reported as "OS String"
proper
============= ===============================================
+
+What: /config/usb-gadget/gadget/webusb
+Date: Dec 2022
+KernelVersion: 6.3
+Description:
+ This group contains "WebUSB" extension handling attributes.
+
+ ============= ===============================================
+ use flag turning "WebUSB" support on/off
+ bcdVersion bcd WebUSB specification version number
+		bVendorCode	one-byte value used for custom per-device
+				requests
+ landingPage UTF-8 encoded URL of the device's landing page
+ ============= ===============================================
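+
+		A hedged configuration sketch (the gadget name g1 and the URL
+		are assumed examples; exact value formats may differ):
+
+		# echo 1 > /config/usb-gadget/g1/webusb/use
+		# echo "https://example.com" > /config/usb-gadget/g1/webusb/landingPage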
diff --git a/Documentation/ABI/testing/configfs-usb-gadget-mass-storage b/Documentation/ABI/testing/configfs-usb-gadget-mass-storage
index c86b63a7bb43..fc0328069267 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget-mass-storage
+++ b/Documentation/ABI/testing/configfs-usb-gadget-mass-storage
@@ -19,7 +19,7 @@ KernelVersion: 3.13
Description:
The attributes:
- =========== ==============================================
+ ============ ==============================================
file The path to the backing file for the LUN.
Required if LUN is not marked as removable.
ro Flag specifying access to the LUN shall be
@@ -32,4 +32,10 @@ Description:
being a CD-ROM.
nofua Flag specifying that FUA flag
in SCSI WRITE(10,12)
- =========== ==============================================
+ forced_eject This write-only file is useful only when
+ the function is active. It causes the backing
+ file to be forcibly detached from the LUN,
+ regardless of whether the host has allowed it.
+ Any non-zero number of bytes written will
+ result in ejection.
+ ============ ==============================================
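+
+		A hedged usage sketch for forced_eject (the gadget and LUN
+		paths are assumed examples; the function must be active):
+
+		# echo 1 > /config/usb-gadget/g1/functions/mass_storage.0/lun.0/forced_eject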
diff --git a/Documentation/ABI/testing/configfs-usb-gadget-uac1 b/Documentation/ABI/testing/configfs-usb-gadget-uac1
index dd647d44d975..c4ba92f004c3 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget-uac1
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uac1
@@ -4,23 +4,30 @@ KernelVersion: 4.14
Description:
The attributes:
- ========== ===================================
- c_chmask capture channel mask
- c_srate capture sampling rate
- c_ssize capture sample size (bytes)
- c_mute_present capture mute control enable
+ ===================== =======================================
+ c_chmask capture channel mask
+ c_srate list of capture sampling rates (comma-separated)
+ c_ssize capture sample size (bytes)
+ c_mute_present capture mute control enable
c_volume_present capture volume control enable
- c_volume_min capture volume control min value (in 1/256 dB)
- c_volume_max capture volume control max value (in 1/256 dB)
- c_volume_res capture volume control resolution (in 1/256 dB)
- p_chmask playback channel mask
- p_srate playback sampling rate
- p_ssize playback sample size (bytes)
- p_mute_present playback mute control enable
+ c_volume_min capture volume control min value
+ (in 1/256 dB)
+ c_volume_max capture volume control max value
+ (in 1/256 dB)
+ c_volume_res capture volume control resolution
+ (in 1/256 dB)
+ p_chmask playback channel mask
+ p_srate list of playback sampling rates (comma-separated)
+ p_ssize playback sample size (bytes)
+ p_mute_present playback mute control enable
p_volume_present playback volume control enable
- p_volume_min playback volume control min value (in 1/256 dB)
- p_volume_max playback volume control max value (in 1/256 dB)
- p_volume_res playback volume control resolution (in 1/256 dB)
- req_number the number of pre-allocated request
- for both capture and playback
- ========== ===================================
+ p_volume_min playback volume control min value
+ (in 1/256 dB)
+ p_volume_max playback volume control max value
+ (in 1/256 dB)
+ p_volume_res playback volume control resolution
+ (in 1/256 dB)
+ req_number the number of pre-allocated requests
+ for both capture and playback
+ function_name name of the interface
+ ===================== =======================================
diff --git a/Documentation/ABI/testing/configfs-usb-gadget-uac2 b/Documentation/ABI/testing/configfs-usb-gadget-uac2
index cfd160ff8b56..3371c39f651d 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget-uac2
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uac2
@@ -4,23 +4,35 @@ KernelVersion: 3.18
Description:
The attributes:
- ========= ============================
- c_chmask capture channel mask
- c_srate capture sampling rate
- c_ssize capture sample size (bytes)
- c_sync capture synchronization type (async/adaptive)
- c_mute_present capture mute control enable
+ ===================== =======================================
+ c_chmask capture channel mask
+ c_srate list of capture sampling rates (comma-separated)
+ c_ssize capture sample size (bytes)
+ c_hs_bint capture bInterval for HS/SS (1-4: fixed, 0: auto)
+ c_sync capture synchronization type
+ (async/adaptive)
+ c_mute_present capture mute control enable
c_volume_present capture volume control enable
- c_volume_min capture volume control min value (in 1/256 dB)
- c_volume_max capture volume control max value (in 1/256 dB)
- c_volume_res capture volume control resolution (in 1/256 dB)
- fb_max maximum extra bandwidth in async mode
- p_chmask playback channel mask
- p_srate playback sampling rate
- p_ssize playback sample size (bytes)
- p_mute_present playback mute control enable
+ c_volume_min capture volume control min value
+ (in 1/256 dB)
+ c_volume_max capture volume control max value
+ (in 1/256 dB)
+ c_volume_res capture volume control resolution
+ (in 1/256 dB)
+ fb_max maximum extra bandwidth in async mode
+ p_chmask playback channel mask
+ p_srate list of playback sampling rates (comma-separated)
+ p_ssize playback sample size (bytes)
+ p_hs_bint playback bInterval for HS/SS (1-4: fixed, 0: auto)
+ p_mute_present playback mute control enable
p_volume_present playback volume control enable
- p_volume_min playback volume control min value (in 1/256 dB)
- p_volume_max playback volume control max value (in 1/256 dB)
- p_volume_res playback volume control resolution (in 1/256 dB)
- ========= ============================
+ p_volume_min playback volume control min value
+ (in 1/256 dB)
+ p_volume_max playback volume control max value
+ (in 1/256 dB)
+ p_volume_res playback volume control resolution
+ (in 1/256 dB)
+ req_number the number of pre-allocated requests
+ for both capture and playback
+ function_name name of the interface
+ ===================== =======================================
diff --git a/Documentation/ABI/testing/configfs-usb-gadget-uvc b/Documentation/ABI/testing/configfs-usb-gadget-uvc
index 889ed45be4ca..4feb692c4c1d 100644
--- a/Documentation/ABI/testing/configfs-usb-gadget-uvc
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uvc
@@ -7,6 +7,7 @@ Description: UVC function directory
streaming_maxburst 0..15 (ss only)
streaming_maxpacket 1..1023 (fs), 1..3072 (hs/ss)
streaming_interval 1..16
+ function_name string [32]
=================== =============================
What: /config/usb-gadget/gadget/functions/uvc.name/control
@@ -14,12 +15,14 @@ Date: Dec 2014
KernelVersion: 4.0
Description: Control descriptors
- All attributes read only:
+ All attributes read only except enable_interrupt_ep:
- ================ =============================
+ =================== =============================
bInterfaceNumber USB interface number for this
streaming interface
- ================ =============================
+ enable_interrupt_ep flag to enable the interrupt
+ endpoint for the VC interface
+ =================== =============================
What: /config/usb-gadget/gadget/functions/uvc.name/control/class
Date: Dec 2014
@@ -51,7 +54,7 @@ Date: Dec 2014
KernelVersion: 4.0
Description: Default output terminal descriptors
- All attributes read only:
+ All attributes read only except bSourceID:
============== =============================================
iTerminal index of string descriptor
@@ -73,7 +76,7 @@ Date: Dec 2014
KernelVersion: 4.0
Description: Default camera terminal descriptors
- All attributes read only:
+ All attributes read only except bmControls, which is read/write:
======================== ====================================
bmControls bitmap specifying which controls are
@@ -98,7 +101,7 @@ Date: Dec 2014
KernelVersion: 4.0
Description: Default processing unit descriptors
- All attributes read only:
+ All attributes read only except bmControls, which is read/write:
=============== ========================================
iProcessing index of string descriptor
@@ -110,6 +113,34 @@ Description: Default processing unit descriptors
bUnitID a non-zero id of this unit
=============== ========================================
+What: /config/usb-gadget/gadget/functions/uvc.name/control/extensions
+Date: Nov 2022
+KernelVersion: 6.1
+Description: Extension unit descriptors
+
+What: /config/usb-gadget/gadget/functions/uvc.name/control/extensions/name
+Date: Nov 2022
+KernelVersion: 6.1
+Description: Extension Unit (XU) Descriptor
+
+ bLength, bUnitID and iExtension are read-only. All others are
+ read-write.
+
+ ================= ========================================
+ bLength size of the descriptor in bytes
+ bUnitID non-zero ID of this unit
+ guidExtensionCode Vendor-specific code identifying the XU
+ bNumControls number of controls in this XU
+ bNrInPins number of input pins for this unit
+ baSourceID list of the IDs of the units or terminals
+ to which this XU is connected
+ bControlSize size of the bmControls field in bytes
+ bmControls list of bitmaps detailing which vendor
+ specific controls are supported
+ iExtension index of a string descriptor that describes
+ this extension unit
+ ================= ========================================
+
What: /config/usb-gadget/gadget/functions/uvc.name/control/header
Date: Dec 2014
KernelVersion: 4.0
@@ -164,7 +195,24 @@ Date: Dec 2014
KernelVersion: 4.0
Description: Default color matching descriptors
- All attributes read only:
+ All attributes read/write:
+
+ ======================== ======================================
+ bMatrixCoefficients matrix used to compute luma and
+ chroma values from the color primaries
+ bTransferCharacteristics optoelectronic transfer
+ characteristic of the source picture,
+ also called the gamma function
+ bColorPrimaries color primaries and the reference
+ white
+ ======================== ======================================
+
+What: /config/usb-gadget/gadget/functions/uvc.name/streaming/color_matching/name
+Date: Dec 2022
+KernelVersion: 6.3
+Description: Additional color matching descriptors
+
+ All attributes read/write:
======================== ======================================
bMatrixCoefficients matrix used to compute luma and
@@ -196,7 +244,7 @@ Description: Specific MJPEG format descriptors
read-only
bmaControls this format's data for bmaControls in
the streaming header
- bmInterfaceFlags specifies interlace information,
+ bmInterlaceFlags specifies interlace information,
read-only
bAspectRatioY the X dimension of the picture aspect
ratio, read-only
@@ -252,7 +300,7 @@ Description: Specific uncompressed format descriptors
read-only
bmaControls this format's data for bmaControls in
the streaming header
- bmInterfaceFlags specifies interlace information,
+ bmInterlaceFlags specifies interlace information,
read-only
		bAspectRatioY		the Y dimension of the picture aspect
ratio, read-only
diff --git a/Documentation/ABI/testing/debugfs-cros-ec b/Documentation/ABI/testing/debugfs-cros-ec
index 1fe0add99a2a..9a040c6f5e03 100644
--- a/Documentation/ABI/testing/debugfs-cros-ec
+++ b/Documentation/ABI/testing/debugfs-cros-ec
@@ -54,3 +54,25 @@ Description:
this feature.
Output will be in the format: "0x%08x\n".
+
+What: /sys/kernel/debug/<cros-ec-device>/suspend_timeout_ms
+Date: August 2022
+KernelVersion: 6.1
+Description:
+ Some ECs have a feature where they will track transitions of
+ a hardware-controlled sleep line, such as Intel's SLP_S0 line,
+ in order to detect cases where a system failed to go into deep
+ sleep states. The suspend_timeout_ms file controls the amount of
+ time in milliseconds the EC will wait before declaring a sleep
+ timeout event and attempting to wake the system.
+
+ Supply 0 to use the default value coded into EC firmware. Supply
+ 65535 (EC_HOST_SLEEP_TIMEOUT_INFINITE) to disable the EC sleep
+		failure detection mechanism. Values between 0 and 65535
+ indicate the number of milliseconds the EC should wait after a
+ sleep transition before declaring a timeout. This includes both
+ the duration after a sleep command was received but before the
+ hardware line changed, as well as the duration between when the
+ hardware line changed and the kernel sent an EC resume command.
+
+ Output will be in the format: "%u\n".
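+
+		For example, to request a 2000 ms timeout (the value is
+		chosen purely for illustration)::
+
+		    echo 2000 > /sys/kernel/debug/<cros-ec-device>/suspend_timeout_ms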
diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl
new file mode 100644
index 000000000000..fe61d372e3fa
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-cxl
@@ -0,0 +1,35 @@
+What: /sys/kernel/debug/cxl/memX/inject_poison
+Date: April, 2023
+KernelVersion: v6.4
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (WO) When a Device Physical Address (DPA) is written to this
+ attribute, the memdev driver sends an inject poison command to
+ the device for the specified address. The DPA must be 64-byte
+		aligned and the length of the injected poison is 64 bytes. If
+ successful, the device returns poison when the address is
+ accessed through the CXL.mem bus. Injecting poison adds the
+ address to the device's Poison List and the error source is set
+ to Injected. In addition, the device adds a poison creation
+ event to its internal Informational Event log, updates the
+ Event Status register, and if configured, interrupts the host.
+ It is not an error to inject poison into an address that
+ already has poison present and no error is returned. The
+ inject_poison attribute is only visible for devices supporting
+ the capability.
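+
+		A usage sketch (the memdev and DPA below are arbitrary,
+		illustrative values; the DPA must be 64-byte aligned)::
+
+		    echo 0x40000000 > /sys/kernel/debug/cxl/mem0/inject_poison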
+
+
+What:		/sys/kernel/debug/cxl/memX/clear_poison
+Date: April, 2023
+KernelVersion: v6.4
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (WO) When a Device Physical Address (DPA) is written to this
+ attribute, the memdev driver sends a clear poison command to
+ the device for the specified address. Clearing poison removes
+ the address from the device's Poison List and writes 0 (zero)
+		for 64 bytes starting at the address. It is not an error to clear
+ poison from an address that does not have poison set. If the
+ device cannot clear poison from the address, -ENXIO is returned.
+ The clear_poison attribute is only visible for devices
+ supporting the capability.
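+
+		A usage sketch, clearing the same illustrative DPA::
+
+		    echo 0x40000000 > /sys/kernel/debug/cxl/mem0/clear_poison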
diff --git a/Documentation/ABI/testing/debugfs-dell-wmi-ddv b/Documentation/ABI/testing/debugfs-dell-wmi-ddv
new file mode 100644
index 000000000000..fbcc5d6f7388
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-dell-wmi-ddv
@@ -0,0 +1,21 @@
+What: /sys/kernel/debug/dell-wmi-ddv-<wmi_device_name>/fan_sensor_information
+Date: September 2022
+KernelVersion: 6.1
+Contact: Armin Wolf <W_Armin@gmx.de>
+Description:
+ This file contains the contents of the fan sensor information buffer,
+ which contains fan sensor entries and a terminating character (0xFF).
+
+		Each fan sensor entry consists of three bytes with an unknown meaning;
+		interested people may use this file for reverse-engineering.
+
+What: /sys/kernel/debug/dell-wmi-ddv-<wmi_device_name>/thermal_sensor_information
+Date: September 2022
+KernelVersion: 6.1
+Contact: Armin Wolf <W_Armin@gmx.de>
+Description:
+ This file contains the contents of the thermal sensor information buffer,
+ which contains thermal sensor entries and a terminating character (0xFF).
+
+		Each thermal sensor entry consists of five bytes with an unknown meaning;
+		interested people may use this file for reverse-engineering.
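+
+		Since the entry layout is unknown, a hex dump is a reasonable
+		starting point for reverse-engineering, e.g.::
+
+		    xxd /sys/kernel/debug/dell-wmi-ddv-<wmi_device_name>/thermal_sensor_information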
diff --git a/Documentation/ABI/testing/debugfs-driver-dcc b/Documentation/ABI/testing/debugfs-driver-dcc
new file mode 100644
index 000000000000..27ed5919d21b
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-driver-dcc
@@ -0,0 +1,127 @@
+What: /sys/kernel/debug/dcc/.../ready
+Date: December 2022
+Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
+Description:
+		This file is used to check whether the dcc hardware is
+		ready to receive user configurations. A 'Y' here
+		indicates the dcc is ready.
+
+What: /sys/kernel/debug/dcc/.../trigger
+Date: December 2022
+Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
+Description:
+ This is the debugfs interface for manual software
+ triggers. The trigger can be invoked by writing '1'
+ to the file.
+
+What: /sys/kernel/debug/dcc/.../config_reset
+Date: December 2022
+Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
+Description:
+		This file is used to reset the configuration of
+		a dcc driver to the default configuration. When '1'
+		is written to the file, all the previously stored
+		addresses are removed from the driver and users need
+		to reconfigure addresses again.
+
+What: /sys/kernel/debug/dcc/.../[list-number]/config
+Date: December 2022
+Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
+Description:
+ This stores the addresses of the registers which
+ can be read in case of a hardware crash or manual
+		software triggers. The input address type
+		can be one of the following dcc instructions: read,
+		write, read-write, and loop type. The lists need to
+		be configured sequentially and not in an overlapping
+ manner; e.g. users can jump to list x only after
+ list y is configured and enabled. The input format for
+ each type is as follows:
+
+ i) Read instruction
+
+ ::
+
+	    echo R <addr> <n> <bus type> > /sys/kernel/debug/dcc/../[list-number]/config
+
+ where:
+
+ <addr>
+ The address to be read.
+
+ <n>
+		The address word count, starting from <addr>.
+		Each word is 32 bits (4 bytes). If omitted, it
+		defaults to 1.
+
+ <bus type>
+ The bus type, which can be either 'apb' or 'ahb'.
+		The default is 'ahb' if left out.
+
+ ii) Write instruction
+
+ ::
+
+ echo W <addr> <n> <bus type> > /sys/kernel/debug/dcc/../[list-number]/config
+
+ where:
+
+ <addr>
+ The address to be written.
+
+ <n>
+ The value to be written at <addr>.
+
+ <bus type>
+ The bus type, which can be either 'apb' or 'ahb'.
+
+ iii) Read-write instruction
+
+ ::
+
+ echo RW <addr> <n> <mask> > /sys/kernel/debug/dcc/../[list-number]/config
+
+ where:
+
+ <addr>
+ The address to be read and written.
+
+ <n>
+ The value to be written at <addr>.
+
+ <mask>
+ The value mask.
+
+ iv) Loop instruction
+
+ ::
+
+ echo L <loop count> <address count> <address>... > /sys/kernel/debug/dcc/../[list-number]/config
+
+ where:
+
+ <loop count>
+ Number of iterations
+
+ <address count>
+ total number of addresses to be written
+
+ <address>
+ Space-separated list of addresses.
+
+What: /sys/kernel/debug/dcc/.../[list-number]/enable
+Date: December 2022
+Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
+Description:
+ This debugfs interface is used for enabling the
+ the dcc hardware. A file named "enable" is in the
+ directory list number where users can enable/disable
+ the specific list by writing boolean (1 or 0) to the
+ file.
+
+		On enabling the dcc, all the addresses specified
+		by the user for the corresponding list are written
+		into the dcc sram, which is read by the dcc hardware
+		on manual or crash-induced triggers. Lists must
+		be configured and enabled sequentially; e.g. list
+		2 can only be enabled after list 1 has been.
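+
+		A minimal end-to-end sketch (the register address, word
+		count and list number below are placeholders, not real
+		values)::
+
+		    echo R 0x90a000 4 ahb > /sys/kernel/debug/dcc/.../1/config
+		    echo 1 > /sys/kernel/debug/dcc/.../1/enable
+		    echo 1 > /sys/kernel/debug/dcc/.../trigger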
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index 284e2dfa61cd..85f6d04f528b 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -12,24 +12,7 @@ What: /sys/kernel/debug/habanalabs/hl<n>/clk_gate
Date: May 2020
KernelVersion: 5.8
Contact: ogabbay@kernel.org
-Description: Allow the root user to disable/enable in runtime the clock
- gating mechanism in Gaudi. Due to how Gaudi is built, the
- clock gating needs to be disabled in order to access the
- registers of the TPC and MME engines. This is sometimes needed
- during debug by the user and hence the user needs this option.
- The user can supply a bitmask value, each bit represents
- a different engine to disable/enable its clock gating feature.
- The bitmask is composed of 20 bits:
-
- ======= ============
- 0 - 7 DMA channels
- 8 - 11 MME engines
- 12 - 19 TPC engines
- ======= ============
-
- The bit's location of a specific engine can be determined
- using (1 << GAUDI_ENGINE_ID_*). GAUDI_ENGINE_ID_* values
- are defined in uapi habanalabs.h file in enum gaudi_engine_id
+Description:	This setting is now deprecated as clock gating is handled solely by the f/w.
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
Date: Jan 2019
@@ -108,6 +91,13 @@ Description: Enables the root user to set the device to specific state.
Valid values are "disable", "enable", "suspend", "resume".
User can read this property to see the valid values
+What: /sys/kernel/debug/habanalabs/hl<n>/device_release_watchdog_timeout
+Date: Oct 2022
+KernelVersion: 6.2
+Contact: ttayar@habana.ai
+Description:	The watchdog timeout value in seconds for a device release upon
+		certain error cases, after which the device is reset.
+
What: /sys/kernel/debug/habanalabs/hl<n>/dma_size
Date: Apr 2021
KernelVersion: 5.13
@@ -118,6 +108,15 @@ Description: Specify the size of the DMA transaction when using DMA to read
When the write is finished, the user can read the "data_dma"
blob
+What: /sys/kernel/debug/habanalabs/hl<n>/dump_razwi_events
+Date: Aug 2022
+KernelVersion: 5.20
+Contact: fkassabri@habana.ai
+Description:	Dumps all razwi events to dmesg, if any exist.
+ After reading the status register of an existing event
+ the routine will clear the status register.
+ Usage: cat dump_razwi_events
+
What: /sys/kernel/debug/habanalabs/hl<n>/dump_security_violations
Date: Jan 2021
KernelVersion: 5.12
@@ -138,14 +137,16 @@ Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
Description: Sets I2C device address for I2C transaction that is generated
- by the device's CPU
+		by the device's CPU. Not available when the device is loaded
+		with secured firmware
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_bus
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
Description: Sets I2C bus address for I2C transaction that is generated by
- the device's CPU
+		the device's CPU. Not available when the device is loaded
+		with secured firmware
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_data
Date: Jan 2019
@@ -153,32 +154,60 @@ KernelVersion: 5.1
Contact: ogabbay@kernel.org
Description: Triggers an I2C transaction that is generated by the device's
CPU. Writing to this file generates a write transaction while
- reading from the file generates a read transaction
+		reading from the file generates a read transaction. Not
+		available when the device is loaded with secured firmware
+
+What: /sys/kernel/debug/habanalabs/hl<n>/i2c_len
+Date: Dec 2021
+KernelVersion: 5.17
+Contact: obitton@habana.ai
+Description: Sets I2C length in bytes for I2C transaction that is generated by
+		the device's CPU. Not available when the device is loaded
+		with secured firmware
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_reg
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
Description: Sets I2C register id for I2C transaction that is generated by
- the device's CPU
+		the device's CPU. Not available when the device is loaded
+		with secured firmware
What: /sys/kernel/debug/habanalabs/hl<n>/led0
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
-Description: Sets the state of the first S/W led on the device
+Description:	Sets the state of the first S/W led on the device. Not
+		available when the device is loaded with secured firmware
What: /sys/kernel/debug/habanalabs/hl<n>/led1
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
-Description: Sets the state of the second S/W led on the device
+Description:	Sets the state of the second S/W led on the device. Not
+		available when the device is loaded with secured firmware
What: /sys/kernel/debug/habanalabs/hl<n>/led2
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
-Description: Sets the state of the third S/W led on the device
+Description:	Sets the state of the third S/W led on the device. Not
+		available when the device is loaded with secured firmware
+
+What: /sys/kernel/debug/habanalabs/hl<n>/memory_scrub
+Date: May 2022
+KernelVersion: 5.19
+Contact: dhirschfeld@habana.ai
+Description: Allows the root user to scrub the dram memory. The scrubbing
+ value can be set using the debugfs file memory_scrub_val.
+
+What: /sys/kernel/debug/habanalabs/hl<n>/memory_scrub_val
+Date: May 2022
+KernelVersion: 5.19
+Contact: dhirschfeld@habana.ai
+Description:	The value to which the dram will be set when the user
+		scrubs the dram using the 'memory_scrub' debugfs file, and
+		the scrubbing value when using the module param 'memory_scrub'
What: /sys/kernel/debug/habanalabs/hl<n>/mmu
Date: Jan 2019
@@ -200,6 +229,30 @@ Description: Check and display page fault or access violation mmu errors for
echo "0x200" > /sys/kernel/debug/habanalabs/hl0/mmu_error
cat /sys/kernel/debug/habanalabs/hl0/mmu_error
+What: /sys/kernel/debug/habanalabs/hl<n>/monitor_dump
+Date: Mar 2022
+KernelVersion: 5.19
+Contact: osharabi@habana.ai
+Description: Allows the root user to dump monitors status from the device's
+ protected config space.
+ This property is a binary blob that contains the result of the
+ monitors registers dump.
+ This custom interface is needed (instead of using the generic
+ Linux user-space PCI mapping) because this space is protected
+ and cannot be accessed using PCI read.
+ This interface doesn't support concurrency in the same device.
+ Only supported on GAUDI.
+
+What: /sys/kernel/debug/habanalabs/hl<n>/monitor_dump_trig
+Date: Mar 2022
+KernelVersion: 5.19
+Contact: osharabi@habana.ai
+Description: Triggers dump of monitor data. The value to trigger the operation
+		must be 1. Triggering the monitor dump operation initiates a dump
+		of the current register values of all monitors.
+ When the write is finished, the user can read the "monitor_dump"
+ blob
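+
+		A possible sequence (the output file name is illustrative)::
+
+		    echo 1 > /sys/kernel/debug/habanalabs/hl0/monitor_dump_trig
+		    cat /sys/kernel/debug/habanalabs/hl0/monitor_dump > monitors.bin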
+
What: /sys/kernel/debug/habanalabs/hl<n>/set_power_state
Date: Jan 2019
KernelVersion: 5.1
@@ -232,6 +285,13 @@ KernelVersion: 5.6
Contact: ogabbay@kernel.org
Description: Sets the stop-on_error option for the device engines. Value of
"0" is for disable, otherwise enable.
+ Relevant only for GOYA and GAUDI.
+
+What: /sys/kernel/debug/habanalabs/hl<n>/timeout_locked
+Date: Sep 2021
+KernelVersion: 5.16
+Contact: obitton@habana.ai
+Description: Sets the command submission timeout value in seconds.
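+
+		For example (the 30-second value is purely illustrative)::
+
+		    echo 30 > /sys/kernel/debug/habanalabs/hl0/timeout_locked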
What: /sys/kernel/debug/habanalabs/hl<n>/userptr
Date: Jan 2019
@@ -242,7 +302,7 @@ Description: Displays a list with information about the currently user
to DMA addresses
What: /sys/kernel/debug/habanalabs/hl<n>/userptr_lookup
-Date: Aug 2021
+Date: Oct 2021
KernelVersion: 5.15
Contact: ogabbay@kernel.org
Description: Allows to search for specific user pointers (user virtual
diff --git a/Documentation/ABI/testing/debugfs-hisi-hpre b/Documentation/ABI/testing/debugfs-hisi-hpre
index b4be5f1db4b7..82abf92df429 100644
--- a/Documentation/ABI/testing/debugfs-hisi-hpre
+++ b/Documentation/ABI/testing/debugfs-hisi-hpre
@@ -1,140 +1,164 @@
-What: /sys/kernel/debug/hisi_hpre/<bdf>/cluster[0-3]/regs
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Dump debug registers from the HPRE cluster.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/cluster[0-3]/regs
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Dump debug registers from the HPRE cluster.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/cluster[0-3]/cluster_ctrl
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Write the HPRE core selection in the cluster into this file,
+What: /sys/kernel/debug/hisi_hpre/<bdf>/cluster[0-3]/cluster_ctrl
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Write the HPRE core selection in the cluster into this file,
and then we can read the debug information of the core.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/rdclr_en
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: HPRE cores debug registers read clear control. 1 means enable
+What: /sys/kernel/debug/hisi_hpre/<bdf>/rdclr_en
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: HPRE cores debug registers read clear control. 1 means enable
register read clear, otherwise 0. Writing to this file has no
functional effect, only enable or disable counters clear after
reading of these registers.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/current_qm
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: One HPRE controller has one PF and multiple VFs, each function
+What: /sys/kernel/debug/hisi_hpre/<bdf>/current_qm
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: One HPRE controller has one PF and multiple VFs, each function
has a QM. Select the QM which below qm refers to.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/regs
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Dump debug registers from the HPRE.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/alg_qos
+Date: Jun 2021
+Contact: linux-crypto@vger.kernel.org
+Description:	The <bdf> identifies the function, for both PF and VF.
+		The HPRE driver supports configuring each function's QoS;
+		in the host, write the <bdf> and qos value to alg_qos,
+		e.g. "echo <bdf> value > alg_qos". The qos value is
+		1~1000, meaning 1/1000~1000/1000 of the total QoS.
+		Reading alg_qos returns the related QoS in the host and
+		VM, e.g. "cat alg_qos".
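+
+		For example, setting half of the total QoS (the value 500
+		is purely illustrative)::
+
+		    echo <bdf> 500 > /sys/kernel/debug/hisi_hpre/<bdf>/alg_qos
+		    cat /sys/kernel/debug/hisi_hpre/<bdf>/alg_qos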
+
+What: /sys/kernel/debug/hisi_hpre/<bdf>/regs
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Dump debug registers from the HPRE.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/regs
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Dump debug registers from the QM.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/regs
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Dump debug registers from the QM.
Available for PF and VF in host. VF in guest currently only
has one debug register.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/current_q
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: One QM may contain multiple queues. Select specific queue to
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/current_q
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: One QM may contain multiple queues. Select specific queue to
show its debug registers in above regs.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/clear_enable
-Date: Sep 2019
-Contact: linux-crypto@vger.kernel.org
-Description: QM debug registers(regs) read clear control. 1 means enable
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/clear_enable
+Date: Sep 2019
+Contact: linux-crypto@vger.kernel.org
+Description: QM debug registers(regs) read clear control. 1 means enable
register read clear, otherwise 0.
Writing to this file has no functional effect, only enable or
disable counters clear after reading of these registers.
Only available for PF.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/err_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of invalid interrupts for
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/err_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of invalid interrupts for
QM task completion.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/aeq_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of QM async event queue interrupts.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/aeq_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of QM async event queue interrupts.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/abnormal_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of interrupts for QM abnormal event.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/abnormal_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of interrupts for QM abnormal event.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/create_qp_err
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of queue allocation errors.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/create_qp_err
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of queue allocation errors.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/mb_err
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of failed QM mailbox commands.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/mb_err
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of failed QM mailbox commands.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/status
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the status of the QM.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/status
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the status of the QM.
Four states: initiated, started, stopped and closed.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/send_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of sent requests.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/qm/diff_regs
+Date: Mar 2022
+Contact: linux-crypto@vger.kernel.org
+Description:	Read the current hardware values of the QM debug
+		registers (regs). This node shows how the qm register
+		values have changed and can help users track changes in
+		register values.
+
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/diff_regs
+Date: Mar 2022
+Contact: linux-crypto@vger.kernel.org
+Description:	Read the current hardware values of the HPRE debug
+		registers (regs). This node shows how the register
+		values have changed and can help users track changes in
+		register values.
+
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/send_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of sent requests.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/recv_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of received requests.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/recv_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of received requests.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/send_busy_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of requests sent
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/send_busy_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of requests sent
with returning busy.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/send_fail_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of completed but error requests.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/send_fail_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of completed but error requests.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/invalid_req_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of invalid requests being received.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/invalid_req_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of invalid requests being received.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/overtime_thrhld
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Set the threshold time for counting the request which is
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/overtime_thrhld
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Set the threshold time for counting the request which is
processed longer than the threshold.
0: disable(default), 1: 1 microsecond.
Available for both PF and VF, and take no other effect on HPRE.
-What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/over_thrhld_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of time out requests.
+What: /sys/kernel/debug/hisi_hpre/<bdf>/hpre_dfx/over_thrhld_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of time out requests.
Available for both PF and VF, and take no other effect on HPRE.
diff --git a/Documentation/ABI/testing/debugfs-hisi-sec b/Documentation/ABI/testing/debugfs-hisi-sec
index 85feb4408e0f..93c530d1bf0f 100644
--- a/Documentation/ABI/testing/debugfs-hisi-sec
+++ b/Documentation/ABI/testing/debugfs-hisi-sec
@@ -1,113 +1,137 @@
-What: /sys/kernel/debug/hisi_sec2/<bdf>/clear_enable
-Date: Oct 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Enabling/disabling of clear action after reading
+What: /sys/kernel/debug/hisi_sec2/<bdf>/clear_enable
+Date: Oct 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Enabling/disabling of clear action after reading
the SEC debug registers.
0: disable, 1: enable.
Only available for PF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/current_qm
-Date: Oct 2019
-Contact: linux-crypto@vger.kernel.org
-Description: One SEC controller has one PF and multiple VFs, each function
+What: /sys/kernel/debug/hisi_sec2/<bdf>/current_qm
+Date: Oct 2019
+Contact: linux-crypto@vger.kernel.org
+Description: One SEC controller has one PF and multiple VFs, each function
has a QM. This file can be used to select the QM which below
qm refers to.
Only available for PF.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/qm_regs
-Date: Oct 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Dump of QM related debug registers.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/alg_qos
+Date: Jun 2021
+Contact: linux-crypto@vger.kernel.org
+Description:	The <bdf> identifies the function, for both PF and VF.
+		The SEC driver supports configuring each function's QoS;
+		in the host, write the <bdf> and qos value to alg_qos,
+		e.g. "echo <bdf> value > alg_qos". The qos value is
+		1~1000, meaning 1/1000~1000/1000 of the total QoS.
+		Reading alg_qos returns the related QoS in the host and
+		VM, e.g. "cat alg_qos".
+
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/qm_regs
+Date: Oct 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Dump of QM related debug registers.
Available for PF and VF in host. VF in guest currently only
has one debug register.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/current_q
-Date: Oct 2019
-Contact: linux-crypto@vger.kernel.org
-Description: One QM of SEC may contain multiple queues. Select specific
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/current_q
+Date: Oct 2019
+Contact: linux-crypto@vger.kernel.org
+Description: One QM of SEC may contain multiple queues. Select specific
queue to show its debug registers in above 'regs'.
Only available for PF.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/clear_enable
-Date: Oct 2019
-Contact: linux-crypto@vger.kernel.org
-Description: Enabling/disabling of clear action after reading
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/clear_enable
+Date: Oct 2019
+Contact: linux-crypto@vger.kernel.org
+Description: Enabling/disabling of clear action after reading
the SEC's QM debug registers.
0: disable, 1: enable.
Only available for PF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/err_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of invalid interrupts for
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/err_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of invalid interrupts for
QM task completion.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/aeq_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of QM async event queue interrupts.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/aeq_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of QM async event queue interrupts.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/abnormal_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of interrupts for QM abnormal event.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/abnormal_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of interrupts for QM abnormal event.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/create_qp_err
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of queue allocation errors.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/create_qp_err
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of queue allocation errors.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/mb_err
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of failed QM mailbox commands.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/mb_err
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of failed QM mailbox commands.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/status
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the status of the QM.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/status
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the status of the QM.
Four states: initiated, started, stopped and closed.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/send_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of sent requests.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/qm/diff_regs
+Date: Mar 2022
+Contact: linux-crypto@vger.kernel.org
+Description:	Read the current hardware values of the QM debug
+		registers (regs). This node shows how the qm register
+		values have changed and can help users track changes in
+		register values.
+
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/diff_regs
+Date: Mar 2022
+Contact: linux-crypto@vger.kernel.org
+Description:	Read the current hardware values of the SEC debug
+		registers (regs). This node shows how the register
+		values have changed and can help users track changes in
+		register values.
+
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/send_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of sent requests.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/recv_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of received requests.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/recv_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of received requests.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/send_busy_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of requests sent with returning busy.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/send_busy_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of requests sent with returning busy.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/err_bd_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of BD type error requests
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/err_bd_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of BD type error requests
to be received.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/invalid_req_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of invalid requests being received.
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/invalid_req_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of invalid requests being received.
Available for both PF and VF, and take no other effect on SEC.
-What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/done_flag_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of completed but marked error requests
+What: /sys/kernel/debug/hisi_sec2/<bdf>/sec_dfx/done_flag_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of completed but marked error requests
to be received.
Available for both PF and VF, and take no other effect on SEC.
diff --git a/Documentation/ABI/testing/debugfs-hisi-zip b/Documentation/ABI/testing/debugfs-hisi-zip
index 3034a2bf99ca..fd3f314cf8d1 100644
--- a/Documentation/ABI/testing/debugfs-hisi-zip
+++ b/Documentation/ABI/testing/debugfs-hisi-zip
@@ -1,114 +1,138 @@
-What: /sys/kernel/debug/hisi_zip/<bdf>/comp_core[01]/regs
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: Dump of compression cores related debug registers.
+What: /sys/kernel/debug/hisi_zip/<bdf>/comp_core[01]/regs
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: Dump of compression cores related debug registers.
Only available for PF.
-What: /sys/kernel/debug/hisi_zip/<bdf>/decomp_core[0-5]/regs
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: Dump of decompression cores related debug registers.
+What: /sys/kernel/debug/hisi_zip/<bdf>/decomp_core[0-5]/regs
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: Dump of decompression cores related debug registers.
Only available for PF.
-What: /sys/kernel/debug/hisi_zip/<bdf>/clear_enable
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: Compression/decompression core debug registers read clear
+What: /sys/kernel/debug/hisi_zip/<bdf>/clear_enable
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: Compression/decompression core debug registers read clear
control. 1 means enable register read clear, otherwise 0.
Writing to this file has no functional effect, only enable or
disable counters clear after reading of these registers.
Only available for PF.
-What: /sys/kernel/debug/hisi_zip/<bdf>/current_qm
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: One ZIP controller has one PF and multiple VFs, each function
+What: /sys/kernel/debug/hisi_zip/<bdf>/current_qm
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: One ZIP controller has one PF and multiple VFs, each function
has a QM. Select the QM which below qm refers to.
Only available for PF.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/regs
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: Dump of QM related debug registers.
+What: /sys/kernel/debug/hisi_zip/<bdf>/alg_qos
+Date: Jun 2021
+Contact: linux-crypto@vger.kernel.org
+Description:	The <bdf> identifies the function, for both PF and VF.
+		The ZIP driver supports configuring each function's QoS;
+		in the host, write the <bdf> and qos value to alg_qos,
+		e.g. "echo <bdf> value > alg_qos". The qos value is
+		1~1000, meaning 1/1000~1000/1000 of the total QoS.
+		Reading alg_qos returns the related QoS in the host and
+		VM, e.g. "cat alg_qos".
+
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/regs
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: Dump of QM related debug registers.
Available for PF and VF in host. VF in guest currently only
has one debug register.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/current_q
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: One QM may contain multiple queues. Select specific queue to
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/current_q
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: One QM may contain multiple queues. Select specific queue to
show its debug registers in above regs.
Only available for PF.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/clear_enable
-Date: Nov 2018
-Contact: linux-crypto@vger.kernel.org
-Description: QM debug registers(regs) read clear control. 1 means enable
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/clear_enable
+Date: Nov 2018
+Contact: linux-crypto@vger.kernel.org
+Description: QM debug registers(regs) read clear control. 1 means enable
register read clear, otherwise 0.
Writing to this file has no functional effect, only enable or
disable counters clear after reading of these registers.
Only available for PF.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/err_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of invalid interrupts for
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/err_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of invalid interrupts for
QM task completion.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/aeq_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of QM async event queue interrupts.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/aeq_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of QM async event queue interrupts.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/abnormal_irq
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of interrupts for QM abnormal event.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/abnormal_irq
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of interrupts for QM abnormal event.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/create_qp_err
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of queue allocation errors.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/create_qp_err
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of queue allocation errors.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/mb_err
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the number of failed QM mailbox commands.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/mb_err
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the number of failed QM mailbox commands.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/qm/status
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the status of the QM.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/status
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the status of the QM.
Four states: initiated, started, stopped and closed.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/send_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of sent requests.
+What: /sys/kernel/debug/hisi_zip/<bdf>/qm/diff_regs
+Date: Mar 2022
+Contact: linux-crypto@vger.kernel.org
+Description:	Read the current hardware values of the QM debug
+		registers (regs). This node shows how the qm register
+		values have changed and can help users track changes in
+		register values.
+
+What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/diff_regs
+Date: Mar 2022
+Contact: linux-crypto@vger.kernel.org
+Description:	Read the current hardware values of the ZIP debug
+		registers (regs). This node shows how the register
+		values have changed and can help users track changes in
+		register values.
+
+What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/send_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of sent requests.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/recv_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of received requests.
+What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/recv_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of received requests.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/send_busy_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of requests received
+What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/send_busy_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description:	Dump the total number of requests sent
with returning busy.
Available for both PF and VF, and take no other effect on ZIP.
-What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/err_bd_cnt
-Date: Apr 2020
-Contact: linux-crypto@vger.kernel.org
-Description: Dump the total number of BD type error requests
+What: /sys/kernel/debug/hisi_zip/<bdf>/zip_dfx/err_bd_cnt
+Date: Apr 2020
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the total number of BD type error requests
to be received.
Available for both PF and VF, and take no other effect on ZIP.
diff --git a/Documentation/ABI/testing/debugfs-scmi b/Documentation/ABI/testing/debugfs-scmi
new file mode 100644
index 000000000000..ee7179ab2edf
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-scmi
@@ -0,0 +1,70 @@
+What: /sys/kernel/debug/scmi/<n>/instance_name
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: The name of the underlying SCMI instance <n> described by
+ all the debugfs accessors rooted at /sys/kernel/debug/scmi/<n>,
+ expressed as the full name of the top DT SCMI node under which
+ this SCMI instance is rooted.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/atomic_threshold_us
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: An optional time value, expressed in microseconds, representing,
+ on this SCMI instance <n>, the threshold above which any SCMI
+		command, advertised to have a higher-than-threshold execution
+ latency, should not be considered for atomic mode of operation,
+ even if requested.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/transport/type
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: A string representing the type of transport configured for this
+ SCMI instance <n>.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/transport/is_atomic
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: A boolean stating if the transport configured on the underlying
+ SCMI instance <n> is capable of atomic mode of operation.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/transport/max_rx_timeout_ms
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: Timeout in milliseconds allowed for SCMI synchronous replies
+ for the currently configured SCMI transport for instance <n>.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/transport/max_msg_size
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: Max message size of allowed SCMI messages for the currently
+ configured SCMI transport for instance <n>.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/transport/tx_max_msg
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: Max number of concurrently allowed in-flight SCMI messages for
+ the currently configured SCMI transport for instance <n> on the
+ TX channels.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/transport/rx_max_msg
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: Max number of concurrently allowed in-flight SCMI messages for
+ the currently configured SCMI transport for instance <n> on the
+ RX channels.
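+		Together with the other transport entries above, these can
+		be inspected in one go (the instance number 0 is
+		illustrative)::
+
+		    grep . /sys/kernel/debug/scmi/0/transport/*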
+Users: Debugging, any userspace test suite
diff --git a/Documentation/ABI/testing/debugfs-scmi-raw b/Documentation/ABI/testing/debugfs-scmi-raw
new file mode 100644
index 000000000000..97678cc9535c
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-scmi-raw
@@ -0,0 +1,117 @@
+What: /sys/kernel/debug/scmi/<n>/raw/message
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw synchronous message injection/snooping facility; write
+ a complete SCMI synchronous command message (header included)
+ in little-endian binary format to have it sent to the configured
+ backend SCMI server for instance <n>.
+ Any subsequently received response can be read from this same
+ entry if it arrived within the configured timeout.
+ Each write to the entry causes one command request to be built
+		and sent while the replies are read back one message at a time
+ (receiving an EOF at each message boundary).
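+		A minimal injection sketch; the four bytes below stand for
+		a 32-bit SCMI message header in little-endian order and are
+		a placeholder, not a valid command::
+
+		    printf '\x12\x34\x00\x00' > /sys/kernel/debug/scmi/0/raw/message
+		    xxd /sys/kernel/debug/scmi/0/raw/message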
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/raw/message_async
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw asynchronous message injection/snooping facility; write
+ a complete SCMI asynchronous command message (header included)
+ in little-endian binary format to have it sent to the configured
+ backend SCMI server for instance <n>.
+ Any subsequently received response can be read from this same
+ entry if it arrived within the configured timeout.
+ Any additional delayed response received afterwards can be read
+ from this same entry too if it arrived within the configured
+ timeout.
+ Each write to the entry causes one command request to be built
+		and sent while the replies are read back one message at a time
+ (receiving an EOF at each message boundary).
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/raw/errors
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw message errors facility; any kind of timed-out or
+ generally unexpectedly received SCMI message, for instance <n>,
+ can be read from this entry.
+		Each read gives back one message at a time (receiving an EOF at
+ each message boundary).
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/raw/notification
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw notification snooping facility; any notification
+ emitted by the backend SCMI server, for instance <n>, can be
+ read from this entry.
+		Each read gives back one message at a time (receiving an EOF at
+ each message boundary).
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/raw/reset
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw stack reset facility; writing a value to this entry
+ causes the internal queues of any kind of received message,
+ still pending to be read out for instance <n>, to be immediately
+ flushed.
+		Can be used to reset and clean the SCMI Raw stack between
+		two different test runs.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/raw/channels/<m>/message
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw synchronous message injection/snooping facility; write
+ a complete SCMI synchronous command message (header included)
+ in little-endian binary format to have it sent to the configured
+ backend SCMI server for instance <n> through the <m> transport
+ channel.
+ Any subsequently received response can be read from this same
+ entry if it arrived on channel <m> within the configured
+ timeout.
+ Each write to the entry causes one command request to be built
+		and sent while the replies are read back one message at a time
+ (receiving an EOF at each message boundary).
+ Channel identifier <m> matches the SCMI protocol number which
+ has been associated with this transport channel in the DT
+ description, with base protocol number 0x10 being the default
+ channel for this instance.
+ Note that these per-channel entries rooted at <..>/channels
+ exist only if the transport is configured to have more than
+ one default channel.
+Users: Debugging, any userspace test suite
+
+What: /sys/kernel/debug/scmi/<n>/raw/channels/<m>/message_async
+Date: March 2023
+KernelVersion: 6.3
+Contact: cristian.marussi@arm.com
+Description: SCMI Raw asynchronous message injection/snooping facility; write
+ a complete SCMI asynchronous command message (header included)
+ in little-endian binary format to have it sent to the configured
+ backend SCMI server for instance <n> through the <m> transport
+ channel.
+ Any subsequently received response can be read from this same
+ entry if it arrived on channel <m> within the configured
+ timeout.
+ Any additional delayed response received afterwards can be read
+ from this same entry too if it arrived within the configured
+ timeout.
+ Each write to the entry causes one command request to be built
+		and sent while the replies are read back one message at a time
+ (receiving an EOF at each message boundary).
+ Channel identifier <m> matches the SCMI protocol number which
+ has been associated with this transport channel in the DT
+ description, with base protocol number 0x10 being the default
+ channel for this instance.
+ Note that these per-channel entries rooted at <..>/channels
+ exist only if the transport is configured to have more than
+ one default channel.
+Users: Debugging, any userspace test suite
diff --git a/Documentation/ABI/testing/evm b/Documentation/ABI/testing/evm
index 553fd8a33e56..44750a933db4 100644
--- a/Documentation/ABI/testing/evm
+++ b/Documentation/ABI/testing/evm
@@ -1,4 +1,5 @@
-What: security/evm
+What: /sys/kernel/security/evm
+What: /sys/kernel/security/*/evm
Date: March 2011
Contact: Mimi Zohar <zohar@us.ibm.com>
Description:
@@ -93,7 +94,7 @@ Description:
core/ima-setup) have support for loading keys at boot
time.
-What: security/integrity/evm/evm_xattrs
+What: /sys/kernel/security/*/evm/evm_xattrs
Date: April 2018
Contact: Matthew Garrett <mjg59@google.com>
Description:
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index 5c2798534950..49db0ff288e5 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -1,4 +1,4 @@
-What: security/ima/policy
+What: /sys/kernel/security/*/ima/policy
Date: May 2008
Contact: Mimi Zohar <zohar@us.ibm.com>
Description:
@@ -22,31 +22,47 @@ Description:
action: measure | dont_measure | appraise | dont_appraise |
audit | hash | dont_hash
condition:= base | lsm [option]
- base: [[func=] [mask=] [fsmagic=] [fsuuid=] [uid=]
- [euid=] [fowner=] [fsname=]]
+ base: [[func=] [mask=] [fsmagic=] [fsuuid=] [fsname=]
+ [uid=] [euid=] [gid=] [egid=]
+ [fowner=] [fgroup=]]
lsm: [[subj_user=] [subj_role=] [subj_type=]
[obj_user=] [obj_role=] [obj_type=]]
- option: [[appraise_type=]] [template=] [permit_directio]
- [appraise_flag=] [appraise_algos=] [keyrings=]
+ option: [digest_type=] [template=] [permit_directio]
+ [appraise_type=] [appraise_flag=]
+ [appraise_algos=] [keyrings=]
base:
func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK]
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
[KEXEC_CMDLINE] [KEY_CHECK] [CRITICAL_DATA]
- [SETXATTR_CHECK]
+ [SETXATTR_CHECK][MMAP_CHECK_REQPROT]
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
[[^]MAY_EXEC]
fsmagic:= hex value
fsuuid:= file system UUID (e.g 8bcbe394-4f13-4144-be8e-5aa9ea2ce2f6)
uid:= decimal value
euid:= decimal value
+ gid:= decimal value
+ egid:= decimal value
fowner:= decimal value
+ fgroup:= decimal value
lsm: are LSM specific
option:
- appraise_type:= [imasig] [imasig|modsig]
+ appraise_type:= [imasig] | [imasig|modsig] | [sigv3]
+			where 'imasig' is the original signature format or
+				the signature format v2.
+ where 'modsig' is an appended signature,
+ where 'sigv3' is the signature format v3. (Currently
+ limited to fsverity digest based signatures
+ stored in security.ima xattr. Requires
+ specifying "digest_type=verity" first.)
+
appraise_flag:= [check_blacklist]
Currently, blacklist check is only for files signed with appended
signature.
+ digest_type:= verity
+ Require fs-verity's file digest instead of the
+ regular IMA file hash.
keyrings:= list of keyrings
(eg, .builtin_trusted_keys|.ima). Only valid
when action is "measure" and func is KEY_CHECK.
@@ -145,3 +161,30 @@ Description:
security.ima xattr of a file:
appraise func=SETXATTR_CHECK appraise_algos=sha256,sha384,sha512
+
+ Example of a 'measure' rule requiring fs-verity's digests
+ with indication of type of digest in the measurement list.
+
+ measure func=FILE_CHECK digest_type=verity \
+ template=ima-ngv2
+
+ Example of 'measure' and 'appraise' rules requiring fs-verity
+ signatures (format version 3) stored in security.ima xattr.
+
+ The 'measure' rule specifies the 'ima-sigv3' template option,
+ which includes the indication of type of digest and the file
+ signature in the measurement list.
+
+ measure func=BPRM_CHECK digest_type=verity \
+ template=ima-sigv3
+
+
+ The 'appraise' rule specifies the type and signature format
+ version (sigv3) required.
+
+ appraise func=BPRM_CHECK digest_type=verity \
+ appraise_type=sigv3
+
+ All of these policy rules could, for example, be constrained
+ either based on a filesystem's UUID (fsuuid) or based on LSM
+ labels.
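+
+		Example of a 'measure' rule constrained by file group
+		ownership, using the fgroup conditional described above
+		(the gid value is illustrative):
+
+		 measure func=FILE_CHECK fgroup=1000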
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
index a4e31c465194..b446a7154a1b 100644
--- a/Documentation/ABI/testing/procfs-smaps_rollup
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -22,6 +22,7 @@ Description:
MMUPageSize: 4 kB
Rss: 884 kB
Pss: 385 kB
+ Pss_Dirty: 68 kB
Pss_Anon: 301 kB
Pss_File: 80 kB
Pss_Shmem: 4 kB
diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore
index 5b02540781a2..d3cff4a7ee10 100644
--- a/Documentation/ABI/testing/pstore
+++ b/Documentation/ABI/testing/pstore
@@ -1,4 +1,5 @@
-What: /sys/fs/pstore/... (or /dev/pstore/...)
+What: /sys/fs/pstore/...
+What: /dev/pstore/...
Date: March 2011
KernelVersion: 2.6.39
Contact: tony.luck@intel.com
diff --git a/Documentation/ABI/testing/securityfs-secrets-coco b/Documentation/ABI/testing/securityfs-secrets-coco
new file mode 100644
index 000000000000..f2b6909155f9
--- /dev/null
+++ b/Documentation/ABI/testing/securityfs-secrets-coco
@@ -0,0 +1,51 @@
+What: security/secrets/coco
+Date: February 2022
+Contact: Dov Murik <dovmurik@linux.ibm.com>
+Description:
+ Exposes confidential computing (coco) EFI secrets to
+ userspace via securityfs.
+
+ EFI can declare a memory area used by confidential computing
+ platforms (such as AMD SEV and SEV-ES) for secret injection by
+ the Guest Owner during the VM's launch. The secrets are encrypted
+ by the Guest Owner and decrypted inside the trusted enclave,
+ and therefore are not readable by the untrusted host.
+
+ The efi_secret module exposes the secrets to userspace. Each
+ secret appears as a file under <securityfs>/secrets/coco,
+ where the filename is the GUID of the entry in the secrets
+ table. This module is loaded automatically by the EFI driver
+ if the EFI secret area is populated.
+
+ Two operations are supported for the files: read and unlink.
+ Reading the file returns the content of the secret entry.
+ Unlinking the file overwrites the secret data with zeroes and
+ removes the entry from the filesystem. A secret cannot be read
+ after it has been unlinked.
+
+ For example, listing the available secrets::
+
+ # modprobe efi_secret
+ # ls -l /sys/kernel/security/secrets/coco
+ -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b
+ -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6
+ -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2
+ -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910
+
+ Reading the secret data by reading a file::
+
+ # cat /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910
+ the-content-of-the-secret-data
+
+ Wiping a secret by unlinking a file::
+
+ # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910
+ # ls -l /sys/kernel/security/secrets/coco
+ -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b
+ -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6
+ -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2
+
+ Note: The binary format of the secrets table injected by the
+ Guest Owner is described in
+ drivers/virt/coco/efi_secret/efi_secret.c under "Structure of
+ the EFI secret area".
diff --git a/Documentation/ABI/testing/sysfs-amd-pmc b/Documentation/ABI/testing/sysfs-amd-pmc
new file mode 100644
index 000000000000..c421b72844f1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-amd-pmc
@@ -0,0 +1,13 @@
+What: /sys/bus/platform/drivers/amd_pmc/*/smu_fw_version
+Date: October 2022
+Contact: Mario Limonciello <mario.limonciello@amd.com>
+Description: Reading this file reports the version of the firmware loaded to
+ the System Management Unit (SMU) contained in AMD CPUs and
+ APUs.
+
+What: /sys/bus/platform/drivers/amd_pmc/*/smu_program
+Date: October 2022
+Contact: Mario Limonciello <mario.limonciello@amd.com>
+Description: Reading this file reports the program corresponding to the SMU
+ firmware version. The program field is used to disambiguate two
+ APU/CPU models that can share the same firmware binary.
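+
+ For example (output values are illustrative and vary by
+ platform):
+
+ # cat /sys/bus/platform/drivers/amd_pmc/*/smu_fw_version
+ 64.66.0
+ # cat /sys/bus/platform/drivers/amd_pmc/*/smu_program
+ 0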
diff --git a/Documentation/ABI/testing/sysfs-amd-pmf b/Documentation/ABI/testing/sysfs-amd-pmf
new file mode 100644
index 000000000000..7fc0e1c2b76b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-amd-pmf
@@ -0,0 +1,13 @@
+What: /sys/devices/platform/*/cnqf_enable
+Date: September 2022
+Contact: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
+Description: Reading this file tells whether the AMD Platform Management
+ Framework (PMF) Cool n Quiet Framework (CnQF) feature is enabled.
+
+ This feature is not enabled by default and is only turned on
+ if the OEM BIOS passes a "flag" to the PMF ACPI function (index
+ 11 or 12) or if the user writes "on".
+
+ To turn off CnQF, the user can write "off" to the sysfs node.
+ Note: Systems that support auto mode will not have this sysfs file
+ available.
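+
+ For example, using the wildcard path from above (a real
+ system has a specific platform device name):
+
+ # echo on > /sys/devices/platform/*/cnqf_enable
+ # cat /sys/devices/platform/*/cnqf_enable
+ on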
diff --git a/Documentation/ABI/testing/sysfs-ata b/Documentation/ABI/testing/sysfs-ata
index 9ab0ef1dd1c7..3daecac48964 100644
--- a/Documentation/ABI/testing/sysfs-ata
+++ b/Documentation/ABI/testing/sysfs-ata
@@ -1,4 +1,4 @@
-What: /sys/class/ata_...
+What: /sys/class/ata_*
Description:
Provide a place in sysfs for storing the ATA topology of the
system. This allows retrieving various information about ATA
@@ -107,13 +107,14 @@ Description:
described in ATA8 7.16 and 7.17. Only valid if
the device is not a PM.
- pio_mode: (RO) Transfer modes supported by the device when
- in PIO mode. Mostly used by PATA device.
+ pio_mode: (RO) PIO transfer mode used by the device.
+ Mostly used by PATA devices.
- xfer_mode: (RO) Current transfer mode
+ xfer_mode: (RO) Current transfer mode. Mostly used by
+ PATA devices.
- dma_mode: (RO) Transfer modes supported by the device when
- in DMA mode. Mostly used by PATA device.
+ dma_mode: (RO) DMA transfer mode used by the device.
+ Mostly used by PATA devices.
class: (RO) Device class. Can be "ata" for disk,
"atapi" for packet device, "pmp" for PM, or
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
deleted file mode 100644
index a0ed87386639..000000000000
--- a/Documentation/ABI/testing/sysfs-block
+++ /dev/null
@@ -1,330 +0,0 @@
-What: /sys/block/<disk>/stat
-Date: February 2008
-Contact: Jerome Marchand <jmarchan@redhat.com>
-Description:
- The /sys/block/<disk>/stat files displays the I/O
- statistics of disk <disk>. They contain 11 fields:
-
- == ==============================================
- 1 reads completed successfully
- 2 reads merged
- 3 sectors read
- 4 time spent reading (ms)
- 5 writes completed
- 6 writes merged
- 7 sectors written
- 8 time spent writing (ms)
- 9 I/Os currently in progress
- 10 time spent doing I/Os (ms)
- 11 weighted time spent doing I/Os (ms)
- 12 discards completed
- 13 discards merged
- 14 sectors discarded
- 15 time spent discarding (ms)
- 16 flush requests completed
- 17 time spent flushing (ms)
- == ==============================================
-
- For more details refer Documentation/admin-guide/iostats.rst
-
-
-What: /sys/block/<disk>/diskseq
-Date: February 2021
-Contact: Matteo Croce <mcroce@microsoft.com>
-Description:
- The /sys/block/<disk>/diskseq files reports the disk
- sequence number, which is a monotonically increasing
- number assigned to every drive.
- Some devices, like the loop device, refresh such number
- every time the backing file is changed.
- The value type is 64 bit unsigned.
-
-
-What: /sys/block/<disk>/<part>/stat
-Date: February 2008
-Contact: Jerome Marchand <jmarchan@redhat.com>
-Description:
- The /sys/block/<disk>/<part>/stat files display the
- I/O statistics of partition <part>. The format is the
- same as the above-written /sys/block/<disk>/stat
- format.
-
-
-What: /sys/block/<disk>/integrity/format
-Date: June 2008
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Metadata format for integrity capable block device.
- E.g. T10-DIF-TYPE1-CRC.
-
-
-What: /sys/block/<disk>/integrity/read_verify
-Date: June 2008
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Indicates whether the block layer should verify the
- integrity of read requests serviced by devices that
- support sending integrity metadata.
-
-
-What: /sys/block/<disk>/integrity/tag_size
-Date: June 2008
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Number of bytes of integrity tag space available per
- 512 bytes of data.
-
-
-What: /sys/block/<disk>/integrity/device_is_integrity_capable
-Date: July 2014
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Indicates whether a storage device is capable of storing
- integrity metadata. Set if the device is T10 PI-capable.
-
-What: /sys/block/<disk>/integrity/protection_interval_bytes
-Date: July 2015
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Describes the number of data bytes which are protected
- by one integrity tuple. Typically the device's logical
- block size.
-
-What: /sys/block/<disk>/integrity/write_generate
-Date: June 2008
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Indicates whether the block layer should automatically
- generate checksums for write requests bound for
- devices that support receiving integrity metadata.
-
-What: /sys/block/<disk>/alignment_offset
-Date: April 2009
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Storage devices may report a physical block size that is
- bigger than the logical block size (for instance a drive
- with 4KB physical sectors exposing 512-byte logical
- blocks to the operating system). This parameter
- indicates how many bytes the beginning of the device is
- offset from the disk's natural alignment.
-
-What: /sys/block/<disk>/<partition>/alignment_offset
-Date: April 2009
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Storage devices may report a physical block size that is
- bigger than the logical block size (for instance a drive
- with 4KB physical sectors exposing 512-byte logical
- blocks to the operating system). This parameter
- indicates how many bytes the beginning of the partition
- is offset from the disk's natural alignment.
-
-What: /sys/block/<disk>/queue/logical_block_size
-Date: May 2009
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- This is the smallest unit the storage device can
- address. It is typically 512 bytes.
-
-What: /sys/block/<disk>/queue/physical_block_size
-Date: May 2009
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- This is the smallest unit a physical storage device can
- write atomically. It is usually the same as the logical
- block size but may be bigger. One example is SATA
- drives with 4KB sectors that expose a 512-byte logical
- block size to the operating system. For stacked block
- devices the physical_block_size variable contains the
- maximum physical_block_size of the component devices.
-
-What: /sys/block/<disk>/queue/minimum_io_size
-Date: April 2009
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Storage devices may report a granularity or preferred
- minimum I/O size which is the smallest request the
- device can perform without incurring a performance
- penalty. For disk drives this is often the physical
- block size. For RAID arrays it is often the stripe
- chunk size. A properly aligned multiple of
- minimum_io_size is the preferred request size for
- workloads where a high number of I/O operations is
- desired.
-
-What: /sys/block/<disk>/queue/optimal_io_size
-Date: April 2009
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Storage devices may report an optimal I/O size, which is
- the device's preferred unit for sustained I/O. This is
- rarely reported for disk drives. For RAID arrays it is
- usually the stripe width or the internal track size. A
- properly aligned multiple of optimal_io_size is the
- preferred request size for workloads where sustained
- throughput is desired. If no optimal I/O size is
- reported this file contains 0.
-
-What: /sys/block/<disk>/queue/nomerges
-Date: January 2010
-Contact:
-Description:
- Standard I/O elevator operations include attempts to
- merge contiguous I/Os. For known random I/O loads these
- attempts will always fail and result in extra cycles
- being spent in the kernel. This allows one to turn off
- this behavior on one of two ways: When set to 1, complex
- merge checks are disabled, but the simple one-shot merges
- with the previous I/O request are enabled. When set to 2,
- all merge tries are disabled. The default value is 0 -
- which enables all types of merge tries.
-
-What: /sys/block/<disk>/discard_alignment
-Date: May 2011
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Devices that support discard functionality may
- internally allocate space in units that are bigger than
- the exported logical block size. The discard_alignment
- parameter indicates how many bytes the beginning of the
- device is offset from the internal allocation unit's
- natural alignment.
-
-What: /sys/block/<disk>/<partition>/discard_alignment
-Date: May 2011
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Devices that support discard functionality may
- internally allocate space in units that are bigger than
- the exported logical block size. The discard_alignment
- parameter indicates how many bytes the beginning of the
- partition is offset from the internal allocation unit's
- natural alignment.
-
-What: /sys/block/<disk>/queue/discard_granularity
-Date: May 2011
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Devices that support discard functionality may
- internally allocate space using units that are bigger
- than the logical block size. The discard_granularity
- parameter indicates the size of the internal allocation
- unit in bytes if reported by the device. Otherwise the
- discard_granularity will be set to match the device's
- physical block size. A discard_granularity of 0 means
- that the device does not support discard functionality.
-
-What: /sys/block/<disk>/queue/discard_max_bytes
-Date: May 2011
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Devices that support discard functionality may have
- internal limits on the number of bytes that can be
- trimmed or unmapped in a single operation. Some storage
- protocols also have inherent limits on the number of
- blocks that can be described in a single command. The
- discard_max_bytes parameter is set by the device driver
- to the maximum number of bytes that can be discarded in
- a single operation. Discard requests issued to the
- device must not exceed this limit. A discard_max_bytes
- value of 0 means that the device does not support
- discard functionality.
-
-What: /sys/block/<disk>/queue/discard_zeroes_data
-Date: May 2011
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Will always return 0. Don't rely on any specific behavior
- for discards, and don't read this file.
-
-What: /sys/block/<disk>/queue/write_same_max_bytes
-Date: January 2012
-Contact: Martin K. Petersen <martin.petersen@oracle.com>
-Description:
- Some devices support a write same operation in which a
- single data block can be written to a range of several
- contiguous blocks on storage. This can be used to wipe
- areas on disk or to initialize drives in a RAID
- configuration. write_same_max_bytes indicates how many
- bytes can be written in a single write same command. If
- write_same_max_bytes is 0, write same is not supported
- by the device.
-
-What: /sys/block/<disk>/queue/write_zeroes_max_bytes
-Date: November 2016
-Contact: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
-Description:
- Devices that support write zeroes operation in which a
- single request can be issued to zero out the range of
- contiguous blocks on storage without having any payload
- in the request. This can be used to optimize writing zeroes
- to the devices. write_zeroes_max_bytes indicates how many
- bytes can be written in a single write zeroes command. If
- write_zeroes_max_bytes is 0, write zeroes is not supported
- by the device.
-
-What: /sys/block/<disk>/queue/zoned
-Date: September 2016
-Contact: Damien Le Moal <damien.lemoal@wdc.com>
-Description:
- zoned indicates if the device is a zoned block device
- and the zone model of the device if it is indeed zoned.
- The possible values indicated by zoned are "none" for
- regular block devices and "host-aware" or "host-managed"
- for zoned block devices. The characteristics of
- host-aware and host-managed zoned block devices are
- described in the ZBC (Zoned Block Commands) and ZAC
- (Zoned Device ATA Command Set) standards. These standards
- also define the "drive-managed" zone model. However,
- since drive-managed zoned block devices do not support
- zone commands, they will be treated as regular block
- devices and zoned will report "none".
-
-What: /sys/block/<disk>/queue/nr_zones
-Date: November 2018
-Contact: Damien Le Moal <damien.lemoal@wdc.com>
-Description:
- nr_zones indicates the total number of zones of a zoned block
- device ("host-aware" or "host-managed" zone model). For regular
- block devices, the value is always 0.
-
-What: /sys/block/<disk>/queue/max_active_zones
-Date: July 2020
-Contact: Niklas Cassel <niklas.cassel@wdc.com>
-Description:
- For zoned block devices (zoned attribute indicating
- "host-managed" or "host-aware"), the sum of zones belonging to
- any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED,
- is limited by this value. If this value is 0, there is no limit.
-
-What: /sys/block/<disk>/queue/max_open_zones
-Date: July 2020
-Contact: Niklas Cassel <niklas.cassel@wdc.com>
-Description:
- For zoned block devices (zoned attribute indicating
- "host-managed" or "host-aware"), the sum of zones belonging to
- any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN,
- is limited by this value. If this value is 0, there is no limit.
-
-What: /sys/block/<disk>/queue/chunk_sectors
-Date: September 2016
-Contact: Hannes Reinecke <hare@suse.com>
-Description:
- chunk_sectors has different meaning depending on the type
- of the disk. For a RAID device (dm-raid), chunk_sectors
- indicates the size in 512B sectors of the RAID volume
- stripe segment. For a zoned block device, either
- host-aware or host-managed, chunk_sectors indicates the
- size in 512B sectors of the zones of the device, with
- the eventual exception of the last zone of the device
- which may be smaller.
-
-What: /sys/block/<disk>/queue/io_timeout
-Date: November 2018
-Contact: Weiping Zhang <zhangweiping@didiglobal.com>
-Description:
- io_timeout is the request timeout in milliseconds. If a request
- does not complete in this time then the block driver timeout
- handler is invoked. That timeout handler can decide to retry
- the request, to fail it or to start a device recovery strategy.
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 14b2bf2e5105..628a00fb20a9 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -137,3 +137,17 @@ Description:
The writeback_limit file is read-write and specifies the maximum
amount of writeback ZRAM can do. The limit could be changed
in run time.
+
+What: /sys/block/zram<id>/recomp_algorithm
+Date: November 2022
+Contact: Sergey Senozhatsky <senozhatsky@chromium.org>
+Description:
+ The recomp_algorithm file is read-write and allows setting
+ or showing the secondary compression algorithms.
+
+What: /sys/block/zram<id>/recompress
+Date: November 2022
+Contact: Sergey Senozhatsky <senozhatsky@chromium.org>
+Description:
+ The recompress file is write-only and triggers re-compression
+ with secondary compression algorithms.
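+
+ A hypothetical usage sketch, with the parameter syntax
+ described in Documentation/admin-guide/blockdev/zram.rst:
+
+ # echo "algo=zstd priority=1" > /sys/block/zram0/recomp_algorithm
+ # echo "type=idle" > /sys/block/zram0/recompress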
diff --git a/Documentation/ABI/testing/sysfs-bus-bcma b/Documentation/ABI/testing/sysfs-bus-bcma
index 721b4aea3020..e93d3ddca844 100644
--- a/Documentation/ABI/testing/sysfs-bus-bcma
+++ b/Documentation/ABI/testing/sysfs-bus-bcma
@@ -3,7 +3,7 @@ Date: May 2011
KernelVersion: 3.0
Contact: Rafał Miłecki <zajec5@gmail.com>
Description:
- Each BCMA core has it's manufacturer id. See
+ Each BCMA core has its manufacturer id. See
include/linux/bcma/bcma.h for possible values.
What: /sys/bus/bcma/devices/.../id
diff --git a/Documentation/ABI/testing/sysfs-bus-cdx b/Documentation/ABI/testing/sysfs-bus-cdx
new file mode 100644
index 000000000000..7af477f49998
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-cdx
@@ -0,0 +1,56 @@
+What: /sys/bus/cdx/rescan
+Date: March 2023
+Contact: nipun.gupta@amd.com
+Description:
+ Writing y/1/on to this file triggers a rescan of the CDX bus.
+ Any new devices are scanned and added to the list of Linux
+ devices and any devices removed are also deleted from Linux.
+
+ For example::
+
+ # echo 1 > /sys/bus/cdx/rescan
+
+What: /sys/bus/cdx/devices/.../vendor
+Date: March 2023
+Contact: nipun.gupta@amd.com
+Description:
+ Vendor ID for this CDX device, in hexadecimal. The Vendor ID
+ is a 16-bit identifier specific to the device manufacturer.
+ The combination of Vendor ID and Device ID identifies a device.
+
+What: /sys/bus/cdx/devices/.../device
+Date: March 2023
+Contact: nipun.gupta@amd.com
+Description:
+ Device ID for this CDX device, in hexadecimal. The Device ID
+ is a 16-bit identifier for a device type within the range of
+ a device manufacturer.
+ The combination of Vendor ID and Device ID identifies a device.
+
+What: /sys/bus/cdx/devices/.../reset
+Date: March 2023
+Contact: nipun.gupta@amd.com
+Description:
+ Writing y/1/on to this file resets the CDX device.
+ On resetting the device, the corresponding driver is notified
+ twice, once before the device is reset, and again after
+ the reset has completed.
+
+ For example::
+
+ # echo 1 > /sys/bus/cdx/.../reset
+
+What: /sys/bus/cdx/devices/.../remove
+Date: March 2023
+Contact: tarak.reddy@amd.com
+Description:
+ Writing y/1/on to this file removes the corresponding
+ device from the CDX bus. If the device is to be
+ reconfigured in the hardware, it can be removed, so
+ that the device driver does not access the device while it
+ is being reconfigured.
+
+ For example::
+
+ # echo 1 > /sys/bus/cdx/devices/.../remove
diff --git a/Documentation/ABI/testing/sysfs-bus-coreboot b/Documentation/ABI/testing/sysfs-bus-coreboot
new file mode 100644
index 000000000000..9c5accecc470
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coreboot
@@ -0,0 +1,45 @@
+What: /sys/bus/coreboot
+Date: August 2022
+Contact: Jack Rosenthal <jrosenth@chromium.org>
+Description:
+ The coreboot bus provides a variety of virtual devices used to
+ access data structures created by the Coreboot BIOS.
+
+What: /sys/bus/coreboot/devices/cbmem-<id>
+Date: August 2022
+Contact: Jack Rosenthal <jrosenth@chromium.org>
+Description:
+ CBMEM is a downwards-growing memory region created by Coreboot,
+ and contains tagged data structures to be shared with payloads
+ in the boot process and the OS. Each CBMEM entry is given a
+ directory in /sys/bus/coreboot/devices based on its id.
+ A list of ids known to Coreboot can be found in the coreboot
+ source tree at
+ ``src/commonlib/bsd/include/commonlib/bsd/cbmem_id.h``.
+
+What: /sys/bus/coreboot/devices/cbmem-<id>/address
+Date: August 2022
+Contact: Jack Rosenthal <jrosenth@chromium.org>
+Description:
+ This is the physical memory address that the CBMEM entry's data
+ begins at, in hexadecimal (e.g., ``0x76ffe000``).
+
+What: /sys/bus/coreboot/devices/cbmem-<id>/size
+Date: August 2022
+Contact: Jack Rosenthal <jrosenth@chromium.org>
+Description:
+ This is the size of the CBMEM entry's data, in hexadecimal
+ (e.g., ``0x1234``).
+
+What: /sys/bus/coreboot/devices/cbmem-<id>/mem
+Date: August 2022
+Contact: Jack Rosenthal <jrosenth@chromium.org>
+Description:
+ A file exposing read/write access to the entry's data. Note
+ that this file does not support mmap(), as coreboot
+ does not guarantee that the data will be page-aligned.
+
+ The mode of this file is 0600. While there shouldn't be
+ anything security-sensitive contained in CBMEM, read access
+ requires root privileges since this exposes a small subset
+ of physical memory.
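+
+ For example, inspecting a hypothetical CBMEM entry (the id
+ and address shown are illustrative):
+
+ # cat /sys/bus/coreboot/devices/cbmem-c0fec0de/address
+ 0x76ffe000
+ # hexdump -C /sys/bus/coreboot/devices/cbmem-c0fec0de/mem | head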
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm3x b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm3x
index 651602a61eac..234c33fbdb55 100644
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm3x
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm3x
@@ -236,7 +236,7 @@ What: /sys/bus/coresight/devices/<memory_map>.[etm|ptm]/traceid
Date: November 2014
KernelVersion: 3.19
Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
-Description: (RW) Holds the trace ID that will appear in the trace stream
+Description: (RO) Holds the trace ID that will appear in the trace stream
coming from this trace entity.
What: /sys/bus/coresight/devices/<memory_map>.[etm|ptm]/trigger_event
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
index 8e53a32f8150..08b1964f27d3 100644
--- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-etm4x
@@ -516,3 +516,11 @@ Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
Description: (Read) Returns the number of special conditional P1 right-hand keys
that the trace unit can use (0x194). The value is taken
directly from the HW.
+
+What: /sys/bus/coresight/devices/etm<N>/ts_source
+Date: October 2022
+KernelVersion: 6.1
+Contact: Mathieu Poirier <mathieu.poirier@linaro.org> or Suzuki K Poulose <suzuki.poulose@arm.com>
+Description: (Read) When FEAT_TRF is implemented, value of TRFCR_ELx.TS used for
+ trace session. Otherwise -1 indicates an unknown time source. Check
+ trcidr0.tssize to see if a global timestamp is available.
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
new file mode 100644
index 000000000000..4a58e649550d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm
@@ -0,0 +1,13 @@
+What: /sys/bus/coresight/devices/<tpdm-name>/integration_test
+Date: January 2023
+KernelVersion: 6.2
+Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com>
+Description:
+ (Write) Run the integration test for the TPDM. The integration
+ test generates test data for the TPDM, which helps to verify
+ that the trace path is enabled and the link configurations
+ are correct.
+
+ Accepts only one of two values - 1 or 2.
+ 1 : Generate 64-bit data
+ 2 : Generate 32-bit data
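+
+ For example, requesting 64-bit test data generation (the
+ device name is illustrative):
+
+ # echo 1 > /sys/bus/coresight/devices/tpdm0/integration_test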
diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-ultra_smb b/Documentation/ABI/testing/sysfs-bus-coresight-devices-ultra_smb
new file mode 100644
index 000000000000..f560918ae738
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-ultra_smb
@@ -0,0 +1,31 @@
+What: /sys/bus/coresight/devices/ultra_smb<N>/enable_sink
+Date: January 2023
+KernelVersion: 6.3
+Contact: Junhao He <hejunhao3@huawei.com>
+Description: (RW) Add/remove an SMB device from a trace path. There can be
+ multiple sources for a single SMB device.
+
+What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/buf_size
+Date: January 2023
+KernelVersion: 6.3
+Contact: Junhao He <hejunhao3@huawei.com>
+Description: (RO) Shows the buffer size of each UltraSoc SMB device.
+
+What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/buf_status
+Date: January 2023
+KernelVersion: 6.3
+Contact: Junhao He <hejunhao3@huawei.com>
+Description: (RO) Shows the value of the UltraSoc SMB status register.
+ A zero BIT(0) means the buffer is empty.
+
+What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/read_pos
+Date: January 2023
+KernelVersion: 6.3
+Contact: Junhao He <hejunhao3@huawei.com>
+Description: (RO) Shows the value of UltraSoc SMB Read Pointer register.
+
+What: /sys/bus/coresight/devices/ultra_smb<N>/mgmt/write_pos
+Date: January 2023
+KernelVersion: 6.3
+Contact: Junhao He <hejunhao3@huawei.com>
+Description: (RO) Shows the value of UltraSoc SMB Write Pointer register.
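+
+ For example, a hypothetical check of whether the SMB buffer
+ holds trace data (device instance and value are illustrative):
+
+ # cat /sys/bus/coresight/devices/ultra_smb0/mgmt/buf_status
+ 0x1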
diff --git a/Documentation/ABI/testing/sysfs-bus-counter b/Documentation/ABI/testing/sysfs-bus-counter
index 20fe5afd4f9e..1417c4272c6c 100644
--- a/Documentation/ABI/testing/sysfs-bus-counter
+++ b/Documentation/ABI/testing/sysfs-bus-counter
@@ -1,9 +1,45 @@
+What: /sys/bus/counter/devices/counterX/cascade_counts_enable
+KernelVersion: 6.4
+Contact: linux-iio@vger.kernel.org
+Description:
+ Indicates the cascading of Counts on Counter X.
+
+ Valid attribute values are boolean.
+
+What: /sys/bus/counter/devices/counterX/external_input_phase_clock_select
+KernelVersion: 6.4
+Contact: linux-iio@vger.kernel.org
+Description:
+ Selects the external clock pin for phase counting mode of
+ Counter X.
+
+ MTCLKA-MTCLKB:
+ MTCLKA and MTCLKB pins are selected for the external
+ phase clock.
+
+ MTCLKC-MTCLKD:
+ MTCLKC and MTCLKD pins are selected for the external
+ phase clock.
+
+What: /sys/bus/counter/devices/counterX/external_input_phase_clock_select_available
+KernelVersion: 6.4
+Contact: linux-iio@vger.kernel.org
+Description:
+ Discrete set of available values for the respective device
+ configuration are listed in this file.
+
What: /sys/bus/counter/devices/counterX/countY/count
KernelVersion: 5.2
Contact: linux-iio@vger.kernel.org
Description:
Count data of Count Y represented as a string.

+What: /sys/bus/counter/devices/counterX/countY/capture
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Historical capture of the Count Y count data.
+
What: /sys/bus/counter/devices/counterX/countY/ceiling
KernelVersion: 5.2
Contact: linux-iio@vger.kernel.org
@@ -203,6 +239,39 @@ Description:
both edges:
Any state transition.

+What: /sys/bus/counter/devices/counterX/countY/num_overflows
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ This attribute indicates the number of overflows of count Y.
+
+What: /sys/bus/counter/devices/counterX/cascade_counts_enable_component_id
+What: /sys/bus/counter/devices/counterX/external_input_phase_clock_select_component_id
+What: /sys/bus/counter/devices/counterX/countY/capture_component_id
+What: /sys/bus/counter/devices/counterX/countY/ceiling_component_id
+What: /sys/bus/counter/devices/counterX/countY/floor_component_id
+What: /sys/bus/counter/devices/counterX/countY/count_mode_component_id
+What: /sys/bus/counter/devices/counterX/countY/direction_component_id
+What: /sys/bus/counter/devices/counterX/countY/enable_component_id
+What: /sys/bus/counter/devices/counterX/countY/error_noise_component_id
+What: /sys/bus/counter/devices/counterX/countY/prescaler_component_id
+What: /sys/bus/counter/devices/counterX/countY/preset_component_id
+What: /sys/bus/counter/devices/counterX/countY/preset_enable_component_id
+What: /sys/bus/counter/devices/counterX/countY/signalZ_action_component_id
+What: /sys/bus/counter/devices/counterX/countY/num_overflows_component_id
+What: /sys/bus/counter/devices/counterX/signalY/cable_fault_component_id
+What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable_component_id
+What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler_component_id
+What: /sys/bus/counter/devices/counterX/signalY/index_polarity_component_id
+What: /sys/bus/counter/devices/counterX/signalY/polarity_component_id
+What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_component_id
+What: /sys/bus/counter/devices/counterX/signalY/frequency_component_id
+KernelVersion: 5.16
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read-only attribute that indicates the component ID of the
+ respective extension or Synapse.
+
What: /sys/bus/counter/devices/counterX/countY/spike_filter_ns
KernelVersion: 5.14
Contact: linux-iio@vger.kernel.org
@@ -212,6 +281,14 @@ Description:
shorter or equal to configured value are ignored. Value 0 means
filter is disabled.

+What: /sys/bus/counter/devices/counterX/events_queue_size
+KernelVersion: 5.16
+Contact: linux-iio@vger.kernel.org
+Description:
+ Size of the Counter events queue in number of struct
+ counter_event data structures. The number of elements will be
+ rounded-up to a power of 2.
+
What: /sys/bus/counter/devices/counterX/name
KernelVersion: 5.2
Contact: linux-iio@vger.kernel.org
@@ -274,6 +351,19 @@ Description:
Discrete set of available values for the respective Signal Y
configuration are listed in this file.

+What: /sys/bus/counter/devices/counterX/signalY/polarity
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Active level of Signal Y. The following polarity values are
+ available:
+
+ positive:
+ Signal high state considered active level (rising edge).
+
+ negative:
+ Signal low state considered active level (falling edge).
+
What: /sys/bus/counter/devices/counterX/signalY/name
KernelVersion: 5.2
Contact: linux-iio@vger.kernel.org
@@ -286,7 +376,14 @@ What: /sys/bus/counter/devices/counterX/signalY/signal
KernelVersion: 5.2
Contact: linux-iio@vger.kernel.org
Description:
- Signal data of Signal Y represented as a string.
+ Signal level state of Signal Y. The following signal level
+ states are available:
+
+ low:
+ Low level state.
+
+ high:
+ High level state.

What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode
KernelVersion: 5.2
@@ -309,3 +406,9 @@ Description:
via index_polarity. The index function (as enabled via
preset_enable) is performed synchronously with the
quadrature clock on the active level of the index input.
+
+What: /sys/bus/counter/devices/counterX/signalY/frequency
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read-only attribute that indicates the signal Y frequency, in Hz.
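+
+ An illustrative sketch of exercising some of these attributes
+ from the shell (device, Count and Signal numbering depend on
+ the hardware):
+
+ # cat /sys/bus/counter/devices/counter0/count0/count
+ 42
+ # cat /sys/bus/counter/devices/counter0/signal0/polarity
+ positive
+ # echo 64 > /sys/bus/counter/devices/counter0/events_queue_size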
diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css
index 12a733fe357f..d4d5cfb63b90 100644
--- a/Documentation/ABI/testing/sysfs-bus-css
+++ b/Documentation/ABI/testing/sysfs-bus-css
@@ -1,22 +1,19 @@
What: /sys/bus/css/devices/.../type
Date: March 2008
-Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
- linux-s390@vger.kernel.org
+Contact: linux-s390@vger.kernel.org
Description: Contains the subchannel type, as reported by the hardware.
This attribute is present for all subchannel types.
What: /sys/bus/css/devices/.../modalias
Date: March 2008
-Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
- linux-s390@vger.kernel.org
+Contact: linux-s390@vger.kernel.org
Description: Contains the module alias as reported with uevents.
It is of the format css:t<type> and present for all
subchannel types.
What: /sys/bus/css/drivers/io_subchannel/.../chpids
Date: December 2002
-Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
- linux-s390@vger.kernel.org
+Contact: linux-s390@vger.kernel.org
Description: Contains the ids of the channel paths used by this
subchannel, as reported by the channel subsystem
during subchannel recognition.
@@ -26,8 +23,7 @@ Users: s390-tools, HAL
What: /sys/bus/css/drivers/io_subchannel/.../pimpampom
Date: December 2002
-Contact: Cornelia Huck <cornelia.huck@de.ibm.com>
- linux-s390@vger.kernel.org
+Contact: linux-s390@vger.kernel.org
Description: Contains the PIM/PAM/POM values, as reported by the
channel subsystem when last queried by the common I/O
layer (this implies that this attribute is not necessarily
@@ -38,8 +34,7 @@ Users: s390-tools, HAL
What: /sys/bus/css/devices/.../driver_override
Date: June 2019
-Contact: Cornelia Huck <cohuck@redhat.com>
- linux-s390@vger.kernel.org
+Contact: linux-s390@vger.kernel.org
Description: This file allows the driver for a device to be specified. When
specified, only a driver with a name matching the value written
to driver_override will have an opportunity to bind to the
diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 0b6a2e6e8fbb..48ac0d911801 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -1,3 +1,13 @@
+What: /sys/bus/cxl/flush
+Date: January, 2022
+KernelVersion: v5.18
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (WO) If userspace manually unbinds a port, the kernel schedules
+ all descendant memdevs for unbind. Writing '1' to this attribute
+ flushes that work.
+
+
What: /sys/bus/cxl/devices/memX/firmware_version
Date: December, 2020
KernelVersion: v5.12
@@ -7,6 +17,7 @@ Description:
Memory Device Output Payload in the CXL-2.0
specification.
+
What: /sys/bus/cxl/devices/memX/ram/size
Date: December, 2020
KernelVersion: v5.12
@@ -16,6 +27,7 @@ Description:
identically named field in the Identify Memory Device Output
Payload in the CXL-2.0 specification.
+
What: /sys/bus/cxl/devices/memX/pmem/size
Date: December, 2020
KernelVersion: v5.12
@@ -25,105 +37,395 @@ Description:
identically named field in the Identify Memory Device Output
Payload in the CXL-2.0 specification.
+
+What: /sys/bus/cxl/devices/memX/serial
+Date: January, 2022
+KernelVersion: v5.18
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) 64-bit serial number per the PCIe Device Serial Number
+ capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2
+ Memory Device PCIe Capabilities and Extended Capabilities.
+
+
+What: /sys/bus/cxl/devices/memX/numa_node
+Date: January, 2022
+KernelVersion: v5.18
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) If NUMA is enabled and the platform has affinitized the
+ host PCI device for this memory device, emit the CPU node
+ affinity for this device.
+
+
What: /sys/bus/cxl/devices/*/devtype
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- CXL device objects export the devtype attribute which mirrors
- the same value communicated in the DEVTYPE environment variable
- for uevents for devices on the "cxl" bus.
+ (RO) CXL device objects export the devtype attribute which
+ mirrors the same value communicated in the DEVTYPE environment
+ variable for uevents for devices on the "cxl" bus.
+
+
+What: /sys/bus/cxl/devices/*/modalias
+Date: December, 2021
+KernelVersion: v5.18
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) CXL device objects export the modalias attribute which
+ mirrors the same value communicated in the MODALIAS environment
+ variable for uevents for devices on the "cxl" bus.
+
What: /sys/bus/cxl/devices/portX/uport
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- CXL port objects are enumerated from either a platform firmware
- device (ACPI0017 and ACPI0016) or PCIe switch upstream port with
- CXL component registers. The 'uport' symlink connects the CXL
- portX object to the device that published the CXL port
+ (RO) CXL port objects are enumerated from either a platform
+ firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream
+ port with CXL component registers. The 'uport' symlink connects
+ the CXL portX object to the device that published the CXL port
capability.
+
+What: /sys/bus/cxl/devices/{port,endpoint}X/parent_dport
+Date: January, 2023
+KernelVersion: v6.3
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) CXL port objects are instantiated for each upstream port in
+ a CXL/PCIe switch, and for each endpoint to map the
+ corresponding memory device into the CXL port hierarchy. When a
+ descendant CXL port (switch or endpoint) is enumerated it is
+ useful to know which 'dport' object in the parent CXL port
+ routes to this descendant. The 'parent_dport' symlink points to
+ the device representing the downstream port of a CXL switch that
+ routes to {port,endpoint}X.
+
+
What: /sys/bus/cxl/devices/portX/dportY
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- CXL port objects are enumerated from either a platform firmware
- device (ACPI0017 and ACPI0016) or PCIe switch upstream port with
- CXL component registers. The 'dportY' symlink identifies one or
- more downstream ports that the upstream port may target in its
- decode of CXL memory resources. The 'Y' integer reflects the
- hardware port unique-id used in the hardware decoder target
- list.
+ (RO) CXL port objects are enumerated from either a platform
+ firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream
+ port with CXL component registers. The 'dportY' symlink
+ identifies one or more downstream ports that the upstream port
+ may target in its decode of CXL memory resources. The 'Y'
+ integer reflects the hardware port unique-id used in the
+ hardware decoder target list.
+
What: /sys/bus/cxl/devices/decoderX.Y
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- CXL decoder objects are enumerated from either a platform
+ (RO) CXL decoder objects are enumerated from either a platform
firmware description, or a CXL HDM decoder register set in a
PCIe device (see CXL 2.0 section 8.2.5.12 CXL HDM Decoder
Capability Structure). The 'X' in decoderX.Y represents the
cxl_port container of this decoder, and 'Y' represents the
instance id of a given decoder resource.
+
What: /sys/bus/cxl/devices/decoderX.Y/{start,size}
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- The 'start' and 'size' attributes together convey the physical
- address base and number of bytes mapped in the decoder's decode
- window. For decoders of devtype "cxl_decoder_root" the address
- range is fixed. For decoders of devtype "cxl_decoder_switch" the
- address is bounded by the decode range of the cxl_port ancestor
- of the decoder's cxl_port, and dynamically updates based on the
- active memory regions in that address space.
+ (RO) The 'start' and 'size' attributes together convey the
+ physical address base and number of bytes mapped in the
+ decoder's decode window. For decoders of devtype
+ "cxl_decoder_root" the address range is fixed. For decoders of
+ devtype "cxl_decoder_switch" the address is bounded by the
+ decode range of the cxl_port ancestor of the decoder's cxl_port,
+ and dynamically updates based on the active memory regions in
+ that address space.
+
What: /sys/bus/cxl/devices/decoderX.Y/locked
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- CXL HDM decoders have the capability to lock the configuration
- until the next device reset. For decoders of devtype
- "cxl_decoder_root" there is no standard facility to unlock them.
- For decoders of devtype "cxl_decoder_switch" a secondary bus
- reset, of the PCIe bridge that provides the bus for this
- decoders uport, unlocks / resets the decoder.
+ (RO) CXL HDM decoders have the capability to lock the
+ configuration until the next device reset. For decoders of
+ devtype "cxl_decoder_root" there is no standard facility to
+ unlock them. For decoders of devtype "cxl_decoder_switch" a
+ secondary bus reset, of the PCIe bridge that provides the bus
+ for this decoder's uport, unlocks / resets the decoder.
+
What: /sys/bus/cxl/devices/decoderX.Y/target_list
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- Display a comma separated list of the current decoder target
- configuration. The list is ordered by the current configured
- interleave order of the decoder's dport instances. Each entry in
- the list is a dport id.
+ (RO) Display a comma separated list of the current decoder
+ target configuration. The list is ordered by the current
+ configured interleave order of the decoder's dport instances.
+ Each entry in the list is a dport id.
+
What: /sys/bus/cxl/devices/decoderX.Y/cap_{pmem,ram,type2,type3}
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- When a CXL decoder is of devtype "cxl_decoder_root", it
+ (RO) When a CXL decoder is of devtype "cxl_decoder_root", it
represents a fixed memory window identified by platform
firmware. A fixed window may only support a subset of memory
types. The 'cap_*' attributes indicate whether persistent
memory, volatile memory, accelerator memory, and / or expander
memory may be mapped behind this decoder's memory window.
+
What: /sys/bus/cxl/devices/decoderX.Y/target_type
Date: June, 2021
KernelVersion: v5.14
Contact: linux-cxl@vger.kernel.org
Description:
- When a CXL decoder is of devtype "cxl_decoder_switch", it can
- optionally decode either accelerator memory (type-2) or expander
- memory (type-3). The 'target_type' attribute indicates the
- current setting which may dynamically change based on what
+ (RO) When a CXL decoder is of devtype "cxl_decoder_switch", it
+ can optionally decode either accelerator memory (type-2) or
+ expander memory (type-3). The 'target_type' attribute indicates
+ the current setting which may dynamically change based on what
memory regions are activated in this decode hierarchy.
+
+
+What: /sys/bus/cxl/devices/endpointX/CDAT
+Date: July, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) If this sysfs entry is not present, no DOE mailbox was
+ found to support CDAT data. If it is present and the length of
+ the data is 0, reading the CDAT data failed. Otherwise the CDAT
+ data is reported.
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/mode
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
+ translates from a host physical address range, to a device local
+ address range. Device-local address ranges are further split
+ into a 'ram' (volatile memory) range and 'pmem' (persistent
+ memory) range. The 'mode' attribute emits one of 'ram', 'pmem',
+ 'mixed', or 'none'. The 'mixed' indication is for error cases
+ when a decoder straddles the volatile/persistent partition
+ boundary, and 'none' indicates the decoder is not actively
+ decoding, or no DPA allocation policy has been set.
+
+ 'mode' can be written, when the decoder is in the 'disabled'
+ state, with either 'ram' or 'pmem' to set the boundaries for the
+ next allocation.
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/dpa_resource
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) When a CXL decoder is of devtype "cxl_decoder_endpoint",
+ and its 'dpa_size' attribute is non-zero, this attribute
+ indicates the device physical address (DPA) base address of the
+ allocation.
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/dpa_size
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
+ translates from a host physical address range, to a device local
+ address range. The range, base address plus length in bytes, of
+ DPA allocated to this decoder is conveyed in these 2 attributes.
+ Allocations can be mutated as long as the decoder is in the
+ disabled state. A write to 'dpa_size' releases the previous DPA
+ allocation and then attempts to allocate from the free capacity
+ in the device partition referred to by 'decoderX.Y/mode'.
+ Allocate and free requests can only be performed on the highest
+ instance number disabled decoder with non-zero size. I.e.
+ allocations are enforced to occur in increasing 'decoderX.Y/id'
+ order and frees are enforced to occur in decreasing
+ 'decoderX.Y/id' order.
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/interleave_ways
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The number of targets across which this decoder's host
+ physical address (HPA) memory range is interleaved. The device
+ maps every Nth block of HPA (of size ==
+ 'interleave_granularity') to consecutive DPA addresses. The
+ decoder's position in the interleave is determined by the
+ device's (endpoint or switch) switch ancestry. For root
+ decoders their interleave is specified by platform firmware and
+ they only specify a downstream target order for host bridges.
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/interleave_granularity
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The number of consecutive bytes of host physical address
+ space this decoder claims at address N before the decode rotates
+ to the next target in the interleave at address N +
+ interleave_granularity (assuming N is aligned to
+ interleave_granularity).
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/create_{pmem,ram}_region
+Date: May, 2022, January, 2023
+KernelVersion: v6.0 (pmem), v6.3 (ram)
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) Write a string in the form 'regionZ' to start the process
+ of defining a new persistent, or volatile memory region
+ (interleave-set) within the decode range bounded by root decoder
+ 'decoderX.Y'. The value written must match the current value
+ returned from reading this attribute. An atomic compare exchange
+ operation is done on write to assign the requested id to a
+ region and allocate the region-id for the next creation attempt.
+ EBUSY is returned if the region name written does not match the
+ current cached value.
+
+
+What: /sys/bus/cxl/devices/decoderX.Y/delete_region
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (WO) Write a string in the form 'regionZ' to delete that region,
+ provided it is currently idle / not bound to a driver.
+
+
+What: /sys/bus/cxl/devices/regionZ/uuid
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) Write a unique identifier for the region. This field must
+ be set for persistent regions and it must not conflict with the
+ UUID of another region. For volatile ram regions this
+ attribute is a read-only empty string.
+
+
+What: /sys/bus/cxl/devices/regionZ/interleave_granularity
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) Set the number of consecutive bytes each device in the
+ interleave set will claim. The possible interleave granularity
+ values are determined by the CXL spec and the participating
+ devices.
+
+
+What: /sys/bus/cxl/devices/regionZ/interleave_ways
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) The number of devices participating in the region is
+ set by writing this value. Each device will provide
+ 1/interleave_ways of storage for the region.
+
+
+What: /sys/bus/cxl/devices/regionZ/size
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) System physical address space to be consumed by the region.
+ When written, triggers the driver to allocate space out of the
+ parent root decoder's address space. When read, the size of the
+ address space is reported and should match the span of the
+ region's resource attribute. Size shall be set after the
+ interleave configuration parameters. Once set it cannot be
+ changed, only freed by writing 0. The kernel makes no guarantees
+ that data is maintained over an address space freeing event, and
+ there is no guarantee that a free followed by an allocate
+ results in the same address being allocated.
+
+
+What: /sys/bus/cxl/devices/regionZ/mode
+Date: January, 2023
+KernelVersion: v6.3
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) The mode of a region is established at region creation time
+ and dictates the mode of the endpoint decoders that comprise the
+ region. For more details on the possible modes see
+ /sys/bus/cxl/devices/decoderX.Y/mode
+
+
+What: /sys/bus/cxl/devices/regionZ/resource
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RO) A region is a contiguous partition of a CXL root decoder
+ address space. Region capacity is allocated by writing to the
+ size attribute, the resulting physical address space determined
+ by the driver is reflected here. It is therefore not useful to
+ read this before writing a value to the size attribute.
+
+
+What: /sys/bus/cxl/devices/regionZ/target[0..N]
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) Write an endpoint decoder object name to 'targetX' where X
+ is the intended position of the endpoint device in the region
+ interleave and N is the 'interleave_ways' setting for the
+ region. ENXIO is returned if the write results in an impossible
+ to map decode scenario, like the endpoint is unreachable at that
+ position relative to the root decoder interleave. EBUSY is
+ returned if the position in the region is already occupied, or
+ if the region is not in a state to accept interleave
+ configuration changes. EINVAL is returned if the object name is
+ not an endpoint decoder. Once all positions have been
+ successfully written a final validation for decode conflicts is
+ performed before activating the region.
+
+
+What: /sys/bus/cxl/devices/regionZ/commit
+Date: May, 2022
+KernelVersion: v6.0
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (RW) Write a boolean 'true' string value to this attribute to
+ trigger the region to transition from the software programmed
+ state to the actively decoding in hardware state. The commit
+ operation in addition to validating that the region is in proper
+ configured state, validates that the decoders are being
+ committed in spec mandated order (last committed decoder id +
+ 1), and checks that the hardware accepts the commit request.
+ Reading this value indicates whether the region is committed or
+ not.
+
+
+What: /sys/bus/cxl/devices/memX/trigger_poison_list
+Date: April, 2023
+KernelVersion: v6.4
+Contact: linux-cxl@vger.kernel.org
+Description:
+ (WO) When a boolean 'true' is written to this attribute the
+ memdev driver retrieves the poison list from the device. The
+ list consists of addresses that are poisoned, or would result
+ in poison if accessed, and the source of the poison. This
+ attribute is only visible for devices supporting the
+ capability. The retrieved errors are logged as kernel
+ events when cxl_poison event tracing is enabled.
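+
+ A hypothetical end-to-end sketch of assembling and committing
+ a 256MB single-way pmem region (the decoder names are
+ illustrative):
+
+ # region=$(cat /sys/bus/cxl/devices/decoder0.0/create_pmem_region)
+ # echo $region > /sys/bus/cxl/devices/decoder0.0/create_pmem_region
+ # cd /sys/bus/cxl/devices/$region
+ # uuidgen > uuid
+ # echo 256 > interleave_granularity
+ # echo 1 > interleave_ways
+ # echo $((256 << 20)) > size
+ # echo decoder2.0 > target0
+ # echo 1 > commit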
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
new file mode 100644
index 000000000000..8757dcf41c08
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
@@ -0,0 +1,18 @@
+What: /sys/bus/event_source/devices/<dev>/caps
+Date: May 2022
+KernelVersion: 5.19
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:
+ Attribute group describing the capabilities exposed
+ for a particular PMU. Each attribute of this group can
+ expose information specific to a PMU, say pmu_name, so that
+ userspace can understand some of the features which the
+ platform-specific PMU supports.
+
+ One example of an available capability on supported platforms
+ such as Intel is pmu_name, which exposes the underlying CPU
+ name known to the PMU driver.
+
+ Example output on powerpc:
+ grep . /sys/bus/event_source/devices/cpu/caps/*
+ /sys/bus/event_source/devices/cpu/caps/pmu_name:POWER9
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
new file mode 100644
index 000000000000..d7af4919302e
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-iommu
@@ -0,0 +1,37 @@
+What: /sys/bus/event_source/devices/dmar*/format
+Date: Jan 2023
+KernelVersion: 6.3
+Contact: Kan Liang <kan.liang@linux.intel.com>
+Description: Read-only. Attribute group to describe the magic bits
+ that go into perf_event_attr.config,
+ perf_event_attr.config1 or perf_event_attr.config2 for
+ the IOMMU pmu. (See also
+ ABI/testing/sysfs-bus-event_source-devices-format).
+
+ Each attribute in this group defines a bit range in
+ perf_event_attr.config, perf_event_attr.config1,
+ or perf_event_attr.config2. All supported attributes
+ are listed below (See the VT-d Spec 4.0 for possible
+ attribute values)::
+
+ event = "config:0-27" - event ID
+ event_group = "config:28-31" - event group ID
+
+ filter_requester_en = "config1:0" - Enable Requester ID filter
+ filter_domain_en = "config1:1" - Enable Domain ID filter
+ filter_pasid_en = "config1:2" - Enable PASID filter
+ filter_ats_en = "config1:3" - Enable Address Type filter
+ filter_page_table_en= "config1:4" - Enable Page Table Level filter
+ filter_requester_id = "config1:16-31" - Requester ID filter
+ filter_domain = "config1:32-47" - Domain ID filter
+ filter_pasid = "config2:0-21" - PASID filter
+ filter_ats = "config2:24-28" - Address Type filter
+ filter_page_table = "config2:32-36" - Page Table Level filter
+
+What: /sys/bus/event_source/devices/dmar*/cpumask
+Date: Jan 2023
+KernelVersion: 6.3
+Contact: Kan Liang <kan.liang@linux.intel.com>
+Description: Read-only. This file always returns the CPU to which the
+ IOMMU pmu is bound for access to all IOMMU pmu performance
+ monitoring events.
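+
+ A hypothetical perf invocation built from these format fields
+ (the event and filter values are illustrative, not taken from
+ the spec):
+
+ # perf stat -a -e \
+ dmar0/event=0x1,event_group=0x0,filter_domain_en=0x1,filter_domain=0x10/ \
+ -- sleep 1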
diff --git a/Documentation/ABI/testing/sysfs-bus-fcoe b/Documentation/ABI/testing/sysfs-bus-fcoe
index 8fe787cc4ab7..5a4f2091ac37 100644
--- a/Documentation/ABI/testing/sysfs-bus-fcoe
+++ b/Documentation/ABI/testing/sysfs-bus-fcoe
@@ -31,7 +31,7 @@ Description: 'FCoE Controller' instances on the fcoe bus.
1) Write interface name to ctlr_create 2) Configure the FCoE
Controller (ctlr_X) 3) Enable the FCoE Controller to begin
discovery and login. The FCoE Controller is destroyed by
- writing it's name, i.e. ctlr_X to the ctlr_delete file.
+ writing its name, i.e. ctlr_X to the ctlr_delete file.
Attributes:
diff --git a/Documentation/ABI/testing/sysfs-bus-fsi-devices-sbefifo b/Documentation/ABI/testing/sysfs-bus-fsi-devices-sbefifo
new file mode 100644
index 000000000000..531fe9d6b40a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-fsi-devices-sbefifo
@@ -0,0 +1,10 @@
+What: /sys/bus/fsi/devices/XX.XX.00:06/sbefifoX/timeout
+KernelVersion: 5.15
+Contact: eajames@linux.ibm.com
+Description:
+ Indicates whether or not this SBE device has experienced a
+ timeout; i.e. the SBE did not respond within the time allotted
+ by the driver. A value of 1 indicates that a timeout has
+ occurred and no transfers have completed since the timeout. A
+ value of 0 indicates that no timeout has occurred, or if one
+ has, more recent transfers have completed successfully.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 6ad47a67521c..7140e8e7313f 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -79,6 +79,11 @@ Description:
* "accel-base"
* "accel-display"
+ For devices where an accelerometer is housed in the swivel camera subassembly
+ (for AR applications), the following standardized label is used:
+
+ * "accel-camera"
+
What: /sys/bus/iio/devices/iio:deviceX/current_timestamp_clock
KernelVersion: 4.5
Contact: linux-iio@vger.kernel.org
@@ -102,6 +107,9 @@ Description:
relevant directories. If it affects all of the above
then it is to be found in the base device directory.
+ The stm32-timer-trigger has the additional characteristic that
+ a sampling_frequency of 0 is defined to stop sampling.
+
What: /sys/bus/iio/devices/iio:deviceX/sampling_frequency_available
What: /sys/bus/iio/devices/iio:deviceX/in_intensity_sampling_frequency_available
What: /sys/bus/iio/devices/iio:deviceX/in_proximity_sampling_frequency_available
@@ -188,7 +196,7 @@ Description:
Raw capacitance measurement from channel Y. Units after
application of scale and offset are nanofarads.
-What: /sys/.../iio:deviceX/in_capacitanceY-in_capacitanceZ_raw
+What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_raw
KernelVersion: 3.2
Contact: linux-iio@vger.kernel.org
Description:
@@ -199,6 +207,25 @@ Description:
is required is a consistent labeling. Units after application
of scale and offset are nanofarads.
+What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_zeropoint
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ For differential channels, this is an offset that is applied
+ equally to both inputs. As the reading is of the difference
+ between the two inputs, this should not be applied to the _raw
+ reading by userspace (unlike _offset) and, unlike calibbias,
+ it does not affect the differential value measured, because
+ the effect of _zeropoint cancels out across the two inputs
+ that make up the differential pair. Its purpose is to bring
+ the individual signals, before the differential is measured,
+ within the measurement range of the device. The naming is
+ chosen because if the separate inputs that make the
+ differential pair are drawn on a graph in their
+ _raw units, this is the value that the zero point on the
+ measurement axis represents. It is expressed with the
+ same scaling as _raw.
+
What: /sys/bus/iio/devices/iio:deviceX/in_temp_raw
What: /sys/bus/iio/devices/iio:deviceX/in_tempX_raw
What: /sys/bus/iio/devices/iio:deviceX/in_temp_x_raw
@@ -233,6 +260,15 @@ Description:
Has all of the equivalent parameters as per voltageY. Units
after application of scale and offset are m/s^2.
+What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_x_raw
+What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_y_raw
+What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_z_raw
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ As per in_accel_X_raw attributes, but minus the
+ acceleration due to gravity.
+
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_x_raw
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_y_raw
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_z_raw
@@ -429,6 +465,7 @@ What: /sys/bus/iio/devices/iio:deviceX/in_angl_scale
What: /sys/bus/iio/devices/iio:deviceX/in_intensity_x_scale
What: /sys/bus/iio/devices/iio:deviceX/in_intensity_y_scale
What: /sys/bus/iio/devices/iio:deviceX/in_intensity_z_scale
+What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_scale
KernelVersion: 2.6.35
Contact: linux-iio@vger.kernel.org
Description:
@@ -475,6 +512,7 @@ What: /sys/bus/iio/devices/iio:deviceX/in_voltageY_i_calibscale
What: /sys/bus/iio/devices/iio:deviceX/in_voltageY_q_calibscale
What: /sys/bus/iio/devices/iio:deviceX/in_voltage_i_calibscale
What: /sys/bus/iio/devices/iio:deviceX/in_voltage_q_calibscale
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage_calibscale
What: /sys/bus/iio/devices/iio:deviceX/in_voltage_calibscale
What: /sys/bus/iio/devices/iio:deviceX/in_accel_x_calibscale
What: /sys/bus/iio/devices/iio:deviceX/in_accel_y_calibscale
@@ -1212,6 +1250,32 @@ Description:
number or direction is not specified, applies to all channels of
this type.
+What: /sys/.../iio:deviceX/events/in_accel_mag_referenced_en
+What: /sys/.../iio:deviceX/events/in_accel_mag_referenced_rising_en
+What: /sys/.../iio:deviceX/events/in_accel_mag_referenced_falling_en
+What: /sys/.../iio:deviceX/events/in_accel_y_mag_referenced_en
+What: /sys/.../iio:deviceX/events/in_accel_y_mag_referenced_rising_en
+What: /sys/.../iio:deviceX/events/in_accel_y_mag_referenced_falling_en
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Similar to in_accel_mag[_y][_rising|_falling]_en, but the event
+ value is relative to a reference magnitude. The reference magnitude
+ includes the gravitational acceleration.
+
+What: /sys/.../iio:deviceX/events/in_accel_mag_referenced_value
+What: /sys/.../iio:deviceX/events/in_accel_mag_referenced_rising_value
+What: /sys/.../iio:deviceX/events/in_accel_mag_referenced_falling_value
+What: /sys/.../iio:deviceX/events/in_accel_y_mag_referenced_value
+What: /sys/.../iio:deviceX/events/in_accel_y_mag_referenced_rising_value
+What: /sys/.../iio:deviceX/events/in_accel_y_mag_referenced_falling_value
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ The value to which the reference magnitude of the channel is
+ compared. If the axis is not specified, it applies to all channels
+ of this type.
+
What: /sys/.../events/in_steps_change_en
KernelVersion: 4.0
Contact: linux-iio@vger.kernel.org
@@ -1251,6 +1315,10 @@ Description:
Actually start the buffer capture up. Will start trigger
if first device and appropriate.
+ Note that it might be impossible to configure other attributes,
+ (e.g.: events, scale, sampling rate) if they impact the currently
+ active buffer capture session.
+
What: /sys/bus/iio/devices/iio:deviceX/bufferY
KernelVersion: 5.11
Contact: linux-iio@vger.kernel.org
@@ -1739,8 +1807,8 @@ What: /sys/bus/iio/devices/iio:deviceX/out_resistanceX_raw
KernelVersion: 4.3
Contact: linux-iio@vger.kernel.org
Description:
- Raw (unscaled no offset etc.) resistance reading that can be processed
- into an ohm value.
+ Raw (unscaled no offset etc.) resistance reading.
+ Units after application of scale and offset are ohms.
What: /sys/bus/iio/devices/iio:deviceX/heater_enable
KernelVersion: 4.1.0
@@ -1826,8 +1894,9 @@ What: /sys/bus/iio/devices/iio:deviceX/in_electricalconductivity_raw
KernelVersion: 4.8
Contact: linux-iio@vger.kernel.org
Description:
- Raw (unscaled no offset etc.) electric conductivity reading that
- can be processed to siemens per meter.
+ Raw (unscaled no offset etc.) electric conductivity reading.
+ Units after application of scale and offset are siemens per
+ meter.
What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw
KernelVersion: 4.10
@@ -1883,8 +1952,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
KernelVersion: 4.18
Contact: linux-iio@vger.kernel.org
Description:
- Raw (unscaled) phase difference reading from channel Y
- that can be processed to radians.
+ Raw (unscaled) phase difference reading from channel Y.
+ Units after application of scale and offset are radians.
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentration_pm1_input
What: /sys/bus/iio/devices/iio:deviceX/in_massconcentrationY_pm1_input
@@ -1957,3 +2026,140 @@ Description:
Specify the percent for light sensor relative to the channel
absolute value that a data field should change before an event
is generated. Units are a percentage of the prior reading.
+
+What: /sys/bus/iio/devices/iio:deviceX/calibration_auto_enable
+Date: June 2020
+KernelVersion: 5.8
+Contact: linux-iio@vger.kernel.org
+Description:
+ Some sensors have the ability to apply auto calibration at
+ runtime. For example, it may be necessary to compensate for
+ contaminant build-up in a measurement chamber or optical
+ element deterioration that would otherwise lead to sensor drift.
+
+ Writing 1 or 0 to this attribute will respectively activate or
+ deactivate this auto calibration function.
+
+ Upon reading, the current status is returned.
+
+What: /sys/bus/iio/devices/iio:deviceX/calibration_forced_value
+Date: June 2020
+KernelVersion: 5.8
+Contact: linux-iio@vger.kernel.org
+Description:
+ Some sensors have the ability to apply a manual calibration using
+ a known measurement value, perhaps obtained from an external
+ reference device.
+
+ Writing a value to this attribute will force such a calibration
+ change. For the scd30 the value should be from the range
+ [400 1 2000].
+
+ Note for the scd30 that a valid value may only be obtained once
+ it has been written. Until then any read back of this value
+ should be ignored. For the scd4x, an error will be returned
+ immediately if the manual calibration has failed.
+
+What: /sys/bus/iio/devices/iio:deviceX/calibration_forced_value_available
+KernelVersion: 5.15
+Contact: linux-iio@vger.kernel.org
+Description:
+ Available range for the forced calibration value, expressed as:
+
+ - a range specified as "[min step max]"
+
+What: /sys/bus/iio/devices/iio:deviceX/in_voltageX_sampling_frequency
+What: /sys/bus/iio/devices/iio:deviceX/in_powerY_sampling_frequency
+What: /sys/bus/iio/devices/iio:deviceX/in_currentZ_sampling_frequency
+KernelVersion: 5.20
+Contact: linux-iio@vger.kernel.org
+Description:
+ Some devices have separate controls of sampling frequency for
+ individual channels. If multiple channels are enabled in a scan,
+ then the sampling_frequency of the scan may be computed from the
+ per channel sampling frequencies.
+
+What: /sys/.../events/in_accel_gesture_singletap_en
+What: /sys/.../events/in_accel_gesture_doubletap_en
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Device generates an event on a single or double tap.
+
+What: /sys/.../events/in_accel_gesture_singletap_value
+What: /sys/.../events/in_accel_gesture_doubletap_value
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Specifies the threshold value that the device is comparing
+ against to generate the tap gesture event. A lower
+ threshold value increases the sensitivity of tap detection.
+ Units and the exact meaning of the value are device-specific.
+
+What: /sys/.../events/in_accel_gesture_tap_value_available
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Lists all available threshold values which can be used to
+ modify the sensitivity of the tap detection.
+
+What: /sys/.../events/in_accel_gesture_singletap_reset_timeout
+What: /sys/.../events/in_accel_gesture_doubletap_reset_timeout
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Specifies the timeout value in seconds for the tap detector
+ to not look for another tap event after an event has
+ occurred; basically the minimum quiet time between two
+ single taps or two double taps.
+
+What: /sys/.../events/in_accel_gesture_tap_reset_timeout_available
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Lists all available tap reset timeout values. Units in seconds.
+
+What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Specifies the minimum quiet time in seconds between the two
+ taps of a double tap.
+
+What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay_available
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Lists all available delay values between two taps in the double
+ tap. Units in seconds.
+
+What: /sys/.../events/in_accel_gesture_tap_maxtomin_time
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Specifies the maximum time difference allowed between the
+ upper and lower peak of a tap to consider it a valid tap
+ event. Units in seconds.
+
+What: /sys/.../events/in_accel_gesture_tap_maxtomin_time_available
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Lists all available time values between the upper peak and
+ the lower peak. Units in seconds.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_rot_yaw_raw
+What: /sys/bus/iio/devices/iio:deviceX/in_rot_pitch_raw
+What: /sys/bus/iio/devices/iio:deviceX/in_rot_roll_raw
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Raw (unscaled) Euler angle readings. Units after
+ application of scale are degrees.
+
+What: /sys/bus/iio/devices/iio:deviceX/serialnumber
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ An example format is a 16-byte, 2-digits-per-byte hex string
+ representing the sensor's unique ID number.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-adc-ad4130 b/Documentation/ABI/testing/sysfs-bus-iio-adc-ad4130
new file mode 100644
index 000000000000..f24ed6687e90
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-ad4130
@@ -0,0 +1,46 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_voltage-voltage_filter_mode_available
+KernelVersion: 6.2
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reading returns a list with the possible filter modes.
+
+ * "sinc4" - Sinc 4. Excellent noise performance. Long
+ 1st conversion time. No natural 50/60Hz rejection.
+
+ * "sinc4+sinc1" - Sinc4 + averaging by 8. Low 1st conversion
+ time.
+
+ * "sinc3" - Sinc3. Moderate 1st conversion time.
+ Good noise performance.
+
+ * "sinc3+rej60" - Sinc3 + 60Hz rejection. At a sampling
+ frequency of 50Hz, achieves simultaneous 50Hz and 60Hz
+ rejection.
+
+ * "sinc3+sinc1" - Sinc3 + averaging by 8. Low 1st conversion
+ time. Best used with a sampling frequency of at least
+ 216.19Hz.
+
+ * "sinc3+pf1" - Sinc3 + Post Filter 1. 53dB rejection @
+ 50Hz, 58dB rejection @ 60Hz.
+
+ * "sinc3+pf2" - Sinc3 + Post Filter 2. 70dB rejection @
+ 50Hz, 70dB rejection @ 60Hz.
+
+ * "sinc3+pf3" - Sinc3 + Post Filter 3. 99dB rejection @
+ 50Hz, 103dB rejection @ 60Hz.
+
+ * "sinc3+pf4" - Sinc3 + Post Filter 4. 103dB rejection @
+ 50Hz, 109dB rejection @ 60Hz.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_voltageY-voltageZ_filter_mode
+KernelVersion: 6.2
+Contact: linux-iio@vger.kernel.org
+Description:
+ Set the filter mode of the differential channel. When the filter
+ mode changes, the in_voltageY-voltageZ_sampling_frequency and
+ in_voltageY-voltageZ_sampling_frequency_available attributes
+ might also change to accommodate the new filter mode.
+ If the current sampling frequency is out of range for the new
+ filter mode, the sampling frequency will be changed to the
+ closest valid one.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7280a b/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7280a
new file mode 100644
index 000000000000..83b7efe6aa07
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7280a
@@ -0,0 +1,13 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_voltageY-voltageZ_balance_switch_en
+KernelVersion: 5.14
+Contact: linux-iio@vger.kernel.org
+Description:
+ Used to enable an output for balancing cells for the time
+ controlled via in_voltageY-voltageZ_balance_switch_timer.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_voltageY-voltageZ_balance_switch_timer
+KernelVersion: 5.14
+Contact: linux-iio@vger.kernel.org
+Description:
+ Time in seconds for which balance switch will be turned on.
+ Multiple of 71.5 seconds.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-adc-max11410 b/Documentation/ABI/testing/sysfs-bus-iio-adc-max11410
new file mode 100644
index 000000000000..2a53c6b37360
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-max11410
@@ -0,0 +1,13 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_voltage_filterY_notch_en
+Date: September 2022
+KernelVersion: 6.0
+Contact: linux-iio@vger.kernel.org
+Description:
+ Enable or disable a notch filter.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_voltage_filterY_notch_center
+Date: September 2022
+KernelVersion: 6.0
+Contact: linux-iio@vger.kernel.org
+Description:
+ Center frequency of the notch filter in Hz.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-bno055 b/Documentation/ABI/testing/sysfs-bus-iio-bno055
new file mode 100644
index 000000000000..f32b1644e986
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-bno055
@@ -0,0 +1,81 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_accel_raw_range
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Raw (unscaled) range for acceleration readings. Unit after
+ application of scale is m/s^2. Note that this does not affect
+ the scale; use the scale attribute instead when changing the
+ maximum and minimum readable value should also change the
+ reading scaling factor.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_raw_range
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Range for angular velocity readings in radians per second. Note
+ that this does not affect the scale; use the scale attribute
+ instead when changing the maximum and minimum readable value
+ should also change the reading scaling factor.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_accel_raw_range_available
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ List of allowed values for in_accel_raw_range attribute
+
+What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_raw_range_available
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ List of allowed values for in_anglvel_raw_range attribute
+
+What: /sys/bus/iio/devices/iio:deviceX/in_magn_calibration_fast_enable
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Can be 1 or 0. Enables/disables the "Fast Magnetometer
+ Calibration" HW function.
+
+What: /sys/bus/iio/devices/iio:deviceX/fusion_enable
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Can be 1 or 0. Enables/disables the "sensor fusion" (a.k.a.
+ NDOF) HW function.
+
+What: /sys/bus/iio/devices/iio:deviceX/calibration_data
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reports the binary calibration data blob for the IMU sensors.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_accel_calibration_auto_status
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reports the autocalibration status for the accelerometer sensor.
+ Can be 0 (calibration not even enabled) or 1 to 5, where the greater
+ the number, the better the calibration status.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_gyro_calibration_auto_status
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reports the autocalibration status for the gyroscope sensor.
+ Can be 0 (calibration not even enabled) or 1 to 5, where the greater
+ the number, the better the calibration status.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_magn_calibration_auto_status
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reports the autocalibration status for the magnetometer sensor.
+ Can be 0 (calibration not even enabled) or 1 to 5, where the greater
+ the number, the better the calibration status.
+
+What: /sys/bus/iio/devices/iio:deviceX/sys_calibration_auto_status
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reports the status for the IMU overall autocalibration.
+ Can be 0 (calibration not even enabled) or 1 to 5, where the greater
+ the number, the better the calibration status.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746 b/Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746
new file mode 100644
index 000000000000..02ca8941dce1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746
@@ -0,0 +1,11 @@
+What: /sys/.../iio:deviceX/in_capacitanceY_calibbias_calibration
+What: /sys/.../iio:deviceX/in_capacitanceY_calibscale_calibration
+KernelVersion: 6.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Write 1 to trigger a calibration of the calibbias or
+ calibscale. For calibscale, a full scale capacitance should
+ be connected to the capacitance input and a
+ calibscale_calibration then started. For calibbias see
+ the device datasheet section on "capacitive system offset
+ calibration".
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-chemical-sunrise-co2 b/Documentation/ABI/testing/sysfs-bus-iio-chemical-sunrise-co2
new file mode 100644
index 000000000000..ee7aeb11709b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-chemical-sunrise-co2
@@ -0,0 +1,38 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_calibration_factory
+Date: August 2021
+KernelVersion: 5.16
+Contact: Jacopo Mondi <jacopo@jmondi.org>
+Description:
+ Writing '1' triggers a 'Factory' calibration cycle.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_calibration_background
+Date: August 2021
+KernelVersion: 5.16
+Contact: Jacopo Mondi <jacopo@jmondi.org>
+Description:
+ Writing '1' triggers a 'Background' calibration cycle.
+
+What: /sys/bus/iio/devices/iio:deviceX/error_status_available
+Date: August 2021
+KernelVersion: 5.16
+Contact: Jacopo Mondi <jacopo@jmondi.org>
+Description:
+ Reading returns the list of possible chip error statuses.
+ Available options are:
+ - 'error_fatal': Analog front-end initialization error
+ - 'error_i2c': Read/write to non-existing register
+ - 'error_algorithm': Corrupted parameters
+ - 'error_calibration': Calibration has failed
+ - 'error_self_diagnostic': Internal interface failure
+ - 'error_out_of_range': Measured concentration out of scale
+ - 'error_memory': Error during memory operations
+ - 'error_no_measurement': Cleared at first measurement
+ - 'error_low_voltage': Sensor regulated voltage too low
+ - 'error_measurement_timeout': Unable to complete measurement
+
+What: /sys/bus/iio/devices/iio:deviceX/error_status
+Date: August 2021
+KernelVersion: 5.16
+Contact: Jacopo Mondi <jacopo@jmondi.org>
+Description:
+ Reading returns the current chip error status.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-dac-ltc2688 b/Documentation/ABI/testing/sysfs-bus-iio-dac-ltc2688
new file mode 100644
index 000000000000..1c35971277ba
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-dac-ltc2688
@@ -0,0 +1,86 @@
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_en
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Dither enable. Write 1 to enable dither or 0 to disable it. This is useful
+ for changing the dither parameters. The way it should be done is:
+
+ - disable dither operation;
+ - change dither parameters (eg: frequency, phase...);
+ - enable dither operation.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_raw
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ This raw, unscaled value refers to the dither signal amplitude.
+ The same scale as in out_voltageY_raw applies. However, the
+ offset might be different as it's always 0 for this attribute.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_raw_available
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Available range for dither raw amplitude values.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_offset
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Offset applied to out_voltageY_dither_raw. Read-only attribute,
+ always set to 0.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_frequency
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Sets the dither signal frequency. Units are in Hz.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_frequency_available
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Returns the available values for the dither frequency.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_phase
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Sets the dither signal phase. Units are in Radians.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_dither_phase_available
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Returns the available values for the dither phase.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_toggle_en
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Toggle enable. Write 1 to enable toggle or 0 to disable it. This is
+ useful when one wants to change the DAC output codes. The way it should
+ be done is:
+
+ - disable toggle operation;
+ - change out_voltageY_raw0 and out_voltageY_raw1;
+ - enable toggle operation.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_raw0
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_raw1
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ It has the same meaning as out_voltageY_raw. This attribute is
+ specific to toggle enabled channels and refers to the DAC output
+ code in INPUT_A (_raw0) and INPUT_B (_raw1). The same scale and offset
+ as in out_voltageY_raw applies.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_voltageY_symbol
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Performs a SW toggle. This attribute is specific to toggle
+ enabled channels and allows toggling between out_voltageY_raw0
+ and out_voltageY_raw1 through software. Writing 0 will select
+ out_voltageY_raw0 while 1 selects out_voltageY_raw1.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-filter-admv8818 b/Documentation/ABI/testing/sysfs-bus-iio-filter-admv8818
new file mode 100644
index 000000000000..f6c035752639
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-filter-admv8818
@@ -0,0 +1,16 @@
+What: /sys/bus/iio/devices/iio:deviceX/filter_mode_available
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reading this returns the valid values that can be written to the
+ on_altvoltage0_mode attribute:
+
+ - auto -> Adjust bandpass filter to track changes in input clock rate.
+ - manual -> disable/unregister the clock rate notifier / input clock tracking.
+
+What: /sys/bus/iio/devices/iio:deviceX/filter_mode
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ This attribute configures the filter mode.
+ Reading returns the actual mode.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1013 b/Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1013
new file mode 100644
index 000000000000..de1e323e5d47
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1013
@@ -0,0 +1,38 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0-1_i_calibphase
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write unscaled value for the Local Oscillator path quadrature I phase shift.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0-1_q_calibphase
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write unscaled value for the Local Oscillator path quadrature Q phase shift.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0_i_calibbias
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write value for the Local Oscillator Feedthrough Offset Calibration I Positive
+ side.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0_q_calibbias
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write value for the Local Oscillator Feedthrough Offset Calibration Q Positive side.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage1_i_calibbias
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write raw value for the Local Oscillator Feedthrough Offset Calibration I Negative
+ side.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage1_q_calibbias
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write raw value for the Local Oscillator Feedthrough Offset Calibration Q Negative
+ side.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1014 b/Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1014
new file mode 100644
index 000000000000..395010a0ef8b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-frequency-admv1014
@@ -0,0 +1,23 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0_i_calibscale_coarse
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write value for the digital attenuator gain (IF_I) with coarse steps.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0_q_calibscale_coarse
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write value for the digital attenuator gain (IF_Q) with coarse steps.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0_i_calibscale_fine
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write value for the digital attenuator gain (IF_I) with fine steps.
+
+What: /sys/bus/iio/devices/iio:deviceX/in_altvoltage0_q_calibscale_fine
+KernelVersion: 5.18
+Contact: linux-iio@vger.kernel.org
+Description:
+ Read/write value for the digital attenuator gain (IF_Q) with fine steps.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-proximity b/Documentation/ABI/testing/sysfs-bus-iio-proximity
index 3aac6dab8775..9b9d1cc9b703 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-proximity
+++ b/Documentation/ABI/testing/sysfs-bus-iio-proximity
@@ -18,7 +18,7 @@ Description:
on the signal from which time of flight measurements are
taken.
The appropriate values to take is dependent on both the
- sensor and it's operating environment:
+ sensor and its operating environment:
* as3935 (0-31 range)
18 = indoors (default)
14 = outdoors
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-scd30 b/Documentation/ABI/testing/sysfs-bus-iio-scd30
deleted file mode 100644
index b9712f390bec..000000000000
--- a/Documentation/ABI/testing/sysfs-bus-iio-scd30
+++ /dev/null
@@ -1,34 +0,0 @@
-What: /sys/bus/iio/devices/iio:deviceX/calibration_auto_enable
-Date: June 2020
-KernelVersion: 5.8
-Contact: linux-iio@vger.kernel.org
-Description:
- Contaminants build-up in the measurement chamber or optical
- elements deterioration leads to sensor drift.
-
- One can compensate for sensor drift by using automatic self
- calibration procedure (asc).
-
- Writing 1 or 0 to this attribute will respectively activate or
- deactivate asc.
-
- Upon reading current asc status is returned.
-
-What: /sys/bus/iio/devices/iio:deviceX/calibration_forced_value
-Date: June 2020
-KernelVersion: 5.8
-Contact: linux-iio@vger.kernel.org
-Description:
- Contaminants build-up in the measurement chamber or optical
- elements deterioration leads to sensor drift.
-
- One can compensate for sensor drift by using forced
- recalibration (frc). This is useful in case there's known
- co2 reference available nearby the sensor.
-
- Picking value from the range [400 1 2000] and writing it to the
- sensor will set frc.
-
- Upon reading current frc value is returned. Note that after
- power cycling default value (i.e 400) is returned even though
- internally sensor had recalibrated itself.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-sx9324 b/Documentation/ABI/testing/sysfs-bus-iio-sx9324
new file mode 100644
index 000000000000..a8342770e7cb
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-sx9324
@@ -0,0 +1,29 @@
+What: /sys/bus/iio/devices/iio:deviceX/in_proximity<id>_setup
+Date: November 2021
+KernelVersion: 5.17
+Contact: Gwendal Grignou <gwendal@chromium.org>
+Description:
+ SX9324 has 3 inputs, CS0, CS1 and CS2. The hardware layout
+ defines whether an input is
+
+ + not connected (HZ),
+ + grounded (GD),
+ + connected to an antenna where it can act as a base
+ (DS - data shield), or measured input (MI).
+
+ The sensor rotates measurement across 4 phases
+ (PH0, PH1, PH2, PH3), where the inputs are configured
+ and then measured.
+
+ By default, during the first phase, [PH0], CS0 is measured,
+ while CS1 and CS2 are used as shields.
+ `cat in_proximity0_setup` returns "MI,DS,DS".
+ [PH1], CS1 is measured, CS0 and CS2 are shields:
+ `cat in_proximity1_setup` returns "DS,MI,DS".
+ [PH2], CS2 is measured, CS0 and CS1 are shields:
+ `cat in_proximity2_setup` returns "DS,DS,MI".
+ [PH3], CS1 and CS2 are measured (combo mode):
+ `cat in_proximity3_setup` returns "DS,MI,MI".
+
+ Note, these are the chip defaults. The hardware layout will
+ most likely dictate a different output. The entry is read-only.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856 b/Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
deleted file mode 100644
index e5ef6d8e5da1..000000000000
--- a/Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
+++ /dev/null
@@ -1,31 +0,0 @@
-What: /sys/bus/iio/devices/iio:deviceX/fault_oc
-KernelVersion: 5.1
-Contact: linux-iio@vger.kernel.org
-Description:
- Open-circuit fault. The detection of open-circuit faults,
- such as those caused by broken thermocouple wires.
- Reading returns either '1' or '0'.
-
- === =======================================================
- '1' An open circuit such as broken thermocouple wires
- has been detected.
- '0' No open circuit or broken thermocouple wires are detected
- === =======================================================
-
-What: /sys/bus/iio/devices/iio:deviceX/fault_ovuv
-KernelVersion: 5.1
-Contact: linux-iio@vger.kernel.org
-Description:
- Overvoltage or Undervoltage Input Fault. The internal circuitry
- is protected from excessive voltages applied to the thermocouple
- cables by integrated MOSFETs at the T+ and T- inputs, and the
- BIAS output. These MOSFETs turn off when the input voltage is
- negative or greater than VDD.
-
- Reading returns either '1' or '0'.
-
- === =======================================================
- '1' The input voltage is negative or greater than VDD.
- '0' The input voltage is positive and less than VDD (normal
- state).
- === =======================================================
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-thermocouple b/Documentation/ABI/testing/sysfs-bus-iio-thermocouple
new file mode 100644
index 000000000000..01259df297ca
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-thermocouple
@@ -0,0 +1,18 @@
+What: /sys/bus/iio/devices/iio:deviceX/fault_ovuv
+KernelVersion: 5.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Overvoltage or Undervoltage Input Fault. The internal circuitry
+ is protected from excessive voltages applied to the thermocouple
+ cables. The device can also detect if such a condition occurs.
+
+ Reading returns '1' if input voltage is negative or greater
+ than VDD, otherwise '0'.
+
+What: /sys/bus/iio/devices/iio:deviceX/fault_oc
+KernelVersion: 5.1
+Contact: linux-iio@vger.kernel.org
+Description:
+ Open-circuit fault. Detects open-circuit faults, such as
+ those caused by broken thermocouple wires.
+ Reading returns '1' if a fault is present, '0' otherwise.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
index c4a4497c249a..05074c4a65e2 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
+++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
@@ -90,14 +90,6 @@ Description:
Reading returns the current master modes.
Writing set the master mode
-What: /sys/bus/iio/devices/triggerX/sampling_frequency
-KernelVersion: 4.11
-Contact: benjamin.gaignard@st.com
-Description:
- Reading returns the current sampling frequency.
- Writing an value different of 0 set and start sampling.
- Writing 0 stop sampling.
-
What: /sys/bus/iio/devices/iio:deviceX/in_count0_preset
KernelVersion: 4.12
Contact: benjamin.gaignard@st.com
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-vf610 b/Documentation/ABI/testing/sysfs-bus-iio-vf610
index 308a6756d3bf..491ead804488 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-vf610
+++ b/Documentation/ABI/testing/sysfs-bus-iio-vf610
@@ -1,4 +1,4 @@
-What: /sys/bus/iio/devices/iio:deviceX/conversion_mode
+What: /sys/bus/iio/devices/iio:deviceX/in_conversion_mode
KernelVersion: 4.2
Contact: linux-iio@vger.kernel.org
Description:
diff --git a/Documentation/ABI/testing/sysfs-bus-mdio b/Documentation/ABI/testing/sysfs-bus-mdio
index da86efc7781b..38be04dfc05e 100644
--- a/Documentation/ABI/testing/sysfs-bus-mdio
+++ b/Documentation/ABI/testing/sysfs-bus-mdio
@@ -1,4 +1,5 @@
What: /sys/bus/mdio_bus/devices/.../statistics/
+What: /sys/class/mdio_bus/.../statistics/
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -7,6 +8,7 @@ Description:
MDIO bus address statistics.
What: /sys/bus/mdio_bus/devices/.../statistics/transfers
+What: /sys/class/mdio_bus/.../transfers
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -14,6 +16,7 @@ Description:
Total number of transfers for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/errors
+What: /sys/class/mdio_bus/.../statistics/errors
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -21,6 +24,7 @@ Description:
Total number of transfer errors for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/writes
+What: /sys/class/mdio_bus/.../statistics/writes
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -28,6 +32,7 @@ Description:
Total number of write transactions for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/reads
+What: /sys/class/mdio_bus/.../statistics/reads
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -35,6 +40,7 @@ Description:
Total number of read transactions for this MDIO bus.
What: /sys/bus/mdio_bus/devices/.../statistics/transfers_<addr>
+What: /sys/class/mdio_bus/.../statistics/transfers_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -42,6 +48,7 @@ Description:
Total number of transfers for this MDIO bus address.
What: /sys/bus/mdio_bus/devices/.../statistics/errors_<addr>
+What: /sys/class/mdio_bus/.../statistics/errors_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -49,6 +56,7 @@ Description:
Total number of transfer errors for this MDIO bus address.
What: /sys/bus/mdio_bus/devices/.../statistics/writes_<addr>
+What: /sys/class/mdio_bus/.../statistics/writes_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
@@ -56,6 +64,7 @@ Description:
Total number of write transactions for this MDIO bus address.
What: /sys/bus/mdio_bus/devices/.../statistics/reads_<addr>
+What: /sys/class/mdio_bus/.../statistics/reads_<addr>
Date: January 2020
KernelVersion: 5.6
Contact: netdev@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-bus-nvdimm b/Documentation/ABI/testing/sysfs-bus-nvdimm
index bff84a16812a..de8c5a59c77f 100644
--- a/Documentation/ABI/testing/sysfs-bus-nvdimm
+++ b/Documentation/ABI/testing/sysfs-bus-nvdimm
@@ -6,3 +6,52 @@ Description:
The libnvdimm sub-system implements a common sysfs interface for
platform nvdimm resources. See Documentation/driver-api/nvdimm/.
+
+What: /sys/bus/event_source/devices/nmemX/format
+Date: February 2022
+KernelVersion: 5.18
+Contact: Kajol Jain <kjain@linux.ibm.com>
+Description: (RO) Attribute group to describe the magic bits
+ that go into perf_event_attr.config for a particular pmu.
+ (See ABI/testing/sysfs-bus-event_source-devices-format).
+
+ Each attribute under this group defines a bit range of the
+ perf_event_attr.config. Supported attribute is listed
+ below::
+ event = "config:0-4" - event ID
+
+ For example::
+ ctl_res_cnt = "event=0x1"
+
+What: /sys/bus/event_source/devices/nmemX/events
+Date: February 2022
+KernelVersion: 5.18
+Contact: Kajol Jain <kjain@linux.ibm.com>
+Description: (RO) Attribute group to describe performance monitoring events
+ for the nvdimm memory device. Each attribute in this group
+ describes a single performance monitoring event supported by
+ this nvdimm pmu. The name of the file is the name of the event.
+ (See ABI/testing/sysfs-bus-event_source-devices-events). A
+ listing of the events supported by a given nvdimm provider type
+ can be found in Documentation/driver-api/nvdimm/$provider.
+
+What: /sys/bus/event_source/devices/nmemX/cpumask
+Date: February 2022
+KernelVersion: 5.18
+Contact: Kajol Jain <kjain@linux.ibm.com>
+Description: (RO) This sysfs file exposes the cpumask which is designated
+ to retrieve nvdimm pmu event counter data.
+
+What: /sys/bus/nd/devices/nmemX/cxl/id
+Date: November 2022
+KernelVersion: 6.2
+Contact: Dave Jiang <dave.jiang@intel.com>
+Description: (RO) Show the id (serial) of the device. This is CXL specific.
+
+What: /sys/bus/nd/devices/nmemX/cxl/provider
+Date: November 2022
+KernelVersion: 6.2
+Contact: Dave Jiang <dave.jiang@intel.com>
+Description: (RO) Shows the CXL bridge device that ties a CXL memory device
+ to this NVDIMM device. I.e. the parent of the device returned is
+ a /sys/bus/cxl/devices/memX instance.
diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem
index 95254cec92bf..4ac0673901e7 100644
--- a/Documentation/ABI/testing/sysfs-bus-papr-pmem
+++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem
@@ -61,3 +61,15 @@ Description:
* "CchRHCnt" : Cache Read Hit Count
* "CchWHCnt" : Cache Write Hit Count
* "FastWCnt" : Fast Write Count
+
+What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_inject
+Date: Jan, 2022
+KernelVersion: v5.17
+Contact: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, nvdimm@lists.linux.dev,
+Description:
+ (RO) Reports the health bitmap inject bitmap that is applied to
+ the bitmap received from PowerVM via the H_SCM_HEALTH hcall.
+ This is used to forcibly set specific bits returned from the
+ hcall. These are then used to simulate various health or
+ shutdown states for an nvdimm and are set by user-space tools
+ like ndctl by issuing a PAPR DSM.
+
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index d4ae03296861..ecf47559f495 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -1,4 +1,5 @@
What: /sys/bus/pci/drivers/.../bind
+What: /sys/devices/pciX/.../bind
Date: December 2003
Contact: linux-pci@vger.kernel.org
Description:
@@ -14,6 +15,7 @@ Description:
(Note: kernels before 2.6.28 may require echo -n).
What: /sys/bus/pci/drivers/.../unbind
+What: /sys/devices/pciX/.../unbind
Date: December 2003
Contact: linux-pci@vger.kernel.org
Description:
@@ -29,6 +31,7 @@ Description:
(Note: kernels before 2.6.28 may require echo -n).
What: /sys/bus/pci/drivers/.../new_id
+What: /sys/devices/pciX/.../new_id
Date: December 2003
Contact: linux-pci@vger.kernel.org
Description:
@@ -47,6 +50,7 @@ Description:
# echo "8086 10f5" > /sys/bus/pci/drivers/foo/new_id
What: /sys/bus/pci/drivers/.../remove_id
+What: /sys/devices/pciX/.../remove_id
Date: February 2009
Contact: Chris Wright <chrisw@sous-sol.org>
Description:
@@ -96,6 +100,17 @@ Description:
This attribute indicates the mode that the irq vector named by
the file is in (msi vs. msix)
+What: /sys/bus/pci/devices/.../irq
+Date: August 2021
+Contact: Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+ If a driver has enabled MSI (not MSI-X), "irq" contains the
+ IRQ of the first MSI vector. Otherwise "irq" contains the
+ IRQ of the legacy INTx interrupt.
+
+ "irq" being set to 0 indicates that the device isn't
+ capable of generating legacy INTx interrupts.
+
What: /sys/bus/pci/devices/.../remove
Date: January 2009
Contact: Linux PCI developers <linux-pci@vger.kernel.org>
@@ -160,7 +175,7 @@ Description:
If the underlying VPD has a writable section then the
corresponding section of this file will be writable.
-What: /sys/bus/pci/devices/.../virtfnN
+What: /sys/bus/pci/devices/.../virtfn<N>
Date: March 2009
Contact: Yu Zhao <yu.zhao@intel.com>
Description:
@@ -187,6 +202,24 @@ Description:
The symbolic link points to the PCI device sysfs entry of the
Physical Function this device associates with.
+What: /sys/bus/pci/devices/.../modalias
+Date: May 2005
+Contact: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Description:
+ This attribute indicates the PCI ID of the device object.
+
+ That is in the format:
+ pci:vXXXXXXXXdXXXXXXXXsvXXXXXXXXsdXXXXXXXXbcXXscXXiXX,
+ where:
+
+ - vXXXXXXXX contains the vendor ID;
+ - dXXXXXXXX contains the device ID;
+ - svXXXXXXXX contains the sub-vendor ID;
+ - sdXXXXXXXX contains the subsystem device ID;
+ - bcXX contains the device class;
+ - scXX contains the device subclass;
+ - iXX contains the device class programming interface.
+
What: /sys/bus/pci/slots/.../module
Date: June 2009
Contact: linux-pci@vger.kernel.org
@@ -374,6 +407,16 @@ Description:
file contains a '1' if the memory has been published for
use outside the driver that owns the device.
+What: /sys/bus/pci/devices/.../p2pmem/allocate
+Date: August 2022
+Contact: Logan Gunthorpe <logang@deltatee.com>
+Description:
+ This file allows mapping p2pmem into userspace. For each
+ mmap() call on this file, the kernel will allocate a chunk
+ of Peer-to-Peer memory for use in Peer-to-Peer transactions.
+ This memory can be used in O_DIRECT calls to NVMe backed
+ files for Peer-to-Peer copies.
+
What: /sys/bus/pci/devices/.../link/clkpm
/sys/bus/pci/devices/.../link/l0s_aspm
/sys/bus/pci/devices/.../link/l1_aspm
@@ -424,3 +467,36 @@ Description:
The file is writable if the PF is bound to a driver that
implements ->sriov_set_msix_vec_count().
+
+What: /sys/bus/pci/devices/.../resourceN_resize
+Date: September 2022
+Contact: Alex Williamson <alex.williamson@redhat.com>
+Description:
+ These files provide an interface to PCIe Resizable BAR support.
+ A file is created for each BAR resource (N) supported by the
+ PCIe Resizable BAR extended capability of the device. Reading
+ each file exposes the bitmap of available resource sizes:
+
+ # cat resource1_resize
+ 00000000000001c0
+
+ The bitmap represents supported resource sizes for the BAR,
+ where bit0 = 1MB, bit1 = 2MB, bit2 = 4MB, etc. In the above
+ example the device supports 64MB, 128MB, and 256MB BAR sizes.
+
+ When writing the file, the user provides the bit position of
+ the desired resource size, for example:
+
+ # echo 7 > resource1_resize
+
+ This indicates to set the size value corresponding to bit 7,
+ 128MB. The resulting size is 2 ^ (bit# + 20). This definition
+ matches the PCIe specification of this capability.
+
+ In order to make use of resource resizing, all PCI drivers must
+ be unbound from the device and peer devices under the same
+ parent bridge may need to be soft removed. In the case of
+ VGA devices, writing a resize value will remove low level
+ console drivers from the device. Raw users of pci-sysfs
+ resourceN attributes must be terminated prior to resizing.
+ Success of the resizing operation is not guaranteed.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-drivers-xhci_hcd b/Documentation/ABI/testing/sysfs-bus-pci-drivers-xhci_hcd
index 0088aba4caa8..5a775b8f6543 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-drivers-xhci_hcd
+++ b/Documentation/ABI/testing/sysfs-bus-pci-drivers-xhci_hcd
@@ -23,3 +23,55 @@ Description:
Reading this attribute gives the state of the DbC. It
can be one of the following states: disabled, enabled,
initialized, connected, configured and stalled.
+
+What: /sys/bus/pci/drivers/xhci_hcd/.../dbc_idVendor
+Date: March 2023
+Contact: Mathias Nyman <mathias.nyman@linux.intel.com>
+Description:
+ This dbc_idVendor attribute lets us change the idVendor field
+ presented in the USB device descriptor by this xhci debug
+ device.
+ Value can only be changed while debug capability (DbC) is in
+ disabled state to prevent USB device descriptor change while
+ connected to a USB host.
+ The default value is 0x1d6b (Linux Foundation).
+ It can be any 16-bit integer.
+
+What: /sys/bus/pci/drivers/xhci_hcd/.../dbc_idProduct
+Date: March 2023
+Contact: Mathias Nyman <mathias.nyman@linux.intel.com>
+Description:
+ This dbc_idProduct attribute lets us change the idProduct field
+ presented in the USB device descriptor by this xhci debug
+ device.
+ Value can only be changed while debug capability (DbC) is in
+ disabled state to prevent USB device descriptor change while
+ connected to a USB host.
+ The default value is 0x0010. It can be any 16-bit integer.
+
+What: /sys/bus/pci/drivers/xhci_hcd/.../dbc_bcdDevice
+Date: March 2023
+Contact: Mathias Nyman <mathias.nyman@linux.intel.com>
+Description:
+ This dbc_bcdDevice attribute lets us change the bcdDevice field
+ presented in the USB device descriptor by this xhci debug
+ device.
+ Value can only be changed while debug capability (DbC) is in
+ disabled state to prevent USB device descriptor change while
+ connected to a USB host.
+ The default value is 0x0010. (device rev 0.10)
+ It can be any 16-bit integer.
+
+What: /sys/bus/pci/drivers/xhci_hcd/.../dbc_bInterfaceProtocol
+Date: March 2023
+Contact: Mathias Nyman <mathias.nyman@linux.intel.com>
+Description:
+ This attribute lets us change the bInterfaceProtocol field
+ presented in the USB Interface descriptor by the xhci debug
+ device.
+ Value can only be changed while debug capability (DbC) is in
+ disabled state to prevent USB descriptor change while
+ connected to a USB host.
+ The default value is 1 (GNU Remote Debug command).
+ The other permissible value is 0, which is for a vendor-defined
+ debug target.
diff --git a/Documentation/ABI/testing/sysfs-bus-peci b/Documentation/ABI/testing/sysfs-bus-peci
new file mode 100644
index 000000000000..87454ec5d981
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-peci
@@ -0,0 +1,16 @@
+What: /sys/bus/peci/rescan
+Date: July 2021
+KernelVersion: 5.18
+Contact: Iwona Winiarska <iwona.winiarska@intel.com>
+Description:
+ Writing a non-zero value to this attribute will
+ initiate a scan for PECI devices on all PECI controllers
+ in the system.
+
+What: /sys/bus/peci/devices/<controller_id>-<device_addr>/remove
+Date: July 2021
+KernelVersion: 5.18
+Contact: Iwona Winiarska <iwona.winiarska@intel.com>
+Description:
+ Writing a non-zero value to this attribute will
+ remove the PECI device and any of its children.
diff --git a/Documentation/ABI/testing/sysfs-bus-platform b/Documentation/ABI/testing/sysfs-bus-platform
index ff30728595ef..c4dfe7355c2d 100644
--- a/Documentation/ABI/testing/sysfs-bus-platform
+++ b/Documentation/ABI/testing/sysfs-bus-platform
@@ -42,3 +42,15 @@ Date: August 2021
Contact: Barry Song <song.bao.hua@hisilicon.com>
Description:
This attribute will show "msi" if <N> is a valid msi irq
+
+What: /sys/bus/platform/devices/.../modalias
+Description:
+ Same as MODALIAS in the uevent at device creation.
+
+ A platform device that is exposed via devicetree uses:
+
+ - of:N`of node name`T`type`
+
+ Other platform devices use, instead:
+
+ - platform:`driver name`
diff --git a/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro
new file mode 100644
index 000000000000..fead760dcf77
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro
@@ -0,0 +1,325 @@
+What: /sys/bus/platform/devices/smpro-errmon.*/error_[core|mem|pcie|other]_[ce|ue]
+KernelVersion: 6.1
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+ (RO) Contains the 48-byte Ampere (Vendor-Specific) Error Record printed
+ in hex format according to the table below:
+
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | Offset | Field | Size (byte) | Description |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 00 | Error Type | 1 | See :ref:`the table below <smpro-error-types>` for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 01 | Subtype | 1 | See :ref:`the table below <smpro-error-types>` for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 02 | Instance | 2 | See :ref:`the table below <smpro-error-types>` for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 04 | Error status | 4 | See ARM RAS specification for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 08 | Error Address | 8 | See ARM RAS specification for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 16 | Error Misc 0 | 8 | See ARM RAS specification for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 24 | Error Misc 1 | 8 | See ARM RAS specification for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 32 | Error Misc 2 | 8 | See ARM RAS specification for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+ | 40 | Error Misc 3 | 8 | See ARM RAS specification for details |
+ +--------+---------------+-------------+------------------------------------------------------------+
+
+ The table below defines the value of error types, their subtype, subcomponent and instance:
+
+ .. _smpro-error-types:
+
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | Error Group | Error Type | Sub type | Sub component | Instance |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | CPM (core) | 0 | 0 | Snoop-Logic | CPM # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | CPM (core) | 0 | 2 | Armv8 Core 1 | CPM # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 1 | ERR1 | MCU # \| SLOT << 11 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 2 | ERR2 | MCU # \| SLOT << 11 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 3 | ERR3 | MCU # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 4 | ERR4 | MCU # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 5 | ERR5 | MCU # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 6 | ERR6 | MCU # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | MCU (mem) | 1 | 7 | Link Error | MCU # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | Mesh (other) | 2 | 0 | Cross Point | X \| (Y << 5) \| NS <<11 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | Mesh (other) | 2 | 1 | Home Node(IO) | X \| (Y << 5) \| NS <<11 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | Mesh (other) | 2 | 2 | Home Node(Mem) | X \| (Y << 5) \| NS <<11 \| device<<12 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | Mesh (other) | 2 | 4 | CCIX Node | X \| (Y << 5) \| NS <<11 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | 2P Link (other) | 3 | 0 | N/A | Altra 2P Link # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 0 | ERR0 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 1 | ERR1 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 2 | ERR2 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 3 | ERR3 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 4 | ERR4 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 5 | ERR5 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 6 | ERR6 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 7 | ERR7 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 8 | ERR8 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 9 | ERR9 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 10 | ERR10 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 11 | ERR11 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 12 | ERR12 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | GIC (other) | 5 | 13-21 | ERR13 | RC # + 1 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TCU | 100 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU0 | 0 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU1 | 1 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU2 | 2 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU3 | 3 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU4 | 4 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU5 | 5 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU6 | 6 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU7 | 7 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU8 | 8 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMMU (other) | 6 | TBU9 | 9 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PCIe AER (pcie) | 7 | Root | 0 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PCIe AER (pcie) | 7 | Device | 1 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PCIe RC (pcie) | 8 | RCA HB | 0 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PCIe RC (pcie) | 8 | RCB HB | 1 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PCIe RC (pcie) | 8 | RASDP | 8 | RC # |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | OCM (other) | 9 | ERR0 | 0 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | OCM (other) | 9 | ERR1 | 1 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | OCM (other) | 9 | ERR2 | 2 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMpro (other) | 10 | ERR0 | 0 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMpro (other) | 10 | ERR1 | 1 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | SMpro (other) | 10 | MPA_ERR | 2 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PMpro (other) | 11 | ERR0 | 0 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PMpro (other) | 11 | ERR1 | 1 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+ | PMpro (other) | 11 | MPA_ERR | 2 | 0 |
+ +-----------------+------------+----------+----------------+----------------------------------------+
+
+ Example::
+
+ # cat error_other_ue
+ 880807001e004010401040101500000001004010401040100c0000000000000000000000000000000000000000000000
+
+		The details of each sysfs entry are as below:
+
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Error | Sysfs entry | Description (when triggered) |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Core's CE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ce | Core has CE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Core's UE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ue | Core has UE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ce | Memory has CE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ue | Memory has UE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ce | any PCIe controller has CE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ue | any PCIe controller has UE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Other's CE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ce | any other CE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+ | Other's UE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ue | any other UE error |
+ +-------------+---------------------------------------------------------+----------------------------------+
+
+		UE: Uncorrectable Error
+		CE: Correctable Error
+
+ For details, see section `3.3 Ampere (Vendor-Specific) Error Record Formats,
+ Altra Family RAS Supplement`.
+
+
+What: /sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue]
+KernelVersion: 6.1
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+		(RO) Returns the overflow status of each type of HW error reported:
+
+ - 0 : No overflow
+ - 1 : There is an overflow and the oldest HW errors are dropped
+
+		The details of each sysfs entry are as below:
+
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | Overflow | Sysfs entry | Description |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | Core's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ce | Core CE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | Core's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ue | Core UE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ce | Memory CE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ue | Memory UE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ce | any PCIe controller CE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ue | any PCIe controller UE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+ | Other's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ce| any other CE error overflow |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+		| Other's UE  | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ue| any other UE error overflow           |
+ +-------------+-----------------------------------------------------------+---------------------------------------+
+
+ where:
+
+		- UE: Uncorrectable Error
+		- CE: Correctable Error
+
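+		For example, reading one of these files (the value shown is
+		illustrative)::
+
+		    # cat overflow_core_ce
+		    0
+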
+What: /sys/bus/platform/devices/smpro-errmon.*/[error|warn]_[smpro|pmpro]
+KernelVersion: 6.1
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+		(RO) Contains the internal firmware errors/warnings printed in hex format.
+
+		The details of each sysfs entry are as below:
+
+ +---------------+------------------------------------------------------+--------------------------+
+ | Error | Sysfs entry | Description |
+ +---------------+------------------------------------------------------+--------------------------+
+ | SMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_smpro | system has SMpro error |
+ +---------------+------------------------------------------------------+--------------------------+
+ | SMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_smpro | system has SMpro warning |
+ +---------------+------------------------------------------------------+--------------------------+
+ | PMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_pmpro | system has PMpro error |
+ +---------------+------------------------------------------------------+--------------------------+
+ | PMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_pmpro | system has PMpro warning |
+ +---------------+------------------------------------------------------+--------------------------+
+
+ For details, see section `5.10 RAS Internal Error Register Definitions,
+ Altra Family Soc BMC Interface Specification`.
+
+What: /sys/bus/platform/devices/smpro-errmon.*/event_[vrd_warn_fault|vrd_hot|dimm_hot|dimm_2x_refresh]
+KernelVersion: 6.1 (event_[vrd_warn_fault|vrd_hot|dimm_hot]), 6.4 (event_dimm_2x_refresh)
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+		(RO) Contains detailed information about VRD/DIMM warning/hot
+		events, in hex format as below::
+
+ AAAA
+
+ where:
+
+ - ``AAAA``: The event detail information data
+
+		The details of each sysfs entry are as below:
+
+ +---------------+---------------------------------------------------------------+---------------------+
+ | Event | Sysfs entry | Description |
+ +---------------+---------------------------------------------------------------+---------------------+
+ | VRD HOT | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_hot | VRD Hot |
+ +---------------+---------------------------------------------------------------+---------------------+
+ | VR Warn/Fault | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_warn_fault | VR Warning or Fault |
+ +---------------+---------------------------------------------------------------+---------------------+
+ | DIMM HOT | /sys/bus/platform/devices/smpro-errmon.*/event_dimm_hot | DIMM Hot |
+ +---------------+---------------------------------------------------------------+---------------------+
+ | DIMM 2X | /sys/bus/platform/devices/smpro-errmon.*/event_dimm_2x_refresh| DIMM 2x refresh rate|
+ | REFRESH RATE | | event in high temp |
+ +---------------+---------------------------------------------------------------+---------------------+
+
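+		For example, reading one of these files (the event value is
+		illustrative)::
+
+		    # cat event_vrd_hot
+		    0001
+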
+ For more details, see section `5.7 GPI Status Registers and 5.9 Memory Error Register Definitions,
+ Altra Family Soc BMC Interface Specification`.
+
+What: /sys/bus/platform/devices/smpro-errmon.*/event_dimm[0-15]_syndrome
+KernelVersion: 6.4
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+		(RO) Returns the 2-byte failure syndrome data for the DIMM in
+		slot 0-15 if it failed to initialize.
+
+ For more details, see section `5.11 Boot Stage Register Definitions,
+ Altra Family Soc BMC Interface Specification`.
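+
+		For example, reading the syndrome for slot 0 (the value shown
+		is illustrative)::
+
+		    # cat event_dimm0_syndrome
+		    0001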
+
+What: /sys/bus/platform/devices/smpro-misc.*/boot_progress
+KernelVersion: 6.1
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+		(RO) Contains the boot stage information in hex in the format
+		below::
+
+ AABBCCCCCCCC
+
+ where:
+
+		- ``AA`` : The boot stage
+
+ - 00: SMpro firmware booting
+ - 01: PMpro firmware booting
+ - 02: ATF BL1 firmware booting
+ - 03: DDR initialization
+ - 04: DDR training report status
+ - 05: ATF BL2 firmware booting
+ - 06: ATF BL31 firmware booting
+ - 07: ATF BL32 firmware booting
+ - 08: UEFI firmware booting
+ - 09: OS booting
+
+ - ``BB`` : Boot status
+
+ - 00: Not started
+ - 01: Started
+ - 02: Completed without error
+ - 03: Failed.
+
+		- ``CCCCCCCC``: Boot status information defined for each boot stage
+
+ For details, see section `5.11 Boot Stage Register Definitions`
+ and section `6. Processor Boot Progress Codes, Altra Family Soc BMC
+ Interface Specification`.
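+
+		For example, an illustrative read showing stage 09 (OS
+		booting) completed without error::
+
+		    # cat boot_progress
+		    090200000000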
+
+
+What: /sys/bus/platform/devices/smpro-misc*/soc_power_limit
+KernelVersion: 6.1
+Contact: Quan Nguyen <quan@os.amperecomputing.com>
+Description:
+		(RW) Contains the desired SoC power limit in Watts.
+		Writes to this sysfs file set the desired SoC power limit (W).
+		Reads from this file return the current SoC power limit (W).
+		The valid value range is:
+
+ - Minimum: 120 W
+ - Maximum: Socket TDP power
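+
+		A usage sketch (the value is illustrative and must fall within
+		the range above)::
+
+		    # echo 150 > soc_power_limit
+		    # cat soc_power_limit
+		    150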
diff --git a/Documentation/ABI/testing/sysfs-bus-platform-devices-occ-hwmon b/Documentation/ABI/testing/sysfs-bus-platform-devices-occ-hwmon
new file mode 100644
index 000000000000..b24d7ab0278f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-platform-devices-occ-hwmon
@@ -0,0 +1,13 @@
+What: /sys/bus/platform/devices/occ-hwmon.X/ffdc
+KernelVersion: 5.15
+Contact: eajames@linux.ibm.com
+Description:
+		Contains the First Failure Data Capture from the SBEFIFO
+		hardware, if there is any from a previous transfer. Otherwise,
+		the file is empty. The data is cleared when it has been
+		completely read by a user. As the name suggests, only the data
+		from the first error is saved, until it is cleared upon read.
+		The OCC hwmon driver, running on a Baseboard Management
+		Controller (BMC), communicates with POWER9 and later processors
+		over the Self-Boot Engine (SBE) FIFO. In many error conditions,
+		the SBEFIFO will return error data indicating the type of error
+		and system state, etc.
diff --git a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
new file mode 100644
index 000000000000..42deb0552065
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
@@ -0,0 +1,8 @@
+What: /sys/bus/platform/devices/<dev>/always_powered_in_suspend
+Date: June 2022
+KernelVersion: 5.20
+Contact: Matthias Kaehlcke <matthias@kaehlcke.net>
+ linux-usb@vger.kernel.org
+Description:
+ (RW) Controls whether the USB hub remains always powered
+		during system suspend or not.
\ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-bus-rapidio b/Documentation/ABI/testing/sysfs-bus-rapidio
index 634ea207a50a..f8b6728dac10 100644
--- a/Documentation/ABI/testing/sysfs-bus-rapidio
+++ b/Documentation/ABI/testing/sysfs-bus-rapidio
@@ -1,4 +1,4 @@
-What: /sys/bus/rapidio/devices/nn:d:iiii
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>
Description:
For each RapidIO device, the RapidIO subsystem creates files in
an individual subdirectory with the following name format of
@@ -29,7 +29,7 @@ Description:
Attributes Common for All RapidIO Devices
-----------------------------------------
-What: /sys/bus/rapidio/devices/nn:d:iiii/did
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/did
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -37,7 +37,7 @@ Contact: Matt Porter <mporter@kernel.crashing.org>,
Description:
(RO) returns the device identifier
-What: /sys/bus/rapidio/devices/nn:d:iiii/vid
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/vid
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -45,7 +45,7 @@ Contact: Matt Porter <mporter@kernel.crashing.org>,
Description:
(RO) returns the device vendor identifier
-What: /sys/bus/rapidio/devices/nn:d:iiii/device_rev
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/device_rev
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -53,7 +53,7 @@ Contact: Matt Porter <mporter@kernel.crashing.org>,
Description:
(RO) returns the device revision level
-What: /sys/bus/rapidio/devices/nn:d:iiii/asm_did
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/asm_did
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -61,7 +61,7 @@ Contact: Matt Porter <mporter@kernel.crashing.org>,
Description:
(RO) returns identifier for the assembly containing the device
-What: /sys/bus/rapidio/devices/nn:d:iiii/asm_rev
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/asm_rev
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -70,7 +70,7 @@ Description:
(RO) returns revision level of the assembly containing the
device
-What: /sys/bus/rapidio/devices/nn:d:iiii/asm_vid
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/asm_vid
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -79,7 +79,7 @@ Description:
(RO) returns vendor identifier of the assembly containing the
device
-What: /sys/bus/rapidio/devices/nn:d:iiii/destid
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/destid
Date: Mar, 2011
KernelVersion: v2.6.3
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -88,7 +88,7 @@ Description:
(RO) returns device destination ID assigned by the enumeration
routine
-What: /sys/bus/rapidio/devices/nn:d:iiii/lprev
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/lprev
Date: Mar, 2011
KernelVersion: v2.6.39
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -97,7 +97,7 @@ Description:
(RO) returns name of previous device (switch) on the path to the
device that that owns this attribute
-What: /sys/bus/rapidio/devices/nn:d:iiii/modalias
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/modalias
Date: Jul, 2013
KernelVersion: v3.11
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -105,7 +105,7 @@ Contact: Matt Porter <mporter@kernel.crashing.org>,
Description:
(RO) returns the device modalias
-What: /sys/bus/rapidio/devices/nn:d:iiii/config
+What: /sys/bus/rapidio/devices/<nn>:<d>:<iiii>/config
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -128,7 +128,7 @@ device-specific sysfs attributes by specifying a callback function that may be
set by the switch initialization routine during enumeration or discovery
process.
-What: /sys/bus/rapidio/devices/nn:s:iiii/routes
+What: /sys/bus/rapidio/devices/<nn>:<s>:<iiii>/routes
Date: Nov, 2005
KernelVersion: v2.6.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -138,7 +138,7 @@ Description:
This attribute reports only valid routing table entries, one
line for each entry.
-What: /sys/bus/rapidio/devices/nn:s:iiii/destid
+What: /sys/bus/rapidio/devices/<nn>:<s>:<iiii>/destid
Date: Mar, 2011
KernelVersion: v2.6.3
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -147,7 +147,7 @@ Description:
(RO) device destination ID of the associated device that defines
a route to the switch
-What: /sys/bus/rapidio/devices/nn:s:iiii/hopcount
+What: /sys/bus/rapidio/devices/<nn>:<s>:<iiii>/hopcount
Date: Mar, 2011
KernelVersion: v2.6.39
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -155,7 +155,7 @@ Contact: Matt Porter <mporter@kernel.crashing.org>,
Description:
(RO) number of hops on the path to the switch
-What: /sys/bus/rapidio/devices/nn:s:iiii/lnext
+What: /sys/bus/rapidio/devices/<nn>:<s>:<iiii>/lnext
Date: Mar, 2011
KernelVersion: v2.6.39
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -172,7 +172,7 @@ Device-specific Switch Attributes
IDT_GEN2-
-What: /sys/bus/rapidio/devices/nn:s:iiii/errlog
+What: /sys/bus/rapidio/devices/<nn>:<s>:<iiii>/errlog
Date: Oct, 2010
KernelVersion: v2.6.37
Contact: Matt Porter <mporter@kernel.crashing.org>,
diff --git a/Documentation/ABI/testing/sysfs-bus-soundwire-master b/Documentation/ABI/testing/sysfs-bus-soundwire-master
index 46ef038d8722..d2342911ffbb 100644
--- a/Documentation/ABI/testing/sysfs-bus-soundwire-master
+++ b/Documentation/ABI/testing/sysfs-bus-soundwire-master
@@ -1,13 +1,13 @@
-What: /sys/bus/soundwire/devices/sdw-master-N/revision
- /sys/bus/soundwire/devices/sdw-master-N/clk_stop_modes
- /sys/bus/soundwire/devices/sdw-master-N/clk_freq
- /sys/bus/soundwire/devices/sdw-master-N/clk_gears
- /sys/bus/soundwire/devices/sdw-master-N/default_col
- /sys/bus/soundwire/devices/sdw-master-N/default_frame_rate
- /sys/bus/soundwire/devices/sdw-master-N/default_row
- /sys/bus/soundwire/devices/sdw-master-N/dynamic_shape
- /sys/bus/soundwire/devices/sdw-master-N/err_threshold
- /sys/bus/soundwire/devices/sdw-master-N/max_clk_freq
+What: /sys/bus/soundwire/devices/sdw-master-<N>/revision
+ /sys/bus/soundwire/devices/sdw-master-<N>/clk_stop_modes
+ /sys/bus/soundwire/devices/sdw-master-<N>/clk_freq
+ /sys/bus/soundwire/devices/sdw-master-<N>/clk_gears
+ /sys/bus/soundwire/devices/sdw-master-<N>/default_col
+ /sys/bus/soundwire/devices/sdw-master-<N>/default_frame_rate
+ /sys/bus/soundwire/devices/sdw-master-<N>/default_row
+ /sys/bus/soundwire/devices/sdw-master-<N>/dynamic_shape
+ /sys/bus/soundwire/devices/sdw-master-<N>/err_threshold
+ /sys/bus/soundwire/devices/sdw-master-<N>/max_clk_freq
Date: April 2020
diff --git a/Documentation/ABI/testing/sysfs-bus-soundwire-slave b/Documentation/ABI/testing/sysfs-bus-soundwire-slave
index d324aa0b678f..fbf55834dfee 100644
--- a/Documentation/ABI/testing/sysfs-bus-soundwire-slave
+++ b/Documentation/ABI/testing/sysfs-bus-soundwire-slave
@@ -64,37 +64,37 @@ Description: SoundWire Slave Data Port-0 DisCo properties.
Data port 0 are used by the bus to configure the Data Port 0.
-What: /sys/bus/soundwire/devices/sdw:.../dpN_src/max_word
- /sys/bus/soundwire/devices/sdw:.../dpN_src/min_word
- /sys/bus/soundwire/devices/sdw:.../dpN_src/words
- /sys/bus/soundwire/devices/sdw:.../dpN_src/type
- /sys/bus/soundwire/devices/sdw:.../dpN_src/max_grouping
- /sys/bus/soundwire/devices/sdw:.../dpN_src/simple_ch_prep_sm
- /sys/bus/soundwire/devices/sdw:.../dpN_src/ch_prep_timeout
- /sys/bus/soundwire/devices/sdw:.../dpN_src/imp_def_interrupts
- /sys/bus/soundwire/devices/sdw:.../dpN_src/min_ch
- /sys/bus/soundwire/devices/sdw:.../dpN_src/max_ch
- /sys/bus/soundwire/devices/sdw:.../dpN_src/channels
- /sys/bus/soundwire/devices/sdw:.../dpN_src/ch_combinations
- /sys/bus/soundwire/devices/sdw:.../dpN_src/max_async_buffer
- /sys/bus/soundwire/devices/sdw:.../dpN_src/block_pack_mode
- /sys/bus/soundwire/devices/sdw:.../dpN_src/port_encoding
-
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/max_word
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/min_word
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/words
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/type
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/max_grouping
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/simple_ch_prep_sm
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/ch_prep_timeout
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/imp_def_interrupts
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/min_ch
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/max_ch
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/channels
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/ch_combinations
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/max_async_buffer
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/block_pack_mode
- /sys/bus/soundwire/devices/sdw:.../dpN_sink/port_encoding
+What: /sys/bus/soundwire/devices/sdw:.../dp<N>_src/max_word
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/min_word
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/words
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/type
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/max_grouping
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/simple_ch_prep_sm
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/ch_prep_timeout
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/imp_def_interrupts
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/min_ch
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/max_ch
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/channels
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/ch_combinations
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/max_async_buffer
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/block_pack_mode
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_src/port_encoding
+
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/max_word
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/min_word
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/words
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/type
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/max_grouping
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/simple_ch_prep_sm
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/ch_prep_timeout
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/imp_def_interrupts
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/min_ch
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/max_ch
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/channels
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/ch_combinations
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/max_async_buffer
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/block_pack_mode
+ /sys/bus/soundwire/devices/sdw:.../dp<N>_sink/port_encoding
Date: May 2020
diff --git a/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor b/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor
index d76cd3946434..c800621eff95 100644
--- a/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor
+++ b/Documentation/ABI/testing/sysfs-bus-spi-devices-spi-nor
@@ -5,6 +5,9 @@ Contact: linux-mtd@lists.infradead.org
Description: (RO) The JEDEC ID of the SPI NOR flash as reported by the
flash device.
+ The attribute is not present if the flash doesn't support
+ the "Read JEDEC ID" command (9Fh). This is the case for
+ non-JEDEC compliant flashes.
What: /sys/bus/spi/devices/.../spi-nor/manufacturer
Date: April 2021
@@ -12,6 +15,9 @@ KernelVersion: 5.14
Contact: linux-mtd@lists.infradead.org
Description: (RO) Manufacturer of the SPI NOR flash.
+ The attribute is not present if the flash device isn't
+ known to the kernel and is only probed by its SFDP
+ tables.
What: /sys/bus/spi/devices/.../spi-nor/partname
Date: April 2021
diff --git a/Documentation/ABI/testing/sysfs-bus-surface_aggregator-tabletsw b/Documentation/ABI/testing/sysfs-bus-surface_aggregator-tabletsw
new file mode 100644
index 000000000000..74cd9d754e60
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-surface_aggregator-tabletsw
@@ -0,0 +1,57 @@
+What: /sys/bus/surface_aggregator/devices/01:0e:01:00:01/state
+Date: July 2022
+KernelVersion: 5.20
+Contact: Maximilian Luz <luzmaximilian@gmail.com>
+Description:
+ This attribute returns a string with the current type-cover
+ or device posture, as indicated by the embedded controller.
+ Currently returned posture states are:
+
+ - "disconnected": The type-cover has been disconnected.
+
+ - "closed": The type-cover has been folded closed and lies on
+ top of the display.
+
+ - "laptop": The type-cover is open and in laptop-mode, i.e.,
+ ready for normal use.
+
+ - "folded-canvas": The type-cover has been folded back
+ part-ways, but does not lie flush with the back side of the
+ device. In general, this means that the kick-stand is used
+ and extended atop of the cover.
+
+ - "folded-back": The type cover has been fully folded back and
+ lies flush with the back side of the device.
+
+ - "<unknown>": The current state is unknown to the driver, for
+ example due to newer as-of-yet unsupported hardware.
+
+ New states may be introduced with new hardware. Users therefore
+ must not rely on this list of states being exhaustive and
+ gracefully handle unknown states.
+
+What: /sys/bus/surface_aggregator/devices/01:26:01:00:01/state
+Date: July 2022
+KernelVersion: 5.20
+Contact: Maximilian Luz <luzmaximilian@gmail.com>
+Description:
+		This attribute returns a string with the current device
+		posture, as indicated by the embedded controller. Currently
+		returned posture states are:
+
+ - "closed": The lid of the device is closed.
+
+ - "laptop": The lid of the device is opened and the device
+ operates as a normal laptop.
+
+ - "slate": The screen covers the keyboard or has been flipped
+ back and the device operates mainly based on touch input.
+
+ - "tablet": The device operates as tablet and exclusively
+ relies on touch input (or external peripherals).
+
+ - "<unknown>": The current state is unknown to the driver, for
+ example due to newer as-of-yet unsupported hardware.
+
+ New states may be introduced with new hardware. Users therefore
+ must not rely on this list of states being exhaustive and
+ gracefully handle unknown states.
diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
index b7e87f6c7d47..76ab3e1fe374 100644
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -153,7 +153,7 @@ Date: Jan 2020
KernelVersion: 5.5
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
Description: This attribute reports number of RX lanes the device is
- using simultaneusly through its upstream port.
+ using simultaneously through its upstream port.
What: /sys/bus/thunderbolt/devices/.../tx_speed
Date: Jan 2020
@@ -167,7 +167,7 @@ Date: Jan 2020
KernelVersion: 5.5
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
Description: This attribute reports number of TX lanes the device is
- using simultaneusly through its upstream port.
+ using simultaneously through its upstream port.
What: /sys/bus/thunderbolt/devices/.../vendor
Date: Sep 2017
@@ -293,6 +293,16 @@ Contact: thunderbolt-software@lists.01.org
Description: This contains XDomain service specific settings as
bitmask. Format: %x
+What: /sys/bus/thunderbolt/devices/usb4_portX/connector
+Date: April 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Symlink to the USB Type-C connector. This link is only
+ created when USB Type-C Connector Class is enabled,
+ and only if the system firmware is capable of
+ describing the connection between a port and its
+ connector.
+
What: /sys/bus/thunderbolt/devices/usb4_portX/link
Date: Sep 2021
KernelVersion: v5.14
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index 73eb23bc1f34..cb172db41b34 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -1,4 +1,4 @@
-What: /sys/bus/usb/devices/INTERFACE/authorized
+What: /sys/bus/usb/devices/<INTERFACE>/authorized
Date: August 2015
Description:
This allows to authorize (1) or deauthorize (0)
@@ -166,14 +166,31 @@ Description:
The file will be present for all speeds of USB devices, and will
always read "no" for USB 1.1 and USB 2.0 devices.
-What: /sys/bus/usb/devices/.../(hub interface)/portX
+What: /sys/bus/usb/devices/<INTERFACE>/wireless_status
+Date: February 2023
+Contact: Bastien Nocera <hadess@hadess.net>
+Description:
+ Some USB devices use a USB receiver dongle to communicate
+ wirelessly with their device using proprietary protocols. This
+ attribute allows user-space to know whether the device is
+ connected to its receiver dongle, and, for example, consider
+ the device to be absent when choosing whether to show the
+ device's battery, show a headset in a list of outputs, or show
+ an on-screen keyboard if the only wireless keyboard is
+ turned off.
+ This attribute is not to be used to replace protocol specific
+ statuses available in WWAN, WLAN/Wi-Fi, Bluetooth, etc.
+ If the device does not use a receiver dongle with a wireless
+ device, then this attribute will not exist.
+
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>
Date: August 2012
Contact: Lan Tianyu <tianyu.lan@intel.com>
Description:
- The /sys/bus/usb/devices/.../(hub interface)/portX
+ The /sys/bus/usb/devices/.../<hub_interface>/port<X>
is usb port device's sysfs directory.
-What: /sys/bus/usb/devices/.../(hub interface)/portX/connect_type
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/connect_type
Date: January 2013
Contact: Lan Tianyu <tianyu.lan@intel.com>
Description:
@@ -182,7 +199,7 @@ Description:
The file will read "hotplug", "hardwired" and "not used" if the
information is available, and "unknown" otherwise.
-What: /sys/bus/usb/devices/.../(hub interface)/portX/location
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/location
Date: October 2018
Contact: Bjørn Mork <bjorn@mork.no>
Description:
@@ -192,7 +209,7 @@ Description:
raw location value as a hex integer.
-What: /sys/bus/usb/devices/.../(hub interface)/portX/quirks
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/quirks
Date: May 2018
Contact: Nicolas Boichat <drinkcat@chromium.org>
Description:
@@ -216,7 +233,7 @@ Description:
used to help make enumeration work better on some high speed
devices.
-What: /sys/bus/usb/devices/.../(hub interface)/portX/over_current_count
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/over_current_count
Date: February 2018
Contact: Richard Leitner <richard.leitner@skidata.com>
Description:
@@ -230,10 +247,10 @@ Description:
Any time this value changes the corresponding hub device will send a
udev event with the following attributes::
- OVER_CURRENT_PORT=/sys/bus/usb/devices/.../(hub interface)/portX
+ OVER_CURRENT_PORT=/sys/bus/usb/devices/.../<hub_interface>/port<X>
OVER_CURRENT_COUNT=[current value of this sysfs attribute]
-What: /sys/bus/usb/devices/.../(hub interface)/portX/usb3_lpm_permit
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/usb3_lpm_permit
Date: November 2015
Contact: Lu Baolu <baolu.lu@linux.intel.com>
Description:
@@ -244,6 +261,37 @@ Description:
is permitted, "u2" if only u2 is permitted, "u1_u2" if both u1 and
u2 are permitted.
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/connector
+Date: December 2021
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Link to the USB Type-C connector when available. This link is
+ only created when USB Type-C Connector Class is enabled, and
+ only if the system firmware is capable of describing the
+ connection between a port and its connector.
+
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/disable
+Date: June 2022
+Contact: Michael Grzeschik <m.grzeschik@pengutronix.de>
+Description:
+ This file controls the state of a USB port, including
+ Vbus power output (but only on hubs that support
+ power switching -- most hubs don't support it). If
+ a port is disabled, the port is unusable: Devices
+ attached to the port will not be detected, initialized,
+ or enumerated.
+
+What: /sys/bus/usb/devices/.../<hub_interface>/port<X>/early_stop
+Date: Sep 2022
+Contact: Ray Chi <raychi@google.com>
+Description:
+		Some USB hosts have watchdog mechanisms that may cause the
+		device to enter ramdump if port initialization takes too long.
+		This attribute limits each port to two initialization attempts
+		so that port initialization fails quickly. In addition, if a
+		port marked with early_stop has failed to initialize, it will
+		ignore all future connections until this attribute is cleared.
+
What: /sys/bus/usb/devices/.../power/usb2_lpm_l1_timeout
Date: May 2013
Contact: Mathias Nyman <mathias.nyman@linux.intel.com>
@@ -288,3 +336,277 @@ Description:
USB 3.2 adds Dual-lane support, 2 rx and 2 tx -lanes over Type-C.
Inter-Chip SSIC devices support asymmetric lanes up to 4 lanes per
direction. Devices before USB 3.2 are single lane (tx_lanes = 1)
+
+What: /sys/bus/usb/devices/usbX/bAlternateSetting
+Description:
+ The current interface alternate setting number, in decimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bcdDevice
+Description:
+ The device's release number, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bConfigurationValue
+Description:
+		While a USB device typically has just one configuration
+ setting, some devices support multiple configurations.
+
+ This value shows the current configuration, in decimal.
+
+ Changing its value will change the device's configuration
+ to another setting.
+
+ The number of configurations supported by a device is at:
+
+ /sys/bus/usb/devices/usbX/bNumConfigurations
+
+ See USB specs for its meaning.
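+
+		A usage sketch (the configuration value is illustrative and
+		must be one the device actually offers)::
+
+		    # echo 2 > /sys/bus/usb/devices/usbX/bConfigurationValue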
+
+What: /sys/bus/usb/devices/usbX/bDeviceClass
+Description:
+ Class code of the device, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bDeviceProtocol
+Description:
+ Protocol code of the device, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bDeviceSubClass
+Description:
+ Subclass code of the device, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bInterfaceClass
+Description:
+ Class code of the interface, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bInterfaceNumber
+Description:
+ Interface number, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bInterfaceProtocol
+Description:
+ Protocol code of the interface, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bInterfaceSubClass
+Description:
+ Subclass code of the interface, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bmAttributes
+Description:
+ Attributes of the current configuration, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bMaxPacketSize0
+Description:
+ Maximum endpoint 0 packet size, in decimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bMaxPower
+Description:
+ Maximum power consumption of the active configuration of
+		the device, in milliamperes.
+
+What: /sys/bus/usb/devices/usbX/bNumConfigurations
+Description:
+ Number of the possible configurations of the device, in
+ decimal. The current configuration is controlled via:
+
+ /sys/bus/usb/devices/usbX/bConfigurationValue
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bNumEndpoints
+Description:
+ Number of endpoints used on this interface, in hexadecimal.
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/bNumInterfaces
+Description:
+ Number of interfaces on this device, in decimal.
+
+What: /sys/bus/usb/devices/usbX/busnum
+Description:
+ Number of the bus.
+
+What: /sys/bus/usb/devices/usbX/configuration
+Description:
+ Contents of the string descriptor associated with the
+ current configuration. It may include the firmware version
+ of a device and/or its serial number.
+
+What: /sys/bus/usb/devices/usbX/descriptors
+Description:
+ Contains the interface descriptors, in binary.
+
+What: /sys/bus/usb/devices/usbX/idProduct
+Description:
+ Product ID, in hexadecimal.
+
+What: /sys/bus/usb/devices/usbX/idVendor
+Description:
+ Vendor ID, in hexadecimal.
+
+What: /sys/bus/usb/devices/usbX/devspec
+Description:
+ Displays the Device Tree Open Firmware node of the interface.
+
+What: /sys/bus/usb/devices/usbX/avoid_reset_quirk
+Description:
+ Most devices have this set to zero.
+
+		If the value is 1, a USB quirk is enabled that prevents this
+		device from being reset.
+
+ (read/write)
+
+What: /sys/bus/usb/devices/usbX/devnum
+Description:
+ USB interface device number, in decimal.
+
+What: /sys/bus/usb/devices/usbX/devpath
+Description:
+ String containing the USB interface device path.
+
+What: /sys/bus/usb/devices/usbX/manufacturer
+Description:
+ Vendor specific string containing the name of the
+ manufacturer of the device.
+
+What: /sys/bus/usb/devices/usbX/maxchild
+Description:
+		Number of ports of a USB hub.
+
+What: /sys/bus/usb/devices/usbX/persist
+Description:
+ Keeps the device even if it gets disconnected.
+
+What: /sys/bus/usb/devices/usbX/product
+Description:
+ Vendor specific string containing the name of the
+ device's product.
+
+What: /sys/bus/usb/devices/usbX/speed
+Description:
+ Shows the device's max speed, according to the USB version,
+ in Mbps.
+ Can be:
+
+ ======= ====================
+ Unknown speed unknown
+ 1.5 Low speed
+ 15 Full speed
+ 480 High Speed
+ 5000 Super Speed
+ 10000 Super Speed+
+ 20000 Super Speed+ Gen 2x2
+ ======= ====================
+
+What: /sys/bus/usb/devices/usbX/supports_autosuspend
+Description:
+ Returns 1 if the device doesn't support autosuspend.
+ Otherwise, returns 0.
+
+What: /sys/bus/usb/devices/usbX/urbnum
+Description:
+ Number of URBs submitted for the whole device.
+
+What: /sys/bus/usb/devices/usbX/version
+Description:
+ String containing the USB device version, as encoded
+		in the BCD descriptor.
+
+What: /sys/bus/usb/devices/usbX/power/autosuspend
+Description:
+ Time in milliseconds for the device to autosuspend. If the
+ value is negative, then autosuspend is prevented.
+
+ (read/write)
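+
+		A usage sketch (the timeout value is illustrative)::
+
+		    # echo 5000 > /sys/bus/usb/devices/usbX/power/autosuspend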
+
+What: /sys/bus/usb/devices/usbX/power/active_duration
+Description:
+ The total time the device has not been suspended.
+
+What: /sys/bus/usb/devices/usbX/power/connected_duration
+Description:
+ The total time (in msec) that the device has been connected.
+
+What: /sys/bus/usb/devices/usbX/power/level
+Description:
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/bEndpointAddress
+Description:
+ The address of the endpoint described by this descriptor,
+ in hexadecimal. The endpoint direction on this bitmapped field
+ is also shown at:
+
+ /sys/bus/usb/devices/usbX/ep_<N>/direction
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/bInterval
+Description:
+ The interval of the endpoint as described on its descriptor,
+ in hexadecimal. The actual interval depends on the version
+ of the USB. Also shown in time units at
+ /sys/bus/usb/devices/usbX/ep_<N>/interval.
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/bLength
+Description:
+ Number of bytes of the endpoint descriptor, in hexadecimal.
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/bmAttributes
+Description:
+ Attributes which apply to the endpoint as described on its
+ descriptor, in hexadecimal. The endpoint type on this
+ bitmapped field is also shown at:
+
+ /sys/bus/usb/devices/usbX/ep_<N>/type
+
+ See USB specs for its meaning.
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/direction
+Description:
+ Direction of the endpoint. Can be:
+
+ - both (on control endpoints)
+ - in
+ - out
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/interval
+Description:
+ Interval for polling endpoint for data transfers, in
+		milliseconds or microseconds.
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/type
+Description:
+ Descriptor type. Can be:
+
+ - Control
+ - Isoc
+ - Bulk
+ - Interrupt
+ - unknown
+
+What: /sys/bus/usb/devices/usbX/ep_<N>/wMaxPacketSize
+Description:
+ Maximum packet size this endpoint is capable of
+ sending or receiving, in hexadecimal.
diff --git a/Documentation/ABI/testing/sysfs-bus-vdpa b/Documentation/ABI/testing/sysfs-bus-vdpa
new file mode 100644
index 000000000000..28a6111202ba
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-vdpa
@@ -0,0 +1,57 @@
+What: /sys/bus/vdpa/driver_autoprobe
+Date: March 2020
+Contact: virtualization@lists.linux-foundation.org
+Description:
+		This file determines whether new devices are immediately bound
+		to a driver after creation. It initially contains 1, which
+		means the kernel automatically binds devices to a compatible
+		driver immediately after they are created.
+
+		Writing "0" to this file disables this feature; any other
+		string enables it.
+
+What: /sys/bus/vdpa/driver_probe
+Date: March 2020
+Contact: virtualization@lists.linux-foundation.org
+Description:
+		Writing a device name to this file will cause the kernel to
+		bind the device to a compatible driver.
+
+ This can be useful when /sys/bus/vdpa/driver_autoprobe is
+ disabled.
+
+What: /sys/bus/vdpa/drivers/.../bind
+Date: March 2020
+Contact: virtualization@lists.linux-foundation.org
+Description:
+ Writing a device name to this file will cause the driver to
+ attempt to bind to the device. This is useful for overriding
+ default bindings.
+
+What: /sys/bus/vdpa/drivers/.../unbind
+Date: March 2020
+Contact: virtualization@lists.linux-foundation.org
+Description:
+ Writing a device name to this file will cause the driver to
+ attempt to unbind from the device. This may be useful when
+ overriding default bindings.
+
+What: /sys/bus/vdpa/devices/.../driver_override
+Date: November 2021
+Contact: virtualization@lists.linux-foundation.org
+Description:
+ This file allows the driver for a device to be specified.
+ When specified, only a driver with a name matching the value
+ written to driver_override will have an opportunity to bind to
+ the device. The override is specified by writing a string to the
+ driver_override file (echo vhost-vdpa > driver_override) and may
+ be cleared with an empty string (echo > driver_override).
+		This returns the device to the standard matching rules for
+		binding.
+ Writing to driver_override does not automatically unbind the
+ device from its current driver or make any attempt to
+ automatically load the specified driver. If no driver with a
+ matching name is currently loaded in the kernel, the device will
+ not bind to any driver. This also allows devices to opt-out of
+ driver binding using a driver_override name such as "none".
+		Only a single driver may be specified in the override; there is
+		no support for parsing delimiters.
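+
+		A usage sketch based on the description above (the device name
+		vdpa0 is illustrative)::
+
+		    # echo vhost-vdpa > /sys/bus/vdpa/devices/vdpa0/driver_override
+		    # echo vdpa0 > /sys/bus/vdpa/driver_probe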
diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi
index 5402bd74ba43..0d2abd88a18c 100644
--- a/Documentation/ABI/testing/sysfs-class-bdi
+++ b/Documentation/ABI/testing/sysfs-class-bdi
@@ -23,14 +23,17 @@ default
The default backing dev, used for non-block device backed
filesystems which do not provide their own BDI.
-Files under /sys/class/bdi/<bdi>/
-
-read_ahead_kb (read-write)
-
+What: /sys/class/bdi/<bdi>/read_ahead_kb
+Date: January 2008
+Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:
Size of the read-ahead window in kilobytes
-min_ratio (read-write)
-
+ (read-write)
+What: /sys/class/bdi/<bdi>/min_ratio
+Date: January 2008
+Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:
Under normal circumstances each device is given a part of the
total write-back cache that relates to its current average
writeout speed in relation to the other devices.
@@ -39,8 +42,27 @@ min_ratio (read-write)
percentage of the write-back cache to a particular device.
For example, this is useful for providing a minimum QoS.
-max_ratio (read-write)
+ (read-write)
+What: /sys/class/bdi/<bdi>/min_ratio_fine
+Date: November 2022
+Contact: Stefan Roesch <shr@devkernel.io>
+Description:
+ Under normal circumstances each device is given a part of the
+ total write-back cache that relates to its current average
+ writeout speed in relation to the other devices.
+
+ The 'min_ratio_fine' parameter allows assigning a minimum reserve
+ of the write-back cache to a particular device. The value is
+ expressed as part of 1 million. For example, this is useful for
+ providing a minimum QoS.
+
+ (read-write)
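+
+		A usage sketch (the value, in parts per million, is
+		illustrative)::
+
+		    # echo 1000 > /sys/class/bdi/<bdi>/min_ratio_fine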
+
+What: /sys/class/bdi/<bdi>/max_ratio
+Date: January 2008
+Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:
Allows limiting a particular device to use not more than the
given percentage of the write-back cache. This is useful in
situations where we want to avoid one device taking all or
@@ -48,7 +70,65 @@ max_ratio (read-write)
mount that is prone to get stuck, or a FUSE mount which cannot
be trusted to play fair.
-stable_pages_required (read-only)
+ (read-write)
+
+What: /sys/class/bdi/<bdi>/max_ratio_fine
+Date: November 2022
+Contact: Stefan Roesch <shr@devkernel.io>
+Description:
+ Allows limiting a particular device to use not more than the
+ given value of the write-back cache. The value is given as part
+ of 1 million. This is useful in situations where we want to avoid
+ one device taking all or most of the write-back cache. For example
+ in case of an NFS mount that is prone to get stuck, or a FUSE mount
+ which cannot be trusted to play fair.
+
+ (read-write)
+What: /sys/class/bdi/<bdi>/min_bytes
+Date: October 2022
+Contact: Stefan Roesch <shr@devkernel.io>
+Description:
+ Under normal circumstances each device is given a part of the
+ total write-back cache that relates to its current average
+ writeout speed in relation to the other devices.
+
+		The 'min_bytes' parameter allows assigning a minimum share of
+		the write-back cache to a particular device, expressed in
+		bytes.
+ For example, this is useful for providing a minimum QoS.
+
+ (read-write)
+
+What: /sys/class/bdi/<bdi>/max_bytes
+Date: October 2022
+Contact: Stefan Roesch <shr@devkernel.io>
+Description:
+ Allows limiting a particular device to use not more than the
+ given 'max_bytes' of the write-back cache. This is useful in
+ situations where we want to avoid one device taking all or
+ most of the write-back cache. For example in case of an NFS
+ mount that is prone to get stuck, a FUSE mount which cannot be
+		trusted to play fair, or an nbd device.
+
+ (read-write)
+
+What: /sys/class/bdi/<bdi>/strict_limit
+Date: October 2022
+Contact: Stefan Roesch <shr@devkernel.io>
+Description:
+		Forces per-BDI checks for the share of the given device in the
+		write-back cache even before the global background dirty limit
+		is reached. This is useful in situations where the global limit
+		is much higher than is affordable for a given relatively slow
+		(or untrusted) device. Turning strictlimit on has no visible
+		effect if max_ratio is equal to 100%.
+
+ (read-write)
+What: /sys/class/bdi/<bdi>/stable_pages_required
+Date: January 2008
+Contact: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Description:
If set, the backing device requires that all pages comprising a write
request must not be changed until writeout is complete.
+
+ (read-only)
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index 818f55970efb..594fda254130 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -103,8 +103,8 @@ What: /sys/class/cxl/<afu>/api_version_compatible
Date: September 2014
Contact: linuxppc-dev@lists.ozlabs.org
Description: read only
- Decimal value of the the lowest version of the userspace API
- this this kernel supports.
+ Decimal value of the lowest version of the userspace API
+ this kernel supports.
Users: https://github.com/ibm-capi/libcxl
@@ -166,10 +166,11 @@ Description: read only
Decimal value of the Per Process MMIO space length.
Users: https://github.com/ibm-capi/libcxl
-What: /sys/class/cxl/<afu>m/pp_mmio_off (not in a guest)
+What: /sys/class/cxl/<afu>m/pp_mmio_off
Date: September 2014
Contact: linuxppc-dev@lists.ozlabs.org
Description: read only
+ (not in a guest)
Decimal value of the Per Process MMIO space offset.
Users: https://github.com/ibm-capi/libcxl
@@ -190,28 +191,31 @@ Description: read only
Identifies the revision level of the PSL.
Users: https://github.com/ibm-capi/libcxl
-What: /sys/class/cxl/<card>/base_image (not in a guest)
+What: /sys/class/cxl/<card>/base_image
Date: September 2014
Contact: linuxppc-dev@lists.ozlabs.org
Description: read only
+ (not in a guest)
Identifies the revision level of the base image for devices
that support loadable PSLs. For FPGAs this field identifies
the image contained in the on-adapter flash which is loaded
during the initial program load.
Users: https://github.com/ibm-capi/libcxl
-What: /sys/class/cxl/<card>/image_loaded (not in a guest)
+What: /sys/class/cxl/<card>/image_loaded
Date: September 2014
Contact: linuxppc-dev@lists.ozlabs.org
Description: read only
+ (not in a guest)
Will return "user" or "factory" depending on the image loaded
onto the card.
Users: https://github.com/ibm-capi/libcxl
-What: /sys/class/cxl/<card>/load_image_on_perst (not in a guest)
+What: /sys/class/cxl/<card>/load_image_on_perst
Date: December 2014
Contact: linuxppc-dev@lists.ozlabs.org
Description: read/write
+ (not in a guest)
Valid entries are "none", "user", and "factory".
"none" means PERST will not cause image to be loaded to the
card. A power cycle is required to load the image.
@@ -235,10 +239,11 @@ Description: write only
contexts on the card AFUs.
Users: https://github.com/ibm-capi/libcxl
-What: /sys/class/cxl/<card>/perst_reloads_same_image (not in a guest)
+What: /sys/class/cxl/<card>/perst_reloads_same_image
Date: July 2015
Contact: linuxppc-dev@lists.ozlabs.org
Description: read/write
+ (not in a guest)
Trust that when an image is reloaded via PERST, it will not
have changed.
diff --git a/Documentation/ABI/testing/sysfs-class-devfreq-event b/Documentation/ABI/testing/sysfs-class-devfreq-event
index ceaf0f686d4a..dbe48495e55a 100644
--- a/Documentation/ABI/testing/sysfs-class-devfreq-event
+++ b/Documentation/ABI/testing/sysfs-class-devfreq-event
@@ -1,25 +1,25 @@
-What: /sys/class/devfreq-event/event(x)/
+What: /sys/class/devfreq-event/event<x>/
Date: January 2017
Contact: Chanwoo Choi <cw00.choi@samsung.com>
Description:
Provide a place in sysfs for the devfreq-event objects.
This allows accessing various devfreq-event specific variables.
- The name of devfreq-event object denoted as 'event(x)' which
+ The name of devfreq-event object denoted as 'event<x>' which
includes the unique number of 'x' for each devfreq-event object.
-What: /sys/class/devfreq-event/event(x)/name
+What: /sys/class/devfreq-event/event<x>/name
Date: January 2017
Contact: Chanwoo Choi <cw00.choi@samsung.com>
Description:
- The /sys/class/devfreq-event/event(x)/name attribute contains
+ The /sys/class/devfreq-event/event<x>/name attribute contains
the name of the devfreq-event object. This attribute is
read-only.
-What: /sys/class/devfreq-event/event(x)/enable_count
+What: /sys/class/devfreq-event/event<x>/enable_count
Date: January 2017
Contact: Chanwoo Choi <cw00.choi@samsung.com>
Description:
- The /sys/class/devfreq-event/event(x)/enable_count attribute
+ The /sys/class/devfreq-event/event<x>/enable_count attribute
contains the reference count to enable the devfreq-event
object. If the device is enabled, the value of attribute is
greater than zero.
diff --git a/Documentation/ABI/testing/sysfs-class-extcon b/Documentation/ABI/testing/sysfs-class-extcon
index fde0fecd5de9..f8e705375b24 100644
--- a/Documentation/ABI/testing/sysfs-class-extcon
+++ b/Documentation/ABI/testing/sysfs-class-extcon
@@ -65,19 +65,19 @@ Description:
interface associated with each cable cannot update
multiple cable states of an extcon device simultaneously.
-What: /sys/class/extcon/.../cable.x/name
+What: /sys/class/extcon/.../cable.X/name
Date: February 2012
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
Description:
- The /sys/class/extcon/.../cable.x/name shows the name of cable
- "x" (integer between 0 and 31) of an extcon device.
+ The /sys/class/extcon/.../cable.X/name shows the name of cable
+ "X" (integer between 0 and 31) of an extcon device.
-What: /sys/class/extcon/.../cable.x/state
+What: /sys/class/extcon/.../cable.X/state
Date: February 2012
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
Description:
- The /sys/class/extcon/.../cable.x/state shows and stores the
- state of cable "x" (integer between 0 and 31) of an extcon
+ The /sys/class/extcon/.../cable.X/state shows and stores the
+ state of cable "X" (integer between 0 and 31) of an extcon
device. The state value is either 0 (detached) or 1
(attached).
diff --git a/Documentation/ABI/testing/sysfs-class-fc b/Documentation/ABI/testing/sysfs-class-fc
new file mode 100644
index 000000000000..3057a6d3b8cf
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-fc
@@ -0,0 +1,27 @@
+What: /sys/class/fc/fc_udev_device/appid_store
+Date: Aug 2021
+Contact:	Muneendra Kumar <muneendra.kumar@broadcom.com>
+Description:
+ This interface allows an admin to set an FC application
+ identifier in the blkcg associated with a cgroup id. The
+ identifier is typically a UUID that is associated with
+ an application or logical entity such as a virtual
+ machine or container group. The application or logical
+ entity utilizes a block device via the cgroup id.
+ FC adapter drivers may query the identifier and tag FC
+ traffic based on the identifier. FC host and FC fabric
+ entities can utilize the application id and FC traffic
+ tag to identify traffic sources.
+
+ The interface expects a string "<cgroupid>:<appid>" where:
+		<cgroupid> is the inode of the cgroup in hexadecimal
+		<appid> is a user-provided string of up to 128 characters
+		in length.
+
+ If an appid_store is done for a cgroup id that already
+ has an appid set, the new value will override the
+ previous value.
+
+ If an admin wants to remove an FC application identifier
+ from a cgroup, an appid_store should be done with the
+ following string: "<cgroupid>:"
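+
+		A hypothetical usage sketch (the cgroup path and UUID are
+		invented for illustration; the cgroup id is the cgroup
+		directory's inode number in hexadecimal)::
+
+		  ino=$(printf '%x' "$(stat -c %i /sys/fs/cgroup/vm1.scope)")
+		  echo "${ino}:8a6d53c5-5f4e-4f2f-9c1b-3b2f1d8e9a10" > \
+			/sys/class/fc/fc_udev_device/appid_store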
diff --git a/Documentation/ABI/testing/sysfs-class-firmware b/Documentation/ABI/testing/sysfs-class-firmware
new file mode 100644
index 000000000000..978d3d500400
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-firmware
@@ -0,0 +1,77 @@
+What: /sys/class/firmware/.../data
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: The data sysfs file is used for firmware-fallback and for
+ firmware uploads. Cat a firmware image to this sysfs file
+ after you echo 1 to the loading sysfs file. When the firmware
+ image write is complete, echo 0 to the loading sysfs file. This
+ sequence will signal the completion of the firmware write and
+ signal the lower-level driver that the firmware data is
+ available.
+
+What: /sys/class/firmware/.../cancel
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Write-only. For firmware uploads, write a "1" to this file to
+ request that the transfer of firmware data to the lower-level
+ device be canceled. This request will be rejected (EBUSY) if
+ the update cannot be canceled (e.g. a FLASH write is in
+ progress) or (ENODEV) if there is no firmware update in progress.
+
+What: /sys/class/firmware/.../error
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read-only. Returns a string describing a failed firmware
+ upload. This string will be in the form of <STATUS>:<ERROR>,
+ where <STATUS> will be one of the status strings described
+ for the status sysfs file and <ERROR> will be one of the
+ following: "hw-error", "timeout", "user-abort", "device-busy",
+ "invalid-file-size", "read-write-error", "flash-wearout". The
+ error sysfs file is only meaningful when the current firmware
+ upload status is "idle". If this file is read while a firmware
+ transfer is in progress, then the read will fail with EBUSY.
+
+What: /sys/class/firmware/.../loading
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: The loading sysfs file is used for both firmware-fallback and
+ for firmware uploads. Echo 1 onto the loading file to indicate
+ you are writing a firmware file to the data sysfs node. Echo
+ -1 onto this file to abort the data write or echo 0 onto this
+ file to indicate that the write is complete. For firmware
+ uploads, the zero value also triggers the transfer of the
+ firmware data to the lower-level device driver.
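+
+		A minimal upload sequence sketch (device name and image
+		path are illustrative)::
+
+		  cd /sys/class/firmware/fw-device
+		  echo 1 > loading
+		  cat /path/to/firmware.bin > data
+		  echo 0 > loading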
+
+What: /sys/class/firmware/.../remaining_size
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read-only. For firmware upload, this file contains the size
+ of the firmware data that remains to be transferred to the
+ lower-level device driver. The size value is initialized to
+ the full size of the firmware image that was previously
+ written to the data sysfs file. This value is periodically
+ updated during the "transferring" phase of the firmware
+ upload.
+ Format: "%u".
+
+What: /sys/class/firmware/.../status
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read-only. Returns a string describing the current status of
+ a firmware upload. The string will be one of the following:
+		"idle", "receiving", "preparing", "transferring", "programming".
+
+What: /sys/class/firmware/.../timeout
+Date: July 2022
+KernelVersion: 5.19
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: This file supports the timeout mechanism for firmware
+		fallback. This file has no effect on firmware uploads. For
+ more information on timeouts please see the documentation
+ for firmware fallback.
diff --git a/Documentation/ABI/testing/sysfs-class-firmware-attributes b/Documentation/ABI/testing/sysfs-class-firmware-attributes
index 90fdf935aa5e..4cdba3477176 100644
--- a/Documentation/ABI/testing/sysfs-class-firmware-attributes
+++ b/Documentation/ABI/testing/sysfs-class-firmware-attributes
@@ -116,7 +116,7 @@ Description:
<value>[ForceIf:<attribute>=<value>]
<value>[ForceIfNot:<attribute>=<value>]
- For example:
+ For example::
LegacyOrom/dell_value_modifier has value:
Disabled[ForceIf:SecureBoot=Enabled]
@@ -161,6 +161,15 @@ Description:
power-on:
Representing a password required to use
the system
+			system-mgmt:
+				Representing the System Management password.
+				See the Lenovo extensions section for details.
+			HDD:
+				Representing the HDD password.
+				See the Lenovo extensions section for details.
+			NVMe:
+				Representing the NVMe password.
+				See the Lenovo extensions section for details.
mechanism:
The means of authentication. This attribute is mandatory.
@@ -203,10 +212,17 @@ Description:
the next boot.
Lenovo specific class extensions
- ------------------------------
+ --------------------------------
On Lenovo systems the following additional settings are available:
+ role: system-mgmt This gives the same authority as the bios-admin password to control
+ security related features. The authorities allocated can be set via
+ the BIOS menu SMP Access Control Policy
+
+	role: HDD & NVMe This password is used to unlock access to the drive at boot. See
+			the 'level' and 'index' extensions below.
+
lenovo_encoding:
The encoding method that is used. This can be either "ascii"
or "scancode". Default is set to "ascii"
@@ -216,6 +232,71 @@ Description:
two char code (e.g. "us", "fr", "gr") and may vary per platform.
Default is set to "us"
+ level:
+ Available for HDD and NVMe authentication to set 'user' or 'master'
+ privilege level.
+ If only the user password is configured then this should be used to
+ unlock the drive at boot. If both master and user passwords are set
+			then either can be used. If a master password is set, a user
+			password is also required.
+			This attribute defaults to the 'user' level.
+
+ index:
+			Used with HDD and NVMe authentication to set the drive index
+			that is being referenced (e.g. hdd0, hdd1, etc.).
+ This attribute defaults to device 0.
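+
+			For example, to set a user-level password on the second
+			drive (a sketch; the authentication directory name follows
+			the role names above, and current_password/new_password are
+			the standard class password attributes)::
+
+			  echo "user" > authentication/HDD/level
+			  echo 1 > authentication/HDD/index
+			  echo "old password" > authentication/HDD/current_password
+			  echo "new password" > authentication/HDD/new_password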
+
+ certificate, signature, save_signature:
+ These attributes are used for certificate based authentication. This is
+ used in conjunction with a signing server as an alternative to password
+ based authentication.
+ The user writes to the attribute(s) with a BASE64 encoded string obtained
+ from the signing server.
+ The attributes can be displayed to check the stored value.
+
+ Some usage examples:
+
+ Installing a certificate to enable feature::
+
+ echo "supervisor password" > authentication/Admin/current_password
+ echo "signed certificate" > authentication/Admin/certificate
+
+ Updating the installed certificate::
+
+ echo "signature" > authentication/Admin/signature
+ echo "signed certificate" > authentication/Admin/certificate
+
+ Removing the installed certificate::
+
+ echo "signature" > authentication/Admin/signature
+ echo "" > authentication/Admin/certificate
+
+ Changing a BIOS setting::
+
+ echo "signature" > authentication/Admin/signature
+ echo "save signature" > authentication/Admin/save_signature
+ echo Enable > attribute/PasswordBeep/current_value
+
+ You cannot enable certificate authentication if a supervisor password
+ has not been set.
+			Clearing the certificate results in no bios-admin authentication
+			method being configured, allowing anyone to make changes.
+ After any of these operations the system must reboot for the changes to
+ take effect.
+
+ certificate_thumbprint:
+ Read only attribute used to display the MD5, SHA1 and SHA256 thumbprints
+ for the certificate installed in the BIOS.
+
+ certificate_to_password:
+ Write only attribute used to switch from certificate based authentication
+ back to password based.
+ Usage::
+
+ echo "signature" > authentication/Admin/signature
+ echo "password" > authentication/Admin/certificate_to_password
+
+
What: /sys/class/firmware-attributes/*/attributes/pending_reboot
Date: February 2021
KernelVersion: 5.11
@@ -268,7 +349,7 @@ Description:
# echo "factory" > /sys/class/firmware-attributes/*/device/attributes/reset_bios
# cat /sys/class/firmware-attributes/*/device/attributes/reset_bios
- # builtinsafe lastknowngood [factory] custom
+ builtinsafe lastknowngood [factory] custom
Note that any changes to this attribute requires a reboot
for changes to take effect.
diff --git a/Documentation/ABI/testing/sysfs-class-gnss b/Documentation/ABI/testing/sysfs-class-gnss
index c8553d972edd..9650f3a7fc03 100644
--- a/Documentation/ABI/testing/sysfs-class-gnss
+++ b/Documentation/ABI/testing/sysfs-class-gnss
@@ -1,4 +1,4 @@
-What: /sys/class/gnss/gnssN/type
+What: /sys/class/gnss/gnss<N>/type
Date: May 2018
KernelVersion: 4.18
Contact: Johan Hovold <johan@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-class-hwmon b/Documentation/ABI/testing/sysfs-class-hwmon
new file mode 100644
index 000000000000..638f4c6d4ec7
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-hwmon
@@ -0,0 +1,958 @@
+What: /sys/class/hwmon/hwmonX/name
+Description:
+ The chip name.
+ This should be a short, lowercase string, not containing
+ whitespace, dashes, or the wildcard character '*'.
+ This attribute represents the chip name. It is the only
+ mandatory attribute.
+ I2C devices get this attribute created automatically.
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/label
+Description:
+		A descriptive label that allows a device to be uniquely
+		identified within the system.
+ The contents of the label are free-form.
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/update_interval
+Description:
+ The interval at which the chip will update readings.
+ Unit: millisecond
+
+ RW
+
+ Some devices have a variable update rate or interval.
+ This attribute can be used to change it to the desired value.
+
+What: /sys/class/hwmon/hwmonX/inY_min
+Description:
+ Voltage min value.
+
+ Unit: millivolt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/inY_lcrit
+Description:
+ Voltage critical min value.
+
+ Unit: millivolt
+
+ RW
+
+ If voltage drops to or below this limit, the system may
+ take drastic action such as power down or reset. At the very
+ least, it should report a fault.
+
+What: /sys/class/hwmon/hwmonX/inY_max
+Description:
+ Voltage max value.
+
+ Unit: millivolt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/inY_crit
+Description:
+ Voltage critical max value.
+
+ Unit: millivolt
+
+ RW
+
+ If voltage reaches or exceeds this limit, the system may
+ take drastic action such as power down or reset. At the very
+ least, it should report a fault.
+
+What: /sys/class/hwmon/hwmonX/inY_input
+Description:
+ Voltage input value.
+
+ Unit: millivolt
+
+ RO
+
+ Voltage measured on the chip pin.
+
+ Actual voltage depends on the scaling resistors on the
+ motherboard, as recommended in the chip datasheet.
+
+ This varies by chip and by motherboard.
+ Because of this variation, values are generally NOT scaled
+ by the chip driver, and must be done by the application.
+ However, some drivers (notably lm87 and via686a)
+ do scale, because of internal resistors built into a chip.
+ These drivers will output the actual voltage. Rule of
+ thumb: drivers should report the voltage values at the
+ "pins" of the chip.
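+
+	For example, if the motherboard halves a 12 V rail with a 1:1
+	resistor divider before the chip pin (an invented setup), a raw
+	reading of::
+
+	  cat /sys/class/hwmon/hwmon0/in2_input
+	  6000
+
+	corresponds to roughly 12 V on the rail, and the application
+	must apply the 2x scaling itself.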
+
+What: /sys/class/hwmon/hwmonX/inY_average
+Description:
+ Average voltage
+
+ Unit: millivolt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/inY_lowest
+Description:
+ Historical minimum voltage
+
+ Unit: millivolt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/inY_highest
+Description:
+ Historical maximum voltage
+
+ Unit: millivolt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/inY_reset_history
+Description:
+ Reset inX_lowest and inX_highest
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/in_reset_history
+Description:
+ Reset inX_lowest and inX_highest for all sensors
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/inY_label
+Description:
+ Suggested voltage channel label.
+
+ Text string
+
+ Should only be created if the driver has hints about what
+ this voltage channel is being used for, and user-space
+ doesn't. In all other cases, the label is provided by
+ user-space.
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/inY_enable
+Description:
+ Enable or disable the sensors.
+
+ When disabled the sensor read will return -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/cpuY_vid
+Description:
+ CPU core reference voltage.
+
+ Unit: millivolt
+
+ RO
+
+ Not always correct.
+
+What: /sys/class/hwmon/hwmonX/vrm
+Description:
+ Voltage Regulator Module version number.
+
+	RW (but changing it should no longer be necessary)
+
+ Originally the VRM standard version multiplied by 10, but now
+ an arbitrary number, as not all standards have a version
+ number.
+
+ Affects the way the driver calculates the CPU core reference
+ voltage from the vid pins.
+
+What: /sys/class/hwmon/hwmonX/inY_rated_min
+Description:
+ Minimum rated voltage.
+
+ Unit: millivolt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/inY_rated_max
+Description:
+ Maximum rated voltage.
+
+ Unit: millivolt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/fanY_min
+Description:
+ Fan minimum value
+
+ Unit: revolution/min (RPM)
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/fanY_max
+Description:
+ Fan maximum value
+
+ Unit: revolution/min (RPM)
+
+ Only rarely supported by the hardware.
+ RW
+
+What: /sys/class/hwmon/hwmonX/fanY_input
+Description:
+ Fan input value.
+
+ Unit: revolution/min (RPM)
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/fanY_div
+Description:
+ Fan divisor.
+
+ Integer value in powers of two (1, 2, 4, 8, 16, 32, 64, 128).
+
+ RW
+
+ Some chips only support values 1, 2, 4 and 8.
+ Note that this is actually an internal clock divisor, which
+ affects the measurable speed range, not the read value.
+
+What: /sys/class/hwmon/hwmonX/fanY_pulses
+Description:
+ Number of tachometer pulses per fan revolution.
+
+ Integer value, typically between 1 and 4.
+
+ RW
+
+ This value is a characteristic of the fan connected to the
+ device's input, so it has to be set in accordance with the fan
+ model.
+
+ Should only be created if the chip has a register to configure
+ the number of pulses. In the absence of such a register (and
+ thus attribute) the value assumed by all devices is 2 pulses
+ per fan revolution.
+
+What: /sys/class/hwmon/hwmonX/fanY_target
+Description:
+ Desired fan speed
+
+ Unit: revolution/min (RPM)
+
+ RW
+
+ Only makes sense if the chip supports closed-loop fan speed
+ control based on the measured fan speed.
+
+What: /sys/class/hwmon/hwmonX/fanY_label
+Description:
+ Suggested fan channel label.
+
+ Text string
+
+ Should only be created if the driver has hints about what
+ this fan channel is being used for, and user-space doesn't.
+ In all other cases, the label is provided by user-space.
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/fanY_enable
+Description:
+ Enable or disable the sensors.
+
+ When disabled the sensor read will return -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/fanY_fault
+Description:
+ Reports if a fan has reported failure.
+
+ - 1: Failed
+ - 0: Ok
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/pwmY
+Description:
+ Pulse width modulation fan control.
+
+ Integer value in the range 0 to 255
+
+ RW
+
+ 255 is max or 100%.
+
+What: /sys/class/hwmon/hwmonX/pwmY_enable
+Description:
+ Fan speed control method:
+
+ - 0: no fan speed control (i.e. fan at full speed)
+ - 1: manual fan speed control enabled (using `pwmY`)
+ - 2+: automatic fan speed control enabled
+
+ Check individual chip documentation files for automatic mode
+ details.
+
+ RW
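+
+	For example, to take manual control of the first PWM output and
+	set it to roughly 50% duty (device path illustrative)::
+
+	  echo 1 > /sys/class/hwmon/hwmon0/pwm1_enable
+	  echo 128 > /sys/class/hwmon/hwmon0/pwm1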
+
+What: /sys/class/hwmon/hwmonX/pwmY_mode
+Description:
+ - 0: DC mode (direct current)
+ - 1: PWM mode (pulse-width modulation)
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/pwmY_freq
+Description:
+ Base PWM frequency in Hz.
+
+	Only possibly available when pwmY_mode is PWM, but not always
+ present even then.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/pwmY_auto_channels_temp
+Description:
+ Select which temperature channels affect this PWM output in
+ auto mode.
+
+	Bitfield: 1 is temp1, 2 is temp2, 4 is temp3, etc.
+	Which values are possible depends on the chip used.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/pwmY_auto_pointZ_pwm
+What: /sys/class/hwmon/hwmonX/pwmY_auto_pointZ_temp
+What: /sys/class/hwmon/hwmonX/pwmY_auto_pointZ_temp_hyst
+Description:
+ Define the PWM vs temperature curve.
+
+ Number of trip points is chip-dependent. Use this for chips
+ which associate trip points to PWM output channels.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_auto_pointZ_pwm
+What: /sys/class/hwmon/hwmonX/tempY_auto_pointZ_temp
+What: /sys/class/hwmon/hwmonX/tempY_auto_pointZ_temp_hyst
+Description:
+ Define the PWM vs temperature curve.
+
+ Number of trip points is chip-dependent. Use this for chips
+ which associate trip points to temperature channels.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_type
+Description:
+ Sensor type selection.
+
+ Integers 1 to 6
+
+ RW
+
+ - 1: CPU embedded diode
+ - 2: 3904 transistor
+ - 3: thermal diode
+ - 4: thermistor
+ - 5: AMD AMDSI
+ - 6: Intel PECI
+
+ Not all types are supported by all chips
+
+What: /sys/class/hwmon/hwmonX/tempY_max
+Description:
+ Temperature max value.
+
+ Unit: millidegree Celsius (or millivolt, see below)
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_min
+Description:
+ Temperature min value.
+
+ Unit: millidegree Celsius
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_max_hyst
+Description:
+ Temperature hysteresis value for max limit.
+
+ Unit: millidegree Celsius
+
+ Must be reported as an absolute temperature, NOT a delta
+ from the max value.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_min_hyst
+Description:
+ Temperature hysteresis value for min limit.
+ Unit: millidegree Celsius
+
+ Must be reported as an absolute temperature, NOT a delta
+ from the min value.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_input
+Description:
+ Temperature input value.
+
+ Unit: millidegree Celsius
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/tempY_crit
+Description:
+ Temperature critical max value, typically greater than
+ corresponding temp_max values.
+
+ Unit: millidegree Celsius
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_crit_alarm
+Description:
+ Critical high temperature alarm flag.
+
+ - 0: OK
+ - 1: temperature has reached tempY_crit
+
+ RW
+
+ Contrary to regular alarm flags which clear themselves
+ automatically when read, this one sticks until cleared by
+ the user. This is done by writing 0 to the file. Writing
+ other values is unsupported.
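+
+	For example, once the condition has been handled, the sticky
+	flag can be cleared with (path illustrative)::
+
+	  echo 0 > /sys/class/hwmon/hwmon0/temp1_crit_alarm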
+
+What: /sys/class/hwmon/hwmonX/tempY_crit_hyst
+Description:
+ Temperature hysteresis value for critical limit.
+
+ Unit: millidegree Celsius
+
+ Must be reported as an absolute temperature, NOT a delta
+ from the critical value.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_emergency
+Description:
+ Temperature emergency max value, for chips supporting more than
+	two upper temperature limits. Must be equal to or greater than
+ corresponding temp_crit values.
+
+ Unit: millidegree Celsius
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_emergency_hyst
+Description:
+ Temperature hysteresis value for emergency limit.
+
+ Unit: millidegree Celsius
+
+ Must be reported as an absolute temperature, NOT a delta
+ from the emergency value.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_lcrit
+Description:
+ Temperature critical min value, typically lower than
+ corresponding temp_min values.
+
+ Unit: millidegree Celsius
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_lcrit_hyst
+Description:
+ Temperature hysteresis value for critical min limit.
+
+ Unit: millidegree Celsius
+
+ Must be reported as an absolute temperature, NOT a delta
+ from the critical min value.
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_offset
+Description:
+ Temperature offset which is added to the temperature reading
+ by the chip.
+
+ Unit: millidegree Celsius
+
+	RW
+
+What: /sys/class/hwmon/hwmonX/tempY_label
+Description:
+ Suggested temperature channel label.
+
+ Text string
+
+ Should only be created if the driver has hints about what
+ this temperature channel is being used for, and user-space
+ doesn't. In all other cases, the label is provided by
+ user-space.
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/tempY_lowest
+Description:
+ Historical minimum temperature
+
+ Unit: millidegree Celsius
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/tempY_highest
+Description:
+ Historical maximum temperature
+
+ Unit: millidegree Celsius
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/tempY_reset_history
+Description:
+ Reset temp_lowest and temp_highest
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/temp_reset_history
+Description:
+ Reset temp_lowest and temp_highest for all sensors
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/tempY_enable
+Description:
+ Enable or disable the sensors.
+
+ When disabled the sensor read will return -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/tempY_rated_min
+Description:
+ Minimum rated temperature.
+
+ Unit: millidegree Celsius
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/tempY_rated_max
+Description:
+ Maximum rated temperature.
+
+ Unit: millidegree Celsius
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/currY_max
+Description:
+ Current max value
+
+ Unit: milliampere
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/currY_min
+Description:
+ Current min value.
+
+ Unit: milliampere
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/currY_lcrit
+Description:
+ Current critical low value
+
+ Unit: milliampere
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/currY_crit
+Description:
+ Current critical high value.
+
+ Unit: milliampere
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/currY_input
+Description:
+ Current input value
+
+ Unit: milliampere
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/currY_average
+Description:
+ Average current use
+
+ Unit: milliampere
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/currY_lowest
+Description:
+ Historical minimum current
+
+ Unit: milliampere
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/currY_highest
+Description:
+ Historical maximum current
+
+	Unit: milliampere
+
+	RO
+
+What: /sys/class/hwmon/hwmonX/currY_reset_history
+Description:
+ Reset currX_lowest and currX_highest
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/curr_reset_history
+Description:
+ Reset currX_lowest and currX_highest for all sensors
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/currY_enable
+Description:
+ Enable or disable the sensors.
+
+ When disabled the sensor read will return -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/currY_rated_min
+Description:
+ Minimum rated current.
+
+ Unit: milliampere
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/currY_rated_max
+Description:
+ Maximum rated current.
+
+ Unit: milliampere
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_average
+Description:
+ Average power use
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_average_interval
+Description:
+ Power use averaging interval. A poll
+ notification is sent to this file if the
+ hardware changes the averaging interval.
+
+ Unit: milliseconds
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_average_interval_max
+Description:
+ Maximum power use averaging interval
+
+ Unit: milliseconds
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_average_interval_min
+Description:
+ Minimum power use averaging interval
+
+ Unit: milliseconds
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_average_highest
+Description:
+ Historical average maximum power use
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_average_lowest
+Description:
+ Historical average minimum power use
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_average_max
+Description:
+ A poll notification is sent to
+ `powerY_average` when power use
+ rises above this value.
+
+ Unit: microWatt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_average_min
+Description:
+ A poll notification is sent to
+ `powerY_average` when power use
+ sinks below this value.
+
+ Unit: microWatt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_input
+Description:
+ Instantaneous power use
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_input_highest
+Description:
+ Historical maximum power use
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_input_lowest
+Description:
+ Historical minimum power use
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_reset_history
+Description:
+ Reset input_highest, input_lowest,
+ average_highest and average_lowest.
+
+ WO
+
+What: /sys/class/hwmon/hwmonX/powerY_accuracy
+Description:
+ Accuracy of the power meter.
+
+ Unit: Percent
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_cap
+Description:
+ If power use rises above this limit, the
+ system should take action to reduce power use.
+ A poll notification is sent to this file if the
+ cap is changed by the hardware. The `*_cap`
+ files only appear if the cap is known to be
+ enforced by hardware.
+
+ Unit: microWatt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_cap_hyst
+Description:
+ Margin of hysteresis built around capping and
+ notification.
+
+ Unit: microWatt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_cap_max
+Description:
+ Maximum cap that can be set.
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_cap_min
+Description:
+ Minimum cap that can be set.
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_max
+Description:
+ Maximum power.
+
+ Unit: microWatt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_crit
+Description:
+ Critical maximum power.
+
+ If power rises to or above this limit, the
+	system is expected to take drastic action to reduce
+ power consumption, such as a system shutdown or
+ a forced powerdown of some devices.
+
+ Unit: microWatt
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_enable
+Description:
+ Enable or disable the sensors.
+
+ When disabled the sensor read will return
+ -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/powerY_rated_min
+Description:
+ Minimum rated power.
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/powerY_rated_max
+Description:
+ Maximum rated power.
+
+ Unit: microWatt
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/energyY_input
+Description:
+ Cumulative energy use
+
+ Unit: microJoule
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/energyY_enable
+Description:
+ Enable or disable the sensors.
+
+ When disabled the sensor read will return
+ -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/humidityY_input
+Description:
+ Humidity
+
+ Unit: milli-percent (per cent mille, pcm)
+
+ RO
+
+
+What: /sys/class/hwmon/hwmonX/humidityY_enable
+Description:
+ Enable or disable the sensors
+
+ When disabled the sensor read will return
+ -ENODATA.
+
+ - 1: Enable
+ - 0: Disable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/humidityY_rated_min
+Description:
+ Minimum rated humidity.
+
+ Unit: milli-percent (per cent mille, pcm)
+
+ RO
+
+What: /sys/class/hwmon/hwmonX/humidityY_rated_max
+Description:
+ Maximum rated humidity.
+
+ Unit: milli-percent (per cent mille, pcm)
+
+ RO
+
+
+What: /sys/class/hwmon/hwmonX/intrusionY_alarm
+Description:
+ Chassis intrusion detection
+
+ - 0: OK
+ - 1: intrusion detected
+
+ RW
+
+ Contrary to regular alarm flags which clear themselves
+ automatically when read, this one sticks until cleared by
+ the user. This is done by writing 0 to the file. Writing
+ other values is unsupported.
+
+What: /sys/class/hwmon/hwmonX/intrusionY_beep
+Description:
+ Chassis intrusion beep
+
+ - 0: disable
+ - 1: enable
+
+ RW
+
+What: /sys/class/hwmon/hwmonX/device/pec
+Description:
+ PEC support on I2C devices
+
+ - 0, off, n: disable
+ - 1, on, y: enable
+
+ RW
diff --git a/Documentation/ABI/testing/sysfs-class-mei b/Documentation/ABI/testing/sysfs-class-mei
index 5c52372b43cb..1db36ddf8e58 100644
--- a/Documentation/ABI/testing/sysfs-class-mei
+++ b/Documentation/ABI/testing/sysfs-class-mei
@@ -6,7 +6,7 @@ Description:
The mei/ class sub-directory belongs to mei device class
-What: /sys/class/mei/meiN/
+What: /sys/class/mei/mei<N>/
Date: May 2014
KernelVersion: 3.17
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -14,7 +14,7 @@ Description:
The /sys/class/mei/meiN directory is created for
each probed mei device
-What: /sys/class/mei/meiN/fw_status
+What: /sys/class/mei/mei<N>/fw_status
Date: Nov 2014
KernelVersion: 3.19
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -29,7 +29,7 @@ Description: Display fw status registers content
Also number of registers varies between 1 and 6
depending on generation.
-What: /sys/class/mei/meiN/hbm_ver
+What: /sys/class/mei/mei<N>/hbm_ver
Date: Aug 2016
KernelVersion: 4.9
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -38,7 +38,7 @@ Description: Display the negotiated HBM protocol version.
The HBM protocol version negotiated
between the driver and the device.
-What: /sys/class/mei/meiN/hbm_ver_drv
+What: /sys/class/mei/mei<N>/hbm_ver_drv
Date: Aug 2016
KernelVersion: 4.9
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -46,7 +46,7 @@ Description: Display the driver HBM protocol version.
The HBM protocol version supported by the driver.
-What: /sys/class/mei/meiN/tx_queue_limit
+What: /sys/class/mei/mei<N>/tx_queue_limit
Date: Jan 2018
KernelVersion: 4.16
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -55,7 +55,7 @@ Description: Configure tx queue limit
Set maximal number of pending writes
per opened session.
-What: /sys/class/mei/meiN/fw_ver
+What: /sys/class/mei/mei<N>/fw_ver
Date: May 2018
KernelVersion: 4.18
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -66,7 +66,7 @@ Description: Display the ME firmware version.
There can be up to three such blocks for different
FW components.
-What: /sys/class/mei/meiN/dev_state
+What: /sys/class/mei/mei<N>/dev_state
Date: Mar 2019
KernelVersion: 5.1
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -81,7 +81,7 @@ Description: Display the ME device state.
POWER_DOWN
POWER_UP
-What: /sys/class/mei/meiN/trc
+What: /sys/class/mei/mei<N>/trc
Date: Nov 2019
KernelVersion: 5.5
Contact: Tomas Winkler <tomas.winkler@intel.com>
@@ -91,7 +91,7 @@ Description: Display trc status register content
status information into trc status register
for BIOS and OS to monitor fw health.
-What: /sys/class/mei/meiN/kind
+What: /sys/class/mei/mei<N>/kind
Date: Jul 2020
KernelVersion: 5.8
Contact: Tomas Winkler <tomas.winkler@intel.com>
diff --git a/Documentation/ABI/testing/sysfs-class-mic b/Documentation/ABI/testing/sysfs-class-mic
index bd0e780c3760..5e5f36d10055 100644
--- a/Documentation/ABI/testing/sysfs-class-mic
+++ b/Documentation/ABI/testing/sysfs-class-mic
@@ -8,7 +8,7 @@ Description:
PCIe form factor add-in Coprocessor card based on the Intel Many
Integrated Core (MIC) architecture that runs a Linux OS.
-What: /sys/class/mic/mic(x)
+What: /sys/class/mic/mic<X>
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -17,7 +17,7 @@ Description:
represent MIC devices (0,1,..etc). Each directory has
information specific to that MIC device.
-What: /sys/class/mic/mic(x)/family
+What: /sys/class/mic/mic<X>/family
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -25,7 +25,7 @@ Description:
Provides information about the Coprocessor family for an Intel
MIC device. For example - "x100"
-What: /sys/class/mic/mic(x)/stepping
+What: /sys/class/mic/mic<X>/stepping
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -33,7 +33,7 @@ Description:
Provides information about the silicon stepping for an Intel
MIC device. For example - "A0" or "B0"
-What: /sys/class/mic/mic(x)/state
+What: /sys/class/mic/mic<X>/state
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -69,7 +69,7 @@ Description:
"shutdown" Initiates card OS shutdown.
========== ===================================================
-What: /sys/class/mic/mic(x)/shutdown_status
+What: /sys/class/mic/mic<X>/shutdown_status
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -88,7 +88,7 @@ Description:
"restart" Shutdown because of a restart command.
========== ===================================================
-What: /sys/class/mic/mic(x)/cmdline
+What: /sys/class/mic/mic<X>/cmdline
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -104,7 +104,7 @@ Description:
or modify existing ones and then write the whole kernel command
line back to this entry.
-What: /sys/class/mic/mic(x)/firmware
+What: /sys/class/mic/mic<X>/firmware
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -114,7 +114,7 @@ Description:
card can be found. The entry can be written to change the
firmware image location under /lib/firmware/.
-What: /sys/class/mic/mic(x)/ramdisk
+What: /sys/class/mic/mic<X>/ramdisk
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -124,7 +124,7 @@ Description:
OS boot can be found. The entry can be written to change
the ramdisk image location under /lib/firmware/.
-What: /sys/class/mic/mic(x)/bootmode
+What: /sys/class/mic/mic<X>/bootmode
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -135,7 +135,7 @@ Description:
a) linux - Boot a Linux image.
b) flash - Boot an image for flash updates.
-What: /sys/class/mic/mic(x)/log_buf_addr
+What: /sys/class/mic/mic<X>/log_buf_addr
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -149,7 +149,7 @@ Description:
log buffer address to be written can be found in the System.map
file of the card OS.
-What: /sys/class/mic/mic(x)/log_buf_len
+What: /sys/class/mic/mic<X>/log_buf_len
Date: October 2013
KernelVersion: 3.13
Contact: Sudeep Dutt <sudeep.dutt@intel.com>
@@ -163,7 +163,7 @@ Description:
buffer length address to be written can be found in the
System.map file of the card OS.
-What: /sys/class/mic/mic(x)/heartbeat_enable
+What: /sys/class/mic/mic<X>/heartbeat_enable
Date: March 2015
KernelVersion: 4.4
Contact: Ashutosh Dixit <ashutosh.dixit@intel.com>
diff --git a/Documentation/ABI/testing/sysfs-class-mux b/Documentation/ABI/testing/sysfs-class-mux
index 8715f9c7bd4f..c58b7b6e1aa6 100644
--- a/Documentation/ABI/testing/sysfs-class-mux
+++ b/Documentation/ABI/testing/sysfs-class-mux
@@ -7,7 +7,7 @@ Description:
Framework and provides a sysfs interface for using MUX
controllers.
-What: /sys/class/mux/muxchipN/
+What: /sys/class/mux/muxchip<N>/
Date: April 2017
KernelVersion: 4.13
Contact: Peter Rosin <peda@axentia.se>
diff --git a/Documentation/ABI/testing/sysfs-class-net-peak_usb b/Documentation/ABI/testing/sysfs-class-net-peak_usb
new file mode 100644
index 000000000000..9e3d0bf4d4b2
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-net-peak_usb
@@ -0,0 +1,19 @@
+
+What: /sys/class/net/<iface>/peak_usb/can_channel_id
+Date: November 2022
+KernelVersion: 6.2
+Contact: Stephane Grosjean <s.grosjean@peak-system.com>
+Description:
+ PEAK PCAN-USB devices support user-configurable CAN channel
+ identifiers. Contrary to a USB serial number, these identifiers
+ are writable and can be set per CAN interface. This means that
+ if a USB device exports multiple CAN interfaces, each of them
+ can be assigned a unique channel ID.
+ This attribute provides read-only access to the currently
+ configured value of the channel identifier. Depending on the
+	device type, the identifier is 8 or 32 bits long. The value
+	read from this attribute is always an 8-digit 32-bit
+	hexadecimal value in big-endian format. If the device only
+	supports an 8-bit identifier, the upper 24 bits of the value
+	are set to zero.
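+
+	A read sketch (interface name and value are invented)::
+
+	  cat /sys/class/net/can0/peak_usb/can_channel_id
+	  0000002a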
+
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
index ca830c6cd809..7c81f0a25a48 100644
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -364,7 +364,10 @@ Date: April 2019
Contact: linux-pm@vger.kernel.org
Description:
Represents a battery percentage level, above which charging will
- stop.
+ stop. Not all hardware is capable of setting this to an arbitrary
+ percentage. Drivers will round written values to the nearest
+ supported value. Reading back the value will show the actual
+ threshold set by the driver.
Access: Read, Write
@@ -380,13 +383,17 @@ Description:
algorithm to adjust the charge rate dynamically, without
any user configuration required. "Custom" means that the charger
uses the charge_control_* properties as configuration for some
- different algorithm.
+ different algorithm. "Long Life" means the charger reduces its
+ charging rate in order to prolong the battery health. "Bypass"
+ means the charger bypasses the charging path around the
+ integrated converter allowing for a "smart" wall adaptor to
+ perform the power conversion externally.
Access: Read, Write
Valid values:
"Unknown", "N/A", "Trickle", "Fast", "Standard",
- "Adaptive", "Custom"
+ "Adaptive", "Custom", "Long Life", "Bypass"
What: /sys/class/power_supply/<supply_name>/charge_term_current
Date: July 2014
@@ -413,7 +420,7 @@ Description:
"Over voltage", "Unspecified failure", "Cold",
"Watchdog timer expire", "Safety timer expire",
"Over current", "Calibration required", "Warm",
- "Cool", "Hot"
+ "Cool", "Hot", "No battery"
What: /sys/class/power_supply/<supply_name>/precharge_current
Date: June 2017
@@ -430,7 +437,8 @@ What: /sys/class/power_supply/<supply_name>/present
Date: May 2007
Contact: linux-pm@vger.kernel.org
Description:
- Reports whether a battery is present or not in the system.
+ Reports whether a battery is present or not in the system. If the
+ property does not exist, the battery is considered to be present.
Access: Read
@@ -455,6 +463,21 @@ Description:
"Unknown", "Charging", "Discharging",
"Not charging", "Full"
+What: /sys/class/power_supply/<supply_name>/charge_behaviour
+Date: November 2021
+Contact: linux-pm@vger.kernel.org
+Description:
+ Represents the charging behaviour.
+
+ Access: Read, Write
+
+ Valid values:
+ ================ ====================================
+ auto: Charge normally, respect thresholds
+ inhibit-charge: Do not charge while AC is attached
+ force-discharge: Force discharge while AC is attached
+ ================ ====================================
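+
+	For example (supply name illustrative; the active value is
+	typically shown in square brackets when read)::
+
+	  cat /sys/class/power_supply/BAT0/charge_behaviour
+	  [auto] inhibit-charge force-discharge
+	  echo force-discharge > /sys/class/power_supply/BAT0/charge_behaviour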
+
What: /sys/class/power_supply/<supply_name>/technology
Date: May 2007
Contact: linux-pm@vger.kernel.org
@@ -480,6 +503,19 @@ Description:
Valid values: Represented in microvolts
+What: /sys/class/power_supply/<supply_name>/cycle_count
+Date: January 2010
+Contact: linux-pm@vger.kernel.org
+Description:
+ Reports the number of full charge + discharge cycles the
+ battery has undergone.
+
+ Access: Read
+
+ Valid values:
+ Integer > 0: representing full cycles
+ Integer = 0: cycle_count info is not available
+
**USB Properties**
What: /sys/class/power_supply/<supply_name>/input_current_limit
diff --git a/Documentation/ABI/testing/sysfs-class-power-rt9467 b/Documentation/ABI/testing/sysfs-class-power-rt9467
new file mode 100644
index 000000000000..619b7c45d145
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power-rt9467
@@ -0,0 +1,19 @@
+What: /sys/class/power_supply/rt9467-*/sysoff_enable
+Date: Feb 2023
+KernelVersion: 6.3
+Contact: ChiaEn Wu <chiaen_wu@richtek.com>
+Description:
+ This entry allows enabling the sysoff mode of rt9467 charger
+ devices.
+ If enabled and the input is removed, the internal battery FET
+		the device datasheet for details. It is commonly used when the
+		product enters the shipping stage. After entering shipping
+		mode, only a 'VBUS' insertion or a 'Power key' press can make
+		the device leave this mode. Disabling it again can also leave
+		this mode, but that rather aborts the action before the device
+		has really entered shipping mode.
+ abort the action before the device really enter shipping mode.
+
+ Access: Read, Write
+ Valid values:
+ - 1: enabled
+ - 0: disabled
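+
+	For example, to arm shipping mode once the product is ready to
+	ship (the supply directory name is illustrative)::
+
+	  echo 1 > /sys/class/power_supply/rt9467-chg/sysoff_enable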
diff --git a/Documentation/ABI/testing/sysfs-class-power-rt9471 b/Documentation/ABI/testing/sysfs-class-power-rt9471
new file mode 100644
index 000000000000..0a390ee5ac21
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power-rt9471
@@ -0,0 +1,32 @@
+What: /sys/class/power_supply/rt9471-*/sysoff_enable
+Date: Feb 2023
+KernelVersion: 6.3
+Contact: ChiYuan Huang <cy_huang@richtek.com>
+Description:
+ This entry allows enabling the sysoff mode of rt9471 charger devices.
+ If enabled and the input is removed, the internal battery FET is turned
+ off to reduce the leakage from the BAT pin. See device datasheet for details.
+		It is commonly used when the product enters the shipping stage.
+		After entering shipping mode, only a 'VBUS' insertion or a
+		'Power key' press can make the device leave this mode. Disabling
+		it again can also leave this mode, but that rather aborts the
+		action before the device has really entered shipping mode.
+
+ Access: Read, Write
+ Valid values:
+ - 1: enabled
+ - 0: disabled
+
+What: /sys/class/power_supply/rt9471-*/port_detect_enable
+Date: Feb 2023
+KernelVersion: 6.3
+Contact: ChiYuan Huang <cy_huang@richtek.com>
+Description:
+		This entry allows enabling the USB BC1.2 port detection function
+		of rt9471 charger devices. If enabled and VBUS is inserted, the
+		device will start BC1.2 port detection and report the USB port
+		type once detection is done. See the datasheet for details. This
+		is normally controlled when a Type-C/USB-PD port is integrated.
+
+ Access: Read, Write
+ Valid values:
+ - 1: enabled
+ - 0: disabled
diff --git a/Documentation/ABI/testing/sysfs-class-pwm b/Documentation/ABI/testing/sysfs-class-pwm
index c20e61354561..0638c94d01ef 100644
--- a/Documentation/ABI/testing/sysfs-class-pwm
+++ b/Documentation/ABI/testing/sysfs-class-pwm
@@ -7,7 +7,7 @@ Description:
Framework and provides a sysfs interface for using PWM
channels.
-What: /sys/class/pwm/pwmchipN/
+What: /sys/class/pwm/pwmchip<N>/
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
@@ -16,14 +16,14 @@ Description:
probed PWM controller/chip where N is the base of the
PWM chip.
-What: /sys/class/pwm/pwmchipN/npwm
+What: /sys/class/pwm/pwmchip<N>/npwm
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
Description:
The number of PWM channels supported by the PWM chip.
-What: /sys/class/pwm/pwmchipN/export
+What: /sys/class/pwm/pwmchip<N>/export
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
@@ -31,14 +31,14 @@ Description:
Exports a PWM channel from the PWM chip for sysfs control.
Value is between 0 and /sys/class/pwm/pwmchipN/npwm - 1.
-What: /sys/class/pwm/pwmchipN/unexport
+What: /sys/class/pwm/pwmchip<N>/unexport
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
Description:
Unexports a PWM channel.
-What: /sys/class/pwm/pwmchipN/pwmX
+What: /sys/class/pwm/pwmchip<N>/pwmX
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
@@ -47,21 +47,21 @@ Description:
each exported PWM channel where X is the exported PWM
channel number.
-What: /sys/class/pwm/pwmchipN/pwmX/period
+What: /sys/class/pwm/pwmchip<N>/pwmX/period
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
Description:
Sets the PWM signal period in nanoseconds.
-What: /sys/class/pwm/pwmchipN/pwmX/duty_cycle
+What: /sys/class/pwm/pwmchip<N>/pwmX/duty_cycle
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
Description:
Sets the PWM signal duty cycle in nanoseconds.
-What: /sys/class/pwm/pwmchipN/pwmX/polarity
+What: /sys/class/pwm/pwmchip<N>/pwmX/polarity
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
@@ -69,7 +69,7 @@ Description:
Sets the output polarity of the PWM signal to "normal" or
"inversed".
-What: /sys/class/pwm/pwmchipN/pwmX/enable
+What: /sys/class/pwm/pwmchip<N>/pwmX/enable
Date: May 2013
KernelVersion: 3.11
Contact: H Hartley Sweeten <hsweeten@visionengravers.com>
@@ -78,10 +78,10 @@ Description:
0 is disabled
1 is enabled
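+
+		A minimal sketch driving channel 0 of chip 0 at 1 kHz with a
+		50% duty cycle (chip and channel numbers illustrative)::
+
+		  echo 0 > /sys/class/pwm/pwmchip0/export
+		  echo 1000000 > /sys/class/pwm/pwmchip0/pwm0/period
+		  echo 500000 > /sys/class/pwm/pwmchip0/pwm0/duty_cycle
+		  echo 1 > /sys/class/pwm/pwmchip0/pwm0/enable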
-What: /sys/class/pwm/pwmchipN/pwmX/capture
+What: /sys/class/pwm/pwmchip<N>/pwmX/capture
Date: June 2016
KernelVersion: 4.8
-Contact: Lee Jones <lee.jones@linaro.org>
+Contact: Lee Jones <lee@kernel.org>
Description:
	Capture information about a PWM signal. The output format is a
	pair of unsigned integers (period and duty cycle), separated by a
	single space.
diff --git a/Documentation/ABI/testing/sysfs-class-rapidio b/Documentation/ABI/testing/sysfs-class-rapidio
index 19aefb21b639..81e09145525a 100644
--- a/Documentation/ABI/testing/sysfs-class-rapidio
+++ b/Documentation/ABI/testing/sysfs-class-rapidio
@@ -10,7 +10,7 @@ Description:
NOTE: An mport ID is not a RapidIO destination ID assigned to a
given local mport device.
-What: /sys/class/rapidio_port/rapidioN/sys_size
+What: /sys/class/rapidio_port/rapidio<N>/sys_size
Date: Apr, 2014
KernelVersion: v3.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
@@ -22,7 +22,7 @@ Description:
1 = large (16-bit destination ID, max. 65536 devices).
-What: /sys/class/rapidio_port/rapidioN/port_destid
+What: /sys/class/rapidio_port/rapidio<N>/port_destid
Date: Apr, 2014
KernelVersion: v3.15
Contact: Matt Porter <mporter@kernel.crashing.org>,
diff --git a/Documentation/ABI/testing/sysfs-class-rc b/Documentation/ABI/testing/sysfs-class-rc
index 9c8ff7910858..84e46d70d82b 100644
--- a/Documentation/ABI/testing/sysfs-class-rc
+++ b/Documentation/ABI/testing/sysfs-class-rc
@@ -7,7 +7,7 @@ Description:
core and provides a sysfs interface for configuring infrared
remote controller receivers.
-What: /sys/class/rc/rcN/
+What: /sys/class/rc/rc<N>/
Date: Apr 2010
KernelVersion: 2.6.35
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
@@ -15,7 +15,7 @@ Description:
A /sys/class/rc/rcN directory is created for each remote
control receiver device where N is the number of the receiver.
-What: /sys/class/rc/rcN/protocols
+What: /sys/class/rc/rc<N>/protocols
Date: Jun 2010
KernelVersion: 2.6.36
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
@@ -40,7 +40,7 @@ Description:
Write fails with EINVAL if an invalid protocol combination or
unknown protocol name is used.
-What: /sys/class/rc/rcN/filter
+What: /sys/class/rc/rc<N>/filter
Date: Jan 2014
KernelVersion: 3.15
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
@@ -55,7 +55,7 @@ Description:
This value may be reset to 0 if the current protocol is altered.
-What: /sys/class/rc/rcN/filter_mask
+What: /sys/class/rc/rc<N>/filter_mask
Date: Jan 2014
KernelVersion: 3.15
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
@@ -72,7 +72,7 @@ Description:
This value may be reset to 0 if the current protocol is altered.
-What: /sys/class/rc/rcN/wakeup_protocols
+What: /sys/class/rc/rc<N>/wakeup_protocols
Date: Feb 2017
KernelVersion: 4.11
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
@@ -98,7 +98,7 @@ Description:
unknown protocol name is used, or if wakeup is not supported by
the hardware.
-What: /sys/class/rc/rcN/wakeup_filter
+What: /sys/class/rc/rc<N>/wakeup_filter
Date: Jan 2014
KernelVersion: 3.15
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
@@ -117,7 +117,7 @@ Description:
This value may be reset to 0 if the wakeup protocol is altered.
-What: /sys/class/rc/rcN/wakeup_filter_mask
+What: /sys/class/rc/rc<N>/wakeup_filter_mask
Date: Jan 2014
KernelVersion: 3.15
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-class-rc-nuvoton b/Documentation/ABI/testing/sysfs-class-rc-nuvoton
index d3abe45f8690..f7bad8ecd08f 100644
--- a/Documentation/ABI/testing/sysfs-class-rc-nuvoton
+++ b/Documentation/ABI/testing/sysfs-class-rc-nuvoton
@@ -1,4 +1,4 @@
-What: /sys/class/rc/rcN/wakeup_data
+What: /sys/class/rc/rc<N>/wakeup_data
Date: Mar 2016
KernelVersion: 4.6
Contact: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator
index 8516f08806dd..475b9a372657 100644
--- a/Documentation/ABI/testing/sysfs-class-regulator
+++ b/Documentation/ABI/testing/sysfs-class-regulator
@@ -370,3 +370,84 @@ Description:
'unknown' means software cannot determine the state, or
the reported state is invalid.
+
+What: /sys/class/regulator/.../under_voltage
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ under_voltage. This indicates if the device reports an
+ under-voltage fault (1) or not (0).
+
+What: /sys/class/regulator/.../over_current
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_current. This indicates if the device reports an
+ over-current fault (1) or not (0).
+
+What: /sys/class/regulator/.../regulation_out
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ regulation_out. This indicates if the device reports an
+ out-of-regulation fault (1) or not (0).
+
+What: /sys/class/regulator/.../fail
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ fail. This indicates if the device reports an output failure
+ (1) or not (0).
+
+What: /sys/class/regulator/.../over_temp
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_temp. This indicates if the device reports an
+ over-temperature fault (1) or not (0).
+
+What: /sys/class/regulator/.../under_voltage_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ under_voltage_warn. This indicates if the device reports an
+ under-voltage warning (1) or not (0).
+
+What: /sys/class/regulator/.../over_current_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_current_warn. This indicates if the device reports an
+ over-current warning (1) or not (0).
+
+What: /sys/class/regulator/.../over_voltage_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_voltage_warn. This indicates if the device reports an
+ over-voltage warning (1) or not (0).
+
+What: /sys/class/regulator/.../over_temp_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_temp_warn. This indicates if the device reports an
+ over-temperature warning (1) or not (0).
diff --git a/Documentation/ABI/testing/sysfs-class-rtrs-client b/Documentation/ABI/testing/sysfs-class-rtrs-client
index 49a4157c7bf1..fecc59d1b96f 100644
--- a/Documentation/ABI/testing/sysfs-class-rtrs-client
+++ b/Documentation/ABI/testing/sysfs-class-rtrs-client
@@ -78,7 +78,7 @@ What: /sys/class/rtrs-client/<session-name>/paths/<src@dst>/hca_name
Date: Feb 2020
KernelVersion: 5.7
Contact: Jack Wang <jinpu.wang@cloud.ionos.com> Danil Kipnis <danil.kipnis@cloud.ionos.com>
-Description: RO, Contains the the name of HCA the connection established on.
+Description: RO, Contains the name of HCA the connection established on.
What: /sys/class/rtrs-client/<session-name>/paths/<src@dst>/hca_port
Date: Feb 2020
diff --git a/Documentation/ABI/testing/sysfs-class-rtrs-server b/Documentation/ABI/testing/sysfs-class-rtrs-server
index 3b6d5b067df0..b08601d80409 100644
--- a/Documentation/ABI/testing/sysfs-class-rtrs-server
+++ b/Documentation/ABI/testing/sysfs-class-rtrs-server
@@ -24,7 +24,7 @@ What: /sys/class/rtrs-server/<session-name>/paths/<src@dst>/hca_name
Date: Feb 2020
KernelVersion: 5.7
Contact: Jack Wang <jinpu.wang@cloud.ionos.com> Danil Kipnis <danil.kipnis@cloud.ionos.com>
-Description: RO, Contains the the name of HCA the connection established on.
+Description: RO, Contains the name of HCA the connection established on.
What: /sys/class/rtrs-server/<session-name>/paths/<src@dst>/hca_port
Date: Feb 2020
diff --git a/Documentation/ABI/testing/sysfs-class-thermal b/Documentation/ABI/testing/sysfs-class-thermal
new file mode 100644
index 000000000000..8eee37982b2a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-thermal
@@ -0,0 +1,259 @@
+What: /sys/class/thermal/thermal_zoneX/type
+Description:
+ Strings which represent the thermal zone type.
+ This is given by thermal zone driver as part of registration.
+		E.g. "acpitz" indicates it is an ACPI thermal device.
+		In order to keep it consistent with the hwmon sysfs attributes,
+		this should be a short, lowercase string, not containing
+		spaces or dashes.
+
+ RO, Required
+
+What: /sys/class/thermal/thermal_zoneX/temp
+Description:
+ Current temperature as reported by thermal zone (sensor).
+
+ Unit: millidegree Celsius
+
+ RO, Required
+
+What: /sys/class/thermal/thermal_zoneX/mode
+Description:
+ One of the predefined values in [enabled, disabled].
+ This file gives information about the algorithm that is
+		currently managing the thermal zone. It can be either the
+		default kernel-based algorithm or a user space application.
+
+		enabled
+			Enable kernel thermal management.
+		disabled
+			Prevent kernel thermal zone driver actions upon
+			trip points so that a user space application can
+			take full charge of the thermal management.
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/policy
+Description:
+ One of the various thermal governors used for a particular zone.
+
+ RW, Required
+
+What: /sys/class/thermal/thermal_zoneX/available_policies
+Description:
+ Available thermal governors which can be used for a
+ particular zone.
+
+ RO, Required
+
+What: /sys/class/thermal/thermal_zoneX/trip_point_Y_temp
+Description:
+		The temperature above which the trip point will be fired.
+
+ Unit: millidegree Celsius
+
+ RO, Optional
+
+What: /sys/class/thermal/thermal_zoneX/trip_point_Y_type
+Description:
+ Strings which indicate the type of the trip point.
+
+ E.g. it can be one of critical, hot, passive, `active[0-*]`
+ for ACPI thermal zone.
+
+ RO, Optional
+
+What: /sys/class/thermal/thermal_zoneX/trip_point_Y_hyst
+Description:
+ The hysteresis value for a trip point, represented as an
+ integer.
+
+ Unit: Celsius
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/cdevY
+Description:
+		Sysfs link to the thermal cooling device node, where the sysfs
+		interface for cooling device throttling control resides.
+
+ RO, Optional
+
+What: /sys/class/thermal/thermal_zoneX/cdevY_trip_point
+Description:
+ The trip point in this thermal zone which `cdev[0-*]` is
+ associated with; -1 means the cooling device is not
+ associated with any trip point.
+
+ RO, Optional
+
+What: /sys/class/thermal/thermal_zoneX/cdevY_weight
+Description:
+ The influence of `cdev[0-*]` in this thermal zone. This value
+		is relative to the rest of the cooling devices in the thermal
+		zone. For example, if a cooling device has a weight double
+		that of another, it is twice as effective in cooling the
+ thermal zone.
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/emul_temp
+Description:
+ Interface to set the emulated temperature of a thermal zone
+ (sensor). After setting this temperature, the thermal zone
+ may pass it to the platform emulation function, if one is
+ registered, or cache it locally. This is useful for debugging
+ different temperature thresholds and their associated cooling
+ actions. This is a write-only node, and writing 0 to it
+ disables emulation.
+
+ Unit: millidegree Celsius
+
+ WO, Optional
+
+ WARNING:
+ Be careful while enabling this option on production systems,
+ because userland can easily disable the thermal policy by simply
+ flooding this sysfs node with low temperature values.
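+
+ As a sketch, a debugging session could emulate an
+ over-temperature condition and then disable emulation again
+ (zone number and value illustrative)::
+
+   # echo 95000 > /sys/class/thermal/thermal_zone0/emul_temp
+   # echo 0 > /sys/class/thermal/thermal_zone0/emul_temp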
+
+
+What: /sys/class/thermal/thermal_zoneX/k_d
+Description:
+ The derivative term of the power allocator governor's PID
+ controller. For more information see
+ Documentation/driver-api/thermal/power_allocator.rst
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/k_i
+Description:
+ The integral term of the power allocator governor's PID
+ controller. This term allows the PID controller to compensate
+ for long term drift. For more information see
+ Documentation/driver-api/thermal/power_allocator.rst
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/k_po
+Description:
+ The proportional term of the power allocator governor's PID
+ controller during temperature overshoot. Temperature overshoot
+ is when the current temperature is above the "desired
+ temperature" trip point. For more information see
+ Documentation/driver-api/thermal/power_allocator.rst
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/k_pu
+Description:
+ The proportional term of the power allocator governor's PID
+ controller during temperature undershoot. Temperature undershoot
+ is when the current temperature is below the "desired
+ temperature" trip point. For more information see
+ Documentation/driver-api/thermal/power_allocator.rst
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/integral_cutoff
+Description:
+ Temperature offset from the desired temperature trip point
+ above which the integral term of the power allocator
+ governor's PID controller starts accumulating errors. For
+ example, if integral_cutoff is 0, then the integral term only
+ accumulates error when temperature is above the desired
+ temperature trip point. For more information see
+ Documentation/driver-api/thermal/power_allocator.rst
+
+ Unit: millidegree Celsius
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/slope
+Description:
+ The slope constant used in a linear extrapolation model
+ to determine a hotspot temperature based off the sensor's
+ raw readings. It is up to the device driver to determine
+ the usage of these values.
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/offset
+Description:
+ The offset constant used in a linear extrapolation model
+ to determine a hotspot temperature based off the sensor's
+ raw readings. It is up to the device driver to determine
+ the usage of these values.
+
+ RW, Optional
+
+What: /sys/class/thermal/thermal_zoneX/sustainable_power
+Description:
+ An estimate of the sustained power that can be dissipated by
+ the thermal zone. Used by the power allocator governor. For
+ more information see
+ Documentation/driver-api/thermal/power_allocator.rst
+
+ Unit: milliwatts
+
+ RW, Optional
+
+What: /sys/class/thermal/cooling_deviceX/type
+Description:
+ String which represents the type of device, e.g:
+
+ - for generic ACPI: should be "Fan", "Processor" or "LCD"
+ - for memory controller device on intel_menlow platform:
+ should be "Memory controller".
+
+ RO, Required
+
+What: /sys/class/thermal/cooling_deviceX/max_state
+Description:
+ The maximum permissible cooling state of this cooling device.
+
+ RO, Required
+
+What: /sys/class/thermal/cooling_deviceX/cur_state
+Description:
+ The current cooling state of this cooling device.
+ The value can be any integer between 0 and max_state:
+
+ - cur_state == 0 means no cooling
+ - cur_state == max_state means the maximum cooling.
+
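+ For instance, forcing a cooling device to its maximum state
+ (device number and value illustrative)::
+
+   # cat /sys/class/thermal/cooling_device0/max_state
+   3
+   # echo 3 > /sys/class/thermal/cooling_device0/cur_state
+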
+ RW, Required
+
+What: /sys/class/thermal/cooling_deviceX/stats/reset
+Description:
+ Writing any value resets the cooling device's statistics.
+
+ WO, Required
+
+What: /sys/class/thermal/cooling_deviceX/stats/time_in_state_ms
+Description:
+ The amount of time spent by the cooling device in various
+ cooling states. The output has one "<state> <time>" pair
+ per line, meaning this cooling device spent <time> msec of
+ time at <state>.
+
+ Output will have one line for each of the supported states.
+
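+ For example, a hypothetical 3-state device might show
+ (values illustrative)::
+
+   # cat /sys/class/thermal/cooling_device0/stats/time_in_state_ms
+   state0 1410
+   state1 220
+   state2 40
+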
+ RO, Required
+
+What: /sys/class/thermal/cooling_deviceX/stats/total_trans
+Description:
+ A single positive value showing the total number of times
+ the state of a cooling device is changed.
+
+ RO, Required
+
+What: /sys/class/thermal/cooling_deviceX/stats/trans_table
+Description:
+ This gives fine-grained information about all the cooling
+ state transitions. The cat output here is a two-dimensional matrix,
+ where an entry <i,j> (row i, column j) represents the number
+ of transitions from State_i to State_j. If the transition
+ table is bigger than PAGE_SIZE, reading this will return
+ an -EFBIG error.
+
+ RO, Required
diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec
index 40122d915ae1..281b995beb05 100644
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -141,6 +141,14 @@ Description:
- "reverse": CC2 orientation
- "unknown": Orientation cannot be determined.
+What: /sys/class/typec/<port>/select_usb_power_delivery
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Lists the USB Power Delivery Capabilities that the port can
+ advertise to the partner. The currently used capabilities are in
+ brackets. Selection happens by writing to the file.
+
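+ A usage sketch (port name and listed capability names are
+ illustrative)::
+
+   # cat /sys/class/typec/port0/select_usb_power_delivery
+   [pd0] pd1
+   # echo pd1 > /sys/class/typec/port0/select_usb_power_delivery
+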
USB Type-C partner devices (eg. /sys/class/typec/port0-partner/)
What: /sys/class/typec/<port>-partner/accessory_mode
@@ -200,7 +208,7 @@ Description: USB Power Delivery Specification defines a set of product types
amc Alternate Mode Controller
====================== ==========================
-What: /sys/class/typec/<port>-partner>/identity/
+What: /sys/class/typec/<port>-partner/identity/
Date: April 2017
Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Description:
diff --git a/Documentation/ABI/testing/sysfs-class-usb_power_delivery b/Documentation/ABI/testing/sysfs-class-usb_power_delivery
new file mode 100644
index 000000000000..1bf9d1d7902c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-usb_power_delivery
@@ -0,0 +1,249 @@
+What: /sys/class/usb_power_delivery
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Directory for USB Power Delivery devices.
+
+What: /sys/class/usb_power_delivery/.../revision
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ File showing the USB Power Delivery Specification Revision used
+ in communication.
+
+What: /sys/class/usb_power_delivery/.../version
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This is an optional attribute file showing the version of the
+ specific revision of the USB Power Delivery Specification. In
+ most cases the specification version is not known and the file
+ is not available.
+
+What: /sys/class/usb_power_delivery/.../source-capabilities
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The source capabilities message "Source_Capabilities" contains a
+ set of Power Data Objects (PDO), each representing a type of
+ power supply. The order of the PDO objects is defined in the USB
+ Power Delivery Specification. Each PDO - power supply - will
+ have its own device, and the PDO device name will start with the
+ object position number as the first character followed by the
+ power supply type name (":" as delimiter).
+
+ /sys/class/usb_power_delivery/.../source-capabilities/<position>:<type>
+
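+ For instance, listing a source capabilities directory might
+ yield (device and PDO names illustrative)::
+
+   # ls /sys/class/usb_power_delivery/pd0/source-capabilities/
+   1:fixed_supply  2:variable_supply
+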
+What: /sys/class/usb_power_delivery/.../sink-capabilities
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The sink capability message "Sink_Capabilities" contains a set
+ of Power Data Objects (PDO) just like with source capabilities,
+ but instead of describing the power capabilities, these objects
+ describe the power requirements.
+
+ The order of the objects in the sink capability message is the
+ same as with the source capabilities message.
+
+Fixed Supplies
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:fixed_supply
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Devices containing the attributes (the bit fields) defined for
+ Fixed Supplies.
+
+ The device "1:fixed_supply" is special. USB Power Delivery
+ Specification dictates that the first PDO (at object position
+ 1), and the only mandatory PDO, is always the vSafe5V Fixed
+ Supply Object. vSafe5V Object has additional fields defined for
+ it that the other Fixed Supply Objects do not have and that are
+ related to the USB capabilities rather than power capabilities.
+
+What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/dual_role_power
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file contains a boolean value that tells whether the
+ device supports both source and sink power roles.
+
+What: /sys/class/usb_power_delivery/.../source-capabilities/1:fixed_supply/usb_suspend_supported
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file shows the value of the USB Suspend Supported bit in
+ vSafe5V Fixed Supply Object. If the bit is set then the device
+ will follow the USB 2.0 and USB 3.2 rules for suspend and
+ resume.
+
+What: /sys/class/usb_power_delivery/.../sink-capabilities/1:fixed_supply/higher_capability
+Date: February 2023
+Contact: Saranya Gopal <saranya.gopal@linux.intel.com>
+Description:
+ This file shows the value of the Higher capability bit in
+ vSafe5V Fixed Supply Object. If the bit is set, then the sink
+ needs more than vSafe5V (e.g. 12 V) to provide full functionality.
+ Valid values: 0, 1
+
+What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/unconstrained_power
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file shows the value of the Unconstrained Power bit in
+ vSafe5V Fixed Supply Object. The bit is set when an external
+ source of power, powerful enough to power the entire system on
+ its own, is available for the device.
+
+What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/usb_communication_capable
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file shows the value of the USB Communication Capable bit in
+ vSafe5V Fixed Supply Object.
+
+What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/dual_role_data
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file shows the value of the Dual-Role Data bit in vSafe5V
+ Fixed Supply Object. Dual-role data means the ability to act
+ as both USB host and USB device.
+
+What: /sys/class/usb_power_delivery/.../<capability>/1:fixed_supply/unchunked_extended_messages_supported
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file shows the value of the Unchunked Extended Messages
+ Supported bit in vSafe5V Fixed Supply Object.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:fixed_supply/voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The voltage the supply supports in millivolts.
+
+What: /sys/class/usb_power_delivery/.../source-capabilities/<position>:fixed_supply/maximum_current
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Maximum current of the fixed source supply in milliamperes.
+
+What: /sys/class/usb_power_delivery/.../sink-capabilities/<position>:fixed_supply/operational_current
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Operational current of the sink in milliamperes.
+
+What: /sys/class/usb_power_delivery/.../sink-capabilities/<position>:fixed_supply/fast_role_swap_current
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ This file contains the value of the "Fast Role Swap USB Type-C
+ Current" field that tells the current level the sink requires
+ after a Fast Role Swap.
+ 0 - Fast Swap not supported"
+ 1 - Default USB Power"
+ 2 - 1.5A@5V"
+ 3 - 3.0A@5V"
+
+Variable Supplies
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:variable_supply
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Variable Power Supply PDO.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:variable_supply/maximum_voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Maximum Voltage in millivolts.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:variable_supply/minimum_voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Minimum Voltage in millivolts.
+
+What: /sys/class/usb_power_delivery/.../source-capabilities/<position>:variable_supply/maximum_current
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The maximum current in milliamperes that the source can supply
+ at the given Voltage range.
+
+What: /sys/class/usb_power_delivery/.../sink-capabilities/<position>:variable_supply/operational_current
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The operational current in milliamperes that the sink requires
+ at the given Voltage range.
+
+Battery Supplies
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:battery
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Battery PDO.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:battery/maximum_voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Maximum Voltage in millivolts.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:battery/minimum_voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Minimum Voltage in millivolts.
+
+What: /sys/class/usb_power_delivery/.../source-capabilities/<position>:battery/maximum_power
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Maximum allowable Power in milliwatts.
+
+What: /sys/class/usb_power_delivery/.../sink-capabilities/<position>:battery/operational_power
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The operational power that the sink requires at the given
+ voltage range.
+
+Standard Power Range (SPR) Programmable Power Supplies
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Programmable Power Supply (PPS) Augmented PDO (APDO).
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply/maximum_voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Maximum Voltage in millivolts.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply/minimum_voltage
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Minimum Voltage in millivolts.
+
+What: /sys/class/usb_power_delivery/.../<capability>/<position>:programmable_supply/maximum_current
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ Maximum Current in milliamperes.
+
+What: /sys/class/usb_power_delivery/.../source-capabilities/<position>:programmable_supply/pps_power_limited
+Date: May 2022
+Contact: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Description:
+ The PPS Power Limited bit indicates whether or not the source
+ supply will exceed the rated output power if requested.
diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc
index 6c5dcad21e19..a7ea169dc4eb 100644
--- a/Documentation/ABI/testing/sysfs-class-uwb_rc
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc
@@ -18,14 +18,14 @@ Description:
and it will be removed. The default is 3 superframes
(~197 ms) as required by the specification.
-What: /sys/class/uwb_rc/uwbN/
+What: /sys/class/uwb_rc/uwb<N>/
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
Description:
An individual UWB radio controller.
-What: /sys/class/uwb_rc/uwbN/beacon
+What: /sys/class/uwb_rc/uwb<N>/beacon
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -43,7 +43,7 @@ Description:
Reading returns the currently active channel, or -1 if
the radio controller is not beaconing.
-What: /sys/class/uwb_rc/uwbN/ASIE
+What: /sys/class/uwb_rc/uwb<N>/ASIE
Date: August 2014
KernelVersion: 3.18
Contact: linux-usb@vger.kernel.org
@@ -56,7 +56,7 @@ Description:
Reading returns the current ASIE. Writing replaces
the current ASIE with the one written.
-What: /sys/class/uwb_rc/uwbN/scan
+What: /sys/class/uwb_rc/uwb<N>/scan
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -75,7 +75,7 @@ Description:
4 scan (with start time of <bpst offset>)
== =======================================
-What: /sys/class/uwb_rc/uwbN/mac_address
+What: /sys/class/uwb_rc/uwb<N>/mac_address
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -85,7 +85,7 @@ Description:
controller's EUI-48 but only do so while the device is
not beaconing or scanning.
-What: /sys/class/uwb_rc/uwbN/wusbhc
+What: /sys/class/uwb_rc/uwb<N>/wusbhc
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -93,7 +93,7 @@ Description:
A symlink to the device (if any) of the WUSB Host
Controller PAL using this radio controller.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -102,7 +102,7 @@ Description:
as part of a scan or is a member of the radio
controller's beacon group.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/BPST
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/BPST
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -111,7 +111,7 @@ Description:
interval superframe timer) of the last beacon from
this device was received.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/DevAddr
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/DevAddr
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -119,7 +119,7 @@ Description:
The current DevAddr of this device in colon separated
hex octets.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/EUI_48
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/EUI_48
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -128,7 +128,7 @@ Description:
The EUI-48 of this device in colon separated hex
octets.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/IEs
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/IEs
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -136,7 +136,7 @@ Description:
The latest IEs included in this device's beacon, in
space separated hex octets with one IE per line.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/LQE
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/LQE
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
@@ -146,7 +146,7 @@ Description:
This gives an estimate on a suitable PHY rate. Refer
to [ECMA-368] section 13.3 for more details.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/RSSI
+What: /sys/class/uwb_rc/uwb<N>/<EUI-48>/RSSI
Date: July 2008
KernelVersion: 2.6.27
Contact: linux-usb@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc
index 5977e2875325..55eb55cac92e 100644
--- a/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc-wusbhc
@@ -1,4 +1,4 @@
-What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_chid
+What: /sys/class/uwb_rc/uwb<N>/wusbhc/wusb_chid
Date: July 2008
KernelVersion: 2.6.27
Contact: David Vrabel <david.vrabel@csr.com>
@@ -9,7 +9,7 @@ Description:
Set an all zero CHID to stop the host controller.
-What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_trust_timeout
+What: /sys/class/uwb_rc/uwb<N>/wusbhc/wusb_trust_timeout
Date: July 2008
KernelVersion: 2.6.27
Contact: David Vrabel <david.vrabel@csr.com>
@@ -24,7 +24,7 @@ Description:
lifetime of PTKs and GTKs) it should not be changed
from the default.
-What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_phy_rate
+What: /sys/class/uwb_rc/uwb<N>/wusbhc/wusb_phy_rate
Date: August 2009
KernelVersion: 2.6.32
Contact: David Vrabel <david.vrabel@csr.com>
@@ -37,7 +37,7 @@ Description:
Refer to [ECMA-368] section 10.3.1.1 for the value to
use.
-What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_dnts
+What: /sys/class/uwb_rc/uwb<N>/wusbhc/wusb_dnts
Date: June 2013
KernelVersion: 3.11
Contact: Thomas Pugliese <thomas.pugliese@gmail.com>
@@ -47,7 +47,7 @@ Description:
often the devices will have the opportunity to send
notifications to the host.
-What: /sys/class/uwb_rc/uwbN/wusbhc/wusb_retry_count
+What: /sys/class/uwb_rc/uwb<N>/wusbhc/wusb_retry_count
Date: June 2013
KernelVersion: 3.11
Contact: Thomas Pugliese <thomas.pugliese@gmail.com>
diff --git a/Documentation/ABI/testing/sysfs-class-vduse b/Documentation/ABI/testing/sysfs-class-vduse
new file mode 100644
index 000000000000..2f2bc5c8fc48
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-vduse
@@ -0,0 +1,33 @@
+What: /sys/class/vduse/
+Date: Oct 2021
+KernelVersion: 5.15
+Contact: Yongji Xie <xieyongji@bytedance.com>
+Description:
+ The vduse/ class sub-directory belongs to the VDUSE
+ framework and provides a sysfs interface for configuring
+ VDUSE devices.
+
+What: /sys/class/vduse/control/
+Date: Oct 2021
+KernelVersion: 5.15
+Contact: Yongji Xie <xieyongji@bytedance.com>
+Description:
+ This directory entry is created for the control device
+ of the VDUSE framework.
+
+What: /sys/class/vduse/<device-name>/
+Date: Oct 2021
+KernelVersion: 5.15
+Contact: Yongji Xie <xieyongji@bytedance.com>
+Description:
+ This directory entry is created when a VDUSE device is
+ created via the control device.
+
+What: /sys/class/vduse/<device-name>/msg_timeout
+Date: Oct 2021
+KernelVersion: 5.15
+Contact: Yongji Xie <xieyongji@bytedance.com>
+Description:
+ (RW) The timeout (in seconds) for waiting for the control
+ message's response from userspace. The default value is
+ 30 seconds. Writing '0' to the file disables the timeout.
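+
+ A minimal sketch (device name illustrative)::
+
+   # cat /sys/class/vduse/vduse-blk0/msg_timeout
+   30
+   # echo 0 > /sys/class/vduse/vduse-blk0/msg_timeout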
diff --git a/Documentation/ABI/testing/sysfs-class-watchdog b/Documentation/ABI/testing/sysfs-class-watchdog
index 585caecda3a5..94fb74615951 100644
--- a/Documentation/ABI/testing/sysfs-class-watchdog
+++ b/Documentation/ABI/testing/sysfs-class-watchdog
@@ -6,6 +6,19 @@ Description:
device at boot. It is equivalent to WDIOC_GETBOOTSTATUS of
ioctl interface.
+What: /sys/class/watchdog/watchdogn/options
+Date: April 2023
+Contact: Thomas Weißschuh
+Description:
+ It is a read-only file. It contains the options of the watchdog device.
+
+What: /sys/class/watchdog/watchdogn/fw_version
+Date: April 2023
+Contact: Thomas Weißschuh
+Description:
+ It is a read-only file. It contains the firmware version of
+ the watchdog device.
+
What: /sys/class/watchdog/watchdogn/identity
Date: August 2015
Contact: Wim Van Sebroeck <wim@iguana.be>
diff --git a/Documentation/ABI/testing/sysfs-devices-hisi_ptt b/Documentation/ABI/testing/sysfs-devices-hisi_ptt
new file mode 100644
index 000000000000..82de6d710266
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-hisi_ptt
@@ -0,0 +1,61 @@
+What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune
+Date: October 2022
+KernelVersion: 6.1
+Contact: Yicong Yang <yangyicong@hisilicon.com>
+Description: This directory contains files for tuning the PCIe link
+ parameters (events). Each file is named after the PCIe
+ link event it tunes.
+
+ See Documentation/trace/hisi-ptt.rst for more information.
+
+What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_cpl
+Date: October 2022
+KernelVersion: 6.1
+Contact: Yicong Yang <yangyicong@hisilicon.com>
+Description: (RW) Controls the weight of Tx completion TLPs, which influences
+ the proportion of outbound completion TLPs on the PCIe link.
+ The available tune data is [0, 1, 2]. Writing a negative value
+ will return an error, and out-of-range values will be converted
+ to 2. The value indicates a probable level of the event.
+
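+ A usage sketch (device path illustrative)::
+
+   # cat /sys/devices/hisi_ptt0_1/tune/qos_tx_cpl
+   1
+   # echo 2 > /sys/devices/hisi_ptt0_1/tune/qos_tx_cpl
+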
+What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_np
+Date: October 2022
+KernelVersion: 6.1
+Contact: Yicong Yang <yangyicong@hisilicon.com>
+Description: (RW) Controls the weight of Tx non-posted TLPs, which influences
+ the proportion of outbound non-posted TLPs on the PCIe link.
+ The available tune data is [0, 1, 2]. Writing a negative value
+ will return an error, and out-of-range values will be converted
+ to 2. The value indicates a probable level of the event.
+
+What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_p
+Date: October 2022
+KernelVersion: 6.1
+Contact: Yicong Yang <yangyicong@hisilicon.com>
+Description: (RW) Controls the weight of Tx posted TLPs, which influences the
+ proportion of outbound posted TLPs on the PCIe link.
+ The available tune data is [0, 1, 2]. Writing a negative value
+ will return an error, and out-of-range values will be converted
+ to 2. The value indicates a probable level of the event.
+
+What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/rx_alloc_buf_level
+Date: October 2022
+KernelVersion: 6.1
+Contact: Yicong Yang <yangyicong@hisilicon.com>
+Description: (RW) Controls the allocated buffer watermark for inbound packets.
+ The packets will be stored in the buffer first and then transmitted
+ either when the watermark is reached or when a timeout occurs.
+ The available tune data is [0, 1, 2]. Writing a negative value
+ will return an error, and out-of-range values will be converted
+ to 2. The value indicates a probable level of the event.
+
+What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/tx_alloc_buf_level
+Date: October 2022
+KernelVersion: 6.1
+Contact: Yicong Yang <yangyicong@hisilicon.com>
+Description: (RW) Controls the allocated buffer watermark for outbound packets.
+ The packets will be stored in the buffer first and then transmitted
+ either when the watermark is reached or when a timeout occurs.
+ The available tune data is [0, 1, 2]. Writing a negative value
+ will return an error, and out-of-range values will be converted
+ to 2. The value indicates a probable level of the event.
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping
index 8d202bac9394..2eee1446ad4c 100644
--- a/Documentation/ABI/testing/sysfs-devices-mapping
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -1,6 +1,6 @@
What: /sys/devices/uncore_iio_x/dieX
Date: February 2020
-Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com>
+Contact: Alexander Antonov <alexander.antonov@linux.intel.com>
Description:
Each IIO stack (PCIe root port) has its own IIO PMON block, so
each dieX file (where X is die number) holds "Segment:Root Bus"
@@ -32,3 +32,31 @@ Description:
IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
+
+What: /sys/devices/uncore_upi_x/dieX
+Date: March 2022
+Contact: Alexander Antonov <alexander.antonov@linux.intel.com>
+Description:
+ Each /sys/devices/uncore_upi_X/dieY file holds a "upi_Z,die_W"
+ value, which means that UPI link number X on die Y is connected
+ to UPI link Z on die W, and that this link between sockets can
+ be monitored by the UPI PMON block.
+ For example, 4-die Sapphire Rapids platform has the following
+ UPI 0 topology::
+
+ # tail /sys/devices/uncore_upi_0/die*
+ ==> /sys/devices/uncore_upi_0/die0 <==
+ upi_1,die_1
+ ==> /sys/devices/uncore_upi_0/die1 <==
+ upi_0,die_3
+ ==> /sys/devices/uncore_upi_0/die2 <==
+ upi_1,die_3
+ ==> /sys/devices/uncore_upi_0/die3 <==
+ upi_0,die_1
+
+ Which means::
+
+ UPI link 0 on die 0 is connected to UPI link 1 on die 1
+ UPI link 0 on die 1 is connected to UPI link 0 on die 3
+ UPI link 0 on die 2 is connected to UPI link 1 on die 3
+ UPI link 0 on die 3 is connected to UPI link 0 on die 1 \ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-devices-physical_location b/Documentation/ABI/testing/sysfs-devices-physical_location
new file mode 100644
index 000000000000..202324b87083
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-physical_location
@@ -0,0 +1,42 @@
+What: /sys/devices/.../physical_location
+Date: March 2022
+Contact: Won Chung <wonchung@google.com>
+Description:
+ This directory contains information on physical location of
+ the device connection point with respect to the system's
+ housing.
+
+What: /sys/devices/.../physical_location/panel
+Date: March 2022
+Contact: Won Chung <wonchung@google.com>
+Description:
+ Describes which panel surface of the system's housing the
+ device connection point resides on.
+
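+ These attributes are read like any other sysfs file; the
+ output below is illustrative::
+
+   # cat /sys/devices/.../physical_location/panel
+   left
+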
+What: /sys/devices/.../physical_location/vertical_position
+Date: March 2022
+Contact: Won Chung <wonchung@google.com>
+Description:
+ Describes vertical position of the device connection point on
+ the panel surface.
+
+What: /sys/devices/.../physical_location/horizontal_position
+Date: March 2022
+Contact: Won Chung <wonchung@google.com>
+Description:
+ Describes horizontal position of the device connection point on
+ the panel surface.
+
+What: /sys/devices/.../physical_location/dock
+Date: March 2022
+Contact: Won Chung <wonchung@google.com>
+Description:
+ "Yes" if the device connection point resides in a docking
+ station or a port replicator. "No" otherwise.
+
+What: /sys/devices/.../physical_location/lid
+Date: March 2022
+Contact: Won Chung <wonchung@google.com>
+Description:
+ "Yes" if the device connection point resides on the lid of
+ laptop system. "No" otherwise.
diff --git a/Documentation/ABI/testing/sysfs-devices-platform-ACPI-TAD b/Documentation/ABI/testing/sysfs-devices-platform-ACPI-TAD
index f7b360a61b21..bc44bc903bc8 100644
--- a/Documentation/ABI/testing/sysfs-devices-platform-ACPI-TAD
+++ b/Documentation/ABI/testing/sysfs-devices-platform-ACPI-TAD
@@ -74,7 +74,7 @@ Description:
Reads also cause the AC alarm timer status to be reset.
- Another way to reset the the status of the AC alarm timer is to
+ Another way to reset the status of the AC alarm timer is to
write (the number) 0 to this file.
If the status return value indicates that the timer has expired,
diff --git a/Documentation/ABI/testing/sysfs-devices-platform-dock b/Documentation/ABI/testing/sysfs-devices-platform-dock
index 1d8c18f905c7..411c174de830 100644
--- a/Documentation/ABI/testing/sysfs-devices-platform-dock
+++ b/Documentation/ABI/testing/sysfs-devices-platform-dock
@@ -1,4 +1,4 @@
-What: /sys/devices/platform/dock.N/docked
+What: /sys/devices/platform/dock.<N>/docked
Date: Dec, 2006
KernelVersion: 2.6.19
Contact: linux-acpi@vger.kernel.org
@@ -6,7 +6,7 @@ Description:
(RO) Value 1 or 0 indicates whether the software believes the
laptop is docked in a docking station.
-What: /sys/devices/platform/dock.N/undock
+What: /sys/devices/platform/dock.<N>/undock
Date: Dec, 2006
KernelVersion: 2.6.19
Contact: linux-acpi@vger.kernel.org
@@ -14,14 +14,14 @@ Description:
(WO) Writing to this file causes the software to initiate an
undock request to the firmware.
-What: /sys/devices/platform/dock.N/uid
+What: /sys/devices/platform/dock.<N>/uid
Date: Feb, 2007
KernelVersion: v2.6.21
Contact: linux-acpi@vger.kernel.org
Description:
(RO) Displays the docking station the laptop is docked to.
-What: /sys/devices/platform/dock.N/flags
+What: /sys/devices/platform/dock.<N>/flags
Date: May, 2007
KernelVersion: v2.6.21
Contact: linux-acpi@vger.kernel.org
@@ -30,7 +30,7 @@ Description:
request has been made by the user (from the immediate_undock
option).
-What: /sys/devices/platform/dock.N/type
+What: /sys/devices/platform/dock.<N>/type
Date: Aug, 2008
KernelVersion: v2.6.27
Contact: linux-acpi@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-devices-platform-soc-ipa b/Documentation/ABI/testing/sysfs-devices-platform-soc-ipa
index c56dcf15bf29..364b1ba41242 100644
--- a/Documentation/ABI/testing/sysfs-devices-platform-soc-ipa
+++ b/Documentation/ABI/testing/sysfs-devices-platform-soc-ipa
@@ -46,33 +46,69 @@ Description:
that is supported by the hardware. The possible values
are "MAPv4" or "MAPv5".
+What: .../XXXXXXX.ipa/endpoint_id/
+Date: July 2022
+KernelVersion: v5.19
+Contact: Alex Elder <elder@kernel.org>
+Description:
+ The .../XXXXXXX.ipa/endpoint_id/ directory contains
+ attributes that define IDs associated with IPA
+ endpoints. The "rx" or "tx" in an endpoint name is
+ from the perspective of the AP. An endpoint ID is a
+ small unsigned integer.
+
+What: .../XXXXXXX.ipa/endpoint_id/modem_rx
+Date: July 2022
+KernelVersion: v5.19
+Contact: Alex Elder <elder@kernel.org>
+Description:
+ The .../XXXXXXX.ipa/endpoint_id/modem_rx file contains
+ the ID of the AP endpoint on which packets originating
+ from the embedded modem are received.
+
+What: .../XXXXXXX.ipa/endpoint_id/modem_tx
+Date: July 2022
+KernelVersion: v5.19
+Contact: Alex Elder <elder@kernel.org>
+Description:
+ The .../XXXXXXX.ipa/endpoint_id/modem_tx file contains
+ the ID of the AP endpoint on which packets destined
+ for the embedded modem are sent.
+
+What: .../XXXXXXX.ipa/endpoint_id/monitor_rx
+Date: July 2022
+KernelVersion: v5.19
+Contact: Alex Elder <elder@kernel.org>
+Description:
+ The .../XXXXXXX.ipa/endpoint_id/monitor_rx file contains
+ the ID of the AP endpoint on which IPA "monitor" data is
+ received. The monitor endpoint supplies replicas of
+ packets that enter the IPA hardware for processing.
+ Each replicated packet is preceded by a fixed-size "ODL"
+ header (see .../XXXXXXX.ipa/feature/monitor, above).
+ Large packets are truncated, to reduce the bandwidth
+ required to provide the monitor function.
+
What: .../XXXXXXX.ipa/modem/
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
- The .../XXXXXXX.ipa/modem/ directory contains a set of
- attributes describing properties of the modem execution
- environment reachable by the IPA hardware.
+ The .../XXXXXXX.ipa/modem/ directory contains attributes
+ describing properties of the modem embedded in the SoC.
What: .../XXXXXXX.ipa/modem/rx_endpoint_id
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
- The .../XXXXXXX.ipa/feature/rx_endpoint_id file contains
- the AP endpoint ID that receives packets originating from
- the modem execution environment. The "rx" is from the
- perspective of the AP; this endpoint is considered an "IPA
- producer". An endpoint ID is a small unsigned integer.
+ The .../XXXXXXX.ipa/modem/rx_endpoint_id file duplicates
+ the value found in .../XXXXXXX.ipa/endpoint_id/modem_rx.
What: .../XXXXXXX.ipa/modem/tx_endpoint_id
Date: June 2021
KernelVersion: v5.14
Contact: Alex Elder <elder@kernel.org>
Description:
- The .../XXXXXXX.ipa/feature/tx_endpoint_id file contains
- the AP endpoint ID used to transmit packets destined for
- the modem execution environment. The "tx" is from the
- perspective of the AP; this endpoint is considered an "IPA
- consumer". An endpoint ID is a small unsigned integer.
+ The .../XXXXXXX.ipa/modem/tx_endpoint_id file duplicates
+ the value found in .../XXXXXXX.ipa/endpoint_id/modem_tx.
diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power
index 1763e64dd152..54195530e97a 100644
--- a/Documentation/ABI/testing/sysfs-devices-power
+++ b/Documentation/ABI/testing/sysfs-devices-power
@@ -269,3 +269,39 @@ Description:
the current runtime PM status of the device, which may be
"suspended", "suspending", "resuming", "active", "error" (fatal
error), or "unsupported" (runtime PM is disabled).
+
+What: /sys/devices/.../power/runtime_active_time
+Date: Jul 2010
+Contact: Arjan van de Ven <arjan@linux.intel.com>
+Description:
+ Reports the total time that the device has been active.
+ Used for runtime PM statistics.
+
+What: /sys/devices/.../power/runtime_suspended_time
+Date: Jul 2010
+Contact: Arjan van de Ven <arjan@linux.intel.com>
+Description:
+ Reports the total time that the device has been suspended.
+ Used for runtime PM statistics.
+
+What: /sys/devices/.../power/runtime_usage
+Date: Apr 2010
+Contact: Dominik Brodowski <linux@dominikbrodowski.net>
+Description:
+ Reports the runtime PM usage count of a device.
+
+What: /sys/devices/.../power/runtime_enabled
+Date: Apr 2010
+Contact: Dominik Brodowski <linux@dominikbrodowski.net>
+Description:
+ Is runtime PM enabled for this device?
+ States are "enabled", "disabled", "forbidden" or a
+ combination of the latter two.
+
+What: /sys/devices/.../power/runtime_active_kids
+Date: Apr 2010
+Contact: Dominik Brodowski <linux@dominikbrodowski.net>
+Description:
+ Reports the runtime PM children usage count of a device, or
+ 0 if the children will be ignored.
+
diff --git a/Documentation/ABI/testing/sysfs-devices-removable b/Documentation/ABI/testing/sysfs-devices-removable
index bda6c320c8d3..754ecb4587ca 100644
--- a/Documentation/ABI/testing/sysfs-devices-removable
+++ b/Documentation/ABI/testing/sysfs-devices-removable
@@ -7,10 +7,12 @@ Description:
bus / platform-specific way. This attribute is only present for
devices that can support determining such information:
- "removable": device can be removed from the platform by the user
- "fixed": device is fixed to the platform / cannot be removed
+ =========== ===================================================
+ "removable" device can be removed from the platform by the user
+ "fixed" device is fixed to the platform / cannot be removed
by the user.
- "unknown": The information is unavailable / cannot be deduced.
+ "unknown" The information is unavailable / cannot be deduced.
+ =========== ===================================================
Currently this is only supported by USB (which infers the
information from a combination of hub descriptor bits and
diff --git a/Documentation/ABI/testing/sysfs-devices-soc b/Documentation/ABI/testing/sysfs-devices-soc
index ea999e292f11..5269808ec35f 100644
--- a/Documentation/ABI/testing/sysfs-devices-soc
+++ b/Documentation/ABI/testing/sysfs-devices-soc
@@ -1,6 +1,6 @@
What: /sys/devices/socX
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
The /sys/devices/ directory contains a sub-directory for each
System-on-Chip (SoC) device on a running platform. Information
@@ -14,14 +14,14 @@ Description:
What: /sys/devices/socX/machine
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
Read-only attribute common to all SoCs. Contains the SoC machine
name (e.g. Ux500).
What: /sys/devices/socX/family
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
Read-only attribute common to all SoCs. Contains SoC family name
(e.g. DB8500).
@@ -59,7 +59,7 @@ Description:
What: /sys/devices/socX/soc_id
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
Read-only attribute supported by most SoCs. In the case of
ST-Ericsson's chips this contains the SoC serial number.
@@ -72,21 +72,21 @@ Description:
What: /sys/devices/socX/revision
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
Read-only attribute supported by most SoCs. Contains the SoC's
manufacturing revision number.
What: /sys/devices/socX/process
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
Read-only attribute supported by ST-Ericsson's silicon. Contains
the process by which the silicon chip was manufactured.
What: /sys/bus/soc
Date: January 2012
-contact: Lee Jones <lee.jones@linaro.org>
+contact: Lee Jones <lee@kernel.org>
Description:
The /sys/bus/soc/ directory contains the usual sub-folders
expected under most buses. /sys/bus/soc/devices is of particular
diff --git a/Documentation/ABI/testing/sysfs-devices-state_synced b/Documentation/ABI/testing/sysfs-devices-state_synced
index 0c922d7d02fc..c64636ddac41 100644
--- a/Documentation/ABI/testing/sysfs-devices-state_synced
+++ b/Documentation/ABI/testing/sysfs-devices-state_synced
@@ -21,4 +21,9 @@ Description:
at the time the kernel starts are not affected or limited in
any way by sync_state() callbacks.
+ Writing "1" to this file will force a call to the device's
+ sync_state() function if it hasn't been called already. The
+ sync_state() call happens independently of the state of the
+ consumer devices.
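+
+ As a sketch, forcing the sync_state() call for a device
+ (device path illustrative)::
+
+   # echo 1 > /sys/devices/.../state_synced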
+
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index b46ef147616a..f54867cadb0f 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -7,7 +7,7 @@ Description:
Individual CPU attributes are contained in subdirectories
named by the kernel's logical CPU number, e.g.:
- /sys/devices/system/cpu/cpu#/
+ /sys/devices/system/cpu/cpuX/
What: /sys/devices/system/cpu/kernel_max
/sys/devices/system/cpu/offline
@@ -53,7 +53,7 @@ Description: Dynamic addition and removal of CPU's. This is not hotplug
the system. Information written to the file to remove CPUs
is architecture specific.
-What: /sys/devices/system/cpu/cpu#/node
+What: /sys/devices/system/cpu/cpuX/node
Date: October 2009
Contact: Linux memory management mailing list <linux-mm@kvack.org>
Description: Discover NUMA node a CPU belongs to
@@ -67,41 +67,42 @@ Description: Discover NUMA node a CPU belongs to
/sys/devices/system/cpu/cpu42/node2 -> ../../node/node2
-What: /sys/devices/system/cpu/cpu#/topology/core_id
- /sys/devices/system/cpu/cpu#/topology/core_siblings
- /sys/devices/system/cpu/cpu#/topology/core_siblings_list
- /sys/devices/system/cpu/cpu#/topology/physical_package_id
- /sys/devices/system/cpu/cpu#/topology/thread_siblings
- /sys/devices/system/cpu/cpu#/topology/thread_siblings_list
+What: /sys/devices/system/cpu/cpuX/topology/core_siblings
+ /sys/devices/system/cpu/cpuX/topology/core_siblings_list
+ /sys/devices/system/cpu/cpuX/topology/physical_package_id
+ /sys/devices/system/cpu/cpuX/topology/thread_siblings
+ /sys/devices/system/cpu/cpuX/topology/thread_siblings_list
+ /sys/devices/system/cpu/cpuX/topology/ppin
Date: December 2008
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: CPU topology files that describe a logical CPU's relationship
to other cores and threads in the same physical package.
- One cpu# directory is created per logical CPU in the system,
+ One cpuX directory is created per logical CPU in the system,
e.g. /sys/devices/system/cpu/cpu42/.
Briefly, the files above are:
- core_id: the CPU core ID of cpu#. Typically it is the
- hardware platform's identifier (rather than the kernel's).
- The actual value is architecture and platform dependent.
-
- core_siblings: internal kernel map of cpu#'s hardware threads
+ core_siblings: internal kernel map of cpuX's hardware threads
within the same physical_package_id.
core_siblings_list: human-readable list of the logical CPU
- numbers within the same physical_package_id as cpu#.
+ numbers within the same physical_package_id as cpuX.
- physical_package_id: physical package id of cpu#. Typically
+ physical_package_id: physical package id of cpuX. Typically
corresponds to a physical socket number, but the actual value
is architecture and platform dependent.
- thread_siblings: internal kernel map of cpu#'s hardware
- threads within the same core as cpu#
+ thread_siblings: internal kernel map of cpuX's hardware
+ threads within the same core as cpuX
+
+ thread_siblings_list: human-readable list of cpuX's hardware
+ threads within the same core as cpuX
- thread_siblings_list: human-readable list of cpu#'s hardware
- threads within the same core as cpu#
+ ppin: human-readable Protected Processor Identification
+ Number of the socket the cpuX belongs to. There should be
+ one per physical_package_id. The file is readable only by
+ admin.
See Documentation/admin-guide/cputopology.rst for more information.
@@ -135,7 +136,7 @@ Description: Discover cpuidle policy and mechanism
Documentation/driver-api/pm/cpuidle.rst for more information.
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/name
/sys/devices/system/cpu/cpuX/cpuidle/stateN/latency
/sys/devices/system/cpu/cpuX/cpuidle/stateN/power
/sys/devices/system/cpu/cpuX/cpuidle/stateN/time
@@ -174,7 +175,7 @@ Description:
(a count).
======== ==== =================================================
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/desc
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/desc
Date: February 2008
KernelVersion: v2.6.25
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -182,7 +183,7 @@ Description:
(RO) A small description about the idle state (string).
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/disable
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/disable
Date: March 2012
KernelVersion: v3.10
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -195,14 +196,14 @@ Description:
does not reflect it. Likewise, if one enables a deep state but a
lighter state still is disabled, then this has no effect.
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/default_status
Date: December 2019
KernelVersion: v5.6
Contact: Linux power management list <linux-pm@vger.kernel.org>
Description:
(RO) The default status of this state, "enabled" or "disabled".
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/residency
Date: March 2014
KernelVersion: v3.15
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -211,7 +212,7 @@ Description:
time (in microseconds) this cpu should spend in this idle state
to make the transition worth the effort.
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/s2idle/
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/s2idle/
Date: March 2018
KernelVersion: v4.17
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -221,7 +222,7 @@ Description:
This attribute group is only present for states that can be
used in suspend-to-idle with suspended timekeeping.
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/s2idle/time
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/s2idle/time
Date: March 2018
KernelVersion: v4.17
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -229,7 +230,7 @@ Description:
Total time spent by the CPU in suspend-to-idle (with scheduler
tick suspended) after requesting this state.
-What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/s2idle/usage
+What: /sys/devices/system/cpu/cpuX/cpuidle/state<N>/s2idle/usage
Date: March 2018
KernelVersion: v4.17
Contact: Linux power management list <linux-pm@vger.kernel.org>
@@ -237,7 +238,7 @@ Description:
Total number of times this state has been requested by the CPU
while entering suspend-to-idle.
-What: /sys/devices/system/cpu/cpu#/cpufreq/*
+What: /sys/devices/system/cpu/cpuX/cpufreq/*
Date: pre-git history
Contact: linux-pm@vger.kernel.org
Description: Discover and change clock speed of CPUs
@@ -252,7 +253,7 @@ Description: Discover and change clock speed of CPUs
See files in Documentation/cpu-freq/ for more information.
-What: /sys/devices/system/cpu/cpu#/cpufreq/freqdomain_cpus
+What: /sys/devices/system/cpu/cpuX/cpufreq/freqdomain_cpus
Date: June 2013
Contact: linux-pm@vger.kernel.org
Description: Discover CPUs in the same CPU frequency coordination domain
@@ -295,22 +296,22 @@ Description: Processor frequency boosting control
This switch controls the boost setting for the whole system.
Boosting allows the CPU and the firmware to run at a frequency
- beyond it's nominal limit.
+ beyond its nominal limit.
More details can be found in
Documentation/admin-guide/pm/cpufreq.rst
-What: /sys/devices/system/cpu/cpu#/crash_notes
- /sys/devices/system/cpu/cpu#/crash_notes_size
+What: /sys/devices/system/cpu/cpuX/crash_notes
+ /sys/devices/system/cpu/cpuX/crash_notes_size
Date: April 2013
Contact: kexec@lists.infradead.org
Description: address and size of the percpu note.
crash_notes: the physical address of the memory that holds the
- note of cpu#.
+ note of cpuX.
- crash_notes_size: size of the note of cpu#.
+ crash_notes_size: size of the note of cpuX.
What: /sys/devices/system/cpu/intel_pstate/max_perf_pct
@@ -487,12 +488,13 @@ What: /sys/devices/system/cpu/cpuX/regs/
/sys/devices/system/cpu/cpuX/regs/identification/
/sys/devices/system/cpu/cpuX/regs/identification/midr_el1
/sys/devices/system/cpu/cpuX/regs/identification/revidr_el1
+ /sys/devices/system/cpu/cpuX/regs/identification/smidr_el1
Date: June 2016
Contact: Linux ARM Kernel Mailing list <linux-arm-kernel@lists.infradead.org>
Description: AArch64 CPU registers
'identification' directory exposes the CPU ID registers for
- identifying model and revision of the CPU.
+ identifying model and revision of the CPU and SMCU.
What: /sys/devices/system/cpu/aarch32_el0
Date: May 2021
@@ -503,12 +505,12 @@ Description: Identifies the subset of CPUs in the system that can execute
If absent, then all or none of the CPUs can execute AArch32
applications and execve() will behave accordingly.
-What: /sys/devices/system/cpu/cpu#/cpu_capacity
+What: /sys/devices/system/cpu/cpuX/cpu_capacity
Date: December 2016
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: information about CPUs heterogeneity.
- cpu_capacity: capacity of cpu#.
+ cpu_capacity: capacity of cpuX.
What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/meltdown
@@ -520,6 +522,8 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/srbds
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
+ /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+ /sys/devices/system/cpu/vulnerabilities/retbleed
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@@ -560,7 +564,7 @@ Description: Control Symmetric Multi Threading (SMT)
If control status is "forceoff" or "notsupported" writes
are rejected.
-What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias
+What: /sys/devices/system/cpu/cpuX/power/energy_perf_bias
Date: March 2019
Contact: linux-pm@vger.kernel.org
Description: Intel Energy and Performance Bias Hint (EPB)
@@ -662,7 +666,23 @@ Description: Preferred MTE tag checking mode
================ ==============================================
"sync" Prefer synchronous mode
+ "asymm" Prefer asymmetric mode
"async" Prefer asynchronous mode
================ ==============================================
See also: Documentation/arm64/memory-tagging-extension.rst
+
+What: /sys/devices/system/cpu/nohz_full
+Date: Apr 2015
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:
+ (RO) the list of CPUs that are in nohz_full mode.
+ These CPUs are set by boot parameter "nohz_full=".
+
+What: /sys/devices/system/cpu/isolated
+Date: Apr 2015
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:
+ (RO) the list of CPUs that are isolated and don't
+ participate in load balancing. These CPUs are set by
+ boot parameter "isolcpus=".
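+
+ Both attributes are plain cpulist reads; the output below is
+ illustrative::
+
+   # cat /sys/devices/system/cpu/isolated
+   2-7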
diff --git a/Documentation/ABI/testing/sysfs-devices-vfio-dev b/Documentation/ABI/testing/sysfs-devices-vfio-dev
new file mode 100644
index 000000000000..e21424fd9666
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-vfio-dev
@@ -0,0 +1,8 @@
+What: /sys/.../<device>/vfio-dev/vfioX/
+Date: September 2022
+Contact: Yi Liu <yi.l.liu@intel.com>
+Description:
+ This directory is created when the device is bound to a
+ vfio driver. The layout under this directory matches what
+ exists for a standard 'struct device'. 'X' is a unique
+ index marking this device in vfio.
diff --git a/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing b/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing
new file mode 100644
index 000000000000..910df0e5815a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-aspeed-uart-routing
@@ -0,0 +1,27 @@
+What: /sys/bus/platform/drivers/aspeed-uart-routing/\*/uart\*
+Date: September 2021
+Contact: Oskar Senft <osk@google.com>
+ Chia-Wei Wang <chiawei_wang@aspeedtech.com>
+Description: Selects the RX source of the UARTx device.
+
+ When read, each file shows the list of available options with the
+ currently selected option marked by brackets "[]". The list of
+ available options depends on the selected file.
+
+ e.g.
+ cat /sys/bus/platform/drivers/aspeed-uart-routing/\*.uart_routing/uart1
+ [io1] io2 io3 io4 uart2 uart3 uart4 io6
+
+ In this case, UART1 gets its input from IO1 (physical serial port 1).
+
+Users: OpenBMC. Proposed changes should be mailed to
+ openbmc@lists.ozlabs.org
+
+What: /sys/bus/platform/drivers/aspeed-uart-routing/\*/io\*
+Date: September 2021
+Contact: Oskar Senft <osk@google.com>
+ Chia-Wei Wang <chiawei_wang@aspeedtech.com>
+Description: Selects the RX source of the IOx serial port. The current selection
+ will be marked by brackets "[]".
+Users: OpenBMC. Proposed changes should be mailed to
+ openbmc@lists.ozlabs.org
diff --git a/Documentation/ABI/testing/sysfs-driver-bd9571mwv-regulator b/Documentation/ABI/testing/sysfs-driver-bd9571mwv-regulator
index 42214b4ff14a..90596d8bb51c 100644
--- a/Documentation/ABI/testing/sysfs-driver-bd9571mwv-regulator
+++ b/Documentation/ABI/testing/sysfs-driver-bd9571mwv-regulator
@@ -26,6 +26,6 @@ Description: Read/write the current state of DDR Backup Mode, which controls
DDR Backup Mode must be explicitly enabled by the user,
to invoke step 1.
- See also Documentation/devicetree/bindings/mfd/bd9571mwv.txt.
+ See also Documentation/devicetree/bindings/mfd/rohm,bd9571mwv.yaml.
Users: User space applications for embedded boards equipped with a
BD9571MWV PMIC.
diff --git a/Documentation/ABI/testing/sysfs-driver-ccp b/Documentation/ABI/testing/sysfs-driver-ccp
new file mode 100644
index 000000000000..7aded9b75553
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-ccp
@@ -0,0 +1,87 @@
+What: /sys/bus/pci/devices/<BDF>/fused_part
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/fused_part file reports
+ whether the CPU or APU has been fused to prevent tampering.
+ Possible values:
+ 0: Not fused
+ 1: Fused
+
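+ These PSP security attributes are simple reads; for example
+ (BDF illustrative)::
+
+   # cat /sys/bus/pci/devices/0000:04:00.2/fused_part
+   1
+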
+What: /sys/bus/pci/devices/<BDF>/debug_lock_on
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/debug_lock_on reports
+ whether the AMD CPU or APU has been unlocked for debugging.
+ Possible values:
+ 0: Not locked
+ 1: Locked
+
+What: /sys/bus/pci/devices/<BDF>/tsme_status
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/tsme_status file reports
+ the status of transparent secure memory encryption on AMD systems.
+ Possible values:
+ 0: Not active
+ 1: Active
+
+What: /sys/bus/pci/devices/<BDF>/anti_rollback_status
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/anti_rollback_status file reports
+ whether the PSP is enforcing rollback protection.
+ Possible values:
+ 0: Not enforcing
+ 1: Enforcing
+
+What: /sys/bus/pci/devices/<BDF>/rpmc_production_enabled
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/rpmc_production_enabled file reports
+ whether Replay Protected Monotonic Counter support has been enabled.
+ Possible values:
+ 0: Not enabled
+ 1: Enabled
+
+What: /sys/bus/pci/devices/<BDF>/rpmc_spirom_available
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/rpmc_spirom_available file reports
+ whether a Replay Protected Monotonic Counter capable SPI is installed
+ on the system.
+ Possible values:
+ 0: Not present
+ 1: Present
+
+What: /sys/bus/pci/devices/<BDF>/hsp_tpm_available
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/hsp_tpm_available file reports
+ whether the HSP TPM has been activated.
+ Possible values:
+ 0: Not activated or not present
+ 1: Activated
+
+What: /sys/bus/pci/devices/<BDF>/rom_armor_enforced
+Date: June 2022
+KernelVersion: 5.19
+Contact: mario.limonciello@amd.com
+Description:
+ The /sys/bus/pci/devices/<BDF>/rom_armor_enforced file reports
+ whether RomArmor SPI protection is enforced.
+ Possible values:
+ 0: Not enforced
+ 1: Enforced
diff --git a/Documentation/ABI/testing/sysfs-driver-chromeos-acpi b/Documentation/ABI/testing/sysfs-driver-chromeos-acpi
new file mode 100644
index 000000000000..c308926e1568
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-chromeos-acpi
@@ -0,0 +1,137 @@
+What: /sys/bus/platform/devices/GGL0001:*/BINF.2
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns active EC firmware of current boot (boolean).
+
+ == ===============================
+ 0 Read only (recovery) firmware.
+ 1 Rewritable firmware.
+ == ===============================
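+
+ e.g. reading the attribute (the ACPI instance suffix ":00" is
+ hypothetical):
+
+ $ cat /sys/bus/platform/devices/GGL0001:00/BINF.2
+ 1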
+
+What: /sys/bus/platform/devices/GGL0001:*/BINF.3
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns main firmware type for current boot (integer).
+
+ == =====================================
+ 0 Recovery.
+ 1 Normal.
+ 2 Developer.
+ 3 Netboot (factory installation only).
+ == =====================================
+
+What: /sys/bus/platform/devices/GGL0001:*/CHSW
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns switch position for Chrome OS specific hardware
+ switches when the firmware is booted (integer).
+
+ ==== ===========================================
+ 0 No changes.
+ 2 Recovery button was pressed.
+ 4 Recovery button was pressed (EC firmware).
+ 32 Developer switch was enabled.
+ 512 Firmware write protection was disabled.
+ ==== ===========================================
+
+What: /sys/bus/platform/devices/GGL0001:*/FMAP
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns physical memory address of the start of the main
+ processor firmware flashmap.
+
+What: /sys/bus/platform/devices/GGL0001:*/FRID
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns firmware version for the read-only portion of the
+ main processor firmware.
+
+What: /sys/bus/platform/devices/GGL0001:*/FWID
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns firmware version for the rewritable portion of the
+ main processor firmware.
+
+What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.0
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns type of the GPIO signal for the Chrome OS specific
+ GPIO assignments (integer).
+
+ =========== ==================================
+ 1 Recovery button.
+ 2 Developer mode switch.
+ 3 Firmware write protection switch.
+ 256 to 511 Debug header GPIO 0 to GPIO 255.
+ =========== ==================================
+
+What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.1
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns signal attributes of the GPIO signal (integer bitfield).
+
+ == =======================
+ 0 Signal is active low.
+ 1 Signal is active high.
+ == =======================
+
+What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.2
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns the GPIO number on the specified GPIO
+ controller.
+
+What: /sys/bus/platform/devices/GGL0001:*/GPIO.X/GPIO.3
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns name of the GPIO controller.
+
+What: /sys/bus/platform/devices/GGL0001:*/HWID
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns hardware ID for the Chromebook.
+
+What: /sys/bus/platform/devices/GGL0001:*/MECK
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns the SHA-1 or SHA-256 hash that is read out of the
+ Management Engine extended registers during boot. The hash
+ is exported via ACPI so the OS can verify that the Management
+ Engine firmware has not changed. If Management Engine is not
+ present, or if the firmware was unable to read the extended
+ registers, this buffer size can be zero.
+
+What: /sys/bus/platform/devices/GGL0001:*/VBNV.0
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns offset in CMOS bank 0 of the verified boot non-volatile
+ storage block, counting from the first writable CMOS byte
+ (that is, 'offset = 0' is the byte following the 14 bytes of
+ clock data).
+
+What: /sys/bus/platform/devices/GGL0001:*/VBNV.1
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns the size in bytes of the verified boot non-volatile
+ storage block.
+
+What: /sys/bus/platform/devices/GGL0001:*/VDAT
+Date: May 2022
+KernelVersion: 5.19
+Description:
+ Returns the verified boot data block shared between the
+ firmware verification step and the kernel verification step
+ (binary).
diff --git a/Documentation/ABI/testing/sysfs-driver-eud b/Documentation/ABI/testing/sysfs-driver-eud
new file mode 100644
index 000000000000..83f3872182a4
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-eud
@@ -0,0 +1,9 @@
+What: /sys/bus/platform/drivers/eud/.../enable
+Date: February 2022
+Contact: Souradeep Chowdhury <quic_schowdhu@quicinc.com>
+Description:
+ The enable/disable sysfs interface for the Embedded
+ USB Debugger (EUD). Writing 1 enables the EUD and
+ writing 0 disables it. Enabling the EUD activates
+ the mini-USB hub of the EUD for debug and trace
+ capabilities.
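+
+ e.g. enabling the EUD (a sketch; "..." stands for the device
+ instance):
+
+ echo 1 > /sys/bus/platform/drivers/eud/.../enable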
diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs
index 1f127f71d2b4..1b98b6503b23 100644
--- a/Documentation/ABI/testing/sysfs-driver-habanalabs
+++ b/Documentation/ABI/testing/sysfs-driver-habanalabs
@@ -16,7 +16,7 @@ Description: Version of the application running on the device's CPU
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
Date: Jun 2019
-KernelVersion: not yet upstreamed
+KernelVersion: 5.7
Contact: ogabbay@kernel.org
Description: Allows the user to set the maximum clock frequency, in MHz.
The device clock might be set to lower value than the maximum.
@@ -26,7 +26,7 @@ Description: Allows the user to set the maximum clock frequency, in MHz.
What: /sys/class/habanalabs/hl<n>/clk_cur_freq_mhz
Date: Jun 2019
-KernelVersion: not yet upstreamed
+KernelVersion: 5.7
Contact: ogabbay@kernel.org
Description: Displays the current frequency, in MHz, of the device clock.
This property is valid only for the Gaudi ASIC family
@@ -69,6 +69,12 @@ KernelVersion: 5.1
Contact: ogabbay@kernel.org
Description: Displays the device's version from the eFuse
+What: /sys/class/habanalabs/hl<n>/fw_os_ver
+Date: Dec 2021
+KernelVersion: 5.18
+Contact: ogabbay@kernel.org
+Description: Version of the firmware OS running on the device's CPU
+
What: /sys/class/habanalabs/hl<n>/hard_reset
Date: Jan 2019
KernelVersion: 5.1
@@ -115,7 +121,7 @@ What: /sys/class/habanalabs/hl<n>/infineon_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
-Description: Version of the Device's power supply F/W code
+Description: Version of the Device's power supply F/W code. Relevant only to GOYA and GAUDI
What: /sys/class/habanalabs/hl<n>/max_power
Date: Jan 2019
@@ -170,6 +176,12 @@ KernelVersion: 5.1
Contact: ogabbay@kernel.org
Description: Version of the device's preboot F/W code
+What: /sys/class/habanalabs/hl<n>/security_enabled
+Date: Oct 2022
+KernelVersion: 6.1
+Contact: obitton@habana.ai
+Description: Displays the device's security status
+
What: /sys/class/habanalabs/hl<n>/soft_reset
Date: Jan 2019
KernelVersion: 5.1
@@ -189,7 +201,19 @@ What: /sys/class/habanalabs/hl<n>/status
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
-Description: Status of the card: "Operational", "Malfunction", "In reset".
+Description: Status of the card:
+
+ * "operational" - Device is available for work.
+ * "in reset" - Device is going through reset, will be
+ available shortly.
+ * "disabled" - Device is not usable.
+ * "needs reset" - Device is not usable until a hard reset
+ is initiated.
+ * "in device creation" - Device is not available yet, as it
+ is still initializing.
+ * "in reset after device release" - Device is going through
+ a compute-reset which is executed after a device release
+ (relevant for Gaudi2 only).
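+
+ e.g. ("hl0" is a hypothetical device instance):
+
+ cat /sys/class/habanalabs/hl0/status
+ operational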
What: /sys/class/habanalabs/hl<n>/thermal_ver
Date: Jan 2019
@@ -220,4 +244,10 @@ What: /sys/class/habanalabs/hl<n>/uboot_ver
Date: Jan 2019
KernelVersion: 5.1
Contact: ogabbay@kernel.org
-Description: Version of the u-boot running on the device's CPU
\ No newline at end of file
+Description: Version of the u-boot running on the device's CPU
+
+What: /sys/class/habanalabs/hl<n>/vrm_ver
+Date: Jan 2022
+KernelVersion: 5.17
+Contact: ogabbay@kernel.org
+Description: Version of the Device's Voltage Regulator Monitor F/W code. N/A to GOYA and GAUDI
diff --git a/Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon b/Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon
new file mode 100644
index 000000000000..8d7d8f05f6cd
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-intel-i915-hwmon
@@ -0,0 +1,77 @@
+What: /sys/devices/.../hwmon/hwmon<i>/in0_input
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RO. Current Voltage in millivolt.
+
+ Only supported for particular Intel i915 graphics platforms.
+
+What: /sys/devices/.../hwmon/hwmon<i>/power1_max
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RW. Card reactive sustained (PL1/Tau) power limit in microwatts.
+
+ The power controller will throttle the operating frequency
+ if the power averaged over a window (typically seconds)
+ exceeds this limit. A read value of 0 means that the PL1
+ power limit is disabled; writing 0 disables the limit.
+ Writing a value > 0 enables the power limit.
+
+ Only supported for particular Intel i915 graphics platforms.
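+
+ e.g. setting a 25 W (25000000 uW) sustained limit (a sketch;
+ the hwmon index and device path vary by system):
+
+ # echo 25000000 > /sys/devices/.../hwmon/hwmon2/power1_max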
+
+What: /sys/devices/.../hwmon/hwmon<i>/power1_rated_max
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RO. Card default power limit (default TDP setting).
+
+ Only supported for particular Intel i915 graphics platforms.
+
+What: /sys/devices/.../hwmon/hwmon<i>/power1_max_interval
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RW. Sustained power limit interval (Tau in PL1/Tau) in
+ milliseconds over which sustained power is averaged.
+
+ Only supported for particular Intel i915 graphics platforms.
+
+What: /sys/devices/.../hwmon/hwmon<i>/power1_crit
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RW. Card reactive critical (I1) power limit in microwatts.
+
+ Card reactive critical (I1) power limit in microwatts is exposed
+ for client products. The power controller will throttle the
+ operating frequency if the power averaged over a window exceeds
+ this limit.
+
+ Only supported for particular Intel i915 graphics platforms.
+
+What: /sys/devices/.../hwmon/hwmon<i>/curr1_crit
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RW. Card reactive critical (I1) power limit in milliamperes.
+
+ Card reactive critical (I1) power limit in milliamperes is
+ exposed for server products. The power controller will throttle
+ the operating frequency if the power averaged over a window
+ exceeds this limit.
+
+ Only supported for particular Intel i915 graphics platforms.
+
+What: /sys/devices/.../hwmon/hwmon<i>/energy1_input
+Date: February 2023
+KernelVersion: 6.2
+Contact: intel-gfx@lists.freedesktop.org
+Description: RO. Energy input of the device or gt in microjoules.
+
+ For i915 device level hwmon devices (name "i915") this
+ reflects energy input for the entire device. For gt level
+ hwmon devices (name "i915_gtN") this reflects energy input
+ for the gt.
+
+ Only supported for particular Intel i915 graphics platforms.
diff --git a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc
index 9773925138af..a8ab58035c95 100644
--- a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc
+++ b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc
@@ -1,4 +1,4 @@
-What: /sys/bus/spi/devices/.../bmc_version
+What: /sys/bus/.../drivers/intel-m10-bmc/.../bmc_version
Date: June 2020
KernelVersion: 5.10
Contact: Xu Yilun <yilun.xu@intel.com>
@@ -6,7 +6,7 @@ Description: Read only. Returns the hardware build version of Intel
MAX10 BMC chip.
Format: "0x%x".
-What: /sys/bus/spi/devices/.../bmcfw_version
+What: /sys/bus/.../drivers/intel-m10-bmc/.../bmcfw_version
Date: June 2020
KernelVersion: 5.10
Contact: Xu Yilun <yilun.xu@intel.com>
@@ -14,7 +14,7 @@ Description: Read only. Returns the firmware version of Intel MAX10
BMC chip.
Format: "0x%x".
-What: /sys/bus/spi/devices/.../mac_address
+What: /sys/bus/.../drivers/intel-m10-bmc/.../mac_address
Date: January 2021
KernelVersion: 5.12
Contact: Russ Weight <russell.h.weight@intel.com>
@@ -25,7 +25,7 @@ Description: Read only. Returns the first MAC address in a block
space.
Format: "%02x:%02x:%02x:%02x:%02x:%02x".
-What: /sys/bus/spi/devices/.../mac_count
+What: /sys/bus/.../drivers/intel-m10-bmc/.../mac_count
Date: January 2021
KernelVersion: 5.12
Contact: Russ Weight <russell.h.weight@intel.com>
diff --git a/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update
new file mode 100644
index 000000000000..0a41afe0ab4c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-intel-m10-bmc-sec-update
@@ -0,0 +1,61 @@
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/sr_root_entry_hash
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns the root entry hash for the static
+ region if one is programmed, else it returns the
+ string: "hash not programmed". This file is only
+ visible if the underlying device supports it.
+ Format: string.
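+
+ e.g. on a device with no programmed hash (a sketch; "..." is
+ the device instance):
+
+ $ cat .../security/sr_root_entry_hash
+ hash not programmed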
+
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/pr_root_entry_hash
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns the root entry hash for the partial
+ reconfiguration region if one is programmed, else it
+ returns the string: "hash not programmed". This file
+ is only visible if the underlying device supports it.
+ Format: string.
+
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/bmc_root_entry_hash
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns the root entry hash for the BMC image
+ if one is programmed, else it returns the string:
+ "hash not programmed". This file is only visible if the
+ underlying device supports it.
+ Format: string.
+
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/sr_canceled_csks
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns a list of indices for canceled code
+ signing keys for the static region. The standard bitmap
+ list format is used (e.g. "1,2-6,9").
+
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/pr_canceled_csks
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns a list of indices for canceled code
+ signing keys for the partial reconfiguration region. The
+ standard bitmap list format is used (e.g. "1,2-6,9").
+
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/bmc_canceled_csks
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns a list of indices for canceled code
+ signing keys for the BMC. The standard bitmap list format
+ is used (e.g. "1,2-6,9").
+
+What: /sys/bus/platform/drivers/intel-m10bmc-sec-update/.../security/flash_count
+Date: Sep 2022
+KernelVersion: 5.20
+Contact: Russ Weight <russell.h.weight@intel.com>
+Description: Read only. Returns number of times the secure update
+ staging area has been flashed.
+ Format: "%u".
diff --git a/Documentation/ABI/testing/sysfs-driver-intel_sdsi b/Documentation/ABI/testing/sysfs-driver-intel_sdsi
new file mode 100644
index 000000000000..f8afed127107
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-intel_sdsi
@@ -0,0 +1,90 @@
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ This directory contains interface files for accessing Intel
+ On Demand (formerly Software Defined Silicon or SDSi) features
+ on a CPU. X represents the socket instance (though not the
+ socket ID). The socket ID is determined by reading the
+ registers file and decoding it per the specification.
+
+ Some files communicate with On Demand hardware through a
+ mailbox. Should the operation fail, one of the following error
+ codes may be returned:
+
+ ========== =====
+ Error Code Cause
+ ========== =====
+ EIO General mailbox failure. Log may indicate cause.
+ EBUSY Mailbox is owned by another agent.
+ EPERM On Demand capability is not enabled in hardware.
+ EPROTO Failure in mailbox protocol detected by driver.
+ See log for details.
+ EOVERFLOW For provision commands, the size of the data
+ exceeds what may be written.
+ ESPIPE Seeking is not allowed.
+ ETIMEDOUT Failure to complete mailbox transaction in time.
+ ========== =====
+
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/guid
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ (RO) The GUID for the registers file. The GUID identifies
+ the layout of the registers file in this directory.
+ Information about the register layouts for a particular GUID
+ is available at http://github.com/intel/intel-sdsi
+
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/registers
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ (RO) Contains information needed by applications to provision
+ a CPU and monitor status information. The layout of this file
+ is determined by the GUID in this directory. Information about
+ the layout for a particular GUID is available at
+ http://github.com/intel/intel-sdsi
+
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/provision_akc
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ (WO) Used to write an Authentication Key Certificate (AKC) to
+ the On Demand NVRAM for the CPU. The AKC is used to authenticate
+ a Capability Activation Payload. Mailbox command.
+
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/provision_cap
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ (WO) Used to write a Capability Activation Payload (CAP) to the
+ On Demand NVRAM for the CPU. CAPs are used to activate a given
+ CPU feature. A CAP is validated by On Demand hardware using a
+ previously provisioned AKC file. Upon successful authentication,
+ the CPU configuration is updated. A cold reboot is required to
+ fully activate the feature. Mailbox command.
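+
+ A minimal provisioning sketch (the file names and socket
+ instance "0" are hypothetical):
+
+ # dd if=akc.bin of=/sys/bus/auxiliary/devices/intel_vsec.sdsi.0/provision_akc
+ # dd if=cap.bin of=/sys/bus/auxiliary/devices/intel_vsec.sdsi.0/provision_cap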
+
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/meter_certificate
+Date: Nov 2022
+KernelVersion: 6.2
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ (RO) Used to read back the current meter certificate for the CPU
+ from Intel On Demand hardware. The meter certificate contains
+ utilization metrics of On Demand enabled features. Mailbox
+ command.
+
+What: /sys/bus/auxiliary/devices/intel_vsec.sdsi.X/state_certificate
+Date: Feb 2022
+KernelVersion: 5.18
+Contact: "David E. Box" <david.e.box@linux.intel.com>
+Description:
+ (RO) Used to read back the current state certificate for the CPU
+ from On Demand hardware. The state certificate contains
+ information about the current licenses on the CPU. Mailbox
+ command.
diff --git a/Documentation/ABI/testing/sysfs-driver-qat b/Documentation/ABI/testing/sysfs-driver-qat
new file mode 100644
index 000000000000..087842b1969e
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-qat
@@ -0,0 +1,49 @@
+What: /sys/bus/pci/devices/<BDF>/qat/state
+Date: June 2022
+KernelVersion: 6.0
+Contact: qat-linux@intel.com
+Description: (RW) Reports the current state of the QAT device. Write to
+ the file to start or stop the device.
+
+ The values are:
+
+ * up: the device is up and running
+ * down: the device is down
+
+ The device can be transitioned from up to down only if it is
+ currently up, and vice versa.
+
+ This attribute is only available for qat_4xxx devices.
+
+What: /sys/bus/pci/devices/<BDF>/qat/cfg_services
+Date: June 2022
+KernelVersion: 6.0
+Contact: qat-linux@intel.com
+Description: (RW) Reports the current configuration of the QAT device.
+ Write to the file to change the configured services.
+
+ The values are:
+
+ * sym;asym: the device is configured for running crypto
+ services
+ * dc: the device is configured for running compression services
+
+ It is possible to set the configuration only if the device
+ is in the `down` state (see /sys/bus/pci/devices/<BDF>/qat/state).
+
+ The following example shows how to change the configuration of
+ a device configured for running crypto services in order to
+ run data compression::
+
+ # cat /sys/bus/pci/devices/<BDF>/qat/state
+ up
+ # cat /sys/bus/pci/devices/<BDF>/qat/cfg_services
+ sym;asym
+ # echo down > /sys/bus/pci/devices/<BDF>/qat/state
+ # echo dc > /sys/bus/pci/devices/<BDF>/qat/cfg_services
+ # echo up > /sys/bus/pci/devices/<BDF>/qat/state
+ # cat /sys/bus/pci/devices/<BDF>/qat/cfg_services
+ dc
+
+ This attribute is only available for qat_4xxx devices.
diff --git a/Documentation/ABI/testing/sysfs-driver-typec-displayport b/Documentation/ABI/testing/sysfs-driver-typec-displayport
index 231471ad0d4b..256c87c5219a 100644
--- a/Documentation/ABI/testing/sysfs-driver-typec-displayport
+++ b/Documentation/ABI/testing/sysfs-driver-typec-displayport
@@ -47,3 +47,18 @@ Description:
USB SuperSpeed protocol. From user perspective pin assignments C
and E are equal, where all channels on the connector are used
for carrying DisplayPort protocol (allowing higher resolutions).
+
+What: /sys/bus/typec/devices/.../displayport/hpd
+Date: Dec 2022
+Contact: Badhri Jagan Sridharan <badhri@google.com>
+Description:
+ VESA DisplayPort Alt Mode on USB Type-C Standard defines how
+ HotPlugDetect(HPD) shall be supported on the USB-C connector when
+ operating in DisplayPort Alt Mode. This is a read only node which
+ reflects the current state of HPD.
+
+ Valid values:
+ - 1: when HPD’s logical state is high (HPD_High) as defined
+ by VESA DisplayPort Alt Mode on USB Type-C Standard.
+ - 0: when HPD’s logical state is low (HPD_Low) as defined by
+ VESA DisplayPort Alt Mode on USB Type-C Standard.
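+
+ e.g. while a DisplayPort partner is attached and HPD is high
+ (a sketch; the device path varies):
+
+ $ cat /sys/bus/typec/devices/.../displayport/hpd
+ 1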
diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce
index 08f2591138af..d3f0b8f3c589 100644
--- a/Documentation/ABI/testing/sysfs-driver-uacce
+++ b/Documentation/ABI/testing/sysfs-driver-uacce
@@ -19,6 +19,24 @@ Contact: linux-accelerators@lists.ozlabs.org
Description: Available instances left of the device
Return -ENODEV if uacce_ops get_available_instances is not provided
+What: /sys/class/uacce/<dev_name>/isolate_strategy
+Date: Nov 2022
+KernelVersion: 6.1
+Contact: linux-accelerators@lists.ozlabs.org
+Description: (RW) Configures the error threshold for the hardware isolation
+ strategy. The value is the number of hardware errors allowed
+ within one hour before the device is isolated. The default is 0,
+ which means the device is never isolated. The maximum value is
+ 65535. Write a threshold appropriate for your hardware.
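+
+ e.g. allow at most ten hardware errors per hour before the
+ device is isolated (a sketch):
+
+ echo 10 > /sys/class/uacce/<dev_name>/isolate_strategy
+ cat /sys/class/uacce/<dev_name>/isolate
+ 0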
+
+What: /sys/class/uacce/<dev_name>/isolate
+Date: Nov 2022
+KernelVersion: 6.1
+Contact: linux-accelerators@lists.ozlabs.org
+Description: (RO) Reads the device isolation state. The value 1 means the
+ device is isolated (unavailable); 0 means the device is
+ available.
+
What: /sys/class/uacce/<dev_name>/algorithms
Date: Feb 2020
KernelVersion: 5.7
diff --git a/Documentation/ABI/testing/sysfs-driver-ufs b/Documentation/ABI/testing/sysfs-driver-ufs
index ec3a7149ced5..228aa43e14ed 100644
--- a/Documentation/ABI/testing/sysfs-driver-ufs
+++ b/Documentation/ABI/testing/sysfs-driver-ufs
@@ -13,6 +13,7 @@ Description:
Interface specification for more details.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/device_type
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/device_type
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the device type. This is one of the UFS
@@ -22,6 +23,7 @@ Description: This file shows the device type. This is one of the UFS
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/device_class
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/device_class
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the device class. This is one of the UFS
@@ -31,6 +33,7 @@ Description: This file shows the device class. This is one of the UFS
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/device_sub_class
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/device_sub_class
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the UFS storage subclass. This is one of
@@ -40,6 +43,7 @@ Description: This file shows the UFS storage subclass. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/protocol
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/protocol
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the protocol supported by an UFS device.
@@ -50,6 +54,7 @@ Description: This file shows the protocol supported by an UFS device.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/number_of_luns
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/number_of_luns
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows number of logical units. This is one of
@@ -59,6 +64,7 @@ Description: This file shows number of logical units. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/number_of_wluns
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/number_of_wluns
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows number of well known logical units.
@@ -69,6 +75,7 @@ Description: This file shows number of well known logical units.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/boot_enable
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/boot_enable
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows value that indicates whether the device is
@@ -79,6 +86,7 @@ Description: This file shows value that indicates whether the device is
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/descriptor_access_enable
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/descriptor_access_enable
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows value that indicates whether the device
@@ -90,6 +98,7 @@ Description: This file shows value that indicates whether the device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/initial_power_mode
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/initial_power_mode
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows value that defines the power mode after
@@ -100,6 +109,7 @@ Description: This file shows value that defines the power mode after
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/high_priority_lun
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/high_priority_lun
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the high priority lun. This is one of
@@ -109,6 +119,7 @@ Description: This file shows the high priority lun. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/secure_removal_type
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/secure_removal_type
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the secure removal type. This is one of
@@ -118,6 +129,7 @@ Description: This file shows the secure removal type. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/support_security_lun
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/support_security_lun
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether the security lun is supported.
@@ -128,6 +140,7 @@ Description: This file shows whether the security lun is supported.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/bkops_termination_latency
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/bkops_termination_latency
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the background operations termination
@@ -138,6 +151,7 @@ Description: This file shows the background operations termination
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/initial_active_icc_level
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/initial_active_icc_level
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the initial active ICC level. This is one
@@ -147,6 +161,7 @@ Description: This file shows the initial active ICC level. This is one
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/specification_version
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/specification_version
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the specification version. This is one
@@ -156,6 +171,7 @@ Description: This file shows the specification version. This is one
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/manufacturing_date
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/manufacturing_date
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the manufacturing date in BCD format.
@@ -166,6 +182,7 @@ Description: This file shows the manufacturing date in BCD format.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/manufacturer_id
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/manufacturer_id
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the manufacturer ID. This is one of the
@@ -175,6 +192,7 @@ Description: This file shows the manufacturer ID. This is one of the
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/rtt_capability
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/rtt_capability
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum number of outstanding RTTs
@@ -185,6 +203,7 @@ Description: This file shows the maximum number of outstanding RTTs
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/rtc_update
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/rtc_update
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the frequency and method of the realtime
@@ -195,6 +214,7 @@ Description: This file shows the frequency and method of the realtime
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/ufs_features
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/ufs_features
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows which features are supported by the device.
@@ -205,6 +225,7 @@ Description: This file shows which features are supported by the device.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/ffu_timeout
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/ffu_timeout
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the FFU timeout. This is one of the
@@ -214,6 +235,7 @@ Description: This file shows the FFU timeout. This is one of the
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/queue_depth
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/queue_depth
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the device queue depth. This is one of the
@@ -223,6 +245,7 @@ Description: This file shows the device queue depth. This is one of the
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/device_version
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/device_version
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the device version. This is one of the
@@ -232,6 +255,7 @@ Description: This file shows the device version. This is one of the
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/number_of_secure_wpa
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/number_of_secure_wpa
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows number of secure write protect areas
@@ -242,6 +266,7 @@ Description: This file shows number of secure write protect areas
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/psa_max_data_size
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/psa_max_data_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum amount of data that may be
@@ -253,6 +278,7 @@ Description: This file shows the maximum amount of data that may be
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/psa_state_timeout
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/psa_state_timeout
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the command maximum timeout for a change
@@ -264,6 +290,7 @@ Description: This file shows the command maximum timeout for a change
What: /sys/bus/platform/drivers/ufshcd/*/interconnect_descriptor/unipro_version
+What: /sys/bus/platform/devices/*.ufs/interconnect_descriptor/unipro_version
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the MIPI UniPro version number in BCD format.
@@ -274,6 +301,7 @@ Description: This file shows the MIPI UniPro version number in BCD format.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/interconnect_descriptor/mphy_version
+What: /sys/bus/platform/devices/*.ufs/interconnect_descriptor/mphy_version
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the MIPI M-PHY version number in BCD format.
@@ -285,6 +313,7 @@ Description: This file shows the MIPI M-PHY version number in BCD format.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/raw_device_capacity
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/raw_device_capacity
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the total memory quantity available to
@@ -296,6 +325,7 @@ Description: This file shows the total memory quantity available to
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/max_number_of_luns
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/max_number_of_luns
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum number of logical units
@@ -306,6 +336,7 @@ Description: This file shows the maximum number of logical units
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/segment_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/segment_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the segment size. This is one of the UFS
@@ -315,6 +346,7 @@ Description: This file shows the segment size. This is one of the UFS
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/allocation_unit_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/allocation_unit_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the allocation unit size. This is one of
@@ -324,6 +356,7 @@ Description: This file shows the allocation unit size. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/min_addressable_block_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/min_addressable_block_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the minimum addressable block size. This
@@ -334,6 +367,7 @@ Description: This file shows the minimum addressable block size. This
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/optimal_read_block_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/optimal_read_block_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the optimal read block size. This is one
@@ -344,6 +378,7 @@ Description: This file shows the optimal read block size. This is one
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/optimal_write_block_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/optimal_write_block_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the optimal write block size. This is one
@@ -354,6 +389,7 @@ Description: This file shows the optimal write block size. This is one
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/max_in_buffer_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/max_in_buffer_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum data-in buffer size. This
@@ -364,6 +400,7 @@ Description: This file shows the maximum data-in buffer size. This
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/max_out_buffer_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/max_out_buffer_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum data-out buffer size. This
@@ -374,6 +411,7 @@ Description: This file shows the maximum data-out buffer size. This
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/rpmb_rw_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/rpmb_rw_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum number of RPMB frames allowed
@@ -384,6 +422,7 @@ Description: This file shows the maximum number of RPMB frames allowed
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/dyn_capacity_resource_policy
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/dyn_capacity_resource_policy
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the dynamic capacity resource policy. This
@@ -394,6 +433,7 @@ Description: This file shows the dynamic capacity resource policy. This
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/data_ordering
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/data_ordering
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows support for out-of-order data transfer.
@@ -404,6 +444,7 @@ Description: This file shows support for out-of-order data transfer.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/max_number_of_contexts
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/max_number_of_contexts
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows maximum available number of contexts which
@@ -414,6 +455,7 @@ Description: This file shows maximum available number of contexts which
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/sys_data_tag_unit_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/sys_data_tag_unit_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows system data tag unit size. This is one of
@@ -423,6 +465,7 @@ Description: This file shows system data tag unit size. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/sys_data_tag_resource_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/sys_data_tag_resource_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows maximum storage area size allocated by
@@ -434,6 +477,7 @@ Description: This file shows maximum storage area size allocated by
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/secure_removal_types
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/secure_removal_types
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows supported secure removal types. This is
@@ -444,6 +488,7 @@ Description: This file shows supported secure removal types. This is
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/memory_types
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/memory_types
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows supported memory types. This is one of
@@ -454,6 +499,7 @@ Description: This file shows supported memory types. This is one of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/*_memory_max_alloc_units
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/*_memory_max_alloc_units
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum number of allocation units for
@@ -465,6 +511,7 @@ Description: This file shows the maximum number of allocation units for
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/*_memory_capacity_adjustment_factor
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/*_memory_capacity_adjustment_factor
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the memory capacity adjustment factor for
@@ -477,6 +524,7 @@ Description: This file shows the memory capacity adjustment factor for
What: /sys/bus/platform/drivers/ufshcd/*/health_descriptor/eol_info
+What: /sys/bus/platform/devices/*.ufs/health_descriptor/eol_info
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows pre-end of life information. This is one
@@ -487,6 +535,7 @@ Description: This file shows preend of life information. This is one
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/health_descriptor/life_time_estimation_a
+What: /sys/bus/platform/devices/*.ufs/health_descriptor/life_time_estimation_a
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows indication of the device life time
@@ -497,6 +546,7 @@ Description: This file shows indication of the device life time
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/health_descriptor/life_time_estimation_b
+What: /sys/bus/platform/devices/*.ufs/health_descriptor/life_time_estimation_b
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows indication of the device life time
@@ -508,6 +558,7 @@ Description: This file shows indication of the device life time
What: /sys/bus/platform/drivers/ufshcd/*/power_descriptor/active_icc_levels_vcc*
+What: /sys/bus/platform/devices/*.ufs/power_descriptor/active_icc_levels_vcc*
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows maximum VCC, VCCQ and VCCQ2 value for
@@ -519,6 +570,7 @@ Description: This file shows maximum VCC, VCCQ and VCCQ2 value for
What: /sys/bus/platform/drivers/ufshcd/*/string_descriptors/manufacturer_name
+What: /sys/bus/platform/devices/*.ufs/string_descriptors/manufacturer_name
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file contains a device manufacturer name string.
@@ -528,6 +580,7 @@ Description: This file contains a device manufacturer name string.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/string_descriptors/product_name
+What: /sys/bus/platform/devices/*.ufs/string_descriptors/product_name
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file contains a product name string. The full information
@@ -536,6 +589,7 @@ Description: This file contains a product name string. The full information
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/string_descriptors/oem_id
+What: /sys/bus/platform/devices/*.ufs/string_descriptors/oem_id
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file contains a OEM ID string. The full information
@@ -544,6 +598,7 @@ Description: This file contains a OEM ID string. The full information
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/string_descriptors/serial_number
+What: /sys/bus/platform/devices/*.ufs/string_descriptors/serial_number
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file contains a device serial number string. The full
@@ -553,6 +608,7 @@ Description: This file contains a device serial number string. The full
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/string_descriptors/product_revision
+What: /sys/bus/platform/devices/*.ufs/string_descriptors/product_revision
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file contains a product revision string. The full
@@ -684,6 +740,7 @@ Description: This file shows the granularity of the LUN. This is one of
What: /sys/bus/platform/drivers/ufshcd/*/flags/device_init
+What: /sys/bus/platform/devices/*.ufs/flags/device_init
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the device init status. The full information
@@ -692,6 +749,7 @@ Description: This file shows the device init status. The full information
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/permanent_wpe
+What: /sys/bus/platform/devices/*.ufs/flags/permanent_wpe
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether permanent write protection is enabled.
@@ -701,6 +759,7 @@ Description: This file shows whether permanent write protection is enabled.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/power_on_wpe
+What: /sys/bus/platform/devices/*.ufs/flags/power_on_wpe
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether write protection is enabled on all
@@ -711,6 +770,7 @@ Description: This file shows whether write protection is enabled on all
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/bkops_enable
+What: /sys/bus/platform/devices/*.ufs/flags/bkops_enable
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether the device background operations are
@@ -720,6 +780,7 @@ Description: This file shows whether the device background operations are
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/life_span_mode_enable
+What: /sys/bus/platform/devices/*.ufs/flags/life_span_mode_enable
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether the device life span mode is enabled.
@@ -729,6 +790,7 @@ Description: This file shows whether the device life span mode is enabled.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/phy_resource_removal
+What: /sys/bus/platform/devices/*.ufs/flags/phy_resource_removal
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether physical resource removal is enabled.
@@ -738,6 +800,7 @@ Description: This file shows whether physical resource removal is enable.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/busy_rtc
+What: /sys/bus/platform/devices/*.ufs/flags/busy_rtc
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether the device is executing internal
@@ -747,6 +810,7 @@ Description: This file shows whether the device is executing internal
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/disable_fw_update
+What: /sys/bus/platform/devices/*.ufs/flags/disable_fw_update
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether the device FW update is permanently
@@ -757,6 +821,7 @@ Description: This file shows whether the device FW update is permanently
What: /sys/bus/platform/drivers/ufshcd/*/attributes/boot_lun_enabled
+What: /sys/bus/platform/devices/*.ufs/attributes/boot_lun_enabled
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the boot lun enabled UFS device attribute.
@@ -766,6 +831,7 @@ Description: This file provides the boot lun enabled UFS device attribute.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/current_power_mode
+What: /sys/bus/platform/devices/*.ufs/attributes/current_power_mode
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the current power mode UFS device attribute.
@@ -775,6 +841,7 @@ Description: This file provides the current power mode UFS device attribute.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/active_icc_level
+What: /sys/bus/platform/devices/*.ufs/attributes/active_icc_level
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the active icc level UFS device attribute.
@@ -784,6 +851,7 @@ Description: This file provides the active icc level UFS device attribute.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/ooo_data_enabled
+What: /sys/bus/platform/devices/*.ufs/attributes/ooo_data_enabled
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the out of order data transfer enabled UFS
@@ -793,6 +861,7 @@ Description: This file provides the out of order data transfer enabled UFS
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/bkops_status
+What: /sys/bus/platform/devices/*.ufs/attributes/bkops_status
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the background operations status UFS device
@@ -802,6 +871,7 @@ Description: This file provides the background operations status UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/purge_status
+What: /sys/bus/platform/devices/*.ufs/attributes/purge_status
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the purge operation status UFS device
@@ -811,6 +881,7 @@ Description: This file provides the purge operation status UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/max_data_in_size
+What: /sys/bus/platform/devices/*.ufs/attributes/max_data_in_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum data size in a DATA IN
@@ -820,6 +891,7 @@ Description: This file shows the maximum data size in a DATA IN
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/max_data_out_size
+What: /sys/bus/platform/devices/*.ufs/attributes/max_data_out_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the maximum number of bytes that can be
@@ -829,6 +901,7 @@ Description: This file shows the maximum number of bytes that can be
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/reference_clock_frequency
+What: /sys/bus/platform/devices/*.ufs/attributes/reference_clock_frequency
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the reference clock frequency UFS device
@@ -838,6 +911,7 @@ Description: This file provides the reference clock frequency UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/configuration_descriptor_lock
+What: /sys/bus/platform/devices/*.ufs/attributes/configuration_descriptor_lock
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows whether the configuration descriptor is locked.
@@ -845,6 +919,7 @@ Description: This file shows whether the configuration descriptor is locked.
UFS specifications 2.1. The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/max_number_of_rtt
+What: /sys/bus/platform/devices/*.ufs/attributes/max_number_of_rtt
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the maximum current number of
@@ -855,6 +930,7 @@ Description: This file provides the maximum current number of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/exception_event_control
+What: /sys/bus/platform/devices/*.ufs/attributes/exception_event_control
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the exception event control UFS device
@@ -864,6 +940,7 @@ Description: This file provides the exception event control UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/exception_event_status
+What: /sys/bus/platform/devices/*.ufs/attributes/exception_event_status
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the exception event status UFS device
@@ -873,6 +950,7 @@ Description: This file provides the exception event status UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/ffu_status
+What: /sys/bus/platform/devices/*.ufs/attributes/ffu_status
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file provides the ffu status UFS device attribute.
@@ -882,6 +960,7 @@ Description: This file provides the ffu status UFS device attribute.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/psa_state
+What: /sys/bus/platform/devices/*.ufs/attributes/psa_state
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description:	This file shows the PSA feature status. The full information
@@ -890,6 +969,7 @@ Description: This file show the PSA feature status. The full information
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/psa_data_size
+What: /sys/bus/platform/devices/*.ufs/attributes/psa_data_size
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
Description: This file shows the amount of data that the host plans to
@@ -903,7 +983,7 @@ Description: This file shows the amount of data that the host plans to
What: /sys/class/scsi_device/*/device/dyn_cap_needed
Date: February 2018
Contact: Stanislav Nijnikov <stanislav.nijnikov@wdc.com>
-Description: This file shows the The amount of physical memory needed
+Description: This file shows the amount of physical memory needed
to be removed from the physical memory resources pool of
the particular logical unit. The full information about
the attribute could be found at UFS specifications 2.1.
@@ -912,6 +992,7 @@ Description: This file shows the The amount of physical memory needed
What: /sys/bus/platform/drivers/ufshcd/*/rpm_lvl
+What: /sys/bus/platform/devices/*.ufs/rpm_lvl
Date: September 2014
Contact: Subhash Jadavani <subhashj@codeaurora.org>
Description: This entry could be used to set or show the UFS device
@@ -938,6 +1019,7 @@ Description: This entry could be used to set or show the UFS device
== ====================================================
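+
+		For example, the level could be read and set as follows (the
+		"1d84000.ufs" device name below is a hypothetical example):
+
+		# cat /sys/bus/platform/devices/1d84000.ufs/rpm_lvl
+		3
+		# echo 1 > /sys/bus/platform/devices/1d84000.ufs/rpm_lvl
+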
What: /sys/bus/platform/drivers/ufshcd/*/rpm_target_dev_state
+What: /sys/bus/platform/devices/*.ufs/rpm_target_dev_state
Date: February 2018
Contact: Subhash Jadavani <subhashj@codeaurora.org>
Description: This entry shows the target power mode of an UFS device
@@ -946,6 +1028,7 @@ Description: This entry shows the target power mode of an UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/rpm_target_link_state
+What: /sys/bus/platform/devices/*.ufs/rpm_target_link_state
Date: February 2018
Contact: Subhash Jadavani <subhashj@codeaurora.org>
Description: This entry shows the target state of an UFS UIC link
@@ -954,6 +1037,7 @@ Description: This entry shows the target state of an UFS UIC link
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/spm_lvl
+What: /sys/bus/platform/devices/*.ufs/spm_lvl
Date: September 2014
Contact: Subhash Jadavani <subhashj@codeaurora.org>
Description: This entry could be used to set or show the UFS device
@@ -980,6 +1064,7 @@ Description: This entry could be used to set or show the UFS device
== ====================================================
What: /sys/bus/platform/drivers/ufshcd/*/spm_target_dev_state
+What: /sys/bus/platform/devices/*.ufs/spm_target_dev_state
Date: February 2018
Contact: Subhash Jadavani <subhashj@codeaurora.org>
Description: This entry shows the target power mode of an UFS device
@@ -988,6 +1073,7 @@ Description: This entry shows the target power mode of an UFS device
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/spm_target_link_state
+What: /sys/bus/platform/devices/*.ufs/spm_target_link_state
Date: February 2018
Contact: Subhash Jadavani <subhashj@codeaurora.org>
Description: This entry shows the target state of an UFS UIC link
@@ -996,6 +1082,7 @@ Description: This entry shows the target state of an UFS UIC link
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/monitor_enable
+What: /sys/bus/platform/devices/*.ufs/monitor/monitor_enable
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the status of performance monitor enablement
@@ -1003,6 +1090,7 @@ Description: This file shows the status of performance monitor enablement
is stopped, the performance data collected is also cleared.
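+
+		For example, monitoring could be enabled like this (the
+		"1d84000.ufs" device name is a hypothetical example):
+
+		# echo 1 > /sys/bus/platform/devices/1d84000.ufs/monitor/monitor_enable
+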
What: /sys/bus/platform/drivers/ufshcd/*/monitor/monitor_chunk_size
+What: /sys/bus/platform/devices/*.ufs/monitor/monitor_chunk_size
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file tells the monitor to focus on requests transferring
@@ -1010,6 +1098,7 @@ Description: This file tells the monitor to focus on requests transferring
It can only be changed when monitor is disabled.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_total_sectors
+What: /sys/bus/platform/devices/*.ufs/monitor/read_total_sectors
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows how many sectors (in 512 Bytes) have been
@@ -1018,6 +1107,7 @@ Description: This file shows how many sectors (in 512 Bytes) have been
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_total_busy
+What: /sys/bus/platform/devices/*.ufs/monitor/read_total_busy
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows how long (in micro seconds) has been spent
@@ -1026,6 +1116,7 @@ Description: This file shows how long (in micro seconds) has been spent
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_nr_requests
+What: /sys/bus/platform/devices/*.ufs/monitor/read_nr_requests
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows how many read requests have been sent after
@@ -1034,6 +1125,7 @@ Description: This file shows how many read requests have been sent after
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_max
+What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_max
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the maximum latency (in micro seconds) of
@@ -1042,6 +1134,7 @@ Description: This file shows the maximum latency (in micro seconds) of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_min
+What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_min
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the minimum latency (in micro seconds) of
@@ -1050,6 +1143,7 @@ Description: This file shows the minimum latency (in micro seconds) of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_avg
+What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_avg
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the average latency (in micro seconds) of
@@ -1058,6 +1152,7 @@ Description: This file shows the average latency (in micro seconds) of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_sum
+What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_sum
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the total latency (in micro seconds) of
@@ -1066,6 +1161,7 @@ Description: This file shows the total latency (in micro seconds) of
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_total_sectors
+What: /sys/bus/platform/devices/*.ufs/monitor/write_total_sectors
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows how many sectors (in 512 Bytes) have been sent
@@ -1074,6 +1170,7 @@ Description: This file shows how many sectors (in 512 Bytes) have been sent
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_total_busy
+What: /sys/bus/platform/devices/*.ufs/monitor/write_total_busy
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows how long (in micro seconds) has been spent
@@ -1082,6 +1179,7 @@ Description: This file shows how long (in micro seconds) has been spent
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_nr_requests
+What: /sys/bus/platform/devices/*.ufs/monitor/write_nr_requests
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows how many write requests have been sent after
@@ -1090,6 +1188,7 @@ Description: This file shows how many write requests have been sent after
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_max
+What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_max
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the maximum latency (in micro seconds) of write
@@ -1098,6 +1197,7 @@ Description: This file shows the maximum latency (in micro seconds) of write
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_min
+What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_min
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the minimum latency (in micro seconds) of write
@@ -1106,6 +1206,7 @@ Description: This file shows the minimum latency (in micro seconds) of write
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_avg
+What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_avg
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the average latency (in micro seconds) of write
@@ -1114,6 +1215,7 @@ Description: This file shows the average latency (in micro seconds) of write
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_sum
+What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_sum
Date: January 2021
Contact: Can Guo <cang@codeaurora.org>
Description: This file shows the total latency (in micro seconds) of write
@@ -1122,6 +1224,7 @@ Description: This file shows the total latency (in micro seconds) of write
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/wb_presv_us_en
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/wb_presv_us_en
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows if preserve user-space was configured
@@ -1129,6 +1232,7 @@ Description: This entry shows if preserve user-space was configured
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/wb_shared_alloc_units
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/wb_shared_alloc_units
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the shared allocated units of WB buffer
@@ -1136,6 +1240,7 @@ Description: This entry shows the shared allocated units of WB buffer
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/wb_type
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/wb_type
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the configured WB type.
@@ -1144,6 +1249,7 @@ Description: This entry shows the configured WB type.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_buff_cap_adj
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_buff_cap_adj
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the total user-space decrease in shared
@@ -1154,6 +1260,7 @@ Description: This entry shows the total user-space decrease in shared
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_max_alloc_units
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_max_alloc_units
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the Maximum total WriteBooster Buffer size
@@ -1162,6 +1269,7 @@ Description: This entry shows the Maximum total WriteBooster Buffer size
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_max_wb_luns
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_max_wb_luns
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the maximum number of luns that can support
@@ -1170,6 +1278,7 @@ Description: This entry shows the maximum number of luns that can support
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_sup_red_type
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_sup_red_type
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: The supportability of user space reduction mode
@@ -1184,6 +1293,7 @@ Description: The supportability of user space reduction mode
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_sup_wb_type
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_sup_wb_type
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: The supportability of WriteBooster Buffer type.
@@ -1198,6 +1308,7 @@ Description: The supportability of WriteBooster Buffer type.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/wb_enable
+What: /sys/bus/platform/devices/*.ufs/flags/wb_enable
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the status of WriteBooster.
@@ -1210,6 +1321,7 @@ Description: This entry shows the status of WriteBooster.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/wb_flush_en
+What: /sys/bus/platform/devices/*.ufs/flags/wb_flush_en
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows if flush is enabled.
@@ -1222,6 +1334,7 @@ Description: This entry shows if flush is enabled.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/wb_flush_during_h8
+What: /sys/bus/platform/devices/*.ufs/flags/wb_flush_during_h8
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: Flush WriteBooster Buffer during hibernate state.
@@ -1236,6 +1349,7 @@ Description: Flush WriteBooster Buffer during hibernate state.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_avail_buf
+What: /sys/bus/platform/devices/*.ufs/attributes/wb_avail_buf
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the amount of unused WriteBooster buffer
@@ -1244,6 +1358,7 @@ Description: This entry shows the amount of unused WriteBooster buffer
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_cur_buf
+What: /sys/bus/platform/devices/*.ufs/attributes/wb_cur_buf
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the amount of unused current buffer.
@@ -1251,6 +1366,7 @@ Description: This entry shows the amount of unused current buffer.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_flush_status
+What: /sys/bus/platform/devices/*.ufs/attributes/wb_flush_status
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows the flush operation status.
@@ -1267,6 +1383,7 @@ Description: This entry shows the flush operation status.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_life_time_est
+What: /sys/bus/platform/devices/*.ufs/attributes/wb_life_time_est
Date: June 2020
Contact: Asutosh Das <asutoshd@codeaurora.org>
Description: This entry shows an indication of the WriteBooster Buffer
@@ -1289,6 +1406,7 @@ Description: This entry shows the configured size of WriteBooster buffer.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/wb_on
+What: /sys/bus/platform/devices/*.ufs/wb_on
Date: January 2021
Contact: Bean Huo <beanhuo@micron.com>
Description: This node is used to set or display whether UFS WriteBooster is
@@ -1299,7 +1417,17 @@ Description: This node is used to set or display whether UFS WriteBooster is
platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can
disable/enable WriteBooster through this sysfs node.
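+
+		For example, WriteBooster could be toggled like this (the
+		"1d84000.ufs" device name is a hypothetical example):
+
+		# echo 1 > /sys/bus/platform/devices/1d84000.ufs/wb_on
+		# cat /sys/bus/platform/devices/1d84000.ufs/wb_on
+		1
+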
+What: /sys/bus/platform/drivers/ufshcd/*/enable_wb_buf_flush
+What: /sys/bus/platform/devices/*.ufs/enable_wb_buf_flush
+Date: July 2022
+Contact: Jinyoung Choi <j-young.choi@samsung.com>
+Description: This entry shows the status of WriteBooster buffer flushing
+ and it can be used to enable or disable the flushing.
+ If flushing is enabled, the device executes the flush
+ operation when the command queue is empty.
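+
+		For example, flushing could be enabled like this (the
+		"1d84000.ufs" device name is a hypothetical example):
+
+		# echo 1 > /sys/bus/platform/devices/1d84000.ufs/enable_wb_buf_flush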
+
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/hpb_version
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the HPB specification version.
@@ -1310,6 +1438,7 @@ Description: This entry shows the HPB specification version.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_control
+What: /sys/bus/platform/devices/*.ufs/device_descriptor/hpb_control
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows an indication of the HPB control mode.
@@ -1319,6 +1448,7 @@ Description: This entry shows an indication of the HPB control mode.
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/hpb_region_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/hpb_region_size
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the bHPBRegionSize which can be calculated
@@ -1328,6 +1458,7 @@ Description: This entry shows the bHPBRegionSize which can be calculated
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/hpb_number_lu
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/hpb_number_lu
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the maximum number of HPB LU supported by
@@ -1338,6 +1469,7 @@ Description: This entry shows the maximum number of HPB LU supported by
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/hpb_subregion_size
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/hpb_subregion_size
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the bHPBSubRegionSize, which can be
@@ -1349,6 +1481,7 @@ Description: This entry shows the bHPBSubRegionSize, which can be
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/hpb_max_active_regions
+What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/hpb_max_active_regions
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the maximum number of active HPB regions that
@@ -1394,7 +1527,7 @@ Description: This entry shows the number of reads that cannot be changed to
The file is read only.
-What: /sys/class/scsi_device/*/device/hpb_stats/rb_noti_cnt
+What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_noti_cnt
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the number of response UPIUs that has
@@ -1402,19 +1535,23 @@ Description: This entry shows the number of response UPIUs that has
The file is read only.
-What: /sys/class/scsi_device/*/device/hpb_stats/rb_active_cnt
+What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_active_cnt
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
-Description: This entry shows the number of active sub-regions recommended by
- response UPIUs.
+Description: For the HPB device control mode, this entry shows the number of
+ active sub-regions recommended by response UPIUs. For the HPB host control
+ mode, this entry shows the number of active sub-regions recommended by the
+ HPB host control mode heuristic algorithm.
The file is read only.
-What: /sys/class/scsi_device/*/device/hpb_stats/rb_inactive_cnt
+What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_inactive_cnt
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
-Description: This entry shows the number of inactive regions recommended by
- response UPIUs.
+Description: For the HPB device control mode, this entry shows the number of
+ inactive regions recommended by response UPIUs. For the HPB host control
+ mode, this entry shows the number of inactive regions recommended by the
+ HPB host control mode heuristic algorithm.
The file is read only.
@@ -1434,6 +1571,7 @@ Description: This entry shows the requeue timeout threshold for write buffer
this entry.
What: /sys/bus/platform/drivers/ufshcd/*/attributes/max_data_size_hpb_single_cmd
+What: /sys/bus/platform/devices/*.ufs/attributes/max_data_size_hpb_single_cmd
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the maximum HPB data size for using a single HPB
@@ -1450,6 +1588,7 @@ Description: This entry shows the maximum HPB data size for using a single HPB
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/flags/hpb_enable
+What: /sys/bus/platform/devices/*.ufs/flags/hpb_enable
Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the status of HPB.
@@ -1461,6 +1600,43 @@ Description: This entry shows the status of HPB.
The file is read only.
+What:		/sys/bus/platform/drivers/ufshcd/*/capabilities/
+What:		/sys/bus/platform/devices/*.ufs/capabilities/
+Date:		August 2022
+Contact:	Daniil Lunev <dlunev@chromium.org>
+Description:	The group represents the effective capabilities of the
+		host-device pair, i.e. the capabilities which are enabled in
+		the driver for the specific host controller, supported by the
+		host controller, and supported and/or compatibly configured
+		on the device side.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/capabilities/clock_scaling
+What:		/sys/bus/platform/devices/*.ufs/capabilities/clock_scaling
+Date:		August 2022
+Contact:	Daniil Lunev <dlunev@chromium.org>
+Description:	Indicates status of clock scaling.
+
+ == ============================
+ 0 Clock scaling is not supported.
+ 1 Clock scaling is supported.
+ == ============================
+
+ The file is read only.
+
+What: /sys/bus/platform/drivers/ufshcd/*/capabilities/write_booster
+What: /sys/bus/platform/devices/*.ufs/capabilities/write_booster
+Date: August 2022
+Contact: Daniil Lunev <dlunev@chromium.org>
+Description: Indicates status of Write Booster.
+
+ == ============================
+		0  Write Booster cannot be enabled.
+ 1 Write Booster can be enabled.
+ == ============================
+
+ The file is read only.
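+
+		For example (the "1d84000.ufs" device name is a hypothetical
+		example):
+
+		# cat /sys/bus/platform/devices/1d84000.ufs/capabilities/write_booster
+		1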
+
What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld
Date: February 2021
Contact: Avri Altman <avri.altman@wdc.com>
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
index ac2947b98950..fac0f429a869 100644
--- a/Documentation/ABI/testing/sysfs-driver-xen-blkback
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -29,7 +29,7 @@ Description:
What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms
Date: December 2019
KernelVersion: 5.6
-Contact: SeongJae Park <sjpark@amazon.de>
+Contact: Maximilian Heyne <mheyne@amazon.de>
Description:
When memory pressure is reported to blkback this option
controls the duration in milliseconds that blkback will not
@@ -39,8 +39,8 @@ Description:
What: /sys/module/xen_blkback/parameters/feature_persistent
Date: September 2020
KernelVersion: 5.10
-Contact: SeongJae Park <sjpark@amazon.de>
+Contact: Maximilian Heyne <mheyne@amazon.de>
Description:
Whether to enable the persistent grants feature or not. Note
- that this option only takes effect on newly created backends.
+ that this option only takes effect on newly connected backends.
The default is Y (enable).
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
index 28008905615f..4d36c5a10546 100644
--- a/Documentation/ABI/testing/sysfs-driver-xen-blkfront
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -12,8 +12,8 @@ Description:
What: /sys/module/xen_blkfront/parameters/feature_persistent
Date: September 2020
KernelVersion: 5.10
-Contact: SeongJae Park <sjpark@amazon.de>
+Contact: Maximilian Heyne <mheyne@amazon.de>
Description:
Whether to enable the persistent grants feature or not. Note
- that this option only takes effect on newly created frontends.
+ that this option only takes effect on newly connected frontends.
The default is Y (enable).
diff --git a/Documentation/ABI/testing/sysfs-driver-xilinx-tmr-manager b/Documentation/ABI/testing/sysfs-driver-xilinx-tmr-manager
new file mode 100644
index 000000000000..57b9b68a73ee
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xilinx-tmr-manager
@@ -0,0 +1,16 @@
+What: /sys/devices/platform/amba_pl/<dev>/errcnt
+Date: Nov 2022
+Contact: appana.durga.kedareswara.rao@amd.com
+Description: This control file provides the fault detection count.
+ This file cannot be written.
+ Example:
+ # cat /sys/devices/platform/amba_pl/44a10000.tmr_manager/errcnt
+ 1
+
+What: /sys/devices/platform/amba_pl/<dev>/dis_block_break
+Date: Nov 2022
+Contact: appana.durga.kedareswara.rao@amd.com
+Description:	Writing any value to this control file enables the break
+		signal.
+		This file is write only.
+ Example:
+ # echo <any value> > /sys/devices/platform/amba_pl/44a10000.tmr_manager/dis_block_break
diff --git a/Documentation/ABI/testing/sysfs-driver-zynqmp-fpga b/Documentation/ABI/testing/sysfs-driver-zynqmp-fpga
new file mode 100644
index 000000000000..8f93d27b6d91
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-zynqmp-fpga
@@ -0,0 +1,73 @@
+What: /sys/bus/platform/drivers/zynqmp_fpga_manager/firmware:zynqmp-firmware:pcap/status
+Date: February 2023
+KernelVersion: 6.4
+Contact: Nava kishore Manne <nava.kishore.manne@amd.com>
+Description:	(RO) Read FPGA status.
+		Read returns a hexadecimal value that tells the current status
+		of the FPGA device. Each bit position in the status value is
+		described below (see ug570, chapter 9).
+ https://docs.xilinx.com/v/u/en-US/ug570-ultrascale-configuration
+
+ ====================== ==============================================
+ BIT(0) 0: No CRC error
+ 1: CRC error
+
+ BIT(1) 0: Decryptor security not set
+ 1: Decryptor security set
+
+ BIT(2) 0: MMCMs/PLLs are not locked
+ 1: MMCMs/PLLs are locked
+
+ BIT(3) 0: DCI not matched
+ 1: DCI matched
+
+ BIT(4) 0: Start-up sequence has not finished
+ 1: Start-up sequence has finished
+
+ BIT(5) 0: All I/Os are placed in High-Z state
+ 1: All I/Os behave as configured
+
+ BIT(6) 0: Flip-flops and block RAM are write disabled
+ 1: Flip-flops and block RAM are write enabled
+
+ BIT(7) 0: GHIGH_B_STATUS asserted
+ 1: GHIGH_B_STATUS deasserted
+
+ BIT(8) to BIT(10) Status of the mode pins
+
+ BIT(11) 0: Initialization has not finished
+ 1: Initialization finished
+
+ BIT(12) Value on INIT_B_PIN pin
+
+ BIT(13) 0: Signal not released
+ 1: Signal released
+
+ BIT(14) Value on DONE_PIN pin.
+
+ BIT(15) 0: No IDCODE_ERROR
+ 1: IDCODE_ERROR
+
+ BIT(16) 0: No SECURITY_ERROR
+ 1: SECURITY_ERROR
+
+ BIT(17) System Monitor over-temperature if set
+
+ BIT(18) to BIT(20) Start-up state machine (0 to 7)
+ Phase 0 = 000
+ Phase 1 = 001
+ Phase 2 = 011
+ Phase 3 = 010
+ Phase 4 = 110
+ Phase 5 = 111
+ Phase 6 = 101
+ Phase 7 = 100
+
+ BIT(25) to BIT(26) Indicates the detected bus width
+ 00 = x1
+ 01 = x8
+ 10 = x16
+ 11 = x32
+ ====================== ==============================================
+
+ The other bits are reserved.
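+
+		Example (the value shown is purely illustrative):
+
+		# cat /sys/bus/platform/drivers/zynqmp_fpga_manager/firmware:zynqmp-firmware:pcap/status
+		0x3fff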
diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-esrt b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
index 31b57676d4ad..4c2d440487dd 100644
--- a/Documentation/ABI/testing/sysfs-firmware-efi-esrt
+++ b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
@@ -24,14 +24,14 @@ Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: The version of the ESRT structure provided by the firmware.
-What: /sys/firmware/efi/esrt/entries/entry$N/
+What: /sys/firmware/efi/esrt/entries/entry<N>/
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: Each ESRT entry is identified by a GUID, and each gets a
subdirectory under entries/ .
example: /sys/firmware/efi/esrt/entries/entry0/
-What: /sys/firmware/efi/esrt/entries/entry$N/fw_type
+What: /sys/firmware/efi/esrt/entries/entry<N>/fw_type
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: What kind of firmware entry this is:
@@ -43,33 +43,33 @@ Description: What kind of firmware entry this is:
3 UEFI Driver
== ===============
-What: /sys/firmware/efi/esrt/entries/entry$N/fw_class
+What: /sys/firmware/efi/esrt/entries/entry<N>/fw_class
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: This is the entry's guid, and will match the directory name.
-What: /sys/firmware/efi/esrt/entries/entry$N/fw_version
+What: /sys/firmware/efi/esrt/entries/entry<N>/fw_version
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: The version of the firmware currently installed. This is a
32-bit unsigned integer.
-What: /sys/firmware/efi/esrt/entries/entry$N/lowest_supported_fw_version
+What: /sys/firmware/efi/esrt/entries/entry<N>/lowest_supported_fw_version
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: The lowest version of the firmware that can be installed.
-What: /sys/firmware/efi/esrt/entries/entry$N/capsule_flags
+What: /sys/firmware/efi/esrt/entries/entry<N>/capsule_flags
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: Flags that must be passed to UpdateCapsule()
-What: /sys/firmware/efi/esrt/entries/entry$N/last_attempt_version
+What: /sys/firmware/efi/esrt/entries/entry<N>/last_attempt_version
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: The last firmware version for which an update was attempted.
-What: /sys/firmware/efi/esrt/entries/entry$N/last_attempt_status
+What: /sys/firmware/efi/esrt/entries/entry<N>/last_attempt_status
Date: February 2015
Contact: Peter Jones <pjones@redhat.com>
Description: The result of the last firmware update attempt for the
diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
new file mode 100644
index 000000000000..141a6b371469
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
@@ -0,0 +1,29 @@
+What: /sys/firmware/papr/energy_scale_info
+Date: February 2022
+Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: Directory hosting a set of platform attributes like
+ energy/frequency on Linux running as a PAPR guest.
+
+ Each file in a directory contains a platform
+ attribute hierarchy pertaining to performance/
+ energy-savings mode and processor frequency.
+
+What: /sys/firmware/papr/energy_scale_info/<id>
+Date: February 2022
+Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: Energy, frequency attributes directory for POWERVM servers
+
+What: /sys/firmware/papr/energy_scale_info/<id>/desc
+Date: February 2022
+Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: String description of the energy attribute of <id>
+
+What: /sys/firmware/papr/energy_scale_info/<id>/value
+Date: February 2022
+Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: Numeric value of the energy attribute of <id>
+
+What: /sys/firmware/papr/energy_scale_info/<id>/value_desc
+Date: February 2022
+Contact: Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
+Description: String value of the energy attribute of <id>
diff --git a/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg b/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg
index ee0d6dbc810e..54d1bfd0db12 100644
--- a/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg
+++ b/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg
@@ -12,8 +12,9 @@ Description:
configuration data to the guest userspace.
The authoritative guest-side hardware interface documentation
- to the fw_cfg device can be found in "docs/specs/fw_cfg.txt"
- in the QEMU source tree.
+ to the fw_cfg device can be found in "docs/specs/fw_cfg.rst"
+ in the QEMU source tree, or online at:
+ https://qemu-project.gitlab.io/qemu/specs/fw_cfg.html
**SysFS fw_cfg Interface**
diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs
new file mode 100644
index 000000000000..284224d1b56f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-erofs
@@ -0,0 +1,18 @@
+What: /sys/fs/erofs/features/
+Date: November 2021
+Contact: "Huang Jianan" <huangjianan@oppo.com>
+Description: Shows all enabled kernel features.
+ Supported features:
+ zero_padding, compr_cfgs, big_pcluster, chunked_file,
+ device_table, compr_head2, sb_chksum, ztailpacking,
+ dedupe, fragments.
+
+What: /sys/fs/erofs/<disk>/sync_decompress
+Date: November 2021
+Contact: "Huang Jianan" <huangjianan@oppo.com>
+Description: Control strategy of sync decompression:
+
+ - 0 (default, auto): enable for readpage, and enable for
+ readahead on atomic contexts only.
+ - 1 (force on): enable for readpage and readahead.
+ - 2 (force off): disable for all situations.
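+
+	For example, sync decompression could be forced on like this
+	(the "sda" disk name is a hypothetical example):
+
+	# echo 1 > /sys/fs/erofs/sda/sync_decompress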
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index f627e705e663..8140fc98f5ae 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -49,15 +49,23 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description:	Controls the in-place-update (IPU) policy for
		updates in f2fs. User can set:
- ==== =================
- 0x01 F2FS_IPU_FORCE
- 0x02 F2FS_IPU_SSR
- 0x04 F2FS_IPU_UTIL
- 0x08 F2FS_IPU_SSR_UTIL
- 0x10 F2FS_IPU_FSYNC
- 0x20 F2FS_IPU_ASYNC,
- 0x40 F2FS_IPU_NOCACHE
- ==== =================
+ ===== =============== ===================================================
+ value policy description
+	0x00  DISABLE         disable IPU (=default option in LFS mode)
+ 0x01 FORCE all the time
+ 0x02 SSR if SSR mode is activated
+	0x04  UTIL            if FS utilization is over threshold
+	0x08  SSR_UTIL        if SSR mode is activated and FS utilization is over
+	                      threshold
+ 0x10 FSYNC activated in fsync path only for high performance
+ flash storages. IPU will be triggered only if the
+	                      # of dirty pages is over min_fsync_blocks.
+ (=default option)
+ 0x20 ASYNC do IPU given by asynchronous write requests
+ 0x40 NOCACHE disable IPU bio cache
+ 0x80 HONOR_OPU_WRITE use OPU write prior to IPU write if inode has
+ FI_OPU_WRITE flag
+ ===== =============== ===================================================
Refer segment.h for details.
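+
+	The flag values above form a bitmask, so, for example, SSR (0x02)
+	and FSYNC (0x10) could be combined as follows (the "sda" disk name
+	is a hypothetical example):
+
+	# echo 0x12 > /sys/fs/f2fs/sda/ipu_policy
+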
@@ -98,13 +106,47 @@ Description: Controls the issue rate of discard commands that consist of small
checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0.
+What: /sys/fs/f2fs/<disk>/max_ordered_discard
+Date: October 2022
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description:	Controls the maximum ordered discard; the unit size is one
+		block (4KB). Set it to 16 by default.
+
+What: /sys/fs/f2fs/<disk>/max_discard_request
+Date: December 2021
+Contact: "Konstantin Vyshetsky" <vkon@google.com>
+Description: Controls the number of discards a thread will issue at a time.
+		A higher number will allow the discard thread to finish its
+		work faster, at the cost of higher latency for incoming I/O.
+
+What: /sys/fs/f2fs/<disk>/min_discard_issue_time
+Date: December 2021
+Contact: "Konstantin Vyshetsky" <vkon@google.com>
+Description: Controls the interval the discard thread will wait between
+ issuing discard requests when there are discards to be issued and
+ no I/O aware interruptions occur.
+
+What: /sys/fs/f2fs/<disk>/mid_discard_issue_time
+Date: December 2021
+Contact: "Konstantin Vyshetsky" <vkon@google.com>
+Description: Controls the interval the discard thread will wait between
+ issuing discard requests when there are discards to be issued and
+ an I/O aware interruption occurs.
+
+What: /sys/fs/f2fs/<disk>/max_discard_issue_time
+Date: December 2021
+Contact: "Konstantin Vyshetsky" <vkon@google.com>
+Description: Controls the interval the discard thread will wait when there are
+ no discard operations to be issued.
+
What: /sys/fs/f2fs/<disk>/discard_granularity
Date: July 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Controls discard granularity of inner discard thread. Inner thread
will not issue discards with size that is smaller than granularity.
The unit size is one block(4KB), now only support configuring
- in range of [1, 512]. Default value is 4(=16KB).
+ in range of [1, 512]. Default value is 16.
+ For small devices, default value is 1.
What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019
@@ -112,6 +154,11 @@ Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Set timeout to issue discard commands during umount.
Default: 5 secs
+What: /sys/fs/f2fs/<disk>/pending_discard
+Date: November 2021
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description: Shows the number of pending discard commands in the queue.
+
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -143,12 +190,6 @@ Description: Controls the memory footprint used by free nids and cached
nat entries. By default, 1 is set, which indicates
10 MB / 1 GB RAM.
-What: /sys/fs/f2fs/<disk>/batched_trim_sections
-Date: February 2015
-Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description: Controls the trimming rate in batch mode.
- <deprecated>
-
What: /sys/fs/f2fs/<disk>/cp_interval
Date: October 2015
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
@@ -202,7 +243,7 @@ Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/features
Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/
+Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/>
Shows all enabled features in current device.
Supported features:
encryption, blkzoned, extra_attr, projquota, inode_checksum,
@@ -264,11 +305,16 @@ Description: Shows current reserved blocks in system, it may be temporarily
What: /sys/fs/f2fs/<disk>/gc_urgent
Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
-Description: Do background GC aggressively when set. When gc_urgent = 1,
- background thread starts to do GC by given gc_urgent_sleep_time
- interval. When gc_urgent = 2, F2FS will lower the bar of
- checking idle in order to process outstanding discard commands
- and GC a little bit aggressively. It is set to 0 by default.
+Description: Do background GC aggressively when set. Set to 0 by default.
+ gc urgent high(1): does GC forcibly in a period of given
+ gc_urgent_sleep_time and ignores I/O idling check. uses greedy
+ GC approach and turns SSR mode on.
+ gc urgent low(2): lowers the bar of checking I/O idling in
+ order to process outstanding discard commands and GC a
+ little bit aggressively. uses cost benefit GC approach.
+ gc urgent mid(3): does GC forcibly in a period of given
+		gc_urgent_sleep_time and executes a mid-level I/O idling check.
+ uses cost benefit GC approach.
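+
+		For example, GC urgent high mode could be entered like this
+		(the "sda" disk name is a hypothetical example):
+
+		# echo 1 > /sys/fs/f2fs/sda/gc_urgent
+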
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017
@@ -425,8 +471,33 @@ Description: Show status of f2fs superblock in real time.
0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP
0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted
0x2000 SBI_IS_RESIZEFS resizefs is in process
+		0x4000 SBI_IS_FREEZING       freeze fs is in process
====== ===================== =================================
+What: /sys/fs/f2fs/<disk>/stat/cp_status
+Date: September 2022
+Contact: "Chao Yu" <chao.yu@oppo.com>
+Description: Show status of f2fs checkpoint in real time.
+
+ =============================== ==============================
+ cp flag value
+ CP_UMOUNT_FLAG 0x00000001
+ CP_ORPHAN_PRESENT_FLAG 0x00000002
+ CP_COMPACT_SUM_FLAG 0x00000004
+ CP_ERROR_FLAG 0x00000008
+ CP_FSCK_FLAG 0x00000010
+ CP_FASTBOOT_FLAG 0x00000020
+ CP_CRC_RECOVERY_FLAG 0x00000040
+ CP_NAT_BITS_FLAG 0x00000080
+ CP_TRIMMED_FLAG 0x00000100
+ CP_NOCRC_RECOVERY_FLAG 0x00000200
+ CP_LARGE_NAT_BITMAP_FLAG 0x00000400
+ CP_QUOTA_NEED_FSCK_FLAG 0x00000800
+ CP_DISABLED_FLAG 0x00001000
+ CP_DISABLED_QUICK_FLAG 0x00002000
+ CP_RESIZEFS_FLAG 0x00004000
+ =============================== ==============================
+
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
Date: January 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
@@ -498,7 +569,7 @@ Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show how many segments have been reclaimed by GC during a specific
GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy,
- 3: GC idle AT, 4: GC urgent high, 5: GC urgent low)
+		3: GC idle AT, 4: GC urgent high, 5: GC urgent low, 6: GC urgent mid)
You can re-initialize this value to "0".
What: /sys/fs/f2fs/<disk>/gc_segment_mode
@@ -512,3 +583,160 @@ Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can control the multiplier value of bdi device readahead window size
between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option.
+
+What: /sys/fs/f2fs/<disk>/max_fragment_chunk
+Date: August 2021
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: With "mode=fragment:block" mount options, we can scatter block allocation.
+ f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
+ in the length of 1..<max_fragment_hole> by turns. This value can be set
+ between 1..512 and the default value is 4.
+
+What: /sys/fs/f2fs/<disk>/max_fragment_hole
+Date: August 2021
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: With "mode=fragment:block" mount options, we can scatter block allocation.
+ f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
+ in the length of 1..<max_fragment_hole> by turns. This value can be set
+ between 1..512 and the default value is 4.
+
+What: /sys/fs/f2fs/<disk>/gc_remaining_trials
+Date: October 2022
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description:	You can set the trial count limit for GC urgent and idle mode
+		with this value. If the GC thread reaches the limit, the mode
+		will turn back to GC normal mode. By default, the value is
+		zero, which means there is no limit.
+
+What: /sys/fs/f2fs/<disk>/max_roll_forward_node_blocks
+Date: January 2022
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description: Controls max # of node block writes to be used for roll forward
+ recovery. This can limit the roll forward recovery time.
+
+What: /sys/fs/f2fs/<disk>/unusable_blocks_per_sec
+Date: June 2022
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:	Shows the number of unusable blocks in a section, as defined
+		by the zone capacity reported by the underlying zoned device.
+
+What: /sys/fs/f2fs/<disk>/current_atomic_write
+Date: July 2022
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: Show the total current atomic write block count, which is not committed yet.
+ This is a read-only entry.
+
+What: /sys/fs/f2fs/<disk>/peak_atomic_write
+Date: July 2022
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: Show the peak value of total current atomic write block count after boot.
+		Writing "0" here re-initializes the value to "0".
+
+What: /sys/fs/f2fs/<disk>/committed_atomic_block
+Date: July 2022
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: Show the accumulated total committed atomic write block count after boot.
+		Writing "0" here re-initializes the value to "0".
+
+What: /sys/fs/f2fs/<disk>/revoked_atomic_block
+Date: July 2022
+Contact: "Daeho Jeong" <daehojeong@google.com>
+Description: Show the accumulated total revoked atomic write block count after boot.
+		Writing "0" here re-initializes the value to "0".
+
+What: /sys/fs/f2fs/<disk>/gc_mode
+Date: October 2022
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description: Show the current gc_mode as a string.
+ This is a read-only entry.
+
+What: /sys/fs/f2fs/<disk>/discard_urgent_util
+Date: November 2022
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description: When space utilization exceeds this, do background DISCARD aggressively.
+		DISCARD will be issued forcibly in a period of given
+		min_discard_issue_time when the number of discards is not 0,
+		and the discard granularity is set to 1.
+ Default: 80
+
+What: /sys/fs/f2fs/<disk>/hot_data_age_threshold
+Date: November 2022
+Contact: "Ping Xiong" <xiongping1@xiaomi.com>
+Description: When DATA SEPARATION is on, it controls the age threshold to indicate
+		the data blocks as hot. By default it is initialized to 262144
+		blocks (equal to 1GB).
+
+What: /sys/fs/f2fs/<disk>/warm_data_age_threshold
+Date: November 2022
+Contact: "Ping Xiong" <xiongping1@xiaomi.com>
+Description: When DATA SEPARATION is on, it controls the age threshold to indicate
+		the data blocks as warm. By default it is initialized to
+		2621440 blocks (equal to 10GB).
+
+What: /sys/fs/f2fs/<disk>/fault_rate
+Date: May 2016
+Contact: "Sheng Yong" <shengyong@oppo.com>
+Contact: "Chao Yu" <chao@kernel.org>
+Description: Enable fault injection in all supported types with
+ specified injection rate.
+
+What: /sys/fs/f2fs/<disk>/fault_type
+Date: May 2016
+Contact: "Sheng Yong" <shengyong@oppo.com>
+Contact: "Chao Yu" <chao@kernel.org>
+Description: Support configuring fault injection type, should be
+ enabled with fault_injection option, fault type value
+ is shown below, it supports single or combined type.
+
+ =================== ===========
+ Type_Name Type_Value
+ =================== ===========
+ FAULT_KMALLOC 0x000000001
+ FAULT_KVMALLOC 0x000000002
+ FAULT_PAGE_ALLOC 0x000000004
+ FAULT_PAGE_GET 0x000000008
+ FAULT_ALLOC_BIO 0x000000010 (obsolete)
+ FAULT_ALLOC_NID 0x000000020
+ FAULT_ORPHAN 0x000000040
+ FAULT_BLOCK 0x000000080
+ FAULT_DIR_DEPTH 0x000000100
+ FAULT_EVICT_INODE 0x000000200
+ FAULT_TRUNCATE 0x000000400
+ FAULT_READ_IO 0x000000800
+ FAULT_CHECKPOINT 0x000001000
+ FAULT_DISCARD 0x000002000
+ FAULT_WRITE_IO 0x000004000
+ FAULT_SLAB_ALLOC 0x000008000
+ FAULT_DQUOT_INIT 0x000010000
+ FAULT_LOCK_OP 0x000020000
+ FAULT_BLKADDR 0x000040000
+ =================== ===========
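+
+	For example, FAULT_BLOCK (0x080) and FAULT_TRUNCATE (0x400)
+	could be combined like this (the "sda" disk name is a
+	hypothetical example):
+
+	# echo 0x480 > /sys/fs/f2fs/sda/fault_type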
+
+What: /sys/fs/f2fs/<disk>/discard_io_aware_gran
+Date: January 2023
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description:	Controls background discard granularity of the inner discard
+		thread when the system is not idle. The inner thread will not
+		issue discards with a size smaller than the granularity. The
+		unit size is one block (4KB); only values in the range
+		[0, 512] are supported.
+ Default: 512
+
+What: /sys/fs/f2fs/<disk>/last_age_weight
+Date: January 2023
+Contact: "Ping Xiong" <xiongping1@xiaomi.com>
+Description: When DATA SEPARATION is on, it controls the weight of last data block age.
+
+What: /sys/fs/f2fs/<disk>/compress_watermark
+Date: February 2023
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description:	When compress cache is on, it controls the free memory
+		watermark in order to limit caching of compressed pages. If
+		free memory is lower than the watermark, caching of compressed
+		pages is denied. The value should be in the range (0, 100]; by
+		default it is initialized to 20(%).
+
+What: /sys/fs/f2fs/<disk>/compress_percent
+Date: February 2023
+Contact: "Yangtao Li" <frank.li@vivo.com>
+Description:	When compress cache is on, it controls the cached page
+		percent (compressed pages / free_ram) in order to limit
+		caching of compressed pages. If the cached page percent
+		exceeds the threshold, caching of compressed pages is denied.
+		The value should be in the range (0, 100]; by default it is
+		initialized to 20(%).
diff --git a/Documentation/ABI/testing/sysfs-fs-ubifs b/Documentation/ABI/testing/sysfs-fs-ubifs
new file mode 100644
index 000000000000..af5afda30220
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ubifs
@@ -0,0 +1,35 @@
+What: /sys/fs/ubifsX_Y/error_magic
+Date: October 2021
+KernelVersion: 5.16
+Contact: linux-mtd@lists.infradead.org
+Description:
+ Exposes magic errors: every node starts with a magic number.
+
+ This counter keeps track of the number of accesses of nodes
+ with a corrupted magic number.
+
+ The counter is reset to 0 with a remount.
+
+What: /sys/fs/ubifsX_Y/error_node
+Date: October 2021
+KernelVersion: 5.16
+Contact: linux-mtd@lists.infradead.org
+Description:
+ Exposes node errors. Every node embeds its type.
+
+ This counter keeps track of the number of accesses of nodes
+ with a corrupted node type.
+
+ The counter is reset to 0 with a remount.
+
+What: /sys/fs/ubifsX_Y/error_crc
+Date: October 2021
+KernelVersion: 5.16
+Contact: linux-mtd@lists.infradead.org
+Description:
+ Exposes crc errors: every node embeds a crc checksum.
+
+ This counter keeps track of the number of accesses of nodes
+ with a bad crc checksum.
+
+ The counter is reset to 0 with a remount.
diff --git a/Documentation/ABI/testing/sysfs-kernel-address_bits b/Documentation/ABI/testing/sysfs-kernel-address_bits
new file mode 100644
index 000000000000..5d09ff84d4d6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-address_bits
@@ -0,0 +1,10 @@
+What:		/sys/kernel/address_bits
+Date: May 2023
+KernelVersion: 6.3
+Contact: Thomas Weißschuh <linux@weissschuh.net>
+Description:
+ The address size of the running kernel in bits.
+
+ Access: Read
+
+Users: util-linux
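+
+	Example (on a 64-bit kernel):
+
+	# cat /sys/kernel/address_bits
+	64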
diff --git a/Documentation/ABI/testing/sysfs-kernel-cpu_byteorder b/Documentation/ABI/testing/sysfs-kernel-cpu_byteorder
new file mode 100644
index 000000000000..f0e6ac1b5356
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-cpu_byteorder
@@ -0,0 +1,12 @@
+What: /sys/kernel/cpu_byteorder
+Date: February 2023
+KernelVersion: 6.2
+Contact: Thomas Weißschuh <linux@weissschuh.net>
+Description:
+ The endianness of the running kernel.
+
+ Access: Read
+
+ Valid values:
+ "little", "big"
+Users: util-linux
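+
+	Example (on a little-endian machine):
+
+	# cat /sys/kernel/cpu_byteorder
+	little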
diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
index b15af6a5bc08..a42d4383d999 100644
--- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups
+++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
@@ -53,7 +53,6 @@ Description: /sys/kernel/iommu_groups/<grp_id>/type shows the type of default
The default domain type of a group may be modified only when
- - The group has only one device.
- The device in the group is not bound to any device driver.
So, the users must unbind the appropriate driver before
changing the default domain type.
diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch
index bea7bd5a1d5f..a5df9b4910dc 100644
--- a/Documentation/ABI/testing/sysfs-kernel-livepatch
+++ b/Documentation/ABI/testing/sysfs-kernel-livepatch
@@ -55,6 +55,14 @@ Description:
The object directory contains subdirectories for each function
that is patched within the object.
+What: /sys/kernel/livepatch/<patch>/<object>/patched
+Date: August 2022
+KernelVersion: 6.1.0
+Contact: live-patching@vger.kernel.org
+Description:
+ An attribute which indicates whether the object is currently
+ patched.
+
What: /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
Date: Nov 2014
KernelVersion: 3.19.0
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
new file mode 100644
index 000000000000..2744f21b5a6b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -0,0 +1,346 @@
+What:		/sys/kernel/mm/damon/
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Interface for Data Access MONitoring (DAMON). Contains files
+ for controlling DAMON. For more details on DAMON itself,
+ please refer to Documentation/admin-guide/mm/damon/index.rst.
+
+What: /sys/kernel/mm/damon/admin/
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description:	Interface for privileged users of DAMON. Contains files for
+		controlling DAMON that are aimed to be used by privileged
+		users.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/nr_kdamonds
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description:	Writing a number 'N' to this file creates N directories
+		named '0' to 'N-1' under the kdamonds/ directory, each for
+		controlling a DAMON worker thread (kdamond).
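+
+		For example, writing '1' creates a single directory named '0':
+
+		# echo 1 > /sys/kernel/mm/damon/admin/kdamonds/nr_kdamonds
+		# ls /sys/kernel/mm/damon/admin/kdamonds
+		0  nr_kdamonds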
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/state
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description:	Writing 'on' or 'off' to this file makes the kdamond start or
+		stop, respectively. Reading the file returns the keywords
+		based on the current status. Writing 'commit' to this file
+		makes the kdamond read the user inputs in the sysfs files
+		except 'state' again. Writing 'update_schemes_stats' to the
+		file updates contents of schemes stats files of the kdamond.
+		Writing 'update_schemes_tried_regions' to the file updates
+		contents of the 'tried_regions' directory of every scheme
+		directory of this kdamond. Writing
+		'clear_schemes_tried_regions' to the file removes contents of
+		the 'tried_regions' directory.
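+
+		For example, assuming the kdamond directory '0' exists and is
+		fully configured:
+
+		# echo on > /sys/kernel/mm/damon/admin/kdamonds/0/state
+		# cat /sys/kernel/mm/damon/admin/kdamonds/0/state
+		on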
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the pid of the kdamond if it is
+ running.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/nr_contexts
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description:	Writing a number 'N' to this file creates N directories
+		named '0' to 'N-1' under the contexts/ directory, each for
+		controlling a DAMON context.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/avail_operations
+Date: Apr 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the available monitoring operations
+ sets on the currently running kernel.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/operations
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a keyword for a monitoring operations set ('vaddr' for
+ virtual address spaces monitoring, 'fvaddr' for fixed virtual
+ address ranges monitoring, and 'paddr' for the physical address
+		space monitoring) to this file makes the context use the
+ operations set. Reading the file returns the keyword for the
+ operations set the context is set to use.
+
+		Note that only the operations sets that are listed in the
+		'avail_operations' file are valid inputs.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/sample_us
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a value to this file sets the sampling interval of the
+ DAMON context in microseconds as the value. Reading this file
+ returns the value.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/aggr_us
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a value to this file sets the aggregation interval of
+ the DAMON context in microseconds as the value. Reading this
+ file returns the value.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/update_us
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a value to this file sets the update interval of the
+ DAMON context in microseconds as the value. Reading this file
+ returns the value.
+
+What:		/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/nr_regions/min
+Date:		Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a value to this file sets the minimum number of
+ monitoring regions of the DAMON context as the value. Reading
+ this file returns the value.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/nr_regions/max
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a value to this file sets the maximum number of
+ monitoring regions of the DAMON context as the value. Reading
+ this file returns the value.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/nr_targets
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description:	Writing a number 'N' to this file creates N directories
+		named '0' to 'N-1' under the targets/ directory, each for
+		controlling a DAMON target of the context.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/pid_target
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the pid of
+ the target process if the context is for virtual address spaces
+ monitoring, respectively.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/nr_regions
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description:	Writing a number 'N' to this file creates N directories
+		named '0' to 'N-1' under the regions/ directory, each for
+		setting a DAMON target memory region of the context. In
+		case of the virtual address space monitoring, DAMON
+		automatically sets the target memory region based on the
+		target processes' mappings.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/<R>/start
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the start
+ address of the monitoring region.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/<R>/end
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the end
+ address of the monitoring region.
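+
+ For example, a sketch restricting monitoring to one region
+ (the addresses are illustrative; only meaningful for 'fvaddr'
+ and 'paddr' contexts)::
+
+     cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/targets/0/regions
+     echo 1 > nr_regions
+     echo $((1 * 1024 * 1024 * 1024)) > 0/start  # 1 GiB
+     echo $((2 * 1024 * 1024 * 1024)) > 0/end    # 2 GiB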
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/nr_schemes
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a number 'N' to this file creates the number of
+ directories for controlling each DAMON-based operation scheme
+ of the context named '0' to 'N-1' under the schemes/ directory.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/action
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the action
+ of the scheme.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/sz/min
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the minimum
+ size of the scheme's target regions in bytes.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/sz/max
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the maximum
+ size of the scheme's target regions in bytes.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/nr_accesses/min
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the minimum
+ 'nr_accesses' of the scheme's target regions.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/nr_accesses/max
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the maximum
+ 'nr_accesses' of the scheme's target regions.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/age/min
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the minimum
+ 'age' of the scheme's target regions.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/access_pattern/age/max
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the maximum
+ 'age' of the scheme's target regions.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/ms
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the time
+ quota of the scheme in milliseconds.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/bytes
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the size
+ quota of the scheme in bytes.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/reset_interval_ms
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the quotas
+ charge reset interval of the scheme in milliseconds.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/sz_permil
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the
+ under-quota limit regions prioritization weight for 'size' in
+ permil.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/nr_accesses_permil
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the
+ under-quota limit regions prioritization weight for
+ 'nr_accesses' in permil.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/age_permil
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the
+ under-quota limit regions prioritization weight for 'age' in
+ permil.
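+
+ For example, a minimal sketch of one scheme that pages out
+ regions unaccessed for a while, limited by a size quota (all
+ values are illustrative)::
+
+     cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes
+     echo 1 > nr_schemes
+     echo pageout > 0/action
+     echo 4096 > 0/access_pattern/sz/min
+     echo $((1024 * 1024 * 1024)) > 0/access_pattern/sz/max
+     echo 0 > 0/access_pattern/nr_accesses/min
+     echo 0 > 0/access_pattern/nr_accesses/max
+     echo 10 > 0/access_pattern/age/min
+     echo 4294967295 > 0/access_pattern/age/max
+     echo $((128 * 1024 * 1024)) > 0/quotas/bytes
+     echo 1000 > 0/quotas/reset_interval_ms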
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/metric
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the metric
+ of the watermarks for the scheme. The writable/readable
+ keywords for this file are 'none' for disabling the watermarks
+ feature, or 'free_mem_rate' for the system's global free memory
+ rate in permil.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/interval_us
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the metric
+ check interval of the watermarks for the scheme in
+ microseconds.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/high
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the high
+ watermark of the scheme in permil.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/mid
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the mid
+ watermark of the scheme in permil.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/watermarks/low
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the low
+ watermark of the scheme in permil.
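+
+ For example, a sketch that keeps the scheme active only while
+ the global free memory rate stays between the low and high
+ watermarks (values in permil, illustrative)::
+
+     cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/watermarks
+     echo free_mem_rate > metric
+     echo 1000000 > interval_us  # check every second
+     echo 500 > high
+     echo 400 > mid
+     echo 50 > low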
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/nr_filters
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing a number 'N' to this file creates the number of
+ directories for setting filters of the scheme named '0' to
+ 'N-1' under the filters/ directory.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/type
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing to and reading from this file sets and gets the type of
+ memory of interest. 'anon' for anonymous pages, or 'memcg'
+ for a specific memory cgroup, can be written and read.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/memcg_path
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: If 'memcg' is written to the 'type' file, writing to and
+ reading from this file sets and gets the path to the memory
+ cgroup of interest.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/matching
+Date: Dec 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Writing 'Y' or 'N' to this file sets whether to filter out
+ pages that do or do not match the 'type' and 'memcg_path',
+ respectively. 'Filter out' means the action of the scheme
+ will not be applied to those pages.
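+
+ For example, a sketch that excludes one cgroup's pages from
+ the scheme's action (the cgroup path is illustrative)::
+
+     cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters
+     echo 1 > nr_filters
+     echo memcg > 0/type
+     echo /workloads/protected > 0/memcg_path
+     echo Y > 0/matching  # filter out matching pages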
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the number of regions that the
+ scheme has tried to apply its action to.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_tried
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the total size of regions that the
+ scheme has tried to apply its action to, in bytes.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_applied
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the number of regions that the
+ scheme has successfully applied its action to.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/sz_applied
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the total size of regions that the
+ scheme has successfully applied its action to, in bytes.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/qt_exceeds
+Date: Mar 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the number of times the scheme's
+ quotas have been exceeded.
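+
+ For example, to dump all statistics of scheme '0' at once (a
+ sketch)::
+
+     grep . /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/stats/*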
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/start
+Date: Oct 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the start address of a memory region
+ that the corresponding DAMON-based operation scheme has
+ tried to apply its action to.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/end
+Date: Oct 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the end address of a memory region
+ that the corresponding DAMON-based operation scheme has
+ tried to apply its action to.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/nr_accesses
+Date: Oct 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the 'nr_accesses' of a memory region
+ that the corresponding DAMON-based operation scheme has
+ tried to apply its action to.
+
+What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/tried_regions/<R>/age
+Date: Oct 2022
+Contact: SeongJae Park <sj@kernel.org>
+Description: Reading this file returns the 'age' of a memory region that
+ the corresponding DAMON-based operation scheme has tried
+ to apply its action to.
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
index 1c9bed5595f5..6041a025b65a 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -41,7 +41,7 @@ Description: Kernel Samepage Merging daemon sysfs interface
sleep_millisecs: how many milliseconds ksm should sleep between
scans.
- See Documentation/vm/ksm.rst for more information.
+ See Documentation/mm/ksm.rst for more information.
What: /sys/kernel/mm/ksm/merge_across_nodes
Date: January 2013
@@ -51,3 +51,11 @@ Description: Control merging pages across different NUMA nodes.
When it is set to 0 only pages from the same node are merged,
otherwise pages from all nodes can be merged together (default).
+
+What: /sys/kernel/mm/ksm/general_profit
+Date: April 2023
+KernelVersion: 6.4
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Measure how effective KSM is.
+ general_profit: how effective KSM is. The formula for the
+ calculation is in Documentation/admin-guide/mm/ksm.rst.
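+
+ For example (a sketch; the reported value differs per
+ system)::
+
+     cat /sys/kernel/mm/ksm/general_profit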
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
new file mode 100644
index 000000000000..721a05b90109
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
@@ -0,0 +1,25 @@
+What: /sys/devices/virtual/memory_tiering/
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: A collection of all the memory tiers allocated.
+
+ Individual memory tier details are contained in subdirectories
+ named by the abstract distance of the memory tier.
+
+ /sys/devices/virtual/memory_tiering/memory_tierN/
+
+
+What: /sys/devices/virtual/memory_tiering/memory_tierN/
+ /sys/devices/virtual/memory_tiering/memory_tierN/nodelist
+Date: August 2022
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description: Directory with details of a specific memory tier
+
+ This is the directory containing information about a particular
+ memory tier, memtierN, where N is derived based on abstract distance.
+
+ A smaller value of N implies a higher (faster) memory tier in the
+ hierarchy.
+
+ nodelist: NUMA nodes that are part of this memory tier.
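+
+ For example, to list the nodes of every present tier (a
+ sketch)::
+
+     grep . /sys/devices/virtual/memory_tiering/memory_tier*/nodelist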
+
diff --git a/Documentation/ABI/testing/sysfs-kernel-oops_count b/Documentation/ABI/testing/sysfs-kernel-oops_count
new file mode 100644
index 000000000000..156cca9dbc96
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-oops_count
@@ -0,0 +1,6 @@
+What: /sys/kernel/oops_count
+Date: November 2022
+KernelVersion: 6.2.0
+Contact: Linux Kernel Hardening List <linux-hardening@vger.kernel.org>
+Description:
+ Shows how many times the system has Oopsed since last boot.
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index c9f12baf8baa..cd5fb8fa3ddf 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -10,7 +10,7 @@ Description:
any cache it aliases, if any).
Users: kernel memory tuning tools
-What: /sys/kernel/slab/cache/aliases
+What: /sys/kernel/slab/<cache>/aliases
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -19,7 +19,7 @@ Description:
The aliases file is read-only and specifies how many caches
have merged into this cache.
-What: /sys/kernel/slab/cache/align
+What: /sys/kernel/slab/<cache>/align
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -28,7 +28,7 @@ Description:
The align file is read-only and specifies the cache's object
alignment in bytes.
-What: /sys/kernel/slab/cache/alloc_calls
+What: /sys/kernel/slab/<cache>/alloc_calls
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -37,9 +37,9 @@ Description:
The alloc_calls file is read-only and lists the kernel code
locations from which allocations for this cache were performed.
The alloc_calls file only contains information if debugging is
- enabled for that cache (see Documentation/vm/slub.rst).
+ enabled for that cache (see Documentation/mm/slub.rst).
-What: /sys/kernel/slab/cache/alloc_fastpath
+What: /sys/kernel/slab/<cache>/alloc_fastpath
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -50,7 +50,7 @@ Description:
current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/alloc_from_partial
+What: /sys/kernel/slab/<cache>/alloc_from_partial
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -62,7 +62,7 @@ Description:
count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/alloc_refill
+What: /sys/kernel/slab/<cache>/alloc_refill
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -73,7 +73,7 @@ Description:
remote cpu frees. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/alloc_slab
+What: /sys/kernel/slab/<cache>/alloc_slab
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -84,7 +84,7 @@ Description:
clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/alloc_slowpath
+What: /sys/kernel/slab/<cache>/alloc_slowpath
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -96,7 +96,7 @@ Description:
clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/cache_dma
+What: /sys/kernel/slab/<cache>/cache_dma
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -106,7 +106,7 @@ Description:
are from ZONE_DMA.
Available when CONFIG_ZONE_DMA is enabled.
-What: /sys/kernel/slab/cache/cpu_slabs
+What: /sys/kernel/slab/<cache>/cpu_slabs
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -115,7 +115,7 @@ Description:
The cpu_slabs file is read-only and displays how many cpu slabs
are active and their NUMA locality.
-What: /sys/kernel/slab/cache/cpuslab_flush
+What: /sys/kernel/slab/<cache>/cpuslab_flush
Date: April 2009
KernelVersion: 2.6.31
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -128,7 +128,7 @@ Description:
current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/ctor
+What: /sys/kernel/slab/<cache>/ctor
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -138,7 +138,7 @@ Description:
constructor function, which is invoked for each object when a
new slab is allocated.
-What: /sys/kernel/slab/cache/deactivate_empty
+What: /sys/kernel/slab/<cache>/deactivate_empty
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -148,7 +148,7 @@ Description:
was deactivated. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/deactivate_full
+What: /sys/kernel/slab/<cache>/deactivate_full
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -158,7 +158,7 @@ Description:
was deactivated. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/deactivate_remote_frees
+What: /sys/kernel/slab/<cache>/deactivate_remote_frees
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -169,7 +169,7 @@ Description:
remotely. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/deactivate_to_head
+What: /sys/kernel/slab/<cache>/deactivate_to_head
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -180,7 +180,7 @@ Description:
list. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/deactivate_to_tail
+What: /sys/kernel/slab/<cache>/deactivate_to_tail
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -191,7 +191,7 @@ Description:
list. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/destroy_by_rcu
+What: /sys/kernel/slab/<cache>/destroy_by_rcu
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -200,7 +200,7 @@ Description:
The destroy_by_rcu file is read-only and specifies whether
slabs (not objects) are freed by rcu.
-What: /sys/kernel/slab/cache/free_add_partial
+What: /sys/kernel/slab/<cache>/free_add_partial
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -211,7 +211,7 @@ Description:
partial list. It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/free_calls
+What: /sys/kernel/slab/<cache>/free_calls
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -219,9 +219,9 @@ Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
Description:
The free_calls file is read-only and lists the locations of
object frees if slab debugging is enabled (see
- Documentation/vm/slub.rst).
+ Documentation/mm/slub.rst).
-What: /sys/kernel/slab/cache/free_fastpath
+What: /sys/kernel/slab/<cache>/free_fastpath
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -232,7 +232,7 @@ Description:
It can be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/free_frozen
+What: /sys/kernel/slab/<cache>/free_frozen
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -243,7 +243,7 @@ Description:
clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/free_remove_partial
+What: /sys/kernel/slab/<cache>/free_remove_partial
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -255,7 +255,7 @@ Description:
count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/free_slab
+What: /sys/kernel/slab/<cache>/free_slab
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -266,7 +266,7 @@ Description:
the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/free_slowpath
+What: /sys/kernel/slab/<cache>/free_slowpath
Date: February 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -277,7 +277,7 @@ Description:
be written to clear the current count.
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/hwcache_align
+What: /sys/kernel/slab/<cache>/hwcache_align
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -286,7 +286,7 @@ Description:
The hwcache_align file is read-only and specifies whether
objects are aligned on cachelines.
-What: /sys/kernel/slab/cache/min_partial
+What: /sys/kernel/slab/<cache>/min_partial
Date: February 2009
KernelVersion: 2.6.30
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -297,7 +297,7 @@ Description:
allocating new slabs. Such slabs may be reclaimed by utilizing
the shrink file.
-What: /sys/kernel/slab/cache/object_size
+What: /sys/kernel/slab/<cache>/object_size
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -306,7 +306,7 @@ Description:
The object_size file is read-only and specifies the cache's
object size.
-What: /sys/kernel/slab/cache/objects
+What: /sys/kernel/slab/<cache>/objects
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -315,7 +315,7 @@ Description:
The objects file is read-only and displays how many objects are
active and from which nodes they are from.
-What: /sys/kernel/slab/cache/objects_partial
+What: /sys/kernel/slab/<cache>/objects_partial
Date: April 2008
KernelVersion: 2.6.26
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -325,7 +325,7 @@ Description:
objects are on partial slabs and from which nodes they are
from.
-What: /sys/kernel/slab/cache/objs_per_slab
+What: /sys/kernel/slab/<cache>/objs_per_slab
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -333,9 +333,9 @@ Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
Description:
The file objs_per_slab is read-only and specifies how many
objects may be allocated from a single slab of the order
- specified in /sys/kernel/slab/cache/order.
+ specified in /sys/kernel/slab/<cache>/order.
-What: /sys/kernel/slab/cache/order
+What: /sys/kernel/slab/<cache>/order
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -352,7 +352,7 @@ Description:
order is used and this sysfs entry can not be used to change
the order at run time.
-What: /sys/kernel/slab/cache/order_fallback
+What: /sys/kernel/slab/<cache>/order_fallback
Date: April 2008
KernelVersion: 2.6.26
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -365,7 +365,7 @@ Description:
Available when CONFIG_SLUB_STATS is enabled.
-What: /sys/kernel/slab/cache/partial
+What: /sys/kernel/slab/<cache>/partial
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -374,7 +374,7 @@ Description:
The partial file is read-only and displays how many partial
slabs there are and how long each node's list is.
-What: /sys/kernel/slab/cache/poison
+What: /sys/kernel/slab/<cache>/poison
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -383,7 +383,7 @@ Description:
The poison file specifies whether objects should be poisoned
when a new slab is allocated.
-What: /sys/kernel/slab/cache/reclaim_account
+What: /sys/kernel/slab/<cache>/reclaim_account
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -392,7 +392,7 @@ Description:
The reclaim_account file specifies whether the cache's objects
are reclaimable (and grouped by their mobility).
-What: /sys/kernel/slab/cache/red_zone
+What: /sys/kernel/slab/<cache>/red_zone
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -401,7 +401,7 @@ Description:
The red_zone file specifies whether the cache's objects are red
zoned.
-What: /sys/kernel/slab/cache/remote_node_defrag_ratio
+What: /sys/kernel/slab/<cache>/remote_node_defrag_ratio
Date: January 2008
KernelVersion: 2.6.25
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -415,7 +415,7 @@ Description:
Available when CONFIG_NUMA is enabled.
-What: /sys/kernel/slab/cache/sanity_checks
+What: /sys/kernel/slab/<cache>/sanity_checks
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -426,7 +426,7 @@ Description:
checks. Caches that enable sanity_checks cannot be merged with
caches that do not.
-What: /sys/kernel/slab/cache/shrink
+What: /sys/kernel/slab/<cache>/shrink
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -442,7 +442,7 @@ Description:
adversely impact other running applications. So it
should be used with care.
-What: /sys/kernel/slab/cache/slab_size
+What: /sys/kernel/slab/<cache>/slab_size
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -451,7 +451,7 @@ Description:
The slab_size file is read-only and specifies the object size
with metadata (debugging information and alignment) in bytes.
-What: /sys/kernel/slab/cache/slabs
+What: /sys/kernel/slab/<cache>/slabs
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -461,7 +461,7 @@ Description:
there are (both cpu and partial) and from which nodes they are
from.
-What: /sys/kernel/slab/cache/store_user
+What: /sys/kernel/slab/<cache>/store_user
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -470,7 +470,7 @@ Description:
The store_user file specifies whether the location of
allocation or free should be tracked for a cache.
-What: /sys/kernel/slab/cache/total_objects
+What: /sys/kernel/slab/<cache>/total_objects
Date: April 2008
KernelVersion: 2.6.26
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -479,7 +479,7 @@ Description:
The total_objects file is read-only and displays how many total
objects a cache has and from which nodes they are from.
-What: /sys/kernel/slab/cache/trace
+What: /sys/kernel/slab/<cache>/trace
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -488,7 +488,7 @@ Description:
The trace file specifies whether object allocations and frees
should be traced.
-What: /sys/kernel/slab/cache/validate
+What: /sys/kernel/slab/<cache>/validate
Date: May 2007
KernelVersion: 2.6.22
Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
@@ -496,3 +496,24 @@ Contact: Pekka Enberg <penberg@cs.helsinki.fi>,
Description:
Writing to the validate file causes SLUB to traverse all of its
cache's objects and check the validity of metadata.
+
+What: /sys/kernel/slab/<cache>/usersize
+Date: Jun 2017
+Contact: David Windsor <dave@nullcore.net>
+Description:
+ The usersize file is read-only and contains the usercopy
+ region size.
+
+What: /sys/kernel/slab/<cache>/slabs_cpu_partial
+Date: Aug 2011
+Contact: Christoph Lameter <cl@linux.com>
+Description:
+ This read-only file shows the number of partially allocated
+ frozen slabs.
+
+What: /sys/kernel/slab/<cache>/cpu_partial
+Date: Aug 2011
+Contact: Christoph Lameter <cl@linux.com>
+Description:
+ This read-only file shows the number of per-cpu partial
+ pages to keep around.
diff --git a/Documentation/ABI/testing/sysfs-kernel-warn_count b/Documentation/ABI/testing/sysfs-kernel-warn_count
new file mode 100644
index 000000000000..90a029813717
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-warn_count
@@ -0,0 +1,6 @@
+What: /sys/kernel/warn_count
+Date: November 2022
+KernelVersion: 6.2.0
+Contact: Linux Kernel Hardening List <linux-hardening@vger.kernel.org>
+Description:
+ Shows how many times the system has Warned since last boot.
diff --git a/Documentation/ABI/testing/sysfs-mce b/Documentation/ABI/testing/sysfs-mce
new file mode 100644
index 000000000000..83172f50e27c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-mce
@@ -0,0 +1,97 @@
+What: /sys/devices/system/machinecheck/machinecheckX/
+Contact: Andi Kleen <ak@linux.intel.com>
+Date: Feb, 2007
+Description:
+ (X = CPU number)
+
+ Machine checks report internal hardware error conditions
+ detected by the CPU. Uncorrected errors typically cause a
+ machine check (often with panic), corrected ones cause a
+ machine check log entry.
+
+ For more details about the x86 machine check architecture
+ see the Intel and AMD architecture manuals from their
+ developer websites.
+
+ For more details about the architecture
+ see http://one.firstfloor.org/~andi/mce.pdf
+
+ Each CPU has its own directory.
+
+What: /sys/devices/system/machinecheck/machinecheckX/bank<Y>
+Contact: Andi Kleen <ak@linux.intel.com>
+Date: Feb, 2007
+Description:
+ (Y = bank number)
+
+ 64-bit hex bitmask enabling/disabling specific subevents for
+ bank Y.
+
+ When a bit in the bitmask is zero then the respective
+ subevent will not be reported.
+
+ By default all events are enabled.
+
+ Note that the BIOS maintains another mask to disable specific
+ events per bank. This is not visible here.
+
+What: /sys/devices/system/machinecheck/machinecheckX/check_interval
+Contact: Andi Kleen <ak@linux.intel.com>
+Date: Feb, 2007
+Description:
+ The entries appear for each CPU, but they are truly shared
+ between all CPUs.
+
+ How often to poll for corrected machine check errors, in
+ seconds (Note output is hexadecimal). Default 5 minutes.
+ When the poller finds MCEs it triggers an exponential speedup
+ (poll more often) on the polling interval. When the poller
+ stops finding MCEs, it triggers an exponential backoff
+ (poll less often) on the polling interval. The check_interval
+ variable is both the initial and maximum polling interval.
+ 0 means no polling for corrected machine check errors
+ (but some corrected errors might still be reported
+ in other ways)
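+
+ For example (a sketch; since the value is shared, writing it
+ for one CPU is enough)::
+
+     # disable polling for corrected errors
+     echo 0 > /sys/devices/system/machinecheck/machinecheck0/check_interval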
+
+What: /sys/devices/system/machinecheck/machinecheckX/trigger
+Contact: Andi Kleen <ak@linux.intel.com>
+Date: Feb, 2007
+Description:
+ The entries appear for each CPU, but they are truly shared
+ between all CPUs.
+
+ Program to run when a machine check event is detected.
+ This is an alternative to running mcelog regularly from cron
+ and allows events to be detected faster.
+
+What: /sys/devices/system/machinecheck/machinecheckX/monarch_timeout
+Contact: Andi Kleen <ak@linux.intel.com>
+Date: Feb, 2007
+Description:
+ How long to wait for the other CPUs to machine check too on an
+ exception. 0 to disable waiting for other CPUs.
+
+ Unit: us
+
+What: /sys/devices/system/machinecheck/machinecheckX/ignore_ce
+Contact: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Date: Jun 2009
+Description:
+ Disables polling and CMCI for corrected errors.
+ Corrected events are not cleared and are kept in the bank MSRs.
+
+What: /sys/devices/system/machinecheck/machinecheckX/dont_log_ce
+Contact: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Date: Jun 2009
+Description:
+ Disables logging for corrected errors.
+ All reported corrected errors will be cleared silently.
+
+ This option will be useful if you never care about corrected
+ errors.
+
+What: /sys/devices/system/machinecheck/machinecheckX/cmci_disabled
+Contact: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Date: Jun 2009
+Description:
+ Disables the CMCI feature.
diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module
index 88bddf192ceb..08886367d047 100644
--- a/Documentation/ABI/testing/sysfs-module
+++ b/Documentation/ABI/testing/sysfs-module
@@ -41,6 +41,13 @@ KernelVersion: 3.3
Contact: Kay Sievers <kay.sievers@vrfy.org>
Description: Module size in bytes.
+What: /sys/module/*/initstate
+Date: Nov 2006
+KernelVersion: 2.6.19
+Contact: Kay Sievers <kay.sievers@vrfy.org>
+Description: Show the initialization state (live, coming, going) of
+ the module.
+
What: /sys/module/*/taint
Date: Jan 2012
KernelVersion: 3.3
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 04885738cf15..a77a004a1baa 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -57,3 +57,44 @@ Description:
* 0 - default,
* 1 - overboost,
* 2 - silent
+
+What: /sys/devices/platform/<platform>/gpu_mux_mode
+Date: Aug 2022
+KernelVersion: 6.1
+Contact: "Luke Jones" <luke@ljones.dev>
+Description:
+ Switch the GPU hardware MUX mode. Laptops with this feature
+ can be toggled to boot with only the dGPU (discrete mode) or
+ in standard Optimus/Hybrid mode. On switching, a reboot is
+ required:
+
+ * 0 - Discrete GPU,
+ * 1 - Optimus/Hybrid,
+
+What: /sys/devices/platform/<platform>/dgpu_disable
+Date: Aug 2022
+KernelVersion: 5.17
+Contact: "Luke Jones" <luke@ljones.dev>
+Description:
+ Disable discrete GPU:
+ * 0 - Enable dGPU,
+ * 1 - Disable dGPU
+
+What: /sys/devices/platform/<platform>/egpu_enable
+Date: Aug 2022
+KernelVersion: 5.17
+Contact: "Luke Jones" <luke@ljones.dev>
+Description:
+ Enable the external GPU paired with ROG X-Flow laptops.
+ Toggling this setting will also trigger ACPI to disable the dGPU:
+
+ * 0 - Disable,
+ * 1 - Enable
+
+What: /sys/devices/platform/<platform>/panel_od
+Date: Aug 2022
+KernelVersion: 5.17
+Contact: "Luke Jones" <luke@ljones.dev>
+Description:
+ Enable an LCD response-time boost to reduce or remove ghosting:
+ * 0 - Disable,
+ * 1 - Enable
diff --git a/Documentation/ABI/testing/sysfs-platform-brcmstb-memc b/Documentation/ABI/testing/sysfs-platform-brcmstb-memc
new file mode 100644
index 000000000000..2f2b750ac2fd
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-brcmstb-memc
@@ -0,0 +1,15 @@
+What: /sys/bus/platform/devices/*/srpd
+Date: July 2022
+KernelVersion: 5.21
+Contact: Florian Fainelli <f.fainelli@gmail.com>
+Description:
+ Self Refresh Power Down (SRPD) inactivity timeout counted in
+ internal DDR controller clock cycles. Possible values range
+ from 0 (disable inactivity timeout) to 65535 (0xffff).
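+
+ For example (a sketch; '<device>' stands for the MEMC platform
+ device name)::
+
+     # disable the inactivity timeout
+     echo 0 > /sys/bus/platform/devices/<device>/srpd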
+
+What: /sys/bus/platform/devices/*/frequency
+Date: July 2022
+KernelVersion: 5.21
+Contact: Florian Fainelli <f.fainelli@gmail.com>
+Description:
+ DDR PHY frequency in Hz.
diff --git a/Documentation/ABI/testing/sysfs-platform-chipidea-usb2 b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
index b0f4684a83fe..b9f7d924f28a 100644
--- a/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
+++ b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
@@ -2,8 +2,8 @@ What: /sys/bus/platform/devices/ci_hdrc.0/role
Date: Mar 2017
Contact: Peter Chen <peter.chen@nxp.com>
Description:
- It returns string "gadget" or "host" when read it, it indicates
- current controller role.
+ When read, it returns string "gadget" or "host", indicating
+ the current controller role.
- It will do role switch when write "gadget" or "host" to it.
+ It will do role switch when "gadget" or "host" is written to it.
Only controller at dual-role configuration supports writing.
diff --git a/Documentation/ABI/testing/sysfs-platform-dell-privacy-wmi b/Documentation/ABI/testing/sysfs-platform-dell-privacy-wmi
index 7f9e18705861..1f1f274a6979 100644
--- a/Documentation/ABI/testing/sysfs-platform-dell-privacy-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-dell-privacy-wmi
@@ -1,55 +1,71 @@
What: /sys/bus/wmi/devices/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_supported_type
Date: Apr 2021
KernelVersion: 5.13
-Contact: "perry.yuan@dell.com>"
+Contact: "<perry.yuan@dell.com>"
Description:
Display which dell hardware level privacy devices are supported
“Dell Privacy” is a set of HW, FW, and SW features to enhance
Dell’s commitment to platform privacy for MIC, Camera, and
ePrivacy screens.
The supported hardware privacy devices are:
-Attributes:
- Microphone Mute:
+
+ Attributes:
+ Microphone Mute:
Identifies the local microphone can be muted by hardware, no applications
is available to capture system mic sound
- Camera Shutter:
+ Camera Shutter:
Identifies camera shutter controlled by hardware, which is a micromechanical
shutter assembly that is built onto the camera module to block capturing images
from outside the laptop
- supported:
+ Values:
+
+ supported:
The privacy device is supported by this system
- unsupported:
+ unsupported:
The privacy device is not supported on this system
- For example to check which privacy devices are supported:
+ For example to check which privacy devices are supported::
- # cat /sys/bus/wmi/drivers/dell-privacy/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_supported_type
- [Microphone Mute] [supported]
- [Camera Shutter] [supported]
- [ePrivacy Screen] [unsupported]
+ # cat /sys/bus/wmi/drivers/dell-privacy/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_supported_type
+ [Microphone Mute] [supported]
+ [Camera Shutter] [supported]
+ [ePrivacy Screen] [unsupported]
What: /sys/bus/wmi/devices/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_current_state
Date: Apr 2021
KernelVersion: 5.13
-Contact: "perry.yuan@dell.com>"
+Contact: "<perry.yuan@dell.com>"
Description:
Allow user space to check current dell privacy device state.
Describes the Device State class exposed by BIOS which can be
consumed by various applications interested in knowing the Privacy
feature capabilities
-Attributes:
- muted:
- Identifies the privacy device is turned off and cannot send stream to OS applications
- unmuted:
- Identifies the privacy device is turned on ,audio or camera driver can get
- stream from mic and camera module to OS applications
+ Attributes:
+ Microphone:
+ Identifies that the local microphone can be muted by hardware;
+ no application is available to capture system mic sound
+
+ Camera Shutter:
+ Identifies camera shutter controlled by hardware, which is a micromechanical
+ shutter assembly that is built onto the camera module to block capturing images
+ from outside the laptop
+
+ Values:
+ muted:
+ Identifies the privacy device is turned off
+ and cannot send stream to OS applications
+
+ unmuted:
+ Identifies the privacy device is turned on,
+ audio or camera driver can get stream from mic
+ and camera module to OS applications
- For example to check all supported current privacy device states:
+ For example to check all supported current privacy device states::
- # cat /sys/bus/wmi/drivers/dell-privacy/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_current_state
- [Microphone] [unmuted]
- [Camera Shutter] [unmuted]
+ # cat /sys/bus/wmi/drivers/dell-privacy/6932965F-1671-4CEB-B988-D3AB0A901919/dell_privacy_current_state
+ [Microphone] [unmuted]
+ [Camera Shutter] [unmuted]
diff --git a/Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv b/Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv
new file mode 100644
index 000000000000..1d97ad615c66
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-dell-wmi-ddv
@@ -0,0 +1,7 @@
+What: /sys/class/power_supply/<battery_name>/eppid
+Date: September 2022
+KernelVersion: 6.1
+Contact: Armin Wolf <W_Armin@gmx.de>
+Description:
+ Reports the Dell ePPID (electronic Dell Piece Part Identification)
+ of the ACPI battery.
diff --git a/Documentation/ABI/testing/sysfs-platform-dptf b/Documentation/ABI/testing/sysfs-platform-dptf
index 53c6b1000320..620fd20434a5 100644
--- a/Documentation/ABI/testing/sysfs-platform-dptf
+++ b/Documentation/ABI/testing/sysfs-platform-dptf
@@ -133,7 +133,10 @@ Contact: linux-acpi@vger.kernel.org
Description:
(RO) Presents SSC (spread spectrum clock) information for EMI
(Electro magnetic interference) control. This is a bit mask.
+
+ ======= ==========================================
Bits Description
+ ======= ==========================================
[7:0] Sets clock spectrum spread percentage:
0x00=0.2% , 0x3F=10%
1 LSB = 0.1% increase in spread (for
@@ -151,3 +154,4 @@ Description:
[10] 0: No white noise. 1: Add white noise
to spread waveform
[11] When 1, future writes are ignored.
+ ======= ==========================================
diff --git a/Documentation/ABI/testing/sysfs-platform-intel-ifs b/Documentation/ABI/testing/sysfs-platform-intel-ifs
new file mode 100644
index 000000000000..41b4d5b1e21c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-intel-ifs
@@ -0,0 +1,52 @@
+Device instance to test mapping
+intel_ifs_0 -> Scan Test
+intel_ifs_1 -> Array BIST test
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/run_test
+Date: Nov 16 2022
+KernelVersion: 6.2
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Write <cpu#> to trigger IFS test for one online core.
+ Note that the test is per core. The cpu# can be
+ for any thread on the core. Running on one thread
+ completes the test for the core containing that thread.
+ Example: to test the core containing cpu5: echo 5 >
+ /sys/devices/virtual/misc/intel_ifs_<N>/run_test
+Devices: all
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/status
+Date: Nov 16 2022
+KernelVersion: 6.2
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: The status of the last test. It can be one of "pass", "fail"
+ or "untested".
+Devices: all
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/details
+Date: Nov 16 2022
+KernelVersion: 6.2
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Additional information regarding the last test. The details file reports
+ the hex value of the STATUS MSR for this test. Note that the error_code field
+ may contain driver defined software code not defined in the Intel SDM.
+Devices: all
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/image_version
+Date: Nov 16 2022
+KernelVersion: 6.2
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Version (hexadecimal) of the loaded IFS test image. If no
+ test image is loaded, reports "none". Only present for device
+ instances where a test image is applicable.
+Devices: intel_ifs_0
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/current_batch
+Date: Nov 16 2022
+KernelVersion: 6.2
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Write a number less than or equal to 0xff to load an IFS test image.
+ The number written is treated as the two-digit suffix in the following file name:
+ /lib/firmware/intel/ifs_<N>/ff-mm-ss-02x.scan
+ Reading the file will provide the suffix of the currently loaded IFS test image.
+ This file is present only for device instances where a test image is applicable.
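+
+ For example, a sketch that loads batch 2 and then tests the
+ core containing cpu5::
+
+     echo 2 > /sys/devices/virtual/misc/intel_ifs_0/current_batch
+     echo 5 > /sys/devices/virtual/misc/intel_ifs_0/run_test
+     cat /sys/devices/virtual/misc/intel_ifs_0/status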
+Devices: intel_ifs_0
diff --git a/Documentation/ABI/testing/sysfs-platform-intel-pmc b/Documentation/ABI/testing/sysfs-platform-intel-pmc
index ef199af75ab0..f31d59b21f9b 100644
--- a/Documentation/ABI/testing/sysfs-platform-intel-pmc
+++ b/Documentation/ABI/testing/sysfs-platform-intel-pmc
@@ -11,8 +11,10 @@ Description:
to take effect.
Display global reset setting bits for PMC.
+
* bit 31 - global reset is locked
* bit 20 - global reset is set
+
Writing bit 20 value to the etr3 will induce
a platform "global reset" upon consequent platform reset,
in case the register is not locked.
diff --git a/Documentation/ABI/testing/sysfs-platform-lg-laptop b/Documentation/ABI/testing/sysfs-platform-lg-laptop
index cf47749b19df..0570cd524d0e 100644
--- a/Documentation/ABI/testing/sysfs-platform-lg-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-lg-laptop
@@ -17,6 +17,7 @@ Date: October 2018
KernelVersion: 4.20
Contact: "Matan Ziv-Av <matan@svgalib.org>
Description:
+ Deprecated: use /sys/class/power_supply/CMB0/charge_control_end_threshold
Maximal battery charge level. Accepted values are 80 or 100.
What: /sys/devices/platform/lg-laptop/fan_mode
diff --git a/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl b/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
index e79ca22e2f45..9b99a81babb1 100644
--- a/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
+++ b/Documentation/ABI/testing/sysfs-platform-mellanox-bootctl
@@ -68,3 +68,10 @@ Description:
Wasted burnt and invalid
Invalid not burnt but marked as valid (error state).
======= ===============================================
+
+What: /sys/bus/platform/devices/MLNXBF04:00/bootfifo
+Date: Apr 2023
+KernelVersion: 6.4
+Contact: "Liming Sun <limings@nvidia.com>"
+Description:
+ The file used to access the BlueField boot fifo.
diff --git a/Documentation/ABI/testing/sysfs-platform-sst-atom b/Documentation/ABI/testing/sysfs-platform-sst-atom
index d5f6e21f0e42..0154b0fba759 100644
--- a/Documentation/ABI/testing/sysfs-platform-sst-atom
+++ b/Documentation/ABI/testing/sysfs-platform-sst-atom
@@ -1,4 +1,4 @@
-What: /sys/devices/platform/8086%x:00/firmware_version
+What: /sys/devices/platform/8086<x>:00/firmware_version
Date: November 2016
KernelVersion: 4.10
Contact: "Sebastien Guiriec" <sebastien.guiriec@intel.com>
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 90ec4987074b..a3942b1036e2 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -152,7 +152,7 @@ Description:
case further investigation is required to determine which
device is causing the problem. Note that genuine RTC clock
values (such as when pm_trace has not been used), can still
- match a device and output it's name here.
+ match a device and output its name here.
What: /sys/power/pm_async
Date: January 2009
@@ -413,6 +413,35 @@ Description:
The /sys/power/suspend_stats/last_failed_step file contains
the last failed step in the suspend/resume path.
+What: /sys/power/suspend_stats/last_hw_sleep
+Date: June 2023
+Contact: Mario Limonciello <mario.limonciello@amd.com>
+Description:
+ The /sys/power/suspend_stats/last_hw_sleep file
+ contains the duration of time spent in a hardware sleep
+ state in the most recent system suspend-resume cycle.
+ This number is measured in microseconds.
+
+What: /sys/power/suspend_stats/total_hw_sleep
+Date: June 2023
+Contact: Mario Limonciello <mario.limonciello@amd.com>
+Description:
+ The /sys/power/suspend_stats/total_hw_sleep file
+ contains the aggregate of time spent in a hardware sleep
+ state since the kernel was booted. This number
+ is measured in microseconds.
+
+What: /sys/power/suspend_stats/max_hw_sleep
+Date: June 2023
+Contact: Mario Limonciello <mario.limonciello@amd.com>
+Description:
+ The /sys/power/suspend_stats/max_hw_sleep file
+ contains the maximum amount of time that the hardware can
+ report for time spent in a hardware sleep state. When sleep
+ cycles are longer than this time, the values for
+ 'total_hw_sleep' and 'last_hw_sleep' may not be accurate.
+ This number is measured in microseconds.
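+
+ For example, a sketch comparing the last suspend-resume cycle
+ against the reported ceiling (values in microseconds)::
+
+     cat /sys/power/suspend_stats/last_hw_sleep
+     cat /sys/power/suspend_stats/max_hw_sleep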
+
What: /sys/power/sync_on_suspend
Date: October 2019
Contact: Jonas Meurer <jonas@freesources.org>
diff --git a/Documentation/ABI/testing/sysfs-ptp b/Documentation/ABI/testing/sysfs-ptp
index d378f57c1b73..9c317ac7c47a 100644
--- a/Documentation/ABI/testing/sysfs-ptp
+++ b/Documentation/ABI/testing/sysfs-ptp
@@ -6,7 +6,7 @@ Description:
providing a standardized interface to the ancillary
features of PTP hardware clocks.
-What: /sys/class/ptp/ptpN/
+What: /sys/class/ptp/ptp<N>/
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -14,7 +14,7 @@ Description:
hardware clock registered into the PTP class driver
subsystem.
-What: /sys/class/ptp/ptpN/clock_name
+What: /sys/class/ptp/ptp<N>/clock_name
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -25,7 +25,7 @@ Description:
MAC based ones. The string does not necessarily have
to be any kind of unique id.
-What: /sys/class/ptp/ptpN/max_adjustment
+What: /sys/class/ptp/ptp<N>/max_adjustment
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -33,42 +33,42 @@ Description:
frequency adjustment value (a positive integer) in
parts per billion.
-What: /sys/class/ptp/ptpN/max_vclocks
+What: /sys/class/ptp/ptp<N>/max_vclocks
Date: May 2021
Contact: Yangbo Lu <yangbo.lu@nxp.com>
Description:
This file contains the maximum number of ptp vclocks.
Write an integer to re-configure it.
-What: /sys/class/ptp/ptpN/n_alarms
+What: /sys/class/ptp/ptp<N>/n_alarms
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
This file contains the number of periodic or one shot
alarms offered by the PTP hardware clock.
-What: /sys/class/ptp/ptpN/n_external_timestamps
+What: /sys/class/ptp/ptp<N>/n_external_timestamps
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
This file contains the number of external timestamp
channels offered by the PTP hardware clock.
-What: /sys/class/ptp/ptpN/n_periodic_outputs
+What: /sys/class/ptp/ptp<N>/n_periodic_outputs
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
This file contains the number of programmable periodic
output channels offered by the PTP hardware clock.
-What: /sys/class/ptp/ptpN/n_pins
+What: /sys/class/ptp/ptp<N>/n_pins
Date: March 2014
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
This file contains the number of programmable pins
offered by the PTP hardware clock.
-What: /sys/class/ptp/ptpN/n_vclocks
+What: /sys/class/ptp/ptp<N>/n_vclocks
Date: May 2021
Contact: Yangbo Lu <yangbo.lu@nxp.com>
Description:
@@ -81,7 +81,7 @@ Description:
switches the physical clock back to normal, adjustable
operation.
-What: /sys/class/ptp/ptpN/pins
+What: /sys/class/ptp/ptp<N>/pins
Date: March 2014
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -94,7 +94,7 @@ Description:
assignment may be changed by writing two numbers into
the file.
-What: /sys/class/ptp/ptpN/pps_available
+What: /sys/class/ptp/ptp<N>/pps_available
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -103,7 +103,7 @@ Description:
"1" means that the PPS is supported, while "0" means
not supported.
-What: /sys/class/ptp/ptpN/extts_enable
+What: /sys/class/ptp/ptp<N>/extts_enable
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -113,7 +113,7 @@ Description:
To disable external timestamps, write the channel
index followed by a "0" into the file.
-What: /sys/class/ptp/ptpN/fifo
+What: /sys/class/ptp/ptp<N>/fifo
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -121,7 +121,7 @@ Description:
the form of three integers: channel index, seconds,
and nanoseconds.
-What: /sys/class/ptp/ptpN/period
+What: /sys/class/ptp/ptp<N>/period
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
@@ -132,7 +132,7 @@ Description:
period nanoseconds. To disable a periodic output, set
all the seconds and nanoseconds values to zero.
-What: /sys/class/ptp/ptpN/pps_enable
+What: /sys/class/ptp/ptp<N>/pps_enable
Date: September 2010
Contact: Richard Cochran <richardcochran@gmail.com>
Description:
diff --git a/Documentation/ABI/testing/sysfs-secvar b/Documentation/ABI/testing/sysfs-secvar
index feebb8c57294..857cf12b0904 100644
--- a/Documentation/ABI/testing/sysfs-secvar
+++ b/Documentation/ABI/testing/sysfs-secvar
@@ -18,6 +18,14 @@ Description: A string indicating which backend is in use by the firmware.
This determines the format of the variable and the accepted
format of variable updates.
+ On powernv/OPAL, this value is provided by the OPAL firmware
+ and is expected to be "ibm,edk2-compat-v1".
+
+ On pseries/PLPKS, this is generated by the kernel based on the
+ version number in the SB_VERSION variable in the keystore, and
+ has the form "ibm,plpks-sb-v<version>", or
+ "ibm,plpks-sb-unknown" if there is no SB_VERSION variable.
+
What: /sys/firmware/secvar/vars/<variable name>
Date: August 2019
Contact: Nayna Jain <nayna@linux.ibm.com>
@@ -34,7 +42,7 @@ Description: An integer representation of the size of the content of the
What: /sys/firmware/secvar/vars/<variable_name>/data
Date: August 2019
-Contact: Nayna Jain h<nayna@linux.ibm.com>
+Contact: Nayna Jain <nayna@linux.ibm.com>
Description: A read-only file containing the value of the variable. The size
of the file represents the maximum size of the variable data.
@@ -44,3 +52,68 @@ Contact: Nayna Jain <nayna@linux.ibm.com>
Description: A write-only file that is used to submit the new value for the
variable. The size of the file represents the maximum size of
the variable data that can be written.
+
+What: /sys/firmware/secvar/config
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: This optional directory contains read-only config attributes as
+ defined by the secure variable implementation. All data is in
+ ASCII format. The directory is only created if the backing
+ implementation provides variables to populate it, which at
+ present is only PLPKS on the pseries platform.
+
+What: /sys/firmware/secvar/config/version
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: Config version as reported by the hypervisor in ASCII decimal
+ format.
+
+ Currently only provided by PLPKS on the pseries platform.
+
+What: /sys/firmware/secvar/config/max_object_size
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: Maximum allowed size of objects in the keystore in bytes,
+ represented in ASCII decimal format.
+
+ This is not necessarily the same as the maximum size that can
+ be written to an update file, as writes can contain more than
+ object data; use the size of the update file for that
+ purpose.
+
+ Currently only provided by PLPKS on the pseries platform.
+
+What: /sys/firmware/secvar/config/total_size
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: Total size of the PLPKS in bytes, represented in ASCII decimal
+ format.
+
+ Currently only provided by PLPKS on the pseries platform.
+
+What: /sys/firmware/secvar/config/used_space
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: Current space consumed by the key store, in bytes, represented
+ in ASCII decimal format.
+
+ Currently only provided by PLPKS on the pseries platform.
+
+What: /sys/firmware/secvar/config/supported_policies
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: Bitmask of policy flags supported by the hypervisor,
+ represented as an 8 byte hexadecimal ASCII string. Consult the
+ hypervisor documentation for what these flags are.
+
+ Currently only provided by PLPKS on the pseries platform.
+
+What: /sys/firmware/secvar/config/signed_update_algorithms
+Date: February 2023
+Contact: Nayna Jain <nayna@linux.ibm.com>
+Description: Bitmask of flags indicating which algorithms the hypervisor
+ supports for signed update of objects, represented as a 16 byte
+ hexadecimal ASCII string. Consult the hypervisor documentation
+ for what these flags mean.
+
+ Currently only provided by PLPKS on the pseries platform.
diff --git a/Documentation/ABI/testing/sysfs-timecard b/Documentation/ABI/testing/sysfs-timecard
new file mode 100644
index 000000000000..220478156297
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-timecard
@@ -0,0 +1,288 @@
+What: /sys/class/timecard/
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This directory contains files and directories
+ providing a standardized interface to the ancillary
+ features of the OpenCompute timecard.
+
+What: /sys/class/timecard/ocpN/
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This directory contains the attributes of the Nth timecard
+ registered.
+
+What: /sys/class/timecard/ocpN/available_clock_sources
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) The list of available time sources that the PHC
+ uses for clock adjustments.
+
+ ==== =================================================
+ NONE no adjustments
+ PPS adjustments come from the PPS1 selector (default)
+ TOD adjustments from the GNSS/TOD module
+ IRIG adjustments from external IRIG-B signal
+ DCF adjustments from external DCF signal
+ ==== =================================================
+
+What: /sys/class/timecard/ocpN/available_sma_inputs
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Set of available destinations (sinks) for a SMA
+ input signal.
+
+ ===== ================================================
+ 10Mhz signal is used as the 10Mhz reference clock
+ PPS1 signal is sent to the PPS1 selector
+ PPS2 signal is sent to the PPS2 selector
+ TS1 signal is sent to timestamper 1
+ TS2 signal is sent to timestamper 2
+ TS3 signal is sent to timestamper 3
+ TS4 signal is sent to timestamper 4
+ IRIG signal is sent to the IRIG-B module
+ DCF signal is sent to the DCF module
+ FREQ1 signal is sent to frequency counter 1
+ FREQ2 signal is sent to frequency counter 2
+ FREQ3 signal is sent to frequency counter 3
+ FREQ4 signal is sent to frequency counter 4
+ None signal input is disabled
+ ===== ================================================
+
+What: /sys/class/timecard/ocpN/available_sma_outputs
+Date: May 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Set of available sources for a SMA output signal.
+
+ ===== ================================================
+ 10Mhz output is from the 10Mhz reference clock
+ PHC output PPS is from the PHC clock
+ MAC output PPS is from the Miniature Atomic Clock
+ GNSS1 output PPS is from the first GNSS module
+ GNSS2 output PPS is from the second GNSS module
+ IRIG output is from the PHC, in IRIG-B format
+ DCF output is from the PHC, in DCF format
+ GEN1 output is from frequency generator 1
+ GEN2 output is from frequency generator 2
+ GEN3 output is from frequency generator 3
+ GEN4 output is from frequency generator 4
+ GND output is GND
+ VCC output is VCC
+ ===== ================================================
+
+What: /sys/class/timecard/ocpN/clock_source
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) Contains the current synchronization source used by
+ the PHC. May be changed by writing one of the listed
+ values from the available_clock_sources attribute set.
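+
+		For example (assuming the card is registered as ocp0):
+
+		  # cat /sys/class/timecard/ocp0/clock_source
+		  PPS
+		  # echo TOD > /sys/class/timecard/ocp0/clock_source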
+
+What: /sys/class/timecard/ocpN/clock_status_drift
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Contains the current drift value used by the firmware
+ for internal disciplining of the atomic clock.
+
+What: /sys/class/timecard/ocpN/clock_status_offset
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Contains the current offset value used by the firmware
+ for internal disciplining of the atomic clock.
+
+What: /sys/class/timecard/ocpN/freqX
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Optional directory containing the sysfs nodes for
+ frequency counter <X>.
+
+What: /sys/class/timecard/ocpN/freqX/frequency
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Contains the measured frequency over the specified
+ measurement period.
+
+What: /sys/class/timecard/ocpN/freqX/seconds
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) Specifies the number of seconds from 0-255 that the
+ frequency should be measured over. Write 0 to disable.
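+
+		For example, a hypothetical 10-second measurement on
+		frequency counter 1 of card ocp0 (the reading shown is
+		illustrative):
+
+		  # echo 10 > /sys/class/timecard/ocp0/freq1/seconds
+		  # cat /sys/class/timecard/ocp0/freq1/frequency
+		  10000000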
+
+What: /sys/class/timecard/ocpN/genX
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Optional directory containing the sysfs nodes for
+ frequency generator <X>.
+
+What: /sys/class/timecard/ocpN/genX/duty
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Specifies the signal duty cycle as a percentage from 1-99.
+
+What: /sys/class/timecard/ocpN/genX/period
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Specifies the signal period in nanoseconds.
+
+What: /sys/class/timecard/ocpN/genX/phase
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Specifies the signal phase offset in nanoseconds.
+
+What: /sys/class/timecard/ocpN/genX/polarity
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Specifies the signal polarity, either 1 or 0.
+
+What: /sys/class/timecard/ocpN/genX/running
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Either 0 or 1, showing if the signal generator is running.
+
+What: /sys/class/timecard/ocpN/genX/start
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Shows the time in <sec>.<nsec> that the signal generator
+ started running.
+
+What: /sys/class/timecard/ocpN/genX/signal
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) Used to start the signal generator, and summarize
+ the current status.
+
+ The signal generator may be started by writing the signal
+ period, followed by the optional signal values. If the
+ optional values are not provided, they default to the current
+ settings, which may be obtained from the other sysfs nodes.
+
+ period [duty [phase [polarity]]]
+
+ echo 500000000 > signal # 1/2 second period
+ echo 1000000 40 100 > signal
+ echo 0 > signal # turn off generator
+
+ Period and phase are specified in nanoseconds. Duty cycle is
+ a percentage from 1-99. Polarity is 1 or 0.
+
+ Reading this node will return:
+
+ period duty phase polarity start_time
+
+What: /sys/class/timecard/ocpN/gnss_sync
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Indicates whether a valid GNSS signal is received,
+ or when the signal was lost.
+
+What: /sys/class/timecard/ocpN/i2c
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This optional attribute links to the associated i2c device.
+
+What: /sys/class/timecard/ocpN/irig_b_mode
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) An integer from 0-7 indicating the timecode format
+ of the IRIG-B output signal: B00<n>
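+
+		For example, writing 3 selects the B003 format (assuming
+		the card is registered as ocp0):
+
+		  # echo 3 > /sys/class/timecard/ocp0/irig_b_mode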
+
+What: /sys/class/timecard/ocpN/pps
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This optional attribute links to the associated PPS device.
+
+What: /sys/class/timecard/ocpN/ptp
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This attribute links to the associated PTP device.
+
+What: /sys/class/timecard/ocpN/serialnum
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RO) Provides the serial number of the timecard.
+
+What: /sys/class/timecard/ocpN/sma1
+What: /sys/class/timecard/ocpN/sma2
+What: /sys/class/timecard/ocpN/sma3
+What: /sys/class/timecard/ocpN/sma4
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) These attributes specify the direction of the signal
+ on the associated SMA connectors, and also the signal sink
+ or source.
+
+		The display format of the attribute is a space-separated
+		list of signals, prefixed by the input/output direction.
+
+ The signal direction may be changed (if supported) by
+ prefixing the signal list with either "in:" or "out:".
+ If neither prefix is present, then the direction is unchanged.
+
+ The output signal may be changed by writing one of the listed
+ values from the available_sma_outputs attribute set.
+
+ The input destinations may be changed by writing multiple
+ values from the available_sma_inputs attribute set,
+ separated by spaces. If there are duplicated input
+ destinations between connectors, the lowest numbered SMA
+ connector is given priority.
+
+ Note that not all input combinations may make sense.
+
+ The 10Mhz reference clock input is currently only valid
+ on SMA1 and may not be combined with other destination sinks.
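+
+		For example (assuming the card is registered as ocp0;
+		the available signals depend on the installed firmware):
+
+		  # cat /sys/class/timecard/ocp0/sma1
+		  in: 10Mhz
+		  # echo "in: TS1 PPS1" > /sys/class/timecard/ocp0/sma2
+		  # echo "out: MAC" > /sys/class/timecard/ocp0/sma3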
+
+What: /sys/class/timecard/ocpN/tod_correction
+Date: March 2022
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) The incoming GNSS signal is in UTC time, and the NMEA
+ format messages do not provide a TAI offset. This sets the
+ correction value for the incoming time.
+
+ If UBX_LS is enabled, this should be 0, and the offset is
+ taken from the UBX-NAV-TIMELS message.
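+
+		For example, when UBX_LS is enabled (assuming the card is
+		registered as ocp0):
+
+		  # echo 0 > /sys/class/timecard/ocp0/tod_correction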
+
+What: /sys/class/timecard/ocpN/ts_window_adjust
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) When retrieving the PHC with the PTP SYS_OFFSET_EXTENDED
+ ioctl, a system timestamp is made before and after the PHC
+ time is retrieved. The midpoint between the two system
+ timestamps is usually taken to be the SYS time associated
+		with the PHC time. This estimate may be wrong, as it depends
+		on PCI latencies and on when the PHC time was latched.
+
+ The attribute value reduces the end timestamp by the given
+ number of nanoseconds, so the computed midpoint matches the
+ retrieved PHC time.
+
+ The initial value is set based on measured PCI latency and
+ the estimated point where the FPGA latches the PHC time. This
+ value may be changed by writing an unsigned integer.
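+
+		For example, to shorten the window by a hypothetical
+		1200 ns (assuming the card is registered as ocp0):
+
+		  # echo 1200 > /sys/class/timecard/ocp0/ts_window_adjust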
+
+What: /sys/class/timecard/ocpN/ttyGNSS
+What: /sys/class/timecard/ocpN/ttyGNSS2
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: These optional attributes link to the TTY serial ports
+ associated with the GNSS devices.
+
+What: /sys/class/timecard/ocpN/ttyMAC
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This optional attribute links to the TTY serial port
+ associated with the Miniature Atomic Clock.
+
+What: /sys/class/timecard/ocpN/ttyNMEA
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: This optional attribute links to the TTY serial port
+ which outputs the PHC time in NMEA ZDA format.
+
+What: /sys/class/timecard/ocpN/utc_tai_offset
+Date: September 2021
+Contact: Jonathan Lemon <jonathan.lemon@gmail.com>
+Description: (RW) The DCF and IRIG output signals are in UTC, while the
+ TimeCard operates on TAI. This attribute allows setting the
+ offset in seconds, which is added to the TAI timebase for
+ these formats.
+
+ The offset may be changed by writing an unsigned integer.
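+
+		For example, to set the 37-second UTC-TAI offset in effect
+		since 2017 (assuming the card is registered as ocp0):
+
+		  # echo 37 > /sys/class/timecard/ocp0/utc_tai_offset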
diff --git a/Documentation/ABI/testing/sysfs-tty b/Documentation/ABI/testing/sysfs-tty
index e157130a6792..820e412d38a8 100644
--- a/Documentation/ABI/testing/sysfs-tty
+++ b/Documentation/ABI/testing/sysfs-tty
@@ -9,7 +9,7 @@ Description:
The file supports poll() to detect virtual
console switches.
-What: /sys/class/tty/tty0/active
+What: /sys/class/tty/tty<x>/active
Date: Nov 2010
Contact: Kay Sievers <kay.sievers@vrfy.org>
Description:
@@ -18,7 +18,7 @@ Description:
The file supports poll() to detect virtual
console switches.
-What: /sys/class/tty/ttyS0/uartclk
+What: /sys/class/tty/ttyS<x>/uartclk
Date: Sep 2012
Contact: Tomas Hlavacek <tmshlvck@gmail.com>
Description:
@@ -29,7 +29,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/type
+What: /sys/class/tty/ttyS<x>/type
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -38,7 +38,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/line
+What: /sys/class/tty/ttyS<x>/line
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -47,7 +47,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/port
+What: /sys/class/tty/ttyS<x>/port
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -56,7 +56,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/irq
+What: /sys/class/tty/ttyS<x>/irq
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -65,7 +65,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/flags
+What: /sys/class/tty/ttyS<x>/flags
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -74,7 +74,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/xmit_fifo_size
+What: /sys/class/tty/ttyS<x>/xmit_fifo_size
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -83,7 +83,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/close_delay
+What: /sys/class/tty/ttyS<x>/close_delay
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -92,7 +92,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/closing_wait
+What: /sys/class/tty/ttyS<x>/closing_wait
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -101,7 +101,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/custom_divisor
+What: /sys/class/tty/ttyS<x>/custom_divisor
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -110,7 +110,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/io_type
+What: /sys/class/tty/ttyS<x>/io_type
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -120,7 +120,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/iomem_base
+What: /sys/class/tty/ttyS<x>/iomem_base
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -129,7 +129,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/iomem_reg_shift
+What: /sys/class/tty/ttyS<x>/iomem_reg_shift
Date: October 2012
Contact: Alan Cox <alan@linux.intel.com>
Description:
@@ -139,7 +139,7 @@ Description:
These sysfs values expose the TIOCGSERIAL interface via
sysfs rather than via ioctls.
-What: /sys/class/tty/ttyS0/rx_trig_bytes
+What: /sys/class/tty/ttyS<x>/rx_trig_bytes
Date: May 2014
Contact: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Description:
@@ -155,7 +155,7 @@ Description:
16550A, which has 1/4/8/14 bytes trigger, the RX trigger is
automatically changed to 4 bytes.
-What: /sys/class/tty/ttyS0/console
+What: /sys/class/tty/ttyS<x>/console
Date: February 2020
Contact: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Description:
diff --git a/Documentation/COPYING-logo b/Documentation/COPYING-logo
deleted file mode 100644
index b21c7cf7d9f6..000000000000
--- a/Documentation/COPYING-logo
+++ /dev/null
@@ -1,13 +0,0 @@
-This is the full-colour version of the currently unofficial Linux logo
-("currently unofficial" just means that there has been no paperwork and
-that I have not really announced it yet). It was created by Larry Ewing,
-and is freely usable as long as you acknowledge Larry as the original
-artist.
-
-Note that there are black-and-white versions of this available that
-scale down to smaller sizes and are better for letterheads or whatever
-you want to use it for: for the full range of logos take a look at
-Larry's web-page:
-
- https://www.isc.tamu.edu/~lewing/linux/
-
diff --git a/Documentation/Kconfig b/Documentation/Kconfig
index e549a61f4d96..3a0e7ac0c4e3 100644
--- a/Documentation/Kconfig
+++ b/Documentation/Kconfig
@@ -1,23 +1,28 @@
-config WARN_MISSING_DOCUMENTS
+if COMPILE_TEST
+
+menu "Documentation"
+config WARN_MISSING_DOCUMENTS
bool "Warn if there's a missing documentation file"
- depends on COMPILE_TEST
help
- It is not uncommon that a document gets renamed.
- This option makes the Kernel to check for missing dependencies,
- warning when something is missing. Works only if the Kernel
- is built from a git tree.
+ It is not uncommon that a document gets renamed.
+	  This option makes the Kernel check for missing dependencies,
+ warning when something is missing. Works only if the Kernel
+ is built from a git tree.
- If unsure, select 'N'.
+ If unsure, select 'N'.
config WARN_ABI_ERRORS
bool "Warn if there are errors at ABI files"
- depends on COMPILE_TEST
help
- The files under Documentation/ABI should follow what's
- described at Documentation/ABI/README. Yet, as they're manually
- written, it would be possible that some of those files would
- have errors that would break them for being parsed by
- scripts/get_abi.pl. Add a check to verify them.
+ The files under Documentation/ABI should follow what's
+ described at Documentation/ABI/README. Yet, as they're manually
+	  written, it is possible that some of those files have
+	  errors that would break them when parsed by
+ scripts/get_abi.pl. Add a check to verify them.
+
+ If unsure, select 'N'.
+
+endmenu
- If unsure, select 'N'.
+endif
diff --git a/Documentation/Makefile b/Documentation/Makefile
index c3feb657b654..023fa658a0a8 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -19,14 +19,16 @@ endif
SPHINXBUILD = sphinx-build
SPHINXOPTS =
SPHINXDIRS = .
+DOCS_THEME =
+DOCS_CSS =
_SPHINXDIRS = $(sort $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst)))
SPHINX_CONF = conf.py
PAPER =
BUILDDIR = $(obj)/output
PDFLATEX = xelatex
-LATEXOPTS = -interaction=batchmode
+LATEXOPTS = -interaction=batchmode -no-shell-escape
-ifeq ($(KBUILD_VERBOSE),0)
+ifeq ($(findstring 1, $(KBUILD_VERBOSE)),)
SPHINXOPTS += "-q"
endif
@@ -84,12 +86,24 @@ quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
-D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
$(ALLSPHINXOPTS) \
$(abspath $(srctree)/$(src)/$5) \
- $(abspath $(BUILDDIR)/$3/$4)
+ $(abspath $(BUILDDIR)/$3/$4) && \
+ if [ "x$(DOCS_CSS)" != "x" ]; then \
+ cp $(if $(patsubst /%,,$(DOCS_CSS)),$(abspath $(srctree)/$(DOCS_CSS)),$(DOCS_CSS)) $(BUILDDIR)/$3/_static/; \
+ fi
htmldocs:
@$(srctree)/scripts/sphinx-pre-install --version-check
@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
+texinfodocs:
+ @$(srctree)/scripts/sphinx-pre-install --version-check
+ @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,texinfo,$(var),texinfo,$(var)))
+
+# Note: the 'info' Make target is generated by Sphinx itself when
+# running the texinfodocs target defined above.
+infodocs: texinfodocs
+ $(MAKE) -C $(BUILDDIR)/texinfo info
+
linkcheckdocs:
@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
@@ -138,6 +152,8 @@ cleandocs:
dochelp:
@echo ' Linux kernel internal documentation in different formats from ReST:'
@echo ' htmldocs - HTML'
+ @echo ' texinfodocs - Texinfo'
+ @echo ' infodocs - Info'
@echo ' latexdocs - LaTeX'
@echo ' pdfdocs - PDF'
@echo ' epubdocs - EPUB'
@@ -154,4 +170,8 @@ dochelp:
@echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
@echo ' configuration. This is e.g. useful to build with nit-picking config.'
@echo
+ @echo ' make DOCS_THEME={sphinx-theme} selects a different Sphinx theme.'
+ @echo
+ @echo ' make DOCS_CSS={a .css file} adds a DOCS_CSS override file for html/epub output.'
+ @echo
@echo ' Default location for the generated documents is Documentation/output'
diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst
index 38ea1f604b6d..4d2333e7ae06 100644
--- a/Documentation/PCI/endpoint/index.rst
+++ b/Documentation/PCI/endpoint/index.rst
@@ -13,6 +13,8 @@ PCI Endpoint Framework
pci-test-howto
pci-ntb-function
pci-ntb-howto
+ pci-vntb-function
+ pci-vntb-howto
function/binding/pci-test
function/binding/pci-ntb
diff --git a/Documentation/PCI/endpoint/pci-vntb-function.rst b/Documentation/PCI/endpoint/pci-vntb-function.rst
new file mode 100644
index 000000000000..0c51f53ab972
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-vntb-function.rst
@@ -0,0 +1,129 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+PCI vNTB Function
+=================
+
+:Author: Frank Li <Frank.Li@nxp.com>
+
+The difference between the PCI NTB function and the PCI vNTB function is:
+
+The PCI NTB function requires two endpoint instances and connects HOST1
+and HOST2.
+
+The PCI vNTB function uses only one host and one endpoint (EP), with the
+NTB connecting the EP and the PCI host.
+
+.. code-block:: text
+
+
+ +------------+ +---------------------------------------+
+ | | | |
+ +------------+ | +--------------+
+ | NTB | | | NTB |
+ | NetDev | | | NetDev |
+ +------------+ | +--------------+
+ | NTB | | | NTB |
+ | Transfer | | | Transfer |
+ +------------+ | +--------------+
+ | | | | |
+ | PCI NTB | | | |
+ | EPF | | | |
+ | Driver | | | PCI Virtual |
+ | | +---------------+ | NTB Driver |
+ | | | PCI EP NTB |<------>| |
+ | | | FN Driver | | |
+ +------------+ +---------------+ +--------------+
+ | | | | | |
+ | PCI BUS | <-----> | PCI EP BUS | | Virtual PCI |
+ | | PCI | | | BUS |
+ +------------+ +---------------+--------+--------------+
+ PCI RC PCI EP
+
+Constructs used for Implementing vNTB
+=====================================
+
+ 1) Config Region
+ 2) Self Scratchpad Registers
+ 3) Peer Scratchpad Registers
+ 4) Doorbell (DB) Registers
+ 5) Memory Window (MW)
+
+
+Config Region:
+--------------
+
+It is the same as in the PCI NTB function driver.
+
+Scratchpad Registers:
+---------------------
+
+It is appended after the Config region.
+
+.. code-block:: text
+
+
+ +--------------------------------------------------+ Base
+ | |
+ | |
+ | |
+ | Common Config Register |
+ | |
+ | |
+ | |
+ +-----------------------+--------------------------+ Base + span_offset
+ | | |
+ | Peer Span Space | Span Space |
+ | | |
+ | | |
+ +-----------------------+--------------------------+ Base + span_offset
+ | | | + span_count * 4
+ | | |
+ | Span Space | Peer Span Space |
+ | | |
+ +-----------------------+--------------------------+
+       Virtual PCI             PCIe Endpoint
+ NTB Driver NTB Driver
+
+
+Doorbell Registers:
+-------------------
+
+ Doorbell Registers are used by the hosts to interrupt each other.
+
+Memory Window:
+--------------
+
+ Actual transfer of data between the two hosts will happen using the
+ memory window.
+
+Modeling Constructs:
+====================
+
+32-bit BARs.
+
+====== ===============
+BAR NO CONSTRUCTS USED
+====== ===============
+BAR0 Config Region
+BAR1 Doorbell
+BAR2 Memory Window 1
+BAR3 Memory Window 2
+BAR4 Memory Window 3
+BAR5 Memory Window 4
+====== ===============
+
+64-bit BARs.
+
+====== ===============================
+BAR NO CONSTRUCTS USED
+====== ===============================
+BAR0 Config Region + Scratchpad
+BAR1
+BAR2 Doorbell
+BAR3
+BAR4 Memory Window 1
+BAR5
+====== ===============================
+
+
diff --git a/Documentation/PCI/endpoint/pci-vntb-howto.rst b/Documentation/PCI/endpoint/pci-vntb-howto.rst
new file mode 100644
index 000000000000..4ab8e4a26d4b
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-vntb-howto.rst
@@ -0,0 +1,167 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================================
+PCI Non-Transparent Bridge (NTB) Endpoint Function (EPF) User Guide
+===================================================================
+
+:Author: Frank Li <Frank.Li@nxp.com>
+
+This document is a guide to help users use the pci-epf-vntb function driver
+and the ntb_hw_epf host driver for NTB functionality. The list of steps to
+be followed on the host side and the EP side is given below. For the hardware
+configuration and internals of NTB using configurable endpoints, see
+Documentation/PCI/endpoint/pci-vntb-function.rst.
+
+Endpoint Device
+===============
+
+Endpoint Controller Devices
+---------------------------
+
+To find the list of endpoint controller devices in the system::
+
+ # ls /sys/class/pci_epc/
+ 5f010000.pcie_ep
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+ # ls /sys/kernel/config/pci_ep/controllers
+ 5f010000.pcie_ep
+
+Endpoint Function Drivers
+-------------------------
+
+To find the list of endpoint function drivers in the system::
+
+ # ls /sys/bus/pci-epf/drivers
+ pci_epf_ntb pci_epf_test pci_epf_vntb
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+ # ls /sys/kernel/config/pci_ep/functions
+ pci_epf_ntb pci_epf_test pci_epf_vntb
+
+
+Creating pci-epf-vntb Device
+----------------------------
+
+A PCI endpoint function device can be created using configfs. To create a
+pci-epf-vntb device, the following commands can be used::
+
+ # mount -t configfs none /sys/kernel/config
+ # cd /sys/kernel/config/pci_ep/
+ # mkdir functions/pci_epf_vntb/func1
+
+The "mkdir func1" above creates the pci-epf-ntb function device that will
+be probed by pci_epf_vntb driver.
+
+The PCI endpoint framework populates the directory with the following
+configurable fields::
+
+ # ls functions/pci_epf_vntb/func1
+ baseclass_code deviceid msi_interrupts pci_epf_vntb.0
+ progif_code secondary subsys_id vendorid
+ cache_line_size interrupt_pin msix_interrupts primary
+ revid subclass_code subsys_vendor_id
+
+The PCI endpoint function driver populates these entries with default values
+when the device is bound to the driver. The pci-epf-vntb driver populates
+vendorid with 0xffff and interrupt_pin with 0x0001::
+
+ # cat functions/pci_epf_vntb/func1/vendorid
+ 0xffff
+ # cat functions/pci_epf_vntb/func1/interrupt_pin
+ 0x0001
+
+
+Configuring pci-epf-vntb Device
+-------------------------------
+
+The user can configure the pci-epf-vntb device using its configfs entry. In order
+to change the vendorid and the deviceid, the following
+commands can be used::
+
+ # echo 0x1957 > functions/pci_epf_vntb/func1/vendorid
+ # echo 0x0809 > functions/pci_epf_vntb/func1/deviceid
+
+In order to configure NTB-specific attributes, a new sub-directory under
+func1 should be created::
+
+ # mkdir functions/pci_epf_vntb/func1/pci_epf_vntb.0/
+
+The NTB function driver will populate this directory with various attributes
+that can be configured by the user::
+
+ # ls functions/pci_epf_vntb/func1/pci_epf_vntb.0/
+ db_count mw1 mw2 mw3 mw4 num_mws
+ spad_count
+
+A sample configuration for NTB function is given below::
+
+ # echo 4 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/db_count
+ # echo 128 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/spad_count
+ # echo 1 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/num_mws
+ # echo 0x100000 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/mw1
+
+A sample configuration for the virtual NTB driver on the virtual PCI bus::
+
+ # echo 0x1957 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/vntb_vid
+ # echo 0x080A > functions/pci_epf_vntb/func1/pci_epf_vntb.0/vntb_pid
+ # echo 0x10 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/vbus_number
+
+Binding pci-epf-vntb Device to EP Controller
+--------------------------------------------
+
+The vNTB function device should be attached to the PCI endpoint controller
+connected to the host::
+
+ # ln -s controllers/5f010000.pcie_ep functions/pci_epf_vntb/func1/primary
+
+Once the above step is completed, the PCI endpoint controllers are ready to
+establish a link with the host.
+
+
+Start the Link
+--------------
+
+In order for the endpoint device to establish a link with the host, the _start_
+field should be populated with '1'. For NTB, the PCI endpoint controller
+should establish a link with the host (imx8 does not need this step)::
+
+ # echo 1 > controllers/5f010000.pcie_ep/start
+
+Root Complex Device
+===================
+
+lspci Output at Host side
+-------------------------
+
+Note that the devices listed here correspond to the values populated in
+the "Creating pci-epf-vntb Device" section above::
+
+ # lspci
+ 00:00.0 PCI bridge: Freescale Semiconductor Inc Device 0000 (rev 01)
+ 01:00.0 RAM memory: Freescale Semiconductor Inc Device 0809
+
+Endpoint Device / Virtual PCI bus
+=================================
+
+lspci Output at EP Side / Virtual PCI bus
+-----------------------------------------
+
+Note that the devices listed here correspond to the values populated in
+the "Creating pci-epf-vntb Device" section above::
+
+ # lspci
+ 10:00.0 Unassigned class [ffff]: Dawicontrol Computersysteme GmbH Device 1234 (rev ff)
+
+Using ntb_hw_epf Device
+-----------------------
+
+The host side software follows the standard NTB software architecture in Linux.
+All the existing client side NTB utilities like NTB Transport Client and NTB
+Netdev, NTB Ping Pong Test Client and NTB Tool Test Client can be used with
+the NTB function device.
+
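+A hypothetical host-side bring-up of an NTB network interface built on these
+utilities (the module names are the standard in-tree NTB modules; the
+interface name and address are illustrative)::
+
+    # modprobe ntb_hw_epf
+    # modprobe ntb_transport
+    # modprobe ntb_netdev
+    # ip addr add 192.168.1.1/24 dev eth1
+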
+For more information on NTB see
+:doc:`Non-Transparent Bridge <../../driver-api/ntb>`
diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
index c17c87af1968..e73f84aebde3 100644
--- a/Documentation/PCI/index.rst
+++ b/Documentation/PCI/index.rst
@@ -1,8 +1,8 @@
.. SPDX-License-Identifier: GPL-2.0
-=======================
-Linux PCI Bus Subsystem
-=======================
+=================
+PCI Bus Subsystem
+=================
.. toctree::
:maxdepth: 2
diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst
index aa2046af69f7..8ae461e97c54 100644
--- a/Documentation/PCI/msi-howto.rst
+++ b/Documentation/PCI/msi-howto.rst
@@ -285,3 +285,13 @@ to bridges between the PCI root and the device, MSIs are disabled.
It is also worth checking the device driver to see whether it supports MSIs.
For example, it may contain calls to pci_alloc_irq_vectors() with the
PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
+
+
+List of device drivers MSI(-X) APIs
+===================================
+
+The PCI/MSI subsystem has a dedicated C file for its exported device driver
+APIs: `drivers/pci/msi/api.c`. The following functions are exported:
+
+.. kernel-doc:: drivers/pci/msi/api.c
+ :export:
diff --git a/Documentation/PCI/pci-error-recovery.rst b/Documentation/PCI/pci-error-recovery.rst
index 187f43a03200..9981d330da8f 100644
--- a/Documentation/PCI/pci-error-recovery.rst
+++ b/Documentation/PCI/pci-error-recovery.rst
@@ -83,6 +83,7 @@ This structure has the form::
int (*mmio_enabled)(struct pci_dev *dev);
int (*slot_reset)(struct pci_dev *dev);
void (*resume)(struct pci_dev *dev);
+ void (*cor_error_detected)(struct pci_dev *dev);
};
The possible channel states are::
@@ -417,10 +418,15 @@ That is, the recovery API only requires that:
- drivers/next/e100.c
- drivers/net/e1000
- drivers/net/e1000e
- - drivers/net/ixgb
- drivers/net/ixgbe
- drivers/net/cxgb3
- drivers/net/s2io.c
+ The cor_error_detected() callback is invoked in handle_error_source() when
+ the error severity is "correctable". The callback is optional and allows
+ additional logging to be done if desired. See example:
+
+ - drivers/cxl/pci.c
+
The End
-------
diff --git a/Documentation/PCI/pci-iov-howto.rst b/Documentation/PCI/pci-iov-howto.rst
index b9fd003206f1..27d35933cea2 100644
--- a/Documentation/PCI/pci-iov-howto.rst
+++ b/Documentation/PCI/pci-iov-howto.rst
@@ -125,14 +125,14 @@ Following piece of code illustrates the usage of the SR-IOV API.
...
}
- static int dev_suspend(struct pci_dev *dev, pm_message_t state)
+ static int dev_suspend(struct device *dev)
{
...
return 0;
}
- static int dev_resume(struct pci_dev *dev)
+ static int dev_resume(struct device *dev)
{
...
@@ -165,8 +165,7 @@ Following piece of code illustrates the usage of the SR-IOV API.
.id_table = dev_id_table,
.probe = dev_probe,
.remove = dev_remove,
- .suspend = dev_suspend,
- .resume = dev_resume,
+ .driver.pm = &dev_pm_ops,
.shutdown = dev_shutdown,
.sriov_configure = dev_sriov_configure,
};
diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst
index 87c6f4a6ca32..cced568d78e9 100644
--- a/Documentation/PCI/pci.rst
+++ b/Documentation/PCI/pci.rst
@@ -273,25 +273,25 @@ Set the DMA mask size
While all drivers should explicitly indicate the DMA capability
(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
32-bit bus master capability for streaming data need the driver
-to "register" this capability by calling pci_set_dma_mask() with
+to "register" this capability by calling dma_set_mask() with
appropriate parameters. In general this allows more efficient DMA
on systems where System RAM exists above 4G _physical_ address.
Drivers for all PCI-X and PCIe compliant devices must call
-pci_set_dma_mask() as they are 64-bit DMA devices.
+dma_set_mask() as they are 64-bit DMA devices.
Similarly, drivers must also "register" this capability if the device
-can directly address "consistent memory" in System RAM above 4G physical
-address by calling pci_set_consistent_dma_mask().
+can directly address "coherent memory" in System RAM above 4G physical
+address by calling dma_set_coherent_mask().
Again, this includes drivers for all PCI-X and PCIe compliant devices.
Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
64-bit DMA capable for payload ("streaming") data but not control
-("consistent") data.
+("coherent") data.
Setup shared control data
-------------------------
-Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+Once the DMA masks are set, the driver can allocate "coherent" (a.k.a. shared)
memory. See Documentation/core-api/dma-api.rst for a full description of
the DMA APIs. This section is just a reminder that it needs to be done
before enabling DMA on the device.
@@ -367,7 +367,7 @@ steps need to be performed:
- Disable the device from generating IRQs
- Release the IRQ (free_irq())
- Stop all DMA activity
- - Release DMA buffers (both streaming and consistent)
+ - Release DMA buffers (both streaming and coherent)
- Unregister from other subsystems (e.g. scsi or netdev)
- Disable device from responding to MMIO/IO Port addresses
- Release MMIO/IO Port resource(s)
@@ -420,7 +420,7 @@ Once DMA is stopped, clean up streaming DMA first.
I.e. unmap data buffers and return buffers to "upstream"
owners if there is one.
-Then clean up "consistent" buffers which contain the control data.
+Then clean up "coherent" buffers which contain the control data.
See Documentation/core-api/dma-api.rst for details on unmapping interfaces.
diff --git a/Documentation/PCI/sysfs-pci.rst b/Documentation/PCI/sysfs-pci.rst
index 742fbd21dc1f..f495185aa88a 100644
--- a/Documentation/PCI/sysfs-pci.rst
+++ b/Documentation/PCI/sysfs-pci.rst
@@ -125,7 +125,7 @@ implementation of that functionality. To support the historical interface of
mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP.
Alternatively, platforms which set HAVE_PCI_MMAP may provide their own
-implementation of pci_mmap_page_range() instead of defining
+implementation of pci_mmap_resource_range() instead of defining
ARCH_GENERIC_PCI_MMAP_RESOURCE.
Platforms which support write-combining maps of PCI resources must define
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
index f4efd6897b09..b34990c7c377 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
@@ -973,7 +973,7 @@ The ``->dynticks`` field counts the corresponding CPU's transitions to
and from either dyntick-idle or user mode, so that this counter has an
even value when the CPU is in dyntick-idle mode or user mode and an odd
value otherwise. The transitions to/from user mode need to be counted
-for user mode adaptive-ticks support (see timers/NO_HZ.txt).
+for user mode adaptive-ticks support (see Documentation/timers/no_hz.rst).
The ``->rcu_need_heavy_qs`` field is used to record the fact that the
RCU core code would really like to see a quiescent state from the
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
index 6f89cf1e567d..93d899d53258 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -277,7 +277,7 @@ the following access functions:
Again, only one request in a given batch need actually carry out a
grace-period operation, which means there must be an efficient way to
-identify which of many concurrent reqeusts will initiate the grace
+identify which of many concurrent requests will initiate the grace
period, and that there be an efficient way for the remaining requests to
wait for that grace period to complete. However, that is the topic of
the next section.
@@ -405,8 +405,8 @@ Use of Workqueues
In earlier implementations, the task requesting the expedited grace
period also drove it to completion. This straightforward approach had
the disadvantage of needing to account for POSIX signals sent to user
-tasks, so more recent implemementations use the Linux kernel's
-`workqueues <https://www.kernel.org/doc/Documentation/core-api/workqueue.rst>`__.
+tasks, so more recent implementations use the Linux kernel's
+workqueues (see Documentation/core-api/workqueue.rst).
The requesting task still does counter snapshotting and funnel-lock
processing, but the task reaching the top of the funnel lock does a
@@ -465,7 +465,7 @@ corresponding disadvantage that workqueues cannot be used until they are
initialized, which does not happen until some time after the scheduler
spawns the first task. Given that there are parts of the kernel that
really do want to execute grace periods during this mid-boot “dead
-zone”, expedited grace periods must do something else during thie time.
+zone”, expedited grace periods must do something else during this time.
What they do is to fall back to the old practice of requiring that the
requesting task drive the expedited grace period, as was the case before
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel0.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel0.svg
index 98af66557908..16b1ff0ad38c 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel0.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel0.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel1.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel1.svg
index e0184a37aec7..684a4b969725 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel1.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel1.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel2.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel2.svg
index 1bc3fed54d58..8fb2454d9544 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel2.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel2.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel3.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel3.svg
index 6d8a1bffb3e4..5d4f22d5662c 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel3.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel3.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel4.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel4.svg
index 44018fd6342b..b89b02869914 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel4.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel4.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel5.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel5.svg
index e5eef50454fb..90f1c77bea2f 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel5.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel5.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel6.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel6.svg
index fbd2c1892886..3e5651da031a 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel6.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel6.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel7.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel7.svg
index 502e159ed278..9483f08d345e 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel7.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel7.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel8.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel8.svg
index 677401551c7d..1101ec30e604 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel8.svg
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Funnel8.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -125,7 +125,7 @@
y="492.36218" /></flowRegion><flowPara
id="flowPara2991" /></flowRoot> <text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="362.371"
y="262.51819"
id="text4441"
diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
index eeb351296df1..5750f125361b 100644
--- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
+++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
@@ -168,7 +168,7 @@ an ``atomic_add_return()`` of zero) to detect idle CPUs.
+-----------------------------------------------------------------------+
The approach must be extended to handle one final case, that of waking a
-task blocked in ``synchronize_rcu()``. This task might be affinitied to
+task blocked in ``synchronize_rcu()``. This task might be affined to
a CPU that is not yet aware that the grace period has ended, and thus
might not yet be subject to the grace period's memory ordering.
Therefore, there is an ``smp_mb()`` after the return from
@@ -202,49 +202,44 @@ newly arrived RCU callbacks against future grace periods:
1 static void rcu_prepare_for_idle(void)
2 {
3 bool needwake;
- 4 struct rcu_data *rdp;
- 5 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- 6 struct rcu_node *rnp;
- 7 struct rcu_state *rsp;
- 8 int tne;
- 9
- 10 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
- 11 rcu_is_nocb_cpu(smp_processor_id()))
- 12 return;
+ 4 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ 5 struct rcu_node *rnp;
+ 6 int tne;
+ 7
+ 8 lockdep_assert_irqs_disabled();
+ 9 if (rcu_rdp_is_offloaded(rdp))
+ 10 return;
+ 11
+ 12 /* Handle nohz enablement switches conservatively. */
13 tne = READ_ONCE(tick_nohz_active);
- 14 if (tne != rdtp->tick_nohz_enabled_snap) {
- 15 if (rcu_cpu_has_callbacks(NULL))
- 16 invoke_rcu_core();
- 17 rdtp->tick_nohz_enabled_snap = tne;
+ 14 if (tne != rdp->tick_nohz_enabled_snap) {
+ 15 if (!rcu_segcblist_empty(&rdp->cblist))
+ 16 invoke_rcu_core(); /* force nohz to see update. */
+ 17 rdp->tick_nohz_enabled_snap = tne;
18 return;
- 19 }
+ 19 }
20 if (!tne)
21 return;
- 22 if (rdtp->all_lazy &&
- 23 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
- 24 rdtp->all_lazy = false;
- 25 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
- 26 invoke_rcu_core();
- 27 return;
- 28 }
- 29 if (rdtp->last_accelerate == jiffies)
- 30 return;
- 31 rdtp->last_accelerate = jiffies;
- 32 for_each_rcu_flavor(rsp) {
- 33 rdp = this_cpu_ptr(rsp->rda);
- 34 if (rcu_segcblist_pend_cbs(&rdp->cblist))
- 35 continue;
- 36 rnp = rdp->mynode;
- 37 raw_spin_lock_rcu_node(rnp);
- 38 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
- 39 raw_spin_unlock_rcu_node(rnp);
- 40 if (needwake)
- 41 rcu_gp_kthread_wake(rsp);
- 42 }
- 43 }
+ 22
+ 23 /*
+ 24 * If we have not yet accelerated this jiffy, accelerate all
+ 25 * callbacks on this CPU.
+ 26 */
+ 27 if (rdp->last_accelerate == jiffies)
+ 28 return;
+ 29 rdp->last_accelerate = jiffies;
+ 30 if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ 31 rnp = rdp->mynode;
+ 32 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ 33 needwake = rcu_accelerate_cbs(rnp, rdp);
+ 34 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ 35 if (needwake)
+ 36 rcu_gp_kthread_wake();
+ 37 }
+ 38 }
But the only part of ``rcu_prepare_for_idle()`` that really matters for
-this discussion are lines 37–39. We will therefore abbreviate this
+this discussion are lines 32–34. We will therefore abbreviate this
function as follows:
.. kernel-figure:: rcu_node-lock.svg
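(Editor's note for readers viewing this patch as plain text, where the
figure is not rendered: the three lines the updated prose points at,
32–34 in the new listing above, are simply the ``rcu_node``-lock
critical section::

    raw_spin_lock_rcu_node(rnp);          /* irqs already disabled. */
    needwake = rcu_accelerate_cbs(rnp, rdp);
    raw_spin_unlock_rcu_node(rnp);        /* irqs remain disabled. */

Recall that raw_spin_lock_rcu_node() also supplies the
smp_mb__after_unlock_lock() ordering discussed earlier in this
document.)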
diff --git a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
index 4b4014fda770..87851a8fac1e 100644
--- a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
+++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
@@ -88,7 +88,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -103,7 +103,7 @@
id="text2993"
y="-261.66608"
x="412.12299"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
xml:space="preserve"
transform="matrix(0,1,-1,0,0,0)"><tspan
y="-261.66608"
@@ -135,7 +135,7 @@
</g>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.04738"
y="268.18076"
id="text4429"
@@ -146,7 +146,7 @@
y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.04738"
y="439.13766"
id="text4441"
@@ -157,7 +157,7 @@
y="439.13766">WRITE_ONCE(b, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="255.60869"
y="309.29346"
id="text4445"
@@ -168,7 +168,7 @@
y="309.29346">r1 = READ_ONCE(a);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="255.14423"
y="520.61786"
id="text4449"
@@ -179,7 +179,7 @@
y="520.61786">WRITE_ONCE(c, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="384.71124"
id="text4453"
@@ -190,7 +190,7 @@
y="384.71124">r2 = READ_ONCE(b);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="582.13617"
id="text4457"
@@ -201,7 +201,7 @@
y="582.13617">r3 = READ_ONCE(c);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.08231"
y="213.91006"
id="text4461"
@@ -212,7 +212,7 @@
y="213.91006">thread0()</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="252.34512"
y="213.91006"
id="text4461-6"
@@ -223,7 +223,7 @@
y="213.91006">thread1()</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.42557"
y="213.91006"
id="text4461-2"
@@ -251,7 +251,7 @@
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="111.75929"
y="251.53981"
id="text4429-8"
@@ -262,7 +262,7 @@
y="251.53981">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="367.91556"
id="text4429-8-9"
@@ -273,7 +273,7 @@
y="367.91556">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="597.40289"
id="text4429-8-9-3"
@@ -284,7 +284,7 @@
y="597.40289">rcu_read_unlock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="111.75929"
y="453.15311"
id="text4429-8-9-3-1"
@@ -300,7 +300,7 @@
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="394.94427"
y="345.66351"
id="text4648"
@@ -324,7 +324,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.11968"
y="475.77856"
id="text4648-4"
@@ -361,7 +361,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="254.85066"
y="348.96619"
id="text4648-4-3"
diff --git a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
index 48cd1623d4d4..e2a8af592bab 100644
--- a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
+++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
@@ -116,7 +116,7 @@
<flowRoot
xml:space="preserve"
id="flowRoot2985"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"><flowRegion
id="flowRegion2987"><rect
id="rect2989"
width="82.85714"
@@ -131,7 +131,7 @@
id="text2993"
y="-261.66608"
x="436.12299"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
xml:space="preserve"
transform="matrix(0,1,-1,0,0,0)"><tspan
y="-261.66608"
@@ -163,7 +163,7 @@
</g>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.04738"
y="268.18076"
id="text4429"
@@ -174,7 +174,7 @@
y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.04738"
y="487.13766"
id="text4441"
@@ -185,7 +185,7 @@
y="487.13766">WRITE_ONCE(b, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="255.60869"
y="297.29346"
id="text4445"
@@ -196,7 +196,7 @@
y="297.29346">r1 = READ_ONCE(a);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="255.14423"
y="554.61786"
id="text4449"
@@ -207,7 +207,7 @@
y="554.61786">WRITE_ONCE(c, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="370.71124"
id="text4453"
@@ -218,7 +218,7 @@
y="370.71124">WRITE_ONCE(d, 1);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="572.13617"
id="text4457"
@@ -229,7 +229,7 @@
y="572.13617">r2 = READ_ONCE(c);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.08231"
y="213.91006"
id="text4461"
@@ -240,7 +240,7 @@
y="213.91006">thread0()</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="252.34512"
y="213.91006"
id="text4461-6"
@@ -251,7 +251,7 @@
y="213.91006">thread1()</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.42557"
y="213.91006"
id="text4461-2"
@@ -281,7 +281,7 @@
sodipodi:nodetypes="cc" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="111.75929"
y="251.53981"
id="text4429-8"
@@ -292,7 +292,7 @@
y="251.53981">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="353.91556"
id="text4429-8-9"
@@ -303,7 +303,7 @@
y="353.91556">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="396.10254"
y="587.40289"
id="text4429-8-9-3"
@@ -314,7 +314,7 @@
y="587.40289">rcu_read_unlock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="111.75929"
y="501.15311"
id="text4429-8-9-3-1"
@@ -331,7 +331,7 @@
sodipodi:nodetypes="cc" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="394.94427"
y="331.66351"
id="text4648"
@@ -355,7 +355,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="112.11968"
y="523.77856"
id="text4648-4"
@@ -392,7 +392,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="254.85066"
y="336.96619"
id="text4648-4-3"
@@ -421,7 +421,7 @@
id="text2993-7"
y="-261.66608"
x="440.12299"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
xml:space="preserve"
transform="matrix(0,1,-1,0,0,0)"><tspan
y="-261.66608"
@@ -453,7 +453,7 @@
</g>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="541.70508"
y="387.6217"
id="text4445-0"
@@ -464,7 +464,7 @@
y="387.6217">r3 = READ_ONCE(d);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="541.2406"
y="646.94611"
id="text4449-6"
@@ -488,7 +488,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="540.94702"
y="427.29443"
id="text4648-4-3-1"
@@ -499,7 +499,7 @@
y="427.29443">QS</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="686.27747"
y="461.83929"
id="text4453-7"
@@ -510,7 +510,7 @@
y="461.83929">r4 = READ_ONCE(b);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="686.27747"
y="669.26422"
id="text4457-9"
@@ -521,7 +521,7 @@
y="669.26422">r5 = READ_ONCE(e);</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="686.27747"
y="445.04358"
id="text4429-8-9-33"
@@ -532,7 +532,7 @@
y="445.04358">rcu_read_lock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="686.27747"
y="684.53094"
id="text4429-8-9-3-8"
@@ -543,7 +543,7 @@
y="684.53094">rcu_read_unlock();</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="685.11914"
y="422.79153"
id="text4648-9"
@@ -567,7 +567,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="397.85934"
y="609.59003"
id="text4648-5"
@@ -591,7 +591,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="256.75986"
y="586.99133"
id="text4648-5-2"
@@ -615,7 +615,7 @@
sodipodi:open="true" />
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="546.22791"
y="213.91006"
id="text4461-2-5"
@@ -626,7 +626,7 @@
y="213.91006">thread3()</tspan></text>
<text
xml:space="preserve"
- style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:monospace;-inkscape-font-specification:monospace"
x="684.00067"
y="213.91006"
id="text4461-2-1"
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 45278e2974c0..49387d823619 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -370,8 +370,8 @@ pointer fetched by rcu_dereference() may not be used outside of the
outermost RCU read-side critical section containing that
rcu_dereference(), unless protection of the corresponding data
element has been passed from RCU to some other synchronization
-mechanism, most commonly locking or `reference
-counting <https://www.kernel.org/doc/Documentation/RCU/rcuref.txt>`__.
+mechanism, most commonly locking or reference counting
+(see ../../rcuref.rst).
.. |high-quality implementation of C11 memory_order_consume [PDF]| replace:: high-quality implementation of C11 ``memory_order_consume`` [PDF]
.. _high-quality implementation of C11 memory_order_consume [PDF]: http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf
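(Editor's sketch, not part of the patch: the RCU-to-reference-count
handoff that the reworded sentence describes commonly looks like the
following, where ``gp``, ``->refcnt``, ``do_something_with()``, and
``release_obj()`` are purely illustrative names::

    rcu_read_lock();
    p = rcu_dereference(gp);
    if (p && !kref_get_unless_zero(&p->refcnt))
            p = NULL;              /* Object already being freed. */
    rcu_read_unlock();

    if (p) {
            do_something_with(p);  /* Legal outside the critical section. */
            kref_put(&p->refcnt, release_obj);
    }

Once the reference is held, the pointer may outlive the enclosing RCU
read-side critical section.)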
@@ -1844,10 +1844,10 @@ that meets this requirement.
Furthermore, NMI handlers can be interrupted by what appear to RCU to be
normal interrupts. One way that this can happen is for code that
-directly invokes rcu_irq_enter() and rcu_irq_exit() to be called
+directly invokes ct_irq_enter() and ct_irq_exit() to be called
from an NMI handler. This astonishing fact of life prompted the current
-code structure, which has rcu_irq_enter() invoking
-rcu_nmi_enter() and rcu_irq_exit() invoking rcu_nmi_exit().
+code structure, which has ct_irq_enter() invoking
+ct_nmi_enter() and ct_irq_exit() invoking ct_nmi_exit().
And yes, I also learned of this requirement the hard way.
Loadable Modules
@@ -1858,7 +1858,7 @@ unloaded. After a given module has been unloaded, any attempt to call
one of its functions results in a segmentation fault. The module-unload
functions must therefore cancel any delayed calls to loadable-module
functions, for example, any outstanding mod_timer() must be dealt
-with via del_timer_sync() or similar.
+with via timer_shutdown_sync() or similar.
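(Editor's sketch, not from the patch: a module-exit path might first
shut down its timer and then wait for any already-posted callbacks.
As the next paragraph explains, those callbacks cannot be cancelled,
but rcu_barrier() can wait for them; ``mymod_exit()`` and
``mymod_timer`` are illustrative names::

    static void __exit mymod_exit(void)
    {
            timer_shutdown_sync(&mymod_timer); /* No further re-arming. */
            rcu_barrier();  /* Wait out posted call_rcu() callbacks. */
    }

)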
Unfortunately, there is no way to cancel an RCU callback; once you
invoke call_rcu(), the callback function is eventually going to be
@@ -2195,7 +2195,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be:
sections, and RCU believes this CPU to be idle, no problem. This
sort of thing is used by some architectures for light-weight
exception handlers, which can then avoid the overhead of
- rcu_irq_enter() and rcu_irq_exit() at exception entry and
+ ct_irq_enter() and ct_irq_exit() at exception entry and
exit, respectively. Some go further and avoid the entireties of
irq_enter() and irq_exit().
Just make very sure you are running some of your tests with
@@ -2226,7 +2226,7 @@ scheduling-clock interrupt be enabled when RCU needs it to be:
+-----------------------------------------------------------------------+
| **Answer**: |
+-----------------------------------------------------------------------+
-| One approach is to do ``rcu_irq_exit();rcu_irq_enter();`` every so |
+| One approach is to do ``ct_irq_exit();ct_irq_enter();`` every so |
| often. But given that long-running interrupt handlers can cause other |
| problems, not least for response time, shouldn't you work to keep |
| your interrupt handler's runtime within reasonable bounds? |
@@ -2654,6 +2654,38 @@ synchronize_rcu(), and rcu_barrier(), respectively. In
three APIs are therefore implemented by separate functions that check
for voluntary context switches.
+Tasks Rude RCU
+~~~~~~~~~~~~~~
+
+Some forms of tracing need to wait for all preemption-disabled regions
+of code running on any online CPU, including those executed when RCU is
+not watching. This means that synchronize_rcu() is insufficient, and
+Tasks Rude RCU must be used instead. This flavor of RCU does its work by
+forcing a workqueue to be scheduled on each online CPU, hence the "Rude"
+moniker. And this operation is considered to be quite rude by real-time
+workloads that don't want their ``nohz_full`` CPUs receiving IPIs and
+by battery-powered systems that don't want their idle CPUs to be awakened.
+
+The tasks-rude-RCU API is also reader-marking-free and thus quite compact,
+consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(),
+and rcu_barrier_tasks_rude().
+
+Tasks Trace RCU
+~~~~~~~~~~~~~~~
+
+Some forms of tracing need to sleep in readers, but cannot tolerate
+SRCU's read-side overhead, which includes a full memory barrier in both
+srcu_read_lock() and srcu_read_unlock(). This need is handled by a
+Tasks Trace RCU that uses scheduler locking and IPIs to synchronize with
+readers. Real-time systems that cannot tolerate IPIs may build their
+kernels with ``CONFIG_TASKS_TRACE_RCU_READ_MB=y``, which avoids the IPIs at
+the expense of adding full memory barriers to the read-side primitives.
+
+The tasks-trace-RCU API is also reasonably compact,
+consisting of rcu_read_lock_trace(), rcu_read_unlock_trace(),
+rcu_read_lock_trace_held(), call_rcu_tasks_trace(),
+synchronize_rcu_tasks_trace(), and rcu_barrier_tasks_trace().
+
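(Editor's sketch, not part of the patch: a rough pairing of the
reader- and updater-side Tasks Trace RCU primitives, with
``trace_hook`` and its ``->func``/``->arg`` fields hypothetical::

    /* Reader, which is permitted to block: */
    rcu_read_lock_trace();
    hook = rcu_dereference_check(trace_hook, rcu_read_lock_trace_held());
    if (hook)
            hook->func(hook->arg);         /* May sleep. */
    rcu_read_unlock_trace();

    /* Updater: */
    old = rcu_replace_pointer(trace_hook, NULL, true);
    synchronize_rcu_tasks_trace();         /* Wait for all trace readers. */
    kfree(old);

)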
Possible Future Changes
-----------------------
diff --git a/Documentation/RCU/NMI-RCU.rst b/Documentation/RCU/NMI-RCU.rst
index 2a92bc685ef1..dff60a80b386 100644
--- a/Documentation/RCU/NMI-RCU.rst
+++ b/Documentation/RCU/NMI-RCU.rst
@@ -8,7 +8,7 @@ Although RCU is usually used to protect read-mostly data structures,
it is possible to use RCU to provide dynamic non-maskable interrupt
handlers, as well as dynamic irq handlers. This document describes
how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
-work in "arch/x86/kernel/traps.c".
+work in an old version of "arch/x86/kernel/traps.c".
The relevant pieces of code are listed below, each followed by a
brief explanation::
@@ -116,7 +116,7 @@ Answer to Quick Quiz:
This same sad story can happen on other CPUs when using
a compiler with aggressive pointer-value speculation
- optimizations.
+ optimizations. (But please don't!)
More important, the rcu_dereference_sched() makes it
clear to someone reading the code that the pointer is
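(Editor's sketch: the pattern this document discusses reduces to
something like the following, with the ``nmi_handler`` pointer name
illustrative::

    /* NMI handler: */
    fn = rcu_dereference_sched(nmi_handler);
    if (fn)
            fn(regs);

    /* Unregistering a dynamic handler: */
    rcu_assign_pointer(nmi_handler, NULL);
    synchronize_rcu();  /* Old handler no longer referenced afterward. */

)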
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 588d97366a46..db8f16b392aa 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -201,7 +201,7 @@ work looked at debugging uses of RCU [Seyster:2011:RFA:2075416.2075425].
In 2012, Josh Triplett received his Ph.D. with his dissertation
covering RCU-protected resizable hash tables and the relationship
between memory barriers and read-side traversal order: If the updater
-is making changes in the opposite direction from the read-side traveral
+is making changes in the opposite direction from the read-side traversal
order, the updater need only execute a memory-barrier instruction,
but if in the same direction, the updater needs to wait for a grace
period between the individual updates [JoshTriplettPhD]. Also in 2012,
@@ -1245,7 +1245,7 @@ Oregon Health and Sciences University"
[Viewed September 5, 2005]"
,annotation={
First posting showing how RCU can be safely adapted for
- preemptable RCU read side critical sections.
+ preemptible RCU read side critical sections.
}
}
@@ -1888,7 +1888,7 @@ Revised:
\url{https://lore.kernel.org/r/20070910183004.GA3299@linux.vnet.ibm.com}
[Viewed October 25, 2007]"
,annotation={
- Final patch for preemptable RCU to -rt. (Later patches were
+ Final patch for preemptible RCU to -rt. (Later patches were
to mainline, eventually incorporated.)
}
}
@@ -2275,7 +2275,7 @@ lot of {Linux} into your technology!!!"
\url{https://lore.kernel.org/r/20090724001429.GA17374@linux.vnet.ibm.com}
[Viewed August 15, 2009]"
,annotation={
- First posting of simple and fast preemptable RCU.
+ First posting of simple and fast preemptible RCU.
}
}
@@ -2639,7 +2639,7 @@ lot of {Linux} into your technology!!!"
RCU-protected hash tables, barriers vs. read-side traversal order.
.
If the updater is making changes in the opposite direction from
- the read-side traveral order, the updater need only execute a
+ the read-side traversal order, the updater need only execute a
memory-barrier instruction, but if in the same direction, the
updater needs to wait for a grace period between the individual
updates.
diff --git a/Documentation/RCU/UP.rst b/Documentation/RCU/UP.rst
index e26dda27430c..4060d7a2f62a 100644
--- a/Documentation/RCU/UP.rst
+++ b/Documentation/RCU/UP.rst
@@ -38,7 +38,7 @@ by having call_rcu() directly invoke its arguments only if it was called
from process context. However, this can fail in a similar manner.
Suppose that an RCU-based algorithm again scans a linked list containing
-elements A, B, and C in process contexts, but that it invokes a function
+elements A, B, and C in process context, but that it invokes a function
on each element as it is scanned. Suppose further that this function
deletes element B from the list, then passes it to call_rcu() for deferred
freeing. This may be a bit unconventional, but it is perfectly legal
@@ -59,7 +59,8 @@ Example 3: Death by Deadlock
Suppose that call_rcu() is invoked while holding a lock, and that the
callback function must acquire this same lock. In this case, if
call_rcu() were to directly invoke the callback, the result would
-be self-deadlock.
+be self-deadlock *even if* this invocation occurred from a later
+call_rcu() invocation a full grace period later.
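(Editor's sketch of the trap, with ``my_lock``, ``my_cb()``, and
``objp`` illustrative::

    static DEFINE_SPINLOCK(my_lock);

    static void my_cb(struct rcu_head *rhp)
    {
            spin_lock(&my_lock);   /* Deadlocks if invoked synchronously. */
            /* ... clean up ... */
            spin_unlock(&my_lock);
    }

    spin_lock(&my_lock);
    call_rcu(&objp->rh, my_cb);    /* Must defer, not invoke my_cb() now. */
    spin_unlock(&my_lock);

)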
In some cases, it would be possible to restructure the code so that
the call_rcu() is delayed until after the lock is released. However,
@@ -85,6 +86,14 @@ Quick Quiz #2:
:ref:`Answers to Quick Quiz <answer_quick_quiz_up>`
+It is important to note that userspace RCU implementations *do*
+permit call_rcu() to directly invoke callbacks, but only if a full
+grace period has elapsed since those callbacks were queued. This is
+the case because some userspace environments are extremely constrained.
+Nevertheless, people writing userspace RCU implementations are strongly
+encouraged to avoid invoking callbacks from call_rcu(), thus obtaining
+the deadlock-avoidance benefits called out above.
+
Summary
-------
@@ -98,7 +107,7 @@ UP systems, including PREEMPT SMP builds running on UP systems.
Quick Quiz #3:
Why can't synchronize_rcu() return immediately on UP systems running
- preemptable RCU?
+ preemptible RCU?
.. _answer_quick_quiz_up:
@@ -134,7 +143,7 @@ Answer to Quick Quiz #2:
Answer to Quick Quiz #3:
Why can't synchronize_rcu() return immediately on UP systems
- running preemptable RCU?
+ running preemptible RCU?
Because some other task might have been preempted in the middle
of an RCU read-side critical section. If synchronize_rcu()
diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst
deleted file mode 100644
index 4051ea3871ef..000000000000
--- a/Documentation/RCU/arrayRCU.rst
+++ /dev/null
@@ -1,165 +0,0 @@
-.. _array_rcu_doc:
-
-Using RCU to Protect Read-Mostly Arrays
-=======================================
-
-Although RCU is more commonly used to protect linked lists, it can
-also be used to protect arrays. Three situations are as follows:
-
-1. :ref:`Hash Tables <hash_tables>`
-
-2. :ref:`Static Arrays <static_arrays>`
-
-3. :ref:`Resizable Arrays <resizable_arrays>`
-
-Each of these three situations involves an RCU-protected pointer to an
-array that is separately indexed. It might be tempting to consider use
-of RCU to instead protect the index into an array, however, this use
-case is **not** supported. The problem with RCU-protected indexes into
-arrays is that compilers can play way too many optimization games with
-integers, which means that the rules governing handling of these indexes
-are far more trouble than they are worth. If RCU-protected indexes into
-arrays prove to be particularly valuable (which they have not thus far),
-explicit cooperation from the compiler will be required to permit them
-to be safely used.
-
-That aside, each of the three RCU-protected pointer situations are
-described in the following sections.
-
-.. _hash_tables:
-
-Situation 1: Hash Tables
-------------------------
-
-Hash tables are often implemented as an array, where each array entry
-has a linked-list hash chain. Each hash chain can be protected by RCU
-as described in the listRCU.txt document. This approach also applies
-to other array-of-list situations, such as radix trees.
-
-.. _static_arrays:
-
-Situation 2: Static Arrays
---------------------------
-
-Static arrays, where the data (rather than a pointer to the data) is
-located in each array element, and where the array is never resized,
-have not been used with RCU. Rik van Riel recommends using seqlock in
-this situation, which would also have minimal read-side overhead as long
-as updates are rare.
-
-Quick Quiz:
- Why is it so important that updates be rare when using seqlock?
-
-:ref:`Answer to Quick Quiz <answer_quick_quiz_seqlock>`
-
-.. _resizable_arrays:
-
-Situation 3: Resizable Arrays
-------------------------------
-
-Use of RCU for resizable arrays is demonstrated by the grow_ary()
-function formerly used by the System V IPC code. The array is used
-to map from semaphore, message-queue, and shared-memory IDs to the data
-structure that represents the corresponding IPC construct. The grow_ary()
-function does not acquire any locks; instead its caller must hold the
-ids->sem semaphore.
-
-The grow_ary() function, shown below, does some limit checks, allocates a
-new ipc_id_ary, copies the old to the new portion of the new, initializes
-the remainder of the new, updates the ids->entries pointer to point to
-the new array, and invokes ipc_rcu_putref() to free up the old array.
-Note that rcu_assign_pointer() is used to update the ids->entries pointer,
-which includes any memory barriers required on whatever architecture
-you are running on::
-
- static int grow_ary(struct ipc_ids* ids, int newsize)
- {
- struct ipc_id_ary* new;
- struct ipc_id_ary* old;
- int i;
- int size = ids->entries->size;
-
- if(newsize > IPCMNI)
- newsize = IPCMNI;
- if(newsize <= size)
- return newsize;
-
- new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize +
- sizeof(struct ipc_id_ary));
- if(new == NULL)
- return size;
- new->size = newsize;
- memcpy(new->p, ids->entries->p,
- sizeof(struct kern_ipc_perm *)*size +
- sizeof(struct ipc_id_ary));
- for(i=size;i<newsize;i++) {
- new->p[i] = NULL;
- }
- old = ids->entries;
-
- /*
- * Use rcu_assign_pointer() to make sure the memcpyed
- * contents of the new array are visible before the new
- * array becomes visible.
- */
- rcu_assign_pointer(ids->entries, new);
-
- ipc_rcu_putref(old);
- return newsize;
- }
-
-The ipc_rcu_putref() function decrements the array's reference count
-and then, if the reference count has dropped to zero, uses call_rcu()
-to free the array after a grace period has elapsed.
-
-The array is traversed by the ipc_lock() function. This function
-indexes into the array under the protection of rcu_read_lock(),
-using rcu_dereference() to pick up the pointer to the array so
-that it may later safely be dereferenced -- memory barriers are
-required on the Alpha CPU. Since the size of the array is stored
-with the array itself, there can be no array-size mismatches, so
-a simple check suffices. The pointer to the structure corresponding
-to the desired IPC object is placed in "out", with NULL indicating
-a non-existent entry. After acquiring "out->lock", the "out->deleted"
-flag indicates whether the IPC object is in the process of being
-deleted, and, if not, the pointer is returned::
-
- struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
- {
- struct kern_ipc_perm* out;
- int lid = id % SEQ_MULTIPLIER;
- struct ipc_id_ary* entries;
-
- rcu_read_lock();
- entries = rcu_dereference(ids->entries);
- if(lid >= entries->size) {
- rcu_read_unlock();
- return NULL;
- }
- out = entries->p[lid];
- if(out == NULL) {
- rcu_read_unlock();
- return NULL;
- }
- spin_lock(&out->lock);
-
- /* ipc_rmid() may have already freed the ID while ipc_lock
- * was spinning: here verify that the structure is still valid
- */
- if (out->deleted) {
- spin_unlock(&out->lock);
- rcu_read_unlock();
- return NULL;
- }
- return out;
- }
-
-.. _answer_quick_quiz_seqlock:
-
-Answer to Quick Quiz:
- Why is it so important that updates be rare when using seqlock?
-
- The reason that it is important that updates be rare when
- using seqlock is that frequent updates can livelock readers.
- One way to avoid this problem is to assign a seqlock for
- each array entry rather than to the entire array.
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index f4545b7c9a63..bd3c58c44bef 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -32,8 +32,8 @@ over a rather long period of time, but improvements are always welcome!
for lockless updates. This does result in the mildly
counter-intuitive situation where rcu_read_lock() and
rcu_read_unlock() are used to protect updates, however, this
- approach provides the same potential simplifications that garbage
- collectors do.
+ approach can provide the same simplifications to certain types
+ of lockless algorithms that garbage collectors do.
1. Does the update code have proper mutual exclusion?
@@ -49,12 +49,12 @@ over a rather long period of time, but improvements are always welcome!
them -- even x86 allows later loads to be reordered to precede
earlier stores), and be prepared to explain why this added
complexity is worthwhile. If you choose #c, be prepared to
- explain how this single task does not become a major bottleneck on
- big multiprocessor machines (for example, if the task is updating
- information relating to itself that other tasks can read, there
- by definition can be no bottleneck). Note that the definition
- of "large" has changed significantly: Eight CPUs was "large"
- in the year 2000, but a hundred CPUs was unremarkable in 2017.
+ explain how this single task does not become a major bottleneck
+ on large systems (for example, if the task is updating information
+ relating to itself that other tasks can read, there by definition
+ can be no bottleneck). Note that the definition of "large" has
+ changed significantly: Eight CPUs was "large" in the year 2000,
+ but a hundred CPUs was unremarkable in 2017.
2. Do the RCU read-side critical sections make proper use of
rcu_read_lock() and friends? These primitives are needed
@@ -66,8 +66,13 @@ over a rather long period of time, but improvements are always welcome!
As a rough rule of thumb, any dereference of an RCU-protected
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
rcu_read_lock_sched(), or by the appropriate update-side lock.
- Disabling of preemption can serve as rcu_read_lock_sched(), but
- is less readable and prevents lockdep from detecting locking issues.
+ Explicit disabling of preemption (preempt_disable(), for example)
+ can serve as rcu_read_lock_sched(), but is less readable and
+ prevents lockdep from detecting locking issues.
+
+ Please note that you *cannot* rely on code known to be built
+ only in non-preemptible kernels. Such code can and will break,
+ especially in kernels built with CONFIG_PREEMPT_COUNT=y.
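(Editor's sketch: both of the following readers are legal, but only
the second documents its intent and gives lockdep something to check;
``gp`` and ``do_something()`` are illustrative::

    /* Works, but opaque: */
    preempt_disable();
    p = rcu_dereference_sched(gp);
    do_something(p);
    preempt_enable();

    /* Preferred: */
    rcu_read_lock_sched();
    p = rcu_dereference_sched(gp);
    do_something(p);
    rcu_read_unlock_sched();

)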
Letting RCU-protected pointers "leak" out of an RCU read-side
critical section is every bit as bad as letting them leak out
@@ -92,33 +97,38 @@ over a rather long period of time, but improvements are always welcome!
b. Proceed as in (a) above, but also maintain per-element
locks (that are acquired by both readers and writers)
- that guard per-element state. Of course, fields that
- the readers refrain from accessing can be guarded by
- some other lock acquired only by updaters, if desired.
+ that guard per-element state. Fields that the readers
+ refrain from accessing can be guarded by some other lock
+ acquired only by updaters, if desired.
- This works quite well, also.
+ This also works quite well.
c. Make updates appear atomic to readers. For example,
pointer updates to properly aligned fields will
appear atomic, as will individual atomic primitives.
Sequences of operations performed under a lock will *not*
appear to be atomic to RCU readers, nor will sequences
- of multiple atomic primitives.
+ of multiple atomic primitives. One alternative is to
+ move multiple individual fields to a separate structure,
+ thus solving the multiple-field problem by imposing an
+ additional level of indirection.
This can work, but is starting to get a bit tricky.
- d. Carefully order the updates and the reads so that
- readers see valid data at all phases of the update.
- This is often more difficult than it sounds, especially
- given modern CPUs' tendency to reorder memory references.
- One must usually liberally sprinkle memory barriers
- (smp_wmb(), smp_rmb(), smp_mb()) through the code,
- making it difficult to understand and to test.
-
- It is usually better to group the changing data into
- a separate structure, so that the change may be made
- to appear atomic by updating a pointer to reference
- a new structure containing updated values.
+ d. Carefully order the updates and the reads so that readers
+ see valid data at all phases of the update. This is often
+ more difficult than it sounds, especially given modern
+ CPUs' tendency to reorder memory references. One must
+ usually liberally sprinkle memory-ordering operations
+ through the code, making it difficult to understand and
+ to test. Where it works, it is better to use things
+ like smp_store_release() and smp_load_acquire(), but in
+ some cases the smp_mb() full memory barrier is required.
+
+ As noted earlier, it is usually better to group the
+ changing data into a separate structure, so that the
+ change may be made to appear atomic by updating a pointer
+ to reference a new structure containing updated values.
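For illustration, here is a minimal sketch of that grouping approach. The ``struct foo``, ``struct foo_vals``, and foo_lock names are made up, but the pattern is the usual copy-then-publish update::

        struct foo_vals {
                int a;
                int b;          /* a and b must change together. */
        };

        struct foo {
                struct foo_vals __rcu *vals;
        };

        static DEFINE_SPINLOCK(foo_lock);

        /* Readers see either the old (a, b) pair or the new one. */
        static int foo_update(struct foo *fp, int a, int b)
        {
                struct foo_vals *newv = kmalloc(sizeof(*newv), GFP_KERNEL);
                struct foo_vals *oldv;

                if (!newv)
                        return -ENOMEM;
                newv->a = a;
                newv->b = b;
                spin_lock(&foo_lock);
                oldv = rcu_dereference_protected(fp->vals,
                                                 lockdep_is_held(&foo_lock));
                rcu_assign_pointer(fp->vals, newv);
                spin_unlock(&foo_lock);
                synchronize_rcu();      /* Wait for pre-existing readers. */
                kfree(oldv);
                return 0;
        }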
4. Weakly ordered CPUs pose special challenges. Almost all CPUs
are weakly ordered -- even x86 CPUs allow later loads to be
@@ -140,8 +150,7 @@ over a rather long period of time, but improvements are always welcome!
prevents destructive compiler optimizations. However,
with a bit of devious creativity, it is possible to
mishandle the return value from rcu_dereference().
- Please see rcu_dereference.txt in this directory for
- more information.
+ Please see rcu_dereference.rst for more information.
The rcu_dereference() primitive is used by the
various "_rcu()" list-traversal primitives, such
@@ -151,7 +160,7 @@ over a rather long period of time, but improvements are always welcome!
primitives. This is particularly useful in code that
is common to readers and updaters. However, lockdep
will complain if you access rcu_dereference() outside
- of an RCU read-side critical section. See lockdep.txt
+ of an RCU read-side critical section. See lockdep.rst
to learn what to do about this.
Of course, neither rcu_dereference() nor the "_rcu()"
@@ -184,23 +193,29 @@ over a rather long period of time, but improvements are always welcome!
when publicizing a pointer to a structure that can
be traversed by an RCU read-side critical section.
-5. If call_rcu() or call_srcu() is used, the callback function will
- be called from softirq context. In particular, it cannot block.
+5. If any of call_rcu(), call_srcu(), call_rcu_tasks(),
+ call_rcu_tasks_rude(), or call_rcu_tasks_trace() is used,
+ the callback function may be invoked from softirq context,
+ and in any case with bottom halves disabled. In particular,
+ this callback function cannot block. If you need the callback
+ to block, run that code in a workqueue handler scheduled from
+ the callback. The queue_rcu_work() function does this for you
+ in the case of call_rcu().
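For concreteness, here is a minimal sketch of that workqueue hand-off. The ``struct foo``, foo_reclaim_workfn(), and foo_mutex names are made up::

        struct foo {
                struct rcu_work rwork;
                /* ... */
        };

        static void foo_reclaim_workfn(struct work_struct *work)
        {
                struct foo *fp = container_of(to_rcu_work(work),
                                              struct foo, rwork);

                mutex_lock(&foo_mutex); /* Blocking is legal here. */
                /* ... blocking cleanup ... */
                mutex_unlock(&foo_mutex);
                kfree(fp);
        }

        /* In place of call_rcu() with a blocking callback: */
        INIT_RCU_WORK(&fp->rwork, foo_reclaim_workfn);
        queue_rcu_work(system_wq, &fp->rwork);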
6. Since synchronize_rcu() can block, it cannot be called
from any sort of irq context. The same rule applies
- for synchronize_srcu(), synchronize_rcu_expedited(), and
- synchronize_srcu_expedited().
+ for synchronize_srcu(), synchronize_rcu_expedited(),
+ synchronize_srcu_expedited(), synchronize_rcu_tasks(),
+ synchronize_rcu_tasks_rude(), and synchronize_rcu_tasks_trace().
The expedited forms of these primitives have the same semantics
- as the non-expedited forms, but expediting is both expensive and
- (with the exception of synchronize_srcu_expedited()) unfriendly
- to real-time workloads. Use of the expedited primitives should
- be restricted to rare configuration-change operations that would
- not normally be undertaken while a real-time workload is running.
- However, real-time workloads can use rcupdate.rcu_normal kernel
- boot parameter to completely disable expedited grace periods,
- though this might have performance implications.
+ as the non-expedited forms, but expediting is more CPU intensive.
+ Use of the expedited primitives should be restricted to rare
+ configuration-change operations that would not normally be
+ undertaken while a real-time workload is running. Note that
+ IPI-sensitive real-time workloads can use the rcupdate.rcu_normal
+ kernel boot parameter to completely disable expedited grace
+ periods, though this might have performance implications.
In particular, if you find yourself invoking one of the expedited
primitives repeatedly in a loop, please do everyone a favor:
@@ -208,8 +223,9 @@ over a rather long period of time, but improvements are always welcome!
a single non-expedited primitive to cover the entire batch.
This will very likely be faster than the loop containing the
expedited primitive, and will be much much easier on the rest
- of the system, especially to real-time workloads running on
- the rest of the system.
+ of the system, especially to real-time workloads running on the
+	of the system, especially to real-time workloads running on the
+	rest of the system. Alternatively, use asynchronous
+	primitives such as call_rcu().
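For example, assuming a made-up update_one_element() helper, this advice amounts to the following restructuring::

        /* Antipattern: one expedited grace period per update. */
        for (i = 0; i < n; i++) {
                update_one_element(i);
                synchronize_rcu_expedited();
        }

        /* Preferred: batch the updates, then wait just once. */
        for (i = 0; i < n; i++)
                update_one_element(i);
        synchronize_rcu();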
7. As of v4.20, a given kernel implements only one RCU flavor, which
is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y.
@@ -232,7 +248,8 @@ over a rather long period of time, but improvements are always welcome!
the corresponding readers must use rcu_read_lock_trace() and
rcu_read_unlock_trace(). If an updater uses call_rcu_tasks_rude()
or synchronize_rcu_tasks_rude(), then the corresponding readers
- must use anything that disables interrupts.
+ must use anything that disables preemption, for example,
+ preempt_disable() and preempt_enable().
Mixing things up will result in confusion and broken kernels, and
has even resulted in an exploitable security issue. Therefore,
@@ -246,15 +263,16 @@ over a rather long period of time, but improvements are always welcome!
that this usage is safe is that readers can use anything that
disables BH when updaters use call_rcu() or synchronize_rcu().
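To make the flavor-pairing rule concrete, here is a minimal sketch for RCU Tasks Trace, in which ``gp``, gp_lock, ``newp``, and free_old_foo() are made-up names::

        /* Reader: must use the Tasks Trace flavor throughout. */
        rcu_read_lock_trace();
        p = rcu_dereference_check(gp, rcu_read_lock_trace_held());
        if (p)
                do_something_with(p);
        rcu_read_unlock_trace();

        /* Updater: must use the matching flavor, not call_rcu(). */
        spin_lock(&gp_lock);
        old = rcu_replace_pointer(gp, newp, lockdep_is_held(&gp_lock));
        spin_unlock(&gp_lock);
        call_rcu_tasks_trace(&old->rh, free_old_foo);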
-8. Although synchronize_rcu() is slower than is call_rcu(), it
- usually results in simpler code. So, unless update performance is
- critically important, the updaters cannot block, or the latency of
- synchronize_rcu() is visible from userspace, synchronize_rcu()
- should be used in preference to call_rcu(). Furthermore,
- kfree_rcu() usually results in even simpler code than does
- synchronize_rcu() without synchronize_rcu()'s multi-millisecond
- latency. So please take advantage of kfree_rcu()'s "fire and
- forget" memory-freeing capabilities where it applies.
+8. Although synchronize_rcu() is slower than is call_rcu(),
+ it usually results in simpler code. So, unless update
+ performance is critically important, the updaters cannot block,
+ or the latency of synchronize_rcu() is visible from userspace,
+ synchronize_rcu() should be used in preference to call_rcu().
+ Furthermore, kfree_rcu() and kvfree_rcu() usually result
+ in even simpler code than does synchronize_rcu() without
+ synchronize_rcu()'s multi-millisecond latency. So please take
+ advantage of kfree_rcu()'s and kvfree_rcu()'s "fire and forget"
+	memory-freeing capabilities where they apply.
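As a minimal sketch, assuming a made-up ``struct foo`` that embeds both a list_head and an rcu_head::

        struct foo {
                struct list_head list;
                struct rcu_head rh;
        };

        /* Updater, holding the update-side lock: */
        list_del_rcu(&fp->list);
        kfree_rcu(fp, rh);  /* Free after a grace period, without blocking. */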
An especially important property of the synchronize_rcu()
primitive is that it automatically self-limits: if grace periods
@@ -264,8 +282,8 @@ over a rather long period of time, but improvements are always welcome!
cases where grace periods are delayed, as failing to do so can
result in excessive realtime latencies or even OOM conditions.
- Ways of gaining this self-limiting property when using call_rcu()
- include:
+ Ways of gaining this self-limiting property when using call_rcu(),
+ kfree_rcu(), or kvfree_rcu() include:
a. Keeping a count of the number of data-structure elements
used by the RCU-protected data structure, including
@@ -297,17 +315,21 @@ over a rather long period of time, but improvements are always welcome!
here is that superuser already has lots of ways to crash
the machine.
- d. Periodically invoke synchronize_rcu(), permitting a limited
+ d. Periodically invoke rcu_barrier(), permitting a limited
number of updates per grace period.
- The same cautions apply to call_srcu() and kfree_rcu().
+ The same cautions apply to call_srcu(), call_rcu_tasks(),
+ call_rcu_tasks_rude(), and call_rcu_tasks_trace(). This is
+ why there is an srcu_barrier(), rcu_barrier_tasks(),
+	rcu_barrier_tasks_rude(), and rcu_barrier_tasks_trace(),
+ respectively.
- Note that although these primitives do take action to avoid memory
- exhaustion when any given CPU has too many callbacks, a determined
- user could still exhaust memory. This is especially the case
- if a system with a large number of CPUs has been configured to
- offload all of its RCU callbacks onto a single CPU, or if the
- system has relatively little free memory.
+ Note that although these primitives do take action to avoid
+ memory exhaustion when any given CPU has too many callbacks,
+ a determined user or administrator can still exhaust memory.
+ This is especially the case if a system with a large number of
+ CPUs has been configured to offload all of its RCU callbacks onto
+ a single CPU, or if the system has relatively little free memory.
9. All RCU list-traversal primitives, which include
rcu_dereference(), list_for_each_entry_rcu(), and
@@ -323,7 +345,7 @@ over a rather long period of time, but improvements are always welcome!
primitives when the update-side lock is held is that doing so
can be quite helpful in reducing code bloat when common code is
shared between readers and updaters. Additional primitives
- are provided for this case, as discussed in lockdep.txt.
+ are provided for this case, as discussed in lockdep.rst.
One exception to this rule is when data is only ever added to
the linked data structure, and is never removed during any
@@ -336,14 +358,14 @@ over a rather long period of time, but improvements are always welcome!
and you don't hold the appropriate update-side lock, you *must*
use the "_rcu()" variants of the list macros. Failing to do so
will break Alpha, cause aggressive compilers to generate bad code,
- and confuse people trying to read your code.
+ and confuse people trying to understand your code.
11. Any lock acquired by an RCU callback must be acquired elsewhere
- with softirq disabled, e.g., via spin_lock_irqsave(),
- spin_lock_bh(), etc. Failing to disable softirq on a given
- acquisition of that lock will result in deadlock as soon as
- the RCU softirq handler happens to run your RCU callback while
- interrupting that acquisition's critical section.
+ with softirq disabled, e.g., via spin_lock_bh(). Failing to
+ disable softirq on a given acquisition of that lock will result
+ in deadlock as soon as the RCU softirq handler happens to run
+ your RCU callback while interrupting that acquisition's critical
+ section.
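A minimal sketch of this rule, with my_lock and my_rcu_callback() being made-up names::

        static DEFINE_SPINLOCK(my_lock);

        /* Process-context acquisitions must disable softirq: */
        spin_lock_bh(&my_lock);
        /* ... critical section ... */
        spin_unlock_bh(&my_lock);

        /* The RCU callback may run from softirq context: */
        static void my_rcu_callback(struct rcu_head *rhp)
        {
                spin_lock(&my_lock);    /* Cannot deadlock against the above. */
                /* ... */
                spin_unlock(&my_lock);
        }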
12. RCU callbacks can be and are executed in parallel. In many cases,
the callback code simply wrappers around kfree(), so that this
@@ -364,7 +386,17 @@ over a rather long period of time, but improvements are always welcome!
for some real-time workloads, this is the whole point of using
the rcu_nocbs= kernel boot parameter.
-13. Unlike other forms of RCU, it *is* permissible to block in an
+ In addition, do not assume that callbacks queued in a given order
+ will be invoked in that order, even if they all are queued on the
+ same CPU. Furthermore, do not assume that same-CPU callbacks will
+ be invoked serially. For example, in recent kernels, CPUs can be
+ switched between offloaded and de-offloaded callback invocation,
+ and while a given CPU is undergoing such a switch, its callbacks
+ might be concurrently invoked by that CPU's softirq handler and
+ that CPU's rcuo kthread. At such times, that CPU's callbacks
+ might be executed both concurrently and out of order.
+
+13. Unlike most flavors of RCU, it *is* permissible to block in an
SRCU read-side critical section (demarked by srcu_read_lock()
and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
Please note that if you don't need to sleep in read-side critical
@@ -404,6 +436,12 @@ over a rather long period of time, but improvements are always welcome!
never sends IPIs to other CPUs, so it is easier on
real-time workloads than is synchronize_rcu_expedited().
+ It is also permissible to sleep in RCU Tasks Trace read-side
+	critical sections, which are delimited by rcu_read_lock_trace() and
+ rcu_read_unlock_trace(). However, this is a specialized flavor
+ of RCU, and you should not use it without first checking with
+ its current users. In most cases, you should instead use SRCU.
+
Note that rcu_assign_pointer() relates to SRCU just as it does to
other forms of RCU, but instead of rcu_dereference() you should
use srcu_dereference() in order to avoid lockdep splats.
@@ -434,50 +472,62 @@ over a rather long period of time, but improvements are always welcome!
find problems as follows:
CONFIG_PROVE_LOCKING:
- check that accesses to RCU-protected data
- structures are carried out under the proper RCU
- read-side critical section, while holding the right
- combination of locks, or whatever other conditions
- are appropriate.
+ check that accesses to RCU-protected data structures
+ are carried out under the proper RCU read-side critical
+ section, while holding the right combination of locks,
+ or whatever other conditions are appropriate.
CONFIG_DEBUG_OBJECTS_RCU_HEAD:
- check that you don't pass the
- same object to call_rcu() (or friends) before an RCU
- grace period has elapsed since the last time that you
- passed that same object to call_rcu() (or friends).
+ check that you don't pass the same object to call_rcu()
+ (or friends) before an RCU grace period has elapsed
+ since the last time that you passed that same object to
+ call_rcu() (or friends).
__rcu sparse checks:
- tag the pointer to the RCU-protected data
- structure with __rcu, and sparse will warn you if you
- access that pointer without the services of one of the
- variants of rcu_dereference().
+ tag the pointer to the RCU-protected data structure
+ with __rcu, and sparse will warn you if you access that
+ pointer without the services of one of the variants
+ of rcu_dereference().
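For example, in this made-up fragment, sparse will complain about the direct assignment but not about the rcu_dereference()::

        struct foo __rcu *gp;           /* Tagged as RCU-protected. */
        struct foo *p;

        rcu_read_lock();
        p = rcu_dereference(gp);        /* OK. */
        rcu_read_unlock();

        p = gp;         /* Sparse warning: different address spaces. */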
These debugging aids can help you find problems that are
otherwise extremely difficult to spot.
-17. If you register a callback using call_rcu() or call_srcu(), and
- pass in a function defined within a loadable module, then it in
- necessary to wait for all pending callbacks to be invoked after
- the last invocation and before unloading that module. Note that
- it is absolutely *not* sufficient to wait for a grace period!
- The current (say) synchronize_rcu() implementation is *not*
- guaranteed to wait for callbacks registered on other CPUs.
- Or even on the current CPU if that CPU recently went offline
- and came back online.
+17. If you pass a callback function defined within a module to one of
+ call_rcu(), call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(),
+ or call_rcu_tasks_trace(), then it is necessary to wait for all
+ pending callbacks to be invoked before unloading that module.
+ Note that it is absolutely *not* sufficient to wait for a grace
+	period!  For example, the synchronize_rcu() implementation is *not*
+ guaranteed to wait for callbacks registered on other CPUs via
+ call_rcu(). Or even on the current CPU if that CPU recently
+ went offline and came back online.
You instead need to use one of the barrier functions:
- call_rcu() -> rcu_barrier()
- call_srcu() -> srcu_barrier()
+ - call_rcu_tasks() -> rcu_barrier_tasks()
+ - call_rcu_tasks_rude() -> rcu_barrier_tasks_rude()
+ - call_rcu_tasks_trace() -> rcu_barrier_tasks_trace()
However, these barrier functions are absolutely *not* guaranteed
- to wait for a grace period. In fact, if there are no call_rcu()
- callbacks waiting anywhere in the system, rcu_barrier() is within
- its rights to return immediately.
-
- So if you need to wait for both an RCU grace period and for
- all pre-existing call_rcu() callbacks, you will need to execute
- both rcu_barrier() and synchronize_rcu(), if necessary, using
- something like workqueues to to execute them concurrently.
-
- See rcubarrier.txt for more information.
+ to wait for a grace period. For example, if there are no
+ call_rcu() callbacks queued anywhere in the system, rcu_barrier()
+ can and will return immediately.
+
+ So if you need to wait for both a grace period and for all
+ pre-existing callbacks, you will need to invoke both functions,
+ with the pair depending on the flavor of RCU:
+
+ - Either synchronize_rcu() or synchronize_rcu_expedited(),
+ together with rcu_barrier()
+ - Either synchronize_srcu() or synchronize_srcu_expedited(),
+	  together with srcu_barrier()
+	- synchronize_rcu_tasks() and rcu_barrier_tasks()
+	- synchronize_rcu_tasks_rude() and rcu_barrier_tasks_rude()
+	- synchronize_rcu_tasks_trace() and rcu_barrier_tasks_trace()
+
+ If necessary, you can use something like workqueues to execute
+ the requisite pair of functions concurrently.
+
+ See rcubarrier.rst for more information.
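Here is a minimal sketch of such a concurrent invocation for the vanilla-RCU pairing, using a made-up work item::

        static void sync_rcu_workfn(struct work_struct *unused)
        {
                synchronize_rcu();
        }
        static DECLARE_WORK(sync_rcu_work, sync_rcu_workfn);

        static void wait_for_gp_and_callbacks(void)
        {
                schedule_work(&sync_rcu_work);  /* Grace period, in parallel... */
                rcu_barrier();                  /* ...with callback completion. */
                flush_work(&sync_rcu_work);
        }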
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index e703d3dbe60c..84a79903f6a8 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -9,7 +9,6 @@ RCU concepts
.. toctree::
:maxdepth: 3
- arrayRCU
checklist
lockdep
lockdep-splat
diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst
index 2a643e293fb4..bdc4bcc5289f 100644
--- a/Documentation/RCU/listRCU.rst
+++ b/Documentation/RCU/listRCU.rst
@@ -3,11 +3,10 @@
Using RCU to Protect Read-Mostly Linked Lists
=============================================
-One of the best applications of RCU is to protect read-mostly linked lists
-(``struct list_head`` in list.h). One big advantage of this approach
-is that all of the required memory barriers are included for you in
-the list macros. This document describes several applications of RCU,
-with the best fits first.
+One of the most common uses of RCU is protecting read-mostly linked lists
+(``struct list_head`` in list.h). One big advantage of this approach is
+that all of the required memory ordering is provided by the list macros.
+This document describes several list-based RCU use cases.
Example 1: Read-mostly list: Deferred Destruction
@@ -35,7 +34,8 @@ The code traversing the list of all processes typically looks like::
}
rcu_read_unlock();
-The simplified code for removing a process from a task list is::
+The simplified and heavily inlined code for removing a process from a
+task list is::
void release_task(struct task_struct *p)
{
@@ -45,39 +45,48 @@ The simplified code for removing a process from a task list is::
call_rcu(&p->rcu, delayed_put_task_struct);
}
-When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under
-``tasklist_lock`` writer lock protection, to remove the task from the list of
-all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals
-from corrupting the list. Readers using ``for_each_process()`` are not protected
-with the ``tasklist_lock``. To prevent readers from noticing changes in the list
-pointers, the ``task_struct`` object is freed only after one or more grace
-periods elapse (with the help of call_rcu()). This deferring of destruction
-ensures that any readers traversing the list will see valid ``p->tasks.next``
-pointers and deletion/freeing can happen in parallel with traversal of the list.
-This pattern is also called an **existence lock**, since RCU pins the object in
-memory until all existing readers finish.
+When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)``
+via __exit_signal() and __unhash_process() under ``tasklist_lock``
+writer lock protection. The list_del_rcu() invocation removes
+the task from the list of all tasks. The ``tasklist_lock``
+prevents concurrent list additions/removals from corrupting the
+list. Readers using ``for_each_process()`` are not protected with the
+``tasklist_lock``. To prevent readers from noticing changes in the list
+pointers, the ``task_struct`` object is freed only after one or more
+grace periods elapse, with the help of call_rcu(), which is invoked via
+put_task_struct_rcu_user(). This deferring of destruction ensures that
+any readers traversing the list will see valid ``p->tasks.next`` pointers
+and deletion/freeing can happen in parallel with traversal of the list.
+This pattern is also called an **existence lock**, since RCU refrains
+from invoking the delayed_put_task_struct() callback function until
+all existing readers finish, which guarantees that the ``task_struct``
+object in question will remain in existence until after the completion
+of all RCU readers that might possibly have a reference to that object.
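The callback itself is small; a simplified rendition of kernel/exit.c's delayed_put_task_struct(), omitting its bookkeeping, might look like this::

        static void delayed_put_task_struct(struct rcu_head *rhp)
        {
                struct task_struct *tsk;

                tsk = container_of(rhp, struct task_struct, rcu);
                put_task_struct(tsk);   /* Drop the reference; may free tsk. */
        }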
Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates
----------------------------------------------------------------------
-The best applications are cases where, if reader-writer locking were
-used, the read-side lock would be dropped before taking any action
-based on the results of the search. The most celebrated example is
-the routing table. Because the routing table is tracking the state of
-equipment outside of the computer, it will at times contain stale data.
-Therefore, once the route has been computed, there is no need to hold
-the routing table static during transmission of the packet. After all,
-you can hold the routing table static all you want, but that won't keep
-the external Internet from changing, and it is the state of the external
-Internet that really matters. In addition, routing entries are typically
-added or deleted, rather than being modified in place.
-
-A straightforward example of this use of RCU may be found in the
-system-call auditing support. For example, a reader-writer locked
+Some reader-writer locking use cases compute a value while holding
+the read-side lock, but continue to use that value after that lock is
+released. These use cases are often good candidates for conversion
+to RCU. One prominent example involves network packet routing.
+Because the packet-routing data tracks the state of equipment outside
+of the computer, it will at times contain stale data. Therefore, once
+the route has been computed, there is no need to hold the routing table
+static during transmission of the packet. After all, you can hold the
+routing table static all you want, but that won't keep the external
+Internet from changing, and it is the state of the external Internet
+that really matters. In addition, routing entries are typically added
+or deleted, rather than being modified in place. This is a rare example
+of the finite speed of light and the non-zero size of atoms actually
+helping to make synchronization lighter weight.
+
+A straightforward example of this type of RCU use case may be found in
+the system-call auditing support. For example, a reader-writer locked
implementation of ``audit_filter_task()`` might be as follows::
- static enum audit_state audit_filter_task(struct task_struct *tsk)
+ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
{
struct audit_entry *e;
enum audit_state state;
@@ -86,6 +95,8 @@ implementation of ``audit_filter_task()`` might be as follows::
/* Note: audit_filter_mutex held by caller. */
list_for_each_entry(e, &audit_tsklist, list) {
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ if (state == AUDIT_STATE_RECORD)
+ *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
read_unlock(&auditsc_lock);
return state;
}
@@ -101,7 +112,7 @@ you are turning auditing off, it is OK to audit a few extra system calls.
This means that RCU can be easily applied to the read side, as follows::
- static enum audit_state audit_filter_task(struct task_struct *tsk)
+ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
{
struct audit_entry *e;
enum audit_state state;
@@ -110,6 +121,8 @@ This means that RCU can be easily applied to the read side, as follows::
/* Note: audit_filter_mutex held by caller. */
list_for_each_entry_rcu(e, &audit_tsklist, list) {
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ if (state == AUDIT_STATE_RECORD)
+ *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
return state;
}
@@ -118,13 +131,15 @@ This means that RCU can be easily applied to the read side, as follows::
return AUDIT_BUILD_CONTEXT;
}
-The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock()
-and rcu_read_unlock(), respectively, and the list_for_each_entry() has
-become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives
-insert the read-side memory barriers that are required on DEC Alpha CPUs.
+The read_lock() and read_unlock() calls have become rcu_read_lock()
+and rcu_read_unlock(), respectively, and the list_for_each_entry()
+has become list_for_each_entry_rcu(). The **_rcu()** list-traversal
+primitives add READ_ONCE() and diagnostic checks for incorrect use
+outside of an RCU read-side critical section.
The changes to the update side are also straightforward. A reader-writer lock
-might be used as follows for deletion and insertion::
+might be used as follows for deletion and insertion in these simplified
+versions of audit_del_rule() and audit_add_rule()::
static inline int audit_del_rule(struct audit_rule *rule,
struct list_head *list)
@@ -188,16 +203,16 @@ Following are the RCU equivalents for these two functions::
return 0;
}
-Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a
+Normally, the write_lock() and write_unlock() would be replaced by a
spin_lock() and a spin_unlock(). But in this case, all callers hold
``audit_filter_mutex``, so no additional locking is required. The
-``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the
+auditsc_lock can therefore be eliminated, since use of RCU eliminates the
need for writers to exclude readers.
The list_del(), list_add(), and list_add_tail() primitives have been
replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu().
-The **_rcu()** list-manipulation primitives add memory barriers that are needed on
-weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the
+The **_rcu()** list-manipulation primitives add memory barriers that are
+needed on weakly ordered CPUs. The list_del_rcu() primitive omits the
pointer poisoning debug-assist code that would otherwise cause concurrent
readers to fail spectacularly.
@@ -238,7 +253,9 @@ need to be filled in)::
The RCU version creates a copy, updates the copy, then replaces the old
entry with the newly updated entry. This sequence of actions, allowing
concurrent reads while making a copy to perform an update, is what gives
-RCU (*read-copy update*) its name. The RCU code is as follows::
+RCU (*read-copy update*) its name.
+
+The RCU version of audit_upd_rule() is as follows::
static inline int audit_upd_rule(struct audit_rule *rule,
struct list_head *list,
@@ -267,6 +284,9 @@ RCU (*read-copy update*) its name. The RCU code is as follows::
Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the
writer lock would become a spinlock in this sort of code.
+The update_lsm_rule() function does something very similar, for those
+who would prefer to look at real Linux-kernel code.
+
Another use of this pattern can be found in the openswitch driver's *connection
tracking table* code in ``ct_limit_set()``. The table holds connection tracking
entries and has a limit on the maximum entries. There is one such table
@@ -281,9 +301,10 @@ Example 4: Eliminating Stale Data
---------------------------------
The auditing example above tolerates stale data, as do most algorithms
-that are tracking external state. Because there is a delay from the
-time the external state changes before Linux becomes aware of the change,
-additional RCU-induced staleness is generally not a problem.
+that are tracking external state.  After all, there is already a delay
+from the time the external state changes to the time Linux becomes
+aware of the change, so as noted earlier, a small quantity of additional
+RCU-induced staleness is generally not a problem.
However, there are many examples where stale data cannot be tolerated.
One example in the Linux kernel is the System V IPC (see the shm_lock()
@@ -302,7 +323,7 @@ Quick Quiz:
If the system-call audit module were to ever need to reject stale data, one way
to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the
-audit_entry structure, and modify ``audit_filter_task()`` as follows::
+``audit_entry`` structure, and modify audit_filter_task() as follows::
static enum audit_state audit_filter_task(struct task_struct *tsk)
{
@@ -319,6 +340,8 @@ audit_entry structure, and modify ``audit_filter_task()`` as follows::
return AUDIT_BUILD_CONTEXT;
}
rcu_read_unlock();
+ if (state == AUDIT_STATE_RECORD)
+ *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
return state;
}
}
@@ -326,12 +349,6 @@ audit_entry structure, and modify ``audit_filter_task()`` as follows::
return AUDIT_BUILD_CONTEXT;
}
-Note that this example assumes that entries are only added and deleted.
-Additional mechanism is required to deal correctly with the update-in-place
-performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would
-need additional memory barriers to ensure that the list_add_rcu() was really
-executed before the list_del_rcu().
-
The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the
spinlock as follows::
@@ -357,24 +374,32 @@ spinlock as follows::
This too assumes that the caller holds ``audit_filter_mutex``.
+Note that this example assumes that entries are only added and deleted.
+Additional mechanism is required to deal correctly with the update-in-place
+performed by audit_upd_rule(). For one thing, audit_upd_rule() would
+need to hold the locks of both the old ``audit_entry`` and its replacement
+while executing the list_replace_rcu().
+
Example 5: Skipping Stale Objects
---------------------------------
-For some usecases, reader performance can be improved by skipping stale objects
-during read-side list traversal if the object in concern is pending destruction
-after one or more grace periods. One such example can be found in the timerfd
-subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed - for example due to
-setting of the system time, then all programmed timerfds that depend on this
-clock get triggered and processes waiting on them to expire are woken up in
-advance of their scheduled expiry. To facilitate this, all such timers are added
-to an RCU-managed ``cancel_list`` when they are setup in
+For some use cases, reader performance can be improved by skipping
+stale objects during read-side list traversal, where stale objects
+are those that will be removed and destroyed after one or more grace
+periods. One such example can be found in the timerfd subsystem. When a
+``CLOCK_REALTIME`` clock is reprogrammed (for example, due to setting
+of the system time), then all programmed ``timerfds`` that depend on
+this clock get triggered and processes waiting on them are awakened in
+advance of their scheduled expiry.  To facilitate this, all such timers
+are added to an RCU-managed ``cancel_list`` when they are set up in
``timerfd_setup_cancel()``::
static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
{
spin_lock(&ctx->cancel_lock);
- if ((ctx->clockid == CLOCK_REALTIME &&
+ if ((ctx->clockid == CLOCK_REALTIME ||
+ ctx->clockid == CLOCK_REALTIME_ALARM) &&
(flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
if (!ctx->might_cancel) {
ctx->might_cancel = true;
@@ -382,13 +407,16 @@ to an RCU-managed ``cancel_list`` when they are setup in
list_add_rcu(&ctx->clist, &cancel_list);
spin_unlock(&cancel_lock);
}
+ } else {
+ __timerfd_remove_cancel(ctx);
}
spin_unlock(&ctx->cancel_lock);
}
-When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the
-timerfd object is cleared, the object removed from the ``cancel_list`` and
-destroyed::
+When a timerfd is freed (fd is closed), then the ``might_cancel``
+flag of the timerfd object is cleared, the object removed from the
+``cancel_list`` and destroyed, as shown in this simplified and inlined
+version of timerfd_release()::
int timerfd_release(struct inode *inode, struct file *file)
{
@@ -403,7 +431,10 @@ destroyed::
}
spin_unlock(&ctx->cancel_lock);
- hrtimer_cancel(&ctx->t.tmr);
+ if (isalarm(ctx))
+ alarm_cancel(&ctx->t.alarm);
+ else
+ hrtimer_cancel(&ctx->t.tmr);
kfree_rcu(ctx, rcu);
return 0;
}
@@ -416,6 +447,7 @@ objects::
void timerfd_clock_was_set(void)
{
+ ktime_t moffs = ktime_mono_to_real(0);
struct timerfd_ctx *ctx;
unsigned long flags;
@@ -424,7 +456,7 @@ objects::
if (!ctx->might_cancel)
continue;
spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ctx->moffs != ktime_mono_to_real(0)) {
+ if (ctx->moffs != moffs) {
ctx->moffs = KTIME_MAX;
ctx->ticks++;
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
@@ -434,10 +466,10 @@ objects::
rcu_read_unlock();
}
-The key point here is, because RCU-traversal of the ``cancel_list`` happens
-while objects are being added and removed to the list, sometimes the traversal
-can step on an object that has been removed from the list. In this example, it
-is seen that it is better to skip such objects using a flag.
+The key point is that because RCU-protected traversal of the
+``cancel_list`` happens concurrently with object addition and removal,
+sometimes the traversal can access an object that has been removed from
+the list. In this example, a flag is used to skip such objects.
Summary
diff --git a/Documentation/RCU/lockdep.rst b/Documentation/RCU/lockdep.rst
index cc860a0c296b..69e73a39bd11 100644
--- a/Documentation/RCU/lockdep.rst
+++ b/Documentation/RCU/lockdep.rst
@@ -17,7 +17,9 @@ state::
rcu_read_lock_held() for normal RCU.
rcu_read_lock_bh_held() for RCU-bh.
rcu_read_lock_sched_held() for RCU-sched.
+ rcu_read_lock_any_held() for any of normal RCU, RCU-bh, and RCU-sched.
srcu_read_lock_held() for SRCU.
+ rcu_read_lock_trace_held() for RCU Tasks Trace.
These functions are conservative, and will therefore return 1 if they
aren't certain (for example, if CONFIG_DEBUG_LOCK_ALLOC is not set).
@@ -53,6 +55,8 @@ checking of rcu_dereference() primitives:
is invoked by both SRCU readers and updaters.
rcu_dereference_raw(p):
Don't check. (Use sparingly, if at all.)
+ rcu_dereference_raw_check(p):
+ Don't do lockdep at all. (Use sparingly, if at all.)
rcu_dereference_protected(p, c):
Use explicit check expression "c", and omit all barriers
and compiler constraints. This is useful when the data
@@ -61,13 +65,12 @@ checking of rcu_dereference() primitives:
rcu_access_pointer(p):
Return the value of the pointer and omit all barriers,
but retain the compiler constraints that prevent duplicating
- or coalescsing. This is useful when when testing the
+ or coalescing. This is useful when testing the
value of the pointer itself, for example, against NULL.
The rcu_dereference_check() check expression can be any boolean
-expression, but would normally include a lockdep expression. However,
-any boolean expression can be used. For a moderately ornate example,
-consider the following::
+expression, but would normally include a lockdep expression. For a
+moderately ornate example, consider the following::
file = rcu_dereference_check(fdt->fd[fd],
lockdep_is_held(&files->file_lock) ||
@@ -93,10 +96,10 @@ code, it could instead be written as follows::
atomic_read(&files->count) == 1);
This would verify cases #2 and #3 above, and furthermore lockdep would
-complain if this was used in an RCU read-side critical section unless one
-of these two cases held. Because rcu_dereference_protected() omits all
-barriers and compiler constraints, it generates better code than do the
-other flavors of rcu_dereference(). On the other hand, it is illegal
+complain even if this was used in an RCU read-side critical section unless
+one of these two cases held. Because rcu_dereference_protected() omits
+all barriers and compiler constraints, it generates better code than do
+the other flavors of rcu_dereference(). On the other hand, it is illegal
to use rcu_dereference_protected() if either the RCU-protected pointer
or the RCU-protected data that it points to can change concurrently.
diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst
index 0e03c6ef3147..bf6617b330a7 100644
--- a/Documentation/RCU/rcu.rst
+++ b/Documentation/RCU/rcu.rst
@@ -10,9 +10,8 @@ A "grace period" must elapse between the two parts, and this grace period
must be long enough that any readers accessing the item being deleted have
since dropped their references. For example, an RCU-protected deletion
from a linked list would first remove the item from the list, wait for
-a grace period to elapse, then free the element. See the
-:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` for more information on
-using RCU with linked lists.
+a grace period to elapse, then free the element. See listRCU.rst for more
+information on using RCU with linked lists.
Frequently Asked Questions
--------------------------
@@ -50,7 +49,7 @@ Frequently Asked Questions
- If I am running on a uniprocessor kernel, which can only do one
thing at a time, why should I wait for a grace period?
- See :ref:`Documentation/RCU/UP.rst <up_doc>` for more information.
+ See UP.rst for more information.
- How can I see where RCU is currently used in the Linux kernel?
@@ -64,13 +63,13 @@ Frequently Asked Questions
- What guidelines should I follow when writing code that uses RCU?
- See the checklist.txt file in this directory.
+ See checklist.rst.
- Why the name "RCU"?
"RCU" stands for "read-copy update".
- :ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` has more information on where
- this name came from, search for "read-copy update" to find it.
+	listRCU.rst has more information on where this name came from; search
+	for "read-copy update" to find it.
- I hear that RCU is patented? What is with that?
@@ -78,15 +77,17 @@ Frequently Asked Questions
search for the string "Patent" in Documentation/RCU/RTFP.txt to find them.
Of these, one was allowed to lapse by the assignee, and the
others have been contributed to the Linux kernel under GPL.
+ Many (but not all) have long since expired.
There are now also LGPL implementations of user-level RCU
available (https://liburcu.org/).
- I hear that RCU needs work in order to support realtime kernels?
- Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
+	Realtime-friendly RCU is enabled via the CONFIG_PREEMPTION
kernel configuration parameter.
- Where can I find more information on RCU?
See the Documentation/RCU/RTFP.txt file.
- Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/).
+ Or point your browser at (https://docs.google.com/document/d/1X0lThx8OK0ZgLMqVoXiR4ZrGURHrXK6NyLRbeXe3Xac/edit)
+ or (https://docs.google.com/document/d/1GCdQC8SDbb54W1shjEXqGZ0Rq8a6kIeYutdSIajfpLA/edit?usp=sharing).
diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst
index 0b418a5b243c..3b739f6243c8 100644
--- a/Documentation/RCU/rcu_dereference.rst
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -19,8 +19,9 @@ Follow these rules to keep your RCU code working properly:
can reload the value, and won't your code have fun with two
different values for a single pointer! Without rcu_dereference(),
DEC Alpha can load a pointer, dereference that pointer, and
- return data preceding initialization that preceded the store of
- the pointer.
+ return data preceding initialization that preceded the store
+ of the pointer. (As noted later, in recent kernels READ_ONCE()
+ also prevents DEC Alpha from playing these tricks.)
In addition, the volatile cast in rcu_dereference() prevents the
compiler from deducing the resulting pointer value. Please see
@@ -34,7 +35,7 @@ Follow these rules to keep your RCU code working properly:
takes on the role of the lockless_dereference() primitive that
was removed in v4.15.
-- You are only permitted to use rcu_dereference on pointer values.
+- You are only permitted to use rcu_dereference() on pointer values.
The compiler simply knows too much about integral values to
trust it to carry dependencies through integer operations.
There are a very few exceptions, namely that you can temporarily
@@ -128,10 +129,16 @@ Follow these rules to keep your RCU code working properly:
This sort of comparison occurs frequently when scanning
RCU-protected circular linked lists.
- Note that if checks for being within an RCU read-side
- critical section are not required and the pointer is never
- dereferenced, rcu_access_pointer() should be used in place
- of rcu_dereference().
+ Note that if the pointer comparison is done outside
+ of an RCU read-side critical section, and the pointer
+ is never dereferenced, rcu_access_pointer() should be
+ used in place of rcu_dereference(). In most cases,
+ it is best to avoid accidental dereferences by testing
+ the rcu_access_pointer() return value directly, without
+ assigning it to a variable.
+
+ Within an RCU read-side critical section, there is little
+ reason to use rcu_access_pointer().
- The comparison is against a pointer that references memory
that was initialized "a long time ago." The reason
@@ -234,6 +241,7 @@ precautions. To see this, consider the following code fragment::
struct foo *q;
int r1, r2;
+ rcu_read_lock();
p = rcu_dereference(gp2);
if (p == NULL)
return;
@@ -242,7 +250,10 @@ precautions. To see this, consider the following code fragment::
if (p == q) {
/* The compiler decides that q->c is same as p->c. */
r2 = p->c; /* Could get 44 on weakly order system. */
+ } else {
+ r2 = p->c - r1; /* Unconditional access to p->c. */
}
+ rcu_read_unlock();
do_something_with(r1, r2);
}
@@ -291,6 +302,7 @@ Then one approach is to use locking, for example, as follows::
struct foo *q;
int r1, r2;
+ rcu_read_lock();
p = rcu_dereference(gp2);
if (p == NULL)
return;
@@ -300,7 +312,12 @@ Then one approach is to use locking, for example, as follows::
if (p == q) {
/* The compiler decides that q->c is same as p->c. */
r2 = p->c; /* Locking guarantees r2 == 144. */
+ } else {
+ spin_lock(&q->lock);
+ r2 = q->c - r1;
+ spin_unlock(&q->lock);
}
+ rcu_read_unlock();
spin_unlock(&p->lock);
do_something_with(r1, r2);
}
@@ -358,7 +375,7 @@ the exact value of "p" even in the not-equals case. This allows the
compiler to make the return values independent of the load from "gp",
in turn destroying the ordering between this load and the loads of the
return values. This can result in "p->b" returning pre-initialization
-garbage values.
+garbage values on weakly ordered systems.
In short, rcu_dereference() is *not* optional when you are going to
dereference the resulting pointer.
@@ -424,7 +441,7 @@ member of the rcu_dereference() to use in various situations:
SPARSE CHECKING OF RCU-PROTECTED POINTERS
-----------------------------------------
-The sparse static-analysis tool checks for direct access to RCU-protected
+The sparse static-analysis tool checks for non-RCU access to RCU-protected
pointers, which can result in "interesting" bugs due to compiler
optimizations involving invented loads and perhaps also load tearing.
For example, suppose someone mistakenly does something like this::
diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst
index 3b4a24877496..6da7f66da2a8 100644
--- a/Documentation/RCU/rcubarrier.rst
+++ b/Documentation/RCU/rcubarrier.rst
@@ -5,37 +5,12 @@ RCU and Unloadable Modules
[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/]
-RCU (read-copy update) is a synchronization mechanism that can be thought
-of as a replacement for read-writer locking (among other things), but with
-very low-overhead readers that are immune to deadlock, priority inversion,
-and unbounded latency. RCU read-side critical sections are delimited
-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION
-kernels, generate no code whatsoever.
-
-This means that RCU writers are unaware of the presence of concurrent
-readers, so that RCU updates to shared data must be undertaken quite
-carefully, leaving an old version of the data structure in place until all
-pre-existing readers have finished. These old versions are needed because
-such readers might hold a reference to them. RCU updates can therefore be
-rather expensive, and RCU is thus best suited for read-mostly situations.
-
-How can an RCU writer possibly determine when all readers are finished,
-given that readers might well leave absolutely no trace of their
-presence? There is a synchronize_rcu() primitive that blocks until all
-pre-existing readers have completed. An updater wishing to delete an
-element p from a linked list might do the following, while holding an
-appropriate lock, of course::
-
- list_del_rcu(p);
- synchronize_rcu();
- kfree(p);
-
-But the above code cannot be used in IRQ context -- the call_rcu()
-primitive must be used instead. This primitive takes a pointer to an
-rcu_head struct placed within the RCU-protected data structure and
-another pointer to a function that may be invoked later to free that
-structure. Code to delete an element p from the linked list from IRQ
-context might then be as follows::
+RCU updaters sometimes use call_rcu() to initiate an asynchronous wait for
+a grace period to elapse. This primitive takes a pointer to an rcu_head
+struct placed within the RCU-protected data structure and another pointer
+to a function that may be invoked later to free that structure. Code to
+delete an element p from the linked list from IRQ context might then be
+as follows::
list_del_rcu(p);
call_rcu(&p->rcu, p_callback);
@@ -54,7 +29,7 @@ IRQ context. The function p_callback() might be defined as follows::
Unloading Modules That Use call_rcu()
-------------------------------------
-But what if p_callback is defined in an unloadable module?
+But what if the p_callback() function is defined in an unloadable module?
If we unload the module while some RCU callbacks are pending,
the CPUs executing these callbacks are going to be severely
@@ -67,20 +42,21 @@ grace period to elapse, it does not wait for the callbacks to complete.
One might be tempted to try several back-to-back synchronize_rcu()
calls, but this is still not guaranteed to work. If there is a very
-heavy RCU-callback load, then some of the callbacks might be deferred
-in order to allow other processing to proceed. Such deferral is required
-in realtime kernels in order to avoid excessive scheduling latencies.
+heavy RCU-callback load, then some of the callbacks might be deferred in
+order to allow other processing to proceed. For but one example, such
+deferral is required in realtime kernels in order to avoid excessive
+scheduling latencies.
rcu_barrier()
-------------
-We instead need the rcu_barrier() primitive. Rather than waiting for
-a grace period to elapse, rcu_barrier() waits for all outstanding RCU
-callbacks to complete. Please note that rcu_barrier() does **not** imply
-synchronize_rcu(), in particular, if there are no RCU callbacks queued
-anywhere, rcu_barrier() is within its rights to return immediately,
-without waiting for a grace period to elapse.
+This situation can be handled by the rcu_barrier() primitive. Rather
+than waiting for a grace period to elapse, rcu_barrier() waits for all
+outstanding RCU callbacks to complete. Please note that rcu_barrier()
+does **not** imply synchronize_rcu(), in particular, if there are no RCU
+callbacks queued anywhere, rcu_barrier() is within its rights to return
+immediately, without waiting for anything, let alone a grace period.
Pseudo-code using rcu_barrier() is as follows:
@@ -89,83 +65,86 @@ Pseudo-code using rcu_barrier() is as follows:
3. Allow the module to be unloaded.
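A minimal module-exit sketch following these steps, where stop_queueing_callbacks() stands in for whatever step-1 mechanism prevents new callbacks from being posted::

        static void __exit example_exit(void)
        {
                stop_queueing_callbacks();  /* 1. No new call_rcu() invocations. */
                rcu_barrier();              /* 2. Wait for pending callbacks. */
        }                                   /* 3. Module may now be unloaded. */
        module_exit(example_exit);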
There is also an srcu_barrier() function for SRCU, and you of course
-must match the flavor of rcu_barrier() with that of call_rcu(). If your
-module uses multiple flavors of call_rcu(), then it must also use multiple
-flavors of rcu_barrier() when unloading that module. For example, if
-it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
-srcu_struct_2, then the following three lines of code will be required
-when unloading::
-
- 1 rcu_barrier();
- 2 srcu_barrier(&srcu_struct_1);
- 3 srcu_barrier(&srcu_struct_2);
-
-The rcutorture module makes use of rcu_barrier() in its exit function
-as follows::
-
- 1 static void
- 2 rcu_torture_cleanup(void)
- 3 {
- 4 int i;
- 5
- 6 fullstop = 1;
- 7 if (shuffler_task != NULL) {
- 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
- 9 kthread_stop(shuffler_task);
- 10 }
- 11 shuffler_task = NULL;
+must match the flavor of srcu_barrier() with that of call_srcu().
+If your module uses multiple srcu_struct structures, then it must also
+use multiple invocations of srcu_barrier() when unloading that module.
+For example, if it uses call_rcu(), call_srcu() on srcu_struct_1, and
+call_srcu() on srcu_struct_2, then the following three lines of code
+will be required when unloading::
+
+ 1 rcu_barrier();
+ 2 srcu_barrier(&srcu_struct_1);
+ 3 srcu_barrier(&srcu_struct_2);
+
+If latency is of the essence, workqueues could be used to run these
+three functions concurrently.
+
+An ancient version of the rcutorture module makes use of rcu_barrier()
+in its exit function as follows::
+
+ 1 static void
+ 2 rcu_torture_cleanup(void)
+ 3 {
+ 4 int i;
+ 5
+ 6 fullstop = 1;
+ 7 if (shuffler_task != NULL) {
+ 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
+ 9 kthread_stop(shuffler_task);
+ 10 }
+ 11 shuffler_task = NULL;
12
- 13 if (writer_task != NULL) {
- 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
- 15 kthread_stop(writer_task);
- 16 }
- 17 writer_task = NULL;
+ 13 if (writer_task != NULL) {
+ 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+ 15 kthread_stop(writer_task);
+ 16 }
+ 17 writer_task = NULL;
18
- 19 if (reader_tasks != NULL) {
- 20 for (i = 0; i < nrealreaders; i++) {
- 21 if (reader_tasks[i] != NULL) {
- 22 VERBOSE_PRINTK_STRING(
- 23 "Stopping rcu_torture_reader task");
- 24 kthread_stop(reader_tasks[i]);
- 25 }
- 26 reader_tasks[i] = NULL;
- 27 }
- 28 kfree(reader_tasks);
- 29 reader_tasks = NULL;
- 30 }
- 31 rcu_torture_current = NULL;
+ 19 if (reader_tasks != NULL) {
+ 20 for (i = 0; i < nrealreaders; i++) {
+ 21 if (reader_tasks[i] != NULL) {
+ 22 VERBOSE_PRINTK_STRING(
+ 23 "Stopping rcu_torture_reader task");
+ 24 kthread_stop(reader_tasks[i]);
+ 25 }
+ 26 reader_tasks[i] = NULL;
+ 27 }
+ 28 kfree(reader_tasks);
+ 29 reader_tasks = NULL;
+ 30 }
+ 31 rcu_torture_current = NULL;
32
- 33 if (fakewriter_tasks != NULL) {
- 34 for (i = 0; i < nfakewriters; i++) {
- 35 if (fakewriter_tasks[i] != NULL) {
- 36 VERBOSE_PRINTK_STRING(
- 37 "Stopping rcu_torture_fakewriter task");
- 38 kthread_stop(fakewriter_tasks[i]);
- 39 }
- 40 fakewriter_tasks[i] = NULL;
- 41 }
- 42 kfree(fakewriter_tasks);
- 43 fakewriter_tasks = NULL;
- 44 }
+ 33 if (fakewriter_tasks != NULL) {
+ 34 for (i = 0; i < nfakewriters; i++) {
+ 35 if (fakewriter_tasks[i] != NULL) {
+ 36 VERBOSE_PRINTK_STRING(
+ 37 "Stopping rcu_torture_fakewriter task");
+ 38 kthread_stop(fakewriter_tasks[i]);
+ 39 }
+ 40 fakewriter_tasks[i] = NULL;
+ 41 }
+ 42 kfree(fakewriter_tasks);
+ 43 fakewriter_tasks = NULL;
+ 44 }
45
- 46 if (stats_task != NULL) {
- 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
- 48 kthread_stop(stats_task);
- 49 }
- 50 stats_task = NULL;
+ 46 if (stats_task != NULL) {
+ 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+ 48 kthread_stop(stats_task);
+ 49 }
+ 50 stats_task = NULL;
51
- 52 /* Wait for all RCU callbacks to fire. */
- 53 rcu_barrier();
+ 52 /* Wait for all RCU callbacks to fire. */
+ 53 rcu_barrier();
54
- 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+ 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
56
- 57 if (cur_ops->cleanup != NULL)
- 58 cur_ops->cleanup();
- 59 if (atomic_read(&n_rcu_torture_error))
- 60 rcu_torture_print_module_parms("End of test: FAILURE");
- 61 else
- 62 rcu_torture_print_module_parms("End of test: SUCCESS");
- 63 }
+ 57 if (cur_ops->cleanup != NULL)
+ 58 cur_ops->cleanup();
+ 59 if (atomic_read(&n_rcu_torture_error))
+ 60 rcu_torture_print_module_parms("End of test: FAILURE");
+ 61 else
+ 62 rcu_torture_print_module_parms("End of test: SUCCESS");
+ 63 }
Line 6 sets a global variable that prevents any RCU callbacks from
re-posting themselves. This will not be necessary in most cases, since
@@ -190,16 +169,17 @@ Quick Quiz #1:
:ref:`Answer to Quick Quiz #1 <answer_rcubarrier_quiz_1>`
Your module might have additional complications. For example, if your
-module invokes call_rcu() from timers, you will need to first cancel all
-the timers, and only then invoke rcu_barrier() to wait for any remaining
+module invokes call_rcu() from timers, you will need to first refrain
+from posting new timers, cancel (or wait for) all the already-posted
+timers, and only then invoke rcu_barrier() to wait for any remaining
RCU callbacks to complete.
-Of course, if you module uses call_rcu(), you will need to invoke
+Of course, if your module uses call_rcu(), you will need to invoke
rcu_barrier() before unloading. Similarly, if your module uses
call_srcu(), you will need to invoke srcu_barrier() before unloading,
and on the same srcu_struct structure. If your module uses call_rcu()
-**and** call_srcu(), then you will need to invoke rcu_barrier() **and**
-srcu_barrier().
+**and** call_srcu(), then (as noted above) you will need to invoke
+rcu_barrier() **and** srcu_barrier().
Implementing rcu_barrier()
@@ -211,27 +191,40 @@ queues. His implementation queues an RCU callback on each of the per-CPU
callback queues, and then waits until they have all started executing, at
which point, all earlier RCU callbacks are guaranteed to have completed.
-The original code for rcu_barrier() was as follows::
-
- 1 void rcu_barrier(void)
- 2 {
- 3 BUG_ON(in_interrupt());
- 4 /* Take cpucontrol mutex to protect against CPU hotplug */
- 5 mutex_lock(&rcu_barrier_mutex);
- 6 init_completion(&rcu_barrier_completion);
- 7 atomic_set(&rcu_barrier_cpu_count, 0);
- 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
- 9 wait_for_completion(&rcu_barrier_completion);
- 10 mutex_unlock(&rcu_barrier_mutex);
- 11 }
-
-Line 3 verifies that the caller is in process context, and lines 5 and 10
+The original code for rcu_barrier() was roughly as follows::
+
+ 1 void rcu_barrier(void)
+ 2 {
+ 3 BUG_ON(in_interrupt());
+ 4 /* Take cpucontrol mutex to protect against CPU hotplug */
+ 5 mutex_lock(&rcu_barrier_mutex);
+ 6 init_completion(&rcu_barrier_completion);
+ 7 atomic_set(&rcu_barrier_cpu_count, 1);
+ 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+ 9 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ 10 complete(&rcu_barrier_completion);
+ 11 wait_for_completion(&rcu_barrier_completion);
+ 12 mutex_unlock(&rcu_barrier_mutex);
+ 13 }
+
+Line 3 verifies that the caller is in process context, and lines 5 and 12
use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the
global completion and counters at a time, which are initialized on lines
6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is
shown below. Note that the final "1" in on_each_cpu()'s argument list
ensures that all the calls to rcu_barrier_func() will have completed
-before on_each_cpu() returns. Line 9 then waits for the completion.
+before on_each_cpu() returns. Line 9 removes the initial count from
+rcu_barrier_cpu_count, and if this count is now zero, line 10 finalizes
+the completion, which prevents line 11 from blocking. Either way,
+line 11 then waits (if needed) for the completion.
+
+.. _rcubarrier_quiz_2:
+
+Quick Quiz #2:
+	Why doesn't line 7 initialize rcu_barrier_cpu_count to zero,
+ thereby avoiding the need for lines 9 and 10?
+
+:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
This code was rewritten in 2008 and several times thereafter, but this
still gives the general idea.
@@ -239,21 +232,21 @@ still gives the general idea.
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
to post an RCU callback, as follows::
- 1 static void rcu_barrier_func(void *notused)
- 2 {
- 3 int cpu = smp_processor_id();
- 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- 5 struct rcu_head *head;
- 6
- 7 head = &rdp->barrier;
- 8 atomic_inc(&rcu_barrier_cpu_count);
- 9 call_rcu(head, rcu_barrier_callback);
- 10 }
+ 1 static void rcu_barrier_func(void *notused)
+ 2 {
+ 3 int cpu = smp_processor_id();
+ 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ 5 struct rcu_head *head;
+ 6
+ 7 head = &rdp->barrier;
+ 8 atomic_inc(&rcu_barrier_cpu_count);
+ 9 call_rcu(head, rcu_barrier_callback);
+ 10 }
Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure,
which contains the struct rcu_head needed for the later call to
call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line
-8 increments a global counter. This counter will later be decremented
+8 increments the global counter. This counter will later be decremented
by the callback. Line 9 then registers the rcu_barrier_callback() on
the current CPU's queue.
@@ -261,33 +254,34 @@ The rcu_barrier_callback() function simply atomically decrements the
rcu_barrier_cpu_count variable and finalizes the completion when it
reaches zero, as follows::
- 1 static void rcu_barrier_callback(struct rcu_head *notused)
- 2 {
- 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
- 4 complete(&rcu_barrier_completion);
- 5 }
+ 1 static void rcu_barrier_callback(struct rcu_head *notused)
+ 2 {
+ 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+ 4 complete(&rcu_barrier_completion);
+ 5 }
-.. _rcubarrier_quiz_2:
+.. _rcubarrier_quiz_3:
-Quick Quiz #2:
+Quick Quiz #3:
What happens if CPU 0's rcu_barrier_func() executes
immediately (thus incrementing rcu_barrier_cpu_count to the
	value two), but the other CPUs' rcu_barrier_func() invocations
are delayed for a full grace period? Couldn't this result in
rcu_barrier() returning prematurely?
-:ref:`Answer to Quick Quiz #2 <answer_rcubarrier_quiz_2>`
+:ref:`Answer to Quick Quiz #3 <answer_rcubarrier_quiz_3>`
The current rcu_barrier() implementation is more complex, due to the need
to avoid disturbing idle CPUs (especially on battery-powered systems)
and the need to minimally disturb non-idle CPUs in real-time systems.
-However, the code above illustrates the concepts.
+In addition, a great many optimizations have been applied. However,
+the code above illustrates the concepts.
rcu_barrier() Summary
---------------------
-The rcu_barrier() primitive has seen relatively little use, since most
+The rcu_barrier() primitive is used relatively infrequently, since most
code using RCU is in the core kernel rather than in modules. However, if
you are using RCU from an unloadable module, you need to use rcu_barrier()
so that your module may be safely unloaded.
@@ -302,7 +296,8 @@ Quick Quiz #1:
Is there any other situation where rcu_barrier() might
be required?
-Answer: Interestingly enough, rcu_barrier() was not originally
+Answer:
+ Interestingly enough, rcu_barrier() was not originally
implemented for module unloading. Nikita Danilov was using
RCU in a filesystem, which resulted in a similar situation at
filesystem-unmount time. Dipankar Sarma coded up rcu_barrier()
@@ -318,13 +313,48 @@ Answer: Interestingly enough, rcu_barrier() was not originally
.. _answer_rcubarrier_quiz_2:
Quick Quiz #2:
+	Why doesn't line 7 initialize rcu_barrier_cpu_count to zero,
+ thereby avoiding the need for lines 9 and 10?
+
+Answer:
+ Suppose that the on_each_cpu() function shown on line 8 was
+ delayed, so that CPU 0's rcu_barrier_func() executed and
+ the corresponding grace period elapsed, all before CPU 1's
+ rcu_barrier_func() started executing. This would result in
+ rcu_barrier_cpu_count being decremented to zero, so that line
+ 11's wait_for_completion() would return immediately, failing to
+ wait for CPU 1's callbacks to be invoked.
+
+ Note that this was not a problem when the rcu_barrier() code
+ was first added back in 2005. This is because on_each_cpu()
+ disables preemption, which acted as an RCU read-side critical
+ section, thus preventing CPU 0's grace period from completing
+ until on_each_cpu() had dealt with all of the CPUs. However,
+ with the advent of preemptible RCU, rcu_barrier() no longer
+ waited on nonpreemptible regions of code in preemptible kernels,
+ that being the job of the new rcu_barrier_sched() function.
+
+ However, with the RCU flavor consolidation around v4.20, this
+ possibility was once again ruled out, because the consolidated
+ RCU once again waits on nonpreemptible regions of code.
+
+ Nevertheless, that extra count might still be a good idea.
+	Relying on these sorts of accidents of implementation can result
+ in later surprise bugs when the implementation changes.
+
+:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
+
+.. _answer_rcubarrier_quiz_3:
+
+Quick Quiz #3:
What happens if CPU 0's rcu_barrier_func() executes
immediately (thus incrementing rcu_barrier_cpu_count to the
	value two), but the other CPUs' rcu_barrier_func() invocations
are delayed for a full grace period? Couldn't this result in
rcu_barrier() returning prematurely?
-Answer: This cannot happen. The reason is that on_each_cpu() has its last
+Answer:
+ This cannot happen. The reason is that on_each_cpu() has its last
argument, the wait flag, set to "1". This flag is passed through
to smp_call_function() and further to smp_call_function_on_cpu(),
	causing the latter to spin until the cross-CPU invocation of
@@ -336,18 +366,15 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last
Therefore, on_each_cpu() disables preemption across its call
to smp_call_function() and also across the local call to
- rcu_barrier_func(). This prevents the local CPU from context
- switching, again preventing grace periods from completing. This
+ rcu_barrier_func(). Because recent RCU implementations treat
+ preemption-disabled regions of code as RCU read-side critical
+ sections, this prevents grace periods from completing. This
means that all CPUs have executed rcu_barrier_func() before
the first rcu_barrier_callback() can possibly execute, in turn
preventing rcu_barrier_cpu_count from prematurely reaching zero.
- Currently, -rt implementations of RCU keep but a single global
- queue for RCU callbacks, and thus do not suffer from this
- problem. However, when the -rt RCU eventually does have per-CPU
- callback queues, things will have to change. One simple change
- is to add an rcu_read_lock() before line 8 of rcu_barrier()
- and an rcu_read_unlock() after line 8 of this same function. If
- you can think of a better change, please let me know!
+ But if on_each_cpu() ever decides to forgo disabling preemption,
+ as might well happen due to real-time latency considerations,
+ initializing rcu_barrier_cpu_count to one will save the day.
-:ref:`Back to Quick Quiz #2 <rcubarrier_quiz_2>`
+:ref:`Back to Quick Quiz #3 <rcubarrier_quiz_3>`
diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst
index a9fc774bc400..9a734bf54b76 100644
--- a/Documentation/RCU/rculist_nulls.rst
+++ b/Documentation/RCU/rculist_nulls.rst
@@ -8,25 +8,25 @@ This section describes how to use hlist_nulls to
protect read-mostly linked lists and
objects using SLAB_TYPESAFE_BY_RCU allocations.
-Please read the basics in Documentation/RCU/listRCU.rst
+Please read the basics in listRCU.rst.
Using 'nulls'
=============
Using special markers (called 'nulls') is a convenient way
-to solve following problem :
+to solve the following problem.
-A typical RCU linked list managing objects which are
-allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
-use following algos :
+Without 'nulls', a typical RCU linked list managing objects which are
+allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use the following
+algorithms:
-1) Lookup algo
---------------
+1) Lookup algorithm
+-------------------
::
- rcu_read_lock()
begin:
+ rcu_read_lock()
obj = lockless_lookup(key);
if (obj) {
if (!try_get_ref(obj)) // might fail for free objects
@@ -38,6 +38,7 @@ use following algos :
*/
if (obj->key != key) { // not the object we expected
put_ref(obj);
+ rcu_read_unlock();
goto begin;
}
}
@@ -52,9 +53,9 @@ but a version with an additional memory barrier (smp_rmb())
{
struct hlist_node *node, *next;
for (pos = rcu_dereference((head)->first);
- pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
- ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
- pos = rcu_dereference(next))
+ pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
+ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+ pos = rcu_dereference(next))
if (obj->key == key)
return obj;
return NULL;
@@ -64,9 +65,9 @@ And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb()::
struct hlist_node *node;
for (pos = rcu_dereference((head)->first);
- pos && ({ prefetch(pos->next); 1; }) &&
- ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
- pos = rcu_dereference(pos->next))
+ pos && ({ prefetch(pos->next); 1; }) &&
+ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+ pos = rcu_dereference(pos->next))
if (obj->key == key)
return obj;
return NULL;
@@ -82,36 +83,32 @@ Quoting Corey Minyard::
solved by pre-fetching the "next" field (with proper barriers) before
checking the key."
-2) Insert algo
---------------
+2) Insertion algorithm
+----------------------
We need to make sure a reader cannot read the new 'obj->obj_next' value
-and previous value of 'obj->key'. Or else, an item could be deleted
+and previous value of 'obj->key'. Otherwise, an item could be deleted
from a chain, and inserted into another chain. If the new chain was empty
-before the move, 'next' pointer is NULL, and lockless reader can
-not detect it missed following items in original chain.
+before the move, the 'next' pointer is NULL, and the lockless reader
+cannot detect that it missed the following items in the original chain.
::
/*
- * Please note that new inserts are done at the head of list,
- * not in the middle or end.
- */
+ * Please note that new inserts are done at the head of list,
+ * not in the middle or end.
+ */
obj = kmem_cache_alloc(...);
lock_chain(); // typically a spin_lock()
obj->key = key;
- /*
- * we need to make sure obj->key is updated before obj->next
- * or obj->refcnt
- */
- smp_wmb();
- atomic_set(&obj->refcnt, 1);
+ atomic_set_release(&obj->refcnt, 1); // key before refcnt
hlist_add_head_rcu(&obj->obj_node, list);
unlock_chain(); // typically a spin_unlock()
-3) Remove algo
---------------
+3) Removal algorithm
+--------------------
+
Nothing special here, we can use a standard RCU hlist deletion.
But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
very very fast (before the end of RCU grace period)
@@ -133,7 +130,7 @@ Avoiding extra smp_rmb()
========================
With hlist_nulls we can avoid an extra smp_rmb() in lockless_lookup()
-and extra smp_wmb() in insert function.
+and an extra _release() in the insert function.
For example, if we choose to store the slot number as the 'nulls'
end-of-list marker for each slot of the hash table, we can detect
@@ -142,59 +139,61 @@ to another chain) checking the final 'nulls' value if
the lookup met the end of the chain. If the final 'nulls' value
is not the slot number, then we must restart the lookup at
the beginning. If the object was moved to the same chain,
-then the reader doesn't care : It might eventually
+then the reader doesn't care: it might occasionally
scan the list again without harm.
-1) lookup algo
---------------
+1) lookup algorithm
+-------------------
::
head = &table[slot];
- rcu_read_lock();
begin:
+ rcu_read_lock();
hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
if (obj->key == key) {
- if (!try_get_ref(obj)) // might fail for free objects
+ if (!try_get_ref(obj)) { // might fail for free objects
+ rcu_read_unlock();
goto begin;
+ }
if (obj->key != key) { // not the object we expected
put_ref(obj);
+ rcu_read_unlock();
goto begin;
}
- goto out;
+ goto out;
+ }
+ }
+
+ // If the nulls value we got at the end of this lookup is
+ // not the expected one, we must restart lookup.
+ // We probably met an item that was moved to another chain.
+  if (get_nulls_value(node) != slot) {
+    // No reference was taken on this code path, so there
+    // is nothing to release before restarting the lookup.
+    rcu_read_unlock();
+    goto begin;
+  }
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
obj = NULL;
out:
rcu_read_unlock();
-2) Insert function
-------------------
+2) Insert algorithm
+-------------------
::
/*
- * Please note that new inserts are done at the head of list,
- * not in the middle or end.
- */
+ * Please note that new inserts are done at the head of list,
+ * not in the middle or end.
+ */
obj = kmem_cache_alloc(cachep);
lock_chain(); // typically a spin_lock()
obj->key = key;
+ atomic_set_release(&obj->refcnt, 1); // key before refcnt
/*
- * changes to obj->key must be visible before refcnt one
- */
- smp_wmb();
- atomic_set(&obj->refcnt, 1);
- /*
- * insert obj in RCU way (readers might be traversing chain)
- */
+ * insert obj in RCU way (readers might be traversing chain)
+ */
hlist_nulls_add_head_rcu(&obj->obj_node, list);
unlock_chain(); // typically a spin_unlock()
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 5036df24ae61..ca7b7cd806a1 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -25,10 +25,10 @@ warnings:
- A CPU looping with bottom halves disabled.
-- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel
- without invoking schedule(). If the looping in the kernel is
- really expected and desirable behavior, you might need to add
- some calls to cond_resched().
+- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the
+ kernel without potentially invoking schedule(). If the looping
+ in the kernel is really expected and desirable behavior, you
+ might need to add some calls to cond_resched().
- Booting Linux using a console connection that is too slow to
keep up with the boot-time console-message rate. For example,
@@ -96,18 +96,29 @@ warnings:
the ``rcu_.*timer wakeup didn't happen for`` console-log message,
which will include additional debugging information.
+- A low-level kernel issue that either fails to invoke one of the
+ variants of rcu_eqs_enter(true), rcu_eqs_exit(true), ct_idle_enter(),
+ ct_idle_exit(), ct_irq_enter(), or ct_irq_exit() on the one
+ hand, or that invokes one of them too many times on the other.
+ Historically, the most frequent issue has been an omission
+ of either irq_enter() or irq_exit(), which in turn invoke
+ ct_irq_enter() or ct_irq_exit(), respectively. Building your
+ kernel with CONFIG_RCU_EQS_DEBUG=y can help track down these types
+ of issues, which sometimes arise in architecture-specific code.
+
- A bug in the RCU implementation.
-- A hardware failure. This is quite unlikely, but has occurred
- at least once in real life. A CPU failed in a running system,
- becoming unresponsive, but not causing an immediate crash.
- This resulted in a series of RCU CPU stall warnings, eventually
- leading the realization that the CPU had failed.
+- A hardware failure. This is quite unlikely for any given CPU, but
+  is not at all uncommon across a large datacenter. In one memorable
+  case some decades back, a CPU failed in a running system, becoming
+  unresponsive, but not causing an immediate crash. This resulted
+  in a series of RCU CPU stall warnings, eventually leading to the
+  realization that the CPU had failed.
-The RCU, RCU-sched, and RCU-tasks implementations have CPU stall warning.
-Note that SRCU does *not* have CPU stall warnings. Please note that
-RCU only detects CPU stalls when there is a grace period in progress.
-No grace period, no CPU stall warnings.
+The RCU, RCU-sched, RCU-tasks, and RCU-tasks-trace implementations have
+CPU stall warnings. Note that SRCU does *not* have CPU stall warnings.
+Please note that RCU only detects CPU stalls when there is a grace period
+in progress. No grace period, no CPU stall warnings.
To diagnose the cause of the stall, inspect the stack traces.
The offending function will usually be near the top of the stack.
@@ -152,6 +163,26 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
Stall-warning messages may be enabled and disabled completely via
/sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
+CONFIG_RCU_EXP_CPU_STALL_TIMEOUT
+--------------------------------
+
+ Same as the CONFIG_RCU_CPU_STALL_TIMEOUT parameter but only for
+ the expedited grace period. This parameter defines the period
+ of time that RCU will wait from the beginning of an expedited
+ grace period until it issues an RCU CPU stall warning. This time
+ period is normally 20 milliseconds on Android devices. A zero
+ value causes the CONFIG_RCU_CPU_STALL_TIMEOUT value to be used,
+ after conversion to milliseconds.
+
+ This configuration parameter may be changed at runtime via the
+	/sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout; however,
+ this parameter is checked only at the beginning of a cycle. If you
+ are in a current stall cycle, setting it to a new value will change
+ the timeout for the -next- stall.
+
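+	For example, the following command (run as root) would set the
+	expedited stall timeout to 30 milliseconds, the value 30 being
+	chosen purely for illustration::
+
+	  echo 30 > /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout
+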
+ Stall-warning messages may be enabled and disabled completely via
+ /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
+
RCU_STALL_DELAY_DELTA
---------------------
@@ -175,16 +206,21 @@ RCU_STALL_RAT_DELAY
rcupdate.rcu_task_stall_timeout
-------------------------------
- This boot/sysfs parameter controls the RCU-tasks stall warning
- interval. A value of zero or less suppresses RCU-tasks stall
- warnings. A positive value sets the stall-warning interval
- in seconds. An RCU-tasks stall warning starts with the line:
+ This boot/sysfs parameter controls the RCU-tasks and
+ RCU-tasks-trace stall warning intervals. A value of zero or less
+ suppresses RCU-tasks stall warnings. A positive value sets the
+ stall-warning interval in seconds. An RCU-tasks stall warning
+ starts with the line:
INFO: rcu_tasks detected stalls on tasks:
And continues with the output of sched_show_task() for each
task stalling the current RCU-tasks grace period.
+ An RCU-tasks-trace stall warning starts (and continues) similarly:
+
+ INFO: rcu_tasks_trace detected stalls on tasks
+
Interpreting RCU's CPU Stall-Detector "Splats"
==============================================
@@ -218,7 +254,8 @@ dynticks counter, which will have an even-numbered value if the CPU
is in dyntick-idle mode and an odd-numbered value otherwise. The hex
number between the two "/"s is the value of the nesting, which will be
a small non-negative number if in the idle loop (as shown above) and a
-very large positive number otherwise.
+very large positive number otherwise. The number following the final
+"/" is the NMI nesting, which will be a small non-negative number.
The "softirq=" portion of the message tracks the number of RCU softirq
handlers that the stalled CPU has executed. The number before the "/"
@@ -244,17 +281,6 @@ period (in this case 2603), the grace-period sequence number (7075), and
an estimate of the total number of RCU callbacks queued across all CPUs
(625 in this case).
-In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
-for each CPU::
-
- 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
-
-The "last_accelerate:" prints the low-order 16 bits (in hex) of the
-jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
-from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
-processing is enabled.
-
If the grace period ends just as the stall warning starts printing,
there will be a spurious stall-warning message, which will include
the following::
@@ -364,3 +390,95 @@ for example, "P3421".
It is entirely possible to see stall warnings from normal and from
expedited grace periods at about the same time during the same run.
+
+RCU_CPU_STALL_CPUTIME
+=====================
+
+In kernels built with CONFIG_RCU_CPU_STALL_CPUTIME=y or booted with
+rcupdate.rcu_cpu_stall_cputime=1, the following additional information
+is supplied with each RCU CPU stall warning::
+
+ rcu: hardirqs softirqs csw/system
+ rcu: number: 624 45 0
+ rcu: cputime: 69 1 2425 ==> 2500(ms)
+
+These statistics are collected during the sampling period. The values
+in row "number:" are the number of hard interrupts, number of soft
+interrupts, and number of context switches on the stalled CPU. The
+first three values in row "cputime:" indicate the CPU time in
+milliseconds consumed by hard interrupts, soft interrupts, and tasks
+on the stalled CPU. The last number is the measurement interval, again
+in milliseconds. Because user-mode tasks normally do not cause RCU CPU
+stalls, these tasks are typically kernel tasks, which is why only the
+system CPU time is considered.
+
+The sampling period is shown as follows::
+
+ |<------------first timeout---------->|<-----second timeout----->|
+ |<--half timeout-->|<--half timeout-->| |
+ | |<--first period-->| |
+ | |<-----------second sampling period---------->|
+ | | | |
+ snapshot time point 1st-stall 2nd-stall
+
+The following describes four typical scenarios:
+
+1. A CPU looping with interrupts disabled.
+
+ ::
+
+ rcu: hardirqs softirqs csw/system
+ rcu: number: 0 0 0
+ rcu: cputime: 0 0 0 ==> 2500(ms)
+
+ Because interrupts have been disabled throughout the measurement
+ interval, there are no interrupts and no context switches.
+ Furthermore, because CPU time consumption was measured using interrupt
+ handlers, the system CPU consumption is misleadingly measured as zero.
+ This scenario will normally also have "(0 ticks this GP)" printed on
+ this CPU's summary line.
+
+2. A CPU looping with bottom halves disabled.
+
+   This is similar to the previous example, but with a non-zero number
+   of hard interrupts, non-zero CPU time consumed by them, and non-zero
+   CPU time consumed by in-kernel execution::
+
+ rcu: hardirqs softirqs csw/system
+ rcu: number: 624 0 0
+ rcu: cputime: 49 0 2446 ==> 2500(ms)
+
+ The fact that there are zero softirqs gives a hint that these were
+ disabled, perhaps via local_bh_disable(). It is of course possible
+ that there were no softirqs, perhaps because all events that would
+ result in softirq execution are confined to other CPUs. In this case,
+ the diagnosis should continue as shown in the next example.
+
+3. A CPU looping with preemption disabled.
+
+ Here, only the number of context switches is zero::
+
+ rcu: hardirqs softirqs csw/system
+ rcu: number: 624 45 0
+ rcu: cputime: 69 1 2425 ==> 2500(ms)
+
+ This situation hints that the stalled CPU was looping with preemption
+ disabled.
+
+4. No looping, but massive hard and soft interrupts.
+
+ ::
+
+ rcu: hardirqs softirqs csw/system
+ rcu: number: xx xx 0
+ rcu: cputime: xx xx 0 ==> 2500(ms)
+
+   Here, the number and CPU time of hard interrupts are both non-zero,
+ but the number of context switches and the in-kernel CPU time consumed
+ are zero. The number and cputime of soft interrupts will usually be
+ non-zero, but could be zero, for example, if the CPU was spinning
+ within a single hard interrupt handler.
+
+ If this type of RCU CPU stall warning can be reproduced, you can
+ narrow it down by looking at /proc/interrupts or by writing code to
+ trace each interrupt, for example, by referring to show_interrupts().
diff --git a/Documentation/RCU/torture.rst b/Documentation/RCU/torture.rst
index a90147713062..b3b6dfa85248 100644
--- a/Documentation/RCU/torture.rst
+++ b/Documentation/RCU/torture.rst
@@ -206,23 +206,34 @@ values for memory may require disabling the callback-flooding tests
using the --bootargs parameter discussed below.
Sometimes additional debugging is useful, and in such cases the --kconfig
-parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``.
+parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_RCU_EQS_DEBUG=y'``.
+In addition, there are the --gdb, --kasan, and --kcsan parameters.
+Note that --gdb limits you to one scenario per kvm.sh run and requires
+that you have another window open from which to run ``gdb`` as instructed
+by the script.
Kernel boot arguments can also be supplied, for example, to control
rcutorture's module parameters. For example, to test a change to RCU's
CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
This will of course result in the scripting reporting a failure, namely
-the resuling RCU CPU stall warning. As noted above, reducing memory may
+the resulting RCU CPU stall warning. As noted above, reducing memory may
require disabling rcutorture's callback-flooding tests::
kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
--bootargs 'rcutorture.fwd_progress=0'
Sometimes all that is needed is a full set of kernel builds. This is
-what the --buildonly argument does.
+what the --buildonly parameter does.
-Finally, the --trust-make argument allows each kernel build to reuse what
-it can from the previous kernel build.
+The --duration parameter can override the default run time of 30 minutes.
+For example, ``--duration 2d`` would run for two days, ``--duration 3h``
+would run for three hours, ``--duration 5m`` would run for five minutes,
+and ``--duration 45s`` would run for 45 seconds. This last can be useful
+for tracking down rare boot-time failures.
+
+Finally, the --trust-make parameter allows each kernel build to reuse what
+it can from the previous kernel build. Please note that without the
+--trust-make parameter, your tags files may be demolished.
There are additional more arcane arguments that are documented in the
source code of the kvm.sh script.
@@ -291,3 +302,73 @@ the following summary at the end of the run on a 12-CPU system::
TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
CPU count limited from 16 to 12
TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
+
+
+Repeated Runs
+=============
+
+Suppose that you are chasing down a rare boot-time failure. Although you
+could use kvm.sh, doing so will rebuild the kernel on each run. If you
+need (say) 1,000 runs to have confidence that you have fixed the bug,
+these pointless rebuilds can become extremely annoying.
+
+This is why kvm-again.sh exists.
+
+Suppose that a previous kvm.sh run left its output in this directory::
+
+ tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28
+
+Then this run can be re-run without rebuilding as follows::
+
+ kvm-again.sh tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28
+
+A few of the original run's kvm.sh parameters may be overridden, perhaps
+most notably --duration and --bootargs. For example::
+
+ kvm-again.sh tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28 \
+ --duration 45s
+
+would re-run the previous test, but for only 45 seconds, thus facilitating
+tracking down the aforementioned rare boot-time failure.
+
+
+Distributed Runs
+================
+
+Although kvm.sh is quite useful, its testing is confined to a single
+system. It is not all that hard to use your favorite framework to cause
+(say) 5 instances of kvm.sh to run on your 5 systems, but this will very
+likely unnecessarily rebuild kernels. In addition, manually distributing
+the desired rcutorture scenarios across the available systems can be
+painstaking and error-prone.
+
+And this is why the kvm-remote.sh script exists.
+
+If the following command works::
+
+ ssh system0 date
+
+and if it also works for system1, system2, system3, system4, and system5,
+and all of these systems have 64 CPUs, you can type::
+
+ kvm-remote.sh "system0 system1 system2 system3 system4 system5" \
+ --cpus 64 --duration 8h --configs "5*CFLIST"
+
+This will build each default scenario's kernel on the local system, then
+spread each of five instances of each scenario over the systems listed,
+running each scenario for eight hours. At the end of the runs, the
+results will be gathered, recorded, and printed. Most of the parameters
+that kvm.sh will accept can be passed to kvm-remote.sh, but the list of
+systems must come first.
+
+The kvm.sh ``--dryrun scenarios`` argument is useful for working out
+how many scenarios may be run in one batch across a group of systems.
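+
+For example, the following invocation (CPU count and scenario list
+chosen purely for illustration) shows how five instances of the default
+scenarios would be batched across 64-CPU systems, without actually
+running the tests::
+
+  kvm.sh --cpus 64 --configs "5*CFLIST" --dryrun scenarios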
+
+You can also re-run a previous remote run in a manner similar to kvm.sh::
+
+ kvm-remote.sh "system0 system1 system2 system3 system4 system5" \
+ tools/testing/selftests/rcutorture/res/2022.11.03-11.26.28-remote \
+ --duration 24h
+
+In this case, most of the kvm-again.sh parameters may be supplied following
+the pathname of the old run-results directory.
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 17e95ab2a201..8eddef28d3a1 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -6,26 +6,33 @@ What is RCU? -- "Read, Copy, Update"
Please note that the "What is RCU?" LWN series is an excellent place
to start learning about RCU:
-| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
-| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
-| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
-| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
-| 2010 Big API Table http://lwn.net/Articles/419086/
-| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
-| 2014 Big API Table http://lwn.net/Articles/609973/
+| 1. What is RCU, Fundamentally? https://lwn.net/Articles/262464/
+| 2. What is RCU? Part 2: Usage https://lwn.net/Articles/263130/
+| 3. RCU part 3: the RCU API https://lwn.net/Articles/264090/
+| 4. The RCU API, 2010 Edition https://lwn.net/Articles/418853/
+| 2010 Big API Table https://lwn.net/Articles/419086/
+| 5. The RCU API, 2014 Edition https://lwn.net/Articles/609904/
+| 2014 Big API Table https://lwn.net/Articles/609973/
+| 6. The RCU API, 2019 Edition https://lwn.net/Articles/777036/
+| 2019 Big API Table https://lwn.net/Articles/777165/
+
+For those preferring video:
+
+| 1. Unraveling RCU Mysteries: Fundamentals https://www.linuxfoundation.org/webinars/unraveling-rcu-usage-mysteries
+| 2. Unraveling RCU Mysteries: Additional Use Cases https://www.linuxfoundation.org/webinars/unraveling-rcu-usage-mysteries-additional-use-cases
What is RCU?
RCU is a synchronization mechanism that was added to the Linux kernel
during the 2.5 development effort that is optimized for read-mostly
-situations. Although RCU is actually quite simple once you understand it,
-getting there can sometimes be a challenge. Part of the problem is that
-most of the past descriptions of RCU have been written with the mistaken
-assumption that there is "one true way" to describe RCU. Instead,
-the experience has been that different people must take different paths
-to arrive at an understanding of RCU. This document provides several
-different paths, as follows:
+situations. Although RCU is actually quite simple, making effective use
+of it requires you to think differently about your code. Part of the
+problem is the mistaken assumption that there is "one true way" to
+describe and to use RCU. Instead, the experience has been that different
+people must take different paths to arrive at an understanding of RCU,
+depending on their experiences and use cases. This document provides
+several different paths, as follows:
:ref:`1. RCU OVERVIEW <1_whatisRCU>`
@@ -39,9 +46,11 @@ different paths, as follows:
:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>`
-:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>`
+:ref:`7. ANALOGY WITH REFERENCE COUNTING <7_whatisRCU>`
+
+:ref:`8. FULL LIST OF RCU APIs <8_whatisRCU>`
-:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>`
+:ref:`9. ANSWERS TO QUICK QUIZZES <9_whatisRCU>`
People who prefer starting with a conceptual overview should focus on
Section 1, though most readers will profit by reading this section at
@@ -153,34 +162,36 @@ rcu_read_lock()
^^^^^^^^^^^^^^^
void rcu_read_lock(void);
- Used by a reader to inform the reclaimer that the reader is
- entering an RCU read-side critical section. It is illegal
- to block while in an RCU read-side critical section, though
- kernels built with CONFIG_PREEMPT_RCU can preempt RCU
- read-side critical sections. Any RCU-protected data structure
- accessed during an RCU read-side critical section is guaranteed to
- remain unreclaimed for the full duration of that critical section.
- Reference counts may be used in conjunction with RCU to maintain
- longer-term references to data structures.
+ This temporal primitive is used by a reader to inform the
+ reclaimer that the reader is entering an RCU read-side critical
+ section. It is illegal to block while in an RCU read-side
+ critical section, though kernels built with CONFIG_PREEMPT_RCU
+ can preempt RCU read-side critical sections. Any RCU-protected
+ data structure accessed during an RCU read-side critical section
+ is guaranteed to remain unreclaimed for the full duration of that
+ critical section. Reference counts may be used in conjunction
+ with RCU to maintain longer-term references to data structures.
rcu_read_unlock()
^^^^^^^^^^^^^^^^^
void rcu_read_unlock(void);
- Used by a reader to inform the reclaimer that the reader is
- exiting an RCU read-side critical section. Note that RCU
- read-side critical sections may be nested and/or overlapping.
+	This temporal primitive is used by a reader to inform the
+ reclaimer that the reader is exiting an RCU read-side critical
+ section. Note that RCU read-side critical sections may be nested
+ and/or overlapping.
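
	For example, read-side critical sections may nest as follows,
	with each rcu_read_unlock() matching its rcu_read_lock()::

		rcu_read_lock();   /* Outer critical section. */
		rcu_read_lock();   /* Nested critical section... */
		rcu_read_unlock(); /* ...ends here... */
		rcu_read_unlock(); /* ...and the outer one here. */
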
synchronize_rcu()
^^^^^^^^^^^^^^^^^
void synchronize_rcu(void);
- Marks the end of updater code and the beginning of reclaimer
- code. It does this by blocking until all pre-existing RCU
- read-side critical sections on all CPUs have completed.
- Note that synchronize_rcu() will **not** necessarily wait for
- any subsequent RCU read-side critical sections to complete.
- For example, consider the following sequence of events::
+ This temporal primitive marks the end of updater code and the
+ beginning of reclaimer code. It does this by blocking until
+ all pre-existing RCU read-side critical sections on all CPUs
+ have completed. Note that synchronize_rcu() will **not**
+ necessarily wait for any subsequent RCU read-side critical
+ sections to complete. For example, consider the following
+ sequence of events::
CPU 0 CPU 1 CPU 2
----------------- ------------------------- ---------------
@@ -207,13 +218,13 @@ synchronize_rcu()
to be useful in all but the most read-intensive situations,
synchronize_rcu()'s overhead must also be quite small.
- The call_rcu() API is a callback form of synchronize_rcu(),
- and is described in more detail in a later section. Instead of
- blocking, it registers a function and argument which are invoked
- after all ongoing RCU read-side critical sections have completed.
- This callback variant is particularly useful in situations where
- it is illegal to block or where update-side performance is
- critically important.
+ The call_rcu() API is an asynchronous callback form of
+ synchronize_rcu(), and is described in more detail in a later
+ section. Instead of blocking, it registers a function and
+ argument which are invoked after all ongoing RCU read-side
+ critical sections have completed. This callback variant is
+ particularly useful in situations where it is illegal to block
+ or where update-side performance is critically important.
However, the call_rcu() API should not be used lightly, as use
of the synchronize_rcu() API generally results in simpler code.
@@ -222,7 +233,7 @@ synchronize_rcu()
be delayed. This property results in system resilience in face
of denial-of-service attacks. Code using call_rcu() should limit
update rate in order to gain this same sort of resilience. See
- checklist.txt for some approaches to limiting the update rate.
+ checklist.rst for some approaches to limiting the update rate.
rcu_assign_pointer()
^^^^^^^^^^^^^^^^^^^^
@@ -232,11 +243,13 @@ rcu_assign_pointer()
would be cool to be able to declare a function in this manner.
(Compiler experts will no doubt disagree.)
- The updater uses this function to assign a new value to an
+ The updater uses this spatial macro to assign a new value to an
RCU-protected pointer, in order to safely communicate the change
- in value from the updater to the reader. This macro does not
- evaluate to an rvalue, but it does execute any memory-barrier
- instructions required for a given CPU architecture.
+ in value from the updater to the reader. This is a spatial (as
+ opposed to temporal) macro. It does not evaluate to an rvalue,
+ but it does execute any memory-barrier instructions required
+	for a given CPU architecture. Its ordering properties are those
+ of a store-release operation.
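
	In other words, leaving aside rcu_assign_pointer()'s
	type-checking and self-documentation benefits, and leaving aside
	special cases such as assignment of NULL, the following (in
	which gp and new_gp are hypothetical)::

		rcu_assign_pointer(gp, new_gp);

	is roughly equivalent to this store-release operation::

		smp_store_release(&gp, new_gp);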
Perhaps just as important, it serves to document (1) which
pointers are protected by RCU and (2) the point at which a
@@ -251,14 +264,15 @@ rcu_dereference()
Like rcu_assign_pointer(), rcu_dereference() must be implemented
as a macro.
- The reader uses rcu_dereference() to fetch an RCU-protected
- pointer, which returns a value that may then be safely
- dereferenced. Note that rcu_dereference() does not actually
- dereference the pointer, instead, it protects the pointer for
- later dereferencing. It also executes any needed memory-barrier
- instructions for a given CPU architecture. Currently, only Alpha
- needs memory barriers within rcu_dereference() -- on other CPUs,
- it compiles to nothing, not even a compiler directive.
+ The reader uses the spatial rcu_dereference() macro to fetch
+ an RCU-protected pointer, which returns a value that may
+ then be safely dereferenced. Note that rcu_dereference()
+	does not actually dereference the pointer; instead, it
+ protects the pointer for later dereferencing. It also
+ executes any needed memory-barrier instructions for a given
+ CPU architecture. Currently, only Alpha needs memory barriers
+ within rcu_dereference() -- on other CPUs, it compiles to a
+ volatile load.
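
	For example, a reader might do the following, in which gp,
	the ->a field, and do_something_with() are hypothetical, and
	in which rcu_read_lock() and rcu_read_unlock() delimit the RCU
	read-side critical section::

		rcu_read_lock();
		p = rcu_dereference(gp);
		if (p)
			do_something_with(p->a); /* p remains valid here. */
		rcu_read_unlock();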
Common coding practice uses rcu_dereference() to copy an
RCU-protected pointer to a local variable, then dereferences
@@ -316,7 +330,7 @@ rcu_dereference()
must prohibit. The rcu_dereference_protected() variant takes
a lockdep expression to indicate which locks must be acquired
by the caller. If the indicated protection is not provided,
- a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
+ a lockdep splat is emitted. See Design/Requirements/Requirements.rst
and the API's code comments for more details and example usage.
.. [2] If the list_for_each_entry_rcu() instance might be used by
@@ -351,12 +365,15 @@ reader, updater, and reclaimer.
synchronize_rcu() & call_rcu()
-The RCU infrastructure observes the time sequence of rcu_read_lock(),
+The RCU infrastructure observes the temporal sequence of rcu_read_lock(),
rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in
order to determine when (1) synchronize_rcu() invocations may return
to their callers and (2) call_rcu() callbacks may be invoked. Efficient
implementations of the RCU infrastructure make heavy use of batching in
order to amortize their overhead over many uses of the corresponding APIs.
+The rcu_assign_pointer() and rcu_dereference() invocations communicate
+spatial changes via stores to and loads from the RCU-protected pointer in
+question.
There are at least three flavors of RCU usage in the Linux kernel. The diagram
above shows the most common one. On the updater side, the rcu_assign_pointer(),
@@ -388,7 +405,9 @@ b. RCU applied to networking data structures that may be subjected
c. RCU applied to scheduler and interrupt/NMI-handler tasks.
Again, most uses will be of (a). The (b) and (c) cases are important
-for specialized uses, but are relatively uncommon.
+for specialized uses, but are relatively uncommon. The SRCU, RCU-Tasks,
+RCU-Tasks-Rude, and RCU-Tasks-Trace flavors have similar relationships
+among their assorted primitives.
.. _3_whatisRCU:
@@ -397,8 +416,7 @@ for specialized uses, but are relatively uncommon.
This section shows a simple use of the core RCU API to protect a
global pointer to a dynamically allocated structure. More-typical
-uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
-:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
+uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst.
::
struct foo {
@@ -465,7 +483,7 @@ So, to sum up:
- Within an RCU read-side critical section, use rcu_dereference()
to dereference RCU-protected pointers.
-- Use some solid scheme (such as locks or semaphores) to
+- Use some solid design (such as locks or semaphores) to
keep concurrent updates from interfering with each other.
- Use rcu_assign_pointer() to update an RCU-protected pointer.
@@ -480,10 +498,9 @@ So, to sum up:
RCU read-side critical sections that might be referencing that
data item.
-See checklist.txt for additional rules to follow when using RCU.
-And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
-<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
-<NMI_rcu_doc>`.
+See checklist.rst for additional rules to follow when using RCU.
+And again, more-typical uses of RCU may be found in listRCU.rst,
+arrayRCU.rst, and NMI-RCU.rst.
.. _4_whatisRCU:
@@ -577,7 +594,15 @@ to avoid having to write your own callback::
kfree_rcu(old_fp, rcu);
-Again, see checklist.txt for additional rules governing the use of RCU.
+If the occasional sleep is permitted, the single-argument form may
+be used, omitting the rcu_head structure from struct foo::
+
+ kfree_rcu_mightsleep(old_fp);
+
+This variant almost never blocks, but might do so by invoking
+synchronize_rcu() in response to memory-allocation failure.
+
+Again, see checklist.rst for additional rules governing the use of RCU.
.. _5_whatisRCU:
@@ -594,7 +619,7 @@ lacking both functionality and performance. However, they are useful
in getting a feel for how RCU works. See kernel/rcu/update.c for a
production-quality implementation, and see:
- http://www.rdrop.com/users/paulmck/RCU
+ https://docs.google.com/document/d/1X0lThx8OK0ZgLMqVoXiR4ZrGURHrXK6NyLRbeXe3Xac/edit
for papers describing the Linux kernel RCU implementation. The OLS'01
and OLS'02 papers are a good introduction, and the dissertation provides
@@ -661,7 +686,7 @@ been able to write-acquire the lock otherwise. The smp_mb__after_spinlock()
promotes synchronize_rcu() to a full memory barrier in compliance with
the "Memory-Barrier Guarantees" listed in:
- Documentation/RCU/Design/Requirements/Requirements.rst
+ Design/Requirements/Requirements.rst
It is possible to nest rcu_read_lock(), since reader-writer locks may
be recursively acquired. Note also that rcu_read_lock() is immune
@@ -677,7 +702,7 @@ Quick Quiz #1:
occur when using this algorithm in a real-world Linux
kernel? How could this deadlock be avoided?
-:ref:`Answers to Quick Quiz <8_whatisRCU>`
+:ref:`Answers to Quick Quiz <9_whatisRCU>`
5B. "TOY" EXAMPLE #2: CLASSIC RCU
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -732,7 +757,7 @@ Quick Quiz #2:
Give an example where Classic RCU's read-side
overhead is **negative**.
-:ref:`Answers to Quick Quiz <8_whatisRCU>`
+:ref:`Answers to Quick Quiz <9_whatisRCU>`
.. _quiz_3:
@@ -741,7 +766,7 @@ Quick Quiz #3:
critical section, what the heck do you do in
CONFIG_PREEMPT_RT, where normal spinlocks can block???
-:ref:`Answers to Quick Quiz <8_whatisRCU>`
+:ref:`Answers to Quick Quiz <9_whatisRCU>`
.. _6_whatisRCU:
@@ -872,7 +897,86 @@ be used in place of synchronize_rcu().
.. _7_whatisRCU:
-7. FULL LIST OF RCU APIs
+7. ANALOGY WITH REFERENCE COUNTING
+-----------------------------------
+
+The reader-writer analogy (illustrated by the previous section) is not
+always the best way to think about using RCU. Another helpful analogy
+considers RCU an effective reference count on everything which is
+protected by RCU.
+
+A reference count typically does not prevent the referenced object's
+values from changing, but does prevent changes to type -- particularly the
+gross change of type that happens when that object's memory is freed and
+re-allocated for some other purpose. Once a type-safe reference to the
+object is obtained, some other mechanism is needed to ensure consistent
+access to the data in the object. This could involve taking a spinlock,
+but with RCU the typical approach is to perform reads with SMP-aware
+operations such as smp_load_acquire(), to perform updates with atomic
+read-modify-write operations, and to provide the necessary ordering.
+RCU provides a number of support functions that embed the required
+operations and ordering, such as the list_for_each_entry_rcu() macro
+used in the previous section.
+
+A more focused view of the reference counting behavior is that,
+between rcu_read_lock() and rcu_read_unlock(), any reference taken with
+rcu_dereference() on a pointer marked as ``__rcu`` can be treated as
+though a reference-count on that object has been temporarily increased.
+This prevents the object from changing type. Exactly what this means
+will depend on normal expectations of objects of that type, but it
+typically includes that spinlocks can still be safely locked, normal
+reference counters can be safely manipulated, and ``__rcu`` pointers
+can be safely dereferenced.
+
+Some operations that one might expect to see on an object for
+which an RCU reference is held include:
+
+ - Copying out data that is guaranteed to be stable by the object's type.
+ - Using kref_get_unless_zero() or similar to get a longer-term
+ reference. This may fail of course.
+ - Acquiring a spinlock in the object, and checking if the object still
+ is the expected object and if so, manipulating it freely.
+
+The understanding that RCU provides a reference that only prevents a
+change of type is particularly visible with objects allocated from a
+slab cache marked ``SLAB_TYPESAFE_BY_RCU``. RCU operations may yield a
+reference to an object from such a cache that has been concurrently freed
+and the memory reallocated to a completely different object, though of
+the same type. In this case RCU doesn't even protect the identity of the
+object from changing, only its type. So the object found may not be the
+one expected, but it will be one where it is safe to take a reference
+(and then potentially acquire a spinlock), allowing subsequent code
+to check whether the identity matches expectations. It is tempting
+to simply acquire the spinlock without first taking the reference, but
+unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
+initialized after each and every call to kmem_cache_alloc(), which renders
+reference-free spinlock acquisition completely unsafe. Therefore, when
+using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
+(Those willing to use a kmem_cache constructor may also use locking,
+including cache-friendly sequence locking.)
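+
+The resulting pattern might look as follows. This is only a sketch
+with hypothetical field and function names, but it shows the reference
+being taken under rcu_read_lock() and the object's identity being
+re-checked only after its lock is held::
+
+	rcu_read_lock();
+	obj = rcu_dereference(hash_table[slot]); /* hypothetical lookup */
+	if (obj && !kref_get_unless_zero(&obj->ref))
+		obj = NULL; /* Object is being freed. */
+	rcu_read_unlock();
+	if (obj) {
+		spin_lock(&obj->lock);
+		if (obj->key == key) {
+			/* Identity confirmed: use obj freely. */
+		}
+		spin_unlock(&obj->lock);
+		kref_put(&obj->ref, obj_release); /* hypothetical release */
+	}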
+
+With traditional reference counting -- such as that implemented by the
+kref library in Linux -- there is typically code that runs when the last
+reference to an object is dropped. With kref, this is the function
+passed to kref_put(). When RCU is being used, such finalization code
+must not be run until all ``__rcu`` pointers referencing the object have
+been updated, and then a grace period has passed. Every remaining
+globally visible pointer to the object must be considered to be a
+potential counted reference, and the finalization code is typically run
+using call_rcu() only after all those pointers have been changed.
+
+To see how to choose between these two analogies -- of RCU as a
+reader-writer lock and RCU as a reference counting system -- it is useful
+to reflect on the scale of the thing being protected. The reader-writer
+lock analogy looks at larger multi-part objects such as a linked list
+and shows how RCU can facilitate concurrency while elements are added
+to, and removed from, the list. The reference-count analogy looks at
+the individual objects, showing how they can be accessed safely within
+whatever whole they are a part of.
+
+.. _8_whatisRCU:
+
+8. FULL LIST OF RCU APIs
-------------------------
The RCU APIs are documented in docbook-format header comments in the
@@ -968,6 +1072,30 @@ sched::
rcu_read_lock_sched_held
+RCU-Tasks::
+
+ Critical sections Grace period Barrier
+
+ N/A call_rcu_tasks rcu_barrier_tasks
+ synchronize_rcu_tasks
+
+
+RCU-Tasks-Rude::
+
+ Critical sections Grace period Barrier
+
+ N/A call_rcu_tasks_rude rcu_barrier_tasks_rude
+ synchronize_rcu_tasks_rude
+
+
+RCU-Tasks-Trace::
+
+ Critical sections Grace period Barrier
+
+ rcu_read_lock_trace call_rcu_tasks_trace rcu_barrier_tasks_trace
+ rcu_read_unlock_trace synchronize_rcu_tasks_trace
+
+
SRCU::
Critical sections Grace period Barrier
@@ -985,14 +1113,20 @@ SRCU: Initialization/cleanup::
init_srcu_struct
cleanup_srcu_struct
-All: lockdep-checked RCU-protected pointer access::
+All: lockdep-checked RCU utility APIs::
- rcu_access_pointer
- rcu_dereference_raw
RCU_LOCKDEP_WARN
rcu_sleep_check
RCU_NONIDLE
+All: Unchecked RCU-protected pointer access::
+
+ rcu_dereference_raw
+
+All: Unchecked RCU-protected pointer access with dereferencing prohibited::
+
+ rcu_access_pointer
+
See the comment headers in the source code (or the docbook generated
from them) for more information.
@@ -1002,42 +1136,50 @@ list can be helpful:
a. Will readers need to block? If so, you need SRCU.
-b. What about the -rt patchset? If readers would need to block
- in an non-rt kernel, you need SRCU. If readers would block
- in a -rt kernel, but not in a non-rt kernel, SRCU is not
- necessary. (The -rt patchset turns spinlocks into sleeplocks,
- hence this distinction.)
+b. Will readers need to block and are you doing tracing, for
+ example, ftrace or BPF? If so, you need RCU-tasks,
+ RCU-tasks-rude, and/or RCU-tasks-trace.
-c. Do you need to treat NMI handlers, hardirq handlers,
+c. What about the -rt patchset? If readers would need to block in
+   a non-rt kernel, you need SRCU. If readers would block when
+ acquiring spinlocks in a -rt kernel, but not in a non-rt kernel,
+ SRCU is not necessary. (The -rt patchset turns spinlocks into
+ sleeplocks, hence this distinction.)
+
+d. Do you need to treat NMI handlers, hardirq handlers,
and code segments with preemption disabled (whether
via preempt_disable(), local_irq_save(), local_bh_disable(),
or some other mechanism) as if they were explicit RCU readers?
- If so, RCU-sched is the only choice that will work for you.
-
-d. Do you need RCU grace periods to complete even in the face
- of softirq monopolization of one or more of the CPUs? For
- example, is your code subject to network-based denial-of-service
- attacks? If so, you should disable softirq across your readers,
- for example, by using rcu_read_lock_bh().
-
-e. Is your workload too update-intensive for normal use of
+ If so, RCU-sched readers are the only choice that will work
+   for you, but since about v4.20 you can use the vanilla RCU
+ update primitives.
+
+e. Do you need RCU grace periods to complete even in the face of
+ softirq monopolization of one or more of the CPUs? For example,
+ is your code subject to network-based denial-of-service attacks?
+ If so, you should disable softirq across your readers, for
+   example, by using rcu_read_lock_bh(). Since about v4.20 you
+   can use the vanilla RCU update primitives.
+
+f. Is your workload too update-intensive for normal use of
RCU, but inappropriate for other synchronization mechanisms?
If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
named SLAB_DESTROY_BY_RCU). But please be careful!
-f. Do you need read-side critical sections that are respected
- even though they are in the middle of the idle loop, during
- user-mode execution, or on an offlined CPU? If so, SRCU is the
- only choice that will work for you.
+g. Do you need read-side critical sections that are respected even
+ on CPUs that are deep in the idle loop, during entry to or exit
+ from user-mode execution, or on an offlined CPU? If so, SRCU
+ and RCU Tasks Trace are the only choices that will work for you,
+ with SRCU being strongly preferred in almost all cases.
-g. Otherwise, use RCU.
+h. Otherwise, use RCU.
Of course, this all assumes that you have determined that RCU is in fact
the right tool for your job.
-.. _8_whatisRCU:
+.. _9_whatisRCU:
-8. ANSWERS TO QUICK QUIZZES
+9. ANSWERS TO QUICK QUIZZES
----------------------------
Quick Quiz #1:
diff --git a/Documentation/accel/index.rst b/Documentation/accel/index.rst
new file mode 100644
index 000000000000..e94a0160b6a0
--- /dev/null
+++ b/Documentation/accel/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Compute Accelerators
+====================
+
+.. toctree::
+ :maxdepth: 1
+
+ introduction
+ qaic/index
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/accel/introduction.rst b/Documentation/accel/introduction.rst
new file mode 100644
index 000000000000..89984dfececf
--- /dev/null
+++ b/Documentation/accel/introduction.rst
@@ -0,0 +1,110 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Introduction
+============
+
+The Linux compute accelerators subsystem is designed to expose compute
+accelerators in a common way to user-space and provide a common set of
+functionality.
+
+These devices can be either stand-alone ASICs or IP blocks inside an SoC/GPU.
+Although these devices are typically designed to accelerate
+Machine-Learning (ML) and/or Deep-Learning (DL) computations, the accel layer
+is not limited to handling these types of accelerators.
+
+Typically, a compute accelerator will belong to one of the following
+categories:
+
+- Edge AI - doing inference at an edge device. It can be an embedded ASIC/FPGA,
+ or an IP inside a SoC (e.g. laptop web camera). These devices
+ are typically configured using registers and can work with or without DMA.
+
+- Inference data-center - single/multi user devices in a large server. This
+ type of device can be stand-alone or an IP inside a SoC or a GPU. It will
+ have on-board DRAM (to hold the DL topology), DMA engines and
+ command submission queues (either kernel or user-space queues).
+ It might also have an MMU to manage multiple users and might also enable
+ virtualization (SR-IOV) to support multiple VMs on the same device. In
+  addition, these devices will usually have some tools, such as a profiler
+  and a debugger.
+
+- Training data-center - Similar to Inference data-center cards, but typically
+ have more computational power and memory b/w (e.g. HBM) and will likely have
+ a method of scaling-up/out, i.e. connecting to other training cards inside
+ the server or in other servers, respectively.
+
+All these devices typically have different runtime user-space software
+stacks that are tailor-made to their hardware. In addition, they will
+also probably include a compiler to generate programs for their
+custom-made computational engines. Typically, the common layer in
+user-space will be the DL frameworks, such as PyTorch and TensorFlow.
+
+Sharing code with DRM
+=====================
+
+Because this type of device can be an IP block inside a GPU or can have
+characteristics similar to those of GPUs, the accel subsystem will use the
+DRM subsystem's code and functionality. That is, the accel core code will
+be part of the DRM subsystem and an accel device will be a new type of DRM
+device.
+
+This will allow us to leverage the extensive DRM code-base and
+collaborate with DRM developers who have experience with this type of
+device. In addition, new features that will be added for the accelerator
+drivers can be of use to GPU drivers as well.
+
+Differentiation from GPUs
+=========================
+
+Because we want to prevent the extensive user-space graphic software stack
+from trying to use an accelerator as a GPU, the compute accelerators will be
+differentiated from GPUs by using a new major number and new device char files.
+
+Furthermore, the drivers will be located in a separate place in the kernel
+tree - drivers/accel/.
+
+The accelerator devices will be exposed to user space with the dedicated
+261 major number and will use the following convention:
+
+- device char files - /dev/accel/accel\*
+- sysfs - /sys/class/accel/accel\*/
+- debugfs - /sys/kernel/debug/accel/\*/
+
+Getting Started
+===============
+
+First, read the DRM documentation at Documentation/gpu/index.rst.
+Not only will it explain how to write a new DRM driver, it also contains
+all the information on how to contribute, the Code of Conduct, and the
+expected coding style and documentation. All of that is the same for the
+accel subsystem.
+
+Second, make sure the kernel is configured with CONFIG_DRM_ACCEL.
+
+To expose your device as an accelerator, two changes need to be made in
+your driver (as opposed to a standard DRM driver):
+
+- Add the DRIVER_COMPUTE_ACCEL feature flag in your drm_driver's
+ driver_features field. It is important to note that this driver feature is
+ mutually exclusive with DRIVER_RENDER and DRIVER_MODESET. Devices that want
+ to expose both graphics and compute device char files should be handled by
+ two drivers that are connected using the auxiliary bus framework.
+
+- Change the open callback in your driver fops structure to accel_open().
+  Alternatively, your driver can use the DEFINE_DRM_ACCEL_FOPS macro to easily
+  set up the correct file operations structure, as sketched below.
+
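+A minimal sketch of these two changes for a hypothetical "foo" driver follows.
+DRIVER_GEM is shown only as a typical companion flag, and the name, desc and
+version fields are placeholders:
+
+.. code-block:: c
+
+    #include <drm/drm_accel.h>
+    #include <drm/drm_drv.h>
+
+    DEFINE_DRM_ACCEL_FOPS(foo_accel_fops);
+
+    static const struct drm_driver foo_accel_driver = {
+        .driver_features = DRIVER_COMPUTE_ACCEL | DRIVER_GEM,
+        .fops = &foo_accel_fops,
+        .name = "foo_accel",
+        .desc = "Example compute accelerator",
+        .major = 1,
+        .minor = 0,
+    };
+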
+External References
+===================
+
+email threads
+-------------
+
+* `Initial discussion on the New subsystem for acceleration devices <https://lkml.org/lkml/2022/7/31/83>`_ - Oded Gabbay (2022)
+* `patch-set to add the new subsystem <https://lkml.org/lkml/2022/10/22/544>`_ - Oded Gabbay (2022)
+
+Conference talks
+----------------
+
+* `LPC 2022 Accelerators BOF outcomes summary <https://airlied.blogspot.com/2022/09/accelerators-bof-outcomes-summary.html>`_ - Dave Airlie (2022)
diff --git a/Documentation/accel/qaic/aic100.rst b/Documentation/accel/qaic/aic100.rst
new file mode 100644
index 000000000000..c80d0f1307db
--- /dev/null
+++ b/Documentation/accel/qaic/aic100.rst
@@ -0,0 +1,510 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+===============================
+ Qualcomm Cloud AI 100 (AIC100)
+===============================
+
+Overview
+========
+
+The Qualcomm Cloud AI 100/AIC100 family of products (including SA9000P - part of
+Snapdragon Ride) are PCIe adapter cards which contain a dedicated SoC ASIC for
+the purpose of efficiently running Artificial Intelligence (AI) Deep Learning
+inference workloads. They are AI accelerators.
+
+The PCIe interface of AIC100 is capable of PCIe Gen4 speeds over eight lanes
+(x8). An individual SoC on a card can have up to 16 NSPs for running workloads.
+Each SoC has an A53 management CPU. On card, there can be up to 32 GB of DDR.
+
+Multiple AIC100 cards can be hosted in a single system to scale overall
+performance. AIC100 cards are multi-user capable and able to execute workloads
+from multiple users in a concurrent manner.
+
+Hardware Description
+====================
+
+An AIC100 card consists of an AIC100 SoC, on-card DDR, and a set of misc
+peripherals (PMICs, etc).
+
+An AIC100 card can either be a PCIe HHHL form factor (a traditional PCIe card),
+or a Dual M.2 card. Both use PCIe to connect to the host system.
+
+As a PCIe endpoint/adapter, AIC100 uses the standard VendorID(VID)/
+DeviceID(DID) combination to uniquely identify itself to the host. AIC100
+uses the standard Qualcomm VID (0x17cb). All AIC100 SKUs use the same
+AIC100 DID (0xa100).
+
+AIC100 does not implement FLR (function level reset).
+
+AIC100 implements MSI but does not implement MSI-X. AIC100 requires 17 MSIs to
+operate (1 for MHI, 16 for the DMA Bridge).
+
+As a PCIe device, AIC100 utilizes BARs to provide host interfaces to the device
+hardware. AIC100 provides three 64-bit BARs.
+
+* The first BAR is 4K in size, and exposes the MHI interface to the host.
+
+* The second BAR is 2M in size, and exposes the DMA Bridge interface to the
+ host.
+
+* The third BAR is variable in size based on an individual AIC100's
+ configuration, but defaults to 64K. This BAR currently has no purpose.
+
+From the host perspective, AIC100 has several key hardware components -
+
+* MHI (Modem Host Interface)
+* QSM (QAIC Service Manager)
+* NSPs (Neural Signal Processor)
+* DMA Bridge
+* DDR
+
+MHI
+---
+
+AIC100 has one MHI interface over PCIe. MHI itself is documented at
+Documentation/mhi/index.rst. MHI is the mechanism the host uses to communicate
+with the QSM. Except for workload data via the DMA Bridge, all interaction with
+the device occurs via MHI.
+
+QSM
+---
+
+QAIC Service Manager. This is an ARM A53 CPU that runs the primary
+firmware of the card and performs on-card management tasks. It also
+communicates with the host via MHI. Each AIC100 has one of
+these.
+
+NSP
+---
+
+Neural Signal Processor. Each AIC100 has up to 16 of these. These are
+the processors that run the workloads on AIC100. Each NSP is a Qualcomm Hexagon
+(Q6) DSP with HVX and HMX. Each NSP can only run one workload at a time, but
+multiple NSPs may be assigned to a single workload. Since each NSP can only run
+one workload, AIC100 is limited to 16 concurrent workloads. Workload
+"scheduling" is under the purview of the host. AIC100 does not automatically
+timeslice.
+
+DMA Bridge
+----------
+
+The DMA Bridge is a custom DMA engine that manages the flow of data
+in and out of workloads. AIC100 has one of these. The DMA Bridge has 16
+channels, each consisting of a set of request/response FIFOs. Each active
+workload is assigned a single DMA Bridge channel. The DMA Bridge exposes
+hardware registers to manage the FIFOs (head/tail pointers), but requires host
+memory to store the FIFOs.
+
+DDR
+---
+
+AIC100 has on-card DDR. In total, an AIC100 can have up to 32 GB of DDR.
+This DDR is used to store workloads, data for the workloads, and is used by the
+QSM for managing the device. NSPs are granted access to sections of the DDR by
+the QSM. The host does not have direct access to the DDR, and must make
+requests to the QSM to transfer data to the DDR.
+
+High-level Use Flow
+===================
+
+AIC100 is a multi-user, programmable accelerator typically used for running
+neural networks in inferencing mode to efficiently perform AI operations.
+AIC100 is not intended for training neural networks. AIC100 can, however, be
+utilized for generic compute workloads.
+
+Assuming a user wants to utilize AIC100, they would follow these steps:
+
+1. Compile the workload into an ELF targeting the NSP(s)
+2. Make requests to the QSM to load the workload and related artifacts into the
+ device DDR
+3. Make a request to the QSM to activate the workload onto a set of idle NSPs
+4. Make requests to the DMA Bridge to send input data to the workload to be
+ processed, and other requests to receive processed output data from the
+ workload.
+5. Once the workload is no longer required, make a request to the QSM to
+ deactivate the workload, thus putting the NSPs back into an idle state.
+6. Once the workload and related artifacts are no longer needed for future
+ sessions, make requests to the QSM to unload the data from DDR. This frees
+ the DDR to be used by other users.
+
+
+Boot Flow
+=========
+
+AIC100 uses a flashless boot flow, derived from Qualcomm MSMs.
+
+When AIC100 is first powered on, it begins executing PBL (Primary Bootloader)
+from ROM. PBL enumerates the PCIe link, and initializes the BHI (Boot Host
+Interface) component of MHI.
+
+Using BHI, the host points PBL to the location of the SBL (Secondary Bootloader)
+image. The PBL pulls the image from the host, validates it, and begins
+execution of SBL.
+
+SBL initializes MHI, and uses MHI to notify the host that the device has entered
+the SBL stage. SBL performs a number of operations:
+
+* SBL initializes the majority of hardware (anything PBL left uninitialized),
+ including DDR.
+* SBL offloads the bootlog to the host.
+* SBL synchronizes timestamps with the host for future logging.
+* SBL uses the Sahara protocol to obtain the runtime firmware images from the
+ host.
+
+Once SBL has obtained and validated the runtime firmware, it brings the NSPs out
+of reset, and jumps into the QSM.
+
+The QSM uses MHI to notify the host that the device has entered the QSM stage
+(AMSS in MHI terms). At this point, the AIC100 device is fully functional, and
+ready to process workloads.
+
+Userspace components
+====================
+
+Compiler
+--------
+
+An open compiler for AIC100 based on upstream LLVM can be found at:
+https://github.com/quic/software-kit-for-qualcomm-cloud-ai-100-cc
+
+Usermode Driver (UMD)
+---------------------
+
+An open UMD that interfaces with the qaic kernel driver can be found at:
+https://github.com/quic/software-kit-for-qualcomm-cloud-ai-100
+
+Sahara loader
+-------------
+
+An open implementation of the Sahara protocol called kickstart can be found at:
+https://github.com/andersson/qdl
+
+MHI Channels
+============
+
+AIC100 defines a number of MHI channels for different purposes. This is a list
+of the defined channels, and their uses.
+
++----------------+---------+----------+----------------------------------------+
+| Channel name | IDs | EEs | Purpose |
++================+=========+==========+========================================+
+| QAIC_LOOPBACK | 0 & 1 | AMSS | Any data sent to the device on this |
+| | | | channel is sent back to the host. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_SAHARA | 2 & 3 | SBL | Used by SBL to obtain the runtime |
+| | | | firmware from the host. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_DIAG | 4 & 5 | AMSS | Used to communicate with QSM via the |
+| | | | DIAG protocol. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_SSR | 6 & 7 | AMSS | Used to notify the host of subsystem |
+| | | | restart events, and to offload SSR |
+| | | | crashdumps. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_QDSS | 8 & 9 | AMSS | Used for the Qualcomm Debug Subsystem. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_CONTROL | 10 & 11 | AMSS | Used for the Neural Network Control |
+| | | | (NNC) protocol. This is the primary |
+| | | | channel between host and QSM for |
+| | | | managing workloads. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_LOGGING | 12 & 13 | SBL | Used by the SBL to send the bootlog to |
+| | | | the host. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_STATUS | 14 & 15 | AMSS | Used to notify the host of Reliability,|
+| | | | Availability, Serviceability (RAS) |
+| | | | events. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_TELEMETRY | 16 & 17 | AMSS | Used to get/set power/thermal/etc |
+| | | | attributes. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_DEBUG | 18 & 19 | AMSS | Not used. |
++----------------+---------+----------+----------------------------------------+
+| QAIC_TIMESYNC | 20 & 21 | SBL/AMSS | Used to synchronize timestamps in the |
+| | | | device side logs with the host time |
+| | | | source. |
++----------------+---------+----------+----------------------------------------+
+
+DMA Bridge
+==========
+
+Overview
+--------
+
+The DMA Bridge is one of the main interfaces to the host from the device
+(the other being MHI). As part of activating a workload to run on NSPs, the QSM
+assigns that network a DMA Bridge channel. A workload's DMA Bridge channel
+(DBC for short) is solely for the use of that workload and is not shared with
+other workloads.
+
+Each DBC is a pair of FIFOs that manage data in and out of the workload. One
+FIFO is the request FIFO. The other FIFO is the response FIFO.
+
+Each DBC contains 4 registers in hardware:
+
+* Request FIFO head pointer (offset 0x0). Read only by the host. Indicates the
+ latest item in the FIFO the device has consumed.
+* Request FIFO tail pointer (offset 0x4). Read/write by the host. Host
+ increments this register to add new items to the FIFO.
+* Response FIFO head pointer (offset 0x8). Read/write by the host. Indicates
+ the latest item in the FIFO the host has consumed.
+* Response FIFO tail pointer (offset 0xc). Read only by the host. Device
+ increments this register to add new items to the FIFO.
+
+The values in each register are indexes in the FIFO. To get the location of the
+FIFO element pointed to by the register: FIFO base address + register * element
+size.
+
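+As an illustrative helper (not actual driver code), that computation could be:
+
+.. code-block:: c
+
+    /* Resolve a FIFO index from a DBC register to a host virtual address. */
+    static inline void *dbc_fifo_entry(void *fifo_base, u32 index,
+                                       size_t elem_size)
+    {
+        return (char *)fifo_base + (size_t)index * elem_size;
+    }
+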
+DBC registers are exposed to the host via the second BAR. Each DBC consumes
+4KB of space in the BAR.
+
+The actual FIFOs are backed by host memory. When sending a request to the QSM
+to activate a network, the host must donate memory to be used for the FIFOs.
+Due to internal mapping limitations of the device, a single contiguous chunk of
+memory must be provided per DBC, which hosts both FIFOs. The request FIFO will
+consume the beginning of the memory chunk, and the response FIFO will consume
+the end of the memory chunk.
+
+Request FIFO
+------------
+
+A request FIFO element has the following structure:
+
+.. code-block:: c
+
+ struct request_elem {
+ u16 req_id;
+ u8 seq_id;
+ u8 pcie_dma_cmd;
+ u32 reserved1;
+ u64 pcie_dma_source_addr;
+ u64 pcie_dma_dest_addr;
+ u32 pcie_dma_len;
+ u32 reserved2;
+ u64 doorbell_addr;
+ u8 doorbell_attr;
+ u8 reserved3;
+ u16 reserved4;
+ u32 doorbell_data;
+ u32 sem_cmd0;
+ u32 sem_cmd1;
+ u32 sem_cmd2;
+ u32 sem_cmd3;
+ };
+
+Request field descriptions:
+
+req_id
+ request ID. A request FIFO element and a response FIFO element with
+ the same request ID refer to the same command.
+
+seq_id
+ sequence ID within a request. Ignored by the DMA Bridge.
+
+pcie_dma_cmd
+ describes the DMA element of this request.
+
+ * Bit(7) is the force MSI flag, which overrides the DMA Bridge MSI logic
+ and generates an MSI when this request is complete, provided QSM has
+ configured the DMA Bridge to look at this bit.
+ * Bits(6:5) are reserved.
+ * Bit(4) is the completion code flag, and indicates that the DMA Bridge
+ shall generate a response FIFO element when this request is
+ complete.
+ * Bit(3) indicates if this request is a linked list transfer(0) or a bulk
+ transfer(1).
+ * Bit(2) is reserved.
+ * Bits(1:0) indicate the type of transfer. No transfer(0), to device(1),
+ from device(2). Value 3 is illegal.
+
+pcie_dma_source_addr
+ source address for a bulk transfer, or the address of the linked list.
+
+pcie_dma_dest_addr
+ destination address for a bulk transfer.
+
+pcie_dma_len
+ length of the bulk transfer. Note that the size of this field
+ limits transfers to 4G in size.
+
+doorbell_addr
+ address of the doorbell to ring when this request is complete.
+
+doorbell_attr
+ doorbell attributes.
+
+ * Bit(7) indicates if a write to a doorbell is to occur.
+ * Bits(6:2) are reserved.
+ * Bits(1:0) contain the encoding of the doorbell length. 0 is 32-bit,
+ 1 is 16-bit, 2 is 8-bit, 3 is reserved. The doorbell address
+ must be naturally aligned to the specified length.
+
+doorbell_data
+ data to write to the doorbell. Only the bits corresponding to
+ the doorbell length are valid.
+
+sem_cmdN
+ semaphore command.
+
+ * Bit(31) indicates this semaphore command is enabled.
+ * Bit(30) is the to-device DMA fence. Block this request until all
+ to-device DMA transfers are complete.
+ * Bit(29) is the from-device DMA fence. Block this request until all
+ from-device DMA transfers are complete.
+ * Bits(28:27) are reserved.
+ * Bits(26:24) are the semaphore command. 0 is NOP. 1 is init with the
+ specified value. 2 is increment. 3 is decrement. 4 is wait
+ until the semaphore is equal to the specified value. 5 is wait
+ until the semaphore is greater than or equal to the specified value.
+ 6 is "P", wait until semaphore is greater than 0, then
+ decrement by 1. 7 is reserved.
+ * Bit(23) is reserved.
+ * Bit(22) is the semaphore sync. 0 is post sync, which means that the
+ semaphore operation is done after the DMA transfer. 1 is
+ presync, which gates the DMA transfer. Only one presync is
+ allowed per request.
+ * Bit(21) is reserved.
+ * Bits(20:16) contain the index of the semaphore to operate on.
+ * Bits(15:12) are reserved.
+ * Bits(11:0) are the semaphore value to use in operations.
+
+Overall, a request is processed in 4 steps:
+
+1. If specified, the presync semaphore condition must be true
+2. If enabled, the DMA transfer occurs
+3. If specified, the postsync semaphore conditions must be true
+4. If enabled, the doorbell is written
+
+By using the semaphores in conjunction with the workload running on the NSPs,
+the data pipeline can be synchronized such that the host can queue multiple
+requests of data for the workload to process, but the DMA Bridge will only copy
+the data into the memory of the workload when the workload is ready to process
+the next input.
+
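+As an illustration, a bulk host-to-device transfer that generates a response
+FIFO element could encode pcie_dma_cmd as follows. The macro names are
+hypothetical; only the bit positions come from the layout above:
+
+.. code-block:: c
+
+    #define DMA_CMD_COMPLETION (1u << 4) /* generate a response element */
+    #define DMA_CMD_BULK       (1u << 3) /* bulk, not linked list */
+    #define DMA_CMD_TO_DEVICE  (1u << 0) /* transfer type in bits 1:0 */
+
+    /* req points to a struct request_elem being built by the host. */
+    req->pcie_dma_cmd = DMA_CMD_BULK | DMA_CMD_COMPLETION | DMA_CMD_TO_DEVICE;
+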
+Response FIFO
+-------------
+
+Once a request is fully processed, a response FIFO element is generated if
+specified in pcie_dma_cmd. The structure of a response FIFO element:
+
+.. code-block:: c
+
+ struct response_elem {
+ u16 req_id;
+ u16 completion_code;
+ };
+
+req_id
+ matches the req_id of the request that generated this element.
+
+completion_code
+ status of this request. 0 is success. Non-zero is an error.
+
+The DMA Bridge will generate an MSI to the host in reaction to activity in the
+response FIFO of a DBC. The DMA Bridge hardware has an IRQ storm mitigation
+algorithm, where it will only generate an MSI when the response FIFO transitions
+from empty to non-empty (unless force MSI is enabled and triggered). In
+response to this MSI, the host is expected to drain the response FIFO, and must
+take care to handle any race conditions between draining the FIFO, and the
+device inserting elements into the FIFO.
+
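+A sketch of such a drain loop, using the register offsets listed earlier.
+The helper names, variables and synchronization strategy are hypothetical;
+real code must also handle concurrency with the device carefully:
+
+.. code-block:: c
+
+    /* dbc_bar: __iomem mapping of this DBC's registers; resp_fifo: host
+     * memory backing the response FIFO; fifo_depth: number of elements. */
+    u32 head = readl(dbc_bar + 0x8); /* response FIFO head (host owned) */
+    u32 tail = readl(dbc_bar + 0xc); /* response FIFO tail (device owned) */
+
+    while (head != tail) {
+        struct response_elem *elem = &resp_fifo[head];
+
+        complete_request(elem->req_id, elem->completion_code);
+        head = (head + 1) % fifo_depth;
+        tail = readl(dbc_bar + 0xc); /* the device may have added more */
+    }
+    writel(head, dbc_bar + 0x8); /* publish the consumed index */
+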
+Neural Network Control (NNC) Protocol
+=====================================
+
+The NNC protocol is how the host makes requests to the QSM to manage workloads.
+It uses the QAIC_CONTROL MHI channel.
+
+Each NNC request is packaged into a message. Each message is a series of
+transactions. A passthrough type transaction can contain elements known as
+commands.
+
+QSM requires that NNC messages be little endian encoded and that the fields be
+naturally aligned. Since there are 64-bit elements in some NNC messages,
+64-bit alignment must be maintained.
+
+A message contains a header and then a series of transactions. A message may be
+at most 4K in size from QSM to the host. From the host to the QSM, a message
+can be at most 64K (maximum size of a single MHI packet), but there is a
+continuation feature where message N+1 can be marked as a continuation of
+message N. This is used for exceedingly large DMA xfer transactions.
+
+Transaction descriptions
+------------------------
+
+passthrough
+ Allows userspace to send an opaque payload directly to the QSM.
+ This is used for NNC commands. Userspace is responsible for managing
+ the QSM message requirements in the payload.
+
+dma_xfer
+ DMA transfer. Describes an object that the QSM should DMA into the
+ device via address and size tuples.
+
+activate
+ Activate a workload onto NSPs. The host must provide memory to be
+ used by the DBC.
+
+deactivate
+ Deactivate an active workload and return the NSPs to idle.
+
+status
+ Query the QSM about its NNC implementation. Returns the NNC version
+ and whether CRC is used.
+
+terminate
+ Release a user's resources.
+
+dma_xfer_cont
+ Continuation of a previous DMA transfer. If a DMA transfer
+ cannot be specified in a single message (highly fragmented), this
+ transaction can be used to specify more ranges.
+
+validate_partition
+ Query to QSM to determine if a partition identifier is valid.
+
+Each message is tagged with a user id, and a partition id. The user id allows
+QSM to track resources, and release them when the user goes away (e.g. the
+process crashes). A partition id identifies the QSM-managed resource partition
+to which this message applies.
+
+Messages may have CRCs. Messages should have CRCs applied until the QSM
+reports via the status transaction that CRCs are not needed. The QSM on the
+SA9000P requires CRCs for black channel safing.
+
+Subsystem Restart (SSR)
+=======================
+
+SSR is the concept of limiting the impact of an error. An AIC100 device may
+have multiple users, each with their own workload running. If the workload of
+one user crashes, the fallout of that should be limited to that workload and not
+impact other workloads. SSR accomplishes this.
+
+If a particular workload crashes, QSM notifies the host via the QAIC_SSR MHI
+channel. This notification identifies the workload by its assigned DBC. A
+multi-stage recovery process is then used to cleanup both sides, and get the
+DBC/NSPs into a working state.
+
+When SSR occurs, any state in the workload is lost. Any inputs that were in
+process, or queued but not yet serviced, are lost. The loaded artifacts will
+remain in on-card DDR, but the host will need to re-activate the workload if
+it desires to recover the workload.
+
+Reliability, Availability, Serviceability (RAS)
+================================================
+
+AIC100 is expected to be deployed in server systems where RAS ideology is
+applied. Simply put, RAS is the concept of detecting, classifying, and
+reporting errors. While PCIe has AER (Advanced Error Reporting) which factors
+into RAS, AER does not allow for a device to report details about internal
+errors. Therefore, AIC100 implements a custom RAS mechanism. When a RAS event
+occurs, QSM will report the event with appropriate details via the QAIC_STATUS
+MHI channel. A sysadmin may determine that a particular device needs
+additional service based on RAS reports.
+
+Telemetry
+=========
+
+QSM has the ability to report various physical attributes of the device, and in
+some cases, to allow the host to control them. Examples include thermal limits,
+thermal readings, and power readings. These items are communicated via the
+QAIC_TELEMETRY MHI channel.
diff --git a/Documentation/accel/qaic/index.rst b/Documentation/accel/qaic/index.rst
new file mode 100644
index 000000000000..ad19b88d1a66
--- /dev/null
+++ b/Documentation/accel/qaic/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+=====================================
+ accel/qaic Qualcomm Cloud AI driver
+=====================================
+
+The accel/qaic driver supports the Qualcomm Cloud AI machine learning
+accelerator cards.
+
+.. toctree::
+
+ qaic
+ aic100
diff --git a/Documentation/accel/qaic/qaic.rst b/Documentation/accel/qaic/qaic.rst
new file mode 100644
index 000000000000..72a70ab6e3a8
--- /dev/null
+++ b/Documentation/accel/qaic/qaic.rst
@@ -0,0 +1,170 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+=============
+ QAIC driver
+=============
+
+The QAIC driver is the Kernel Mode Driver (KMD) for the AIC100 family of AI
+accelerator products.
+
+Interrupts
+==========
+
+While the AIC100 DMA Bridge hardware implements an IRQ storm mitigation
+mechanism, it is still possible for an IRQ storm to occur. A storm can happen
+if the workload is particularly quick, and the host is responsive. If the host
+can drain the response FIFO as quickly as the device can insert elements into
+it, then the device will frequently transition the response FIFO from empty to
+non-empty and generate MSIs at a rate matching the speed at which the
+workload processes inputs. The lprnet (license plate reader network)
+workload is known to trigger this condition, and can generate in excess of 100k
+MSIs per second. It has been observed that most systems cannot tolerate this
+for long, and will crash due to some form of watchdog triggered by the
+overhead of the interrupt controller interrupting the host CPU.
+
+To mitigate this issue, the QAIC driver implements specific IRQ handling. When
+QAIC receives an IRQ, it disables that line. This prevents the interrupt
+controller from interrupting the CPU. Then QAIC drains the FIFO. Once the FIFO
+is drained, QAIC implements a "last chance" polling algorithm where QAIC will
+sleep for a time to see if the workload will generate more activity. The IRQ
+line remains disabled during this time. If no activity is detected, QAIC exits
+polling mode and reenables the IRQ line.
+
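+A rough sketch of that handling follows; the helper names and the dbc type
+are hypothetical, and the real driver logic is more involved:
+
+.. code-block:: c
+
+    #include <linux/delay.h>
+    #include <linux/interrupt.h>
+
+    static unsigned int poll_interval_us = 100; /* hypothetical knob */
+
+    static irqreturn_t dbc_irq_thread(int irq, void *data)
+    {
+        struct dbc *dbc = data;
+
+        disable_irq_nosync(irq); /* stop the storm at its source */
+        do {
+            drain_response_fifo(dbc);
+            /* "last chance" wait for more workload activity */
+            usleep_range(poll_interval_us, 2 * poll_interval_us);
+        } while (response_fifo_active(dbc));
+        enable_irq(irq);
+        return IRQ_HANDLED;
+    }
+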
+This mitigation in QAIC is very effective. The same lprnet usecase that
+generates 100k IRQs per second (per /proc/interrupts) is reduced to roughly 64
+IRQs over 5 minutes while keeping the host system stable, and having the same
+workload throughput performance (within run to run noise variation).
+
+
+Neural Network Control (NNC) Protocol
+=====================================
+
+The implementation of NNC is split between the KMD (QAIC) and UMD. In general,
+QAIC understands how to encode/decode the NNC wire protocol and handles those
+elements of the protocol which require kernel space knowledge to process (for
+example, mapping host memory to device IOVAs). QAIC understands the structure
+of a message, and
+all of the transactions. QAIC does not understand commands (the payload of a
+passthrough transaction).
+
+QAIC handles and enforces the required little endianness and 64-bit alignment,
+to the degree that it can. Since QAIC does not know the contents of a
+passthrough transaction, it relies on the UMD to satisfy the requirements.
+
+The terminate transaction is of particular use to QAIC. QAIC is not aware of
+the resources that are loaded onto a device since the majority of that activity
+occurs within NNC commands. As a result, QAIC does not have the means to
+roll back userspace activity. To ensure that a userspace client's resources
+are fully released in the case of a process crash or a bug, QAIC uses the
+terminate transaction to let QSM know when a user has gone away and its
+resources can be released.
+
+QSM can report a version number of the NNC protocol it supports. This is in the
+form of a Major number and a Minor number.
+
+Major number updates indicate changes to the NNC protocol which impact the
+message format, or transactions (impacts QAIC).
+
+Minor number updates indicate changes to the NNC protocol which impact the
+commands (does not impact QAIC).
+
+uAPI
+====
+
+QAIC defines a number of driver specific IOCTLs as part of the userspace API.
+This section describes those APIs.
+
+DRM_IOCTL_QAIC_MANAGE
+ This IOCTL allows userspace to send a NNC request to the QSM. The call will
+ block until a response is received, or the request has timed out.
+
+DRM_IOCTL_QAIC_CREATE_BO
+ This IOCTL allows userspace to allocate a buffer object (BO) which can send
+ or receive data from a workload. The call will return a GEM handle that
+ represents the allocated buffer. The BO is not usable until it has been
+ sliced (see DRM_IOCTL_QAIC_ATTACH_SLICE_BO).
+
+DRM_IOCTL_QAIC_MMAP_BO
+ This IOCTL allows userspace to prepare an allocated BO to be mmap'd into the
+ userspace process.
+
+DRM_IOCTL_QAIC_ATTACH_SLICE_BO
+ This IOCTL allows userspace to slice a BO in preparation for sending the BO
+ to the device. Slicing is the operation of describing what portions of a BO
+ get sent where to a workload. This requires a set of DMA transfers for the
+ DMA Bridge, and as such, locks the BO to a specific DBC.
+
+DRM_IOCTL_QAIC_EXECUTE_BO
+ This IOCTL allows userspace to submit a set of sliced BOs to the device. The
+ call is non-blocking. Success only indicates that the BOs have been queued
+ to the device, but does not guarantee they have been executed.
+
+DRM_IOCTL_QAIC_PARTIAL_EXECUTE_BO
+ This IOCTL operates like DRM_IOCTL_QAIC_EXECUTE_BO, but it allows userspace
+ to shrink the BOs sent to the device for this specific call. If a BO
+ typically has N inputs, but only a subset of those is available, this IOCTL
+ allows userspace to indicate that only the first M bytes of the BO should be
+ sent to the device to minimize data transfer overhead. This IOCTL dynamically
+ recomputes the slicing, and therefore has some processing overhead before the
+ BOs can be queued to the device.
+
+DRM_IOCTL_QAIC_WAIT_BO
+ This IOCTL allows userspace to determine when a particular BO has been
+ processed by the device. The call will block until either the BO has been
+ processed and can be re-queued to the device, or a timeout occurs.
+
+DRM_IOCTL_QAIC_PERF_STATS_BO
+ This IOCTL allows userspace to collect performance statistics on the most
+ recent execution of a BO. This allows userspace to construct an end to end
+ timeline of the BO processing for a performance analysis.
+
+DRM_IOCTL_QAIC_PART_DEV
+ This IOCTL allows userspace to request a duplicate "shadow device". This extra
+ accelN device is associated with a specific partition of resources on the
+ AIC100 device and can be used for limiting a process to some subset of
+ resources.
+
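+A hypothetical userspace sketch of the start of this flow (consult
+include/uapi/drm/qaic_accel.h for the authoritative structure layouts):
+
+.. code-block:: c
+
+    #include <fcntl.h>
+    #include <sys/ioctl.h>
+    #include <drm/qaic_accel.h>
+
+    int main(void)
+    {
+        int fd = open("/dev/accel/accel0", O_RDWR);
+        struct qaic_create_bo create = { .size = 4096 };
+
+        if (fd < 0 || ioctl(fd, DRM_IOCTL_QAIC_CREATE_BO, &create))
+            return 1;
+        /* create.handle now holds the GEM handle; prepare the BO with
+         * DRM_IOCTL_QAIC_MMAP_BO and DRM_IOCTL_QAIC_ATTACH_SLICE_BO
+         * before DRM_IOCTL_QAIC_EXECUTE_BO / DRM_IOCTL_QAIC_WAIT_BO. */
+        return 0;
+    }
+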
+Userspace Client Isolation
+==========================
+
+AIC100 supports multiple clients. Multiple DBCs can be consumed by a single
+client, and multiple clients can each consume one or more DBCs. Workloads
+may contain sensitive information; therefore, only the client that owns the
+workload should be allowed to interface with the DBC.
+
+Clients are identified by the instance associated with their open(). A client
+may only use memory they allocate, and DBCs that are assigned to their
+workloads. Attempts to access resources assigned to other clients will be
+rejected.
+
+Module parameters
+=================
+
+QAIC supports the following module parameters:
+
+**datapath_polling (bool)**
+
+Configures QAIC to use a polling thread for datapath events instead of relying
+on the device interrupts. Useful for platforms with broken multiMSI. Must be
+set at QAIC driver initialization. Default is 0 (off).
+
+**mhi_timeout_ms (unsigned int)**
+
+Sets the timeout value for MHI operations in milliseconds (ms). Must be set
+at the time the driver detects a device. Default is 2000 (2 seconds).
+
+**control_resp_timeout_s (unsigned int)**
+
+Sets the timeout value for QSM responses to NNC messages in seconds (s). Must
+be set at the time the driver is sending a request to QSM. Default is 60 (one
+minute).
+
+**wait_exec_default_timeout_ms (unsigned int)**
+
+Sets the default timeout for the wait_exec ioctl in milliseconds (ms). Must be
+set prior to the wait_exec ioctl call. A value specified in the ioctl call
+overrides this for that call. Default is 5000 (5 seconds).
+
+**datapath_poll_interval_us (unsigned int)**
+
+Sets the polling interval in microseconds (us) when datapath polling is active.
+Takes effect at the next polling interval. Default is 100 (100 us).
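+
+For example, to load the driver with datapath polling enabled and a 50 us
+polling interval (values illustrative)::
+
+    # modprobe qaic datapath_polling=1 datapath_poll_interval_us=50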
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 1b8b46deeb29..f61c01fc376e 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -13,6 +13,10 @@ a) waiting for a CPU (while being runnable)
b) completion of synchronous block I/O initiated by the task
c) swapping in pages
d) memory reclaim
+e) thrashing
+f) direct compact
+g) write-protect copy
+h) IRQ/SOFTIRQ
and makes these statistics available to userspace through
the taskstats interface.
@@ -41,11 +45,12 @@ generic data structure to userspace corresponding to per-pid and per-tgid
statistics. The delay accounting functionality populates specific fields of
this structure. See
- include/linux/taskstats.h
+ include/uapi/linux/taskstats.h
for a description of the fields pertaining to delay accounting.
It will generally be in the form of counters returning the cumulative
-delay seen for cpu, sync block I/O, swapin, memory reclaim etc.
+delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page
+cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc.
Taking the difference of two successive readings of a given
counter (say cpu_delay_total) for a task will give the delay
@@ -88,41 +93,41 @@ seen.
General format of the getdelays command::
- getdelays [-t tgid] [-p pid] [-c cmd...]
-
+ getdelays [-dilv] [-t tgid] [-p pid]
Get delays, since system boot, for pid 10::
- # ./getdelays -p 10
+ # ./getdelays -d -p 10
(output similar to next case)
Get sum of delays, since system boot, for all pids with tgid 5::
- # ./getdelays -t 5
-
-
- CPU count real total virtual total delay total
- 7876 92005750 100000000 24001500
- IO count delay total
- 0 0
- SWAP count delay total
- 0 0
- RECLAIM count delay total
- 0 0
-
-Get delays seen in executing a given simple command::
-
- # ./getdelays -c ls /
-
- bin data1 data3 data5 dev home media opt root srv sys usr
- boot data2 data4 data6 etc lib mnt proc sbin subdomain tmp var
-
-
- CPU count real total virtual total delay total
- 6 4000250 4000000 0
- IO count delay total
- 0 0
- SWAP count delay total
- 0 0
- RECLAIM count delay total
- 0 0
+ # ./getdelays -d -t 5
+ print delayacct stats ON
+ TGID 5
+
+
+ CPU count real total virtual total delay total delay average
+ 8 7000000 6872122 3382277 0.423ms
+ IO count delay total delay average
+ 0 0 0.000ms
+ SWAP count delay total delay average
+ 0 0 0.000ms
+ RECLAIM count delay total delay average
+ 0 0 0.000ms
+ THRASHING count delay total delay average
+ 0 0 0.000ms
+ COMPACT count delay total delay average
+ 0 0 0.000ms
+ WPCOPY count delay total delay average
+ 0 0 0.000ms
+ IRQ count delay total delay average
+ 0 0 0.000ms
+
+Get IO accounting for pid 1, which works only with -p::
+
+ # ./getdelays -i -p 1
+ printing IO accounting
+ linuxrc: read=65536, write=0, cancelled_write=0
+
+The above command can be used with -v to get more debug information.
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
index f2b3439edcc2..df6062eb3abb 100644
--- a/Documentation/accounting/psi.rst
+++ b/Documentation/accounting/psi.rst
@@ -37,11 +37,7 @@ Pressure interface
Pressure information for each resource is exported through the
respective file in /proc/pressure/ -- cpu, memory, and io.
-The format for CPU is as such::
-
- some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
-and for memory and IO::
+The format is as such::
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
@@ -58,6 +54,9 @@ situation from a state where some tasks are stalled but the CPU is
still doing productive work. As such, time spent in this subset of the
stall state is tracked separately and exported in the "full" averages.
+CPU full is undefined at the system level, but has been reported
+since 5.13, so it is set to zero for backward compatibility.
+
The ratios (in %) are tracked as recent trends over ten, sixty, and
three hundred second windows, which gives insight into short term events
as well as medium and long term trends. The total absolute stall time
@@ -92,7 +91,8 @@ Triggers can be set on more than one psi metric and more than one trigger
for the same psi metric can be specified. However for each trigger a separate
file descriptor is required to be able to poll it separately from others,
therefore for each trigger a separate open() syscall should be made even
-when opening the same psi interface file.
+when opening the same psi interface file. Write operations to a file descriptor
+with an already existing psi trigger will fail with EBUSY.
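+
+A minimal C sketch of creating and waiting on a trigger; the threshold and
+window values are illustrative, and includes/error handling are omitted::
+
+    int fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+    const char trig[] = "some 150000 1000000"; /* 150ms stall per 1s window */
+    struct pollfd fds = { .fd = fd, .events = POLLPRI };
+
+    write(fd, trig, strlen(trig) + 1);
+    poll(&fds, 1, -1); /* returns when the trigger fires */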
Monitors activate only when system enters stall state for the monitored
psi metric and deactivates upon exit from the stall state. While system is
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
after which monitors are most likely not needed and psi averages can be used
instead.
+Unprivileged users can also create monitors, with the only limitation that the
+window size must be a multiple of 2s, in order to prevent excessive resource
+usage.
+
When activated, psi monitor stays active for at least the duration of one
tracking window to avoid repeated activations/deactivations when system is
bouncing in and out of the stall state.
diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index caa3c09a5c3f..9a969c0157f1 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -1,9 +1,9 @@
.. _readme:
-Linux kernel release 5.x <http://kernel.org/>
+Linux kernel release 6.x <http://kernel.org/>
=============================================
-These are the release notes for Linux version 5. Read them carefully,
+These are the release notes for Linux version 6. Read them carefully,
as they tell you what this is all about, explain how to install the
kernel, and what to do if something goes wrong.
@@ -63,7 +63,7 @@ Installing the kernel source
directory where you have permissions (e.g. your home directory) and
unpack it::
- xz -cd linux-5.x.tar.xz | tar xvf -
+ xz -cd linux-6.x.tar.xz | tar xvf -
Replace "X" with the version number of the latest kernel.
@@ -72,12 +72,12 @@ Installing the kernel source
files. They should match the library, and not get messed up by
whatever the kernel-du-jour happens to be.
- - You can also upgrade between 5.x releases by patching. Patches are
+ - You can also upgrade between 6.x releases by patching. Patches are
distributed in the xz format. To install by patching, get all the
newer patch files, enter the top level directory of the kernel source
- (linux-5.x) and execute::
+ (linux-6.x) and execute::
- xz -cd ../patch-5.x.xz | patch -p1
+ xz -cd ../patch-6.x.xz | patch -p1
Replace "x" for all versions bigger than the version "x" of your current
source tree, **in_order**, and you should be ok. You may want to remove
@@ -85,13 +85,13 @@ Installing the kernel source
that there are no failed patches (some-file-name# or some-file-name.rej).
If there are, either you or I have made a mistake.
- Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
+ Unlike patches for the 6.x kernels, patches for the 6.x.y kernels
(also known as the -stable kernels) are not incremental but instead apply
- directly to the base 5.x kernel. For example, if your base kernel is 5.0
- and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
- and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
- want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
- patch -R) **before** applying the 5.0.3 patch. You can read more on this in
+ directly to the base 6.x kernel. For example, if your base kernel is 6.0
+ and you want to apply the 6.0.3 patch, you must not first apply the 6.0.1
+ and 6.0.2 patches. Similarly, if you are running kernel version 6.0.2 and
+ want to jump to 6.0.3, you must first reverse the 6.0.2 patch (that is,
+ patch -R) **before** applying the 6.0.3 patch. You can read more on this in
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
Alternatively, the script patch-kernel can be used to automate this
@@ -114,7 +114,7 @@ Installing the kernel source
Software requirements
---------------------
- Compiling and running the 5.x kernels requires up-to-date
+ Compiling and running the 6.x kernels requires up-to-date
versions of various software packages. Consult
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
required and how to get updates for these packages. Beware that using
@@ -132,12 +132,12 @@ Build directory for the kernel
place for the output files (including .config).
Example::
- kernel source code: /usr/src/linux-5.x
+ kernel source code: /usr/src/linux-6.x
build directory: /home/name/build/kernel
To configure and build the kernel, use::
- cd /usr/src/linux-5.x
+ cd /usr/src/linux-6.x
make O=/home/name/build/kernel menuconfig
make O=/home/name/build/kernel
sudo make O=/home/name/build/kernel modules_install install
@@ -262,8 +262,6 @@ Compiling the kernel
- Make sure you have at least gcc 5.1 available.
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
- Please note that you can still run a.out user programs with this kernel.
-
- Do a ``make`` to create a compressed kernel image. It is also
possible to do ``make install`` if you have lilo installed to suit the
kernel makefiles, but you may want to check your particular lilo setup first.
@@ -332,85 +330,10 @@ Compiling the kernel
If something goes wrong
-----------------------
- - If you have problems that seem to be due to kernel bugs, please check
- the file MAINTAINERS to see if there is a particular person associated
- with the part of the kernel that you are having trouble with. If there
- isn't anyone listed there, then the second best thing is to mail
- them to me (torvalds@linux-foundation.org), and possibly to any other
- relevant mailing-list or to the newsgroup.
-
- - In all bug-reports, *please* tell what kernel you are talking about,
- how to duplicate the problem, and what your setup is (use your common
- sense). If the problem is new, tell me so, and if the problem is
- old, please try to tell me when you first noticed it.
-
- - If the bug results in a message like::
-
- unable to handle kernel paging request at address C0000010
- Oops: 0002
- EIP: 0010:XXXXXXXX
- eax: xxxxxxxx ebx: xxxxxxxx ecx: xxxxxxxx edx: xxxxxxxx
- esi: xxxxxxxx edi: xxxxxxxx ebp: xxxxxxxx
- ds: xxxx es: xxxx fs: xxxx gs: xxxx
- Pid: xx, process nr: xx
- xx xx xx xx xx xx xx xx xx xx
-
- or similar kernel debugging information on your screen or in your
- system log, please duplicate it *exactly*. The dump may look
- incomprehensible to you, but it does contain information that may
- help debugging the problem. The text above the dump is also
- important: it tells something about why the kernel dumped code (in
- the above example, it's due to a bad kernel pointer). More information
- on making sense of the dump is in Documentation/admin-guide/bug-hunting.rst
-
- - If you compiled the kernel with CONFIG_KALLSYMS you can send the dump
- as is, otherwise you will have to use the ``ksymoops`` program to make
- sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
- This utility can be downloaded from
- https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
- Alternatively, you can do the dump lookup by hand:
-
- - In debugging dumps like the above, it helps enormously if you can
- look up what the EIP value means. The hex value as such doesn't help
- me or anybody else very much: it will depend on your particular
- kernel setup. What you should do is take the hex value from the EIP
- line (ignore the ``0010:``), and look it up in the kernel namelist to
- see which kernel function contains the offending address.
-
- To find out the kernel function name, you'll need to find the system
- binary associated with the kernel that exhibited the symptom. This is
- the file 'linux/vmlinux'. To extract the namelist and match it against
- the EIP from the kernel crash, do::
-
- nm vmlinux | sort | less
-
- This will give you a list of kernel addresses sorted in ascending
- order, from which it is simple to find the function that contains the
- offending address. Note that the address given by the kernel
- debugging messages will not necessarily match exactly with the
- function addresses (in fact, that is very unlikely), so you can't
- just 'grep' the list: the list will, however, give you the starting
- point of each kernel function, so by looking for the function that
- has a starting address lower than the one you are searching for but
- is followed by a function with a higher address you will find the one
- you want. In fact, it may be a good idea to include a bit of
- "context" in your problem report, giving a few lines around the
- interesting one.
-
- If you for some reason cannot do the above (you have a pre-compiled
- kernel image or similar), telling me as much about your setup as
- possible will help. Please read
- 'Documentation/admin-guide/reporting-issues.rst' for details.
-
- - Alternatively, you can use gdb on a running kernel. (read-only; i.e. you
- cannot change values or set break points.) To do this, first compile the
- kernel with -g; edit arch/x86/Makefile appropriately, then do a ``make
- clean``. You'll also need to enable CONFIG_PROC_FS (via ``make config``).
-
- After you've rebooted with the new kernel, do ``gdb vmlinux /proc/kcore``.
- You can now use all the usual gdb commands. The command to look up the
- point where your system crashed is ``l *0xXXXXXXXX``. (Replace the XXXes
- with the EIP value.)
-
- gdb'ing a non-running kernel currently fails because ``gdb`` (wrongly)
- disregards the starting offset for which the kernel is compiled.
+If you have problems that seem to be due to kernel bugs, please follow the
+instructions at 'Documentation/admin-guide/reporting-issues.rst'.
+
+Hints on understanding kernel bug reports are in
+'Documentation/admin-guide/bug-hunting.rst'. More on debugging the kernel
+with gdb is in 'Documentation/dev-tools/gdb-kernel-debugging.rst' and
+'Documentation/dev-tools/kgdb.rst'.
diff --git a/Documentation/admin-guide/acpi/cppc_sysfs.rst b/Documentation/admin-guide/acpi/cppc_sysfs.rst
index fccf22114e85..e53d76365aa7 100644
--- a/Documentation/admin-guide/acpi/cppc_sysfs.rst
+++ b/Documentation/admin-guide/acpi/cppc_sysfs.rst
@@ -4,6 +4,8 @@
Collaborative Processor Performance Control (CPPC)
==================================================
+.. _cppc_sysfs:
+
CPPC
====
diff --git a/Documentation/admin-guide/acpi/dsdt-override.rst b/Documentation/admin-guide/acpi/dsdt-override.rst
deleted file mode 100644
index 50bd7f194bf4..000000000000
--- a/Documentation/admin-guide/acpi/dsdt-override.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Overriding DSDT
-===============
-
-Linux supports a method of overriding the BIOS DSDT:
-
-CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
-
-When to use this method is described in detail on the
-Linux/ACPI home page:
-https://01.org/linux-acpi/documentation/overriding-dsdt
diff --git a/Documentation/admin-guide/acpi/fan_performance_states.rst b/Documentation/admin-guide/acpi/fan_performance_states.rst
index 98fe5c333121..b9e4b4d146c1 100644
--- a/Documentation/admin-guide/acpi/fan_performance_states.rst
+++ b/Documentation/admin-guide/acpi/fan_performance_states.rst
@@ -60,3 +60,31 @@ For example::
When a given field is not populated or its value provided by the platform
firmware is invalid, the "not-defined" string is shown instead of the value.
+
+ACPI Fan Fine Grain Control
+=============================
+
+When the _FIF object specifies support for fine grain control, the fan speed
+can be set from 0 to 100% in increments of the recommended minimum "step size"
+via the _FSL object. The user can adjust the fan speed using the thermal sysfs
+cooling device.
+
+Here the user can look at the fan performance states for a reference speed
+(speed_rpm) and set it by changing the cooling device cur_state. If fine grain
+control is supported, the user can also select speeds that are not defined in
+the performance states.
+
+Support for fine grain control is presented via the sysfs attribute
+"fine_grain_control". If fine grain control is present, this attribute
+will show "1", otherwise "0".
+
+This sysfs attribute is presented in the same directory as performance states.
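+
+For example, assuming the fan is exposed as cooling device 3 (the index is
+illustrative)::
+
+    # cat /sys/class/thermal/cooling_device3/max_state
+    # echo 5 > /sys/class/thermal/cooling_device3/cur_state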
+
+ACPI Fan Performance Feedback
+=============================
+
+The optional _FST object provides status information for the fan device.
+This includes a field reporting the current speed, in revolutions per minute,
+at which the fan is rotating.
+
+This speed is presented in the sysfs using the attribute "fan_speed_rpm",
+in the same directory as performance states.
diff --git a/Documentation/admin-guide/acpi/index.rst b/Documentation/admin-guide/acpi/index.rst
index 71277689ad97..b078fdb8f4c9 100644
--- a/Documentation/admin-guide/acpi/index.rst
+++ b/Documentation/admin-guide/acpi/index.rst
@@ -9,7 +9,6 @@ the Linux ACPI support.
:maxdepth: 1
initrd_table_override
- dsdt-override
ssdt-overlays
cppc_sysfs
fan_performance_states
diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst
index 8d3a2d045c0a..bb5032a99234 100644
--- a/Documentation/admin-guide/bcache.rst
+++ b/Documentation/admin-guide/bcache.rst
@@ -204,7 +204,7 @@ For example::
This should present your unmodified backing device data in /dev/loop0
If your cache is in writethrough mode, then you can safely discard the
-cache device without loosing data.
+cache device without losing data.
E) Wiping a cache device
diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst
index bd9a4901fe46..9f73253ea353 100644
--- a/Documentation/admin-guide/blockdev/drbd/figures.rst
+++ b/Documentation/admin-guide/blockdev/drbd/figures.rst
@@ -25,6 +25,6 @@ Sub graphs of DRBD's state transitions
:alt: disk-states-8.dot
:align: center
-.. kernel-figure:: node-states-8.dot
- :alt: node-states-8.dot
+.. kernel-figure:: peer-states-8.dot
+ :alt: peer-states-8.dot
:align: center
diff --git a/Documentation/admin-guide/blockdev/drbd/node-states-8.dot b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot
deleted file mode 100644
index bfa54e1f8016..000000000000
--- a/Documentation/admin-guide/blockdev/drbd/node-states-8.dot
+++ /dev/null
@@ -1,13 +0,0 @@
-digraph node_states {
- Secondary -> Primary [ label = "ioctl_set_state()" ]
- Primary -> Secondary [ label = "ioctl_set_state()" ]
-}
-
-digraph peer_states {
- Secondary -> Primary [ label = "recv state packet" ]
- Primary -> Secondary [ label = "recv state packet" ]
- Primary -> Unknown [ label = "connection lost" ]
- Secondary -> Unknown [ label = "connection lost" ]
- Unknown -> Primary [ label = "connected" ]
- Unknown -> Secondary [ label = "connected" ]
-}
diff --git a/Documentation/admin-guide/blockdev/drbd/peer-states-8.dot b/Documentation/admin-guide/blockdev/drbd/peer-states-8.dot
new file mode 100644
index 000000000000..6dc3954954d6
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/peer-states-8.dot
@@ -0,0 +1,8 @@
+digraph peer_states {
+ Secondary -> Primary [ label = "recv state packet" ]
+ Primary -> Secondary [ label = "recv state packet" ]
+ Primary -> Unknown [ label = "connection lost" ]
+ Secondary -> Unknown [ label = "connection lost" ]
+ Unknown -> Primary [ label = "connected" ]
+ Unknown -> Secondary [ label = "connected" ]
+}
diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst
index b903cf152091..957ccf617797 100644
--- a/Documentation/admin-guide/blockdev/index.rst
+++ b/Documentation/admin-guide/blockdev/index.rst
@@ -1,8 +1,8 @@
.. SPDX-License-Identifier: GPL-2.0
-===========================
-The Linux RapidIO Subsystem
-===========================
+=============
+Block Devices
+=============
.. toctree::
:maxdepth: 1
diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst
index d78dfe559dcf..faf2ac4b1509 100644
--- a/Documentation/admin-guide/blockdev/nbd.rst
+++ b/Documentation/admin-guide/blockdev/nbd.rst
@@ -14,7 +14,7 @@ to borrow disk space from another computer.
Unlike NFS, it is possible to put any filesystem on it, etc.
For more information, or to download the nbd-client and nbd-server
-tools, go to http://nbd.sf.net/.
+tools, go to https://github.com/NetworkBlockDevice/nbd.
The nbd kernel module need only be installed on the client
system, as the nbd-server is completely in userspace. In fact,
diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst
index e1ce90af602a..e85ad37cc0e5 100644
--- a/Documentation/admin-guide/blockdev/paride.rst
+++ b/Documentation/admin-guide/blockdev/paride.rst
@@ -3,6 +3,7 @@ Linux and parallel port IDE devices
===================================
PARIDE v1.03 (c) 1997-8 Grant Guenther <grant@torque.net>
+PATA_PARPORT (c) 2023 Ondrej Zary
1. Introduction
===============
@@ -51,27 +52,15 @@ parallel port IDE subsystem, including:
as well as most of the clone and no-name products on the market.
-To support such a wide range of devices, PARIDE, the parallel port IDE
-subsystem, is actually structured in three parts. There is a base
-paride module which provides a registry and some common methods for
-accessing the parallel ports. The second component is a set of
-high-level drivers for each of the different types of supported devices:
+To support such a wide range of devices, pata_parport is actually structured
+in two parts. There is a base pata_parport module which provides an interface
+to the kernel libata subsystem, a registry, and some common methods for
+accessing the parallel ports.
- === =============
- pd IDE disk
- pcd ATAPI CD-ROM
- pf ATAPI disk
- pt ATAPI tape
- pg ATAPI generic
- === =============
-
-(Currently, the pg driver is only used with CD-R drives).
-
-The high-level drivers function according to the relevant standards.
-The third component of PARIDE is a set of low-level protocol drivers
-for each of the parallel port IDE adapter chips. Thanks to the interest
-and encouragement of Linux users from many parts of the world,
-support is available for almost all known adapter protocols:
+The second component is a set of low-level protocol drivers for each of the
+parallel port IDE adapter chips. Thanks to the interest and encouragement of
+Linux users from many parts of the world, support is available for almost all
+known adapter protocols:
==== ====================================== ====
aten ATEN EH-100 (HK)
@@ -91,251 +80,87 @@ support is available for almost all known adapter protocols:
==== ====================================== ====
-2. Using the PARIDE subsystem
-=============================
+2. Using pata_parport subsystem
+===============================
While configuring the Linux kernel, you may choose either to build
-the PARIDE drivers into your kernel, or to build them as modules.
+the pata_parport drivers into your kernel, or to build them as modules.
In either case, you will need to select "Parallel port IDE device support"
-as well as at least one of the high-level drivers and at least one
-of the parallel port communication protocols. If you do not know
-what kind of parallel port adapter is used in your drive, you could
-begin by checking the file names and any text files on your DOS
+and at least one of the parallel port communication protocols.
+If you do not know what kind of parallel port adapter is used in your drive,
+you could begin by checking the file names and any text files on your DOS
installation floppy. Alternatively, you can look at the markings on
the adapter chip itself. That's usually sufficient to identify the
correct device.
-You can actually select all the protocol modules, and allow the PARIDE
+You can actually select all the protocol modules, and allow the pata_parport
subsystem to try them all for you.
For the "brand-name" products listed above, here are the protocol
and high-level drivers that you would use:
- ================ ============ ====== ========
- Manufacturer Model Driver Protocol
- ================ ============ ====== ========
- MicroSolutions CD-ROM pcd bpck
- MicroSolutions PD drive pf bpck
- MicroSolutions hard-drive pd bpck
- MicroSolutions 8000t tape pt bpck
- SyQuest EZ, SparQ pd epat
- Imation Superdisk pf epat
- Maxell Superdisk pf friq
- Avatar Shark pd epat
- FreeCom CD-ROM pcd frpw
- Hewlett-Packard 5GB Tape pt epat
- Hewlett-Packard 7200e (CD) pcd epat
- Hewlett-Packard 7200e (CD-R) pg epat
- ================ ============ ====== ========
-
-2.1 Configuring built-in drivers
----------------------------------
-
-We recommend that you get to know how the drivers work and how to
-configure them as loadable modules, before attempting to compile a
-kernel with the drivers built-in.
-
-If you built all of your PARIDE support directly into your kernel,
-and you have just a single parallel port IDE device, your kernel should
-locate it automatically for you. If you have more than one device,
-you may need to give some command line options to your bootloader
-(eg: LILO), how to do that is beyond the scope of this document.
-
-The high-level drivers accept a number of command line parameters, all
-of which are documented in the source files in linux/drivers/block/paride.
-By default, each driver will automatically try all parallel ports it
-can find, and all protocol types that have been installed, until it finds
-a parallel port IDE adapter. Once it finds one, the probe stops. So,
-if you have more than one device, you will need to tell the drivers
-how to identify them. This requires specifying the port address, the
-protocol identification number and, for some devices, the drive's
-chain ID. While your system is booting, a number of messages are
-displayed on the console. Like all such messages, they can be
-reviewed with the 'dmesg' command. Among those messages will be
-some lines like::
-
- paride: bpck registered as protocol 0
- paride: epat registered as protocol 1
-
-The numbers will always be the same until you build a new kernel with
-different protocol selections. You should note these numbers as you
-will need them to identify the devices.
+ ================ ============ ========
+ Manufacturer Model Protocol
+ ================ ============ ========
+ MicroSolutions CD-ROM bpck
+ MicroSolutions PD drive bpck
+ MicroSolutions hard-drive bpck
+ MicroSolutions 8000t tape bpck
+ SyQuest EZ, SparQ epat
+ Imation Superdisk epat
+ Maxell Superdisk friq
+ Avatar Shark epat
+ FreeCom CD-ROM frpw
+ Hewlett-Packard 5GB Tape epat
+ Hewlett-Packard 7200e (CD) epat
+ Hewlett-Packard 7200e (CD-R) epat
+ ================ ============ ========
+
+All parports and all protocol drivers are probed automatically unless the
+probe=0 parameter is used. So just "modprobe epat" is enough for an Imation
+SuperDisk drive to work.
+
+Manual device creation::
+
+ # echo "port protocol mode unit delay" >/sys/bus/pata_parport/new_device
+
+where:
+
+ ======== ================================================
+ port parport name (or "auto" for all parports)
+ protocol protocol name (or "auto" for all protocols)
+ mode mode number (protocol-specific) or -1 for probe
+ unit unit number (for backpack only, see below)
+ delay I/O delay (see troubleshooting section below)
+ ======== ================================================
If you happen to be using a MicroSolutions backpack device, you will
also need to know the unit ID number for each drive. This is usually
the last two digits of the drive's serial number (but read MicroSolutions'
documentation about this).
-As an example, let's assume that you have a MicroSolutions PD/CD drive
-with unit ID number 36 connected to the parallel port at 0x378, a SyQuest
-EZ-135 connected to the chained port on the PD/CD drive and also an
-Imation Superdisk connected to port 0x278. You could give the following
-options on your boot command::
-
- pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
-
-In the last option, pf.drive1 configures device /dev/pf1, the 0x378
-is the parallel port base address, the 0 is the protocol registration
-number and 36 is the chain ID.
-
-Please note: while PARIDE will work both with and without the
-PARPORT parallel port sharing system that is included by the
-"Parallel port support" option, PARPORT must be included and enabled
-if you want to use chains of devices on the same parallel port.
-
-2.2 Loading and configuring PARIDE as modules
-----------------------------------------------
-
-It is much faster and simpler to get to understand the PARIDE drivers
-if you use them as loadable kernel modules.
-
-Note 1:
- using these drivers with the "kerneld" automatic module loading
- system is not recommended for beginners, and is not documented here.
-
-Note 2:
- if you build PARPORT support as a loadable module, PARIDE must
- also be built as loadable modules, and PARPORT must be loaded before
- the PARIDE modules.
-
-To use PARIDE, you must begin by::
-
- insmod paride
-
-this loads a base module which provides a registry for the protocols,
-among other tasks.
-
-Then, load as many of the protocol modules as you think you might need.
-As you load each module, it will register the protocols that it supports,
-and print a log message to your kernel log file and your console. For
-example::
-
- # insmod epat
- paride: epat registered as protocol 0
- # insmod kbic
- paride: k951 registered as protocol 1
- paride: k971 registered as protocol 2
-
-Finally, you can load high-level drivers for each kind of device that
-you have connected. By default, each driver will autoprobe for a single
-device, but you can support up to four similar devices by giving their
-individual coordinates when you load the driver.
-
-For example, if you had two no-name CD-ROM drives both using the
-KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
-you could give the following command::
-
- # insmod pcd drive0=0x378,1 drive1=0x3bc,1
-
-For most adapters, giving a port address and protocol number is sufficient,
-but check the source files in linux/drivers/block/paride for more
-information. (Hopefully someone will write some man pages one day !).
-
-As another example, here's what happens when PARPORT is installed, and
-a SyQuest EZ-135 is attached to port 0x378::
-
- # insmod paride
- paride: version 1.0 installed
- # insmod epat
- paride: epat registered as protocol 0
- # insmod pd
- pd: pd version 1.0, major 45, cluster 64, nice 0
- pda: Sharing parport1 at 0x378
- pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
- pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
- pda: pda1
-
-Note that the last line is the output from the generic partition table
-scanner - in this case it reports that it has found a disk with one partition.
-
-2.3 Using a PARIDE device
---------------------------
-
-Once the drivers have been loaded, you can access PARIDE devices in the
-same way as their traditional counterparts. You will probably need to
-create the device "special files". Here is a simple script that you can
-cut to a file and execute::
-
- #!/bin/bash
- #
- # mkd -- a script to create the device special files for the PARIDE subsystem
- #
- function mkdev {
- mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
- }
- #
- function pd {
- D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
- mkdev pd$D b 45 $[ $1 * 16 ]
- for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
- do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
- done
- }
- #
- cd /dev
- #
- for u in 0 1 2 3 ; do pd $u ; done
- for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
- for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done
- for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done
- for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
- for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done
- #
- # end of mkd
-
-With the device files and drivers in place, you can access PARIDE devices
-like any other Linux device. For example, to mount a CD-ROM in pcd0, use::
-
- mount /dev/pcd0 /cdrom
-
-If you have a fresh Avatar Shark cartridge, and the drive is pda, you
-might do something like::
-
- fdisk /dev/pda -- make a new partition table with
- partition 1 of type 83
-
- mke2fs /dev/pda1 -- to build the file system
-
- mkdir /shark -- make a place to mount the disk
-
- mount /dev/pda1 /shark
-
-Devices like the Imation superdisk work in the same way, except that
-they do not have a partition table. For example to make a 120MB
-floppy that you could share with a DOS system::
-
- mkdosfs /dev/pf0
- mount /dev/pf0 /mnt
-
-
-2.4 The pf driver
-------------------
-
-The pf driver is intended for use with parallel port ATAPI disk
-devices. The most common devices in this category are PD drives
-and LS-120 drives. Traditionally, media for these devices are not
-partitioned. Consequently, the pf driver does not support partitioned
-media. This may be changed in a future version of the driver.
-
-2.5 Using the pt driver
-------------------------
-
-The pt driver for parallel port ATAPI tape drives is a minimal driver.
-It does not yet support many of the standard tape ioctl operations.
-For best performance, a block size of 32KB should be used. You will
-probably want to set the parallel port delay to 0, if you can.
-
-2.6 Using the pg driver
-------------------------
-
-The pg driver can be used in conjunction with the cdrecord program
-to create CD-ROMs. Please get cdrecord version 1.6.1 or later
-from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media
-your parallel port should ideally be set to EPP mode, and the "port delay"
-should be set to 0. With those settings it is possible to record at 2x
-speed without any buffer underruns. If you cannot get the driver to work
-in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
+If you omit the parameters from the end, defaults will be used, e.g.:
+
+Probe all parports with all protocols::
+
+ # echo auto >/sys/bus/pata_parport/new_device
+
+Probe parport0 using protocol epat and mode 4 (EPP-16)::
+
+ # echo "parport0 epat 4" >/sys/bus/pata_parport/new_device
+
+Probe parport0 using all protocols::
+
+ # echo "parport0 auto" >/sys/bus/pata_parport/new_device
+
+Probe all parports using protocol epat::
+
+ # echo "auto epat" >/sys/bus/pata_parport/new_device
+
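+Probe parport0 using the bpck protocol with a backpack unit ID of 36 (a
+sketch for a MicroSolutions backpack drive; -1 requests mode probing and
+the unit number is the drive's ID described above)::
+
+	# echo "parport0 bpck -1 36" >/sys/bus/pata_parport/new_device
+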
+Deleting devices::
+
+ # echo pata_parport.0 >/sys/bus/pata_parport/delete_device
3. Troubleshooting
@@ -344,9 +169,9 @@ in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
3.1 Use EPP mode if you can
----------------------------
-The most common problems that people report with the PARIDE drivers
+The most common problems that people report with the pata_parport drivers
concern the parallel port CMOS settings. At this time, none of the
-PARIDE protocol modules support ECP mode, or any ECP combination modes.
+protocol modules support ECP mode, or any ECP combination modes.
If you are able to do so, please set your parallel port into EPP mode
using your CMOS setup procedure.
@@ -354,17 +179,14 @@ using your CMOS setup procedure.
-------------------------
Some parallel ports cannot reliably transfer data at full speed. To
-offset the errors, the PARIDE protocol modules introduce a "port
+offset the errors, the protocol modules introduce a "port
delay" between each access to the i/o ports. Each protocol sets
a default value for this delay. In most cases, the user can override
the default and set it to 0 - resulting in somewhat higher transfer
rates. In some rare cases (especially with older 486 systems) the
default delays are not long enough. If you experience corrupt data
transfers, or unexpected failures, you may wish to increase the
-port delay. The delay can be programmed using the "driveN" parameters
-to each of the high-level drivers. Please see the notes above, or
-read the comments at the beginning of the driver source files in
-linux/drivers/block/paride.
+port delay.
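+
+With pata_parport, the delay is the fifth field written to new_device, so
+one way to raise it is at device creation time. A hypothetical sketch,
+assuming mode -1 (probe) and a don't-care unit value of -1 for a
+non-backpack adapter::
+
+	# echo "parport0 epat -1 -1 5" >/sys/bus/pata_parport/new_device
+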
3.3 Some drives need a printer reset
-------------------------------------
@@ -374,66 +196,12 @@ that do not always power up correctly. We have noticed this with some
drives based on OnSpec and older Freecom adapters. In these rare cases,
the adapter can often be reinitialised by issuing a "printer reset" on
the parallel port. As the reset operation is potentially disruptive in
-multiple device environments, the PARIDE drivers will not do it
+multiple device environments, the pata_parport drivers will not do it
automatically. You can, however, force a printer reset by doing::
insmod lp reset=1
rmmod lp
If you have one of these marginal cases, you should probably build
-your paride drivers as modules, and arrange to do the printer reset
-before loading the PARIDE drivers.
-
-3.4 Use the verbose option and dmesg if you need help
-------------------------------------------------------
-
-While a lot of testing has gone into these drivers to make them work
-as smoothly as possible, problems will arise. If you do have problems,
-please check all the obvious things first: does the drive work in
-DOS with the manufacturer's drivers ? If that doesn't yield any useful
-clues, then please make sure that only one drive is hooked to your system,
-and that either (a) PARPORT is enabled or (b) no other device driver
-is using your parallel port (check in /proc/ioports). Then, load the
-appropriate drivers (you can load several protocol modules if you want)
-as in::
-
- # insmod paride
- # insmod epat
- # insmod bpck
- # insmod kbic
- ...
- # insmod pd verbose=1
-
-(using the correct driver for the type of device you have, of course).
-The verbose=1 parameter will cause the drivers to log a trace of their
-activity as they attempt to locate your drive.
-
-Use 'dmesg' to capture a log of all the PARIDE messages (any messages
-beginning with paride:, a protocol module's name or a driver's name) and
-include that with your bug report. You can submit a bug report in one
-of two ways. Either send it directly to the author of the PARIDE suite,
-by e-mail to grant@torque.net, or join the linux-parport mailing list
-and post your report there.
-
-3.5 For more information or help
----------------------------------
-
-You can join the linux-parport mailing list by sending a mail message
-to:
-
- linux-parport-request@torque.net
-
-with the single word::
-
- subscribe
-
-in the body of the mail message (not in the subject line). Please be
-sure that your mail program is correctly set up when you do this, as
-the list manager is a robot that will subscribe you using the reply
-address in your mail headers. REMOVE any anti-spam gimmicks you may
-have in your mail headers, when sending mail to the list server.
-
-You might also find some useful information on the linux-parport
-web pages (although they are not always up to date) at
-
- http://web.archive.org/web/%2E/http://www.torque.net/parport/
+your pata_parport drivers as modules, and arrange to do the printer reset
+before loading the pata_parport drivers.
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index 700329d25f57..e4551579cb12 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -315,8 +315,8 @@ To use the feature, admin should set up backing device via::
echo /dev/sda5 > /sys/block/zramX/backing_dev
-before disksize setting. It supports only partition at this moment.
-If admin wants to use incompressible page writeback, they could do via::
+before setting the disksize. It supports only partitions at this moment.
+If an admin wants to use incompressible page writeback, they could do it via::
echo huge > /sys/block/zramX/writeback
@@ -328,15 +328,33 @@ as idle::
From now on, any pages on zram are idle pages. The idle mark
will be removed when someone requests access to the block.
IOW, unless there is an access request, those pages remain idle pages.
+Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled, pages can be
+marked as idle based on how long (in seconds) it's been since they were
+last accessed::
+
+ echo 86400 > /sys/block/zramX/idle
+
+In this example, all pages which haven't been accessed in more than 86400
+seconds (one day) will be marked idle.
Admin can request writeback of those idle pages at the right time via::
echo idle > /sys/block/zramX/writeback
-With the command, zram writeback idle pages from memory to the storage.
+With the command, zram will write back idle pages from memory to the storage.
+
+Additionally, if a user chooses to write back only huge and idle pages,
+this can be accomplished with::
+
+ echo huge_idle > /sys/block/zramX/writeback
-If admin want to write a specific page in zram device to backing device,
-they could write a page index into the interface.
+If a user chooses to write back only incompressible pages (pages that none
+of the algorithms can compress), this can be accomplished with::
+
+ echo incompressible > /sys/block/zramX/writeback
+
+If an admin wants to write a specific page in the zram device to the backing
+device, they could write a page index into the interface::
echo "page_index=1251" > /sys/block/zramX/writeback
@@ -346,7 +364,7 @@ to guarantee storage health for entire product life.
To overcome the concern, zram supports "writeback_limit" feature.
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
-any writeback. IOW, if admin wants to apply writeback budget, he should
+any writeback. IOW, if an admin wants to apply a writeback budget, they should
enable writeback_limit_enable via::
$ echo 1 > /sys/block/zramX/writeback_limit_enable
@@ -357,7 +375,7 @@ until admin sets the budget via /sys/block/zramX/writeback_limit.
(If admin doesn't enable writeback_limit_enable, writeback_limit's value
assigned via /sys/block/zramX/writeback_limit is meaningless.)
-If admin want to limit writeback as per-day 400M, he could do it
+If an admin wants to limit writeback to 400M per day, they could do it
like below::
$ MB_SHIFT=20
@@ -367,16 +385,16 @@ like below::
$ echo 1 > /sys/block/zram0/writeback_limit_enable
If admins want to allow further writeback again once the budget is exhausted,
-he could do it like below::
+they could do it like below::
$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
/sys/block/zram0/writeback_limit
-If admin wants to see remaining writeback budget since last set::
+If an admin wants to see the remaining writeback budget since it was last set::
$ cat /sys/block/zramX/writeback_limit
-If admin want to disable writeback limit, he could do::
+If an admin wants to disable the writeback limit, they could do::
$ echo 0 > /sys/block/zramX/writeback_limit_enable
@@ -385,9 +403,90 @@ system reboot, echo 1 > /sys/block/zramX/reset) so keeping track of how much
writeback has happened until you reset the zram, in order to allocate extra
writeback budget in the next setting, is the user's job.
-If admin wants to measure writeback count in a certain period, he could
+If an admin wants to measure the writeback count in a certain period, they could
know it via /sys/block/zram0/bd_stat's 3rd column.
+recompression
+-------------
+
+With CONFIG_ZRAM_MULTI_COMP, zram can recompress pages using alternative
+(secondary) compression algorithms. The basic idea is that an alternative
+compression algorithm can provide a better compression ratio at the price of
+(potentially) slower compression/decompression speeds. An alternative
+compression algorithm can, for example, be more successful at compressing huge
+pages (those that the default algorithm failed to compress). Another
+application is idle page recompression - pages that are cold and sit in memory
+can be recompressed using a more effective algorithm and, hence, reduce
+zsmalloc memory usage.
+
+With CONFIG_ZRAM_MULTI_COMP, zram supports up to 4 compression algorithms:
+one primary and up to 3 secondary ones. The primary zram compressor is
+explained in "3) Select compression algorithm"; secondary algorithms are
+configured using the recomp_algorithm device attribute.
+
+Example:::
+
+ #show supported recompression algorithms
+ cat /sys/block/zramX/recomp_algorithm
+ #1: lzo lzo-rle lz4 lz4hc [zstd]
+ #2: lzo lzo-rle lz4 [lz4hc] zstd
+
+Alternative compression algorithms are sorted by priority. In the example
+above, zstd is used as the first alternative algorithm, which has a priority
+of 1, while lz4hc is configured as a compression algorithm with priority 2.
+An alternative compression algorithm's priority is provided during algorithm
+configuration:::
+
+ #select zstd recompression algorithm, priority 1
+ echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm
+
+ #select deflate recompression algorithm, priority 2
+ echo "algo=deflate priority=2" > /sys/block/zramX/recomp_algorithm
+
+Another device attribute that CONFIG_ZRAM_MULTI_COMP enables is recompress,
+which controls recompression.
+
+Examples:::
+
+ #IDLE pages recompression is activated by `idle` mode
+ echo "type=idle" > /sys/block/zramX/recompress
+
+ #HUGE pages recompression is activated by `huge` mode
+	echo "type=huge" > /sys/block/zramX/recompress
+
+ #HUGE_IDLE pages recompression is activated by `huge_idle` mode
+ echo "type=huge_idle" > /sys/block/zramX/recompress
+
+The number of idle pages can be significant, so user-space can pass a size
+threshold (in bytes) to the recompress knob: zram will recompress only pages
+of equal or greater size:::
+
+ #recompress all pages larger than 3000 bytes
+ echo "threshold=3000" > /sys/block/zramX/recompress
+
+ #recompress idle pages larger than 2000 bytes
+ echo "type=idle threshold=2000" > /sys/block/zramX/recompress
+
+Recompression of idle pages requires memory tracking.
+
+During re-compression, for every page that matches the re-compression
+criteria, ZRAM iterates the list of registered alternative compression
+algorithms in order of their priorities. ZRAM stops either when re-compression
+is successful (the re-compressed object is smaller than the original one and
+matches the re-compression criteria, e.g. the size threshold) or when there
+are no secondary algorithms left to try. If none of the secondary algorithms
+can successfully re-compress the page, the page is marked as incompressible,
+so ZRAM will not attempt to re-compress it in the future.
+
+This re-compression behaviour, iterating through the list of registered
+compression algorithms, increases the chances of finding an algorithm that
+successfully compresses a particular page. Sometimes, however, it is
+convenient (and sometimes even necessary) to limit recompression to only one
+particular algorithm so that it will not try any other algorithms.
+This can be achieved by providing an algo=NAME parameter:::
+
+ #use zstd algorithm only (if registered)
+ echo "type=huge algo=zstd" > /sys/block/zramX/recompress
+
memory tracking
===============
@@ -398,9 +497,11 @@ pages of the process with*pagemap.
If you enable the feature, you could see block state via
/sys/kernel/debug/zram/zram0/block_state. The output is as follows::
- 300 75.033841 .wh.
- 301 63.806904 s...
- 302 63.806919 ..hi
+ 300 75.033841 .wh...
+ 301 63.806904 s.....
+ 302 63.806919 ..hi..
+ 303 62.801919 ....r.
+ 304 146.781902 ..hi.n
First column
zram's block index.
@@ -417,6 +518,10 @@ Third column
huge page
i:
idle page
+ r:
+ recompressed page (secondary compression algorithm)
+ n:
+    none of the algorithms (including secondary ones) could compress it
First line of above example says 300th block is accessed at 75.033841sec
and the block's state is huge so it is written back to the backing
diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index a1860fc0ca88..91339efdcb54 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -158,9 +158,15 @@ Each key-value pair is shown in each line with following style::
Boot Kernel With a Boot Config
==============================
-Since the boot configuration file is loaded with initrd, it will be added
-to the end of the initrd (initramfs) image file with padding, size,
-checksum and 12-byte magic word as below.
+There are two options to boot the kernel with bootconfig: attaching the
+bootconfig to the initrd image or embedding it in the kernel itself.
+
+Attaching a Boot Config to Initrd
+---------------------------------
+
+Since the boot configuration file is loaded with the initrd by default,
+it will be added to the end of the initrd (initramfs) image file with
+padding, size, checksum and a 12-byte magic word, as below.
[initrd][bootconfig][padding][size(le32)][checksum(le32)][#BOOTCONFIG\n]
@@ -195,7 +201,30 @@ To remove the config from the image, you can use -d option as below::
Then add "bootconfig" on the normal kernel command line to tell the
kernel to look for the bootconfig at the end of the initrd file.
+Alternatively, build your kernel with the ``CONFIG_BOOT_CONFIG_FORCE``
+Kconfig option selected.
+
+Embedding a Boot Config into Kernel
+-----------------------------------
+
+If you cannot use initrd, you can also embed the bootconfig file in the
+kernel via Kconfig options. In this case, you need to recompile the kernel
+with the following configs::
+
+ CONFIG_BOOT_CONFIG_EMBED=y
+ CONFIG_BOOT_CONFIG_EMBED_FILE="/PATH/TO/BOOTCONFIG/FILE"
+
+``CONFIG_BOOT_CONFIG_EMBED_FILE`` requires an absolute path, or a path to
+the bootconfig file relative to the source tree or object tree.
+The kernel will embed it as the default bootconfig.
+
+Just as when attaching the bootconfig to the initrd, you need the
+``bootconfig`` option on the kernel command line to enable the embedded
+bootconfig, or, alternatively, build your kernel with the
+``CONFIG_BOOT_CONFIG_FORCE`` Kconfig option selected.
+Note that even if you set this option, you can override the embedded
+bootconfig with another bootconfig attached to the initrd.
Kernel parameters via Boot Config
=================================
@@ -204,7 +233,7 @@ In addition to the kernel command line, the boot config can be used for
passing the kernel parameters. All the key-value pairs under ``kernel``
key will be passed to kernel cmdline directly. Moreover, the key-value
pairs under ``init`` will be passed to init process via the cmdline.
-The parameters are concatinated with user-given kernel cmdline string
+The parameters are concatenated with user-given kernel cmdline string
as the following order, so that the command line parameter can override
bootconfig parameters (this depends on how the subsystem handles parameters
but in general, earlier parameter will be overwritten by later one.)::
diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
index 16253eda192e..dabb80cdd25a 100644
--- a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
+++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
@@ -106,7 +106,7 @@ Proportional weight policy files
see Documentation/block/bfq-iosched.rst.
blkio.bfq.weight_device
- Specifes per cgroup per device weights, overriding the default group
+ Specifies per cgroup per device weights, overriding the default group
weight. For more details, see Documentation/block/bfq-iosched.rst.
Following is the format::
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
index b0688011ed06..9343148ee993 100644
--- a/Documentation/admin-guide/cgroup-v1/cgroups.rst
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -80,6 +80,8 @@ access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rs
you to associate a set of CPUs and a set of memory nodes with the
tasks in each cgroup.
+.. _cgroups-why-needed:
+
1.2 Why are cgroups needed ?
----------------------------
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
index 5d844ed4df69..ae646d621a8a 100644
--- a/Documentation/admin-guide/cgroup-v1/cpusets.rst
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -719,7 +719,7 @@ There are ways to query or modify cpusets:
cat, rmdir commands from the shell, or their equivalent from C.
- via the C library libcpuset.
- via the C library libcgroup.
- (http://sourceforge.net/projects/libcg/)
+ (https://github.com/libcgroup/libcgroup/)
- via the python application cset.
(http://code.google.com/p/cpuset/)
diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
index 338f2c7d7a1c..0fa724d82abb 100644
--- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst
+++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
@@ -29,12 +29,14 @@ Brief summary of control files::
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit
+ hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
files include::
hugetlb.1GB.limit_in_bytes
hugetlb.1GB.max_usage_in_bytes
+ hugetlb.1GB.numa_stat
hugetlb.1GB.usage_in_bytes
hugetlb.1GB.failcnt
hugetlb.1GB.rsvd.limit_in_bytes
@@ -43,6 +45,7 @@ files include::
hugetlb.1GB.rsvd.failcnt
hugetlb.64KB.limit_in_bytes
hugetlb.64KB.max_usage_in_bytes
+ hugetlb.64KB.numa_stat
hugetlb.64KB.usage_in_bytes
hugetlb.64KB.failcnt
hugetlb.64KB.rsvd.limit_in_bytes
@@ -51,6 +54,7 @@ files include::
hugetlb.64KB.rsvd.failcnt
hugetlb.32MB.limit_in_bytes
hugetlb.32MB.max_usage_in_bytes
+ hugetlb.32MB.numa_stat
hugetlb.32MB.usage_in_bytes
hugetlb.32MB.failcnt
hugetlb.32MB.rsvd.limit_in_bytes
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
index 45b94f7b3beb..a402359abb99 100644
--- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -97,7 +97,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
=============
Page Cache is charged at
- - add_to_page_cache_locked().
+ - filemap_add_folio().
The logic is very clear. (About migration, see below)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 41191b5fb69d..47d1d7d932a8 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -2,18 +2,18 @@
Memory Resource Controller
==========================
-NOTE:
+.. caution::
This document is hopelessly outdated and it asks for a complete
rewrite. It still contains useful information so we are keeping it
here but make sure to check the current code if you need a deeper
understanding.
-NOTE:
+.. note::
The Memory Resource Controller has generically been referred to as the
memory controller in this document. Do not confuse memory controller
used here with the memory controller that is used in hardware.
-(For editors) In this document:
+.. hint::
When we mention a cgroup (cgroupfs's directory) with memory controller,
we call it "memory cgroup". When you see git-log and source code, you'll
see patch's title and function names tend to use "memcg".
@@ -23,7 +23,7 @@ Benefits and Purpose of the memory controller
=============================================
The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
+from the rest of the system. The article on LWN [12]_ mentions some probable
uses of the memory controller. The memory controller can be used to
a. Isolate an application or a group of applications
@@ -55,7 +55,8 @@ Features:
- Root cgroup has no limit controls.
Kernel memory support is a work in progress, and the current version provides
- basically functionality. (See Section 2.7)
+  basic functionality. (See :ref:`section 2.7
+ <cgroup-v1-memory-kernel-extension>`)
Brief summary of control files.
@@ -64,6 +65,7 @@ Brief summary of control files.
threads
cgroup.procs show list of processes
cgroup.event_control an interface for event_fd()
+ This knob is not available on CONFIG_PREEMPT_RT systems.
memory.usage_in_bytes show current usage for memory
(See 5.5 for details)
memory.memsw.usage_in_bytes show current usage for memory+Swap
@@ -75,6 +77,7 @@ Brief summary of control files.
memory.max_usage_in_bytes show max memory usage recorded
memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
memory.soft_limit_in_bytes set/show soft limit of memory usage
+ This knob is not available on CONFIG_PREEMPT_RT systems.
memory.stat show various statistics
memory.use_hierarchy set/show hierarchical account enabled
This knob is deprecated and shouldn't be
@@ -84,13 +87,13 @@ Brief summary of control files.
memory.swappiness set/show swappiness parameter of vmscan
(See sysctl's vm.swappiness)
memory.move_charge_at_immigrate set/show controls of moving charges
+ This knob is deprecated and shouldn't be
+ used.
memory.oom_control set/show oom controls.
memory.numa_stat show the number of memory usage per numa
node
- memory.kmem.limit_in_bytes set/show hard limit for kernel memory
- This knob is deprecated and shouldn't be
- used. It is planned that this be removed in
- the foreseeable future.
+ memory.kmem.limit_in_bytes This knob is deprecated and writing to
+ it will return -ENOTSUPP.
memory.kmem.usage_in_bytes show current kernel memory allocation
memory.kmem.failcnt show the number of kernel memory usage
hits limits
@@ -107,16 +110,16 @@ Brief summary of control files.
==========
The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
+controller was posted by Balbir Singh [1]_. At the time the RFC was posted
there were several implementations for memory control. The goal of the
RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
+for memory control. The first RSS controller was posted by Balbir Singh [2]_
+in Feb 2007. Pavel Emelianov [3]_ [4]_ [5]_ has since posted three versions
+of the RSS controller. At OLS, at the resource management BoF, everyone
+suggested that we handle both page cache and RSS together. Another request was
+raised to allow user space handling of OOM. The current memory controller is
at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
+Cache Control [11]_.
2. Memory Control
=================
@@ -147,7 +150,8 @@ specific data structure (mem_cgroup) associated with it.
2.2. Accounting
---------------
-::
+.. code-block::
+ :caption: Figure 1: Hierarchy of Accounting
+--------------------+
| mem_cgroup |
@@ -167,7 +171,6 @@ specific data structure (mem_cgroup) associated with it.
| | | |
+---------------+ +---------------+
- (Figure 1: Hierarchy of Accounting)
Figure 1 shows the important aspects of the controller
@@ -221,8 +224,9 @@ behind this approach is that a cgroup that aggressively uses a shared
page will eventually get charged for it (once it is uncharged from
the cgroup that brought it in -- this will happen on memory pressure).
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+But see :ref:`section 8.2 <cgroup-v1-memory-movable-charges>`: when moving a
+task to another cgroup, its pages may be recharged to the new cgroup, if
+move_charge_at_immigrate has been chosen.
2.4 Swap Extension
--------------------------------------
@@ -244,7 +248,8 @@ In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
By using the memsw limit, you can avoid system OOM which can be caused by swap
shortage.
-**why 'memory+swap' rather than swap**
+2.4.1 why 'memory+swap' rather than swap
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
to move account from memory to swap...there is no change in usage of
@@ -252,7 +257,8 @@ memory+swap. In other words, when we want to limit the usage of swap without
affecting global LRU, memory+swap limit is better than just limiting swap from
an OS point of view.
-**What happens when a cgroup hits memory.memsw.limit_in_bytes**
+2.4.2. What happens when a cgroup hits memory.memsw.limit_in_bytes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
in this cgroup. Then, swap-out will not be done by cgroup routine and file
@@ -268,26 +274,26 @@ global VM. When a cgroup goes over its limit, we first try
to reclaim memory from the cgroup so as to make space for the new
pages that the cgroup has touched. If the reclaim is unsuccessful,
an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
+cgroup. (See :ref:`10. OOM Control <cgroup-v1-memory-oom-control>` below.)
The reclaim algorithm has not been modified for cgroups, except that
pages that are selected for reclaiming come from the per-cgroup LRU
list.
-NOTE:
- Reclaim does not work for the root cgroup, since we cannot set any
- limits on the root cgroup.
+.. note::
+ Reclaim does not work for the root cgroup, since we cannot set any
+ limits on the root cgroup.
-Note2:
- When panic_on_oom is set to "2", the whole system will panic.
+.. note::
+ When panic_on_oom is set to "2", the whole system will panic.
When oom event notifier is registered, event will be delivered.
-(See oom_control section)
+(See :ref:`oom_control <cgroup-v1-memory-oom-control>` section)
2.6 Locking
-----------
-Lock order is as follows:
+Lock order is as follows::
Page lock (PG_locked bit of page->flags)
mm->page_table_lock or split pte_lock
@@ -299,7 +305,9 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
isolating a page from its LRU under lruvec->lru_lock.
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+.. _cgroup-v1-memory-kernel-extension:
+
+2.7 Kernel Memory Extension
-----------------------------------------------
With the Kernel memory extension, the Memory Controller is able to limit
@@ -367,10 +375,10 @@ U != 0, K < U:
never greater than the total memory, and freely set U at the cost of his
QoS.
-WARNING:
- In the current implementation, memory reclaim will NOT be
- triggered for a cgroup when it hits K while staying below U, which makes
- this setup impractical.
+ .. warning::
+ In the current implementation, memory reclaim will NOT be triggered for
+ a cgroup when it hits K while staying below U, which makes this setup
+ impractical.
U != 0, K >= U:
Since kmem charges will also be fed to the user counter and reclaim will be
@@ -381,47 +389,41 @@ U != 0, K >= U:
3. User Interface
=================
-3.0. Configuration
-------------------
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+To use the user interface:
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
--------------------------------------------------------------------
-
-::
+1. Enable CONFIG_CGROUPS and CONFIG_MEMCG options
+2. Prepare the cgroups (see :ref:`Why are cgroups needed?
+ <cgroups-why-needed>` for the background information)::
# mount -t tmpfs none /sys/fs/cgroup
# mkdir /sys/fs/cgroup/memory
# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-3.2. Make the new group and move bash into it::
+3. Make the new group and move bash into it::
# mkdir /sys/fs/cgroup/memory/0
# echo $$ > /sys/fs/cgroup/memory/0/tasks
-Since now we're in the 0 cgroup, we can alter the memory limit::
+4. Since now we're in the 0 cgroup, we can alter the memory limit::
# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-NOTE:
- We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
- mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
- Gibibytes.)
+ The limit can now be queried::
+
+ # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+ 4194304
-NOTE:
- We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
+.. note::
+ We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+ mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+ Gibibytes.)
-NOTE:
- We cannot set limits on the root cgroup any more.
+.. note::
+ We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
-::
+.. note::
+ We cannot set limits on the root cgroup any more.
- # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
- 4194304
We can check the usage::
@@ -460,6 +462,8 @@ test because it has noise of shared objects/status.
But the above two are testing extreme situations.
Trying usual test under memory controller is always helpful.
+.. _cgroup-v1-memory-test-troubleshoot:
+
4.1 Troubleshooting
-------------------
@@ -472,8 +476,11 @@ terminated by the OOM killer. There are several causes for this:
A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
some of the pages cached in the cgroup (page cache pages).
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
+To know what happens, disabling OOM_Kill as per :ref:`"10. OOM Control"
+<cgroup-v1-memory-oom-control>` (below) and seeing what happens will be
+helpful.
+
+.. _cgroup-v1-memory-test-task-migration:
4.2 Task migration
------------------
@@ -484,15 +491,16 @@ remain charged to it, the charge is dropped when the page is freed or
reclaimed.
You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
+See :ref:`8. "Move charges at task migration" <cgroup-v1-memory-move-charges>`
4.3 Removing a cgroup
---------------------
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
+A cgroup can be removed by rmdir, but as discussed in :ref:`sections 4.1
+<cgroup-v1-memory-test-troubleshoot>` and :ref:`4.2
+<cgroup-v1-memory-test-task-migration>`, a cgroup might have some charge
+associated with it, even though all tasks have migrated away from it. (because
+we charge against pages, not against tasks.)
We move the stats to the parent, and there is no change in the charge except
uncharging from the child.
@@ -518,74 +526,69 @@ will be charged as a new owner of it.
charged file caches. Some out-of-use page caches may keep charged until
memory pressure happens. If you want to avoid that, force_empty will be useful.
- Also, note that when memory.kmem.limit_in_bytes is set the charges due to
- kernel pages will still be seen. This is not considered a failure and the
- write will still return success. In this case, it is expected that
- memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
5.2 stat file
-------------
-memory.stat file includes following statistics
-
-per-memory cgroup local status
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-=============== ===============================================================
-cache # of bytes of page cache memory.
-rss # of bytes of anonymous and swap cache memory (includes
- transparent hugepages).
-rss_huge # of bytes of anonymous transparent hugepages.
-mapped_file # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin # of charging events to the memory cgroup. The charging
- event happens each time a page is accounted as either mapped
- anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout # of uncharging events to the memory cgroup. The uncharging
- event happens each time a page is unaccounted from the cgroup.
-swap # of bytes of swap usage
-dirty # of bytes that are waiting to get written back to the disk.
-writeback # of bytes of file/anon cache that are queued for syncing to
- disk.
-inactive_anon # of bytes of anonymous and swap cache memory on inactive
- LRU list.
-active_anon # of bytes of anonymous and swap cache memory on active
- LRU list.
-inactive_file # of bytes of file-backed memory on inactive LRU list.
-active_file # of bytes of file-backed memory on active LRU list.
-unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
-=============== ===============================================================
-
-status considering hierarchy (see memory.use_hierarchy settings)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-========================= ===================================================
-hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
- under which the memory cgroup is
-hierarchical_memsw_limit # of bytes of memory+swap limit with regard to
- hierarchy under which memory cgroup is.
-
-total_<counter> # hierarchical version of <counter>, which in
- addition to the cgroup's own value includes the
- sum of all hierarchical children's values of
- <counter>, i.e. total_cache
-========================= ===================================================
-
-The following additional stats are dependent on CONFIG_DEBUG_VM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-========================= ========================================
-recent_rotated_anon VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file VM internal parameter. (see mm/vmscan.c)
-========================= ========================================
-
-Memo:
+memory.stat file includes following statistics:
+
+ * per-memory cgroup local status
+
+ =============== ===============================================================
+ cache # of bytes of page cache memory.
+ rss # of bytes of anonymous and swap cache memory (includes
+ transparent hugepages).
+ rss_huge # of bytes of anonymous transparent hugepages.
+ mapped_file # of bytes of mapped file (includes tmpfs/shmem)
+ pgpgin # of charging events to the memory cgroup. The charging
+ event happens each time a page is accounted as either mapped
+ anon page(RSS) or cache page(Page Cache) to the cgroup.
+ pgpgout # of uncharging events to the memory cgroup. The uncharging
+ event happens each time a page is unaccounted from the
+ cgroup.
+ swap # of bytes of swap usage
+ dirty # of bytes that are waiting to get written back to the disk.
+ writeback # of bytes of file/anon cache that are queued for syncing to
+ disk.
+ inactive_anon # of bytes of anonymous and swap cache memory on inactive
+ LRU list.
+ active_anon # of bytes of anonymous and swap cache memory on active
+ LRU list.
+ inactive_file # of bytes of file-backed memory and MADV_FREE anonymous
+ memory (LazyFree pages) on inactive LRU list.
+ active_file # of bytes of file-backed memory on active LRU list.
+ unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
+ =============== ===============================================================
+
+ * status considering hierarchy (see memory.use_hierarchy settings):
+
+ ========================= ===================================================
+  hierarchical_memory_limit # of bytes of memory limit with regard to
+                            hierarchy under which the memory cgroup is
+ hierarchical_memsw_limit # of bytes of memory+swap limit with regard to
+ hierarchy under which memory cgroup is.
+
+ total_<counter> # hierarchical version of <counter>, which in
+ addition to the cgroup's own value includes the
+ sum of all hierarchical children's values of
+ <counter>, i.e. total_cache
+ ========================= ===================================================
+
+ * additional vm parameters (depends on CONFIG_DEBUG_VM):
+
+ ========================= ========================================
+ recent_rotated_anon VM internal parameter. (see mm/vmscan.c)
+ recent_rotated_file VM internal parameter. (see mm/vmscan.c)
+ recent_scanned_anon VM internal parameter. (see mm/vmscan.c)
+ recent_scanned_file VM internal parameter. (see mm/vmscan.c)
+ ========================= ========================================
+
+.. hint::
recent_rotated means recent frequency of LRU rotation.
recent_scanned means recent # of scans to LRU.
shown for better debugging. Please see the code for the exact meanings.
-Note:
+.. note::
Only anonymous and swap cache memory is listed as part of 'rss' stat.
This should not be confused with the true 'resident set size' or the
amount of physical memory used by the cgroup.
@@ -716,15 +719,25 @@ If we want to change this to 1G, we can at any time use::
# echo 1G > memory.soft_limit_in_bytes
-NOTE1:
+.. note::
Soft limits take effect over a long period of time, since they involve
reclaiming memory for balancing between memory cgroups
-NOTE2:
+
+.. note::
It is recommended to set the soft limit always below the hard limit,
otherwise the hard limit will take precedence.
-8. Move charges at task migration
-=================================
+.. _cgroup-v1-memory-move-charges:
+
+8. Move charges at task migration (DEPRECATED!)
+===============================================
+
+THIS IS DEPRECATED!
+
+It's expensive and unreliable! It's better practice to launch workload
+tasks directly from inside their target cgroup. Use dedicated workload
+cgroups to allow fine-grained policy adjustments without having to
+move physical pages between control domains.
Users can move charges associated with a task along with task migration, that
is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
@@ -741,23 +754,29 @@ If you want to enable it::
# echo (some positive value) > memory.move_charge_at_immigrate
-Note:
+.. note::
Each bits of move_charge_at_immigrate has its own meaning about what type
- of charges should be moved. See 8.2 for details.
-Note:
+ of charges should be moved. See :ref:`section 8.2
+ <cgroup-v1-memory-movable-charges>` for details.
+
+.. note::
Charges are moved only when you move mm->owner, in other words,
a leader of a thread group.
-Note:
+
+.. note::
If we cannot find enough space for the task in the destination cgroup, we
try to make space by reclaiming memory. Task migration may fail if we
cannot make enough space.
-Note:
+
+.. note::
It can take several seconds if you move many charges.
And if you want to disable it again::
# echo 0 > memory.move_charge_at_immigrate
+.. _cgroup-v1-memory-movable-charges:
+
8.2 Type of charges which can be moved
--------------------------------------
@@ -807,6 +826,8 @@ threshold in any direction.
It's applicable for root and non-root cgroup.
+.. _cgroup-v1-memory-oom-control:
+
10. OOM Control
===============
@@ -962,15 +983,16 @@ commented and discussed quite extensively in the community.
References
==========
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
+.. [1] Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+.. [2] Singh, Balbir. Memory Controller (RSS Control),
http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
+.. [3] Emelianov, Pavel. Resource controllers based on process cgroups
https://lore.kernel.org/r/45ED7DEC.7010403@sw.ru
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+.. [4] Emelianov, Pavel. RSS controller based on process cgroups (v2)
https://lore.kernel.org/r/461A3010.90403@sw.ru
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+.. [5] Emelianov, Pavel. RSS controller based on process cgroups (v3)
https://lore.kernel.org/r/465D9739.8070209@openvz.org
+
6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
subsystem (v3), http://lwn.net/Articles/235534/
@@ -980,7 +1002,8 @@ References
https://lore.kernel.org/r/464D267A.50107@linux.vnet.ibm.com
10. Singh, Balbir. Memory controller v6 test results,
https://lore.kernel.org/r/20070819094658.654.84837.sendpatchset@balbir-laptop
-11. Singh, Balbir. Memory controller introduction (v6),
- https://lore.kernel.org/r/20070817084228.26003.12568.sendpatchset@balbir-laptop
-12. Corbet, Jonathan, Controlling memory use in cgroups,
- http://lwn.net/Articles/243795/
+
+.. [11] Singh, Balbir. Memory controller introduction (v6),
+ https://lore.kernel.org/r/20070817084228.26003.12568.sendpatchset@balbir-laptop
+.. [12] Corbet, Jonathan, Controlling memory use in cgroups,
+ http://lwn.net/Articles/243795/
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4d8c27eca96b..f67c0829350b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -184,6 +184,14 @@ cgroup v2 currently supports the following mount options.
ignored on non-init namespace mounts. Please refer to the
Delegation section for details.
+ favordynmods
+ Reduce the latencies of dynamic cgroup modifications such as
+ task migrations and controller on/offs at the cost of making
+ hot path operations such as forks and exits more expensive.
+ The static usage pattern of creating a cgroup, enabling
+ controllers, and then seeding it with CLONE_INTO_CGROUP is
+ not affected by this option.
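+
+		A usage sketch (mount point and source name are illustrative)::
+
+		  # mount -t cgroup2 -o favordynmods none /sys/fs/cgroup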
+
memory_localevents
Only populate memory.events with data for the current cgroup,
and not any subtrees. This is legacy behaviour, the default
@@ -611,10 +619,12 @@ process migrations.
and is an example of this type.
+.. _cgroupv2-limits-distributor:
+
Limits
------
-A child can only consume upto the configured amount of the resource.
+A child can only consume up to the configured amount of the resource.
Limits can be over-committed - the sum of the limits of children can
exceed the amount of resource available to the parent.
@@ -627,15 +637,16 @@ process migrations.
"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
on an IO device and is an example of this type.
+.. _cgroupv2-protections-distributor:
Protections
-----------
-A cgroup is protected upto the configured amount of the resource
+A cgroup is protected up to the configured amount of the resource
as long as the usages of all its ancestors are under their
protected levels. Protections can be hard guarantees or best effort
soft boundaries. Protections can also be over-committed in which case
-only upto the amount available to the parent is protected among
+only up to the amount available to the parent is protected among
children.
Protections are in the range [0, max] and defaults to 0, which is
@@ -968,6 +979,29 @@ All cgroup core files are prefixed with "cgroup."
killing cgroups is a process directed operation, i.e. it affects
the whole thread-group.
+ cgroup.pressure
+	A read-write single value file whose allowed values are "0" and "1".
+ The default is "1".
+
+ Writing "0" to the file will disable the cgroup PSI accounting.
+ Writing "1" to the file will re-enable the cgroup PSI accounting.
+
+	This control attribute is not hierarchical, so disabling or enabling
+	PSI accounting in a cgroup does not affect PSI accounting in its
+	descendants, and enablement does not need to be passed down from the
+	root via the ancestors.
+
+	The reason this control attribute exists is that PSI accounts stalls
+	for each cgroup separately and aggregates them at each level of the
+	hierarchy. This may cause non-negligible overhead for some workloads
+	deep in the hierarchy, in which case this control attribute can be
+	used to disable PSI accounting in the non-leaf cgroups.
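+
+	For example, PSI accounting can be turned off for one cgroup from
+	within its directory (a usage sketch)::
+
+	  # echo 0 > cgroup.pressure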
+
+ irq.pressure
+ A read-write nested-keyed file.
+
+ Shows pressure stall information for IRQ/SOFTIRQ. See
+ :ref:`Documentation/accounting/psi.rst <psi>` for details.
+
Controllers
===========
@@ -1016,6 +1050,8 @@ All time durations are in microseconds.
- nr_periods
- nr_throttled
- throttled_usec
+ - nr_bursts
+ - burst_usec
cpu.weight
A read-write single value file which exists on non-root
@@ -1043,10 +1079,16 @@ All time durations are in microseconds.
$MAX $PERIOD
- which indicates that the group may consume upto $MAX in each
+ which indicates that the group may consume up to $MAX in each
$PERIOD duration. "max" for $MAX indicates no limit. If only
one number is written, $MAX is updated.
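+
+	For example, to allow 50ms of CPU time every 100ms period
+	(illustrative values, in microseconds)::
+
+	  # echo "50000 100000" > cpu.max
+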
+ cpu.max.burst
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+	The burst in the range [0, $MAX]. It allows a group to accumulate
+	quota left unused in earlier periods and spend it, on top of the
+	normal quota, in a later period.
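+
+	For example, to allow the group to accumulate and burst up to an
+	extra 50ms (an illustrative value, in microseconds; it must lie in
+	[0, $MAX])::
+
+	  # echo 50000 > cpu.max.burst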
+
cpu.pressure
A read-write nested-keyed file.
@@ -1200,6 +1242,41 @@ PAGE_SIZE multiple when read back.
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
+ memory.reclaim
+ A write-only nested-keyed file which exists for all cgroups.
+
+ This is a simple interface to trigger memory reclaim in the
+ target cgroup.
+
+ This file accepts a single key, the number of bytes to reclaim.
+ No nested keys are currently supported.
+
+ Example::
+
+ echo "1G" > memory.reclaim
+
+ The interface can be later extended with nested keys to
+ configure the reclaim behavior. For example, specify the
+ type of memory to reclaim from (anon, file, ..).
+
+	Please note that the kernel can over- or under-reclaim from
+	the target cgroup. If fewer bytes are reclaimed than the
+	specified amount, -EAGAIN is returned.
+
+ Please note that the proactive reclaim (triggered by this
+ interface) is not meant to indicate memory pressure on the
+	memory cgroup. Therefore, socket memory balancing triggered by
+	the memory reclaim is normally not exercised in this case.
+ This means that the networking layer will not adapt based on
+ reclaim induced by memory.reclaim.
+
+ memory.peak
+ A read-only single value file which exists on non-root
+ cgroups.
+
+ The max memory usage recorded for the cgroup and its
+ descendants since the creation of the cgroup.
+
memory.oom.group
A read-write single value file which exists on non-root
cgroups. The default value is "0".
@@ -1260,6 +1337,9 @@ PAGE_SIZE multiple when read back.
The number of processes belonging to this cgroup
killed by any kind of OOM killer.
+ oom_group_kill
+ The number of times a group OOM has occurred.
+
memory.events.local
Similar to memory.events but the fields in the file are local
to the cgroup i.e. not hierarchical. The file modified event
@@ -1290,12 +1370,22 @@ PAGE_SIZE multiple when read back.
Amount of memory used to cache filesystem data,
including tmpfs and shared memory.
+ kernel (npn)
+	   Amount of total kernel memory, including kernel_stack,
+	   pagetables, percpu, vmalloc and slab, in addition to
+	   other kernel memory use cases.
+
kernel_stack
Amount of memory allocated to kernel stacks.
pagetables
Amount of memory allocated for page tables.
+ sec_pagetables
+	   Amount of memory allocated for secondary page tables;
+	   this currently includes KVM MMU allocations on x86
+	   and arm64.
+
percpu (npn)
Amount of memory used for storing per-cpu kernel
data structures.
@@ -1303,10 +1393,19 @@ PAGE_SIZE multiple when read back.
sock (npn)
Amount of memory used in network transmission buffers
+ vmalloc (npn)
+ Amount of memory used for vmap backed memory.
+
shmem
Amount of cached filesystem data that is swap-backed,
such as tmpfs, shm segments, shared anonymous mmap()s
+ zswap
+ Amount of memory consumed by the zswap compression backend.
+
+ zswapped
+ Amount of application memory swapped out to zswap.
+
file_mapped
Amount of cached filesystem data mapped with mmap()
@@ -1380,6 +1479,30 @@ PAGE_SIZE multiple when read back.
workingset_nodereclaim
Number of times a shadow node has been reclaimed
+ pgscan (npn)
+ Amount of scanned pages (in an inactive LRU list)
+
+ pgsteal (npn)
+ Amount of reclaimed pages
+
+ pgscan_kswapd (npn)
+ Amount of scanned pages by kswapd (in an inactive LRU list)
+
+ pgscan_direct (npn)
+ Amount of scanned pages directly (in an inactive LRU list)
+
+ pgscan_khugepaged (npn)
+ Amount of scanned pages by khugepaged (in an inactive LRU list)
+
+ pgsteal_kswapd (npn)
+ Amount of reclaimed pages by kswapd
+
+ pgsteal_direct (npn)
+ Amount of reclaimed pages directly
+
+ pgsteal_khugepaged (npn)
+ Amount of reclaimed pages by khugepaged
+
pgfault (npn)
Total number of page faults incurred
@@ -1389,12 +1512,6 @@ PAGE_SIZE multiple when read back.
pgrefill (npn)
Amount of scanned pages (in an active LRU list)
- pgscan (npn)
- Amount of scanned pages (in an inactive LRU list)
-
- pgsteal (npn)
- Amount of reclaimed pages
-
pgactivate (npn)
Amount of pages moved to the active LRU list
@@ -1497,6 +1614,21 @@ PAGE_SIZE multiple when read back.
higher than the limit for an extended period of time. This
reduces the impact on the workload and memory management.
+ memory.zswap.current
+ A read-only single value file which exists on non-root
+ cgroups.
+
+ The total amount of memory consumed by the zswap compression
+ backend.
+
+ memory.zswap.max
+ A read-write single value file which exists on non-root
+ cgroups. The default is "max".
+
+	 Zswap usage hard limit. If a cgroup's zswap pool reaches this
+	 limit, it will refuse to take any more stores until existing
+	 entries fault back in or are written out to disk.
+
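+	 For example (the limit value is illustrative), capping a
+	 cgroup's zswap pool at 512 megabytes::
+
+	   echo "512M" > memory.zswap.max
+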
memory.pressure
A read-only nested-keyed file.
@@ -1862,7 +1994,7 @@ IO Latency Interface Files
io.latency
This takes a similar format as the other controllers.
- "MAJOR:MINOR target=<target time in microseconds"
+ "MAJOR:MINOR target=<target time in microseconds>"
io.stat
If the controller is enabled you will see extra stats in io.stat in
@@ -2090,75 +2222,93 @@ Cpuset Interface Files
It accepts only the following input values when written to.
- ======== ================================
- "root" a partition root
- "member" a non-root member of a partition
- ======== ================================
-
- When set to be a partition root, the current cgroup is the
- root of a new partition or scheduling domain that comprises
- itself and all its descendants except those that are separate
- partition roots themselves and their descendants. The root
- cgroup is always a partition root.
-
- There are constraints on where a partition root can be set.
- It can only be set in a cgroup if all the following conditions
- are true.
-
- 1) The "cpuset.cpus" is not empty and the list of CPUs are
- exclusive, i.e. they are not shared by any of its siblings.
- 2) The parent cgroup is a partition root.
- 3) The "cpuset.cpus" is also a proper subset of the parent's
- "cpuset.cpus.effective".
- 4) There is no child cgroups with cpuset enabled. This is for
- eliminating corner cases that have to be handled if such a
- condition is allowed.
-
- Setting it to partition root will take the CPUs away from the
- effective CPUs of the parent cgroup. Once it is set, this
- file cannot be reverted back to "member" if there are any child
- cgroups with cpuset enabled.
-
- A parent partition cannot distribute all its CPUs to its
- child partitions. There must be at least one cpu left in the
- parent partition.
-
- Once becoming a partition root, changes to "cpuset.cpus" is
- generally allowed as long as the first condition above is true,
- the change will not take away all the CPUs from the parent
- partition and the new "cpuset.cpus" value is a superset of its
- children's "cpuset.cpus" values.
-
- Sometimes, external factors like changes to ancestors'
- "cpuset.cpus" or cpu hotplug can cause the state of the partition
- root to change. On read, the "cpuset.sched.partition" file
- can show the following values.
-
- ============== ==============================
- "member" Non-root member of a partition
- "root" Partition root
- "root invalid" Invalid partition root
- ============== ==============================
-
- It is a partition root if the first 2 partition root conditions
- above are true and at least one CPU from "cpuset.cpus" is
- granted by the parent cgroup.
-
- A partition root can become invalid if none of CPUs requested
- in "cpuset.cpus" can be granted by the parent cgroup or the
- parent cgroup is no longer a partition root itself. In this
- case, it is not a real partition even though the restriction
- of the first partition root condition above will still apply.
- The cpu affinity of all the tasks in the cgroup will then be
- associated with CPUs in the nearest ancestor partition.
-
- An invalid partition root can be transitioned back to a
- real partition root if at least one of the requested CPUs
- can now be granted by its parent. In this case, the cpu
- affinity of all the tasks in the formerly invalid partition
- will be associated to the CPUs of the newly formed partition.
- Changing the partition state of an invalid partition root to
- "member" is always allowed even if child cpusets are present.
+ ========== =====================================
+ "member" Non-root member of a partition
+ "root" Partition root
+ "isolated" Partition root without load balancing
+ ========== =====================================
+
+ The root cgroup is always a partition root and its state
+ cannot be changed. All other non-root cgroups start out as
+ "member".
+
+ When set to "root", the current cgroup is the root of a new
+ partition or scheduling domain that comprises itself and all
+ its descendants except those that are separate partition roots
+ themselves and their descendants.
+
+ When set to "isolated", the CPUs in that partition root will
+ be in an isolated state without any load balancing from the
+ scheduler. Tasks placed in such a partition with multiple
+ CPUs should be carefully distributed and bound to each of the
+ individual CPUs for optimal performance.
+
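+	 As a sketch (paths and CPU numbers are hypothetical, and the
+	 cpuset controller must already be enabled in the parent's
+	 "cgroup.subtree_control"), an isolated partition could be set
+	 up like so::
+
+	   mkdir /sys/fs/cgroup/rt-part
+	   echo "2-3" > /sys/fs/cgroup/rt-part/cpuset.cpus
+	   echo isolated > /sys/fs/cgroup/rt-part/cpuset.cpus.partition
+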
+ The value shown in "cpuset.cpus.effective" of a partition root
+ is the CPUs that the partition root can dedicate to a potential
+	 new child partition root. A new child partition takes the CPUs
+	 it is granted away from its parent's "cpuset.cpus.effective".
+
+ A partition root ("root" or "isolated") can be in one of the
+ two possible states - valid or invalid. An invalid partition
+	 root is in a degraded state where some state information may
+	 be retained, but it otherwise behaves more like a "member".
+
+ All possible state transitions among "member", "root" and
+ "isolated" are allowed.
+
+ On read, the "cpuset.cpus.partition" file can show the following
+ values.
+
+ ============================= =====================================
+ "member" Non-root member of a partition
+ "root" Partition root
+ "isolated" Partition root without load balancing
+ "root invalid (<reason>)" Invalid partition root
+ "isolated invalid (<reason>)" Invalid isolated partition root
+ ============================= =====================================
+
+	 In the case of an invalid partition root, a descriptive string
+	 explaining why the partition is invalid is included within parentheses.
+
+ For a partition root to become valid, the following conditions
+ must be met.
+
+ 1) The "cpuset.cpus" is exclusive with its siblings , i.e. they
+ are not shared by any of its siblings (exclusivity rule).
+ 2) The parent cgroup is a valid partition root.
+ 3) The "cpuset.cpus" is not empty and must contain at least
+ one of the CPUs from parent's "cpuset.cpus", i.e. they overlap.
+ 4) The "cpuset.cpus.effective" cannot be empty unless there is
+ no task associated with this partition.
+
+ External events like hotplug or changes to "cpuset.cpus" can
+ cause a valid partition root to become invalid and vice versa.
+ Note that a task cannot be moved to a cgroup with empty
+ "cpuset.cpus.effective".
+
+ For a valid partition root with the sibling cpu exclusivity
+ rule enabled, changes made to "cpuset.cpus" that violate the
+ exclusivity rule will invalidate the partition as well as its
+	 sibling partitions with conflicting cpuset.cpus values. So
+	 care must be taken when changing "cpuset.cpus".
+
+	 A valid non-root parent partition may distribute all its CPUs
+	 to its child partitions when there is no task associated with it.
+
+	 Care must be taken when changing a valid partition root to
+	 "member", as all its child partitions, if present, will become
+	 invalid, causing disruption to tasks running in those child
+	 partitions. These inactivated partitions could be recovered if
+ their parent is switched back to a partition root with a proper
+ set of "cpuset.cpus".
+
+ Poll and inotify events are triggered whenever the state of
+ "cpuset.cpus.partition" changes. That includes changes caused
+ by write to "cpuset.cpus.partition", cpu hotplug or other
+ changes that modify the validity status of the partition.
+ This will allow user space agents to monitor unexpected changes
+ to "cpuset.cpus.partition" without the need to do continuous
+ polling.
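+
+	 As a sketch (requires the inotify-tools package; the cgroup
+	 path is hypothetical), such an agent could watch for state
+	 changes with::
+
+	   inotifywait -m -e modify /sys/fs/cgroup/rt-part/cpuset.cpus.partition
+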
Device controller
@@ -2252,6 +2402,11 @@ HugeTLB Interface Files
are local to the cgroup i.e. not hierarchical. The file modified event
generated on this file reflects only the local events.
+ hugetlb.<hugepagesize>.numa_stat
+	Similar to memory.numa_stat, it shows the numa information of the
+	hugetlb pages of <hugepagesize> in this cgroup. Only actively
+	in-use hugetlb pages are included. The per-node values are in bytes.
+
Misc
----
@@ -2310,6 +2465,16 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_
Limits can be set higher than the capacity value in the misc.capacity
file.
+ misc.events
+ A read-only flat-keyed file which exists on non-root cgroups. The
+ following entries are defined. Unless specified otherwise, a value
+ change in this file generates a file modified event. All fields in
+ this file are hierarchical.
+
+ max
+ The number of times the cgroup's resource usage was
+ about to go over the max boundary.
+
Migration and Ownership
~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst
index f170d8820258..2e151cd8c2e4 100644
--- a/Documentation/admin-guide/cifs/usage.rst
+++ b/Documentation/admin-guide/cifs/usage.rst
@@ -399,7 +399,7 @@ A partial list of the supported mount options follows:
sep
if first mount option (after the -o), overrides
the comma as the separator between the mount
- parms. e.g.::
+ parameters. e.g.::
-o user=myname,password=mypassword,domain=mydom
@@ -734,10 +734,9 @@ SecurityFlags Flags which control security negotiation and
using weaker password hashes is 0x37037 (lanman,
plaintext, ntlm, ntlmv2, signing allowed). Some
SecurityFlags require the corresponding menuconfig
- options to be enabled (lanman and plaintext require
- CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
- plaintext authentication currently requires also
- enabling lanman authentication in the security flags
+ options to be enabled. Enabling plaintext
+ authentication currently requires also enabling
+ lanman authentication in the security flags
because the cifs module only supports sending
 			plaintext passwords using the older lanman dialect
form of the session setup SMB. (e.g. for authentication
@@ -766,7 +765,7 @@ cifsFYI If set to non-zero value, additional debug information
Some debugging statements are not compiled into the
cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
kernel configuration. cifsFYI may be set to one or
- nore of the following flags (7 sets them all)::
+ more of the following flags (7 sets them all)::
+-----------------------------------------------+------+
| log cifs informational messages | 0x01 |
@@ -859,7 +858,7 @@ CIFS kernel module parameters
These module parameters can be specified or modified either during the time of
module loading or during the runtime by using the interface::
- /proc/module/cifs/parameters/<param>
+ /sys/module/cifs/parameters/<param>
i.e.::
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
index b085dbac60a5..d29cacc9b3c3 100644
--- a/Documentation/admin-guide/cputopology.rst
+++ b/Documentation/admin-guide/cputopology.rst
@@ -8,22 +8,22 @@ to /proc/cpuinfo output of some architectures. They reside in
Documentation/ABI/stable/sysfs-devices-system-cpu.
Architecture-neutral, drivers/base/topology.c, exports these attributes.
-However, the book and drawer related sysfs files will only be created if
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
-
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
-where they reflect the cpu and cache hierarchy.
+However the die, cluster, book, and drawer hierarchy related sysfs files will
+only be created if an architecture provides the related macros as described
+below.
For an architecture to support this feature, it must define some of
these macros in include/asm-XXX/topology.h::
#define topology_physical_package_id(cpu)
#define topology_die_id(cpu)
+ #define topology_cluster_id(cpu)
#define topology_core_id(cpu)
#define topology_book_id(cpu)
#define topology_drawer_id(cpu)
#define topology_sibling_cpumask(cpu)
#define topology_core_cpumask(cpu)
+ #define topology_cluster_cpumask(cpu)
#define topology_die_cpumask(cpu)
#define topology_book_cpumask(cpu)
#define topology_drawer_cpumask(cpu)
@@ -39,15 +39,16 @@ not defined by include/asm-XXX/topology.h:
1) topology_physical_package_id: -1
2) topology_die_id: -1
-3) topology_core_id: 0
-4) topology_sibling_cpumask: just the given CPU
-5) topology_core_cpumask: just the given CPU
-6) topology_die_cpumask: just the given CPU
-
-For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
-default definitions for topology_book_id() and topology_book_cpumask().
-For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
-no default definitions for topology_drawer_id() and topology_drawer_cpumask().
+3) topology_cluster_id: -1
+4) topology_core_id: 0
+5) topology_book_id: -1
+6) topology_drawer_id: -1
+7) topology_sibling_cpumask: just the given CPU
+8) topology_core_cpumask: just the given CPU
+9) topology_cluster_cpumask: just the given CPU
+10) topology_die_cpumask: just the given CPU
+11) topology_book_cpumask: just the given CPU
+12) topology_drawer_cpumask: just the given CPU
Additionally, CPU topology information is provided under
/sys/devices/system/cpu and includes these files. The internal
diff --git a/Documentation/admin-guide/device-mapper/cache-policies.rst b/Documentation/admin-guide/device-mapper/cache-policies.rst
index b17fe352fc41..13da4d831d46 100644
--- a/Documentation/admin-guide/device-mapper/cache-policies.rst
+++ b/Documentation/admin-guide/device-mapper/cache-policies.rst
@@ -70,7 +70,7 @@ the entries (each hotspot block covers a larger area than a single
cache block).
All this means smq uses ~25bytes per cache block. Still a lot of
-memory, but a substantial improvement nontheless.
+memory, but a substantial improvement nonetheless.
Level balancing
^^^^^^^^^^^^^^^
diff --git a/Documentation/admin-guide/device-mapper/dm-ebs.rst b/Documentation/admin-guide/device-mapper/dm-ebs.rst
index 534fa38e8862..c09f66db5621 100644
--- a/Documentation/admin-guide/device-mapper/dm-ebs.rst
+++ b/Documentation/admin-guide/device-mapper/dm-ebs.rst
@@ -31,7 +31,7 @@ Mandatory parameters:
Optional parameter:
- <underyling sectors>:
+ <underlying sectors>:
Number of sectors defining the logical block size of <dev path>.
2^N supported, e.g. 8 = emulate 8 sectors of 512 bytes = 4KiB.
If not provided, the logical block size of <dev path> will be used.
diff --git a/Documentation/admin-guide/device-mapper/dm-flakey.rst b/Documentation/admin-guide/device-mapper/dm-flakey.rst
index 86138735879d..f7104c01b0f7 100644
--- a/Documentation/admin-guide/device-mapper/dm-flakey.rst
+++ b/Documentation/admin-guide/device-mapper/dm-flakey.rst
@@ -39,6 +39,10 @@ Optional feature parameters:
If no feature parameters are present, during the periods of
unreliability, all I/O returns errors.
+ error_reads:
+ All read I/O is failed with an error signalled.
+ Write I/O is handled correctly.
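+
+	A hypothetical dmsetup invocation using the error_reads feature
+	(device path and up/down intervals are illustrative)::
+
+	  dmsetup create flakey-test --table \
+	    "0 $(blockdev --getsz /dev/sdb1) flakey /dev/sdb1 0 5 5 1 error_reads"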
+
drop_writes:
All write I/O is silently ignored.
Read I/O is handled correctly.
diff --git a/Documentation/admin-guide/device-mapper/dm-init.rst b/Documentation/admin-guide/device-mapper/dm-init.rst
index e5242ff17e9b..981d6a907699 100644
--- a/Documentation/admin-guide/device-mapper/dm-init.rst
+++ b/Documentation/admin-guide/device-mapper/dm-init.rst
@@ -123,3 +123,11 @@ Other examples (per target):
0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd
51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584
+
+For setups using device-mapper on top of asynchronously probed block
+devices (MMC, USB, ..), it may be necessary to tell dm-init to
+explicitly wait for them to become available before setting up the
+device-mapper tables. This can be done with the "dm-mod.waitfor="
+module parameter, which takes a list of devices to wait for::
+
+ dm-mod.waitfor=<device1>[,..,<deviceN>]
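+
+For example (device names are illustrative), waiting for two asynchronously
+probed devices before the tables are created::
+
+  dm-mod.waitfor="/dev/mmcblk0p1,/dev/sda2"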
diff --git a/Documentation/admin-guide/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst
index 0fac051caeac..932383fe6e88 100644
--- a/Documentation/admin-guide/device-mapper/dm-zoned.rst
+++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst
@@ -46,7 +46,7 @@ just like conventional zones.
The zones of the device(s) are separated into 2 types:
1) Metadata zones: these are conventional zones used to store metadata.
-Metadata zones are not reported as useable capacity to the user.
+Metadata zones are not reported as usable capacity to the user.
2) Data zones: all remaining zones, the vast majority of which will be
sequential zones used exclusively to store user data. The conventional
diff --git a/Documentation/admin-guide/device-mapper/unstriped.rst b/Documentation/admin-guide/device-mapper/unstriped.rst
index 0a8d3eb3f072..5772ccdd1f5f 100644
--- a/Documentation/admin-guide/device-mapper/unstriped.rst
+++ b/Documentation/admin-guide/device-mapper/unstriped.rst
@@ -35,7 +35,7 @@ An example of undoing an existing dm-stripe
This small bash script will setup 4 loop devices and use the existing
striped target to combine the 4 devices into one. It then will use
-the unstriped target ontop of the striped device to access the
+the unstriped target on top of the striped device to access the
individual backing loop devices. We write data to the newly exposed
unstriped devices and verify the data written matches the correct
underlying device on the striped array::
@@ -110,8 +110,8 @@ to get a 92% reduction in read latency using this device mapper target.
Example dmsetup usage
=====================
-unstriped ontop of Intel NVMe device that has 2 cores
------------------------------------------------------
+unstriped on top of Intel NVMe device that has 2 cores
+------------------------------------------------------
::
@@ -124,8 +124,8 @@ respectively::
/dev/mapper/nvmset0
/dev/mapper/nvmset1
-unstriped ontop of striped with 4 drives using 128K chunk size
---------------------------------------------------------------
+unstriped on top of striped with 4 drives using 128K chunk size
+---------------------------------------------------------------
::
diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst
index 1a6b91368e59..a65c1602cb23 100644
--- a/Documentation/admin-guide/device-mapper/verity.rst
+++ b/Documentation/admin-guide/device-mapper/verity.rst
@@ -141,6 +141,10 @@ root_hash_sig_key_desc <key_description>
also gain new certificates at run time if they are signed by a certificate
already in the secondary trusted keyring.
+try_verify_in_tasklet
+    If verity hashes are in cache, verify data blocks in a kernel tasklet
+    instead of a workqueue. This option can reduce IO latency.
+
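+A hypothetical table line enabling this option (all other parameters are
+illustrative, and <root hash>/<salt> are placeholders) appends it as an
+optional argument::
+
+    0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
+    <root hash> <salt> 1 try_verify_in_tasklet
+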
Theory of operation
===================
diff --git a/Documentation/admin-guide/device-mapper/writecache.rst b/Documentation/admin-guide/device-mapper/writecache.rst
index 10429779a91a..60c16b7fd5ac 100644
--- a/Documentation/admin-guide/device-mapper/writecache.rst
+++ b/Documentation/admin-guide/device-mapper/writecache.rst
@@ -20,6 +20,7 @@ Constructor parameters:
size)
5. the number of optional parameters (the parameters with an argument
count as two)
+
start_sector n (default: 0)
offset from the start of cache device in 512-byte sectors
high_watermark n (default: 50)
@@ -74,20 +75,21 @@ Constructor parameters:
the origin volume in the last n milliseconds
Status:
+
1. error indicator - 0 if there was no error, otherwise error number
2. the number of blocks
3. the number of free blocks
4. the number of blocks under writeback
-5. the number of read requests
-6. the number of read requests that hit the cache
-7. the number of write requests
-8. the number of write requests that hit uncommitted block
-9. the number of write requests that hit committed block
-10. the number of write requests that bypass the cache
-11. the number of write requests that are allocated in the cache
+5. the number of read blocks
+6. the number of read blocks that hit the cache
+7. the number of write blocks
+8. the number of write blocks that hit uncommitted block
+9. the number of write blocks that hit committed block
+10. the number of write blocks that bypass the cache
+11. the number of write blocks that are allocated in the cache
12. the number of write requests that are blocked on the freelist
13. the number of flush requests
-14. the number of discard requests
+14. the number of discarded blocks
Messages:
flush
diff --git a/Documentation/admin-guide/devices.rst b/Documentation/admin-guide/devices.rst
index 035275fedbdd..e3776d77374b 100644
--- a/Documentation/admin-guide/devices.rst
+++ b/Documentation/admin-guide/devices.rst
@@ -7,10 +7,9 @@ This list is the Linux Device List, the official registry of allocated
device numbers and ``/dev`` directory nodes for the Linux operating
system.
-The LaTeX version of this document is no longer maintained, nor is
-the document that used to reside at lanana.org. This version in the
-mainline Linux kernel is the master document. Updates shall be sent
-as patches to the kernel maintainers (see the
+The version of this document at lanana.org is no longer maintained. This
+version in the mainline Linux kernel is the master document. Updates
+shall be sent as patches to the kernel maintainers (see the
:ref:`Documentation/process/submitting-patches.rst <submittingpatches>` document).
Specifically explore the sections titled "CHAR and MISC DRIVERS", and
"BLOCK LAYER" in the MAINTAINERS file to find the right maintainers
diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 922c23bb4372..06c525e01ea5 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -1933,7 +1933,7 @@
...
255= /dev/umem/d15p15 15th partition of 16th board.
- 117 char COSA/SRP synchronous serial card
+ 117 char [REMOVED] COSA/SRP synchronous serial card
0 = /dev/cosa0c0 1st board, 1st channel
1 = /dev/cosa0c1 1st board, 2nd channel
...
@@ -2339,13 +2339,7 @@
disks (see major number 3) except that the limit on
partitions is 31.
- 162 char Raw block device interface
- 0 = /dev/rawctl Raw I/O control device
- 1 = /dev/raw/raw1 First raw I/O device
- 2 = /dev/raw/raw2 Second raw I/O device
- ...
- max minor number of raw device is set by kernel config
- MAX_RAW_DEVS or raw module parameter 'max_raw_devs'
+ 162 char Used for (now removed) raw block device interface
163 char
@@ -3086,6 +3080,11 @@
...
255 = /dev/osd255 256th OSD Device
+ 261 char Compute Acceleration Devices
+ 0 = /dev/accel/accel0 First acceleration device
+ 1 = /dev/accel/accel1 Second acceleration device
+ ...
+
384-511 char RESERVED FOR DYNAMIC ASSIGNMENT
Character devices that request a dynamic allocation of major
number will take numbers starting from 511 and downward,
diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst
index b119b8277b3e..8dc668cc1216 100644
--- a/Documentation/admin-guide/dynamic-debug-howto.rst
+++ b/Documentation/admin-guide/dynamic-debug-howto.rst
@@ -5,143 +5,115 @@ Dynamic debug
Introduction
============
-This document describes how to use the dynamic debug (dyndbg) feature.
+Dynamic debug allows you to dynamically enable/disable kernel
+debug-print code to obtain additional kernel information.
-Dynamic debug is designed to allow you to dynamically enable/disable
-kernel code to obtain additional kernel information. Currently, if
-``CONFIG_DYNAMIC_DEBUG`` is set, then all ``pr_debug()``/``dev_dbg()`` and
-``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically
-enabled per-callsite.
+If ``/proc/dynamic_debug/control`` exists, your kernel has dynamic
+debug. You'll need root access (sudo su) to use this.
-If you do not want to enable dynamic debug globally (i.e. in some embedded
-system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic
-debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any
-modules which you'd like to dynamically debug later.
-
-If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just
-shortcut for ``print_hex_dump(KERN_DEBUG)``.
-
-For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is
-its ``prefix_str`` argument, if it is constant string; or ``hexdump``
-in case ``prefix_str`` is built dynamically.
+Dynamic debug provides:
-Dynamic debug has even more useful features:
+ * a Catalog of all *prdbgs* in your kernel.
+ ``cat /proc/dynamic_debug/control`` to see them.
- * Simple query language allows turning on and off debugging
- statements by matching any combination of 0 or 1 of:
+ * a Simple query/command language to alter *prdbgs* by selecting on
+ any combination of 0 or 1 of:
- source filename
- function name
- line number (including ranges of line numbers)
- module name
- format string
-
- * Provides a debugfs control file: ``<debugfs>/dynamic_debug/control``
- which can be read to display the complete list of known debug
- statements, to help guide you
-
-Controlling dynamic debug Behaviour
-===================================
-
-The behaviour of ``pr_debug()``/``dev_dbg()`` are controlled via writing to a
-control file in the 'debugfs' filesystem. Thus, you must first mount
-the debugfs filesystem, in order to make use of this feature.
-Subsequently, we refer to the control file as:
-``<debugfs>/dynamic_debug/control``. For example, if you want to enable
-printing from source file ``svcsock.c``, line 1603 you simply do::
-
- nullarbor:~ # echo 'file svcsock.c line 1603 +p' >
- <debugfs>/dynamic_debug/control
-
-If you make a mistake with the syntax, the write will fail thus::
-
- nullarbor:~ # echo 'file svcsock.c wtf 1 +p' >
- <debugfs>/dynamic_debug/control
- -bash: echo: write error: Invalid argument
-
-Note, for systems without 'debugfs' enabled, the control file can be
-found in ``/proc/dynamic_debug/control``.
+ - class name (as known/declared by each module)
Viewing Dynamic Debug Behaviour
===============================
-You can view the currently configured behaviour of all the debug
-statements via::
+You can view the currently configured behaviour in the *prdbg* catalog::
- nullarbor:~ # cat <debugfs>/dynamic_debug/control
+ :#> head -n7 /proc/dynamic_debug/control
# filename:lineno [module]function flags format
- net/sunrpc/svc_rdma.c:323 [svcxprt_rdma]svc_rdma_cleanup =_ "SVCRDMA Module Removed, deregister RPC RDMA transport\012"
- net/sunrpc/svc_rdma.c:341 [svcxprt_rdma]svc_rdma_init =_ "\011max_inline : %d\012"
- net/sunrpc/svc_rdma.c:340 [svcxprt_rdma]svc_rdma_init =_ "\011sq_depth : %d\012"
- net/sunrpc/svc_rdma.c:338 [svcxprt_rdma]svc_rdma_init =_ "\011max_requests : %d\012"
- ...
+  init/main.c:1179 [main]initcall_blacklist =_ "blacklisting initcall %s\012"
+ init/main.c:1218 [main]initcall_blacklisted =_ "initcall %s blacklisted\012"
+ init/main.c:1424 [main]run_init_process =_ " with arguments:\012"
+ init/main.c:1426 [main]run_init_process =_ " %s\012"
+ init/main.c:1427 [main]run_init_process =_ " with environment:\012"
+ init/main.c:1429 [main]run_init_process =_ " %s\012"
+The 3rd space-delimited column shows the current flags, preceded by
+a ``=`` for easy use with grep/cut. ``=p`` shows enabled callsites.
-You can also apply standard Unix text manipulation filters to this
-data, e.g.::
+Controlling dynamic debug Behaviour
+===================================
- nullarbor:~ # grep -i rdma <debugfs>/dynamic_debug/control | wc -l
- 62
+The behaviour of *prdbg* sites is controlled by writing
+query/commands to the control file. Example::
- nullarbor:~ # grep -i tcp <debugfs>/dynamic_debug/control | wc -l
- 42
+ # grease the interface
+ :#> alias ddcmd='echo $* > /proc/dynamic_debug/control'
-The third column shows the currently enabled flags for each debug
-statement callsite (see below for definitions of the flags). The
-default value, with no flags enabled, is ``=_``. So you can view all
-the debug statement callsites with any non-default flags::
+ :#> ddcmd '-p; module main func run* +p'
+ :#> grep =p /proc/dynamic_debug/control
+ init/main.c:1424 [main]run_init_process =p " with arguments:\012"
+ init/main.c:1426 [main]run_init_process =p " %s\012"
+ init/main.c:1427 [main]run_init_process =p " with environment:\012"
+ init/main.c:1429 [main]run_init_process =p " %s\012"
- nullarbor:~ # awk '$3 != "=_"' <debugfs>/dynamic_debug/control
- # filename:lineno [module]function flags format
- net/sunrpc/svcsock.c:1603 [sunrpc]svc_send p "svc_process: st_sendto returned %d\012"
+Error messages go to console/syslog::
+
+ :#> ddcmd mode foo +p
+ dyndbg: unknown keyword "mode"
+ dyndbg: query parse failed
+ bash: echo: write error: Invalid argument
+
+If debugfs is also enabled and mounted, ``dynamic_debug/control`` is
+also under the mount-dir, typically ``/sys/kernel/debug/``.
Command Language Reference
==========================
-At the lexical level, a command comprises a sequence of words separated
+At the basic lexical level, a command is a sequence of words separated
by spaces or tabs. So these are all equivalent::
- nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
- <debugfs>/dynamic_debug/control
- nullarbor:~ # echo -n ' file svcsock.c line 1603 +p ' >
- <debugfs>/dynamic_debug/control
- nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd file svcsock.c line 1603 +p
+ :#> ddcmd "file svcsock.c line 1603 +p"
+ :#> ddcmd ' file svcsock.c line 1603 +p '
Command submissions are bounded by a write() system call.
Multiple commands can be written together, separated by ``;`` or ``\n``::
- ~# echo "func pnpacpi_get_resources +p; func pnp_assign_mem +p" \
- > <debugfs>/dynamic_debug/control
+ :#> ddcmd "func pnpacpi_get_resources +p; func pnp_assign_mem +p"
+ :#> ddcmd <<"EOC"
+ func pnpacpi_get_resources +p
+ func pnp_assign_mem +p
+ EOC
+ :#> cat query-batch-file > /proc/dynamic_debug/control
-If your query set is big, you can batch them too::
+You can also use wildcards in each query term. The match rule supports
+``*`` (matches zero or more characters) and ``?`` (matches exactly one
+character). For example, you can match all usb drivers::
- ~# cat query-batch-file > <debugfs>/dynamic_debug/control
+ :#> ddcmd file "drivers/usb/*" +p # "" to suppress shell expansion
-Another way is to use wildcards. The match rule supports ``*`` (matches
-zero or more characters) and ``?`` (matches exactly one character). For
-example, you can match all usb drivers::
-
- ~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
-
-At the syntactical level, a command comprises a sequence of match
-specifications, followed by a flags change specification::
+Syntactically, a command is a sequence of keyword/value pairs, followed
+by a flags change or setting::
command ::= match-spec* flags-spec
-The match-spec's are used to choose a subset of the known pr_debug()
-callsites to which to apply the flags-spec. Think of them as a query
-with implicit ANDs between each pair. Note that an empty list of
-match-specs will select all debug statement callsites.
+The match-spec's select *prdbgs* from the catalog, upon which to apply
+the flags-spec; all constraints are ANDed together. An absent keyword
+is the same as keyword "*".
+
-A match specification comprises a keyword, which controls the
-attribute of the callsite to be compared, and a value to compare
-against. Possible keywords are:::
+A match specification is a keyword, which selects the attribute of
+the callsite to be compared, and a value to compare against. Possible
+keywords are:::
match-spec ::= 'func' string |
'file' string |
'module' string |
'format' string |
+ 'class' string |
'line' line-range
line-range ::= lineno |
@@ -203,6 +175,16 @@ format
format "nfsd: SETATTR" // a neater way to match a format with whitespace
format 'nfsd: SETATTR' // yet another way to match a format with whitespace
+class
+ The given class_name is validated against each module, which may
+ have declared a list of known class_names. If the class_name is
+ found for a module, callsite & class matching and adjustment
+ proceeds. Examples::
+
+ class DRM_UT_KMS # a DRM.debug category
+ class JUNK # silent non-match
+ // class TLD_* # NOTICE: no wildcard in class names
+
line
The given line number or range of line numbers is compared
against the line number of each ``pr_debug()`` callsite. A single
@@ -228,17 +210,16 @@ of the characters::
The flags are::
p enables the pr_debug() callsite.
- f Include the function name in the printed message
- l Include line number in the printed message
- m Include module name in the printed message
- t Include thread ID in messages not generated from interrupt context
- _ No flags are set. (Or'd with others on input)
+ _ enables no flags.
-For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only ``p`` flag
-have meaning, other flags ignored.
+ Decorator flags add to the message-prefix, in order:
+ t Include thread ID, or <intr>
+ m Include module name
+ f Include the function name
+ l Include line number
-For display, the flags are preceded by ``=``
-(mnemonic: what the flags are currently equal to).
+For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only
+the ``p`` flag has meaning, other flags are ignored.
Note the regexp ``^[-+=][flmpt_]+$`` matches a flags specification.
To clear all flags at once, use ``=_`` or ``-flmpt``.
@@ -249,8 +230,7 @@ Debug messages during Boot Process
To activate debug messages for core code and built-in modules during
the boot process, even before userspace and debugfs exists, use
-``dyndbg="QUERY"``, ``module.dyndbg="QUERY"``, or ``ddebug_query="QUERY"``
-(``ddebug_query`` is obsoleted by ``dyndbg``, and deprecated). QUERY follows
+``dyndbg="QUERY"`` or ``module.dyndbg="QUERY"``. QUERY follows
the syntax described above, but must not exceed 1023 characters. Your
bootloader may impose lower limits.
@@ -270,8 +250,7 @@ this boot parameter for debugging purposes.
If ``foo`` module is not built-in, ``foo.dyndbg`` will still be processed at
boot time, without effect, but will be reprocessed when module is
-loaded later. ``ddebug_query=`` and bare ``dyndbg=`` are only processed at
-boot.
+loaded later. Bare ``dyndbg=`` is only processed at boot.
Debug Messages at Module Initialization Time
@@ -315,7 +294,7 @@ For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
the debugfs interface if the debug messages are no longer needed::
- echo "module module_name -p" > <debugfs>/dynamic_debug/control
+ echo "module module_name -p" > /proc/dynamic_debug/control
Examples
========
@@ -323,43 +302,75 @@ Examples
::
// enable the message at line 1603 of file svcsock.c
- nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd 'file svcsock.c line 1603 +p'
// enable all the messages in file svcsock.c
- nullarbor:~ # echo -n 'file svcsock.c +p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd 'file svcsock.c +p'
// enable all the messages in the NFS server module
- nullarbor:~ # echo -n 'module nfsd +p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd 'module nfsd +p'
// enable all 12 messages in the function svc_process()
- nullarbor:~ # echo -n 'func svc_process +p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd 'func svc_process +p'
// disable all 12 messages in the function svc_process()
- nullarbor:~ # echo -n 'func svc_process -p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd 'func svc_process -p'
// enable messages for NFS calls READ, READLINK, READDIR and READDIR+.
- nullarbor:~ # echo -n 'format "nfsd: READ" +p' >
- <debugfs>/dynamic_debug/control
+ :#> ddcmd 'format "nfsd: READ" +p'
// enable messages in files of which the paths include string "usb"
- nullarbor:~ # echo -n 'file *usb* +p' > <debugfs>/dynamic_debug/control
+  :#> ddcmd 'file *usb* +p'
// enable all messages
- nullarbor:~ # echo -n '+p' > <debugfs>/dynamic_debug/control
+  :#> ddcmd '+p'
// add module, function to all enabled messages
- nullarbor:~ # echo -n '+mf' > <debugfs>/dynamic_debug/control
+  :#> ddcmd '+mf'
// boot-args example, with newlines and comments for readability
Kernel command line: ...
- // see whats going on in dyndbg=value processing
- dynamic_debug.verbose=1
- // enable pr_debugs in 2 builtins, #cmt is stripped
- dyndbg="module params +p #cmt ; module sys +p"
+ // see what's going on in dyndbg=value processing
+ dynamic_debug.verbose=3
+ // enable pr_debugs in the btrfs module (can be builtin or loadable)
+ btrfs.dyndbg="+p"
+ // enable pr_debugs in all files under init/
+ // and the function parse_one, #cmt is stripped
+ dyndbg="file init/* +p #cmt ; func parse_one +p"
// enable pr_debugs in 2 functions in a module loaded later
pc87360.dyndbg="func pc87360_init_device +p; func pc87360_find +p"
+
+Kernel Configuration
+====================
+
+Dynamic Debug is enabled via kernel config items::
+
+ CONFIG_DYNAMIC_DEBUG=y # build catalog, enables CORE
+ CONFIG_DYNAMIC_DEBUG_CORE=y # enable mechanics only, skip catalog
+
+If you do not want to enable dynamic debug globally (i.e. in some embedded
+system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic
+debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any
+modules which you'd like to dynamically debug later.
+
+
+Kernel *prdbg* API
+==================
+
+The following functions are cataloged and controllable when dynamic
+debug is enabled::
+
+ pr_debug()
+ dev_dbg()
+ print_hex_dump_debug()
+ print_hex_dump_bytes()
+
+Otherwise, they are off by default; ``ccflags += -DDEBUG`` or
+``#define DEBUG`` in a source file will enable them appropriately.
+
+If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is
+just a shortcut for ``print_hex_dump(KERN_DEBUG)``.
+
+For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, the format string
+is its ``prefix_str`` argument, if it is a constant string; or ``hexdump``
+in case ``prefix_str`` is built dynamically.
diff --git a/Documentation/admin-guide/efi-stub.rst b/Documentation/admin-guide/efi-stub.rst
index 833edb0d0bc4..b24e7c40d832 100644
--- a/Documentation/admin-guide/efi-stub.rst
+++ b/Documentation/admin-guide/efi-stub.rst
@@ -7,10 +7,10 @@ as a PE/COFF image, thereby convincing EFI firmware loaders to load
it as an EFI executable. The code that modifies the bzImage header,
along with the EFI-specific entry point that the firmware loader
jumps to are collectively known as the "EFI boot stub", and live in
-arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c,
+arch/x86/boot/header.S and drivers/firmware/efi/libstub/x86-stub.c,
respectively. For ARM the EFI stub is implemented in
arch/arm/boot/compressed/efi-header.S and
-arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared
+drivers/firmware/efi/libstub/arm32-stub.c. EFI stub code that is shared
between architectures is in drivers/firmware/efi/libstub.
For arm64, there is no compressed kernel support, so the Image itself
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
index 4c559e08d11e..5740d85439ff 100644
--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@@ -489,9 +489,6 @@ Files in /sys/fs/ext4/<devname>:
multiple of this tuning parameter if the stripe size is not set in the
ext4 superblock
- mb_max_inode_prealloc
- The maximum length of per-inode ext4_prealloc_space list.
-
mb_max_to_scan
The maximum number of extents the multiblock allocator will search to
find the best extent.
diff --git a/Documentation/admin-guide/filesystem-monitoring.rst b/Documentation/admin-guide/filesystem-monitoring.rst
new file mode 100644
index 000000000000..ab8dba76283c
--- /dev/null
+++ b/Documentation/admin-guide/filesystem-monitoring.rst
@@ -0,0 +1,78 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================================
+File system Monitoring with fanotify
+====================================
+
+File system Error Reporting
+===========================
+
+Fanotify supports the FAN_FS_ERROR event type for file system-wide error
+reporting. It is meant to be used by file system health monitoring
+daemons, which listen for these events and take actions (notify
+sysadmin, start recovery) when a file system problem is detected.
+
+By design, a FAN_FS_ERROR notification exposes sufficient information
+for a monitoring tool to know a problem in the file system has happened.
+It doesn't necessarily provide a user space application with semantics
+to verify an IO operation was successfully executed. That is out of
+scope for this feature. Instead, it is only meant as a framework for
+early file system problem detection and reporting recovery tools.
+
+When a file system operation fails, it is common for dozens of kernel
+errors to cascade after the initial failure, hiding the original failure
+log, which is usually the most useful debug data to troubleshoot the
+problem. For this reason, FAN_FS_ERROR tries to report only the first
+error that occurred for a file system since the last notification, and
+it simply counts additional errors. This ensures that the most
+important pieces of information are never lost.
+
+FAN_FS_ERROR requires the fanotify group to be setup with the
+FAN_REPORT_FID flag.
+
+At the time of this writing, the only file system that emits FAN_FS_ERROR
+notifications is Ext4.
+
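+A minimal sketch of setting up such a listener (the watched mount point
+"/mnt" is hypothetical and error handling is abbreviated)::
+
+   #include <fcntl.h>
+   #include <stdio.h>
+   #include <sys/fanotify.h>
+
+   int main(void)
+   {
+           int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
+
+           if (fd < 0) {
+                   perror("fanotify_init");
+                   return 1;
+           }
+
+           /* Watch the whole filesystem backing /mnt for errors. */
+           if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
+                             FAN_FS_ERROR, AT_FDCWD, "/mnt") < 0) {
+                   perror("fanotify_mark");
+                   return 1;
+           }
+
+           /* read(fd, ...) would now return FAN_FS_ERROR events; see
+            * samples/fanotify/fs-monitor.c for event parsing. */
+           return 0;
+   }
+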
+A FAN_FS_ERROR Notification has the following format::
+
+ [ Notification Metadata (Mandatory) ]
+ [ Generic Error Record (Mandatory) ]
+ [ FID record (Mandatory) ]
+
+The order of records is not guaranteed, and new records might be added
+in the future. Therefore, applications must not rely on the order and
+must be prepared to skip over unknown records. Please refer to
+``samples/fanotify/fs-monitor.c`` for an example parser.
+
+Generic error record
+--------------------
+
+The generic error record provides enough information for a file system
+agnostic tool to learn about a problem in the file system, without
+providing any additional details about the problem. This record is
+identified by ``struct fanotify_event_info_header.info_type`` being set
+to FAN_EVENT_INFO_TYPE_ERROR.
+
+ ::
+
+ struct fanotify_event_info_error {
+ struct fanotify_event_info_header hdr;
+ __s32 error;
+ __u32 error_count;
+ };
+
+The `error` field identifies the type of error using errno values.
+`error_count` tracks the number of errors that have occurred since the
+last notification and were suppressed to preserve the original error
+information.
+
+FID record
+----------
+
+The FID record can be used to uniquely identify the inode that triggered
+the error through the combination of fsid and file handle. A file system
+specific application can use that information to attempt a recovery
+procedure. Errors that are not related to an inode are reported with an
+empty file handle of type FILEID_INVALID.
diff --git a/Documentation/admin-guide/gpio/gpio-sim.rst b/Documentation/admin-guide/gpio/gpio-sim.rst
new file mode 100644
index 000000000000..1cc5567a4bbe
--- /dev/null
+++ b/Documentation/admin-guide/gpio/gpio-sim.rst
@@ -0,0 +1,134 @@
+.. SPDX-License-Identifier: GPL-2.0-or-later
+
+Configfs GPIO Simulator
+=======================
+
+The configfs GPIO Simulator (gpio-sim) provides a way to create simulated GPIO
+chips for testing purposes. The lines exposed by these chips can be accessed
+using the standard GPIO character device interface as well as manipulated
+using sysfs attributes.
+
+Creating simulated chips
+------------------------
+
+The gpio-sim module registers a configfs subsystem called ``'gpio-sim'``. For
+details of the configfs filesystem, please refer to the configfs documentation.
+
+The user can create a hierarchy of configfs groups and items as well as modify
+values of exposed attributes. Once the chip is instantiated, this hierarchy
+will be translated to appropriate device properties. The general structure is:
+
+**Group:** ``/config/gpio-sim``
+
+This is the top directory of the gpio-sim configfs tree.
+
+**Group:** ``/config/gpio-sim/gpio-device``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/dev_name``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/live``
+
+This is a directory representing a GPIO platform device. The ``'dev_name'``
+attribute is read-only and allows the user-space to read the platform device
+name (e.g. ``'gpio-sim.0'``). The ``'live'`` attribute allows to trigger the
+actual creation of the device once it's fully configured. The accepted values
+are: ``'1'`` to enable the simulated device and ``'0'`` to disable and tear
+it down.
+
+**Group:** ``/config/gpio-sim/gpio-device/gpio-bankX``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/gpio-bankX/chip_name``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/gpio-bankX/num_lines``
+
+This group represents a bank of GPIOs under the top platform device. The
+``'chip_name'`` attribute is read-only and allows user-space to read the
+device name of the bank device. The ``'num_lines'`` attribute is used to
+specify the number of lines exposed by this bank.
+
+**Group:** ``/config/gpio-sim/gpio-device/gpio-bankX/lineY``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/gpio-bankX/lineY/name``
+
+This group represents a single line at the offset Y. The 'name' attribute
+is used to set the line name as represented by the 'gpio-line-names' property.
+
+**Item:** ``/config/gpio-sim/gpio-device/gpio-bankX/lineY/hog``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/gpio-bankX/lineY/hog/name``
+
+**Attribute:** ``/config/gpio-sim/gpio-device/gpio-bankX/lineY/hog/direction``
+
+This item makes the gpio-sim module hog the associated line. The ``'name'``
+attribute specifies the in-kernel consumer name to use. The ``'direction'``
+attribute specifies the hog direction and must be one of: ``'input'``,
+``'output-high'`` or ``'output-low'``.
+
+Inside each bank directory, there's a set of attributes that can be used to
+configure the new chip. Additionally the user can ``mkdir()`` subdirectories
+inside the chip's directory that allow passing additional configuration for
+specific lines. The name of those subdirectories must take the form of:
+``'line<offset>'`` (e.g. ``'line0'``, ``'line20'``, etc.) as the name will be
+used by the module to assign the config to the specific line at given offset.
+
+Once the configuration is complete, the ``'live'`` attribute must be set to 1 in
+order to instantiate the chip. It can be set back to 0 to destroy the simulated
+chip. The module will synchronously wait for the new simulated device to be
+successfully probed and if this doesn't happen, writing to ``'live'`` will
+result in an error.
+
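+For example (the device directory name "dev0" is arbitrary), a single
+8-line bank can be created and instantiated like this::
+
+  mkdir -p /config/gpio-sim/dev0/gpio-bank0
+  echo 8 > /config/gpio-sim/dev0/gpio-bank0/num_lines
+  echo 1 > /config/gpio-sim/dev0/live
+  cat /config/gpio-sim/dev0/gpio-bank0/chip_name
+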
+Simulated GPIO chips can also be defined in device-tree. The compatible string
+must be: ``"gpio-simulator"``. Supported properties are:
+
+ ``"gpio-sim,label"`` - chip label
+
+Other standard GPIO properties (like ``"gpio-line-names"``, ``"ngpios"`` or
+``"gpio-hog"``) are also supported. Please refer to the GPIO documentation for
+details.
+
+An example device-tree code defining a GPIO simulator:
+
+.. code-block :: none
+
+ gpio-sim {
+ compatible = "gpio-simulator";
+
+ bank0 {
+ gpio-controller;
+ #gpio-cells = <2>;
+ ngpios = <16>;
+ gpio-sim,label = "dt-bank0";
+ gpio-line-names = "", "sim-foo", "", "sim-bar";
+ };
+
+ bank1 {
+ gpio-controller;
+ #gpio-cells = <2>;
+ ngpios = <8>;
+ gpio-sim,label = "dt-bank1";
+
+ line3 {
+ gpio-hog;
+ gpios = <3 0>;
+ output-high;
+ line-name = "sim-hog-from-dt";
+ };
+ };
+ };
+
+Manipulating simulated lines
+----------------------------
+
+Each simulated GPIO chip creates a separate sysfs group under its device
+directory for each exposed line
+(e.g. ``/sys/devices/platform/gpio-sim.X/gpiochipY/``). The name of each group
+is of the form: ``'sim_gpioX'`` where X is the offset of the line. Inside each
+group there are two attributes:
+
+  ``pull`` - allows reading and setting the current simulated pull setting
+            for every line; when writing, the value must be one of: ``'pull-up'``,
+ ``'pull-down'``
+
+  ``value`` - allows reading the current value of the line, which may be
+ different from the pull if the line is being driven from
+ user-space
diff --git a/Documentation/admin-guide/gpio/index.rst b/Documentation/admin-guide/gpio/index.rst
index 7db367572f30..f6861ca16ffe 100644
--- a/Documentation/admin-guide/gpio/index.rst
+++ b/Documentation/admin-guide/gpio/index.rst
@@ -10,6 +10,7 @@ gpio
gpio-aggregator
sysfs
gpio-mockup
+ gpio-sim
.. only:: subproject and html
diff --git a/Documentation/admin-guide/gpio/sysfs.rst b/Documentation/admin-guide/gpio/sysfs.rst
index ec09ffd983e7..35171d15f78d 100644
--- a/Documentation/admin-guide/gpio/sysfs.rst
+++ b/Documentation/admin-guide/gpio/sysfs.rst
@@ -145,7 +145,7 @@ requested using gpio_request()::
/* export the GPIO to userspace */
int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
- /* reverse gpio_export() */
+ /* reverse gpiod_export() */
void gpiod_unexport(struct gpio_desc *desc);
/* create a sysfs link to an exported GPIO node */
diff --git a/Documentation/admin-guide/hw-vuln/core-scheduling.rst b/Documentation/admin-guide/hw-vuln/core-scheduling.rst
index 0febe458597c..cf1eeefdfc32 100644
--- a/Documentation/admin-guide/hw-vuln/core-scheduling.rst
+++ b/Documentation/admin-guide/hw-vuln/core-scheduling.rst
@@ -61,8 +61,9 @@ arg3:
``pid`` of the task for which the operation applies.
arg4:
- ``pid_type`` for which the operation applies. It is of type ``enum pid_type``.
- For example, if arg4 is ``PIDTYPE_TGID``, then the operation of this command
+ ``pid_type`` for which the operation applies. It is one of
+ ``PR_SCHED_CORE_SCOPE_``-prefixed macro constants. For example, if arg4
+ is ``PR_SCHED_CORE_SCOPE_THREAD_GROUP``, then the operation of this command
will be performed for all tasks in the task group of ``pid``.
arg5:
diff --git a/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst b/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst
new file mode 100644
index 000000000000..875616d675fe
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/cross-thread-rsb.rst
@@ -0,0 +1,91 @@
+
+.. SPDX-License-Identifier: GPL-2.0
+
+Cross-Thread Return Address Predictions
+=======================================
+
+Certain AMD and Hygon processors are subject to a cross-thread return address
+predictions vulnerability. When running in SMT mode and one sibling thread
+transitions out of C0 state, the other sibling thread could use return target
+predictions from the sibling thread that transitioned out of C0.
+
+The Spectre v2 mitigations protect the Linux kernel, as it fills the return
+address prediction entries with safe targets when context switching to the idle
+thread. However, KVM does allow a VMM to prevent exiting guest mode when
+transitioning out of C0. This could result in a guest-controlled return target
+being consumed by the sibling thread.
+
+Affected processors
+-------------------
+
+The following CPUs are vulnerable:
+
+ - AMD Family 17h processors
+ - Hygon Family 18h processors
+
+Related CVEs
+------------
+
+The following CVE entry is related to this issue:
+
+ ============== =======================================
+ CVE-2022-27672 Cross-Thread Return Address Predictions
+ ============== =======================================
+
+Problem
+-------
+
+Affected SMT-capable processors support 1T and 2T modes of execution when SMT
+is enabled. In 2T mode, both threads in a core are executing code. For the
+processor core to enter 1T mode, it is required that one of the threads
+requests to transition out of the C0 state. This can be communicated with the
+HLT instruction or with an MWAIT instruction that requests non-C0.
+When the thread re-enters the C0 state, the processor transitions back
+to 2T mode, assuming the other thread is also still in C0 state.
+
+In affected processors, the return address predictor (RAP) is partitioned
+depending on the SMT mode. For instance, in 2T mode each thread uses a private
+16-entry RAP, but in 1T mode, the active thread uses a 32-entry RAP. Upon
+transition between 1T/2T mode, the RAP contents are not modified but the RAP
+pointers (which control the next return target to use for predictions) may
+change. This behavior may result in return targets from one SMT thread being
+used by RET predictions in the sibling thread following a 1T/2T switch. In
+particular, a RET instruction executed immediately after a transition to 1T may
+use a return target from the thread that just became idle. In theory, this
+could lead to information disclosure if the return targets used do not come
+from trustworthy code.
+
+Attack scenarios
+----------------
+
+An attack can be mounted on affected processors by performing a series of CALL
+instructions with targeted return locations and then transitioning out of C0
+state.
+
+Mitigation mechanism
+--------------------
+
+Before entering idle state, the kernel context switches to the idle thread. The
+context switch fills the RAP entries (referred to as the RSB in Linux) with safe
+targets by performing a sequence of CALL instructions.
+
+The second mitigation prevents a guest VM from directly putting the processor
+into an idle state by intercepting HLT and MWAIT instructions.
+
+Both mitigations are required to fully address this issue.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+Use existing Spectre v2 mitigations that will fill the RSB on context switch.
+
+Mitigation control for KVM - module parameter
+---------------------------------------------
+
+By default, the KVM hypervisor mitigates this issue by intercepting guest
+attempts to transition out of C0. A VMM can use the KVM_CAP_X86_DISABLE_EXITS
+capability to override those interceptions, but since this is not common, the
+mitigation that covers this path is not enabled by default.
+
+The mitigation for the KVM_CAP_X86_DISABLE_EXITS capability can be turned on
+using the boolean module parameter mitigate_smt_rsb, e.g. ``kvm.mitigate_smt_rsb=1``.
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 8cbc711cda93..e0614760a99e 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -17,3 +17,5 @@ are configurable at compile, boot or run time.
special-register-buffer-data-sampling.rst
core-scheduling.rst
l1d_flush.rst
+ processor_mmio_stale_data.rst
+ cross-thread-rsb.rst
diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst
index 2d19c9f4c1fe..48ca0bd85604 100644
--- a/Documentation/admin-guide/hw-vuln/mds.rst
+++ b/Documentation/admin-guide/hw-vuln/mds.rst
@@ -58,14 +58,14 @@ Because the buffers are potentially shared between Hyper-Threads cross
Hyper-Thread attacks are possible.
Deeper technical information is available in the MDS specific x86
-architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
+architecture section: :ref:`Documentation/arch/x86/mds.rst <mds>`.
Attack scenarios
----------------
-Attacks against the MDS vulnerabilities can be mounted from malicious non
-priviledged user space applications running on hosts or guest. Malicious
+Attacks against the MDS vulnerabilities can be mounted from malicious non-
+privileged user space applications running on hosts or guests. Malicious
guest OSes can obviously mount attacks as well.
Contrary to other speculation based vulnerabilities the MDS vulnerability
diff --git a/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
new file mode 100644
index 000000000000..c98fd11907cc
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
@@ -0,0 +1,260 @@
+=========================================
+Processor MMIO Stale Data Vulnerabilities
+=========================================
+
+Processor MMIO Stale Data Vulnerabilities are a class of memory-mapped I/O
+(MMIO) vulnerabilities that can expose data. The sequences of operations for
+exposing data range from simple to very complex. Because most of the
+vulnerabilities require the attacker to have access to MMIO, many environments
+are not affected. System environments using virtualization where MMIO access is
+provided to untrusted guests may need mitigation. These vulnerabilities are
+not transient execution attacks. However, these vulnerabilities may propagate
+stale data into core fill buffers where the data can subsequently be inferred
+by an unmitigated transient execution attack. Mitigation for these
+vulnerabilities includes a combination of microcode update and software
+changes, depending on the platform and usage model. Some of these mitigations
+are similar to those used to mitigate Microarchitectural Data Sampling (MDS) or
+those used to mitigate Special Register Buffer Data Sampling (SRBDS).
+
+Data Propagators
+================
+Propagators are operations that result in stale data being copied or moved from
+one microarchitectural buffer or register to another. Processor MMIO Stale Data
+Vulnerabilities are operations that may result in stale data being directly
+read into an architectural, software-visible state or sampled from a buffer or
+register.
+
+Fill Buffer Stale Data Propagator (FBSDP)
+-----------------------------------------
+Stale data may propagate from fill buffers (FB) into the non-coherent portion
+of the uncore on some non-coherent writes. Fill buffer propagation by itself
+does not make stale data architecturally visible. Stale data must be propagated
+to a location where it is subject to reading or sampling.
+
+Sideband Stale Data Propagator (SSDP)
+-------------------------------------
+The sideband stale data propagator (SSDP) is limited to the client (including
+Intel Xeon server E3) uncore implementation. The sideband response buffer is
+shared by all client cores. For non-coherent reads that go to sideband
+destinations, the uncore logic returns 64 bytes of data to the core, including
+both requested data and unrequested stale data, from a transaction buffer and
+the sideband response buffer. As a result, stale data from the sideband
+response and transaction buffers may now reside in a core fill buffer.
+
+Primary Stale Data Propagator (PSDP)
+------------------------------------
+The primary stale data propagator (PSDP) is limited to the client (including
+Intel Xeon server E3) uncore implementation. Similar to the sideband response
+buffer, the primary response buffer is shared by all client cores. For some
+processors, MMIO primary reads will return 64 bytes of data to the core fill
+buffer including both requested data and unrequested stale data. This is
+similar to the sideband stale data propagator.
+
+Vulnerabilities
+===============
+Device Register Partial Write (DRPW) (CVE-2022-21166)
+-----------------------------------------------------
+Some endpoint MMIO registers incorrectly handle writes that are smaller than
+the register size. Instead of aborting the write or only copying the correct
+subset of bytes (for example, 2 bytes for a 2-byte write), more bytes than
+specified by the write transaction may be written to the register. On
+processors affected by FBSDP, this may expose stale data from the fill buffers
+of the core that created the write transaction.
+
+Shared Buffers Data Sampling (SBDS) (CVE-2022-21125)
+----------------------------------------------------
+After propagators may have moved data around the uncore and copied stale data
+into client core fill buffers, processors affected by MFBDS (Microarchitectural
+Fill Buffer Data Sampling, an MDS variant) can leak data from the fill buffer.
+It is limited to the client (including Intel Xeon server E3)
+uncore implementation.
+
+Shared Buffers Data Read (SBDR) (CVE-2022-21123)
+------------------------------------------------
+It is similar to Shared Buffer Data Sampling (SBDS) except that the data is
+directly read into the architectural software-visible state. It is limited to
+the client (including Intel Xeon server E3) uncore implementation.
+
+Affected Processors
+===================
+Not all CPUs are affected by all variants. For instance, most
+processors for the server market (excluding Intel Xeon E3 processors) are
+impacted by only Device Register Partial Write (DRPW).
+
+Below is the list of affected Intel processors [#f1]_:
+
+ =================== ============ =========
+ Common name Family_Model Steppings
+ =================== ============ =========
+ HASWELL_X 06_3FH 2,4
+ SKYLAKE_L 06_4EH 3
+ BROADWELL_X 06_4FH All
+ SKYLAKE_X 06_55H 3,4,6,7,11
+ BROADWELL_D 06_56H 3,4,5
+ SKYLAKE 06_5EH 3
+ ICELAKE_X 06_6AH 4,5,6
+ ICELAKE_D 06_6CH 1
+ ICELAKE_L 06_7EH 5
+ ATOM_TREMONT_D 06_86H All
+ LAKEFIELD 06_8AH 1
+ KABYLAKE_L 06_8EH 9 to 12
+ ATOM_TREMONT 06_96H 1
+ ATOM_TREMONT_L 06_9CH 0
+ KABYLAKE 06_9EH 9 to 13
+ COMETLAKE 06_A5H 2,3,5
+ COMETLAKE_L 06_A6H 0,1
+ ROCKETLAKE 06_A7H 1
+ =================== ============ =========
+
+If a CPU is in the affected processor list, but not affected by a variant, this
+is indicated by new bits in MSR IA32_ARCH_CAPABILITIES. As described in a later
+section, the mitigation largely remains the same for all the variants, i.e. to
+clear the CPU fill buffers via the VERW instruction.
+
+New bits in MSRs
+================
+Newer processors, and microcode updates on existing affected processors, add
+new bits to the IA32_ARCH_CAPABILITIES MSR. These bits can be used to enumerate
+specific variants of Processor MMIO Stale Data vulnerabilities and mitigation
+capability.
+
+MSR IA32_ARCH_CAPABILITIES
+--------------------------
+Bit 13 - SBDR_SSDP_NO - When set, processor is not affected by either the
+ Shared Buffers Data Read (SBDR) vulnerability or the sideband stale
+ data propagator (SSDP).
+Bit 14 - FBSDP_NO - When set, processor is not affected by the Fill Buffer
+ Stale Data Propagator (FBSDP).
+Bit 15 - PSDP_NO - When set, processor is not affected by Primary Stale Data
+ Propagator (PSDP).
+Bit 17 - FB_CLEAR - When set, VERW instruction will overwrite CPU fill buffer
+ values as part of MD_CLEAR operations. Processors that do not
+ enumerate MDS_NO (meaning they are affected by MDS) but that do
+ enumerate support for both L1D_FLUSH and MD_CLEAR implicitly enumerate
+ FB_CLEAR as part of their MD_CLEAR support.
+Bit 18 - FB_CLEAR_CTRL - Processor supports read and write to MSR
+ IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]. On such processors, the FB_CLEAR_DIS
+ bit can be set to cause the VERW instruction to not perform the
+ FB_CLEAR action. Not all processors that support FB_CLEAR will support
+ FB_CLEAR_CTRL.
+
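+These bits can be inspected on a running system with the rdmsr utility from
+msr-tools (an illustration, assuming the msr kernel module is available;
+0x10a is the IA32_ARCH_CAPABILITIES MSR address)::
+
+  modprobe msr
+  rdmsr 0x10a
+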
+MSR IA32_MCU_OPT_CTRL
+---------------------
+Bit 3 - FB_CLEAR_DIS - When set, VERW instruction does not perform the FB_CLEAR
+action. This may be useful to reduce the performance impact of FB_CLEAR in
+cases where system software deems it warranted (for example, when performance
+is more critical, or the untrusted software has no MMIO access). Note that
+FB_CLEAR_DIS has no impact on enumeration (for example, it does not change
+FB_CLEAR or MD_CLEAR enumeration) and it may not be supported on all processors
+that enumerate FB_CLEAR.
+
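+The current value of this MSR can likewise be read with msr-tools (0x123 is
+the IA32_MCU_OPT_CTRL MSR address; changing FB_CLEAR_DIS is normally left to
+the kernel)::
+
+  rdmsr 0x123
+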
+Mitigation
+==========
+Like MDS, all variants of Processor MMIO Stale Data vulnerabilities have the
+same mitigation strategy to force the CPU to clear the affected buffers before
+an attacker can extract the secrets.
+
+This is achieved by using the otherwise unused and obsolete VERW instruction in
+combination with a microcode update. The microcode clears the affected CPU
+buffers when the VERW instruction is executed.
+
+The kernel reuses the MDS function to invoke the buffer clearing:
+
+ mds_clear_cpu_buffers()
+
+On MDS affected CPUs, the kernel already invokes CPU buffer clear on
+kernel/userspace, hypervisor/guest and C-state (idle) transitions. No
+additional mitigation is needed on such CPUs.
+
+For CPUs not affected by MDS or TAA, mitigation is needed only against
+attackers with MMIO capability. Therefore, VERW is not required on
+kernel/userspace transitions. For the virtualization case, VERW is only needed
+at VMENTER for a guest with MMIO capability.
+
+Mitigation points
+-----------------
+Return to user space
+^^^^^^^^^^^^^^^^^^^^
+Same mitigation as MDS when affected by MDS/TAA, otherwise no mitigation
+needed.
+
+C-State transition
+^^^^^^^^^^^^^^^^^^
+Control register writes by CPU during C-state transition can propagate data
+from fill buffer to uncore buffers. Execute VERW before C-state transition to
+clear CPU fill buffers.
+
+Guest entry point
+^^^^^^^^^^^^^^^^^
+Same mitigation as MDS when processor is also affected by MDS/TAA, otherwise
+execute VERW at VMENTER only for MMIO capable guests. On CPUs not affected by
+MDS/TAA, guest without MMIO access cannot extract secrets using Processor MMIO
+Stale Data vulnerabilities, so there is no need to execute VERW for such guests.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The kernel command line allows controlling the Processor MMIO Stale Data
+mitigations at boot time with the option "mmio_stale_data=". The valid
+arguments for this option are:
+
+ ========== =================================================================
+ full If the CPU is vulnerable, enable mitigation; CPU buffer clearing
+ on exit to userspace and when entering a VM. Idle transitions are
+ protected as well. It does not automatically disable SMT.
+ full,nosmt Same as full, with SMT disabled on vulnerable CPUs. This is the
+ complete mitigation.
+ off Disables mitigation completely.
+ ========== =================================================================
+
+If the CPU is affected and mmio_stale_data=off is not supplied on the kernel
+command line, then the kernel selects the appropriate mitigation.
+
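+For example, to select the complete mitigation, append the option to the boot
+loader configuration (shown for a GRUB-based setup; file locations and update
+commands vary by distribution) and verify it at runtime::
+
+  # e.g. in /etc/default/grub
+  GRUB_CMDLINE_LINUX="... mmio_stale_data=full,nosmt"
+
+  # after reboot
+  cat /proc/cmdline
+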
+Mitigation status information
+-----------------------------
+The Linux kernel provides a sysfs interface to enumerate the current
+vulnerability status of the system: whether the system is vulnerable, and
+which mitigations are active. The relevant sysfs file is:
+
+ /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+
+The possible values in this file are:
+
+ .. list-table::
+
+ * - 'Not affected'
+ - The processor is not vulnerable
+ * - 'Vulnerable'
+ - The processor is vulnerable, but no mitigation enabled
+ * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
+ - The processor is vulnerable, but microcode is not updated. The
+ mitigation is enabled on a best effort basis.
+ * - 'Mitigation: Clear CPU buffers'
+ - The processor is vulnerable and the CPU buffer clearing mitigation is
+ enabled.
+ * - 'Unknown: No mitigations'
+ - The processor vulnerability status is unknown because it is
+       out of its Servicing period. Mitigation is not attempted.
+
+Definitions:
+------------
+
+Servicing period: The process of providing functional and security updates to
+Intel processors or platforms, utilizing the Intel Platform Update (IPU)
+process or other similar mechanisms.
+
+End of Servicing Updates (ESU): ESU is the date at which Intel will no
+longer provide Servicing, such as through IPU or other similar update
+processes. ESU dates will typically be aligned to end of quarter.
+
+If the processor is vulnerable then the following information is appended to
+the above information:
+
+ ======================== ===========================================
+ 'SMT vulnerable' SMT is enabled
+ 'SMT disabled' SMT is disabled
+ 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
+ ======================== ===========================================
+
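+For example (illustrative output only; the actual string depends on the CPU,
+microcode and selected mitigation)::
+
+  $ cat /sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+  Mitigation: Clear CPU buffers; SMT vulnerable
+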
+References
+----------
+.. [#f1] Affected Processors
+ https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index e05e581af5cf..4d186f599d90 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -60,8 +60,8 @@ privileged data touched during the speculative execution.
Spectre variant 1 attacks take advantage of speculative execution of
conditional branches, while Spectre variant 2 attacks use speculative
execution of indirect branches to leak privileged memory.
-See :ref:`[1] <spec_ref1>` :ref:`[5] <spec_ref5>` :ref:`[7] <spec_ref7>`
-:ref:`[10] <spec_ref10>` :ref:`[11] <spec_ref11>`.
+See :ref:`[1] <spec_ref1>` :ref:`[5] <spec_ref5>` :ref:`[6] <spec_ref6>`
+:ref:`[7] <spec_ref7>` :ref:`[10] <spec_ref10>` :ref:`[11] <spec_ref11>`.
Spectre variant 1 (Bounds Check Bypass)
---------------------------------------
@@ -131,6 +131,19 @@ steer its indirect branch speculations to gadget code, and measure the
speculative execution's side effects left in level 1 cache to infer the
victim's data.
+Yet another variant 2 attack vector is for the attacker to poison the
+Branch History Buffer (BHB) to speculatively steer an indirect branch
+to a specific Branch Target Buffer (BTB) entry, even if the entry isn't
+associated with the source address of the indirect branch. Specifically,
+the BHB might be shared across privilege levels even in the presence of
+Enhanced IBRS.
+
+Currently the only known real-world BHB attack vector is via
+unprivileged eBPF. Therefore, it's highly recommended to not enable
+unprivileged eBPF, especially when eIBRS is used (without retpolines).
+For a full mitigation against BHB attacks, it's recommended to use
+retpolines (or eIBRS combined with retpolines).
+
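+Whether unprivileged eBPF is enabled can be checked, and changed until the
+next boot, via sysctl (0 means enabled; 1 disables it until reboot, 2
+disables it but allows re-enabling)::
+
+  # check
+  sysctl kernel.unprivileged_bpf_disabled
+  # disable
+  sysctl -w kernel.unprivileged_bpf_disabled=1
+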
Attack scenarios
----------------
@@ -364,13 +377,15 @@ The possible values in this file are:
- Kernel status:
- ==================================== =================================
- 'Not affected' The processor is not vulnerable
- 'Vulnerable' Vulnerable, no mitigation
- 'Mitigation: Full generic retpoline' Software-focused mitigation
- 'Mitigation: Full AMD retpoline' AMD-specific software mitigation
- 'Mitigation: Enhanced IBRS' Hardware-focused mitigation
- ==================================== =================================
+ ======================================== =================================
+ 'Not affected' The processor is not vulnerable
+ 'Mitigation: None' Vulnerable, no mitigation
+ 'Mitigation: Retpolines' Use Retpoline thunks
+ 'Mitigation: LFENCE' Use LFENCE instructions
+ 'Mitigation: Enhanced IBRS' Hardware-focused mitigation
+ 'Mitigation: Enhanced IBRS + Retpolines' Hardware-focused + Retpolines
+ 'Mitigation: Enhanced IBRS + LFENCE' Hardware-focused + LFENCE
+ ======================================== =================================
- Firmware status: Show if Indirect Branch Restricted Speculation (IBRS) is
used to protect against Spectre variant 2 attacks when calling firmware (x86 only).
@@ -407,6 +422,14 @@ The possible values in this file are:
'RSB filling' Protection of RSB on context switch enabled
============= ===========================================
+ - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status:
+
+ =========================== =======================================================
+ 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled
+ 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable
+ 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB
+ =========================== =======================================================
+
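+   A complete status line combines several of these fields, for example
+   (illustrative output; the actual line depends on CPU and configuration)::
+
+     $ cat /sys/devices/system/cpu/vulnerabilities/spectre_v2
+     Mitigation: Retpolines; IBPB: conditional; IBRS_FW; STIBP: conditional; RSB filling; PBRSB-eIBRS: Not affected
+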
Full mitigation might require a microcode update from the CPU
vendor. When the necessary microcode is not available, the kernel will
report vulnerability.
@@ -456,8 +479,16 @@ Spectre variant 2
On Intel Skylake-era systems the mitigation covers most, but not all,
cases. See :ref:`[3] <spec_ref3>` for more details.
- On CPUs with hardware mitigation for Spectre variant 2 (e.g. Enhanced
- IBRS on x86), retpoline is automatically disabled at run time.
+ On CPUs with hardware mitigation for Spectre variant 2 (e.g. IBRS
+ or enhanced IBRS on x86), retpoline is automatically disabled at run time.
+
+ Systems which support enhanced IBRS (eIBRS) enable IBRS protection once at
+ boot, by setting the IBRS bit, and they're automatically protected against
+ Spectre v2 variant attacks, including cross-thread branch target injections
+ on SMT systems (STIBP). In other words, eIBRS enables STIBP too.
+
+   Legacy IBRS systems clear the IBRS bit on exit to userspace, which also
+   disables the implicit STIBP protection there; STIBP then has to be enabled
+   explicitly (e.g. via prctl()) for user programs that need it.
The retpoline mitigation is turned on by default on vulnerable
CPUs. It can be forced on or off by the administrator
@@ -468,7 +499,7 @@ Spectre variant 2
before invoking any firmware code to prevent Spectre variant 2 exploits
using the firmware.
- Using kernel address space randomization (CONFIG_RANDOMIZE_SLAB=y
+ Using kernel address space randomization (CONFIG_RANDOMIZE_BASE=y
and CONFIG_SLAB_FREELIST_RANDOM=y in the kernel configuration) makes
attacks on the kernel generally more difficult.
@@ -481,18 +512,20 @@ Spectre variant 2
For Spectre variant 2 mitigation, individual user programs
can be compiled with return trampolines for indirect branches.
This protects them from consuming poisoned entries in the branch
- target buffer left by malicious software. Alternatively, the
- programs can disable their indirect branch speculation via prctl()
- (See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
+ target buffer left by malicious software.
+
+ On legacy IBRS systems, at return to userspace, implicit STIBP is disabled
+ because the kernel clears the IBRS bit. In this case, the userspace programs
+ can disable indirect branch speculation via prctl() (See
+ :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
On x86, this will turn on STIBP to guard against attacks from the
sibling thread when the user program is running, and use IBPB to
flush the branch target buffer when switching to/from the program.
Restricting indirect branch speculation on a user program will
also prevent the program from launching a variant 2 attack
- on x86. All sand-boxed SECCOMP programs have indirect branch
- speculation restricted by default. Administrators can change
- that behavior via the kernel command line and sysfs control files.
+ on x86. Administrators can change that behavior via the kernel
+ command line and sysfs control files.
See :ref:`spectre_mitigation_control_command_line`.
Programs that disable their indirect branch speculation will have
@@ -584,71 +617,26 @@ kernel command line.
Specific mitigations can also be selected manually:
- retpoline
- replace indirect branches
- retpoline,generic
- google's original retpoline
- retpoline,amd
- AMD-specific minimal thunk
+ retpoline auto pick between generic,lfence
+ retpoline,generic Retpolines
+ retpoline,lfence LFENCE; indirect branch
+ retpoline,amd alias for retpoline,lfence
+ eibrs Enhanced/Auto IBRS
+ eibrs,retpoline Enhanced/Auto IBRS + Retpolines
+ eibrs,lfence Enhanced/Auto IBRS + LFENCE
+ ibrs use IBRS to protect kernel
Not specifying this option is equivalent to
spectre_v2=auto.
-For user space mitigation:
-
- spectre_v2_user=
-
- [X86] Control mitigation of Spectre variant 2
- (indirect branch speculation) vulnerability between
- user space tasks
-
- on
- Unconditionally enable mitigations. Is
- enforced by spectre_v2=on
-
- off
- Unconditionally disable mitigations. Is
- enforced by spectre_v2=off
-
- prctl
- Indirect branch speculation is enabled,
- but mitigation can be enabled via prctl
- per thread. The mitigation control state
- is inherited on fork.
-
- prctl,ibpb
- Like "prctl" above, but only STIBP is
- controlled per thread. IBPB is issued
- always when switching between different user
- space processes.
-
- seccomp
- Same as "prctl" above, but all seccomp
- threads will enable the mitigation unless
- they explicitly opt out.
-
- seccomp,ibpb
- Like "seccomp" above, but only STIBP is
- controlled per thread. IBPB is issued
- always when switching between different
- user space processes.
-
- auto
- Kernel selects the mitigation depending on
- the available CPU features and vulnerability.
-
- Default mitigation:
- If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
-
- Not specifying this option is equivalent to
- spectre_v2_user=auto.
-
In general the kernel by default selects
reasonable mitigations for the current CPU. To
disable Spectre variant 2 mitigations, boot with
spectre_v2=off. Spectre variant 1 mitigations
cannot be disabled.
+For spectre_v2_user see Documentation/admin-guide/kernel-parameters.txt
+
Mitigation selection guide
--------------------------
@@ -674,9 +662,8 @@ Mitigation selection guide
off by disabling their indirect branch speculation when they are run
(See :ref:`Documentation/userspace-api/spec_ctrl.rst <set_spec_ctrl>`).
This prevents untrusted programs from polluting the branch target
- buffer. All programs running in SECCOMP sandboxes have indirect
- branch speculation restricted by default. This behavior can be
- changed via the kernel command line and sysfs control files. See
+ buffer. This behavior can be changed via the kernel command line
+ and sysfs control files. See
:ref:`spectre_mitigation_control_command_line`.
3. High security mode
@@ -730,7 +717,7 @@ AMD white papers:
.. _spec_ref6:
-[6] `Software techniques for managing speculation on AMD processors <https://developer.amd.com/wp-content/resources/90343-B_SoftwareTechniquesforManagingSpeculation_WP_7-18Update_FNL.pdf>`_.
+[6] `Software techniques for managing speculation on AMD processors <https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf>`_.
ARM white papers:
diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
index 76673affd917..014167ef8dd1 100644
--- a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
+++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
@@ -63,7 +63,7 @@ attacker needs to begin a TSX transaction and raise an asynchronous abort
which in turn potentially leaks data stored in the buffers.
More detailed technical information is available in the TAA specific x86
-architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
+architecture section: :ref:`Documentation/arch/x86/tsx_async_abort.rst <tsx_async_abort>`.
Attack scenarios
diff --git a/Documentation/admin-guide/hw_random.rst b/Documentation/admin-guide/hw_random.rst
index 121de96e395e..d494601717f1 100644
--- a/Documentation/admin-guide/hw_random.rst
+++ b/Documentation/admin-guide/hw_random.rst
@@ -1,6 +1,6 @@
-==========================================================
-Linux support for random number generator in i8xx chipsets
-==========================================================
+=================================
+Hardware random number generators
+=================================
Introduction
============
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index dc00afcabb95..43ea35613dfc 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -35,7 +35,8 @@ problems and bugs in particular.
:maxdepth: 1
reporting-issues
- security-bugs
+ reporting-regressions
+ quickly-build-trimmed-linux
bug-hunting
bug-bisect
tainted-kernels
@@ -55,6 +56,17 @@ ABI will be found here.
sysfs-rules
+This is the beginning of a section with information of interest to
+application developers and system integrators doing analysis of the
+Linux kernel for safety-critical applications. Documents supporting
+analysis of kernel interactions with applications, and of key kernel
+subsystems' expectations, will be found here.
+
+.. toctree::
+ :maxdepth: 1
+
+ workload-tracing
+
The rest of this manual consists of various unordered guides on how to
configure specific aspects of kernel behavior to your liking.
@@ -82,6 +94,7 @@ configure specific aspects of kernel behavior to your liking.
edid
efi-stub
ext4
+ filesystem-monitoring
nfs/index
gpio/index
highuid
@@ -114,6 +127,7 @@ configure specific aspects of kernel behavior to your liking.
svga
syscall-user-dispatch
sysrq
+ thermal/index
thunderbolt
ufs
unicode
diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst
index 9b14b0c2c9c4..609a3201fd4e 100644
--- a/Documentation/admin-guide/iostats.rst
+++ b/Documentation/admin-guide/iostats.rst
@@ -76,7 +76,7 @@ Field 3 -- # of sectors read (unsigned long)
Field 4 -- # of milliseconds spent reading (unsigned int)
This is the total number of milliseconds spent by all reads (as
- measured from __make_request() to end_that_request_last()).
+ measured from blk_mq_alloc_request() to __blk_mq_end_request()).
Field 5 -- # of writes completed (unsigned long)
This is the total number of writes completed successfully.
@@ -89,7 +89,7 @@ Field 7 -- # of sectors written (unsigned long)
Field 8 -- # of milliseconds spent writing (unsigned int)
This is the total number of milliseconds spent by all writes (as
- measured from __make_request() to end_that_request_last()).
+ measured from blk_mq_alloc_request() to __blk_mq_end_request()).
Field 9 -- # of I/Os currently in progress (unsigned int)
The only field that should go to zero. Incremented as requests are
@@ -120,7 +120,7 @@ Field 14 -- # of sectors discarded (unsigned long)
Field 15 -- # of milliseconds spent discarding (unsigned int)
This is the total number of milliseconds spent by all discards (as
- measured from __make_request() to end_that_request_last()).
+ measured from blk_mq_alloc_request() to __blk_mq_end_request()).
Field 16 -- # of flush requests completed
This is the total number of flush requests completed successfully.
diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt
index 82aecdcae8a6..030de95e3e6b 100644
--- a/Documentation/admin-guide/kdump/gdbmacros.txt
+++ b/Documentation/admin-guide/kdump/gdbmacros.txt
@@ -312,10 +312,10 @@ define dmesg
set var $prev_flags = $info->flags
end
- set var $id = ($id + 1) & $id_mask
if ($id == $end_id)
loop_break
end
+ set var $id = ($id + 1) & $id_mask
end
end
document dmesg
diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index cb30ca3df27c..a748e7eb4429 100644
--- a/Documentation/admin-guide/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
@@ -146,9 +146,9 @@ System kernel config options
CONFIG_SYSFS=y
Note that "sysfs file system support" might not appear in the "Pseudo
- filesystems" menu if "Configure standard kernel features (for small
- systems)" is not enabled in "General Setup." In this case, check the
- .config file itself to ensure that sysfs is turned on, as follows::
+ filesystems" menu if "Configure standard kernel features (expert users)"
+ is not enabled in "General Setup." In this case, check the .config file
+ itself to ensure that sysfs is turned on, as follows::
grep 'CONFIG_SYSFS' .config
@@ -533,6 +533,10 @@ the following command::
cp /proc/vmcore <dump-file>
+or use scp to write out the dump file between hosts on a network, e.g::
+
+ scp /proc/vmcore remote_username@remote_ip:<dump-file>
+
You can also use makedumpfile utility to write out the dump file
with specified options to filter out unwanted contents, e.g::
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 3861a25faae1..c18d94fa6470 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -172,7 +172,7 @@ variables.
Offset of the free_list's member. This value is used to compute the number
of free pages.
-Each zone has a free_area structure array called free_area[MAX_ORDER].
+Each zone has a free_area structure array called free_area[MAX_ORDER + 1].
The free_list represents a linked list of free page blocks.
(list_head, next|prev)
@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific
information. Makedumpfile gets the start address of the vmalloc region
from this.
-(zone.free_area, MAX_ORDER)
----------------------------
+(zone.free_area, MAX_ORDER + 1)
+-------------------------------
Free areas descriptor. User-space tools use this value to iterate the
free_area ranges. MAX_ORDER is used by the zone buddy allocator.
@@ -200,7 +200,7 @@ prb
A pointer to the printk ringbuffer (struct printk_ringbuffer). This
may be pointing to the static boot ringbuffer or the dynamically
-allocated ringbuffer, depending on when the the core dump occurred.
+allocated ringbuffer, depending on when the core dump occurred.
Used by user-space tools to read the active kernel log buffer.
printk_rb_static
@@ -494,6 +494,14 @@ architecture which is used to lookup the page-tables for the Virtual
addresses in the higher VA range (refer to ARMv8 ARM document for
more details).
+MODULES_VADDR|MODULES_END|VMALLOC_START|VMALLOC_END|VMEMMAP_START|VMEMMAP_END
+-----------------------------------------------------------------------------
+
+Used to get the correct ranges:
+ MODULES_VADDR ~ MODULES_END-1 : Kernel module space.
+ VMALLOC_START ~ VMALLOC_END-1 : vmalloc() / ioremap() space.
+ VMEMMAP_START ~ VMEMMAP_END-1 : vmemmap region, used for struct page array.
+
arm
===
@@ -587,3 +595,32 @@ X2TLB
-----
Indicates whether the crashed kernel enabled SH extended mode.
+
+RISCV64
+=======
+
+VA_BITS
+-------
+
+The maximum number of bits for virtual addresses. Used to compute the
+virtual memory ranges.
+
+PAGE_OFFSET
+-----------
+
+Indicates the virtual kernel start address of the direct-mapped RAM region.
+
+phys_ram_base
+-------------
+
+Indicates the start physical RAM address.
+
+MODULES_VADDR|MODULES_END|VMALLOC_START|VMALLOC_END|VMEMMAP_START|VMEMMAP_END|KERNEL_LINK_ADDR
+----------------------------------------------------------------------------------------------
+
+Used to get the correct ranges:
+
+ * MODULES_VADDR ~ MODULES_END : Kernel module space.
+ * VMALLOC_START ~ VMALLOC_END : vmalloc() / ioremap() space.
+ * VMEMMAP_START ~ VMEMMAP_END : vmemmap space, used for struct page array.
+ * KERNEL_LINK_ADDR : start address of Kernel link and BPF
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 01ba293a2d70..1ba8f2a44aac 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -99,6 +99,7 @@ parameter is applicable::
ALSA ALSA sound support is enabled.
APIC APIC support is enabled.
APM Advanced Power Management support is enabled.
+ APPARMOR AppArmor support is enabled.
ARM ARM architecture is enabled.
ARM64 ARM64 architecture is enabled.
AX25 Appropriate AX.25 support is enabled.
@@ -108,15 +109,15 @@ parameter is applicable::
DYNAMIC_DEBUG Build in debug messages and enable them at runtime
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
EFI EFI Partitioning (GPT) is enabled
- EIDE EIDE/ATAPI support is enabled.
EVM Extended Verification Module
FB The frame buffer device is enabled.
FTRACE Function tracing enabled.
GCOV GCOV profiling is enabled.
+ HIBERNATION HIBERNATION is enabled.
HW Appropriate hardware is enabled.
+ HYPER_V HYPERV support is enabled.
IA-64 IA-64 architecture is enabled.
IMA Integrity measurement architecture is enabled.
- IOSCHED More than one I/O scheduler is enabled.
IP_PNP IP DHCP, BOOTP, or RARP is enabled.
IPV6 IPv6 support is enabled.
ISAPNP ISA PnP code is enabled.
@@ -127,10 +128,11 @@ parameter is applicable::
KVM Kernel Virtual Machine support is enabled.
LIBATA Libata driver is enabled
LP Printer support is enabled.
+ LOONGARCH LoongArch architecture is enabled.
LOOP Loopback device support is enabled.
M68k M68k architecture is enabled.
These options have more detailed description inside of
- Documentation/m68k/kernel-options.rst.
+ Documentation/arch/m68k/kernel-options.rst.
MDA MDA console support is enabled.
MIPS MIPS architecture is enabled.
MOUSE Appropriate mouse support is enabled.
@@ -140,9 +142,7 @@ parameter is applicable::
NUMA NUMA support is enabled.
NFS Appropriate NFS support is enabled.
OF Devicetree is enabled.
- OSS OSS sound support is enabled.
PV_OPS A paravirtualized kernel is enabled.
- PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
PARISC The PA-RISC architecture is enabled.
PCI PCI bus support is enabled.
PCIE PCI Express support is enabled.
@@ -160,7 +160,6 @@ parameter is applicable::
the Documentation/scsi/ sub-directory.
SECURITY Different security models are enabled.
SELINUX SELinux support is enabled.
- APPARMOR AppArmor support is enabled.
SERIAL Serial support is enabled.
SH SuperH architecture is enabled.
SMP The kernel is an SMP kernel.
@@ -168,7 +167,6 @@ parameter is applicable::
SWSUSP Software suspend (hibernation) is enabled.
SUSPEND System suspend states are enabled.
TPM TPM drivers are enabled.
- TS Appropriate touchscreen support is enabled.
UMS USB Mass Storage support is enabled.
USB USB support is enabled.
USBHID USB Human Interface Device support is enabled.
@@ -177,11 +175,10 @@ parameter is applicable::
VGA The VGA console has been enabled.
VT Virtual terminal support is enabled.
WDT Watchdog support is enabled.
- XT IBM PC/XT MFM hard disk support is enabled.
X86-32 X86-32, aka i386 architecture is enabled.
X86-64 X86-64 architecture is enabled.
More X86-64 boot options can be found in
- Documentation/x86/x86_64/boot-options.rst.
+ Documentation/arch/x86/x86_64/boot-options.rst.
X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
X86_UV SGI UV support is enabled.
XEN Xen support is enabled
@@ -196,10 +193,10 @@ In addition, the following text indicates that the option::
Parameters denoted with BOOT are actually interpreted by the boot
loader, and have no meaning to the kernel directly.
Do not modify the syntax of boot loader parameters without extreme
-need or coordination with <Documentation/x86/boot.rst>.
+need or coordination with <Documentation/arch/x86/boot.rst>.
There are also arch-specific kernel-parameters not documented here.
-See for example <Documentation/x86/x86_64/boot-options.rst>.
+See for example <Documentation/arch/x86/x86_64/boot-options.rst>.
Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
a trailing = on the name of any parameter states that that parameter will
@@ -211,7 +208,7 @@ The number of kernel parameters is not limited, but the length of the
complete command line (parameters including spaces etc.) is limited to
a fixed number of characters. This limit depends on the architecture
and is between 256 and 4096 characters. It is defined in the file
-./include/asm/setup.h as COMMAND_LINE_SIZE.
+./include/uapi/asm-generic/setup.h as COMMAND_LINE_SIZE.
Finally, the [KMG] suffix is commonly described after a number of kernel
parameter values. These 'K', 'M', and 'G' letters represent the _binary_
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 43dc35fe5bc0..9e5bab29685f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -225,14 +225,23 @@
For broken nForce2 BIOS resulting in XT-PIC timer.
acpi_sleep= [HW,ACPI] Sleep options
- Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
- old_ordering, nonvs, sci_force_enable, nobl }
+ Format: { s3_bios, s3_mode, s3_beep, s4_hwsig,
+ s4_nohwsig, old_ordering, nonvs,
+ sci_force_enable, nobl }
See Documentation/power/video.rst for information on
s3_bios and s3_mode.
s3_beep is for debugging; it makes the PC's speaker beep
as soon as the kernel's real-mode entry point is called.
+ s4_hwsig causes the kernel to check the ACPI hardware
+ signature during resume from hibernation, and gracefully
+ refuse to resume if it has changed. This complies with
+ the ACPI specification but not with reality, since
+ Windows does not do this and many laptops do change it
+ on docking. So the default behaviour is to allow resume
+ and simply warn when the signature changes, unless the
+ s4_hwsig option is enabled.
s4_nohwsig prevents ACPI hardware signature from being
- used during resume from hibernation.
+ used (or even warned about) during resume.
old_ordering causes the ACPI 1.0 ordering of the _PTS
control method, with respect to putting devices into
low power states, to be enforced (the ACPI 2.0 ordering
@@ -312,6 +321,8 @@
force_enable - Force enable the IOMMU on platforms known
to be buggy with IOMMU enabled. Use this
option with care.
+ pgtbl_v1 - Use v1 page table for DMA-API (Default).
+ pgtbl_v2 - Use v2 page table for DMA-API.
amd_iommu_dump= [HW,X86-64]
Enable AMD IOMMU driver option to dump the ACPI table
@@ -328,6 +339,29 @@
This mode requires kvm-amd.avic=1.
(Default when IOMMU HW support is present.)
+ amd_pstate= [X86]
+ disable
+ Do not enable amd_pstate as the default
+ scaling driver for the supported processors
+ passive
+ Use amd_pstate with passive mode as a scaling driver.
+ In this mode autonomous selection is disabled.
+			The driver requests a desired performance level, and the
+			platform tries to match it if the request can be satisfied
+			by the guaranteed performance level.
+			active
+			Use the amd_pstate_epp driver instance as the scaling
+			driver. The driver provides a hint to the CPPC firmware
+			indicating whether software wants to bias toward
+			performance (0x0) or energy efficiency (0xff). The CPPC
+			power algorithm will then calculate the runtime workload
+			and adjust the realtime cores' frequency.
+ guided
+ Activate guided autonomous mode. Driver requests minimum and
+ maximum performance level and the platform autonomously
+ selects a performance level in this range and appropriate
+ to the current workload.
+
amijoy.map= [HW,JOY] Amiga joystick support
Map of devices attached to JOY0DAT and JOY1DAT
Format: <a>,<b>
@@ -367,18 +401,16 @@
autoconf= [IPV6]
See Documentation/networking/ipv6.rst.
- show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller
- Limit apic dumping. The parameter defines the maximal
- number of local apics being dumped. Also it is possible
- to set it to "all" by meaning -- no limit here.
- Format: { 1 (default) | 2 | ... | all }.
- The parameter valid if only apic=debug or
- apic=verbose is specified.
- Example: apic=debug show_lapic=all
-
apm= [APM] Advanced Power Management
See header of arch/x86/kernel/apm_32.c.
+ apparmor= [APPARMOR] Disable or enable AppArmor at boot time
+ Format: { "0" | "1" }
+ See security/apparmor/Kconfig help text
+ 0 -- disable.
+ 1 -- enable.
+ Default value is set via kernel config option.
+
arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
Format: <io>,<irq>,<nodeID>
@@ -391,6 +423,12 @@
arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension
support
+ arm64.nosve [ARM64] Unconditionally disable Scalable Vector
+ Extension support
+
+ arm64.nosme [ARM64] Unconditionally disable Scalable Matrix
+ Extension support
+
ataflop= [HW,M68k]
atarimouse= [HW,MOUSE] Atari Mouse
@@ -452,13 +490,21 @@
Format: <io>,<irq>,<mode>
See header of drivers/net/hamradio/baycom_ser_hdx.c.
+ bert_disable [ACPI]
+ Disable BERT OS support on buggy BIOSes.
+
+ bgrt_disable [ACPI][X86]
+ Disable BGRT to avoid flickering OEM logo.
+
blkdevparts= Manual partition parsing of block device(s) for
embedded devices based on command line input.
See Documentation/block/cmdline-partition.rst
boot_delay= Milliseconds to delay each printk during boot.
- Values larger than 10 seconds (10000) are changed to
- no delay (0).
+ Only works if CONFIG_BOOT_PRINTK_DELAY is enabled,
+ and you may also have to specify "lpj=". Boot_delay
+ values larger than 10 seconds (10000) are assumed
+ erroneous and ignored.
Format: integer
bootconfig [KNL]
@@ -467,12 +513,6 @@
See Documentation/admin-guide/bootconfig.rst
- bert_disable [ACPI]
- Disable BERT OS support on buggy BIOSes.
-
- bgrt_disable [ACPI][X86]
- Disable BGRT to avoid flickering OEM logo.
-
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
kernel args too.
@@ -540,8 +580,9 @@
Format: <string>
nosocket -- Disable socket memory accounting.
nokmem -- Disable kernel memory accounting.
+ nobpf -- Disable BPF memory accounting.
- checkreqprot [SELINUX] Set initial checkreqprot flag value.
+ checkreqprot= [SELINUX] Set initial checkreqprot flag value.
Format: { "0" | "1" }
See security/selinux/Kconfig help text.
0 -- check protection applied by kernel (includes
@@ -554,6 +595,25 @@
cio_ignore= [S390]
See Documentation/s390/common_io.rst for details.
+
+ clearcpuid=X[,X...] [X86]
+ Disable CPUID feature X for the kernel. See
+ arch/x86/include/asm/cpufeatures.h for the valid bit
+ numbers X. Note the Linux-specific bits are not necessarily
+			stable across kernel versions, but the vendor-specific
+ ones should be.
+ X can also be a string as appearing in the flags: line
+ in /proc/cpuinfo which does not have the above
+ instability issue. However, not all features have names
+ in /proc/cpuinfo.
+ Note that using this option will taint your kernel.
+ Also note that user programs calling CPUID directly
+ or using the feature without checking anything
+ will still see it. This just prevents it from
+ being used by the kernel or shown in /proc/cpuinfo.
+ Also note the kernel might malfunction if you disable
+ some critical bits.
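+			Example: clearcpuid=avx2 (using a flag name as it
+			appears in /proc/cpuinfo; shown for illustration only).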
+
clk_ignore_unused
[CLK]
Prevents the clock framework from automatically gating
@@ -603,8 +663,8 @@
clocksource.max_cswd_read_retries= [KNL]
Number of clocksource_watchdog() retries due to
external delays before the clock will be marked
- unstable. Defaults to three retries, that is,
- four attempts to read the clock under test.
+ unstable. Defaults to two retries, that is,
+ three attempts to read the clock under test.
clocksource.verify_n_cpus= [KNL]
Limit the number of CPUs checked for clocksources
@@ -622,19 +682,6 @@
Defaults to zero when built as a module and to
10 seconds when built into the kernel.
- clearcpuid=BITNUM[,BITNUM...] [X86]
- Disable CPUID feature X for the kernel. See
- arch/x86/include/asm/cpufeatures.h for the valid bit
- numbers. Note the Linux specific bits are not necessarily
- stable over kernel options, but the vendor specific
- ones should be.
- Also note that user programs calling CPUID directly
- or using the feature without checking anything
- will still see it. This just prevents it from
- being used by the kernel or shown in /proc/cpuinfo.
- Also note the kernel might malfunction if you disable
- some critical bits.
-
cma=nn[MG]@[start[MG][-end[MG]]]
[KNL,CMA]
Sets the size of kernel global memory area for
@@ -649,7 +696,7 @@
Sets the size of kernel per-numa memory area for
contiguous memory allocations. A value of 0 disables
per-numa CMA altogether. And If this option is not
- specificed, the default value is 0.
+ specified, the default value is 0.
With per-numa CMA enabled, DMA users on node nid will
first try to allocate buffer from the pernuma area
which is located in node nid, if the allocation fails,
@@ -680,6 +727,17 @@
condev= [HW,S390] console device
conmode=
+ con3215_drop= [S390] 3215 console drop mode.
+ Format: y|n|Y|N|1|0
+ When set to true, drop data on the 3215 console when
+ the console buffer is full. In this case the
+ operator using a 3270 terminal emulator (for example
+ x3270) does not have to enter the clear key for the
+ console output to advance and the kernel to continue.
+ This leads to a much faster boot time when a 3270
+ terminal emulator is active. If no 3270 terminal
+ emulator is used, this parameter has no effect.
+
console= [KNL] Output console device and options.
tty<n> Use the virtual console device <n>.
@@ -715,6 +773,12 @@
hvc<n> Use the hypervisor console device <n>. This is for
both Xen and PowerPC hypervisors.
+ { null | "" }
+ Use to disable console output, i.e., to have kernel
+ console messages discarded.
+ This must be the only console= parameter used on the
+ kernel command line.
+
If the device connected to the port is not a TTY but a braille
device, prepend "brl," before the device type, for instance
console=brl,ttyS0
@@ -750,6 +814,24 @@
0: default value, disable debugging
1: enable debugging at boot time
+ cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
+ Format:
+ <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
+
+ cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
+ CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
+ Some features depend on CPU0. Known dependencies are:
+ 1. Resume from suspend/hibernate depends on CPU0.
+ Suspend/hibernate will fail if CPU0 is offline and you
+ need to online CPU0 before suspend/hibernate.
+ 2. PIC interrupts also depend on CPU0. CPU0 can't be
+ removed if a PIC interrupt is detected.
+			Poweroff/reboot is reported to depend on CPU0 on some
+			machines, although such issues have not been observed
+			so far on a few tested machines with CPU0 offline.
+ If the dependencies are under your control, you can
+ turn on cpu0_hotplug.
+
cpuidle.off=1 [CPU_IDLE]
disable the cpuidle sub-system
@@ -770,9 +852,13 @@
on every CPU online, such as boot, and resume from suspend.
Default: 10000
- cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
- Format:
- <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
+ crash_kexec_post_notifiers
+ Run kdump after running panic-notifiers and dumping
+			kmsg. This is only for users who doubt that kdump always
+			succeeds in any situation.
+			Note that this also increases the risk of kdump failure,
+ because some panic notifiers can make the crashed
+ kernel more unstable.
crashkernel=size[KMG][@offset[KMG]]
[KNL] Using kexec, Linux can switch to a 'crash kernel'
@@ -780,7 +866,7 @@
memory region [offset, offset + size] for that kernel
image. If '@offset' is omitted, then a suitable offset
is selected automatically.
- [KNL, X86-64] Select a region under 4G first, and
+ [KNL, X86-64, ARM64] Select a region under 4G first, and
fall back to reserve region above 4G when '@offset'
hasn't been specified.
See Documentation/admin-guide/kdump/kdump.rst for further details.
@@ -793,22 +879,25 @@
Documentation/admin-guide/kdump/kdump.rst for an example.
crashkernel=size[KMG],high
- [KNL, X86-64] range could be above 4G. Allow kernel
+ [KNL, X86-64, ARM64] range could be above 4G. Allow kernel
to allocate physical memory region from top, so could
be above 4G if system have more than 4G ram installed.
Otherwise memory region will be allocated below 4G, if
available.
It will be ignored if crashkernel=X is specified.
crashkernel=size[KMG],low
- [KNL, X86-64] range under 4G. When crashkernel=X,high
+ [KNL, X86-64, ARM64] range under 4G. When crashkernel=X,high
is passed, kernel could allocate physical memory region
above 4G, that cause second kernel crash on system
that require some amount of low memory, e.g. swiotlb
requires at least 64M+32K low memory, also enough extra
low memory is needed to make sure DMA buffers for 32-bit
- devices won't run out. Kernel would try to allocate at
- at least 256M below 4G automatically.
- This one let user to specify own low range under 4G
+			devices won't run out. The kernel will try to allocate a
+			default amount of memory below 4G automatically. The default
+ size is platform dependent.
+ --> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB)
+ --> arm64: 128MiB
+ This one lets the user specify own low range under 4G
for second kernel instead.
0: to disable low allocation.
It will be ignored when crashkernel=X,high is not used
@@ -823,15 +912,14 @@
cs89x0_media= [HW,NET]
Format: { rj45 | aui | bnc }
- csdlock_debug= [KNL] Enable debug add-ons of cross-CPU function call
- handling. When switched on, additional debug data is
- printed to the console in case a hanging CPU is
- detected, and that CPU is pinged again in order to try
- to resolve the hang situation.
- 0: disable csdlock debugging (default)
- 1: enable basic csdlock debugging (minor impact)
- ext: enable extended csdlock debugging (more impact,
- but more data)
+ csdlock_debug= [KNL] Enable or disable debug add-ons of cross-CPU
+ function call handling. When switched on,
+ additional debug data is printed to the console
+ in case a hanging CPU is detected, and that
+ CPU is pinged again in order to try to resolve
+ the hang situation. The default value of this
+ option depends on the CSD_LOCK_WAIT_DEBUG_DEFAULT
+ Kconfig option.
dasd= [HW,NET]
See header of drivers/s390/block/dasd_devmap.c.
@@ -841,11 +929,6 @@
Format: <port#>,<type>
See also Documentation/input/devices/joystick-parport.rst
- ddebug_query= [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
- time. See
- Documentation/admin-guide/dynamic-debug-howto.rst for
- details. Deprecated, see dyndbg.
-
debug [KNL] Enable kernel debugging (events log level).
debug_boot_weak_hash
@@ -868,9 +951,6 @@
debug_objects [KNL] Enable object debugging
- no_debug_objects
- [KNL] Disable object debugging
-
debug_guardpage_minorder=
[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
parameter allows control of the order of pages that will
@@ -884,7 +964,7 @@
driver code when a CPU writes to (or reads from) a
random memory location. Note that there exists a class
of memory corruptions problems caused by buggy H/W or
- F/W or by drivers badly programing DMA (basically when
+ F/W or by drivers badly programming DMA (basically when
memory is written at bus level and the CPU MMU is
bypassed) which are not detectable by
CONFIG_DEBUG_PAGEALLOC, hence this option will not help
@@ -916,10 +996,6 @@
debugpat [X86] Enable PAT debugging
- decnet.addr= [HW,NET]
- Format: <area>[,<node>]
- See also Documentation/networking/decnet.rst.
-
default_hugepagesz=
[HW] The size of the default HugeTLB page. This is
the size represented by the legacy /proc/ hugepages
@@ -935,11 +1011,39 @@
[KNL] Debugging option to set a timeout in seconds for
deferred probe to give up waiting on dependencies to
probe. Only specific dependencies (subsystems or
- drivers) that have opted in will be ignored. A timeout of 0
- will timeout at the end of initcalls. This option will also
+			drivers) that have opted in will be ignored. A timeout
+			of 0 expires at the end of initcalls. If the timeout
+			hasn't expired, it is restarted by each
+			successful driver registration. This option will also
dump out devices still on the deferred probe list after
retrying.
+ delayacct [KNL] Enable per-task delay accounting
+
+ dell_smm_hwmon.ignore_dmi=
+ [HW] Continue probing hardware even if DMI data
+ indicates that the driver is running on unsupported
+ hardware.
+
+ dell_smm_hwmon.force=
+ [HW] Activate driver even if SMM BIOS signature does
+ not match list of supported models and enable otherwise
+ blacklisted features.
+
+ dell_smm_hwmon.power_status=
+ [HW] Report power status in /proc/i8k
+ (disabled by default).
+
+ dell_smm_hwmon.restricted=
+ [HW] Allow controlling fans only if SYS_ADMIN
+ capability is set.
+
+ dell_smm_hwmon.fan_mult=
+ [HW] Factor to multiply fan speed with.
+
+ dell_smm_hwmon.fan_max=
+ [HW] Maximum configurable fan speed.
+
dfltcc= [HW,S390]
Format: { on | off | def_only | inf_only | always }
on: s390 zlib hardware support for compression on
@@ -961,32 +1065,12 @@
can be useful when debugging issues that require an SLB
miss to occur.
- stress_slb [PPC]
- Limits the number of kernel SLB entries, and flushes
- them frequently to increase the rate of SLB faults
- on kernel addresses.
-
disable= [IPV6]
See Documentation/networking/ipv6.rst.
- hardened_usercopy=
- [KNL] Under CONFIG_HARDENED_USERCOPY, whether
- hardening is enabled for this boot. Hardened
- usercopy checking is used to protect the kernel
- from reading or writing beyond known memory
- allocation boundaries as a proactive defense
- against bounds-checking flaws in the kernel's
- copy_to_user()/copy_from_user() interface.
- on Perform hardened usercopy checks (default).
- off Disable hardened usercopy checks.
-
disable_radix [PPC]
Disable RADIX MMU mode on POWER9
- radix_hcall_invalidate=on [PPC/PSERIES]
- Disable RADIX GTSE feature and use hcall for TLB
- invalidate.
-
disable_tlbie [PPC]
Disable TLBIE instruction. Currently does not work
with KVM, with HASH MMU, or with coherent accelerators.
@@ -1042,7 +1126,10 @@
driver later using sysfs.
driver_async_probe= [KNL]
- List of driver names to be probed asynchronously.
+			List of driver names to be probed asynchronously. A '*'
+			matches all driver names. If '*' is specified, the rest
+			of the listed driver names are exceptions that will NOT
+			be probed asynchronously.
Format: <driver_name1>,<driver_name2>...
drm.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
@@ -1085,12 +1172,6 @@
Documentation/admin-guide/dynamic-debug-howto.rst
for details.
- nopku [X86] Disable Memory Protection Keys CPU feature found
- in some Intel CPUs.
-
- <module>.async_probe [KNL]
- Enable asynchronous probe on this module.
-
early_ioremap_debug [KNL]
Enable debug messages in early_ioremap support. This
is useful for tracking down temporary early mappings
@@ -1110,10 +1191,10 @@
specified, the serial port must already be setup and
configured.
- uart[8250],io,<addr>[,options]
- uart[8250],mmio,<addr>[,options]
- uart[8250],mmio32,<addr>[,options]
- uart[8250],mmio32be,<addr>[,options]
+ uart[8250],io,<addr>[,options[,uartclk]]
+ uart[8250],mmio,<addr>[,options[,uartclk]]
+ uart[8250],mmio32,<addr>[,options[,uartclk]]
+ uart[8250],mmio32be,<addr>[,options[,uartclk]]
uart[8250],0x<addr>[,options]
Start an early, polled-mode console on the 8250/16550
UART at the specified I/O port or MMIO address.
@@ -1122,7 +1203,9 @@
If none of [io|mmio|mmio32|mmio32be], <addr> is assumed
to be equivalent to 'mmio'. 'options' are specified
in the same format described for "console=ttyS<n>"; if
- unspecified, the h/w is not initialized.
+ unspecified, the h/w is not initialized. 'uartclk' is
+ the uart clock frequency; if unspecified, it is set
+ to 'BASE_BAUD' * 16.
pl011,<addr>
pl011,mmio32,<addr>
@@ -1248,7 +1331,7 @@
Append ",keep" to not disable it when the real console
takes over.
- Only one of vga, efi, serial, or usb debug port can
+ Only one of vga, serial, or usb debug port can
be used at a time.
Currently only ttyS0 and ttyS1 may be specified by
@@ -1263,7 +1346,7 @@
Interaction with the standard serial driver is not
very good.
- The VGA and EFI output is eventually overwritten by
+ The VGA output is eventually overwritten by
the real console.
The xen option can only be used in Xen domains.
@@ -1282,17 +1365,6 @@
force: enforce the use of EDAC to report H/W event.
default: on.
- ekgdboc= [X86,KGDB] Allow early kernel console debugging
- ekgdboc=kbd
-
- This is designed to be used in conjunction with
- the boot argument: earlyprintk=vga
-
- This parameter works in place of the kgdboc parameter
- but can only be used if the backing tty is available
- very early in the boot process. For early debugging
- via a serial port see kgdboc_earlycon instead.
-
edd= [EDD]
Format: {"off" | "on" | "skip[mbr]"}
@@ -1354,6 +1426,17 @@
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
+ ekgdboc= [X86,KGDB] Allow early kernel console debugging
+ Format: ekgdboc=kbd
+
+ This is designed to be used in conjunction with
+ the boot argument: earlyprintk=vga
+
+ This parameter works in place of the kgdboc parameter
+ but can only be used if the backing tty is available
+ very early in the boot process. For early debugging
+ via a serial port see kgdboc_earlycon instead.
+
elanfreq= [X86-32]
See comment before function elanfreq_setup() in
arch/x86/kernel/cpu/cpufreq/elanfreq.c.
@@ -1375,7 +1458,7 @@
(in particular on some ATI chipsets).
The kernel tries to set a reasonable default.
- enforcing [SELINUX] Set initial enforcing status.
+ enforcing= [SELINUX] Set initial enforcing status.
Format: {"0" | "1"}
See security/selinux/Kconfig help text.
0 -- permissive (log only, no denials).
@@ -1397,6 +1480,14 @@
Permit 'security.evm' to be updated regardless of
current integrity status.
+ early_page_ext [KNL] Enforces page_ext initialization to earlier
+ stages so that it covers more early boot allocations.
+ Please note that as side effect some optimizations
+ might be disabled to achieve that (e.g. parallelized
+ memory initialization is disabled) so the boot process
+ might take longer, especially on systems with a lot of
+ memory. Available with CONFIG_PAGE_EXTENSION=y.
+
failslab=
fail_usercopy=
fail_page_alloc=
@@ -1431,6 +1522,23 @@
as early as possible in order to facilitate early
boot debugging.
+ ftrace_boot_snapshot
+ [FTRACE] On boot up, a snapshot will be taken of the
+ ftrace ring buffer that can be read at:
+ /sys/kernel/tracing/snapshot.
+ This is useful if you need tracing information from kernel
+ boot up that is likely to be overwritten by user space
+ start up functionality.
+
+ Optionally, the snapshot can also be defined for a tracing
+ instance that was created by the trace_instance= command
+ line parameter.
+
+ trace_instance=foo,sched_switch ftrace_boot_snapshot=foo
+
+ The above will cause the "foo" tracing instance to trigger
+ a snapshot at the end of boot up.
+
ftrace_dump_on_oops[=orig_cpu]
[FTRACE] will dump the trace buffers on oops.
If no parameter is passed, ftrace will dump
@@ -1493,6 +1601,20 @@
dependencies. This only applies for fw_devlink=on|rpm.
Format: <bool>
+ fw_devlink.sync_state=
+ [KNL] When all devices that could probe have finished
+ probing, this parameter controls what to do with
+ devices that haven't yet received their sync_state()
+ calls.
+ Format: { strict | timeout }
+ strict -- Default. Continue waiting on consumers to
+ probe successfully.
+ timeout -- Give up waiting on consumers and call
+ sync_state() on any devices that haven't yet
+ received their sync_state() calls after
+ deferred_probe_timeout has expired or by
+ late_initcall() if !CONFIG_MODULES.
+
gamecon.map[2|3]=
[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
support via parallel port (up to 5 devices per port)
@@ -1544,6 +1666,17 @@
Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
Default: 1024
+ hardened_usercopy=
+ [KNL] Under CONFIG_HARDENED_USERCOPY, whether
+ hardening is enabled for this boot. Hardened
+ usercopy checking is used to protect the kernel
+ from reading or writing beyond known memory
+ allocation boundaries as a proactive defense
+ against bounds-checking flaws in the kernel's
+ copy_to_user()/copy_from_user() interface.
+ on Perform hardened usercopy checks (default).
+ off Disable hardened usercopy checks.
+
hardlockup_all_cpu_backtrace=
[KNL] Should the hard-lockup detector generate
backtraces on all cpus.
@@ -1564,6 +1697,15 @@
corresponding firmware-first mode error processing
logic will be disabled.
+ hibernate= [HIBERNATION]
+ noresume Don't check if there's a hibernation image
+ present during boot.
+ nocompress Don't compress/decompress hibernation images.
+ no Disable hibernation and resume.
+ protect_image Turn on image protection during restoration
+ (that will set all pages holding image data
+ during restoration read-only).
+
highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
size of <nn>. This works even on boxes that have no
highmem otherwise. This also works to reduce highmem
@@ -1575,6 +1717,19 @@
hlt [BUGS=ARM,SH]
+ hostname= [KNL] Set the hostname (aka UTS nodename).
+ Format: <string>
+ This allows setting the system's hostname during early
+ startup. This sets the name returned by gethostname.
+ Using this parameter to set the hostname makes it
+ possible to ensure the hostname is correctly set before
+ any userspace processes run, avoiding the possibility
+ that a process may call gethostname before the hostname
+ has been explicitly set, resulting in the calling
+ process getting an incorrect result. The string must
+ not exceed the maximum allowed hostname length (usually
+ 64 characters) and will be truncated otherwise.
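+
+ Example (name chosen for illustration):
+ hostname=buildhost1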
+
hpet= [X86-32,HPET] option to control HPET usage
Format: { enable (default) | disable | force |
verbose }
@@ -1586,22 +1741,16 @@
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
- hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation
- of gigantic hugepages.
- Format: nn[KMGTPE]
-
- Reserve a CMA area of given size and allocate gigantic
- hugepages using the CMA allocator. If enabled, the
- boot-time allocation of gigantic hugepages is skipped.
-
hugepages= [HW] Number of HugeTLB pages to allocate at boot.
If this follows hugepagesz (below), it specifies
the number of pages of hugepagesz to be allocated.
If this is the first HugeTLB parameter on the command
line, it specifies the number of pages to allocate for
- the default huge page size. See also
- Documentation/admin-guide/mm/hugetlbpage.rst.
- Format: <integer>
+ the default huge page size. If using node format, the
+ number of pages to allocate per-node can be specified.
+ See also Documentation/admin-guide/mm/hugetlbpage.rst.
+ Format: <integer> or (node format)
+ <node>:<integer>[,<node>:<integer>]
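+
+ For example (illustrative counts), hugepages=0:2,1:4
+ allocates 2 pages on node 0 and 4 pages on node 1.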
hugepagesz=
[HW] The size of the HugeTLB pages. This is used in
@@ -1613,22 +1762,35 @@
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]
+ hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation
+ of gigantic hugepages. Or using node format, the size
+ of a CMA area per node can be specified.
+ Format: nn[KMGTPE] or (node format)
+ <node>:nn[KMGTPE][,<node>:nn[KMGTPE]]
+
+ Reserve a CMA area of given size and allocate gigantic
+ hugepages using the CMA allocator. If enabled, the
+ boot-time allocation of gigantic hugepages is skipped.
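+
+ For example (illustrative sizes), hugetlb_cma=0:1G,1:2G
+ reserves a 1G CMA area on node 0 and a 2G CMA area on
+ node 1.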
+
hugetlb_free_vmemmap=
- [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+ [KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
enabled.
+ Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
Allows heavy hugetlb users to free up some more
- memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+ memory (7 * PAGE_SIZE for each 2MB hugetlb page).
Format: { on | off (default) }
- on: enable the feature
- off: disable the feature
+ on: enable HVO
+ off: disable HVO
- Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y,
+ Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
the default is on.
- This is not compatible with memory_hotplug.memmap_on_memory.
- If both parameters are enabled, hugetlb_free_vmemmap takes
- precedence over memory_hotplug.memmap_on_memory.
+ Note that the vmemmap pages may be allocated from the added
+ memory block itself when memory_hotplug.memmap_on_memory is
+ enabled; those vmemmap pages cannot be optimized even if this
+ feature is enabled. Other vmemmap pages not allocated from
+ the added memory block itself are not affected.
hung_task_panic=
[KNL] Should the hung task detector generate panics.
@@ -1650,12 +1812,6 @@
which allow the hypervisor to 'idle' the
guest on lock contention.
- keep_bootcon [KNL]
- Do not unregister boot console at start. This is only
- useful for debugging when something happens in the window
- between unregistering the boot console and initializing
- the real console.
-
i2c_bus= [HW] Override the default board specific I2C bus speed
or register an additional I2C bus that is not
registered from board initialization code.
@@ -1690,20 +1846,11 @@
architectures force reset to be always executed
i8042.unlock [HW] Unlock (ignore) the keylock
i8042.kbdreset [HW] Reset device connected to KBD port
+ i8042.probe_defer
+ [HW] Allow deferred probing upon i8042 probe errors
i810= [HW,DRM]
- i8k.ignore_dmi [HW] Continue probing hardware even if DMI data
- indicates that the driver is running on unsupported
- hardware.
- i8k.force [HW] Activate i8k driver even if SMM BIOS signature
- does not match list of supported models.
- i8k.power_status
- [HW] Report power status in /proc/i8k
- (disabled by default)
- i8k.restricted [HW] Allow controlling fans only if SYS_ADMIN
- capability is set.
-
i915.invert_brightness=
[DRM] Invert the sense of the variable that is used to
set the brightness of the panel backlight. Normally a
@@ -1721,26 +1868,6 @@
icn= [HW,ISDN]
Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
- ide-core.nodma= [HW] (E)IDE subsystem
- Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc
- .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr
- .cdrom .chs .ignore_cable are additional options
- See Documentation/ide/ide.rst.
-
- ide-generic.probe-mask= [HW] (E)IDE subsystem
- Format: <int>
- Probe mask for legacy ISA IDE ports. Depending on
- platform up to 6 ports are supported, enabled by
- setting corresponding bits in the mask to 1. The
- default value is 0x0, which has a special meaning.
- On systems that have PCI, it triggers scanning the
- PCI bus for the first and the second port, which
- are then probed. On systems without PCI the value
- of 0x0 enables probing the two first ports as if it
- was 0x3.
-
- ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
- Claim all unknown PCI IDE storage controllers.
idle= [X86]
Format: idle=poll, idle=halt, idle=nomwait
@@ -1866,7 +1993,8 @@
ima_template= [IMA]
Select one of defined IMA measurements template formats.
- Formats: { "ima" | "ima-ng" | "ima-sig" }
+ Formats: { "ima" | "ima-ng" | "ima-ngv2" | "ima-sig" |
+ "ima-sigv2" }
Default: "ima-ng"
ima_template_fmt=
@@ -2198,39 +2326,73 @@
ivrs_ioapic [HW,X86-64]
Provide an override to the IOAPIC-ID<->DEVICE-ID
- mapping provided in the IVRS ACPI table. For
- example, to map IOAPIC-ID decimal 10 to
- PCI device 00:14.0 write the parameter as:
+ mapping provided in the IVRS ACPI table.
+ By default, PCI segment is 0, and can be omitted.
+
+ For example, to map IOAPIC-ID decimal 10 to
+ PCI segment 0x1 and PCI device 00:14.0,
+ write the parameter as:
+ ivrs_ioapic=10@0001:00:14.0
+
+ Deprecated formats:
+ * To map IOAPIC-ID decimal 10 to PCI device 00:14.0
+ write the parameter as:
ivrs_ioapic[10]=00:14.0
+ * To map IOAPIC-ID decimal 10 to PCI segment 0x1 and
+ PCI device 00:14.0 write the parameter as:
+ ivrs_ioapic[10]=0001:00:14.0
ivrs_hpet [HW,X86-64]
Provide an override to the HPET-ID<->DEVICE-ID
- mapping provided in the IVRS ACPI table. For
- example, to map HPET-ID decimal 0 to
- PCI device 00:14.0 write the parameter as:
+ mapping provided in the IVRS ACPI table.
+ By default, PCI segment is 0, and can be omitted.
+
+ For example, to map HPET-ID decimal 10 to
+ PCI segment 0x1 and PCI device 00:14.0,
+ write the parameter as:
+ ivrs_hpet=10@0001:00:14.0
+
+ Deprecated formats:
+ * To map HPET-ID decimal 0 to PCI device 00:14.0
+ write the parameter as:
ivrs_hpet[0]=00:14.0
+ * To map HPET-ID decimal 10 to PCI segment 0x1 and
+ PCI device 00:14.0 write the parameter as:
+ ivrs_hpet[10]=0001:00:14.0
ivrs_acpihid [HW,X86-64]
Provide an override to the ACPI-HID:UID<->DEVICE-ID
- mapping provided in the IVRS ACPI table. For
- example, to map UART-HID:UID AMD0020:0 to
- PCI device 00:14.5 write the parameter as:
+ mapping provided in the IVRS ACPI table.
+ By default, PCI segment is 0, and can be omitted.
+
+ For example, to map UART-HID:UID AMD0020:0 to
+ PCI segment 0x1 and PCI device ID 00:14.5,
+ write the parameter as:
+ ivrs_acpihid=AMD0020:0@0001:00:14.5
+
+ Deprecated formats:
+ * To map UART-HID:UID AMD0020:0 to PCI segment 0 and
+ PCI device ID 00:14.5, write the parameter as:
ivrs_acpihid[00:14.5]=AMD0020:0
+ * To map UART-HID:UID AMD0020:0 to PCI segment 0x1 and
+ PCI device ID 00:14.5, write the parameter as:
+ ivrs_acpihid[0001:00:14.5]=AMD0020:0
js= [HW,JOY] Analog joystick
See Documentation/input/joydev/joystick.rst.
- nokaslr [KNL]
- When CONFIG_RANDOMIZE_BASE is set, this disables
- kernel and module base offset ASLR (Address Space
- Layout Randomization).
-
kasan_multi_shot
[KNL] Enforce KASAN (Kernel Address Sanitizer) to print
report on every invalid memory access. Without this
parameter KASAN will print report only for the first
invalid access.
+ keep_bootcon [KNL]
+ Do not unregister boot console at start. This is only
+ useful for debugging when something happens in the window
+ between unregistering the boot console and initializing
+ the real console.
+
keepinitrd [HW,ARM]
kernelcore= [KNL,X86,IA-64,PPC]
@@ -2326,16 +2488,43 @@
0: force disabled
1: force enabled
+ kunit.enable= [KUNIT] Enable executing KUnit tests. Requires
+ CONFIG_KUNIT to be set to be fully enabled. The
+ default value can be overridden via
+ KUNIT_DEFAULT_ENABLED.
+ Default is 1 (enabled)
+
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
Default is 0 (don't ignore, but inject #GP)
+ kvm.eager_page_split=
+ [KVM,X86] Controls whether or not KVM will try to
+ proactively split all huge pages during dirty logging.
+ Eager page splitting reduces interruptions to vCPU
+ execution by eliminating the write-protection faults
+ and MMU lock contention that would otherwise be
+ required to split huge pages lazily.
+
+ VM workloads that rarely perform writes or that write
+ only to a small region of VM memory may benefit from
+ disabling eager page splitting to allow huge pages to
+ still be used for reads.
+
+ The behavior of eager page splitting depends on whether
+ KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
+ disabled, all huge pages in a memslot will be eagerly
+ split when dirty logging is enabled on that memslot. If
+ enabled, eager page splitting will be performed during
+ the KVM_CLEAR_DIRTY ioctl, and only for the pages being
+ cleared.
+
+ Eager page splitting is only supported when kvm.tdp_mmu=Y.
+
+ Default is Y (on).
+
kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
Default is false (don't support).
- kvm.mmu_audit= [KVM] This is a R/W parameter which allows audit
- KVM MMU at runtime.
- Default is 0 (off)
-
kvm.nx_huge_pages=
[KVM] Controls the software workaround for the
X86_BUG_ITLB_MULTIHIT bug.
@@ -2353,7 +2542,14 @@
[KVM] Controls how many 4KiB pages are periodically zapped
back to huge pages. 0 disables the recovery, otherwise if
the value is N KVM will zap 1/Nth of the 4KiB pages every
- minute. The default is 60.
+ period (see below). The default is 60.
+
+ kvm.nx_huge_pages_recovery_period_ms=
+ [KVM] Controls the time period at which KVM zaps 4KiB pages
+ back to huge pages. If the value is a non-zero N, KVM will
+ zap a portion (see ratio above) of the pages every N msecs.
+ If the value is 0 (the default), KVM will pick a period based
+ on the ratio, such that a page is zapped after 1 hour on average.
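+
+ As an illustrative combination with the ratio parameter
+ above, kvm.nx_huge_pages_recovery_ratio=10
+ kvm.nx_huge_pages_recovery_period_ms=1000 would zap
+ 1/10th of the 4KiB pages every second.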
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)
@@ -2365,14 +2561,22 @@
kvm-arm.mode=
[KVM,ARM] Select one of KVM/arm64's modes of operation.
+ none: Forcefully disable KVM.
+
nvhe: Standard nVHE-based mode, without support for
protected guests.
protected: nVHE-based mode with support for guests whose
state is kept private from the host.
- Not valid if the kernel is running in EL2.
- Defaults to VHE/nVHE based on hardware support.
+ nested: VHE-based mode with support for nested
+ virtualization. Requires at least ARMv8.3
+ hardware.
+
+ Defaults to VHE/nVHE based on hardware support. Setting
+ mode to "protected" will disable kexec and hibernation
+ for the host. "nested" is experimental and should be
+ used with extreme caution.
kvm-arm.vgic_v3_group0_trap=
[KVM,ARM] Trap guest accesses to GICv3 group-0
@@ -2403,8 +2607,12 @@
Default is 1 (enabled)
kvm-intel.emulate_invalid_guest_state=
- [KVM,Intel] Enable emulation of invalid guest states
- Default is 0 (disabled)
+ [KVM,Intel] Control whether to emulate invalid guest state.
+ Ignored if kvm-intel.enable_unrestricted_guest=1, as
+ guest state is never invalid for unrestricted guests.
+ This param doesn't apply to nested guests (L2), as KVM
+ never emulates invalid L2 guest state.
+ Default is 1 (enabled)
kvm-intel.flexpriority=
[KVM,Intel] Disable FlexPriority feature (TPR shadow).
@@ -2548,14 +2756,14 @@
when set.
Format: <int>
- libata.force= [LIBATA] Force configurations. The format is comma-
- separated list of "[ID:]VAL" where ID is
- PORT[.DEVICE]. PORT and DEVICE are decimal numbers
- matching port, link or device. Basically, it matches
- the ATA ID string printed on console by libata. If
- the whole ID part is omitted, the last PORT and DEVICE
- values are used. If ID hasn't been specified yet, the
- configuration applies to all ports, links and devices.
+ libata.force= [LIBATA] Force configurations. The format is a comma-
+ separated list of "[ID:]VAL" where ID is PORT[.DEVICE].
+ PORT and DEVICE are decimal numbers matching port, link
+ or device. Basically, it matches the ATA ID string
+ printed on console by libata. If the whole ID part is
+ omitted, the last PORT and DEVICE values are used. If
+ ID hasn't been specified yet, the configuration applies
+ to all ports, links and devices.
If only DEVICE is omitted, the parameter applies to
the port and all links and devices behind it. DEVICE
@@ -2565,7 +2773,7 @@
host link and device attached to it.
The VAL specifies the configuration to force. As long
- as there's no ambiguity shortcut notation is allowed.
+ as there is no ambiguity, shortcut notation is allowed.
For example, both 1.5 and 1.5G would work for 1.5Gbps.
The following configurations can be forced.
@@ -2578,27 +2786,67 @@
udma[/][16,25,33,44,66,100,133] notation is also
allowed.
+ * nohrst, nosrst, norst: suppress hard, soft and both
+ resets.
+
+ * rstonce: only attempt one reset during hot-unplug
+ link recovery.
+
+ * [no]dbdelay: Enable or disable the extra 200ms delay
+ before debouncing a link PHY and device presence
+ detection.
+
* [no]ncq: Turn on or off NCQ.
- * [no]ncqtrim: Turn off queued DSM TRIM.
+ * [no]ncqtrim: Enable or disable queued DSM TRIM.
+
+ * [no]ncqati: Enable or disable NCQ trim on ATI chipset.
+
+ * [no]trim: Enable or disable (unqueued) TRIM.
+
+ * trim_zero: Indicate that TRIM command zeroes data.
+
+ * max_trim_128m: Set 128M maximum trim size limit.
+
+ * [no]dma: Turn on or off DMA transfers.
+
+ * atapi_dmadir: Enable ATAPI DMADIR bridge support.
- * nohrst, nosrst, norst: suppress hard, soft
- and both resets.
+ * atapi_mod16_dma: Enable the use of ATAPI DMA for
+ commands that are not a multiple of 16 bytes.
- * rstonce: only attempt one reset during
- hot-unplug link recovery
+ * [no]dmalog: Enable or disable the use of the
+ READ LOG DMA EXT command to access logs.
- * dump_id: dump IDENTIFY data.
+ * [no]iddevlog: Enable or disable access to the
+ identify device data log.
- * atapi_dmadir: Enable ATAPI DMADIR bridge support
+ * [no]logdir: Enable or disable access to the general
+ purpose log directory.
+
+ * max_sec_128: Set transfer size limit to 128 sectors.
+
+ * max_sec_1024: Set or clear transfer size limit to
+ 1024 sectors.
+
+ * max_sec_lba48: Set or clear transfer size limit to
+ 65535 sectors.
+
+ * [no]lpm: Enable or disable link power management.
+
+ * [no]setxfer: Indicate if transfer speed mode setting
+ should be skipped.
+
+ * [no]fua: Disable or enable FUA (Force Unit Access)
+ support for devices supporting this feature.
+
+ * dump_id: Dump IDENTIFY data.
* disable: Disable this device.
If there are multiple matching configurations changing
the same attribute, the last one is used.
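As an illustrative (hypothetical) combination,
libata.force=1.5G,2:noncq would limit all devices to
1.5Gbps and additionally disable NCQ on port 2 only.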
- memblock=debug [KNL] Enable memblock debug messages.
-
load_ramdisk= [RAM] [Deprecated]
lockd.nlm_grace_period=P [NFS] Assign grace period.
@@ -2740,7 +2988,7 @@
different yeeloong laptops.
Example: machtype=lemote-yeeloong-2f-7inch
- max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
+ max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater
than or equal to this physical address is ignored.
maxcpus= [SMP] Maximum number of processors that an SMP kernel
@@ -2761,7 +3009,7 @@
mce [X86-32] Machine Check Exception
- mce=option [X86-64] See Documentation/x86/x86_64/boot-options.rst
+ mce=option [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
md= [HW] RAID subsystems devices and level
See Documentation/admin-guide/md.rst.
@@ -2802,6 +3050,9 @@
For details see: Documentation/admin-guide/hw-vuln/mds.rst
+ mem=nn[KMG] [HEXAGON] Set the memory size.
+ Must be specified, otherwise memory size will be 0.
+
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used in cases as follows:
@@ -2809,6 +3060,13 @@
2 when the kernel is not able to see the whole system memory;
3 memory that lies after 'mem=' boundary is excluded from
the hypervisor, then assigned to KVM guests.
+ 4 to limit the memory available for kdump kernel.
+
+ [ARC,MICROBLAZE] - the limit applies only to low memory,
+ high memory is not affected.
+
+ [ARM64] - only limits memory covered by the linear
+ mapping. The NOMAP regions are not affected.
[X86] Work as limiting max address. Use together
with memmap= to avoid physical address space collisions.
@@ -2819,9 +3077,19 @@
in above case 3, memory may need be hot added after boot
if system memory of hypervisor is not sufficient.
+ mem=nn[KMG]@ss[KMG]
+ [ARM,MIPS] - override the memory layout reported by
+ firmware.
+ Define a memory region of size nn[KMG] starting at
+ ss[KMG].
+ Multiple different regions can be specified with
+ multiple mem= parameters on the command line.
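+
+ For example (illustrative layout), mem=64M@256M
+ mem=128M@512M defines a 64M region starting at 256M
+ and a 128M region starting at 512M.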
+
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
+ memblock=debug [KNL] Enable memblock debug messages.
+
memchunk=nn[KMG]
[KNL,SH] Allow user to override the default size for
per-device physically contiguous DMA buffers.
@@ -2907,10 +3175,12 @@
[KNL,X86,ARM] Boolean flag to enable this feature.
Format: {on | off (default)}
When enabled, runtime hotplugged memory will
- allocate its internal metadata (struct pages)
- from the hotadded memory which will allow to
- hotadd a lot of memory without requiring
- additional memory to do so.
+ allocate its internal metadata (struct pages;
+ those vmemmap pages cannot be optimized even
+ if hugetlb_free_vmemmap is enabled) from the
+ hotadded memory, which allows hotadding a
+ lot of memory without requiring additional
+ memory to do so.
This feature is disabled by default because it
has some implication on large (e.g. GB)
allocations in some configurations (e.g. small
@@ -2920,11 +3190,7 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
- This is not compatible with hugetlb_free_vmemmap. If
- both parameters are enabled, hugetlb_free_vmemmap takes
- precedence over memory_hotplug.memmap_on_memory.
-
- memtest= [KNL,X86,ARM,PPC,RISCV] Enable memtest
+ memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
Format: <integer>
default : 0 <disable>
Specifies the number of memtest passes to be
@@ -2942,7 +3208,7 @@
mem_encrypt=on: Activate SME
mem_encrypt=off: Do not activate SME
- Refer to Documentation/virt/kvm/amd-memory-encryption.rst
+ Refer to Documentation/virt/kvm/x86/amd-memory-encryption.rst
for details on when memory encryption can be activated.
mem_sleep_default= [SUSPEND] Default system suspend mode:
@@ -2951,9 +3217,6 @@
deep - Suspend-To-RAM or equivalent (if supported)
See Documentation/admin-guide/pm/sleep-states.rst.
- meye.*= [HW] Set MotionEye Camera parameters
- See Documentation/admin-guide/media/meye.rst.
-
mfgpt_irq= [IA-32] Specify the IRQ to use for the
Multi-Function General Purpose Timers on AMD Geode
platforms.
@@ -2965,7 +3228,7 @@
mga= [HW,DRM]
- min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this
+ min_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory below this
physical address is ignored.
mini2440= [ARM,HW,KNL]
@@ -3000,19 +3263,23 @@
improves system performance, but it may also
expose users to several CPU vulnerabilities.
Equivalent to: nopti [X86,PPC]
- kpti=0 [ARM64]
+ if nokaslr then kpti=0 [ARM64]
nospectre_v1 [X86,PPC]
nobp=0 [S390]
nospectre_v2 [X86,PPC,S390,ARM64]
spectre_v2_user=off [X86]
spec_store_bypass_disable=off [X86,PPC]
ssbd=force-off [ARM64]
+ nospectre_bhb [ARM64]
l1tf=off [X86]
mds=off [X86]
tsx_async_abort=off [X86]
kvm.nx_huge_pages=off [X86]
+ srbds=off [X86,INTEL]
no_entry_flush [PPC]
no_uaccess_flush [PPC]
+ mmio_stale_data=off [X86]
+ retbleed=off [X86]
Exceptions:
This does not have any effect on
@@ -3034,6 +3301,8 @@
Equivalent to: l1tf=flush,nosmt [X86]
mds=full,nosmt [X86]
tsx_async_abort=full,nosmt [X86]
+ mmio_stale_data=full,nosmt [X86]
+ retbleed=auto,nosmt [X86]
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
@@ -3043,6 +3312,62 @@
log everything. Information is printed at KERN_DEBUG
so loglevel=8 may also need to be specified.
+ mmio_stale_data=
+ [X86,INTEL] Control mitigation for the Processor
+ MMIO Stale Data vulnerabilities.
+
+ Processor MMIO Stale Data is a class of
+ vulnerabilities that may expose data after an MMIO
+ operation. Exposed data could originate or end in
+ the same CPU buffers as affected by MDS and TAA.
+ Therefore, similar to MDS and TAA, the mitigation
+ is to clear the affected CPU buffers.
+
+ This parameter controls the mitigation. The
+ options are:
+
+ full - Enable mitigation on vulnerable CPUs
+
+ full,nosmt - Enable mitigation and disable SMT on
+ vulnerable CPUs.
+
+ off - Unconditionally disable mitigation
+
+ On MDS or TAA affected machines,
+ mmio_stale_data=off can be prevented by an active
+ MDS or TAA mitigation as these vulnerabilities are
+ mitigated with the same mechanism so in order to
+ disable this mitigation, you need to specify
+ mds=off and tsx_async_abort=off too.
+
+ Not specifying this option is equivalent to
+ mmio_stale_data=full.
+
+ For details see:
+ Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
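+
+ For example, fully disabling this mitigation on a
+ machine also affected by MDS and TAA would require:
+ mmio_stale_data=off mds=off tsx_async_abort=off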
+
+ <module>.async_probe[=<bool>] [KNL]
+ If no <bool> value is specified or if the value
+ specified is not a valid <bool>, enable asynchronous
+ probe on this module. Otherwise, enable/disable
+ asynchronous probe on this module as indicated by the
+ <bool> value. See also: module.async_probe
+
+ module.async_probe=<bool>
+ [KNL] When set to true, modules will use async probing
+ by default. To enable/disable async probing for a
+ specific module, use the module specific control that
+ is documented under <module>.async_probe. When both
+ module.async_probe and <module>.async_probe are
+ specified, <module>.async_probe takes precedence for
+ the specific module.
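+
+ For example (module names hypothetical):
+ module.async_probe=true foo.async_probe=false
+ enables asynchronous probing globally while keeping
+ it disabled for the foo module.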
+
+ module.enable_dups_trace
+ [KNL] When CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS is set,
+ this means that duplicate request_module() calls will
+ trigger a WARN_ON() instead of a pr_warn(). Note that
+ if MODULE_DEBUG_AUTOLOAD_DUPS_TRACE is set, WARN_ON()s
+ will always be issued and this option does nothing.
module.sig_enforce
[KNL] When CONFIG_MODULE_SIG is set, this means that
modules without (valid) signatures will fail to load.
@@ -3089,20 +3414,6 @@
mtdparts= [MTD]
See drivers/mtd/parsers/cmdlinepart.c
- multitce=off [PPC] This parameter disables the use of the pSeries
- firmware feature for updating multiple TCE entries
- at a time.
-
- onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration
-
- Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
-
- boundary - index of last SLC block on Flex-OneNAND.
- The remaining blocks are configured as MLC blocks.
- lock - Configure if Flex-OneNAND boundary should be locked.
- Once locked, the boundary cannot be changed.
- 1 indicates lock status, 0 indicates unlock status.
-
mtdset= [ARM]
ARM/S3C2412 JIVE boot control
@@ -3129,6 +3440,10 @@
Used for mtrr cleanup. It is spare mtrr entries number.
Set to 2 or more if your graphical card needs more.
+ multitce=off [PPC] This parameter disables the use of the pSeries
+ firmware feature for updating multiple TCE entries
+ at a time.
+
n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
netdev= [NET] Network devices parameters
@@ -3138,20 +3453,24 @@
This usage is only documented in each driver source
file if at all.
+ netpoll.carrier_timeout=
+ [NET] Specifies amount of time (in seconds) that
+ netpoll should wait for a carrier. By default netpoll
+ waits 4 seconds.
+
nf_conntrack.acct=
[NETFILTER] Enable connection tracking flow accounting
0 to disable accounting
1 to enable accounting
Default value is 0.
- nfsaddrs= [NFS] Deprecated. Use ip= instead.
- See Documentation/admin-guide/nfs/nfsroot.rst.
-
- nfsroot= [NFS] nfs root filesystem for disk-less boxes.
- See Documentation/admin-guide/nfs/nfsroot.rst.
+ nfs.cache_getent=
+ [NFS] sets the pathname to the program which is used
+ to update the NFS client cache entries.
- nfsrootdebug [NFS] enable nfsroot debugging messages.
- See Documentation/admin-guide/nfs/nfsroot.rst.
+ nfs.cache_getent_timeout=
+ [NFS] sets the timeout after which an attempt to
+ update a cache entry is deemed to have failed.
nfs.callback_nr_threads=
[NFSv4] set the total number of threads that the
@@ -3162,18 +3481,6 @@
[NFS] set the TCP port on which the NFSv4 callback
channel should listen.
- nfs.cache_getent=
- [NFS] sets the pathname to the program which is used
- to update the NFS client cache entries.
-
- nfs.cache_getent_timeout=
- [NFS] sets the timeout after which an attempt to
- update a cache entry is deemed to have failed.
-
- nfs.idmap_cache_timeout=
- [NFS] set the maximum lifetime for idmapper cache
- entries.
-
nfs.enable_ino64=
[NFS] enable 64-bit inode numbers.
If zero, the NFS client will fake up a 32-bit inode
@@ -3181,6 +3488,10 @@
of returning the full 64-bit number.
The default is to return 64-bit inode numbers.
+ nfs.idmap_cache_timeout=
+ [NFS] set the maximum lifetime for idmapper cache
+ entries.
+
nfs.max_session_cb_slots=
[NFSv4.1] Sets the maximum number of session
slots the client will assign to the callback
@@ -3208,21 +3519,14 @@
will be autodetected by the client, and it will fall
back to using the idmapper.
To turn off this behaviour, set the value to '0'.
+
nfs.nfs4_unique_id=
[NFS4] Specify an additional fixed unique ident-
ification string that NFSv4 clients can insert into
their nfs_client_id4 string. This is typically a
UUID that is generated at system install time.
- nfs.send_implementation_id =
- [NFSv4.1] Send client implementation identification
- information in exchange_id requests.
- If zero, no implementation identification information
- will be sent.
- The default is to send the implementation identification
- information.
-
- nfs.recover_lost_locks =
+ nfs.recover_lost_locks=
[NFSv4] Attempt to recover locks that were lost due
to a lease timeout on the server. Please note that
doing this risks data corruption, since there are
@@ -3234,7 +3538,15 @@
The default parameter value of '0' causes the kernel
not to attempt recovery of lost locks.
- nfs4.layoutstats_timer =
+ nfs.send_implementation_id=
+ [NFSv4.1] Send client implementation identification
+ information in exchange_id requests.
+ If zero, no implementation identification information
+ will be sent.
+ The default is to send the implementation identification
+ information.
+
+ nfs4.layoutstats_timer=
[NFSv4.2] Change the rate at which the kernel sends
layoutstats to the pNFS metadata server.
@@ -3243,6 +3555,11 @@
driver. A non-zero value sets the minimum interval
in seconds between layoutstats transmissions.
+ nfsd.inter_copy_offload_enable=
+ [NFSv4.2] When set to 1, the server will support
+ server-to-server copies for which this server is
+ the destination of the copy.
+
nfsd.nfs4_disable_idmapping=
[NFSv4] When set to the default of '1', the NFSv4
server will return only numeric uids and gids to
@@ -3250,6 +3567,23 @@
and gids from such clients. This is intended to ease
migration from NFSv2/v3.
+ nfsd.nfsd4_ssc_umount_timeout=
+ [NFSv4.2] When used as the destination of a
+ server-to-server copy, knfsd temporarily mounts
+ the source server. It caches the mount in case
+ it will be needed again, and discards it if not
+ used for the number of milliseconds specified by
+ this parameter.
+
+ nfsaddrs= [NFS] Deprecated. Use ip= instead.
+ See Documentation/admin-guide/nfs/nfsroot.rst.
+
+ nfsroot= [NFS] nfs root filesystem for disk-less boxes.
+ See Documentation/admin-guide/nfs/nfsroot.rst.
+
+ nfsrootdebug [NFS] enable nfsroot debugging messages.
+ See Documentation/admin-guide/nfs/nfsroot.rst.
+
nmi_backtrace.backtrace_idle [KNL]
Dump stacks even of idle CPUs in response to an
NMI stack-backtrace request.
@@ -3274,45 +3608,15 @@
These settings can be accessed at runtime via
the nmi_watchdog and hardlockup_panic sysctls.
- netpoll.carrier_timeout=
- [NET] Specifies amount of time (in seconds) that
- netpoll should wait for a carrier. By default netpoll
- waits 4 seconds.
-
no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
emulation library even if a 387 maths coprocessor
is present.
- no5lvl [X86-64] Disable 5-level paging mode. Forces
- kernel to use 4-level paging instead.
-
- nofsgsbase [X86] Disables FSGSBASE instructions.
+ no4lvl [RISCV] Disable 4-level and 5-level paging modes. Forces
+ kernel to use 3-level paging instead.
- no_console_suspend
- [HW] Never suspend the console
- Disable suspending of consoles during suspend and
- hibernate operations. Once disabled, debugging
- messages can reach various consoles while the rest
- of the system is being put to sleep (ie, while
- debugging driver suspend/resume hooks). This may
- not work reliably with all consoles, but is known
- to work with serial and VGA consoles.
- To facilitate more flexible debugging, we also add
- console_suspend, a printk module parameter to control
- it. Users could use console_suspend (usually
- /sys/module/printk/parameters/console_suspend) to
- turn on/off it dynamically.
-
- novmcoredd [KNL,KDUMP]
- Disable device dump. Device dump allows drivers to
- append dump data to vmcore so you can collect driver
- specified debug info. Drivers can append the data
- without any limit and this data is stored in memory,
- so this may cause significant memory stress. Disabling
- device dump can help save memory but the driver debug
- data will be no longer available. This parameter
- is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP
- is set.
+ no5lvl [X86-64,RISCV] Disable 5-level paging mode. Forces
+ kernel to use 4-level paging instead.
noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
caches in the slab allocator. Saves per-node memory,
@@ -3328,14 +3632,25 @@
noautogroup Disable scheduler automatic task group creation.
- nobats [PPC] Do not use BATs for mapping kernel lowmem
- on "Classic" PPC cores.
-
nocache [ARM]
- noclflush [BUGS=X86] Don't use the CLFLUSH instruction
+ no_console_suspend
+ [HW] Never suspend the console
+ Disable suspending of consoles during suspend and
+ hibernate operations. Once disabled, debugging
+ messages can reach various consoles while the rest
+ of the system is being put to sleep (ie, while
+ debugging driver suspend/resume hooks). This may
+ not work reliably with all consoles, but is known
+ to work with serial and VGA consoles.
+ To facilitate more flexible debugging, we also add
+ console_suspend, a printk module parameter to control
+ it. Users could use console_suspend (usually
+ /sys/module/printk/parameters/console_suspend) to
+ turn it on/off dynamically.
- delayacct [KNL] Enable per-task delay accounting
+ no_debug_objects
+ [KNL] Disable object debugging
nodsp [SH] Disable hardware DSP at boot time.
@@ -3345,19 +3660,6 @@
noexec [IA-64]
- noexec [X86]
- On X86-32 available only on PAE configured kernels.
- noexec=on: enable non-executable mappings (default)
- noexec=off: disable non-executable mappings
-
- nosmap [X86,PPC]
- Disable SMAP (Supervisor Mode Access Prevention)
- even if it is supported by processor.
-
- nosmep [X86,PPC]
- Disable SMEP (Supervisor Mode Execution Prevention)
- even if it is supported by processor.
-
noexec32 [X86-64]
This affects only 32-bit executables.
noexec32=on: enable non-executable mappings (default)
@@ -3365,70 +3667,18 @@
noexec32=off: disable non-executable mappings
read implies executable mappings
+ no_file_caps Tells the kernel not to honor file capabilities. The
+ only way then for a file to be executed with privilege
+ is to be setuid root or executed by root.
+
nofpu [MIPS,SH] Disable hardware FPU at boot time.
+ nofsgsbase [X86] Disables FSGSBASE instructions.
+
nofxsr [BUGS=X86-32] Disables x86 floating point extended
register save and restore. The kernel will only save
legacy floating-point registers on task switch.
- nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
-
- nohugevmalloc [PPC] Disable kernel huge vmalloc mappings.
-
- nosmt [KNL,S390] Disable symmetric multithreading (SMT).
- Equivalent to smt=1.
-
- [KNL,X86] Disable symmetric multithreading (SMT).
- nosmt=force: Force disable SMT, cannot be undone
- via the sysfs control file.
-
- nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1
- (bounds check bypass). With this option data leaks are
- possible in the system.
-
- nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for
- the Spectre variant 2 (indirect branch prediction)
- vulnerability. System may allow data leaks with this
- option.
-
- nospec_store_bypass_disable
- [HW] Disable all mitigations for the Speculative Store Bypass vulnerability
-
- no_uaccess_flush
- [PPC] Don't flush the L1-D cache after accessing user data.
-
- noxsave [BUGS=X86] Disables x86 extended register state save
- and restore using xsave. The kernel will fallback to
- enabling legacy floating-point and sse state.
-
- noxsaveopt [X86] Disables xsaveopt used in saving x86 extended
- register states. The kernel will fall back to use
- xsave to save the states. By using this parameter,
- performance of saving the states is degraded because
- xsave doesn't support modified optimization while
- xsaveopt supports it on xsaveopt enabled systems.
-
- noxsaves [X86] Disables xsaves and xrstors used in saving and
- restoring x86 extended register state in compacted
- form of xsave area. The kernel will fall back to use
- xsaveopt and xrstor to save and restore the states
- in standard form of xsave area. By using this
- parameter, xsave area per process might occupy more
- memory on xsaves enabled systems.
-
- nohlt [ARM,ARM64,MICROBLAZE,SH] Forces the kernel to busy wait
- in do_idle() and not use the arch_cpu_idle()
- implementation; requires CONFIG_GENERIC_IDLE_POLL_SETUP
- to be effective. This is useful on platforms where the
- sleep(SH) or wfi(ARM,ARM64) instructions do not work
- correctly or when doing power measurements to evalute
- the impact of the sleep instructions. This is also
- useful when using JTAG debugger.
-
- no_file_caps Tells the kernel not to honor file capabilities. The
- only way then for a file to be executed with privilege
- is to be setuid root or executed by root.
-
nohalt [IA-64] Tells the kernel not to use the power saving
function PAL_HALT_LIGHT when idle. This increases
power-consumption. On the positive side, it reduces
@@ -3446,13 +3696,25 @@
difficult since unequal pointers can no longer be
compared. However, if this command-line option is
specified, then all normal pointers will have their true
- value printed. Pointers printed via %pK may still be
- hashed. This option should only be specified when
+ value printed. This option should only be specified when
debugging the kernel. Please do not use on production
kernels.
nohibernate [HIBERNATION] Disable hibernation and resume.
+ nohlt [ARM,ARM64,MICROBLAZE,SH] Forces the kernel to busy wait
+ in do_idle() and not use the arch_cpu_idle()
+ implementation; requires CONFIG_GENERIC_IDLE_POLL_SETUP
+ to be effective. This is useful on platforms where the
+ sleep(SH) or wfi(ARM,ARM64) instructions do not work
+ correctly or when doing power measurements to evaluate
+ the impact of the sleep instructions. This is also
+ useful when using JTAG debugger.
+
+ nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
+
+ nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings.
+
nohz= [KNL] Boottime enable/disable dynamic ticks
Valid arguments: on, off
Default: on
@@ -3467,15 +3729,8 @@
just as if they had also been called out in the
rcu_nocbs= boot parameter.
- noiotrap [SH] Disables trapped I/O port accesses.
-
- noirqdebug [X86-32] Disables the code which attempts to detect and
- disable unhandled interrupt sources.
-
- no_timer_check [X86,APIC] Disables the code which tests for
- broken timer IRQ sources.
-
- noisapnp [ISAPNP] Disables ISA PnP code.
+ Note that this argument takes precedence over
+ the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
noinitrd [RAM] Tells the kernel not to load any configured
initial RAM disk.
@@ -3488,28 +3743,29 @@
noinvpcid [X86] Disable the INVPCID cpu feature.
+ noiotrap [SH] Disables trapped I/O port accesses.
+
+ noirqdebug [X86-32] Disables the code which attempts to detect and
+ disable unhandled interrupt sources.
+
+ noisapnp [ISAPNP] Disables ISA PnP code.
+
nojitter [IA-64] Disables jitter checking for ITC timers.
- no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
+ nokaslr [KNL]
+ When CONFIG_RANDOMIZE_BASE is set, this disables
+ kernel and module base offset ASLR (Address Space
+ Layout Randomization).
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
fault handling.
- no-vmw-sched-clock
- [X86,PV_OPS] Disable paravirtualized VMware scheduler
- clock and use the default one.
-
- no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time
- accounting. steal time is computed, but won't
- influence scheduler behaviour
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
nolapic [X86-32,APIC] Do not enable or use the local APIC.
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
- noltlbs [PPC] Do not use large page/tlb entries for kernel
- lowmem mapping on PPC40x and PPC8xx
-
nomca [IA-64] Disable machine check abort handling
nomce [X86-32] Disable Machine Check Exception
@@ -3517,48 +3773,123 @@
nomfgpt [X86-32] Disable Multi-Function General Purpose
Timer usage (for AMD Geode machines).
+ nomodeset Disable kernel modesetting. Most systems' firmware
+ sets up a display mode and provides framebuffer memory
+ for output. With nomodeset, DRM and fbdev drivers will
+ not load if they could possibly displace the pre-
+ initialized output. Only the system framebuffer will
+ be available for use. The respective drivers will not
+ perform display-mode changes or accelerated rendering.
+
+ Useful as error fallback, or for testing and debugging.
+
+ nomodule Disable module load
+
nonmi_ipi [X86] Disable using NMI IPIs during panic/reboot to
shutdown the other cpus. Instead use the REBOOT_VECTOR
irq.
- nomodule Disable module load
-
nopat [X86] Disable PAT (page attribute table extension of
pagetables) support.
nopcid [X86-64] Disable the PCID cpu feature.
+ nopku [X86] Disable Memory Protection Keys CPU feature found
+ in some Intel CPUs.
+
+ nopti [X86-64]
+ Equivalent to pti=off
+
+ nopv= [X86,XEN,KVM,HYPER_V,VMWARE]
+ Disables the PV optimizations forcing the guest to run
+ as generic guest with no PV drivers. Currently support
+ XEN HVM, KVM, HYPER_V and VMWARE guest.
+
+ nopvspin [X86,XEN,KVM]
+ Disables the qspinlock slow path using PV optimizations
+ which allow the hypervisor to 'idle' the guest on lock
+ contention.
+
norandmaps Don't use address space randomization. Equivalent to
echo 0 > /proc/sys/kernel/randomize_va_space
noreplace-smp [X86-32,SMP] Don't replace SMP instructions
with UP alternatives
- nordrand [X86] Disable kernel use of the RDRAND and
- RDSEED instructions even if they are supported
- by the processor. RDRAND and RDSEED are still
- available to user space applications.
-
noresume [SWSUSP] Disables resume and restores original swap
space.
+ nosbagart [IA-64]
+
no-scroll [VGA] Disables scrollback.
This is required for the Braillex ib80-piezo Braille
reader made by F.H. Papenmeier (Germany).
- nosbagart [IA-64]
+ nosgx [X86-64,SGX] Disables Intel SGX kernel support.
- nosep [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support.
+ nosmap [PPC]
+ Disable SMAP (Supervisor Mode Access Prevention)
+ even if it is supported by processor.
- nosgx [X86-64,SGX] Disables Intel SGX kernel support.
+ nosmep [PPC64s]
+ Disable SMEP (Supervisor Mode Execution Prevention)
+ even if it is supported by processor.
nosmp [SMP] Tells an SMP kernel to act as a UP kernel,
and disable the IO APIC. legacy for "maxcpus=0".
+ nosmt [KNL,S390] Disable symmetric multithreading (SMT).
+ Equivalent to smt=1.
+
+ [KNL,X86] Disable symmetric multithreading (SMT).
+ nosmt=force: Force disable SMT, cannot be undone
+ via the sysfs control file.
+
nosoftlockup [KNL] Disable the soft-lockup detector.
+ nospec_store_bypass_disable
+ [HW] Disable all mitigations for the Speculative Store Bypass vulnerability
+
+ nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch
+ history injection) vulnerability. System may allow data leaks
+ with this option.
+
+ nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1
+ (bounds check bypass). With this option data leaks are
+ possible in the system.
+
+ nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for
+ the Spectre variant 2 (indirect branch prediction)
+ vulnerability. System may allow data leaks with this
+ option.
+
+ no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized
+ steal time accounting. steal time is computed, but
+ won't influence scheduler behaviour
+
nosync [HW,M68K] Disables sync negotiation for all devices.
+ no_timer_check [X86,APIC] Disables the code which tests for
+ broken timer IRQ sources.
+
+ no_uaccess_flush
+ [PPC] Don't flush the L1-D cache after accessing user data.
+
+ novmcoredd [KNL,KDUMP]
+ Disable device dump. Device dump allows drivers to
+ append dump data to vmcore so you can collect driver
+ specified debug info. Drivers can append the data
+ without any limit and this data is stored in memory,
+ so this may cause significant memory stress. Disabling
+ device dump can help save memory but the driver debug
+ data will be no longer available. This parameter
+ is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP
+ is set.
+
+ no-vmw-sched-clock
+ [X86,PV_OPS] Disable paravirtualized VMware scheduler
+ clock and use the default one.
+
nowatchdog [KNL] Disable both lockup detectors, i.e.
soft-lockup and NMI watchdog (hard-lockup).
@@ -3566,19 +3897,28 @@
nox2apic [X86-64,APIC] Do not enable x2APIC mode.
- cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
- CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
- Some features depend on CPU0. Known dependencies are:
- 1. Resume from suspend/hibernate depends on CPU0.
- Suspend/hibernate will fail if CPU0 is offline and you
- need to online CPU0 before suspend/hibernate.
- 2. PIC interrupts also depend on CPU0. CPU0 can't be
- removed if a PIC interrupt is detected.
- It's said poweroff/reboot may depend on CPU0 on some
- machines although I haven't seen such issues so far
- after CPU0 is offline on a few tested machines.
- If the dependencies are under your control, you can
- turn on cpu0_hotplug.
+ NOTE: this parameter will be ignored on systems with the
+ LEGACY_XAPIC_DISABLED bit set in the
+ IA32_XAPIC_DISABLE_STATUS MSR.
+
+ noxsave [BUGS=X86] Disables x86 extended register state save
+ and restore using xsave. The kernel will fallback to
+ enabling legacy floating-point and sse state.
+
+ noxsaveopt [X86] Disables xsaveopt used in saving x86 extended
+ register states. The kernel will fall back to use
+ xsave to save the states. By using this parameter,
+ performance of saving the states is degraded because
+ xsave doesn't support modified optimization while
+ xsaveopt supports it on xsaveopt enabled systems.
+
+ noxsaves [X86] Disables xsaves and xrstors used in saving and
+ restoring x86 extended register state in compacted
+ form of xsave area. The kernel will fall back to use
+ xsaveopt and xrstor to save and restore the states
+ in standard form of xsave area. By using this
+ parameter, xsave area per process might occupy more
+ memory on xsaves enabled systems.
nps_mtm_hs_ctr= [KNL,ARC]
This parameter sets the maximum duration, in
@@ -3632,6 +3972,16 @@
For example, to override I2C bus2:
omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100
+ onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration
+
+ Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
+
+ boundary - index of last SLC block on Flex-OneNAND.
+ The remaining blocks are configured as MLC blocks.
+ lock - Configure if Flex-OneNAND boundary should be locked.
+ Once locked, the boundary cannot be changed.
+ 1 indicates lock status, 0 indicates unlock status.
+
oops=panic Always panic on oopses. Default is to just kill the
process, but there is a small probability of
deadlocking the machine.
@@ -3664,7 +4014,7 @@
[KNL] Minimal page reporting order
Format: <integer>
Adjust the minimal page reporting order. The page
- reporting is disabled when it exceeds (MAX_ORDER-1).
+ reporting is disabled when it exceeds MAX_ORDER.
panic= [KNL] Kernel behaviour on panic: delay <timeout>
timeout > 0: seconds before rebooting
@@ -3680,6 +4030,11 @@
bit 3: print locks info if CONFIG_LOCKDEP is on
bit 4: print ftrace buffer
bit 5: print all printk messages in buffer
+ bit 6: print all CPUs backtrace (if available in the arch)
+ *Be aware* that this option may print a _lot_ of lines,
+ so there are risks of losing older messages in the log.
+ Use this option carefully; it may be worth setting up a
+ bigger log buffer with "log_buf_len" along with this.
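+
+ For example, panic_print=0x40 sets bit 6 only,
+ requesting backtraces from all CPUs on panic.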
panic_on_taint= Bitmask for conditionally calling panic() in add_taint()
Format: <hex>[,nousertaint]
@@ -3697,14 +4052,6 @@
panic_on_warn panic() instead of WARN(). Useful to cause kdump
on a WARN().
- crash_kexec_post_notifiers
- Run kdump after running panic-notifiers and dumping
- kmsg. This only for the users who doubt kdump always
- succeeds in any situation.
- Note that this also increases risks of kdump failure,
- because some panic notifiers can make the crashed
- kernel more unstable.
-
parkbd.port= [HW] Parallel port number the keyboard adapter is
connected to, default is 0.
Format: <parport#>
@@ -3831,10 +4178,6 @@
pcbit= [HW,ISDN]
- pcd. [PARIDE]
- See header of drivers/block/paride/pcd.c.
- See also Documentation/admin-guide/blockdev/paride.rst.
-
pci=option[,option...] [PCI] various PCI subsystem options.
Some options herein operate on a specific device
@@ -3949,6 +4292,15 @@
please report a bug.
nocrs [X86] Ignore PCI host bridge windows from ACPI.
If you need to use this, please report a bug.
+ use_e820 [X86] Use E820 reservations to exclude parts of
+ PCI host bridge windows. This is a workaround
+ for BIOS defects in host bridge _CRS methods.
+ If you need to use this, please report a bug to
+ <linux-pci@vger.kernel.org>.
+ no_e820 [X86] Ignore E820 reservations for PCI host
+ bridge windows. This is the default on modern
+ hardware. If you need to use this, please report
+ a bug to <linux-pci@vger.kernel.org>.
routeirq Do IRQ routing for all PCI devices.
This is normally done in pci_enable_device(),
so this option is a temporary workaround
@@ -4001,7 +4353,9 @@
specified, e.g., 12@pci:8086:9c22:103c:198f
for 4096-byte alignment.
ecrc= Enable/disable PCIe ECRC (transaction layer
- end-to-end CRC checking).
+ end-to-end CRC checking). Only effective if
+ the OS has native AER control (either granted by
+ ACPI _OSC or forced via "pcie_ports=native")
bios: Use BIOS/firmware settings. This is
the default.
off: Turn ECRC off
@@ -4088,9 +4442,6 @@
for debug and development, but should not be
needed on a platform with proper driver support.
- pd. [PARIDE]
- See Documentation/admin-guide/blockdev/paride.rst.
-
pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at
boot time.
Format: { 0 | 1 }
@@ -4103,14 +4454,8 @@
allocator. This parameter is primarily for debugging
and performance comparison.
- pf. [PARIDE]
- See Documentation/admin-guide/blockdev/paride.rst.
-
- pg. [PARIDE]
- See Documentation/admin-guide/blockdev/paride.rst.
-
pirq= [SMP,APIC] Manual mp-table setup
- See Documentation/x86/i386/IO-APIC.rst.
+ See Documentation/arch/x86/i386/IO-APIC.rst.
plip= [PPT,NET] Parallel port network link
Format: { parport<nr> | timid | 0 }
@@ -4120,6 +4465,14 @@
Override pmtimer IOPort with a hex value.
e.g. pmtmr=0x508
+ pmu_override= [PPC] Override the PMU.
+ This option takes over the PMU facility, so it is no
+ longer usable by perf. Setting this option starts the
+ PMU counters by setting MMCR0 to 0 (the FC bit is
+ cleared). If a number is given, then MMCR1 is set to
+ that number, otherwise (e.g., 'pmu_override=on'), MMCR1
+ remains 0.
+
pm_debug_messages [SUSPEND,KNL]
Enable suspend/resume debug messages during boot up.
@@ -4262,9 +4615,6 @@
pstore.backend= Specify the name of the pstore backend to use
- pt. [PARIDE]
- See Documentation/admin-guide/blockdev/paride.rst.
-
pti= [X86-64] Control Page Table Isolation of user and
kernel address spaces. Disabling this feature
removes hardening, but improves performance of
@@ -4277,9 +4627,6 @@
Not specifying this option is equivalent to pti=auto.
- nopti [X86-64]
- Equivalent to pti=off
-
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
@@ -4288,6 +4635,10 @@
r128= [HW,DRM]
+ radix_hcall_invalidate=on [PPC/PSERIES]
+ Disable RADIX GTSE feature and use hcall for TLB
+ invalidate.
+
raid= [HW,RAID]
See Documentation/admin-guide/md.rst.
@@ -4296,11 +4647,15 @@
ramdisk_start= [RAM] RAM disk image start address
- random.trust_cpu={on,off}
- [KNL] Enable or disable trusting the use of the
- CPU's random number generator (if available) to
- fully seed the kernel's CRNG. Default is controlled
- by CONFIG_RANDOM_TRUST_CPU.
+ random.trust_cpu=off
+ [KNL] Disable trusting the use of the CPU's
+ random number generator (if available) to
+ initialize the kernel's RNG.
+
+ random.trust_bootloader=off
+ [KNL] Disable trusting the use of a seed
+ passed by the bootloader (if available) to
+ initialize the kernel's RNG.
randomize_kstack_offset=
[KNL] Enable or disable kernel stack offset
@@ -4319,19 +4674,33 @@
Disable the Correctable Errors Collector,
see CONFIG_RAS_CEC help text.
- rcu_nocbs= [KNL]
- The argument is a cpu list, as described above.
-
- In kernels built with CONFIG_RCU_NOCB_CPU=y, set
- the specified list of CPUs to be no-callback CPUs.
- Invocation of these CPUs' RCU callbacks will be
- offloaded to "rcuox/N" kthreads created for that
- purpose, where "x" is "p" for RCU-preempt, and
- "s" for RCU-sched, and "N" is the CPU number.
- This reduces OS jitter on the offloaded CPUs,
- which can be useful for HPC and real-time
- workloads. It can also improve energy efficiency
- for asymmetric multiprocessors.
+ rcu_nocbs[=cpu-list]
+ [KNL] The optional argument is a cpu list,
+ as described above.
+
+ In kernels built with CONFIG_RCU_NOCB_CPU=y,
+ enable the no-callback CPU mode, which prevents
+ such CPUs' callbacks from being invoked in
+ softirq context. Invocation of such CPUs' RCU
+ callbacks will instead be offloaded to "rcuox/N"
+ kthreads created for that purpose, where "x" is
+ "p" for RCU-preempt, "s" for RCU-sched, and "g"
+ for the kthreads that mediate grace periods; and
+ "N" is the CPU number. This reduces OS jitter on
+ the offloaded CPUs, which can be useful for HPC
+ and real-time workloads. It can also improve
+ energy efficiency for asymmetric multiprocessors.
+
+ If a cpulist is passed as an argument, the specified
+ list of CPUs is set to no-callback mode from boot.
+
+ Otherwise, if the '=' sign and the cpulist
+ arguments are omitted, no CPU will be set to
+ no-callback mode from boot but the mode may be
+ toggled at runtime via cpusets.
+
+ Note that this argument takes precedence over
+ the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
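+
+ For example, rcu_nocbs=4-7 sets CPUs 4 through 7 to
+ no-callback mode from boot.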
rcu_nocb_poll [KNL]
Rather than requiring that offloaded CPUs
@@ -4439,6 +4808,36 @@
(the least-favored priority). Otherwise, when
RCU_BOOST is not set, valid values are 0-99 and
the default is zero (non-realtime operation).
+ When RCU_NOCB_CPU is set, also adjust the
+ priority of NOCB callback kthreads.
+
+ rcutree.rcu_divisor= [KNL]
+ Set the shift-right count to use to compute
+ the callback-invocation batch limit bl from
+ the number of callbacks queued on this CPU.
+ The result will be bounded below by the value of
+ the rcutree.blimit kernel parameter. Every bl
+ callbacks, the softirq handler will exit in
+ order to allow the CPU to do other work.
+
+ Please note that this callback-invocation batch
+ limit applies only to non-offloaded callback
+ invocation. Offloaded callbacks are instead
+ invoked in the context of an rcuoc kthread, which
+ the scheduler will preempt as it does any other task.
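+
+ A worked example with hypothetical values: given
+ rcutree.rcu_divisor=7 and 10000 queued callbacks,
+ bl = 10000 >> 7 = 78, but never less than the
+ value of rcutree.blimit.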
+
+ rcutree.nocb_nobypass_lim_per_jiffy= [KNL]
+ On callback-offloaded (rcu_nocbs) CPUs,
+ RCU reduces the lock contention that would
+ otherwise be caused by callback floods through
+ use of the ->nocb_bypass list. However, in the
+ common non-flooded case, RCU queues directly to
+ the main ->cblist in order to avoid the extra
+ overhead of the ->nocb_bypass list and its lock.
+ But if there are too many callbacks queued during
+ a single jiffy, RCU pre-queues the callbacks into
+ the ->nocb_bypass queue. The definition of "too
+ many" is supplied by this kernel boot parameter.
rcutree.rcu_nocb_gp_stride= [KNL]
Set the number of NOCB callback kthreads in
@@ -4465,10 +4864,6 @@
on rcutree.qhimark at boot time and to zero to
disable more aggressive help enlistment.
- rcutree.rcu_idle_gp_delay= [KNL]
- Set wakeup interval for idle CPUs that have
- RCU callbacks (RCU_FAST_NO_HZ=y).
-
rcutree.rcu_kick_kthreads= [KNL]
Cause the grace-period kthread to get an extra
wake_up() if it sleeps three times longer than
@@ -4579,8 +4974,12 @@
in seconds.
rcutorture.fwd_progress= [KNL]
- Enable RCU grace-period forward-progress testing
+ Specifies the number of kthreads to be used
+ for RCU grace-period forward-progress testing
for the types of RCU supporting this notion.
+ Defaults to 1 kthread; values less than zero or
+ greater than the number of CPUs cause the number
+ of CPUs to be used.
rcutorture.fwd_progress_div= [KNL]
Specify the fraction of a CPU-stall-warning
@@ -4749,6 +5148,29 @@
rcupdate.rcu_cpu_stall_timeout= [KNL]
Set timeout for RCU CPU stall warning messages.
+ The value is in seconds and the maximum allowed
+ value is 300 seconds.
+
+ rcupdate.rcu_exp_cpu_stall_timeout= [KNL]
+ Set timeout for expedited RCU CPU stall warning
+ messages. The value is in milliseconds
+ and the maximum allowed value is 21000
+ milliseconds. Please note that this value is
+ adjusted to an arch timer tick resolution.
+ Setting this to zero causes the value from
+ rcupdate.rcu_cpu_stall_timeout to be used (after
+ conversion from seconds to milliseconds).
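+
+ As a hypothetical example, to warn after 60 seconds
+ for normal grace periods and after 5000 milliseconds
+ for expedited ones:
+
+ Example: rcupdate.rcu_cpu_stall_timeout=60
+ rcupdate.rcu_exp_cpu_stall_timeout=5000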
+
+ rcupdate.rcu_cpu_stall_cputime= [KNL]
+ Provide statistics on the cputime and count of
+ interrupts and tasks during the sampling period. For
+ multiple continuous RCU stalls, all sampling periods
+ begin at half of the first RCU stall timeout.
+
+ rcupdate.rcu_exp_stall_task_details= [KNL]
+ Print stack dumps of any tasks blocking the
+ current expedited RCU grace period during an
+ expedited RCU CPU stall warning.
rcupdate.rcu_expedited= [KNL]
Use expedited grace-period primitives, for
@@ -4781,6 +5203,29 @@
period to instead use normal non-expedited
grace-period processing.
+ rcupdate.rcu_task_collapse_lim= [KNL]
+ Set the maximum number of callbacks present
+ at the beginning of a grace period that allows
+ the RCU Tasks flavors to collapse back to using
+ a single callback queue. This switching only
+ occurs when rcupdate.rcu_task_enqueue_lim is
+ set to the default value of -1.
+
+ rcupdate.rcu_task_contend_lim= [KNL]
+ Set the minimum number of callback-queuing-time
+ lock-contention events per jiffy required to
+ cause the RCU Tasks flavors to switch to per-CPU
+ callback queuing. This switching only occurs
+ when rcupdate.rcu_task_enqueue_lim is set to
+ the default value of -1.
+
+ rcupdate.rcu_task_enqueue_lim= [KNL]
+ Set the number of callback queues to use for the
+ RCU Tasks family of RCU flavors. The default
+ of -1 allows this to be automatically (and
+ dynamically) adjusted. This parameter is intended
+ for use in testing.
+
rcupdate.rcu_task_ipi_delay= [KNL]
Set time in jiffies during which RCU tasks will
avoid sending IPIs, starting with the beginning
@@ -4788,10 +5233,34 @@
number avoids disturbing real-time workloads,
but lengthens grace periods.
+ rcupdate.rcu_task_stall_info= [KNL]
+ Set initial timeout in jiffies for RCU task stall
+ informational messages, which give some indication
+ of the problem for those not patient enough to
+ wait for ten minutes. Informational messages are
+ only printed prior to the stall-warning message
+ for a given grace period. Disable with a value
+ less than or equal to zero. Defaults to ten
+ seconds. A change in value does not take effect
+ until the beginning of the next grace period.
+
+ rcupdate.rcu_task_stall_info_mult= [KNL]
+ Multiplier for time interval between successive
+ RCU task stall informational messages for a given
+ RCU tasks grace period. This value is clamped
+ to one through ten, inclusive. It defaults to
+ the value three, so that the first informational
+ message is printed 10 seconds into the grace
+ period, the second at 40 seconds, the third at
+ 160 seconds, and then the stall warning at 600
+ seconds would prevent a fourth at 640 seconds.
+
rcupdate.rcu_task_stall_timeout= [KNL]
- Set timeout in jiffies for RCU task stall warning
- messages. Disable with a value less than or equal
- to zero.
+ Set timeout in jiffies for RCU task stall
+ warning messages. Disable with a value less
+ than or equal to zero. Defaults to ten minutes.
+ A change in value does not take effect until
+ the beginning of the next grace period.
rcupdate.rcu_self_test= [KNL]
Run the RCU early boot self tests
@@ -4811,7 +5280,7 @@
rdt= [HW,X86,RDT]
Turn on/off individual RDT features. List is:
cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp,
- mba.
+ mba, smba, bmec.
E.g. to turn on cmt and turn off mba use:
rdt=cmt,!mba
@@ -4910,17 +5379,45 @@
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
- hibernate= [HIBERNATION]
- noresume Don't check if there's a hibernation image
- present during boot.
- nocompress Don't compress/decompress hibernation images.
- no Disable hibernation and resume.
- protect_image Turn on image protection during restoration
- (that will set all pages holding image data
- during restoration read-only).
-
retain_initrd [RAM] Keep initrd memory after extraction
+ retbleed= [X86] Control mitigation of RETBleed (Arbitrary
+ Speculative Code Execution with Return Instructions)
+ vulnerability.
+
+ AMD-based UNRET and IBPB mitigations alone do not stop
+ sibling threads from influencing the predictions of
+ other sibling threads. For that reason, STIBP is used
+ on processors that support it, and SMT is mitigated on
+ processors that don't.
+
+ off - no mitigation
+ auto - automatically select a mitigation
+ auto,nosmt - automatically select a mitigation,
+ disabling SMT if necessary for
+ the full mitigation (only on Zen1
+ and older without STIBP).
+ ibpb - On AMD, mitigate short speculation
+ windows on basic block boundaries too.
+ Safe, highest perf impact. It also
+ enables STIBP if present. Not suitable
+ on Intel.
+ ibpb,nosmt - Like "ibpb" above but will disable SMT
+ when STIBP is not available. This is
+ the alternative for systems which do not
+ have STIBP.
+ unret - Force enable untrained return thunks,
+ only effective on AMD f15h-f17h based
+ systems.
+ unret,nosmt - Like unret, but will disable SMT when STIBP
+ is not available. This is the alternative for
+ systems which do not have STIBP.
+
+ Selecting 'auto' will choose a mitigation method at run
+ time according to the CPU.
+
+ Not specifying this option is equivalent to retbleed=auto.
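+
+ For example, to force the IBPB-based mitigation and
+ disable SMT when STIBP is unavailable:
+
+ Example: retbleed=ibpb,nosmt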
+
rfkill.default_state=
0 "airplane mode". All wifi, bluetooth, wimax, gps, fm,
etc. communication is blocked by default.
@@ -4945,6 +5442,8 @@
rodata= [KNL]
on Mark read-only kernel memory as read-only (default).
off Leave read-only kernel memory writable for debugging.
+ full Mark read-only kernel memory and aliases as read-only
+ [arm64]
rockchip.usb_uart
Enable the uart passthrough on the designated usb port
@@ -4982,6 +5481,18 @@
an IOTLB flush. Default is lazy flushing before reuse,
which is faster.
+ s390_iommu_aperture= [KNL,S390]
+ Specifies the size of the per device DMA address space
+ accessible through the DMA and IOMMU APIs as a decimal
+ factor of the size of main memory.
+ The default is 1, meaning that one can concurrently
+ use as many DMA addresses as there is physical memory
+ installed, if supported by hardware, and thus map all
+ of memory once. With a value of 2 one can map all of
+ memory twice, and so on. As a special case, a factor
+ of 0 imposes no restrictions other than those given
+ by hardware, at the cost of significant additional
+ memory use for tables.
+
sa1100ir [NET]
See drivers/net/irda/sa1100_ir.c.
@@ -5120,18 +5631,22 @@
1 -- enable.
Default value is 1.
- apparmor= [APPARMOR] Disable or enable AppArmor at boot time
- Format: { "0" | "1" }
- See security/apparmor/Kconfig help text
- 0 -- disable.
- 1 -- enable.
- Default value is set via kernel config option.
-
serialnumber [BUGS=X86-32]
+ sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
+
shapers= [NET]
Maximal number of shapers.
+ show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller
+ Limit apic dumping. The parameter defines the maximal
+ number of local apics being dumped. It can also be
+ set to "all", meaning no limit.
+ Format: { 1 (default) | 2 | ... | all }.
+ The parameter is only valid if apic=debug or
+ apic=verbose is specified.
+ Example: apic=debug show_lapic=all
+
simeth= [IA-64]
simscsi=
@@ -5152,7 +5667,7 @@
cache (risks via metadata attacks are mostly
unchanged). Debug options disable merging on their
own.
- For more information see Documentation/vm/slub.rst.
+ For more information see Documentation/mm/slub.rst.
slab_max_order= [MM, SLAB]
Determines the maximum allowed order for slabs.
@@ -5166,13 +5681,13 @@
slub_debug can create guard zones around objects and
may poison objects when not in use. Also tracks the
last alloc / free. For more information see
- Documentation/vm/slub.rst.
+ Documentation/mm/slub.rst.
slub_max_order= [MM, SLUB]
Determines the maximum allowed order for slabs.
A high setting may cause OOMs due to memory
fragmentation. For more information see
- Documentation/vm/slub.rst.
+ Documentation/mm/slub.rst.
slub_min_objects= [MM, SLUB]
The minimum number of objects per slab. SLUB will
@@ -5181,12 +5696,12 @@
the number of objects indicated. The higher the number
of objects the smaller the overhead of tracking slabs
and the less frequently locks need to be acquired.
- For more information see Documentation/vm/slub.rst.
+ For more information see Documentation/mm/slub.rst.
slub_min_order= [MM, SLUB]
Determines the minimum page order for slabs. Must be
lower than slub_max_order.
- For more information see Documentation/vm/slub.rst.
+ For more information see Documentation/mm/slub.rst.
slub_merge [MM, SLUB]
Same as slab_merge.
@@ -5198,6 +5713,17 @@
smart2= [HW]
Format: <io1>[,<io2>[,...,<io8>]]
+ smp.csd_lock_timeout= [KNL]
+ Specify the period of time in milliseconds
+ that smp_call_function() and friends will wait
+ for a CPU to release the CSD lock. This is
+ useful when diagnosing bugs involving CPUs
+ disabling interrupts for extended periods
+ of time. Defaults to 5,000 milliseconds, and
+ setting a value of zero disables this feature.
+ This feature may be more efficiently disabled
+ using the csdlock_debug- kernel parameter.
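+
+ For example (an arbitrary value), to allow CPUs 30
+ seconds to release the CSD lock before warning:
+
+ Example: smp.csd_lock_timeout=30000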
+
smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port
smsc-ircc2.ircc_sir= [HW] SIR base I/O port
@@ -5209,7 +5735,7 @@
1: Fast pin select (default)
2: ATC IRMode
- smt [KNL,S390] Set the maximum number of threads (logical
+ smt= [KNL,S390] Set the maximum number of threads (logical
CPUs) to use per physical CPU on systems capable of
symmetric multithreading (SMT). Will be capped to the
actual hardware limit.
@@ -5261,8 +5787,13 @@
Specific mitigations can also be selected manually:
retpoline - replace indirect branches
- retpoline,generic - google's original retpoline
- retpoline,amd - AMD-specific minimal thunk
+ retpoline,generic - Retpolines
+ retpoline,lfence - LFENCE; indirect branch
+ retpoline,amd - alias for retpoline,lfence
+ eibrs - Enhanced/Auto IBRS
+ eibrs,retpoline - Enhanced/Auto IBRS + Retpolines
+ eibrs,lfence - Enhanced/Auto IBRS + LFENCE
+ ibrs - use IBRS to protect kernel
Not specifying this option is equivalent to
spectre_v2=auto.
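
For example, on hardware with Enhanced IBRS, one might
explicitly select (an illustrative choice, not a
recommendation):

Example: spectre_v2=eibrs,retpoline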
@@ -5303,8 +5834,7 @@
auto - Kernel selects the mitigation depending on
the available CPU features and vulnerability.
- Default mitigation:
- If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
+ Default mitigation: "prctl"
Not specifying this option is equivalent to
spectre_v2_user=auto.
@@ -5348,7 +5878,7 @@
will disable SSB unless they explicitly opt out.
Default mitigations:
- X86: If CONFIG_SECCOMP=y "seccomp", otherwise "prctl"
+ X86: "prctl"
On powerpc the options are:
@@ -5426,6 +5956,30 @@
off: Disable mitigation and remove
performance impact to RDRAND and RDSEED
+ srcutree.big_cpu_lim [KNL]
+ Specifies the number of CPUs constituting a
+ large system, such that srcu_struct structures
+ should immediately allocate an srcu_node array.
+ This kernel-boot parameter defaults to 128,
+ but takes effect only when the low-order four
+ bits of srcutree.convert_to_big are equal to 3
+ (decide at boot).
+
+ srcutree.convert_to_big [KNL]
+ Specifies under what conditions an SRCU tree
+ srcu_struct structure will be converted to big
+ form, that is, with an srcu_node tree:
+
+ 0: Never.
+ 1: At init_srcu_struct() time.
+ 2: When rcutorture decides to.
+ 3: Decide at boot time (default).
+ 0x1X: Above plus if high contention.
+
+ Either way, the srcu_node tree will be sized based
+ on the actual runtime number of CPUs (nr_cpu_ids)
+ instead of the compile-time CONFIG_NR_CPUS.
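+
+ As a hypothetical example, 0x13 selects the
+ boot-time decision plus contention-based
+ conversion (the 0x10 bit):
+
+ Example: srcutree.convert_to_big=0x13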
+
srcutree.counter_wrap_check [KNL]
Specifies how frequently to check for
grace-period sequence counter wrap for the
@@ -5443,6 +5997,32 @@
expediting. Set to zero to disable automatic
expediting.
+ srcutree.srcu_max_nodelay [KNL]
+ Specifies the number of no-delay instances
+ per jiffy for which the SRCU grace period
+ worker thread will be rescheduled with zero
+ delay. Beyond this limit, the worker thread will
+ be rescheduled with a sleep delay of one jiffy.
+
+ srcutree.srcu_max_nodelay_phase [KNL]
+ Specifies the maximum number of non-sleeping
+ polls of readers per grace-period phase. Beyond
+ this limit, the grace-period worker thread will
+ be rescheduled with a sleep delay of one jiffy
+ between each rescan of the readers for a given
+ grace-period phase.
+
+ srcutree.srcu_retry_check_delay [KNL]
+ Specifies the number of microseconds of non-sleeping
+ delay between each non-sleeping poll of readers.
+
+ srcutree.small_contention_lim [KNL]
+ Specifies the number of update-side contention
+ events per jiffy that will be tolerated before
+ initiating a conversion of an srcu_struct
+ structure to big form. Note that the value of
+ srcutree.convert_to_big must have the 0x10 bit
+ set for contention-based conversions to occur.
+
ssbd= [ARM64,HW]
Speculative Store Bypass Disable control
@@ -5497,6 +6077,25 @@
stifb= [HW]
Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
+ strict_sas_size=
+ [X86]
+ Format: <bool>
+ Enable or disable strict sigaltstack size checks
+ against the required signal frame size which
+ depends on the supported FPU features. This can
+ be used to filter out binaries which have
+ not yet been made aware of AT_MINSIGSTKSZ.
+
+ stress_hpt [PPC]
+ Limits the number of kernel HPT entries in the hash
+ page table to increase the rate of hash page table
+ faults on kernel addresses.
+
+ stress_slb [PPC]
+ Limits the number of kernel SLB entries, and flushes
+ them frequently to increase the rate of SLB faults
+ on kernel addresses.
+
sunrpc.min_resvport=
sunrpc.max_resvport=
[NFS,SUNRPC]
@@ -5552,14 +6151,12 @@
This parameter controls use of the Protected
Execution Facility on pSeries.
- swapaccount=[0|1]
- [KNL] Enable accounting of swap in memory resource
- controller if no parameter or 1 is given or disable
- it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
-
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
- Format: { <int> | force | noforce }
+ Format: { <int> [,<int>] | force | noforce }
<int> -- Number of I/O TLB slabs
+ <int> -- Second integer after comma. Number of swiotlb
+ areas with their own lock. Will be rounded up
+ to a power of 2.
force -- force using of bounce buffers even if they
wouldn't be automatically used by the kernel
noforce -- Never use bounce buffers (for debugging)
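
A hypothetical example: 65536 I/O TLB slabs split into 8
independently locked areas (the second value would be
rounded up to a power of 2 if needed):

Example: swiotlb=65536,8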
@@ -5575,15 +6172,6 @@
later by a loaded module cannot be set this way.
Example: sysctl.vm.swappiness=40
- sysfs.deprecated=0|1 [KNL]
- Enable/disable old style sysfs layout for old udev
- on older distributions. When this option is enabled
- very new udev will not work anymore. When this option
- is disabled (or CONFIG_SYSFS_DEPRECATED not compiled)
- in older udev will not work anymore.
- Default depends on CONFIG_SYSFS_DEPRECATED_V2 set in
- the kernel configuration.
-
sysrq_always_enabled
[KNL]
Ignore sysrq setting - this boot parameter will
@@ -5599,7 +6187,8 @@
tdfx= [HW,DRM]
- test_suspend= [SUSPEND][,N]
+ test_suspend= [SUSPEND]
+ Format: { "mem" | "standby" | "freeze" }[,N]
Specify "mem" (for Suspend-to-RAM) or "standby" (for
standby suspend) or "freeze" (for suspend type freeze)
as the system sleep state during system startup with
@@ -5683,22 +6272,95 @@
This will guarantee that all the other pcrs
are saved.
+ tp_printk [FTRACE]
+ Have the tracepoints sent to printk as well as the
+ tracing ring buffer. This is useful for early boot up
+ where the system hangs or reboots and does not give the
+ option for reading the tracing buffer or performing a
+ ftrace_dump_on_oops.
+
+ To turn off having tracepoints sent to printk,
+ echo 0 > /proc/sys/kernel/tracepoint_printk
+ Note, echoing 1 into this file without the
+ tracepoint_printk kernel cmdline option has no effect.
+
+ The tp_printk_stop_on_boot (see below) can also be used
+ to stop the printing of events to console at
+ late_initcall_sync.
+
+ ** CAUTION **
+
+ Having tracepoints sent to printk() and activating
+ high-frequency tracepoints such as irq or sched can
+ cause the system to live lock.
+
+ tp_printk_stop_on_boot [FTRACE]
+ When tp_printk (above) is set, it can cause a lot of noise
+ on the console. It may be useful to only include the
+ printing of events during boot up, as user space may
+ make the system inoperable.
+
+ This command line option will stop the printing of events
+ to console at the late_initcall_sync() time frame.
+
trace_buf_size=nn[KMG]
[FTRACE] will set tracing buffer size on each cpu.
+ trace_clock= [FTRACE] Set the clock used for tracing events
+ at boot up.
+ local - Use the per CPU time stamp counter
+ (converted into nanoseconds). Fast, but
+ depending on the architecture, may not be
+ in sync between CPUs.
+ global - Event time stamps are synchronized across
+ CPUs. May be slower than the local clock,
+ but better for some race conditions.
+ counter - Simple counting of events (1, 2, ...)
+ note, some counts may be skipped due to the
+ infrastructure grabbing the clock more than
+ once per event.
+ uptime - Use jiffies as the time stamp.
+ perf - Use the same clock that perf uses.
+ mono - Use ktime_get_mono_fast_ns() for time stamps.
+ mono_raw - Use ktime_get_raw_fast_ns() for time
+ stamps.
+ boot - Use ktime_get_boot_fast_ns() for time stamps.
+ Architectures may add more clocks. See
+ Documentation/trace/ftrace.rst for more details.
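+
+ For example, to select CPU-synchronized time
+ stamps from early boot:
+
+ Example: trace_clock=global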
+
trace_event=[event-list]
[FTRACE] Set and start specified trace events in order
to facilitate early boot debugging. The event-list is a
comma-separated list of trace events to enable. See
also Documentation/trace/events.rst
+ trace_instance=[instance-info]
+ [FTRACE] Create a ring buffer instance early in boot up.
+ This will be listed in:
+
+ /sys/kernel/tracing/instances
+
+ Events can be enabled at the time the instance is created
+ via:
+
+ trace_instance=<name>,<system1>:<event1>,<system2>:<event2>
+
+ Note, the "<system*>:" portion is optional if the event is
+ unique.
+
+ trace_instance=foo,sched:sched_switch,irq_handler_entry,initcall
+
+ will enable the "sched_switch" event (note, the "sched:" is optional,
+ and the same thing would happen if it was left off), the
+ irq_handler_entry event, and all events under the "initcall" system.
+
trace_options=[option-list]
[FTRACE] Enable or disable tracer options at boot.
The option-list is a comma delimited list of options
that can be enabled or disabled just as if you were
to echo the option name into
- /sys/kernel/debug/tracing/trace_options
+ /sys/kernel/tracing/trace_options
For example, to enable stacktrace option (to dump the
stack trace of each event), add to the command line:
@@ -5708,42 +6370,30 @@
See also Documentation/trace/ftrace.rst "trace options"
section.
- tp_printk[FTRACE]
- Have the tracepoints sent to printk as well as the
- tracing ring buffer. This is useful for early boot up
- where the system hangs or reboots and does not give the
- option for reading the tracing buffer or performing a
- ftrace_dump_on_oops.
+ trace_trigger=[trigger-list]
+ [FTRACE] Add an event trigger to specific events.
+ Set a trigger on top of a specific event, with an optional
+ filter.
- To turn off having tracepoints sent to printk,
- echo 0 > /proc/sys/kernel/tracepoint_printk
- Note, echoing 1 into this file without the
- tracepoint_printk kernel cmdline option has no effect.
+ The format is "trace_trigger=<event>.<trigger>[ if <filter>],..."
+ where more than one comma-delimited trigger may be specified.
- The tp_printk_stop_on_boot (see below) can also be used
- to stop the printing of events to console at
- late_initcall_sync.
+ For example:
- ** CAUTION **
+ trace_trigger="sched_switch.stacktrace if prev_state == 2"
- Having tracepoints sent to printk() and activating high
- frequency tracepoints such as irq or sched, can cause
- the system to live lock.
+ The above will enable the "stacktrace" trigger on the "sched_switch"
+ event but only trigger it if the "prev_state" of the "sched_switch"
+ event is "2" (TASK_UNINTERUPTIBLE).
- tp_printk_stop_on_boot[FTRACE]
- When tp_printk (above) is set, it can cause a lot of noise
- on the console. It may be useful to only include the
- printing of events during boot up, as user space may
- make the system inoperable.
+ See also "Event triggers" in Documentation/trace/events.rst
- This command line option will stop the printing of events
- to console at the late_initcall_sync() time frame.
traceoff_on_warning
[FTRACE] enable this option to disable tracing when a
warning is hit. This turns off "tracing_on". Tracing can
be enabled again by echoing '1' into the "tracing_on"
- file located in /sys/kernel/debug/tracing/
+ file located in /sys/kernel/tracing/
This option is useful, as it disables the trace before
the WARNING dump is called, which prevents the trace to
@@ -5767,11 +6417,22 @@
sources:
- "tpm"
- "tee"
+ - "caam"
If not specified, the default is to iterate through
the trust source list starting with TPM and assign
as backend the first trust source that initializes
successfully.
+ trusted.rng= [KEYS]
+ Format: <string>
+ The RNG used to generate key material for trusted keys.
+ Can be one of:
+ - "kernel"
+ - the same value as trusted.source: "tpm" or "tee"
+ - "default"
+ If not specified, "default" is used. In this case,
+ the choice of RNG is left to each individual trust source.
+
tsc= Disable clocksource stability checks for TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
@@ -5790,6 +6451,16 @@
in situations with strict latency requirements (where
interruptions from clocksource watchdog are not
acceptable).
+ [x86] recalibrate: force recalibration against a HW timer
+ (HPET or PM timer) on systems whose TSC frequency was
+ obtained from HW or FW using either an MSR or CPUID(0x15).
+ Warn if the difference is more than 500 ppm.
+ [x86] watchdog: Use TSC as the watchdog clocksource with
+ which to check other HW timers (HPET or PM timer), but
+ only on systems where TSC has been deemed trustworthy.
+ This will be suppressed by an earlier tsc=nowatchdog and
+ can be overridden by a later tsc=nowatchdog. A console
+ message will flag any such suppression or overriding.
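+
+ Example: tsc=nowatchdog tsc=watchdog
+ Here the later tsc=watchdog overrides the earlier
+ tsc=nowatchdog, and a console message flags the
+ override.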
tsc_early_khz= [X86] Skip early TSC calibration and use the given
value instead. Useful when the early TSC frequency discovery
@@ -6079,7 +6750,7 @@
HIGHMEM regardless of setting
of CONFIG_HIGHPTE.
- vdso= [X86,SH]
+ vdso= [X86,SH,SPARC]
On X86_32, this is an alias for vdso32=. Otherwise:
vdso=1: enable VDSO (the default)
@@ -6105,11 +6776,12 @@
video= [FB] Frame buffer configuration
See Documentation/fb/modedb.rst.
- video.brightness_switch_enabled= [0,1]
+ video.brightness_switch_enabled= [ACPI]
+ Format: [0|1]
If set to 1, on receiving an ACPI notify event
generated by hotkey, video driver will adjust brightness
level and then send out the event to user space through
- the allocated input device; If set to 0, video driver
+ the allocated input device. If set to 0, video driver
will only send out the event without touching backlight
brightness level.
default: 1
@@ -6131,7 +6803,7 @@
Can be used multiple times for multiple devices.
vga= [BOOT,X86-32] Select a particular video mode
- See Documentation/x86/boot.rst and
+ See Documentation/arch/x86/boot.rst and
Documentation/admin-guide/svga.rst.
Use vga=ask for menu.
This is actually a boot loader parameter; the value is
@@ -6176,11 +6848,11 @@
functions are at fixed addresses, they make nice
targets for exploits that can control RIP.
- emulate [default] Vsyscalls turn into traps and are
- emulated reasonably safely. The vsyscall
- page is readable.
+ emulate Vsyscalls turn into traps and are emulated
+ reasonably safely. The vsyscall page is
+ readable.
- xonly Vsyscalls turn into traps and are
+ xonly [default] Vsyscalls turn into traps and are
emulated reasonably safely. The vsyscall
page is not readable.
@@ -6294,6 +6966,12 @@
When enabled, memory and cache locality will be
impacted.
+ writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of
+ ioremap_wc().
+
+ on - Enable writecombine, use WUC for ioremap_wc()
+ off - Disable writecombine, use SUC for ioremap_wc()
+
x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
default x2apic cluster mode on platforms
supporting x2apic.
@@ -6321,6 +6999,12 @@
Crash from Xen panic notifier, without executing late
panic() code such as dumping handler.
+ xen_msr_safe= [X86,XEN]
+ Format: <bool>
+ Select whether to always use non-faulting (safe) MSR
+ access functions when running as Xen PV guest. The
+ default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
+
xen_nopvspin [X86,XEN]
Disables the qspinlock slowpath using Xen PV optimizations.
This parameter is obsoleted by "nopvspin" parameter, which
@@ -6349,6 +7033,13 @@
improve timer resolution at the expense of processing
more timer interrupts.
+ xen.balloon_boot_timeout= [XEN]
+ The time (in seconds) to wait before giving up on booting
+ in case initial ballooning fails to free enough memory.
+ Applies only when running as HVM or PVH guest and
+ started with less memory configured than allowed at
+ max. Default is 180.
+
xen.event_eoi_delay= [XEN]
How long to delay EOI handling in case of event
storms (jiffies). Default is 10.
@@ -6364,16 +7055,6 @@
fairer and the number of possible event channels is
much higher. Default is on (use fifo events).
- nopv= [X86,XEN,KVM,HYPER_V,VMWARE]
- Disables the PV optimizations forcing the guest to run
- as generic guest with no PV drivers. Currently support
- XEN HVM, KVM, HYPER_V and VMWARE guest.
-
- nopvspin [X86,XEN,KVM]
- Disables the qspinlock slow path using PV optimizations
- which allow the hypervisor to 'idle' the guest on lock
- contention.
-
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
@@ -6387,6 +7068,12 @@
controller on both pseries and powernv
platforms. Only useful on POWER9 and above.
+ xive.store-eoi=off [PPC]
+ By default on POWER10 and above, the kernel will use
+ stores for EOI handling when the XIVE interrupt mode
+ is active. This option allows the XIVE driver to use
+ loads instead, as on POWER9.
+
xhci-hcd.quirks [USB,KNL]
A hex value specifying bitmask with supplemental xhci
host controller quirks. Meaning of each bit can be
@@ -6410,3 +7097,4 @@
memory, and other data can't be written using
xmon commands.
off xmon is disabled.
+
diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
index 5e51ee5b0358..993c2a05f5ee 100644
--- a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -25,7 +25,7 @@ References
- In order to locate kernel-generated OS jitter on CPU N:
- cd /sys/kernel/debug/tracing
+ cd /sys/kernel/tracing
echo 1 > max_graph_depth # Increase the "1" for more detail
echo function_graph > current_tracer
# run workload
@@ -208,7 +208,7 @@ Do at least one of the following:
2. Enable RCU to do its processing remotely via dyntick-idle by
doing all of the following:
- a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+ a. Build with CONFIG_NO_HZ=y.
b. Ensure that the CPU goes idle frequently, allowing other
CPUs to detect that it has passed through an RCU quiescent
state. If the kernel is built with CONFIG_NO_HZ_FULL=y,
diff --git a/Documentation/admin-guide/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst
index 6fbe165dcd27..67fd6932cef4 100644
--- a/Documentation/admin-guide/laptops/lg-laptop.rst
+++ b/Documentation/admin-guide/laptops/lg-laptop.rst
@@ -38,7 +38,7 @@ FN lock.
Battery care limit
------------------
-Writing 80/100 to /sys/devices/platform/lg-laptop/battery_care_limit
+Writing 80/100 to /sys/class/power_supply/CMB0/charge_control_end_threshold
sets the maximum capacity to charge the battery. Limiting the charge
reduces battery capacity loss over time.
diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
index 6721a80a2d4f..e27a1c3f634e 100644
--- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst
+++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
@@ -1488,7 +1488,7 @@ Example of command to set keyboard language is mentioned below::
Text corresponding to keyboard layout to be set in sysfs are: be(Belgian),
cz(Czech), da(Danish), de(German), en(English), es(Spain), et(Estonian),
fr(French), fr-ch(French(Switzerland)), hu(Hungarian), it(Italy), jp (Japan),
-nl(Dutch), nn(Norway), pl(Polish), pt(portugese), sl(Slovenian), sv(Sweden),
+nl(Dutch), nn(Norway), pl(Polish), pt(portuguese), sl(Slovenian), sv(Sweden),
tr(Turkey)
WWAN Antenna type
@@ -1520,15 +1520,15 @@ This sysfs attribute controls the keyboard "face" that will be shown on the
Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read
and set.
-- 1 = Home mode
-- 2 = Web-browser mode
-- 3 = Web-conference mode
-- 4 = Function mode
-- 5 = Layflat mode
+- 0 = Home mode
+- 1 = Web-browser mode
+- 2 = Web-conference mode
+- 3 = Function mode
+- 4 = Layflat mode
For more details about which buttons will appear depending on the mode, please
review the laptop's user guide:
-http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf
+https://download.lenovo.com/ibmdl/pub/pc/pccbbs/mobiles_pdf/x1carbon_2_ug_en.pdf
Battery charge control
----------------------
diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index d8fc9a59c086..4ff2cc291d18 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -317,7 +317,7 @@ All md devices contain:
suspended (not supported yet)
All IO requests will block. The array can be reconfigured.
- Writing this, if accepted, will block until array is quiessent
+ Writing this, if accepted, will block until array is quiescent
readonly
no resync can happen. no superblocks get written.
diff --git a/Documentation/admin-guide/media/bttv.rst b/Documentation/admin-guide/media/bttv.rst
index 125f6f47123d..58cbaf6df694 100644
--- a/Documentation/admin-guide/media/bttv.rst
+++ b/Documentation/admin-guide/media/bttv.rst
@@ -909,7 +909,7 @@ DE hat diverse Treiber fuer diese Modelle (Stand 09/2002):
- TVPhone98 (Bt878)
- AVerTV und TVCapture98 w/VCR (Bt 878)
- AVerTVStudio und TVPhone98 w/VCR (Bt878)
- - AVerTV GO Serie (Kein SVideo Input)
+ - AVerTV GO Series (Kein SVideo Input)
- AVerTV98 (BT-878 chip)
- AVerTV98 mit Fernbedienung (BT-878 chip)
- AVerTV/FM98 (BT-878 chip)
diff --git a/Documentation/admin-guide/media/building.rst b/Documentation/admin-guide/media/building.rst
index 2d660b76caea..a06473429916 100644
--- a/Documentation/admin-guide/media/building.rst
+++ b/Documentation/admin-guide/media/building.rst
@@ -137,7 +137,7 @@ The ``LIRC user interface`` option adds enhanced functionality when using the
from remote controllers.
The ``Support for eBPF programs attached to lirc devices`` option allows
-the usage of special programs (called eBPF) that would allow aplications
+the usage of special programs (called eBPF) that would allow applications
to add extra remote controller decoding functionality to the Linux Kernel.
The ``Remote controller decoders`` option allows selecting the
diff --git a/Documentation/admin-guide/media/cec-drivers.rst b/Documentation/admin-guide/media/cec-drivers.rst
deleted file mode 100644
index 8d9686c08df9..000000000000
--- a/Documentation/admin-guide/media/cec-drivers.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=================================
-CEC driver-specific documentation
-=================================
-
-.. toctree::
- :maxdepth: 2
-
- pulse8-cec
diff --git a/Documentation/admin-guide/media/cec.rst b/Documentation/admin-guide/media/cec.rst
new file mode 100644
index 000000000000..6b30e355cf23
--- /dev/null
+++ b/Documentation/admin-guide/media/cec.rst
@@ -0,0 +1,380 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========
+HDMI CEC
+========
+
+Supported hardware in mainline
+==============================
+
+HDMI Transmitters:
+
+- Exynos4
+- Exynos5
+- STIH4xx HDMI CEC
+- V4L2 adv7511 (same HW, but a different driver from the drm adv7511)
+- stm32
+- Allwinner A10 (sun4i)
+- Raspberry Pi
+- dw-hdmi (Synopsys IP)
+- amlogic (meson ao-cec and ao-cec-g12a)
+- drm adv7511/adv7533
+- omap4
+- tegra
+- rk3288, rk3399
+- tda998x
+- DisplayPort CEC-Tunneling-over-AUX on i915, nouveau and amdgpu
+- ChromeOS EC CEC
+- CEC for SECO boards (UDOO x86).
+- Chrontel CH7322
+
+
+HDMI Receivers:
+
+- adv7604/11/12
+- adv7842
+- tc358743
+
+USB Dongles (see below for additional information on how to use these
+dongles):
+
+- Pulse-Eight: the pulse8-cec driver implements the following module option:
+ ``persistent_config``: by default this is off, but when set to 1 the driver
+ will store the current settings to the device's internal eeprom and restore
+ them the next time the device is connected to the USB port.
+- RainShadow Tech. Note: this driver does not support the persistent_config
+ module option of the Pulse-Eight driver. The hardware supports it, but I
+ have no plans to add this feature. But I accept patches :-)
+
+Miscellaneous:
+
+- vivid: emulates a CEC receiver and CEC transmitter.
+ Can be used to test CEC applications without actual CEC hardware.
+
+- cec-gpio. If the CEC pin is hooked up to a GPIO pin then
+ you can control the CEC line through this driver. This supports error
+ injection as well.
+
+- cec-gpio and Allwinner A10 (or any other driver that uses the CEC pin
+ framework to drive the CEC pin directly): the CEC pin framework uses
+ high-resolution timers. These timers are affected by NTP daemons that
+ speed up or slow down the clock to sync with the official time. The
+ chronyd server will by default increase or decrease the clock by
+ 1/12th. This will cause the CEC timings to go out of spec. To fix this,
+ add a 'maxslewrate 40000' line to chronyd.conf. This limits the clock
+ frequency change to 1/25th, which keeps the CEC timings within spec.
+
+
+Utilities
+=========
+
+Utilities are available here: https://git.linuxtv.org/v4l-utils.git
+
+``utils/cec-ctl``: control a CEC device
+
+``utils/cec-compliance``: test compliance of a remote CEC device
+
+``utils/cec-follower``: emulate a CEC follower device
+
+Note that ``cec-ctl`` has support for the CEC Hospitality Profile as is
+used in some hotel displays. See http://www.htng.org.
+
+Note that the libcec library (https://github.com/Pulse-Eight/libcec) supports
+the linux CEC framework.
+
+If you want to get the CEC specification, then look at the References of
+the HDMI wikipedia page: https://en.wikipedia.org/wiki/HDMI. CEC is part
+of the HDMI specification. HDMI 1.3 is freely available (very similar to
+HDMI 1.4 w.r.t. CEC) and should be good enough for most things.
+
+
+DisplayPort to HDMI Adapters with working CEC
+=============================================
+
+Background: most adapters do not support the CEC Tunneling feature,
+and of those that do, many do not actually connect the CEC pin.
+Unfortunately, this means that while a CEC device is created, it
+is actually all alone in the world and will never be able to see other
+CEC devices.
+
+This is a list of known working adapters that have CEC Tunneling AND
+that properly connected the CEC pin. If you find adapters that work
+but are not in this list, then drop me a note.
+
+To test: hook up your DP-to-HDMI adapter to a CEC capable device
+(typically a TV), then run::
+
+ cec-ctl --playback # Configure the PC as a CEC Playback device
+ cec-ctl -S # Show the CEC topology
+
+The ``cec-ctl -S`` command should show at least two CEC devices,
+ourselves and the CEC device you are connected to (i.e. typically the TV).
+
+General note: I have only seen this work with the Parade PS175, PS176 and
+PS186 chipsets and the MegaChips 2900. While MegaChips 28x0 claims CEC support,
+I have never seen it work.
+
+USB-C to HDMI
+-------------
+
+Samsung Multiport Adapter EE-PW700: https://www.samsung.com/ie/support/model/EE-PW700BBEGWW/
+
+Kramer ADC-U31C/HF: https://www.kramerav.com/product/ADC-U31C/HF
+
+Club3D CAC-2504: https://www.club-3d.com/en/detail/2449/usb_3.1_type_c_to_hdmi_2.0_uhd_4k_60hz_active_adapter/
+
+DisplayPort to HDMI
+-------------------
+
+Club3D CAC-1080: https://www.club-3d.com/en/detail/2442/displayport_1.4_to_hdmi_2.0b_hdr/
+
+CableCreation (SKU: CD0712): https://www.cablecreation.com/products/active-displayport-to-hdmi-adapter-4k-hdr
+
+HP DisplayPort to HDMI True 4k Adapter (P/N 2JA63AA): https://www.hp.com/us-en/shop/pdp/hp-displayport-to-hdmi-true-4k-adapter
+
+Mini-DisplayPort to HDMI
+------------------------
+
+Club3D CAC-1180: https://www.club-3d.com/en/detail/2443/mini_displayport_1.4_to_hdmi_2.0b_hdr/
+
+Note that passive adapters will never work; you need an active adapter.
+
+The Club3D adapters in this list are all MegaChips 2900 based. Other Club3D adapters
+are PS176 based and do NOT have the CEC pin hooked up, so only the three Club3D
+adapters above are known to work.
+
+I suspect that MegaChips 2900 based designs in general are likely to work
+whereas with the PS176 it is more hit-and-miss (mostly miss). The PS186 is
+likely to have the CEC pin hooked up, it looks like they changed the reference
+design for that chipset.
+
+
+USB CEC Dongles
+===============
+
+These dongles appear as ``/dev/ttyACMX`` devices and need the ``inputattach``
+utility to create the ``/dev/cecX`` devices. Support for the Pulse-Eight
+has been added to ``inputattach`` 1.6.0. Support for the Rainshadow Tech has
+been added to ``inputattach`` 1.6.1.
+
+You also need udev rules to automatically start systemd services::
+
+ SUBSYSTEM=="tty", KERNEL=="ttyACM[0-9]*", ATTRS{idVendor}=="2548", ATTRS{idProduct}=="1002", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}+="pulse8-cec-inputattach@%k.service"
+ SUBSYSTEM=="tty", KERNEL=="ttyACM[0-9]*", ATTRS{idVendor}=="2548", ATTRS{idProduct}=="1001", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}+="pulse8-cec-inputattach@%k.service"
+ SUBSYSTEM=="tty", KERNEL=="ttyACM[0-9]*", ATTRS{idVendor}=="04d8", ATTRS{idProduct}=="ff59", ACTION=="add", TAG+="systemd", ENV{SYSTEMD_WANTS}+="rainshadow-cec-inputattach@%k.service"
+
+and these systemd services:
+
+For Pulse-Eight make /lib/systemd/system/pulse8-cec-inputattach@.service::
+
+ [Unit]
+ Description=inputattach for pulse8-cec device on %I
+
+ [Service]
+ Type=simple
+ ExecStart=/usr/bin/inputattach --pulse8-cec /dev/%I
+
+For the RainShadow Tech make /lib/systemd/system/rainshadow-cec-inputattach@.service::
+
+ [Unit]
+ Description=inputattach for rainshadow-cec device on %I
+
+ [Service]
+ Type=simple
+ ExecStart=/usr/bin/inputattach --rainshadow-cec /dev/%I
+
+
+For proper suspend/resume support create: /lib/systemd/system/restart-cec-inputattach.service::
+
+ [Unit]
+ Description=restart inputattach for cec devices
+ After=suspend.target
+
+ [Service]
+ Type=forking
+ ExecStart=/bin/bash -c 'for d in /dev/serial/by-id/usb-Pulse-Eight*; do /usr/bin/inputattach --daemon --pulse8-cec $d; done; for d in /dev/serial/by-id/usb-RainShadow_Tech*; do /usr/bin/inputattach --daemon --rainshadow-cec $d; done'
+
+ [Install]
+ WantedBy=suspend.target
+
+And run ``systemctl enable restart-cec-inputattach``.
+
+To automatically set the physical address of the CEC device whenever the
+EDID changes, you can use ``cec-ctl`` with the ``-E`` option::
+
+ cec-ctl -E /sys/class/drm/card0-DP-1/edid
+
+This assumes the dongle is connected to the card0-DP-1 output (``xrandr`` will tell
+you which output is used) and it will poll for changes to the EDID and update
+the Physical Address whenever they occur.
+
+To automatically run this command you can use cron. Edit crontab with
+``crontab -e`` and add this line::
+
+ @reboot /usr/local/bin/cec-ctl -E /sys/class/drm/card0-DP-1/edid
+
+This only works for display drivers that expose the EDID in ``/sys/class/drm``,
+such as the i915 driver.
+
+
+CEC Without HPD
+===============
+
+Some displays when in standby mode have no HDMI Hotplug Detect signal, but
+CEC is still enabled so connected devices can send an <Image View On> CEC
+message in order to wake up such displays. Unfortunately, not all CEC
+adapters can support this. An example is the Odroid-U3 SBC that has a
+level-shifter that is powered off when the HPD signal is low, thus
+blocking the CEC pin. Even though the SoC can use CEC without a HPD,
+the level-shifter will prevent this from functioning.
+
+There is a CEC capability flag to signal this: ``CEC_CAP_NEEDS_HPD``.
+If set, then the hardware cannot wake up displays with this behavior.
+
+Note for CEC application implementers: the <Image View On> message must
+be the first message you send; don't send any other messages before it.
+Certain very bad but unfortunately not uncommon CEC implementations
+get very confused if they receive anything else but this message and
+they won't wake up.
+
+When writing a driver it can be tricky to test this. There are two
+ways to do this:
+
+1) Get a Pulse-Eight USB CEC dongle, connect an HDMI cable from your
+ device to the Pulse-Eight, but do not connect the Pulse-Eight to
+ the display.
+
+ Now configure the Pulse-Eight dongle::
+
+ cec-ctl -p0.0.0.0 --tv
+
+ and start monitoring::
+
+ sudo cec-ctl -M
+
+ On the device you are testing run::
+
+ cec-ctl --playback
+
+ It should report a physical address of f.f.f.f. Now run this
+ command::
+
+ cec-ctl -t0 --image-view-on
+
+ The Pulse-Eight should see the <Image View On> message. If not,
+ then something (hardware and/or software) is preventing the CEC
+ message from going out.
+
+ To make sure you have the wiring correct just connect the
+ Pulse-Eight to a CEC-enabled display and run the same command
+ on your device: now there is a HPD, so you should see the command
+ arriving at the Pulse-Eight.
+
+2) If you have another linux device supporting CEC without HPD, then
+ you can just connect your device to that device. Yes, you can connect
+ two HDMI outputs together. You won't have a HPD (which is what we
+ want for this test), but the second device can monitor the CEC pin.
+
+ Otherwise use the same commands as in 1.
+
+If CEC messages do not come through when there is no HPD, then you
+need to figure out why. Typically it is either a hardware restriction
+or the software powers off the CEC core when the HPD goes low. The
+first cannot be corrected of course; the second will likely require
+driver changes.
+
+
+Microcontrollers & CEC
+======================
+
+We have seen some CEC implementations in displays that use a microcontroller
+to sample the bus. This does not have to be a problem, but some implementations
+have timing issues. This is hard to discover unless you can hook up a low-level
+CEC debugger (see the next section).
+
+You will see cases where the CEC transmitter holds the CEC line high or low for
+a longer time than is allowed. For directed messages this is not a problem since
+if that happens the message will not be Acked and it will be retransmitted.
+For broadcast messages no such mechanism exists.
+
+It's not clear what to do about this. It is probably wise to transmit some
+broadcast messages twice to reduce the chance of them being lost. Specifically
+<Standby> and <Active Source> are candidates for that.
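+
+As a sketch, assuming ``cec-ctl`` auto-generates a ``--standby`` message
+option analogous to the ``--image-view-on`` option used earlier in this
+document (with 15 as the CEC broadcast destination), a broadcast <Standby>
+could simply be sent twice::
+
+    cec-ctl -t15 --standby
+    cec-ctl -t15 --standby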
+
+
+Making a CEC debugger
+=====================
+
+By using a Raspberry Pi 4B and some cheap components you can make
+your own low-level CEC debugger.
+
+The critical component is one of these HDMI female-female passthrough connectors
+(full soldering type 1):
+
+https://elabbay.myshopify.com/collections/camera/products/hdmi-af-af-v1a-hdmi-type-a-female-to-hdmi-type-a-female-pass-through-adapter-breakout-board?variant=45533926147
+
+The video quality is variable and certainly not enough to pass through 4kp60
+(594 MHz) video. You might be able to support 4kp30, but more likely you will
+be limited to 1080p60 (148.5 MHz). But for CEC testing that is fine.
+
+You need a breadboard and some breadboard wires:
+
+http://www.dx.com/p/diy-40p-male-to-female-male-to-male-female-to-female-dupont-line-wire-3pcs-356089#.WYLOOXWGN7I
+
+If you want to monitor the HPD and/or 5V lines as well, then you need one of
+these 5V to 3.3V level shifters:
+
+https://www.adafruit.com/product/757
+
+(This is just where I got these components, there are many other places you
+can get similar things).
+
+The ground pin of the HDMI connector needs to be connected to a ground
+pin of the Raspberry Pi, of course.
+
+The CEC pin of the HDMI connector needs to be connected to these pins:
+GPIO 6 and GPIO 7. The optional HPD pin of the HDMI connector should
+be connected via the level shifter to these pins: GPIO 23 and GPIO 12.
+The optional 5V pin of the HDMI connector should be connected via the
+level shifter to these pins: GPIO 25 and GPIO 22. Monitoring the HPD and
+5V lines is not necessary, but it is helpful.
+
+This device tree addition in ``arch/arm/boot/dts/bcm2711-rpi-4-b.dts``
+will hook up the cec-gpio driver correctly::
+
+ cec@6 {
+ compatible = "cec-gpio";
+ cec-gpios = <&gpio 6 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
+ hpd-gpios = <&gpio 23 GPIO_ACTIVE_HIGH>;
+ v5-gpios = <&gpio 25 GPIO_ACTIVE_HIGH>;
+ };
+
+ cec@7 {
+ compatible = "cec-gpio";
+ cec-gpios = <&gpio 7 (GPIO_ACTIVE_HIGH|GPIO_OPEN_DRAIN)>;
+ hpd-gpios = <&gpio 12 GPIO_ACTIVE_HIGH>;
+ v5-gpios = <&gpio 22 GPIO_ACTIVE_HIGH>;
+ };
+
+If you haven't hooked up the HPD and/or 5V lines, then just delete those
+lines.
+
+This dts change will enable two cec GPIO devices: I typically use one to
+send/receive CEC commands and the other to monitor. If you monitor using
+an unconfigured CEC adapter then it will use GPIO interrupts which makes
+monitoring very accurate.
+
+If you just want to monitor traffic, then a single instance is sufficient.
+The minimum configuration is one HDMI female-female passthrough connector
+and two female-female breadboard wires: one for connecting the HDMI ground
+pin to a ground pin on the Raspberry Pi, and the other to connect the HDMI
+CEC pin to GPIO 6 on the Raspberry Pi.
+
+The documentation on how to use the error injection is here: :ref:`cec_pin_error_inj`.
+
+``cec-ctl --monitor-pin`` will do low-level CEC bus sniffing and analysis.
+You can also store the CEC traffic to file using ``--store-pin`` and analyze
+it later using ``--analyze-pin``.
+
+You can also use this as a full-fledged CEC device by configuring it
+using ``cec-ctl --tv -p0.0.0.0`` or ``cec-ctl --playback -p1.0.0.0``.
diff --git a/Documentation/admin-guide/media/cpia2.rst b/Documentation/admin-guide/media/cpia2.rst
deleted file mode 100644
index f6ffef686462..000000000000
--- a/Documentation/admin-guide/media/cpia2.rst
+++ /dev/null
@@ -1,145 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-The cpia2 driver
-================
-
-Authors: Peter Pregler <Peter_Pregler@email.com>,
-Scott J. Bertin <scottbertin@yahoo.com>, and
-Jarl Totland <Jarl.Totland@bdc.no> for the original cpia driver, which
-this one was modelled from.
-
-Introduction
-------------
-
-This is a driver for STMicroelectronics's CPiA2 (second generation
-Colour Processor Interface ASIC) based cameras. This camera outputs an MJPEG
-stream at up to vga size. It implements the Video4Linux interface as much as
-possible. Since the V4L interface does not support compressed formats, only
-an mjpeg enabled application can be used with the camera. We have modified the
-gqcam application to view this stream.
-
-The driver is implemented as two kernel modules. The cpia2 module
-contains the camera functions and the V4L interface. The cpia2_usb module
-contains usb specific functions. The main reason for this was the size of the
-module was getting out of hand, so I separated them. It is not likely that
-there will be a parallel port version.
-
-Features
---------
-
-- Supports cameras with the Vision stv6410 (CIF) and stv6500 (VGA) cmos
- sensors. I only have the vga sensor, so can't test the other.
-- Image formats: VGA, QVGA, CIF, QCIF, and a number of sizes in between.
- VGA and QVGA are the native image sizes for the VGA camera. CIF is done
- in the coprocessor by scaling QVGA. All other sizes are done by clipping.
-- Palette: YCrCb, compressed with MJPEG.
-- Some compression parameters are settable.
-- Sensor framerate is adjustable (up to 30 fps CIF, 15 fps VGA).
-- Adjust brightness, color, contrast while streaming.
-- Flicker control settable for 50 or 60 Hz mains frequency.
-
-Making and installing the stv672 driver modules
------------------------------------------------
-
-Requirements
-~~~~~~~~~~~~
-
-Video4Linux must be either compiled into the kernel or
-available as a module. Video4Linux2 is automatically detected and made
-available at compile time.
-
-Setup
-~~~~~
-
-Use ``modprobe cpia2`` to load and ``modprobe -r cpia2`` to unload. This
-may be done automatically by your distribution.
-
-Driver options
-~~~~~~~~~~~~~~
-
-.. tabularcolumns:: |p{13ex}|L|
-
-
-============== ========================================================
-Option Description
-============== ========================================================
-video_nr video device to register (0=/dev/video0, etc)
- range -1 to 64. default is -1 (first available)
- If you have more than 1 camera, this MUST be -1.
-buffer_size Size for each frame buffer in bytes (default 68k)
-num_buffers Number of frame buffers (1-32, default 3)
-alternate USB Alternate (2-7, default 7)
-flicker_freq Frequency for flicker reduction(50 or 60, default 60)
-flicker_mode 0 to disable, or 1 to enable flicker reduction.
- (default 0). This is only effective if the camera
- uses a stv0672 coprocessor.
-============== ========================================================
-
-Setting the options
-~~~~~~~~~~~~~~~~~~~
-
-If you are using modules, edit /etc/modules.conf and add an options
-line like this::
-
- options cpia2 num_buffers=3 buffer_size=65535
-
-If the driver is compiled into the kernel, at boot time specify them
-like this::
-
- cpia2.num_buffers=3 cpia2.buffer_size=65535
-
-What buffer size should I use?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The maximum image size depends on the alternate you choose, and the
-frame rate achieved by the camera. If the compression engine is able to
-keep up with the frame rate, the maximum image size is given by the table
-below.
-
-The compression engine starts out at maximum compression, and will
-increase image quality until it is close to the size in the table. As long
-as the compression engine can keep up with the frame rate, after a short time
-the images will all be about the size in the table, regardless of resolution.
-
-At low alternate settings, the compression engine may not be able to
-compress the image enough and will reduce the frame rate by producing larger
-images.
-
-The default of 68k should be good for most users. This will handle
-any alternate at frame rates down to 15fps. For lower frame rates, it may
-be necessary to increase the buffer size to avoid having frames dropped due
-to insufficient space.
-
-========== ========== ======== =====
-Alternate bytes/ms 15fps 30fps
-========== ========== ======== =====
- 2 128 8533 4267
- 3 384 25600 12800
- 4 640 42667 21333
- 5 768 51200 25600
- 6 896 59733 29867
- 7 1023 68200 34100
-========== ========== ======== =====
-
-Table: Image size(bytes)
-
-
-How many buffers should I use?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-For normal streaming, 3 should give the best results. With only 2,
-it is possible for the camera to finish sending one image just after a
-program has started reading the other. If this happens, the driver must drop
-a frame. The exception to this is if you have a heavily loaded machine. In
-this case use 2 buffers. You are probably not reading at the full frame rate.
-If the camera can send multiple images before a read finishes, it could
-overwrite the third buffer before the read finishes, leading to a corrupt
-image. Single and double buffering have extra checks to avoid overwriting.
-
-Using the camera
-~~~~~~~~~~~~~~~~
-
-We are providing a modified gqcam application to view the output. In
-order to avoid confusion, here it is called mview. There is also the qx5view
-program which can also control the lights on the qx5 microscope. MJPEG Tools
-(http://mjpeg.sourceforge.net) can also be used to record from the camera.
diff --git a/Documentation/admin-guide/media/davinci-vpbe.rst b/Documentation/admin-guide/media/davinci-vpbe.rst
deleted file mode 100644
index 9e6360fd02db..000000000000
--- a/Documentation/admin-guide/media/davinci-vpbe.rst
+++ /dev/null
@@ -1,65 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-The VPBE V4L2 driver design
-===========================
-
-Functional partitioning
------------------------
-
-Consists of the following:
-
- 1. V4L2 display driver
-
- Implements creation of video2 and video3 device nodes and
- provides v4l2 device interface to manage VID0 and VID1 layers.
-
- 2. Display controller
-
- Loads up VENC, OSD and external encoders such as ths8200. It provides
- a set of API calls to V4L2 drivers to set the output/standards
- in the VENC or external sub devices. It also provides
- a device object to access the services from OSD subdevice
- using sub device ops. The connection of external encoders to VENC LCD
- controller port is done at init time based on default output and standard
- selection or at run time when application change the output through
- V4L2 IOCTLs.
-
- When connected to an external encoder, vpbe controller is also responsible
- for setting up the interface between VENC and external encoders based on
- board specific settings (specified in board-xxx-evm.c). This allows
- interfacing external encoders such as ths8200. The setup_if_config()
- is implemented for this as well as configure_venc() (part of the next patch)
- API to set timings in VENC for a specific display resolution. As of this
- patch series, the interconnection and enabling and setting of the external
- encoders is not present, and would be a part of the next patch series.
-
- 3. VENC subdevice module
-
- Responsible for setting outputs provided through internal DACs and also
- setting timings at LCD controller port when external encoders are connected
- at the port or LCD panel timings required. When external encoder/LCD panel
- is connected, the timings for a specific standard/preset is retrieved from
- the board specific table and the values are used to set the timings in
- venc using non-standard timing mode.
-
- Support LCD Panel displays using the VENC. For example to support a Logic
- PD display, it requires setting up the LCD controller port with a set of
- timings for the resolution supported and setting the dot clock. So we could
- add the available outputs as a board specific entry (i.e add the "LogicPD"
- output name to board-xxx-evm.c). A table of timings for various LCDs
- supported can be maintained in the board specific setup file to support
- various LCD displays.As of this patch a basic driver is present, and this
- support for external encoders and displays forms a part of the next
- patch series.
-
- 4. OSD module
-
- OSD module implements all OSD layer management and hardware specific
- features. The VPBE module interacts with the OSD for enabling and
- disabling appropriate features of the OSD.
-
-Current status
---------------
-
-A fully functional working version of the V4L2 driver is available. This
-driver has been tested with NTSC and PAL standards and buffer streaming.
diff --git a/Documentation/admin-guide/media/dvb-drivers.rst b/Documentation/admin-guide/media/dvb-drivers.rst
index 8df637c375f9..66fa4edd0606 100644
--- a/Documentation/admin-guide/media/dvb-drivers.rst
+++ b/Documentation/admin-guide/media/dvb-drivers.rst
@@ -13,4 +13,3 @@ Digital TV driver-specific documentation
opera-firmware
technisat
ttusb-dec
- zr364xx
diff --git a/Documentation/admin-guide/media/fimc.rst b/Documentation/admin-guide/media/fimc.rst
index 56b149d9a527..267ef52fe387 100644
--- a/Documentation/admin-guide/media/fimc.rst
+++ b/Documentation/admin-guide/media/fimc.rst
@@ -14,7 +14,7 @@ data from LCD controller (FIMD) through the SoC internal writeback data
path. There are multiple FIMC instances in the SoCs (up to 4), having
slightly different capabilities, like pixel alignment constraints, rotator
availability, LCD writeback support, etc. The driver is located at
-drivers/media/platform/exynos4-is directory.
+drivers/media/platform/samsung/exynos4-is directory.
Supported SoCs
--------------
diff --git a/Documentation/admin-guide/media/i2c-cardlist.rst b/Documentation/admin-guide/media/i2c-cardlist.rst
index e60d459d18a9..1825a0bb47bd 100644
--- a/Documentation/admin-guide/media/i2c-cardlist.rst
+++ b/Documentation/admin-guide/media/i2c-cardlist.rst
@@ -58,27 +58,29 @@ Camera sensor devices
============ ==========================================================
Driver Name
============ ==========================================================
+ccs MIPI CCS compliant camera sensors (also SMIA++ and SMIA)
et8ek8 ET8EK8 camera sensor
hi556 Hynix Hi-556 sensor
+hi846 Hynix Hi-846 sensor
+imx208 Sony IMX208 sensor
imx214 Sony IMX214 sensor
imx219 Sony IMX219 sensor
imx258 Sony IMX258 sensor
imx274 Sony IMX274 sensor
imx290 Sony IMX290 sensor
imx319 Sony IMX319 sensor
+imx334 Sony IMX334 sensor
imx355 Sony IMX355 sensor
-m5mols Fujitsu M-5MOLS 8MP sensor
+imx412 Sony IMX412 sensor
mt9m001 mt9m001
-mt9m032 MT9M032 camera sensor
mt9m111 mt9m111, mt9m112 and mt9m131
mt9p031 Aptina MT9P031
-mt9t001 Aptina MT9T001
mt9t112 Aptina MT9T111/MT9T112
mt9v011 Micron mt9v011 sensor
mt9v032 Micron MT9V032 sensor
mt9v111 Aptina MT9V111 sensor
-noon010pc30 Siliconfile NOON010PC30 sensor
ov13858 OmniVision OV13858 sensor
+ov13b10 OmniVision OV13B10 sensor
ov2640 OmniVision OV2640 sensor
ov2659 OmniVision OV2659 sensor
ov2680 OmniVision OV2680 sensor
@@ -103,10 +105,6 @@ s5c73m3 Samsung S5C73M3 sensor
s5k4ecgx Samsung S5K4ECGX sensor
s5k5baf Samsung S5K5BAF sensor
s5k6a3 Samsung S5K6A3 sensor
-s5k6aa Samsung S5K6AAFX sensor
-smiapp SMIA++/SMIA sensor
-sr030pc30 Siliconfile SR030PC30 sensor
-vs6624 ST VS6624 sensor
============ ==========================================================
Flash devices
@@ -138,6 +136,7 @@ Driver Name
ad5820 AD5820 lens voice coil
ak7375 AK7375 lens voice coil
dw9714 DW9714 lens voice coil
+dw9768 DW9768 lens voice coil
dw9807-vcm DW9807 lens voice coil
============ ==========================================================
@@ -216,7 +215,6 @@ Video encoders
============ ==========================================================
Driver Name
============ ==========================================================
-ad9389b Analog Devices AD9389B encoder
adv7170 Analog Devices ADV7170 video encoder
adv7175 Analog Devices ADV7175 video encoder
adv7343 ADV7343 video encoder
@@ -278,7 +276,7 @@ tda9887 TDA 9885/6/7 analog IF demodulator
tea5761 TEA 5761 radio tuner
tea5767 TEA 5767 radio tuner
tua9001 Infineon TUA9001 silicon tuner
-tuner-xc2028 XCeive xc2028/xc3028 tuners
+xc2028 XCeive xc2028/xc3028 tuners
xc4000 Xceive XC4000 silicon tuner
xc5000 Xceive XC5000 silicon tuner
============ ==================================================
diff --git a/Documentation/admin-guide/media/imx7.rst b/Documentation/admin-guide/media/imx7.rst
index 1e442c97da47..2fa27718f52a 100644
--- a/Documentation/admin-guide/media/imx7.rst
+++ b/Documentation/admin-guide/media/imx7.rst
@@ -33,7 +33,7 @@ reference manual [#f1]_.
Entities
--------
-imx7-mipi-csi2
+imx-mipi-csi2
--------------
This is the MIPI CSI-2 receiver entity. It has one sink pad to receive the pixel
@@ -155,6 +155,66 @@ the resolutions supported by the sensor.
[fmt:SBGGR10_1X10/800x600@1/30 field:none colorspace:srgb]
-> "imx7-mipi-csis.0":0 [ENABLED]
+i.MX6ULL-EVK with OV5640
+------------------------
+
+On this platform a parallel OV5640 sensor is connected to the CSI port.
+The following example configures a video capture pipeline with an output
+of 640x480 and UYVY8_2X8 format:
+
+.. code-block:: none
+
+ # Setup links
+ media-ctl -l "'ov5640 1-003c':0 -> 'csi':0[1]"
+ media-ctl -l "'csi':1 -> 'csi capture':0[1]"
+
+ # Configure pads for pipeline
+ media-ctl -v -V "'ov5640 1-003c':0 [fmt:UYVY8_2X8/640x480 field:none]"
+
+After this streaming can start:
+
+.. code-block:: none
+
+ gst-launch-1.0 -v v4l2src device=/dev/video1 ! video/x-raw,format=UYVY,width=640,height=480 ! v4l2convert ! fbdevsink
+
+.. code-block:: none
+
+ # media-ctl -p
+ Media controller API version 5.14.0
+
+ Media device information
+ ------------------------
+ driver imx7-csi
+ model imx-media
+ serial
+ bus info
+ hw revision 0x0
+ driver version 5.14.0
+
+ Device topology
+ - entity 1: csi (2 pads, 2 links)
+ type V4L2 subdev subtype Unknown flags 0
+ device node name /dev/v4l-subdev0
+ pad0: Sink
+ [fmt:UYVY8_2X8/640x480 field:none colorspace:srgb xfer:srgb ycbcr:601 quantization:full-range]
+ <- "ov5640 1-003c":0 [ENABLED,IMMUTABLE]
+ pad1: Source
+ [fmt:UYVY8_2X8/640x480 field:none colorspace:srgb xfer:srgb ycbcr:601 quantization:full-range]
+ -> "csi capture":0 [ENABLED,IMMUTABLE]
+
+ - entity 4: csi capture (1 pad, 1 link)
+ type Node subtype V4L flags 0
+ device node name /dev/video1
+ pad0: Sink
+ <- "csi":1 [ENABLED,IMMUTABLE]
+
+ - entity 10: ov5640 1-003c (1 pad, 1 link)
+ type V4L2 subdev subtype Sensor flags 0
+ device node name /dev/v4l-subdev1
+ pad0: Source
+ [fmt:UYVY8_2X8/640x480@1/30 field:none colorspace:srgb xfer:srgb ycbcr:601 quantization:full-range]
+ -> "csi":0 [ENABLED,IMMUTABLE]
+
References
----------
diff --git a/Documentation/admin-guide/media/index.rst b/Documentation/admin-guide/media/index.rst
index c676af665111..43f4a292b245 100644
--- a/Documentation/admin-guide/media/index.rst
+++ b/Documentation/admin-guide/media/index.rst
@@ -38,13 +38,14 @@ The media subsystem
remote-controller
+ cec
+
dvb
cardlist
v4l-drivers
dvb-drivers
- cec-drivers
**Copyright** |copy| 1999-2020 : LinuxTV Developers
diff --git a/Documentation/admin-guide/media/ipu3.rst b/Documentation/admin-guide/media/ipu3.rst
index 52c1c04173da..83b3cd03b35c 100644
--- a/Documentation/admin-guide/media/ipu3.rst
+++ b/Documentation/admin-guide/media/ipu3.rst
@@ -51,10 +51,11 @@ to userspace as a V4L2 sub-device node and has two pads:
.. tabularcolumns:: |p{0.8cm}|p{4.0cm}|p{4.0cm}|
.. flat-table::
+ :header-rows: 1
- * - pad
- - direction
- - purpose
+ * - Pad
+ - Direction
+ - Purpose
* - 0
- sink
@@ -148,10 +149,11 @@ Each pipe has two sink pads and three source pads for the following purpose:
.. tabularcolumns:: |p{0.8cm}|p{4.0cm}|p{4.0cm}|
.. flat-table::
+ :header-rows: 1
- * - pad
- - direction
- - purpose
+ * - Pad
+ - Direction
+ - Purpose
* - 0
- sink
diff --git a/Documentation/admin-guide/media/ivtv.rst b/Documentation/admin-guide/media/ivtv.rst
index 7b8775d20214..101f16d0263e 100644
--- a/Documentation/admin-guide/media/ivtv.rst
+++ b/Documentation/admin-guide/media/ivtv.rst
@@ -159,7 +159,7 @@ whatever). Otherwise the device numbers can get confusing. The ivtv
Read-only
The raw YUV video output from the current video input. The YUV format
- is non-standard (V4L2_PIX_FMT_HM12).
+   is a 16x16 linear tiled NV12 format (V4L2_PIX_FMT_NV12_16L16).
Note that the YUV and PCM streams are not synchronized, so they are of
limited use.
diff --git a/Documentation/admin-guide/media/meye.rst b/Documentation/admin-guide/media/meye.rst
deleted file mode 100644
index 9098a1e65f8b..000000000000
--- a/Documentation/admin-guide/media/meye.rst
+++ /dev/null
@@ -1,93 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. include:: <isonum.txt>
-
-Vaio Picturebook Motion Eye Camera Driver
-=========================================
-
-Copyright |copy| 2001-2004 Stelian Pop <stelian@popies.net>
-
-Copyright |copy| 2001-2002 Alcôve <www.alcove.com>
-
-Copyright |copy| 2000 Andrew Tridgell <tridge@samba.org>
-
-This driver enable the use of video4linux compatible applications with the
-Motion Eye camera. This driver requires the "Sony Laptop Extras" driver (which
-can be found in the "Misc devices" section of the kernel configuration utility)
-to be compiled and installed (using its "camera=1" parameter).
-
-It can do at maximum 30 fps @ 320x240 or 15 fps @ 640x480.
-
-Grabbing is supported in packed YUV colorspace only.
-
-MJPEG hardware grabbing is supported via a private API (see below).
-
-Hardware supported
-------------------
-
-This driver supports the 'second' version of the MotionEye camera :)
-
-The first version was connected directly on the video bus of the Neomagic
-video card and is unsupported.
-
-The second one, made by Kawasaki Steel is fully supported by this
-driver (PCI vendor/device is 0x136b/0xff01)
-
-The third one, present in recent (more or less last year) Picturebooks
-(C1M* models), is not supported. The manufacturer has given the specs
-to the developers under a NDA (which allows the development of a GPL
-driver however), but things are not moving very fast (see
-http://r-engine.sourceforge.net/) (PCI vendor/device is 0x10cf/0x2011).
-
-There is a forth model connected on the USB bus in TR1* Vaio laptops.
-This camera is not supported at all by the current driver, in fact
-little information if any is available for this camera
-(USB vendor/device is 0x054c/0x0107).
-
-Driver options
---------------
-
-Several options can be passed to the meye driver using the standard
-module argument syntax (<param>=<value> when passing the option to the
-module or meye.<param>=<value> on the kernel boot line when meye is
-statically linked into the kernel). Those options are:
-
-.. code-block:: none
-
- gbuffers: number of capture buffers, default is 2 (32 max)
-
- gbufsize: size of each capture buffer, default is 614400
-
- video_nr: video device to register (0 = /dev/video0, etc)
-
-Module use
-----------
-
-In order to automatically load the meye module on use, you can put those lines
-in your /etc/modprobe.d/meye.conf file:
-
-.. code-block:: none
-
- alias char-major-81 videodev
- alias char-major-81-0 meye
- options meye gbuffers=32
-
-Usage:
-------
-
-.. code-block:: none
-
- xawtv >= 3.49 (<http://bytesex.org/xawtv/>)
- for display and uncompressed video capture:
-
- xawtv -c /dev/video0 -geometry 640x480
- or
- xawtv -c /dev/video0 -geometry 320x240
-
- motioneye (<http://popies.net/meye/>)
- for getting ppm or jpg snapshots, mjpeg video
-
-Bugs / Todo
------------
-
-- 'motioneye' still uses the meye private v4l1 API extensions.
diff --git a/Documentation/admin-guide/media/omap3isp.rst b/Documentation/admin-guide/media/omap3isp.rst
index bc447bbec7ce..f32e7375a1a2 100644
--- a/Documentation/admin-guide/media/omap3isp.rst
+++ b/Documentation/admin-guide/media/omap3isp.rst
@@ -17,7 +17,7 @@ Introduction
------------
This file documents the Texas Instruments OMAP 3 Image Signal Processor (ISP)
-driver located under drivers/media/platform/omap3isp. The original driver was
+driver located under drivers/media/platform/ti/omap3isp. The original driver was
written by Texas Instruments but since that it has been rewritten (twice) at
Nokia.
diff --git a/Documentation/admin-guide/media/omap4_camera.rst b/Documentation/admin-guide/media/omap4_camera.rst
index 24db4222d36d..2ada9b1e6897 100644
--- a/Documentation/admin-guide/media/omap4_camera.rst
+++ b/Documentation/admin-guide/media/omap4_camera.rst
@@ -25,7 +25,7 @@ As of Revision AB, the ISS is described in detail in section 8.
This driver is supporting **only** the CSI2-A/B interfaces for now.
It makes use of the Media Controller framework [#f2]_, and inherited most of the
-code from OMAP3 ISP driver (found under drivers/media/platform/omap3isp/\*),
+code from OMAP3 ISP driver (found under drivers/media/platform/ti/omap3isp/\*),
except that it doesn't need an IOMMU now for ISS buffers memory mapping.
Supports usage of MMAP buffers only (for now).
diff --git a/Documentation/admin-guide/media/other-usb-cardlist.rst b/Documentation/admin-guide/media/other-usb-cardlist.rst
index bbfdb1389c18..fb88db50e861 100644
--- a/Documentation/admin-guide/media/other-usb-cardlist.rst
+++ b/Documentation/admin-guide/media/other-usb-cardlist.rst
@@ -14,8 +14,6 @@ dvb-as102 nBox DVB-T Dongle 0b89:0007
dvb-as102 Sky IT Digital Key (green led) 2137:0001
b2c2-flexcop-usb Technisat/B2C2 FlexCop II/IIb/III 0af7:0101
Digital TV
-cpia2 Vision's CPiA2 cameras 0553:0100, 0553:0140,
- such as the Digital Blue QX5 0553:0151
go7007 WIS GO7007 MPEG encoder 1943:a250, 093b:a002,
093b:a004, 0eb1:6666,
0eb1:6668
@@ -66,7 +64,6 @@ pwc Visionite VCS-UC300 0d81:1900
pwc Visionite VCS-UM100 0d81:1910
s2255drv Sensoray 2255 1943:2255, 1943:2257
stk1160 STK1160 USB video capture dongle 05e1:0408
-stkwebcam Syntek DC1125 174f:a311, 05e1:0501
dvb-ttusb-budget Technotrend/Hauppauge Nova-USB devices 0b48:1003, 0b48:1004,
0b48:1005
dvb-ttusb_dec Technotrend/Hauppauge MPEG decoder 0b48:1006
@@ -78,15 +75,4 @@ dvb-ttusb_dec Technotrend/Hauppauge MPEG decoder
DEC2540-t 0b48:1009
usbtv Fushicai USBTV007 Audio-Video Grabber 1b71:3002, 1f71:3301,
1f71:3306
-zr364xx USB ZR364XX Camera 08ca:0109, 041e:4024,
- 0d64:0108, 0546:3187,
- 0d64:3108, 0595:4343,
- 0bb0:500d, 0feb:2004,
- 055f:b500, 08ca:2062,
- 052b:1a18, 04c8:0729,
- 04f2:a208, 0784:0040,
- 06d6:0034, 0a17:0062,
- 06d6:003b, 0a17:004e,
- 041e:405d, 08ca:2102,
- 06d6:003d
================ ====================================== =====================
diff --git a/Documentation/admin-guide/media/pci-cardlist.rst b/Documentation/admin-guide/media/pci-cardlist.rst
index f4d670e632f8..42528795d4da 100644
--- a/Documentation/admin-guide/media/pci-cardlist.rst
+++ b/Documentation/admin-guide/media/pci-cardlist.rst
@@ -77,7 +77,6 @@ ipu3-cio2 Intel ipu3-cio2 driver
ivtv Conexant cx23416/cx23415 MPEG encoder/decoder
ivtvfb Conexant cx23415 framebuffer
mantis MANTIS based cards
-meye Sony Vaio Picturebook Motion Eye
mxb Siemens-Nixdorf 'Multimedia eXtension Board'
netup-unidvb NetUP Universal DVB card
ngene Micronas nGene
diff --git a/Documentation/admin-guide/media/platform-cardlist.rst b/Documentation/admin-guide/media/platform-cardlist.rst
index 261e7772eb3e..1230ae4037ad 100644
--- a/Documentation/admin-guide/media/platform-cardlist.rst
+++ b/Documentation/admin-guide/media/platform-cardlist.rst
@@ -30,7 +30,6 @@ exynos-fimc-is EXYNOS4x12 FIMC-IS (Imaging Subsystem)
exynos-fimc-lite EXYNOS FIMC-LITE camera interface
exynos-gsc Samsung Exynos G-Scaler
exy Samsung S5P/EXYNOS4 SoC series Camera Subsystem
-fsl-viu Freescale VIU
imx-pxp i.MX Pixel Pipeline (PXP)
isdf TI DM365 ISIF video capture
mmp_camera Marvell Armada 610 integrated camera controller
@@ -60,6 +59,7 @@ s5p-mfc Samsung S5P MFC Video Codec
sh_veu SuperH VEU mem2mem video processing
sh_vou SuperH VOU video output
stm32-dcmi STM32 Digital Camera Memory Interface (DCMI)
+stm32-dma2d STM32 Chrom-Art Accelerator Unit
sun4i-csi Allwinner A10 CMOS Sensor Interface Support
sun6i-csi Allwinner V3s Camera Sensor Interface
sun8i-di Allwinner Deinterlace
@@ -72,7 +72,6 @@ via-camera VIAFB camera controller
video-mux Video Multiplexer
vpif_display TI DaVinci VPIF V4L2-Display
vpif_capture TI DaVinci VPIF video capture
-vpss TI DaVinci VPBE V4L2-Display
vsp1 Renesas VSP1 Video Processing Engine
xilinx-tpg Xilinx Video Test Pattern Generator
xilinx-video Xilinx Video IP (EXPERIMENTAL)
diff --git a/Documentation/admin-guide/media/pulse8-cec.rst b/Documentation/admin-guide/media/pulse8-cec.rst
deleted file mode 100644
index 356d08b519f3..000000000000
--- a/Documentation/admin-guide/media/pulse8-cec.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Pulse-Eight CEC Adapter driver
-==============================
-
-The pulse8-cec driver implements the following module option:
-
-``persistent_config``
----------------------
-
-By default this is off, but when set to 1 the driver will store the current
-settings to the device's internal eeprom and restore it the next time the
-device is connected to the USB port.
diff --git a/Documentation/admin-guide/media/si476x.rst b/Documentation/admin-guide/media/si476x.rst
index 87062301d6a1..c8882ee9f208 100644
--- a/Documentation/admin-guide/media/si476x.rst
+++ b/Documentation/admin-guide/media/si476x.rst
@@ -142,7 +142,7 @@ The drivers exposes following files:
indicator
0x18 lassi Signed Low side adjacent Channel
Strength indicator
- 0x19 hassi ditto fpr High side
+ 0x19 hassi ditto for High side
0x20 mult Multipath indicator
0x21 dev Frequency deviation
0x24 assi Adjacent channel SSI
diff --git a/Documentation/admin-guide/media/tm6000-cardlist.rst b/Documentation/admin-guide/media/tm6000-cardlist.rst
deleted file mode 100644
index 6d2769c0f4d8..000000000000
--- a/Documentation/admin-guide/media/tm6000-cardlist.rst
+++ /dev/null
@@ -1,83 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-TM6000 cards list
-=================
-
-.. tabularcolumns:: |p{1.4cm}|p{11.1cm}|p{4.2cm}|
-
-.. flat-table::
- :header-rows: 1
- :widths: 2 19 18
- :stub-columns: 0
-
- * - Card number
- - Card name
- - USB IDs
-
- * - 0
- - Unknown tm6000 video grabber
- -
-
- * - 1
- - Generic tm5600 board
- - 6000:0001
-
- * - 2
- - Generic tm6000 board
- -
-
- * - 3
- - Generic tm6010 board
- - 6000:0002
-
- * - 4
- - 10Moons UT 821
- -
-
- * - 5
- - 10Moons UT 330
- -
-
- * - 6
- - ADSTECH Dual TV USB
- - 06e1:f332
-
- * - 7
- - Freecom Hybrid Stick / Moka DVB-T Receiver Dual
- - 14aa:0620
-
- * - 8
- - ADSTECH Mini Dual TV USB
- - 06e1:b339
-
- * - 9
- - Hauppauge WinTV HVR-900H / WinTV USB2-Stick
- - 2040:6600, 2040:6601, 2040:6610, 2040:6611
-
- * - 10
- - Beholder Wander DVB-T/TV/FM USB2.0
- - 6000:dec0
-
- * - 11
- - Beholder Voyager TV/FM USB2.0
- - 6000:dec1
-
- * - 12
- - Terratec Cinergy Hybrid XE / Cinergy Hybrid-Stick
- - 0ccd:0086, 0ccd:00A5
-
- * - 13
- - Twinhan TU501(704D1)
- - 13d3:3240, 13d3:3241, 13d3:3243, 13d3:3264
-
- * - 14
- - Beholder Wander Lite DVB-T/TV/FM USB2.0
- - 6000:dec2
-
- * - 15
- - Beholder Voyager Lite TV/FM USB2.0
- - 6000:dec3
-
- * - 16
- - Terratec Grabster AV 150/250 MX
- - 0ccd:0079
diff --git a/Documentation/admin-guide/media/usb-cardlist.rst b/Documentation/admin-guide/media/usb-cardlist.rst
index 1e96f928e0af..5f5ab0723e48 100644
--- a/Documentation/admin-guide/media/usb-cardlist.rst
+++ b/Documentation/admin-guide/media/usb-cardlist.rst
@@ -43,7 +43,6 @@ Driver Name
airspy AirSpy
au0828 Auvitek AU0828
b2c2-flexcop-usb Technisat/B2C2 Air/Sky/Cable2PC USB
-cpia2 CPiA2 Video For Linux
cx231xx Conexant cx231xx USB video capture
dvb-as102 Abilis AS102 DVB receiver
dvb-ttusb-budget Technotrend/Hauppauge Nova - USB devices
@@ -93,15 +92,10 @@ pwc USB Philips Cameras
s2250 Sensoray 2250/2251
s2255drv USB Sensoray 2255 video capture device
smsusb Siano SMS1xxx based MDTV receiver
-stkwebcam USB Syntek DC1125 Camera
-tm6000-alsa TV Master TM5600/6000/6010 audio
-tm6000-dvb DVB Support for tm6000 based TV cards
-tm6000 TV Master TM5600/6000/6010 driver
ttusb_dec Technotrend/Hauppauge USB DEC devices
usbtv USBTV007 video capture
uvcvideo USB Video Class (UVC)
zd1301 ZyDAS ZD1301
-zr364xx USB ZR364XX Camera
====================== =========================================================
.. toctree::
@@ -110,7 +104,6 @@ zr364xx USB ZR364XX Camera
au0828-cardlist
cx231xx-cardlist
em28xx-cardlist
- tm6000-cardlist
siano-cardlist
gspca-cardlist
diff --git a/Documentation/admin-guide/media/v4l-drivers.rst b/Documentation/admin-guide/media/v4l-drivers.rst
index 9c7ebe2ca3bd..1c41f87c3917 100644
--- a/Documentation/admin-guide/media/v4l-drivers.rst
+++ b/Documentation/admin-guide/media/v4l-drivers.rst
@@ -11,15 +11,12 @@ Video4Linux (V4L) driver-specific documentation
bttv
cafe_ccic
- cpia2
cx88
- davinci-vpbe
fimc
imx
imx7
ipu3
ivtv
- meye
omap3isp
omap4_camera
philips
@@ -31,4 +28,5 @@ Video4Linux (V4L) driver-specific documentation
si4713
si476x
vimc
+ visl
vivid
diff --git a/Documentation/admin-guide/media/vimc.dot b/Documentation/admin-guide/media/vimc.dot
index 57863a13fa39..92a5bb631235 100644
--- a/Documentation/admin-guide/media/vimc.dot
+++ b/Documentation/admin-guide/media/vimc.dot
@@ -5,18 +5,22 @@ digraph board {
n00000001 [label="{{} | Sensor A\n/dev/v4l-subdev0 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
n00000001:port0 -> n00000005:port0 [style=bold]
n00000001:port0 -> n0000000b [style=bold]
+ n00000001 -> n00000002
+ n00000002 [label="{{} | Lens A\n/dev/v4l-subdev5 | {<port0>}}", shape=Mrecord, style=filled, fillcolor=green]
n00000003 [label="{{} | Sensor B\n/dev/v4l-subdev1 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
n00000003:port0 -> n00000008:port0 [style=bold]
n00000003:port0 -> n0000000f [style=bold]
+ n00000003 -> n00000004
+ n00000004 [label="{{} | Lens B\n/dev/v4l-subdev6 | {<port0>}}", shape=Mrecord, style=filled, fillcolor=green]
n00000005 [label="{{<port0> 0} | Debayer A\n/dev/v4l-subdev2 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
- n00000005:port1 -> n00000017:port0
+ n00000005:port1 -> n00000015:port0
n00000008 [label="{{<port0> 0} | Debayer B\n/dev/v4l-subdev3 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
- n00000008:port1 -> n00000017:port0 [style=dashed]
+ n00000008:port1 -> n00000015:port0 [style=dashed]
n0000000b [label="Raw Capture 0\n/dev/video0", shape=box, style=filled, fillcolor=yellow]
n0000000f [label="Raw Capture 1\n/dev/video1", shape=box, style=filled, fillcolor=yellow]
- n00000013 [label="RGB/YUV Input\n/dev/video2", shape=box, style=filled, fillcolor=yellow]
- n00000013 -> n00000017:port0 [style=dashed]
- n00000017 [label="{{<port0> 0} | Scaler\n/dev/v4l-subdev4 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
- n00000017:port1 -> n0000001a [style=bold]
- n0000001a [label="RGB/YUV Capture\n/dev/video3", shape=box, style=filled, fillcolor=yellow]
+ n00000013 [label="{{} | RGB/YUV Input\n/dev/v4l-subdev4 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000013:port0 -> n00000015:port0 [style=dashed]
+ n00000015 [label="{{<port0> 0} | Scaler\n/dev/v4l-subdev5 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000015:port1 -> n00000018 [style=bold]
+ n00000018 [label="RGB/YUV Capture\n/dev/video2", shape=box, style=filled, fillcolor=yellow]
}
diff --git a/Documentation/admin-guide/media/vimc.rst b/Documentation/admin-guide/media/vimc.rst
index 211cc8972410..29d843a8ddb1 100644
--- a/Documentation/admin-guide/media/vimc.rst
+++ b/Documentation/admin-guide/media/vimc.rst
@@ -35,11 +35,11 @@ of commands fits for the default topology:
media-ctl -d platform:vimc -V '"Sensor A":0[fmt:SBGGR8_1X8/640x480]'
media-ctl -d platform:vimc -V '"Debayer A":0[fmt:SBGGR8_1X8/640x480]'
- media-ctl -d platform:vimc -V '"Sensor B":0[fmt:SBGGR8_1X8/640x480]'
- media-ctl -d platform:vimc -V '"Debayer B":0[fmt:SBGGR8_1X8/640x480]'
- v4l2-ctl -z platform:vimc -d "RGB/YUV Capture" -v width=1920,height=1440
+ media-ctl -d platform:vimc -V '"Scaler":0[fmt:RGB888_1X24/640x480]'
+ media-ctl -d platform:vimc -V '"Scaler":0[crop:(100,50)/400x150]'
+ media-ctl -d platform:vimc -V '"Scaler":1[fmt:RGB888_1X24/300x700]'
+ v4l2-ctl -z platform:vimc -d "RGB/YUV Capture" -v width=300,height=700
v4l2-ctl -z platform:vimc -d "Raw Capture 0" -v pixelformat=BA81
- v4l2-ctl -z platform:vimc -d "Raw Capture 1" -v pixelformat=BA81
Subdevices
----------
@@ -53,6 +53,25 @@ vimc-sensor:
* 1 Pad source
+vimc-lens:
+  Ancillary lens for a sensor. Linked to a vimc-sensor using an ancillary
+  link. The lens supports the auto focus FOCUS_ABSOLUTE control.
+
+.. code-block:: bash
+
+ media-ctl -p
+ ...
+ - entity 28: Lens A (0 pad, 0 link)
+ type V4L2 subdev subtype Lens flags 0
+ device node name /dev/v4l-subdev6
+ - entity 29: Lens B (0 pad, 0 link)
+ type V4L2 subdev subtype Lens flags 0
+ device node name /dev/v4l-subdev7
+ v4l2-ctl -d /dev/v4l-subdev7 -C focus_absolute
+ focus_absolute: 0
+
+
vimc-debayer:
Transforms images in bayer format into a non-bayer format.
Exposes:
@@ -61,9 +80,10 @@ vimc-debayer:
* 1 Pad source
vimc-scaler:
- Scale up the image by a factor of 3. E.g.: a 640x480 image becomes a
- 1920x1440 image. (this value can be configured, see at
- `Module options`_).
+  Resizes the image to match the source pad resolution. E.g.: if the sink
+  pad is configured to 360x480 and the source to 1280x720, the image will
+  be stretched to fit the source resolution. Works for any resolution
+  within the vimc limitations (even shrinking the image if necessary).
Exposes:
* 1 Pad sink
@@ -76,15 +96,15 @@ vimc-capture:
* 1 Pad sink
* 1 Pad source
-
Module options
--------------
Vimc has a module parameter to configure the driver.
-* ``sca_mult=<unsigned int>``
+* ``allocator=<unsigned int>``
+
+  memory allocator selection, default is 0. It specifies the way buffers
+  will be allocated (see the example below).
- Image size multiplier factor to be used to multiply both width and
- height, so the image size will be ``sca_mult^2`` bigger than the
- original one. Currently, only supports scaling up (the default value
- is 3).
+ - 0: vmalloc
+ - 1: dma-contig
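+
+For example, loading vimc with buffers backed by the DMA contiguous allocator:
+
+.. code-block:: bash
+
+   modprobe vimc allocator=1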
diff --git a/Documentation/admin-guide/media/visl.rst b/Documentation/admin-guide/media/visl.rst
new file mode 100644
index 000000000000..7d2dc78341c9
--- /dev/null
+++ b/Documentation/admin-guide/media/visl.rst
@@ -0,0 +1,175 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+The Virtual Stateless Decoder Driver (visl)
+===========================================
+
+A virtual stateless decoder device for stateless uAPI development
+purposes.
+
+This tool's objective is to help the development and testing of
+userspace applications that use the V4L2 stateless API to decode media.
+
+A userspace implementation can use visl to run a decoding loop even when
+no hardware is available or when the kernel uAPI for the codec has not
+been upstreamed yet. This can reveal bugs at an early stage.
+
+This driver can also trace the contents of the V4L2 controls submitted
+to it. It can also dump the contents of the vb2 buffers through a
+debugfs interface. This is in many ways similar to the tracing
+infrastructure available for other popular encode/decode APIs out there
+and can help develop a userspace application by using another (working)
+one as a reference.
+
+.. note::
+
+ No actual decoding of video frames is performed by visl. The
+ V4L2 test pattern generator is used to write various debug information
+ to the capture buffers instead.
+
+Module parameters
+-----------------
+
+- visl_debug: Activates debug info, printing various debug messages through
+ dprintk. Also controls whether per-frame debug info is shown. Defaults to off.
+ Note that enabling this feature can result in slow performance through serial.
+
+- visl_transtime_ms: Simulated process time in milliseconds. Slowing down the
+ decoding speed can be useful for debugging.
+
+- visl_dprintk_frame_start, visl_dprintk_nframes: Dictates a range of
+ frames where dprintk is activated. This only controls the dprintk tracing on a
+ per-frame basis. Note that printing a lot of data can be slow through serial.
+
+- keep_bitstream_buffers: Controls whether bitstream (i.e. OUTPUT) buffers are
+ kept after a decoding session. Defaults to false so as to reduce the amount of
+ clutter. keep_bitstream_buffers == false works well when live debugging the
+ client program with GDB.
+
+- bitstream_trace_frame_start, bitstream_trace_nframes: Similar to
+  visl_dprintk_frame_start, visl_dprintk_nframes, but controls the dumping of
+  buffer data through debugfs instead (see the example below).
+
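+As a quick illustration, these parameters can be combined when loading the
+module. This is only a sketch, and the values used here are arbitrary:
+
+.. code-block:: bash
+
+   # trace the first 32 frames and simulate a 33 ms decode time per frame
+   modprobe visl visl_debug=1 visl_transtime_ms=33 \
+           visl_dprintk_frame_start=0 visl_dprintk_nframes=32
+
+   # the parameters are also visible under sysfs:
+   cat /sys/module/visl/parameters/keep_bitstream_buffers
+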
+What is the default use case for this driver?
+---------------------------------------------
+
+This driver can be used as a way to compare different userspace implementations.
+This assumes that a working client is run against visl and that the ftrace and
+OUTPUT buffer data is subsequently used to debug a work-in-progress
+implementation.
+
+Information on reference frames, their timestamps, the status of the OUTPUT and
+CAPTURE queues and more can be read directly from the CAPTURE buffers.
+
+Supported codecs
+----------------
+
+The following codecs are supported:
+
+- FWHT
+- MPEG2
+- VP8
+- VP9
+- H.264
+- HEVC
+
+visl trace events
+-----------------
+The trace events are defined on a per-codec basis, e.g.:
+
+.. code-block:: bash
+
+ $ ls /sys/kernel/debug/tracing/events/ | grep visl
+ visl_fwht_controls
+ visl_h264_controls
+ visl_hevc_controls
+ visl_mpeg2_controls
+ visl_vp8_controls
+ visl_vp9_controls
+
+For example, in order to dump HEVC SPS data:
+
+.. code-block:: bash
+
+ $ echo 1 > /sys/kernel/debug/tracing/events/visl_hevc_controls/v4l2_ctrl_hevc_sps/enable
+
+The SPS data will be dumped to the trace buffer, i.e.:
+
+.. code-block:: bash
+
+ $ cat /sys/kernel/debug/tracing/trace
+ video_parameter_set_id 0
+ seq_parameter_set_id 0
+ pic_width_in_luma_samples 1920
+ pic_height_in_luma_samples 1080
+ bit_depth_luma_minus8 0
+ bit_depth_chroma_minus8 0
+ log2_max_pic_order_cnt_lsb_minus4 4
+ sps_max_dec_pic_buffering_minus1 6
+ sps_max_num_reorder_pics 2
+ sps_max_latency_increase_plus1 0
+ log2_min_luma_coding_block_size_minus3 0
+ log2_diff_max_min_luma_coding_block_size 3
+ log2_min_luma_transform_block_size_minus2 0
+ log2_diff_max_min_luma_transform_block_size 3
+ max_transform_hierarchy_depth_inter 2
+ max_transform_hierarchy_depth_intra 2
+ pcm_sample_bit_depth_luma_minus1 0
+ pcm_sample_bit_depth_chroma_minus1 0
+ log2_min_pcm_luma_coding_block_size_minus3 0
+ log2_diff_max_min_pcm_luma_coding_block_size 0
+ num_short_term_ref_pic_sets 0
+ num_long_term_ref_pics_sps 0
+ chroma_format_idc 1
+ sps_max_sub_layers_minus1 0
+ flags AMP_ENABLED|SAMPLE_ADAPTIVE_OFFSET|TEMPORAL_MVP_ENABLED|STRONG_INTRA_SMOOTHING_ENABLED
+
+
+Dumping OUTPUT buffer data through debugfs
+------------------------------------------
+
+If the **VISL_DEBUGFS** Kconfig is enabled, visl will populate
+**/sys/kernel/debug/visl/bitstream** with OUTPUT buffer data according to the
+values of bitstream_trace_frame_start and bitstream_trace_nframes. This can
+highlight errors as broken clients may fail to fill the buffers properly.
+
+A single file is created for each processed OUTPUT buffer. Its name contains an
+integer that denotes the buffer sequence, i.e.:
+
+.. code-block:: c
+
+ snprintf(name, 32, "bitstream%d", run->src->sequence);
+
+Dumping the values is simply a matter of reading from the file, i.e.:
+
+For the buffer with sequence == 0:
+
+.. code-block:: bash
+
+ $ xxd /sys/kernel/debug/visl/bitstream/bitstream0
+ 00000000: 2601 af04 d088 bc25 a173 0e41 a4f2 3274 &......%.s.A..2t
+ 00000010: c668 cb28 e775 b4ac f53a ba60 f8fd 3aa1 .h.(.u...:.`..:.
+ 00000020: 46b4 bcfc 506c e227 2372 e5f5 d7ea 579f F...Pl.'#r....W.
+ 00000030: 6371 5eb5 0eb8 23b5 ca6a 5de5 983a 19e4 cq^...#..j]..:..
+ 00000040: e8c3 4320 b4ba a226 cbc1 4138 3a12 32d6 ..C ...&..A8:.2.
+ 00000050: fef3 247b 3523 4e90 9682 ac8e eb0c a389 ..${5#N.........
+ 00000060: ddd0 6cfc 0187 0e20 7aae b15b 1812 3d33 ..l.... z..[..=3
+ 00000070: e1c5 f425 a83a 00b7 4f18 8127 3c4c aefb ...%.:..O..'<L..
+
+For the buffer with sequence == 1:
+
+.. code-block:: bash
+
+ $ xxd /sys/kernel/debug/visl/bitstream/bitstream1
+ 00000000: 0201 d021 49e1 0c40 aa11 1449 14a6 01dc ...!I..@...I....
+ 00000010: 7023 889a c8cd 2cd0 13b4 dab0 e8ca 21fe p#....,.......!.
+ 00000020: c4c8 ab4c 486e 4e2f b0df 96cc c74e 8dde ...LHnN/.....N..
+ 00000030: 8ce7 ee36 d880 4095 4d64 30a0 ff4f 0c5e ...6..@.Md0..O.^
+ 00000040: f16b a6a1 d806 ca2a 0ece a673 7bea 1f37 .k.....*...s{..7
+ 00000050: 370f 5bb9 1dc4 ba21 6434 bc53 0173 cba0 7.[....!d4.S.s..
+ 00000060: dfe6 bc99 01ea b6e0 346b 92b5 c8de 9f5d ........4k.....]
+ 00000070: e7cc 3484 1769 fef2 a693 a945 2c8b 31da ..4..i.....E,.1.
+
+And so on.
+
+By default, the files are removed during STREAMOFF. This is to reduce the amount
+of clutter.
diff --git a/Documentation/admin-guide/media/vivid.rst b/Documentation/admin-guide/media/vivid.rst
index 6d7175f96f74..58ac25b2c385 100644
--- a/Documentation/admin-guide/media/vivid.rst
+++ b/Documentation/admin-guide/media/vivid.rst
@@ -392,7 +392,7 @@ Which one is returned depends on the chosen channel, each next valid channel
will cycle through the possible audio subchannel combinations. This allows
you to test the various combinations by just switching channels..
-Finally, for these inputs the v4l2_timecode struct is filled in in the
+Finally, for these inputs the v4l2_timecode struct is filled in the
dequeued v4l2_buffer struct.
@@ -580,7 +580,7 @@ Metadata Capture
----------------
The Metadata capture generates UVC format metadata. The PTS and SCR are
-transmitted based on the values set in vivid contols.
+transmitted based on the values set in vivid controls.
The Metadata device will only work for the Webcam input, it will give
back an error for all other inputs.
@@ -714,6 +714,20 @@ The Test Pattern Controls are all specific to video capture.
does the same for the EAV (End of Active Video) code.
+- Insert Video Guard Band
+
+ adds 4 columns of pixels with the HDMI Video Guard Band code at the
+ left hand side of the image. This only works with 3 or 4 byte RGB pixel
+ formats. The RGB pixel value 0xab/0x55/0xab turns out to be equivalent
+ to the HDMI Video Guard Band code that precedes each active video line
+ (see section 5.2.2.1 in the HDMI 1.3 Specification). To test if a video
+ receiver has correct HDMI Video Guard Band processing, enable this
+ control and then move the image to the left hand side of the screen.
+ That will result in video lines that start with multiple pixels that
+ have the same value as the Video Guard Band that precedes them.
+  Receivers that just keep skipping Video Guard Band values will now
+  fail and either lose sync or these video lines will shift.
+
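+  A hypothetical test sequence could look like the following (assuming the
+  control is exposed as ``insert_video_guard_band``; check the output of
+  ``v4l2-ctl -L`` for the exact name on your system):
+
+  .. code-block:: bash
+
+     # list the controls to find the guard band control name
+     v4l2-ctl -d /dev/video0 -L | grep -i guard
+
+     # enable insertion of the video guard band columns
+     v4l2-ctl -d /dev/video0 -c insert_video_guard_band=1
+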
Capture Feature Selection Controls
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -1304,7 +1318,7 @@ instance. This setup would require the following commands:
$ v4l2-ctl -d2 -i2
$ v4l2-ctl -d2 -c horizontal_movement=4
$ v4l2-ctl -d1 --overlay=1
- $ v4l2-ctl -d1 -c loop_video=1
+ $ v4l2-ctl -d0 -c loop_video=1
$ v4l2-ctl -d2 --stream-mmap --overlay=1
And from another console:
diff --git a/Documentation/admin-guide/media/zr364xx.rst b/Documentation/admin-guide/media/zr364xx.rst
deleted file mode 100644
index 7291e54b8be3..000000000000
--- a/Documentation/admin-guide/media/zr364xx.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Zoran 364xx based USB webcam module
-===================================
-
-site: http://royale.zerezo.com/zr364xx/
-
-mail: royale@zerezo.com
-
-
-Introduction
-------------
-
-
-This brings support under Linux for the Aiptek PocketDV 3300 and similar
-devices in webcam mode. If you just want to get on your PC the pictures
-and movies on the camera, you should use the usb-storage module instead.
-
-The driver works with several other cameras in webcam mode (see the list
-below).
-
-Possible chipsets are : ZR36430 (ZR36430BGC) and
-maybe ZR36431, ZR36440, ZR36442...
-
-You can try the experience changing the vendor/product ID values (look
-at the source code).
-
-You can get these values by looking at /var/log/messages when you plug
-your camera, or by typing : cat /sys/kernel/debug/usb/devices.
-
-
-Install
--------
-
-In order to use this driver, you must compile it with your kernel,
-with the following config options::
-
- ./scripts/config -e USB
- ./scripts/config -m MEDIA_SUPPORT
- ./scripts/config -e MEDIA_USB_SUPPORT
- ./scripts/config -e MEDIA_CAMERA_SUPPORT
- ./scripts/config -m USB_ZR364XX
-
-Usage
------
-
-modprobe zr364xx debug=X mode=Y
-
-- debug : set to 1 to enable verbose debug messages
-- mode : 0 = 320x240, 1 = 160x120, 2 = 640x480
-
-You can then use the camera with V4L2 compatible applications, for
-example Ekiga.
-
-To capture a single image, try this: dd if=/dev/video0 of=test.jpg bs=1M
-count=1
-
-links
------
-
-http://mxhaard.free.fr/ (support for many others cams including some Aiptek PocketDV)
-http://www.harmwal.nl/pccam880/ (this project also supports cameras based on this chipset)
-
-Supported devices
------------------
-
-====== ======= ============== ====================
-Vendor Product Distributor Model
-====== ======= ============== ====================
-0x08ca 0x0109 Aiptek PocketDV 3300
-0x08ca 0x0109 Maxell Maxcam PRO DV3
-0x041e 0x4024 Creative PC-CAM 880
-0x0d64 0x0108 Aiptek Fidelity 3200
-0x0d64 0x0108 Praktica DCZ 1.3 S
-0x0d64 0x0108 Genius Digital Camera (?)
-0x0d64 0x0108 DXG Technology Fashion Cam
-0x0546 0x3187 Polaroid iON 230
-0x0d64 0x3108 Praktica Exakta DC 2200
-0x0d64 0x3108 Genius G-Shot D211
-0x0595 0x4343 Concord Eye-Q Duo 1300
-0x0595 0x4343 Concord Eye-Q Duo 2000
-0x0595 0x4343 Fujifilm EX-10
-0x0595 0x4343 Ricoh RDC-6000
-0x0595 0x4343 Digitrex DSC 1300
-0x0595 0x4343 Firstline FDC 2000
-0x0bb0 0x500d Concord EyeQ Go Wireless
-0x0feb 0x2004 CRS Electronic 3.3 Digital Camera
-0x0feb 0x2004 Packard Bell DSC-300
-0x055f 0xb500 Mustek MDC 3000
-0x08ca 0x2062 Aiptek PocketDV 5700
-0x052b 0x1a18 Chiphead Megapix V12
-0x04c8 0x0729 Konica Revio 2
-0x04f2 0xa208 Creative PC-CAM 850
-0x0784 0x0040 Traveler Slimline X5
-0x06d6 0x0034 Trust Powerc@m 750
-0x0a17 0x0062 Pentax Optio 50L
-0x06d6 0x003b Trust Powerc@m 970Z
-0x0a17 0x004e Pentax Optio 50
-0x041e 0x405d Creative DiVi CAM 516
-0x08ca 0x2102 Aiptek DV T300
-0x06d6 0x003d Trust Powerc@m 910Z
-====== ======= ============== ====================
diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst
index 4e06ffabd78a..7367e6294ef6 100644
--- a/Documentation/admin-guide/mm/cma_debugfs.rst
+++ b/Documentation/admin-guide/mm/cma_debugfs.rst
@@ -5,10 +5,10 @@ CMA Debugfs Interface
The CMA debugfs interface is useful to retrieve basic information out of the
different CMA areas and to test allocation/release in each of the areas.
-Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
-kernel's CMA index. So the first CMA zone would be:
+Each CMA area is represented by a directory under <debugfs>/cma/, named
+after the CMA area, like below:
- <debugfs>/cma/cma-0
+ <debugfs>/cma/<cma_name>
The structure of the files created under that directory is as follows:
@@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows:
- [RO] bitmap: The bitmap of page states in the zone.
- [WO] alloc: Allocate N pages from that CMA area. For example::
- echo 5 > <debugfs>/cma/cma-2/alloc
+ echo 5 > <debugfs>/cma/<cma_name>/alloc
-would try to allocate 5 pages from the cma-2 area.
+would try to allocate 5 pages from the 'cma_name' area.
- [WO] free: Free N pages from that CMA area, similar to the above.
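+
+  For example::
+
+    echo 5 > <debugfs>/cma/<cma_name>/free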
diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst
index b966fcff993b..e796b0a7e4a5 100644
--- a/Documentation/admin-guide/mm/concepts.rst
+++ b/Documentation/admin-guide/mm/concepts.rst
@@ -1,5 +1,3 @@
-.. _mm_concepts:
-
=================
Concepts overview
=================
@@ -86,16 +84,15 @@ memory with the huge pages. The first one is `HugeTLB filesystem`, or
hugetlbfs. It is a pseudo filesystem that uses RAM as its backing
store. For the files created in this filesystem the data resides in
the memory and mapped using huge pages. The hugetlbfs is described at
-:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`.
+Documentation/admin-guide/mm/hugetlbpage.rst.
Another, more recent, mechanism that enables use of the huge pages is
called `Transparent HugePages`, or THP. Unlike the hugetlbfs that
requires users and/or system administrators to configure what parts of
the system memory should and can be mapped by the huge pages, THP
manages such mappings transparently to the user and hence the
-name. See
-:ref:`Documentation/admin-guide/mm/transhuge.rst <admin_guide_transhuge>`
-for more details about THP.
+name. See Documentation/admin-guide/mm/transhuge.rst for more details
+about THP.
Zones
=====
@@ -125,8 +122,8 @@ processor. Each bank is referred to as a `node` and for each node Linux
constructs an independent memory management subsystem. A node has its
own set of zones, lists of free and used pages and various statistics
counters. You can find more details about NUMA in
-:ref:`Documentation/vm/numa.rst <numa>` and in
-:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
+Documentation/mm/numa.rst and in
+Documentation/admin-guide/mm/numa_memory_policy.rst.
Page cache
==========
diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst
index 8c5dde3a5754..33d37bb2fb4e 100644
--- a/Documentation/admin-guide/mm/damon/index.rst
+++ b/Documentation/admin-guide/mm/damon/index.rst
@@ -1,10 +1,10 @@
.. SPDX-License-Identifier: GPL-2.0
-========================
-Monitoring Data Accesses
-========================
+==========================
+DAMON: Data Access MONitor
+==========================
-:doc:`DAMON </vm/damon/index>` allows light-weight data access monitoring.
+:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
Using DAMON, users can analyze the memory access patterns of their systems and
optimize those.
@@ -13,3 +13,5 @@ optimize those.
start
usage
+ reclaim
+ lru_sort
diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst
new file mode 100644
index 000000000000..7b0775d281b4
--- /dev/null
+++ b/Documentation/admin-guide/mm/damon/lru_sort.rst
@@ -0,0 +1,294 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============================
+DAMON-based LRU-lists Sorting
+=============================
+
+DAMON-based LRU-lists Sorting (DAMON_LRU_SORT) is a static kernel module aimed
+at proactive and lightweight data access pattern based (de)prioritization of
+pages on their LRU-lists, to make LRU-lists a more trustworthy data access
+pattern source.
+
+Where Is Proactive LRU-lists Sorting Required?
+==============================================
+
+As the overhead of page-granularity access checking could be significant on
+huge systems, LRU lists are normally not proactively sorted, but only
+partially and reactively sorted on special events, including specific user
+requests, system calls, and memory pressure. As a result, LRU lists are
+sometimes not perfectly prepared to serve as a trustworthy access pattern
+source in some situations, including reclamation target page selection under
+sudden memory pressure.
+
+Because DAMON can identify access patterns with best-effort accuracy while
+inducing only a user-specified range of overhead, proactively running
+DAMON_LRU_SORT could help make LRU lists a more trustworthy access pattern
+source with low and controlled overhead.
+
+How Does It Work?
+=================
+
+DAMON_LRU_SORT finds hot pages (pages of memory regions that show access rates
+higher than a user-specified threshold) and cold pages (pages of memory
+regions that show no access for longer than a user-specified time threshold)
+using DAMON, and prioritizes hot pages while deprioritizing cold pages on
+their LRU-lists. To keep it from consuming too much CPU for the
+prioritizations, a CPU time usage limit can be configured. Under the limit,
+it prioritizes hotter pages and deprioritizes colder pages first. System
+administrators can also configure under what situation this scheme should be
+automatically activated and deactivated with three memory pressure watermarks.
+
+Its default parameters for hotness/coldness thresholds and the CPU quota limit
+are conservatively chosen. That is, the module under its default parameters
+could be widely used without harm in common situations, while providing a
+level of benefit for systems having clear hot/cold access patterns under
+memory pressure, all while consuming only a small portion of CPU time.
+
+Interface: Module Parameters
+============================
+
+To use this feature, you should first ensure your system is running on a kernel
+that is built with ``CONFIG_DAMON_LRU_SORT=y``.
+
+To let sysadmins enable or disable it and tune for the given system,
+DAMON_LRU_SORT utilizes module parameters. That is, you can put
+``damon_lru_sort.<parameter>=<value>`` on the kernel boot command line or write
+proper values to ``/sys/module/damon_lru_sort/parameters/<parameter>`` files.
+
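+For example, a parameter can be set at boot or at runtime as below (the value
+is only illustrative)::
+
+    # on the kernel boot command line:
+    damon_lru_sort.hot_thres_access_freq=500
+
+    # or at runtime, through sysfs:
+    echo 500 > /sys/module/damon_lru_sort/parameters/hot_thres_access_freq
+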
+Below is a description of each parameter.
+
+enabled
+-------
+
+Enable or disable DAMON_LRU_SORT.
+
+You can enable DAMON_LRU_SORT by setting the value of this parameter as ``Y``.
+Setting it as ``N`` disables DAMON_LRU_SORT. Note that DAMON_LRU_SORT could do
+no real monitoring and LRU-lists sorting due to the watermarks-based activation
+condition. Refer to the descriptions of the watermarks parameters below.
+
+commit_inputs
+-------------
+
+Make DAMON_LRU_SORT read the input parameters again, except ``enabled``.
+
+Input parameters that are updated while DAMON_LRU_SORT is running are not
+applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads
+the values of the parameters except ``enabled`` again. Once the re-reading is
+done, this parameter is set as ``N``. If invalid parameters are found during
+the re-reading, DAMON_LRU_SORT will be disabled.
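+
+For example, with an illustrative value::
+
+    # update a parameter of the running module, then commit it
+    echo 30000000 > /sys/module/damon_lru_sort/parameters/cold_min_age
+    echo Y > /sys/module/damon_lru_sort/parameters/commit_inputs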
+
+hot_thres_access_freq
+---------------------
+
+Access frequency threshold for hot memory regions identification in permil.
+
+If a memory region is accessed at this frequency or higher, DAMON_LRU_SORT
+identifies the region as hot and marks it as accessed on the LRU list, so
+that it will not be reclaimed under memory pressure. 50% by default.
+
+cold_min_age
+------------
+
+Time threshold for cold memory regions identification in microseconds.
+
+If a memory region is not accessed for this or a longer time, DAMON_LRU_SORT
+identifies the region as cold and marks it as unaccessed on the LRU list, so
+that it will be reclaimed first under memory pressure. 120 seconds by
+default.
+
+quota_ms
+--------
+
+Limit of time for trying the LRU lists sorting in milliseconds.
+
+DAMON_LRU_SORT uses up to this much time within a time window
+(quota_reset_interval_ms) for LRU-lists sorting. This can be used for
+limiting the CPU consumption of DAMON_LRU_SORT; for example, quota_ms=10
+with quota_reset_interval_ms=1000 caps the sorting work at roughly 1% of one
+CPU's time. If the value is zero, the limit is disabled.
+
+10 ms by default.
+
+quota_reset_interval_ms
+-----------------------
+
+The time quota charge reset interval in milliseconds.
+
+The charge reset interval for the quota of time (quota_ms). That is,
+DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
+milliseconds within quota_reset_interval_ms milliseconds.
+
+1 second by default.
+
+wmarks_interval
+---------------
+
+The watermarks check time interval in microseconds.
+
+Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
+enabled but inactive due to its watermarks rule. 5 seconds by default.
+
+wmarks_high
+-----------
+
+Free memory rate (per thousand) for the high watermark.
+
+If free memory of the system in bytes per thousand bytes is higher than this,
+DAMON_LRU_SORT becomes inactive, so it does nothing but periodically check
+the watermarks. 200 (20%) by default.
+
+wmarks_mid
+----------
+
+Free memory rate (per thousand) for the middle watermark.
+
+If free memory of the system in bytes per thousand bytes is between this and
+the low watermark, DAMON_LRU_SORT becomes active, so it starts the monitoring
+and the LRU-lists sorting. 150 (15%) by default.
+
+wmarks_low
+----------
+
+Free memory rate (per thousand) for the low watermark.
+
+If free memory of the system in bytes per thousand bytes is lower than this,
+DAMON_LRU_SORT becomes inactive, so it does nothing but periodically check
+the watermarks. 50 (5%) by default.
+
+sample_interval
+---------------
+
+Sampling interval for the monitoring in microseconds.
+
+The sampling interval of DAMON for the cold memory monitoring. Please refer to
+the DAMON documentation (:doc:`usage`) for more detail. 5ms by default.
+
+aggr_interval
+-------------
+
+Aggregation interval for the monitoring in microseconds.
+
+The aggregation interval of DAMON for the cold memory monitoring. Please
+refer to the DAMON documentation (:doc:`usage`) for more detail. 100ms by
+default.
+
+min_nr_regions
+--------------
+
+Minimum number of monitoring regions.
+
+The minimal number of monitoring regions of DAMON for the cold memory
+monitoring. This can be used to set a lower bound on the monitoring quality.
+But setting this too high could result in increased monitoring overhead.
+Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by
+default.
+
+max_nr_regions
+--------------
+
+Maximum number of monitoring regions.
+
+The maximum number of monitoring regions of DAMON for the cold memory
+monitoring. This can be used to set an upper bound on the monitoring overhead.
+However, setting this too low could result in bad monitoring quality. Please
+refer to the DAMON documentation (:doc:`usage`) for more detail. 1000 by
+default.
+
+monitor_region_start
+--------------------
+
+Start of target memory region in physical address.
+
+The start physical address of the memory region that DAMON_LRU_SORT will do
+work against. By default, the biggest System RAM region is used.
+
+monitor_region_end
+------------------
+
+End of target memory region in physical address.
+
+The end physical address of the memory region that DAMON_LRU_SORT will do
+work against. By default, the biggest System RAM region is used.
+
+kdamond_pid
+-----------
+
+PID of the DAMON thread.
+
+If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. Else,
+-1.
+
+nr_lru_sort_tried_hot_regions
+-----------------------------
+
+Number of hot memory regions that DAMON_LRU_SORT tried to LRU-sort.
+
+bytes_lru_sort_tried_hot_regions
+--------------------------------
+
+Total bytes of hot memory regions that DAMON_LRU_SORT tried to LRU-sort.
+
+nr_lru_sorted_hot_regions
+-------------------------
+
+Number of hot memory regions that were successfully LRU-sorted.
+
+bytes_lru_sorted_hot_regions
+----------------------------
+
+Total bytes of hot memory regions that were successfully LRU-sorted.
+
+nr_hot_quota_exceeds
+--------------------
+
+Number of times that the time quota limit for hot regions has been exceeded.
+
+nr_lru_sort_tried_cold_regions
+------------------------------
+
+Number of cold memory regions that DAMON_LRU_SORT tried to LRU-sort.
+
+bytes_lru_sort_tried_cold_regions
+---------------------------------
+
+Total bytes of cold memory regions that DAMON_LRU_SORT tried to LRU-sort.
+
+nr_lru_sorted_cold_regions
+--------------------------
+
+Number of cold memory regions that were successfully LRU-sorted.
+
+bytes_lru_sorted_cold_regions
+-----------------------------
+
+Total bytes of cold memory regions that were successfully LRU-sorted.
+
+nr_cold_quota_exceeds
+---------------------
+
+Number of times that the time quota limit for cold regions has been exceeded.
+
+Example
+=======
+
+Below runtime example commands make DAMON_LRU_SORT find memory regions having
+>=50% access frequency and LRU-prioritize them, while LRU-deprioritizing
+memory regions that are not accessed for 120 seconds. The prioritization and
+deprioritization are limited to using only up to 1% of the CPU time, to avoid
+DAMON_LRU_SORT consuming too much CPU time for the (de)prioritization. It
+also asks DAMON_LRU_SORT to do nothing if the system's free memory rate is
+more than 50%, but to start the real work if it becomes lower than 40%. If
+DAMON_LRU_SORT doesn't make progress and therefore the free memory rate
+becomes lower than 20%, it asks DAMON_LRU_SORT to do nothing again, so that
+we can fall back to the LRU-list based page granularity reclamation. ::
+
+ # cd /sys/module/damon_lru_sort/parameters
+ # echo 500 > hot_thres_access_freq
+ # echo 120000000 > cold_min_age
+ # echo 10 > quota_ms
+ # echo 1000 > quota_reset_interval_ms
+ # echo 500 > wmarks_high
+ # echo 400 > wmarks_mid
+ # echo 200 > wmarks_low
+ # echo Y > enabled
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
new file mode 100644
index 000000000000..343e25b252f4
--- /dev/null
+++ b/Documentation/admin-guide/mm/damon/reclaim.rst
@@ -0,0 +1,274 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+DAMON-based Reclamation
+=======================
+
+DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module aimed to be
+used for proactive and lightweight reclamation under light memory pressure.
+It doesn't aim to replace the LRU-list based page-granularity reclamation, but
+to be selectively used for different levels of memory pressure and requirements.
+
+Where Is Proactive Reclamation Required?
+========================================
+
+On general memory over-committed systems, proactively reclaiming cold pages
+helps save memory and reduce latency spikes incurred by the direct reclaim
+of processes or the CPU consumption of kswapd, while incurring only minimal
+performance degradation [1]_ [2]_ .
+
+Free Pages Reporting [3]_ based memory over-commit virtualization systems are
+a good example of such cases. In those systems, the guest VMs report their
+free memory to the host, and the host reallocates the reported memory to
+other guests. As a result, the memory of the systems is fully utilized.
+However, the guests might not be so memory-frugal, mainly because some kernel
+subsystems and user-space applications are designed to use as much memory as
+available. Then, guests report only a small amount of memory as free to the
+host, resulting in a drop of the memory utilization of the systems. Running
+the proactive reclamation in guests could mitigate this problem.
+
+How Does It Work?
+=================
+
+DAMON_RECLAIM finds memory regions that haven't been accessed for a specific
+time duration and pages them out. To avoid consuming too much CPU for the
+paging out operation, a speed limit can be configured. Under the speed limit,
+it pages out the memory regions that haven't been accessed for the longest
+time first. System administrators can also configure under what situation
+this scheme should be automatically activated and deactivated with three
+memory pressure watermarks.
+
+Interface: Module Parameters
+============================
+
+To use this feature, you should first ensure your system is running on a kernel
+that is built with ``CONFIG_DAMON_RECLAIM=y``.
+
+To let sysadmins enable or disable it and tune for the given system,
+DAMON_RECLAIM utilizes module parameters. That is, you can put
+``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
+proper values to ``/sys/module/damon_reclaim/parameters/<parameter>`` files.
+
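+For instance, one could check and tune the module at runtime as below (the
+quota value is only illustrative)::
+
+    # check whether DAMON_RECLAIM is currently enabled
+    cat /sys/module/damon_reclaim/parameters/enabled
+
+    # allow up to 10 ms of reclamation work per quota window
+    echo 10 > /sys/module/damon_reclaim/parameters/quota_ms
+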
+Below is a description of each parameter.
+
+enabled
+-------
+
+Enable or disable DAMON_RECLAIM.
+
+You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``.
+Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do
+no real monitoring and reclamation due to the watermarks-based activation
+condition. Refer to the descriptions of the watermarks parameters below.
+
+commit_inputs
+-------------
+
+Make DAMON_RECLAIM read the input parameters again, except ``enabled``.
+
+Input parameters that are updated while DAMON_RECLAIM is running are not
+applied by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads
+the values of the parameters except ``enabled`` again. Once the re-reading is
+done, this parameter is set as ``N``. If invalid parameters are found during
+the re-reading, DAMON_RECLAIM will be disabled.
+
+min_age
+-------
+
+Time threshold for cold memory regions identification in microseconds.
+
+If a memory region is not accessed for this or a longer time, DAMON_RECLAIM
+identifies the region as cold and reclaims it.
+
+120 seconds by default.
+
+quota_ms
+--------
+
+Limit of time for the reclamation in milliseconds.
+
+DAMON_RECLAIM tries to use only up to this time within a time window
+(quota_reset_interval_ms) for trying reclamation of cold pages. This can be
+used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, the
+limit is disabled.
+
+10 ms by default.
+
+quota_sz
+--------
+
+Limit of size of memory for the reclamation in bytes.
+
+DAMON_RECLAIM charges the amount of memory which it tried to reclaim within a
+time window (quota_reset_interval_ms) and ensures that no more than this
+limit is tried. This can be used for limiting consumption of CPU and IO. If
+this value is zero, the limit is disabled.
+
+128 MiB by default.
+
+quota_reset_interval_ms
+-----------------------
+
+The time/size quota charge reset interval in milliseconds.
+
+The charge reset interval for the quota of time (quota_ms) and size
+(quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than
+quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
+milliseconds.
+
+1 second by default.
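+
+For example, with the defaults above (``quota_ms`` of 10, ``quota_sz`` of 128
+MiB, and ``quota_reset_interval_ms`` of 1000), DAMON_RECLAIM spends at most
+10 milliseconds of time and pages out at most 128 MiB within every one second
+window, so its paging out CPU consumption is roughly capped at 1% of a single
+CPU's time.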
+
+wmarks_interval
+---------------
+
+Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
+enabled but inactive due to its watermarks rule.
+
+wmarks_high
+-----------
+
+Free memory rate (per thousand) for the high watermark.
+
+If the system's free memory in bytes per thousand bytes is higher than this,
+DAMON_RECLAIM becomes inactive, so it does nothing but periodically check the
+watermarks.
+
+wmarks_mid
+----------
+
+Free memory rate (per thousand) for the middle watermark.
+
+If the system's free memory in bytes per thousand bytes is between this and
+the low watermark, DAMON_RECLAIM becomes active, so it starts the monitoring
+and the reclaiming.
+
+wmarks_low
+----------
+
+Free memory rate (per thousand) for the low watermark.
+
+If the system's free memory in bytes per thousand bytes is lower than this,
+DAMON_RECLAIM becomes inactive, so it does nothing but periodically check the
+watermarks. In this case, the system falls back to the LRU-list based page
+granularity reclamation logic.
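+
+For reference, the below rough sketch approximates the free memory rate (in
+per-thousand) that these watermarks are compared against::
+
+ # awk '/MemTotal/ {t=$2} /MemFree/ {f=$2} END {print int(f * 1000 / t)}' /proc/meminfo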
+
+sample_interval
+---------------
+
+Sampling interval for the monitoring in microseconds.
+
+The sampling interval of DAMON for the cold memory monitoring. Please refer to
+the DAMON documentation (:doc:`usage`) for more detail.
+
+aggr_interval
+-------------
+
+Aggregation interval for the monitoring in microseconds.
+
+The aggregation interval of DAMON for the cold memory monitoring. Please
+refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+min_nr_regions
+--------------
+
+Minimum number of monitoring regions.
+
+The minimal number of monitoring regions of DAMON for the cold memory
+monitoring. This can be used to set a lower-bound of the monitoring quality.
+But, setting this too high could result in increased monitoring overhead.
+Please refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+max_nr_regions
+--------------
+
+Maximum number of monitoring regions.
+
+The maximum number of monitoring regions of DAMON for the cold memory
+monitoring. This can be used to set an upper-bound of the monitoring overhead.
+However, setting this too low could result in bad monitoring quality. Please
+refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+monitor_region_start
+--------------------
+
+Start of the target memory region, as a physical address.
+
+The start physical address of the memory region that DAMON_RECLAIM will work
+against. That is, DAMON_RECLAIM will find cold memory regions in this region
+and reclaim them. By default, the biggest System RAM region is used.
+
+monitor_region_end
+------------------
+
+End of the target memory region, as a physical address.
+
+The end physical address of the memory region that DAMON_RECLAIM will work
+against. That is, DAMON_RECLAIM will find cold memory regions in this region
+and reclaim them. By default, the biggest System RAM region is used.
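+
+For example, to restrict DAMON_RECLAIM to a specific physical address range
+(say, the memory of one NUMA node), you could find candidate ranges as below
+and write the boundaries to the two files; the addresses here are
+hypothetical::
+
+ # grep 'System RAM' /proc/iomem
+ 100000000-41fffffff : System RAM
+ # echo 0x100000000 > /sys/module/damon_reclaim/parameters/monitor_region_start
+ # echo 0x420000000 > /sys/module/damon_reclaim/parameters/monitor_region_end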
+
+skip_anon
+---------
+
+Skip anonymous pages reclamation.
+
+If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
+pages. By default, ``N``.
+
+
+kdamond_pid
+-----------
+
+PID of the DAMON thread.
+
+If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else,
+-1.
+
+nr_reclaim_tried_regions
+------------------------
+
+Number of memory regions that DAMON_RECLAIM has tried to reclaim.
+
+bytes_reclaim_tried_regions
+---------------------------
+
+Total bytes of memory regions that DAMON_RECLAIM has tried to reclaim.
+
+nr_reclaimed_regions
+--------------------
+
+Number of memory regions that DAMON_RECLAIM has successfully reclaimed.
+
+bytes_reclaimed_regions
+-----------------------
+
+Total bytes of memory regions that DAMON_RECLAIM has successfully reclaimed.
+
+nr_quota_exceeds
+----------------
+
+Number of times that the time/space quota limits have been exceeded.
+
+Example
+=======
+
+The below runtime example commands make DAMON_RECLAIM find memory regions
+that have not been accessed for 30 seconds or more and page them out. The
+reclamation is limited to be done only up to 1 GiB per second to avoid
+DAMON_RECLAIM consuming too much CPU time for the paging out operation. It
+also asks DAMON_RECLAIM to do nothing if the system's free memory rate is
+more than 50%, but to start the real work if it becomes lower than 40%. If
+DAMON_RECLAIM doesn't make progress and therefore the free memory rate
+becomes lower than 20%, it asks DAMON_RECLAIM to do nothing again, so that we
+can fall back to the LRU-list based page granularity reclamation. ::
+
+ # cd /sys/module/damon_reclaim/parameters
+ # echo 30000000 > min_age
+ # echo $((1 * 1024 * 1024 * 1024)) > quota_sz
+ # echo 1000 > quota_reset_interval_ms
+ # echo 500 > wmarks_high
+ # echo 400 > wmarks_mid
+ # echo 200 > wmarks_low
+ # echo Y > enabled
+
+.. [1] https://research.google/pubs/pub48551/
+.. [2] https://lwn.net/Articles/787611/
+.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html
diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst
index d5eb89a8fc38..9f88afc734da 100644
--- a/Documentation/admin-guide/mm/damon/start.rst
+++ b/Documentation/admin-guide/mm/damon/start.rst
@@ -6,39 +6,9 @@ Getting Started
This document briefly describes how you can use DAMON by demonstrating its
default user space tool. Please note that this document describes only a part
-of its features for brevity. Please refer to :doc:`usage` for more details.
-
-
-TL; DR
-======
-
-Follow the commands below to monitor and visualize the memory access pattern of
-your workload. ::
-
- # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot
- # mount -t debugfs none /sys/kernel/debug/
- # git clone https://github.com/awslabs/damo
- # ./damo/damo record $(pidof <your workload>)
- # ./damo/damo report heat --plot_ascii
-
-The final command draws the access heatmap of ``<your workload>``. The heatmap
-shows which memory region (x-axis) is accessed when (y-axis) and how frequently
-(number; the higher the more accesses have been observed). ::
-
- 111111111111111111111111111111111111111111111111111111110000
- 111121111111111111111111111111211111111111111111111111110000
- 000000000000000000000000000000000000000000000000001555552000
- 000000000000000000000000000000000000000000000222223555552000
- 000000000000000000000000000000000000000011111677775000000000
- 000000000000000000000000000000000000000488888000000000000000
- 000000000000000000000000000000000177888400000000000000000000
- 000000000000000000000000000046666522222100000000000000000000
- 000000000000000000000014444344444300000000000000000000000000
- 000000000000000002222245555510000000000000000000000000000000
- # access_frequency: 0 1 2 3 4 5 6 7 8 9
- # x-axis: space (140286319947776-140286426374096: 101.496 MiB)
- # y-axis: time (605442256436361-605479951866441: 37.695430s)
- # resolution: 60x10 (1.692 MiB and 3.770s for each character)
+of its features for brevity. Please refer to the usage `doc
+<https://github.com/awslabs/damo/blob/next/USAGE.md>`_ of the tool for more
+details.
Prerequisites
@@ -59,16 +29,9 @@ called DAMON Operator (DAMO). It is available at
https://github.com/awslabs/damo. The examples below assume that ``damo`` is on
your ``$PATH``. It's not mandatory, though.
-Because DAMO is using the debugfs interface (refer to :doc:`usage` for the
-detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as
-below::
-
- # mount -t debugfs none /sys/kernel/debug/
-
-or append the following line to your ``/etc/fstab`` file so that your system
-can automatically mount debugfs upon booting::
-
- debugfs /sys/kernel/debug debugfs defaults 0 0
+Because DAMO is using the sysfs interface (refer to :doc:`usage` for the
+detail) of DAMON, you should ensure :doc:`sysfs </filesystems/sysfs>` is
+mounted.
Recording Data Access Patterns
@@ -91,24 +54,74 @@ pattern in the ``damon.data`` file.
Visualizing Recorded Patterns
=============================
-The following three commands visualize the recorded access patterns and save
-the results as separate image files. ::
-
- $ damo report heats --heatmap access_pattern_heatmap.png
- $ damo report wss --range 0 101 1 --plot wss_dist.png
- $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png
-
-- ``access_pattern_heatmap.png`` will visualize the data access pattern in a
- heatmap, showing which memory region (y-axis) got accessed when (x-axis)
- and how frequently (color).
-- ``wss_dist.png`` will show the distribution of the working set size.
-- ``wss_chron_change.png`` will show how the working set size has
- chronologically changed.
-
-You can view the visualizations of this example workload at [1]_.
-Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_.
-
-.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns
-.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html
-.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html
-.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html
+You can visualize the pattern in a heatmap, showing which memory region
+(x-axis) got accessed when (y-axis) and how frequently (number)::
+
+ $ sudo damo report heats --heatmap stdout
+ 22222222222222222222222222222222222222211111111111111111111111111111111111111100
+ 44444444444444444444444444444444444444434444444444444444444444444444444444443200
+ 44444444444444444444444444444444444444433444444444444444444444444444444444444200
+ 33333333333333333333333333333333333333344555555555555555555555555555555555555200
+ 33333333333333333333333333333333333344444444444444444444444444444444444444444200
+ 22222222222222222222222222222222222223355555555555555555555555555555555555555200
+ 00000000000000000000000000000000000000288888888888888888888888888888888888888400
+ 00000000000000000000000000000000000000288888888888888888888888888888888888888400
+ 33333333333333333333333333333333333333355555555555555555555555555555555555555200
+ 88888888888888888888888888888888888888600000000000000000000000000000000000000000
+ 88888888888888888888888888888888888888600000000000000000000000000000000000000000
+ 33333333333333333333333333333333333333444444444444444444444444444444444444443200
+ 00000000000000000000000000000000000000288888888888888888888888888888888888888400
+ [...]
+ # access_frequency: 0 1 2 3 4 5 6 7 8 9
+ # x-axis: space (139728247021568-139728453431248: 196.848 MiB)
+ # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s)
+ # resolution: 80x40 (2.461 MiB and 1.758 s for each character)
+
+You can also visualize the distribution of the working set size, sorted by
+the size::
+
+ $ sudo damo report wss --range 0 101 10
+ # <percentile> <wss>
+ # target_id 18446632103789443072
+ # avr: 107.708 MiB
+ 0 0 B | |
+ 10 95.328 MiB |**************************** |
+ 20 95.332 MiB |**************************** |
+ 30 95.340 MiB |**************************** |
+ 40 95.387 MiB |**************************** |
+ 50 95.387 MiB |**************************** |
+ 60 95.398 MiB |**************************** |
+ 70 95.398 MiB |**************************** |
+ 80 95.504 MiB |**************************** |
+ 90 190.703 MiB |********************************************************* |
+ 100 196.875 MiB |***********************************************************|
+
+Using the ``--sortby`` option with the above command, you can show how the
+working set size has chronologically changed::
+
+ $ sudo damo report wss --range 0 101 10 --sortby time
+ # <percentile> <wss>
+ # target_id 18446632103789443072
+ # avr: 107.708 MiB
+ 0 3.051 MiB | |
+ 10 190.703 MiB |***********************************************************|
+ 20 95.336 MiB |***************************** |
+ 30 95.328 MiB |***************************** |
+ 40 95.387 MiB |***************************** |
+ 50 95.332 MiB |***************************** |
+ 60 95.320 MiB |***************************** |
+ 70 95.398 MiB |***************************** |
+ 80 95.398 MiB |***************************** |
+ 90 95.340 MiB |***************************** |
+ 100 95.398 MiB |***************************** |
+
+
+Data Access Pattern Aware Memory Management
+===========================================
+
+The below three commands make every memory region of size >=4K that hasn't
+been accessed for >=60 seconds in your workload be swapped out. ::
+
+ $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme
+ $ echo "4K max 0 0 60s max pageout" >> test_scheme
+ $ damo schemes -c test_scheme <pid of your workload>
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index a72cda374aba..9b823fec974d 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -4,47 +4,515 @@
Detailed Usages
===============
-DAMON provides below three interfaces for different users.
+DAMON provides the below interfaces for different users.
- *DAMON user space tool.*
- This is for privileged people such as system administrators who want a
- just-working human-friendly interface. Using this, users can use the DAMON’s
- major features in a human-friendly way. It may not be highly tuned for
- special cases, though. It supports only virtual address spaces monitoring.
-- *debugfs interface.*
- This is for privileged user space programmers who want more optimized use of
- DAMON. Using this, users can use DAMON’s major features by reading
- from and writing to special debugfs files. Therefore, you can write and use
- your personalized DAMON debugfs wrapper programs that reads/writes the
- debugfs files instead of you. The DAMON user space tool is also a reference
- implementation of such programs. It supports only virtual address spaces
- monitoring.
+ `This <https://github.com/awslabs/damo>`_ is for privileged people such as
+ system administrators who want a just-working human-friendly interface.
+ Using this, users can use DAMON’s major features in a human-friendly way.
+ It may not be highly tuned for special cases, though. It supports both
+ virtual and physical address spaces monitoring. For more detail, please
+ refer to its `usage document
+ <https://github.com/awslabs/damo/blob/next/USAGE.md>`_.
+- *sysfs interface.*
+ :ref:`This <sysfs_interface>` is for privileged user space programmers who
+ want more optimized use of DAMON. Using this, users can use DAMON’s major
+ features by reading from and writing to special sysfs files. Therefore,
+ you can write and use your personalized DAMON sysfs wrapper programs that
+ read/write the sysfs files instead of you. The `DAMON user space tool
+ <https://github.com/awslabs/damo>`_ is one example of such programs. It
+ supports both virtual and physical address spaces monitoring. Note that this
+ interface provides only simple :ref:`statistics <damos_stats>` for the
+ monitoring results. For detailed monitoring results, DAMON provides a
+ :ref:`tracepoint <tracepoint>`.
+- *debugfs interface. (DEPRECATED!)*
+ :ref:`This <debugfs_interface>` is almost identical to :ref:`sysfs interface
+ <sysfs_interface>`. This is deprecated, so users should move to the
+ :ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
+ move, please report your use case to damon@lists.linux.dev and
+ linux-mm@kvack.org.
- *Kernel Space Programming Interface.*
- This is for kernel space programmers. Using this, users can utilize every
- feature of DAMON most flexibly and efficiently by writing kernel space
- DAMON application programs for you. You can even extend DAMON for various
- address spaces.
+ :doc:`This </mm/damon/api>` is for kernel space programmers. Using this,
+ users can utilize every feature of DAMON most flexibly and efficiently by
+ writing kernel space DAMON application programs for themselves. You can even extend
+ DAMON for various address spaces. For detail, please refer to the interface
+ :doc:`document </mm/damon/api>`.
-Nevertheless, you could write your own user space tool using the debugfs
-interface. A reference implementation is available at
-https://github.com/awslabs/damo. If you are a kernel programmer, you could
-refer to :doc:`/vm/damon/api` for the kernel space programming interface. For
-the reason, this document describes only the debugfs interface
+.. _sysfs_interface:
-debugfs Interface
-=================
+sysfs Interface
+===============
+
+DAMON sysfs interface is built when ``CONFIG_DAMON_SYSFS`` is defined. It
+creates multiple directories and files under its sysfs directory,
+``<sysfs>/kernel/mm/damon/``. You can control DAMON by writing to and reading
+from the files under the directory.
+
+For a short example, users can monitor the virtual address space of a given
+workload as below. ::
+
+ # cd /sys/kernel/mm/damon/admin/
+ # echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts
+ # echo vaddr > kdamonds/0/contexts/0/operations
+ # echo 1 > kdamonds/0/contexts/0/targets/nr_targets
+ # echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid_target
+ # echo on > kdamonds/0/state
+
+Files Hierarchy
+---------------
+
+The files hierarchy of the DAMON sysfs interface is shown below. In the
+below figure, parent-child relations are represented with indentations, each
+directory has a ``/`` suffix, and files in each directory are separated by
+commas (","). ::
+
+ /sys/kernel/mm/damon/admin
+ │ kdamonds/nr_kdamonds
+ │ │ 0/state,pid
+ │ │ │ contexts/nr_contexts
+ │ │ │ │ 0/avail_operations,operations
+ │ │ │ │ │ monitoring_attrs/
+ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
+ │ │ │ │ │ │ nr_regions/min,max
+ │ │ │ │ │ targets/nr_targets
+ │ │ │ │ │ │ 0/pid_target
+ │ │ │ │ │ │ │ regions/nr_regions
+ │ │ │ │ │ │ │ │ 0/start,end
+ │ │ │ │ │ │ │ │ ...
+ │ │ │ │ │ │ ...
+ │ │ │ │ │ schemes/nr_schemes
+ │ │ │ │ │ │ 0/action
+ │ │ │ │ │ │ │ access_pattern/
+ │ │ │ │ │ │ │ │ sz/min,max
+ │ │ │ │ │ │ │ │ nr_accesses/min,max
+ │ │ │ │ │ │ │ │ age/min,max
+ │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms
+ │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
+ │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
+ │ │ │ │ │ │ │ filters/nr_filters
+ │ │ │ │ │ │ │ │ 0/type,matching,memcg_path
+ │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
+ │ │ │ │ │ │ │ tried_regions/
+ │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age
+ │ │ │ │ │ │ │ │ ...
+ │ │ │ │ │ │ ...
+ │ │ │ │ ...
+ │ │ ...
+
+Root
+----
+
+The root of the DAMON sysfs interface is ``<sysfs>/kernel/mm/damon/``, and it
+has one directory named ``admin``. The directory contains the files for
+privileged user space programs' control of DAMON. User space tools or daemons
+having the root permission could use this directory.
+
+kdamonds/
+---------
+
+The monitoring-related information including request specifications and
+results is called a DAMON context. DAMON executes each context with a kernel
+thread called kdamond, and multiple kdamonds could run in parallel.
+
+Under the ``admin`` directory, one directory, ``kdamonds``, which has files
+for controlling the kdamonds, exists. In the beginning, this directory has
+only one file, ``nr_kdamonds``. Writing a number (``N``) to the file creates
+that number of child directories named ``0`` to ``N-1``. Each directory
+represents a kdamond.
+
+kdamonds/<N>/
+-------------
+
+In each kdamond directory, two files (``state`` and ``pid``) and one directory
+(``contexts``) exist.
+
+Reading ``state`` returns ``on`` if the kdamond is currently running, or
+``off`` if it is not running. Writing ``on`` or ``off`` makes the kdamond be
+in the state. Writing ``commit`` to the ``state`` file makes the kdamond
+read the user inputs in the sysfs files except the ``state`` file again.
+Writing ``update_schemes_stats`` to the ``state`` file updates the contents
+of stats files for each DAMON-based operation scheme of the kdamond. For
+details of the stats, please refer to the :ref:`stats section
+<sysfs_schemes_stats>`. Writing ``update_schemes_tried_regions`` to the
+``state`` file updates the DAMON-based operation scheme action tried regions
+directory for each DAMON-based operation scheme of the kdamond. Writing
+``clear_schemes_tried_regions`` to the ``state`` file clears the DAMON-based
+operation scheme action tried regions directory for each DAMON-based
+operation scheme of the kdamond. For details of the DAMON-based operation
+scheme action tried regions directory, please refer to the
+:ref:`tried_regions section <sysfs_schemes_tried_regions>`.
+
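+For example, the below (run from the ``admin`` directory, and assuming a
+kdamond directory ``0`` already exists) turns the kdamond on and later asks
+it to re-read updated inputs::
+
+ # echo on > kdamonds/0/state
+ # echo commit > kdamonds/0/state
+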
+If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
+
+``contexts`` directory contains files for controlling the monitoring contexts
+that this kdamond will execute.
+
+kdamonds/<N>/contexts/
+----------------------
+
+In the beginning, this directory has only one file, ``nr_contexts``. Writing
+a number (``N``) to the file creates that number of child directories named
+``0`` to ``N-1``. Each directory represents a monitoring context. At the
+moment, only one context per kdamond is supported, so only ``0`` or ``1`` can
+be written to the file.
+
+.. _sysfs_contexts:
+
+contexts/<N>/
+-------------
+
+In each context directory, two files (``avail_operations`` and ``operations``)
+and three directories (``monitoring_attrs``, ``targets``, and ``schemes``)
+exist.
+
+DAMON supports multiple types of monitoring operations, including those for
+the virtual address space and the physical address space. You can get the
+list of available monitoring operations sets on the currently running kernel
+by reading the ``avail_operations`` file. Based on the kernel configuration,
+the file will list some or all of the below keywords.
+
+ - vaddr: Monitor virtual address spaces of specific processes
+ - fvaddr: Monitor fixed virtual address ranges
+ - paddr: Monitor the physical address space of the system
+
+Please refer to :ref:`regions sysfs directory <sysfs_regions>` for detailed
+differences between the operations sets in terms of the monitoring target
+regions.
+
+You can set what type of monitoring operations DAMON will use for the context
+by writing one of the keywords listed in the ``avail_operations`` file to the
+``operations`` file, and get the current setting by reading from the
+``operations`` file.
+
+.. _sysfs_monitoring_attrs:
+
+contexts/<N>/monitoring_attrs/
+------------------------------
+
+Files for specifying attributes of the monitoring, including the required
+quality and efficiency of the monitoring, are in the ``monitoring_attrs``
+directory. Specifically, two directories, ``intervals`` and ``nr_regions``,
+exist in this directory.
+
+Under ``intervals`` directory, three files for DAMON's sampling interval
+(``sample_us``), aggregation interval (``aggr_us``), and update interval
+(``update_us``) exist. You can set and get the values in microseconds by
+writing to and reading from the files.
+
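+For example, the below (run from a context directory) sets a 5 ms sampling
+interval and a 100 ms aggregation interval; the values are hypothetical, for
+illustration only::
+
+ # echo 5000 > monitoring_attrs/intervals/sample_us
+ # echo 100000 > monitoring_attrs/intervals/aggr_us
+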
+Under the ``nr_regions`` directory, two files for the lower-bound and
+upper-bound of DAMON's monitoring regions (``min`` and ``max``,
+respectively), which control the monitoring overhead, exist. You can set and
+get the values by writing to and reading from the files.
+
+For more details about the intervals and monitoring regions range, please refer
+to the Design document (:doc:`/mm/damon/design`).
+
+contexts/<N>/targets/
+---------------------
+
+In the beginning, this directory has only one file, ``nr_targets``. Writing a
+number (``N``) to the file creates that number of child directories named
+``0`` to ``N-1``. Each directory represents a monitoring target.
+
+targets/<N>/
+------------
+
+In each target directory, one file (``pid_target``) and one directory
+(``regions``) exist.
+
+If you wrote ``vaddr`` to the ``contexts/<N>/operations`` file, each target
+should be a process. You can specify the process to DAMON by writing the pid
+of the process to the ``pid_target`` file.
+
+.. _sysfs_regions:
+
+targets/<N>/regions
+-------------------
+
+When the ``vaddr`` monitoring operations set is being used (``vaddr`` is
+written to the ``contexts/<N>/operations`` file), DAMON automatically sets
+and updates the monitoring target regions so that entire memory mappings of
+the target processes can be covered. However, users may want to set the
+initial monitoring region to specific address ranges.
+
+In contrast, DAMON does not automatically set and update the monitoring
+target regions when the ``fvaddr`` or ``paddr`` monitoring operations sets
+are being used (``fvaddr`` or ``paddr`` have been written to the
+``contexts/<N>/operations`` file). Therefore, users should set the monitoring
+target regions by themselves in these cases.
+
+For such cases, users can explicitly set the initial monitoring target regions
+as they want, by writing proper values to the files under this directory.
+
+In the beginning, this directory has only one file, ``nr_regions``. Writing a
+number (``N``) to the file creates that number of child directories named
+``0`` to ``N-1``. Each directory represents an initial monitoring target
+region.
+
+regions/<N>/
+------------
+
+In each region directory, you will find two files (``start`` and ``end``). You
+can set and get the start and end addresses of the initial monitoring target
+region by writing to and reading from the files, respectively.
+
+Each region should not overlap with others. ``end`` of directory ``N`` should
+be equal to or smaller than ``start`` of directory ``N+1``.
+
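+For example, the below sketch (run from the ``admin`` directory, with
+hypothetical addresses) sets a single initial monitoring target region
+covering 1 GiB of the address space::
+
+ # cd kdamonds/0/contexts/0/targets/0/regions
+ # echo 1 > nr_regions
+ # echo $((1024*1024*1024)) > 0/start
+ # echo $((2*1024*1024*1024)) > 0/end
+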
+contexts/<N>/schemes/
+---------------------
+
+For usual DAMON-based data access aware memory management optimizations, users
+would normally want the system to apply a memory management action to a memory
+region of a specific access pattern. DAMON receives such formalized operation
+schemes from the user and applies those to the target memory regions. Users
+can get and set the schemes by reading from and writing to files under this
+directory.
+
+In the beginning, this directory has only one file, ``nr_schemes``. Writing a
+number (``N``) to the file creates that number of child directories named
+``0`` to ``N-1``. Each directory represents a DAMON-based operation scheme.
+
+schemes/<N>/
+------------
+
+In each scheme directory, six directories (``access_pattern``, ``quotas``,
+``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file
+(``action``) exist.
+
+The ``action`` file is for setting and getting what action you want to apply
+to memory regions having the specific access pattern of interest. The
+keywords that can be written to and read from the file and their meaning are
+as below.
+
+Note that support of each action depends on the running DAMON operations set
+:ref:`implementation <sysfs_contexts>`.
+
+ - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
+ Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set.
+ - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``.
+ Supported by ``vaddr`` and ``fvaddr`` operations set.
+ - ``lru_prio``: Prioritize the region on its LRU lists.
+ Supported by ``paddr`` operations set.
+ - ``lru_deprio``: Deprioritize the region on its LRU lists.
+ Supported by ``paddr`` operations set.
+ - ``stat``: Do nothing but count the statistics.
+ Supported by all operations sets.
+
+schemes/<N>/access_pattern/
+---------------------------
+
+The target access pattern of each DAMON-based operation scheme is constructed
+with three ranges including the size of the region in bytes, number of
+monitored accesses per aggregate interval, and number of aggregated intervals
+for the age of the region.
+
+Under the ``access_pattern`` directory, three directories (``sz``,
+``nr_accesses``, and ``age``) each having two files (``min`` and ``max``)
+exist. You can set and get the access pattern for the given scheme by writing
+to and reading from the ``min`` and ``max`` files under ``sz``,
+``nr_accesses``, and ``age`` directories, respectively.
+
+schemes/<N>/quotas/
+-------------------
+
+The optimal ``target access pattern`` for each ``action`` is workload
+dependent, so it is not easy to find. Worse yet, setting a scheme of some
+action too aggressively can cause severe overhead. To avoid such overhead,
+users can limit the time and
+size quota for each scheme. In detail, users can ask DAMON to try to use only
+up to specific time (``time quota``) for applying the action, and to apply the
+action to only up to specific amount (``size quota``) of memory regions having
+the target access pattern within a given time interval (``reset interval``).
+
+When the quota limit is expected to be exceeded, DAMON prioritizes found memory
+regions of the ``target access pattern`` based on their size, access frequency,
+and age. For personalized prioritization, users can set the weights for the
+three properties.
+
+Under the ``quotas`` directory, three files (``ms``, ``bytes``,
+``reset_interval_ms``) and one directory (``weights``), which has three files
+(``sz_permil``, ``nr_accesses_permil``, and ``age_permil``) in it, exist.
+
+You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and
+``reset interval`` in milliseconds by writing the values to the three files,
+respectively. You can also set the prioritization weights for size, access
+frequency, and age in per-thousand unit by writing the values to the three
+files under the ``weights`` directory.
+
+schemes/<N>/watermarks/
+-----------------------
+
+To allow easy activation and deactivation of each scheme based on system
+status, DAMON provides a feature called watermarks. The feature receives five
+values called ``metric``, ``interval``, ``high``, ``mid``, and ``low``. The
+``metric`` is the system metric such as free memory ratio that can be measured.
+If the metric value of the system is higher than the value in ``high`` or lower
+than ``low`` at the moment, the scheme is deactivated. If the value is lower
+than ``mid``, the scheme is activated.
+
+Under the watermarks directory, five files (``metric``, ``interval_us``,
+``high``, ``mid``, and ``low``) for setting each value exist. You can set
+and get the five values by writing to and reading from the files,
+respectively.
+
+Keywords and meanings of those that can be written to the ``metric`` file are
+as below.
+
+ - none: Ignore the watermarks
+ - free_mem_rate: System's free memory rate (per thousand)
+
+The ``interval`` should be written in microseconds.
+
+schemes/<N>/filters/
+--------------------
+
+Users could know something more than the kernel about specific types of
+memory. In that case, users could do their own management of the memory and
+hence don't want DAMOS to bother it. Users could limit DAMOS by setting the
+access pattern of the scheme and/or the monitoring regions for the purpose,
+but that can be inefficient in some cases. In such cases, users could set
+non-access pattern driven filters using files in this directory.
+
+In the beginning, this directory has only one file, ``nr_filters``. Writing a
+number (``N``) to the file creates that number of child directories named
+``0`` to ``N-1``. Each directory represents a filter. The filters are
+evaluated in the numeric order.
+
+Each filter directory contains three files, namely ``type``, ``matching``,
+and ``memcg_path``. You can write one of two special keywords to the
+``type`` file: ``anon`` for anonymous pages, or ``memcg`` for specific memory
+cgroup filtering. In case of the memory cgroup filtering, you can specify
+the memory cgroup of interest by writing the path of the memory cgroup from
+the cgroups mount point to the ``memcg_path`` file. You can write ``Y`` or
+``N`` to the ``matching`` file to filter out pages that do or do not match
+the type, respectively. Then, the scheme's action will not be applied to the
+pages that are specified to be filtered out.
+
+For example, the below restricts a DAMOS action to be applied only to
+non-anonymous pages of all memory cgroups except ``/having_care_already``::
+
+ # echo 2 > nr_filters
+ # # filter out anonymous pages
+ # echo anon > 0/type
+ # echo Y > 0/matching
+ # # further filter out all cgroups except one at '/having_care_already'
+ # echo memcg > 1/type
+ # echo /having_care_already > 1/memcg_path
+ # echo N > 1/matching
+
+Note that filters are currently supported only when the ``paddr``
+:ref:`implementation <sysfs_contexts>` is being used.
+
+.. _sysfs_schemes_stats:
+
+schemes/<N>/stats/
+------------------
+
+DAMON counts the total number and bytes of regions that each scheme has tried
+to be applied to, the two numbers for the regions that each scheme has
+successfully been applied to, and the total number of quota limit exceeds.
+These statistics can be used for online analysis or tuning of the schemes.
+
+The statistics can be retrieved by reading the files under the ``stats``
+directory (``nr_tried``, ``sz_tried``, ``nr_applied``, ``sz_applied``, and
+``qt_exceeds``), respectively. The files are not updated in real time, so
+you should ask the DAMON sysfs interface to update the content of the files
+for the stats by writing a special keyword, ``update_schemes_stats``, to the
+relevant ``kdamonds/<N>/state`` file.
+
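+For example, the below (run from the ``admin`` directory, and assuming the
+scheme directory ``0`` exists) updates and reads the number of regions the
+first scheme has tried to be applied to::
+
+ # echo update_schemes_stats > kdamonds/0/state
+ # cat kdamonds/0/contexts/0/schemes/0/stats/nr_tried
+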
+.. _sysfs_schemes_tried_regions:
+
+schemes/<N>/tried_regions/
+--------------------------
-DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under
-its debugfs directory, ``<debugfs>/damon/``.
+When a special keyword, ``update_schemes_tried_regions``, is written to the
+relevant ``kdamonds/<N>/state`` file, DAMON creates directories named with
+integers starting from ``0`` under this directory. Each directory contains
+files exposing detailed information about each of the memory regions that the
+corresponding scheme's ``action`` has tried to be applied to during the next
+:ref:`aggregation interval <sysfs_monitoring_attrs>`. The information
+includes the address range, ``nr_accesses``, and ``age`` of the region.
+
+The directories will be removed when another special keyword,
+``clear_schemes_tried_regions``, is written to the relevant
+``kdamonds/<N>/state`` file.
+
+tried_regions/<N>/
+------------------
+
+In each region directory, you will find four files (``start``, ``end``,
+``nr_accesses``, and ``age``). Reading the files will show the start and end
+addresses, ``nr_accesses``, and ``age`` of the region that the corresponding
+DAMON-based operation scheme ``action`` has tried to be applied to.
+
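+For example, the below (run from the ``admin`` directory, and assuming the
+directories already exist) prints the address range of the first tried region
+of the first scheme::
+
+ # echo update_schemes_tried_regions > kdamonds/0/state
+ # cat kdamonds/0/contexts/0/schemes/0/tried_regions/0/start
+ # cat kdamonds/0/contexts/0/schemes/0/tried_regions/0/end
+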
+Example
+~~~~~~~
+
+The below commands apply a scheme saying "If a memory region of size in [4KiB,
+8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
+interval in [10, 20], page out the region. For the paging out, use only up to
+10ms per second, and also don't page out more than 1GiB per second. Under the
+limitation, page out memory regions having longer age first. Also, check the
+free memory rate of the system every 5 seconds, start the monitoring and paging
+out when the free memory rate becomes lower than 50%, but stop it if the free
+memory rate becomes larger than 60%, or lower than 30%". ::
+
+ # cd <sysfs>/kernel/mm/damon/admin
+ # # populate directories
+ # echo 1 > kdamonds/nr_kdamonds; echo 1 > kdamonds/0/contexts/nr_contexts;
+ # echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
+ # cd kdamonds/0/contexts/0/schemes/0
+ # # set the basic access pattern and the action
+ # echo 4096 > access_pattern/sz/min
+ # echo 8192 > access_pattern/sz/max
+ # echo 0 > access_pattern/nr_accesses/min
+ # echo 5 > access_pattern/nr_accesses/max
+ # echo 10 > access_pattern/age/min
+ # echo 20 > access_pattern/age/max
+ # echo pageout > action
+ # # set quotas
+ # echo 10 > quotas/ms
+ # echo $((1024*1024*1024)) > quotas/bytes
+ # echo 1000 > quotas/reset_interval_ms
+ # # set watermark
+ # echo free_mem_rate > watermarks/metric
+ # echo 5000000 > watermarks/interval_us
+ # echo 600 > watermarks/high
+ # echo 500 > watermarks/mid
+ # echo 300 > watermarks/low
+
+Please note that it's highly recommended to use user space tools like `damo
+<https://github.com/awslabs/damo>`_ rather than manually reading and writing
+the files as above. The above is only an example.
+
+.. _debugfs_interface:
+
+debugfs Interface (DEPRECATED!)
+===============================
+
+.. note::
+
+ THIS IS DEPRECATED!
+
+ DAMON debugfs interface is deprecated, so users should move to the
+ :ref:`sysfs interface <sysfs_interface>`. If you depend on this and cannot
+ move, please report your use case to damon@lists.linux.dev and
+ linux-mm@kvack.org.
+
+DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
+``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
+``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
Attributes
----------
Users can get and set the ``sampling interval``, ``aggregation interval``,
-``regions update interval``, and min/max number of monitoring target regions by
+``update interval``, and min/max number of monitoring target regions by
reading from and writing to the ``attrs`` file. To know about the monitoring
-attributes in detail, please refer to the :doc:`/vm/damon/design`. For
+attributes in detail, please refer to the :doc:`/mm/damon/design`. For
example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and
1000, and then check it again::
@@ -71,9 +539,191 @@ check it again::
# cat target_ids
42 4242
+Users can also monitor the physical memory address space of the system by
+writing a special keyword, "``paddr\n``" to the file. Because physical address
+space monitoring doesn't support multiple targets, reading the file will show a
+fake value, ``42``, as below::
+
+ # cd <debugfs>/damon
+ # echo paddr > target_ids
+ # cat target_ids
+ 42
+
Note that setting the target ids doesn't start the monitoring.
+Initial Monitoring Target Regions
+---------------------------------
+
+In case of the virtual address space monitoring, DAMON automatically sets and
+updates the monitoring target regions so that entire memory mappings of the
+target processes can be covered. However, users may want to limit the
+monitoring region to specific address ranges, such as the heap, the stack, or
+specific file-mapped areas. Or, some users may know the initial access
+pattern of their workloads and therefore want to set optimal initial regions
+for the 'adaptive regions adjustment'.
+
+In contrast, DAMON does not automatically set and update the monitoring
+target regions in case of physical memory monitoring. Therefore, users
+should set the monitoring target regions by themselves.
+
+In such cases, users can explicitly set the initial monitoring target regions
+as they want, by writing proper values to the ``init_regions`` file. The
+input should be a sequence of three integers separated by white spaces that
+represent one region in the below form::
+
+ <target idx> <start address> <end address>
+
+The ``target idx`` should be the index of the target in the ``target_ids``
+file, starting from ``0``, and the regions should be passed in address order.
+For example, the below commands will set a couple of address ranges,
+``1-100`` and ``100-200``, as the initial monitoring target regions of pid
+42, which is the first one (index ``0``) in ``target_ids``, and another
+couple of address ranges, ``20-40`` and ``50-100``, as those of pid 4242,
+which is the second one (index ``1``) in ``target_ids``::
+
+ # cd <debugfs>/damon
+ # cat target_ids
+ 42 4242
+ # echo "0 1 100 \
+ 0 100 200 \
+ 1 20 40 \
+ 1 50 100" > init_regions
+
+Note that this sets the initial monitoring target regions only. In case of
+virtual memory monitoring, DAMON will automatically update the boundary of
+the regions after one ``update interval``. Therefore, users should set the
+``update interval`` large enough in this case, if they don't want the
+update.
+
+
+Schemes
+-------
+
+For usual DAMON-based data access aware memory management optimizations, users
+would simply want the system to apply a memory management action to a memory
+region of a specific access pattern. DAMON receives such formalized operation
+schemes from the user and applies those to the target processes.
+
+Users can get and set the schemes by reading from and writing to the
+``schemes`` debugfs file. Reading the file also shows the statistics of each
+scheme. To the file, each of the schemes should be represented in each line
+in the below form::
+
+ <target access pattern> <action> <quota> <watermarks>
+
+You can disable schemes by simply writing an empty string to the file.
+
+Target Access Pattern
+~~~~~~~~~~~~~~~~~~~~~
+
+The ``<target access pattern>`` is constructed with three ranges in below
+form::
+
+ min-size max-size min-acc max-acc min-age max-age
+
+Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
+number of monitored accesses per aggregate interval for access frequency
+(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
+regions (``min-age`` and ``max-age``) are specified. Note that the ranges
+are closed intervals.
+
+Action
+~~~~~~
+
+The ``<action>`` is a predefined integer for memory management actions, which
+DAMON will apply to the regions having the target access pattern. The
+supported numbers and their meanings are as below.
+
+ - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if
+ ``target`` is ``paddr``.
+ - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if
+ ``target`` is ``paddr``.
+ - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``.
+ - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if
+ ``target`` is ``paddr``.
+ - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if
+ ``target`` is ``paddr``.
+ - 5: Do nothing but count the statistics
+
+Quota
+~~~~~
+
+The optimal ``target access pattern`` for each ``action`` is workload
+dependent, so it is not easy to find. Worse yet, setting a scheme of some
+action too aggressively can cause severe overhead. To avoid such overhead,
+users can limit the time and size quota for the scheme via the ``<quota>`` in
+the below form::
+
+ <ms> <sz> <reset interval> <priority weights>
+
+This makes DAMON try to use only up to ``<ms>`` milliseconds for applying
+the action to memory regions of the ``target access pattern`` within the
+``<reset interval>`` milliseconds, and to apply the action to only up to
+``<sz>`` bytes of memory regions within the ``<reset interval>``. Setting
+both ``<ms>`` and ``<sz>`` to zero disables the quota limits.
+
+When the quota limit is expected to be exceeded, DAMON prioritizes found memory
+regions of the ``target access pattern`` based on their size, access frequency,
+and age. For personalized prioritization, users can set the weights for the
+three properties in ``<priority weights>`` in below form::
+
+ <size weight> <access frequency weight> <age weight>
+
+Watermarks
+~~~~~~~~~~
+
+Some schemes would need to run based on the current value of the system's
+specific metrics like the free memory ratio. For such cases, users can
+specify watermarks for the condition::
+
+ <metric> <check interval> <high mark> <middle mark> <low mark>
+
+``<metric>`` is a predefined integer for the metric to be checked. The
+supported numbers and their meanings are as below.
+
+ - 0: Ignore the watermarks
+ - 1: System's free memory rate (per thousand)
+
+The value of the metric is checked every ``<check interval>`` microseconds.
+
+If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
+scheme is deactivated. If the value is lower than ``<middle mark>``, the scheme
+is activated.
+
+.. _damos_stats:
+
+Statistics
+~~~~~~~~~~
+
+It also counts the total number and bytes of regions that each scheme has
+tried to be applied to, the two numbers for the regions that each scheme has
+successfully been applied to, and the total number of quota limit exceeds.
+These statistics can be used for online analysis or tuning of the schemes.
+
+The statistics can be shown by reading the ``schemes`` file. Reading the file
+will show each scheme you entered in each line, and the five numbers for the
+statistics will be added at the end of each line.
+
+Example
+~~~~~~~
+
+The below commands apply a scheme saying "If a memory region of size in [4KiB,
+8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
+interval in [10, 20], page out the region. For the paging out, use only up to
+10ms per second, and also don't page out more than 1GiB per second. Under the
+limitation, page out memory regions having longer age first. Also, check the
+free memory rate of the system every 5 seconds, start the monitoring and paging
+out when the free memory rate becomes lower than 50%, but stop it if the free
+memory rate becomes larger than 60%, or lower than 30%". ::
+
+ # cd <debugfs>/damon
+ # scheme="4096 8192 0 5 10 20 2" # target access pattern and action
+ # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
+ # scheme+=" 0 0 100" # prioritization weights
+ # scheme+=" 1 5000000 600 500 300" # watermarks
+ # echo "$scheme" > schemes
+
+
Turning On/Off
--------------
@@ -96,6 +746,54 @@ the monitoring is turned on. If you write to the files while DAMON is running,
an error code such as ``-EBUSY`` will be returned.
+Monitoring Thread PID
+---------------------
+
+DAMON does requested monitoring with a kernel thread called ``kdamond``. You
+can get the pid of the thread by reading the ``kdamond_pid`` file. When the
+monitoring is turned off, reading the file returns ``none``. ::
+
+ # cd <debugfs>/damon
+ # cat monitor_on
+ off
+ # cat kdamond_pid
+ none
+ # echo on > monitor_on
+ # cat kdamond_pid
+ 18594
+
+
+Using Multiple Monitoring Threads
+---------------------------------
+
+One ``kdamond`` thread is created for each monitoring context. You can create
+and remove monitoring contexts for use cases requiring multiple ``kdamond``
+threads using the ``mk_contexts`` and ``rm_contexts`` files.
+
+Writing the name of the new context to the ``mk_contexts`` file creates a
+directory of the name on the DAMON debugfs directory. The directory will have
+DAMON debugfs files for the context. ::
+
+ # cd <debugfs>/damon
+ # ls foo
+ # ls: cannot access 'foo': No such file or directory
+ # echo foo > mk_contexts
+ # ls foo
+ # attrs init_regions kdamond_pid schemes target_ids
+
+If the context is not needed anymore, you can remove it and the corresponding
+directory by putting the name of the context to the ``rm_contexts`` file. ::
+
+ # echo foo > rm_contexts
+ # ls foo
+ # ls: cannot access 'foo': No such file or directory
+
+Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
+root directory only.
+
+
+.. _tracepoint:
+
Tracepoint for Monitoring Results
=================================
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 8abaeb144e44..e4d4b4a8dc97 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -1,5 +1,3 @@
-.. _hugetlbpage:
-
=============
HugeTLB Pages
=============
@@ -65,7 +63,7 @@ HugePages_Surp
may be temporarily larger than the maximum number of surplus huge
pages when the system is under memory pressure.
Hugepagesize
- is the default hugepage size (in Kb).
+ is the default hugepage size (in kB).
Hugetlb
is the total amount of memory (in kB), consumed by huge
pages of all sizes.
@@ -86,7 +84,7 @@ by increasing or decreasing the value of ``nr_hugepages``.
Note: When the feature of freeing unused vmemmap pages associated with each
hugetlb page is enabled, we can fail to free the huge pages triggered by
-the user when ths system is under memory pressure. Please try again later.
+the user when the system is under memory pressure. Please try again later.
Pages that are used as huge pages are reserved inside the kernel and cannot
be used for other purposes. Huge pages cannot be swapped out under
@@ -128,7 +126,9 @@ hugepages
implicitly specifies the number of huge pages of default size to
allocate. If the number of huge pages of default size is implicitly
specified, it can not be overwritten by a hugepagesz,hugepages
- parameter pair for the default size.
+ parameter pair for the default size. This parameter also has a
+ node format. The node format specifies the number of huge pages
+ to allocate on specific nodes.
For example, on an architecture with 2M default huge page size::
@@ -138,6 +138,14 @@ hugepages
indicating that the hugepages=512 parameter is ignored. If a hugepages
parameter is preceded by an invalid hugepagesz parameter, it will
be ignored.
+
+ Node format example::
+
+ hugepagesz=2M hugepages=0:1,1:2
+
+ It will allocate one 2M hugepage on node0 and two 2M hugepages on node1.
+ If the node number is invalid, the parameter will be ignored.
+
default_hugepagesz
Specify the default huge page size. This parameter can
only be specified once on the command line. default_hugepagesz can
@@ -154,8 +162,8 @@ default_hugepagesz
will all result in 256 2M huge pages being allocated. Valid default
huge page size is architecture dependent.
hugetlb_free_vmemmap
- When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
- unused vmemmap pages associated with each HugeTLB page.
+ When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB
+ Vmemmap Optimization (HVO).
When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
indicates the current number of pre-allocated huge pages of the default size.
@@ -234,8 +242,12 @@ will exist, of the form::
hugepages-${size}kB
-Inside each of these directories, the same set of files will exist::
+Inside each of these directories, the same set of files as contained in
+``/proc`` will exist. In addition, two interfaces for demoting huge
+pages may exist::
+ demote
+ demote_size
nr_hugepages
nr_hugepages_mempolicy
nr_overcommit_hugepages
@@ -243,7 +255,29 @@ Inside each of these directories, the same set of files will exist::
resv_hugepages
surplus_hugepages
-which function as described above for the default huge page-sized case.
+The demote interfaces provide the ability to split a huge page into
+smaller huge pages. For example, the x86 architecture supports both
+1GB and 2MB huge page sizes. A 1GB huge page can be split into 512
+2MB huge pages. Demote interfaces are not available for the smallest
+huge page size. The demote interfaces are:
+
+demote_size
+ is the size of demoted pages. When a page is demoted, a corresponding
+ number of huge pages of demote_size will be created. By default,
+ demote_size is set to the next smaller huge page size. If there are
+ multiple smaller huge page sizes, demote_size can be set to any of
+ these smaller sizes. Only huge page sizes less than the current huge
+ pages size are allowed.
+
+demote
+ is used to demote a number of huge pages. A user with root privileges
+ can write to this file. It may not be possible to demote the
+ requested number of huge pages. To determine how many pages were
+ actually demoted, compare the value of nr_hugepages before and after
+ writing to the demote interface. demote is a write-only interface.
+
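+For example, the below sketch (assuming 1GB and 2MB huge page sizes are
+configured, as on many x86 systems) demotes two 1GB huge pages into 2MB huge
+pages::
+
+ # cd /sys/kernel/mm/hugepages/hugepages-1048576kB
+ # cat demote_size
+ 2048kB
+ # echo 2 > demote
+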
+The interfaces which are the same as in ``/proc`` (all except demote and
+demote_size) function as described above for the default huge page-sized case.
.. _mem_policy_and_hp_alloc:
@@ -277,7 +311,7 @@ memory policy mode--bind, preferred, local or interleave--may be used. The
resulting effect on persistent huge page allocation is as follows:
#. Regardless of mempolicy mode [see
- :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`],
+ Documentation/admin-guide/mm/numa_memory_policy.rst],
persistent huge pages will be distributed across the node or nodes
specified in the mempolicy as if "interleave" had been specified.
However, if a node in the policy does not contain sufficient contiguous
@@ -425,13 +459,13 @@ Examples
.. _map_hugetlb:
``map_hugetlb``
- see tools/testing/selftests/vm/map_hugetlb.c
+ see tools/testing/selftests/mm/map_hugetlb.c
``hugepage-shm``
- see tools/testing/selftests/vm/hugepage-shm.c
+ see tools/testing/selftests/mm/hugepage-shm.c
``hugepage-mmap``
- see tools/testing/selftests/vm/hugepage-mmap.c
+ see tools/testing/selftests/mm/hugepage-mmap.c
The `libhugetlbfs`_ library provides a wide range of userspace tools
to help with huge page usability, environment setup, and control.
diff --git a/Documentation/admin-guide/mm/idle_page_tracking.rst b/Documentation/admin-guide/mm/idle_page_tracking.rst
index df9394fb39c2..16fcf38dac56 100644
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -1,5 +1,3 @@
-.. _idle_page_tracking:
-
==================
Idle Page Tracking
==================
@@ -65,14 +63,13 @@ workload one should:
are not reclaimable, he or she can filter them out using
``/proc/kpageflags``.
-The page-types tool in the tools/vm directory can be used to assist in this.
+The page-types tool in the tools/mm directory can be used to assist in this.
If the tool is run initially with the appropriate option, it will mark all the
queried pages as idle. Subsequent runs of the tool can then show which pages have
their idle flag cleared in the interim.
-See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
-information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
-``/proc/kpagecgroup``.
+See Documentation/admin-guide/mm/pagemap.rst for more information about
+``/proc/pid/pagemap``, ``/proc/kpageflags``, and ``/proc/kpagecgroup``.
.. _impl_details:
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index cbd19d5e625f..1f883abf3f00 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -16,8 +16,7 @@ are described in Documentation/admin-guide/sysctl/vm.rst and in `man 5 proc`_.
.. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
Linux memory management has its own jargon and if you are not yet
-familiar with it, consider reading
-:ref:`Documentation/admin-guide/mm/concepts.rst <mm_concepts>`.
+familiar with it, consider reading Documentation/admin-guide/mm/concepts.rst.
Here we document in detail how to interact with various mechanisms in
the Linux memory management.
@@ -32,10 +31,14 @@ the Linux memory management.
idle_page_tracking
ksm
memory-hotplug
+ multigen_lru
nommu-mmap
numa_memory_policy
numaperf
pagemap
+ shrinker_debugfs
soft-dirty
+ swap_numa
transhuge
userfaultfd
+ zswap
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index 97d816791aca..7626392fe82c 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -1,5 +1,3 @@
-.. _admin_guide_ksm:
-
=======================
Kernel Samepage Merging
=======================
@@ -22,7 +20,7 @@ content which can be replaced by a single write-protected page (which
is automatically copied if a process later wants to update its
content). The amount of pages that KSM daemon scans in a single pass
and the time between the passes are configured using :ref:`sysfs
-intraface <ksm_sysfs>`
+interface <ksm_sysfs>`
KSM only merges anonymous (private) pages, never pagecache (file) pages.
KSM's merged pages were originally locked into kernel memory, but can now
@@ -159,6 +157,8 @@ stable_node_chains_prune_millisecs
The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
+general_profit
+ how effective is KSM. The calculation is explained below.
pages_shared
how many shared pages are being used
pages_sharing
@@ -184,6 +184,61 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
be increased accordingly.
+Monitoring KSM profit
+=====================
+
+KSM can save memory by merging identical pages, but also can consume
+additional memory, because it needs to generate a number of rmap_items to
+save each scanned page's brief rmap information. Some of these pages may
+be merged, but some may never be merged even after being checked several
+times; such pages consume memory unprofitably.
+
+1) How to determine whether KSM saves or consumes memory system-wide?
+   Here is a simple approximate calculation for reference::
+
+ general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) *
+ sizeof(rmap_item);
+
+ where all_rmap_items can be easily obtained by summing ``pages_sharing``,
+ ``pages_shared``, ``pages_unshared`` and ``pages_volatile``.
+
+2) The KSM profit within a single process can be similarly obtained by the
+ following approximate calculation::
+
+ process_profit =~ ksm_merging_pages * sizeof(page) -
+ ksm_rmap_items * sizeof(rmap_item).
+
+ where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
+ and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``. The process profit
+ is also shown in ``/proc/<pid>/ksm_stat`` as ksm_process_profit.
+
+From the perspective of an application, a high ratio of ``ksm_rmap_items`` to
+``ksm_merging_pages`` means a badly chosen madvise policy, so developers or
+administrators have to rethink how to change the madvise policy. As an example
+for reference, a page's size is usually 4K, and the rmap_item's size is
+32B on a 32-bit CPU architecture and 64B on a 64-bit CPU architecture;
+so if the ``ksm_rmap_items/ksm_merging_pages`` ratio exceeds 64 on a 64-bit
+CPU or exceeds 128 on a 32-bit CPU, then the app's madvise policy should be
+dropped, because the KSM profit is approximately zero or negative.
+
+Monitoring KSM events
+=====================
+
+There are some counters in /proc/vmstat that may be used to monitor KSM events.
+KSM might help save memory, but the tradeoff is that it may suffer delays on
+KSM COW or on swap-in of a KSM copy. Those events could help users evaluate
+whether or how to use KSM. For example, if cow_ksm increases too fast, the
+user may decrease the range of madvise(, , MADV_MERGEABLE).
+
+cow_ksm
+	is incremented every time a KSM page triggers copy on write (COW);
+	when users try to write to a KSM page, we have to make a copy.
+
+ksm_swpin_copy
+	is incremented every time a KSM page is copied when swapping in;
+	note that a KSM page might be copied when swapping in because
+	do_swap_page() cannot do all the locking needed to reconstitute a
+	cross-anon_vma KSM page.
+
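+Both counters can be read from ``/proc/vmstat``, e.g.::
+
+	grep -e cow_ksm -e ksm_swpin_copy /proc/vmstat
+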
--
Izik Eidus,
Hugh Dickins, 17 Nov 2009
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 03dfbc925252..1b02fe5807cc 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -1,5 +1,3 @@
-.. _admin_guide_memory_hotplug:
-
==================
Memory Hot(Un)Plug
==================
@@ -165,9 +163,8 @@ Or alternatively::
% echo 1 > /sys/devices/system/memory/memoryXXX/online
-The kernel will select the target zone automatically, usually defaulting to
-``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel
-command line or if the memory block would intersect the ZONE_MOVABLE already.
+The kernel will select the target zone automatically, depending on the
+configured ``online_policy``.
One can explicitly request to associate an offline memory block with
ZONE_MOVABLE by::
@@ -198,6 +195,9 @@ Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or
% echo online > /sys/devices/system/memory/auto_online_blocks
+Similarly to manual onlining, with ``online`` the kernel will select the
+target zone automatically, depending on the configured ``online_policy``.
+
Modifying the auto-online behavior will only affect subsequently added
memory blocks.
@@ -393,11 +393,16 @@ command line parameters are relevant:
======================== =======================================================
``memhp_default_state`` configure auto-onlining by essentially setting
``/sys/devices/system/memory/auto_online_blocks``.
-``movablecore`` configure automatic zone selection of the kernel. When
- set, the kernel will default to ZONE_MOVABLE, unless
- other zones can be kept contiguous.
+``movable_node`` configure automatic zone selection in the kernel when
+ using the ``contig-zones`` online policy. When
+ set, the kernel will default to ZONE_MOVABLE when
+ onlining a memory block, unless other zones can be kept
+ contiguous.
======================== =======================================================
+See Documentation/admin-guide/kernel-parameters.txt for a more generic
+description of these command line parameters.
+
Module Parameters
------------------
@@ -410,24 +415,118 @@ them with ``memory_hotplug.`` such as::
and they can be observed (and some even modified at runtime) via::
- /sys/modules/memory_hotplug/parameters/
+ /sys/module/memory_hotplug/parameters/
The following module parameters are currently defined:
-======================== =======================================================
-``memmap_on_memory`` read-write: Allocate memory for the memmap from the
- added memory block itself. Even if enabled, actual
- support depends on various other system properties and
- should only be regarded as a hint whether the behavior
- would be desired.
-
- While allocating the memmap from the memory block
- itself makes memory hotplug less likely to fail and
- keeps the memmap on the same NUMA node in any case, it
- can fragment physical memory in a way that huge pages
- in bigger granularity cannot be formed on hotplugged
- memory.
-======================== =======================================================
+================================ ===============================================
+``memmap_on_memory`` read-write: Allocate memory for the memmap from
+ the added memory block itself. Even if enabled,
+ actual support depends on various other system
+ properties and should only be regarded as a
+ hint whether the behavior would be desired.
+
+ While allocating the memmap from the memory
+ block itself makes memory hotplug less likely
+ to fail and keeps the memmap on the same NUMA
+ node in any case, it can fragment physical
+ memory in a way that huge pages in bigger
+ granularity cannot be formed on hotplugged
+ memory.
+``online_policy`` read-write: Set the basic policy used for
+ automatic zone selection when onlining memory
+ blocks without specifying a target zone.
+ ``contig-zones`` has been the kernel default
+ before this parameter was added. After an
+ online policy was configured and memory was
+ online, the policy should not be changed
+ anymore.
+
+ When set to ``contig-zones``, the kernel will
+ try keeping zones contiguous. If a memory block
+ intersects multiple zones or no zone, the
+ behavior depends on the ``movable_node`` kernel
+ command line parameter: default to ZONE_MOVABLE
+ if set, default to the applicable kernel zone
+ (usually ZONE_NORMAL) if not set.
+
+ When set to ``auto-movable``, the kernel will
+ try onlining memory blocks to ZONE_MOVABLE if
+ possible according to the configuration and
+ memory device details. With this policy, one
+ can avoid zone imbalances when eventually
+ hotplugging a lot of memory later and still
+ wanting to be able to hotunplug as much as
+ possible reliably, very desirable in
+ virtualized environments. This policy ignores
+ the ``movable_node`` kernel command line
+ parameter and isn't really applicable in
+ environments that require it (e.g., bare metal
+ with hotunpluggable nodes) where hotplugged
+ memory might be exposed via the
+ firmware-provided memory map early during boot
+ to the system instead of getting detected,
+ added and onlined later during boot (such as
+ done by virtio-mem or by some hypervisors
+ implementing emulated DIMMs). As one example, a
+ hotplugged DIMM will be onlined either
+ completely to ZONE_MOVABLE or completely to
+ ZONE_NORMAL, not a mixture.
+ As another example, as many memory blocks
+ belonging to a virtio-mem device will be
+ onlined to ZONE_MOVABLE as possible,
+ special-casing units of memory blocks that can
+ only get hotunplugged together. *This policy
+ does not protect from setups that are
+ problematic with ZONE_MOVABLE and does not
+ change the zone of memory blocks dynamically
+ after they were onlined.*
+``auto_movable_ratio`` read-write: Set the maximum MOVABLE:KERNEL
+ memory ratio in % for the ``auto-movable``
+ online policy. Whether the ratio applies only
+ for the system across all NUMA nodes or also
+ per NUMA nodes depends on the
+ ``auto_movable_numa_aware`` configuration.
+
+ All accounting is based on present memory pages
+ in the zones combined with accounting per
+ memory device. Memory dedicated to the CMA
+ allocator is accounted as MOVABLE, although
+ residing on one of the kernel zones. The
+ possible ratio depends on the actual workload.
+ The kernel default is "301" %, for example,
+                                 allowing for hotplugging 24 GiB to an 8 GiB VM
+ and automatically onlining all hotplugged
+ memory to ZONE_MOVABLE in many setups. The
+ additional 1% deals with some pages being not
+ present, for example, because of some firmware
+ allocations.
+
+ Note that ZONE_NORMAL memory provided by one
+ memory device does not allow for more
+ ZONE_MOVABLE memory for a different memory
+ device. As one example, onlining memory of a
+ hotplugged DIMM to ZONE_NORMAL will not allow
+ for another hotplugged DIMM to get onlined to
+ ZONE_MOVABLE automatically. In contrast, memory
+ hotplugged by a virtio-mem device that got
+ onlined to ZONE_NORMAL will allow for more
+ ZONE_MOVABLE memory within *the same*
+ virtio-mem device.
+``auto_movable_numa_aware`` read-write: Configure whether the
+ ``auto_movable_ratio`` in the ``auto-movable``
+ online policy also applies per NUMA
+ node in addition to the whole system across all
+ NUMA nodes. The kernel default is "Y".
+
+ Disabling NUMA awareness can be helpful when
+ dealing with NUMA nodes that should be
+ completely hotunpluggable, onlining the memory
+ completely to ZONE_MOVABLE automatically if
+ possible.
+
+ Parameter availability depends on CONFIG_NUMA.
+================================ ===============================================
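+
+As a sketch combining the parameters above (using the module parameter and
+sysfs paths described in this section; the values are only an example)::
+
+	echo auto-movable > /sys/module/memory_hotplug/parameters/online_policy
+	echo 301 > /sys/module/memory_hotplug/parameters/auto_movable_ratio
+	echo online > /sys/devices/system/memory/auto_online_blocks
+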
ZONE_MOVABLE
============
@@ -552,8 +651,8 @@ block might fail:
- Concurrent activity that operates on the same physical memory area, such as
allocating gigantic pages, can result in temporary offlining failures.
-- Out of memory when dissolving huge pages, especially when freeing unused
- vmemmap pages associated with each hugetlb page is enabled.
+- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap
+ Optimization (HVO) is enabled.
Offlining code may be able to migrate huge page contents, but may not be able
to dissolve the source huge page because it fails allocating (unmovable) pages
diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst
new file mode 100644
index 000000000000..33e068830497
--- /dev/null
+++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -0,0 +1,162 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Quick start
+===========
+Build the kernel with the following configurations.
+
+* ``CONFIG_LRU_GEN=y``
+* ``CONFIG_LRU_GEN_ENABLED=y``
+
+All set!
+
+Runtime options
+===============
+``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
+following subsections.
+
+Kill switch
+-----------
+``enabled`` accepts different values to enable or disable the
+following components. Its default value depends on
+``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
+unless some of them have unforeseen side effects. Writing to
+``enabled`` has no effect when a component is not supported by the
+hardware, and valid values will be accepted even when the main switch
+is off.
+
+====== ===============================================================
+Values Components
+====== ===============================================================
+0x0001 The main switch for the multi-gen LRU.
+0x0002 Clearing the accessed bit in leaf page table entries in large
+ batches, when MMU sets it (e.g., on x86). This behavior can
+ theoretically worsen lock contention (mmap_lock). If it is
+ disabled, the multi-gen LRU will suffer a minor performance
+ degradation for workloads that contiguously map hot pages,
+ whose accessed bits can be otherwise cleared by fewer larger
+ batches.
+0x0004 Clearing the accessed bit in non-leaf page table entries as
+ well, when MMU sets it (e.g., on x86). This behavior was not
+ verified on x86 varieties other than Intel and AMD. If it is
+ disabled, the multi-gen LRU will suffer a negligible
+ performance degradation.
+[yYnN] Apply to all the components above.
+====== ===============================================================
+
+E.g.,
+::
+
+ echo y >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0007
+ echo 5 >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0005
+
+Thrashing prevention
+--------------------
+Personal computers are more sensitive to thrashing because it can
+cause janks (lags when rendering UI) and negatively impact user
+experience. The multi-gen LRU offers thrashing prevention to the
+majority of laptop and desktop users who do not have ``oomd``.
+
+Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
+``N`` milliseconds from getting evicted. The OOM killer is triggered
+if this working set cannot be kept in memory. In other words, this
+option works as an adjustable pressure relief valve, and when open, it
+terminates applications that are hopefully not being used.
+
+Based on the average human detectable lag (~100ms), ``N=1000`` usually
+eliminates intolerable janks due to thrashing. Larger values like
+``N=3000`` make janks less noticeable at the risk of premature OOM
+kills.
+
+The default value ``0`` means disabled.
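+
+E.g., to protect the most recent second of the working set, as suggested
+above::
+
+    echo 1000 >/sys/kernel/mm/lru_gen/min_ttl_ms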
+
+Experimental features
+=====================
+``/sys/kernel/debug/lru_gen`` accepts commands described in the
+following subsections. Multiple command lines are supported, as is
+concatenation with the delimiters ``,`` and ``;``.
+
+``/sys/kernel/debug/lru_gen_full`` provides additional stats for
+debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
+evicted generations in this file.
+
+Working set estimation
+----------------------
+Working set estimation measures how much memory an application needs
+in a given time interval, and it is usually done with little impact on
+the performance of the application. E.g., data centers want to
+optimize job scheduling (bin packing) to improve memory utilization.
+When a new job comes in, the job scheduler needs to find out whether
+each server it manages can allocate a certain amount of memory for
+this new job before it can pick a candidate. To do so, the job
+scheduler needs to estimate the working sets of the existing jobs.
+
+When it is read, ``lru_gen`` returns a histogram of numbers of pages
+accessed over different time intervals for each memcg and node.
+``MAX_NR_GENS`` decides the number of bins for each histogram. The
+histograms are noncumulative.
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen_nr age_in_ms nr_anon_pages nr_file_pages
+ ...
+ max_gen_nr age_in_ms nr_anon_pages nr_file_pages
+
+Each bin contains an estimated number of pages that have been accessed
+within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
+and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
+the former is the largest and that of the latter is the smallest.
+
+Users can write the following command to ``lru_gen`` to create a new
+generation ``max_gen_nr+1``:
+
+ ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
+
+``can_swap`` defaults to the swap setting and, if it is set to ``1``,
+it forces the scan of anon pages when swap is off, and vice versa.
+``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
+employs heuristics to reduce the overhead, which is likely to reduce
+the coverage as well.
+
+A typical use case is that a job scheduler runs this command at a
+certain time interval to create new generations, and it ranks the
+servers it manages based on the sizes of their cold pages defined by
+this time interval.
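+
+E.g., to create a new generation for memcg 1 on node 0, forcing an anon scan
+with full coverage (the memcg id and the current ``max_gen_nr`` of 5 here are
+illustrative; read the real values from ``lru_gen`` first)::
+
+    echo '+ 1 0 5 1 1' >/sys/kernel/debug/lru_gen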
+
+Proactive reclaim
+-----------------
+Proactive reclaim induces page reclaim when there is no memory
+pressure. It usually targets cold pages only. E.g., when a new job
+comes in, the job scheduler wants to proactively reclaim cold pages on
+the server it selected, to improve the chance of successfully landing
+this new job.
+
+Users can write the following command to ``lru_gen`` to evict
+generations less than or equal to ``min_gen_nr``.
+
+ ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
+
+``min_gen_nr`` should be less than ``max_gen_nr-1``, since
+``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
+the active list) and therefore cannot be evicted. ``swappiness``
+overrides the default value in ``/proc/sys/vm/swappiness``.
+``nr_to_reclaim`` limits the number of pages to evict.
+
+A typical use case is that a job scheduler runs this command before it
+tries to land a new job on a server. If it fails to materialize enough
+cold pages because of the overestimation, it retries on the next
+server according to the ranking result obtained from the working set
+estimation step. This less forceful approach limits the impacts on the
+existing jobs.
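+
+E.g., to evict up to 1000 cold pages from generations up to ``min_gen_nr=3``
+for memcg 1 on node 0, with a swappiness of 20 (all values illustrative)::
+
+    echo '- 1 0 3 20 1000' >/sys/kernel/debug/lru_gen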
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index 64fd0ba0d057..46515ad2337f 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -1,5 +1,3 @@
-.. _numa_memory_policy:
-
==================
NUMA Memory Policy
==================
@@ -246,7 +244,7 @@ MPOL_INTERLEAVED
interleaved system default policy works in this mode.
MPOL_PREFERRED_MANY
- This mode specifices that the allocation should be preferrably
+ This mode specifies that the allocation should be preferably
satisfied from the nodemask specified in the policy. If there is
a memory pressure on all nodes in the nodemask, the allocation
can fall back to all existing numa nodes. This is effectively
@@ -360,7 +358,7 @@ and NUMA nodes. "Usage" here means one of the following:
2) examination of the policy to determine the policy mode and associated node
or node lists, if any, for page allocation. This is considered a "hot
path". Note that for MPOL_BIND, the "usage" extends across the entire
- allocation process, which may sleep during page reclaimation, because the
+ allocation process, which may sleep during page reclamation, because the
BIND policy nodemask is used, by reference, to filter ineligible nodes.
We can avoid taking an extra reference during the usages listed above as
@@ -408,7 +406,7 @@ follows:
Memory Policy APIs
==================
-Linux supports 3 system calls for controlling memory policy. These APIS
+Linux supports 4 system calls for controlling memory policy. These APIS
always affect only the calling task, the calling task's address space, or
some shared object mapped into the calling task's address space.
@@ -460,6 +458,20 @@ requested via the 'flags' argument.
See the mbind(2) man page for more details.
+Set home node for a Range of Task's Address Space::
+
+ long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
+ unsigned long home_node,
+ unsigned long flags);
+
+sys_set_mempolicy_home_node sets the home node for a VMA policy present in the
+task's address range. The system call updates the home node only for the existing
+mempolicy range; other address ranges are ignored. The home node is the NUMA node
+from which page allocations will preferably be satisfied. Specifying the home
+node overrides the default policy of allocating memory close to the node of the
+executing CPU.
+
+
Memory Policy Command Line Interface
====================================
diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst
index 166697325947..90a12b6a8bfc 100644
--- a/Documentation/admin-guide/mm/numaperf.rst
+++ b/Documentation/admin-guide/mm/numaperf.rst
@@ -1,6 +1,7 @@
-.. _numaperf:
+=======================
+NUMA Memory Performance
+=======================
-=============
NUMA Locality
=============
@@ -61,7 +62,6 @@ that are CPUs and hence suitable for generic task scheduling, and
IO initiators such as GPUs and NICs. Unlike access class 0, only
nodes containing CPUs are considered.
-================
NUMA Performance
================
@@ -96,7 +96,6 @@ for the platform.
Access class 1 takes the same form but only includes values for CPU to
memory activity.
-==========
NUMA Cache
==========
@@ -170,7 +169,6 @@ The "size" is the number of bytes provided by this cache level.
The "write_policy" will be 0 for write-back, and non-zero for
write-through caching.
-========
See Also
========
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index fb578fbbb76c..c8f380271cad 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -1,5 +1,3 @@
-.. _pagemap:
-
=============================
Examining Process Page Tables
=============================
@@ -19,11 +17,11 @@ There are four components to pagemap:
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see
- :ref:`Documentation/admin-guide/mm/soft-dirty.rst <soft_dirty>`)
+ Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped (since 4.2)
* Bit 57 pte is uffd-wp write-protected (since 5.13) (see
- :ref:`Documentation/admin-guide/mm/userfaultfd.rst <userfaultfd>`)
- * Bits 57-60 zero
+ Documentation/admin-guide/mm/userfaultfd.rst)
+ * Bits 58-60 zero
* Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
* Bit 63 page present
@@ -46,7 +44,7 @@ There are four components to pagemap:
* ``/proc/kpagecount``. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
-The page-types tool in the tools/vm directory can be used to query the
+The page-types tool in the tools/mm directory can be used to query the
number of times a page is mapped.
* ``/proc/kpageflags``. This file contains a 64-bit set of flags for each
@@ -90,13 +88,14 @@ Short descriptions to the page flags
====================================
0 - LOCKED
- page is being locked for exclusive access, e.g. by undergoing read/write IO
+ The page is being locked for exclusive access, e.g. by undergoing read/write
+ IO.
7 - SLAB
- page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
- When compound page is used, SLUB/SLQB will only set this flag on the head
- page; SLOB will not flag it at all.
+ The page is managed by the SLAB/SLUB kernel memory allocator.
+    When a compound page is used, either allocator will set this flag only on
+    the head page.
10 - BUDDY
- a free memory block managed by the buddy system allocator
+ A free memory block managed by the buddy system allocator.
The buddy system organizes free memory in blocks of various orders.
An order N block has 2^N physically contiguous pages, with the BUDDY flag
set for and _only_ for the first page.
@@ -104,75 +103,74 @@ Short descriptions to the page flags
A compound page with order N consists of 2^N physically contiguous pages.
A compound page with order 2 takes the form of "HTTT", where H donates its
head page and T donates its tail page(s). The major consumers of compound
- pages are hugeTLB pages
- (:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`),
+ pages are hugeTLB pages (Documentation/admin-guide/mm/hugetlbpage.rst),
the SLUB etc. memory allocators and various device drivers.
However in this interface, only huge/giga pages are made visible
to end users.
16 - COMPOUND_TAIL
A compound page tail (see description above).
17 - HUGE
- this is an integral part of a HugeTLB page
+ This is an integral part of a HugeTLB page.
19 - HWPOISON
- hardware detected memory corruption on this page: don't touch the data!
+ Hardware detected memory corruption on this page: don't touch the data!
20 - NOPAGE
- no page frame exists at the requested address
+ No page frame exists at the requested address.
21 - KSM
- identical memory pages dynamically shared between one or more processes
+ Identical memory pages dynamically shared between one or more processes.
22 - THP
- contiguous pages which construct transparent hugepages
+ Contiguous pages which construct transparent hugepages.
23 - OFFLINE
- page is logically offline
+ The page is logically offline.
24 - ZERO_PAGE
- zero page for pfn_zero or huge_zero page
+ Zero page for pfn_zero or huge_zero page.
25 - IDLE
- page has not been accessed since it was marked idle (see
- :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
+ The page has not been accessed since it was marked idle (see
+ Documentation/admin-guide/mm/idle_page_tracking.rst).
Note that this flag may be stale in case the page was accessed via
a PTE. To make sure the flag is up-to-date one has to read
``/sys/kernel/mm/page_idle/bitmap`` first.
26 - PGTABLE
- page is in use as a page table
+ The page is in use as a page table.
IO related page flags
---------------------
1 - ERROR
- IO error occurred
+ IO error occurred.
3 - UPTODATE
- page has up-to-date data
+ The page has up-to-date data.
     i.e. for file backed page: (in-memory data revision >= on-disk one)
4 - DIRTY
- page has been written to, hence contains new data
+ The page has been written to, hence contains new data.
i.e. for file backed page: (in-memory data revision > on-disk one)
8 - WRITEBACK
- page is being synced to disk
+ The page is being synced to disk.
LRU related page flags
----------------------
5 - LRU
- page is in one of the LRU lists
+ The page is in one of the LRU lists.
6 - ACTIVE
- page is in the active LRU list
+ The page is in the active LRU list.
18 - UNEVICTABLE
- page is in the unevictable (non-)LRU list It is somehow pinned and
+    The page is in the unevictable (non-)LRU list. It is somehow pinned and
not a candidate for LRU page reclaims, e.g. ramfs pages,
- shmctl(SHM_LOCK) and mlock() memory segments
+ shmctl(SHM_LOCK) and mlock() memory segments.
2 - REFERENCED
- page has been referenced since last LRU list enqueue/requeue
+ The page has been referenced since last LRU list enqueue/requeue.
9 - RECLAIM
- page will be reclaimed soon after its pageout IO completed
+ The page will be reclaimed soon after its pageout IO completed.
11 - MMAP
- a memory mapped page
+ A memory mapped page.
12 - ANON
- a memory mapped page that is not part of a file
+ A memory mapped page that is not part of a file.
13 - SWAPCACHE
- page is mapped to swap space, i.e. has an associated swap entry
+ The page is mapped to swap space, i.e. has an associated swap entry.
14 - SWAPBACKED
- page is backed by swap/RAM
+ The page is backed by swap/RAM.
-The page-types tool in the tools/vm directory can be used to query the
+The page-types tool in the tools/mm directory can be used to query the
above flags.
Using pagemap to do something useful
@@ -196,6 +194,28 @@ you can go through every map in the process, find the PFNs, look those up
in kpagecount, and tally up the number of pages that are only referenced
once.
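+
+A hedged sketch of reading a single pagemap entry from the shell (assuming a
+4 KiB page size; ``$pid`` and ``$vaddr`` are placeholders, and bit 63 of the
+printed value is the "present" bit described above)::
+
+    dd if=/proc/$pid/pagemap bs=8 skip=$(( $vaddr / 4096 )) count=1 \
+        2>/dev/null | od -An -tx8
+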
+Exceptions for Shared Memory
+============================
+
+Page table entries for shared pages are cleared when the pages are zapped or
+swapped out. This makes swapped out pages indistinguishable from never-allocated
+ones.
+
+In kernel space, the swap location can still be retrieved from the page cache.
+However, values stored only on the normal PTE get lost irretrievably when the
+page is swapped out (i.e. SOFT_DIRTY).
+
+In user space, whether the page is present, swapped or none can be deduced
+with the help of the lseek() and/or mincore() system calls.
+
+lseek() can differentiate between accessed pages (present or swapped out) and
+holes (none/non-allocated) by specifying the SEEK_DATA flag on the file where
+the pages are backed. For anonymous shared pages, the file can be found in
+``/proc/pid/map_files/``.
+
+mincore() can differentiate between pages in memory (present, including swap
+cache) and out of memory (swapped out or none/non-allocated).
+
Other notes
===========
diff --git a/Documentation/admin-guide/mm/shrinker_debugfs.rst b/Documentation/admin-guide/mm/shrinker_debugfs.rst
new file mode 100644
index 000000000000..c582033bd113
--- /dev/null
+++ b/Documentation/admin-guide/mm/shrinker_debugfs.rst
@@ -0,0 +1,133 @@
+==========================
+Shrinker Debugfs Interface
+==========================
+
+The shrinker debugfs interface provides visibility into the kernel memory
+shrinker subsystem and allows getting information about individual shrinkers
+and interacting with them.
+
+For each shrinker registered in the system, a directory in **<debugfs>/shrinker/**
+is created. The directory's name is composed of the shrinker's name and a
+unique id: e.g. *kfree_rcu-0* or *sb-xfs:vda1-36*.
+
+Each shrinker directory contains **count** and **scan** files, which allow
+triggering the *count_objects()* and *scan_objects()* callbacks for each memcg
+and numa node (if applicable).
+
+Usage:
+------
+
+1. *List registered shrinkers*
+
+ ::
+
+ $ cd /sys/kernel/debug/shrinker/
+ $ ls
+ dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42
+ mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43
+ mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44
+ rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49
+ sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13
+ sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36
+ sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19
+ sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10
+ sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9
+ sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37
+ sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38
+ sb-dax-11 sb-proc-45 sb-tmpfs-35
+ sb-debugfs-7 sb-proc-46 sb-tmpfs-40
+
+2. *Get information about a specific shrinker*
+
+ ::
+
+ $ cd sb-btrfs\:vda2-24/
+ $ ls
+ count scan
+
+3. *Count objects*
+
+ Each line in the output has the following format::
+
+ <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ...
+ <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ...
+ ...
+
+   If there are no objects on any of the numa nodes, the line is omitted. If
+   there are no objects at all, the output might be empty.
+
+ If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed
+ as cgroup inode id. If the shrinker is not numa-aware, 0's are printed
+ for all nodes except the first one.
+ ::
+
+ $ cat count
+ 1 224 2
+ 21 98 0
+ 55 818 10
+ 2367 2 0
+ 2401 30 0
+ 225 13 0
+ 599 35 0
+ 939 124 0
+ 1041 3 0
+ 1075 1 0
+ 1109 1 0
+ 1279 60 0
+ 1313 7 0
+ 1347 39 0
+ 1381 3 0
+ 1449 14 0
+ 1483 63 0
+ 1517 53 0
+ 1551 6 0
+ 1585 1 0
+ 1619 6 0
+ 1653 40 0
+ 1687 11 0
+ 1721 8 0
+ 1755 4 0
+ 1789 52 0
+ 1823 888 0
+ 1857 1 0
+ 1925 2 0
+ 1959 32 0
+ 2027 22 0
+ 2061 9 0
+ 2469 799 0
+ 2537 861 0
+ 2639 1 0
+ 2707 70 0
+ 2775 4 0
+ 2877 84 0
+ 293 1 0
+ 735 8 0
+
+4. *Scan objects*
+
+ The expected input format::
+
+ <cgroup inode id> <numa id> <number of objects to scan>
+
+   For a non-memcg-aware shrinker or on a system with no memory
+   cgroups, **0** should be passed as the cgroup id.
+ ::
+
+ $ cd /sys/kernel/debug/shrinker/
+ $ cd sb-btrfs\:vda2-24/
+
+ $ cat count | head -n 5
+ 1 212 0
+ 21 97 0
+ 55 802 5
+ 2367 2 0
+ 225 13 0
+
+ $ echo "55 0 200" > scan
+
+ $ cat count | head -n 5
+ 1 212 0
+ 21 96 0
+ 55 752 5
+ 2367 2 0
+ 225 13 0
diff --git a/Documentation/admin-guide/mm/soft-dirty.rst b/Documentation/admin-guide/mm/soft-dirty.rst
index cb0cfd6672fa..aeea936caa44 100644
--- a/Documentation/admin-guide/mm/soft-dirty.rst
+++ b/Documentation/admin-guide/mm/soft-dirty.rst
@@ -1,5 +1,3 @@
-.. _soft_dirty:
-
===============
Soft-Dirty PTEs
===============
diff --git a/Documentation/vm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst
index e0466f2db8fa..2e630627bcee 100644
--- a/Documentation/vm/swap_numa.rst
+++ b/Documentation/admin-guide/mm/swap_numa.rst
@@ -1,5 +1,3 @@
-.. _swap_numa:
-
===========================================
Automatically bind swap device to numa node
===========================================
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index c9c37f16eef8..b0cc8243e093 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -1,5 +1,3 @@
-.. _admin_guide_transhuge:
-
============================
Transparent Hugepage Support
============================
@@ -191,7 +189,14 @@ allocation failure to throttle the next allocation attempt::
/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
-The khugepaged progress can be seen in the number of pages collapsed::
+The khugepaged progress can be seen in the number of pages collapsed (note
+that this counter may not be an exact count of the number of pages
+collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
+being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
+one 2M hugepage. Each may happen independently, or together, depending on
+the type of memory and the failures that occur. As such, this value should
+be interpreted roughly as a sign of progress, and counters in /proc/vmstat
+consulted for more accurate accounting)::
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
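+
+For example, the more precise /proc/vmstat counters mentioned above can be
+inspected with::
+
+	grep thp_collapse /proc/vmstat
+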
@@ -366,10 +371,9 @@ thp_split_pmd
page table entry.
thp_zero_page_alloc
- is incremented every time a huge zero page is
- successfully allocated. It includes allocations which where
- dropped due race with other allocation. Note, it doesn't count
- every map of the huge zero page, only its allocation.
+ is incremented every time a huge zero page used for thp is
+ successfully allocated. Note, it doesn't count every map of
+ the huge zero page, only its allocation.
thp_zero_page_alloc_failed
is incremented if kernel fails to allocate
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 6528036093e1..7c304e432205 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -1,5 +1,3 @@
-.. _userfaultfd:
-
===========
Userfaultfd
===========
@@ -17,7 +15,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
Design
======
-Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
+Userspace creates a new userfaultfd, initializes it, and registers one or more
+regions of virtual memory with it. Then, any page faults which occur within the
+region(s) result in a message being delivered to the userfaultfd, notifying
+userspace of the fault.
The ``userfaultfd`` (aside from registering and unregistering virtual
memory ranges) provides two primary functionalities:
@@ -34,12 +35,11 @@ The real advantage of userfaults if compared to regular virtual memory
management of mremap/mprotect is that the userfaults in all their
operations never involve heavyweight structures like vmas (in fact the
``userfaultfd`` runtime load never takes the mmap_lock for writing).
-
Vmas are not suitable for page- (or hugepage) granular fault tracking
when dealing with virtual address spaces that could span
Terabytes. Too many vmas would be needed for that.
-The ``userfaultfd`` once opened by invoking the syscall, can also be
+The ``userfaultfd``, once created, can also be
passed using unix domain sockets to a manager process, so the same
manager process could handle the userfaults of a multitude of
different processes without them being aware about what is going on
@@ -50,6 +50,39 @@ is a corner case that would currently return ``-EBUSY``).
API
===
+Creating a userfaultfd
+----------------------
+
+There are two ways to create a new userfaultfd, each of which provides ways to
+restrict access to this functionality (since historically userfaultfds which
+handle kernel page faults have been a useful tool for exploiting the kernel).
+
+The first way, supported since userfaultfd was introduced, is the
+userfaultfd(2) syscall. Access to this is controlled in several ways:
+
+- Any user can always create a userfaultfd which traps userspace page faults
+ only. Such a userfaultfd can be created using the userfaultfd(2) syscall
+ with the flag UFFD_USER_MODE_ONLY.
+
+- In order to also trap kernel page faults for the address space, either the
+ process needs the CAP_SYS_PTRACE capability, or the system must have
+ vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd
+ is set to 0.
+
+The second way, added to the kernel more recently, is by opening
+/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
+yields equivalent userfaultfds to the userfaultfd(2) syscall.
+
+Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal
+filesystem permissions (user/group/mode), which gives fine grained access to
+userfaultfd specifically, without also granting other unrelated privileges at
+the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access
+to /dev/userfaultfd can always create userfaultfds that trap kernel page faults;
+vm.unprivileged_userfaultfd is not considered.
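+
+For example, an administrator could grant a dedicated group access to
+/dev/userfaultfd with ordinary file permissions (the group name is purely
+illustrative)::
+
+    # chgrp uffd /dev/userfaultfd
+    # chmod 0660 /dev/userfaultfd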
+
+Initializing a userfaultfd
+--------------------------
+
When first opened the ``userfaultfd`` must be enabled invoking the
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
a later API version) which will specify the ``read/POLLIN`` protocol
@@ -186,6 +219,31 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter
you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was
used.
+Userfaultfd write-protect mode currently behaves differently on none ptes
+(e.g. when the page is missing) across different types of memory.
+
+For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes
+(e.g. when pages are missing and not populated). For file-backed memory
+like shmem and hugetlbfs, none ptes will be write protected just like a
+present pte. In other words, a userfaultfd write fault message will be
+generated when writing to a missing page on file-backed memory,
+as long as the page range was write-protected before. Such a message will
+not be generated on anonymous memory by default.
+
+If the application wants to be able to write protect none ptes on anonymous
+memory, one can pre-populate the memory with e.g. MADV_POPULATE_READ. On
+newer kernels, one can also detect the feature UFFD_FEATURE_WP_UNPOPULATED
+and set the feature bit in advance to make sure none ptes will also be
+write protected even on anonymous memory.
+
+When using ``UFFDIO_REGISTER_MODE_WP`` in combination with either
+``UFFDIO_REGISTER_MODE_MISSING`` or ``UFFDIO_REGISTER_MODE_MINOR``, when
+resolving missing / minor faults with ``UFFDIO_COPY`` or ``UFFDIO_CONTINUE``
+respectively, it may be desirable for the new page / mapping to be
+write-protected (so future writes will also result in a WP fault). These ioctls
+support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP``
+respectively) to configure the mapping this way.
+
QEMU/KVM
========
diff --git a/Documentation/vm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index 8edb8d578caf..c5c2c7dbb155 100644
--- a/Documentation/vm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -1,5 +1,3 @@
-.. _zswap:
-
=====
zswap
=====
@@ -14,13 +12,7 @@ for potentially reduced swap I/O. This trade-off can also result in a
significant performance improvement if reads from the compressed cache are
faster than reads from a swap device.
-.. note::
- Zswap is a new feature as of v3.11 and interacts heavily with memory
- reclaim. This interaction has not been fully explored on the large set of
- potential configurations and workloads that exist. For this reason, zswap
- is a work in progress and should be considered experimental.
-
- Some potential benefits:
+Some potential benefits:
* Desktop/laptop users with limited RAM capacities can mitigate the
performance impact of swapping.
@@ -76,9 +68,7 @@ e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using the sysfs
The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
means the compression ratio will always be 2:1 or worse (because of half-full
zbud pages). The zsmalloc type zpool has a more complex compressed page
-storage method, and it can achieve greater storage densities. However,
-zsmalloc does not implement compressed page eviction, so once zswap fills it
-cannot evict the oldest page, it can only reject new pages.
+storage method, and it can achieve greater storage densities.
When a swap page is passed from frontswap to zswap, zswap maintains a mapping
of the swap entry, a combination of the swap type and swap offset, to the zpool
@@ -130,9 +120,25 @@ attribute, e.g.::
echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
When zswap same-filled page identification is disabled at runtime, it will stop
-checking for the same-value filled pages during store operation. However, the
-existing pages which are marked as same-value filled pages remain stored
-unchanged in zswap until they are either loaded or invalidated.
+checking for the same-value filled pages during store operation.
+In other words, every page will then be considered non-same-value filled.
+However, the existing pages which are marked as same-value filled pages remain
+stored unchanged in zswap until they are either loaded or invalidated.
+
+In some circumstances it might be advantageous to make use of just the zswap
+ability to efficiently store same-filled pages without enabling the whole
+compressed page storage.
+In this case the handling of non-same-value pages by zswap (enabled by default)
+can be disabled by setting the ``non_same_filled_pages_enabled`` attribute
+to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``.
+It can also be enabled and disabled at runtime using the sysfs
+``non_same_filled_pages_enabled`` attribute, e.g.::
+
+ echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled
+
+Disabling both ``zswap.same_filled_pages_enabled`` and
+``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new
+pages by zswap.
To prevent zswap from shrinking pool when zswap is full and there's a high
pressure on swap (this will result in flipping pages in and out zswap pool
diff --git a/Documentation/admin-guide/nfs/nfs-client.rst b/Documentation/admin-guide/nfs/nfs-client.rst
index 6adb6457bc69..36760685dd34 100644
--- a/Documentation/admin-guide/nfs/nfs-client.rst
+++ b/Documentation/admin-guide/nfs/nfs-client.rst
@@ -36,10 +36,9 @@ administrative requirements that require particular behavior that does not
work well as part of an nfs_client_id4 string.
The nfs.nfs4_unique_id boot parameter specifies a unique string that can be
-used instead of a system's node name when an NFS client identifies itself to
-a server. Thus, if the system's node name is not unique, or it changes, its
-nfs.nfs4_unique_id stays the same, preventing collision with other clients
-or loss of state during NFS reboot recovery or transparent state migration.
+used together with a system's node name when an NFS client identifies itself to
+a server. Thus, if the system's node name is not unique, its
+nfs.nfs4_unique_id can help prevent collisions with other clients.
The nfs.nfs4_unique_id string is typically a UUID, though it can contain
anything that is believed to be unique across all NFS clients. An
@@ -53,8 +52,12 @@ outstanding NFSv4 state has expired, to prevent loss of NFSv4 state.
This string can be stored in an NFS client's grub.conf, or it can be provided
via a net boot facility such as PXE. It may also be specified as an nfs.ko
-module parameter. Specifying a uniquifier string is not support for NFS
-clients running in containers.
+module parameter.
+
+This uniquifier string will be the same for all NFS clients running in
+containers unless it is overridden by a value written to
+/sys/fs/nfs/net/nfs_client/identifier, which will be local to the network
+namespace of the process that writes it.
The DNS resolver
diff --git a/Documentation/admin-guide/perf/alibaba_pmu.rst b/Documentation/admin-guide/perf/alibaba_pmu.rst
new file mode 100644
index 000000000000..11de998bb480
--- /dev/null
+++ b/Documentation/admin-guide/perf/alibaba_pmu.rst
@@ -0,0 +1,100 @@
+=============================================================
+Alibaba's T-Head SoC Uncore Performance Monitoring Unit (PMU)
+=============================================================
+
+The Yitian 710, custom-built by Alibaba Group's chip development business,
+T-Head, implements uncore PMU for performance and functional debugging to
+facilitate system maintenance.
+
+DDR Sub-System Driveway (DRW) PMU Driver
+=========================================
+
+Yitian 710 employs eight DDR5/4 channels, four on each die. Each DDR5 channel
+is independent of others to service system memory requests. And one DDR5
+channel is split into two independent sub-channels. The DDR Sub-System Driveway
+implements separate PMUs for each sub-channel to monitor various performance
+metrics.
+
+The Driveway PMU devices are named ali_drw_<sys_base_addr> in perf.
+For example, ali_drw_21000 and ali_drw_21080 are the two PMU devices for the
+two sub-channels of the same channel in die 0, and the PMU devices of die 1
+are prefixed with ali_drw_400XXXXX, e.g. ali_drw_40021000.
+
+Each sub-channel has 36 PMU counters in total, which are classified into
+four groups:
+
+- Group 0: PMU Cycle Counter. This group has one pair of counters,
+  pmu_cycle_cnt_low and pmu_cycle_cnt_high, which together provide the cycle
+  count based on the DDRC core clock.
+
+- Group 1: PMU Bandwidth Counters. This group has 8 counters that are used
+ to count the total access number of either the eight bank groups in a
+ selected rank, or four ranks separately in the first 4 counters. The base
+ transfer unit is 64B.
+
+- Group 2: PMU Retry Counters. This group has 10 counters, which count the
+  total number of retries for each type of uncorrectable error.
+
+- Group 3: PMU Common Counters. This group has 16 counters, which are used
+  to count the common events.
+
+For now, the Driveway PMU driver only uses counters in group 0 and group 3.
+
+The DDR Controller (DDRCTL) and DDR PHY combine to create a complete solution
+for connecting an SoC application bus to DDR memory devices. The DDRCTL
+receives transactions on the Host Interface (HIF), which is custom-defined by
+Synopsys. These transactions are queued internally and scheduled for access while
+satisfying the SDRAM protocol timing requirements, transaction priorities, and
+dependencies between the transactions. The DDRCTL in turn issues commands on
+the DDR PHY Interface (DFI) to the PHY module, which launches and captures data
+to and from the SDRAM. The driveway PMUs have hardware logic to gather
+statistics and performance logging signals on HIF, DFI, etc.
+
+By counting the READ, WRITE and RMW commands sent to the DDRC through the HIF
+interface, we could calculate the bandwidth. Example usage of counting memory
+data bandwidth::
+
+ perf stat \
+ -e ali_drw_21000/hif_wr/ \
+ -e ali_drw_21000/hif_rd/ \
+ -e ali_drw_21000/hif_rmw/ \
+ -e ali_drw_21000/cycle/ \
+ -e ali_drw_21080/hif_wr/ \
+ -e ali_drw_21080/hif_rd/ \
+ -e ali_drw_21080/hif_rmw/ \
+ -e ali_drw_21080/cycle/ \
+ -e ali_drw_23000/hif_wr/ \
+ -e ali_drw_23000/hif_rd/ \
+ -e ali_drw_23000/hif_rmw/ \
+ -e ali_drw_23000/cycle/ \
+ -e ali_drw_23080/hif_wr/ \
+ -e ali_drw_23080/hif_rd/ \
+ -e ali_drw_23080/hif_rmw/ \
+ -e ali_drw_23080/cycle/ \
+ -e ali_drw_25000/hif_wr/ \
+ -e ali_drw_25000/hif_rd/ \
+ -e ali_drw_25000/hif_rmw/ \
+ -e ali_drw_25000/cycle/ \
+ -e ali_drw_25080/hif_wr/ \
+ -e ali_drw_25080/hif_rd/ \
+ -e ali_drw_25080/hif_rmw/ \
+ -e ali_drw_25080/cycle/ \
+ -e ali_drw_27000/hif_wr/ \
+ -e ali_drw_27000/hif_rd/ \
+ -e ali_drw_27000/hif_rmw/ \
+ -e ali_drw_27000/cycle/ \
+ -e ali_drw_27080/hif_wr/ \
+ -e ali_drw_27080/hif_rd/ \
+ -e ali_drw_27080/hif_rmw/ \
+ -e ali_drw_27080/cycle/ -- sleep 10
+
+The average DRAM bandwidth can be calculated as follows:
+
+- Read Bandwidth = perf_hif_rd * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
+- Write Bandwidth = (perf_hif_wr + perf_hif_rmw) * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
+
+Here, DDRC_WIDTH = 64 bytes.
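+
+A hedged shell sketch of the read-bandwidth formula above, where the counter
+values and the DDRC frequency (``$hif_rd``, ``$cycle``, ``$ddrc_freq``) are
+placeholders taken from the perf output and the platform::
+
+	echo "scale=2; $hif_rd * 64 * $ddrc_freq / $cycle" | bc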
+
+The current driver does not support sampling, so "perf record" is
+unsupported. Attaching to a task is also unsupported, as the events are all
+uncore.
diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
new file mode 100644
index 000000000000..7e863662e2d4
--- /dev/null
+++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
@@ -0,0 +1,130 @@
+================================================
+HiSilicon PCIe Performance Monitoring Unit (PMU)
+================================================
+
+On Hip09, HiSilicon PCIe Performance Monitoring Unit (PMU) could monitor
+bandwidth, latency, bus utilization and buffer occupancy data of PCIe.
+
+Each PCIe Core has a PMU to monitor the multiple Root Ports of this PCIe Core
+and all Endpoints downstream of these Root Ports.
+
+
+HiSilicon PCIe PMU driver
+=========================
+
+The PCIe PMU driver registers a perf PMU with the name of its sicl id and PCIe
+Core id::
+
+ /sys/bus/event_source/hisi_pcie<sicl>_core<core>
+
+The PMU driver provides a description of available events and filter options
+in sysfs, see /sys/bus/event_source/devices/hisi_pcie<sicl>_core<core>.
+
+The "format" directory describes all formats of the config (events) and config1
+(filter options) fields of the perf_event_attr structure. The "events" directory
+describes all documented events shown in perf list.
+
+The "identifier" sysfs file allows users to identify the version of the
+PMU hardware device.
+
+The "bus" sysfs file allows users to get the bus number of Root Ports
+monitored by PMU.
+
+Example usage of perf::
+
+ $# perf list
+ hisi_pcie0_core0/rx_mwr_latency/ [kernel PMU event]
+ hisi_pcie0_core0/rx_mwr_cnt/ [kernel PMU event]
+ ------------------------------------------
+
+ $# perf stat -e hisi_pcie0_core0/rx_mwr_latency/
+ $# perf stat -e hisi_pcie0_core0/rx_mwr_cnt/
+ $# perf stat -g -e hisi_pcie0_core0/rx_mwr_latency/ -e hisi_pcie0_core0/rx_mwr_cnt/
+
+The current driver does not support sampling, so "perf record" is unsupported.
+Attaching to a task is also unsupported for the PCIe PMU.
+
+Filter options
+--------------
+
+1. Target filter
+
+   The PMU can only monitor the performance of traffic downstream of the
+   target Root Ports or of a downstream target Endpoint. The PCIe PMU driver
+   supports "port" and "bdf" interfaces for users, but these two interfaces
+   cannot be used at the same time.
+
+ - port
+
+     The "port" filter can be used with all PCIe PMU events. The target Root
+     Port can be selected by configuring the 16-bit bitmap "port". Multiple
+     ports can be selected for AP-layer events, and only one port can be
+     selected for TL/DL-layer events.
+
+ For example, if target Root Port is 0000:00:00.0 (x8 lanes), bit0 of
+ bitmap should be set, port=0x1; if target Root Port is 0000:00:04.0 (x4
+ lanes), bit8 is set, port=0x100; if these two Root Ports are both
+ monitored, port=0x101.
+
+ Example usage of perf::
+
+ $# perf stat -e hisi_pcie0_core0/rx_mwr_latency,port=0x1/ sleep 5
+
+ - bdf
+
+     The "bdf" filter can only be used with bandwidth events. The target
+     Endpoint is selected by configuring its BDF into "bdf", and the counter
+     only counts the bandwidth of messages requested by the target Endpoint.
+
+ For example, "bdf=0x3900" means BDF of target Endpoint is 0000:39:00.0.
+
+ Example usage of perf::
+
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,bdf=0x3900/ sleep 5
+
+2. Trigger filter
+
+   Event statistics start the first time the TLP length is greater/smaller
+   than the trigger condition. You can set the trigger condition by writing
+   "trig_len", and set the trigger mode by writing "trig_mode". This filter
+   can only be used with bandwidth events.
+
+ For example, "trig_len=4" means trigger condition is 2^4 DW, "trig_mode=0"
+ means statistics start when TLP length > trigger condition, "trig_mode=1"
+ means start when TLP length < condition.
+
+ Example usage of perf::
+
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5
+
+3. Threshold filter
+
+   The counter counts when the TLP length is within the specified range. You
+   can set the threshold by writing "thr_len", and set the threshold mode by
+   writing "thr_mode". This filter can only be used with bandwidth events.
+
+ For example, "thr_len=4" means threshold is 2^4 DW, "thr_mode=0" means
+ counter counts when TLP length >= threshold, and "thr_mode=1" means counts
+ when TLP length < threshold.
+
+ Example usage of perf::
+
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5
+
+4. TLP Length filter
+
+ When counting bandwidth, the data can be composed of certain parts of TLP
+ packets. You can specify it through "len_mode":
+
+ - 2'b00: Reserved (Do not use this since the behaviour is undefined)
+ - 2'b01: Bandwidth of TLP payloads
+ - 2'b10: Bandwidth of TLP headers
+ - 2'b11: Bandwidth of both TLP payloads and headers
+
+ For example, "len_mode=2" means only counting the bandwidth of TLP headers
+ and "len_mode=3" means the final bandwidth data is composed of both TLP
+ headers and payloads. Default value if not specified is 2'b11.
+
+ Example usage of perf::
+
+ $# perf stat -e hisi_pcie0_core0/rx_mrd_flux,len_mode=0x1/ sleep 5
diff --git a/Documentation/admin-guide/perf/hns3-pmu.rst b/Documentation/admin-guide/perf/hns3-pmu.rst
new file mode 100644
index 000000000000..75a40846d47f
--- /dev/null
+++ b/Documentation/admin-guide/perf/hns3-pmu.rst
@@ -0,0 +1,136 @@
+======================================
+HNS3 Performance Monitoring Unit (PMU)
+======================================
+
+HNS3 (HiSilicon network system 3) Performance Monitoring Unit (PMU) is an
+End Point device that collects performance statistics of the HiSilicon SoC NIC.
+On Hip09, each SICL (Super I/O cluster) has one PMU device.
+
+HNS3 PMU supports collection of performance statistics such as bandwidth,
+latency, packet rate and interrupt rate.
+
+Each HNS3 PMU supports 8 hardware events.
+
+HNS3 PMU driver
+===============
+
+The HNS3 PMU driver registers a perf PMU with the name of its sicl id::
+
+ /sys/devices/hns3_pmu_sicl_<sicl_id>
+
+The PMU driver provides descriptions of the available events, filter modes,
+format, identifier and cpumask in sysfs.
+
+The "events" directory describes the event code of all supported events
+shown in perf list.
+
+The "filtermode" directory describes the supported filter modes of each
+event.
+
+The "format" directory describes all formats of the config (events) and
+config1 (filter options) fields of the perf_event_attr structure.
+
+The "identifier" file shows version of PMU hardware device.
+
+The "bdf_min" and "bdf_max" files show the supported bdf range of each
+pmu device.
+
+The "hw_clk_freq" file shows the hardware clock frequency of each pmu
+device.
+
+Example usage of checking event code and subevent code::
+
+ $# cat /sys/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_time
+ config=0x00204
+ $# cat /sys/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_packet_num
+ config=0x10204
+
+Each performance statistic has a pair of events to get two values to
+calculate real performance data in userspace.
+
+Bits 0~15 of config (here 0x0204) are the true hardware event code. If
+two events have the same value in bits 0~15 of config, they form an event
+pair. Bit 16 of config indicates whether counter 0 or counter 1 of the
+hardware event is read.
+
+After getting the two values of an event pair in userspace, the formula to
+calculate the real performance data is::
+
+ counter 0 / counter 1
+
+Example usage of checking supported filter mode::
+
+ $# cat /sys/devices/hns3_pmu_sicl_0/filtermode/bw_ssu_rpu_byte_num
+ filter mode supported: global/port/port-tc/func/func-queue/
+
+Example usage of perf::
+
+ $# perf list
+ hns3_pmu_sicl_0/bw_ssu_rpu_byte_num/ [kernel PMU event]
+ hns3_pmu_sicl_0/bw_ssu_rpu_time/ [kernel PMU event]
+ ------------------------------------------
+
+ $# perf stat -g -e hns3_pmu_sicl_0/bw_ssu_rpu_byte_num,global=1/ -e hns3_pmu_sicl_0/bw_ssu_rpu_time,global=1/ -I 1000
+ or
+ $# perf stat -g -e hns3_pmu_sicl_0/config=0x00002,global=1/ -e hns3_pmu_sicl_0/config=0x10002,global=1/ -I 1000
+
+
+Filter modes
+--------------
+
+1. global mode
+The PMU collects performance statistics for all HNS3 PCIe functions of the
+IO die. Setting the "global" filter option to 1 enables this mode.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,global=1/ -I 1000
+
+2. port mode
+The PMU collects performance statistics of one whole physical port. The
+port id is the same as the MAC id. The "tc" filter option must be set to
+0xF in this mode; here "tc" stands for traffic class.
+
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,port=0,tc=0xF/ -I 1000
+
+3. port-tc mode
+The PMU collects performance statistics of one traffic class of a physical
+port. The port id is the same as the MAC id. The "tc" filter option must be
+set to 0 ~ 7 in this mode.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,port=0,tc=0/ -I 1000
+
+4. func mode
+The PMU collects performance statistics of one PF/VF. The function id is
+the BDF of the PF/VF; its conversion formula is::
+
+ func = (bus << 8) + (device << 3) + (function)
+
+for example::
+
+    BDF          func
+    35:00.0      0x3500
+    35:00.1      0x3501
+    35:01.0      0x3508
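+
+As a quick sanity check, the conversion can be reproduced with shell
+arithmetic (an illustrative sketch using the 35:00.1 entry from the example
+above)::
+
+  $# bus=0x35; dev=0x00; fn=0x1
+  $# printf "func=0x%x\n" $(( (bus << 8) + (dev << 3) + fn ))
+  func=0x3501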
+
+In this mode, the "queue" filter option must be set to 0xFFFF.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,bdf=0x3500,queue=0xFFFF/ -I 1000
+
+5. func-queue mode
+The PMU collects performance statistics of one queue of a PF/VF. The
+function id is the BDF of the PF/VF, and the "queue" filter option must be
+set to the exact queue id of the function.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,bdf=0x3500,queue=0/ -I 1000
+
+6. func-intr mode
+The PMU collects performance statistics of one interrupt of a PF/VF. The
+function id is the BDF of the PF/VF, and the "intr" filter option must be
+set to the exact interrupt id of the function.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x00301,bdf=0x3500,intr=0/ -I 1000
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 5a8f2529a033..9de64a40adab 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -8,6 +8,8 @@ Performance monitor support
:maxdepth: 1
hisi-pmu
+ hisi-pcie-pmu
+ hns3-pmu
imx-ddr
qcom_l2_pmu
qcom_l3_pmu
@@ -16,3 +18,6 @@ Performance monitor support
xgene-pmu
arm_dsu_pmu
thunderx2-pmu
+ alibaba_pmu
+ nvidia-pmu
+ meson-ddr-pmu
diff --git a/Documentation/admin-guide/perf/meson-ddr-pmu.rst b/Documentation/admin-guide/perf/meson-ddr-pmu.rst
new file mode 100644
index 000000000000..8e71be1d6346
--- /dev/null
+++ b/Documentation/admin-guide/perf/meson-ddr-pmu.rst
@@ -0,0 +1,70 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================================================
+Amlogic SoC DDR Bandwidth Performance Monitoring Unit (PMU)
+===========================================================
+
+The Amlogic Meson G12 SoC contains a bandwidth monitor inside the DRAM
+controller. The monitor includes 4 channels. Each channel can count the
+requests accessing DRAM, and can monitor up to 3 AXI ports simultaneously.
+It can be helpful in showing whether the performance bottleneck is DDR
+bandwidth.
+
+Currently, this driver supports the following 5 perf events:
+
++ meson_ddr_bw/total_rw_bytes/
++ meson_ddr_bw/chan_1_rw_bytes/
++ meson_ddr_bw/chan_2_rw_bytes/
++ meson_ddr_bw/chan_3_rw_bytes/
++ meson_ddr_bw/chan_4_rw_bytes/
+
+meson_ddr_bw/chan_{1,2,3,4}_rw_bytes/ events are channel-specific events.
+Each channel supports filtering, which lets the channel monitor an
+individual IP module in the SoC.
+
+Below are DDR access request event filter keywords:
+
++ arm - from CPU
++ vpu_read1 - from OSD + VPP read
++ gpu - from 3D GPU
++ pcie - from PCIe controller
++ hdcp - from HDCP controller
++ hevc_front - from HEVC codec front end
++ usb3_0 - from USB3.0 controller
++ hevc_back - from HEVC codec back end
++ h265enc - from HEVC encoder
++ vpu_read2 - from DI read
++ vpu_write1 - from VDIN write
++ vpu_write2 - from di write
++ vdec - from legacy codec video decoder
++ hcodec - from H264 encoder
++ ge2d - from ge2d
++ spicc1 - from SPI controller 1
++ usb0 - from USB2.0 controller 0
++ dma - from system DMA controller 1
++ arb0 - from arb0
++ sd_emmc_b - from SD eMMC b controller
++ usb1 - from USB2.0 controller 1
++ audio - from Audio module
++ sd_emmc_c - from SD eMMC c controller
++ spicc2 - from SPI controller 2
++ ethernet - from Ethernet controller
+
+
+Examples:
+
+  + Show the total DDR bandwidth per second:
+
+ .. code-block:: bash
+
+ perf stat -a -e meson_ddr_bw/total_rw_bytes/ -I 1000 sleep 10
+
+
+  + Show the individual DDR bandwidth from the CPU and GPU respectively, as
+    well as the sum of the two:
+
+ .. code-block:: bash
+
+ perf stat -a -e meson_ddr_bw/chan_1_rw_bytes,arm=1/ -I 1000 sleep 10
+ perf stat -a -e meson_ddr_bw/chan_2_rw_bytes,gpu=1/ -I 1000 sleep 10
+ perf stat -a -e meson_ddr_bw/chan_3_rw_bytes,arm=1,gpu=1/ -I 1000 sleep 10
+
diff --git a/Documentation/admin-guide/perf/nvidia-pmu.rst b/Documentation/admin-guide/perf/nvidia-pmu.rst
new file mode 100644
index 000000000000..2e0d47cfe7ea
--- /dev/null
+++ b/Documentation/admin-guide/perf/nvidia-pmu.rst
@@ -0,0 +1,299 @@
+=========================================================
+NVIDIA Tegra SoC Uncore Performance Monitoring Unit (PMU)
+=========================================================
+
+The NVIDIA Tegra SoC includes various system PMUs to measure key performance
+metrics like memory bandwidth, latency, and utilization:
+
+* Scalable Coherency Fabric (SCF)
+* NVLink-C2C0
+* NVLink-C2C1
+* CNVLink
+* PCIE
+
+PMU Driver
+----------
+
+The PMUs in this document are based on the ARM CoreSight PMU Architecture as
+described in the document ARM IHI 0091. Since this is a standard architecture,
+the PMUs are managed by a common driver, "arm-cs-arch-pmu". This driver
+describes the available events and configuration of each PMU in sysfs. Please
+see the sections below to get the sysfs path of each PMU. Like other uncore
+PMU drivers, the driver provides a "cpumask" sysfs attribute to show the CPU
+id used to handle the PMU event. There is also an "associated_cpus" sysfs
+attribute, which contains a list of CPUs associated with the PMU instance.
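+
+For example, the attributes of the SCF PMU on socket 0 (described in the next
+section) could be inspected as follows; the output is system dependent, so
+none is shown here::
+
+  cat /sys/bus/event_sources/devices/nvidia_scf_pmu_0/cpumask
+  cat /sys/bus/event_sources/devices/nvidia_scf_pmu_0/associated_cpus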
+
+.. _SCF_PMU_Section:
+
+SCF PMU
+-------
+
+The SCF PMU monitors system level cache events, CPU traffic, and
+strongly-ordered (SO) PCIE write traffic to local/remote memory. Please see
+:ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section` for more info about the PMU
+traffic coverage.
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_sources/devices/nvidia_scf_pmu_<socket-id>.
+
+Example usage:
+
+* Count event id 0x0 in socket 0::
+
+ perf stat -a -e nvidia_scf_pmu_0/event=0x0/
+
+* Count event id 0x0 in socket 1::
+
+ perf stat -a -e nvidia_scf_pmu_1/event=0x0/
+
+NVLink-C2C0 PMU
+--------------------
+
+The NVLink-C2C0 PMU monitors incoming traffic from a GPU/CPU connected with
+NVLink-C2C (Chip-2-Chip) interconnect. The type of traffic captured by this PMU
+varies depending on the chip configuration:
+
+* NVIDIA Grace Hopper Superchip: Hopper GPU is connected with Grace SoC.
+
+ In this config, the PMU captures GPU ATS translated or EGM traffic from the GPU.
+
+* NVIDIA Grace CPU Superchip: two Grace CPU SoCs are connected.
+
+ In this config, the PMU captures read and relaxed ordered (RO) writes from
+ PCIE device of the remote SoC.
+
+Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section` for more info about
+the PMU traffic coverage.
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_sources/devices/nvidia_nvlink_c2c0_pmu_<socket-id>.
+
+Example usage:
+
+* Count event id 0x0 from the GPU/CPU connected with socket 0::
+
+ perf stat -a -e nvidia_nvlink_c2c0_pmu_0/event=0x0/
+
+* Count event id 0x0 from the GPU/CPU connected with socket 1::
+
+ perf stat -a -e nvidia_nvlink_c2c0_pmu_1/event=0x0/
+
+* Count event id 0x0 from the GPU/CPU connected with socket 2::
+
+ perf stat -a -e nvidia_nvlink_c2c0_pmu_2/event=0x0/
+
+* Count event id 0x0 from the GPU/CPU connected with socket 3::
+
+ perf stat -a -e nvidia_nvlink_c2c0_pmu_3/event=0x0/
+
+NVLink-C2C1 PMU
+-------------------
+
+The NVLink-C2C1 PMU monitors incoming traffic from a GPU connected with
+NVLink-C2C (Chip-2-Chip) interconnect. This PMU captures untranslated GPU
+traffic, in contrast with the NVLink-C2C0 PMU, which captures ATS translated
+traffic.
+Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section` for more info about
+the PMU traffic coverage.
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_sources/devices/nvidia_nvlink_c2c1_pmu_<socket-id>.
+
+Example usage:
+
+* Count event id 0x0 from the GPU connected with socket 0::
+
+ perf stat -a -e nvidia_nvlink_c2c1_pmu_0/event=0x0/
+
+* Count event id 0x0 from the GPU connected with socket 1::
+
+ perf stat -a -e nvidia_nvlink_c2c1_pmu_1/event=0x0/
+
+* Count event id 0x0 from the GPU connected with socket 2::
+
+ perf stat -a -e nvidia_nvlink_c2c1_pmu_2/event=0x0/
+
+* Count event id 0x0 from the GPU connected with socket 3::
+
+ perf stat -a -e nvidia_nvlink_c2c1_pmu_3/event=0x0/
+
+CNVLink PMU
+---------------
+
+The CNVLink PMU monitors traffic from GPU and PCIE device on remote sockets
+to local memory. For PCIE traffic, this PMU captures read and relaxed ordered
+(RO) write traffic. Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section`
+for more info about the PMU traffic coverage.
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_sources/devices/nvidia_cnvlink_pmu_<socket-id>.
+
+Each SoC socket can be connected to one or more sockets via CNVLink. The
+user can use the "rem_socket" bitmap parameter to select the remote socket(s)
+to monitor. Each bit represents a socket number, e.g. "rem_socket=0xE"
+corresponds to sockets 1 to 3.
+/sys/bus/event_sources/devices/nvidia_cnvlink_pmu_<socket-id>/format/rem_socket
+shows the valid bits that can be set in the "rem_socket" parameter.
+
+The PMU cannot distinguish the initiator of remote traffic, therefore it does
+not provide a filter to select the traffic source to monitor. It reports
+combined traffic from remote GPU and PCIE devices.
+
+Example usage:
+
+* Count event id 0x0 for the traffic from remote socket 1, 2, and 3 to socket 0::
+
+ perf stat -a -e nvidia_cnvlink_pmu_0/event=0x0,rem_socket=0xE/
+
+* Count event id 0x0 for the traffic from remote socket 0, 2, and 3 to socket 1::
+
+ perf stat -a -e nvidia_cnvlink_pmu_1/event=0x0,rem_socket=0xD/
+
+* Count event id 0x0 for the traffic from remote socket 0, 1, and 3 to socket 2::
+
+ perf stat -a -e nvidia_cnvlink_pmu_2/event=0x0,rem_socket=0xB/
+
+* Count event id 0x0 for the traffic from remote socket 0, 1, and 2 to socket 3::
+
+ perf stat -a -e nvidia_cnvlink_pmu_3/event=0x0,rem_socket=0x7/
+
+
+PCIE PMU
+------------
+
+The PCIE PMU monitors all read/write traffic from PCIE root ports to
+local/remote memory. Please see :ref:`NVIDIA_Uncore_PMU_Traffic_Coverage_Section`
+for more info about the PMU traffic coverage.
+
+The events and configuration options of this PMU device are described in sysfs,
+see /sys/bus/event_sources/devices/nvidia_pcie_pmu_<socket-id>.
+
+Each SoC socket can support multiple root ports. The user can use the
+"root_port" bitmap parameter to select the port(s) to monitor, e.g.
+"root_port=0xF" corresponds to root ports 0 to 3.
+/sys/bus/event_sources/devices/nvidia_pcie_pmu_<socket-id>/format/root_port
+shows the valid bits that can be set in the "root_port" parameter.
+
+Example usage:
+
+* Count event id 0x0 from root port 0 and 1 of socket 0::
+
+ perf stat -a -e nvidia_pcie_pmu_0/event=0x0,root_port=0x3/
+
+* Count event id 0x0 from root port 0 and 1 of socket 1::
+
+ perf stat -a -e nvidia_pcie_pmu_1/event=0x0,root_port=0x3/
+
+.. _NVIDIA_Uncore_PMU_Traffic_Coverage_Section:
+
+Traffic Coverage
+----------------
+
+The PMU traffic coverage may vary depending on the chip configuration:
+
+* **NVIDIA Grace Hopper Superchip**: Hopper GPU is connected with Grace SoC.
+
+ Example configuration with two Grace SoCs::
+
+ ********************************* *********************************
+ * SOCKET-A * * SOCKET-B *
+ * * * *
+ * :::::::: * * :::::::: *
+ * : PCIE : * * : PCIE : *
+ * :::::::: * * :::::::: *
+ * | * * | *
+ * | * * | *
+ * ::::::: ::::::::: * * ::::::::: ::::::: *
+ * : : : : * * : : : : *
+ * : GPU :<--NVLink-->: Grace :<---CNVLink--->: Grace :<--NVLink-->: GPU : *
+ * : : C2C : SoC : * * : SoC : C2C : : *
+ * ::::::: ::::::::: * * ::::::::: ::::::: *
+ * | | * * | | *
+ * | | * * | | *
+ * &&&&&&&& &&&&&&&& * * &&&&&&&& &&&&&&&& *
+ * & GMEM & & CMEM & * * & CMEM & & GMEM & *
+ * &&&&&&&& &&&&&&&& * * &&&&&&&& &&&&&&&& *
+ * * * *
+ ********************************* *********************************
+
+ GMEM = GPU Memory (e.g. HBM)
+ CMEM = CPU Memory (e.g. LPDDR5X)
+
+ |
+  | The following table shows the traffic coverage of the Grace SoC PMU in socket-A:
+
+ ::
+
+ +--------------+-------+-----------+-----------+-----+----------+----------+
+ | | Source |
+ + +-------+-----------+-----------+-----+----------+----------+
+ | Destination | |GPU ATS |GPU Not-ATS| | Socket-B | Socket-B |
+ | |PCI R/W|Translated,|Translated | CPU | CPU/PCIE1| GPU/PCIE2|
+ | | |EGM | | | | |
+ +==============+=======+===========+===========+=====+==========+==========+
+ | Local | PCIE |NVLink-C2C0|NVLink-C2C1| SCF | SCF PMU | CNVLink |
+ | SYSRAM/CMEM | PMU |PMU |PMU | PMU | | PMU |
+ +--------------+-------+-----------+-----------+-----+----------+----------+
+ | Local GMEM | PCIE | N/A |NVLink-C2C1| SCF | SCF PMU | CNVLink |
+ | | PMU | |PMU | PMU | | PMU |
+ +--------------+-------+-----------+-----------+-----+----------+----------+
+ | Remote | PCIE |NVLink-C2C0|NVLink-C2C1| SCF | | |
+ | SYSRAM/CMEM | PMU |PMU |PMU | PMU | N/A | N/A |
+ | over CNVLink | | | | | | |
+ +--------------+-------+-----------+-----------+-----+----------+----------+
+ | Remote GMEM | PCIE |NVLink-C2C0|NVLink-C2C1| SCF | | |
+ | over CNVLink | PMU |PMU |PMU | PMU | N/A | N/A |
+ +--------------+-------+-----------+-----------+-----+----------+----------+
+
+ PCIE1 traffic represents strongly ordered (SO) writes.
+ PCIE2 traffic represents reads and relaxed ordered (RO) writes.
+
+* **NVIDIA Grace CPU Superchip**: two Grace CPU SoCs are connected.
+
+ Example configuration with two Grace SoCs::
+
+ ******************* *******************
+ * SOCKET-A * * SOCKET-B *
+ * * * *
+ * :::::::: * * :::::::: *
+ * : PCIE : * * : PCIE : *
+ * :::::::: * * :::::::: *
+ * | * * | *
+ * | * * | *
+ * ::::::::: * * ::::::::: *
+ * : : * * : : *
+ * : Grace :<--------NVLink------->: Grace : *
+ * : SoC : * C2C * : SoC : *
+ * ::::::::: * * ::::::::: *
+ * | * * | *
+ * | * * | *
+ * &&&&&&&& * * &&&&&&&& *
+ * & CMEM & * * & CMEM & *
+ * &&&&&&&& * * &&&&&&&& *
+ * * * *
+ ******************* *******************
+
+ GMEM = GPU Memory (e.g. HBM)
+ CMEM = CPU Memory (e.g. LPDDR5X)
+
+ |
+  | The following table shows the traffic coverage of the Grace SoC PMU in socket-A:
+
+ ::
+
+ +-----------------+-----------+---------+----------+-------------+
+ | | Source |
+ + +-----------+---------+----------+-------------+
+ | Destination | | | Socket-B | Socket-B |
+ | | PCI R/W | CPU | CPU/PCIE1| PCIE2 |
+ | | | | | |
+ +=================+===========+=========+==========+=============+
+ | Local | PCIE PMU | SCF PMU | SCF PMU | NVLink-C2C0 |
+ | SYSRAM/CMEM | | | | PMU |
+ +-----------------+-----------+---------+----------+-------------+
+ | Remote | | | | |
+ | SYSRAM/CMEM | PCIE PMU | SCF PMU | N/A | N/A |
+ | over NVLink-C2C | | | | |
+ +-----------------+-----------+---------+----------+-------------+
+
+ PCIE1 traffic represents strongly ordered (SO) writes.
+ PCIE2 traffic represents reads and relaxed ordered (RO) writes.
diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
new file mode 100644
index 000000000000..1cf40f69278c
--- /dev/null
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -0,0 +1,720 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===============================================
+``amd-pstate`` CPU Performance Scaling Driver
+===============================================
+
+:Copyright: |copy| 2021 Advanced Micro Devices, Inc.
+
+:Author: Huang Rui <ray.huang@amd.com>
+
+
+Introduction
+===================
+
+``amd-pstate`` is the AMD CPU performance scaling driver that introduces a
+new CPU frequency control mechanism on modern AMD APU and CPU series in
+the Linux kernel. The new mechanism is based on Collaborative Processor
+Performance Control (CPPC), which provides finer-grained frequency management
+than legacy ACPI hardware P-States. Current AMD CPU/APU platforms use the
+ACPI P-states driver to manage CPU frequency and clocks, switching among
+only 3 P-states. CPPC replaces the ACPI P-states controls and allows a
+flexible, low-latency interface for the Linux kernel to directly
+communicate performance hints to the hardware.
+
+``amd-pstate`` leverages the Linux kernel governors such as ``schedutil``,
+``ondemand``, etc. to manage the performance hints which are provided by
+CPPC hardware functionality that internally follows the hardware
+specification (for details refer to AMD64 Architecture Programmer's Manual
+Volume 2: System Programming [1]_). Currently, ``amd-pstate`` supports basic
+frequency control function according to kernel governors on some of the
+Zen2 and Zen3 processors, and we will implement more AMD specific functions
+in future after we verify them on the hardware and SBIOS.
+
+
+AMD CPPC Overview
+=======================
+
+The Collaborative Processor Performance Control (CPPC) interface enumerates
+a continuous, abstract, and unit-less performance value on a scale that is
+not tied to a specific performance state / frequency. It is an ACPI
+standard [2]_ through which software can specify application performance
+goals and hints as a relative target to the infrastructure limits. AMD
+processors provide a low-latency register model (MSR) instead of an AML
+code interpreter for performance adjustments. ``amd-pstate`` will initialize
+a ``struct cpufreq_driver`` instance, ``amd_pstate_driver``, with the
+callbacks to manage each performance update behavior. ::
+
+ Highest Perf ------>+-----------------------+ +-----------------------+
+ | | | |
+ | | | |
+ | | Max Perf ---->| |
+ | | | |
+ | | | |
+ Nominal Perf ------>+-----------------------+ +-----------------------+
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | Desired Perf ---->| |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ | | | |
+ Lowest non- | | | |
+ linear perf ------>+-----------------------+ +-----------------------+
+ | | | |
+ | | Lowest perf ---->| |
+ | | | |
+ Lowest perf ------>+-----------------------+ +-----------------------+
+ | | | |
+ | | | |
+ | | | |
+ 0 ------>+-----------------------+ +-----------------------+
+
+ AMD P-States Performance Scale
+
+
+.. _perf_cap:
+
+AMD CPPC Performance Capability
+--------------------------------
+
+Highest Performance (RO)
+.........................
+
+This is the absolute maximum performance an individual processor may reach,
+assuming ideal conditions. This performance level may not be sustainable
+for long durations and may only be achievable if other platform components
+are in a specific state; for example, it may require other processors to be in
+an idle state. This would be equivalent to the highest frequencies
+supported by the processor.
+
+Nominal (Guaranteed) Performance (RO)
+......................................
+
+This is the maximum sustained performance level of the processor, assuming
+ideal operating conditions. In the absence of an external constraint (power,
+thermal, etc.), this is the performance level the processor is expected to
+be able to maintain continuously. All cores/processors are expected to be
+able to sustain their nominal performance state simultaneously.
+
+Lowest non-linear Performance (RO)
+...................................
+
+This is the lowest performance level at which nonlinear power savings are
+achieved, for example, due to the combined effects of voltage and frequency
+scaling. Above this threshold, lower performance levels should be generally
+more energy efficient than higher performance levels. This register
+effectively conveys the most efficient performance level to ``amd-pstate``.
+
+Lowest Performance (RO)
+........................
+
+This is the absolute lowest performance level of the processor. Selecting a
+performance level lower than the lowest nonlinear performance level may
+cause an efficiency penalty but should reduce the instantaneous power
+consumption of the processor.
+
+AMD CPPC Performance Control
+------------------------------
+
+``amd-pstate`` passes performance goals through these registers. The
+registers drive the behavior of the desired performance target.
+
+Minimum requested performance (RW)
+...................................
+
+``amd-pstate`` specifies the minimum allowed performance level.
+
+Maximum requested performance (RW)
+...................................
+
+``amd-pstate`` specifies a limit on the maximum performance that is expected
+to be supplied by the hardware.
+
+Desired performance target (RW)
+...................................
+
+``amd-pstate`` specifies a desired target in the CPPC performance scale as
+a relative number. This can be expressed as a percentage of nominal
+performance (infrastructure max). Below the nominal sustained performance
+level, desired performance expresses the average performance level of the
+processor subject to hardware limits. Above the nominal performance level,
+the processor must provide at least the nominal performance requested and
+go higher if current operating conditions allow.
+
+Energy Performance Preference (EPP) (RW)
+.........................................
+
+This attribute provides a hint to the hardware indicating whether software
+wants to bias toward performance (0x0) or energy efficiency (0xff).
+
+
+Key Governors Support
+=======================
+
+``amd-pstate`` can be used with all the (generic) scaling governors listed
+by the ``scaling_available_governors`` policy attribute in ``sysfs``. Then,
+it is responsible for the configuration of policy objects corresponding to
+CPUs and provides the ``CPUFreq`` core (and the scaling governors attached
+to the policy objects) with accurate information on the maximum and minimum
+operating frequencies supported by the hardware. Users can check the
+``scaling_cur_freq`` information, which comes from the ``CPUFreq`` core.
+
+``amd-pstate`` mainly supports ``schedutil`` and ``ondemand`` for dynamic
+frequency control. The ``schedutil`` governor, together with the CPU CFS
+scheduler, is used to fine-tune the processor configuration on
+``amd-pstate``. ``amd-pstate`` registers the adjust_perf callback to
+implement performance update behavior similar to CPPC. The callback is
+initialized by ``sugov_start``, which populates the CPU's update_util_data
+pointer to assign ``sugov_update_single_perf`` as the utilization update
+callback function in the CPU scheduler. The CPU scheduler will call
+``cpufreq_update_util`` and assign the target performance according to the
+``struct sugov_cpu`` that the utilization update belongs to. Then,
+``amd-pstate`` updates the desired performance according to the target
+performance assigned by the CPU scheduler.
+
+.. _processor_support:
+
+Processor Support
+=======================
+
+The ``amd-pstate`` initialization will fail if the ``_CPC`` entry in the ACPI
+SBIOS does not exist for the detected processor. It uses ``acpi_cpc_valid``
+to check the existence of ``_CPC``. All Zen based processors support the legacy
+ACPI hardware P-States function, so when ``amd-pstate`` fails initialization,
+the kernel will fall back to initialize the ``acpi-cpufreq`` driver.
+
+There are two types of hardware implementations for ``amd-pstate``: one is
+`Full MSR Support <perf_cap_>`_ and another is `Shared Memory Support
+<perf_cap_>`_. It can use the :c:macro:`X86_FEATURE_CPPC` feature flag to
+indicate the different types. (For details, refer to the Processor Programming
+Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors [3]_.)
+``amd-pstate`` registers different ``static_call`` instances for the
+different hardware implementations.
+
+Currently, some of the Zen2 and Zen3 processors support ``amd-pstate``. In the
+future, it will be supported on more and more AMD processors.
+
+Full MSR Support
+-----------------
+
+Some new Zen3 processors such as Cezanne provide the MSR registers directly
+while the :c:macro:`X86_FEATURE_CPPC` CPU feature flag is set.
+``amd-pstate`` can handle the MSR register to implement the fast switch
+function in ``CPUFreq`` that can reduce the latency of frequency control in
+interrupt context. The functions with a ``pstate_xxx`` prefix represent the
+operations on MSR registers.
+
+Shared Memory Support
+----------------------
+
+If the :c:macro:`X86_FEATURE_CPPC` CPU feature flag is not set, the
+processor supports the shared memory solution. In this case, ``amd-pstate``
+uses the ``cppc_acpi`` helper methods to implement the callback functions
+that are defined on ``static_call``. The functions with the ``cppc_xxx`` prefix
+represent the operations of ACPI CPPC helpers for the shared memory solution.
+
+
+A processor can support both AMD P-States and ACPI hardware P-States at the
+same time, but AMD P-States has the higher priority: if it is enabled with
+:c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, the processor will
+respond to requests from AMD P-States.
+
+
+User Space Interface in ``sysfs`` - Per-policy control
+======================================================
+
+``amd-pstate`` exposes several attributes (files) in ``sysfs`` to control
+its functionality per CPU policy. They are located in the
+``/sys/devices/system/cpu/cpufreq/policyX/`` directory of each policy. ::
+
+ root@hr-test1:/home/ray# ls /sys/devices/system/cpu/cpufreq/policy0/*amd*
+ /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_highest_perf
+ /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_lowest_nonlinear_freq
+ /sys/devices/system/cpu/cpufreq/policy0/amd_pstate_max_freq
+
+
+``amd_pstate_highest_perf / amd_pstate_max_freq``
+
+Maximum CPPC performance and CPU frequency that the driver is allowed to
+set, in percent of the maximum supported CPPC performance level (the highest
+performance supported in `AMD CPPC Performance Capability <perf_cap_>`_).
+In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
+table, so we need to expose it to sysfs. If boost is not active, but
+still supported, this maximum frequency will be larger than the one in
+``cpuinfo``.
+This attribute is read-only.
+
+``amd_pstate_lowest_nonlinear_freq``
+
+The lowest non-linear CPPC CPU frequency that the driver is allowed to set,
+in percent of the maximum supported CPPC performance level. (Please see the
+lowest non-linear performance in `AMD CPPC Performance Capability
+<perf_cap_>`_.)
+This attribute is read-only.
+
+``energy_performance_available_preferences``
+
+A list of all the supported EPP preferences that could be used for
+``energy_performance_preference`` on this system.
+These profiles represent different hints that are provided to the low-level
+firmware about the user's desired energy vs performance tradeoff.
+``default`` means the EPP value is set by the platform firmware. This
+attribute is read-only.
+
+``energy_performance_preference``
+
+The current energy performance preference can be read from this attribute,
+and the user can change the current preference according to energy or
+performance needs. Please get the list of all supported profiles from the
+``energy_performance_available_preferences`` attribute. All the profiles are
+integer values defined between 0 and 255 when the EPP feature is enabled by
+platform firmware; if the EPP feature is disabled, the driver will ignore
+the written value.
+This attribute is read-write.
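+
+A hedged example of reading and setting the preference (``policy0`` and the
+profile name are illustrative; check the available preferences on the target
+system first)::
+
+  $ cat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_available_preferences
+  $ echo power | sudo tee /sys/devices/system/cpu/cpufreq/policy0/energy_performance_preference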
+
+Other performance and frequency values can be read back from
+``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`.
+
+
+``amd-pstate`` vs ``acpi-cpufreq``
+======================================
+
+On the majority of AMD platforms supported by ``acpi-cpufreq``, the ACPI
+tables provided by the platform firmware are used for CPU performance
+scaling, but they only provide 3 P-states on AMD processors. However, on
+modern AMD APU and CPU series, the hardware provides Collaborative Processor
+Performance Control according to the ACPI protocol, customized for AMD
+platforms: fine-grained and continuous frequency ranges instead of the
+legacy hardware P-states. ``amd-pstate`` is the kernel module which supports
+the new AMD P-States mechanism on most future AMD platforms. The AMD
+P-States mechanism provides better performance and energy efficiency in
+frequency management on AMD processors.
+
+
+AMD Pstate Driver Operation Modes
+=================================
+
+``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode,
+non-autonomous (passive) mode and guided autonomous (guided) mode.
+The active, passive, or guided mode is selected by passing the
+corresponding kernel command line parameter, as described below.
+
+- In autonomous mode, the platform ignores the desired performance level
+  request and takes into account only the values set in the minimum, maximum
+  and energy performance preference registers.
+- In non-autonomous mode, the platform gets the desired performance level
+  from the OS directly through the Desired Performance Register.
+- In guided-autonomous mode, the platform sets the operating performance
+  level autonomously according to the current workload and within the limits
+  set by the OS through the min and max performance registers.
+
+Active Mode
+------------
+
+``amd_pstate=active``
+
+This is the low-level firmware control mode, implemented by the
+``amd_pstate_epp`` driver when ``amd_pstate=active`` is passed to the kernel
+on the command line. In this mode, the ``amd_pstate_epp`` driver provides a
+hint to the CPPC firmware indicating whether software wants to bias toward
+performance (0x0) or energy efficiency (0xff). The CPPC power algorithm then
+calculates the runtime workload and adjusts the realtime core frequency
+according to the power supply, thermal, core voltage and some other hardware
+conditions.
+
+Passive Mode
+------------
+
+``amd_pstate=passive``
+
+This mode is enabled when ``amd_pstate=passive`` is passed to the kernel on
+the command line. In this mode, the ``amd_pstate`` driver software specifies
+a desired QoS target in the CPPC performance scale as a relative number. This
+can be expressed as a percentage of nominal performance (infrastructure max).
+Below the nominal sustained performance level, desired performance expresses
+the average performance level of the processor subject to the Performance
+Reduction Tolerance register. Above the nominal performance level, the
+processor must provide at least the nominal performance requested and go
+higher if current operating conditions allow.
+
+Guided Mode
+-----------
+
+``amd_pstate=guided``
+
+If ``amd_pstate=guided`` is passed on the kernel command line, this mode is
+activated. In this mode, the driver requests minimum and maximum performance
+levels, and the platform autonomously selects a performance level within this
+range that is appropriate to the current workload.
+
+User Space Interface in ``sysfs`` - General
+===========================================
+
+Global Attributes
+-----------------
+
+``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to
+control its functionality at the system level. They are located in the
+``/sys/devices/system/cpu/amd-pstate/`` directory and affect all CPUs.
+
+``status``
+        Operation mode of the driver: "active", "passive", "guided" or "disable".
+
+ "active"
+ The driver is functional and in the ``active mode``
+
+ "passive"
+ The driver is functional and in the ``passive mode``
+
+ "guided"
+ The driver is functional and in the ``guided mode``
+
+ "disable"
+ The driver is unregistered and not functional now.
+
+        This attribute can be written to in order to change the driver's
+        operation mode or to unregister it. The string written to it must be
+        one of the possible values of this attribute and, if successful,
+        writing one of these values to the sysfs file will cause the driver
+        to switch over to the operation mode represented by that string - or
+        to be unregistered in the "disable" case.
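+
+        For instance, a mode switch could be driven from the shell (a sketch;
+        it requires root, and "passive" is just an example target mode)::
+
+            $ cat /sys/devices/system/cpu/amd-pstate/status
+            $ echo passive | sudo tee /sys/devices/system/cpu/amd-pstate/status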
+
+``cpupower`` tool support for ``amd-pstate``
+===============================================
+
+``amd-pstate`` is supported by the ``cpupower`` tool, which can be used to dump
+frequency information. Development is in progress to support more and more
+operations for the new ``amd-pstate`` module with this tool. ::
+
+ root@hr-test1:/home/ray# cpupower frequency-info
+ analyzing CPU 0:
+ driver: amd-pstate
+ CPUs which run at the same hardware frequency: 0
+ CPUs which need to have their frequency coordinated by software: 0
+ maximum transition latency: 131 us
+ hardware limits: 400 MHz - 4.68 GHz
+ available cpufreq governors: ondemand conservative powersave userspace performance schedutil
+ current policy: frequency should be within 400 MHz and 4.68 GHz.
+ The governor "schedutil" may decide which speed to use
+ within this range.
+ current CPU frequency: Unable to call hardware
+ current CPU frequency: 4.02 GHz (asserted by call to kernel)
+ boost state support:
+ Supported: yes
+ Active: yes
+ AMD PSTATE Highest Performance: 166. Maximum Frequency: 4.68 GHz.
+ AMD PSTATE Nominal Performance: 117. Nominal Frequency: 3.30 GHz.
+ AMD PSTATE Lowest Non-linear Performance: 39. Lowest Non-linear Frequency: 1.10 GHz.
+ AMD PSTATE Lowest Performance: 15. Lowest Frequency: 400 MHz.
+
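+The performance numbers and frequencies above scale together: dividing a
+performance level by the highest performance level and multiplying by the
+maximum frequency approximates that level's frequency. A worked check of the
+nominal figures from this output (a sketch; the exact mapping is defined by
+the hardware)::
+
+  $ awk 'BEGIN { printf "%.2f GHz\n", 117 / 166 * 4.68 }'
+  3.30 GHz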
+
+Diagnostics and Tuning
+=======================
+
+Trace Events
+--------------
+
+There are two static trace events that can be used for ``amd-pstate``
+diagnostics. One of them is the ``cpu_frequency`` trace event generally used
+by ``CPUFreq``, and the other one is the ``amd_pstate_perf`` trace event
+specific to ``amd-pstate``. The following sequence of shell commands can
+be used to enable them and see their output (if the kernel is
+configured to support event tracing). ::
+
+ root@hr-test1:/home/ray# cd /sys/kernel/tracing/
+ root@hr-test1:/sys/kernel/tracing# echo 1 > events/amd_cpu/enable
+ root@hr-test1:/sys/kernel/tracing# cat trace
+ # tracer: nop
+ #
+ # entries-in-buffer/entries-written: 47827/42233061 #P:2
+ #
+ # _-----=> irqs-off
+ # / _----=> need-resched
+ # | / _---=> hardirq/softirq
+ # || / _--=> preempt-depth
+ # ||| / delay
+ # TASK-PID CPU# |||| TIMESTAMP FUNCTION
+ # | | | |||| | |
+ <idle>-0 [015] dN... 4995.979886: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=15 changed=false fast_switch=true
+ <idle>-0 [007] d.h.. 4995.979893: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=7 changed=false fast_switch=true
+ cat-2161 [000] d.... 4995.980841: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=0 changed=false fast_switch=true
+ sshd-2125 [004] d.s.. 4995.980968: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=4 changed=false fast_switch=true
+ <idle>-0 [007] d.s.. 4995.980968: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=7 changed=false fast_switch=true
+ <idle>-0 [003] d.s.. 4995.980971: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=3 changed=false fast_switch=true
+ <idle>-0 [011] d.s.. 4995.980996: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=11 changed=false fast_switch=true
+
+The ``cpu_frequency`` trace event will be triggered either by the ``schedutil`` scaling
+governor (for the policies it is attached to), or by the ``CPUFreq`` core (for the
+policies with other scaling governors).
+
+
+Tracer Tool
+-------------
+
+``amd_pstate_tracer.py`` can record and parse the ``amd-pstate`` trace log,
+then generate performance plots. This utility can be used to debug and tune
+the performance of the ``amd-pstate`` driver. The tracer tool needs to import
+the intel pstate tracer.
+
+The tracer tool is located in ``linux/tools/power/x86/amd_pstate_tracer``. It
+can be used in two ways. If a trace file is available, parse the file
+directly with the command ::
+
+ ./amd_pstate_trace.py [-c cpus] -t <trace_file> -n <test_name>
+
+Or generate the trace file with root privilege, then parse and plot with
+the command ::
+
+ sudo ./amd_pstate_trace.py [-c cpus] -n <test_name> -i <interval> [-m kbytes]
+
+The test results can be found in ``results/test_name``. The following is an
+example of part of the output. ::
+
+ common_cpu common_secs common_usecs min_perf des_perf max_perf freq mperf apef tsc load duration_ms sample_num elapsed_time common_comm
+ CPU_005 712 116384 39 49 166 0.7565 9645075 2214891 38431470 25.1 11.646 469 2.496 kworker/5:0-40
+ CPU_006 712 116408 39 49 166 0.6769 8950227 1839034 37192089 24.06 11.272 470 2.496 kworker/6:0-1264
+
+Unit Tests for amd-pstate
+-------------------------
+
+``amd-pstate-ut`` is a test module for testing the ``amd-pstate`` driver.
+
+ * It can help all users to verify their processor support (SBIOS/Firmware or Hardware).
+
+ * The kernel can include a basic function test to avoid kernel regressions during updates.
+
+ * More functional or performance tests can be introduced to align the results; this will benefit power and performance scale optimization.
+
+1. Test case descriptions
+
+ 1). Basic tests
+
+   Test the prerequisites and basic functions of the ``amd-pstate`` driver.
+
+ +---------+--------------------------------+------------------------------------------------------------------------------------+
+ | Index | Functions | Description |
+ +=========+================================+====================================================================================+
+ | 1 | amd_pstate_ut_acpi_cpc_valid || Check whether the _CPC object is present in SBIOS. |
+ | | || |
+ | | || The detail refer to `Processor Support <processor_support_>`_. |
+ +---------+--------------------------------+------------------------------------------------------------------------------------+
+ | 2 | amd_pstate_ut_check_enabled || Check whether AMD P-State is enabled. |
+ | | || |
+ | | || AMD P-States and ACPI hardware P-States always can be supported in one processor. |
+ | | | But AMD P-States has the higher priority and if it is enabled with |
+ | | | :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond to the |
+ | | | request from AMD P-States. |
+ +---------+--------------------------------+------------------------------------------------------------------------------------+
+ | 3 | amd_pstate_ut_check_perf || Check if the each performance values are reasonable. |
+ | | || highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0. |
+ +---------+--------------------------------+------------------------------------------------------------------------------------+
+ | 4 | amd_pstate_ut_check_freq || Check if the each frequency values and max freq when set support boost mode |
+ | | | are reasonable. |
+ | | || max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0 |
+ | | || If boost is not active but supported, this maximum frequency will be larger than |
+ | | | the one in ``cpuinfo``. |
+ +---------+--------------------------------+------------------------------------------------------------------------------------+
+
+ 2). Tbench test
+
+   Test and monitor the CPU changes when running the tbench benchmark under
+   the specified governor. These changes include desired performance,
+   frequency, load, performance, energy, etc. The specified governor is
+   ondemand or schedutil. Tbench can also be tested with the ``acpi-cpufreq``
+   kernel driver for comparison.
+
+ 3). Gitsource test
+
+   Test and monitor the CPU changes when running the gitsource benchmark
+   under the specified governor. These changes include desired performance,
+   frequency, load, time, energy, etc. The specified governor is ondemand or
+   schedutil. Gitsource can also be tested with the ``acpi-cpufreq`` kernel
+   driver for comparison.
+
+#. How to execute the tests
+
+   We use a test module in the kselftest framework to implement it. We
+   create the ``amd-pstate-ut`` module and tie it into kselftest (for
+   details refer to Linux Kernel Selftests [4]_).
+
+ 1). Build
+
+   + enable the :c:macro:`CONFIG_X86_AMD_PSTATE` configuration option.
+ + set the :c:macro:`CONFIG_X86_AMD_PSTATE_UT` configuration option to M.
+ + make project
+ + make selftest ::
+
+ $ cd linux
+ $ make -C tools/testing/selftests
+
+ + make perf ::
+
+ $ cd tools/perf/
+ $ make
+
+
+ 2). Installation & Steps ::
+
+ $ make -C tools/testing/selftests install INSTALL_PATH=~/kselftest
+ $ cp tools/perf/perf /usr/bin/perf
+ $ sudo ./kselftest/run_kselftest.sh -c amd-pstate
+
+ 3). Specified test case ::
+
+ $ cd ~/kselftest/amd-pstate
+ $ sudo ./run.sh -t basic
+ $ sudo ./run.sh -t tbench
+ $ sudo ./run.sh -t tbench -m acpi-cpufreq
+ $ sudo ./run.sh -t gitsource
+ $ sudo ./run.sh -t gitsource -m acpi-cpufreq
+ $ ./run.sh --help
+ ./run.sh: illegal option -- -
+ Usage: ./run.sh [OPTION...]
+ [-h <help>]
+ [-o <output-file-for-dump>]
+ [-c <all: All testing,
+ basic: Basic testing,
+ tbench: Tbench testing,
+ gitsource: Gitsource testing.>]
+ [-t <tbench time limit>]
+ [-p <tbench process number>]
+ [-l <loop times for tbench>]
+ [-i <amd tracer interval>]
+ [-m <comparative test: acpi-cpufreq>]
+
+
+ 4). Results
+
+ + basic
+
+    When the test finishes, you will get the following log info ::
+
+ $ dmesg | grep "amd_pstate_ut" | tee log.txt
+ [12977.570663] amd_pstate_ut: 1 amd_pstate_ut_acpi_cpc_valid success!
+ [12977.570673] amd_pstate_ut: 2 amd_pstate_ut_check_enabled success!
+ [12977.571207] amd_pstate_ut: 3 amd_pstate_ut_check_perf success!
+ [12977.571212] amd_pstate_ut: 4 amd_pstate_ut_check_freq success!
+
+ + tbench
+
+    When the test finishes, you will get selftest.tbench.csv and png images.
+    The selftest.tbench.csv file contains the raw data and the drop of the
+    comparative test. The png images show the performance, energy and
+    performance per watt of each test.
+    Open selftest.tbench.csv:
+
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + Governor | Round | Des-perf | Freq | Load | Performance | Energy | Performance Per Watt |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + Unit | | | GHz | | MB/s | J | MB/J |
+ +=================================================+==============+==========+=========+==========+=============+=========+======================+
+ + amd-pstate-ondemand | 1 | | | | 2504.05 | 1563.67 | 158.5378 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand | 2 | | | | 2243.64 | 1430.32 | 155.2941 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand | 3 | | | | 2183.88 | 1401.32 | 154.2860 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand | Average | | | | 2310.52 | 1465.1 | 156.1268 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | 1 | 165.329 | 1.62257 | 99.798 | 2136.54 | 1395.26 | 151.5971 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | 2 | 166 | 1.49761 | 99.9993 | 2100.56 | 1380.5 | 150.6377 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | 3 | 166 | 1.47806 | 99.9993 | 2084.12 | 1375.76 | 149.9737 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | Average | 165.776 | 1.53275 | 99.9322 | 2107.07 | 1383.84 | 150.7399 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | 1 | | | | 2529.9 | 1564.4 | 160.0997 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | 2 | | | | 2249.76 | 1432.97 | 155.4297 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | 3 | | | | 2181.46 | 1406.88 | 153.5060 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | Average | | | | 2320.37 | 1468.08 | 156.4741 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | 1 | | | | 2137.64 | 1385.24 | 152.7723 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | 2 | | | | 2107.05 | 1372.23 | 152.0138 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | 3 | | | | 2085.86 | 1365.35 | 151.2433 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | Average | | | | 2110.18 | 1374.27 | 152.0136 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand VS acpi-cpufreq-schedutil | Comprison(%) | | | | -9.0584 | -6.3899 | -2.8506 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand VS amd-pstate-schedutil | Comprison(%) | | | | 8.8053 | -5.5463 | -3.4503 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand VS amd-pstate-ondemand | Comprison(%) | | | | -0.4245 | -0.2029 | -0.2219 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil VS amd-pstate-schedutil | Comprison(%) | | | | -0.1473 | 0.6963 | -0.8378 |
+ +-------------------------------------------------+--------------+----------+---------+----------+-------------+---------+----------------------+
+
+ + gitsource
+
+    When the test finishes, you will get selftest.gitsource.csv and png
+    images. The selftest.gitsource.csv file contains the raw data and the
+    drop of the comparative test. The png images show the performance,
+    energy and performance per watt of each test.
+    Open selftest.gitsource.csv:
+
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + Governor | Round | Des-perf | Freq | Load | Time | Energy | Performance Per Watt |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + Unit | | | GHz | | s | J | 1/J |
+ +=================================================+==============+==========+==========+==========+=============+=========+======================+
+ + amd-pstate-ondemand | 1 | 50.119 | 2.10509 | 23.3076 | 475.69 | 865.78 | 0.001155027 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand | 2 | 94.8006 | 1.98771 | 56.6533 | 467.1 | 839.67 | 0.001190944 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand | 3 | 76.6091 | 2.53251 | 43.7791 | 467.69 | 855.85 | 0.001168429 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand | Average | 73.8429 | 2.20844 | 41.2467 | 470.16 | 853.767 | 0.001171279 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | 1 | 165.919 | 1.62319 | 98.3868 | 464.17 | 866.8 | 0.001153668 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | 2 | 165.97 | 1.31309 | 99.5712 | 480.15 | 880.4 | 0.001135847 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | 3 | 165.973 | 1.28448 | 99.9252 | 481.79 | 867.02 | 0.001153375 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-schedutil | Average | 165.954 | 1.40692 | 99.2944 | 475.37 | 871.407 | 0.001147569 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | 1 | | | | 2379.62 | 742.96 | 0.001345967 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | 2 | | | | 441.74 | 817.49 | 0.001223256 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | 3 | | | | 455.48 | 820.01 | 0.001219497 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand | Average | | | | 425.613 | 793.487 | 0.001260260 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | 1 | | | | 459.69 | 838.54 | 0.001192548 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | 2 | | | | 466.55 | 830.89 | 0.001203528 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | 3 | | | | 470.38 | 837.32 | 0.001194286 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil | Average | | | | 465.54 | 835.583 | 0.001196769 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand VS acpi-cpufreq-schedutil | Comprison(%) | | | | 9.3810 | 5.3051 | -5.0379 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + amd-pstate-ondemand VS amd-pstate-schedutil | Comprison(%) | 124.7392 | -36.2934 | 140.7329 | 1.1081 | 2.0661 | -2.0242 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-ondemand VS amd-pstate-ondemand | Comprison(%) | | | | 10.4665 | 7.5968 | -7.0605 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+ + acpi-cpufreq-schedutil VS amd-pstate-schedutil | Comprison(%) | | | | 2.1115 | 4.2873 | -4.1110 |
+ +-------------------------------------------------+--------------+----------+----------+----------+-------------+---------+----------------------+
+
+Reference
+===========
+
+.. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming,
+ https://www.amd.com/system/files/TechDocs/24593.pdf
+
+.. [2] Advanced Configuration and Power Interface Specification,
+ https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf
+
+.. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors
+ https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip
+
+.. [4] Linux Kernel Selftests,
+ https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index aec2cd2aaea7..19754beb5a4e 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -612,8 +612,8 @@ the ``menu`` governor to be used on the systems that use the ``ladder`` governor
by default this way, for example.
The other kernel command line parameters controlling CPU idle time management
-described below are only relevant for the *x86* architecture and some of
-them affect Intel processors only.
+described below are only relevant for the *x86* architecture and references
+to ``intel_idle`` affect Intel processors only.
The *x86* architecture support code recognizes three kernel command line
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
@@ -635,10 +635,13 @@ idle, so it very well may hurt single-thread computations performance as well as
energy-efficiency. Thus using it for performance reasons may not be a good idea
at all.]
-The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
-``acpi_idle`` to be used (as long as all of the information needed by it is
-there in the system's ACPI tables), but it is not allowed to use the
-``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
+The ``idle=nomwait`` option prevents the use of ``MWAIT`` instruction of
+the CPU to enter idle states. When this option is used, the ``acpi_idle``
+driver will use the ``HLT`` instruction instead of ``MWAIT``. On systems
+running Intel processors, this option disables the ``intel_idle`` driver
+and forces the use of the ``acpi_idle`` driver instead. Note that in either
+case, ``acpi_idle`` driver will function only if all the information needed
+by it is in the system's ACPI tables.
In addition to the architecture-level kernel command line options affecting CPU
idle time management, there are parameters affecting individual ``CPUIdle``
diff --git a/Documentation/admin-guide/pm/intel-speed-select.rst b/Documentation/admin-guide/pm/intel-speed-select.rst
index 0a1fbdb54bfe..a2bfb971654f 100644
--- a/Documentation/admin-guide/pm/intel-speed-select.rst
+++ b/Documentation/admin-guide/pm/intel-speed-select.rst
@@ -262,6 +262,28 @@ Which shows that the base frequency now increased from 2600 MHz at performance
level 0 to 2800 MHz at performance level 4. As a result, any workload, which can
use fewer CPUs, can see a boost of 200 MHz compared to performance level 0.
+Changing performance level via BMC Interface
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to change the SST-PP level using an out of band (OOB) agent
+(via some remote management console, through the BMC "Baseboard Management
+Controller" interface). This mode is supported from the Sapphire Rapids
+processor generation onwards. The kernel and tool changes to support this
+mode were added in Linux kernel version 5.18. To enable this feature, the
+kernel config "CONFIG_INTEL_HFI_THERMAL" is required. The minimum tool
+version to support this feature is "v1.12", which is part of Linux kernel
+version 5.18.
+
+To support such configurations, this tool can be used as a daemon. Add the
+command line option ``--oob``::
+
+ # intel-speed-select --oob
+ Intel(R) Speed Select Technology
+ Executing on CPU model:143[0x8f]
+ OOB mode is enabled and will run as daemon
+
+In this mode the tool will online/offline CPUs based on the new performance
+level.
+
Check presence of other Intel(R) SST features
---------------------------------------------
diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
index d5043cd8d2f5..bf13ad25a32f 100644
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -712,7 +712,7 @@ it works in the `active mode <Active Mode_>`_.
The following sequence of shell commands can be used to enable them and see
their output (if the kernel is generally configured to support event tracing)::
- # cd /sys/kernel/debug/tracing/
+ # cd /sys/kernel/tracing/
# echo 1 > events/power/pstate_sample/enable
# echo 1 > events/power/cpu_frequency/enable
# cat trace
@@ -732,7 +732,7 @@ The ``ftrace`` interface can be used for low-level diagnostics of
P-state is called, the ``ftrace`` filter can be set to
:c:func:`intel_pstate_set_pstate`::
- # cd /sys/kernel/debug/tracing/
+ # cd /sys/kernel/tracing/
# cat available_filter_functions | grep -i pstate
intel_pstate_set_pstate
intel_pstate_cpu_init
diff --git a/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst
new file mode 100644
index 000000000000..09169d935835
--- /dev/null
+++ b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+==============================
+Intel Uncore Frequency Scaling
+==============================
+
+:Copyright: |copy| 2022 Intel Corporation
+
+:Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+
+Introduction
+------------
+
+Depending on the workload characteristics, the uncore can consume a
+significant amount of power in Intel's Xeon servers. To optimize the total
+power and improve overall performance, SoCs have internal algorithms for
+scaling the uncore frequency. These algorithms monitor the workload usage of
+the uncore and set a desirable frequency.
+
+It is possible that users have different expectations of uncore performance and
+want to have control over it. The objective is similar to allowing users to set
+the scaling min/max frequencies via cpufreq sysfs to improve CPU performance.
+Users may have some latency sensitive workloads where they do not want any
+change to uncore frequency. Also, users may have workloads which require
+different core and uncore performance at distinct phases and they may want to
+use both cpufreq and the uncore scaling interface to distribute power and
+improve overall performance.
+
+Sysfs Interface
+---------------
+
+To control uncore frequency, a sysfs interface is provided in the directory:
+`/sys/devices/system/cpu/intel_uncore_frequency/`.
+
+There is one directory for each package/die combination, as the scope of
+uncore scaling control is per die in SoCs with multiple dies per package, or
+per package in SoCs with a single die per package. The directory name
+represents the scope of control. For example: 'package_00_die_00' is for
+package id 0 and die 0.
+
+Each package_*_die_* contains the following attributes:
+
+``initial_max_freq_khz``
+ Out of reset, this attribute represents the maximum possible frequency.
+ This is a read-only attribute. If users adjust max_freq_khz,
+ they can always go back to maximum using the value from this attribute.
+
+``initial_min_freq_khz``
+ Out of reset, this attribute represents the minimum possible frequency.
+ This is a read-only attribute. If users adjust min_freq_khz,
+ they can always go back to minimum using the value from this attribute.
+
+``max_freq_khz``
+ This attribute is used to set the maximum uncore frequency.
+
+``min_freq_khz``
+ This attribute is used to set the minimum uncore frequency.
+
+``current_freq_khz``
+ This attribute is used to get the current uncore frequency.
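+
+For example, a session like the following caps the maximum uncore frequency
+of package 0, die 0 (a sketch: the directory name and all values shown are
+illustrative and depend on the system)::
+
+ # cd /sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00
+ # cat initial_max_freq_khz
+ 2400000
+ # echo 2000000 > max_freq_khz
+ # cat current_freq_khz
+ 2000000
+
+Writing the value of ``initial_max_freq_khz`` back to ``max_freq_khz``
+restores the default limit.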
diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst
index f40994c422dc..ee45887811ff 100644
--- a/Documentation/admin-guide/pm/working-state.rst
+++ b/Documentation/admin-guide/pm/working-state.rst
@@ -11,6 +11,8 @@ Working-State Power Management
intel_idle
cpufreq
intel_pstate
+ amd-pstate
cpufreq_drivers
intel_epb
intel-speed-select
+ intel_uncore_frequency_scaling
diff --git a/Documentation/admin-guide/quickly-build-trimmed-linux.rst b/Documentation/admin-guide/quickly-build-trimmed-linux.rst
new file mode 100644
index 000000000000..ff4f4cc8522b
--- /dev/null
+++ b/Documentation/admin-guide/quickly-build-trimmed-linux.rst
@@ -0,0 +1,1092 @@
+.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0)
+.. [see the bottom of this file for redistribution information]
+
+===========================================
+How to quickly build a trimmed Linux kernel
+===========================================
+
+This guide explains how to swiftly build Linux kernels that are ideal for
+testing purposes, but perfectly fine for day-to-day use, too.
+
+The essence of the process (aka 'TL;DR')
+========================================
+
+*[If you are new to compiling Linux, ignore this TLDR and head over to the next
+section below: it contains a step-by-step guide, which is more detailed, but
+still brief and easy to follow; that guide and its accompanying reference
+section also mention alternatives, pitfalls, and additional aspects, all of
+which might be relevant for you.]*
+
+If your system uses techniques like Secure Boot, prepare it to permit starting
+self-compiled Linux kernels; install compilers and everything else needed for
+building Linux; make sure to have 12 Gigabyte free space in your home directory.
+Now run the following commands to download fresh Linux mainline sources, which
+you then use to configure, build and install your own kernel::
+
+ git clone --depth 1 -b master \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git ~/linux/
+ cd ~/linux/
+ # Hint: if you want to apply patches, do it at this point. See below for details.
+ # Hint: it's recommended to tag your build at this point. See below for details.
+ yes "" | make localmodconfig
+ # Hint: at this point you might want to adjust the build configuration; you'll
+ # have to, if you are running Debian. See below for details.
+ make -j $(nproc --all)
+ # Note: on many commodity distributions the next command suffices, but on Arch
+ # Linux, its derivatives, and some others it does not. See below for details.
+ command -v installkernel && sudo make modules_install install
+ reboot
+
+If you later want to build a newer mainline snapshot, use these commands::
+
+ cd ~/linux/
+ git fetch --depth 1 origin
+ # Note: the next command will discard any changes you did to the code:
+ git checkout --force --detach origin/master
+ # Reminder: if you want to (re)apply patches, do it at this point.
+ # Reminder: you might want to add or modify a build tag at this point.
+ make olddefconfig
+ make -j $(nproc --all)
+ # Reminder: the next command on some distributions does not suffice.
+ command -v installkernel && sudo make modules_install install
+ reboot
+
+Step-by-step guide
+==================
+
+Compiling your own Linux kernel is easy in principle. There are various ways to
+do it. Which of them actually works and is the best choice depends on the
+circumstances.
+
+This guide describes a way perfectly suited for those who want to quickly
+install Linux from sources without being bothered by complicated details; the
+goal is to cover everything typically needed on mainstream Linux distributions
+running on commodity PC or server hardware.
+
+The described approach is great for testing purposes, for example to try a
+proposed fix or to check if a problem was already fixed in the latest codebase.
+Nonetheless, kernels built this way are also totally fine for day-to-day use
+while at the same time being easy to keep up to date.
+
+The following steps describe the important aspects of the process; a
+comprehensive reference section later explains each of them in more detail. It
+sometimes also describes alternative approaches, pitfalls, as well as errors
+that might occur at a particular point -- and how to then get things rolling
+again.
+
+..
+ Note: if you see this note, you are reading the text's source file. You
+ might want to switch to a rendered version, as it makes it a lot easier to
+ quickly look something up in the reference section and afterwards jump back
+ to where you left off. Find the latest rendered version here:
+ https://docs.kernel.org/admin-guide/quickly-build-trimmed-linux.html
+
+.. _backup_sbs:
+
+ * Create a fresh backup and put system repair and restore tools at hand, just
+ to be prepared for the unlikely case of something going sideways.
+
+ [:ref:`details<backup>`]
+
+.. _secureboot_sbs:
+
+ * On platforms with 'Secure Boot' or similar techniques, prepare everything to
+ ensure the system will permit your self-compiled kernel to boot later. The
+ quickest and easiest way to achieve this on commodity x86 systems is to
+ disable such techniques in the BIOS setup utility; alternatively, remove
+ their restrictions through a process initiated by
+ ``mokutil --disable-validation``.
+
+ [:ref:`details<secureboot>`]
+
+.. _buildrequires_sbs:
+
+ * Install all software required to build a Linux kernel. Often you will need:
+ 'bc', 'binutils' ('ld' et al.), 'bison', 'flex', 'gcc', 'git', 'openssl',
+ 'pahole', 'perl', and the development headers for 'libelf' and 'openssl'. The
+ reference section shows how to quickly install those on various popular Linux
+ distributions.
+
+ [:ref:`details<buildrequires>`]
+
+.. _diskspace_sbs:
+
+ * Ensure to have enough free space for building and installing Linux. For the
+ latter 150 Megabyte in /lib/ and 100 in /boot/ are a safe bet. For storing
+ sources and build artifacts 12 Gigabyte in your home directory should
+ typically suffice. If you have less available, be sure to check the reference
+ section for the step that explains adjusting your kernel's build
+ configuration: it mentions a trick that reduces the amount of required space
+ in /home/ to around 4 Gigabyte.
+
+ [:ref:`details<diskspace>`]
+
+.. _sources_sbs:
+
+ * Retrieve the sources of the Linux version you intend to build; then change
+ into the directory holding them, as all further commands in this guide are
+ meant to be executed from there.
+
+ *[Note: the following paragraphs describe how to retrieve the sources by
+ partially cloning the Linux stable git repository. This is called a shallow
+ clone. The reference section explains two alternatives:* :ref:`packaged
+ archives<sources_archive>` *and* :ref:`a full git clone<sources_full>` *;
+ prefer the latter, if downloading a lot of data does not bother you, as that
+ will avoid some* :ref:`peculiar characteristics of shallow clones the
+ reference section explains<sources_shallow>` *.]*
+
+ First, execute the following command to retrieve a fresh mainline codebase::
+
+ git clone --no-checkout --depth 1 -b master \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git ~/linux/
+ cd ~/linux/
+
+ If you want to access recent mainline releases and pre-releases, deepen your
+ clone's history to the oldest mainline version you are interested in::
+
+ git fetch --shallow-exclude=v6.0 origin
+
+ In case you want to access a stable/longterm release (say v6.1.5), simply add
+ the branch holding that series; afterwards fetch the history at least up to
+ the mainline version that started the series (v6.1)::
+
+ git remote set-branches --add origin linux-6.1.y
+ git fetch --shallow-exclude=v6.0 origin
+
+ Now checkout the code you are interested in. If you just performed the
+ initial clone, you will be able to check out a fresh mainline codebase, which
+ is ideal for checking whether developers already fixed an issue::
+
+ git checkout --detach origin/master
+
+ If you deepened your clone, you can specify the version you deepened to
+ (``v6.0`` above) instead of ``origin/master``; later releases like ``v6.1``
+ and pre-releases like ``v6.2-rc1`` will work, too. Stable or longterm
+ versions like ``v6.1.5`` work just the same, if you added the appropriate
+ stable/longterm branch as described.
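+
+ For example, after adding the linux-6.1.y branch as just described, the
+ following will check out the 6.1.5 sources (a sketch; any other fetched tag
+ works the same way)::
+
+ git checkout --detach v6.1.5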
+
+ [:ref:`details<sources>`]
+
+.. _patching_sbs:
+
+ * In case you want to apply a kernel patch, do so now. Often a command like
+ this will do the trick::
+
+ patch -p1 < ../proposed-fix.patch
+
+ Whether the ``-p1`` is actually needed depends on how the patch was created;
+ if the patch does not apply with it, try again without it.
+
+ If you cloned the sources with git and anything goes sideways, run ``git
+ reset --hard`` to undo any changes to the sources.
+
+ [:ref:`details<patching>`]
+
+.. _tagging_sbs:
+
+ * If you patched your kernel or have one of the same version installed already,
+ better add a unique tag to the one you are about to build::
+
+ echo "-proposed_fix" > localversion
+
+ Running ``uname -r`` under your kernel later will then print something like
+ '6.1-rc4-proposed_fix'.
+
+ [:ref:`details<tagging>`]
+
+.. _configuration_sbs:
+
+ * Create the build configuration for your kernel based on an existing
+ configuration.
+
+ If you already prepared such a '.config' file yourself, copy it to
+ ~/linux/ and run ``make olddefconfig``.
+
+ Use the same command, if your distribution or somebody else already tailored
+ your running kernel to your or your hardware's needs: the make target
+ 'olddefconfig' will then try to use that kernel's .config as base.
+
+ Using this make target is fine for everybody else, too -- but you often can
+ save a lot of time by using this command instead::
+
+ yes "" | make localmodconfig
+
+ This will try to pick your distribution's kernel as base, but then disable
+ modules for any features apparently superfluous for your setup. This will
+ reduce the compile time enormously, especially if you are running a
+ universal kernel from a commodity Linux distribution.
+
+ There is a catch: the make target 'localmodconfig' will disable kernel
+ features you have not utilized, directly or indirectly through some program,
+ since you booted the system. You can reduce or nearly eliminate that risk by
+ using tricks outlined in the reference section; for quick testing purposes
+ that risk is often negligible, but it is an aspect you want to keep in mind
+ in case your kernel behaves oddly.
+
+ [:ref:`details<configuration>`]
+
+.. _configmods_sbs:
+
+ * Check if you might want to or have to adjust some kernel configuration
+ options:
+
+ * Evaluate how you want to handle debug symbols. Enable them, if you later
+ might need to decode a stack trace found for example in a 'panic', 'Oops',
+ 'warning', or 'BUG'; on the other hand disable them, if you are short on
+ storage space or prefer a smaller kernel binary. See the reference section
+ for details on how to do either. If neither applies, it will likely be fine
+ to simply not bother with this. [:ref:`details<configmods_debugsymbols>`]
+
+ * Are you running Debian? Then, to avoid known problems, perform the
+ additional adjustments explained in the reference section.
+ [:ref:`details<configmods_distros>`].
+
+ * If you want to influence the other aspects of the configuration, do so now
+ by using make targets like 'menuconfig' or 'xconfig'.
+ [:ref:`details<configmods_individual>`].
+
+.. _build_sbs:
+
+ * Build the image and the modules of your kernel::
+
+ make -j $(nproc --all)
+
+ If you want your kernel packaged up as deb, rpm, or tar file, see the
+ reference section for alternatives.
+
+ [:ref:`details<build>`]
+
+.. _install_sbs:
+
+ * Now install your kernel::
+
+ command -v installkernel && sudo make modules_install install
+
+ Often all left for you to do afterwards is a ``reboot``, as many commodity
+ Linux distributions will then create an initramfs (also known as initrd) and
+ an entry for your kernel in your bootloader's configuration; but on some
+ distributions you have to take care of these two steps manually for reasons
+ the reference section explains.
+
+ On a few distributions like Arch Linux and its derivatives the above command
+ does nothing at all; in that case you have to manually install your kernel,
+ as outlined in the reference section.
+
+ [:ref:`details<install>`]
+
+.. _another_sbs:
+
+ * To later build another kernel you need similar steps, but sometimes slightly
+ different commands.
+
+ First, switch back into the sources tree::
+
+ cd ~/linux/
+
+ In case you want to build a version from a stable/longterm series you have
+ not used yet (say 6.2.y), tell git to track it::
+
+ git remote set-branches --add origin linux-6.2.y
+
+ Now fetch the latest upstream changes; you again need to specify the earliest
+ version you care about, as git otherwise might retrieve the entire commit
+ history::
+
+ git fetch --shallow-exclude=v6.1 origin
+
+ If you modified the sources (for example by applying a patch), you now need
+ to discard those modifications; that's because git otherwise will not be able
+ to switch to the sources of another version due to potential conflicting
+ changes::
+
+ git reset --hard
+
+ Now checkout the version you are interested in, as explained above::
+
+ git checkout --detach origin/master
+
+ At this point you might want to patch the sources again or set/modify a build
+ tag, as explained earlier; afterwards adjust the build configuration to the
+ new codebase and build your next kernel::
+
+ # reminder: if you want to apply patches, do it at this point
+ # reminder: you might want to update your build tag at this point
+ make olddefconfig
+ make -j $(nproc --all)
+
+ Install the kernel as outlined above::
+
+ command -v installkernel && sudo make modules_install install
+
+ [:ref:`details<another>`]
+
+.. _uninstall_sbs:
+
+ * Your kernel is easy to remove later, as its parts are only stored in two
+ places and clearly identifiable by the kernel's release name. Just ensure to
+ not delete the kernel you are running, as that might render your system
+ unbootable.
+
+ Start by deleting the directory holding your kernel's modules, which is named
+ after its release name -- '6.0.1-foobar' in the following example::
+
+ sudo rm -rf /lib/modules/6.0.1-foobar
+
+ Now try the following command, which on some distributions will delete all
+ other kernel files installed while also removing the kernel's entry from the
+ bootloader configuration::
+
+ command -v kernel-install && sudo kernel-install -v remove 6.0.1-foobar
+
+ If that command does not output anything or fails, see the reference section;
+ do the same if any files named '*6.0.1-foobar*' remain in /boot/.
+
+ [:ref:`details<uninstall>`]
+
+.. _submit_improvements:
+
+Did you run into trouble following any of the above steps that is not cleared up
+by the reference section below? Or do you have ideas how to improve the text?
+Then please take a moment of your time and let the maintainer of this document
+know by email (Thorsten Leemhuis <linux@leemhuis.info>), ideally while CCing the
+Linux docs mailing list (linux-doc@vger.kernel.org). Such feedback is vital to
+improve this document further, which is in everybody's interest, as it will
+enable more people to master the task described here.
+
+Reference section for the step-by-step guide
+============================================
+
+This section holds additional information for each of the steps in the above
+guide.
+
+.. _backup:
+
+Prepare for emergencies
+-----------------------
+
+ *Create a fresh backup and put system repair and restore tools at hand*
+ [:ref:`... <backup_sbs>`]
+
+Remember, you are dealing with computers, which sometimes do unexpected things
+-- especially if you fiddle with crucial parts like the kernel of an operating
+system. That's what you are about to do in this process. Hence, better prepare
+for something going sideways, even if that should not happen.
+
+[:ref:`back to step-by-step guide <backup_sbs>`]
+
+.. _secureboot:
+
+Dealing with techniques like Secure Boot
+----------------------------------------
+
+ *On platforms with 'Secure Boot' or similar techniques, prepare everything to
+ ensure the system will permit your self-compiled kernel to boot later.*
+ [:ref:`... <secureboot_sbs>`]
+
+Many modern systems allow only certain operating systems to start; they thus by
+default will reject booting self-compiled kernels.
+
+You ideally deal with this by making your platform trust your self-built kernels
+with the help of a certificate and signing. How to do that is not described
+here, as it requires various steps that would take the text too far away from
+its purpose; 'Documentation/admin-guide/module-signing.rst' and various web
+sites already explain this in more detail.
+
+Temporarily disabling solutions like Secure Boot is another way to make your own
+Linux boot. On commodity x86 systems it is possible to do this in the BIOS Setup
+utility; the steps to do so are not described here, as they greatly vary between
+machines.
+
+On mainstream x86 Linux distributions there is a third and universal option:
+disable all Secure Boot restrictions for your Linux environment. You can
+initiate this process by running ``mokutil --disable-validation``; this will
+tell you to create a one-time password, which is safe to write down. Now
+restart; right after your BIOS performed all self-tests the bootloader Shim will
+show a blue box with a message 'Press any key to perform MOK management'. Hit
+some key before the countdown expires. This will open a menu; choose 'Change
+Secure Boot state' there. Shim's 'MokManager' will now ask you to enter three
+randomly chosen characters from the one-time password specified earlier. Once
+you provided them, confirm that you really want to disable the validation.
+Afterwards, permit MokManager to reboot the machine.
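+
+Condensed into commands, the preparation before the reboot boils down to this
+sketch (mokutil will interactively ask you to choose the one-time password)::
+
+ sudo mokutil --disable-validation
+ reboot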
+
+[:ref:`back to step-by-step guide <secureboot_sbs>`]
+
+.. _buildrequires:
+
+Install build requirements
+--------------------------
+
+ *Install all software required to build a Linux kernel.*
+ [:ref:`...<buildrequires_sbs>`]
+
+The kernel is pretty stand-alone, but besides tools like the compiler you will
+sometimes need a few libraries to build one. How to install everything needed
+depends on your Linux distribution and the configuration of the kernel you are
+about to build.
+
+Here are a few examples of what you typically need on some mainstream
+distributions:
+
+ * Debian, Ubuntu, and derivatives::
+
+ sudo apt install bc binutils bison dwarves flex gcc git make openssl \
+ perl-base libssl-dev libelf-dev
+
+ * Fedora and derivatives::
+
+ sudo dnf install binutils /usr/include/{libelf.h,openssl/pkcs7.h} \
+ /usr/bin/{bc,bison,flex,gcc,git,openssl,make,perl,pahole}
+
+ * openSUSE and derivatives::
+
+ sudo zypper install bc binutils bison dwarves flex gcc git make perl-base \
+ openssl openssl-devel libelf-devel
+
+In case you wonder why these lists include openssl and its development headers:
+they are needed for the Secure Boot support, which many distributions enable in
+their kernel configuration for x86 machines.
+
+Sometimes you will need tools for compression formats like bzip2, gzip, lz4,
+lzma, lzo, xz, or zstd as well.
+
+You might need additional libraries and their development headers in case you
+perform tasks not covered in this guide. For example, zlib will be needed when
+building kernel tools from the tools/ directory; adjusting the build
+configuration with make targets like 'menuconfig' or 'xconfig' will require
+development headers for ncurses or Qt5.
+
+[:ref:`back to step-by-step guide <buildrequires_sbs>`]
+
+.. _diskspace:
+
+Space requirements
+------------------
+
+ *Ensure to have enough free space for building and installing Linux.*
+ [:ref:`... <diskspace_sbs>`]
+
+The numbers mentioned are rough estimates with a big safety margin, so often
+you will need less.
+
+If you have space constraints, remember to read the reference section when you
+reach the :ref:`section about configuration adjustments <configmods>`, as
+ensuring debug symbols are disabled will reduce the consumed disk space by quite
+a few gigabytes.
+
+[:ref:`back to step-by-step guide <diskspace_sbs>`]
+
+
+.. _sources:
+
+Download the sources
+--------------------
+
+ *Retrieve the sources of the Linux version you intend to build.*
+ [:ref:`...<sources_sbs>`]
+
+The step-by-step guide outlines how to retrieve Linux' sources using a shallow
+git clone. There is :ref:`more to tell about this method<sources_shallow>` and
+two alternate ways worth describing: :ref:`packaged archives<sources_archive>`
+and :ref:`a full git clone<sources_full>`. And the aspects ':ref:`wouldn't it
+be wiser to use a proper pre-release than the latest mainline code
+<sources_snapshot>`' and ':ref:`how to get an even fresher mainline codebase
+<sources_fresher>`' need elaboration, too.
+
+Note, to keep things simple the commands used in this guide store the build
+artifacts in the source tree. If you prefer to separate them, simply add
+something like ``O=~/linux-builddir/`` to all make calls; also adjust the path
+in all commands that add files or modify any generated ones (like your
+'.config').
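+
+For example (a sketch)::
+
+ make O=~/linux-builddir/ -j $(nproc --all)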
+
+[:ref:`back to step-by-step guide <sources_sbs>`]
+
+.. _sources_shallow:
+
+Noteworthy characteristics of shallow clones
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The step-by-step guide uses a shallow clone, as it is the best solution for most
+of this document's target audience. There are a few aspects of this approach
+worth mentioning:
+
+ * This document in most places uses ``git fetch`` with ``--shallow-exclude=``
+ to specify the earliest version you care about (or to be precise: its git
+ tag). Alternatively, you can use the parameter ``--shallow-since=`` to
+ specify an absolute (say ``'2023-07-15'``) or relative (``'12 months'``)
+ date to define the depth of the history you want to download. As a second
+ alternative, you can also specify a certain depth explicitly with a
+ parameter like ``--depth=1``, unless you add branches for stable/longterm
+ kernels; a sketch after this list illustrates these alternatives.
+
+ * When running ``git fetch``, remember to always specify the oldest version,
+ the time you care about, or an explicit depth as shown in the step-by-step
+ guide. Otherwise you will risk downloading nearly the entire git history,
+ which will consume quite a bit of time and bandwidth while also stressing the
+ servers.
+
+ Note, you do not have to use the same version or date all the time. But when
+ you change it over time, git will deepen or flatten the history to the
+ specified point. That allows you to retrieve versions you initially thought
+ you did not need -- or it will discard the sources of older versions, for
+ example in case you want to free up some disk space. The latter will happen
+ automatically when using ``--shallow-since=`` or
+ ``--depth=``.
+
+ * Be warned, when deepening your clone you might encounter an error like
+ 'fatal: error in object: unshallow cafecaca0c0dacafecaca0c0dacafecaca0c0da'.
+ In that case run ``git repack -d`` and try again.
+
+ * In case you want to revert changes from a certain version (say Linux 6.3) or
+ perform a bisection (v6.2..v6.3), better tell ``git fetch`` to retrieve
+ objects up to three versions earlier (e.g. 6.0): ``git describe`` will then
+ be able to describe most commits just like it would in a full git clone.
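+
+For illustration, here are the alternative fetch calls mentioned in the first
+point above; they deepen the history by a relative date or to a fixed depth
+(the values are just examples)::
+
+ git fetch --shallow-since='12 months' origin
+ git fetch --depth=100 origin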
+
+[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`]
+
+.. _sources_archive:
+
+Downloading the sources using a packages archive
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+People new to compiling Linux often assume downloading an archive via the
+front-page of https://kernel.org is the best approach to retrieve Linux'
+sources. It actually can be, if you are certain to build just one particular
+kernel version without changing any code. Thing is: you might be sure this will
+be the case, but in practice it often will turn out to be a wrong assumption.
+
+That's because when reporting or debugging an issue developers will often ask to
+give another version a try. They also might suggest temporarily undoing a commit
+with ``git revert`` or might provide various patches to try. Sometimes reporters
+will also be asked to use ``git bisect`` to find the change causing a problem.
+These things rely on git or are a lot easier and quicker to handle with it.
+
+A shallow clone also does not add any significant overhead. For example, when
+you use ``git clone --depth=1`` to create a shallow clone of the latest mainline
+codebase git will only retrieve a little more data than downloading the latest
+mainline pre-release (aka 'rc') via the front-page of kernel.org would.
+
+A shallow clone therefore is often the better choice. If you nevertheless want
+to use a packaged source archive, download one via kernel.org; afterwards
+extract its content to some directory and change to the subdirectory created
+during extraction. The rest of the step-by-step guide will work just fine, apart
+from things that rely on git -- but this mainly concerns the section on
+successive builds of other versions.
+
+[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`]
+
+.. _sources_full:
+
+Downloading the sources using a full git clone
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If downloading and storing a lot of data (~4.4 Gigabyte as of early 2023) is
+nothing that bothers you, perform a full git clone instead of a shallow one.
+You then will avoid the specialties mentioned above and will have all
+versions and individual commits at hand at any time::
+
+ curl -L \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/clone.bundle \
+ -o linux-stable.git.bundle
+ git clone linux-stable.git.bundle ~/linux/
+ rm linux-stable.git.bundle
+ cd ~/linux/
+ git remote set-url origin \
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
+ git fetch origin
+ git checkout --detach origin/master
+
+[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`]
+
+.. _sources_snapshot:
+
+Proper pre-releases (RCs) vs. latest mainline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When cloning the sources using git and checking out origin/master, you often
+will retrieve a codebase that is somewhere between the latest and the next
+release or pre-release. This almost always is the code you want when giving
+mainline a shot: pre-releases like v6.1-rc5 are in no way special, as they do
+not get any significant extra testing before being published.
+
+There is one exception: you might want to stick to the latest mainline release
+(say v6.1) before its successor's first pre-release (v6.2-rc1) is out. That is
+because compiler errors and other problems are more likely to occur during this
+time, as mainline then is in its 'merge window': a usually two week long phase,
+in which the bulk of the changes for the next release is merged.
+
+[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`]
+
+.. _sources_fresher:
+
+Avoiding the mainline lag
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The explanations for both the shallow clone and the full clone retrieve the
+code from the Linux stable git repository. That makes things simpler for this
+document's audience, as it allows easy access to both mainline and
+stable/longterm releases. This approach has just one downside:
+
+Changes merged into the mainline repository are only synced to the master branch
+of the Linux stable repository every few hours. This lag most of the time is
+not something to worry about; but in case you really need the latest code, just
+add the mainline repo as additional remote and checkout the code from there::
+
+ git remote add mainline \
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
+ git fetch mainline
+ git checkout --detach mainline/master
+
+When doing this with a shallow clone, remember to call ``git fetch`` with one
+of the parameters described earlier to limit the depth.
+
+[:ref:`back to step-by-step guide <sources_sbs>`] [:ref:`back to section intro <sources>`]
+
+.. _patching:
+
+Patch the sources (optional)
+----------------------------
+
+ *In case you want to apply a kernel patch, do so now.*
+ [:ref:`...<patching_sbs>`]
+
+This is the point where you might want to patch your kernel -- for example when
+a developer proposed a fix and asked you to check if it helps. The step-by-step
+guide already explains everything crucial here.
+
+[:ref:`back to step-by-step guide <patching_sbs>`]
+
+.. _tagging:
+
+Tagging this kernel build (optional, often wise)
+------------------------------------------------
+
+ *If you patched your kernel or already have that kernel version installed,
+ better tag your kernel by extending its release name:*
+ [:ref:`...<tagging_sbs>`]
+
+Tagging your kernel will help avoid confusion later, especially when you patched
+your kernel. Adding an individual tag will also ensure the kernel's image and
+its modules are installed in parallel to any existing kernels.
+
+There are various ways to add such a tag. The step-by-step guide realizes one by
+creating a 'localversion' file in your build directory from which the kernel
+build scripts will automatically pick up the tag. You can later change that file
+to use a different tag in subsequent builds or simply remove that file to dump
+the tag.
+
+[:ref:`back to step-by-step guide <tagging_sbs>`]
+
+.. _configuration:
+
+Define the build configuration for your kernel
+----------------------------------------------
+
+ *Create the build configuration for your kernel based on an existing
+ configuration.* [:ref:`... <configuration_sbs>`]
+
+There are various aspects of this step that require a more careful
+explanation:
+
+Pitfalls when using another configuration file as base
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Make targets like localmodconfig and olddefconfig share a few common snares you
+want to be aware of:
+
+ * These targets will reuse a kernel build configuration in your build directory
+ (e.g. '~/linux/.config'), if one exists. In case you want to start from
+ scratch you thus need to delete it.
+
+ * The make targets try to find the configuration for your running kernel
+ automatically, but might choose poorly. A line like '# using defaults found
+ in /boot/config-6.0.7-250.fc36.x86_64' or 'using config:
+ '/boot/config-6.0.7-250.fc36.x86_64' tells you which file they picked. If
+ that is not the intended one, simply store it as '~/linux/.config'
+ before using these make targets (see the sketch after this list).
+
+ * Unexpected things might happen if you try to use a config file prepared for
+ one kernel (say v6.0) on an older generation (say v5.15). In that case you
+ might want to use as base a configuration which your distribution utilized
+ when it shipped that or a slightly older kernel version.
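+
+For example, to make sure these make targets pick up the configuration file
+you intend as base, put it in place yourself first (a sketch; some
+distributions instead expose the running kernel's configuration in
+/proc/config.gz)::
+
+ cp /boot/config-$(uname -r) ~/linux/.config
+ make olddefconfig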
+
+Influencing the configuration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The make target olddefconfig and the ``yes "" |`` used when utilizing
+localmodconfig will set any undefined build options to their default value. This
+among others will disable many kernel features that were introduced after your
+base kernel was released.
+
+If you want to set these configuration options manually, use ``oldconfig``
+instead of ``olddefconfig``, or omit the ``yes "" |`` when utilizing
+localmodconfig. Then for each undefined configuration option you will be asked
+how to proceed. In case you are unsure what to answer, simply hit 'enter' to
+apply the default value.
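+
+As a sketch, the interactive variants of the two approaches thus are::
+
+ make oldconfig
+
+and::
+
+ make localmodconfig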
+
+Big pitfall when using localmodconfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As explained briefly in the step-by-step guide already: with localmodconfig it
+can easily happen that your self-built kernel will lack modules for tasks you
+did not perform before utilizing this make target. That's because those tasks
+require kernel modules that are normally autoloaded when you perform that task
+for the first time; if you did not perform that task at least once before
+using localmodconfig, the latter will thus assume these modules are
+superfluous and disable them.
+
+You can try to avoid this by performing typical tasks that often will autoload
+additional kernel modules: start a VM, establish VPN connections, loop-mount a
+CD/DVD ISO, mount network shares (CIFS, NFS, ...), and connect all external
+devices (2FA keys, headsets, webcams, ...) as well as storage devices with file
+systems you otherwise do not utilize (btrfs, ext4, FAT, NTFS, XFS, ...). But it
+is hard to think of everything that might be needed -- even kernel developers
+often forget one thing or another at this point.
+
+Do not let that risk bother you, especially when compiling a kernel only for
+testing purposes: everything typically crucial will be there. And if you forget
+something important you can turn on a missing feature later and quickly run the
+commands to compile and install a better kernel.
+
+But if you plan to build and use self-built kernels regularly, you might want to
+reduce the risk by recording which modules your system loads over the course of
+a few weeks. You can automate this with `modprobed-db
+<https://github.com/graysky2/modprobed-db>`_. Afterwards use ``LSMOD=<path>`` to
+point localmodconfig to the list of modules modprobed-db noticed being used::
+
+ yes "" | make LSMOD="${HOME}"/.config/modprobed.db localmodconfig
+
+Remote building with localmodconfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you want to use localmodconfig to build a kernel for another machine, run
+``lsmod > lsmod_foo-machine`` on it and transfer that file to your build host.
+Now point the build scripts to the file like this: ``yes "" | make
+LSMOD=~/lsmod_foo-machine localmodconfig``. Note, in this case
+you likely want to copy a base kernel configuration from the other machine over
+as well and place it as .config in your build directory.
+
+[:ref:`back to step-by-step guide <configuration_sbs>`]
+
+.. _configmods:
+
+Adjust build configuration
+--------------------------
+
+ *Check if you might want to or have to adjust some kernel configuration
+ options:*
+
+Depending on your needs you at this point might want or have to adjust some
+kernel configuration options.
+
+.. _configmods_debugsymbols:
+
+Debug symbols
+~~~~~~~~~~~~~
+
+ *Evaluate how you want to handle debug symbols.*
+ [:ref:`...<configmods_sbs>`]
+
+Most users do not need to care about this: it's often fine to leave everything
+as it is; but you should take a closer look at this, if you might need to decode
+a stack trace or want to reduce space consumption.
+
+Having debug symbols available can be important when your kernel throws a
+'panic', 'Oops', 'warning', or 'BUG' later when running, as then you will be
+able to find the exact place where the problem occurred in the code. But
+collecting and embedding the needed debug information takes time and consumes
+quite a bit of space: in late 2022 the build artifacts for a typical x86 kernel
+configured with localmodconfig consumed around 5 Gigabyte of space with debug
+symbols, but less than 1 Gigabyte when they were disabled. The resulting
+kernel image and the modules are bigger as well, which increases load times.
+
+Hence, if you want a small kernel and are unlikely to decode a stack trace
+later, you might want to disable debug symbols to avoid above downsides::
+
+ ./scripts/config --file .config -d DEBUG_INFO \
+ -d DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -d DEBUG_INFO_DWARF4 \
+ -d DEBUG_INFO_DWARF5 -e DEBUG_INFO_NONE
+ make olddefconfig
+
+You on the other hand definitely want to enable them, if there is a decent
+chance that you need to decode a stack trace later (as explained by 'Decode
+failure messages' in Documentation/admin-guide/tainted-kernels.rst in more
+detail)::
+
+ ./scripts/config --file .config -d DEBUG_INFO_NONE -e DEBUG_KERNEL \
+ -e DEBUG_INFO -e DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT -e KALLSYMS -e KALLSYMS_ALL
+ make olddefconfig
+
+Note, many mainstream distributions enable debug symbols in their kernel
+configurations -- make targets like localmodconfig and olddefconfig thus will
+often pick that setting up.
+
+[:ref:`back to step-by-step guide <configmods_sbs>`]
+
+.. _configmods_distros:
+
+Distro specific adjustments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *Are you running* [:ref:`... <configmods_sbs>`]
+
+The following sections help you to avoid build problems that are known to occur
+when following this guide on a few commodity distributions.
+
+**Debian:**
+
+ * Remove a stale reference to a certificate file that would cause your build to
+ fail::
+
+ ./scripts/config --file .config --set-str SYSTEM_TRUSTED_KEYS ''
+
+ Alternatively, download the needed certificate and make that configuration
+ option point to it, as `the Debian handbook explains in more detail
+ <https://debian-handbook.info/browse/stable/sect.kernel-compilation.html>`_
+ -- or generate your own, as explained in
+ Documentation/admin-guide/module-signing.rst.
+
+[:ref:`back to step-by-step guide <configmods_sbs>`]
+
+.. _configmods_individual:
+
+Individual adjustments
+~~~~~~~~~~~~~~~~~~~~~~
+
+ *If you want to influence the other aspects of the configuration, do so
+ now* [:ref:`... <configmods_sbs>`]
+
+You at this point can use a command like ``make menuconfig`` to enable or
+disable certain features using a text-based user interface; to use a graphical
+configuration utility, use the make target ``xconfig`` or ``gconfig`` instead.
+All of them require development libraries from toolkits they are based on
+(ncurses, Qt5, Gtk2); an error message will tell you if something required is
+missing.
+
+[:ref:`back to step-by-step guide <configmods_sbs>`]
+
+.. _build:
+
+Build your kernel
+-----------------
+
+ *Build the image and the modules of your kernel* [:ref:`... <build_sbs>`]
+
+A lot can go wrong at this stage, but the instructions below will help you help
+yourself. Another subsection explains how to directly package your kernel up as
+deb, rpm or tar file.
+
+Dealing with build errors
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When a build error occurs, it might be caused by some aspect of your machine's
+setup that often can be fixed quickly; other times though the problem lies in
+the code and can only be fixed by a developer. A close examination of the
+failure messages coupled with some research on the internet will often tell you
+which of the two it is. To perform such an investigation, restart the build
+process like this::
+
+ make V=1
+
+The ``V=1`` activates verbose output, which might be needed to see the actual
+error. To make it easier to spot, this command also omits the ``-j $(nproc
+--all)`` used earlier to utilize every CPU core in the system for the job -- but
+this parallelism also results in some clutter when failures occur.
+
+After a few seconds the build process should run into the error again. Now try
+to find the most crucial line describing the problem. Then search the internet
+for the most important and non-generic section of that line (say 4 to 8 words);
+avoid or remove anything that looks remotely system-specific, like your username
+or local path names like ``/home/username/linux/``. First try your regular
+internet search engine with that string, afterwards search Linux kernel mailing
+lists via `lore.kernel.org/all/ <https://lore.kernel.org/all/>`_.
+
+This most of the time will find something that will explain what is wrong; quite
+often one of the hits will provide a solution for your problem, too. If you
+do not find anything that matches your problem, try again from a different angle
+by modifying your search terms or using another line from the error messages.
+
+In the end, most trouble you run into has likely been encountered and reported
+by others already. That includes issues where the cause is not your system, but
+lies in the code. If you run into one of those, you might thus find a
+solution (e.g. a patch) or workaround for your problem, too.
+
+Package your kernel up
+~~~~~~~~~~~~~~~~~~~~~~
+
+The step-by-step guide uses the default make targets (e.g. 'bzImage' and
+'modules' on x86) to build the image and the modules of your kernel, which later
+steps of the guide then install. You can also build everything and directly
+package it up by using one of the following targets:
+
+ * ``make -j $(nproc --all) bindeb-pkg`` to generate a deb package
+
+ * ``make -j $(nproc --all) binrpm-pkg`` to generate a rpm package
+
+ * ``make -j $(nproc --all) tarbz2-pkg`` to generate a bz2 compressed tarball
+
+This is just a selection of available make targets for this purpose; see
+``make help`` for others. You can also use these targets after running
+``make -j $(nproc --all)``, as they will pick up everything already built.
+
+If you employ the targets to generate deb or rpm packages, ignore the
+step-by-step guide's instructions on installing and removing your kernel;
+instead install and remove the packages using the package utility for the format
+(e.g. dpkg and rpm) or a package management utility built on top of them (apt,
+aptitude, dnf/yum, zypper, ...). Be aware that the packages generated using
+these two make targets are designed to work on various distributions utilizing
+those formats; they thus will sometimes behave differently than your
+distribution's kernel packages.
+
+[:ref:`back to step-by-step guide <build_sbs>`]
+
+.. _install:
+
+Install your kernel
+-------------------
+
+ *Now install your kernel* [:ref:`... <install_sbs>`]
+
+What you need to do after executing the command in the step-by-step guide
+depends on the existence and the implementation of an ``installkernel``
+executable. Many commodity Linux distributions ship such a kernel installer in
+``/sbin/`` that does everything needed, hence there is nothing left for you
+except rebooting. But some distributions contain an installkernel that does
+only part of the job -- and a few lack it completely and leave all the work to
+you.
+
+If ``installkernel`` is found, the kernel's build system will delegate the
+actual installation of your kernel's image and related files to this executable.
+On almost all Linux distributions it will store the image as '/boot/vmlinuz-
+<your kernel's release name>' and put a 'System.map-<your kernel's release
+name>' alongside it. Your kernel will thus be installed in parallel to any
+existing ones, unless you already have one with exactly the same release name.
+
+Installkernel on many distributions will afterwards generate an 'initramfs'
+(often also called 'initrd'), which commodity distributions rely on for booting;
+hence be sure to keep the order of the two make targets used in the step-by-step
+guide, as things will go sideways if you install your kernel's image before its
+modules. Often installkernel will then add your kernel to the bootloader
+configuration, too. You have to take care of one or both of these tasks
+yourself, if your distribution's installkernel doesn't handle them.
+
+A few distributions like Arch Linux and its derivatives totally lack an
+installkernel executable. On those just install the modules using the kernel's
+build system and then install the image and the System.map file manually::
+
+ sudo make modules_install
+ sudo install -m 0600 $(make -s image_name) /boot/vmlinuz-$(make -s kernelrelease)
+ sudo install -m 0600 System.map /boot/System.map-$(make -s kernelrelease)
+
+If your distribution boots with the help of an initramfs, now generate one for
+your kernel using the tools your distribution provides for this process.
+Afterwards add your kernel to your bootloader configuration and reboot.
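+
+On Arch Linux, for example, that could look like this (a sketch; other
+distributions provide tools like dracut or update-initramfs instead)::
+
+ sudo mkinitcpio -k $(make -s kernelrelease) \
+ -g /boot/initramfs-$(make -s kernelrelease).img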
+
+[:ref:`back to step-by-step guide <install_sbs>`]
+
+.. _another:
+
+Another round later
+-------------------
+
+ *To later build another kernel you need similar, but sometimes slightly
+ different commands* [:ref:`... <another_sbs>`]
+
+The process to build later kernels is similar, but at some points slightly
+different. You for example do not want to use 'localmodconfig' for subsequent
+kernel builds, as you already created a trimmed down configuration you want to
+use from now on. Hence instead just use ``oldconfig`` or ``olddefconfig`` to
+adjust your build configurations to the needs of the kernel version you are
+about to build.
+
+If you created a shallow clone with git, remember what the :ref:`section that
+explained the setup <sources>` described in more detail: you need to use a
+slightly different ``git fetch`` command, and when switching to another series
+you need to add an additional remote branch.
+
+[:ref:`back to step-by-step guide <another_sbs>`]
+
+.. _uninstall:
+
+Uninstall the kernel later
+--------------------------
+
+ *All parts of your installed kernel are identifiable by its release name and
+ thus easy to remove later.* [:ref:`... <uninstall_sbs>`]
+
+Do not worry that installing your kernel manually and thus bypassing your
+distribution's packaging system will totally mess up your machine: all parts of
+your kernel are easy to remove later, as files are stored in two places only and
+normally identifiable by the kernel's release name.
+
+One of the two places is a directory in /lib/modules/, which holds the modules
+for each installed kernel. This directory is named after the kernel's release
+name; hence, to remove all modules for one of your kernels, simply remove its
+modules directory in /lib/modules/.
+
+The other place is /boot/, where typically one to five files will be placed
+during installation of a kernel. All of them usually contain the release name in
+their file name, but how many files and their name depends somewhat on your
+distribution's installkernel executable (:ref:`see above <install>`) and its
+initramfs generator. On some distributions the ``kernel-install`` command
+mentioned in the step-by-step guide will remove all of these files for you --
+and the entry for your kernel in the bootloader configuration at the same time,
+too. On others you have to take care of these steps yourself. The following
+command should interactively remove the two main files of a kernel with the
+release name '6.0.1-foobar'::
+
+ rm -i /boot/{System.map,vmlinuz}-6.0.1-foobar
+
+Now remove the corresponding initramfs, which often will be called something like
+``/boot/initramfs-6.0.1-foobar.img`` or ``/boot/initrd.img-6.0.1-foobar``.
+Afterwards check for other files in /boot/ that have '6.0.1-foobar' in their
+name and delete them as well. Now remove the kernel from your bootloader's
+configuration.
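+
+The removal of the initramfs mentioned above might for example look like this
+(a sketch, as the exact file name depends on your initramfs generator)::
+
+ rm -i /boot/initramfs-6.0.1-foobar.img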
+
+Note, be very careful with wildcards like '*' when deleting files or directories
+for kernels manually: you might accidentally remove files of a 6.0.11 kernel
+when all you want is to remove 6.0 or 6.0.1.
+
+[:ref:`back to step-by-step guide <uninstall_sbs>`]
+
+.. _faq:
+
+FAQ
+===
+
+Why does this 'how-to' not work on my system?
+---------------------------------------------
+
+As initially stated, this guide is 'designed to cover everything typically
+needed [to build a kernel] on mainstream Linux distributions running on
+commodity PC or server hardware'. The outlined approach despite this should work
+on many other setups as well. But trying to cover every possible use-case in one
+guide would defeat its purpose, as without such a focus you would need dozens or
+hundreds of constructs along the lines of 'in case you are having <insert
+machine or distro>, you at this point have to do <this and that>
+<instead|additionally>'. Each of which would make the text longer, more
+complicated, and harder to follow.
+
+That being said: this of course is a balancing act. Hence, if you think an
+additional use-case is worth describing, suggest it to the maintainers of this
+document, as :ref:`described above <submit_improvements>`.
+
+
+..
+ end-of-content
+..
+ This document is maintained by Thorsten Leemhuis <linux@leemhuis.info>. If
+ you spot a typo or small mistake, feel free to let him know directly and
+ he'll fix it. You are free to do the same in a mostly informal way if you
+ want to contribute changes to the text -- but for copyright reasons please CC
+ linux-doc@vger.kernel.org and 'sign-off' your contribution as
+ Documentation/process/submitting-patches.rst explains in the section 'Sign
+ your work - the Developer's Certificate of Origin'.
+..
+ This text is available under GPL-2.0+ or CC-BY-4.0, as stated at the top
+ of the file. If you want to distribute this text under CC-BY-4.0 only,
+ please use 'The Linux kernel development community' for author attribution
+ and link this as source:
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/quickly-build-trimmed-linux.rst
+..
+ Note: Only the content of this RST file as found in the Linux kernel sources
+ is available under CC-BY-4.0, as versions of this text that were processed
+ (for example by the kernel's build system) might contain content taken from
+ files which use a more restrictive license.
+
diff --git a/Documentation/admin-guide/ramoops.rst b/Documentation/admin-guide/ramoops.rst
index 8f107d8c9261..e9f85142182d 100644
--- a/Documentation/admin-guide/ramoops.rst
+++ b/Documentation/admin-guide/ramoops.rst
@@ -69,7 +69,7 @@ Setting the ramoops parameters can be done in several different manners:
mem=128M ramoops.mem_address=0x8000000 ramoops.ecc=1
B. Use Device Tree bindings, as described in
- ``Documentation/devicetree/bindings/reserved-memory/ramoops.txt``.
+ ``Documentation/devicetree/bindings/reserved-memory/ramoops.yaml``.
For example::
reserved-memory {
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst
index 7b481b2a368e..8e03751d126d 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -199,7 +199,7 @@ Architecture (MCA)\ [#f3]_.
mode).
.. [#f3] For more details about the Machine Check Architecture (MCA),
- please read Documentation/x86/x86_64/machinecheck.rst at the Kernel tree.
+ please read Documentation/arch/x86/x86_64/machinecheck.rst at the Kernel tree.
EDAC - Error Detection And Correction
*************************************
diff --git a/Documentation/admin-guide/reporting-issues.rst b/Documentation/admin-guide/reporting-issues.rst
index d7ac13f789cc..2fd5a030235a 100644
--- a/Documentation/admin-guide/reporting-issues.rst
+++ b/Documentation/admin-guide/reporting-issues.rst
@@ -1,14 +1,5 @@
.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0)
-..
- If you want to distribute this text under CC-BY-4.0 only, please use 'The
- Linux kernel developers' for author attribution and link this as source:
- https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/reporting-issues.rst
-..
- Note: Only the content of this RST file as found in the Linux kernel sources
- is available under CC-BY-4.0, as versions of this text that were processed
- (for example by the kernel's build system) might contain content taken from
- files which use a more restrictive license.
-
+.. See the bottom of this file for additional redistribution information.
Reporting issues
++++++++++++++++
@@ -395,22 +386,16 @@ fixed as soon as possible, hence there are 'issues of high priority' that get
handled slightly differently in the reporting process. Three type of cases
qualify: regressions, security issues, and really severe problems.
-You deal with a 'regression' if something that worked with an older version of
-the Linux kernel does not work with a newer one or somehow works worse with it.
-It thus is a regression when a WiFi driver that did a fine job with Linux 5.7
-somehow misbehaves with 5.8 or doesn't work at all. It's also a regression if
-an application shows erratic behavior with a newer kernel, which might happen
-due to incompatible changes in the interface between the kernel and the
-userland (like procfs and sysfs). Significantly reduced performance or
-increased power consumption also qualify as regression. But keep in mind: the
-new kernel needs to be built with a configuration that is similar to the one
-from the old kernel (see below how to achieve that). That's because the kernel
-developers sometimes can not avoid incompatibilities when implementing new
-features; but to avoid regressions such features have to be enabled explicitly
-during build time configuration.
+You deal with a regression if some application or practical use case running
+fine with one Linux kernel works worse or not at all with a newer version
+compiled using a similar configuration. The document
+Documentation/admin-guide/reporting-regressions.rst explains this in more
+detail. It also provides a good deal of other information about regressions you
+might want to be aware of; for example, it explains how to add your issue to the
+list of tracked regressions, to ensure it won't fall through the cracks.
What qualifies as security issue is left to your judgment. Consider reading
-'Documentation/admin-guide/security-bugs.rst' before proceeding, as it
+Documentation/process/security-bugs.rst before proceeding, as it
provides additional details how to best handle security issues.
An issue is a 'really severe problem' when something totally unacceptably bad
@@ -517,7 +502,7 @@ line starting with 'CPU:'. It should end with 'Not tainted' if the kernel was
not tainted when it noticed the problem; it was tainted if you see 'Tainted:'
followed by a few spaces and some letters.
-If your kernel is tainted, study 'Documentation/admin-guide/tainted-kernels.rst'
+If your kernel is tainted, study Documentation/admin-guide/tainted-kernels.rst
to find out why. Try to eliminate the reason. Often it's caused by one these
three things:
@@ -1043,7 +1028,7 @@ down the culprit, as maintainers often won't have the time or setup at hand to
reproduce it themselves.
To find the change there is a process called 'bisection' which the document
-'Documentation/admin-guide/bug-bisect.rst' describes in detail. That process
+Documentation/admin-guide/bug-bisect.rst describes in detail. That process
will often require you to build about ten to twenty kernel images, trying to
reproduce the issue with each of them before building the next. Yes, that takes
some time, but don't worry, it works a lot quicker than most people assume.
@@ -1073,10 +1058,11 @@ When dealing with regressions make sure the issue you face is really caused by
the kernel and not by something else, as outlined above already.
In the whole process keep in mind: an issue only qualifies as regression if the
-older and the newer kernel got built with a similar configuration. The best way
-to archive this: copy the configuration file (``.config``) from the old working
-kernel freshly to each newer kernel version you try. Afterwards run ``make
-olddefconfig`` to adjust it for the needs of the new version.
+older and the newer kernel got built with a similar configuration. This can be
+achieved by using ``make olddefconfig``, as explained in more detail by
+Documentation/admin-guide/reporting-regressions.rst; that document also
+provides a good deal of other information about regressions you might want to be
+aware of.
Write and send the report
@@ -1283,7 +1269,7 @@ them when sending the report by mail. If you filed it in a bug tracker, forward
the report's text to these addresses; but on top of it put a small note where
you mention that you filed it with a link to the ticket.
-See 'Documentation/admin-guide/security-bugs.rst' for more information.
+See Documentation/process/security-bugs.rst for more information.
Duties after the report went out
@@ -1571,7 +1557,7 @@ Once your report is out your might get asked to do a proper one, as it allows to
pinpoint the exact change that causes the issue (which then can easily get
reverted to fix the issue quickly). Hence consider to do a proper bisection
right away if time permits. See the section 'Special care for regressions' and
-the document 'Documentation/admin-guide/bug-bisect.rst' for details how to
+the document Documentation/admin-guide/bug-bisect.rst for details how to
perform one. In case of a successful bisection add the author of the culprit to
the recipients; also CC everyone in the signed-off-by chain, which you find at
the end of its commit message.
@@ -1594,7 +1580,7 @@ Some fixes are too complex
Even small and seemingly obvious code-changes sometimes introduce new and
totally unexpected problems. The maintainers of the stable and longterm kernels
are very aware of that and thus only apply changes to these kernels that are
-within rules outlined in 'Documentation/process/stable-kernel-rules.rst'.
+within rules outlined in Documentation/process/stable-kernel-rules.rst.
Complex or risky changes for example do not qualify and thus only get applied
to mainline. Other fixes are easy to get backported to the newest stable and
@@ -1756,10 +1742,23 @@ art will lay some groundwork to improve the situation over time.
..
- This text is maintained by Thorsten Leemhuis <linux@leemhuis.info>. If you
- spot a typo or small mistake, feel free to let him know directly and he'll
- fix it. You are free to do the same in a mostly informal way if you want
- to contribute changes to the text, but for copyright reasons please CC
+ end-of-content
+..
+ This document is maintained by Thorsten Leemhuis <linux@leemhuis.info>. If
+ you spot a typo or small mistake, feel free to let him know directly and
+ he'll fix it. You are free to do the same in a mostly informal way if you
+ want to contribute changes to the text, but for copyright reasons please CC
linux-doc@vger.kernel.org and "sign-off" your contribution as
Documentation/process/submitting-patches.rst outlines in the section "Sign
your work - the Developer's Certificate of Origin".
+..
+ This text is available under GPL-2.0+ or CC-BY-4.0, as stated at the top
+ of the file. If you want to distribute this text under CC-BY-4.0 only,
+ please use "The Linux kernel developers" for author attribution and link
+ this as source:
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/reporting-issues.rst
+..
+ Note: Only the content of this RST file as found in the Linux kernel sources
+ is available under CC-BY-4.0, as versions of this text that were processed
+ (for example by the kernel's build system) might contain content taken from
+ files which use a more restrictive license.
diff --git a/Documentation/admin-guide/reporting-regressions.rst b/Documentation/admin-guide/reporting-regressions.rst
new file mode 100644
index 000000000000..d8adccdae23f
--- /dev/null
+++ b/Documentation/admin-guide/reporting-regressions.rst
@@ -0,0 +1,451 @@
+.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0)
+.. [see the bottom of this file for redistribution information]
+
+Reporting regressions
++++++++++++++++++++++
+
+"*We don't cause regressions*" is the first rule of Linux kernel development;
+Linux founder and lead developer Linus Torvalds established it himself and
+ensures it's obeyed.
+
+This document describes what the rule means for users and how the Linux kernel's
+development model ensures to address all reported regressions; aspects relevant
+for kernel developers are left to Documentation/process/handling-regressions.rst.
+
+
+The important bits (aka "TL;DR")
+================================
+
+#. It's a regression if something running fine with one Linux kernel works worse
+ or not at all with a newer version. Note, the newer kernel has to be compiled
+ using a similar configuration; the explanations below describe this and
+ other fine print in more detail.
+
+#. Report your issue as outlined in Documentation/admin-guide/reporting-issues.rst;
+ it already covers all aspects important for regressions, which are repeated
+ below for convenience. Two of them are important: start your report's subject
+ with "[REGRESSION]" and CC or forward it to `the regression mailing list
+ <https://lore.kernel.org/regressions/>`_ (regressions@lists.linux.dev).
+
+#. Optional, but recommended: when sending or forwarding your report, make the
+ Linux kernel regression tracking bot "regzbot" track the issue by specifying
+ when the regression started like this::
+
+ #regzbot introduced v5.13..v5.14-rc1
+
+
+All the details on Linux kernel regressions relevant for users
+==============================================================
+
+
+The important basics
+--------------------
+
+
+What is a "regression" and what is the "no regressions rule"?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It's a regression if some application or practical use case running fine with
+one Linux kernel works worse or not at all with a newer version compiled using a
+similar configuration. The "no regressions rule" forbids this from happening; if
+it happens by accident, the developers that caused it are expected to quickly
+fix the issue.
+
+It thus is a regression when a WiFi driver from Linux 5.13 works fine, but with
+5.14 doesn't work at all, works significantly slower, or misbehaves somehow.
+It's also a regression if a perfectly working application suddenly shows erratic
+behavior with a newer kernel version; such issues can be caused by changes in
+procfs, sysfs, or one of the many other interfaces Linux provides to userland
+software. But keep in mind, as mentioned earlier: 5.14 in this example needs to
+be built from a configuration similar to the one from 5.13. This can be achieved
+using ``make olddefconfig``, as explained in more detail below.
+
+Note the "practical use case" in the first sentence of this section: developers
+despite the "no regressions" rule are free to change any aspect of the kernel
+and even APIs or ABIs to userland, as long as no existing application or use
+case breaks.
+
+Also be aware that the "no regressions" rule covers only interfaces the kernel
+provides to the userland. It thus does not apply to kernel-internal interfaces
+like the module API, which some externally developed drivers use to hook into
+the kernel.
+
+How do I report a regression?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Just report the issue as outlined in
+Documentation/admin-guide/reporting-issues.rst; it already describes the
+important points. The following aspects outlined there are especially relevant
+for regressions:
+
+ * When checking for existing reports to join, also search the `archives of the
+ Linux regressions mailing list <https://lore.kernel.org/regressions/>`_ and
+ `regzbot's web-interface <https://linux-regtracking.leemhuis.info/regzbot/>`_.
+
+ * Start your report's subject with "[REGRESSION]".
+
+ * In your report, clearly mention the last kernel version that worked fine and
+ the first broken one. Ideally try to find the exact change causing the
+ regression using a bisection, as explained below in more detail.
+
+ * Remember to let the Linux regressions mailing list
+ (regressions@lists.linux.dev) know about your report:
+
+ * If you report the regression by mail, CC the regressions list.
+
+ * If you report your regression to some bug tracker, forward the submitted
+ report by mail to the regressions list while CCing the maintainer and the
+ mailing list for the subsystem in question.
+
+ If it's a regression within a stable or longterm series (e.g.
+ v5.15.3..v5.15.5), remember to CC the `Linux stable mailing list
+ <https://lore.kernel.org/stable/>`_ (stable@vger.kernel.org).
+
+ In case you performed a successful bisection, CC everyone the culprit's
+ commit message mentions in lines starting with "Signed-off-by:".
+
+When CCing or forwarding your report to the list, consider directly telling the
+aforementioned Linux kernel regression tracking bot about your report. To do
+that, include a paragraph like this in your mail::
+
+ #regzbot introduced: v5.13..v5.14-rc1
+
+Regzbot will then consider your mail a report for a regression introduced in the
+specified version range. In the above case Linux v5.13 still worked fine and Linux
+v5.14-rc1 was the first version where you encountered the issue. If you
+performed a bisection to find the commit that caused the regression, specify the
+culprit's commit-id instead::
+
+ #regzbot introduced: 1f2e3d4c5d
+
+Placing such a "regzbot command" is in your interest, as it will ensure the
+report won't fall through the cracks unnoticed. If you omit this, the Linux
+kernel's regressions tracker will take care of telling regzbot about your
+regression, as long as you send a copy to the regressions mailing lists. But the
+regression tracker is just one human who sometimes has to rest or occasionally
+might even enjoy some time away from computers (as crazy as that might sound).
+Relying on this person thus will result in an unnecessary delay before the
+regression becomes mentioned `on the list of tracked and unresolved Linux
+kernel regressions <https://linux-regtracking.leemhuis.info/regzbot/>`_ and the
+weekly regression reports sent by regzbot. Such delays can result in Linus
+Torvalds being unaware of important regressions when deciding between "continue
+development or call this finished and release the final?".
+
+Are really all regressions fixed?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Nearly all of them are, as long as the change causing the regression (the
+"culprit commit") is reliably identified. Some regressions can be fixed without
+this, but often it's required.
+
+Who needs to find the root cause of a regression?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Developers of the affected code area should try to locate the culprit on their
+own. But doing so with reasonable effort is often impossible for them, as quite
+a lot of issues only occur in a particular environment outside the developer's
+reach -- for example, a specific hardware platform, firmware, Linux distro,
+system's configuration, or application. That's why in the end it's often up to
+the reporter to locate the culprit commit; sometimes users might even need to
+run additional tests afterwards to pinpoint the exact root cause. Developers
+should offer advice and reasonably help where they can, to make this process
+relatively easy and achievable for typical users.
+
+How can I find the culprit?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Perform a bisection, as roughly outlined in
+Documentation/admin-guide/reporting-issues.rst and described in more detail by
+Documentation/admin-guide/bug-bisect.rst. It might sound like a lot of work, but
+in many cases it finds the culprit relatively quickly. If it's hard or
+time-consuming to reliably reproduce the issue, consider teaming up with other
+affected users to narrow down the search range together.
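+
+The rough shape of such a bisection is sketched below; note that the version
+tags and the number of rounds are only examples, yours will differ::
+
+    git bisect start
+    git bisect good v5.13       # last kernel version known to work
+    git bisect bad v5.14-rc1    # first version known to be broken
+    # now build, boot, and test the checked-out kernel, then tell git
+    # the outcome with one of the following two commands:
+    git bisect good
+    git bisect bad
+    # repeat building and testing until git prints the culprit commit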
+
+Who can I ask for advice when it comes to regressions?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Send a mail to the regressions mailing list (regressions@lists.linux.dev) while
+CCing the Linux kernel's regression tracker (regressions@leemhuis.info); if the
+issue might better be dealt with in private, feel free to omit the list.
+
+
+Additional details about regressions
+------------------------------------
+
+
+What is the goal of the "no regressions rule"?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Users should feel safe when updating kernel versions and not have to worry
+something might break. It is in the kernel developers' interest to make
+updating attractive: they don't want users to stay on stable or longterm Linux
+series that are either abandoned or more than one and a half years old. That's
+in everybody's interest, as `those series might have known bugs, security
+issues, or other problematic aspects already fixed in later versions
+<http://www.kroah.com/log/blog/2018/08/24/what-stable-kernel-should-i-use/>`_.
+Additionally, the kernel developers want to make it simple and appealing for
+users to test the latest pre-release or regular release. That's also in
+everybody's interest, as it's a lot easier to track down and fix problems, if
+they are reported shortly after being introduced.
+
+Is the "no regressions" rule really adhered in practice?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It's taken really seriously, as can be seen by many mailing list posts from
+Linux creator and lead developer Linus Torvalds, some of which are quoted in
+Documentation/process/handling-regressions.rst.
+
+Exceptions to this rule are extremely rare; in the past developers almost always
+turned out to be wrong when they assumed a particular situation warranted an
+exception.
+
+Who ensures the "no regressions" rule is actually followed?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The subsystem maintainers should take care of that; they are watched and
+supported by the tree maintainers -- e.g. Linus Torvalds for mainline and
+Greg Kroah-Hartman et al. for various stable/longterm series.
+
+All of them are helped by people trying to ensure no regression report falls
+through the cracks. One of them is Thorsten Leemhuis, who's currently acting as
+the Linux kernel's "regressions tracker"; to facilitate this work he relies on
+regzbot, the Linux kernel regression tracking bot. That's why you want to bring
+your report on the radar of these people by CCing or forwarding each report to
+the regressions mailing list, ideally with a "regzbot command" in your mail to
+get it tracked immediately.
+
+How quickly are regressions normally fixed?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Developers should fix any reported regression as quickly as possible, to provide
+affected users with a solution in a timely manner and prevent more users from
+running into the issue; nevertheless developers need to take enough time and
+care to ensure regression fixes do not cause additional damage.
+
+The answer thus depends on various factors like the impact of a regression, its
+age, or the Linux series in which it occurs. In the end though, most regressions
+should be fixed within two weeks.
+
+Is it a regression, if the issue can be avoided by updating some software?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Almost always: yes. If a developer tells you otherwise, ask the regression
+tracker for advice as outlined above.
+
+Is it a regression, if a newer kernel works slower or consumes more energy?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Yes, but the difference has to be significant. A five percent slow-down in a
+micro-benchmark thus is unlikely to qualify as a regression, unless it also
+influences the results of a broad benchmark by more than one percent. If in
+doubt, ask for advice.
+
+Is it a regression, if an external kernel module breaks when updating Linux?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+No, as the "no regression" rule is about interfaces and services the Linux
+kernel provides to the userland. It thus does not cover building or running
+externally developed kernel modules, as they run in kernel-space and hook into
+the kernel using internal interfaces that change occasionally.
+
+How are regressions handled that are caused by security fixes?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In extremely rare situations security issues can't be fixed without causing
+regressions; such fixes are accepted anyway, as they are the lesser evil in the
+end. Luckily this dilemma almost always can be avoided, as key developers for the
+affected area and often Linus Torvalds himself try very hard to fix security
+issues without causing regressions.
+
+If you nevertheless face such a case, check the mailing list archives to see if people
+tried their best to avoid the regression. If not, report it; if in doubt, ask
+for advice as outlined above.
+
+What happens if fixing a regression is impossible without causing another?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sadly these things happen, but luckily not very often; if they occur, expert
+developers of the affected code area should look into the issue to find a fix
+that avoids regressions or at least their impact. If you run into such a
+situation, do what was outlined already for regressions caused by security
+fixes: check earlier discussions if people already tried their best and ask for
+advice if in doubt.
+
+A quick note while at it: these situations could be avoided if people regularly
+gave mainline pre-releases (say v5.15-rc1 or -rc3) from each development cycle
+a test run. This is best explained by imagining a change
+integrated between Linux v5.14 and v5.15-rc1 which causes a regression, but at
+the same time is a hard requirement for some other improvement applied for
+5.15-rc1. All these changes often can simply be reverted and the regression thus
+solved, if someone finds and reports it before 5.15 is released. A few days or
+weeks later this solution can become impossible, as some software might have
+started to rely on aspects introduced by one of the follow-up changes: reverting
+all changes would then cause a regression for users of said software and thus is
+out of the question.
+
+Is it a regression, if some feature I relied on was removed months ago?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is, but often it's hard to fix such regressions due to the aspects outlined
+in the previous section. They hence need to be dealt with on a case-by-case
+basis. This is another reason why it's in everybody's interest to regularly test
+mainline pre-releases.
+
+Does the "no regression" rule apply if I seem to be the only affected person?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It does, but only for practical usage: the Linux developers want to be free to
+remove support for hardware that nowadays can only be found in attics and museums.
+
+Note, sometimes regressions can't be avoided to make progress -- and the latter
+is needed to prevent Linux from stagnating. Hence, if only very few users seem
+to be affected by a regression, it might for the greater good be in their and
+everyone else's interest to let things pass. Especially if there is an
+easy way to circumvent the regression somehow, for example by updating some
+software or using a kernel parameter created just for this purpose.
+
+Does the regression rule apply for code in the staging tree as well?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Not according to the `help text for the configuration option covering all
+staging code <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/Kconfig>`_,
+which since its early days states::
+
+ Please note that these drivers are under heavy development, may or
+ may not work, and may contain userspace interfaces that most likely
+ will be changed in the near future.
+
+The staging developers nevertheless often adhere to the "no regressions" rule,
+but sometimes bend it to make progress. That's for example why some users had to
+deal with (often negligible) regressions when a WiFi driver from the staging
+tree was replaced by a totally different one written from scratch.
+
+Why do later versions have to be "compiled with a similar configuration"?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Because the Linux kernel developers sometimes integrate changes known to cause
+regressions, but make them optional and disable them in the kernel's default
+configuration. This trick allows progress, as the "no regressions" rule
+otherwise would lead to stagnation.
+
+Consider for example a new security feature blocking access to some kernel
+interfaces often abused by malware, which at the same time are required to run a
+few rarely used applications. The outlined approach makes both camps happy:
+people using these applications can leave the new security feature off, while
+everyone else can enable it without running into trouble.
+
+How to create a configuration similar to the one of an older kernel?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Start your machine with a known-good kernel and configure the newer Linux
+version with ``make olddefconfig``. This makes the kernel's build scripts pick
+up the configuration file (the ".config" file) from the running kernel as the
+base for the new one you are about to compile; afterwards they set all new
+configuration options to their default value, which should disable new features
+that might cause regressions.
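+
+A minimal sketch of this procedure; it assumes your distribution ships the
+running kernel's configuration as ``/boot/config-$(uname -r)`` or exposes it
+via ``/proc/config.gz`` (which requires CONFIG_IKCONFIG_PROC), as many do::
+
+    # run this in the source tree of the newer kernel version
+    cp "/boot/config-$(uname -r)" .config
+    # alternatively: zcat /proc/config.gz > .config
+    make olddefconfig    # set all new options to their default values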
+
+Can I report a regression I found with pre-compiled vanilla kernels?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You need to ensure the newer kernel was compiled with a similar configuration
+file as the older one (see above), as those that built them might have enabled
+some known-to-be-incompatible feature for the newer kernel. If in doubt, report
+the matter to the kernel's provider and ask for advice.
+
+
+More about regression tracking with "regzbot"
+---------------------------------------------
+
+What is regression tracking and why should I care about it?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Rules like "no regressions" need someone to ensure they are followed, otherwise
+they are broken either accidentally or on purpose. History has shown this to be
+true for Linux kernel development as well. That's why Thorsten Leemhuis, the
+Linux Kernel's regression tracker, and a few other people try to ensure all
+regressions are fixed by keeping an eye on them until they are resolved. None of
+them are paid for this, which is why the work is done on a best-effort basis.
+
+Why and how are Linux kernel regressions tracked using a bot?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Tracking regressions completely manually has proven to be quite hard due to the
+distributed and loosely structured nature of the Linux kernel development process.
+That's why the Linux kernel's regression tracker developed regzbot to facilitate
+the work, with the long term goal to automate regression tracking as much as
+possible for everyone involved.
+
+Regzbot works by watching for replies to reports of tracked regressions.
+Additionally, it's looking out for posted or committed patches referencing such
+reports with "Link:" tags; replies to such patch postings are tracked as well.
+Combined, this data provides good insights into the current state of the fixing
+process.
+
+How to see which regressions regzbot tracks currently?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Check out `regzbot's web-interface <https://linux-regtracking.leemhuis.info/regzbot/>`_.
+
+What kind of issues are supposed to be tracked by regzbot?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The bot is meant to track regressions, hence please don't involve regzbot for
+regular issues. But the Linux kernel's regression tracker is fine with you
+involving regzbot to track severe issues, like reports about hangs, corrupted
+data, or internal errors (Panic, Oops, BUG(), warning, ...).
+
+How to change aspects of a tracked regression?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By using a 'regzbot command' in a direct or indirect reply to the mail with the
+report. The easiest way to do that: find the report in your "Sent" folder or the
+mailing list archive and reply to it using your mailer's "Reply-all" function.
+In that mail, use one of the following commands in a stand-alone paragraph (IOW:
+use blank lines to separate one or multiple of these commands from the rest of
+the mail's text).
+
+ * Update when the regression started to happen, for example after performing a
+ bisection::
+
+ #regzbot introduced: 1f2e3d4c5d
+
+ * Set or update the title::
+
+ #regzbot title: foo
+
+ * Monitor a discussion or bugzilla.kernel.org ticket where additional aspects of
+ the issue or a fix are discussed::
+
+ #regzbot monitor: https://lore.kernel.org/r/30th.anniversary.repost@klaava.Helsinki.FI/
+ #regzbot monitor: https://bugzilla.kernel.org/show_bug.cgi?id=123456789
+
+ * Point to a place with further details of interest, like a mailing list post
+ or a ticket in a bug tracker that is slightly related, but about a different
+ topic::
+
+ #regzbot link: https://bugzilla.kernel.org/show_bug.cgi?id=123456789
+
+ * Mark a regression as invalid::
+
+ #regzbot invalid: wasn't a regression, problem has always existed
+
+Regzbot supports a few other commands primarily used by developers or people
+tracking regressions. They and more details about the aforementioned regzbot
+commands can be found in the `getting started guide
+<https://gitlab.com/knurd42/regzbot/-/blob/main/docs/getting_started.md>`_ and
+the `reference documentation <https://gitlab.com/knurd42/regzbot/-/blob/main/docs/reference.md>`_
+for regzbot.
+
+..
+ end-of-content
+..
+ This text is available under GPL-2.0+ or CC-BY-4.0, as stated at the top
+ of the file. If you want to distribute this text under CC-BY-4.0 only,
+ please use "The Linux kernel developers" for author attribution and link
+ this as source:
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/reporting-regressions.rst
+..
+ Note: Only the content of this RST file as found in the Linux kernel sources
+ is available under CC-BY-4.0, as versions of this text that were processed
+ (for example by the kernel's build system) might contain content taken from
+ files which use a more restrictive license.
diff --git a/Documentation/admin-guide/serial-console.rst b/Documentation/admin-guide/serial-console.rst
index 58b32832e50a..8c8b94e54e26 100644
--- a/Documentation/admin-guide/serial-console.rst
+++ b/Documentation/admin-guide/serial-console.rst
@@ -33,8 +33,11 @@ The format of this option is::
9600n8. The maximum baudrate is 115200.
You can specify multiple console= options on the kernel command line.
-Output will appear on all of them. The last device will be used when
-you open ``/dev/console``. So, for example::
+
+The behavior is well defined when each device type is mentioned only once.
+In this case, the output will appear on all requested consoles, and
+the last device will be used when you open ``/dev/console``.
+So, for example::
console=ttyS1,9600 console=tty0
@@ -42,7 +45,34 @@ defines that opening ``/dev/console`` will get you the current foreground
virtual console, and kernel messages will appear on both the VGA
console and the 2nd serial port (ttyS1 or COM2) at 9600 baud.
-Note that you can only define one console per device type (serial, video).
+The behavior is more complicated when the same device type is defined multiple
+times. In this case, there are the following two rules:
+
+1. The output will appear only on the first device of each defined type.
+
+2. ``/dev/console`` will be associated with the first registered device.
+ The registration order depends on how the kernel initializes the various
+ subsystems.
+
+ This rule also applies when the last console= parameter is not used
+ for other reasons, for example because of a typo or because
+ the hardware is not available.
+
+The result might be surprising. For example, the following two command
+lines have the same result::
+
+ console=ttyS1,9600 console=tty0 console=tty1
+ console=tty0 console=ttyS1,9600 console=tty1
+
+The kernel messages are printed only on ``tty0`` and ``ttyS1``, and
+``/dev/console`` gets associated with ``tty0``. This is because the kernel
+tries to register graphical consoles before serial ones, matching the
+default behavior used when no console device is specified,
+see below.
+
+Note that the last ``console=tty1`` parameter still makes a difference:
+the kernel command line is also used by systemd, which would use the last
+defined ``tty1`` as the login console.
If no console device is specified, the first device found capable of
acting as a system console will be used. At this time, the system
diff --git a/Documentation/admin-guide/spkguide.txt b/Documentation/admin-guide/spkguide.txt
index 977ab3f5a0a8..74ea7f391942 100644
--- a/Documentation/admin-guide/spkguide.txt
+++ b/Documentation/admin-guide/spkguide.txt
@@ -543,7 +543,7 @@ As mentioned earlier, Speakup can either be completely compiled into the
kernel, with the exception of the help module, or it can be compiled as
a series of modules. When compiled as modules, Speakup will only be
able to speak some of the bootup messages if your system administrator
-has configured the system to load the modules at boo time. The modules
+has configured the system to load the modules at boot time. The modules
can be loaded after the file systems have been checked and mounted, or
from an initrd. There is a third possibility. Speakup can be compiled
with some components built into the kernel, and others as modules. As
@@ -1105,8 +1105,8 @@ speakup load
Alternatively, you can add the above line to your file
~/.bashrc or ~/.bash_profile.
-If your system administrator ran himself the script, all the users will be able
-to change from English to the language choosed by root and do directly
+If your system administrator himself ran the script, all the users will be able
+to change from English to the language chosen by root and do directly
speakupconf load (or add this to the ~/.bashrc or
~/.bash_profile file). If there are several languages to handle, the
administrator (or every user) will have to run the first steps until speakupconf
diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst
index 60314953c728..e3cfffef5a63 100644
--- a/Documentation/admin-guide/syscall-user-dispatch.rst
+++ b/Documentation/admin-guide/syscall-user-dispatch.rst
@@ -73,6 +73,10 @@ thread-wide, without the need to invoke the kernel directly. selector
can be set to SYSCALL_DISPATCH_FILTER_ALLOW or SYSCALL_DISPATCH_FILTER_BLOCK.
Any other value should terminate the program with a SIGSYS.
+Additionally, a task's syscall user dispatch configuration can be peeked
+and poked via the PTRACE_(GET|SET)_SYSCALL_USER_DISPATCH_CONFIG ptrace
+requests. This is useful for checkpoint/restart software.
+
Security Notes
--------------
diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
index 2a501c9ddc55..a321b84eccaa 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -2,8 +2,6 @@
Documentation for /proc/sys/fs/
===============================
-kernel version 2.2.10
-
Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
Copyright (c) 2009, Shen Feng<shen@cn.fujitsu.com>
@@ -12,58 +10,40 @@ For general info and legal blurb, please look in intro.rst.
------------------------------------------------------------------------------
-This file contains documentation for the sysctl files in
-/proc/sys/fs/ and is valid for Linux kernel version 2.2.
+This file contains documentation for the sysctl files and directories
+in ``/proc/sys/fs/``.
The files in this directory can be used to tune and monitor
miscellaneous and general things in the operation of the Linux
-kernel. Since some of the files _can_ be used to screw up your
+kernel. Since some of the files *can* be used to screw up your
system, it is advisable to read both documentation and source
before actually making adjustments.
1. /proc/sys/fs
===============
-Currently, these files are in /proc/sys/fs:
-
-- aio-max-nr
-- aio-nr
-- dentry-state
-- dquot-max
-- dquot-nr
-- file-max
-- file-nr
-- inode-max
-- inode-nr
-- inode-state
-- nr_open
-- overflowuid
-- overflowgid
-- pipe-user-pages-hard
-- pipe-user-pages-soft
-- protected_fifos
-- protected_hardlinks
-- protected_regular
-- protected_symlinks
-- suid_dumpable
-- super-max
-- super-nr
+Currently, these files might (depending on your configuration)
+show up in ``/proc/sys/fs``:
+
+.. contents:: :local:
aio-nr & aio-max-nr
-------------------
-aio-nr is the running total of the number of events specified on the
-io_setup system call for all currently active aio contexts. If aio-nr
-reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
-raising aio-max-nr does not result in the pre-allocation or re-sizing
-of any kernel data structures.
+``aio-nr`` shows the current system-wide number of asynchronous io
+requests. ``aio-max-nr`` allows you to change the maximum value
+``aio-nr`` can grow to. If ``aio-nr`` reaches ``aio-max-nr`` then
+``io_setup`` will fail with ``EAGAIN``. Note that raising
+``aio-max-nr`` does not result in the
+pre-allocation or re-sizing of any kernel data structures.
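+
+For example, to check the current usage and raise the ceiling (the value below
+is only an illustration, not a recommendation)::
+
+    cat /proc/sys/fs/aio-nr
+    sysctl -w fs.aio-max-nr=1048576
+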
dentry-state
------------
-From linux/include/linux/dcache.h::
+This file shows the values in ``struct dentry_stat``, as defined in
+``linux/include/linux/dcache.h``::
struct dentry_stat_t dentry_stat {
int nr_dentry;
@@ -76,95 +56,84 @@ From linux/include/linux/dcache.h::
Dentries are dynamically allocated and deallocated.
-nr_dentry shows the total number of dentries allocated (active
-+ unused). nr_unused shows the number of dentries that are not
+``nr_dentry`` shows the total number of dentries allocated (active
++ unused). ``nr_unused`` shows the number of dentries that are not
actively used, but are saved in the LRU list for future reuse.
-Age_limit is the age in seconds after which dcache entries
-can be reclaimed when memory is short and want_pages is
-nonzero when shrink_dcache_pages() has been called and the
+``age_limit`` is the age in seconds after which dcache entries
+can be reclaimed when memory is short and ``want_pages`` is
+nonzero when ``shrink_dcache_pages()`` has been called and the
dcache isn't pruned yet.
-nr_negative shows the number of unused dentries that are also
+``nr_negative`` shows the number of unused dentries that are also
negative dentries which do not map to any files. Instead,
they help speeding up rejection of non-existing files provided
by the users.
-dquot-max & dquot-nr
---------------------
-
-The file dquot-max shows the maximum number of cached disk
-quota entries.
-
-The file dquot-nr shows the number of allocated disk quota
-entries and the number of free disk quota entries.
-
-If the number of free cached disk quotas is very low and
-you have some awesome number of simultaneous system users,
-you might want to raise the limit.
-
-
file-max & file-nr
------------------
-The value in file-max denotes the maximum number of file-
+The value in ``file-max`` denotes the maximum number of file-
handles that the Linux kernel will allocate. When you get lots
of error messages about running out of file handles, you might
want to increase this limit.
Historically,the kernel was able to allocate file handles
dynamically, but not to free them again. The three values in
-file-nr denote the number of allocated file handles, the number
+``file-nr`` denote the number of allocated file handles, the number
of allocated but unused file handles, and the maximum number of
-file handles. Linux 2.6 always reports 0 as the number of free
+file handles. Linux 2.6 and later always reports 0 as the number of free
file handles -- this is not an error, it just means that the
number of allocated file handles exactly matches the number of
used file handles.
-Attempts to allocate more file descriptors than file-max are
-reported with printk, look for "VFS: file-max limit <number>
-reached".
+Attempts to allocate more file descriptors than ``file-max`` are
+reported with ``printk``, look for::
+ VFS: file-max limit <number> reached
-nr_open
--------
-
-This denotes the maximum number of file-handles a process can
-allocate. Default value is 1024*1024 (1048576) which should be
-enough for most machines. Actual limit depends on RLIMIT_NOFILE
-resource limit.
+in the kernel logs.
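+
+A quick way to inspect the three ``file-nr`` values together with the
+``file-max`` limit::
+
+    cat /proc/sys/fs/file-nr
+    cat /proc/sys/fs/file-max
+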
-inode-max, inode-nr & inode-state
----------------------------------
+inode-nr & inode-state
+----------------------
As with file handles, the kernel allocates the inode structures
dynamically, but can't free them yet.
-The value in inode-max denotes the maximum number of inode
-handlers. This value should be 3-4 times larger than the value
-in file-max, since stdin, stdout and network sockets also
-need an inode struct to handle them. When you regularly run
-out of inodes, you need to increase this value.
-
-The file inode-nr contains the first two items from
-inode-state, so we'll skip to that file...
+The file ``inode-nr`` contains the first two items from
+``inode-state``, so we'll skip to that file...
-Inode-state contains three actual numbers and four dummies.
-The actual numbers are, in order of appearance, nr_inodes,
-nr_free_inodes and preshrink.
+``inode-state`` contains three actual numbers and four dummies.
+The actual numbers are, in order of appearance, ``nr_inodes``,
+``nr_free_inodes`` and ``preshrink``.
-Nr_inodes stands for the number of inodes the system has
-allocated, this can be slightly more than inode-max because
-Linux allocates them one pageful at a time.
+``nr_inodes`` stands for the number of inodes the system has
+allocated.
-Nr_free_inodes represents the number of free inodes (?) and
-preshrink is nonzero when the nr_inodes > inode-max and the
+``nr_free_inodes`` represents the number of free inodes (?) and
+``preshrink`` is nonzero when the
system needs to prune the inode list instead of allocating
more.
+mount-max
+---------
+
+This denotes the maximum number of mounts that may exist
+in a mount namespace.
+
+
+nr_open
+-------
+
+This denotes the maximum number of file-handles a process can
+allocate. The default value is 1024*1024 (1048576), which should be
+enough for most machines. The actual limit depends on the ``RLIMIT_NOFILE``
+resource limit.
+
+
overflowgid & overflowuid
-------------------------
@@ -192,7 +161,7 @@ pipe-user-pages-soft
Maximum total number of pages a non-privileged user may allocate for pipes
before the pipe size gets limited to a single page. Once this limit is reached,
new pipes will be limited to a single page in size for this user in order to
-limit total memory usage, and trying to increase them using fcntl() will be
+limit total memory usage, and trying to increase them using ``fcntl()`` will be
denied until usage goes below the limit again. The default value allows to
allocate up to 1024 pipes at their default size. When set to 0, no limit is
applied.
@@ -207,7 +176,7 @@ file.
When set to "0", writing to FIFOs is unrestricted.
-When set to "1" don't allow O_CREAT open on FIFOs that we don't own
+When set to "1" don't allow ``O_CREAT`` open on FIFOs that we don't own
in world writable sticky directories, unless they are owned by the
owner of the directory.
@@ -221,7 +190,7 @@ protected_hardlinks
A long-standing class of security issues is the hardlink-based
time-of-check-time-of-use race, most commonly seen in world-writable
-directories like /tmp. The common method of exploitation of this flaw
+directories like ``/tmp``. The common method of exploitation of this flaw
is to cross privilege boundaries when following a given hardlink (i.e. a
root process follows a hardlink created by another user). Additionally,
on systems without separated partitions, this stops unauthorized users
@@ -239,13 +208,13 @@ This protection is based on the restrictions in Openwall and grsecurity.
protected_regular
-----------------
-This protection is similar to protected_fifos, but it
+This protection is similar to `protected_fifos`_, but it
avoids writes to an attacker-controlled regular file, where a program
expected to create one.
When set to "0", writing to regular files is unrestricted.
-When set to "1" don't allow O_CREAT open on regular files that we
+When set to "1" don't allow ``O_CREAT`` open on regular files that we
don't own in world writable sticky directories, unless they are
owned by the owner of the directory.
@@ -257,7 +226,7 @@ protected_symlinks
A long-standing class of security issues is the symlink-based
time-of-check-time-of-use race, most commonly seen in world-writable
-directories like /tmp. The common method of exploitation of this flaw
+directories like ``/tmp``. The common method of exploitation of this flaw
is to cross privilege boundaries when following a given symlink (i.e. a
root process follows a symlink belonging to another user). For a likely
incomplete list of hundreds of examples across the years, please see:
@@ -272,23 +241,25 @@ follower match, or when the directory owner matches the symlink's owner.
This protection is based on the restrictions in Openwall and grsecurity.
-suid_dumpable:
---------------
+suid_dumpable
+-------------
This value can be used to query and set the core dump mode for setuid
or otherwise protected/tainted binaries. The modes are
= ========== ===============================================================
-0 (default) traditional behaviour. Any process which has changed
+0 (default) Traditional behaviour. Any process which has changed
privilege levels or is execute only will not be dumped.
-1 (debug) all processes dump core when possible. The core dump is
+1 (debug) All processes dump core when possible. The core dump is
owned by the current user and no security is applied. This is
intended for system debugging situations only.
Ptrace is unchecked.
This is insecure as it allows regular users to examine the
memory contents of privileged processes.
-2 (suidsafe) any binary which normally would not be dumped is dumped
- anyway, but only if the "core_pattern" kernel sysctl is set to
+2 (suidsafe) Any binary which normally would not be dumped is dumped
+ anyway, but only if the ``core_pattern`` kernel sysctl (see
+ :ref:`Documentation/admin-guide/sysctl/kernel.rst <core_pattern>`)
+ is set to
either a pipe handler or a fully qualified path. (For more
details on this limitation, see CVE-2006-2451.) This mode is
appropriate when administrators are attempting to debug
@@ -301,36 +272,11 @@ or otherwise protected/tainted binaries. The modes are
= ========== ===============================================================
-super-max & super-nr
---------------------
-
-These numbers control the maximum number of superblocks, and
-thus the maximum number of mounted filesystems the kernel
-can have. You only need to increase super-max if you need to
-mount more filesystems than the current value in super-max
-allows you to.
-
-
-aio-nr & aio-max-nr
--------------------
-
-aio-nr shows the current system-wide number of asynchronous io
-requests. aio-max-nr allows you to change the maximum value
-aio-nr can grow to.
-
-
-mount-max
----------
-
-This denotes the maximum number of mounts that may exist
-in a mount namespace.
-
-
2. /proc/sys/fs/binfmt_misc
===========================
-Documentation for the files in /proc/sys/fs/binfmt_misc is
+Documentation for the files in ``/proc/sys/fs/binfmt_misc`` is
in Documentation/admin-guide/binfmt-misc.rst.
@@ -343,28 +289,32 @@ creation of a user space library that implements the POSIX message queues
API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
Interfaces specification.)
-The "mqueue" filesystem contains values for determining/setting the amount of
-resources used by the file system.
+The "mqueue" filesystem contains values for determining/setting the
+amount of resources used by the file system.
-/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
-maximum number of message queues allowed on the system.
+``/proc/sys/fs/mqueue/queues_max`` is a read/write file for
+setting/getting the maximum number of message queues allowed on the
+system.
-/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
-maximum number of messages in a queue value. In fact it is the limiting value
-for another (user) limit which is set in mq_open invocation. This attribute of
-a queue must be less or equal then msg_max.
+``/proc/sys/fs/mqueue/msg_max`` is a read/write file for
+setting/getting the maximum number of messages in a queue value. In
+fact it is the limiting value for another (user) limit which is set in
+the ``mq_open`` invocation. This attribute of a queue must be less than
+or equal to ``msg_max``.
-/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
-maximum message size value (it is every message queue's attribute set during
-its creation).
+``/proc/sys/fs/mqueue/msgsize_max`` is a read/write file for
+setting/getting the maximum message size value (it is an attribute of
+every message queue, set during its creation).
-/proc/sys/fs/mqueue/msg_default is a read/write file for setting/getting the
-default number of messages in a queue value if attr parameter of mq_open(2) is
-NULL. If it exceed msg_max, the default value is initialized msg_max.
+``/proc/sys/fs/mqueue/msg_default`` is a read/write file for
+setting/getting the default number of messages in a queue value if the
+``attr`` parameter of ``mq_open(2)`` is ``NULL``. If it exceeds
+``msg_max``, the default value is initialized to ``msg_max``.
-/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting
-the default message size value if attr parameter of mq_open(2) is NULL. If it
-exceed msgsize_max, the default value is initialized msgsize_max.
+``/proc/sys/fs/mqueue/msgsize_default`` is a read/write file for
+setting/getting the default message size value if the ``attr``
+parameter of ``mq_open(2)`` is ``NULL``. If it exceeds
+``msgsize_max``, the default value is initialized to ``msgsize_max``.
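+
+To inspect the current values of all these files at once, something like this
+works (the exact set of files depends on your kernel)::
+
+    grep -r . /proc/sys/fs/mqueue/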
4. /proc/sys/fs/epoll - Configuration options for the epoll interface
=====================================================================
@@ -378,7 +328,7 @@ Every epoll file descriptor can store a number of files to be monitored
for event readiness. Each one of these monitored files constitutes a "watch".
This configuration option sets the maximum number of "watches" that are
allowed for each user.
-Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
-on a 64bit one.
-The current default value for max_user_watches is the 1/25 (4%) of the
-available low memory, divided for the "watch" cost in bytes.
+Each "watch" costs roughly 90 bytes on a 32-bit kernel, and roughly 160 bytes
+on a 64-bit one.
+The current default value for ``max_user_watches`` is 4% of the
+available low memory, divided by the "watch" cost in bytes.
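+
+For instance, to read the computed default and raise it (the new value below is
+arbitrary, purely for illustration)::
+
+    cat /proc/sys/fs/epoll/max_user_watches
+    sysctl -w fs.epoll.max_user_watches=1000000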
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 426162009ce9..d85d90f5d000 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -38,8 +38,8 @@ acct
If BSD-style process accounting is enabled these values control
its behaviour. If free space on filesystem where the log lives
-goes below ``lowwater``% accounting suspends. If free space gets
-above ``highwater``% accounting resumes. ``frequency`` determines
+goes below ``lowwater``\ % accounting suspends. If free space gets
+above ``highwater``\ % accounting resumes. ``frequency`` determines
how often do we check the amount of free space (value is in
seconds). Default:
@@ -65,6 +65,11 @@ combining the following values:
4 s3_beep
= =======
+arch
+====
+
+The machine hardware name, the same output as ``uname -m``
+(e.g. ``x86_64`` or ``aarch64``).
auto_msgmni
===========
@@ -90,7 +95,7 @@ is 0x15 and the full version number is 0x234, this file will contain
the value 340 = 0x154.
See the ``type_of_loader`` and ``ext_loader_type`` fields in
-Documentation/x86/boot.rst for additional information.
+Documentation/arch/x86/boot.rst for additional information.
bootloader_version (x86 only)
@@ -100,7 +105,7 @@ The complete bootloader version number. In the example above, this
file will contain the value 564 = 0x234.
See the ``type_of_loader`` and ``ext_loader_ver`` fields in
-Documentation/x86/boot.rst for additional information.
+Documentation/arch/x86/boot.rst for additional information.
bpf_stats_enabled
@@ -134,6 +139,8 @@ Highest valid capability of the running kernel. Exports
``CAP_LAST_CAP`` from the kernel.
+.. _core_pattern:
+
core_pattern
============
@@ -169,6 +176,7 @@ core_pattern
%f executable filename
%E executable path
%c maximum size of core file by resource limit RLIMIT_CORE
+ %C CPU the task ran on
%<OTHER> both are dropped
======== ==========================================
@@ -428,8 +436,8 @@ ignore-unaligned-usertrap
On architectures where unaligned accesses cause traps, and where this
feature is supported (``CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN``;
-currently, ``arc`` and ``ia64``), controls whether all unaligned traps
-are logged.
+currently, ``arc``, ``ia64`` and ``loongarch``), controls whether all
+unaligned traps are logged.
= =============================================================
0 Log all unaligned accesses.
@@ -445,9 +453,10 @@ this allows system administrators to override the
kexec_load_disabled
===================
-A toggle indicating if the ``kexec_load`` syscall has been disabled.
-This value defaults to 0 (false: ``kexec_load`` enabled), but can be
-set to 1 (true: ``kexec_load`` disabled).
+A toggle indicating if the syscalls ``kexec_load`` and
+``kexec_file_load`` have been disabled.
+This value defaults to 0 (false: ``kexec_*load`` enabled), but can be
+set to 1 (true: ``kexec_*load`` disabled).
Once true, kexec can no longer be used, and the toggle cannot be set
back to false.
This allows a kexec image to be loaded before disabling the syscall,
@@ -455,6 +464,24 @@ allowing a system to set up (and later use) an image without it being
altered.
Generally used together with the `modules_disabled`_ sysctl.
+kexec_load_limit_panic
+======================
+
+This parameter specifies a limit to the number of times the syscalls
+``kexec_load`` and ``kexec_file_load`` can be called with a crash
+image. It can only be set with a more restrictive value than the
+current one.
+
+== ======================================================
+-1 Unlimited calls to kexec. This is the default setting.
+N Number of calls left.
+== ======================================================
+
+kexec_load_limit_reboot
+=======================
+
+Similar functionality as ``kexec_load_limit_panic``, but for a normal
+image.
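+
+For example, to allow the crash image to be loaded two more times before the
+syscalls get locked down (the value is only an illustration)::
+
+    sysctl -w kernel.kexec_load_limit_panic=2
+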
kptr_restrict
=============
@@ -592,68 +619,59 @@ to the guest kernel command line (see
Documentation/admin-guide/kernel-parameters.rst).
-numa_balancing
-==============
+nmi_wd_lpm_factor (PPC only)
+============================
-Enables/disables automatic page fault based NUMA memory
-balancing. Memory is moved automatically to nodes
-that access it often.
+Factor to apply to the NMI watchdog timeout (only when ``nmi_watchdog`` is
+set to 1). This factor represents the percentage added to
+``watchdog_thresh`` when calculating the NMI watchdog timeout during an
+LPM (Live Partition Migration). The soft lockup timeout is not impacted.
-Enables/disables automatic NUMA memory balancing. On NUMA machines, there
-is a performance penalty if remote memory is accessed by a CPU. When this
-feature is enabled the kernel samples what task thread is accessing memory
-by periodically unmapping pages and later trapping a page fault. At the
-time of the page fault, it is determined if the data being accessed should
-be migrated to a local memory node.
+A value of 0 means no change. The default value is 200, meaning the NMI
+watchdog is set to 30s (based on ``watchdog_thresh`` equal to 10).
-The unmapping of pages and trapping faults incur additional overhead that
-ideally is offset by improved memory locality but there is no universal
-guarantee. If the target workload is already bound to NUMA nodes then this
-feature should be disabled. Otherwise, if the system overhead from the
-feature is too high then the rate the kernel samples for NUMA hinting
-faults may be controlled by the `numa_balancing_scan_period_min_ms,
-numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls.
-
-
-numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
-===============================================================================================================================
+numa_balancing
+==============
-Automatic NUMA balancing scans tasks address space and unmaps pages to
-detect if pages are properly placed or if the data should be migrated to a
-memory node local to where the task is running. Every "scan delay" the task
-scans the next "scan size" number of pages in its address space. When the
-end of the address space is reached the scanner restarts from the beginning.
+Enables/disables and configures automatic page fault based NUMA memory
+balancing. Memory is moved automatically to nodes that access it often.
+The value to set can be the result of ORing the following:
-In combination, the "scan delay" and "scan size" determine the scan rate.
-When "scan delay" decreases, the scan rate increases. The scan delay and
-hence the scan rate of every task is adaptive and depends on historical
-behaviour. If pages are properly placed then the scan delay increases,
-otherwise the scan delay decreases. The "scan size" is not adaptive but
-the higher the "scan size", the higher the scan rate.
+= =================================
+0 NUMA_BALANCING_DISABLED
+1 NUMA_BALANCING_NORMAL
+2 NUMA_BALANCING_MEMORY_TIERING
+= =================================
-Higher scan rates incur higher system overhead as page faults must be
-trapped and potentially data must be migrated. However, the higher the scan
-rate, the more quickly a tasks memory is migrated to a local node if the
-workload pattern changes and minimises performance impact due to remote
-memory accesses. These sysctls control the thresholds for scan delays and
-the number of pages scanned.
+Or NUMA_BALANCING_NORMAL to optimize page placement among different
+NUMA nodes and reduce remote memory accesses. On NUMA machines, there
+is a performance penalty when a CPU accesses remote memory. When this
+feature is enabled, the kernel samples which task or thread is accessing
+memory by periodically unmapping pages and later trapping the resulting
+page fault. At the time of the page fault, it is determined whether the
+data being accessed should be migrated to a local memory node.
-``numa_balancing_scan_period_min_ms`` is the minimum time in milliseconds to
-scan a tasks virtual memory. It effectively controls the maximum scanning
-rate for each task.
+The unmapping of pages and trapping faults incur additional overhead that
+ideally is offset by improved memory locality but there is no universal
+guarantee. If the target workload is already bound to NUMA nodes then this
+feature should be disabled.
-``numa_balancing_scan_delay_ms`` is the starting "scan delay" used for a task
-when it initially forks.
+Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among
+different types of memory (represented as different NUMA nodes) so
+that hot pages end up in the fast memory. This, too, is implemented
+via unmapping and page faults.
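+
+For instance, a sketch enabling both modes at once (the value is the OR
+of the flags above, so NUMA_BALANCING_NORMAL | NUMA_BALANCING_MEMORY_TIERING
+gives 3)::
+
+    sysctl -w kernel.numa_balancing=3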
-``numa_balancing_scan_period_max_ms`` is the maximum time in milliseconds to
-scan a tasks virtual memory. It effectively controls the minimum scanning
-rate for each task.
+numa_balancing_promote_rate_limit_MBps
+======================================
-``numa_balancing_scan_size_mb`` is how many megabytes worth of pages are
-scanned for a given scan.
+Excessively high promotion/demotion throughput between different memory
+types can hurt application latency. This knob can be used to rate limit
+promotion: the per-node maximum promotion throughput in MB/s will be
+capped at the set value.
+
+A rule of thumb is to set this to less than 1/10 of the PMEM node's
+write bandwidth.
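+
+As an illustration only (the bandwidth figure is assumed): if a PMEM node
+sustains roughly 20,000 MB/s of writes, the rule of thumb above suggests a
+limit of about 2,000 MB/s::
+
+    sysctl -w kernel.numa_balancing_promote_rate_limit_MBps=2000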
oops_all_cpu_backtrace
======================
@@ -671,6 +689,15 @@ This is the default behavior.
an oops event is detected.
+oops_limit
+==========
+
+Number of kernel oopses after which the kernel should panic when
+``panic_on_oops`` is not set. Setting this to 0 disables checking
+the count. Setting this to 1 has the same effect as setting
+``panic_on_oops=1``. The default value is 10000.
+
+
osrelease, ostype & version
===========================
@@ -795,6 +822,8 @@ bit 1 print system memory info
bit 2 print timer info
bit 3 print locks info if ``CONFIG_LOCKDEP`` is on
bit 4 print ftrace buffer
+bit 5 print all printk messages in buffer
+bit 6 print all CPUs backtrace (if available in the arch)
===== ============================================
So for example to print tasks and memory info on panic, user can::
@@ -813,6 +842,13 @@ is useful to define the root cause of RCU stalls using a vmcore.
1 panic() after printing RCU stall messages.
= ============================================================
+max_rcu_stall_to_panic
+======================
+
+When ``panic_on_rcu_stall`` is set to 1, this value determines the
+number of times that RCU can stall before panic() is called.
+
+When ``panic_on_rcu_stall`` is set to 0, this value has no effect.
perf_cpu_time_max_percent
=========================
@@ -905,6 +941,17 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
The default value is 8.
+perf_user_access (arm64 only)
+=============================
+
+Controls user space access for reading perf event counters. When set to 1,
+user space can read performance monitor counter registers directly.
+
+The default value is 0 (access disabled).
+
+See Documentation/arm64/perf.rst for more information.
+
+
pid_max
=======
@@ -1013,28 +1060,22 @@ This is a directory, with the following entries:
* ``boot_id``: a UUID generated the first time this is retrieved, and
unvarying after that;
+* ``uuid``: a UUID generated every time this is retrieved (this can
+ thus be used to generate UUIDs at will);
+
* ``entropy_avail``: the pool's entropy count, in bits;
* ``poolsize``: the entropy pool size, in bits;
* ``urandom_min_reseed_secs``: obsolete (used to determine the minimum
- number of seconds between urandom pool reseeding).
-
-* ``uuid``: a UUID generated every time this is retrieved (this can
- thus be used to generate UUIDs at will);
+ number of seconds between urandom pool reseeding). This file is
+ writable for compatibility purposes, but writing to it has no effect
+ on any RNG behavior;
* ``write_wakeup_threshold``: when the entropy count drops below this
(as a number of bits), processes waiting to write to ``/dev/random``
- are woken up.
-
-If ``drivers/char/random.c`` is built with ``ADD_INTERRUPT_BENCH``
-defined, these additional entries are present:
-
-* ``add_interrupt_avg_cycles``: the average number of cycles between
- interrupts used to feed the pool;
-
-* ``add_interrupt_avg_deviation``: the standard deviation seen on the
- number of cycles between interrupts used to feed the pool.
+ are woken up. This file is writable for compatibility purposes, but
+ writing to it has no effect on any RNG behavior.
randomize_va_space
@@ -1099,7 +1140,7 @@ task_delayacct
===============
Enables/disables task delay accounting (see
-:doc:`accounting/delay-accounting.rst`). Enabling this feature incurs
+Documentation/accounting/delay-accounting.rst. Enabling this feature incurs
a small amount of overhead in the scheduler but is useful for debugging
and performance tuning. It is required by some tools such as iotop.
@@ -1304,6 +1345,29 @@ watchdog work to be queued by the watchdog timer function, otherwise the NMI
watchdog — if enabled — can detect a hard lockup condition.
+split_lock_mitigate (x86 only)
+==============================
+
+On x86, each "split lock" imposes a system-wide performance penalty. On larger
+systems, large numbers of split locks from unprivileged users can result in
+denials of service to well-behaved and potentially more important users.
+
+The kernel mitigates these bad users by detecting split locks and imposing
+penalties: forcing them to wait and only allowing one core to execute split
+locks at a time.
+
+These mitigations can make those bad applications unbearably slow. Setting
+split_lock_mitigate=0 may restore some application performance, but will also
+increase system exposure to denial of service attacks from split lock users.
+
+= ===================================================================
+0 Disable the mitigation mode - just warn about split locks in the
+  kernel log and expose the system to denial of service from split-lock
+  users.
+1 Enable the mitigation mode (this is the default) - penalize split-lock
+  users with intentional performance degradation.
+= ===================================================================
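+
+For example, disabling the mitigation on a trusted, single-user machine
+(a sketch, not a recommendation)::
+
+    sysctl -w kernel.split_lock_mitigate=0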
+
+
stack_erasing
=============
@@ -1447,8 +1511,8 @@ unaligned-trap
On architectures where unaligned accesses cause traps, and where this
feature is supported (``CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW``; currently,
-``arc`` and ``parisc``), controls whether unaligned traps are caught
-and emulated (instead of failing).
+``arc``, ``parisc`` and ``loongarch``), controls whether unaligned traps
+are caught and emulated (instead of failing).
= ========================================================
0 Do not emulate unaligned accesses.
@@ -1490,6 +1554,16 @@ entry will default to 2 instead of 0.
2 Unprivileged calls to ``bpf()`` are disabled
= =============================================================
+
+warn_limit
+==========
+
+Number of kernel warnings after which the kernel should panic when
+``panic_on_warn`` is not set. Setting this to 0 disables checking
+the warning count. Setting this to 1 has the same effect as setting
+``panic_on_warn=1``. The default value is 0.
+
+
watchdog
========
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 4150f74c521a..466c560b0c30 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -31,17 +31,18 @@ see only some of them, depending on your kernel's configuration.
Table : Subdirectories in /proc/sys/net
- ========= =================== = ========== ==================
+ ========= =================== = ========== ===================
Directory Content Directory Content
- ========= =================== = ========== ==================
- core General parameter appletalk Appletalk protocol
- unix Unix domain sockets netrom NET/ROM
- 802 E802 protocol ax25 AX25
- ethernet Ethernet protocol rose X.25 PLP layer
+ ========= =================== = ========== ===================
+ 802 E802 protocol mptcp Multipath TCP
+ appletalk Appletalk protocol netfilter Network Filter
+ ax25 AX25 netrom NET/ROM
+ bridge Bridging rose X.25 PLP layer
+ core General parameter tipc TIPC
+ ethernet Ethernet protocol unix Unix domain sockets
ipv4 IP version 4 x25 X.25 protocol
- bridge Bridging decnet DEC net
- ipv6 IP version 6 tipc TIPC
- ========= =================== = ========== ==================
+ ipv6 IP version 6
+ ========= =================== = ========== ===================
1. /proc/sys/net/core - Network core options
============================================
@@ -101,6 +102,9 @@ Values:
- 1 - enable JIT hardening for unprivileged users only
- 2 - enable JIT hardening for all users
+where "privileged user" in this context means a process having
+CAP_BPF or CAP_SYS_ADMIN in the root user namespace.
+
bpf_jit_kallsyms
----------------
@@ -211,6 +215,12 @@ rmem_max
The maximum receive socket buffer size in bytes.
+rps_default_mask
+----------------
+
+The default RPS CPU mask used on newly created network devices. An empty
+mask means RPS is disabled by default.
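+
+As a sketch, to let packet processing for newly created devices use CPUs
+0-3 (mask 0xf; root assumed)::
+
+    echo f > /proc/sys/net/core/rps_default_mask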
+
tstamp_allow_data
-----------------
Allow processes to receive tx timestamps looped together with the original
@@ -271,7 +281,7 @@ poll cycle or the number of packets processed reaches netdev_budget.
netdev_max_backlog
------------------
-Maximum number of packets, queued on the INPUT side, when the interface
+Maximum number of packets, queued on the INPUT side, when the interface
receives packets faster than kernel can process them.
netdev_rss_key
@@ -322,6 +332,14 @@ a leaked reference faster. A larger value may be useful to prevent false
warnings on slow/loaded systems.
Default value is 10, minimum 1, maximum 3600.
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by the TCP stack so far.
+
+Default: 64
+
optmem_max
----------
@@ -365,6 +383,36 @@ new netns has been created.
Default : 0 (for compatibility reasons)
+txrehash
+--------
+
+Controls default hash rethink behaviour on listening socket when SO_TXREHASH
+option is set to SOCK_TXREHASH_DEFAULT (i.e. not overridden by setsockopt).
+
+If set to 1 (default), hash rethink is performed on listening socket.
+If set to 0, hash rethink is not performed.
+
+gro_normal_batch
+----------------
+
+Maximum number of segments to batch up on output of GRO. When a packet
+exits GRO, either as a coalesced superframe or as an original packet which
+GRO has decided not to coalesce, it is placed on a per-NAPI list. This
+list is then passed to the stack when the number of segments reaches the
+gro_normal_batch limit.
+
+high_order_alloc_disable
+------------------------
+
+By default the allocator for page frags tries to use high-order pages (order-3
+on x86). While the default behavior gives good results in most cases, some users
+might have hit contention in page allocation/freeing. This was especially
+true on older kernels (< 5.14) when high-order pages were not stored on per-cpu
+lists. This knob allows opting in to order-0 allocations instead, but is now
+mostly of historical interest.
+
+Default: 0
+
2. /proc/sys/net/unix - Parameters for Unix domain sockets
----------------------------------------------------------
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 5e795202111f..45ba1f4dc004 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -62,6 +62,7 @@ Currently, these files are in /proc/sys/vm:
- overcommit_memory
- overcommit_ratio
- page-cluster
+- page_lock_unfairness
- panic_on_oom
- percpu_pagelist_high_fraction
- stat_interval
@@ -355,7 +356,7 @@ The lowmem_reserve_ratio is an array. You can see them by reading this file::
But, these values are not used directly. The kernel calculates # of protection
pages for each zones from them. These are shown as array of protection pages
-in /proc/zoneinfo like followings. (This is an example of x86-64 box).
+in /proc/zoneinfo like the following. (This is an example of x86-64 box).
Each zone has an array of protection pages like this::
Node 0, zone DMA
@@ -432,7 +433,7 @@ a 2bit error in a memory module) is detected in the background by hardware
that cannot be handled by the kernel. In some cases (like the page
still having a valid copy on disk) the kernel will handle the failure
transparently without affecting any applications. But if there is
-no other uptodate copy of the data it will kill to prevent any data
+no other up-to-date copy of the data it will kill to prevent any data
corruptions from propagating.
1: Kill all processes that have the corrupted and not reloadable page mapped
@@ -561,6 +562,43 @@ Change the minimum size of the hugepage pool.
See Documentation/admin-guide/mm/hugetlbpage.rst
+hugetlb_optimize_vmemmap
+========================
+
+This knob is not available when the size of 'struct page' (a structure defined
+in include/linux/mm_types.h) is not a power of two (an unusual system
+configuration could result in this).
+
+Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO).
+
+Once enabled, the vmemmap pages of subsequently allocated HugeTLB pages from
+the buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095
+pages per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not
+be optimized. When those optimized HugeTLB pages are freed from the HugeTLB
+pool back to the buddy allocator, the vmemmap pages representing that range
+need to be remapped again and the vmemmap pages discarded earlier need to be
+reallocated again. If your use case is that HugeTLB pages are allocated 'on
+the fly' (e.g. never explicitly allocating HugeTLB pages with 'nr_hugepages'
+but only setting 'nr_overcommit_hugepages', so that the overcommitted HugeTLB
+pages are allocated 'on the fly') instead of being pulled from the HugeTLB
+pool, you should weigh the benefits of memory savings against the increased
+overhead (~2x slower than before) of allocating or freeing HugeTLB pages
+between the HugeTLB pool and the buddy allocator. Another behavior to note is
+that if the system is under heavy memory pressure, the user could be prevented
+from freeing HugeTLB pages from the HugeTLB pool to the buddy allocator,
+since the allocation of vmemmap pages could fail; you have to retry later if
+your system encounters this situation.
+
+Once disabled, the vmemmap pages of subsequently allocated HugeTLB pages from
+the buddy allocator will not be optimized, meaning the extra overhead at
+allocation time from the buddy allocator disappears, whereas already optimized
+HugeTLB pages will not be affected. If you want to make sure there are no
+optimized HugeTLB pages, you can set "nr_hugepages" to 0 first and then
+disable this. Note that
+writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus
+pages. So, those surplus pages are still optimized until they are no longer
+in use. You would need to wait for those surplus pages to be released before
+there are no optimized pages in the system.
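+
+A sketch of that safe disable sequence (root assumed; paths are the
+standard sysctl files under /proc/sys/vm)::
+
+    echo 0 > /proc/sys/vm/nr_hugepages
+    echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap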
+
+
nr_hugepages_mempolicy
======================
@@ -720,7 +758,7 @@ and don't use much of it.
The default value is 0.
-See Documentation/vm/overcommit-accounting.rst and
+See Documentation/mm/overcommit-accounting.rst and
mm/util.c::__vm_enough_memory() for more information.
@@ -754,6 +792,14 @@ extra faults and I/O delays for following faults if they would have been part of
that consecutive pages readahead would have brought in.
+page_lock_unfairness
+====================
+
+This value determines the number of times that the page lock can be
+stolen from under a waiter. After the lock is stolen the number of times
+specified in this file (default is 5), the "fair lock handoff" semantics
+will apply, and the waiter will only be awakened if the lock can be taken.
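+
+For example, to disable lock stealing entirely and always use the fair
+handoff path (an illustrative setting, not a tuning recommendation)::
+
+    sysctl -w vm.page_lock_unfairness=0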
+
panic_on_oom
============
@@ -880,6 +926,9 @@ calls without any restrictions.
The default value is 0.
+Another way to control permissions for userfaultfd is to use
+/dev/userfaultfd instead of userfaultfd(2). See
+Documentation/admin-guide/mm/userfaultfd.rst.
user_reserve_kbytes
===================
@@ -948,7 +997,7 @@ how much memory needs to be free before kswapd goes back to sleep.
The unit is in fractions of 10,000. The default value of 10 means the
distances between watermarks are 0.1% of the available memory in the
-node/system. The maximum value is 1000, or 10% of memory.
+node/system. The maximum value is 3000, or 30% of memory.
A high rate of threads entering direct reclaim (allocstall) or kswapd
going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
diff --git a/Documentation/admin-guide/sysrq.rst b/Documentation/admin-guide/sysrq.rst
index 0a178ef0111d..51906e47327b 100644
--- a/Documentation/admin-guide/sysrq.rst
+++ b/Documentation/admin-guide/sysrq.rst
@@ -138,7 +138,7 @@ Command Function
``v`` Forcefully restores framebuffer console
``v`` Causes ETM buffer dump [ARM-specific]
-``w`` Dumps tasks that are in uninterruptable (blocked) state.
+``w`` Dumps tasks that are in uninterruptible (blocked) state.
``x`` Used by xmon interface on ppc/powerpc platforms.
Show global PMU Registers on sparc64.
diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
index ceeed7b0798d..92a8a07f5c43 100644
--- a/Documentation/admin-guide/tainted-kernels.rst
+++ b/Documentation/admin-guide/tainted-kernels.rst
@@ -100,6 +100,7 @@ Bit Log Number Reason that got the kernel tainted
15 _/K 32768 kernel has been live patched
16 _/X 65536 auxiliary taint, defined for and used by distros
17 _/T 131072 kernel was built with the struct randomization plugin
+ 18 _/N 262144 an in-kernel test has been run
=== === ====== ========================================================
Note: The character ``_`` is representing a blank in this table to make reading
@@ -133,6 +134,12 @@ More detailed explanation for tainting
scsi/snic on something else than x86_64, scsi/ips on non
x86/x86_64/itanium, have broken firmware settings for the
irqchip/irq-gic on arm64 ...).
+ - x86/x86_64: Microcode late loading is dangerous and will result in
+ tainting the kernel. It requires that all CPUs rendezvous to make sure
+ the update happens when the system is as quiescent as possible. However,
+ a higher priority MCE/SMI/NMI can move control flow away from that
+ rendezvous and interrupt the update, which can be detrimental to the
+ machine.
3) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
modules were unloaded normally.
diff --git a/Documentation/admin-guide/thermal/index.rst b/Documentation/admin-guide/thermal/index.rst
new file mode 100644
index 000000000000..193b7b01a87d
--- /dev/null
+++ b/Documentation/admin-guide/thermal/index.rst
@@ -0,0 +1,8 @@
+=================
+Thermal Subsystem
+=================
+
+.. toctree::
+ :maxdepth: 1
+
+ intel_powerclamp
diff --git a/Documentation/driver-api/thermal/intel_powerclamp.rst b/Documentation/admin-guide/thermal/intel_powerclamp.rst
index 3f6dfb0b3ea6..08509b978af4 100644
--- a/Documentation/driver-api/thermal/intel_powerclamp.rst
+++ b/Documentation/admin-guide/thermal/intel_powerclamp.rst
@@ -26,6 +26,8 @@ By:
- Generic Thermal Layer (sysfs)
- Kernel APIs (TBD)
+ (*) Module Parameters
+
INTRODUCTION
============
@@ -85,7 +87,7 @@ migrated, unless the CPU is taken offline. In this case, threads
belong to the offlined CPUs will be terminated immediately.
Running as SCHED_FIFO and relatively high priority, also allows such
-scheme to work for both preemptable and non-preemptable kernels.
+scheme to work for both preemptible and non-preemptible kernels.
Alignment of idle time around jiffies ensures scalability for HZ
values. This effect can be better visualized using a Perf timechart.
The following diagram shows the behavior of kernel thread
@@ -153,13 +155,15 @@ b) determine the amount of compensation needed at each target ratio
Compensation to each target ratio consists of two parts:
a) steady state error compensation
- This is to offset the error occurring when the system can
- enter idle without extra wakeups (such as external interrupts).
+
+ This is to offset the error occurring when the system can
+ enter idle without extra wakeups (such as external interrupts).
b) dynamic error compensation
- When an excessive amount of wakeups occurs during idle, an
- additional idle ratio can be added to quiet interrupts, by
- slowing down CPU activities.
+
+ When an excessive amount of wakeups occurs during idle, an
+ additional idle ratio can be added to quiet interrupts, by
+ slowing down CPU activities.
A debugfs file is provided for the user to examine compensation
progress and results, such as on a Westmere system::
@@ -281,6 +285,7 @@ cur_state returns value -1 instead of 0 which is to avoid confusing
100% busy state with the disabled state.
Example usage:
+
- To inject 25% idle time::
$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state
@@ -318,3 +323,23 @@ device, a PID based userspace thermal controller can manage to
control CPU temperature effectively, when no other thermal influence
is added. For example, a UltraBook user can compile the kernel under
certain temperature (below most active trip points).
+
+Module Parameters
+=================
+
+``cpumask`` (RW)
+  A bit mask of CPUs to inject idle into. The format of the bitmask is the
+  same as used in other subsystems, for example /proc/irq/\*/smp_affinity.
+  The mask consists of comma-separated 32-bit groups, with one bit per CPU.
+  For example, for a 256 CPU system the full mask is:
+  ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff
+
+  The rightmost group covers CPUs 0-31.
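+
+  For example, a sketch restricting injection to CPUs 0-3 (assuming the
+  module exposes its parameters under /sys/module/intel_powerclamp)::
+
+    echo f > /sys/module/intel_powerclamp/parameters/cpumask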
+
+``max_idle`` (RW)
+  Maximum ratio of injected idle time to total CPU time, as a percentage
+  from 1 to 100. Even though the cooling device max_state is always 100
+  (100%), this parameter allows adding a maximum idle percent limit. The
+  default is 50, to match the current implementation of the powerclamp
+  driver. Values above 75 are not allowed if the cpumask includes every
+  CPU present in the system.
diff --git a/Documentation/admin-guide/unicode.rst b/Documentation/admin-guide/unicode.rst
index 290fe83ebe82..cba7e5017d36 100644
--- a/Documentation/admin-guide/unicode.rst
+++ b/Documentation/admin-guide/unicode.rst
@@ -3,11 +3,10 @@ Unicode support
Last update: 2005-01-17, version 1.4
-This file is maintained by H. Peter Anvin <unicode@lanana.org> as part
-of the Linux Assigned Names And Numbers Authority (LANANA) project.
-The current version can be found at:
-
- http://www.lanana.org/docs/unicode/admin-guide/unicode.rst
+Note: The original version of this document, which was maintained at
+lanana.org as part of the Linux Assigned Names And Numbers Authority
+(LANANA) project, no longer exists. This version in the mainline Linux
+kernel is now the maintained, authoritative document.
Introduction
------------
diff --git a/Documentation/admin-guide/workload-tracing.rst b/Documentation/admin-guide/workload-tracing.rst
new file mode 100644
index 000000000000..b2e254ec8ee8
--- /dev/null
+++ b/Documentation/admin-guide/workload-tracing.rst
@@ -0,0 +1,606 @@
+.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0)
+
+======================================================
+Discovering Linux kernel subsystems used by a workload
+======================================================
+
+:Authors: - Shuah Khan <skhan@linuxfoundation.org>
+ - Shefali Sharma <sshefali021@gmail.com>
+:maintained-by: Shuah Khan <skhan@linuxfoundation.org>
+
+Key Points
+==========
+
+ * Understanding system resources necessary to build and run a workload
+ is important.
+ * Linux tracing and strace can be used to discover the system resources
+ in use by a workload. The completeness of the system usage information
+ depends on the completeness of coverage of a workload.
+ * Performance and security of the operating system can be analyzed with
+ the help of tools such as:
+ `perf <https://man7.org/linux/man-pages/man1/perf.1.html>`_,
+ `stress-ng <https://www.mankier.com/1/stress-ng>`_,
+ `paxtest <https://github.com/opntr/paxtest-freebsd>`_.
+ * Once we discover and understand the workload needs, we can focus on them
+   to avoid regressions and use this knowledge to evaluate safety
+   considerations.
+
+Methodology
+===========
+
+`strace <https://man7.org/linux/man-pages/man1/strace.1.html>`_ is a
+diagnostic, instructional, and debugging tool and can be used to discover
+the system resources in use by a workload. Once we discover and understand
+the workload needs, we can focus on them to avoid regressions and use this
+knowledge to evaluate safety considerations. We use the strace tool to
+trace workloads.
+
+Tracing with strace shows the system calls actually invoked by the
+workload, not all the system calls it could invoke. In addition, this
+tracing method shows only the code paths within these system calls that
+are actually exercised. As an example, if a workload opens a file and
+reads from it successfully, then the success path is the one that is
+traced. Any error paths in that system call will not be traced. If a
+test run provides full coverage of a workload, then the method outlined
+here will trace and find all possible code paths. The completeness of
+the system usage information depends on the completeness of coverage of
+a workload.
+
+The goal is tracing a workload on a system running a default kernel without
+requiring custom kernel installs.
+
+How do we gather fine-grained system information?
+=================================================
+
+The strace tool can be used to trace system calls made by a process and signals
+it receives. System calls are the fundamental interface between an
+application and the operating system kernel. They enable a program to
+request services from the kernel. For instance, the open() system call in
+Linux is used to provide access to a file in the file system. strace enables
+us to track all the system calls made by an application. It lists all the
+system calls made by a process and their resulting output.
+
+You can generate profiling data combining strace and perf record tools to
+record the events and information associated with a process. This provides
+insight into the process. "perf annotate" tool generates the statistics of
+each instruction of the program. This document goes over the details of how
+to gather fine-grained information on a workload's usage of system resources.
+
+We used strace to trace the perf, stress-ng, paxtest workloads to illustrate
+our methodology to discover resources used by a workload. This process can
+be applied to trace other workloads.
+
+Getting the system ready for tracing
+====================================
+
+Before we can get started we will show you how to get your system ready.
+We assume that you have a Linux distribution running on a physical system
+or a virtual machine. Most distributions include the strace command. Let’s
+install the other tools that aren’t usually included but are needed to
+build the Linux kernel. Please note that the following works on Debian
+based distributions. You might have to find equivalent packages on other
+Linux distributions.
+
+Install the tools needed to build the Linux kernel and the tools in the
+kernel repository.
+scripts/ver_linux is a good way to check if your system already has
+the necessary tools::
+
+  sudo apt-get install build-essential flex bison
+ sudo apt install libelf-dev systemtap-sdt-dev libaudit-dev libslang2-dev libperl-dev libdw-dev
+
+cscope is a good tool to browse kernel sources. Let's install it now::
+
+ sudo apt-get install cscope
+
+Install stress-ng and paxtest::
+
+ apt-get install stress-ng
+ apt-get install paxtest
+
+Workload overview
+=================
+
+As mentioned earlier, we used strace to trace perf bench, stress-ng and
+paxtest workloads to show how to analyze a workload and identify Linux
+subsystems used by these workloads. Let's start with an overview of these
+three workloads to get a better understanding of what they do and how to
+use them.
+
+perf bench (all) workload
+-------------------------
+
+The perf bench command contains multiple multi-threaded microkernel
+benchmarks that exercise different subsystems of the Linux kernel and
+system calls. This allows us to easily measure the impact of changes,
+which can help mitigate performance regressions. It also acts as a common
+benchmarking framework, enabling developers to easily create test cases,
+integrate transparently, and use performance-rich tooling subsystems.
+
+Stress-ng netdev stressor workload
+----------------------------------
+
+stress-ng is used for performing stress testing on the kernel. It allows
+you to exercise various physical subsystems of the computer, as well as
+interfaces of the OS kernel, using "stressors". They are available for
+CPU, CPU cache, devices, I/O, interrupts, file system, memory, network,
+operating system, pipelines, schedulers, and virtual machines. Please refer
+to the `stress-ng man-page <https://www.mankier.com/1/stress-ng>`_ to
+find the description of all the available stressors. The netdev stressor
+starts a specified number (N) of workers that exercise various netdevice
+ioctl commands across all the available network devices.
+
+paxtest kiddie workload
+-----------------------
+
+paxtest is a program that tests buffer overflows in the kernel. It tests
+kernel enforcements over memory usage. Generally, execution in some memory
+segments makes buffer overflows possible. It runs a set of programs that
+attempt to subvert memory usage. It is used as a regression test suite for
+PaX, but might be useful to test other memory protection patches for the
+kernel. We used paxtest kiddie mode which looks for simple vulnerabilities.
+
+What is strace and how do we use it?
+====================================
+
+As mentioned earlier, strace is a useful diagnostic, instructional,
+and debugging tool that can be used to discover the system resources in
+use by a workload. It can be used:
+
+ * To see how a process interacts with the kernel.
+ * To see why a process is failing or hanging.
+ * For reverse engineering a process.
+ * To find the files on which a program depends.
+ * For analyzing the performance of an application.
+ * For troubleshooting various problems related to the operating system.
+
+In addition, strace can generate run-time statistics on times, calls, and
+errors for each system call and report a summary when the program exits,
+suppressing the regular output. This attempts to show system time (CPU time
+spent running in the kernel) independent of wall clock time. We plan to use
+these features to get information on workload system usage.
+
+The strace command supports basic, verbose, and stats modes. When run in
+verbose mode, strace gives more detailed information about the system
+calls invoked by a process.
+
+Running strace -c generates a report of the percentage of time spent in each
+system call, the total time in seconds, the microseconds per call, the total
+number of calls, the count of each system call that has failed with an error
+and the type of system call made.
+
+ * Usage: strace <command we want to trace>
+ * Verbose mode usage: strace -v <command>
+ * Gather statistics: strace -c <command>
+
+We used the “-c” option to gather fine-grained run-time statistics for
+the three workloads we chose for this analysis:
+
+ * perf
+ * stress-ng
+ * paxtest
+
+What is cscope and how do we use it?
+====================================
+
+Now let’s look at `cscope <https://cscope.sourceforge.net/>`_, a command
+line tool for browsing C, C++ or Java code-bases. We can use it to find
+all the references to a symbol, global definitions, functions called by a
+function, functions calling a function, text strings, regular expression
+patterns, files including a file.
+
+We can use cscope to find which system call belongs to which subsystem.
+This way we can find the kernel subsystems used by a process when it is
+executed.
+
+Let’s check out the latest Linux repository and build the cscope database::
+
+ git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git linux
+ cd linux
+ cscope -R -p10 # builds cscope.out database before starting browse session
+ cscope -d -p10 # starts browse session on cscope.out database
+
+Note: Run "cscope -R -p10" to build the database and "cscope -d -p10" to
+enter the browsing session. cscope uses the cscope.out database by
+default. To get out of this mode press ctrl+d. The -p option is used to
+specify the number of file path components to display; -p10 is optimal
+for browsing kernel sources.
+
+What is perf and how do we use it?
+==================================
+
+Perf is an analysis tool for Linux 2.6+ systems, which abstracts away CPU
+hardware differences in performance measurement and provides a simple
+command line interface. Perf is based on the perf_events interface
+exported by the kernel. It is very useful for profiling the system and
+finding performance bottlenecks in an application.
+
+If you haven't already checked out the Linux mainline repository, you can do
+so and then build the kernel and perf tool::
+
+ git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git linux
+ cd linux
+ make -j3 all
+ cd tools/perf
+ make
+
+Note: The perf command can be built without building the kernel in the
+repository and can be run on older kernels. However, matching the kernel
+and perf revisions gives more accurate information on the subsystem usage.
+
+We used the "perf stat" and "perf bench" options. For detailed information
+on the perf tool, run "perf -h".
+
+perf stat
+---------
+The perf stat command generates a report of various hardware and software
+events. It does so with the help of hardware counter registers found in
+modern CPUs that keep the count of these activities. For example, "perf
+stat cal" shows stats for the cal command.
+
+Perf bench
+----------
+The perf bench command contains multiple multi-threaded microkernel
+benchmarks that exercise different subsystems of the Linux kernel and
+system calls. This allows us to easily measure the impact of changes,
+which can help mitigate performance regressions. It also acts as a common
+benchmarking framework, enabling developers to easily create test cases,
+integrate transparently, and use performance-rich tooling.
+
+The "perf bench all" command runs the following benchmarks:
+
+ * sched/messaging
+ * sched/pipe
+ * syscall/basic
+ * mem/memcpy
+ * mem/memset
+
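+For example, to run just one of these benchmarks rather than the whole
+suite (perf bench takes the collection and benchmark names as arguments)::
+
+    perf bench sched pipe
+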
+What is stress-ng and how do we use it?
+=======================================
+
+As mentioned earlier, stress-ng is used for performing stress testing on
+the kernel. It allows you to exercise various physical subsystems of the
+computer, as well as interfaces of the OS kernel, using stressors. They
+are available for CPU, CPU cache, devices, I/O, interrupts, file system,
+memory, network, operating system, pipelines, schedulers, and virtual
+machines.
+
+The netdev stressor starts N workers that exercise various netdevice ioctl
+commands across all the available network devices. The following ioctls are
+exercised:
+
+ * SIOCGIFCONF, SIOCGIFINDEX, SIOCGIFNAME, SIOCGIFFLAGS
+ * SIOCGIFADDR, SIOCGIFNETMASK, SIOCGIFMETRIC, SIOCGIFMTU
+ * SIOCGIFHWADDR, SIOCGIFMAP, SIOCGIFTXQLEN
+
+The following command runs the stressor::
+
+  stress-ng --netdev 1 -t 60 --metrics
+
+We can use the perf record command to record the events and information
+associated with a process. This command records the profiling data in the
+perf.data file in the same directory.
+
+Using the following commands you can record the events associated with the
+netdev stressor, view the report generated in perf.data, and annotate it to
+view the statistics of each instruction of the program::
+
+  perf record stress-ng --netdev 1 -t 60 --metrics
+ perf report
+ perf annotate
+
+What is paxtest and how do we use it?
+=====================================
+
+paxtest is a program that tests buffer overflows in the kernel. It tests
+kernel enforcements over memory usage. Generally, execution in some memory
+segments makes buffer overflows possible. It runs a set of programs that
+attempt to subvert memory usage. It is used as a regression test suite for
+PaX, and will be useful to test other memory protection patches for the
+kernel.
+
+paxtest provides kiddie and blackhat modes. The paxtest kiddie mode runs
+in normal mode, whereas the blackhat mode tries to get around the protections
+of the kernel while testing for vulnerabilities. We focus on the kiddie mode here
+and combine "paxtest kiddie" run with "perf record" to collect CPU stack
+traces for the paxtest kiddie run to see which function is calling other
+functions in the performance profile. Then the "dwarf" (DWARF's Call Frame
+Information) mode can be used to unwind the stack.
+
+The following commands can be used to record and view the resulting report
+in call-graph format::
+
+ perf record --call-graph dwarf paxtest kiddie
+ perf report --stdio
+
+Tracing workloads
+=================
+
+Now that we understand the workloads, let's start tracing them.
+
+Tracing perf bench all workload
+-------------------------------
+
+Run the following command to trace perf bench all workload::
+
+ strace -c perf bench all
+
+**System Calls made by the workload**
+
+The below table shows the system calls invoked by the workload, number of
+times each system call is invoked, and the corresponding Linux subsystem.
+
++-------------------+-----------+-----------------+-------------------------+
+| System Call | # calls | Linux Subsystem | System Call (API) |
++===================+===========+=================+=========================+
+| getppid | 10000001 | Process Mgmt. | sys_getppid() |
++-------------------+-----------+-----------------+-------------------------+
+| clone | 1077 | Process Mgmt. | sys_clone() |
++-------------------+-----------+-----------------+-------------------------+
+| prctl | 23 | Process Mgmt. | sys_prctl() |
++-------------------+-----------+-----------------+-------------------------+
+| prlimit64 | 7 | Process Mgmt. | sys_prlimit64() |
++-------------------+-----------+-----------------+-------------------------+
+| getpid | 10 | Process Mgmt. | sys_getpid() |
++-------------------+-----------+-----------------+-------------------------+
+| uname | 3 | Process Mgmt. | sys_uname() |
++-------------------+-----------+-----------------+-------------------------+
+| sysinfo | 1 | Process Mgmt. | sys_sysinfo() |
++-------------------+-----------+-----------------+-------------------------+
+| getuid | 1 | Process Mgmt. | sys_getuid() |
++-------------------+-----------+-----------------+-------------------------+
+| getgid | 1 | Process Mgmt. | sys_getgid() |
++-------------------+-----------+-----------------+-------------------------+
+| geteuid | 1 | Process Mgmt. | sys_geteuid() |
++-------------------+-----------+-----------------+-------------------------+
+| getegid | 1 | Process Mgmt. | sys_getegid() |
++-------------------+-----------+-----------------+-------------------------+
+| close | 49951 | Filesystem | sys_close() |
++-------------------+-----------+-----------------+-------------------------+
+| pipe | 604 | Filesystem | sys_pipe() |
++-------------------+-----------+-----------------+-------------------------+
+| openat | 48560 | Filesystem | sys_openat() |
++-------------------+-----------+-----------------+-------------------------+
+| fstat | 8338 | Filesystem | sys_fstat() |
++-------------------+-----------+-----------------+-------------------------+
+| stat | 1573 | Filesystem | sys_stat() |
++-------------------+-----------+-----------------+-------------------------+
+| pread64 | 9646 | Filesystem | sys_pread64() |
++-------------------+-----------+-----------------+-------------------------+
+| getdents64 | 1873 | Filesystem | sys_getdents64() |
++-------------------+-----------+-----------------+-------------------------+
+| access | 3 | Filesystem | sys_access() |
++-------------------+-----------+-----------------+-------------------------+
+| lstat | 1880 | Filesystem | sys_lstat() |
++-------------------+-----------+-----------------+-------------------------+
+| lseek | 6 | Filesystem | sys_lseek() |
++-------------------+-----------+-----------------+-------------------------+
+| ioctl | 3 | Filesystem | sys_ioctl() |
++-------------------+-----------+-----------------+-------------------------+
+| dup2 | 1 | Filesystem | sys_dup2() |
++-------------------+-----------+-----------------+-------------------------+
+| execve | 2 | Filesystem | sys_execve() |
++-------------------+-----------+-----------------+-------------------------+
+| fcntl | 8779 | Filesystem | sys_fcntl() |
++-------------------+-----------+-----------------+-------------------------+
+| statfs | 1 | Filesystem | sys_statfs() |
++-------------------+-----------+-----------------+-------------------------+
+| epoll_create | 2 | Filesystem | sys_epoll_create() |
++-------------------+-----------+-----------------+-------------------------+
+| epoll_ctl | 64 | Filesystem | sys_epoll_ctl() |
++-------------------+-----------+-----------------+-------------------------+
+| newfstatat | 8318 | Filesystem | sys_newfstatat() |
++-------------------+-----------+-----------------+-------------------------+
+| eventfd2 | 192 | Filesystem | sys_eventfd2() |
++-------------------+-----------+-----------------+-------------------------+
+| mmap | 243 | Memory Mgmt. | sys_mmap() |
++-------------------+-----------+-----------------+-------------------------+
+| mprotect | 32 | Memory Mgmt. | sys_mprotect() |
++-------------------+-----------+-----------------+-------------------------+
+| brk | 21 | Memory Mgmt. | sys_brk() |
++-------------------+-----------+-----------------+-------------------------+
+| munmap | 128 | Memory Mgmt. | sys_munmap() |
++-------------------+-----------+-----------------+-------------------------+
+| set_mempolicy | 156 | Memory Mgmt. | sys_set_mempolicy() |
++-------------------+-----------+-----------------+-------------------------+
+| set_tid_address | 1 | Process Mgmt. | sys_set_tid_address() |
++-------------------+-----------+-----------------+-------------------------+
+| set_robust_list | 1 | Futex | sys_set_robust_list() |
++-------------------+-----------+-----------------+-------------------------+
+| futex | 341 | Futex | sys_futex() |
++-------------------+-----------+-----------------+-------------------------+
+| sched_getaffinity | 79 | Scheduler | sys_sched_getaffinity() |
++-------------------+-----------+-----------------+-------------------------+
+| sched_setaffinity | 223 | Scheduler | sys_sched_setaffinity() |
++-------------------+-----------+-----------------+-------------------------+
+| socketpair | 202 | Network | sys_socketpair() |
++-------------------+-----------+-----------------+-------------------------+
+| rt_sigprocmask | 21 | Signal | sys_rt_sigprocmask() |
++-------------------+-----------+-----------------+-------------------------+
+| rt_sigaction | 36 | Signal | sys_rt_sigaction() |
++-------------------+-----------+-----------------+-------------------------+
+| rt_sigreturn | 2 | Signal | sys_rt_sigreturn() |
++-------------------+-----------+-----------------+-------------------------+
+| wait4 | 889 | Time | sys_wait4() |
++-------------------+-----------+-----------------+-------------------------+
+| clock_nanosleep | 37 | Time | sys_clock_nanosleep() |
++-------------------+-----------+-----------------+-------------------------+
+| capget | 4 | Capability | sys_capget() |
++-------------------+-----------+-----------------+-------------------------+
+
+Tracing stress-ng netdev stressor workload
+------------------------------------------
+
+Run the following command to trace stress-ng netdev stressor workload::
+
+ strace -c stress-ng --netdev 1 -t 60 --metrics
+
+**System Calls made by the workload**
+
+The below table shows the system calls invoked by the workload, number of
+times each system call is invoked, and the corresponding Linux subsystem.
+
++-------------------+-----------+-----------------+-------------------------+
+| System Call | # calls | Linux Subsystem | System Call (API) |
++===================+===========+=================+=========================+
+| openat | 74 | Filesystem | sys_openat() |
++-------------------+-----------+-----------------+-------------------------+
+| close | 75 | Filesystem | sys_close() |
++-------------------+-----------+-----------------+-------------------------+
+| read | 58 | Filesystem | sys_read() |
++-------------------+-----------+-----------------+-------------------------+
+| fstat | 20 | Filesystem | sys_fstat() |
++-------------------+-----------+-----------------+-------------------------+
+| flock | 10 | Filesystem | sys_flock() |
++-------------------+-----------+-----------------+-------------------------+
+| write | 7 | Filesystem | sys_write() |
++-------------------+-----------+-----------------+-------------------------+
+| getdents64 | 8 | Filesystem | sys_getdents64() |
++-------------------+-----------+-----------------+-------------------------+
+| pread64 | 8 | Filesystem | sys_pread64() |
++-------------------+-----------+-----------------+-------------------------+
+| lseek | 1 | Filesystem | sys_lseek() |
++-------------------+-----------+-----------------+-------------------------+
+| access | 2 | Filesystem | sys_access() |
++-------------------+-----------+-----------------+-------------------------+
+| getcwd | 1 | Filesystem | sys_getcwd() |
++-------------------+-----------+-----------------+-------------------------+
+| execve | 1 | Filesystem | sys_execve() |
++-------------------+-----------+-----------------+-------------------------+
+| mmap | 61 | Memory Mgmt. | sys_mmap() |
++-------------------+-----------+-----------------+-------------------------+
+| munmap | 3 | Memory Mgmt. | sys_munmap() |
++-------------------+-----------+-----------------+-------------------------+
+| mprotect | 20 | Memory Mgmt. | sys_mprotect() |
++-------------------+-----------+-----------------+-------------------------+
+| mlock | 2 | Memory Mgmt. | sys_mlock() |
++-------------------+-----------+-----------------+-------------------------+
+| brk | 3 | Memory Mgmt. | sys_brk() |
++-------------------+-----------+-----------------+-------------------------+
+| rt_sigaction | 21 | Signal | sys_rt_sigaction() |
++-------------------+-----------+-----------------+-------------------------+
+| rt_sigprocmask | 1 | Signal | sys_rt_sigprocmask() |
++-------------------+-----------+-----------------+-------------------------+
+| sigaltstack | 1 | Signal | sys_sigaltstack() |
++-------------------+-----------+-----------------+-------------------------+
+| rt_sigreturn | 1 | Signal | sys_rt_sigreturn() |
++-------------------+-----------+-----------------+-------------------------+
+| getpid | 8 | Process Mgmt. | sys_getpid() |
++-------------------+-----------+-----------------+-------------------------+
+| prlimit64 | 5 | Process Mgmt. | sys_prlimit64() |
++-------------------+-----------+-----------------+-------------------------+
+| arch_prctl | 2 | Process Mgmt. | sys_arch_prctl() |
++-------------------+-----------+-----------------+-------------------------+
+| sysinfo | 2 | Process Mgmt. | sys_sysinfo() |
++-------------------+-----------+-----------------+-------------------------+
+| getuid | 2 | Process Mgmt. | sys_getuid() |
++-------------------+-----------+-----------------+-------------------------+
+| uname | 1 | Process Mgmt. | sys_uname() |
++-------------------+-----------+-----------------+-------------------------+
+| setpgid | 1 | Process Mgmt. | sys_setpgid() |
++-------------------+-----------+-----------------+-------------------------+
+| getrusage | 1 | Process Mgmt. | sys_getrusage() |
++-------------------+-----------+-----------------+-------------------------+
+| geteuid | 1 | Process Mgmt. | sys_geteuid() |
++-------------------+-----------+-----------------+-------------------------+
+| getppid | 1 | Process Mgmt. | sys_getppid() |
++-------------------+-----------+-----------------+-------------------------+
+| sendto | 3 | Network | sys_sendto() |
++-------------------+-----------+-----------------+-------------------------+
+| connect | 1 | Network | sys_connect() |
++-------------------+-----------+-----------------+-------------------------+
+| socket | 1 | Network | sys_socket() |
++-------------------+-----------+-----------------+-------------------------+
+| clone | 1 | Process Mgmt. | sys_clone() |
++-------------------+-----------+-----------------+-------------------------+
+| set_tid_address | 1 | Process Mgmt. | sys_set_tid_address() |
++-------------------+-----------+-----------------+-------------------------+
+| wait4 | 2 | Time | sys_wait4() |
++-------------------+-----------+-----------------+-------------------------+
+| alarm | 1 | Time | sys_alarm() |
++-------------------+-----------+-----------------+-------------------------+
+| set_robust_list | 1 | Futex | sys_set_robust_list() |
++-------------------+-----------+-----------------+-------------------------+
+
+Tracing paxtest kiddie workload
+-------------------------------
+
+Run the following command to trace paxtest kiddie workload::
+
+ strace -c paxtest kiddie
+
+**System Calls made by the workload**
+
+The below table shows the system calls invoked by the workload, number of
+times each system call is invoked, and the corresponding Linux subsystem.
+
++-------------------+-----------+-----------------+----------------------+
+| System Call | # calls | Linux Subsystem | System Call (API) |
++===================+===========+=================+======================+
+| read | 3 | Filesystem | sys_read() |
++-------------------+-----------+-----------------+----------------------+
+| write | 11 | Filesystem | sys_write() |
++-------------------+-----------+-----------------+----------------------+
+| close | 41 | Filesystem | sys_close() |
++-------------------+-----------+-----------------+----------------------+
+| stat | 24 | Filesystem | sys_stat() |
++-------------------+-----------+-----------------+----------------------+
+| fstat | 2 | Filesystem | sys_fstat() |
++-------------------+-----------+-----------------+----------------------+
+| pread64 | 6 | Filesystem | sys_pread64() |
++-------------------+-----------+-----------------+----------------------+
+| access | 1 | Filesystem | sys_access() |
++-------------------+-----------+-----------------+----------------------+
+| pipe | 1 | Filesystem | sys_pipe() |
++-------------------+-----------+-----------------+----------------------+
+| dup2 | 24 | Filesystem | sys_dup2() |
++-------------------+-----------+-----------------+----------------------+
+| execve | 1 | Filesystem | sys_execve() |
++-------------------+-----------+-----------------+----------------------+
+| fcntl | 26 | Filesystem | sys_fcntl() |
++-------------------+-----------+-----------------+----------------------+
+| openat | 14 | Filesystem | sys_openat() |
++-------------------+-----------+-----------------+----------------------+
+| rt_sigaction | 7 | Signal | sys_rt_sigaction() |
++-------------------+-----------+-----------------+----------------------+
+| rt_sigreturn | 38 | Signal | sys_rt_sigreturn() |
++-------------------+-----------+-----------------+----------------------+
+| clone | 38 | Process Mgmt. | sys_clone() |
++-------------------+-----------+-----------------+----------------------+
+| wait4 | 44 | Time | sys_wait4() |
++-------------------+-----------+-----------------+----------------------+
+| mmap | 7 | Memory Mgmt. | sys_mmap() |
++-------------------+-----------+-----------------+----------------------+
+| mprotect | 3 | Memory Mgmt. | sys_mprotect() |
++-------------------+-----------+-----------------+----------------------+
+| munmap | 1 | Memory Mgmt. | sys_munmap() |
++-------------------+-----------+-----------------+----------------------+
+| brk | 3 | Memory Mgmt. | sys_brk() |
++-------------------+-----------+-----------------+----------------------+
+| getpid | 1 | Process Mgmt. | sys_getpid() |
++-------------------+-----------+-----------------+----------------------+
+| getuid | 1 | Process Mgmt. | sys_getuid() |
++-------------------+-----------+-----------------+----------------------+
+| getgid | 1 | Process Mgmt. | sys_getgid() |
++-------------------+-----------+-----------------+----------------------+
+| geteuid | 2 | Process Mgmt. | sys_geteuid() |
++-------------------+-----------+-----------------+----------------------+
+| getegid | 1 | Process Mgmt. | sys_getegid() |
++-------------------+-----------+-----------------+----------------------+
+| getppid | 1 | Process Mgmt. | sys_getppid() |
++-------------------+-----------+-----------------+----------------------+
+| arch_prctl | 2 | Process Mgmt. | sys_arch_prctl() |
++-------------------+-----------+-----------------+----------------------+
+
+Conclusion
+==========
+
+This document is intended to be used as a guide on how to gather fine-grained
+information on the resources in use by workloads using strace.
+
+References
+==========
+
+ * `Discovery Linux Kernel Subsystems used by OpenAPS <https://elisa.tech/blog/2022/02/02/discovery-linux-kernel-subsystems-used-by-openaps>`_
+ * `ELISA-White-Papers-Discovering Linux kernel subsystems used by a workload <https://github.com/elisa-tech/ELISA-White-Papers/blob/master/Processes/Discovering_Linux_kernel_subsystems_used_by_a_workload.md>`_
+ * `strace <https://man7.org/linux/man-pages/man1/strace.1.html>`_
+ * `perf <https://man7.org/linux/man-pages/man1/perf.1.html>`_
+ * `paxtest README <https://github.com/opntr/paxtest-freebsd/blob/hardenedbsd/0.9.14-hbsd/README>`_
+ * `stress-ng <https://www.mankier.com/1/stress-ng>`_
+ * `Monitoring and managing system status and performance <https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/monitoring_and_managing_system_status_and_performance/index>`_
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
index 8de008c0c5ad..3a9c041d7f6c 100644
--- a/Documentation/admin-guide/xfs.rst
+++ b/Documentation/admin-guide/xfs.rst
@@ -236,13 +236,14 @@ the dates listed above.
Deprecated Mount Options
========================
-=========================== ================
+============================ ================
Name Removal Schedule
-=========================== ================
+============================ ================
Mounting with V4 filesystem September 2030
+Mounting ascii-ci filesystem September 2030
ikeep/noikeep September 2025
attr2/noattr2 September 2025
-=========================== ================
+============================ ================
Removed Mount Options
@@ -296,7 +297,7 @@ The following sysctls are available for the XFS filesystem:
XFS_ERRLEVEL_LOW: 1
XFS_ERRLEVEL_HIGH: 5
- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256)
+ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 511)
Causes certain error conditions to call BUG(). Value is a bitmask;
OR together the tags which represent errors which should cause panics:
diff --git a/Documentation/arch.rst b/Documentation/arch.rst
deleted file mode 100644
index f10bd32a5972..000000000000
--- a/Documentation/arch.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-CPU Architectures
-=================
-
-These books provide programming details about architecture-specific
-implementation.
-
-.. toctree::
- :maxdepth: 2
-
- arm/index
- arm64/index
- ia64/index
- m68k/index
- mips/index
- nios2/index
- openrisc/index
- parisc/index
- powerpc/index
- riscv/index
- s390/index
- sh/index
- sparc/index
- x86/index
- xtensa/index
diff --git a/Documentation/arch/arc/arc.rst b/Documentation/arch/arc/arc.rst
new file mode 100644
index 000000000000..6c4d978f3f4e
--- /dev/null
+++ b/Documentation/arch/arc/arc.rst
@@ -0,0 +1,85 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Linux kernel for ARC processors
+*******************************
+
+Other sources of information
+############################
+
+Below are some resources where more information can be found on
+ARC processors and relevant open source projects.
+
+- `<https://embarc.org>`_ - Community portal for open source on ARC.
+ Good place to start to find relevant FOSS projects, toolchain releases,
+ news items and more.
+
+- `<https://github.com/foss-for-synopsys-dwc-arc-processors>`_ -
+ Home for all development activities regarding open source projects for
+ ARC processors. Some of the projects are forks of various upstream projects,
+ where "work in progress" is hosted prior to submission to upstream projects.
+  Other projects are developed by Synopsys and made available to the
+  community as open source for use on ARC Processors.
+
+- `Official Synopsys ARC Processors website
+ <https://www.synopsys.com/designware-ip/processor-solutions.html>`_ -
+  the official product page, with access to some IP documentation (`Programmer's Reference
+ Manual, AKA PRM for ARC HS processors
+ <https://www.synopsys.com/dw/doc.php/ds/cc/programmers-reference-manual-ARC-HS.pdf>`_)
+ and free versions of some commercial tools (`Free nSIM
+ <https://www.synopsys.com/cgi-bin/dwarcnsim/req1.cgi>`_ and
+ `MetaWare Light Edition <https://www.synopsys.com/cgi-bin/arcmwtk_lite/reg1.cgi>`_).
+ Please note though, registration is required to access both the documentation and
+ the tools.
+
+Important note on ARC processors configurability
+################################################
+
+ARC processors are highly configurable and several configurable options
+are supported in Linux. Some options are transparent to software
+(e.g. cache geometries), some can be detected at runtime and configured
+and used accordingly, while some need to be explicitly selected or configured
+in the kernel's configuration utility (AKA "make menuconfig").
+
+However not all configurable options are supported when an ARC processor
+is to run Linux. SoC design teams should refer to "Appendix E:
+Configuration for ARC Linux" in the ARC HS Databook for configurability
+guidelines.
+
+Following these guidelines and selecting valid configuration options
+up front is critical to help prevent any unwanted issues during
+SoC bringup and software development in general.
+
+Building the Linux kernel for ARC processors
+############################################
+
+The process of kernel building for ARC processors is the same as for any other
+architecture and can be done in two ways:
+
+- Cross-compilation: process of compiling for ARC targets on a development
+ host with a different processor architecture (generally x86_64/amd64).
+- Native compilation: process of compiling for ARC on an ARC platform
+ (hardware board or a simulator like QEMU) with complete development environment
+ (GNU toolchain, dtc, make etc) installed on the platform.
+
+In both cases, an up-to-date GNU toolchain for ARC is needed on the host.
+Prebuilt toolchain releases which can be used for this purpose are
+available from:
+
+- Synopsys GNU toolchain releases:
+ `<https://github.com/foss-for-synopsys-dwc-arc-processors/toolchain/releases>`_
+
+- Linux kernel compilers collection:
+ `<https://mirrors.edge.kernel.org/pub/tools/crosstool>`_
+
+- Bootlin's toolchain collection: `<https://toolchains.bootlin.com>`_
+
+Once the toolchain is installed in the system, make sure its "bin" folder
+is added to your ``PATH`` environment variable. Then set ``ARCH=arc`` and
+``CROSS_COMPILE=arc-linux`` (or whatever matches the installed ARC toolchain
+prefix) and run the usual ``make defconfig && make``.
+
+This will produce a "vmlinux" file in the root of the kernel source tree,
+usable for loading on the target system via JTAG.
+If you need an image usable with the U-Boot bootloader,
+type ``make uImage`` and ``uImage`` will be produced in the ``arch/arc/boot``
+folder.
diff --git a/Documentation/arch/arc/features.rst b/Documentation/arch/arc/features.rst
new file mode 100644
index 000000000000..b793583d688a
--- /dev/null
+++ b/Documentation/arch/arc/features.rst
@@ -0,0 +1,3 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. kernel-feat:: $srctree/Documentation/features arc
diff --git a/Documentation/arch/arc/index.rst b/Documentation/arch/arc/index.rst
new file mode 100644
index 000000000000..7b098d4a5e3e
--- /dev/null
+++ b/Documentation/arch/arc/index.rst
@@ -0,0 +1,17 @@
+===================
+ARC architecture
+===================
+
+.. toctree::
+ :maxdepth: 1
+
+ arc
+
+ features
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/ia64/aliasing.rst b/Documentation/arch/ia64/aliasing.rst
index a08b36aba015..36a1e1d4842b 100644
--- a/Documentation/ia64/aliasing.rst
+++ b/Documentation/arch/ia64/aliasing.rst
@@ -61,7 +61,7 @@ Memory Map
The efi_memmap table is preserved unmodified because the original
boot-time information is required for kexec.
-Kernel Identify Mappings
+Kernel Identity Mappings
========================
Linux/ia64 identity mappings are done with large pages, currently
diff --git a/Documentation/ia64/efirtc.rst b/Documentation/arch/ia64/efirtc.rst
index fd8328408301..fd8328408301 100644
--- a/Documentation/ia64/efirtc.rst
+++ b/Documentation/arch/ia64/efirtc.rst
diff --git a/Documentation/ia64/err_inject.rst b/Documentation/arch/ia64/err_inject.rst
index 900f71e93a29..900f71e93a29 100644
--- a/Documentation/ia64/err_inject.rst
+++ b/Documentation/arch/ia64/err_inject.rst
diff --git a/Documentation/ia64/features.rst b/Documentation/arch/ia64/features.rst
index d7226fdcf5f8..d7226fdcf5f8 100644
--- a/Documentation/ia64/features.rst
+++ b/Documentation/arch/ia64/features.rst
diff --git a/Documentation/ia64/fsys.rst b/Documentation/arch/ia64/fsys.rst
index a702d2cc94b6..a702d2cc94b6 100644
--- a/Documentation/ia64/fsys.rst
+++ b/Documentation/arch/ia64/fsys.rst
diff --git a/Documentation/ia64/ia64.rst b/Documentation/arch/ia64/ia64.rst
index b725019a9492..b725019a9492 100644
--- a/Documentation/ia64/ia64.rst
+++ b/Documentation/arch/ia64/ia64.rst
diff --git a/Documentation/ia64/index.rst b/Documentation/arch/ia64/index.rst
index 761f2154dfa2..761f2154dfa2 100644
--- a/Documentation/ia64/index.rst
+++ b/Documentation/arch/ia64/index.rst
diff --git a/Documentation/ia64/irq-redir.rst b/Documentation/arch/ia64/irq-redir.rst
index 6bbbbe4f73ef..6bbbbe4f73ef 100644
--- a/Documentation/ia64/irq-redir.rst
+++ b/Documentation/arch/ia64/irq-redir.rst
diff --git a/Documentation/ia64/mca.rst b/Documentation/arch/ia64/mca.rst
index 08270bba44a4..08270bba44a4 100644
--- a/Documentation/ia64/mca.rst
+++ b/Documentation/arch/ia64/mca.rst
diff --git a/Documentation/ia64/serial.rst b/Documentation/arch/ia64/serial.rst
index 1de70c305a79..1de70c305a79 100644
--- a/Documentation/ia64/serial.rst
+++ b/Documentation/arch/ia64/serial.rst
diff --git a/Documentation/arch/index.rst b/Documentation/arch/index.rst
new file mode 100644
index 000000000000..80ee31016584
--- /dev/null
+++ b/Documentation/arch/index.rst
@@ -0,0 +1,28 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+CPU Architectures
+=================
+
+These books provide programming details about architecture-specific
+implementation.
+
+.. toctree::
+ :maxdepth: 2
+
+ arc/index
+ ../arm/index
+ ../arm64/index
+ ia64/index
+ ../loongarch/index
+ m68k/index
+ ../mips/index
+ nios2/index
+ openrisc/index
+ parisc/index
+ ../powerpc/index
+ ../riscv/index
+ ../s390/index
+ sh/index
+ sparc/index
+ x86/index
+ xtensa/index
diff --git a/Documentation/m68k/buddha-driver.rst b/Documentation/arch/m68k/buddha-driver.rst
index 20e401413991..20e401413991 100644
--- a/Documentation/m68k/buddha-driver.rst
+++ b/Documentation/arch/m68k/buddha-driver.rst
diff --git a/Documentation/m68k/features.rst b/Documentation/arch/m68k/features.rst
index 5107a2119472..5107a2119472 100644
--- a/Documentation/m68k/features.rst
+++ b/Documentation/arch/m68k/features.rst
diff --git a/Documentation/m68k/index.rst b/Documentation/arch/m68k/index.rst
index 0f890dbb5fe2..0f890dbb5fe2 100644
--- a/Documentation/m68k/index.rst
+++ b/Documentation/arch/m68k/index.rst
diff --git a/Documentation/m68k/kernel-options.rst b/Documentation/arch/m68k/kernel-options.rst
index cabd9419740d..2008a20b4329 100644
--- a/Documentation/m68k/kernel-options.rst
+++ b/Documentation/arch/m68k/kernel-options.rst
@@ -367,8 +367,8 @@ activated by a "external:" sub-option.
4.1.2) inverse
--------------
-Invert the display. This affects both, text (consoles) and graphics
-(X) display. Usually, the background is chosen to be black. With this
+Invert the display. This affects only text consoles.
+Usually, the background is chosen to be black. With this
option, you can make the background white.
4.1.3) font
diff --git a/Documentation/nios2/features.rst b/Documentation/arch/nios2/features.rst
index 8449e63f69b2..8449e63f69b2 100644
--- a/Documentation/nios2/features.rst
+++ b/Documentation/arch/nios2/features.rst
diff --git a/Documentation/nios2/index.rst b/Documentation/arch/nios2/index.rst
index 4468fe1a1037..4468fe1a1037 100644
--- a/Documentation/nios2/index.rst
+++ b/Documentation/arch/nios2/index.rst
diff --git a/Documentation/nios2/nios2.rst b/Documentation/arch/nios2/nios2.rst
index 43da3f7cee76..43da3f7cee76 100644
--- a/Documentation/nios2/nios2.rst
+++ b/Documentation/arch/nios2/nios2.rst
diff --git a/Documentation/openrisc/features.rst b/Documentation/arch/openrisc/features.rst
index 3f7c40d219f2..3f7c40d219f2 100644
--- a/Documentation/openrisc/features.rst
+++ b/Documentation/arch/openrisc/features.rst
diff --git a/Documentation/openrisc/index.rst b/Documentation/arch/openrisc/index.rst
index 6879f998b87a..6879f998b87a 100644
--- a/Documentation/openrisc/index.rst
+++ b/Documentation/arch/openrisc/index.rst
diff --git a/Documentation/openrisc/openrisc_port.rst b/Documentation/arch/openrisc/openrisc_port.rst
index 657ac4af7be6..657ac4af7be6 100644
--- a/Documentation/openrisc/openrisc_port.rst
+++ b/Documentation/arch/openrisc/openrisc_port.rst
diff --git a/Documentation/openrisc/todo.rst b/Documentation/arch/openrisc/todo.rst
index 420b18b87eda..420b18b87eda 100644
--- a/Documentation/openrisc/todo.rst
+++ b/Documentation/arch/openrisc/todo.rst
diff --git a/Documentation/parisc/debugging.rst b/Documentation/arch/parisc/debugging.rst
index de1b60402c5b..de1b60402c5b 100644
--- a/Documentation/parisc/debugging.rst
+++ b/Documentation/arch/parisc/debugging.rst
diff --git a/Documentation/parisc/features.rst b/Documentation/arch/parisc/features.rst
index 501d7c450037..501d7c450037 100644
--- a/Documentation/parisc/features.rst
+++ b/Documentation/arch/parisc/features.rst
diff --git a/Documentation/parisc/index.rst b/Documentation/arch/parisc/index.rst
index 240685751825..240685751825 100644
--- a/Documentation/parisc/index.rst
+++ b/Documentation/arch/parisc/index.rst
diff --git a/Documentation/parisc/registers.rst b/Documentation/arch/parisc/registers.rst
index 59c8ecf3e856..59c8ecf3e856 100644
--- a/Documentation/parisc/registers.rst
+++ b/Documentation/arch/parisc/registers.rst
diff --git a/Documentation/sh/booting.rst b/Documentation/arch/sh/booting.rst
index d851c49a01bf..d851c49a01bf 100644
--- a/Documentation/sh/booting.rst
+++ b/Documentation/arch/sh/booting.rst
diff --git a/Documentation/sh/features.rst b/Documentation/arch/sh/features.rst
index f722af3b6c99..f722af3b6c99 100644
--- a/Documentation/sh/features.rst
+++ b/Documentation/arch/sh/features.rst
diff --git a/Documentation/sh/index.rst b/Documentation/arch/sh/index.rst
index c64776738cf6..c64776738cf6 100644
--- a/Documentation/sh/index.rst
+++ b/Documentation/arch/sh/index.rst
diff --git a/Documentation/sh/new-machine.rst b/Documentation/arch/sh/new-machine.rst
index e501c52b3b30..e501c52b3b30 100644
--- a/Documentation/sh/new-machine.rst
+++ b/Documentation/arch/sh/new-machine.rst
diff --git a/Documentation/sh/register-banks.rst b/Documentation/arch/sh/register-banks.rst
index 2bef5c8fcbbc..2bef5c8fcbbc 100644
--- a/Documentation/sh/register-banks.rst
+++ b/Documentation/arch/sh/register-banks.rst
diff --git a/Documentation/sparc/adi.rst b/Documentation/arch/sparc/adi.rst
index 857ad30f9569..dbcd8b6e7bc3 100644
--- a/Documentation/sparc/adi.rst
+++ b/Documentation/arch/sparc/adi.rst
@@ -38,7 +38,7 @@ virtual addresses that contain 0xa in bits 63-60.
ADI is enabled on a set of pages using mprotect() with PROT_ADI flag.
When ADI is enabled on a set of pages by a task for the first time,
-kernel sets the PSTATE.mcde bit fot the task. Version tags for memory
+kernel sets the PSTATE.mcde bit for the task. Version tags for memory
addresses are set with an stxa instruction on the addresses using
ASI_MCD_PRIMARY or ASI_MCD_ST_BLKINIT_PRIMARY. ADI block size is
provided by the hypervisor to the kernel. Kernel returns the value of
@@ -97,7 +97,7 @@ With ADI enabled, following new traps may occur:
Disrupting memory corruption
----------------------------
- When a store accesses a memory localtion that has TTE.mcd=1,
+ When a store accesses a memory location that has TTE.mcd=1,
the task is running with ADI enabled (PSTATE.mcde=1), and the ADI
tag in the address used (bits 63:60) does not match the tag set on
the corresponding cacheline, a memory corruption trap occurs. By
diff --git a/Documentation/sparc/console.rst b/Documentation/arch/sparc/console.rst
index 73132db83ece..73132db83ece 100644
--- a/Documentation/sparc/console.rst
+++ b/Documentation/arch/sparc/console.rst
diff --git a/Documentation/sparc/features.rst b/Documentation/arch/sparc/features.rst
index c0c92468b0fe..c0c92468b0fe 100644
--- a/Documentation/sparc/features.rst
+++ b/Documentation/arch/sparc/features.rst
diff --git a/Documentation/sparc/index.rst b/Documentation/arch/sparc/index.rst
index ae884224eec2..ae884224eec2 100644
--- a/Documentation/sparc/index.rst
+++ b/Documentation/arch/sparc/index.rst
diff --git a/Documentation/sparc/oradax/dax-hv-api.txt b/Documentation/arch/sparc/oradax/dax-hv-api.txt
index 73e8d506cf64..7ecd0bf4957b 100644
--- a/Documentation/sparc/oradax/dax-hv-api.txt
+++ b/Documentation/arch/sparc/oradax/dax-hv-api.txt
@@ -22,7 +22,7 @@ Chapter 36. Coprocessor services
functionality offered may vary by virtual machine implementation.
The DAX is a virtual device to sun4v guests, with supported data operations indicated by the virtual device
- compatibilty property. Functionality is accessed through the submission of Command Control Blocks
+ compatibility property. Functionality is accessed through the submission of Command Control Blocks
(CCBs) via the ccb_submit API function. The operations are processed asynchronously, with the status
of the submitted operations reported through a Completion Area linked to each CCB. Each CCB has a
separate Completion Area and, unless execution order is specifically restricted through the use of serial-
@@ -313,7 +313,7 @@ bits set, and terminate at a CCB that has the Conditional bit set, but not the P
Secondary Input Description
Format Code
- 0 Element is stored as value minus 1 (0 evalutes to 1, 1 evalutes
+ 0 Element is stored as value minus 1 (0 evaluates to 1, 1 evaluates
to 2, etc)
1 Element is stored as value
@@ -659,7 +659,7 @@ Offset Size Field Description
“Secondary Input Element Size”
[13:10] Output Format (see Section 36.2.1.1.6, “Output Format”)
[9:5] Operand size for first scan criteria value. In a scan value
- operation, this is one of two potential extact match values.
+ operation, this is one of two potential exact match values.
In a scan range operation, this is the size of the upper range
@@ -673,7 +673,7 @@ Offset Size Field Description
operand, minus 1. Values 0xF-0x1E are reserved. A value of
0x1F indicates this operand is not in use for this scan operation.
[4:0] Operand size for second scan criteria value. In a scan value
- operation, this is one of two potential extact match values.
+ operation, this is one of two potential exact match values.
In a scan range operation, this is the size of the lower range
boundary. The value of this field is the number of bytes in the
operand, minus 1. Values 0xF-0x1E are reserved. A value of
@@ -690,24 +690,24 @@ Offset Size Field Description
48 8 Output (same fields as Primary Input)
56 8 Symbol Table (if used by Primary Input). Same fields as Section 36.2.1.2,
“Extract command”
-64 4 Next 4 most significant bytes of first scan criteria operand occuring after the
+64 4 Next 4 most significant bytes of first scan criteria operand occurring after the
bytes specified at offset 40, if needed by the operand size. If first operand
is less than 8 bytes, the valid bytes are left-aligned to the lowest address.
-68 4 Next 4 most significant bytes of second scan criteria operand occuring after
+68 4 Next 4 most significant bytes of second scan criteria operand occurring after
the bytes specified at offset 44, if needed by the operand size. If second
operand is less than 8 bytes, the valid bytes are left-aligned to the lowest
address.
-72 4 Next 4 most significant bytes of first scan criteria operand occuring after the
+72 4 Next 4 most significant bytes of first scan criteria operand occurring after the
bytes specified at offset 64, if needed by the operand size. If first operand
is less than 12 bytes, the valid bytes are left-aligned to the lowest address.
-76 4 Next 4 most significant bytes of second scan criteria operand occuring after
+76 4 Next 4 most significant bytes of second scan criteria operand occurring after
the bytes specified at offset 68, if needed by the operand size. If second
operand is less than 12 bytes, the valid bytes are left-aligned to the lowest
address.
-80 4 Next 4 most significant bytes of first scan criteria operand occuring after the
+80 4 Next 4 most significant bytes of first scan criteria operand occurring after the
bytes specified at offset 72, if needed by the operand size. If first operand
is less than 16 bytes, the valid bytes are left-aligned to the lowest address.
-84 4 Next 4 most significant bytes of second scan criteria operand occuring after
+84 4 Next 4 most significant bytes of second scan criteria operand occurring after
the bytes specified at offset 76, if needed by the operand size. If second
operand is less than 16 bytes, the valid bytes are left-aligned to the lowest
address.
@@ -721,10 +721,10 @@ Offset Size Field Description
36.2.1.4. Translate commands
- The translate commands takes an input array of indicies, and a table of single bit values indexed by those
- indicies, and outputs a bit vector or index array created by reading the tables bit value at each index in
+ The translate commands takes an input array of indices, and a table of single bit values indexed by those
+ indices, and outputs a bit vector or index array created by reading the tables bit value at each index in
the input array. The output should therefore contain exactly one bit per index in the input data stream,
- when outputing as a bit vector. When outputing as an index array, the number of elements depends on the
+ when outputting as a bit vector. When outputting as an index array, the number of elements depends on the
values read in the bit table, but will always be less than, or equal to, the number of input elements. Only
a restricted subset of the possible input format types are supported. No variable width or Huffman/OZIP
encoded input streams are allowed. The primary input data element size must be 3 bytes or less.
@@ -742,7 +742,7 @@ Offset Size Field Description
code in the CCB header.
There are two supported formats for the output stream: the bit vector and index array formats (codes 0x8,
- 0xD, and 0xE). The index array format is an array of indicies of bits which would have been set if the
+ 0xD, and 0xE). The index array format is an array of indices of bits which would have been set if the
output format was a bit array.
The return value of the CCB completion area contains the number of bits set in the output bit vector,
@@ -1254,7 +1254,7 @@ EUNAVAILABLE The requested CCB operation could not be performed at this time.
submitted CCB, or may apply to a larger scope. The status should not be
interpreted as permanent, and the guest should attempt to submit CCBs in
the future which had previously been unable to be performed. The status
- data provides additional information about scope of the retricted availability
+ data provides additional information about scope of the restricted availability
as follows:
Value Description
0 Processing for the exact CCB instance submitted was unavailable,
@@ -1330,20 +1330,20 @@ EUNAVAILABLE The requested CCB operation could not be performed at this time.
of other CCBs ahead of the requested CCB, to provide a relative estimate of when the CCB may execute.
The dax return value is only valid when the state is ENQUEUED. The value returned is the DAX unit
- instance indentifier for the DAX unit processing the queue where the requested CCB is located. The value
+ instance identifier for the DAX unit processing the queue where the requested CCB is located. The value
matches the value that would have been, or was, returned by ccb_submit using the queue info flag.
The queue return value is only valid when the state is ENQUEUED. The value returned is the DAX
- queue instance indentifier for the DAX unit processing the queue where the requested CCB is located. The
+ queue instance identifier for the DAX unit processing the queue where the requested CCB is located. The
value matches the value that would have been, or was, returned by ccb_submit using the queue info flag.
36.3.2.1. Errors
- EOK The request was proccessed and the CCB state is valid.
+ EOK The request was processed and the CCB state is valid.
EBADALIGN address is not on a 64-byte aligned.
ENORADDR The real address provided for address is not valid.
EINVAL The CCB completion area contents are not valid.
- EWOULDBLOCK Internal resource contraints prevented the CCB state from being queried at this
+ EWOULDBLOCK Internal resource constraints prevented the CCB state from being queried at this
time. The guest should retry the request.
ENOACCESS The guest does not have permission to access the coprocessor virtual device
functionality.
@@ -1401,11 +1401,11 @@ EUNAVAILABLE The requested CCB operation could not be performed at this time.
36.3.3.2. Errors
- EOK The request was proccessed and the result is valid.
+ EOK The request was processed and the result is valid.
EBADALIGN address is not on a 64-byte aligned.
ENORADDR The real address provided for address is not valid.
EINVAL The CCB completion area contents are not valid.
- EWOULDBLOCK Internal resource contraints prevented the CCB from being killed at this time.
+ EWOULDBLOCK Internal resource constraints prevented the CCB from being killed at this time.
The guest should retry the request.
ENOACCESS The guest does not have permission to access the coprocessor virtual device
functionality.
@@ -1423,7 +1423,7 @@ EUNAVAILABLE The requested CCB operation could not be performed at this time.
36.3.4.1. Errors
- EOK The request was proccessed and the number of enabled/disabled DAX units
+ EOK The request was processed and the number of enabled/disabled DAX units
are valid.
diff --git a/Documentation/sparc/oradax/oracle-dax.rst b/Documentation/arch/sparc/oradax/oracle-dax.rst
index d1e14d572918..d1e14d572918 100644
--- a/Documentation/sparc/oradax/oracle-dax.rst
+++ b/Documentation/arch/sparc/oradax/oracle-dax.rst
diff --git a/Documentation/arch/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst
new file mode 100644
index 000000000000..934310ce7258
--- /dev/null
+++ b/Documentation/arch/x86/amd-memory-encryption.rst
@@ -0,0 +1,133 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+AMD Memory Encryption
+=====================
+
+Secure Memory Encryption (SME) and Secure Encrypted Virtualization (SEV) are
+features found on AMD processors.
+
+SME provides the ability to mark individual pages of memory as encrypted using
+the standard x86 page tables. A page that is marked encrypted will be
+automatically decrypted when read from DRAM and encrypted when written to
+DRAM. SME can therefore be used to protect the contents of DRAM from physical
+attacks on the system.
+
+SEV enables running encrypted virtual machines (VMs) in which the code and data
+of the guest VM are secured so that a decrypted version is available only
+within the VM itself. SEV guest VMs have the concept of private and shared
+memory. Private memory is encrypted with the guest-specific key, while shared
+memory may be encrypted with the hypervisor key. When SME is enabled, the
+hypervisor key is the same key which is used in SME.
+
+A page is encrypted when a page table entry has the encryption bit set (see
+below on how to determine its position). The encryption bit can also be
+specified in the cr3 register, allowing the PGD table to be encrypted. Each
+successive level of page tables can also be encrypted by setting the encryption
+bit in the page table entry that points to the next table. This allows the full
+page table hierarchy to be encrypted. Note that having the encryption bit
+set in cr3 does not imply the full hierarchy is encrypted; each page table
+entry in the hierarchy needs to have the encryption bit set to achieve that.
+So, theoretically, you could have the encryption bit set in cr3 so that the
+PGD is encrypted, but not set the encryption bit in the PGD entry for a PUD,
+which results in the PUD pointed to by that entry not being encrypted.
+
+When SEV is enabled, instruction pages and guest page tables are always treated
+as private. All the DMA operations inside the guest must be performed on shared
+memory. Since the memory encryption bit is controlled by the guest OS when it
+is operating in 64-bit or 32-bit PAE mode, in all other modes the SEV hardware
+forces the memory encryption bit to 1.
+
+Support for SME and SEV can be determined through the CPUID instruction. The
+CPUID function 0x8000001f reports information related to SME::
+
+ 0x8000001f[eax]:
+ Bit[0] indicates support for SME
+ Bit[1] indicates support for SEV
+ 0x8000001f[ebx]:
+ Bits[5:0] pagetable bit number used to activate memory
+ encryption
+ Bits[11:6] reduction in physical address space, in bits, when
+ memory encryption is enabled (this only affects
+ system physical addresses, not guest physical
+ addresses)
+
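+A minimal user-space sketch of this check, using GCC's ``<cpuid.h>`` helper
+(the bit positions follow the layout above)::
+
+    #include <cpuid.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        unsigned int eax, ebx, ecx, edx;
+
+        /* CPUID leaf 0x8000001f reports SME/SEV capabilities */
+        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
+            return 1;
+
+        printf("SME supported: %s\n", (eax & (1u << 0)) ? "yes" : "no");
+        printf("SEV supported: %s\n", (eax & (1u << 1)) ? "yes" : "no");
+        /* EBX bits 5:0: page table bit used to mark pages encrypted */
+        printf("C-bit position: %u\n", ebx & 0x3f);
+        return 0;
+    }
+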
+If support for SME is present, MSR 0xc0010010 (MSR_AMD64_SYSCFG) can be used to
+determine if SME is enabled and/or to enable memory encryption::
+
+ 0xc0010010:
+ Bit[23] 0 = memory encryption features are disabled
+ 1 = memory encryption features are enabled
+
+If SEV is supported, MSR 0xc0010131 (MSR_AMD64_SEV) can be used to determine if
+SEV is active::
+
+ 0xc0010131:
+ Bit[0] 0 = memory encryption is not active
+ 1 = memory encryption is active
+
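+As a hedged illustration, this MSR can be read from user space through the
+``msr`` driver, which exposes MSRs at /dev/cpu/*/msr with the MSR number as
+the file offset (requires the msr module to be loaded and root privileges)::
+
+    #include <fcntl.h>
+    #include <stdint.h>
+    #include <stdio.h>
+    #include <unistd.h>
+
+    int main(void)
+    {
+        uint64_t val;
+        int fd = open("/dev/cpu/0/msr", O_RDONLY);
+
+        if (fd < 0)
+            return 1;
+        /* 0xc0010131 is MSR_AMD64_SEV; bit 0 = SEV active */
+        if (pread(fd, &val, sizeof(val), 0xc0010131) != sizeof(val))
+            return 1;
+        printf("SEV active: %s\n", (val & 1) ? "yes" : "no");
+        close(fd);
+        return 0;
+    }
+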
+Linux relies on BIOS to set this bit if BIOS has determined that the reduction
+in the physical address space as a result of enabling memory encryption (see
+CPUID information above) will not conflict with the address space resource
+requirements for the system. If this bit is not set upon Linux startup then
+Linux itself will not set it and memory encryption will not be possible.
+
+The state of SME in the Linux kernel can be described as follows:
+
+ - Supported:
+ The CPU supports SME (determined through CPUID instruction).
+
+ - Enabled:
+ Supported and bit 23 of MSR_AMD64_SYSCFG is set.
+
+ - Active:
+ Supported, Enabled and the Linux kernel is actively applying
+ the encryption bit to page table entries (the SME mask in the
+ kernel is non-zero).
+
+SME can also be enabled and activated in the BIOS. If SME is enabled and
+activated in the BIOS, then all memory accesses will be encrypted and it will
+not be necessary to activate the Linux memory encryption support. If the BIOS
+merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate
+memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+by supplying mem_encrypt=on on the kernel command line. However, if the BIOS
+does not enable SME, then Linux will not be able to activate memory
+encryption, even if configured to do so by default or if the mem_encrypt=on
+command line parameter is specified.
+
+Secure Nested Paging (SNP)
+==========================
+
+SEV-SNP introduces new features (SEV_FEATURES[1:63]) which can be enabled
+by the hypervisor for security enhancements. Some of these features need a
+guest-side implementation to function correctly. The table below lists the
+expected guest behavior with various possible scenarios of guest/hypervisor
+SNP feature support.
+
++-----------------+---------------+---------------+------------------+
+| Feature Enabled | Guest needs | Guest has | Guest boot |
+| by the HV | implementation| implementation| behaviour |
++=================+===============+===============+==================+
+| No | No | No | Boot |
+| | | | |
++-----------------+---------------+---------------+------------------+
+| No | Yes | No | Boot |
+| | | | |
++-----------------+---------------+---------------+------------------+
+| No | Yes | Yes | Boot |
+| | | | |
++-----------------+---------------+---------------+------------------+
+| Yes | No | No | Boot with |
+| | | | feature enabled |
++-----------------+---------------+---------------+------------------+
+| Yes | Yes | No | Graceful boot |
+| | | | failure |
++-----------------+---------------+---------------+------------------+
+| Yes | Yes | Yes | Boot with |
+| | | | feature enabled |
++-----------------+---------------+---------------+------------------+
+
+More details can be found in the AMD64 APM [1], Vol 2: 15.34.10, SEV_STATUS MSR.
+
+[1] https://www.amd.com/system/files/TechDocs/40332.pdf
diff --git a/Documentation/arch/x86/amd_hsmp.rst b/Documentation/arch/x86/amd_hsmp.rst
new file mode 100644
index 000000000000..440e4b645a1c
--- /dev/null
+++ b/Documentation/arch/x86/amd_hsmp.rst
@@ -0,0 +1,86 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================================
+AMD HSMP interface
+============================================
+
+The newer Fam19h EPYC server line of processors from AMD supports system
+management functionality via HSMP (Host System Management Port).
+
+The Host System Management Port (HSMP) is an interface to provide
+OS-level software with access to system management functions via a
+set of mailbox registers.
+
+More details on the interface can be found in chapter
+"7 Host System Management Port (HSMP)" of the family/model PPR
+Eg: https://www.amd.com/system/files/TechDocs/55898_B1_pub_0.50.zip
+
+The HSMP interface is supported on EPYC server CPU models only.
+
+
+HSMP device
+============================================
+
+The amd_hsmp driver under drivers/platform/x86/ creates the miscdevice
+/dev/hsmp to let user space programs run HSMP mailbox commands::
+
+    $ ls -al /dev/hsmp
+    crw-r--r-- 1 root root 10, 123 Jan 21 21:41 /dev/hsmp
+
+Characteristics of the dev node:
+ * Write mode is used for running set/configure commands
+ * Read mode is used for running get/status monitor commands
+
+Access restrictions:
+ * Only root user is allowed to open the file in write mode.
+ * The file can be opened in read mode by all users.
+
+In-kernel integration:
+ * Other subsystems in the kernel can use the exported transport
+ function hsmp_send_message().
+ * Locking across callers is taken care of by the driver.
+
+
+An example
+==========
+
+To access the HSMP device from a C program,
+first include the header::
+
+ #include <linux/amd_hsmp.h>
+
+which defines the supported messages/message IDs.
+
+Next, open the device file, as follows::
+
+ int file;
+
+ file = open("/dev/hsmp", O_RDWR);
+ if (file < 0) {
+ /* ERROR HANDLING; you can check errno to see what went wrong */
+ exit(1);
+ }
+
+The following IOCTL is defined:
+
+``ioctl(file, HSMP_IOCTL_CMD, struct hsmp_message *msg)``
+ The argument is a pointer to a::
+
+ struct hsmp_message {
+ __u32 msg_id; /* Message ID */
+ __u16 num_args; /* Number of input argument words in message */
+ __u16 response_sz; /* Number of expected output/response words */
+ __u32 args[HSMP_MAX_MSG_LEN]; /* argument/response buffer */
+ __u16 sock_ind; /* socket number */
+ };
+
+The ioctl returns non-zero on failure; you can read errno to see
+what happened. The transaction returns 0 on success.
+
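+A hedged usage sketch, continuing the example above (add ``<sys/ioctl.h>``
+and ``<stdio.h>`` to the includes). It assumes the HSMP_TEST message, which
+is expected to echo back its input argument incremented by one; consult the
+PPR for the messages your platform actually supports::
+
+    struct hsmp_message msg = { 0 };
+    int ret;
+
+    msg.msg_id = HSMP_TEST;   /* echo-style test message (illustrative choice) */
+    msg.num_args = 1;         /* one input argument word */
+    msg.response_sz = 1;      /* one expected response word */
+    msg.args[0] = 0xDEAD;     /* arbitrary test value */
+    msg.sock_ind = 0;         /* target socket 0 */
+
+    ret = ioctl(file, HSMP_IOCTL_CMD, &msg);
+    if (ret)
+        perror("HSMP ioctl");
+    else
+        printf("response: 0x%x\n", msg.args[0]);
+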
+More details on the interface and message definitions can be found in chapter
+"7 Host System Management Port (HSMP)" of the respective family/model PPR
+eg: https://www.amd.com/system/files/TechDocs/55898_B1_pub_0.50.zip
+
+User space C-APIs are made available by linking against the esmi library,
+which is provided by the E-SMS project https://developer.amd.com/e-sms/.
+See: https://github.com/amd/esmi_ib_library
diff --git a/Documentation/x86/boot.rst b/Documentation/arch/x86/boot.rst
index 894a19897005..33520ecdb37a 100644
--- a/Documentation/x86/boot.rst
+++ b/Documentation/arch/x86/boot.rst
@@ -455,6 +455,7 @@ Protocol: 2.00+
11 Minimal Linux Bootloader
<http://sebastian-plotz.blogspot.de>
12 OVMF UEFI virtualization stack
+ 13 barebox
== =======================================
Please contact <hpa@zytor.com> if you need a bootloader ID value assigned.
@@ -1343,7 +1344,7 @@ follow::
In addition to read/modify/write the setup header of the struct
boot_params as that of 16-bit boot protocol, the boot loader should
also fill the additional fields of the struct boot_params as
-described in chapter Documentation/x86/zero-page.rst.
+described in chapter Documentation/arch/x86/zero-page.rst.
After setting up the struct boot_params, the boot loader can load the
32/64-bit kernel in the same way as that of 16-bit boot protocol.
@@ -1379,7 +1380,7 @@ can be calculated as follows::
In addition to read/modify/write the setup header of the struct
boot_params as that of 16-bit boot protocol, the boot loader should
also fill the additional fields of the struct boot_params as described
-in chapter Documentation/x86/zero-page.rst.
+in chapter Documentation/arch/x86/zero-page.rst.
After setting up the struct boot_params, the boot loader can load
64-bit kernel in the same way as that of 16-bit boot protocol, but
diff --git a/Documentation/x86/booting-dt.rst b/Documentation/arch/x86/booting-dt.rst
index 965a374071ab..b089ffd56e6e 100644
--- a/Documentation/x86/booting-dt.rst
+++ b/Documentation/arch/x86/booting-dt.rst
@@ -7,7 +7,7 @@ DeviceTree Booting
the decompressor (the real mode entry point goes to the same 32bit
entry point once it switched into protected mode). That entry point
supports one calling convention which is documented in
- Documentation/x86/boot.rst
+ Documentation/arch/x86/boot.rst
The physical pointer to the device-tree block is passed via setup_data
which requires at least boot protocol 2.09.
The type filed is defined as
diff --git a/Documentation/x86/buslock.rst b/Documentation/arch/x86/buslock.rst
index 7c051e714943..31ec0ef78086 100644
--- a/Documentation/x86/buslock.rst
+++ b/Documentation/arch/x86/buslock.rst
@@ -53,8 +53,14 @@ parameter "split_lock_detect". Here is a summary of different options:
|off |Do nothing |Do nothing |
+------------------+----------------------------+-----------------------+
|warn |Kernel OOPs |Warn once per task and |
-|(default) |Warn once per task and |and continues to run. |
-| |disable future checking | |
+|(default) |Warn once per task, add a |and continues to run. |
+| |delay, add synchronization | |
+| |to prevent more than one | |
+| |core from executing a | |
+| |split lock in parallel. | |
+| |sysctl split_lock_mitigate | |
+| |can be used to avoid the | |
+| |delay and synchronization | |
| |When both features are | |
| |supported, warn in #AC | |
+------------------+----------------------------+-----------------------+
diff --git a/Documentation/x86/cpuinfo.rst b/Documentation/arch/x86/cpuinfo.rst
index 5d54c39a063f..08246e8ac835 100644
--- a/Documentation/x86/cpuinfo.rst
+++ b/Documentation/arch/x86/cpuinfo.rst
@@ -140,9 +140,8 @@ from #define X86_FEATURE_UMIP (16*32 + 2).
In addition, there exists a variety of custom command-line parameters that
disable specific features. The list of parameters includes, but is not limited
-to, nofsgsbase, nosmap, and nosmep. 5-level paging can also be disabled using
-"no5lvl". SMAP and SMEP are disabled with the aforementioned parameters,
-respectively.
+to, nofsgsbase, nosgx, noxsave, etc. 5-level paging can also be disabled using
+"no5lvl".
e: The feature was known to be non-functional.
----------------------------------------------
diff --git a/Documentation/x86/earlyprintk.rst b/Documentation/arch/x86/earlyprintk.rst
index 51ef11e8f725..51ef11e8f725 100644
--- a/Documentation/x86/earlyprintk.rst
+++ b/Documentation/arch/x86/earlyprintk.rst
diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/arch/x86/elf_auxvec.rst
index 18e4744717f9..18e4744717f9 100644
--- a/Documentation/x86/elf_auxvec.rst
+++ b/Documentation/arch/x86/elf_auxvec.rst
diff --git a/Documentation/x86/entry_64.rst b/Documentation/arch/x86/entry_64.rst
index a48b3f6ebbe8..0afdce3c06f4 100644
--- a/Documentation/x86/entry_64.rst
+++ b/Documentation/arch/x86/entry_64.rst
@@ -8,7 +8,7 @@ This file documents some of the kernel entries in
arch/x86/entry/entry_64.S. A lot of this explanation is adapted from
an email from Ingo Molnar:
-http://lkml.kernel.org/r/<20110529191055.GC9835%40elte.hu>
+https://lore.kernel.org/r/20110529191055.GC9835%40elte.hu
The x86 architecture has quite a few different ways to jump into
kernel code. Most of these entry points are registered in
@@ -33,8 +33,8 @@ Some of these entries are:
- interrupt: An array of entries. Every IDT vector that doesn't
explicitly point somewhere else gets set to the corresponding
value in interrupts. These point to a whole array of
- magically-generated functions that make their way to do_IRQ with
- the interrupt number as a parameter.
+ magically-generated functions that make their way to common_interrupt()
+ with the interrupt number as a parameter.
- APIC interrupts: Various special-purpose interrupts for things
like TLB shootdown.
diff --git a/Documentation/x86/exception-tables.rst b/Documentation/arch/x86/exception-tables.rst
index de58110c5ffd..efde1fef4fbd 100644
--- a/Documentation/x86/exception-tables.rst
+++ b/Documentation/arch/x86/exception-tables.rst
@@ -32,14 +32,14 @@ Whenever the kernel tries to access an address that is currently not
accessible, the CPU generates a page fault exception and calls the
page fault handler::
- void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+ void exc_page_fault(struct pt_regs *regs, unsigned long error_code)
in arch/x86/mm/fault.c. The parameters on the stack are set up by
the low level assembly glue in arch/x86/entry/entry_32.S. The parameter
regs is a pointer to the saved registers on the stack, error_code
contains a reason code for the exception.
-do_page_fault first obtains the unaccessible address from the CPU
+exc_page_fault() first obtains the inaccessible address from the CPU
control register CR2. If the address is within the virtual address
space of the process, the fault probably occurred, because the page
was not swapped in, write protected or something similar. However,
@@ -57,10 +57,10 @@ Where does fixup point to?
Since we jump to the contents of fixup, fixup obviously points
to executable code. This code is hidden inside the user access macros.
-I have picked the get_user macro defined in arch/x86/include/asm/uaccess.h
+I have picked the get_user() macro defined in arch/x86/include/asm/uaccess.h
as an example. The definition is somewhat hard to follow, so let's peek at
the code generated by the preprocessor and the compiler. I selected
-the get_user call in drivers/char/sysrq.c for a detailed examination.
+the get_user() call in drivers/char/sysrq.c for a detailed examination.
The original code in sysrq.c line 587::
@@ -281,12 +281,15 @@ vma occurs?
> c017e7a5 <do_con_write+e1> movb (%ebx),%dl
#. MMU generates exception
-#. CPU calls do_page_fault
-#. do page fault calls search_exception_table (regs->eip == c017e7a5);
-#. search_exception_table looks up the address c017e7a5 in the
+#. CPU calls exc_page_fault()
+#. exc_page_fault() calls do_user_addr_fault()
+#. do_user_addr_fault() calls kernelmode_fixup_or_oops()
+#. kernelmode_fixup_or_oops() calls fixup_exception() (regs->eip == c017e7a5);
+#. fixup_exception() calls search_exception_tables()
+#. search_exception_tables() looks up the address c017e7a5 in the
exception table (i.e. the contents of the ELF section __ex_table)
and returns the address of the associated fault handle code c0199ff5.
-#. do_page_fault modifies its own return address to point to the fault
+#. fixup_exception() modifies its own return address to point to the fault
handle code and returns.
#. execution continues in the fault handling code.
#. a) EAX becomes -EFAULT (== -14)
@@ -298,9 +301,9 @@ The steps 8a to 8c in a certain way emulate the faulting instruction.
That's it, mostly. If you look at our example, you might ask why
we set EAX to -EFAULT in the exception handler code. Well, the
-get_user macro actually returns a value: 0, if the user access was
+get_user() macro actually returns a value: 0, if the user access was
successful, -EFAULT on failure. Our original code did not test this
-return value, however the inline assembly code in get_user tries to
+return value, however the inline assembly code in get_user() tries to
return -EFAULT. GCC selected EAX to return this value.
NOTE:
diff --git a/Documentation/x86/features.rst b/Documentation/arch/x86/features.rst
index b663f15053ce..b663f15053ce 100644
--- a/Documentation/x86/features.rst
+++ b/Documentation/arch/x86/features.rst
diff --git a/Documentation/x86/i386/IO-APIC.rst b/Documentation/arch/x86/i386/IO-APIC.rst
index ce4d8df15e7c..ce4d8df15e7c 100644
--- a/Documentation/x86/i386/IO-APIC.rst
+++ b/Documentation/arch/x86/i386/IO-APIC.rst
diff --git a/Documentation/x86/i386/index.rst b/Documentation/arch/x86/i386/index.rst
index 8747cf5bbd49..8747cf5bbd49 100644
--- a/Documentation/x86/i386/index.rst
+++ b/Documentation/arch/x86/i386/index.rst
diff --git a/Documentation/arch/x86/ifs.rst b/Documentation/arch/x86/ifs.rst
new file mode 100644
index 000000000000..97abb696a680
--- /dev/null
+++ b/Documentation/arch/x86/ifs.rst
@@ -0,0 +1,2 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. kernel-doc:: drivers/platform/x86/intel/ifs/ifs.h
diff --git a/Documentation/arch/x86/index.rst b/Documentation/arch/x86/index.rst
new file mode 100644
index 000000000000..c73d133fd37c
--- /dev/null
+++ b/Documentation/arch/x86/index.rst
@@ -0,0 +1,44 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+x86-specific Documentation
+==========================
+
+.. toctree::
+ :maxdepth: 2
+ :numbered:
+
+ boot
+ booting-dt
+ cpuinfo
+ topology
+ exception-tables
+ kernel-stacks
+ entry_64
+ earlyprintk
+ orc-unwinder
+ zero-page
+ tlb
+ mtrr
+ pat
+ intel-hfi
+ iommu
+ intel_txt
+ amd-memory-encryption
+ amd_hsmp
+ tdx
+ pti
+ mds
+ microcode
+ resctrl
+ tsx_async_abort
+ buslock
+ usb-legacy-support
+ i386/index
+ x86_64/index
+ ifs
+ sva
+ sgx
+ features
+ elf_auxvec
+ xstate
diff --git a/Documentation/arch/x86/intel-hfi.rst b/Documentation/arch/x86/intel-hfi.rst
new file mode 100644
index 000000000000..49dea58ea4fb
--- /dev/null
+++ b/Documentation/arch/x86/intel-hfi.rst
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================================================
+Hardware-Feedback Interface for scheduling on Intel Hardware
+============================================================
+
+Overview
+--------
+
+Intel has described the Hardware Feedback Interface (HFI) in the Intel 64 and
+IA-32 Architectures Software Developer's Manual (Intel SDM) Volume 3 Section
+14.6 [1]_.
+
+The HFI gives the operating system performance and energy efficiency
+capability data for each CPU in the system. Linux can use the information from
+the HFI to influence task placement decisions.
+
+The Hardware Feedback Interface
+-------------------------------
+
+The Hardware Feedback Interface provides the operating system with information
+about the performance and energy efficiency of each CPU in the system. Each
+capability is given as a unit-less quantity in the range [0-255]. Higher values
+indicate higher capability. Energy efficiency and performance are reported in
+separate capabilities. Even though on some systems these two metrics may be
+related, they are specified as independent capabilities in the Intel SDM.
+
+These capabilities may change at runtime as a result of changes in the
+operating conditions of the system or the action of external factors. The rate
+at which these capabilities are updated is specific to each processor model. On
+some models, capabilities are set at boot time and never change. On others,
+capabilities may change every tens of milliseconds. For instance, a remote
+mechanism may be used to lower Thermal Design Power. Such change can be
+reflected in the HFI. Likewise, if the system needs to be throttled due to
+excessive heat, the HFI may reflect reduced performance on specific CPUs.
+
+The kernel or a userspace policy daemon can use these capabilities to modify
+task placement decisions. For instance, if either the performance or energy
+capabilities of a given logical processor becomes zero, it is an indication that
+the hardware recommends to the operating system to not schedule any tasks on
+that processor for performance or energy efficiency reasons, respectively.
+
+Implementation details for Linux
+--------------------------------
+
+The infrastructure to handle thermal event interrupts has two parts. In the
+Local Vector Table of a CPU's local APIC, there exists an entry for the
+Thermal Monitor Register. This register controls how interrupts are delivered
+to a CPU when the thermal monitor generates an interrupt. Further details
+can be found in the Intel SDM Vol. 3 Section 10.5 [1]_.
+
+The thermal monitor may generate interrupts per CPU or per package. The HFI
+generates package-level interrupts. This monitor is configured and initialized
+via a set of model-specific registers. Specifically, the HFI interrupt and
+status are controlled via designated bits in the IA32_PACKAGE_THERM_INTERRUPT
+and IA32_PACKAGE_THERM_STATUS registers, respectively. There exists one HFI
+table per package. Further details can be found in the Intel SDM Vol. 3
+Section 14.9 [1]_.
+
+The hardware issues an HFI interrupt after updating the HFI table, once it
+is ready for the operating system to consume. CPUs receive such an interrupt
+via the thermal entry in the Local APIC's Local Vector Table.
+
+When servicing such an interrupt, the HFI driver parses the updated table and
+relays the update to userspace using the thermal notification framework. Given
+that there may be many HFI updates every second, the updates relayed to
+userspace are throttled to one every CONFIG_HZ jiffies (i.e. roughly one per
+second).
+
+References
+----------
+
+.. [1] https://www.intel.com/sdm
diff --git a/Documentation/x86/intel_txt.rst b/Documentation/arch/x86/intel_txt.rst
index d83c1a2122c9..d83c1a2122c9 100644
--- a/Documentation/x86/intel_txt.rst
+++ b/Documentation/arch/x86/intel_txt.rst
diff --git a/Documentation/arch/x86/iommu.rst b/Documentation/arch/x86/iommu.rst
new file mode 100644
index 000000000000..42c7a6faa39a
--- /dev/null
+++ b/Documentation/arch/x86/iommu.rst
@@ -0,0 +1,151 @@
+=================
+x86 IOMMU Support
+=================
+
+The architecture specs can be obtained from the below locations.
+
+- Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
+- AMD: https://www.amd.com/system/files/TechDocs/48882_IOMMU.pdf
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different IOMMUs on the platform, and
+device scope relationships between devices and which IOMMU controls
+them.
+
+Some ACPI Keywords:
+
+- DMAR - Intel DMA Remapping table
+- DRHD - Intel DMA Remapping Hardware Unit Definition
+- RMRR - Intel Reserved Memory Region Reporting Structure
+- IVRS - AMD I/O Virtualization Reporting Structure
+- IVDB - AMD I/O Virtualization Definition Block
+- IVHD - AMD I/O Virtualization Hardware Definition
+
+What is Intel RMRR?
+^^^^^^^^^^^^^^^^^^^
+
+There are some devices the BIOS controls, e.g. USB devices used to perform
+PS/2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence the BIOS uses RMRRs to specify these regions along
+with the devices that need to access them. The OS is expected to set up
+unity mappings for these regions so these devices can access them.
+
+What is AMD IVRS?
+^^^^^^^^^^^^^^^^^
+
+The architecture defines an ACPI-compatible data structure called an I/O
+Virtualization Reporting Structure (IVRS) that is used to convey information
+related to I/O virtualization to system software. The IVRS describes the
+configuration and capabilities of the IOMMUs contained in the platform as
+well as information about the devices that each IOMMU virtualizes.
+
+The IVRS provides information about the following:
+
+- IOMMUs present in the platform including their capabilities and proper configuration
+- System I/O topology relevant to each IOMMU
+- Peripheral devices that cannot be otherwise enumerated
+- Memory regions used by SMI/SMM, platform firmware, and platform hardware. These are generally exclusion ranges to be configured by system software.
+
+How is an I/O Virtual Address (IOVA) generated?
+-----------------------------------------------
+
+Well-behaved drivers call dma_map_*() before sending a command to a device
+that needs to perform DMA. Once the DMA is completed and the mapping is no
+longer required, the driver calls dma_unmap_*() to unmap the region.
+
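+A minimal sketch of that pattern in a hypothetical driver, using the
+streaming DMA API (``issue_command()`` and its arguments are illustrative,
+not a real API; the device-programming step is elided)::
+
+    #include <linux/dma-mapping.h>
+
+    static int issue_command(struct device *dev, void *buf, size_t size)
+    {
+        dma_addr_t handle;
+
+        /* Map the buffer for device reads before issuing the command */
+        handle = dma_map_single(dev, buf, size, DMA_TO_DEVICE);
+        if (dma_mapping_error(dev, handle))
+            return -ENOMEM;
+
+        /* ... hand 'handle' to the device and wait for completion ... */
+
+        /* Unmap once the DMA is complete and the mapping is unneeded */
+        dma_unmap_single(dev, handle, size, DMA_TO_DEVICE);
+        return 0;
+    }
+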
+Intel Specific Notes
+--------------------
+
+Graphics Problems?
+^^^^^^^^^^^^^^^^^^
+
+If you encounter issues with graphics devices, you can try adding the
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+If this fixes anything, please ensure you file a bug reporting the problem.
+
+Some exceptions to IOVA
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Interrupt ranges (0xfee00000 - 0xfeefffff) are not address translated.
+The same is true for peer-to-peer transactions. Hence we reserve these
+addresses from PCI MMIO ranges so they are not allocated as IOVA addresses.
+
+AMD Specific Notes
+------------------
+
+Graphics Problems?
+^^^^^^^^^^^^^^^^^^
+
+If you encounter issues with integrated graphics devices, you can try adding
+the option iommu=pt to the kernel command line to use a 1:1 mapping for the
+IOMMU. If this fixes anything, please ensure you file a bug reporting the
+problem.
+
+Fault reporting
+---------------
+
+When errors are reported, the IOMMU signals via an interrupt. The fault
+reason and the device that caused it are printed on the console.
+
+
+Kernel Log Samples
+------------------
+
+Intel Boot Messages
+^^^^^^^^^^^^^^^^^^^
+
+Something like this gets printed indicating presence of DMAR tables
+in ACPI:
+
+::
+
+ ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When the DMAR table is being processed and initialized by ACPI, the kernel
+prints the DMAR locations and any RMRRs processed:
+
+::
+
+ ACPI DMAR:Host address width 36
+ ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice:
+
+::
+
+ PCI-DMA: Using DMAR IOMMU
+
+Intel Fault reporting
+^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+ DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+ DMAR:[fault reason 05] PTE Write access is not set
+ DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+ DMAR:[fault reason 05] PTE Write access is not set
+
+AMD Boot Messages
+^^^^^^^^^^^^^^^^^
+
+Something like this gets printed indicating presence of the IOMMU:
+
+::
+
+ iommu: Default domain type: Translated
+ iommu: DMA domain TLB invalidation policy: lazy mode
+
+AMD Fault reporting
+^^^^^^^^^^^^^^^^^^^
+
+::
+
+ AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0007 address=0xffffc02000 flags=0x0000]
+ AMD-Vi: Event logged [IO_PAGE_FAULT device=07:00.0 domain=0x0007 address=0xffffc02000 flags=0x0000]
diff --git a/Documentation/x86/kernel-stacks.rst b/Documentation/arch/x86/kernel-stacks.rst
index 6b0bcf027ff1..738671a4070b 100644
--- a/Documentation/x86/kernel-stacks.rst
+++ b/Documentation/arch/x86/kernel-stacks.rst
@@ -12,7 +12,7 @@ Most of the text from Keith Owens, hacked by AK
x86_64 page size (PAGE_SIZE) is 4K.
Like all other architectures, x86_64 has a kernel stack for every
-active thread. These thread stacks are THREAD_SIZE (2*PAGE_SIZE) big.
+active thread. These thread stacks are THREAD_SIZE (4*PAGE_SIZE) big.
These stacks contain useful data as long as a thread is alive or a
zombie. While the thread is in user space the kernel stack is empty
except for the thread_info structure at the bottom.
diff --git a/Documentation/x86/mds.rst b/Documentation/arch/x86/mds.rst
index 5d4330be200f..5d4330be200f 100644
--- a/Documentation/x86/mds.rst
+++ b/Documentation/arch/x86/mds.rst
diff --git a/Documentation/arch/x86/microcode.rst b/Documentation/arch/x86/microcode.rst
new file mode 100644
index 000000000000..b627c6f36bcf
--- /dev/null
+++ b/Documentation/arch/x86/microcode.rst
@@ -0,0 +1,240 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+The Linux Microcode Loader
+==========================
+
+:Authors: - Fenghua Yu <fenghua.yu@intel.com>
+ - Borislav Petkov <bp@suse.de>
+ - Ashok Raj <ashok.raj@intel.com>
+
+The kernel has an x86 microcode loading facility which is supposed to
+provide microcode loading methods in the OS. Potential use cases are
+updating the microcode on platforms beyond the OEM End-Of-Life support,
+and updating the microcode on long-running systems without rebooting.
+
+The loader supports three loading methods:
+
+Early load microcode
+====================
+
+The kernel can update microcode very early during boot. Loading
+microcode early can fix CPU issues before they are observed during
+kernel boot time.
+
+The microcode is stored in an initrd file. During boot, it is read from
+the initrd and loaded into the CPU cores.
+
+The format of the combined initrd image is microcode in (uncompressed)
+cpio format followed by the (possibly compressed) initrd image. The
+loader parses the combined initrd image during boot.
+
+The microcode files in cpio name space are:
+
+on Intel:
+ kernel/x86/microcode/GenuineIntel.bin
+on AMD :
+ kernel/x86/microcode/AuthenticAMD.bin
+
+During BSP (BootStrapping Processor) boot (pre-SMP), the kernel
+scans the microcode file in the initrd. If microcode matching the
+CPU is found, it will be applied in the BSP and later on in all APs
+(Application Processors).
+
+The loader also saves the matching microcode for the CPU in memory.
+Thus, the cached microcode patch is applied when CPUs resume from a
+sleep state.
+
+Here's a crude example of how to prepare an initrd with microcode. (This is
+normally done automatically by the distribution when recreating the
+initrd, so you don't really have to do it yourself. It is documented
+here for future reference only.)
+::
+
+ #!/bin/bash
+
+ if [ -z "$1" ]; then
+ echo "You need to supply an initrd file"
+ exit 1
+ fi
+
+ INITRD="$1"
+
+ DSTDIR=kernel/x86/microcode
+ TMPDIR=/tmp/initrd
+
+ rm -rf $TMPDIR
+
+ mkdir $TMPDIR
+ cd $TMPDIR
+ mkdir -p $DSTDIR
+
+ if [ -d /lib/firmware/amd-ucode ]; then
+ cat /lib/firmware/amd-ucode/microcode_amd*.bin > $DSTDIR/AuthenticAMD.bin
+ fi
+
+ if [ -d /lib/firmware/intel-ucode ]; then
+ cat /lib/firmware/intel-ucode/* > $DSTDIR/GenuineIntel.bin
+ fi
+
+ find . | cpio -o -H newc >../ucode.cpio
+ cd ..
+ mv $INITRD $INITRD.orig
+ cat ucode.cpio $INITRD.orig > $INITRD
+
+ rm -rf $TMPDIR
+
+
+The system needs to have the microcode packages installed into
+/lib/firmware, or you need to fix up the paths above if yours are
+somewhere else and/or you've downloaded them directly from the processor
+vendor's site.
+
+Late loading
+============
+
+You simply install the microcode packages your distro supplies and
+run::
+
+ # echo 1 > /sys/devices/system/cpu/microcode/reload
+
+as root.
+
+The loading mechanism looks for microcode blobs in
+/lib/firmware/{intel-ucode,amd-ucode}. The default distro installation
+packages already put them there.
+
+Since kernel 5.19, late loading is not enabled by default.
+
+The /dev/cpu/microcode method has been removed in 5.19.
+
+Why is late loading dangerous?
+==============================
+
+Synchronizing all CPUs
+----------------------
+
+The microcode engine which receives the microcode update is shared
+between the two logical threads in an SMT system. Therefore, when
+the update is executed on one SMT thread of the core, the sibling
+"automatically" gets the update.
+
+Since the microcode can "simulate" MSRs too, while the microcode update
+is in progress, those simulated MSRs transiently cease to exist. This
+can result in unpredictable results if the SMT sibling thread happens to
+be in the middle of an access to such an MSR. The usual observation is
+that such MSR accesses cause #GPs to be raised to signal that the former
+are not present.
+
+The disappearing MSRs are just one commonly observed issue. Any other
+instruction that's being patched and gets concurrently executed by the
+other SMT sibling can also result in similar, unpredictable behavior.
+
+To eliminate this case, a stop_machine()-based CPU synchronization was
+introduced as a way to guarantee that all logical CPUs will not execute
+any code but just wait in a spin loop, polling an atomic variable.
+
+While this took care of device and external interrupts, and IPIs
+including LVT ones such as CMCI, it cannot address other special
+interrupts that can't be shut off. Those are Machine Check (#MC),
+System Management (#SMI) and Non-Maskable interrupts (#NMI).
+
+Machine Checks
+--------------
+
+Machine Checks (#MC) are non-maskable. There are two kinds of MCEs:
+fatal, un-recoverable MCEs and recoverable MCEs. While un-recoverable
+errors are fatal by definition, recoverable errors that happen in kernel
+context are also treated as fatal by the kernel.
+
+On certain Intel machines, MCEs are also broadcast to all threads in a
+system. If one thread is in the middle of executing WRMSR, an MCE will
+be taken at the end of the flow. Either way, all threads will wait for
+the thread performing the wrmsr(0x79) to rendezvous in the MCE handler,
+and the system will eventually shut down if any of the threads fail to
+check in to the MCE rendezvous.
+
+To be paranoid and get predictable behavior, the OS can choose to set
+MCG_STATUS.MCIP. Since at most one MCE can be in progress in a system
+at a time, a second MCE signaled while this bit is set will promote to
+a system reset automatically. The OS can clear MCIP at the end of the
+update for that core.
+
+System Management Interrupt
+---------------------------
+
+SMIs are also broadcast to all CPUs in the platform. The microcode
+update requests exclusive access to the core before writing to MSR 0x79.
+So if it does happen that one thread is in the WRMSR flow and the second
+one receives an SMI, the latter will be stopped at the first instruction
+of the SMI handler.
+
+Since the secondary thread is stopped at the first instruction of the
+SMI handler, there is very little chance that it would be in the middle
+of executing an instruction being patched. Plus, the OS has no way to
+stop SMIs from happening.
+
+Non-Maskable Interrupts
+-----------------------
+
+When thread0 of a core is doing the microcode update and thread1 is
+pulled into an NMI handler, that can cause unpredictable behavior due
+to the reasons above.
+
+The OS can choose among a variety of methods to avoid running into this
+situation.
+
+
+Is the microcode suitable for late loading?
+-------------------------------------------
+
+Late loading is done when the system is fully operational and running
+real workloads. Late loading behavior depends on what the base patch on
+the CPU is before upgrading to the new patch.
+
+This is true for Intel CPUs.
+
+Consider, for example, a CPU has patch level 1 and the update is to
+patch level 3.
+
+Between patch1 and patch3, patch2 might have deprecated a software-visible
+feature.
+
+This is unacceptable if software is even potentially using that feature.
+For instance, say MSR_X is no longer available after an update;
+accessing that MSR will then cause a #GP fault.
+
+Basically there is no way to declare a new microcode update suitable
+for late-loading. This is another one of the problems that caused late
+loading not to be enabled by default.
+
+Builtin microcode
+=================
+
+The loader also supports loading builtin microcode supplied through
+the regular builtin firmware method CONFIG_EXTRA_FIRMWARE. Only 64-bit
+is currently supported.
+
+Here's an example::
+
+ CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
+ CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
+
+This basically means that you have the following tree structure locally::
+
+ /lib/firmware/
+ |-- amd-ucode
+ ...
+ | |-- microcode_amd_fam15h.bin
+ ...
+ |-- intel-ucode
+ ...
+ | |-- 06-3a-09
+ ...
+
+so that the build system can find those files and integrate them into
+the final kernel image. The early loader finds them and applies them.
+
+Needless to say, this method is not the most flexible one because it
+requires rebuilding the kernel each time updated microcode from the CPU
+vendor is available.
diff --git a/Documentation/x86/mtrr.rst b/Documentation/arch/x86/mtrr.rst
index 9f0b1851771a..f65ef034da7a 100644
--- a/Documentation/x86/mtrr.rst
+++ b/Documentation/arch/x86/mtrr.rst
@@ -28,7 +28,7 @@ are aligned with platform MTRR setup. If MTRRs are only set up by the platform
firmware code though and the OS does not make any specific MTRR mapping
requests mtrr_type_lookup() should always return MTRR_TYPE_INVALID.
-For details refer to Documentation/x86/pat.rst.
+For details refer to Documentation/arch/x86/pat.rst.
.. tip::
On Intel P6 family processors (Pentium Pro, Pentium II and later)
diff --git a/Documentation/x86/orc-unwinder.rst b/Documentation/arch/x86/orc-unwinder.rst
index d811576c1f3e..cdb257015bd9 100644
--- a/Documentation/x86/orc-unwinder.rst
+++ b/Documentation/arch/x86/orc-unwinder.rst
@@ -140,7 +140,7 @@ Unwinder implementation details
Objtool generates the ORC data by integrating with the compile-time
stack metadata validation feature, which is described in detail in
-tools/objtool/Documentation/stack-validation.txt. After analyzing all
+tools/objtool/Documentation/objtool.txt. After analyzing all
the code paths of a .o file, it creates an array of orc_entry structs,
and a parallel array of instruction addresses associated with those
structs, and writes them to the .orc_unwind and .orc_unwind_ip sections
@@ -177,6 +177,6 @@ brutal, unyielding efficiency.
ORC stands for Oops Rewind Capability.
-.. [1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de
-.. [2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz
+.. [1] https://lore.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de
+.. [2] https://lore.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz
.. [3] http://dustin.wikidot.com/half-orcs-and-orcs
diff --git a/Documentation/x86/pat.rst b/Documentation/arch/x86/pat.rst
index 5d901771016d..5d901771016d 100644
--- a/Documentation/x86/pat.rst
+++ b/Documentation/arch/x86/pat.rst
diff --git a/Documentation/x86/pti.rst b/Documentation/arch/x86/pti.rst
index 4b858a9bad8d..4b858a9bad8d 100644
--- a/Documentation/x86/pti.rst
+++ b/Documentation/arch/x86/pti.rst
diff --git a/Documentation/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst
index 71a531061e4e..387ccbcb558f 100644
--- a/Documentation/x86/resctrl.rst
+++ b/Documentation/arch/x86/resctrl.rst
@@ -17,14 +17,21 @@ AMD refers to this feature as AMD Platform Quality of Service(AMD QoS).
This feature is enabled by the CONFIG_X86_CPU_RESCTRL and the x86 /proc/cpuinfo
flag bits:
-============================================= ================================
+=============================================== ================================
RDT (Resource Director Technology) Allocation "rdt_a"
CAT (Cache Allocation Technology) "cat_l3", "cat_l2"
CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2"
CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc"
MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local"
MBA (Memory Bandwidth Allocation) "mba"
-============================================= ================================
+SMBA (Slow Memory Bandwidth Allocation) ""
+BMEC (Bandwidth Monitoring Event Configuration) ""
+=============================================== ================================
+
+Historically, new features were made visible by default in /proc/cpuinfo. This
+resulted in the feature flags becoming hard to parse by humans. Adding a new
+flag to /proc/cpuinfo should be avoided if user space can obtain information
+about the feature from resctrl's info directory.
To use the feature mount the file system::
@@ -161,6 +168,83 @@ with the following files:
"mon_features":
Lists the monitoring events if
monitoring is enabled for the resource.
+ Example::
+
+ # cat /sys/fs/resctrl/info/L3_MON/mon_features
+ llc_occupancy
+ mbm_total_bytes
+ mbm_local_bytes
+
+ If the system supports Bandwidth Monitoring Event
+ Configuration (BMEC), then the bandwidth events will
+ be configurable. The output will be::
+
+ # cat /sys/fs/resctrl/info/L3_MON/mon_features
+ llc_occupancy
+ mbm_total_bytes
+ mbm_total_bytes_config
+ mbm_local_bytes
+ mbm_local_bytes_config
+
+"mbm_total_bytes_config", "mbm_local_bytes_config":
+ Read/write files containing the configuration for the mbm_total_bytes
+ and mbm_local_bytes events, respectively, when the Bandwidth
+ Monitoring Event Configuration (BMEC) feature is supported.
+ The event configuration settings are domain specific and affect
+ all the CPUs in the domain. When either event configuration is
+ changed, the bandwidth counters for all RMIDs of both events
+ (mbm_total_bytes as well as mbm_local_bytes) are cleared for that
+ domain. The next read for every RMID will report "Unavailable"
+ and subsequent reads will report the valid value.
+
+ Following are the types of events supported:
+
+ ==== ========================================================
+ Bits Description
+ ==== ========================================================
+ 6 Dirty Victims from the QOS domain to all types of memory
+ 5 Reads to slow memory in the non-local NUMA domain
+ 4 Reads to slow memory in the local NUMA domain
+ 3 Non-temporal writes to non-local NUMA domain
+ 2 Non-temporal writes to local NUMA domain
+ 1 Reads to memory in the non-local NUMA domain
+ 0 Reads to memory in the local NUMA domain
+ ==== ========================================================
+
+ By default, the mbm_total_bytes configuration is set to 0x7f to count
+ all the event types and the mbm_local_bytes configuration is set to
+ 0x15 to count all the local memory events.
+
+ Examples:
+
+	  * To view the current configuration::
+
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
+ 0=0x7f;1=0x7f;2=0x7f;3=0x7f
+
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config
+ 0=0x15;1=0x15;3=0x15;4=0x15
+
+ * To change the mbm_total_bytes to count only reads on domain 0,
+	    the bits 0, 1, 4 and 5 need to be set, which is 110011b in binary
+ (in hexadecimal 0x33):
+ ::
+
+ # echo "0=0x33" > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
+
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
+ 0=0x33;1=0x7f;2=0x7f;3=0x7f
+
+ * To change the mbm_local_bytes to count all the slow memory reads on
+	    domain 0 and 1, the bits 4 and 5 need to be set, which is 110000b
+ in binary (in hexadecimal 0x30):
+ ::
+
+ # echo "0=0x30;1=0x30" > /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config
+
+ # cat /sys/fs/resctrl/info/L3_MON/mbm_local_bytes_config
+ 0=0x30;1=0x30;3=0x15;4=0x15
"max_threshold_occupancy":
Read/write file provides the largest value (in
@@ -464,6 +548,25 @@ Memory bandwidth domain is L3 cache.
MB:<cache_id0>=bw_MBps0;<cache_id1>=bw_MBps1;...
+Slow Memory Bandwidth Allocation (SMBA)
+---------------------------------------
+AMD hardware supports Slow Memory Bandwidth Allocation (SMBA).
+CXL.memory is the only supported "slow" memory device. With the
+support of SMBA, the hardware enables bandwidth allocation on
+the slow memory devices. If there are multiple such devices in
+the system, the throttling logic groups all the slow sources
+together and applies the limit on them as a whole.
+
+The presence of SMBA (with CXL.memory) is independent of the presence
+of slow memory devices. If there are no such devices on the system,
+then configuring SMBA will have no impact on the performance of the
+system.
+
+The bandwidth domain for slow memory is L3 cache. Its schemata file
+is formatted as:
+::
+
+ SMBA:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;...
+
Reading/writing the schemata file
---------------------------------
Reading the schemata file will show the state of all resources
@@ -479,6 +582,46 @@ which you wish to change. E.g.
L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
+Reading/writing the schemata file (on AMD systems)
+--------------------------------------------------
+Reading the schemata file will show the current bandwidth limit on all
+domains. The allocated resources are in multiples of one eighth GB/s.
+When writing to the file, you need to specify the cache id for which
+you wish to configure the bandwidth limit.
+
+For example, to allocate 2GB/s limit on the first cache id:
+
+::
+
+ # cat schemata
+ MB:0=2048;1=2048;2=2048;3=2048
+ L3:0=ffff;1=ffff;2=ffff;3=ffff
+
+ # echo "MB:1=16" > schemata
+ # cat schemata
+ MB:0=2048;1= 16;2=2048;3=2048
+ L3:0=ffff;1=ffff;2=ffff;3=ffff
+
+Reading/writing the schemata file (on AMD systems) with SMBA feature
+--------------------------------------------------------------------
+Reading and writing the schemata file is the same as described in the
+section above for systems without SMBA.
+
+For example, to allocate 8GB/s limit on the first cache id:
+
+::
+
+ # cat schemata
+ SMBA:0=2048;1=2048;2=2048;3=2048
+ MB:0=2048;1=2048;2=2048;3=2048
+ L3:0=ffff;1=ffff;2=ffff;3=ffff
+
+ # echo "SMBA:1=64" > schemata
+ # cat schemata
+ SMBA:0=2048;1= 64;2=2048;3=2048
+ MB:0=2048;1=2048;2=2048;3=2048
+ L3:0=ffff;1=ffff;2=ffff;3=ffff
+
Cache Pseudo-Locking
====================
CAT enables a user to specify the amount of cache space that an
@@ -608,12 +751,12 @@ how we can measure the latency in cycles of reading from this region and
visualize this data with a histogram that is available if CONFIG_HIST_TRIGGERS
is set::
- # :> /sys/kernel/debug/tracing/trace
- # echo 'hist:keys=latency' > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/trigger
- # echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
+ # :> /sys/kernel/tracing/trace
+ # echo 'hist:keys=latency' > /sys/kernel/tracing/events/resctrl/pseudo_lock_mem_latency/trigger
+ # echo 1 > /sys/kernel/tracing/events/resctrl/pseudo_lock_mem_latency/enable
# echo 1 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
- # echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
- # cat /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/hist
+ # echo 0 > /sys/kernel/tracing/events/resctrl/pseudo_lock_mem_latency/enable
+ # cat /sys/kernel/tracing/events/resctrl/pseudo_lock_mem_latency/hist
# event histogram
#
@@ -642,11 +785,11 @@ cache of a platform. Here is how we can obtain details of the cache hits
and misses using the platform's precision counters.
::
- # :> /sys/kernel/debug/tracing/trace
- # echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
+ # :> /sys/kernel/tracing/trace
+ # echo 1 > /sys/kernel/tracing/events/resctrl/pseudo_lock_l2/enable
# echo 2 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
- # echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
- # cat /sys/kernel/debug/tracing/trace
+ # echo 0 > /sys/kernel/tracing/events/resctrl/pseudo_lock_l2/enable
+ # cat /sys/kernel/tracing/trace
# tracer: nop
#
diff --git a/Documentation/x86/sgx.rst b/Documentation/arch/x86/sgx.rst
index dd0ac96ff9ef..2bcbffacbed5 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/arch/x86/sgx.rst
@@ -10,7 +10,7 @@ Overview
Software Guard eXtensions (SGX) hardware enables user space applications
to set aside private memory regions of code and data:
-* Privileged (ring-0) ENCLS functions orchestrate the construction of the.
+* Privileged (ring-0) ENCLS functions orchestrate the construction of the
regions.
* Unprivileged (ring-3) ENCLU functions allow an application to enter and
execute inside the regions.
@@ -91,7 +91,7 @@ In addition to the traditional compiler and linker build process, SGX has a
separate enclave “build” process. Enclaves must be built before they can be
executed (entered). The first step in building an enclave is opening the
**/dev/sgx_enclave** device. Since enclave memory is protected from direct
-access, special privileged instructions are Then used to copy data into enclave
+access, special privileged instructions are then used to copy data into enclave
pages and establish enclave page permissions.
.. kernel-doc:: arch/x86/kernel/cpu/sgx/ioctl.c
@@ -100,6 +100,21 @@ pages and establish enclave page permissions.
sgx_ioc_enclave_init
sgx_ioc_enclave_provision
+Enclave runtime management
+--------------------------
+
+Systems supporting SGX2 additionally support changes to initialized
+enclaves: modifying enclave page permissions and type, and dynamically
+adding and removing enclave pages. When an enclave accesses an address
+within its address range that does not have a backing page then a new
+regular page will be dynamically added to the enclave. The enclave is
+still required to run EACCEPT on the new page before it can be used.
+
+.. kernel-doc:: arch/x86/kernel/cpu/sgx/ioctl.c
+ :functions: sgx_ioc_enclave_restrict_permissions
+ sgx_ioc_enclave_modify_types
+ sgx_ioc_enclave_remove_pages
+
Enclave vDSO
------------
@@ -126,13 +141,13 @@ the need to juggle signal handlers.
ksgxd
=====
-SGX support includes a kernel thread called *ksgxwapd*.
+SGX support includes a kernel thread called *ksgxd*.
EPC sanitization
----------------
ksgxd is started when SGX initializes. Enclave memory is typically ready
-For use when the processor powers on or resets. However, if SGX has been in
+for use when the processor powers on or resets. However, if SGX has been in
use since the reset, enclave pages may be in an inconsistent state. This might
occur after a crash and kexec() cycle, for instance. At boot, ksgxd
reinitializes all enclave pages so that they can be allocated and re-used.
@@ -147,7 +162,7 @@ Page reclaimer
Similar to the core kswapd, ksgxd is responsible for managing the
overcommitment of enclave memory. If the system runs out of enclave memory,
-*ksgxwapd* “swaps” enclave memory to normal memory.
+*ksgxd* “swaps” enclave memory to normal memory.
Launch Control
==============
@@ -156,7 +171,7 @@ SGX provides a launch control mechanism. After all enclave pages have been
copied, the kernel executes the EINIT function, which initializes the
enclave. Only after this can the CPU execute inside the enclave.
-ENIT function takes an RSA-3072 signature of the enclave measurement. The function
+EINIT function takes an RSA-3072 signature of the enclave measurement. The function
checks that the measurement is correct and that the signature is signed with the key
hashed to the four **IA32_SGXLEPUBKEYHASH{0, 1, 2, 3}** MSRs representing the
SHA256 of a public key.
@@ -184,7 +199,7 @@ CPUs starting from Icelake use Total Memory Encryption (TME) in the place of
MEE. TME-based SGX implementations do not have an integrity Merkle tree, which
means integrity and replay-attacks are not mitigated. But, it includes
additional changes to prevent cipher text from being returned and SW memory
-aliases from being Created.
+aliases from being created.
DMA to enclave memory is blocked by range registers on both MEE and TME systems
(SDM section 41.10).
@@ -250,3 +265,38 @@ user wants to deploy SGX applications both on the host and in guests
on the same machine, the user should reserve enough EPC (by taking out
total virtual EPC size of all SGX VMs from the physical EPC size) for
host SGX applications so they can run with acceptable performance.
+
+Architectural behavior is to restore all EPC pages to an uninitialized
+state also after a guest reboot. Because this state can be reached only
+through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc``
+provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction
+on all pages in the virtual EPC.
+
+``EREMOVE`` can fail for three reasons. Userspace must pay attention
+to expected failures and handle them as follows:
+
+1. Page removal will always fail when any thread is running in the
+ enclave to which the page belongs. In this case the ioctl will
+ return ``EBUSY`` independent of whether it has successfully removed
+ some pages; userspace can avoid these failures by preventing execution
+ of any vcpu which maps the virtual EPC.
+
+2. Page removal will cause a general protection fault if two calls to
+ ``EREMOVE`` happen concurrently for pages that refer to the same
+ "SECS" metadata pages. This can happen if there are concurrent
+ invocations to ``SGX_IOC_VEPC_REMOVE_ALL``, or if a ``/dev/sgx_vepc``
+ file descriptor in the guest is closed at the same time as
+ ``SGX_IOC_VEPC_REMOVE_ALL``; it will also be reported as ``EBUSY``.
+ This can be avoided in userspace by serializing calls to the ioctl()
+ and to close(), but in general it should not be a problem.
+
+3. Finally, page removal will fail for SECS metadata pages which still
+ have child pages. Child pages can be removed by executing
+ ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors
+ mapped into the guest. This means that the ioctl() must be called
+ twice: an initial set of calls to remove child pages and a subsequent
+ set of calls to remove SECS pages. The second set of calls is only
+ required for those mappings that returned a nonzero value from the
+ first call. It indicates a bug in the kernel or the userspace client
+ if any of the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls has
+ a return code other than 0.
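+
+Below is a simplified, illustrative sketch of the two-round removal
+described above; the helper name, fd bookkeeping and error handling are
+assumptions for the example, and only ``SGX_IOC_VEPC_REMOVE_ALL`` is the
+documented interface::
+
+	#include <errno.h>
+	#include <sys/ioctl.h>
+	#include <asm/sgx.h>	/* SGX_IOC_VEPC_REMOVE_ALL */
+
+	/*
+	 * Reset the virtual EPC of a guest across all of its
+	 * /dev/sgx_vepc file descriptors. Assumes all vCPUs mapping
+	 * the vEPC are already stopped, so EBUSY from running enclave
+	 * threads cannot occur.
+	 */
+	static int vepc_reset(int *fds, int nr_fds)
+	{
+		int i, ret;
+
+		/*
+		 * Round 1: removes regular and child pages; SECS pages
+		 * which still have children are skipped and counted in
+		 * a nonzero return value.
+		 */
+		for (i = 0; i < nr_fds; i++)
+			if (ioctl(fds[i], SGX_IOC_VEPC_REMOVE_ALL) < 0)
+				return -errno;
+
+		/*
+		 * Round 2: all children are gone by now, so the
+		 * remaining SECS pages can be removed. Strictly only
+		 * needed for fds whose first call returned nonzero;
+		 * a failure here indicates a kernel or userspace bug.
+		 */
+		for (i = 0; i < nr_fds; i++) {
+			ret = ioctl(fds[i], SGX_IOC_VEPC_REMOVE_ALL);
+			if (ret)
+				return ret < 0 ? -errno : -EAGAIN;
+		}
+		return 0;
+	}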
diff --git a/Documentation/x86/sva.rst b/Documentation/arch/x86/sva.rst
index 076efd51ef1f..33cb05005982 100644
--- a/Documentation/x86/sva.rst
+++ b/Documentation/arch/x86/sva.rst
@@ -104,18 +104,47 @@ The MSR must be configured on each logical CPU before any application
thread can interact with a device. Threads that belong to the same
process share the same page tables, thus the same MSR value.
-PASID is cleared when a process is created. The PASID allocation and MSR
-programming may occur long after a process and its threads have been created.
-One thread must call iommu_sva_bind_device() to allocate the PASID for the
-process. If a thread uses ENQCMD without the MSR first being populated, a #GP
-will be raised. The kernel will update the PASID MSR with the PASID for all
-threads in the process. A single process PASID can be used simultaneously
-with multiple devices since they all share the same address space.
-
-One thread can call iommu_sva_unbind_device() to free the allocated PASID.
-The kernel will clear the PASID MSR for all threads belonging to the process.
-
-New threads inherit the MSR value from the parent.
+PASID Life Cycle Management
+===========================
+
+PASID is initialized as IOMMU_PASID_INVALID (-1) when a process is created.
+
+Only processes that access SVA-capable devices need to have a PASID
+allocated. This allocation happens when a process opens/binds an SVA-capable
+device but finds no PASID for this process. Subsequent binds of the same, or
+other devices will share the same PASID.
+
+Although the PASID is allocated to the process by opening a device,
+it is not active in any of the threads of that process. It's loaded to the
+IA32_PASID MSR lazily when a thread tries to submit a work descriptor
+to a device using the ENQCMD instruction.
+
+That first access will trigger a #GP fault because the IA32_PASID MSR
+has not been initialized with the PASID value assigned to the process
+when the device was opened. The Linux #GP handler notes that a PASID has
+been allocated for the process, and so initializes the IA32_PASID MSR
+and returns so that the ENQCMD instruction is re-executed.
+
+On fork(2) or exec(2) the PASID is removed from the process as it no
+longer has the same address space that it had when the device was opened.
+
+On clone(2) the new task shares the same address space, so it will be
+able to use the PASID allocated to the process. The IA32_PASID MSR is
+not preemptively initialized as the PASID value might not be allocated
+yet or the kernel does not know whether this thread is going to access
+the device, and the cleared IA32_PASID MSR reduces context switch
+overhead by the xstate init optimization. Since #GP faults have to be
+handled on any threads that were created before the PASID was assigned
+to the mm of the process, newly created threads might as well be treated
+in a consistent way.
+
+Due to the complexity of freeing the PASID and clearing all IA32_PASID
+MSRs in all threads on unbind, the PASID is freed lazily only on mm
+exit.
+
+If a process does a close(2) of the device file descriptor and munmap(2)
+of the device MMIO portal, then the driver will unbind the device. The
+PASID is still marked VALID in the PASID_MSR for any threads in the
+process that accessed the device. But this is harmless as without the
+MMIO portal they cannot submit new work to the device.
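+
+For illustration, here is a hedged userspace sketch of this lazy PASID
+flow, assuming an idxd-style accelerator exposing a shared work queue at
+/dev/dsa/wq0.0 (a hypothetical path) and a compiler providing the
+_enqcmd() intrinsic (GCC/Clang with -menqcmd); the 64-byte descriptor
+layout is left schematic::
+
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <sys/mman.h>
+	#include <unistd.h>
+	#include <immintrin.h>	/* _enqcmd(), needs -menqcmd */
+
+	int main(void)
+	{
+		/* Opening/binding the SVA-capable device allocates a
+		 * PASID for the process if it has none yet. */
+		int fd = open("/dev/dsa/wq0.0", O_RDWR);
+
+		/* Map the device's MMIO submission portal. */
+		void *portal = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED,
+				    fd, 0);
+
+		/* 64-byte, 64-byte-aligned command descriptor; the
+		 * real layout is device specific. */
+		uint8_t desc[64] __attribute__((aligned(64))) = { 0 };
+
+		/* The first ENQCMD in this thread takes a #GP; the
+		 * kernel fills the IA32_PASID MSR and restarts the
+		 * instruction transparently. A nonzero return means
+		 * the device queue did not accept the command; retry. */
+		while (_enqcmd(portal, desc))
+			;
+
+		munmap(portal, 4096);
+		close(fd);
+		return 0;
+	}
+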
Relationships
=============
diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
new file mode 100644
index 000000000000..dc8d9fd2c3f7
--- /dev/null
+++ b/Documentation/arch/x86/tdx.rst
@@ -0,0 +1,261 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Intel Trust Domain Extensions (TDX)
+=====================================
+
+Intel's Trust Domain Extensions (TDX) protect confidential guest VMs from
+the host and physical attacks by isolating the guest register state and by
+encrypting the guest memory. In TDX, a special module running in a special
+mode sits between the host and the guest and manages the guest/host
+separation.
+
+Since the host cannot directly access guest registers or memory, much
+normal functionality of a hypervisor must be moved into the guest. This is
+implemented using a Virtualization Exception (#VE) that is handled by the
+guest kernel. Some #VEs are handled entirely inside the guest kernel,
+but some require the hypervisor to be consulted.
+
+TDX includes new hypercall-like mechanisms for communicating from the
+guest to the hypervisor or the TDX module.
+
+New TDX Exceptions
+==================
+
+TDX guests behave differently from bare-metal and traditional VMX guests.
+In TDX guests, otherwise normal instructions or memory accesses can cause
+#VE or #GP exceptions.
+
+Instructions marked with an '*' conditionally cause exceptions. The
+details for these instructions are discussed below.
+
+Instruction-based #VE
+---------------------
+
+- Port I/O (INS, OUTS, IN, OUT)
+- HLT
+- MONITOR, MWAIT
+- WBINVD, INVD
+- VMCALL
+- RDMSR*,WRMSR*
+- CPUID*
+
+Instruction-based #GP
+---------------------
+
+- All VMX instructions: INVEPT, INVVPID, VMCLEAR, VMFUNC, VMLAUNCH,
+ VMPTRLD, VMPTRST, VMREAD, VMRESUME, VMWRITE, VMXOFF, VMXON
+- ENCLS, ENCLU
+- GETSEC
+- RSM
+- ENQCMD
+- RDMSR*,WRMSR*
+
+RDMSR/WRMSR Behavior
+--------------------
+
+MSR access behavior falls into three categories:
+
+- #GP generated
+- #VE generated
+- "Just works"
+
+In general, the #GP MSRs should not be used in guests. Their use likely
+indicates a bug in the guest. The guest may try to handle the #GP with a
+hypercall but it is unlikely to succeed.
+
+The #VE MSRs can typically be handled by the hypervisor. Guests
+can make a hypercall to the hypervisor to handle the #VE.
+
+The "just works" MSRs do not need any special guest handling. They might
+be implemented by directly passing through the MSR to the hardware or by
+trapping and handling in the TDX module. Other than possibly being slow,
+these MSRs appear to function just as they would on bare metal.
+
+CPUID Behavior
+--------------
+
+For some CPUID leaves and sub-leaves, the virtualized bit fields of CPUID
+return values (in guest EAX/EBX/ECX/EDX) are configurable by the
+hypervisor. For such cases, the Intel TDX module architecture defines two
+virtualization types:
+
+- Bit fields for which the hypervisor controls the value seen by the guest
+ TD.
+
+- Bit fields for which the hypervisor configures the value such that the
+ guest TD either sees their native value or a value of 0. For these bit
+  fields, the hypervisor can mask off the native values, but it cannot
+  turn *on* values.
+
+A #VE is generated for CPUID leaves and sub-leaves that the TDX module does
+not know how to handle. The guest kernel may ask the hypervisor for the
+value with a hypercall.
+
+#VE on Memory Accesses
+======================
+
+There are essentially two classes of TDX memory: private and shared.
+Private memory receives full TDX protections. Its content is protected
+against access from the hypervisor. Shared memory is expected to be
+shared between guest and hypervisor and does not receive full TDX
+protections.
+
+A TD guest is in control of whether its memory accesses are treated as
+private or shared. It selects the behavior with a bit in its page table
+entries. This helps ensure that a guest does not place sensitive
+information in shared memory, exposing it to the untrusted hypervisor.
+
+#VE on Shared Memory
+--------------------
+
+Access to shared mappings can cause a #VE. The hypervisor ultimately
+controls whether a shared memory access causes a #VE, so the guest must
+be careful to only reference shared pages for which it can safely handle
+a #VE. For instance, the guest should be careful not to access shared
+memory in the #VE handler before it reads the #VE info structure
+(TDG.VP.VEINFO.GET).
+
+Shared mapping content is entirely controlled by the hypervisor. The guest
+should only use shared mappings for communicating with the hypervisor.
+Shared mappings must never be used for sensitive memory content like kernel
+stacks. A good rule of thumb is that hypervisor-shared memory should be
+treated the same as memory mapped to userspace. Both the hypervisor and
+userspace are completely untrusted.
+
+MMIO for virtual devices is implemented as shared memory. The guest must
+be careful not to access device MMIO regions unless it is also prepared to
+handle a #VE.
+
+#VE on Private Pages
+--------------------
+
+An access to private mappings can also cause a #VE. Since all kernel
+memory is also private memory, the kernel might theoretically need to
+handle a #VE on arbitrary kernel memory accesses. This is not feasible, so
+TDX guests ensure that all guest memory has been "accepted" before memory
+is used by the kernel.
+
+A modest amount of memory (typically 512M) is pre-accepted by the firmware
+before the kernel runs to ensure that the kernel can start up without
+being subjected to a #VE.
+
+The hypervisor is permitted to unilaterally move accepted pages to a
+"blocked" state. However, if it does this, page access will not generate a
+#VE. It will, instead, cause a "TD Exit" where the hypervisor is required
+to handle the exception.
+
+Linux #VE handler
+=================
+
+Just like page faults or #GP's, #VE exceptions can either be handled or
+be fatal. Typically, an unhandled userspace #VE results in a SIGSEGV.
+An unhandled kernel #VE results in an oops.
+
+Handling nested exceptions on x86 is typically nasty business. A #VE
+could be interrupted by an NMI which triggers another #VE and hilarity
+ensues. The TDX #VE architecture anticipated this scenario and includes a
+feature to make it slightly less nasty.
+
+During #VE handling, the TDX module ensures that all interrupts (including
+NMIs) are blocked. The block remains in place until the guest makes a
+TDG.VP.VEINFO.GET TDCALL. This allows the guest to control when interrupts
+or a new #VE can be delivered.
+
+However, the guest kernel must still be careful to avoid potential
+#VE-triggering actions (discussed above) while this block is in place.
+While the block is in place, any #VE is elevated to a double fault (#DF)
+which is not recoverable.
+
+MMIO handling
+=============
+
+In non-TDX VMs, MMIO is usually implemented by giving a guest access to a
+mapping which will cause a VMEXIT on access, and then the hypervisor
+emulates the access. That is not possible in TDX guests because VMEXIT
+will expose the register state to the host. TDX guests don't trust the host
+and can't have their state exposed to the host.
+
+In TDX, MMIO regions typically trigger a #VE exception in the guest. The
+guest #VE handler then emulates the MMIO instruction inside the guest and
+converts it into a controlled TDCALL to the host, rather than exposing
+guest state to the host.
+
+MMIO addresses on x86 are just special physical addresses. They can
+theoretically be accessed with any instruction that accesses memory.
+However, the kernel instruction decoding method is limited. It is only
+designed to decode instructions like those generated by io.h macros.
+
+MMIO access via other means (like structure overlays) may result in an
+oops.
+
+Shared Memory Conversions
+=========================
+
+All TDX guest memory starts out as private at boot. This memory cannot
+be accessed by the hypervisor. However, some kernel users like device
+drivers might have a need to share data with the hypervisor. To do this,
+memory must be converted between shared and private. This can be
+accomplished using some existing memory encryption helpers:
+
+ * set_memory_decrypted() converts a range of pages to shared.
+ * set_memory_encrypted() converts memory back to private.
+
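+As a hedged illustration of these helpers (the surrounding functions are
+assumptions for the example; set_memory_decrypted() and
+set_memory_encrypted() are the real calls), a driver could share a
+single page like this::
+
+	#include <linux/gfp.h>
+	#include <linux/mm.h>
+	#include <linux/set_memory.h>
+
+	/* Share one zeroed page with the hypervisor. */
+	static struct page *share_page_with_host(void)
+	{
+		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+		if (!page)
+			return NULL;
+
+		/* Flip the share bit in the guest page tables and
+		 * notify the hypervisor/TDX module. */
+		if (set_memory_decrypted((unsigned long)page_address(page), 1))
+			return NULL;	/* page intentionally leaked */
+
+		return page;
+	}
+
+	/* Convert back to private before freeing, so the allocator
+	 * never hands out a still-shared page. */
+	static void unshare_page(struct page *page)
+	{
+		if (!set_memory_encrypted((unsigned long)page_address(page), 1))
+			__free_page(page);
+	}
+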
+Device drivers are the primary user of shared memory, but there's no need
+to touch every driver. DMA buffers and ioremap() do the conversions
+automatically.
+
+TDX uses SWIOTLB for most DMA allocations. The SWIOTLB buffer is
+converted to shared on boot.
+
+For coherent DMA allocation, the DMA buffer gets converted on the
+allocation. Check force_dma_unencrypted() for details.
+
+Attestation
+===========
+
+Attestation is used to verify the TDX guest's trustworthiness to other
+entities before provisioning secrets to the guest. For example, a key
+server may want to use attestation to verify that the guest is the
+desired one before releasing the encryption keys to mount the encrypted
+rootfs or a secondary drive.
+
+The TDX module records the state of the TDX guest in various stages of
+the guest boot process using the build time measurement register (MRTD)
+and runtime measurement registers (RTMR). Measurements related to the
+guest initial configuration and firmware image are recorded in the MRTD
+register. Measurements related to initial state, kernel image, firmware
+image, command line options, initrd, ACPI tables, etc. are recorded in
+RTMR registers. For more details, as an example, please refer to TDX
+Virtual Firmware design specification, section titled "TD Measurement".
+At TDX guest runtime, the attestation process is used to attest to these
+measurements.
+
+The attestation process consists of two steps: TDREPORT generation and
+Quote generation.
+
+TDX guest uses TDCALL[TDG.MR.REPORT] to get the TDREPORT (TDREPORT_STRUCT)
+from the TDX module. TDREPORT is a fixed-size data structure generated by
+the TDX module which contains guest-specific information (such as build
+and boot measurements), platform security version, and the MAC to protect
+the integrity of the TDREPORT. A user-provided 64-byte REPORTDATA is
+used as input and included in the TDREPORT. Typically it is a nonce
+provided by the attestation service so the TDREPORT can be verified
+uniquely.
+More details about the TDREPORT can be found in Intel TDX Module
+specification, section titled "TDG.MR.REPORT Leaf".
+
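+For illustration, here is a guest-side sketch of fetching a TDREPORT,
+assuming the ``tdx-guest`` driver's /dev/tdx_guest device and its
+``TDX_CMD_GET_REPORT0`` ioctl (names per <uapi/linux/tdx-guest.h>; the
+helper itself is an assumption)::
+
+	#include <fcntl.h>
+	#include <stdint.h>
+	#include <string.h>
+	#include <sys/ioctl.h>
+	#include <unistd.h>
+	#include <linux/tdx-guest.h>
+
+	/* Fetch a TDREPORT for a caller-chosen 64-byte REPORTDATA,
+	 * e.g. a nonce from the attestation service. */
+	static int get_tdreport(const uint8_t *nonce, uint8_t *report)
+	{
+		struct tdx_report_req req = { 0 };
+		int fd, ret;
+
+		memcpy(req.reportdata, nonce, sizeof(req.reportdata));
+
+		fd = open("/dev/tdx_guest", O_RDWR);
+		if (fd < 0)
+			return -1;
+
+		ret = ioctl(fd, TDX_CMD_GET_REPORT0, &req);
+		if (!ret)
+			memcpy(report, req.tdreport, sizeof(req.tdreport));
+
+		close(fd);
+		return ret;
+	}
+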
+After getting the TDREPORT, the second step of the attestation process
+is to send it to the Quoting Enclave (QE) to generate the Quote. TDREPORT
+by design can only be verified on the local platform as the MAC key is
+bound to the platform. To support remote verification of the TDREPORT,
+TDX leverages Intel SGX Quoting Enclave to verify the TDREPORT locally
+and convert it to a remotely verifiable Quote. Method of sending TDREPORT
+to QE is implementation specific. Attestation software can choose
+whatever communication channel available (i.e. vsock or TCP/IP) to
+send the TDREPORT to QE and receive the Quote.
+
+References
+==========
+
+TDX reference material is collected here:
+
+https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html
diff --git a/Documentation/x86/tlb.rst b/Documentation/arch/x86/tlb.rst
index 82ec58ae63a8..82ec58ae63a8 100644
--- a/Documentation/x86/tlb.rst
+++ b/Documentation/arch/x86/tlb.rst
diff --git a/Documentation/x86/topology.rst b/Documentation/arch/x86/topology.rst
index 7f58010ea86a..7f58010ea86a 100644
--- a/Documentation/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/arch/x86/tsx_async_abort.rst
index 583ddc185ba2..583ddc185ba2 100644
--- a/Documentation/x86/tsx_async_abort.rst
+++ b/Documentation/arch/x86/tsx_async_abort.rst
diff --git a/Documentation/x86/usb-legacy-support.rst b/Documentation/arch/x86/usb-legacy-support.rst
index e01c08b7c981..e01c08b7c981 100644
--- a/Documentation/x86/usb-legacy-support.rst
+++ b/Documentation/arch/x86/usb-legacy-support.rst
diff --git a/Documentation/x86/x86_64/5level-paging.rst b/Documentation/arch/x86/x86_64/5level-paging.rst
index b792bbdc0b01..71f882f4a173 100644
--- a/Documentation/x86/x86_64/5level-paging.rst
+++ b/Documentation/arch/x86/x86_64/5level-paging.rst
@@ -20,7 +20,7 @@ physical address space. This "ought to be enough for anybody" ©.
QEMU 2.9 and later support 5-level paging.
Virtual memory layout for 5-level paging is described in
-Documentation/x86/x86_64/mm.rst
+Documentation/arch/x86/x86_64/mm.rst
Enabling 5-level paging
diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/arch/x86/x86_64/boot-options.rst
index ccb7e86bf8d9..137432d34109 100644
--- a/Documentation/x86/x86_64/boot-options.rst
+++ b/Documentation/arch/x86/x86_64/boot-options.rst
@@ -9,7 +9,7 @@ only the AMD64 specific ones are listed here.
Machine check
=============
-Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
+Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
mce=off
Disable machine check
@@ -47,14 +47,7 @@ Please see Documentation/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
in a reboot. On Intel systems it is enabled by default.
mce=nobootlog
Disable boot machine check logging.
- mce=tolerancelevel[,monarchtimeout] (number,number)
- tolerance levels:
- 0: always panic on uncorrected errors, log corrected errors
- 1: panic or SIGBUS on uncorrected errors, log corrected errors
- 2: SIGBUS or log uncorrected errors, log corrected errors
- 3: never panic or SIGBUS, log all errors (for testing only)
- Default is 1
- Can be also set using sysfs which is preferable.
+ mce=monarchtimeout (number)
monarchtimeout:
Sets the time in us to wait for other CPUs on machine checks. 0
to disable.
@@ -89,7 +82,7 @@ APICs
Don't use the local APIC (alias for i386 compatibility)
pirq=...
- See Documentation/x86/i386/IO-APIC.rst
+ See Documentation/arch/x86/i386/IO-APIC.rst
noapictimer
Don't set up the APIC timer
@@ -164,15 +157,6 @@ Rebooting
newer BIOS, or newer board) using this option will ignore the built-in
quirk table, and use the generic default reboot actions.
-Non Executable Mappings
-=======================
-
- noexec=on|off
- on
- Enable(default)
- off
- Disable
-
NUMA
====
@@ -303,11 +287,13 @@ iommu options only relevant to the AMD GART hardware IOMMU:
iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
implementation:
- swiotlb=<pages>[,force]
- <pages>
- Prereserve that many 128K pages for the software IO bounce buffering.
+ swiotlb=<slots>[,force,noforce]
+ <slots>
+ Prereserve that many 2K slots for the software IO bounce buffering.
force
Force all IO through the software TLB.
+ noforce
+ Do not initialize the software TLB.
Miscellaneous
@@ -317,3 +303,17 @@ Miscellaneous
Do not use GB pages for kernel direct mappings.
gbpages
Use GB pages for kernel direct mappings.
+
+
+AMD SEV (Secure Encrypted Virtualization)
+=========================================
+Options relating to AMD SEV, specified via the following format:
+
+::
+
+ sev=option1[,option2]
+
+The available options are:
+
+ debug
+ Enable debug messages.
diff --git a/Documentation/x86/x86_64/cpu-hotplug-spec.rst b/Documentation/arch/x86/x86_64/cpu-hotplug-spec.rst
index 8d1c91f0c880..8d1c91f0c880 100644
--- a/Documentation/x86/x86_64/cpu-hotplug-spec.rst
+++ b/Documentation/arch/x86/x86_64/cpu-hotplug-spec.rst
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst
index ff9bcfd2cc14..ba74617d4999 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/arch/x86/x86_64/fake-numa-for-cpusets.rst
@@ -18,7 +18,7 @@ For more information on the features of cpusets, see
Documentation/admin-guide/cgroup-v1/cpusets.rst.
There are a number of different configurations you can use for your needs. For
more information on the numa=fake command line option and its various ways of
-configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst.
+configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst.
For the purposes of this introduction, we'll assume a very primitive NUMA
emulation setup of "numa=fake=4*512,". This will split our system memory into
diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/arch/x86/x86_64/fsgs.rst
index 50960e09e1f6..50960e09e1f6 100644
--- a/Documentation/x86/x86_64/fsgs.rst
+++ b/Documentation/arch/x86/x86_64/fsgs.rst
diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/arch/x86/x86_64/index.rst
index a56070fc8e77..a56070fc8e77 100644
--- a/Documentation/x86/x86_64/index.rst
+++ b/Documentation/arch/x86/x86_64/index.rst
diff --git a/Documentation/arch/x86/x86_64/machinecheck.rst b/Documentation/arch/x86/x86_64/machinecheck.rst
new file mode 100644
index 000000000000..cea12ee97200
--- /dev/null
+++ b/Documentation/arch/x86/x86_64/machinecheck.rst
@@ -0,0 +1,33 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============================================================
+Configurable sysfs parameters for the x86-64 machine check code
+===============================================================
+
+Machine checks report internal hardware error conditions detected
+by the CPU. Uncorrected errors typically cause a machine check
+(often with panic), corrected ones cause a machine check log entry.
+
+Machine checks are organized in banks (normally associated with
+a hardware subsystem) and subevents in a bank. The exact meaning
+of the banks and subevents is CPU specific.
+
+mcelog knows how to decode them.
+
+When you see the "Machine check errors logged" message in the system
+log then mcelog should run to collect and decode machine check entries
+from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
+
+Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
+(N = CPU number).
+
+The directory contains some configurable entries. See
+Documentation/ABI/testing/sysfs-mce for more details.
+
+TBD document entries for AMD threshold interrupt configuration
+
+For more details about the x86 machine check architecture
+see the Intel and AMD architecture manuals from their developer websites.
+
+For more details about the architecture
+see http://one.firstfloor.org/~andi/mce.pdf
diff --git a/Documentation/x86/x86_64/mm.rst b/Documentation/arch/x86/x86_64/mm.rst
index 9798676bb0bf..35e5e18c83d0 100644
--- a/Documentation/x86/x86_64/mm.rst
+++ b/Documentation/arch/x86/x86_64/mm.rst
@@ -140,7 +140,7 @@ The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
holes).
-We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
+We map EFI runtime services in the 'efi_pgd' PGD in a 64GB large virtual
memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
during EFI runtime calls.
diff --git a/Documentation/x86/x86_64/uefi.rst b/Documentation/arch/x86/x86_64/uefi.rst
index 3b894103a734..fbc30c9a071d 100644
--- a/Documentation/x86/x86_64/uefi.rst
+++ b/Documentation/arch/x86/x86_64/uefi.rst
@@ -29,7 +29,7 @@ Mechanics
be selected::
CONFIG_EFI=y
- CONFIG_EFI_VARS=y or m # optional
+ CONFIG_EFIVAR_FS=y or m # optional
- Create a VFAT partition on the disk
- Copy the following to the VFAT partition:
diff --git a/Documentation/arch/x86/xstate.rst b/Documentation/arch/x86/xstate.rst
new file mode 100644
index 000000000000..ae5c69e48b11
--- /dev/null
+++ b/Documentation/arch/x86/xstate.rst
@@ -0,0 +1,174 @@
+Using XSTATE features in user space applications
+================================================
+
+The x86 architecture supports floating-point extensions which are
+enumerated via CPUID. Applications consult CPUID and use XGETBV to
+evaluate which features have been enabled by the kernel XCR0.
+
+Up to AVX-512 and PKRU states, these features are automatically enabled by
+the kernel if available. Features like AMX TILE_DATA (XSTATE component 18)
+are enabled by XCR0 as well, but the first use of a related instruction is
+trapped by the kernel because by default the required large XSTATE buffers
+are not allocated automatically.
+
+The purpose for dynamic features
+--------------------------------
+
+Legacy userspace libraries often have hard-coded, static sizes for
+alternate signal stacks, commonly using MINSIGSTKSZ, which is typically
+2KB. That stack must be able to store at *least* the signal frame that
+the kernel sets up before jumping into the signal handler. That signal
+frame must include an XSAVE buffer defined by the CPU.
+
+However, that means that the size of signal stacks is dynamic, not
+static, because different CPUs have differently-sized XSAVE buffers. A
+compiled-in size of 2KB in existing applications is too small for new
+CPU features like AMX. Instead of universally requiring larger stacks,
+with dynamic enabling the kernel can require userspace applications to
+have properly-sized altstacks.
+
+Using dynamically enabled XSTATE features in user space applications
+--------------------------------------------------------------------
+
+The kernel provides an arch_prctl(2) based mechanism for applications to
+request the usage of such features. The arch_prctl(2) options related to
+this are:
+
+-ARCH_GET_XCOMP_SUPP
+
+ arch_prctl(ARCH_GET_XCOMP_SUPP, &features);
+
+ ARCH_GET_XCOMP_SUPP stores the supported features in userspace storage of
+ type uint64_t. The second argument is a pointer to that storage.
+
+-ARCH_GET_XCOMP_PERM
+
+ arch_prctl(ARCH_GET_XCOMP_PERM, &features);
+
+ ARCH_GET_XCOMP_PERM stores the features for which the userspace process
+ has permission in userspace storage of type uint64_t. The second argument
+ is a pointer to that storage.
+
+-ARCH_REQ_XCOMP_PERM
+
+ arch_prctl(ARCH_REQ_XCOMP_PERM, feature_nr);
+
+  ARCH_REQ_XCOMP_PERM allows an application to request permission for a
+  dynamically enabled feature or a feature set. A feature set can be
+  mapped to a facility, e.g.
+ AMX, and can require one or more XSTATE components to be enabled.
+
+ The feature argument is the number of the highest XSTATE component which
+ is required for a facility to work.
+
+When requesting permission for a feature, the kernel checks its
+availability. The kernel ensures that sigaltstacks in the process's
+tasks are large enough to accommodate the resulting large signal frame.
+It enforces this both during ARCH_REQ_XCOMP_PERM and during any
+subsequent sigaltstack(2) calls. If an installed sigaltstack is smaller
+than the resulting sigframe size, ARCH_REQ_XCOMP_PERM results in
+-ENOSUPP. Also, sigaltstack(2) results in -ENOMEM if the requested
+altstack is too small for the permitted features.
+
+Permission, when granted, is valid per process. Permissions are inherited
+on fork(2) and cleared on exec(3).
+
+The first use of an instruction related to a dynamically enabled feature is
+trapped by the kernel. The trap handler checks whether the process has
+permission to use the feature. If the process has no permission then the
+kernel sends SIGILL to the application. If the process has permission then
+the handler allocates a larger xstate buffer for the task so the large
+state can be context switched. In the unlikely case that the allocation
+fails, the kernel sends SIGSEGV.
+
+AMX TILE_DATA enabling example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Below is an example of how userspace applications enable
+TILE_DATA dynamically:
+
+ 1. The application first needs to query the kernel for AMX
+ support::
+
+ #include <asm/prctl.h>
+ #include <sys/syscall.h>
+ #include <stdio.h>
+ #include <unistd.h>
+
+ #ifndef ARCH_GET_XCOMP_SUPP
+ #define ARCH_GET_XCOMP_SUPP 0x1021
+ #endif
+
+ #ifndef ARCH_XCOMP_TILECFG
+ #define ARCH_XCOMP_TILECFG 17
+ #endif
+
+ #ifndef ARCH_XCOMP_TILEDATA
+ #define ARCH_XCOMP_TILEDATA 18
+ #endif
+
+ #define MASK_XCOMP_TILE ((1 << ARCH_XCOMP_TILECFG) | \
+ (1 << ARCH_XCOMP_TILEDATA))
+
+ unsigned long features;
+ long rc;
+
+ ...
+
+ rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features);
+
+ if (!rc && (features & MASK_XCOMP_TILE) == MASK_XCOMP_TILE)
+ printf("AMX is available.\n");
+
+  2. After determining support for AMX, an application must
+     explicitly ask permission to use it::
+
+ #ifndef ARCH_REQ_XCOMP_PERM
+ #define ARCH_REQ_XCOMP_PERM 0x1023
+ #endif
+
+ ...
+
+ rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, ARCH_XCOMP_TILEDATA);
+
+ if (!rc)
+ printf("AMX is ready for use.\n");
+
+Note this example does not include the sigaltstack preparation.
+
+Dynamic features in signal frames
+---------------------------------
+
+Dynamically enabled features are not written to the signal frame upon
+signal entry if the feature is in its initial configuration. This
+differs from non-dynamic features, which are always written regardless
+of their configuration. Signal handlers can examine the XSAVE buffer's
+XSTATE_BV field to determine if a feature was written.
+
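+A hedged sketch of such a check, assuming the standard x86-64 layout in
+which the 64-byte XSAVE header (whose first 8 bytes are XSTATE_BV)
+starts at offset 512 of the XSAVE area reachable via
+uc_mcontext.fpregs::
+
+	#include <signal.h>
+	#include <stdint.h>
+	#include <ucontext.h>
+
+	#define XFEATURE_MASK_XTILEDATA	(1ULL << 18)	/* AMX TILE_DATA */
+
+	static void handler(int sig, siginfo_t *si, void *uc_void)
+	{
+		ucontext_t *uc = uc_void;
+		/* fpregs points at the FXSAVE/XSAVE area; XSTATE_BV is
+		 * the first 8 bytes of the XSAVE header at offset 512. */
+		uint64_t xstate_bv =
+			*(uint64_t *)((char *)uc->uc_mcontext.fpregs + 512);
+
+		if (xstate_bv & XFEATURE_MASK_XTILEDATA) {
+			/* TILE_DATA was in use and is part of this frame. */
+		}
+	}
+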
+Dynamic features for virtual machines
+-------------------------------------
+
+The permission for the guest state component needs to be managed
+separately from the host, as they are exclusive to each other. A couple
+of options are extended to control the guest permission:
+
+-ARCH_GET_XCOMP_GUEST_PERM
+
+ arch_prctl(ARCH_GET_XCOMP_GUEST_PERM, &features);
+
+ ARCH_GET_XCOMP_GUEST_PERM is a variant of ARCH_GET_XCOMP_PERM. So it
+ provides the same semantics and functionality but for the guest
+ components.
+
+-ARCH_REQ_XCOMP_GUEST_PERM
+
+ arch_prctl(ARCH_REQ_XCOMP_GUEST_PERM, feature_nr);
+
+ ARCH_REQ_XCOMP_GUEST_PERM is a variant of ARCH_REQ_XCOMP_PERM. It has the
+ same semantics for the guest permission. While providing a similar
+ functionality, this comes with a constraint. Permission is frozen when the
+ first VCPU is created. Any attempt to change permission after that point
+ is going to be rejected. So, the permission has to be requested before the
+ first VCPU creation.
+
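+For illustration, a minimal sketch of a VMM requesting guest AMX
+permission before creating its first VCPU (the prctl numbers are guarded
+with #ifndef as in the example above)::
+
+	#include <sys/syscall.h>
+	#include <unistd.h>
+
+	#ifndef ARCH_REQ_XCOMP_GUEST_PERM
+	#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+	#endif
+
+	#ifndef ARCH_XCOMP_TILEDATA
+	#define ARCH_XCOMP_TILEDATA 18
+	#endif
+
+	...
+
+	/* Must happen before KVM_CREATE_VCPU; afterwards the guest
+	 * permission mask is frozen and this call will be rejected. */
+	rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
+		     ARCH_XCOMP_TILEDATA);
+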
+Note that some VMMs may have already established a set of supported state
+components. These options are not presumed to support any particular VMM.
diff --git a/Documentation/x86/zero-page.rst b/Documentation/arch/x86/zero-page.rst
index f088f5881666..45aa9cceb4f1 100644
--- a/Documentation/x86/zero-page.rst
+++ b/Documentation/arch/x86/zero-page.rst
@@ -19,6 +19,7 @@ Offset/Size Proto Name Meaning
058/008 ALL tboot_addr Physical address of tboot shared page
060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information
(struct ist_info)
+070/008 ALL acpi_rsdp_addr Physical address of ACPI RSDP table
080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!!
090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!!
0A0/010 ALL sys_desc_table System description table (struct sys_desc_table),
@@ -27,6 +28,7 @@ Offset/Size Proto Name Meaning
0C0/004 ALL ext_ramdisk_image ramdisk_image high 32bits
0C4/004 ALL ext_ramdisk_size ramdisk_size high 32bits
0C8/004 ALL ext_cmd_line_ptr cmd_line_ptr high 32bits
+13C/004 ALL cc_blob_address Physical address of Confidential Computing blob
140/080 ALL edid_info Video mode setup (struct edid_info)
1C0/020 ALL efi_info EFI 32 information (struct efi_info)
1E0/004 ALL alt_mem_k Alternative mem check, in KB
diff --git a/Documentation/xtensa/atomctl.rst b/Documentation/arch/xtensa/atomctl.rst
index 1ecbd0ba9a2e..1ecbd0ba9a2e 100644
--- a/Documentation/xtensa/atomctl.rst
+++ b/Documentation/arch/xtensa/atomctl.rst
diff --git a/Documentation/xtensa/booting.rst b/Documentation/arch/xtensa/booting.rst
index e1b83707e5b6..e1b83707e5b6 100644
--- a/Documentation/xtensa/booting.rst
+++ b/Documentation/arch/xtensa/booting.rst
diff --git a/Documentation/xtensa/features.rst b/Documentation/arch/xtensa/features.rst
index 6b92c7bfa19d..6b92c7bfa19d 100644
--- a/Documentation/xtensa/features.rst
+++ b/Documentation/arch/xtensa/features.rst
diff --git a/Documentation/xtensa/index.rst b/Documentation/arch/xtensa/index.rst
index 69952446a9be..69952446a9be 100644
--- a/Documentation/xtensa/index.rst
+++ b/Documentation/arch/xtensa/index.rst
diff --git a/Documentation/xtensa/mmu.rst b/Documentation/arch/xtensa/mmu.rst
index 450573afa31a..450573afa31a 100644
--- a/Documentation/xtensa/mmu.rst
+++ b/Documentation/arch/xtensa/mmu.rst
diff --git a/Documentation/arm/google/chromebook-boot-flow.rst b/Documentation/arm/google/chromebook-boot-flow.rst
new file mode 100644
index 000000000000..36da77684bba
--- /dev/null
+++ b/Documentation/arm/google/chromebook-boot-flow.rst
@@ -0,0 +1,69 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+Chromebook Boot Flow
+======================================
+
+Most recent Chromebooks that use device tree are using the open source
+depthcharge_ bootloader. Depthcharge_ expects the OS to be packaged as a `FIT
+Image`_ which contains an OS image as well as a collection of device trees. It
+is up to depthcharge_ to pick the right device tree from the `FIT Image`_ and
+provide it to the OS.
+
+The scheme that depthcharge_ uses to pick the device tree takes into account
+three variables:
+
+- Board name, specified at depthcharge_ compile time. This is $(BOARD) below.
+- Board revision number, determined at runtime (perhaps by reading GPIO
+ strappings, perhaps via some other method). This is $(REV) below.
+- SKU number, read from GPIO strappings at boot time. This is $(SKU) below.
+
+For recent Chromebooks, depthcharge_ creates a match list that looks like this:
+
+- google,$(BOARD)-rev$(REV)-sku$(SKU)
+- google,$(BOARD)-rev$(REV)
+- google,$(BOARD)-sku$(SKU)
+- google,$(BOARD)
+
+Note that some older Chromebooks use a slightly different list that may
+not include SKU matching or may prioritize SKU/rev differently.
+
+Note that for some boards there may be extra board-specific logic to inject
+extra compatibles into the list, but this is uncommon.
+
+Depthcharge_ will look through all device trees in the `FIT Image`_ trying to
+find one that matches the most specific compatible. It will then look
+through all device trees in the `FIT Image`_ trying to find the one that
+matches the *second most* specific compatible, etc.
+
+When searching for a device tree, depthcharge_ doesn't care where the
+compatible string falls within a device tree's root compatible string array.
+As an example, if we're on board "lazor", rev 4, SKU 0 and we have two device
+trees:
+
+- "google,lazor-rev5-sku0", "google,lazor-rev4-sku0", "qcom,sc7180"
+- "google,lazor", "qcom,sc7180"
+
+Then depthcharge_ will pick the first device tree even though
+"google,lazor-rev4-sku0" was the second compatible listed in that device tree.
+This is because it is a more specific compatible than "google,lazor".
+
+It should be noted that depthcharge_ does not have any smarts to try to
+match board or SKU revisions that are "close by". That is to say that
+if depthcharge_ knows it's on "rev4" of a board but there is no "rev4"
+device tree then depthcharge_ *won't* look for a "rev3" device tree.
+
+In general when any significant changes are made to a board the board
+revision number is increased even if none of those changes need to
+be reflected in the device tree. Thus it's fairly common to see a single
+device tree listing multiple revisions in its compatible array.
+
+Given the above scheme, the most flexibility is achieved if the device
+tree supporting the newest revision(s) of a board omits the "-rev{REV}"
+compatible strings. That way, if a new board revision shows up and old
+software is run on it, depthcharge_ will at least pick the newest device
+tree it knows about.
+
+.. _depthcharge: https://source.chromium.org/chromiumos/chromiumos/codesearch/+/main:src/platform/depthcharge/
+.. _`FIT Image`: https://doc.coreboot.org/lib/payloads/fit.html
diff --git a/Documentation/arm/index.rst b/Documentation/arm/index.rst
index d4f34ae9e6f4..fd43502ae924 100644
--- a/Documentation/arm/index.rst
+++ b/Documentation/arm/index.rst
@@ -31,6 +31,8 @@ SoC-specific documents
.. toctree::
:maxdepth: 1
+ google/chromebook-boot-flow
+
ixp4xx
marvell
@@ -55,22 +57,22 @@ SoC-specific documents
stm32/stm32h750-overview
stm32/stm32f769-overview
stm32/stm32f429-overview
+ stm32/stm32mp13-overview
+ stm32/stm32mp151-overview
stm32/stm32mp157-overview
+ stm32/stm32-dma-mdma-chaining
sunxi
samsung/index
- samsung-s3c24xx/index
sunxi/clocks
spear/overview
- sti/stih416-overview
sti/stih407-overview
sti/stih418-overview
sti/overview
- sti/stih415-overview
vfp/release-notes
diff --git a/Documentation/arm/marvell.rst b/Documentation/arm/marvell.rst
index 56bb592dbd0c..3d369a566038 100644
--- a/Documentation/arm/marvell.rst
+++ b/Documentation/arm/marvell.rst
@@ -14,17 +14,20 @@ Orion family
Flavors:
- 88F5082
- - 88F5181
- - 88F5181L
- - 88F5182
+ - 88F5181 a.k.a. Orion-1
+ - 88F5181L a.k.a. Orion-VoIP
+ - 88F5182 a.k.a. Orion-NAS
- Datasheet: https://web.archive.org/web/20210124231420/http://csclub.uwaterloo.ca/~board/ts7800/MV88F5182-datasheet.pdf
- Programmer's User Guide: https://web.archive.org/web/20210124231536/http://csclub.uwaterloo.ca/~board/ts7800/MV88F5182-opensource-manual.pdf
- User Manual: https://web.archive.org/web/20210124231631/http://csclub.uwaterloo.ca/~board/ts7800/MV88F5182-usermanual.pdf
- - 88F5281
+ - Functional Errata: https://web.archive.org/web/20210704165540/https://www.digriz.org.uk/ts78xx/88F5182_Functional_Errata.pdf
+ - 88F5281 a.k.a. Orion-2
- Datasheet: https://web.archive.org/web/20131028144728/http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
- - 88F6183
+ - 88F6183 a.k.a. Orion-1-90
+ Homepage:
+ https://web.archive.org/web/20080607215437/http://www.marvell.com/products/media/index.jsp
Core:
Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
Linux kernel mach directory:
@@ -103,6 +106,8 @@ Discovery family
Not supported by the Linux kernel.
+ Homepage:
+ https://web.archive.org/web/20110924171043/http://www.marvell.com/embedded-processors/discovery-innovation/
Core:
Feroceon 88fr571-vd ARMv5 compatible
@@ -119,6 +124,7 @@ EBU Armada family
- 88F6707
- 88F6W11
+ - Product info: https://web.archive.org/web/20141002083258/http://www.marvell.com/embedded-processors/armada-370/
- Product Brief: https://web.archive.org/web/20121115063038/http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf
- Hardware Spec: https://web.archive.org/web/20140617183747/http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf
- Functional Spec: https://web.archive.org/web/20140617183701/http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf
@@ -126,9 +132,29 @@ EBU Armada family
Core:
Sheeva ARMv7 compatible PJ4B
+ Armada XP Flavors:
+ - MV78230
+ - MV78260
+ - MV78460
+
+ NOTE:
+ not to be confused with the non-SMP 78xx0 SoCs
+
+ - Product info: https://web.archive.org/web/20150101215721/http://www.marvell.com/embedded-processors/armada-xp/
+ - Product Brief: https://web.archive.org/web/20121021173528/http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf
+ - Functional Spec: https://web.archive.org/web/20180829171131/http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf
+ - Hardware Specs:
+ - https://web.archive.org/web/20141127013651/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF
+ - https://web.archive.org/web/20141222000224/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
+ - https://web.archive.org/web/20141222000230/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
+
+ Core:
+ Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
+
Armada 375 Flavors:
- 88F6720
+ - Product info: https://web.archive.org/web/20140108032402/http://www.marvell.com/embedded-processors/armada-375/
- Product Brief: https://web.archive.org/web/20131216023516/http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf
Core:
@@ -161,29 +187,6 @@ EBU Armada family
Core:
ARM Cortex-A9
- Armada XP Flavors:
- - MV78230
- - MV78260
- - MV78460
-
- NOTE:
- not to be confused with the non-SMP 78xx0 SoCs
-
- Product Brief:
- https://web.archive.org/web/20121021173528/http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf
-
- Functional Spec:
- https://web.archive.org/web/20180829171131/http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf
-
- - Hardware Specs:
-
- - https://web.archive.org/web/20141127013651/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF
- - https://web.archive.org/web/20141222000224/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
- - https://web.archive.org/web/20141222000230/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
-
- Core:
- Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
-
Linux kernel mach directory:
arch/arm/mach-mvebu
Linux kernel plat directory:
@@ -212,6 +215,7 @@ EBU Armada family ARMv8
arch/arm64/boot/dts/marvell/armada-37*
Armada 7K Flavors:
+ - 88F6040 (AP806 Quad 600 MHz + one CP110)
- 88F7020 (AP806 Dual + one CP110)
- 88F7040 (AP806 Quad + one CP110)
@@ -243,14 +247,33 @@ EBU Armada family ARMv8
Device tree files:
arch/arm64/boot/dts/marvell/armada-80*
+ Octeon TX2 CN913x Flavors:
+ - CN9130 (AP807 Quad + one internal CP115)
+ - CN9131 (AP807 Quad + one internal CP115 + one external CP115 / 88F8215)
+ - CN9132 (AP807 Quad + one internal CP115 + two external CP115 / 88F8215)
+
+ Core:
+ ARM Cortex A72
+
+ Homepage:
+ https://web.archive.org/web/20200803150818/https://www.marvell.com/products/infrastructure-processors/multi-core-processors/octeon-tx2/octeon-tx2-cn9130.html
+
+ Product Brief:
+ https://web.archive.org/web/20200803150818/https://www.marvell.com/content/dam/marvell/en/public-collateral/embedded-processors/marvell-infrastructure-processors-octeon-tx2-cn913x-product-brief-2020-02.pdf
+
+ Device tree files:
+ arch/arm64/boot/dts/marvell/cn913*
+
Avanta family
-------------
Flavors:
+ - 88F6500
- 88F6510
- 88F6530P
- 88F6550
- 88F6560
+ - 88F6601
Homepage:
https://web.archive.org/web/20181005145041/http://www.marvell.com/broadband/
@@ -353,8 +376,6 @@ PXA 2xx/3xx/93x/95x family
Linux kernel mach directory:
arch/arm/mach-pxa
- Linux kernel plat directory:
- arch/arm/plat-pxa
MMP/MMP2/MMP3 family (communication processor)
----------------------------------------------
@@ -408,8 +429,6 @@ MMP/MMP2/MMP3 family (communication processor)
Linux kernel mach directory:
arch/arm/mach-mmp
- Linux kernel plat directory:
- arch/arm/plat-pxa
Berlin family (Multimedia Solutions)
-------------------------------------
@@ -417,7 +436,7 @@ Berlin family (Multimedia Solutions)
- Flavors:
- 88DE3010, Armada 1000 (no Linux support)
- Core: Marvell PJ1 (ARMv5TE), Dual-core
- - Product Brief: http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
+ - Product Brief: https://web.archive.org/web/20131103162620/http://www.marvell.com/digital-entertainment/assets/armada_1000_pb.pdf
- 88DE3005, Armada 1500 Mini
- Design name: BG2CD
- Core: ARM Cortex-A9, PL310 L2CC
@@ -497,9 +516,6 @@ Long-term plans
Business Unit) in a single mach-<foo> directory. The plat-orion/
would therefore disappear.
- * Unify the mach-mmp/ and mach-pxa/ into the same mach-pxa
- directory. The plat-pxa/ would therefore disappear.
-
Credits
-------
diff --git a/Documentation/arm/microchip.rst b/Documentation/arm/microchip.rst
index 9c013299fd3b..e721d855f2c9 100644
--- a/Documentation/arm/microchip.rst
+++ b/Documentation/arm/microchip.rst
@@ -137,6 +137,26 @@ the Microchip website: http://www.microchip.com.
http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001476B.pdf
+ * ARM Cortex-A7 based SoCs
+ - sama7g5 family
+
+ - sama7g51
+ - sama7g52
+ - sama7g53
+ - sama7g54 (device superset)
+
+ * Datasheet
+
+ Coming soon
+
+ - lan966 family
+ - lan9662
+ - lan9668
+
+ * Datasheet
+
+ Coming soon
+
* ARM Cortex-M7 MCUs
- sams70 family
diff --git a/Documentation/arm/samsung-s3c24xx/cpufreq.rst b/Documentation/arm/samsung-s3c24xx/cpufreq.rst
deleted file mode 100644
index 2ddc26c03b1f..000000000000
--- a/Documentation/arm/samsung-s3c24xx/cpufreq.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-=======================
-S3C24XX CPUfreq support
-=======================
-
-Introduction
-------------
-
- The S3C24XX series support a number of power saving systems, such as
- the ability to change the core, memory and peripheral operating
- frequencies. The core control is exported via the CPUFreq driver
- which has a number of different manual or automatic controls over the
- rate the core is running at.
-
- There are two forms of the driver depending on the specific CPU and
- how the clocks are arranged. The first implementation used as single
- PLL to feed the ARM, memory and peripherals via a series of dividers
- and muxes and this is the implementation that is documented here. A
- newer version where there is a separate PLL and clock divider for the
- ARM core is available as a separate driver.
-
-
-Layout
-------
-
- The core code manages the CPU specific drivers, any data that they
- need to register and the interface to the generic drivers/cpufreq
- system. Each CPU registers a driver to control the PLL, clock dividers
- and anything else associated with it. Any board that wants to use this
- framework needs to supply at least basic details of what is required.
-
- The core registers with drivers/cpufreq at init time if all the data
- necessary has been supplied.
-
-
-CPU support
------------
-
- The support for each CPU depends on the facilities provided by the
- SoC and the driver as each device has different PLL and clock chains
- associated with it.
-
-
-Slow Mode
----------
-
- The SLOW mode where the PLL is turned off altogether and the
- system is fed by the external crystal input is currently not
- supported.
-
-
-sysfs
------
-
- The core code exports extra information via sysfs in the directory
- devices/system/cpu/cpu0/arch-freq.
-
-
-Board Support
--------------
-
- Each board that wants to use the cpufreq code must register some basic
- information with the core driver to provide information about what the
- board requires and any restrictions being placed on it.
-
- The board needs to supply information about whether it needs the IO bank
- timings changing, any maximum frequency limits and information about the
- SDRAM refresh rate.
-
-
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2009 Simtec Electronics
-Licensed under GPLv2
diff --git a/Documentation/arm/samsung-s3c24xx/eb2410itx.rst b/Documentation/arm/samsung-s3c24xx/eb2410itx.rst
deleted file mode 100644
index 7863c93652f8..000000000000
--- a/Documentation/arm/samsung-s3c24xx/eb2410itx.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-===================================
-Simtec Electronics EB2410ITX (BAST)
-===================================
-
- http://www.simtec.co.uk/products/EB2410ITX/
-
-Introduction
-------------
-
- The EB2410ITX is a S3C2410 based development board with a variety of
- peripherals and expansion connectors. This board is also known by
- the shortened name of Bast.
-
-
-Configuration
--------------
-
- To set the default configuration, use `make bast_defconfig` which
- supports the commonly used features of this board.
-
-
-Support
--------
-
- Official support information can be found on the Simtec Electronics
- website, at the product page http://www.simtec.co.uk/products/EB2410ITX/
-
- Useful links:
-
- - Resources Page http://www.simtec.co.uk/products/EB2410ITX/resources.html
-
- - Board FAQ at http://www.simtec.co.uk/products/EB2410ITX/faq.html
-
- - Bootloader info http://www.simtec.co.uk/products/SWABLE/resources.html
- and FAQ http://www.simtec.co.uk/products/SWABLE/faq.html
-
-
-MTD
----
-
- The NAND and NOR support has been merged from the linux-mtd project.
- Any problems, see http://www.linux-mtd.infradead.org/ for more
- information or up-to-date versions of linux-mtd.
-
-
-IDE
----
-
- Both onboard IDE ports are supported, however there is no support for
- changing speed of devices, PIO Mode 4 capable drives should be used.
-
-
-Maintainers
------------
-
- This board is maintained by Simtec Electronics.
-
-
-Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/gpio.rst b/Documentation/arm/samsung-s3c24xx/gpio.rst
deleted file mode 100644
index f4a8c800a457..000000000000
--- a/Documentation/arm/samsung-s3c24xx/gpio.rst
+++ /dev/null
@@ -1,172 +0,0 @@
-====================
-S3C24XX GPIO Control
-====================
-
-Introduction
-------------
-
- The s3c2410 kernel provides an interface to configure and
- manipulate the state of the GPIO pins, and find out other
- information about them.
-
- There are a number of conditions attached to the configuration
- of the s3c2410 GPIO system, please read the Samsung provided
- data-sheet/users manual to find out the complete list.
-
- See Documentation/arm/samsung/gpio.rst for the core implementation.
-
-
-GPIOLIB
--------
-
- With the event of the GPIOLIB in drivers/gpio, support for some
- of the GPIO functions such as reading and writing a pin will
- be removed in favour of this common access method.
-
- Once all the extant drivers have been converted, the functions
- listed below will be removed (they may be marked as __deprecated
- in the near future).
-
- The following functions now either have a `s3c_` specific variant
- or are merged into gpiolib. See the definitions in
- arch/arm/mach-s3c/gpio-cfg.h:
-
- - s3c2410_gpio_setpin() gpio_set_value() or gpio_direction_output()
- - s3c2410_gpio_getpin() gpio_get_value() or gpio_direction_input()
- - s3c2410_gpio_getirq() gpio_to_irq()
- - s3c2410_gpio_cfgpin() s3c_gpio_cfgpin()
- - s3c2410_gpio_getcfg() s3c_gpio_getcfg()
- - s3c2410_gpio_pullup() s3c_gpio_setpull()
-
-
-GPIOLIB conversion
-------------------
-
-If you need to convert your board or driver to use gpiolib from the phased
-out s3c2410 API, then here are some notes on the process.
-
-1) If your board is exclusively using an GPIO, say to control peripheral
- power, then it will require to claim the gpio with gpio_request() before
- it can use it.
-
- It is recommended to check the return value, with at least WARN_ON()
- during initialisation.
-
-2) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin()
- as they have the same arguments, and can either take the pin specific
- values, or the more generic special-function-number arguments.
-
-3) s3c2410_gpio_pullup() changes have the problem that while the
- s3c2410_gpio_pullup(x, 1) can be easily translated to the
- s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
- are not so easy.
-
- The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case
- of some of the devices, a pull-down) and as such the new API distinguishes
- between the UP and DOWN case. There is currently no 'just turn on' setting
- which may be required if this becomes a problem.
-
-4) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call
- does not implicitly configure the relevant gpio to output. The gpio
- direction should be changed before using gpio_set_value().
-
-5) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin
- has been set to input. It is currently unknown what the behaviour is
- when using gpio_get_value() on an output pin (s3c2410_gpio_getpin
- would return the value the pin is supposed to be outputting).
-
-6) s3c2410_gpio_getirq() should be directly replaceable with the
- gpio_to_irq() call.
-
-The s3c2410_gpio and `gpio_` calls have always operated on the same gpio
-numberspace, so there is no problem with converting the gpio numbering
-between the calls.
-
-
-Headers
--------
-
- See arch/arm/mach-s3c/regs-gpio-s3c24xx.h for the list
- of GPIO pins, and the configuration values for them. This
- is included by using #include <mach/regs-gpio.h>
-
-
-PIN Numbers
------------
-
- Each pin has an unique number associated with it in regs-gpio.h,
- e.g. S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
- the GPIO functions which pin is to be used.
-
- With the conversion to gpiolib, there is no longer a direct conversion
- from gpio pin number to register base address as in earlier kernels. This
- is due to the number space required for newer SoCs where the later
- GPIOs are not contiguous.
-
-
-Configuring a pin
------------------
-
- The following function allows the configuration of a given pin to
- be changed.
-
- void s3c_gpio_cfgpin(unsigned int pin, unsigned int function);
-
- e.g.:
-
- s3c_gpio_cfgpin(S3C2410_GPA(0), S3C_GPIO_SFN(1));
- s3c_gpio_cfgpin(S3C2410_GPE(8), S3C_GPIO_SFN(2));
-
- which would turn GPA(0) into the lowest Address line A0, and set
- GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
-
-
-Reading the current configuration
----------------------------------
-
- The current configuration of a pin can be read by using standard
- gpiolib function:
-
- s3c_gpio_getcfg(unsigned int pin);
-
- The return value will be from the same set of values which can be
- passed to s3c_gpio_cfgpin().
-
-
-Configuring a pull-up resistor
-------------------------------
-
- A large proportion of the GPIO pins on the S3C2410 can have weak
- pull-up resistors enabled. This can be configured by the following
- function:
-
- void s3c_gpio_setpull(unsigned int pin, unsigned int to);
-
- Where the to value is S3C_GPIO_PULL_NONE to set the pull-up off,
- and S3C_GPIO_PULL_UP to enable the specified pull-up. Any other
- values are currently undefined.
-
-
-Getting and setting the state of a PIN
---------------------------------------
-
- These calls are now implemented by the relevant gpiolib calls, convert
- your board or driver to use gpiolib.
-
-
-Getting the IRQ number associated with a PIN
---------------------------------------------
-
- A standard gpiolib function can map the given pin number to an IRQ
- number to pass to the IRQ system.
-
- int gpio_to_irq(unsigned int pin);
-
- Note, not all pins have an IRQ.
-
-
-Author
--------
-
-Ben Dooks, 03 October 2004
-Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/h1940.rst b/Documentation/arm/samsung-s3c24xx/h1940.rst
deleted file mode 100644
index 62a562c178e3..000000000000
--- a/Documentation/arm/samsung-s3c24xx/h1940.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-=============
-HP IPAQ H1940
-=============
-
-http://www.handhelds.org/projects/h1940.html
-
-Introduction
-------------
-
- The HP H1940 is a S3C2410 based handheld device, with
- bluetooth connectivity.
-
-
-Support
--------
-
- A variety of information is available
-
- handhelds.org project page:
-
- http://www.handhelds.org/projects/h1940.html
-
- handhelds.org wiki page:
-
- http://handhelds.org/moin/moin.cgi/HpIpaqH1940
-
- Herbert Pötzl pages:
-
- http://vserver.13thfloor.at/H1940/
-
-
-Maintainers
------------
-
- This project is being maintained and developed by a variety
- of people, including Ben Dooks, Arnaud Patard, and Herbert Pötzl.
-
- Thanks to the many others who have also provided support.
-
-
-(c) 2005 Ben Dooks
diff --git a/Documentation/arm/samsung-s3c24xx/index.rst b/Documentation/arm/samsung-s3c24xx/index.rst
deleted file mode 100644
index ccb951a0bedb..000000000000
--- a/Documentation/arm/samsung-s3c24xx/index.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==========================
-Samsung S3C24XX SoC Family
-==========================
-
-.. toctree::
- :maxdepth: 1
-
- h1940
- gpio
- cpufreq
- suspend
- usb-host
- s3c2412
- eb2410itx
- nand
- smdk2440
- s3c2413
- overview
diff --git a/Documentation/arm/samsung-s3c24xx/nand.rst b/Documentation/arm/samsung-s3c24xx/nand.rst
deleted file mode 100644
index 938995694ee7..000000000000
--- a/Documentation/arm/samsung-s3c24xx/nand.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-====================
-S3C24XX NAND Support
-====================
-
-Introduction
-------------
-
-Small Page NAND
----------------
-
-The driver uses a 512 byte (1 page) ECC code for this setup. The
-ECC code is not directly compatible with the default kernel ECC
-code, so the driver enforces its own OOB layout and ECC parameters
-
-Large Page NAND
----------------
-
-The driver is capable of handling NAND flash with a 2KiB page
-size, with support for hardware ECC generation and correction.
-
-Unlike the 512byte page mode, the driver generates ECC data for
-each 256 byte block in an 2KiB page. This means that more than
-one error in a page can be rectified. It also means that the
-OOB layout remains the default kernel layout for these flashes.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2007 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/overview.rst b/Documentation/arm/samsung-s3c24xx/overview.rst
deleted file mode 100644
index 14535e5cffb7..000000000000
--- a/Documentation/arm/samsung-s3c24xx/overview.rst
+++ /dev/null
@@ -1,311 +0,0 @@
-==========================
-S3C24XX ARM Linux Overview
-==========================
-
-
-
-Introduction
-------------
-
- The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
- by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
- S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443 and S3C2450 devices
- are supported.
-
- Support for the S3C2400 and S3C24A0 series was never completed and the
- corresponding code has been removed after a while. If someone wishes to
- revive this effort, partial support can be retrieved from earlier Linux
- versions.
-
- The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
- included under the arch/arm/mach-s3c directory. Note, while core
- support for these SoCs is in, work on some of the extra peripherals
- and extra interrupts is still ongoing.
-
-
-Configuration
--------------
-
- A generic S3C2410 configuration is provided, and can be used as the
- default by `make s3c2410_defconfig`. This configuration has support
- for all the machines, and the commonly used features on them.
-
- Certain machines may have their own default configurations as well,
- please check the machine specific documentation.
-
-
-Layout
-------
-
- The core support files, registers, kernel and platform data are located in the
- platform code contained in arch/arm/mach-s3c with headers in
- arch/arm/mach-s3c/include
-
-arch/arm/mach-s3c:
-
- Files in here are either common to all the s3c24xx family,
- or are common to only some of them with names to indicate this
- status. The files that are not common to all are generally named
- with the initial cpu they support in the series to ensure a short
- name without any possibility of confusion with newer devices.
-
- As an example, initially s3c244x would cover s3c2440 and s3c2442, but
- with the s3c2443 which does not share many of the same drivers in
- this directory, the name becomes invalid. We stick to s3c2440-<x>
- to indicate a driver that is s3c2440 and s3c2442 compatible.
-
- This does mean that to find the status of any given SoC, a number
- of directories may need to be searched.
-
-
-Machines
---------
-
- The currently supported machines are as follows:
-
- Simtec Electronics EB2410ITX (BAST)
-
- A general purpose development board, see EB2410ITX.txt for further
- details
-
- Simtec Electronics IM2440D20 (Osiris)
-
- CPU Module from Simtec Electronics, with a S3C2440A CPU, nand flash
- and a PCMCIA controller.
-
- Samsung SMDK2410
-
- Samsung's own development board, geared for PDA work.
-
- Samsung/Aiji SMDK2412
-
- The S3C2412 version of the SMDK2440.
-
- Samsung/Aiji SMDK2413
-
- The S3C2412 version of the SMDK2440.
-
- Samsung/Meritech SMDK2440
-
- The S3C2440 compatible version of the SMDK2440, which has the
- option of an S3C2440 or S3C2442 CPU module.
-
- Thorcom VR1000
-
- Custom embedded board
-
- HP IPAQ 1940
-
- Handheld (IPAQ), available in several varieties
-
- HP iPAQ rx3715
-
- S3C2440 based IPAQ, with a number of variations depending on
- features shipped.
-
- Acer N30
-
- A S3C2410 based PDA from Acer. There is a Wiki page at
- http://handhelds.org/moin/moin.cgi/AcerN30Documentation .
-
- AML M5900
-
- American Microsystems' M5900
-
- Nex Vision Nexcoder
- Nex Vision Otom
-
- Two machines by Nex Vision
-
-
-Adding New Machines
--------------------
-
- The architecture has been designed to support as many machines as can
- be configured for it in one kernel build, and any future additions
- should keep this in mind before altering items outside of their own
- machine files.
-
- Machine definitions should be kept in arch/arm/mach-s3c,
- and there are a number of examples that can be looked at.
-
- Read the kernel patch submission policies as well as the
- Documentation/arm directory before submitting patches. The
- ARM kernel series is managed by Russell King, and has a patch system
- located at http://www.arm.linux.org.uk/developer/patches/
- as well as mailing lists that can be found from the same site.
-
- As a courtesy, please notify <ben-linux@fluff.org> of any new
- machines or other modifications.
-
- Any large scale modifications, or new drivers should be discussed
- on the ARM kernel mailing list (linux-arm-kernel) before being
- attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
- mailing list information.
-
-
-I2C
----
-
- The hardware I2C core in the CPU is supported in single master
- mode, and can be configured via platform data.
-
-
-RTC
----
-
- Support for the onboard RTC unit, including alarm function.
-
- This has recently been upgraded to use the new RTC core,
- and the module has been renamed to rtc-s3c to fit in with
- the new rtc naming scheme.
-
-
-Watchdog
---------
-
- The onchip watchdog is available via the standard watchdog
- interface.
-
-
-NAND
-----
-
- The current kernels now have support for the s3c2410 NAND
- controller. If there are any problems the latest linux-mtd
- code can be found from http://www.linux-mtd.infradead.org/
-
- For more information see Documentation/arm/samsung-s3c24xx/nand.rst
-
-
-SD/MMC
-------
-
- The SD/MMC hardware pre S3C2443 is supported in the current
- kernel, the driver is drivers/mmc/host/s3cmci.c and supports
- 1 and 4 bit SD or MMC cards.
-
- The SDIO behaviour of this driver has not been fully tested. There is no
- current support for hardware SDIO interrupts.
-
-
-Serial
-------
-
- The s3c2410 serial driver provides support for the internal
- serial ports. These devices appear as /dev/ttySAC0 through 3.
-
- To create device nodes for these, use the following commands
-
- mknod ttySAC0 c 204 64
- mknod ttySAC1 c 204 65
- mknod ttySAC2 c 204 66
-
-
-GPIO
-----
-
- The core contains support for manipulating the GPIO, see the
- documentation in GPIO.txt in the same directory as this file.
-
- Newer kernels carry GPIOLIB, and support is being moved towards
- this with some of the older support in line to be removed.
-
- As of v2.6.34, the move towards using gpiolib support is almost
- complete, and very little of the old calls are left.
-
- See Documentation/arm/samsung-s3c24xx/gpio.rst for the S3C24XX specific
- support and Documentation/arm/samsung/gpio.rst for the core Samsung
- implementation.
-
-
-Clock Management
-----------------
-
- The core provides the interface defined in the header file
- include/asm-arm/hardware/clock.h, to allow control over the
- various clock units
-
-
-Suspend to RAM
---------------
-
- For boards that provide support for suspend to RAM, the
- system can be placed into low power suspend.
-
- See Suspend.txt for more information.
-
-
-SPI
----
-
- SPI drivers are available for both the in-built hardware
- (although there is no DMA support yet) and a generic
- GPIO based solution.
-
-
-LEDs
-----
-
- There is support for GPIO based LEDs via a platform driver
- in the LED subsystem.
-
-
-Platform Data
--------------
-
- Whenever a device has platform specific data that is specified
- on a per-machine basis, care should be taken to ensure the
- following:
-
- 1) that default data is not left in the device to confuse the
- driver if a machine does not set it at startup
-
- 2) the data should (if possible) be marked as __initdata,
- to ensure that the data is thrown away if the machine is
- not the one currently in use.
-
- The best way of doing this is to make a function that
- kmalloc()s an area of memory, and copies the __initdata
- and then sets the relevant device's platform data. Making
- the function `__init` takes care of ensuring it is discarded
- with the rest of the initialisation code::
-
- static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
- {
- struct s3c2410_xxx_mach_info *npd;
-
- npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
- if (npd) {
- memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
- s3c_device_xxx.dev.platform_data = npd;
- } else {
- printk(KERN_ERR "no memory for xxx platform data\n");
- }
- }
-
- Note, since the code is marked as __init, it should not be
- exported outside arch/arm/mach-s3c/, or exported to
- modules via EXPORT_SYMBOL() and related functions.
-
-
-Port Contributors
------------------
-
- Ben Dooks (BJD)
- Vincent Sanders
- Herbert Potzl
- Arnaud Patard (RTP)
- Roc Wu
- Klaus Fetscher
- Dimitry Andric
- Shannon Holland
- Guillaume Gourat (NexVision)
- Christer Weinigel (wingel) (Acer N30)
- Lucas Correia Villa Real (S3C2400 port)
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2004-2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/s3c2412.rst b/Documentation/arm/samsung-s3c24xx/s3c2412.rst
deleted file mode 100644
index 68b985fc6bf4..000000000000
--- a/Documentation/arm/samsung-s3c24xx/s3c2412.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-==========================
-S3C2412 ARM Linux Overview
-==========================
-
-Introduction
-------------
-
- The S3C2412 is part of the S3C24XX range of ARM9 System-on-Chip CPUs
- from Samsung. This part has an ARM926-EJS core, capable of running up
- to 266MHz (see data-sheet for more information)
-
-
-Clock
------
-
- The core clock code provides a set of clocks to the drivers, and allows
- for source selection and a number of other features.
-
-
-Power
------
-
- No support for suspend/resume to RAM in the current system.
-
-
-DMA
----
-
- No current support for DMA.
-
-
-GPIO
-----
-
- There is support for setting the GPIO to input/output/special function
- and reading or writing to them.
-
-
-UART
-----
-
- The UART hardware is similar to the S3C2440, and is supported by the
- s3c2410 driver in the drivers/serial directory.
-
-
-NAND
-----
-
- The NAND hardware is similar to the S3C2440, and is supported by the
- s3c2410 driver in the drivers/mtd/nand/raw directory.
-
-
-USB Host
---------
-
- The USB hardware is similar to the S3C2410, with extended clock source
- control. The OHCI portion is supported by the ohci-s3c2410 driver, and
- the clock control selection is supported by the core clock code.
-
-
-USB Device
-----------
-
- No current support in the kernel
-
-
-IRQs
-----
-
- All the standard, and external interrupt sources are supported. The
- extra sub-sources are not yet supported.
-
-
-RTC
----
-
- The RTC hardware is similar to the S3C2410, and is supported by the
- s3c2410-rtc driver.
-
-
-Watchdog
---------
-
- The watchdog hardware is the same as the S3C2410, and is supported by
- the s3c2410_wdt driver.
-
-
-MMC/SD/SDIO
------------
-
- No current support for the MMC/SD/SDIO block.
-
-IIC
----
-
- The IIC hardware is the same as the S3C2410, and is supported by the
- i2c-s3c24xx driver.
-
-
-IIS
----
-
- No current support for the IIS interface.
-
-
-SPI
----
-
- No current support for the SPI interfaces.
-
-
-ATA
----
-
- No current support for the on-board ATA block.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/s3c2413.rst b/Documentation/arm/samsung-s3c24xx/s3c2413.rst
deleted file mode 100644
index 1f51e207fc46..000000000000
--- a/Documentation/arm/samsung-s3c24xx/s3c2413.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-==========================
-S3C2413 ARM Linux Overview
-==========================
-
-Introduction
-------------
-
- The S3C2413 is an extended version of the S3C2412, with a camera
- interface and mobile DDR memory support. See the S3C2412 support
- documentation for more information.
-
-
-Camera Interface
-----------------
-
- This block is currently not supported.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/smdk2440.rst b/Documentation/arm/samsung-s3c24xx/smdk2440.rst
deleted file mode 100644
index 524fd0b4afaf..000000000000
--- a/Documentation/arm/samsung-s3c24xx/smdk2440.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-=========================
-Samsung/Meritech SMDK2440
-=========================
-
-Introduction
-------------
-
- The SMDK2440 is a two part evaluation board for the Samsung S3C2440
- processor. It includes support for LCD, SmartMedia, Audio, SD and
- 10MBit Ethernet, and expansion headers for various signals, including
- the camera and unused GPIO.
-
-
-Configuration
--------------
-
- To set the default configuration, use `make smdk2440_defconfig` which
- will configure the common features of this board, or use
- `make s3c2410_config` to include support for all s3c2410/s3c2440 machines
-
-
-Support
--------
-
- Ben Dooks' SMDK2440 site at http://www.fluff.org/ben/smdk2440/ which
- includes linux based USB download tools.
-
- Some of the h1940 patches that can be found from the H1940 project
- site at http://www.handhelds.org/projects/h1940.html can also be
- applied to this board.
-
-
-Peripherals
------------
-
- There is no current support for any of the extra peripherals on the
- base-board itself.
-
-
-MTD
----
-
- The NAND flash should be supported by the in kernel MTD NAND support,
- NOR flash will be added later.
-
-
-Maintainers
------------
-
- This board is being maintained by Ben Dooks, for more info, see
- http://www.fluff.org/ben/smdk2440/
-
- Many thanks to Dimitry Andric of TomTom for the loan of the SMDK2440,
- and to Simtec Electronics for allowing me time to work on this.
-
-
-(c) 2004 Ben Dooks
diff --git a/Documentation/arm/samsung-s3c24xx/suspend.rst b/Documentation/arm/samsung-s3c24xx/suspend.rst
deleted file mode 100644
index b4f3ae9fe76e..000000000000
--- a/Documentation/arm/samsung-s3c24xx/suspend.rst
+++ /dev/null
@@ -1,137 +0,0 @@
-=======================
-S3C24XX Suspend Support
-=======================
-
-
-Introduction
-------------
-
- The S3C24XX supports a low-power suspend mode, where the SDRAM is kept
- in Self-Refresh mode, and all but the essential peripheral blocks are
- powered down. For more information on how this works, please look
- at the relevant CPU datasheet from Samsung.
-
-
-Requirements
-------------
-
- 1) A bootloader that can support the necessary resume operation
-
- 2) Support for at least 1 source for resume
-
- 3) CONFIG_PM enabled in the kernel
-
- 4) Any peripherals that are going to be powered down at the same
- time require suspend/resume support.
-
-
-Resuming
---------
-
- The S3C2410 user manual defines the process of sending the CPU to
- sleep and how it resumes. The default behaviour of the Linux code
- is to set the GSTATUS3 register to the physical address of the
- code to resume Linux operation.
-
- GSTATUS4 is currently left alone by the sleep code, and is free to
- use for any other purposes (for example, the EB2410ITX uses this to
- save memory configuration in).
-
-
-Machine Support
----------------
-
- The machine specific functions must call the s3c_pm_init() function
- to say that its bootloader is capable of resuming. This can be as
- simple as adding the following to the machine's definition:
-
- INITMACHINE(s3c_pm_init)
-
- A board can do its own setup before calling s3c_pm_init, if it
- needs to setup anything else for power management support.
-
- There is currently no support for over-riding the default method of
- saving the resume address, if your board requires it, then contact
- the maintainer and discuss what is required.
-
- Note, the original method of adding a late_initcall() is wrong,
- and will end up initialising all compiled machines' pm init!
-
- The following is an example of code used for testing wakeup from
- a falling edge on IRQ_EINT0::
-
-
- static irqreturn_t button_irq(int irq, void *pw)
- {
- return IRQ_HANDLED;
- }
-
- static void __init machine_init(void)
- {
- ...
-
- request_irq(IRQ_EINT0, button_irq, IRQF_TRIGGER_FALLING,
- "button-irq-eint0", NULL);
-
- enable_irq_wake(IRQ_EINT0);
-
- s3c_pm_init();
- }
-
-
-Debugging
----------
-
- There are several important things to remember when using PM suspend:
-
- 1) The uart drivers will disable the clocks to the UART blocks when
- suspending, which means that use of printascii() or similar direct
- access to the UARTs will cause the debug to stop.
-
- 2) While the pm code itself will attempt to re-enable the UART clocks,
- care should be taken that any external clock sources that the UARTs
- rely on are still enabled at that point.
-
- 3) If any debugging is placed in the resume path, then it must have the
- relevant clocks and peripherals setup before use (ie, bootloader).
-
- For example, if you transmit a character from the UART, the baud
- rate and uart controls must be setup beforehand.
-
-
-Configuration
--------------
-
- The S3C2410 specific configuration in `System Type` defines various
- aspects of how the S3C2410 suspend and resume support is configured
-
- `S3C2410 PM Suspend debug`
-
- This option prints messages to the serial console before and after
- the actual suspend, giving detailed information on what is
- happening
-
-
- `S3C2410 PM Suspend Memory CRC`
-
- Allows the entire memory to be checksummed before and after the
- suspend to see if there has been any corruption of the contents.
-
- Note, the time to calculate the CRC is dependent on the CPU speed
- and the size of memory. For an 64Mbyte RAM area on an 200MHz
- S3C2410, this can take approximately 4 seconds to complete.
-
- This support requires the CRC32 function to be enabled.
-
-
- `S3C2410 PM Suspend CRC Chunksize (KiB)`
-
- Defines the size of memory each CRC chunk covers. A smaller value
- will mean that the CRC data block will take more memory, but will
- identify any faults with better precision
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2004 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/usb-host.rst b/Documentation/arm/samsung-s3c24xx/usb-host.rst
deleted file mode 100644
index 7aaffac89e04..000000000000
--- a/Documentation/arm/samsung-s3c24xx/usb-host.rst
+++ /dev/null
@@ -1,91 +0,0 @@
-========================
-S3C24XX USB Host support
-========================
-
-
-
-Introduction
-------------
-
- This document details the S3C2410/S3C2440 in-built OHCI USB host support.
-
-Configuration
--------------
-
- Enable at least the following kernel options:
-
- menuconfig::
-
- Device Drivers --->
- USB support --->
- <*> Support for Host-side USB
- <*> OHCI HCD support
-
-
- .config:
-
- - CONFIG_USB
- - CONFIG_USB_OHCI_HCD
-
-
- Once these options are configured, the standard set of USB device
- drivers can be configured and used.
-
-
-Board Support
--------------
-
- The driver attaches to a platform device, which will need to be
- added by the board specific support file in arch/arm/mach-s3c,
- such as mach-bast.c or mach-smdk2410.c
-
- The platform device's platform_data field is only needed if the
- board implements extra power control or over-current monitoring.
-
- The OHCI driver does not ensure the state of the S3C2410's MISCCTRL
- register, so if both ports are to be used for the host, then it is
- the board support file's responsibility to ensure that the second
- port is configured to be connected to the OHCI core.
-
-
-Platform Data
--------------
-
- See include/linux/platform_data/usb-ohci-s3c2410.h for the
- descriptions of the platform device data. An implementation
- can be found in arch/arm/mach-s3c/simtec-usb.c .
-
- The `struct s3c2410_hcd_info` contains a pair of functions
- that get called to enable over-current detection, and to
- control the port power status.
-
- The ports are numbered 0 and 1.
-
- power_control:
- Called to enable or disable the power on the port.
-
- enable_oc:
- Called to enable or disable the over-current monitoring.
- This should claim or release the resources being used to
- check the power condition on the port, such as an IRQ.
-
- report_oc:
- The OHCI driver fills this field in for the over-current code
- to call when there is a change to the over-current state on
- an port. The ports argument is a bitmask of 1 bit per port,
- with bit X being 1 for an over-current on port X.
-
- The function s3c2410_usb_report_oc() has been provided to
- ensure this is called correctly.
-
- port[x]:
- This struct describes each port, 0 or 1. The platform driver
- should set the flags field of each port to S3C_HCDFLG_USED if
- the port is enabled.
-
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2005 Simtec Electronics
diff --git a/Documentation/arm/samsung/gpio.rst b/Documentation/arm/samsung/gpio.rst
index f6e27b07c993..27fae0d50361 100644
--- a/Documentation/arm/samsung/gpio.rst
+++ b/Documentation/arm/samsung/gpio.rst
@@ -9,14 +9,6 @@ This outlines the Samsung GPIO implementation and the architecture
specific calls provided alongside the drivers/gpio core.
-S3C24XX (Legacy)
-----------------
-
-See Documentation/arm/samsung-s3c24xx/gpio.rst for more information
-about these devices. Their implementation has been brought into line
-with the core samsung implementation described in this document.
-
-
GPIOLIB integration
-------------------
diff --git a/Documentation/arm/samsung/overview.rst b/Documentation/arm/samsung/overview.rst
index e74307897416..8b15a190169b 100644
--- a/Documentation/arm/samsung/overview.rst
+++ b/Documentation/arm/samsung/overview.rst
@@ -12,21 +12,10 @@ Introduction
The currently supported SoCs are:
- - S3C24XX: See Documentation/arm/samsung-s3c24xx/overview.rst for full list
- S3C64XX: S3C6400 and S3C6410
- S5PC110 / S5PV210
-S3C24XX Systems
----------------
-
- There is still documentation in Documentation/arm/Samsung-S3C24XX/ which
- deals with the architecture and drivers specific to these devices.
-
- See Documentation/arm/samsung-s3c24xx/overview.rst for more information
- on the implementation details and specific support.
-
-
Configuration
-------------
@@ -51,8 +40,6 @@ Layout
specific information. It contains the base clock, GPIO and device definitions
to get the system running.
- plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
-
plat-s5p is for s5p specific builds, and contains common support for the
S5P specific systems. Not all S5Ps use all the features in this directory
due to differences in the hardware.
diff --git a/Documentation/arm/sti/overview.rst b/Documentation/arm/sti/overview.rst
index 70743617a74f..ae16aced800f 100644
--- a/Documentation/arm/sti/overview.rst
+++ b/Documentation/arm/sti/overview.rst
@@ -7,22 +7,18 @@ Introduction
The ST Microelectronics Multimedia and Application Processors range of
CortexA9 System-on-Chip are supported by the 'STi' platform of
- ARM Linux. Currently STiH415, STiH416 SOCs are supported with both
- B2000 and B2020 Reference boards.
+ ARM Linux. Currently STiH407, STiH410 and STiH418 are supported.
configuration
-------------
- A generic configuration is provided for both STiH415/416, and can be used as the
- default by::
-
- make stih41x_defconfig
+ The configuration for the STi platform is supported via the multi_v7_defconfig.
Layout
------
- All the files for multiple machine families (STiH415, STiH416, and STiG125)
+ All the files for multiple machine families (STiH407, STiH410, and STiH418)
are located in the platform code contained in arch/arm/mach-sti
There is a generic board board-dt.c in the mach folder which support
diff --git a/Documentation/arm/sti/stih415-overview.rst b/Documentation/arm/sti/stih415-overview.rst
deleted file mode 100644
index b67452d610c4..000000000000
--- a/Documentation/arm/sti/stih415-overview.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-================
-STiH415 Overview
-================
-
-Introduction
-------------
-
- The STiH415 is the next generation of HD, AVC set-top box processors
- for satellite, cable, terrestrial and IP-STB markets.
-
- Features:
-
- - ARM Cortex-A9 1.0 GHz, dual-core CPU
- - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih416-overview.rst b/Documentation/arm/sti/stih416-overview.rst
deleted file mode 100644
index 93f17d74d8db..000000000000
--- a/Documentation/arm/sti/stih416-overview.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-================
-STiH416 Overview
-================
-
-Introduction
-------------
-
- The STiH416 is the next generation of HD, AVC set-top box processors
- for satellite, cable, terrestrial and IP-STB markets.
-
- Features
- - ARM Cortex-A9 1.2 GHz dual core CPU
- - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/stm32/stm32-dma-mdma-chaining.rst b/Documentation/arm/stm32/stm32-dma-mdma-chaining.rst
new file mode 100644
index 000000000000..2945e0e33104
--- /dev/null
+++ b/Documentation/arm/stm32/stm32-dma-mdma-chaining.rst
@@ -0,0 +1,415 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+STM32 DMA-MDMA chaining
+=======================
+
+
+Introduction
+------------
+
+ This document describes the STM32 DMA-MDMA chaining feature. Before going
+ further, let's introduce the peripherals involved.
+
+ To offload data transfers from the CPU, STM32 microprocessors (MPUs) embed
+ direct memory access controllers (DMA).
+
+ STM32MP1 SoCs embed both STM32 DMA and STM32 MDMA controllers. STM32 DMA
+ request routing capabilities are enhanced by a DMA request multiplexer
+ (STM32 DMAMUX).
+
+ **STM32 DMAMUX**
+
+ STM32 DMAMUX routes any DMA request from a given peripheral to any channel of
+ the STM32 DMA controllers (STM32MP1 has two STM32 DMA controllers).
+
+ **STM32 DMA**
+
+ STM32 DMA is mainly used to implement central data buffer storage (usually in
+ the system SRAM) for different peripherals. It can access external RAM, but
+ it cannot generate the convenient burst transfers that ensure the best use of
+ the AXI bus.
+
+ **STM32 MDMA**
+
+ STM32 MDMA (Master DMA) is mainly used to manage direct data transfers between
+ RAM data buffers without CPU intervention. It can also be used in a
+ hierarchical structure that uses STM32 DMA as first level data buffer
+ interfaces for AHB peripherals, while the STM32 MDMA acts as a second level
+ DMA with better performance. As an AXI/AHB master, STM32 MDMA can take control
+ of the AXI/AHB bus.
+
+
+Principles
+----------
+
+ The STM32 DMA-MDMA chaining feature relies on the strengths of STM32 DMA and
+ STM32 MDMA controllers.
+
+ STM32 DMA has a circular Double Buffer Mode (DBM). At each end of transaction
+ (when DMA data counter - DMA_SxNDTR - reaches 0), the memory pointers
+ (configured with DMA_SxM0AR and DMA_SxM1AR) are swapped and the DMA data
+ counter is automatically reloaded. This allows the SW or the STM32 MDMA to
+ process one memory area while the second memory area is being filled/used by
+ the STM32 DMA transfer.
+
+ With the STM32 MDMA linked-list mode, a single request initiates the transfer
+ of the data array (a collection of nodes) until the linked-list pointer for
+ the channel is null. The transfer complete of the last node marks the end of
+ transfer, unless the first and last nodes are linked to each other, in which
+ case the linked-list loops to create a circular MDMA transfer.
+
+ STM32 MDMA has direct connections with STM32 DMA. This enables autonomous
+ communication and synchronization between peripherals, thus saving CPU
+ resources and bus congestion. The Transfer Complete signal of an STM32 DMA
+ channel can trigger an STM32 MDMA transfer. STM32 MDMA can clear the request
+ generated
+ by the STM32 DMA by writing to its Interrupt Clear register (whose address is
+ stored in MDMA_CxMAR, and bit mask in MDMA_CxMDR).
+
+ .. table:: STM32 MDMA interconnect table with STM32 DMA
+
+ +--------------+----------------+-----------+------------+
+ | STM32 DMAMUX | STM32 DMA | STM32 DMA | STM32 MDMA |
+ | channels | channels | Transfer | request |
+ | | | complete | |
+ | | | signal | |
+ +==============+================+===========+============+
+ | Channel *0* | DMA1 channel 0 | dma1_tcf0 | *0x00* |
+ +--------------+----------------+-----------+------------+
+ | Channel *1* | DMA1 channel 1 | dma1_tcf1 | *0x01* |
+ +--------------+----------------+-----------+------------+
+ | Channel *2* | DMA1 channel 2 | dma1_tcf2 | *0x02* |
+ +--------------+----------------+-----------+------------+
+ | Channel *3* | DMA1 channel 3 | dma1_tcf3 | *0x03* |
+ +--------------+----------------+-----------+------------+
+ | Channel *4* | DMA1 channel 4 | dma1_tcf4 | *0x04* |
+ +--------------+----------------+-----------+------------+
+ | Channel *5* | DMA1 channel 5 | dma1_tcf5 | *0x05* |
+ +--------------+----------------+-----------+------------+
+ | Channel *6* | DMA1 channel 6 | dma1_tcf6 | *0x06* |
+ +--------------+----------------+-----------+------------+
+ | Channel *7* | DMA1 channel 7 | dma1_tcf7 | *0x07* |
+ +--------------+----------------+-----------+------------+
+ | Channel *8* | DMA2 channel 0 | dma2_tcf0 | *0x08* |
+ +--------------+----------------+-----------+------------+
+ | Channel *9* | DMA2 channel 1 | dma2_tcf1 | *0x09* |
+ +--------------+----------------+-----------+------------+
+ | Channel *10* | DMA2 channel 2 | dma2_tcf2 | *0x0A* |
+ +--------------+----------------+-----------+------------+
+ | Channel *11* | DMA2 channel 3 | dma2_tcf3 | *0x0B* |
+ +--------------+----------------+-----------+------------+
+ | Channel *12* | DMA2 channel 4 | dma2_tcf4 | *0x0C* |
+ +--------------+----------------+-----------+------------+
+ | Channel *13* | DMA2 channel 5 | dma2_tcf5 | *0x0D* |
+ +--------------+----------------+-----------+------------+
+ | Channel *14* | DMA2 channel 6 | dma2_tcf6 | *0x0E* |
+ +--------------+----------------+-----------+------------+
+ | Channel *15* | DMA2 channel 7 | dma2_tcf7 | *0x0F* |
+ +--------------+----------------+-----------+------------+
+
+ The STM32 DMA-MDMA chaining feature then uses an SRAM buffer. STM32MP1 SoCs
+ embed three fast-access static internal RAMs of various sizes, used for data
+ storage. Because STM32 DMA is a legacy design inherited from
+ microcontrollers, its performance is poor with DDR but optimal with SRAM.
+ Hence the SRAM buffer used between STM32 DMA and STM32 MDMA. This buffer is
+ split into two equal periods: STM32 DMA uses one period while STM32 MDMA
+ simultaneously uses the other.
+ ::
+
+ dma[1:2]-tcf[0:7]
+ .----------------.
+ ____________ ' _________ V____________
+ | STM32 DMA | / __|>_ \ | STM32 MDMA |
+ |------------| | / \ | |------------|
+ | DMA_SxM0AR |<=>| | SRAM | |<=>| []-[]...[] |
+ | DMA_SxM1AR | | \_____/ | | |
+ |____________| \___<|____/ |____________|
+
+ STM32 DMA-MDMA chaining uses (struct dma_slave_config).peripheral_config to
+ exchange the parameters needed to configure MDMA. These parameters are
+ gathered into a u32 array with three values:
+
+ * the STM32 MDMA request (which is actually the DMAMUX channel ID),
+ * the address of the STM32 DMA register to clear the Transfer Complete
+ interrupt flag,
+ * the mask of the Transfer Complete interrupt flag of the STM32 DMA channel.
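+
+ Conceptually, the array filled in by the STM32 DMA driver and consumed by the
+ STM32 MDMA driver can be pictured as follows (the variable names here are
+ illustrative only)::
+
+    u32 peripheral_config[3];
+
+    peripheral_config[0] = dmamux_chan_id; /* STM32 MDMA request line */
+    peripheral_config[1] = dma_ifcr_addr;  /* DMA interrupt flag clear register */
+    peripheral_config[2] = dma_tcf_mask;   /* DMA Transfer Complete flag mask */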
+
+Device Tree updates for STM32 DMA-MDMA chaining support
+-------------------------------------------------------
+
+ **1. Allocate an SRAM buffer**
+
+ The SRAM device tree node is defined in the SoC device tree. You can refer to
+ it in your board device tree to define your SRAM pool.
+ ::
+
+ &sram {
+ my_foo_device_dma_pool: dma-sram@0 {
+ reg = <0x0 0x1000>;
+ };
+ };
+
+ Be careful of the start index, in case there are other SRAM consumers.
+ Define your pool size strategically: for optimal chaining, STM32 DMA and
+ STM32 MDMA should be able to work simultaneously, each on its own period of
+ the SRAM buffer.
+ If the SRAM period is greater than the expected DMA transfer, then STM32 DMA
+ and STM32 MDMA will work sequentially instead of simultaneously. This is not
+ a functional issue, but it is not optimal.
+
+ Don't forget to refer to your SRAM pool in your device node. You need to
+ define a new property.
+ ::
+
+ &my_foo_device {
+ ...
+ my_dma_pool = &my_foo_device_dma_pool;
+ };
+
+ Then get this SRAM pool in your foo driver and allocate your SRAM buffer.
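+
+ A minimal sketch of this step, assuming the my_dma_pool property and the
+ 0x1000-byte pool from the examples above (error handling reduced to the
+ essentials)::
+
+    #include <linux/genalloc.h>
+
+    struct gen_pool *sram_pool;
+    void *sram_buf;          /* CPU address of the SRAM buffer */
+    dma_addr_t sram_dma_buf; /* DMA address of the SRAM buffer */
+
+    /* Look up the pool referenced by the "my_dma_pool" property */
+    sram_pool = of_gen_pool_get(dev->of_node, "my_dma_pool", 0);
+    if (!sram_pool)
+            return -EPROBE_DEFER;
+
+    /* Allocate the whole pool; each period is half of this buffer */
+    sram_buf = gen_pool_dma_alloc(sram_pool, 0x1000, &sram_dma_buf);
+    if (!sram_buf)
+            return -ENOMEM;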
+
+ **2. Allocate an STM32 DMA channel and an STM32 MDMA channel**
+
+ You need to define an extra channel in your device tree node, in addition to
+ the one you should already have for "classic" DMA operation.
+
+ This new channel must be taken from the STM32 MDMA channels, so the phandle
+ of the DMA controller to use is the MDMA controller's.
+ ::
+
+ &my_foo_device {
+ [...]
+ my_dma_pool = &my_foo_device_dma_pool;
+ dmas = <&dmamux1 ...>, // STM32 DMA channel
+ <&mdma1 0 0x3 0x1200000a 0 0>; // + STM32 MDMA channel
+ };
+
+ Concerning STM32 MDMA bindings:
+
+ 1. The request line number: whatever value is given here, it will be
+ overwritten by the MDMA driver with the STM32 DMAMUX channel ID passed
+ through (struct dma_slave_config).peripheral_config
+
+ 2. The priority level: choose Very High (0x3) so that your channel takes
+ priority over the others during request arbitration
+
+ 3. A 32-bit mask specifying the DMA channel configuration: source and
+ destination address increment, block transfer with 128 bytes per single
+ transfer
+
+ 4. The 32-bit value specifying the register to be used to acknowledge the
+ request: it will be overwritten by the MDMA driver, with the DMA channel
+ interrupt flag clear register address passed through
+ (struct dma_slave_config).peripheral_config
+
+ 5. The 32-bit mask specifying the value to be written to acknowledge the
+ request: it will be overwritten by the MDMA driver, with the DMA channel
+ Transfer Complete flag mask passed through
+ (struct dma_slave_config).peripheral_config
+
+Driver updates for STM32 DMA-MDMA chaining support in foo driver
+----------------------------------------------------------------
+
+ **0. (optional) Refactor the original sg_table if using dmaengine_prep_slave_sg()**
+
+ When using dmaengine_prep_slave_sg(), the original sg_table can't be used as
+ is. Two new sg_tables must be created from the original one: one for the
+ STM32 DMA transfer (where the memory address now targets the SRAM buffer
+ instead of the DDR buffer) and one for the STM32 MDMA transfer (where the
+ memory address targets the DDR buffer).
+
+ The new sg_list items must fit within the SRAM period length. Here is an
+ example for DMA_DEV_TO_MEM:
+ ::
+
+ /*
+ * Assuming sgl and nents, respectively the initial scatterlist and its
+ * length.
+ * Assuming sram_dma_buf and sram_period, respectively the DMA address of
+ * the memory allocated from the pool for DMA usage, and the length of the
+ * period, which is half of the SRAM buffer size.
+ */
+ struct sg_table new_dma_sgt, new_mdma_sgt;
+ struct scatterlist *s, *_sgl;
+ dma_addr_t ddr_dma_buf;
+ u32 new_nents = 0, len;
+ int i, ret;
+
+ /* Count the number of entries needed */
+ for_each_sg(sgl, s, nents, i)
+ if (sg_dma_len(s) > sram_period)
+ new_nents += DIV_ROUND_UP(sg_dma_len(s), sram_period);
+ else
+ new_nents++;
+
+ /* Create sg table for STM32 DMA channel */
+ ret = sg_alloc_table(&new_dma_sgt, new_nents, GFP_ATOMIC);
+ if (ret)
+ dev_err(dev, "DMA sg table alloc failed\n");
+
+    _sgl = sgl;
+    len = sg_dma_len(sgl);
+    for_each_sg(new_dma_sgt.sgl, s, new_dma_sgt.nents, i) {
+            sg_dma_len(s) = min_t(u32, len, sram_period);
+            /* Even items target the first half of the SRAM buffer */
+            s->dma_address = sram_dma_buf;
+            /* Odd items target the second half of the SRAM buffer */
+            if (i & 1)
+                    s->dma_address += sram_period;
+            len -= sg_dma_len(s);
+            /* Move to the next original entry once it is fully split */
+            if (!len && sg_next(_sgl)) {
+                    _sgl = sg_next(_sgl);
+                    len = sg_dma_len(_sgl);
+            }
+    }
+
+ /* Create sg table for STM32 MDMA channel */
+ ret = sg_alloc_table(&new_mdma_sgt, new_nents, GFP_ATOMIC);
+ if (ret)
+ dev_err(dev, "MDMA sg_table alloc failed\n");
+
+ _sgl = sgl;
+ len = sg_dma_len(sgl);
+ ddr_dma_buf = sg_dma_address(sgl);
+    for_each_sg(new_mdma_sgt.sgl, s, new_mdma_sgt.nents, i) {
+ size_t bytes = min_t(size_t, len, sram_period);
+
+ sg_dma_len(s) = bytes;
+ sg_dma_address(s) = ddr_dma_buf;
+ len -= bytes;
+
+ if (!len && sg_next(_sgl)) {
+ _sgl = sg_next(_sgl);
+ len = sg_dma_len(_sgl);
+ ddr_dma_buf = sg_dma_address(_sgl);
+ } else {
+ ddr_dma_buf += bytes;
+ }
+ }
+
+ Don't forget to release these new sg_tables after getting the descriptors
+ with dmaengine_prep_slave_sg().
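+
+ A sketch of how this could look, anticipating steps 2 and 3 below for the
+ scatter-gather case (dma_desc and mdma_desc are assumed to be
+ struct dma_async_tx_descriptor pointers):
+ ::
+
+    dma_desc = dmaengine_prep_slave_sg(dma_chan, new_dma_sgt.sgl,
+                                       new_dma_sgt.nents, DMA_DEV_TO_MEM,
+                                       DMA_PREP_INTERRUPT);
+    mdma_desc = dmaengine_prep_slave_sg(mdma_chan, new_mdma_sgt.sgl,
+                                        new_mdma_sgt.nents, DMA_DEV_TO_MEM,
+                                        DMA_PREP_INTERRUPT);
+    sg_free_table(&new_dma_sgt);
+    sg_free_table(&new_mdma_sgt);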
+
+ **1. Set controller specific parameters**
+
+ First, use dmaengine_slave_config() with a struct dma_slave_config to
+ configure the STM32 DMA channel. You just have to take care of the DMA
+ addresses: the memory address (depending on the transfer direction) must
+ point to your SRAM buffer, and (struct dma_slave_config).peripheral_size
+ must be set to a nonzero value.
+
+ The STM32 DMA driver checks (struct dma_slave_config).peripheral_size to
+ determine whether chaining is being used. If it is, the STM32 DMA driver
+ fills (struct dma_slave_config).peripheral_config with an array of three
+ u32: the first containing the STM32 DMAMUX channel ID, the second the
+ channel interrupt flag clear register address, and the third the channel
+ Transfer Complete flag mask.
+
+ Then, use dmaengine_slave_config() with another struct dma_slave_config to
+ configure the STM32 MDMA channel. Take care of the DMA addresses: the device
+ address (depending on the transfer direction) must point to your SRAM
+ buffer, and the memory address must point to the buffer originally used for
+ "classic" DMA operation. Reuse the previous
+ (struct dma_slave_config).peripheral_size and .peripheral_config, which have
+ been updated by the STM32 DMA driver, to set the corresponding fields of the
+ struct dma_slave_config used to configure the STM32 MDMA channel.
+ ::
+
+ struct dma_slave_config dma_conf;
+ struct dma_slave_config mdma_conf;
+
+ memset(&dma_conf, 0, sizeof(dma_conf));
+ [...]
+    dma_conf.direction = DMA_DEV_TO_MEM;
+    dma_conf.dst_addr = sram_dma_buf;          // SRAM buffer
+    dma_conf.peripheral_size = 1;              // peripheral_size != 0 => chaining
+
+    dmaengine_slave_config(dma_chan, &dma_conf);
+
+    memset(&mdma_conf, 0, sizeof(mdma_conf));
+    mdma_conf.direction = DMA_DEV_TO_MEM;
+    mdma_conf.src_addr = sram_dma_buf;         // SRAM buffer
+    mdma_conf.dst_addr = rx_dma_buf;           // original memory buffer
+    mdma_conf.peripheral_size = dma_conf.peripheral_size;     // <- dma_conf
+    mdma_conf.peripheral_config = dma_conf.peripheral_config; // <- dma_conf
+
+    dmaengine_slave_config(mdma_chan, &mdma_conf);
+
+ **2. Get a descriptor for STM32 DMA channel transaction**
+
+ In the same way as you get your descriptor for your "classic" DMA operation,
+ you just have to replace the original sg_list (in case of
+ dmaengine_prep_slave_sg()) with the new sg_list using the SRAM buffer, or
+ replace the original buffer address, length and period (in case of
+ dmaengine_prep_dma_cyclic()) with the new SRAM buffer.
+
+ **3. Get a descriptor for STM32 MDMA channel transaction**
+
+ If you previously got the descriptor (for STM32 DMA) with
+
+ * dmaengine_prep_slave_sg(), then use dmaengine_prep_slave_sg() for
+ STM32 MDMA;
+ * dmaengine_prep_dma_cyclic(), then use dmaengine_prep_dma_cyclic() for
+ STM32 MDMA.
+
+ Use the new sg_list using the SRAM buffer (in case of
+ dmaengine_prep_slave_sg()) or, depending on the transfer direction, either
+ the original DDR buffer (in case of DMA_DEV_TO_MEM) or the SRAM buffer (in
+ case of DMA_MEM_TO_DEV), the source address having been previously set with
+ dmaengine_slave_config().
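+
+ A sketch of the cyclic DMA_DEV_TO_MEM case (the scatter-gather case is shown
+ in step 0 above; rx_dma_buf and rx_buf_len, the original DDR buffer and its
+ length, are assumed names):
+ ::
+
+    struct dma_async_tx_descriptor *dma_desc, *mdma_desc;
+
+    /* STM32 DMA ping-pongs between the two halves of the SRAM buffer */
+    dma_desc = dmaengine_prep_dma_cyclic(dma_chan, sram_dma_buf,
+                                         2 * sram_period, sram_period,
+                                         DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);
+
+    /* STM32 MDMA cycles over the original DDR buffer */
+    mdma_desc = dmaengine_prep_dma_cyclic(mdma_chan, rx_dma_buf,
+                                          rx_buf_len, sram_period,
+                                          DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);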
+
+ **4. Submit both transactions**
+
+ Before submitting your transactions, you may need to define on which
+ descriptor you want a callback to be called at the end of the transfer
+ (dmaengine_prep_slave_sg()) or at the end of each period
+ (dmaengine_prep_dma_cyclic()).
+ Depending on the direction, set the callback on the descriptor that finishes
+ the overall transfer:
+
+ * DMA_DEV_TO_MEM: set the callback on the "MDMA" descriptor
+ * DMA_MEM_TO_DEV: set the callback on the "DMA" descriptor
+
+ Then, submit the descriptors, in any order, with dmaengine_submit().
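+
+ For the DMA_DEV_TO_MEM example used so far (my_dma_callback and my_dev_data
+ are hypothetical names):
+ ::
+
+    dma_cookie_t cookie;
+
+    /* DMA_DEV_TO_MEM: the MDMA descriptor finishes the overall transfer */
+    mdma_desc->callback = my_dma_callback;
+    mdma_desc->callback_param = my_dev_data;
+
+    cookie = dmaengine_submit(dma_desc);
+    cookie = dmaengine_submit(mdma_desc);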
+
+ **5. Issue pending requests (and wait for callback notification)**
+
+ As the STM32 MDMA channel transfer is triggered by the STM32 DMA one, you
+ must issue the pending requests on the STM32 MDMA channel before the
+ STM32 DMA channel.
+
+ If set, your callback will be called to notify you of the end of the overall
+ transfer or of the period completion.
+
+ Don't forget to terminate both channels. The STM32 DMA channel is configured
+ in cyclic Double-Buffer mode, so it won't be disabled by hardware: you need
+ to terminate it. The STM32 MDMA channel will be stopped by hardware in case
+ of sg transfer, but not in case of cyclic transfer. You can terminate it
+ regardless of the kind of transfer.
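+
+ In code, for the running example (a sketch):
+ ::
+
+    /* MDMA channel first: its transfer is triggered by the STM32 DMA */
+    dma_async_issue_pending(mdma_chan);
+    dma_async_issue_pending(dma_chan);
+
+    /* ... transfer runs, callback fires ... */
+
+    /* On stop, both channels have to be terminated */
+    dmaengine_terminate_sync(dma_chan);
+    dmaengine_terminate_sync(mdma_chan);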
+
+ **STM32 DMA-MDMA chaining DMA_MEM_TO_DEV special case**
+
+ STM32 DMA-MDMA chaining in DMA_MEM_TO_DEV is a special case. Indeed, the
+ STM32 MDMA feeds the SRAM buffer with the DDR data, and the STM32 DMA reads
+ data from the SRAM buffer. So some data (the first period) has to be copied
+ into the SRAM buffer before the STM32 DMA starts to read.
+
+ A trick could be to pause the STM32 DMA channel (which raises a Transfer
+ Complete signal, triggering the STM32 MDMA channel), but the first data read
+ by the STM32 DMA could be "wrong". The proper way is to prepare the first
+ SRAM period with dmaengine_prep_dma_memcpy(). This first period should then
+ be "removed" from the sg or the cyclic transfer.
+
+ Due to this complexity, it is better to use STM32 DMA-MDMA chaining for
+ DMA_DEV_TO_MEM and to keep the "classic" DMA usage for DMA_MEM_TO_DEV,
+ unless you're not afraid.
+
+Resources
+---------
+
+ Application note, datasheet and reference manual are available on ST website
+ (STM32MP1_).
+
+ Dedicated focus on three application notes (AN5224_, AN4031_ & AN5001_)
+ dealing with STM32 DMAMUX, STM32 DMA and STM32 MDMA.
+
+.. _STM32MP1: https://www.st.com/en/microcontrollers-microprocessors/stm32mp1-series.html
+.. _AN5224: https://www.st.com/resource/en/application_note/an5224-stm32-dmamux-the-dma-request-router-stmicroelectronics.pdf
+.. _AN4031: https://www.st.com/resource/en/application_note/dm00046011-using-the-stm32f2-stm32f4-and-stm32f7-series-dma-controller-stmicroelectronics.pdf
+.. _AN5001: https://www.st.com/resource/en/application_note/an5001-stm32cube-expansion-package-for-stm32h7-series-mdma-stmicroelectronics.pdf
+
+:Authors:
+
+- Amelie Delaunay <amelie.delaunay@foss.st.com> \ No newline at end of file
diff --git a/Documentation/arm/stm32/stm32mp13-overview.rst b/Documentation/arm/stm32/stm32mp13-overview.rst
new file mode 100644
index 000000000000..3bb9492dad49
--- /dev/null
+++ b/Documentation/arm/stm32/stm32mp13-overview.rst
@@ -0,0 +1,37 @@
+===================
+STM32MP13 Overview
+===================
+
+Introduction
+------------
+
+The STM32MP131/STM32MP133/STM32MP135 are Cortex-A MPUs aimed at various applications.
+They feature:
+
+- One Cortex-A7 application core
+- Standard memories interface support
+- Standard connectivity, widely inherited from the STM32 MCU family
+- Comprehensive security support
+
+More details:
+
+- Cortex-A7 core running up to @900MHz
+- FMC controller to connect SDRAM, NOR and NAND memories
+- QSPI
+- SD/MMC/SDIO support
+- 2*Ethernet controller
+- CAN
+- ADC/DAC
+- USB EHCI/OHCI controllers
+- USB OTG
+- I2C, SPI, CAN busses support
+- Several general purpose timers
+- Serial Audio interface
+- LCD controller
+- DCMIPP
+- SPDIFRX
+- DFSDM
+
+:Authors:
+
+- Alexandre Torgue <alexandre.torgue@foss.st.com>
diff --git a/Documentation/arm/stm32/stm32mp151-overview.rst b/Documentation/arm/stm32/stm32mp151-overview.rst
new file mode 100644
index 000000000000..f42a2ac309c0
--- /dev/null
+++ b/Documentation/arm/stm32/stm32mp151-overview.rst
@@ -0,0 +1,36 @@
+===================
+STM32MP151 Overview
+===================
+
+Introduction
+------------
+
+The STM32MP151 is a Cortex-A MPU aimed at various applications.
+It features:
+
+- Single Cortex-A7 application core
+- Standard memories interface support
+- Standard connectivity, widely inherited from the STM32 MCU family
+- Comprehensive security support
+
+More details:
+
+- Cortex-A7 core running up to @800MHz
+- FMC controller to connect SDRAM, NOR and NAND memories
+- QSPI
+- SD/MMC/SDIO support
+- Ethernet controller
+- ADC/DAC
+- USB EHCI/OHCI controllers
+- USB OTG
+- I2C, SPI busses support
+- Several general purpose timers
+- Serial Audio interface
+- LCD-TFT controller
+- DCMIPP
+- SPDIFRX
+- DFSDM
+
+:Authors:
+
+- Roan van Dijk <roan@protonic.nl>
diff --git a/Documentation/arm/tcm.rst b/Documentation/arm/tcm.rst
index b256f9783883..1dc6c39220f9 100644
--- a/Documentation/arm/tcm.rst
+++ b/Documentation/arm/tcm.rst
@@ -34,7 +34,7 @@ CPU so it is usually wise not to overlap any physical RAM with
the TCM.
The TCM memory can then be remapped to another address again using
-the MMU, but notice that the TCM if often used in situations where
+the MMU, but notice that the TCM is often used in situations where
the MMU is turned off. To avoid confusion the current Linux
implementation will map the TCM 1 to 1 from physical to virtual
memory in the location specified by the kernel. Currently Linux
diff --git a/Documentation/arm/uefi.rst b/Documentation/arm/uefi.rst
index 9b0b5e458a1e..baebe688a006 100644
--- a/Documentation/arm/uefi.rst
+++ b/Documentation/arm/uefi.rst
@@ -65,10 +65,6 @@ linux,uefi-mmap-desc-size 32-bit Size in bytes of each entry in the UEFI
linux,uefi-mmap-desc-ver 32-bit Version of the mmap descriptor format.
-linux,initrd-start 64-bit Physical start address of an initrd
-
-linux,initrd-end 64-bit Physical end address of an initrd
-
kaslr-seed 64-bit Entropy used to randomize the kernel image
base address location.
========================== ====== ===========================================
diff --git a/Documentation/arm64/acpi_object_usage.rst b/Documentation/arm64/acpi_object_usage.rst
index 0609da73970b..484ef9676653 100644
--- a/Documentation/arm64/acpi_object_usage.rst
+++ b/Documentation/arm64/acpi_object_usage.rst
@@ -163,7 +163,7 @@ FPDT Section 5.2.23 (signature == "FPDT")
**Firmware Performance Data Table**
- Optional, not currently supported.
+ Optional, useful for boot performance profiling.
GTDT Section 5.2.24 (signature == "GTDT")
diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst
index 3f9d86557c5e..ffeccdd6bdac 100644
--- a/Documentation/arm64/booting.rst
+++ b/Documentation/arm64/booting.rst
@@ -10,9 +10,9 @@ This document is based on the ARM booting document by Russell King and
is relevant to all public releases of the AArch64 Linux kernel.
The AArch64 exception model is made up of a number of exception levels
-(EL0 - EL3), with EL0 and EL1 having a secure and a non-secure
-counterpart. EL2 is the hypervisor level and exists only in non-secure
-mode. EL3 is the highest priority level and exists only in secure mode.
+(EL0 - EL3), with EL0, EL1 and EL2 having a secure and a non-secure
+counterpart. EL2 is the hypervisor level, EL3 is the highest priority
+level and exists only in secure mode. Both are architecturally optional.
For the purposes of this document, we will use the term `boot loader`
simply to define all software that executes on the CPU(s) before control
@@ -121,8 +121,9 @@ Header notes:
to the base of DRAM, since memory below it is not
accessible via the linear mapping
1
- 2MB aligned base may be anywhere in physical
- memory
+ 2MB aligned base such that all image_size bytes
+ counted from the start of the image are within
+ the 48-bit addressable range of physical memory
Bits 4-63 Reserved.
============= ===============================================================
@@ -167,8 +168,8 @@ Before jumping into the kernel, the following conditions must be met:
All forms of interrupts must be masked in PSTATE.DAIF (Debug, SError,
IRQ and FIQ).
- The CPU must be in either EL2 (RECOMMENDED in order to have access to
- the virtualisation extensions) or non-secure EL1.
+ The CPU must be in non-secure state, either in EL2 (RECOMMENDED in order
+ to have access to the virtualisation extensions), or in EL1.
- Caches, MMUs
@@ -222,7 +223,7 @@ Before jumping into the kernel, the following conditions must be met:
For systems with a GICv3 interrupt controller to be used in v3 mode:
- If EL3 is present:
- - ICC_SRE_EL3.Enable (bit 3) must be initialiased to 0b1.
+ - ICC_SRE_EL3.Enable (bit 3) must be initialised to 0b1.
- ICC_SRE_EL3.SRE (bit 0) must be initialised to 0b1.
- ICC_CTLR_EL3.PMHE (bit 6) must be set to the same value across
all CPUs the kernel is executing on, and must stay constant
@@ -340,6 +341,44 @@ Before jumping into the kernel, the following conditions must be met:
- SMCR_EL2.LEN must be initialised to the same value for all CPUs the
kernel will execute on.
+ - HFGRTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b1.
+
+ - HFGWTR_EL2.nTPIDR2_EL0 (bit 55) must be initialised to 0b1.
+
+ - HFGRTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b1.
+
+ - HFGWTR_EL2.nSMPRI_EL1 (bit 54) must be initialised to 0b1.
+
+ For CPUs with the Scalable Matrix Extension FA64 feature (FEAT_SME_FA64):
+
+ - If EL3 is present:
+
+ - SMCR_EL3.FA64 (bit 31) must be initialised to 0b1.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - SMCR_EL2.FA64 (bit 31) must be initialised to 0b1.
+
+ For CPUs with the Memory Tagging Extension feature (FEAT_MTE2):
+
+ - If EL3 is present:
+
+ - SCR_EL3.ATA (bit 26) must be initialised to 0b1.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - HCR_EL2.ATA (bit 56) must be initialised to 0b1.
+
+ For CPUs with the Scalable Matrix Extension version 2 (FEAT_SME2):
+
+ - If EL3 is present:
+
+ - SMCR_EL3.EZT0 (bit 30) must be initialised to 0b1.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - SMCR_EL2.EZT0 (bit 30) must be initialised to 0b1.
+
The requirements described above for CPU mode, caches, MMUs, architected
timers, coherency and system registers apply to all CPUs. All CPUs must
enter the kernel in the same exception level. Where the values documented
diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
index 328e0c454fbd..c7adc7897df6 100644
--- a/Documentation/arm64/cpu-feature-registers.rst
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -92,7 +92,7 @@ operation if the source belongs to the supported system register space.
The infrastructure emulates only the following system register space::
- Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7
+ Op0=3, Op1=0, CRn=0, CRm=0,2,3,4,5,6,7
(See Table C5-6 'System instruction encodings for non-Debug System
register accesses' in ARMv8 ARM DDI 0487A.h, for the list of
@@ -235,7 +235,15 @@ infrastructure:
| DPB | [3-0] | y |
+------------------------------+---------+---------+
- 6) ID_AA64MMFR2_EL1 - Memory model feature register 2
+ 6) ID_AA64MMFR0_EL1 - Memory model feature register 0
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | ECV | [63-60] | y |
+ +------------------------------+---------+---------+
+
+ 7) ID_AA64MMFR2_EL1 - Memory model feature register 2
+------------------------------+---------+---------+
| Name | bits | visible |
@@ -243,7 +251,7 @@ infrastructure:
| AT | [35-32] | y |
+------------------------------+---------+---------+
- 7) ID_AA64ZFR0_EL1 - SVE feature ID register 0
+ 8) ID_AA64ZFR0_EL1 - SVE feature ID register 0
+------------------------------+---------+---------+
| Name | bits | visible |
@@ -267,6 +275,61 @@ infrastructure:
| SVEVer | [3-0] | y |
+------------------------------+---------+---------+
+ 9) ID_AA64MMFR1_EL1 - Memory model feature register 1
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | AFP | [47-44] | y |
+ +------------------------------+---------+---------+
+
+ 10) ID_AA64ISAR2_EL1 - Instruction set attribute register 2
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | RPRES | [7-4] | y |
+ +------------------------------+---------+---------+
+ | WFXT | [3-0] | y |
+ +------------------------------+---------+---------+
+
+ 11) MVFR0_EL1 - AArch32 Media and VFP Feature Register 0
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | FPDP | [11-8] | y |
+ +------------------------------+---------+---------+
+
+ 12) MVFR1_EL1 - AArch32 Media and VFP Feature Register 1
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | SIMDFMAC | [31-28] | y |
+ +------------------------------+---------+---------+
+ | SIMDSP | [19-16] | y |
+ +------------------------------+---------+---------+
+ | SIMDInt | [15-12] | y |
+ +------------------------------+---------+---------+
+ | SIMDLS | [11-8] | y |
+ +------------------------------+---------+---------+
+
+ 13) ID_ISAR5_EL1 - AArch32 Instruction Set Attribute Register 5
+
+ +------------------------------+---------+---------+
+ | Name | bits | visible |
+ +------------------------------+---------+---------+
+ | CRC32 | [19-16] | y |
+ +------------------------------+---------+---------+
+ | SHA2 | [15-12] | y |
+ +------------------------------+---------+---------+
+ | SHA1 | [11-8] | y |
+ +------------------------------+---------+---------+
+ | AES | [7-4] | y |
+ +------------------------------+---------+---------+
+
+
Appendix I: Example
-------------------
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index ec1a5a63c1d0..83e57e4d38e2 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -14,7 +14,7 @@ Some hardware or software features are only available on some CPU
implementations, and/or with certain kernel configurations, but have no
architected discovery mechanism available to userspace code at EL0. The
kernel exposes the presence of these features to userspace through a set
-of flags called hwcaps, exposed in the auxilliary vector.
+of flags called hwcaps, exposed in the auxiliary vector.
Userspace software can test for features by acquiring the AT_HWCAP or
AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
@@ -171,82 +171,137 @@ HWCAP_PACG
Documentation/arm64/pointer-authentication.rst.
HWCAP2_DCPODP
-
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
HWCAP2_SVE2
-
Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
HWCAP2_SVEAES
-
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001.
HWCAP2_SVEPMULL
-
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010.
HWCAP2_SVEBITPERM
-
Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001.
HWCAP2_SVESHA3
-
Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001.
HWCAP2_SVESM4
-
Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
HWCAP2_FLAGM2
-
Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010.
HWCAP2_FRINT
-
Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001.
HWCAP2_SVEI8MM
-
Functionality implied by ID_AA64ZFR0_EL1.I8MM == 0b0001.
HWCAP2_SVEF32MM
-
Functionality implied by ID_AA64ZFR0_EL1.F32MM == 0b0001.
HWCAP2_SVEF64MM
-
Functionality implied by ID_AA64ZFR0_EL1.F64MM == 0b0001.
HWCAP2_SVEBF16
-
Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0001.
HWCAP2_I8MM
-
Functionality implied by ID_AA64ISAR1_EL1.I8MM == 0b0001.
HWCAP2_BF16
-
Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0001.
HWCAP2_DGH
-
Functionality implied by ID_AA64ISAR1_EL1.DGH == 0b0001.
HWCAP2_RNG
-
Functionality implied by ID_AA64ISAR0_EL1.RNDR == 0b0001.
HWCAP2_BTI
-
Functionality implied by ID_AA64PFR0_EL1.BT == 0b0001.
HWCAP2_MTE
-
Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0010, as described
by Documentation/arm64/memory-tagging-extension.rst.
+HWCAP2_ECV
+ Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001.
+
+HWCAP2_AFP
+ Functionality implied by ID_AA64MMFR1_EL1.AFP == 0b0001.
+
+HWCAP2_RPRES
+ Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001.
+
+HWCAP2_MTE3
+ Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0011, as described
+ by Documentation/arm64/memory-tagging-extension.rst.
+
+HWCAP2_SME
+ Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
+ by Documentation/arm64/sme.rst.
+
+HWCAP2_SME_I16I64
+ Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
+
+HWCAP2_SME_F64F64
+ Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
+
+HWCAP2_SME_I8I32
+ Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
+
+HWCAP2_SME_F16F32
+ Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
+
+HWCAP2_SME_B16F32
+ Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
+
+HWCAP2_SME_F32F32
+ Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
+
+HWCAP2_SME_FA64
+ Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
+
+HWCAP2_WFXT
+ Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
+
+HWCAP2_EBF16
+ Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
+
+HWCAP2_SVE_EBF16
+ Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0010.
+
+HWCAP2_CSSC
+ Functionality implied by ID_AA64ISAR2_EL1.CSSC == 0b0001.
+
+HWCAP2_RPRFM
+ Functionality implied by ID_AA64ISAR2_EL1.RPRFM == 0b0001.
+
+HWCAP2_SVE2P1
+ Functionality implied by ID_AA64ZFR0_EL1.SVEver == 0b0010.
+
+HWCAP2_SME2
+ Functionality implied by ID_AA64SMFR0_EL1.SMEver == 0b0001.
+
+HWCAP2_SME2P1
+ Functionality implied by ID_AA64SMFR0_EL1.SMEver == 0b0010.
+
+HWCAP2_SMEI16I32
+ Functionality implied by ID_AA64SMFR0_EL1.I16I32 == 0b0101
+
+HWCAP2_SMEBI32I32
+ Functionality implied by ID_AA64SMFR0_EL1.BI32I32 == 0b1
+
+HWCAP2_SMEB16B16
+ Functionality implied by ID_AA64SMFR0_EL1.B16B16 == 0b1
+
+HWCAP2_SMEF16F16
+ Functionality implied by ID_AA64SMFR0_EL1.F16F16 == 0b1
+
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst
index 4f840bac083e..ae21f8118830 100644
--- a/Documentation/arm64/index.rst
+++ b/Documentation/arm64/index.rst
@@ -21,6 +21,7 @@ ARM64 Architecture
perf
pointer-authentication
silicon-errata
+ sme
sve
tagged-address-abi
tagged-pointers
diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst
index 7b99c8f428eb..dbae47bba25e 100644
--- a/Documentation/arm64/memory-tagging-extension.rst
+++ b/Documentation/arm64/memory-tagging-extension.rst
@@ -76,6 +76,9 @@ configurable behaviours:
with ``.si_code = SEGV_MTEAERR`` and ``.si_addr = 0`` (the faulting
address is unknown).
+- *Asymmetric* - Reads are handled as for synchronous mode while writes
+ are handled as for asynchronous mode.
+
The user can select the above modes, per thread, using the
``prctl(PR_SET_TAGGED_ADDR_CTRL, flags, 0, 0, 0)`` system call where ``flags``
contains any number of the following values in the ``PR_MTE_TCF_MASK``
@@ -91,8 +94,9 @@ mode is specified, the program will run in that mode. If multiple
modes are specified, the mode is selected as described in the "Per-CPU
preferred tag checking modes" section below.
-The current tag check fault mode can be read using the
-``prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0)`` system call.
+The current tag check fault configuration can be read using the
+``prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0)`` system call. If
+multiple modes were requested then all will be reported.
Tag checking can also be disabled for a user thread by setting the
``PSTATE.TCO`` bit with ``MSR TCO, #1``.
@@ -139,18 +143,25 @@ tag checking mode as the CPU's preferred tag checking mode.
The preferred tag checking mode for each CPU is controlled by
``/sys/devices/system/cpu/cpu<N>/mte_tcf_preferred``, to which a
-privileged user may write the value ``async`` or ``sync``. The default
-preferred mode for each CPU is ``async``.
+privileged user may write the value ``async``, ``sync`` or ``asymm``. The
+default preferred mode for each CPU is ``async``.
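+
+For example, to make asymmetric mode the preferred mode on CPU 0::
+
+  echo asymm > /sys/devices/system/cpu/cpu0/mte_tcf_preferred
+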
To allow a program to potentially run in the CPU's preferred tag
checking mode, the user program may set multiple tag check fault mode
bits in the ``flags`` argument to the ``prctl(PR_SET_TAGGED_ADDR_CTRL,
-flags, 0, 0, 0)`` system call. If the CPU's preferred tag checking
-mode is in the task's set of provided tag checking modes (this will
-always be the case at present because the kernel only supports two
-tag checking modes, but future kernels may support more modes), that
-mode will be selected. Otherwise, one of the modes in the task's mode
-set will be selected in a currently unspecified manner.
+flags, 0, 0, 0)`` system call. If both synchronous and asynchronous
+modes are requested then asymmetric mode may also be selected by the
+kernel. If the CPU's preferred tag checking mode is in the task's set
+of provided tag checking modes, that mode will be selected. Otherwise,
+one of the modes in the task's mode set will be selected by the
+kernel using the preference order:
+
+ 1. Asynchronous
+ 2. Asymmetric
+ 3. Synchronous
+
+Note that there is no way for userspace to request multiple modes and
+also disable asymmetric mode.
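+
+For example, a thread that can tolerate any of the modes can request both
+synchronous and asynchronous tag check faults and let the kernel choose,
+possibly asymmetric, following the preference order above (an illustrative
+sketch, error handling omitted)::
+
+  #include <sys/prctl.h>
+
+  prctl(PR_SET_TAGGED_ADDR_CTRL,
+        PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC | PR_MTE_TCF_ASYNC,
+        0, 0, 0);
+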
Initial process state
---------------------
@@ -213,6 +224,29 @@ address ABI control and MTE configuration of a process as per the
Documentation/arm64/tagged-address-abi.rst and above. The corresponding
``regset`` is 1 element of 8 bytes (``sizeof(long))``).
+Core dump support
+-----------------
+
+The allocation tags for user memory mapped with ``PROT_MTE`` are dumped
+in the core file as additional ``PT_AARCH64_MEMTAG_MTE`` segments. The
+program header for such segment is defined as:
+
+:``p_type``: ``PT_AARCH64_MEMTAG_MTE``
+:``p_flags``: 0
+:``p_offset``: segment file offset
+:``p_vaddr``: segment virtual address, same as the corresponding
+ ``PT_LOAD`` segment
+:``p_paddr``: 0
+:``p_filesz``: segment size in file, calculated as ``p_mem_sz / 32``
+ (two 4-bit tags cover 32 bytes of memory)
+:``p_memsz``: segment size in memory, same as the corresponding
+ ``PT_LOAD`` segment
+:``p_align``: 0
+
+The tags are stored in the core file at ``p_offset`` as two 4-bit tags
+in a byte. With the tag granule of 16 bytes, a 4K page requires 128
+bytes in the core file.
+
Example of correct usage
========================
diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst
index 901cd094f4ec..2a641ba7be3b 100644
--- a/Documentation/arm64/memory.rst
+++ b/Documentation/arm64/memory.rst
@@ -33,9 +33,8 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit)::
0000000000000000 0000ffffffffffff 256TB user
ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map
[ffff600000000000 ffff7fffffffffff] 32TB [kasan shadow region]
- ffff800000000000 ffff800007ffffff 128MB bpf jit region
- ffff800008000000 ffff80000fffffff 128MB modules
- ffff800010000000 fffffbffefffffff 124TB vmalloc
+ ffff800000000000 ffff800007ffffff 128MB modules
+ ffff800008000000 fffffbffefffffff 124TB vmalloc
fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down)
fffffbfffe000000 fffffbfffe7fffff 8MB [guard region]
fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space
@@ -51,9 +50,8 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support):
0000000000000000 000fffffffffffff 4PB user
fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map
[fffd800000000000 ffff7fffffffffff] 512TB [kasan shadow region]
- ffff800000000000 ffff800007ffffff 128MB bpf jit region
- ffff800008000000 ffff80000fffffff 128MB modules
- ffff800010000000 fffffbffefffffff 124TB vmalloc
+ ffff800000000000 ffff800007ffffff 128MB modules
+ ffff800008000000 fffffbffefffffff 124TB vmalloc
fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down)
fffffbfffe000000 fffffbfffe7fffff 8MB [guard region]
fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space
diff --git a/Documentation/arm64/perf.rst b/Documentation/arm64/perf.rst
index b567f177d385..1f87b57c2332 100644
--- a/Documentation/arm64/perf.rst
+++ b/Documentation/arm64/perf.rst
@@ -2,7 +2,10 @@
.. _perf_index:
-=====================
+====
+Perf
+====
+
Perf Event Attributes
=====================
@@ -88,3 +91,76 @@ exclude_host. However when using !exclude_hv there is a small blackout
window at the guest entry/exit where host events are not captured.
On VHE systems there are no blackout windows.
+
+Perf Userspace PMU Hardware Counter Access
+==========================================
+
+Overview
+--------
+The perf userspace tool relies on the PMU to monitor events. It offers an
+abstraction layer over the hardware counters since the underlying
+implementation is cpu-dependent.
+Arm64 allows userspace tools to have access to the registers storing the
+hardware counters' values directly.
+
+This targets specifically self-monitoring tasks in order to reduce the overhead
+by directly accessing the registers without having to go through the kernel.
+
+How-to
+------
+The focus is on the ARMv8 PMUv3, which makes sure that access to the PMU
+registers is enabled and that userspace has access to the relevant
+information in order to use them.
+
+In order to have access to the hardware counters, the global sysctl
+kernel/perf_user_access must first be enabled:
+
+.. code-block:: sh
+
+ echo 1 > /proc/sys/kernel/perf_user_access
+
+It is necessary to open the event using the perf tool interface with config1:1
+attr bit set: the sys_perf_event_open syscall returns a fd which can
+subsequently be used with the mmap syscall in order to retrieve a page of memory
+containing information about the event. The PMU driver uses this page to expose
+to the user the hardware counter's index and other necessary data. Using this
+index enables the user to access the PMU registers using the `mrs` instruction.
+Access to the PMU registers is only valid while the sequence lock is unchanged.
+In particular, the PMSELR_EL0 register is zeroed each time the sequence lock is
+changed.
+
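+As an illustrative sketch (not the reference implementation; see the libperf
+functions referenced below for that), a self-monitoring read based on the
+mmapped event page could look as follows, assuming ``pc`` points to the
+event's ``struct perf_event_mmap_page`` and a programmable event counter is
+used (the cycle counter would instead be read through PMCCNTR_EL0):
+
+.. code-block:: c
+
+  #include <linux/perf_event.h>
+
+  static __u64 read_pmu_counter(__u32 idx)
+  {
+          __u64 val;
+
+          /* Select the hardware counter, then read its value */
+          asm volatile("msr pmselr_el0, %0" : : "r" ((__u64)(idx - 1)));
+          asm volatile("isb; mrs %0, pmxevcntr_el0" : "=r" (val));
+
+          return val;
+  }
+
+  static __u64 read_event_count(struct perf_event_mmap_page *pc)
+  {
+          __u32 seq, idx, width;
+          __u64 count, pmc;
+
+          /* Retry while the sequence lock changes under our feet */
+          do {
+                  seq = pc->lock;
+                  asm volatile("" : : : "memory");
+                  idx = pc->index;
+                  count = pc->offset;
+                  if (pc->cap_user_rdpmc && idx) {
+                          width = pc->pmc_width;
+                          pmc = read_pmu_counter(idx);
+                          /* Mask off the unimplemented upper counter bits */
+                          pmc <<= 64 - width;
+                          pmc >>= 64 - width;
+                          count += pmc;
+                  }
+                  asm volatile("" : : : "memory");
+          } while (pc->lock != seq);
+
+          return count;
+  }
+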
+The userspace access is supported in libperf using the perf_evsel__mmap()
+and perf_evsel__read() functions. See `tools/lib/perf/tests/test-evsel.c`_ for
+an example.
+
+About heterogeneous systems
+---------------------------
+On heterogeneous systems such as big.LITTLE, userspace PMU counter access can
+only be enabled when the tasks are pinned to a homogeneous subset of cores and
+the corresponding PMU instance is opened by specifying the 'type' attribute.
+The use of generic event types is not supported in this case.
+
+Have a look at `tools/perf/arch/arm64/tests/user-events.c`_ for an example. It
+can be run using the perf tool to check that the access to the registers works
+correctly from userspace:
+
+.. code-block:: sh
+
+ perf test -v user
+
+About chained events and counter sizes
+--------------------------------------
+The user can request either a 32-bit (config1:0 == 0) or 64-bit (config1:0 == 1)
+counter along with userspace access. The sys_perf_event_open syscall will fail
+if a 64-bit counter is requested and the hardware doesn't support 64-bit
+counters. Chained events are not supported in conjunction with userspace counter
+access. If a 32-bit counter is requested on hardware with 64-bit counters, then
+userspace must treat the upper 32-bits read from the counter as UNKNOWN. The
+'pmc_width' field in the user page will indicate the valid width of the counter
+and should be used to mask the upper bits as needed.
+
+.. Links
+.. _tools/perf/arch/arm64/tests/user-events.c:
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/arm64/tests/user-events.c
+.. _tools/lib/perf/tests/test-evsel.c:
+ https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/perf/tests/test-evsel.c
diff --git a/Documentation/arm64/pointer-authentication.rst b/Documentation/arm64/pointer-authentication.rst
index f127666ea3a8..e5dad2e40aa8 100644
--- a/Documentation/arm64/pointer-authentication.rst
+++ b/Documentation/arm64/pointer-authentication.rst
@@ -53,11 +53,10 @@ The number of bits that the PAC occupies in a pointer is 55 minus the
virtual address size configured by the kernel. For example, with a
virtual address size of 48, the PAC is 7 bits wide.
-Recent versions of GCC can compile code with APIAKey-based return
-address protection when passed the -msign-return-address option. This
-uses instructions in the HINT space (unless -march=armv8.3-a or higher
-is also passed), and such code can run on systems without the pointer
-authentication extension.
+When ARM64_PTR_AUTH_KERNEL is selected, the kernel will be compiled
+with HINT space pointer authentication instructions protecting
+function returns. Kernels built with this option will work on hardware
+with or without pointer authentication support.
In addition to exec(), keys can also be reinitialized to random values
using the PR_PAC_RESET_KEYS prctl. A bitmask of PR_PAC_APIAKEY,
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index d410a47ffa57..9e311bc43e05 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -52,6 +52,14 @@ stable kernels.
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #1902691 | ARM64_ERRATUM_1902691 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A53 | #826319 | ARM64_ERRATUM_826319 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A53 | #827319 | ARM64_ERRATUM_827319 |
@@ -68,6 +76,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A55 | #2441007 | ARM64_ERRATUM_2441007 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A57 | #852523 | N/A |
@@ -76,10 +86,14 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A57 | #1319537 | ARM64_ERRATUM_1319367 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A57 | #1742098 | ARM64_ERRATUM_1742098 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A72 | #853709 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A72 | #1319367 | ARM64_ERRATUM_1319367 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A72 | #1655431 | ARM64_ERRATUM_1742098 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 |
@@ -92,12 +106,38 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A77 | #1508412 | ARM64_ERRATUM_1508412 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2051678 | ARM64_ERRATUM_2051678 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A710 | #2224489 | ARM64_ERRATUM_2224489 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A715 | #2645198 | ARM64_ERRATUM_2645198 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-X2 | #2119858 | ARM64_ERRATUM_2119858 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-X2 | #2224489 | ARM64_ERRATUM_2224489 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1349291 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1542419 | ARM64_ERRATUM_1542419 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Neoverse-N2 | #2139208 | ARM64_ERRATUM_2139208 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Neoverse-N2 | #2067961 | ARM64_ERRATUM_2067961 |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM | Neoverse-N2 | #2253138 | ARM64_ERRATUM_2253138 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | MMU-500 | #841119,826419 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
@@ -110,7 +150,7 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| Cavium | ThunderX ITS | #23144 | CAVIUM_ERRATUM_23144 |
+----------------+-----------------+-----------------+-----------------------------+
-| Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 |
+| Cavium | ThunderX GICv3 | #23154,38545 | CAVIUM_ERRATUM_23154 |
+----------------+-----------------+-----------------+-----------------------------+
| Cavium | ThunderX GICv3 | #38539 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
@@ -132,6 +172,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | Carmel Core | N/A | NVIDIA_CARMEL_CNP_ERRATUM |
+----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
+----------------+-----------------+-----------------+-----------------------------+
@@ -163,6 +205,12 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| Qualcomm Tech. | Kryo4xx Silver | N/A | ARM64_ERRATUM_1024718 |
+----------------+-----------------+-----------------+-----------------------------+
+| Qualcomm Tech. | Kryo4xx Gold | N/A | ARM64_ERRATUM_1286807 |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Rockchip | RK3588 | #3588001 | ROCKCHIP_ERRATUM_3588001 |
++----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 |
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/arm64/sme.rst b/Documentation/arm64/sme.rst
new file mode 100644
index 000000000000..1c43ea12eb4f
--- /dev/null
+++ b/Documentation/arm64/sme.rst
@@ -0,0 +1,468 @@
+===================================================
+Scalable Matrix Extension support for AArch64 Linux
+===================================================
+
+This document outlines briefly the interface provided to userspace by Linux in
+order to support use of the ARM Scalable Matrix Extension (SME).
+
+This is an outline of the most important features and issues only and not
+intended to be exhaustive. It should be read in conjunction with the SVE
+documentation in sve.rst which provides details on the Streaming SVE mode
+included in SME.
+
+This document does not aim to describe the SME architecture or programmer's
+model. To aid understanding, a minimal description of relevant programmer's
+model features for SME is included in Appendix A.
+
+
+1. General
+-----------
+
+* PSTATE.SM, PSTATE.ZA, the streaming mode vector length, the ZA and (when
+ present) ZTn register state and TPIDR2_EL0 are tracked per thread.
+
+* The presence of SME is reported to userspace via HWCAP2_SME in the aux vector
+ AT_HWCAP2 entry. Presence of this flag implies the presence of the SME
+ instructions and registers, and the Linux-specific system interfaces
+ described in this document. SME is reported in /proc/cpuinfo as "sme".
+
+* The presence of SME2 is reported to userspace via HWCAP2_SME2 in the
+ aux vector AT_HWCAP2 entry. Presence of this flag implies the presence of
+ the SME2 instructions and ZT0, and the Linux-specific system interfaces
+ described in this document. SME2 is reported in /proc/cpuinfo as "sme2".
+
+* Support for the execution of SME instructions in userspace can also be
+ detected by reading the CPU ID register ID_AA64PFR1_EL1 using an MRS
+ instruction, and checking that the value of the SME field is nonzero. [3]
+
+ It does not guarantee the presence of the system interfaces described in the
+ following sections: software that needs to verify that those interfaces are
+ present must check for HWCAP2_SME instead.
+
+* There are a number of optional SME features; the presence of these is
+  reported through AT_HWCAP2 via the following flags:
+
+ HWCAP2_SME_I16I64
+ HWCAP2_SME_F64F64
+ HWCAP2_SME_I8I32
+ HWCAP2_SME_F16F32
+ HWCAP2_SME_B16F32
+ HWCAP2_SME_F32F32
+ HWCAP2_SME_FA64
+ HWCAP2_SME2
+
+ This list may be extended over time as the SME architecture evolves.
+
+ These extensions are also reported via the CPU ID register ID_AA64SMFR0_EL1,
+ which userspace can read using an MRS instruction. See elf_hwcaps.txt and
+ cpu-feature-registers.txt for details.
+
+* Debuggers should restrict themselves to interacting with the target via the
+ NT_ARM_SVE, NT_ARM_SSVE, NT_ARM_ZA and NT_ARM_ZT regsets. The recommended
+ way of detecting support for these regsets is to connect to a target process
+ first and then attempt a
+
+ ptrace(PTRACE_GETREGSET, pid, NT_ARM_<regset>, &iov).
+
+* Whenever ZA register values are exchanged in memory between userspace and
+ the kernel, the register value is encoded in memory as a series of horizontal
+ vectors from 0 to VL/8-1 stored in the same endianness invariant format as is
+ used for SVE vectors.
+
+* On thread creation TPIDR2_EL0 is preserved unless CLONE_SETTLS is specified,
+ in which case it is set to 0.
+
+2. Vector lengths
+------------------
+
+SME defines a second vector length, similar to the SVE vector length, which
+controls the size of the streaming mode SVE vectors and the ZA matrix array.
+The ZA matrix is square with each side having as many bytes as a streaming
+mode SVE vector.
+
+
+3. Sharing of streaming and non-streaming mode SVE state
+---------------------------------------------------------
+
+It is implementation defined which, if any, parts of the SVE state are shared
+between streaming and non-streaming modes. When switching between modes via
+software interfaces such as ptrace, if no register content is provided as
+part of switching, no state will be assumed to be shared and everything will
+be zeroed.
+
+
+4. System call behaviour
+-------------------------
+
+* On syscall PSTATE.ZA is preserved; if PSTATE.ZA==1 then the contents of the
+ ZA matrix and ZTn (if present) are preserved.
+
+* On syscall PSTATE.SM will be cleared and the SVE registers will be handled
+ as per the standard SVE ABI.
+
+* None of the SVE registers, ZA or ZTn are used to pass arguments to
+ or receive results from any syscall.
+
+* On process creation (eg, clone()) the newly created process will have
+ PSTATE.SM cleared.
+
+* All other SME state of a thread, including the currently configured vector
+ length, the state of the PR_SME_VL_INHERIT flag, and the deferred vector
+ length (if any), is preserved across all syscalls, subject to the specific
+ exceptions for execve() described in section 7.
+
+
+5. Signal handling
+-------------------
+
+* Signal handlers are invoked with streaming mode and ZA disabled.
+
+* A new signal frame record TPIDR2_MAGIC is added formatted as a struct
+ tpidr2_context to allow access to TPIDR2_EL0 from signal handlers.
+
+* A new signal frame record za_context encodes the ZA register contents on
+ signal delivery. [1]
+
+* The signal frame record for ZA always contains basic metadata, in particular
+ the thread's vector length (in za_context.vl).
+
+* The ZA matrix may or may not be included in the record, depending on
+ the value of PSTATE.ZA. The registers are present if and only if:
+ za_context.head.size >= ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl))
+ in which case PSTATE.ZA == 1.
+
+* If matrix data is present, the remainder of the record has a vl-dependent
+ size and layout. Macros ZA_SIG_* are defined [1] to facilitate access to
+ them.
+
+* The matrix is stored as a series of horizontal vectors in the same format as
+ is used for SVE vectors.
+
+* If the ZA context is too big to fit in sigcontext.__reserved[], then extra
+ space is allocated on the stack, an extra_context record is written in
+ __reserved[] referencing this space. za_context is then written in the
+ extra space. Refer to [1] for further details about this mechanism.
+
+* If ZTn is supported and PSTATE.ZA==1 then a signal frame record for ZTn will
+ be generated.
+
+* The signal record for ZTn has magic ZT_MAGIC (0x5a544e01) and consists of a
+ standard signal frame header followed by a struct zt_context specifying
+ the number of ZTn registers supported by the system, then zt_context.nregs
+ blocks of 64 bytes of data per register.
+
+
+6. Signal return
+-----------------
+
+When returning from a signal handler:
+
+* If there is no za_context record in the signal frame, or if the record is
+ present but contains no register data as described in the previous section,
+ then ZA is disabled.
+
+* If za_context is present in the signal frame and contains matrix data then
+ PSTATE.ZA is set to 1 and ZA is populated with the specified data.
+
+* The vector length cannot be changed via signal return. If za_context.vl in
+ the signal frame does not match the current vector length, the signal return
+ attempt is treated as illegal, resulting in a forced SIGSEGV.
+
+* If ZTn is not supported or PSTATE.ZA==0 then it is illegal to have a
+ signal frame record for ZTn, resulting in a forced SIGSEGV.
+
+
+7. prctl extensions
+--------------------
+
+Some new prctl() calls are added to allow programs to manage the SME vector
+length:
+
+prctl(PR_SME_SET_VL, unsigned long arg)
+
+ Sets the vector length of the calling thread and related flags, where
+ arg == vl | flags. Other threads of the calling process are unaffected.
+
+ vl is the desired vector length, where sve_vl_valid(vl) must be true.
+
+ flags:
+
+ PR_SME_VL_INHERIT
+
+ Inherit the current vector length across execve(). Otherwise, the
+ vector length is reset to the system default at execve(). (See
+ Section 10.)
+
+ PR_SME_SET_VL_ONEXEC
+
+ Defer the requested vector length change until the next execve()
+ performed by this thread.
+
+ The effect is equivalent to implicit execution of the following
+ call immediately after the next execve() (if any) by the thread:
+
+ prctl(PR_SME_SET_VL, arg & ~PR_SME_SET_VL_ONEXEC)
+
+ This allows launching of a new program with a different vector
+ length, while avoiding runtime side effects in the caller.
+
+ Without PR_SME_SET_VL_ONEXEC, the requested change takes effect
+ immediately.
+
+
+ Return value: a nonnegative value on success, or a negative value on error:
+ EINVAL: SME not supported, invalid vector length requested, or
+ invalid flags.
+
+
+ On success:
+
+ * Either the calling thread's vector length or the deferred vector length
+ to be applied at the next execve() by the thread (dependent on whether
+ PR_SME_SET_VL_ONEXEC is present in arg), is set to the largest value
+ supported by the system that is less than or equal to vl. If vl ==
+ SVE_VL_MAX, the value set will be the largest value supported by the
+ system.
+
+ * Any previously outstanding deferred vector length change in the calling
+ thread is cancelled.
+
+ * The returned value describes the resulting configuration, encoded as for
+ PR_SME_GET_VL. The vector length reported in this value is the new
+ current vector length for this thread if PR_SME_SET_VL_ONEXEC was not
+ present in arg; otherwise, the reported vector length is the deferred
+ vector length that will be applied at the next execve() by the calling
+ thread.
+
+ * Changing the vector length causes all of ZA, ZTn, P0..P15, FFR and all
+ bits of Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become
+ unspecified, including both streaming and non-streaming SVE state.
+ Calling PR_SME_SET_VL with vl equal to the thread's current vector
+ length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
+ does not constitute a change to the vector length for this purpose.
+
+ * Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared.
+ Calling PR_SME_SET_VL with vl equal to the thread's current vector
+ length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
+ does not constitute a change to the vector length for this purpose.
+
+
+prctl(PR_SME_GET_VL)
+
+ Gets the vector length of the calling thread.
+
+ The following flag may be OR-ed into the result:
+
+ PR_SME_VL_INHERIT
+
+ Vector length will be inherited across execve().
+
+ There is no way to determine whether there is an outstanding deferred
+ vector length change (which would only normally be the case between a
+ fork() or vfork() and the corresponding execve() in typical use).
+
+ To extract the vector length from the result, bitwise and it with
+ PR_SME_VL_LEN_MASK.
+
+ Return value: a nonnegative value on success, or a negative value on error:
+ EINVAL: SME not supported.
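+
+ For example, to request a 64 byte (512 bit) streaming vector length and read
+ back the configuration actually set (an illustrative sketch; error handling
+ omitted)::
+
+    #include <sys/prctl.h>
+
+    int ret = prctl(PR_SME_SET_VL, 64);
+    unsigned long svl;
+
+    if (ret >= 0)
+            svl = ret & PR_SME_VL_LEN_MASK;  /* vector length now in effect */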
+
+
+8. ptrace extensions
+---------------------
+
+* A new regset NT_ARM_SSVE is defined for access to streaming mode SVE
+ state via PTRACE_GETREGSET and PTRACE_SETREGSET, this is documented in
+ sve.rst.
+
+* A new regset NT_ARM_ZA is defined for ZA state for access to ZA state via
+ PTRACE_GETREGSET and PTRACE_SETREGSET.
+
+ Refer to [2] for definitions.
+
+The regset data starts with struct user_za_header, containing:
+
+ size
+
+ Size of the complete regset, in bytes.
+ This depends on vl and possibly on other things in the future.
+
+ If a call to PTRACE_GETREGSET requests less data than the value of
+ size, the caller can allocate a larger buffer and retry in order to
+ read the complete regset.
+
+ max_size
+
+ Maximum size in bytes that the regset can grow to for the target
+ thread. The regset won't grow bigger than this even if the target
+ thread changes its vector length etc.
+
+ vl
+
+ Target thread's current streaming vector length, in bytes.
+
+ max_vl
+
+ Maximum possible streaming vector length for the target thread.
+
+ flags
+
+ Zero or more of the following flags, which have the same
+ meaning and behaviour as the corresponding PR_SET_VL_* flags:
+
+ SME_PT_VL_INHERIT
+
+ SME_PT_VL_ONEXEC (SETREGSET only).
+
+* The effects of changing the vector length and/or flags are equivalent to
+ those documented for PR_SME_SET_VL.
+
+ The caller must make a further GETREGSET call if it needs to know what VL is
+ actually set by SETREGSET, unless it is known in advance that the requested
+ VL is supported.
+
+* The size and layout of the payload depends on the header fields. The
+ SME_PT_ZA_*() macros are provided to facilitate access to the data.
+
+* In either case, for SETREGSET it is permissible to omit the payload, in which
+ case the vector length and flags are changed and PSTATE.ZA is set to 0
+ (along with any consequences of those changes). If a payload is provided
+ then PSTATE.ZA will be set to 1.
+
+* For SETREGSET, if the requested VL is not supported, the effect will be the
+ same as if the payload were omitted, except that an EIO error is reported.
+ No attempt is made to translate the payload data to the correct layout
+ for the vector length actually set. It is up to the caller to translate the
+ payload layout for the actual VL and retry.
+
+* The effect of writing a partial, incomplete payload is unspecified.
+
+* A new regset NT_ARM_ZT is defined for access to ZTn state via
+ PTRACE_GETREGSET and PTRACE_SETREGSET.
+
+* The NT_ARM_ZT regset consists of a single 512 bit register.
+
+* When PSTATE.ZA==0 reads of NT_ARM_ZT will report all bits of ZTn as 0.
+
+* Writes to NT_ARM_ZT will set PSTATE.ZA to 1.
+
+
+9. ELF coredump extensions
+---------------------------
+
+* NT_ARM_SSVE notes will be added to each coredump for
+ each thread of the dumped process. The contents will be equivalent to the
+ data that would have been read if a PTRACE_GETREGSET of the corresponding
+ type were executed for each thread when the coredump was generated.
+
+* A NT_ARM_ZA note will be added to each coredump for each thread of the
+ dumped process. The contents will be equivalent to the data that would have
+ been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
+ when the coredump was generated.
+
+* A NT_ARM_ZT note will be added to each coredump for each thread of the
+ dumped process. The contents will be equivalent to the data that would have
+ been read if a PTRACE_GETREGSET of NT_ARM_ZT were executed for each thread
+ when the coredump was generated.
+
+* The NT_ARM_TLS note will be extended to two registers; the second register
+  will contain TPIDR2_EL0 on systems that support SME and will be read as
+  zero with writes ignored otherwise.
+
+10. System runtime configuration
+---------------------------------
+
+* To mitigate the ABI impact of expansion of the signal frame, a policy
+ mechanism is provided for administrators, distro maintainers and developers
+ to set the default vector length for userspace processes:
+
+/proc/sys/abi/sme_default_vector_length
+
+ Writing the text representation of an integer to this file sets the system
+ default vector length to the specified value, unless the value is greater
+ than the maximum vector length supported by the system in which case the
+ default vector length is set to that maximum.
+
+ The result can be determined by reopening the file and reading its
+ contents.
+
+ At boot, the default vector length is initially set to 32 or the maximum
+ supported vector length, whichever is smaller and supported. This
+ determines the initial vector length of the init process (PID 1).
+
+ Reading this file returns the current system default vector length.
+
+* At every execve() call, the vector length of the new process is set to
+ the system default vector length, unless
+
+ * PR_SME_VL_INHERIT (or equivalently SME_PT_VL_INHERIT) is set for the
+ calling thread, or
+
+ * a deferred vector length change is pending, established via the
+ PR_SME_SET_VL_ONEXEC flag (or SME_PT_VL_ONEXEC).
+
+* Modifying the system default vector length does not affect the vector length
+ of any existing process or thread that does not make an execve() call.
+
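+The sketch below illustrates the administration interface described in this
+section: it requests a default streaming vector length and reads the file
+back to learn the value the kernel actually chose (informative only)::
+
+  #include <stdio.h>
+
+  /* Returns the default VL actually chosen by the kernel, or 0 on error. */
+  unsigned int set_default_sme_vl(unsigned int vl)
+  {
+          const char *path = "/proc/sys/abi/sme_default_vector_length";
+          unsigned int chosen = 0;
+          FILE *f = fopen(path, "w");
+
+          if (!f)
+                  return 0;
+          /* An over-large request is clamped to the maximum supported VL. */
+          fprintf(f, "%u\n", vl);
+          fclose(f);
+
+          f = fopen(path, "r");
+          if (!f)
+                  return 0;
+          if (fscanf(f, "%u", &chosen) != 1)
+                  chosen = 0;
+          fclose(f);
+          return chosen;
+  }
+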
+
+Appendix A. SME programmer's model (informative)
+=================================================
+
+This section provides a minimal description of the additions made by SME to the
+ARMv8-A programmer's model that are relevant to this document.
+
+Note: This section is for information only and not intended to be complete or
+to replace any architectural specification.
+
+A.1. Registers
+---------------
+
+In A64 state, SME adds the following:
+
+* A new mode, streaming mode, in which a subset of the normal FPSIMD and SVE
+ features is available. When supported, EL0 software may enter and leave
+ streaming mode at any time.
+
+ For best system performance, software is strongly encouraged to enable
+ streaming mode only while it is actively being used.
+
+* A new vector length controlling the size of ZA and the Z registers when in
+ streaming mode, separately from the vector length used for SVE when not in
+ streaming mode. There is no requirement that either the currently selected
+ vector length or the set of vector lengths supported for the two modes in
+ a given system have any relationship. The streaming mode vector length
+ is referred to as SVL.
+
+* A new ZA matrix register. This is a square matrix of SVLxSVL bits. Most
+ operations on ZA require that streaming mode be enabled but ZA can be
+ enabled without streaming mode in order to load, save and retain data.
+
+ For best system performance, software is strongly encouraged to enable
+ ZA only while it is actively being used.
+
+* A new ZT0 register is introduced when SME2 is present. This is a 512 bit
+ register which is accessible when PSTATE.ZA is set, as ZA itself is.
+
+* Two new 1 bit fields in PSTATE which may be controlled via the SMSTART and
+ SMSTOP instructions or by access to the SVCR system register (an
+ illustrative read sequence follows this list):
+
+ * PSTATE.ZA: if this is 1 then the ZA matrix is accessible and has valid
+ data, while if it is 0 then ZA cannot be accessed. When PSTATE.ZA is
+ changed from 0 to 1 all bits in ZA are cleared.
+
+ * PSTATE.SM: if this is 1 then the PE is in streaming mode. When the value
+ of PSTATE.SM is changed, it is implementation defined whether the subset
+ of the floating point register bits valid in both modes is retained.
+ Any other bits will be cleared.
+
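+As an informative illustration only, the two PSTATE fields can be observed
+from EL0 by reading SVCR. The sketch below uses the generic system register
+name so that no SME support is needed in the compiler, and must only be run
+after confirming that SME is present (e.g. via HWCAP2_SME)::
+
+  #include <stdbool.h>
+
+  /* SVCR is op0=3, op1=3, CRn=4, CRm=2, op2=2; bit 0 is SM, bit 1 is ZA. */
+  static inline unsigned long read_svcr(void)
+  {
+          unsigned long svcr;
+
+          asm volatile("mrs %0, S3_3_C4_C2_2" : "=r"(svcr));
+          return svcr;
+  }
+
+  static inline bool in_streaming_mode(void)
+  {
+          return read_svcr() & 1;  /* PSTATE.SM */
+  }
+
+  static inline bool za_enabled(void)
+  {
+          return read_svcr() & 2;  /* PSTATE.ZA */
+  }
+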
+
+References
+==========
+
+[1] arch/arm64/include/uapi/asm/sigcontext.h
+ AArch64 Linux signal ABI definitions
+
+[2] arch/arm64/include/uapi/asm/ptrace.h
+ AArch64 Linux ptrace ABI definitions
+
+[3] Documentation/arm64/cpu-feature-registers.rst
diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
index 03137154299e..1b90a30382ac 100644
--- a/Documentation/arm64/sve.rst
+++ b/Documentation/arm64/sve.rst
@@ -7,7 +7,9 @@ Author: Dave Martin <Dave.Martin@arm.com>
Date: 4 August 2017
This document outlines briefly the interface provided to userspace by Linux in
-order to support use of the ARM Scalable Vector Extension (SVE).
+order to support use of the ARM Scalable Vector Extension (SVE), including
+interactions with Streaming SVE mode added by the Scalable Matrix Extension
+(SME).
This is an outline of the most important features and issues only and not
intended to be exhaustive.
@@ -23,6 +25,10 @@ model features for SVE is included in Appendix A.
* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are
tracked per-thread.
+* In streaming mode FFR is not accessible unless HWCAP2_SME_FA64 is present
+ in the system; when it is not supported and these interfaces are used to
+ access streaming mode, FFR is read and written as zero.
+
* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector
AT_HWCAP entry. Presence of this flag implies the presence of the SVE
instructions and registers, and the Linux-specific system interfaces
@@ -46,6 +52,7 @@ model features for SVE is included in Appendix A.
HWCAP2_SVEBITPERM
HWCAP2_SVESHA3
HWCAP2_SVESM4
+ HWCAP2_SVE2P1
This list may be extended over time as the SVE architecture evolves.
@@ -53,10 +60,19 @@ model features for SVE is included in Appendix A.
which userspace can read using an MRS instruction. See elf_hwcaps.txt and
cpu-feature-registers.txt for details.
+* On hardware that supports the SME extensions, HWCAP2_SME will also be
+ reported in the AT_HWCAP2 aux vector entry. Among other things SME adds
+ streaming mode which provides a subset of the SVE feature set using a
+ separate SME vector length and the same Z/V registers. See sme.rst
+ for more details.
+
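+  A minimal sketch of this detection, assuming headers that define
+  HWCAP2_SME::
+
+    #include <sys/auxv.h>
+    #include <asm/hwcap.h>  /* HWCAP2_SME */
+
+    int have_sme(void)
+    {
+            return (getauxval(AT_HWCAP2) & HWCAP2_SME) != 0;
+    }
+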
* Debuggers should restrict themselves to interacting with the target via the
NT_ARM_SVE regset. The recommended way of detecting support for this regset
is to connect to a target process first and then attempt a
- ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov).
+ ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). Note that when SME is
+ present and streaming SVE mode is in use the FPSIMD subset of registers
+ will be read via NT_ARM_SVE and NT_ARM_SVE writes will exit streaming mode
+ in the target.
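+  An informative sketch of the recommended probe, assuming the target is
+  already attached and stopped::
+
+    #include <sys/ptrace.h>
+    #include <sys/types.h>
+    #include <sys/uio.h>
+    #include <linux/elf.h>   /* NT_ARM_SVE */
+    #include <asm/ptrace.h>  /* struct user_sve_header */
+
+    int target_has_sve(pid_t pid)
+    {
+            struct user_sve_header hdr;
+            struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
+
+            /* Fails (EINVAL) on kernels without the NT_ARM_SVE regset. */
+            return ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov) == 0;
+    }
+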
* Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory
between userspace and the kernel, the register value is encoded in memory in
@@ -96,7 +112,7 @@ the SVE instruction set architecture.
* On syscall, V0..V31 are preserved (as without SVE). Thus, bits [127:0] of
Z0..Z31 are preserved. All other bits of Z0..Z31, and all of P0..P15 and FFR
- become unspecified on return from a syscall.
+ become zero on return from a syscall.
* The SVE registers are not used to pass arguments to or receive results from
any syscall.
@@ -126,6 +142,11 @@ the SVE instruction set architecture.
are only present in fpsimd_context. For convenience, the content of V0..V31
is duplicated between sve_context and fpsimd_context.
+* The record contains a flags field which includes a flag, SVE_SIG_FLAG_SM,
+ which, if set, indicates that the thread is in streaming mode and that the
+ vector length and register data (if present) describe the streaming SVE
+ data and vector length, as sketched below.
+
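+  An informative sketch of testing this flag from a signal handler, using
+  the <asm/sigcontext.h> definitions (for brevity it does not follow an
+  extra_context record, which a complete implementation would)::
+
+    #include <ucontext.h>
+    #include <asm/sigcontext.h>
+
+    static int frame_is_streaming(ucontext_t *uc)
+    {
+            struct _aarch64_ctx *head =
+                    (struct _aarch64_ctx *)uc->uc_mcontext.__reserved;
+
+            /* Walk the __reserved records until the zero terminator. */
+            for (; head->magic;
+                 head = (struct _aarch64_ctx *)((char *)head + head->size))
+                    if (head->magic == SVE_MAGIC)
+                            return !!(((struct sve_context *)head)->flags &
+                                      SVE_SIG_FLAG_SM);
+
+            return 0;  /* no sve_context record */
+    }
+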
* The signal frame record for SVE always contains basic metadata, in particular
the thread's vector length (in sve_context.vl).
@@ -154,7 +175,7 @@ the SVE instruction set architecture.
When returning from a signal handler:
* If there is no sve_context record in the signal frame, or if the record is
- present but contains no register data as desribed in the previous section,
+ present but contains no register data as described in the previous section,
then the SVE registers/bits become non-live and take unspecified values.
* If sve_context is present in the signal frame and contains full register
@@ -170,6 +191,11 @@ When returning from a signal handler:
the signal frame does not match the current vector length, the signal return
attempt is treated as illegal, resulting in a forced SIGSEGV.
+* It is permitted to enter or leave streaming mode by setting or clearing
+ the SVE_SIG_FLAG_SM flag but applications should take care to ensure that
+ when doing so sve_context.vl and any register data are appropriate for the
+ vector length in the new mode.
+
6. prctl extensions
--------------------
@@ -197,7 +223,7 @@ prctl(PR_SVE_SET_VL, unsigned long arg)
Defer the requested vector length change until the next execve()
performed by this thread.
- The effect is equivalent to implicit exceution of the following
+ The effect is equivalent to implicit execution of the following
call immediately after the next execve() (if any) by the thread:
prctl(PR_SVE_SET_VL, arg & ~PR_SVE_SET_VL_ONEXEC)
@@ -255,7 +281,7 @@ prctl(PR_SVE_GET_VL)
vector length change (which would only normally be the case between a
fork() or vfork() and the corresponding execve() in typical use).
- To extract the vector length from the result, and it with
+ To extract the vector length from the result, bitwise and it with
PR_SVE_VL_LEN_MASK.
Return value: a nonnegative value on success, or a negative value on error:
@@ -265,8 +291,14 @@ prctl(PR_SVE_GET_VL)
7. ptrace extensions
---------------------
-* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and
- PTRACE_SETREGSET.
+* New regsets NT_ARM_SVE and NT_ARM_SSVE are defined for use with
+ PTRACE_GETREGSET and PTRACE_SETREGSET. NT_ARM_SSVE describes the
+ streaming mode SVE registers and NT_ARM_SVE describes the
+ non-streaming mode SVE registers.
+
+ In this description a register set is referred to as being "live" when
+ the target is in the appropriate streaming or non-streaming mode and is
+ using data beyond the subset shared with the FPSIMD Vn registers.
Refer to [2] for definitions.
@@ -297,7 +329,7 @@ The regset data starts with struct user_sve_header, containing:
flags
- either
+ at most one of
SVE_PT_REGS_FPSIMD
@@ -331,6 +363,10 @@ The regset data starts with struct user_sve_header, containing:
SVE_PT_VL_ONEXEC (SETREGSET only).
+ If neither the FPSIMD nor the SVE flag is provided then no register
+ payload is available; this is only possible when SME is implemented.
+
+
* The effects of changing the vector length and/or flags are equivalent to
those documented for PR_SVE_SET_VL.
@@ -346,6 +382,13 @@ The regset data starts with struct user_sve_header, containing:
case only the vector length and flags are changed (along with any
consequences of those changes).
+* In systems supporting SME, when in streaming mode a GETREGSET for
+ NT_ARM_SVE will return only the user_sve_header with no register data;
+ similarly, a GETREGSET for NT_ARM_SSVE will not return any register data
+ when not in streaming mode.
+
+* A GETREGSET for NT_ARM_SSVE will never return SVE_PT_REGS_FPSIMD.
+
* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the
requested VL is not supported, the effect will be the same as if the
payload were omitted, except that an EIO error is reported. No
@@ -355,17 +398,25 @@ The regset data starts with struct user_sve_header, containing:
unspecified. It is up to the caller to translate the payload layout
for the actual VL and retry.
+* Where SME is implemented it is not possible to GETREGSET the register
+ state for normal SVE when in streaming mode, nor the streaming mode
+ register state when in normal mode, regardless of the implementation defined
+ behaviour of the hardware for sharing data between the two modes.
+
+* Any SETREGSET of NT_ARM_SVE will exit streaming mode if the target was in
+ streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode
+ if the target was not in streaming mode.
+
* The effect of writing a partial, incomplete payload is unspecified.
8. ELF coredump extensions
---------------------------
-* A NT_ARM_SVE note will be added to each coredump for each thread of the
- dumped process. The contents will be equivalent to the data that would have
- been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread
- when the coredump was generated.
-
+* NT_ARM_SVE and NT_ARM_SSVE notes will be added to each coredump for
+ each thread of the dumped process. The contents will be equivalent to the
+ data that would have been read if a PTRACE_GETREGSET of the corresponding
+ type were executed for each thread when the coredump was generated.
9. System runtime configuration
--------------------------------
@@ -402,6 +453,24 @@ The regset data starts with struct user_sve_header, containing:
* Modifying the system default vector length does not affect the vector length
of any existing process or thread that does not make an execve() call.
+10. Perf extensions
+--------------------------------
+
+* The arm64 specific DWARF standard [5] added the VG (Vector Granule) register
+ at index 46. This register is used for DWARF unwinding when variable length
+ SVE registers are pushed onto the stack.
+
+* Its value is equivalent to the current SVE vector length (VL) in bits divided
+ by 64.
+
+* The value is included in Perf samples in the regs[46] field if
+ PERF_SAMPLE_REGS_USER is set and the sample_regs_user mask has bit 46 set.
+
+* The value is the current value at the time the sample was taken, and it can
+ change over time.
+
+* If the system doesn't support SVE when perf_event_open is called with these
+ settings, the event will fail to open (an example follows this list).
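+
+An informative sketch of requesting VG via the perf_event_open() syscall,
+with error handling elided::
+
+  #include <linux/perf_event.h>
+  #include <string.h>
+  #include <sys/syscall.h>
+  #include <unistd.h>
+
+  int open_event_with_vg(void)
+  {
+          struct perf_event_attr attr;
+
+          memset(&attr, 0, sizeof(attr));
+          attr.size = sizeof(attr);
+          attr.type = PERF_TYPE_HARDWARE;
+          attr.config = PERF_COUNT_HW_CPU_CYCLES;
+          attr.sample_period = 100000;
+          attr.sample_type = PERF_SAMPLE_REGS_USER;
+          attr.sample_regs_user = 1ULL << 46;  /* VG */
+
+          /* Fails on systems without SVE, as noted above. */
+          return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+  }
+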
Appendix A. SVE programmer's model (informative)
=================================================
@@ -543,3 +612,5 @@ References
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
+
+[5] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index 0c9120ec58ae..540a1d4fc6c9 100644
--- a/Documentation/arm64/tagged-address-abi.rst
+++ b/Documentation/arm64/tagged-address-abi.rst
@@ -49,7 +49,7 @@ how the user addresses are used by the kernel:
- ``brk()``, ``mmap()`` and the ``new_address`` argument to
``mremap()`` as these have the potential to alias with existing
- user addresses.
+ user addresses.
NOTE: This behaviour changed in v5.6 and so some earlier kernels may
incorrectly accept valid tagged pointers for the ``brk()``,
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index 093cdaefdb37..edea4656c5c0 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
- RMW operations that have a return value are fully ordered.
- - RMW operations that are conditional are unordered on FAILURE,
- otherwise the above rules apply. In the case of test_and_{}_bit() operations,
- if the bit in memory is unchanged by the operation then it is deemed to have
- failed.
+ - RMW operations that are conditional are fully ordered.
-Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
-clear_bit_unlock() which has RELEASE semantics.
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
+clear_bit_unlock() which has RELEASE semantics and test_bit_acquire() which
+has ACQUIRE semantics.
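+
+For example, clear_bit_unlock() and test_bit_acquire() can pair to publish
+data guarded by a bit (an illustrative sketch only; BUSY names a bit chosen
+by the caller and set before the update began):
+
+  /* publisher */                        /* consumer */
+  obj->data = compute();                 if (!test_bit_acquire(BUSY, &obj->flags))
+  clear_bit_unlock(BUSY, &obj->flags);           use(obj->data);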
Since a platform only has a single means of achieving atomic operations
the same barriers as for atomic_t are used, see atomic_t.txt.
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index 0f1ffa03db09..d7adc6d543db 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -324,7 +324,7 @@ atomic operations.
Specifically 'simple' cmpxchg() loops are expected to not starve one another
indefinitely. However, this is not evident on LL/SC architectures, because
-while an LL/SC architecure 'can/should/must' provide forward progress
+while an LL/SC architecture 'can/should/must' provide forward progress
guarantees between competing LL/SC sections, such a guarantee does not
transfer to cmpxchg() implemented using LL/SC. Consider:
diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst
deleted file mode 100644
index 2098477851a4..000000000000
--- a/Documentation/block/biodoc.rst
+++ /dev/null
@@ -1,1164 +0,0 @@
-=====================================================
-Notes on the Generic Block Layer Rewrite in Linux 2.5
-=====================================================
-
-.. note::
-
- It seems that there is a lot of outdated stuff here. This seems
- to be written somewhat as a task list. Yet, eventually, something
- here might still be useful.
-
-Notes Written on Jan 15, 2002:
-
- - Jens Axboe <jens.axboe@oracle.com>
- - Suparna Bhattacharya <suparna@in.ibm.com>
-
-Last Updated May 2, 2002
-
-September 2003: Updated I/O Scheduler portions
- - Nick Piggin <npiggin@kernel.dk>
-
-Introduction
-============
-
-These are some notes describing some aspects of the 2.5 block layer in the
-context of the bio rewrite. The idea is to bring out some of the key
-changes and a glimpse of the rationale behind those changes.
-
-Please mail corrections & suggestions to suparna@in.ibm.com.
-
-Credits
-=======
-
-2.5 bio rewrite:
- - Jens Axboe <jens.axboe@oracle.com>
-
-Many aspects of the generic block layer redesign were driven by and evolved
-over discussions, prior patches and the collective experience of several
-people. See sections 8 and 9 for a list of some related references.
-
-The following people helped with review comments and inputs for this
-document:
-
- - Christoph Hellwig <hch@infradead.org>
- - Arjan van de Ven <arjanv@redhat.com>
- - Randy Dunlap <rdunlap@xenotime.net>
- - Andre Hedrick <andre@linux-ide.org>
-
-The following people helped with fixes/contributions to the bio patches
-while it was still work-in-progress:
-
- - David S. Miller <davem@redhat.com>
-
-
-.. Description of Contents:
-
- 1. Scope for tuning of logic to various needs
- 1.1 Tuning based on device or low level driver capabilities
- - Per-queue parameters
- - Highmem I/O support
- - I/O scheduler modularization
- 1.2 Tuning based on high level requirements/capabilities
- 1.2.1 Request Priority/Latency
- 1.3 Direct access/bypass to lower layers for diagnostics and special
- device operations
- 1.3.1 Pre-built commands
- 2. New flexible and generic but minimalist i/o structure or descriptor
- (instead of using buffer heads at the i/o layer)
- 2.1 Requirements/Goals addressed
- 2.2 The bio struct in detail (multi-page io unit)
- 2.3 Changes in the request structure
- 3. Using bios
- 3.1 Setup/teardown (allocation, splitting)
- 3.2 Generic bio helper routines
- 3.2.1 Traversing segments and completion units in a request
- 3.2.2 Setting up DMA scatterlists
- 3.2.3 I/O completion
- 3.2.4 Implications for drivers that do not interpret bios (don't handle
- multiple segments)
- 3.3 I/O submission
- 4. The I/O scheduler
- 5. Scalability related changes
- 5.1 Granular locking: Removal of io_request_lock
- 5.2 Prepare for transition to 64 bit sector_t
- 6. Other Changes/Implications
- 6.1 Partition re-mapping handled by the generic block layer
- 7. A few tips on migration of older drivers
- 8. A list of prior/related/impacted patches/ideas
- 9. Other References/Discussion Threads
-
-
-Bio Notes
-=========
-
-Let us discuss the changes in the context of how some overall goals for the
-block layer are addressed.
-
-1. Scope for tuning the generic logic to satisfy various requirements
-=====================================================================
-
-The block layer design supports adaptable abstractions to handle common
-processing with the ability to tune the logic to an appropriate extent
-depending on the nature of the device and the requirements of the caller.
-One of the objectives of the rewrite was to increase the degree of tunability
-and to enable higher level code to utilize underlying device/driver
-capabilities to the maximum extent for better i/o performance. This is
-important especially in the light of ever improving hardware capabilities
-and application/middleware software designed to take advantage of these
-capabilities.
-
-1.1 Tuning based on low level device / driver capabilities
-----------------------------------------------------------
-
-Sophisticated devices with large built-in caches, intelligent i/o scheduling
-optimizations, high memory DMA support, etc may find some of the
-generic processing an overhead, while for less capable devices the
-generic functionality is essential for performance or correctness reasons.
-Knowledge of some of the capabilities or parameters of the device should be
-used at the generic block layer to take the right decisions on
-behalf of the driver.
-
-How is this achieved?
-
-Tuning at a per-queue level:
-
-i. Per-queue limits/values exported to the generic layer by the driver
-
-Various parameters that the generic i/o scheduler logic uses are set at
-a per-queue level (e.g maximum request size, maximum number of segments in
-a scatter-gather list, logical block size)
-
-Some parameters that were earlier available as global arrays indexed by
-major/minor are now directly associated with the queue. Some of these may
-move into the block device structure in the future. Some characteristics
-have been incorporated into a queue flags field rather than separate fields
-in themselves. There are blk_queue_xxx functions to set the parameters,
-rather than update the fields directly.
-
-Some new queue property settings:
-
- blk_queue_bounce_limit(q, u64 dma_address)
- Enable I/O to highmem pages, dma_address being the
- limit. No highmem default.
-
- blk_queue_max_sectors(q, max_sectors)
- Sets two variables that limit the size of the request.
-
- - The request queue's max_sectors, which is a soft size in
- units of 512 byte sectors, and could be dynamically varied
- by the core kernel.
-
- - The request queue's max_hw_sectors, which is a hard limit
- and reflects the maximum size request a driver can handle
- in units of 512 byte sectors.
-
- The default for both max_sectors and max_hw_sectors is
- 255. The upper limit of max_sectors is 1024.
-
- blk_queue_max_phys_segments(q, max_segments)
- Maximum physical segments you can handle in a request. 128
- default (driver limit). (See 3.2.2)
-
- blk_queue_max_hw_segments(q, max_segments)
- Maximum dma segments the hardware can handle in a request. 128
- default (host adapter limit, after dma remapping).
- (See 3.2.2)
-
- blk_queue_max_segment_size(q, max_seg_size)
- Maximum size of a clustered segment, 64kB default.
-
- blk_queue_logical_block_size(q, logical_block_size)
- Lowest possible sector size that the hardware can operate
- on, 512 bytes default.
-
-New queue flags:
-
- - QUEUE_FLAG_CLUSTER (see 3.2.2)
- - QUEUE_FLAG_QUEUED (see 3.2.4)
-
-
-ii. High-mem i/o capabilities are now considered the default
-
-The generic bounce buffer logic, present in 2.4, where the block layer would
-by default copyin/out i/o requests on high-memory buffers to low-memory buffers
-assuming that the driver wouldn't be able to handle it directly, has been
-changed in 2.5. The bounce logic is now applied only for memory ranges
-for which the device cannot handle i/o. A driver can specify this by
-setting the queue bounce limit for the request queue for the device
-(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
-where a device is capable of handling high memory i/o.
-
-In order to enable high-memory i/o where the device is capable of supporting
-it, the pci dma mapping routines and associated data structures have now been
-modified to accomplish a direct page -> bus translation, without requiring
-a virtual address mapping (unlike the earlier scheme of virtual address
--> bus translation). So this works uniformly for high-memory pages (which
-do not have a corresponding kernel virtual address space mapping) and
-low-memory pages.
-
-Note: Please refer to Documentation/core-api/dma-api-howto.rst for a discussion
-on PCI high mem DMA aspects and mapping of scatter gather lists, and support
-for 64 bit PCI.
-
-Special handling is required only for cases where i/o needs to happen on
-pages at physical memory addresses beyond what the device can support. In these
-cases, a bounce bio representing a buffer from the supported memory range
-is used for performing the i/o with copyin/copyout as needed depending on
-the type of the operation. For example, in case of a read operation, the
-data read has to be copied to the original buffer on i/o completion, so a
-callback routine is set up to do this, while for write, the data is copied
-from the original buffer to the bounce buffer prior to issuing the
-operation. Since an original buffer may be in a high memory area that's not
-mapped in kernel virtual addr, a kmap operation may be required for
-performing the copy, and special care may be needed in the completion path
-as it may not be in irq context. Special care is also required (by way of
-GFP flags) when allocating bounce buffers, to avoid certain highmem
-deadlock possibilities.
-
-It is also possible that a bounce buffer may be allocated from high-memory
-area that's not mapped in kernel virtual addr, but within the range that the
-device can use directly; so the bounce page may need to be kmapped during
-copy operations. [Note: This does not hold in the current implementation,
-though]
-
-There are some situations when pages from high memory may need to
-be kmapped, even if bounce buffers are not necessary. For example a device
-may need to abort DMA operations and revert to PIO for the transfer, in
-which case a virtual mapping of the page is required. For SCSI it is also
-done in some scenarios where the low level driver cannot be trusted to
-handle a single sg entry correctly. The driver is expected to perform the
-kmaps as needed on such occasions as appropriate. A driver could also use
-the blk_queue_bounce() routine on its own to bounce highmem i/o to low
-memory for specific requests if so desired.
-
-iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
-
-As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
-queue or pick from (copy) existing generic schedulers and replace/override
-certain portions of it. The 2.5 rewrite provides improved modularization
-of the i/o scheduler. There are more pluggable callbacks, e.g for init,
-add request, extract request, which makes it possible to abstract specific
-i/o scheduling algorithm aspects and details outside of the generic loop.
-It also makes it possible to completely hide the implementation details of
-the i/o scheduler from block drivers.
-
-I/O scheduler wrappers are to be used instead of accessing the queue directly.
-See section 4. The I/O scheduler for details.
-
-1.2 Tuning Based on High level code capabilities
-------------------------------------------------
-
-i. Application capabilities for raw i/o
-
-This comes from some of the high-performance database/middleware
-requirements where an application prefers to make its own i/o scheduling
-decisions based on an understanding of the access patterns and i/o
-characteristics
-
-ii. High performance filesystems or other higher level kernel code's
-capabilities
-
-Kernel components like filesystems could also take their own i/o scheduling
-decisions for optimizing performance. Journalling filesystems may need
-some control over i/o ordering.
-
-What kind of support exists at the generic block layer for this?
-
-The flags and rw fields in the bio structure can be used for some tuning
-from above e.g indicating that an i/o is just a readahead request, or priority
-settings (currently unused). As far as user applications are concerned they
-would need an additional mechanism either via open flags or ioctls, or some
-other upper level mechanism to communicate such settings to block.
-
-1.2.1 Request Priority/Latency
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Todo/Under discussion::
-
- Arjan's proposed request priority scheme allows higher levels some broad
- control (high/med/low) over the priority of an i/o request vs other pending
- requests in the queue. For example it allows reads for bringing in an
- executable page on demand to be given a higher priority over pending write
- requests which haven't aged too much on the queue. Potentially this priority
- could even be exposed to applications in some manner, providing higher level
- tunability. Time based aging avoids starvation of lower priority
- requests. Some bits in the bi_opf flags field in the bio structure are
- intended to be used for this priority information.
-
-
-1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
------------------------------------------------------------------------
-
-(e.g Diagnostics, Systems Management)
-
-There are situations where high-level code needs to have direct access to
-the low level device capabilities or requires the ability to issue commands
-to the device bypassing some of the intermediate i/o layers.
-These could, for example, be special control commands issued through ioctl
-interfaces, or could be raw read/write commands that stress the drive's
-capabilities for certain kinds of fitness tests. Having direct interfaces at
-multiple levels without having to pass through upper layers makes
-it possible to perform bottom up validation of the i/o path, layer by
-layer, starting from the media.
-
-The normal i/o submission interfaces, e.g submit_bio, could be bypassed
-for specially crafted requests which such ioctl or diagnostics
-interfaces would typically use, and the elevator add_request routine
-can instead be used to directly insert such requests in the queue or preferably
-the blk_do_rq routine can be used to place the request on the queue and
-wait for completion. Alternatively, sometimes the caller might just
-invoke a lower level driver specific interface with the request as a
-parameter.
-
-If the request is a means for passing on special information associated with
-the command, then such information is associated with the request->special
-field (rather than misuse the request->buffer field which is meant for the
-request data buffer's virtual mapping).
-
-For passing request data, the caller must build up a bio descriptor
-representing the concerned memory buffer if the underlying driver interprets
-bio segments or uses the block layer end*request* functions for i/o
-completion. Alternatively one could directly use the request->buffer field to
-specify the virtual address of the buffer, if the driver expects buffer
-addresses passed in this way and ignores bio entries for the request type
-involved. In the latter case, the driver would modify and manage the
-request->buffer, request->sector and request->nr_sectors or
-request->current_nr_sectors fields itself rather than using the block layer
-end_request or end_that_request_first completion interfaces.
-(See 2.3 or Documentation/block/request.rst for a brief explanation of
-the request structure fields)
-
-::
-
- [TBD: end_that_request_last should be usable even in this case;
- Perhaps an end_that_direct_request_first routine could be implemented to make
- handling direct requests easier for such drivers; Also for drivers that
- expect bios, a helper function could be provided for setting up a bio
- corresponding to a data buffer]
-
- <JENS: I dont understand the above, why is end_that_request_first() not
- usable? Or _last for that matter. I must be missing something>
-
- <SUP: What I meant here was that if the request doesn't have a bio, then
- end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
- and hence can't be used for advancing request state settings on the
- completion of partial transfers. The driver has to modify these fields
- directly by hand.
- This is because end_that_request_first only iterates over the bio list,
- and always returns 0 if there are none associated with the request.
- _last works OK in this case, and is not a problem, as I mentioned earlier
- >
-
-1.3.1 Pre-built Commands
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-A request can be created with a pre-built custom command to be sent directly
-to the device. The cmd block in the request structure has room for filling
-in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
-command pre-building, and the type of the request is now indicated
-through rq->flags instead of via rq->cmd)
-
-The request structure flags can be set up to indicate the type of request
-in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
-packet command issued via blk_do_rq, REQ_SPECIAL: special request).
-
-It can help to pre-build device commands for requests in advance.
-Drivers can now specify a request prepare function (q->prep_rq_fn) that the
-block layer would invoke to pre-build device commands for a given request,
-or perform other preparatory processing for the request. This routine is
-called by elv_next_request(), i.e. typically just before servicing a request.
-(The prepare function would not be called for requests that have RQF_DONTPREP
-enabled)
-
-Aside:
- Pre-building could possibly even be done early, i.e before placing the
- request on the queue, rather than construct the command on the fly in the
- driver while servicing the request queue when it may affect latencies in
- interrupt context or responsiveness in general. One way to add early
- pre-building would be to do it whenever we fail to merge on a request.
- Now REQ_NOMERGE is set in the request flags to skip this one in the future,
- which means that it will not change before we feed it to the device. So
- the pre-builder hook can be invoked there.
-
-
-2. Flexible and generic but minimalist i/o structure/descriptor
-===============================================================
-
-2.1 Reason for a new structure and requirements addressed
----------------------------------------------------------
-
-Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
-layer, and the low level request structure was associated with a chain of
-buffer heads for a contiguous i/o request. This led to certain inefficiencies
-when it came to large i/o requests and readv/writev style operations, as it
-forced such requests to be broken up into small chunks before being passed
-on to the generic block layer, only to be merged by the i/o scheduler
-when the underlying device was capable of handling the i/o in one shot.
-Also, using the buffer head as an i/o structure for i/os that didn't originate
-from the buffer cache unnecessarily added to the weight of the descriptors
-which were generated for each such chunk.
-
-The following were some of the goals and expectations considered in the
-redesign of the block i/o data structure in 2.5.
-
-1. Should be appropriate as a descriptor for both raw and buffered i/o -
- avoid cache related fields which are irrelevant in the direct/page i/o path,
- or filesystem block size alignment restrictions which may not be relevant
- for raw i/o.
-2. Ability to represent high-memory buffers (which do not have a virtual
- address mapping in kernel address space).
-3. Ability to represent large i/os w/o unnecessarily breaking them up (i.e
- greater than PAGE_SIZE chunks in one shot)
-4. At the same time, ability to retain independent identity of i/os from
- different sources or i/o units requiring individual completion (e.g. for
- latency reasons)
-5. Ability to represent an i/o involving multiple physical memory segments
- (including non-page aligned page fragments, as specified via readv/writev)
- without unnecessarily breaking it up, if the underlying device is capable of
- handling it.
-6. Preferably should be based on a memory descriptor structure that can be
- passed around different types of subsystems or layers, maybe even
- networking, without duplication or extra copies of data/descriptor fields
- themselves in the process
-7. Ability to handle the possibility of splits/merges as the structure passes
- through layered drivers (lvm, md, evms), with minimal overhead.
-
-The solution was to define a new structure (bio) for the block layer,
-instead of using the buffer head structure (bh) directly, the idea being
-avoidance of some associated baggage and limitations. The bio structure
-is uniformly used for all i/o at the block layer; it forms a part of the
-bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
-mapped to bio structures.
-
-2.2 The bio struct
-------------------
-
-The bio structure uses a vector representation pointing to an array of tuples
-of <page, offset, len> to describe the i/o buffer, and has various other
-fields describing i/o parameters and state that needs to be maintained for
-performing the i/o.
-
-Notice that this representation means that a bio has no virtual address
-mapping at all (unlike buffer heads).
-
-::
-
- struct bio_vec {
- struct page *bv_page;
- unsigned short bv_len;
- unsigned short bv_offset;
- };
-
- /*
- * main unit of I/O for the block layer and lower layers (ie drivers)
- */
- struct bio {
- struct bio *bi_next; /* request queue link */
- struct block_device *bi_bdev; /* target device */
- unsigned long bi_flags; /* status, command, etc */
- unsigned long bi_opf; /* low bits: r/w, high: priority */
-
- unsigned int bi_vcnt; /* how many bio_vec's */
- struct bvec_iter bi_iter; /* current index into bio_vec array */
-
- unsigned int bi_size; /* total size in bytes */
- unsigned short bi_hw_segments; /* segments after DMA remapping */
- unsigned int bi_max; /* max bio_vecs we can hold
- used as index into pool */
- struct bio_vec *bi_io_vec; /* the actual vec list */
- bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
- atomic_t bi_cnt; /* pin count: free when it hits zero */
- void *bi_private;
- };
-
-With this multipage bio design:
-
-- Large i/os can be sent down in one go using a bio_vec list consisting
- of an array of <page, offset, len> fragments (similar to the way fragments
- are represented in the zero-copy network code)
-- Splitting of an i/o request across multiple devices (as in the case of
- lvm or raid) is achieved by cloning the bio (where the clone points to
- the same bi_io_vec array, but with the index and size accordingly modified)
-- A linked list of bios is used as before for unrelated merges [#]_ - this
- avoids reallocs and makes independent completions easier to handle.
-- Code that traverses the req list can find all the segments of a bio
- by using rq_for_each_segment. This handles the fact that a request
- has multiple bios, each of which can have multiple segments.
-- Drivers which can't process a large bio in one shot can use the bi_iter
- field to keep track of the next bio_vec entry to process.
- (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
- [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
- bi_offset and len fields]
-
-.. [#]
-
- unrelated merges -- a request ends up containing two or more bios that
- didn't originate from the same place.
-
-bi_end_io() i/o callback gets called on i/o completion of the entire bio.
-
-At a lower level, drivers build a scatter gather list from the merged bios.
-The scatter gather list is in the form of an array of <page, offset, len>
-entries with their corresponding dma address mappings filled in at the
-appropriate time. As an optimization, contiguous physical pages can be
-covered by a single entry where <page> refers to the first page and <len>
-covers the range of pages (up to 16 contiguous pages could be covered this
-way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
-the sg list.
-
-Note: Right now the only user of bios with more than one page is ll_rw_kio,
-which in turn means that only raw I/O uses it (direct i/o may not work
-right now). The intent however is to enable clustering of pages etc to
-become possible. The pagebuf abstraction layer from SGI also uses multi-page
-bios, but that is currently not included in the stock development kernels.
-The same is true of Andrew Morton's work-in-progress multipage bio writeout
-and readahead patches.
-
-2.3 Changes in the Request Structure
-------------------------------------
-
-The request structure is the structure that gets passed down to low level
-drivers. The block layer make_request function builds up a request structure,
-places it on the queue and invokes the drivers request_fn. The driver makes
-use of block layer helper routine elv_next_request to pull the next request
-off the queue. Control or diagnostic functions might bypass block and directly
-invoke underlying driver entry points passing in a specially constructed
-request structure.
-
-Only some relevant fields (mainly those which changed or may be referred
-to in some of the discussion here) are listed below, not necessarily in
-the order in which they occur in the structure (see include/linux/blkdev.h)
-Refer to Documentation/block/request.rst for details about all the request
-structure fields and a quick reference about the layers which are
-supposed to use or modify those fields::
-
- struct request {
- struct list_head queuelist; /* Not meant to be directly accessed by
- the driver.
- Used by q->elv_next_request_fn
- rq->queue is gone
- */
- .
- .
- unsigned char cmd[16]; /* prebuilt command data block */
- unsigned long flags; /* also includes earlier rq->cmd settings */
- .
- .
- sector_t sector; /* this field is now of type sector_t instead of int
- preparation for 64 bit sectors */
- .
- .
-
- /* Number of scatter-gather DMA addr+len pairs after
- * physical address coalescing is performed.
- */
- unsigned short nr_phys_segments;
-
- /* Number of scatter-gather addr+len pairs after
- * physical and DMA remapping hardware coalescing is performed.
- * This is the number of scatter-gather entries the driver
- * will actually have to deal with after DMA mapping is done.
- */
- unsigned short nr_hw_segments;
-
- /* Various sector counts */
- unsigned long nr_sectors; /* no. of sectors left: driver modifiable */
- unsigned long hard_nr_sectors; /* block internal copy of above */
- unsigned int current_nr_sectors; /* no. of sectors left in the
- current segment:driver modifiable */
- unsigned long hard_cur_sectors; /* block internal copy of the above */
- .
- .
- int tag; /* command tag associated with request */
- void *special; /* same as before */
- char *buffer; /* valid only for low memory buffers up to
- current_nr_sectors */
- .
- .
- struct bio *bio, *biotail; /* bio list instead of bh */
- struct request_list *rl;
- }
-
-See the req_ops and req_flag_bits definitions for an explanation of the various
-flags available. Some bits are used by the block layer or i/o scheduler.
-
-The behaviour of the various sector counts are almost the same as before,
-except that since we have multi-segment bios, current_nr_sectors refers
-to the numbers of sectors in the current segment being processed which could
-be one of the many segments in the current bio (i.e i/o completion unit).
-The nr_sectors value refers to the total number of sectors in the whole
-request that remain to be transferred (no change). The purpose of the
-hard_xxx values is for block to remember these counts every time it hands
-over the request to the driver. These values are updated by block on
-end_that_request_first, i.e. every time the driver completes a part of the
-transfer and invokes block end*request helpers to mark this. The
-driver should not modify these values. The block layer sets up the
-nr_sectors and current_nr_sectors fields (based on the corresponding
-hard_xxx values and the number of bytes transferred) and updates it on
-every transfer that invokes end_that_request_first. It does the same for the
-buffer, bio, bio->bi_iter fields too.
-
-The buffer field is just a virtual address mapping of the current segment
-of the i/o buffer in cases where the buffer resides in low-memory. For high
-memory i/o, this field is not valid and must not be used by drivers.
-
-Code that sets up its own request structures and passes them down to
-a driver needs to be careful about interoperation with the block layer helper
-functions which the driver uses. (Section 1.3)
-
-3. Using bios
-=============
-
-3.1 Setup/Teardown
-------------------
-
-There are routines for managing the allocation, reference counting, and
-freeing of bios (bio_alloc, bio_get, bio_put).
-
-This makes use of Ingo Molnar's mempool implementation, which enables
-subsystems like bio to maintain their own reserve memory pools for guaranteed
-deadlock-free allocations during extreme VM load. For example, the VM
-subsystem makes use of the block layer to writeout dirty pages in order to be
-able to free up memory space, a case which needs careful handling. The
-allocation logic draws from the preallocated emergency reserve in situations
-where it cannot allocate through normal means. If the pool is empty and it
-can wait, then it would trigger action that would help free up memory or
-replenish the pool (without deadlocking) and wait for availability in the pool.
-If it is in IRQ context, and hence not in a position to do this, allocation
-could fail if the pool is empty. In general mempool always first tries to
-perform allocation without having to wait, even if it means digging into the
-pool as long as it is not less than 50% full.
-
-On a free, memory is released to the pool or directly freed depending on
-the current availability in the pool. The mempool interface lets the
-subsystem specify the routines to be used for normal alloc and free. In the
-case of bio, these routines make use of the standard slab allocator.
-
-The caller of bio_alloc is expected to take certain steps to avoid
-deadlocks, e.g. avoid trying to allocate more memory from the pool while
-already holding memory obtained from the pool.
-
-::
-
- [TBD: This is a potential issue, though a rare possibility
- in the bounce bio allocation that happens in the current code, since
- it ends up allocating a second bio from the same pool while
- holding the original bio ]
-
-Memory allocated from the pool should be released back within a limited
-amount of time (in the case of bio, that would be after the i/o is completed).
-This ensures that if part of the pool has been used up, some work (in this
-case i/o) must already be in progress and memory would be available when it
-is over. If allocating from multiple pools in the same code path, the order
-or hierarchy of allocation needs to be consistent, just the way one deals
-with multiple locks.
-
-The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
-for a non-clone bio. There are 6 pools set up for different size biovecs,
-so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
-given size from these slabs.
-
-The bio_get() routine may be used to hold an extra reference on a bio prior
-to i/o submission, if the bio fields are likely to be accessed after the
-i/o is issued (since the bio may otherwise get freed in case i/o completion
-happens in the meantime).
-
-The bio_clone_fast() routine may be used to duplicate a bio, where the clone
-shares the bio_vec_list with the original bio (i.e. both point to the
-same bio_vec_list). This would typically be used for splitting i/o requests
-in lvm or md.
-
-3.2 Generic bio helper Routines
--------------------------------
-
-3.2.1 Traversing segments and completion units in a request
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The macro rq_for_each_segment() should be used for traversing the bios
-in the request list (drivers should avoid directly trying to do it
-themselves). Using these helpers should also make it easier to cope
-with block changes in the future.
-
-::
-
- struct bio_vec bvec;
- struct req_iterator iter;
-
- rq_for_each_segment(bvec, rq, iter)
- /* bvec now describes the current segment */
-
-I/O completion callbacks are per-bio rather than per-segment, so drivers
-that traverse bio chains on completion need to keep that in mind. Drivers
-which don't make a distinction between segments and completion units would
-need to be reorganized to support multi-segment bios.
-
-3.2.2 Setting up DMA scatterlists
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The blk_rq_map_sg() helper routine would be used for setting up scatter
-gather lists from a request, so a driver need not do it on its own.
-
- nr_segments = blk_rq_map_sg(q, rq, scatterlist);
-
-The helper routine provides a level of abstraction which makes it easier
-to modify the internals of request to scatterlist conversion down the line
-without breaking drivers. The blk_rq_map_sg routine takes care of several
-things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
-is set) and correct segment accounting to avoid exceeding the limits which
-the i/o hardware can handle, based on various queue properties.
-
-- Prevents a clustered segment from crossing a 4GB mem boundary
-- Avoids building segments that would exceed the number of physical
- memory segments that the driver can handle (phys_segments) and the
- number that the underlying hardware can handle at once, accounting for
- DMA remapping (hw_segments) (i.e. IOMMU aware limits).
-
-Routines which the low level driver can use to set up the segment limits:
-
-blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
-hw data segments in a request (i.e. the maximum number of address/length
-pairs the host adapter can actually hand to the device at once)
-
-blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
-of physical data segments in a request (i.e. the largest sized scatter list
-a driver could handle)
-
-3.2.3 I/O completion
-^^^^^^^^^^^^^^^^^^^^
-
-The existing generic block layer helper routines end_request,
-end_that_request_first and end_that_request_last can be used for i/o
-completion (and setting things up so the rest of the i/o or the next
-request can be kicked off) as before. With the introduction of multi-page
-bio support, end_that_request_first requires an additional argument indicating
-the number of sectors completed.
-
-3.2.4 Implications for drivers that do not interpret bios
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-(don't handle multiple segments)
-
-Drivers that do not interpret bios e.g those which do not handle multiple
-segments and do not support i/o into high memory addresses (require bounce
-buffers) and expect only virtually mapped buffers, can access the rq->buffer
-field. As before the driver should use current_nr_sectors to determine the
-size of remaining data in the current segment (that is the maximum it can
-transfer in one go unless it interprets segments), and rely on the block layer
-end_request, or end_that_request_first/last to take care of all accounting
-and transparent mapping of the next bio segment when a segment boundary
-is crossed on completion of a transfer. (The end*request* functions should
-be used only if the request has come down the block/bio path, not for
-direct access requests which only specify rq->buffer without a valid rq->bio)
-
-3.3 I/O Submission
-------------------
-
-The routine submit_bio() is used to submit a single io. Higher level i/o
-routines make use of this:
-
-(a) Buffered i/o:
-
-The routine submit_bh() invokes submit_bio() on a bio corresponding to the
-bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
-
-(b) Kiobuf i/o (for raw/direct i/o):
-
-The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
-maps the array to one or more multi-page bios, issuing submit_bio() to
-perform the i/o on each of these.
-
-The embedded bh array in the kiobuf structure has been removed and no
-preallocation of bios is done for kiobufs. [The intent is to remove the
-blocks array as well, but it's currently in there to kludge around direct i/o.]
-Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
-
-Todo/Observation:
-
- A single kiobuf structure is assumed to correspond to a contiguous range
- of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
- So right now it wouldn't work for direct i/o on non-contiguous blocks.
- This is to be resolved. The eventual direction is to replace kiobuf
- by kvec's.
-
- Badari Pulavarty has a patch to implement direct i/o correctly using
- bio and kvec.
-
-
-(c) Page i/o:
-
-Todo/Under discussion:
-
- Andrew Morton's multi-page bio patches attempt to issue multi-page
- writeouts (and reads) from the page cache, by directly building up
- large bios for submission completely bypassing the usage of buffer
- heads. This work is still in progress.
-
- Christoph Hellwig had some code that uses bios for page-io (rather than
- bh). This isn't included in bio as yet. Christoph was also working on a
- design for representing virtual/real extents as an entity and modifying
- some of the address space ops interfaces to utilize this abstraction rather
- than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
- abstraction, but intended to be as lightweight as possible).
-
-(d) Direct access i/o:
-
-Direct access requests that do not contain bios would be submitted differently
-as discussed earlier in section 1.3.
-
-Aside:
-
- Kvec i/o:
-
- Ben LaHaise's aio code uses a slightly different structure instead
- of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
- tuples (very much like the networking code), together with a callback function
- and data pointer. This is embedded into a brw_cb structure when passed
- to brw_kvec_async().
-
- Now it should be possible to directly map these kvecs to a bio. Just as while
- cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
- array pointer to point to the veclet array in kvecs.
-
- TBD: In order for this to work, some changes are needed in the way multi-page
- bios are handled today. The values of the tuples in such a vector passed in
- from higher level code should not be modified by the block layer in the course
- of its request processing, since that would make it hard for the higher layer
- to continue to use the vector descriptor (kvec) after i/o completes. Instead,
- all such transient state should either be maintained in the request structure,
- and passed on in some way to the endio completion routine.
-
-
-4. The I/O scheduler
-====================
-
-I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch
-queue and specific I/O schedulers. Unless stated otherwise, elevator is used
-to refer to both parts and I/O scheduler to specific I/O schedulers.
-
-Block layer implements generic dispatch queue in `block/*.c`.
-The generic dispatch queue is responsible for requeueing, handling non-fs
-requests and all other subtleties.
-
-Specific I/O schedulers are responsible for ordering normal filesystem
-requests. They can also choose to delay certain requests to improve
-throughput or whatever purpose. As the plural form indicates, there are
-multiple I/O schedulers. They can be built as modules but at least one should
-be built inside the kernel. Each queue can choose a different one and can also
-change to another one dynamically.
-
-A block layer call to the i/o scheduler follows the convention elv_xxx(). This
-calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx
-and xxx might not match exactly, but use your imagination. If an elevator
-doesn't implement a function, the switch does nothing or some minimal house
-keeping work.
-
-4.1. I/O scheduler API
-----------------------
-
-The functions an elevator may implement are: (* are mandatory)
-
-=============================== ================================================
-elevator_merge_fn called to query requests for merge with a bio
-
-elevator_merge_req_fn called when two requests get merged. The one
- which gets merged into the other one will never
- be seen by the I/O scheduler again. IOW, after
- being merged, the request is gone.
-
-elevator_merged_fn called when a request in the scheduler has been
- involved in a merge. It is used in the deadline
- scheduler for example, to reposition the request
- if its sorting order has changed.
-
-elevator_allow_merge_fn called whenever the block layer determines
- that a bio can be merged into an existing
- request safely. The io scheduler may still
- want to stop a merge at this point if it
- results in some sort of conflict internally,
- this hook allows it to do that. Note however
- that two *requests* can still be merged at later
- time. Currently the io scheduler has no way to
- prevent that. It can only learn about the fact
- from elevator_merge_req_fn callback.
-
-elevator_dispatch_fn* fills the dispatch queue with ready requests.
- I/O schedulers are free to postpone requests by
- not filling the dispatch queue unless @force
- is non-zero. Once dispatched, I/O schedulers
- are not allowed to manipulate the requests -
- they belong to generic dispatch queue.
-
-elevator_add_req_fn* called to add a new request into the scheduler
-
-elevator_former_req_fn
-elevator_latter_req_fn These return the request before or after the
- one specified in disk sort order. Used by the
- block layer to find merge possibilities.
-
-elevator_completed_req_fn called when a request is completed.
-
-elevator_set_req_fn
-elevator_put_req_fn Must be used to allocate and free any elevator
- specific storage for a request.
-
-elevator_activate_req_fn Called when device driver first sees a request.
- I/O schedulers can use this callback to
- determine when actual execution of a request
- starts.
-elevator_deactivate_req_fn Called when device driver decides to delay
- a request by requeueing it.
-
-elevator_init_fn*
-elevator_exit_fn Allocate and free any elevator specific storage
- for a queue.
-=============================== ================================================
-
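-As a rough illustration, an i/o scheduler of this era hooks itself up by
-filling in an elevator type structure and registering it (a hedged sketch
-only; the "example_" names are hypothetical, and the exact fields and return
-types varied across kernel versions)::
-
-	static struct elevator_type iosched_example = {
-		.ops = {
-			.elevator_merge_fn	= example_merge,
-			.elevator_dispatch_fn	= example_dispatch,
-			.elevator_add_req_fn	= example_add_request,
-			.elevator_init_fn	= example_init_queue,
-			.elevator_exit_fn	= example_exit_queue,
-		},
-		.elevator_name	= "example",
-		.elevator_owner	= THIS_MODULE,
-	};
-
-	static int __init example_init(void)
-	{
-		return elv_register(&iosched_example);
-	}
-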
-4.2 Request flows seen by I/O schedulers
-----------------------------------------
-
-All requests seen by I/O schedulers strictly follow one of the following three
-flows.
-
- set_req_fn ->
-
- i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
- (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
- ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
- iii. [none]
-
- -> put_req_fn
-
-4.3 I/O scheduler implementation
---------------------------------
-
-The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
-optimal disk scan and request servicing performance (based on generic
-principles and device capabilities), optimized for:
-
-i. improved throughput
-ii. improved latency
-iii. better utilization of h/w & CPU time
-
-Characteristics:
-
-i. Binary tree
-AS and deadline i/o schedulers use red black binary trees for disk position
-sorting and searching, and a fifo linked list for time-based searching. This
-gives good scalability and good availability of information. Requests are
-almost always dispatched in disk sort order, so a cache of the next request
-in sort order is kept to avoid binary tree lookups.
-
-This arrangement is not a generic block layer characteristic, however, so
-elevators may implement queues as they please.
-
-ii. Merge hash
-AS and deadline use a hash table indexed by the last sector of a request. This
-enables merging code to quickly look up "back merge" candidates, even when
-multiple I/O streams are being performed at once on one disk.
-
-"Front merges", a new request being merged at the front of an existing request,
-are far less common than "back merges" due to the nature of most I/O patterns.
-Front merges are handled by the binary trees in AS and deadline schedulers.
-
-iii. Plugging the queue to batch requests in anticipation of opportunities for
- merge/sort optimizations
-
-Plugging is an approach that the current i/o scheduling algorithm resorts to so
-that it collects up enough requests in the queue to be able to take
-advantage of the sorting/merging logic in the elevator. If the
-queue is empty when a request comes in, then it plugs the request queue
-(sort of like plugging the bath tub of a vessel to get fluid to build up)
-till it fills up with a few more requests, before starting to service
-the requests. This provides an opportunity to merge/sort the requests before
-passing them down to the device. There are various conditions under which the
-queue is unplugged (to open up the flow again), either through a scheduled
-task or on demand. For example, wait_on_buffer sets the unplugging going
-through sync_buffer() running blk_run_address_space(mapping). Or the caller
-can do it explicitly through blk_unplug(bdev). So in the read case,
-the queue gets explicitly unplugged as part of waiting for completion on that
-buffer.
-
-Aside:
- This is kind of controversial territory, as it's not clear if plugging is
- always the right thing to do. Devices typically have their own queues,
- and allowing a big queue to build up in software, while letting the device be
- idle for a while may not always make sense. The trick is to handle the fine
- balance between when to plug and when to open up. Also now that we have
- multi-page bios being queued in one shot, we may not need to wait to merge
- a big request from the broken up pieces coming by.
-
-4.4 I/O contexts
-----------------
-
-I/O contexts provide a dynamically allocated per-process data area. They may
-be used by I/O schedulers, and by the block layer (e.g. for I/O statistics or
-priorities). See `*io_context` in block/ll_rw_blk.c, and as-iosched.c for an
-example of usage in an i/o scheduler.
-
-
-5. Scalability related changes
-==============================
-
-5.1 Granular Locking: io_request_lock replaced by a per-queue lock
-------------------------------------------------------------------
-
-The global io_request_lock has been removed as of 2.5, to avoid
-the scalability bottleneck it was causing, and has been replaced by more
-granular locking. The request queue structure has a pointer to the
-lock to be used for that queue. As a result, locking can now be
-per-queue, with a provision for sharing a lock across queues if
-necessary (e.g the scsi layer sets the queue lock pointers to the
-corresponding adapter lock, which results in a per host locking
-granularity). The locking semantics are the same, i.e. locking is
-still imposed by the block layer, grabbing the lock before
-request_fn execution, which means that lots of older drivers
-should still be SMP safe. Drivers are free to drop the queue
-lock themselves, if required. Drivers that explicitly used the
-io_request_lock for serialization need to be modified accordingly.
-Usually it's as easy as adding a global lock::
-
- static DEFINE_SPINLOCK(my_driver_lock);
-
-and passing the address to that lock to blk_init_queue().
-
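-For example, a driver built around this old-style interface then passes the
-lock when setting up its queue (a sketch only; my_request_fn is a
-hypothetical driver request function)::
-
-	q = blk_init_queue(my_request_fn, &my_driver_lock);
-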
-5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
-----------------------------------------------------------------
-
-The sector number used in the bio structure has been changed to sector_t,
-which could be defined as 64 bit in preparation for 64 bit sector support.
-
-6. Other Changes/Implications
-=============================
-
-6.1 Partition re-mapping handled by the generic block layer
------------------------------------------------------------
-
-In 2.5 some of the gendisk/partition related code has been reorganized.
-Now the generic block layer performs partition-remapping early and thus
-provides drivers with a sector number relative to the whole device, rather
-than having to take the partition number into account in order to arrive at
-the true sector number. The routine blk_partition_remap() is invoked by
-submit_bio_noacct even before invoking the queue specific ->submit_bio,
-so the i/o scheduler also gets to operate on whole disk sector numbers. This
-should typically not require changes to block drivers; a driver simply never
-gets to invoke its own partition sector offset calculations, since all bios
-sent down are offset from the beginning of the device.
-
-
-7. A Few Tips on Migration of older drivers
-===========================================
-
-Old-style drivers that just use CURRENT and ignore clustered requests
-may not need much change. The generic layer will automatically handle
-clustered requests, multi-page bios, etc. for the driver.
-
-For a low-performance driver, or hardware that is PIO driven or just doesn't
-support scatter-gather, changes should be minimal too.
-
-The following are some points to keep in mind when converting old drivers
-to bio.
-
-Drivers should use elv_next_request to pick up requests and are no longer
-supposed to handle looping directly over the request list.
-(struct request->queue has been removed)
-
-Now end_that_request_first takes an additional number_of_sectors argument.
-It used to always handle just the first buffer_head in a request; now
-it will loop and handle as many sectors (on a bio-segment granularity)
-as specified.
-
-Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
-right thing to use is bio_endio(bio) instead.
-
-If the driver is dropping the io_request_lock from its request_fn strategy,
-then it just needs to replace that with q->queue_lock instead.
-
-As described in Sec 1.1, drivers can set max sector size, max segment size,
-etc. per queue now. Drivers that used to define their own merge functions
-to handle things like this can now just use the blk_queue_* functions at
-blk_init_queue time.
-
-Drivers no longer have to map a {partition, sector offset} into the
-correct absolute location; this is done by the block layer, so where
-a driver previously received a request like this::
-
- rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */
- rq->sector = 0; /* first sector on hda5 */
-
-it will now see::
-
- rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */
- rq->sector = 123128; /* offset from start of disk */
-
-As mentioned, there is no virtual mapping of a bio. For DMA, this is
-not a problem as the driver probably never will need a virtual mapping.
-Instead it needs a bus mapping (dma_map_page for a single segment or
-dma_map_sg for scatter-gather) to be able to ship the data to the device. For
-PIO drivers (or drivers that need to revert to PIO transfer once in a
-while (IDE for example)), where the CPU is doing the actual data
-transfer, a virtual mapping is needed. If the driver supports highmem I/O
-(Sec 1.1, (ii)) it needs to use kmap_atomic or similar to temporarily map
-a bio into the virtual address space.
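-
-As a hedged sketch in the 2.5/2.6 idiom this document describes (the KM_USER0
-kmap slot argument was dropped in later kernels), a PIO driver might map and
-transfer the current bio segment roughly like this::
-
-	struct bio_vec *bvec = bio_iovec_idx(bio, bio->bi_idx);
-	char *buf = kmap_atomic(bvec->bv_page, KM_USER0) + bvec->bv_offset;
-
-	/* CPU-driven (PIO) transfer of bvec->bv_len bytes to/from buf */
-
-	kunmap_atomic(buf - bvec->bv_offset, KM_USER0);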
-
-
-8. Prior/Related/Impacted patches
-=================================
-
-8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
------------------------------------------------------
-
-- orig kiobuf & raw i/o patches (now in 2.4 tree)
-- direct kiobuf based i/o to devices (no intermediate bh's)
-- page i/o using kiobuf
-- kiobuf splitting for lvm (mkp)
-- elevator support for kiobuf request merging (axboe)
-
-8.2. Zero-copy networking (Dave Miller)
----------------------------------------
-
-8.3. SGI XFS - pagebuf patches - use of kiobufs
------------------------------------------------
-8.4. Multi-page pioent patch for bio (Christoph Hellwig)
---------------------------------------------------------
-8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
---------------------------------------------------------------------
-8.6. Async i/o implementation patch (Ben LaHaise)
--------------------------------------------------
-8.7. EVMS layering design (IBM EVMS team)
------------------------------------------
-8.8. Larger page cache size patch (Ben LaHaise) and Large page size (Daniel Phillips)
--------------------------------------------------------------------------------------
-
- => larger contiguous physical memory buffers
-
-8.9. VM reservations patch (Ben LaHaise)
-----------------------------------------
-8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
-----------------------------------------------------------
-8.11. Block device in page cache patch (Andrea Arcangeli) - now in 2.4.10+
----------------------------------------------------------------------------
-8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, Badari)
--------------------------------------------------------------------------------
-8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven)
-------------------------------------------------------------------
-8.14 IDE Taskfile i/o patch (Andre Hedrick)
---------------------------------------------
-8.15 Multi-page writeout and readahead patches (Andrew Morton)
----------------------------------------------------------------
-8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarty)
------------------------------------------------------------------------
-
-9. Other References
-===================
-
-9.1 The Splice I/O Model
-------------------------
-
-Larry McVoy (and subsequent discussions on lkml, and Linus' comments) - Jan 2001
-
-9.2 Discussions about kiobuf and bh design
-------------------------------------------
-
-On lkml between sct, linus, alan et al - Feb-March 2001 (many of the
-initial thoughts that led to bio were brought up in this discussion thread)
-
-9.3 Discussions on mempool on lkml - Dec 2001.
-----------------------------------------------
diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst
deleted file mode 100644
index 160a5148b915..000000000000
--- a/Documentation/block/capability.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-===============================
-Generic Block Device Capability
-===============================
-
-This file documents the sysfs file ``block/<disk>/capability``.
-
-``capability`` is a bitfield, printed in hexadecimal, indicating which
-capabilities a specific block device supports:
-
-.. kernel-doc:: include/linux/genhd.h
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
index 86dcf7159f99..9fea696f9daa 100644
--- a/Documentation/block/index.rst
+++ b/Documentation/block/index.rst
@@ -8,10 +8,8 @@ Block
:maxdepth: 1
bfq-iosched
- biodoc
biovecs
blk-mq
- capability
cmdline-partition
data-integrity
deadline-iosched
@@ -20,8 +18,7 @@ Block
kyber-iosched
null_blk
pr
- queue-sysfs
- request
stat
switching-sched
writeback_cache_control
+ ublk
diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst
index 7f9b40d6b416..90b733422ed4 100644
--- a/Documentation/block/inline-encryption.rst
+++ b/Documentation/block/inline-encryption.rst
@@ -1,5 +1,7 @@
.. SPDX-License-Identifier: GPL-2.0
+.. _inline_encryption:
+
=================
Inline Encryption
=================
@@ -7,230 +9,268 @@ Inline Encryption
Background
==========
-Inline encryption hardware sits logically between memory and the disk, and can
-en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a
-fixed number of "keyslots" - slots into which encryption contexts (i.e. the
-encryption key, encryption algorithm, data unit size) can be programmed by the
-kernel at any time. Each request sent to the disk can be tagged with the index
-of a keyslot (and also a data unit number to act as an encryption tweak), and
-the inline encryption hardware will en/decrypt the data in the request with the
-encryption context programmed into that keyslot. This is very different from
-full disk encryption solutions like self encrypting drives/TCG OPAL/ATA
-Security standards, since with inline encryption, any block on disk could be
-encrypted with any encryption context the kernel chooses.
-
+Inline encryption hardware sits logically between memory and disk, and can
+en/decrypt data as it goes in/out of the disk. For each I/O request, software
+can control exactly how the inline encryption hardware will en/decrypt the data
+in terms of key, algorithm, data unit size (the granularity of en/decryption),
+and data unit number (a value that determines the initialization vector(s)).
+
+Some inline encryption hardware accepts all encryption parameters including raw
+keys directly in low-level I/O requests. However, most inline encryption
+hardware instead has a fixed number of "keyslots" and requires that the key,
+algorithm, and data unit size first be programmed into a keyslot. Each
+low-level I/O request then just contains a keyslot index and data unit number.
+
+Note that inline encryption hardware is very different from traditional crypto
+accelerators, which are supported through the kernel crypto API. Traditional
+crypto accelerators operate on memory regions, whereas inline encryption
+hardware operates on I/O requests. Thus, inline encryption hardware needs to be
+managed by the block layer, not the kernel crypto API.
+
+Inline encryption hardware is also very different from "self-encrypting drives",
+such as those based on the TCG Opal or ATA Security standards. Self-encrypting
+drives don't provide fine-grained control of encryption and provide no way to
+verify the correctness of the resulting ciphertext. Inline encryption hardware
+provides fine-grained control of encryption, including the choice of key and
+initialization vector for each sector, and can be tested for correctness.
Objective
=========
-We want to support inline encryption (IE) in the kernel.
-To allow for testing, we also want a crypto API fallback when actual
-IE hardware is absent. We also want IE to work with layered devices
-like dm and loopback (i.e. we want to be able to use the IE hardware
-of the underlying devices if present, or else fall back to crypto API
-en/decryption).
-
+We want to support inline encryption in the kernel. To make testing easier, we
+also want support for falling back to the kernel crypto API when actual inline
+encryption hardware is absent. We also want inline encryption to work with
+layered devices like device-mapper and loopback (i.e. we want to be able to use
+the inline encryption hardware of the underlying devices if present, or else
+fall back to crypto API en/decryption).
Constraints and notes
=====================
-- IE hardware has a limited number of "keyslots" that can be programmed
- with an encryption context (key, algorithm, data unit size, etc.) at any time.
- One can specify a keyslot in a data request made to the device, and the
- device will en/decrypt the data using the encryption context programmed into
- that specified keyslot. When possible, we want to make multiple requests with
- the same encryption context share the same keyslot.
-
-- We need a way for upper layers like filesystems to specify an encryption
- context to use for en/decrypting a struct bio, and a device driver (like UFS)
- needs to be able to use that encryption context when it processes the bio.
-
-- We need a way for device drivers to expose their inline encryption
- capabilities in a unified way to the upper layers.
-
-
-Design
-======
-
-We add a struct bio_crypt_ctx to struct bio that can
-represent an encryption context, because we need to be able to pass this
-encryption context from the upper layers (like the fs layer) to the
-device driver to act upon.
-
-While IE hardware works on the notion of keyslots, the FS layer has no
-knowledge of keyslots - it simply wants to specify an encryption context to
-use while en/decrypting a bio.
-
-We introduce a keyslot manager (KSM) that handles the translation from
-encryption contexts specified by the FS to keyslots on the IE hardware.
-This KSM also serves as the way IE hardware can expose its capabilities to
-upper layers. The generic mode of operation is: each device driver that wants
-to support IE will construct a KSM and set it up in its struct request_queue.
-Upper layers that want to use IE on this device can then use this KSM in
-the device's struct request_queue to translate an encryption context into
-a keyslot. The presence of the KSM in the request queue shall be used to mean
-that the device supports IE.
-
-The KSM uses refcounts to track which keyslots are idle (either they have no
-encryption context programmed, or there are no in-flight struct bios
-referencing that keyslot). When a new encryption context needs a keyslot, it
-tries to find a keyslot that has already been programmed with the same
-encryption context, and if there is no such keyslot, it evicts the least
-recently used idle keyslot and programs the new encryption context into that
-one. If no idle keyslots are available, then the caller will sleep until there
-is at least one.
-
-
-blk-mq changes, other block layer changes and blk-crypto-fallback
-=================================================================
-
-We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to
-struct request. These will be referred to as the ``crypto fields``
-for the request. This ``keyslot`` is the keyslot into which the
-``bi_crypt_context`` has been programmed in the KSM of the ``request_queue``
-that this request is being sent to.
-
-We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain
-blissfully unaware of whether or not real inline encryption hardware is present
-underneath. When a bio is submitted with a target ``request_queue`` that doesn't
-support the encryption context specified with the bio, the block layer will
-en/decrypt the bio with the blk-crypto-fallback.
-
-If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio
-is encrypted and stored in the bounce bio - blk-mq will then proceed to process the
-bounce bio as if it were not encrypted at all (except when blk-integrity is
-concerned). ``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an
-internal function that cleans up the bounce bio and ends the original bio.
-
-If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``)
-is saved and overwritten by ``blk-crypto-fallback`` to
-``bio_crypto_fallback_decrypt_bio``. The bio's ``bi_crypt_context`` is also
-overwritten with ``NULL``, so that to the rest of the stack, the bio looks
-as if it was a regular bio that never had an encryption context specified.
-``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original
-``bi_end_io`` (and also ``bi_private``) and end the bio again.
-
-Regardless of whether real inline encryption hardware is used or the
+- We need a way for upper layers (e.g. filesystems) to specify an encryption
+ context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need
+ to be able to use that encryption context when they process the request.
+ Encryption contexts also introduce constraints on bio merging; the block layer
+ needs to be aware of these constraints.
+
+- Different inline encryption hardware has different supported algorithms,
+ supported data unit sizes, maximum data unit numbers, etc. We call these
+ properties the "crypto capabilities". We need a way for device drivers to
+ advertise crypto capabilities to upper layers in a generic way.
+
+- Inline encryption hardware usually (but not always) requires that keys be
+ programmed into keyslots before being used. Since programming keyslots may be
+ slow and there may not be very many keyslots, we shouldn't just program the
+ key for every I/O request, but rather keep track of which keys are in the
+ keyslots and reuse an already-programmed keyslot when possible.
+
+- Upper layers typically define a specific end-of-life for crypto keys, e.g.
+ when an encrypted directory is locked or when a crypto mapping is torn down.
+ At these times, keys are wiped from memory. We must provide a way for upper
+ layers to also evict keys from any keyslots they are present in.
+
+- When possible, device-mapper devices must be able to pass through the inline
+ encryption support of their underlying devices. However, it doesn't make
+ sense for device-mapper devices to have keyslots themselves.
+
+Basic design
+============
+
+We introduce ``struct blk_crypto_key`` to represent an inline encryption key and
+how it will be used. This includes the actual bytes of the key; the size of the
+key; the algorithm and data unit size the key will be used with; and the number
+of bytes needed to represent the maximum data unit number the key will be used
+with.
+
+We introduce ``struct bio_crypt_ctx`` to represent an encryption context. It
+contains a data unit number and a pointer to a blk_crypto_key. We add pointers
+to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users
+of the block layer (e.g. filesystems) to provide an encryption context when
+creating a bio and have it be passed down the stack for processing by the block
+layer and device drivers. Note that the encryption context doesn't explicitly
+say whether to encrypt or decrypt, as that is implicit from the direction of the
+bio; WRITE means encrypt, and READ means decrypt.
+
+We also introduce ``struct blk_crypto_profile`` to contain all generic inline
+encryption-related state for a particular inline encryption device. The
+blk_crypto_profile serves as the way that drivers for inline encryption hardware
+advertise their crypto capabilities and provide certain functions (e.g.,
+functions to program and evict keys) to upper layers. Each device driver that
+wants to support inline encryption will construct a blk_crypto_profile, then
+associate it with the disk's request_queue.
+
+The blk_crypto_profile also manages the hardware's keyslots, when applicable.
+This happens in the block layer, so that users of the block layer can just
+specify encryption contexts and don't need to know about keyslots at all, nor do
+device drivers need to care about most details of keyslot management.
+
+Specifically, for each keyslot, the block layer (via the blk_crypto_profile)
+keeps track of which blk_crypto_key that keyslot contains (if any), and how many
+in-flight I/O requests are using it. When the block layer creates a
+``struct request`` for a bio that has an encryption context, it grabs a keyslot
+that already contains the key if possible. Otherwise it waits for an idle
+keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the
+least-recently-used idle keyslot using the function the device driver provided.
+In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of
+the request, where it is then accessible to device drivers and is released after
+the request completes.
+
+``struct request`` also contains a pointer to the original bio_crypt_ctx.
+Requests can be built from multiple bios, and the block layer must take the
+encryption context into account when trying to merge bios and requests. For two
+bios/requests to be merged, they must have compatible encryption contexts: both
+unencrypted, or both encrypted with the same key and contiguous data unit
+numbers. Only the encryption context for the first bio in a request is
+retained, since the remaining bios have been verified to be merge-compatible
+with the first bio.
+
+To make it possible for inline encryption to work with request_queue based
+layered devices, when a request is cloned, its encryption context is cloned as
+well. When the cloned request is submitted, it is then processed as usual; this
+includes getting a keyslot from the clone's target device if needed.
+
+blk-crypto-fallback
+===================
+
+It is desirable for the inline encryption support of upper layers (e.g.
+filesystems) to be testable without real inline encryption hardware, and
+likewise for the block layer's keyslot management logic. It is also desirable
+to allow upper layers to just always use inline encryption rather than have to
+implement encryption in multiple ways.
+
+Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
+of inline encryption using the kernel crypto API. blk-crypto-fallback is built
+into the block layer, so it works on any block device without any special setup.
+Essentially, when a bio with an encryption context is submitted to a
+block_device that doesn't support that encryption context, the block layer will
+handle en/decryption of the bio using blk-crypto-fallback.
+
+For encryption, the data cannot be encrypted in-place, as callers usually rely
+on it being unmodified. Instead, blk-crypto-fallback allocates bounce pages,
+fills a new bio with those bounce pages, encrypts the data into those bounce
+pages, and submits that "bounce" bio. When the bounce bio completes,
+blk-crypto-fallback completes the original bio. If the original bio is too
+large, multiple bounce bios may be required; see the code for details.
+
+For decryption, blk-crypto-fallback "wraps" the bio's completion callback
+(``bi_end_io``) and private data (``bi_private``) with its own, unsets the
+bio's encryption context, then submits the bio. If the read completes
+successfully, blk-crypto-fallback restores the bio's original completion
+callback and private data, then decrypts the bio's data in-place using the
+kernel crypto API. Decryption happens from a workqueue, as it may sleep.
+Afterwards, blk-crypto-fallback completes the bio.
+
+In both cases, the bios that blk-crypto-fallback submits no longer have an
+encryption context. Therefore, lower layers only see standard unencrypted I/O.
+
+blk-crypto-fallback also defines its own blk_crypto_profile and has its own
+"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects. The reason
+for this is twofold. First, it allows the keyslot management logic to be tested
+without actual inline encryption hardware. Second, similar to actual inline
+encryption hardware, the crypto API doesn't accept keys directly in requests but
+rather requires that keys be set ahead of time, and setting keys can be
+expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path
+at all due to the locks it takes. Therefore, the concept of keyslots still
+makes sense for blk-crypto-fallback.
+
+Note that regardless of whether real inline encryption hardware or
blk-crypto-fallback is used, the ciphertext written to disk (and hence the
-on-disk format of data) will be the same (assuming the hardware's implementation
-of the algorithm being used adheres to spec and functions correctly).
-
-If a ``request queue``'s inline encryption hardware claimed to support the
-encryption context specified with a bio, then it will not be handled by the
-``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a
-struct request needs to be allocated for that bio. At that point,
-blk-mq tries to program the encryption context into the ``request_queue``'s
-keyslot_manager, and obtain a keyslot, which it stores in its newly added
-``keyslot`` field. This keyslot is released when the request is completed.
-
-When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called,
-which sets the request's ``crypt_ctx`` to a copy of the bio's
-``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent
-bio is merged to the front of the request, which updates the ``crypt_ctx`` of
-the request so that it matches the newly merged bio's ``bi_crypt_context``. In
-particular, the request keeps a copy of the ``bi_crypt_context`` of the first
-bio in its bio-list (blk-mq needs to be careful to maintain this invariant
-during bio and request merges).
-
-To make it possible for inline encryption to work with request queue based
-layered devices, when a request is cloned, its ``crypto fields`` are cloned as
-well. When the cloned request is submitted, blk-mq programs the
-``bi_crypt_context`` of the request into the clone's request_queue's keyslot
-manager, and stores the returned keyslot in the clone's ``keyslot``.
+on-disk format of data) will be the same (assuming that both the inline
+encryption hardware's implementation and the kernel crypto API's implementation
+of the algorithm being used adhere to spec and function correctly).
+blk-crypto-fallback is optional and is controlled by the
+``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option.
API presented to users of the block layer
=========================================
-``struct blk_crypto_key`` represents a crypto key (the raw key, size of the
-key, the crypto algorithm to use, the data unit size to use, and the number of
-bytes required to represent data unit numbers that will be specified with the
-``bi_crypt_context``).
-
-``blk_crypto_init_key`` allows upper layers to initialize such a
-``blk_crypto_key``.
-
-``bio_crypt_set_ctx`` should be called on any bio that a user of
-the block layer wants en/decrypted via inline encryption (or the
-blk-crypto-fallback, if hardware support isn't available for the desired
-crypto configuration). This function takes the ``blk_crypto_key`` and the
-data unit number (DUN) to use when en/decrypting the bio.
-
-``blk_crypto_config_supported`` allows upper layers to query whether or not
-an encryption context passed to a request queue can be handled by blk-crypto
-(either by real inline encryption hardware, or by the blk-crypto-fallback).
-This is useful e.g. when blk-crypto-fallback is disabled, and the upper layer
-wants to use an algorithm that may not be supported by hardware - this
-function lets the upper layer know ahead of time that the algorithm isn't
-supported, and the upper layer can fall back to something else if appropriate.
-
-``blk_crypto_start_using_key`` - Upper layers must call this function on
-``blk_crypto_key`` and a ``request_queue`` before using the key with any bio
-headed for that ``request_queue``. This function ensures that either the
-hardware supports the key's crypto settings, or the crypto API fallback has
-transforms for the needed mode allocated and ready to go. Note that this
-function may allocate an ``skcipher``, and must not be called from the data
-path, since allocating ``skciphers`` from the data path can deadlock.
-
-``blk_crypto_evict_key`` *must* be called by upper layers before a
-``blk_crypto_key`` is freed. Further, it *must* be called only once
-there are no more in-flight requests that use that ``blk_crypto_key``.
-``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots
-in inline encryption hardware that the key might have been programmed into
-(or from the blk-crypto-fallback).
+``blk_crypto_config_supported()`` allows users to check ahead of time whether
+inline encryption with particular crypto settings will work on a particular
+block_device -- either via hardware or via blk-crypto-fallback. This function
+takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
+the actual bytes of the key and instead just contains the algorithm, data unit
+size, etc. This function can be useful if blk-crypto-fallback is disabled.
+
+``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
+
+Users must call ``blk_crypto_start_using_key()`` before actually starting to use
+a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
+was called earlier). This is needed to initialize blk-crypto-fallback if it
+will be needed. This must not be called from the data path, as this may have to
+allocate resources, which may deadlock in that case.
+
+Next, to attach an encryption context to a bio, users should call
+``bio_crypt_set_ctx()``. This function allocates a bio_crypt_ctx and attaches
+it to a bio, given the blk_crypto_key and the data unit number that will be used
+for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
+later, as that happens automatically when the bio is freed or reset.
+
+Finally, when done using inline encryption with a blk_crypto_key on a
+block_device, users must call ``blk_crypto_evict_key()``. This ensures that
+the key is evicted from all keyslots it may be programmed into and unlinked from
+any kernel data structures it may be linked into.
+
+In summary, for users of the block layer, the lifecycle of a blk_crypto_key is
+as follows:
+
+1. ``blk_crypto_config_supported()`` (optional)
+2. ``blk_crypto_init_key()``
+3. ``blk_crypto_start_using_key()``
+4. ``bio_crypt_set_ctx()`` (potentially many times)
+5. ``blk_crypto_evict_key()`` (after all I/O has completed)
+6. Zeroize the blk_crypto_key (this has no dedicated function)
+
+If a blk_crypto_key is being used on multiple block_devices, then
+``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
+and ``blk_crypto_evict_key()`` must be called on each block_device.
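+
+As a rough illustration of this lifecycle (a sketch only; error handling is
+abbreviated, ``raw_key``, ``bdev`` and ``bio`` are assumed to come from the
+surrounding context, and exact signatures vary across kernel versions)::
+
+	struct blk_crypto_key key;
+	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { 0 };
+	int err;
+
+	err = blk_crypto_init_key(&key, raw_key,
+				  BLK_ENCRYPTION_MODE_AES_256_XTS,
+				  8, 4096);
+	if (!err)
+		err = blk_crypto_start_using_key(bdev, &key);
+	if (err)
+		return err;
+
+	bio_crypt_set_ctx(bio, &key, dun, GFP_NOIO);
+	submit_bio(bio);
+
+	/* ... after all I/O using the key has completed ... */
+	blk_crypto_evict_key(bdev, &key);
+	memzero_explicit(&key, sizeof(key));
+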
API presented to device drivers
===============================
-A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in
-the ``request_queue`` of the device. The device driver needs to call
-``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the
-``blk_keyslot_manager``, while specifying the number of keyslots supported by
-the hardware.
-
-The device driver also needs to tell the KSM how to actually manipulate the
-IE hardware in the device to do things like programming a crypto key into
-a particular keyslot. All this is achieved through the
-struct blk_ksm_ll_ops field in the KSM that the device driver
-must fill in after initializing the ``blk_keyslot_manager``.
-
-The KSM also handles runtime power management for the device when applicable
-(e.g. when it wants to program a crypto key into the IE hardware, the device
-must be runtime powered on) - so the device driver must also set the ``dev``
-field in the ksm to point to the `struct device` for the KSM to use for runtime
-power management.
-
-``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device
-needs each and every one of its keyslots to be reprogrammed with the key it
-"should have" at the point in time when the function is called. This is useful
-e.g. if a device loses all its keys on runtime power down/up.
-
-If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then
-``blk_ksm_destroy`` should be called to free up all resources used by a
-``blk_keyslot_manager`` once it is no longer needed.
+A device driver that wants to support inline encryption must set up a
+blk_crypto_profile in the request_queue of its device. To do this, it first
+must call ``blk_crypto_profile_init()`` (or its resource-managed variant
+``devm_blk_crypto_profile_init()``), providing the number of keyslots.
+
+Next, it must advertise its crypto capabilities by setting fields in the
+blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``.
+
+It then must set function pointers in the ``ll_ops`` field of the
+blk_crypto_profile to tell upper layers how to control the inline encryption
+hardware, e.g. how to program and evict keyslots. Most drivers will need to
+implement ``keyslot_program`` and ``keyslot_evict``. For details, see the
+comments for ``struct blk_crypto_ll_ops``.
+
+Once the driver registers a blk_crypto_profile with a request_queue, I/O
+requests the driver receives via that queue may have an encryption context. All
+encryption contexts will be compatible with the crypto capabilities declared in
+the blk_crypto_profile, so drivers don't need to worry about handling
+unsupported requests. Also, if a nonzero number of keyslots was declared in the
+blk_crypto_profile, then all I/O requests that have an encryption context will
+also have a keyslot which was already programmed with the appropriate key.
+
+If the driver implements runtime suspend and its blk_crypto_ll_ops don't work
+while the device is runtime-suspended, then the driver must also set the ``dev``
+field of the blk_crypto_profile to point to the ``struct device`` that will be
+resumed before any of the low-level operations are called.
+
+If there are situations where the inline encryption hardware loses the contents
+of its keyslots, e.g. device resets, the driver must handle reprogramming the
+keyslots. To do this, the driver may call ``blk_crypto_reprogram_all_keys()``.
+
+Finally, if the driver used ``blk_crypto_profile_init()`` instead of
+``devm_blk_crypto_profile_init()``, then it is responsible for calling
+``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed.
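+
+Putting this together, a driver's setup might look roughly like the following
+(a hedged sketch; the ``my_`` names are hypothetical, and the exact fields
+and helpers vary across kernel versions)::
+
+	static int my_keyslot_program(struct blk_crypto_profile *profile,
+				      const struct blk_crypto_key *key,
+				      unsigned int slot)
+	{
+		/* program the raw key bytes into hardware keyslot @slot */
+		return 0;
+	}
+
+	static int my_keyslot_evict(struct blk_crypto_profile *profile,
+				    const struct blk_crypto_key *key,
+				    unsigned int slot)
+	{
+		/* wipe hardware keyslot @slot */
+		return 0;
+	}
+
+	static const struct blk_crypto_ll_ops my_crypto_ll_ops = {
+		.keyslot_program	= my_keyslot_program,
+		.keyslot_evict		= my_keyslot_evict,
+	};
+
+	/* in the driver's probe routine: */
+	err = devm_blk_crypto_profile_init(dev, &my_profile, num_keyslots);
+	if (err)
+		return err;
+	my_profile.ll_ops = my_crypto_ll_ops;
+	my_profile.max_dun_bytes_supported = 8;
+	/* advertise AES-256-XTS with 4096-byte data units */
+	my_profile.modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] |= 4096;
+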
Layered Devices
===============
-Request queue based layered devices like dm-rq that wish to support IE need to
-create their own keyslot manager for their request queue, and expose whatever
-functionality they choose. When a layered device wants to pass a clone of that
-request to another ``request_queue``, blk-crypto will initialize and prepare the
-clone as necessary - see ``blk_crypto_insert_cloned_request`` in
-``blk-crypto.c``.
-
-
-Future Optimizations for layered devices
-========================================
-
-Creating a keyslot manager for a layered device uses up memory for each
-keyslot, and in general, a layered device merely passes the request on to a
-"child" device, so the keyslots in the layered device itself are completely
-unused, and don't need any refcounting or keyslot programming. We can instead
-define a new type of KSM; the "passthrough KSM", that layered devices can use
-to advertise an unlimited number of keyslots, and support for any encryption
-algorithms they choose, while not actually using any memory for each keyslot.
-Another use case for the "passthrough KSM" is for IE devices that do not have a
-limited number of keyslots.
-
+Request queue based layered devices like dm-rq that wish to support inline
+encryption need to create their own blk_crypto_profile for their request_queue,
+and expose whatever functionality they choose. When a layered device wants to
+pass a clone of that request to another request_queue, blk-crypto will
+initialize and prepare the clone as necessary.
Interaction between inline encryption and blk integrity
=======================================================
@@ -257,7 +297,7 @@ Because there isn't any real hardware yet, it seems prudent to assume that
hardware implementations might not implement both features together correctly,
and disallow the combination for now. Whenever a device supports integrity, the
kernel will pretend that the device does not support hardware inline encryption
-(by essentially setting the keyslot manager in the request_queue of the device
-to NULL). When the crypto API fallback is enabled, this means that all bios with
-and encryption context will use the fallback, and IO will complete as usual.
-When the fallback is disabled, a bio with an encryption context will be failed.
+(by setting the blk_crypto_profile in the request_queue of the device to NULL).
+When the crypto API fallback is enabled, this means that all bios with an
+encryption context will use the fallback, and IO will complete as usual. When
+the fallback is disabled, a bio with an encryption context will be failed.
diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst
index edbbab2f12f8..4dd78f24d10a 100644
--- a/Documentation/block/null_blk.rst
+++ b/Documentation/block/null_blk.rst
@@ -72,6 +72,28 @@ submit_queues=[1..nr_cpus]: Default: 1
hw_queue_depth=[0..qdepth]: Default: 64
The hardware queue depth of the device.
+memory_backed=[0/1]: Default: 0
+ Whether or not to use a memory buffer to respond to IO requests
+
+ = =============================================
+ 0 Transfer no data in response to IO requests
+ 1 Use a memory buffer to respond to IO requests
+ = =============================================
+
+discard=[0/1]: Default: 0
+ Support discard operations (requires memory-backed null_blk device).
+
+ = =====================================
+ 0 Do not support discard operations
+ 1 Enable support for discard operations
+ = =====================================
+
+cache_size=[Size in MB]: Default: 0
+ Cache size in MB for memory-backed device.
+
+mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit)
+ Bandwidth limit for device performance.
+
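+For example, a throttled, memory-backed null_blk device with discard support
+might be created with (an illustrative invocation using only the parameters
+documented above)::
+
+	modprobe null_blk memory_backed=1 discard=1 cache_size=64 mbps=100
+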
Multi-queue specific parameters
-------------------------------
diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
deleted file mode 100644
index 4dc7f0d499a8..000000000000
--- a/Documentation/block/queue-sysfs.rst
+++ /dev/null
@@ -1,289 +0,0 @@
-=================
-Queue sysfs files
-=================
-
-This text file will detail the queue files that are located in the sysfs tree
-for each block device. Note that stacked devices typically do not export
-any settings, since their queue merely functions as a remapping target.
-These files are the ones found in the /sys/block/xxx/queue/ directory.
-
-Files denoted with a RO postfix are readonly and the RW postfix means
-read-write.
-
-add_random (RW)
----------------
-This file allows one to turn off the disk entropy contribution. The default
-value of this file is '1' (on).
-
-chunk_sectors (RO)
-------------------
-This has different meaning depending on the type of the block device.
-For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
-of the RAID volume stripe segment. For a zoned block device, either host-aware
-or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
-of the device, with the eventual exception of the last zone of the device which
-may be smaller.
-
-dax (RO)
---------
-This file indicates whether the device supports Direct Access (DAX),
-used by CPU-addressable storage to bypass the pagecache. It shows '1'
-if true, '0' if not.
-
-discard_granularity (RO)
-------------------------
-This shows the size of the internal allocation unit of the device in bytes,
-if reported by the device. A value of '0' means the device does not support
-the discard functionality.
-
-discard_max_hw_bytes (RO)
--------------------------
-Devices that support discard functionality may have internal limits on
-the number of bytes that can be trimmed or unmapped in a single operation.
-The discard_max_hw_bytes parameter is set by the device driver to the
-maximum number of bytes that can be discarded in a single operation.
-Discard requests issued to the device must not exceed this limit. A
-discard_max_hw_bytes value of 0 means that the device does not support
-discard functionality.
-
-discard_max_bytes (RW)
-----------------------
-While discard_max_hw_bytes is the hardware limit for the device, this
-setting is the software limit. Some devices exhibit large latencies when
-large discards are issued; setting this value lower will make Linux issue
-smaller discards and potentially help reduce latencies induced by large
-discard operations.
-
-discard_zeroes_data (RO)
-------------------------
-Obsolete. Always zero.
-
-fua (RO)
---------
-Whether or not the block driver supports the FUA flag for write requests.
-FUA stands for Force Unit Access. If the FUA flag is set that means that
-write requests must bypass the volatile cache of the storage device.
-
-hw_sector_size (RO)
--------------------
-This is the hardware sector size of the device, in bytes.
-
-io_poll (RW)
-------------
-When read, this file shows whether polling is enabled (1) or disabled
-(0). Writing '0' to this file will disable polling for this device.
-Writing any non-zero value will enable this feature.
-
-io_poll_delay (RW)
-------------------
-If polling is enabled, this controls what kind of polling will be
-performed. It defaults to -1, which is classic polling. In this mode,
-the CPU will repeatedly ask for completions without giving up any time.
-If set to 0, a hybrid polling mode is used, where the kernel will attempt
-to make an educated guess at when the IO will complete. Based on this
-guess, the kernel will put the process issuing IO to sleep for an amount
-of time, before entering a classic poll loop. This mode might be a
-little slower than pure classic polling, but it will be more efficient.
-If set to a value larger than 0, the kernel will put the process issuing
-IO to sleep for this amount of microseconds before entering classic
-polling.
-
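-For example, to switch a device to hybrid polling (the device name is
-illustrative)::
-
-	echo 0 > /sys/block/nvme0n1/queue/io_poll_delay
-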
-io_timeout (RW)
----------------
-io_timeout is the request timeout in milliseconds. If a request does not
-complete in this time then the block driver timeout handler is invoked.
-That timeout handler can decide to retry the request, to fail it or to start
-a device recovery strategy.
-
-iostats (RW)
--------------
-This file is used to control (on/off) the iostats accounting of the
-disk.
-
-logical_block_size (RO)
------------------------
-This is the logical block size of the device, in bytes.
-
-max_discard_segments (RO)
--------------------------
-The maximum number of DMA scatter/gather entries in a discard request.
-
-max_hw_sectors_kb (RO)
-----------------------
-This is the maximum number of kilobytes supported in a single data transfer.
-
-max_integrity_segments (RO)
----------------------------
-Maximum number of elements in a DMA scatter/gather list with integrity
-data that will be submitted by the block layer core to the associated
-block driver.
-
-max_active_zones (RO)
----------------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), the sum of zones belonging to any of the zone states:
-EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value.
-If this value is 0, there is no limit.
-
-If the host attempts to exceed this limit, the driver should report this error
-with BLK_STS_ZONE_ACTIVE_RESOURCE, which user space may see as the EOVERFLOW
-errno.
-
-max_open_zones (RO)
--------------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), the sum of zones belonging to any of the zone states:
-EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value.
-If this value is 0, there is no limit.
-
-If the host attempts to exceed this limit, the driver should report this error
-with BLK_STS_ZONE_OPEN_RESOURCE, which user space may see as the ETOOMANYREFS
-errno.
-
-max_sectors_kb (RW)
--------------------
-This is the maximum number of kilobytes that the block layer will allow
-for a filesystem request. Must be smaller than or equal to the maximum
-size allowed by the hardware.
-
-max_segments (RO)
------------------
-Maximum number of elements in a DMA scatter/gather list that is submitted
-to the associated block driver.
-
-max_segment_size (RO)
----------------------
-Maximum size in bytes of a single element in a DMA scatter/gather list.
-
-minimum_io_size (RO)
---------------------
-This is the smallest preferred IO size reported by the device.
-
-nomerges (RW)
--------------
-This enables the user to disable the lookup logic involved with IO
-merging requests in the block layer. By default (0) all merges are
-enabled. When set to 1 only simple one-hit merges will be tried. When
-set to 2 no merge algorithms will be tried (including one-hit or more
-complex tree/hash lookups).
-
-nr_requests (RW)
-----------------
-This controls how many requests may be allocated in the block layer for
-read or write requests. Note that the total allocated number may be twice
-this amount, since it applies only to reads or writes (not the accumulated
-sum).
-
-To avoid priority inversion through request starvation, a request
-queue maintains a separate request pool per each cgroup when
-CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
-per-block-cgroup request pool. IOW, if there are N block cgroups,
-each request queue may have up to N request pools, each independently
-regulated by nr_requests.
-
-nr_zones (RO)
--------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), this indicates the total number of zones of the device.
-This is always 0 for regular block devices.
-
-optimal_io_size (RO)
---------------------
-This is the optimal IO size reported by the device.
-
-physical_block_size (RO)
-------------------------
-This is the physical block size of device, in bytes.
-
-read_ahead_kb (RW)
-------------------
-Maximum number of kilobytes to read-ahead for filesystems on this block
-device.
-
-rotational (RW)
----------------
-This file is used to state whether the device is of rotational or
-non-rotational type.
-
-rq_affinity (RW)
-----------------
-If this option is '1', the block layer will migrate request completions to the
-cpu "group" that originally submitted the request. For some workloads this
-provides a significant reduction in CPU cycles due to caching effects.
-
-For storage configurations that need to maximize the distribution of
-completion processing, setting this option to '2' forces the completion to
-run on the requesting cpu (bypassing the "group" aggregation logic).
-
-scheduler (RW)
---------------
-When read, this file will display the current and available IO schedulers
-for this block device. The currently active IO scheduler will be enclosed
-in [] brackets. Writing an IO scheduler name to this file will switch
-control of this block device to that new IO scheduler. Note that writing
-an IO scheduler name to this file will attempt to load that IO scheduler
-module, if it isn't already present in the system.
-
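-For example (device and scheduler names are illustrative)::
-
-	# cat /sys/block/sda/queue/scheduler
-	[mq-deadline] kyber bfq none
-	# echo bfq > /sys/block/sda/queue/scheduler
-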
-write_cache (RW)
-----------------
-When read, this file will display whether the device has write back
-caching enabled or not. It will return "write back" for the former
-case, and "write through" for the latter. Writing to this file can
-change the kernel's view of the device, but it doesn't alter the
-device state. This means that it might not be safe to toggle the
-setting from "write back" to "write through", since that will also
-eliminate cache flushes issued by the kernel.
-
-write_same_max_bytes (RO)
--------------------------
-This is the number of bytes the device can write in a single write-same
-command. A value of '0' means write-same is not supported by this
-device.
-
-wbt_lat_usec (RW)
------------------
-If the device is registered for writeback throttling, then this file shows
-the target minimum read latency. If this latency is exceeded in a given
-window of time (see wb_window_usec), then the writeback throttling will start
-scaling back writes. Writing a value of '0' to this file disables the
-feature. Writing a value of '-1' to this file resets the value to the
-default setting.
-
-throttle_sample_time (RW)
--------------------------
-This is the time window over which blk-throttle samples data, in
-milliseconds. blk-throttle makes decisions based on the samplings. A lower
-time means cgroups have smoother throughput, but higher CPU overhead. This
-exists only when CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
-
-write_zeroes_max_bytes (RO)
----------------------------
-For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of
-bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES
-is not supported.
-
-zone_append_max_bytes (RO)
---------------------------
-This is the maximum number of bytes that can be written to a sequential
-zone of a zoned block device using a zone append write operation
-(REQ_OP_ZONE_APPEND). This value is always 0 for regular block devices.
-
-zoned (RO)
-----------
-This indicates if the device is a zoned block device and the zone model of the
-device if it is indeed zoned. The possible values indicated by zoned are
-"none" for regular block devices and "host-aware" or "host-managed" for zoned
-block devices. The characteristics of host-aware and host-managed zoned block
-devices are described in the ZBC (Zoned Block Commands) and ZAC
-(Zoned Device ATA Command Set) standards. These standards also define the
-"drive-managed" zone model. However, since drive-managed zoned block devices
-do not support zone commands, they will be treated as regular block devices
-and zoned will report "none".
-
-zone_write_granularity (RO)
----------------------------
-This indicates the alignment constraint, in bytes, for write operations in
-sequential zones of zoned block devices (devices with a zoned attribute
-that reports "host-managed" or "host-aware"). This value is always 0 for
-regular block devices.
-
-Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/block/request.rst b/Documentation/block/request.rst
deleted file mode 100644
index 747021e1ffdb..000000000000
--- a/Documentation/block/request.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-============================
-struct request documentation
-============================
-
-Jens Axboe <jens.axboe@oracle.com> 27/05/02
-
-
-.. FIXME:
-   No idea what this means - seems to be just noise, so it is commented out
-
- 1.0
- Index
-
- 2.0 Struct request members classification
-
- 2.1 struct request members explanation
-
- 3.0
-
-
- 2.0
-
-
-
-Short explanation of request members
-====================================
-
-Classification flags:
-
- = ====================
- D driver member
- B block layer member
- I I/O scheduler member
- = ====================
-
-Unless an entry contains a D classification, a device driver must not access
-this member. Some members may contain D classifications, but should only be
-accessed through certain macros or functions (e.g. ->flags).
-
-<linux/blkdev.h>
-
-=============================== ======= =======================================
-Member Flag Comment
-=============================== ======= =======================================
-struct list_head queuelist BI Organization on various internal
- queues
-
-``void *elevator_private`` I I/O scheduler private data
-
-unsigned char cmd[16] D Driver can use this for setting up
- a cdb before execution, see
- blk_queue_prep_rq
-
-unsigned long flags DBI Contains info about data direction,
- request type, etc.
-
-int rq_status D Request status bits
-
-kdev_t rq_dev DBI Target device
-
-int errors DB Error counts
-
-sector_t sector DBI Target location
-
-sector_t hard_sector            B       Used to keep sector sane
-
-unsigned long nr_sectors DBI Total number of sectors in request
-
-unsigned long hard_nr_sectors B Used to keep nr_sectors sane
-
-unsigned short nr_phys_segments DB Number of physical scatter gather
- segments in a request
-
-unsigned short nr_hw_segments DB Number of hardware scatter gather
- segments in a request
-
-unsigned int current_nr_sectors DB Number of sectors in first segment
- of request
-
-unsigned int hard_cur_sectors B Used to keep current_nr_sectors sane
-
-int tag DB TCQ tag, if assigned
-
-``void *special`` D Free to be used by driver
-
-``char *buffer`` D Map of first segment, also see
- section on bouncing SECTION
-
-``struct completion *waiting`` D Can be used by driver to get signalled
- on request completion
-
-``struct bio *bio`` DBI First bio in request
-
-``struct bio *biotail`` DBI Last bio in request
-
-``struct request_queue *q`` DB Request queue this request belongs to
-
-``struct request_list *rl`` B Request list this request came from
-=============================== ======= =======================================
diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst
new file mode 100644
index 000000000000..1713b2890abb
--- /dev/null
+++ b/Documentation/block/ublk.rst
@@ -0,0 +1,326 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================================
+Userspace block device driver (ublk driver)
+===========================================
+
+Overview
+========
+
+ublk is a generic framework for implementing block device logic from userspace.
+The motivation behind it is that moving virtual block drivers such as loop and
+nbd into userspace can be very helpful. It also eases implementing new virtual
+block devices such as ublk-qcow2 (there have been several attempts at
+implementing a qcow2 driver in the kernel).
+
+Userspace block devices are attractive because:
+
+- They can be written in many programming languages.
+- They can use libraries that are not available in the kernel.
+- They can be debugged with tools familiar to application developers.
+- Crashes do not cause a kernel panic on the machine.
+- Bugs are likely to have a lower security impact than bugs in kernel
+ code.
+- They can be installed and updated independently of the kernel.
+- They can be used to simulate block devices easily with user-specified
+ parameters/settings for test/debug purposes.
+
+A ublk block device (``/dev/ublkb*``) is added by the ublk driver. Any IO
+request on the device will be forwarded to the ublk userspace program. For
+convenience, in this document, ``ublk server`` refers to a generic ublk
+userspace program. ``ublksrv`` [#userspace]_ is one such implementation. It
+provides the ``libublksrv`` [#userspace_lib]_ library for conveniently
+developing specific userspace block devices, while generic block device types,
+such as loop and null, are also included. Richard W.M. Jones wrote the userspace
+nbd device ``nbdublk`` [#userspace_nbdublk]_ based on ``libublksrv`` [#userspace_lib]_.
+
+After the IO is handled by userspace, the result is committed back to the
+driver, thus completing the request cycle. This way, any specific IO handling
+logic is totally done by userspace, such as loop's IO handling, NBD's IO
+communication, or qcow2's IO mapping.
+
+``/dev/ublkb*`` is driven by a blk-mq request-based driver. Each request is
+assigned a queue-wide unique tag. The ublk server assigns a unique tag to each
+IO too, which is 1:1 mapped with the IO of ``/dev/ublkb*``.
+
+Both forwarding the IO request and committing the IO handling result are done
+via ``io_uring`` passthrough commands; that is why ublk is also an io_uring
+based block driver. It has been observed that io_uring passthrough commands can
+give better IOPS than block IO; this is one reason why ublk is a
+high-performance implementation of a userspace block device: not only is the IO
+request communication done via io_uring, but the preferred IO handling approach
+in the ublk server is io_uring based too.
+
+ublk provides a control interface to set/get ublk block device parameters.
+The interface is extendable and kabi compatible: basically any ublk request
+queue parameter or ublk generic feature parameter can be set/get via the
+interface. Thus, ublk is a generic userspace block device framework.
+For example, it is easy to set up a ublk device with specified block
+parameters from userspace.
+
+Using ublk
+==========
+
+ublk requires a userspace ublk server to handle the real block device logic.
+
+Below is an example of using ``ublksrv`` to provide a ublk-based loop device.
+
+- add a device::
+
+ ublk add -t loop -f ublk-loop.img
+
+- format with xfs, then use it::
+
+ mkfs.xfs /dev/ublkb0
+ mount /dev/ublkb0 /mnt
+ # do anything. all IOs are handled by io_uring
+ ...
+ umount /mnt
+
+- list the devices with their info::
+
+ ublk list
+
+- delete the device::
+
+ ublk del -a
+ ublk del -n $ublk_dev_id
+
+See usage details in README of ``ublksrv`` [#userspace_readme]_.
+
+Design
+======
+
+Control plane
+-------------
+
+The ublk driver provides a global misc device node (``/dev/ublk-control``) for
+managing and controlling ublk devices with the help of several control commands:
+
+- ``UBLK_CMD_ADD_DEV``
+
+ Add a ublk char device (``/dev/ublkc*``) which the ublk server talks to for
+ IO command communication. Basic device info is sent together with this
+ command via the UAPI structure ``ublksrv_ctrl_dev_info``, which includes
+ ``nr_hw_queues``, ``queue_depth``, and the max IO request buffer size. The
+ info is negotiated with the driver and sent back to the server.
+ When this command is completed, the basic device info is immutable.
+
+- ``UBLK_CMD_SET_PARAMS`` / ``UBLK_CMD_GET_PARAMS``
+
+ Set or get parameters of the device, which can be either generic feature
+ related, or request queue limit related, but can't be IO logic specific,
+ because the driver does not handle any IO logic. This command has to be
+ sent before sending ``UBLK_CMD_START_DEV``.
+
+- ``UBLK_CMD_START_DEV``
+
+ After the server prepares userspace resources (such as creating per-queue
+ pthread & io_uring for handling ublk IO), this command is sent to the
+ driver for allocating & exposing ``/dev/ublkb*``. Parameters set via
+ ``UBLK_CMD_SET_PARAMS`` are applied for creating the device.
+
+- ``UBLK_CMD_STOP_DEV``
+
+ Halt IO on ``/dev/ublkb*`` and remove the device. When this command returns,
+ the ublk server will release resources (such as destroying the per-queue
+ pthread & io_uring).
+
+- ``UBLK_CMD_DEL_DEV``
+
+ Remove ``/dev/ublkc*``. When this command returns, the allocated ublk device
+ number can be reused.
+
+- ``UBLK_CMD_GET_QUEUE_AFFINITY``
+
+ When ``/dev/ublkc*`` is added, the driver creates the block layer tagset, so
+ that each queue's affinity info is available. The server sends
+ ``UBLK_CMD_GET_QUEUE_AFFINITY`` to retrieve the queue affinity info, with
+ which it can set up the per-queue context efficiently, such as binding
+ affine CPUs to the IO pthread and trying to allocate buffers in the IO
+ thread context.
+
+- ``UBLK_CMD_GET_DEV_INFO``
+
+ For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's
+ responsibility to save IO target specific info in userspace.
+
+- ``UBLK_CMD_GET_DEV_INFO2``
+
+ Serves the same purpose as ``UBLK_CMD_GET_DEV_INFO``, but the ublk server
+ has to provide the path of the ``/dev/ublkc*`` char device so the kernel can
+ run a permission check. This command was added to support unprivileged ublk
+ devices and was introduced together with ``UBLK_F_UNPRIVILEGED_DEV``.
+ Only the user owning the requested device can retrieve the device info.
+
+ How to deal with userspace/kernel compatibility:
+
+ 1) if the kernel is capable of handling ``UBLK_F_UNPRIVILEGED_DEV``
+
+    If the ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``:
+
+    The ublk server should send ``UBLK_CMD_GET_DEV_INFO2`` whenever an
+    unprivileged application needs to query devices that the current user
+    owns. Since the capability information is stateless, the application
+    cannot know whether ``UBLK_F_UNPRIVILEGED_DEV`` is set, so it should
+    always retrieve device info via ``UBLK_CMD_GET_DEV_INFO2``.
+
+    If the ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``:
+
+    ``UBLK_CMD_GET_DEV_INFO`` is always sent to the kernel, and the
+    ``UBLK_F_UNPRIVILEGED_DEV`` feature isn't available to the user.
+
+ 2) if the kernel isn't capable of handling ``UBLK_F_UNPRIVILEGED_DEV``
+
+    If the ublk server supports ``UBLK_F_UNPRIVILEGED_DEV``:
+
+    ``UBLK_CMD_GET_DEV_INFO2`` is tried first and will fail; then
+    ``UBLK_CMD_GET_DEV_INFO`` needs to be retried, since
+    ``UBLK_F_UNPRIVILEGED_DEV`` can't be set.
+
+    If the ublk server doesn't support ``UBLK_F_UNPRIVILEGED_DEV``:
+
+    ``UBLK_CMD_GET_DEV_INFO`` is always sent to the kernel, and the
+    ``UBLK_F_UNPRIVILEGED_DEV`` feature isn't available to the user.
+
+- ``UBLK_CMD_START_USER_RECOVERY``
+
+ This command is valid if the ``UBLK_F_USER_RECOVERY`` feature is enabled. This
+ command is accepted after the old process has exited, the ublk device is
+ quiesced and ``/dev/ublkc*`` is released. The user should send this command
+ before starting a new process which re-opens ``/dev/ublkc*``. When this
+ command returns, the ublk device is ready for the new process.
+
+- ``UBLK_CMD_END_USER_RECOVERY``
+
+ This command is valid if the ``UBLK_F_USER_RECOVERY`` feature is enabled. This
+ command is accepted after the ublk device is quiesced and a new process has
+ opened ``/dev/ublkc*`` and got all ublk queues ready. When this command
+ returns, the ublk device is unquiesced and new I/O requests are passed to the
+ new process.
+
+- user recovery feature description
+
+ Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and
+ ``UBLK_F_USER_RECOVERY_REISSUE``.
+
+ With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon (the ublk server's IO
+ handler) dies, ublk does not delete ``/dev/ublkb*`` during the whole
+ recovery stage, and the ublk device ID is kept. It is the ublk server's
+ responsibility to recover the device context using its own knowledge.
+ Requests which have not been issued to userspace are requeued. Requests
+ which have been issued to userspace are aborted.
+
+ With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon (the ublk
+ server's IO handler) dies, contrary to ``UBLK_F_USER_RECOVERY``,
+ requests which have been issued to userspace are requeued and will be
+ re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``.
+ ``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends that tolerate
+ double writes, since the driver may issue the same I/O request twice. It
+ might be useful for a read-only FS or a VM backend.
+
+Unprivileged ublk devices are supported by passing ``UBLK_F_UNPRIVILEGED_DEV``.
+Once the flag is set, all control commands can be sent by an unprivileged
+user. Except for the ``UBLK_CMD_ADD_DEV`` command, a permission check on
+the specified char device (``/dev/ublkc*``) is done by the ublk driver for all
+other control commands; to make this possible, the path of the char device has
+to be provided in these commands' payload by the ublk server. This way, ublk
+devices become container-aware, and a device created in one container can be
+controlled/accessed only inside that container.
+
+Data plane
+----------
+
+The ublk server needs to create a per-queue IO pthread & io_uring for handling
+IO commands via io_uring passthrough. The per-queue IO pthread
+focuses on IO handling and shouldn't handle any control & management
+tasks.
+
+Each IO command is assigned a unique tag, which is 1:1 mapped with the IO
+request of ``/dev/ublkb*``.
+
+The UAPI structure ``ublksrv_io_desc`` is defined for describing each IO from
+the driver. A fixed mmapped area (array) on ``/dev/ublkc*`` is provided for
+exporting IO info to the server, such as the IO offset, length, OP/flags and
+buffer address. Each ``ublksrv_io_desc`` instance can be indexed via queue id
+and IO tag directly.
+
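+The following is a rough C sketch of mapping and indexing this area. It is
+illustrative only: the per-queue mmap offset and size calculations below are
+assumptions, and the authoritative layout is defined by the UAPI header
+``<linux/ublk_cmd.h>`` and the ``ublksrv`` sources.
+
+::
+
+  #include <unistd.h>
+  #include <sys/mman.h>
+  #include <linux/ublk_cmd.h>   /* struct ublksrv_io_desc */
+
+  /* Assumed layout: page-aligned, back-to-back per-queue descriptor arrays */
+  static const struct ublksrv_io_desc *
+  map_io_descs(int ublkc_fd, unsigned int q_id, unsigned int queue_depth)
+  {
+      size_t psz = sysconf(_SC_PAGESIZE);
+      size_t len = queue_depth * sizeof(struct ublksrv_io_desc);
+      size_t maplen = (len + psz - 1) & ~(psz - 1);   /* round up to pages */
+      off_t off = (off_t)q_id * maplen;               /* assumed offset */
+      void *p = mmap(NULL, maplen, PROT_READ, MAP_SHARED, ublkc_fd, off);
+
+      return p == MAP_FAILED ? NULL : p;
+  }
+
+  /* The descriptor of one IO is then simply descs[tag]: it carries the
+   * op/flags, start sector, length and buffer address for that IO. */
+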
+The following IO commands are communicated via the io_uring passthrough
+command, and each command is only for forwarding the IO and committing the
+result with the specified IO tag in the command data:
+
+- ``UBLK_IO_FETCH_REQ``
+
+ Sent from the server IO pthread for fetching future incoming IO requests
+ destined to ``/dev/ublkb*``. This command is sent only once from the server
+ IO pthread for the ublk driver to set up the IO forwarding environment.
+
+- ``UBLK_IO_COMMIT_AND_FETCH_REQ``
+
+ When an IO request is destined to ``/dev/ublkb*``, the driver stores
+ the IO's ``ublksrv_io_desc`` in the specified mmapped area; then the
+ previously received IO command with this IO tag (either ``UBLK_IO_FETCH_REQ``
+ or ``UBLK_IO_COMMIT_AND_FETCH_REQ``) is completed, so the server gets
+ the IO notification via io_uring.
+
+ After the server handles the IO, its result is committed back to the
+ driver by sending ``UBLK_IO_COMMIT_AND_FETCH_REQ`` back. Once ublkdrv
+ receives this command, it parses the result, completes the request to
+ ``/dev/ublkb*``, and in the meantime sets up the environment for fetching
+ future requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ``
+ is reused both for fetching a request and for committing back an IO result
+ (a sketch of queueing this command follows this list of commands).
+
+- ``UBLK_IO_NEED_GET_DATA``
+
+ With ``UBLK_F_NEED_GET_DATA`` enabled, a WRITE request is first
+ issued to the ublk server without data copy. Then the IO backend of the ublk
+ server receives the request, and it can allocate a data buffer and embed its
+ address inside this new io command. After the kernel driver gets the command,
+ the data is copied from the request pages to this backend's buffer. Finally,
+ the backend receives the request again with the data to be written, and it
+ can truly handle the request.
+
+ ``UBLK_IO_NEED_GET_DATA`` adds one additional round-trip and one
+ io_uring_enter() syscall. Any user who thinks that it may lower performance
+ should not enable UBLK_F_NEED_GET_DATA. The ublk server pre-allocates an IO
+ buffer for each IO by default. Any new project should try to use this
+ buffer to communicate with the ublk driver. However, existing projects may
+ break or be unable to consume the new buffer interface; that's why this
+ command was added for backwards compatibility, so that existing projects
+ can still consume existing buffers.
+
+- data copy between ublk server IO buffer and ublk block IO request
+
+ For a WRITE request, the driver needs to copy the block IO request pages
+ into the server buffer (pages) first, before notifying the server of the
+ incoming IO, so that the server can handle the WRITE request.
+
+ When the server handles a READ request and sends
+ ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the driver, ublkdrv needs to copy
+ the server buffer (pages) that were read into the IO request pages.
+
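+As a concrete but non-authoritative illustration, the sketch below shows
+roughly how a server might queue one ``UBLK_IO_COMMIT_AND_FETCH_REQ``
+passthrough command. It assumes a ring created with ``IORING_SETUP_SQE128``
+and omits error handling; the exact SQE encoding should be taken from
+``ublksrv`` rather than from here.
+
+::
+
+  #include <string.h>
+  #include <liburing.h>
+  #include <linux/ublk_cmd.h>   /* UBLK_IO_*, struct ublksrv_io_cmd */
+
+  static void queue_commit_and_fetch(struct io_uring *ring, int ublkc_fd,
+                                     __u16 q_id, __u16 tag, __s32 result,
+                                     __u64 buf_addr)
+  {
+      struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+      struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+      memset(sqe, 0, 2 * sizeof(*sqe));   /* one 128-byte SQE slot */
+      sqe->opcode = IORING_OP_URING_CMD;
+      sqe->fd = ublkc_fd;                 /* /dev/ublkc* */
+      sqe->cmd_op = UBLK_IO_COMMIT_AND_FETCH_REQ;
+      sqe->user_data = tag;               /* identify the IO on completion */
+      cmd->q_id = q_id;
+      cmd->tag = tag;
+      cmd->result = result;               /* bytes handled, or -errno */
+      cmd->addr = buf_addr;               /* server IO buffer for this tag */
+  }
+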
+Future development
+==================
+
+Zero copy
+---------
+
+Zero copy is a generic requirement for nbd, fuse and similar drivers. A
+problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace
+can't be remapped any more in the kernel with existing mm interfaces. This can
+occur when sending direct IO to ``/dev/ublkb*``. Also, he reported that
+big requests (IO size >= 256 KB) may benefit a lot from zero copy.
+
+
+References
+==========
+
+.. [#userspace] https://github.com/ming1/ubdsrv
+
+.. [#userspace_lib] https://github.com/ming1/ubdsrv/tree/master/lib
+
+.. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk
+
+.. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README
+
+.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
+
+.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index 437de2a7a5de..38372a956d65 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -208,12 +208,22 @@ data structures and compile with kernel internal headers. Both of these
kernel internals are subject to change and can break with newer kernels
such that the program needs to be adapted accordingly.
+New BPF functionality is generally added through the use of kfuncs instead of
+new helpers. Kfuncs are not considered part of the stable API, and have their own
+lifecycle expectations as described in :ref:`BPF_kfunc_lifecycle_expectations`.
+
Q: Are tracepoints part of the stable ABI?
------------------------------------------
A: NO. Tracepoints are tied to internal implementation details hence they are
subject to change and can break with newer kernels. BPF programs need to change
accordingly when this happens.
+Q: Are places where kprobes can attach part of the stable ABI?
+--------------------------------------------------------------
+A: NO. The places to which kprobes can attach are internal implementation
+details, which means that they are subject to change and can break with
+newer kernels. BPF programs need to change accordingly when this happens.
+
Q: How much stack space a BPF program uses?
-------------------------------------------
A: Currently all program types are limited to 512 bytes of stack
A: NO. Classic BPF programs are converted into extended BPF instructions.
Q: Can BPF call arbitrary kernel functions?
-------------------------------------------
-A: NO. BPF programs can only call a set of helper functions which
-is defined for every program type.
+A: NO. BPF programs can only call specific functions exposed as BPF helpers or
+kfuncs. The set of available functions is defined for every program type.
Q: Can BPF overwrite arbitrary kernel memory?
---------------------------------------------
@@ -257,7 +267,12 @@ Q: New functionality via kernel modules?
Q: Can BPF functionality such as new program or map types, new
helpers, etc be added out of kernel module code?
-A: NO.
+A: Yes, through kfuncs and kptrs.
+
+The core BPF functionality such as program types, maps and helpers cannot be
+added to by modules. However, modules can expose functionality to BPF programs
+by exporting kfuncs (which may return pointers to module-internal data
+structures as kptrs).
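+
+As a rough sketch (the ``mymod`` names are hypothetical; see
+Documentation/bpf/kfuncs.rst for the authoritative interface and required
+headers), a module might expose a kfunc like this:
+
+::
+
+  /* kernel module code; needs <linux/module.h>, <linux/btf.h> and
+   * <linux/btf_ids.h> */
+  __bpf_kfunc int bpf_mymod_get_answer(void)
+  {
+      return 42;
+  }
+
+  BTF_SET8_START(mymod_kfunc_ids)
+  BTF_ID_FLAGS(func, bpf_mymod_get_answer)
+  BTF_SET8_END(mymod_kfunc_ids)
+
+  static const struct btf_kfunc_id_set mymod_kfunc_set = {
+      .owner = THIS_MODULE,
+      .set   = &mymod_kfunc_ids,
+  };
+
+  static int __init mymod_init(void)
+  {
+      return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+                                       &mymod_kfunc_set);
+  }
+  module_init(mymod_init);
+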
Q: Directly calling kernel function is an ABI?
----------------------------------------------
@@ -272,4 +287,70 @@ kernel functions have already been used by other kernel tcp
cc (congestion-control) implementations. If any of these kernel
functions has changed, both the in-tree and out-of-tree kernel tcp cc
implementations have to be changed. The same goes for the bpf
-programs and they have to be adjusted accordingly.
+programs and they have to be adjusted accordingly. See
+:ref:`BPF_kfunc_lifecycle_expectations` for details.
+
+Q: Attaching to arbitrary kernel functions is an ABI?
+-----------------------------------------------------
+Q: BPF programs can be attached to many kernel functions. Do these
+kernel functions become part of the ABI?
+
+A: NO.
+
+The kernel function prototypes will change, and BPF programs attaching to
+them will need to change. The BPF compile-once-run-everywhere (CO-RE)
+approach should be used in order to make it easier to adapt your BPF programs
+to different versions of the kernel.
+
+Q: Marking a function with BTF_ID makes that function an ABI?
+-------------------------------------------------------------
+A: NO.
+
+The BTF_ID macro does not cause a function to become part of the ABI
+any more than does the EXPORT_SYMBOL_GPL macro.
+
+Q: What is the compatibility story for special BPF types in map values?
+-----------------------------------------------------------------------
+Q: Users are allowed to embed bpf_spin_lock, bpf_timer fields in their BPF map
+values (when using BTF support for BPF maps). This allows the use of helpers for
+such objects on these fields inside map values. Users are also allowed to embed
+pointers to some kernel types (with __kptr_untrusted and __kptr BTF tags). Will the
+kernel preserve backwards compatibility for these features?
+
+A: It depends. For bpf_spin_lock, bpf_timer: YES, for kptr and everything else:
+NO, but see below.
+
+For struct types that have been added already, like bpf_spin_lock and bpf_timer,
+the kernel will preserve backwards compatibility, as they are part of UAPI.
+
+For kptrs, they are also part of UAPI, but only with respect to the kptr
+mechanism. The types that you can use with a __kptr_untrusted and __kptr tagged
+pointer in your struct are NOT part of the UAPI contract. The supported types can
+and will change across kernel releases. However, operations like accessing kptr
+fields and bpf_kptr_xchg() helper will continue to be supported across kernel
+releases for the supported types.
+
+For any other supported struct type, unless explicitly stated in this document
+and added to bpf.h UAPI header, such types can and will arbitrarily change their
+size, type, and alignment, or any other user visible API or ABI detail across
+kernel releases. The users must adapt their BPF programs to the new changes and
+update them to make sure their programs continue to work correctly.
+
+NOTE: BPF subsystem specially reserves the 'bpf\_' prefix for type names, in
+order to introduce more special fields in the future. Hence, user programs must
+avoid defining types with 'bpf\_' prefix to not be broken in future releases.
+In other words, no backwards compatibility is guaranteed if one uses a type
+in BTF with the 'bpf\_' prefix.
+
+Q: What is the compatibility story for special BPF types in allocated objects?
+------------------------------------------------------------------------------
+Q: Same as above, but for allocated objects (i.e. objects allocated using
+bpf_obj_new for user defined types). Will the kernel preserve backwards
+compatibility for these features?
+
+A: NO.
+
+Unlike map value types, the API to work with allocated objects and any support
+for special fields inside them is exposed through kfuncs, and thus has the same
+lifecycle expectations as the kfuncs themselves. See
+:ref:`BPF_kfunc_lifecycle_expectations` for details.
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index 253496af8fef..609b71f5747d 100644
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -7,8 +7,8 @@ workflows related to reporting bugs, submitting patches, and queueing
patches for stable kernels.
For general information about submitting patches, please refer to
-`Documentation/process/`_. This document only describes additional specifics
-related to BPF.
+Documentation/process/submitting-patches.rst. This document only describes
+additional specifics related to BPF.
.. contents::
:local:
@@ -44,6 +44,33 @@ is a guarantee that the reported issue will be overlooked.**
Submitting patches
==================
+Q: How do I run BPF CI on my changes before sending them out for review?
+------------------------------------------------------------------------
+A: BPF CI is GitHub based and hosted at https://github.com/kernel-patches/bpf.
+While GitHub also provides a CLI that can be used to accomplish the same
+results, here we focus on the UI based workflow.
+
+The following steps lay out how to start a CI run for your patches:
+
+- Create a fork of the aforementioned repository in your own account (one time
+ action)
+
+- Clone the fork locally, check out a new branch tracking either the bpf-next
+ or bpf branch, and apply your to-be-tested patches on top of it
+
+- Push the local branch to your fork and create a pull request against
+ kernel-patches/bpf's bpf-next_base or bpf_base branch, respectively
+
+Shortly after the pull request has been created, the CI workflow will run. Note
+that capacity is shared with the checking of patches submitted upstream, so
+depending on utilization the run can take a while to finish.
+
+Note furthermore that both base branches (bpf-next_base and bpf_base) will be
+updated as patches are pushed to the respective upstream branches they track. As
+such, an attempt will automatically be made to rebase your patch set as well.
+This behavior can result in a CI run being aborted and restarted with the new
+baseline.
+
Q: To which mailing list do I need to submit my BPF patches?
------------------------------------------------------------
A: Please submit your BPF patches to the bpf kernel mailing list:
@@ -101,7 +128,8 @@ into the bpf-next tree will make their way into net-next tree. net and
net-next are both run by David S. Miller. From there, they will go
into the kernel mainline tree run by Linus Torvalds. To read up on the
process of net and net-next being merged into the mainline tree, see
-the :ref:`netdev-FAQ`
+the documentation on the netdev subsystem at
+Documentation/process/maintainer-netdev.rst.
@@ -120,7 +148,8 @@ request)::
Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be applied to?
---------------------------------------------------------------------------------
-A: The process is the very same as described in the :ref:`netdev-FAQ`,
+A: The process is the very same as described in the netdev subsystem
+documentation at Documentation/process/maintainer-netdev.rst,
so please read up on it. The subject line must indicate whether the
patch is a fix or rather "next-like" content in order to let the
maintainers know whether it is targeted at bpf or bpf-next.
@@ -179,8 +208,9 @@ ii) run extensive BPF test suite and
Once the BPF pull request was accepted by David S. Miller, then
the patches end up in net or net-next tree, respectively, and
make their way from there further into mainline. Again, see the
-:ref:`netdev-FAQ` for additional information e.g. on how often they are
-merged to mainline.
+documentation for the netdev subsystem at
+Documentation/process/maintainer-netdev.rst for additional information
+e.g. on how often they are merged to mainline.
Q: How long do I need to wait for feedback on my BPF patches?
-------------------------------------------------------------
@@ -203,7 +233,8 @@ Q: Are patches applied to bpf-next when the merge window is open?
-----------------------------------------------------------------
A: For the time when the merge window is open, bpf-next will not be
processed. This is roughly analogous to net-next patch processing,
-so feel free to read up on the :ref:`netdev-FAQ` about further details.
+so feel free to read up on the netdev docs at
+Documentation/process/maintainer-netdev.rst about further details.
During those two weeks of merge window, we might ask you to resend
your patch series once bpf-next is open again. Once Linus released
@@ -367,7 +398,8 @@ netdev kernel mailing list in Cc and ask for the fix to be queued up:
netdev@vger.kernel.org
The process in general is the same as on netdev itself, see also the
-:ref:`netdev-FAQ`.
+documentation on the networking subsystem at
+Documentation/process/maintainer-netdev.rst.
Q: Do you also backport to kernels not currently maintained as stable?
----------------------------------------------------------------------
@@ -383,7 +415,7 @@ Q: The BPF patch I am about to submit needs to go to stable as well
What should I do?
A: The same rules apply as with netdev patch submissions in general, see
-the :ref:`netdev-FAQ`.
+the netdev docs at Documentation/process/maintainer-netdev.rst.
Never add "``Cc: stable@vger.kernel.org``" to the patch description, but
ask the BPF maintainers to queue the patches instead. This can be done
@@ -434,15 +466,15 @@ needed::
$ sudo make run_tests
-See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
-document for further documentation.
+See :doc:`kernel selftest documentation </dev-tools/kselftest>`
+for details.
To maximize the number of tests passing, the .config of the kernel
under test should match the config file fragment in
tools/testing/selftests/bpf as closely as possible.
Finally to ensure support for latest BPF Type Format features -
-discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16
+discussed in Documentation/bpf/btf.rst - pahole version 1.16
is required for kernels built with CONFIG_DEBUG_INFO_BTF=y.
pahole is delivered in the dwarves package or can be built
from source at
@@ -657,12 +689,7 @@ when:
.. Links
-.. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/
-.. _netdev-FAQ: ../networking/netdev-FAQ.rst
.. _selftests:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/
-.. _Documentation/dev-tools/kselftest.rst:
- https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
-.. _Documentation/bpf/btf.rst: btf.rst
Happy BPF hacking!
diff --git a/Documentation/bpf/bpf_iterators.rst b/Documentation/bpf/bpf_iterators.rst
new file mode 100644
index 000000000000..6d7770793fab
--- /dev/null
+++ b/Documentation/bpf/bpf_iterators.rst
@@ -0,0 +1,485 @@
+=============
+BPF Iterators
+=============
+
+
+----------
+Motivation
+----------
+
+There are a few existing ways to dump kernel data into user space. The most
+popular one is the ``/proc`` system. For example, ``cat /proc/net/tcp6`` dumps
+all tcp6 sockets in the system, and ``cat /proc/net/netlink`` dumps all netlink
+sockets in the system. However, their output format tends to be fixed, and if
+users want more information about these sockets, they have to patch the kernel,
+which often takes time to publish upstream and release. The same is true for popular
+tools like `ss <https://man7.org/linux/man-pages/man8/ss.8.html>`_ where any
+additional information needs a kernel patch.
+
+To solve this problem, the `drgn
+<https://www.kernel.org/doc/html/latest/bpf/drgn.html>`_ tool is often used to
+dig out the kernel data with no kernel change. However, the main drawback for
+drgn is performance, as it cannot do pointer tracing inside the kernel. In
+addition, drgn cannot validate a pointer value and may read invalid data if the
+pointer becomes invalid inside the kernel.
+
+The BPF iterator solves the above problem by providing flexibility on what data
+(e.g., tasks, bpf_maps, etc.) to collect by calling BPF programs for each kernel
+data object.
+
+----------------------
+How BPF Iterators Work
+----------------------
+
+A BPF iterator is a type of BPF program that allows users to iterate over
+specific types of kernel objects. Unlike traditional BPF tracing programs that
+allow users to define callbacks that are invoked at particular points of
+execution in the kernel, BPF iterators allow users to define callbacks that
+should be executed for every entry in a variety of kernel data structures.
+
+For example, users can define a BPF iterator that iterates over every task on
+the system and dumps the total amount of CPU runtime currently used by each of
+them. Another BPF task iterator may instead dump the cgroup information for each
+task. Such flexibility is the core value of BPF iterators.
+
+A BPF program is always loaded into the kernel at the behest of a user space
+process. A user space process loads a BPF program by opening and initializing
+the program skeleton as required and then invoking a syscall to have the BPF
+program verified and loaded by the kernel.
+
+In traditional tracing programs, a program is activated by having user space
+obtain a ``bpf_link`` to the program with ``bpf_program__attach()``. Once
+activated, the program callback will be invoked whenever the tracepoint is
+triggered in the main kernel. For BPF iterator programs, a ``bpf_link`` to the
+program is obtained using ``bpf_link_create()``, and the program callback is
+invoked by issuing system calls from user space.
+
+Next, let us see how you can use the iterators to iterate on kernel objects and
+read data.
+
+------------------------
+How to Use BPF iterators
+------------------------
+
+BPF selftests are a great resource to illustrate how to use the iterators. In
+this section, we’ll walk through a BPF selftest which shows how to load and use
+a BPF iterator program. To begin, we’ll look at `bpf_iter.c
+<https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/prog_tests/bpf_iter.c>`_,
+which illustrates how to load and trigger BPF iterators on the user space side.
+Later, we’ll look at a BPF program that runs in kernel space.
+
+Loading a BPF iterator in the kernel from user space typically involves the
+following steps:
+
+* The BPF program is loaded into the kernel through ``libbpf``. Once the kernel
+ has verified and loaded the program, it returns a file descriptor (fd) to user
+ space.
+* Obtain a ``link_fd`` to the BPF program by calling ``bpf_link_create()``
+ with the BPF program file descriptor received from the kernel.
+* Next, obtain a BPF iterator file descriptor (``bpf_iter_fd``) by calling
+ ``bpf_iter_create()`` with the ``bpf_link`` received from Step 2.
+* Trigger the iteration by calling ``read(bpf_iter_fd)`` until no data is
+ available.
+* Close the iterator fd using ``close(bpf_iter_fd)``.
+* If you need to reread the data, get a new ``bpf_iter_fd`` and do the read
+ again, as sketched below.
+
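+Putting these steps together, a minimal user space sketch (assuming
+``prog_fd`` is the file descriptor of an already loaded and verified iterator
+program) might look like the following:
+
+::
+
+  #include <unistd.h>
+  #include <bpf/bpf.h>
+
+  static int dump_iter(int prog_fd)
+  {
+      char buf[4096];
+      int link_fd, iter_fd;
+      ssize_t len;
+
+      /* Step 2: obtain a bpf_link to the loaded iterator program */
+      link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL);
+      if (link_fd < 0)
+          return link_fd;
+
+      /* Step 3: create an iterator instance from the link */
+      iter_fd = bpf_iter_create(link_fd);
+      if (iter_fd < 0) {
+          close(link_fd);
+          return iter_fd;
+      }
+
+      /* Step 4: trigger the iteration until no data is available */
+      while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+          write(STDOUT_FILENO, buf, len);
+
+      /* Step 5: close the fds */
+      close(iter_fd);
+      close(link_fd);
+      return len < 0 ? -1 : 0;
+  }
+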
+The following are a few examples of selftest BPF iterator programs:
+
+* `bpf_iter_tcp4.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c>`_
+* `bpf_iter_task_vma.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c>`_
+* `bpf_iter_task_file.c <https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c>`_
+
+Let us look at ``bpf_iter_task_file.c``, which runs in kernel space:
+
+Here is the definition of ``bpf_iter__task_file`` in `vmlinux.h
+<https://facebookmicrosites.github.io/bpf/blog/2020/02/19/bpf-portability-and-co-re.html#btf>`_.
+Any struct name in ``vmlinux.h`` in the format ``bpf_iter__<iter_name>``
+represents a BPF iterator. The suffix ``<iter_name>`` represents the type of
+iterator.
+
+::
+
+ struct bpf_iter__task_file {
+     union {
+         struct bpf_iter_meta *meta;
+     };
+     union {
+         struct task_struct *task;
+     };
+     u32 fd;
+     union {
+         struct file *file;
+     };
+ };
+
+In the above code, the field 'meta' contains the metadata, which is the same for
+all BPF iterator programs. The rest of the fields are specific to different
+iterators. For example, for task_file iterators, the kernel layer provides the
+'task', 'fd' and 'file' field values. The 'task' and 'file' are `reference
+counted
+<https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html#file-descriptors-and-reference-counters>`_,
+so they won't go away when the BPF program runs.
+
+Here is a snippet from the ``bpf_iter_task_file.c`` file:
+
+::
+
+ /* These globals are defined in the selftest and set/read via the skeleton */
+ int count = 0;
+ int tgid = 0;
+ int last_tgid = 0;
+ int unique_tgid_count = 0;
+
+ SEC("iter/task_file")
+ int dump_task_file(struct bpf_iter__task_file *ctx)
+ {
+     struct seq_file *seq = ctx->meta->seq;
+     struct task_struct *task = ctx->task;
+     struct file *file = ctx->file;
+     __u32 fd = ctx->fd;
+
+     if (task == NULL || file == NULL)
+         return 0;
+
+     if (ctx->meta->seq_num == 0) {
+         count = 0;
+         BPF_SEQ_PRINTF(seq, "    tgid      gid       fd      file\n");
+     }
+
+     if (tgid == task->tgid && task->tgid != task->pid)
+         count++;
+
+     if (last_tgid != task->tgid) {
+         last_tgid = task->tgid;
+         unique_tgid_count++;
+     }
+
+     BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+                    (long)file->f_op);
+     return 0;
+ }
+
+In the above example, the section name ``SEC("iter/task_file")`` indicates that
+the program is a BPF iterator program for iterating over all files of all
+tasks. The context of the program is the ``bpf_iter__task_file`` struct.
+
+The user space program invokes the BPF iterator program running in the kernel
+by issuing a ``read()`` syscall. Once invoked, the BPF
+program can export data to user space using a variety of BPF helper functions.
+You can use either ``bpf_seq_printf()`` (and the BPF_SEQ_PRINTF helper macro)
+or the ``bpf_seq_write()`` function, depending on whether you need formatted
+output or just binary data. For binary-encoded data, the user space applications
+can process the data from ``bpf_seq_write()`` as needed. For the formatted data,
+you can use ``cat <path>`` to print the results similar to ``cat
+/proc/net/netlink`` after pinning the BPF iterator to the bpffs mount. Later,
+use ``rm -f <path>`` to remove the pinned iterator.
+
+For example, you can use the following command to create a BPF iterator from the
+``bpf_iter_ipv6_route.o`` object file and pin it to the ``/sys/fs/bpf/my_route``
+path:
+
+::
+
+ $ bpftool iter pin ./bpf_iter_ipv6_route.o /sys/fs/bpf/my_route
+
+And then print out the results using the following command:
+
+::
+
+ $ cat /sys/fs/bpf/my_route
+
+
+-------------------------------------------------------
+Implement Kernel Support for BPF Iterator Program Types
+-------------------------------------------------------
+
+To implement a BPF iterator in the kernel, the developer must make a one-time
+change to the following key data structure defined in the `bpf.h
+<https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/include/linux/bpf.h>`_
+file.
+
+::
+
+ struct bpf_iter_reg {
+     const char *target;
+     bpf_iter_attach_target_t attach_target;
+     bpf_iter_detach_target_t detach_target;
+     bpf_iter_show_fdinfo_t show_fdinfo;
+     bpf_iter_fill_link_info_t fill_link_info;
+     bpf_iter_get_func_proto_t get_func_proto;
+     u32 ctx_arg_info_size;
+     u32 feature;
+     struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
+     const struct bpf_iter_seq_info *seq_info;
+ };
+
+After filling the data structure fields, call ``bpf_iter_reg_target()`` to
+register the iterator to the main BPF iterator subsystem.
+
+The following is the breakdown for each field in struct ``bpf_iter_reg``.
+
+.. list-table::
+ :widths: 25 50
+ :header-rows: 1
+
+ * - Fields
+ - Description
+ * - target
+ - Specifies the name of the BPF iterator. For example: ``bpf_map``,
+ ``bpf_map_elem``. The name should be different from other ``bpf_iter`` target names in the kernel.
+ * - attach_target and detach_target
+ - Allows for target specific ``link_create`` action since some targets
+ may need special processing. Called during the user space link_create stage.
+ * - show_fdinfo and fill_link_info
+ - Called to fill target specific information when user tries to get link
+ info associated with the iterator.
+ * - get_func_proto
+ - Permits a BPF iterator to access BPF helpers specific to the iterator.
+ * - ctx_arg_info_size and ctx_arg_info
+ - Specifies the verifier states for BPF program arguments associated with
+ the bpf iterator.
+ * - feature
+ - Specifies certain action requests in the kernel BPF iterator
+ infrastructure. Currently, only BPF_ITER_RESCHED is supported. This means
+ that the kernel function cond_resched() is called to avoid other kernel
+ subsystems (e.g., rcu) misbehaving.
+ * - seq_info
+ - Specifies the ``seq_file`` operations for iterating the target and the
+ callbacks used to initialize and free the iterator's private data:
+ ``seq_ops``, ``init_seq_private``, ``fini_seq_private`` and
+ ``seq_priv_size`` in ``struct bpf_iter_seq_info``.
+
+
+`Click here
+<https://lore.kernel.org/bpf/20210212183107.50963-2-songliubraving@fb.com/>`_
+to see an implementation of the ``task_vma`` BPF iterator in the kernel.
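+
+As a rough kernel-side sketch (all ``foo`` names are hypothetical and the
+``seq_file`` callbacks are elided; see kernel/bpf/task_iter.c for real
+registrations), adding a new target looks roughly like this:
+
+::
+
+  static const struct bpf_iter_seq_info foo_seq_info = {
+      .seq_ops          = &foo_seq_ops,       /* seq_file iteration ops */
+      .init_seq_private = foo_init_seq_priv,  /* optional init/fini hooks */
+      .fini_seq_private = foo_fini_seq_priv,
+      .seq_priv_size    = sizeof(struct foo_iter_priv),
+  };
+
+  static struct bpf_iter_reg foo_reg_info = {
+      .target            = "foo",
+      .feature           = BPF_ITER_RESCHED,
+      .ctx_arg_info_size = 1,
+      .ctx_arg_info      = {
+          { offsetof(struct bpf_iter__foo, foo),
+            PTR_TO_BTF_ID_OR_NULL },
+      },
+      .seq_info          = &foo_seq_info,
+  };
+
+  static int __init foo_iter_init(void)
+  {
+      return bpf_iter_reg_target(&foo_reg_info);
+  }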
+
+---------------------------------
+Parameterizing BPF Task Iterators
+---------------------------------
+
+By default, BPF iterators walk through all the objects of the specified types
+(processes, cgroups, maps, etc.) across the entire system to read relevant
+kernel data. But often, there are cases where we only care about a much smaller
+subset of iterable kernel objects, such as only iterating tasks within a
+specific process. Therefore, BPF iterator programs support filtering out objects
+from iteration by allowing user space to configure the iterator program when it
+is attached.
+
+--------------------------
+BPF Task Iterator Program
+--------------------------
+
+The following code is a BPF iterator program that prints file and task
+information through the ``seq_file`` of the iterator. It is a standard BPF
+iterator program that visits every file of every task. We will use this BPF
+program in our example later.
+
+::
+
+ #include <vmlinux.h>
+ #include <bpf/bpf_helpers.h>
+
+ char _license[] SEC("license") = "GPL";
+
+ SEC("iter/task_file")
+ int dump_task_file(struct bpf_iter__task_file *ctx)
+ {
+     struct seq_file *seq = ctx->meta->seq;
+     struct task_struct *task = ctx->task;
+     struct file *file = ctx->file;
+     __u32 fd = ctx->fd;
+
+     if (task == NULL || file == NULL)
+         return 0;
+
+     if (ctx->meta->seq_num == 0) {
+         BPF_SEQ_PRINTF(seq, "    tgid      pid       fd      file\n");
+     }
+
+     BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+                    (long)file->f_op);
+     return 0;
+ }
+
+----------------------------------------
+Creating a File Iterator with Parameters
+----------------------------------------
+
+Now, let us look at how to create an iterator that includes only files of a
+process.
+
+First, fill the ``bpf_iter_attach_opts`` struct as shown below:
+
+::
+
+ LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ union bpf_iter_link_info linfo;
+ memset(&linfo, 0, sizeof(linfo));
+ linfo.task.pid = getpid();
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+
+``linfo.task.pid``, if it is non-zero, directs the kernel to create an iterator
+that only includes opened files for the process with the specified ``pid``. In
+this example, we will only be iterating files for our process. If
+``linfo.task.pid`` is zero, the iterator will visit every opened file of every
+process. Similarly, ``linfo.task.tid`` directs the kernel to create an iterator
+that visits opened files of a specific thread, not a process. In this example,
+``linfo.task.tid`` is different from ``linfo.task.pid`` only if the thread has a
+separate file descriptor table. In most circumstances, all process threads share
+a single file descriptor table.
+
+Now, in the userspace program, pass the pointer to this struct to
+``bpf_program__attach_iter()``.
+
+::
+
+ link = bpf_program__attach_iter(prog, &opts);
+ iter_fd = bpf_iter_create(bpf_link__fd(link));
+
+If both *tid* and *pid* are zero, an iterator created from this struct
+``bpf_iter_attach_opts`` will include every opened file of every task in the
+system (in the current *pid* namespace, actually). It is the same as passing
+NULL as the second argument to ``bpf_program__attach_iter()``.
+
+The whole program looks like the following code:
+
+::
+
+ #include <stdio.h>
+ #include <unistd.h>
+ #include <bpf/bpf.h>
+ #include <bpf/libbpf.h>
+ #include "bpf_iter_task_ex.skel.h"
+
+ static int do_read_opts(struct bpf_program *prog, struct bpf_iter_attach_opts *opts)
+ {
+     struct bpf_link *link;
+     char buf[16] = {};
+     int iter_fd = -1, len;
+     int ret = 0;
+
+     link = bpf_program__attach_iter(prog, opts);
+     if (!link) {
+         fprintf(stderr, "bpf_program__attach_iter() fails\n");
+         return -1;
+     }
+     iter_fd = bpf_iter_create(bpf_link__fd(link));
+     if (iter_fd < 0) {
+         fprintf(stderr, "bpf_iter_create() fails\n");
+         ret = -1;
+         goto free_link;
+     }
+     /* Do not check the contents, just ensure read() ends without error */
+     while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
+         buf[len] = 0;
+         printf("%s", buf);
+     }
+     printf("\n");
+ free_link:
+     if (iter_fd >= 0)
+         close(iter_fd);
+     bpf_link__destroy(link);
+     return ret;
+ }
+
+ static void test_task_file(void)
+ {
+     LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+     struct bpf_iter_task_ex *skel;
+     union bpf_iter_link_info linfo;
+
+     skel = bpf_iter_task_ex__open_and_load();
+     if (skel == NULL)
+         return;
+     memset(&linfo, 0, sizeof(linfo));
+     linfo.task.pid = getpid();
+     opts.link_info = &linfo;
+     opts.link_info_len = sizeof(linfo);
+     printf("PID %d\n", getpid());
+     do_read_opts(skel->progs.dump_task_file, &opts);
+     bpf_iter_task_ex__destroy(skel);
+ }
+
+ int main(int argc, const char * const * argv)
+ {
+     test_task_file();
+     return 0;
+ }
+
+The following lines are the output of the program.
+
+::
+
+ PID 1859
+
+ tgid pid fd file
+ 1859 1859 0 ffffffff82270aa0
+ 1859 1859 1 ffffffff82270aa0
+ 1859 1859 2 ffffffff82270aa0
+ 1859 1859 3 ffffffff82272980
+ 1859 1859 4 ffffffff8225e120
+ 1859 1859 5 ffffffff82255120
+ 1859 1859 6 ffffffff82254f00
+ 1859 1859 7 ffffffff82254d80
+ 1859 1859 8 ffffffff8225abe0
+
+------------------
+Without Parameters
+------------------
+
+Let us look at how a BPF iterator without parameters skips files of other
+processes in the system. In this case, the BPF program has to check the pid or
+the tid of tasks, or it will receive every opened file in the system (in the
+current *pid* namespace, actually). So, we usually add a global variable to the
+BPF program through which user space passes the target *pid*.
+
+The BPF program would look like the following block.
+
+ ::
+
+ ......
+ int target_pid = 0;
+
+ SEC("iter/task_file")
+ int dump_task_file(struct bpf_iter__task_file *ctx)
+ {
+     ......
+     if (task->tgid != target_pid) /* Check task->pid instead to check thread IDs */
+         return 0;
+     BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+                    (long)file->f_op);
+     return 0;
+ }
+
+The user space program would look like the following block:
+
+ ::
+
+ ......
+ static void test_task_file(void)
+ {
+     ......
+     skel = bpf_iter_task_ex__open_and_load();
+     if (skel == NULL)
+         return;
+     skel->bss->target_pid = getpid(); /* process ID. For thread id, use gettid() */
+     memset(&linfo, 0, sizeof(linfo));
+     linfo.task.pid = getpid();
+     opts.link_info = &linfo;
+     opts.link_info_len = sizeof(linfo);
+     ......
+ }
+
+``target_pid`` is a global variable in the BPF program. The user space program
+should initialize the variable with a process ID so that the BPF program skips
+opened files of other processes. When you parametrize a BPF iterator, the
+iterator calls the BPF program fewer times, which can save significant resources.
+
+---------------------------
+Parametrizing VMA Iterators
+---------------------------
+
+By default, a BPF VMA iterator includes every VMA in every process. However,
+you can still specify a process or a thread to include only its VMAs. Unlike
+files, a thread cannot have a separate address space (since Linux 2.6.0-test6).
+Here, using *tid* makes no difference compared to using *pid*.
+
+----------------------------
+Parametrizing Task Iterators
+----------------------------
+
+A BPF task iterator with *pid* includes all tasks (threads) of a process. The
+BPF program receives these tasks one after another. You can specify a BPF task
+iterator with *tid* parameter to include only the tasks that match the given
+*tid*.
diff --git a/Documentation/bpf/bpf_licensing.rst b/Documentation/bpf/bpf_licensing.rst
new file mode 100644
index 000000000000..b19c433f41d2
--- /dev/null
+++ b/Documentation/bpf/bpf_licensing.rst
@@ -0,0 +1,92 @@
+=============
+BPF licensing
+=============
+
+Background
+==========
+
+* Classic BPF was BSD licensed
+
+"BPF" was originally introduced as BSD Packet Filter in
+http://www.tcpdump.org/papers/bpf-usenix93.pdf. The corresponding instruction
+set and its implementation came from BSD with BSD license. That original
+instruction set is now known as "classic BPF".
+
+However, an instruction set is a specification for machine-language
+interaction, similar to a programming language. It is not code. Therefore, the
+application of a BSD license may be misleading in a certain context, as the
+instruction set may enjoy no copyright protection.
+
+* eBPF (extended BPF) instruction set continues to be BSD
+
+In 2014, the classic BPF instruction set was significantly extended. We
+typically refer to this instruction set as eBPF to disambiguate it from cBPF.
+The eBPF instruction set is still BSD licensed.
+
+Implementations of eBPF
+=======================
+
+Using the eBPF instruction set requires implementing code in both kernel space
+and user space.
+
+In Linux Kernel
+---------------
+
+The reference implementations of the eBPF interpreter and various just-in-time
+compilers are part of Linux and are GPLv2 licensed. The implementation of
+eBPF helper functions is also GPLv2 licensed. Interpreters, JITs, helpers,
+and verifiers are collectively called the eBPF runtime.
+
+In User Space
+-------------
+
+There are also implementations of eBPF runtime (interpreter, JITs, helper
+functions) under
+Apache2 (https://github.com/iovisor/ubpf),
+MIT (https://github.com/qmonnet/rbpf), and
+BSD (https://github.com/DPDK/dpdk/blob/main/lib/librte_bpf).
+
+In HW
+-----
+
+Hardware can choose to execute eBPF instructions natively and provide the eBPF
+runtime in hardware or via firmware, which may have a proprietary license.
+
+In other operating systems
+--------------------------
+
+Other kernels or user space implementations of the eBPF instruction set and runtime
+can have proprietary licenses.
+
+Using BPF programs in the Linux kernel
+======================================
+
+The Linux kernel (while being GPLv2) allows linking of proprietary kernel modules
+under these rules:
+Documentation/process/license-rules.rst
+
+When a kernel module is loaded, the Linux kernel checks which functions it
+intends to use. If any function is marked as "GPL only," the corresponding
+module or program has to have a GPL compatible license.
+
+Loading a BPF program into the Linux kernel is similar to loading a kernel
+module. BPF is loaded at run time and not statically linked to the Linux
+kernel. BPF program loading follows the same license checking rules as kernel
+modules. BPF programs can be proprietary if they don't use "GPL only" BPF
+helper functions.
+
+Further, some BPF program types - Linux Security Modules (LSM) and TCP
+Congestion Control (struct_ops), as of Aug 2021 - are required to be GPL
+compatible even if they don't use "GPL only" helper functions directly. The
+registration step of LSM and TCP congestion control modules of the Linux
+kernel is done through EXPORT_SYMBOL_GPL kernel functions. In that sense LSM
+and struct_ops BPF programs are implicitly calling "GPL only" functions.
+The same restriction applies to BPF programs that call kernel functions
+directly via the unstable interface also known as "kfuncs".
+
+Packaging BPF programs with user space applications
+====================================================
+
+Generally, proprietary-licensed applications and GPL licensed BPF programs
+written for the Linux kernel in the same package can co-exist because they are
+separate executable processes. This applies to both cBPF and eBPF programs.
diff --git a/Documentation/bpf/bpf_lsm.rst b/Documentation/bpf/bpf_lsm.rst
deleted file mode 100644
index 0dc3fb0d9544..000000000000
--- a/Documentation/bpf/bpf_lsm.rst
+++ /dev/null
@@ -1,143 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0+
-.. Copyright (C) 2020 Google LLC.
-
-================
-LSM BPF Programs
-================
-
-These BPF programs allow runtime instrumentation of the LSM hooks by privileged
-users to implement system-wide MAC (Mandatory Access Control) and Audit
-policies using eBPF.
-
-Structure
----------
-
-The example shows an eBPF program that can be attached to the ``file_mprotect``
-LSM hook:
-
-.. c:function:: int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot);
-
-Other LSM hooks which can be instrumented can be found in
-``include/linux/lsm_hooks.h``.
-
-eBPF programs that use Documentation/bpf/btf.rst do not need to include kernel
-headers for accessing information from the attached eBPF program's context.
-They can simply declare the structures in the eBPF program and only specify
-the fields that need to be accessed.
-
-.. code-block:: c
-
- struct mm_struct {
- unsigned long start_brk, brk, start_stack;
- } __attribute__((preserve_access_index));
-
- struct vm_area_struct {
- unsigned long start_brk, brk, start_stack;
- unsigned long vm_start, vm_end;
- struct mm_struct *vm_mm;
- } __attribute__((preserve_access_index));
-
-
-.. note:: The order of the fields is irrelevant.
-
-This can be further simplified (if one has access to the BTF information at
-build time) by generating the ``vmlinux.h`` with:
-
-.. code-block:: console
-
- # bpftool btf dump file <path-to-btf-vmlinux> format c > vmlinux.h
-
-.. note:: ``path-to-btf-vmlinux`` can be ``/sys/kernel/btf/vmlinux`` if the
- build environment matches the environment the BPF programs are
- deployed in.
-
-The ``vmlinux.h`` can then simply be included in the BPF programs without
-requiring the definition of the types.
-
-The eBPF programs can be declared using the``BPF_PROG``
-macros defined in `tools/lib/bpf/bpf_tracing.h`_. In this
-example:
-
- * ``"lsm/file_mprotect"`` indicates the LSM hook that the program must
- be attached to
- * ``mprotect_audit`` is the name of the eBPF program
-
-.. code-block:: c
-
- SEC("lsm/file_mprotect")
- int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
- unsigned long reqprot, unsigned long prot, int ret)
- {
- /* ret is the return value from the previous BPF program
- * or 0 if it's the first hook.
- */
- if (ret != 0)
- return ret;
-
- int is_heap;
-
- is_heap = (vma->vm_start >= vma->vm_mm->start_brk &&
- vma->vm_end <= vma->vm_mm->brk);
-
- /* Return an -EPERM or write information to the perf events buffer
- * for auditing
- */
- if (is_heap)
- return -EPERM;
- }
-
-The ``__attribute__((preserve_access_index))`` is a clang feature that allows
-the BPF verifier to update the offsets for the access at runtime using the
-Documentation/bpf/btf.rst information. Since the BPF verifier is aware of the
-types, it also validates all the accesses made to the various types in the
-eBPF program.
-
-Loading
--------
-
-eBPF programs can be loaded with the :manpage:`bpf(2)` syscall's
-``BPF_PROG_LOAD`` operation:
-
-.. code-block:: c
-
- struct bpf_object *obj;
-
- obj = bpf_object__open("./my_prog.o");
- bpf_object__load(obj);
-
-This can be simplified by using a skeleton header generated by ``bpftool``:
-
-.. code-block:: console
-
- # bpftool gen skeleton my_prog.o > my_prog.skel.h
-
-and the program can be loaded by including ``my_prog.skel.h`` and using
-the generated helper, ``my_prog__open_and_load``.
-
-Attachment to LSM Hooks
------------------------
-
-The LSM allows attachment of eBPF programs as LSM hooks using :manpage:`bpf(2)`
-syscall's ``BPF_RAW_TRACEPOINT_OPEN`` operation or more simply by
-using the libbpf helper ``bpf_program__attach_lsm``.
-
-The program can be detached from the LSM hook by *destroying* the ``link``
-link returned by ``bpf_program__attach_lsm`` using ``bpf_link__destroy``.
-
-One can also use the helpers generated in ``my_prog.skel.h`` i.e.
-``my_prog__attach`` for attachment and ``my_prog__destroy`` for cleaning up.
-
-Examples
---------
-
-An example eBPF program can be found in
-`tools/testing/selftests/bpf/progs/lsm.c`_ and the corresponding
-userspace code in `tools/testing/selftests/bpf/prog_tests/test_lsm.c`_
-
-.. Links
-.. _tools/lib/bpf/bpf_tracing.h:
- https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h
-.. _tools/testing/selftests/bpf/progs/lsm.c:
- https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/lsm.c
-.. _tools/testing/selftests/bpf/prog_tests/test_lsm.c:
- https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/test_lsm.c
diff --git a/Documentation/bpf/bpf_prog_run.rst b/Documentation/bpf/bpf_prog_run.rst
new file mode 100644
index 000000000000..4868c909df5c
--- /dev/null
+++ b/Documentation/bpf/bpf_prog_run.rst
@@ -0,0 +1,117 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+Running BPF programs from userspace
+===================================
+
+This document describes the ``BPF_PROG_RUN`` facility for running BPF programs
+from userspace.
+
+.. contents::
+ :local:
+ :depth: 2
+
+
+Overview
+--------
+
+The ``BPF_PROG_RUN`` command can be used through the ``bpf()`` syscall to
+execute a BPF program in the kernel and return the results to userspace. This
+can be used to unit test BPF programs against user-supplied context objects,
+and as a way to explicitly execute programs in the kernel for their side effects. The
+command was previously named ``BPF_PROG_TEST_RUN``, and both constants continue
+to be defined in the UAPI header, aliased to the same value.
+
+The ``BPF_PROG_RUN`` command can be used to execute BPF programs of the
+following types:
+
+- ``BPF_PROG_TYPE_SOCKET_FILTER``
+- ``BPF_PROG_TYPE_SCHED_CLS``
+- ``BPF_PROG_TYPE_SCHED_ACT``
+- ``BPF_PROG_TYPE_XDP``
+- ``BPF_PROG_TYPE_SK_LOOKUP``
+- ``BPF_PROG_TYPE_CGROUP_SKB``
+- ``BPF_PROG_TYPE_LWT_IN``
+- ``BPF_PROG_TYPE_LWT_OUT``
+- ``BPF_PROG_TYPE_LWT_XMIT``
+- ``BPF_PROG_TYPE_LWT_SEG6LOCAL``
+- ``BPF_PROG_TYPE_FLOW_DISSECTOR``
+- ``BPF_PROG_TYPE_STRUCT_OPS``
+- ``BPF_PROG_TYPE_RAW_TRACEPOINT``
+- ``BPF_PROG_TYPE_SYSCALL``
+
+When using the ``BPF_PROG_RUN`` command, userspace supplies an input context
+object and (for program types operating on network packets) a buffer containing
+the packet data that the BPF program will operate on. The kernel will then
+execute the program and return the results to userspace. Note that programs will
+not have any side effects while being run in this mode; in particular, packets
+will not actually be redirected or dropped, the program return code will just be
+returned to userspace. A separate mode for live execution of XDP programs is
+provided, documented separately below.
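+
+Through libbpf, this command is exposed as ``bpf_prog_test_run_opts()``. The
+following is a minimal sketch (assuming ``prog_fd`` refers to a loaded XDP
+program and ``pkt``/``pkt_len`` describe a valid Ethernet frame supplied by
+the caller):
+
+::
+
+  #include <bpf/bpf.h>
+
+  static int run_xdp_once(int prog_fd, void *pkt, __u32 pkt_len)
+  {
+      char out[1500];
+      LIBBPF_OPTS(bpf_test_run_opts, opts,
+          .data_in = pkt,
+          .data_size_in = pkt_len,
+          .data_out = out,
+          .data_size_out = sizeof(out),
+          .repeat = 1,
+      );
+      int err = bpf_prog_test_run_opts(prog_fd, &opts);
+
+      if (err)
+          return err;      /* syscall-level failure */
+      /* opts.retval holds the program's verdict, e.g. XDP_PASS */
+      return opts.retval;
+  }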
+
+Running XDP programs in "live frame mode"
+-----------------------------------------
+
+The ``BPF_PROG_RUN`` command has a separate mode for running live XDP programs,
+which can be used to execute XDP programs in a way where packets will actually
+be processed by the kernel after the execution of the XDP program as if they
+arrived on a physical interface. This mode is activated by setting the
+``BPF_F_TEST_XDP_LIVE_FRAMES`` flag when supplying an XDP program to
+``BPF_PROG_RUN``.
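+
+For illustration, activating this mode via libbpf might look like the following
+sketch (same assumptions as the example above; the exact semantics of this mode
+are described in the list below):
+
+.. code-block:: c
+
+    #include <bpf/bpf.h>
+    #include <linux/bpf.h>
+
+    /* Sketch: run an XDP program 2^20 times in live frame mode. */
+    static int run_live(int prog_fd, void *pkt, __u32 pkt_len)
+    {
+        LIBBPF_OPTS(bpf_test_run_opts, opts,
+            .data_in = pkt,
+            .data_size_in = pkt_len,
+            .repeat = 1 << 20,
+            .batch_size = 64, /* optional; this is the default */
+            .flags = BPF_F_TEST_XDP_LIVE_FRAMES,
+        );
+
+        /* Note: data_out/ctx_out must not be set in this mode. */
+        return bpf_prog_test_run_opts(prog_fd, &opts);
+    }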
+
+The live packet mode is optimised for high performance execution of the supplied
+XDP program many times (suitable for, e.g., running as a traffic generator),
+which means the semantics are not quite as straightforward as in the regular test
+run mode. Specifically:
+
+- When executing an XDP program in live frame mode, the result of the execution
+ will not be returned to userspace; instead, the kernel will perform the
+ operation indicated by the program's return code (drop the packet, redirect
+ it, etc). For this reason, setting the ``data_out`` or ``ctx_out`` attributes
+ in the syscall parameters when running in this mode will be rejected. In
+ addition, not all failures will be reported back to userspace directly;
+ specifically, only fatal errors in setup or during execution (like memory
+ allocation errors) will halt execution and return an error. If an error occurs
+ in packet processing, like a failure to redirect to a given interface,
+ execution will continue with the next repetition; these errors can be detected
+ via the same trace points as for regular XDP programs.
+
+- Userspace can supply an ifindex as part of the context object, just like in
+ the regular (non-live) mode. The XDP program will be executed as though the
+ packet arrived on this interface; i.e., the ``ingress_ifindex`` of the context
+ object will point to that interface. Furthermore, if the XDP program returns
+ ``XDP_PASS``, the packet will be injected into the kernel networking stack as
+ though it arrived on that ifindex, and if it returns ``XDP_TX``, the packet
+ will be transmitted *out* of that same interface. Do note, though, that
+ because the program execution is not happening in driver context, an
+ ``XDP_TX`` is actually turned into the same action as an ``XDP_REDIRECT`` to
+ that same interface (i.e., it will only work if the driver has support for the
+ ``ndo_xdp_xmit`` driver op).
+
+- When running the program with multiple repetitions, the execution will happen
+ in batches. The batch size defaults to 64 packets (which is the same as the
+ maximum NAPI receive batch size), but can be specified by userspace through
+ the ``batch_size`` parameter, up to a maximum of 256 packets. For each batch,
+ the kernel executes the XDP program repeatedly, each invocation getting a
+ separate copy of the packet data. For each repetition, if the program drops
+ the packet, the data page is immediately recycled (see below). Otherwise, the
+ packet is buffered until the end of the batch, at which point all packets
+ buffered this way during the batch are transmitted at once.
+
+- When setting up the test run, the kernel will initialise a pool of memory
+ pages of the same size as the batch size. Each memory page will be initialised
+ with the initial packet data supplied by userspace at ``BPF_PROG_RUN``
+ invocation. When possible, the pages will be recycled on future program
+ invocations, to improve performance. Pages will generally be recycled a full
+ batch at a time, except when a packet is dropped (by return code or because
+ of, say, a redirection error), in which case that page will be recycled
+ immediately. If a packet ends up being passed to the regular networking stack
+ (because the XDP program returns ``XDP_PASS``, or because it ends up being
+ redirected to an interface that injects it into the stack), the page will be
+ released and a new one will be allocated when the pool is empty.
+
+ When recycling, the page content is not rewritten; only the packet boundary
+ pointers (``data``, ``data_end`` and ``data_meta``) in the context object will
+ be reset to the original values. This means that if a program rewrites the
+ packet contents, it has to be prepared to see either the original content or
+ the modified version on subsequent invocations.
diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst
index 846354cd2d69..7cd7c5415a99 100644
--- a/Documentation/bpf/btf.rst
+++ b/Documentation/bpf/btf.rst
@@ -3,7 +3,7 @@ BPF Type Format (BTF)
=====================
1. Introduction
-***************
+===============
BTF (BPF Type Format) is the metadata format which encodes the debug info
related to BPF program/map. The name BTF was used initially to describe data
@@ -30,7 +30,7 @@ sections are discussed in details in :ref:`BTF_Type_String`.
.. _BTF_Type_String:
2. BTF Type and String Encoding
-*******************************
+===============================
The file ``include/uapi/linux/btf.h`` provides high-level definition of how
types/strings are encoded.
@@ -57,13 +57,13 @@ little-endian target. The ``btf_header`` is designed to be extensible with
generated.
2.1 String Encoding
-===================
+-------------------
The first string in the string section must be a null string. The rest of
string table is a concatenation of other null-terminated strings.
2.2 Type Encoding
-=================
+-----------------
The type id ``0`` is reserved for ``void`` type. The type section is parsed
sequentially and type id is assigned to each recognized type starting from id
@@ -74,7 +74,7 @@ sequentially and type id is assigned to each recognized type starting from id
#define BTF_KIND_ARRAY 3 /* Array */
#define BTF_KIND_STRUCT 4 /* Struct */
#define BTF_KIND_UNION 5 /* Union */
- #define BTF_KIND_ENUM 6 /* Enumeration */
+ #define BTF_KIND_ENUM 6 /* Enumeration up to 32-bit values */
#define BTF_KIND_FWD 7 /* Forward */
#define BTF_KIND_TYPEDEF 8 /* Typedef */
#define BTF_KIND_VOLATILE 9 /* Volatile */
@@ -85,6 +85,9 @@ sequentially and type id is assigned to each recognized type starting from id
#define BTF_KIND_VAR 14 /* Variable */
#define BTF_KIND_DATASEC 15 /* Section */
#define BTF_KIND_FLOAT 16 /* Floating point */
+ #define BTF_KIND_DECL_TAG 17 /* Decl Tag */
+ #define BTF_KIND_TYPE_TAG 18 /* Type Tag */
+ #define BTF_KIND_ENUM64 19 /* Enumeration up to 64-bit values */
Note that the type section encodes debug info, not just pure types.
``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram.
@@ -99,14 +102,14 @@ Each type contains the following common data::
* bits 24-28: kind (e.g. int, ptr, array...etc)
* bits 29-30: unused
* bit 31: kind_flag, currently used by
- * struct, union and fwd
+ * struct, union, fwd, enum and enum64.
*/
__u32 info;
- /* "size" is used by INT, ENUM, STRUCT and UNION.
+ /* "size" is used by INT, ENUM, STRUCT, UNION and ENUM64.
* "size" tells the size of the type it is describing.
*
* "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
- * FUNC and FUNC_PROTO.
+ * FUNC, FUNC_PROTO, DECL_TAG and TYPE_TAG.
* "type" is a type_id referring to another type.
*/
union {
@@ -279,10 +282,10 @@ modes exist:
``struct btf_type`` encoding requirement:
* ``name_off``: 0 or offset to a valid C identifier
- * ``info.kind_flag``: 0
+ * ``info.kind_flag``: 0 for unsigned, 1 for signed
* ``info.kind``: BTF_KIND_ENUM
* ``info.vlen``: number of enum values
- * ``size``: 4
+ * ``size``: 1/2/4/8
``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum``.::
@@ -295,6 +298,10 @@ The ``btf_enum`` encoding:
* ``name_off``: offset to a valid C identifier
* ``val``: any value
+If the original enum value is signed and the size is less than 4,
+that value will be sign extended into 4 bytes. If the size is 8,
+the value will be truncated into 4 bytes.
+
2.2.7 BTF_KIND_FWD
~~~~~~~~~~~~~~~~~~
@@ -362,7 +369,8 @@ No additional type data follow ``btf_type``.
* ``name_off``: offset to a valid C identifier
* ``info.kind_flag``: 0
* ``info.kind``: BTF_KIND_FUNC
- * ``info.vlen``: 0
+ * ``info.vlen``: linkage information (BTF_FUNC_STATIC, BTF_FUNC_GLOBAL
+ or BTF_FUNC_EXTERN)
* ``type``: a BTF_KIND_FUNC_PROTO type
No additional type data follow ``btf_type``.
@@ -373,6 +381,9 @@ type. The BTF_KIND_FUNC may in turn be referenced by a func_info in the
:ref:`BTF_Ext_Section` (ELF) or in the arguments to :ref:`BPF_Prog_Load`
(ABI).
+Currently, only linkage values of BTF_FUNC_STATIC and BTF_FUNC_GLOBAL are
+supported in the kernel.
+
2.2.13 BTF_KIND_FUNC_PROTO
~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -465,8 +476,83 @@ map definition.
No additional type data follow ``btf_type``.
+2.2.17 BTF_KIND_DECL_TAG
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+ * ``name_off``: offset to a non-empty string
+ * ``info.kind_flag``: 0
+ * ``info.kind``: BTF_KIND_DECL_TAG
+ * ``info.vlen``: 0
+ * ``type``: ``struct``, ``union``, ``func``, ``var`` or ``typedef``
+
+``btf_type`` is followed by ``struct btf_decl_tag``.::
+
+ struct btf_decl_tag {
+ __u32 component_idx;
+ };
+
+The ``name_off`` encodes the btf_decl_tag attribute string.
+The ``type`` should be ``struct``, ``union``, ``func``, ``var`` or ``typedef``.
+For ``var`` or ``typedef`` type, ``btf_decl_tag.component_idx`` must be ``-1``.
+For the other three types, if the btf_decl_tag attribute is
+applied to the ``struct``, ``union`` or ``func`` itself,
+``btf_decl_tag.component_idx`` must be ``-1``. Otherwise,
+the attribute is applied to a ``struct``/``union`` member or
+a ``func`` argument, and ``btf_decl_tag.component_idx`` should be a
+valid index (starting from 0) pointing to a member or an argument.
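+
+As an illustrative sketch (the tag and type names here are arbitrary), clang's
+``btf_decl_tag`` attribute produces these encodings::
+
+    #define __tag1 __attribute__((btf_decl_tag("tag1")))
+
+    struct t {
+        int a;
+        int b __tag1; /* tag on a member: component_idx == 1 */
+    } __tag1;         /* tag on the struct itself: component_idx == -1 */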
+
+2.2.18 BTF_KIND_TYPE_TAG
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+ * ``name_off``: offset to a non-empty string
+ * ``info.kind_flag``: 0
+ * ``info.kind``: BTF_KIND_TYPE_TAG
+ * ``info.vlen``: 0
+ * ``type``: the type with ``btf_type_tag`` attribute
+
+Currently, ``BTF_KIND_TYPE_TAG`` is only emitted for pointer types.
+It has the following btf type chain:
+::
+
+ ptr -> [type_tag]*
+ -> [const | volatile | restrict | typedef]*
+ -> base_type
+
+Basically, a pointer type points to zero or more
+type_tag, then zero or more const/volatile/restrict/typedef
+and finally the base type. The base type is one of
+int, ptr, array, struct, union, enum, func_proto and float types.
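+
+For example, the following sketch (the macro name is arbitrary) yields the
+chain ``ptr -> type_tag("user") -> int`` for ``p``::
+
+    #define __user_tag __attribute__((btf_type_tag("user")))
+
+    int __user_tag *p;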
+
+2.2.19 BTF_KIND_ENUM64
+~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+ * ``name_off``: 0 or offset to a valid C identifier
+ * ``info.kind_flag``: 0 for unsigned, 1 for signed
+ * ``info.kind``: BTF_KIND_ENUM64
+ * ``info.vlen``: number of enum values
+ * ``size``: 1/2/4/8
+
+``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum64``.::
+
+ struct btf_enum64 {
+ __u32 name_off;
+ __u32 val_lo32;
+ __u32 val_hi32;
+ };
+
+The ``btf_enum64`` encoding:
+ * ``name_off``: offset to a valid C identifier
+ * ``val_lo32``: lower 32-bit value for a 64-bit value
+ * ``val_hi32``: high 32-bit value for a 64-bit value
+
+If the original enum value is signed and the size is less than 8,
+that value will be sign extended into 8 bytes.
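+
+As a hypothetical example, the enumerator below does not fit into 32 bits and
+is therefore encoded as ``BTF_KIND_ENUM64``, with its value split across the
+two fields::
+
+    enum big {
+        BIG_VAL = 0x123456789abcdefULL, /* val_lo32 = 0x89abcdef,
+                                           val_hi32 = 0x01234567 */
+    };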
+
3. BTF Kernel API
-*****************
+=================
The following bpf syscall command involves BTF:
* BPF_BTF_LOAD: load a blob of BTF data into kernel
@@ -509,14 +595,14 @@ The workflow typically looks like:
3.1 BPF_BTF_LOAD
-================
+----------------
Load a blob of BTF data into kernel. A blob of data, described in
:ref:`BTF_Type_String`, can be directly loaded into the kernel. A ``btf_fd``
is returned to a userspace.
3.2 BPF_MAP_CREATE
-==================
+------------------
A map can be created with ``btf_fd`` and specified key/value type id.::
@@ -527,23 +613,20 @@ A map can be created with ``btf_fd`` and specified key/value type id.::
In libbpf, the map can be defined with extra annotation like below:
::
- struct bpf_map_def SEC("maps") btf_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(struct ipv_counts),
- .max_entries = 4,
- };
- BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts);
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, struct ipv_counts);
+ __uint(max_entries, 4);
+ } btf_map SEC(".maps");
-Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, key and
-value types for the map. During ELF parsing, libbpf is able to extract
-key/value type_id's and assign them to BPF_MAP_CREATE attributes
-automatically.
+During ELF parsing, libbpf is able to extract key/value type_id's and assign
+them to BPF_MAP_CREATE attributes automatically.
.. _BPF_Prog_Load:
3.3 BPF_PROG_LOAD
-=================
+-----------------
During prog_load, func_info and line_info can be passed to kernel with proper
values for the following attributes:
@@ -593,7 +676,7 @@ For line_info, the line number and column number are defined as below:
#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff)
3.4 BPF_{PROG,MAP}_GET_NEXT_ID
-==============================
+------------------------------
In kernel, every loaded program, map or btf has a unique id. The id won't
change during the lifetime of a program, map, or btf.
@@ -603,13 +686,13 @@ each command, to user space, for bpf program or maps, respectively, so an
inspection tool can inspect all programs and maps.
3.5 BPF_{PROG,MAP}_GET_FD_BY_ID
-===============================
+-------------------------------
An introspection tool cannot use id to get details about program or maps.
A file descriptor needs to be obtained first for reference-counting purpose.
3.6 BPF_OBJ_GET_INFO_BY_FD
-==========================
+--------------------------
Once a program/map fd is acquired, an introspection tool can get the detailed
information from kernel about this fd, some of which are BTF-related. For
@@ -618,7 +701,7 @@ example, ``bpf_map_info`` returns ``btf_id`` and key/value type ids.
bpf byte codes, and jited_line_info.
3.7 BPF_BTF_GET_FD_BY_ID
-========================
+------------------------
With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, bpf
syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. Then, with
@@ -630,10 +713,10 @@ tool has full btf knowledge and is able to pretty print map key/values, dump
func signatures and line info, along with byte/jit codes.
4. ELF File Format Interface
-****************************
+============================
4.1 .BTF section
-================
+----------------
The .BTF section contains type and string data. The format of this section is
same as the one describe in :ref:`BTF_Type_String`.
@@ -641,7 +724,7 @@ same as the one describe in :ref:`BTF_Type_String`.
.. _BTF_Ext_Section:
4.2 .BTF.ext section
-====================
+--------------------
The .BTF.ext section encodes func_info and line_info which needs loader
manipulation before loading into the kernel.
@@ -705,7 +788,7 @@ bpf_insn``. For ELF API, the ``insn_off`` is the byte offset from the
beginning of section (``btf_ext_info_sec->sec_name_off``).
4.2 .BTF_ids section
-====================
+--------------------
The .BTF_ids section encodes BTF ID values that are used within the kernel.
@@ -766,10 +849,10 @@ All the BTF ID lists and sets are compiled in the .BTF_ids section and
resolved during the linking phase of kernel build by ``resolve_btfids`` tool.
5. Using BTF
-************
+============
5.1 bpftool map pretty print
-============================
+----------------------------
With BTF, the map key/value can be printed based on fields rather than simply
raw bytes. This is especially valuable for large structure or if your data
@@ -786,13 +869,12 @@ structure has bitfields. For example, for the following map,::
___A b1:4;
enum A b2:4;
};
- struct bpf_map_def SEC("maps") tmpmap = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(__u32),
- .value_size = sizeof(struct tmp_t),
- .max_entries = 1,
- };
- BPF_ANNOTATE_KV_PAIR(tmpmap, int, struct tmp_t);
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, struct tmp_t);
+ __uint(max_entries, 1);
+ } tmpmap SEC(".maps");
bpftool is able to pretty print like below:
::
@@ -811,7 +893,7 @@ bpftool is able to pretty print like below:
]
5.2 bpftool prog dump
-=====================
+---------------------
The following is an example showing how func_info and line_info can help prog
dump with better kernel symbol names, function prototypes and line
@@ -845,7 +927,7 @@ information.::
[...]
5.3 Verifier Log
-================
+----------------
The following is an example of how line_info can help debugging verification
failure.::
@@ -871,7 +953,7 @@ failure.::
R2 offset is outside of the packet
6. BTF Generation
-*****************
+=================
You need latest pahole
@@ -978,6 +1060,11 @@ format.::
.long 8206 # Line 8 Col 14
7. Testing
-**********
+==========
+
+The kernel BPF selftest `tools/testing/selftests/bpf/prog_tests/btf.c`_
+provides an extensive set of BTF-related tests.
-Kernel bpf selftest `test_btf.c` provides extensive set of BTF-related tests.
+.. Links
+.. _tools/testing/selftests/bpf/prog_tests/btf.c:
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/btf.c
diff --git a/Documentation/bpf/clang-notes.rst b/Documentation/bpf/clang-notes.rst
new file mode 100644
index 000000000000..2c872a1ee08e
--- /dev/null
+++ b/Documentation/bpf/clang-notes.rst
@@ -0,0 +1,36 @@
+.. contents::
+.. sectnum::
+
+==========================
+Clang implementation notes
+==========================
+
+This document provides more details specific to the Clang/LLVM implementation of the eBPF instruction set.
+
+Versions
+========
+
+Clang defined "CPU" versions, where a CPU version of 3 corresponds to the current eBPF ISA.
+
+Clang can select the eBPF ISA version using, for example, ``-mcpu=v3`` to select version 3.
+
+Arithmetic instructions
+=======================
+
+For CPU versions prior to 3, Clang v7.0 and later can enable ``BPF_ALU`` support with
+``-Xclang -target-feature -Xclang +alu32``. In CPU version 3, support is automatically included.
+
+Jump instructions
+=================
+
+If ``-O0`` is used, Clang will generate the ``BPF_CALL | BPF_X | BPF_JMP`` (0x8d)
+instruction, which is not supported by the Linux kernel verifier.
+
+Atomic operations
+=================
+
+Clang can generate atomic instructions by default when ``-mcpu=v3`` is
+enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
+Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
+the atomics features, while keeping a lower ``-mcpu`` version, you can use
+``-Xclang -target-feature -Xclang +alu32``.
diff --git a/Documentation/bpf/classic_vs_extended.rst b/Documentation/bpf/classic_vs_extended.rst
new file mode 100644
index 000000000000..2f81a81f5267
--- /dev/null
+++ b/Documentation/bpf/classic_vs_extended.rst
@@ -0,0 +1,376 @@
+
+===================
+Classic BPF vs eBPF
+===================
+
+eBPF is designed to be JITed with one to one mapping, which can also open up
+the possibility for GCC/LLVM compilers to generate optimized eBPF code through
+an eBPF backend that performs almost as fast as natively compiled code.
+
+Some core changes of the eBPF format from classic BPF:
+
+- Number of registers increases from 2 to 10:
+
+ The old format had two registers A and X, and a hidden frame pointer. The
+ new layout extends this to be 10 internal registers and a read-only frame
+ pointer. Since 64-bit CPUs pass arguments to functions via registers, the
+ number of args from an eBPF program to an in-kernel function is restricted
+ to 5, and one register is used to accept the return value from an in-kernel
+ function. Natively, x86_64 passes the first 6 arguments in registers, aarch64/
+ sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved
+ registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers.
+
+ Thus, all eBPF registers map one to one to HW registers on x86_64, aarch64,
+ etc, and eBPF calling convention maps directly to ABIs used by the kernel on
+ 64-bit architectures.
+
+ On 32-bit architectures JIT may map programs that use only 32-bit arithmetic
+ and may let more complex programs be interpreted.
+
+ R0 - R5 are scratch registers and an eBPF program needs to spill/fill them if
+ necessary across calls. Note that there is only one eBPF program (== one
+ eBPF main routine) and it cannot call other eBPF functions; it can only
+ call predefined in-kernel functions.
+
+- Register width increases from 32-bit to 64-bit:
+
+ Still, the semantics of the original 32-bit ALU operations are preserved
+ via 32-bit subregisters. All eBPF registers are 64-bit with 32-bit lower
+ subregisters that zero-extend into 64-bit if they are being written to.
+ That behavior maps directly to x86_64 and arm64 subregister definition, but
+ makes other JITs more difficult.
+
+ 32-bit architectures run 64-bit eBPF programs via interpreter.
+ Their JITs may convert BPF programs that only use 32-bit subregisters into
+ native instruction set and let the rest be interpreted.
+
+ Operation is 64-bit, because on 64-bit architectures, pointers are also
+ 64-bit wide, and we want to pass 64-bit values in/out of kernel functions,
+ so 32-bit eBPF registers would otherwise require defining a register-pair
+ ABI; thus, it would not be possible to use a direct eBPF register to HW
+ register mapping, and the JIT would need to do combine/split/move operations
+ for every register in and out of the function, which is complex, bug prone
+ and slow.
+ Another reason is the use of atomic 64-bit counters.
+
+- Conditional jt/jf targets replaced with jt/fall-through:
+
+ While the original design has constructs such as ``if (cond) jump_true;
+ else jump_false;``, they are being replaced into alternative constructs like
+ ``if (cond) jump_true; /* else fall-through */``.
+
+- Introduces bpf_call insn and register passing convention for zero overhead
+ calls from/to other kernel functions:
+
+ Before an in-kernel function call, the eBPF program needs to
+ place function arguments into R1 to R5 registers to satisfy calling
+ convention, then the interpreter will take them from registers and pass
+ to in-kernel function. If R1 - R5 registers are mapped to CPU registers
+ that are used for argument passing on given architecture, the JIT compiler
+ doesn't need to emit extra moves. Function arguments will be in the correct
+ registers and BPF_CALL instruction will be JITed as single 'call' HW
+ instruction. This calling convention was picked to cover common call
+ situations without performance penalty.
+
+ After an in-kernel function call, R1 - R5 are reset to unreadable and R0 has
+ a return value of the function. Since R6 - R9 are callee saved, their state
+ is preserved across the call.
+
+ For example, consider three C functions::
+
+ u64 f1() { return (*_f2)(1); }
+ u64 f2(u64 a) { return f3(a + 1, a); }
+ u64 f3(u64 a, u64 b) { return a - b; }
+
+ GCC can compile f1, f3 into x86_64::
+
+ f1:
+ movl $1, %edi
+ movq _f2(%rip), %rax
+ jmp *%rax
+ f3:
+ movq %rdi, %rax
+ subq %rsi, %rax
+ ret
+
+ Function f2 in eBPF may look like::
+
+ f2:
+ bpf_mov R2, R1
+ bpf_add R1, 1
+ bpf_call f3
+ bpf_exit
+
+ If f2 is JITed and the pointer is stored to ``_f2``, the calls f1 -> f2 -> f3
+ and returns will be seamless. Without JIT, the __bpf_prog_run() interpreter needs to
+ be used to call into f2.
+
+ For practical reasons all eBPF programs have only one argument 'ctx' which is
+ already placed into R1 (e.g. on __bpf_prog_run() startup) and the programs
+ can call kernel functions with up to 5 arguments. Calls with 6 or more arguments
+ are currently not supported, but these restrictions can be lifted if necessary
+ in the future.
+
+ On 64-bit architectures all registers map to HW registers one to one. For
+ example, x86_64 JIT compiler can map them as ...
+
+ ::
+
+ R0 - rax
+ R1 - rdi
+ R2 - rsi
+ R3 - rdx
+ R4 - rcx
+ R5 - r8
+ R6 - rbx
+ R7 - r13
+ R8 - r14
+ R9 - r15
+ R10 - rbp
+
+ ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing
+ and rbx, r12 - r15 are callee saved.
+
+ Then the following eBPF pseudo-program::
+
+ bpf_mov R6, R1 /* save ctx */
+ bpf_mov R2, 2
+ bpf_mov R3, 3
+ bpf_mov R4, 4
+ bpf_mov R5, 5
+ bpf_call foo
+ bpf_mov R7, R0 /* save foo() return value */
+ bpf_mov R1, R6 /* restore ctx for next call */
+ bpf_mov R2, 6
+ bpf_mov R3, 7
+ bpf_mov R4, 8
+ bpf_mov R5, 9
+ bpf_call bar
+ bpf_add R0, R7
+ bpf_exit
+
+ After JITing to x86_64, the code may look like::
+
+ push %rbp
+ mov %rsp,%rbp
+ sub $0x228,%rsp
+ mov %rbx,-0x228(%rbp)
+ mov %r13,-0x220(%rbp)
+ mov %rdi,%rbx
+ mov $0x2,%esi
+ mov $0x3,%edx
+ mov $0x4,%ecx
+ mov $0x5,%r8d
+ callq foo
+ mov %rax,%r13
+ mov %rbx,%rdi
+ mov $0x6,%esi
+ mov $0x7,%edx
+ mov $0x8,%ecx
+ mov $0x9,%r8d
+ callq bar
+ add %r13,%rax
+ mov -0x228(%rbp),%rbx
+ mov -0x220(%rbp),%r13
+ leaveq
+ retq
+
+ In this example, this is equivalent in C to::
+
+ u64 bpf_filter(u64 ctx)
+ {
+ return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9);
+ }
+
+ In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64
+ arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper
+ registers and place their return value into ``%rax`` which is R0 in eBPF.
+ Prologue and epilogue are emitted by JIT and are implicit in the
+ interpreter. R0-R5 are scratch registers, so an eBPF program needs to save
+ them itself if their values are needed across calls, as defined by the
+ calling convention.
+
+ For example the following program is invalid::
+
+ bpf_mov R1, 1
+ bpf_call foo
+ bpf_mov R0, R1
+ bpf_exit
+
+ After the call the registers R1-R5 contain junk values and cannot be read.
+ An in-kernel verifier (see verifier.rst) is used to validate eBPF programs.
+
+Also in the new design, eBPF is limited to 4096 insns, which means that any
+program will terminate quickly and will only call a fixed number of kernel
+functions. Original BPF and eBPF use two-operand instructions,
+which helps to do one-to-one mapping between eBPF insn and x86 insn during JIT.
+
+The input context pointer for invoking the interpreter function is generic;
+its content is defined by a specific use case. For seccomp, register R1 points
+to seccomp_data; for converted BPF filters, R1 points to a skb.
+
+A program that is translated internally consists of the following elements::
+
+ op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32
+
+So far 87 eBPF instructions have been implemented. The 8-bit 'op' opcode field
+has room for new instructions. Some of them may use 16/24/32 byte encoding. New
+instructions must be a multiple of 8 bytes to preserve backward compatibility.
+
+eBPF is a general purpose RISC instruction set. Not every register and
+every instruction is used during translation from original BPF to eBPF.
+For example, socket filters do not use the ``exclusive add`` instruction, but
+tracing filters may do so to maintain event counters. Register R9
+is not used by socket filters either, but more complex filters may run
+out of registers and would have to resort to spill/fill to stack.
+
+eBPF can be used as a generic assembler for last step performance
+optimizations; socket filters and seccomp use it as an assembler. Tracing
+filters may use it as an assembler to generate code from the kernel. In-kernel
+usage may not be bounded by security considerations, since generated eBPF code
+may optimize internal code paths without being exposed to user space.
+Safety of eBPF can come from the verifier (see verifier.rst). In such use
+cases as described, it may be used as a safe instruction set.
+
+Just like the original BPF, eBPF runs within a controlled environment,
+is deterministic and the kernel can easily prove that. The safety of the program
+can be determined in two steps: first step does depth-first-search to disallow
+loops and other CFG validation; second step starts from the first insn and
+descends all possible paths. It simulates execution of every insn and observes
+the state change of registers and stack.
+
+opcode encoding
+===============
+
+eBPF is reusing most of the opcode encoding from classic to simplify conversion
+of classic BPF to eBPF.
+
+For arithmetic and jump instructions the 8-bit 'code' field is divided into three
+parts::
+
+ +----------------+--------+--------------------+
+ | 4 bits | 1 bit | 3 bits |
+ | operation code | source | instruction class |
+ +----------------+--------+--------------------+
+ (MSB) (LSB)
+
+Three LSB bits store instruction class which is one of:
+
+ =================== ===============
+ Classic BPF classes eBPF classes
+ =================== ===============
+ BPF_LD 0x00 BPF_LD 0x00
+ BPF_LDX 0x01 BPF_LDX 0x01
+ BPF_ST 0x02 BPF_ST 0x02
+ BPF_STX 0x03 BPF_STX 0x03
+ BPF_ALU 0x04 BPF_ALU 0x04
+ BPF_JMP 0x05 BPF_JMP 0x05
+ BPF_RET 0x06 BPF_JMP32 0x06
+ BPF_MISC 0x07 BPF_ALU64 0x07
+ =================== ===============
+
+The 4th bit encodes the source operand ...
+
+ ::
+
+ BPF_K 0x00
+ BPF_X 0x08
+
+ * in classic BPF, this means::
+
+ BPF_SRC(code) == BPF_X - use register X as source operand
+ BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand
+
+ * in eBPF, this means::
+
+ BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand
+ BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand
+
+... and four MSB bits store operation code.
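+
+As an illustrative sketch, these three fields can be extracted with the
+``BPF_CLASS()``, ``BPF_SRC()`` and ``BPF_OP()`` macros from
+``include/uapi/linux/bpf_common.h``. Decoding opcode ``0x2d``, for example::
+
+    #include <linux/bpf_common.h>
+
+    unsigned char code = 0x2d;
+
+    BPF_CLASS(code); /* 0x05 == BPF_JMP (3 LSB bits) */
+    BPF_SRC(code);   /* 0x08 == BPF_X   (4th bit)    */
+    BPF_OP(code);    /* 0x20 == BPF_JGT (4 MSB bits) */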
+
+If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of::
+
+ BPF_ADD 0x00
+ BPF_SUB 0x10
+ BPF_MUL 0x20
+ BPF_DIV 0x30
+ BPF_OR 0x40
+ BPF_AND 0x50
+ BPF_LSH 0x60
+ BPF_RSH 0x70
+ BPF_NEG 0x80
+ BPF_MOD 0x90
+ BPF_XOR 0xa0
+ BPF_MOV 0xb0 /* eBPF only: mov reg to reg */
+ BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */
+ BPF_END 0xd0 /* eBPF only: endianness conversion */
+
+If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of::
+
+ BPF_JA 0x00 /* BPF_JMP only */
+ BPF_JEQ 0x10
+ BPF_JGT 0x20
+ BPF_JGE 0x30
+ BPF_JSET 0x40
+ BPF_JNE 0x50 /* eBPF only: jump != */
+ BPF_JSGT 0x60 /* eBPF only: signed '>' */
+ BPF_JSGE 0x70 /* eBPF only: signed '>=' */
+ BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */
+ BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */
+ BPF_JLT 0xa0 /* eBPF only: unsigned '<' */
+ BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */
+ BPF_JSLT 0xc0 /* eBPF only: signed '<' */
+ BPF_JSLE 0xd0 /* eBPF only: signed '<=' */
+
+So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF
+and eBPF. There are only two registers in classic BPF, so it means A += X.
+In eBPF it means dst_reg = (u32) dst_reg + (u32) src_reg; similarly,
+BPF_XOR | BPF_K | BPF_ALU means A ^= imm32 in classic BPF and the analogous
+dst_reg = (u32) dst_reg ^ (u32) imm32 in eBPF.
+
+Classic BPF uses the BPF_MISC class to represent A = X and X = A moves.
+eBPF uses the BPF_MOV | BPF_X | BPF_ALU code instead. Since there are no
+BPF_MISC operations in eBPF, the class 7 is used as BPF_ALU64 to mean
+exactly the same operations as BPF_ALU, but with 64-bit wide operands
+instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.:
+dst_reg = dst_reg + src_reg
+
+Classic BPF wastes the whole BPF_RET class to represent a single ``ret``
+operation. Classic BPF_RET | BPF_K means copy imm32 into return register
+and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT
+in eBPF means function exit only. The eBPF program needs to store return
+value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as
+BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide
+operands for the comparisons instead.
+
+For load and store instructions the 8-bit 'code' field is divided as::
+
+ +--------+--------+-------------------+
+ | 3 bits | 2 bits | 3 bits |
+ | mode | size | instruction class |
+ +--------+--------+-------------------+
+ (MSB) (LSB)
+
+Size modifier is one of ...
+
+::
+
+ BPF_W 0x00 /* word */
+ BPF_H 0x08 /* half word */
+ BPF_B 0x10 /* byte */
+ BPF_DW 0x18 /* eBPF only, double word */
+
+... which encodes size of load/store operation::
+
+ B - 1 byte
+ H - 2 byte
+ W - 4 byte
+ DW - 8 byte (eBPF only)
+
+Mode modifier is one of::
+
+ BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */
+ BPF_ABS 0x20
+ BPF_IND 0x40
+ BPF_MEM 0x60
+ BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */
+ BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */
+ BPF_ATOMIC 0xc0 /* eBPF only, atomic operations */
diff --git a/Documentation/bpf/cpumasks.rst b/Documentation/bpf/cpumasks.rst
new file mode 100644
index 000000000000..41efd8874eeb
--- /dev/null
+++ b/Documentation/bpf/cpumasks.rst
@@ -0,0 +1,383 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _cpumasks-header-label:
+
+==================
+BPF cpumask kfuncs
+==================
+
+1. Introduction
+===============
+
+``struct cpumask`` is a bitmap data structure in the kernel whose indices
+reflect the CPUs on the system. Commonly, cpumasks are used to track which CPUs
+a task is affinitized to, but they can also be used to e.g. track which cores
+are associated with a scheduling domain, which cores on a machine are idle,
+etc.
+
+BPF provides programs with a set of :ref:`kfuncs-header-label` that can be
+used to allocate, mutate, query, and free cpumasks.
+
+2. BPF cpumask objects
+======================
+
+There are two different types of cpumasks that can be used by BPF programs.
+
+2.1 ``struct bpf_cpumask *``
+----------------------------
+
+``struct bpf_cpumask *`` is a cpumask that is allocated by BPF, on behalf of a
+BPF program, and whose lifecycle is entirely controlled by BPF. These cpumasks
+are RCU-protected, can be mutated, can be used as kptrs, and can be safely cast
+to a ``struct cpumask *``.
+
+2.1.1 ``struct bpf_cpumask *`` lifecycle
+----------------------------------------
+
+A ``struct bpf_cpumask *`` is allocated, acquired, and released, using the
+following functions:
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_create
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_acquire
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_release
+
+For example:
+
+.. code-block:: c
+
+ struct cpumask_map_value {
+ struct bpf_cpumask __kptr * cpumask;
+ };
+
+ struct array_map {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, struct cpumask_map_value);
+ __uint(max_entries, 65536);
+ } cpumask_map SEC(".maps");
+
+ static int cpumask_map_insert(struct bpf_cpumask *mask, u32 pid)
+ {
+ struct cpumask_map_value local, *v;
+ long status;
+ struct bpf_cpumask *old;
+ u32 key = pid;
+
+ local.cpumask = NULL;
+ status = bpf_map_update_elem(&cpumask_map, &key, &local, 0);
+ if (status) {
+ bpf_cpumask_release(mask);
+ return status;
+ }
+
+ v = bpf_map_lookup_elem(&cpumask_map, &key);
+ if (!v) {
+ bpf_cpumask_release(mask);
+ return -ENOENT;
+ }
+
+ old = bpf_kptr_xchg(&v->cpumask, mask);
+ if (old)
+ bpf_cpumask_release(old);
+
+ return 0;
+ }
+
+ /**
+ * A sample tracepoint showing how a task's cpumask can be queried and
+ * recorded as a kptr.
+ */
+ SEC("tp_btf/task_newtask")
+ int BPF_PROG(record_task_cpumask, struct task_struct *task, u64 clone_flags)
+ {
+ struct bpf_cpumask *cpumask;
+ int ret;
+
+ cpumask = bpf_cpumask_create();
+ if (!cpumask)
+ return -ENOMEM;
+
+ if (!bpf_cpumask_full(task->cpus_ptr))
+ bpf_printk("task %s has CPU affinity", task->comm);
+
+ bpf_cpumask_copy(cpumask, task->cpus_ptr);
+ return cpumask_map_insert(cpumask, task->pid);
+ }
+
+----
+
+2.1.2 ``struct bpf_cpumask *`` as kptrs
+---------------------------------------
+
+As mentioned and illustrated above, these ``struct bpf_cpumask *`` objects can
+also be stored in a map and used as kptrs. If a ``struct bpf_cpumask *`` is in
+a map, the reference can be removed from the map with bpf_kptr_xchg(), or
+opportunistically acquired using RCU:
+
+.. code-block:: c
+
+ /* struct containing the struct bpf_cpumask kptr which is stored in the map. */
+ struct cpumasks_kfunc_map_value {
+ struct bpf_cpumask __kptr * bpf_cpumask;
+ };
+
+ /* The map containing struct cpumasks_kfunc_map_value entries. */
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int);
+ __type(value, struct cpumasks_kfunc_map_value);
+ __uint(max_entries, 1);
+ } cpumasks_kfunc_map SEC(".maps");
+
+ /* ... */
+
+ /**
+ * A simple example tracepoint program showing how a
+ * struct bpf_cpumask * kptr that is stored in a map can
+ * be passed to kfuncs using RCU protection.
+ */
+ SEC("tp_btf/cgroup_mkdir")
+ int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path)
+ {
+ struct bpf_cpumask *kptr;
+ struct cpumasks_kfunc_map_value *v;
+ u32 key = 0;
+
+ /* Assume a bpf_cpumask * kptr was previously stored in the map. */
+ v = bpf_map_lookup_elem(&cpumasks_kfunc_map, &key);
+ if (!v)
+ return -ENOENT;
+
+ bpf_rcu_read_lock();
+ /* Acquire a reference to the bpf_cpumask * kptr that's already stored in the map. */
+ kptr = v->cpumask;
+ if (!kptr) {
+ /* If no bpf_cpumask was present in the map, it's because
+ * we're racing with another CPU that removed it with
+ * bpf_kptr_xchg() between the bpf_map_lookup_elem()
+ * above, and our load of the pointer from the map.
+ */
+ bpf_rcu_read_unlock();
+ return -EBUSY;
+ }
+
+ bpf_cpumask_setall(kptr);
+ bpf_rcu_read_unlock();
+
+ return 0;
+ }
+
+----
+
+2.2 ``struct cpumask``
+----------------------
+
+``struct cpumask`` is the object that actually contains the cpumask bitmap
+being queried, mutated, etc. A ``struct bpf_cpumask`` wraps a ``struct
+cpumask``, which is why it's safe to cast it as such (note however that it is
+**not** safe to cast a ``struct cpumask *`` to a ``struct bpf_cpumask *``, and
+the verifier will reject any program that tries to do so).
+
+As we'll see below, any kfunc that mutates its cpumask argument will take a
+``struct bpf_cpumask *`` as that argument. Any kfunc that simply queries the
+cpumask will instead take a ``struct cpumask *``.
+
+3. cpumask kfuncs
+=================
+
+Above, we described the kfuncs that can be used to allocate, acquire, release,
+etc., a ``struct bpf_cpumask *``. This section of the document will describe the
+kfuncs for mutating and querying cpumasks.
+
+3.1 Mutating cpumasks
+---------------------
+
+Some cpumask kfuncs are "read-only" in that they don't mutate any of their
+arguments, whereas others mutate at least one argument (which means that the
+argument must be a ``struct bpf_cpumask *``, as described above).
+
+This section will describe all of the cpumask kfuncs which mutate at least one
+argument. :ref:`cpumasks-querying-label` below describes the read-only kfuncs.
+
+3.1.1 Setting and clearing CPUs
+-------------------------------
+
+bpf_cpumask_set_cpu() and bpf_cpumask_clear_cpu() can be used to set and clear
+a CPU in a ``struct bpf_cpumask`` respectively:
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_set_cpu bpf_cpumask_clear_cpu
+
+These kfuncs are pretty straightforward, and can be used, for example, as
+follows:
+
+.. code-block:: c
+
+ /**
+ * A sample tracepoint showing how a cpumask can be queried.
+ */
+ SEC("tp_btf/task_newtask")
+ int BPF_PROG(test_set_clear_cpu, struct task_struct *task, u64 clone_flags)
+ {
+ struct bpf_cpumask *cpumask;
+
+ cpumask = bpf_cpumask_create();
+ if (!cpumask)
+ return -ENOMEM;
+
+ bpf_cpumask_set_cpu(0, cpumask);
+ if (!bpf_cpumask_test_cpu(0, (const struct cpumask *)cpumask))
+ /* Should never happen. */
+ goto release_exit;
+
+ bpf_cpumask_clear_cpu(0, cpumask);
+ if (bpf_cpumask_test_cpu(0, (const struct cpumask *)cpumask))
+ /* Should never happen. */
+ goto release_exit;
+
+ /* struct cpumask * pointers such as task->cpus_ptr can also be queried. */
+ if (bpf_cpumask_test_cpu(0, task->cpus_ptr))
+ bpf_printk("task %s can use CPU %d", task->comm, 0);
+
+ release_exit:
+ bpf_cpumask_release(cpumask);
+ return 0;
+ }
+
+----
+
+bpf_cpumask_test_and_set_cpu() and bpf_cpumask_test_and_clear_cpu() are
+complementary kfuncs that allow callers to atomically test and set (or clear)
+CPUs:
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_test_and_set_cpu bpf_cpumask_test_and_clear_cpu
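+
+For example, a brief sketch in the same style as the set/clear example above
+(the program name is arbitrary) could exercise both kfuncs as follows:
+
+.. code-block:: c
+
+    /**
+     * A sample tracepoint showing how CPUs can be atomically tested and
+     * set (or cleared).
+     */
+    SEC("tp_btf/task_newtask")
+    int BPF_PROG(test_test_and_set_clear, struct task_struct *task, u64 clone_flags)
+    {
+        struct bpf_cpumask *cpumask;
+
+        cpumask = bpf_cpumask_create();
+        if (!cpumask)
+            return -ENOMEM;
+
+        /* CPU 0 is clear in a fresh cpumask, so this must return false. */
+        if (bpf_cpumask_test_and_set_cpu(0, cpumask))
+            /* Should never happen. */
+            goto release_exit;
+
+        /* The bit is now set, so this must return true (and clear it). */
+        if (!bpf_cpumask_test_and_clear_cpu(0, cpumask))
+            /* Should never happen. */
+            goto release_exit;
+
+    release_exit:
+        bpf_cpumask_release(cpumask);
+        return 0;
+    }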
+
+----
+
+We can also set and clear entire ``struct bpf_cpumask *`` objects in one
+operation using bpf_cpumask_setall() and bpf_cpumask_clear():
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_setall bpf_cpumask_clear
+
+3.1.2 Operations between cpumasks
+---------------------------------
+
+In addition to setting and clearing individual CPUs in a single cpumask,
+callers can also perform bitwise operations between multiple cpumasks using
+bpf_cpumask_and(), bpf_cpumask_or(), and bpf_cpumask_xor():
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_and bpf_cpumask_or bpf_cpumask_xor
+
+The following is an example of how they may be used. Note that some of the
+kfuncs shown in this example will be covered in more detail below.
+
+.. code-block:: c
+
+ /**
+ * A sample tracepoint showing how a cpumask can be mutated using
+ * bitwise operators (and queried).
+ */
+ SEC("tp_btf/task_newtask")
+ int BPF_PROG(test_and_or_xor, struct task_struct *task, u64 clone_flags)
+ {
+ struct bpf_cpumask *mask1, *mask2, *dst1, *dst2;
+
+ mask1 = bpf_cpumask_create();
+ if (!mask1)
+ return -ENOMEM;
+
+ mask2 = bpf_cpumask_create();
+ if (!mask2) {
+ bpf_cpumask_release(mask1);
+ return -ENOMEM;
+ }
+
+ /* ...safely create the other two masks, dst1 and dst2... */
+
+ bpf_cpumask_set_cpu(0, mask1);
+ bpf_cpumask_set_cpu(1, mask2);
+ bpf_cpumask_and(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2);
+ if (!bpf_cpumask_empty((const struct cpumask *)dst1))
+ /* Should never happen. */
+ goto release_exit;
+
+ bpf_cpumask_or(dst1, (const struct cpumask *)mask1, (const struct cpumask *)mask2);
+ if (!bpf_cpumask_test_cpu(0, (const struct cpumask *)dst1))
+ /* Should never happen. */
+ goto release_exit;
+
+ if (!bpf_cpumask_test_cpu(1, (const struct cpumask *)dst1))
+ /* Should never happen. */
+ goto release_exit;
+
+ bpf_cpumask_xor(dst2, (const struct cpumask *)mask1, (const struct cpumask *)mask2);
+ if (!bpf_cpumask_equal((const struct cpumask *)dst1,
+ (const struct cpumask *)dst2))
+ /* Should never happen. */
+ goto release_exit;
+
+ release_exit:
+ bpf_cpumask_release(mask1);
+ bpf_cpumask_release(mask2);
+ bpf_cpumask_release(dst1);
+ bpf_cpumask_release(dst2);
+ return 0;
+ }
+
+----
+
+The contents of an entire cpumask may be copied to another using
+bpf_cpumask_copy():
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_copy
+
+----
+
+.. _cpumasks-querying-label:
+
+3.2 Querying cpumasks
+---------------------
+
+In addition to the above kfuncs, there is also a set of read-only kfuncs that
+can be used to query the contents of cpumasks.
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_first bpf_cpumask_first_zero bpf_cpumask_test_cpu
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_equal bpf_cpumask_intersects bpf_cpumask_subset
+ bpf_cpumask_empty bpf_cpumask_full
+
+.. kernel-doc:: kernel/bpf/cpumask.c
+ :identifiers: bpf_cpumask_any bpf_cpumask_any_and
+
+----
+
+Some example usages of these querying kfuncs were shown above. We will not
+replicate those examples here. Note, however, that all of the aforementioned
+kfuncs are tested in `tools/testing/selftests/bpf/progs/cpumask_success.c`_, so
+please take a look there if you're looking for more examples of how they can be
+used.
+
+.. _tools/testing/selftests/bpf/progs/cpumask_success.c:
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/cpumask_success.c
+
+
+4. Adding BPF cpumask kfuncs
+============================
+
+The set of supported BPF cpumask kfuncs is not (yet) a 1-1 match with the
+cpumask operations in include/linux/cpumask.h. Any of those cpumask operations
+could easily be encapsulated in a new kfunc if and when required. If you'd like
+to support a new cpumask operation, please feel free to submit a patch. If you
+do add a new cpumask kfunc, please document it here, and add any relevant
+selftest testcases to the cpumask selftest suite.
diff --git a/Documentation/bpf/faq.rst b/Documentation/bpf/faq.rst
new file mode 100644
index 000000000000..a622602ce9ad
--- /dev/null
+++ b/Documentation/bpf/faq.rst
@@ -0,0 +1,11 @@
+================================
+Frequently asked questions (FAQ)
+================================
+
+Two sets of Questions and Answers (Q&A) are maintained.
+
+.. toctree::
+ :maxdepth: 1
+
+ bpf_design_QA
+ bpf_devel_QA
diff --git a/Documentation/bpf/graph_ds_impl.rst b/Documentation/bpf/graph_ds_impl.rst
new file mode 100644
index 000000000000..61274622b71d
--- /dev/null
+++ b/Documentation/bpf/graph_ds_impl.rst
@@ -0,0 +1,267 @@
+=========================
+BPF Graph Data Structures
+=========================
+
+This document describes implementation details of new-style "graph" data
+structures (linked_list, rbtree), with particular focus on the verifier's
+implementation of semantics specific to those data structures.
+
+Although no specific verifier code is referred to in this document, the document
+assumes that the reader has general knowledge of BPF verifier internals, BPF
+maps, and BPF program writing.
+
+Note that the intent of this document is to describe the current state of
+these graph data structures. **No guarantees** of stability for either
+semantics or APIs are made or implied here.
+
+.. contents::
+ :local:
+ :depth: 2
+
+Introduction
+------------
+
+The BPF map API has historically been the main way to expose data structures
+of various types for use within BPF programs. Some data structures fit naturally
+with the map API (HASH, ARRAY), others less so. Consequently, programs
+interacting with the latter group of data structures can be hard to parse
+for kernel programmers without previous BPF experience.
+
+Luckily, some restrictions which necessitated the use of BPF map semantics are
+no longer relevant. With the introduction of kfuncs, kptrs, and the any-context
+BPF allocator, it is now possible to implement BPF data structures whose API
+and semantics more closely match those exposed to the rest of the kernel.
+
+Two such data structures - linked_list and rbtree - have many verification
+details in common. Because both have "root"s ("head" for linked_list) and
+"node"s, the verifier code and this document refer to common functionality
+as "graph_api", "graph_root", "graph_node", etc.
+
+Unless otherwise stated, examples and semantics below apply to both graph data
+structures.
+
+Unstable API
+------------
+
+Data structures implemented using the BPF map API have historically used BPF
+helper functions - either standard map API helpers like ``bpf_map_update_elem``
+or map-specific helpers. The new-style graph data structures instead use kfuncs
+to define their manipulation helpers. Because there are no stability guarantees
+for kfuncs, the API and semantics for these data structures can be evolved in
+a way that breaks backwards compatibility if necessary.
+
+Root and node types for the new data structures are opaquely defined in the
+``uapi/linux/bpf.h`` header.
+
+Locking
+-------
+
+The new-style data structures are intrusive and are defined similarly to their
+vanilla kernel counterparts:
+
+.. code-block:: c
+
+ struct node_data {
+ long key;
+ long data;
+ struct bpf_rb_node node;
+ };
+
+ struct bpf_spin_lock glock;
+ struct bpf_rb_root groot __contains(node_data, node);
+
+The "root" type for both linked_list and rbtree expects to be in a map_value
+which also contains a ``bpf_spin_lock`` - in the above example both global
+variables are placed in a single-value arraymap. The verifier considers this
+spin_lock to be associated with the ``bpf_rb_root`` by virtue of both being in
+the same map_value and will enforce that the correct lock is held when
+verifying BPF programs that manipulate the tree. Since this lock checking
+happens at verification time, there is no runtime penalty.
+
+Non-owning references
+---------------------
+
+**Motivation**
+
+Consider the following BPF code:
+
+.. code-block:: c
+
+ struct node_data *n = bpf_obj_new(typeof(*n)); /* ACQUIRED */
+
+ bpf_spin_lock(&lock);
+
+ bpf_rbtree_add(&tree, n); /* PASSED */
+
+ bpf_spin_unlock(&lock);
+
+From the verifier's perspective, the pointer ``n`` returned from ``bpf_obj_new``
+has type ``PTR_TO_BTF_ID | MEM_ALLOC``, with a ``btf_id`` of
+``struct node_data`` and a nonzero ``ref_obj_id``. Because it holds ``n``, the
+program has ownership of the pointee's (object pointed to by ``n``) lifetime.
+The BPF program must pass off ownership before exiting - either via
+``bpf_obj_drop``, which ``free``'s the object, or by adding it to ``tree`` with
+``bpf_rbtree_add``.
+
+(``ACQUIRED`` and ``PASSED`` comments in the example denote statements where
+"ownership is acquired" and "ownership is passed", respectively)
+
+What should the verifier do with ``n`` after ownership is passed off? If the
+object was ``free``'d with ``bpf_obj_drop`` the answer is obvious: the verifier
+should reject programs which attempt to access ``n`` after ``bpf_obj_drop`` as
+the object is no longer valid. The underlying memory may have been reused for
+some other allocation, unmapped, etc.
+
+When ownership is passed to ``tree`` via ``bpf_rbtree_add`` the answer is less
+obvious. The verifier could enforce the same semantics as for ``bpf_obj_drop``,
+but that would result in programs with useful, common coding patterns being
+rejected, e.g.:
+
+.. code-block:: c
+
+ int x;
+ struct node_data *n = bpf_obj_new(typeof(*n)); /* ACQUIRED */
+
+ bpf_spin_lock(&lock);
+
+ bpf_rbtree_add(&tree, n); /* PASSED */
+ x = n->data;
+ n->data = 42;
+
+ bpf_spin_unlock(&lock);
+
+Both the read from and write to ``n->data`` would be rejected. The verifier
+can do better, though, by taking advantage of two details:
+
+ * Graph data structure APIs can only be used when the ``bpf_spin_lock``
+ associated with the graph root is held
+
+ * Both graph data structures have pointer stability
+
+ * Because graph nodes are allocated with ``bpf_obj_new`` and
+ adding / removing from the root involves fiddling with the
+ ``bpf_{list,rb}_node`` field of the node struct, a graph node will
+ remain at the same address after either operation.
+
+Because the associated ``bpf_spin_lock`` must be held by any program adding
+or removing, if we're in the critical section bounded by that lock, we know
+that no other program can add or remove until the end of the critical section.
+This combined with pointer stability means that, until the critical section
+ends, we can safely access the graph node through ``n`` even after it was used
+to pass ownership.
+
+The verifier considers such a reference a *non-owning reference*. The ref
+returned by ``bpf_obj_new`` is accordingly considered an *owning reference*.
+Both terms currently only have meaning in the context of graph nodes and API.
+
+**Details**
+
+Let's enumerate the properties of both types of references.
+
+*owning reference*
+
+ * This reference controls the lifetime of the pointee
+
+ * Ownership of pointee must be 'released' by passing it to some graph API
+ kfunc, or via ``bpf_obj_drop``, which ``free``'s the pointee
+
+ * If not released before program ends, verifier considers program invalid
+
+ * Access to the pointee's memory will not page fault
+
+*non-owning reference*
+
+ * This reference does not own the pointee
+
+ * It cannot be used to add the graph node to a graph root, nor ``free``'d via
+ ``bpf_obj_drop``
+
+ * No explicit control of lifetime, but can infer valid lifetime based on
+ non-owning ref existence (see explanation below)
+
+ * Access to the pointee's memory will not page fault
+
+From the verifier's perspective, non-owning references can only exist
+between spin_lock and spin_unlock. Why? After spin_unlock another program
+can do arbitrary operations on the data structure like removing and ``free``-ing
+via bpf_obj_drop. A non-owning ref to some chunk of memory that was remove'd,
+``free``'d, and reused via bpf_obj_new would point to an entirely different thing.
+Or the memory could go away.
+
+To prevent this logic violation all non-owning references are invalidated by the
+verifier after a critical section ends. This is necessary to ensure the "will
+not page fault" property of non-owning references. So if the verifier hasn't
+invalidated a non-owning ref, accessing it will not page fault.
+
+Currently ``bpf_obj_drop`` is not allowed in the critical section, so
+if there's a valid non-owning ref, we must be in a critical section, and can
+conclude that the ref's memory hasn't been dropped-and- ``free``'d or
+dropped-and-reused.
+
+Any reference to a node that is in an rbtree _must_ be non-owning, since
+the tree has control of the pointee's lifetime. Similarly, any ref to a node
+that isn't in rbtree _must_ be owning. This results in a nice property:
+graph API add / remove implementations don't need to check if a node
+has already been added (or already removed), as the ownership model
+allows the verifier to prevent such a state from being valid by simply checking
+types.
+
+However, pointer aliasing poses an issue for the above "nice property".
+Consider the following example:
+
+.. code-block:: c
+
+ struct node_data *n, *m, *o, *p;
+ n = bpf_obj_new(typeof(*n)); /* 1 */
+
+ bpf_spin_lock(&lock);
+
+ bpf_rbtree_add(&tree, n); /* 2 */
+ m = bpf_rbtree_first(&tree); /* 3 */
+
+ o = bpf_rbtree_remove(&tree, n); /* 4 */
+ p = bpf_rbtree_remove(&tree, m); /* 5 */
+
+ bpf_spin_unlock(&lock);
+
+ bpf_obj_drop(o);
+ bpf_obj_drop(p); /* 6 */
+
+Assume the tree is empty before this program runs. If we track verifier state
+changes here using numbers in above comments:
+
+ 1) n is an owning reference
+
+ 2) n is a non-owning reference, it's been added to the tree
+
+ 3) n and m are non-owning references, they both point to the same node
+
+ 4) o is an owning reference, n and m non-owning, all point to same node
+
+ 5) o and p are owning, n and m non-owning, all point to the same node
+
+ 6) a double-free has occurred, since o and p point to the same node and o was
+ ``free``'d in the previous statement
+
+States 4 and 5 violate our "nice property", as there are non-owning refs to
+a node which is not in an rbtree. Statement 5 will try to remove a node which
+has already been removed as a result of this violation. State 6 is a dangerous
+double-free.
+
+At a minimum we should prevent state 6 from being possible. If we can't also
+prevent state 5 then we must abandon our "nice property" and check whether a
+node has already been removed at runtime.
+
+We prevent both by generalizing the "invalidate non-owning references" behavior
+of ``bpf_spin_unlock`` and doing similar invalidation after
+``bpf_rbtree_remove``. The logic here being that any graph API kfunc which:
+
+ * takes an arbitrary node argument
+
+ * removes it from the data structure
+
+ * returns an owning reference to the removed node
+
+May result in a state where some other non-owning reference points to the same
+node. So ``remove``-type kfuncs must be considered a non-owning reference
+invalidation point as well.
diff --git a/Documentation/bpf/helpers.rst b/Documentation/bpf/helpers.rst
new file mode 100644
index 000000000000..c4ee0cc20dec
--- /dev/null
+++ b/Documentation/bpf/helpers.rst
@@ -0,0 +1,7 @@
+Helper functions
+================
+
+* `bpf-helpers(7)`_ maintains a list of helpers available to eBPF programs.
+
+.. Links
+.. _bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html
\ No newline at end of file
diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index 1ceb5d704a97..dbb39e8f9889 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -5,95 +5,40 @@ BPF Documentation
This directory contains documentation for the BPF (Berkeley Packet
Filter) facility, with a focus on the extended BPF version (eBPF).
-This kernel side documentation is still work in progress. The main
-textual documentation is (for historical reasons) described in
-:ref:`networking-filter`, which describe both classical and extended
-BPF instruction-set.
+This kernel side documentation is still work in progress.
The Cilium project also maintains a `BPF and XDP Reference Guide`_
that goes into great technical depth about the BPF Architecture.
-libbpf
-======
-
-Documentation/bpf/libbpf/libbpf.rst is a userspace library for loading and interacting with bpf programs.
-
-BPF Type Format (BTF)
-=====================
-
.. toctree::
:maxdepth: 1
+ instruction-set
+ verifier
+ libbpf/index
btf
-
-
-Frequently asked questions (FAQ)
-================================
-
-Two sets of Questions and Answers (Q&A) are maintained.
-
-.. toctree::
- :maxdepth: 1
-
- bpf_design_QA
- bpf_devel_QA
-
-Syscall API
-===========
-
-The primary info for the bpf syscall is available in the `man-pages`_
-for `bpf(2)`_. For more information about the userspace API, see
-Documentation/userspace-api/ebpf/index.rst.
-
-Helper functions
-================
-
-* `bpf-helpers(7)`_ maintains a list of helpers available to eBPF programs.
-
-
-Program types
-=============
-
-.. toctree::
- :maxdepth: 1
-
- prog_cgroup_sockopt
- prog_cgroup_sysctl
- prog_flow_dissector
- bpf_lsm
- prog_sk_lookup
-
-
-Map types
-=========
-
-.. toctree::
- :maxdepth: 1
-
- map_cgroup_storage
-
-
-Testing and debugging BPF
-=========================
-
-.. toctree::
- :maxdepth: 1
-
- drgn
- s390
-
-
-Other
-=====
-
-.. toctree::
- :maxdepth: 1
-
- ringbuf
- llvm_reloc
+ faq
+ syscall_api
+ helpers
+ kfuncs
+ cpumasks
+ programs
+ maps
+ bpf_prog_run
+ classic_vs_extended.rst
+ bpf_iterators
+ bpf_licensing
+ test_debug
+ clang-notes
+ linux-notes
+ other
+ redirect
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
.. Links:
-.. _networking-filter: ../networking/filter.rst
-.. _man-pages: https://www.kernel.org/doc/man-pages/
-.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html
-.. _bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html
.. _BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/
diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst
new file mode 100644
index 000000000000..492980ece1ab
--- /dev/null
+++ b/Documentation/bpf/instruction-set.rst
@@ -0,0 +1,475 @@
+.. contents::
+.. sectnum::
+
+========================================
+eBPF Instruction Set Specification, v1.0
+========================================
+
+This document specifies version 1.0 of the eBPF instruction set.
+
+Documentation conventions
+=========================
+
+For brevity, this document uses the type notation "u64", "u32", etc.
+to mean an unsigned integer whose width is the specified number of bits,
+and "s32", etc. to mean a signed integer of the specified number of bits.
+
+Registers and calling convention
+================================
+
+eBPF has 10 general purpose registers and a read-only frame pointer register,
+all of which are 64-bits wide.
+
+The eBPF calling convention is defined as:
+
+* R0: return value from function calls, and exit value for eBPF programs
+* R1 - R5: arguments for function calls
+* R6 - R9: callee saved registers that function calls will preserve
+* R10: read-only frame pointer to access stack
+
+R0 - R5 are scratch registers and eBPF programs need to spill/fill them if
+necessary across calls.
+
+Instruction encoding
+====================
+
+eBPF has two instruction encodings:
+
+* the basic instruction encoding, which uses 64 bits to encode an instruction
+* the wide instruction encoding, which appends a second 64-bit immediate (i.e.,
+ constant) value after the basic instruction for a total of 128 bits.
+
+The fields of an encoded basic instruction are stored in the
+following order::
+
+ opcode:8 src_reg:4 dst_reg:4 offset:16 imm:32 // In little-endian BPF.
+ opcode:8 dst_reg:4 src_reg:4 offset:16 imm:32 // In big-endian BPF.
+
+**imm**
+ signed integer immediate value
+
+**offset**
+ signed integer offset used with pointer arithmetic
+
+**src_reg**
+ the source register number (0-10), except where otherwise specified
+ (`64-bit immediate instructions`_ reuse this field for other purposes)
+
+**dst_reg**
+ destination register number (0-10)
+
+**opcode**
+ operation to perform
+
+Note that the contents of multi-byte fields ('imm' and 'offset') are
+stored using big-endian byte ordering in big-endian BPF and
+little-endian byte ordering in little-endian BPF.
+
+For example::
+
+ opcode offset imm assembly
+ src_reg dst_reg
+ 07 0 1 00 00 44 33 22 11 r1 += 0x11223344 // little
+ dst_reg src_reg
+ 07 1 0 00 00 11 22 33 44 r1 += 0x11223344 // big
+
+Note that most instructions do not use all of the fields.
+Unused fields shall be cleared to zero.
+
+As discussed below in `64-bit immediate instructions`_, a 64-bit immediate
+instruction uses a 64-bit immediate value that is constructed as follows.
+The 64 bits following the basic instruction contain a pseudo instruction
+using the same format but with opcode, dst_reg, src_reg, and offset all set to zero,
+and imm containing the high 32 bits of the immediate value.
+
+This is depicted in the following figure::
+
+ basic_instruction
+ .-----------------------------.
+ | |
+ code:8 regs:8 offset:16 imm:32 unused:32 imm:32
+ | |
+ '--------------'
+ pseudo instruction
+
+Thus the 64-bit immediate value is constructed as follows:
+
+ imm64 = (next_imm << 32) | imm
+
+where 'next_imm' refers to the imm value of the pseudo instruction
+following the basic instruction. The unused bytes in the pseudo
+instruction are reserved and shall be cleared to zero.
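+
+As a sketch in C-like pseudocode, and taking care that 'imm' is a signed
+32-bit field, both halves must be zero-extended before being combined::
+
+  /* Zero-extend both 32-bit halves so that a negative 'imm' does not
+   * smear sign bits into the upper half of the result.
+   */
+  imm64 = ((u64) (u32) next_imm << 32) | (u32) imm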
+
+Instruction classes
+-------------------
+
+The three LSB bits of the 'opcode' field store the instruction class:
+
+========= ===== =============================== ===================================
+class value description reference
+========= ===== =============================== ===================================
+BPF_LD 0x00 non-standard load operations `Load and store instructions`_
+BPF_LDX 0x01 load into register operations `Load and store instructions`_
+BPF_ST 0x02 store from immediate operations `Load and store instructions`_
+BPF_STX 0x03 store from register operations `Load and store instructions`_
+BPF_ALU 0x04 32-bit arithmetic operations `Arithmetic and jump instructions`_
+BPF_JMP 0x05 64-bit jump operations `Arithmetic and jump instructions`_
+BPF_JMP32 0x06 32-bit jump operations `Arithmetic and jump instructions`_
+BPF_ALU64 0x07 64-bit arithmetic operations `Arithmetic and jump instructions`_
+========= ===== =============================== ===================================
+
+Arithmetic and jump instructions
+================================
+
+For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and
+``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts:
+
+============== ====== =================
+4 bits (MSB) 1 bit 3 bits (LSB)
+============== ====== =================
+code source instruction class
+============== ====== =================
+
+**code**
+ the operation code, whose meaning varies by instruction class
+
+**source**
+ the source operand location, which unless otherwise specified is one of:
+
+ ====== ===== ==============================================
+ source value description
+ ====== ===== ==============================================
+ BPF_K 0x00 use 32-bit 'imm' value as source operand
+ BPF_X 0x08 use 'src_reg' register value as source operand
+ ====== ===== ==============================================
+
+**instruction class**
+ the instruction class (see `Instruction classes`_)
+
+Arithmetic instructions
+-----------------------
+
+``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for
+otherwise identical operations.
+The 'code' field encodes the operation as below, where 'src' and 'dst' refer
+to the values of the source and destination registers, respectively.
+
+======== ===== ==========================================================
+code value description
+======== ===== ==========================================================
+BPF_ADD 0x00 dst += src
+BPF_SUB 0x10 dst -= src
+BPF_MUL 0x20 dst \*= src
+BPF_DIV 0x30 dst = (src != 0) ? (dst / src) : 0
+BPF_OR 0x40 dst \|= src
+BPF_AND 0x50 dst &= src
+BPF_LSH 0x60 dst <<= src
+BPF_RSH 0x70 dst >>= src
+BPF_NEG 0x80 dst = -dst
+BPF_MOD 0x90 dst = (src != 0) ? (dst % src) : dst
+BPF_XOR 0xa0 dst ^= src
+BPF_MOV 0xb0 dst = src
+BPF_ARSH 0xc0 sign extending shift right
+BPF_END 0xd0 byte swap operations (see `Byte swap instructions`_ below)
+======== ===== ==========================================================
+
+Underflow and overflow are allowed during arithmetic operations, meaning
+the 64-bit or 32-bit value will wrap. If eBPF program execution would
+result in division by zero, the destination register is instead set to zero.
+If execution would result in modulo by zero, for ``BPF_ALU64`` the value of
+the destination register is unchanged whereas for ``BPF_ALU`` the upper
+32 bits of the destination register are zeroed.
+
+``BPF_ADD | BPF_X | BPF_ALU`` means::
+
+ dst = (u32) ((u32) dst + (u32) src)
+
+where '(u32)' indicates that the upper 32 bits are zeroed.
+
+``BPF_ADD | BPF_X | BPF_ALU64`` means::
+
+ dst = dst + src
+
+``BPF_XOR | BPF_K | BPF_ALU`` means::
+
+ dst = (u32) dst ^ (u32) imm32
+
+``BPF_XOR | BPF_K | BPF_ALU64`` means::
+
+ dst = dst ^ imm32
+
+Also note that the division and modulo operations are unsigned. Thus, for
+``BPF_ALU``, 'imm' is first interpreted as an unsigned 32-bit value, whereas
+for ``BPF_ALU64``, 'imm' is first sign extended to 64 bits and the result
+interpreted as an unsigned 64-bit value. There are no instructions for
+signed division or modulo.
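+
+A sketch of these semantics in the same pseudocode style, with the sign
+extension of 'imm' written out explicitly:
+
+``BPF_DIV | BPF_K | BPF_ALU`` means::
+
+  dst = (u32) ((u32) imm != 0 ? ((u32) dst / (u32) imm) : 0)
+
+``BPF_DIV | BPF_K | BPF_ALU64`` means::
+
+  dst = (u64) (s64) imm != 0 ? (dst / (u64) (s64) imm) : 0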
+
+Byte swap instructions
+~~~~~~~~~~~~~~~~~~~~~~
+
+The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit
+'code' field of ``BPF_END``.
+
+The byte swap instructions operate on the destination register
+only and do not use a separate source register or immediate value.
+
+The 1-bit source operand field in the opcode is used to select what byte
+order the operation converts from or to:
+
+========= ===== =================================================
+source value description
+========= ===== =================================================
+BPF_TO_LE 0x00 convert between host byte order and little endian
+BPF_TO_BE 0x08 convert between host byte order and big endian
+========= ===== =================================================
+
+The 'imm' field encodes the width of the swap operations. The following widths
+are supported: 16, 32 and 64.
+
+Examples:
+
+``BPF_ALU | BPF_TO_LE | BPF_END`` with imm = 16 means::
+
+ dst = htole16(dst)
+
+``BPF_ALU | BPF_TO_BE | BPF_END`` with imm = 64 means::
+
+ dst = htobe64(dst)
+
+Jump instructions
+-----------------
+
+``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for
+otherwise identical operations.
+The 'code' field encodes the operation as below:
+
+======== ===== === =========================================== =========================================
+code value src description notes
+======== ===== === =========================================== =========================================
+BPF_JA 0x0 0x0 PC += offset BPF_JMP only
+BPF_JEQ 0x1 any PC += offset if dst == src
+BPF_JGT 0x2 any PC += offset if dst > src unsigned
+BPF_JGE 0x3 any PC += offset if dst >= src unsigned
+BPF_JSET 0x4 any PC += offset if dst & src
+BPF_JNE 0x5 any PC += offset if dst != src
+BPF_JSGT 0x6 any PC += offset if dst > src signed
+BPF_JSGE 0x7 any PC += offset if dst >= src signed
+BPF_CALL 0x8 0x0 call helper function by address see `Helper functions`_
+BPF_CALL 0x8 0x1 call PC += offset see `Program-local functions`_
+BPF_CALL 0x8 0x2 call helper function by BTF ID see `Helper functions`_
+BPF_EXIT 0x9 0x0 return BPF_JMP only
+BPF_JLT 0xa any PC += offset if dst < src unsigned
+BPF_JLE 0xb any PC += offset if dst <= src unsigned
+BPF_JSLT 0xc any PC += offset if dst < src signed
+BPF_JSLE 0xd any PC += offset if dst <= src signed
+======== ===== === =========================================== =========================================
+
+The eBPF program needs to store the return value into register R0 before doing a
+``BPF_EXIT``.
+
+Example:
+
+``BPF_JSGE | BPF_X | BPF_JMP32`` (0x7e) means::
+
+ if (s32)dst s>= (s32)src goto +offset
+
+where 's>=' indicates a signed '>=' comparison.
+
+Helper functions
+~~~~~~~~~~~~~~~~
+
+Helper functions are a concept whereby BPF programs can call into a
+set of functions exposed by the underlying platform.
+
+Historically, each helper function was identified by an address
+encoded in the imm field. The available helper functions may differ
+for each program type, but address values are unique across all program types.
+
+Platforms that support the BPF Type Format (BTF) support identifying
+a helper function by a BTF ID encoded in the imm field, where the BTF ID
+identifies the helper name and type.
+
+Program-local functions
+~~~~~~~~~~~~~~~~~~~~~~~
+Program-local functions are functions exposed by the same BPF program as the
+caller, and are referenced by offset from the call instruction, similar to
+``BPF_JA``. A ``BPF_EXIT`` within the program-local function will return to
+the caller.
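+
+As an illustrative sketch, assuming a C compiler targeting BPF and using a
+function attribute to keep the helper from being inlined away, a static
+function is typically emitted as a program-local function::
+
+  static __attribute__((noinline)) int add_one(int x)
+  {
+      return x + 1;      /* its BPF_EXIT returns to the caller */
+  }
+
+  int prog(void *ctx)
+  {
+      return add_one(1); /* BPF_JMP | BPF_CALL with src set to 0x1 */
+  }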
+
+Load and store instructions
+===========================
+
+For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the
+8-bit 'opcode' field is divided as:
+
+============ ====== =================
+3 bits (MSB) 2 bits 3 bits (LSB)
+============ ====== =================
+mode size instruction class
+============ ====== =================
+
+The mode modifier is one of:
+
+ ============= ===== ==================================== ========================================
+ mode modifier value description                          reference
+ ============= ===== ==================================== ========================================
+ BPF_IMM       0x00  64-bit immediate instructions        `64-bit immediate instructions`_
+ BPF_ABS       0x20  legacy BPF packet access (absolute)  `Legacy BPF Packet access instructions`_
+ BPF_IND       0x40  legacy BPF packet access (indirect)  `Legacy BPF Packet access instructions`_
+ BPF_MEM       0x60  regular load and store operations    `Regular load and store operations`_
+ BPF_ATOMIC    0xc0  atomic operations                    `Atomic operations`_
+ ============= ===== ==================================== ========================================
+
+The size modifier is one of:
+
+ ============= ===== =====================
+ size modifier value description
+ ============= ===== =====================
+ BPF_W 0x00 word (4 bytes)
+ BPF_H 0x08 half word (2 bytes)
+ BPF_B 0x10 byte
+ BPF_DW 0x18 double word (8 bytes)
+ ============= ===== =====================
+
+Regular load and store operations
+---------------------------------
+
+The ``BPF_MEM`` mode modifier is used to encode regular load and store
+instructions that transfer data between a register and memory.
+
+``BPF_MEM | <size> | BPF_STX`` means::
+
+ *(size *) (dst + offset) = src
+
+``BPF_MEM | <size> | BPF_ST`` means::
+
+ *(size *) (dst + offset) = imm32
+
+``BPF_MEM | <size> | BPF_LDX`` means::
+
+ dst = *(size *) (src + offset)
+
+Where size is one of: ``BPF_B``, ``BPF_H``, ``BPF_W``, or ``BPF_DW``.
+
+Atomic operations
+-----------------
+
+Atomic operations are operations that operate on memory and cannot be
+interrupted or corrupted by other access to the same memory region
+by other eBPF programs or means outside of this specification.
+
+All atomic operations supported by eBPF are encoded as store operations
+that use the ``BPF_ATOMIC`` mode modifier as follows:
+
+* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
+* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
+* 8-bit and 16-bit wide atomic operations are not supported.
+
+The 'imm' field is used to encode the actual atomic operation.
+Simple atomic operations use a subset of the values defined for
+arithmetic operations in the 'imm' field to encode the atomic operation:
+
+======== ===== ===========
+imm value description
+======== ===== ===========
+BPF_ADD 0x00 atomic add
+BPF_OR 0x40 atomic or
+BPF_AND 0x50 atomic and
+BPF_XOR 0xa0 atomic xor
+======== ===== ===========
+
+
+``BPF_ATOMIC | BPF_W | BPF_STX`` with 'imm' = BPF_ADD means::
+
+ *(u32 *)(dst + offset) += src
+
+``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF_ADD means::
+
+ *(u64 *)(dst + offset) += src
+
+In addition to the simple atomic operations, there is also a modifier and
+two complex atomic operations:
+
+=========== ================ ===========================
+imm value description
+=========== ================ ===========================
+BPF_FETCH 0x01 modifier: return old value
+BPF_XCHG 0xe0 | BPF_FETCH atomic exchange
+BPF_CMPXCHG 0xf0 | BPF_FETCH atomic compare and exchange
+=========== ================ ===========================
+
+The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
+always set for the complex atomic operations. If the ``BPF_FETCH`` flag
+is set, then the operation also overwrites ``src`` with the value that
+was in memory before it was modified.
+
+The ``BPF_XCHG`` operation atomically exchanges ``src`` with the value
+addressed by ``dst + offset``.
+
+The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
+``dst + offset`` with ``R0``. If they match, the value addressed by
+``dst + offset`` is replaced with ``src``. In either case, the
+value that was at ``dst + offset`` before the operation is zero-extended
+and loaded back to ``R0``.
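+
+A sketch of the 64-bit compare-and-exchange in C-like pseudocode, where the
+whole sequence executes atomically::
+
+  old = *(u64 *)(dst + offset)
+  if (old == R0)
+      *(u64 *)(dst + offset) = src
+  R0 = old    /* zero-extended for the 32-bit (BPF_W) form */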
+
+64-bit immediate instructions
+-----------------------------
+
+Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
+encoding defined in `Instruction encoding`_, and use the 'src' field of the
+basic instruction to hold an opcode subtype.
+
+The following table defines a set of ``BPF_IMM | BPF_DW | BPF_LD`` instructions
+with opcode subtypes in the 'src' field, using new terms such as "map"
+defined further below:
+
+========================= ====== === ========================================= =========== ==============
+opcode construction opcode src pseudocode imm type dst type
+========================= ====== === ========================================= =========== ==============
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x0 dst = imm64 integer integer
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x1 dst = map_by_fd(imm) map fd map
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data pointer
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x3 dst = var_addr(imm) variable id data pointer
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x4 dst = code_addr(imm) integer code pointer
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x5 dst = map_by_idx(imm) map index map
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data pointer
+========================= ====== === ========================================= =========== ==============
+
+where
+
+* map_by_fd(imm) means to convert a 32-bit file descriptor into an address of a map (see `Maps`_)
+* map_by_idx(imm) means to convert a 32-bit index into an address of a map
+* map_val(map) gets the address of the first value in a given map
+* var_addr(imm) gets the address of a platform variable (see `Platform Variables`_) with a given id
+* code_addr(imm) gets the address of the instruction at a specified relative offset in number of (64-bit) instructions
+* the 'imm type' can be used by disassemblers for display
+* the 'dst type' can be used for verification and JIT compilation purposes
+
+Maps
+~~~~
+
+Maps are shared memory regions accessible by eBPF programs on some platforms.
+A map can have various semantics as defined in a separate document, and may or
+may not have a single contiguous memory region, but the 'map_val(map)' is
+currently only defined for maps that do have a single contiguous memory region.
+
+Each map can have a file descriptor (fd) if supported by the platform, where
+'map_by_fd(imm)' means to get the map with the specified file descriptor. Each
+BPF program can also be defined to use a set of maps associated with the
+program at load time, and 'map_by_idx(imm)' means to get the map with the given
+index in the set associated with the BPF program containing the instruction.
+
+Platform Variables
+~~~~~~~~~~~~~~~~~~
+
+Platform variables are memory regions, identified by integer ids, exposed by
+the runtime and accessible by BPF programs on some platforms. The
+'var_addr(imm)' operation means to get the address of the memory region
+identified by the given id.
+
+Legacy BPF Packet access instructions
+-------------------------------------
+
+eBPF previously introduced special instructions for access to packet data that were
+carried over from classic BPF. However, these instructions are
+deprecated and should no longer be used.
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
new file mode 100644
index 000000000000..ea2516374d92
--- /dev/null
+++ b/Documentation/bpf/kfuncs.rst
@@ -0,0 +1,609 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _kfuncs-header-label:
+
+=============================
+BPF Kernel Functions (kfuncs)
+=============================
+
+1. Introduction
+===============
+
+BPF Kernel Functions, more commonly known as kfuncs, are functions in the Linux
+kernel which are exposed for use by BPF programs. Unlike normal BPF helpers,
+kfuncs do not have a stable interface and can change from one kernel release to
+another. Hence, BPF programs need to be updated in response to changes in the
+kernel. See :ref:`BPF_kfunc_lifecycle_expectations` for more information.
+
+2. Defining a kfunc
+===================
+
+There are two ways to expose a kernel function to BPF programs: either make an
+existing function in the kernel visible, or add a new wrapper for BPF. In both
+cases, care must be taken that a BPF program can only call such a function in a
+valid context. To enforce this, visibility of a kfunc can be per program type.
+
+If you are not creating a BPF wrapper for an existing kernel function, skip ahead
+to :ref:`BPF_kfunc_nodef`.
+
+2.1 Creating a wrapper kfunc
+----------------------------
+
+When defining a wrapper kfunc, the wrapper function should have extern linkage.
+This prevents the compiler from optimizing away dead code, as this wrapper kfunc
+is not invoked anywhere in the kernel itself. It is not necessary to provide a
+prototype in a header for the wrapper kfunc.
+
+An example is given below::
+
+ /* Disables missing prototype warnings */
+ __diag_push();
+ __diag_ignore_all("-Wmissing-prototypes",
+ "Global kfuncs as their definitions will be in BTF");
+
+ __bpf_kfunc struct task_struct *bpf_find_get_task_by_vpid(pid_t nr)
+ {
+ return find_get_task_by_vpid(nr);
+ }
+
+ __diag_pop();
+
+A wrapper kfunc is often needed when we need to annotate parameters of the
+kfunc. Otherwise one may directly make the kfunc visible to the BPF program by
+registering it with the BPF subsystem. See :ref:`BPF_kfunc_nodef`.
+
+2.2 Annotating kfunc parameters
+-------------------------------
+
+Similar to BPF helpers, there is sometimes a need for additional context
+required by the verifier to make the usage of kernel functions safer and
+more useful.
+Hence, we can annotate a parameter by suffixing the name of the argument of the
+kfunc with a __tag, where tag may be one of the supported annotations.
+
+2.2.1 __sz Annotation
+---------------------
+
+This annotation is used to indicate a memory and size pair in the argument list.
+An example is given below::
+
+ __bpf_kfunc void bpf_memzero(void *mem, int mem__sz)
+ {
+ ...
+ }
+
+Here, the verifier will treat the first argument as a PTR_TO_MEM, and the second
+argument as its size. By default, without __sz annotation, the size of the type
+of the pointer is used. Without __sz annotation, a kfunc cannot accept a void
+pointer.
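+
+From the BPF program side, the hypothetical ``bpf_memzero`` above would then
+be called with a buffer and its size together, and the verifier checks that
+the size does not exceed the buffer::
+
+    char buf[16];
+
+    bpf_memzero(buf, sizeof(buf));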
+
+2.2.2 __k Annotation
+--------------------
+
+This annotation is only understood for scalar arguments. It indicates that the
+verifier must check that the scalar argument is a known constant which does not
+denote a size parameter, and whose value is relevant to the safety of the
+program.
+
+An example is given below::
+
+ __bpf_kfunc void *bpf_obj_new(u32 local_type_id__k, ...)
+ {
+ ...
+ }
+
+Here, bpf_obj_new uses the local_type_id argument to find out the size of that
+type ID in the program's BTF and returns a sized pointer to it. Each type ID has a
+distinct size, hence it is crucial to treat each such call as distinct when
+values don't match during verifier state pruning checks.
+
+Hence, whenever a kfunc accepts a constant scalar argument which is not a size
+parameter, and the value of the constant matters for program safety, the __k
+suffix should be used.
+
+2.2.3 __uninit Annotation
+-------------------------
+
+This annotation is used to indicate that the argument will be treated as
+uninitialized.
+
+An example is given below::
+
+ __bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit)
+ {
+ ...
+ }
+
+Here, the dynptr will be treated as an uninitialized dynptr. Without this
+annotation, the verifier will reject the program if the dynptr passed in is
+not initialized.
+
+.. _BPF_kfunc_nodef:
+
+2.3 Using an existing kernel function
+-------------------------------------
+
+When an existing function in the kernel is fit for consumption by BPF programs,
+it can be directly registered with the BPF subsystem. However, care must still
+be taken to review the context in which it will be invoked by the BPF program
+and whether it is safe to do so.
+
+2.4 Annotating kfuncs
+---------------------
+
+In addition to kfuncs' arguments, the verifier may need more information about the
+type of kfunc(s) being registered with the BPF subsystem. To do so, we define
+flags on a set of kfuncs as follows::
+
+ BTF_SET8_START(bpf_task_set)
+ BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
+ BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
+ BTF_SET8_END(bpf_task_set)
+
+This set encodes the BTF ID of each kfunc listed above, and encodes the flags
+along with it. Of course, it is also allowed to specify no flags.
+
+kfunc definitions should also always be annotated with the ``__bpf_kfunc``
+macro. This prevents issues such as the compiler inlining the kfunc if it's a
+static kernel function, or the function being elided in an LTO build as it's
+not used in the rest of the kernel. Developers should not manually add any
+such annotations to their kfuncs. If an annotation is required to prevent an
+issue like this with your kfunc, it is a bug and should be
+added to the definition of the macro so that other kfuncs are similarly
+protected. An example is given below::
+
+ __bpf_kfunc struct task_struct *bpf_get_task_pid(s32 pid)
+ {
+ ...
+ }
+
+2.4.1 KF_ACQUIRE flag
+---------------------
+
+The KF_ACQUIRE flag is used to indicate that the kfunc returns a pointer to a
+refcounted object. The verifier will then ensure that the pointer to the object
+is eventually released using a release kfunc, or transferred to a map using a
+referenced kptr (by invoking bpf_kptr_xchg). If not, the verifier fails the
+loading of the BPF program until no lingering references remain in all possible
+explored states of the program.
+
+2.4.2 KF_RET_NULL flag
+----------------------
+
+The KF_RET_NULL flag is used to indicate that the pointer returned by the kfunc
+may be NULL. Hence, it forces the user to do a NULL check on the pointer
+returned from the kfunc before making use of it (dereferencing or passing to
+another helper). This flag is often used in pairing with KF_ACQUIRE flag, but
+both are orthogonal to each other.
+
+2.4.3 KF_RELEASE flag
+---------------------
+
+The KF_RELEASE flag is used to indicate that the kfunc releases the pointer
+passed in to it. There can be only one referenced pointer that can be passed
+in. All copies of the pointer being released are invalidated as a result of
+invoking kfunc with this flag. KF_RELEASE kfuncs automatically receive the
+protection afforded by the KF_TRUSTED_ARGS flag described below.
+
+2.4.4 KF_TRUSTED_ARGS flag
+--------------------------
+
+The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
+indicates that all pointer arguments are valid, and that all pointers to
+BTF objects have been passed in their unmodified form (that is, at a zero
+offset, and without having been obtained from walking another pointer, with one
+exception described below).
+
+There are two types of pointers to kernel objects which are considered "valid":
+
+1. Pointers which are passed as tracepoint or struct_ops callback arguments.
+2. Pointers which were returned from a KF_ACQUIRE kfunc.
+
+Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to
+KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset.
+
+The definition of "valid" pointers is subject to change at any time, and has
+absolutely no ABI stability guarantees.
+
+As mentioned above, a nested pointer obtained from walking a trusted pointer is
+no longer trusted, with one exception. If a struct type has a field that is
+guaranteed to be valid as long as its parent pointer is trusted, the
+``BTF_TYPE_SAFE_NESTED`` macro can be used to express that to the verifier as
+follows:
+
+.. code-block:: c
+
+ BTF_TYPE_SAFE_NESTED(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+ };
+
+In other words, you must:
+
+1. Wrap the trusted pointer type in the ``BTF_TYPE_SAFE_NESTED`` macro.
+
+2. Specify the type and name of the trusted nested field. This field must match
+ the field in the original type definition exactly.
+
+2.4.5 KF_SLEEPABLE flag
+-----------------------
+
+The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only
+be called by sleepable BPF programs (BPF_F_SLEEPABLE).
+
+2.4.6 KF_DESTRUCTIVE flag
+--------------------------
+
+The KF_DESTRUCTIVE flag is used to indicate that calling the function is
+destructive to the system. For example, such a call can result in the system
+rebooting or panicking. Due to this, additional restrictions apply to these
+calls. At the moment they only require the CAP_SYS_BOOT capability, but more
+can be added later.
+
+2.4.7 KF_RCU flag
+-----------------
+
+The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
+KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
+that the objects are valid and there is no use-after-free. The pointers are not
+NULL, but the object's refcount could have reached zero. The kfuncs need to
+consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE
+pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely
+also be KF_RET_NULL.
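+
+As an illustrative sketch (``struct widget`` and ``bpf_widget_acquire`` are
+made-up names), an acquire kfunc flagged KF_ACQUIRE | KF_RCU | KF_RET_NULL
+might look like::
+
+    /* The RCU-protected argument may have a refcount that already reached
+     * zero, so only hand out a reference when the count can still be
+     * raised.
+     */
+    __bpf_kfunc struct widget *bpf_widget_acquire(struct widget *w)
+    {
+            if (!refcount_inc_not_zero(&w->refcnt))
+                    return NULL;
+            return w;
+    }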
+
+.. _KF_deprecated_flag:
+
+2.4.8 KF_DEPRECATED flag
+------------------------
+
+The KF_DEPRECATED flag is used for kfuncs which are scheduled to be
+changed or removed in a subsequent kernel release. A kfunc that is
+marked with KF_DEPRECATED should also have any relevant information
+captured in its kernel doc. Such information typically includes the
+kfunc's expected remaining lifespan, a recommendation for new
+functionality that can replace it if any is available, and possibly a
+rationale for why it is being removed.
+
+Note that while on some occasions, a KF_DEPRECATED kfunc may continue to be
+supported and have its KF_DEPRECATED flag removed, it is likely to be far more
+difficult to remove a KF_DEPRECATED flag after it's been added than it is to
+prevent it from being added in the first place. As described in
+:ref:`BPF_kfunc_lifecycle_expectations`, users that rely on specific kfuncs are
+encouraged to make their use-cases known as early as possible, and participate
+in upstream discussions regarding whether to keep, change, deprecate, or remove
+those kfuncs if and when such discussions occur.
+
+2.5 Registering the kfuncs
+--------------------------
+
+Once the kfunc is prepared for use, the final step to making it visible is
+registering it with the BPF subsystem. Registration is done per BPF program
+type. An example is shown below::
+
+ BTF_SET8_START(bpf_task_set)
+ BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL)
+ BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE)
+ BTF_SET8_END(bpf_task_set)
+
+ static const struct btf_kfunc_id_set bpf_task_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_task_set,
+ };
+
+ static int init_subsystem(void)
+ {
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_task_kfunc_set);
+ }
+ late_initcall(init_subsystem);
+
+2.6 Specifying no-cast aliases with ___init
+--------------------------------------------
+
+The verifier will always enforce that the BTF type of a pointer passed to a
+kfunc by a BPF program, matches the type of pointer specified in the kfunc
+definition. The verifier does, however, allow types that are equivalent
+according to the C standard to be passed to the same kfunc arg, even if their
+BTF_IDs differ.
+
+For example, for the following type definition:
+
+.. code-block:: c
+
+ struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+ };
+
+The verifier would allow a ``struct bpf_cpumask *`` to be passed to a kfunc
+taking a ``cpumask_t *`` (``cpumask_t`` being a typedef of ``struct cpumask``).
+For instance, both ``struct cpumask *`` and ``struct bpf_cpumask *`` can be
+passed to bpf_cpumask_test_cpu().
+
+In some cases, this type-aliasing behavior is not desired. ``struct
+nf_conn___init`` is one such example:
+
+.. code-block:: c
+
+ struct nf_conn___init {
+ struct nf_conn ct;
+ };
+
+The C standard would consider these types to be equivalent, but it would not
+always be safe to pass either type to a trusted kfunc. ``struct
+nf_conn___init`` represents an allocated ``struct nf_conn`` object that has
+*not yet been initialized*, so it would therefore be unsafe to pass a ``struct
+nf_conn___init *`` to a kfunc that's expecting a fully initialized ``struct
+nf_conn *`` (e.g. ``bpf_ct_change_timeout()``).
+
+In order to accommodate such requirements, the verifier will enforce strict
+PTR_TO_BTF_ID type matching if two types have the exact same name, with one
+being suffixed with ``___init``.
+
+.. _BPF_kfunc_lifecycle_expectations:
+
+3. kfunc lifecycle expectations
+===============================
+
+kfuncs provide a kernel <-> kernel API, and thus are not bound by any of the
+strict stability restrictions associated with kernel <-> user UAPIs. This means
+they can be thought of as similar to EXPORT_SYMBOL_GPL, and can therefore be
+modified or removed by a maintainer of the subsystem they're defined in when
+it's deemed necessary.
+
+Like any other change to the kernel, maintainers will not change or remove a
+kfunc without having a reasonable justification. Whether or not they'll choose
+to change a kfunc will ultimately depend on a variety of factors, such as how
+widely used the kfunc is, how long the kfunc has been in the kernel, whether an
+alternative kfunc exists, what the norm is in terms of stability for the
+subsystem in question, and of course what the technical cost is of continuing
+to support the kfunc.
+
+There are several implications of this:
+
+a) kfuncs that are widely used or have been in the kernel for a long time will
+ be more difficult to justify being changed or removed by a maintainer. In
+ other words, kfuncs that are known to have a lot of users and provide
+ significant value provide stronger incentives for maintainers to invest the
+ time and complexity in supporting them. It is therefore important for
+ developers that are using kfuncs in their BPF programs to communicate and
+ explain how and why those kfuncs are being used, and to participate in
+ discussions regarding those kfuncs when they occur upstream.
+
+b) Unlike regular kernel symbols marked with EXPORT_SYMBOL_GPL, BPF programs
+ that call kfuncs are generally not part of the kernel tree. This means that
+ refactoring cannot typically change callers in-place when a kfunc changes,
+ as is done for e.g. an upstreamed driver being updated in place when a
+ kernel symbol is changed.
+
+ Unlike with regular kernel symbols, this is expected behavior for BPF
+ symbols, and out-of-tree BPF programs that use kfuncs should be considered
+ relevant to discussions and decisions around modifying and removing those
+ kfuncs. The BPF community will take an active role in participating in
+ upstream discussions when necessary to ensure that the perspectives of such
+ users are taken into account.
+
+c) A kfunc will never have any hard stability guarantees. BPF APIs cannot and
+ will not ever hard-block a change in the kernel purely for stability
+ reasons. That being said, kfuncs are features that are meant to solve
+ problems and provide value to users. The decision of whether to change or
+ remove a kfunc is a multivariate technical decision that is made on a
+ case-by-case basis, and which is informed by data points such as those
+ mentioned above. It is expected that a kfunc being removed or changed with
+ no warning will not be a common occurrence or take place without sound
+ justification, but it is a possibility that must be accepted if one is to
+ use kfuncs.
+
+3.1 kfunc deprecation
+---------------------
+
+As described above, while sometimes a maintainer may find that a kfunc must be
+changed or removed immediately to accommodate some changes in their subsystem,
+usually kfuncs will be able to accommodate a longer and more measured
+deprecation process. For example, if a new kfunc comes along which provides
+superior functionality to an existing kfunc, the existing kfunc may be
+deprecated for some period of time to allow users to migrate their BPF programs
+to use the new one. Or, if a kfunc has no known users, a decision may be made
+to remove the kfunc (without providing an alternative API) after some
+deprecation period so as to provide users with a window to notify the kfunc
+maintainer if it turns out that the kfunc is actually being used.
+
+It's expected that the common case will be that kfuncs will go through a
+deprecation period rather than being changed or removed without warning. As
+described in :ref:`KF_deprecated_flag`, the kfunc framework provides the
+KF_DEPRECATED flag to kfunc developers to signal to users that a kfunc has been
+deprecated. Once a kfunc has been marked with KF_DEPRECATED, the following
+procedure is followed for removal:
+
+1. Any relevant information for deprecated kfuncs is documented in the kfunc's
+ kernel docs. This documentation will typically include the kfunc's expected
+ remaining lifespan, a recommendation for new functionality that can replace
+ the usage of the deprecated function (or an explanation as to why no such
+ replacement exists), etc.
+
+2. The deprecated kfunc is kept in the kernel for some period of time after it
+ was first marked as deprecated. This time period will be chosen on a
+ case-by-case basis, and will typically depend on how widespread the use of
+ the kfunc is, how long it has been in the kernel, and how hard it is to move
+ to alternatives. This deprecation time period is "best effort", and as
+ described :ref:`above<BPF_kfunc_lifecycle_expectations>`, circumstances may
+ sometimes dictate that the kfunc be removed before the full intended
+ deprecation period has elapsed.
+
+3. After the deprecation period the kfunc will be removed. At this point, BPF
+ programs calling the kfunc will be rejected by the verifier.
+
+4. Core kfuncs
+==============
+
+The BPF subsystem provides a number of "core" kfuncs that are potentially
+applicable to a wide variety of different possible use cases and programs.
+Those kfuncs are documented here.
+
+4.1 struct task_struct * kfuncs
+-------------------------------
+
+There are a number of kfuncs that allow ``struct task_struct *`` objects to be
+used as kptrs:
+
+.. kernel-doc:: kernel/bpf/helpers.c
+ :identifiers: bpf_task_acquire bpf_task_release
+
+These kfuncs are useful when you want to acquire or release a reference to a
+``struct task_struct *`` that was passed as e.g. a tracepoint arg, or a
+struct_ops callback arg. For example:
+
+.. code-block:: c
+
+ /**
+ * A trivial example tracepoint program that shows how to
+ * acquire and release a struct task_struct * pointer.
+ */
+ SEC("tp_btf/task_newtask")
+ int BPF_PROG(task_acquire_release_example, struct task_struct *task, u64 clone_flags)
+ {
+ struct task_struct *acquired;
+
+ acquired = bpf_task_acquire(task);
+ if (acquired)
+ /*
+ * In a typical program you'd do something like store
+ * the task in a map, and the map will automatically
+ * release it later. Here, we release it manually.
+ */
+ bpf_task_release(acquired);
+ return 0;
+ }
+
+
+References acquired on ``struct task_struct *`` objects are RCU protected.
+Therefore, when in an RCU read region, you can obtain a pointer to a task
+embedded in a map value without having to acquire a reference:
+
+.. code-block:: c
+
+ #define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+ private(TASK) static struct task_struct *global;
+
+ /**
+ * A trivial example showing how to access a task stored
+ * in a map using RCU.
+ */
+ SEC("tp_btf/task_newtask")
+ int BPF_PROG(task_rcu_read_example, struct task_struct *task, u64 clone_flags)
+ {
+ struct task_struct *local_copy;
+
+ bpf_rcu_read_lock();
+ local_copy = global;
+ if (local_copy)
+ /*
+ * We could also pass local_copy to kfuncs or helper functions here,
+ * as we're guaranteed that local_copy will be valid until we exit
+ * the RCU read region below.
+ */
+ bpf_printk("Global task %s is valid", local_copy->comm);
+ else
+ bpf_printk("No global task found");
+ bpf_rcu_read_unlock();
+
+ /* At this point we can no longer reference local_copy. */
+
+ return 0;
+ }
+
+----
+
+A BPF program can also look up a task from a pid. This can be useful if the
+caller doesn't have a trusted pointer to a ``struct task_struct *`` object that
+it can acquire a reference on with bpf_task_acquire().
+
+.. kernel-doc:: kernel/bpf/helpers.c
+ :identifiers: bpf_task_from_pid
+
+Here is an example of it being used:
+
+.. code-block:: c
+
+ SEC("tp_btf/task_newtask")
+ int BPF_PROG(task_get_pid_example, struct task_struct *task, u64 clone_flags)
+ {
+ struct task_struct *lookup;
+
+ lookup = bpf_task_from_pid(task->pid);
+ if (!lookup)
+ /* A task should always be found, as %task is a tracepoint arg. */
+ return -ENOENT;
+
+ if (lookup->pid != task->pid) {
+ /* bpf_task_from_pid() looks up the task via its
+ * globally-unique pid from the init_pid_ns. Thus,
+ * the pid of the lookup task should always be the
+ * same as the input task.
+ */
+ bpf_task_release(lookup);
+ return -EINVAL;
+ }
+
+ /* bpf_task_from_pid() returns an acquired reference,
+ * so it must be dropped before returning from the
+ * tracepoint handler.
+ */
+ bpf_task_release(lookup);
+ return 0;
+ }
+
+4.2 struct cgroup * kfuncs
+--------------------------
+
+``struct cgroup *`` objects also have acquire and release functions:
+
+.. kernel-doc:: kernel/bpf/helpers.c
+ :identifiers: bpf_cgroup_acquire bpf_cgroup_release
+
+These kfuncs are used in exactly the same manner as bpf_task_acquire() and
+bpf_task_release() respectively, so we won't provide examples for them.
+
+----
+
+Other kfuncs available for interacting with ``struct cgroup *`` objects are
+bpf_cgroup_ancestor() and bpf_cgroup_from_id(), allowing callers to access
+the ancestor of a cgroup and find a cgroup by its ID, respectively. Both
+return a cgroup kptr.
+
+.. kernel-doc:: kernel/bpf/helpers.c
+ :identifiers: bpf_cgroup_ancestor
+
+.. kernel-doc:: kernel/bpf/helpers.c
+ :identifiers: bpf_cgroup_from_id
+
+Eventually, BPF should be updated to allow this to happen with a normal memory
+load in the program itself. This is currently not possible without more work in
+the verifier. bpf_cgroup_ancestor() can be used as follows:
+
+.. code-block:: c
+
+ /**
+ * Simple tracepoint example that illustrates how a cgroup's
+ * ancestor can be accessed using bpf_cgroup_ancestor().
+ */
+ SEC("tp_btf/cgroup_mkdir")
+ int BPF_PROG(cgrp_ancestor_example, struct cgroup *cgrp, const char *path)
+ {
+ struct cgroup *parent;
+
+ /* The parent cgroup resides at the level before the current cgroup's level. */
+ parent = bpf_cgroup_ancestor(cgrp, cgrp->level - 1);
+ if (!parent)
+ return -ENOENT;
+
+ bpf_printk("Parent id is %d", parent->self.id);
+
+ /* Return the parent cgroup that was acquired above. */
+ bpf_cgroup_release(parent);
+ return 0;
+ }
+
+4.3 struct cpumask * kfuncs
+---------------------------
+
+BPF provides a set of kfuncs that can be used to query, allocate, mutate, and
+destroy struct cpumask * objects. Please refer to :ref:`cpumasks-header-label`
+for more details.
diff --git a/Documentation/bpf/libbpf/index.rst b/Documentation/bpf/libbpf/index.rst
index 4f8adfc3ab83..7545a2049692 100644
--- a/Documentation/bpf/libbpf/index.rst
+++ b/Documentation/bpf/libbpf/index.rst
@@ -1,22 +1,33 @@
.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+.. _libbpf:
+
+======
libbpf
======
-For API documentation see the `versioned API documentation site <https://libbpf.readthedocs.io/en/latest/api.html>`_.
+If you are looking to develop BPF applications using the libbpf library, this
+directory contains important documentation that you should read.
+
+To get started, it is recommended to begin with the :doc:`libbpf Overview
+<libbpf_overview>` document, which provides a high-level understanding of the
+libbpf APIs and their usage. This will give you a solid foundation to start
+exploring and utilizing the various features of libbpf to develop your BPF
+applications.
.. toctree::
:maxdepth: 1
+ libbpf_overview
+ API Documentation <https://libbpf.readthedocs.io/en/latest/api.html>
+ program_types
libbpf_naming_convention
libbpf_build
-This is documentation for libbpf, a userspace library for loading and
-interacting with bpf programs.
-All general BPF questions, including kernel functionality, libbpf APIs and
-their application, should be sent to bpf@vger.kernel.org mailing list.
-You can `subscribe <http://vger.kernel.org/vger-lists.html#bpf>`_ to the
-mailing list search its `archive <https://lore.kernel.org/bpf/>`_.
-Please search the archive before asking new questions. It very well might
-be that this was already addressed or answered before.
+All general BPF questions, including kernel functionality, libbpf APIs and their
+application, should be sent to bpf@vger.kernel.org mailing list. You can
+`subscribe <http://vger.kernel.org/vger-lists.html#bpf>`_ to the mailing list
+and search its `archive <https://lore.kernel.org/bpf/>`_. Please search the archive
+before asking new questions. It may be that this was already addressed or
+answered before.
diff --git a/Documentation/bpf/libbpf/libbpf_naming_convention.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst
index 9c68d5014ff1..b5b41b61b3c0 100644
--- a/Documentation/bpf/libbpf/libbpf_naming_convention.rst
+++ b/Documentation/bpf/libbpf/libbpf_naming_convention.rst
@@ -9,8 +9,8 @@ described here. It's recommended to follow these conventions whenever a
new function or type is added to keep libbpf API clean and consistent.
All types and functions provided by libbpf API should have one of the
-following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``,
-``btf_dump_``, ``ring_buffer_``, ``perf_buffer_``.
+following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``btf_dump_``,
+``ring_buffer_``, ``perf_buffer_``.
System call wrappers
--------------------
@@ -59,15 +59,6 @@ Auxiliary functions and types that don't fit well in any of categories
described above should have ``libbpf_`` prefix, e.g.
``libbpf_get_error`` or ``libbpf_prog_type_by_name``.
-AF_XDP functions
--------------------
-
-AF_XDP functions should have an ``xsk_`` prefix, e.g.
-``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists
-of both low-level ring access functions and high-level configuration
-functions. These can be mixed and matched. Note that these functions
-are not reentrant for performance reasons.
-
ABI
---
@@ -92,8 +83,8 @@ This prevents from accidentally exporting a symbol, that is not supposed
to be a part of ABI what, in turn, improves both libbpf developer- and
user-experiences.
-ABI versionning
----------------
+ABI versioning
+--------------
To make future ABI extensions possible libbpf ABI is versioned.
Versioning is implemented by ``libbpf.map`` version script that is
@@ -150,6 +141,46 @@ mirror of the mainline's version of libbpf for a stand-alone build.
However, all changes to libbpf's code base must be upstreamed through
the mainline kernel tree.
+
+API documentation convention
+============================
+
+The libbpf API is documented via comments above definitions in
+header files. These comments can be rendered by doxygen and sphinx
+for well organized html output. This section describes the
+convention in which these comments should be formatted.
+
+Here is an example from btf.h:
+
+.. code-block:: c
+
+ /**
+ * @brief **btf__new()** creates a new instance of a BTF object from the raw
+ * bytes of an ELF's BTF section
+ * @param data raw bytes
+ * @param size number of bytes passed in `data`
+ * @return new BTF object instance which has to be eventually freed with
+ * **btf__free()**
+ *
+ * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract
+ * error code from such a pointer `libbpf_get_error()` should be used. If
+ * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is
+ * returned on error instead. In both cases thread-local `errno` variable is
+ * always set to error code as well.
+ */
+
+The comment must start with a block comment of the form '/\*\*'.
+
+The documentation always starts with a @brief directive. This line is a short
+description of this API. It starts with the name of the API, denoted in bold
+like so: **api_name**. Please include an opening and closing parenthesis if
+this is a function. Follow with the short description of the API. A longer form
+description can be added below the last directive, at the bottom of the comment.
+
+Parameters are denoted with the @param directive; there should be one for each
+parameter. If this is a function with a non-void return, use the @return directive
+to document it.
+
License
-------------------
diff --git a/Documentation/bpf/libbpf/libbpf_overview.rst b/Documentation/bpf/libbpf/libbpf_overview.rst
new file mode 100644
index 000000000000..f36a2d4ffea2
--- /dev/null
+++ b/Documentation/bpf/libbpf/libbpf_overview.rst
@@ -0,0 +1,228 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+libbpf Overview
+===============
+
+libbpf is a C-based library containing a BPF loader that takes compiled BPF
+object files and prepares and loads them into the Linux kernel. libbpf takes the
+heavy lifting of loading, verifying, and attaching BPF programs to various
+kernel hooks, allowing BPF application developers to focus only on BPF program
+correctness and performance.
+
+The following are the high-level features supported by libbpf:
+
+* Provides high-level and low-level APIs for user space programs to interact
+ with BPF programs. The low-level APIs wrap all the bpf system call
+ functionality, which is useful when users need more fine-grained control
+ over the interactions between user space and BPF programs.
+* Provides overall support for the BPF object skeleton generated by bpftool.
+ The skeleton file simplifies the process for the user space programs to access
+ global variables and work with BPF programs.
+* Provides BPF-side APIs, including BPF helper definitions, BPF maps support,
+ and tracing helpers, allowing developers to simplify BPF code writing.
+* Supports BPF CO-RE mechanism, enabling BPF developers to write portable
+ BPF programs that can be compiled once and run across different kernel
+ versions.
+
+This document will delve into the above concepts in detail, providing a deeper
+understanding of the capabilities and advantages of libbpf and how it can help
+you develop BPF applications efficiently.
+
+BPF App Lifecycle and libbpf APIs
+==================================
+
+A BPF application consists of one or more BPF programs (either cooperating or
+completely independent), BPF maps, and global variables. The global
+variables are shared between all BPF programs, which allows them to cooperate on
+a common set of data. libbpf provides APIs that user space programs can use to
+manipulate the BPF programs by triggering different phases of a BPF application
+lifecycle.
+
+The following section provides a brief overview of each phase in the BPF
+lifecycle, followed by a sketch of how user space drives these phases:
+
+* **Open phase**: In this phase, libbpf parses the BPF
+ object file and discovers BPF maps, BPF programs, and global variables. After
+ a BPF app is opened, user space apps can make additional adjustments
+ (setting BPF program types, if necessary; pre-setting initial values for
+ global variables, etc.) before all the entities are created and loaded.
+
+* **Load phase**: In the load phase, libbpf creates BPF
+ maps, resolves various relocations, and verifies and loads BPF programs into
+ the kernel. At this point, libbpf validates all the parts of a BPF application
+ and loads the BPF program into the kernel, but no BPF program has yet been
+ executed. After the load phase, it’s possible to set up the initial BPF map
+ state without racing with the BPF program code execution.
+
+* **Attachment phase**: In this phase, libbpf
+ attaches BPF programs to various BPF hook points (e.g., tracepoints, kprobes,
+ cgroup hooks, network packet processing pipeline, etc.). During this
+ phase, BPF programs perform useful work such as processing
+ packets, or updating BPF maps and global variables that can be read from user
+ space.
+
+* **Tear down phase**: In the tear down phase,
+ libbpf detaches BPF programs and unloads them from the kernel. BPF maps are
+ destroyed, and all the resources used by the BPF app are freed.
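+
+A minimal sketch of driving these phases with the generic libbpf APIs (error
+handling abbreviated; ``example.bpf.o`` and the program name ``handle_tp`` are
+placeholders):
+
+.. code-block:: C
+
+    #include <bpf/libbpf.h>
+
+    int main(void)
+    {
+        struct bpf_object *obj;
+        struct bpf_program *prog;
+        struct bpf_link *link = NULL;
+
+        obj = bpf_object__open_file("example.bpf.o", NULL); /* open phase */
+        if (!obj)
+            return 1;
+
+        if (bpf_object__load(obj))                          /* load phase */
+            goto out;
+
+        prog = bpf_object__find_program_by_name(obj, "handle_tp");
+        if (prog)
+            link = bpf_program__attach(prog);               /* attachment phase */
+        if (!link)
+            goto out;
+
+        /* The BPF program is now live: read maps, poll data, etc. */
+
+        bpf_link__destroy(link);                            /* tear down phase */
+    out:
+        bpf_object__close(obj);
+        return 0;
+    }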
+
+BPF Object Skeleton File
+========================
+
+BPF skeleton is an alternative interface to libbpf APIs for working with BPF
+objects. Skeleton code abstracts away generic libbpf APIs to significantly
+simplify code for manipulating BPF programs from user space. Skeleton code
+includes a bytecode representation of the BPF object file, simplifying the
+process of distributing your BPF code. With BPF bytecode embedded, there are no
+extra files to deploy along with your application binary.
+
+You can generate the skeleton header file (``.skel.h``) for a specific object
+file by passing the BPF object to the bpftool. The generated BPF skeleton
+provides the following custom functions that correspond to the BPF lifecycle,
+each of them prefixed with the specific object name:
+
+* ``<name>__open()`` – creates and opens BPF application (``<name>`` stands for
+ the specific bpf object name)
+* ``<name>__load()`` – instantiates, loads, and verifies BPF application parts
+* ``<name>__attach()`` – attaches all auto-attachable BPF programs (it’s
+ optional, you can have more control by using libbpf APIs directly)
+* ``<name>__destroy()`` – detaches all BPF programs and
+ frees up all used resources
+
+Using the skeleton code is the recommended way to work with BPF programs. Keep
+in mind that the BPF skeleton provides access to the underlying BPF object, so whatever
+was possible to do with generic libbpf APIs is still possible even when the BPF
+skeleton is used. It's an additive convenience feature, with no syscalls, and no
+cumbersome code.
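+
+For comparison, a sketch of the same lifecycle using a skeleton (assuming the
+header was generated from an ``example`` BPF object, so all functions carry the
+``example__`` prefix; error handling abbreviated):
+
+.. code-block:: C
+
+    #include "example.skel.h"
+
+    int main(void)
+    {
+        struct example *skel;
+
+        skel = example__open();              /* open phase */
+        if (!skel)
+            return 1;
+
+        /* Adjustments, e.g. pre-setting global variables, go here. */
+
+        if (example__load(skel))             /* load phase */
+            goto out;
+
+        if (example__attach(skel))           /* attachment phase */
+            goto out;
+
+        /* The BPF program is now live. */
+    out:
+        example__destroy(skel);              /* tear down phase */
+        return 0;
+    }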
+
+Other Advantages of Using Skeleton File
+---------------------------------------
+
+* BPF skeleton provides an interface for user space programs to work with BPF
+ global variables. The skeleton code memory maps global variables as a struct
+ into user space. The struct interface allows user space programs to initialize
+ BPF programs before the BPF load phase and fetch and update data from user
+ space afterward.
+
+* The ``skel.h`` file reflects the object file structure by listing out the
+ available maps, programs, etc. BPF skeleton provides direct access to all the
+ BPF maps and BPF programs as struct fields. This eliminates the need for
+ string-based lookups with ``bpf_object__find_map_by_name()`` and
+ ``bpf_object__find_program_by_name()`` APIs, reducing errors due to BPF source
+ code and user-space code getting out of sync.
+
+* The embedded bytecode representation of the object file ensures that the
+ skeleton and the BPF object file are always in sync.
+
+BPF Helpers
+===========
+
+libbpf provides BPF-side APIs that BPF programs can use to interact with the
+system. The BPF helpers definition allows developers to use them in BPF code as
+any other plain C function. For example, there are helper functions to print
+debugging messages, get the time since the system was booted, interact with BPF
+maps, manipulate network packets, etc.
+
+For a complete description of what the helpers do, the arguments they take, and
+the return value, see the `bpf-helpers
+<https://man7.org/linux/man-pages/man7/bpf-helpers.7.html>`_ man page.
+
+BPF CO-RE (Compile Once – Run Everywhere)
+=========================================
+
+BPF programs work in the kernel space and have access to kernel memory and data
+structures. One limitation that BPF applications come across is the lack of
+portability across different kernel versions and configurations. `BCC
+<https://github.com/iovisor/bcc/>`_ is one of the solutions for BPF
+portability. However, it comes with runtime overhead and a large binary size
+from embedding the compiler with the application.
+
+libbpf steps up the BPF program portability by supporting the BPF CO-RE concept.
+BPF CO-RE brings together BTF type information, libbpf, and the compiler to
+produce a single executable binary that you can run on multiple kernel versions
+and configurations.
+
+To make BPF programs portable, libbpf relies on the BTF type information of the
+running kernel. The kernel also exposes this self-describing authoritative BTF
+information through ``sysfs`` at ``/sys/kernel/btf/vmlinux``.
+
+You can dump the BTF type information of the running kernel as a C header with
+the following command:
+
+::
+
+ $ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
+
+The command generates a ``vmlinux.h`` header file with all kernel types
+(:doc:`BTF types <../btf>`) that the running kernel uses. Including
+``vmlinux.h`` in your BPF program eliminates dependency on system-wide kernel
+headers.
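+
+A portable BPF program can then start from just these two includes, with no
+system-wide kernel headers required (a sketch):
+
+.. code-block:: C
+
+    #include "vmlinux.h"          /* all kernel types, generated above */
+    #include <bpf/bpf_helpers.h>  /* SEC(), BPF helper declarations */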
+
+libbpf enables portability of BPF programs by looking at the BPF program’s
+recorded BTF type and relocation information and matching them to the BTF
+information (vmlinux) provided by the running kernel. libbpf then resolves and
+matches all the types and fields, and updates necessary offsets and other
+relocatable data to ensure that the BPF program’s logic functions correctly for
+a specific kernel on the host. The BPF CO-RE concept thus eliminates the
+overhead associated with BPF development and allows developers to write
+portable BPF applications without modifications and runtime source code
+compilation on the target machine.
+
+The following code snippet shows how to read the parent field of a kernel
+``task_struct`` using BPF CO-RE and libbpf. The basic helper to read a field in a
+CO-RE relocatable manner is ``bpf_core_read(dst, sz, src)``, which will read
+``sz`` bytes from the field referenced by ``src`` into the memory pointed to by
+``dst``.
+
+.. code-block:: C
+ :emphasize-lines: 6
+
+ //...
+ struct task_struct *task = (void *)bpf_get_current_task();
+ struct task_struct *parent_task;
+ int err;
+
+ err = bpf_core_read(&parent_task, sizeof(void *), &task->parent);
+ if (err) {
+ /* handle error */
+ }
+
+ /* parent_task contains the value of task->parent pointer */
+
+In the code snippet, we first get a pointer to the current ``task_struct`` using
+``bpf_get_current_task()``. We then use ``bpf_core_read()`` to read the parent
+field of the task struct into the ``parent_task`` variable. ``bpf_core_read()``
+is just like the ``bpf_probe_read_kernel()`` BPF helper, except that it records
+information about the field that should be relocated on the target kernel,
+i.e., if the ``parent`` field gets shifted to a different offset within
+``struct task_struct`` due to some new field added in front of it, libbpf will
+automatically adjust the actual offset to the proper value.
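+
+libbpf's ``bpf/bpf_core_read.h`` header also provides the ``BPF_CORE_READ()``
+macro, which chains several such relocatable reads into a single expression.
+A sketch of reading the parent task's PID:
+
+.. code-block:: C
+
+    #include <vmlinux.h>
+    #include <bpf/bpf_core_read.h>
+
+    /* ... */
+    /* equivalent to task->parent->pid, with each field access
+     * recorded for CO-RE relocation */
+    pid_t ppid = BPF_CORE_READ(task, parent, pid);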
+
+Getting Started with libbpf
+===========================
+
+Check out the `libbpf-bootstrap <https://github.com/libbpf/libbpf-bootstrap>`_
+repository with simple examples of using libbpf to build various BPF
+applications.
+
+See also `libbpf API documentation
+<https://libbpf.readthedocs.io/en/latest/api.html>`_.
+
+libbpf and Rust
+===============
+
+If you are building BPF applications in Rust, it is recommended to use the
+`Libbpf-rs <https://github.com/libbpf/libbpf-rs>`_ library instead of using
+bindgen bindings directly to libbpf. Libbpf-rs wraps libbpf functionality in
+Rust-idiomatic interfaces and provides the libbpf-cargo plugin to handle BPF
+code compilation and skeleton generation. Using Libbpf-rs will make building
+the user space part of the BPF application easier. Note that the BPF programs
+themselves must still be written in plain C.
+
+Additional Documentation
+========================
+
+* `Program types and ELF Sections <https://libbpf.readthedocs.io/en/latest/program_types.html>`_
+* `API naming convention <https://libbpf.readthedocs.io/en/latest/libbpf_naming_convention.html>`_
+* `Building libbpf <https://libbpf.readthedocs.io/en/latest/libbpf_build.html>`_
+* `API documentation Convention <https://libbpf.readthedocs.io/en/latest/libbpf_naming_convention.html#api-documentation-convention>`_
diff --git a/Documentation/bpf/libbpf/program_types.rst b/Documentation/bpf/libbpf/program_types.rst
new file mode 100644
index 000000000000..ad4d4d5eecb0
--- /dev/null
+++ b/Documentation/bpf/libbpf/program_types.rst
@@ -0,0 +1,203 @@
+.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+.. _program_types_and_elf:
+
+Program Types and ELF Sections
+==============================
+
+The table below lists the program types, their attach types where relevant,
+and the ELF section names supported by libbpf for them. The ELF section names
+follow these rules:
+
+- ``type`` is an exact match, e.g. ``SEC("socket")``
+- ``type+`` means it can be either exact ``SEC("type")`` or well-formed ``SEC("type/extras")``
+ with a '``/``' separator between ``type`` and ``extras``.
+
+When ``extras`` are specified, they provide details of how to auto-attach the BPF program. The
+format of ``extras`` depends on the program type, e.g. ``SEC("tracepoint/<category>/<name>")``
+for tracepoints or ``SEC("usdt/<path>:<provider>:<name>")`` for USDT probes. The extras are
+described in more detail in the footnotes.
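+
+As a sketch, two such section names in BPF C source (the function bodies and
+the tracepoint chosen here are illustrative):
+
+.. code-block:: c
+
+    #include <linux/bpf.h>
+    #include <bpf/bpf_helpers.h>
+
+    /* exact match: a socket filter program */
+    SEC("socket")
+    int socket_prog(struct __sk_buff *skb)
+    {
+        return skb->len;
+    }
+
+    /* type plus extras: auto-attach to a specific tracepoint */
+    SEC("tracepoint/syscalls/sys_enter_openat")
+    int tp_prog(void *ctx)
+    {
+        return 0;
+    }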
+
+
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| Program Type | Attach Type | ELF Section Name | Sleepable |
++===========================================+========================================+==================================+===========+
+| ``BPF_PROG_TYPE_CGROUP_DEVICE`` | ``BPF_CGROUP_DEVICE`` | ``cgroup/dev`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_CGROUP_SKB`` | | ``cgroup/skb`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET_EGRESS`` | ``cgroup_skb/egress`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET_INGRESS`` | ``cgroup_skb/ingress`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_CGROUP_SOCKOPT`` | ``BPF_CGROUP_GETSOCKOPT`` | ``cgroup/getsockopt`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_SETSOCKOPT`` | ``cgroup/setsockopt`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_BIND`` | ``cgroup/bind4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET4_CONNECT`` | ``cgroup/connect4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET4_GETPEERNAME`` | ``cgroup/getpeername4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET4_GETSOCKNAME`` | ``cgroup/getsockname4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET6_BIND`` | ``cgroup/bind6`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET6_CONNECT`` | ``cgroup/connect6`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET6_GETPEERNAME`` | ``cgroup/getpeername6`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET6_GETSOCKNAME`` | ``cgroup/getsockname6`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_UDP4_RECVMSG`` | ``cgroup/recvmsg4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_UDP4_SENDMSG`` | ``cgroup/sendmsg4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_UDP6_RECVMSG`` | ``cgroup/recvmsg6`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_UDP6_SENDMSG`` | ``cgroup/sendmsg6`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET4_POST_BIND`` | ``cgroup/post_bind4`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET6_POST_BIND`` | ``cgroup/post_bind6`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET_SOCK_CREATE`` | ``cgroup/sock_create`` | |
++ + +----------------------------------+-----------+
+| | | ``cgroup/sock`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_CGROUP_INET_SOCK_RELEASE`` | ``cgroup/sock_release`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_CGROUP_SYSCTL`` | ``BPF_CGROUP_SYSCTL`` | ``cgroup/sysctl`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_EXT`` | | ``freplace+`` [#fentry]_ | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_FLOW_DISSECTOR`` | ``BPF_FLOW_DISSECTOR`` | ``flow_dissector`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_KPROBE`` | | ``kprobe+`` [#kprobe]_ | |
++ + +----------------------------------+-----------+
+| | | ``kretprobe+`` [#kprobe]_ | |
++ + +----------------------------------+-----------+
+| | | ``ksyscall+`` [#ksyscall]_ | |
++ + +----------------------------------+-----------+
+| | | ``kretsyscall+`` [#ksyscall]_ | |
++ + +----------------------------------+-----------+
+| | | ``uprobe+`` [#uprobe]_ | |
++ + +----------------------------------+-----------+
+| | | ``uprobe.s+`` [#uprobe]_ | Yes |
++ + +----------------------------------+-----------+
+| | | ``uretprobe+`` [#uprobe]_ | |
++ + +----------------------------------+-----------+
+| | | ``uretprobe.s+`` [#uprobe]_ | Yes |
++ + +----------------------------------+-----------+
+| | | ``usdt+`` [#usdt]_ | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | |
++ + +----------------------------------+-----------+
+| | | ``kretprobe.multi+`` [#kpmulti]_ | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_CGROUP`` | ``lsm_cgroup+`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_LSM_MAC`` | ``lsm+`` [#lsm]_ | |
++ + +----------------------------------+-----------+
+| | | ``lsm.s+`` [#lsm]_ | Yes |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_LWT_IN`` | | ``lwt_in`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_LWT_OUT`` | | ``lwt_out`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_LWT_SEG6LOCAL`` | | ``lwt_seg6local`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_LWT_XMIT`` | | ``lwt_xmit`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_PERF_EVENT`` | | ``perf_event`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tp.w+`` [#rawtp]_ | |
++ + +----------------------------------+-----------+
+| | | ``raw_tracepoint.w+`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_RAW_TRACEPOINT`` | | ``raw_tp+`` [#rawtp]_ | |
++ + +----------------------------------+-----------+
+| | | ``raw_tracepoint+`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` | |
++ + +----------------------------------+-----------+
+| | | ``tc`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SK_LOOKUP`` | ``BPF_SK_LOOKUP`` | ``sk_lookup`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SK_MSG`` | ``BPF_SK_MSG_VERDICT`` | ``sk_msg`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SK_REUSEPORT`` | ``BPF_SK_REUSEPORT_SELECT_OR_MIGRATE`` | ``sk_reuseport/migrate`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_SK_REUSEPORT_SELECT`` | ``sk_reuseport`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SK_SKB`` | | ``sk_skb`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_SK_SKB_STREAM_PARSER`` | ``sk_skb/stream_parser`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_SK_SKB_STREAM_VERDICT`` | ``sk_skb/stream_verdict`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SOCKET_FILTER`` | | ``socket`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SOCK_OPS`` | ``BPF_CGROUP_SOCK_OPS`` | ``sockops`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_SYSCALL`` | | ``syscall`` | Yes |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_TRACEPOINT`` | | ``tp+`` [#tp]_ | |
++ + +----------------------------------+-----------+
+| | | ``tracepoint+`` [#tp]_ | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_TRACING`` | ``BPF_MODIFY_RETURN`` | ``fmod_ret+`` [#fentry]_ | |
++ + +----------------------------------+-----------+
+| | | ``fmod_ret.s+`` [#fentry]_ | Yes |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_TRACE_FENTRY`` | ``fentry+`` [#fentry]_ | |
++ + +----------------------------------+-----------+
+| | | ``fentry.s+`` [#fentry]_ | Yes |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_TRACE_FEXIT`` | ``fexit+`` [#fentry]_ | |
++ + +----------------------------------+-----------+
+| | | ``fexit.s+`` [#fentry]_ | Yes |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_TRACE_ITER`` | ``iter+`` [#iter]_ | |
++ + +----------------------------------+-----------+
+| | | ``iter.s+`` [#iter]_ | Yes |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_TRACE_RAW_TP`` | ``tp_btf+`` [#fentry]_ | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_CPUMAP`` | ``xdp.frags/cpumap`` | |
++ + +----------------------------------+-----------+
+| | | ``xdp/cpumap`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_XDP_DEVMAP`` | ``xdp.frags/devmap`` | |
++ + +----------------------------------+-----------+
+| | | ``xdp/devmap`` | |
++ +----------------------------------------+----------------------------------+-----------+
+| | ``BPF_XDP`` | ``xdp.frags`` | |
++ + +----------------------------------+-----------+
+| | | ``xdp`` | |
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+
+
+
+.. rubric:: Footnotes
+
+.. [#fentry] The ``fentry`` attach format is ``fentry[.s]/<function>``.
+.. [#kprobe] The ``kprobe`` attach format is ``kprobe/<function>[+<offset>]``. Valid
+ characters for ``function`` are ``a-zA-Z0-9_.`` and ``offset`` must be a valid
+ non-negative integer.
+.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/<syscall>``.
+.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/<path>:<function>[+<offset>]``.
+.. [#usdt] The ``usdt`` attach format is ``usdt/<path>:<provider>:<name>``.
+.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/<pattern>`` where ``pattern``
+ supports ``*`` and ``?`` wildcards. Valid characters for pattern are
+ ``a-zA-Z0-9_.*?``.
+.. [#lsm] The ``lsm`` attach format is ``lsm[.s]/<hook>``.
+.. [#rawtp] The ``raw_tp`` attach format is ``raw_tracepoint[.w]/<tracepoint>``.
+.. [#tp] The ``tracepoint`` attach format is ``tracepoint/<category>/<name>``.
+.. [#iter] The ``iter`` attach format is ``iter[.s]/<struct-name>``.
diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst
new file mode 100644
index 000000000000..508d009d3bed
--- /dev/null
+++ b/Documentation/bpf/linux-notes.rst
@@ -0,0 +1,83 @@
+.. contents::
+.. sectnum::
+
+==========================
+Linux implementation notes
+==========================
+
+This document provides more details specific to the Linux kernel implementation of the eBPF instruction set.
+
+Byte swap instructions
+======================
+
+``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and ``BPF_TO_BE`` respectively.
+
+Jump instructions
+=================
+
+``BPF_CALL | BPF_X | BPF_JMP`` (0x8d), where the helper function
+integer would be read from a specified register, is not currently supported
+by the verifier. Any programs with this instruction will fail to load
+until such support is added.
+
+Maps
+====
+
+Linux only supports the ``map_val(map)`` operation on array maps with a single
+element.
+
+Linux uses an fd_array to store maps associated with a BPF program. Thus,
+``map_by_idx(imm)`` uses the fd at that index in the array.
+
+Variables
+=========
+
+The following 64-bit immediate instruction specifies that a variable address,
+which corresponds to some integer stored in the ``imm`` field, should be loaded:
+
+========================= ====== === ========================================= =========== ==============
+opcode construction opcode src pseudocode imm type dst type
+========================= ====== === ========================================= =========== ==============
+BPF_IMM | BPF_DW | BPF_LD 0x18 0x3 dst = var_addr(imm) variable id data pointer
+========================= ====== === ========================================= =========== ==============
+
+On Linux, this integer is a BTF ID.
+
+Legacy BPF Packet access instructions
+=====================================
+
+As mentioned in the `ISA standard documentation <instruction-set.rst#legacy-bpf-packet-access-instructions>`_,
+Linux has special eBPF instructions for access to packet data that have been
+carried over from classic BPF to retain the performance of legacy socket
+filters running in the eBPF interpreter.
+
+The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
+``BPF_IND | <size> | BPF_LD``.
+
+These instructions are used to access packet data and can only be used when
+the program context is a pointer to a networking packet. ``BPF_ABS``
+accesses packet data at an absolute offset specified by the immediate data
+and ``BPF_IND`` accesses packet data at an offset that includes the value of
+a register in addition to the immediate data.
+
+These instructions have seven implicit operands:
+
+* Register R6 is an implicit input that must contain a pointer to a
+ struct sk_buff.
+* Register R0 is an implicit output which contains the data fetched from
+ the packet.
+* Registers R1-R5 are scratch registers that are clobbered by the
+ instruction.
+
+These instructions have an implicit program exit condition as well. If an
+eBPF program attempts to access data beyond the packet boundary, the
+program execution will be aborted.
+
+``BPF_ABS | BPF_W | BPF_LD`` (0x20) means::
+
+ R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + imm))
+
+where ``ntohl()`` converts a 32-bit value from network byte order to host byte order.
+
+``BPF_IND | BPF_W | BPF_LD`` (0x40) means::
+
+ R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm))
diff --git a/Documentation/bpf/map_array.rst b/Documentation/bpf/map_array.rst
new file mode 100644
index 000000000000..f2f51a53e8ae
--- /dev/null
+++ b/Documentation/bpf/map_array.rst
@@ -0,0 +1,262 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+================================================
+BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_PERCPU_ARRAY
+================================================
+
+.. note::
+ - ``BPF_MAP_TYPE_ARRAY`` was introduced in kernel version 3.19
+ - ``BPF_MAP_TYPE_PERCPU_ARRAY`` was introduced in version 4.6
+
+``BPF_MAP_TYPE_ARRAY`` and ``BPF_MAP_TYPE_PERCPU_ARRAY`` provide generic array
+storage. The key type is an unsigned 32-bit integer (4 bytes) and the map is
+of constant size. The size of the array is defined in ``max_entries`` at
+creation time. All array elements are pre-allocated and zero initialized when
+created. ``BPF_MAP_TYPE_PERCPU_ARRAY`` uses a different memory region for each
+CPU whereas ``BPF_MAP_TYPE_ARRAY`` uses the same memory region. The value
+stored can be of any size; however, all array elements are aligned to 8
+bytes.
+
+Since kernel 5.5, memory mapping may be enabled for ``BPF_MAP_TYPE_ARRAY`` by
+setting the flag ``BPF_F_MMAPABLE``. The map definition is page-aligned and
+starts on the first page. Sufficient page-sized and page-aligned blocks of
+memory are allocated to store all array values, starting on the second page,
+which in some cases will result in over-allocation of memory. The benefit of
+memory mapping is increased performance and ease of use, since userspace
+programs are not required to use helper functions to access and mutate data.
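+
+A sketch of user space mapping such an array directly (assuming ``fd`` refers
+to a ``BPF_MAP_TYPE_ARRAY`` of 256 ``long`` values created with
+``BPF_F_MMAPABLE``):
+
+.. code-block:: c
+
+    #include <sys/mman.h>
+
+    long *values = mmap(NULL, 256 * sizeof(long), PROT_READ | PROT_WRITE,
+                        MAP_SHARED, fd, 0);
+    if (values != MAP_FAILED)
+        values[42] += 1;    /* direct access, no syscall per element */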
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+Array elements can be retrieved using the ``bpf_map_lookup_elem()`` helper.
+This helper returns a pointer into the array element, so to avoid data races
+with userspace reading the value, the user must use primitives like
+``__sync_fetch_and_add()`` when updating the value in-place.
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+
+Array elements can be updated using the ``bpf_map_update_elem()`` helper.
+
+``bpf_map_update_elem()`` returns 0 on success, or negative error in case of
+failure.
+
+Since the array is of constant size, ``bpf_map_delete_elem()`` is not supported.
+To clear an array element, you may use ``bpf_map_update_elem()`` to insert a
+zero value to that index.
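+
+For example (a sketch, assuming the ``my_map`` array declared in the Examples
+section below):
+
+.. code-block:: c
+
+    __u32 index = 42;
+    long zero = 0;
+
+    /* "delete" the element by overwriting it with zero */
+    bpf_map_update_elem(&my_map, &index, &zero, BPF_ANY);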
+
+Per CPU Array
+-------------
+
+Values stored in ``BPF_MAP_TYPE_ARRAY`` can be accessed by multiple programs
+across different CPUs. To restrict storage to a single CPU, you may use a
+``BPF_MAP_TYPE_PERCPU_ARRAY``.
+
+When using a ``BPF_MAP_TYPE_PERCPU_ARRAY`` the ``bpf_map_update_elem()`` and
+``bpf_map_lookup_elem()`` helpers automatically access the slot for the current
+CPU.
+
+bpf_map_lookup_percpu_elem()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu)
+
+The ``bpf_map_lookup_percpu_elem()`` helper can be used to look up the array
+value for a specific CPU. It returns the value on success, or ``NULL`` if no
+entry was found or ``cpu`` is invalid.
+
+Concurrency
+-----------
+
+Since kernel version 5.1, the BPF infrastructure provides ``struct bpf_spin_lock``
+to synchronize access.
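+
+A sketch of guarding in-place updates with a spin lock, assuming a
+hypothetical array map ``locked_map`` whose value embeds the lock:
+
+.. code-block:: c
+
+    struct locked_value {
+        struct bpf_spin_lock lock;
+        long data;
+    };
+
+    /* in a BPF program */
+    struct locked_value *value = bpf_map_lookup_elem(&locked_map, &index);
+
+    if (value) {
+        bpf_spin_lock(&value->lock);
+        value->data += 1;
+        bpf_spin_unlock(&value->lock);
+    }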
+
+Userspace
+---------
+
+Access from userspace uses libbpf APIs with the same names as above, with
+the map identified by its ``fd``.
+
+Examples
+========
+
+Please see the ``tools/testing/selftests/bpf`` directory for functional
+examples. The code samples below demonstrate API usage.
+
+Kernel BPF
+----------
+
+This snippet shows how to declare an array in a BPF program.
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, 256);
+ } my_map SEC(".maps");
+
+
+This example BPF program shows how to access an array element.
+
+.. code-block:: c
+
+ int bpf_prog(struct __sk_buff *skb)
+ {
+ struct iphdr ip;
+ int index;
+ long *value;
+
+ if (bpf_skb_load_bytes(skb, ETH_HLEN, &ip, sizeof(ip)) < 0)
+ return 0;
+
+ index = ip.protocol;
+ value = bpf_map_lookup_elem(&my_map, &index);
+ if (value)
+ __sync_fetch_and_add(value, skb->len);
+
+ return 0;
+ }
+
+Userspace
+---------
+
+BPF_MAP_TYPE_ARRAY
+~~~~~~~~~~~~~~~~~~
+
+This snippet shows how to create an array, using ``bpf_map_create_opts`` to
+set flags.
+
+.. code-block:: c
+
+ #include <bpf/libbpf.h>
+ #include <bpf/bpf.h>
+
+ int create_array()
+ {
+ int fd;
+ LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
+
+ fd = bpf_map_create(BPF_MAP_TYPE_ARRAY,
+ "example_array", /* name */
+ sizeof(__u32), /* key size */
+ sizeof(long), /* value size */
+ 256, /* max entries */
+ &opts); /* create opts */
+ return fd;
+ }
+
+This snippet shows how to initialize the elements of an array.
+
+.. code-block:: c
+
+ int initialize_array(int fd)
+ {
+ __u32 i;
+ long value;
+ int ret;
+
+ for (i = 0; i < 256; i++) {
+ value = i;
+ ret = bpf_map_update_elem(fd, &i, &value, BPF_ANY);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+ }
+
+This snippet shows how to retrieve an element value from an array.
+
+.. code-block:: c
+
+ int lookup(int fd)
+ {
+ __u32 index = 42;
+ long value;
+ int ret;
+
+ ret = bpf_map_lookup_elem(fd, &index, &value);
+ if (ret < 0)
+ return ret;
+
+ /* use value here */
+ assert(value == 42);
+
+ return ret;
+ }
+
+BPF_MAP_TYPE_PERCPU_ARRAY
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This snippet shows how to initialize the elements of a per CPU array.
+
+.. code-block:: c
+
+ int initialize_array(int fd)
+ {
+ int ncpus = libbpf_num_possible_cpus();
+ long values[ncpus];
+ __u32 i, j;
+ int ret;
+
+ for (i = 0; i < 256; i++) {
+ for (j = 0; j < ncpus; j++)
+ values[j] = i;
+ ret = bpf_map_update_elem(fd, &i, &values, BPF_ANY);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+ }
+
+This snippet shows how to access the per CPU elements of an array value.
+
+.. code-block:: c
+
+ int lookup(int fd)
+ {
+ int ncpus = libbpf_num_possible_cpus();
+ __u32 index = 42, j;
+ long values[ncpus];
+ int ret;
+
+ ret = bpf_map_lookup_elem(fd, &index, &values);
+ if (ret < 0)
+ return ret;
+
+ for (j = 0; j < ncpus; j++) {
+ /* Use per CPU value here */
+ assert(values[j] == 42);
+ }
+
+ return ret;
+ }
+
+Semantics
+=========
+
+As shown in the example above, when accessing a ``BPF_MAP_TYPE_PERCPU_ARRAY``
+in userspace, each value is an array with ``ncpus`` elements.
+
+When calling ``bpf_map_update_elem()`` the flag ``BPF_NOEXIST`` can not be used
+for these maps.
diff --git a/Documentation/bpf/map_bloom_filter.rst b/Documentation/bpf/map_bloom_filter.rst
new file mode 100644
index 000000000000..c82487f2fe0d
--- /dev/null
+++ b/Documentation/bpf/map_bloom_filter.rst
@@ -0,0 +1,174 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+=========================
+BPF_MAP_TYPE_BLOOM_FILTER
+=========================
+
+.. note::
+ - ``BPF_MAP_TYPE_BLOOM_FILTER`` was introduced in kernel version 5.16
+
+``BPF_MAP_TYPE_BLOOM_FILTER`` provides a BPF bloom filter map. Bloom
+filters are a space-efficient probabilistic data structure used to
+quickly test whether an element exists in a set. In a bloom filter,
+false positives are possible whereas false negatives are not.
+
+The bloom filter map does not have keys, only values. When the bloom
+filter map is created, it must be created with a ``key_size`` of 0. The
+bloom filter map supports two operations:
+
+- push: adding an element to the map
+- peek: determining whether an element is present in the map
+
+BPF programs must use ``bpf_map_push_elem`` to add an element to the
+bloom filter map and ``bpf_map_peek_elem`` to query the map. These
+operations are exposed to userspace applications using the existing
+``bpf`` syscall in the following way:
+
+- ``BPF_MAP_UPDATE_ELEM`` -> push
+- ``BPF_MAP_LOOKUP_ELEM`` -> peek
+
+The ``max_entries`` size that is specified at map creation time is used
+to approximate a reasonable bitmap size for the bloom filter, and is not
+otherwise strictly enforced. If the user wishes to insert more entries
+into the bloom filter than ``max_entries``, this may lead to a higher
+false positive rate.
+
+The number of hashes to use for the bloom filter is configurable using
+the lower 4 bits of ``map_extra`` in ``union bpf_attr`` at map creation
+time. If no number is specified, the default used will be 5 hash
+functions. In general, using more hashes decreases both the false
+positive rate and the speed of a lookup.
+
+It is not possible to delete elements from a bloom filter map. A bloom
+filter map may be used as an inner map. The user is responsible for
+synchronising concurrent updates and lookups to ensure no false negative
+lookups occur.
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_map_push_elem()
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+
+A ``value`` can be added to a bloom filter using the
+``bpf_map_push_elem()`` helper. The ``flags`` parameter must be set to
+``BPF_ANY`` when adding an entry to the bloom filter. This helper
+returns ``0`` on success, or negative error in case of failure.
+
+bpf_map_peek_elem()
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_peek_elem(struct bpf_map *map, void *value)
+
+The ``bpf_map_peek_elem()`` helper is used to determine whether
+``value`` is present in the bloom filter map. This helper returns ``0``
+if ``value`` is probably present in the map, or ``-ENOENT`` if ``value``
+is definitely not present in the map.
+
+Userspace
+---------
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags)
+
+A userspace program can add a ``value`` to a bloom filter using libbpf's
+``bpf_map_update_elem`` function. The ``key`` parameter must be set to
+``NULL`` and ``flags`` must be set to ``BPF_ANY``. Returns ``0`` on
+success, or negative error in case of failure.
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_lookup_elem(int fd, const void *key, void *value)
+
+A userspace program can determine the presence of ``value`` in a bloom
+filter using libbpf's ``bpf_map_lookup_elem`` function. The ``key``
+parameter must be set to ``NULL``. Returns ``0`` if ``value`` is
+probably present in the map, or ``-ENOENT`` if ``value`` is definitely
+not present in the map.
+
+Examples
+========
+
+Kernel BPF
+----------
+
+This snippet shows how to declare a bloom filter in a BPF program:
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_BLOOM_FILTER);
+ __type(value, __u32);
+ __uint(max_entries, 1000);
+ __uint(map_extra, 3);
+ } bloom_filter SEC(".maps");
+
+This snippet shows how to determine presence of a value in a bloom
+filter in a BPF program:
+
+.. code-block:: c
+
+ void *lookup(__u32 key)
+ {
+ if (bpf_map_peek_elem(&bloom_filter, &key) == 0) {
+ /* Verify not a false positive and fetch an associated
+ * value using a secondary lookup, e.g. in a hash table
+ */
+ return bpf_map_lookup_elem(&hash_table, &key);
+ }
+ return NULL;
+ }
+
+Userspace
+---------
+
+This snippet shows how to use libbpf to create a bloom filter map from
+userspace:
+
+.. code-block:: c
+
+ int create_bloom()
+ {
+ LIBBPF_OPTS(bpf_map_create_opts, opts,
+ .map_extra = 3); /* number of hashes */
+
+ return bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER,
+ "ipv6_bloom", /* name */
+ 0, /* key size, must be zero */
+ sizeof(ipv6_addr), /* value size */
+ 10000, /* max entries */
+ &opts); /* create options */
+ }
+
+This snippet shows how to add an element to a bloom filter from
+userspace:
+
+.. code-block:: c
+
+ int add_element(struct bpf_map *bloom_map, __u32 value)
+ {
+ int bloom_fd = bpf_map__fd(bloom_map);
+ return bpf_map_update_elem(bloom_fd, NULL, &value, BPF_ANY);
+ }
+
+References
+==========
+
+https://lwn.net/ml/bpf/20210831225005.2762202-1-joannekoong@fb.com/
diff --git a/Documentation/bpf/map_cgroup_storage.rst b/Documentation/bpf/map_cgroup_storage.rst
index cab9543017bf..8e5fe532c07e 100644
--- a/Documentation/bpf/map_cgroup_storage.rst
+++ b/Documentation/bpf/map_cgroup_storage.rst
@@ -31,7 +31,7 @@ The map uses key of type of either ``__u64 cgroup_inode_id`` or
};
``cgroup_inode_id`` is the inode id of the cgroup directory.
-``attach_type`` is the the program's attach type.
+``attach_type`` is the program's attach type.
Linux 5.9 added support for type ``__u64 cgroup_inode_id`` as the key type.
When this key type is used, then all attach types of the particular cgroup and
@@ -155,7 +155,7 @@ However, the BPF program can still only associate with one map of each type
``BPF_MAP_TYPE_CGROUP_STORAGE`` or more than one
``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE``.
-In all versions, userspace may use the the attach parameters of cgroup and
+In all versions, userspace may use the attach parameters of cgroup and
attach type pair in ``struct bpf_cgroup_storage_key`` as the key to the BPF map
APIs to read or update the storage for a given attachment. For Linux 5.9
attach type shared storages, only the first value in the struct, cgroup inode
diff --git a/Documentation/bpf/map_cgrp_storage.rst b/Documentation/bpf/map_cgrp_storage.rst
new file mode 100644
index 000000000000..5d3f603efffa
--- /dev/null
+++ b/Documentation/bpf/map_cgrp_storage.rst
@@ -0,0 +1,109 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Meta Platforms, Inc. and affiliates.
+
+=========================
+BPF_MAP_TYPE_CGRP_STORAGE
+=========================
+
+The ``BPF_MAP_TYPE_CGRP_STORAGE`` map type represents local fixed-size
+storage for cgroups. It is only available with ``CONFIG_CGROUPS``.
+The programs are made available by the same Kconfig. The
+data for a particular cgroup can be retrieved by looking up the map
+with that cgroup.
+
+This document describes the usage and semantics of the
+``BPF_MAP_TYPE_CGRP_STORAGE`` map type.
+
+Usage
+=====
+
+The map key must be ``sizeof(int)`` representing a cgroup fd.
+To access the storage in a program, use ``bpf_cgrp_storage_get``::
+
+ void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags)
+
+``flags`` can be 0 or ``BPF_LOCAL_STORAGE_GET_F_CREATE``, which indicates that
+a new local storage entry will be created if one does not exist.
+
+The local storage can be removed with ``bpf_cgrp_storage_delete``::
+
+ long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup)
+
+The map is available to all program types.
+
+Examples
+========
+
+A BPF program example with BPF_MAP_TYPE_CGRP_STORAGE::
+
+ #include <vmlinux.h>
+ #include <bpf/bpf_helpers.h>
+ #include <bpf/bpf_tracing.h>
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, long);
+ } cgrp_storage SEC(".maps");
+
+ SEC("tp_btf/sys_enter")
+ int BPF_PROG(on_enter, struct pt_regs *regs, long id)
+ {
+ struct task_struct *task = bpf_get_current_task_btf();
+ long *ptr;
+
+ ptr = bpf_cgrp_storage_get(&cgrp_storage, task->cgroups->dfl_cgrp, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (ptr)
+ __sync_fetch_and_add(ptr, 1);
+
+ return 0;
+ }
+
+Userspace accessing map declared above::
+
+    #include <bpf/bpf.h>
+    #include <bpf/libbpf.h>
+
+    long map_lookup(struct bpf_map *map, int cgrp_fd)
+    {
+        long value;
+
+        /* user space lookup copies the value out; returns 0 on success */
+        if (!bpf_map_lookup_elem(bpf_map__fd(map), &cgrp_fd, &value))
+            return value;
+        return 0;
+    }
+
+Difference Between BPF_MAP_TYPE_CGRP_STORAGE and BPF_MAP_TYPE_CGROUP_STORAGE
+============================================================================
+
+The old cgroup storage map ``BPF_MAP_TYPE_CGROUP_STORAGE`` has been marked as
+deprecated (renamed to ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``). The new
+``BPF_MAP_TYPE_CGRP_STORAGE`` map should be used instead. The following
+illustrates the main differences between ``BPF_MAP_TYPE_CGRP_STORAGE`` and
+``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``.
+
+(1). ``BPF_MAP_TYPE_CGRP_STORAGE`` can be used by all program types while
+     ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` is available only to cgroup program
+     types like ``BPF_CGROUP_INET_INGRESS``, ``BPF_CGROUP_SOCK_OPS``, etc.
+
+(2). ``BPF_MAP_TYPE_CGRP_STORAGE`` supports local storage for more than one
+ cgroup while ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` only supports one cgroup
+ which is attached by a BPF program.
+
+(3). ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` allocates local storage at attach time so
+     ``bpf_get_local_storage()`` always returns non-NULL local storage.
+     ``BPF_MAP_TYPE_CGRP_STORAGE`` allocates local storage at runtime so
+     it is possible that ``bpf_cgrp_storage_get()`` may return NULL local storage.
+     To avoid such a NULL local storage issue, user space can use
+     ``bpf_map_update_elem()`` to pre-allocate local storage before a BPF program
+     is attached.
+
+(4). ``BPF_MAP_TYPE_CGRP_STORAGE`` supports deleting local storage by a BPF program
+     while ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED`` only deletes storage at
+     program detach time.
+
+So overall, ``BPF_MAP_TYPE_CGRP_STORAGE`` supports all ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``
+functionality and beyond. It is recommended to use ``BPF_MAP_TYPE_CGRP_STORAGE``
+instead of ``BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED``.
diff --git a/Documentation/bpf/map_cpumap.rst b/Documentation/bpf/map_cpumap.rst
new file mode 100644
index 000000000000..923cfc8ab51f
--- /dev/null
+++ b/Documentation/bpf/map_cpumap.rst
@@ -0,0 +1,177 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+===================
+BPF_MAP_TYPE_CPUMAP
+===================
+
+.. note::
+ - ``BPF_MAP_TYPE_CPUMAP`` was introduced in kernel version 4.15
+
+.. kernel-doc:: kernel/bpf/cpumap.c
+ :doc: cpu map
+
+An example use-case for this map type is software-based Receive Side Scaling (RSS).
+
+The CPUMAP represents the CPUs in the system indexed as the map-key, and the
+map-value is the config setting (per CPUMAP entry). Each CPUMAP entry has a dedicated
+kernel thread bound to the given CPU to represent the remote CPU execution unit.
+
+Starting from Linux kernel version 5.9 the CPUMAP can run a second XDP program
+on the remote CPU. This allows an XDP program to split its processing across
+multiple CPUs. Consider, for example, a scenario where the initial CPU (that
+sees/receives the packets) needs to do minimal packet processing and the
+remote CPU (to which the packet is directed) can afford to spend more cycles
+processing the frame. The initial CPU is where the XDP redirect program is
+executed. The remote CPU receives raw ``xdp_frame`` objects.
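+
+A sketch of such a second-stage program; the ``xdp/cpumap`` section name tells
+libbpf the expected attach type, and the program's fd can then be placed in
+``struct bpf_cpumap_val`` when updating the map:
+
+.. code-block:: c
+
+    SEC("xdp/cpumap")
+    int xdp_cpumap_prog(struct xdp_md *ctx)
+    {
+        /* runs on the remote CPU's kthread for each redirected frame */
+        return XDP_PASS;
+    }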
+
+Usage
+=====
+
+Kernel BPF
+----------
+bpf_redirect_map()
+^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+
+Redirect the packet to the endpoint referenced by ``map`` at index ``key``.
+For ``BPF_MAP_TYPE_CPUMAP`` this map contains references to CPUs.
+
+The lower two bits of ``flags`` are used as the return code if the map lookup
+fails. This is so that the return value can be one of the XDP program return
+codes up to ``XDP_TX``, as chosen by the caller.
+
+User space
+----------
+.. note::
+ CPUMAP entries can only be updated/looked up/deleted from user space and not
+ from an eBPF program. Trying to call these functions from a kernel eBPF
+ program will result in the program failing to load and a verifier warning.
+
+bpf_map_update_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags);
+
+CPU entries can be added or updated using the ``bpf_map_update_elem()``
+helper. This helper replaces existing elements atomically. The ``value`` parameter
+can be ``struct bpf_cpumap_val``.
+
+ .. code-block:: c
+
+ struct bpf_cpumap_val {
+ __u32 qsize; /* queue size to remote target CPU */
+ union {
+ int fd; /* prog fd on map write */
+ __u32 id; /* prog id on map read */
+ } bpf_prog;
+ };
+
+The ``flags`` argument can be one of the following:
+ - ``BPF_ANY``: Create a new element or update an existing element.
+ - ``BPF_NOEXIST``: Create a new element only if it did not exist.
+ - ``BPF_EXIST``: Update an existing element.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_lookup_elem(int fd, const void *key, void *value);
+
+CPU entries can be retrieved using the ``bpf_map_lookup_elem()``
+helper.
+
+bpf_map_delete_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_delete_elem(int fd, const void *key);
+
+CPU entries can be deleted using the ``bpf_map_delete_elem()``
+helper. This helper will return 0 on success, or negative error in case of
+failure.
+
+Examples
+========
+Kernel
+------
+
+The following code snippet shows how to declare a ``BPF_MAP_TYPE_CPUMAP`` called
+``cpu_map`` and how to redirect packets to a remote CPU using a round robin scheme.
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_CPUMAP);
+ __type(key, __u32);
+ __type(value, struct bpf_cpumap_val);
+ __uint(max_entries, 12);
+ } cpu_map SEC(".maps");
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, __u32);
+ __type(value, __u32);
+ __uint(max_entries, 12);
+ } cpus_available SEC(".maps");
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, __u32);
+ __type(value, __u32);
+ __uint(max_entries, 1);
+ } cpus_iterator SEC(".maps");
+
+ SEC("xdp")
+ int xdp_redir_cpu_round_robin(struct xdp_md *ctx)
+ {
+ __u32 key = 0;
+ __u32 cpu_dest = 0;
+ __u32 *cpu_selected, *cpu_iterator;
+ __u32 cpu_idx;
+
+ cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key);
+ if (!cpu_iterator)
+ return XDP_ABORTED;
+ cpu_idx = *cpu_iterator;
+
+ *cpu_iterator += 1;
+ if (*cpu_iterator == bpf_num_possible_cpus())
+ *cpu_iterator = 0;
+
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ if (cpu_dest >= bpf_num_possible_cpus())
+ return XDP_ABORTED;
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+ }
+
+User space
+----------
+
+The following code snippet shows how to dynamically set the ``max_entries``
+for a CPUMAP to the maximum number of CPUs available on the system.
+
+.. code-block:: c
+
+ int set_max_cpu_entries(struct bpf_map *cpu_map)
+ {
+ if (bpf_map__set_max_entries(cpu_map, libbpf_num_possible_cpus()) < 0) {
+ fprintf(stderr, "Failed to set max entries for cpu_map map: %s",
+ strerror(errno));
+ return -1;
+ }
+ return 0;
+ }
+
+References
+===========
+
+- https://developers.redhat.com/blog/2021/05/13/receive-side-scaling-rss-with-ebpf-and-cpumap#redirecting_into_a_cpumap
diff --git a/Documentation/bpf/map_devmap.rst b/Documentation/bpf/map_devmap.rst
new file mode 100644
index 000000000000..927312c7b8c8
--- /dev/null
+++ b/Documentation/bpf/map_devmap.rst
@@ -0,0 +1,238 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+=================================================
+BPF_MAP_TYPE_DEVMAP and BPF_MAP_TYPE_DEVMAP_HASH
+=================================================
+
+.. note::
+ - ``BPF_MAP_TYPE_DEVMAP`` was introduced in kernel version 4.14
+ - ``BPF_MAP_TYPE_DEVMAP_HASH`` was introduced in kernel version 5.4
+
+``BPF_MAP_TYPE_DEVMAP`` and ``BPF_MAP_TYPE_DEVMAP_HASH`` are BPF maps primarily
+used as backend maps for the XDP BPF helper call ``bpf_redirect_map()``.
+``BPF_MAP_TYPE_DEVMAP`` is backed by an array that uses the key as the index
+to look up a reference to a net device, while ``BPF_MAP_TYPE_DEVMAP_HASH`` is
+backed by a hash table that uses a key to look up a reference to a net device.
+The user provides either <``key``/``ifindex``> or <``key``/``struct bpf_devmap_val``>
+pairs to update the maps with new net devices.
+
+.. note::
+ - The key to a hash map doesn't have to be an ``ifindex``.
+ - While ``BPF_MAP_TYPE_DEVMAP_HASH`` allows for densely packing the net devices,
+   it comes at the cost of hashing the key when performing a lookup.
+
+The setup and packet enqueue/send code is shared between the two types of
+devmap; only the lookup and insertion routines differ.
+
+Usage
+=====
+Kernel BPF
+----------
+bpf_redirect_map()
+^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+
+Redirect the packet to the endpoint referenced by ``map`` at index ``key``.
+For ``BPF_MAP_TYPE_DEVMAP`` and ``BPF_MAP_TYPE_DEVMAP_HASH`` this map contains
+references to net devices (for forwarding packets through other ports).
+
+The lower two bits of ``flags`` are used as the return code if the map lookup
+fails. This is so that the return value can be one of the XDP program return
+codes up to ``XDP_TX``, as chosen by the caller. The higher bits of ``flags``
+can be set to ``BPF_F_BROADCAST`` or ``BPF_F_EXCLUDE_INGRESS`` as defined
+below.
+
+With ``BPF_F_BROADCAST`` the packet will be broadcast to all the interfaces
+in the map; with ``BPF_F_EXCLUDE_INGRESS`` the ingress interface will be
+excluded from the broadcast.
+
+.. note::
+ - The key is ignored if ``BPF_F_BROADCAST`` is set.
+ - The broadcast feature can also be used to implement multicast forwarding:
+ simply create multiple DEVMAPs, each one corresponding to a single multicast group.
+
+This helper will return ``XDP_REDIRECT`` on success, or the value of the two
+lower bits of the ``flags`` argument if the map lookup fails.
+
+More information about redirection can be found in :doc:`redirect`.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+Net device entries can be retrieved using the ``bpf_map_lookup_elem()``
+helper.
+
+User space
+----------
+.. note::
+ DEVMAP entries can only be updated/deleted from user space and not
+ from an eBPF program. Trying to call these functions from a kernel eBPF
+ program will result in the program failing to load and a verifier warning.
+
+bpf_map_update_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags);
+
+Net device entries can be added or updated using the ``bpf_map_update_elem()``
+helper. This helper replaces existing elements atomically. The ``value`` parameter
+can be ``struct bpf_devmap_val`` or a simple ``int ifindex`` for backwards
+compatibility.
+
+ .. code-block:: c
+
+ struct bpf_devmap_val {
+ __u32 ifindex; /* device index */
+ union {
+ int fd; /* prog fd on map write */
+ __u32 id; /* prog id on map read */
+ } bpf_prog;
+ };
+
+The ``flags`` argument can be one of the following:
+ - ``BPF_ANY``: Create a new element or update an existing element.
+ - ``BPF_NOEXIST``: Create a new element only if it did not exist.
+ - ``BPF_EXIST``: Update an existing element.
+
+DEVMAPs can associate a program with a device entry by adding a ``bpf_prog.fd``
+to ``struct bpf_devmap_val``. Programs are run after ``XDP_REDIRECT`` and have
+access to both Rx device and Tx device. The program associated with the ``fd``
+must have type XDP with expected attach type ``xdp_devmap``.
+When a program is associated with a device index, the program is run on an
+``XDP_REDIRECT`` and before the buffer is added to the per-cpu queue. Examples
+of how to attach/use xdp_devmap progs can be found in the kernel selftests:
+
+- ``tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c``
+- ``tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c``
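+
+A minimal sketch of such a program; the ``xdp/devmap`` section name tells
+libbpf the expected attach type:
+
+.. code-block:: c
+
+    SEC("xdp/devmap")
+    int xdp_devmap_prog(struct xdp_md *ctx)
+    {
+        /* runs after the redirect, before transmit on the egress device */
+        return XDP_PASS;
+    }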
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+   int bpf_map_lookup_elem(int fd, const void *key, void *value);
+
+Net device entries can be retrieved using the ``bpf_map_lookup_elem()``
+helper.
+
+bpf_map_delete_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+   int bpf_map_delete_elem(int fd, const void *key);
+
+Net device entries can be deleted using the ``bpf_map_delete_elem()``
+helper. This helper will return 0 on success, or negative error in case of
+failure.
+
+Examples
+========
+
+Kernel BPF
+----------
+
+The following code snippet shows how to declare a ``BPF_MAP_TYPE_DEVMAP``
+called ``tx_port``.
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __type(key, __u32);
+ __type(value, __u32);
+ __uint(max_entries, 256);
+ } tx_port SEC(".maps");
+
+The following code snippet shows how to declare a ``BPF_MAP_TYPE_DEVMAP_HASH``
+called ``forward_map``.
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+ __type(key, __u32);
+ __type(value, struct bpf_devmap_val);
+ __uint(max_entries, 32);
+ } forward_map SEC(".maps");
+
+.. note::
+
+ The value type in the DEVMAP above is a ``struct bpf_devmap_val``
+
+The following code snippet shows a simple xdp_redirect_map program. This program
+would work with a user space program that populates the devmap ``forward_map`` based
+on ingress ifindexes. The BPF program (below) is redirecting packets using the
+ingress ``ifindex`` as the ``key``.
+
+.. code-block:: c
+
+ SEC("xdp")
+ int xdp_redirect_map_func(struct xdp_md *ctx)
+ {
+ int index = ctx->ingress_ifindex;
+
+ return bpf_redirect_map(&forward_map, index, 0);
+ }
+
+The following code snippet shows a BPF program that is broadcasting packets to
+all the interfaces in the ``tx_port`` devmap.
+
+.. code-block:: c
+
+ SEC("xdp")
+ int xdp_redirect_map_func(struct xdp_md *ctx)
+ {
+ return bpf_redirect_map(&tx_port, 0, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+ }
+
+User space
+----------
+
+The following code snippet shows how to update a devmap called ``tx_port``.
+
+.. code-block:: c
+
+ int update_devmap(int ifindex, int redirect_ifindex)
+ {
+ int ret;
+
+ ret = bpf_map_update_elem(bpf_map__fd(tx_port), &ifindex, &redirect_ifindex, 0);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to update devmap value: %s\n",
+ strerror(errno));
+ }
+
+ return ret;
+ }
+
+The following code snippet shows how to update a devmap hash called ``forward_map``.
+
+.. code-block:: c
+
+ int update_devmap(int ifindex, int redirect_ifindex)
+ {
+ struct bpf_devmap_val devmap_val = { .ifindex = redirect_ifindex };
+ int ret;
+
+ ret = bpf_map_update_elem(bpf_map__fd(forward_map), &ifindex, &devmap_val, 0);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to update devmap value: %s\n",
+ strerror(errno));
+ }
+ return ret;
+ }
+
+References
+===========
+
+- https://lwn.net/Articles/728146/
+- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=6f9d451ab1a33728adb72d7ff66a7b374d665176
+- https://elixir.bootlin.com/linux/latest/source/net/core/filter.c#L4106
diff --git a/Documentation/bpf/map_hash.rst b/Documentation/bpf/map_hash.rst
new file mode 100644
index 000000000000..8669426264c6
--- /dev/null
+++ b/Documentation/bpf/map_hash.rst
@@ -0,0 +1,208 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+===============================================
+BPF_MAP_TYPE_HASH, with PERCPU and LRU Variants
+===============================================
+
+.. note::
+ - ``BPF_MAP_TYPE_HASH`` was introduced in kernel version 3.19
+ - ``BPF_MAP_TYPE_PERCPU_HASH`` was introduced in version 4.6
+ - Both ``BPF_MAP_TYPE_LRU_HASH`` and ``BPF_MAP_TYPE_LRU_PERCPU_HASH``
+ were introduced in version 4.10
+
+``BPF_MAP_TYPE_HASH`` and ``BPF_MAP_TYPE_PERCPU_HASH`` provide general
+purpose hash map storage. Both the key and the value can be structs,
+allowing for composite keys and values.
+
+The kernel is responsible for allocating and freeing key/value pairs, up
+to the max_entries limit that you specify. Hash maps use pre-allocation
+of hash table elements by default. The ``BPF_F_NO_PREALLOC`` flag can be
+used to disable pre-allocation when it is too memory expensive.
+
+``BPF_MAP_TYPE_PERCPU_HASH`` provides a separate value slot per
+CPU. The per-cpu values are stored internally in an array.
+
+The ``BPF_MAP_TYPE_LRU_HASH`` and ``BPF_MAP_TYPE_LRU_PERCPU_HASH``
+variants add LRU semantics to their respective hash tables. An LRU hash
+will automatically evict the least recently used entries when the hash
+table reaches capacity. An LRU hash maintains an internal LRU list that
+is used to select elements for eviction. This internal LRU list is
+shared across CPUs, but it is possible to request a per-CPU LRU list with
+the ``BPF_F_NO_COMMON_LRU`` flag when calling ``bpf_map_create``, as the
+sketch below shows.
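+
+A sketch of requesting per-CPU LRU lists from user space (map name and sizes
+are illustrative):
+
+.. code-block:: c
+
+    LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_COMMON_LRU);
+
+    int fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_map",
+                            sizeof(__u32),  /* key size */
+                            sizeof(long),   /* value size */
+                            1024,           /* max entries */
+                            &opts);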
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+
+Hash entries can be added or updated using the ``bpf_map_update_elem()``
+helper. This helper replaces existing elements atomically. The ``flags``
+parameter can be used to control the update behaviour:
+
+- ``BPF_ANY`` will create a new element or update an existing element
+- ``BPF_NOEXIST`` will create a new element only if one did not already
+ exist
+- ``BPF_EXIST`` will update an existing element
+
+``bpf_map_update_elem()`` returns 0 on success, or negative error in
+case of failure.
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+Hash entries can be retrieved using the ``bpf_map_lookup_elem()``
+helper. This helper returns a pointer to the value associated with
+``key``, or ``NULL`` if no entry was found.
+
+bpf_map_delete_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_delete_elem(struct bpf_map *map, const void *key)
+
+Hash entries can be deleted using the ``bpf_map_delete_elem()``
+helper. This helper will return 0 on success, or negative error in case
+of failure.
+
+Per CPU Hashes
+--------------
+
+For ``BPF_MAP_TYPE_PERCPU_HASH`` and ``BPF_MAP_TYPE_LRU_PERCPU_HASH``
+the ``bpf_map_update_elem()`` and ``bpf_map_lookup_elem()`` helpers
+automatically access the hash slot for the current CPU.
+
+bpf_map_lookup_percpu_elem()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu)
+
+The ``bpf_map_lookup_percpu_elem()`` helper can be used to look up the
+value in the hash slot for a specific CPU. It returns the value associated
+with ``key`` on ``cpu``, or ``NULL`` if no entry was found or ``cpu`` is
+invalid.
+
+Concurrency
+-----------
+
+Values stored in ``BPF_MAP_TYPE_HASH`` can be accessed concurrently by
+programs running on different CPUs. Since Kernel version 5.1, the BPF
+infrastructure provides ``struct bpf_spin_lock`` to synchronise access.
+See ``tools/testing/selftests/bpf/progs/test_spin_lock.c``.
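+
+A sketch of serialising updates to a shared value with a spin lock; the
+struct layout and map name are illustrative, and the usual
+``<linux/bpf.h>`` and ``<bpf/bpf_helpers.h>`` includes are assumed:
+
+.. code-block:: c
+
+ struct locked_value {
+ struct bpf_spin_lock lock;
+ __u64 counter;
+ };
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 16);
+ __type(key, __u32);
+ __type(value, struct locked_value);
+ } locked_map SEC(".maps");
+
+ static void increment(__u32 key)
+ {
+ struct locked_value *val = bpf_map_lookup_elem(&locked_map, &key);
+
+ if (val) {
+ bpf_spin_lock(&val->lock);
+ val->counter++;
+ bpf_spin_unlock(&val->lock);
+ }
+ }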
+
+Userspace
+---------
+
+bpf_map_get_next_key()
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_get_next_key(int fd, const void *cur_key, void *next_key)
+
+In userspace, it is possible to iterate through the keys of a hash using
+libbpf's ``bpf_map_get_next_key()`` function. The first key can be fetched by
+calling ``bpf_map_get_next_key()`` with ``cur_key`` set to
+``NULL``. Subsequent calls will fetch the next key that follows the
+current key. ``bpf_map_get_next_key()`` returns ``0`` on success,
+``-ENOENT`` if ``cur_key`` is the last key in the hash, or negative
+error in case of failure.
+
+Note that if ``cur_key`` gets deleted then ``bpf_map_get_next_key()``
+will instead return the *first* key in the hash table, which is
+undesirable. It is recommended to use batched lookup if keys are going
+to be deleted while iterating with ``bpf_map_get_next_key()``.
+
+Examples
+========
+
+Please see the ``tools/testing/selftests/bpf`` directory for functional
+examples. The code snippets below demonstrate API usage.
+
+This example shows how to declare an LRU Hash with a struct key and a
+struct value.
+
+.. code-block:: c
+
+ #include <linux/bpf.h>
+ #include <bpf/bpf_helpers.h>
+
+ struct key {
+ __u32 srcip;
+ };
+
+ struct value {
+ __u64 packets;
+ __u64 bytes;
+ };
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(max_entries, 32);
+ __type(key, struct key);
+ __type(value, struct value);
+ } packet_stats SEC(".maps");
+
+This example shows how to create or update hash values using atomic
+instructions:
+
+.. code-block:: c
+
+ static void update_stats(__u32 srcip, int bytes)
+ {
+ struct key key = {
+ .srcip = srcip,
+ };
+ struct value *value = bpf_map_lookup_elem(&packet_stats, &key);
+
+ if (value) {
+ __sync_fetch_and_add(&value->packets, 1);
+ __sync_fetch_and_add(&value->bytes, bytes);
+ } else {
+ struct value newval = { 1, bytes };
+
+ bpf_map_update_elem(&packet_stats, &key, &newval, BPF_NOEXIST);
+ }
+ }
+
+Userspace walking the map elements from the map declared above:
+
+.. code-block:: c
+
+ #include <bpf/libbpf.h>
+ #include <bpf/bpf.h>
+
+ static void walk_hash_elements(int map_fd)
+ {
+ struct key *cur_key = NULL;
+ struct key next_key;
+ struct value value;
+ int err;
+
+ for (;;) {
+ err = bpf_map_get_next_key(map_fd, cur_key, &next_key);
+ if (err)
+ break;
+
+ bpf_map_lookup_elem(map_fd, &next_key, &value);
+
+ /* Use key and value here */
+
+ cur_key = &next_key;
+ }
+ }
diff --git a/Documentation/bpf/map_lpm_trie.rst b/Documentation/bpf/map_lpm_trie.rst
new file mode 100644
index 000000000000..74d64a30f500
--- /dev/null
+++ b/Documentation/bpf/map_lpm_trie.rst
@@ -0,0 +1,197 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+=====================
+BPF_MAP_TYPE_LPM_TRIE
+=====================
+
+.. note::
+ - ``BPF_MAP_TYPE_LPM_TRIE`` was introduced in kernel version 4.11
+
+``BPF_MAP_TYPE_LPM_TRIE`` provides a longest prefix match algorithm that
+can be used to match IP addresses to a stored set of prefixes.
+Internally, data is stored in an unbalanced trie of nodes that uses
+``prefixlen,data`` pairs as its keys. The ``data`` is interpreted in
+network byte order, i.e. big endian, so ``data[0]`` stores the most
+significant byte.
+
+LPM tries may be created with a maximum prefix length that is a multiple
+of 8, in the range from 8 to 2048. The key used for lookup and update
+operations is a ``struct bpf_lpm_trie_key``, extended by
+``max_prefixlen/8`` bytes.
+
+- For IPv4 addresses the data length is 4 bytes
+- For IPv6 addresses the data length is 16 bytes
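+
+For example, a key for IPv6 prefixes could be laid out as follows (an
+illustrative declaration mirroring ``struct bpf_lpm_trie_key``):
+
+.. code-block:: c
+
+ struct ipv6_lpm_key {
+ __u32 prefixlen; /* up to 128 for IPv6 */
+ __u8 data[16]; /* address in network byte order */
+ };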
+
+The value type stored in the LPM trie can be any user defined type.
+
+.. note::
+ When creating a map of type ``BPF_MAP_TYPE_LPM_TRIE`` you must set the
+ ``BPF_F_NO_PREALLOC`` flag.
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+The longest prefix entry for a given data value can be found using the
+``bpf_map_lookup_elem()`` helper. This helper returns a pointer to the
+value associated with the longest matching ``key``, or ``NULL`` if no
+entry was found.
+
+The ``key`` should have ``prefixlen`` set to ``max_prefixlen`` when
+performing longest prefix lookups. For example, when searching for the
+longest prefix match for an IPv4 address, ``prefixlen`` should be set to
+``32``.
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+
+Prefix entries can be added or updated using the ``bpf_map_update_elem()``
+helper. This helper replaces existing elements atomically.
+
+``bpf_map_update_elem()`` returns ``0`` on success, or negative error in
+case of failure.
+
+.. note::
+ The ``flags`` parameter must be one of ``BPF_ANY``, ``BPF_NOEXIST`` or
+ ``BPF_EXIST``, but the value is ignored, giving ``BPF_ANY`` semantics.
+
+bpf_map_delete_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_delete_elem(struct bpf_map *map, const void *key)
+
+Prefix entries can be deleted using the ``bpf_map_delete_elem()``
+helper. This helper will return 0 on success, or negative error in case
+of failure.
+
+Userspace
+---------
+
+Access from userspace uses libbpf APIs with the same names as above, with
+the map identified by ``fd``.
+
+bpf_map_get_next_key()
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_get_next_key(int fd, const void *cur_key, void *next_key)
+
+A userspace program can iterate through the entries in an LPM trie using
+libbpf's ``bpf_map_get_next_key()`` function. The first key can be
+fetched by calling ``bpf_map_get_next_key()`` with ``cur_key`` set to
+``NULL``. Subsequent calls will fetch the next key that follows the
+current key. ``bpf_map_get_next_key()`` returns ``0`` on success,
+``-ENOENT`` if ``cur_key`` is the last key in the trie, or negative
+error in case of failure.
+
+``bpf_map_get_next_key()`` will iterate through the LPM trie elements
+from leftmost leaf first. This means that iteration will return more
+specific keys before less specific ones.
+
+Examples
+========
+
+Please see ``tools/testing/selftests/bpf/test_lpm_map.c`` for examples
+of LPM trie usage from userspace. The code snippets below demonstrate
+API usage.
+
+Kernel BPF
+----------
+
+The following BPF code snippet shows how to declare a new LPM trie for IPv4
+address prefixes:
+
+.. code-block:: c
+
+ #include <linux/bpf.h>
+ #include <bpf/bpf_helpers.h>
+
+ struct ipv4_lpm_key {
+ __u32 prefixlen;
+ __u32 data;
+ };
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_LPM_TRIE);
+ __type(key, struct ipv4_lpm_key);
+ __type(value, __u32);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __uint(max_entries, 255);
+ } ipv4_lpm_map SEC(".maps");
+
+The following BPF code snippet shows how to lookup by IPv4 address:
+
+.. code-block:: c
+
+ void *lookup(__u32 ipaddr)
+ {
+ struct ipv4_lpm_key key = {
+ .prefixlen = 32,
+ .data = ipaddr
+ };
+
+ return bpf_map_lookup_elem(&ipv4_lpm_map, &key);
+ }
+
+Userspace
+---------
+
+The following snippet shows how to insert an IPv4 prefix entry into an
+LPM trie:
+
+.. code-block:: c
+
+ int add_prefix_entry(int lpm_fd, __u32 addr, __u32 prefixlen, struct value *value)
+ {
+ struct ipv4_lpm_key ipv4_key = {
+ .prefixlen = prefixlen,
+ .data = addr
+ };
+ return bpf_map_update_elem(lpm_fd, &ipv4_key, value, BPF_ANY);
+ }
+
+The following snippet shows a userspace program walking through the entries
+of an LPM trie:
+
+.. code-block:: c
+
+ #include <bpf/libbpf.h>
+ #include <bpf/bpf.h>
+
+ void iterate_lpm_trie(int map_fd)
+ {
+ struct ipv4_lpm_key *cur_key = NULL;
+ struct ipv4_lpm_key next_key;
+ struct value value;
+ int err;
+
+ for (;;) {
+ err = bpf_map_get_next_key(map_fd, cur_key, &next_key);
+ if (err)
+ break;
+
+ bpf_map_lookup_elem(map_fd, &next_key, &value);
+
+ /* Use key and value here */
+
+ cur_key = &next_key;
+ }
+ }
diff --git a/Documentation/bpf/map_of_maps.rst b/Documentation/bpf/map_of_maps.rst
new file mode 100644
index 000000000000..7b5617c2d017
--- /dev/null
+++ b/Documentation/bpf/map_of_maps.rst
@@ -0,0 +1,130 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+========================================================
+BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS
+========================================================
+
+.. note::
+ - ``BPF_MAP_TYPE_ARRAY_OF_MAPS`` and ``BPF_MAP_TYPE_HASH_OF_MAPS`` were
+ introduced in kernel version 4.12
+
+``BPF_MAP_TYPE_ARRAY_OF_MAPS`` and ``BPF_MAP_TYPE_HASH_OF_MAPS`` provide general
+purpose support for map in map storage. One level of nesting is supported, where
+an outer map contains instances of a single type of inner map, for example
+``array_of_maps->sock_map``.
+
+When creating an outer map, an inner map instance is used to initialize the
+metadata that the outer map holds about its inner maps. This inner map has a
+separate lifetime from the outer map and can be deleted after the outer map has
+been created.
+
+The outer map supports element lookup, update and delete from user space using
+the syscall API. A BPF program is only allowed to do element lookup in the outer
+map.
+
+.. note::
+ - Multi-level nesting is not supported.
+ - Any BPF map type can be used as an inner map, except for
+ ``BPF_MAP_TYPE_PROG_ARRAY``.
+ - A BPF program cannot update or delete outer map entries.
+
+For ``BPF_MAP_TYPE_ARRAY_OF_MAPS`` the key is an unsigned 32-bit integer index
+into the array. The array is a fixed size with ``max_entries`` elements that are
+zero initialized when created.
+
+For ``BPF_MAP_TYPE_HASH_OF_MAPS`` the key type can be chosen when defining the
+map. The kernel is responsible for allocating and freeing key/value pairs, up to
+the ``max_entries`` limit that you specify. Hash maps use pre-allocation of hash
+table elements by default. The ``BPF_F_NO_PREALLOC`` flag can be used to disable
+pre-allocation when it is too memory expensive.
+
+Usage
+=====
+
+Kernel BPF Helper
+-----------------
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+Inner maps can be retrieved using the ``bpf_map_lookup_elem()`` helper. This
+helper returns a pointer to the inner map, or ``NULL`` if no entry was found.
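+
+As a sketch, a BPF program can chain lookups to reach an element of an
+inner map; the map names and types here are illustrative:
+
+.. code-block:: c
+
+ #include <linux/bpf.h>
+ #include <bpf/bpf_helpers.h>
+
+ struct inner_hash {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 16);
+ __type(key, __u32);
+ __type(value, __u64);
+ } first_inner SEC(".maps");
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 4);
+ __type(key, __u32);
+ __array(values, struct inner_hash);
+ } outer SEC(".maps") = {
+ .values = { &first_inner },
+ };
+
+ SEC("xdp")
+ int count_in_inner(struct xdp_md *ctx)
+ {
+ __u32 outer_key = 0, inner_key = 0;
+ __u64 *count;
+ void *inner;
+
+ inner = bpf_map_lookup_elem(&outer, &outer_key);
+ if (!inner)
+ return XDP_PASS;
+
+ count = bpf_map_lookup_elem(inner, &inner_key);
+ if (count)
+ __sync_fetch_and_add(count, 1);
+
+ return XDP_PASS;
+ }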
+
+Examples
+========
+
+Kernel BPF Example
+------------------
+
+This snippet shows how to create and initialise an array of devmaps in a BPF
+program. Note that the outer array can only be modified from user space using
+the syscall API.
+
+.. code-block:: c
+
+ struct inner_map {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(max_entries, 10);
+ __type(key, __u32);
+ __type(value, __u32);
+ } inner_map1 SEC(".maps"), inner_map2 SEC(".maps");
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __array(values, struct inner_map);
+ } outer_map SEC(".maps") = {
+ .values = { &inner_map1,
+ &inner_map2 }
+ };
+
+See ``progs/test_btf_map_in_map.c`` in ``tools/testing/selftests/bpf`` for more
+examples of declarative initialisation of outer maps.
+
+User Space
+----------
+
+This snippet shows how to create an array based outer map:
+
+.. code-block:: c
+
+ int create_outer_array(int inner_fd)
+ {
+ LIBBPF_OPTS(bpf_map_create_opts, opts, .inner_map_fd = inner_fd);
+ int fd;
+
+ fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ "example_array", /* name */
+ sizeof(__u32), /* key size */
+ sizeof(__u32), /* value size */
+ 256, /* max entries */
+ &opts); /* create opts */
+ return fd;
+ }
+
+
+This snippet shows how to add an inner map to an outer map:
+
+.. code-block:: c
+
+ int add_devmap(int outer_fd, int index, const char *name)
+ {
+ int fd;
+
+ fd = bpf_map_create(BPF_MAP_TYPE_DEVMAP, name,
+ sizeof(__u32), sizeof(__u32), 256, NULL);
+ if (fd < 0)
+ return fd;
+
+ return bpf_map_update_elem(outer_fd, &index, &fd, BPF_ANY);
+ }
+
+References
+==========
+
+- https://lore.kernel.org/netdev/20170322170035.923581-3-kafai@fb.com/
+- https://lore.kernel.org/netdev/20170322170035.923581-4-kafai@fb.com/
diff --git a/Documentation/bpf/map_queue_stack.rst b/Documentation/bpf/map_queue_stack.rst
new file mode 100644
index 000000000000..8d14ed49d6e1
--- /dev/null
+++ b/Documentation/bpf/map_queue_stack.rst
@@ -0,0 +1,146 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+=========================================
+BPF_MAP_TYPE_QUEUE and BPF_MAP_TYPE_STACK
+=========================================
+
+.. note::
+ - ``BPF_MAP_TYPE_QUEUE`` and ``BPF_MAP_TYPE_STACK`` were introduced
+ in kernel version 4.20
+
+``BPF_MAP_TYPE_QUEUE`` provides FIFO storage and ``BPF_MAP_TYPE_STACK``
+provides LIFO storage for BPF programs. These maps support peek, pop and
+push operations that are exposed to BPF programs through the respective
+helpers. These operations are exposed to userspace applications using
+the existing ``bpf`` syscall in the following way:
+
+- ``BPF_MAP_LOOKUP_ELEM`` -> peek
+- ``BPF_MAP_LOOKUP_AND_DELETE_ELEM`` -> pop
+- ``BPF_MAP_UPDATE_ELEM`` -> push
+
+``BPF_MAP_TYPE_QUEUE`` and ``BPF_MAP_TYPE_STACK`` do not support
+``BPF_F_NO_PREALLOC``.
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_map_push_elem()
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+
+An element ``value`` can be added to a queue or stack using the
+``bpf_map_push_elem`` helper. The ``flags`` parameter must be set to
+``BPF_ANY`` or ``BPF_EXIST``. If ``flags`` is set to ``BPF_EXIST`` then,
+when the queue or stack is full, the oldest element will be removed to
+make room for ``value`` to be added. Returns ``0`` on success, or
+negative error in case of failure.
+
+bpf_map_peek_elem()
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_peek_elem(struct bpf_map *map, void *value)
+
+This helper fetches an element ``value`` from a queue or stack without
+removing it. Returns ``0`` on success, or negative error in case of
+failure.
+
+bpf_map_pop_elem()
+~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_map_pop_elem(struct bpf_map *map, void *value)
+
+This helper removes an element from a queue or stack, storing it into
+``value``. Returns ``0`` on success, or negative error in case of failure.
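+
+As a sketch, a BPF program could combine these helpers as follows,
+reusing the ``queue`` map declared in the Examples section below:
+
+.. code-block:: c
+
+ SEC("xdp")
+ int queue_sample(struct xdp_md *ctx)
+ {
+ __u32 in = ctx->rx_queue_index;
+ __u32 out;
+
+ /* Push; with BPF_EXIST the oldest element is evicted when full */
+ if (bpf_map_push_elem(&queue, &in, BPF_EXIST))
+ return XDP_PASS;
+
+ /* Pop the element at the head of the queue into 'out' */
+ if (!bpf_map_pop_elem(&queue, &out))
+ bpf_printk("popped %u", out);
+
+ return XDP_PASS;
+ }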
+
+
+Userspace
+---------
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags)
+
+A userspace program can push ``value`` onto a queue or stack using libbpf's
+``bpf_map_update_elem`` function. The ``key`` parameter must be set to
+``NULL`` and ``flags`` must be set to ``BPF_ANY`` or ``BPF_EXIST``, with the
+same semantics as the ``bpf_map_push_elem`` kernel helper. Returns ``0`` on
+success, or negative error in case of failure.
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_lookup_elem(int fd, const void *key, void *value)
+
+A userspace program can peek at the ``value`` at the head of a queue or stack
+using the libbpf ``bpf_map_lookup_elem`` function. The ``key`` parameter must be
+set to ``NULL``. Returns ``0`` on success, or negative error in case of
+failure.
+
+bpf_map_lookup_and_delete_elem()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value)
+
+A userspace program can pop a ``value`` from the head of a queue or stack using
+the libbpf ``bpf_map_lookup_and_delete_elem`` function. The ``key`` parameter
+must be set to ``NULL``. Returns ``0`` on success, or negative error in case of
+failure.
+
+Examples
+========
+
+Kernel BPF
+----------
+
+This snippet shows how to declare a queue in a BPF program:
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_QUEUE);
+ __type(value, __u32);
+ __uint(max_entries, 10);
+ } queue SEC(".maps");
+
+
+Userspace
+---------
+
+This snippet shows how to use libbpf's low-level API to create a queue from
+userspace:
+
+.. code-block:: c
+
+ int create_queue()
+ {
+ return bpf_map_create(BPF_MAP_TYPE_QUEUE,
+ "sample_queue", /* name */
+ 0, /* key size, must be zero */
+ sizeof(__u32), /* value size */
+ 10, /* max entries */
+ NULL); /* create options */
+ }
+
+
+References
+==========
+
+https://lwn.net/ml/netdev/153986858555.9127.14517764371945179514.stgit@kernel/
diff --git a/Documentation/bpf/map_sk_storage.rst b/Documentation/bpf/map_sk_storage.rst
new file mode 100644
index 000000000000..4e9d23ab9ecd
--- /dev/null
+++ b/Documentation/bpf/map_sk_storage.rst
@@ -0,0 +1,159 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+=======================
+BPF_MAP_TYPE_SK_STORAGE
+=======================
+
+.. note::
+ - ``BPF_MAP_TYPE_SK_STORAGE`` was introduced in kernel version 5.2
+
+``BPF_MAP_TYPE_SK_STORAGE`` is used to provide socket-local storage for BPF
+programs. A map of type ``BPF_MAP_TYPE_SK_STORAGE`` declares the type of storage
+to be provided and acts as the handle for accessing the socket-local
+storage. The values for maps of type ``BPF_MAP_TYPE_SK_STORAGE`` are stored
+locally with each socket instead of with the map. The kernel is responsible for
+allocating storage for a socket when requested and for freeing the storage when
+either the map or the socket is deleted.
+
+.. note::
+ - The key type must be ``int`` and ``max_entries`` must be set to ``0``.
+ - The ``BPF_F_NO_PREALLOC`` flag must be used when creating a map for
+ socket-local storage.
+
+Usage
+=====
+
+Kernel BPF
+----------
+
+bpf_sk_storage_get()
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
+
+Socket-local storage for ``map`` can be retrieved from socket ``sk`` using the
+``bpf_sk_storage_get()`` helper. If the ``BPF_LOCAL_STORAGE_GET_F_CREATE``
+flag is used then ``bpf_sk_storage_get()`` will create the storage for ``sk``
+if it does not already exist. ``value`` can be used together with
+``BPF_LOCAL_STORAGE_GET_F_CREATE`` to initialize the storage value, otherwise
+it will be zero initialized. Returns a pointer to the storage on success, or
+``NULL`` in case of failure.
+
+.. note::
+ - ``sk`` is a kernel ``struct sock`` pointer for LSM or tracing programs.
+ - ``sk`` is a ``struct bpf_sock`` pointer for other program types.
+
+bpf_sk_storage_delete()
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
+
+Socket-local storage for ``map`` can be deleted from socket ``sk`` using the
+``bpf_sk_storage_delete()`` helper. Returns ``0`` on success, or negative
+error in case of failure.
+
+User space
+----------
+
+bpf_map_update_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_update_elem(int map_fd, const void *key, const void *value, __u64 flags)
+
+Socket-local storage for map ``map_fd`` can be added or updated locally to a
+socket using the ``bpf_map_update_elem()`` libbpf function. The socket is
+identified by a `socket` ``fd`` stored in the pointer ``key``. The pointer
+``value`` holds the data to be added or updated for the socket ``fd``. The type
+and size of ``value`` should be the same as the value type of the map
+definition.
+
+The ``flags`` parameter can be used to control the update behaviour:
+
+- ``BPF_ANY`` will create storage for `socket` ``fd`` or update existing storage.
+- ``BPF_NOEXIST`` will create storage for `socket` ``fd`` only if it did not
+ already exist, otherwise the call will fail with ``-EEXIST``.
+- ``BPF_EXIST`` will update existing storage for `socket` ``fd`` if it already
+ exists, otherwise the call will fail with ``-ENOENT``.
+
+Returns ``0`` on success, or negative error in case of failure.
+
+bpf_map_lookup_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_lookup_elem(int map_fd, const void *key, void *value)
+
+Socket-local storage for map ``map_fd`` can be retrieved from a socket using
+the ``bpf_map_lookup_elem()`` libbpf function. The storage is retrieved from
+the socket identified by a `socket` ``fd`` stored in the pointer
+``key``. Returns ``0`` on success, or negative error in case of failure.
+
+bpf_map_delete_elem()
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: c
+
+ int bpf_map_delete_elem(int map_fd, const void *key)
+
+Socket-local storage for map ``map_fd`` can be deleted from a socket using the
+``bpf_map_delete_elem()`` libbpf function. The storage is deleted from the
+socket identified by a `socket` ``fd`` stored in the pointer ``key``. Returns
+``0`` on success, or negative error in case of failure.
+
+Examples
+========
+
+Kernel BPF
+----------
+
+This snippet shows how to declare socket-local storage in a BPF program:
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct my_storage);
+ } socket_storage SEC(".maps");
+
+This snippet shows how to retrieve socket-local storage in a BPF program:
+
+.. code-block:: c
+
+ SEC("sockops")
+ int _sockops(struct bpf_sock_ops *ctx)
+ {
+ struct my_storage *storage;
+ struct bpf_sock *sk;
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ storage = bpf_sk_storage_get(&socket_storage, sk, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 1;
+
+ /* Use 'storage' here */
+
+ return 1;
+ }
+
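+User space
+----------
+
+A user-space sketch of attaching storage to a connected socket, assuming
+``map_fd`` refers to the ``socket_storage`` map declared above and
+``sock_fd`` is an open, connected socket:
+
+.. code-block:: c
+
+ #include <bpf/bpf.h>
+
+ int set_storage(int map_fd, int sock_fd)
+ {
+ struct my_storage init = {};
+
+ /* Create storage for this socket; fails with -EEXIST if present */
+ return bpf_map_update_elem(map_fd, &sock_fd, &init, BPF_NOEXIST);
+ }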
+
+Please see the ``tools/testing/selftests/bpf`` directory for functional
+examples.
+
+References
+==========
+
+https://lwn.net/ml/netdev/20190426171103.61892-1-kafai@fb.com/
diff --git a/Documentation/bpf/map_sockmap.rst b/Documentation/bpf/map_sockmap.rst
new file mode 100644
index 000000000000..cc92047c6630
--- /dev/null
+++ b/Documentation/bpf/map_sockmap.rst
@@ -0,0 +1,498 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright Red Hat
+
+==============================================
+BPF_MAP_TYPE_SOCKMAP and BPF_MAP_TYPE_SOCKHASH
+==============================================
+
+.. note::
+ - ``BPF_MAP_TYPE_SOCKMAP`` was introduced in kernel version 4.14
+ - ``BPF_MAP_TYPE_SOCKHASH`` was introduced in kernel version 4.18
+
+``BPF_MAP_TYPE_SOCKMAP`` and ``BPF_MAP_TYPE_SOCKHASH`` maps can be used to
+redirect skbs between sockets or to apply policy at the socket level based on
+the result of a BPF (verdict) program with the help of the BPF helpers
+``bpf_sk_redirect_map()``, ``bpf_sk_redirect_hash()``,
+``bpf_msg_redirect_map()`` and ``bpf_msg_redirect_hash()``.
+
+``BPF_MAP_TYPE_SOCKMAP`` is backed by an array that uses an integer key as the
+index to look up a reference to a ``struct sock``. The map values are socket
+descriptors. Similarly, ``BPF_MAP_TYPE_SOCKHASH`` is a hash backed BPF map that
+holds references to sockets via their socket descriptors.
+
+.. note::
+ The value type is either ``__u32`` or ``__u64``; the latter (``__u64``) is
+ to support returning socket cookies to userspace. Returning the
+ ``struct sock *`` that the map holds to user-space is neither safe nor
+ useful.
+
+These maps may have BPF programs attached to them, specifically a parser program
+and a verdict program. The parser program determines how much data has been
+parsed and therefore how much data needs to be queued to come to a verdict. The
+verdict program is essentially the redirect program and can return a verdict
+of ``__SK_DROP``, ``__SK_PASS``, or ``__SK_REDIRECT``.
+
+When a socket is inserted into one of these maps, its socket callbacks are
+replaced and a ``struct sk_psock`` is attached to it. Additionally, this
+``sk_psock`` inherits the programs that are attached to the map.
+
+A sock object may be in multiple maps, but can only inherit a single
+parse or verdict program. If adding a sock object to a map would result
+in having multiple parser programs the update will return an ``EBUSY`` error.
+
+The supported programs to attach to these maps are:
+
+.. code-block:: c
+
+ struct sk_psock_progs {
+ struct bpf_prog *msg_parser;
+ struct bpf_prog *stream_parser;
+ struct bpf_prog *stream_verdict;
+ struct bpf_prog *skb_verdict;
+ };
+
+.. note::
+ Users are not allowed to attach ``stream_verdict`` and ``skb_verdict``
+ programs to the same map.
+
+The attach types for the map programs are:
+
+- ``msg_parser`` program - ``BPF_SK_MSG_VERDICT``.
+- ``stream_parser`` program - ``BPF_SK_SKB_STREAM_PARSER``.
+- ``stream_verdict`` program - ``BPF_SK_SKB_STREAM_VERDICT``.
+- ``skb_verdict`` program - ``BPF_SK_SKB_VERDICT``.
+
+There are additional helpers available to use with the parser and verdict
+programs: ``bpf_msg_apply_bytes()`` and ``bpf_msg_cork_bytes()``. With
+``bpf_msg_apply_bytes()`` BPF programs can tell the infrastructure how many
+bytes the given verdict should apply to. The helper ``bpf_msg_cork_bytes()``
+handles a different case where a BPF program cannot reach a verdict on a msg
+until it receives more bytes AND the program doesn't want to forward the packet
+until it is known to be good.
+
+Finally, the helpers ``bpf_msg_pull_data()`` and ``bpf_msg_push_data()`` are
+available to ``BPF_PROG_TYPE_SK_MSG`` BPF programs to pull in data and set the
+start and end pointers to given values or to add metadata to the ``struct
+sk_msg_buff *msg``.
+
+All these helpers will be described in more detail below.
+
+Usage
+=====
+Kernel BPF
+----------
+bpf_msg_redirect_map()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+
+This helper is used in programs implementing policies at the socket level. If
+the message ``msg`` is allowed to pass (i.e., if the verdict BPF program
+returns ``SK_PASS``), redirect it to the socket referenced by ``map`` (of type
+``BPF_MAP_TYPE_SOCKMAP``) at index ``key``. Both ingress and egress interfaces
+can be used for redirection. The ``BPF_F_INGRESS`` value in ``flags`` is used
+to select the ingress path otherwise the egress path is selected. This is the
+only flag supported for now.
+
+Returns ``SK_PASS`` on success, or ``SK_DROP`` on error.
+
+bpf_sk_redirect_map()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
+
+Redirect the packet to the socket referenced by ``map`` (of type
+``BPF_MAP_TYPE_SOCKMAP``) at index ``key``. Both ingress and egress interfaces
+can be used for redirection. The ``BPF_F_INGRESS`` value in ``flags`` is used
+to select the ingress path otherwise the egress path is selected. This is the
+only flag supported for now.
+
+Returns ``SK_PASS`` on success, or ``SK_DROP`` on error.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+Socket entries of type ``struct sock *`` can be retrieved using the
+``bpf_map_lookup_elem()`` helper.
+
+bpf_sock_map_update()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+
+Add an entry to, or update a ``map`` referencing sockets. The ``skops`` is used
+as a new value for the entry associated to ``key``. The ``flags`` argument can
+be one of the following:
+
+- ``BPF_ANY``: Create a new element or update an existing element.
+- ``BPF_NOEXIST``: Create a new element only if it did not exist.
+- ``BPF_EXIST``: Update an existing element.
+
+If the ``map`` has BPF programs (parser and verdict), those will be inherited
+by the socket being added. If the socket is already attached to BPF programs,
+this results in an error.
+
+Returns 0 on success, or a negative error in case of failure.
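+
+A sketch of inserting established sockets from a ``sockops`` program,
+reusing the ``sock_map_rx`` map declared in the Examples section:
+
+.. code-block:: c
+
+ SEC("sockops")
+ int add_established(struct bpf_sock_ops *skops)
+ {
+ __u32 key = 0;
+
+ switch (skops->op) {
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ bpf_sock_map_update(skops, &sock_map_rx, &key, BPF_ANY);
+ break;
+ }
+
+ return 0;
+ }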
+
+bpf_sock_hash_update()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+
+Add an entry to, or update a sockhash ``map`` referencing sockets. The ``skops``
+is used as a new value for the entry associated to ``key``.
+
+The ``flags`` argument can be one of the following:
+
+- ``BPF_ANY``: Create a new element or update an existing element.
+- ``BPF_NOEXIST``: Create a new element only if it did not exist.
+- ``BPF_EXIST``: Update an existing element.
+
+If the ``map`` has BPF programs (parser and verdict), those will be inherited
+by the socket being added. If the socket is already attached to BPF programs,
+this results in an error.
+
+Returns 0 on success, or a negative error in case of failure.
+
+bpf_msg_redirect_hash()
+^^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+
+This helper is used in programs implementing policies at the socket level. If
+the message ``msg`` is allowed to pass (i.e., if the verdict BPF program returns
+``SK_PASS``), redirect it to the socket referenced by ``map`` (of type
+``BPF_MAP_TYPE_SOCKHASH``) using hash ``key``. Both ingress and egress
+interfaces can be used for redirection. The ``BPF_F_INGRESS`` value in
+``flags`` is used to select the ingress path otherwise the egress path is
+selected. This is the only flag supported for now.
+
+Returns ``SK_PASS`` on success, or ``SK_DROP`` on error.
+
+bpf_sk_redirect_hash()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+
+This helper is used in programs implementing policies at the skb socket level.
+If the sk_buff ``skb`` is allowed to pass (i.e., if the verdict BPF program
+returns ``SK_PASS``), redirect it to the socket referenced by ``map`` (of type
+``BPF_MAP_TYPE_SOCKHASH``) using hash ``key``. Both ingress and egress
+interfaces can be used for redirection. The ``BPF_F_INGRESS`` value in
+``flags`` is used to select the ingress path otherwise the egress path is
+selected. This is the only flag supported for now.
+
+Returns ``SK_PASS`` on success, or ``SK_DROP`` on error.
+
+bpf_msg_apply_bytes()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+
+For socket policies, apply the verdict of the BPF program to the next
+``bytes`` bytes of message ``msg``. For example, this helper can be used in
+the following cases:
+
+- A single ``sendmsg()`` or ``sendfile()`` system call contains multiple
+ logical messages that the BPF program is supposed to read and for which it
+ should apply a verdict.
+- A BPF program only cares to read the first ``bytes`` of a ``msg``. If the
+ message has a large payload, then setting up and calling the BPF program
+ repeatedly for all bytes, even though the verdict is already known, would
+ create unnecessary overhead.
+
+Returns ``0``.
+
+bpf_msg_cork_bytes()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+
+For socket policies, prevent the execution of the verdict BPF program for
+message ``msg`` until the number of ``bytes`` have been accumulated.
+
+This can be used when one needs a specific number of bytes before a verdict can
+be assigned, even if the data spans multiple ``sendmsg()`` or ``sendfile()``
+calls.
+
+Returns ``0``.
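+
+A sketch combining the two helpers in a ``sk_msg`` verdict program; the
+64-byte threshold is illustrative:
+
+.. code-block:: c
+
+ SEC("sk_msg")
+ int msg_policy(struct sk_msg_md *msg)
+ {
+ /* Hold the message until at least 64 bytes have accumulated */
+ bpf_msg_cork_bytes(msg, 64);
+
+ /* Let the resulting verdict cover exactly those 64 bytes */
+ bpf_msg_apply_bytes(msg, 64);
+
+ return SK_PASS;
+ }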
+
+bpf_msg_pull_data()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+
+For socket policies, pull in non-linear data from user space for ``msg`` and set
+pointers ``msg->data`` and ``msg->data_end`` to ``start`` and ``end`` bytes
+offsets into ``msg``, respectively.
+
+If a program of type ``BPF_PROG_TYPE_SK_MSG`` is run on a ``msg`` it can only
+parse data that the (``data``, ``data_end``) pointers have already consumed.
+For ``sendmsg()`` hooks this is likely the first scatterlist element. But for
+calls relying on the ``sendpage`` handler (e.g., ``sendfile()``) this will be
+the range (**0**, **0**) because the data is shared with user space and by
+default the objective is to avoid allowing user space to modify data while (or
+after) BPF verdict is being decided. This helper can be used to pull in data
+and to set the start and end pointers to given values. Data will be copied if
+necessary (i.e., if data was not linear and if start and end pointers do not
+point to the same chunk).
+
+A call to this helper may change the underlying packet buffer.
+Therefore, at load time, all checks on pointers previously done by the
+verifier are invalidated and must be performed again, if the helper is
+used in combination with direct packet access.
+
+All values for ``flags`` are reserved for future usage, and must be left at
+zero.
+
+Returns 0 on success, or a negative error in case of failure.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+Look up a socket entry in the sockmap or sockhash map.
+
+Returns the socket entry associated to ``key``, or NULL if no entry was found.
+
+bpf_map_update_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+
+Add or update a socket entry in a sockmap or sockhash.
+
+The flags argument can be one of the following:
+
+- BPF_ANY: Create a new element or update an existing element.
+- BPF_NOEXIST: Create a new element only if it did not exist.
+- BPF_EXIST: Update an existing element.
+
+Returns 0 on success, or a negative error in case of failure.
+
+bpf_map_delete_elem()
+^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_map_delete_elem(struct bpf_map *map, const void *key)
+
+Delete a socket entry from a sockmap or a sockhash.
+
+Returns 0 on success, or a negative error in case of failure.
+
+User space
+----------
+bpf_map_update_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags)
+
+Sockmap entries can be added or updated using the ``bpf_map_update_elem()``
+function. The ``key`` parameter is the index value of the sockmap array, and
+the ``value`` parameter is the FD value of that socket.
+
+Under the hood, the sockmap update function uses the socket FD value to
+retrieve the associated socket and its attached psock.
+
+The flags argument can be one of the following:
+
+- BPF_ANY: Create a new element or update an existing element.
+- BPF_NOEXIST: Create a new element only if it did not exist.
+- BPF_EXIST: Update an existing element.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_lookup_elem(int fd, const void *key, void *value)
+
+Sockmap entries can be retrieved using the ``bpf_map_lookup_elem()`` function.
+
+.. note::
+ The entry returned is a socket cookie rather than a socket itself.
+
+bpf_map_delete_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_delete_elem(int fd, const void *key)
+
+Sockmap entries can be deleted using the ``bpf_map_delete_elem()``
+function.
+
+Returns 0 on success, or negative error in case of failure.
+
+Examples
+========
+
+Kernel BPF
+----------
+Several examples of the use of sockmap APIs can be found in:
+
+- `tools/testing/selftests/bpf/progs/test_sockmap_kern.h`_
+- `tools/testing/selftests/bpf/progs/sockmap_parse_prog.c`_
+- `tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c`_
+- `tools/testing/selftests/bpf/progs/test_sockmap_listen.c`_
+- `tools/testing/selftests/bpf/progs/test_sockmap_update.c`_
+
+The following code snippet shows how to declare a sockmap.
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+ } sock_map_rx SEC(".maps");
+
+The following code snippet shows a sample parser program.
+
+.. code-block:: c
+
+ SEC("sk_skb/stream_parser")
+ int bpf_prog_parser(struct __sk_buff *skb)
+ {
+ return skb->len;
+ }
+
+The following code snippet shows a simple verdict program that interacts with a
+sockmap to redirect traffic to another socket based on the local port.
+
+.. code-block:: c
+
+ SEC("sk_skb/stream_verdict")
+ int bpf_prog_verdict(struct __sk_buff *skb)
+ {
+ __u32 lport = skb->local_port;
+ __u32 idx = 0;
+
+ if (lport == 10000)
+ return bpf_sk_redirect_map(skb, &sock_map_rx, idx, 0);
+
+ return SK_PASS;
+ }
+
+The following code snippet shows how to declare a sockhash map.
+
+.. code-block:: c
+
+ struct socket_key {
+ __u32 src_ip;
+ __u32 dst_ip;
+ __u32 src_port;
+ __u32 dst_port;
+ };
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 1);
+ __type(key, struct socket_key);
+ __type(value, __u64);
+ } sock_hash_rx SEC(".maps");
+
+The following code snippet shows a simple verdict program that interacts with a
+sockhash to redirect traffic to another socket based on a hash of some of the
+skb parameters.
+
+.. code-block:: c
+
+ static inline
+ void extract_socket_key(struct __sk_buff *skb, struct socket_key *key)
+ {
+ key->src_ip = skb->remote_ip4;
+ key->dst_ip = skb->local_ip4;
+ key->src_port = skb->remote_port >> 16;
+ key->dst_port = (bpf_htonl(skb->local_port)) >> 16;
+ }
+
+ SEC("sk_skb/stream_verdict")
+ int bpf_prog_verdict(struct __sk_buff *skb)
+ {
+ struct socket_key key;
+
+ extract_socket_key(skb, &key);
+
+ return bpf_sk_redirect_hash(skb, &sock_hash_rx, &key, 0);
+ }
+
+User space
+----------
+Several examples of the use of sockmap APIs can be found in:
+
+- `tools/testing/selftests/bpf/prog_tests/sockmap_basic.c`_
+- `tools/testing/selftests/bpf/test_sockmap.c`_
+- `tools/testing/selftests/bpf/test_maps.c`_
+
+The following code sample shows how to create a sockmap, attach a parser and
+verdict program, as well as add a socket entry.
+
+.. code-block:: c
+
+ int create_sample_sockmap(int sock, int parse_prog_fd, int verdict_prog_fd)
+ {
+ int index = 0;
+ int map, err;
+
+ map = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
+ if (map < 0) {
+ fprintf(stderr, "Failed to create sockmap: %s\n", strerror(errno));
+ return -1;
+ }
+
+ err = bpf_prog_attach(parse_prog_fd, map, BPF_SK_SKB_STREAM_PARSER, 0);
+ if (err) {
+ fprintf(stderr, "Failed to attach_parser_prog_to_map: %s\n", strerror(errno));
+ goto out;
+ }
+
+ err = bpf_prog_attach(verdict_prog_fd, map, BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (err) {
+ fprintf(stderr, "Failed to attach_verdict_prog_to_map: %s\n", strerror(errno));
+ goto out;
+ }
+
+ err = bpf_map_update_elem(map, &index, &sock, BPF_NOEXIST);
+ if (err) {
+ fprintf(stderr, "Failed to update sockmap: %s\n", strerror(errno));
+ goto out;
+ }
+
+ out:
+ close(map);
+ return err;
+ }
+
+References
+===========
+
+- https://github.com/jrfastab/linux-kernel-xdp/commit/c89fd73cb9d2d7f3c716c3e00836f07b1aeb261f
+- https://lwn.net/Articles/731133/
+- http://vger.kernel.org/lpc_net2018_talks/ktls_bpf_paper.pdf
+- https://lwn.net/Articles/748628/
+- https://lore.kernel.org/bpf/20200218171023.844439-7-jakub@cloudflare.com/
+
+.. _`tools/testing/selftests/bpf/progs/test_sockmap_kern.h`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
+.. _`tools/testing/selftests/bpf/progs/sockmap_parse_prog.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
+.. _`tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c
+.. _`tools/testing/selftests/bpf/prog_tests/sockmap_basic.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+.. _`tools/testing/selftests/bpf/test_sockmap.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/test_sockmap.c
+.. _`tools/testing/selftests/bpf/test_maps.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/test_maps.c
+.. _`tools/testing/selftests/bpf/progs/test_sockmap_listen.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
+.. _`tools/testing/selftests/bpf/progs/test_sockmap_update.c`: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/selftests/bpf/progs/test_sockmap_update.c
diff --git a/Documentation/bpf/map_xskmap.rst b/Documentation/bpf/map_xskmap.rst
new file mode 100644
index 000000000000..dc143edd9233
--- /dev/null
+++ b/Documentation/bpf/map_xskmap.rst
@@ -0,0 +1,192 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+===================
+BPF_MAP_TYPE_XSKMAP
+===================
+
+.. note::
+ - ``BPF_MAP_TYPE_XSKMAP`` was introduced in kernel version 4.18
+
+The ``BPF_MAP_TYPE_XSKMAP`` is used as a backend map for the XDP BPF helper
+call ``bpf_redirect_map()`` and the ``XDP_REDIRECT`` action, like 'devmap' and 'cpumap'.
+This map type redirects raw XDP frames to `AF_XDP`_ sockets (XSKs), a new type of
+address family in the kernel that allows redirection of frames from a driver to
+user space without having to traverse the full network stack. An AF_XDP socket
+binds to a single netdev queue. A mapping of XSKs to queues is shown below:
+
+.. code-block:: none
+
+ +---------------------------------------------------+
+ | xsk A | xsk B | xsk C |<---+ User space
+ =========================================================|==========
+ | Queue 0 | Queue 1 | Queue 2 | | Kernel
+ +---------------------------------------------------+ |
+ | Netdev eth0 | |
+ +---------------------------------------------------+ |
+ | +=============+ | |
+ | | key | xsk | | |
+ | +---------+ +=============+ | |
+ | | | | 0 | xsk A | | |
+ | | | +-------------+ | |
+ | | | | 1 | xsk B | | |
+ | | BPF |-- redirect -->+-------------+-------------+
+ | | prog | | 2 | xsk C | |
+ | | | +-------------+ |
+ | | | |
+ | | | |
+ | +---------+ |
+ | |
+ +---------------------------------------------------+
+
+.. note::
+ An AF_XDP socket that is bound to a certain <netdev/queue_id> will *only*
+ accept XDP frames from that <netdev/queue_id>. If an XDP program tries to redirect
+ from a <netdev/queue_id> other than what the socket is bound to, the frame will
+ not be received on the socket.
+
+Typically an XSKMAP is created per netdev. This map contains an array of XSK File
+Descriptors (FDs). The number of array elements is typically set or adjusted using
+the ``max_entries`` map parameter. For AF_XDP ``max_entries`` is equal to the number
+of queues supported by the netdev.
+
+.. note::
+ Both the map key and map value size must be 4 bytes.
+
+Usage
+=====
+
+Kernel BPF
+----------
+bpf_redirect_map()
+^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+
+Redirect the packet to the endpoint referenced by ``map`` at index ``key``.
+For ``BPF_MAP_TYPE_XSKMAP`` this map contains references to XSK FDs
+for sockets attached to a netdev's queues.
+
+.. note::
+ If the map is empty at an index, the packet is dropped. This means that it is
+ necessary to have an XDP program loaded with at least one XSK in the
+ XSKMAP to be able to get any traffic to user space through the socket.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+
+XSK entry references of type ``struct xdp_sock *`` can be retrieved using the
+``bpf_map_lookup_elem()`` helper.
+
+User space
+----------
+.. note::
+ XSK entries can only be updated/deleted from user space and not from
+ a BPF program. Trying to call these functions from a kernel BPF program will
+ result in the program failing to load and a verifier warning.
+
+bpf_map_update_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags)
+
+XSK entries can be added or updated using the ``bpf_map_update_elem()``
+helper. The ``key`` parameter is equal to the queue_id of the queue the XSK
+is attaching to, and the ``value`` parameter is the FD value of that socket.
+
+Under the hood, the XSKMAP update function uses the XSK FD value to retrieve the
+associated ``struct xdp_sock`` instance.
+
+The flags argument can be one of the following:
+
+- BPF_ANY: Create a new element or update an existing element.
+- BPF_NOEXIST: Create a new element only if it did not exist.
+- BPF_EXIST: Update an existing element.
+
+bpf_map_lookup_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_lookup_elem(int fd, const void *key, void *value)
+
+Returns ``0`` on success, or negative error in case of failure.
+
+bpf_map_delete_elem()
+^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+ int bpf_map_delete_elem(int fd, const void *key)
+
+XSK entries can be deleted using the ``bpf_map_delete_elem()``
+helper. This helper will return 0 on success, or negative error in case of
+failure.
+
+.. note::
+ When `libxdp`_ deletes an XSK it also removes the associated socket
+ entry from the XSKMAP.
+
+Examples
+========
+Kernel
+------
+
+The following code snippet shows how to declare a ``BPF_MAP_TYPE_XSKMAP`` called
+``xsks_map`` and how to redirect packets to an XSK.
+
+.. code-block:: c
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_XSKMAP);
+ __type(key, __u32);
+ __type(value, __u32);
+ __uint(max_entries, 64);
+ } xsks_map SEC(".maps");
+
+
+ SEC("xdp")
+ int xsk_redir_prog(struct xdp_md *ctx)
+ {
+ __u32 index = ctx->rx_queue_index;
+
+ if (bpf_map_lookup_elem(&xsks_map, &index))
+ return bpf_redirect_map(&xsks_map, index, 0);
+ return XDP_PASS;
+ }
+
+User space
+----------
+
+The following code snippet shows how to update an XSKMAP with an XSK entry.
+
+.. code-block:: c
+
+ int update_xsks_map(struct bpf_map *xsks_map, int queue_id, int xsk_fd)
+ {
+ int ret;
+
+ ret = bpf_map_update_elem(bpf_map__fd(xsks_map), &queue_id, &xsk_fd, 0);
+ if (ret < 0)
+ fprintf(stderr, "Failed to update xsks_map: %s\n", strerror(errno));
+
+ return ret;
+ }
+
+For an example of how to create AF_XDP sockets, please see the AF_XDP-example and
+AF_XDP-forwarding programs in the `bpf-examples`_ directory in the `libxdp`_ repository.
+For a detailed explanation of the AF_XDP interface please see:
+
+- `libxdp-readme`_.
+- `AF_XDP`_ kernel documentation.
+
+.. note::
+ The most comprehensive resource for using XSKMAPs and AF_XDP is `libxdp`_.
+
+.. _libxdp: https://github.com/xdp-project/xdp-tools/tree/master/lib/libxdp
+.. _AF_XDP: https://www.kernel.org/doc/html/latest/networking/af_xdp.html
+.. _bpf-examples: https://github.com/xdp-project/bpf-examples
+.. _libxdp-readme: https://github.com/xdp-project/xdp-tools/tree/master/lib/libxdp#using-af_xdp-sockets
diff --git a/Documentation/bpf/maps.rst b/Documentation/bpf/maps.rst
new file mode 100644
index 000000000000..6f069f3d6f4b
--- /dev/null
+++ b/Documentation/bpf/maps.rst
@@ -0,0 +1,82 @@
+
+========
+BPF maps
+========
+
+BPF 'maps' provide generic storage of different types for sharing data between
+kernel and user space. There are several storage types available, including
+hash, array, bloom filter and radix-tree. Several of the map types exist to
+support specific BPF helpers that perform actions based on the map contents. The
+maps are accessed from BPF programs via BPF helpers which are documented in the
+`man-pages`_ for `bpf-helpers(7)`_.
+
+BPF maps are accessed from user space via the ``bpf`` syscall, which provides
+commands to create maps, lookup elements, update elements and delete elements.
+More details of the BPF syscall are available in `ebpf-syscall`_ and in the
+`man-pages`_ for `bpf(2)`_.
+
+Map Types
+=========
+
+.. toctree::
+ :maxdepth: 1
+ :glob:
+
+ map_*
+
+Usage Notes
+===========
+
+.. c:function::
+ int bpf(int command, union bpf_attr *attr, u32 size)
+
+Use the ``bpf()`` system call to perform the operation specified by
+``command``. The operation takes parameters provided in ``attr``. The ``size``
+argument is the size of the ``union bpf_attr`` in ``attr``.
+
+**BPF_MAP_CREATE**
+
+Create a map with the desired type and attributes in ``attr``:
+
+.. code-block:: c
+
+ int fd;
+ union bpf_attr attr = {
+ .map_type = BPF_MAP_TYPE_ARRAY, /* mandatory */
+ .key_size = sizeof(__u32), /* mandatory */
+ .value_size = sizeof(__u32), /* mandatory */
+ .max_entries = 256, /* mandatory */
+ .map_flags = BPF_F_MMAPABLE,
+ .map_name = "example_array",
+ };
+
+ fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+
+Returns a process-local file descriptor on success, or negative error in case of
+failure. The map can be deleted by calling ``close(fd)``. Maps held by open
+file descriptors will be deleted automatically when a process exits.
+
+.. note:: Valid characters for ``map_name`` are ``A-Z``, ``a-z``, ``0-9``,
+ ``'_'`` and ``'.'``.
+
+**BPF_MAP_LOOKUP_ELEM**
+
+Look up a key in a given map using ``attr->map_fd``, ``attr->key``,
+``attr->value``. Returns zero and stores the found elem into ``attr->value``
+on success, or negative error on failure.
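+
+For illustration, a lookup through the raw syscall might look as follows.
+Most C libraries provide no ``bpf()`` wrapper, so the sketch goes through
+``syscall(2)``; the function name is illustrative:
+
+.. code-block:: c
+
+ #include <linux/bpf.h>
+ #include <string.h>
+ #include <sys/syscall.h>
+ #include <unistd.h>
+
+ int lookup_u32(int map_fd, __u32 key, __u32 *value)
+ {
+ union bpf_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.map_fd = map_fd;
+ attr.key = (__u64)(unsigned long)&key;
+ attr.value = (__u64)(unsigned long)value;
+
+ return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+ }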
+
+**BPF_MAP_UPDATE_ELEM**
+
+Create or update key/value pair in a given map using ``attr->map_fd``, ``attr->key``,
+``attr->value``. Returns zero on success or negative error on failure.
+
+**BPF_MAP_DELETE_ELEM**
+
+Find and delete element by key in a given map using ``attr->map_fd``,
+``attr->key``. Returns zero on success or negative error on failure.
+
+.. Links:
+.. _man-pages: https://www.kernel.org/doc/man-pages/
+.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html
+.. _bpf-helpers(7): https://man7.org/linux/man-pages/man7/bpf-helpers.7.html
+.. _ebpf-syscall: https://docs.kernel.org/userspace-api/ebpf/syscall.html
diff --git a/Documentation/bpf/other.rst b/Documentation/bpf/other.rst
new file mode 100644
index 000000000000..7e6b12018802
--- /dev/null
+++ b/Documentation/bpf/other.rst
@@ -0,0 +1,10 @@
+=====
+Other
+=====
+
+.. toctree::
+ :maxdepth: 1
+
+ ringbuf
+ llvm_reloc
+ graph_ds_impl
diff --git a/Documentation/bpf/prog_lsm.rst b/Documentation/bpf/prog_lsm.rst
new file mode 100644
index 000000000000..ad2be02f30c2
--- /dev/null
+++ b/Documentation/bpf/prog_lsm.rst
@@ -0,0 +1,143 @@
+.. SPDX-License-Identifier: GPL-2.0+
+.. Copyright (C) 2020 Google LLC.
+
+================
+LSM BPF Programs
+================
+
+These BPF programs allow runtime instrumentation of the LSM hooks by privileged
+users to implement system-wide MAC (Mandatory Access Control) and Audit
+policies using eBPF.
+
+Structure
+---------
+
+The example shows an eBPF program that can be attached to the ``file_mprotect``
+LSM hook:
+
+.. c:function:: int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot);
+
+Other LSM hooks which can be instrumented can be found in
+``security/security.c``.
+
+eBPF programs that use Documentation/bpf/btf.rst do not need to include kernel
+headers for accessing information from the attached eBPF program's context.
+They can simply declare the structures in the eBPF program and only specify
+the fields that need to be accessed.
+
+.. code-block:: c
+
+ struct mm_struct {
+ unsigned long start_brk, brk, start_stack;
+ } __attribute__((preserve_access_index));
+
+ struct vm_area_struct {
+ unsigned long start_brk, brk, start_stack;
+ unsigned long vm_start, vm_end;
+ struct mm_struct *vm_mm;
+ } __attribute__((preserve_access_index));
+
+
+.. note:: The order of the fields is irrelevant.
+
+This can be further simplified (if one has access to the BTF information at
+build time) by generating the ``vmlinux.h`` with:
+
+.. code-block:: console
+
+ # bpftool btf dump file <path-to-btf-vmlinux> format c > vmlinux.h
+
+.. note:: ``path-to-btf-vmlinux`` can be ``/sys/kernel/btf/vmlinux`` if the
+ build environment matches the environment the BPF programs are
+ deployed in.
+
+The ``vmlinux.h`` can then simply be included in the BPF programs without
+requiring the definition of the types.
+
+The eBPF programs can be declared using the ``BPF_PROG``
+macro defined in `tools/lib/bpf/bpf_tracing.h`_. In this
+example:
+
+ * ``"lsm/file_mprotect"`` indicates the LSM hook that the program must
+ be attached to
+ * ``mprotect_audit`` is the name of the eBPF program
+
+.. code-block:: c
+
+ SEC("lsm/file_mprotect")
+ int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
+ unsigned long reqprot, unsigned long prot, int ret)
+ {
+ /* ret is the return value from the previous BPF program
+ * or 0 if it's the first hook.
+ */
+ if (ret != 0)
+ return ret;
+
+ int is_heap;
+
+ is_heap = (vma->vm_start >= vma->vm_mm->start_brk &&
+ vma->vm_end <= vma->vm_mm->brk);
+
+ /* Return an -EPERM or write information to the perf events buffer
+ * for auditing
+ */
+ if (is_heap)
+ return -EPERM;
+
+ return 0;
+ }
+
+The ``__attribute__((preserve_access_index))`` is a clang feature that allows
+the BPF verifier to update the offsets for the access at runtime using the
+Documentation/bpf/btf.rst information. Since the BPF verifier is aware of the
+types, it also validates all the accesses made to the various types in the
+eBPF program.
+
+Loading
+-------
+
+eBPF programs can be loaded with the :manpage:`bpf(2)` syscall's
+``BPF_PROG_LOAD`` operation:
+
+.. code-block:: c
+
+ struct bpf_object *obj;
+
+ obj = bpf_object__open("./my_prog.o");
+ bpf_object__load(obj);
+
+This can be simplified by using a skeleton header generated by ``bpftool``:
+
+.. code-block:: console
+
+ # bpftool gen skeleton my_prog.o > my_prog.skel.h
+
+and the program can be loaded by including ``my_prog.skel.h`` and using
+the generated helper, ``my_prog__open_and_load``.
+
+Attachment to LSM Hooks
+-----------------------
+
+The LSM allows attachment of eBPF programs as LSM hooks using :manpage:`bpf(2)`
+syscall's ``BPF_RAW_TRACEPOINT_OPEN`` operation or more simply by
+using the libbpf helper ``bpf_program__attach_lsm``.
+
+The program can be detached from the LSM hook by *destroying* the ``link``
+returned by ``bpf_program__attach_lsm`` using ``bpf_link__destroy``.
+
+One can also use the helpers generated in ``my_prog.skel.h`` i.e.
+``my_prog__attach`` for attachment and ``my_prog__destroy`` for cleaning up.
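+
+Putting these together, a minimal loader sketch using the generated
+skeleton could look like this (the ``my_prog`` names are those produced
+by ``bpftool gen skeleton`` above):
+
+.. code-block:: c
+
+ #include "my_prog.skel.h"
+
+ int main(void)
+ {
+ struct my_prog *skel;
+ int err;
+
+ skel = my_prog__open_and_load();
+ if (!skel)
+ return 1;
+
+ err = my_prog__attach(skel);
+ if (!err) {
+ /* The LSM program is active; run the workload here */
+ }
+
+ my_prog__destroy(skel);
+ return err != 0;
+ }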
+
+Examples
+--------
+
+An example eBPF program can be found in
+`tools/testing/selftests/bpf/progs/lsm.c`_ and the corresponding
+userspace code in `tools/testing/selftests/bpf/prog_tests/test_lsm.c`_
+
+.. Links
+.. _tools/lib/bpf/bpf_tracing.h:
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h
+.. _tools/testing/selftests/bpf/progs/lsm.c:
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/lsm.c
+.. _tools/testing/selftests/bpf/prog_tests/test_lsm.c:
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/test_lsm.c
diff --git a/Documentation/bpf/programs.rst b/Documentation/bpf/programs.rst
new file mode 100644
index 000000000000..c99000ab6d9b
--- /dev/null
+++ b/Documentation/bpf/programs.rst
@@ -0,0 +1,12 @@
+=============
+Program Types
+=============
+
+.. toctree::
+ :maxdepth: 1
+ :glob:
+
+ prog_*
+
+For a list of all program types, see :ref:`program_types_and_elf` in
+the :ref:`libbpf` documentation.
diff --git a/Documentation/bpf/redirect.rst b/Documentation/bpf/redirect.rst
new file mode 100644
index 000000000000..2fa2b0b05004
--- /dev/null
+++ b/Documentation/bpf/redirect.rst
@@ -0,0 +1,81 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+.. Copyright (C) 2022 Red Hat, Inc.
+
+========
+Redirect
+========
+XDP_REDIRECT
+############
+Supported maps
+--------------
+
+XDP_REDIRECT works with the following map types:
+
+- ``BPF_MAP_TYPE_DEVMAP``
+- ``BPF_MAP_TYPE_DEVMAP_HASH``
+- ``BPF_MAP_TYPE_CPUMAP``
+- ``BPF_MAP_TYPE_XSKMAP``
+
+For more information on these maps, please see the specific map documentation.
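+
+As a sketch, an XDP program triggers ``XDP_REDIRECT`` through one of these
+maps with the ``bpf_redirect_map()`` helper. The devmap name is
+illustrative, and passing ``XDP_PASS`` in the low bits of ``flags`` as the
+fallback action assumes kernel 5.12 or later (use ``0`` otherwise):
+
+.. code-block:: c
+
+ #include <linux/bpf.h>
+ #include <bpf/bpf_helpers.h>
+
+ struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(max_entries, 8);
+ __type(key, __u32);
+ __type(value, __u32);
+ } tx_ports SEC(".maps");
+
+ SEC("xdp")
+ int xdp_redirect_prog(struct xdp_md *ctx)
+ {
+ __u32 key = 0;
+
+ /* Redirect via tx_ports[0]; fall back to XDP_PASS on failure */
+ return bpf_redirect_map(&tx_ports, key, XDP_PASS);
+ }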
+
+Process
+-------
+
+.. kernel-doc:: net/core/filter.c
+ :doc: xdp redirect
+
+.. note::
+ Not all drivers support transmitting frames after a redirect, and for
+ those that do, not all of them support non-linear frames. Non-linear xdp
+ bufs/frames are bufs/frames that contain more than one fragment.
+
+Debugging packet drops
+----------------------
+Silent packet drops for XDP_REDIRECT can be debugged using:
+
+- bpftrace
+- perf record
+
+bpftrace
+^^^^^^^^
+The following bpftrace command can be used to capture and count all XDP tracepoints:
+
+.. code-block:: none
+
+ sudo bpftrace -e 'tracepoint:xdp:* { @cnt[probe] = count(); }'
+ Attaching 12 probes...
+ ^C
+
+ @cnt[tracepoint:xdp:mem_connect]: 18
+ @cnt[tracepoint:xdp:mem_disconnect]: 18
+ @cnt[tracepoint:xdp:xdp_exception]: 19605
+ @cnt[tracepoint:xdp:xdp_devmap_xmit]: 1393604
+ @cnt[tracepoint:xdp:xdp_redirect]: 22292200
+
+.. note::
+ The various xdp tracepoints can be found in ``source/include/trace/events/xdp.h``
+
+The following bpftrace command can be used to extract the ``ERRNO`` being
+returned as part of the ``err`` parameter:
+
+.. code-block:: none
+
+ sudo bpftrace -e \
+ 'tracepoint:xdp:xdp_redirect*_err {@redir_errno[-args->err] = count();}
+ tracepoint:xdp:xdp_devmap_xmit {@devmap_errno[-args->err] = count();}'
+
+perf record
+^^^^^^^^^^^
+The perf tool also supports recording tracepoints:
+
+.. code-block:: none
+
+ perf record -a -e xdp:xdp_redirect_err \
+ -e xdp:xdp_redirect_map_err \
+ -e xdp:xdp_exception \
+ -e xdp:xdp_devmap_xmit
+
+References
+===========
+
+- https://github.com/xdp-project/xdp-tutorial/tree/master/tracing02-xdp-monitor
diff --git a/Documentation/bpf/ringbuf.rst b/Documentation/bpf/ringbuf.rst
index 6a615cd62bda..a99cd05d79d4 100644
--- a/Documentation/bpf/ringbuf.rst
+++ b/Documentation/bpf/ringbuf.rst
@@ -124,7 +124,7 @@ buffer. Currently 4 are supported:
- ``BPF_RB_AVAIL_DATA`` returns amount of unconsumed data in ring buffer;
- ``BPF_RB_RING_SIZE`` returns the size of ring buffer;
-- ``BPF_RB_CONS_POS``/``BPF_RB_PROD_POS`` returns current logical possition
+- ``BPF_RB_CONS_POS``/``BPF_RB_PROD_POS`` returns current logical position
of consumer/producer, respectively.
Returned values are momentarily snapshots of ring buffer state and could be
@@ -146,7 +146,7 @@ Design and Implementation
This reserve/commit schema allows a natural way for multiple producers, either
on different CPUs or even on the same CPU/in the same BPF program, to reserve
independent records and work with them without blocking other producers. This
-means that if BPF program was interruped by another BPF program sharing the
+means that if BPF program was interrupted by another BPF program sharing the
same ring buffer, they will both get a record reserved (provided there is
enough space left) and can work with it and submit it independently. This
applies to NMI context as well, except that due to using a spinlock during
diff --git a/Documentation/bpf/syscall_api.rst b/Documentation/bpf/syscall_api.rst
new file mode 100644
index 000000000000..f0a1dff087ad
--- /dev/null
+++ b/Documentation/bpf/syscall_api.rst
@@ -0,0 +1,11 @@
+===========
+Syscall API
+===========
+
+The primary info for the bpf syscall is available in the `man-pages`_
+for `bpf(2)`_. For more information about the userspace API, see
+Documentation/userspace-api/ebpf/index.rst.
+
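+For illustration, a minimal sketch of invoking the syscall directly (there is
+no glibc wrapper, so ``syscall(2)`` is used; a one-element array map is
+created):
+
+.. code-block:: c
+
+   #include <string.h>
+   #include <unistd.h>
+   #include <sys/syscall.h>
+   #include <linux/bpf.h>
+
+   int create_array_map(void)
+   {
+           union bpf_attr attr;
+
+           memset(&attr, 0, sizeof(attr));
+           attr.map_type = BPF_MAP_TYPE_ARRAY;
+           attr.key_size = sizeof(__u32);
+           attr.value_size = sizeof(__u64);
+           attr.max_entries = 1;
+
+           /* returns a new map fd on success, -1 with errno on failure */
+           return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+   }
+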
+.. Links:
+.. _man-pages: https://www.kernel.org/doc/man-pages/
+.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html \ No newline at end of file
diff --git a/Documentation/bpf/test_debug.rst b/Documentation/bpf/test_debug.rst
new file mode 100644
index 000000000000..ebf0caceb6a6
--- /dev/null
+++ b/Documentation/bpf/test_debug.rst
@@ -0,0 +1,9 @@
+=========================
+Testing and debugging BPF
+=========================
+
+.. toctree::
+ :maxdepth: 1
+
+ drgn
+ s390
diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst
new file mode 100644
index 000000000000..f0ec19db301c
--- /dev/null
+++ b/Documentation/bpf/verifier.rst
@@ -0,0 +1,824 @@
+
+=============
+eBPF verifier
+=============
+
+The safety of the eBPF program is determined in two steps.
+
+The first step does a DAG check to disallow loops, along with other CFG
+validation. In particular, it will detect programs that have unreachable
+instructions (though the classic BPF checker allows them).
+
+The second step starts from the first insn and descends all possible paths.
+It simulates execution of every insn and observes the state change of
+registers and stack.
+
+At the start of the program the register R1 contains a pointer to context
+and has type PTR_TO_CTX.
+If the verifier sees an insn that does R2=R1, then R2 now has type
+PTR_TO_CTX as well and can be used on the right-hand side of an expression.
+If R1=PTR_TO_CTX and the insn is R2=R1+R1, then R2=SCALAR_VALUE,
+since the addition of two valid pointers makes an invalid pointer.
+(In 'secure' mode the verifier will reject any type of pointer arithmetic
+to make sure that kernel addresses don't leak to unprivileged users.)
+
+If a register was never written to, it is not readable::
+
+ bpf_mov R0 = R2
+ bpf_exit
+
+will be rejected, since R2 is unreadable at the start of the program.
+
+After a kernel function call, R1-R5 are reset to unreadable and
+R0 has the return type of the function.
+
+Since R6-R9 are callee saved, their state is preserved across the call.
+
+::
+
+ bpf_mov R6 = 1
+ bpf_call foo
+ bpf_mov R0 = R6
+ bpf_exit
+
+is a correct program. If there was R1 instead of R6, it would have
+been rejected.
+
+Load/store instructions are allowed only with registers of valid types, which
+are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds- and alignment-checked.
+For example::
+
+ bpf_mov R1 = 1
+ bpf_mov R2 = 2
+ bpf_xadd *(u32 *)(R1 + 3) += R2
+ bpf_exit
+
+will be rejected, since R1 doesn't have a valid pointer type at the time of
+execution of instruction bpf_xadd.
+
+At the start R1's type is PTR_TO_CTX (a pointer to a generic
+``struct bpf_context``). A callback is used to customize the verifier to
+restrict eBPF program access to only certain fields within the ctx structure,
+with specified size and alignment.
+
+For example, the following insn::
+
+ bpf_ld R0 = *(u32 *)(R6 + 8)
+
+intends to load a word from address R6 + 8 and store it into R0.
+If R6=PTR_TO_CTX, then via the is_valid_access() callback the verifier will
+know that offset 8 of size 4 bytes can be accessed for reading; otherwise
+the verifier will reject the program.
+If R6=PTR_TO_STACK, then the access should be aligned and within
+stack bounds, which are [-MAX_BPF_STACK, 0). In this example the offset is 8,
+so it will fail verification, since it's out of bounds.
+
+The verifier will allow an eBPF program to read data from the stack only
+after it has written into it.
+
+The classic BPF verifier does a similar check with the M[0-15] memory slots.
+For example::
+
+ bpf_ld R0 = *(u32 *)(R10 - 4)
+ bpf_exit
+
+is an invalid program.
+Though R10 is a correct read-only register with type PTR_TO_STACK,
+and R10 - 4 is within stack bounds, there were no stores into that location.
+
+Pointer register spill/fill is tracked as well, since four (R6-R9)
+callee saved registers may not be enough for some programs.
+
+Allowed function calls are customized with bpf_verifier_ops->get_func_proto().
+The eBPF verifier will check that registers match argument constraints.
+After the call, register R0 will be set to the return type of the function.
+
+Function calls are the main mechanism for extending the functionality of eBPF
+programs. Socket filters may let programs call one set of functions, whereas
+tracing filters may allow a completely different set.
+
+If a function is made accessible to eBPF programs, it needs to be thought
+through from a safety point of view. The verifier will guarantee that the
+function is called with valid arguments.
+
+Seccomp and socket filters have different security restrictions for classic
+BPF. Seccomp solves this with a two-stage verifier: the classic BPF verifier
+is followed by the seccomp verifier. In the case of eBPF, one configurable
+verifier is shared for all use cases.
+
+See kernel/bpf/verifier.c for the details of the eBPF verifier.
+
+Register value tracking
+=======================
+
+In order to determine the safety of an eBPF program, the verifier must track
+the range of possible values in each register and also in each stack slot.
+This is done with ``struct bpf_reg_state``, defined in
+include/linux/bpf_verifier.h, which unifies tracking of scalar and pointer
+values. Each
+register state has a type, which is either NOT_INIT (the register has not been
+written to), SCALAR_VALUE (some value which is not usable as a pointer), or a
+pointer type. The types of pointers describe their base, as follows:
+
+ PTR_TO_CTX
+ Pointer to bpf_context.
+ CONST_PTR_TO_MAP
+ Pointer to struct bpf_map. "Const" because arithmetic
+ on these pointers is forbidden.
+ PTR_TO_MAP_VALUE
+ Pointer to the value stored in a map element.
+ PTR_TO_MAP_VALUE_OR_NULL
+ Either a pointer to a map value, or NULL; map accesses
+ (see maps.rst) return this type, which becomes a
+ PTR_TO_MAP_VALUE when checked != NULL. Arithmetic on
+ these pointers is forbidden.
+ PTR_TO_STACK
+ Frame pointer.
+ PTR_TO_PACKET
+ skb->data.
+ PTR_TO_PACKET_END
+ skb->data + headlen; arithmetic forbidden.
+ PTR_TO_SOCKET
+ Pointer to struct bpf_sock_ops, implicitly refcounted.
+ PTR_TO_SOCKET_OR_NULL
+ Either a pointer to a socket, or NULL; socket lookup
+ returns this type, which becomes a PTR_TO_SOCKET when
+ checked != NULL. PTR_TO_SOCKET is reference-counted,
+ so programs must release the reference through the
+ socket release function before the end of the program.
+ Arithmetic on these pointers is forbidden.
+
+However, a pointer may be offset from this base (as a result of pointer
+arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable
+offset'. The former is used when an exactly-known value (e.g. an immediate
+operand) is added to a pointer, while the latter is used for values which are
+not exactly known. The variable offset is also used in SCALAR_VALUEs, to track
+the range of possible values in the register.
+
+The verifier's knowledge about the variable offset consists of:
+
+* minimum and maximum values as unsigned
+* minimum and maximum values as signed
+
+* knowledge of the values of individual bits, in the form of a 'tnum': a u64
+ 'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown;
+ 1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both
+ mask and value; no bit should ever be 1 in both. For example, if a byte is read
+ into a register from memory, the register's top 56 bits are known zero, while
+ the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we
+ then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0;
+ 0x1ff), because of potential carries.
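+
+  For illustration, the same sequence written as ``(value; mask)`` pairs::
+
+      r = load_byte()      (0x0;  0xff)    ; low 8 bits unknown
+      r |= 0x40            (0x40; 0xbf)    ; bit 6 becomes known 1
+      r += 1               (0x0;  0x1ff)   ; a carry may ripple into bit 8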
+
+Besides arithmetic, the register state can also be updated by conditional
+branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch
+it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false'
+branch it will have a umax_value of 8. A signed compare (with BPF_JSGT or
+BPF_JSGE) would instead update the signed minimum/maximum values. Information
+from the signed and unsigned bounds can be combined; for instance if a value is
+first tested < 8 and then tested s> 4, the verifier will conclude that the value
+is also > 4 and s< 8, since the bounds prevent crossing the sign boundary.
+
+PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all
+pointers sharing that same variable offset. This is important for packet range
+checks: after adding a variable to a packet pointer register A, if you then copy
+it to another register B and then add a constant 4 to A, both registers will
+share the same 'id' but A will have a fixed offset of +4. Then if A is
+bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is
+now known to have a safe range of at least 4 bytes. See 'Direct packet access',
+below, for more on PTR_TO_PACKET ranges.
+
+The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of
+the pointer returned from a map lookup. This means that when one copy is
+checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs.
+
+As well as range-checking, the tracked information is also used for enforcing
+alignment of pointer accesses. For instance, on most systems the packet pointer
+is 2 bytes after a 4-byte alignment. If a program adds 14 bytes to that to jump
+over the Ethernet header, then reads IHL and adds (IHL * 4), the resulting
+pointer will have a variable offset known to be 4n+2 for some n, so adding the 2
+bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through
+that pointer are safe.
+
+The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common
+to all copies of the pointer returned from a socket lookup. This has similar
+behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but
+it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly
+represents a reference to the corresponding ``struct sock``. To ensure that the
+reference is not leaked, it is imperative to NULL-check the reference and, in
+the non-NULL case, pass the valid reference to the socket release function.
+
+Direct packet access
+====================
+
+In cls_bpf and act_bpf programs the verifier allows direct access to the packet
+data via skb->data and skb->data_end pointers.
+Ex::
+
+ 1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */
+ 2: r3 = *(u32 *)(r1 +76) /* load skb->data */
+ 3: r5 = r3
+ 4: r5 += 14
+ 5: if r5 > r4 goto pc+16
+ R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp
+ 6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */
+
+this 2-byte load from the packet is safe to do, since the program author
+did check ``if (skb->data + 14 > skb->data_end) goto err`` at insn #5 which
+means that in the fall-through case the register R3 (which points to skb->data)
+has at least 14 directly accessible bytes. The verifier marks it
+as R3=pkt(id=0,off=0,r=14).
+id=0 means that no additional variables were added to the register.
+off=0 means that no additional constants were added.
+r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok.
+Note that R5 is marked as R5=pkt(id=0,off=14,r=14). It also points
+to the packet data, but constant 14 was added to the register, so
+it now points to ``skb->data + 14`` and accessible range is [R5, R5 + 14 - 14)
+which is zero bytes.
+
+More complex packet access may look like::
+
+ R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp
+ 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */
+ 7: r4 = *(u8 *)(r3 +12)
+ 8: r4 *= 14
+ 9: r3 = *(u32 *)(r1 +76) /* load skb->data */
+ 10: r3 += r4
+ 11: r2 = r1
+ 12: r2 <<= 48
+ 13: r2 >>= 48
+ 14: r3 += r2
+ 15: r2 = r3
+ 16: r2 += 8
+ 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */
+ 18: if r2 > r1 goto pc+2
+ R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp
+ 19: r1 = *(u8 *)(r3 +4)
+
+The state of the register R3 is R3=pkt(id=2,off=0,r=8).
+id=2 means that two ``r3 += rX`` instructions were seen, so r3 points to some
+offset within a packet and since the program author did
+``if (r3 + 8 > r1) goto err`` at insn #18, the safe range is [R3, R3 + 8).
+The verifier only allows 'add'/'sub' operations on packet registers. Any other
+operation will set the register state to 'SCALAR_VALUE' and it won't be
+available for direct packet access.
+
+The operation ``r3 += rX`` may overflow and become less than the original
+skb->data, therefore the verifier has to prevent that. So when it sees an
+``r3 += rX`` instruction where rX is more than a 16-bit value, any subsequent
+bounds-check of r3 against skb->data_end will not give us 'range' information,
+so attempts to read through the pointer will give an "invalid access to
+packet" error.
+
+Ex. after insn ``r4 = *(u8 *)(r3 +12)`` (insn #7 above) the state of r4 is
+R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits
+of the register are guaranteed to be zero, and nothing is known about the lower
+8 bits. After insn ``r4 *= 14`` the state becomes
+R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit
+value by the constant 14 will keep the upper 52 bits zero, and the least
+significant bit will be zero as 14 is even. Similarly ``r2 >>= 48`` will make
+R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign
+extending. This logic is implemented in adjust_reg_min_max_vals() function,
+which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice
+versa) and adjust_scalar_min_max_vals() for operations on two scalars.
+
+The end result is that a BPF program author can access the packet directly
+using normal C code as::
+
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+ struct eth_hdr *eth = data;
+ struct iphdr *iph = data + sizeof(*eth);
+ struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph);
+
+ if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end)
+ return 0;
+ if (eth->h_proto != htons(ETH_P_IP))
+ return 0;
+ if (iph->protocol != IPPROTO_UDP || iph->ihl != 5)
+ return 0;
+ if (udp->dest == 53 || udp->source == 9)
+ ...;
+
+which makes such programs easier to write compared to the LD_ABS insn,
+and significantly faster.
+
+Pruning
+=======
+
+The verifier does not actually walk all possible paths through the program. For
+each new branch to analyse, the verifier looks at all the states it's previously
+been in when at this instruction. If any of them contain the current state as a
+subset, the branch is 'pruned' - that is, the fact that the previous state was
+accepted implies the current state would be as well. For instance, if in the
+previous state, r1 held a packet-pointer, and in the current state, r1 holds a
+packet-pointer with a range as long or longer and at least as strict an
+alignment, then r1 is safe. Similarly, if r2 was NOT_INIT before then it can't
+have been used by any path from that point, so any value in r2 (including
+another NOT_INIT) is safe. The implementation is in the function regsafe().
+Pruning considers not only the registers but also the stack (and any spilled
+registers it may hold). They must all be safe for the branch to be pruned.
+This is implemented in states_equal().
+
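+In pseudocode, the pruning decision made for each new branch could be
+sketched roughly as follows (function names as above)::
+
+  is_state_visited(env, insn_idx):
+      for each cached state sl at insn_idx:
+          if states_equal(env, sl->state, cur_state):
+              # the cached state subsumes the current one,
+              # so this path needs no further exploration
+              return prune
+      add cur_state to the cache at insn_idx
+      return continue
+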
+Some technical details about the state pruning implementation can be found below.
+
+Register liveness tracking
+--------------------------
+
+In order to make state pruning effective, liveness state is tracked for each
+register and stack slot. The basic idea is to track which registers and stack
+slots are actually used during subsequent execution of the program, until
+program exit is reached. Registers and stack slots that were never used can be
+removed from the cached state, thus making more states equivalent to a cached
+state. This can be illustrated by the following program::
+
+ 0: call bpf_get_prandom_u32()
+ 1: r1 = 0
+ 2: if r0 == 0 goto +1
+ 3: r0 = 1
+ --- checkpoint ---
+ 4: r0 = r1
+ 5: exit
+
+Suppose that a state cache entry is created at instruction #4 (such entries are
+also called "checkpoints" in the text below). The verifier could reach the
+instruction with one of two possible register states:
+
+* r0 = 1, r1 = 0
+* r0 = 0, r1 = 0
+
+However, only the value of register ``r1`` is important to successfully finish
+verification. The goal of the liveness tracking algorithm is to spot this fact
+and figure out that both states are actually equivalent.
+
+Data structures
+~~~~~~~~~~~~~~~
+
+Liveness is tracked using the following data structures::
+
+ enum bpf_reg_liveness {
+ REG_LIVE_NONE = 0,
+ REG_LIVE_READ32 = 0x1,
+ REG_LIVE_READ64 = 0x2,
+ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
+ REG_LIVE_WRITTEN = 0x4,
+ REG_LIVE_DONE = 0x8,
+ };
+
+ struct bpf_reg_state {
+ ...
+ struct bpf_reg_state *parent;
+ ...
+ enum bpf_reg_liveness live;
+ ...
+ };
+
+ struct bpf_stack_state {
+ struct bpf_reg_state spilled_ptr;
+ ...
+ };
+
+ struct bpf_func_state {
+ struct bpf_reg_state regs[MAX_BPF_REG];
+ ...
+ struct bpf_stack_state *stack;
+ }
+
+ struct bpf_verifier_state {
+ struct bpf_func_state *frame[MAX_CALL_FRAMES];
+ struct bpf_verifier_state *parent;
+ ...
+ }
+
+* ``REG_LIVE_NONE`` is an initial value assigned to ``->live`` fields upon new
+ verifier state creation;
+
+* ``REG_LIVE_WRITTEN`` means that the value of the register (or stack slot) is
+ defined by some instruction verified between this verifier state's parent and
+ verifier state itself;
+
+* ``REG_LIVE_READ{32,64}`` means that the value of the register (or stack slot)
+ is read by some child state of this verifier state;
+
+* ``REG_LIVE_DONE`` is a marker used by ``clean_verifier_state()`` to avoid
+ processing the same verifier state multiple times and for some sanity checks;
+
+* ``->live`` field values are formed by combining ``enum bpf_reg_liveness``
+ values using bitwise or.
+
+Register parentage chains
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to propagate information between parent and child states, a *register
+parentage chain* is established. Each register or stack slot is linked to a
+corresponding register or stack slot in its parent state via a ``->parent``
+pointer. This link is established upon state creation in ``is_state_visited()``
+and might be modified by ``set_callee_state()`` called from
+``__check_func_call()``.
+
+The rules for correspondence between registers / stack slots are as follows:
+
+* For the current stack frame, registers and stack slots of the new state are
+ linked to the registers and stack slots of the parent state with the same
+ indices.
+
+* For the outer stack frames, only callee saved registers (r6-r9) and stack
+ slots are linked to the registers and stack slots of the parent state with the
+ same indices.
+
+* When a function call is processed, a new ``struct bpf_func_state`` instance is
+ allocated; it encapsulates a new set of registers and stack slots. For this
+ new frame, parent links for r6-r9 and stack slots are set to nil, while parent
+ links for r1-r5 are set to match the caller's r1-r5 parent links.
+
+This could be illustrated by the following diagram (arrows stand for
+``->parent`` pointers)::
+
+ ... ; Frame #0, some instructions
+ --- checkpoint #0 ---
+ 1 : r6 = 42 ; Frame #0
+ --- checkpoint #1 ---
+ 2 : call foo() ; Frame #0
+ ... ; Frame #1, instructions from foo()
+ --- checkpoint #2 ---
+ ... ; Frame #1, instructions from foo()
+ --- checkpoint #3 ---
+ exit ; Frame #1, return from foo()
+ 3 : r1 = r6 ; Frame #0 <- current state
+
+ +-------------------------------+-------------------------------+
+ | Frame #0 | Frame #1 |
+ Checkpoint +-------------------------------+-------------------------------+
+ #0 | r0 | r1-r5 | r6-r9 | fp-8 ... |
+ +-------------------------------+
+ ^ ^ ^ ^
+ | | | |
+ Checkpoint +-------------------------------+
+ #1 | r0 | r1-r5 | r6-r9 | fp-8 ... |
+ +-------------------------------+
+ ^ ^ ^
+ |_______|_______|_______________
+ | | |
+ nil nil | | | nil nil
+ | | | | | | |
+ Checkpoint +-------------------------------+-------------------------------+
+ #2 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... |
+ +-------------------------------+-------------------------------+
+ ^ ^ ^ ^ ^
+ nil nil | | | | |
+ | | | | | | |
+ Checkpoint +-------------------------------+-------------------------------+
+ #3 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... |
+ +-------------------------------+-------------------------------+
+ ^ ^
+ nil nil | |
+ | | | |
+ Current +-------------------------------+
+ state | r0 | r1-r5 | r6-r9 | fp-8 ... |
+ +-------------------------------+
+ \
+ r6 read mark is propagated via these links
+ all the way up to checkpoint #1.
+ The checkpoint #1 contains a write mark for r6
+ because of instruction (1), thus read propagation
+ does not reach checkpoint #0 (see section below).
+
+Liveness marks tracking
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For each processed instruction, the verifier tracks read and written registers
+and stack slots. The main idea of the algorithm is that read marks propagate
+back along the state parentage chain until they hit a write mark, which 'screens
+off' earlier states from the read. The information about reads is propagated by
+function ``mark_reg_read()`` which could be summarized as follows::
+
+ mark_reg_read(struct bpf_reg_state *state, ...):
+ parent = state->parent
+ while parent:
+ if state->live & REG_LIVE_WRITTEN:
+ break
+ if parent->live & REG_LIVE_READ64:
+ break
+ parent->live |= REG_LIVE_READ64
+ state = parent
+ parent = state->parent
+
+Notes:
+
+* The read marks are applied to the **parent** state while write marks are
+ applied to the **current** state. The write mark on a register or stack slot
+ means that it is updated by some instruction in the straight-line code leading
+ from the parent state to the current state.
+
+* Details about REG_LIVE_READ32 are omitted.
+
+* Function ``propagate_liveness()`` (see section :ref:`read_marks_for_cache_hits`)
+ might override the first parent link. Please refer to the comments in the
+ ``propagate_liveness()`` and ``mark_reg_read()`` source code for further
+ details.
+
+Because stack writes can have different sizes, ``REG_LIVE_WRITTEN`` marks are
+applied conservatively: stack slots are marked as written only if the write
+size corresponds to the size of the register; e.g., see function
+``save_register_state()``.
+
+Consider the following example::
+
+ 0: (*u64)(r10 - 8) = 0 ; define 8 bytes of fp-8
+ --- checkpoint #0 ---
+ 1: (*u32)(r10 - 8) = 1 ; redefine lower 4 bytes
+ 2: r1 = (*u32)(r10 - 8) ; read lower 4 bytes defined at (1)
+ 3: r2 = (*u32)(r10 - 4) ; read upper 4 bytes defined at (0)
+
+As stated above, the write at (1) does not count as ``REG_LIVE_WRITTEN``. Should
+it be otherwise, the algorithm above wouldn't be able to propagate the read mark
+from (3) to checkpoint #0.
+
+Once the ``BPF_EXIT`` instruction is reached, ``update_branch_counts()`` is
+called to update the ``->branches`` counter for each verifier state in a chain
+of parent verifier states. When the ``->branches`` counter reaches zero the
+verifier state becomes a valid entry in a set of cached verifier states.
+
+Each entry of the verifier states cache is post-processed by a function
+``clean_live_states()``. This function marks all registers and stack slots
+without ``REG_LIVE_READ{32,64}`` marks as ``NOT_INIT`` or ``STACK_INVALID``.
+Registers/stack slots marked in this way are ignored in function ``stacksafe()``
+called from ``states_equal()`` when a state cache entry is considered for
+equivalence with a current state.
+
+Now it is possible to explain how the example from the beginning of the section
+works::
+
+ 0: call bpf_get_prandom_u32()
+ 1: r1 = 0
+ 2: if r0 == 0 goto +1
+ 3: r0 = 1
+ --- checkpoint[0] ---
+ 4: r0 = r1
+ 5: exit
+
+* At instruction #2 a branching point is reached and the state ``{ r0 == 0, r1 == 0, pc == 4 }``
+ is pushed to the states processing queue (pc stands for program counter).
+
+* At instruction #4:
+
+ * ``checkpoint[0]`` states cache entry is created: ``{ r0 == 1, r1 == 0, pc == 4 }``;
+ * ``checkpoint[0].r0`` is marked as written;
+ * ``checkpoint[0].r1`` is marked as read;
+
+* At instruction #5 exit is reached and ``checkpoint[0]`` can now be processed
+ by ``clean_live_states()``. After this processing ``checkpoint[0].r1`` has a
+ read mark and all other registers and stack slots are marked as ``NOT_INIT``
+ or ``STACK_INVALID``.
+
+* The state ``{ r0 == 0, r1 == 0, pc == 4 }`` is popped from the states queue
+ and is compared against the cached state ``{ r1 == 0, pc == 4 }``; the states
+ are considered equivalent.
+
+.. _read_marks_for_cache_hits:
+
+Read marks propagation for cache hits
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Another point is the handling of read marks when a previously verified state is
+found in the states cache. Upon a cache hit, the verifier must behave in the
+same way as if the current state had been verified all the way to the program
+exit. This means that all read marks present on the registers and stack slots
+of the cached state must be propagated over the parentage chain of the current
+state. The example below shows why this is important. Function
+``propagate_liveness()`` handles this case.
+
+Consider the following state parentage chain (S is a starting state, A-E are
+derived states, -> arrows show which state is derived from which)::
+
+ r1 read
+ <------------- A[r1] == 0
+ C[r1] == 0
+ S ---> A ---> B ---> exit E[r1] == 1
+ |
+ ` ---> C ---> D
+ |
+ ` ---> E ^
+ |___ suppose all these
+ ^ states are at insn #Y
+ |
+ suppose all these
+ states are at insn #X
+
+* Chain of states ``S -> A -> B -> exit`` is verified first.
+
+* While ``B -> exit`` is verified, register ``r1`` is read and this read mark is
+ propagated up to state ``A``.
+
+* When chain of states ``C -> D`` is verified the state ``D`` turns out to be
+ equivalent to state ``B``.
+
+* The read mark for ``r1`` has to be propagated to state ``C``, otherwise state
+ ``C`` might get mistakenly marked as equivalent to state ``E`` even though
+ values for register ``r1`` differ between ``C`` and ``E``.
+
+Understanding eBPF verifier messages
+====================================
+
+The following are a few examples of invalid eBPF programs and the verifier
+error messages as seen in the log:
+
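+These messages come from the verifier log, which the loading process must
+request. A minimal sketch of doing so through the raw bpf(2) syscall follows
+(``insns`` and ``insn_cnt`` are assumed to describe the program being
+loaded)::
+
+  char log[4096];
+  union bpf_attr attr = {};
+  int prog_fd;
+
+  attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+  attr.insns = (__u64)(unsigned long)insns;
+  attr.insn_cnt = insn_cnt;
+  attr.license = (__u64)(unsigned long)"GPL";
+  attr.log_buf = (__u64)(unsigned long)log;
+  attr.log_size = sizeof(log);
+  attr.log_level = 1;
+
+  prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+  if (prog_fd < 0)
+          fprintf(stderr, "%s\n", log);   /* verifier messages land here */
+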
+Program with unreachable instructions::
+
+ static struct bpf_insn prog[] = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ };
+
+Error::
+
+ unreachable insn 1
+
+Program that reads an uninitialized register::
+
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (bf) r0 = r2
+ R2 !read_ok
+
+Program that doesn't initialize R0 before exiting::
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (bf) r2 = r1
+ 1: (95) exit
+ R0 !read_ok
+
+Program that accesses stack out of bounds::
+
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (7a) *(u64 *)(r10 +8) = 0
+ invalid stack off=8 size=8
+
+Program that doesn't initialize the stack before passing its address into a function::
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (bf) r2 = r10
+ 1: (07) r2 += -8
+ 2: (b7) r1 = 0x0
+ 3: (85) call 1
+ invalid indirect read from stack off -8+0 size 8
+
+Program that uses an invalid map_fd=0 while calling the map_lookup_elem() function::
+
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ fd 0 is not pointing to valid bpf_map
+
+Program that doesn't check the return value of map_lookup_elem() before
+accessing a map element::
+
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ 5: (7a) *(u64 *)(r0 +0) = 0
+ R0 invalid mem access 'map_value_or_null'
+
+Program that correctly checks the value returned by map_lookup_elem() for
+NULL, but accesses the memory with incorrect alignment::
+
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+1
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +4) = 0
+ misaligned access off 4 size 8
+
+Program that correctly checks the value returned by map_lookup_elem() for NULL
+and accesses memory with correct alignment on one side of the 'if' branch, but
+fails to do so on the other side of the 'if' branch::
+
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+2
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +0) = 0
+ 7: (95) exit
+
+ from 5 to 8: R0=imm0 R10=fp
+ 8: (7a) *(u64 *)(r0 +0) = 1
+ R0 invalid mem access 'imm'
+
+Program that performs a socket lookup then sets the pointer to NULL without
+checking it::
+
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (b7) r2 = 0
+ 1: (63) *(u32 *)(r10 -8) = r2
+ 2: (bf) r2 = r10
+ 3: (07) r2 += -8
+ 4: (b7) r3 = 4
+ 5: (b7) r4 = 0
+ 6: (b7) r5 = 0
+ 7: (85) call bpf_sk_lookup_tcp#65
+ 8: (b7) r0 = 0
+ 9: (95) exit
+ Unreleased reference id=1, alloc_insn=7
+
+Program that performs a socket lookup but does not NULL-check the returned
+value::
+
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+ BPF_EXIT_INSN(),
+
+Error::
+
+ 0: (b7) r2 = 0
+ 1: (63) *(u32 *)(r10 -8) = r2
+ 2: (bf) r2 = r10
+ 3: (07) r2 += -8
+ 4: (b7) r3 = 4
+ 5: (b7) r4 = 0
+ 6: (b7) r5 = 0
+ 7: (85) call bpf_sk_lookup_tcp#65
+ 8: (95) exit
+ Unreleased reference id=1, alloc_insn=7
diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst
index 5845960ca382..7964fe134277 100644
--- a/Documentation/cdrom/cdrom-standard.rst
+++ b/Documentation/cdrom/cdrom-standard.rst
@@ -218,7 +218,6 @@ current *struct* is::
int (*tray_move)(struct cdrom_device_info *, int);
int (*lock_door)(struct cdrom_device_info *, int);
int (*select_speed)(struct cdrom_device_info *, int);
- int (*select_disc)(struct cdrom_device_info *, int);
int (*get_last_session) (struct cdrom_device_info *,
struct cdrom_multisession *);
int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *);
@@ -421,15 +420,6 @@ return value indicates an error.
::
- int select_disc(struct cdrom_device_info *cdi, int number)
-
-If the drive can store multiple discs (a juke-box) this function
-will perform disc selection. It should return the number of the
-selected disc on success, a negative value on error. Currently, only
-the ide-cd driver supports this functionality.
-
-::
-
int get_last_session(struct cdrom_device_info *cdi,
struct cdrom_multisession *ms_info)
@@ -907,6 +897,17 @@ commands can be identified by the underscores in their names.
specifies the slot for which the information is given. The special
value *CDSL_CURRENT* requests that information about the currently
selected slot be returned.
+`CDROM_TIMED_MEDIA_CHANGE`
+ Checks whether the disc has been changed since a user supplied time
+ and returns the time of the last disc change.
+
+ *arg* is a pointer to a *cdrom_timed_media_change_info* struct.
+ *arg->last_media_change* may be set by the calling code to the
+ timestamp of the last media change known to the caller.
+ Upon successful return, this ioctl call will set
+ *arg->last_media_change* to the latest media change timestamp (in ms)
+ known by the kernel/driver and set *arg->has_changed* to 1 if
+ that timestamp is more recent than the timestamp set by the caller.
`CDROM_DRIVE_STATUS`
Returns the status of the drive by a call to
*drive_status()*. Return values are defined in cdrom_drive_status_.
diff --git a/Documentation/cdrom/ide-cd.rst b/Documentation/cdrom/ide-cd.rst
deleted file mode 100644
index bdccb74fc92d..000000000000
--- a/Documentation/cdrom/ide-cd.rst
+++ /dev/null
@@ -1,538 +0,0 @@
-IDE-CD driver documentation
-===========================
-
-:Originally by: scott snyder <snyder@fnald0.fnal.gov> (19 May 1996)
-:Carrying on the torch is: Erik Andersen <andersee@debian.org>
-:New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk>
-
-1. Introduction
----------------
-
-The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant
-CDROM drives which attach to an IDE interface. Note that some CDROM vendors
-(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made
-both ATAPI-compliant drives and drives which use a proprietary
-interface. If your drive uses one of those proprietary interfaces,
-this driver will not work with it (but one of the other CDROM drivers
-probably will). This driver will not work with `ATAPI` drives which
-attach to the parallel port. In addition, there is at least one drive
-(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI;
-this driver will not work with drives like that either (but see the
-aztcd driver).
-
-This driver provides the following features:
-
- - Reading from data tracks, and mounting ISO 9660 filesystems.
-
- - Playing audio tracks. Most of the CDROM player programs floating
- around should work; I usually use Workman.
-
- - Multisession support.
-
- - On drives which support it, reading digital audio data directly
- from audio tracks. The program cdda2wav can be used for this.
- Note, however, that only some drives actually support this.
-
- - There is now support for CDROM changers which comply with the
- ATAPI 2.6 draft standard (such as the NEC CDR-251). This additional
- functionality includes a function call to query which slot is the
- currently selected slot, a function call to query which slots contain
- CDs, etc. A sample program which demonstrates this functionality is
- appended to the end of this file. The Sanyo 3-disc changer
- (which does not conform to the standard) is also now supported.
- Please note the driver refers to the first CD as slot # 0.
-
-
-2. Installation
----------------
-
-0. The ide-cd relies on the ide disk driver. See
- Documentation/ide/ide.rst for up-to-date information on the ide
- driver.
-
-1. Make sure that the ide and ide-cd drivers are compiled into the
- kernel you're using. When configuring the kernel, in the section
- entitled "Floppy, IDE, and other block devices", say either `Y`
- (which will compile the support directly into the kernel) or `M`
- (to compile support as a module which can be loaded and unloaded)
- to the options::
-
- ATA/ATAPI/MFM/RLL support
- Include IDE/ATAPI CDROM support
-
- Depending on what type of IDE interface you have, you may need to
- specify additional configuration options. See
- Documentation/ide/ide.rst.
-
-2. You should also ensure that the iso9660 filesystem is either
- compiled into the kernel or available as a loadable module. You
- can see if a filesystem is known to the kernel by catting
- /proc/filesystems.
-
-3. The CDROM drive should be connected to the host on an IDE
- interface. Each interface on a system is defined by an I/O port
- address and an IRQ number, the standard assignments being
- 0x1f0 and 14 for the primary interface and 0x170 and 15 for the
- secondary interface. Each interface can control up to two devices,
- where each device can be a hard drive, a CDROM drive, a floppy drive,
- or a tape drive. The two devices on an interface are called `master`
- and `slave`; this is usually selectable via a jumper on the drive.
-
- Linux names these devices as follows. The master and slave devices
- on the primary IDE interface are called `hda` and `hdb`,
- respectively. The drives on the secondary interface are called
- `hdc` and `hdd`. (Interfaces at other locations get other letters
- in the third position; see Documentation/ide/ide.rst.)
-
- If you want your CDROM drive to be found automatically by the
- driver, you should make sure your IDE interface uses either the
- primary or secondary addresses mentioned above. In addition, if
- the CDROM drive is the only device on the IDE interface, it should
- be jumpered as `master`. (If for some reason you cannot configure
- your system in this manner, you can probably still use the driver.
- You may have to pass extra configuration information to the kernel
- when you boot, however. See Documentation/ide/ide.rst for more
- information.)
-
-4. Boot the system. If the drive is recognized, you should see a
- message which looks like::
-
- hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive
-
- If you do not see this, see section 5 below.
-
-5. You may want to create a symbolic link /dev/cdrom pointing to the
- actual device. You can do this with the command::
-
- ln -s /dev/hdX /dev/cdrom
-
- where X should be replaced by the letter indicating where your
- drive is installed.
-
-6. You should be able to see any error messages from the driver with
- the `dmesg` command.
-
-
-3. Basic usage
---------------
-
-An ISO 9660 CDROM can be mounted by putting the disc in the drive and
-typing (as root)::
-
- mount -t iso9660 /dev/cdrom /mnt/cdrom
-
-where it is assumed that /dev/cdrom is a link pointing to the actual
-device (as described in step 5 of the last section) and /mnt/cdrom is
-an empty directory. You should now be able to see the contents of the
-CDROM under the /mnt/cdrom directory. If you want to eject the CDROM,
-you must first dismount it with a command like::
-
- umount /mnt/cdrom
-
-Note that audio CDs cannot be mounted.
-
-Some distributions set up /etc/fstab to always try to mount a CDROM
-filesystem on bootup. It is not required to mount the CDROM in this
-manner, though, and it may be a nuisance if you change CDROMs often.
-You should feel free to remove the cdrom line from /etc/fstab and
-mount CDROMs manually if that suits you better.
-
-Multisession and photocd discs should work with no special handling.
-The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be
-useful for reading photocds.
-
-To play an audio CD, you should first unmount and remove any data
-CDROM. Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.).
-
-On a few drives, you can read digital audio directly using a program
-such as cdda2wav. The only types of drive which I've heard support
-this are Sony and Toshiba drives. You will get errors if you try to
-use this function on a drive which does not support it.
-
-For supported changers, you can use the `cdchange` program (appended to
-the end of this file) to switch between changer slots. Note that the
-drive should be unmounted before attempting this. The program takes
-two arguments: the CDROM device, and the slot number to which you wish
-to change. If the slot number is -1, the drive is unloaded.
-
-
-4. Common problems
-------------------
-
-This section discusses some common problems encountered when trying to
-use the driver, and some possible solutions. Note that if you are
-experiencing problems, you should probably also review
-Documentation/ide/ide.rst for current information about the underlying
-IDE support code. Some of these items apply only to earlier versions
-of the driver, but are mentioned here for completeness.
-
-In most cases, you should probably check with `dmesg` for any errors
-from the driver.
-
-a. Drive is not detected during booting.
-
- - Review the configuration instructions above and in
- Documentation/ide/ide.rst, and check how your hardware is
- configured.
-
- - If your drive is the only device on an IDE interface, it should
- be jumpered as master, if at all possible.
-
- - If your IDE interface is not at the standard addresses of 0x170
- or 0x1f0, you'll need to explicitly inform the driver using a
- lilo option. See Documentation/ide/ide.rst. (This feature was
- added around kernel version 1.3.30.)
-
- - If the autoprobing is not finding your drive, you can tell the
- driver to assume that one exists by using a lilo option of the
- form `hdX=cdrom`, where X is the drive letter corresponding to
- where your drive is installed. Note that if you do this and you
- see a boot message like::
-
- hdX: ATAPI cdrom (?)
-
- this does _not_ mean that the driver has successfully detected
- the drive; rather, it means that the driver has not detected a
- drive, but is assuming there's one there anyway because you told
- it so. If you actually try to do I/O to a drive defined at a
- nonexistent or nonresponding I/O address, you'll probably get
- errors with a status value of 0xff.
-
- - Some IDE adapters require a nonstandard initialization sequence
- before they'll function properly. (If this is the case, there
- will often be a separate MS-DOS driver just for the controller.)
- IDE interfaces on sound cards often fall into this category.
-
- Support for some interfaces needing extra initialization is
- provided in later 1.3.x kernels. You may need to turn on
- additional kernel configuration options to get them to work;
- see Documentation/ide/ide.rst.
-
- Even if support is not available for your interface, you may be
- able to get it to work with the following procedure. First boot
- MS-DOS and load the appropriate drivers. Then warm-boot linux
- (i.e., without powering off). If this works, it can be automated
- by running loadlin from the MS-DOS autoexec.
-
-
-b. Timeout/IRQ errors.
-
- - If you always get timeout errors, interrupts from the drive are
- probably not making it to the host.
-
- - IRQ problems may also be indicated by the message
- `IRQ probe failed (<n>)` while booting. If <n> is zero, that
- means that the system did not see an interrupt from the drive when
- it was expecting one (on any feasible IRQ). If <n> is negative,
- that means the system saw interrupts on multiple IRQ lines, when
- it was expecting to receive just one from the CDROM drive.
-
- - Double-check your hardware configuration to make sure that the IRQ
- number of your IDE interface matches what the driver expects.
- (The usual assignments are 14 for the primary (0x1f0) interface
- and 15 for the secondary (0x170) interface.) Also be sure that
- you don't have some other hardware which might be conflicting with
- the IRQ you're using. Also check the BIOS setup for your system;
- some have the ability to disable individual IRQ levels, and I've
- had one report of a system which was shipped with IRQ 15 disabled
- by default.
-
- - Note that many MS-DOS CDROM drivers will still function even if
- there are hardware problems with the interrupt setup; they
- apparently don't use interrupts.
-
- - If you own a Pioneer DR-A24X, you _will_ get nasty error messages
- on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }"
- The Pioneer DR-A24X CDROM drives are fairly popular these days.
- Unfortunately, these drives seem to become very confused when we perform
- the standard Linux ATA disk drive probe. If you own one of these drives,
- you can bypass the ATA probing which confuses these CDROM drives, by
- adding `append="hdX=noprobe hdX=cdrom"` to your lilo.conf file and running
- lilo (again where X is the drive letter corresponding to where your drive
- is installed.)
-
-c. System hangups.
-
- - If the system locks up when you try to access the CDROM, the most
- likely cause is that you have a buggy IDE adapter which doesn't
- properly handle simultaneous transactions on multiple interfaces.
- The most notorious of these is the CMD640B chip. This problem can
- be worked around by specifying the `serialize` option when
- booting. Recent kernels should be able to detect the need for
- this automatically in most cases, but the detection is not
- foolproof. See Documentation/ide/ide.rst for more information
- about the `serialize` option and the CMD640B.
-
- - Note that many MS-DOS CDROM drivers will work with such buggy
- hardware, apparently because they never attempt to overlap CDROM
- operations with other disk activity.
-
-
-d. Can't mount a CDROM.
-
- - If you get errors from mount, it may help to check `dmesg` to see
- if there are any more specific errors from the driver or from the
- filesystem.
-
- - Make sure there's a CDROM loaded in the drive, and that's it's an
- ISO 9660 disc. You can't mount an audio CD.
-
- - With the CDROM in the drive and unmounted, try something like::
-
- cat /dev/cdrom | od | more
-
- If you see a dump, then the drive and driver are probably working
- OK, and the problem is at the filesystem level (i.e., the CDROM is
- not ISO 9660 or has errors in the filesystem structure).
-
- - If you see `not a block device` errors, check that the definitions
- of the device special files are correct. They should be as
- follows::
-
- brw-rw---- 1 root disk 3, 0 Nov 11 18:48 /dev/hda
- brw-rw---- 1 root disk 3, 64 Nov 11 18:48 /dev/hdb
- brw-rw---- 1 root disk 22, 0 Nov 11 18:48 /dev/hdc
- brw-rw---- 1 root disk 22, 64 Nov 11 18:48 /dev/hdd
-
- Some early Slackware releases had these defined incorrectly. If
- these are wrong, you can remake them by running the script
- scripts/MAKEDEV.ide. (You may have to make it executable
- with chmod first.)
-
- If you have a /dev/cdrom symbolic link, check that it is pointing
- to the correct device file.
-
- If you hear people talking of the devices `hd1a` and `hd1b`, these
- were old names for what are now called hdc and hdd. Those names
- should be considered obsolete.
-
- - If mount is complaining that the iso9660 filesystem is not
- available, but you know it is (check /proc/filesystems), you
- probably need a newer version of mount. Early versions would not
- always give meaningful error messages.
-
-
-e. Directory listings are unpredictably truncated, and `dmesg` shows
- `buffer botch` error messages from the driver.
-
- - There was a bug in the version of the driver in 1.2.x kernels
- which could cause this. It was fixed in 1.3.0. If you can't
- upgrade, you can probably work around the problem by specifying a
- blocksize of 2048 when mounting. (Note that you won't be able to
- directly execute binaries off the CDROM in that case.)
-
- If you see this in kernels later than 1.3.0, please report it as a
- bug.
-
-
-f. Data corruption.
-
- - Random data corruption was occasionally observed with the Hitachi
- CDR-7730 CDROM. If you experience data corruption, using "hdx=slow"
- as a command line parameter may work around the problem, at the
- expense of low system performance.
-
-
-5. cdchange.c
--------------
-
-::
-
- /*
- * cdchange.c [-v] <device> [<slot>]
- *
- * This loads a CDROM from a specified slot in a changer, and displays
- * information about the changer status. The drive should be unmounted before
- * using this program.
- *
- * Changer information is displayed if either the -v flag is specified
- * or no slot was specified.
- *
- * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>.
- * Changer status information, and rewrite for the new Uniform CDROM driver
- * interface by Erik Andersen <andersee@debian.org>.
- */
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <errno.h>
- #include <string.h>
- #include <unistd.h>
- #include <fcntl.h>
- #include <sys/ioctl.h>
- #include <linux/cdrom.h>
-
-
- int
- main (int argc, char **argv)
- {
- char *program;
- char *device;
- int fd; /* file descriptor for CD-ROM device */
- int status; /* return status for system calls */
- int verbose = 0;
- int slot=-1, x_slot;
- int total_slots_available;
-
- program = argv[0];
-
- ++argv;
- --argc;
-
- if (argc < 1 || argc > 3) {
- fprintf (stderr, "usage: %s [-v] <device> [<slot>]\n",
- program);
- fprintf (stderr, " Slots are numbered 1 -- n.\n");
- exit (1);
- }
-
- if (strcmp (argv[0], "-v") == 0) {
- verbose = 1;
- ++argv;
- --argc;
- }
-
- device = argv[0];
-
- if (argc == 2)
- slot = atoi (argv[1]) - 1;
-
- /* open device */
- fd = open(device, O_RDONLY | O_NONBLOCK);
- if (fd < 0) {
- fprintf (stderr, "%s: open failed for `%s`: %s\n",
- program, device, strerror (errno));
- exit (1);
- }
-
- /* Check CD player status */
- total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS);
- if (total_slots_available <= 1 ) {
- fprintf (stderr, "%s: Device `%s` is not an ATAPI "
- "compliant CD changer.\n", program, device);
- exit (1);
- }
-
- if (slot >= 0) {
- if (slot >= total_slots_available) {
- fprintf (stderr, "Bad slot number. "
- "Should be 1 -- %d.\n",
- total_slots_available);
- exit (1);
- }
-
- /* load */
- slot=ioctl (fd, CDROM_SELECT_DISC, slot);
- if (slot<0) {
- fflush(stdout);
- perror ("CDROM_SELECT_DISC ");
- exit(1);
- }
- }
-
- if (slot < 0 || verbose) {
-
- status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT);
- if (status<0) {
- fflush(stdout);
- perror (" CDROM_SELECT_DISC");
- exit(1);
- }
- slot=status;
-
- printf ("Current slot: %d\n", slot+1);
- printf ("Total slots available: %d\n",
- total_slots_available);
-
- printf ("Drive status: ");
- status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
- if (status<0) {
- perror(" CDROM_DRIVE_STATUS");
- } else switch(status) {
- case CDS_DISC_OK:
- printf ("Ready.\n");
- break;
- case CDS_TRAY_OPEN:
- printf ("Tray Open.\n");
- break;
- case CDS_DRIVE_NOT_READY:
- printf ("Drive Not Ready.\n");
- break;
- default:
- printf ("This Should not happen!\n");
- break;
- }
-
- for (x_slot=0; x_slot<total_slots_available; x_slot++) {
- printf ("Slot %2d: ", x_slot+1);
- status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot);
- if (status<0) {
- perror(" CDROM_DRIVE_STATUS");
- } else switch(status) {
- case CDS_DISC_OK:
- printf ("Disc present.");
- break;
- case CDS_NO_DISC:
- printf ("Empty slot.");
- break;
- case CDS_TRAY_OPEN:
- printf ("CD-ROM tray open.\n");
- break;
- case CDS_DRIVE_NOT_READY:
- printf ("CD-ROM drive not ready.\n");
- break;
- case CDS_NO_INFO:
- printf ("No Information available.");
- break;
- default:
- printf ("This Should not happen!\n");
- break;
- }
- if (slot == x_slot) {
- status = ioctl (fd, CDROM_DISC_STATUS);
- if (status<0) {
- perror(" CDROM_DISC_STATUS");
- }
- switch (status) {
- case CDS_AUDIO:
- printf ("\tAudio disc.\t");
- break;
- case CDS_DATA_1:
- case CDS_DATA_2:
- printf ("\tData disc type %d.\t", status-CDS_DATA_1+1);
- break;
- case CDS_XA_2_1:
- case CDS_XA_2_2:
- printf ("\tXA data disc type %d.\t", status-CDS_XA_2_1+1);
- break;
- default:
- printf ("\tUnknown disc type 0x%x!\t", status);
- break;
- }
- }
- status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot);
- if (status<0) {
- perror(" CDROM_MEDIA_CHANGED");
- }
- switch (status) {
- case 1:
- printf ("Changed.\n");
- break;
- default:
- printf ("\n");
- break;
- }
- }
- }
-
- /* close device */
- status = close (fd);
- if (status != 0) {
- fprintf (stderr, "%s: close failed for `%s`: %s\n",
- program, device, strerror (errno));
- exit (1);
- }
-
- exit (0);
- }
diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst
index 338ad5f94e7c..e87a8785bc1a 100644
--- a/Documentation/cdrom/index.rst
+++ b/Documentation/cdrom/index.rst
@@ -8,7 +8,6 @@ cdrom
:maxdepth: 1
cdrom-standard
- ide-cd
packet-writing
.. only:: subproject and html
diff --git a/Documentation/cdrom/packet-writing.rst b/Documentation/cdrom/packet-writing.rst
index c5c957195a5a..43db58c50d29 100644
--- a/Documentation/cdrom/packet-writing.rst
+++ b/Documentation/cdrom/packet-writing.rst
@@ -11,7 +11,7 @@ Getting started quick
- Compile and install kernel and modules, reboot.
- You need the udftools package (pktsetup, mkudffs, cdrwtool).
- Download from http://sourceforge.net/projects/linux-udf/
+ Download from https://github.com/pali/udftools
- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute
as appropriate)::
@@ -102,7 +102,7 @@ Using the pktcdvd sysfs interface
Since Linux 2.6.20, the pktcdvd module has a sysfs interface
and can be controlled by it. For example the "pktcdvd" tool uses
-this interface. (see http://tom.ist-im-web.de/download/pktcdvd )
+this interface. (see http://tom.ist-im-web.de/linux/software/pktcdvd )
"pktcdvd" works similar to "pktsetup", e.g.::
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 948a97d6387d..37314afd1ac8 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -15,10 +15,28 @@
import sys
import os
import sphinx
+import shutil
+
+# helper
+# ------
+
+def have_command(cmd):
+ """Search ``cmd`` in the ``PATH`` environment.
+
+ If found, return True.
+ If not found, return False.
+ """
+ return shutil.which(cmd) is not None
# Get Sphinx version
major, minor, patch = sphinx.version_info[:3]
+#
+# Warn about older versions that we don't want to support for much
+# longer.
+#
+if (major < 2) or (major == 2 and minor < 4):
+ print('WARNING: support for Sphinx < 2.4 will be removed soon.')
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
@@ -86,6 +104,7 @@ if major >= 3:
"__used",
"__weak",
"noinline",
+ "__fix_address",
# include/linux/memblock.h:
"__init_memblock",
@@ -97,6 +116,9 @@ if major >= 3:
# include/linux/linkage.h:
"asmlinkage",
+
+ # include/linux/btf.h
+ "__bpf_kfunc",
]
else:
@@ -106,10 +128,35 @@ else:
autosectionlabel_prefix_document = True
autosectionlabel_maxdepth = 2
-extensions.append("sphinx.ext.imgmath")
+# Load math renderer:
+# For html builder, load imgmath only when its dependencies are met.
+# mathjax is the default math renderer since Sphinx 1.8.
+have_latex = have_command('latex')
+have_dvipng = have_command('dvipng')
+load_imgmath = have_latex and have_dvipng
+
+# Respect SPHINX_IMGMATH (for html docs only)
+if 'SPHINX_IMGMATH' in os.environ:
+ env_sphinx_imgmath = os.environ['SPHINX_IMGMATH']
+ if 'yes' in env_sphinx_imgmath:
+ load_imgmath = True
+ elif 'no' in env_sphinx_imgmath:
+ load_imgmath = False
+ else:
+ sys.stderr.write("Unknown env SPHINX_IMGMATH=%s ignored.\n" % env_sphinx_imgmath)
+
+# Always load imgmath for Sphinx <1.8 or for epub docs
+load_imgmath = (load_imgmath or (major == 1 and minor < 8)
+ or 'epub' in sys.argv)
+
+if load_imgmath:
+ extensions.append("sphinx.ext.imgmath")
+ math_renderer = 'imgmath'
+else:
+ math_renderer = 'mathjax'
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ['sphinx/templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
@@ -156,12 +203,30 @@ finally:
else:
version = release = "unknown version"
+#
+# HACK: there seems to be no easy way for us to get at the version and
+# release information passed in from the makefile...so go pawing through the
+# command-line options and find it for ourselves.
+#
+def get_cline_version():
+ c_version = c_release = ''
+ for arg in sys.argv:
+ if arg.startswith('version='):
+ c_version = arg[8:]
+ elif arg.startswith('release='):
+ c_release = arg[8:]
+ if c_version:
+ if c_release:
+ return c_version + '-' + c_release
+ return c_version
+ return version # Whatever we came up with before
+
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = 'en'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
@@ -208,112 +273,88 @@ highlight_language = 'none'
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-# The Read the Docs theme is available from
-# - https://github.com/snide/sphinx_rtd_theme
-# - https://pypi.python.org/pypi/sphinx_rtd_theme
-# - python-sphinx-rtd-theme package (on Debian)
-try:
- import sphinx_rtd_theme
- html_theme = 'sphinx_rtd_theme'
- html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-except ImportError:
- sys.stderr.write('Warning: The Sphinx \'sphinx_rtd_theme\' HTML theme was not found. Make sure you have the theme installed to produce pretty HTML output. Falling back to the default theme.\n')
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
-
-# The name for this set of Sphinx documents. If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# Default theme
+html_theme = 'alabaster'
+html_css_files = []
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
+if "DOCS_THEME" in os.environ:
+ html_theme = os.environ["DOCS_THEME"]
-# The name of an image file (within the static path) to use as favicon of the
-# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
+if html_theme == 'sphinx_rtd_theme' or html_theme == 'sphinx_rtd_dark_mode':
+ # Read the Docs theme
+ try:
+ import sphinx_rtd_theme
+ html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-
-html_static_path = ['sphinx-static']
-
-html_context = {
- 'css_files': [
- '_static/theme_overrides.css',
- ],
-}
-
-# Add any extra paths that contain custom files (such as robots.txt or
-# .htaccess) here, relative to this directory. These files are copied
-# directly to the root of the documentation.
-#html_extra_path = []
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+ # Add any paths that contain custom static files (such as style sheets) here,
+ # relative to this directory. They are copied after the builtin static files,
+ # so a file named "default.css" will overwrite the builtin "default.css".
+ html_css_files = [
+ 'theme_overrides.css',
+ ]
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-html_use_smartypants = False
+ # Read the Docs dark mode override theme
+ if html_theme == 'sphinx_rtd_dark_mode':
+ try:
+ import sphinx_rtd_dark_mode
+ extensions.append('sphinx_rtd_dark_mode')
+ except ImportError:
+ html_theme = 'sphinx_rtd_theme'
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+ if html_theme == 'sphinx_rtd_theme':
+ # Add color-specific RTD normal mode
+ html_css_files.append('theme_rtd_colors.css')
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
+ html_theme_options = {
+ 'navigation_depth': -1,
+ }
-# If false, no module index is generated.
-#html_domain_indices = True
+ except ImportError:
+ html_theme = 'alabaster'
-# If false, no index is generated.
-#html_use_index = True
+if "DOCS_CSS" in os.environ:
+ css = os.environ["DOCS_CSS"].split(" ")
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+ for l in css:
+ html_css_files.append(l)
-# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
-
-# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+if major <= 1 and minor < 8:
+ html_context = {
+ 'css_files': [],
+ }
-# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+ for l in html_css_files:
+ html_context['css_files'].append('_static/' + l)
+
+if html_theme == 'alabaster':
+ html_theme_options = {
+ 'description': get_cline_version(),
+ 'page_width': '65em',
+ 'sidebar_width': '15em',
+ 'fixed_sidebar': 'true',
+ 'font_size': 'inherit',
+ 'font_family': 'serif',
+ }
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it. The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+sys.stderr.write("Using %s theme\n" % html_theme)
-# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['sphinx-static']
-# Language to be used for generating the HTML full-text search index.
-# Sphinx supports the following languages:
-# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
-# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
-#html_search_language = 'en'
+# If true, Docutils "smart quotes" will be used to convert quotes and dashes
+# to typographically correct entities. This will convert "--" to "—",
+# which is not always what we want, so disable it.
+smartquotes = False
-# A dictionary with options for the search language support, empty by default.
-# Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
+# Custom sidebar templates, maps document names to template names.
+# Note that the RTD theme ignores this
+html_sidebars = { '**': ['searchbox.html', 'kernel-toc.html', 'sourcelink.html']}
-# The name of a javascript file (relative to the configuration directory) that
-# implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
+# about.html is available for alabaster theme. Add it at the front.
+if html_theme == 'alabaster':
+ html_sidebars['**'].insert(0, 'about.html')
# Output file base name for HTML help builder.
htmlhelp_basename = 'TheLinuxKerneldoc'
@@ -346,132 +387,25 @@ latex_elements = {
# Additional stuff for the LaTeX preamble.
'preamble': '''
- % Prevent column squeezing of tabulary.
- \\setlength{\\tymin}{20em}
% Use some font with UTF-8 support with XeLaTeX
\\usepackage{fontspec}
\\setsansfont{DejaVu Sans}
\\setromanfont{DejaVu Serif}
\\setmonofont{DejaVu Sans Mono}
- ''',
+ ''',
}
-# Translations have Asian (CJK) characters which are only displayed if
-# xeCJK is used
-
-latex_elements['preamble'] += '''
- \\IfFontExistsTF{Noto Sans CJK SC}{
- % This is needed for translations
- \\usepackage{xeCJK}
- \\IfFontExistsTF{Noto Serif CJK SC}{
- \\setCJKmainfont{Noto Serif CJK SC}[AutoFakeSlant]
- }{
- \\setCJKmainfont{Noto Sans CJK SC}[AutoFakeSlant]
- }
- \\setCJKsansfont{Noto Sans CJK SC}[AutoFakeSlant]
- \\setCJKmonofont{Noto Sans Mono CJK SC}[AutoFakeSlant]
- % CJK Language-specific font choices
- \\IfFontExistsTF{Noto Serif CJK SC}{
- \\newCJKfontfamily[SCmain]\\scmain{Noto Serif CJK SC}[AutoFakeSlant]
- \\newCJKfontfamily[SCserif]\\scserif{Noto Serif CJK SC}[AutoFakeSlant]
- }{
- \\newCJKfontfamily[SCmain]\\scmain{Noto Sans CJK SC}[AutoFakeSlant]
- \\newCJKfontfamily[SCserif]\\scserif{Noto Sans CJK SC}[AutoFakeSlant]
- }
- \\newCJKfontfamily[SCsans]\\scsans{Noto Sans CJK SC}[AutoFakeSlant]
- \\newCJKfontfamily[SCmono]\\scmono{Noto Sans Mono CJK SC}[AutoFakeSlant]
- \\IfFontExistsTF{Noto Serif CJK TC}{
- \\newCJKfontfamily[TCmain]\\tcmain{Noto Serif CJK TC}[AutoFakeSlant]
- \\newCJKfontfamily[TCserif]\\tcserif{Noto Serif CJK TC}[AutoFakeSlant]
- }{
- \\newCJKfontfamily[TCmain]\\tcmain{Noto Sans CJK TC}[AutoFakeSlant]
- \\newCJKfontfamily[TCserif]\\tcserif{Noto Sans CJK TC}[AutoFakeSlant]
- }
- \\newCJKfontfamily[TCsans]\\tcsans{Noto Sans CJK TC}[AutoFakeSlant]
- \\newCJKfontfamily[TCmono]\\tcmono{Noto Sans Mono CJK TC}[AutoFakeSlant]
- \\IfFontExistsTF{Noto Serif CJK KR}{
- \\newCJKfontfamily[KRmain]\\krmain{Noto Serif CJK KR}[AutoFakeSlant]
- \\newCJKfontfamily[KRserif]\\krserif{Noto Serif CJK KR}[AutoFakeSlant]
- }{
- \\newCJKfontfamily[KRmain]\\krmain{Noto Sans CJK KR}[AutoFakeSlant]
- \\newCJKfontfamily[KRserif]\\krserif{Noto Sans CJK KR}[AutoFakeSlant]
- }
- \\newCJKfontfamily[KRsans]\\krsans{Noto Sans CJK KR}[AutoFakeSlant]
- \\newCJKfontfamily[KRmono]\\krmono{Noto Sans Mono CJK KR}[AutoFakeSlant]
- \\IfFontExistsTF{Noto Serif CJK JP}{
- \\newCJKfontfamily[JPmain]\\jpmain{Noto Serif CJK JP}[AutoFakeSlant]
- \\newCJKfontfamily[JPserif]\\jpserif{Noto Serif CJK JP}[AutoFakeSlant]
- }{
- \\newCJKfontfamily[JPmain]\\jpmain{Noto Sans CJK JP}[AutoFakeSlant]
- \\newCJKfontfamily[JPserif]\\jpserif{Noto Sans CJK JP}[AutoFakeSlant]
- }
- \\newCJKfontfamily[JPsans]\\jpsans{Noto Sans CJK JP}[AutoFakeSlant]
- \\newCJKfontfamily[JPmono]\\jpmono{Noto Sans Mono CJK JP}[AutoFakeSlant]
- % Dummy commands for Sphinx < 2.3 (no 'extrapackages' support)
- \\providecommand{\\onehalfspacing}{}
- \\providecommand{\\singlespacing}{}
- % Define custom macros to on/off CJK
- \\newcommand{\\kerneldocCJKon}{\\makexeCJKactive\\onehalfspacing}
- \\newcommand{\\kerneldocCJKoff}{\\makexeCJKinactive\\singlespacing}
- \\newcommand{\\kerneldocBeginSC}{%
- \\begingroup%
- \\scmain%
- }
- \\newcommand{\\kerneldocEndSC}{\\endgroup}
- \\newcommand{\\kerneldocBeginTC}{%
- \\begingroup%
- \\tcmain%
- \\renewcommand{\\CJKrmdefault}{TCserif}%
- \\renewcommand{\\CJKsfdefault}{TCsans}%
- \\renewcommand{\\CJKttdefault}{TCmono}%
- }
- \\newcommand{\\kerneldocEndTC}{\\endgroup}
- \\newcommand{\\kerneldocBeginKR}{%
- \\begingroup%
- \\xeCJKDeclareCharClass{HalfLeft}{`“,`‘}%
- \\xeCJKDeclareCharClass{HalfRight}{`”,`’}%
- \\krmain%
- \\renewcommand{\\CJKrmdefault}{KRserif}%
- \\renewcommand{\\CJKsfdefault}{KRsans}%
- \\renewcommand{\\CJKttdefault}{KRmono}%
- \\xeCJKsetup{CJKspace = true} % For inter-phrase space
- }
- \\newcommand{\\kerneldocEndKR}{\\endgroup}
- \\newcommand{\\kerneldocBeginJP}{%
- \\begingroup%
- \\xeCJKDeclareCharClass{HalfLeft}{`“,`‘}%
- \\xeCJKDeclareCharClass{HalfRight}{`”,`’}%
- \\jpmain%
- \\renewcommand{\\CJKrmdefault}{JPserif}%
- \\renewcommand{\\CJKsfdefault}{JPsans}%
- \\renewcommand{\\CJKttdefault}{JPmono}%
- }
- \\newcommand{\\kerneldocEndJP}{\\endgroup}
- % Single spacing in literal blocks
- \\fvset{baselinestretch=1}
- % To customize \\sphinxtableofcontents
- \\usepackage{etoolbox}
- % Inactivate CJK after tableofcontents
- \\apptocmd{\\sphinxtableofcontents}{\\kerneldocCJKoff}{}{}
- }{ % No CJK font found
- % Custom macros to on/off CJK (Dummy)
- \\newcommand{\\kerneldocCJKon}{}
- \\newcommand{\\kerneldocCJKoff}{}
- \\newcommand{\\kerneldocBeginSC}{}
- \\newcommand{\\kerneldocEndSC}{}
- \\newcommand{\\kerneldocBeginTC}{}
- \\newcommand{\\kerneldocEndTC}{}
- \\newcommand{\\kerneldocBeginKR}{}
- \\newcommand{\\kerneldocEndKR}{}
- \\newcommand{\\kerneldocBeginJP}{}
- \\newcommand{\\kerneldocEndJP}{}
- }
-'''
-
# Fix reference escape troubles with Sphinx 1.4.x
if major == 1:
latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
+
+# Load kerneldoc specific LaTeX settings
+latex_elements['preamble'] += '''
+ % Load kerneldoc specific LaTeX settings
+ \\input{kerneldoc-preamble.sty}
+'''
+
# With Sphinx 1.6, it is possible to change the Bg color directly
# by using:
# \definecolor{sphinxnoteBgColor}{RGB}{204,255,255}
@@ -533,6 +467,11 @@ for fn in os.listdir('.'):
# If false, no module index is generated.
#latex_domain_indices = True
+# Additional LaTeX stuff to be copied to build directory
+latex_additional_files = [
+ 'sphinx/kerneldoc-preamble.sty',
+]
+
# -- Options for manual page output ---------------------------------------
@@ -558,19 +497,6 @@ texinfo_documents = [
'Miscellaneous'),
]
-# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
-
-# If false, no module index is generated.
-#texinfo_domain_indices = True
-
-# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
-
-# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
-
-
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
@@ -579,67 +505,9 @@ epub_author = author
epub_publisher = author
epub_copyright = copyright
-# The basename for the epub file. It defaults to the project name.
-#epub_basename = project
-
-# The HTML theme for the epub output. Since the default themes are not
-# optimized for small screen space, using the same theme for HTML and epub
-# output is usually not wise. This defaults to 'epub', a theme designed to save
-# visual space.
-#epub_theme = 'epub'
-
-# The language of the text. It defaults to the language option
-# or 'en' if the language is not set.
-#epub_language = ''
-
-# The scheme of the identifier. Typical schemes are ISBN or URL.
-#epub_scheme = ''
-
-# The unique identifier of the text. This can be a ISBN number
-# or the project homepage.
-#epub_identifier = ''
-
-# A unique identification for the text.
-#epub_uid = ''
-
-# A tuple containing the cover image and cover page html template filenames.
-#epub_cover = ()
-
-# A sequence of (type, uri, title) tuples for the guide element of content.opf.
-#epub_guide = ()
-
-# HTML files that should be inserted before the pages created by sphinx.
-# The format is a list of tuples containing the path and title.
-#epub_pre_files = []
-
-# HTML files that should be inserted after the pages created by sphinx.
-# The format is a list of tuples containing the path and title.
-#epub_post_files = []
-
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
-# The depth of the table of contents in toc.ncx.
-#epub_tocdepth = 3
-
-# Allow duplicate toc entries.
-#epub_tocdup = True
-
-# Choose between 'default' and 'includehidden'.
-#epub_tocscope = 'default'
-
-# Fix unsupported image types using the Pillow.
-#epub_fix_images = False
-
-# Scale large images.
-#epub_max_image_width = 0
-
-# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#epub_show_urls = 'inline'
-
-# If false, no index is generated.
-#epub_use_index = True
-
#=======
# rst2pdf
#
diff --git a/Documentation/asm-annotations.rst b/Documentation/core-api/asm-annotations.rst
index 76424e0431f4..11c96d3f9ad6 100644
--- a/Documentation/asm-annotations.rst
+++ b/Documentation/core-api/asm-annotations.rst
@@ -43,10 +43,11 @@ annotated objects like this, tools can be run on them to generate more useful
information. In particular, on properly annotated objects, ``objtool`` can be
run to check and fix the object if needed. Currently, ``objtool`` can report
missing frame pointer setup/destruction in functions. It can also
-automatically generate annotations for :doc:`ORC unwinder <x86/orc-unwinder>`
+automatically generate annotations for the ORC unwinder
+(Documentation/arch/x86/orc-unwinder.rst)
for most code. Both of these are especially important to support reliable
-stack traces which are in turn necessary for :doc:`Kernel live patching
-<livepatch/livepatch>`.
+stack traces which are in turn necessary for kernel live patching
+(Documentation/livepatch/livepatch.rst).
Caveat and Discussion
---------------------
@@ -64,7 +65,7 @@ macros, it was decided that brand new macros should be introduced instead::
of importing all the crappy, historic, essentially randomly chosen
debug symbol macro names from the binutils and older kernels?
-.. _discussion: https://lkml.kernel.org/r/20170217104757.28588-1-jslaby@suse.cz
+.. _discussion: https://lore.kernel.org/r/20170217104757.28588-1-jslaby@suse.cz
Macros Description
------------------
@@ -130,14 +131,13 @@ denoting a range of code via ``SYM_*_START/END`` annotations.
In fact, this kind of annotation corresponds to the now deprecated ``ENTRY``
and ``ENDPROC`` macros.
-* ``SYM_FUNC_START_ALIAS`` and ``SYM_FUNC_START_LOCAL_ALIAS`` serve for those
- who decided to have two or more names for one function. The typical use is::
+* ``SYM_FUNC_ALIAS``, ``SYM_FUNC_ALIAS_LOCAL``, and ``SYM_FUNC_ALIAS_WEAK`` can
+ be used to define multiple names for a function. The typical use is::
- SYM_FUNC_START_ALIAS(__memset)
- SYM_FUNC_START(memset)
+ SYM_FUNC_START(__memset)
... asm insns ...
- SYM_FUNC_END(memset)
- SYM_FUNC_END_ALIAS(__memset)
+ SYM_FUNC_END(__memset)
+ SYM_FUNC_ALIAS(memset, __memset)
In this example, one can call ``__memset`` or ``memset`` with the same
result, except the debug information for the instructions is generated to
diff --git a/Documentation/core-api/bus-virt-phys-mapping.rst b/Documentation/core-api/bus-virt-phys-mapping.rst
deleted file mode 100644
index c72b24a7d52c..000000000000
--- a/Documentation/core-api/bus-virt-phys-mapping.rst
+++ /dev/null
@@ -1,220 +0,0 @@
-==========================================================
-How to access I/O mapped memory from within device drivers
-==========================================================
-
-:Author: Linus
-
-.. warning::
-
- The virt_to_bus() and bus_to_virt() functions have been
- superseded by the functionality provided by the PCI DMA interface
- (see Documentation/core-api/dma-api-howto.rst). They continue
- to be documented below for historical purposes, but new code
- must not use them. --davidm 00/12/12
-
-::
-
- [ This is a mail message in response to a query on IO mapping, thus the
- strange format for a "document" ]
-
-The AHA-1542 is a bus-master device, and your patch makes the driver give the
-controller the physical address of the buffers, which is correct on x86
-(because all bus master devices see the physical memory mappings directly).
-
-However, on many setups, there are actually **three** different ways of looking
-at memory addresses, and in this case we actually want the third, the
-so-called "bus address".
-
-Essentially, the three ways of addressing memory are (this is "real memory",
-that is, normal RAM--see later about other details):
-
- - CPU untranslated. This is the "physical" address. Physical address
- 0 is what the CPU sees when it drives zeroes on the memory bus.
-
- - CPU translated address. This is the "virtual" address, and is
- completely internal to the CPU itself with the CPU doing the appropriate
- translations into "CPU untranslated".
-
- - bus address. This is the address of memory as seen by OTHER devices,
- not the CPU. Now, in theory there could be many different bus
- addresses, with each device seeing memory in some device-specific way, but
- happily most hardware designers aren't actually actively trying to make
- things any more complex than necessary, so you can assume that all
- external hardware sees the memory the same way.
-
-Now, on normal PCs the bus address is exactly the same as the physical
-address, and things are very simple indeed. However, they are that simple
-because the memory and the devices share the same address space, and that is
-not generally necessarily true on other PCI/ISA setups.
-
-Now, just as an example, on the PReP (PowerPC Reference Platform), the
-CPU sees a memory map something like this (this is from memory)::
-
- 0-2 GB "real memory"
- 2 GB-3 GB "system IO" (inb/out and similar accesses on x86)
- 3 GB-4 GB "IO memory" (shared memory over the IO bus)
-
-Now, that looks simple enough. However, when you look at the same thing from
-the viewpoint of the devices, you have the reverse, and the physical memory
-address 0 actually shows up as address 2 GB for any IO master.
-
-So when the CPU wants any bus master to write to physical memory 0, it
-has to give the master address 0x80000000 as the memory address.
-
-So, for example, depending on how the kernel is actually mapped on the
-PPC, you can end up with a setup like this::
-
- physical address: 0
- virtual address: 0xC0000000
- bus address: 0x80000000
-
-where all the addresses actually point to the same thing. It's just seen
-through different translations..
-
-Similarly, on the Alpha, the normal translation is::
-
- physical address: 0
- virtual address: 0xfffffc0000000000
- bus address: 0x40000000
-
-(but there are also Alphas where the physical address and the bus address
-are the same).
-
-Anyway, the way to look up all these translations, you do::
-
- #include <asm/io.h>
-
- phys_addr = virt_to_phys(virt_addr);
- virt_addr = phys_to_virt(phys_addr);
- bus_addr = virt_to_bus(virt_addr);
- virt_addr = bus_to_virt(bus_addr);
-
-Now, when do you need these?
-
-You want the **virtual** address when you are actually going to access that
-pointer from the kernel. So you can have something like this::
-
- /*
- * this is the hardware "mailbox" we use to communicate with
- * the controller. The controller sees this directly.
- */
- struct mailbox {
- __u32 status;
- __u32 bufstart;
- __u32 buflen;
- ..
- } mbox;
-
- unsigned char * retbuffer;
-
- /* get the address from the controller */
- retbuffer = bus_to_virt(mbox.bufstart);
- switch (retbuffer[0]) {
- case STATUS_OK:
- ...
-
-on the other hand, you want the bus address when you have a buffer that
-you want to give to the controller::
-
- /* ask the controller to read the sense status into "sense_buffer" */
- mbox.bufstart = virt_to_bus(&sense_buffer);
- mbox.buflen = sizeof(sense_buffer);
- mbox.status = 0;
- notify_controller(&mbox);
-
-And you generally **never** want to use the physical address, because you can't
-use that from the CPU (the CPU only uses translated virtual addresses), and
-you can't use it from the bus master.
-
-So why do we care about the physical address at all? We do need the physical
-address in some cases, it's just not very often in normal code. The physical
-address is needed if you use memory mappings, for example, because the
-"remap_pfn_range()" mm function wants the physical address of the memory to
-be remapped as measured in units of pages, a.k.a. the pfn (the memory
-management layer doesn't know about devices outside the CPU, so it
-shouldn't need to know about "bus addresses" etc).
-
-.. note::
-
- The above is only one part of the whole equation. The above
- only talks about "real memory", that is, CPU memory (RAM).
-
-There is a completely different type of memory too, and that's the "shared
-memory" on the PCI or ISA bus. That's generally not RAM (although in the case
-of a video graphics card it can be normal DRAM that is just used for a frame
-buffer), but can be things like a packet buffer in a network card etc.
-
-This memory is called "PCI memory" or "shared memory" or "IO memory" or
-whatever, and there is only one way to access it: the readb/writeb and
-related functions. You should never take the address of such memory, because
-there is really nothing you can do with such an address: it's not
-conceptually in the same memory space as "real memory" at all, so you cannot
-just dereference a pointer. (Sadly, on x86 it **is** in the same memory space,
-so on x86 it actually works to just deference a pointer, but it's not
-portable).
-
-For such memory, you can do things like:
-
- - reading::
-
- /*
- * read first 32 bits from ISA memory at 0xC0000, aka
- * C000:0000 in DOS terms
- */
- unsigned int signature = isa_readl(0xC0000);
-
- - remapping and writing::
-
- /*
- * remap framebuffer PCI memory area at 0xFC000000,
- * size 1MB, so that we can access it: We can directly
- * access only the 640k-1MB area, so anything else
- * has to be remapped.
- */
- void __iomem *baseptr = ioremap(0xFC000000, 1024*1024);
-
- /* write a 'A' to the offset 10 of the area */
- writeb('A',baseptr+10);
-
- /* unmap when we unload the driver */
- iounmap(baseptr);
-
- - copying and clearing::
-
- /* get the 6-byte Ethernet address at ISA address E000:0040 */
- memcpy_fromio(kernel_buffer, 0xE0040, 6);
- /* write a packet to the driver */
- memcpy_toio(0xE1000, skb->data, skb->len);
- /* clear the frame buffer */
- memset_io(0xA0000, 0, 0x10000);
-
-OK, that just about covers the basics of accessing IO portably. Questions?
-Comments? You may think that all the above is overly complex, but one day you
-might find yourself with a 500 MHz Alpha in front of you, and then you'll be
-happy that your driver works ;)
-
-Note that kernel versions 2.0.x (and earlier) mistakenly called the
-ioremap() function "vremap()". ioremap() is the proper name, but I
-didn't think straight when I wrote it originally. People who have to
-support both can do something like::
-
- /* support old naming silliness */
- #if LINUX_VERSION_CODE < 0x020100
- #define ioremap vremap
- #define iounmap vfree
- #endif
-
-at the top of their source files, and then they can use the right names
-even on 2.0.x systems.
-
-And the above sounds worse than it really is. Most real drivers really
-don't do all that complex things (or rather: the complexity is not so
-much in the actual IO accesses as in error handling and timeouts etc).
-It's generally not hard to fix drivers, and in many cases the code
-actually looks better afterwards::
-
- unsigned long signature = *(unsigned int *) 0xC0000;
- vs
- unsigned long signature = readl(0xC0000);
-
-I think the second version actually is more readable, no?
diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst
index 8aed9103e48a..5c0552e78c58 100644
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
@@ -326,6 +326,12 @@ maps this page at its virtual address.
dirty. Again, see sparc64 for examples of how
to deal with this.
+ ``void flush_dcache_folio(struct folio *folio)``
+ This function is called under the same circumstances as
+ flush_dcache_page(). It allows the architecture to
+ optimise for flushing the entire folio of pages instead
+ of flushing one page at a time.
+
``void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long user_vaddr, void *dst, void *src, int len)``
``void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst
index c6f4ba2fb32d..f75778d37488 100644
--- a/Documentation/core-api/cpu_hotplug.rst
+++ b/Documentation/core-api/cpu_hotplug.rst
@@ -560,7 +560,7 @@ available:
* cpuhp_state_remove_instance(state, node)
* cpuhp_state_remove_instance_nocalls(state, node)
-The arguments are the same as for the the cpuhp_state_add_instance*()
+The arguments are the same as for the cpuhp_state_add_instance*()
variants above.
The functions differ in the way how the installed callbacks are treated:
diff --git a/Documentation/core-api/dma-api-howto.rst b/Documentation/core-api/dma-api-howto.rst
index 358d495456d1..72f6cdb6be1c 100644
--- a/Documentation/core-api/dma-api-howto.rst
+++ b/Documentation/core-api/dma-api-howto.rst
@@ -185,7 +185,7 @@ device struct of your device is embedded in the bus-specific device struct of
your device. For example, &pdev->dev is a pointer to the device struct of a
PCI device (pdev is a pointer to the PCI device struct of your device).
-These calls usually return zero to indicated your device can perform DMA
+These calls usually return zero to indicate your device can perform DMA
properly on the machine given the address mask you provided, but they might
return an error if the mask is too small to be supportable on the given
system. If it returns non-zero, your device cannot perform DMA properly on
@@ -707,20 +707,6 @@ to use the dma_sync_*() interfaces::
}
}
-Drivers converted fully to this interface should not use virt_to_bus() any
-longer, nor should they use bus_to_virt(). Some drivers have to be changed a
-little bit, because there is no longer an equivalent to bus_to_virt() in the
-dynamic DMA mapping scheme - you have to always store the DMA addresses
-returned by the dma_alloc_coherent(), dma_pool_alloc(), and dma_map_single()
-calls (dma_map_sg() stores them in the scatterlist itself if the platform
-supports dynamic DMA mapping in hardware) in your driver structures and/or
-in the card registers.
-
-All drivers should be using these interfaces with no exceptions. It
-is planned to completely remove virt_to_bus() and bus_to_virt() as
-they are entirely deprecated. Some ports already do not provide these
-as it is impossible to correctly support them.
-
Handling Errors
===============
diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst
index 6d6d0edd2d27..829f20a193ca 100644
--- a/Documentation/core-api/dma-api.rst
+++ b/Documentation/core-api/dma-api.rst
@@ -206,6 +206,20 @@ others should not be larger than the returned value.
::
+ size_t
+ dma_opt_mapping_size(struct device *dev);
+
+Returns the maximum optimal size of a mapping for the device.
+
+Mapping larger buffers may take much longer in certain scenarios. In
+addition, for high-rate short-lived streaming mappings, the upfront time
+spent on the mapping may account for an appreciable part of the total
+request lifetime. As such, if splitting larger requests incurs no
+significant performance penalty, then device drivers are advised to
+limit the total length of their DMA streaming mappings to the returned value.
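+
+For example, a driver issuing large streaming mappings might clamp its
+per-request mapping length like this (a minimal sketch; ``dev`` and the
+request splitting logic are assumed to exist elsewhere in the driver)::
+
+        size_t opt = dma_opt_mapping_size(dev);
+
+        /* Split requests so no single streaming mapping exceeds 'opt' */
+        if (req_len > opt)
+                req_len = opt;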
+
+::
+
bool
dma_need_sync(struct device *dev, dma_addr_t dma_addr);
diff --git a/Documentation/core-api/entry.rst b/Documentation/core-api/entry.rst
new file mode 100644
index 000000000000..e12f22ab33c7
--- /dev/null
+++ b/Documentation/core-api/entry.rst
@@ -0,0 +1,279 @@
+Entry/exit handling for exceptions, interrupts, syscalls and KVM
+================================================================
+
+All transitions between execution domains require state updates which are
+subject to strict ordering constraints. State updates are required for the
+following:
+
+ * Lockdep
+ * RCU / Context tracking
+ * Preemption counter
+ * Tracing
+ * Time accounting
+
+The update order depends on the transition type and is explained below in
+the transition type sections: `Syscalls`_, `KVM`_, `Interrupts and regular
+exceptions`_, `NMI and NMI-like exceptions`_.
+
+Non-instrumentable code - noinstr
+---------------------------------
+
+Most instrumentation facilities depend on RCU, so instrumentation is prohibited
+for entry code before RCU starts watching and exit code after RCU stops
+watching. In addition, many architectures must save and restore register state,
+which means that (for example) a breakpoint in the breakpoint entry code would
+overwrite the debug registers of the initial breakpoint.
+
+Such code must be marked with the 'noinstr' attribute, placing that code into a
+special section inaccessible to instrumentation and debug facilities. Some
+functions are partially instrumentable, which is handled by marking them
+noinstr and using instrumentation_begin() and instrumentation_end() to flag the
+instrumentable ranges of code:
+
+.. code-block:: c
+
+ noinstr void entry(void)
+ {
+ handle_entry(); // <-- must be 'noinstr' or '__always_inline'
+ ...
+
+ instrumentation_begin();
+ handle_context(); // <-- instrumentable code
+ instrumentation_end();
+
+ ...
+ handle_exit(); // <-- must be 'noinstr' or '__always_inline'
+ }
+
+This allows verification of the 'noinstr' restrictions via objtool on
+supported architectures.
+
+Invoking non-instrumentable functions from instrumentable context has no
+restrictions and is useful to protect e.g. state switching which would
+cause malfunction if instrumented.
+
+All non-instrumentable entry/exit code sections before and after the RCU
+state transitions must run with interrupts disabled.
+
+Syscalls
+--------
+
+Syscall-entry code starts in assembly code and calls out into low-level C code
+after establishing low-level architecture-specific state and stack frames. This
+low-level C code must not be instrumented. A typical syscall handling function
+invoked from low-level assembly code looks like this:
+
+.. code-block:: c
+
+ noinstr void syscall(struct pt_regs *regs, int nr)
+ {
+ arch_syscall_enter(regs);
+ nr = syscall_enter_from_user_mode(regs, nr);
+
+ instrumentation_begin();
+ if (!invoke_syscall(regs, nr) && nr != -1)
+ result_reg(regs) = __sys_ni_syscall(regs);
+ instrumentation_end();
+
+ syscall_exit_to_user_mode(regs);
+ }
+
+syscall_enter_from_user_mode() first invokes enter_from_user_mode() which
+establishes state in the following order:
+
+ * Lockdep
+ * RCU / Context tracking
+ * Tracing
+
+and then invokes the various entry work functions like ptrace, seccomp, audit,
+syscall tracing, etc. After all that is done, the instrumentable invoke_syscall
+function can be invoked. The instrumentable code section then ends, after which
+syscall_exit_to_user_mode() is invoked.
+
+syscall_exit_to_user_mode() handles all work which needs to be done before
+returning to user space like tracing, audit, signals, task work etc. After
+that it invokes exit_to_user_mode() which again handles the state
+transition in the reverse order:
+
+ * Tracing
+ * RCU / Context tracking
+ * Lockdep
+
+syscall_enter_from_user_mode() and syscall_exit_to_user_mode() are also
+available as fine-grained subfunctions in cases where the architecture code
+has to do extra work between the various steps. In such cases it has to
+ensure that enter_from_user_mode() is called first on entry and
+exit_to_user_mode() is called last on exit.
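+
+For instance, an architecture that needs to do extra work with interrupts
+enabled between the two steps could use the prepare/work split from
+include/linux/entry-common.h roughly like this (a sketch only; the exact
+split is architecture-specific):
+
+.. code-block:: c
+
+   noinstr void syscall(struct pt_regs *regs, int nr)
+   {
+           syscall_enter_from_user_mode_prepare(regs);
+
+           instrumentation_begin();
+           /* arch-specific work, e.g. fetching additional arguments */
+           nr = syscall_enter_from_user_mode_work(regs, nr);
+           /* ... invoke the syscall ... */
+           instrumentation_end();
+
+           syscall_exit_to_user_mode(regs);
+   }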
+
+Do not nest syscalls. Nested syscalls will cause RCU and/or context tracking
+to print a warning.
+
+KVM
+---
+
+Entering or exiting guest mode is very similar to syscalls. From the host
+kernel point of view the CPU goes off into user space when entering the
+guest and returns to the kernel on exit.
+
+kvm_guest_enter_irqoff() is a KVM-specific variant of exit_to_user_mode()
+and kvm_guest_exit_irqoff() is the KVM variant of enter_from_user_mode().
+The state operations have the same ordering.
+
+Task work handling is done separately for guest mode at the boundary of the
+vcpu_run() loop via xfer_to_guest_mode_handle_work(), which runs a subset of
+the work handled on return to user space.
+
+Do not nest KVM entry/exit transitions because doing so is nonsensical.
+
+Interrupts and regular exceptions
+---------------------------------
+
+Interrupt entry and exit handling is slightly more complex than that of syscalls
+and KVM transitions.
+
+If an interrupt is raised while the CPU executes in user space, the entry
+and exit handling is exactly the same as for syscalls.
+
+If the interrupt is raised while the CPU executes in kernel space the entry and
+exit handling is slightly different. RCU state is only updated when the
+interrupt is raised in the context of the CPU's idle task. Otherwise, RCU will
+already be watching. Lockdep and tracing have to be updated unconditionally.
+
+irqentry_enter() and irqentry_exit() provide the implementation for this.
+
+The architecture-specific part looks similar to syscall handling:
+
+.. code-block:: c
+
+ noinstr void interrupt(struct pt_regs *regs, int nr)
+ {
+ arch_interrupt_enter(regs);
+ state = irqentry_enter(regs);
+
+ instrumentation_begin();
+
+ irq_enter_rcu();
+ invoke_irq_handler(regs, nr);
+ irq_exit_rcu();
+
+ instrumentation_end();
+
+ irqentry_exit(regs, state);
+ }
+
+Note that the invocation of the actual interrupt handler is within an
+irq_enter_rcu() and irq_exit_rcu() pair.
+
+irq_enter_rcu() updates the preemption count which makes in_hardirq()
+return true, handles NOHZ tick state and interrupt time accounting. This
+means that, up to the point where irq_enter_rcu() is invoked, in_hardirq()
+returns false.
+
+irq_exit_rcu() handles interrupt time accounting, undoes the preemption
+count update and eventually handles soft interrupts and NOHZ tick state.
+
+In theory, the preemption count could be updated in irqentry_enter(). In
+practice, deferring this update to irq_enter_rcu() allows the preemption-count
+code to be traced, while also maintaining symmetry with irq_exit_rcu() and
+irqentry_exit(), which are described in the next paragraph. The only downside
+is that the early entry code up to irq_enter_rcu() must be aware that the
+preemption count has not yet been updated with the HARDIRQ_OFFSET state.
+
+Note that irq_exit_rcu() must remove HARDIRQ_OFFSET from the preemption count
+before it handles soft interrupts, whose handlers must run in BH context rather
+than irq-disabled context. In addition, irqentry_exit() might schedule, which
+also requires that HARDIRQ_OFFSET has been removed from the preemption count.
+
+Even though interrupt handlers are expected to run with local interrupts
+disabled, interrupt nesting is common from an entry/exit perspective. For
+example, softirq handling happens within an irqentry_{enter,exit}() block with
+local interrupts enabled. Also, although uncommon, nothing prevents an
+interrupt handler from re-enabling interrupts.
+
+Interrupt entry/exit code doesn't strictly need to handle reentrancy, since it
+runs with local interrupts disabled. But NMIs can happen anytime, and a lot of
+the entry code is shared between the two.
+
+NMI and NMI-like exceptions
+---------------------------
+
+NMIs and NMI-like exceptions (machine checks, double faults, debug
+interrupts, etc.) can hit any context and must be extra careful with
+the state.
+
+State changes for debug exceptions and machine-check exceptions depend on
+whether these exceptions happened in user-space (breakpoints or watchpoints) or
+in kernel mode (code patching). From user-space, they are treated like
+interrupts, while from kernel mode they are treated like NMIs.
+
+NMIs and other NMI-like exceptions handle state transitions without
+distinguishing between user-mode and kernel-mode origin.
+
+The state update on entry is handled in irqentry_nmi_enter() which updates
+state in the following order:
+
+ * Preemption counter
+ * Lockdep
+ * RCU / Context tracking
+ * Tracing
+
+The exit counterpart irqentry_nmi_exit() does the reverse operation in the
+reverse order.
+
+Note that the update of the preemption counter has to be the first
+operation on enter and the last operation on exit. The reason is that both
+lockdep and RCU rely on in_nmi() returning true in this case. The
+preemption count modification in the NMI entry/exit case must not be
+traced.
+
+Architecture-specific code looks like this:
+
+.. code-block:: c
+
+ noinstr void nmi(struct pt_regs *regs)
+ {
+ arch_nmi_enter(regs);
+ state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+ nmi_handler(regs);
+ instrumentation_end();
+
+ irqentry_nmi_exit(regs);
+ }
+
+and for e.g. a debug exception it can look like this:
+
+.. code-block:: c
+
+ noinstr void debug(struct pt_regs *regs)
+ {
+ arch_nmi_enter(regs);
+
+ debug_regs = save_debug_regs();
+
+ if (user_mode(regs)) {
+ state = irqentry_enter(regs);
+
+ instrumentation_begin();
+ user_mode_debug_handler(regs, debug_regs);
+ instrumentation_end();
+
+ irqentry_exit(regs, state);
+ } else {
+ state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+ kernel_mode_debug_handler(regs, debug_regs);
+ instrumentation_end();
+
+ irqentry_nmi_exit(regs, state);
+ }
+ }
+
+There is no combined irqentry_nmi_if_kernel() function available as the
+above cannot be handled in an exception-agnostic way.
+
+NMIs can happen in any context; for example, an NMI-like exception can be
+triggered while handling an NMI. NMI entry code therefore has to be reentrant
+and state updates need to handle nesting.
diff --git a/Documentation/core-api/idr.rst b/Documentation/core-api/idr.rst
index 2eb5afdb9931..18d724867064 100644
--- a/Documentation/core-api/idr.rst
+++ b/Documentation/core-api/idr.rst
@@ -17,6 +17,9 @@ solution to the problem to avoid everybody inventing their own. The IDR
provides the ability to map an ID to a pointer, while the IDA provides
only ID allocation, and as a result is much more memory-efficient.
+The IDR interface is deprecated; please use the :doc:`XArray <xarray>`
+instead.
+
IDR usage
=========
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 5de2c7a4b1b3..7a3a08d81f11 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -18,9 +18,12 @@ it.
kernel-api
workqueue
+ watch_queue
printk-basics
printk-formats
+ printk-index
symbol-namespaces
+ asm-annotations
Data structures and low-level utilities
=======================================
@@ -34,15 +37,25 @@ Library functionality that is used throughout the kernel.
kref
assoc_array
xarray
+ maple_tree
idr
circular-buffers
rbtree
generic-radix-tree
packing
- bus-virt-phys-mapping
this_cpu_ops
timekeeping
errseq
+ wrappers/atomic_t
+ wrappers/atomic_bitops
+
+Low level entry and exit
+========================
+
+.. toctree::
+ :maxdepth: 1
+
+ entry
Concurrency primitives
======================
@@ -58,6 +71,7 @@ Documentation/locking/index.rst for more related documentation.
local_ops
padata
../RCU/index
+ wrappers/memory-barriers.rst
Low-level hardware management
=============================
@@ -77,7 +91,7 @@ Memory management
=================
How to allocate and use memory in the kernel. Note that there is a lot
-more memory-management documentation in Documentation/vm/index.rst.
+more memory-management documentation in Documentation/mm/index.rst.
.. toctree::
:maxdepth: 1
@@ -113,6 +127,7 @@ Documents that don't fit elsewhere or which have yet to be categorized.
:maxdepth: 1
librs
+ netlink
.. only:: subproject and html
diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst
index 9c0e8758037a..f88a6ee67a35 100644
--- a/Documentation/core-api/irq/irq-domain.rst
+++ b/Documentation/core-api/irq/irq-domain.rst
@@ -67,14 +67,11 @@ variety of methods:
deprecated
- generic_handle_domain_irq() handles an interrupt described by a
domain and a hwirq number
-- handle_domain_irq() does the same thing for root interrupt
- controllers and deals with the set_irq_reg()/irq_enter() sequences
- that most architecture requires
Note that irq domain lookups must happen in contexts that are
compatible with a RCU read-side critical section.
-The irq_create_mapping() function must be called *atleast once*
+The irq_create_mapping() function must be called *at least once*
before any call to irq_find_mapping(), lest the descriptor will not
be allocated.
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 2e7186805148..9b3f3e5f5a95 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -36,6 +36,9 @@ String Conversions
String Manipulation
-------------------
+.. kernel-doc:: include/linux/fortify-string.h
+ :internal:
+
.. kernel-doc:: lib/string.c
:export:
@@ -118,6 +121,12 @@ Text Searching
CRC and Math Functions in Linux
===============================
+Arithmetic Overflow Checking
+----------------------------
+
+.. kernel-doc:: include/linux/overflow.h
+ :internal:
+
CRC Functions
-------------
@@ -165,9 +174,6 @@ Division Functions
.. kernel-doc:: include/linux/math64.h
:internal:
-.. kernel-doc:: lib/math/div64.c
- :functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
-
.. kernel-doc:: lib/math/gcd.c
:export:
@@ -214,16 +220,34 @@ relay interface
Module Support
==============
-Module Loading
---------------
+Kernel module auto-loading
+--------------------------
-.. kernel-doc:: kernel/kmod.c
+.. kernel-doc:: kernel/module/kmod.c
:export:
+Module debugging
+----------------
+
+.. kernel-doc:: kernel/module/stats.c
+ :doc: module debugging statistics overview
+
+dup_failed_modules - tracks duplicate failed modules
+****************************************************
+
+.. kernel-doc:: kernel/module/stats.c
+ :doc: dup_failed_modules - tracks duplicate failed modules
+
+module statistics debugfs counters
+**********************************
+
+.. kernel-doc:: kernel/module/stats.c
+ :doc: module statistics debugfs counters
+
Inter Module support
--------------------
-Refer to the file kernel/module.c for more information.
+Refer to the files in kernel/module/ for more information.
Hardware Interfaces
===================
@@ -279,6 +303,7 @@ Accounting Framework
Block Devices
=============
+.. kernel-doc:: include/linux/bio.h
.. kernel-doc:: block/blk-core.c
:export:
@@ -294,9 +319,6 @@ Block Devices
.. kernel-doc:: block/blk-settings.c
:export:
-.. kernel-doc:: block/blk-exec.c
- :export:
-
.. kernel-doc:: block/blk-flush.c
:export:
diff --git a/Documentation/core-api/kobject.rst b/Documentation/core-api/kobject.rst
index 2739f8b72575..7310247310a0 100644
--- a/Documentation/core-api/kobject.rst
+++ b/Documentation/core-api/kobject.rst
@@ -118,7 +118,7 @@ Initialization of kobjects
Code which creates a kobject must, of course, initialize that object. Some
of the internal fields are setup with a (mandatory) call to kobject_init()::
- void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
+ void kobject_init(struct kobject *kobj, const struct kobj_type *ktype);
The ktype is required for a kobject to be created properly, as every kobject
must have an associated kobj_type. After calling kobject_init(), to
@@ -156,7 +156,7 @@ kobject_name()::
There is a helper function to both initialize and add the kobject to the
kernel at the same time, called surprisingly enough kobject_init_and_add()::
- int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
+ int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
struct kobject *parent, const char *fmt, ...);
The arguments are the same as the individual kobject_init() and
@@ -299,7 +299,6 @@ kobj_type::
struct kobj_type {
void (*release)(struct kobject *kobj);
const struct sysfs_ops *sysfs_ops;
- struct attribute **default_attrs;
const struct attribute_group **default_groups;
const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
const void *(*namespace)(struct kobject *kobj);
@@ -313,10 +312,10 @@ call kobject_init() or kobject_init_and_add().
The release field in struct kobj_type is, of course, a pointer to the
release() method for this type of kobject. The other two fields (sysfs_ops
-and default_attrs) control how objects of this type are represented in
+and default_groups) control how objects of this type are represented in
sysfs; they are beyond the scope of this document.
-The default_attrs pointer is a list of default attributes that will be
+The default_groups pointer is a list of default attributes that will be
automatically created for any kobject that is registered with this ktype.
@@ -373,10 +372,9 @@ If a kset wishes to control the uevent operations of the kobjects
associated with it, it can use the struct kset_uevent_ops to handle it::
struct kset_uevent_ops {
- int (* const filter)(struct kset *kset, struct kobject *kobj);
- const char *(* const name)(struct kset *kset, struct kobject *kobj);
- int (* const uevent)(struct kset *kset, struct kobject *kobj,
- struct kobj_uevent_env *env);
+ int (* const filter)(struct kobject *kobj);
+ const char *(* const name)(struct kobject *kobj);
+ int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env);
};
diff --git a/Documentation/core-api/local_ops.rst b/Documentation/core-api/local_ops.rst
index 2ac3f9f29845..0b42ceaaf3c4 100644
--- a/Documentation/core-api/local_ops.rst
+++ b/Documentation/core-api/local_ops.rst
@@ -191,7 +191,7 @@ Here is a sample module which implements a basic per cpu counter using
static void __exit test_exit(void)
{
- del_timer_sync(&test_timer);
+ timer_shutdown_sync(&test_timer);
}
module_init(test_init);
diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst
new file mode 100644
index 000000000000..45defcf15da7
--- /dev/null
+++ b/Documentation/core-api/maple_tree.rst
@@ -0,0 +1,217 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+
+==========
+Maple Tree
+==========
+
+:Author: Liam R. Howlett
+
+Overview
+========
+
+The Maple Tree is a B-Tree data type which is optimized for storing
+non-overlapping ranges, including ranges of size 1. The tree was designed to
+be simple to use and does not require a user-written search method. It
+supports iterating over a range of entries and going to the previous or next
+entry in a cache-efficient manner. The tree can also be put into an RCU-safe
+mode of operation which allows reading and writing concurrently. Writers must
+synchronize on a lock, which can be the default spinlock, or the user can set
+the lock to an external lock of a different type.
+
+The Maple Tree maintains a small memory footprint and was designed to use
+modern processor cache efficiently. The majority of the users will be able to
+use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex
+scenarios. The most important usage of the Maple Tree is the tracking of the
+virtual memory areas.
+
+The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple
+Tree reserves values with the bottom two bits set to '10' which are below 4096
+(i.e. 2, 6, 10 .. 4094) for internal use. If stored entries might collide
+with these reserved values, users can convert the entries using xa_mk_value()
+and convert them back by calling xa_to_value(). If a user needs to store a
+reserved value, it can be done through the :ref:`maple-tree-advanced-api`,
+but such values are rejected by the normal API.
+
+The Maple Tree can also be configured to support searching for a gap of a given
+size (or larger).
+
+Pre-allocation of nodes is also supported using the
+:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a
+successful store operation within a given code segment in which memory
+allocation cannot be performed. Node allocations are relatively small, at
+around 256 bytes.
+
+.. _maple-tree-normal-api:
+
+Normal API
+==========
+
+Start by initialising a maple tree, either with DEFINE_MTREE() for statically
+allocated maple trees or mt_init() for dynamically allocated ones. A
+freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0``
+- ``ULONG_MAX``. There are currently two types of maple trees supported: the
+allocation tree and the regular tree. The regular tree has a higher branching
+factor for internal nodes. The allocation tree has a lower branching factor
+but allows the user to search for a gap of a given size or larger from either
+``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by
+passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree.
+
+You can then set entries using mtree_store() or mtree_store_range().
+mtree_store() will overwrite any entry with the new entry and return 0 on
+success or an error code otherwise. mtree_store_range() works in the same way
+but takes a range. mtree_load() is used to retrieve the entry stored at a
+given index. You can use mtree_erase() to erase an entire range while only
+knowing one index within that range; alternatively, mtree_store() may be
+called with a ``NULL`` entry to partially erase a range or many ranges at once.
+
+If you want to only store a new entry to a range (or index) if that range is
+currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which
+return -EEXIST if the range is not empty.
+
+You can search for an entry from an index upwards by using mt_find().
+
+You can walk each entry within a range by calling mt_for_each(). You must
+provide a temporary variable to store a cursor. If you want to walk each
+element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If
+the caller is going to hold the lock for the duration of the walk then it is
+worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api`
+section.
+
+Sometimes it is necessary to ensure the next call to store to a maple tree does
+not allocate memory; see :ref:`maple-tree-advanced-api` for this use case.
+
+Finally, you can remove all entries from a maple tree by calling
+mtree_destroy(). If the maple tree entries are pointers, you may wish to free
+the entries first.
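+
+A minimal sketch tying the above together (error handling abbreviated; the
+values are encoded with xa_mk_value() so they avoid the reserved ranges):
+
+.. code-block:: c
+
+   static DEFINE_MTREE(mt);
+
+   int example(void)
+   {
+           void *entry;
+           unsigned long index = 0;
+           int ret;
+
+           /* Store a value for the range 5-10 */
+           ret = mtree_store_range(&mt, 5, 10, xa_mk_value(42), GFP_KERNEL);
+           if (ret)
+                   return ret;
+
+           /* Any index inside the range finds the entry */
+           entry = mtree_load(&mt, 7);
+
+           /* Walk every entry in the tree */
+           mt_for_each(&mt, entry, index, ULONG_MAX)
+                   pr_info("%lu: %lu\n", index, xa_to_value(entry));
+
+           mtree_destroy(&mt);
+           return 0;
+   }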
+
+Allocating Nodes
+----------------
+
+The allocations are handled by the internal tree code. See
+:ref:`maple-tree-advanced-alloc` for other options.
+
+Locking
+-------
+
+You do not have to worry about locking. See :ref:`maple-tree-advanced-locks`
+for other options.
+
+The Maple Tree uses RCU and an internal spinlock to synchronise access:
+
+Takes RCU read lock:
+ * mtree_load()
+ * mt_find()
+ * mt_for_each()
+ * mt_next()
+ * mt_prev()
+
+Takes ma_lock internally:
+ * mtree_store()
+ * mtree_store_range()
+ * mtree_insert()
+ * mtree_insert_range()
+ * mtree_erase()
+ * mtree_destroy()
+ * mt_set_in_rcu()
+ * mt_clear_in_rcu()
+
+If you want to take advantage of the internal lock to protect the data
+structures that you are storing in the Maple Tree, you can call mtree_lock()
+before calling mtree_load(), then take a reference count on the object you
+have found before calling mtree_unlock(). This will prevent stores from
+removing the object from the tree between looking up the object and
+incrementing the refcount. You can also use RCU to avoid dereferencing
+freed memory, but an explanation of that is beyond the scope of this
+document.
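+
+A sketch of that pattern, assuming the stored objects embed a struct kref
+named ``ref`` (the ``struct foo`` here is purely illustrative):
+
+.. code-block:: c
+
+   struct foo {
+           struct kref ref;
+           /* ... payload ... */
+   };
+
+   struct foo *foo_get(struct maple_tree *mt, unsigned long index)
+   {
+           struct foo *f;
+
+           mtree_lock(mt);
+           f = mtree_load(mt, index);
+           if (f)
+                   kref_get(&f->ref);
+           mtree_unlock(mt);
+
+           return f;
+   }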
+
+.. _maple-tree-advanced-api:
+
+Advanced API
+============
+
+The advanced API offers more flexibility and better performance at the
+cost of an interface which can be harder to use and has fewer safeguards.
+You must take care of your own locking while using the advanced API.
+You can use the ma_lock, RCU or an external lock for protection.
+You can mix advanced and normal operations on the same array, as long
+as the locking is compatible. The :ref:`maple-tree-normal-api` is implemented
+in terms of the advanced API.
+
+The advanced API is based around the ma_state; this is where the 'mas'
+prefix originates. The ma_state struct keeps track of tree operations to make
+life easier for both internal and external tree users.
+
+Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`.
+Please see above.
+
+The maple state keeps track of the range start and end in mas->index and
+mas->last, respectively.
+
+mas_walk() will walk the tree to the location of mas->index and set
+mas->index and mas->last according to the range for the entry.
+
+You can set entries using mas_store(). mas_store() will overwrite any entry
+with the new entry and return the first existing entry that is overwritten.
+The range is passed in as members of the maple state: index and last.
+
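+For example, a write over the range 10 - 19 might look like this sketch;
+mas_store_gfp() is used here as it also takes care of node allocation, and
+``new_entry`` is assumed to be a pointer to your own data::
+
+	MA_STATE(mas, &mt, 10, 19);
+	int ret;
+
+	mas_lock(&mas);
+	ret = mas_store_gfp(&mas, new_entry, GFP_KERNEL);
+	mas_unlock(&mas);
+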
+You can use mas_erase() to erase an entire range by setting index and
+last of the maple state to the desired range to erase. This will erase
+the first range that is found within that range, set the maple state index
+and last to the range that was erased, and return the entry that existed
+at that location.
+
+You can walk each entry within a range by using mas_for_each(). If you want
+to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as
+the range. If the lock needs to be periodically dropped, see mas_pause() in
+the locking section.
+
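+A read-side walk under RCU might look like this (an illustrative sketch)::
+
+	MA_STATE(mas, &mt, 0, 0);
+	void *entry;
+
+	rcu_read_lock();
+	mas_for_each(&mas, entry, ULONG_MAX) {
+		/* process entry; mas.index and mas.last frame its range */
+	}
+	rcu_read_unlock();
+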
+Using a maple state allows mas_next() and mas_prev() to function as if the
+tree were a linked list. With such a high branching factor the amortized
+performance penalty is outweighed by cache optimization. mas_next() will
+return the next entry which occurs after the entry at index. mas_prev()
+will return the previous entry which occurs before the entry at index.
+
+mas_find() will find the first entry which exists at or above index on
+the first call, and the next entry on every subsequent call.
+
+mas_find_rev() will find the first entry which exists at or below the last on
+the first call, and the previous entry on every subsequent call.
+
+If the user needs to yield the lock during an operation, then the maple state
+must be paused using mas_pause().
+
+There are a few extra interfaces provided when using an allocation tree.
+If you wish to search for a gap within a range, then mas_empty_area()
+or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap
+starting at the lowest index given up to the maximum of the range.
+mas_empty_area_rev() searches for a gap starting at the highest index given
+and continues downward to the lower bound of the range.
+
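+A sketch of a gap search, assuming a tree created with the
+``MT_FLAGS_ALLOC_RANGE`` flag::
+
+	MA_STATE(mas, &mt, 0, 0);
+
+	mas_lock(&mas);
+	/* Look for a 16-slot empty area within [0, 1023]. */
+	if (mas_empty_area(&mas, 0, 1023, 16) == 0) {
+		/* mas.index now holds the start of the gap */
+	}
+	mas_unlock(&mas);
+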
+.. _maple-tree-advanced-alloc:
+
+Advanced Allocating Nodes
+-------------------------
+
+Allocations are usually handled internally to the tree; however, if
+allocations need to occur before a write, calling mas_expected_entries() will
+allocate the worst-case number of nodes needed to insert the provided number
+of ranges. This also causes the tree to enter mass insertion mode. Once
+insertions are complete, calling mas_destroy() on the maple state will free
+the unused allocations.
+
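+A sketch of this bulk-insertion pattern (``nr_ranges`` stands for the number
+of ranges about to be written)::
+
+	MA_STATE(mas, &mt, 0, 0);
+
+	mas_lock(&mas);
+	if (!mas_expected_entries(&mas, nr_ranges)) {
+		/* ... perform the nr_ranges stores with mas_store() ... */
+	}
+	mas_destroy(&mas);	/* free any unused pre-allocations */
+	mas_unlock(&mas);
+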
+.. _maple-tree-advanced-locks:
+
+Advanced Locking
+----------------
+
+The maple tree uses a spinlock by default, but external locks can be used for
+tree updates as well. To use an external lock, the tree must be initialised
+with the ``MT_FLAGS_LOCK_EXTERN`` flag. This is usually done with the
+MTREE_INIT_EXT() #define, which takes an external lock as an argument.
+
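+A sketch of such a definition (the exact flags will depend on the use case)::
+
+	static DEFINE_SPINLOCK(my_lock);
+	static struct maple_tree my_tree =
+		MTREE_INIT_EXT(my_tree, MT_FLAGS_LOCK_EXTERN, my_lock);
+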
+Functions and structures
+========================
+
+.. kernel-doc:: include/linux/maple_tree.h
+.. kernel-doc:: lib/maple_tree.c
diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst
index 5954ddf6ee13..1c58d883b273 100644
--- a/Documentation/core-api/memory-allocation.rst
+++ b/Documentation/core-api/memory-allocation.rst
@@ -170,7 +170,16 @@ should be used if a part of the cache might be copied to the userspace.
After the cache is created kmem_cache_alloc() and its convenience
wrappers can allocate memory from that cache.
-When the allocated memory is no longer needed it must be freed. You can
-use kvfree() for the memory allocated with `kmalloc`, `vmalloc` and
-`kvmalloc`. The slab caches should be freed with kmem_cache_free(). And
-don't forget to destroy the cache with kmem_cache_destroy().
+When the allocated memory is no longer needed it must be freed.
+
+Objects allocated by `kmalloc` can be freed by `kfree` or `kvfree`. Objects
+allocated by `kmem_cache_alloc` can be freed with `kmem_cache_free`, `kfree`
+or `kvfree`, where the latter two might be more convenient thanks to not
+needing the kmem_cache pointer.
+
+The same rules apply to the _bulk and _rcu flavors of the freeing functions.
+
+Memory allocated by `vmalloc` can be freed with `vfree` or `kvfree`.
+Memory allocated by `kvmalloc` can be freed with `kvfree`.
+Caches created by `kmem_cache_create` should be destroyed with
+`kmem_cache_destroy` only after all the allocated objects have been freed.
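+
+For example, the following pairings are all valid (an illustrative sketch)::
+
+	void *buf = kmalloc(64, GFP_KERNEL);
+	void *big = kvmalloc(1 << 20, GFP_KERNEL);	/* may fall back to vmalloc */
+
+	kfree(buf);	/* kvfree(buf) would also be correct */
+	kvfree(big);	/* correct for both kmalloc- and vmalloc-backed memory */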
diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst
index de7467e48067..682259ee633a 100644
--- a/Documentation/core-api/memory-hotplug.rst
+++ b/Documentation/core-api/memory-hotplug.rst
@@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify::
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
- int status_change_nid_high;
int status_change_nid;
}
@@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify::
- nr_pages is # of pages of online/offline memory.
- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
is (will be) set/clear, if this is -1, then nodemask status is not changed.
-- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
- is (will be) set/clear, if this is -1, then nodemask status is not changed.
- status_change_nid is set node id when N_MEMORY of nodemask is (will be)
set/clear. It means a new(memoryless) node gets new memory by online and a
node loses all memory. If this is -1, then nodemask status is not changed.
diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index a42f9baddfbf..f5dde5bceaea 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -19,19 +19,16 @@ User Space Memory Access
Memory Allocation Controls
==========================
-.. kernel-doc:: include/linux/gfp.h
- :internal:
-
-.. kernel-doc:: include/linux/gfp.h
+.. kernel-doc:: include/linux/gfp_types.h
:doc: Page mobility and placement hints
-.. kernel-doc:: include/linux/gfp.h
+.. kernel-doc:: include/linux/gfp_types.h
:doc: Watermark modifiers
-.. kernel-doc:: include/linux/gfp.h
+.. kernel-doc:: include/linux/gfp_types.h
:doc: Reclaim modifiers
-.. kernel-doc:: include/linux/gfp.h
+.. kernel-doc:: include/linux/gfp_types.h
:doc: Useful GFP flag combinations
The Slab Cache
@@ -58,15 +55,30 @@ Virtually Contiguous Mappings
File Mapping and Page Cache
===========================
-.. kernel-doc:: mm/readahead.c
- :export:
+Filemap
+-------
.. kernel-doc:: mm/filemap.c
:export:
+Readahead
+---------
+
+.. kernel-doc:: mm/readahead.c
+ :doc: Readahead Overview
+
+.. kernel-doc:: mm/readahead.c
+ :export:
+
+Writeback
+---------
+
.. kernel-doc:: mm/page-writeback.c
:export:
+Truncate
+--------
+
.. kernel-doc:: mm/truncate.c
:export:
@@ -95,6 +107,11 @@ More Memory Management Functions
.. kernel-doc:: mm/mempolicy.c
.. kernel-doc:: include/linux/mm_types.h
:internal:
+.. kernel-doc:: include/linux/mm_inline.h
+.. kernel-doc:: include/linux/page-flags.h
.. kernel-doc:: include/linux/mm.h
:internal:
+.. kernel-doc:: include/linux/page_ref.h
.. kernel-doc:: include/linux/mmzone.h
+.. kernel-doc:: mm/util.c
+ :functions: folio_mapping
diff --git a/Documentation/core-api/netlink.rst b/Documentation/core-api/netlink.rst
new file mode 100644
index 000000000000..e4a938a05cc9
--- /dev/null
+++ b/Documentation/core-api/netlink.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+
+.. _kernel_netlink:
+
+===================================
+Netlink notes for kernel developers
+===================================
+
+General guidance
+================
+
+Attribute enums
+---------------
+
+Older families often define "null" attributes and commands with a value
+of ``0``, named ``unspec``. This is supported (``type: unused``)
+but should be avoided in new families. The ``unspec`` enum values are
+not used in practice, so just set the value of the first attribute to ``1``.
+
+Message enums
+-------------
+
+Use the same command IDs for requests and replies. This makes it easier
+to match them up, and we have plenty of ID space.
+
+Use separate command IDs for notifications. This makes it easier to
+sort the notifications from replies (and present them to the user
+application via a different API than replies).
+
+Answer requests
+---------------
+
+Older families do not reply to all of the commands, especially NEW / ADD
+commands. The user only learns whether the operation succeeded or not via
+the ACK. Try to find useful data to return. Once the command is added,
+whether it replies with a full message or only an ACK is uAPI and cannot
+be changed. It's better to err on the side of replying.
+
+Specifically NEW and ADD commands should reply with information identifying
+the created object such as the allocated object's ID (without having to
+resort to using ``NLM_F_ECHO``).
+
+NLM_F_ECHO
+----------
+
+Make sure to pass the request info to genl_notify() to allow ``NLM_F_ECHO``
+to take effect. This is useful for programs that need precise feedback
+from the kernel (for example for logging purposes).
+
+Support dump consistency
+------------------------
+
+If iterating over objects during a dump may skip over objects or repeat
+them, make sure to report the dump inconsistency with ``NLM_F_DUMP_INTR``.
+This is usually implemented by maintaining a generation id for the
+structure and recording it in the ``seq`` member of struct netlink_callback.
+
+Netlink specification
+=====================
+
+Documentation of the Netlink specification parts which are only relevant
+to the kernel space.
+
+Globals
+-------
+
+kernel-policy
+~~~~~~~~~~~~~
+
+Defines if the kernel validation policy is per operation (``per-op``)
+or for the entire family (``global``). New families should use ``per-op``
+(default) to be able to narrow down the attributes accepted by a specific
+command.
+
+checks
+------
+
+Documentation for the ``checks`` sub-sections of attribute specs.
+
+unterminated-ok
+~~~~~~~~~~~~~~~
+
+Accept strings without null-termination (for legacy families only).
+Switches from the ``NLA_NUL_STRING`` to ``NLA_STRING`` policy type.
+
+max-len
+~~~~~~~
+
+Defines the max length for a binary or string attribute (corresponding
+to the ``len`` member of struct nla_policy). For string attributes the
+terminating null character is not counted towards ``max-len``.
+
+The field may either be a literal integer value or the name of a defined
+constant. String types may reduce the constant by one
+(i.e. specify ``max-len: CONST - 1``) to reserve space for the terminating
+character, so implementations should recognize such a pattern.
+
+min-len
+~~~~~~~
+
+Similar to ``max-len`` but defines minimum length.
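+
+As an illustrative sketch (the ``MY_*`` names are hypothetical), these checks
+roughly correspond to ``struct nla_policy`` entries such as::
+
+	static const struct nla_policy my_policy[MY_ATTR_MAX + 1] = {
+		/* max-len: MY_NAME_MAX - 1 reserves room for the NUL */
+		[MY_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = MY_NAME_MAX - 1 },
+		/* max-len: 64 for a binary attribute */
+		[MY_ATTR_DATA] = { .type = NLA_BINARY, .len = 64 },
+	};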
diff --git a/Documentation/core-api/packing.rst b/Documentation/core-api/packing.rst
index d8c341fe383e..3ed13bc9a195 100644
--- a/Documentation/core-api/packing.rst
+++ b/Documentation/core-api/packing.rst
@@ -161,6 +161,6 @@ xxx_packing() that calls it using the proper QUIRK_* one-hot bits set.
The packing() function returns an int-encoded error code, which protects the
programmer against incorrect API use. The errors are not expected to occur
-durring runtime, therefore it is reasonable for xxx_packing() to return void
+during runtime, therefore it is reasonable for xxx_packing() to return void
and simply swallow those errors. Optionally it can dump stack or print the
error description.
diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst
index 35175710b43c..05b73c6c105f 100644
--- a/Documentation/core-api/padata.rst
+++ b/Documentation/core-api/padata.rst
@@ -42,7 +42,7 @@ padata_shells associated with it, each allowing a separate series of jobs.
Modifying cpumasks
------------------
-The CPUs used to run jobs can be changed in two ways, programatically with
+The CPUs used to run jobs can be changed in two ways, programmatically with
padata_set_cpumask() or via sysfs. The former is defined::
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
index fcf605be43d0..9fb0b1080d3b 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -55,18 +55,17 @@ flags the caller provides. The caller is required to pass in a non-null struct
pages* array, and the function then pins pages by incrementing each by a special
value: GUP_PIN_COUNTING_BIAS.
-For huge pages (and in fact, any compound page of more than 2 pages), the
-GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting
-is achieved, by using the 3rd struct page in the compound page. A new struct
-page field, hpage_pinned_refcount, has been added in order to support this.
-
-This approach for compound pages avoids the counting upper limit problems that
-are discussed below. Those limitations would have been aggravated severely by
-huge pages, because each tail page adds a refcount to the head page. And in
-fact, testing revealed that, without a separate hpage_pinned_refcount field,
-page overflows were seen in some huge page stress tests.
-
-This also means that huge pages and compound pages (of order > 1) do not suffer
+For large folios, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead,
+the extra space available in the struct folio is used to store the
+pincount directly.
+
+This approach for large folios avoids the counting upper limit problems
+that are discussed below. Those limitations would have been aggravated
+severely by huge pages, because each tail page adds a refcount to the
+head page. And in fact, testing revealed that, without a separate pincount
+field, refcount overflows were seen in some huge page stress tests.
+
+This also means that huge pages and large folios do not suffer
from the false positives problem that is mentioned below.::
Function
@@ -221,7 +220,7 @@ Unit testing
============
This file::
- tools/testing/selftests/vm/gup_test.c
+ tools/testing/selftests/mm/gup_test.c
has the following new calls to exercise the new pin*() wrapper functions:
@@ -264,9 +263,9 @@ place.)
Other diagnostics
=================
-dump_page() has been enhanced slightly, to handle these new counting fields, and
-to better report on compound pages in general. Specifically, for compound pages
-with order > 1, the exact (hpage_pinned_refcount) pincount is reported.
+dump_page() has been enhanced slightly to handle these new counting
+fields, and to better report on large folios in general. Specifically,
+for large folios, the exact pincount is reported.
References
==========
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index e08bbe9b0cbf..dfe7e75a71de 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -575,20 +575,26 @@ The field width is passed by value, the bitmap is passed by reference.
Helper macros cpumask_pr_args() and nodemask_pr_args() are available to ease
printing cpumask and nodemask.
-Flags bitfields such as page flags, gfp_flags
----------------------------------------------
+Flags bitfields such as page flags, page_type, gfp_flags
+--------------------------------------------------------
::
- %pGp referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1fffff
+ %pGp 0x17ffffc0002036(referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1fffff)
+ %pGt 0xffffff7f(buddy)
%pGg GFP_USER|GFP_DMA32|GFP_NOWARN
%pGv read|exec|mayread|maywrite|mayexec|denywrite
For printing flags bitfields as a collection of symbolic constants that
would construct the value. The type of flags is given by the third
-character. Currently supported are [p]age flags, [v]ma_flags (both
-expect ``unsigned long *``) and [g]fp_flags (expects ``gfp_t *``). The flag
-names and print order depends on the particular type.
+character. Currently supported are:
+
+ - p - [p]age flags, expects value of type (``unsigned long *``)
+ - t - page [t]ype, expects value of type (``unsigned int *``)
+ - v - [v]ma_flags, expects value of type (``unsigned long *``)
+ - g - [g]fp_flags, expects value of type (``gfp_t *``)
+
+The flag names and print order depends on the particular type.
Note that this format should not be used directly in the
:c:func:`TP_printk()` part of a tracepoint. Instead, use the show_*_flags()
@@ -625,6 +631,16 @@ Examples::
%p4cc Y10 little-endian (0x20303159)
%p4cc NV12 big-endian (0xb231564e)
+Rust
+----
+
+::
+
+ %pA
+
+Only intended to be used from Rust code to format ``core::fmt::Arguments``.
+Do *not* use it from C.
+
Thanks
======
diff --git a/Documentation/core-api/printk-index.rst b/Documentation/core-api/printk-index.rst
new file mode 100644
index 000000000000..3062f37d119b
--- /dev/null
+++ b/Documentation/core-api/printk-index.rst
@@ -0,0 +1,137 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Printk Index
+============
+
+There are many ways to monitor the state of the system. One important
+source of information is the system log. It provides a lot of information,
+including more or less important warnings and error messages.
+
+There are monitoring tools that filter and take action based on messages
+logged.
+
+The kernel messages are evolving together with the code. As a result,
+particular kernel messages are not KABI and never will be!
+
+This is a huge challenge for maintaining system log monitors. It requires
+knowing what messages were updated in a particular kernel version and why.
+Finding these changes in the sources would require non-trivial parsers.
+Also, it would require matching the sources with the binary kernel, which
+is not always trivial. Various changes might be backported. Various kernel
+versions might be used on different monitored systems.
+
+This is where the printk index feature might become useful. It provides
+a dump of all the printk formats used in the source code of the kernel
+and modules on the running system. It is accessible at runtime via debugfs.
+
+The printk index helps to find changes in the message formats. Also it helps
+to track the strings back to the kernel sources and the related commit.
+
+
+User Interface
+==============
+
+The index of printk formats is split into separate files. The files are
+named according to the binaries where the printk formats are built-in. There
+is always "vmlinux" and optionally also modules, for example::
+
+ /sys/kernel/debug/printk/index/vmlinux
+ /sys/kernel/debug/printk/index/ext4
+ /sys/kernel/debug/printk/index/scsi_mod
+
+Note that only loaded modules are shown. Also printk formats from a module
+might appear in "vmlinux" when the module is built-in.
+
+The content is inspired by the dynamic debug interface and looks like::
+
+ $> head -1 /sys/kernel/debug/printk/index/vmlinux; shuf -n 5 vmlinux
+ # <level[,flags]> filename:line function "format"
+ <5> block/blk-settings.c:661 disk_stack_limits "%s: Warning: Device %s is misaligned\n"
+ <4> kernel/trace/trace.c:8296 trace_create_file "Could not create tracefs '%s' entry\n"
+ <6> arch/x86/kernel/hpet.c:144 _hpet_print_config "hpet: %s(%d):\n"
+ <6> init/do_mounts.c:605 prepare_namespace "Waiting for root device %s...\n"
+ <6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: auto-serialization disabled\n"
+
+, where the meaning is:
+
+ - :level: log level value: 0-7 for particular severity, -1 as default,
+ 'c' as continuous line without an explicit log level
+ - :flags: optional flags: currently only 'c' for KERN_CONT
+ - :filename\:line: source filename and line number of the related
+ printk() call. Note that there are many wrappers, for example,
+ pr_warn(), pr_warn_once(), dev_warn().
+ - :function: function name where the printk() call is used.
+ - :format: format string
+
+The extra information makes it a bit harder to find differences
+between various kernels, especially since the line number might change
+very often. On the other hand, it helps a lot to confirm that
+it is the same string, or to find the commit responsible
+for any changes.
+
+
+printk() Is Not a Stable KABI
+=============================
+
+Several developers are afraid that exporting all these implementation
+details into the user space will transform particular printk() calls
+into KABI.
+
+But it is exactly the opposite. printk() calls must _not_ be KABI.
+And the printk index helps user space tools to deal with this.
+
+
+Subsystem specific printk wrappers
+==================================
+
+The printk index is generated using extra metadata stored in a dedicated
+ELF section ``.printk_index``. The metadata are emitted by macro wrappers
+that call __printk_index_emit() together with the real printk() call. The
+same technique is also used for the metadata used by the dynamic debug
+feature.
+
+The metadata are stored for a particular message only when it is printed
+using these special wrappers. It is implemented for the commonly
+used printk() calls, including, for example, pr_warn() or pr_once().
+
+Additional changes are necessary for various subsystem specific wrappers
+that call the original printk() via a common helper function. These need
+their own wrappers adding __printk_index_emit().
+
+Only a few subsystem specific wrappers have been updated so far,
+for example, dev_printk(). As a result, the printk formats from
+some subsystems can be missing in the printk index.
+
+
+Subsystem specific prefix
+=========================
+
+The pr_fmt() macro allows defining a prefix that is printed
+before the string generated by the related printk() calls.
+
+Subsystem specific wrappers usually add even more complicated
+prefixes.
+
+These prefixes can be stored into the printk index metadata
+by an optional parameter of __printk_index_emit(). The debugfs
+interface might then show the printk formats including these prefixes.
+For example, drivers/acpi/osl.c contains::
+
+ #define pr_fmt(fmt) "ACPI: OSL: " fmt
+
+ static int __init acpi_no_auto_serialize_setup(char *str)
+ {
+ acpi_gbl_auto_serialize_methods = FALSE;
+ pr_info("Auto-serialization disabled\n");
+
+ return 1;
+ }
+
+This results in the following printk index entry::
+
+ <6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: OSL: Auto-serialization disabled\n"
+
+This helps to match messages from the real log with the printk index.
+The source file name, line number, and function name can then
+be used to match the string with the source code.
diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst
index ec575e72d0b2..bf28ac0401f3 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -4,31 +4,29 @@
Memory Protection Keys
======================
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
-
-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains. It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key. Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
+Memory Protection Keys provide a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables when an
+application changes protection domains.
+
+Pkeys Userspace (PKU) is a feature which can be found on:
+ * Intel server CPUs, Skylake and later
+ * Intel client CPUs, Tiger Lake (11th Gen Core) and later
+ * Future AMD CPUs
+
+Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.
+
+Protections for each key are defined with a per-CPU user-accessible register
+(PKRU). Each of these is a 32-bit register storing two bits (Access Disable
+and Write Disable) for each of 16 keys.
+
+Being a CPU register, PKRU is inherently thread-local, potentially giving each
thread a different set of protections from every other thread.
-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register. The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs. These
-permissions are enforced on data access only and have no effect on
-instruction fetches.
+There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
+register. The feature is only available in 64-bit mode, even though there is
+theoretically space in the PAE PTEs. These permissions are enforced on data
+access only and have no effect on instruction fetches.
Syscalls
========
diff --git a/Documentation/core-api/symbol-namespaces.rst b/Documentation/core-api/symbol-namespaces.rst
index 5ad9e0abe42c..12e4aecdae94 100644
--- a/Documentation/core-api/symbol-namespaces.rst
+++ b/Documentation/core-api/symbol-namespaces.rst
@@ -51,8 +51,8 @@ namespace ``USB_STORAGE``, use::
The corresponding ksymtab entry struct ``kernel_symbol`` will have the member
``namespace`` set accordingly. A symbol that is exported without a namespace will
refer to ``NULL``. There is no default namespace if none is defined. ``modpost``
-and kernel/module.c make use the namespace at build time or module load time,
-respectively.
+and kernel/module/main.c make use of the namespace at build time or module
+load time, respectively.
2.2 Using the DEFAULT_SYMBOL_NAMESPACE define
=============================================
diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 729e24864fe7..22ec68f24421 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -132,6 +132,7 @@ Some additional variants exist for more specialized cases:
.. c:function:: u64 ktime_get_mono_fast_ns( void )
u64 ktime_get_raw_fast_ns( void )
u64 ktime_get_boot_fast_ns( void )
+ u64 ktime_get_tai_fast_ns( void )
u64 ktime_get_real_fast_ns( void )
These variants are safe to call from any context, including from
diff --git a/Documentation/watch_queue.rst b/Documentation/core-api/watch_queue.rst
index 54f13ad5fc17..54f13ad5fc17 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/core-api/watch_queue.rst
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index 541d31de8926..8ec4d6270b24 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -216,10 +216,6 @@ resources, scheduled and executed.
This flag is meaningless for unbound wq.
-Note that the flag ``WQ_NON_REENTRANT`` no longer exists as all
-workqueues are now non-reentrant - any work item is guaranteed to be
-executed by at most one worker system-wide at any given time.
-
``max_active``
--------------
@@ -374,8 +370,8 @@ of possible problems:
The first one can be tracked using tracing: ::
- $ echo workqueue:workqueue_queue_work > /sys/kernel/debug/tracing/set_event
- $ cat /sys/kernel/debug/tracing/trace_pipe > out.txt
+ $ echo workqueue:workqueue_queue_work > /sys/kernel/tracing/set_event
+ $ cat /sys/kernel/tracing/trace_pipe > out.txt
(wait a few secs)
^C
@@ -391,6 +387,23 @@ the stack trace of the offending worker thread. ::
The work item's function should be trivially visible in the stack
trace.
+Non-reentrance Conditions
+=========================
+
+Workqueue guarantees that a work item cannot be re-entrant if the following
+conditions hold after a work item gets queued:
+
+ 1. The work function hasn't been changed.
+ 2. No one queues the work item to another workqueue.
+ 3. The work item hasn't been reinitiated.
+
+In other words, if the above conditions hold, the work item is guaranteed to be
+executed by at most one worker system-wide at any given time.
+
+Note that requeuing the work item (to the same queue) from within the work
+function itself doesn't break these conditions, so it's safe to do. Otherwise,
+caution is required when breaking the conditions inside a work function.
+
Kernel Inline Documentations Reference
======================================
diff --git a/Documentation/core-api/wrappers/atomic_bitops.rst b/Documentation/core-api/wrappers/atomic_bitops.rst
new file mode 100644
index 000000000000..bf24e4081a8f
--- /dev/null
+++ b/Documentation/core-api/wrappers/atomic_bitops.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+ This is a simple wrapper to bring atomic_bitops.txt into the RST world
+ until such a time as that file can be converted directly.
+
+=============
+Atomic bitops
+=============
+
+.. raw:: latex
+
+ \footnotesize
+
+.. include:: ../../atomic_bitops.txt
+ :literal:
+
+.. raw:: latex
+
+ \normalsize
diff --git a/Documentation/core-api/wrappers/atomic_t.rst b/Documentation/core-api/wrappers/atomic_t.rst
new file mode 100644
index 000000000000..ed109a964c77
--- /dev/null
+++ b/Documentation/core-api/wrappers/atomic_t.rst
@@ -0,0 +1,19 @@
+.. SPDX-License-Identifier: GPL-2.0
+ This is a simple wrapper to bring atomic_t.txt into the RST world
+ until such a time as that file can be converted directly.
+
+============
+Atomic types
+============
+
+.. raw:: latex
+
+ \footnotesize
+
+.. include:: ../../atomic_t.txt
+ :literal:
+
+.. raw:: latex
+
+ \normalsize
+
diff --git a/Documentation/core-api/wrappers/memory-barriers.rst b/Documentation/core-api/wrappers/memory-barriers.rst
new file mode 100644
index 000000000000..532460b5e3eb
--- /dev/null
+++ b/Documentation/core-api/wrappers/memory-barriers.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+ This is a simple wrapper to bring memory-barriers.txt into the RST world
+ until such a time as that file can be converted directly.
+
+============================
+Linux kernel memory barriers
+============================
+
+.. raw:: latex
+
+ \footnotesize
+
+.. include:: ../../memory-barriers.txt
+ :literal:
+
+.. raw:: latex
+
+ \normalsize
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index a137a0e6d068..77e0ece2b1d6 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -315,11 +315,15 @@ indeed the normal API is implemented in terms of the advanced API. The
advanced API is only available to modules with a GPL-compatible license.
The advanced API is based around the xa_state. This is an opaque data
-structure which you declare on the stack using the XA_STATE()
-macro. This macro initialises the xa_state ready to start walking
-around the XArray. It is used as a cursor to maintain the position
-in the XArray and let you compose various operations together without
-having to restart from the top every time.
+structure which you declare on the stack using the XA_STATE() macro.
+This macro initialises the xa_state ready to start walking around the
+XArray. It is used as a cursor to maintain the position in the XArray
+and let you compose various operations together without having to restart
+from the top every time. The contents of the xa_state are protected by
+the rcu_read_lock() or the xas_lock(). If you need to drop whichever of
+those locks is protecting your state and tree, you must call xas_pause()
+so that future calls do not rely on the parts of the state which were
+left unprotected.
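+
+For instance, a read-side walk that periodically yields might look like the
+following sketch (assuming an existing ``struct xarray array``)::
+
+	XA_STATE(xas, &array, 0);
+	void *entry;
+
+	rcu_read_lock();
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		if (xas_retry(&xas, entry))
+			continue;
+		if (need_resched()) {
+			xas_pause(&xas);	/* state no longer relies on the lock */
+			rcu_read_unlock();
+			cond_resched();
+			rcu_read_lock();
+		}
+		/* process entry */
+	}
+	rcu_read_unlock();
+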
The xa_state is also used to store errors. You can call
xas_error() to retrieve the error. All operations check whether
diff --git a/Documentation/cpu-freq/core.rst b/Documentation/cpu-freq/core.rst
index 33cb90bd1d8f..4ceef8e7217c 100644
--- a/Documentation/cpu-freq/core.rst
+++ b/Documentation/cpu-freq/core.rst
@@ -73,12 +73,12 @@ CPUFREQ_POSTCHANGE.
The third argument is a struct cpufreq_freqs with the following
values:
-===== ===========================
-cpu number of the affected CPU
+====== ======================================
+policy a pointer to the struct cpufreq_policy
old old frequency
new new frequency
flags flags of the cpufreq driver
-===== ===========================
+====== ======================================
3. CPUFreq Table Generation with Operating Performance Point (OPP)
==================================================================
diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst
index 3b32336a7803..d84ededb66f9 100644
--- a/Documentation/cpu-freq/cpu-drivers.rst
+++ b/Documentation/cpu-freq/cpu-drivers.rst
@@ -75,6 +75,9 @@ And optionally
.resume - A pointer to a per-policy resume function which is called
with interrupts disabled and _before_ the governor is started again.
+ .ready - A pointer to a per-policy ready function which is called after
+ the policy is fully initialized.
+
.attr - A pointer to a NULL-terminated list of "struct freq_attr" which
allow to export values to sysfs.
diff --git a/Documentation/cpu-freq/index.rst b/Documentation/cpu-freq/index.rst
index aba7831ab1cb..de25740651f7 100644
--- a/Documentation/cpu-freq/index.rst
+++ b/Documentation/cpu-freq/index.rst
@@ -1,8 +1,8 @@
.. SPDX-License-Identifier: GPL-2.0
-==============================================================================
-Linux CPUFreq - CPU frequency and voltage scaling code in the Linux(TM) kernel
-==============================================================================
+========================================================================
+CPUFreq - CPU frequency and voltage scaling code in the Linux(TM) kernel
+========================================================================
Author: Dominik Brodowski <linux@brodo.de>
@@ -20,18 +20,15 @@ Author: Dominik Brodowski <linux@brodo.de>
Mailing List
------------
-There is a CPU frequency changing CVS commit and general list where
-you can report bugs, problems or submit patches. To post a message,
-send an email to linux-pm@vger.kernel.org.
+There is a CPU frequency general list where you can report bugs,
+problems or submit patches. To post a message, send an email to
+linux-pm@vger.kernel.org.
Links
-----
the FTP archives:
* ftp://ftp.linux.org.uk/pub/linux/cpufreq/
-how to access the CVS repository:
-* http://cvs.arm.linux.org.uk/
-
the CPUFreq Mailing list:
* http://vger.kernel.org/vger-lists.html#linux-pm
diff --git a/Documentation/crypto/crypto_engine.rst b/Documentation/crypto/crypto_engine.rst
index 25cf9836c336..d562ea17d994 100644
--- a/Documentation/crypto/crypto_engine.rst
+++ b/Documentation/crypto/crypto_engine.rst
@@ -69,6 +69,8 @@ the crypto engine via one of:
* crypto_transfer_hash_request_to_engine()
+* crypto_transfer_kpp_request_to_engine()
+
* crypto_transfer_skcipher_request_to_engine()
At the end of the request process, a call to one of the following functions is needed:
@@ -79,4 +81,6 @@ At the end of the request process, a call to one of the following functions is n
* crypto_finalize_hash_request()
+* crypto_finalize_kpp_request()
+
* crypto_finalize_skcipher_request()
diff --git a/Documentation/crypto/devel-algos.rst b/Documentation/crypto/devel-algos.rst
index f225a953ab4b..3506899ef83e 100644
--- a/Documentation/crypto/devel-algos.rst
+++ b/Documentation/crypto/devel-algos.rst
@@ -172,7 +172,7 @@ Here are schematics of how these functions are called when operated from
other part of the kernel. Note that the .setkey() call might happen
before or after any of these schematics happen, but must not happen
during any of these are in-flight. Please note that calling .init()
-followed immediately by .finish() is also a perfectly valid
+followed immediately by .final() is also a perfectly valid
transformation.
::
diff --git a/Documentation/crypto/index.rst b/Documentation/crypto/index.rst
index 21338fa92642..da5d5ad2bdf3 100644
--- a/Documentation/crypto/index.rst
+++ b/Documentation/crypto/index.rst
@@ -1,6 +1,6 @@
-=======================
-Linux Kernel Crypto API
-=======================
+==========
+Crypto API
+==========
:Author: Stephan Mueller
:Author: Marek Vasut
diff --git a/Documentation/crypto/userspace-if.rst b/Documentation/crypto/userspace-if.rst
index b45dabbf69d6..f80f243e227e 100644
--- a/Documentation/crypto/userspace-if.rst
+++ b/Documentation/crypto/userspace-if.rst
@@ -131,9 +131,9 @@ from the kernel crypto API. If the buffer is too small for the message
digest, the flag MSG_TRUNC is set by the kernel.
In order to set a message digest key, the calling application must use
-the setsockopt() option of ALG_SET_KEY. If the key is not set the HMAC
-operation is performed without the initial HMAC state change caused by
-the key.
+the setsockopt() option of ALG_SET_KEY or ALG_SET_KEY_BY_KEY_SERIAL. If the
+key is not set the HMAC operation is performed without the initial HMAC state
+change caused by the key.
Symmetric Cipher API
--------------------
@@ -382,6 +382,15 @@ mentioned optname:
- the RNG cipher type to provide the seed
+- ALG_SET_KEY_BY_KEY_SERIAL -- Setting the key via keyring key_serial_t.
+ This operation behaves the same as ALG_SET_KEY. The decrypted
+ data is copied from a keyring key and used as the key for
+ symmetric encryption.
+
+ The passed in key_serial_t must have the KEY_(POS|USR|GRP|OTH)_SEARCH
+ permission set, otherwise -EPERM is returned. Supports key types: user,
+ logon, encrypted, and trusted.
+
- ALG_SET_AEAD_AUTHSIZE -- Setting the authentication tag size for
AEAD ciphers. For an encryption operation, the authentication tag of
the given size will be generated. For a decryption operation, the
diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst
index f0956e9ea2d8..c3389c6f3838 100644
--- a/Documentation/dev-tools/checkpatch.rst
+++ b/Documentation/dev-tools/checkpatch.rst
@@ -612,6 +612,13 @@ Commit message
See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
+ **BAD_FIXES_TAG**
+ The Fixes: tag is malformed or does not follow the community conventions.
+ This can occur if the tag has been split into multiple lines (e.g., when
+ pasted in an email program with word wrapping enabled).
+
+ See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
+
Comparison style
----------------
@@ -710,6 +717,39 @@ Indentation and Line Breaks
See: https://www.kernel.org/doc/html/latest/process/coding-style.html#breaking-long-lines-and-strings
+ **SPLIT_STRING**
+ Quoted strings that appear as messages in userspace and can be
+ grepped, should not be split across multiple lines.
+
+ See: https://lore.kernel.org/lkml/20120203052727.GA15035@leaf/
+
+ **MULTILINE_DEREFERENCE**
+ A single dereferencing identifier spanned on multiple lines like::
+
+ struct_identifier->member[index].
+ member = <foo>;
+
+ is generally hard to follow. It can easily lead to typos and so makes
+ the code vulnerable to bugs.
+
+ If fixing the multiple line dereferencing leads to an 80 column
+ violation, then either rewrite the code in a simpler way, or, if the
+ starting part of the dereferencing identifier is the same and used at
+ multiple places, store it in a temporary variable and use that
+ temporary variable at all the places. For example, if there are
+ two dereferencing identifiers::
+
+ member1->member2->member3.foo1;
+ member1->member2->member3.foo2;
+
+ then store the member1->member2->member3 part in a temporary variable.
+ It not only helps to avoid the 80 column violation but also reduces
+ the program size by removing the unnecessary dereferences.
+
+ But if none of the above methods work then ignore the 80 column
+ violation because it is much easier to read a dereferencing identifier
+ on a single line.
+
**TRAILING_STATEMENTS**
Trailing statements (for example after any conditional) should be
on the next line.
@@ -845,6 +885,38 @@ Macros, Attributes and Symbols
Use the `fallthrough;` pseudo keyword instead of
`/* fallthrough */` like comments.
+ **TRAILING_SEMICOLON**
+ Macro definition should not end with a semicolon. The macro
+ invocation style should be consistent with function calls.
+ This can prevent any unexpected code paths::
+
+ #define MAC do_something;
+
+ If this macro is used within an if-else statement, like::
+
+ if (some_condition)
+ MAC;
+
+ else
+ do_something;
+
+ Then there would be a compilation error, because when the macro is
+ expanded there are two trailing semicolons, so the else branch gets
+ orphaned.
+
+ See: https://lore.kernel.org/lkml/1399671106.2912.21.camel@joe-AO725/
+
+ **SINGLE_STATEMENT_DO_WHILE_MACRO**
+ For the multi-statement macros, it is necessary to use the do-while
+ loop to avoid unpredictable code paths. The do-while loop helps to
+ group the multiple statements into a single one so that a
+ function-like macro can be used as a function only.
+
+ But for single statement macros, it is unnecessary to use the
+ do-while loop. Although the code is syntactically correct, using
+ the do-while loop is redundant. So remove the do-while loop from
+ single statement macros.
+
**WEAK_DECLARATION**
Using weak declarations like __attribute__((weak)) or __weak
can have unintended link defects. Avoid using them.
@@ -920,6 +992,11 @@ Functions and Variables
Your compiler (or rather your loader) automatically does
it for you.
+ **MULTIPLE_ASSIGNMENTS**
+ Multiple assignments on a single line make the code unnecessarily
+ complicated. So assign a value to a single variable on each line;
+ this makes the code more readable and helps avoid typos.
+
**RETURN_PARENTHESES**
return is not a function and as such doesn't need parentheses::
@@ -957,6 +1034,17 @@ Permissions
Permission bits should use 4 digit octal permissions (like 0700 or 0444).
Avoid using any other base like decimal.
+ **SYMBOLIC_PERMS**
+ Permission bits in the octal form are more readable and easier to
+ understand than their symbolic counterparts because many command-line
+ tools use this notation. Experienced kernel developers have been using
+ these traditional Unix permission bits for decades and so they find it
+ easier to understand the octal notation than the symbolic macros.
+ For example, it is harder to read S_IWUSR|S_IRUGO than 0644, which
+ obscures the developer's intent rather than clarifying it.
+
+ See: https://lore.kernel.org/lkml/CA+55aFw5v23T-zvDZp-MmD_EYxF8WbafwwB59934FV7g21uMGQ@mail.gmail.com/
+
Spacing and Brackets
--------------------
diff --git a/Documentation/dev-tools/coccinelle.rst b/Documentation/dev-tools/coccinelle.rst
index 9c454de5a7f7..535ce126fb4f 100644
--- a/Documentation/dev-tools/coccinelle.rst
+++ b/Documentation/dev-tools/coccinelle.rst
@@ -66,7 +66,7 @@ The wiki documentation always refers to the linux-next version of the script.
For Semantic Patch Language(SmPL) grammar documentation refer to:
-http://coccinelle.lip6.fr/documentation.php
+https://coccinelle.gitlabpages.inria.fr/website/docs/main_grammar.html
Using Coccinelle on the Linux kernel
------------------------------------
@@ -219,7 +219,7 @@ instance::
cat cocci.err
You can use SPFLAGS to add debugging flags; for instance you may want to
-add both --profile --show-trying to SPFLAGS when debugging. For example
+add both ``--profile --show-trying`` to SPFLAGS when debugging. For example
you may want to use::
rm -f err.log
@@ -248,7 +248,7 @@ variables for .cocciconfig is as follows:
- Your current user's home directory is processed first
- Your directory from which spatch is called is processed next
-- The directory provided with the --dir option is processed last, if used
+- The directory provided with the ``--dir`` option is processed last, if used
Since coccicheck runs through make, it naturally runs from the kernel
proper dir; as such the second rule above would be implied for picking up a
@@ -265,8 +265,8 @@ The kernel coccicheck script has::
fi
KBUILD_EXTMOD is set when an explicit target with M= is used. For both cases
-the spatch --dir argument is used, as such third rule applies when whether M=
-is used or not, and when M= is used the target directory can have its own
+the spatch ``--dir`` argument is used; as such the third rule applies whether
+M= is used or not, and when M= is used the target directory can have its own
.cocciconfig file. When M= is not passed as an argument to coccicheck the
target directory is the same as the directory from where spatch was called.
diff --git a/Documentation/dev-tools/gdb-kernel-debugging.rst b/Documentation/dev-tools/gdb-kernel-debugging.rst
index 8e0f1fe8d17a..895285c037c7 100644
--- a/Documentation/dev-tools/gdb-kernel-debugging.rst
+++ b/Documentation/dev-tools/gdb-kernel-debugging.rst
@@ -39,6 +39,10 @@ Setup
this mode. In this case, you should build the kernel with
CONFIG_RANDOMIZE_BASE disabled if the architecture supports KASLR.
+- Build the gdb scripts (required on kernels v5.1 and above)::
+
+ make scripts_gdb
+
- Enable the gdb stub of QEMU/KVM, either
- at VM startup time by appending "-s" to the QEMU command line
diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst
index 010a2af1e7d9..6b0663075dc0 100644
--- a/Documentation/dev-tools/index.rst
+++ b/Documentation/dev-tools/index.rst
@@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst
kcov
gcov
kasan
+ kmsan
ubsan
kmemleak
kcsan
@@ -32,6 +33,7 @@ Documentation/dev-tools/testing-overview.rst
kgdb
kselftest
kunit/index
+ ktap
.. only:: subproject and html
diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 21dc03bc10a4..e66916a483cd 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -4,39 +4,76 @@ The Kernel Address Sanitizer (KASAN)
Overview
--------
-KernelAddressSANitizer (KASAN) is a dynamic memory safety error detector
-designed to find out-of-bound and use-after-free bugs. KASAN has three modes:
+Kernel Address Sanitizer (KASAN) is a dynamic memory safety error detector
+designed to find out-of-bounds and use-after-free bugs.
-1. generic KASAN (similar to userspace ASan),
-2. software tag-based KASAN (similar to userspace HWASan),
-3. hardware tag-based KASAN (based on hardware memory tagging).
+KASAN has three modes:
-Generic KASAN is mainly used for debugging due to a large memory overhead.
-Software tag-based KASAN can be used for dogfood testing as it has a lower
-memory overhead that allows using it with real workloads. Hardware tag-based
-KASAN comes with low memory and performance overheads and, therefore, can be
-used in production. Either as an in-field memory bug detector or as a security
-mitigation.
+1. Generic KASAN
+2. Software Tag-Based KASAN
+3. Hardware Tag-Based KASAN
-Software KASAN modes (#1 and #2) use compile-time instrumentation to insert
-validity checks before every memory access and, therefore, require a compiler
-version that supports that.
+Generic KASAN, enabled with CONFIG_KASAN_GENERIC, is the mode intended for
+debugging, similar to userspace ASan. This mode is supported on many CPU
+architectures, but it has significant performance and memory overheads.
-Generic KASAN is supported in GCC and Clang. With GCC, it requires version
-8.3.0 or later. Any supported Clang version is compatible, but detection of
-out-of-bounds accesses for global variables is only supported since Clang 11.
+Software Tag-Based KASAN or SW_TAGS KASAN, enabled with CONFIG_KASAN_SW_TAGS,
+can be used for both debugging and dogfood testing, similar to userspace HWASan.
+This mode is only supported for arm64, but its moderate memory overhead allows
+using it for testing on memory-restricted devices with real workloads.
-Software tag-based KASAN mode is only supported in Clang.
+Hardware Tag-Based KASAN or HW_TAGS KASAN, enabled with CONFIG_KASAN_HW_TAGS,
+is the mode intended to be used as an in-field memory bug detector or as a
+security mitigation. This mode only works on arm64 CPUs that support MTE
+(Memory Tagging Extension), but it has low memory and performance overheads and
+thus can be used in production.
-The hardware KASAN mode (#3) relies on hardware to perform the checks but
-still requires a compiler version that supports memory tagging instructions.
-This mode is supported in GCC 10+ and Clang 11+.
+For details about the memory and performance impact of each KASAN mode, see the
+descriptions of the corresponding Kconfig options.
-Both software KASAN modes work with SLUB and SLAB memory allocators,
-while the hardware tag-based KASAN currently only supports SLUB.
+The Generic and the Software Tag-Based modes are commonly referred to as the
+software modes. The Software Tag-Based and the Hardware Tag-Based modes are
+referred to as the tag-based modes.
-Currently, generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390,
-and riscv architectures, and tag-based KASAN modes are supported only for arm64.
+Support
+-------
+
+Architectures
+~~~~~~~~~~~~~
+
+Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and
+xtensa, and the tag-based KASAN modes are supported only on arm64.
+
+Compilers
+~~~~~~~~~
+
+Software KASAN modes use compile-time instrumentation to insert validity checks
+before every memory access and thus require a compiler version that provides
+support for that. The Hardware Tag-Based mode relies on hardware to perform
+these checks but still requires a compiler version that supports the memory
+tagging instructions.
+
+Generic KASAN requires GCC version 8.3.0 or later
+or any Clang version supported by the kernel.
+
+Software Tag-Based KASAN requires GCC 11+
+or any Clang version supported by the kernel.
+
+Hardware Tag-Based KASAN requires GCC 10+ or Clang 12+.
+
+Memory types
+~~~~~~~~~~~~
+
+Generic KASAN supports finding bugs in all of slab, page_alloc, vmap, vmalloc,
+stack, and global memory.
+
+Software Tag-Based KASAN supports slab, page_alloc, vmalloc, and stack memory.
+
+Hardware Tag-Based KASAN supports slab, page_alloc, and non-executable vmalloc
+memory.
+
+For slab, both software KASAN modes support SLUB and SLAB allocators, while
+Hardware Tag-Based KASAN only supports SLUB.
Usage
-----
@@ -45,18 +82,81 @@ To enable KASAN, configure the kernel with::
CONFIG_KASAN=y
-and choose between ``CONFIG_KASAN_GENERIC`` (to enable generic KASAN),
-``CONFIG_KASAN_SW_TAGS`` (to enable software tag-based KASAN), and
-``CONFIG_KASAN_HW_TAGS`` (to enable hardware tag-based KASAN).
+and choose between ``CONFIG_KASAN_GENERIC`` (to enable Generic KASAN),
+``CONFIG_KASAN_SW_TAGS`` (to enable Software Tag-Based KASAN), and
+``CONFIG_KASAN_HW_TAGS`` (to enable Hardware Tag-Based KASAN).
-For software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and
+For the software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and
``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types.
-The former produces a smaller binary while the latter is 1.1-2 times faster.
+The former produces a smaller binary while the latter is up to 2 times faster.
To include alloc and free stack traces of affected slab objects into reports,
enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected
physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``.
+Boot parameters
+~~~~~~~~~~~~~~~
+
+KASAN is affected by the generic ``panic_on_warn`` command line parameter.
+When it is enabled, KASAN panics the kernel after printing a bug report.
+
+By default, KASAN prints a bug report only for the first invalid memory access.
+With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
+effectively disables ``panic_on_warn`` for KASAN reports.
+
+Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot
+parameter can be used to control panic and reporting behaviour:
+
+- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
+ report or also panic the kernel (default: ``report``). The panic happens even
+ if ``kasan_multi_shot`` is enabled.
+
+Software and Hardware Tag-Based KASAN modes (see the section about various
+modes below) support altering stack trace collection behavior:
+
+- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
+ traces collection (default: ``on``).
+- ``kasan.stack_ring_size=<number of entries>`` specifies the number of entries
+ in the stack ring (default: ``32768``).
+
+Hardware Tag-Based KASAN mode is intended for use in production as a security
+mitigation. Therefore, it supports additional boot parameters that allow
+disabling KASAN altogether or controlling its features:
+
+- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
+
+- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN
+ is configured in synchronous, asynchronous or asymmetric mode of
+ execution (default: ``sync``).
+ Synchronous mode: a bad access is detected immediately when a tag
+ check fault occurs.
+ Asynchronous mode: a bad access detection is delayed. When a tag check
+ fault occurs, the information is stored in hardware (in the TFSR_EL1
+ register for arm64). The kernel periodically checks the hardware and
+ only reports tag faults during these checks.
+ Asymmetric mode: a bad access is detected synchronously on reads and
+ asynchronously on writes.
+
+- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
+ allocations (default: ``on``).
+
+- ``kasan.page_alloc.sample=<sampling interval>`` makes KASAN tag only every
+ Nth page_alloc allocation with the order equal or greater than
+ ``kasan.page_alloc.sample.order``, where N is the value of the ``sample``
+ parameter (default: ``1``, or tag every such allocation).
+ This parameter is intended to mitigate the performance overhead introduced
+ by KASAN.
+ Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks
+ of allocations chosen by sampling and thus miss bad accesses to these
+ allocations. Use the default value for accurate bug detection.
+
+- ``kasan.page_alloc.sample.order=<minimum page order>`` specifies the minimum
+ order of allocations that are affected by sampling (default: ``3``).
+ Only applies when ``kasan.page_alloc.sample`` is set to a value greater
+ than ``1``.
+ This parameter is intended to allow sampling only large page_alloc
+ allocations, which is the biggest source of the performance overhead.
+
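+As an illustrative example (not a recommended configuration), a production
+arm64 kernel using the Hardware Tag-Based mode might boot with::
+
+	kasan=on kasan.mode=async kasan.stacktrace=off kasan.vmalloc=on
+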
Error reports
~~~~~~~~~~~~~
@@ -146,7 +246,7 @@ is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the
memory state section of the report shows the state of one of the memory
granules that surround the accessed address.
-For generic KASAN, the size of each memory granule is 8. The state of each
+For Generic KASAN, the size of each memory granule is 8. The state of each
granule is encoded in one shadow byte. Those 8 bytes can be accessible,
partially accessible, freed, or be a part of a redzone. KASAN uses the following
encoding for each shadow byte: 00 means that all 8 bytes of the corresponding
@@ -171,41 +271,6 @@ traces point to places in code that interacted with the object but that are not
directly present in the bad access stack trace. Currently, this includes
call_rcu() and workqueue queuing.
-Boot parameters
-~~~~~~~~~~~~~~~
-
-KASAN is affected by the generic ``panic_on_warn`` command line parameter.
-When it is enabled, KASAN panics the kernel after printing a bug report.
-
-By default, KASAN prints a bug report only for the first invalid memory access.
-With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
-effectively disables ``panic_on_warn`` for KASAN reports.
-
-Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot
-parameter can be used to control panic and reporting behaviour:
-
-- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
- report or also panic the kernel (default: ``report``). The panic happens even
- if ``kasan_multi_shot`` is enabled.
-
-Hardware tag-based KASAN mode (see the section about various modes below) is
-intended for use in production as a security mitigation. Therefore, it supports
-additional boot parameters that allow disabling KASAN or controlling features:
-
-- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
-
-- ``kasan.mode=sync`` or ``=async`` controls whether KASAN is configured in
- synchronous or asynchronous mode of execution (default: ``sync``).
- Synchronous mode: a bad access is detected immediately when a tag
- check fault occurs.
- Asynchronous mode: a bad access detection is delayed. When a tag check
- fault occurs, the information is stored in hardware (in the TFSR_EL1
- register for arm64). The kernel periodically checks the hardware and
- only reports tag faults during these checks.
-
-- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
- traces collection (default: ``on``).
-
Implementation details
----------------------
@@ -244,49 +309,46 @@ outline-instrumented kernel.
Generic KASAN is the only mode that delays the reuse of freed objects via
quarantine (see mm/kasan/quarantine.c for implementation).
-Software tag-based KASAN
+Software Tag-Based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~
-Software tag-based KASAN uses a software memory tagging approach to checking
+Software Tag-Based KASAN uses a software memory tagging approach to checking
access validity. It is currently only implemented for the arm64 architecture.
-Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs
+Software Tag-Based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs
to store a pointer tag in the top byte of kernel pointers. It uses shadow memory
to store memory tags associated with each 16-byte memory cell (therefore, it
dedicates 1/16th of the kernel memory for shadow memory).
-On each memory allocation, software tag-based KASAN generates a random tag, tags
+On each memory allocation, Software Tag-Based KASAN generates a random tag, tags
the allocated memory with this tag, and embeds the same tag into the returned
pointer.
-Software tag-based KASAN uses compile-time instrumentation to insert checks
+Software Tag-Based KASAN uses compile-time instrumentation to insert checks
before each memory access. These checks make sure that the tag of the memory
that is being accessed is equal to the tag of the pointer that is used to access
-this memory. In case of a tag mismatch, software tag-based KASAN prints a bug
+this memory. In case of a tag mismatch, Software Tag-Based KASAN prints a bug
report.
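+
+As an illustration only (this is not the kernel's actual implementation; the
+helper names are made up, and kernel-style ``u8``/``u64`` types are assumed),
+storing a tag in the TBI-ignored top byte of a pointer and comparing it
+against the memory tag could look like this::
+
+	#define TAG_SHIFT	56
+	#define TAG_MASK	(0xffULL << TAG_SHIFT)
+
+	/* Embed an 8-bit tag into the ignored top byte of a pointer. */
+	static void *set_tag(void *ptr, u8 tag)
+	{
+		return (void *)(((u64)ptr & ~TAG_MASK) |
+				((u64)tag << TAG_SHIFT));
+	}
+
+	/* The check inserted before each access: the pointer tag must match
+	 * the tag stored for the accessed 16-byte memory granule. */
+	static bool tags_match(const void *ptr, u8 memory_tag)
+	{
+		return (u8)((u64)ptr >> TAG_SHIFT) == memory_tag;
+	}
+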
-Software tag-based KASAN also has two instrumentation modes (outline, which
+Software Tag-Based KASAN also has two instrumentation modes (outline, which
emits callbacks to check memory accesses; and inline, which performs the shadow
memory checks inline). With outline instrumentation mode, a bug report is
printed from the function that performs the access check. With inline
instrumentation, a ``brk`` instruction is emitted by the compiler, and a
dedicated ``brk`` handler is used to print bug reports.
-Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
+Software Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions.
-Software tag-based KASAN currently only supports tagging of slab and page_alloc
-memory.
-
-Hardware tag-based KASAN
+Hardware Tag-Based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~
-Hardware tag-based KASAN is similar to the software mode in concept but uses
+Hardware Tag-Based KASAN is similar to the software mode in concept but uses
hardware memory tagging support instead of compiler instrumentation and
shadow memory.
-Hardware tag-based KASAN is currently only implemented for arm64 architecture
+Hardware Tag-Based KASAN is currently only implemented for arm64 architecture
and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5
Instruction Set Architecture and Top Byte Ignore (TBI).
@@ -296,26 +358,25 @@ access, hardware makes sure that the tag of the memory that is being accessed is
equal to the tag of the pointer that is used to access this memory. In case of a
tag mismatch, a fault is generated, and a report is printed.
-Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
+Hardware Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions.
-Hardware tag-based KASAN currently only supports tagging of slab and page_alloc
-memory.
-
-If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN
+If the hardware does not support MTE (pre ARMv8.5), Hardware Tag-Based KASAN
will not be enabled. In this case, all KASAN boot parameters are ignored.
Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being
enabled, even when ``kasan.mode=off`` is provided or when the hardware does not
support MTE (but supports TBI).
-Hardware tag-based KASAN only reports the first found bug. After that, MTE tag
+Hardware Tag-Based KASAN only reports the first found bug. After that, MTE tag
checking gets disabled.
Shadow memory
-------------
+The contents of this section are only applicable to software KASAN modes.
+
The kernel maps memory in several different parts of the address space.
The range of kernel virtual addresses is large: there is not enough real
memory to support a real shadow region for every address that could be
@@ -346,7 +407,7 @@ CONFIG_KASAN_VMALLOC
With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
cost of greater memory usage. Currently, this is supported on x86,
-riscv, s390, and powerpc.
+arm64, riscv, s390, and powerpc.
This works by hooking into vmalloc and vmap and dynamically
allocating real shadow memory to back the mappings.
@@ -406,19 +467,18 @@ generic ``noinstr`` one.
Note that disabling compiler instrumentation (either on a per-file or a
per-function basis) makes KASAN ignore the accesses that happen directly in
that code for software KASAN modes. It does not help when the accesses happen
-indirectly (through calls to instrumented functions) or with the hardware
-tag-based mode that does not use compiler instrumentation.
+indirectly (through calls to instrumented functions) or with Hardware
+Tag-Based KASAN, which does not use compiler instrumentation.
For software KASAN modes, to disable KASAN reports in a part of the kernel code
for the current task, annotate this part of the code with a
``kasan_disable_current()``/``kasan_enable_current()`` section. This also
disables the reports for indirect accesses that happen through function calls.
-For tag-based KASAN modes (include the hardware one), to disable access
-checking, use ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that
-temporarily disabling access checking via ``page_kasan_tag_reset()`` requires
-saving and restoring the per-page KASAN tag via
-``page_kasan_tag``/``page_kasan_tag_set``.
+For tag-based KASAN modes, to disable access checking, use
+``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that temporarily
+disabling access checking via ``page_kasan_tag_reset()`` requires saving and
+restoring the per-page KASAN tag via ``page_kasan_tag``/``page_kasan_tag_set``.
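+
+A minimal sketch of both mechanisms (``probe_object()`` is a made-up helper)::
+
+	/* Software KASAN modes: suppress reports for the current task. */
+	kasan_disable_current();
+	probe_object(ptr);
+	kasan_enable_current();
+
+	/* Tag-based KASAN modes: a pointer with its tag reset to the
+	 * match-all value is not checked on access. */
+	probe_object(kasan_reset_tag(ptr));
+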
Tests
~~~~~
diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst
index d2c4c27e1702..6611434e2dd2 100644
--- a/Documentation/dev-tools/kcov.rst
+++ b/Documentation/dev-tools/kcov.rst
@@ -1,42 +1,50 @@
-kcov: code coverage for fuzzing
+KCOV: code coverage for fuzzing
===============================
-kcov exposes kernel code coverage information in a form suitable for coverage-
-guided fuzzing (randomized testing). Coverage data of a running kernel is
-exported via the "kcov" debugfs file. Coverage collection is enabled on a task
-basis, and thus it can capture precise coverage of a single system call.
+KCOV collects and exposes kernel code coverage information in a form suitable
+for coverage-guided fuzzing. Coverage data of a running kernel is exported via
+the ``kcov`` debugfs file. Coverage collection is enabled on a task basis, and
+thus KCOV can capture precise coverage of a single system call.
-Note that kcov does not aim to collect as much coverage as possible. It aims
-to collect more or less stable coverage that is function of syscall inputs.
-To achieve this goal it does not collect coverage in soft/hard interrupts
-and instrumentation of some inherently non-deterministic parts of kernel is
-disabled (e.g. scheduler, locking).
+Note that KCOV does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is a function of syscall inputs.
+To achieve this goal, it does not collect coverage in soft/hard interrupts
+(unless remote coverage collection is enabled, see below) and from some
+inherently non-deterministic parts of the kernel (e.g. scheduler, locking).
-kcov is also able to collect comparison operands from the instrumented code
-(this feature currently requires that the kernel is compiled with clang).
+Besides collecting code coverage, KCOV can also collect comparison operands.
+See the "Comparison operands collection" section for details.
+
+Besides collecting coverage data from syscall handlers, KCOV can also collect
+coverage for annotated parts of the kernel executing in background kernel
+tasks or soft interrupts. See the "Remote coverage collection" section for
+details.
Prerequisites
-------------
-Configure the kernel with::
+KCOV relies on compiler instrumentation and requires GCC 6.1.0 or later,
+or any Clang version supported by the kernel.
- CONFIG_KCOV=y
+Collecting comparison operands is supported with GCC 8+ or with Clang.
+
+To enable KCOV, configure the kernel with::
-CONFIG_KCOV requires gcc 6.1.0 or later.
+ CONFIG_KCOV=y
-If the comparison operands need to be collected, set::
+To enable comparison operands collection, set::
CONFIG_KCOV_ENABLE_COMPARISONS=y
-Profiling data will only become accessible once debugfs has been mounted::
+Coverage data only becomes accessible once debugfs has been mounted::
mount -t debugfs none /sys/kernel/debug
Coverage collection
-------------------
-The following program demonstrates coverage collection from within a test
-program using kcov:
+The following program demonstrates how to use KCOV to collect coverage for a
+single syscall from within a test program:
.. code-block:: c
@@ -50,6 +58,7 @@ program using kcov:
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
+ #include <linux/types.h>
#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
#define KCOV_ENABLE _IO('c', 100)
@@ -83,7 +92,7 @@ program using kcov:
perror("ioctl"), exit(1);
/* Reset coverage from the tail of the ioctl() call. */
__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
- /* That's the target syscal call. */
+ /* Call the target syscall. */
read(-1, NULL, 0);
/* Read number of PCs collected. */
n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
@@ -102,7 +111,7 @@ program using kcov:
return 0;
}
-After piping through addr2line output of the program looks as follows::
+After piping through ``addr2line``, the output of the program looks as follows::
SyS_read
fs/read_write.c:562
@@ -120,12 +129,13 @@ After piping through addr2line output of the program looks as follows::
fs/read_write.c:562
If a program needs to collect coverage from several threads (independently),
-it needs to open /sys/kernel/debug/kcov in each thread separately.
+it needs to open ``/sys/kernel/debug/kcov`` in each thread separately.
The interface is fine-grained to allow efficient forking of test processes.
-That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
-mmaps coverage buffer and then forks child processes in a loop. Child processes
-only need to enable coverage (disable happens automatically on thread end).
+That is, a parent process opens ``/sys/kernel/debug/kcov``, enables trace mode,
+mmaps coverage buffer, and then forks child processes in a loop. The child
+processes only need to enable coverage (it gets disabled automatically when
+a thread exits).
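+
+A hedged sketch of this pattern (reusing the defines from the example above
+and omitting error handling):
+
+.. code-block:: c
+
+    int fd = open("/sys/kernel/debug/kcov", O_RDWR);
+    unsigned long *cover;
+
+    /* Parent: set up the trace once and map the coverage buffer. */
+    ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
+    cover = (unsigned long *)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+                                  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    for (;;) {
+        if (fork() == 0) {
+            /* Child: enabling is all that is needed; coverage collection
+             * gets disabled automatically when the child exits. */
+            ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC);
+            __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+            /* ... run the test and inspect cover[] ... */
+            _exit(0);
+        }
+        wait(NULL);
+    }
+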
Comparison operands collection
------------------------------
@@ -177,6 +187,8 @@ Comparison operands collection is similar to coverage collection:
/* Read number of comparisons collected. */
n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
for (i = 0; i < n; i++) {
+ uint64_t ip;
+
type = cover[i * KCOV_WORDS_PER_CMP + 1];
/* arg1 and arg2 - operands of the comparison. */
arg1 = cover[i * KCOV_WORDS_PER_CMP + 2];
@@ -202,55 +214,83 @@ Comparison operands collection is similar to coverage collection:
return 0;
}
-Note that the kcov modes (coverage collection or comparison operands) are
-mutually exclusive.
+Note that the KCOV modes (collection of code coverage or comparison operands)
+are mutually exclusive.
Remote coverage collection
--------------------------
-With KCOV_ENABLE coverage is collected only for syscalls that are issued
-from the current process. With KCOV_REMOTE_ENABLE it's possible to collect
-coverage for arbitrary parts of the kernel code, provided that those parts
-are annotated with kcov_remote_start()/kcov_remote_stop().
-
-This allows to collect coverage from two types of kernel background
-threads: the global ones, that are spawned during kernel boot in a limited
-number of instances (e.g. one USB hub_event() worker thread is spawned per
-USB HCD); and the local ones, that are spawned when a user interacts with
-some kernel interface (e.g. vhost workers); as well as from soft
-interrupts.
-
-To enable collecting coverage from a global background thread or from a
-softirq, a unique global handle must be assigned and passed to the
-corresponding kcov_remote_start() call. Then a userspace process can pass
-a list of such handles to the KCOV_REMOTE_ENABLE ioctl in the handles
-array field of the kcov_remote_arg struct. This will attach the used kcov
-device to the code sections, that are referenced by those handles.
-
-Since there might be many local background threads spawned from different
-userspace processes, we can't use a single global handle per annotation.
-Instead, the userspace process passes a non-zero handle through the
-common_handle field of the kcov_remote_arg struct. This common handle gets
-saved to the kcov_handle field in the current task_struct and needs to be
-passed to the newly spawned threads via custom annotations. Those threads
-should in turn be annotated with kcov_remote_start()/kcov_remote_stop().
-
-Internally kcov stores handles as u64 integers. The top byte of a handle
-is used to denote the id of a subsystem that this handle belongs to, and
-the lower 4 bytes are used to denote the id of a thread instance within
-that subsystem. A reserved value 0 is used as a subsystem id for common
-handles as they don't belong to a particular subsystem. The bytes 4-7 are
-currently reserved and must be zero. In the future the number of bytes
-used for the subsystem or handle ids might be increased.
-
-When a particular userspace process collects coverage via a common
-handle, kcov will collect coverage for each code section that is annotated
-to use the common handle obtained as kcov_handle from the current
-task_struct. However non common handles allow to collect coverage
-selectively from different subsystems.
+Besides collecting coverage data from handlers of syscalls issued from a
+userspace process, KCOV can also collect coverage for parts of the kernel
+executing in other contexts - so-called "remote" coverage.
+
+Using KCOV to collect remote coverage requires:
+
+1. Modifying kernel code to annotate the code section from where coverage
+ should be collected with ``kcov_remote_start`` and ``kcov_remote_stop``.
+
+2. Using ``KCOV_REMOTE_ENABLE`` instead of ``KCOV_ENABLE`` in the userspace
+ process that collects coverage.
+
+Both ``kcov_remote_start`` and ``kcov_remote_stop`` annotations and the
+``KCOV_REMOTE_ENABLE`` ioctl accept handles that identify particular coverage
+collection sections. The way a handle is used depends on the context where the
+matching code section executes.
+
+KCOV supports collecting remote coverage from the following contexts:
+
+1. Global kernel background tasks. These are the tasks that are spawned during
+ kernel boot in a limited number of instances (e.g. one USB ``hub_event``
+ worker is spawned per USB HCD).
+
+2. Local kernel background tasks. These are spawned when a userspace process
+ interacts with some kernel interface and are usually killed when the process
+ exits (e.g. vhost workers).
+
+3. Soft interrupts.
+
+For #1 and #3, a unique global handle must be chosen and passed to the
+corresponding ``kcov_remote_start`` call. Then a userspace process must pass
+this handle to ``KCOV_REMOTE_ENABLE`` in the ``handles`` array field of the
+``kcov_remote_arg`` struct. This will attach the used KCOV device to the code
+section referenced by this handle. Multiple global handles identifying
+different code sections can be passed at once.
+
+For #2, the userspace process instead must pass a non-zero handle through the
+``common_handle`` field of the ``kcov_remote_arg`` struct. This common handle
+gets saved to the ``kcov_handle`` field in the current ``task_struct`` and
+needs to be passed to the newly spawned local tasks via custom kernel code
+modifications. Those tasks should in turn use the passed handle in their
+``kcov_remote_start`` and ``kcov_remote_stop`` annotations.
+
+KCOV follows a predefined format for both global and common handles. Each
+handle is a ``u64`` integer. Currently, only the top byte and the lower 4
+bytes are used. Bytes 4-7 are reserved and must be zero.
+
+For global handles, the top byte of the handle denotes the id of a subsystem
+this handle belongs to. For example, KCOV uses ``1`` as the USB subsystem id.
+The lower 4 bytes of a global handle denote the id of a task instance within
+that subsystem. For example, each ``hub_event`` worker uses the USB bus number
+as the task instance id.
+
+For common handles, a reserved value ``0`` is used as a subsystem id, as such
+handles don't belong to a particular subsystem. The lower 4 bytes of a common
+handle identify a collective instance of all local tasks spawned by the
+userspace process that passed a common handle to ``KCOV_REMOTE_ENABLE``.
+
+In practice, any value can be used for the common handle instance id if
+coverage is only collected from a single userspace process on the system.
+However, if common handles are used by multiple processes, unique instance
+ids must be used for each process. One option is to use the process id as
+the common handle instance id.
+
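+A minimal sketch of constructing both kinds of handles, assuming the
+``kcov_remote_handle()`` helper and the ``KCOV_SUBSYSTEM_*`` defines from the
+``linux/kcov.h`` uapi header are available:
+
+.. code-block:: c
+
+    #include <linux/kcov.h>
+    #include <unistd.h>
+
+    static void make_handles(__u64 *global_handle, __u64 *common_handle)
+    {
+        /* Global handle: USB subsystem id, instance = USB bus number 1. */
+        *global_handle = kcov_remote_handle(KCOV_SUBSYSTEM_USB, 1);
+        /* Common handle: reserved subsystem id 0, pid as the instance id. */
+        *common_handle = kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, getpid());
+    }
+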
+The following program demonstrates using KCOV to collect coverage from both
+local tasks spawned by the process and the global task that handles USB bus #1:
.. code-block:: c
+ /* Same includes and defines as above. */
+
struct kcov_remote_arg {
__u32 trace_mode;
__u32 area_size;
diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index 7db43c7c09b8..3ae866dcc924 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -204,17 +204,17 @@ Ultimately this allows to determine the possible executions of concurrent code,
and if that code is free from data races.
KCSAN is aware of *marked atomic operations* (``READ_ONCE``, ``WRITE_ONCE``,
-``atomic_*``, etc.), but is oblivious of any ordering guarantees and simply
-assumes that memory barriers are placed correctly. In other words, KCSAN
-assumes that as long as a plain access is not observed to race with another
-conflicting access, memory operations are correctly ordered.
-
-This means that KCSAN will not report *potential* data races due to missing
-memory ordering. Developers should therefore carefully consider the required
-memory ordering requirements that remain unchecked. If, however, missing
-memory ordering (that is observable with a particular compiler and
-architecture) leads to an observable data race (e.g. entering a critical
-section erroneously), KCSAN would report the resulting data race.
+``atomic_*``, etc.), and a subset of ordering guarantees implied by memory
+barriers. With ``CONFIG_KCSAN_WEAK_MEMORY=y``, KCSAN models load or store
+buffering, and can detect missing ``smp_mb()``, ``smp_wmb()``, ``smp_rmb()``,
+``smp_store_release()``, and all ``atomic_*`` operations with equivalent
+implied barriers.
+
+Note, KCSAN will not report all data races due to missing memory ordering,
+specifically where a memory barrier would be required to prohibit subsequent
+memory operations from reordering before the barrier. Developers should
+therefore carefully consider the memory ordering requirements that remain
+unchecked.
Race Detection Beyond Data Races
--------------------------------
@@ -268,6 +268,56 @@ marked operations, if all accesses to a variable that is accessed concurrently
are properly marked, KCSAN will never trigger a watchpoint and therefore never
report the accesses.
+Modeling Weak Memory
+~~~~~~~~~~~~~~~~~~~~
+
+KCSAN's approach to detecting data races due to missing memory barriers is
+based on modeling access reordering (with ``CONFIG_KCSAN_WEAK_MEMORY=y``).
+Each plain memory access for which a watchpoint is set up is also selected for
+simulated reordering within the scope of its function (at most 1 in-flight
+access).
+
+Once an access has been selected for reordering, it is checked together with
+every other access until the end of the function scope. If an appropriate memory
+barrier is encountered, the access will no longer be considered for simulated
+reordering.
+
+When the result of a memory operation should be ordered by a barrier, KCSAN can
+then detect data races where the conflict only occurs as a result of a missing
+barrier. Consider the example::
+
+ int x, flag;
+ void T1(void)
+ {
+ x = 1; // data race!
+ WRITE_ONCE(flag, 1); // correct: smp_store_release(&flag, 1)
+ }
+ void T2(void)
+ {
+ while (!READ_ONCE(flag)); // correct: smp_load_acquire(&flag)
+ ... = x; // data race!
+ }
+
+When weak memory modeling is enabled, KCSAN can consider ``x`` in ``T1`` for
+simulated reordering. After the write of ``flag``, ``x`` is again checked for
+concurrent accesses: because ``T2`` is able to proceed after the write of
+``flag``, a data race is detected. With the correct barriers in place, ``x``
+would not be considered for reordering after the proper release of ``flag``,
+and no data race would be detected.
+
+Deliberate trade-offs in complexity but also practical limitations mean only a
+subset of data races due to missing memory barriers can be detected. With
+currently available compiler support, the implementation is limited to modeling
+the effects of "buffering" (delaying accesses), since the runtime cannot
+"prefetch" accesses. Also recall that watchpoints are only set up for plain
+accesses, and the only access type for which KCSAN simulates reordering. This
+means reordering of marked accesses is not modeled.
+
+A consequence of the above is that acquire operations do not require barrier
+instrumentation (no prefetching). Furthermore, marked accesses introducing
+address or control dependencies do not require special handling (the marked
+access cannot be reordered, later dependent accesses cannot be prefetched).
+
Key Properties
~~~~~~~~~~~~~~
@@ -290,8 +340,8 @@ Key Properties
4. **Detects Racy Writes from Devices:** Due to checking data values upon
setting up watchpoints, racy writes from devices can also be detected.
-5. **Memory Ordering:** KCSAN is *not* explicitly aware of the LKMM's ordering
- rules; this may result in missed data races (false negatives).
+5. **Memory Ordering:** KCSAN is aware of only a subset of LKMM ordering rules;
+ this may result in missed data races (false negatives).
6. **Analysis Accuracy:** For observed executions, due to using a sampling
strategy, the analysis is *unsound* (false negatives possible), but aims to
diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0fbe3308bf37..936f6aaa75c8 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -41,6 +41,18 @@ guarded by KFENCE. The default is configurable via the Kconfig option
``CONFIG_KFENCE_SAMPLE_INTERVAL``. Setting ``kfence.sample_interval=0``
disables KFENCE.
+The sample interval controls a timer that sets up KFENCE allocations. By
+default, to keep the real sample interval predictable, the normal timer also
+causes CPU wake-ups when the system is completely idle. This may be undesirable
+on power-constrained systems. The boot parameter ``kfence.deferrable=1``
+instead switches to a "deferrable" timer which does not force CPU wake-ups on
+idle systems, at the risk of unpredictable sample intervals. The default is
+configurable via the Kconfig option ``CONFIG_KFENCE_DEFERRABLE``.
+
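+For example, an illustrative command line (values made up for illustration)
+combining a 100 millisecond sample interval with the deferrable timer::
+
+	kfence.sample_interval=100 kfence.deferrable=1
+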
+.. warning::
+ The KUnit test suite is very likely to fail when using a deferrable timer
+ since it currently causes very unpredictable sample intervals.
+
The KFENCE memory pool is of fixed size, and if the pool is exhausted, no
further KFENCE allocations occur. With ``CONFIG_KFENCE_NUM_OBJECTS`` (default
255), the number of available guarded objects can be controlled. Each object
@@ -231,10 +243,14 @@ Guarded allocations are set up based on the sample interval. After expiration
of the sample interval, the next allocation through the main allocator (SLAB or
SLUB) returns a guarded allocation from the KFENCE object pool (allocation
sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and
-the next allocation is set up after the expiration of the interval. To "gate" a
-KFENCE allocation through the main allocator's fast-path without overhead,
-KFENCE relies on static branches via the static keys infrastructure. The static
-branch is toggled to redirect the allocation to KFENCE.
+the next allocation is set up after the expiration of the interval.
+
+When using ``CONFIG_KFENCE_STATIC_KEYS=y``, KFENCE allocations are "gated"
+through the main allocator's fast-path by relying on static branches via the
+static keys infrastructure. The static branch is toggled to redirect the
+allocation to KFENCE. Depending on sample interval, target workloads, and
+system architecture, this may perform better than the simple dynamic branch.
+Careful benchmarking is recommended.
KFENCE objects each reside on a dedicated page, at either the left or right
page boundaries selected at random. The pages to the left and right of the
@@ -269,6 +285,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
first, and the chances of detecting use-after-frees of recently freed objects
is increased.
+If pool utilization reaches 75% (default) or above, KFENCE limits currently
+covered allocations of the same source from further filling up the pool. This
+reduces the risk of the pool eventually being fully occupied by allocated
+objects while still ensuring diverse coverage of allocations. The "source" of
+an allocation is based on its partial allocation stack trace. A side-effect is
+that this also limits frequent long-lived allocations (e.g. pagecache) of the
+same source from permanently filling up the pool, which is the most common
+risk for the pool becoming full and the sampled allocation rate dropping to
+zero. The threshold at which to start limiting currently covered allocations
+can be configured via the boot parameter ``kfence.skip_covered_thresh``
+(pool usage%); see the example below.
+
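+For example, to only start limiting covered allocations once the pool is 90%
+occupied (an illustrative value)::
+
+	kfence.skip_covered_thresh=90
+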
Interface
---------
diff --git a/Documentation/dev-tools/kgdb.rst b/Documentation/dev-tools/kgdb.rst
index 43456244651a..f83ba2601e55 100644
--- a/Documentation/dev-tools/kgdb.rst
+++ b/Documentation/dev-tools/kgdb.rst
@@ -402,7 +402,7 @@ This is a quick example of how to use kdb.
2. Enter the kernel debugger manually or by waiting for an oops or
fault. There are several ways you can enter the kernel debugger
manually; all involve using the :kbd:`SysRq-G`, which means you must have
- enabled ``CONFIG_MAGIC_SysRq=y`` in your kernel config.
+ enabled ``CONFIG_MAGIC_SYSRQ=y`` in your kernel config.
- When logged in as root or with a super user session you can run::
@@ -461,7 +461,7 @@ This is a quick example of how to use kdb with a keyboard.
2. Enter the kernel debugger manually or by waiting for an oops or
fault. There are several ways you can enter the kernel debugger
manually; all involve using the :kbd:`SysRq-G`, which means you must have
- enabled ``CONFIG_MAGIC_SysRq=y`` in your kernel config.
+ enabled ``CONFIG_MAGIC_SYSRQ=y`` in your kernel config.
- When logged in as root or with a super user session you can run::
@@ -557,7 +557,7 @@ Connecting with gdb to a serial port
Example (using a directly connected port)::
% gdb ./vmlinux
- (gdb) set remotebaud 115200
+ (gdb) set serial baud 115200
(gdb) target remote /dev/ttyS0
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 1c935f41cd3a..2cb00b53339f 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -174,7 +174,6 @@ mapping:
- ``kmemleak_alloc_phys``
- ``kmemleak_free_part_phys``
-- ``kmemleak_not_leak_phys``
- ``kmemleak_ignore_phys``
Dealing with false positives/negatives
@@ -228,7 +227,7 @@ Testing with kmemleak-test
--------------------------
To check if you have all set up to use kmemleak, you can use the kmemleak-test
-module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST
+module, a module that deliberately leaks memory. Set CONFIG_SAMPLE_KMEMLEAK
as module (it can't be used as built-in) and boot the kernel with kmemleak
enabled. Load the module and perform a scan with::
diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst
new file mode 100644
index 000000000000..55fa82212eb2
--- /dev/null
+++ b/Documentation/dev-tools/kmsan.rst
@@ -0,0 +1,428 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2022, Google LLC.
+
+===================================
+The Kernel Memory Sanitizer (KMSAN)
+===================================
+
+KMSAN is a dynamic error detector aimed at finding uses of uninitialized
+values. It is based on compiler instrumentation, and is quite similar to the
+userspace `MemorySanitizer tool`_.
+
+An important note is that KMSAN is not intended for production use, because it
+drastically increases kernel memory footprint and slows the whole system down.
+
+Usage
+=====
+
+Building the kernel
+-------------------
+
+In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+).
+Please refer to `LLVM documentation`_ for the instructions on how to build Clang.
+
+Now configure and build the kernel with CONFIG_KMSAN enabled.
+
+Example report
+--------------
+
+Here is an example of a KMSAN report::
+
+ =====================================================
+ BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test]
+ test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273
+ kunit_run_case_internal lib/kunit/test.c:333
+ kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
+ kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
+ kthread+0x721/0x850 kernel/kthread.c:327
+ ret_from_fork+0x1f/0x30 ??:?
+
+ Uninit was stored to memory at:
+ do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260
+ test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
+ kunit_run_case_internal lib/kunit/test.c:333
+ kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
+ kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
+ kthread+0x721/0x850 kernel/kthread.c:327
+ ret_from_fork+0x1f/0x30 ??:?
+
+ Local variable uninit created at:
+ do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256
+ test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
+
+ Bytes 4-7 of 8 are uninitialized
+ Memory access of size 8 starts at ffff888083fe3da0
+
+ CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
+ =====================================================
+
+The report says that the local variable ``uninit`` was created uninitialized in
+``do_uninit_local_array()``. The third stack trace corresponds to the place
+where this variable was created.
+
+The first stack trace shows where the uninit value was used (in
+``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left
+uninitialized in the local variable, as well as the stack where the value was
+copied to another memory location before use.
+
+A use of uninitialized value ``v`` is reported by KMSAN in the following cases:
+
+ - in a condition, e.g. ``if (v) { ... }``;
+ - in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``;
+ - when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``;
+ - when it is passed as an argument to a function, and
+ ``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below).
+
+The mentioned cases (apart from copying data to userspace or hardware, which is
+a security issue) are considered undefined behavior from the C11 Standard point
+of view.
+
+Disabling the instrumentation
+-----------------------------
+
+A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN
+ignore uninitialized values in that function and mark its output as initialized.
+As a result, the user will not get KMSAN reports related to that function.
+
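+A hedged sketch (the device-read helper is made up for illustration)::
+
+	/* KMSAN will not report uses of uninitialized values here, and the
+	 * contents of buf are treated as initialized on return. */
+	__no_kmsan_checks
+	static void fill_from_device(u8 *buf, size_t len)
+	{
+		read_device_fifo(buf, len); /* hypothetical helper */
+	}
+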
+Another function attribute supported by KMSAN is ``__no_sanitize_memory``.
+Applying this attribute to a function will result in KMSAN not instrumenting
+it, which can be helpful if we do not want the compiler to interfere with some
+low-level code (e.g. that marked with ``noinstr`` which implicitly adds
+``__no_sanitize_memory``).
+
+This however comes at a cost: stack allocations from such functions will have
+incorrect shadow/origin values, likely leading to false positives. Functions
+called from non-instrumented code may also receive incorrect metadata for their
+parameters.
+
+As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly.
+
+It is also possible to disable KMSAN for a single file (e.g. main.o)::
+
+ KMSAN_SANITIZE_main.o := n
+
+or for the whole directory::
+
+ KMSAN_SANITIZE := n
+
+in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every
+function in the file or directory. Most users won't need KMSAN_SANITIZE, unless
+their code gets broken by KMSAN (e.g. runs at early boot time).
+
+Support
+=======
+
+In order for KMSAN to work, the kernel must be built with Clang, which so far is
+the only compiler that has KMSAN support. The kernel instrumentation pass is
+based on the userspace `MemorySanitizer tool`_.
+
+The runtime library only supports x86_64 at the moment.
+
+How KMSAN works
+===============
+
+KMSAN shadow memory
+-------------------
+
+KMSAN associates a metadata byte (also called shadow byte) with every byte of
+kernel memory. A bit in the shadow byte is set iff the corresponding bit of the
+kernel memory byte is uninitialized. Marking the memory uninitialized (i.e.
+setting its shadow bytes to ``0xff``) is called poisoning, marking it
+initialized (setting the shadow bytes to ``0x00``) is called unpoisoning.
+
+When a new variable is allocated on the stack, it is poisoned by default by
+instrumentation code inserted by the compiler (unless it is a stack variable
+that is immediately initialized). Any new heap allocation done without
+``__GFP_ZERO`` is also poisoned.
+
+Compiler instrumentation also tracks the shadow values as they are used along
+the code. When needed, instrumentation code invokes the runtime library in
+``mm/kmsan/`` to persist shadow values.
+
+The shadow value of a basic or compound type is an array of bytes of the same
+length. When a constant value is written into memory, that memory is unpoisoned.
+When a value is read from memory, its shadow memory is also obtained and
+propagated into all the operations which use that value. For every instruction
+that takes one or more values the compiler generates code that calculates the
+shadow of the result depending on those values and their shadows.
+
+Example::
+
+ int a = 0xff; // i.e. 0x000000ff
+ int b;
+ int c = a | b;
+
+In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``,
+shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of
+``c`` are uninitialized, while the lower byte is initialized.
+
+Origin tracking
+---------------
+
+Every four bytes of kernel memory also have a so-called origin mapped to them.
+This origin describes the point in program execution at which the uninitialized
+value was created. Every origin is associated with either the full allocation
+stack (for heap-allocated memory), or the function containing the uninitialized
+variable (for locals).
+
+When an uninitialized variable is allocated on stack or heap, a new origin
+value is created, and that variable's origin is filled with that value. When a
+value is read from memory, its origin is also read and kept together with the
+shadow. For every instruction that takes one or more values, the origin of the
+result is one of the origins corresponding to any of the uninitialized inputs.
+If a poisoned value is written into memory, its origin is written to the
+corresponding storage as well.
+
+Example 1::
+
+ int a = 42;
+ int b;
+ int c = a + b;
+
+In this case the origin of ``b`` is generated upon function entry, and is
+stored to the origin of ``c`` right before the addition result is written into
+memory.
+
+Several variables may share the same origin address, if they are stored in the
+same four-byte chunk. In this case every write to either variable updates the
+origin for all of them. We have to sacrifice precision in this case, because
+storing origins for individual bits (and even bytes) would be too costly.
+
+Example 2::
+
+ int combine(short a, short b) {
+ union ret_t {
+ int i;
+ short s[2];
+ } ret;
+ ret.s[0] = a;
+ ret.s[1] = b;
+ return ret.i;
+ }
+
+If ``a`` is initialized and ``b`` is not, the shadow of the result would be
+0xffff0000, and the origin of the result would be the origin of ``b``.
+``ret.s[0]`` would have the same origin, but it will never be used, because
+that variable is initialized.
+
+If both function arguments are uninitialized, only the origin of the second
+argument is preserved.
+
+Origin chaining
+~~~~~~~~~~~~~~~
+
+To ease debugging, KMSAN creates a new origin for every store of an
+uninitialized value to memory. The new origin references both its creation stack
+and the previous origin the value had. This may cause increased memory
+consumption, so we limit the length of origin chains in the runtime.
+
+Clang instrumentation API
+-------------------------
+
+The Clang instrumentation pass inserts calls to functions defined in
+``mm/kmsan/instrumentation.c`` into the kernel code.
+
+Shadow manipulation
+~~~~~~~~~~~~~~~~~~~
+
+For every memory access the compiler emits a call to a function that returns a
+pair of pointers to the shadow and origin addresses of the given memory::
+
+ typedef struct {
+ void *shadow, *origin;
+ } shadow_origin_ptr_t
+
+ shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr)
+ shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr)
+ shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size)
+ shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size)
+
+The function name depends on the memory access size.
+
+The compiler makes sure that for every loaded value its shadow and origin
+values are read from memory. When a value is stored to memory, its shadow and
+origin are also stored using the metadata pointers.
+
+Handling locals
+~~~~~~~~~~~~~~~
+
+A special function is used to create a new origin value for a local variable and
+set the origin of that variable to that value::
+
+ void __msan_poison_alloca(void *addr, uintptr_t size, char *descr)
+
+Access to per-task data
+~~~~~~~~~~~~~~~~~~~~~~~
+
+At the beginning of every instrumented function KMSAN inserts a call to
+``__msan_get_context_state()``::
+
+ kmsan_context_state *__msan_get_context_state(void)
+
+``kmsan_context_state`` is declared in ``include/linux/kmsan.h``::
+
+ struct kmsan_context_state {
+ char param_tls[KMSAN_PARAM_SIZE];
+ char retval_tls[KMSAN_RETVAL_SIZE];
+ char va_arg_tls[KMSAN_PARAM_SIZE];
+ char va_arg_origin_tls[KMSAN_PARAM_SIZE];
+ u64 va_arg_overflow_size_tls;
+ char param_origin_tls[KMSAN_PARAM_SIZE];
+ depot_stack_handle_t retval_origin_tls;
+ };
+
+This structure is used by KMSAN to pass parameter shadows and origins between
+instrumented functions (unless the parameters are checked immediately by
+``CONFIG_KMSAN_CHECK_PARAM_RETVAL``).
+
+Passing uninitialized values to functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Clang's MemorySanitizer instrumentation has an option,
+``-fsanitize-memory-param-retval``, which makes the compiler check function
+parameters passed by value, as well as function return values.
+
+The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is
+enabled by default to let KMSAN report uninitialized values earlier.
+Please refer to the `LKML discussion`_ for more details.
+
+Because of the way the checks are implemented in LLVM (they are only applied to
+parameters marked as ``noundef``), not all parameters are guaranteed to be
+checked, so we cannot give up the metadata storage in ``kmsan_context_state``.
+
+String functions
+~~~~~~~~~~~~~~~~
+
+The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the
+following functions. These functions are also called when data structures are
+initialized or copied, making sure shadow and origin values are copied alongside
+with the data::
+
+ void *__msan_memcpy(void *dst, void *src, uintptr_t n)
+ void *__msan_memmove(void *dst, void *src, uintptr_t n)
+ void *__msan_memset(void *dst, int c, uintptr_t n)
+
+Error reporting
+~~~~~~~~~~~~~~~
+
+For each use of a value the compiler emits a shadow check that calls
+``__msan_warning()`` in the case that value is poisoned::
+
+ void __msan_warning(u32 origin)
+
+``__msan_warning()`` causes KMSAN runtime to print an error report.
+
+Inline assembly instrumentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+KMSAN instruments every inline assembly output with a call to::
+
+ void __msan_instrument_asm_store(void *addr, uintptr_t size)
+
+This function unpoisons the memory region.
+
+This approach may mask certain errors, but it also helps to avoid a lot of
+false positives in bitwise operations, atomics etc.
+
+Sometimes the pointers passed into inline assembly do not point to valid memory.
+In such cases they are ignored at runtime.
+
+
+Runtime library
+---------------
+
+The code is located in ``mm/kmsan/``.
+
+Per-task KMSAN state
+~~~~~~~~~~~~~~~~~~~~
+
+Every task_struct has an associated KMSAN task state that holds the KMSAN
+context (see above) and a per-task flag disallowing KMSAN reports::
+
+ struct kmsan_context {
+ ...
+ bool allow_reporting;
+ struct kmsan_context_state cstate;
+ ...
+ }
+
+ struct task_struct {
+ ...
+ struct kmsan_context kmsan;
+ ...
+ }
+
+KMSAN contexts
+~~~~~~~~~~~~~~
+
+When running in a kernel task context, KMSAN uses ``current->kmsan.cstate`` to
+hold the metadata for function parameters and return values.
+
+But when the kernel runs in interrupt, softirq or NMI context, where
+``current`` is unavailable, KMSAN switches to the per-cpu interrupt state::
+
+ DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+Metadata allocation
+~~~~~~~~~~~~~~~~~~~
+
+There are several places in the kernel where the metadata is stored.
+
+1. Each ``struct page`` instance contains two pointers to its shadow and
+origin pages::
+
+ struct page {
+ ...
+ struct page *shadow, *origin;
+ ...
+ };
+
+At boot-time, the kernel allocates shadow and origin pages for every available
+kernel page. This is done quite late, when the kernel address space is already
+fragmented, so normal data pages may arbitrarily interleave with the metadata
+pages.
+
+This means that, in general, the shadow/origin pages of two contiguous memory
+pages may not be contiguous. Consequently, if a memory access crosses the
+boundary of a memory block, accesses to shadow/origin memory may potentially
+corrupt other pages or read incorrect values from them.
+
+In practice, contiguous memory pages returned by the same ``alloc_pages()``
+call will have contiguous metadata, whereas if these pages belong to two
+different allocations their metadata pages can be fragmented.
+
+For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions
+there also are no guarantees on metadata contiguity.
+
+If ``__msan_metadata_ptr_for_XXX_YYY()`` hits the boundary between two pages
+with non-contiguous metadata, it returns pointers to fake shadow/origin regions::
+
+ char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+ char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+
+``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes.
+All stores to ``dummy_store_page`` are ignored.
+
+2. For vmalloc memory and modules, there is a direct mapping between the memory
+range, its shadow and origin. KMSAN reduces the vmalloc area by 3/4, making only
+the first quarter available to ``vmalloc()``. The second quarter of the vmalloc
+area contains shadow memory for the first quarter, the third one holds the
+origins. A small part of the fourth quarter contains shadow and origins for the
+kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for
+more details.
+
+When an array of pages is mapped into a contiguous virtual memory space, their
+shadow and origin pages are similarly mapped into contiguous regions.
+
+References
+==========
+
+E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized
+memory use in C++
+<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43308.pdf>`_.
+In Proceedings of CGO 2015.
+
+.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html
+.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html
+.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/
diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index dcefee707ccd..12b575b76b20 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -7,6 +7,14 @@ directory. These are intended to be small tests to exercise individual code
paths in the kernel. Tests are intended to be run after building, installing
and booting a kernel.
+Kselftest from mainline can be run on older stable kernels. Running tests
+from mainline offers the best coverage. Several test rings run the mainline
+kselftest suite on stable releases. The reason is that when a new test gets
+added to regression-test a bug in existing code, we should be able to run
+that test on an older kernel. Hence, it is important to keep the test code
+able to run on older kernels, and to make sure a test skips gracefully on
+kernels that do not support the feature being tested.
+
You can find additional information on Kselftest framework, how to
write new tests using the framework on Kselftest wiki:
@@ -200,6 +208,14 @@ In general, the rules for selftests are
Contributing new tests (details)
================================
+ * In your Makefile, use facilities from lib.mk by including it instead of
+ reinventing the wheel. Specify flags and binary generation rules as needed
+ before including lib.mk. ::
+
+ CFLAGS = $(KHDR_INCLUDES)
+ TEST_GEN_PROGS := close_range_test
+ include ../lib.mk
+
* Use TEST_GEN_XXX if such binaries or files are generated during
compiling.
@@ -222,13 +238,30 @@ Contributing new tests (details)
* First use the headers inside the kernel source and/or git repo, and then the
system headers. Headers for the kernel release as opposed to headers
installed by the distro on the system should be the primary focus to be able
- to find regressions.
+ to find regressions. Use KHDR_INCLUDES in Makefile to include headers from
+ the kernel source.
* If a test needs specific kernel config options enabled, add a config file in
the test directory to enable them.
e.g: tools/testing/selftests/android/config
+ * Create a .gitignore file inside test directory and add all generated objects
+ in it.
+
+ * Add new test name in TARGETS in selftests/Makefile::
+
+ TARGETS += android
+
+ * All changes should pass::
+
+ kselftest-{all,install,clean,gen_tar}
+ kselftest-{all,install,clean,gen_tar} O=abs_path
+ kselftest-{all,install,clean,gen_tar} O=rel_path
+ make -C tools/testing/selftests {all,install,clean,gen_tar}
+ make -C tools/testing/selftests {all,install,clean,gen_tar} O=abs_path
+ make -C tools/testing/selftests {all,install,clean,gen_tar} O=rel_path
+
Test Module
===========
@@ -242,6 +275,14 @@ assist writing kernel modules that are for use with kselftest:
- ``tools/testing/selftests/kselftest_module.h``
- ``tools/testing/selftests/kselftest/module.sh``
+Note that test modules should taint the kernel with TAINT_TEST. This will
+happen automatically for modules which are in the ``tools/testing/``
+directory, or for modules which use the ``kselftest_module.h`` header above.
+Otherwise, you'll need to add ``MODULE_INFO(test, "Y")`` to your module
+source. Selftests which do not load modules typically should not taint the
+kernel, but in cases where a non-test module is loaded, TAINT_TEST can be
+applied from userspace by writing to ``/proc/sys/kernel/tainted``.
+
How to use
----------
@@ -279,7 +320,7 @@ A bare bones test module might look like this:
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
- #include "../tools/testing/selftests/kselftest/module.h"
+ #include "../tools/testing/selftests/kselftest_module.h"
KSTM_MODULE_GLOBALS();
@@ -300,6 +341,7 @@ A bare bones test module might look like this:
KSTM_MODULE_LOADERS(test_foo);
MODULE_AUTHOR("John Developer <jd@fooman.org>");
MODULE_LICENSE("GPL");
+ MODULE_INFO(test, "Y");
Example test script
-------------------
diff --git a/Documentation/dev-tools/ktap.rst b/Documentation/dev-tools/ktap.rst
new file mode 100644
index 000000000000..414c105b10a9
--- /dev/null
+++ b/Documentation/dev-tools/ktap.rst
@@ -0,0 +1,311 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================
+The Kernel Test Anything Protocol (KTAP), version 1
+===================================================
+
+TAP, or the Test Anything Protocol, is a format for specifying test results
+used by a number of projects. Its website and specification are found at this
+`link <https://testanything.org/>`_. The Linux kernel largely uses TAP output
+for test results. However, kernel testing frameworks have special needs for
+test results which don't align with the original TAP specification. Thus, a
+"Kernel TAP" (KTAP) format is specified to extend and alter TAP to support
+these use-cases. This specification describes the generally accepted format of
+KTAP as it is currently used in the kernel.
+
+KTAP test results describe a series of tests (which may be nested: i.e., tests
+can have subtests), each of which can contain both diagnostic data -- e.g., log
+lines -- and a final result. The test structure and results are
+machine-readable, whereas the diagnostic data is unstructured and is there to
+aid human debugging.
+
+KTAP output is built from four different types of lines:
+
+- Version lines
+- Plan lines
+- Test case result lines
+- Diagnostic lines
+
+In general, valid KTAP output should also form valid TAP output, but some
+information, in particular nested test results, may be lost. Also note that
+there is a stagnant draft specification for TAP14; KTAP diverges from it in
+a couple of places (notably the "Subtest" header), which are described where
+relevant later in this document.
+
+Version lines
+-------------
+
+All KTAP-formatted results begin with a "version line" which specifies which
+version of the (K)TAP standard the result is compliant with.
+
+For example:
+
+- "KTAP version 1"
+- "TAP version 13"
+- "TAP version 14"
+
+Note that, in KTAP, subtests also begin with a version line, which denotes the
+start of the nested test results. This differs from TAP14, which uses a
+separate "Subtest" line.
+
+While, going forward, "KTAP version 1" should be used by compliant tests, it
+is expected that most parsers and other tooling will accept the other versions
+listed here for compatibility with existing tests and frameworks.
+
+Plan lines
+----------
+
+A test plan provides the number of tests (or subtests) in the KTAP output.
+
+Plan lines must follow the format of "1..N", where N is the number of tests
+or subtests.
+Plan lines follow version lines to indicate the number of nested tests.
+
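+For example, a test with 4 subtests would have the plan line::
+
+  1..4
+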
+While there are cases where the number of tests is not known in advance -- in
+which case the test plan may be omitted -- it is strongly recommended one is
+present where possible.
+
+Test case result lines
+----------------------
+
+Test case result lines indicate the final status of a test.
+They are required and must have the format:
+
+.. code-block:: none
+
+ <result> <number> [<description>][ # [<directive>] [<diagnostic data>]]
+
+The result can be either "ok", which indicates the test case passed,
+or "not ok", which indicates that the test case failed.
+
+<number> represents the number of the test being performed. The first test must
+have the number 1, and the number must then increase by 1 for each additional
+subtest within the same test at the same nesting level.
+
+The description is a description of the test, generally the name of
+the test, and can be any string of characters other than # or a
+newline. The description is optional, but recommended.
+
+The directive and any diagnostic data is optional. If either are present, they
+must follow a hash sign, "#".
+
+A directive is a keyword that indicates a different outcome for a test other
+than passed and failed. The directive is optional, and consists of a single
+keyword preceding the diagnostic data. In the event that a parser encounters
+a directive it doesn't support, it should fall back to the "ok" / "not ok"
+result.
+
+Currently accepted directives are:
+
+- "SKIP", which indicates a test was skipped (note the result of the test case
+ result line can be either "ok" or "not ok" if the SKIP directive is used)
+- "TODO", which indicates that a test is not expected to pass at the moment,
+ e.g. because the feature it is testing is known to be broken. While this
+ directive is inherited from TAP, its use in the kernel is discouraged.
+- "XFAIL", which indicates that a test is expected to fail. This is similar
+ to "TODO", above, and is used by some kselftest tests.
+- "TIMEOUT", which indicates a test has timed out (note the result of the test
+ case result line should be "not ok" if the TIMEOUT directive is used)
+- "ERROR", which indicates that the execution of a test has failed due to a
+ specific error that is included in the diagnostic data. (note the result of
+ the test case result line should be "not ok" if the ERROR directive is used)
+
+The diagnostic data is a plain-text field which contains any additional details
+about why this result was produced. This is typically an error message for ERROR
+or failed tests, or a description of missing dependencies for a SKIP result.
+
+The diagnostic data field is optional, and results which have neither a
+directive nor any diagnostic data do not need to include the "#" field
+separator.
+
+Example result lines include::
+
+ ok 1 test_case_name
+
+The test "test_case_name" passed.
+
+::
+
+ not ok 1 test_case_name
+
+The test "test_case_name" failed.
+
+::
+
+ ok 1 test # SKIP necessary dependency unavailable
+
+The test "test" was SKIPPED with the diagnostic message "necessary dependency
+unavailable".
+
+::
+
+ not ok 1 test # TIMEOUT 30 seconds
+
+The test "test" timed out, with diagnostic data "30 seconds".
+
+::
+
+ ok 5 check return code # rcode=0
+
+The test "check return code" passed, with additional diagnostic data “rcode=0”
+
+
+Diagnostic lines
+----------------
+
+If tests wish to output any further information, they should do so using
+"diagnostic lines". Diagnostic lines are optional, freeform text, and are
+often used to describe what is being tested and any intermediate results in
+more detail than the final result and diagnostic data line provides.
+
+Diagnostic lines are formatted as "# <diagnostic_description>", where the
+description can be any string. Diagnostic lines can be anywhere in the test
+output. As a rule, diagnostic lines regarding a test are directly before the
+test result line for that test.
+
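+For example, a diagnostic line directly preceding the result of its test
+(taken from the larger example below)::
+
+  # test_1: initializing test_1
+  ok 1 test_1
+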
+Note that most tools will treat unknown lines (see below) as diagnostic lines,
+even if they do not start with a "#": this is to capture any other useful
+kernel output which may help debug the test. It is nevertheless recommended
+that tests always prefix any diagnostic output they have with a "#" character.
+
+Unknown lines
+-------------
+
+There may be lines within KTAP output that do not follow any of the four line
+formats described above. This is allowed; however, they will not influence the
+status of the tests.
+
+This is an important difference from TAP. Kernel tests may print messages
+to the system console or a log file. Both of these destinations may contain
+messages either from unrelated kernel or userspace activity, or kernel
+messages from non-test code that is invoked by the test. The kernel code
+invoked by the test is likely not aware that a test is in progress and
+thus cannot print the message as a diagnostic message.
+
+Nested tests
+------------
+
+In KTAP, tests can be nested. This is done by having a test include within its
+output an entire set of KTAP-formatted results. This can be used to categorize
+and group related tests, or to split out different results from the same test.
+
+The "parent" test's result should consist of all of its subtests' results,
+starting with another KTAP version line and test plan, and end with the overall
+result. If one of the subtests fail, for example, the parent test should also
+fail.
+
+Additionally, all lines in a subtest should be indented. One level of
+indentation is two spaces: "  ". The indentation should begin at the version
+line and should end before the parent test's result line.
+
+"Unknown lines" are not considered to be lines in a subtest and thus are
+allowed to be either indented or not indented.
+
+An example of a test with two nested subtests:
+
+::
+
+ KTAP version 1
+ 1..1
+ KTAP version 1
+ 1..2
+ ok 1 test_1
+ not ok 2 test_2
+ # example failed
+ not ok 1 example
+
+An example format with multiple levels of nested testing:
+
+::
+
+ KTAP version 1
+ 1..2
+ KTAP version 1
+ 1..2
+ KTAP version 1
+ 1..2
+ not ok 1 test_1
+ ok 2 test_2
+ not ok 1 test_3
+ ok 2 test_4 # SKIP
+ not ok 1 example_test_1
+ ok 2 example_test_2
+
+
+Major differences between TAP and KTAP
+--------------------------------------
+
+================================================== ========= ===============
+Feature                                            TAP       KTAP
+================================================== ========= ===============
+yaml and json in diagnostic message                ok        not recommended
+TODO directive                                     ok        not recognized
+allows an arbitrary number of tests to be nested   no        yes
+"Unknown lines" are in category of "Anything else" yes       no
+"Unknown lines" are                                incorrect allowed
+================================================== ========= ===============
+
+The TAP14 specification does permit nested tests, but instead of using another
+nested version line, it uses a line of the form
+"Subtest: <name>", where <name> is the name of the parent test.
+
+Example KTAP output
+--------------------
+::
+
+ KTAP version 1
+ 1..1
+ KTAP version 1
+ 1..3
+ KTAP version 1
+ 1..1
+ # test_1: initializing test_1
+ ok 1 test_1
+ ok 1 example_test_1
+ KTAP version 1
+ 1..2
+ ok 1 test_1 # SKIP test_1 skipped
+ ok 2 test_2
+ ok 2 example_test_2
+ KTAP version 1
+ 1..3
+ ok 1 test_1
+ # test_2: FAIL
+ not ok 2 test_2
+ ok 3 test_3 # SKIP test_3 skipped
+ not ok 3 example_test_3
+ not ok 1 main_test
+
+This output defines the following hierarchy:
+
+A single test called "main_test", which fails, and has three subtests:
+
+- "example_test_1", which passes, and has one subtest:
+
+ - "test_1", which passes, and outputs the diagnostic message "test_1: initializing test_1"
+
+- "example_test_2", which passes, and has two subtests:
+
+ - "test_1", which is skipped, with the explanation "test_1 skipped"
+ - "test_2", which passes
+
+- "example_test_3", which fails, and has three subtests
+
+ - "test_1", which passes
+ - "test_2", which outputs the diagnostic line "test_2: FAIL", and fails.
+ - "test_3", which is skipped with the explanation "test_3 skipped"
+
+Note that the individual subtests with the same names do not conflict, as they
+are found in different parent tests. This output also exhibits some sensible
+rules for "bubbling up" test results: a test fails if any of its subtests fail.
+Skipped tests do not affect the result of the parent test (though it often
+makes sense for a test to be marked skipped if *all* of its subtests have been
+skipped).
+
+See also:
+---------
+
+- The TAP specification:
+ https://testanything.org/tap-version-13-specification.html
+- The (stagnant) TAP version 14 specification:
+ https://github.com/TestAnything/Specification/blob/tap-14-specification/specification.md
+- The kselftest documentation:
+ Documentation/dev-tools/kselftest.rst
+- The KUnit documentation:
+ Documentation/dev-tools/kunit/index.rst
diff --git a/Documentation/dev-tools/kunit/api/functionredirection.rst b/Documentation/dev-tools/kunit/api/functionredirection.rst
new file mode 100644
index 000000000000..3791efc2fcca
--- /dev/null
+++ b/Documentation/dev-tools/kunit/api/functionredirection.rst
@@ -0,0 +1,162 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Function Redirection API
+========================
+
+Overview
+========
+
+When writing unit tests, it's important to be able to isolate the code being
+tested from other parts of the kernel. This ensures the reliability of the test
+(it won't be affected by external factors), reduces dependencies on specific
+hardware or config options (making the test easier to run), and protects the
+stability of the rest of the system (making it less likely for test-specific
+state to interfere elsewhere).
+
+While for some code (typically generic data structures, helpers, and other
+"pure functions") this is trivial, for others (like device drivers,
+filesystems, core subsystems) the code is heavily coupled with other parts of
+the kernel.
+
+This coupling is often due to global state in some way: be it a global list of
+devices, the filesystem, or some hardware state. Tests need to either carefully
+manage, isolate, and restore state, or they can avoid it altogether by
+replacing access to and mutation of this state with a "fake" or "mock" variant.
+
+Access to such state can be refactored, for example by introducing a layer of
+indirection which can use or emulate a separate set of test state. However,
+such refactoring comes with its own costs (and undertaking significant
+refactoring before being able to write tests is suboptimal).
+
+A simpler way to intercept and replace some of the function calls is to use
+function redirection via static stubs.
+
+
+Static Stubs
+============
+
+Static stubs are a way of redirecting calls to one function (the "real"
+function) to another function (the "replacement" function).
+
+It works by adding a macro to the "real" function which checks whether a test
+is running and whether a replacement function is available. If so, the
+replacement function is called in place of the original.
+
+Using static stubs is pretty straightforward:
+
+1. Add the KUNIT_STATIC_STUB_REDIRECT() macro to the start of the "real"
+ function.
+
+ This should be the first statement in the function, after any variable
+ declarations. KUNIT_STATIC_STUB_REDIRECT() takes the name of the
+ function, followed by all of the arguments passed to the real function.
+
+ For example:
+
+ .. code-block:: c
+
+ void send_data_to_hardware(const char *str)
+ {
+ KUNIT_STATIC_STUB_REDIRECT(send_data_to_hardware, str);
+ /* real implementation */
+ }
+
+2. Write one or more replacement functions.
+
+ These functions should have the same function signature as the real function.
+ In the event they need to access or modify test-specific state, they can use
+ kunit_get_current_test() to get a struct kunit pointer. This can then
+ be passed to the expectation/assertion macros, or used to look up KUnit
+ resources.
+
+ For example:
+
+ .. code-block:: c
+
+ void fake_send_data_to_hardware(const char *str)
+ {
+ struct kunit *test = kunit_get_current_test();
+ KUNIT_EXPECT_STREQ(test, str, "Hello World!");
+ }
+
+3. Activate the static stub from your test.
+
+ From within a test, the redirection can be enabled with
+ kunit_activate_static_stub(), which accepts a struct kunit pointer,
+ the real function, and the replacement function. You can call this several
+ times with different replacement functions to swap out implementations of the
+ function.
+
+ In our example, this would be
+
+ .. code-block:: c
+
+ kunit_activate_static_stub(test,
+ send_data_to_hardware,
+ fake_send_data_to_hardware);
+
+4. Call (perhaps indirectly) the real function.
+
+ Once the redirection is activated, any call to the real function will call
+ the replacement function instead. Such calls may be buried deep in the
+ implementation of another function, but must occur from the test's kthread.
+
+ For example:
+
+ .. code-block:: c
+
+ send_data_to_hardware("Hello World!"); /* Succeeds */
+ send_data_to_hardware("Something else"); /* Fails the test. */
+
+5. (Optionally) disable the stub.
+
+ When you no longer need it, disable the redirection (and hence resume the
+ original behaviour of the 'real' function) using
+ kunit_deactivate_static_stub(). Otherwise, it will be automatically disabled
+ when the test exits.
+
+ For example:
+
+ .. code-block:: c
+
+ kunit_deactivate_static_stub(test, send_data_to_hardware);
+
+
+It's also possible to use these replacement functions to test whether a
+function is called at all, for example:
+
+.. code-block:: c
+
+ void send_data_to_hardware(const char *str)
+ {
+ KUNIT_STATIC_STUB_REDIRECT(send_data_to_hardware, str);
+ /* real implementation */
+ }
+
+ /* In test file */
+ int times_called = 0;
+ void fake_send_data_to_hardware(const char *str)
+ {
+ times_called++;
+ }
+ ...
+ /* In the test case, redirect calls for the duration of the test */
+ kunit_activate_static_stub(test, send_data_to_hardware, fake_send_data_to_hardware);
+
+ send_data_to_hardware("hello");
+ KUNIT_EXPECT_EQ(test, times_called, 1);
+
+ /* Can also deactivate the stub early, if wanted */
+ kunit_deactivate_static_stub(test, send_data_to_hardware);
+
+ send_data_to_hardware("hello again");
+ KUNIT_EXPECT_EQ(test, times_called, 1);
+
+
+
+API Reference
+=============
+
+.. kernel-doc:: include/kunit/static_stub.h
+ :internal:
diff --git a/Documentation/dev-tools/kunit/api/index.rst b/Documentation/dev-tools/kunit/api/index.rst
index b33ad72bcf0b..2d8f756aab56 100644
--- a/Documentation/dev-tools/kunit/api/index.rst
+++ b/Documentation/dev-tools/kunit/api/index.rst
@@ -4,13 +4,24 @@
API Reference
=============
.. toctree::
+ :hidden:
test
+ resource
+ functionredirection
-This section documents the KUnit kernel testing API. It is divided into the
+
+This page documents the KUnit kernel testing API. It is divided into the
following sections:
Documentation/dev-tools/kunit/api/test.rst
- - documents all of the standard testing API excluding mocking
- or mocking related features.
+ - Documents all of the standard testing API
+
+Documentation/dev-tools/kunit/api/resource.rst
+
+ - Documents the KUnit resource API
+
+Documentation/dev-tools/kunit/api/functionredirection.rst
+
+ - Documents the KUnit Function Redirection API
diff --git a/Documentation/dev-tools/kunit/api/resource.rst b/Documentation/dev-tools/kunit/api/resource.rst
new file mode 100644
index 000000000000..0a94f831259e
--- /dev/null
+++ b/Documentation/dev-tools/kunit/api/resource.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Resource API
+============
+
+This file documents the KUnit resource API.
+
+Most users won't need to use this API directly, but power users can use it to
+store state on a per-test basis, register custom cleanup actions, and more.
+
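+For example, the KUnit memory allocation helpers are built on top of test
+resources. A minimal sketch of what this enables (the test function below is
+hypothetical):
+
+.. code-block:: c
+
+    static void example_resource_test(struct kunit *test)
+    {
+            /*
+             * The allocation is tracked as a resource of this test, so
+             * it is freed automatically when the test finishes.
+             */
+            char *buf = kunit_kzalloc(test, 16, GFP_KERNEL);
+
+            KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
+    }
+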
+.. kernel-doc:: include/kunit/resource.h
+ :internal:
diff --git a/Documentation/dev-tools/kunit/api/test.rst b/Documentation/dev-tools/kunit/api/test.rst
index aaa97f17e5b3..c5eca423e8b6 100644
--- a/Documentation/dev-tools/kunit/api/test.rst
+++ b/Documentation/dev-tools/kunit/api/test.rst
@@ -4,8 +4,7 @@
Test API
========
-This file documents all of the standard testing API excluding mocking or mocking
-related features.
+This file documents all of the standard testing API.
.. kernel-doc:: include/kunit/test.h
:internal:
diff --git a/Documentation/dev-tools/kunit/architecture.rst b/Documentation/dev-tools/kunit/architecture.rst
new file mode 100644
index 000000000000..e95ab05342bb
--- /dev/null
+++ b/Documentation/dev-tools/kunit/architecture.rst
@@ -0,0 +1,196 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+KUnit Architecture
+==================
+
+The KUnit architecture is divided into two parts:
+
+- `In-Kernel Testing Framework`_
+- `kunit_tool (Command-line Test Harness)`_
+
+In-Kernel Testing Framework
+===========================
+
+The kernel testing library supports KUnit tests written in C using the
+KUnit framework. These KUnit tests are kernel code. KUnit performs the
+following tasks:
+
+- Organizes tests
+- Reports test results
+- Provides test utilities
+
+Test Cases
+----------
+
+The test case is the fundamental unit in KUnit. KUnit test cases are organised
+into suites. A KUnit test case is a function with type signature
+``void (*)(struct kunit *test)``. These test case functions are wrapped in a
+struct called ``struct kunit_case`` (a sketch of a test case follows below).
+
+.. note::
+   ``generate_params`` is optional for non-parameterized tests.
+
+Each KUnit test case receives a ``struct kunit`` context object that tracks a
+running test. The KUnit assertion macros and other KUnit utilities use the
+``struct kunit`` context object. As an exception, there are two fields that
+tests may use directly:
+
+- ``->priv``: The setup functions can use it to store arbitrary test
+ user data.
+
+- ``->param_value``: It contains the parameter value which can be
+ retrieved in the parameterized tests.
+
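+As an illustration, a minimal sketch of a test case function (the name and
+the check are hypothetical):
+
+.. code-block:: c
+
+    static void example_test_foo(struct kunit *test)
+    {
+            /* The context object is passed to every KUnit macro. */
+            KUNIT_EXPECT_EQ(test, 1 + 1, 2);
+    }
+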
+Test Suites
+-----------
+
+A KUnit suite includes a collection of test cases. The KUnit suites
+are represented by the ``struct kunit_suite``. For example:
+
+.. code-block:: c
+
+ static struct kunit_case example_test_cases[] = {
+ KUNIT_CASE(example_test_foo),
+ KUNIT_CASE(example_test_bar),
+ KUNIT_CASE(example_test_baz),
+ {}
+ };
+
+ static struct kunit_suite example_test_suite = {
+ .name = "example",
+ .init = example_test_init,
+ .exit = example_test_exit,
+ .test_cases = example_test_cases,
+ };
+ kunit_test_suite(example_test_suite);
+
+In the above example, the test suite ``example_test_suite`` runs the
+test cases ``example_test_foo``, ``example_test_bar``, and
+``example_test_baz``. Before each test case runs, ``example_test_init``
+is called, and after each test case completes, ``example_test_exit`` is
+called. The ``kunit_test_suite(example_test_suite)`` macro registers the
+test suite with the KUnit test framework.
+
+Executor
+--------
+
+The KUnit executor can list and run built-in KUnit tests on boot.
+The test suites are stored in a linker section
+called ``.kunit_test_suites``. For the code, see ``KUNIT_TABLE()`` macro
+definition in
+`include/asm-generic/vmlinux.lds.h <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/asm-generic/vmlinux.lds.h?h=v6.0#n950>`_.
+The linker section consists of an array of pointers to
+``struct kunit_suite``, and is populated by the ``kunit_test_suites()``
+macro. The KUnit executor iterates over the linker section array in order to
+run all the tests that are compiled into the kernel.
+
+.. kernel-figure:: kunit_suitememorydiagram.svg
+ :alt: KUnit Suite Memory
+
+ KUnit Suite Memory Diagram
+
+On kernel boot, the KUnit executor uses the start and end addresses
+of this section to iterate over and run all tests. For the implementation of the
+executor, see
+`lib/kunit/executor.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/kunit/executor.c>`_.
+When built as a module, the ``kunit_test_suites()`` macro defines a
+``module_init()`` function, which runs all the tests in the compilation
+unit instead of utilizing the executor.
+
+So that some classes of errors in a test do not affect other tests
+or other parts of the kernel, each KUnit test case executes in a separate
+thread context. See the ``kunit_try_catch_run()`` function in
+`lib/kunit/try-catch.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/kunit/try-catch.c?h=v5.15#n58>`_.
+
+Assertion Macros
+----------------
+
+KUnit tests verify state using expectations/assertions.
+All expectations/assertions are formatted as:
+``KUNIT_{EXPECT|ASSERT}_<op>[_MSG](kunit, property[, message])``
+(a sketch of the difference follows the list below).
+
+- ``{EXPECT|ASSERT}`` determines whether the check is an assertion or an
+ expectation.
+ In the event of a failure, the testing flow differs as follows:
+
+ - For expectations, the test is marked as failed and the failure is logged.
+
+ - Failing assertions, on the other hand, result in the test case being
+ terminated immediately.
+
+ - Assertions call the function:
+ ``void __noreturn kunit_abort(struct kunit *)``.
+
+ - ``kunit_abort`` calls the function:
+ ``void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch)``.
+
+ - ``kunit_try_catch_throw`` calls the function:
+ ``void kthread_complete_and_exit(struct completion *, long) __noreturn;``
+ and terminates the special thread context.
+
+- ``<op>`` denotes the type of check, for example: ``TRUE`` (supplied property
+  has the boolean value "true"), ``EQ`` (two supplied properties are
+  equal), ``NOT_ERR_OR_NULL`` (supplied pointer is not null and does not
+  contain an "err" value).
+
+- ``[_MSG]`` prints a custom message on failure.
+
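+As a sketch of the difference (``ret`` and ``buf`` are hypothetical
+variables):
+
+.. code-block:: c
+
+    /* On failure: marks the test failed and logs it, but keeps running. */
+    KUNIT_EXPECT_EQ(test, ret, 0);
+
+    /* On failure: terminates the test case immediately. */
+    KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
+
+    /* On failure: additionally prints the custom message. */
+    KUNIT_EXPECT_EQ_MSG(test, ret, 0, "unexpected return code");
+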
+Test Result Reporting
+---------------------
+KUnit prints the test results in KTAP format. KTAP is based on TAP14; see
+Documentation/dev-tools/ktap.rst.
+KTAP works with KUnit and Kselftest. The KUnit executor prints KTAP results to
+dmesg, and to debugfs (if configured).
+
+Parameterized Tests
+-------------------
+
+Each KUnit parameterized test is associated with a collection of
+parameters. The test is invoked multiple times, once for each parameter
+value, and the current parameter is stored in the ``param_value`` field.
+The test case is defined with the ``KUNIT_CASE_PARAM()`` macro, which accepts
+a generator function. The generator function is passed the previous parameter
+and returns the next parameter. KUnit also provides a macro for generating
+common-case, array-based generators.
+
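+A minimal sketch of a parameterized test using the array-based generator
+macro (all names and values here are hypothetical):
+
+.. code-block:: c
+
+    static const int example_params[] = { 1, 2, 3 };
+
+    static void example_param_desc(const int *p, char *desc)
+    {
+            snprintf(desc, KUNIT_PARAM_DESC_SIZE, "param %d", *p);
+    }
+
+    /* Defines example_gen_params(), which walks the array above. */
+    KUNIT_ARRAY_PARAM(example, example_params, example_param_desc);
+
+    static void example_param_test(struct kunit *test)
+    {
+            const int *param = test->param_value;
+
+            KUNIT_EXPECT_GT(test, *param, 0);
+    }
+
+    /* In the kunit_case array: */
+    KUNIT_CASE_PARAM(example_param_test, example_gen_params),
+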
+kunit_tool (Command-line Test Harness)
+======================================
+
+``kunit_tool`` is a Python script, found in ``tools/testing/kunit/kunit.py``. It
+is used to configure, build, and execute the kernel, and to parse test results;
+the ``run`` command performs all of these steps in the correct order.
+You have two options for running KUnit tests: either build the kernel with KUnit
+enabled and manually parse the results (see
+Documentation/dev-tools/kunit/run_manual.rst) or use ``kunit_tool``
+(see Documentation/dev-tools/kunit/run_wrapper.rst).
+
+- ``configure`` command generates the kernel ``.config`` from a
+  ``.kunitconfig`` file (and any architecture-specific options).
+  The Python scripts available in the ``qemu_configs`` folder
+  (for example, ``tools/testing/kunit/qemu_configs/powerpc.py``) contain
+  additional configuration options for specific architectures.
+ It parses both the existing ``.config`` and the ``.kunitconfig`` files
+ to ensure that ``.config`` is a superset of ``.kunitconfig``.
+ If not, it will combine the two and run ``make olddefconfig`` to regenerate
+ the ``.config`` file. It then checks to see if ``.config`` has become a superset.
+ This verifies that all the Kconfig dependencies are correctly specified in the
+ file ``.kunitconfig``. The ``kunit_config.py`` script contains the code for parsing
+ Kconfigs. The code which runs ``make olddefconfig`` is part of the
+ ``kunit_kernel.py`` script. You can invoke this command through:
+ ``./tools/testing/kunit/kunit.py config`` and
+ generate a ``.config`` file.
+- ``build`` runs ``make`` on the kernel tree with required options
+ (depends on the architecture and some options, for example: build_dir)
+ and reports any errors.
+ To build a KUnit kernel from the current ``.config``, you can use the
+ ``build`` argument: ``./tools/testing/kunit/kunit.py build``.
+- ``exec`` command executes the built kernel either directly (when using
+  the User-mode Linux configuration), or through an emulator such
+  as QEMU. It reads test results from the log using standard
+  output (stdout), and passes them to ``parse`` to be parsed.
+ If you already have built a kernel with built-in KUnit tests,
+ you can run the kernel and display the test results with the ``exec``
+ argument: ``./tools/testing/kunit/kunit.py exec``.
+- ``parse`` extracts the KTAP output from a kernel log, parses
+ the test results, and prints a summary. For failed tests, any
+ diagnostic output will be included.
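+  For example, to parse results from a kernel log:
+  ``dmesg | ./tools/testing/kunit/kunit.py parse``.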
diff --git a/Documentation/dev-tools/kunit/faq.rst b/Documentation/dev-tools/kunit/faq.rst
index 5c6555d020f3..fae426f2634a 100644
--- a/Documentation/dev-tools/kunit/faq.rst
+++ b/Documentation/dev-tools/kunit/faq.rst
@@ -4,56 +4,58 @@
Frequently Asked Questions
==========================
-How is this different from Autotest, kselftest, etc?
-====================================================
+How is this different from Autotest, kselftest, and so on?
+==========================================================
KUnit is a unit testing framework. Autotest, kselftest (and some others) are
not.
A `unit test <https://martinfowler.com/bliki/UnitTest.html>`_ is supposed to
-test a single unit of code in isolation, hence the name. A unit test should be
-the finest granularity of testing and as such should allow all possible code
-paths to be tested in the code under test; this is only possible if the code
-under test is very small and does not have any external dependencies outside of
+test a single unit of code in isolation and hence the name *unit test*. A unit
+test should be the finest granularity of testing and should allow all possible
+code paths to be tested in the code under test. This is only possible if the
+code under test is small and does not have any external dependencies outside of
the test's control like hardware.
There are no testing frameworks currently available for the kernel that do not
-require installing the kernel on a test machine or in a VM and all require
-tests to be written in userspace and run on the kernel under test; this is true
-for Autotest, kselftest, and some others, disqualifying any of them from being
-considered unit testing frameworks.
+require installing the kernel on a test machine or in a virtual machine. All
+testing frameworks require tests to be written in userspace and run on the
+kernel under test. This is true for Autotest, kselftest, and some others,
+disqualifying any of them from being considered unit testing frameworks.
Does KUnit support running on architectures other than UML?
===========================================================
-Yes, well, mostly.
+Yes, mostly.
-For the most part, the KUnit core framework (what you use to write the tests)
-can compile to any architecture; it compiles like just another part of the
+For the most part, the KUnit core framework (what we use to write the tests)
+can compile to any architecture. It compiles like just another part of the
kernel and runs when the kernel boots, or when built as a module, when the
-module is loaded. However, there is some infrastructure,
-like the KUnit Wrapper (``tools/testing/kunit/kunit.py``) that does not support
-other architectures.
+module is loaded. However, there is infrastructure, like the KUnit Wrapper
+(``tools/testing/kunit/kunit.py``) that might not support some architectures
+(see :ref:`kunit-on-qemu`).
-In short, this means that, yes, you can run KUnit on other architectures, but
-it might require more work than using KUnit on UML.
+In short, yes, you can run KUnit on other architectures, but it might require
+more work than using KUnit on UML.
For more information, see :ref:`kunit-on-non-uml`.
-What is the difference between a unit test and these other kinds of tests?
-==========================================================================
+.. _kinds-of-tests:
+
+What is the difference between a unit test and other kinds of tests?
+====================================================================
Most existing tests for the Linux kernel would be categorized as an integration
test, or an end-to-end test.
-- A unit test is supposed to test a single unit of code in isolation, hence the
- name. A unit test should be the finest granularity of testing and as such
- should allow all possible code paths to be tested in the code under test; this
- is only possible if the code under test is very small and does not have any
- external dependencies outside of the test's control like hardware.
+- A unit test is supposed to test a single unit of code in isolation. A unit
+ test should be the finest granularity of testing and, as such, allows all
+ possible code paths to be tested in the code under test. This is only possible
+ if the code under test is small and does not have any external dependencies
+ outside of the test's control like hardware.
- An integration test tests the interaction between a minimal set of components,
usually just two or three. For example, someone might write an integration
test to test the interaction between a driver and a piece of hardware, or to
test the interaction between the userspace libraries the kernel provides and
- the kernel itself; however, one of these tests would probably not test the
+ the kernel itself. However, one of these tests would probably not test the
entire kernel along with hardware interactions and interactions with the
userspace.
- An end-to-end test usually tests the entire system from the perspective of the
@@ -62,26 +64,26 @@ test, or an end-to-end test.
hardware with a production userspace and then trying to exercise some behavior
that depends on interactions between the hardware, the kernel, and userspace.
-KUnit isn't working, what should I do?
-======================================
+KUnit is not working, what should I do?
+=======================================
Unfortunately, there are a number of things which can break, but here are some
things to try.
-1. Try running ``./tools/testing/kunit/kunit.py run`` with the ``--raw_output``
+1. Run ``./tools/testing/kunit/kunit.py run`` with the ``--raw_output``
parameter. This might show details or error messages hidden by the kunit_tool
parser.
2. Instead of running ``kunit.py run``, try running ``kunit.py config``,
``kunit.py build``, and ``kunit.py exec`` independently. This can help track
down where an issue is occurring. (If you think the parser is at fault, you
- can run it manually against stdin or a file with ``kunit.py parse``.)
-3. Running the UML kernel directly can often reveal issues or error messages
- kunit_tool ignores. This should be as simple as running ``./vmlinux`` after
- building the UML kernel (e.g., by using ``kunit.py build``). Note that UML
- has some unusual requirements (such as the host having a tmpfs filesystem
- mounted), and has had issues in the past when built statically and the host
- has KASLR enabled. (On older host kernels, you may need to run ``setarch
- `uname -m` -R ./vmlinux`` to disable KASLR.)
+ can run it manually against ``stdin`` or a file with ``kunit.py parse``.)
+3. Running the UML kernel directly can often reveal issues or error messages
+   ``kunit_tool`` ignores. This should be as simple as running ``./vmlinux``
+ after building the UML kernel (for example, by using ``kunit.py build``).
+ Note that UML has some unusual requirements (such as the host having a tmpfs
+ filesystem mounted), and has had issues in the past when built statically and
+ the host has KASLR enabled. (On older host kernels, you may need to run
+ ``setarch `uname -m` -R ./vmlinux`` to disable KASLR.)
4. Make sure the kernel .config has ``CONFIG_KUNIT=y`` and at least one test
(e.g. ``CONFIG_KUNIT_EXAMPLE_TEST=y``). kunit_tool will keep its .config
around, so you can see what config was used after running ``kunit.py run``.
@@ -96,8 +98,7 @@ things to try.
seeing. When tests are built-in, they will execute when the kernel boots, and
modules will automatically execute associated tests when loaded. Test results
can be collected from ``/sys/kernel/debug/kunit/<test suite>/results``, and
- can be parsed with ``kunit.py parse``. For more details, see "KUnit on
- non-UML architectures" in Documentation/dev-tools/kunit/usage.rst.
+ can be parsed with ``kunit.py parse``. For more details, see :ref:`kunit-on-qemu`.
If none of the above tricks help, you are always welcome to email any issues to
kunit-dev@googlegroups.com.
diff --git a/Documentation/dev-tools/kunit/index.rst b/Documentation/dev-tools/kunit/index.rst
index cacb35ec658d..b3593ae29ace 100644
--- a/Documentation/dev-tools/kunit/index.rst
+++ b/Documentation/dev-tools/kunit/index.rst
@@ -1,97 +1,109 @@
.. SPDX-License-Identifier: GPL-2.0
-=========================================
-KUnit - Unit Testing for the Linux Kernel
-=========================================
+=================================
+KUnit - Linux Kernel Unit Testing
+=================================
.. toctree::
:maxdepth: 2
+ :caption: Contents:
start
+ architecture
+ run_wrapper
+ run_manual
usage
- kunit-tool
api/index
style
faq
- tips
running_tips
-What is KUnit?
-==============
-
-KUnit is a lightweight unit testing and mocking framework for the Linux kernel.
-
-KUnit is heavily inspired by JUnit, Python's unittest.mock, and
-Googletest/Googlemock for C++. KUnit provides facilities for defining unit test
-cases, grouping related test cases into test suites, providing common
-infrastructure for running tests, and much more.
-
-KUnit consists of a kernel component, which provides a set of macros for easily
-writing unit tests. Tests written against KUnit will run on kernel boot if
-built-in, or when loaded if built as a module. These tests write out results to
-the kernel log in `TAP <https://testanything.org/>`_ format.
-
-To make running these tests (and reading the results) easier, KUnit offers
-:doc:`kunit_tool <kunit-tool>`, which builds a `User Mode Linux
-<http://user-mode-linux.sourceforge.net>`_ kernel, runs it, and parses the test
-results. This provides a quick way of running KUnit tests during development,
-without requiring a virtual machine or separate hardware.
-
-Get started now: Documentation/dev-tools/kunit/start.rst
-
-Why KUnit?
-==========
-
-A unit test is supposed to test a single unit of code in isolation, hence the
-name. A unit test should be the finest granularity of testing and as such should
-allow all possible code paths to be tested in the code under test; this is only
-possible if the code under test is very small and does not have any external
-dependencies outside of the test's control like hardware.
-
-KUnit provides a common framework for unit tests within the kernel.
-
-KUnit tests can be run on most architectures, and most tests are architecture
-independent. All built-in KUnit tests run on kernel startup. Alternatively,
-KUnit and KUnit tests can be built as modules and tests will run when the test
-module is loaded.
-
-.. note::
-
- KUnit can also run tests without needing a virtual machine or actual
- hardware under User Mode Linux. User Mode Linux is a Linux architecture,
- like ARM or x86, which compiles the kernel as a Linux executable. KUnit
- can be used with UML either by building with ``ARCH=um`` (like any other
- architecture), or by using :doc:`kunit_tool <kunit-tool>`.
-
-KUnit is fast. Excluding build time, from invocation to completion KUnit can run
-several dozen tests in only 10 to 20 seconds; this might not sound like a big
-deal to some people, but having such fast and easy to run tests fundamentally
-changes the way you go about testing and even writing code in the first place.
-Linus himself said in his `git talk at Google
-<https://gist.github.com/lorn/1272686/revisions#diff-53c65572127855f1b003db4064a94573R874>`_:
-
- "... a lot of people seem to think that performance is about doing the
- same thing, just doing it faster, and that is not true. That is not what
- performance is all about. If you can do something really fast, really
- well, people will start using it differently."
-
-In this context Linus was talking about branching and merging,
-but this point also applies to testing. If your tests are slow, unreliable, are
-difficult to write, and require a special setup or special hardware to run,
-then you wait a lot longer to write tests, and you wait a lot longer to run
-tests; this means that tests are likely to break, unlikely to test a lot of
-things, and are unlikely to be rerun once they pass. If your tests are really
-fast, you run them all the time, every time you make a change, and every time
-someone sends you some code. Why trust that someone ran all their tests
-correctly on every change when you can just run them yourself in less time than
-it takes to read their test log?
+This section details the kernel unit testing framework.
+
+Introduction
+============
+
+KUnit (Kernel unit testing framework) provides a common framework for
+unit tests within the Linux kernel. Using KUnit, you can define groups
+of test cases called test suites. The tests either run on kernel boot
+if built-in, or load as a module. KUnit automatically flags and reports
+failed test cases in the kernel log. The test results appear in
+:doc:`KTAP (Kernel - Test Anything Protocol) format</dev-tools/ktap>`.
+KUnit is inspired by JUnit, Python's unittest.mock, and GoogleTest/GoogleMock
+(C++ unit testing frameworks).
+
+KUnit tests are part of the kernel, written in the C programming
+language, and test parts of the kernel implementation (for example: a C
+language function). Excluding build time, from invocation to
+completion, KUnit can run around 100 tests in less than 10 seconds.
+KUnit can test any kernel component, for example: file systems, system
+calls, memory management, device drivers and so on.
+
+KUnit follows the white-box testing approach. The test has access to
+internal system functionality. KUnit runs in kernel space and is not
+restricted to things exposed to user-space.
+
+In addition, KUnit has kunit_tool, a script (``tools/testing/kunit/kunit.py``)
+that configures the Linux kernel, runs KUnit tests under QEMU or UML
+(:doc:`User Mode Linux </virt/uml/user_mode_linux_howto_v2>`),
+parses the test results and
+displays them in a user-friendly manner.
+
+Features
+--------
+
+- Provides a framework for writing unit tests.
+- Runs tests on any kernel architecture.
+- Runs a test in milliseconds.
+
+Prerequisites
+-------------
+
+- Any Linux kernel compatible hardware.
+- For the kernel under test, Linux kernel version 5.5 or greater.
+
+Unit Testing
+============
+
+A unit test tests a single unit of code in isolation. A unit test is the finest
+granularity of testing and allows all possible code paths to be tested in the
+code under test. This is possible if the code under test is small and does not
+have any external dependencies outside of the test's control like hardware.
+
+
+Write Unit Tests
+----------------
+
+To write good unit tests, there is a simple but powerful pattern:
+Arrange-Act-Assert. This is a great way to structure test cases and
+defines an order of operations (see the sketch after this list).
+
+- Arrange inputs and targets: At the start of the test, arrange the data
+  that allows a function to work (for example: initialize variables or
+  objects).
+- Act on the target behavior: Call your function/code under test.
+- Assert expected outcome: Verify that the result (or resulting state) is as
+ expected.
+
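+A minimal sketch of this pattern in a KUnit test (``add()`` is a
+hypothetical function under test):
+
+.. code-block:: c
+
+    static void add_test_basic(struct kunit *test)
+    {
+            /* Arrange: set up the inputs. */
+            int a = 2, b = 3;
+
+            /* Act: call the code under test. */
+            int sum = add(a, b);
+
+            /* Assert: verify the outcome. */
+            KUNIT_EXPECT_EQ(test, sum, 5);
+    }
+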
+Unit Testing Advantages
+-----------------------
+
+- Speeds up testing and development in the long run.
+- Detects bugs at an early stage and therefore decreases the cost of
+  fixing bugs compared to acceptance testing.
+- Improves code quality.
+- Encourages writing testable code.
+
+Read also :ref:`kinds-of-tests`.
How do I use it?
================
-* Documentation/dev-tools/kunit/start.rst - for new users of KUnit
-* Documentation/dev-tools/kunit/tips.rst - for short examples of best practices
-* Documentation/dev-tools/kunit/usage.rst - for a more detailed explanation of KUnit features
-* Documentation/dev-tools/kunit/api/index.rst - for the list of KUnit APIs used for testing
-* Documentation/dev-tools/kunit/kunit-tool.rst - for more information on the kunit_tool helper script
-* Documentation/dev-tools/kunit/faq.rst - for answers to some common questions about KUnit
+You can find a step-by-step guide to writing and running KUnit tests in
+Documentation/dev-tools/kunit/start.rst
+
+Alternatively, feel free to look through the rest of the KUnit documentation,
+or to experiment with tools/testing/kunit/kunit.py and the example test under
+lib/kunit/kunit-example-test.c
+
+Happy testing!
diff --git a/Documentation/dev-tools/kunit/kunit-tool.rst b/Documentation/dev-tools/kunit/kunit-tool.rst
deleted file mode 100644
index ae52e0f489f9..000000000000
--- a/Documentation/dev-tools/kunit/kunit-tool.rst
+++ /dev/null
@@ -1,232 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-=================
-kunit_tool How-To
-=================
-
-What is kunit_tool?
-===================
-
-kunit_tool is a script (``tools/testing/kunit/kunit.py``) that aids in building
-the Linux kernel as UML (`User Mode Linux
-<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
-the test results and displaying them in a user friendly manner.
-
-kunit_tool addresses the problem of being able to run tests without needing a
-virtual machine or actual hardware with User Mode Linux. User Mode Linux is a
-Linux architecture, like ARM or x86; however, unlike other architectures it
-compiles the kernel as a standalone Linux executable that can be run like any
-other program directly inside of a host operating system. To be clear, it does
-not require any virtualization support: it is just a regular program.
-
-What is a .kunitconfig?
-=======================
-
-It's just a defconfig that kunit_tool looks for in the build directory
-(``.kunit`` by default). kunit_tool uses it to generate a .config as you might
-expect. In addition, it verifies that the generated .config contains the CONFIG
-options in the .kunitconfig; the reason it does this is so that it is easy to
-be sure that a CONFIG that enables a test actually ends up in the .config.
-
-It's also possible to pass a separate .kunitconfig fragment to kunit_tool,
-which is useful if you have several different groups of tests you wish
-to run independently, or if you want to use pre-defined test configs for
-certain subsystems.
-
-Getting Started with kunit_tool
-===============================
-
-If a kunitconfig is present at the root directory, all you have to do is:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run
-
-However, you most likely want to use it with the following options:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
-
-- ``--timeout`` sets a maximum amount of time to allow tests to run.
-- ``--jobs`` sets the number of threads to use to build the kernel.
-
-.. note::
- This command will work even without a .kunitconfig file: if no
- .kunitconfig is present, a default one will be used instead.
-
-If you wish to use a different .kunitconfig file (such as one provided for
-testing a particular subsystem), you can pass it as an option.
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run --kunitconfig=fs/ext4/.kunitconfig
-
-For a list of all the flags supported by kunit_tool, you can run:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run --help
-
-Configuring, Building, and Running Tests
-========================================
-
-It's also possible to run just parts of the KUnit build process independently,
-which is useful if you want to make manual changes to part of the process.
-
-A .config can be generated from a .kunitconfig by using the ``config`` argument
-when running kunit_tool:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py config
-
-Similarly, if you just want to build a KUnit kernel from the current .config,
-you can use the ``build`` argument:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py build
-
-And, if you already have a built UML kernel with built-in KUnit tests, you can
-run the kernel and display the test results with the ``exec`` argument:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py exec
-
-The ``run`` command which is discussed above is equivalent to running all three
-of these in sequence.
-
-All of these commands accept a number of optional command-line arguments. The
-``--help`` flag will give a complete list of these, or keep reading this page
-for a guide to some of the more useful ones.
-
-Parsing Test Results
-====================
-
-KUnit tests output their results in TAP (Test Anything Protocol) format.
-kunit_tool will, when running tests, parse this output and print a summary
-which is much more pleasant to read. If you wish to look at the raw test
-results in TAP format, you can pass the ``--raw_output`` argument.
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run --raw_output
-
-The raw output from test runs may contain other, non-KUnit kernel log
-lines. You can see just KUnit output with ``--raw_output=kunit``:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run --raw_output=kunit
-
-If you have KUnit results in their raw TAP format, you can parse them and print
-the human-readable summary with the ``parse`` command for kunit_tool. This
-accepts a filename for an argument, or will read from standard input.
-
-.. code-block:: bash
-
- # Reading from a file
- ./tools/testing/kunit/kunit.py parse /var/log/dmesg
- # Reading from stdin
- dmesg | ./tools/testing/kunit/kunit.py parse
-
-This is very useful if you wish to run tests in a configuration not supported
-by kunit_tool (such as on real hardware, or an unsupported architecture).
-
-Filtering Tests
-===============
-
-It's possible to run only a subset of the tests built into a kernel by passing
-a filter to the ``exec`` or ``run`` commands. For example, if you only wanted
-to run KUnit resource tests, you could use:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run 'kunit-resource*'
-
-This uses the standard glob format for wildcards.
-
-Running Tests on QEMU
-=====================
-
-kunit_tool supports running tests on QEMU as well as via UML (as mentioned
-elsewhere). The default way of running tests on QEMU requires two flags:
-
-``--arch``
- Selects a collection of configs (Kconfig as well as QEMU configs
- options, etc) that allow KUnit tests to be run on the specified
- architecture in a minimal way; this is usually not much slower than
- using UML. The architecture argument is the same as the name of the
- option passed to the ``ARCH`` variable used by Kbuild. Not all
- architectures are currently supported by this flag, but can be handled
- by the ``--qemu_config`` discussed later. If ``um`` is passed (or this
- this flag is ignored) the tests will run via UML. Non-UML architectures,
- e.g. i386, x86_64, arm, um, etc. Non-UML run on QEMU.
-
-``--cross_compile``
- Specifies the use of a toolchain by Kbuild. The argument passed here is
- the same passed to the ``CROSS_COMPILE`` variable used by Kbuild. As a
- reminder this will be the prefix for the toolchain binaries such as gcc
- for example ``sparc64-linux-gnu-`` if you have the sparc toolchain
- installed on your system, or
- ``$HOME/toolchains/microblaze/gcc-9.2.0-nolibc/microblaze-linux/bin/microblaze-linux-``
- if you have downloaded the microblaze toolchain from the 0-day website
- to a directory in your home directory called ``toolchains``.
-
-In many cases it is likely that you may want to run an architecture which is
-not supported by the ``--arch`` flag, or you may want to just run KUnit tests
-on QEMU using a non-default configuration. For this use case, you can write
-your own QemuConfig. These QemuConfigs are written in Python. They must have an
-import line ``from ..qemu_config import QemuArchParams`` at the top of the file
-and the file must contain a variable called ``QEMU_ARCH`` that has an instance
-of ``QemuArchParams`` assigned to it. An example can be seen in
-``tools/testing/kunit/qemu_configs/x86_64.py``.
-
-Once you have a QemuConfig you can pass it into kunit_tool using the
-``--qemu_config`` flag; when used this flag replaces the ``--arch`` flag. If we
-were to do this with the ``x86_64.py`` example from above, the invocation would
-look something like this:
-
-.. code-block:: bash
-
- ./tools/testing/kunit/kunit.py run \
- --timeout=60 \
- --jobs=12 \
- --qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
-
-Other Useful Options
-====================
-
-kunit_tool has a number of other command-line arguments which can be useful
-when adapting it to fit your environment or needs.
-
-Some of the more useful ones are:
-
-``--help``
- Lists all of the available options. Note that different commands
- (``config``, ``build``, ``run``, etc) will have different supported
- options. Place ``--help`` before the command to list common options,
- and after the command for options specific to that command.
-
-``--build_dir``
- Specifies the build directory that kunit_tool will use. This is where
- the .kunitconfig file is located, as well as where the .config and
- compiled kernel will be placed. Defaults to ``.kunit``.
-
-``--make_options``
- Specifies additional options to pass to ``make`` when compiling a
- kernel (with the ``build`` or ``run`` commands). For example, to enable
- compiler warnings, you can pass ``--make_options W=1``.
-
-``--alltests``
- Builds a UML kernel with all config options enabled using ``make
- allyesconfig``. This allows you to run as many tests as is possible,
- but is very slow and prone to breakage as new options are added or
- modified. In most cases, enabling all tests which have satisfied
- dependencies by adding ``CONFIG_KUNIT_ALL_TESTS=1`` to your
- .kunitconfig is preferable.
-
-There are several other options (and new ones are often added), so do check
-``--help`` if you're looking for something not mentioned here.
diff --git a/Documentation/dev-tools/kunit/kunit_suitememorydiagram.svg b/Documentation/dev-tools/kunit/kunit_suitememorydiagram.svg
new file mode 100644
index 000000000000..cf8fddc27500
--- /dev/null
+++ b/Documentation/dev-tools/kunit/kunit_suitememorydiagram.svg
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg width="796.93" height="555.73" version="1.1" viewBox="0 0 796.93 555.73" xmlns="http://www.w3.org/2000/svg">
+ <g transform="translate(-13.724 -17.943)">
+ <g fill="#dad4d4" fill-opacity=".91765" stroke="#1a1a1a">
+ <rect x="323.56" y="18.443" width="115.75" height="41.331"/>
+ <rect x="323.56" y="463.09" width="115.75" height="41.331"/>
+ <rect x="323.56" y="531.84" width="115.75" height="41.331"/>
+ <rect x="323.56" y="88.931" width="115.75" height="74.231"/>
+ </g>
+ <g>
+ <rect x="323.56" y="421.76" width="115.75" height="41.331" fill="#b9dbc6" stroke="#1a1a1a"/>
+ <text x="328.00888" y="446.61826" fill="#000000" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="328.00888" y="446.61826" font-family="monospace" font-size="16px">kunit_suite</tspan></text>
+ </g>
+ <g transform="translate(0 -258.6)">
+ <rect x="323.56" y="421.76" width="115.75" height="41.331" fill="#b9dbc6" stroke="#1a1a1a"/>
+ <text x="328.00888" y="446.61826" fill="#000000" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="328.00888" y="446.61826" font-family="monospace" font-size="16px">kunit_suite</tspan></text>
+ </g>
+ <g transform="translate(0 -217.27)">
+ <rect x="323.56" y="421.76" width="115.75" height="41.331" fill="#b9dbc6" stroke="#1a1a1a"/>
+ <text x="328.00888" y="446.61826" fill="#000000" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="328.00888" y="446.61826" font-family="monospace" font-size="16px">kunit_suite</tspan></text>
+ </g>
+ <g transform="translate(0 -175.94)">
+ <rect x="323.56" y="421.76" width="115.75" height="41.331" fill="#b9dbc6" stroke="#1a1a1a"/>
+ <text x="328.00888" y="446.61826" fill="#000000" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="328.00888" y="446.61826" font-family="monospace" font-size="16px">kunit_suite</tspan></text>
+ </g>
+ <g transform="translate(0 -134.61)">
+ <rect x="323.56" y="421.76" width="115.75" height="41.331" fill="#b9dbc6" stroke="#1a1a1a"/>
+ <text x="328.00888" y="446.61826" fill="#000000" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="328.00888" y="446.61826" font-family="monospace" font-size="16px">kunit_suite</tspan></text>
+ </g>
+ <g transform="translate(0 -41.331)">
+ <rect x="323.56" y="421.76" width="115.75" height="41.331" fill="#b9dbc6" stroke="#1a1a1a"/>
+ <text x="328.00888" y="446.61826" fill="#000000" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="328.00888" y="446.61826" font-family="monospace" font-size="16px">kunit_suite</tspan></text>
+ </g>
+ <g transform="translate(3.4459e-5 -.71088)">
+ <rect x="502.19" y="143.16" width="201.13" height="41.331" fill="#dad4d4" fill-opacity=".91765" stroke="#1a1a1a"/>
+ <text x="512.02319" y="168.02026" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="512.02319" y="168.02026" font-family="monospace">_kunit_suites_start</tspan></text>
+ </g>
+ <g transform="translate(3.0518e-5 -3.1753)">
+ <rect x="502.19" y="445.69" width="201.13" height="41.331" fill="#dad4d4" fill-opacity=".91765" stroke="#1a1a1a"/>
+ <text x="521.61694" y="470.54846" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="521.61694" y="470.54846" font-family="monospace">_kunit_suites_end</tspan></text>
+ </g>
+ <rect x="14.224" y="277.78" width="134.47" height="41.331" fill="#dad4d4" fill-opacity=".91765" stroke="#1a1a1a"/>
+ <text x="32.062176" y="304.41287" font-family="sans-serif" font-size="16px" style="line-height:1.25" xml:space="preserve"><tspan x="32.062176" y="304.41287" font-family="monospace">.init.data</tspan></text>
+ <g transform="translate(217.98 145.12)" stroke="#1a1a1a">
+ <circle cx="149.97" cy="373.01" r="3.4012"/>
+ <circle cx="163.46" cy="373.01" r="3.4012"/>
+ <circle cx="176.95" cy="373.01" r="3.4012"/>
+ </g>
+ <g transform="translate(217.98 -298.66)" stroke="#1a1a1a">
+ <circle cx="149.97" cy="373.01" r="3.4012"/>
+ <circle cx="163.46" cy="373.01" r="3.4012"/>
+ <circle cx="176.95" cy="373.01" r="3.4012"/>
+ </g>
+ <g stroke="#1a1a1a">
+ <rect x="323.56" y="328.49" width="115.75" height="51.549" fill="#b9dbc6"/>
+ <g transform="translate(217.98 -18.75)">
+ <circle cx="149.97" cy="373.01" r="3.4012"/>
+ <circle cx="163.46" cy="373.01" r="3.4012"/>
+ <circle cx="176.95" cy="373.01" r="3.4012"/>
+ </g>
+ </g>
+ <g transform="scale(1.0933 .9147)" stroke-width="32.937" aria-label="{">
+ <path d="m275.49 545.57c-35.836-8.432-47.43-24.769-47.957-64.821v-88.536c-0.527-44.795-10.54-57.97-49.538-67.456 38.998-10.013 49.011-23.715 49.538-67.983v-88.536c0.527-40.052 12.121-56.389 47.957-64.821v-5.797c-65.348 0-85.901 17.391-86.955 73.253v93.806c-0.527 36.89-10.013 50.065-44.795 59.551 34.782 10.013 44.268 23.188 44.795 60.078v93.279c1.581 56.389 21.607 73.78 86.955 73.78z"/>
+ </g>
+ <g transform="scale(1.1071 .90325)" stroke-width="14.44" aria-label="{">
+ <path d="m461.46 443.55c-15.711-3.6967-20.794-10.859-21.025-28.418v-38.815c-0.23104-19.639-4.6209-25.415-21.718-29.574 17.097-4.3898 21.487-10.397 21.718-29.805v-38.815c0.23105-17.559 5.314-24.722 21.025-28.418v-2.5415c-28.649 0-37.66 7.6244-38.122 32.115v41.126c-0.23105 16.173-4.3898 21.949-19.639 26.108 15.249 4.3898 19.408 10.166 19.639 26.339v40.895c0.69313 24.722 9.4728 32.346 38.122 32.346z"/>
+ </g>
+ <path d="m449.55 161.84v2.5h49.504v-2.5z" color="#000000" style="-inkscape-stroke:none"/>
+ <g fill-rule="evenodd">
+ <path d="m443.78 163.09 8.65-5v10z" color="#000000" stroke-width="1pt" style="-inkscape-stroke:none"/>
+ <path d="m453.1 156.94-10.648 6.1543 0.99804 0.57812 9.6504 5.5781zm-1.334 2.3125v7.6856l-6.6504-3.8438z" color="#000000" style="-inkscape-stroke:none"/>
+ </g>
+ <path d="m449.55 461.91v2.5h49.504v-2.5z" color="#000000" style="-inkscape-stroke:none"/>
+ <g fill-rule="evenodd">
+ <path d="m443.78 463.16 8.65-5v10z" color="#000000" stroke-width="1pt" style="-inkscape-stroke:none"/>
+ <path d="m453.1 457-10.648 6.1562 0.99804 0.57617 9.6504 5.5781zm-1.334 2.3125v7.6856l-6.6504-3.8438z" color="#000000" style="-inkscape-stroke:none"/>
+ </g>
+ <rect x="515.64" y="223.9" width="294.52" height="178.49" fill="#dad4d4" fill-opacity=".91765" stroke="#1a1a1a"/>
+ <text x="523.33319" y="262.52542" font-family="monospace" font-size="14.667px" style="line-height:1.25" xml:space="preserve"><tspan x="523.33319" y="262.52542"><tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold">struct</tspan> kunit_suite {</tspan><tspan x="523.33319" y="280.8588"><tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold"> const char</tspan> name[<tspan fill="#ff00ff" font-size="14.667px">256</tspan>];</tspan><tspan x="523.33319" y="299.19217"> <tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold">int</tspan> (*init)(<tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold">struct</tspan> kunit *);</tspan><tspan x="523.33319" y="317.52554"> <tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold">void</tspan> (*exit)(<tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold">struct</tspan> kunit *);</tspan><tspan x="523.33319" y="335.85892"> <tspan fill="#008000" font-family="monospace" font-size="14.667px" font-weight="bold">struct</tspan> kunit_case *test_cases;</tspan><tspan x="523.33319" y="354.19229"> ...</tspan><tspan x="523.33319" y="372.52567">};</tspan></text>
+ </g>
+</svg>
diff --git a/Documentation/dev-tools/kunit/run_manual.rst b/Documentation/dev-tools/kunit/run_manual.rst
new file mode 100644
index 000000000000..e7b46421f247
--- /dev/null
+++ b/Documentation/dev-tools/kunit/run_manual.rst
@@ -0,0 +1,57 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+Run Tests without kunit_tool
+============================
+
+If we do not want to use kunit_tool (for example: to integrate
+with other systems, or to run tests on real hardware), we can
+include KUnit in any kernel, read out results, and parse them manually.
+
+.. note:: KUnit is not designed for use in a production system. It is
+ possible that tests may reduce the stability or security of
+ the system.
+
+Configure the Kernel
+====================
+
+KUnit tests can run without kunit_tool. This can be useful if:
+
+- We have an existing kernel configuration to test.
+- We need to run on real hardware (or using an emulator/VM that
+  kunit_tool does not support).
+- We wish to integrate with some existing testing systems.
+
+KUnit is configured with the ``CONFIG_KUNIT`` option, and individual
+tests can also be built by enabling their config options in our
+``.config``. KUnit tests usually (but don't always) have config options
+ending in ``_KUNIT_TEST``. Most tests can either be built as a module,
+or be built into the kernel.
+
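+For example, a ``.config`` fragment which enables KUnit and the example
+test from ``lib/kunit``:
+
+.. code-block:: none
+
+    CONFIG_KUNIT=y
+    CONFIG_KUNIT_EXAMPLE_TEST=y
+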
+.. note::
+
+ We can enable the ``KUNIT_ALL_TESTS`` config option to
+ automatically enable all tests with satisfied dependencies. This is
+ a good way of quickly testing everything applicable to the current
+ config.
+
+Once we have built our kernel (and/or modules), it is simple to run
+the tests. If the tests are built-in, they will run automatically on
+kernel boot. The results will be written to the kernel log (``dmesg``)
+in TAP format.
+
+If the tests are built as modules, they will run when the module is
+loaded.
+
+.. code-block:: bash
+
+ # modprobe example-test
+
+The results will appear in TAP format in ``dmesg``.
+
+.. note::
+
+ If ``CONFIG_KUNIT_DEBUGFS`` is enabled, KUnit test results will
+ be accessible from the ``debugfs`` filesystem (if mounted).
+ They will be in ``/sys/kernel/debug/kunit/<test_suite>/results``, in
+ TAP format.
diff --git a/Documentation/dev-tools/kunit/run_wrapper.rst b/Documentation/dev-tools/kunit/run_wrapper.rst
new file mode 100644
index 000000000000..dafe8eb28d30
--- /dev/null
+++ b/Documentation/dev-tools/kunit/run_wrapper.rst
@@ -0,0 +1,323 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============================
+Running tests with kunit_tool
+=============================
+
+We can either run KUnit tests using kunit_tool, or run tests
+manually and then use kunit_tool to parse the results. To run tests
+manually, see: Documentation/dev-tools/kunit/run_manual.rst.
+As long as we can build the kernel, we can run KUnit.
+
+kunit_tool is a Python script which configures and builds a kernel, runs
+tests, and formats the test results.
+
+To run tests, use the following command:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py run
+
+We should see the following:
+
+.. code-block::
+
+ Configuring KUnit Kernel ...
+ Building KUnit kernel...
+ Starting KUnit kernel...
+
+We may want to use the following options:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
+
+- ``--timeout`` sets a maximum amount of time for tests to run.
+- ``--jobs`` sets the number of jobs (threads) used to build the kernel.
+
+If no ``.kunitconfig`` file exists in the build directory, kunit_tool
+will generate one with a default configuration. In addition, it
+verifies that the generated ``.config`` file contains the ``CONFIG``
+options listed in the ``.kunitconfig``.
+It is also possible to pass a separate ``.kunitconfig`` fragment to
+kunit_tool. This is useful if we have several different groups of
+tests we want to run independently, or if we want to use pre-defined
+test configs for certain subsystems.
+
+To use a different ``.kunitconfig`` file (such as one
+provided to test a particular subsystem), pass it as an option:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py run --kunitconfig=fs/ext4/.kunitconfig
+
+To view kunit_tool flags (optional command-line arguments), run:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py run --help
+
+Creating a ``.kunitconfig`` file
+================================
+
+If we want to run a specific set of tests (rather than those listed
+in the KUnit ``defconfig``), we can provide Kconfig options in the
+``.kunitconfig`` file. For the default ``.kunitconfig``, see:
+https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/testing/kunit/configs/default.config.
+A ``.kunitconfig`` is a ``minconfig`` (a ``.config``
+generated by running ``make savedefconfig``), used for running a
+specific set of tests. This file contains the regular kernel configs
+with specific test targets. The ``.kunitconfig`` also
+contains any other config options required by the tests (for example,
+dependencies of the features under test, configs that enable or disable
+certain code blocks, arch configs, and so on).
+
+To create a ``.kunitconfig`` using the KUnit ``defconfig``:
+
+.. code-block::
+
+ cd $PATH_TO_LINUX_REPO
+ cp tools/testing/kunit/configs/default.config .kunit/.kunitconfig
+
+We can then add any other Kconfig options. For example:
+
+.. code-block::
+
+ CONFIG_LIST_KUNIT_TEST=y
+
+kunit_tool ensures that all config options in ``.kunitconfig`` are
+set in the kernel ``.config`` before running the tests. It warns us if
+we have not included the dependencies of the options used.
+
+.. note:: Removing something from the ``.kunitconfig`` will
+ not rebuild the ``.config`` file. The configuration is only
+ updated if the ``.kunitconfig`` is not a subset of ``.config``.
+ This means that we can use other tools
+ (for example, ``make menuconfig``) to adjust other config options.
+ The build directory must be set for ``make menuconfig`` to
+ work; with the default build directory, use ``make O=.kunit menuconfig``.
+
+Configuring, building, and running tests
+========================================
+
+If we want to make manual changes to the KUnit build process, we
+can run part of the KUnit build process independently.
+When running kunit_tool, from a ``.kunitconfig``, we can generate a
+``.config`` by using the ``config`` argument:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py config
+
+To build a KUnit kernel from the current ``.config``, we can use the
+``build`` argument:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py build
+
+If we have already built a UML kernel with built-in KUnit tests, we
+can run the kernel, and display the test results with the ``exec``
+argument:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py exec
+
+The ``run`` command discussed in **Running tests with kunit_tool** is
+equivalent to running the above three commands in sequence.
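+
+That is, the following sequence is roughly what ``kunit.py run`` does:
+
+.. code-block::
+
+	./tools/testing/kunit/kunit.py config
+	./tools/testing/kunit/kunit.py build
+	./tools/testing/kunit/kunit.py exec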
+
+Parsing test results
+====================
+
+KUnit tests output their results in TAP (Test Anything Protocol)
+format. When running tests, kunit_tool parses this output and prints
+a summary. To see the raw test results in TAP format, we can pass the
+``--raw_output`` argument:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py run --raw_output
+
+If we have KUnit results in the raw TAP format, we can parse them and
+print a human-readable summary with kunit_tool's ``parse`` command.
+This accepts a filename as an argument, or reads from standard input.
+
+.. code-block:: bash
+
+ # Reading from a file
+ ./tools/testing/kunit/kunit.py parse /var/log/dmesg
+ # Reading from stdin
+ dmesg | ./tools/testing/kunit/kunit.py parse
+
+Filtering tests
+===============
+
+By passing a bash style glob filter to the ``exec`` or ``run``
+commands, we can run a subset of the tests built into a kernel. For
+example, if we only want to run KUnit resource tests, use:
+
+.. code-block::
+
+ ./tools/testing/kunit/kunit.py run 'kunit-resource*'
+
+This uses the standard glob format with wildcard characters.
+
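+Filters can also match individual test cases within a suite using a
+``suite_glob.test_glob`` pattern. For example (an illustrative pattern):
+
+.. code-block::
+
+	./tools/testing/kunit/kunit.py run 'kunit-resource*.*init*'
+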
+.. _kunit-on-qemu:
+
+Running tests on QEMU
+=====================
+
+kunit_tool supports running tests on qemu as well as
+via UML. To run tests on qemu, two flags are required by default:
+
+- ``--arch``: Selects a collection of configs (Kconfig, qemu config options,
+ and so on) that allows KUnit tests to be run on the specified
+ architecture in a minimal way. The architecture argument is the same as
+ the value passed to the ``ARCH`` variable used by Kbuild.
+ Not all architectures currently support this flag, but we can use
+ ``--qemu_config`` to handle it. If ``um`` is passed (or this flag
+ is omitted), the tests will run via UML. Non-UML architectures
+ (for example, i386, x86_64, arm, and so on) run on qemu.
+
+- ``--cross_compile``: Specifies the Kbuild toolchain. It takes the
+ same argument as the ``CROSS_COMPILE`` variable used by
+ Kbuild. As a reminder, this will be the prefix for the toolchain
+ binaries such as GCC. For example:
+
+ - ``sparc64-linux-gnu-`` if we have the sparc toolchain installed on
+ our system.
+
+ - ``$HOME/toolchains/microblaze/gcc-9.2.0-nolibc/microblaze-linux/bin/microblaze-linux``
+ if we have downloaded the microblaze toolchain from the 0-day
+ website to a directory in our home directory called toolchains.
+
+This means that for most architectures, running under qemu is as simple as:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run --arch=x86_64
+
+When cross-compiling, we'll likely need to specify a different toolchain, for
+example:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run \
+ --arch=s390 \
+ --cross_compile=s390x-linux-gnu-
+
+If we want to run KUnit tests on an architecture not supported by
+the ``--arch`` flag, or want to run KUnit tests on qemu using a
+non-default configuration, we can write our own ``QemuConfig``.
+These ``QemuConfigs`` are written in Python. They have an import line
+``from ..qemu_config import QemuArchParams`` at the top of the file.
+The file must contain a variable called ``QEMU_ARCH`` that has an
+instance of ``QemuArchParams`` assigned to it. See example in:
+``tools/testing/kunit/qemu_configs/x86_64.py``.
+
+Once we have a ``QemuConfig``, we can pass it into kunit_tool,
+using the ``--qemu_config`` flag. When used, this flag replaces the
+``--arch`` flag. For example, using
+``tools/testing/kunit/qemu_configs/x86_64.py``, the invocation would
+appear as:
+
+.. code-block:: bash
+
+ ./tools/testing/kunit/kunit.py run \
+ --timeout=60 \
+ --jobs=12 \
+ --qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
+
+Command-line arguments
+======================
+
+kunit_tool has a number of other command-line arguments which can
+be useful for our test environment. Below are the most commonly used
+command line arguments:
+
+- ``--help``: Lists all available options. To list common options,
+ place ``--help`` before the command. To list options specific to that
+ command, place ``--help`` after the command.
+
+ .. note:: Different commands (``config``, ``build``, ``run``, etc)
+ have different supported options.
+- ``--build_dir``: Specifies the kunit_tool build directory. It contains
+ the ``.kunitconfig`` and ``.config`` files and the compiled kernel.
+
+- ``--make_options``: Specifies additional options to pass to make when
+ compiling a kernel (using the ``build`` or ``run`` commands). For example:
+ to enable compiler warnings, we can pass ``--make_options W=1``.
+
+- ``--alltests``: Enables a predefined set of options in order to build
+ as many tests as possible.
+
+ .. note:: The list of enabled options can be found in
+ ``tools/testing/kunit/configs/all_tests.config``.
+
+ If you only want to enable all tests with otherwise satisfied
+ dependencies, instead add ``CONFIG_KUNIT_ALL_TESTS=y`` to your
+ ``.kunitconfig``.
+
+- ``--kunitconfig``: Specifies the path or the directory of the ``.kunitconfig``
+ file. For example:
+
+ - ``lib/kunit/.kunitconfig`` can be the path of the file.
+
+ - ``lib/kunit`` can be the directory in which the file is located.
+
+ This file is used to build and run with a predefined set of tests
+ and their dependencies. For example, to run tests for a given subsystem.
+
+- ``--kconfig_add``: Specifies additional configuration options to be
+ appended to the ``.kunitconfig`` file. For example:
+
+ .. code-block::
+
+ ./tools/testing/kunit/kunit.py run --kconfig_add CONFIG_KASAN=y
+
+- ``--arch``: Runs tests on the specified architecture. The architecture
+ argument is the same as the Kbuild ``ARCH`` environment variable.
+ For example, i386, x86_64, arm, um, etc. Non-UML architectures run on qemu.
+ The default is ``um``.
+
+- ``--cross_compile``: Specifies the Kbuild toolchain. It takes the
+ same argument as the ``CROSS_COMPILE`` variable used by
+ Kbuild. This will be the prefix for the toolchain
+ binaries such as GCC. For example:
+
+ - ``sparc64-linux-gnu-`` if we have the sparc toolchain installed on
+ our system.
+
+ - ``$HOME/toolchains/microblaze/gcc-9.2.0-nolibc/microblaze-linux/bin/microblaze-linux``
+ if we have downloaded the microblaze toolchain from the 0-day
+ website to a directory in our home directory called toolchains.
+
+- ``--qemu_config``: Specifies the path to a file containing a
+ custom qemu architecture definition. This should be a Python file
+ containing a ``QemuArchParams`` object.
+
+- ``--qemu_args``: Specifies additional qemu arguments, for example, ``-smp 8``.
+
+- ``--jobs``: Specifies the number of jobs (commands) to run simultaneously.
+ By default, this is set to the number of cores on your system.
+
+- ``--timeout``: Specifies the maximum number of seconds allowed for all tests to run.
+ This does not include the time taken to build the tests.
+
+- ``--kernel_args``: Specifies additional kernel command-line arguments. May be repeated.
+
+- ``--run_isolated``: If set, boots the kernel for each individual suite/test.
+ This is useful for debugging a non-hermetic test, one that
+ might pass/fail based on what ran before it.
+
+- ``--raw_output``: If set, generates unformatted output from the kernel. Possible options are:
+
+ - ``all``: To view the full kernel output, use ``--raw_output=all``.
+
+ - ``kunit``: This is the default option and filters to KUnit output. Use ``--raw_output`` or ``--raw_output=kunit``.
+
+- ``--json``: If set, stores the test results in JSON format and prints
+ them to ``stdout``, or saves them to a file if a filename is specified
+ (see the example below).
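+
+For example, a run combining several of these options might look as
+follows (the JSON filename is illustrative):
+
+.. code-block:: bash
+
+	./tools/testing/kunit/kunit.py run \
+		--timeout=60 \
+		--jobs=12 \
+		--json=test_results.json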
diff --git a/Documentation/dev-tools/kunit/running_tips.rst b/Documentation/dev-tools/kunit/running_tips.rst
index 30d2147eb5b5..8e8c493f17d1 100644
--- a/Documentation/dev-tools/kunit/running_tips.rst
+++ b/Documentation/dev-tools/kunit/running_tips.rst
@@ -15,7 +15,7 @@ It can be handy to create a bash function like:
.. code-block:: bash
function run_kunit() {
- ( cd "$(git rev-parse --show-toplevel)" && ./tools/testing/kunit/kunit.py run $@ )
+ ( cd "$(git rev-parse --show-toplevel)" && ./tools/testing/kunit/kunit.py run "$@" )
}
.. note::
@@ -25,8 +25,8 @@ It can be handy to create a bash function like:
Running a subset of tests
-------------------------
-``kunit.py run`` accepts an optional glob argument to filter tests. Currently
-this only matches against suite names, but this may change in the future.
+``kunit.py run`` accepts an optional glob argument to filter tests. The format
+is ``"<suite_glob>[.test_glob]"``.
Say that we wanted to run the sysctl tests, we could do so via:
@@ -35,6 +35,13 @@ Say that we wanted to run the sysctl tests, we could do so via:
$ echo -e 'CONFIG_KUNIT=y\nCONFIG_KUNIT_ALL_TESTS=y' > .kunit/.kunitconfig
$ ./tools/testing/kunit/kunit.py run 'sysctl*'
+We can filter down to just the "write" tests via:
+
+.. code-block:: bash
+
+ $ echo -e 'CONFIG_KUNIT=y\nCONFIG_KUNIT_ALL_TESTS=y' > .kunit/.kunitconfig
+ $ ./tools/testing/kunit/kunit.py run 'sysctl*.*write*'
+
We're paying the cost of building more tests than we need this way, but it's
easier than fiddling with ``.kunitconfig`` files or commenting out
``kunit_suite``'s.
@@ -107,6 +114,7 @@ Instead of enabling ``CONFIG_GCOV_KERNEL=y``, we can set these options:
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
+ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_GCOV=y
@@ -115,8 +123,7 @@ Putting it together into a copy-pastable sequence of commands:
.. code-block:: bash
# Append coverage options to the current config
- $ echo -e "CONFIG_DEBUG_KERNEL=y\nCONFIG_DEBUG_INFO=y\nCONFIG_GCOV=y" >> .kunit/.kunitconfig
- $ ./tools/testing/kunit/kunit.py run
+ $ ./tools/testing/kunit/kunit.py run --kunitconfig=.kunit/ --kunitconfig=tools/testing/kunit/configs/coverage_uml.config
# Extract the coverage information from the build dir (.kunit/)
$ lcov -t "my_kunit_tests" -o coverage.info -c -d .kunit/
diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst
index 1e00f9226f74..c736613c9b19 100644
--- a/Documentation/dev-tools/kunit/start.rst
+++ b/Documentation/dev-tools/kunit/start.rst
@@ -4,132 +4,190 @@
Getting Started
===============
-Installing dependencies
-=======================
-KUnit has the same dependencies as the Linux kernel. As long as you can build
-the kernel, you can run KUnit.
+This page gives an overview of kunit_tool and the KUnit framework,
+shows how to run existing tests and then write a simple test case,
+and covers common problems users face when using KUnit for the first time.
-Running tests with the KUnit Wrapper
-====================================
-Included with KUnit is a simple Python wrapper which runs tests under User Mode
-Linux, and formats the test results.
+Installing Dependencies
+=======================
+KUnit has the same dependencies as the Linux kernel. As long as you can
+build the kernel, you can run KUnit.
-The wrapper can be run with:
+Running tests with kunit_tool
+=============================
+kunit_tool is a Python script which configures and builds a kernel, runs
+tests, and formats the test results. From the kernel repository, you
+can run kunit_tool:
.. code-block:: bash
./tools/testing/kunit/kunit.py run
-For more information on this wrapper (also called kunit_tool) check out the
-Documentation/dev-tools/kunit/kunit-tool.rst page.
+.. note ::
+ You may see the following error:
+ "The source tree is not clean, please run 'make ARCH=um mrproper'"
-Creating a .kunitconfig
------------------------
-If you want to run a specific set of tests (rather than those listed in the
-KUnit defconfig), you can provide Kconfig options in the ``.kunitconfig`` file.
-This file essentially contains the regular Kernel config, with the specific
-test targets as well. The ``.kunitconfig`` should also contain any other config
-options required by the tests.
+ This happens because internally kunit.py specifies ``.kunit``
+ (the default option) as the build directory in the command ``make O=output/dir``
+ through the argument ``--build_dir``. Hence, before starting an
+ out-of-tree build, the source tree must be clean.
-A good starting point for a ``.kunitconfig`` is the KUnit defconfig:
+ There is also the same caveat mentioned in the "Build directory for
+ the kernel" section of the :doc:`admin-guide </admin-guide/README>`,
+ that is, once used, it must be used for all invocations of ``make``.
+ The good news is that the error can indeed be solved by running
+ ``make ARCH=um mrproper``; just be aware that this will delete the
+ current configuration and all generated files.
-.. code-block:: bash
+If everything worked correctly, you should see the following:
- cd $PATH_TO_LINUX_REPO
- cp tools/testing/kunit/configs/default.config .kunitconfig
+.. code-block::
-You can then add any other Kconfig options you wish, e.g.:
+ Configuring KUnit Kernel ...
+ Building KUnit Kernel ...
+ Starting KUnit Kernel ...
-.. code-block:: none
+followed by a list of tests that are run, each of which may pass or fail.
- CONFIG_LIST_KUNIT_TEST=y
+.. note ::
+ Because it is building a lot of sources for the first time,
+ the ``Building KUnit Kernel`` step may take a while.
-:doc:`kunit_tool <kunit-tool>` will ensure that all config options set in
-``.kunitconfig`` are set in the kernel ``.config`` before running the tests.
-It'll warn you if you haven't included the dependencies of the options you're
-using.
+For detailed information on this wrapper, see:
+Documentation/dev-tools/kunit/run_wrapper.rst.
-.. note::
- Note that removing something from the ``.kunitconfig`` will not trigger a
- rebuild of the ``.config`` file: the configuration is only updated if the
- ``.kunitconfig`` is not a subset of ``.config``. This means that you can use
- other tools (such as make menuconfig) to adjust other config options.
+Selecting which tests to run
+----------------------------
+By default, kunit_tool runs all tests reachable with minimal configuration,
+that is, using default values for most of the kconfig options. However,
+you can select which tests to run by:
-Running the tests (KUnit Wrapper)
----------------------------------
+- `Customizing Kconfig`_ used to compile the kernel, or
+- `Filtering tests by name`_ to select specifically which compiled tests to run.
-To make sure that everything is set up correctly, simply invoke the Python
-wrapper from your kernel repo:
+Customizing Kconfig
+~~~~~~~~~~~~~~~~~~~
+A good starting point for the ``.kunitconfig`` is the KUnit default config.
+If you have not run ``kunit.py run`` yet, you can generate it by running:
.. code-block:: bash
- ./tools/testing/kunit/kunit.py run
+ cd $PATH_TO_LINUX_REPO
+ tools/testing/kunit/kunit.py config
+ cat .kunit/.kunitconfig
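+
+At the time of writing, the default config contains something like:
+
+.. code-block:: none
+
+	CONFIG_KUNIT=y
+	CONFIG_KUNIT_TEST=y
+	CONFIG_KUNIT_EXAMPLE_TEST=y
+	CONFIG_KUNIT_ALL_TESTS=y
+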
-.. note::
- You may want to run ``make mrproper`` first.
+.. note ::
+ ``.kunitconfig`` lives in the ``--build_dir`` used by kunit.py, which is
+ ``.kunit`` by default.
-If everything worked correctly, you should see the following:
+Before running the tests, kunit_tool ensures that all config options
+set in ``.kunitconfig`` are set in the kernel ``.config``. It will warn
+you if you have not included dependencies for the options used.
-.. code-block:: bash
+There are many ways to customize the configurations:
- Generating .config ...
- Building KUnit Kernel ...
- Starting KUnit Kernel ...
+a. Edit ``.kunit/.kunitconfig``. The file should contain the list of kconfig
+ options required to run the desired tests, including their dependencies.
+ You may want to remove ``CONFIG_KUNIT_ALL_TESTS`` from the ``.kunitconfig`` as
+ it will enable a number of additional tests that you may not want.
+ If you need to run on an architecture other than UML see :ref:`kunit-on-qemu`.
-followed by a list of tests that are run. All of them should be passing.
+b. Enable additional kconfig options on top of ``.kunit/.kunitconfig``.
+ For example, to include the kernel's linked-list test you can run::
-.. note::
- Because it is building a lot of sources for the first time, the
- ``Building KUnit kernel`` step may take a while.
+ ./tools/testing/kunit/kunit.py run \
+ --kconfig_add CONFIG_LIST_KUNIT_TEST=y
-Running tests without the KUnit Wrapper
-=======================================
+c. Provide the path of one or more .kunitconfig files from the tree.
+ For example, to run only ``FAT_FS`` and ``EXT4`` tests you can run::
-If you'd rather not use the KUnit Wrapper (if, for example, you need to
-integrate with other systems, or use an architecture other than UML), KUnit can
-be included in any kernel, and the results read out and parsed manually.
+ ./tools/testing/kunit/kunit.py run \
+ --kunitconfig ./fs/fat/.kunitconfig \
+ --kunitconfig ./fs/ext4/.kunitconfig
-.. note::
- KUnit is not designed for use in a production system, and it's possible that
- tests may reduce the stability or security of the system.
+d. If you change the ``.kunitconfig``, kunit.py will trigger a rebuild of the
+ ``.config`` file. But you can edit the ``.config`` file directly or with
+ tools like ``make menuconfig O=.kunit``. As long as it is a superset of
+ ``.kunitconfig``, kunit.py won't overwrite your changes.
+.. note ::
-Configuring the kernel
-----------------------
+ To save a .kunitconfig after finding a satisfactory configuration::
-In order to enable KUnit itself, you simply need to enable the ``CONFIG_KUNIT``
-Kconfig option (it's under Kernel Hacking/Kernel Testing and Coverage in
-menuconfig). From there, you can enable any KUnit tests you want: they usually
-have config options ending in ``_KUNIT_TEST``.
+ make savedefconfig O=.kunit
+ cp .kunit/defconfig .kunit/.kunitconfig
-KUnit and KUnit tests can be compiled as modules: in this case the tests in a
-module will be run when the module is loaded.
+Filtering tests by name
+~~~~~~~~~~~~~~~~~~~~~~~
+If you want to be more selective than Kconfig allows, it is also possible
+to select which tests to execute at boot-time by passing a glob filter
+(see the manpage :manpage:`glob(7)` for the pattern syntax).
+If there is a ``"."`` (period) in the filter, it will be interpreted as a
+separator between the name of the test suite and the test case;
+otherwise, it will be interpreted as the name of the test suite.
+For example, let's assume we are using the default config:
+a. provide the name of a test suite, like ``"kunit_executor_test"``,
+ to run every test case it contains::
-Running the tests (w/o KUnit Wrapper)
--------------------------------------
+ ./tools/testing/kunit/kunit.py run "kunit_executor_test"
+
+b. provide the name of a test case prefixed by its test suite,
+ like ``"example.example_simple_test"``, to run specifically that test case::
+
+ ./tools/testing/kunit/kunit.py run "example.example_simple_test"
+
+c. use wildcard characters (``*?[``) to run any test case that matches the pattern,
+ like ``"*.*64*"`` to run test cases containing ``"64"`` in the name inside
+ any test suite::
+
+ ./tools/testing/kunit/kunit.py run "*.*64*"
+
+Running Tests without the KUnit Wrapper
+=======================================
+If you do not want to use the KUnit Wrapper (for example, if you want
+code under test to integrate with other systems, or to use a different
+or unsupported architecture or configuration), KUnit can be included in
+any kernel, and the results can be read out and parsed manually.
+
+.. note ::
+ ``CONFIG_KUNIT`` should not be enabled in a production environment.
+ Enabling KUnit disables Kernel Address-Space Layout Randomization
+ (KASLR), and tests may affect the state of the kernel in ways not
+ suitable for production.
+
+Configuring the Kernel
+----------------------
+To enable KUnit itself, you need to enable the ``CONFIG_KUNIT`` Kconfig
+option (under Kernel Hacking/Kernel Testing and Coverage in
+``menuconfig``). From there, you can enable any KUnit tests. They
+usually have config options ending in ``_KUNIT_TEST``.
-Build and run your kernel as usual. Test output will be written to the kernel
-log in `TAP <https://testanything.org/>`_ format.
+KUnit and KUnit tests can be compiled as modules. The tests in a module
+will run when the module is loaded.
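+
+For example, to build KUnit itself and the linked-list test as modules,
+the configuration could include (a minimal sketch):
+
+.. code-block:: none
+
+	CONFIG_KUNIT=m
+	CONFIG_LIST_KUNIT_TEST=m
+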
-.. note::
- It's possible that there will be other lines and/or data interspersed in the
- TAP output.
+Running Tests (without KUnit Wrapper)
+-------------------------------------
+Build and run your kernel. In the kernel log, the test output is printed
+out in the TAP format. This will only happen by default if KUnit/tests
+are built-in. Otherwise, the module will need to be loaded.
+.. note ::
+ Some lines and/or data may get interspersed in the TAP output.
-Writing your first test
+Writing Your First Test
=======================
+In your kernel repository, let's add some code that we can test.
-In your kernel repo let's add some code that we can test. Create a file
-``drivers/misc/example.h`` with the contents:
+1. Create a file ``drivers/misc/example.h``, which includes:
.. code-block:: c
int misc_example_add(int left, int right);
-create a file ``drivers/misc/example.c``:
+2. Create a file ``drivers/misc/example.c``, which includes:
.. code-block:: c
@@ -142,21 +200,22 @@ create a file ``drivers/misc/example.c``:
return left + right;
}
-Now add the following lines to ``drivers/misc/Kconfig``:
+3. Add the following lines to ``drivers/misc/Kconfig``:
.. code-block:: kconfig
config MISC_EXAMPLE
bool "My example"
-and the following lines to ``drivers/misc/Makefile``:
+4. Add the following lines to ``drivers/misc/Makefile``:
.. code-block:: make
obj-$(CONFIG_MISC_EXAMPLE) += example.o
-Now we are ready to write the test. The test will be in
-``drivers/misc/example-test.c``:
+Now we are ready to write the test cases.
+
+1. Add the following test case to ``drivers/misc/example_test.c``:
.. code-block:: c
@@ -191,7 +250,7 @@ Now we are ready to write the test. The test will be in
};
kunit_test_suite(misc_example_test_suite);
-Now add the following to ``drivers/misc/Kconfig``:
+2. Add the following lines to ``drivers/misc/Kconfig``:
.. code-block:: kconfig
@@ -200,20 +259,20 @@ Now add the following to ``drivers/misc/Kconfig``:
depends on MISC_EXAMPLE && KUNIT=y
default KUNIT_ALL_TESTS
-and the following to ``drivers/misc/Makefile``:
+3. Add the following lines to ``drivers/misc/Makefile``:
.. code-block:: make
- obj-$(CONFIG_MISC_EXAMPLE_TEST) += example-test.o
+ obj-$(CONFIG_MISC_EXAMPLE_TEST) += example_test.o
-Now add it to your ``.kunitconfig``:
+4. Add the following lines to ``.kunit/.kunitconfig``:
.. code-block:: none
CONFIG_MISC_EXAMPLE=y
CONFIG_MISC_EXAMPLE_TEST=y
-Now you can run the test:
+5. Run the test:
.. code-block:: bash
@@ -227,16 +286,19 @@ You should see the following failure:
[16:08:57] [PASSED] misc-example:misc_example_add_test_basic
[16:08:57] [FAILED] misc-example:misc_example_test_failure
[16:08:57] EXPECTATION FAILED at drivers/misc/example-test.c:17
- [16:08:57] This test never passes.
+ [16:08:57] This test never passes.
...
-Congrats! You just wrote your first KUnit test!
+Congrats! You just wrote your first KUnit test.
Next Steps
==========
-* Check out the Documentation/dev-tools/kunit/tips.rst page for tips on
- writing idiomatic KUnit tests.
-* Check out the :doc:`running_tips` page for tips on
- how to make running KUnit tests easier.
-* Optional: see the :doc:`usage` page for a more
- in-depth explanation of KUnit.
+
+If you're interested in using some of the more advanced features of kunit.py,
+take a look at Documentation/dev-tools/kunit/run_wrapper.rst
+
+If you'd like to run tests without using kunit.py, check out
+Documentation/dev-tools/kunit/run_manual.rst
+
+For more information on writing KUnit tests (including some common techniques
+for testing different things), see Documentation/dev-tools/kunit/usage.rst
diff --git a/Documentation/dev-tools/kunit/style.rst b/Documentation/dev-tools/kunit/style.rst
index 8dbcdc552606..b6d0d7359f00 100644
--- a/Documentation/dev-tools/kunit/style.rst
+++ b/Documentation/dev-tools/kunit/style.rst
@@ -4,37 +4,36 @@
Test Style and Nomenclature
===========================
-To make finding, writing, and using KUnit tests as simple as possible, it's
+To make finding, writing, and using KUnit tests as simple as possible, it is
strongly encouraged that they are named and written according to the guidelines
-below. While it's possible to write KUnit tests which do not follow these rules,
+below. While it is possible to write KUnit tests which do not follow these rules,
they may break some tooling, may conflict with other tests, and may not be run
automatically by testing systems.
-It's recommended that you only deviate from these guidelines when:
+It is recommended that you only deviate from these guidelines when:
-1. Porting tests to KUnit which are already known with an existing name, or
-2. Writing tests which would cause serious problems if automatically run (e.g.,
- non-deterministically producing false positives or negatives, or taking an
- extremely long time to run).
+1. Porting tests to KUnit which are already known under an existing name.
+2. Writing tests which would cause serious problems if automatically run. For
+ example, non-deterministically producing false positives or negatives, or
+ taking a long time to run.
Subsystems, Suites, and Tests
=============================
-In order to make tests as easy to find as possible, they're grouped into suites
-and subsystems. A test suite is a group of tests which test a related area of
-the kernel, and a subsystem is a set of test suites which test different parts
-of the same kernel subsystem or driver.
+To make tests easy to find, they are grouped into suites and subsystems. A test
+suite is a group of tests which test a related area of the kernel. A subsystem
+is a set of test suites which test different parts of a kernel subsystem
+or a driver.
Subsystems
----------
Every test suite must belong to a subsystem. A subsystem is a collection of one
or more KUnit test suites which test the same driver or part of the kernel. A
-rule of thumb is that a test subsystem should match a single kernel module. If
-the code being tested can't be compiled as a module, in many cases the subsystem
-should correspond to a directory in the source tree or an entry in the
-MAINTAINERS file. If unsure, follow the conventions set by tests in similar
-areas.
+test subsystem should match a single kernel module. If the code being tested
+cannot be compiled as a module, in many cases the subsystem should correspond to
+a directory in the source tree or an entry in the ``MAINTAINERS`` file. If
+unsure, follow the conventions set by tests in similar areas.
Test subsystems should be named after the code being tested, either after the
module (wherever possible), or after the directory or files being tested. Test
@@ -42,9 +41,8 @@ subsystems should be named to avoid ambiguity where necessary.
If a test subsystem name has multiple components, they should be separated by
underscores. *Do not* include "test" or "kunit" directly in the subsystem name
-unless you are actually testing other tests or the kunit framework itself.
-
-Example subsystems could be:
+unless we are actually testing other tests or the kunit framework itself. For
+example, subsystems could be called:
``ext4``
Matches the module and filesystem name.
@@ -56,48 +54,46 @@ Example subsystems could be:
Has several components (``snd``, ``hda``, ``codec``, ``hdmi``) separated by
underscores. Matches the module name.
-Avoid names like these:
+Avoid names like the examples below:
``linear-ranges``
Names should use underscores, not dashes, to separate words. Prefer
``linear_ranges``.
``qos-kunit-test``
- As well as using underscores, this name should not have "kunit-test" as a
- suffix, and ``qos`` is ambiguous as a subsystem name. ``power_qos`` would be a
- better name.
+ This name should use underscores, and not have "kunit-test" as a
+ suffix. ``qos`` is also ambiguous as a subsystem name, because several parts
+ of the kernel have a ``qos`` subsystem. ``power_qos`` would be a better name.
``pc_parallel_port``
The corresponding module name is ``parport_pc``, so this subsystem should also
be named ``parport_pc``.
.. note::
- The KUnit API and tools do not explicitly know about subsystems. They're
- simply a way of categorising test suites and naming modules which
- provides a simple, consistent way for humans to find and run tests. This
- may change in the future, though.
+ The KUnit API and tools do not explicitly know about subsystems. They are
+ a way of categorizing test suites and naming modules which provides a
+ simple, consistent way for humans to find and run tests. This may change
+ in the future.
Suites
------
KUnit tests are grouped into test suites, which cover a specific area of
-functionality being tested. Test suites can have shared initialisation and
-shutdown code which is run for all tests in the suite.
-Not all subsystems will need to be split into multiple test suites (e.g. simple drivers).
+functionality being tested. Test suites can have shared initialization and
+shutdown code which is run for all tests in the suite. Not all subsystems need
+to be split into multiple test suites (for example, simple drivers).
Test suites are named after the subsystem they are part of. If a subsystem
contains several suites, the specific area under test should be appended to the
subsystem name, separated by an underscore.
In the event that there are multiple types of test using KUnit within a
-subsystem (e.g., both unit tests and integration tests), they should be put into
-separate suites, with the type of test as the last element in the suite name.
-Unless these tests are actually present, avoid using ``_test``, ``_unittest`` or
-similar in the suite name.
+subsystem (for example, both unit tests and integration tests), they should be
+put into separate suites, with the type of test as the last element in the suite
+name. Unless these tests are actually present, avoid using ``_test``, ``_unittest``
+or similar in the suite name.
The full test suite name (including the subsystem name) should be specified as
the ``.name`` member of the ``kunit_suite`` struct, and forms the base for the
-module name (see below).
-
-Example test suites could include:
+module name. For example, test suites could include:
``ext4_inode``
Part of the ``ext4`` subsystem, testing the ``inode`` area.
@@ -109,26 +105,27 @@ Example test suites could include:
The ``kasan`` subsystem has only one suite, so the suite name is the same as
the subsystem name.
-Avoid names like:
+Avoid names such as:
``ext4_ext4_inode``
- There's no reason to state the subsystem twice.
+ There is no reason to state the subsystem twice.
``property_entry``
The suite name is ambiguous without the subsystem name.
``kasan_integration_test``
Because there is only one suite in the ``kasan`` subsystem, the suite should
- just be called ``kasan``. There's no need to redundantly add
- ``integration_test``. Should a separate test suite with, for example, unit
- tests be added, then that suite could be named ``kasan_unittest`` or similar.
+ just be called ``kasan``. Do not redundantly add
+ ``integration_test``. Should a separate test suite (for example, one
+ with unit tests) be added later, that suite could be named
+ ``kasan_unittest`` or similar.
Test Cases
----------
Individual tests consist of a single function which tests a constrained
-codepath, property, or function. In the test output, individual tests' results
-will show up as subtests of the suite's results.
+codepath, property, or function. In the test output, an individual test's
+results will show up as subtests of the suite's results.
-Tests should be named after what they're testing. This is often the name of the
+Tests should be named after what they are testing. This is often the name of the
function being tested, with a description of the input or codepath being tested.
As tests are C functions, they should be named and written in accordance with
the kernel coding style.
@@ -136,7 +133,7 @@ the kernel coding style.
.. note::
As tests are themselves functions, their names cannot conflict with
other C identifiers in the kernel. This may require some creative
- naming. It's a good idea to make your test functions `static` to avoid
+ naming. It is a good idea to make your test functions `static` to avoid
polluting the global namespace.
Example test names include:
@@ -162,16 +159,16 @@ This Kconfig entry must:
* be named ``CONFIG_<name>_KUNIT_TEST``: where <name> is the name of the test
suite.
* be listed either alongside the config entries for the driver/subsystem being
- tested, or be under [Kernel Hacking]→[Kernel Testing and Coverage]
-* depend on ``CONFIG_KUNIT``
+ tested, or be under [Kernel Hacking]->[Kernel Testing and Coverage].
+* depend on ``CONFIG_KUNIT``.
* be visible only if ``CONFIG_KUNIT_ALL_TESTS`` is not enabled.
* have a default value of ``CONFIG_KUNIT_ALL_TESTS``.
-* have a brief description of KUnit in the help text
+* have a brief description of KUnit in the help text.
-Unless there's a specific reason not to (e.g. the test is unable to be built as
-a module), Kconfig entries for tests should be tristate.
+Unless we are unable to meet the above conditions (for example, if the test
+cannot be built as a module), Kconfig entries for tests should be tristate.
-An example Kconfig entry:
+For example, a Kconfig entry might look like:
.. code-block:: none
@@ -182,8 +179,8 @@ An example Kconfig entry:
help
This builds unit tests for foo.
- For more information on KUnit and unit tests in general, please refer
- to the KUnit documentation in Documentation/dev-tools/kunit/.
+ For more information on KUnit and unit tests in general,
+ please refer to the KUnit documentation in Documentation/dev-tools/kunit/.
If unsure, say N.
diff --git a/Documentation/dev-tools/kunit/tips.rst b/Documentation/dev-tools/kunit/tips.rst
deleted file mode 100644
index 492d2ded2f5a..000000000000
--- a/Documentation/dev-tools/kunit/tips.rst
+++ /dev/null
@@ -1,190 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-============================
-Tips For Writing KUnit Tests
-============================
-
-Exiting early on failed expectations
-------------------------------------
-
-``KUNIT_EXPECT_EQ`` and friends will mark the test as failed and continue
-execution. In some cases, it's unsafe to continue and you can use the
-``KUNIT_ASSERT`` variant to exit on failure.
-
-.. code-block:: c
-
- void example_test_user_alloc_function(struct kunit *test)
- {
- void *object = alloc_some_object_for_me();
-
- /* Make sure we got a valid pointer back. */
- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, object);
- do_something_with_object(object);
- }
-
-Allocating memory
------------------
-
-Where you would use ``kzalloc``, you should prefer ``kunit_kzalloc`` instead.
-KUnit will ensure the memory is freed once the test completes.
-
-This is particularly useful since it lets you use the ``KUNIT_ASSERT_EQ``
-macros to exit early from a test without having to worry about remembering to
-call ``kfree``.
-
-Example:
-
-.. code-block:: c
-
- void example_test_allocation(struct kunit *test)
- {
- char *buffer = kunit_kzalloc(test, 16, GFP_KERNEL);
- /* Ensure allocation succeeded. */
- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer);
-
- KUNIT_ASSERT_STREQ(test, buffer, "");
- }
-
-
-Testing static functions
-------------------------
-
-If you don't want to expose functions or variables just for testing, one option
-is to conditionally ``#include`` the test file at the end of your .c file, e.g.
-
-.. code-block:: c
-
- /* In my_file.c */
-
- static int do_interesting_thing();
-
- #ifdef CONFIG_MY_KUNIT_TEST
- #include "my_kunit_test.c"
- #endif
-
-Injecting test-only code
-------------------------
-
-Similarly to the above, it can be useful to add test-specific logic.
-
-.. code-block:: c
-
- /* In my_file.h */
-
- #ifdef CONFIG_MY_KUNIT_TEST
- /* Defined in my_kunit_test.c */
- void test_only_hook(void);
- #else
- void test_only_hook(void) { }
- #endif
-
-This test-only code can be made more useful by accessing the current kunit
-test, see below.
-
-Accessing the current test
---------------------------
-
-In some cases, you need to call test-only code from outside the test file, e.g.
-like in the example above or if you're providing a fake implementation of an
-ops struct.
-There is a ``kunit_test`` field in ``task_struct``, so you can access it via
-``current->kunit_test``.
-
-Here's a slightly in-depth example of how one could implement "mocking":
-
-.. code-block:: c
-
- #include <linux/sched.h> /* for current */
-
- struct test_data {
- int foo_result;
- int want_foo_called_with;
- };
-
- static int fake_foo(int arg)
- {
- struct kunit *test = current->kunit_test;
- struct test_data *test_data = test->priv;
-
- KUNIT_EXPECT_EQ(test, test_data->want_foo_called_with, arg);
- return test_data->foo_result;
- }
-
- static void example_simple_test(struct kunit *test)
- {
- /* Assume priv is allocated in the suite's .init */
- struct test_data *test_data = test->priv;
-
- test_data->foo_result = 42;
- test_data->want_foo_called_with = 1;
-
- /* In a real test, we'd probably pass a pointer to fake_foo somewhere
- * like an ops struct, etc. instead of calling it directly. */
- KUNIT_EXPECT_EQ(test, fake_foo(1), 42);
- }
-
-
-Note: here we're able to get away with using ``test->priv``, but if you wanted
-something more flexible you could use a named ``kunit_resource``, see
-Documentation/dev-tools/kunit/api/test.rst.
-
-Failing the current test
-------------------------
-
-But sometimes, you might just want to fail the current test. In that case, we
-have ``kunit_fail_current_test(fmt, args...)`` which is defined in ``<kunit/test-bug.h>`` and
-doesn't require pulling in ``<kunit/test.h>``.
-
-E.g. say we had an option to enable some extra debug checks on some data structure:
-
-.. code-block:: c
-
- #include <kunit/test-bug.h>
-
- #ifdef CONFIG_EXTRA_DEBUG_CHECKS
- static void validate_my_data(struct data *data)
- {
- if (is_valid(data))
- return;
-
- kunit_fail_current_test("data %p is invalid", data);
-
- /* Normal, non-KUnit, error reporting code here. */
- }
- #else
- static void my_debug_function(void) { }
- #endif
-
-
-Customizing error messages
---------------------------
-
-Each of the ``KUNIT_EXPECT`` and ``KUNIT_ASSERT`` macros have a ``_MSG`` variant.
-These take a format string and arguments to provide additional context to the automatically generated error messages.
-
-.. code-block:: c
-
- char some_str[41];
- generate_sha1_hex_string(some_str);
-
- /* Before. Not easy to tell why the test failed. */
- KUNIT_EXPECT_EQ(test, strlen(some_str), 40);
-
- /* After. Now we see the offending string. */
- KUNIT_EXPECT_EQ_MSG(test, strlen(some_str), 40, "some_str='%s'", some_str);
-
-Alternatively, one can take full control over the error message by using ``KUNIT_FAIL()``, e.g.
-
-.. code-block:: c
-
- /* Before */
- KUNIT_EXPECT_EQ(test, some_setup_function(), 0);
-
- /* After: full control over the failure message. */
- if (some_setup_function())
- KUNIT_FAIL(test, "Failed to setup thing for testing");
-
-Next Steps
-==========
-* Optional: see the Documentation/dev-tools/kunit/usage.rst page for a more
- in-depth explanation of KUnit.
diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index 63f1bb89ebf5..9faf2b4153fc 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -1,57 +1,13 @@
.. SPDX-License-Identifier: GPL-2.0
-===========
-Using KUnit
-===========
-
-The purpose of this document is to describe what KUnit is, how it works, how it
-is intended to be used, and all the concepts and terminology that are needed to
-understand it. This guide assumes a working knowledge of the Linux kernel and
-some basic knowledge of testing.
-
-For a high level introduction to KUnit, including setting up KUnit for your
-project, see Documentation/dev-tools/kunit/start.rst.
-
-Organization of this document
-=============================
-
-This document is organized into two main sections: Testing and Common Patterns.
-The first covers what unit tests are and how to use KUnit to write them. The
-second covers common testing patterns, e.g. how to isolate code and make it
-possible to unit test code that was otherwise un-unit-testable.
-
-Testing
-=======
-
-What is KUnit?
---------------
-
-"K" is short for "kernel" so "KUnit" is the "(Linux) Kernel Unit Testing
-Framework." KUnit is intended first and foremost for writing unit tests; it is
-general enough that it can be used to write integration tests; however, this is
-a secondary goal. KUnit has no ambition of being the only testing framework for
-the kernel; for example, it does not intend to be an end-to-end testing
-framework.
-
-What is Unit Testing?
----------------------
-
-A `unit test <https://martinfowler.com/bliki/UnitTest.html>`_ is a test that
-tests code at the smallest possible scope, a *unit* of code. In the C
-programming language that's a function.
-
-Unit tests should be written for all the publicly exposed functions in a
-compilation unit; so that is all the functions that are exported in either a
-*class* (defined below) or all functions which are **not** static.
-
Writing Tests
--------------
+=============
Test Cases
-~~~~~~~~~~
+----------
The fundamental unit in KUnit is the test case. A test case is a function with
-the signature ``void (*)(struct kunit *test)``. It calls a function to be tested
+the signature ``void (*)(struct kunit *test)``. It calls the function under test
and then sets *expectations* for what should happen. For example:
.. code-block:: c
@@ -65,18 +21,19 @@ and then sets *expectations* for what should happen. For example:
KUNIT_FAIL(test, "This test never passes.");
}
-In the above example ``example_test_success`` always passes because it does
-nothing; no expectations are set, so all expectations pass. On the other hand
-``example_test_failure`` always fails because it calls ``KUNIT_FAIL``, which is
-a special expectation that logs a message and causes the test case to fail.
+In the above example, ``example_test_success`` always passes because it does
+nothing; no expectations are set, and therefore all expectations pass. On the
+other hand ``example_test_failure`` always fails because it calls ``KUNIT_FAIL``,
+which is a special expectation that logs a message and causes the test case to
+fail.
Expectations
~~~~~~~~~~~~
-An *expectation* is a way to specify that you expect a piece of code to do
-something in a test. An expectation is called like a function. A test is made
-by setting expectations about the behavior of a piece of code under test; when
-one or more of the expectations fail, the test case fails and information about
-the failure is logged. For example:
+An *expectation* specifies that we expect a piece of code to do something in a
+test. An expectation is called like a function. A test is made by setting
+expectations about the behavior of a piece of code under test. When one or more
+expectations fail, the test case fails and information about the failure is
+logged. For example:
.. code-block:: c
@@ -86,29 +43,28 @@ the failure is logged. For example:
KUNIT_EXPECT_EQ(test, 2, add(1, 1));
}
-In the above example ``add_test_basic`` makes a number of assertions about the
-behavior of a function called ``add``; the first parameter is always of type
-``struct kunit *``, which contains information about the current test context;
-the second parameter, in this case, is what the value is expected to be; the
+In the above example, ``add_test_basic`` makes a number of assertions about the
+behavior of a function called ``add``. The first parameter is always of type
+``struct kunit *``, which contains information about the current test context.
+The second parameter, in this case, is what the value is expected to be. The
last value is what the value actually is. If ``add`` passes all of these
expectations, the test case, ``add_test_basic`` will pass; if any one of these
expectations fails, the test case will fail.
-It is important to understand that a test case *fails* when any expectation is
-violated; however, the test will continue running, potentially trying other
-expectations until the test case ends or is otherwise terminated. This is as
-opposed to *assertions* which are discussed later.
+A test case *fails* when any expectation is violated; however, the test will
+continue to run, and try other expectations until the test case ends or is
+otherwise terminated. This is as opposed to *assertions* which are discussed
+later.
-To learn about more expectations supported by KUnit, see
-Documentation/dev-tools/kunit/api/test.rst.
+To learn more about KUnit expectations, see Documentation/dev-tools/kunit/api/test.rst.
.. note::
- A single test case should be pretty short, pretty easy to understand,
- focused on a single behavior.
+ A single test case should be short, easy to understand, and focused on a
+ single behavior.
-For example, if we wanted to properly test the add function above, we would
-create additional tests cases which would each test a different property that an
-add function should have like this:
+For example, if we want to rigorously test the ``add`` function above, create
+additional test cases which would test each property that an ``add`` function
+should have, as shown below:
.. code-block:: c
@@ -134,56 +90,77 @@ add function should have like this:
KUNIT_EXPECT_EQ(test, INT_MIN, add(INT_MAX, 1));
}
-Notice how it is immediately obvious what all the properties that we are testing
-for are.
-
Assertions
~~~~~~~~~~
-KUnit also has the concept of an *assertion*. An assertion is just like an
-expectation except the assertion immediately terminates the test case if it is
-not satisfied.
-
-For example:
+An assertion is like an expectation, except that the assertion immediately
+terminates the test case if the condition is not satisfied. For example:
.. code-block:: c
- static void mock_test_do_expect_default_return(struct kunit *test)
+ static void test_sort(struct kunit *test)
{
- struct mock_test_context *ctx = test->priv;
- struct mock *mock = ctx->mock;
- int param0 = 5, param1 = -5;
- const char *two_param_types[] = {"int", "int"};
- const void *two_params[] = {&param0, &param1};
- const void *ret;
-
- ret = mock->do_expect(mock,
- "test_printk", test_printk,
- two_param_types, two_params,
- ARRAY_SIZE(two_params));
- KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ret);
- KUNIT_EXPECT_EQ(test, -4, *((int *) ret));
+ int *a, i, r = 1;
+ a = kunit_kmalloc_array(test, TEST_LEN, sizeof(*a), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, a);
+ for (i = 0; i < TEST_LEN; i++) {
+ r = (r * 725861) % 6599;
+ a[i] = r;
+ }
+ sort(a, TEST_LEN, sizeof(*a), cmpint, NULL);
+ for (i = 0; i < TEST_LEN-1; i++)
+ KUNIT_EXPECT_LE(test, a[i], a[i + 1]);
}
-In this example, the method under test should return a pointer to a value, so
-if the pointer returned by the method is null or an errno, we don't want to
-bother continuing the test since the following expectation could crash the test
-case. `ASSERT_NOT_ERR_OR_NULL(...)` allows us to bail out of the test case if
-the appropriate conditions have not been satisfied to complete the test.
+In this example, we need to be able to allocate an array to test the ``sort()``
+function. So we use ``KUNIT_ASSERT_NOT_ERR_OR_NULL()`` to abort the test if
+there is an allocation error.
+
+.. note::
+ In other test frameworks, ``ASSERT`` macros are often implemented by calling
+ ``return`` so they only work from the test function. In KUnit, we stop the
+ current kthread on failure, so you can call them from anywhere.
+
+Customizing error messages
+--------------------------
+
+Each of the ``KUNIT_EXPECT`` and ``KUNIT_ASSERT`` macros have a ``_MSG``
+variant. These take a format string and arguments to provide additional
+context to the automatically generated error messages.
+
+.. code-block:: c
+
+ char some_str[41];
+ generate_sha1_hex_string(some_str);
+
+ /* Before. Not easy to tell why the test failed. */
+ KUNIT_EXPECT_EQ(test, strlen(some_str), 40);
+
+ /* After. Now we see the offending string. */
+ KUNIT_EXPECT_EQ_MSG(test, strlen(some_str), 40, "some_str='%s'", some_str);
+
+Alternatively, one can take full control over the error message by using
+``KUNIT_FAIL()``, for example:
+
+.. code-block:: c
+
+ /* Before */
+ KUNIT_EXPECT_EQ(test, some_setup_function(), 0);
+
+ /* After: full control over the failure message. */
+ if (some_setup_function())
+ KUNIT_FAIL(test, "Failed to setup thing for testing");
+
Test Suites
~~~~~~~~~~~
-Now obviously one unit test isn't very helpful; the power comes from having
-many test cases covering all of a unit's behaviors. Consequently it is common
-to have many *similar* tests; in order to reduce duplication in these closely
-related tests most unit testing frameworks - including KUnit - provide the
-concept of a *test suite*. A *test suite* is just a collection of test cases
-for a unit of code with a set up function that gets invoked before every test
-case and then a tear down function that gets invoked after every test case
-completes.
-
-Example:
+We need many test cases covering all the unit's behaviors. It is common to have
+many similar tests. In order to reduce duplication in these closely related
+tests, most unit testing frameworks (including KUnit) provide the concept of a
+*test suite*. A test suite is a collection of test cases for a unit of code
+with optional setup and teardown functions that run before/after the whole
+suite and/or every test case. For example:
.. code-block:: c
@@ -198,27 +175,57 @@ Example:
.name = "example",
.init = example_test_init,
.exit = example_test_exit,
+ .suite_init = example_suite_init,
+ .suite_exit = example_suite_exit,
.test_cases = example_test_cases,
};
kunit_test_suite(example_test_suite);
-In the above example the test suite, ``example_test_suite``, would run the test
-cases ``example_test_foo``, ``example_test_bar``, and ``example_test_baz``;
-each would have ``example_test_init`` called immediately before it and would
-have ``example_test_exit`` called immediately after it.
-``kunit_test_suite(example_test_suite)`` registers the test suite with the
-KUnit test framework.
+In the above example, the test suite ``example_test_suite`` would first run
+``example_suite_init``, then run the test cases ``example_test_foo``,
+``example_test_bar``, and ``example_test_baz``. Each would have
+``example_test_init`` called immediately before it and ``example_test_exit``
+called immediately after it. Finally, ``example_suite_exit`` would be called
+after everything else. ``kunit_test_suite(example_test_suite)`` registers the
+test suite with the KUnit test framework.
.. note::
- A test case will only be run if it is associated with a test suite.
+ A test case will only run if it is associated with a test suite.
+
+``kunit_test_suite(...)`` is a macro which tells the linker to put the
+specified test suite in a special linker section so that it can be run by KUnit
+either after ``late_init``, or when the test module is loaded (if the test was
+built as a module).
+
+For more information, see Documentation/dev-tools/kunit/api/test.rst.
+
+.. _kunit-on-non-uml:
+
+Writing Tests For Other Architectures
+-------------------------------------
+
+It is better to write tests that run on UML than tests that only run under a
+particular architecture. It is better to write tests that run under QEMU or
+another easy-to-obtain (and monetarily free) software environment than tests
+that require a specific piece of hardware.
-``kunit_test_suite(...)`` is a macro which tells the linker to put the specified
-test suite in a special linker section so that it can be run by KUnit either
-after late_init, or when the test module is loaded (depending on whether the
-test was built in or not).
+Nevertheless, there are still valid reasons to write a test that is architecture
+or hardware specific. For example, we might want to test code that really
+belongs in ``arch/some-arch/*``. Even so, try to write the test so that it does
+not depend on physical hardware. Some of our test cases may not need
+hardware; often only a few tests actually require it. When hardware is not
+available, instead of disabling tests, we can skip them.
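+
+For example, a test case might skip itself when the hardware is absent.
+Below is a minimal sketch; ``hardware_is_present()`` is a hypothetical
+helper standing in for whatever detection the driver provides:
+
+.. code-block:: c
+
+	static void my_hardware_test(struct kunit *test)
+	{
+		/* Skip (rather than fail) when the hardware is missing. */
+		if (!hardware_is_present())
+			kunit_skip(test, "hardware not available");
+
+		/* Exercise the hardware-specific code here. */
+	}
+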
-For more information on these types of things see the
-Documentation/dev-tools/kunit/api/test.rst.
+Now that we have narrowed down exactly what bits are hardware specific, the
+actual procedure for writing and running the tests is the same as for normal
+KUnit tests.
+
+.. important::
+ We may have to reset hardware state. If this is not possible, we may only
+ be able to run one test case per invocation.
+
+.. TODO(brendanhiggins@google.com): Add an actual example of an architecture-
+ dependent KUnit test.
Common Patterns
===============
@@ -226,43 +233,39 @@ Common Patterns
Isolating Behavior
------------------
-The most important aspect of unit testing that other forms of testing do not
-provide is the ability to limit the amount of code under test to a single unit.
-In practice, this is only possible by being able to control what code gets run
-when the unit under test calls a function and this is usually accomplished
-through some sort of indirection where a function is exposed as part of an API
-such that the definition of that function can be changed without affecting the
-rest of the code base. In the kernel this primarily comes from two constructs,
-classes, structs that contain function pointers that are provided by the
-implementer, and architecture-specific functions which have definitions selected
-at compile time.
+Unit testing limits the amount of code under test to a single unit. It controls
+what code gets run when the unit under test calls a function. This is done
+through indirection: a function is exposed as part of an API so that its
+definition can be changed without affecting the rest of the code base. In the
+kernel, this comes from two constructs: classes, which are structs that contain
+function pointers provided by the implementer, and architecture-specific
+functions, which have definitions selected at compile time.
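+
+For illustration (all names here are hypothetical), a definition can be
+selected at compile time like so:
+
+.. code-block:: c
+
+	/* One of these definitions is chosen when the kernel is configured. */
+	#ifdef CONFIG_MY_FAST_PATH
+	static void do_flush(void) { /* optimized implementation */ }
+	#else
+	static void do_flush(void) { /* generic implementation */ }
+	#endif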
Classes
~~~~~~~
Classes are not a construct that is built into the C programming language;
-however, it is an easily derived concept. Accordingly, pretty much every project
-that does not use a standardized object oriented library (like GNOME's GObject)
-has their own slightly different way of doing object oriented programming; the
-Linux kernel is no exception.
+however, it is an easily derived concept. Accordingly, in most cases, every
+project that does not use a standardized object oriented library (like GNOME's
+GObject) has its own slightly different way of doing object oriented
+programming; the Linux kernel is no exception.
The central concept in kernel object oriented programming is the class. In the
kernel, a *class* is a struct that contains function pointers. This creates a
contract between *implementers* and *users* since it forces them to use the
-same function signature without having to call the function directly. In order
-for it to truly be a class, the function pointers must specify that a pointer
-to the class, known as a *class handle*, be one of the parameters; this makes
-it possible for the member functions (also known as *methods*) to have access
-to member variables (more commonly known as *fields*) allowing the same
-implementation to have multiple *instances*.
-
-Typically a class can be *overridden* by *child classes* by embedding the
-*parent class* in the child class. Then when a method provided by the child
-class is called, the child implementation knows that the pointer passed to it is
-of a parent contained within the child; because of this, the child can compute
-the pointer to itself because the pointer to the parent is always a fixed offset
-from the pointer to the child; this offset is the offset of the parent contained
-in the child struct. For example:
+same function signature without having to call the function directly. To be a
+class, the function pointers must specify that a pointer to the class, known as
+a *class handle*, be one of the parameters. Thus the member functions (also
+known as *methods*) have access to member variables (also known as *fields*)
+allowing the same implementation to have multiple *instances*.
+
+A class can be *overridden* by *child classes* by embedding the *parent class*
+in the child class. Then when the child class *method* is called, the child
+implementation knows that the pointer passed to it is of a parent contained
+within the child. Thus, the child can compute the pointer to itself because the
+pointer to the parent is always a fixed offset from the pointer to the child.
+This offset is the offset of the parent contained in the child struct. For
+example:
.. code-block:: c
@@ -278,7 +281,7 @@ in the child struct. For example:
int rectangle_area(struct shape *this)
{
- struct rectangle *self = container_of(this, struct shape, parent);
+ struct rectangle *self = container_of(this, struct rectangle, parent);
return self->length * self->width;
};
@@ -290,8 +293,8 @@ in the child struct. For example:
self->width = width;
}
-In this example (as in most kernel code) the operation of computing the pointer
-to the child from the pointer to the parent is done by ``container_of``.
+In this example, computing the pointer to the child from the pointer to the
+parent is done by ``container_of``.
Faking Classes
~~~~~~~~~~~~~~
@@ -300,14 +303,11 @@ In order to unit test a piece of code that calls a method in a class, the
behavior of the method must be controllable, otherwise the test ceases to be a
unit test and becomes an integration test.
-A fake just provides an implementation of a piece of code that is different than
-what runs in a production instance, but behaves identically from the standpoint
-of the callers; this is usually done to replace a dependency that is hard to
-deal with, or is slow.
-
-A good example for this might be implementing a fake EEPROM that just stores the
-"contents" in an internal buffer. For example, let's assume we have a class that
-represents an EEPROM:
+A fake class implements a piece of code that is different from what runs in a
+production instance, but behaves identically from the standpoint of the callers.
+This is done to replace a dependency that is hard to deal with, or is slow. For
+example, implementing a fake EEPROM that stores the "contents" in an
+internal buffer. Assume we have a class that represents an EEPROM:
.. code-block:: c
@@ -316,7 +316,7 @@ represents an EEPROM:
ssize_t (*write)(struct eeprom *this, size_t offset, const char *buffer, size_t count);
};
-And we want to test some code that buffers writes to the EEPROM:
+And we want to test code that buffers writes to the EEPROM:
.. code-block:: c
@@ -329,7 +329,7 @@ And we want to test some code that buffers writes to the EEPROM:
struct eeprom_buffer *new_eeprom_buffer(struct eeprom *eeprom);
void destroy_eeprom_buffer(struct eeprom *eeprom);
-We can easily test this code by *faking out* the underlying EEPROM:
+We can test this code by *faking out* the underlying EEPROM:
.. code-block:: c
@@ -456,14 +456,14 @@ We can now use it to test ``struct eeprom_buffer``:
destroy_eeprom_buffer(ctx->eeprom_buffer);
}
-Testing against multiple inputs
+Testing Against Multiple Inputs
-------------------------------
-Testing just a few inputs might not be enough to have confidence that the code
-works correctly, e.g. for a hash function.
+Testing just a few inputs is not enough to ensure that the code works correctly;
+consider, for example, testing a hash function.
-In such cases, it can be helpful to have a helper macro or function, e.g. this
-fictitious example for ``sha1sum(1)``
+In such cases, we can write a helper macro or function that is called for each
+input. For example, to test ``sha1sum(1)``, we can write:
.. code-block:: c
@@ -475,16 +475,15 @@ fictitious example for ``sha1sum(1)``
TEST_SHA1("hello world", "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed");
TEST_SHA1("hello world!", "430ce34d020724ed75a196dfc2ad67c77772d169");
+Note the use of the ``_MSG`` version of ``KUNIT_EXPECT_STREQ`` to print a more
+detailed error and make the assertions clearer within the helper macros.
-Note the use of ``KUNIT_EXPECT_STREQ_MSG`` to give more context when it fails
-and make it easier to track down. (Yes, in this example, ``want`` is likely
-going to be unique enough on its own).
-
-The ``_MSG`` variants are even more useful when the same expectation is called
-multiple times (in a loop or helper function) and thus the line number isn't
-enough to identify what failed, like below.
+The ``_MSG`` variants are useful when the same expectation is called multiple
+times (in a loop or helper function) and thus the line number is not enough to
+identify what failed, as shown below.
-In some cases, it can be helpful to write a *table-driven test* instead, e.g.
+In complicated cases, we recommend using a *table-driven test* instead of the
+helper macro variation, for example:
.. code-block:: c
@@ -513,17 +512,18 @@ In some cases, it can be helpful to write a *table-driven test* instead, e.g.
}
-There's more boilerplate involved, but it can:
+There is more boilerplate code involved, but it can:
+
+* be more readable when there are multiple inputs/outputs (due to field names).
-* be more readable when there are multiple inputs/outputs thanks to field names,
+ * For example, see ``fs/ext4/inode-test.c``.
- * E.g. see ``fs/ext4/inode-test.c`` for an example of both.
-* reduce duplication if test cases can be shared across multiple tests.
+* reduce duplication if test cases are shared across multiple tests.
- * E.g. if we wanted to also test ``sha256sum``, we could add a ``sha256``
+ * For example, if we want to test ``sha256sum``, we could add a ``sha256``
field and reuse ``cases``.
-* be converted to a "parameterized test", see below.
+* be converted to a "parameterized test".
Parameterized Testing
~~~~~~~~~~~~~~~~~~~~~
@@ -531,7 +531,7 @@ Parameterized Testing
The table-driven testing pattern is common enough that KUnit has special
support for it.
-Reusing the same ``cases`` array from above, we can write the test as a
+By reusing the same ``cases`` array from above, we can write the test as a
"parameterized test" with the following.
.. code-block:: c
@@ -541,7 +541,7 @@ Reusing the same ``cases`` array from above, we can write the test as a
const char *str;
const char *sha1;
};
- struct sha1_test_case cases[] = {
+ const struct sha1_test_case cases[] = {
{
.str = "hello world",
.sha1 = "2aae6c35c94fcfb415dbe95f408b9ce91ee846ed",
@@ -580,195 +580,151 @@ Reusing the same ``cases`` array from above, we can write the test as a
{}
};
-.. _kunit-on-non-uml:
+Allocating Memory
+-----------------
-KUnit on non-UML architectures
-==============================
+Where you might use ``kzalloc``, you can instead use ``kunit_kzalloc`` as KUnit
+will then ensure that the memory is freed once the test completes.
-By default KUnit uses UML as a way to provide dependencies for code under test.
-Under most circumstances KUnit's usage of UML should be treated as an
-implementation detail of how KUnit works under the hood. Nevertheless, there
-are instances where being able to run architecture-specific code or test
-against real hardware is desirable. For these reasons KUnit supports running on
-other architectures.
-
-Running existing KUnit tests on non-UML architectures
------------------------------------------------------
-
-There are some special considerations when running existing KUnit tests on
-non-UML architectures:
-
-* Hardware may not be deterministic, so a test that always passes or fails
- when run under UML may not always do so on real hardware.
-* Hardware and VM environments may not be hermetic. KUnit tries its best to
- provide a hermetic environment to run tests; however, it cannot manage state
- that it doesn't know about outside of the kernel. Consequently, tests that
- may be hermetic on UML may not be hermetic on other architectures.
-* Some features and tooling may not be supported outside of UML.
-* Hardware and VMs are slower than UML.
-
-None of these are reasons not to run your KUnit tests on real hardware; they are
-only things to be aware of when doing so.
-
-Currently, the KUnit Wrapper (``tools/testing/kunit/kunit.py``) (aka
-kunit_tool) only fully supports running tests inside of UML and QEMU; however,
-this is only due to our own time limitations as humans working on KUnit. It is
-entirely possible to support other emulators and even actual hardware, but for
-now QEMU and UML is what is fully supported within the KUnit Wrapper. Again, to
-be clear, this is just the Wrapper. The actualy KUnit tests and the KUnit
-library they are written in is fully architecture agnostic and can be used in
-virtually any setup, you just won't have the benefit of typing a single command
-out of the box and having everything magically work perfectly.
-
-Again, all core KUnit framework features are fully supported on all
-architectures, and using them is straightforward: Most popular architectures
-are supported directly in the KUnit Wrapper via QEMU. Currently, supported
-architectures on QEMU include:
-
-* i386
-* x86_64
-* arm
-* arm64
-* alpha
-* powerpc
-* riscv
-* s390
-* sparc
-
-In order to run KUnit tests on one of these architectures via QEMU with the
-KUnit wrapper, all you need to do is specify the flags ``--arch`` and
-``--cross_compile`` when invoking the KUnit Wrapper. For example, we could run
-the default KUnit tests on ARM in the following manner (assuming we have an ARM
-toolchain installed):
+This is useful because it lets us use the ``KUNIT_ASSERT_EQ`` macros to exit
+early from a test without having to remember to call ``kfree``.
+For example:
-.. code-block:: bash
+.. code-block:: c
- tools/testing/kunit/kunit.py run --timeout=60 --jobs=12 --arch=arm --cross_compile=arm-linux-gnueabihf-
+ void example_test_allocation(struct kunit *test)
+ {
+ char *buffer = kunit_kzalloc(test, 16, GFP_KERNEL);
+ /* Ensure allocation succeeded. */
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer);
-Alternatively, if you want to run your tests on real hardware or in some other
-emulation environment, all you need to do is to take your kunitconfig, your
-Kconfig options for the tests you would like to run, and merge them into
-whatever config your are using for your platform. That's it!
+ KUNIT_ASSERT_STREQ(test, buffer, "");
+ }
-For example, let's say you have the following kunitconfig:
-.. code-block:: none
+Testing Static Functions
+------------------------
- CONFIG_KUNIT=y
- CONFIG_KUNIT_EXAMPLE_TEST=y
+If we do not want to expose functions or variables for testing, one option is
+to conditionally ``#include`` the test file at the end of the .c file. For
+example:
-If you wanted to run this test on an x86 VM, you might add the following config
-options to your ``.config``:
+.. code-block:: c
-.. code-block:: none
+ /* In my_file.c */
- CONFIG_KUNIT=y
- CONFIG_KUNIT_EXAMPLE_TEST=y
- CONFIG_SERIAL_8250=y
- CONFIG_SERIAL_8250_CONSOLE=y
+	static int do_interesting_thing(void);
-All these new options do is enable support for a common serial console needed
-for logging.
+ #ifdef CONFIG_MY_KUNIT_TEST
+ #include "my_kunit_test.c"
+ #endif
-Next, you could build a kernel with these tests as follows:
+Injecting Test-Only Code
+------------------------
+Similarly, we can conditionally add test-specific logic. For example:
-.. code-block:: bash
+.. code-block:: c
- make ARCH=x86 olddefconfig
- make ARCH=x86
+ /* In my_file.h */
-Once you have built a kernel, you could run it on QEMU as follows:
+ #ifdef CONFIG_MY_KUNIT_TEST
+ /* Defined in my_kunit_test.c */
+ void test_only_hook(void);
+ #else
+	static inline void test_only_hook(void) { }
+ #endif
-.. code-block:: bash
+This test-only code can be made more useful by accessing the current
+``kunit_test``, as shown in the next section: *Accessing The Current Test*.
- qemu-system-x86_64 -enable-kvm \
- -m 1024 \
- -kernel arch/x86_64/boot/bzImage \
- -append 'console=ttyS0' \
- --nographic
+Accessing The Current Test
+--------------------------
-Interspersed in the kernel logs you might see the following:
+In some cases, we need to call test-only code from outside the test file. This
+is helpful, for example, when providing a fake implementation of a function, or
+when failing the current test from within an error handler.
+We can do this via the ``kunit_test`` field in ``task_struct``, which we can
+access using the ``kunit_get_current_test()`` function in ``kunit/test-bug.h``.
-.. code-block:: none
+``kunit_get_current_test()`` is safe to call even if KUnit is not enabled. If
+KUnit is not enabled, or if no test is running in the current task, it will
+return ``NULL``. This compiles down to either a no-op or a static key check,
+so will have a negligible performance impact when no test is running.
- TAP version 14
- # Subtest: example
- 1..1
- # example_simple_test: initializing
- ok 1 - example_simple_test
- ok 1 - example
+The example below uses this to provide a "mock" implementation of a function,
+``foo``:
-Congratulations, you just ran a KUnit test on the x86 architecture!
+.. code-block:: c
-In a similar manner, kunit and kunit tests can also be built as modules,
-so if you wanted to run tests in this way you might add the following config
-options to your ``.config``:
+ #include <kunit/test-bug.h> /* for kunit_get_current_test */
-.. code-block:: none
+ struct test_data {
+ int foo_result;
+ int want_foo_called_with;
+ };
- CONFIG_KUNIT=m
- CONFIG_KUNIT_EXAMPLE_TEST=m
+ static int fake_foo(int arg)
+ {
+ struct kunit *test = kunit_get_current_test();
+ struct test_data *test_data = test->priv;
-Once the kernel is built and installed, a simple
+ KUNIT_EXPECT_EQ(test, test_data->want_foo_called_with, arg);
+ return test_data->foo_result;
+ }
-.. code-block:: bash
+ static void example_simple_test(struct kunit *test)
+ {
+ /* Assume priv (private, a member used to pass test data from
+ * the init function) is allocated in the suite's .init */
+ struct test_data *test_data = test->priv;
- modprobe example-test
+ test_data->foo_result = 42;
+ test_data->want_foo_called_with = 1;
-...will run the tests.
+ /* In a real test, we'd probably pass a pointer to fake_foo somewhere
+ * like an ops struct, etc. instead of calling it directly. */
+ KUNIT_EXPECT_EQ(test, fake_foo(1), 42);
+ }
-.. note::
- Note that you should make sure your test depends on ``KUNIT=y`` in Kconfig
- if the test does not support module build. Otherwise, it will trigger
- compile errors if ``CONFIG_KUNIT`` is ``m``.
+In this example, we are using the ``priv`` member of ``struct kunit`` as a way
+of passing data to the test from the init function. In general, ``priv`` is a
+pointer that can be used for any user data. This is preferred over static
+variables, as it avoids concurrency issues.
-Writing new tests for other architectures
------------------------------------------
+Had we wanted something more flexible, we could have used a named
+``kunit_resource``. Each test can have multiple resources which have string
+names, providing the same flexibility as a ``priv`` member, but also, for
+example, allowing helper functions to create resources without conflicting
+with each other. It is also possible to define a cleanup function for each
+resource, making it easy to avoid resource leaks. For more information, see
+Documentation/dev-tools/kunit/api/resource.rst.
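+
+A minimal sketch of using a named resource, assuming the API declared in
+``include/kunit/resource.h`` (the name ``"my-data"`` and the value are purely
+illustrative):
+
+.. code-block:: c
+
+	static struct kunit_resource res;
+	static int value = 42;
+
+	static void example_named_resource_test(struct kunit *test)
+	{
+		struct kunit_resource *found;
+
+		/* Register the data under a string name; no init/free callbacks. */
+		kunit_add_named_resource(test, NULL, NULL, &res, "my-data", &value);
+
+		/* A helper elsewhere can look the resource up by its name. */
+		found = kunit_find_named_resource(test, "my-data");
+		KUNIT_ASSERT_NOT_ERR_OR_NULL(test, found);
+		KUNIT_EXPECT_EQ(test, *(int *)found->data, 42);
+
+		/* The lookup takes a reference which must be dropped. */
+		kunit_put_resource(found);
+	}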
-The first thing you must do is ask yourself whether it is necessary to write a
-KUnit test for a specific architecture, and then whether it is necessary to
-write that test for a particular piece of hardware. In general, writing a test
-that depends on having access to a particular piece of hardware or software (not
-included in the Linux source repo) should be avoided at all costs.
+Failing The Current Test
+------------------------
-Even if you only ever plan on running your KUnit test on your hardware
-configuration, other people may want to run your tests and may not have access
-to your hardware. If you write your test to run on UML, then anyone can run your
-tests without knowing anything about your particular setup, and you can still
-run your tests on your hardware setup just by compiling for your architecture.
+If we want to fail the current test, we can use ``kunit_fail_current_test(fmt, args...)``
+which is defined in ``<kunit/test-bug.h>`` and does not require pulling in ``<kunit/test.h>``.
+For example, if we have an option that enables extra debug checks on some data
+structures, we can fail the current test from those checks as shown below:
-.. important::
- Always prefer tests that run on UML to tests that only run under a particular
- architecture, and always prefer tests that run under QEMU or another easy
- (and monetarily free) to obtain software environment to a specific piece of
- hardware.
-
-Nevertheless, there are still valid reasons to write an architecture or hardware
-specific test: for example, you might want to test some code that really belongs
-in ``arch/some-arch/*``. Even so, try your best to write the test so that it
-does not depend on physical hardware: if some of your test cases don't need the
-hardware, only require the hardware for tests that actually need it.
-
-Now that you have narrowed down exactly what bits are hardware specific, the
-actual procedure for writing and running the tests is pretty much the same as
-writing normal KUnit tests. One special caveat is that you have to reset
-hardware state in between test cases; if this is not possible, you may only be
-able to run one test case per invocation.
+.. code-block:: c
-.. TODO(brendanhiggins@google.com): Add an actual example of an architecture-
- dependent KUnit test.
+ #include <kunit/test-bug.h>
-KUnit debugfs representation
-============================
-When kunit test suites are initialized, they create an associated directory
-in ``/sys/kernel/debug/kunit/<test-suite>``. The directory contains one file
+ #ifdef CONFIG_EXTRA_DEBUG_CHECKS
+ static void validate_my_data(struct data *data)
+ {
+ if (is_valid(data))
+ return;
-- results: "cat results" displays results of each test case and the results
- of the entire suite for the last test run.
+ kunit_fail_current_test("data %p is invalid", data);
-The debugfs representation is primarily of use when kunit test suites are
-run in a native environment, either as modules or builtin. Having a way
-to display results like this is valuable as otherwise results can be
-intermixed with other events in dmesg output. The maximum size of each
-results file is KUNIT_LOG_SIZE bytes (defined in ``include/kunit/test.h``).
+ /* Normal, non-KUnit, error reporting code here. */
+ }
+ #else
+	static void validate_my_data(struct data *data) { }
+ #endif
+
+``kunit_fail_current_test()`` is safe to call even if KUnit is not enabled. If
+KUnit is not enabled, or if no test is running in the current task, it will do
+nothing. This compiles down to either a no-op or a static key check, so will
+have a negligible performance impact when no test is running.
diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst
index 02102be7ff49..dc791c8d84d1 100644
--- a/Documentation/dev-tools/sparse.rst
+++ b/Documentation/dev-tools/sparse.rst
@@ -100,3 +100,5 @@ have already built it.
The optional make variable CF can be used to pass arguments to sparse. The
build system passes -Wbitwise to sparse automatically.
+
+Note that sparse defines the ``__CHECKER__`` preprocessor symbol.
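+
+For instance, code can use this symbol to provide annotations that only sparse
+sees. A minimal sketch (``__my_annotation`` is hypothetical, not a kernel API):
+
+.. code-block:: c
+
+	#ifdef __CHECKER__
+	#define __my_annotation __attribute__((noderef))
+	#else
+	#define __my_annotation
+	#endif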
diff --git a/Documentation/dev-tools/testing-overview.rst b/Documentation/dev-tools/testing-overview.rst
index 65feb81edb14..0aaf6ea53608 100644
--- a/Documentation/dev-tools/testing-overview.rst
+++ b/Documentation/dev-tools/testing-overview.rst
@@ -115,3 +115,66 @@ that none of these errors are occurring during the test.
Some of these tools integrate with KUnit or kselftest and will
automatically fail tests if an issue is detected.
+Static Analysis Tools
+=====================
+
+In addition to testing a running kernel, one can also analyze kernel source code
+directly (**at compile time**) using **static analysis** tools. The tools
+commonly used in the kernel allow one to inspect the whole source tree or just
+specific files within it. They make it easier to detect and fix problems during
+the development process.
+
+Sparse can help test the kernel by performing type-checking, lock checking,
+and value range checking, in addition to reporting various errors and warnings
+while examining the code. See the Documentation/dev-tools/sparse.rst
+documentation page for details on how to use it.
+
+Smatch extends Sparse and provides additional checks for programming logic
+mistakes such as missing breaks in switch statements, unused return values on
+error checking, forgetting to set an error code in the return of an error path,
+etc. Smatch also has tests against more serious issues such as integer
+overflows, null pointer dereferences, and memory leaks. See the project page at
+http://smatch.sourceforge.net/.
+
+Coccinelle is another static analyzer at our disposal. Coccinelle is often used
+to aid refactoring and collateral evolution of source code, but it can also help
+to avoid certain bugs that occur in common code patterns. The types of tests
+available include API tests, tests for correct usage of kernel iterators, checks
+for the soundness of free operations, analysis of locking behavior, and other
+tests that help keep kernel usage consistent. See the
+Documentation/dev-tools/coccinelle.rst documentation page for details.
+
+Beware, though, that static analysis tools suffer from **false positives**.
+Errors and warnings need to be evaluated carefully before attempting to fix
+them.
+
+When to use Sparse and Smatch
+-----------------------------
+
+Sparse does type checking, such as verifying that annotated variables do not
+cause endianness bugs, detecting places that use ``__user`` pointers improperly,
+and analyzing the compatibility of symbol initializers.
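+
+As a small sketch (not an in-tree example), sparse flags a direct dereference
+of a ``__user`` pointer:
+
+.. code-block:: c
+
+	int bad_read(int __user *p)
+	{
+		return *p;	/* sparse: dereference of noderef expression */
+	}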
+
+Smatch does flow analysis and, if allowed to build the function database, it
+also does cross function analysis. Smatch tries to answer questions such as:
+where is this buffer allocated? How big is it? Can this index be controlled by
+the user? Is this variable larger than that one?
+
+It's generally easier to write checks in Smatch than it is to write checks in
+Sparse. Nevertheless, there are some overlaps between Sparse and Smatch checks.
+
+Strong points of Smatch and Coccinelle
+--------------------------------------
+
+Coccinelle is probably the easiest tool to write checks for. It works before
+the pre-processor, so it is easier to check for bugs in macros with Coccinelle.
+It also creates patches for you, which no other tool does.
+
+For example, with Coccinelle you can do a mass conversion from
+``kmalloc(x * size, GFP_KERNEL)`` to ``kmalloc_array(x, size, GFP_KERNEL)``, and
+that's really useful. If you just created a Smatch warning and tried to push
+the work of converting onto the maintainers, they would be annoyed. You would
+have to argue about whether each warning can really overflow or not.
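+
+For illustration, the conversion in question looks like this in C:
+
+.. code-block:: c
+
+	/* Before: the multiplication x * size can overflow unnoticed. */
+	buf = kmalloc(x * size, GFP_KERNEL);
+
+	/* After: kmalloc_array() returns NULL on multiplication overflow. */
+	buf = kmalloc_array(x, size, GFP_KERNEL);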
+
+Coccinelle does no analysis of variable values, which is the strong point of
+Smatch. On the other hand, Coccinelle allows you to do simple things in a simple
+way.
diff --git a/Documentation/devicetree/bindings/.gitignore b/Documentation/devicetree/bindings/.gitignore
index a77719968a7e..51ddb26d93f0 100644
--- a/Documentation/devicetree/bindings/.gitignore
+++ b/Documentation/devicetree/bindings/.gitignore
@@ -2,3 +2,8 @@
*.example.dts
/processed-schema*.yaml
/processed-schema*.json
+
+#
+# We don't want to ignore the following even if they are dot-files
+#
+!.yamllint
diff --git a/Documentation/devicetree/bindings/.yamllint b/Documentation/devicetree/bindings/.yamllint
index 214abd3ec440..4abe9f0a1d46 100644
--- a/Documentation/devicetree/bindings/.yamllint
+++ b/Documentation/devicetree/bindings/.yamllint
@@ -19,7 +19,7 @@ rules:
colons: {max-spaces-before: 0, max-spaces-after: 1}
commas: {min-spaces-after: 1, max-spaces-after: 1}
comments:
- require-starting-space: false
+ require-starting-space: true
min-spaces-from-content: 1
comments-indentation: disable
document-start:
diff --git a/Documentation/devicetree/bindings/Makefile b/Documentation/devicetree/bindings/Makefile
index a072e95de626..8b395893bd85 100644
--- a/Documentation/devicetree/bindings/Makefile
+++ b/Documentation/devicetree/bindings/Makefile
@@ -3,12 +3,18 @@ DT_DOC_CHECKER ?= dt-doc-validate
DT_EXTRACT_EX ?= dt-extract-example
DT_MK_SCHEMA ?= dt-mk-schema
-DT_SCHEMA_LINT = $(shell which yamllint)
+DT_SCHEMA_LINT = $(shell which yamllint || \
+ echo "warning: python package 'yamllint' not installed, skipping" >&2)
-DT_SCHEMA_MIN_VERSION = 2021.2.1
+DT_SCHEMA_MIN_VERSION = 2022.3
PHONY += check_dtschema_version
check_dtschema_version:
+ @which $(DT_DOC_CHECKER) >/dev/null || \
+ { echo "Error: '$(DT_DOC_CHECKER)' not found!" >&2; \
+ echo "Ensure dtschema python package is installed and in your PATH." >&2; \
+ echo "Current PATH is:" >&2; \
+ echo "$$PATH" >&2; false; }
@{ echo $(DT_SCHEMA_MIN_VERSION); \
$(DT_DOC_CHECKER) --version 2>/dev/null || echo 0; } | sort -Vc >/dev/null || \
{ echo "ERROR: dtschema minimum version is v$(DT_SCHEMA_MIN_VERSION)" >&2; false; }
@@ -19,16 +25,16 @@ quiet_cmd_extract_ex = DTEX $@
$(obj)/%.example.dts: $(src)/%.yaml check_dtschema_version FORCE
$(call if_changed,extract_ex)
-# Use full schemas when checking %.example.dts
-DT_TMP_SCHEMA := $(obj)/processed-schema-examples.json
+find_all_cmd = find $(srctree)/$(src) \( -name '*.yaml' ! \
+ -name 'processed-schema*' \)
-find_cmd = find $(srctree)/$(src) \( -name '*.yaml' ! \
- -name 'processed-schema*' ! \
- -name '*.example.dt.yaml' \)
+find_cmd = $(find_all_cmd) | grep -F -e "$(subst :," -e ",$(DT_SCHEMA_FILES))"
+CHK_DT_DOCS := $(shell $(find_cmd))
quiet_cmd_yamllint = LINT $(src)
cmd_yamllint = ($(find_cmd) | \
- xargs $(DT_SCHEMA_LINT) -f parsable -c $(srctree)/$(src)/.yamllint >&2) || true
+ xargs -n200 -P$$(nproc) \
+ $(DT_SCHEMA_LINT) -f parsable -c $(srctree)/$(src)/.yamllint >&2) || true
quiet_cmd_chk_bindings = CHKDT $@
cmd_chk_bindings = ($(find_cmd) | \
@@ -36,9 +42,7 @@ quiet_cmd_chk_bindings = CHKDT $@
quiet_cmd_mk_schema = SCHEMA $@
cmd_mk_schema = f=$$(mktemp) ; \
- $(if $(DT_MK_SCHEMA_FLAGS), \
- printf '%s\n' $(real-prereqs), \
- $(find_cmd)) > $$f ; \
+ $(find_all_cmd) > $$f ; \
$(DT_MK_SCHEMA) -j $(DT_MK_SCHEMA_FLAGS) @$$f > $@ ; \
rm -f $$f
@@ -48,45 +52,29 @@ define rule_chkdt
$(call cmd,mk_schema)
endef
-DT_DOCS = $(patsubst $(srctree)/%,%,$(shell $(find_cmd)))
+DT_DOCS = $(patsubst $(srctree)/%,%,$(shell $(find_all_cmd)))
override DTC_FLAGS := \
-Wno-avoid_unnecessary_addr_size \
-Wno-graph_child_address \
- -Wno-interrupt_provider
+ -Wno-interrupt_provider \
+ -Wno-unique_unit_address \
+ -Wunique_unit_address_if_enabled
# Disable undocumented compatible checks until warning free
override DT_CHECKER_FLAGS ?=
-$(obj)/processed-schema-examples.json: $(DT_DOCS) $(src)/.yamllint check_dtschema_version FORCE
+$(obj)/processed-schema.json: $(DT_DOCS) $(src)/.yamllint check_dtschema_version FORCE
$(call if_changed_rule,chkdt)
-ifeq ($(DT_SCHEMA_FILES),)
-
-# Unless DT_SCHEMA_FILES is specified, use the full schema for dtbs_check too.
-# Just copy processed-schema-examples.json
-
-$(obj)/processed-schema.json: $(obj)/processed-schema-examples.json FORCE
- $(call if_changed,copy)
-
-DT_SCHEMA_FILES = $(DT_DOCS)
-
-else
-
-# If DT_SCHEMA_FILES is specified, use it for processed-schema.json
-
-$(obj)/processed-schema.json: DT_MK_SCHEMA_FLAGS := -u
-$(obj)/processed-schema.json: $(DT_SCHEMA_FILES) check_dtschema_version FORCE
- $(call if_changed,mk_schema)
-
-endif
-
-always-$(CHECK_DT_BINDING) += processed-schema-examples.json
-always-$(CHECK_DTBS) += processed-schema.json
-always-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
-always-$(CHECK_DT_BINDING) += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
+always-y += processed-schema.json
+always-$(CHECK_DT_BINDING) += $(patsubst $(srctree)/$(src)/%.yaml,%.example.dts, $(CHK_DT_DOCS))
+always-$(CHECK_DT_BINDING) += $(patsubst $(srctree)/$(src)/%.yaml,%.example.dtb, $(CHK_DT_DOCS))
# Hack: avoid 'Argument list too long' error for 'make clean'. Remove most of
# build artifacts here before they are processed by scripts/Makefile.clean
clean-files = $(shell find $(obj) \( -name '*.example.dts' -o \
- -name '*.example.dt.yaml' \) -delete 2>/dev/null)
+ -name '*.example.dtb' \) -delete 2>/dev/null)
+
+dt_compatible_check: $(obj)/processed-schema.json
+ $(Q)$(srctree)/scripts/dtc/dt-extract-compatibles $(srctree) | xargs dt-check-compatible -v -s $<
diff --git a/Documentation/devicetree/bindings/arm/actions.yaml b/Documentation/devicetree/bindings/arm/actions.yaml
index 02dc72c97645..e012f612f039 100644
--- a/Documentation/devicetree/bindings/arm/actions.yaml
+++ b/Documentation/devicetree/bindings/arm/actions.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/actions.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Actions Semi platforms device tree bindings
+title: Actions Semi platforms
maintainers:
- Andreas Färber <afaerber@suse.de>
diff --git a/Documentation/devicetree/bindings/arm/airoha.yaml b/Documentation/devicetree/bindings/arm/airoha.yaml
new file mode 100644
index 000000000000..3292c669ee11
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/airoha.yaml
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/airoha.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Airoha SoC based Platforms
+
+maintainers:
+ - Felix Fietkau <nbd@nbd.name>
+ - John Crispin <john@phrozen.org>
+
+description:
+ Boards with an Airoha SoC shall have the following properties.
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - airoha,en7523-evb
+ - const: airoha,en7523
+
+additionalProperties: true
+
+...
diff --git a/Documentation/devicetree/bindings/arm/altera.yaml b/Documentation/devicetree/bindings/arm/altera.yaml
index c15c92fdf2ed..8c7575455422 100644
--- a/Documentation/devicetree/bindings/arm/altera.yaml
+++ b/Documentation/devicetree/bindings/arm/altera.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/altera.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Altera's SoCFPGA platform device tree bindings
+title: Altera's SoCFPGA platform
maintainers:
- Dinh Nguyen <dinguyen@kernel.org>
@@ -13,12 +13,55 @@ properties:
$nodename:
const: "/"
compatible:
- items:
- - enum:
- - altr,socfpga-cyclone5
- - altr,socfpga-arria5
- - altr,socfpga-arria10
- - const: altr,socfpga
+ oneOf:
+ - description: Arria 5 boards
+ items:
+ - enum:
+ - altr,socfpga-arria5-socdk
+ - const: altr,socfpga-arria5
+ - const: altr,socfpga
+
+ - description: Arria 10 boards
+ items:
+ - enum:
+ - altr,socfpga-arria10-socdk
+ - const: altr,socfpga-arria10
+ - const: altr,socfpga
+
+ - description: Mercury+ AA1 boards
+ items:
+ - enum:
+ - enclustra,mercury-pe1
+ - google,chameleon-v3
+ - const: enclustra,mercury-aa1
+ - const: altr,socfpga-arria10
+ - const: altr,socfpga
+
+ - description: Cyclone 5 boards
+ items:
+ - enum:
+ - altr,socfpga-cyclone5-socdk
+ - denx,mcvevk
+ - ebv,socrates
+ - macnica,sodia
+ - novtech,chameleon96
+ - samtec,vining
+ - terasic,de0-atlas
+ - terasic,socfpga-cyclone5-sockit
+ - const: altr,socfpga-cyclone5
+ - const: altr,socfpga
+
+ - description: Stratix 10 boards
+ items:
+ - enum:
+ - altr,socfpga-stratix10-socdk
+ - altr,socfpga-stratix10-swvp
+ - const: altr,socfpga-stratix10
+
+ - description: SoCFPGA VT
+ items:
+ - const: altr,socfpga-vt
+ - const: altr,socfpga
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/amazon,al.yaml b/Documentation/devicetree/bindings/arm/amazon,al.yaml
index 0f03135d91b6..37dbb4768e5b 100644
--- a/Documentation/devicetree/bindings/arm/amazon,al.yaml
+++ b/Documentation/devicetree/bindings/arm/amazon,al.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/amazon,al.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Amazon's Annapurna Labs Alpine Platform Device Tree Bindings
+title: Amazon's Annapurna Labs Alpine Platform
maintainers:
- Hanna Hawa <hhhawa@amazon.com>
diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml
index 6423377710ee..274ee0890312 100644
--- a/Documentation/devicetree/bindings/arm/amlogic.yaml
+++ b/Documentation/devicetree/bindings/arm/amlogic.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/amlogic.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Amlogic MesonX device tree bindings
+title: Amlogic MesonX
maintainers:
- Kevin Hilman <khilman@baylibre.com>
@@ -86,6 +86,7 @@ properties:
- enum:
- amlogic,p281
- oranth,tx3-mini
+ - jethome,jethub-j80
- const: amlogic,s905w
- const: amlogic,meson-gxl
@@ -107,6 +108,7 @@ properties:
- amlogic,p230
- amlogic,p231
- libretech,aml-s905d-pc
+ - osmc,vero4k-plus
- phicomm,n1
- smartlabs,sml5442tw
- videostrong,gxl-kii-pro
@@ -118,6 +120,7 @@ properties:
- enum:
- amlogic,q200
- amlogic,q201
+ - azw,gt1-ultimate
- khadas,vim2
- kingnovel,r-box-pro
- libretech,aml-s912-pc
@@ -133,6 +136,8 @@ properties:
items:
- enum:
- amlogic,s400
+ - jethome,jethub-j100
+ - jethome,jethub-j110
- const: amlogic,a113d
- const: amlogic,meson-axg
@@ -141,13 +146,24 @@ properties:
- enum:
- amediatech,x96-max
- amlogic,u200
+ - radxa,zero
- seirobotics,sei510
- const: amlogic,g12a
- description: Boards with the Amlogic Meson G12B A311D SoC
items:
- enum:
+ - bananapi,bpi-m2s
- khadas,vim3
+ - radxa,zero2
+ - const: amlogic,a311d
+ - const: amlogic,g12b
+
+ - description: Boards using the BPI-CM4 module with Amlogic Meson G12B A311D SoC
+ items:
+ - enum:
+ - bananapi,bpi-cm4io
+ - const: bananapi,bpi-cm4
- const: amlogic,a311d
- const: amlogic,g12b
@@ -157,7 +173,10 @@ properties:
- azw,gsking-x
- azw,gtking
- azw,gtking-pro
+ - bananapi,bpi-m2s
+ - hardkernel,odroid-go-ultra
- hardkernel,odroid-n2
+ - hardkernel,odroid-n2l
- hardkernel,odroid-n2-plus
- khadas,vim3
- ugoos,am6
@@ -167,9 +186,15 @@ properties:
- description: Boards with the Amlogic Meson SM1 S905X3/D3/Y3 SoC
items:
- enum:
+ - amediatech,x96-air
+ - amediatech,x96-air-gbit
+ - bananapi,bpi-m2-pro
- bananapi,bpi-m5
+ - cyx,a95xf3-air
+ - cyx,a95xf3-air-gbit
- hardkernel,odroid-c4
- hardkernel,odroid-hc4
+ - haochuangyi,h96-max
- khadas,vim3l
- seirobotics,sei610
- const: amlogic,sm1
@@ -180,6 +205,12 @@ properties:
- amlogic,ad401
- const: amlogic,a1
+ - description: Boards with the Amlogic Meson S4 S805X2 SoC
+ items:
+ - enum:
+ - amlogic,aq222
+ - const: amlogic,s4
+
additionalProperties: true
...
diff --git a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml
index 6cc74523ebfd..7dff32f373cb 100644
--- a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml
+++ b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.yaml
@@ -2,13 +2,13 @@
# Copyright 2019 BayLibre, SAS
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/amlogic/amlogic,meson-gx-ao-secure.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/amlogic/amlogic,meson-gx-ao-secure.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Amlogic Meson Firmware registers Interface
maintainers:
- - Neil Armstrong <narmstrong@baylibre.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
description: |
The Meson SoCs have a register bank with status and data shared with the
diff --git a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-mx-secbus2.yaml b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-mx-secbus2.yaml
index eee7cda9f91b..09b27e98d4c9 100644
--- a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-mx-secbus2.yaml
+++ b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-mx-secbus2.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/amlogic/amlogic,meson-mx-secbus2.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/amlogic/amlogic,meson-mx-secbus2.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Amlogic Meson8/Meson8b/Meson8m2 SECBUS2 register interface
diff --git a/Documentation/devicetree/bindings/arm/apple.yaml b/Documentation/devicetree/bindings/arm/apple.yaml
index 1e772c85206c..883fd67e3752 100644
--- a/Documentation/devicetree/bindings/arm/apple.yaml
+++ b/Documentation/devicetree/bindings/arm/apple.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/apple.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Apple ARM Machine Device Tree Bindings
+title: Apple ARM Machine
maintainers:
- Hector Martin <marcan@marcan.st>
@@ -12,12 +12,27 @@ maintainers:
description: |
ARM platforms using SoCs designed by Apple Inc., branded "Apple Silicon".
- This currently includes devices based on the "M1" SoC, starting with the
- three Mac models released in late 2020:
+ This currently includes devices based on the "M1" SoC:
- Mac mini (M1, 2020)
- MacBook Pro (13-inch, M1, 2020)
- MacBook Air (M1, 2020)
+ - iMac (24-inch, M1, 2021)
+
+ Devices based on the "M2" SoC:
+
+ - MacBook Air (M2, 2022)
+ - MacBook Pro (13-inch, M2, 2022)
+ - Mac mini (M2, 2023)
+
+ And devices based on the "M1 Pro", "M1 Max" and "M1 Ultra" SoCs:
+
+ - MacBook Pro (14-inch, M1 Pro, 2021)
+ - MacBook Pro (14-inch, M1 Max, 2021)
+ - MacBook Pro (16-inch, M1 Pro, 2021)
+ - MacBook Pro (16-inch, M1 Max, 2021)
+ - Mac Studio (M1 Max, 2022)
+ - Mac Studio (M1 Ultra, 2022)
The compatible property should follow this format:
@@ -56,9 +71,44 @@ properties:
- apple,j274 # Mac mini (M1, 2020)
- apple,j293 # MacBook Pro (13-inch, M1, 2020)
- apple,j313 # MacBook Air (M1, 2020)
+ - apple,j456 # iMac (24-inch, 4x USB-C, M1, 2021)
+ - apple,j457 # iMac (24-inch, 2x USB-C, M1, 2021)
- const: apple,t8103
- const: apple,arm-platform
+ - description: Apple M2 SoC based platforms
+ items:
+ - enum:
+ - apple,j413 # MacBook Air (M2, 2022)
+ - apple,j473 # Mac mini (M2, 2023)
+ - apple,j493 # MacBook Pro (13-inch, M2, 2022)
+ - const: apple,t8112
+ - const: apple,arm-platform
+
+ - description: Apple M1 Pro SoC based platforms
+ items:
+ - enum:
+ - apple,j314s # MacBook Pro (14-inch, M1 Pro, 2021)
+ - apple,j316s # MacBook Pro (16-inch, M1 Pro, 2021)
+ - const: apple,t6000
+ - const: apple,arm-platform
+
+ - description: Apple M1 Max SoC based platforms
+ items:
+ - enum:
+ - apple,j314c # MacBook Pro (14-inch, M1 Max, 2021)
+ - apple,j316c # MacBook Pro (16-inch, M1 Max, 2021)
+ - apple,j375c # Mac Studio (M1 Max, 2022)
+ - const: apple,t6001
+ - const: apple,arm-platform
+
+ - description: Apple M1 Ultra SoC based platforms
+ items:
+ - enum:
+ - apple,j375d # Mac Studio (M1 Ultra, 2022)
+ - const: apple,t6002
+ - const: apple,arm-platform
+
additionalProperties: true
...
diff --git a/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml b/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml
new file mode 100644
index 000000000000..673277a7a224
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/apple/apple,pmgr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Apple SoC Power Manager (PMGR)
+
+maintainers:
+ - Hector Martin <marcan@marcan.st>
+
+description: |
+ Apple SoCs include PMGR blocks responsible for power management,
+ which can control various clocks, resets, power states, and
+ performance features. This node represents the PMGR as a syscon,
+ with sub-nodes representing individual features.
+
+properties:
+ $nodename:
+ pattern: "^power-management@[0-9a-f]+$"
+
+ compatible:
+ items:
+ - enum:
+ - apple,t8103-pmgr
+ - apple,t8112-pmgr
+ - apple,t6000-pmgr
+ - const: apple,pmgr
+ - const: syscon
+ - const: simple-mfd
+
+ reg:
+ maxItems: 1
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 1
+
+patternProperties:
+ "power-controller@[0-9a-f]+$":
+ description:
+ The individual power management domains within this controller
+ type: object
+ $ref: /schemas/power/apple,pmgr-pwrstate.yaml#
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ power-management@23b700000 {
+ compatible = "apple,t8103-pmgr", "apple,pmgr", "syscon", "simple-mfd";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x2 0x3b700000 0x0 0x14000>;
+
+ ps_sio: power-controller@1c0 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x1c0 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "sio";
+ apple,always-on;
+ };
+
+ ps_uart_p: power-controller@220 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x220 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "uart_p";
+ power-domains = <&ps_sio>;
+ };
+
+ ps_uart0: power-controller@270 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x270 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "uart0";
+ power-domains = <&ps_uart_p>;
+ };
+ };
+
+ power-management@23d280000 {
+ compatible = "apple,t8103-pmgr", "apple,pmgr", "syscon", "simple-mfd";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x2 0x3d280000 0x0 0xc000>;
+
+ ps_aop_filter: power-controller@4000 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x4000 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "aop_filter";
+ };
+
+ ps_aop_base: power-controller@4010 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x4010 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "aop_base";
+ power-domains = <&ps_aop_filter>;
+ };
+
+ ps_aop_shim: power-controller@4038 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x4038 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "aop_shim";
+ power-domains = <&ps_aop_base>;
+ };
+
+ ps_aop_uart0: power-controller@4048 {
+ compatible = "apple,t8103-pmgr-pwrstate", "apple,pmgr-pwrstate";
+ reg = <0x4048 8>;
+ #power-domain-cells = <0>;
+ #reset-cells = <0>;
+ label = "aop_uart0";
+ power-domains = <&ps_aop_shim>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/arm,cci-400.yaml b/Documentation/devicetree/bindings/arm/arm,cci-400.yaml
new file mode 100644
index 000000000000..d28303d909e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,cci-400.yaml
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,cci-400.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM CCI Cache Coherent Interconnect
+
+maintainers:
+ - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+
+description: >
+ ARM multi-cluster systems maintain intra-cluster coherency through a cache
+ coherent interconnect (CCI) that is capable of monitoring bus transactions
+ and managing coherency, TLB invalidations and memory barriers.
+
+ It allows snooping and distributed virtual memory message broadcast across
+ clusters, through a memory mapped interface, with a global control register
+ space and multiple sets of interface control registers, one per slave
+ interface.
+
+properties:
+ $nodename:
+ pattern: "^cci(@[0-9a-f]+)?$"
+
+ compatible:
+ enum:
+ - arm,cci-400
+ - arm,cci-500
+ - arm,cci-550
+
+ reg:
+ maxItems: 1
+ description: >
+ Specifies base physical address of CCI control registers common to all
+ interfaces.
+
+ "#address-cells": true
+ "#size-cells": true
+ ranges: true
+
+patternProperties:
+ "^slave-if@[0-9a-f]+$":
+ type: object
+
+ properties:
+ compatible:
+ const: arm,cci-400-ctrl-if
+
+ interface-type:
+ enum:
+ - ace
+ - ace-lite
+
+ reg:
+ maxItems: 1
+
+ required:
+ - compatible
+ - interface-type
+ - reg
+
+ additionalProperties: false
+
+ "^pmu@[0-9a-f]+$":
+ type: object
+
+ properties:
+ compatible:
+ oneOf:
+ - const: arm,cci-400-pmu,r0
+ - const: arm,cci-400-pmu,r1
+ - const: arm,cci-400-pmu
+ deprecated: true
+ description: >
+ Permitted only where OS has secure access to CCI registers
+ - const: arm,cci-500-pmu,r0
+ - const: arm,cci-550-pmu,r0
+
+ interrupts:
+ minItems: 1
+ maxItems: 8
+ description: >
+ List of counter overflow interrupts, one per counter. The interrupts
+ must be specified starting with the cycle counter overflow interrupt,
+ followed by counter0 overflow interrupt, counter1 overflow
+ interrupt,... ,counterN overflow interrupt.
+
+ The CCI PMU has an interrupt signal for each counter. The number of
+ interrupts must be equal to the number of counters.
+
+ reg:
+ maxItems: 1
+
+ required:
+ - compatible
+ - interrupts
+ - reg
+
+ additionalProperties: false
+
+required:
+ - "#address-cells"
+ - "#size-cells"
+ - compatible
+ - ranges
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ / {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ compatible = "arm,vexpress,v2p-ca15_a7", "arm,vexpress";
+ model = "V2P-CA15_CA7";
+ arm,hbi = <0x249>;
+ interrupt-parent = <&gic>;
+
+ gic: interrupt-controller {
+ interrupt-controller;
+ #interrupt-cells = <3>;
+ };
+
+ /*
+ * This CCI node corresponds to a CCI component whose control
+ * registers sits at address 0x000000002c090000.
+ *
+ * CCI slave interface @0x000000002c091000 is connected to dma
+ * controller dma0.
+ *
+ * CCI slave interface @0x000000002c094000 is connected to CPUs
+ * {CPU0, CPU1};
+ *
+ * CCI slave interface @0x000000002c095000 is connected to CPUs
+ * {CPU2, CPU3};
+ */
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <1>;
+
+ CPU0: cpu@0 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ cci-control-port = <&cci_control1>;
+ reg = <0x0>;
+ };
+
+ CPU1: cpu@1 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ cci-control-port = <&cci_control1>;
+ reg = <0x1>;
+ };
+
+ CPU2: cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ cci-control-port = <&cci_control2>;
+ reg = <0x100>;
+ };
+
+ CPU3: cpu@101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ cci-control-port = <&cci_control2>;
+ reg = <0x101>;
+ };
+ };
+
+ cci@2c090000 {
+ compatible = "arm,cci-400";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x0 0x2c090000 0 0x1000>;
+ ranges = <0x0 0x0 0x2c090000 0x10000>;
+
+ cci_control0: slave-if@1000 {
+ compatible = "arm,cci-400-ctrl-if";
+ interface-type = "ace-lite";
+ reg = <0x1000 0x1000>;
+ };
+
+ cci_control1: slave-if@4000 {
+ compatible = "arm,cci-400-ctrl-if";
+ interface-type = "ace";
+ reg = <0x4000 0x1000>;
+ };
+
+ cci_control2: slave-if@5000 {
+ compatible = "arm,cci-400-ctrl-if";
+ interface-type = "ace";
+ reg = <0x5000 0x1000>;
+ };
+
+ pmu@9000 {
+ compatible = "arm,cci-400-pmu";
+ reg = <0x9000 0x5000>;
+ interrupts = <0 101 4>,
+ <0 102 4>,
+ <0 103 4>,
+ <0 104 4>,
+ <0 105 4>;
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-catu.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-catu.yaml
new file mode 100644
index 000000000000..2bae06eed693
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-catu.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-catu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm Coresight Address Translation Unit (CATU)
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoCs tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight Address Translation Unit (CATU) translates addresses between an
+ AXI master and system memory. The CATU is normally used along with the TMC to
+ implement scattering of virtual trace buffers in physical memory. The CATU
+ translates contiguous Virtual Addresses (VAs) from an AXI master into
+ non-contiguous Physical Addresses (PAs) that are intended for system memory.
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-catu
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-catu
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ interrupts:
+ maxItems: 1
+ description: Address translation error interrupt
+
+ power-domains:
+ maxItems: 1
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: AXI Slave connected to another Coresight component
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ catu@207e0000 {
+ compatible = "arm,coresight-catu", "arm,primecell";
+ reg = <0x207e0000 0x1000>;
+
+ clocks = <&oscclk6a>;
+ clock-names = "apb_pclk";
+
+ interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;
+ in-ports {
+ port {
+ catu_in_port: endpoint {
+ remote-endpoint = <&etr_out_port>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-cpu-debug.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-cpu-debug.yaml
new file mode 100644
index 000000000000..0a6bc03ebe00
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-cpu-debug.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-cpu-debug.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: CoreSight CPU Debug Component
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight CPU debug components are compliant with the ARMv8 architecture
+ reference manual (ARM DDI 0487A.k) Chapter 'Part H: External debug'. The
+ external debug module is mainly used in two modes: self-hosted debug and
+ external debug. It can be accessed through an MMIO region from CoreSight,
+ and the debug module eventually connects with the CPU for debugging. The
+ debug module also provides a sample-based profiling extension, which can be
+ used to sample the CPU program counter, secure state and exception level,
+ etc.; usually every CPU has one dedicated debug module to be connected.
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-cpu-debug
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-cpu-debug
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ maxItems: 1
+
+ cpu:
+ description:
+ A phandle to the cpu this debug component is bound to.
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ power-domains:
+ maxItems: 1
+ description:
+ A phandle to the debug power domain if the debug logic has its own
+ dedicated power domain. CPU idle states may also need to be separately
+ constrained to keep CPU cores powered.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - cpu
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ debug@f6590000 {
+ compatible = "arm,coresight-cpu-debug", "arm,primecell";
+ reg = <0xf6590000 0x1000>;
+ clocks = <&sys_ctrl 1>;
+ clock-names = "apb_pclk";
+ cpu = <&cpu0>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-cti.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-cti.yaml
new file mode 100644
index 000000000000..0c5b875cb654
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-cti.yaml
@@ -0,0 +1,334 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+# Copyright 2019 Linaro Ltd.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-cti.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM Coresight Cross Trigger Interface (CTI) device.
+
+description: |
+ The CoreSight Embedded Cross Trigger (ECT) consists of CTI devices connected
+ to one or more CoreSight components and/or a CPU, with CTIs interconnected in
+ a star topology via the Cross Trigger Matrix (CTM), which is not programmable.
+ The ECT components are not part of the trace generation data path and are thus
+ not part of the CoreSight graph.
+
+ The CTI component properties define the connections between the individual
+ CTI and the components it is directly connected to, consisting of input and
+ output hardware trigger signals. CTIs can have a maximum number of input and
+ output hardware trigger signals (8 each for v1 CTI, 32 each for v2 CTI). The
+ number is defined at design time; the maximum of each is defined in the
+ DEVID register.
+
+ CTIs are interconnected in a star topology via the CTM, using a number of
+ programmable channels, usually 4, but again implementation defined and
+ described in the DEVID register. The star topology is not required to be
+ described in the bindings as the actual connections are software
+ programmable.
+
+ In general the connections between CTI and components via the trigger signals
+ are implementation defined, except when the CTI is connected to an ARM v8
+ architecture core and optional ETM.
+
+ In this case the ARM v8 architecture defines the required signal connections
+ between CTI and the CPU core and ETM if present. In the case of a v8
+ architecturally connected CTI an additional compatible string is used to
+ indicate this feature (arm,coresight-cti-v8-arch).
+
+ When CTI trigger connection information is unavailable, a minimal driver
+ binding can be declared with no explicit trigger signals. This results in
+ the driver detecting the maximum available triggers and channels from the
+ DEVID register and making them all available for use as a single default
+ connection. Any user / client application will require additional information
+ on the connections between the CTI and other components for correct operation.
+ This information might be found by enabling the Integration Test registers in
+ the driver (set CONFIG_CORESIGHT_CTI_INTEGRATION_TEST in the kernel
+ configuration). These registers may be used to explore the trigger connections
+ between CTI and other CoreSight components.
+
+ Certain triggers between CoreSight devices and the CTI have specific types
+ and usages. These can be declared along with the signal indexes using the
+ constants defined in <dt-bindings/arm/coresight-cti-dt.h>.
+
+ For example a CTI connected to a core will usually have a DBGREQ signal. This
+ is defined in the binding as type PE_EDBGREQ. These types will appear in an
+ optional array alongside the signal indexes. Omitting types will default all
+ signals to GEN_IO.
+
+ Note that some hardware trigger signals can be connected to non-CoreSight
+ components (e.g. UART etc) depending on hardware implementation.
+
+maintainers:
+ - Mike Leach <mike.leach@linaro.org>
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - arm,coresight-cti
+ required:
+ - compatible
+
+properties:
+ $nodename:
+ pattern: "^cti(@[0-9a-f]+)$"
+ compatible:
+ oneOf:
+ - items:
+ - const: arm,coresight-cti
+ - const: arm,primecell
+ - items:
+ - const: arm,coresight-cti-v8-arch
+ - const: arm,coresight-cti
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ cpu:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Handle to cpu this device is associated with. This must appear in the
+ base cti node if compatible string arm,coresight-cti-v8-arch is used,
+ or may appear in a trig-conns child node when appropriate.
+
+ power-domains:
+ maxItems: 1
+
+ arm,cti-ctm-id:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Defines the CTM this CTI is connected to, in large systems with multiple
+ separate CTI/CTM nets. This is typical of multi-socket systems where the
+ CTM is propagated between sockets.
+
+ arm,cs-dev-assoc:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Defines a phandle reference to an associated CoreSight trace device.
+ When the associated trace device is enabled, the respective CTI will be
+ enabled as well. Use in a trig-conns node, or in the CTI base node when
+ the compatible string arm,coresight-cti-v8-arch is used. If the
+ associated device has not yet been registered, the node name will be
+ stored as the connection name for later resolution. If the associated
+ device is not a CoreSight device, or is never registered, the node name
+ remains the connection name and automatic enabling will not occur.
+
+ # size cells and address cells required if trig-conns node present.
+ "#size-cells":
+ const: 0
+
+ "#address-cells":
+ const: 1
+
+patternProperties:
+ '^trig-conns@([0-9]+)$':
+ type: object
+ description:
+ A trigger connections child node which describes the trigger signals
+ between this CTI and another hardware device. This device may be a CPU,
+ CoreSight device, any other hardware device or simple external IO lines.
+ The connection may have both input and output triggers, or only one or the
+ other.
+
+ properties:
+ reg:
+ maxItems: 1
+
+ arm,trig-in-sigs:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 1
+ maxItems: 32
+ description:
+ List of CTI trigger in signal numbers in use by a trig-conns node.
+
+ arm,trig-in-types:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 1
+ maxItems: 32
+ description:
+ List of constants representing the types for the CTI trigger in
+ signals. Types in this array match to the corresponding signal in the
+ arm,trig-in-sigs array. If the -types array is smaller, or omitted
+ completely, then the types will default to GEN_IO.
+
+ arm,trig-out-sigs:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 1
+ maxItems: 32
+ description:
+ List of CTI trigger out signal numbers in use by a trig-conns node.
+
+ arm,trig-out-types:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 1
+ maxItems: 32
+ description:
+ List of constants representing the types for the CTI trigger out
+ signals. Types in this array match to the corresponding signal
+ in the arm,trig-out-sigs array. If the "-types" array is smaller,
+ or omitted completely, then the types will default to GEN_IO.
+
+ arm,trig-filters:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 1
+ maxItems: 32
+ description:
+ List of CTI trigger out signals that will be blocked from becoming
+ active, unless filtering is disabled on the driver.
+
+ arm,trig-conn-name:
+ $ref: /schemas/types.yaml#/definitions/string
+ description:
+ Defines a connection name that will be displayed, if the cpu or
+ arm,cs-dev-assoc properties are not being used in this connection.
+ Principal use is for CTIs that are connected to non-CoreSight devices
+ or external IO.
+
+ anyOf:
+ - required:
+ - arm,trig-in-sigs
+ - required:
+ - arm,trig-out-sigs
+ oneOf:
+ - required:
+ - arm,trig-conn-name
+ - required:
+ - cpu
+ - required:
+ - arm,cs-dev-assoc
+ required:
+ - reg
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+if:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-cti-v8-arch
+
+then:
+ required:
+ - cpu
+
+unevaluatedProperties: false
+
+examples:
+ # minimum CTI definition. DEVID register used to set number of triggers.
+ - |
+ cti@20020000 {
+ compatible = "arm,coresight-cti", "arm,primecell";
+ reg = <0x20020000 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ };
+ # v8 architecturally defined CTI - CPU + ETM connections generated by the
+ # driver according to the v8 architecture specification.
+ - |
+ cti@859000 {
+ compatible = "arm,coresight-cti-v8-arch", "arm,coresight-cti",
+ "arm,primecell";
+ reg = <0x859000 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+
+ cpu = <&CPU1>;
+ arm,cs-dev-assoc = <&etm1>;
+ };
+ # Implementation defined CTI - CPU + ETM connections explicitly defined.
+ # Shows use of type constants from dt-bindings/arm/coresight-cti-dt.h
+ # #size-cells and #address-cells are required if trig-conns@ nodes present.
+ - |
+ #include <dt-bindings/arm/coresight-cti-dt.h>
+
+ cti@858000 {
+ compatible = "arm,coresight-cti", "arm,primecell";
+ reg = <0x858000 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+
+ arm,cti-ctm-id = <1>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ trig-conns@0 {
+ reg = <0>;
+ arm,trig-in-sigs = <4 5 6 7>;
+ arm,trig-in-types = <ETM_EXTOUT
+ ETM_EXTOUT
+ ETM_EXTOUT
+ ETM_EXTOUT>;
+ arm,trig-out-sigs = <4 5 6 7>;
+ arm,trig-out-types = <ETM_EXTIN
+ ETM_EXTIN
+ ETM_EXTIN
+ ETM_EXTIN>;
+ arm,cs-dev-assoc = <&etm0>;
+ };
+
+ trig-conns@1 {
+ reg = <1>;
+ cpu = <&CPU0>;
+ arm,trig-in-sigs = <0 1>;
+ arm,trig-in-types = <PE_DBGTRIGGER
+ PE_PMUIRQ>;
+ arm,trig-out-sigs = <0 1 2>;
+ arm,trig-out-types = <PE_EDBGREQ
+ PE_DBGRESTART
+ PE_CTIIRQ>;
+
+ arm,trig-filters = <0>;
+ };
+ };
+ # Implementation defined CTI - non CoreSight component connections.
+ - |
+ cti@20110000 {
+ compatible = "arm,coresight-cti", "arm,primecell";
+ reg = <0x20110000 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ trig-conns@0 {
+ reg = <0>;
+ arm,trig-in-sigs = <0>;
+ arm,trig-in-types = <GEN_INTREQ>;
+ arm,trig-out-sigs = <0>;
+ arm,trig-out-types = <GEN_HALTREQ>;
+ arm,trig-conn-name = "sys_profiler";
+ };
+
+ trig-conns@1 {
+ reg = <1>;
+ arm,trig-out-sigs = <2 3>;
+ arm,trig-out-types = <GEN_HALTREQ GEN_RESTARTREQ>;
+ arm,trig-conn-name = "watchdog";
+ };
+
+ trig-conns@2 {
+ reg = <2>;
+ arm,trig-in-sigs = <1 6>;
+ arm,trig-in-types = <GEN_HALTREQ GEN_RESTARTREQ>;
+ arm,trig-conn-name = "g_counter";
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dynamic-funnel.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dynamic-funnel.yaml
new file mode 100644
index 000000000000..44a1041cb0fc
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-dynamic-funnel.yaml
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-dynamic-funnel.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Programmable Trace Bus Funnel
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight funnel merges 2-8 trace sources into a single trace
+ stream with programmable enable and priority of input ports.
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-dynamic-funnel
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-dynamic-funnel
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ power-domains:
+ maxItems: 1
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ patternProperties:
+ '^port(@[0-7])?$':
+ description: Input connections from CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Output connection to CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+ - out-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ funnel@20040000 {
+ compatible = "arm,coresight-dynamic-funnel", "arm,primecell";
+ reg = <0x20040000 0x1000>;
+
+ clocks = <&oscclk6a>;
+ clock-names = "apb_pclk";
+ out-ports {
+ port {
+ funnel_out_port0: endpoint {
+ remote-endpoint = <&replicator_in_port0>;
+ };
+ };
+ };
+
+ in-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ funnel_in_port0: endpoint {
+ remote-endpoint = <&ptm0_out_port>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ funnel_in_port1: endpoint {
+ remote-endpoint = <&ptm1_out_port>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+ funnel_in_port2: endpoint {
+ remote-endpoint = <&etm0_out_port>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dynamic-replicator.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dynamic-replicator.yaml
new file mode 100644
index 000000000000..03792e9bd97a
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-dynamic-replicator.yaml
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-dynamic-replicator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm Coresight Programmable Trace Bus Replicator
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight replicator splits a single trace stream into two trace streams
+ for systems that have more than one trace sink component.
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-dynamic-replicator
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-dynamic-replicator
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ power-domains:
+ maxItems: 1
+
+ qcom,replicator-loses-context:
+ type: boolean
+ description:
+ Indicates that the replicator will lose register context when the AMBA
+ clock is removed, which is observed in some replicator designs.
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Input connection from CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ patternProperties:
+ '^port(@[01])?$':
+ description: Output connections to CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+ - out-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ replicator@20120000 {
+ compatible = "arm,coresight-dynamic-replicator", "arm,primecell";
+ reg = <0x20120000 0x1000>;
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+
+ out-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* replicator output ports */
+ port@0 {
+ reg = <0>;
+ replicator_out_port0: endpoint {
+ remote-endpoint = <&tpiu_in_port>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ replicator_out_port1: endpoint {
+ remote-endpoint = <&etr_in_port>;
+ };
+ };
+ };
+ in-ports {
+ port {
+ replicator_in_port0: endpoint {
+ remote-endpoint = <&csys2_funnel_out_port>;
+ };
+ };
+ };
+ };
+...
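The qcom,replicator-loses-context flag documented above is not exercised by the example. A sketch of a node using it, with hypothetical address, clock and endpoint labels:

    replicator@6046000 {
        compatible = "arm,coresight-dynamic-replicator", "arm,primecell";
        reg = <0x06046000 0x1000>;

        clocks = <&apb_clk>;
        clock-names = "apb_pclk";

        /* this replicator design loses register state when the
         * AMBA clock is removed
         */
        qcom,replicator-loses-context;

        in-ports {
            port {
                swao_rep_in: endpoint {
                    remote-endpoint = <&swao_funnel_out>;
                };
            };
        };

        out-ports {
            port {
                swao_rep_out: endpoint {
                    remote-endpoint = <&etr1_in_port>;
                };
            };
        };
    };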
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-etb10.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-etb10.yaml
new file mode 100644
index 000000000000..90679788e0bf
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-etb10.yaml
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-etb10.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Embedded Trace Buffer
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight Embedded Trace Buffer stores traces in a dedicated SRAM that is
+ used as a circular buffer.
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-etb10
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-etb10
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ power-domains:
+ maxItems: 1
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Input connection from CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ etb@20010000 {
+ compatible = "arm,coresight-etb10", "arm,primecell";
+ reg = <0x20010000 0x1000>;
+
+ clocks = <&oscclk6a>;
+ clock-names = "apb_pclk";
+ in-ports {
+ port {
+ etb_in_port: endpoint {
+ remote-endpoint = <&replicator_out_port0>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-etm.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-etm.yaml
new file mode 100644
index 000000000000..01200f67504a
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-etm.yaml
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-etm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Embedded Trace MacroCell
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The Embedded Trace Macrocell (ETM) is a real-time trace module providing
+ instruction and data tracing of a processor.
+
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - arm,coresight-etm3x
+ - arm,coresight-etm4x
+ - arm,coresight-etm4x-sysreg
+ required:
+ - compatible
+
+allOf:
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-etm4x-sysreg
+ then:
+ $ref: /schemas/arm/primecell.yaml#
+ required:
+ - reg
+
+properties:
+ compatible:
+ oneOf:
+ - description:
+ Embedded Trace Macrocell with memory mapped access.
+ items:
+ - enum:
+ - arm,coresight-etm3x
+ - arm,coresight-etm4x
+ - const: arm,primecell
+ - description:
+ Embedded Trace Macrocell (version 4.x), with system register access only
+ const: arm,coresight-etm4x-sysreg
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ power-domains:
+ maxItems: 1
+
+ arm,coresight-loses-context-with-cpu:
+ type: boolean
+ description:
+ Indicates that the hardware will lose register context on CPU power down
+ (e.g. CPUIdle). An example of where this may be needed is a system which
+ contains a CoreSight component and CPU in the same power domain. When the
+ CPU powers down, the CoreSight component also powers down and loses its
+ context.
+
+ arm,cp14:
+ type: boolean
+ description:
+ Must be present if the system accesses ETM/PTM management registers via
+ co-processor 14.
+
+ qcom,skip-power-up:
+ type: boolean
+ description:
+ Indicates that an implementation can skip powering up the trace unit.
+ TRCPDCR.PU does not have to be set on Qualcomm Technologies Inc. systems
+ since ETMs are in the same power domain as their CPU cores. This property
+ is required to identify such systems with hardware errata where the CPU
+ watchdog counter is stopped when TRCPDCR.PU is set.
+
+ cpu:
+ description:
+ phandle to the cpu this ETM is bound to.
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Output connection from the ETM to CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - cpu
+ - out-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ ptm@2201c000 {
+ compatible = "arm,coresight-etm3x", "arm,primecell";
+ reg = <0x2201c000 0x1000>;
+
+ cpu = <&cpu0>;
+ clocks = <&oscclk6a>;
+ clock-names = "apb_pclk";
+ out-ports {
+ port {
+ ptm0_out_port: endpoint {
+ remote-endpoint = <&funnel_in_port0>;
+ };
+ };
+ };
+ };
+
+ ptm@2201d000 {
+ compatible = "arm,coresight-etm3x", "arm,primecell";
+ reg = <0x2201d000 0x1000>;
+
+ cpu = <&cpu1>;
+ clocks = <&oscclk6a>;
+ clock-names = "apb_pclk";
+ out-ports {
+ port {
+ ptm1_out_port: endpoint {
+ remote-endpoint = <&funnel_in_port1>;
+ };
+ };
+ };
+ };
+...
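The schema above documents the ETMv4 compatibles and the arm,coresight-loses-context-with-cpu flag, but the examples only show memory-mapped ETM3x/PTM nodes. A sketch of an ETMv4 node using the flag; the address, clock and endpoint labels are assumptions:

    etm@7040000 {
        compatible = "arm,coresight-etm4x", "arm,primecell";
        reg = <0x07040000 0x1000>;

        cpu = <&cpu0>;
        clocks = <&soc_clk>;
        clock-names = "apb_pclk";

        /* ETM shares the CPU power domain, so state is lost in CPUIdle */
        arm,coresight-loses-context-with-cpu;

        out-ports {
            port {
                etm0_out_port: endpoint {
                    remote-endpoint = <&funnel_in_port2>;
                };
            };
        };
    };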
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-static-funnel.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-static-funnel.yaml
new file mode 100644
index 000000000000..cc8c3baa79b4
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-static-funnel.yaml
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-static-funnel.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Static Trace Bus Funnel
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight static funnel merges 2-8 trace sources into a single trace
+ stream.
+
+properties:
+ compatible:
+ const: arm,coresight-static-funnel
+
+ power-domains:
+ maxItems: 1
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ patternProperties:
+ '^port@[0-7]$':
+ description: Input connections from CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Output connection to CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - in-ports
+ - out-ports
+
+additionalProperties: false
+
+examples:
+ - |
+ funnel {
+ /*
+ * non-configurable replicators don't show up on the
+ * AMBA bus. As such no need to add "arm,primecell".
+ */
+ compatible = "arm,coresight-static-funnel";
+
+ out-ports {
+ port {
+ combo_funnel_out: endpoint {
+ remote-endpoint = <&top_funnel_in>;
+ };
+ };
+ };
+
+ in-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ combo_funnel_in0: endpoint {
+ remote-endpoint = <&cluster0_etf_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ combo_funnel_in1: endpoint {
+ remote-endpoint = <&cluster1_etf_out>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-static-replicator.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-static-replicator.yaml
new file mode 100644
index 000000000000..1892a091ac35
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-static-replicator.yaml
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-static-replicator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Static Trace Bus Replicator
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight replicator splits a single trace stream into two trace streams
+ for systems that have more than one trace sink component.
+
+properties:
+ compatible:
+ const: arm,coresight-static-replicator
+
+ power-domains:
+ maxItems: 1
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Input connection from CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ patternProperties:
+ '^port@[01]$':
+ description: Output connections to CoreSight Trace bus
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - in-ports
+ - out-ports
+
+additionalProperties: false
+
+examples:
+ - |
+ replicator {
+ /*
+ * non-configurable replicators don't show up on the
+ * AMBA bus. As such no need to add "arm,primecell".
+ */
+ compatible = "arm,coresight-static-replicator";
+
+ out-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* replicator output ports */
+ port@0 {
+ reg = <0>;
+ replicator_out_port0: endpoint {
+ remote-endpoint = <&etb_in_port>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ replicator_out_port1: endpoint {
+ remote-endpoint = <&tpiu_in_port>;
+ };
+ };
+ };
+
+ in-ports {
+ port {
+ replicator_in_port0: endpoint {
+ remote-endpoint = <&funnel_out_port0>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-stm.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-stm.yaml
new file mode 100644
index 000000000000..378380c3f5aa
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-stm.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-stm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight System Trace MacroCell
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The STM is a trace source that is integrated into a CoreSight system, designed
+ primarily for high-bandwidth trace of instrumentation embedded into software.
+ This instrumentation is made up of memory-mapped writes to the STM Advanced
+ eXtensible Interface (AXI) slave, which carry information about the behavior
+ of the software.
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-stm
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-stm
+ - const: arm,primecell
+
+ reg:
+ maxItems: 2
+
+ reg-names:
+ items:
+ - const: stm-base
+ - const: stm-stimulus-base
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ power-domains:
+ maxItems: 1
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Output connection to the CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+ - out-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ stm@20100000 {
+ compatible = "arm,coresight-stm", "arm,primecell";
+ reg = <0x20100000 0x1000>,
+ <0x28000000 0x180000>;
+ reg-names = "stm-base", "stm-stimulus-base";
+
+ clocks = <&soc_smc50mhz>;
+ clock-names = "apb_pclk";
+ out-ports {
+ port {
+ stm_out_port: endpoint {
+ remote-endpoint = <&main_funnel_in_port2>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-tmc.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-tmc.yaml
new file mode 100644
index 000000000000..cb8dceaca70e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-tmc.yaml
@@ -0,0 +1,137 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-tmc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Trace Memory Controller
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The Trace Memory Controller is used for Embedded Trace Buffer (ETB),
+ Embedded Trace FIFO (ETF) and Embedded Trace Router (ETR) configurations.
+ The configuration
+ mode (ETB, ETF, ETR) is discovered at boot time when the device is probed.
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-tmc
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-tmc
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ iommus:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ arm,buffer-size:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ deprecated: true
+ description:
+ Size of contiguous buffer space for TMC ETR (Embedded Trace Router). The
+ buffer size can instead be configured dynamically via the buffer_size
+ property in sysfs.
+
+ arm,scatter-gather:
+ type: boolean
+ description:
+ Indicates that the TMC-ETR can safely use the SG mode on this system.
+
+ arm,max-burst-size:
+ description:
+ The maximum burst size initiated by the TMC on the AXI master interface.
+ The burst size can be in the range [0..15]; the setting supports from one
+ data transfer per burst up to a maximum of 16 data transfers per burst.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ maximum: 15
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Input connection from the CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+ out-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: AXI or ATB Master output connection. Used for ETR
+ and ETF configurations.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ etr@20070000 {
+ compatible = "arm,coresight-tmc", "arm,primecell";
+ reg = <0x20070000 0x1000>;
+
+ clocks = <&oscclk6a>;
+ clock-names = "apb_pclk";
+ in-ports {
+ port {
+ etr_in_port: endpoint {
+ remote-endpoint = <&replicator2_out_port0>;
+ };
+ };
+ };
+
+ out-ports {
+ port {
+ etr_out_port: endpoint {
+ remote-endpoint = <&catu_in_port>;
+ };
+ };
+ };
+ };
+...
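The ETR-specific tunables arm,scatter-gather and arm,max-burst-size are documented above but absent from the example. A sketch showing both, with hypothetical labels; the value 15 requests up to 16 data transfers per AXI burst:

    etr@20070000 {
        compatible = "arm,coresight-tmc", "arm,primecell";
        reg = <0x20070000 0x1000>;

        clocks = <&oscclk6a>;
        clock-names = "apb_pclk";

        /* the ETR may safely use AXI scatter-gather mode on this system */
        arm,scatter-gather;
        /* 15 => up to 16 data transfers per AXI burst */
        arm,max-burst-size = <15>;

        in-ports {
            port {
                etr1_in_port: endpoint {
                    remote-endpoint = <&rep_out_port>;
                };
            };
        };
    };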
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-tpiu.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-tpiu.yaml
new file mode 100644
index 000000000000..61a0cdc27745
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-tpiu.yaml
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,coresight-tpiu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm CoreSight Trace Port Interface Unit
+
+maintainers:
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+ - Mike Leach <mike.leach@linaro.org>
+ - Leo Yan <leo.yan@linaro.org>
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+
+description: |
+ CoreSight components are compliant with the ARM CoreSight architecture
+ specification and can be connected in various topologies to suit a particular
+ SoC's tracing needs. These trace components can generally be classified as
+ sinks, links and sources. Trace data produced by one or more sources flows
+ through the intermediate links connecting the source to the currently selected
+ sink.
+
+ The CoreSight Trace Port Interface Unit captures trace data from the trace bus
+ and outputs it to an external trace port.
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,coresight-tpiu
+ required:
+ - compatible
+
+allOf:
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: arm,coresight-tpiu
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: apb_pclk
+ - const: atclk
+
+ power-domains:
+ maxItems: 1
+
+ in-ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ additionalProperties: false
+
+ properties:
+ port:
+ description: Input connection from the CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ tpiu@e3c05000 {
+ compatible = "arm,coresight-tpiu", "arm,primecell";
+ reg = <0xe3c05000 0x1000>;
+
+ clocks = <&clk_375m>;
+ clock-names = "apb_pclk";
+ in-ports {
+ port {
+ tpiu_in_port: endpoint {
+ remote-endpoint = <&funnel4_out_port0>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,corstone1000.yaml b/Documentation/devicetree/bindings/arm/arm,corstone1000.yaml
new file mode 100644
index 000000000000..693f3fe7be60
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,corstone1000.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,corstone1000.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM Corstone1000
+
+maintainers:
+ - Vishnu Banavath <vishnu.banavath@arm.com>
+ - Rui Miguel Silva <rui.silva@linaro.org>
+
+description: |+
+ ARM's Corstone1000 includes the pre-verified Corstone SSE-710 subsystem,
+ which provides a flexible compute architecture combining Cortex-A and
+ Cortex-M processors.
+
+ It supports Cortex-A32, Cortex-A35 and Cortex-A53 processors, and offers
+ two expansion systems for M-class (or other) processors for adding sensors,
+ connectivity, video, audio and machine learning at the edge. System and
+ security IPs are included to build a secure SoC for a range of rich IoT
+ applications, for example gateways, smart cameras and embedded systems.
+
+ An integrated Secure Enclave provides a hardware Root of Trust and supports
+ seamless integration of the optional CryptoCell™-312 cryptographic
+ accelerator.
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: Corstone1000 MPS3, which has one Cortex-A35 CPU core in an
+ FPGA implementation of the Corstone1000 on the MPS3 prototyping board.
+ See ARM document DAI0550.
+ items:
+ - const: arm,corstone1000-mps3
+ - description: Corstone1000 FVP is the Fixed Virtual Platform
+ implementation of this system. See ARM's ecosystem FVPs.
+ items:
+ - const: arm,corstone1000-fvp
+
+additionalProperties: true
+
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,embedded-trace-extension.yaml b/Documentation/devicetree/bindings/arm/arm,embedded-trace-extension.yaml
new file mode 100644
index 000000000000..108460627d9a
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,embedded-trace-extension.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/arm,embedded-trace-extension.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Embedded Trace Extensions
+
+maintainers:
+ - Suzuki K Poulose <suzuki.poulose@arm.com>
+ - Mathieu Poirier <mathieu.poirier@linaro.org>
+
+description: |
+ The Arm Embedded Trace Extension (ETE) is a per-CPU trace component that
+ allows tracing of CPU execution. It overlaps with the CoreSight ETMv4
+ architecture and has extended support for future architecture changes.
+ The trace generated by the ETE can be stored via legacy CoreSight
+ components (e.g., TMC-ETR) or by other means (e.g., using the per-CPU
+ Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
+ legacy CoreSight components, a node must be listed per instance, along
+ with any optional connection graph as per the CoreSight bindings.
+
+properties:
+ $nodename:
+ pattern: "^ete([0-9a-f]+)$"
+ compatible:
+ items:
+ - const: arm,embedded-trace-extension
+
+ cpu:
+ description: |
+ Handle to the cpu this ETE is bound to.
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ power-domains:
+ maxItems: 1
+
+ out-ports:
+ description: |
+ Output connections from the ETE to legacy CoreSight trace bus.
+ $ref: /schemas/graph.yaml#/properties/ports
+ properties:
+ port:
+ description: Output connection from the ETE to legacy CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - cpu
+
+additionalProperties: false
+
+examples:
+
+# An ETE node without legacy CoreSight connections
+ - |
+ ete0 {
+ compatible = "arm,embedded-trace-extension";
+ cpu = <&cpu_0>;
+ };
+# An ETE node with legacy CoreSight connections
+ - |
+ ete1 {
+ compatible = "arm,embedded-trace-extension";
+ cpu = <&cpu_1>;
+
+ out-ports { /* legacy coresight connection */
+ port {
+ ete1_out_port: endpoint {
+ remote-endpoint = <&funnel_in_port0>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/arm,integrator.yaml b/Documentation/devicetree/bindings/arm/arm,integrator.yaml
index 528eee64290a..98ff5698ae1f 100644
--- a/Documentation/devicetree/bindings/arm/arm,integrator.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,integrator.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/arm,integrator.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ARM Integrator Boards Device Tree Bindings
+title: ARM Integrator Boards
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
diff --git a/Documentation/devicetree/bindings/arm/arm,realview.yaml b/Documentation/devicetree/bindings/arm/arm,realview.yaml
index 4f9b21f49e84..8d3ed2e4ed31 100644
--- a/Documentation/devicetree/bindings/arm/arm,realview.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,realview.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/arm,realview.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ARM RealView Boards Device Tree Bindings
+title: ARM RealView Boards
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
diff --git a/Documentation/devicetree/bindings/arm/arm,trace-buffer-extension.yaml b/Documentation/devicetree/bindings/arm/arm,trace-buffer-extension.yaml
new file mode 100644
index 000000000000..b1322658063a
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,trace-buffer-extension.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+# Copyright 2021, Arm Ltd
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/arm,trace-buffer-extension.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ARM Trace Buffer Extensions
+
+maintainers:
+ - Anshuman Khandual <anshuman.khandual@arm.com>
+
+description: |
+ The Arm Trace Buffer Extension (TRBE) is a per-CPU component
+ for storing trace generated on the CPU to memory. It is
+ accessed via CPU system registers. Software can verify
+ whether it is permitted to use the component by checking the
+ TRBIDR register.
+
+properties:
+ $nodename:
+ const: "trbe"
+ compatible:
+ items:
+ - const: arm,trace-buffer-extension
+
+ interrupts:
+ description: |
+ Exactly 1 PPI must be listed. For heterogeneous systems where
+ TRBE is only supported on a subset of the CPUs, please consult
+ the arm,gic-v3 binding for details on describing a PPI partition.
+ maxItems: 1
+
+required:
+ - compatible
+ - interrupts
+
+additionalProperties: false
+
+examples:
+
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ trbe {
+ compatible = "arm,trace-buffer-extension";
+ interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
+ };
+...
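For the heterogeneous case, the interrupts description defers to the arm,gic-v3 binding. A rough sketch of a PPI partition restricting the TRBE interrupt to TRBE-capable CPUs, following the partition pattern shown in that binding; the partition label and CPU phandles are assumptions, so consult the gic-v3 binding for the authoritative form:

    #include <dt-bindings/interrupt-controller/arm-gic.h>

    /* under the GIC node: a partition covering only TRBE-capable CPUs */
    ppi-partitions {
        part0: interrupt-partition-0 {
            affinity = <&cpu0 &cpu1>;
        };
    };

    trbe {
        compatible = "arm,trace-buffer-extension";
        interrupts-extended = <&part0 15 IRQ_TYPE_LEVEL_HIGH>;
    };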
diff --git a/Documentation/devicetree/bindings/arm/arm,versatile-sysreg.yaml b/Documentation/devicetree/bindings/arm/arm,versatile-sysreg.yaml
new file mode 100644
index 000000000000..491eef1e1b10
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/arm,versatile-sysreg.yaml
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/arm,versatile-sysreg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm Versatile system registers
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description:
+ This is a system control register block, providing multiple low-level
+ platform functions like board detection and identification, software
+ interrupt generation, MMC and NOR Flash control, etc.
+
+properties:
+ compatible:
+ items:
+ - const: arm,versatile-sysreg
+ - const: syscon
+ - const: simple-mfd
+
+ reg:
+ maxItems: 1
+
+ panel:
+ type: object
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+...
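This binding ships without an examples section. A minimal conforming node might look as follows; the address and size are assumptions based on the usual Versatile sysreg base, so treat them as placeholders:

    sysreg@10000000 {
        compatible = "arm,versatile-sysreg", "syscon", "simple-mfd";
        reg = <0x10000000 0x3000>;
    };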
diff --git a/Documentation/devicetree/bindings/arm/arm,versatile.yaml b/Documentation/devicetree/bindings/arm/arm,versatile.yaml
index 34b437c72751..13e52ba92060 100644
--- a/Documentation/devicetree/bindings/arm/arm,versatile.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,versatile.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/arm,versatile.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ARM Versatile Boards Device Tree Bindings
+title: ARM Versatile Boards
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
diff --git a/Documentation/devicetree/bindings/arm/arm,vexpress-juno.yaml b/Documentation/devicetree/bindings/arm/arm,vexpress-juno.yaml
index 55ef656d1192..09c319f803ba 100644
--- a/Documentation/devicetree/bindings/arm/arm,vexpress-juno.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,vexpress-juno.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/arm,vexpress-juno.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ARM Versatile Express and Juno Boards Device Tree Bindings
+title: ARM Versatile Express and Juno Boards
maintainers:
- Sudeep Holla <sudeep.holla@arm.com>
@@ -119,22 +119,6 @@ properties:
- const: arm,foundation-aarch64
- const: arm,vexpress
- arm,hbi:
- $ref: '/schemas/types.yaml#/definitions/uint32'
- description: This indicates the ARM HBI (Hardware Board ID), this is
- ARM's unique board model ID, visible on the PCB's silkscreen.
-
- arm,vexpress,site:
- description: As Versatile Express can be configured in number of physically
- different setups, the device tree should describe platform topology.
- For this reason the root node and main motherboard node must define this
- property, describing the physical location of the children nodes.
- 0 means motherboard site, while 1 and 2 are daughterboard sites, and
- 0xf means "sisterboard" which is the site containing the main CPU tile.
- $ref: '/schemas/types.yaml#/definitions/uint32'
- minimum: 0
- maximum: 15
-
arm,vexpress,position:
description: When daughterboards are stacked on one site, their position
in the stack can be described by this attribute.
@@ -154,12 +138,13 @@ patternProperties:
description: Static Memory Bus (SMB) node, if this exists it describes
the connection between the motherboard and any tiles. Sometimes the
compatible is placed directly under this node, sometimes it is placed
- in a subnode named "motherboard". Sometimes the compatible includes
+ in a subnode named "motherboard-bus". Sometimes the compatible includes
"arm,vexpress,v2?-p1" sometimes (on software models) is is just
- "simple-bus". If the compatible is placed in the "motherboard" node,
+ "simple-bus". If the compatible is placed in the "motherboard-bus" node,
it is stricter and always has two compatibles.
type: object
$ref: '/schemas/simple-bus.yaml'
+ unevaluatedProperties: false
properties:
compatible:
@@ -170,7 +155,9 @@ patternProperties:
- arm,vexpress,v2p-p1
- const: simple-bus
- const: simple-bus
- motherboard:
+
+ patternProperties:
+ '^motherboard-bus@':
type: object
description: The motherboard description provides a single "motherboard"
node using 2 address cells corresponding to the Static Memory Bus
@@ -183,6 +170,8 @@ patternProperties:
const: 2
"#size-cells":
const: 1
+ ranges: true
+
compatible:
items:
- enum:
@@ -196,8 +185,28 @@ patternProperties:
- rs1
- rs2
+ arm,hbi:
+ $ref: '/schemas/types.yaml#/definitions/uint32'
+ description: This indicates the ARM HBI (Hardware Board ID), this is
+ ARM's unique board model ID, visible on the PCB's silkscreen.
+
+ arm,vexpress,site:
+ description: As Versatile Express can be configured in number of physically
+ different setups, the device tree should describe platform topology.
+ For this reason the root node and main motherboard node must define this
+ property, describing the physical location of the children nodes.
+ 0 means motherboard site, while 1 and 2 are daughterboard sites, and
+ 0xf means "sisterboard" which is the site containing the main CPU tile.
+ $ref: '/schemas/types.yaml#/definitions/uint32'
+ minimum: 0
+ maximum: 15
+
required:
- compatible
+
+ additionalProperties:
+ type: object
+
required:
- compatible
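The hunk above relocates arm,hbi and arm,vexpress,site from the root level into the motherboard-bus pattern node. A rough sketch of the resulting layout, with placeholder values (the 0x190 HBI and node addresses are shown purely as examples):

    smb@8000000 {
        compatible = "simple-bus";
        #address-cells = <2>;
        #size-cells = <1>;
        /* ranges omitted */

        motherboard-bus@0 {
            compatible = "arm,vexpress,v2m-p1", "simple-bus";
            #address-cells = <2>;
            #size-cells = <1>;
            arm,hbi = <0x190>;       /* placeholder board model ID */
            arm,vexpress,site = <0>; /* motherboard site */
            /* ranges and child devices omitted */
        };
    };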
diff --git a/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt b/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt
deleted file mode 100644
index 6efabba530f1..000000000000
--- a/Documentation/devicetree/bindings/arm/arm-dsu-pmu.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* ARM DynamIQ Shared Unit (DSU) Performance Monitor Unit (PMU)
-
-ARM DyanmIQ Shared Unit (DSU) integrates one or more CPU cores
-with a shared L3 memory system, control logic and external interfaces to
-form a multicore cluster. The PMU enables to gather various statistics on
-the operations of the DSU. The PMU provides independent 32bit counters that
-can count any of the supported events, along with a 64bit cycle counter.
-The PMU is accessed via CPU system registers and has no MMIO component.
-
-** DSU PMU required properties:
-
-- compatible : should be one of :
-
- "arm,dsu-pmu"
-
-- interrupts : Exactly 1 SPI must be listed.
-
-- cpus : List of phandles for the CPUs connected to this DSU instance.
-
-
-** Example:
-
-dsu-pmu-0 {
- compatible = "arm,dsu-pmu";
- interrupts = <GIC_SPI 02 IRQ_TYPE_LEVEL_HIGH>;
- cpus = <&cpu_0>, <&cpu_1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed,sbc.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed,sbc.yaml
new file mode 100644
index 000000000000..c72aab706484
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed,sbc.yaml
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+# Copyright 2021 Joel Stanley, IBM Corp.
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/aspeed/aspeed,sbc.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ASPEED Secure Boot Controller
+
+maintainers:
+ - Joel Stanley <joel@jms.id.au>
+ - Andrew Jeffery <andrew@aj.id.au>
+
+description: |
+ The ASPEED SoCs have a register bank for interacting with the secure boot
+ controller.
+
+properties:
+ compatible:
+ items:
+ - const: aspeed,ast2600-sbc
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ sbc: secure-boot-controller@1e6f2000 {
+ compatible = "aspeed,ast2600-sbc";
+ reg = <0x1e6f2000 0x1000>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
new file mode 100644
index 000000000000..e0eff4c05879
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/aspeed/aspeed.yaml
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/aspeed/aspeed.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Aspeed SoC based boards
+
+maintainers:
+ - Joel Stanley <joel@jms.id.au>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: AST2400 based boards
+ items:
+ - enum:
+ - delta,ahe50dc-bmc
+ - facebook,galaxy100-bmc
+ - facebook,wedge100-bmc
+ - facebook,wedge40-bmc
+ - microsoft,olympus-bmc
+ - quanta,q71l-bmc
+ - tyan,palmetto-bmc
+ - yadro,vesnin-bmc
+ - const: aspeed,ast2400
+
+ - description: AST2500 based boards
+ items:
+ - enum:
+ - amd,daytonax-bmc
+ - amd,ethanolx-bmc
+ - ampere,mtjade-bmc
+ - aspeed,ast2500-evb
+ - asrock,e3c246d4i-bmc
+ - asrock,romed8hm3-bmc
+ - bytedance,g220a-bmc
+ - facebook,cmm-bmc
+ - facebook,minipack-bmc
+ - facebook,tiogapass-bmc
+ - facebook,yamp-bmc
+ - facebook,yosemitev2-bmc
+ - facebook,wedge400-bmc
+ - hxt,stardragon4800-rep2-bmc
+ - ibm,mihawk-bmc
+ - ibm,mowgli-bmc
+ - ibm,romulus-bmc
+ - ibm,swift-bmc
+ - ibm,witherspoon-bmc
+ - ingrasys,zaius-bmc
+ - inspur,fp5280g2-bmc
+ - inspur,nf5280m6-bmc
+ - inspur,on5263m5-bmc
+ - intel,s2600wf-bmc
+ - inventec,lanyang-bmc
+ - lenovo,hr630-bmc
+ - lenovo,hr855xg2-bmc
+ - portwell,neptune-bmc
+ - qcom,centriq2400-rep-bmc
+ - supermicro,x11spi-bmc
+ - tyan,s7106-bmc
+ - tyan,s8036-bmc
+ - yadro,nicole-bmc
+ - yadro,vegman-n110-bmc
+ - yadro,vegman-rx20-bmc
+ - yadro,vegman-sx20-bmc
+ - const: aspeed,ast2500
+
+ - description: AST2600 based boards
+ items:
+ - enum:
+ - ampere,mtmitchell-bmc
+ - aspeed,ast2600-evb
+ - aspeed,ast2600-evb-a1
+ - facebook,bletchley-bmc
+ - facebook,cloudripper-bmc
+ - facebook,elbert-bmc
+ - facebook,fuji-bmc
+ - facebook,greatlakes-bmc
+ - ibm,everest-bmc
+ - ibm,rainier-bmc
+ - ibm,tacoma-bmc
+ - inventec,transformer-bmc
+ - jabil,rbp-bmc
+ - qcom,dc-scm-v1-bmc
+ - quanta,s6q-bmc
+ - ufispace,ncplite-bmc
+ - const: aspeed,ast2600
+
+additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
index fba071b9af1d..dfb8fd089197 100644
--- a/Documentation/devicetree/bindings/arm/atmel-at91.yaml
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
@@ -4,11 +4,12 @@
$id: http://devicetree.org/schemas/arm/atmel-at91.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Atmel AT91 device tree bindings.
+title: Atmel AT91
maintainers:
- Alexandre Belloni <alexandre.belloni@bootlin.com>
- - Ludovic Desroches <ludovic.desroches@microchip.com>
+ - Claudiu Beznea <claudiu.beznea@microchip.com>
+ - Nicolas Ferre <nicolas.ferre@microchip.com>
description: |
Boards with a SoC of the Atmel AT91 or SMART family shall have the following
@@ -90,9 +91,11 @@ properties:
- const: atmel,sama5d2
- const: atmel,sama5
- - description: SAM9X60-EK board
+ - description: Microchip SAM9X60 Evaluation Boards
items:
- - const: microchip,sam9x60ek
+ - enum:
+ - microchip,sam9x60ek
+ - microchip,sam9x60-curiosity
- const: microchip,sam9x60
- const: atmel,at91sam9
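The compatible lists in this schema are ordered most specific first. A board DTS root node chains them accordingly, for example (the model string is illustrative):

    / {
        model = "Microchip SAM9X60 Curiosity";
        compatible = "microchip,sam9x60-curiosity", "microchip,sam9x60",
                     "atmel,at91sam9";
    };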
@@ -126,6 +129,25 @@ properties:
- const: atmel,sama5d3
- const: atmel,sama5
+ - description: Microchip SAMA5D3 Ethernet Development System Board
+ items:
+ - const: microchip,sama5d3-eds
+ - const: atmel,sama5d36
+ - const: atmel,sama5d3
+ - const: atmel,sama5
+
+ - description: CalAmp LMU5000 board
+ items:
+ - const: calamp,lmu5000
+ - const: atmel,at91sam9g20
+ - const: atmel,at91sam9
+
+ - description: Exegin Q5xR5 board
+ items:
+ - const: exegin,q5xr5
+ - const: atmel,at91sam9g20
+ - const: atmel,at91sam9
+
- items:
- enum:
- atmel,sama5d31
@@ -150,6 +172,29 @@ properties:
- const: microchip,sama7g5
- const: microchip,sama7
+ - description: Microchip LAN9662 Evaluation Boards.
+ items:
+ - enum:
+ - microchip,lan9662-pcb8291
+ - microchip,lan9662-pcb8309
+ - const: microchip,lan9662
+ - const: microchip,lan966
+
+ - description: Microchip LAN9668 PCB8290 Evaluation Board.
+ items:
+ - const: microchip,lan9668-pcb8290
+ - const: microchip,lan9668
+ - const: microchip,lan966
+
+ - description: Kontron KSwitch D10 MMT series
+ items:
+ - enum:
+ - kontron,kswitch-d10-mmt-8g
+ - kontron,kswitch-d10-mmt-6g-2gs
+ - const: kontron,s1921
+ - const: microchip,lan9668
+ - const: microchip,lan966
+
- items:
- enum:
- atmel,sams70j19
diff --git a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt
index 16eef600d599..ab1b352344ae 100644
--- a/Documentation/devicetree/bindings/arm/atmel-sysregs.txt
+++ b/Documentation/devicetree/bindings/arm/atmel-sysregs.txt
@@ -25,21 +25,6 @@ System Timer (ST) required properties:
Its subnodes can be:
- watchdog: compatible should be "atmel,at91rm9200-wdt"
-RSTC Reset Controller required properties:
-- compatible: Should be "atmel,<chip>-rstc".
- <chip> can be "at91sam9260", "at91sam9g45", "sama5d3" or "samx7"
- it also can be "microchip,sam9x60-rstc"
-- reg: Should contain registers location and length
-- clocks: phandle to input clock.
-
-Example:
-
- rstc@fffffd00 {
- compatible = "atmel,at91sam9260-rstc";
- reg = <0xfffffd00 0x10>;
- clocks = <&clk32k>;
- };
-
RAMC SDRAM/DDR Controller required properties:
- compatible: Should be "atmel,at91rm9200-sdramc", "syscon"
"atmel,at91sam9260-sdramc",
diff --git a/Documentation/devicetree/bindings/arm/axxia.yaml b/Documentation/devicetree/bindings/arm/axxia.yaml
index e0d2bb71cf50..d60907e43efc 100644
--- a/Documentation/devicetree/bindings/arm/axxia.yaml
+++ b/Documentation/devicetree/bindings/arm/axxia.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/axxia.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Axxia AXM55xx device tree bindings
+title: Axxia AXM55xx
maintainers:
- Anders Berg <anders.berg@lsi.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/bcm2835.yaml b/Documentation/devicetree/bindings/arm/bcm/bcm2835.yaml
index 230b80d9d6cf..162a39dab218 100644
--- a/Documentation/devicetree/bindings/arm/bcm/bcm2835.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/bcm2835.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/bcm2835.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM2711/BCM2835 Platforms Device Tree Bindings
+title: Broadcom BCM2711/BCM2835 Platforms
maintainers:
- Eric Anholt <eric@anholt.net>
@@ -19,6 +19,7 @@ properties:
items:
- enum:
- raspberrypi,400
+ - raspberrypi,4-compute-module
- raspberrypi,4-model-b
- const: brcm,bcm2711
@@ -50,6 +51,7 @@ properties:
- raspberrypi,3-model-b-plus
- raspberrypi,3-compute-module
- raspberrypi,3-compute-module-lite
+ - raspberrypi,model-zero-2-w
- const: brcm,bcm2837
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm11351.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm11351.yaml
index c60324357435..f2bcac0096b7 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm11351.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm11351.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,bcm11351.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM11351 device tree bindings
+title: Broadcom BCM11351
maintainers:
- Florian Fainelli <f.fainelli@gmail.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm21664.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm21664.yaml
index b3020757380f..cf4e254e32f1 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm21664.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm21664.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,bcm21664.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM21664 device tree bindings
+title: Broadcom BCM21664
maintainers:
- Florian Fainelli <f.fainelli@gmail.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm23550.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm23550.yaml
index 37f3a6fcde76..eafec29ba7ab 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm23550.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm23550.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,bcm23550.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM23550 device tree bindings
+title: Broadcom BCM23550
maintainers:
- Florian Fainelli <f.fainelli@gmail.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.yaml
index 434d3c6db61e..454b0e93245d 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4708.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,bcm4708.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM4708 device tree bindings
+title: Broadcom BCM4708
description:
Broadcom BCM4708/47081/4709/47094/53012 Wi-Fi/network SoCs based
@@ -64,7 +64,9 @@ properties:
- description: BCM47094 based boards
items:
- enum:
+ - asus,rt-ac88u
- dlink,dir-885l
+ - dlink,dir-890l
- linksys,panamera
- luxul,abr-4500-v1
- luxul,xap-1610-v1
@@ -83,9 +85,22 @@ properties:
- brcm,bcm953012er
- brcm,bcm953012hr
- brcm,bcm953012k
+ - const: brcm,bcm53012
+ - const: brcm,bcm4708
+
+ - description: BCM53015 based boards
+ items:
+ - enum:
+ - meraki,mr26
+ - const: brcm,bcm53015
+ - const: brcm,bcm4708
+
+ - description: BCM53016 based boards
+ items:
+ - enum:
+ - dlink,dwl-8610ap
- meraki,mr32
- - const: brcm,brcm53012
- - const: brcm,brcm53016
+ - const: brcm,bcm53016
- const: brcm,bcm4708
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4908.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4908.yaml
deleted file mode 100644
index 2cd4e4a32278..000000000000
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm4908.yaml
+++ /dev/null
@@ -1,41 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/bcm/brcm,bcm4908.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Broadcom BCM4908 device tree bindings
-
-description:
- Broadcom BCM4906 / BCM4908 / BCM49408 Wi-Fi/network SoCs with Brahma CPUs.
-
-maintainers:
- - Rafał Miłecki <rafal@milecki.pl>
-
-properties:
- $nodename:
- const: '/'
- compatible:
- oneOf:
- - description: BCM4906 based boards
- items:
- - enum:
- - netgear,r8000p
- - tplink,archer-c2300-v1
- - const: brcm,bcm4906
- - const: brcm,bcm4908
-
- - description: BCM4908 based boards
- items:
- - enum:
- - asus,gt-ac5300
- - const: brcm,bcm4908
-
- - description: BCM49408 based boards
- items:
- - const: brcm,bcm49408
- - const: brcm,bcm4908
-
-additionalProperties: true
-
-...
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm63138.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm63138.txt
index 8c7a4908a849..a8866c6e9d46 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm63138.txt
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm63138.txt
@@ -30,7 +30,7 @@ Example:
cpus {
cpu@0 {
- compatible = "arm,cotex-a9";
+ compatible = "arm,cortex-a9";
reg = <0>;
...
enable-method = "brcm,bcm63138";
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml
new file mode 100644
index 000000000000..07892cbdd23c
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/bcm/brcm,bcmbca.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom Broadband SoC
+
+description:
+ Broadcom Broadband SoCs include a family of high-performance DSL/PON/Wireless
+ chips that can be used as home gateways, routers and WLAN APs for residential,
+ enterprise and carrier applications.
+
+maintainers:
+ - William Zhang <william.zhang@broadcom.com>
+ - Anand Gore <anand.gore@broadcom.com>
+ - Kursad Oney <kursad.oney@broadcom.com>
+ - Rafał Miłecki <rafal@milecki.pl>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: BCM47622 based boards
+ items:
+ - enum:
+ - brcm,bcm947622
+ - const: brcm,bcm47622
+ - const: brcm,bcmbca
+
+ - description: BCM4906 based boards
+ items:
+ - enum:
+ - netgear,r8000p
+ - tplink,archer-c2300-v1
+ - const: brcm,bcm4906
+ - const: brcm,bcm4908
+ - const: brcm,bcmbca
+
+ - description: BCM4908 based boards
+ items:
+ - enum:
+ - asus,gt-ac5300
+ - brcm,bcm94908
+ - netgear,raxe500
+ - const: brcm,bcm4908
+ - const: brcm,bcmbca
+
+ - description: BCM49408 based boards
+ items:
+ - const: brcm,bcm49408
+ - const: brcm,bcm4908
+ - const: brcm,bcmbca
+
+ - description: BCM4912 based boards
+ items:
+ - enum:
+ - asus,gt-ax6000
+ - brcm,bcm94912
+ - const: brcm,bcm4912
+ - const: brcm,bcmbca
+
+ - description: BCM63138 based boards
+ items:
+ - enum:
+ - brcm,bcm963138
+ - brcm,BCM963138DVT
+ - const: brcm,bcm63138
+ - const: brcm,bcmbca
+
+ - description: BCM63146 based boards
+ items:
+ - enum:
+ - brcm,bcm963146
+ - const: brcm,bcm63146
+ - const: brcm,bcmbca
+
+ - description: BCM63148 based boards
+ items:
+ - enum:
+ - brcm,bcm963148
+ - const: brcm,bcm63148
+ - const: brcm,bcmbca
+
+ - description: BCM63158 based boards
+ items:
+ - enum:
+ - brcm,bcm963158
+ - const: brcm,bcm63158
+ - const: brcm,bcmbca
+
+ - description: BCM63178 based boards
+ items:
+ - enum:
+ - brcm,bcm963178
+ - const: brcm,bcm63178
+ - const: brcm,bcmbca
+
+ - description: BCM6756 based boards
+ items:
+ - enum:
+ - brcm,bcm96756
+ - const: brcm,bcm6756
+ - const: brcm,bcmbca
+
+ - description: BCM6813 based boards
+ items:
+ - enum:
+ - brcm,bcm96813
+ - const: brcm,bcm6813
+ - const: brcm,bcmbca
+
+ - description: BCM6846 based boards
+ items:
+ - enum:
+ - brcm,bcm96846
+ - const: brcm,bcm6846
+ - const: brcm,bcmbca
+
+ - description: BCM6855 based boards
+ items:
+ - enum:
+ - brcm,bcm96855
+ - const: brcm,bcm6855
+ - const: brcm,bcmbca
+
+ - description: BCM6856 based boards
+ items:
+ - enum:
+ - brcm,bcm96856
+ - const: brcm,bcm6856
+ - const: brcm,bcmbca
+
+ - description: BCM6858 based boards
+ items:
+ - enum:
+ - brcm,bcm96858
+ - const: brcm,bcm6858
+ - const: brcm,bcmbca
+
+ - description: BCM6878 based boards
+ items:
+ - enum:
+ - brcm,bcm96878
+ - const: brcm,bcm6878
+ - const: brcm,bcmbca
+
+additionalProperties: true
+
+...
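For illustration, boards based on these SoCs chain their compatibles from board
to SoC to the "brcm,bcmbca" family fallback, as the schema above requires. A
minimal sketch of a root node (the board string is taken from the BCM47622 enum
above; the model string is hypothetical):

	/ {
		compatible = "brcm,bcm947622", "brcm,bcm47622", "brcm,bcmbca";
		model = "Broadcom BCM947622 reference board";
		...
	};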
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,brcmstb.txt b/Documentation/devicetree/bindings/arm/bcm/brcm,brcmstb.txt
index 104cc9b41df4..071421dbc4d0 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,brcmstb.txt
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,brcmstb.txt
@@ -187,15 +187,8 @@ Required properties:
Sequencer DRAM parameters and control registers. Used for Self-Refresh
Power-Down (SRPD), among other things.
-Required properties:
-- compatible : should contain one of these
- "brcm,brcmstb-memc-ddr-rev-b.2.1"
- "brcm,brcmstb-memc-ddr-rev-b.2.2"
- "brcm,brcmstb-memc-ddr-rev-b.2.3"
- "brcm,brcmstb-memc-ddr-rev-b.3.0"
- "brcm,brcmstb-memc-ddr-rev-b.3.1"
- "brcm,brcmstb-memc-ddr"
-- reg : the MEMC DDR register range
+See Documentation/devicetree/bindings/memory-controllers/brcm,brcmstb-memc-ddr.yaml for a
+full list of supported compatible strings and properties.
Example:
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,cygnus.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,cygnus.yaml
index 432ccf990f9e..a0a3f32db54e 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,cygnus.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,cygnus.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,cygnus.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom Cygnus device tree bindings
+title: Broadcom Cygnus
maintainers:
- Ray Jui <rjui@broadcom.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,hr2.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,hr2.yaml
index 294948399f82..cc6add0e933a 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,hr2.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,hr2.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,hr2.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom Hurricane 2 device tree bindings
+title: Broadcom Hurricane 2
description:
Broadcom Hurricane 2 family of SoCs are used for switching control. These SoCs
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,ns2.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,ns2.yaml
index c4847abbecd8..6696598eca0e 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,ns2.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,ns2.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,ns2.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom North Star 2 (NS2) device tree bindings
+title: Broadcom North Star 2 (NS2)
maintainers:
- Ray Jui <rjui@broadcom.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,nsp.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,nsp.yaml
index 476bc23a7f75..a43b2d4d936b 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,nsp.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,nsp.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,nsp.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom Northstar Plus device tree bindings
+title: Broadcom Northstar Plus
description:
Broadcom Northstar Plus family of SoCs are used for switching control
@@ -22,16 +22,61 @@ properties:
$nodename:
const: '/'
compatible:
- items:
- - enum:
- - brcm,bcm58522
- - brcm,bcm58525
- - brcm,bcm58535
- - brcm,bcm58622
- - brcm,bcm58623
- - brcm,bcm58625
- - brcm,bcm88312
- - const: brcm,nsp
+ oneOf:
+ - description: BCM58522 based boards
+ items:
+ - enum:
+ - brcm,bcm958522er
+ - const: brcm,bcm58522
+ - const: brcm,nsp
+
+ - description: BCM58525 based boards
+ items:
+ - enum:
+ - brcm,bcm958525er
+ - brcm,bcm958525xmc
+ - const: brcm,bcm58525
+ - const: brcm,nsp
+
+ - description: BCM58535 based boards
+ items:
+ - const: brcm,bcm58535
+ - const: brcm,nsp
+
+ - description: BCM58622 based boards
+ items:
+ - enum:
+ - brcm,bcm958622hr
+ - const: brcm,bcm58622
+ - const: brcm,nsp
+
+ - description: BCM58623 based boards
+ items:
+ - enum:
+ - brcm,bcm958623hr
+ - const: brcm,bcm58623
+ - const: brcm,nsp
+
+ - description: BCM58625 based boards
+ items:
+ - enum:
+ - brcm,bcm958625hr
+ - brcm,bcm958625k
+ - meraki,mx64
+ - meraki,mx64-a0
+ - meraki,mx64w
+ - meraki,mx64w-a0
+ - meraki,mx65
+ - meraki,mx65w
+ - const: brcm,bcm58625
+ - const: brcm,nsp
+
+ - description: BCM88312 based boards
+ items:
+ - enum:
+ - brcm,bcm988312hr
+ - const: brcm,bcm88312
+ - const: brcm,nsp
additionalProperties: true
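Following the same board-to-SoC-to-family pattern, a hedged sketch for one of
the BCM58625 entries above (node contents elided):

	/ {
		compatible = "meraki,mx64", "brcm,bcm58625", "brcm,nsp";
		...
	};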
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,stingray.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,stingray.yaml
index c638e04ebae0..c6ccb78aab0a 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,stingray.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,stingray.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,stingray.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom Stingray device tree bindings
+title: Broadcom Stingray
maintainers:
- Ray Jui <rjui@broadcom.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,vulcan-soc.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,vulcan-soc.yaml
index 4eba182abd53..3f441352fbf0 100644
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,vulcan-soc.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,vulcan-soc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bcm/brcm,vulcan-soc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom Vulcan device tree bindings
+title: Broadcom Vulcan
maintainers:
- Robert Richter <rrichter@marvell.com>
diff --git a/Documentation/devicetree/bindings/arm/bcm/raspberrypi,bcm2835-firmware.yaml b/Documentation/devicetree/bindings/arm/bcm/raspberrypi,bcm2835-firmware.yaml
index b369b374fc4a..39e3c248f5b7 100644
--- a/Documentation/devicetree/bindings/arm/bcm/raspberrypi,bcm2835-firmware.yaml
+++ b/Documentation/devicetree/bindings/arm/bcm/raspberrypi,bcm2835-firmware.yaml
@@ -30,6 +30,7 @@ properties:
clocks:
type: object
+ additionalProperties: false
properties:
compatible:
@@ -47,6 +48,7 @@ properties:
reset:
type: object
+ additionalProperties: false
properties:
compatible:
@@ -63,6 +65,7 @@ properties:
pwm:
type: object
+ additionalProperties: false
properties:
compatible:
@@ -76,8 +79,6 @@ properties:
- compatible
- "#pwm-cells"
- additionalProperties: false
-
required:
- compatible
- mboxes
diff --git a/Documentation/devicetree/bindings/arm/bitmain.yaml b/Documentation/devicetree/bindings/arm/bitmain.yaml
index 90ba02be48ce..55a5a570b5bc 100644
--- a/Documentation/devicetree/bindings/arm/bitmain.yaml
+++ b/Documentation/devicetree/bindings/arm/bitmain.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/bitmain.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Bitmain platform device tree bindings
+title: Bitmain platform
maintainers:
- Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
diff --git a/Documentation/devicetree/bindings/arm/calxeda.yaml b/Documentation/devicetree/bindings/arm/calxeda.yaml
index 46f78addebb0..3e9f5e1d862e 100644
--- a/Documentation/devicetree/bindings/arm/calxeda.yaml
+++ b/Documentation/devicetree/bindings/arm/calxeda.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/calxeda.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Calxeda Platforms Device Tree Bindings
+title: Calxeda Platforms
maintainers:
- Rob Herring <robh@kernel.org>
diff --git a/Documentation/devicetree/bindings/arm/cci-control-port.yaml b/Documentation/devicetree/bindings/arm/cci-control-port.yaml
new file mode 100644
index 000000000000..c29d250a6d77
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/cci-control-port.yaml
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/cci-control-port.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: CCI Interconnect Bus Masters
+
+maintainers:
+ - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+
+description: |
+ Masters in the device tree connected to a CCI port (inclusive of CPUs
+ and their cpu nodes).
+
+select: true
+
+properties:
+ cci-control-port:
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+additionalProperties: true
+
+examples:
+ - |
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cpu@0 {
+ compatible = "arm,cortex-a15";
+ device_type = "cpu";
+ cci-control-port = <&cci_control1>;
+ reg = <0>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/cci.txt b/Documentation/devicetree/bindings/arm/cci.txt
deleted file mode 100644
index 9600761f2d5b..000000000000
--- a/Documentation/devicetree/bindings/arm/cci.txt
+++ /dev/null
@@ -1,224 +0,0 @@
-=======================================================
-ARM CCI cache coherent interconnect binding description
-=======================================================
-
-ARM multi-cluster systems maintain intra-cluster coherency through a
-cache coherent interconnect (CCI) that is capable of monitoring bus
-transactions and manage coherency, TLB invalidations and memory barriers.
-
-It allows snooping and distributed virtual memory message broadcast across
-clusters, through memory mapped interface, with a global control register
-space and multiple sets of interface control registers, one per slave
-interface.
-
-* CCI interconnect node
-
- Description: Describes a CCI cache coherent Interconnect component
-
- Node name must be "cci".
- Node's parent must be the root node /, and the address space visible
- through the CCI interconnect is the same as the one seen from the
- root node (ie from CPUs perspective as per DT standard).
- Every CCI node has to define the following properties:
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: must contain one of the following:
- "arm,cci-400"
- "arm,cci-500"
- "arm,cci-550"
-
- - reg
- Usage: required
- Value type: Integer cells. A register entry, expressed as a pair
- of cells, containing base and size.
- Definition: A standard property. Specifies base physical
- address of CCI control registers common to all
- interfaces.
-
- - ranges:
- Usage: required
- Value type: Integer cells. An array of range entries, expressed
- as a tuple of cells, containing child address,
- parent address and the size of the region in the
- child address space.
- Definition: A standard property. Follow rules in the Devicetree
- Specification for hierarchical bus addressing. CCI
- interfaces addresses refer to the parent node
- addressing scheme to declare their register bases.
-
- CCI interconnect node can define the following child nodes:
-
- - CCI control interface nodes
-
- Node name must be "slave-if".
- Parent node must be CCI interconnect node.
-
- A CCI control interface node must contain the following
- properties:
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: must be set to
- "arm,cci-400-ctrl-if"
-
- - interface-type:
- Usage: required
- Value type: <string>
- Definition: must be set to one of {"ace", "ace-lite"}
- depending on the interface type the node
- represents.
-
- - reg:
- Usage: required
- Value type: Integer cells. A register entry, expressed
- as a pair of cells, containing base and
- size.
- Definition: the base address and size of the
- corresponding interface programming
- registers.
-
- - CCI PMU node
-
- Parent node must be CCI interconnect node.
-
- A CCI pmu node must contain the following properties:
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must contain one of:
- "arm,cci-400-pmu,r0"
- "arm,cci-400-pmu,r1"
- "arm,cci-400-pmu" - DEPRECATED, permitted only where OS has
- secure access to CCI registers
- "arm,cci-500-pmu,r0"
- "arm,cci-550-pmu,r0"
- - reg:
- Usage: required
- Value type: Integer cells. A register entry, expressed
- as a pair of cells, containing base and
- size.
- Definition: the base address and size of the
- corresponding interface programming
- registers.
-
- - interrupts:
- Usage: required
- Value type: Integer cells. Array of interrupt specifier
- entries, as defined in
- ../interrupt-controller/interrupts.txt.
- Definition: list of counter overflow interrupts, one per
- counter. The interrupts must be specified
- starting with the cycle counter overflow
- interrupt, followed by counter0 overflow
- interrupt, counter1 overflow interrupt,...
- ,counterN overflow interrupt.
-
- The CCI PMU has an interrupt signal for each
- counter. The number of interrupts must be
- equal to the number of counters.
-
-* CCI interconnect bus masters
-
- Description: masters in the device tree connected to a CCI port
- (inclusive of CPUs and their cpu nodes).
-
- A CCI interconnect bus master node must contain the following
- properties:
-
- - cci-control-port:
- Usage: required
- Value type: <phandle>
- Definition: a phandle containing the CCI control interface node
- the master is connected to.
-
-Example:
-
- cpus {
- #size-cells = <0>;
- #address-cells = <1>;
-
- CPU0: cpu@0 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- cci-control-port = <&cci_control1>;
- reg = <0x0>;
- };
-
- CPU1: cpu@1 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- cci-control-port = <&cci_control1>;
- reg = <0x1>;
- };
-
- CPU2: cpu@100 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- cci-control-port = <&cci_control2>;
- reg = <0x100>;
- };
-
- CPU3: cpu@101 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- cci-control-port = <&cci_control2>;
- reg = <0x101>;
- };
-
- };
-
- dma0: dma@3000000 {
- compatible = "arm,pl330", "arm,primecell";
- cci-control-port = <&cci_control0>;
- reg = <0x0 0x3000000 0x0 0x1000>;
- interrupts = <10>;
- #dma-cells = <1>;
- #dma-channels = <8>;
- #dma-requests = <32>;
- };
-
- cci@2c090000 {
- compatible = "arm,cci-400";
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x0 0x2c090000 0 0x1000>;
- ranges = <0x0 0x0 0x2c090000 0x10000>;
-
- cci_control0: slave-if@1000 {
- compatible = "arm,cci-400-ctrl-if";
- interface-type = "ace-lite";
- reg = <0x1000 0x1000>;
- };
-
- cci_control1: slave-if@4000 {
- compatible = "arm,cci-400-ctrl-if";
- interface-type = "ace";
- reg = <0x4000 0x1000>;
- };
-
- cci_control2: slave-if@5000 {
- compatible = "arm,cci-400-ctrl-if";
- interface-type = "ace";
- reg = <0x5000 0x1000>;
- };
-
- pmu@9000 {
- compatible = "arm,cci-400-pmu";
- reg = <0x9000 0x5000>;
- interrupts = <0 101 4>,
- <0 102 4>,
- <0 103 4>,
- <0 104 4>,
- <0 105 4>;
- };
- };
-
-This CCI node corresponds to a CCI component whose control registers sits
-at address 0x000000002c090000.
-CCI slave interface @0x000000002c091000 is connected to dma controller dma0.
-CCI slave interface @0x000000002c094000 is connected to CPUs {CPU0, CPU1};
-CCI slave interface @0x000000002c095000 is connected to CPUs {CPU2, CPU3};
diff --git a/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt b/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
deleted file mode 100644
index f1de3247c1b7..000000000000
--- a/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-* CoreSight CPU Debug Component:
-
-CoreSight CPU debug component are compliant with the ARMv8 architecture
-reference manual (ARM DDI 0487A.k) Chapter 'Part H: External debug'. The
-external debug module is mainly used for two modes: self-hosted debug and
-external debug, and it can be accessed from mmio region from Coresight
-and eventually the debug module connects with CPU for debugging. And the
-debug module provides sample-based profiling extension, which can be used
-to sample CPU program counter, secure state and exception level, etc;
-usually every CPU has one dedicated debug module to be connected.
-
-Required properties:
-
-- compatible : should be "arm,coresight-cpu-debug"; supplemented with
- "arm,primecell" since this driver is using the AMBA bus
- interface.
-
-- reg : physical base address and length of the register set.
-
-- clocks : the clock associated to this component.
-
-- clock-names : the name of the clock referenced by the code. Since we are
- using the AMBA framework, the name of the clock providing
- the interconnect should be "apb_pclk" and the clock is
- mandatory. The interface between the debug logic and the
- processor core is clocked by the internal CPU clock, so it
- is enabled with CPU clock by default.
-
-- cpu : the CPU phandle the debug module is affined to. Do not assume it
- to default to CPU0 if omitted.
-
-Optional properties:
-
-- power-domains: a phandle to the debug power domain. We use "power-domains"
- binding to turn on the debug logic if it has own dedicated
- power domain and if necessary to use "cpuidle.off=1" or
- "nohlt" in the kernel command line or sysfs node to
- constrain idle states to ensure registers in the CPU power
- domain are accessible.
-
-Example:
-
- debug@f6590000 {
- compatible = "arm,coresight-cpu-debug","arm,primecell";
- reg = <0 0xf6590000 0 0x1000>;
- clocks = <&sys_ctrl HI6220_DAPB_CLK>;
- clock-names = "apb_pclk";
- cpu = <&cpu0>;
- };
diff --git a/Documentation/devicetree/bindings/arm/coresight-cti.yaml b/Documentation/devicetree/bindings/arm/coresight-cti.yaml
deleted file mode 100644
index 21e3515491f4..000000000000
--- a/Documentation/devicetree/bindings/arm/coresight-cti.yaml
+++ /dev/null
@@ -1,332 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
-# Copyright 2019 Linaro Ltd.
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/coresight-cti.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: ARM Coresight Cross Trigger Interface (CTI) device.
-
-description: |
- The CoreSight Embedded Cross Trigger (ECT) consists of CTI devices connected
- to one or more CoreSight components and/or a CPU, with CTIs interconnected in
- a star topology via the Cross Trigger Matrix (CTM), which is not programmable.
- The ECT components are not part of the trace generation data path and are thus
- not part of the CoreSight graph described in the general CoreSight bindings
- file coresight.txt.
-
- The CTI component properties define the connections between the individual
- CTI and the components it is directly connected to, consisting of input and
- output hardware trigger signals. CTIs can have a maximum number of input and
- output hardware trigger signals (8 each for v1 CTI, 32 each for v2 CTI). The
- number is defined at design time, the maximum of each defined in the DEVID
- register.
-
- CTIs are interconnected in a star topology via the CTM, using a number of
- programmable channels, usually 4, but again implementation defined and
- described in the DEVID register. The star topology is not required to be
- described in the bindings as the actual connections are software
- programmable.
-
- In general the connections between CTI and components via the trigger signals
- are implementation defined, except when the CTI is connected to an ARM v8
- architecture core and optional ETM.
-
- In this case the ARM v8 architecture defines the required signal connections
- between CTI and the CPU core and ETM if present. In the case of a v8
- architecturally connected CTI an additional compatible string is used to
- indicate this feature (arm,coresight-cti-v8-arch).
-
- When CTI trigger connection information is unavailable then a minimal driver
- binding can be declared with no explicit trigger signals. This will result
- the driver detecting the maximum available triggers and channels from the
- DEVID register and make them all available for use as a single default
- connection. Any user / client application will require additional information
- on the connections between the CTI and other components for correct operation.
- This information might be found by enabling the Integration Test registers in
- the driver (set CONFIG_CORESIGHT_CTI_INTEGRATION_TEST in Kernel
- configuration). These registers may be used to explore the trigger connections
- between CTI and other CoreSight components.
-
- Certain triggers between CoreSight devices and the CTI have specific types
- and usages. These can be defined along with the signal indexes with the
- constants defined in <dt-bindings/arm/coresight-cti-dt.h>
-
- For example a CTI connected to a core will usually have a DBGREQ signal. This
- is defined in the binding as type PE_EDBGREQ. These types will appear in an
- optional array alongside the signal indexes. Omitting types will default all
- signals to GEN_IO.
-
- Note that some hardware trigger signals can be connected to non-CoreSight
- components (e.g. UART etc) depending on hardware implementation.
-
-maintainers:
- - Mike Leach <mike.leach@linaro.org>
-
-allOf:
- - $ref: /schemas/arm/primecell.yaml#
-
-# Need a custom select here or 'arm,primecell' will match on lots of nodes
-select:
- properties:
- compatible:
- contains:
- enum:
- - arm,coresight-cti
- required:
- - compatible
-
-properties:
- $nodename:
- pattern: "^cti(@[0-9a-f]+)$"
- compatible:
- oneOf:
- - items:
- - const: arm,coresight-cti
- - const: arm,primecell
- - items:
- - const: arm,coresight-cti-v8-arch
- - const: arm,coresight-cti
- - const: arm,primecell
-
- reg:
- maxItems: 1
-
- cpu:
- $ref: /schemas/types.yaml#/definitions/phandle
- description:
- Handle to cpu this device is associated with. This must appear in the
- base cti node if compatible string arm,coresight-cti-v8-arch is used,
- or may appear in a trig-conns child node when appropriate.
-
- arm,cti-ctm-id:
- $ref: /schemas/types.yaml#/definitions/uint32
- description:
- Defines the CTM this CTI is connected to, in large systems with multiple
- separate CTI/CTM nets. Typically multi-socket systems where the CTM is
- propagated between sockets.
-
- arm,cs-dev-assoc:
- $ref: /schemas/types.yaml#/definitions/phandle
- description:
- defines a phandle reference to an associated CoreSight trace device.
- When the associated trace device is enabled, then the respective CTI
- will be enabled. Use in a trig-conns node, or in CTI base node when
- compatible string arm,coresight-cti-v8-arch used. If the associated
- device has not been registered then the node name will be stored as
- the connection name for later resolution. If the associated device is
- not a CoreSight device or not registered then the node name will remain
- the connection name and automatic enabling will not occur.
-
- # size cells and address cells required if trig-conns node present.
- "#size-cells":
- const: 0
-
- "#address-cells":
- const: 1
-
-patternProperties:
- '^trig-conns@([0-9]+)$':
- type: object
- description:
- A trigger connections child node which describes the trigger signals
- between this CTI and another hardware device. This device may be a CPU,
- CoreSight device, any other hardware device or simple external IO lines.
- The connection may have both input and output triggers, or only one or the
- other.
-
- properties:
- reg:
- maxItems: 1
-
- arm,trig-in-sigs:
- $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 1
- maxItems: 32
- description:
- List of CTI trigger in signal numbers in use by a trig-conns node.
-
- arm,trig-in-types:
- $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 1
- maxItems: 32
- description:
- List of constants representing the types for the CTI trigger in
- signals. Types in this array match to the corresponding signal in the
- arm,trig-in-sigs array. If the -types array is smaller, or omitted
- completely, then the types will default to GEN_IO.
-
- arm,trig-out-sigs:
- $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 1
- maxItems: 32
- description:
- List of CTI trigger out signal numbers in use by a trig-conns node.
-
- arm,trig-out-types:
- $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 1
- maxItems: 32
- description:
- List of constants representing the types for the CTI trigger out
- signals. Types in this array match to the corresponding signal
- in the arm,trig-out-sigs array. If the "-types" array is smaller,
- or omitted completely, then the types will default to GEN_IO.
-
- arm,trig-filters:
- $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 1
- maxItems: 32
- description:
- List of CTI trigger out signals that will be blocked from becoming
- active, unless filtering is disabled on the driver.
-
- arm,trig-conn-name:
- $ref: /schemas/types.yaml#/definitions/string
- description:
- Defines a connection name that will be displayed, if the cpu or
- arm,cs-dev-assoc properties are not being used in this connection.
- Principle use for CTI that are connected to non-CoreSight devices, or
- external IO.
-
- anyOf:
- - required:
- - arm,trig-in-sigs
- - required:
- - arm,trig-out-sigs
- oneOf:
- - required:
- - arm,trig-conn-name
- - required:
- - cpu
- - required:
- - arm,cs-dev-assoc
- required:
- - reg
-
-required:
- - compatible
- - reg
- - clocks
- - clock-names
-
-if:
- properties:
- compatible:
- contains:
- const: arm,coresight-cti-v8-arch
-
-then:
- required:
- - cpu
-
-unevaluatedProperties: false
-
-examples:
- # minimum CTI definition. DEVID register used to set number of triggers.
- - |
- cti@20020000 {
- compatible = "arm,coresight-cti", "arm,primecell";
- reg = <0x20020000 0x1000>;
-
- clocks = <&soc_smc50mhz>;
- clock-names = "apb_pclk";
- };
- # v8 architecturally defined CTI - CPU + ETM connections generated by the
- # driver according to the v8 architecture specification.
- - |
- cti@859000 {
- compatible = "arm,coresight-cti-v8-arch", "arm,coresight-cti",
- "arm,primecell";
- reg = <0x859000 0x1000>;
-
- clocks = <&soc_smc50mhz>;
- clock-names = "apb_pclk";
-
- cpu = <&CPU1>;
- arm,cs-dev-assoc = <&etm1>;
- };
- # Implementation defined CTI - CPU + ETM connections explicitly defined..
- # Shows use of type constants from dt-bindings/arm/coresight-cti-dt.h
- # #size-cells and #address-cells are required if trig-conns@ nodes present.
- - |
- #include <dt-bindings/arm/coresight-cti-dt.h>
-
- cti@858000 {
- compatible = "arm,coresight-cti", "arm,primecell";
- reg = <0x858000 0x1000>;
-
- clocks = <&soc_smc50mhz>;
- clock-names = "apb_pclk";
-
- arm,cti-ctm-id = <1>;
-
- #address-cells = <1>;
- #size-cells = <0>;
-
- trig-conns@0 {
- reg = <0>;
- arm,trig-in-sigs = <4 5 6 7>;
- arm,trig-in-types = <ETM_EXTOUT
- ETM_EXTOUT
- ETM_EXTOUT
- ETM_EXTOUT>;
- arm,trig-out-sigs = <4 5 6 7>;
- arm,trig-out-types = <ETM_EXTIN
- ETM_EXTIN
- ETM_EXTIN
- ETM_EXTIN>;
- arm,cs-dev-assoc = <&etm0>;
- };
-
- trig-conns@1 {
- reg = <1>;
- cpu = <&CPU0>;
- arm,trig-in-sigs = <0 1>;
- arm,trig-in-types = <PE_DBGTRIGGER
- PE_PMUIRQ>;
- arm,trig-out-sigs=<0 1 2 >;
- arm,trig-out-types = <PE_EDBGREQ
- PE_DBGRESTART
- PE_CTIIRQ>;
-
- arm,trig-filters = <0>;
- };
- };
- # Implementation defined CTI - non CoreSight component connections.
- - |
- cti@20110000 {
- compatible = "arm,coresight-cti", "arm,primecell";
- reg = <0x20110000 0x1000>;
-
- clocks = <&soc_smc50mhz>;
- clock-names = "apb_pclk";
-
- #address-cells = <1>;
- #size-cells = <0>;
-
- trig-conns@0 {
- reg = <0>;
- arm,trig-in-sigs=<0>;
- arm,trig-in-types=<GEN_INTREQ>;
- arm,trig-out-sigs=<0>;
- arm,trig-out-types=<GEN_HALTREQ>;
- arm,trig-conn-name = "sys_profiler";
- };
-
- trig-conns@1 {
- reg = <1>;
- arm,trig-out-sigs=<2 3>;
- arm,trig-out-types=<GEN_HALTREQ GEN_RESTARTREQ>;
- arm,trig-conn-name = "watchdog";
- };
-
- trig-conns@2 {
- reg = <2>;
- arm,trig-in-sigs=<1 6>;
- arm,trig-in-types=<GEN_HALTREQ GEN_RESTARTREQ>;
- arm,trig-conn-name = "g_counter";
- };
- };
-
-...
diff --git a/Documentation/devicetree/bindings/arm/coresight.txt b/Documentation/devicetree/bindings/arm/coresight.txt
deleted file mode 100644
index 7f9c1ca87487..000000000000
--- a/Documentation/devicetree/bindings/arm/coresight.txt
+++ /dev/null
@@ -1,397 +0,0 @@
-* CoreSight Components:
-
-CoreSight components are compliant with the ARM CoreSight architecture
-specification and can be connected in various topologies to suit a particular
-SoCs tracing needs. These trace components can generally be classified as
-sinks, links and sources. Trace data produced by one or more sources flows
-through the intermediate links connecting the source to the currently selected
-sink. Each CoreSight component device should use these properties to describe
-its hardware characteristcs.
-
-* Required properties for all components *except* non-configurable replicators
- and non-configurable funnels:
-
- * compatible: These have to be supplemented with "arm,primecell" as
- drivers are using the AMBA bus interface. Possible values include:
- - Embedded Trace Buffer (version 1.0):
- "arm,coresight-etb10", "arm,primecell";
-
- - Trace Port Interface Unit:
- "arm,coresight-tpiu", "arm,primecell";
-
- - Trace Memory Controller, used for Embedded Trace Buffer(ETB),
- Embedded Trace FIFO(ETF) and Embedded Trace Router(ETR)
- configuration. The configuration mode (ETB, ETF, ETR) is
- discovered at boot time when the device is probed.
- "arm,coresight-tmc", "arm,primecell";
-
- - Trace Programmable Funnel:
- "arm,coresight-dynamic-funnel", "arm,primecell";
- "arm,coresight-funnel", "arm,primecell"; (OBSOLETE. For
- backward compatibility and will be removed)
-
- - Embedded Trace Macrocell (version 3.x) and
- Program Flow Trace Macrocell:
- "arm,coresight-etm3x", "arm,primecell";
-
- - Embedded Trace Macrocell (version 4.x), with memory mapped access.
- "arm,coresight-etm4x", "arm,primecell";
-
- - Embedded Trace Macrocell (version 4.x), with system register access only.
- "arm,coresight-etm4x-sysreg";
-
- - Coresight programmable Replicator :
- "arm,coresight-dynamic-replicator", "arm,primecell";
-
- - System Trace Macrocell:
- "arm,coresight-stm", "arm,primecell"; [1]
- - Coresight Address Translation Unit (CATU)
- "arm,coresight-catu", "arm,primecell";
-
- - Coresight Cross Trigger Interface (CTI):
- "arm,coresight-cti", "arm,primecell";
- See coresight-cti.yaml for full CTI definitions.
-
- * reg: physical base address and length of the register
- set(s) of the component.
-
- * clocks: the clocks associated to this component.
-
- * clock-names: the name of the clocks referenced by the code.
- Since we are using the AMBA framework, the name of the clock
- providing the interconnect should be "apb_pclk", and some
- coresight blocks also have an additional clock "atclk", which
- clocks the core of that coresight component. The latter clock
- is optional.
-
- * port or ports: see "Graph bindings for Coresight" below.
-
-* Additional required property for Embedded Trace Macrocell (version 3.x and
- version 4.x):
- * cpu: the cpu phandle this ETM/PTM is affined to. Do not
- assume it to default to CPU0 if omitted.
-
-* Additional required properties for System Trace Macrocells (STM):
- * reg: along with the physical base address and length of the register
- set as described above, another entry is required to describe the
- mapping of the extended stimulus port area.
-
- * reg-names: the only acceptable values are "stm-base" and
- "stm-stimulus-base", each corresponding to the areas defined in "reg".
-
-* Required properties for Coresight Cross Trigger Interface (CTI)
- See coresight-cti.yaml for full CTI definitions.
-
-* Required properties for devices that don't show up on the AMBA bus, such as
- non-configurable replicators and non-configurable funnels:
-
- * compatible: Currently supported value is (note the absence of the
- AMBA markee):
- - Coresight Non-configurable Replicator:
- "arm,coresight-static-replicator";
- "arm,coresight-replicator"; (OBSOLETE. For backward
- compatibility and will be removed)
-
- - Coresight Non-configurable Funnel:
- "arm,coresight-static-funnel";
-
- * port or ports: see "Graph bindings for Coresight" below.
-
-* Optional properties for all components:
-
- * arm,coresight-loses-context-with-cpu : boolean. Indicates that the
- hardware will lose register context on CPU power down (e.g. CPUIdle).
- An example of where this may be needed are systems which contain a
- coresight component and CPU in the same power domain. When the CPU
- powers down the coresight component also powers down and loses its
- context. This property is currently only used for the ETM 4.x driver.
-
-* Optional properties for ETM/PTMs:
-
- * arm,cp14: must be present if the system accesses ETM/PTM management
- registers via co-processor 14.
-
- * qcom,skip-power-up: boolean. Indicates that an implementation can
- skip powering up the trace unit. TRCPDCR.PU does not have to be set
- on Qualcomm Technologies Inc. systems since ETMs are in the same power
- domain as their CPU cores. This property is required to identify such
- systems with hardware errata where the CPU watchdog counter is stopped
- when TRCPDCR.PU is set.
-
-* Optional property for TMC:
-
- * arm,buffer-size: size of contiguous buffer space for TMC ETR
- (embedded trace router). This property is obsolete. The buffer size
- can be configured dynamically via buffer_size property in sysfs.
-
- * arm,scatter-gather: boolean. Indicates that the TMC-ETR can safely
- use the SG mode on this system.
-
-* Optional property for CATU :
- * interrupts : Exactly one SPI may be listed for reporting the address
- error
-
-* Optional property for configurable replicators:
-
- * qcom,replicator-loses-context: boolean. Indicates that the replicator
- will lose register context when AMBA clock is removed which is observed
- in some replicator designs.
-
-Graph bindings for Coresight
--------------------------------
-
-Coresight components are interconnected to create a data path for the flow of
-trace data generated from the "sources" to their collection points "sink".
-Each coresight component must describe the "input" and "output" connections.
-The connections must be described via generic DT graph bindings as described
-by the "bindings/graph.txt", where each "port" along with an "endpoint"
-component represents a hardware port and the connection.
-
- * All output ports must be listed inside a child node named "out-ports"
- * All input ports must be listed inside a child node named "in-ports".
- * Port address must match the hardware port number.
-
-Example:
-
-1. Sinks
- etb@20010000 {
- compatible = "arm,coresight-etb10", "arm,primecell";
- reg = <0 0x20010000 0 0x1000>;
-
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
- in-ports {
- port {
- etb_in_port: endpoint@0 {
- remote-endpoint = <&replicator_out_port0>;
- };
- };
- };
- };
-
- tpiu@20030000 {
- compatible = "arm,coresight-tpiu", "arm,primecell";
- reg = <0 0x20030000 0 0x1000>;
-
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
- in-ports {
- port {
- tpiu_in_port: endpoint@0 {
- remote-endpoint = <&replicator_out_port1>;
- };
- };
- };
- };
-
- etr@20070000 {
- compatible = "arm,coresight-tmc", "arm,primecell";
- reg = <0 0x20070000 0 0x1000>;
-
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
- in-ports {
- port {
- etr_in_port: endpoint {
- remote-endpoint = <&replicator2_out_port0>;
- };
- };
- };
-
- out-ports {
- port {
- etr_out_port: endpoint {
- remote-endpoint = <&catu_in_port>;
- };
- };
- };
- };
-
-2. Links
- replicator {
- /* non-configurable replicators don't show up on the
- * AMBA bus. As such no need to add "arm,primecell".
- */
- compatible = "arm,coresight-static-replicator";
-
- out-ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- /* replicator output ports */
- port@0 {
- reg = <0>;
- replicator_out_port0: endpoint {
- remote-endpoint = <&etb_in_port>;
- };
- };
-
- port@1 {
- reg = <1>;
- replicator_out_port1: endpoint {
- remote-endpoint = <&tpiu_in_port>;
- };
- };
- };
-
- in-ports {
- port {
- replicator_in_port0: endpoint {
- remote-endpoint = <&funnel_out_port0>;
- };
- };
- };
- };
-
- funnel {
- /*
- * non-configurable funnel don't show up on the AMBA
- * bus. As such no need to add "arm,primecell".
- */
- compatible = "arm,coresight-static-funnel";
- clocks = <&crg_ctrl HI3660_PCLK>;
- clock-names = "apb_pclk";
-
- out-ports {
- port {
- combo_funnel_out: endpoint {
- remote-endpoint = <&top_funnel_in>;
- };
- };
- };
-
- in-ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- combo_funnel_in0: endpoint {
- remote-endpoint = <&cluster0_etf_out>;
- };
- };
-
- port@1 {
- reg = <1>;
- combo_funnel_in1: endpoint {
- remote-endpoint = <&cluster1_etf_out>;
- };
- };
- };
- };
-
- funnel@20040000 {
- compatible = "arm,coresight-dynamic-funnel", "arm,primecell";
- reg = <0 0x20040000 0 0x1000>;
-
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
- out-ports {
- port {
- funnel_out_port0: endpoint {
- remote-endpoint =
- <&replicator_in_port0>;
- };
- };
- };
-
- in-ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- funnel_in_port0: endpoint {
- remote-endpoint = <&ptm0_out_port>;
- };
- };
-
- port@1 {
- reg = <1>;
- funnel_in_port1: endpoint {
- remote-endpoint = <&ptm1_out_port>;
- };
- };
-
- port@2 {
- reg = <2>;
- funnel_in_port2: endpoint {
- remote-endpoint = <&etm0_out_port>;
- };
- };
-
- };
- };
-
-3. Sources
- ptm@2201c000 {
- compatible = "arm,coresight-etm3x", "arm,primecell";
- reg = <0 0x2201c000 0 0x1000>;
-
- cpu = <&cpu0>;
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
- out-ports {
- port {
- ptm0_out_port: endpoint {
- remote-endpoint = <&funnel_in_port0>;
- };
- };
- };
- };
-
- ptm@2201d000 {
- compatible = "arm,coresight-etm3x", "arm,primecell";
- reg = <0 0x2201d000 0 0x1000>;
-
- cpu = <&cpu1>;
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
- out-ports {
- port {
- ptm1_out_port: endpoint {
- remote-endpoint = <&funnel_in_port1>;
- };
- };
- };
- };
-
-4. STM
- stm@20100000 {
- compatible = "arm,coresight-stm", "arm,primecell";
- reg = <0 0x20100000 0 0x1000>,
- <0 0x28000000 0 0x180000>;
- reg-names = "stm-base", "stm-stimulus-base";
-
- clocks = <&soc_smc50mhz>;
- clock-names = "apb_pclk";
- out-ports {
- port {
- stm_out_port: endpoint {
- remote-endpoint = <&main_funnel_in_port2>;
- };
- };
- };
- };
-
-5. CATU
-
- catu@207e0000 {
- compatible = "arm,coresight-catu", "arm,primecell";
- reg = <0 0x207e0000 0 0x1000>;
-
- clocks = <&oscclk6a>;
- clock-names = "apb_pclk";
-
- interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;
- in-ports {
- port {
- catu_in_port: endpoint {
- remote-endpoint = <&etr_out_port>;
- };
- };
- };
- };
-
-[1]. There is currently two version of STM: STM32 and STM500. Both
-have the same HW interface and as such don't need an explicit binding name.
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index 9a2432a88074..ff272e517d57 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/cpus.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ARM CPUs bindings
+title: ARM CPUs
maintainers:
- Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
@@ -85,6 +85,8 @@ properties:
compatible:
enum:
+ - apple,avalanche
+ - apple,blizzard
- apple,icestorm
- apple,firestorm
- arm,arm710t
@@ -137,6 +139,12 @@ properties:
- arm,cortex-a75
- arm,cortex-a76
- arm,cortex-a77
+ - arm,cortex-a78
+ - arm,cortex-a78ae
+ - arm,cortex-a78c
+ - arm,cortex-a510
+ - arm,cortex-a710
+ - arm,cortex-a715
- arm,cortex-m0
- arm,cortex-m0+
- arm,cortex-m1
@@ -145,8 +153,14 @@ properties:
- arm,cortex-r4
- arm,cortex-r5
- arm,cortex-r7
+ - arm,cortex-x1
+ - arm,cortex-x1c
+ - arm,cortex-x2
+ - arm,cortex-x3
- arm,neoverse-e1
- arm,neoverse-n1
+ - arm,neoverse-n2
+ - arm,neoverse-v1
- brcm,brahma-b15
- brcm,brahma-b53
- brcm,vulcan
@@ -166,12 +180,19 @@ properties:
- nvidia,tegra194-carmel
- qcom,krait
- qcom,kryo
+ - qcom,kryo240
+ - qcom,kryo250
- qcom,kryo260
- qcom,kryo280
+ - qcom,kryo360
- qcom,kryo385
- qcom,kryo468
- qcom,kryo485
+ - qcom,kryo560
+ - qcom,kryo570
+ - qcom,kryo660
- qcom,kryo685
+ - qcom,kryo780
- qcom,scorpion
enable-method:
@@ -209,6 +230,10 @@ properties:
- qcom,gcc-msm8660
- qcom,kpss-acc-v1
- qcom,kpss-acc-v2
+ - qcom,msm8226-smp
+ - qcom,msm8909-smp
+ # Only valid on ARM 32-bit, see above for ARM v8 64-bit
+ - qcom,msm8916-smp
- renesas,apmu
- renesas,r9a06g032-smp
- rockchip,rk3036-smp
@@ -219,27 +244,31 @@ properties:
- ti,am4372
cpu-release-addr:
- $ref: '/schemas/types.yaml#/definitions/uint64'
-
+ oneOf:
+ - $ref: '/schemas/types.yaml#/definitions/uint32'
+ - $ref: '/schemas/types.yaml#/definitions/uint64'
description:
+ The DT specification defines this as always 64-bit, but some 32-bit Arm
+ systems have used a 32-bit value, which must also be supported (see the
+ sketch after this hunk).
Required for systems that have an "enable-method"
property value of "spin-table".
- On ARM v8 64-bit systems must be a two cell
- property identifying a 64-bit zero-initialised
- memory location.
cpu-idle-states:
$ref: '/schemas/types.yaml#/definitions/phandle-array'
+ items:
+ maxItems: 1
description: |
List of phandles to idle state nodes supported
by this cpu (see ./idle-states.yaml).
capacity-dmips-mhz:
description:
- u32 value representing CPU capacity (see ./cpu-capacity.txt) in
+ u32 value representing CPU capacity (see ../cpu/cpu-capacity.txt) in
DMIPS/MHz, relative to highest capacity-dmips-mhz
in the system.
+ cci-control-port: true
+
dynamic-power-coefficient:
$ref: '/schemas/types.yaml#/definitions/uint32'
description:
@@ -293,7 +322,8 @@ properties:
Specifies the ACC* node associated with this CPU.
Required for systems that have an "enable-method" property
- value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
+ value of "qcom,kpss-acc-v1", "qcom,kpss-acc-v2", "qcom,msm8226-smp" or
+ "qcom,msm8916-smp".
* arm/msm/qcom,kpss-acc.txt
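As a worked sketch of the spin-table case described above (the release address
is hypothetical): a 64-bit cpu-release-addr is written as two cells, while the
32-bit form that some 32-bit Arm systems use would be a single cell.

	cpus {
		#address-cells = <1>;
		#size-cells = <0>;

		cpu@0 {
			device_type = "cpu";
			compatible = "arm,cortex-a53";
			reg = <0>;
			enable-method = "spin-table";
			/* two cells = one 64-bit zero-initialised release address */
			cpu-release-addr = <0x0 0x8000fff8>;
		};
	};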
diff --git a/Documentation/devicetree/bindings/arm/digicolor.yaml b/Documentation/devicetree/bindings/arm/digicolor.yaml
index a35de3c9e284..0cf9ddaa527e 100644
--- a/Documentation/devicetree/bindings/arm/digicolor.yaml
+++ b/Documentation/devicetree/bindings/arm/digicolor.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/digicolor.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Conexant Digicolor Platforms Device Tree Bindings
+title: Conexant Digicolor Platforms
maintainers:
- Baruch Siach <baruch@tkos.co.il>
diff --git a/Documentation/devicetree/bindings/arm/ete.yaml b/Documentation/devicetree/bindings/arm/ete.yaml
deleted file mode 100644
index 7f9b2d1e1147..000000000000
--- a/Documentation/devicetree/bindings/arm/ete.yaml
+++ /dev/null
@@ -1,75 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
-# Copyright 2021, Arm Ltd
-%YAML 1.2
----
-$id: "http://devicetree.org/schemas/arm/ete.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
-
-title: ARM Embedded Trace Extensions
-
-maintainers:
- - Suzuki K Poulose <suzuki.poulose@arm.com>
- - Mathieu Poirier <mathieu.poirier@linaro.org>
-
-description: |
- Arm Embedded Trace Extension(ETE) is a per CPU trace component that
- allows tracing the CPU execution. It overlaps with the CoreSight ETMv4
- architecture and has extended support for future architecture changes.
- The trace generated by the ETE could be stored via legacy CoreSight
- components (e.g, TMC-ETR) or other means (e.g, using a per CPU buffer
- Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
- legacy CoreSight components, a node must be listed per instance, along
- with any optional connection graph as per the coresight bindings.
- See bindings/arm/coresight.txt.
-
-properties:
- $nodename:
- pattern: "^ete([0-9a-f]+)$"
- compatible:
- items:
- - const: arm,embedded-trace-extension
-
- cpu:
- description: |
- Handle to the cpu this ETE is bound to.
- $ref: /schemas/types.yaml#/definitions/phandle
-
- out-ports:
- description: |
- Output connections from the ETE to legacy CoreSight trace bus.
- $ref: /schemas/graph.yaml#/properties/ports
- properties:
- port:
- description: Output connection from the ETE to legacy CoreSight Trace bus.
- $ref: /schemas/graph.yaml#/properties/port
-
-required:
- - compatible
- - cpu
-
-additionalProperties: false
-
-examples:
-
-# An ETE node without legacy CoreSight connections
- - |
- ete0 {
- compatible = "arm,embedded-trace-extension";
- cpu = <&cpu_0>;
- };
-# An ETE node with legacy CoreSight connections
- - |
- ete1 {
- compatible = "arm,embedded-trace-extension";
- cpu = <&cpu_1>;
-
- out-ports { /* legacy coresight connection */
- port {
- ete1_out_port: endpoint {
- remote-endpoint = <&funnel_in_port0>;
- };
- };
- };
- };
-
-...
diff --git a/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.yaml b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.yaml
index c24047c1fdd5..5d033570b57b 100644
--- a/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.yaml
+++ b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/firmware/linaro,optee-tz.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: OP-TEE Device Tree Bindings
+title: OP-TEE
maintainers:
- Jens Wiklander <jens.wiklander@linaro.org>
@@ -24,6 +24,13 @@ properties:
compatible:
const: linaro,optee-tz
+ interrupts:
+ maxItems: 1
+ description: |
+ This interrupt, which the secure world software uses to signal an event,
+ is expected to be either a per-cpu interrupt or an edge-triggered
+ peripheral interrupt.
+
method:
enum: [smc, hvc]
description: |
@@ -42,10 +49,12 @@ additionalProperties: false
examples:
- |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
firmware {
optee {
compatible = "linaro,optee-tz";
method = "smc";
+ interrupts = <GIC_SPI 187 IRQ_TYPE_EDGE_RISING>;
};
};
diff --git a/Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.txt b/Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.txt
deleted file mode 100644
index 780d0392a66b..000000000000
--- a/Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Trusted Foundations
--------------------
-
-Boards that use the Trusted Foundations secure monitor can signal its
-presence by declaring a node compatible with "tlm,trusted-foundations"
-under the /firmware/ node
-
-Required properties:
-- compatible: "tlm,trusted-foundations"
-- tlm,version-major: major version number of Trusted Foundations firmware
-- tlm,version-minor: minor version number of Trusted Foundations firmware
-
-Example:
- firmware {
- trusted-foundations {
- compatible = "tlm,trusted-foundations";
- tlm,version-major = <2>;
- tlm,version-minor = <8>;
- };
- };
diff --git a/Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.yaml b/Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.yaml
new file mode 100644
index 000000000000..9d1857c0aa07
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/firmware/tlm,trusted-foundations.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/arm/firmware/tlm,trusted-foundations.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: Trusted Foundations
+
+description: |
+ Boards that use the Trusted Foundations secure monitor can signal its
+ presence by declaring a compatible node under the /firmware/ node.
+
+maintainers:
+ - Stephen Warren <swarren@nvidia.com>
+
+properties:
+ $nodename:
+ const: trusted-foundations
+
+ compatible:
+ const: tlm,trusted-foundations
+
+ tlm,version-major:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: major version number of Trusted Foundations firmware
+
+ tlm,version-minor:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: minor version number of Trusted Foundations firmware
+
+required:
+ - compatible
+ - tlm,version-major
+ - tlm,version-minor
+
+additionalProperties: false
+
+examples:
+ - |
+ firmware {
+ trusted-foundations {
+ compatible = "tlm,trusted-foundations";
+ tlm,version-major = <2>;
+ tlm,version-minor = <8>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt
deleted file mode 100644
index b5cb374dc47d..000000000000
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-dcfg.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Freescale DCFG
-
-DCFG is the device configuration unit, that provides general purpose
-configuration and status for the device. Such as setting the secondary
-core start address and release the secondary core from holdoff and startup.
-
-Required properties:
- - compatible: Should contain a chip-specific compatible string,
- Chip-specific strings are of the form "fsl,<chip>-dcfg",
- The following <chip>s are known to be supported:
- ls1012a, ls1021a, ls1043a, ls1046a, ls2080a.
-
- - reg : should contain base address and length of DCFG memory-mapped registers
-
-Example:
- dcfg: dcfg@1ee0000 {
- compatible = "fsl,ls1021a-dcfg";
- reg = <0x0 0x1ee0000 0x0 0x10000>;
- };
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt
deleted file mode 100644
index 0ab67b0b216d..000000000000
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,layerscape-scfg.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Freescale SCFG
-
-SCFG is the supplemental configuration unit that provides SoC-specific
-configuration and status registers for the chip, such as the PEX port
-status.
-
-Required properties:
- - compatible: Should contain a chip-specific compatible string,
- Chip-specific strings are of the form "fsl,<chip>-scfg",
- The following <chip>s are known to be supported:
- ls1012a, ls1021a, ls1043a, ls1046a, ls2080a.
-
- - reg: should contain base address and length of SCFG memory-mapped registers
-
-Example:
- scfg: scfg@1570000 {
- compatible = "fsl,ls1021a-scfg";
- reg = <0x0 0x1570000 0x0 0x10000>;
- };
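The four-cell reg in this example (and in the DCFG one above) only makes sense under a parent bus with two address cells and two size cells; a minimal sketch making that assumption explicit:

	soc {
		#address-cells = <2>;
		#size-cells = <2>;

		scfg: scfg@1570000 {
			compatible = "fsl,ls1021a-scfg";
			/* 64-bit address 0x1570000, 64-bit length 0x10000 */
			reg = <0x0 0x1570000 0x0 0x10000>;
		};
	};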
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
deleted file mode 100644
index fd0061712443..000000000000
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
+++ /dev/null
@@ -1,270 +0,0 @@
-NXP i.MX System Controller Firmware (SCFW)
---------------------------------------------------------------------
-
-The System Controller Firmware (SCFW) is a low-level system function
-which runs on a dedicated Cortex-M core to provide power, clock, and
-resource management. It exists on some i.MX8 processors, e.g. i.MX8QM
-(QM, QP) and i.MX8QX (QXP, DX).
-
-The AP communicates with the SC using a multi-ported MU module found
-in the LSIO subsystem. The current definition of this MU module provides
-5 remote AP connections to the SC to support up to 5 execution environments
-(TZ, HV, standard Linux, etc.). The SC side of this MU module interfaces
-with the LSIO DSC IP bus. The SC firmware will communicate with this MU
-using the MSI bus.
-
-System Controller Device Node:
-============================================================
-
-The scu node with the following properties shall be under the /firmware/ node.
-
-Required properties:
--------------------
-- compatible: should be "fsl,imx-scu".
-- mbox-names: should include "tx0", "tx1", "tx2", "tx3",
- "rx0", "rx1", "rx2", "rx3";
- include "gip3" if general MU interrupt support is wanted.
-- mboxes: List of phandle of 4 MU channels for tx, 4 MU channels for
- rx, and 1 optional MU channel for general interrupt.
- All MU channels must be in the same MU instance.
- Cross instances are not allowed. The MU instance can only
- be one of LSIO MU0~MU4 for imx8qxp and imx8qm. Users need
- to make sure to use one that does not conflict with other
- execution environments, e.g. ATF.
- Note:
- Channel 0 must be "tx0" or "rx0".
- Channel 1 must be "tx1" or "rx1".
- Channel 2 must be "tx2" or "rx2".
- Channel 3 must be "tx3" or "rx3".
- General interrupt rx channel must be "gip3".
- e.g.
- mboxes = <&lsio_mu1 0 0
- &lsio_mu1 0 1
- &lsio_mu1 0 2
- &lsio_mu1 0 3
- &lsio_mu1 1 0
- &lsio_mu1 1 1
- &lsio_mu1 1 2
- &lsio_mu1 1 3
- &lsio_mu1 3 3>;
- See Documentation/devicetree/bindings/mailbox/fsl,mu.yaml
- for detailed mailbox binding.
-
-Note: Each MU that supports the general interrupt should have a correctly
-numbered alias in the "aliases" node.
-e.g.
-aliases {
- mu1 = &lsio_mu1;
-};
-
-i.MX SCU Client Device Node:
-============================================================
-
-Client nodes are maintained as children of the relevant IMX-SCU device node.
-
-Power domain bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-This binding for the SCU power domain providers uses the generic power
-domain binding[2].
-
-Required properties:
-- compatible: Should be one of:
- "fsl,imx8qm-scu-pd",
- "fsl,imx8qxp-scu-pd"
- followed by "fsl,scu-pd"
-
-- #power-domain-cells: Must be 1. Contains the Resource ID used by
- SCU commands.
- See detailed Resource ID list from:
- include/dt-bindings/firmware/imx/rsrc.h
-
-Clock bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-This binding uses the common clock binding[1].
-
-Required properties:
-- compatible: Should be one of:
- "fsl,imx8qm-clk"
- "fsl,imx8qxp-clk"
- followed by "fsl,scu-clk"
-- #clock-cells: Should be 2.
- Contains the Resource and Clock ID value.
-- clocks: List of clock specifiers, must contain an entry for
- each required entry in clock-names
-- clock-names: Should include entries "xtal_32KHz", "xtal_24MHz"
-
-The clock consumer should specify the desired clock by having the clock
-ID in its "clocks" phandle cell.
-
-See the full list of clock IDs from:
-include/dt-bindings/clock/imx8qxp-clock.h
-
-Pinctrl bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-This binding uses the i.MX common pinctrl binding[3].
-
-Required properties:
-- compatible: Should be one of:
- "fsl,imx8qm-iomuxc",
- "fsl,imx8qxp-iomuxc",
- "fsl,imx8dxl-iomuxc".
-
-Required properties for Pinctrl sub nodes:
-- fsl,pins: Each entry consists of 3 integers which represent
- the mux and config setting for one pin. The first 2
- integers <pin_id mux_mode> are specified using a
- PIN_FUNC_ID macro, which can be found in
- <dt-bindings/pinctrl/pads-imx8qm.h>,
- <dt-bindings/pinctrl/pads-imx8qxp.h>,
- <dt-bindings/pinctrl/pads-imx8dxl.h>.
- The last integer CONFIG is the pad setting value like
- pull-up on this pin.
-
- Please refer to i.MX8QXP Reference Manual for detailed
- CONFIG settings.
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-[2] Documentation/devicetree/bindings/power/power-domain.yaml
-[3] Documentation/devicetree/bindings/pinctrl/fsl,imx-pinctrl.txt
-
-RTC bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-Required properties:
-- compatible: should be "fsl,imx8qxp-sc-rtc";
-
-OCOTP bindings based on SCU Message Protocol
-------------------------------------------------------------
-Required properties:
-- compatible: Should be one of:
- "fsl,imx8qm-scu-ocotp",
- "fsl,imx8qxp-scu-ocotp".
-- #address-cells: Must be 1. Contains byte index
-- #size-cells: Must be 1. Contains byte length
-
-Optional Child nodes:
-
-- Data cells of ocotp:
- Detailed bindings are described in bindings/nvmem/nvmem.txt
-
-Watchdog bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-Required properties:
-- compatible: should be:
- "fsl,imx8qxp-sc-wdt"
- followed by "fsl,imx-sc-wdt";
-Optional properties:
-- timeout-sec: contains the watchdog timeout in seconds.
-
-SCU key bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-Required properties:
-- compatible: should be:
- "fsl,imx8qxp-sc-key"
- followed by "fsl,imx-sc-key";
-- linux,keycodes: See Documentation/devicetree/bindings/input/input.yaml
-
-Thermal bindings based on SCU Message Protocol
-------------------------------------------------------------
-
-Required properties:
-- compatible: Should be :
- "fsl,imx8qxp-sc-thermal"
- followed by "fsl,imx-sc-thermal";
-
-- #thermal-sensor-cells: See Documentation/devicetree/bindings/thermal/thermal-sensor.yaml
- for a description.
-
-Example (imx8qxp):
--------------
-aliases {
- mu1 = &lsio_mu1;
-};
-
-lsio_mu1: mailbox@5d1c0000 {
- ...
- #mbox-cells = <2>;
-};
-
-firmware {
- scu {
- compatible = "fsl,imx-scu";
- mbox-names = "tx0", "tx1", "tx2", "tx3",
- "rx0", "rx1", "rx2", "rx3",
- "gip3";
- mboxes = <&lsio_mu1 0 0
- &lsio_mu1 0 1
- &lsio_mu1 0 2
- &lsio_mu1 0 3
- &lsio_mu1 1 0
- &lsio_mu1 1 1
- &lsio_mu1 1 2
- &lsio_mu1 1 3
- &lsio_mu1 3 3>;
-
- clk: clk {
- compatible = "fsl,imx8qxp-clk", "fsl,scu-clk";
- #clock-cells = <2>;
- };
-
- iomuxc {
- compatible = "fsl,imx8qxp-iomuxc";
-
- pinctrl_lpuart0: lpuart0grp {
- fsl,pins = <
- SC_P_UART0_RX_ADMA_UART0_RX 0x06000020
- SC_P_UART0_TX_ADMA_UART0_TX 0x06000020
- >;
- };
- ...
- };
-
- ocotp: imx8qx-ocotp {
- compatible = "fsl,imx8qxp-scu-ocotp";
- #address-cells = <1>;
- #size-cells = <1>;
-
- fec_mac0: mac@2c4 {
- reg = <0x2c4 8>;
- };
- };
-
- pd: imx8qx-pd {
- compatible = "fsl,imx8qxp-scu-pd", "fsl,scu-pd";
- #power-domain-cells = <1>;
- };
-
- rtc: rtc {
- compatible = "fsl,imx8qxp-sc-rtc";
- };
-
- scu_key: scu-key {
- compatible = "fsl,imx8qxp-sc-key", "fsl,imx-sc-key";
- linux,keycodes = <KEY_POWER>;
- };
-
- watchdog {
- compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
- timeout-sec = <60>;
- };
-
- tsens: thermal-sensor {
- compatible = "fsl,imx8qxp-sc-thermal", "fsl,imx-sc-thermal";
- #thermal-sensor-cells = <1>;
- };
- };
-};
-
-serial@5a060000 {
- ...
- pinctrl-names = "default";
- pinctrl-0 = <&pinctrl_lpuart0>;
- clocks = <&uart0_clk IMX_SC_R_UART_0 IMX_SC_PM_CLK_PER>;
- clock-names = "ipg";
- power-domains = <&pd IMX_SC_R_UART_0>;
-};
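Since the general-interrupt channel is optional, a minimal scu node, sketched here reusing the lsio_mu1 mailbox from the example above, wires only the four tx and four rx channels and omits "gip3" (and with it the aliases requirement):

	firmware {
		scu {
			compatible = "fsl,imx-scu";
			/* channel n must be named txn/rxn, per the note above */
			mbox-names = "tx0", "tx1", "tx2", "tx3",
				     "rx0", "rx1", "rx2", "rx3";
			mboxes = <&lsio_mu1 0 0 &lsio_mu1 0 1
				  &lsio_mu1 0 2 &lsio_mu1 0 3
				  &lsio_mu1 1 0 &lsio_mu1 1 1
				  &lsio_mu1 1 2 &lsio_mu1 1 3>;
		};
	};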
diff --git a/Documentation/devicetree/bindings/arm/fsl.yaml b/Documentation/devicetree/bindings/arm/fsl.yaml
index 60f4862ba15e..15d411084065 100644
--- a/Documentation/devicetree/bindings/arm/fsl.yaml
+++ b/Documentation/devicetree/bindings/arm/fsl.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/fsl.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Freescale i.MX Platforms Device Tree Bindings
+title: Freescale i.MX Platforms
maintainers:
- Shawn Guo <shawnguo@kernel.org>
@@ -88,12 +88,56 @@ properties:
items:
- enum:
- armadeus,imx28-apf28 # APF28 SoM
- - armadeus,imx28-apf28dev # APF28 SoM on APF28Dev board
+ - bluegiga,apx4devkit # Bluegiga APx4 SoM on dev board
+ - crystalfontz,cfa10036 # Crystalfontz CFA-10036 SoM
+ - eukrea,mbmx28lc
- fsl,imx28-evk
- i2se,duckbill
- i2se,duckbill-2
+ - karo,tx28 # Ka-Ro electronics TX28 module
+ - lwn,imx28-xea
+ - msr,m28cu3 # M28 SoM with custom base board
+ - schulercontrol,imx28-sps1
- technologic,imx28-ts4600
- const: fsl,imx28
+
+ - description: i.MX28 Aries M28 SoM Board
+ items:
+ - const: aries,m28
+ - const: denx,m28
+ - const: fsl,imx28
+
+ - description: i.MX28 Aries M28EVK Board
+ items:
+ - const: aries,m28evk
+ - const: denx,m28evk
+ - const: fsl,imx28
+
+ - description: i.MX28 Armadeus Systems APF28Dev Board
+ items:
+ - const: armadeus,imx28-apf28dev
+ - const: armadeus,imx28-apf28
+ - const: fsl,imx28
+
+ - description: i.MX28 Crystalfontz CFA-10036 based Boards
+ items:
+ - enum:
+ - crystalfontz,cfa10037
+ - crystalfontz,cfa10049
+ - crystalfontz,cfa10057
+ - crystalfontz,cfa10058
+ - const: crystalfontz,cfa10036
+ - const: fsl,imx28
+
+ - description: i.MX28 Crystalfontz CFA-10037 based Boards
+ items:
+ - enum:
+ - crystalfontz,cfa10055
+ - crystalfontz,cfa10056
+ - const: crystalfontz,cfa10037
+ - const: crystalfontz,cfa10036
+ - const: fsl,imx28
+
- description: i.MX28 Duckbill 2 based Boards
items:
- enum:
@@ -103,6 +147,19 @@ properties:
- const: i2se,duckbill-2
- const: fsl,imx28
+ - description: i.MX28 Eukrea Electromatique MBMX283LC Board
+ items:
+ - const: eukrea,mbmx283lc
+ - const: eukrea,mbmx28lc
+ - const: fsl,imx28
+
+ - description: i.MX28 Eukrea Electromatique MBMX287LC Board
+ items:
+ - const: eukrea,mbmx287lc
+ - const: eukrea,mbmx283lc
+ - const: eukrea,mbmx28lc
+ - const: fsl,imx28
+
- description: i.MX31 based Boards
items:
- enum:
@@ -172,7 +229,8 @@ properties:
- karo,tx53 # Ka-Ro electronics TX53 module
- kiebackpeter,imx53-ddc # K+P imx53 DDC
- kiebackpeter,imx53-hsc # K+P imx53 HSC
- - menlo,m53menlo
+ - menlo,m53menlo # i.MX53 Menlo board
+ - starterkit,sk-imx53
- voipac,imx53-dmm-668 # Voipac i.MX53 X53-DMM-668
- const: fsl,imx53
@@ -192,6 +250,7 @@ properties:
items:
- enum:
- auvidea,h100 # Auvidea H100
+ - bosch,imx6q-acc # Bosch ACC i.MX6 Dual
- boundary,imx6q-nitrogen6_max
- boundary,imx6q-nitrogen6_som2
- boundary,imx6q-nitrogen6x
@@ -235,11 +294,13 @@ properties:
- technexion,imx6q-pico-pi # TechNexion i.MX6Q Pico-Pi
- technologic,imx6q-ts4900
- technologic,imx6q-ts7970
- - toradex,apalis_imx6q # Apalis iMX6 Module
+ - toradex,apalis_imx6q # Apalis iMX6 Modules
- udoo,imx6q-udoo # Udoo i.MX6 Quad Board
- uniwest,imx6q-evi # Uniwest Evi
- variscite,dt6customboard
- wand,imx6q-wandboard # Wandboard i.MX6 Quad Board
+ - ysoft,imx6q-yapp4-crux # i.MX6 Quad Y Soft IOTA Crux board
+ - ysoft,imx6q-yapp4-pegasus # i.MX6 Quad Y Soft IOTA Pegasus board
- zealz,imx6q-gk802 # Zealz GK802
- zii,imx6q-zii-rdu2 # ZII RDU2 Board
- const: fsl,imx6q
@@ -314,19 +375,28 @@ properties:
- const: phytec,imx6q-pfla02 # PHYTEC phyFLEX-i.MX6 Quad
- const: fsl,imx6q
- - description: i.MX6Q Boards with Toradex Apalis iMX6Q/D Module
+ - description: i.MX6Q Boards with Toradex Apalis iMX6Q/D Modules
items:
- enum:
- - toradex,apalis_imx6q-ixora # Apalis iMX6Q/D Module on Ixora Carrier Board
- - toradex,apalis_imx6q-eval # Apalis iMX6Q/D Module on Apalis Evaluation Board
+ - toradex,apalis_imx6q-ixora # Apalis iMX6Q/D Module on Ixora Carrier Board
+ - toradex,apalis_imx6q-ixora-v1.1 # Apalis iMX6Q/D Module on Ixora V1.1 Carrier Board
+ - toradex,apalis_imx6q-ixora-v1.2 # Apalis iMX6Q/D Module on Ixora V1.2 Carrier Board
+ - toradex,apalis_imx6q-eval # Apalis iMX6Q/D Module on Apalis Evaluation Board
- const: toradex,apalis_imx6q
- const: fsl,imx6q
- - description: i.MX6Q Toradex Apalis iMX6Q/D Module on Ixora Carrier Board V1.1
+ - description: TQ-Systems TQMa6Q SoM (variant A) on MBa6x
items:
- - const: toradex,apalis_imx6q-ixora-v1.1
- - const: toradex,apalis_imx6q-ixora
- - const: toradex,apalis_imx6q
+ - const: tq,imx6q-mba6x-a
+ - const: tq,mba6a # Expected by bootloader, to be removed in the future
+ - const: tq,imx6q-tqma6q-a
+ - const: fsl,imx6q
+
+ - description: TQ-Systems TQMa6Q SoM (variant B) on MBa6x
+ items:
+ - const: tq,imx6q-mba6x-b
+ - const: tq,mba6b # Expected by bootloader, to be removed in the future
+ - const: tq,imx6q-tqma6q-b
- const: fsl,imx6q
- description: i.MX6QP based Boards
@@ -340,6 +410,8 @@ properties:
- kvg,vicutp # Kverneland UT1P board
- prt,prtwd3 # Protonic WD3 board
- wand,imx6qp-wandboard # Wandboard i.MX6 QuadPlus Board
+ - ysoft,imx6qp-yapp4-crux-plus # i.MX6 Quad Plus Y Soft IOTA Crux+ board
+ - ysoft,imx6qp-yapp4-pegasus-plus # i.MX6 Quad Plus Y Soft IOTA Pegasus+ board
- zii,imx6qp-zii-rdu2 # ZII RDU2+ Board
- const: fsl,imx6qp
@@ -350,6 +422,13 @@ properties:
- const: phytec,imx6qdl-pcm058 # PHYTEC phyCORE-i.MX6
- const: fsl,imx6qp
+ - description: TQ-Systems TQMa6QP SoM on MBa6x
+ items:
+ - const: tq,imx6qp-mba6x-b
+ - const: tq,mba6b # Expected by bootloader, to be removed in the future
+ - const: tq,imx6qp-tqma6qp-b
+ - const: fsl,imx6qp
+
- description: i.MX6DL based Boards
items:
- enum:
@@ -393,12 +472,15 @@ properties:
- technexion,imx6dl-pico-pi # TechNexion i.MX6DL Pico-Pi
- technologic,imx6dl-ts4900
- technologic,imx6dl-ts7970
+ - toradex,colibri_imx6dl # Colibri iMX6 Modules
- udoo,imx6dl-udoo # Udoo i.MX6 Dual-lite Board
- vdl,lanmcu # Van der Laan LANMCU board
- wand,imx6dl-wandboard # Wandboard i.MX6 Dual Lite Board
- - ysoft,imx6dl-yapp4-draco # i.MX6 DualLite Y Soft IOTA Draco board
+ - ysoft,imx6dl-yapp4-draco # i.MX6 Solo Y Soft IOTA Draco board
- ysoft,imx6dl-yapp4-hydra # i.MX6 DualLite Y Soft IOTA Hydra board
+ - ysoft,imx6dl-yapp4-lynx # i.MX6 DualLite Y Soft IOTA Lynx board
- ysoft,imx6dl-yapp4-orion # i.MX6 DualLite Y Soft IOTA Orion board
+ - ysoft,imx6dl-yapp4-phoenix # i.MX6 DualLite Y Soft IOTA Phoenix board
- ysoft,imx6dl-yapp4-ursa # i.MX6 Solo Y Soft IOTA Ursa board
- const: fsl,imx6dl
@@ -466,20 +548,14 @@ properties:
- const: phytec,imx6dl-pfla02 # PHYTEC phyFLEX-i.MX6 Quad
- const: fsl,imx6dl
- - description: i.MX6DL Toradex Colibri iMX6 Module on Colibri
- Evaluation Board V3
+ - description: i.MX6DL Boards with Toradex Colibri iMX6DL/S Modules
items:
- - const: toradex,colibri_imx6dl-eval-v3
- - const: toradex,colibri_imx6dl # Colibri iMX6 Module
- - const: fsl,imx6dl
-
- - description: i.MX6DL Toradex Colibri iMX6 Module V1.1 on Colibri
- Evaluation Board V3
- items:
- - const: toradex,colibri_imx6dl-v1_1-eval-v3
- - const: toradex,colibri_imx6dl-v1_1 # Colibri iMX6 Module V1.1
- - const: toradex,colibri_imx6dl-eval-v3
- - const: toradex,colibri_imx6dl # Colibri iMX6 Module
+ - enum:
+ - toradex,colibri_imx6dl-aster # Colibri iMX6DL/S Module on Aster Board
+ - toradex,colibri_imx6dl-eval-v3 # Colibri iMX6DL/S Module on Colibri Evaluation Board V3
+ - toradex,colibri_imx6dl-iris # Colibri iMX6DL/S Module on Iris Board
+ - toradex,colibri_imx6dl-iris-v2 # Colibri iMX6DL/S Module on Iris Board V2
+ - const: toradex,colibri_imx6dl # Colibri iMX6DL/S Module
- const: fsl,imx6dl
- description: i.MX6S DHCOM DRC02 Board
@@ -488,12 +564,29 @@ properties:
- const: dh,imx6s-dhcom-som
- const: fsl,imx6dl
+ - description: TQ-Systems TQMa6DL SoM (variant A) on MBa6x
+ items:
+ - const: tq,imx6dl-mba6x-a
+ - const: tq,mba6a # Expected by bootloader, to be removed in the future
+ - const: tq,imx6dl-tqma6dl-a
+ - const: fsl,imx6dl
+
+ - description: TQ-Systems TQMa6DL SoM (variant B) on MBa6x
+ items:
+ - const: tq,imx6dl-mba6x-b
+ - const: tq,mba6b # Expected by bootloader, to be removed in the future
+ - const: tq,imx6dl-tqma6dl-b
+ - const: fsl,imx6dl
+
- description: i.MX6SL based Boards
items:
- enum:
- fsl,imx6sl-evk # i.MX6 SoloLite EVK Board
+ - kobo,aura2
- kobo,tolino-shine2hd
- kobo,tolino-shine3
+ - kobo,tolino-vision
+ - kobo,tolino-vision5
- revotics,imx6sl-warp # Revotics WaRP Board
- const: fsl,imx6sl
@@ -502,6 +595,7 @@ properties:
- enum:
- fsl,imx6sll-evk
- kobo,clarahd
+ - kobo,librah2o
- const: fsl,imx6sll
- description: i.MX6SX based Boards
@@ -524,8 +618,7 @@ properties:
- engicam,imx6ul-isiot # Engicam Is.IoT MX6UL eMMC/NAND Starter kit
- fsl,imx6ul-14x14-evk # i.MX6 UltraLite 14x14 EVK Board
- karo,imx6ul-tx6ul # Ka-Ro electronics TXUL-0010 Module
- - kontron,imx6ul-n6310-som # Kontron N6310 SOM
- - kontron,imx6ul-n6311-som # Kontron N6311 SOM
+ - kontron,sl-imx6ul # Kontron SL i.MX6UL SoM
- prt,prti6g # Protonic PRTI6G Board
- technexion,imx6ul-pico-dwarf # TechNexion i.MX6UL Pico-Dwarf
- technexion,imx6ul-pico-hobbit # TechNexion i.MX6UL Pico-Hobbit
@@ -561,33 +654,51 @@ properties:
- const: phytec,imx6ul-pcl063 # PHYTEC phyCORE-i.MX 6UL
- const: fsl,imx6ul
- - description: Kontron N6310 S Board
+ - description: Kontron BL i.MX6UL (N631X S) Board
items:
- - const: kontron,imx6ul-n6310-s
- - const: kontron,imx6ul-n6310-som
+ - const: kontron,bl-imx6ul # Kontron BL i.MX6UL Carrier Board
+ - const: kontron,sl-imx6ul # Kontron SL i.MX6UL SoM
- const: fsl,imx6ul
- - description: Kontron N6311 S Board
+ - description: Kontron BL i.MX6UL 43 (N631X S 43) Board
items:
- - const: kontron,imx6ul-n6311-s
- - const: kontron,imx6ul-n6311-som
+ - const: kontron,bl-imx6ul-43 # Kontron BL i.MX6UL Carrier Board with 4.3" Display
+ - const: kontron,bl-imx6ul # Kontron BL i.MX6UL Carrier Board
+ - const: kontron,sl-imx6ul # Kontron SL i.MX6UL SoM
- const: fsl,imx6ul
- - description: Kontron N6310 S 43 Board
+ - description: TQ-Systems TQMa6UL1 SoM on MBa6ULx board
items:
- - const: kontron,imx6ul-n6310-s-43
- - const: kontron,imx6ul-n6310-s
- - const: kontron,imx6ul-n6310-som
+ - enum:
+ - tq,imx6ul-tqma6ul1-mba6ulx
+ - const: tq,imx6ul-tqma6ul1 # MCIMX6G1
+ - const: fsl,imx6ul
+
+ - description: TQ-Systems TQMa6UL2 SoM on MBa6ULx board
+ items:
+ - enum:
+ - tq,imx6ul-tqma6ul2-mba6ulx
+ - const: tq,imx6ul-tqma6ul2 # MCIMX6G2
+ - const: fsl,imx6ul
+
+ - description: TQ-Systems TQMa6ULxL SoM on MBa6ULx[L] board
+ items:
+ - enum:
+ - tq,imx6ul-tqma6ul2l-mba6ulx # using LGA adapter
+ - tq,imx6ul-tqma6ul2l-mba6ulxl
+ - const: tq,imx6ul-tqma6ul2l # MCIMX6G2, LGA SoM variant
- const: fsl,imx6ul
- description: i.MX6ULL based Boards
items:
- enum:
- fsl,imx6ull-14x14-evk # i.MX6 UltraLiteLite 14x14 EVK Board
- - kontron,imx6ull-n6411-som # Kontron N6411 SOM
+ - joz,jozacp # JOZ Access Point
+ - kontron,sl-imx6ull # Kontron SL i.MX6ULL SoM
- myir,imx6ull-mys-6ulx-eval # MYiR Tech iMX6ULL Evaluation Board
- - toradex,colibri-imx6ull-eval # Colibri iMX6ULL Module on Colibri Eval Board
- - toradex,colibri-imx6ull-wifi-eval # Colibri iMX6ULL Wi-Fi / BT Module on Colibri Eval Board
+ - toradex,colibri-imx6ull # Colibri iMX6ULL Modules
+ - toradex,colibri-imx6ull-emmc # Colibri iMX6ULL 1GB (eMMC) Module
+ - toradex,colibri-imx6ull-wifi # Colibri iMX6ULL Wi-Fi / BT Modules
- const: fsl,imx6ull
- description: i.MX6ULL Armadeus Systems OPOS6ULDev Board
@@ -596,6 +707,25 @@ properties:
- const: armadeus,imx6ull-opos6ul # OPOS6UL (i.MX6ULL) SoM
- const: fsl,imx6ull
+ - description: i.MX6ULL chargebyte Tarragon Boards
+ items:
+ - enum:
+ - chargebyte,imx6ull-tarragon-master
+ - chargebyte,imx6ull-tarragon-micro
+ - chargebyte,imx6ull-tarragon-slave
+ - chargebyte,imx6ull-tarragon-slavext
+ - const: fsl,imx6ull
+
+ - description: i.MX6ULL DHCOM SoM based Boards
+ items:
+ - enum:
+ - dh,imx6ull-dhcom-drc02
+ - dh,imx6ull-dhcom-pdk2
+ - dh,imx6ull-dhcom-picoitx
+ - const: dh,imx6ull-dhcom-som # The DHCOR is soldered on the DHCOM
+ - const: dh,imx6ull-dhcor-som
+ - const: fsl,imx6ull
+
- description: i.MX6ULL PHYTEC phyBOARD-Segin
items:
- enum:
@@ -605,15 +735,70 @@ properties:
- const: phytec,imx6ull-pcl063 # PHYTEC phyCORE-i.MX 6ULL
- const: fsl,imx6ull
- - description: Kontron N6411 S Board
+ - description: i.MX6ULL PHYTEC phyGATE-Tauri
items:
- - const: kontron,imx6ull-n6411-s
- - const: kontron,imx6ull-n6411-som
+ - enum:
+ - phytec,imx6ull-phygate-tauri-emmc
+ - phytec,imx6ull-phygate-tauri-nand
+ - const: phytec,imx6ull-phygate-tauri # PHYTEC phyGATE-Tauri with i.MX6 ULL
+ - const: phytec,imx6ull-pcl063 # PHYTEC phyCORE-i.MX 6ULL
+ - const: fsl,imx6ull
+
+ - description: i.MX6ULL Boards with Toradex Colibri iMX6ULL Modules
+ items:
+ - enum:
+ - toradex,colibri-imx6ull-aster # Aster Carrier Board
+ - toradex,colibri-imx6ull-eval # Colibri Evaluation Board V3
+ - toradex,colibri-imx6ull-iris # Iris Carrier Board
+ - toradex,colibri-imx6ull-iris-v2 # Iris V2 Carrier Board
+ - const: toradex,colibri-imx6ull # Colibri iMX6ULL Module
+ - const: fsl,imx6ull
+
+ - description: i.MX6ULL Boards with Toradex Colibri iMX6ULL 1GB (eMMC) Module
+ items:
+ - enum:
+ - toradex,colibri-imx6ull-emmc-aster # Aster Carrier Board
+ - toradex,colibri-imx6ull-emmc-eval # Colibri Evaluation B. V3
+ - toradex,colibri-imx6ull-emmc-iris # Iris Carrier Board
+ - toradex,colibri-imx6ull-emmc-iris-v2 # Iris V2 Carrier Board
+ - const: toradex,colibri-imx6ull-emmc # Colibri iMX6ULL 1GB (eMMC) Module
+ - const: fsl,imx6ull
+
+ - description: i.MX6ULL Boards with Toradex Colibri iMX6ULL Wi-Fi / BT Modules
+ items:
+ - enum:
+ - toradex,colibri-imx6ull-wifi-eval # Colibri Eval. B. V3
+ - toradex,colibri-imx6ull-wifi-aster # Aster Carrier Board
+ - toradex,colibri-imx6ull-wifi-iris # Iris Carrier Board
+ - toradex,colibri-imx6ull-wifi-iris-v2 # Iris V2 Carrier Board
+ - const: toradex,colibri-imx6ull-wifi # Colibri iMX6ULL Wi-Fi / BT Module
+ - const: fsl,imx6ull
+
+ - description: Kontron BL i.MX6ULL (N6411 S) Board
+ items:
+ - const: kontron,bl-imx6ull # Kontron BL i.MX6ULL Carrier Board
+ - const: kontron,sl-imx6ull # Kontron SL i.MX6ULL SoM
+ - const: fsl,imx6ull
+
+ - description: TQ Systems TQMa6ULLx SoM on MBa6ULx board
+ items:
+ - enum:
+ - tq,imx6ull-tqma6ull2-mba6ulx
+ - const: tq,imx6ull-tqma6ull2 # MCIMX6Y2
+ - const: fsl,imx6ull
+
+ - description: TQ Systems TQMa6ULLxL SoM on MBa6ULx[L] board
+ items:
+ - enum:
+ - tq,imx6ull-tqma6ull2l-mba6ulx # using LGA adapter
+ - tq,imx6ull-tqma6ull2l-mba6ulxl
+ - const: tq,imx6ull-tqma6ull2l # MCIMX6Y2, LGA SoM variant
- const: fsl,imx6ull
- description: i.MX6ULZ based Boards
items:
- enum:
+ - bsh,imx6ulz-bsh-smm-m2 # i.MX6 ULZ BSH SystemMaster
- fsl,imx6ulz-14x14-evk # i.MX6 ULZ 14x14 EVK Board
- const: fsl,imx6ull # This seems odd. Should be last?
- const: fsl,imx6ulz
@@ -622,6 +807,7 @@ properties:
items:
- enum:
- element14,imx7s-warp # Element14 Warp i.MX7 Board
+ - toradex,colibri-imx7s # Colibri iMX7S Module
- const: fsl,imx7s
- description: i.MX7S Boards with Toradex Colibri iMX7S Module
@@ -629,6 +815,8 @@ properties:
- enum:
- toradex,colibri-imx7s-aster # Module on Aster Carrier Board
- toradex,colibri-imx7s-eval-v3 # Module on Colibri Evaluation Board V3
+ - toradex,colibri-imx7s-iris # Module on Iris Carrier Board
+ - toradex,colibri-imx7s-iris-v2 # Module on Iris Carrier Board V2
- const: toradex,colibri-imx7s
- const: fsl,imx7s
@@ -649,19 +837,13 @@ properties:
- kam,imx7d-flex-concentrator-mfg # Kamstrup OMNIA Flex Concentrator in manufacturing mode
- novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
- remarkable,imx7d-remarkable2 # i.MX7D ReMarkable 2 E-Ink Tablet
+ - storopack,imx7d-smegw01 # Storopack i.MX7D SMEGW01
- technexion,imx7d-pico-dwarf # TechNexion i.MX7D Pico-Dwarf
- technexion,imx7d-pico-hobbit # TechNexion i.MX7D Pico-Hobbit
- technexion,imx7d-pico-nymph # TechNexion i.MX7D Pico-Nymph
- technexion,imx7d-pico-pi # TechNexion i.MX7D Pico-Pi
- - toradex,colibri-imx7d # Colibri iMX7 Dual Module
- - toradex,colibri-imx7d-aster # Colibri iMX7 Dual Module on Aster Carrier Board
- - toradex,colibri-imx7d-emmc # Colibri iMX7 Dual 1GB (eMMC) Module
- - toradex,colibri-imx7d-emmc-aster # Colibri iMX7 Dual 1GB (eMMC) Module on
- # Aster Carrier Board
- - toradex,colibri-imx7d-emmc-eval-v3 # Colibri iMX7 Dual 1GB (eMMC) Module on
- # Colibri Evaluation Board V3
- - toradex,colibri-imx7d-eval-v3 # Colibri iMX7 Dual Module on
- # Colibri Evaluation Board V3
+ - toradex,colibri-imx7d # Colibri iMX7D Module
+ - toradex,colibri-imx7d-emmc # Colibri iMX7D 1GB (eMMC) Module
- zii,imx7d-rmu2 # ZII RMU2 Board
- zii,imx7d-rpu2 # ZII RPU2 Board
- const: fsl,imx7d
@@ -686,16 +868,20 @@ properties:
- description: i.MX7D Boards with Toradex Colibri i.MX7D Module
items:
- enum:
- - toradex,colibri-imx7d-aster # Module on Aster Carrier Board
- - toradex,colibri-imx7d-eval-v3 # Module on Colibri Evaluation Board V3
+ - toradex,colibri-imx7d-aster # Aster Carrier Board
+ - toradex,colibri-imx7d-eval-v3 # Colibri Evaluation Board V3
+ - toradex,colibri-imx7d-iris # Iris Carrier Board
+ - toradex,colibri-imx7d-iris-v2 # Iris Carrier Board V2
- const: toradex,colibri-imx7d
- const: fsl,imx7d
- - description: i.MX7D Boards with Toradex Colibri i.MX7D eMMC Module
+ - description: i.MX7D Boards with Toradex Colibri i.MX7D 1GB (eMMC) Module
items:
- enum:
- toradex,colibri-imx7d-emmc-aster # Module on Aster Carrier Board
- toradex,colibri-imx7d-emmc-eval-v3 # Module on Colibri Evaluation Board V3
+ - toradex,colibri-imx7d-emmc-iris # Module on Iris Carrier Board
+ - toradex,colibri-imx7d-emmc-iris-v2 # Module on Iris Carrier Board V2
- const: toradex,colibri-imx7d-emmc
- const: fsl,imx7d
@@ -711,15 +897,25 @@ properties:
- enum:
- beacon,imx8mm-beacon-kit # i.MX8MM Beacon Development Kit
- boundary,imx8mm-nitrogen8mm # i.MX8MM Nitrogen Board
+ - dmo,imx8mm-data-modul-edm-sbc # i.MX8MM eDM SBC
+ - emtrion,emcon-mx8mm-avari # emCON-MX8MM SoM on Avari Base
- fsl,imx8mm-ddr4-evk # i.MX8MM DDR4 EVK Board
- fsl,imx8mm-evk # i.MX8MM EVK Board
+ - gateworks,imx8mm-gw7904
- gw,imx8mm-gw71xx-0x # i.MX8MM Gateworks Development Kit
- gw,imx8mm-gw72xx-0x # i.MX8MM Gateworks Development Kit
- gw,imx8mm-gw73xx-0x # i.MX8MM Gateworks Development Kit
- gw,imx8mm-gw7901 # i.MX8MM Gateworks Board
- gw,imx8mm-gw7902 # i.MX8MM Gateworks Board
- - kontron,imx8mm-n801x-som # i.MX8MM Kontron SL (N801X) SOM
+ - gw,imx8mm-gw7903 # i.MX8MM Gateworks Board
+ - innocomm,wb15-evk # i.MX8MM Innocomm EVK board with WB15 SoM
+ - kontron,imx8mm-sl # i.MX8MM Kontron SL (N801X) SOM
+ - kontron,imx8mm-osm-s # i.MX8MM Kontron OSM-S (N802X) SOM
+ - toradex,verdin-imx8mm # Verdin iMX8M Mini Modules
+ - toradex,verdin-imx8mm-nonwifi # Verdin iMX8M Mini Modules without Wi-Fi / BT
+ - toradex,verdin-imx8mm-wifi # Verdin iMX8M Mini Wi-Fi / BT Modules
- variscite,var-som-mx8mm # i.MX8MM Variscite VAR-SOM-MX8MM module
+ - prt,prt8mm # i.MX8MM Protonic PRT8MM Board
- const: fsl,imx8mm
- description: Engicam i.Core MX8M Mini SoM based boards
@@ -732,8 +928,41 @@ properties:
- description: Kontron BL i.MX8MM (N801X S) Board
items:
- - const: kontron,imx8mm-n801x-s
- - const: kontron,imx8mm-n801x-som
+ - const: kontron,imx8mm-bl
+ - const: kontron,imx8mm-sl
+ - const: fsl,imx8mm
+
+ - description: Kontron BL i.MX8MM OSM-S (N802X S) Board
+ items:
+ - const: kontron,imx8mm-bl-osm-s
+ - const: kontron,imx8mm-osm-s
+ - const: fsl,imx8mm
+
+ - description: Toradex Boards with Verdin iMX8M Mini Modules
+ items:
+ - enum:
+ - menlo,mx8menlo # Verdin iMX8M Mini Module on i.MX8MM Menlo board
+ - toradex,verdin-imx8mm-nonwifi-dahlia # Verdin iMX8M Mini Module on Dahlia
+ - toradex,verdin-imx8mm-nonwifi-dev # Verdin iMX8M Mini Module on Verdin Development Board
+ - toradex,verdin-imx8mm-nonwifi-yavia # Verdin iMX8M Mini Module on Yavia
+ - const: toradex,verdin-imx8mm-nonwifi # Verdin iMX8M Mini Module without Wi-Fi / BT
+ - const: toradex,verdin-imx8mm # Verdin iMX8M Mini Module
+ - const: fsl,imx8mm
+
+ - description: Toradex Boards with Verdin iMX8M Mini Wi-Fi / BT Modules
+ items:
+ - enum:
+ - toradex,verdin-imx8mm-wifi-dahlia # Verdin iMX8M Mini Wi-Fi / BT Module on Dahlia
+ - toradex,verdin-imx8mm-wifi-dev # Verdin iMX8M Mini Wi-Fi / BT M. on Verdin Development B.
+ - toradex,verdin-imx8mm-wifi-yavia # Verdin iMX8M Mini Wi-Fi / BT Module on Yavia
+ - const: toradex,verdin-imx8mm-wifi # Verdin iMX8M Mini Wi-Fi / BT Module
+ - const: toradex,verdin-imx8mm # Verdin iMX8M Mini Module
+ - const: fsl,imx8mm
+
+ - description: PHYTEC phyCORE-i.MX8MM SoM based boards
+ items:
+ - const: phytec,imx8mm-phyboard-polis-rdk # phyBOARD-Polis RDK
+ - const: phytec,imx8mm-phycore-som # phyCORE-i.MX8MM SoM
- const: fsl,imx8mm
- description: Variscite VAR-SOM-MX8MM based boards
@@ -742,10 +971,25 @@ properties:
- const: variscite,var-som-mx8mm
- const: fsl,imx8mm
+ - description:
+ TQMa8MxML is a series of SOM featuring NXP i.MX8MM system-on-chip
+ variants. It is designed to be soldered on different carrier boards.
+ All variants (TQMa8M[Q,D,S][L]ML) use the same device tree, hence only
+ one compatible is needed.
+ items:
+ - enum:
+ - cloos,imx8mm-phg # i.MX8MM Cloos PHG Board
+ - tq,imx8mm-tqma8mqml-mba8mx # TQ-Systems GmbH i.MX8MM TQMa8MQML SOM on MBa8Mx
+ - const: tq,imx8mm-tqma8mqml # TQ-Systems GmbH i.MX8MM TQMa8MQML SOM
+ - const: fsl,imx8mm
+
- description: i.MX8MN based Boards
items:
- enum:
- beacon,imx8mn-beacon-kit # i.MX8MN Beacon Development Kit
+ - bsh,imx8mn-bsh-smm-s2 # i.MX8MN BSH SystemMaster S2
+ - bsh,imx8mn-bsh-smm-s2pro # i.MX8MN BSH SystemMaster S2 PRO
+ - fsl,imx8mn-ddr3l-evk # i.MX8MN DDR3L EVK Board
- fsl,imx8mn-ddr4-evk # i.MX8MN DDR4 EVK Board
- fsl,imx8mn-evk # i.MX8MN LPDDR4 EVK Board
- gw,imx8mn-gw7902 # i.MX8MM Gateworks Board
@@ -757,10 +1001,51 @@ properties:
- const: variscite,var-som-mx8mn
- const: fsl,imx8mn
+ - description:
+ TQMa8MxNL is a series of SOM featuring NXP i.MX8MN system-on-chip
+ variants. It is designed to be soldered on different carrier boards.
+ All variants (TQMa8M[Q,D,S][L]NL) use the same device tree, hence only
+ one compatible is needed.
+ items:
+ - enum:
+ - tq,imx8mn-tqma8mqnl-mba8mx # TQ-Systems GmbH i.MX8MN TQMa8MQNL SOM on MBa8Mx
+ - const: tq,imx8mn-tqma8mqnl # TQ-Systems GmbH i.MX8MN TQMa8MQNL SOM
+ - const: fsl,imx8mn
+
- description: i.MX8MP based Boards
items:
- enum:
+ - beacon,imx8mp-beacon-kit # i.MX8MP Beacon Development Kit
+ - dmo,imx8mp-data-modul-edm-sbc # i.MX8MP eDM SBC
- fsl,imx8mp-evk # i.MX8MP EVK Board
+ - gateworks,imx8mp-gw74xx # i.MX8MP Gateworks Board
+ - polyhex,imx8mp-debix # Polyhex Debix boards
+ - polyhex,imx8mp-debix-model-a # Polyhex Debix Model A Board
+ - toradex,verdin-imx8mp # Verdin iMX8M Plus Modules
+ - toradex,verdin-imx8mp-nonwifi # Verdin iMX8M Plus Modules without Wi-Fi / BT
+ - toradex,verdin-imx8mp-wifi # Verdin iMX8M Plus Wi-Fi / BT Modules
+ - const: fsl,imx8mp
+
+ - description: Avnet (MSC Branded) Boards with SM2S i.MX8M Plus Modules
+ items:
+ - const: avnet,sm2s-imx8mp-14N0600E-ep1 # SM2S-IMX8PLUS-14N0600E on SM2-MB-EP1 Carrier Board
+ - const: avnet,sm2s-imx8mp-14N0600E # 14N0600E variant of SM2S-IMX8PLUS SoM
+ - const: avnet,sm2s-imx8mp # SM2S-IMX8PLUS SoM
+ - const: fsl,imx8mp
+
+ - description: i.MX8MP DHCOM based Boards
+ items:
+ - enum:
+ - dh,imx8mp-dhcom-pdk2 # i.MX8MP DHCOM SoM on PDK2 board
+ - dh,imx8mp-dhcom-pdk3 # i.MX8MP DHCOM SoM on PDK3 board
+ - const: dh,imx8mp-dhcom-som # i.MX8MP DHCOM SoM
+ - const: fsl,imx8mp
+
+ - description: Engicam i.Core MX8M Plus SoM based boards
+ items:
+ - enum:
+ - engicam,icore-mx8mp-edimm2.2 # i.MX8MP Engicam i.Core MX8M Plus EDIMM2.2 Starter Kit
+ - const: engicam,icore-mx8mp # i.MX8MP Engicam i.Core MX8M Plus SoM
- const: fsl,imx8mp
- description: PHYTEC phyCORE-i.MX8MP SoM based boards
@@ -769,6 +1054,38 @@ properties:
- const: phytec,imx8mp-phycore-som # phyCORE-i.MX8MP SoM
- const: fsl,imx8mp
+ - description: Toradex Boards with Verdin iMX8M Plus Modules
+ items:
+ - enum:
+ - toradex,verdin-imx8mp-nonwifi-dahlia # Verdin iMX8M Plus Module on Dahlia
+ - toradex,verdin-imx8mp-nonwifi-dev # Verdin iMX8M Plus Module on Verdin Development Board
+ - toradex,verdin-imx8mp-nonwifi-yavia # Verdin iMX8M Plus Module on Yavia
+ - const: toradex,verdin-imx8mp-nonwifi # Verdin iMX8M Plus Module without Wi-Fi / BT
+ - const: toradex,verdin-imx8mp # Verdin iMX8M Plus Module
+ - const: fsl,imx8mp
+
+ - description: Toradex Boards with Verdin iMX8M Plus Wi-Fi / BT Modules
+ items:
+ - enum:
+ - toradex,verdin-imx8mp-wifi-dahlia # Verdin iMX8M Plus Wi-Fi / BT Module on Dahlia
+ - toradex,verdin-imx8mp-wifi-dev # Verdin iMX8M Plus Wi-Fi / BT M. on Verdin Development B.
+ - toradex,verdin-imx8mp-wifi-yavia # Verdin iMX8M Plus Wi-Fi / BT Module on Yavia
+ - const: toradex,verdin-imx8mp-wifi # Verdin iMX8M Plus Wi-Fi / BT Module
+ - const: toradex,verdin-imx8mp # Verdin iMX8M Plus Module
+ - const: fsl,imx8mp
+
+ - description:
+ TQMa8MPxL is a series of LGA SOM featuring NXP i.MX8MP system-on-chip
+ variants. It is designed to be soldered on different carrier boards.
+ All CPU variants use the same device tree hence only one compatible
+ is needed. MBa8MPxL mainboard can be used as starterkit or in a boxed
+ version as an industrial computing device.
+ items:
+ - enum:
+ - tq,imx8mp-tqma8mpql-mba8mpxl # TQ-Systems GmbH i.MX8MP TQMa8MPQL SOM on MBa8MPxL
+ - const: tq,imx8mp-tqma8mpql # TQ-Systems GmbH i.MX8MP TQMa8MPQL SOM
+ - const: fsl,imx8mp
+
- description: i.MX8MQ based Boards
items:
- enum:
@@ -778,12 +1095,17 @@ properties:
- fsl,imx8mq-evk # i.MX8MQ EVK Board
- google,imx8mq-phanbell # Google Coral Edge TPU
- kontron,pitx-imx8m # Kontron pITX-imx8m Board
- - mntre,reform2 # MNT Reform2 Laptop
- purism,librem5-devkit # Purism Librem5 devkit
- solidrun,hummingboard-pulse # SolidRun Hummingboard Pulse
- technexion,pico-pi-imx8m # TechNexion PICO-PI-8M evk
- const: fsl,imx8mq
+ - description: i.MX8MQ NITROGEN SoM based Boards
+ items:
+ - const: mntre,reform2 # MNT Reform2 Laptop
+ - const: boundary,imx8mq-nitrogen8m-som # i.MX8MQ NITROGEN SoM
+ - const: fsl,imx8mq
+
- description: Purism Librem5 phones
items:
- enum:
@@ -793,6 +1115,15 @@ properties:
- const: purism,librem5
- const: fsl,imx8mq
+ - description:
+ TQMa8Mx is a series of SOM featuring NXP i.MX8MQ system-on-chip
+ variants. It is designed to be clicked on different carrier boards.
+ items:
+ - enum:
+ - tq,imx8mq-tqma8mq-mba8mx # TQ-Systems GmbH i.MX8MQ TQMa8Mx SOM on MBa8Mx
+ - const: tq,imx8mq-tqma8mq # TQ-Systems GmbH i.MX8MQ TQMa8Mx SOM
+ - const: fsl,imx8mq
+
- description: Zodiac Inflight Innovations Ultra Boards
items:
- enum:
@@ -805,6 +1136,25 @@ properties:
items:
- enum:
- fsl,imx8qm-mek # i.MX8QM MEK Board
+ - toradex,apalis-imx8 # Apalis iMX8 Modules
+ - toradex,apalis-imx8-v1.1 # Apalis iMX8 V1.1 Modules
+ - const: fsl,imx8qm
+
+ - description: i.MX8QM Boards with Toradex Apalis iMX8 Modules
+ items:
+ - enum:
+ - toradex,apalis-imx8-eval # Apalis iMX8 Module on Apalis Evaluation Board
+ - toradex,apalis-imx8-ixora-v1.1 # Apalis iMX8 Module on Ixora V1.1 Carrier Board
+ - const: toradex,apalis-imx8
+ - const: fsl,imx8qm
+
+ - description: i.MX8QM Boards with Toradex Apalis iMX8 V1.1 Modules
+ items:
+ - enum:
+ - toradex,apalis-imx8-v1.1-eval # Apalis iMX8 V1.1 Module on Apalis Eval. Board
+ - toradex,apalis-imx8-v1.1-ixora-v1.1 # Apalis iMX8 V1.1 Module on Ixora V1.1 C. Board
+ - toradex,apalis-imx8-v1.1-ixora-v1.2 # Apalis iMX8 V1.1 Module on Ixora V1.2 C. Board
+ - const: toradex,apalis-imx8-v1.1
- const: fsl,imx8qm
- description: i.MX8QXP based Boards
@@ -812,16 +1162,49 @@ properties:
- enum:
- einfochips,imx8qxp-ai_ml # i.MX8QXP AI_ML Board
- fsl,imx8qxp-mek # i.MX8QXP MEK Board
- - toradex,colibri-imx8x # Colibri iMX8X Module
+ - toradex,colibri-imx8x # Colibri iMX8X Modules
- const: fsl,imx8qxp
- - description: Toradex Colibri i.MX8 Evaluation Board
+ - description: i.MX8DXL based Boards
items:
- enum:
+ - fsl,imx8dxl-evk # i.MX8DXL EVK Board
+ - const: fsl,imx8dxl
+
+ - description: i.MX8QXP Boards with Toradex Colibri iMX8X Modules
+ items:
+ - enum:
+ - toradex,colibri-imx8x-aster # Colibri iMX8X Module on Aster Board
- toradex,colibri-imx8x-eval-v3 # Colibri iMX8X Module on Colibri Evaluation Board V3
+ - toradex,colibri-imx8x-iris # Colibri iMX8X Module on Iris Board
+ - toradex,colibri-imx8x-iris-v2 # Colibri iMX8X Module on Iris Board V2
- const: toradex,colibri-imx8x
- const: fsl,imx8qxp
+ - description: i.MX8ULP based Boards
+ items:
+ - enum:
+ - fsl,imx8ulp-evk # i.MX8ULP EVK Board
+ - const: fsl,imx8ulp
+
+ - description: i.MX93 based Boards
+ items:
+ - enum:
+ - fsl,imx93-11x11-evk # i.MX93 11x11 EVK Board
+ - const: fsl,imx93
+
+ - description: i.MXRT1050 based Boards
+ items:
+ - enum:
+ - fsl,imxrt1050-evk # i.MXRT1050 EVK Board
+ - const: fsl,imxrt1050
+
+ - description: i.MXRT1170 based Boards
+ items:
+ - enum:
+ - fsl,imxrt1170-evk # i.MXRT1170 EVK Board
+ - const: fsl,imxrt1170
+
- description:
Freescale Vybrid Platform Device Tree Bindings
@@ -847,9 +1230,10 @@ properties:
- description: VF610 based Boards
items:
- enum:
+ - fsl,vf610-twr # VF610 Tower Board
- lwn,bk4 # Liebherr BK4 controller
- phytec,vf610-cosmic # PHYTEC Cosmic/Cosmic+ Board
- - fsl,vf610-twr # VF610 Tower Board
+ - toradex,vf610-colibri_vf61 # Colibri VF61 Modules
- const: fsl,vf610
- description: Toradex Colibri VF61 Module on Colibri Evaluation Board
@@ -884,8 +1268,10 @@ properties:
- description: LS1021A based Boards
items:
- enum:
+ - fsl,ls1021a-iot
- fsl,ls1021a-moxa-uc-8410a
- fsl,ls1021a-qds
+ - fsl,ls1021a-tsn
- fsl,ls1021a-twr
- const: fsl,ls1021a
@@ -977,6 +1363,8 @@ properties:
- description: LX2160A based Boards
items:
- enum:
+ - fsl,lx2160a-bluebox3
+ - fsl,lx2160a-bluebox3-rev-a
- fsl,lx2160a-qds
- fsl,lx2160a-rdb
- fsl,lx2162a-qds
@@ -990,6 +1378,13 @@ properties:
- const: solidrun,lx2160a-cex7
- const: fsl,lx2160a
+ - description: S32G2 based Boards
+ items:
+ - enum:
+ - nxp,s32g274a-evb
+ - nxp,s32g274a-rdb2
+ - const: nxp,s32g2
+
- description: S32V234 based Boards
items:
- enum:
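Each multi-entry items list in the schema above encodes an ordered compatible chain that a board device tree must reproduce verbatim, most specific entry first. A hedged sketch for the TQMa6Q (variant A) entry added in this change:

	/dts-v1/;
	/ {
		model = "TQ-Systems TQMa6Q SoM (variant A) on MBa6x";
		/* order matches the schema: board, bootloader alias, SoM, SoC */
		compatible = "tq,imx6q-mba6x-a", "tq,mba6a",
			     "tq,imx6q-tqma6q-a", "fsl,imx6q";
	};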
diff --git a/Documentation/devicetree/bindings/arm/fw-cfg.txt b/Documentation/devicetree/bindings/arm/fw-cfg.txt
deleted file mode 100644
index fd54e1db2156..000000000000
--- a/Documentation/devicetree/bindings/arm/fw-cfg.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-* QEMU Firmware Configuration bindings for ARM
-
-QEMU's arm-softmmu and aarch64-softmmu emulation / virtualization targets
-provide the following Firmware Configuration interface on the "virt" machine
-type:
-
-- A write-only, 16-bit wide selector (or control) register,
-- a read-write, 64-bit wide data register.
-
-QEMU exposes the control and data register to ARM guests as memory mapped
-registers; their location is communicated to the guest's UEFI firmware in the
-DTB that QEMU places at the bottom of the guest's DRAM.
-
-The authoritative guest-side hardware interface documentation to the fw_cfg
-device can be found in "docs/specs/fw_cfg.txt" in the QEMU source tree.
-
-
-Required properties:
-
-- compatible: "qemu,fw-cfg-mmio".
-
-- reg: the MMIO region used by the device.
- * Bytes 0x0 to 0x7 cover the data register.
- * Bytes 0x8 to 0x9 cover the selector register.
- * Further registers may be appended to the region in case of future interface
- revisions / feature bits.
-
-Example:
-
-/ {
- #size-cells = <0x2>;
- #address-cells = <0x2>;
-
- fw-cfg@9020000 {
- compatible = "qemu,fw-cfg-mmio";
- reg = <0x0 0x9020000 0x0 0xa>;
- };
-};
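The 0xa-byte region length follows directly from the layout above: 8 bytes of data register (0x0-0x7) plus 2 bytes of selector (0x8-0x9). A sketch of the same node under single-cell addressing, assuming the usual 32-bit #address-cells/#size-cells:

	/ {
		#address-cells = <1>;
		#size-cells = <1>;

		fw-cfg@9020000 {
			compatible = "qemu,fw-cfg-mmio";
			/* 8-byte data register + 2-byte selector = 0xa bytes */
			reg = <0x9020000 0xa>;
		};
	};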
diff --git a/Documentation/devicetree/bindings/arm/hisilicon/controller/hip04-bootwrapper.yaml b/Documentation/devicetree/bindings/arm/hisilicon/controller/hip04-bootwrapper.yaml
index 7378159e61df..483caf0ce25b 100644
--- a/Documentation/devicetree/bindings/arm/hisilicon/controller/hip04-bootwrapper.yaml
+++ b/Documentation/devicetree/bindings/arm/hisilicon/controller/hip04-bootwrapper.yaml
@@ -17,14 +17,15 @@ properties:
- const: hisilicon,hip04-bootwrapper
boot-method:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
description: |
Address and size of boot method.
[0]: bootwrapper physical address
[1]: bootwrapper size
[2]: relocation physical address
[3]: relocation size
- minItems: 1
- maxItems: 2
+ minItems: 2
+ maxItems: 4
required:
- compatible
diff --git a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.yaml b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.yaml
index b38458022946..540876322040 100644
--- a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.yaml
+++ b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/hisilicon/hisilicon.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Hisilicon Platforms Device Tree Bindings
+title: Hisilicon Platforms
maintainers:
- Wei Xu <xuwei5@hisilicon.com>
diff --git a/Documentation/devicetree/bindings/arm/hpe,gxp.yaml b/Documentation/devicetree/bindings/arm/hpe,gxp.yaml
new file mode 100644
index 000000000000..224bbcb93f95
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/hpe,gxp.yaml
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/hpe,gxp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: HPE BMC GXP platforms
+
+maintainers:
+ - Nick Hawkins <nick.hawkins@hpe.com>
+ - Jean-Marie Verdun <verdun@hpe.com>
+
+properties:
+ compatible:
+ oneOf:
+ - description: GXP Based Boards
+ items:
+ - enum:
+ - hpe,gxp-dl360gen10
+ - const: hpe,gxp
+
+required:
+ - compatible
+
+additionalProperties: true
+
+...
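For reference, a minimal board tree matching this schema chains the board compatible with the platform fallback; a hedged sketch (the model string is illustrative):

	/dts-v1/;
	/ {
		model = "HPE ProLiant DL360 Gen10";
		compatible = "hpe,gxp-dl360gen10", "hpe,gxp";
	};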
diff --git a/Documentation/devicetree/bindings/arm/idle-states.yaml b/Documentation/devicetree/bindings/arm/idle-states.yaml
deleted file mode 100644
index 52bce5dbb11f..000000000000
--- a/Documentation/devicetree/bindings/arm/idle-states.yaml
+++ /dev/null
@@ -1,661 +0,0 @@
-# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/idle-states.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: ARM idle states binding description
-
-maintainers:
- - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
-
-description: |+
- ==========================================
- 1 - Introduction
- ==========================================
-
- ARM systems contain HW capable of managing power consumption dynamically,
- where cores can be put in different low-power states (ranging from simple wfi
- to power gating) according to OS PM policies. The CPU states representing the
- range of dynamic idle states that a processor can enter at run-time, can be
- specified through device tree bindings representing the parameters required to
- enter/exit specific idle states on a given processor.
-
- According to the Server Base System Architecture document (SBSA, [3]), the
- power states an ARM CPU can be put into are identified by the following list:
-
- - Running
- - Idle_standby
- - Idle_retention
- - Sleep
- - Off
-
- The power states described in the SBSA document define the basic CPU states on
- top of which ARM platforms implement power management schemes that allow an OS
- PM implementation to put the processor in different idle states (which include
- states listed above; "off" state is not an idle state since it does not have
- wake-up capabilities, hence it is not considered in this document).
-
- Idle state parameters (e.g. entry latency) are platform specific and need to
- be characterized with bindings that provide the required information to OS PM
- code so that it can build the required tables and use them at runtime.
-
- The device tree binding definition for ARM idle states is the subject of this
- document.
-
- ===========================================
- 2 - idle-states definitions
- ===========================================
-
- Idle states are characterized for a specific system through a set of
- timing and energy related properties, that underline the HW behaviour
- triggered upon idle states entry and exit.
-
- The following diagram depicts the CPU execution phases and related timing
- properties required to enter and exit an idle state:
-
- ..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
- | | | | |
-
- |<------ entry ------->|
- | latency |
- |<- exit ->|
- | latency |
- |<-------- min-residency -------->|
- |<------- wakeup-latency ------->|
-
- Diagram 1: CPU idle state execution phases
-
- EXEC: Normal CPU execution.
-
- PREP: Preparation phase before committing the hardware to idle mode
- like cache flushing. This is abortable on pending wake-up
- event conditions. The abort latency is assumed to be negligible
- (i.e. less than the ENTRY + EXIT duration). If aborted, CPU
- goes back to EXEC. This phase is optional. If not abortable,
- this should be included in the ENTRY phase instead.
-
- ENTRY: The hardware is committed to idle mode. This period must run
- to completion up to IDLE before anything else can happen.
-
- IDLE: This is the actual energy-saving idle period. This may last
- between 0 and infinite time, until a wake-up event occurs.
-
- EXIT: Period during which the CPU is brought back to operational
- mode (EXEC).
-
- entry-latency: Worst case latency required to enter the idle state. The
- exit-latency may be guaranteed only after entry-latency has passed.
-
- min-residency: Minimum period, including preparation and entry, for a given
- idle state to be worthwhile energywise.
-
- wakeup-latency: Maximum delay between the signaling of a wake-up event and the
- CPU being able to execute normal code again. If not specified, this is assumed
- to be entry-latency + exit-latency.
-
- These timing parameters can be used by an OS in different circumstances.
-
- An idle CPU requires the expected min-residency time to select the most
- appropriate idle state based on the expected expiry time of the next IRQ
- (i.e. wake-up) that causes the CPU to return to the EXEC phase.
-
- An operating system scheduler may need to compute the shortest wake-up delay
-for CPUs in the system by detecting how long it will take to get a CPU out
- of an idle state, e.g.:
-
- wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
-
- In other words, the scheduler can make its scheduling decision by selecting
- (e.g. waking-up) the CPU with the shortest wake-up delay.
- The wake-up delay must take into account the entry latency if that period
- has not expired. The abortable nature of the PREP period can be ignored
- if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
- the worst case since it depends on the CPU operating conditions, i.e. caches
- state).
-
- An OS has to reliably probe the wakeup-latency since some devices can enforce
- latency constraint guarantees to work properly, so the OS has to detect the
- worst case wake-up latency it can incur if a CPU is allowed to enter an
- idle state, and possibly prevent entry in order to guarantee reliable device
- functioning.
-
- The min-residency time parameter deserves further explanation since it is
- expressed in time units but must factor in energy consumption coefficients.
-
- The energy consumption of a cpu when it enters a power state can be roughly
- characterised by the following graph:
-
- |
- |
- |
- e |
- n | /---
- e | /------
- r | /------
- g | /-----
- y | /------
- | ----
- | /|
- | / |
- | / |
- | / |
- | / |
- | / |
- |/ |
- -----|-------+----------------------------------
- 0| 1 time(ms)
-
- Graph 1: Energy vs time example
-
- The graph is split in two parts delimited by time 1ms on the X-axis.
- The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
- and denotes the energy costs incurred while entering and leaving the idle
- state.
- The graph curve in the area delimited by X-axis values = {x | x > 1ms } has
- shallower slope and essentially represents the energy consumption of the idle
- state.
-
- min-residency is defined for a given idle state as the minimum expected
- residency time for a state (inclusive of preparation and entry) after
-which choosing that state becomes the most energy efficient option. A good
-way to visualise this is by taking the same graph above and comparing the
-energy consumption plots of several states.
-
- For sake of simplicity, let's consider a system with two idle states IDLE1,
- and IDLE2:
-
- |
- |
- |
- | /-- IDLE1
- e | /---
- n | /----
- e | /---
- r | /-----/--------- IDLE2
- g | /-------/---------
- y | ------------ /---|
- | / /---- |
- | / /--- |
- | / /---- |
- | / /--- |
- | --- |
- | / |
- | / |
- |/ | time
- ---/----------------------------+------------------------
- |IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
- |
- IDLE2-min-residency
-
- Graph 2: idle states min-residency example
-
- In graph 2 above, which takes into account idle states entry/exit energy
- costs, it is clear that if the idle state residency time (i.e. time till next
- wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
- choice energywise.
-
- This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
- than IDLE2.
-
- However, the lower power consumption (i.e. shallower energy curve slope) of
- idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
- efficient.
-
- The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
- shallower states in a system with multiple idle states) is defined as
- IDLE2-min-residency and corresponds to the time when energy consumption of
- IDLE1 and IDLE2 states breaks even.
-
- The definitions provided in this section underpin the idle states
- properties specification that is the subject of the following sections.
-
- ===========================================
- 3 - idle-states node
- ===========================================
-
- ARM processor idle states are defined within the idle-states node, which is
- a direct child of the cpus node [1] and provides a container where the
- processor idle states, defined as device tree nodes, are listed.
-
- On ARM systems, it is a container of processor idle states nodes. If the
- system does not provide CPU power management capabilities, or the processor
- just supports idle_standby, an idle-states node is not required.
-
- ===========================================
- 4 - References
- ===========================================
-
- [1] ARM Linux Kernel documentation - CPUs bindings
- Documentation/devicetree/bindings/arm/cpus.yaml
-
- [2] ARM Linux Kernel documentation - PSCI bindings
- Documentation/devicetree/bindings/arm/psci.yaml
-
- [3] ARM Server Base System Architecture (SBSA)
- http://infocenter.arm.com/help/index.jsp
-
- [4] ARM Architecture Reference Manuals
- http://infocenter.arm.com/help/index.jsp
-
- [6] ARM Linux Kernel documentation - Booting AArch64 Linux
- Documentation/arm64/booting.rst
-
-properties:
- $nodename:
- const: idle-states
-
- entry-method:
- description: |
- Usage and definition depend on ARM architecture version.
-
- On ARM v8 64-bit this property is required.
- On ARM 32-bit systems this property is optional.
-
- This assumes that the "enable-method" property is set to "psci" in the cpu
- node[6] that is responsible for setting up CPU idle management in the OS
- implementation.
- const: psci
-
-patternProperties:
- "^(cpu|cluster)-":
- type: object
- description: |
- Each state node represents an idle state description and must be defined
- as follows.
-
- The idle state entered by executing the wfi instruction (idle_standby
- SBSA,[3][4]) is considered standard on all ARM platforms and therefore
- must not be listed.
-
- In addition to the properties listed above, a state node may require
- additional properties specific to the entry-method defined in the
- idle-states node. Please refer to the entry-method bindings
- documentation for properties definitions.
-
- properties:
- compatible:
- const: arm,idle-state
-
- local-timer-stop:
- description:
- If present the CPU local timer control logic is
- lost on state entry, otherwise it is retained.
- type: boolean
-
- entry-latency-us:
- description:
- Worst case latency in microseconds required to enter the idle state.
-
- exit-latency-us:
- description:
- Worst case latency in microseconds required to exit the idle state.
- The exit-latency-us duration may be guaranteed only after
- entry-latency-us has passed.
-
- min-residency-us:
- description:
- Minimum residency duration in microseconds, inclusive of preparation
- and entry, for this idle state to be considered worthwhile energy wise
- (refer to section 2 of this document for a complete description).
-
- wakeup-latency-us:
- description: |
- Maximum delay between the signaling of a wake-up event and the CPU
- being able to execute normal code again. If omitted, this is assumed
- to be equal to:
-
- entry-latency-us + exit-latency-us
-
- It is important to supply this value on systems where the duration of
- PREP phase (see diagram 1, section 2) is non-negligible. In such
- systems entry-latency-us + exit-latency-us will exceed
- wakeup-latency-us by this duration.
-
- idle-state-name:
- $ref: /schemas/types.yaml#/definitions/string
- description:
- A string used as a descriptive name for the idle state.
-
- required:
- - compatible
- - entry-latency-us
- - exit-latency-us
- - min-residency-us
-
-additionalProperties: false
-
-examples:
- - |
-
- cpus {
- #size-cells = <0>;
- #address-cells = <2>;
-
- cpu@0 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x0>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@1 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x1>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@100 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@101 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@10000 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10000>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@10001 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10001>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@10100 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@10101 {
- device_type = "cpu";
- compatible = "arm,cortex-a57";
- reg = <0x0 0x10101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_0_0 &CPU_SLEEP_0_0
- &CLUSTER_RETENTION_0 &CLUSTER_SLEEP_0>;
- };
-
- cpu@100000000 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x0>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100000001 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x1>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100000100 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100000101 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100010000 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10000>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100010001 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10001>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100010100 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10100>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- cpu@100010101 {
- device_type = "cpu";
- compatible = "arm,cortex-a53";
- reg = <0x1 0x10101>;
- enable-method = "psci";
- cpu-idle-states = <&CPU_RETENTION_1_0 &CPU_SLEEP_1_0
- &CLUSTER_RETENTION_1 &CLUSTER_SLEEP_1>;
- };
-
- idle-states {
- entry-method = "psci";
-
- CPU_RETENTION_0_0: cpu-retention-0-0 {
- compatible = "arm,idle-state";
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <20>;
- exit-latency-us = <40>;
- min-residency-us = <80>;
- };
-
- CLUSTER_RETENTION_0: cluster-retention-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <50>;
- exit-latency-us = <100>;
- min-residency-us = <250>;
- wakeup-latency-us = <130>;
- };
-
- CPU_SLEEP_0_0: cpu-sleep-0-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <250>;
- exit-latency-us = <500>;
- min-residency-us = <950>;
- };
-
- CLUSTER_SLEEP_0: cluster-sleep-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <600>;
- exit-latency-us = <1100>;
- min-residency-us = <2700>;
- wakeup-latency-us = <1500>;
- };
-
- CPU_RETENTION_1_0: cpu-retention-1-0 {
- compatible = "arm,idle-state";
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <20>;
- exit-latency-us = <40>;
- min-residency-us = <90>;
- };
-
- CLUSTER_RETENTION_1: cluster-retention-1 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <50>;
- exit-latency-us = <100>;
- min-residency-us = <270>;
- wakeup-latency-us = <100>;
- };
-
- CPU_SLEEP_1_0: cpu-sleep-1-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x0010000>;
- entry-latency-us = <70>;
- exit-latency-us = <100>;
- min-residency-us = <300>;
- wakeup-latency-us = <150>;
- };
-
- CLUSTER_SLEEP_1: cluster-sleep-1 {
- compatible = "arm,idle-state";
- local-timer-stop;
- arm,psci-suspend-param = <0x1010000>;
- entry-latency-us = <500>;
- exit-latency-us = <1200>;
- min-residency-us = <3500>;
- wakeup-latency-us = <1300>;
- };
- };
- };
-
- - |
- // Example 2 (ARM 32-bit, 8-cpu system, two clusters):
-
- cpus {
- #size-cells = <0>;
- #address-cells = <1>;
-
- cpu@0 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x0>;
- cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
- };
-
- cpu@1 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x1>;
- cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
- };
-
- cpu@2 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x2>;
- cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
- };
-
- cpu@3 {
- device_type = "cpu";
- compatible = "arm,cortex-a15";
- reg = <0x3>;
- cpu-idle-states = <&cpu_sleep_0_0 &cluster_sleep_0>;
- };
-
- cpu@100 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x100>;
- cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
- };
-
- cpu@101 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x101>;
- cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
- };
-
- cpu@102 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x102>;
- cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
- };
-
- cpu@103 {
- device_type = "cpu";
- compatible = "arm,cortex-a7";
- reg = <0x103>;
- cpu-idle-states = <&cpu_sleep_1_0 &cluster_sleep_1>;
- };
-
- idle-states {
- cpu_sleep_0_0: cpu-sleep-0-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <200>;
- exit-latency-us = <100>;
- min-residency-us = <400>;
- wakeup-latency-us = <250>;
- };
-
- cluster_sleep_0: cluster-sleep-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <500>;
- exit-latency-us = <1500>;
- min-residency-us = <2500>;
- wakeup-latency-us = <1700>;
- };
-
- cpu_sleep_1_0: cpu-sleep-1-0 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <300>;
- exit-latency-us = <500>;
- min-residency-us = <900>;
- wakeup-latency-us = <600>;
- };
-
- cluster_sleep_1: cluster-sleep-1 {
- compatible = "arm,idle-state";
- local-timer-stop;
- entry-latency-us = <800>;
- exit-latency-us = <2000>;
- min-residency-us = <6500>;
- wakeup-latency-us = <2300>;
- };
- };
- };
-
-...
diff --git a/Documentation/devicetree/bindings/arm/intel,keembay.yaml b/Documentation/devicetree/bindings/arm/intel,keembay.yaml
index 107e686ab207..53d2ce02b207 100644
--- a/Documentation/devicetree/bindings/arm/intel,keembay.yaml
+++ b/Documentation/devicetree/bindings/arm/intel,keembay.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/intel,keembay.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Keem Bay platform device tree bindings
+title: Keem Bay platform
maintainers:
- Paul J. Murphy <paul.j.murphy@intel.com>
diff --git a/Documentation/devicetree/bindings/arm/intel,socfpga.yaml b/Documentation/devicetree/bindings/arm/intel,socfpga.yaml
new file mode 100644
index 000000000000..4b4dcf551eb6
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/intel,socfpga.yaml
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/intel,socfpga.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Intel SoCFPGA platform
+
+maintainers:
+ - Dinh Nguyen <dinguyen@kernel.org>
+
+properties:
+ $nodename:
+ const: "/"
+ compatible:
+ oneOf:
+ - description: AgileX boards
+ items:
+ - enum:
+ - intel,n5x-socdk
+ - intel,socfpga-agilex-n6000
+ - intel,socfpga-agilex-socdk
+ - const: intel,socfpga-agilex
+
+additionalProperties: true
+
+...
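For orientation, a board device tree matching the AgileX branch above sets its root-node compatible most-specific first, exactly as the items list requires (a minimal sketch; the model string is illustrative):

    / {
        compatible = "intel,socfpga-agilex-socdk", "intel,socfpga-agilex";
        model = "Intel SoCFPGA Agilex SoCDK";
        #address-cells = <2>;
        #size-cells = <2>;
    };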
diff --git a/Documentation/devicetree/bindings/arm/intel-ixp4xx.yaml b/Documentation/devicetree/bindings/arm/intel-ixp4xx.yaml
index 230bffeec0e5..553dcbc70e35 100644
--- a/Documentation/devicetree/bindings/arm/intel-ixp4xx.yaml
+++ b/Documentation/devicetree/bindings/arm/intel-ixp4xx.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/intel-ixp4xx.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Intel IXP4xx Device Tree Bindings
+title: Intel IXP4xx
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
diff --git a/Documentation/devicetree/bindings/arm/keystone/ti,k3-sci-common.yaml b/Documentation/devicetree/bindings/arm/keystone/ti,k3-sci-common.yaml
index 5cbcacaeb441..ff378d5cbd32 100644
--- a/Documentation/devicetree/bindings/arm/keystone/ti,k3-sci-common.yaml
+++ b/Documentation/devicetree/bindings/arm/keystone/ti,k3-sci-common.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/keystone/ti,k3-sci-common.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Common K3 TI-SCI bindings
+title: Common K3 TI-SCI
maintainers:
- Nishanth Menon <nm@ti.com>
diff --git a/Documentation/devicetree/bindings/arm/keystone/ti,sci.yaml b/Documentation/devicetree/bindings/arm/keystone/ti,sci.yaml
index 34f5f877d444..91b96065f7df 100644
--- a/Documentation/devicetree/bindings/arm/keystone/ti,sci.yaml
+++ b/Documentation/devicetree/bindings/arm/keystone/ti,sci.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/keystone/ti,sci.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: TI-SCI controller device node bindings
+title: TI-SCI controller
maintainers:
- Nishanth Menon <nm@ti.com>
diff --git a/Documentation/devicetree/bindings/arm/linux,dummy-virt.yaml b/Documentation/devicetree/bindings/arm/linux,dummy-virt.yaml
new file mode 100644
index 000000000000..c7c5eb48fc7e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/linux,dummy-virt.yaml
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/linux,dummy-virt.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: QEMU virt machine
+
+maintainers:
+ - Rob Herring <robh@kernel.org>
+
+properties:
+ $nodename:
+ const: "/"
+ compatible:
+ const: linux,dummy-virt
+
+additionalProperties: true
+
+...
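The schema is intentionally minimal: QEMU's virt machine synthesizes its own device tree at run time, so the only stable contract is the root compatible. A matching root node would be (sketch only):

    / {
        compatible = "linux,dummy-virt";
        #address-cells = <2>;
        #size-cells = <2>;
    };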
diff --git a/Documentation/devicetree/bindings/arm/marvell/ap80x-system-controller.txt b/Documentation/devicetree/bindings/arm/marvell/ap80x-system-controller.txt
index 052a967c1f28..c83245065d44 100644
--- a/Documentation/devicetree/bindings/arm/marvell/ap80x-system-controller.txt
+++ b/Documentation/devicetree/bindings/arm/marvell/ap80x-system-controller.txt
@@ -72,7 +72,7 @@ mpp19 19 gpio, uart0(rxd), sdio(pw_off)
GPIO:
-----
For common binding part and usage, refer to
-Documentation/devicetree/bindings/gpio/gpio-mvebu.txt.
+Documentation/devicetree/bindings/gpio/gpio-mvebu.yaml.
Required properties:
diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
index f6d6642d81c0..29fa93dad52b 100644
--- a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
+++ b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.txt
@@ -1,21 +1,3 @@
-Marvell Armada 37xx Platforms Device Tree Bindings
---------------------------------------------------
-
-Boards using a SoC of the Marvell Armada 37xx family must carry the
-following root node property:
-
- - compatible: must contain "marvell,armada3710"
-
-In addition, boards using the Marvell Armada 3720 SoC shall have the
-following property before the previous one:
-
- - compatible: must contain "marvell,armada3720"
-
-Example:
-
-compatible = "marvell,armada-3720-db", "marvell,armada3720", "marvell,armada3710";
-
-
Power management
----------------
@@ -48,11 +30,3 @@ avs: avs@11500 {
compatible = "marvell,armada-3700-avs", "syscon";
reg = <0x11500 0x40>;
}
-
-
-CZ.NIC's Turris Mox SOHO router Device Tree Bindings
-----------------------------------------------------
-
-Required root node property:
-
- - compatible: must contain "cznic,turris-mox"
diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-37xx.yaml b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.yaml
new file mode 100644
index 000000000000..6905d29f3108
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/marvell/armada-37xx.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/marvell/armada-37xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell Armada 37xx Platforms
+
+maintainers:
+ - Robert Marko <robert.marko@sartura.hr>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: Armada 3720 SoC boards
+ items:
+ - enum:
+ - cznic,turris-mox
+ - globalscale,espressobin
+ - marvell,armada-3720-db
+ - methode,edpu
+ - methode,udpu
+ - const: marvell,armada3720
+ - const: marvell,armada3710
+
+ - description: Globalscale Espressobin boards
+ items:
+ - enum:
+ - globalscale,espressobin-emmc
+ - globalscale,espressobin-ultra
+ - globalscale,espressobin-v7
+ - const: globalscale,espressobin
+ - const: marvell,armada3720
+ - const: marvell,armada3710
+
+ - description: Globalscale Espressobin V7 boards
+ items:
+ - enum:
+ - globalscale,espressobin-v7-emmc
+ - const: globalscale,espressobin-v7
+ - const: globalscale,espressobin
+ - const: marvell,armada3720
+ - const: marvell,armada3710
+
+additionalProperties: true
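The three oneOf branches nest: an Espressobin V7 eMMC board matches the third branch, which embeds the compatibles of the second and first. Spelled out as a root-node property, derived directly from the enums above:

    / {
        compatible = "globalscale,espressobin-v7-emmc",
                     "globalscale,espressobin-v7",
                     "globalscale,espressobin",
                     "marvell,armada3720",
                     "marvell,armada3710";
    };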
diff --git a/Documentation/devicetree/bindings/arm/marvell/armada-7k-8k.yaml b/Documentation/devicetree/bindings/arm/marvell/armada-7k-8k.yaml
index e9bf3054529f..52d78521e412 100644
--- a/Documentation/devicetree/bindings/arm/marvell/armada-7k-8k.yaml
+++ b/Documentation/devicetree/bindings/arm/marvell/armada-7k-8k.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/marvell/armada-7k-8k.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Marvell Armada 7K/8K Platforms Device Tree Bindings
+title: Marvell Armada 7K/8K Platforms
maintainers:
- Gregory CLEMENT <gregory.clement@bootlin.com>
diff --git a/Documentation/devicetree/bindings/arm/marvell/cp110-system-controller.txt b/Documentation/devicetree/bindings/arm/marvell/cp110-system-controller.txt
index 0705e765f432..d84105c7c935 100644
--- a/Documentation/devicetree/bindings/arm/marvell/cp110-system-controller.txt
+++ b/Documentation/devicetree/bindings/arm/marvell/cp110-system-controller.txt
@@ -156,7 +156,7 @@ GPIO:
-----
For common binding part and usage, refer to
-Documentation/devicetree/bindings/gpio/gpio-mvebu.txt.
+Documentation/devicetree/bindings/gpio/gpio-mvebu.yaml.
Required properties:
diff --git a/Documentation/devicetree/bindings/arm/marvell/marvell,ac5.yaml b/Documentation/devicetree/bindings/arm/marvell/marvell,ac5.yaml
new file mode 100644
index 000000000000..8960fb8b2b2f
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/marvell/marvell,ac5.yaml
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/marvell/marvell,ac5.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell Alleycat5/5X Platforms
+
+maintainers:
+ - Chris Packham <chris.packham@alliedtelesis.co.nz>
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: Alleycat5 (98DX25xx) Reference Design
+ items:
+ - enum:
+ - marvell,rd-ac5
+ - const: marvell,ac5
+
+ - description: Alleycat5X (98DX35xx) Reference Design
+ items:
+ - enum:
+ - marvell,rd-ac5x
+ - const: marvell,ac5x
+ - const: marvell,ac5
+
+additionalProperties: true
+
+...
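As in the Armada 37xx schema, the Alleycat5X branch chains down to the plain Alleycat5 compatible, so an RD-AC5X board declares (per the enums above):

    / {
        compatible = "marvell,rd-ac5x", "marvell,ac5x", "marvell,ac5";
    };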
diff --git a/Documentation/devicetree/bindings/arm/mediatek.yaml b/Documentation/devicetree/bindings/arm/mediatek.yaml
index 80a05f6fee85..ae12b1cab9fb 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/mediatek.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: MediaTek SoC based Platforms Device Tree Bindings
+title: MediaTek SoC based Platforms
maintainers:
- Sean Wang <sean.wang@mediatek.com>
@@ -32,6 +32,11 @@ properties:
- const: mediatek,mt6580
- items:
- enum:
+ - prestigio,pmt5008-3g
+ - const: mediatek,mt6582
+ - items:
+ - enum:
+ - fairphone,fp1
- mundoreader,bq-aquaris5
- const: mediatek,mt6589
- items:
@@ -53,6 +58,7 @@ properties:
- items:
- enum:
- mediatek,mt6795-evb
+ - sony,xperia-m5
- const: mediatek,mt6795
- items:
- enum:
@@ -78,6 +84,15 @@ properties:
- const: mediatek,mt7629
- items:
- enum:
+ - bananapi,bpi-r3
+ - mediatek,mt7986a-rfb
+ - const: mediatek,mt7986a
+ - items:
+ - enum:
+ - mediatek,mt7986b-rfb
+ - const: mediatek,mt7986b
+ - items:
+ - enum:
- mediatek,mt8127-moose
- const: mediatek,mt8127
- items:
@@ -118,8 +133,43 @@ properties:
- enum:
- mediatek,mt8183-evb
- const: mediatek,mt8183
+ - description: Google Hayato
+ items:
+ - const: google,hayato-rev1
+ - const: google,hayato
+ - const: mediatek,mt8192
+ - description: Google Spherion (Acer Chromebook 514)
+ items:
+ - const: google,spherion-rev3
+ - const: google,spherion-rev2
+ - const: google,spherion-rev1
+ - const: google,spherion-rev0
+ - const: google,spherion
+ - const: mediatek,mt8192
+ - description: Acer Tomato (Acer Chromebook Spin 513 CP513-2H)
+ items:
+ - enum:
+ - google,tomato-rev2
+ - google,tomato-rev1
+ - const: google,tomato
+ - const: mediatek,mt8195
+ - description: Acer Tomato rev3 - 4 (Acer Chromebook Spin 513 CP513-2H)
+ items:
+ - const: google,tomato-rev4
+ - const: google,tomato-rev3
+ - const: google,tomato
+ - const: mediatek,mt8195
- items:
- enum:
+ - mediatek,mt8186-evb
+ - const: mediatek,mt8186
+ - items:
+ - enum:
+ - mediatek,mt8192-evb
+ - const: mediatek,mt8192
+ - items:
+ - enum:
+ - mediatek,mt8195-demo
- mediatek,mt8195-evb
- const: mediatek,mt8195
- description: Google Burnet (HP Chromebook x360 11MK G3 EE)
@@ -133,6 +183,10 @@ properties:
- google,krane-sku176
- const: google,krane
- const: mediatek,mt8183
+ - description: Google Cozmo (Acer Chromebook 314)
+ items:
+ - const: google,cozmo
+ - const: mediatek,mt8183
- description: Google Damu (ASUS Chromebook Flip CM3)
items:
- const: google,damu
@@ -142,7 +196,9 @@ properties:
- enum:
- google,fennel-sku0
- google,fennel-sku1
+ - google,fennel-sku2
- google,fennel-sku6
+ - google,fennel-sku7
- const: google,fennel
- const: mediatek,mt8183
- description: Google Juniper (Acer Chromebook Spin 311) / Kenzo (Acer Chromebook 311)
@@ -158,6 +214,12 @@ properties:
- const: google,kakadu-rev2
- const: google,kakadu
- const: mediatek,mt8183
+ - description: Google Kakadu (ASUS Chromebook Detachable CM3)
+ items:
+ - const: google,kakadu-rev3-sku22
+ - const: google,kakadu-rev2-sku22
+ - const: google,kakadu
+ - const: mediatek,mt8183
- description: Google Kappa (HP Chromebook 11a)
items:
- const: google,kappa
@@ -184,6 +246,10 @@ properties:
- const: mediatek,mt8183
- items:
- enum:
+ - mediatek,mt8365-evk
+ - const: mediatek,mt8365
+ - items:
+ - enum:
- mediatek,mt8516-pumpkin
- const: mediatek,mt8516
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt
deleted file mode 100644
index ea827e8763de..000000000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-Mediatek apmixedsys controller
-==============================
-
-The Mediatek apmixedsys controller provides the PLLs to the system.
-
-Required Properties:
-
-- compatible: Should be one of:
- - "mediatek,mt2701-apmixedsys"
- - "mediatek,mt2712-apmixedsys", "syscon"
- - "mediatek,mt6765-apmixedsys", "syscon"
- - "mediatek,mt6779-apmixedsys", "syscon"
- - "mediatek,mt6797-apmixedsys"
- - "mediatek,mt7622-apmixedsys"
- - "mediatek,mt7623-apmixedsys", "mediatek,mt2701-apmixedsys"
- - "mediatek,mt7629-apmixedsys"
- - "mediatek,mt8135-apmixedsys"
- - "mediatek,mt8167-apmixedsys", "syscon"
- - "mediatek,mt8173-apmixedsys"
- - "mediatek,mt8183-apmixedsys", "syscon"
- - "mediatek,mt8516-apmixedsys"
-- #clock-cells: Must be 1
-
-The apmixedsys controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-
-Example:
-
-apmixedsys: clock-controller@10209000 {
- compatible = "mediatek,mt8173-apmixedsys";
- reg = <0 0x10209000 0 0x1000>;
- #clock-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt
index 6b7e8067e7aa..eccd4b706a78 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,ethsys.txt
@@ -10,6 +10,8 @@ Required Properties:
- "mediatek,mt7622-ethsys", "syscon"
- "mediatek,mt7623-ethsys", "mediatek,mt2701-ethsys", "syscon"
- "mediatek,mt7629-ethsys", "syscon"
+ - "mediatek,mt7981-ethsys", "syscon"
+ - "mediatek,mt7986-ethsys", "syscon"
- #clock-cells: Must be 1
- #reset-cells: Must be 1
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt
deleted file mode 100644
index eb3523c7a7be..000000000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-Mediatek infracfg controller
-============================
-
-The Mediatek infracfg controller provides various clocks and reset
-outputs to the system.
-
-Required Properties:
-
-- compatible: Should be one of:
- - "mediatek,mt2701-infracfg", "syscon"
- - "mediatek,mt2712-infracfg", "syscon"
- - "mediatek,mt6765-infracfg", "syscon"
- - "mediatek,mt6779-infracfg_ao", "syscon"
- - "mediatek,mt6797-infracfg", "syscon"
- - "mediatek,mt7622-infracfg", "syscon"
- - "mediatek,mt7623-infracfg", "mediatek,mt2701-infracfg", "syscon"
- - "mediatek,mt7629-infracfg", "syscon"
- - "mediatek,mt8135-infracfg", "syscon"
- - "mediatek,mt8167-infracfg", "syscon"
- - "mediatek,mt8173-infracfg", "syscon"
- - "mediatek,mt8183-infracfg", "syscon"
- - "mediatek,mt8516-infracfg", "syscon"
-- #clock-cells: Must be 1
-- #reset-cells: Must be 1
-
-The infracfg controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-Also it uses the common reset controller binding from
-Documentation/devicetree/bindings/reset/reset.txt.
-The available reset outputs are defined in
-dt-bindings/reset/mt*-resets.h
-
-Example:
-
-infracfg: power-controller@10001000 {
- compatible = "mediatek,mt8173-infracfg", "syscon";
- reg = <0 0x10001000 0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.yaml
new file mode 100644
index 000000000000..ea98043c6ba3
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.yaml
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,infracfg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Infrastructure System Configuration Controller
+
+maintainers:
+ - Matthias Brugger <matthias.bgg@gmail.com>
+
+description:
+ The MediaTek infracfg controller provides various clocks and reset outputs
+ to the system. The clock values can be found in <dt-bindings/clock/mt*-clk.h>,
+ and reset values in <dt-bindings/reset/mt*-reset.h> and
+ <dt-bindings/reset/mt*-resets.h>.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - mediatek,mt2701-infracfg
+ - mediatek,mt2712-infracfg
+ - mediatek,mt6765-infracfg
+ - mediatek,mt6795-infracfg
+ - mediatek,mt6779-infracfg_ao
+ - mediatek,mt6797-infracfg
+ - mediatek,mt7622-infracfg
+ - mediatek,mt7629-infracfg
+ - mediatek,mt7981-infracfg
+ - mediatek,mt7986-infracfg
+ - mediatek,mt8135-infracfg
+ - mediatek,mt8167-infracfg
+ - mediatek,mt8173-infracfg
+ - mediatek,mt8183-infracfg
+ - mediatek,mt8516-infracfg
+ - const: syscon
+ - items:
+ - const: mediatek,mt7623-infracfg
+ - const: mediatek,mt2701-infracfg
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - mediatek,mt2701-infracfg
+ - mediatek,mt2712-infracfg
+ - mediatek,mt6795-infracfg
+ - mediatek,mt7622-infracfg
+ - mediatek,mt7986-infracfg
+ - mediatek,mt8135-infracfg
+ - mediatek,mt8173-infracfg
+ - mediatek,mt8183-infracfg
+then:
+ required:
+ - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ infracfg: clock-controller@10001000 {
+ compatible = "mediatek,mt8173-infracfg", "syscon";
+ reg = <0x10001000 0x1000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
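Consumers reference the controller through the standard clock and reset bindings, with indices taken from the dt-bindings headers named in the description. A hedged sketch (the consumer node and both macro names are placeholders, not taken from a real mt*-clk.h or mt*-resets.h):

    some-device@11002000 {
        compatible = "vendor,example-device";       /* hypothetical consumer */
        reg = <0x11002000 0x1000>;
        clocks = <&infracfg CLK_INFRA_EXAMPLE>;     /* gate index from <dt-bindings/clock/mt*-clk.h> */
        resets = <&infracfg MT_INFRA_EXAMPLE_RST>;  /* reset index from <dt-bindings/reset/mt*-resets.h> */
    };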
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
index f9ffa5b703a5..536f5a5ebd24 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,mmsys.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mmsys.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: MediaTek mmsys controller
@@ -25,24 +25,71 @@ properties:
- mediatek,mt2712-mmsys
- mediatek,mt6765-mmsys
- mediatek,mt6779-mmsys
+ - mediatek,mt6795-mmsys
- mediatek,mt6797-mmsys
- mediatek,mt8167-mmsys
- mediatek,mt8173-mmsys
- mediatek,mt8183-mmsys
+ - mediatek,mt8186-mmsys
+ - mediatek,mt8188-vdosys0
- mediatek,mt8192-mmsys
+ - mediatek,mt8195-vdosys1
+ - mediatek,mt8195-vppsys0
+ - mediatek,mt8195-vppsys1
- mediatek,mt8365-mmsys
- const: syscon
+
+ - description: vdosys0 and vdosys1 are two display HW pipelines,
+ so the bare mt8195-mmsys binding is deprecated.
+ deprecated: true
+ items:
+ - const: mediatek,mt8195-mmsys
+ - const: syscon
+
- items:
- const: mediatek,mt7623-mmsys
- const: mediatek,mt2701-mmsys
- const: syscon
+ - items:
+ - const: mediatek,mt8195-vdosys0
+ - const: mediatek,mt8195-mmsys
+ - const: syscon
+
reg:
maxItems: 1
+ power-domains:
+ description:
+ A phandle and PM domain specifier as defined by bindings
+ of the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ mboxes:
+ description:
+ Mailboxes used to communicate with the GCE; a list of phandle and
+ mailbox specifier pairs. See
+ Documentation/devicetree/bindings/mailbox/mediatek,gce-mailbox.yaml
+ for details.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+
+ mediatek,gce-client-reg:
+ description:
+ The register of a client driver can be configured by the GCE with the
+ 4 arguments defined in this property: the phandle of the GCE, the
+ subsys id, the register offset and the size.
+ Each subsys id maps to a base address of the display function block
+ registers, which is defined in the GCE header
+ include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
"#clock-cells":
const: 1
+ '#reset-cells':
+ const: 1
+
required:
- compatible
- reg
@@ -52,8 +99,16 @@ additionalProperties: false
examples:
- |
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+
mmsys: syscon@14000000 {
compatible = "mediatek,mt8173-mmsys", "syscon";
reg = <0x14000000 0x1000>;
+ power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
#clock-cells = <1>;
+ #reset-cells = <1>;
+ mboxes = <&gce 0 CMDQ_THR_PRIO_HIGHEST>,
+ <&gce 1 CMDQ_THR_PRIO_HIGHEST>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0 0x1000>;
};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-pcie-mirror.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-pcie-mirror.yaml
new file mode 100644
index 000000000000..d89848a8f478
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-pcie-mirror.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt7622-pcie-mirror.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek PCIE Mirror Controller for MT7622
+
+maintainers:
+ - Lorenzo Bianconi <lorenzo@kernel.org>
+ - Felix Fietkau <nbd@nbd.name>
+
+description:
+ The MediaTek PCIe mirror provides a configuration interface for the
+ PCIe controller on the MT7622 SoC.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt7622-pcie-mirror
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+ pcie_mirror: pcie-mirror@10000400 {
+ compatible = "mediatek,mt7622-pcie-mirror", "syscon";
+ reg = <0 0x10000400 0 0x10>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml
new file mode 100644
index 000000000000..28ded09d72e3
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7622-wed.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt7622-wed.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Wireless Ethernet Dispatch Controller for MT7622
+
+maintainers:
+ - Lorenzo Bianconi <lorenzo@kernel.org>
+ - Felix Fietkau <nbd@nbd.name>
+
+description:
+ The MediaTek Wireless Ethernet Dispatch controller can be configured
+ to intercept and handle access to the WLAN DMA queues and PCIe
+ interrupts, and to implement hardware flow offloading from Ethernet
+ to WLAN.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt7622-wed
+ - mediatek,mt7981-wed
+ - mediatek,mt7986-wed
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ memory-region:
+ items:
+ - description: firmware EMI region
+ - description: firmware ILM region
+ - description: firmware DLM region
+ - description: firmware CPU DATA region
+ - description: firmware BOOT region
+
+ memory-region-names:
+ items:
+ - const: wo-emi
+ - const: wo-ilm
+ - const: wo-dlm
+ - const: wo-data
+ - const: wo-boot
+
+ mediatek,wo-ccif:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: MediaTek WED-WO controller interface.
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: mediatek,mt7622-wed
+ then:
+ properties:
+ memory-region-names: false
+ memory-region: false
+ mediatek,wo-ccif: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+ wed0: wed@1020a000 {
+ compatible = "mediatek,mt7622-wed","syscon";
+ reg = <0 0x1020a000 0 0x1000>;
+ interrupts = <GIC_SPI 214 IRQ_TYPE_LEVEL_LOW>;
+ };
+ };
+
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ wed@15010000 {
+ compatible = "mediatek,mt7986-wed", "syscon";
+ reg = <0 0x15010000 0 0x1000>;
+ interrupts = <GIC_SPI 205 IRQ_TYPE_LEVEL_HIGH>;
+
+ memory-region = <&wo_emi>, <&wo_ilm>, <&wo_dlm>,
+ <&wo_data>, <&wo_boot>;
+ memory-region-names = "wo-emi", "wo-ilm", "wo-dlm",
+ "wo-data", "wo-boot";
+ mediatek,wo-ccif = <&wo_ccif0>;
+ };
+ };
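On the consumer side, the SoC's Ethernet node points at the WED instances so the network driver can program the offload path; a sketch, assuming the mediatek,wed phandle-list property from the MediaTek Ethernet binding:

    &eth {
        mediatek,wed = <&wed0>, <&wed1>;  /* phandles to the WED instances */
    };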
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7986-wed-pcie.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7986-wed-pcie.yaml
new file mode 100644
index 000000000000..82f64469a601
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt7986-wed-pcie.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt7986-wed-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek PCIE WED Controller for MT7986
+
+maintainers:
+ - Lorenzo Bianconi <lorenzo@kernel.org>
+ - Felix Fietkau <nbd@nbd.name>
+
+description:
+ The MediaTek WED PCIe block provides a configuration interface for the
+ PCIe controller on the MT7986 SoC.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt7986-wed-pcie
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+ wed_pcie: wed-pcie@10003000 {
+ compatible = "mediatek,mt7986-wed-pcie",
+ "syscon";
+ reg = <0 0x10003000 0 0x10>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-clock.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-clock.yaml
new file mode 100644
index 000000000000..7cd14b163abe
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-clock.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt8186-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Functional Clock Controller for MT8186
+
+maintainers:
+ - Chun-Jie Chen <chun-jie.chen@mediatek.com>
+
+description: |
+ The MediaTek clock architecture looks like this:
+ PLLs --> dividers --> muxes --> clock gate
+
+ The devices provide clock gate control in different IP blocks.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8186-imp_iic_wrap
+ - mediatek,mt8186-mfgsys
+ - mediatek,mt8186-wpesys
+ - mediatek,mt8186-imgsys1
+ - mediatek,mt8186-imgsys2
+ - mediatek,mt8186-vdecsys
+ - mediatek,mt8186-vencsys
+ - mediatek,mt8186-camsys
+ - mediatek,mt8186-camsys_rawa
+ - mediatek,mt8186-camsys_rawb
+ - mediatek,mt8186-mdpsys
+ - mediatek,mt8186-ipesys
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ imp_iic_wrap: clock-controller@11017000 {
+ compatible = "mediatek,mt8186-imp_iic_wrap";
+ reg = <0x11017000 0x1000>;
+ #clock-cells = <1>;
+ };
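Peripheral nodes consume the gates by phandle plus clock index from <dt-bindings/clock/mt8186-clk.h>. A hedged sketch (the consumer node and the macro name are illustrative, not taken from the header):

    i2c@11007000 {
        compatible = "vendor,example-i2c";                  /* hypothetical consumer */
        reg = <0x11007000 0x1000>;
        clocks = <&imp_iic_wrap CLK_IMP_IIC_WRAP_EXAMPLE>;  /* gate index from mt8186-clk.h */
    };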
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-sys-clock.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-sys-clock.yaml
new file mode 100644
index 000000000000..64c769416690
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8186-sys-clock.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt8186-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek System Clock Controller for MT8186
+
+maintainers:
+ - Chun-Jie Chen <chun-jie.chen@mediatek.com>
+
+description: |
+ The MediaTek clock architecture looks like this:
+ PLLs --> dividers --> muxes --> clock gate
+
+ The apmixedsys provides most of the PLLs, which are generated from the SoC's 26 MHz clock.
+ The topckgen provides the dividers and muxes that supply the clock source to other IP blocks.
+ The infracfg_ao provides the clock gates in the peripheral and infrastructure IP blocks.
+ The mcusys provides the mux control to select the clock source of the AP MCU.
+ The device nodes also provide system control capability for configuration.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8186-mcusys
+ - mediatek,mt8186-topckgen
+ - mediatek,mt8186-infracfg_ao
+ - mediatek,mt8186-apmixedsys
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ topckgen: syscon@10000000 {
+ compatible = "mediatek,mt8186-topckgen", "syscon";
+ reg = <0x10000000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-clock.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-clock.yaml
index c8c67c033f8c..dff4c8e8fd4b 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-clock.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-clock.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,mt8192-clock.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt8192-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: MediaTek Functional Clock Controller for MT8192
@@ -24,7 +24,6 @@ properties:
- mediatek,mt8192-imp_iic_wrap_w
- mediatek,mt8192-imp_iic_wrap_n
- mediatek,mt8192-msdc_top
- - mediatek,mt8192-msdc
- mediatek,mt8192-mfgcfg
- mediatek,mt8192-imgsys
- mediatek,mt8192-imgsys2
@@ -108,13 +107,6 @@ examples:
};
- |
- msdc: clock-controller@11f60000 {
- compatible = "mediatek,mt8192-msdc";
- reg = <0x11f60000 0x1000>;
- #clock-cells = <1>;
- };
-
- - |
mfgcfg: clock-controller@13fbf000 {
compatible = "mediatek,mt8192-mfgcfg";
reg = <0x13fbf000 0x1000>;
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-sys-clock.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-sys-clock.yaml
index 5705bcf1fe47..8d608fddf3f9 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-sys-clock.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8192-sys-clock.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,mt8192-sys-clock.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt8192-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: MediaTek System Clock Controller for MT8192
@@ -29,6 +29,9 @@ properties:
'#clock-cells':
const: 1
+ '#reset-cells':
+ const: 1
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-clock.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-clock.yaml
new file mode 100644
index 000000000000..d17164b0b13e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-clock.yaml
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt8195-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Functional Clock Controller for MT8195
+
+maintainers:
+ - Chun-Jie Chen <chun-jie.chen@mediatek.com>
+
+description:
+ The MediaTek clock architecture looks like this:
+ PLLs --> dividers --> muxes --> clock gate
+
+ The devices except apusys_pll provide clock gate control in different IP blocks.
+ The apusys_pll provides the PLLs, generated from the SoC's 26 MHz clock, for the AI Processing Unit.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8195-scp_adsp
+ - mediatek,mt8195-imp_iic_wrap_s
+ - mediatek,mt8195-imp_iic_wrap_w
+ - mediatek,mt8195-mfgcfg
+ - mediatek,mt8195-wpesys
+ - mediatek,mt8195-wpesys_vpp0
+ - mediatek,mt8195-wpesys_vpp1
+ - mediatek,mt8195-imgsys
+ - mediatek,mt8195-imgsys1_dip_top
+ - mediatek,mt8195-imgsys1_dip_nr
+ - mediatek,mt8195-imgsys1_wpe
+ - mediatek,mt8195-ipesys
+ - mediatek,mt8195-camsys
+ - mediatek,mt8195-camsys_rawa
+ - mediatek,mt8195-camsys_yuva
+ - mediatek,mt8195-camsys_rawb
+ - mediatek,mt8195-camsys_yuvb
+ - mediatek,mt8195-camsys_mraw
+ - mediatek,mt8195-ccusys
+ - mediatek,mt8195-vdecsys_soc
+ - mediatek,mt8195-vdecsys
+ - mediatek,mt8195-vdecsys_core1
+ - mediatek,mt8195-vencsys
+ - mediatek,mt8195-vencsys_core1
+ - mediatek,mt8195-apusys_pll
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ scp_adsp: clock-controller@10720000 {
+ compatible = "mediatek,mt8195-scp_adsp";
+ reg = <0x10720000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ imp_iic_wrap_s: clock-controller@11d03000 {
+ compatible = "mediatek,mt8195-imp_iic_wrap_s";
+ reg = <0x11d03000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ imp_iic_wrap_w: clock-controller@11e05000 {
+ compatible = "mediatek,mt8195-imp_iic_wrap_w";
+ reg = <0x11e05000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ mfgcfg: clock-controller@13fbf000 {
+ compatible = "mediatek,mt8195-mfgcfg";
+ reg = <0x13fbf000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ wpesys: clock-controller@14e00000 {
+ compatible = "mediatek,mt8195-wpesys";
+ reg = <0x14e00000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ wpesys_vpp0: clock-controller@14e02000 {
+ compatible = "mediatek,mt8195-wpesys_vpp0";
+ reg = <0x14e02000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ wpesys_vpp1: clock-controller@14e03000 {
+ compatible = "mediatek,mt8195-wpesys_vpp1";
+ reg = <0x14e03000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ imgsys: clock-controller@15000000 {
+ compatible = "mediatek,mt8195-imgsys";
+ reg = <0x15000000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ imgsys1_dip_top: clock-controller@15110000 {
+ compatible = "mediatek,mt8195-imgsys1_dip_top";
+ reg = <0x15110000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ imgsys1_dip_nr: clock-controller@15130000 {
+ compatible = "mediatek,mt8195-imgsys1_dip_nr";
+ reg = <0x15130000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ imgsys1_wpe: clock-controller@15220000 {
+ compatible = "mediatek,mt8195-imgsys1_wpe";
+ reg = <0x15220000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ ipesys: clock-controller@15330000 {
+ compatible = "mediatek,mt8195-ipesys";
+ reg = <0x15330000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ camsys: clock-controller@16000000 {
+ compatible = "mediatek,mt8195-camsys";
+ reg = <0x16000000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ camsys_rawa: clock-controller@1604f000 {
+ compatible = "mediatek,mt8195-camsys_rawa";
+ reg = <0x1604f000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ camsys_yuva: clock-controller@1606f000 {
+ compatible = "mediatek,mt8195-camsys_yuva";
+ reg = <0x1606f000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ camsys_rawb: clock-controller@1608f000 {
+ compatible = "mediatek,mt8195-camsys_rawb";
+ reg = <0x1608f000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ camsys_yuvb: clock-controller@160af000 {
+ compatible = "mediatek,mt8195-camsys_yuvb";
+ reg = <0x160af000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ camsys_mraw: clock-controller@16140000 {
+ compatible = "mediatek,mt8195-camsys_mraw";
+ reg = <0x16140000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ ccusys: clock-controller@17200000 {
+ compatible = "mediatek,mt8195-ccusys";
+ reg = <0x17200000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ vdecsys_soc: clock-controller@1800f000 {
+ compatible = "mediatek,mt8195-vdecsys_soc";
+ reg = <0x1800f000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ vdecsys: clock-controller@1802f000 {
+ compatible = "mediatek,mt8195-vdecsys";
+ reg = <0x1802f000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ vdecsys_core1: clock-controller@1803f000 {
+ compatible = "mediatek,mt8195-vdecsys_core1";
+ reg = <0x1803f000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ vencsys: clock-controller@1a000000 {
+ compatible = "mediatek,mt8195-vencsys";
+ reg = <0x1a000000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ vencsys_core1: clock-controller@1b000000 {
+ compatible = "mediatek,mt8195-vencsys_core1";
+ reg = <0x1b000000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ apusys_pll: clock-controller@190f3000 {
+ compatible = "mediatek,mt8195-apusys_pll";
+ reg = <0x190f3000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-sys-clock.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-sys-clock.yaml
new file mode 100644
index 000000000000..066c9b3d6ac9
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mt8195-sys-clock.yaml
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,mt8195-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek System Clock Controller for MT8195
+
+maintainers:
+ - Chun-Jie Chen <chun-jie.chen@mediatek.com>
+
+description:
+ The MediaTek clock architecture looks like this:
+ PLLs --> dividers --> muxes --> clock gate
+
+ The apmixedsys provides most of the PLLs, which are generated from the SoC's 26 MHz clock.
+ The topckgen provides the dividers and muxes that supply the clock source to other IP blocks.
+ The infracfg_ao and pericfg_ao provide the clock gates in the peripheral and infrastructure IP blocks.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8195-topckgen
+ - mediatek,mt8195-infracfg_ao
+ - mediatek,mt8195-apmixedsys
+ - mediatek,mt8195-pericfg_ao
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ topckgen: syscon@10000000 {
+ compatible = "mediatek,mt8195-topckgen", "syscon";
+ reg = <0x10000000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ infracfg_ao: syscon@10001000 {
+ compatible = "mediatek,mt8195-infracfg_ao", "syscon";
+ reg = <0x10001000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ apmixedsys: syscon@1000c000 {
+ compatible = "mediatek,mt8195-apmixedsys", "syscon";
+ reg = <0x1000c000 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ - |
+ pericfg_ao: syscon@11003000 {
+ compatible = "mediatek,mt8195-pericfg_ao", "syscon";
+ reg = <0x11003000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml
index 8723dfe34bab..26158d0d72f3 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,pericfg.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/mediatek/mediatek,pericfg.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/mediatek/mediatek,pericfg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: MediaTek Peripheral Configuration Controller
@@ -21,11 +21,14 @@ properties:
- mediatek,mt2701-pericfg
- mediatek,mt2712-pericfg
- mediatek,mt6765-pericfg
+ - mediatek,mt6795-pericfg
- mediatek,mt7622-pericfg
- mediatek,mt7629-pericfg
- mediatek,mt8135-pericfg
- mediatek,mt8173-pericfg
- mediatek,mt8183-pericfg
+ - mediatek,mt8186-pericfg
+ - mediatek,mt8195-pericfg
- mediatek,mt8516-pericfg
- const: syscon
- items:
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
deleted file mode 100644
index 30cb645c0e54..000000000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-MediaTek SGMIISYS controller
-============================
-
-The MediaTek SGMIISYS controller provides various clocks to the system.
-
-Required Properties:
-
-- compatible: Should be:
- - "mediatek,mt7622-sgmiisys", "syscon"
- - "mediatek,mt7629-sgmiisys", "syscon"
-- #clock-cells: Must be 1
-
-The SGMIISYS controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-
-Example:
-
-sgmiisys: sgmiisys@1b128000 {
- compatible = "mediatek,mt7622-sgmiisys", "syscon";
- reg = <0 0x1b128000 0 0x1000>;
- #clock-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt
deleted file mode 100644
index 5ce7578cf274..000000000000
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-Mediatek topckgen controller
-============================
-
-The Mediatek topckgen controller provides various clocks to the system.
-
-Required Properties:
-
-- compatible: Should be one of:
- - "mediatek,mt2701-topckgen"
- - "mediatek,mt2712-topckgen", "syscon"
- - "mediatek,mt6765-topckgen", "syscon"
- - "mediatek,mt6779-topckgen", "syscon"
- - "mediatek,mt6797-topckgen"
- - "mediatek,mt7622-topckgen"
- - "mediatek,mt7623-topckgen", "mediatek,mt2701-topckgen"
- - "mediatek,mt7629-topckgen"
- - "mediatek,mt8135-topckgen"
- - "mediatek,mt8167-topckgen", "syscon"
- - "mediatek,mt8173-topckgen"
- - "mediatek,mt8183-topckgen", "syscon"
- - "mediatek,mt8516-topckgen"
-- #clock-cells: Must be 1
-
-The topckgen controller uses the common clk binding from
-Documentation/devicetree/bindings/clock/clock-bindings.txt
-The available clocks are defined in dt-bindings/clock/mt*-clk.h.
-
-Example:
-
-topckgen: power-controller@10000000 {
- compatible = "mediatek,mt8173-topckgen";
- reg = <0 0x10000000 0 0x1000>;
- #clock-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/microchip,sparx5.yaml b/Documentation/devicetree/bindings/arm/microchip,sparx5.yaml
index 6193388c6318..9a0d54e9799c 100644
--- a/Documentation/devicetree/bindings/arm/microchip,sparx5.yaml
+++ b/Documentation/devicetree/bindings/arm/microchip,sparx5.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/microchip,sparx5.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Microchip Sparx5 Boards Device Tree Bindings
+title: Microchip Sparx5 Boards
maintainers:
- Lars Povlsen <lars.povlsen@microchip.com>
diff --git a/Documentation/devicetree/bindings/arm/moxart.yaml b/Documentation/devicetree/bindings/arm/moxart.yaml
index 670d24ce8ec5..42565280914c 100644
--- a/Documentation/devicetree/bindings/arm/moxart.yaml
+++ b/Documentation/devicetree/bindings/arm/moxart.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/moxart.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: MOXA ART device tree bindings
+title: MOXA ART
maintainers:
- Jonas Jensen <jonas.jensen@gmail.com>
diff --git a/Documentation/devicetree/bindings/arm/mrvl/mrvl.yaml b/Documentation/devicetree/bindings/arm/mrvl/mrvl.yaml
index d58116136154..4c43eaf3632e 100644
--- a/Documentation/devicetree/bindings/arm/mrvl/mrvl.yaml
+++ b/Documentation/devicetree/bindings/arm/mrvl/mrvl.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/mrvl/mrvl.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Marvell Platforms Device Tree Bindings
+title: Marvell Platforms
maintainers:
- Lubomir Rintel <lkundrak@v3.sk>
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,idle-state.txt b/Documentation/devicetree/bindings/arm/msm/qcom,idle-state.txt
index 6ce0b212ec6d..606b4b1b709d 100644
--- a/Documentation/devicetree/bindings/arm/msm/qcom,idle-state.txt
+++ b/Documentation/devicetree/bindings/arm/msm/qcom,idle-state.txt
@@ -81,4 +81,4 @@ Example:
};
};
-[1]. Documentation/devicetree/bindings/arm/idle-states.yaml
+[1]. Documentation/devicetree/bindings/cpu/idle-states.yaml
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt b/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt
deleted file mode 100644
index 7f696362a4a1..000000000000
--- a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-acc.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-Krait Processor Sub-system (KPSS) Application Clock Controller (ACC)
-
-The KPSS ACC provides clock, power domain, and reset control to a Krait CPU.
-There is one ACC register region per CPU within the KPSS remapped region as
-well as an alias register region that remaps accesses to the ACC associated
-with the CPU accessing the region.
-
-PROPERTIES
-
-- compatible:
- Usage: required
- Value type: <string>
- Definition: should be one of:
- "qcom,kpss-acc-v1"
- "qcom,kpss-acc-v2"
-
-- reg:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: the first element specifies the base address and size of
- the register region. An optional second element specifies
- the base address and size of the alias register region.
-
-- clocks:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: reference to the pll parents.
-
-- clock-names:
- Usage: required
- Value type: <stringlist>
- Definition: must be "pll8_vote", "pxo".
-
-- clock-output-names:
- Usage: optional
- Value type: <string>
- Definition: Name of the output clock. Typically acpuX_aux where X is a
- CPU number starting at 0.
-
-Example:
-
- clock-controller@2088000 {
- compatible = "qcom,kpss-acc-v2";
- reg = <0x02088000 0x1000>,
- <0x02008000 0x1000>;
- clocks = <&gcc PLL8_VOTE>, <&gcc PXO_SRC>;
- clock-names = "pll8_vote", "pxo";
- clock-output-names = "acpu0_aux";
- };
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt b/Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt
deleted file mode 100644
index e628758950e1..000000000000
--- a/Documentation/devicetree/bindings/arm/msm/qcom,kpss-gcc.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-Krait Processor Sub-system (KPSS) Global Clock Controller (GCC)
-
-PROPERTIES
-
-- compatible:
- Usage: required
- Value type: <string>
- Definition: should be one of the following. The generic compatible
- "qcom,kpss-gcc" should also be included.
- "qcom,kpss-gcc-ipq8064", "qcom,kpss-gcc"
- "qcom,kpss-gcc-apq8064", "qcom,kpss-gcc"
- "qcom,kpss-gcc-msm8974", "qcom,kpss-gcc"
- "qcom,kpss-gcc-msm8960", "qcom,kpss-gcc"
-
-- reg:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: base address and size of the register region
-
-- clocks:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: reference to the pll parents.
-
-- clock-names:
- Usage: required
- Value type: <stringlist>
- Definition: must be "pll8_vote", "pxo".
-
-- clock-output-names:
- Usage: required
- Value type: <string>
- Definition: Name of the output clock. Typically acpu_l2_aux indicating
- an L2 cache auxiliary clock.
-
-Example:
-
- l2cc: clock-controller@2011000 {
- compatible = "qcom,kpss-gcc-ipq8064", "qcom,kpss-gcc";
- reg = <0x2011000 0x1000>;
- clocks = <&gcc PLL8_VOTE>, <&gcc PXO_SRC>;
- clock-names = "pll8_vote", "pxo";
- clock-output-names = "acpu_l2_aux";
- };
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml b/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml
deleted file mode 100644
index 62fcbd883392..000000000000
--- a/Documentation/devicetree/bindings/arm/msm/qcom,llcc.yaml
+++ /dev/null
@@ -1,60 +0,0 @@
-# SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/msm/qcom,llcc.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Last Level Cache Controller
-
-maintainers:
- - Rishabh Bhatnagar <rishabhb@codeaurora.org>
- - Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
-
-description: |
- LLCC (Last Level Cache Controller) provides last level of cache memory in SoC,
- that can be shared by multiple clients. Clients here are different cores in the
- SoC, the idea is to minimize the local caches at the clients and migrate to
- common pool of memory. Cache memory is divided into partitions called slices
- which are assigned to clients. Clients can query the slice details, activate
- and deactivate them.
-
-properties:
- compatible:
- enum:
- - qcom,sc7180-llcc
- - qcom,sc7280-llcc
- - qcom,sdm845-llcc
- - qcom,sm8150-llcc
- - qcom,sm8250-llcc
-
- reg:
- items:
- - description: LLCC base register region
- - description: LLCC broadcast base register region
-
- reg-names:
- items:
- - const: llcc_base
- - const: llcc_broadcast_base
-
- interrupts:
- maxItems: 1
-
-required:
- - compatible
- - reg
- - reg-names
- - interrupts
-
-additionalProperties: false
-
-examples:
- - |
- #include <dt-bindings/interrupt-controller/arm-gic.h>
-
- system-cache-controller@1100000 {
- compatible = "qcom,sdm845-llcc";
- reg = <0x1100000 0x200000>, <0x1300000 0x50000> ;
- reg-names = "llcc_base", "llcc_broadcast_base";
- interrupts = <GIC_SPI 582 IRQ_TYPE_LEVEL_HIGH>;
- };
diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt b/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt
index 94d50a949be1..c0e3c3a42bea 100644
--- a/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt
+++ b/Documentation/devicetree/bindings/arm/msm/qcom,saw2.txt
@@ -10,7 +10,7 @@ system, notifying them when a low power state is entered or exited.
Multiple revisions of the SAW hardware are supported using these Device Nodes.
SAW2 revisions differ in the register offset and configuration data. Also, the
same revision of the SAW in different SoCs may have different configuration
-data due the the differences in hardware capabilities. Hence the SoC name, the
+data due to the differences in hardware capabilities. Hence the SoC name, the
version of the SAW hardware in that SoC and the distinction between cpu (big
or Little) or cache, may be needed to uniquely identify the SAW register
configuration and initialization data. The compatible string is used to
diff --git a/Documentation/devicetree/bindings/arm/mstar/mstar.yaml b/Documentation/devicetree/bindings/arm/mstar/mstar.yaml
index a316eef1b728..937059fcc7b3 100644
--- a/Documentation/devicetree/bindings/arm/mstar/mstar.yaml
+++ b/Documentation/devicetree/bindings/arm/mstar/mstar.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/mstar/mstar.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: MStar platforms device tree bindings
+title: MStar platforms
maintainers:
- Daniel Palmer <daniel@thingy.jp>
@@ -23,8 +23,12 @@ properties:
- description: infinity2m boards
items:
- enum:
+ - 100ask,dongshanpione # 100ask DongShanPiOne
- honestar,ssd201htv2 # Honestar SSD201_HT_V2 devkit
- m5stack,unitv2 # M5Stack UnitV2
+ - miyoo,miyoo-mini # Miyoo Mini
+ - wirelesstag,ido-som2d01 # Wireless Tag IDO-SOM2D01
+ - wirelesstag,ido-sbc2d06-v1b-22w # Wireless Tag IDO-SBC2D06-V1B-22W
- const: mstar,infinity2m
- description: infinity3 boards
diff --git a/Documentation/devicetree/bindings/arm/npcm/npcm.yaml b/Documentation/devicetree/bindings/arm/npcm/npcm.yaml
index 95e51378089c..6871483947c5 100644
--- a/Documentation/devicetree/bindings/arm/npcm/npcm.yaml
+++ b/Documentation/devicetree/bindings/arm/npcm/npcm.yaml
@@ -4,10 +4,11 @@
$id: http://devicetree.org/schemas/arm/npcm/npcm.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NPCM Platforms Device Tree Bindings
+title: NPCM Platforms
maintainers:
- Jonathan Neuschäfer <j.neuschaefer@gmx.net>
+ - Tomer Maimon <tmaimon77@gmail.com>
properties:
$nodename:
@@ -26,4 +27,10 @@ properties:
- nuvoton,npcm750-evb # NPCM750 evaluation board
- const: nuvoton,npcm750
+ - description: NPCM845 based boards
+ items:
+ - enum:
+ - nuvoton,npcm845-evb # NPCM845 evaluation board
+ - const: nuvoton,npcm845
+
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/npcm/nuvoton,gcr.yaml b/Documentation/devicetree/bindings/arm/npcm/nuvoton,gcr.yaml
new file mode 100644
index 000000000000..94e72f25b331
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/npcm/nuvoton,gcr.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/npcm/nuvoton,gcr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Global Control Registers block in Nuvoton SoCs
+
+maintainers:
+ - Jonathan Neuschäfer <j.neuschaefer@gmx.net>
+ - Tomer Maimon <tmaimon77@gmail.com>
+
+description:
+ The Global Control Registers (GCR) are a block of registers in Nuvoton SoCs
+ that expose misc functionality such as chip model and version information or
+ pinmux settings.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - nuvoton,wpcm450-gcr
+ - nuvoton,npcm750-gcr
+ - nuvoton,npcm845-gcr
+ - const: syscon
+ - const: simple-mfd
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties:
+ type: object
+
+examples:
+ - |
+ gcr: syscon@800000 {
+ compatible = "nuvoton,npcm750-gcr", "syscon", "simple-mfd";
+ reg = <0x800000 0x1000>;
+
+ mux-controller {
+ compatible = "mmio-mux";
+ #mux-control-cells = <1>;
+ mux-reg-masks = <0x38 0x07>;
+ idle-states = <2>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/nvidia,tegra194-ccplex.yaml b/Documentation/devicetree/bindings/arm/nvidia,tegra194-ccplex.yaml
index c9675c4cdc1b..84dc6b7512af 100644
--- a/Documentation/devicetree/bindings/arm/nvidia,tegra194-ccplex.yaml
+++ b/Documentation/devicetree/bindings/arm/nvidia,tegra194-ccplex.yaml
@@ -1,10 +1,10 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/arm/nvidia,tegra194-ccplex.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/arm/nvidia,tegra194-ccplex.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NVIDIA Tegra194 CPU Complex device tree bindings
+title: NVIDIA Tegra194 CPU Complex
maintainers:
- Thierry Reding <thierry.reding@gmail.com>
@@ -25,7 +25,7 @@ properties:
- nvidia,tegra194-ccplex
nvidia,bpmp:
- $ref: '/schemas/types.yaml#/definitions/phandle'
+ $ref: /schemas/types.yaml#/definitions/phandle
description: |
Specifies the bpmp node that needs to be queried to get
operating point data for all CPUs.
diff --git a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
index 214c97bc3063..f1bd6f50e726 100644
--- a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
+++ b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/nxp/lpc32xx.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NXP LPC32xx Platforms Device Tree Bindings
+title: NXP LPC32xx Platforms
maintainers:
- Roland Stigge <stigge@antcom.de>
diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt
index e77635c5422c..fa8b31660cad 100644
--- a/Documentation/devicetree/bindings/arm/omap/omap.txt
+++ b/Documentation/devicetree/bindings/arm/omap/omap.txt
@@ -119,6 +119,9 @@ Boards (incomplete list of examples):
- OMAP3 BeagleBoard : Low cost community board
compatible = "ti,omap3-beagle", "ti,omap3430", "ti,omap3"
+- OMAP3 BeagleBoard A to B4 : Early BeagleBoard revisions A to B4 with a timer quirk
+ compatible = "ti,omap3-beagle-ab4", "ti,omap3-beagle", "ti,omap3430", "ti,omap3"
+
- OMAP3 Tobi with Overo : Commercial expansion board with daughter board
compatible = "gumstix,omap3-overo-tobi", "gumstix,omap3-overo", "ti,omap3430", "ti,omap3"
diff --git a/Documentation/devicetree/bindings/arm/omap/prcm.txt b/Documentation/devicetree/bindings/arm/omap/prcm.txt
index 3eb6d7afff14..431ef8c56a13 100644
--- a/Documentation/devicetree/bindings/arm/omap/prcm.txt
+++ b/Documentation/devicetree/bindings/arm/omap/prcm.txt
@@ -31,12 +31,17 @@ Required properties:
(base address and length)
- clocks: clocks for this module
- clockdomains: clockdomains for this module
+- #clock-cells: From common clock binding
+- clock-output-names: From common clock binding
+
Example:
-cm: cm@48004000 {
+cm: clock@48004000 {
compatible = "ti,omap3-cm";
reg = <0x48004000 0x4000>;
+ #clock-cells = <0>;
+ clock-output-names = "cm";
cm_clocks: clocks {
#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/arm/oxnas.txt b/Documentation/devicetree/bindings/arm/oxnas.txt
deleted file mode 100644
index ac64e60f99f1..000000000000
--- a/Documentation/devicetree/bindings/arm/oxnas.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Oxford Semiconductor OXNAS SoCs Family device tree bindings
--------------------------------------------
-
-Boards with the OX810SE SoC shall have the following properties:
- Required root node property:
- compatible: "oxsemi,ox810se"
-
-Boards with the OX820 SoC shall have the following properties:
- Required root node property:
- compatible: "oxsemi,ox820"
-
-Board compatible values:
- - "wd,mbwe" (OX810SE)
- - "cloudengines,pogoplugv3" (OX820)
diff --git a/Documentation/devicetree/bindings/arm/pmu.yaml b/Documentation/devicetree/bindings/arm/pmu.yaml
index e17ac049e890..e14358bf0b9c 100644
--- a/Documentation/devicetree/bindings/arm/pmu.yaml
+++ b/Documentation/devicetree/bindings/arm/pmu.yaml
@@ -20,6 +20,10 @@ properties:
items:
- enum:
- apm,potenza-pmu
+ - apple,avalanche-pmu
+ - apple,blizzard-pmu
+ - apple,firestorm-pmu
+ - apple,icestorm-pmu
- arm,armv8-pmuv3 # Only for s/w models
- arm,arm1136-pmu
- arm,arm1176-pmu
@@ -44,10 +48,18 @@ properties:
- arm,cortex-a76-pmu
- arm,cortex-a77-pmu
- arm,cortex-a78-pmu
+ - arm,cortex-a510-pmu
+ - arm,cortex-a710-pmu
+ - arm,cortex-x1-pmu
+ - arm,cortex-x2-pmu
- arm,neoverse-e1-pmu
- arm,neoverse-n1-pmu
+ - arm,neoverse-n2-pmu
+ - arm,neoverse-v1-pmu
- brcm,vulcan-pmu
- cavium,thunder-pmu
+ - nvidia,denver-pmu
+ - nvidia,carmel-pmu
- qcom,krait-pmu
- qcom,scorpion-pmu
- qcom,scorpion-mp-pmu
@@ -58,6 +70,8 @@ properties:
interrupt-affinity:
$ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ maxItems: 1
description:
When using SPIs, specifies a list of phandles to CPU
nodes corresponding directly to the affinity of
diff --git a/Documentation/devicetree/bindings/arm/psci.yaml b/Documentation/devicetree/bindings/arm/psci.yaml
index 8b77cf83a095..3a2c908ff282 100644
--- a/Documentation/devicetree/bindings/arm/psci.yaml
+++ b/Documentation/devicetree/bindings/arm/psci.yaml
@@ -43,29 +43,24 @@ properties:
- description:
For implementations complying to PSCI 0.2.
- const: arm,psci-0.2
-
- - description:
- For implementations complying to PSCI 0.2.
Function IDs are not required and should be ignored by an OS with
PSCI 0.2 support, but are permitted to be present for compatibility
with existing software when "arm,psci" is later in the compatible
list.
+ minItems: 1
items:
- const: arm,psci-0.2
- const: arm,psci
- description:
For implementations complying to PSCI 1.0.
- const: arm,psci-1.0
-
- - description:
- For implementations complying to PSCI 1.0.
PSCI 1.0 is backward compatible with PSCI 0.2 with minor
specification updates, as defined in the PSCI specification[2].
+ minItems: 1
items:
- const: arm,psci-1.0
- const: arm,psci-0.2
+ - const: arm,psci
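+ # A node matching the full list above would therefore carry, e.g.:
+ # compatible = "arm,psci-1.0", "arm,psci-0.2", "arm,psci";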
method:
description: The method of calling the PSCI firmware.
@@ -101,7 +96,7 @@ properties:
bindings in [1]) must specify this property.
[1] Kernel documentation - ARM idle states bindings
- Documentation/devicetree/bindings/arm/idle-states.yaml
+ Documentation/devicetree/bindings/cpu/idle-states.yaml
patternProperties:
"^power-domain-":
diff --git a/Documentation/devicetree/bindings/arm/qcom,coresight-tpda.yaml b/Documentation/devicetree/bindings/arm/qcom,coresight-tpda.yaml
new file mode 100644
index 000000000000..2ec9b5b24d73
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/qcom,coresight-tpda.yaml
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+# Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/qcom,coresight-tpda.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Trace, Profiling and Diagnostics Aggregator - TPDA
+
+description: |
+ TPDAs are responsible for the packetization and timestamping of data sets,
+ utilizing the MIPI STPv2 packet protocol. They pull data sets from one or
+ more attached TPDMs and push the resultant (packetized) data out a
+ master ATB interface, performing an arbitrated ATB interleaving (funneling)
+ task for free-flowing data from the TPDMs (i.e. CMB and DSB data set flows).
+
+ There is no strict binding between TPDM and TPDA. A TPDA can have multiple
+ TPDMs connected to it, but there must be only one TPDA in the path from a
+ TPDM source to a TMC sink. A TPDM can connect directly to the TPDA's input
+ port, or it can connect to a funnel which in turn connects to the TPDA's
+ input port.
+
+ Commands similar to the following can be used to validate TPDMs; enable
+ the coresight sink first.
+
+ echo 1 > /sys/bus/coresight/devices/tmc_etf0/enable_sink
+ echo 1 > /sys/bus/coresight/devices/tpdm0/enable_source
+ echo 1 > /sys/bus/coresight/devices/tpdm0/integration_test
+ echo 2 > /sys/bus/coresight/devices/tpdm0/integration_test
+
+ The test data will be collected in the enabled coresight sink. If the rwp
+ register of the sink keeps updating while integration_test runs, it means
+ data is flowing from the TPDM to the sink.
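+
+ For example, the sink's write pointer can be polled while the test runs
+ (tmc_etf0 is the example sink name used above; actual device names are
+ platform specific):
+
+ cat /sys/bus/coresight/devices/tmc_etf0/mgmt/rwp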
+
+maintainers:
+ - Mao Jinlong <quic_jinlmao@quicinc.com>
+ - Tao Zhang <quic_taozha@quicinc.com>
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,coresight-tpda
+ required:
+ - compatible
+
+properties:
+ $nodename:
+ pattern: "^tpda(@[0-9a-f]+)$"
+ compatible:
+ items:
+ - const: qcom,coresight-tpda
+ - const: arm,primecell
+
+ reg:
+ minItems: 1
+ maxItems: 2
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: apb_pclk
+
+ in-ports:
+ type: object
+ description: |
+ Input connections from TPDM to TPDA
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ out-ports:
+ type: object
+ description: |
+ Output connections from the TPDA to legacy CoreSight trace bus.
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port:
+ description:
+ Output connection from the TPDA to legacy CoreSight Trace bus.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - in-ports
+ - out-ports
+
+additionalProperties: false
+
+examples:
+ # Minimal TPDA definition.
+ - |
+ tpda@6004000 {
+ compatible = "qcom,coresight-tpda", "arm,primecell";
+ reg = <0x6004000 0x1000>;
+
+ clocks = <&aoss_qmp>;
+ clock-names = "apb_pclk";
+
+ in-ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ tpda_qdss_0_in_tpdm_dcc: endpoint {
+ remote-endpoint =
+ <&tpdm_dcc_out_tpda_qdss_0>;
+ };
+ };
+ };
+
+ out-ports {
+ port {
+ tpda_qdss_out_funnel_in0: endpoint {
+ remote-endpoint =
+ <&funnel_in0_in_tpda_qdss>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml b/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml
new file mode 100644
index 000000000000..5c08342664ea
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/qcom,coresight-tpdm.yaml
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+# Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/qcom,coresight-tpdm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Trace, Profiling and Diagnostics Monitor - TPDM
+
+description: |
+ The TPDM or Monitor serves as a data collection component for the various
+ dataset types specified in the QPMDA spec. It covers Implementation Defined
+ (ImplDef), Basic Counts (BC), Tenure Counts (TC), Continuous Multi-Bit (CMB),
+ and Discrete Single Bit (DSB) data sets. It performs data collection in the
+ data producing clock domain and transfers it to the data collection time
+ domain, generally the ATB clock domain.
+
+ The primary use case of the TPDM is to collect data from different data
+ sources and send it to a TPDA for packetization, timestamping, and funneling.
+
+maintainers:
+ - Mao Jinlong <quic_jinlmao@quicinc.com>
+ - Tao Zhang <quic_taozha@quicinc.com>
+
+# Need a custom select here or 'arm,primecell' will match on lots of nodes
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,coresight-tpdm
+ required:
+ - compatible
+
+properties:
+ $nodename:
+ pattern: "^tpdm(@[0-9a-f]+)$"
+ compatible:
+ items:
+ - const: qcom,coresight-tpdm
+ - const: arm,primecell
+
+ reg:
+ minItems: 1
+ maxItems: 2
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: apb_pclk
+
+ out-ports:
+ description: |
+ Output connections from the TPDM to coresight funnel/TPDA.
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port:
+ description: Output connection from the TPDM to coresight
+ funnel/TPDA.
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ # Minimal TPDM definition. The TPDM connects to the coresight TPDA.
+ - |
+ tpdm@684c000 {
+ compatible = "qcom,coresight-tpdm", "arm,primecell";
+ reg = <0x0684c000 0x1000>;
+
+ clocks = <&aoss_qmp>;
+ clock-names = "apb_pclk";
+
+ out-ports {
+ port {
+ tpdm_prng_out_tpda_qdss: endpoint {
+ remote-endpoint =
+ <&tpda_qdss_in_tpdm_prng>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/qcom-soc.yaml b/Documentation/devicetree/bindings/arm/qcom-soc.yaml
new file mode 100644
index 000000000000..e333ec4a9c5f
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/qcom-soc.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/qcom-soc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SoC compatibles naming convention
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ Guidelines for new compatibles for SoC blocks/components.
+ When adding new compatibles in new bindings, use the format::
+ qcom,SoC-IP
+
+ For example::
+ qcom,sdm845-llcc-bwmon
+
+ When adding new compatibles to existing bindings, use the format in the
+ existing binding, even if it contradicts the above.
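+
+ For instance (sm9999 is a hypothetical SoC name used only for
+ illustration), a compatible added to the existing "qcom,gcc-*" binding
+ keeps the legacy order::
+ qcom,gcc-sm9999
+ while a completely new clock binding would use::
+ qcom,sm9999-gcc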
+
+select:
+ properties:
+ compatible:
+ pattern: "^qcom,.*(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ required:
+ - compatible
+
+properties:
+ compatible:
+ oneOf:
+ # Preferred naming style for compatibles of SoC components:
+ - pattern: "^qcom,(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+-.*$"
+ - pattern: "^qcom,(sa|sc)8[0-9]+[a-z][a-z]?-.*$"
+
+ # Legacy naming - variations of existing patterns/compatibles are OK,
+ # but do not add completely new entries to these:
+ - pattern: "^qcom,[ak]pss-wdt-(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ - pattern: "^qcom,gcc-(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ - pattern: "^qcom,mmcc-(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ - pattern: "^qcom,pcie-(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ - pattern: "^qcom,rpm-(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ - pattern: "^qcom,scm-(apq|ipq|mdm|msm|qcm|qcs|sa|sc|sdm|sdx|sm)[0-9]+.*$"
+ - enum:
+ - qcom,dsi-ctrl-6g-qcm2290
+ - qcom,gpucc-sdm630
+ - qcom,gpucc-sdm660
+ - qcom,lcc-apq8064
+ - qcom,lcc-ipq8064
+ - qcom,lcc-mdm9615
+ - qcom,lcc-msm8960
+ - qcom,lpass-cpu-apq8016
+ - qcom,usb-ss-ipq4019-phy
+ - qcom,usb-hs-ipq4019-phy
+ - qcom,vqmmc-ipq4019-regulator
+
+ # Legacy compatibles with wild-cards - list cannot grow with new bindings:
+ - enum:
+ - qcom,ipq806x-gmac
+ - qcom,ipq806x-nand
+ - qcom,ipq806x-sata-phy
+ - qcom,ipq806x-usb-phy-ss
+ - qcom,ipq806x-usb-phy-hs
+
+additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index 880ddafc634e..d9dd25695c3d 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -4,10 +4,10 @@
$id: http://devicetree.org/schemas/arm/qcom.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: QCOM device tree bindings
+title: QCOM
maintainers:
- - Stephen Boyd <sboyd@codeaurora.org>
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
description: |
Some qcom based bootloaders identify the dtb blob based on a set of
@@ -25,32 +25,69 @@ description: |
The 'SoC' element must be one of the following strings:
apq8016
+ apq8026
apq8074
apq8084
apq8096
+ ipq4018
+ ipq5332
ipq6018
ipq8074
+ ipq9574
mdm9615
msm8226
msm8916
+ msm8939
+ msm8953
+ msm8956
msm8974
+ msm8976
msm8992
msm8994
msm8996
+ msm8998
+ qcs404
+ qcm2290
+ qdu1000
+ qrb2210
+ qrb4210
+ qru1000
sa8155p
+ sa8540p
+ sa8775p
sc7180
sc7280
+ sc8180x
+ sc8280xp
+ sda660
+ sdm450
sdm630
+ sdm632
+ sdm636
sdm660
+ sdm670
sdm845
sdx55
+ sdx65
+ sm4250
+ sm6115
+ sm6115p
+ sm6125
+ sm6350
+ sm6375
+ sm7225
sm8150
sm8250
sm8350
+ sm8450
+ sm8550
The 'board' element must be one of the following strings:
adp
+ ap-al02-c7
+ ap-mi01.2
+ ap-mi01.6
cdp
cp01-c1
dragonboard
@@ -61,7 +98,10 @@ description: |
liquid
mtp
qrd
+ rb2
+ ride
sbc
+ x100
The 'soc_version' and 'board_version' elements take the form of v<Major>.<Minor>
where the minor number may be omitted when it's zero, i.e. v1.0 is the same
@@ -82,6 +122,11 @@ description: |
A dragonboard board v0.1 of subtype 1 with an apq8074 SoC version 2, made in
foundry 2.
+ There are many devices in the list below that run the standard ChromeOS
+ bootloader setup and use the open source depthcharge bootloader to boot the
+ OS. These devices do not use the scheme described above. For details, see:
+ https://docs.kernel.org/arm/google/chromebook-boot-flow.html
+
properties:
$nodename:
const: "/"
@@ -94,6 +139,17 @@ properties:
- items:
- enum:
+ - asus,sparrow
+ - huawei,sturgeon
+ - lg,lenok
+ - samsung,matisse-wifi
+ - const: qcom,apq8026
+
+ - items:
+ - enum:
+ - asus,nexus7-flo
+ - lg,nexus4-mako
+ - sony,xperia-yuga
- qcom,apq8064-cm-qs600
- qcom,apq8064-ifc6410
- const: qcom,apq8064
@@ -122,47 +178,152 @@ properties:
- items:
- enum:
+ - sony,kanuti-tulip
+ - square,apq8039-t2
+ - const: qcom,msm8939
+
+ - items:
+ - enum:
+ - sony,kugo-row
+ - sony,suzu-row
+ - const: qcom,msm8956
+
+ - items:
+ - enum:
- qcom,msm8960-cdp
- const: qcom,msm8960
- items:
- enum:
- - fairphone,fp2
- lge,hammerhead
- sony,xperia-amami
- - sony,xperia-castor
- sony,xperia-honami
- const: qcom,msm8974
- items:
- enum:
- - alcatel,idol347
- - const: qcom,msm8916-mtp/1
+ - fairphone,fp2
+ - oneplus,bacon
+ - samsung,klte
+ - sony,xperia-castor
+ - const: qcom,msm8974pro
+ - const: qcom,msm8974
+
+ - items:
- const: qcom,msm8916-mtp
+ - const: qcom,msm8916-mtp/1
- const: qcom,msm8916
- items:
- enum:
- - longcheer,l8150
+ - acer,a1-724
+ - alcatel,idol347
+ - asus,z00l
+ - gplus,fl8005a
+ - huawei,g7
+ - longcheer,l8910
- samsung,a3u-eur
- samsung,a5u-eur
+ - samsung,e5
+ - samsung,e7
+ - samsung,grandmax
+ - samsung,gt510
+ - samsung,gt58
+ - samsung,j5
+ - samsung,j5x
+ - samsung,serranove
+ - thwc,uf896
+ - thwc,ufi001c
+ - wingtech,wt88047
+ - yiming,uz801-v3
- const: qcom,msm8916
- items:
+ - const: longcheer,l8150
+ - const: qcom,msm8916-v1-qrd/9-v1
+ - const: qcom,msm8916
+
+ - items:
+ - enum:
+ - motorola,potter
+ - xiaomi,daisy
+ - xiaomi,mido
+ - xiaomi,tissot
+ - xiaomi,vince
+ - const: qcom,msm8953
+
+ - items:
+ - enum:
+ - lg,bullhead
+ - microsoft,talkman
+ - xiaomi,libra
+ - const: qcom,msm8992
+
+ - items:
- enum:
- sony,karin_windy
+ - const: qcom,apq8094
+
+ - items:
+ - enum:
+ - huawei,angler
+ - microsoft,cityman
+ - sony,ivy-row
- sony,karin-row
- sony,satsuki-row
- sony,sumire-row
- sony,suzuran-row
- - qcom,msm8994
- - const: qcom,apq8094
+ - const: qcom,msm8994
- items:
- - const: qcom,msm8996-mtp
+ - enum:
+ - arrow,apq8096-db820c
+ - inforce,ifc6640
+ - const: qcom,apq8096-sbc
+ - const: qcom,apq8096
- items:
- enum:
+ - oneplus,oneplus3
+ - oneplus,oneplus3t
+ - qcom,msm8996-mtp
+ - sony,dora-row
+ - sony,kagura-row
+ - sony,keyaki-row
+ - xiaomi,gemini
+ - const: qcom,msm8996
+
+ - items:
+ - enum:
+ - xiaomi,natrium
+ - xiaomi,scorpio
+ - const: qcom,msm8996pro
+ - const: qcom,msm8996
+
+ - items:
+ - enum:
+ - asus,novago-tp370ql
+ - fxtec,pro1
+ - hp,envy-x2
+ - lenovo,miix-630
+ - oneplus,cheeseburger
+ - oneplus,dumpling
+ - qcom,msm8998-mtp
+ - sony,xperia-lilac
+ - sony,xperia-maple
+ - sony,xperia-poplar
+ - xiaomi,sagit
+ - const: qcom,msm8998
+
+ - items:
+ - enum:
+ - 8dev,jalapeno
+ - alfa-network,ap120c-ac
+ - const: qcom,ipq4018
+
+ - items:
+ - enum:
+ - qcom,ipq4019-ap-dk01.1-c1
- qcom,ipq4019-ap-dk04.1-c3
- qcom,ipq4019-ap-dk07.1-c1
- qcom,ipq4019-ap-dk07.1-c2
@@ -171,6 +332,13 @@ properties:
- items:
- enum:
+ - qcom,ipq5332-ap-mi01.2
+ - qcom,ipq5332-ap-mi01.6
+ - const: qcom,ipq5332
+
+ - items:
+ - enum:
+ - mikrotik,rb3011
- qcom,ipq8064-ap148
- const: qcom,ipq8064
@@ -183,16 +351,451 @@ properties:
- items:
- enum:
+ - qcom,ipq9574-ap-al02-c7
+ - const: qcom,ipq9574
+
+ - description: Sierra Wireless MangOH Green with WP8548 Module
+ items:
+ - const: swir,mangoh-green-wp8548
+ - const: swir,wp8548
+ - const: qcom,mdm9615
+
+ - description: Qualcomm Technologies, Inc. Robotics RB1
+ items:
+ - enum:
+ - qcom,qrb2210-rb1
+ - const: qcom,qrb2210
+ - const: qcom,qcm2290
+
+ - description: Qualcomm Technologies, Inc. Distributed Unit 1000 platform
+ items:
+ - enum:
+ - qcom,qdu1000-idp
+ - qcom,qdu1000-x100
+ - const: qcom,qdu1000
+
+ - description: Qualcomm Technologies, Inc. Radio Unit 1000 platform
+ items:
+ - enum:
+ - qcom,qru1000-idp
+ - const: qcom,qru1000
+
+ - description: Qualcomm Technologies, Inc. SC7180 IDP
+ items:
+ - enum:
- qcom,sc7180-idp
- const: qcom,sc7180
+ - description: HP Chromebook x2 11c (rev1 - 2)
+ items:
+ - const: google,coachz-rev1
+ - const: google,coachz-rev2
+ - const: qcom,sc7180
+
+ - description: HP Chromebook x2 11c (newest rev)
+ items:
+ - const: google,coachz
+ - const: qcom,sc7180
+
+ - description: HP Chromebook x2 11c with LTE (rev1 - 2)
+ items:
+ - const: google,coachz-rev1-sku0
+ - const: google,coachz-rev2-sku0
+ - const: qcom,sc7180
+
+ - description: HP Chromebook x2 11c with LTE (newest rev)
+ items:
+ - const: google,coachz-sku0
+ - const: qcom,sc7180
+
+ - description: Lenovo Chromebook Duet 5 13 (rev2)
+ items:
+ - const: google,homestar-rev2
+ - const: google,homestar-rev23
+ - const: qcom,sc7180
+
+ - description: Lenovo Chromebook Duet 5 13 (rev3)
+ items:
+ - const: google,homestar-rev3
+ - const: qcom,sc7180
+
+ - description: Lenovo Chromebook Duet 5 13 (newest rev)
+ items:
+ - const: google,homestar
+ - const: qcom,sc7180
+
+ - description: Google Kingoftown (rev0)
+ items:
+ - const: google,kingoftown-rev0
+ - const: qcom,sc7180
+
+ - description: Google Kingoftown (newest rev)
+ items:
+ - const: google,kingoftown
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 (rev0)
+ items:
+ - const: google,lazor-rev0
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 (rev1 - 2)
+ items:
+ - const: google,lazor-rev1
+ - const: google,lazor-rev2
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 (rev3 - 8)
+ items:
+ - const: google,lazor-rev3
+ - const: google,lazor-rev4
+ - const: google,lazor-rev5
+ - const: google,lazor-rev6
+ - const: google,lazor-rev7
+ - const: google,lazor-rev8
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 (newest rev)
+ items:
+ - const: google,lazor
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 with KB Backlight (rev1 - 2)
+ items:
+ - const: google,lazor-rev1-sku2
+ - const: google,lazor-rev2-sku2
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 with KB Backlight (rev3 - 8)
+ items:
+ - const: google,lazor-rev3-sku2
+ - const: google,lazor-rev4-sku2
+ - const: google,lazor-rev5-sku2
+ - const: google,lazor-rev6-sku2
+ - const: google,lazor-rev7-sku2
+ - const: google,lazor-rev8-sku2
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 with KB Backlight (newest rev)
+ items:
+ - const: google,lazor-sku2
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 with LTE (rev1 - 2)
+ items:
+ - const: google,lazor-rev1-sku0
+ - const: google,lazor-rev2-sku0
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 with LTE (rev3 - 8)
+ items:
+ - const: google,lazor-rev3-sku0
+ - const: google,lazor-rev4-sku0
+ - const: google,lazor-rev5-sku0
+ - const: google,lazor-rev6-sku0
+ - const: google,lazor-rev7-sku0
+ - const: google,lazor-rev8-sku0
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook Spin 513 with LTE (newest rev)
+ items:
+ - const: google,lazor-sku0
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook 511 (rev4 - rev8)
+ items:
+ - const: google,lazor-rev4-sku4
+ - const: google,lazor-rev5-sku4
+ - const: google,lazor-rev6-sku4
+ - const: google,lazor-rev7-sku4
+ - const: google,lazor-rev8-sku4
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook 511 (newest rev)
+ items:
+ - const: google,lazor-sku4
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook 511 without Touchscreen (rev4)
+ items:
+ - const: google,lazor-rev4-sku5
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook 511 without Touchscreen (rev5 - rev8)
+ items:
+ - const: google,lazor-rev5-sku5
+ - const: google,lazor-rev5-sku6
+ - const: google,lazor-rev6-sku6
+ - const: google,lazor-rev7-sku6
+ - const: google,lazor-rev8-sku6
+ - const: qcom,sc7180
+
+ - description: Acer Chromebook 511 without Touchscreen (newest rev)
+ items:
+ - const: google,lazor-sku6
+ - const: qcom,sc7180
+
+ - description: Google Mrbland with AUO panel (rev0)
+ items:
+ - const: google,mrbland-rev0-sku0
+ - const: qcom,sc7180
+
+ - description: Google Mrbland with AUO panel (newest rev)
+ items:
+ - const: google,mrbland-sku1536
+ - const: qcom,sc7180
+
+ - description: Google Mrbland with BOE panel (rev0)
+ items:
+ - const: google,mrbland-rev0-sku16
+ - const: qcom,sc7180
+
+ - description: Google Mrbland with BOE panel (newest rev)
+ items:
+ - const: google,mrbland-sku1024
+ - const: google,mrbland-sku768
+ - const: qcom,sc7180
+
+ - description: Google Pazquel with Parade (newest rev)
+ items:
+ - const: google,pazquel-sku5
+ - const: qcom,sc7180
+
+ - description: Google Pazquel with TI (newest rev)
+ items:
+ - const: google,pazquel-sku1
+ - const: qcom,sc7180
+
+ - description: Google Pazquel with LTE and Parade (newest rev)
+ items:
+ - const: google,pazquel-sku6
+ - const: google,pazquel-sku4
+ - const: qcom,sc7180
+
+ - description: Google Pazquel with LTE and TI (newest rev)
+ items:
+ - const: google,pazquel-sku0
+ - const: google,pazquel-sku2
+ - const: qcom,sc7180
+
+ - description: Google Pazquel360 with LTE (newest rev)
+ items:
+ - const: google,pazquel-sku22
+ - const: google,pazquel-sku20
+ - const: qcom,sc7180
+
+ - description: Google Pazquel360 with WiFi (newest rev)
+ items:
+ - const: google,pazquel-sku21
+ - const: qcom,sc7180
+
+ - description: Sharp Dynabook Chromebook C1 (rev1)
+ items:
+ - const: google,pompom-rev1
+ - const: qcom,sc7180
+
+ - description: Sharp Dynabook Chromebook C1 (rev2)
+ items:
+ - const: google,pompom-rev2
+ - const: qcom,sc7180
+
+ - description: Sharp Dynabook Chromebook C1 (newest rev)
+ items:
+ - const: google,pompom
+ - const: qcom,sc7180
+
+ - description: Sharp Dynabook Chromebook C1 with LTE (rev1)
+ items:
+ - const: google,pompom-rev1-sku0
+ - const: qcom,sc7180
+
+ - description: Sharp Dynabook Chromebook C1 with LTE (rev2)
+ items:
+ - const: google,pompom-rev2-sku0
+ - const: qcom,sc7180
+
+ - description: Sharp Dynabook Chromebook C1 with LTE (newest rev)
+ items:
+ - const: google,pompom-sku0
+ - const: qcom,sc7180
+
+ - description: Google Quackingstick (newest rev)
+ items:
+ - const: google,quackingstick-sku1537
+ - const: qcom,sc7180
+
+ - description: Google Quackingstick with LTE (newest rev)
+ items:
+ - const: google,quackingstick-sku1536
+ - const: qcom,sc7180
+
+ - description: Google Trogdor (newest rev)
+ items:
+ - const: google,trogdor
+ - const: qcom,sc7180
+
+ - description: Google Trogdor with LTE (newest rev)
+ items:
+ - const: google,trogdor-sku0
+ - const: qcom,sc7180
+
+ - description: Lenovo IdeaPad Chromebook Duet 3 with BOE panel (rev0)
+ items:
+ - const: google,wormdingler-rev0-sku16
+ - const: qcom,sc7180
+
+ - description: Lenovo IdeaPad Chromebook Duet 3 with BOE panel (newest rev)
+ items:
+ - const: google,wormdingler-sku1024
+ - const: qcom,sc7180
+
+ - description: Lenovo IdeaPad Chromebook Duet 3 with BOE panel and rt5682s (newest rev)
+ items:
+ - const: google,wormdingler-sku1025
+ - const: qcom,sc7180
+
+ - description: Lenovo IdeaPad Chromebook Duet 3 with INX panel (rev0)
+ items:
+ - const: google,wormdingler-rev0-sku0
+ - const: qcom,sc7180
+
+ - description: Lenovo IdeaPad Chromebook Duet 3 with INX panel (newest rev)
+ items:
+ - const: google,wormdingler-sku0
+ - const: qcom,sc7180
+
+ - description: Lenovo IdeaPad Chromebook Duet 3 with INX panel and rt5682s (newest rev)
+ items:
+ - const: google,wormdingler-sku1
+ - const: qcom,sc7180
+
+ - description: Qualcomm Technologies, Inc. sc7280 CRD platform (rev3 - 4)
+ items:
+ - const: qcom,sc7280-crd
+ - const: google,hoglin-rev3
+ - const: google,hoglin-rev4
+ - const: google,piglin-rev3
+ - const: google,piglin-rev4
+ - const: qcom,sc7280
+
+ - description: Qualcomm Technologies, Inc. sc7280 CRD platform (newest rev)
+ items:
+ - const: google,zoglin
+ - const: google,hoglin
+ - const: qcom,sc7280
+
+ - description: Qualcomm Technologies, Inc. sc7280 CRD Pro platform (newest rev)
+ items:
+ - const: google,zoglin-sku1536
+ - const: google,hoglin-sku1536
+ - const: qcom,sc7280
+
+ - description: Qualcomm Technologies, Inc. sc7280 IDP SKU1 platform
+ items:
+ - const: qcom,sc7280-idp
+ - const: google,senor
+ - const: qcom,sc7280
+
+ - description: Qualcomm Technologies, Inc. sc7280 IDP SKU2 platform
+ items:
+ - const: qcom,sc7280-idp2
+ - const: google,piglin
+ - const: qcom,sc7280
+
+ - description: Google Evoker (newest rev)
+ items:
+ - const: google,evoker
+ - const: qcom,sc7280
+
+ - description: Google Evoker with LTE (newest rev)
+ items:
+ - const: google,evoker-sku512
+ - const: qcom,sc7280
+
+ - description: Google Herobrine (newest rev)
+ items:
+ - const: google,herobrine
+ - const: qcom,sc7280
+
+ - description: Google Villager (rev0)
+ items:
+ - const: google,villager-rev0
+ - const: qcom,sc7280
+
+ - description: Google Villager (newest rev)
+ items:
+ - const: google,villager
+ - const: qcom,sc7280
+
+ - description: Google Villager with LTE (newest rev)
+ items:
+ - const: google,villager-sku512
+ - const: qcom,sc7280
+
+ - description: Google Zombie (newest rev)
+ items:
+ - const: google,zombie
+ - const: qcom,sc7280
+
+ - description: Google Zombie with LTE (newest rev)
+ items:
+ - const: google,zombie-sku512
+ - const: qcom,sc7280
+
+ - description: Google Zombie with NVMe (newest rev)
+ items:
+ - const: google,zombie-sku2
+ - const: google,zombie-sku3
+ - const: google,zombie-sku515
+ - const: qcom,sc7280
+
+ - description: Google Zombie with LTE and NVMe (newest rev)
+ items:
+ - const: google,zombie-sku514
+ - const: qcom,sc7280
+
- items:
- enum:
- - qcom,sc7280-idp
- - qcom,sc7280-idp2
- - google,piglin
- - google,senor
- - const: qcom,sc7280
+ - lenovo,flex-5g
+ - microsoft,surface-prox
+ - qcom,sc8180x-primus
+ - const: qcom,sc8180x
+
+ - items:
+ - enum:
+ - lenovo,thinkpad-x13s
+ - qcom,sc8280xp-crd
+ - qcom,sc8280xp-qrd
+ - const: qcom,sc8280xp
+
+ - items:
+ - enum:
+ - motorola,ali
+ - const: qcom,sdm450
+
+ - items:
+ - enum:
+ - sony,discovery-row
+ - sony,kirin-row
+ - sony,pioneer-row
+ - sony,voyager-row
+ - const: qcom,sdm630
+
+ - items:
+ - enum:
+ - inforce,ifc6560
+ - const: qcom,sda660
+
+ - items:
+ - enum:
+ - fairphone,fp3
+ - motorola,ocean
+ - const: qcom,sdm632
+
+ - items:
+ - enum:
+ - sony,mermaid-row
+ - const: qcom,sdm636
- items:
- enum:
@@ -201,6 +804,11 @@ properties:
- items:
- enum:
+ - google,sargo
+ - const: qcom,sdm670
+
+ - items:
+ - enum:
- qcom,sdx55-mtp
- qcom,sdx55-telit-fn980-tlb
- qcom,sdx55-t55
@@ -208,32 +816,270 @@ properties:
- items:
- enum:
+ - qcom,sdx65-mtp
+ - const: qcom,sdx65
+
+ - items:
+ - enum:
- qcom,ipq6018-cp01
- qcom,ipq6018-cp01-c1
- const: qcom,ipq6018
- items:
- enum:
+ - qcom,qcs404-evb-1000
+ - qcom,qcs404-evb-4000
+ - const: qcom,qcs404-evb
+ - const: qcom,qcs404
+
+ - items:
+ - enum:
- qcom,sa8155p-adp
- const: qcom,sa8155p
- items:
- enum:
+ - qcom,sa8295p-adp
+ - qcom,sa8540p-ride
+ - const: qcom,sa8540p
+
+ - items:
+ - enum:
+ - qcom,sa8775p-ride
+ - const: qcom,sa8775p
+
+ - items:
+ - enum:
+ - google,cheza
+ - google,cheza-rev1
+ - google,cheza-rev2
+ - lenovo,yoga-c630
+ - lg,judyln
+ - lg,judyp
+ - oneplus,enchilada
+ - oneplus,fajita
+ - qcom,sdm845-mtp
+ - shift,axolotl
+ - samsung,starqltechn
+ - samsung,w737
+ - sony,akari-row
+ - sony,akatsuki-row
+ - sony,apollo-row
+ - thundercomm,db845c
+ - xiaomi,beryllium
+ - xiaomi,beryllium-ebbg
+ - xiaomi,polaris
+ - const: qcom,sdm845
+
+ - items:
+ - enum:
+ - oneplus,billie2
+ - const: qcom,sm4250
+
+ - items:
+ - enum:
+ - qcom,qrb4210-rb2
+ - const: qcom,qrb4210
+ - const: qcom,sm4250
+
+ - items:
+ - enum:
+ - lenovo,j606f
+ - const: qcom,sm6115p
+ - const: qcom,sm6115
+
+ - items:
+ - enum:
+ - sony,pdx201
+ - xiaomi,laurel-sprout
+ - const: qcom,sm6125
+
+ - items:
+ - enum:
+ - sony,pdx213
+ - const: qcom,sm6350
+
+ - items:
+ - enum:
+ - sony,pdx225
+ - const: qcom,sm6375
+
+ - items:
+ - enum:
+ - fairphone,fp4
+ - const: qcom,sm7225
+
+ - items:
+ - enum:
+ - microsoft,surface-duo
+ - qcom,sm8150-hdk
- qcom,sm8150-mtp
+ - sony,bahamut-generic
+ - sony,griffin-generic
- const: qcom,sm8150
- items:
- enum:
- qcom,qrb5165-rb5
+ - qcom,sm8250-hdk
- qcom,sm8250-mtp
+ - sony,pdx203-generic
+ - sony,pdx206-generic
+ - xiaomi,elish
- const: qcom,sm8250
- items:
- enum:
+ - microsoft,surface-duo2
- qcom,sm8350-hdk
- qcom,sm8350-mtp
+ - sony,pdx214-generic
+ - sony,pdx215-generic
- const: qcom,sm8350
+ - items:
+ - enum:
+ - qcom,sm8450-hdk
+ - qcom,sm8450-qrd
+ - sony,pdx223
+ - sony,pdx224
+ - const: qcom,sm8450
+
+ - items:
+ - enum:
+ - qcom,sm8550-mtp
+ - qcom,sm8550-qrd
+ - const: qcom,sm8550
+
+ # Board compatibles go above
+
+ qcom,msm-id:
+ $ref: /schemas/types.yaml#/definitions/uint32-matrix
+ minItems: 1
+ maxItems: 8
+ items:
+ items:
+ - description: |
+ MSM chipset ID - an exact match value consisting of two bitfields::
+ - bits 0-15 - The unique MSM chipset ID
+ - bits 16-31 - Reserved; should be 0
+ - description: |
+ Hardware revision ID - a chipset specific 32-bit ID representing
+ the version of the chipset. It is a best match value - the
+ bootloader will look for the closest possible match.
+ deprecated: true
+ description:
+ The MSM chipset and hardware revision used by Qualcomm bootloaders. It
+ can optionally be an array of these to indicate multiple hardware that
+ use the same device tree. It is expected that the bootloader will use
+ this information at boot-up to decide which device tree to use when given
+ multiple device trees, some of which may not be compatible with the
+ actual hardware. It is the bootloader's responsibility to pass the
+ correct device tree to the kernel.
+ The property is deprecated.
+
+ qcom,board-id:
+ $ref: /schemas/types.yaml#/definitions/uint32-matrix
+ minItems: 1
+ maxItems: 8
+ oneOf:
+ - items:
+ - items:
+ - description: |
+ Board ID consisting of three bitfields::
+ - bits 31-24 - Unused
+ - bits 23-16 - Platform Version Major
+ - bits 15-8 - Platform Version Minor
+ - bits 7-0 - Platform Type
+ Platform Type field is an exact match value. The
+ Platform Major/Minor field is a best match. The bootloader will
+ look for the closest possible match.
+ - description: |
+ Subtype ID unique to a Platform Type/Chipset ID. For a given
+ Platform Type, there will typically only be a single board and the
+ subtype_id will be 0. However in some cases board variants may
+ need to be distinguished by different subtype_id values.
+ - items:
+ # OnePlus uses a variant of board-id with four elements:
+ - items:
+ - const: 8
+ - const: 0
+ - description: OnePlus board ID
+ - description: OnePlus subtype ID
+ deprecated: true
+ description:
+ The board type and revision information. It can optionally be an array
+ of these to indicate multiple boards that use the same device tree. It
+ is expected that the bootloader will use this information at boot-up to
+ decide which device tree to use when given multiple device trees, some of
+ which may not be compatible with the actual hardware. It is the
+ bootloader's responsibility to pass the correct device tree to the
+ kernel.
+ The property is deprecated.
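+
+ # A minimal sketch of how these two deprecated properties appear in a
+ # board DTS; the values are hypothetical, not taken from any real board:
+ #
+ # / {
+ # qcom,msm-id = <0xce 0x20000>; /* chipset ID, hardware revision */
+ # qcom,board-id = <8 0>; /* platform type, subtype */
+ # };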
+
+allOf:
+ # Explicit allow-list for older SoCs. The legacy properties are not allowed
+ # on newer SoCs.
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,apq8026
+ - qcom,apq8094
+ - qcom,apq8096
+ - qcom,msm8939
+ - qcom,msm8953
+ - qcom,msm8956
+ - qcom,msm8992
+ - qcom,msm8994
+ - qcom,msm8996
+ - qcom,msm8998
+ - qcom,sdm450
+ - qcom,sdm630
+ - qcom,sdm632
+ - qcom,sdm636
+ - qcom,sdm845
+ - qcom,sdx55
+ - qcom,sdx65
+ - qcom,sm4250
+ - qcom,sm6115
+ - qcom,sm6125
+ - qcom,sm6350
+ - qcom,sm7225
+ - qcom,sm8150
+ - qcom,sm8250
+ then:
+ properties:
+ qcom,board-id: true
+ qcom,msm-id: true
+ else:
+ properties:
+ qcom,board-id: false
+ qcom,msm-id: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - oneplus,cheeseburger
+ - oneplus,dumpling
+ - oneplus,enchilada
+ - oneplus,fajita
+ - oneplus,oneplus3
+ - oneplus,oneplus3t
+ then:
+ properties:
+ qcom,board-id:
+ items:
+ minItems: 4
+ else:
+ properties:
+ qcom,board-id:
+ items:
+ maxItems: 2
+
additionalProperties: true
...
diff --git a/Documentation/devicetree/bindings/arm/rda.yaml b/Documentation/devicetree/bindings/arm/rda.yaml
index a5c0444aa2b4..09241ea1d228 100644
--- a/Documentation/devicetree/bindings/arm/rda.yaml
+++ b/Documentation/devicetree/bindings/arm/rda.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/rda.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: RDA Micro platforms device tree bindings
+title: RDA Micro platforms
maintainers:
- Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
diff --git a/Documentation/devicetree/bindings/arm/realtek.yaml b/Documentation/devicetree/bindings/arm/realtek.yaml
index 9fb0297fe1ce..ddd9a85099e9 100644
--- a/Documentation/devicetree/bindings/arm/realtek.yaml
+++ b/Documentation/devicetree/bindings/arm/realtek.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/realtek.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Realtek platforms device tree bindings
+title: Realtek platforms
maintainers:
- Andreas Färber <afaerber@suse.de>
diff --git a/Documentation/devicetree/bindings/arm/renesas.yaml b/Documentation/devicetree/bindings/arm/renesas.yaml
deleted file mode 100644
index 8a11918866b8..000000000000
--- a/Documentation/devicetree/bindings/arm/renesas.yaml
+++ /dev/null
@@ -1,353 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/renesas.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Renesas SH-Mobile, R-Mobile, and R-Car Platform Device Tree Bindings
-
-maintainers:
- - Geert Uytterhoeven <geert+renesas@glider.be>
-
-properties:
- $nodename:
- const: '/'
- compatible:
- oneOf:
- - description: Emma Mobile EV2
- items:
- - enum:
- - renesas,kzm9d # Kyoto Microcomputer Co. KZM-A9-Dual
- - const: renesas,emev2
-
- - description: RZ/A1H (R7S72100)
- items:
- - enum:
- - renesas,genmai # Genmai (RTK772100BC00000BR)
- - renesas,gr-peach # GR-Peach (X28A-M01-E/F)
- - renesas,rskrza1 # RSKRZA1 (YR0K77210C000BE)
- - const: renesas,r7s72100
-
- - description: RZ/A2 (R7S9210)
- items:
- - enum:
- - renesas,rza2mevb # RZ/A2M Eval Board (RTK7921053S00000BE)
- - const: renesas,r7s9210
-
- - description: SH-Mobile AG5 (R8A73A00/SH73A0)
- items:
- - enum:
- - renesas,kzm9g # Kyoto Microcomputer Co. KZM-A9-GT
- - const: renesas,sh73a0
-
- - description: R-Mobile APE6 (R8A73A40)
- items:
- - enum:
- - renesas,ape6evm
- - const: renesas,r8a73a4
-
- - description: R-Mobile A1 (R8A77400)
- items:
- - enum:
- - renesas,armadillo800eva # Atmark Techno Armadillo-800 EVA
- - const: renesas,r8a7740
-
- - description: RZ/G1H (R8A77420)
- items:
- - enum:
- # iWave Systems RZ/G1H Qseven System On Module (iW-RainboW-G21M-Qseven)
- - iwave,g21m
- - const: renesas,r8a7742
-
- - items:
- - enum:
- # iWave Systems RZ/G1H Qseven Development Platform (iW-RainboW-G21D-Qseven)
- - iwave,g21d
- - const: iwave,g21m
- - const: renesas,r8a7742
-
- - description: RZ/G1M (R8A77430)
- items:
- - enum:
- # iWave Systems RZ/G1M Qseven Development Platform (iW-RainboW-G20D-Qseven)
- - iwave,g20d
- - const: iwave,g20m
- - const: renesas,r8a7743
-
- - items:
- - enum:
- # iWave Systems RZ/G1M Qseven System On Module (iW-RainboW-G20M-Qseven)
- - iwave,g20m
- - renesas,sk-rzg1m # SK-RZG1M (YR8A77430S000BE)
- - const: renesas,r8a7743
-
- - description: RZ/G1N (R8A77440)
- items:
- - enum:
- # iWave Systems RZ/G1N Qseven Development Platform (iW-RainboW-G20D-Qseven)
- - iwave,g20d
- - const: iwave,g20m
- - const: renesas,r8a7744
-
- - items:
- - enum:
- # iWave Systems RZ/G1N Qseven System On Module (iW-RainboW-G20M-Qseven)
- - iwave,g20m
- - const: renesas,r8a7744
-
- - description: RZ/G1E (R8A77450)
- items:
- - enum:
- - iwave,g22m # iWave Systems RZ/G1E SODIMM System On Module (iW-RainboW-G22M-SM)
- - renesas,sk-rzg1e # SK-RZG1E (YR8A77450S000BE)
- - const: renesas,r8a7745
-
- - description: iWave Systems RZ/G1E SODIMM SOM Development Platform (iW-RainboW-G22D)
- items:
- - const: iwave,g22d
- - const: iwave,g22m
- - const: renesas,r8a7745
-
- - description: RZ/G1C (R8A77470)
- items:
- - enum:
- - iwave,g23s #iWave Systems RZ/G1C Single Board Computer (iW-RainboW-G23S)
- - const: renesas,r8a77470
-
- - description: RZ/G2M (R8A774A1)
- items:
- - enum:
- - hoperun,hihope-rzg2m # HopeRun HiHope RZ/G2M platform
- - beacon,beacon-rzg2m # Beacon EmbeddedWorks RZ/G2M Kit
- - const: renesas,r8a774a1
-
- - items:
- - enum:
- - hoperun,hihope-rzg2-ex # HopeRun expansion board for HiHope RZ/G2 platforms
- - const: hoperun,hihope-rzg2m
- - const: renesas,r8a774a1
-
- - description: RZ/G2N (R8A774B1)
- items:
- - enum:
- - beacon,beacon-rzg2n # Beacon EmbeddedWorks RZ/G2N Kit
- - hoperun,hihope-rzg2n # HopeRun HiHope RZ/G2N platform
- - const: renesas,r8a774b1
-
- - items:
- - enum:
- - hoperun,hihope-rzg2-ex # HopeRun expansion board for HiHope RZ/G2 platforms
- - const: hoperun,hihope-rzg2n
- - const: renesas,r8a774b1
-
- - description: RZ/G2E (R8A774C0)
- items:
- - enum:
- - si-linux,cat874 # Silicon Linux RZ/G2E 96board platform (CAT874)
- - const: renesas,r8a774c0
-
- - items:
- - enum:
- - si-linux,cat875 # Silicon Linux sub board for CAT874 (CAT875)
- - const: si-linux,cat874
- - const: renesas,r8a774c0
-
- - description: RZ/G2H (R8A774E1)
- items:
- - enum:
- - beacon,beacon-rzg2h # Beacon EmbeddedWorks RZ/G2H Kit
- - hoperun,hihope-rzg2h # HopeRun HiHope RZ/G2H platform
- - const: renesas,r8a774e1
-
- - items:
- - enum:
- - hoperun,hihope-rzg2-ex # HopeRun expansion board for HiHope RZ/G2 platforms
- - const: hoperun,hihope-rzg2h
- - const: renesas,r8a774e1
-
- - description: R-Car M1A (R8A77781)
- items:
- - enum:
- - renesas,bockw
- - const: renesas,r8a7778
-
- - description: R-Car H1 (R8A77790)
- items:
- - enum:
- - renesas,marzen # Marzen (R0P7779A00010S)
- - const: renesas,r8a7779
-
- - description: R-Car H2 (R8A77900)
- items:
- - enum:
- - renesas,lager # Lager (RTP0RC7790SEB00010S)
- - renesas,stout # Stout (ADAS Starterkit, Y-R-CAR-ADAS-SKH2-BOARD)
- - const: renesas,r8a7790
-
- - description: R-Car M2-W (R8A77910)
- items:
- - enum:
- - renesas,henninger
- - renesas,koelsch # Koelsch (RTP0RC7791SEB00010S)
- - renesas,porter # Porter (M2-LCDP)
- - const: renesas,r8a7791
-
- - description: R-Car V2H (R8A77920)
- items:
- - enum:
- - renesas,blanche # Blanche (RTP0RC7792SEB00010S)
- - renesas,wheat # Wheat (RTP0RC7792ASKB0000JE)
- - const: renesas,r8a7792
-
- - description: R-Car M2-N (R8A77930)
- items:
- - enum:
- - renesas,gose # Gose (RTP0RC7793SEB00010S)
- - const: renesas,r8a7793
-
- - description: R-Car E2 (R8A77940)
- items:
- - enum:
- - renesas,alt # Alt (RTP0RC7794SEB00010S)
- - renesas,silk # SILK (RTP0RC7794LCB00011S)
- - const: renesas,r8a7794
-
- - description: R-Car H3 (R8A77950)
- items:
- - enum:
- # H3ULCB (R-Car Starter Kit Premier, RTP0RC7795SKBX0010SA00 (H3 ES1.1))
- # H3ULCB (R-Car Starter Kit Premier, RTP0RC77951SKBX010SA00 (H3 ES2.0))
- - renesas,h3ulcb
- - renesas,salvator-x # Salvator-X (RTP0RC7795SIPB0010S)
- - renesas,salvator-xs # Salvator-XS (Salvator-X 2nd version, RTP0RC7795SIPB0012S)
- - const: renesas,r8a7795
-
- - description: R-Car M3-W (R8A77960)
- items:
- - enum:
- - renesas,m3ulcb # M3ULCB (R-Car Starter Kit Pro, RTP0RC7796SKBX0010SA09 (M3 ES1.0))
- - renesas,salvator-x # Salvator-X (RTP0RC7796SIPB0011S)
- - renesas,salvator-xs # Salvator-XS (Salvator-X 2nd version, RTP0RC7796SIPB0012S)
- - const: renesas,r8a7796
-
- - description: R-Car M3-W+ (R8A77961)
- items:
- - enum:
- - renesas,m3ulcb # M3ULCB (R-Car Starter Kit Pro, RTP8J77961ASKB0SK0SA05A (M3 ES3.0))
- - renesas,salvator-xs # Salvator-XS (Salvator-X 2nd version, RTP0RC7796SIPB0012SA5A)
- - const: renesas,r8a77961
-
- - description: Kingfisher (SBEV-RCAR-KF-M03)
- oneOf:
- - items:
- - const: shimafuji,kingfisher
- - enum:
- - renesas,h3ulcb
- - renesas,m3ulcb
- - renesas,m3nulcb
- - enum:
- - renesas,r8a7795
- - renesas,r8a7796
- - renesas,r8a77961
- - renesas,r8a77965
- - items:
- - const: shimafuji,kingfisher
- - enum:
- - renesas,h3ulcb
- - renesas,m3ulcb
- - enum:
- - renesas,r8a779m1
- - renesas,r8a779m3
- - enum:
- - renesas,r8a7795
- - renesas,r8a77961
-
- - description: R-Car M3-N (R8A77965)
- items:
- - enum:
- - renesas,m3nulcb # M3NULCB (R-Car Starter Kit Pro, RTP0RC77965SKBX010SA00 (M3-N ES1.1))
- - renesas,salvator-x # Salvator-X (RTP0RC7796SIPB0011S (M3-N))
- - renesas,salvator-xs # Salvator-XS (Salvator-X 2nd version, RTP0RC77965SIPB012S)
- - const: renesas,r8a77965
-
- - description: R-Car V3M (R8A77970)
- items:
- - enum:
- - renesas,eagle # Eagle (RTP0RC77970SEB0010S)
- - renesas,v3msk # V3MSK (Y-ASK-RCAR-V3M-WS10)
- - const: renesas,r8a77970
-
- - description: R-Car V3H (R8A77980)
- items:
- - enum:
- - renesas,condor # Condor (RTP0RC77980SEB0010SS/RTP0RC77980SEB0010SA01)
- - renesas,v3hsk # V3HSK (Y-ASK-RCAR-V3H-WS10)
- - const: renesas,r8a77980
-
- - description: R-Car E3 (R8A77990)
- items:
- - enum:
- - renesas,ebisu # Ebisu (RTP0RC77990SEB0010S)
- - const: renesas,r8a77990
-
- - description: R-Car D3 (R8A77995)
- items:
- - enum:
- - renesas,draak # Draak (RTP0RC77995SEB0010S)
- - const: renesas,r8a77995
-
- - description: R-Car V3U (R8A779A0)
- items:
- - enum:
- - renesas,falcon-cpu # Falcon CPU board (RTP0RC779A0CPB0010S)
- - const: renesas,r8a779a0
-
- - items:
- - enum:
- - renesas,falcon-breakout # Falcon BreakOut board (RTP0RC779A0BOB0010S)
- - const: renesas,falcon-cpu
- - const: renesas,r8a779a0
-
- - description: R-Car H3e-2G (R8A779M1)
- items:
- - enum:
- - renesas,h3ulcb # H3ULCB (R-Car Starter Kit Premier)
- - renesas,salvator-xs # Salvator-XS (Salvator-X 2nd version)
- - const: renesas,r8a779m1
- - const: renesas,r8a7795
-
- - description: R-Car M3e-2G (R8A779M3)
- items:
- - enum:
- - renesas,m3ulcb # M3ULCB (R-Car Starter Kit Pro)
- - renesas,salvator-xs # Salvator-XS (Salvator-X 2nd version)
- - const: renesas,r8a779m3
- - const: renesas,r8a77961
-
- - description: RZ/N1D (R9A06G032)
- items:
- - enum:
- - renesas,rzn1d400-db # RZN1D-DB (RZ/N1D Demo Board for the RZ/N1D 400 pins package)
- - const: renesas,r9a06g032
-
- - description: RZ/G2UL (R9A07G043)
- items:
- - enum:
- - renesas,r9a07g043u11 # RZ/G2UL Type-1
- - renesas,r9a07g043u12 # RZ/G2UL Type-2
- - const: renesas,r9a07g043
-
- - description: RZ/G2{L,LC} (R9A07G044)
- items:
- - enum:
- - renesas,smarc-evk # SMARC EVK
- - enum:
- - renesas,r9a07g044c1 # Single Cortex-A55 RZ/G2LC
- - renesas,r9a07g044c2 # Dual Cortex-A55 RZ/G2LC
- - renesas,r9a07g044l1 # Single Cortex-A55 RZ/G2L
- - renesas,r9a07g044l2 # Dual Cortex-A55 RZ/G2L
- - const: renesas,r9a07g044
-
-additionalProperties: true
-
-...
diff --git a/Documentation/devicetree/bindings/arm/rockchip.yaml b/Documentation/devicetree/bindings/arm/rockchip.yaml
index 6546b015fc62..ec141c937b8b 100644
--- a/Documentation/devicetree/bindings/arm/rockchip.yaml
+++ b/Documentation/devicetree/bindings/arm/rockchip.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/rockchip.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Rockchip platforms device tree bindings
+title: Rockchip platforms
maintainers:
- Heiko Stuebner <heiko@sntech.de>
@@ -30,6 +30,31 @@ properties:
- const: amarula,vyasa-rk3288
- const: rockchip,rk3288
+ - description: Anbernic RG351M
+ items:
+ - const: anbernic,rg351m
+ - const: rockchip,rk3326
+
+ - description: Anbernic RG353P
+ items:
+ - const: anbernic,rg353p
+ - const: rockchip,rk3566
+
+ - description: Anbernic RG353V
+ items:
+ - const: anbernic,rg353v
+ - const: rockchip,rk3566
+
+ - description: Anbernic RG353VS
+ items:
+ - const: anbernic,rg353vs
+ - const: rockchip,rk3566
+
+ - description: Anbernic RG503
+ items:
+ - const: anbernic,rg503
+ - const: rockchip,rk3566
+
- description: Asus Tinker board
items:
- const: asus,rk3288-tinker
@@ -65,11 +90,33 @@ properties:
- const: chipspark,rayeager-px2
- const: rockchip,rk3066a
+ - description: Edgeble Neural Compute Module 2 (Neu2) SoM based boards
+ items:
+ - const: edgeble,neural-compute-module-2-io # Edgeble Neural Compute Module 2 IO Board
+ - const: edgeble,neural-compute-module-2 # Edgeble Neural Compute Module 2 SoM
+ - const: rockchip,rv1126
+
+ - description: Edgeble Neural Compute Module 6 (Neu6) Model A SoM based boards
+ items:
+ - const: edgeble,neural-compute-module-6a-io # Edgeble Neural Compute Module 6A IO Board
+ - const: edgeble,neural-compute-module-6a # Edgeble Neural Compute Module 6A SoM
+ - const: rockchip,rk3588
+
- description: Elgin RV1108 R1
items:
- const: elgin,rv1108-r1
- const: rockchip,rv1108
+ - description: EmbedFire LubanCat 1
+ items:
+ - const: embedfire,lubancat-1
+ - const: rockchip,rk3566
+
+ - description: EmbedFire LubanCat 2
+ items:
+ - const: embedfire,lubancat-2
+ - const: rockchip,rk3568
+
- description: Engicam PX30.Core C.TOUCH 2.0
items:
- const: engicam,px30-core-ctouch2
@@ -115,6 +162,11 @@ properties:
- const: firefly,roc-rk3328-cc
- const: rockchip,rk3328
+ - description: Firefly ROC-RK3328-PC
+ items:
+ - const: firefly,roc-rk3328-pc
+ - const: rockchip,rk3328
+
- description: Firefly ROC-RK3399-PC
items:
- enum:
@@ -122,9 +174,22 @@ properties:
- firefly,roc-rk3399-pc-mezzanine
- const: rockchip,rk3399
- - description: FriendlyElec NanoPi R2S
+ - description: Firefly ROC-RK3399-PC-PLUS
+ items:
+ - enum:
+ - firefly,roc-rk3399-pc-plus
+ - const: rockchip,rk3399
+
+ - description: Firefly Station M2
+ items:
+ - const: firefly,rk3566-roc-pc
+ - const: rockchip,rk3566
+
+ - description: FriendlyElec NanoPi R2 series boards
items:
- - const: friendlyarm,nanopi-r2s
+ - enum:
+ - friendlyarm,nanopi-r2c
+ - friendlyarm,nanopi-r2s
- const: rockchip,rk3328
- description: FriendlyElec NanoPi4 series boards
@@ -135,8 +200,16 @@ properties:
- friendlyarm,nanopi-m4b
- friendlyarm,nanopi-neo4
- friendlyarm,nanopi-r4s
+ - friendlyarm,nanopi-r4s-enterprise
- const: rockchip,rk3399
+ - description: FriendlyElec NanoPi R5 series boards
+ items:
+ - enum:
+ - friendlyarm,nanopi-r5c
+ - friendlyarm,nanopi-r5s
+ - const: rockchip,rk3568
+
- description: GeekBuying GeekBox
items:
- const: geekbuying,geekbox
@@ -287,6 +360,34 @@ properties:
- const: google,veyron
- const: rockchip,rk3288
+ - description: Google Scarlet - Dumo (ASUS Chromebook Tablet CT100)
+ items:
+ - const: google,scarlet-rev15-sku0
+ - const: google,scarlet-rev15
+ - const: google,scarlet-rev14-sku0
+ - const: google,scarlet-rev14
+ - const: google,scarlet-rev13-sku0
+ - const: google,scarlet-rev13
+ - const: google,scarlet-rev12-sku0
+ - const: google,scarlet-rev12
+ - const: google,scarlet-rev11-sku0
+ - const: google,scarlet-rev11
+ - const: google,scarlet-rev10-sku0
+ - const: google,scarlet-rev10
+ - const: google,scarlet-rev9-sku0
+ - const: google,scarlet-rev9
+ - const: google,scarlet-rev8-sku0
+ - const: google,scarlet-rev8
+ - const: google,scarlet-rev7-sku0
+ - const: google,scarlet-rev7
+ - const: google,scarlet-rev6-sku0
+ - const: google,scarlet-rev6
+ - const: google,scarlet-rev5-sku0
+ - const: google,scarlet-rev5
+ - const: google,scarlet
+ - const: google,gru
+ - const: rockchip,rk3399
+
- description: Google Scarlet - Kingdisplay (Acer Chromebook Tab 10)
items:
- const: google,scarlet-rev15-sku7
@@ -319,30 +420,55 @@ properties:
- const: google,gru
- const: rockchip,rk3399
- - description: Google Scarlet - Innolux display (Acer Chromebook Tab 10)
+ - description: |
+ Google Scarlet - Innolux display (Acer Chromebook Tab 10 and more)
items:
+ - const: google,scarlet-rev15-sku2
+ - const: google,scarlet-rev15-sku4
- const: google,scarlet-rev15-sku6
- const: google,scarlet-rev15
+ - const: google,scarlet-rev14-sku2
+ - const: google,scarlet-rev14-sku4
- const: google,scarlet-rev14-sku6
- const: google,scarlet-rev14
+ - const: google,scarlet-rev13-sku2
+ - const: google,scarlet-rev13-sku4
- const: google,scarlet-rev13-sku6
- const: google,scarlet-rev13
+ - const: google,scarlet-rev12-sku2
+ - const: google,scarlet-rev12-sku4
- const: google,scarlet-rev12-sku6
- const: google,scarlet-rev12
+ - const: google,scarlet-rev11-sku2
+ - const: google,scarlet-rev11-sku4
- const: google,scarlet-rev11-sku6
- const: google,scarlet-rev11
+ - const: google,scarlet-rev10-sku2
+ - const: google,scarlet-rev10-sku4
- const: google,scarlet-rev10-sku6
- const: google,scarlet-rev10
+ - const: google,scarlet-rev9-sku2
+ - const: google,scarlet-rev9-sku4
- const: google,scarlet-rev9-sku6
- const: google,scarlet-rev9
+ - const: google,scarlet-rev8-sku2
+ - const: google,scarlet-rev8-sku4
- const: google,scarlet-rev8-sku6
- const: google,scarlet-rev8
+ - const: google,scarlet-rev7-sku2
+ - const: google,scarlet-rev7-sku4
- const: google,scarlet-rev7-sku6
- const: google,scarlet-rev7
+ - const: google,scarlet-rev6-sku2
+ - const: google,scarlet-rev6-sku4
- const: google,scarlet-rev6-sku6
- const: google,scarlet-rev6
+ - const: google,scarlet-rev5-sku2
+ - const: google,scarlet-rev5-sku4
- const: google,scarlet-rev5-sku6
- const: google,scarlet-rev5
+ - const: google,scarlet-rev4-sku2
+ - const: google,scarlet-rev4-sku4
- const: google,scarlet-rev4-sku6
- const: google,scarlet-rev4
- const: google,scarlet
@@ -388,6 +514,21 @@ properties:
- const: hardkernel,rk3326-odroid-go2
- const: rockchip,rk3326
+ - description: Hardkernel Odroid Go Advance Black Edition
+ items:
+ - const: hardkernel,rk3326-odroid-go2-v11
+ - const: rockchip,rk3326
+
+ - description: Hardkernel Odroid Go Super
+ items:
+ - const: hardkernel,rk3326-odroid-go3
+ - const: rockchip,rk3326
+
+ - description: Hardkernel Odroid M1
+ items:
+ - const: rockchip,rk3568-odroid-m1
+ - const: rockchip,rk3568
+
- description: Hugsun X99 TV Box
items:
- const: hugsun,x99
@@ -401,6 +542,11 @@ properties:
- khadas,edge-v
- const: rockchip,rk3399
+ - description: Khadas Edge2 series boards
+ items:
+ - const: khadas,edge2
+ - const: rockchip,rk3588s
+
- description: Kobol Helios64
items:
- const: kobol,helios64
@@ -426,6 +572,11 @@ properties:
- const: netxeon,r89
- const: rockchip,rk3288
+ - description: OPEN AI LAB EAIDK-610
+ items:
+ - const: openailab,eaidk-610
+ - const: rockchip,rk3399
+
- description: Orange Pi RK3399 board
items:
- const: rockchip,rk3399-orangepi
@@ -442,6 +593,19 @@ properties:
- const: pine64,pinebook-pro
- const: rockchip,rk3399
+ - description: Pine64 PineNote
+ items:
+ - enum:
+ - pine64,pinenote-v1.1
+ - pine64,pinenote-v1.2
+ - const: pine64,pinenote
+ - const: rockchip,rk3566
+
+ - description: Pine64 PinePhonePro
+ items:
+ - const: pine64,pinephone-pro
+ - const: rockchip,rk3399
+
- description: Pine64 Rock64
items:
- const: pine64,rock64
@@ -455,20 +619,57 @@ properties:
- const: pine64,rockpro64
- const: rockchip,rk3399
+ - description: Pine64 Quartz64 Model A/B
+ items:
+ - enum:
+ - pine64,quartz64-a
+ - pine64,quartz64-b
+ - const: rockchip,rk3566
+
+ - description: Pine64 SoQuartz SoM
+ items:
+ - enum:
+ - pine64,soquartz-blade
+ - pine64,soquartz-cm4io
+ - pine64,soquartz-model-a
+ - const: pine64,soquartz
+ - const: rockchip,rk3566
+
+ - description: Radxa Compute Module 3 (CM3)
+ items:
+ - enum:
+ - radxa,cm3-io
+ - const: radxa,cm3
+ - const: rockchip,rk3566
+
+ - description: Radxa CM3 Industrial
+ items:
+ - enum:
+ - radxa,e25
+ - const: radxa,cm3i
+ - const: rockchip,rk3568
+
- description: Radxa Rock
items:
- const: radxa,rock
- const: rockchip,rk3188
- - description: Radxa ROCK Pi 4A/B/C
+ - description: Radxa ROCK Pi 4A/A+/B/B+/C
items:
- enum:
- radxa,rockpi4a
+ - radxa,rockpi4a-plus
- radxa,rockpi4b
+ - radxa,rockpi4b-plus
- radxa,rockpi4c
- const: radxa,rockpi4
- const: rockchip,rk3399
+ - description: Radxa ROCK 4C+
+ items:
+ - const: radxa,rock-4c-plus
+ - const: rockchip,rk3399
+
- description: Radxa ROCK Pi E
items:
- const: radxa,rockpi-e
@@ -486,11 +687,31 @@ properties:
- const: vamrs,rk3399pro-vmarc-som
- const: rockchip,rk3399pro
+ - description: Radxa ROCK Pi S
+ items:
+ - const: radxa,rockpis
+ - const: rockchip,rk3308
+
- description: Radxa Rock2 Square
items:
- const: radxa,rock2-square
- const: rockchip,rk3288
+ - description: Radxa ROCK3 Model A
+ items:
+ - const: radxa,rock3a
+ - const: rockchip,rk3568
+
+ - description: Radxa ROCK 5 Model A
+ items:
+ - const: radxa,rock-5a
+ - const: rockchip,rk3588s
+
+ - description: Radxa ROCK 5 Model B
+ items:
+ - const: radxa,rock-5b
+ - const: rockchip,rk3588
+
- description: Rikomagic MK808 v1
items:
- const: rikomagic,mk808
@@ -528,6 +749,11 @@ properties:
- const: rockchip,rk3036-evb
- const: rockchip,rk3036
+ - description: Rockchip RK3128 Evaluation board
+ items:
+ - const: rockchip,rk3128-evb
+ - const: rockchip,rk3128
+
- description: Rockchip RK3228 Evaluation board
items:
- const: rockchip,rk3228-evb
@@ -575,11 +801,21 @@ properties:
- const: rockchip,rk3399-sapphire-excavator
- const: rockchip,rk3399
+ - description: Rockchip RK3588 Evaluation board
+ items:
+ - const: rockchip,rk3588-evb1-v10
+ - const: rockchip,rk3588
+
- description: Rockchip RV1108 Evaluation board
items:
- const: rockchip,rv1108-evb
- const: rockchip,rv1108
+ - description: Theobroma Systems PX30-uQ7 with Haikou baseboard
+ items:
+ - const: tsd,px30-ringneck-haikou
+ - const: rockchip,px30
+
- description: Theobroma Systems RK3368-uQ7 with Haikou baseboard
items:
- const: tsd,rk3368-lion-haikou
@@ -595,16 +831,33 @@ properties:
- const: tronsmart,orion-r68-meta
- const: rockchip,rk3368
+ - description: Xunlong Orange Pi R1 Plus / LTS
+ items:
+ - enum:
+ - xunlong,orangepi-r1-plus
+ - xunlong,orangepi-r1-plus-lts
+ - const: rockchip,rk3328
+
- description: Zkmagic A95X Z2
items:
- const: zkmagic,a95x-z2
- const: rockchip,rk3318
+ - description: Rockchip RK3566 BOX Evaluation Demo board
+ items:
+ - const: rockchip,rk3566-box-demo
+ - const: rockchip,rk3566
+
- description: Rockchip RK3568 Evaluation board
items:
- const: rockchip,rk3568-evb1-v10
- const: rockchip,rk3568
+ - description: Rockchip RK3568 Banana Pi R2 Pro
+ items:
+ - const: rockchip,rk3568-bpi-r2pro
+ - const: rockchip,rk3568
+
additionalProperties: true
...
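For readers less familiar with these schemas: each entry above pairs a
board-level compatible with its SoC fallback, most specific first. A minimal
sketch of a board root node matching one of the new entries (the model string
is illustrative, not taken from the binding):

    / {
        model = "Radxa ROCK 5 Model B";
        compatible = "radxa,rock-5b", "rockchip,rk3588";
    };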
diff --git a/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml b/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml
index 53115b92d17f..b79c81cd9f0e 100644
--- a/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml
+++ b/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml
@@ -21,8 +21,13 @@ select:
enum:
- rockchip,px30-pmu
- rockchip,rk3066-pmu
+ - rockchip,rk3128-pmu
- rockchip,rk3288-pmu
+ - rockchip,rk3368-pmu
- rockchip,rk3399-pmu
+ - rockchip,rk3568-pmu
+ - rockchip,rk3588-pmu
+ - rockchip,rv1126-pmu
required:
- compatible
@@ -33,8 +38,13 @@ properties:
- enum:
- rockchip,px30-pmu
- rockchip,rk3066-pmu
+ - rockchip,rk3128-pmu
- rockchip,rk3288-pmu
+ - rockchip,rk3368-pmu
- rockchip,rk3399-pmu
+ - rockchip,rk3568-pmu
+ - rockchip,rk3588-pmu
+ - rockchip,rv1126-pmu
- const: syscon
- const: simple-mfd
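For orientation, a PMU node matching one of the newly added compatibles
combines the SoC-specific string with "syscon" and "simple-mfd", as the
properties block above requires. A hedged sketch (the unit address and
register size are placeholders, not taken from this binding):

    pmu: power-management@fdd90000 {
        compatible = "rockchip,rk3568-pmu", "syscon", "simple-mfd";
        reg = <0xfdd90000 0x1000>;
    };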
diff --git a/Documentation/devicetree/bindings/arm/samsung/exynos-chipid.yaml b/Documentation/devicetree/bindings/arm/samsung/exynos-chipid.yaml
deleted file mode 100644
index f99c0c6df21b..000000000000
--- a/Documentation/devicetree/bindings/arm/samsung/exynos-chipid.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/samsung/exynos-chipid.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Samsung Exynos SoC series Chipid driver
-
-maintainers:
- - Krzysztof Kozlowski <krzk@kernel.org>
-
-properties:
- compatible:
- items:
- - const: samsung,exynos4210-chipid
-
- reg:
- maxItems: 1
-
- samsung,asv-bin:
- description:
- Adaptive Supply Voltage bin selection. This can be used
- to determine the ASV bin of an SoC if respective information
- is missing in the CHIPID registers or in the OTP memory.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1, 2, 3]
-
-required:
- - compatible
- - reg
-
-additionalProperties: false
-
-examples:
- - |
- chipid@10000000 {
- compatible = "samsung,exynos4210-chipid";
- reg = <0x10000000 0x100>;
- samsung,asv-bin = <2>;
- };
diff --git a/Documentation/devicetree/bindings/arm/samsung/pmu.yaml b/Documentation/devicetree/bindings/arm/samsung/pmu.yaml
deleted file mode 100644
index 17678d9686c1..000000000000
--- a/Documentation/devicetree/bindings/arm/samsung/pmu.yaml
+++ /dev/null
@@ -1,128 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/arm/samsung/pmu.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Samsung Exynos SoC series Power Management Unit (PMU)
-
-maintainers:
- - Krzysztof Kozlowski <krzk@kernel.org>
-
-# Custom select to avoid matching all nodes with 'syscon'
-select:
- properties:
- compatible:
- contains:
- enum:
- - samsung,exynos3250-pmu
- - samsung,exynos4210-pmu
- - samsung,exynos4412-pmu
- - samsung,exynos5250-pmu
- - samsung,exynos5260-pmu
- - samsung,exynos5410-pmu
- - samsung,exynos5420-pmu
- - samsung,exynos5433-pmu
- - samsung,exynos7-pmu
- - samsung-s5pv210-pmu
- required:
- - compatible
-
-properties:
- compatible:
- items:
- - enum:
- - samsung,exynos3250-pmu
- - samsung,exynos4210-pmu
- - samsung,exynos4412-pmu
- - samsung,exynos5250-pmu
- - samsung,exynos5260-pmu
- - samsung,exynos5410-pmu
- - samsung,exynos5420-pmu
- - samsung,exynos5433-pmu
- - samsung,exynos7-pmu
- - samsung-s5pv210-pmu
- - const: syscon
-
- reg:
- maxItems: 1
-
- assigned-clock-parents: true
- assigned-clocks: true
-
- '#clock-cells':
- const: 1
-
- clock-names:
- description:
- List of clock names for particular CLKOUT mux inputs
- minItems: 1
- maxItems: 32
- items:
- pattern: '^clkout([0-9]|[12][0-9]|3[0-1])$'
-
- clocks:
- minItems: 1
- maxItems: 32
-
- interrupt-controller:
- description:
- Some PMUs are capable of behaving as an interrupt controller (mostly
- to wake up a suspended PMU).
-
- '#interrupt-cells':
- description:
- Must be identical to the that of the parent interrupt controller.
- const: 3
-
- syscon-poweroff:
- $ref: "../../power/reset/syscon-poweroff.yaml#"
- type: object
- description:
- Node for power off method
-
- syscon-reboot:
- $ref: "../../power/reset/syscon-reboot.yaml#"
- type: object
- description:
- Node for reboot method
-
-required:
- - compatible
- - reg
-
-additionalProperties: false
-
-allOf:
- - if:
- properties:
- compatible:
- contains:
- enum:
- - samsung,exynos3250-pmu
- - samsung,exynos4210-pmu
- - samsung,exynos4412-pmu
- - samsung,exynos5250-pmu
- - samsung,exynos5410-pmu
- - samsung,exynos5420-pmu
- - samsung,exynos5433-pmu
- then:
- required:
- - '#clock-cells'
- - clock-names
- - clocks
-
-examples:
- - |
- #include <dt-bindings/clock/exynos5250.h>
-
- pmu_system_controller: system-controller@10040000 {
- compatible = "samsung,exynos5250-pmu", "syscon";
- reg = <0x10040000 0x5000>;
- interrupt-controller;
- #interrupt-cells = <3>;
- interrupt-parent = <&gic>;
- #clock-cells = <1>;
- clock-names = "clkout16";
- clocks = <&clock CLK_FIN_PLL>;
- };
diff --git a/Documentation/devicetree/bindings/arm/samsung/samsung-boards.yaml b/Documentation/devicetree/bindings/arm/samsung/samsung-boards.yaml
index 0796f0c87727..deb2cf971871 100644
--- a/Documentation/devicetree/bindings/arm/samsung/samsung-boards.yaml
+++ b/Documentation/devicetree/bindings/arm/samsung/samsung-boards.yaml
@@ -140,6 +140,8 @@ properties:
items:
- enum:
- insignal,arndale-octa # Insignal Arndale Octa
+ - samsung,chagall-wifi # Samsung SM-T800
+ - samsung,klimt-wifi # Samsung SM-T700
- samsung,smdk5420 # Samsung SMDK5420 eval
- const: samsung,exynos5420
- const: samsung,exynos5
@@ -169,6 +171,7 @@ properties:
- hardkernel,odroid-xu3-lite # Hardkernel Odroid XU3 Lite
- hardkernel,odroid-xu4 # Hardkernel Odroid XU4
- hardkernel,odroid-hc1 # Hardkernel Odroid HC1
+ - samsung,k3g # Samsung Galaxy S5 (SM-G900H)
- const: samsung,exynos5800
- const: samsung,exynos5
@@ -199,6 +202,24 @@ properties:
- samsung,exynos7-espresso # Samsung Exynos7 Espresso
- const: samsung,exynos7
+ - description: Exynos7885 based boards
+ items:
+ - enum:
+ - samsung,jackpotlte # Samsung Galaxy A8 (2018)
+ - const: samsung,exynos7885
+
+ - description: Exynos850 based boards
+ items:
+ - enum:
+ - winlink,e850-96 # WinLink E850-96
+ - const: samsung,exynos850
+
+ - description: Exynos Auto v9 based boards
+ items:
+ - enum:
+ - samsung,exynosautov9-sadk # Samsung Exynos Auto v9 SADK
+ - const: samsung,exynosautov9
+
required:
- compatible
diff --git a/Documentation/devicetree/bindings/arm/samsung/samsung-soc.yaml b/Documentation/devicetree/bindings/arm/samsung/samsung-soc.yaml
new file mode 100644
index 000000000000..653f85997643
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/samsung/samsung-soc.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/samsung/samsung-soc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung S3C, S5P and Exynos SoC compatibles naming convention
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description: |
+ Guidelines for new compatibles for SoC blocks/components.
+ When adding new compatibles in new bindings, use the format::
+ samsung,SoC-IP
+
+ For example::
+ samsung,exynos5433-cmu-isp
+
+select:
+ properties:
+ compatible:
+ pattern: "^samsung,.*(s3c|s5pv|exynos)[0-9a-z]+.*$"
+ required:
+ - compatible
+
+properties:
+ compatible:
+ oneOf:
+ - description: Preferred naming style for compatibles of SoC components
+ pattern: "^samsung,(s3c|s5pv|exynos|exynosautov)[0-9]+-.*$"
+
+ # Legacy compatibles with wild-cards - list cannot grow with new bindings:
+ - enum:
+ - samsung,exynos4x12-pinctrl
+ - samsung,exynos4x12-usb2-phy
+ - samsung,s3c64xx-pinctrl
+ - samsung,s3c64xx-wakeup-eint
+
+additionalProperties: true
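To make the convention concrete, a hypothetical IP-block node following the
preferred samsung,SoC-IP style (the unit address and cell count are invented
for illustration):

    clock-controller@146d0000 {
        compatible = "samsung,exynos5433-cmu-isp";
        reg = <0x146d0000 0x1000>;
        #clock-cells = <1>;
    };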
diff --git a/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml b/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml
index aa1d4afbc510..5a428a885760 100644
--- a/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml
+++ b/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/socionext/milbeaut.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Milbeaut platforms device tree bindings
+title: Milbeaut platforms
maintainers:
- Taichi Sugaya <sugaya.taichi@socionext.com>
diff --git a/Documentation/devicetree/bindings/arm/socionext/uniphier.yaml b/Documentation/devicetree/bindings/arm/socionext/uniphier.yaml
index 8c0e91658474..3e7f3d927ec7 100644
--- a/Documentation/devicetree/bindings/arm/socionext/uniphier.yaml
+++ b/Documentation/devicetree/bindings/arm/socionext/uniphier.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/socionext/uniphier.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Socionext UniPhier platform device tree bindings
+title: Socionext UniPhier platform
maintainers:
- Masahiro Yamada <yamada.masahiro@socionext.com>
@@ -26,6 +26,12 @@ properties:
- socionext,uniphier-pro4-ref
- socionext,uniphier-pro4-sanji
- const: socionext,uniphier-pro4
+ - description: Pro5 SoC boards
+ items:
+ - enum:
+ - socionext,uniphier-pro5-epcore
+ - socionext,uniphier-pro5-proex
+ - const: socionext,uniphier-pro5
- description: sLD8 SoC boards
items:
- enum:
diff --git a/Documentation/devicetree/bindings/arm/sp810.txt b/Documentation/devicetree/bindings/arm/sp810.txt
deleted file mode 100644
index 46652bf65147..000000000000
--- a/Documentation/devicetree/bindings/arm/sp810.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-SP810 System Controller
------------------------
-
-Required properties:
-
-- compatible: standard compatible string for a Primecell peripheral,
- see Documentation/devicetree/bindings/arm/primecell.yaml
- for more details
- should be: "arm,sp810", "arm,primecell"
-
-- reg: standard registers property, physical address and size
- of the control registers
-
-- clock-names: from the common clock bindings, for more details see
- Documentation/devicetree/bindings/clock/clock-bindings.txt;
- should be: "refclk", "timclk", "apb_pclk"
-
-- clocks: from the common clock bindings, phandle and clock
- specifier pairs for the entries of clock-names property
-
-- #clock-cells: from the common clock bindings;
- should be: <1>
-
-- clock-output-names: from the common clock bindings;
- should be: "timerclken0", "timerclken1", "timerclken2", "timerclken3"
-
-- assigned-clocks: from the common clock binding;
- should be: clock specifier for each output clock of this
- provider node
-
-- assigned-clock-parents: from the common clock binding;
- should be: phandle of input clock listed in clocks
- property with the highest frequency
-
-Example:
- v2m_sysctl: sysctl@20000 {
- compatible = "arm,sp810", "arm,primecell";
- reg = <0x020000 0x1000>;
- clocks = <&v2m_refclk32khz>, <&v2m_refclk1mhz>, <&smbclk>;
- clock-names = "refclk", "timclk", "apb_pclk";
- #clock-cells = <1>;
- clock-output-names = "timerclken0", "timerclken1", "timerclken2", "timerclken3";
- assigned-clocks = <&v2m_sysctl 0>, <&v2m_sysctl 1>, <&v2m_sysctl 3>, <&v2m_sysctl 3>;
- assigned-clock-parents = <&v2m_refclk1mhz>, <&v2m_refclk1mhz>, <&v2m_refclk1mhz>, <&v2m_refclk1mhz>;
-
- };
diff --git a/Documentation/devicetree/bindings/arm/sp810.yaml b/Documentation/devicetree/bindings/arm/sp810.yaml
new file mode 100644
index 000000000000..c9094e5ec565
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sp810.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/sp810.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM Versatile Express SP810 System Controller
+
+maintainers:
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+ The Arm SP810 system controller provides clocks, timers and a watchdog.
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,sp810
+ required:
+ - compatible
+
+properties:
+ compatible:
+ items:
+ - const: arm,sp810
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: refclk
+ - const: timclk
+ - const: apb_pclk
+
+ clocks:
+ items:
+ - description: reference clock
+ - description: timer clock
+ - description: APB register access clock
+
+ "#clock-cells":
+ const: 1
+
+ clock-output-names:
+ maxItems: 4
+
+ assigned-clocks:
+ maxItems: 4
+
+ assigned-clock-parents:
+ maxItems: 4
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - "#clock-cells"
+
+examples:
+ - |
+ sysctl@20000 {
+ compatible = "arm,sp810", "arm,primecell";
+ reg = <0x020000 0x1000>;
+ clocks = <&v2m_refclk32khz>, <&v2m_refclk1mhz>, <&smbclk>;
+ clock-names = "refclk", "timclk", "apb_pclk";
+ #clock-cells = <1>;
+ clock-output-names = "timerclken0", "timerclken1",
+ "timerclken2", "timerclken3";
+ assigned-clocks = <&v2m_sysctl 0>, <&v2m_sysctl 1>,
+ <&v2m_sysctl 3>, <&v2m_sysctl 3>;
+ assigned-clock-parents = <&v2m_refclk1mhz>, <&v2m_refclk1mhz>,
+ <&v2m_refclk1mhz>, <&v2m_refclk1mhz>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/spe-pmu.txt b/Documentation/devicetree/bindings/arm/spe-pmu.txt
deleted file mode 100644
index 93372f2a7df9..000000000000
--- a/Documentation/devicetree/bindings/arm/spe-pmu.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-* ARMv8.2 Statistical Profiling Extension (SPE) Performance Monitor Units (PMU)
-
-ARMv8.2 introduces the optional Statistical Profiling Extension for collecting
-performance sample data using an in-memory trace buffer.
-
-** SPE Required properties:
-
-- compatible : should be one of:
- "arm,statistical-profiling-extension-v1"
-
-- interrupts : Exactly 1 PPI must be listed. For heterogeneous systems where
- SPE is only supported on a subset of the CPUs, please consult
- the arm,gic-v3 binding for details on describing a PPI partition.
-
-** Example:
-
-spe-pmu {
- compatible = "arm,statistical-profiling-extension-v1";
- interrupts = <GIC_PPI 05 IRQ_TYPE_LEVEL_HIGH &part1>;
-};
diff --git a/Documentation/devicetree/bindings/arm/spear.yaml b/Documentation/devicetree/bindings/arm/spear.yaml
index 605ad3f882ef..a465c9eca76e 100644
--- a/Documentation/devicetree/bindings/arm/spear.yaml
+++ b/Documentation/devicetree/bindings/arm/spear.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/spear.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ST SPEAr Platforms Device Tree Bindings
+title: ST SPEAr Platforms
maintainers:
- Viresh Kumar <vireshk@kernel.org>
diff --git a/Documentation/devicetree/bindings/arm/sprd/sprd.yaml b/Documentation/devicetree/bindings/arm/sprd/sprd.yaml
index 7b6ae3070396..eaa67b8e0d6c 100644
--- a/Documentation/devicetree/bindings/arm/sprd/sprd.yaml
+++ b/Documentation/devicetree/bindings/arm/sprd/sprd.yaml
@@ -5,7 +5,7 @@
$id: http://devicetree.org/schemas/arm/sprd/sprd.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Unisoc platforms device tree bindings
+title: Unisoc platforms
maintainers:
- Orson Zhai <orsonzhai@gmail.com>
@@ -30,6 +30,11 @@ properties:
- sprd,sp9863a-1h10
- const: sprd,sc9863a
+ - items:
+ - enum:
+ - sprd,ums512-1h10
+ - const: sprd,ums512
+
additionalProperties: true
...
diff --git a/Documentation/devicetree/bindings/arm/sti.yaml b/Documentation/devicetree/bindings/arm/sti.yaml
index b1f28d16d3fb..3ca054c64377 100644
--- a/Documentation/devicetree/bindings/arm/sti.yaml
+++ b/Documentation/devicetree/bindings/arm/sti.yaml
@@ -4,10 +4,10 @@
$id: http://devicetree.org/schemas/arm/sti.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ST STi Platforms Device Tree Bindings
+title: ST STi Platforms
maintainers:
- - Patrice Chotard <patrice.chotard@st.com>
+ - Patrice Chotard <patrice.chotard@foss.st.com>
properties:
$nodename:
diff --git a/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml b/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
index 8e711bd202fd..2297ad3f4774 100644
--- a/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
+++ b/Documentation/devicetree/bindings/arm/stm32/st,mlahb.yaml
@@ -4,11 +4,11 @@
$id: "http://devicetree.org/schemas/arm/stm32/st,mlahb.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
-title: STMicroelectronics STM32 ML-AHB interconnect bindings
+title: STMicroelectronics STM32 ML-AHB interconnect
maintainers:
- - Fabien Dessenne <fabien.dessenne@st.com>
- - Arnaud Pouliquen <arnaud.pouliquen@st.com>
+ - Fabien Dessenne <fabien.dessenne@foss.st.com>
+ - Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
description: |
These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
diff --git a/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml b/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml
index 149afb5df5af..ad8e51aa01b0 100644
--- a/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml
+++ b/Documentation/devicetree/bindings/arm/stm32/st,stm32-syscon.yaml
@@ -4,11 +4,11 @@
$id: "http://devicetree.org/schemas/arm/stm32/st,stm32-syscon.yaml#"
$schema: "http://devicetree.org/meta-schemas/core.yaml#"
-title: STMicroelectronics STM32 Platforms System Controller bindings
+title: STMicroelectronics STM32 Platforms System Controller
maintainers:
- - Alexandre Torgue <alexandre.torgue@st.com>
- - Christophe Roullier <christophe.roullier@st.com>
+ - Alexandre Torgue <alexandre.torgue@foss.st.com>
+ - Christophe Roullier <christophe.roullier@foss.st.com>
properties:
compatible:
@@ -20,6 +20,7 @@ properties:
- st,stm32-syscfg
- st,stm32-power-config
- st,stm32-tamp
+ - st,stm32f4-gcan
- const: syscon
- items:
- const: st,stm32-tamp
@@ -42,6 +43,7 @@ if:
contains:
enum:
- st,stm32mp157-syscfg
+ - st,stm32f4-gcan
then:
required:
- clocks
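The effect of extending the if-clause is that a "st,stm32f4-gcan" syscon must
now carry a clocks property, unlike most other variants in this schema. A
hedged sketch of such a node (the unit address, size and clock specifier are
placeholders, not taken from this binding):

    syscon@40002000 {
        compatible = "st,stm32f4-gcan", "syscon";
        reg = <0x40002000 0x200>;
        clocks = <&rcc 0>;
    };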
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.yaml b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
index 9a77ab74be99..13e34241145b 100644
--- a/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
@@ -4,30 +4,21 @@
$id: http://devicetree.org/schemas/arm/stm32/stm32.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 Platforms Device Tree Bindings
+title: STMicroelectronics STM32 Platforms
maintainers:
- - Alexandre Torgue <alexandre.torgue@st.com>
+ - Alexandre Torgue <alexandre.torgue@foss.st.com>
properties:
$nodename:
const: "/"
compatible:
oneOf:
- - description: DH STM32MP1 SoM based Boards
+ - description: emtrion STM32MP1 Argon based Boards
items:
- - enum:
- - arrow,stm32mp157a-avenger96 # Avenger96
- - dh,stm32mp153c-dhcom-drc02
- - dh,stm32mp157c-dhcom-pdk2
- - dh,stm32mp157c-dhcom-picoitx
- - enum:
- - dh,stm32mp153c-dhcom-som
- - dh,stm32mp157a-dhcor-som
- - dh,stm32mp157c-dhcom-som
- - enum:
- - st,stm32mp153
- - st,stm32mp157
+ - const: emtrion,stm32mp157c-emsbc-argon
+ - const: emtrion,stm32mp157c-emstamp-argon
+ - const: st,stm32mp157
- items:
- enum:
- st,stm32f429i-disco
@@ -57,22 +48,86 @@ properties:
- const: st,stm32h750
- items:
- enum:
+ - st,stm32mp135f-dk
+ - const: st,stm32mp135
+
+ - description: ST STM32MP151 based Boards
+ items:
+ - enum:
+ - prt,prtt1a # Protonic PRTT1A
+ - prt,prtt1c # Protonic PRTT1C
+ - prt,prtt1s # Protonic PRTT1S
+ - const: st,stm32mp151
+
+ - description: DH STM32MP151 DHCOR SoM based Boards
+ items:
+ - const: dh,stm32mp151a-dhcor-testbench
+ - const: dh,stm32mp151a-dhcor-som
+ - const: st,stm32mp151
+
+ - description: DH STM32MP153 DHCOM SoM based Boards
+ items:
+ - const: dh,stm32mp153c-dhcom-drc02
+ - const: dh,stm32mp153c-dhcom-som
+ - const: st,stm32mp153
+
+ - description: DH STM32MP153 DHCOR SoM based Boards
+ items:
+ - const: dh,stm32mp153c-dhcor-drc-compact
+ - const: dh,stm32mp153c-dhcor-som
+ - const: st,stm32mp153
+
+ - items:
+ - enum:
- shiratech,stm32mp157a-iot-box # IoT Box
- shiratech,stm32mp157a-stinger96 # Stinger96
- st,stm32mp157c-ed1
- st,stm32mp157a-dk1
- st,stm32mp157c-dk2
+ - const: st,stm32mp157
+ - items:
+ - const: st,stm32mp157a-dk1-scmi
+ - const: st,stm32mp157a-dk1
- const: st,stm32mp157
- items:
+ - const: st,stm32mp157c-dk2-scmi
+ - const: st,stm32mp157c-dk2
+ - const: st,stm32mp157
+ - items:
+ - const: st,stm32mp157c-ed1-scmi
+ - const: st,stm32mp157c-ed1
+ - const: st,stm32mp157
+ - items:
+ - const: st,stm32mp157c-ev1
+ - const: st,stm32mp157c-ed1
+ - const: st,stm32mp157
+ - items:
+ - const: st,stm32mp157c-ev1-scmi
- const: st,stm32mp157c-ev1
- const: st,stm32mp157c-ed1
- const: st,stm32mp157
+ - description: DH STM32MP1 SoM based Boards
+ items:
+ - enum:
+ - arrow,stm32mp157a-avenger96 # Avenger96
+ - const: dh,stm32mp157a-dhcor-som
+ - const: st,stm32mp157
+
+ - description: DH STM32MP1 SoM based Boards
+ items:
+ - enum:
+ - dh,stm32mp157c-dhcom-pdk2
+ - dh,stm32mp157c-dhcom-picoitx
+ - const: dh,stm32mp157c-dhcom-som
+ - const: st,stm32mp157
+
- description: Engicam i.Core STM32MP1 SoM based Boards
items:
- enum:
- engicam,icore-stm32mp1-ctouch2 # STM32MP1 Engicam i.Core STM32MP1 C.TOUCH 2.0
+ - engicam,icore-stm32mp1-ctouch2-of10 # STM32MP1 Engicam i.Core STM32MP1 C.TOUCH 2.0 10.1" OF
- engicam,icore-stm32mp1-edimm2.2 # STM32MP1 Engicam i.Core STM32MP1 EDIMM2.2 Starter Kit
- const: engicam,icore-stm32mp1 # STM32MP1 Engicam i.Core STM32MP1 SoM
- const: st,stm32mp157
@@ -92,6 +147,7 @@ properties:
- const: oct,stm32mp15xx-osd32
- enum:
- st,stm32mp157
+
- description: Odyssey STM32MP1 SoM based Boards
items:
- enum:
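Note how the new SCMI variants chain onto the existing board compatibles
rather than replacing them. A sketch of the corresponding board root node
(the model string is illustrative):

    / {
        model = "STM32MP157C-DK2 Discovery Board (SCMI)";
        compatible = "st,stm32mp157c-dk2-scmi", "st,stm32mp157c-dk2",
                     "st,stm32mp157";
    };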
diff --git a/Documentation/devicetree/bindings/arm/sunplus,sp7021.yaml b/Documentation/devicetree/bindings/arm/sunplus,sp7021.yaml
new file mode 100644
index 000000000000..def7d0cfeb31
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sunplus,sp7021.yaml
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) Sunplus Co., Ltd. 2021
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/sunplus,sp7021.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sunplus SP7021 Boards
+
+maintainers:
+ - qinjian <qinjian@cqplus1.com>
+
+description: |
+ ARM platforms using Sunplus SP7021, a quad-core ARM Cortex-A7 based SoC.
+ Wiki: https://sunplus-tibbo.atlassian.net/wiki/spaces/doc/overview
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ items:
+ - enum:
+ - sunplus,sp7021-achip
+ - sunplus,sp7021-demo-v3
+ - const: sunplus,sp7021
+
+additionalProperties: true
+
+...
diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml
index 889128acf49a..013821f4a7b8 100644
--- a/Documentation/devicetree/bindings/arm/sunxi.yaml
+++ b/Documentation/devicetree/bindings/arm/sunxi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/sunxi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner platforms device tree bindings
+title: Allwinner platforms
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -366,6 +366,12 @@ properties:
- const: lamobo,lamobo-r1
- const: allwinner,sun7i-a20
+ - description: Lctech Pi F1C200s
+ items:
+ - const: lctech,pi-f1c200s
+ - const: allwinner,suniv-f1c200s
+ - const: allwinner,suniv-f1c100s
+
- description: Libre Computer Board ALL-H3-CC H2+
items:
- const: libretech,all-h3-cc-h2-plus
@@ -391,6 +397,11 @@ properties:
- const: libretech,all-h5-cc-h5
- const: allwinner,sun50i-h5
+ - description: Lichee Pi Nano
+ items:
+ - const: licheepi,licheepi-nano
+ - const: allwinner,suniv-f1c100s
+
- description: Lichee Pi One
items:
- const: licheepi,licheepi-one
@@ -444,6 +455,11 @@ properties:
- const: haoyu,a10-marsboard
- const: allwinner,sun4i-a10
+ - description: HAOYU Electronics Marsboard A20
+ items:
+ - const: haoyu,a20-marsboard
+ - const: allwinner,sun7i-a20
+
- description: MapleBoard MP130
items:
- const: mapleboard,mp130
@@ -797,6 +813,13 @@ properties:
- const: sinlinx,sina33
- const: allwinner,sun8i-a33
+ - description: SourceParts PopStick v1.1
+ items:
+ - const: sourceparts,popstick-v1.1
+ - const: sourceparts,popstick
+ - const: allwinner,suniv-f1c200s
+ - const: allwinner,suniv-f1c100s
+
- description: SL631 Action Camera with IMX179
items:
- const: allwinner,sl631-imx179
@@ -808,6 +831,11 @@ properties:
- const: oranth,tanix-tx6
- const: allwinner,sun50i-h6
+ - description: Tanix TX6 mini
+ items:
+ - const: oranth,tanix-tx6-mini
+ - const: allwinner,sun50i-h6
+
- description: TBS A711 Tablet
items:
- const: tbs-biometrics,a711
@@ -828,6 +856,11 @@ properties:
- const: wexler,tab7200
- const: allwinner,sun7i-a20
+ - description: MangoPi MQ-R board
+ items:
+ - const: widora,mangopi-mq-r-t113
+ - const: allwinner,sun8i-t113s
+
- description: WITS A31 Colombus Evaluation Board
items:
- const: wits,colombus
@@ -848,6 +881,11 @@ properties:
- const: yones-toptech,bs1078-v2
- const: allwinner,sun6i-a31s
+ - description: X96 Mate TV box
+ items:
+ - const: hechuang,x96-mate
+ - const: allwinner,sun50i-h616
+
- description: Xunlong OrangePi
items:
- const: xunlong,orangepi
@@ -948,4 +986,9 @@ properties:
- const: xunlong,orangepi-zero-plus2-h3
- const: allwinner,sun8i-h3
+ - description: Xunlong OrangePi Zero 2
+ items:
+ - const: xunlong,orangepi-zero2
+ - const: allwinner,sun50i-h616
+
additionalProperties: true
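The PopStick entry above shows a deeper fallback chain than most: board
revision, board family, then the SoC and the older SoC it is compatible with.
A sketch of the corresponding root node (the model string is illustrative):

    / {
        model = "SourceParts PopStick v1.1";
        compatible = "sourceparts,popstick-v1.1", "sourceparts,popstick",
                     "allwinner,suniv-f1c200s", "allwinner,suniv-f1c100s";
    };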
diff --git a/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml
index e713a6fe4cf7..99566688d033 100644
--- a/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml
+++ b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun4i-a10-mbus.yaml
@@ -29,14 +29,51 @@ properties:
compatible:
enum:
- allwinner,sun5i-a13-mbus
+ - allwinner,sun8i-a33-mbus
+ - allwinner,sun8i-a50-mbus
+ - allwinner,sun8i-a83t-mbus
- allwinner,sun8i-h3-mbus
+ - allwinner,sun8i-r40-mbus
+ - allwinner,sun8i-v3s-mbus
+ - allwinner,sun8i-v536-mbus
+ - allwinner,sun20i-d1-mbus
- allwinner,sun50i-a64-mbus
+ - allwinner,sun50i-a100-mbus
+ - allwinner,sun50i-h5-mbus
+ - allwinner,sun50i-h6-mbus
+ - allwinner,sun50i-h616-mbus
+ - allwinner,sun50i-r329-mbus
reg:
- maxItems: 1
+ minItems: 1
+ items:
+ - description: MBUS interconnect/bandwidth limit/PMU registers
+ - description: DRAM controller/PHY registers
+
+ reg-names:
+ minItems: 1
+ items:
+ - const: mbus
+ - const: dram
clocks:
+ minItems: 1
+ items:
+ - description: MBUS interconnect module clock
+ - description: DRAM controller/PHY module clock
+ - description: Register bus clock, shared by MBUS and DRAM
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: mbus
+ - const: dram
+ - const: bus
+
+ interrupts:
maxItems: 1
+ description:
+ MBUS PMU activity interrupt.
dma-ranges:
description:
@@ -53,13 +90,55 @@ required:
- clocks
- dma-ranges
+if:
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun5i-a13-mbus
+ - allwinner,sun8i-r40-mbus
+
+then:
+ properties:
+ reg:
+ minItems: 2
+
+ reg-names:
+ minItems: 2
+
+ clocks:
+ minItems: 3
+
+ clock-names:
+ minItems: 3
+
+ required:
+ - reg-names
+ - clock-names
+
+else:
+ properties:
+ reg:
+ maxItems: 1
+
+ reg-names:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ maxItems: 1
+
additionalProperties: false
examples:
- |
- #include <dt-bindings/clock/sun5i-ccu.h>
+ #include <dt-bindings/clock/sun50i-a64-ccu.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
- mbus: dram-controller@1c01000 {
+ dram-controller@1c01000 {
compatible = "allwinner,sun5i-a13-mbus";
reg = <0x01c01000 0x1000>;
clocks = <&ccu CLK_MBUS>;
@@ -69,4 +148,21 @@ examples:
#interconnect-cells = <1>;
};
+ - |
+ dram-controller@1c62000 {
+ compatible = "allwinner,sun50i-a64-mbus";
+ reg = <0x01c62000 0x1000>,
+ <0x01c63000 0x1000>;
+ reg-names = "mbus", "dram";
+ clocks = <&ccu CLK_MBUS>,
+ <&ccu CLK_DRAM>,
+ <&ccu CLK_BUS_DRAM>;
+ clock-names = "mbus", "dram", "bus";
+ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ dma-ranges = <0x00000000 0x40000000 0xc0000000>;
+ #interconnect-cells = <1>;
+ };
+
...
diff --git a/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun6i-a31-cpuconfig.yaml b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun6i-a31-cpuconfig.yaml
new file mode 100644
index 000000000000..d805c4508b4e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun6i-a31-cpuconfig.yaml
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/sunxi/allwinner,sun6i-a31-cpuconfig.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner CPU Configuration Controller
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - allwinner,sun6i-a31-cpuconfig
+ - allwinner,sun8i-a23-cpuconfig
+ - allwinner,sun8i-a83t-cpucfg
+ - allwinner,sun8i-a83t-r-cpucfg
+ - allwinner,sun9i-a80-cpucfg
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ cpucfg@1f01c00 {
+ compatible = "allwinner,sun6i-a31-cpuconfig";
+ reg = <0x01f01c00 0x300>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun9i-a80-prcm.yaml b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun9i-a80-prcm.yaml
new file mode 100644
index 000000000000..644f391afb32
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/sunxi/allwinner,sun9i-a80-prcm.yaml
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/sunxi/allwinner,sun9i-a80-prcm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A80 PRCM
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <mripard@kernel.org>
+
+properties:
+ compatible:
+ const: allwinner,sun9i-a80-prcm
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ prcm@8001400 {
+ compatible = "allwinner,sun9i-a80-prcm";
+ reg = <0x08001400 0x200>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/arm/swir.txt b/Documentation/devicetree/bindings/arm/swir.txt
deleted file mode 100644
index 042be73a95d3..000000000000
--- a/Documentation/devicetree/bindings/arm/swir.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Sierra Wireless Modules device tree bindings
---------------------------------------------
-
-Supported Modules :
- - WP8548 : Includes MDM9615 and PM8018 in a module
-
-Sierra Wireless modules shall have the following properties :
- Required root node property
- - compatible: "swir,wp8548" for the WP8548 CF3 Module
-
-Board compatible values:
- - "swir,mangoh-green-wp8548" for the mangOH green board with the WP8548 module
diff --git a/Documentation/devicetree/bindings/arm/syna.txt b/Documentation/devicetree/bindings/arm/syna.txt
index d8b48f2edf1b..851f48ead927 100644
--- a/Documentation/devicetree/bindings/arm/syna.txt
+++ b/Documentation/devicetree/bindings/arm/syna.txt
@@ -18,10 +18,6 @@ stable binding/ABI.
---------------------------------------------------------------
-Boards with the Synaptics AS370 SoC shall have the following properties:
- Required root node property:
- compatible: "syna,as370"
-
Boards with a SoC of the Marvell Berlin family, e.g. Armada 1500
shall have the following properties:
diff --git a/Documentation/devicetree/bindings/arm/tegra.yaml b/Documentation/devicetree/bindings/arm/tegra.yaml
index d79d36ac0c44..0df41f5b7e2a 100644
--- a/Documentation/devicetree/bindings/arm/tegra.yaml
+++ b/Documentation/devicetree/bindings/arm/tegra.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/tegra.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NVIDIA Tegra device tree bindings
+title: NVIDIA Tegra
maintainers:
- Thierry Reding <thierry.reding@gmail.com>
@@ -37,6 +37,9 @@ properties:
- const: toradex,colibri_t20
- const: nvidia,tegra20
- items:
+ - const: asus,tf101
+ - const: nvidia,tegra20
+ - items:
- const: acer,picasso
- const: nvidia,tegra20
- items:
@@ -50,6 +53,18 @@ properties:
- const: nvidia,cardhu
- const: nvidia,tegra30
- items:
+ - const: asus,tf201
+ - const: nvidia,tegra30
+ - items:
+ - const: asus,tf300t
+ - const: nvidia,tegra30
+ - items:
+ - const: asus,tf300tg
+ - const: nvidia,tegra30
+ - items:
+ - const: asus,tf700t
+ - const: nvidia,tegra30
+ - items:
- const: toradex,apalis_t30-eval
- const: toradex,apalis_t30
- const: nvidia,tegra30
@@ -75,7 +90,11 @@ properties:
- const: ouya,ouya
- const: nvidia,tegra30
- items:
+ - const: pegatron,chagall
+ - const: nvidia,tegra30
+ - items:
- enum:
+ - asus,tf701t
- nvidia,dalmore
- nvidia,roth
- nvidia,tn7
@@ -108,14 +127,17 @@ properties:
- nvidia,p2571
- nvidia,p2894-0050-a08
- const: nvidia,tegra210
- - items:
- - enum:
- - nvidia,p2771-0000
- - nvidia,p3509-0000+p3636-0001
+ - description: Jetson TX2 Developer Kit
+ items:
+ - const: nvidia,p2771-0000
- const: nvidia,tegra186
- - items:
- - enum:
- - nvidia,p2972-0000
+ - description: Jetson TX2 NX Developer Kit
+ items:
+ - const: nvidia,p3509-0000+p3636-0001
+ - const: nvidia,tegra186
+ - description: Jetson AGX Xavier Developer Kit
+ items:
+ - const: nvidia,p2972-0000
- const: nvidia,tegra194
- description: Jetson Xavier NX
items:
@@ -134,8 +156,25 @@ properties:
- const: nvidia,p3509-0000+p3668-0001
- const: nvidia,tegra194
- items:
- - enum:
- - nvidia,tegra234-vdk
+ - const: nvidia,tegra234-vdk
+ - const: nvidia,tegra234
+ - description: Jetson AGX Orin
+ items:
+ - const: nvidia,p3701-0000
+ - const: nvidia,tegra234
+ - description: Jetson AGX Orin Developer Kit
+ items:
+ - const: nvidia,p3737-0000+p3701-0000
+ - const: nvidia,p3701-0000
+ - const: nvidia,tegra234
+ - description: Jetson Orin NX
+ items:
+ - const: nvidia,p3767-0000
+ - const: nvidia,tegra234
+ - description: Jetson Orin NX Engineering Reference Developer Kit
+ items:
+ - const: nvidia,p3768-0000+p3767-0000
+ - const: nvidia,p3767-0000
- const: nvidia,tegra234
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml
new file mode 100644
index 000000000000..36dbd0838f2d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra-ccplex-cluster.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/tegra/nvidia,tegra-ccplex-cluster.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra CPU COMPLEX CLUSTER area
+
+maintainers:
+ - Sumit Gupta <sumitg@nvidia.com>
+ - Mikko Perttunen <mperttunen@nvidia.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+ - Thierry Reding <thierry.reding@gmail.com>
+
+description: |+
+ The Tegra CPU COMPLEX CLUSTER area contains memory-mapped
+ registers that initiate CPU frequency/voltage transitions.
+
+properties:
+ $nodename:
+ pattern: "ccplex@([0-9a-f]+)$"
+
+ compatible:
+ enum:
+ - nvidia,tegra186-ccplex-cluster
+ - nvidia,tegra234-ccplex-cluster
+
+ reg:
+ maxItems: 1
+
+ nvidia,bpmp:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: |
+ Specifies the BPMP node that needs to be queried to get
+ operating point data for all CPUs.
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - nvidia,bpmp
+
+examples:
+ - |
+ ccplex@e000000 {
+ compatible = "nvidia,tegra234-ccplex-cluster";
+ reg = <0x0e000000 0x5ffff>;
+ nvidia,bpmp = <&bpmp>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt
deleted file mode 100644
index 576462fae27f..000000000000
--- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt
+++ /dev/null
@@ -1,133 +0,0 @@
-NVIDIA Tegra Power Management Controller (PMC)
-
-Required properties:
-- compatible: Should contain one of the following:
- - "nvidia,tegra186-pmc": for Tegra186
- - "nvidia,tegra194-pmc": for Tegra194
- - "nvidia,tegra234-pmc": for Tegra234
-- reg: Must contain an (offset, length) pair of the register set for each
- entry in reg-names.
-- reg-names: Must include the following entries:
- - "pmc"
- - "wake"
- - "aotag"
- - "scratch"
- - "misc" (Only for Tegra194 and later)
-
-Optional properties:
-- nvidia,invert-interrupt: If present, inverts the PMU interrupt signal.
-- interrupt-controller: Identifies the node as an interrupt controller.
-- #interrupt-cells: Specifies the number of cells needed to encode an
- interrupt source. The value must be 2.
-
-Example:
-
-SoC DTSI:
-
- pmc@c3600000 {
- compatible = "nvidia,tegra186-pmc";
- reg = <0 0x0c360000 0 0x10000>,
- <0 0x0c370000 0 0x10000>,
- <0 0x0c380000 0 0x10000>,
- <0 0x0c390000 0 0x10000>;
- reg-names = "pmc", "wake", "aotag", "scratch";
- };
-
-Board DTS:
-
- pmc@c360000 {
- nvidia,invert-interrupt;
- };
-
-== Pad Control ==
-
-On Tegra SoCs a pad is a set of pins which are configured as a group.
-The pin grouping is a fixed attribute of the hardware. The PMC can be
-used to set pad power state and signaling voltage. A pad can be either
-in active or power down mode. The support for power state and signaling
-voltage configuration varies depending on the pad in question. 3.3 V and
-1.8 V signaling voltages are supported on pins where software
-controllable signaling voltage switching is available.
-
-Pad configurations are described with pin configuration nodes which
-are placed under the pmc node and they are referred to by the pinctrl
-client properties. For more information see
-Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt.
-
-The following pads are present on Tegra186:
-csia csib dsi mipi-bias
-pex-clk-bias pex-clk3 pex-clk2 pex-clk1
-usb0 usb1 usb2 usb-bias
-uart audio hsic dbg
-hdmi-dp0 hdmi-dp1 pex-cntrl sdmmc2-hv
-sdmmc4 cam dsib dsic
-dsid csic csid csie
-dsif spi ufs dmic-hv
-edp sdmmc1-hv sdmmc3-hv conn
-audio-hv ao-hv
-
-Required pin configuration properties:
- - pins: A list of strings, each of which contains the name of a pad
- to be configured.
-
-Optional pin configuration properties:
- - low-power-enable: Configure the pad into power down mode
- - low-power-disable: Configure the pad into active mode
- - power-source: Must contain either TEGRA_IO_PAD_VOLTAGE_1V8 or
- TEGRA_IO_PAD_VOLTAGE_3V3 to select between signaling voltages.
- The values are defined in
- include/dt-bindings/pinctrl/pinctrl-tegra-io-pad.h.
-
-Note: The power state can be configured on all of the above pads except
- for ao-hv. Following pads have software configurable signaling
- voltages: sdmmc2-hv, dmic-hv, sdmmc1-hv, sdmmc3-hv, audio-hv,
- ao-hv.
-
-Pad configuration state example:
- pmc: pmc@7000e400 {
- compatible = "nvidia,tegra186-pmc";
- reg = <0 0x0c360000 0 0x10000>,
- <0 0x0c370000 0 0x10000>,
- <0 0x0c380000 0 0x10000>,
- <0 0x0c390000 0 0x10000>;
- reg-names = "pmc", "wake", "aotag", "scratch";
-
- ...
-
- sdmmc1_3v3: sdmmc1-3v3 {
- pins = "sdmmc1-hv";
- power-source = <TEGRA_IO_PAD_VOLTAGE_3V3>;
- };
-
- sdmmc1_1v8: sdmmc1-1v8 {
- pins = "sdmmc1-hv";
- power-source = <TEGRA_IO_PAD_VOLTAGE_1V8>;
- };
-
- hdmi_off: hdmi-off {
- pins = "hdmi";
- low-power-enable;
- }
-
- hdmi_on: hdmi-on {
- pins = "hdmi";
- low-power-disable;
- }
- };
-
-Pinctrl client example:
- sdmmc1: sdhci@3400000 {
- ...
- pinctrl-names = "sdmmc-3v3", "sdmmc-1v8";
- pinctrl-0 = <&sdmmc1_3v3>;
- pinctrl-1 = <&sdmmc1_1v8>;
- };
-
- ...
-
- sor0: sor@15540000 {
- ...
- pinctrl-0 = <&hdmi_off>;
- pinctrl-1 = <&hdmi_on>;
- pinctrl-names = "hdmi-on", "hdmi-off";
- };
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.yaml
new file mode 100644
index 000000000000..0faa403f68c8
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.yaml
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/tegra/nvidia,tegra186-pmc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Power Management Controller (PMC)
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ compatible:
+ enum:
+ - nvidia,tegra186-pmc
+ - nvidia,tegra194-pmc
+ - nvidia,tegra234-pmc
+
+ reg:
+ minItems: 4
+ maxItems: 5
+
+ reg-names:
+ minItems: 4
+ items:
+ - const: pmc
+ - const: wake
+ - const: aotag
+ - const: scratch
+ - const: misc
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ description: Specifies the number of cells needed to encode an
+ interrupt source. The value must be 2.
+ const: 2
+
+ nvidia,invert-interrupt:
+ description: If present, inverts the PMU interrupt signal.
+ $ref: /schemas/types.yaml#/definitions/flag
+
+if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra186-pmc
+then:
+ properties:
+ reg:
+ maxItems: 4
+
+ reg-names:
+ maxItems: 4
+else:
+ properties:
+ reg:
+ minItems: 5
+
+ reg-names:
+ minItems: 5
+
+patternProperties:
+ "^[a-z0-9]+-[a-z0-9]+$":
+ if:
+ type: object
+ then:
+ description: |
+ These are pad configuration nodes. On Tegra SoCs a pad is a set of
+ pins which are configured as a group. The pin grouping is a fixed
+ attribute of the hardware. The PMC can be used to set pad power
+ state and signaling voltage. A pad can be either in active or
+ power down mode. The support for power state and signaling voltage
+ configuration varies depending on the pad in question. 3.3 V and
+ 1.8 V signaling voltages are supported on pins where software
+ controllable signaling voltage switching is available.
+
+ Pad configurations are described with pin configuration nodes
+ which are placed under the pmc node and they are referred to by
+ the pinctrl client properties. For more information see
+
+ Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+
+ The following pads are present on Tegra186:
+
+ csia, csib, dsi, mipi-bias, pex-clk-bias, pex-clk3, pex-clk2,
+ pex-clk1, usb0, usb1, usb2, usb-bias, uart, audio, hsic, dbg,
+ hdmi-dp0, hdmi-dp1, pex-cntrl, sdmmc2-hv, sdmmc4, cam, dsib,
+ dsic, dsid, csic, csid, csie, dsif, spi, ufs, dmic-hv, edp,
+ sdmmc1-hv, sdmmc3-hv, conn, audio-hv, ao-hv
+
+ The following pads are present on Tegra194:
+
+ csia, csib, mipi-bias, pex-clk-bias, pex-clk3, pex-clk2,
+ pex-clk1, eqos, pex-clk-2-bias, pex-clk-2, dap3, dap5, uart,
+ pwr-ctl, soc-gpio53, audio, gp-pwm2, gp-pwm3, soc-gpio12,
+ soc-gpio13, soc-gpio10, uart4, uart5, dbg, hdmi-dp3, hdmi-dp2,
+ hdmi-dp0, hdmi-dp1, pex-cntrl, pex-ctl2, pex-l0-rst,
+ pex-l1-rst, sdmmc4, pex-l5-rst, cam, csic, csid, csie, csif,
+ spi, ufs, csig, csih, edp, sdmmc1-hv, sdmmc3-hv, conn,
+ audio-hv, ao-hv
+
+ properties:
+ pins:
+ $ref: /schemas/types.yaml#/definitions/string
+ description: Must contain the name of the pad(s) to be
+ configured.
+
+ low-power-enable:
+ description: Configure the pad into power down mode.
+ $ref: /schemas/types.yaml#/definitions/flag
+
+ low-power-disable:
+ description: Configure the pad into active mode.
+ $ref: /schemas/types.yaml#/definitions/flag
+
+ power-source:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ Must contain either TEGRA_IO_PAD_VOLTAGE_1V8 or
+ TEGRA_IO_PAD_VOLTAGE_3V3 to select between signalling
+ voltages.
+
+ The values are defined in
+
+ include/dt-bindings/pinctrl/pinctrl-tegra-io-pad.h
+
+ The power state can be configured on all of the above pads
+ except for ao-hv. Following pads have software configurable
+ signaling voltages: sdmmc2-hv, dmic-hv, sdmmc1-hv, sdmmc3-hv,
+ audio-hv, ao-hv.
+
+ phandle: true
+
+ required:
+ - pins
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - reg-names
+
+additionalProperties: false
+
+dependencies:
+ interrupt-controller: ['#interrupt-cells']
+ "#interrupt-cells":
+ required:
+ - interrupt-controller
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra186-clock.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/pinctrl/pinctrl-tegra-io-pad.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ pmc@c3600000 {
+ compatible = "nvidia,tegra186-pmc";
+ reg = <0x0c360000 0x10000>,
+ <0x0c370000 0x10000>,
+ <0x0c380000 0x10000>,
+ <0x0c390000 0x10000>;
+ reg-names = "pmc", "wake", "aotag", "scratch";
+ nvidia,invert-interrupt;
+
+ sdmmc1_3v3: sdmmc1-3v3 {
+ pins = "sdmmc1-hv";
+ power-source = <TEGRA_IO_PAD_VOLTAGE_3V3>;
+ };
+
+ sdmmc1_1v8: sdmmc1-1v8 {
+ pins = "sdmmc1-hv";
+ power-source = <TEGRA_IO_PAD_VOLTAGE_1V8>;
+ };
+ };
+
+ sdmmc1: mmc@3400000 {
+ compatible = "nvidia,tegra186-sdhci";
+ reg = <0x03400000 0x10000>;
+ interrupts = <GIC_SPI 62 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA186_CLK_SDMMC1>,
+ <&bpmp TEGRA186_CLK_SDMMC_LEGACY_TM>;
+ clock-names = "sdhci", "tmclk";
+ resets = <&bpmp TEGRA186_RESET_SDMMC1>;
+ reset-names = "sdhci";
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_SDMMCRA &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_SDMMCWA &emc>;
+ interconnect-names = "dma-mem", "write";
+ iommus = <&smmu TEGRA186_SID_SDMMC1>;
+ pinctrl-names = "sdmmc-3v3", "sdmmc-1v8";
+ pinctrl-0 = <&sdmmc1_3v3>;
+ pinctrl-1 = <&sdmmc1_1v8>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-axi2apb.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-axi2apb.yaml
new file mode 100644
index 000000000000..5e0f1dc542b0
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-axi2apb.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/tegra/nvidia,tegra194-axi2apb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra194 AXI2APB bridge
+
+maintainers:
+ - Sumit Gupta <sumitg@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^axi2apb@([0-9a-f]+)$"
+
+ compatible:
+ enum:
+ - nvidia,tegra194-axi2apb
+
+ reg:
+ maxItems: 6
+ description: Physical base address and length of registers for all bridges
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ axi2apb: axi2apb@2390000 {
+ compatible = "nvidia,tegra194-axi2apb";
+ reg = <0x02390000 0x1000>,
+ <0x023a0000 0x1000>,
+ <0x023b0000 0x1000>,
+ <0x023c0000 0x1000>,
+ <0x023d0000 0x1000>,
+ <0x023e0000 0x1000>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-cbb.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-cbb.yaml
new file mode 100644
index 000000000000..d9c54c32c6b9
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra194-cbb.yaml
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/tegra/nvidia,tegra194-cbb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra194 CBB 1.0
+
+maintainers:
+ - Sumit Gupta <sumitg@nvidia.com>
+
+description: |+
+ The Control Backbone (CBB) comprises the physical path from an
+ initiator to a target's register configuration space. CBB 1.0 has
+ multiple hierarchical sub-NOCs (Networks-on-Chip) and connects various
+ initiators and targets using different bridges such as AXIP2P and
+ AXI2APB.
+
+ This driver handles errors due to illegal register accesses reported
+ by the NOCs inside the CBB. The NOCs reporting errors are the cluster
+ NOCs "AON-NOC, SCE-NOC, RCE-NOC, BPMP-NOC, CV-NOC" and the main
+ "CBB Central NOC".
+
+ By default, the access-issuing initiator is informed about the error
+ using an SError or Data Abort exception, unless ERD (Error Response
+ Disable) is set for that initiator. If ERD is enabled, the SError or
+ Data Abort is masked and the error is reported with an interrupt.
+
+ - For the CCPLEX (CPU Complex) initiator, the driver sets the ERD bit,
+ so errors due to illegal accesses from the CCPLEX are reported by
+ interrupts. If ERD is not set, the error is reported by an SError.
+ - For other initiators, ERD is disabled, so the access-issuing
+ initiator is informed about the illegal access by a Data Abort
+ exception. In addition, an interrupt is also generated to the CCPLEX.
+ These initiators include all engines using a Cortex-R5 (an ARMv7 CPU
+ cluster) and engines like TSEC (Security co-processor), NVDEC (NVIDIA
+ Video Decoder engine), etc., which can initiate transactions.
+
+ On receiving an error notification, the driver prints relevant debug
+ information such as the Error Code, Error Description, Master, Address,
+ AXI ID, Cache, Protection, Security Group, etc.
+
+properties:
+ $nodename:
+ pattern: "^[a-z]+-noc@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra194-cbb-noc
+ - nvidia,tegra194-aon-noc
+ - nvidia,tegra194-bpmp-noc
+ - nvidia,tegra194-rce-noc
+ - nvidia,tegra194-sce-noc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ description:
+ CCPLEX receives a secure or non-secure interrupt depending on the
+ error type. A secure interrupt is received for SEC (firewall) & SLV
+ errors and a non-secure interrupt for TMO & DEC errors.
+ items:
+ - description: non-secure interrupt
+ - description: secure interrupt
+
+ nvidia,axi2apb:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Specifies the node containing all axi2apb bridges which need to be
+ checked for any error logged in their status registers.
+
+ nvidia,apbmisc:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Specifies the apbmisc node which needs to be used for reading the ERD
+ register.
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - nvidia,apbmisc
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ cbb-noc@2300000 {
+ compatible = "nvidia,tegra194-cbb-noc";
+ reg = <0x02300000 0x1000>;
+ interrupts = <GIC_SPI 230 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 231 IRQ_TYPE_LEVEL_HIGH>;
+ nvidia,axi2apb = <&axi2apb>;
+ nvidia,apbmisc = <&apbmisc>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.yaml
index 0afec83cc723..89191cfdf619 100644
--- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.yaml
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-pmc.yaml
@@ -14,7 +14,6 @@ properties:
compatible:
enum:
- nvidia,tegra20-pmc
- - nvidia,tegra20-pmc
- nvidia,tegra30-pmc
- nvidia,tegra114-pmc
- nvidia,tegra124-pmc
@@ -124,6 +123,33 @@ properties:
some PLLs, clocks and then brings up CPU0 for resuming the
system.
+ core-supply:
+ description:
+ Phandle to voltage regulator connected to the SoC Core power rail.
+
+ core-domain:
+ type: object
+ description: |
+ The vast majority of hardware blocks of Tegra SoC belong to a
+ Core power domain, which has a dedicated voltage rail that powers
+ the blocks.
+
+ properties:
+ operating-points-v2:
+ description:
+ Should contain level, voltages and opp-supported-hw property.
+ The supported-hw is a bitfield indicating SoC speedo or process
+ ID mask.
+
+ "#power-domain-cells":
+ const: 0
+
+ required:
+ - operating-points-v2
+ - "#power-domain-cells"
+
+ additionalProperties: false
+
i2c-thermtrip:
type: object
description:
@@ -208,8 +234,9 @@ properties:
patternProperties:
"^[a-z0-9]+$":
type: object
+ additionalProperties: false
- patternProperties:
+ properties:
clocks:
minItems: 1
maxItems: 8
@@ -226,6 +253,9 @@ properties:
for controlling a power-gate.
See ../reset/reset.txt for more details.
+ power-domains:
+ maxItems: 1
+
'#power-domain-cells':
const: 0
description: Must be 0.
@@ -301,33 +331,6 @@ patternProperties:
additionalProperties: false
- core-domain:
- type: object
- description: |
- The vast majority of hardware blocks of Tegra SoC belong to a
- Core power domain, which has a dedicated voltage rail that powers
- the blocks.
-
- properties:
- operating-points-v2:
- description:
- Should contain level, voltages and opp-supported-hw property.
- The supported-hw is a bitfield indicating SoC speedo or process
- ID mask.
-
- "#power-domain-cells":
- const: 0
-
- required:
- - operating-points-v2
- - "#power-domain-cells"
-
- additionalProperties: false
-
- core-supply:
- description:
- Phandle to voltage regulator connected to the SoC Core power rail.
-
required:
- compatible
- reg
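To illustrate the relocated core-domain and core-supply properties, a hedged
sketch of a Tegra20 PMC node using them (the regulator and OPP-table phandles
are placeholders, not defined by this binding):

    pmc@7000e400 {
        compatible = "nvidia,tegra20-pmc";
        reg = <0x7000e400 0x400>;
        core-supply = <&vdd_core>;

        core_domain: core-domain {
            operating-points-v2 = <&core_opp_table>;
            #power-domain-cells = <0>;
        };
    };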
diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra234-cbb.yaml b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra234-cbb.yaml
new file mode 100644
index 000000000000..fcdf03131323
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra234-cbb.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/tegra/nvidia,tegra234-cbb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra CBB 2.0
+
+maintainers:
+ - Sumit Gupta <sumitg@nvidia.com>
+
+description: |+
+ The Control Backbone (CBB) comprises the physical path from an
+ initiator to a target's register configuration space. CBB 2.0 consists
+ of multiple sub-blocks connected to each other to create a topology.
+ The Tegra234 SoC has different fabrics based on the CBB 2.0 architecture,
+ which include the cluster fabrics BPMP, AON, PSC, SCE, RCE, DCE, FSI and
+ the "CBB central fabric".
+
+ In CBB 2.0, each initiator which can issue transactions connects to a
+ Root Master Node (MN) before it connects to any other element of the
+ fabric. Each Root MN contains an Error Monitor (EM) which detects and
+ logs errors. Interrupts from the various EM blocks are collated by an
+ Error Notifier (EN), which is per fabric and presents a single interrupt
+ from the fabric to the SoC interrupt controller.
+
+ The driver handles errors from the CBB due to illegal register accesses.
+ On receiving the interrupt from the EN, it prints debug information
+ about the failed transaction, including the Error Code, Error
+ Description, MasterID, Fabric, SlaveID, Address, Cache, Protection,
+ Security Group etc.
+
+ If Error Response Disable (ERD) is set/enabled for an initiator, then
+ the SError or Data Abort exception error response is masked and an
+ interrupt is used instead for reporting errors due to illegal accesses
+ from that initiator. The value returned on read failures is '0xFFFFFFFF'
+ for compatibility with PCIe.
+
+properties:
+ $nodename:
+ pattern: "^[a-z]+-fabric@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra234-aon-fabric
+ - nvidia,tegra234-bpmp-fabric
+ - nvidia,tegra234-cbb-fabric
+ - nvidia,tegra234-dce-fabric
+ - nvidia,tegra234-rce-fabric
+ - nvidia,tegra234-sce-fabric
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ items:
+ - description: secure interrupt from error notifier
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ cbb-fabric@13a00000 {
+ compatible = "nvidia,tegra234-cbb-fabric";
+ reg = <0x13a00000 0x400000>;
+ interrupts = <GIC_SPI 231 IRQ_TYPE_LEVEL_HIGH>;
+ };
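The cluster fabrics listed in the compatible enum follow the same shape as the CBB central fabric; a hedged sketch for the AON fabric, with an assumed unit address and SPI number, might read:

    aon-fabric@b600000 {
        compatible = "nvidia,tegra234-aon-fabric";
        reg = <0x0b600000 0x40000>;
        interrupts = <GIC_SPI 228 IRQ_TYPE_LEVEL_HIGH>;
    };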
diff --git a/Documentation/devicetree/bindings/arm/tesla.yaml b/Documentation/devicetree/bindings/arm/tesla.yaml
new file mode 100644
index 000000000000..d670a0d56222
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/tesla.yaml
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/tesla.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Tesla Full Self Driving (FSD) platforms
+
+maintainers:
+ - Alim Akhtar <alim.akhtar@samsung.com>
+ - linux-fsd@tesla.com
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: FSD SoC board
+ items:
+ - enum:
+ - tesla,fsd-evb # Tesla FSD Evaluation
+ - const: tesla,fsd
+
+additionalProperties: true
+
+...
diff --git a/Documentation/devicetree/bindings/arm/ti/k3.yaml b/Documentation/devicetree/bindings/arm/ti/k3.yaml
index c5aa362e4026..e1183f90bb06 100644
--- a/Documentation/devicetree/bindings/arm/ti/k3.yaml
+++ b/Documentation/devicetree/bindings/arm/ti/k3.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/ti/k3.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Texas Instruments K3 Multicore SoC architecture device tree bindings
+title: Texas Instruments K3 Multicore SoC architecture
maintainers:
- Nishanth Menon <nm@ti.com>
@@ -19,28 +19,75 @@ properties:
compatible:
oneOf:
+ - description: K3 AM62A7 SoC
+ items:
+ - enum:
+ - ti,am62a7-sk
+ - const: ti,am62a7
+
+ - description: K3 AM625 SoC
+ items:
+ - enum:
+ - beagle,am625-beagleplay
+ - ti,am625-sk
+ - ti,am62-lp-sk
+ - const: ti,am625
+
+ - description: K3 AM642 SoC
+ items:
+ - enum:
+ - ti,am642-evm
+ - ti,am642-sk
+ - const: ti,am642
+
+ - description: K3 AM642 SoC PHYTEC phyBOARD-Electra
+ items:
+ - const: phytec,am642-phyboard-electra-rdk
+ - const: phytec,am64-phycore-som
+ - const: ti,am642
+
- description: K3 AM654 SoC
items:
- enum:
- - ti,am654-evm
- - siemens,iot2050-basic
- siemens,iot2050-advanced
+ - siemens,iot2050-advanced-m2
+ - siemens,iot2050-advanced-pg2
+ - siemens,iot2050-basic
+ - siemens,iot2050-basic-pg2
+ - ti,am654-evm
- const: ti,am654
+ - description: K3 J7200 SoC
+ oneOf:
+ - const: ti,j7200
+ - items:
+ - enum:
+ - ti,j7200-evm
+ - const: ti,j7200
+
- description: K3 J721E SoC
- items:
+ oneOf:
- const: ti,j721e
+ - items:
+ - enum:
+ - beagle,j721e-beagleboneai64
+ - ti,j721e-evm
+ - ti,j721e-sk
+ - const: ti,j721e
- - description: K3 J7200 SoC
+ - description: K3 J721s2 SoC
items:
- - const: ti,j7200
+ - enum:
+ - ti,am68-sk
+ - ti,j721s2-evm
+ - const: ti,j721s2
- - description: K3 AM642 SoC
+ - description: K3 J784s4 SoC
items:
- enum:
- - ti,am642-evm
- - ti,am642-sk
- - const: ti,am642
+ - ti,am69-sk
+ - ti,j784s4-evm
+ - const: ti,j784s4
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml b/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml
index c022d325fc08..1656d1a4476f 100644
--- a/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml
+++ b/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/ti/ti,davinci.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Texas Instruments DaVinci Platforms Device Tree Bindings
+title: Texas Instruments DaVinci Platforms
maintainers:
- Sekhar Nori <nsekhar@ti.com>
diff --git a/Documentation/devicetree/bindings/arm/toshiba.yaml b/Documentation/devicetree/bindings/arm/toshiba.yaml
index 001bbbcd1432..716ba4a3cab4 100644
--- a/Documentation/devicetree/bindings/arm/toshiba.yaml
+++ b/Documentation/devicetree/bindings/arm/toshiba.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/toshiba.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Toshiba Visconti Platform Device Tree Bindings
+title: Toshiba Visconti Platform
maintainers:
- Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
@@ -18,6 +18,7 @@ properties:
items:
- enum:
- toshiba,tmpv7708-rm-mbrc # TMPV7708 RM main board
+ - toshiba,tmpv7708-visrobo-vrb # TMPV7708 VisROBO VRB board
- const: toshiba,tmpv7708
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/trbe.yaml b/Documentation/devicetree/bindings/arm/trbe.yaml
deleted file mode 100644
index 4402d7bfd1fc..000000000000
--- a/Documentation/devicetree/bindings/arm/trbe.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
-# Copyright 2021, Arm Ltd
-%YAML 1.2
----
-$id: "http://devicetree.org/schemas/arm/trbe.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
-
-title: ARM Trace Buffer Extensions
-
-maintainers:
- - Anshuman Khandual <anshuman.khandual@arm.com>
-
-description: |
- Arm Trace Buffer Extension (TRBE) is a per CPU component
- for storing trace generated on the CPU to memory. It is
- accessed via CPU system registers. The software can verify
- if it is permitted to use the component by checking the
- TRBIDR register.
-
-properties:
- $nodename:
- const: "trbe"
- compatible:
- items:
- - const: arm,trace-buffer-extension
-
- interrupts:
- description: |
- Exactly 1 PPI must be listed. For heterogeneous systems where
- TRBE is only supported on a subset of the CPUs, please consult
- the arm,gic-v3 binding for details on describing a PPI partition.
- maxItems: 1
-
-required:
- - compatible
- - interrupts
-
-additionalProperties: false
-
-examples:
-
- - |
- #include <dt-bindings/interrupt-controller/arm-gic.h>
-
- trbe {
- compatible = "arm,trace-buffer-extension";
- interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
- };
-...
diff --git a/Documentation/devicetree/bindings/arm/ux500.yaml b/Documentation/devicetree/bindings/arm/ux500.yaml
index 5db7cfba81a4..b42d20fa4359 100644
--- a/Documentation/devicetree/bindings/arm/ux500.yaml
+++ b/Documentation/devicetree/bindings/arm/ux500.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/ux500.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Ux500 platforms device tree bindings
+title: Ux500 platforms
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
@@ -20,6 +20,11 @@ properties:
- const: st-ericsson,mop500
- const: st-ericsson,u8500
+ - description: ST-Ericsson HREF520
+ items:
+ - const: st-ericsson,href520
+ - const: st-ericsson,u8500
+
- description: ST-Ericsson HREF (v60+)
items:
- const: st-ericsson,hrefv60+
@@ -30,9 +35,39 @@ properties:
- const: calaosystems,snowball-a9500
- const: st-ericsson,u9500
+ - description: Samsung Galaxy Ace 2 (GT-I8160)
+ items:
+ - const: samsung,codina
+ - const: st-ericsson,u8500
+
+ - description: Samsung Galaxy Exhibit (SGH-T599)
+ items:
+ - const: samsung,codina-tmo
+ - const: st-ericsson,u8500
+
+ - description: Samsung Galaxy Beam (GT-I8530)
+ items:
+ - const: samsung,gavini
+ - const: st-ericsson,u8500
+
- description: Samsung Galaxy S III mini (GT-I8190)
items:
- const: samsung,golden
- const: st-ericsson,u8500
+ - description: Samsung Galaxy S Advance (GT-I9070)
+ items:
+ - const: samsung,janice
+ - const: st-ericsson,u8500
+
+ - description: Samsung Galaxy Amp (SGH-I407)
+ items:
+ - const: samsung,kyle
+ - const: st-ericsson,u8500
+
+ - description: Samsung Galaxy XCover 2 (GT-S7710)
+ items:
+ - const: samsung,skomer
+ - const: st-ericsson,u8500
+
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/arm/versatile-sysreg.txt b/Documentation/devicetree/bindings/arm/versatile-sysreg.txt
deleted file mode 100644
index a4f15262d717..000000000000
--- a/Documentation/devicetree/bindings/arm/versatile-sysreg.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-ARM Versatile system registers
---------------------------------------
-
-This is a system control registers block, providing multiple low level
-platform functions like board detection and identification, software
-interrupt generation, MMC and NOR Flash control etc.
-
-Required node properties:
-- compatible value : = "arm,versatile-sysreg", "syscon"
-- reg : physical base address and the size of the registers window
diff --git a/Documentation/devicetree/bindings/arm/vexpress-config.yaml b/Documentation/devicetree/bindings/arm/vexpress-config.yaml
new file mode 100644
index 000000000000..b74380da3198
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/vexpress-config.yaml
@@ -0,0 +1,285 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/vexpress-config.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM Versatile Express configuration bus
+
+maintainers:
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+ This is a system control register block, acting as a bridge to the
+ platform's configuration bus via the "system control" interface, addressing
+ devices by site number, position in the board stack, config controller,
+ function and device numbers - see the motherboard's TRM for more details.
+
+properties:
+ compatible:
+ const: arm,vexpress,config-bus
+
+ arm,vexpress,config-bridge:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the sysreg node.
+
+ muxfpga:
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-muxfpga
+
+ arm,vexpress-sysreg,func:
+ description: FPGA specifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 7
+ - description: device number
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ shutdown:
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-shutdown
+
+ arm,vexpress-sysreg,func:
+ description: shutdown identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 8
+ - description: device number
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ reboot:
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-reboot
+
+ arm,vexpress-sysreg,func:
+ description: reboot identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 9
+ - description: device number
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ dvimode:
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-dvimode
+
+ arm,vexpress-sysreg,func:
+ description: DVI mode identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 11
+ - description: device number
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+additionalProperties: false
+
+required:
+ - compatible
+ - arm,vexpress,config-bridge
+
+patternProperties:
+ 'clk[0-9]*$':
+ type: object
+ description:
+ clocks
+
+ properties:
+ compatible:
+ const: arm,vexpress-osc
+
+ arm,vexpress-sysreg,func:
+ description: clock specifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 1
+ - description: clock number
+
+ freq-range:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - description: minimum clock frequency
+ - description: maximum clock frequency
+
+ "#clock-cells":
+ const: 0
+
+ clock-output-names:
+ maxItems: 1
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+ - "#clock-cells"
+
+ "^volt-.+$":
+ $ref: /schemas/regulator/regulator.yaml#
+ properties:
+ compatible:
+ const: arm,vexpress-volt
+
+ arm,vexpress-sysreg,func:
+ description: regulator specifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 2
+ - description: device number
+
+ label:
+ maxItems: 1
+
+ unevaluatedProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ "^amp-.+$":
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-amp
+
+ arm,vexpress-sysreg,func:
+ description: current sensor identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 3
+ - description: device number
+
+ label:
+ maxItems: 1
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ "^temp-.+$":
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-temp
+
+ arm,vexpress-sysreg,func:
+ description: temperature sensor identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 4
+ - description: device number
+
+ label:
+ maxItems: 1
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ "^reset[0-9]*$":
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-reset
+
+ arm,vexpress-sysreg,func:
+ description: reset specifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 5
+ - description: reset device number
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ "^power-.+$":
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-power
+
+ arm,vexpress-sysreg,func:
+ description: power sensor identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - const: 12
+ - description: device number
+
+ label:
+ maxItems: 1
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+ "^energy(-.+)?$":
+ type: object
+ properties:
+ compatible:
+ const: arm,vexpress-energy
+
+ arm,vexpress-sysreg,func:
+ description: energy sensor identifier
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ oneOf:
+ - items:
+ - const: 13
+ - description: device number
+ - items:
+ - const: 13
+ - description: device number
+ - const: 13
+ - description: second device number
+
+ label:
+ maxItems: 1
+
+ additionalProperties: false
+ required:
+ - compatible
+ - arm,vexpress-sysreg,func
+
+examples:
+ - |
+ mcc {
+ compatible = "arm,vexpress,config-bus";
+ arm,vexpress,config-bridge = <&v2m_sysreg>;
+
+ clk0 {
+ compatible = "arm,vexpress-osc";
+ arm,vexpress-sysreg,func = <1 0>;
+ #clock-cells = <0>;
+ };
+
+ energy {
+ compatible = "arm,vexpress-energy";
+ arm,vexpress-sysreg,func = <13 0>, <13 1>;
+ };
+ };
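Other function nodes use the same <function device> encoding defined above (5 for resets, 4 for temperature sensors, and so on); a hedged sketch of two more children, with assumed node names and labels, could look like:

    mcc {
        compatible = "arm,vexpress,config-bus";
        arm,vexpress,config-bridge = <&v2m_sysreg>;

        reset0 {
            compatible = "arm,vexpress-reset";
            arm,vexpress-sysreg,func = <5 0>;
        };

        temp-dcc {
            compatible = "arm,vexpress-temp";
            arm,vexpress-sysreg,func = <4 0>;
            label = "DCC";
        };
    };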
diff --git a/Documentation/devicetree/bindings/arm/vexpress-sysreg.txt b/Documentation/devicetree/bindings/arm/vexpress-sysreg.txt
deleted file mode 100644
index 50095802fb4a..000000000000
--- a/Documentation/devicetree/bindings/arm/vexpress-sysreg.txt
+++ /dev/null
@@ -1,103 +0,0 @@
-ARM Versatile Express system registers
---------------------------------------
-
-This is a system control registers block, providing multiple low level
-platform functions like board detection and identification, software
-interrupt generation, MMC and NOR Flash control etc.
-
-Required node properties:
-- compatible value : = "arm,vexpress,sysreg";
-- reg : physical base address and the size of the registers window
-
-Deprecated properties, replaced by GPIO subnodes (see below):
-- gpio-controller : specifies that the node is a GPIO controller
-- #gpio-cells : size of the GPIO specifier, should be 2:
- - first cell is the pseudo-GPIO line number:
- 0 - MMC CARDIN
- 1 - MMC WPROT
- 2 - NOR FLASH WPn
- - second cell can take standard GPIO flags (currently ignored).
-
-Control registers providing pseudo-GPIO lines must be represented
-by subnodes, each of them requiring the following properties:
-- compatible value : one of
- "arm,vexpress-sysreg,sys_led"
- "arm,vexpress-sysreg,sys_mci"
- "arm,vexpress-sysreg,sys_flash"
-- gpio-controller : makes the node a GPIO controller
-- #gpio-cells : size of the GPIO specifier, must be 2:
- - first cell is the function number:
- - for sys_led : 0..7 = LED 0..7
- - for sys_mci : 0 = MMC CARDIN, 1 = MMC WPROT
- - for sys_flash : 0 = NOR FLASH WPn
- - second cell can take standard GPIO flags (currently ignored).
-
-Example:
- v2m_sysreg: sysreg@10000000 {
- compatible = "arm,vexpress-sysreg";
- reg = <0x10000000 0x1000>;
-
- v2m_led_gpios: sys_led@8 {
- compatible = "arm,vexpress-sysreg,sys_led";
- gpio-controller;
- #gpio-cells = <2>;
- };
-
- v2m_mmc_gpios: sys_mci@48 {
- compatible = "arm,vexpress-sysreg,sys_mci";
- gpio-controller;
- #gpio-cells = <2>;
- };
-
- v2m_flash_gpios: sys_flash@4c {
- compatible = "arm,vexpress-sysreg,sys_flash";
- gpio-controller;
- #gpio-cells = <2>;
- };
- };
-
-This block also can also act a bridge to the platform's configuration
-bus via "system control" interface, addressing devices with site number,
-position in the board stack, config controller, function and device
-numbers - see motherboard's TRM for more details. All configuration
-controller accessible via this interface must reference the sysreg
-node via "arm,vexpress,config-bridge" phandle and define appropriate
-topology properties - see main vexpress node documentation for more
-details. Each child of such node describes one function and must
-define the following properties:
-- compatible value : must be one of (corresponding to the TRM):
- "arm,vexpress-amp"
- "arm,vexpress-dvimode"
- "arm,vexpress-energy"
- "arm,vexpress-muxfpga"
- "arm,vexpress-osc"
- "arm,vexpress-power"
- "arm,vexpress-reboot"
- "arm,vexpress-reset"
- "arm,vexpress-scc"
- "arm,vexpress-shutdown"
- "arm,vexpress-temp"
- "arm,vexpress-volt"
-- arm,vexpress-sysreg,func : must contain a set of two cells long groups:
- - first cell of each group defines the function number
- (eg. 1 for clock generator, 2 for voltage regulators etc.)
- - second cell of each group defines device number (eg. osc 0,
- osc 1 etc.)
- - some functions (eg. energy meter, with its 64 bit long counter)
- are using more than one function/device number pair
-
-Example:
- mcc {
- compatible = "arm,vexpress,config-bus";
- arm,vexpress,config-bridge = <&v2m_sysreg>;
-
- osc@0 {
- compatible = "arm,vexpress-osc";
- arm,vexpress-sysreg,func = <1 0>;
- };
-
- energy@0 {
- compatible = "arm,vexpress-energy";
- arm,vexpress-sysreg,func = <13 0>, <13 1>;
- };
- };
diff --git a/Documentation/devicetree/bindings/arm/vexpress-sysreg.yaml b/Documentation/devicetree/bindings/arm/vexpress-sysreg.yaml
new file mode 100644
index 000000000000..be6e3b542569
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/vexpress-sysreg.yaml
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/vexpress-sysreg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM Versatile Express system registers
+
+maintainers:
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+ This is a system control register block, providing multiple low-level
+ platform functions like board detection and identification, software
+ interrupt generation, MMC and NOR Flash control, etc.
+
+properties:
+ compatible:
+ const: arm,vexpress-sysreg
+
+ reg:
+ maxItems: 1
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 1
+
+ ranges: true
+
+ gpio-controller:
+ deprecated: true
+
+ "#gpio-cells":
+ deprecated: true
+ const: 2
+
+additionalProperties: false
+
+patternProperties:
+ '^gpio@[0-9a-f]+$':
+ type: object
+ additionalProperties: false
+ description:
+ GPIO children
+
+ properties:
+ compatible:
+ enum:
+ - arm,vexpress-sysreg,sys_led
+ - arm,vexpress-sysreg,sys_mci
+ - arm,vexpress-sysreg,sys_flash
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+ description: |
+ The first cell is the function number:
+ for sys_led : 0..7 = LED 0..7
+ for sys_mci : 0 = MMC CARDIN, 1 = MMC WPROT
+ for sys_flash : 0 = NOR FLASH WPn
+ The second cell can take standard GPIO flags.
+
+ reg:
+ maxItems: 1
+
+ required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ sysreg@0 {
+ compatible = "arm,vexpress-sysreg";
+ reg = <0x00000 0x1000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0 0 0x1000>;
+
+ v2m_led_gpios: gpio@8 {
+ compatible = "arm,vexpress-sysreg,sys_led";
+ reg = <0x008 4>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ };
+ };
+
+...
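The deleted text binding placed the MMC and NOR Flash pseudo-GPIO blocks at offsets 0x48 and 0x4c; under this schema, those children would be renamed by offset, roughly as in this sketch:

    v2m_mmc_gpios: gpio@48 {
        compatible = "arm,vexpress-sysreg,sys_mci";
        reg = <0x048 4>;
        gpio-controller;
        #gpio-cells = <2>;
    };

    v2m_flash_gpios: gpio@4c {
        compatible = "arm,vexpress-sysreg,sys_flash";
        reg = <0x04c 4>;
        gpio-controller;
        #gpio-cells = <2>;
    };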
diff --git a/Documentation/devicetree/bindings/arm/vt8500.yaml b/Documentation/devicetree/bindings/arm/vt8500.yaml
index 7b762bfc11e7..5d5ad5a60451 100644
--- a/Documentation/devicetree/bindings/arm/vt8500.yaml
+++ b/Documentation/devicetree/bindings/arm/vt8500.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/vt8500.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: VIA/Wondermedia VT8500 Platforms Device Tree Bindings
+title: VIA/Wondermedia VT8500 Platforms
maintainers:
- Tony Prisk <linux@prisktech.co.nz>
diff --git a/Documentation/devicetree/bindings/arm/xen.txt b/Documentation/devicetree/bindings/arm/xen.txt
index db5c56db30ec..61d77acbeb5e 100644
--- a/Documentation/devicetree/bindings/arm/xen.txt
+++ b/Documentation/devicetree/bindings/arm/xen.txt
@@ -7,15 +7,17 @@ the following properties:
compatible = "xen,xen-<version>", "xen,xen";
where <version> is the version of the Xen ABI of the platform.
-- reg: specifies the base physical address and size of a region in
- memory where the grant table should be mapped to, using an
- HYPERVISOR_memory_op hypercall. The memory region is large enough to map
- the whole grant table (it is larger or equal to gnttab_max_grant_frames()).
- This property is unnecessary when booting Dom0 using ACPI.
+- reg: specifies the base physical address and size of the regions in memory
+ where the special resources should be mapped to, using a HYPERVISOR_memory_op
+ hypercall.
+ Region 0 is reserved for mapping the grant table and must always be present.
+ The memory region is large enough to map the whole grant table (it is larger
+ than or equal to gnttab_max_grant_frames()).
+ Regions 1...N are extended regions (unused address space) for mapping foreign
+ GFNs and grants; they might be absent if there is nothing to expose.
- interrupts: the interrupt used by Xen to inject event notifications.
A GIC node is also required.
- This property is unnecessary when booting Dom0 using ACPI.
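Putting the above together, a hypervisor node exposing the grant-table region plus one extended region might be sketched as follows; the addresses, sizes and PPI number are illustrative assumptions:

    hypervisor {
        compatible = "xen,xen-4.16", "xen,xen";
        reg = <0x0 0x38000000 0x0 0x1000000>,
              <0x0 0x50000000 0x0 0x10000000>;
        interrupts = <1 15 0xf08>;
    };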
To support UEFI on Xen ARM virtual platforms, Xen populates the FDT "uefi" node
under /hypervisor with following parameters:
diff --git a/Documentation/devicetree/bindings/arm/xilinx.yaml b/Documentation/devicetree/bindings/arm/xilinx.yaml
index f52c7e8ce654..969cfe6dc434 100644
--- a/Documentation/devicetree/bindings/arm/xilinx.yaml
+++ b/Documentation/devicetree/bindings/arm/xilinx.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/arm/xilinx.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Xilinx Zynq Platforms Device Tree Bindings
+title: Xilinx Zynq Platforms
maintainers:
- Michal Simek <michal.simek@xilinx.com>
@@ -87,6 +87,7 @@ properties:
- xlnx,zynqmp-zcu102-revA
- xlnx,zynqmp-zcu102-revB
- xlnx,zynqmp-zcu102-rev1.0
+ - xlnx,zynqmp-zcu102-rev1.1
- const: xlnx,zynqmp-zcu102
- const: xlnx,zynqmp
@@ -115,6 +116,22 @@ properties:
- const: xlnx,zynqmp-zcu111
- const: xlnx,zynqmp
+ - description: Xilinx Kria SOMs
+ items:
+ - const: xlnx,zynqmp-sm-k26-rev1
+ - const: xlnx,zynqmp-sm-k26-revB
+ - const: xlnx,zynqmp-sm-k26-revA
+ - const: xlnx,zynqmp-sm-k26
+ - const: xlnx,zynqmp
+
+ - description: Xilinx Kria SOMs (starter)
+ items:
+ - const: xlnx,zynqmp-smk-k26-rev1
+ - const: xlnx,zynqmp-smk-k26-revB
+ - const: xlnx,zynqmp-smk-k26-revA
+ - const: xlnx,zynqmp-smk-k26
+ - const: xlnx,zynqmp
+
additionalProperties: true
...
diff --git a/Documentation/devicetree/bindings/ata/ahci-ceva.txt b/Documentation/devicetree/bindings/ata/ahci-ceva.txt
deleted file mode 100644
index bfb6da0281ec..000000000000
--- a/Documentation/devicetree/bindings/ata/ahci-ceva.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-Binding for CEVA AHCI SATA Controller
-
-Required properties:
- - reg: Physical base address and size of the controller's register area.
- - compatible: Compatibility string. Must be 'ceva,ahci-1v84'.
- - clocks: Input clock specifier. Refer to common clock bindings.
- - interrupts: Interrupt specifier. Refer to interrupt binding.
- - ceva,p0-cominit-params: OOB timing value for COMINIT parameter for port 0.
- - ceva,p1-cominit-params: OOB timing value for COMINIT parameter for port 1.
- The fields for the above parameter must be as shown below:
- ceva,pN-cominit-params = /bits/ 8 <CIBGMN CIBGMX CIBGN CINMP>;
- CINMP : COMINIT Negate Minimum Period.
- CIBGN : COMINIT Burst Gap Nominal.
- CIBGMX: COMINIT Burst Gap Maximum.
- CIBGMN: COMINIT Burst Gap Minimum.
- - ceva,p0-comwake-params: OOB timing value for COMWAKE parameter for port 0.
- - ceva,p1-comwake-params: OOB timing value for COMWAKE parameter for port 1.
- The fields for the above parameter must be as shown below:
- ceva,pN-comwake-params = /bits/ 8 <CWBGMN CWBGMX CWBGN CWNMP>;
- CWBGMN: COMWAKE Burst Gap Minimum.
- CWBGMX: COMWAKE Burst Gap Maximum.
- CWBGN: COMWAKE Burst Gap Nominal.
- CWNMP: COMWAKE Negate Minimum Period.
- - ceva,p0-burst-params: Burst timing value for COM parameter for port 0.
- - ceva,p1-burst-params: Burst timing value for COM parameter for port 1.
- The fields for the above parameter must be as shown below:
- ceva,pN-burst-params = /bits/ 8 <BMX BNM SFD PTST>;
- BMX: COM Burst Maximum.
- BNM: COM Burst Nominal.
- SFD: Signal Failure Detection value.
- PTST: Partial to Slumber timer value.
- - ceva,p0-retry-params: Retry interval timing value for port 0.
- - ceva,p1-retry-params: Retry interval timing value for port 1.
- The fields for the above parameter must be as shown below:
- ceva,pN-retry-params = /bits/ 16 <RIT RCT>;
- RIT: Retry Interval Timer.
- RCT: Rate Change Timer.
-
-Optional properties:
- - ceva,broken-gen2: limit to gen1 speed instead of gen2.
- - phys: phandle for the PHY device
- - resets: phandle to the reset controller for the SATA IP
-
-Examples:
- ahci@fd0c0000 {
- compatible = "ceva,ahci-1v84";
- reg = <0xfd0c0000 0x200>;
- interrupt-parent = <&gic>;
- interrupts = <0 133 4>;
- clocks = <&clkc SATA_CLK_ID>;
- ceva,p0-cominit-params = /bits/ 8 <0x0F 0x25 0x18 0x29>;
- ceva,p0-comwake-params = /bits/ 8 <0x04 0x0B 0x08 0x0F>;
- ceva,p0-burst-params = /bits/ 8 <0x0A 0x08 0x4A 0x06>;
- ceva,p0-retry-params = /bits/ 16 <0x0216 0x7F06>;
-
- ceva,p1-cominit-params = /bits/ 8 <0x0F 0x25 0x18 0x29>;
- ceva,p1-comwake-params = /bits/ 8 <0x04 0x0B 0x08 0x0F>;
- ceva,p1-burst-params = /bits/ 8 <0x0A 0x08 0x4A 0x06>;
- ceva,p1-retry-params = /bits/ 16 <0x0216 0x7F06>;
- ceva,broken-gen2;
- phys = <&psgtr 1 PHY_TYPE_SATA 1 1>;
- resets = <&zynqmp_reset ZYNQMP_RESET_SATA>;
- };
diff --git a/Documentation/devicetree/bindings/ata/ahci-common.yaml b/Documentation/devicetree/bindings/ata/ahci-common.yaml
new file mode 100644
index 000000000000..7fdf40954a4c
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/ahci-common.yaml
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/ahci-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Common Properties for Serial ATA AHCI controllers
+
+maintainers:
+ - Hans de Goede <hdegoede@redhat.com>
+ - Damien Le Moal <damien.lemoal@opensource.wdc.com>
+
+description:
+ This document defines device tree properties for a common AHCI SATA
+ controller implementation. Its hardware interface is supposed to
+ conform to the technical standard defined by Intel (see the Serial ATA
+ Advanced Host Controller Interface specification for details). The
+ document doesn't constitute a DT-node binding by itself but merely
+ defines a set of common properties for AHCI-compatible devices.
+
+select: false
+
+allOf:
+ - $ref: sata-common.yaml#
+
+properties:
+ reg:
+ description:
+ Generic AHCI registers space conforming to the Serial ATA AHCI
+ specification.
+
+ reg-names:
+ description: CSR space IDs
+ contains:
+ const: ahci
+
+ interrupts:
+ description:
+ Generic AHCI state change interrupt. Can be implemented either as a
+ single line attached to the controller or as a set of signals
+ indicating particular port events.
+ minItems: 1
+ maxItems: 32
+
+ ahci-supply:
+ description: Power regulator for AHCI controller
+
+ target-supply:
+ description: Power regulator for SATA target device
+
+ phy-supply:
+ description: Power regulator for SATA PHY
+
+ phys:
+ description: Reference to the SATA PHY node
+ maxItems: 1
+
+ phy-names:
+ const: sata-phy
+
+ hba-cap:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Bitfield of the HBA generic platform capabilities like Staggered
+ Spin-up or Mechanical Presence Switch support. It can be used to
+ appropriately initialize the HWinit fields of the HBA CAP register
+ in case the system firmware hasn't done it.
+
+ ports-implemented:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Mask that indicates which ports the HBA supports. Useful if PI is not
+ programmed by the BIOS, which is true for some embedded SoCs.
+
+patternProperties:
+ "^sata-port@[0-9a-f]+$":
+ $ref: '#/$defs/ahci-port'
+ description:
+ It is optionally possible to describe the ports as sub-nodes, so as
+ to enable each port independently when dealing with multiple PHYs.
+
+required:
+ - reg
+ - interrupts
+
+additionalProperties: true
+
+$defs:
+ ahci-port:
+ $ref: /schemas/ata/sata-common.yaml#/$defs/sata-port
+
+ properties:
+ reg:
+ description:
+ AHCI SATA port identifier. By design an AHCI controller can't have
+ more than 32 ports due to the CAP.NP field and PI register size
+ constraints.
+ minimum: 0
+ maximum: 31
+
+ phys:
+ description: Individual AHCI SATA port PHY
+ maxItems: 1
+
+ phy-names:
+ description: AHCI SATA port PHY ID
+ const: sata-phy
+
+ target-supply:
+ description: Power regulator for SATA port target device
+
+ hba-port-cap:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Bitfield of the HBA port-specific platform capabilities like hot
+ plugging, eSATA, FIS-based switching, etc. (see the AHCI specification
+ for details). It can be used to initialize the HWinit fields of
+ the PxCMD register in case the system firmware hasn't done it.
+
+ required:
+ - reg
+
+...
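Because this schema is a property library rather than a standalone binding (note select: false), it is only referenced from other schemas; a consumer node drawing on it might be sketched as below, with an assumed compatible and placeholder addresses:

    sata@122f0000 {
        compatible = "snps,dwc-ahci";
        reg = <0x122f0000 0x1000>;
        interrupts = <115>;
        ports-implemented = <0x1>;
        phys = <&sata_phy>;
        phy-names = "sata-phy";
    };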
diff --git a/Documentation/devicetree/bindings/ata/ahci-platform.txt b/Documentation/devicetree/bindings/ata/ahci-platform.txt
deleted file mode 100644
index 77091a277642..000000000000
--- a/Documentation/devicetree/bindings/ata/ahci-platform.txt
+++ /dev/null
@@ -1,79 +0,0 @@
-* AHCI SATA Controller
-
-SATA nodes are defined to describe on-chip Serial ATA controllers.
-Each SATA controller should have its own node.
-
-It is possible, but not required, to represent each port as a sub-node.
-It allows to enable each port independently when dealing with multiple
-PHYs.
-
-Required properties:
-- compatible : compatible string, one of:
- - "brcm,iproc-ahci"
- - "hisilicon,hisi-ahci"
- - "cavium,octeon-7130-ahci"
- - "ibm,476gtr-ahci"
- - "marvell,armada-380-ahci"
- - "marvell,armada-3700-ahci"
- - "snps,dwc-ahci"
- - "snps,spear-ahci"
- - "generic-ahci"
-- interrupts : <interrupt mapping for SATA IRQ>
-- reg : <registers mapping>
-
-Please note that when using "generic-ahci" you must also specify a SoC specific
-compatible:
- compatible = "manufacturer,soc-model-ahci", "generic-ahci";
-
-Optional properties:
-- dma-coherent : Present if dma operations are coherent
-- clocks : a list of phandle + clock specifier pairs
-- resets : a list of phandle + reset specifier pairs
-- target-supply : regulator for SATA target power
-- phy-supply : regulator for PHY power
-- phys : reference to the SATA PHY node
-- phy-names : must be "sata-phy"
-- ahci-supply : regulator for AHCI controller
-- ports-implemented : Mask that indicates which ports that the HBA supports
- are available for software to use. Useful if PORTS_IMPL
- is not programmed by the BIOS, which is true with
- some embedded SOC's.
-
-Required properties when using sub-nodes:
-- #address-cells : number of cells to encode an address
-- #size-cells : number of cells representing the size of an address
-
-Sub-nodes required properties:
-- reg : the port number
-And at least one of the following properties:
-- phys : reference to the SATA PHY node
-- target-supply : regulator for SATA target power
-
-Examples:
- sata@ffe08000 {
- compatible = "snps,spear-ahci";
- reg = <0xffe08000 0x1000>;
- interrupts = <115>;
- };
-
-With sub-nodes:
- sata@f7e90000 {
- compatible = "marvell,berlin2q-achi", "generic-ahci";
- reg = <0xe90000 0x1000>;
- interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&chip CLKID_SATA>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- sata0: sata-port@0 {
- reg = <0>;
- phys = <&sata_phy 0>;
- target-supply = <&reg_sata0>;
- };
-
- sata1: sata-port@1 {
- reg = <1>;
- phys = <&sata_phy 1>;
- target-supply = <&reg_sata1>;;
- };
- };
diff --git a/Documentation/devicetree/bindings/ata/ahci-platform.yaml b/Documentation/devicetree/bindings/ata/ahci-platform.yaml
new file mode 100644
index 000000000000..358617115bb8
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/ahci-platform.yaml
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/ahci-platform.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AHCI SATA Controller
+
+description: |
+ SATA nodes are defined to describe on-chip Serial ATA controllers.
+ Each SATA controller should have its own node.
+
+ It is possible, but not required, to represent each port as a sub-node.
+ This allows each port to be enabled independently when dealing with
+ multiple PHYs.
+
+maintainers:
+ - Hans de Goede <hdegoede@redhat.com>
+ - Jens Axboe <axboe@kernel.dk>
+
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - brcm,iproc-ahci
+ - cavium,octeon-7130-ahci
+ - hisilicon,hisi-ahci
+ - ibm,476gtr-ahci
+ - marvell,armada-3700-ahci
+ - marvell,armada-8k-ahci
+ - marvell,berlin2q-ahci
+ - socionext,uniphier-pro4-ahci
+ - socionext,uniphier-pxs2-ahci
+ - socionext,uniphier-pxs3-ahci
+ required:
+ - compatible
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - brcm,iproc-ahci
+ - marvell,armada-8k-ahci
+ - marvell,berlin2-ahci
+ - marvell,berlin2q-ahci
+ - socionext,uniphier-pro4-ahci
+ - socionext,uniphier-pxs2-ahci
+ - socionext,uniphier-pxs3-ahci
+ - const: generic-ahci
+ - enum:
+ - cavium,octeon-7130-ahci
+ - hisilicon,hisi-ahci
+ - ibm,476gtr-ahci
+ - marvell,armada-3700-ahci
+
+ reg:
+ minItems: 1
+ maxItems: 2
+
+ reg-names:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 3
+
+ clock-names:
+ minItems: 1
+ maxItems: 3
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ resets:
+ minItems: 1
+ maxItems: 3
+
+patternProperties:
+ "^sata-port@[0-9a-f]+$":
+ $ref: /schemas/ata/ahci-common.yaml#/$defs/ahci-port
+
+ anyOf:
+ - required: [ phys ]
+ - required: [ target-supply ]
+
+ unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+allOf:
+ - $ref: ahci-common.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: socionext,uniphier-pro4-ahci
+ then:
+ properties:
+ resets:
+ items:
+ - description: reset line for the parent
+ - description: reset line for the glue logic
+ - description: reset line for the controller
+ required:
+ - resets
+ else:
+ if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - socionext,uniphier-pxs2-ahci
+ - socionext,uniphier-pxs3-ahci
+ then:
+ properties:
+ resets:
+ items:
+ - description: reset for the glue logic
+ - description: reset for the controller
+ required:
+ - resets
+ else:
+ properties:
+ resets:
+ maxItems: 1
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ sata@ffe08000 {
+ compatible = "snps,spear-ahci";
+ reg = <0xffe08000 0x1000>;
+ interrupts = <115>;
+ };
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/berlin2q.h>
+ #include <dt-bindings/ata/ahci.h>
+
+ sata@f7e90000 {
+ compatible = "marvell,berlin2q-ahci", "generic-ahci";
+ reg = <0xf7e90000 0x1000>;
+ interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&chip CLKID_SATA>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ hba-cap = <HBA_SMPS>;
+
+ sata0: sata-port@0 {
+ reg = <0>;
+
+ phys = <&sata_phy 0>;
+ target-supply = <&reg_sata0>;
+
+ hba-port-cap = <(HBA_PORT_FBSCP | HBA_PORT_ESP)>;
+ };
+
+ sata1: sata-port@1 {
+ reg = <1>;
+
+ phys = <&sata_phy 1>;
+ target-supply = <&reg_sata1>;
+
+ hba-port-cap = <(HBA_PORT_HPCP | HBA_PORT_MPSP | HBA_PORT_FBSCP)>;
+ };
+ };
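For the UniPhier variants constrained by the allOf block above, the three-reset form required for socionext,uniphier-pro4-ahci would be wired roughly as in this sketch; the clock and reset phandles and specifiers are assumed:

    sata@65600000 {
        compatible = "socionext,uniphier-pro4-ahci", "generic-ahci";
        reg = <0x65600000 0x10000>;
        interrupts = <GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&sys_clk 12>;
        resets = <&sys_rst 12>, <&ahci_rst 0>, <&ahci_rst 1>;
    };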
diff --git a/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml b/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml
index cb530b46beff..2011bd03cdcd 100644
--- a/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml
+++ b/Documentation/devicetree/bindings/ata/allwinner,sun4i-a10-ahci.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/ata/allwinner,sun4i-a10-ahci.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 AHCI SATA Controller bindings
+title: Allwinner A10 AHCI SATA Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml b/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml
index e6b42a113ff1..a2afe2ad6063 100644
--- a/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml
+++ b/Documentation/devicetree/bindings/ata/allwinner,sun8i-r40-ahci.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/ata/allwinner,sun8i-r40-ahci.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner R40 AHCI SATA Controller bindings
+title: Allwinner R40 AHCI SATA Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/ata/ata-generic.yaml b/Documentation/devicetree/bindings/ata/ata-generic.yaml
new file mode 100644
index 000000000000..0697927f3d7e
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/ata-generic.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/ata-generic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Generic Parallel ATA Controller
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description:
+ Generic Parallel ATA controllers supporting PIO modes only.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - arm,vexpress-cf
+ - fsl,mpc8349emitx-pata
+ - const: ata-generic
+
+ reg:
+ items:
+ - description: Command interface registers
+ - description: Control interface registers
+
+ reg-shift:
+ enum: [ 1, 2 ]
+
+ interrupts:
+ maxItems: 1
+
+ ata-generic,use16bit:
+ type: boolean
+ description: Use 16-bit accesses instead of 32-bit for data transfers
+
+ pio-mode:
+ description: Maximum ATA PIO transfer mode
+ $ref: /schemas/types.yaml#/definitions/uint32
+ maximum: 6
+ default: 0
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ compact-flash@1a000 {
+ compatible = "arm,vexpress-cf", "ata-generic";
+ reg = <0x1a000 0x100>,
+ <0x1a100 0xf00>;
+ reg-shift = <2>;
+ };
+...
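A hedged sketch of the MPC8349E-mITX flavour exercising the optional properties; the addresses and interrupt number are assumed for illustration:

    pata@2000 {
        compatible = "fsl,mpc8349emitx-pata", "ata-generic";
        reg = <0x2000 0x100>,
              <0x2100 0x100>;
        reg-shift = <1>;
        interrupts = <23>;
        ata-generic,use16bit;
        pio-mode = <4>;
    };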
diff --git a/Documentation/devicetree/bindings/ata/baikal,bt1-ahci.yaml b/Documentation/devicetree/bindings/ata/baikal,bt1-ahci.yaml
new file mode 100644
index 000000000000..9b7ca4759bd7
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/baikal,bt1-ahci.yaml
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/baikal,bt1-ahci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Baikal-T1 SoC AHCI SATA controller
+
+maintainers:
+ - Serge Semin <fancer.lancer@gmail.com>
+
+description:
+ The AHCI SATA controller embedded into the Baikal-T1 SoC is based on
+ the DWC AHCI SATA v4.10a IP-core.
+
+allOf:
+ - $ref: snps,dwc-ahci-common.yaml#
+
+properties:
+ compatible:
+ const: baikal,bt1-ahci
+
+ clocks:
+ items:
+ - description: Peripheral APB bus clock
+ - description: Application AXI BIU clock
+ - description: SATA Ports reference clock
+
+ clock-names:
+ items:
+ - const: pclk
+ - const: aclk
+ - const: ref
+
+ resets:
+ items:
+ - description: Application AXI BIU domain reset
+ - description: SATA Ports clock domain reset
+
+ reset-names:
+ items:
+ - const: arst
+ - const: ref
+
+ ports-implemented:
+ maximum: 0x3
+
+patternProperties:
+ "^sata-port@[0-1]$":
+ $ref: /schemas/ata/snps,dwc-ahci-common.yaml#/$defs/dwc-ahci-port
+
+ properties:
+ reg:
+ minimum: 0
+ maximum: 1
+
+ snps,tx-ts-max:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Since the AXI3 bus interface is utilized, the maximum Tx DMA
+ transaction size can't exceed 16 beats (AxLEN[3:0]).
+ enum: [ 1, 2, 4, 8, 16 ]
+
+ snps,rx-ts-max:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Since the AXI3 bus interface is utilized, the maximum Rx DMA
+ transaction size can't exceed 16 beats (AxLEN[3:0]).
+ enum: [ 1, 2, 4, 8, 16 ]
+
+ unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ sata@1f050000 {
+ compatible = "baikal,bt1-ahci";
+ reg = <0x1f050000 0x2000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ interrupts = <0 64 4>;
+
+ clocks = <&ccu_sys 1>, <&ccu_axi 2>, <&sata_ref_clk>;
+ clock-names = "pclk", "aclk", "ref";
+
+ resets = <&ccu_axi 2>, <&ccu_sys 0>;
+ reset-names = "arst", "ref";
+
+ ports-implemented = <0x3>;
+
+ sata-port@0 {
+ reg = <0>;
+
+ snps,tx-ts-max = <4>;
+ snps,rx-ts-max = <4>;
+ };
+
+ sata-port@1 {
+ reg = <1>;
+
+ snps,tx-ts-max = <4>;
+ snps,rx-ts-max = <4>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt
deleted file mode 100644
index b9ae4ce4a0a0..000000000000
--- a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-* Broadcom SATA3 AHCI Controller
-
-SATA nodes are defined to describe on-chip Serial ATA controllers.
-Each SATA controller should have its own node.
-
-Required properties:
-- compatible : should be one or more of
- "brcm,bcm7216-ahci"
- "brcm,bcm7425-ahci"
- "brcm,bcm7445-ahci"
- "brcm,bcm-nsp-ahci"
- "brcm,sata3-ahci"
- "brcm,bcm63138-ahci"
-- reg : register mappings for AHCI and SATA_TOP_CTRL
-- reg-names : "ahci" and "top-ctrl"
-- interrupts : interrupt mapping for SATA IRQ
-
-Optional properties:
-
-- reset: for "brcm,bcm7216-ahci" must be a valid reset phandle
- pointing to the RESCAL reset controller provider node.
-- reset-names: for "brcm,bcm7216-ahci", must be "rescal".
-
-Also see ahci-platform.txt.
-
-Example:
-
- sata@f045a000 {
- compatible = "brcm,bcm7445-ahci", "brcm,sata3-ahci";
- reg = <0xf045a000 0xa9c>, <0xf0458040 0x24>;
- reg-names = "ahci", "top-ctrl";
- interrupts = <0 30 0>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- sata0: sata-port@0 {
- reg = <0>;
- phys = <&sata_phy 0>;
- };
-
- sata1: sata-port@1 {
- reg = <1>;
- phys = <&sata_phy 1>;
- };
- };
diff --git a/Documentation/devicetree/bindings/ata/brcm,sata-brcm.yaml b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.yaml
new file mode 100644
index 000000000000..fe7f091e744f
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/brcm,sata-brcm.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/brcm,sata-brcm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom SATA3 AHCI Controller
+
+description:
+ SATA nodes are defined to describe on-chip Serial ATA controllers.
+ Each SATA controller should have its own node.
+
+maintainers:
+ - Florian Fainelli <f.fainelli@gmail.com>
+
+allOf:
+ - $ref: ahci-common.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - brcm,bcm7216-ahci
+ - brcm,bcm7445-ahci
+ - brcm,bcm7425-ahci
+ - brcm,bcm63138-ahci
+ - const: brcm,sata3-ahci
+ - items:
+ - const: brcm,bcm-nsp-ahci
+
+ reg:
+ maxItems: 2
+
+ reg-names:
+ items:
+ - const: ahci
+ - const: top-ctrl
+
+ interrupts:
+ maxItems: 1
+
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - brcm,bcm7216-ahci
+ - brcm,bcm63138-ahci
+then:
+ properties:
+ resets:
+ maxItems: 1
+ reset-names:
+ enum:
+ - rescal
+ - ahci
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - "#address-cells"
+ - "#size-cells"
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ sata@f045a000 {
+ compatible = "brcm,bcm7445-ahci", "brcm,sata3-ahci";
+ reg = <0xf045a000 0xa9c>, <0xf0458040 0x24>;
+ reg-names = "ahci", "top-ctrl";
+ interrupts = <0 30 0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sata0: sata-port@0 {
+ reg = <0>;
+ phys = <&sata_phy 0>;
+ };
+
+ sata1: sata-port@1 {
+ reg = <1>;
+ phys = <&sata_phy 1>;
+ };
+ };
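For the BCM7216 case matched by the if/then clause above, the RESCAL reset would be referenced roughly as below; the &rescal phandle is an assumed name:

    sata@f045a000 {
        compatible = "brcm,bcm7216-ahci", "brcm,sata3-ahci";
        reg = <0xf045a000 0xa9c>, <0xf0458040 0x24>;
        reg-names = "ahci", "top-ctrl";
        interrupts = <0 30 0>;
        resets = <&rescal>;
        reset-names = "rescal";
        #address-cells = <1>;
        #size-cells = <0>;
    };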
diff --git a/Documentation/devicetree/bindings/ata/ceva,ahci-1v84.yaml b/Documentation/devicetree/bindings/ata/ceva,ahci-1v84.yaml
new file mode 100644
index 000000000000..9b31f864e071
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/ceva,ahci-1v84.yaml
@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/ceva,ahci-1v84.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ceva AHCI SATA Controller
+
+maintainers:
+ - Piyush Mehta <piyush.mehta@xilinx.com>
+
+description: |
+ The Ceva SATA controller mostly conforms to the AHCI interface, with
+ some special extensions to add functionality. It is a high-performance
+ dual-port SATA host controller with an AHCI-compliant command layer
+ which supports advanced features such as native command queuing and
+ frame information structure (FIS) based switching for systems employing
+ port multipliers.
+
+properties:
+ compatible:
+ const: ceva,ahci-1v84
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ dma-coherent: true
+
+ interrupts:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ ceva,p0-cominit-params:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: |
+ OOB timing value for COMINIT parameter for port 0.
+ The fields for the above parameter must be as shown below:-
+ ceva,p0-cominit-params = /bits/ 8 <CIBGMN CIBGMX CIBGN CINMP>;
+ items:
+ - description: CINMP - COMINIT Negate Minimum Period.
+ - description: CIBGN - COMINIT Burst Gap Nominal.
+ - description: CIBGMX - COMINIT Burst Gap Maximum.
+ - description: CIBGMN - COMINIT Burst Gap Minimum.
+
+ ceva,p0-comwake-params:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: |
+ OOB timing value for COMWAKE parameter for port 0.
+ The fields for the above parameter must be as shown below:-
+ ceva,p0-comwake-params = /bits/ 8 <CWBGMN CWBGMX CWBGN CWNMP>;
+ items:
+ - description: CWBGMN - COMWAKE Burst Gap Minimum.
+ - description: CWBGMX - COMWAKE Burst Gap Maximum.
+ - description: CWBGN - COMWAKE Burst Gap Nominal.
+ - description: CWNMP - COMWAKE Negate Minimum Period.
+
+ ceva,p0-burst-params:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: |
+ Burst timing value for COM parameter for port 0.
+ The fields for the above parameter must be as shown below:-
+ ceva,p0-burst-params = /bits/ 8 <BMX BNM SFD PTST>;
+ items:
+ - description: BMX - COM Burst Maximum.
+ - description: BNM - COM Burst Nominal.
+ - description: SFD - Signal Failure Detection value.
+ - description: PTST - Partial to Slumber timer value.
+
+ ceva,p0-retry-params:
+ $ref: /schemas/types.yaml#/definitions/uint16-array
+ description: |
+ Retry interval timing value for port 0.
+ The fields for the above parameter must be as shown below:-
+ ceva,p0-retry-params = /bits/ 16 <RIT RCT>;
+ items:
+ - description: RIT - Retry Interval Timer.
+ - description: RCT - Rate Change Timer.
+
+ ceva,p1-cominit-params:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: |
+ OOB timing value for COMINIT parameter for port 1.
+ The fields for the above parameter must be as shown below:-
+ ceva,p1-cominit-params = /bits/ 8 <CIBGMN CIBGMX CIBGN CINMP>;
+ items:
+ - description: CINMP - COMINIT Negate Minimum Period.
+ - description: CIBGN - COMINIT Burst Gap Nominal.
+ - description: CIBGMX - COMINIT Burst Gap Maximum.
+ - description: CIBGMN - COMINIT Burst Gap Minimum.
+
+ ceva,p1-comwake-params:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: |
+ OOB timing value for COMWAKE parameter for port 1.
+ The fields for the above parameter must be as shown below:-
+ ceva,p1-comwake-params = /bits/ 8 <CWBGMN CWBGMX CWBGN CWNMP>;
+ items:
+ - description: CWBGMN - COMWAKE Burst Gap Minimum.
+ - description: CWBGMX - COMWAKE Burst Gap Maximum.
+ - description: CWBGN - COMWAKE Burst Gap Nominal.
+ - description: CWNMP - COMWAKE Negate Minimum Period.
+
+ ceva,p1-burst-params:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: |
+ Burst timing value for COM parameter for port 1.
+ The fields for the above parameter must be as shown below:-
+ ceva,p1-burst-params = /bits/ 8 <BMX BNM SFD PTST>;
+ items:
+ - description: BMX - COM Burst Maximum.
+ - description: BNM - COM Burst Nominal.
+ - description: SFD - Signal Failure Detection value.
+ - description: PTST - Partial to Slumber timer value.
+
+ ceva,p1-retry-params:
+ $ref: /schemas/types.yaml#/definitions/uint16-array
+ description: |
+ Retry interval timing value for port 1.
+ The fields for the above parameter must be as shown below:-
+ ceva,pN-retry-params = /bits/ 16 <RIT RCT>;
+ items:
+ - description: RIT - Retry Interval Timer.
+ - description: RCT - Rate Change Timer.
+
+ ceva,broken-gen2:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description: |
+ Limit to Gen 1 speed instead of Gen 2.
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ items:
+ - const: sata-phy
+
+ resets:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - interrupts
+ - ceva,p0-cominit-params
+ - ceva,p0-comwake-params
+ - ceva,p0-burst-params
+ - ceva,p0-retry-params
+ - ceva,p1-cominit-params
+ - ceva,p1-comwake-params
+ - ceva,p1-burst-params
+ - ceva,p1-retry-params
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/xlnx-zynqmp-clk.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/power/xlnx-zynqmp-power.h>
+ #include <dt-bindings/reset/xlnx-zynqmp-resets.h>
+ #include <dt-bindings/phy/phy.h>
+
+ sata: ahci@fd0c0000 {
+ compatible = "ceva,ahci-1v84";
+ reg = <0xfd0c0000 0x200>;
+ interrupt-parent = <&gic>;
+ interrupts = <0 133 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&zynqmp_clk SATA_REF>;
+ ceva,p0-cominit-params = /bits/ 8 <0x0F 0x25 0x18 0x29>;
+ ceva,p0-comwake-params = /bits/ 8 <0x04 0x0B 0x08 0x0F>;
+ ceva,p0-burst-params = /bits/ 8 <0x0A 0x08 0x4A 0x06>;
+ ceva,p0-retry-params = /bits/ 16 <0x0216 0x7F06>;
+ ceva,p1-cominit-params = /bits/ 8 <0x0F 0x25 0x18 0x29>;
+ ceva,p1-comwake-params = /bits/ 8 <0x04 0x0B 0x08 0x0F>;
+ ceva,p1-burst-params = /bits/ 8 <0x0A 0x08 0x4A 0x06>;
+ ceva,p1-retry-params = /bits/ 16 <0x0216 0x7F06>;
+ ceva,broken-gen2;
+ phys = <&psgtr 1 PHY_TYPE_SATA 1 1>;
+ resets = <&zynqmp_reset ZYNQMP_RESET_SATA>;
+ };
diff --git a/Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.txt b/Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.txt
deleted file mode 100644
index 1c3d3cc70051..000000000000
--- a/Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-* Cortina Systems Gemini SATA Bridge
-
-The Gemini SATA bridge in a SoC-internal PATA to SATA bridge that
-takes two Faraday Technology FTIDE010 PATA controllers and bridges
-them in different configurations to two SATA ports.
-
-Required properties:
-- compatible: should be
- "cortina,gemini-sata-bridge"
-- reg: registers and size for the block
-- resets: phandles to the reset lines for both SATA bridges
-- reset-names: must be "sata0", "sata1"
-- clocks: phandles to the compulsory peripheral clocks
-- clock-names: must be "SATA0_PCLK", "SATA1_PCLK"
-- syscon: a phandle to the global Gemini system controller
-- cortina,gemini-ata-muxmode: tell the desired multiplexing mode for
- the ATA controller and SATA bridges. Values 0..3:
- Mode 0: ata0 master <-> sata0
- ata1 master <-> sata1
- ata0 slave interface brought out on IDE pads
- Mode 1: ata0 master <-> sata0
- ata1 master <-> sata1
- ata1 slave interface brought out on IDE pads
- Mode 2: ata1 master <-> sata1
- ata1 slave <-> sata0
- ata0 master and slave interfaces brought out
- on IDE pads
- Mode 3: ata0 master <-> sata0
- ata0 slave <-> sata1
- ata1 master and slave interfaces brought out
- on IDE pads
-
-Optional boolean properties:
-- cortina,gemini-enable-ide-pins: enables the PATA to IDE connection.
- The muxmode setting decides whether ATA0 or ATA1 is brought out,
- and whether master, slave or both interfaces get brought out.
-- cortina,gemini-enable-sata-bridge: enables the PATA to SATA bridge
- inside the Gemnini SoC. The Muxmode decides what PATA blocks will
- be muxed out and how.
-
-Example:
-
-sata: sata@46000000 {
- compatible = "cortina,gemini-sata-bridge";
- reg = <0x46000000 0x100>;
- resets = <&rcon 26>, <&rcon 27>;
- reset-names = "sata0", "sata1";
- clocks = <&gcc GEMINI_CLK_GATE_SATA0>,
- <&gcc GEMINI_CLK_GATE_SATA1>;
- clock-names = "SATA0_PCLK", "SATA1_PCLK";
- syscon = <&syscon>;
- cortina,gemini-ata-muxmode = <3>;
- cortina,gemini-enable-ide-pins;
- cortina,gemini-enable-sata-bridge;
-};
diff --git a/Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.yaml b/Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.yaml
new file mode 100644
index 000000000000..529093666508
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/cortina,gemini-sata-bridge.yaml
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/cortina,gemini-sata-bridge.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Cortina Systems Gemini SATA Bridge
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description: |
+ The Gemini SATA bridge is a SoC-internal PATA to SATA bridge that
+ takes two Faraday Technology FTIDE010 PATA controllers and bridges
+ them in different configurations to two SATA ports.
+
+properties:
+ compatible:
+ const: cortina,gemini-sata-bridge
+
+ reg:
+ maxItems: 1
+
+ resets:
+ maxItems: 2
+ description: phandles to the reset lines for both SATA bridges
+
+ reset-names:
+ items:
+ - const: sata0
+ - const: sata1
+
+ clocks:
+ maxItems: 2
+ description: phandles to the compulsory peripheral clocks
+
+ clock-names:
+ items:
+ - const: SATA0_PCLK
+ - const: SATA1_PCLK
+
+ syscon:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: a phandle to the global Gemini system controller
+
+ cortina,gemini-ata-muxmode:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum:
+ - 0
+ - 1
+ - 2
+ - 3
+ description: |
+ Set the desired multiplexing mode for the ATA controller and SATA
+ bridges.
+ Mode 0: ata0 master <-> sata0
+ ata1 master <-> sata1
+ ata0 slave interface brought out on IDE pads
+ Mode 1: ata0 master <-> sata0
+ ata1 master <-> sata1
+ ata1 slave interface brought out on IDE pads
+ Mode 2: ata1 master <-> sata1
+ ata1 slave <-> sata0
+ ata0 master and slave interfaces brought out on IDE pads
+ Mode 3: ata0 master <-> sata0
+ ata0 slave <-> sata1
+ ata1 master and slave interfaces brought out on IDE pads
+
+ cortina,gemini-enable-ide-pins:
+ type: boolean
+ description: Enables the PATA to IDE connection.
+ The muxmode setting decides whether ATA0 or ATA1 is brought out,
+ and whether master, slave or both interfaces get brought out.
+
+ cortina,gemini-enable-sata-bridge:
+ type: boolean
+ description: Enables the PATA to SATA bridge inside the Gemini SoC.
+ The muxmode decides which PATA blocks will be muxed out and how.
+
+required:
+ - clocks
+ - clock-names
+ - cortina,gemini-ata-muxmode
+ - resets
+ - reset-names
+ - compatible
+ - reg
+ - syscon
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/cortina,gemini-clock.h>
+ sata@46000000 {
+ compatible = "cortina,gemini-sata-bridge";
+ reg = <0x46000000 0x100>;
+ resets = <&rcon 26>, <&rcon 27>;
+ reset-names = "sata0", "sata1";
+ clocks = <&gcc GEMINI_CLK_GATE_SATA0>,
+ <&gcc GEMINI_CLK_GATE_SATA1>;
+ clock-names = "SATA0_PCLK", "SATA1_PCLK";
+ syscon = <&syscon>;
+ cortina,gemini-ata-muxmode = <3>;
+ cortina,gemini-enable-ide-pins;
+ cortina,gemini-enable-sata-bridge;
+ };
diff --git a/Documentation/devicetree/bindings/ata/intel,ixp4xx-compact-flash.yaml b/Documentation/devicetree/bindings/ata/intel,ixp4xx-compact-flash.yaml
index 52e18600ecff..378692010c56 100644
--- a/Documentation/devicetree/bindings/ata/intel,ixp4xx-compact-flash.yaml
+++ b/Documentation/devicetree/bindings/ata/intel,ixp4xx-compact-flash.yaml
@@ -35,6 +35,7 @@ required:
allOf:
- $ref: pata-common.yaml#
+ - $ref: /schemas/memory-controllers/intel,ixp4xx-expansion-peripheral-props.yaml#
unevaluatedProperties: false
diff --git a/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml b/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
index c060c7914cae..fe0909554790 100644
--- a/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
+++ b/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/ata/renesas,rcar-sata.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/ata/renesas,rcar-sata.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas R-Car Serial-ATA Interface
@@ -26,6 +26,7 @@ properties:
- items:
- enum:
- renesas,sata-r8a774b1 # RZ/G2N
+ - renesas,sata-r8a774e1 # RZ/G2H
- renesas,sata-r8a7795 # R-Car H3
- renesas,sata-r8a77965 # R-Car M3-N
- const: renesas,rcar-gen3-sata # generic R-Car Gen3 or RZ/G2
diff --git a/Documentation/devicetree/bindings/ata/sata-common.yaml b/Documentation/devicetree/bindings/ata/sata-common.yaml
index 7ac77b1c5850..58c9342b9925 100644
--- a/Documentation/devicetree/bindings/ata/sata-common.yaml
+++ b/Documentation/devicetree/bindings/ata/sata-common.yaml
@@ -31,22 +31,27 @@ properties:
"#size-cells":
const: 0
+ dma-coherent: true
+
patternProperties:
"^sata-port@[0-9a-e]$":
+ $ref: '#/$defs/sata-port'
description: |
DT nodes for ports connected on the SATA host. The SATA port
nodes will be named "sata-port".
+
+additionalProperties: true
+
+$defs:
+ sata-port:
type: object
properties:
reg:
minimum: 0
- maximum: 14
description:
- The ID number of the drive port SATA can potentially use a port
- multiplier making it possible to connect up to 15 disks to a single
- SATA port.
-
-additionalProperties: true
+ The ID number of the SATA port. Aside from being used directly,
+ each port can have a Port Multiplier attached, thus allowing
+ access to more than one drive by means of a single SATA port.
...
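
For illustration, a minimal sketch of how a host controller binding that
references sata-common.yaml can describe its ports; the compatible string,
unit addresses and sizes below are assumptions, only the sata-port children
and their reg values follow the schema:

	sata@40000000 {
		compatible = "vendor,example-ahci"; /* hypothetical compatible */
		reg = <0x40000000 0x1000>;
		#address-cells = <1>;
		#size-cells = <0>;

		sata-port@0 {
			reg = <0>; /* drive attached directly */
		};

		sata-port@1 {
			reg = <1>; /* a Port Multiplier may sit behind this port */
		};
	};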
diff --git a/Documentation/devicetree/bindings/ata/sata_highbank.yaml b/Documentation/devicetree/bindings/ata/sata_highbank.yaml
index ce75d77e9289..f23f26a8f21c 100644
--- a/Documentation/devicetree/bindings/ata/sata_highbank.yaml
+++ b/Documentation/devicetree/bindings/ata/sata_highbank.yaml
@@ -51,6 +51,8 @@ properties:
$ref: /schemas/types.yaml#/definitions/phandle-array
minItems: 1
maxItems: 8
+ items:
+ maxItems: 2
calxeda,tx-atten:
description: |
diff --git a/Documentation/devicetree/bindings/ata/snps,dwc-ahci-common.yaml b/Documentation/devicetree/bindings/ata/snps,dwc-ahci-common.yaml
new file mode 100644
index 000000000000..c1457910520b
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/snps,dwc-ahci-common.yaml
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/snps,dwc-ahci-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Synopsys DWC AHCI SATA controller properties
+
+maintainers:
+ - Serge Semin <fancer.lancer@gmail.com>
+
+description:
+ This document defines device tree schema for the generic Synopsys DWC
+ AHCI controller properties.
+
+select: false
+
+allOf:
+ - $ref: ahci-common.yaml#
+
+properties:
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ description:
+ Basic DWC AHCI SATA clock sources like application AXI/AHB BIU clock,
+ PM-alive clock, RxOOB detection clock, embedded PHYs reference (Rx/Tx)
+ clock, etc.
+ minItems: 1
+ maxItems: 4
+
+ clock-names:
+ minItems: 1
+ maxItems: 4
+ items:
+ oneOf:
+ - description: Application APB/AHB/AXI BIU clock
+ enum:
+ - pclk
+ - aclk
+ - hclk
+ - sata
+ - description: Power Module keep-alive clock
+ const: pmalive
+ - description: RxOOB detection clock
+ const: rxoob
+ - description: SATA Ports reference clock
+ const: ref
+
+ resets:
+ description:
+ At least the basic application and reference clock domain resets are
+ normally supported by the DWC AHCI SATA controller.
+ minItems: 1
+ maxItems: 4
+
+ reset-names:
+ minItems: 1
+ maxItems: 4
+ items:
+ oneOf:
+ - description: Application AHB/AXI BIU clock domain reset control
+ enum:
+ - arst
+ - hrst
+ - description: Power Module keep-alive clock domain reset control
+ const: pmalive
+ - description: RxOOB detection clock domain reset control
+ const: rxoob
+ - description: Reference clock domain reset control
+ const: ref
+
+patternProperties:
+ "^sata-port@[0-9a-e]$":
+ $ref: '#/$defs/dwc-ahci-port'
+
+additionalProperties: true
+
+$defs:
+ dwc-ahci-port:
+ $ref: /schemas/ata/ahci-common.yaml#/$defs/ahci-port
+
+ properties:
+ reg:
+ minimum: 0
+ maximum: 7
+
+ snps,tx-ts-max:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Maximum size of Tx DMA transactions in FIFO words
+ enum: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+
+ snps,rx-ts-max:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Maximum size of Rx DMA transactions in FIFO words
+ enum: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+
+...
diff --git a/Documentation/devicetree/bindings/ata/snps,dwc-ahci.yaml b/Documentation/devicetree/bindings/ata/snps,dwc-ahci.yaml
new file mode 100644
index 000000000000..5afa4b57ce20
--- /dev/null
+++ b/Documentation/devicetree/bindings/ata/snps,dwc-ahci.yaml
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/ata/snps,dwc-ahci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Synopsys DWC AHCI SATA controller
+
+maintainers:
+ - Serge Semin <fancer.lancer@gmail.com>
+
+description:
+ This document defines device tree bindings for the generic Synopsys DWC
+ implementation of the AHCI SATA controller.
+
+allOf:
+ - $ref: snps,dwc-ahci-common.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - description: Synopsys AHCI SATA-compatible devices
+ const: snps,dwc-ahci
+ - description: SPEAr1340 AHCI SATA device
+ const: snps,spear-ahci
+ - description: Rockchip RK3568 AHCI controller
+ items:
+ - const: rockchip,rk3568-dwc-ahci
+ - const: snps,dwc-ahci
+
+patternProperties:
+ "^sata-port@[0-9a-e]$":
+ $ref: /schemas/ata/snps,dwc-ahci-common.yaml#/$defs/dwc-ahci-port
+
+ unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/ata/ahci.h>
+
+ sata@122f0000 {
+ compatible = "snps,dwc-ahci";
+ reg = <0x122F0000 0x1ff>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>;
+
+ clocks = <&clock1>, <&clock2>;
+ clock-names = "aclk", "ref";
+
+ phys = <&sata_phy>;
+ phy-names = "sata-phy";
+
+ ports-implemented = <0x1>;
+
+ sata-port@0 {
+ reg = <0>;
+
+ hba-port-cap = <HBA_PORT_FBSCP>;
+
+ snps,tx-ts-max = <512>;
+ snps,rx-ts-max = <512>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/auxdisplay/holtek,ht16k33.yaml b/Documentation/devicetree/bindings/auxdisplay/holtek,ht16k33.yaml
index 64ffff460026..49304a1476ab 100644
--- a/Documentation/devicetree/bindings/auxdisplay/holtek,ht16k33.yaml
+++ b/Documentation/devicetree/bindings/auxdisplay/holtek,ht16k33.yaml
@@ -10,18 +10,25 @@ maintainers:
- Robin van der Gracht <robin@protonic.nl>
allOf:
- - $ref: "/schemas/input/matrix-keymap.yaml#"
+ - $ref: /schemas/input/matrix-keymap.yaml#
properties:
compatible:
- const: holtek,ht16k33
+ oneOf:
+ - items:
+ - enum:
+ - adafruit,3108 # 0.56" 4-Digit 7-Segment FeatherWing Display (Red)
+ - adafruit,3130 # 0.54" Quad Alphanumeric FeatherWing Display (Red)
+ - const: holtek,ht16k33
+
+ - const: holtek,ht16k33 # Generic 16*8 LED controller with dot-matrix display
reg:
maxItems: 1
refresh-rate-hz:
maxItems: 1
- description: Display update interval in Hertz
+ description: Display update interval in Hertz for dot-matrix displays
interrupts:
maxItems: 1
@@ -41,10 +48,22 @@ properties:
default: 16
description: Initial brightness level
+ led:
+ type: object
+ $ref: /schemas/leds/common.yaml#
+ unevaluatedProperties: false
+
required:
- compatible
- reg
- - refresh-rate-hz
+
+if:
+ properties:
+ compatible:
+ const: holtek,ht16k33
+then:
+ required:
+ - refresh-rate-hz
additionalProperties: false
@@ -52,7 +71,8 @@ examples:
- |
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/input/input.h>
- i2c1 {
+ #include <dt-bindings/leds/common.h>
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
@@ -73,5 +93,11 @@ examples:
<MATRIX_KEY(4, 1, KEY_F9)>,
<MATRIX_KEY(5, 1, KEY_F3)>,
<MATRIX_KEY(6, 1, KEY_F1)>;
+
+ led {
+ color = <LED_COLOR_ID_RED>;
+ function = LED_FUNCTION_BACKLIGHT;
+ linux,default-trigger = "backlight";
+ };
};
};
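
For contrast with the dot-matrix example above, a minimal sketch of one of
the newly added seven-segment variants, for which refresh-rate-hz is no
longer required; the I2C address is an assumption:

	display@70 {
		compatible = "adafruit,3108", "holtek,ht16k33";
		reg = <0x70>;
		/* refresh-rate-hz is only required for the generic
		 * "holtek,ht16k33" dot-matrix compatible */
	};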
diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml
index 863a287ebc7e..9845a187bdf6 100644
--- a/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml
+++ b/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/bus/allwinner,sun50i-a64-de2.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A64 Display Engine Bus Device Tree Bindings
+title: Allwinner A64 Display Engine Bus
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -35,7 +35,10 @@ properties:
The SRAM that needs to be claimed to access the display engine
bus.
$ref: /schemas/types.yaml#/definitions/phandle-array
- maxItems: 1
+ items:
+ - items:
+ - description: phandle to SRAM
+ - description: register value for device
ranges: true
@@ -43,6 +46,7 @@ patternProperties:
# All other properties should be child nodes with unit-address and 'reg'
"^[a-zA-Z][a-zA-Z0-9,+\\-._]{0,63}@[0-9a-fA-F]+$":
type: object
+ additionalProperties: true
properties:
reg:
maxItems: 1
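
With the phandle-array form introduced above, each allwinner,sram entry now
pairs the SRAM phandle with a register value for the device, e.g. (a sketch;
the phandle name and value are assumptions):

	/* inside the display engine bus node */
	allwinner,sram = <&de2_sram 1>;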
diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
index 3d719f468a5b..24c939f59091 100644
--- a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
+++ b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/bus/allwinner,sun8i-a23-rsb.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A23 RSB Device Tree Bindings
+title: Allwinner A23 RSB
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -45,6 +45,7 @@ properties:
patternProperties:
"^.*@[0-9a-fA-F]+$":
type: object
+ additionalProperties: true
properties:
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml b/Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml
new file mode 100644
index 000000000000..2894256c976d
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/aspeed,ast2600-ahbc.yaml
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/aspeed,ast2600-ahbc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED Advanced High-Performance Bus Controller (AHBC)
+
+maintainers:
+ - Neal Liu <neal_liu@aspeedtech.com>
+ - Chia-Wei Wang <chiawei_wang@aspeedtech.com>
+
+description: |
+ The Advanced High-performance Bus Controller (AHBC) provides several
+ mechanisms, including a priority arbiter, an address decoder and a data
+ multiplexer, to control the overall operation of the Advanced
+ High-performance Bus (AHB).
+
+properties:
+ compatible:
+ enum:
+ - aspeed,ast2600-ahbc
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ ahbc@1e600000 {
+ compatible = "aspeed,ast2600-ahbc";
+ reg = <0x1e600000 0x100>;
+ };
diff --git a/Documentation/devicetree/bindings/bus/brcm,gisb-arb.txt b/Documentation/devicetree/bindings/bus/brcm,gisb-arb.txt
deleted file mode 100644
index 10f6d0a8159d..000000000000
--- a/Documentation/devicetree/bindings/bus/brcm,gisb-arb.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-Broadcom GISB bus Arbiter controller
-
-Required properties:
-
-- compatible:
- "brcm,bcm7278-gisb-arb" for V7 28nm chips
- "brcm,gisb-arb" or "brcm,bcm7445-gisb-arb" for other 28nm chips
- "brcm,bcm7435-gisb-arb" for newer 40nm chips
- "brcm,bcm7400-gisb-arb" for older 40nm chips and all 65nm chips
- "brcm,bcm7038-gisb-arb" for 130nm chips
-- reg: specifies the base physical address and size of the registers
-- interrupts: specifies the two interrupts (timeout and TEA) to be used from
- the parent interrupt controller. A third optional interrupt may be specified
- for breakpoints.
-
-Optional properties:
-
-- brcm,gisb-arb-master-mask: 32-bits wide bitmask used to specify which GISB
- masters are valid at the system level
-- brcm,gisb-arb-master-names: string list of the litteral name of the GISB
- masters. Should match the number of bits set in brcm,gisb-master-mask and
- the order in which they appear
-
-Example:
-
-gisb-arb@f0400000 {
- compatible = "brcm,gisb-arb";
- reg = <0xf0400000 0x800>;
- interrupts = <0>, <2>;
- interrupt-parent = <&sun_l2_intc>;
-
- brcm,gisb-arb-master-mask = <0x7>;
- brcm,gisb-arb-master-names = "bsp_0", "scpu_0", "cpu_0";
-};
diff --git a/Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml b/Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml
new file mode 100644
index 000000000000..b23c3001991e
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/brcm,gisb-arb.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/brcm,gisb-arb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom GISB bus Arbiter controller
+
+maintainers:
+ - Florian Fainelli <f.fainelli@gmail.com>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - brcm,bcm7445-gisb-arb # for other 28nm chips
+ - const: brcm,gisb-arb
+ - items:
+ - enum:
+ - brcm,bcm7278-gisb-arb # for V7 28nm chips
+ - brcm,bcm7435-gisb-arb # for newer 40nm chips
+ - brcm,bcm7400-gisb-arb # for older 40nm chips and all 65nm chips
+ - brcm,bcm7038-gisb-arb # for 130nm chips
+ - brcm,gisb-arb # fallback compatible
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ minItems: 2
+ items:
+ - description: timeout interrupt line
+ - description: target abort interrupt line
+ - description: breakpoint interrupt line
+
+ brcm,gisb-arb-master-mask:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: >
+ 32-bit wide bitmask used to specify which GISB masters are valid at the
+ system level
+
+ brcm,gisb-arb-master-names:
+ $ref: /schemas/types.yaml#/definitions/string-array
+ description: >
+ String list of the literal names of the GISB masters. Should match the
+ number of bits set in brcm,gisb-arb-master-mask and the order in which
+ they appear, from MSB to LSB.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ gisb-arb@f0400000 {
+ compatible = "brcm,gisb-arb";
+ reg = <0xf0400000 0x800>;
+ interrupts = <0>, <2>;
+ interrupt-parent = <&sun_l2_intc>;
+ brcm,gisb-arb-master-mask = <0x7>;
+ brcm,gisb-arb-master-names = "bsp_0", "scpu_0", "cpu_0";
+ };
diff --git a/Documentation/devicetree/bindings/bus/fsl,imx8qxp-pixel-link-msi-bus.yaml b/Documentation/devicetree/bindings/bus/fsl,imx8qxp-pixel-link-msi-bus.yaml
new file mode 100644
index 000000000000..b568d0ce438d
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/fsl,imx8qxp-pixel-link-msi-bus.yaml
@@ -0,0 +1,232 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/fsl,imx8qxp-pixel-link-msi-bus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX8qxp Pixel Link Medium Speed Interconnect (MSI) Bus
+
+maintainers:
+ - Liu Ying <victor.liu@nxp.com>
+
+description: |
+ i.MX8qxp pixel link MSI bus is used to control settings of PHYs, I/Os
+ sitting together with the PHYs. It is not the same as the MSI bus coming
+ from i.MX8 System Controller Unit (SCU) which is used to control power,
+ clock and reset through the i.MX8 Distributed Slave System Controller (DSC).
+
+ i.MX8qxp pixel link MSI bus is a simple memory-mapped bus. Two input clocks,
+ that is, MSI clock and AHB clock, need to be enabled so that peripherals
+ connected to the bus can be accessed. Also, the bus is part of a power
+ domain. The power domain needs to be enabled before the peripherals can
+ be accessed.
+
+ Peripherals in i.MX8qm/qxp imaging, LVDS, MIPI DSI and HDMI TX subsystems,
+ like I2C controller, PWM controller, MIPI DSI controller and Control and
+ Status Registers (CSR) module, are accessed through the bus.
+
+ The i.MX System Controller Firmware (SCFW) owns and uses the i.MX8qm/qxp
+ pixel link MSI bus controller and does not allow SCFW user to control it.
+ So, the controller's registers cannot be accessed by SCFW user. Hence,
+ the interrupts generated by the controller don't make any sense from SCFW
+ user's point of view.
+
+allOf:
+ - $ref: simple-pm-bus.yaml#
+
+# We need a select here so we don't match all nodes with 'simple-pm-bus'.
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - fsl,imx8qxp-display-pixel-link-msi-bus
+ - fsl,imx8qm-display-pixel-link-msi-bus
+ required:
+ - compatible
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - fsl,imx8qxp-display-pixel-link-msi-bus
+ - fsl,imx8qm-display-pixel-link-msi-bus
+ - const: simple-pm-bus
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: master gated clock from system
+ - description: AHB clock
+
+ clock-names:
+ items:
+ - const: msi
+ - const: ahb
+
+patternProperties:
+ "^.*@[0-9a-f]+$":
+ description: Devices attached to the bus
+ type: object
+ properties:
+ reg:
+ maxItems: 1
+
+ required:
+ - reg
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - power-domains
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/imx8-lpcg.h>
+ #include <dt-bindings/firmware/imx/rsrc.h>
+ bus@56200000 {
+ compatible = "fsl,imx8qxp-display-pixel-link-msi-bus", "simple-pm-bus";
+ reg = <0x56200000 0x20000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ interrupt-parent = <&dc0_irqsteer>;
+ interrupts = <320>;
+ ranges;
+ clocks = <&dc0_disp_ctrl_link_mst0_lpcg IMX_LPCG_CLK_4>,
+ <&dc0_disp_ctrl_link_mst0_lpcg IMX_LPCG_CLK_4>;
+ clock-names = "msi", "ahb";
+ power-domains = <&pd IMX_SC_R_DC_0>;
+
+ syscon@56221000 {
+ compatible = "fsl,imx8qxp-mipi-lvds-csr", "syscon", "simple-mfd";
+ reg = <0x56221000 0x1000>;
+ clocks = <&mipi_lvds_0_di_mipi_lvds_regs_lpcg IMX_LPCG_CLK_4>;
+ clock-names = "ipg";
+
+ pxl2dpi {
+ compatible = "fsl,imx8qxp-pxl2dpi";
+ fsl,sc-resource = <IMX_SC_R_MIPI_0>;
+ power-domains = <&pd IMX_SC_R_MIPI_0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0>;
+
+ mipi_lvds_0_pxl2dpi_dc0_pixel_link0: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&dc0_pixel_link0_mipi_lvds_0_pxl2dpi>;
+ };
+
+ mipi_lvds_0_pxl2dpi_dc0_pixel_link1: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&dc0_pixel_link1_mipi_lvds_0_pxl2dpi>;
+ };
+ };
+
+ port@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <1>;
+
+ mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch0: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&mipi_lvds_0_ldb_ch0_mipi_lvds_0_pxl2dpi>;
+ };
+
+ mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch1: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&mipi_lvds_0_ldb_ch1_mipi_lvds_0_pxl2dpi>;
+ };
+ };
+ };
+ };
+
+ ldb {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "fsl,imx8qxp-ldb";
+ clocks = <&clk IMX_SC_R_LVDS_0 IMX_SC_PM_CLK_MISC2>,
+ <&clk IMX_SC_R_LVDS_0 IMX_SC_PM_CLK_BYPASS>;
+ clock-names = "pixel", "bypass";
+ power-domains = <&pd IMX_SC_R_LVDS_0>;
+
+ channel@0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0>;
+ phys = <&mipi_lvds_0_phy>;
+ phy-names = "lvds_phy";
+
+ port@0 {
+ reg = <0>;
+
+ mipi_lvds_0_ldb_ch0_mipi_lvds_0_pxl2dpi: endpoint {
+ remote-endpoint = <&mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch0>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ /* ... */
+ };
+ };
+
+ channel@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <1>;
+ phys = <&mipi_lvds_0_phy>;
+ phy-names = "lvds_phy";
+
+ port@0 {
+ reg = <0>;
+
+ mipi_lvds_0_ldb_ch1_mipi_lvds_0_pxl2dpi: endpoint {
+ remote-endpoint = <&mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch1>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ /* ... */
+ };
+ };
+ };
+ };
+
+ clock-controller@56223004 {
+ compatible = "fsl,imx8qxp-lpcg";
+ reg = <0x56223004 0x4>;
+ #clock-cells = <1>;
+ clocks = <&mipi_lvds_0_ipg_clk>;
+ clock-indices = <IMX_LPCG_CLK_4>;
+ clock-output-names = "mipi_lvds_0_di_mipi_lvds_regs_lpcg_ipg_clk";
+ power-domains = <&pd IMX_SC_R_MIPI_0>;
+ };
+
+ phy@56228300 {
+ compatible = "fsl,imx8qxp-mipi-dphy";
+ reg = <0x56228300 0x100>;
+ clocks = <&clk IMX_SC_R_LVDS_0 IMX_SC_PM_CLK_PHY>;
+ clock-names = "phy_ref";
+ #phy-cells = <0>;
+ fsl,syscon = <&mipi_lvds_0_csr>;
+ power-domains = <&pd IMX_SC_R_MIPI_0>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/bus/fsl,spba-bus.yaml b/Documentation/devicetree/bindings/bus/fsl,spba-bus.yaml
new file mode 100644
index 000000000000..d42dbb0bbc2e
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/fsl,spba-bus.yaml
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/fsl,spba-bus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Shared Peripherals Bus Interface
+
+maintainers:
+ - Shawn Guo <shawnguo@kernel.org>
+
+description: |
+ A simple bus enabling access to shared peripherals.
+
+ The "spba-bus" follows the "simple-bus" set of properties, as
+ specified in the Devicetree Specification. It is an extension of
+ "simple-bus" because the SDMA controller uses this compatible flag to
+ determine which peripherals are available to it and the address range
+ over which the SDMA has access. There are no special clocks for the
+ bus, because the SDMA controller itself has its own interrupt and
+ clock assignments.
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: fsl,spba-bus
+ required:
+ - compatible
+
+properties:
+ $nodename:
+ pattern: "^spba-bus(@[0-9a-f]+)?$"
+
+ compatible:
+ items:
+ - const: fsl,spba-bus
+ - const: simple-bus
+
+ '#address-cells':
+ enum: [ 1, 2 ]
+
+ '#size-cells':
+ enum: [ 1, 2 ]
+
+ reg:
+ maxItems: 1
+
+ ranges: true
+
+required:
+ - compatible
+ - '#address-cells'
+ - '#size-cells'
+ - reg
+ - ranges
+
+additionalProperties:
+ type: object
+
+examples:
+ - |
+ spba-bus@30000000 {
+ compatible = "fsl,spba-bus", "simple-bus";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x30000000 0x100000>;
+ ranges;
+ };
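
In practice, the peripherals the SDMA engine may reach sit as children of
the bus node. A minimal sketch with a hypothetical UART child; the child
compatible and addresses are illustrative only:

	spba-bus@30000000 {
		compatible = "fsl,spba-bus", "simple-bus";
		#address-cells = <1>;
		#size-cells = <1>;
		reg = <0x30000000 0x100000>;
		ranges;

		serial@30020000 {
			compatible = "fsl,imx8mq-uart", "fsl,imx6q-uart";
			reg = <0x30020000 0x10000>;
		};
	};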
diff --git a/Documentation/devicetree/bindings/bus/imx-weim.txt b/Documentation/devicetree/bindings/bus/imx-weim.txt
index 1b1d1c5c21ea..e7f502070d77 100644
--- a/Documentation/devicetree/bindings/bus/imx-weim.txt
+++ b/Documentation/devicetree/bindings/bus/imx-weim.txt
@@ -48,6 +48,11 @@ Optional properties:
devices, the presence of this property indicates that
the weim bus should operate in Burst Clock Mode.
+ - fsl,continuous-burst-clk Make Burst Clock to output continuous clock.
+ Without this option Burst Clock will output clock
+ only when necessary. This takes effect only if
+ "fsl,burst-clk-enable" is set.
+
Timing property for child nodes. It is mandatory, not optional.
- fsl,weim-cs-timing: The timing array, contains timing values for the
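
To illustrate the burst clock options, a hedged sketch of an i.MX6Q-style
WEIM node; the register address, clock specifier and ranges are assumptions:

	weim: weim@21b8000 {
		compatible = "fsl,imx6q-weim";
		reg = <0x021b8000 0x4000>;
		clocks = <&clks 196>; /* illustrative clock specifier */
		#address-cells = <2>;
		#size-cells = <1>;
		ranges = <0 0 0x08000000 0x08000000>;
		fsl,burst-clk-enable;
		/* keep the burst clock running continuously instead of
		   gating it when not needed */
		fsl,continuous-burst-clk;
	};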
diff --git a/Documentation/devicetree/bindings/bus/intel,ixp4xx-expansion-bus-controller.yaml b/Documentation/devicetree/bindings/bus/intel,ixp4xx-expansion-bus-controller.yaml
deleted file mode 100644
index 5fb4e7bfa4da..000000000000
--- a/Documentation/devicetree/bindings/bus/intel,ixp4xx-expansion-bus-controller.yaml
+++ /dev/null
@@ -1,168 +0,0 @@
-# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/bus/intel,ixp4xx-expansion-bus-controller.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Intel IXP4xx Expansion Bus Controller
-
-description: |
- The IXP4xx expansion bus controller handles access to devices on the
- memory-mapped expansion bus on the Intel IXP4xx family of system on chips,
- including IXP42x, IXP43x, IXP45x and IXP46x.
-
-maintainers:
- - Linus Walleij <linus.walleij@linaro.org>
-
-properties:
- $nodename:
- pattern: '^bus@[0-9a-f]+$'
-
- compatible:
- items:
- - enum:
- - intel,ixp42x-expansion-bus-controller
- - intel,ixp43x-expansion-bus-controller
- - intel,ixp45x-expansion-bus-controller
- - intel,ixp46x-expansion-bus-controller
- - const: syscon
-
- reg:
- description: Control registers for the expansion bus, these are not
- inside the memory range handled by the expansion bus.
- maxItems: 1
-
- native-endian:
- $ref: /schemas/types.yaml#/definitions/flag
- description: The IXP4xx has a peculiar MMIO access scheme, as it changes
- the access pattern for words (swizzling) on the bus depending on whether
- the SoC is running in big-endian or little-endian mode. Thus the
- registers must always be accessed using native endianness.
-
- "#address-cells":
- description: |
- The first cell is the chip select number.
- The second cell is the address offset within the bank.
- const: 2
-
- "#size-cells":
- const: 1
-
- ranges: true
- dma-ranges: true
-
-patternProperties:
- "^.*@[0-7],[0-9a-f]+$":
- description: Devices attached to chip selects are represented as
- subnodes.
- type: object
-
- properties:
- intel,ixp4xx-eb-t1:
- description: Address timing, extend address phase with n cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- maximum: 3
-
- intel,ixp4xx-eb-t2:
- description: Setup chip select timing, extend setup phase with n cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- maximum: 3
-
- intel,ixp4xx-eb-t3:
- description: Strobe timing, extend strobe phase with n cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- maximum: 15
-
- intel,ixp4xx-eb-t4:
- description: Hold timing, extend hold phase with n cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- maximum: 3
-
- intel,ixp4xx-eb-t5:
- description: Recovery timing, extend recovery phase with n cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- maximum: 15
-
- intel,ixp4xx-eb-cycle-type:
- description: The type of cycles to use on the expansion bus for this
- chip select. 0 = Intel cycles, 1 = Motorola cycles, 2 = HPI cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1, 2]
-
- intel,ixp4xx-eb-byte-access-on-halfword:
- description: Allow byte read access on half word devices.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1]
-
- intel,ixp4xx-eb-hpi-hrdy-pol-high:
- description: Set HPI HRDY polarity to active high when using HPI.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1]
-
- intel,ixp4xx-eb-mux-address-and-data:
- description: Multiplex address and data on the data bus.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1]
-
- intel,ixp4xx-eb-ahb-split-transfers:
- description: Enable AHB split transfers.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1]
-
- intel,ixp4xx-eb-write-enable:
- description: Enable write cycles.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1]
-
- intel,ixp4xx-eb-byte-access:
- description: Expansion bus uses only 8 bits. The default is to use
- 16 bits.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [0, 1]
-
-required:
- - compatible
- - reg
- - native-endian
- - "#address-cells"
- - "#size-cells"
- - ranges
- - dma-ranges
-
-additionalProperties: false
-
-examples:
- - |
- #include <dt-bindings/interrupt-controller/irq.h>
- bus@50000000 {
- compatible = "intel,ixp42x-expansion-bus-controller", "syscon";
- reg = <0xc4000000 0x28>;
- native-endian;
- #address-cells = <2>;
- #size-cells = <1>;
- ranges = <0 0x0 0x50000000 0x01000000>,
- <1 0x0 0x51000000 0x01000000>;
- dma-ranges = <0 0x0 0x50000000 0x01000000>,
- <1 0x0 0x51000000 0x01000000>;
- flash@0,0 {
- compatible = "intel,ixp4xx-flash", "cfi-flash";
- bank-width = <2>;
- reg = <0 0x00000000 0x1000000>;
- intel,ixp4xx-eb-t3 = <3>;
- intel,ixp4xx-eb-cycle-type = <0>;
- intel,ixp4xx-eb-byte-access-on-halfword = <1>;
- intel,ixp4xx-eb-write-enable = <1>;
- intel,ixp4xx-eb-byte-access = <0>;
- };
- serial@1,0 {
- compatible = "exar,xr16l2551", "ns8250";
- reg = <1 0x00000000 0x10>;
- interrupt-parent = <&gpio0>;
- interrupts = <4 IRQ_TYPE_LEVEL_LOW>;
- clock-frequency = <1843200>;
- intel,ixp4xx-eb-t3 = <3>;
- intel,ixp4xx-eb-cycle-type = <1>;
- intel,ixp4xx-eb-write-enable = <1>;
- intel,ixp4xx-eb-byte-access = <1>;
- };
- };
diff --git a/Documentation/devicetree/bindings/bus/microsoft,vmbus.yaml b/Documentation/devicetree/bindings/bus/microsoft,vmbus.yaml
new file mode 100644
index 000000000000..a8d40c766dcd
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/microsoft,vmbus.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/microsoft,vmbus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microsoft Hyper-V VMBus
+
+maintainers:
+ - Saurabh Sengar <ssengar@linux.microsoft.com>
+
+description:
+ VMBus is a software bus that implements the protocols for communication
+ between the root or host OS and guest OSs (virtual machines).
+
+properties:
+ compatible:
+ const: microsoft,vmbus
+
+ ranges: true
+
+ '#address-cells':
+ const: 2
+
+ '#size-cells':
+ const: 1
+
+required:
+ - compatible
+ - ranges
+ - '#address-cells'
+ - '#size-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <1>;
+ bus {
+ compatible = "simple-bus";
+ #address-cells = <2>;
+ #size-cells = <1>;
+ ranges;
+
+ vmbus@ff0000000 {
+ compatible = "microsoft,vmbus";
+ #address-cells = <2>;
+ #size-cells = <1>;
+ ranges = <0x0f 0xf0000000 0x0f 0xf0000000 0x10000000>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/bus/nvidia,tegra210-aconnect.yaml b/Documentation/devicetree/bindings/bus/nvidia,tegra210-aconnect.yaml
index 7b1a08c62aef..4157e885c6e7 100644
--- a/Documentation/devicetree/bindings/bus/nvidia,tegra210-aconnect.yaml
+++ b/Documentation/devicetree/bindings/bus/nvidia,tegra210-aconnect.yaml
@@ -21,6 +21,7 @@ properties:
- const: nvidia,tegra210-aconnect
- items:
- enum:
+ - nvidia,tegra234-aconnect
- nvidia,tegra186-aconnect
- nvidia,tegra194-aconnect
- const: nvidia,tegra210-aconnect
@@ -39,10 +40,10 @@ properties:
maxItems: 1
"#address-cells":
- const: 1
+ enum: [ 1, 2 ]
"#size-cells":
- const: 1
+ enum: [ 1, 2 ]
ranges: true
diff --git a/Documentation/devicetree/bindings/bus/palmbus.yaml b/Documentation/devicetree/bindings/bus/palmbus.yaml
new file mode 100644
index 000000000000..c36c1e92a573
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/palmbus.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/palmbus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ralink PalmBus
+
+maintainers:
+ - Sergio Paracuellos <sergio.paracuellos@gmail.com>
+
+description: |
+ The Ralink palmbus controller can be found in all Ralink MIPS
+ SoCs. It provides an external bus for connecting multiple
+ external devices to the SoC.
+
+properties:
+ $nodename:
+ pattern: "^palmbus(@[0-9a-f]+)?$"
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 1
+
+ compatible:
+ const: palmbus
+
+ reg:
+ maxItems: 1
+
+ ranges: true
+
+patternProperties:
+ # All other properties should be child nodes with unit-address and 'reg'
+ "@[0-9a-f]+$":
+ type: object
+ additionalProperties: true
+ properties:
+ reg:
+ maxItems: 1
+
+ required:
+ - reg
+
+required:
+ - compatible
+ - reg
+ - "#address-cells"
+ - "#size-cells"
+ - ranges
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/mips-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ palmbus@1e000000 {
+ compatible = "palmbus";
+ reg = <0x1e000000 0x100000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0x0 0x1e000000 0x0fffff>;
+
+ gpio@600 {
+ #gpio-cells = <2>;
+ #interrupt-cells = <2>;
+ compatible = "mediatek,mt7621-gpio";
+ gpio-controller;
+ gpio-ranges = <&pinctrl 0 0 95>;
+ interrupt-controller;
+ reg = <0x600 0x100>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SHARED 12 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/bus/qcom,ssc-block-bus.yaml b/Documentation/devicetree/bindings/bus/qcom,ssc-block-bus.yaml
new file mode 100644
index 000000000000..8e9e6ff35d7d
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/qcom,ssc-block-bus.yaml
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/qcom,ssc-block-bus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: The AHB Bus Providing a Global View of the SSC Block on (some) qcom SoCs
+
+maintainers:
+ - Michael Srba <Michael.Srba@seznam.cz>
+
+description: |
+ This binding describes the dependencies (clocks, resets, power domains) which
+ need to be turned on in a sequence before communication over the AHB bus
+ becomes possible.
+
+ Additionally, the reg property is used to pass to the driver the location of
+ two sadly undocumented registers which need to be poked as part of the sequence.
+
+ The SSC (Snapdragon Sensor Core) block contains a gpio controller, i2c/spi/uart
+ controllers, a hexagon core, and a clock controller which provides clocks for
+ the above.
+
+properties:
+ compatible:
+ items:
+ - const: qcom,msm8998-ssc-block-bus
+ - const: qcom,ssc-block-bus
+
+ reg:
+ items:
+ - description: SSCAON_CONFIG0 registers
+ - description: SSCAON_CONFIG1 registers
+
+ reg-names:
+ items:
+ - const: mpm_sscaon_config0
+ - const: mpm_sscaon_config1
+
+ '#address-cells':
+ enum: [ 1, 2 ]
+
+ '#size-cells':
+ enum: [ 1, 2 ]
+
+ ranges: true
+
+ clocks:
+ maxItems: 6
+
+ clock-names:
+ items:
+ - const: xo
+ - const: aggre2
+ - const: gcc_im_sleep
+ - const: aggre2_north
+ - const: ssc_xo
+ - const: ssc_ahbs
+
+ power-domains:
+ items:
+ - description: CX power domain
+ - description: MX power domain
+
+ power-domain-names:
+ items:
+ - const: ssc_cx
+ - const: ssc_mx
+
+ resets:
+ items:
+ - description: Main reset
+ - description:
+ SSC Branch Control Register reset (associated with the ssc_xo and
+ ssc_ahbs clocks)
+
+ reset-names:
+ items:
+ - const: ssc_reset
+ - const: ssc_bcr
+
+ qcom,halt-regs:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ description: describes how to locate the ssc AXI halt register
+ items:
+ - items:
+ - description: Phandle reference to a syscon representing TCSR
+ - description: offset for the ssc AXI halt register
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - '#address-cells'
+ - '#size-cells'
+ - ranges
+ - clocks
+ - clock-names
+ - power-domains
+ - power-domain-names
+ - resets
+ - reset-names
+ - qcom,halt-regs
+
+additionalProperties:
+ type: object
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-msm8998.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ soc {
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ // devices under this node are physically located in the SSC block, connected to an ssc-internal bus;
+ ssc_ahb_slave: bus@10ac008 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ compatible = "qcom,msm8998-ssc-block-bus", "qcom,ssc-block-bus";
+ reg = <0x10ac008 0x4>, <0x10ac010 0x4>;
+ reg-names = "mpm_sscaon_config0", "mpm_sscaon_config1";
+
+ clocks = <&xo>,
+ <&rpmcc RPM_SMD_AGGR2_NOC_CLK>,
+ <&gcc GCC_IM_SLEEP>,
+ <&gcc AGGRE2_SNOC_NORTH_AXI>,
+ <&gcc SSC_XO>,
+ <&gcc SSC_CNOC_AHBS_CLK>;
+ clock-names = "xo", "aggre2", "gcc_im_sleep", "aggre2_north", "ssc_xo", "ssc_ahbs";
+
+ resets = <&gcc GCC_SSC_RESET>, <&gcc GCC_SSC_BCR>;
+ reset-names = "ssc_reset", "ssc_bcr";
+
+ power-domains = <&rpmpd MSM8998_SSCCX>, <&rpmpd MSM8998_SSCMX>;
+ power-domain-names = "ssc_cx", "ssc_mx";
+
+ qcom,halt-regs = <&tcsr_mutex_regs 0x26000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/bus/ti-sysc.txt b/Documentation/devicetree/bindings/bus/ti-sysc.txt
deleted file mode 100644
index c984143d08d2..000000000000
--- a/Documentation/devicetree/bindings/bus/ti-sysc.txt
+++ /dev/null
@@ -1,139 +0,0 @@
-Texas Instruments sysc interconnect target module wrapper binding
-
-Texas Instruments SoCs can have a generic interconnect target module
-hardware for devices connected to various interconnects such as L3
-interconnect (Arteris NoC) and L4 interconnect (Sonics s3220). The sysc
-is mostly used for interaction between module and PRCM. It participates
-in the OCP Disconnect Protocol but other than that is mostly independent
-of the interconnect.
-
-Each interconnect target module can have one or more devices connected to
-it. There is a set of control registers for managing interconnect target
-module clocks, idle modes and interconnect level resets for the module.
-
-These control registers are sprinkled into the unused register address
-space of the first child device IP block managed by the interconnect
-target module and typically are named REVISION, SYSCONFIG and SYSSTATUS.
-
-Required standard properties:
-
-- compatible shall be one of the following generic types:
-
- "ti,sysc"
- "ti,sysc-omap2"
- "ti,sysc-omap4"
- "ti,sysc-omap4-simple"
-
- or one of the following derivative types for hardware
- needing special workarounds:
-
- "ti,sysc-omap2-timer"
- "ti,sysc-omap4-timer"
- "ti,sysc-omap3430-sr"
- "ti,sysc-omap3630-sr"
- "ti,sysc-omap4-sr"
- "ti,sysc-omap3-sham"
- "ti,sysc-omap-aes"
- "ti,sysc-mcasp"
- "ti,sysc-dra7-mcasp"
- "ti,sysc-usb-host-fs"
- "ti,sysc-dra7-mcan"
- "ti,sysc-pruss"
-
-- reg shall have register areas implemented for the interconnect
- target module in question such as revision, sysc and syss
-
-- reg-names shall contain the register names implemented for the
- interconnect target module in question such as
- "rev, "sysc", and "syss"
-
-- ranges shall contain the interconnect target module IO range
- available for one or more child device IP blocks managed
- by the interconnect target module, the ranges may include
- multiple ranges such as device L4 range for control and
- parent L3 range for DMA access
-
-Optional properties:
-
-- ti,sysc-mask shall contain mask of supported register bits for the
- SYSCONFIG register as documented in the Technical Reference
- Manual (TRM) for the interconnect target module
-
-- ti,sysc-midle list of master idle modes supported by the interconnect
- target module as documented in the TRM for SYSCONFIG
- register MIDLEMODE bits
-
-- ti,sysc-sidle list of slave idle modes supported by the interconnect
- target module as documented in the TRM for SYSCONFIG
- register SIDLEMODE bits
-
-- ti,sysc-delay-us delay needed after OCP softreset before accssing
- SYSCONFIG register again
-
-- ti,syss-mask optional mask of reset done status bits as described in the
- TRM for SYSSTATUS registers, typically 1 with some devices
- having separate reset done bits for children like OHCI and
- EHCI
-
-- clocks clock specifier for each name in the clock-names as
- specified in the binding documentation for ti-clkctrl,
- typically available for all interconnect targets on TI SoCs
- based on omap4 except if it's read-only register in hwauto
- mode as for example omap4 L4_CFG_CLKCTRL
-
-- clock-names should contain at least "fck", and optionally also "ick"
- depending on the SoC and the interconnect target module,
- some interconnect target modules also need additional
- optional clocks that can be specified as listed in TRM
- for the related CLKCTRL register bits 8 to 15 such as
- "dbclk" or "clk32k" depending on their role
-
-- ti,hwmods optional TI interconnect module name to use legacy
- hwmod platform data
-
-- ti,no-reset-on-init interconnect target module should not be reset at init
-
-- ti,no-idle-on-init interconnect target module should not be idled at init
-
-- ti,no-idle interconnect target module should not be idled
-
-Example: Single instance of MUSB controller on omap4 using interconnect ranges
-using offsets from l4_cfg second segment (0x4a000000 + 0x80000 = 0x4a0ab000):
-
- target-module@2b000 { /* 0x4a0ab000, ap 84 12.0 */
- compatible = "ti,sysc-omap2";
- ti,hwmods = "usb_otg_hs";
- reg = <0x2b400 0x4>,
- <0x2b404 0x4>,
- <0x2b408 0x4>;
- reg-names = "rev", "sysc", "syss";
- clocks = <&l3_init_clkctrl OMAP4_USB_OTG_HS_CLKCTRL 0>;
- clock-names = "fck";
- ti,sysc-mask = <(SYSC_OMAP2_ENAWAKEUP |
- SYSC_OMAP2_SOFTRESET |
- SYSC_OMAP2_AUTOIDLE)>;
- ti,sysc-midle = <SYSC_IDLE_FORCE>,
- <SYSC_IDLE_NO>,
- <SYSC_IDLE_SMART>;
- ti,sysc-sidle = <SYSC_IDLE_FORCE>,
- <SYSC_IDLE_NO>,
- <SYSC_IDLE_SMART>,
- <SYSC_IDLE_SMART_WKUP>;
- ti,syss-mask = <1>;
- #address-cells = <1>;
- #size-cells = <1>;
- ranges = <0 0x2b000 0x1000>;
-
- usb_otg_hs: otg@0 {
- compatible = "ti,omap4-musb";
- reg = <0x0 0x7ff>;
- interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>;
- usb-phy = <&usb2_phy>;
- ...
- };
- };
-
-Note that other SoCs, such as am335x can have multiple child devices. On am335x
-there are two MUSB instances, two USB PHY instances, and a single CPPI41 DMA
-instance as children of a single interconnect target module.
diff --git a/Documentation/devicetree/bindings/bus/ti-sysc.yaml b/Documentation/devicetree/bindings/bus/ti-sysc.yaml
new file mode 100644
index 000000000000..f089634f9466
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/ti-sysc.yaml
@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/ti-sysc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments interconnect target module
+
+maintainers:
+ - Tony Lindgren <tony@atomide.com>
+
+description:
+ Texas Instruments SoCs can have a generic interconnect target module
+ for devices connected to various interconnects such as L3 interconnect
+ using Arteris NoC, and L4 interconnect using Sonics s3220. This module
+ is mostly used for interaction between the module and the Power, Reset
+ and Clock Manager (PRCM). It participates in the OCP Disconnect
+ Protocol, but other
+ than that it is mostly independent of the interconnect.
+
+ Each interconnect target module can have one or more devices connected to
+ it. There is a set of control registers for managing the interconnect target
+ module clocks, idle modes and interconnect level resets.
+
+ The interconnect target module control registers are sprinkled into the
+ unused register address space of the first child device IP block managed by
+ the interconnect target module. Typically the register names are REVISION,
+ SYSCONFIG and SYSSTATUS.
+
+properties:
+ $nodename:
+ pattern: "^target-module(@[0-9a-f]+)?$"
+
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - ti,sysc-omap2
+ - ti,sysc-omap4
+ - ti,sysc-omap4-simple
+ - ti,sysc-omap2-timer
+ - ti,sysc-omap4-timer
+ - ti,sysc-omap3430-sr
+ - ti,sysc-omap3630-sr
+ - ti,sysc-omap4-sr
+ - ti,sysc-omap3-sham
+ - ti,sysc-omap-aes
+ - ti,sysc-mcasp
+ - ti,sysc-dra7-mcasp
+ - ti,sysc-usb-host-fs
+ - ti,sysc-dra7-mcan
+ - ti,sysc-pruss
+ - const: ti,sysc
+ - items:
+ - const: ti,sysc
+
+ reg:
+ description:
+ Interconnect target module control registers consisting of
+ REVISION, SYSCONFIG and SYSSTATUS registers as defined in the
+ Technical Reference Manual for the SoC.
+ minItems: 1
+ maxItems: 3
+
+ reg-names:
+ description:
+ Interconnect target module control register names consisting
+ of "rev", "sysc" and "syss".
+ oneOf:
+ - minItems: 1
+ items:
+ - const: rev
+ - const: sysc
+ - const: syss
+ - items:
+ - const: rev
+ - const: syss
+ - enum: [ sysc, syss ]
+
+ power-domains:
+ description: Target module power domain if available.
+ maxItems: 1
+
+ clocks:
+ description:
+ Target module clocks consisting of one functional clock, one
+ interface clock, and up to 8 module specific optional clocks.
+ Some modules may have only the functional clock, and some have
+ no configurable clocks.
+ minItems: 1
+ maxItems: 4
+
+ clock-names:
+ description:
+ Target module clock names like "fck", "ick", "optck1", "optck2"
+ if the clocks are configurable.
+ oneOf:
+ - enum: [ ick, fck, sys_clk ]
+ - items:
+ - const: fck
+ - enum: [ ick, dbclk, osc, sys_clk, dss_clk, ahclkx ]
+ - items:
+ - const: fck
+ - const: phy-clk
+ - const: phy-clk-div
+ - items:
+ - const: fck
+ - const: hdmi_clk
+ - const: sys_clk
+ - const: tv_clk
+ - items:
+ - const: fck
+ - const: ahclkx
+ - const: ahclkr
+
+ resets:
+ description:
+ Target module reset bit in the RSTCTRL register if wired for the module.
+ Note that the other reset bits should be mapped for the child device
+ driver to use.
+ maxItems: 1
+
+ reset-names:
+ description:
+ Target module reset names in the RSTCTRL register, typically named
+ "rstctrl" if only one reset bit is wired for the module.
+ items:
+ - const: rstctrl
+
+ '#address-cells':
+ enum: [ 1, 2 ]
+
+ '#size-cells':
+ enum: [ 1, 2 ]
+
+ ranges: true
+
+ dma-ranges: true
+
+ ti,sysc-mask:
+ description: Mask of supported register bits for the SYSCONFIG register
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ ti,sysc-midle:
+ description: List of hardware supported master idle modes (SYSCONFIG MIDLEMODE bits)
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+
+ ti,sysc-sidle:
+ description: List of hardware supported slave idle modes (SYSCONFIG SIDLEMODE bits)
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+
+ ti,syss-mask:
+ description: Mask of supported register bits for the SYSSTATUS register
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ ti,sysc-delay-us:
+ description: Delay needed after OCP softreset before accessing SYSCONFIG
+ default: 0
+ minimum: 0
+ maximum: 2
+
+ ti,no-reset-on-init:
+ description: Interconnect target module shall not be reset at init
+ type: boolean
+
+ ti,no-idle-on-init:
+ description: Interconnect target module shall not be idled at init
+ type: boolean
+
+ ti,no-idle:
+ description: Interconnect target module shall not be idled
+ type: boolean
+
+ ti,hwmods:
+ description: Interconnect module name to use with legacy hwmod data
+ $ref: /schemas/types.yaml#/definitions/string
+ deprecated: true
+
+required:
+ - compatible
+ - '#address-cells'
+ - '#size-cells'
+ - ranges
+
+additionalProperties:
+ type: object
+
+examples:
+ - |
+ #include <dt-bindings/bus/ti-sysc.h>
+ #include <dt-bindings/clock/omap4.h>
+
+ target-module@2b000 {
+ compatible = "ti,sysc-omap2", "ti,sysc";
+ ti,hwmods = "usb_otg_hs";
+ reg = <0x2b400 0x4>,
+ <0x2b404 0x4>,
+ <0x2b408 0x4>;
+ reg-names = "rev", "sysc", "syss";
+ clocks = <&l3_init_clkctrl OMAP4_USB_OTG_HS_CLKCTRL 0>;
+ clock-names = "fck";
+ ti,sysc-mask = <(SYSC_OMAP2_ENAWAKEUP |
+ SYSC_OMAP2_SOFTRESET |
+ SYSC_OMAP2_AUTOIDLE)>;
+ ti,sysc-midle = <SYSC_IDLE_FORCE>,
+ <SYSC_IDLE_NO>,
+ <SYSC_IDLE_SMART>;
+ ti,sysc-sidle = <SYSC_IDLE_FORCE>,
+ <SYSC_IDLE_NO>,
+ <SYSC_IDLE_SMART>,
+ <SYSC_IDLE_SMART_WKUP>;
+ ti,syss-mask = <1>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0 0x2b000 0x1000>;
+ };
diff --git a/Documentation/devicetree/bindings/bus/xlnx,versal-net-cdx.yaml b/Documentation/devicetree/bindings/bus/xlnx,versal-net-cdx.yaml
new file mode 100644
index 000000000000..7f62ffbdc245
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/xlnx,versal-net-cdx.yaml
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/xlnx,versal-net-cdx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AMD CDX bus controller
+
+description: |
+ The CDX bus controller for AMD devices dynamically detects the CDX
+ bus and its devices using the firmware.
+ The CDX bus manages multiple FPGA based hardware devices, which
+ can support network, crypto or any other specialized type of
+ devices. These FPGA based devices can be added or modified dynamically
+ at run time.
+
+ All devices on the CDX bus will have a unique streamid (for IOMMU)
+ and a unique device ID (for MSI) corresponding to a requestor ID
+ (one to one associated with the device). The streamid and deviceid
+ are used to configure SMMU and GIC-ITS respectively.
+
+ iommu-map property is used to define the set of stream ids
+ corresponding to each device and the associated IOMMU.
+
+ The MSI writes are accompanied by sideband data (Device ID).
+ The msi-map property is used to associate the devices with the
+ device ID as well as the associated ITS controller.
+
+ The rproc property (xlnx,rproc) is used to identify the remote
+ processor with which the APU (Application Processor Unit) interacts
+ to discover the bus and device configuration.
+
+maintainers:
+ - Nipun Gupta <nipun.gupta@amd.com>
+ - Nikhil Agarwal <nikhil.agarwal@amd.com>
+
+properties:
+ compatible:
+ const: xlnx,versal-net-cdx
+
+ iommu-map: true
+
+ msi-map: true
+
+ xlnx,rproc:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ phandle to the remoteproc_r5 rproc node through which the APU
+ interacts with the remote processor.
+
+ ranges: true
+
+ "#address-cells":
+ enum: [1, 2]
+
+ "#size-cells":
+ enum: [1, 2]
+
+required:
+ - compatible
+ - iommu-map
+ - msi-map
+ - xlnx,rproc
+ - ranges
+ - "#address-cells"
+ - "#size-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cdx {
+ compatible = "xlnx,versal-net-cdx";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ /* define map for RIDs 250-259 */
+ iommu-map = <250 &smmu 250 10>;
+ /* define msi map for RIDs 250-259 */
+ msi-map = <250 &its 250 10>;
+ xlnx,rproc = <&remoteproc_r5>;
+ ranges;
+ };
diff --git a/Documentation/devicetree/bindings/memory-controllers/baikal,bt1-l2-ctl.yaml b/Documentation/devicetree/bindings/cache/baikal,bt1-l2-ctl.yaml
index 1fca282f64a2..ec4f367bc0b4 100644
--- a/Documentation/devicetree/bindings/memory-controllers/baikal,bt1-l2-ctl.yaml
+++ b/Documentation/devicetree/bindings/cache/baikal,bt1-l2-ctl.yaml
@@ -2,7 +2,7 @@
# Copyright (C) 2020 BAIKAL ELECTRONICS, JSC
%YAML 1.2
---
-$id: http://devicetree.org/schemas/memory-controllers/baikal,bt1-l2-ctl.yaml#
+$id: http://devicetree.org/schemas/cache/baikal,bt1-l2-ctl.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Baikal-T1 L2-cache Control Block
diff --git a/Documentation/devicetree/bindings/cache/freescale-l2cache.txt b/Documentation/devicetree/bindings/cache/freescale-l2cache.txt
new file mode 100644
index 000000000000..22ad012660e9
--- /dev/null
+++ b/Documentation/devicetree/bindings/cache/freescale-l2cache.txt
@@ -0,0 +1,55 @@
+Freescale L2 Cache Controller
+
+L2 cache is present in Freescale's QorIQ and QorIQ Qonverge platforms.
+The cache bindings explained below are Devicetree Specification compliant.
+
+Required Properties:
+
+- compatible : Should include one of the following:
+ "fsl,b4420-l2-cache-controller"
+ "fsl,b4860-l2-cache-controller"
+ "fsl,bsc9131-l2-cache-controller"
+ "fsl,bsc9132-l2-cache-controller"
+ "fsl,c293-l2-cache-controller"
+ "fsl,mpc8536-l2-cache-controller"
+ "fsl,mpc8540-l2-cache-controller"
+ "fsl,mpc8541-l2-cache-controller"
+ "fsl,mpc8544-l2-cache-controller"
+ "fsl,mpc8548-l2-cache-controller"
+ "fsl,mpc8555-l2-cache-controller"
+ "fsl,mpc8560-l2-cache-controller"
+ "fsl,mpc8568-l2-cache-controller"
+ "fsl,mpc8569-l2-cache-controller"
+ "fsl,mpc8572-l2-cache-controller"
+ "fsl,p1010-l2-cache-controller"
+ "fsl,p1011-l2-cache-controller"
+ "fsl,p1012-l2-cache-controller"
+ "fsl,p1013-l2-cache-controller"
+ "fsl,p1014-l2-cache-controller"
+ "fsl,p1015-l2-cache-controller"
+ "fsl,p1016-l2-cache-controller"
+ "fsl,p1020-l2-cache-controller"
+ "fsl,p1021-l2-cache-controller"
+ "fsl,p1022-l2-cache-controller"
+ "fsl,p1023-l2-cache-controller"
+ "fsl,p1024-l2-cache-controller"
+ "fsl,p1025-l2-cache-controller"
+ "fsl,p2010-l2-cache-controller"
+ "fsl,p2020-l2-cache-controller"
+ "fsl,t2080-l2-cache-controller"
+ "fsl,t4240-l2-cache-controller"
+ and "cache".
+- reg : Address and size of L2 cache controller registers
+- cache-size : Size of the entire L2 cache
+- interrupts : Error interrupt of L2 controller
+- cache-line-size : Size of L2 cache lines
+
+Example:
+
+ L2: l2-cache-controller@20000 {
+ compatible = "fsl,bsc9132-l2-cache-controller", "cache";
+ reg = <0x20000 0x1000>;
+ cache-line-size = <32>; // 32 bytes
+ cache-size = <0x40000>; // L2,256K
+ interrupts = <16 2 1 0>;
+ };
diff --git a/Documentation/devicetree/bindings/arm/l2c2x0.yaml b/Documentation/devicetree/bindings/cache/l2c2x0.yaml
index 6b8f4d4fa580..d7840a5c4037 100644
--- a/Documentation/devicetree/bindings/arm/l2c2x0.yaml
+++ b/Documentation/devicetree/bindings/cache/l2c2x0.yaml
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
-$id: http://devicetree.org/schemas/arm/l2c2x0.yaml#
+$id: http://devicetree.org/schemas/cache/l2c2x0.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: ARM L2 Cache Controller
diff --git a/Documentation/devicetree/bindings/arm/mrvl/feroceon.txt b/Documentation/devicetree/bindings/cache/marvell,feroceon-cache.txt
index 0d244b999d10..0d244b999d10 100644
--- a/Documentation/devicetree/bindings/arm/mrvl/feroceon.txt
+++ b/Documentation/devicetree/bindings/cache/marvell,feroceon-cache.txt
diff --git a/Documentation/devicetree/bindings/arm/mrvl/tauros2.txt b/Documentation/devicetree/bindings/cache/marvell,tauros2-cache.txt
index 31af1cbb60bd..31af1cbb60bd 100644
--- a/Documentation/devicetree/bindings/arm/mrvl/tauros2.txt
+++ b/Documentation/devicetree/bindings/cache/marvell,tauros2-cache.txt
diff --git a/Documentation/devicetree/bindings/cache/qcom,llcc.yaml b/Documentation/devicetree/bindings/cache/qcom,llcc.yaml
new file mode 100644
index 000000000000..d8b91944180a
--- /dev/null
+++ b/Documentation/devicetree/bindings/cache/qcom,llcc.yaml
@@ -0,0 +1,168 @@
+# SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cache/qcom,llcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Last Level Cache Controller
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ LLCC (Last Level Cache Controller) provides the last level of cache memory in the
+ SoC, which can be shared by multiple clients. Clients here are different cores in
+ the SoC; the idea is to minimize the local caches at the clients and migrate to a
+ common pool of memory. Cache memory is divided into partitions called slices,
+ which are assigned to clients. Clients can query the slice details, and activate
+ and deactivate them.
+
+properties:
+ compatible:
+ enum:
+ - qcom,sc7180-llcc
+ - qcom,sc7280-llcc
+ - qcom,sc8180x-llcc
+ - qcom,sc8280xp-llcc
+ - qcom,sdm845-llcc
+ - qcom,sm6350-llcc
+ - qcom,sm7150-llcc
+ - qcom,sm8150-llcc
+ - qcom,sm8250-llcc
+ - qcom,sm8350-llcc
+ - qcom,sm8450-llcc
+ - qcom,sm8550-llcc
+
+ reg:
+ minItems: 2
+ maxItems: 9
+
+ reg-names:
+ minItems: 2
+ maxItems: 9
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - reg-names
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7180-llcc
+ - qcom,sm6350-llcc
+ then:
+ properties:
+ reg:
+ items:
+ - description: LLCC0 base register region
+ - description: LLCC broadcast base register region
+ reg-names:
+ items:
+ - const: llcc0_base
+ - const: llcc_broadcast_base
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7280-llcc
+ then:
+ properties:
+ reg:
+ items:
+ - description: LLCC0 base register region
+ - description: LLCC1 base register region
+ - description: LLCC broadcast base register region
+ reg-names:
+ items:
+ - const: llcc0_base
+ - const: llcc1_base
+ - const: llcc_broadcast_base
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc8180x-llcc
+ - qcom,sc8280xp-llcc
+ then:
+ properties:
+ reg:
+ items:
+ - description: LLCC0 base register region
+ - description: LLCC1 base register region
+ - description: LLCC2 base register region
+ - description: LLCC3 base register region
+ - description: LLCC4 base register region
+ - description: LLCC5 base register region
+ - description: LLCC6 base register region
+ - description: LLCC7 base register region
+ - description: LLCC broadcast base register region
+ reg-names:
+ items:
+ - const: llcc0_base
+ - const: llcc1_base
+ - const: llcc2_base
+ - const: llcc3_base
+ - const: llcc4_base
+ - const: llcc5_base
+ - const: llcc6_base
+ - const: llcc7_base
+ - const: llcc_broadcast_base
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sdm845-llcc
+ - qcom,sm8150-llcc
+ - qcom,sm8250-llcc
+ - qcom,sm8350-llcc
+ - qcom,sm8450-llcc
+ then:
+ properties:
+ reg:
+ items:
+ - description: LLCC0 base register region
+ - description: LLCC1 base register region
+ - description: LLCC2 base register region
+ - description: LLCC3 base register region
+ - description: LLCC broadcast base register region
+ reg-names:
+ items:
+ - const: llcc0_base
+ - const: llcc1_base
+ - const: llcc2_base
+ - const: llcc3_base
+ - const: llcc_broadcast_base
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ system-cache-controller@1100000 {
+ compatible = "qcom,sdm845-llcc";
+ reg = <0 0x01100000 0 0x50000>, <0 0x01180000 0 0x50000>,
+ <0 0x01200000 0 0x50000>, <0 0x01280000 0 0x50000>,
+ <0 0x01300000 0 0x50000>;
+ reg-names = "llcc0_base", "llcc1_base", "llcc2_base",
+ "llcc3_base", "llcc_broadcast_base";
+ interrupts = <GIC_SPI 582 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml b/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml
new file mode 100644
index 000000000000..8a6a78e1a7ab
--- /dev/null
+++ b/Documentation/devicetree/bindings/cache/sifive,ccache0.yaml
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# Copyright (C) 2020 SiFive, Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cache/sifive,ccache0.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SiFive Composable Cache Controller
+
+maintainers:
+ - Paul Walmsley <paul.walmsley@sifive.com>
+
+description:
+ The SiFive Composable Cache Controller is used to provide access to fast copies
+ of memory for masters in a Core Complex. The Composable Cache Controller also
+ acts as a directory-based coherency manager.
+ All the properties in the ePAPR/Devicetree Specification apply to this platform.
+
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - sifive,ccache0
+ - sifive,fu540-c000-ccache
+ - sifive,fu740-c000-ccache
+
+ required:
+ - compatible
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - sifive,ccache0
+ - sifive,fu540-c000-ccache
+ - sifive,fu740-c000-ccache
+ - const: cache
+ - items:
+ - const: starfive,jh7110-ccache
+ - const: sifive,ccache0
+ - const: cache
+ - items:
+ - const: microchip,mpfs-ccache
+ - const: sifive,fu540-c000-ccache
+ - const: cache
+
+ cache-block-size:
+ const: 64
+
+ cache-level:
+ enum: [2, 3]
+
+ cache-sets:
+ enum: [1024, 2048]
+
+ cache-size:
+ const: 2097152
+
+ cache-unified: true
+
+ interrupts:
+ minItems: 3
+ items:
+ - description: DirError interrupt
+ - description: DataError interrupt
+ - description: DataFail interrupt
+ - description: DirFail interrupt
+
+ reg:
+ maxItems: 1
+
+ next-level-cache: true
+
+ memory-region:
+ maxItems: 1
+ description: |
+ A reference to the reserved-memory region used as the L2 Loosely Integrated Memory (LIM).
+ The reserved-memory node should be defined as per the bindings in reserved-memory.txt.
+
+allOf:
+ - $ref: /schemas/cache-controller.yaml#
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - sifive,fu740-c000-ccache
+ - starfive,jh7110-ccache
+ - microchip,mpfs-ccache
+
+ then:
+ properties:
+ interrupts:
+ description: |
+ Must contain entries for DirError, DataError, DataFail, DirFail signals.
+ minItems: 4
+
+ else:
+ properties:
+ interrupts:
+ description: |
+ Must contain entries for DirError, DataError and DataFail signals.
+ maxItems: 3
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - sifive,fu740-c000-ccache
+ - starfive,jh7110-ccache
+
+ then:
+ properties:
+ cache-sets:
+ const: 2048
+
+ else:
+ properties:
+ cache-sets:
+ const: 1024
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: sifive,ccache0
+
+ then:
+ properties:
+ cache-level:
+ enum: [2, 3]
+
+ else:
+ properties:
+ cache-level:
+ const: 2
+
+additionalProperties: false
+
+required:
+ - compatible
+ - cache-block-size
+ - cache-level
+ - cache-sets
+ - cache-size
+ - cache-unified
+ - interrupts
+ - reg
+
+examples:
+ - |
+ cache-controller@2010000 {
+ compatible = "sifive,fu540-c000-ccache", "cache";
+ cache-block-size = <64>;
+ cache-level = <2>;
+ cache-sets = <1024>;
+ cache-size = <2097152>;
+ cache-unified;
+ reg = <0x2010000 0x1000>;
+ interrupt-parent = <&plic0>;
+ interrupts = <1>,
+ <2>,
+ <3>;
+ next-level-cache = <&L25>;
+ memory-region = <&l2_lim>;
+ };
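
For illustration, the &l2_lim phandle in the example above would point at a
reserved-memory child node. A minimal sketch, with an assumed (not board-accurate)
address and size for the L2 LIM region:

    reserved-memory {
        #address-cells = <2>;
        #size-cells = <2>;
        ranges;

        l2_lim: memory@8000000 {
            reg = <0x0 0x8000000 0x0 0x200000>; /* assumed LIM address/size */
            no-map;
        };
    };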
diff --git a/Documentation/devicetree/bindings/arm/socionext/socionext,uniphier-system-cache.yaml b/Documentation/devicetree/bindings/cache/socionext,uniphier-system-cache.yaml
index 7ca5375f278f..3196263685a3 100644
--- a/Documentation/devicetree/bindings/arm/socionext/socionext,uniphier-system-cache.yaml
+++ b/Documentation/devicetree/bindings/cache/socionext,uniphier-system-cache.yaml
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
%YAML 1.2
---
-$id: http://devicetree.org/schemas/arm/socionext/socionext,uniphier-system-cache.yaml#
+$id: http://devicetree.org/schemas/cache/socionext,uniphier-system-cache.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: UniPhier outer cache controller
@@ -22,7 +22,6 @@ properties:
description: |
should contain 3 regions: control register, revision register,
operation register, in this order.
- minItems: 3
maxItems: 3
interrupts:
diff --git a/Documentation/devicetree/bindings/chosen.txt b/Documentation/devicetree/bindings/chosen.txt
deleted file mode 100644
index 1cc3aa10dcb1..000000000000
--- a/Documentation/devicetree/bindings/chosen.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-The chosen node
----------------
-
-The chosen node does not represent a real device, but serves as a place
-for passing data between firmware and the operating system, like boot
-arguments. Data in the chosen node does not represent the hardware.
-
-The following properties are recognized:
-
-
-kaslr-seed
------------
-
-This property is used when booting with CONFIG_RANDOMIZE_BASE as the
-entropy used to randomize the kernel image base address location. Since
-it is used directly, this value is intended only for KASLR, and should
-not be used for other purposes (as it may leak information about KASLR
-offsets). It is parsed as a u64 value, e.g.
-
-/ {
- chosen {
- kaslr-seed = <0xfeedbeef 0xc0def00d>;
- };
-};
-
-Note that if this property is set from UEFI (or a bootloader in EFI
-mode) when EFI_RNG_PROTOCOL is supported, it will be overwritten by
-the Linux EFI stub (which will populate the property itself, using
-EFI_RNG_PROTOCOL).
-
-stdout-path
------------
-
-Device trees may specify the device to be used for boot console output
-with a stdout-path property under /chosen, as described in the Devicetree
-Specification, e.g.
-
-/ {
- chosen {
- stdout-path = "/serial@f00:115200";
- };
-
- serial@f00 {
- compatible = "vendor,some-uart";
- reg = <0xf00 0x10>;
- };
-};
-
-If the character ":" is present in the value, this terminates the path.
-The meaning of any characters following the ":" is device-specific, and
-must be specified in the relevant binding documentation.
-
-For UART devices, the preferred binding is a string in the form:
-
- <baud>{<parity>{<bits>{<flow>}}}
-
-where
-
- baud - baud rate in decimal
- parity - 'n' (none), 'o', (odd) or 'e' (even)
- bits - number of data bits
- flow - 'r' (rts)
-
-For example: 115200n8r
-
-Implementation note: Linux will look for the property "linux,stdout-path" or
-on PowerPC "stdout" if "stdout-path" is not found. However, the
-"linux,stdout-path" and "stdout" properties are deprecated. New platforms
-should only use the "stdout-path" property.
-
-linux,booted-from-kexec
------------------------
-
-This property is set (currently only on PowerPC, and only needed on
-book3e) by some versions of kexec-tools to tell the new kernel that it
-is being booted by kexec, as the booting environment may differ (e.g.
-a different secondary CPU release mechanism)
-
-linux,usable-memory-range
--------------------------
-
-This property holds a base address and size, describing a limited region in
-which memory may be considered available for use by the kernel. Memory outside
-of this range is not available for use.
-
-This property describes a limitation: memory within this range is only
-valid when also described through another mechanism that the kernel
-would otherwise use to determine available memory (e.g. memory nodes
-or the EFI memory map). Valid memory may be sparse within the range.
-e.g.
-
-/ {
- chosen {
- linux,usable-memory-range = <0x9 0xf0000000 0x0 0x10000000>;
- };
-};
-
-The main usage is for crash dump kernel to identify its own usable
-memory and exclude, at its boot time, any other memory areas that are
-part of the panicked kernel's memory.
-
-While this property does not represent a real hardware, the address
-and the size are expressed in #address-cells and #size-cells,
-respectively, of the root node.
-
-linux,elfcorehdr
-----------------
-
-This property holds the memory range, the address and the size, of the elf
-core header which mainly describes the panicked kernel's memory layout as
-PT_LOAD segments of elf format.
-e.g.
-
-/ {
- chosen {
- linux,elfcorehdr = <0x9 0xfffff000 0x0 0x800>;
- };
-};
-
-While this property does not represent a real hardware, the address
-and the size are expressed in #address-cells and #size-cells,
-respectively, of the root node.
-
-linux,initrd-start and linux,initrd-end
----------------------------------------
-
-These properties hold the physical start and end address of an initrd that's
-loaded by the bootloader. Note that linux,initrd-start is inclusive, but
-linux,initrd-end is exclusive.
-e.g.
-
-/ {
- chosen {
- linux,initrd-start = <0x82000000>;
- linux,initrd-end = <0x82800000>;
- };
-};
diff --git a/Documentation/devicetree/bindings/chrome/google,cros-ec-typec.yaml b/Documentation/devicetree/bindings/chrome/google,cros-ec-typec.yaml
index 2d98f7c4d3bc..3b0548c34791 100644
--- a/Documentation/devicetree/bindings/chrome/google,cros-ec-typec.yaml
+++ b/Documentation/devicetree/bindings/chrome/google,cros-ec-typec.yaml
@@ -20,23 +20,35 @@ properties:
compatible:
const: google,cros-ec-typec
- connector:
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
+patternProperties:
+ '^connector@[0-9a-f]+$':
$ref: /schemas/connector/usb-connector.yaml#
+ unevaluatedProperties: false
+ properties:
+ reg:
+ maxItems: 1
required:
- compatible
-additionalProperties: true #fixme
+additionalProperties: false
examples:
- |+
- spi0 {
+ spi {
#address-cells = <1>;
#size-cells = <0>;
cros_ec: ec@0 {
compatible = "google,cros-ec-spi";
reg = <0>;
+ interrupts = <35 0>;
typec {
compatible = "google,cros-ec-typec";
diff --git a/Documentation/devicetree/bindings/chrome/google,cros-kbd-led-backlight.yaml b/Documentation/devicetree/bindings/chrome/google,cros-kbd-led-backlight.yaml
new file mode 100644
index 000000000000..c94ab8f9e0b8
--- /dev/null
+++ b/Documentation/devicetree/bindings/chrome/google,cros-kbd-led-backlight.yaml
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/chrome/google,cros-kbd-led-backlight.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ChromeOS keyboard backlight LED driver
+
+maintainers:
+ - Tzung-Bi Shih <tzungbi@kernel.org>
+
+properties:
+ compatible:
+ const: google,cros-kbd-led-backlight
+
+required:
+ - compatible
+
+additionalProperties: false
+
+examples:
+ - |
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cros_ec: ec@0 {
+ compatible = "google,cros-ec-spi";
+ reg = <0>;
+ interrupts = <15 0>;
+
+ kbd-led-backlight {
+ compatible = "google,cros-kbd-led-backlight";
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/clock/adi,axi-clkgen.yaml b/Documentation/devicetree/bindings/clock/adi,axi-clkgen.yaml
index 983033fe5b17..5e942bccf277 100644
--- a/Documentation/devicetree/bindings/clock/adi,axi-clkgen.yaml
+++ b/Documentation/devicetree/bindings/clock/adi,axi-clkgen.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/adi,axi-clkgen.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Binding for Analog Devices AXI clkgen pcore clock generator
+title: Analog Devices AXI clkgen pcore clock generator
maintainers:
- Lars-Peter Clausen <lars@metafoo.de>
diff --git a/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml b/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml
new file mode 100644
index 000000000000..79b0752faa91
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/airoha,en7523-scu.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/airoha,en7523-scu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: EN7523 Clock
+
+maintainers:
+ - Felix Fietkau <nbd@nbd.name>
+ - John Crispin <nbd@nbd.name>
+
+description: |
+ This node defines the System Control Unit of the EN7523 SoC,
+ a collection of registers configuring many different aspects of the SoC.
+
+ The clock driver uses it to read and configure settings of the
+ PLL controller, which provides clocks for the CPU, the bus and
+ other SoC internal peripherals.
+
+ Each clock is assigned an identifier and client nodes use this identifier
+ to specify which clock they consume.
+
+ All these identifiers can be found in:
+ [1]: <include/dt-bindings/clock/en7523-clk.h>.
+
+ The clocks are provided inside a system controller node.
+
+properties:
+ compatible:
+ items:
+ - const: airoha,en7523-scu
+
+ reg:
+ maxItems: 2
+
+ "#clock-cells":
+ description:
+ The first cell indicates the clock number, see [1] for available
+ clocks.
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/en7523-clk.h>
+ scu: system-controller@1fa20000 {
+ compatible = "airoha,en7523-scu";
+ reg = <0x1fa20000 0x400>,
+ <0x1fb00000 0x1000>;
+ #clock-cells = <1>;
+ };
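
A consumer selects one of the SCU's clocks with the identifier in its clocks
cell. The sketch below assumes EN7523_CLK_BUS is among the identifiers defined
in en7523-clk.h; the serial node itself is hypothetical:

    serial@1fbf0000 {
        compatible = "vendor,example-uart"; /* hypothetical consumer */
        reg = <0x1fbf0000 0x30>;
        clocks = <&scu EN7523_CLK_BUS>;
    };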
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml
index 558db4b6ed17..93587b700476 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ahb-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ahb-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 AHB Clock Device Tree Bindings
+title: Allwinner A10 AHB Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml
index b1e3d739beb2..e14e1aad9fd6 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb0-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb0-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 APB0 Bus Clock Device Tree Bindings
+title: Allwinner A10 APB0 Bus Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml
index 51b7a6d4ea54..8a4747ebe0ba 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-apb1-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-apb1-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 APB1 Bus Clock Device Tree Bindings
+title: Allwinner A10 APB1 Bus Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml
index d801158e15de..aa08dd49dd61 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-axi-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-axi-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 AXI Clock Device Tree Bindings
+title: Allwinner A10 AXI Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
index c4b7243ddcf2..1690b9d99c3d 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ccu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner Clock Control Unit Device Tree Bindings
+title: Allwinner Clock Control Unit
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -34,6 +34,8 @@ properties:
- allwinner,sun8i-v3-ccu
- allwinner,sun8i-v3s-ccu
- allwinner,sun9i-a80-ccu
+ - allwinner,sun20i-d1-ccu
+ - allwinner,sun20i-d1-r-ccu
- allwinner,sun50i-a64-ccu
- allwinner,sun50i-a64-r-ccu
- allwinner,sun50i-a100-ccu
@@ -79,6 +81,7 @@ if:
enum:
- allwinner,sun8i-a83t-r-ccu
- allwinner,sun8i-h3-r-ccu
+ - allwinner,sun20i-d1-r-ccu
- allwinner,sun50i-a64-r-ccu
- allwinner,sun50i-a100-r-ccu
- allwinner,sun50i-h6-r-ccu
@@ -99,6 +102,7 @@ else:
properties:
compatible:
enum:
+ - allwinner,sun20i-d1-ccu
- allwinner,sun50i-a100-ccu
- allwinner,sun50i-h6-ccu
- allwinner,sun50i-h616-ccu
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml
index 0dfafba1a168..08d073520cfa 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-cpu-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-cpu-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 CPU Clock Device Tree Bindings
+title: Allwinner A10 CPU Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml
index 7484a7ab7dea..e665e50c1785 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-display-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-display-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Display Clock Device Tree Bindings
+title: Allwinner A10 Display Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml
index 9a37a357cb4e..c4714d0fbe07 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-gates-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-gates-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Bus Gates Clock Device Tree Bindings
+title: Allwinner A10 Bus Gates Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml
index 18f131e262b4..e824e33489b6 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mbus-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mbus-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 MBUS Clock Device Tree Bindings
+title: Allwinner A10 MBUS Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml
index 5199285a661a..c612f94befb9 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mmc-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mmc-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Module 1 Clock Device Tree Bindings
+title: Allwinner A10 Module 1 Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml
index 3e2abe3e67c1..80ae3a7a588c 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod0-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mod0-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Module 0 Clock Device Tree Bindings
+title: Allwinner A10 Module 0 Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml
index 7ddb55c75cff..4f9a8d44d42a 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-mod1-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-mod1-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Module 1 Clock Device Tree Bindings
+title: Allwinner A10 Module 1 Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml
index c604822cda07..52a7b6e7124c 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-osc-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-osc-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Gatable Oscillator Clock Device Tree Bindings
+title: Allwinner A10 Gatable Oscillator Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml
index e5d9d45dab8a..b13a1f21d5da 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll1-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll1-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 CPU PLL Device Tree Bindings
+title: Allwinner A10 CPU PLL
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml
index 4b80a42fb3da..418d207d23b8 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll3-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll3-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Video PLL Device Tree Bindings
+title: Allwinner A10 Video PLL
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml
index 415bd77de53d..76ef3f0c7f2c 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll5-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll5-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 DRAM PLL Device Tree Bindings
+title: Allwinner A10 DRAM PLL
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml
index ec5652f76027..a94c93c90ece 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-pll6-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-pll6-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Peripheral PLL Device Tree Bindings
+title: Allwinner A10 Peripheral PLL
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml
index 0a335c615efd..6646b2a99fc1 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-tcon-ch0-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 TCON Channel 0 Clock Device Tree Bindings
+title: Allwinner A10 TCON Channel 0 Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml
index cd95d25bfe7c..5103b675e488 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-usb-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-usb-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 USB Clock Device Tree Bindings
+title: Allwinner A10 USB Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml
index 5dfd0c1c27b4..80337e38d6e5 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ve-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun4i-a10-ve-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Video Engine Clock Device Tree Bindings
+title: Allwinner A10 Video Engine Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml
index 99add7991c48..c6a6fbb6863b 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun5i-a13-ahb-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun5i-a13-ahb-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A13 AHB Clock Device Tree Bindings
+title: Allwinner A13 AHB Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml
index 5f377205af71..7d6a6a34d20c 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun6i-a31-pll6-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun6i-a31-pll6-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A31 Peripheral PLL Device Tree Bindings
+title: Allwinner A31 Peripheral PLL
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml
index 59e5dce1b65a..b6202de35707 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-gmac-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun7i-a20-gmac-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A20 GMAC TX Clock Device Tree Bindings
+title: Allwinner A20 GMAC TX Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml
index c745733bcf04..fde7f7dc3d34 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun7i-a20-out-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun7i-a20-out-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A20 Output Clock Device Tree Bindings
+title: Allwinner A20 Output Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml
index 3f995d2b30eb..70369bd633e4 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun8i-a83t-de2-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun8i-a83t-de2-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A83t Display Engine 2/3 Clock Controller Device Tree Bindings
+title: Allwinner A83t Display Engine 2/3 Clock Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -24,10 +24,13 @@ properties:
- const: allwinner,sun8i-v3s-de2-clk
- const: allwinner,sun50i-a64-de2-clk
- const: allwinner,sun50i-h5-de2-clk
- - const: allwinner,sun50i-h6-de2-clk
+ - const: allwinner,sun50i-h6-de3-clk
- items:
- const: allwinner,sun8i-r40-de2-clk
- const: allwinner,sun8i-h3-de2-clk
+ - items:
+ - const: allwinner,sun20i-d1-de2-clk
+ - const: allwinner,sun50i-h5-de2-clk
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml
index 3eb2bf65b230..45b9e2c7c1d1 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun8i-h3-bus-gates-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun8i-h3-bus-gates-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Bus Gates Clock Device Tree Bindings
+title: Allwinner A10 Bus Gates Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-ahb-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-ahb-clk.yaml
index d178da90aaec..f0f65af8ae22 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-ahb-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-ahb-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-ahb-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 AHB Clock Device Tree Bindings
+title: Allwinner A80 AHB Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-apb0-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-apb0-clk.yaml
index 0351c79bd221..e9f9bc8f5794 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-apb0-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-apb0-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-apb0-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 APB0 Bus Clock Device Tree Bindings
+title: Allwinner A80 APB0 Bus Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-cpus-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-cpus-clk.yaml
index 24d5b2f1a314..c48db2d49340 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-cpus-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-cpus-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-cpus-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 CPUS Clock Device Tree Bindings
+title: Allwinner A80 CPUS Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-de-clks.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-de-clks.yaml
index a82c7c7e942b..e9f81a343be1 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-de-clks.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-de-clks.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-de-clks.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 Display Engine Clock Controller Device Tree Bindings
+title: Allwinner A80 Display Engine Clock Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-gt-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-gt-clk.yaml
index 43963c3062c8..d3ce5eb18d4e 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-gt-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-gt-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-gt-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 GT Bus Clock Device Tree Bindings
+title: Allwinner A80 GT Bus Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-mmc-config-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-mmc-config-clk.yaml
index 20dc115fa211..65ee5afe83cc 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-mmc-config-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-mmc-config-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-mmc-config-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 MMC Configuration Clock Device Tree Bindings
+title: Allwinner A80 MMC Configuration Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-pll4-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-pll4-clk.yaml
index b76bab6a30e9..261264a8aef6 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-pll4-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-pll4-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-pll4-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 Peripheral PLL Device Tree Bindings
+title: Allwinner A80 Peripheral PLL
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-clks.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-clks.yaml
index 6532fb6821bc..515c15d5f661 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-clks.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-clks.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-usb-clks.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 USB Clock Controller Device Tree Bindings
+title: Allwinner A80 USB Clock Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-mod-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-mod-clk.yaml
index 15218d10e78e..3f7b8d9511f1 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-mod-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-mod-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-usb-mod-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 USB Module Clock Device Tree Bindings
+title: Allwinner A80 USB Module Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-phy-clk.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-phy-clk.yaml
index 2569041684e6..0d49072d47ca 100644
--- a/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-phy-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun9i-a80-usb-phy-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/allwinner,sun9i-a80-usb-phy-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 USB PHY Clock Device Tree Bindings
+title: Allwinner A80 USB PHY Clock
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/clock/amlogic,meson8-ddr-clkc.yaml b/Documentation/devicetree/bindings/clock/amlogic,meson8-ddr-clkc.yaml
index 4b8669f870ec..d98d95d8e8c9 100644
--- a/Documentation/devicetree/bindings/clock/amlogic,meson8-ddr-clkc.yaml
+++ b/Documentation/devicetree/bindings/clock/amlogic,meson8-ddr-clkc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/amlogic,meson8-ddr-clkc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Amlogic DDR Clock Controller Device Tree Bindings
+title: Amlogic DDR Clock Controller
maintainers:
- Martin Blumenstingl <martin.blumenstingl@googlemail.com>
diff --git a/Documentation/devicetree/bindings/clock/apple,nco.yaml b/Documentation/devicetree/bindings/clock/apple,nco.yaml
new file mode 100644
index 000000000000..8b8411dc42f6
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/apple,nco.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/apple,nco.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Apple SoCs' NCO block
+
+maintainers:
+ - Martin Povišer <povik+lin@cutebit.org>
+
+description: |
+ The NCO (Numerically Controlled Oscillator) block found on Apple SoCs
+ such as the t8103 (M1) is a programmable clock generator performing
+ fractional division of a high frequency input clock.
+
+ It carries a number of independent channels and is typically used for
+ generation of audio bitclocks.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - apple,t6000-nco
+ - apple,t8103-nco
+ - apple,t8112-nco
+ - const: apple,nco
+
+ clocks:
+ description:
+ Specifies the reference clock from which the output clocks
+ are derived through fractional division.
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - clocks
+ - '#clock-cells'
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ nco_clkref: clock-ref {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <900000000>;
+ clock-output-names = "nco-ref";
+ };
+
+ nco: clock-controller@3b044000 {
+ compatible = "apple,t8103-nco", "apple,nco";
+ reg = <0x3b044000 0x14000>;
+ #clock-cells = <1>;
+ clocks = <&nco_clkref>;
+ };
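
Since #clock-cells is 1, each consumer selects an NCO channel by index. A
hypothetical audio controller taking its bit clock from channel 0 could look
like this sketch (node name, compatible and addresses are assumptions):

    audio-controller@3b500000 {
        compatible = "vendor,example-audio"; /* hypothetical consumer */
        reg = <0x3b500000 0x4000>;
        clocks = <&nco 0>;
        clock-names = "bitclk";
    };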
diff --git a/Documentation/devicetree/bindings/clock/arm,syscon-icst.yaml b/Documentation/devicetree/bindings/clock/arm,syscon-icst.yaml
index 118c5543e037..b5533f81307c 100644
--- a/Documentation/devicetree/bindings/clock/arm,syscon-icst.yaml
+++ b/Documentation/devicetree/bindings/clock/arm,syscon-icst.yaml
@@ -69,6 +69,10 @@ properties:
- arm,impd1-vco1
- arm,impd1-vco2
+ reg:
+ maxItems: 1
+ description: The VCO register
+
clocks:
description: Parent clock for the ICST VCO
maxItems: 1
@@ -77,12 +81,13 @@ properties:
maxItems: 1
lock-offset:
- $ref: '/schemas/types.yaml#/definitions/uint32'
+ $ref: /schemas/types.yaml#/definitions/uint32
description: Offset to the unlocking register for the oscillator
vco-offset:
- $ref: '/schemas/types.yaml#/definitions/uint32'
+ $ref: /schemas/types.yaml#/definitions/uint32
description: Offset to the VCO register for the oscillator
+ deprecated: true
required:
- "#clock-cells"
diff --git a/Documentation/devicetree/bindings/clock/bitmain,bm1880-clk.yaml b/Documentation/devicetree/bindings/clock/bitmain,bm1880-clk.yaml
index 228c9313df53..f0f9392470a6 100644
--- a/Documentation/devicetree/bindings/clock/bitmain,bm1880-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/bitmain,bm1880-clk.yaml
@@ -61,16 +61,4 @@ examples:
#clock-cells = <1>;
};
- # Example UART controller node that consumes clock generated by the clock controller:
- - |
- uart0: serial@58018000 {
- compatible = "snps,dw-apb-uart";
- reg = <0x58018000 0x2000>;
- clocks = <&clk 45>, <&clk 46>;
- clock-names = "baudclk", "apb_pclk";
- interrupts = <0 9 4>;
- reg-shift = <2>;
- reg-io-width = <4>;
- };
-
...
diff --git a/Documentation/devicetree/bindings/clock/brcm,bcm2711-dvp.yaml b/Documentation/devicetree/bindings/clock/brcm,bcm2711-dvp.yaml
index 08543ecbe35b..2d40df2d34df 100644
--- a/Documentation/devicetree/bindings/clock/brcm,bcm2711-dvp.yaml
+++ b/Documentation/devicetree/bindings/clock/brcm,bcm2711-dvp.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/brcm,bcm2711-dvp.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM2711 HDMI DVP Device Tree Bindings
+title: Broadcom BCM2711 HDMI DVP
maintainers:
- Maxime Ripard <mripard@kernel.org>
diff --git a/Documentation/devicetree/bindings/clock/brcm,bcm63268-timer-clocks.yaml b/Documentation/devicetree/bindings/clock/brcm,bcm63268-timer-clocks.yaml
new file mode 100644
index 000000000000..199818b2fb6d
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/brcm,bcm63268-timer-clocks.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/brcm,bcm63268-timer-clocks.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom BCM63268 Timer Clock and Reset Device Tree Bindings
+
+maintainers:
+ - Álvaro Fernández Rojas <noltari@gmail.com>
+
+properties:
+ compatible:
+ const: brcm,bcm63268-timer-clocks
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ timer_clk: clock-controller@100000ac {
+ compatible = "brcm,bcm63268-timer-clocks";
+ reg = <0x100000ac 0x4>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
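
Because the controller exposes both #clock-cells and #reset-cells, a consumer
can take a gated clock and a reset line from the same node. The indices below
are illustrative only, not taken from the BCM63268 binding headers:

    timer@100000c0 {
        compatible = "vendor,example-timer"; /* hypothetical consumer */
        reg = <0x100000c0 0x1c>;
        clocks = <&timer_clk 0>;
        resets = <&timer_clk 1>;
    };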
diff --git a/Documentation/devicetree/bindings/clock/calxeda.yaml b/Documentation/devicetree/bindings/clock/calxeda.yaml
index a34cbf3c9aaf..a88fbe20fef1 100644
--- a/Documentation/devicetree/bindings/clock/calxeda.yaml
+++ b/Documentation/devicetree/bindings/clock/calxeda.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/calxeda.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Device Tree Clock bindings for Calxeda highbank platform
+title: Calxeda highbank platform Clock Controller
description: |
This binding covers the Calxeda SoC internal peripheral and bus clocks
diff --git a/Documentation/devicetree/bindings/clock/canaan,k210-clk.yaml b/Documentation/devicetree/bindings/clock/canaan,k210-clk.yaml
index 7f5cf4001f76..998e5cce652f 100644
--- a/Documentation/devicetree/bindings/clock/canaan,k210-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/canaan,k210-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/canaan,k210-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Canaan Kendryte K210 Clock Device Tree Bindings
+title: Canaan Kendryte K210 Clock
maintainers:
- Damien Le Moal <damien.lemoal@wdc.com>
diff --git a/Documentation/devicetree/bindings/clock/cirrus,cs2000-cp.yaml b/Documentation/devicetree/bindings/clock/cirrus,cs2000-cp.yaml
new file mode 100644
index 000000000000..d416c374e853
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/cirrus,cs2000-cp.yaml
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/cirrus,cs2000-cp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: CIRRUS LOGIC Fractional-N Clock Synthesizer & Clock Multiplier
+
+maintainers:
+ - Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
+
+description: |
+ The CS2000-CP is an extremely versatile system clocking device that
+ utilizes a programmable phase-locked loop.
+
+ Link: https://www.cirrus.com/products/cs2000/
+
+properties:
+ compatible:
+ enum:
+ - cirrus,cs2000-cp
+
+ clocks:
+ description:
+ Common clock binding for CLK_IN, XTI/REF_CLK
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: clk_in
+ - const: ref_clk
+
+ '#clock-cells':
+ const: 0
+
+ reg:
+ maxItems: 1
+
+ cirrus,aux-output-source:
+ description:
+ Specifies the function of the auxiliary clock output pin
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum:
+ - 0 # CS2000CP_AUX_OUTPUT_REF_CLK: ref_clk input
+ - 1 # CS2000CP_AUX_OUTPUT_CLK_IN: clk_in input
+ - 2 # CS2000CP_AUX_OUTPUT_CLK_OUT: clk_out output
+ - 3 # CS2000CP_AUX_OUTPUT_PLL_LOCK: pll lock status
+ default: 0
+
+ cirrus,clock-skip:
+ description:
+ This mode allows the PLL to maintain lock even when CLK_IN
+ has missing pulses for up to 20 ms.
+ $ref: /schemas/types.yaml#/definitions/flag
+
+ cirrus,dynamic-mode:
+ description:
+ In dynamic mode, the CLK_IN input is used to drive the
+ digital PLL of the silicon.
+ If not given, the static mode shall be used to derive the
+ output signal directly from the REF_CLK input.
+ $ref: /schemas/types.yaml#/definitions/flag
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/cirrus,cs2000-cp.h>
+
+ i2c@0 {
+ reg = <0x0 0x100>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ clock-controller@4f {
+ #clock-cells = <0>;
+ compatible = "cirrus,cs2000-cp";
+ reg = <0x4f>;
+ clocks = <&rcar_sound 0>, <&x12_clk>;
+ clock-names = "clk_in", "ref_clk";
+ cirrus,aux-output-source = <CS2000CP_AUX_OUTPUT_CLK_OUT>;
+ };
+ };
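
When CLK_IN rather than REF_CLK should drive the PLL, the cirrus,dynamic-mode
flag from the schema above is added; a minimal variation of the example,
otherwise unchanged:

    clock-controller@4f {
        #clock-cells = <0>;
        compatible = "cirrus,cs2000-cp";
        reg = <0x4f>;
        clocks = <&rcar_sound 0>, <&x12_clk>;
        clock-names = "clk_in", "ref_clk";
        cirrus,dynamic-mode;
    };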
diff --git a/Documentation/devicetree/bindings/clock/clock-bindings.txt b/Documentation/devicetree/bindings/clock/clock-bindings.txt
index f2ea53832ac6..6fe541368889 100644
--- a/Documentation/devicetree/bindings/clock/clock-bindings.txt
+++ b/Documentation/devicetree/bindings/clock/clock-bindings.txt
@@ -1,186 +1,2 @@
-This binding is a work-in-progress, and are based on some experimental
-work by benh[1].
-
-Sources of clock signal can be represented by any node in the device
-tree. Those nodes are designated as clock providers. Clock consumer
-nodes use a phandle and clock specifier pair to connect clock provider
-outputs to clock inputs. Similar to the gpio specifiers, a clock
-specifier is an array of zero, one or more cells identifying the clock
-output on a device. The length of a clock specifier is defined by the
-value of a #clock-cells property in the clock provider node.
-
-[1] https://patchwork.ozlabs.org/patch/31551/
-
-==Clock providers==
-
-Required properties:
-#clock-cells: Number of cells in a clock specifier; Typically 0 for nodes
- with a single clock output and 1 for nodes with multiple
- clock outputs.
-
-Optional properties:
-clock-output-names: Recommended to be a list of strings of clock output signal
- names indexed by the first cell in the clock specifier.
- However, the meaning of clock-output-names is domain
- specific to the clock provider, and is only provided to
- encourage using the same meaning for the majority of clock
- providers. This format may not work for clock providers
- using a complex clock specifier format. In those cases it
- is recommended to omit this property and create a binding
- specific names property.
-
- Clock consumer nodes must never directly reference
- the provider's clock-output-names property.
-
-For example:
-
- oscillator {
- #clock-cells = <1>;
- clock-output-names = "ckil", "ckih";
- };
-
-- this node defines a device with two clock outputs, the first named
- "ckil" and the second named "ckih". Consumer nodes always reference
- clocks by index. The names should reflect the clock output signal
- names for the device.
-
-clock-indices: If the identifying number for the clocks in the node
- is not linear from zero, then this allows the mapping of
- identifiers into the clock-output-names array.
-
-For example, if we have two clocks <&oscillator 1> and <&oscillator 3>:
-
- oscillator {
- compatible = "myclocktype";
- #clock-cells = <1>;
- clock-indices = <1>, <3>;
- clock-output-names = "clka", "clkb";
- }
-
- This ensures we do not have any empty strings in clock-output-names
-
-
-==Clock consumers==
-
-Required properties:
-clocks: List of phandle and clock specifier pairs, one pair
- for each clock input to the device. Note: if the
- clock provider specifies '0' for #clock-cells, then
- only the phandle portion of the pair will appear.
-
-Optional properties:
-clock-names: List of clock input name strings sorted in the same
- order as the clocks property. Consumers drivers
- will use clock-names to match clock input names
- with clocks specifiers.
-clock-ranges: Empty property indicating that child nodes can inherit named
- clocks from this node. Useful for bus nodes to provide a
- clock to their children.
-
-For example:
-
- device {
- clocks = <&osc 1>, <&ref 0>;
- clock-names = "baud", "register";
- };
-
-
-This represents a device with two clock inputs, named "baud" and "register".
-The baud clock is connected to output 1 of the &osc device, and the register
-clock is connected to output 0 of the &ref.
-
-==Example==
-
- /* external oscillator */
- osc: oscillator {
- compatible = "fixed-clock";
- #clock-cells = <0>;
- clock-frequency = <32678>;
- clock-output-names = "osc";
- };
-
- /* phase-locked-loop device, generates a higher frequency clock
- * from the external oscillator reference */
- pll: pll@4c000 {
- compatible = "vendor,some-pll-interface"
- #clock-cells = <1>;
- clocks = <&osc 0>;
- clock-names = "ref";
- reg = <0x4c000 0x1000>;
- clock-output-names = "pll", "pll-switched";
- };
-
- /* UART, using the low frequency oscillator for the baud clock,
- * and the high frequency switched PLL output for register
- * clocking */
- uart@a000 {
- compatible = "fsl,imx-uart";
- reg = <0xa000 0x1000>;
- interrupts = <33>;
- clocks = <&osc 0>, <&pll 1>;
- clock-names = "baud", "register";
- };
-
-This DT fragment defines three devices: an external oscillator to provide a
-low-frequency reference clock, a PLL device to generate a higher frequency
-clock signal, and a UART.
-
-* The oscillator is fixed-frequency, and provides one clock output, named "osc".
-* The PLL is both a clock provider and a clock consumer. It uses the clock
- signal generated by the external oscillator, and provides two output signals
- ("pll" and "pll-switched").
-* The UART has its baud clock connected the external oscillator and its
- register clock connected to the PLL clock (the "pll-switched" signal)
-
-==Assigned clock parents and rates==
-
-Some platforms may require initial configuration of default parent clocks
-and clock frequencies. Such a configuration can be specified in a device tree
-node through assigned-clocks, assigned-clock-parents and assigned-clock-rates
-properties. The assigned-clock-parents property should contain a list of parent
-clocks in the form of a phandle and clock specifier pair and the
-assigned-clock-rates property should contain a list of frequencies in Hz. Both
-these properties should correspond to the clocks listed in the assigned-clocks
-property.
-
-To skip setting parent or rate of a clock its corresponding entry should be
-set to 0, or can be omitted if it is not followed by any non-zero entry.
-
- uart@a000 {
- compatible = "fsl,imx-uart";
- reg = <0xa000 0x1000>;
- ...
- clocks = <&osc 0>, <&pll 1>;
- clock-names = "baud", "register";
-
- assigned-clocks = <&clkcon 0>, <&pll 2>;
- assigned-clock-parents = <&pll 2>;
- assigned-clock-rates = <0>, <460800>;
- };
-
-In this example the <&pll 2> clock is set as parent of clock <&clkcon 0> and
-the <&pll 2> clock is assigned a frequency value of 460800 Hz.
-
-Configuring a clock's parent and rate through the device node that consumes
-the clock can be done only for clocks that have a single user. Specifying
-conflicting parent or rate configuration in multiple consumer nodes for
-a shared clock is forbidden.
-
-Configuration of common clocks, which affect multiple consumer devices can
-be similarly specified in the clock provider node.
-
-==Protected clocks==
-
-Some platforms or firmwares may not fully expose all the clocks to the OS, such
-as in situations where those clks are used by drivers running in ARM secure
-execution levels. Such a configuration can be specified in device tree with the
-protected-clocks property in the form of a clock specifier list. This property should
-only be specified in the node that is providing the clocks being protected:
-
- clock-controller@a000f000 {
- compatible = "vendor,clk95;
- reg = <0xa000f000 0x1000>
- #clocks-cells = <1>;
- ...
- protected-clocks = <UART3_CLK>, <SPI5_CLK>;
- };
+This file has moved to the clock binding schema:
+https://github.com/devicetree-org/dt-schema/blob/main/dtschema/schemas/clock/clock.yaml
diff --git a/Documentation/devicetree/bindings/clock/cs2000-cp.txt b/Documentation/devicetree/bindings/clock/cs2000-cp.txt
deleted file mode 100644
index 54e6df0bee8a..000000000000
--- a/Documentation/devicetree/bindings/clock/cs2000-cp.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-CIRRUS LOGIC Fractional-N Clock Synthesizer & Clock Multiplier
-
-Required properties:
-
-- compatible: "cirrus,cs2000-cp"
-- reg: The I2C device address
-- clocks: common clock binding for CLK_IN, XTI/REF_CLK
-- clock-names: CLK_IN : clk_in, XTI/REF_CLK : ref_clk
-- #clock-cells: must be <0>
-
-Example:
-
-&i2c2 {
- ...
- cs2000: clk_multiplier@4f {
- #clock-cells = <0>;
- compatible = "cirrus,cs2000-cp";
- reg = <0x4f>;
- clocks = <&rcar_sound 0>, <&x12_clk>;
- clock-names = "clk_in", "ref_clk";
- };
-};
diff --git a/Documentation/devicetree/bindings/clock/efm32-clock.txt b/Documentation/devicetree/bindings/clock/efm32-clock.txt
deleted file mode 100644
index 263d293f6a10..000000000000
--- a/Documentation/devicetree/bindings/clock/efm32-clock.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-* Clock bindings for Energy Micro efm32 Giant Gecko's Clock Management Unit
-
-Required properties:
-- compatible: Should be "efm32gg,cmu"
-- reg: Base address and length of the register set
-- interrupts: Interrupt used by the CMU
-- #clock-cells: Should be <1>
-
-The clock consumer should specify the desired clock by having the clock ID in
-its "clocks" phandle cell. The header efm32-clk.h contains a list of available
-IDs.
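-
-For example, a consumer node might reference one of these clocks as below
-(a sketch; the node, its address and the clock ID macro are illustrative
-assumptions):
-
- uart@4000e000 {
- compatible = "energymicro,efm32-uart";
- reg = <0x4000e000 0x400>;
- /* clock ID macro from efm32-clk.h is assumed */
- clocks = <&cmu clk_HFPERCLKUART0>;
- };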
diff --git a/Documentation/devicetree/bindings/clock/exynos5260-clock.txt b/Documentation/devicetree/bindings/clock/exynos5260-clock.txt
deleted file mode 100644
index c79d31f7f66e..000000000000
--- a/Documentation/devicetree/bindings/clock/exynos5260-clock.txt
+++ /dev/null
@@ -1,190 +0,0 @@
-* Samsung Exynos5260 Clock Controller
-
-Exynos5260 has 13 clock controllers which are instantiated
-independently from the device-tree. These clock controllers
-generate and supply clocks to various hardware blocks within
-the SoC.
-
-Each clock is assigned an identifier and client nodes can use
-this identifier to specify the clock which they consume. All
-available clocks are defined as preprocessor macros in
-dt-bindings/clock/exynos5260-clk.h header and can be used in
-device tree sources.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It
-is expected that they are defined using standard clock bindings
-with the following clock-output-names (see the sketch after this list):
-
- - "fin_pll" - PLL input clock from XXTI
- - "xrtcxti" - input clock from XRTCXTI
- - "ioclk_pcm_extclk" - pcm external operation clock
- - "ioclk_spdif_extclk" - spdif external operation clock
- - "ioclk_i2s_cdclk" - i2s0 codec clock
-
-Phy clocks:
-
-There are several clocks which are generated by specific PHYs.
-These clocks are fed into the clock controller and then routed to
-the hardware blocks. These clocks are defined as fixed clocks in the
-driver with following names:
-
- - "phyclk_dptx_phy_ch3_txd_clk" - dp phy clock for channel 3
- - "phyclk_dptx_phy_ch2_txd_clk" - dp phy clock for channel 2
- - "phyclk_dptx_phy_ch1_txd_clk" - dp phy clock for channel 1
- - "phyclk_dptx_phy_ch0_txd_clk" - dp phy clock for channel 0
- - "phyclk_hdmi_phy_tmds_clko" - hdmi phy tmds clock
- - "phyclk_hdmi_phy_pixel_clko" - hdmi phy pixel clock
- - "phyclk_hdmi_link_o_tmds_clkhi" - hdmi phy for hdmi link
- - "phyclk_dptx_phy_o_ref_clk_24m" - dp phy reference clock
- - "phyclk_dptx_phy_clk_div2"
- - "phyclk_mipi_dphy_4l_m_rxclkesc0"
- - "phyclk_usbhost20_phy_phyclock" - usb 2.0 phy clock
- - "phyclk_usbhost20_phy_freeclk"
- - "phyclk_usbhost20_phy_clk48mohci"
- - "phyclk_usbdrd30_udrd30_pipe_pclk"
- - "phyclk_usbdrd30_udrd30_phyclock" - usb 3.0 phy clock
-
-Required Properties for Clock Controller:
-
- - compatible: should be one of the following.
- 1) "samsung,exynos5260-clock-top"
- 2) "samsung,exynos5260-clock-peri"
- 3) "samsung,exynos5260-clock-egl"
- 4) "samsung,exynos5260-clock-kfc"
- 5) "samsung,exynos5260-clock-g2d"
- 6) "samsung,exynos5260-clock-mif"
- 7) "samsung,exynos5260-clock-mfc"
- 8) "samsung,exynos5260-clock-g3d"
- 9) "samsung,exynos5260-clock-fsys"
- 10) "samsung,exynos5260-clock-aud"
- 11) "samsung,exynos5260-clock-isp"
- 12) "samsung,exynos5260-clock-gscl"
- 13) "samsung,exynos5260-clock-disp"
-
- - reg: physical base address of the controller and the length of
- memory mapped region.
-
- - #clock-cells: should be 1.
-
- - clocks: list of clock identifiers which are fed as the input to
- the given clock controller. Please refer to the next section to find
- the input clocks for a given controller.
-
- - clock-names: list of names of clocks which are fed as the input
- to the given clock controller.
-
-Input clocks for top clock controller:
- - fin_pll
- - dout_mem_pll
- - dout_bus_pll
- - dout_media_pll
-
-Input clocks for peri clock controller:
- - fin_pll
- - ioclk_pcm_extclk
- - ioclk_i2s_cdclk
- - ioclk_spdif_extclk
- - phyclk_hdmi_phy_ref_cko
- - dout_aclk_peri_66
- - dout_sclk_peri_uart0
- - dout_sclk_peri_uart1
- - dout_sclk_peri_uart2
- - dout_sclk_peri_spi0_b
- - dout_sclk_peri_spi1_b
- - dout_sclk_peri_spi2_b
- - dout_aclk_peri_aud
- - dout_sclk_peri_spi0_b
-
-Input clocks for egl clock controller:
- - fin_pll
- - dout_bus_pll
-
-Input clocks for kfc clock controller:
- - fin_pll
- - dout_media_pll
-
-Input clocks for g2d clock controller:
- - fin_pll
- - dout_aclk_g2d_333
-
-Input clocks for mif clock controller:
- - fin_pll
-
-Input clocks for mfc clock controller:
- - fin_pll
- - dout_aclk_mfc_333
-
-Input clocks for g3d clock controller:
- - fin_pll
-
-Input clocks for fsys clock controller:
- - fin_pll
- - phyclk_usbhost20_phy_phyclock
- - phyclk_usbhost20_phy_freeclk
- - phyclk_usbhost20_phy_clk48mohci
- - phyclk_usbdrd30_udrd30_pipe_pclk
- - phyclk_usbdrd30_udrd30_phyclock
- - dout_aclk_fsys_200
-
-Input clocks for aud clock controller:
- - fin_pll
- - fout_aud_pll
- - ioclk_i2s_cdclk
- - ioclk_pcm_extclk
-
-Input clocks for isp clock controller:
- - fin_pll
- - dout_aclk_isp1_266
- - dout_aclk_isp1_400
- - mout_aclk_isp1_266
-
-Input clocks for gscl clock controller:
- - fin_pll
- - dout_aclk_gscl_400
- - dout_aclk_gscl_333
-
-Input clocks for disp clock controller:
- - fin_pll
- - phyclk_dptx_phy_ch3_txd_clk
- - phyclk_dptx_phy_ch2_txd_clk
- - phyclk_dptx_phy_ch1_txd_clk
- - phyclk_dptx_phy_ch0_txd_clk
- - phyclk_hdmi_phy_tmds_clko
- - phyclk_hdmi_phy_ref_clko
- - phyclk_hdmi_phy_pixel_clko
- - phyclk_hdmi_link_o_tmds_clkhi
- - phyclk_mipi_dphy_4l_m_txbyte_clkhs
- - phyclk_dptx_phy_o_ref_clk_24m
- - phyclk_dptx_phy_clk_div2
- - phyclk_mipi_dphy_4l_m_rxclkesc0
- - phyclk_hdmi_phy_ref_cko
- - ioclk_spdif_extclk
- - dout_aclk_peri_aud
- - dout_aclk_disp_222
- - dout_sclk_disp_pixel
- - dout_aclk_disp_333
-
-Example 1: An example of a clock controller node is listed below.
-
- clock_mfc: clock-controller@11090000 {
- compatible = "samsung,exynos5260-clock-mfc";
- clocks = <&fin_pll>, <&clock_top TOP_DOUT_ACLK_MFC_333>;
- clock-names = "fin_pll", "dout_aclk_mfc_333";
- reg = <0x11090000 0x10000>;
- #clock-cells = <1>;
- };
-
-Example 2: UART controller node that consumes the clock generated by the
- peri clock controller. Refer to the standard clock bindings for
- information about 'clocks' and 'clock-names' properties.
-
- serial@12c00000 {
- compatible = "samsung,exynos4210-uart";
- reg = <0x12C00000 0x100>;
- interrupts = <0 146 0>;
- clocks = <&clock_peri PERI_PCLK_UART0>, <&clock_peri PERI_SCLK_UART0>;
- clock-names = "uart", "clk_uart_baud0";
- };
-
diff --git a/Documentation/devicetree/bindings/clock/exynos5410-clock.txt b/Documentation/devicetree/bindings/clock/exynos5410-clock.txt
deleted file mode 100644
index 217beb27c30e..000000000000
--- a/Documentation/devicetree/bindings/clock/exynos5410-clock.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-* Samsung Exynos5410 Clock Controller
-
-The Exynos5410 clock controller generates and supplies clocks to various
-controllers within the Exynos5410 SoC.
-
-Required Properties:
-
-- compatible: should be "samsung,exynos5410-clock"
-
-- reg: physical base address of the controller and length of memory mapped
- region.
-
-- #clock-cells: should be 1.
-
-- clocks: should contain an entry specifying the root clock from external
- oscillator supplied through XXTI or XusbXTI pin. This clock should be
- defined using standard clock bindings with "fin_pll" clock-output-name.
- That clock is passed internally to the 9 PLLs.
-
-All available clocks are defined as preprocessor macros in
-dt-bindings/clock/exynos5410.h header and can be used in device
-tree sources.
-
-Example 1: An example of a clock controller node is listed below.
-
- fin_pll: xxti {
- compatible = "fixed-clock";
- clock-frequency = <24000000>;
- clock-output-names = "fin_pll";
- #clock-cells = <0>;
- };
-
- clock: clock-controller@10010000 {
- compatible = "samsung,exynos5410-clock";
- reg = <0x10010000 0x30000>;
- #clock-cells = <1>;
- clocks = <&fin_pll>;
- };
-
-Example 2: UART controller node that consumes the clock generated by the clock
- controller. Refer to the standard clock bindings for information
- about 'clocks' and 'clock-names' properties.
-
- serial@12c20000 {
- compatible = "samsung,exynos4210-uart";
- reg = <0x12C20000 0x100>;
- interrupts = <0 51 0>;
- clocks = <&clock CLK_UART0>, <&clock CLK_SCLK_UART0>;
- clock-names = "uart", "clk_uart_baud0";
- };
diff --git a/Documentation/devicetree/bindings/clock/exynos5433-clock.txt b/Documentation/devicetree/bindings/clock/exynos5433-clock.txt
deleted file mode 100644
index 183c327a7d6b..000000000000
--- a/Documentation/devicetree/bindings/clock/exynos5433-clock.txt
+++ /dev/null
@@ -1,507 +0,0 @@
-* Samsung Exynos5433 CMU (Clock Management Units)
-
-The Exynos5433 clock controller generates and supplies clocks to various
-controllers within the Exynos5433 SoC.
-
-Required Properties:
-
-- compatible: should be one of the following.
- - "samsung,exynos5433-cmu-top" - clock controller compatible for CMU_TOP
- which generates clocks for IMEM/FSYS/G3D/GSCL/HEVC/MSCL/G2D/MFC/PERIC/PERIS
- domains and bus clocks.
- - "samsung,exynos5433-cmu-cpif" - clock controller compatible for CMU_CPIF
- which generates clocks for LLI (Low Latency Interface) IP.
- - "samsung,exynos5433-cmu-mif" - clock controller compatible for CMU_MIF
- which generates clocks for DRAM Memory Controller domain.
- - "samsung,exynos5433-cmu-peric" - clock controller compatible for CMU_PERIC
- which generates clocks for UART/I2C/SPI/I2S/PCM/SPDIF/PWM/SLIMBUS IPs.
- - "samsung,exynos5433-cmu-peris" - clock controller compatible for CMU_PERIS
- which generates clocks for PMU/TMU/MCT/WDT/RTC/SECKEY/TZPC IPs.
- - "samsung,exynos5433-cmu-fsys" - clock controller compatible for CMU_FSYS
- which generates clocks for USB/UFS/SDMMC/TSI/PDMA IPs.
- - "samsung,exynos5433-cmu-g2d" - clock controller compatible for CMU_G2D
- which generates clocks for G2D/MDMA IPs.
- - "samsung,exynos5433-cmu-disp" - clock controller compatible for CMU_DISP
- which generates clocks for Display (DECON/HDMI/DSIM/MIXER) IPs.
- - "samsung,exynos5433-cmu-aud" - clock controller compatible for CMU_AUD
- which generates clocks for Cortex-A5/BUS/AUDIO clocks.
- - "samsung,exynos5433-cmu-bus0", "samsung,exynos5433-cmu-bus1"
- and "samsung,exynos5433-cmu-bus2" - clock controller compatible for CMU_BUS
- which generates global data buses clock and global peripheral buses clock.
- - "samsung,exynos5433-cmu-g3d" - clock controller compatible for CMU_G3D
- which generates clocks for 3D Graphics Engine IP.
- - "samsung,exynos5433-cmu-gscl" - clock controller compatible for CMU_GSCL
- which generates clocks for GSCALER IPs.
- - "samsung,exynos5433-cmu-apollo"- clock controller compatible for CMU_APOLLO
- which generates clocks for Cortex-A53 Quad-core processor.
- - "samsung,exynos5433-cmu-atlas" - clock controller compatible for CMU_ATLAS
- which generates clocks for Cortex-A57 Quad-core processor, CoreSight and
- L2 cache controller.
- - "samsung,exynos5433-cmu-mscl" - clock controller compatible for CMU_MSCL
- which generates clocks for M2M (Memory to Memory) scaler and JPEG IPs.
- - "samsung,exynos5433-cmu-mfc" - clock controller compatible for CMU_MFC
- which generates clocks for MFC(Multi-Format Codec) IP.
- - "samsung,exynos5433-cmu-hevc" - clock controller compatible for CMU_HEVC
- which generates clocks for HEVC(High Efficiency Video Codec) decoder IP.
- - "samsung,exynos5433-cmu-isp" - clock controller compatible for CMU_ISP
- which generates clocks for FIMC-ISP/DRC/SCLC/DIS/3DNR IPs.
- - "samsung,exynos5433-cmu-cam0" - clock controller compatible for CMU_CAM0
- which generates clocks for MIPI_CSIS{0|1}/FIMC_LITE_{A|B|D}/FIMC_3AA{0|1}
- IPs.
- - "samsung,exynos5433-cmu-cam1" - clock controller compatible for CMU_CAM1
- which generates clocks for Cortex-A5/MIPI_CSIS2/FIMC-LITE_C/FIMC-FD IPs.
- - "samsung,exynos5433-cmu-imem" - clock controller compatible for CMU_IMEM
- which generates clocks for SSS (Security SubSystem) and SlimSSS IPs.
-
-- reg: physical base address of the controller and length of memory mapped
- region.
-
-- #clock-cells: should be 1.
-
-- clocks: list of the clock controller input clock identifiers,
- from common clock bindings. Please refer to the next section
- to find the input clocks for a given controller.
-
-- clock-names: list of the clock controller input clock names,
- as described in clock-bindings.txt.
-
- Input clocks for top clock controller:
- - oscclk
- - sclk_mphy_pll
- - sclk_mfc_pll
- - sclk_bus_pll
-
- Input clocks for cpif clock controller:
- - oscclk
-
- Input clocks for mif clock controller:
- - oscclk
- - sclk_mphy_pll
-
- Input clocks for fsys clock controller:
- - oscclk
- - sclk_ufs_mphy
- - aclk_fsys_200
- - sclk_pcie_100_fsys
- - sclk_ufsunipro_fsys
- - sclk_mmc2_fsys
- - sclk_mmc1_fsys
- - sclk_mmc0_fsys
- - sclk_usbhost30_fsys
- - sclk_usbdrd30_fsys
-
- Input clocks for g2d clock controller:
- - oscclk
- - aclk_g2d_266
- - aclk_g2d_400
-
- Input clocks for disp clock controller:
- - oscclk
- - sclk_dsim1_disp
- - sclk_dsim0_disp
- - sclk_dsd_disp
- - sclk_decon_tv_eclk_disp
- - sclk_decon_vclk_disp
- - sclk_decon_eclk_disp
- - sclk_decon_tv_vclk_disp
- - aclk_disp_333
-
- Input clocks for audio clock controller:
- - oscclk
- - fout_aud_pll
-
- Input clocks for bus0 clock controller:
- - aclk_bus0_400
-
- Input clocks for bus1 clock controller:
- - aclk_bus1_400
-
- Input clocks for bus2 clock controller:
- - oscclk
- - aclk_bus2_400
-
- Input clocks for g3d clock controller:
- - oscclk
- - aclk_g3d_400
-
- Input clocks for gscl clock controller:
- - oscclk
- - aclk_gscl_111
- - aclk_gscl_333
-
- Input clocks for apollo clock controller:
- - oscclk
- - sclk_bus_pll_apollo
-
- Input clocks for atlas clock controller:
- - oscclk
- - sclk_bus_pll_atlas
-
- Input clocks for mscl clock controller:
- - oscclk
- - sclk_jpeg_mscl
- - aclk_mscl_400
-
- Input clocks for mfc clock controller:
- - oscclk
- - aclk_mfc_400
-
- Input clocks for hevc clock controller:
- - oscclk
- - aclk_hevc_400
-
- Input clocks for isp clock controller:
- - oscclk
- - aclk_isp_dis_400
- - aclk_isp_400
-
- Input clocks for cam0 clock controller:
- - oscclk
- - aclk_cam0_333
- - aclk_cam0_400
- - aclk_cam0_552
-
- Input clocks for cam1 clock controller:
- - oscclk
- - sclk_isp_uart_cam1
- - sclk_isp_spi1_cam1
- - sclk_isp_spi0_cam1
- - aclk_cam1_333
- - aclk_cam1_400
- - aclk_cam1_552
-
- Input clocks for imem clock controller:
- - oscclk
- - aclk_imem_sssx_266
- - aclk_imem_266
- - aclk_imem_200
-
-Optional properties:
- - power-domains: a phandle to respective power domain node as described by
- generic PM domain bindings (see power/power_domain.txt for more
- information).
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume.
-
-All available clocks are defined as preprocessor macros in
-dt-bindings/clock/exynos5433.h header and can be used in device
-tree sources.
-
-Example 1: An example of the 'oscclk' source clock node is listed below.
-
- xxti: xxti {
- compatible = "fixed-clock";
- clock-output-names = "oscclk";
- #clock-cells = <0>;
- };
-
-Example 2: Examples of clock controller nodes are listed below.
-
- cmu_top: clock-controller@10030000 {
- compatible = "samsung,exynos5433-cmu-top";
- reg = <0x10030000 0x0c04>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "sclk_mphy_pll",
- "sclk_mfc_pll",
- "sclk_bus_pll";
- clocks = <&xxti>,
- <&cmu_cpif CLK_SCLK_MPHY_PLL>,
- <&cmu_mif CLK_SCLK_MFC_PLL>,
- <&cmu_mif CLK_SCLK_BUS_PLL>;
- };
-
- cmu_cpif: clock-controller@10fc0000 {
- compatible = "samsung,exynos5433-cmu-cpif";
- reg = <0x10fc0000 0x0c04>;
- #clock-cells = <1>;
-
- clock-names = "oscclk";
- clocks = <&xxti>;
- };
-
- cmu_mif: clock-controller@105b0000 {
- compatible = "samsung,exynos5433-cmu-mif";
- reg = <0x105b0000 0x100c>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "sclk_mphy_pll";
- clocks = <&xxti>,
- <&cmu_cpif CLK_SCLK_MPHY_PLL>;
- };
-
- cmu_peric: clock-controller@14c80000 {
- compatible = "samsung,exynos5433-cmu-peric";
- reg = <0x14c80000 0x0b08>;
- #clock-cells = <1>;
- };
-
- cmu_peris: clock-controller@10040000 {
- compatible = "samsung,exynos5433-cmu-peris";
- reg = <0x10040000 0x0b20>;
- #clock-cells = <1>;
- };
-
- cmu_fsys: clock-controller@156e0000 {
- compatible = "samsung,exynos5433-cmu-fsys";
- reg = <0x156e0000 0x0b04>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "sclk_ufs_mphy",
- "aclk_fsys_200",
- "sclk_pcie_100_fsys",
- "sclk_ufsunipro_fsys",
- "sclk_mmc2_fsys",
- "sclk_mmc1_fsys",
- "sclk_mmc0_fsys",
- "sclk_usbhost30_fsys",
- "sclk_usbdrd30_fsys";
- clocks = <&xxti>,
- <&cmu_cpif CLK_SCLK_UFS_MPHY>,
- <&cmu_top CLK_ACLK_FSYS_200>,
- <&cmu_top CLK_SCLK_PCIE_100_FSYS>,
- <&cmu_top CLK_SCLK_UFSUNIPRO_FSYS>,
- <&cmu_top CLK_SCLK_MMC2_FSYS>,
- <&cmu_top CLK_SCLK_MMC1_FSYS>,
- <&cmu_top CLK_SCLK_MMC0_FSYS>,
- <&cmu_top CLK_SCLK_USBHOST30_FSYS>,
- <&cmu_top CLK_SCLK_USBDRD30_FSYS>;
- };
-
- cmu_g2d: clock-controller@12460000 {
- compatible = "samsung,exynos5433-cmu-g2d";
- reg = <0x12460000 0x0b08>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "aclk_g2d_266",
- "aclk_g2d_400";
- clocks = <&xxti>,
- <&cmu_top CLK_ACLK_G2D_266>,
- <&cmu_top CLK_ACLK_G2D_400>;
- power-domains = <&pd_g2d>;
- };
-
- cmu_disp: clock-controller@13b90000 {
- compatible = "samsung,exynos5433-cmu-disp";
- reg = <0x13b90000 0x0c04>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "sclk_dsim1_disp",
- "sclk_dsim0_disp",
- "sclk_dsd_disp",
- "sclk_decon_tv_eclk_disp",
- "sclk_decon_vclk_disp",
- "sclk_decon_eclk_disp",
- "sclk_decon_tv_vclk_disp",
- "aclk_disp_333";
- clocks = <&xxti>,
- <&cmu_mif CLK_SCLK_DSIM1_DISP>,
- <&cmu_mif CLK_SCLK_DSIM0_DISP>,
- <&cmu_mif CLK_SCLK_DSD_DISP>,
- <&cmu_mif CLK_SCLK_DECON_TV_ECLK_DISP>,
- <&cmu_mif CLK_SCLK_DECON_VCLK_DISP>,
- <&cmu_mif CLK_SCLK_DECON_ECLK_DISP>,
- <&cmu_mif CLK_SCLK_DECON_TV_VCLK_DISP>,
- <&cmu_mif CLK_ACLK_DISP_333>;
- power-domains = <&pd_disp>;
- };
-
- cmu_aud: clock-controller@114c0000 {
- compatible = "samsung,exynos5433-cmu-aud";
- reg = <0x114c0000 0x0b04>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "fout_aud_pll";
- clocks = <&xxti>, <&cmu_top CLK_FOUT_AUD_PLL>;
- power-domains = <&pd_aud>;
- };
-
- cmu_bus0: clock-controller@13600000 {
- compatible = "samsung,exynos5433-cmu-bus0";
- reg = <0x13600000 0x0b04>;
- #clock-cells = <1>;
-
- clock-names = "aclk_bus0_400";
- clocks = <&cmu_top CLK_ACLK_BUS0_400>;
- };
-
- cmu_bus1: clock-controller@14800000 {
- compatible = "samsung,exynos5433-cmu-bus1";
- reg = <0x14800000 0x0b04>;
- #clock-cells = <1>;
-
- clock-names = "aclk_bus1_400";
- clocks = <&cmu_top CLK_ACLK_BUS1_400>;
- };
-
- cmu_bus2: clock-controller@13400000 {
- compatible = "samsung,exynos5433-cmu-bus2";
- reg = <0x13400000 0x0b04>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "aclk_bus2_400";
- clocks = <&xxti>, <&cmu_mif CLK_ACLK_BUS2_400>;
- };
-
- cmu_g3d: clock-controller@14aa0000 {
- compatible = "samsung,exynos5433-cmu-g3d";
- reg = <0x14aa0000 0x1000>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "aclk_g3d_400";
- clocks = <&xxti>, <&cmu_top CLK_ACLK_G3D_400>;
- power-domains = <&pd_g3d>;
- };
-
- cmu_gscl: clock-controller@13cf0000 {
- compatible = "samsung,exynos5433-cmu-gscl";
- reg = <0x13cf0000 0x0b10>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "aclk_gscl_111",
- "aclk_gscl_333";
- clocks = <&xxti>,
- <&cmu_top CLK_ACLK_GSCL_111>,
- <&cmu_top CLK_ACLK_GSCL_333>;
- power-domains = <&pd_gscl>;
- };
-
- cmu_apollo: clock-controller@11900000 {
- compatible = "samsung,exynos5433-cmu-apollo";
- reg = <0x11900000 0x1088>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "sclk_bus_pll_apollo";
- clocks = <&xxti>, <&cmu_mif CLK_SCLK_BUS_PLL_APOLLO>;
- };
-
- cmu_atlas: clock-controller@11800000 {
- compatible = "samsung,exynos5433-cmu-atlas";
- reg = <0x11800000 0x1088>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "sclk_bus_pll_atlas";
- clocks = <&xxti>, <&cmu_mif CLK_SCLK_BUS_PLL_ATLAS>;
- };
-
- cmu_mscl: clock-controller@105d0000 {
- compatible = "samsung,exynos5433-cmu-mscl";
- reg = <0x105d0000 0x0b10>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "sclk_jpeg_mscl",
- "aclk_mscl_400";
- clocks = <&xxti>,
- <&cmu_top CLK_SCLK_JPEG_MSCL>,
- <&cmu_top CLK_ACLK_MSCL_400>;
- power-domains = <&pd_mscl>;
- };
-
- cmu_mfc: clock-controller@15280000 {
- compatible = "samsung,exynos5433-cmu-mfc";
- reg = <0x15280000 0x0b08>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "aclk_mfc_400";
- clocks = <&xxti>, <&cmu_top CLK_ACLK_MFC_400>;
- power-domains = <&pd_mfc>;
- };
-
- cmu_hevc: clock-controller@14f80000 {
- compatible = "samsung,exynos5433-cmu-hevc";
- reg = <0x14f80000 0x0b08>;
- #clock-cells = <1>;
-
- clock-names = "oscclk", "aclk_hevc_400";
- clocks = <&xxti>, <&cmu_top CLK_ACLK_HEVC_400>;
- power-domains = <&pd_hevc>;
- };
-
- cmu_isp: clock-controller@146d0000 {
- compatible = "samsung,exynos5433-cmu-isp";
- reg = <0x146d0000 0x0b0c>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "aclk_isp_dis_400",
- "aclk_isp_400";
- clocks = <&xxti>,
- <&cmu_top CLK_ACLK_ISP_DIS_400>,
- <&cmu_top CLK_ACLK_ISP_400>;
- power-domains = <&pd_isp>;
- };
-
- cmu_cam0: clock-controller@120d0000 {
- compatible = "samsung,exynos5433-cmu-cam0";
- reg = <0x120d0000 0x0b0c>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "aclk_cam0_333",
- "aclk_cam0_400",
- "aclk_cam0_552";
- clocks = <&xxti>,
- <&cmu_top CLK_ACLK_CAM0_333>,
- <&cmu_top CLK_ACLK_CAM0_400>,
- <&cmu_top CLK_ACLK_CAM0_552>;
- power-domains = <&pd_cam0>;
- };
-
- cmu_cam1: clock-controller@145d0000 {
- compatible = "samsung,exynos5433-cmu-cam1";
- reg = <0x145d0000 0x0b08>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "sclk_isp_uart_cam1",
- "sclk_isp_spi1_cam1",
- "sclk_isp_spi0_cam1",
- "aclk_cam1_333",
- "aclk_cam1_400",
- "aclk_cam1_552";
- clocks = <&xxti>,
- <&cmu_top CLK_SCLK_ISP_UART_CAM1>,
- <&cmu_top CLK_SCLK_ISP_SPI1_CAM1>,
- <&cmu_top CLK_SCLK_ISP_SPI0_CAM1>,
- <&cmu_top CLK_ACLK_CAM1_333>,
- <&cmu_top CLK_ACLK_CAM1_400>,
- <&cmu_top CLK_ACLK_CAM1_552>;
- power-domains = <&pd_cam1>;
- };
-
- cmu_imem: clock-controller@11060000 {
- compatible = "samsung,exynos5433-cmu-imem";
- reg = <0x11060000 0x1000>;
- #clock-cells = <1>;
-
- clock-names = "oscclk",
- "aclk_imem_sssx_266",
- "aclk_imem_266",
- "aclk_imem_200";
- clocks = <&xxti>,
- <&cmu_top CLK_DIV_ACLK_IMEM_SSSX_266>,
- <&cmu_top CLK_DIV_ACLK_IMEM_266>,
- <&cmu_top CLK_DIV_ACLK_IMEM_200>;
- };
-
-Example 3: UART controller node that consumes the clock generated by the clock
- controller.
-
- serial_0: serial@14c10000 {
- compatible = "samsung,exynos5433-uart";
- reg = <0x14C10000 0x100>;
- interrupts = <0 421 0>;
- clocks = <&cmu_peric CLK_PCLK_UART0>,
- <&cmu_peric CLK_SCLK_UART0>;
- clock-names = "uart", "clk_uart_baud0";
- pinctrl-names = "default";
- pinctrl-0 = <&uart0_bus>;
- };
diff --git a/Documentation/devicetree/bindings/clock/exynos7-clock.txt b/Documentation/devicetree/bindings/clock/exynos7-clock.txt
deleted file mode 100644
index 6bf1e7493f61..000000000000
--- a/Documentation/devicetree/bindings/clock/exynos7-clock.txt
+++ /dev/null
@@ -1,108 +0,0 @@
-* Samsung Exynos7 Clock Controller
-
-The Exynos7 clock controller has various blocks which are instantiated
-independently from the device-tree. These clock controllers
-generate and supply clocks to various hardware blocks within
-the SoC.
-
-Each clock is assigned an identifier and client nodes can use
-this identifier to specify the clock which they consume. All
-available clocks are defined as preprocessor macros in
-dt-bindings/clock/exynos7-clk.h header and can be used in
-device tree sources.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It
-is expected that they are defined using standard clock bindings
-with following clock-output-names:
-
- - "fin_pll" - PLL input clock from XXTI
-
-Required Properties for Clock Controller:
-
- - compatible: clock controllers will use one of the following
- compatible strings to indicate the clock controller
- functionality.
-
- - "samsung,exynos7-clock-topc"
- - "samsung,exynos7-clock-top0"
- - "samsung,exynos7-clock-top1"
- - "samsung,exynos7-clock-ccore"
- - "samsung,exynos7-clock-peric0"
- - "samsung,exynos7-clock-peric1"
- - "samsung,exynos7-clock-peris"
- - "samsung,exynos7-clock-fsys0"
- - "samsung,exynos7-clock-fsys1"
- - "samsung,exynos7-clock-mscl"
- - "samsung,exynos7-clock-aud"
-
- - reg: physical base address of the controller and the length of
- memory mapped region.
-
- - #clock-cells: should be 1.
-
- - clocks: list of clock identifiers which are fed as the input to
- the given clock controller. Please refer to the next section to
- find the input clocks for a given controller.
-
- - clock-names: list of names of clocks which are fed as the input
- to the given clock controller.
-
-Input clocks for top0 clock controller:
- - fin_pll
- - dout_sclk_bus0_pll
- - dout_sclk_bus1_pll
- - dout_sclk_cc_pll
- - dout_sclk_mfc_pll
- - dout_sclk_aud_pll
-
-Input clocks for top1 clock controller:
- - fin_pll
- - dout_sclk_bus0_pll
- - dout_sclk_bus1_pll
- - dout_sclk_cc_pll
- - dout_sclk_mfc_pll
-
-Input clocks for ccore clock controller:
- - fin_pll
- - dout_aclk_ccore_133
-
-Input clocks for peric0 clock controller:
- - fin_pll
- - dout_aclk_peric0_66
- - sclk_uart0
-
-Input clocks for peric1 clock controller:
- - fin_pll
- - dout_aclk_peric1_66
- - sclk_uart1
- - sclk_uart2
- - sclk_uart3
- - sclk_spi0
- - sclk_spi1
- - sclk_spi2
- - sclk_spi3
- - sclk_spi4
- - sclk_i2s1
- - sclk_pcm1
- - sclk_spdif
-
-Input clocks for peris clock controller:
- - fin_pll
- - dout_aclk_peris_66
-
-Input clocks for fsys0 clock controller:
- - fin_pll
- - dout_aclk_fsys0_200
- - dout_sclk_mmc2
-
-Input clocks for fsys1 clock controller:
- - fin_pll
- - dout_aclk_fsys1_200
- - dout_sclk_mmc0
- - dout_sclk_mmc1
-
-Input clocks for aud clock controller:
- - fin_pll
- - fout_aud_pll
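-
-Example: clock controller node together with the required external
-"fin_pll" clock (a sketch; the controller's address, register length,
-oscillator rate and its use of fin_pll as an input are illustrative
-assumptions, following the pattern of the other Exynos bindings):
-
- fin_pll: xxti {
- compatible = "fixed-clock";
- clock-frequency = <24000000>; /* assumed board-specific rate */
- clock-output-names = "fin_pll";
- #clock-cells = <0>;
- };
-
- clock_topc: clock-controller@10570000 {
- compatible = "samsung,exynos7-clock-topc";
- reg = <0x10570000 0x10000>; /* assumed address and length */
- #clock-cells = <1>;
- clocks = <&fin_pll>;
- clock-names = "fin_pll";
- };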
diff --git a/Documentation/devicetree/bindings/clock/fixed-clock.yaml b/Documentation/devicetree/bindings/clock/fixed-clock.yaml
index b657ecd0ef1c..b0a4fb8256e2 100644
--- a/Documentation/devicetree/bindings/clock/fixed-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/fixed-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/fixed-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Binding for simple fixed-rate clock sources
+title: Simple fixed-rate clock sources
maintainers:
- Michael Turquette <mturquette@baylibre.com>
diff --git a/Documentation/devicetree/bindings/clock/fixed-factor-clock.yaml b/Documentation/devicetree/bindings/clock/fixed-factor-clock.yaml
index f415845b38dd..8f71ab300470 100644
--- a/Documentation/devicetree/bindings/clock/fixed-factor-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/fixed-factor-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/fixed-factor-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Binding for simple fixed factor rate clock sources
+title: Simple fixed factor rate clock sources
maintainers:
- Michael Turquette <mturquette@baylibre.com>
@@ -13,7 +13,6 @@ maintainers:
properties:
compatible:
enum:
- - allwinner,sun4i-a10-pll3-2x-clk
- fixed-factor-clock
"#clock-cells":
diff --git a/Documentation/devicetree/bindings/clock/fixed-mmio-clock.txt b/Documentation/devicetree/bindings/clock/fixed-mmio-clock.txt
deleted file mode 100644
index c359367fd1a9..000000000000
--- a/Documentation/devicetree/bindings/clock/fixed-mmio-clock.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Binding for simple memory-mapped I/O fixed-rate clock sources.
-The driver reads a clock frequency value from a single 32-bit memory mapped
-I/O register and registers it as a fixed rate clock.
-
-It was designed for test systems, such as FPGAs, not for complete, finished SoCs.
-
-This binding uses the common clock binding[1].
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-
-Required properties:
-- compatible : shall be "fixed-mmio-clock".
-- #clock-cells : from common clock binding; shall be set to 0.
-- reg : Address and length of the clock value register set.
-
-Optional properties:
-- clock-output-names : From common clock binding.
-
-Example:
-sysclock: sysclock@fd020004 {
- #clock-cells = <0>;
- compatible = "fixed-mmio-clock";
- reg = <0xfd020004 0x4>;
-};
diff --git a/Documentation/devicetree/bindings/clock/fixed-mmio-clock.yaml b/Documentation/devicetree/bindings/clock/fixed-mmio-clock.yaml
new file mode 100644
index 000000000000..e22fc272d023
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/fixed-mmio-clock.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/fixed-mmio-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Simple memory mapped IO fixed-rate clock sources
+
+description:
+ This binding describes a fixed-rate clock for which the frequency can
+ be read from a single 32-bit memory mapped I/O register.
+
+ It was designed for test systems, such as FPGAs, not for complete,
+ finished SoCs.
+
+maintainers:
+ - Jan Kotas <jank@cadence.com>
+
+properties:
+ compatible:
+ const: fixed-mmio-clock
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 0
+
+ clock-output-names:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ sysclock: sysclock@fd020004 {
+ compatible = "fixed-mmio-clock";
+ #clock-cells = <0>;
+ reg = <0xfd020004 0x4>;
+ clock-output-names = "sysclk";
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/fsl,imx8m-anatop.yaml b/Documentation/devicetree/bindings/clock/fsl,imx8m-anatop.yaml
new file mode 100644
index 000000000000..bbd22e95b319
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/fsl,imx8m-anatop.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/fsl,imx8m-anatop.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX8M Family Anatop Module
+
+maintainers:
+ - Peng Fan <peng.fan@nxp.com>
+
+description: |
+ NXP i.MX8M Family anatop PLL module, which generates PLLs fed to the CCM root.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - fsl,imx8mm-anatop
+ - fsl,imx8mq-anatop
+ - items:
+ - enum:
+ - fsl,imx8mn-anatop
+ - fsl,imx8mp-anatop
+ - const: fsl,imx8mm-anatop
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ anatop: clock-controller@30360000 {
+ compatible = "fsl,imx8mn-anatop", "fsl,imx8mm-anatop";
+ reg = <0x30360000 0x10000>;
+ #clock-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/fsl,plldig.yaml b/Documentation/devicetree/bindings/clock/fsl,plldig.yaml
index 9ac716dfa602..88dd9c18db92 100644
--- a/Documentation/devicetree/bindings/clock/fsl,plldig.yaml
+++ b/Documentation/devicetree/bindings/clock/fsl,plldig.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/fsl,plldig.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NXP QorIQ Layerscape LS1028A Display PIXEL Clock Binding
+title: NXP QorIQ Layerscape LS1028A Display PIXEL Clock
maintainers:
- Wen He <wen.he_1@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/fsl,sai-clock.yaml b/Documentation/devicetree/bindings/clock/fsl,sai-clock.yaml
index fc3bdfdc091a..3bca9d11c148 100644
--- a/Documentation/devicetree/bindings/clock/fsl,sai-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/fsl,sai-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/fsl,sai-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Freescale SAI bitclock-as-a-clock binding
+title: Freescale SAI bitclock-as-a-clock
maintainers:
- Michael Walle <michael@walle.cc>
diff --git a/Documentation/devicetree/bindings/clock/fsl,scu-clk.yaml b/Documentation/devicetree/bindings/clock/fsl,scu-clk.yaml
new file mode 100644
index 000000000000..36d4cfc3c2f8
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/fsl,scu-clk.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/fsl,scu-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: i.MX SCU Client Device Node - Clock Controller Based on SCU Message Protocol
+
+maintainers:
+ - Abel Vesa <abel.vesa@nxp.com>
+
+description: i.MX SCU Client Device Node
+ Client nodes are maintained as children of the relevant IMX-SCU device node.
+ This binding uses the common clock binding.
+ (Documentation/devicetree/bindings/clock/clock-bindings.txt)
+ The clock consumer should specify the desired clock by having the clock
+ ID in its "clocks" phandle cell. See the full list of clock IDs from
+ include/dt-bindings/clock/imx8qxp-clock.h
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - fsl,imx8dxl-clk
+ - fsl,imx8qm-clk
+ - fsl,imx8qxp-clk
+ - const: fsl,scu-clk
+
+ '#clock-cells':
+ const: 2
+
+required:
+ - compatible
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller {
+ compatible = "fsl,imx8qxp-clk", "fsl,scu-clk";
+ #clock-cells = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/gpio-gate-clock.txt b/Documentation/devicetree/bindings/clock/gpio-gate-clock.txt
deleted file mode 100644
index d3379ff9b84b..000000000000
--- a/Documentation/devicetree/bindings/clock/gpio-gate-clock.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-Binding for simple gpio gated clock.
-
-This binding uses the common clock binding[1].
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-
-Required properties:
-- compatible : shall be "gpio-gate-clock".
-- #clock-cells : from common clock binding; shall be set to 0.
-- enable-gpios : GPIO reference for enabling and disabling the clock.
-
-Optional properties:
-- clocks: Maximum of one parent clock is supported.
-
-Example:
- clock {
- compatible = "gpio-gate-clock";
- clocks = <&parentclk>;
- #clock-cells = <0>;
- enable-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>;
- };
diff --git a/Documentation/devicetree/bindings/clock/gpio-gate-clock.yaml b/Documentation/devicetree/bindings/clock/gpio-gate-clock.yaml
new file mode 100644
index 000000000000..d09d0e3f0c6e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/gpio-gate-clock.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/gpio-gate-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Simple GPIO clock gate
+
+maintainers:
+ - Jyri Sarha <jsarha@ti.com>
+
+properties:
+ compatible:
+ const: gpio-gate-clock
+
+ clocks:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 0
+
+ enable-gpios:
+ description: GPIO reference for enabling and disabling the clock.
+ maxItems: 1
+
+required:
+ - compatible
+ - '#clock-cells'
+ - enable-gpios
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ clock {
+ compatible = "gpio-gate-clock";
+ clocks = <&parentclk>;
+ #clock-cells = <0>;
+ enable-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/idt,versaclock5.yaml b/Documentation/devicetree/bindings/clock/idt,versaclock5.yaml
index ffd6ae0eed64..a2c6eea9871d 100644
--- a/Documentation/devicetree/bindings/clock/idt,versaclock5.yaml
+++ b/Documentation/devicetree/bindings/clock/idt,versaclock5.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/idt,versaclock5.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Binding for IDT VersaClock 5 and 6 programmable I2C clock generators
+title: IDT VersaClock 5 and 6 programmable I2C clock generators
description: |
The IDT VersaClock 5 and VersaClock 6 are programmable I2C
@@ -45,7 +45,7 @@ description: |
The case where SH and SP are both 1 is likely not very interesting.
maintainers:
- - Luca Ceresoli <luca@lucaceresoli.net>
+ - Luca Ceresoli <luca.ceresoli@bootlin.com>
properties:
compatible:
@@ -54,8 +54,10 @@ properties:
- idt,5p49v5925
- idt,5p49v5933
- idt,5p49v5935
+ - idt,5p49v60
- idt,5p49v6901
- idt,5p49v6965
+ - idt,5p49v6975
reg:
description: I2C device address
@@ -108,7 +110,7 @@ patternProperties:
properties:
idt,mode:
description:
- The output drive mode. Values defined in dt-bindings/clk/versaclock.h
+ The output drive mode. Values defined in dt-bindings/clock/versaclock.h
$ref: /schemas/types.yaml#/definitions/uint32
minimum: 0
maximum: 6
@@ -134,6 +136,7 @@ allOf:
enum:
- idt,5p49v5933
- idt,5p49v5935
+ - idt,5p49v6975
then:
# Devices with builtin crystal + optional external input
properties:
@@ -151,7 +154,7 @@ additionalProperties: false
examples:
- |
- #include <dt-bindings/clk/versaclock.h>
+ #include <dt-bindings/clock/versaclock.h>
/* 25MHz reference crystal */
ref25: ref25m {
@@ -191,11 +194,4 @@ examples:
};
};
- /* Consumer referencing the 5P49V5923 pin OUT1 */
- consumer {
- /* ... */
- clocks = <&vc5 1>;
- /* ... */
- };
-
...
diff --git a/Documentation/devicetree/bindings/clock/imx1-clock.yaml b/Documentation/devicetree/bindings/clock/imx1-clock.yaml
index f4833a29b79e..7ade4c32aff3 100644
--- a/Documentation/devicetree/bindings/clock/imx1-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx1-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx1-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX1 CPUs
+title: Freescale i.MX1 CPUs Clock Controller
maintainers:
- Alexander Shiyan <shc_work@mail.ru>
@@ -40,12 +40,3 @@ examples:
compatible = "fsl,imx1-ccm";
reg = <0x0021b000 0x1000>;
};
-
- pwm@208000 {
- #pwm-cells = <2>;
- compatible = "fsl,imx1-pwm";
- reg = <0x00208000 0x1000>;
- interrupts = <34>;
- clocks = <&clks IMX1_CLK_DUMMY>, <&clks IMX1_CLK_PER1>;
- clock-names = "ipg", "per";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx21-clock.yaml b/Documentation/devicetree/bindings/clock/imx21-clock.yaml
index 518ad9a4733c..79cc843703ec 100644
--- a/Documentation/devicetree/bindings/clock/imx21-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx21-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx21-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX21
+title: Freescale i.MX21 Clock Controller
maintainers:
- Alexander Shiyan <shc_work@mail.ru>
@@ -40,12 +40,3 @@ examples:
reg = <0x10027000 0x800>;
#clock-cells = <1>;
};
-
- serial@1000a000 {
- compatible = "fsl,imx21-uart";
- reg = <0x1000a000 0x1000>;
- interrupts = <20>;
- clocks = <&clks IMX21_CLK_UART1_IPG_GATE>,
- <&clks IMX21_CLK_PER1>;
- clock-names = "ipg", "per";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx23-clock.yaml b/Documentation/devicetree/bindings/clock/imx23-clock.yaml
index 5e296a00e14f..5e71c9219500 100644
--- a/Documentation/devicetree/bindings/clock/imx23-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx23-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx23-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX23
+title: Freescale i.MX23 Clock Controller
maintainers:
- Shawn Guo <shawnguo@kernel.org>
@@ -83,12 +83,3 @@ examples:
reg = <0x80040000 0x2000>;
#clock-cells = <1>;
};
-
- serial@8006c000 {
- compatible = "fsl,imx23-auart";
- reg = <0x8006c000 0x2000>;
- interrupts = <24>;
- clocks = <&clks 32>;
- dmas = <&dma_apbx 6>, <&dma_apbx 7>;
- dma-names = "rx", "tx";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx25-clock.yaml b/Documentation/devicetree/bindings/clock/imx25-clock.yaml
index 2a2b10778e72..c626a158590e 100644
--- a/Documentation/devicetree/bindings/clock/imx25-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx25-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx25-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX25
+title: Freescale i.MX25 Clock Controller
maintainers:
- Sascha Hauer <s.hauer@pengutronix.de>
@@ -176,11 +176,3 @@ examples:
interrupts = <31>;
#clock-cells = <1>;
};
-
- serial@43f90000 {
- compatible = "fsl,imx25-uart", "fsl,imx21-uart";
- reg = <0x43f90000 0x4000>;
- interrupts = <45>;
- clocks = <&clks 79>, <&clks 50>;
- clock-names = "ipg", "per";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx27-clock.yaml b/Documentation/devicetree/bindings/clock/imx27-clock.yaml
index 160268f24487..71d78a0b551f 100644
--- a/Documentation/devicetree/bindings/clock/imx27-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx27-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx27-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX27
+title: Freescale i.MX27 Clock Controller
maintainers:
- Fabio Estevam <festevam@gmail.com>
@@ -44,12 +44,3 @@ examples:
interrupts = <31>;
#clock-cells = <1>;
};
-
- serial@1000a000 {
- compatible = "fsl,imx27-uart", "fsl,imx21-uart";
- reg = <0x1000a000 0x1000>;
- interrupts = <20>;
- clocks = <&clks IMX27_CLK_UART1_IPG_GATE>,
- <&clks IMX27_CLK_PER1_GATE>;
- clock-names = "ipg", "per";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx28-clock.yaml b/Documentation/devicetree/bindings/clock/imx28-clock.yaml
index f831b780f951..4aaad7b9c66e 100644
--- a/Documentation/devicetree/bindings/clock/imx28-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx28-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx28-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX28
+title: Freescale i.MX28 Clock Controller
maintainers:
- Shawn Guo <shawnguo@kernel.org>
@@ -106,12 +106,3 @@ examples:
reg = <0x80040000 0x2000>;
#clock-cells = <1>;
};
-
- serial@8006a000 {
- compatible = "fsl,imx28-auart";
- reg = <0x8006a000 0x2000>;
- interrupts = <112>;
- dmas = <&dma_apbx 8>, <&dma_apbx 9>;
- dma-names = "rx", "tx";
- clocks = <&clks 45>;
- };
diff --git a/Documentation/devicetree/bindings/clock/imx31-clock.yaml b/Documentation/devicetree/bindings/clock/imx31-clock.yaml
index d2336261c922..50a8498eef8a 100644
--- a/Documentation/devicetree/bindings/clock/imx31-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx31-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx31-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX31
+title: Freescale i.MX31 Clock Controller
maintainers:
- Fabio Estevam <festevam@gmail.com>
@@ -110,11 +110,3 @@ examples:
interrupts = <31>, <53>;
#clock-cells = <1>;
};
-
- serial@43f90000 {
- compatible = "fsl,imx31-uart", "fsl,imx21-uart";
- reg = <0x43f90000 0x4000>;
- interrupts = <45>;
- clocks = <&clks 10>, <&clks 30>;
- clock-names = "ipg", "per";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx35-clock.yaml b/Documentation/devicetree/bindings/clock/imx35-clock.yaml
index 3e20ccaf8131..c063369de3ec 100644
--- a/Documentation/devicetree/bindings/clock/imx35-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx35-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx35-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX35
+title: Freescale i.MX35 Clock Controller
maintainers:
- Steffen Trumtrar <s.trumtrar@pengutronix.de>
@@ -129,11 +129,3 @@ examples:
interrupts = <31>;
#clock-cells = <1>;
};
-
- mmc@53fb4000 {
- compatible = "fsl,imx35-esdhc";
- reg = <0x53fb4000 0x4000>;
- interrupts = <7>;
- clocks = <&clks 9>, <&clks 8>, <&clks 43>;
- clock-names = "ipg", "ahb", "per";
- };
diff --git a/Documentation/devicetree/bindings/clock/imx5-clock.yaml b/Documentation/devicetree/bindings/clock/imx5-clock.yaml
index b1740d7abe68..423c0142c1d3 100644
--- a/Documentation/devicetree/bindings/clock/imx5-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx5-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx5-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX5
+title: Freescale i.MX5 Clock Controller
maintainers:
- Fabio Estevam <festevam@gmail.com>
@@ -55,11 +55,4 @@ examples:
<0 72 IRQ_TYPE_LEVEL_HIGH>;
#clock-cells = <1>;
};
-
- can@53fc8000 {
- compatible = "fsl,imx53-flexcan", "fsl,imx25-flexcan";
- reg = <0x53fc8000 0x4000>;
- interrupts = <82>;
- clocks = <&clks IMX5_CLK_CAN1_IPG_GATE>, <&clks IMX5_CLK_CAN1_SERIAL_GATE>;
- clock-names = "ipg", "per";
- };
+...
diff --git a/Documentation/devicetree/bindings/clock/imx6q-clock.yaml b/Documentation/devicetree/bindings/clock/imx6q-clock.yaml
index 4f4637eddb8b..bae4fcb3aacc 100644
--- a/Documentation/devicetree/bindings/clock/imx6q-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx6q-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx6q-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX6 Quad
+title: Freescale i.MX6 Quad Clock Controller
maintainers:
- Anson Huang <Anson.Huang@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/imx6sl-clock.yaml b/Documentation/devicetree/bindings/clock/imx6sl-clock.yaml
index b83c8f43d664..c85ff6ea3d24 100644
--- a/Documentation/devicetree/bindings/clock/imx6sl-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx6sl-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx6sl-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX6 SoloLite
+title: Freescale i.MX6 SoloLite Clock Controller
maintainers:
- Anson Huang <Anson.Huang@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/imx6sll-clock.yaml b/Documentation/devicetree/bindings/clock/imx6sll-clock.yaml
index 484894a4b23f..6b549ed1493c 100644
--- a/Documentation/devicetree/bindings/clock/imx6sll-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx6sll-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx6sll-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX6 SLL
+title: Freescale i.MX6 SLL Clock Controller
maintainers:
- Anson Huang <Anson.Huang@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/imx6sx-clock.yaml b/Documentation/devicetree/bindings/clock/imx6sx-clock.yaml
index e6c795657c24..55dcad18b7c6 100644
--- a/Documentation/devicetree/bindings/clock/imx6sx-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx6sx-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx6sx-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX6 SoloX
+title: Freescale i.MX6 SoloX Clock Controller
maintainers:
- Anson Huang <Anson.Huang@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/imx6ul-clock.yaml b/Documentation/devicetree/bindings/clock/imx6ul-clock.yaml
index 6a51a3f51cd9..be54d4df5afa 100644
--- a/Documentation/devicetree/bindings/clock/imx6ul-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx6ul-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx6ul-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX6 UltraLite
+title: Freescale i.MX6 UltraLite Clock Controller
maintainers:
- Anson Huang <Anson.Huang@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/imx7d-clock.yaml b/Documentation/devicetree/bindings/clock/imx7d-clock.yaml
index cefb61db01a8..e7d8427e4957 100644
--- a/Documentation/devicetree/bindings/clock/imx7d-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx7d-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx7d-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX7 Dual
+title: Freescale i.MX7 Dual Clock Controller
maintainers:
- Frank Li <Frank.Li@nxp.com>
diff --git a/Documentation/devicetree/bindings/clock/imx7ulp-pcc-clock.yaml b/Documentation/devicetree/bindings/clock/imx7ulp-pcc-clock.yaml
index 7caf5cee9199..76842038f52e 100644
--- a/Documentation/devicetree/bindings/clock/imx7ulp-pcc-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx7ulp-pcc-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx7ulp-pcc-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX7ULP Peripheral Clock Control (PCC) modules
+title: Freescale i.MX7ULP Peripheral Clock Control (PCC) modules Clock Controller
maintainers:
- A.s. Dong <aisheng.dong@nxp.com>
@@ -108,14 +108,3 @@ examples:
"upll", "sosc_bus_clk", "firc_bus_clk",
"rosc", "spll_bus_clk";
};
-
- mmc@40380000 {
- compatible = "fsl,imx7ulp-usdhc";
- reg = <0x40380000 0x10000>;
- interrupts = <GIC_SPI 43 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&scg1 IMX7ULP_CLK_NIC1_BUS_DIV>,
- <&scg1 IMX7ULP_CLK_NIC1_DIV>,
- <&pcc2 IMX7ULP_CLK_USDHC1>;
- clock-names ="ipg", "ahb", "per";
- bus-width = <4>;
- };
diff --git a/Documentation/devicetree/bindings/clock/imx7ulp-scg-clock.yaml b/Documentation/devicetree/bindings/clock/imx7ulp-scg-clock.yaml
index ee8efb4ed599..5e25bc6d1372 100644
--- a/Documentation/devicetree/bindings/clock/imx7ulp-scg-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx7ulp-scg-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx7ulp-scg-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for Freescale i.MX7ULP System Clock Generation (SCG) modules
+title: Freescale i.MX7ULP System Clock Generation (SCG) modules Clock Controller
maintainers:
- A.s. Dong <aisheng.dong@nxp.com>
@@ -86,14 +86,3 @@ examples:
"firc", "upll";
#clock-cells = <1>;
};
-
- mmc@40380000 {
- compatible = "fsl,imx7ulp-usdhc";
- reg = <0x40380000 0x10000>;
- interrupts = <GIC_SPI 43 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&scg1 IMX7ULP_CLK_NIC1_BUS_DIV>,
- <&scg1 IMX7ULP_CLK_NIC1_DIV>,
- <&pcc2 IMX7ULP_CLK_USDHC1>;
- clock-names ="ipg", "ahb", "per";
- bus-width = <4>;
- };
diff --git a/Documentation/devicetree/bindings/clock/imx8m-clock.yaml b/Documentation/devicetree/bindings/clock/imx8m-clock.yaml
index 625f573a7b90..0dbc1433fede 100644
--- a/Documentation/devicetree/bindings/clock/imx8m-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/imx8m-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx8m-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NXP i.MX8M Family Clock Control Module Binding
+title: NXP i.MX8M Family Clock Control Module
maintainers:
- Anson Huang <Anson.Huang@nxp.com>
@@ -55,8 +55,6 @@ allOf:
then:
properties:
clocks:
- minItems: 7
- maxItems: 7
items:
- description: 32k osc
- description: 25m osc
@@ -66,8 +64,6 @@ allOf:
- description: ext3 clock input
- description: ext4 clock input
clock-names:
- minItems: 7
- maxItems: 7
items:
- const: ckil
- const: osc_25m
@@ -112,7 +108,7 @@ examples:
};
- |
- clock-controller@30390000 {
+ clock-controller@30380000 {
compatible = "fsl,imx8mq-ccm";
reg = <0x30380000 0x10000>;
#clock-cells = <1>;
diff --git a/Documentation/devicetree/bindings/clock/imx8mp-audiomix.yaml b/Documentation/devicetree/bindings/clock/imx8mp-audiomix.yaml
new file mode 100644
index 000000000000..ff9600474df2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/imx8mp-audiomix.yaml
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/imx8mp-audiomix.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX8MP AudioMIX Block Control
+
+maintainers:
+ - Marek Vasut <marex@denx.de>
+
+description: |
+ NXP i.MX8M Plus AudioMIX is a dedicated clock muxing and gating IP
+ used to control audio-related clocks on the SoC.
+
+properties:
+ compatible:
+ const: fsl,imx8mp-audio-blk-ctrl
+
+ reg:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ clocks:
+ minItems: 7
+ maxItems: 7
+
+ clock-names:
+ items:
+ - const: ahb
+ - const: sai1
+ - const: sai2
+ - const: sai3
+ - const: sai5
+ - const: sai6
+ - const: sai7
+
+ '#clock-cells':
+ const: 1
+ description:
+ The clock consumer should specify the desired clock by having the clock
+ ID in its "clocks" phandle cell. See include/dt-bindings/clock/imx8mp-clock.h
+ for the full list of i.MX8MP IMX8MP_CLK_AUDIOMIX_ clock IDs.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - power-domains
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ # Clock Control Module node:
+ - |
+ #include <dt-bindings/clock/imx8mp-clock.h>
+
+ clock-controller@30e20000 {
+ compatible = "fsl,imx8mp-audio-blk-ctrl";
+ reg = <0x30e20000 0x10000>;
+ #clock-cells = <1>;
+ clocks = <&clk IMX8MP_CLK_AUDIO_ROOT>,
+ <&clk IMX8MP_CLK_SAI1>,
+ <&clk IMX8MP_CLK_SAI2>,
+ <&clk IMX8MP_CLK_SAI3>,
+ <&clk IMX8MP_CLK_SAI5>,
+ <&clk IMX8MP_CLK_SAI6>,
+ <&clk IMX8MP_CLK_SAI7>;
+ clock-names = "ahb",
+ "sai1", "sai2", "sai3",
+ "sai5", "sai6", "sai7";
+ power-domains = <&pgc_audio>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml
index 0f6fe365ebf3..b207f95361b2 100644
--- a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml
+++ b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/imx8qxp-lpcg.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: NXP i.MX8QXP LPCG (Low-Power Clock Gating) Clock bindings
+title: NXP i.MX8QXP LPCG (Low-Power Clock Gating) Clock
maintainers:
- Aisheng Dong <aisheng.dong@nxp.com>
@@ -101,14 +101,3 @@ examples:
"sdhc0_lpcg_ahb_clk";
power-domains = <&pd IMX_SC_R_SDHC_0>;
};
-
- mmc@5b010000 {
- compatible = "fsl,imx8qxp-usdhc", "fsl,imx7d-usdhc";
- interrupts = <GIC_SPI 232 IRQ_TYPE_LEVEL_HIGH>;
- reg = <0x5b010000 0x10000>;
- clocks = <&sdhc0_lpcg IMX_LPCG_CLK_4>,
- <&sdhc0_lpcg IMX_LPCG_CLK_5>,
- <&sdhc0_lpcg IMX_LPCG_CLK_0>;
- clock-names = "ipg", "ahb", "per";
- power-domains = <&pd IMX_SC_R_SDHC_0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/imx8ulp-cgc-clock.yaml b/Documentation/devicetree/bindings/clock/imx8ulp-cgc-clock.yaml
new file mode 100644
index 000000000000..68a60cdc19af
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/imx8ulp-cgc-clock.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/imx8ulp-cgc-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX8ULP Clock Generation & Control (CGC) Module
+
+maintainers:
+ - Jacky Bai <ping.bai@nxp.com>
+
+description: |
+ On i.MX8ULP, clock source generation, distribution and management are
+ under the control of several CGC & PCC modules. The CGC modules generate
+ and distribute clocks on the device.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8ulp-cgc1
+ - fsl,imx8ulp-cgc2
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ # Clock Generation & Control Module node:
+ - |
+ clock-controller@292c0000 {
+ compatible = "fsl,imx8ulp-cgc1";
+ reg = <0x292c0000 0x10000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/imx8ulp-pcc-clock.yaml b/Documentation/devicetree/bindings/clock/imx8ulp-pcc-clock.yaml
new file mode 100644
index 000000000000..d0b0792fe7ba
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/imx8ulp-pcc-clock.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/imx8ulp-pcc-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX8ULP Peripheral Clock Controller(PCC) Module
+
+maintainers:
+ - Jacky Bai <ping.bai@nxp.com>
+
+description: |
+ On i.MX8ULP, clock source generation, distribution and management are
+ under the control of several CGC and PCC modules. The PCC modules control
+ software reset, clock selection, optional division and clock gating mode
+ for peripherals.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8ulp-pcc3
+ - fsl,imx8ulp-pcc4
+ - fsl,imx8ulp-pcc5
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+ - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+ # Peripheral Clock Control Module node:
+ - |
+ clock-controller@292d0000 {
+ compatible = "fsl,imx8ulp-pcc3";
+ reg = <0x292d0000 0x10000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
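Because the PCC exposes both #clock-cells and #reset-cells, a peripheral typically references the same controller for its clock and its reset. A minimal sketch, assuming a pcc3 label on the node above; the LPUART clock and reset indices are placeholders, and the real IDs live in the i.MX8ULP binding headers:

    serial@293a0000 {
        /* illustrative consumer; clock and reset indices are placeholders */
        clocks = <&pcc3 IMX8ULP_CLK_LPUART5>;
        clock-names = "ipg";
        resets = <&pcc3 IMX8ULP_CLK_LPUART5>;
    };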
diff --git a/Documentation/devicetree/bindings/clock/imx93-clock.yaml b/Documentation/devicetree/bindings/clock/imx93-clock.yaml
new file mode 100644
index 000000000000..ccb53c6b96c1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/imx93-clock.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/imx93-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX93 Clock Control Module
+
+maintainers:
+ - Peng Fan <peng.fan@nxp.com>
+
+description: |
+ The i.MX93 clock control module is an integrated clock controller that
+ includes clock generation and gating and supplies clocks to all modules.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx93-ccm
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ description:
+ Specifies the external clocks used by the CCM module.
+ items:
+ - description: 32k osc
+ - description: 24m osc
+ - description: ext1 clock input
+
+ clock-names:
+ description:
+ Specifies the names of the external clocks used by the CCM module.
+ items:
+ - const: osc_32k
+ - const: osc_24m
+ - const: clk_ext1
+
+ '#clock-cells':
+ const: 1
+ description:
+ See include/dt-bindings/clock/imx93-clock.h for the full list of
+ i.MX93 clock IDs.
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ # Clock Control Module node:
+ - |
+ clock-controller@44450000 {
+ compatible = "fsl,imx93-ccm";
+ reg = <0x44450000 0x10000>;
+ #clock-cells = <1>;
+ };
+
+...
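A consumer then selects one of the CCM outputs by ID; a minimal sketch against the controller above, assuming a clk label on its node and the IMX93_CLK_LPUART1_GATE ID from imx93-clock.h (the serial node itself is illustrative):

    serial@44380000 {
        /* illustrative consumer; the ID is assumed from imx93-clock.h */
        clocks = <&clk IMX93_CLK_LPUART1_GATE>;
        clock-names = "ipg";
    };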
diff --git a/Documentation/devicetree/bindings/clock/imxrt1050-clock.yaml b/Documentation/devicetree/bindings/clock/imxrt1050-clock.yaml
new file mode 100644
index 000000000000..777af4aad4b2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/imxrt1050-clock.yaml
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/imxrt1050-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MXRT Clock Controller
+
+maintainers:
+ - Giulio Benetti <giulio.benetti@benettiengineering.com>
+ - Jesse Taube <Mr.Bossman075@gmail.com>
+
+description: |
+ The clock consumer should specify the desired clock by having the clock
+ ID in its "clocks" phandle cell. See include/dt-bindings/clock/imxrt*-clock.h
+ for the full list of i.MXRT clock IDs.
+
+properties:
+ compatible:
+ const: fsl,imxrt1050-ccm
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 2
+
+ clocks:
+ description: 24m osc
+ maxItems: 1
+
+ clock-names:
+ const: osc
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/imxrt1050-clock.h>
+
+ clks: clock-controller@400fc000 {
+ compatible = "fsl,imxrt1050-ccm";
+ reg = <0x400fc000 0x4000>;
+ interrupts = <95>, <96>;
+ clocks = <&osc>;
+ clock-names = "osc";
+ #clock-cells = <1>;
+ };
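As the description notes, consumers select a clock by its ID from imxrt*-clock.h; a minimal sketch against the clks label above, with an assumed IMXRT1050_CLK_LPUART1 ID and an illustrative serial node:

    serial@40184000 {
        /* illustrative consumer; ID assumed from imxrt1050-clock.h */
        clocks = <&clks IMXRT1050_CLK_LPUART1>;
        clock-names = "ipg";
    };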
diff --git a/Documentation/devicetree/bindings/clock/ingenic,cgu.yaml b/Documentation/devicetree/bindings/clock/ingenic,cgu.yaml
index 6e80dbc8b8b9..9e733b10c392 100644
--- a/Documentation/devicetree/bindings/clock/ingenic,cgu.yaml
+++ b/Documentation/devicetree/bindings/clock/ingenic,cgu.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/ingenic,cgu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Ingenic SoCs CGU devicetree bindings
+title: Ingenic SoCs CGU
description: |
The CGU in an Ingenic SoC provides all the clocks generated on-chip. It
@@ -22,6 +22,7 @@ select:
enum:
- ingenic,jz4740-cgu
- ingenic,jz4725b-cgu
+ - ingenic,jz4755-cgu
- ingenic,jz4760-cgu
- ingenic,jz4760b-cgu
- ingenic,jz4770-cgu
@@ -51,6 +52,7 @@ properties:
- enum:
- ingenic,jz4740-cgu
- ingenic,jz4725b-cgu
+ - ingenic,jz4755-cgu
- ingenic,jz4760-cgu
- ingenic,jz4760b-cgu
- ingenic,jz4770-cgu
@@ -104,7 +106,7 @@ additionalProperties: false
examples:
- |
- #include <dt-bindings/clock/jz4770-cgu.h>
+ #include <dt-bindings/clock/ingenic,jz4770-cgu.h>
cgu: clock-controller@10000000 {
compatible = "ingenic,jz4770-cgu", "simple-mfd";
reg = <0x10000000 0x100>;
diff --git a/Documentation/devicetree/bindings/clock/intc_stratix10.txt b/Documentation/devicetree/bindings/clock/intc_stratix10.txt
deleted file mode 100644
index 9f4ec5cb5c6b..000000000000
--- a/Documentation/devicetree/bindings/clock/intc_stratix10.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Device Tree Clock bindings for Intel's SoCFPGA Stratix10 platform
-
-This binding uses the common clock binding[1].
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-
-Required properties:
-- compatible : shall be
- "intel,stratix10-clkmgr"
-
-- reg : shall be the control register offset from CLOCK_MANAGER's base for the clock.
-
-- #clock-cells : from common clock binding, shall be set to 1.
-
-Example:
- clkmgr: clock-controller@ffd10000 {
- compatible = "intel,stratix10-clkmgr";
- reg = <0xffd10000 0x1000>;
- #clock-cells = <1>;
- };
diff --git a/Documentation/devicetree/bindings/clock/intel,agilex.yaml b/Documentation/devicetree/bindings/clock/intel,agilex.yaml
index cf5a9eb803e6..3745ba8dbd76 100644
--- a/Documentation/devicetree/bindings/clock/intel,agilex.yaml
+++ b/Documentation/devicetree/bindings/clock/intel,agilex.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/intel,agilex.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Intel SoCFPGA Agilex platform clock controller binding
+title: Intel SoCFPGA Agilex platform clock controller
maintainers:
- Dinh Nguyen <dinguyen@kernel.org>
diff --git a/Documentation/devicetree/bindings/clock/intel,cgu-lgm.yaml b/Documentation/devicetree/bindings/clock/intel,cgu-lgm.yaml
index f3e1a700a2ca..76609a390429 100644
--- a/Documentation/devicetree/bindings/clock/intel,cgu-lgm.yaml
+++ b/Documentation/devicetree/bindings/clock/intel,cgu-lgm.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/intel,cgu-lgm.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Intel Lightning Mountain SoC's Clock Controller(CGU) Binding
+title: Intel Lightning Mountain SoC's Clock Controller(CGU)
maintainers:
- Rahul Tanwar <rahul.tanwar@linux.intel.com>
diff --git a/Documentation/devicetree/bindings/clock/intel,easic-n5x.yaml b/Documentation/devicetree/bindings/clock/intel,easic-n5x.yaml
index 8f45976e946e..e000116a51a4 100644
--- a/Documentation/devicetree/bindings/clock/intel,easic-n5x.yaml
+++ b/Documentation/devicetree/bindings/clock/intel,easic-n5x.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/intel,easic-n5x.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Intel SoCFPGA eASIC N5X platform clock controller binding
+title: Intel SoCFPGA eASIC N5X platform clock controller
maintainers:
- Dinh Nguyen <dinguyen@kernel.org>
diff --git a/Documentation/devicetree/bindings/clock/intel,stratix10.yaml b/Documentation/devicetree/bindings/clock/intel,stratix10.yaml
new file mode 100644
index 000000000000..b4a8be213400
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/intel,stratix10.yaml
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/intel,stratix10.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Intel SoCFPGA Stratix10 platform clock controller
+
+maintainers:
+ - Dinh Nguyen <dinguyen@kernel.org>
+
+properties:
+ compatible:
+ const: intel,stratix10-clkmgr
+
+ '#clock-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@ffd10000 {
+ compatible = "intel,stratix10-clkmgr";
+ reg = <0xffd10000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml b/Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml
new file mode 100644
index 000000000000..01561a0f35d5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/loongson,ls1x-clk.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/loongson,ls1x-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson-1 Clock Controller
+
+maintainers:
+ - Keguang Zhang <keguang.zhang@gmail.com>
+
+properties:
+ compatible:
+ enum:
+ - loongson,ls1b-clk
+ - loongson,ls1c-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - "#clock-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ clkc: clock-controller@1fe78030 {
+ compatible = "loongson,ls1b-clk";
+ reg = <0x1fe78030 0x8>;
+
+ clocks = <&xtal>;
+ #clock-cells = <1>;
+ };
+
+...
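The single input clock (<&xtal> in the example) is an external oscillator that is not part of this binding; following the fixed-clock convention used by the loongson,ls2k-clk example below, it could be described as:

    xtal: clock-xtal {
        compatible = "fixed-clock";
        #clock-cells = <0>;
        clock-frequency = <33000000>; /* board-specific, shown as an example */
        clock-output-names = "xtal";
    };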
diff --git a/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml b/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml
new file mode 100644
index 000000000000..63a59015987e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/loongson,ls2k-clk.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/loongson,ls2k-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson-2 SoC Clock Control Module
+
+maintainers:
+ - Yinbo Zhu <zhuyinbo@loongson.cn>
+
+description: |
+ The Loongson-2 SoC clock control module is an integrated clock controller
+ that generates and supplies clocks to all modules.
+
+properties:
+ compatible:
+ enum:
+ - loongson,ls2k-clk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: 100m ref
+
+ clock-names:
+ items:
+ - const: ref_100m
+
+ '#clock-cells':
+ const: 1
+ description:
+ The clock consumer should specify the desired clock by having the clock
+ ID in its "clocks" phandle cell. See include/dt-bindings/clock/loongson,ls2k-clk.h
+ for the full list of Loongson-2 SoC clock IDs.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ ref_100m: clock-ref-100m {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <100000000>;
+ clock-output-names = "ref_100m";
+ };
+
+ clk: clock-controller@1fe00480 {
+ compatible = "loongson,ls2k-clk";
+ reg = <0x1fe00480 0x58>;
+ #clock-cells = <1>;
+ clocks = <&ref_100m>;
+ clock-names = "ref_100m";
+ };
diff --git a/Documentation/devicetree/bindings/clock/marvell,armada-3700-uart-clock.yaml b/Documentation/devicetree/bindings/clock/marvell,armada-3700-uart-clock.yaml
new file mode 100644
index 000000000000..175f5c8f2bc5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/marvell,armada-3700-uart-clock.yaml
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/marvell,armada-3700-uart-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+title: Marvell Armada 3720 UART clocks
+
+maintainers:
+ - Pali Rohár <pali@kernel.org>
+
+properties:
+ compatible:
+ const: marvell,armada-3700-uart-clock
+
+ reg:
+ items:
+ - description: UART Clock Control Register
+ - description: UART 2 Baud Rate Divisor Register
+
+ clocks:
+ description: |
+ List of parent clocks suitable for UART from the following set:
+ "TBG-A-P", "TBG-B-P", "TBG-A-S", "TBG-B-S", "xtal"
+ The UART clock can use any one from this set; when more are provided,
+ the kernel chooses and configures the most suitable one.
+ It is suggested to specify at least one TBG clock to achieve baud
+ rates above 230400, and also to specify the clock used by the
+ bootloader for the UART (most probably xtal) for a smooth boot log.
+
+ clock-names:
+ items:
+ - const: TBG-A-P
+ - const: TBG-B-P
+ - const: TBG-A-S
+ - const: TBG-B-S
+ - const: xtal
+ minItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ uartclk: clock-controller@12010 {
+ compatible = "marvell,armada-3700-uart-clock";
+ reg = <0x12010 0x4>, <0x12210 0x4>;
+ clocks = <&tbg 0>, <&tbg 1>, <&tbg 2>, <&tbg 3>, <&xtalclk>;
+ clock-names = "TBG-A-P", "TBG-B-P", "TBG-A-S", "TBG-B-S", "xtal";
+ #clock-cells = <1>;
+ };
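A UART node then consumes one of the outputs through the single clock cell; a minimal sketch, assuming cell 0 selects the first UART clock output (the cell assignment shown here is illustrative):

    uart0: serial@12000 {
        /* illustrative consumer of the first UART clock output */
        clocks = <&uartclk 0>;
    };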
diff --git a/Documentation/devicetree/bindings/clock/maxim,max77686.txt b/Documentation/devicetree/bindings/clock/maxim,max77686.txt
index 3472b461ca93..c10849efb444 100644
--- a/Documentation/devicetree/bindings/clock/maxim,max77686.txt
+++ b/Documentation/devicetree/bindings/clock/maxim,max77686.txt
@@ -49,7 +49,7 @@ Example:
max77686: max77686@9 {
compatible = "maxim,max77686";
interrupt-parent = <&wakeup_eint>;
- interrupts = <26 0>;
+ interrupts = <26 IRQ_TYPE_LEVEL_LOW>;
reg = <0x09>;
#clock-cells = <1>;
@@ -74,7 +74,7 @@ Example:
max77802: max77802@9 {
compatible = "maxim,max77802";
interrupt-parent = <&wakeup_eint>;
- interrupts = <26 0>;
+ interrupts = <26 IRQ_TYPE_LEVEL_LOW>;
reg = <0x09>;
#clock-cells = <1>;
diff --git a/Documentation/devicetree/bindings/clock/mediatek,apmixedsys.yaml b/Documentation/devicetree/bindings/clock/mediatek,apmixedsys.yaml
new file mode 100644
index 000000000000..372c1d744bc2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,apmixedsys.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,apmixedsys.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek AP Mixedsys Controller
+
+maintainers:
+ - Michael Turquette <mturquette@baylibre.com>
+ - Stephen Boyd <sboyd@kernel.org>
+
+description:
+ The Mediatek apmixedsys controller provides PLLs to the system.
+ The clock values can be found in <dt-bindings/clock/mt*-clk.h>.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt6797-apmixedsys
+ - mediatek,mt7622-apmixedsys
+ - mediatek,mt7981-apmixedsys
+ - mediatek,mt7986-apmixedsys
+ - mediatek,mt8135-apmixedsys
+ - mediatek,mt8173-apmixedsys
+ - mediatek,mt8516-apmixedsys
+ - items:
+ - const: mediatek,mt7623-apmixedsys
+ - const: mediatek,mt2701-apmixedsys
+ - const: syscon
+ - items:
+ - enum:
+ - mediatek,mt2701-apmixedsys
+ - mediatek,mt2712-apmixedsys
+ - mediatek,mt6765-apmixedsys
+ - mediatek,mt6779-apmixedsys
+ - mediatek,mt6795-apmixedsys
+ - mediatek,mt7629-apmixedsys
+ - mediatek,mt8167-apmixedsys
+ - mediatek,mt8183-apmixedsys
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ apmixedsys: clock-controller@10209000 {
+ compatible = "mediatek,mt8173-apmixedsys";
+ reg = <0x10209000 0x1000>;
+ #clock-cells = <1>;
+ };
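Peripherals reference the apmixedsys PLLs with the IDs from the mt*-clk.h headers mentioned in the description; a minimal sketch, assuming the MT8173 CLK_APMIXED_MAINPLL ID and an illustrative consumer node:

    mmc@11230000 {
        /* illustrative consumer; ID assumed from dt-bindings/clock/mt8173-clk.h */
        clocks = <&apmixedsys CLK_APMIXED_MAINPLL>;
    };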
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt6795-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt6795-clock.yaml
new file mode 100644
index 000000000000..04469eabc8fa
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt6795-clock.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt6795-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Functional Clock Controller for MT6795
+
+maintainers:
+ - AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ - Chun-Jie Chen <chun-jie.chen@mediatek.com>
+
+description: |
+ The clock architecture in MediaTek SoCs is as below:
+ PLLs --> dividers --> muxes --> clock gates
+
+ The devices provide clock gate control in different IP blocks.
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt6795-mfgcfg
+ - mediatek,mt6795-vdecsys
+ - mediatek,mt6795-vencsys
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ mfgcfg: clock-controller@13000000 {
+ compatible = "mediatek,mt6795-mfgcfg";
+ reg = <0 0x13000000 0 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ vdecsys: clock-controller@16000000 {
+ compatible = "mediatek,mt6795-vdecsys";
+ reg = <0 0x16000000 0 0x1000>;
+ #clock-cells = <1>;
+ };
+
+ vencsys: clock-controller@18000000 {
+ compatible = "mediatek,mt6795-vencsys";
+ reg = <0 0x18000000 0 0x1000>;
+ #clock-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt6795-sys-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt6795-sys-clock.yaml
new file mode 100644
index 000000000000..378b761237d3
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt6795-sys-clock.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt6795-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek System Clock Controller for MT6795
+
+maintainers:
+ - AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+ - Chun-Jie Chen <chun-jie.chen@mediatek.com>
+
+description:
+ The Mediatek system clock controller provides various clocks and system
+ configuration functions, such as reset and bus protection, on MT6795.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt6795-apmixedsys
+ - mediatek,mt6795-infracfg
+ - mediatek,mt6795-pericfg
+ - mediatek,mt6795-topckgen
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ topckgen: clock-controller@10000000 {
+ compatible = "mediatek,mt6795-topckgen", "syscon";
+ reg = <0 0x10000000 0 0x1000>;
+ #clock-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt7621-sysc.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt7621-sysc.yaml
index 915f84efd763..b42f0f5c11b7 100644
--- a/Documentation/devicetree/bindings/clock/mediatek,mt7621-sysc.yaml
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt7621-sysc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/mediatek,mt7621-sysc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: MT7621 Clock Device Tree Bindings
+title: MT7621 Clock
maintainers:
- Sergio Paracuellos <sergio.paracuellos@gmail.com>
@@ -22,6 +22,11 @@ description: |
The clocks are provided inside a system controller node.
+ This node is also a reset provider for all the peripherals.
+
+ Reset related bits are defined in:
+ [2]: <include/dt-bindings/reset/mt7621-reset.h>.
+
properties:
compatible:
items:
@@ -37,6 +42,12 @@ properties:
clocks.
const: 1
+ "#reset-cells":
+ description:
+ The first cell indicates the reset bit within the register, see
+ [2] for available resets.
+ const: 1
+
ralink,memctl:
$ref: /schemas/types.yaml#/definitions/phandle
description:
@@ -61,6 +72,7 @@ examples:
compatible = "mediatek,mt7621-sysc", "syscon";
reg = <0x0 0x100>;
#clock-cells = <1>;
+ #reset-cells = <1>;
ralink,memctl = <&memc>;
clock-output-names = "xtal", "cpu", "bus",
"50m", "125m", "150m",
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8186-fhctl.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8186-fhctl.yaml
new file mode 100644
index 000000000000..d00327d12e1e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8186-fhctl.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8186-fhctl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek frequency hopping and spread spectrum clocking control
+
+maintainers:
+ - Edward-JW Yang <edward-jw.yang@mediatek.com>
+
+description: |
+ Frequency hopping control (FHCTL) is a piece of hardware that makes
+ some PLLs adopt a "hopping" mechanism to adjust their frequency.
+ Spread spectrum clocking (SSC) is another function provided by this hardware.
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt6795-fhctl
+ - mediatek,mt8173-fhctl
+ - mediatek,mt8186-fhctl
+ - mediatek,mt8192-fhctl
+ - mediatek,mt8195-fhctl
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ description: Phandles of the PLLs with FHCTL hardware capability.
+ minItems: 1
+ maxItems: 30
+
+ mediatek,hopping-ssc-percent:
+ description: The percentage of spread spectrum clocking for one PLL.
+ minItems: 1
+ maxItems: 30
+ items:
+ default: 0
+ minimum: 0
+ maximum: 8
+
+required:
+ - compatible
+ - reg
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/mt8186-clk.h>
+ fhctl: fhctl@1000ce00 {
+ compatible = "mediatek,mt8186-fhctl";
+ reg = <0x1000ce00 0x200>;
+ clocks = <&apmixedsys CLK_APMIXED_MSDCPLL>;
+ mediatek,hopping-ssc-percent = <3>;
+ };
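The entries of mediatek,hopping-ssc-percent are naturally read as pairing one-to-one with the entries of clocks, so a controller hopping two PLLs lists two percentages. A sketch extending the example above; the second PLL ID (CLK_APMIXED_MAINPLL) is assumed for illustration:

    fhctl@1000ce00 {
        compatible = "mediatek,mt8186-fhctl";
        reg = <0x1000ce00 0x200>;
        /* one SSC percentage per PLL listed in "clocks" */
        clocks = <&apmixedsys CLK_APMIXED_MSDCPLL>,
                 <&apmixedsys CLK_APMIXED_MAINPLL>;
        mediatek,hopping-ssc-percent = <3>, <0>;
    };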
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml
new file mode 100644
index 000000000000..d7214d97b2ba
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8188-clock.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8188-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Functional Clock Controller for MT8188
+
+maintainers:
+ - Garmin Chang <garmin.chang@mediatek.com>
+
+description: |
+ The clock architecture in MediaTek SoCs is as below:
+ PLLs --> dividers --> muxes --> clock gates
+
+ The devices provide clock gate control in different IP blocks.
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt8188-adsp-audio26m
+ - mediatek,mt8188-camsys
+ - mediatek,mt8188-camsys-rawa
+ - mediatek,mt8188-camsys-rawb
+ - mediatek,mt8188-camsys-yuva
+ - mediatek,mt8188-camsys-yuvb
+ - mediatek,mt8188-ccusys
+ - mediatek,mt8188-imgsys
+ - mediatek,mt8188-imgsys-wpe1
+ - mediatek,mt8188-imgsys-wpe2
+ - mediatek,mt8188-imgsys-wpe3
+ - mediatek,mt8188-imgsys1-dip-nr
+ - mediatek,mt8188-imgsys1-dip-top
+ - mediatek,mt8188-imp-iic-wrap-c
+ - mediatek,mt8188-imp-iic-wrap-en
+ - mediatek,mt8188-imp-iic-wrap-w
+ - mediatek,mt8188-ipesys
+ - mediatek,mt8188-mfgcfg
+ - mediatek,mt8188-vdecsys
+ - mediatek,mt8188-vdecsys-soc
+ - mediatek,mt8188-vencsys
+ - mediatek,mt8188-vppsys0
+ - mediatek,mt8188-vppsys1
+ - mediatek,mt8188-wpesys
+ - mediatek,mt8188-wpesys-vpp0
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@11283000 {
+ compatible = "mediatek,mt8188-imp-iic-wrap-c";
+ reg = <0x11283000 0x1000>;
+ #clock-cells = <1>;
+ };
+
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml
new file mode 100644
index 000000000000..4cf8d3af9803
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8188-sys-clock.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8188-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek System Clock Controller for MT8188
+
+maintainers:
+ - Garmin Chang <garmin.chang@mediatek.com>
+
+description: |
+ The clock architecture in MediaTek SoCs is as below:
+ PLLs --> dividers --> muxes --> clock gates
+
+ The apmixedsys provides most of the PLLs, which are generated from the
+ SoC's 26 MHz clock.
+ The topckgen provides the dividers and muxes which supply the clock
+ source to other IP blocks.
+ The infracfg_ao provides clock gates in the peripheral and
+ infrastructure IP blocks.
+ The mcusys provides the mux control to select the clock source in the AP MCU.
+ The device nodes also provide system control capability for configuration.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8188-apmixedsys
+ - mediatek,mt8188-infracfg-ao
+ - mediatek,mt8188-pericfg-ao
+ - mediatek,mt8188-topckgen
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@10000000 {
+ compatible = "mediatek,mt8188-topckgen", "syscon";
+ reg = <0x10000000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8365-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8365-clock.yaml
new file mode 100644
index 000000000000..b327ecb4e524
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8365-clock.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8365-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Functional Clock Controller for MT8365
+
+maintainers:
+ - Markus Schneider-Pargmann <msp@baylibre.com>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8365-apu
+ - mediatek,mt8365-imgsys
+ - mediatek,mt8365-mfgcfg
+ - mediatek,mt8365-vdecsys
+ - mediatek,mt8365-vencsys
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ apu: clock-controller@19020000 {
+ compatible = "mediatek,mt8365-apu", "syscon";
+ reg = <0x19020000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/mediatek,mt8365-sys-clock.yaml b/Documentation/devicetree/bindings/clock/mediatek,mt8365-sys-clock.yaml
new file mode 100644
index 000000000000..643f84660c8e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,mt8365-sys-clock.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,mt8365-sys-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek System Clock Controller for MT8365
+
+maintainers:
+ - Markus Schneider-Pargmann <msp@baylibre.com>
+
+description:
+ The apmixedsys module provides most of the PLLs, which are generated
+ from the SoC's 26 MHz clock.
+ The topckgen provides the dividers and muxes which supply the clock
+ source to other IP blocks.
+ The infracfg_ao and pericfg_ao provide clock gates in the peripheral
+ and infrastructure IP blocks.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - mediatek,mt8365-topckgen
+ - mediatek,mt8365-infracfg
+ - mediatek,mt8365-apmixedsys
+ - mediatek,mt8365-pericfg
+ - mediatek,mt8365-mcucfg
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ topckgen: clock-controller@10000000 {
+ compatible = "mediatek,mt8365-topckgen", "syscon";
+ reg = <0x10000000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/mediatek,topckgen.yaml b/Documentation/devicetree/bindings/clock/mediatek,topckgen.yaml
new file mode 100644
index 000000000000..6d087ded7437
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mediatek,topckgen.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mediatek,topckgen.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Top Clock Generator Controller
+
+maintainers:
+ - Michael Turquette <mturquette@baylibre.com>
+ - Stephen Boyd <sboyd@kernel.org>
+
+description:
+ The Mediatek topckgen controller provides various clocks to the system.
+ The clock values can be found in <dt-bindings/clock/mt*-clk.h>.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt6797-topckgen
+ - mediatek,mt7622-topckgen
+ - mediatek,mt8135-topckgen
+ - mediatek,mt8173-topckgen
+ - mediatek,mt8516-topckgen
+ - items:
+ - const: mediatek,mt7623-topckgen
+ - const: mediatek,mt2701-topckgen
+ - const: syscon
+ - items:
+ - enum:
+ - mediatek,mt2701-topckgen
+ - mediatek,mt2712-topckgen
+ - mediatek,mt6765-topckgen
+ - mediatek,mt6779-topckgen
+ - mediatek,mt6795-topckgen
+ - mediatek,mt7629-topckgen
+ - mediatek,mt7981-topckgen
+ - mediatek,mt7986-topckgen
+ - mediatek,mt8167-topckgen
+ - mediatek,mt8183-topckgen
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ topckgen: clock-controller@10000000 {
+ compatible = "mediatek,mt8173-topckgen";
+ reg = <0x10000000 0x1000>;
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/microchip,lan966x-gck.yaml b/Documentation/devicetree/bindings/clock/microchip,lan966x-gck.yaml
new file mode 100644
index 000000000000..df2bec188706
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/microchip,lan966x-gck.yaml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/microchip,lan966x-gck.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip LAN966X Generic Clock Controller
+
+maintainers:
+ - Kavyasree Kotagiri <kavyasree.kotagiri@microchip.com>
+
+description: |
+ The LAN966X Generic clock controller contains 3 PLLs: cpu_clk,
+ ddr_clk and sys_clk. This clock controller generates and supplies
+ clocks to various peripherals within the SoC.
+
+properties:
+ compatible:
+ const: microchip,lan966x-gck
+
+ reg:
+ minItems: 1
+ items:
+ - description: Generic clock registers
+ - description: Optional gate clock registers
+
+ clocks:
+ items:
+ - description: CPU clock source
+ - description: DDR clock source
+ - description: System clock source
+
+ clock-names:
+ items:
+ - const: cpu
+ - const: ddr
+ - const: sys
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clks: clock-controller@e00c00a8 {
+ compatible = "microchip,lan966x-gck";
+ #clock-cells = <1>;
+ clocks = <&cpu_clk>, <&ddr_clk>, <&sys_clk>;
+ clock-names = "cpu", "ddr", "sys";
+ reg = <0xe00c00a8 0x38>;
+ };
+...
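Consumers select one of the generic clocks by ID; a minimal sketch against the clks label above, with the GCK_ID_FLEXCOM0 ID assumed from dt-bindings/clock/microchip,lan966x.h:

    flexcom@e0040000 {
        /* illustrative consumer; ID assumed from microchip,lan966x.h */
        clocks = <&clks GCK_ID_FLEXCOM0>;
    };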
diff --git a/Documentation/devicetree/bindings/clock/microchip,mpfs-ccc.yaml b/Documentation/devicetree/bindings/clock/microchip,mpfs-ccc.yaml
new file mode 100644
index 000000000000..f1770360798f
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/microchip,mpfs-ccc.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/microchip,mpfs-ccc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip PolarFire SoC Fabric Clock Conditioning Circuitry
+
+maintainers:
+ - Conor Dooley <conor.dooley@microchip.com>
+
+description: |
+ The Microchip PolarFire SoC has 4 Clock Conditioning Circuitry blocks.
+ Each of these blocks contains two PLLs and two DLLs, and the blocks are
+ located in the four corners of the FPGA. For more information see
+ "PolarFire SoC FPGA Clocking Resources" at:
+ https://onlinedocs.microchip.com/pr/GUID-8F0CC4C0-0317-4262-89CA-CE7773ED1931-en-US-1/index.html
+
+properties:
+ compatible:
+ const: microchip,mpfs-ccc
+
+ reg:
+ items:
+ - description: PLL0's control registers
+ - description: PLL1's control registers
+ - description: DLL0's control registers
+ - description: DLL1's control registers
+
+ clocks:
+ description:
+ The CCC PLLs have two input clocks. Both clocks must be provided even
+ if they are identical.
+ minItems: 2
+ items:
+ - description: PLL0's refclk0
+ - description: PLL0's refclk1
+ - description: PLL1's refclk0
+ - description: PLL1's refclk1
+ - description: DLL0's refclk
+ - description: DLL1's refclk
+
+ clock-names:
+ minItems: 2
+ items:
+ - const: pll0_ref0
+ - const: pll0_ref1
+ - const: pll1_ref0
+ - const: pll1_ref1
+ - const: dll0_ref
+ - const: dll1_ref
+
+ '#clock-cells':
+ const: 1
+ description: |
+ The clock consumer should specify the desired clock by having the clock
+ ID in its "clocks" phandle cell.
+ See include/dt-bindings/clock/microchip,mpfs-clock.h for the full list of
+ PolarFire clock IDs.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@38100000 {
+ compatible = "microchip,mpfs-ccc";
+ reg = <0x38010000 0x1000>, <0x38020000 0x1000>,
+ <0x39010000 0x1000>, <0x39020000 0x1000>;
+ #clock-cells = <1>;
+ clocks = <&refclk_ccc>, <&refclk_ccc>, <&refclk_ccc>, <&refclk_ccc>,
+ <&refclk_ccc>, <&refclk_ccc>;
+ clock-names = "pll0_ref0", "pll0_ref1", "pll1_ref0", "pll1_ref1",
+ "dll0_ref", "dll1_ref";
+ };
diff --git a/Documentation/devicetree/bindings/clock/microchip,mpfs-clkcfg.yaml b/Documentation/devicetree/bindings/clock/microchip,mpfs-clkcfg.yaml
new file mode 100644
index 000000000000..e4e1c31267d2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/microchip,mpfs-clkcfg.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/microchip,mpfs-clkcfg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip PolarFire Clock Control Module
+
+maintainers:
+ - Daire McNamara <daire.mcnamara@microchip.com>
+
+description: |
+ Microchip PolarFire clock control (CLKCFG) is an integrated clock controller,
+ which gates and enables all peripheral clocks.
+
+ This device tree binding describes 33 gate clocks. Clocks are referenced by
+ user nodes by the CLKCFG node phandle and the clock index in the group, from
+ 0 to 32.
+
+properties:
+ compatible:
+ const: microchip,mpfs-clkcfg
+
+ reg:
+ items:
+ - description: |
+ clock config registers:
+ These registers contain enable, reset and divider tables for the cpu,
+ axi, ahb and rtc/mtimer reference clocks, as well as enable and reset
+ controls for the peripheral clocks.
+ - description: |
+ mss pll dri registers:
+ Block of registers responsible for dynamic reconfiguration of the mss
+ pll
+
+ clocks:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+ description: |
+ The clock consumer should specify the desired clock by having the clock
+ ID in its "clocks" phandle cell.
+ See include/dt-bindings/clock/microchip,mpfs-clock.h for the full list of
+ PolarFire clock IDs.
+
+ resets:
+ maxItems: 1
+
+ '#reset-cells':
+ description:
+ The AHB/AXI peripherals on the PolarFire SoC have reset support, namely
+ those with clock IDs from CLK_ENVM to CLK_CFM. The reset consumer
+ should specify the desired peripheral via the clock ID in its "resets"
+ phandle cell.
+ See include/dt-bindings/clock/microchip,mpfs-clock.h for the full list of
+ PolarFire clock IDs.
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ # Clock Config node:
+ - |
+ #include <dt-bindings/clock/microchip,mpfs-clock.h>
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+ clkcfg: clock-controller@20002000 {
+ compatible = "microchip,mpfs-clkcfg";
+ reg = <0x0 0x20002000 0x0 0x1000>, <0x0 0x3E001000 0x0 0x1000>;
+ clocks = <&ref>;
+ #clock-cells = <1>;
+ };
+ };
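As the #reset-cells description explains, a peripheral uses its clock ID for the reset as well; a minimal sketch against the clkcfg label above, assuming the CLK_MMC ID from microchip,mpfs-clock.h (the mmc node is illustrative):

    mmc@20008000 {
        /* the same ID selects the peripheral's clock gate and its reset */
        clocks = <&clkcfg CLK_MMC>;
        resets = <&clkcfg CLK_MMC>;
    };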
diff --git a/Documentation/devicetree/bindings/clock/milbeaut-clock.yaml b/Documentation/devicetree/bindings/clock/milbeaut-clock.yaml
index 6d39344d2b70..0af1c569eb32 100644
--- a/Documentation/devicetree/bindings/clock/milbeaut-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/milbeaut-clock.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/milbeaut-clock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Milbeaut SoCs Clock Controller Binding
+title: Milbeaut SoCs Clock Controller
maintainers:
- Taichi Sugaya <sugaya.taichi@socionext.com>
diff --git a/Documentation/devicetree/bindings/clock/mstar,msc313-cpupll.yaml b/Documentation/devicetree/bindings/clock/mstar,msc313-cpupll.yaml
new file mode 100644
index 000000000000..a9ad7ab5230c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/mstar,msc313-cpupll.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/mstar,msc313-cpupll.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MStar/Sigmastar MSC313 CPU PLL
+
+maintainers:
+ - Daniel Palmer <daniel@thingy.jp>
+
+description: |
+ The MStar/SigmaStar MSC313 and later ARMv7 chips have a scalable
+ PLL that can be used as the clock source for the CPU(s).
+
+properties:
+ compatible:
+ const: mstar,msc313-cpupll
+
+ "#clock-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#clock-cells"
+ - clocks
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/mstar-msc313-mpll.h>
+ cpupll: cpupll@206400 {
+ compatible = "mstar,msc313-cpupll";
+ reg = <0x206400 0x200>;
+ #clock-cells = <1>;
+ clocks = <&mpll MSTAR_MSC313_MPLL_DIV2>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml b/Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml
new file mode 100644
index 000000000000..b901ca13cd25
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/nuvoton,npcm845-clk.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/nuvoton,npcm845-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Nuvoton NPCM8XX Clock Controller
+
+maintainers:
+ - Tomer Maimon <tmaimon77@gmail.com>
+
+description: |
+ Nuvoton Arbel BMC NPCM8XX contains an integrated clock controller, which
+ generates and supplies clocks to all modules within the BMC.
+
+properties:
+ compatible:
+ enum:
+ - nuvoton,npcm845-clk
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+ description:
+ See include/dt-bindings/clock/nuvoton,npcm8xx-clock.h for the full
+ list of NPCM8XX clock IDs.
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ ahb {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ clock-controller@f0801000 {
+ compatible = "nuvoton,npcm845-clk";
+ reg = <0x0 0xf0801000 0x0 0x1000>;
+ #clock-cells = <1>;
+ };
+ };
+...
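Consumers reference the controller with IDs from nuvoton,npcm8xx-clock.h; a minimal sketch, assuming a clk label on the controller node and an illustrative NPCM8XX_CLK_APB2 ID:

    serial@f0001000 {
        /* illustrative consumer; ID assumed from nuvoton,npcm8xx-clock.h */
        clocks = <&clk NPCM8XX_CLK_APB2>;
    };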
diff --git a/Documentation/devicetree/bindings/clock/nvidia,tegra124-car.yaml b/Documentation/devicetree/bindings/clock/nvidia,tegra124-car.yaml
index ec7ab1483652..1b2181f6d440 100644
--- a/Documentation/devicetree/bindings/clock/nvidia,tegra124-car.yaml
+++ b/Documentation/devicetree/bindings/clock/nvidia,tegra124-car.yaml
@@ -106,10 +106,3 @@ examples:
#clock-cells = <1>;
#reset-cells = <1>;
};
-
- usb-controller@c5004000 {
- compatible = "nvidia,tegra20-ehci";
- reg = <0xc5004000 0x4000>;
- clocks = <&car TEGRA124_CLK_USB2>;
- resets = <&car TEGRA124_CLK_USB2>;
- };
diff --git a/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt b/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt
index 958e0ad78c52..f7d347385b57 100644
--- a/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt
+++ b/Documentation/devicetree/bindings/clock/nvidia,tegra124-dfll.txt
@@ -136,7 +136,7 @@ clock@70110000 {
};
/* pinmux nodes added for completeness. Binding doc can be found in:
- * Documentation/devicetree/bindings/pinctrl/nvidia,tegra210-pinmux.txt
+ * Documentation/devicetree/bindings/pinctrl/nvidia,tegra210-pinmux.yaml
*/
pinmux: pinmux@700008d4 {
diff --git a/Documentation/devicetree/bindings/clock/nvidia,tegra20-car.yaml b/Documentation/devicetree/bindings/clock/nvidia,tegra20-car.yaml
index 459d2a525393..bee2dd4b29bf 100644
--- a/Documentation/devicetree/bindings/clock/nvidia,tegra20-car.yaml
+++ b/Documentation/devicetree/bindings/clock/nvidia,tegra20-car.yaml
@@ -42,6 +42,36 @@ properties:
"#reset-cells":
const: 1
+patternProperties:
+ "^(sclk)|(pll-[cem])$":
+ type: object
+ properties:
+ compatible:
+ enum:
+ - nvidia,tegra20-sclk
+ - nvidia,tegra30-sclk
+ - nvidia,tegra30-pllc
+ - nvidia,tegra30-plle
+ - nvidia,tegra30-pllm
+
+ operating-points-v2: true
+
+ clocks:
+ items:
+ - description: node's clock
+
+ power-domains:
+ maxItems: 1
+ description: phandle to the core SoC power domain
+
+ required:
+ - compatible
+ - operating-points-v2
+ - clocks
+ - power-domains
+
+ additionalProperties: false
+
required:
- compatible
- reg
@@ -59,11 +89,11 @@ examples:
reg = <0x60006000 0x1000>;
#clock-cells = <1>;
#reset-cells = <1>;
- };
- usb-controller@c5004000 {
- compatible = "nvidia,tegra20-ehci";
- reg = <0xc5004000 0x4000>;
- clocks = <&car TEGRA20_CLK_USB2>;
- resets = <&car TEGRA20_CLK_USB2>;
+ sclk {
+ compatible = "nvidia,tegra20-sclk";
+ operating-points-v2 = <&opp_table>;
+ clocks = <&tegra_car TEGRA20_CLK_SCLK>;
+ power-domains = <&domain>;
+ };
};
diff --git a/Documentation/devicetree/bindings/clock/pwm-clock.txt b/Documentation/devicetree/bindings/clock/pwm-clock.txt
deleted file mode 100644
index 83db876b3b90..000000000000
--- a/Documentation/devicetree/bindings/clock/pwm-clock.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Binding for an external clock signal driven by a PWM pin.
-
-This binding uses the common clock binding[1] and the common PWM binding[2].
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-[2] Documentation/devicetree/bindings/pwm/pwm.txt
-
-Required properties:
-- compatible : shall be "pwm-clock".
-- #clock-cells : from common clock binding; shall be set to 0.
-- pwms : from common PWM binding; this determines the clock frequency
- via the period given in the PWM specifier.
-
-Optional properties:
-- clock-output-names : From common clock binding.
-- clock-frequency : Exact output frequency, in case the PWM period
- is not exact but was rounded to nanoseconds.
-
-Example:
- clock {
- compatible = "pwm-clock";
- #clock-cells = <0>;
- clock-frequency = <25000000>;
- clock-output-names = "mipi_mclk";
- pwms = <&pwm2 0 40>; /* 1 / 40 ns = 25 MHz */
- };
diff --git a/Documentation/devicetree/bindings/clock/pwm-clock.yaml b/Documentation/devicetree/bindings/clock/pwm-clock.yaml
new file mode 100644
index 000000000000..f88ecb2995e0
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/pwm-clock.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/pwm-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: An external clock signal driven by a PWM pin.
+
+maintainers:
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+properties:
+ compatible:
+ const: pwm-clock
+
+ '#clock-cells':
+ const: 0
+
+ clock-frequency:
+ description: Exact output frequency, in case the PWM period is not exact
+ but was rounded to nanoseconds.
+
+ clock-output-names:
+ maxItems: 1
+
+ pwms:
+ maxItems: 1
+
+required:
+ - compatible
+ - '#clock-cells'
+ - pwms
+
+additionalProperties: false
+
+examples:
+ - |
+ clock {
+ compatible = "pwm-clock";
+ #clock-cells = <0>;
+ clock-frequency = <25000000>;
+ clock-output-names = "mipi_mclk";
+ pwms = <&pwm2 0 40>; /* 1 / 40 ns = 25 MHz */
+ };
+...
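clock-frequency only matters when the requested rate does not map to a whole number of nanoseconds. In the example above, 25 MHz corresponds to a period of exactly 1/25 MHz = 40 ns, so the property is informational. For, say, a 27 MHz output the ideal period is 1/27 MHz, about 37.037 ns, which has to be rounded to 37 ns in the PWM specifier, and clock-frequency then records the exact intended rate; a sketch:

    clock {
        compatible = "pwm-clock";
        #clock-cells = <0>;
        /* 1 / 37 ns is ~27.03 MHz, not exactly 27 MHz, so record the
         * intended rate explicitly */
        clock-frequency = <27000000>;
        clock-output-names = "mipi_mclk";
        pwms = <&pwm2 0 37>; /* period rounded from 37.037 ns */
    };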
diff --git a/Documentation/devicetree/bindings/clock/qcom,a53pll.yaml b/Documentation/devicetree/bindings/clock/qcom,a53pll.yaml
index fbd758470b88..659669bf224b 100644
--- a/Documentation/devicetree/bindings/clock/qcom,a53pll.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,a53pll.yaml
@@ -4,10 +4,10 @@
$id: http://devicetree.org/schemas/clock/qcom,a53pll.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm A53 PLL Binding
+title: Qualcomm A53 PLL clock
maintainers:
- - Sivaprakash Murugesan <sivaprak@codeaurora.org>
+ - Bjorn Andersson <andersson@kernel.org>
description:
The A53 PLL on a few Qualcomm platforms is the main CPU PLL used for
@@ -16,7 +16,9 @@ description:
properties:
compatible:
enum:
+ - qcom,ipq5332-a53pll
- qcom,ipq6018-a53pll
+ - qcom,ipq8074-a53pll
- qcom,msm8916-a53pll
- qcom,msm8939-a53pll
@@ -44,14 +46,14 @@ required:
additionalProperties: false
examples:
- #Example 1 - A53 PLL found on MSM8916 devices
+ # Example 1 - A53 PLL found on MSM8916 devices
- |
a53pll: clock@b016000 {
compatible = "qcom,msm8916-a53pll";
reg = <0xb016000 0x40>;
#clock-cells = <0>;
};
- #Example 2 - A53 PLL found on IPQ6018 devices
+ # Example 2 - A53 PLL found on IPQ6018 devices
- |
a53pll_ipq: clock-controller@b116000 {
compatible = "qcom,ipq6018-a53pll";
diff --git a/Documentation/devicetree/bindings/clock/qcom,a7pll.yaml b/Documentation/devicetree/bindings/clock/qcom,a7pll.yaml
index 8666e995725f..809c34eb7d5a 100644
--- a/Documentation/devicetree/bindings/clock/qcom,a7pll.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,a7pll.yaml
@@ -4,13 +4,13 @@
$id: http://devicetree.org/schemas/clock/qcom,a7pll.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm A7 PLL Binding
+title: Qualcomm A7 PLL clock
maintainers:
- Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
description:
- The A7 PLL on the Qualcomm platforms like SDX55 is used to provide high
+ The A7 PLL on the Qualcomm platforms like SDX55, SDX65 is used to provide high
frequency clock to the CPU.
properties:
diff --git a/Documentation/devicetree/bindings/clock/qcom,aoncc-sm8250.yaml b/Documentation/devicetree/bindings/clock/qcom,aoncc-sm8250.yaml
index c40a74b5d672..8b8932bd5a92 100644
--- a/Documentation/devicetree/bindings/clock/qcom,aoncc-sm8250.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,aoncc-sm8250.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/qcom,aoncc-sm8250.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for LPASS Always ON Clock Controller on SM8250 SoCs
+title: LPASS Always ON Clock Controller on SM8250 SoCs
maintainers:
- Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
@@ -17,7 +17,7 @@ description: |
properties:
compatible:
- const: qcom,sm8250-lpass-aon
+ const: qcom,sm8250-lpass-aoncc
reg:
maxItems: 1
@@ -28,11 +28,13 @@ properties:
clocks:
items:
- description: LPASS Core voting clock
+ - description: LPASS Audio codec voting clock
- description: Glitch Free Mux register clock
clock-names:
items:
- const: core
+ - const: audio
- const: bus
required:
@@ -50,9 +52,10 @@ examples:
#include <dt-bindings/sound/qcom,q6afe.h>
clock-controller@3800000 {
#clock-cells = <1>;
- compatible = "qcom,sm8250-lpass-aon";
+ compatible = "qcom,sm8250-lpass-aoncc";
reg = <0x03380000 0x40000>;
clocks = <&q6afecc LPASS_HW_MACRO_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
+ <&q6afecc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
<&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>;
- clock-names = "core", "bus";
+ clock-names = "core", "audio", "bus";
};
diff --git a/Documentation/devicetree/bindings/clock/qcom,audiocc-sm8250.yaml b/Documentation/devicetree/bindings/clock/qcom,audiocc-sm8250.yaml
index 915d76206ad0..cfca888f6014 100644
--- a/Documentation/devicetree/bindings/clock/qcom,audiocc-sm8250.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,audiocc-sm8250.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/qcom,audiocc-sm8250.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for LPASS Audio Clock Controller on SM8250 SoCs
+title: LPASS Audio Clock Controller on SM8250 SoCs
maintainers:
- Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
@@ -28,11 +28,13 @@ properties:
clocks:
items:
- description: LPASS Core voting clock
+ - description: LPASS Audio codec voting clock
- description: Glitch Free Mux register clock
clock-names:
items:
- const: core
+ - const: audio
- const: bus
required:
@@ -53,6 +55,7 @@ examples:
compatible = "qcom,sm8250-lpass-audiocc";
reg = <0x03300000 0x30000>;
clocks = <&q6afecc LPASS_HW_MACRO_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
+ <&q6afecc LPASS_HW_DCODEC_VOTE LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
<&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>;
- clock-names = "core", "bus";
+ clock-names = "core", "audio", "bus";
};
diff --git a/Documentation/devicetree/bindings/clock/qcom,camcc-sm8250.yaml b/Documentation/devicetree/bindings/clock/qcom,camcc-sm8250.yaml
index 9f239c3960d1..426335a2841c 100644
--- a/Documentation/devicetree/bindings/clock/qcom,camcc-sm8250.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,camcc-sm8250.yaml
@@ -4,16 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,camcc-sm8250.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Camera Clock & Reset Controller Binding for SM8250
+title: Qualcomm Camera Clock & Reset Controller on SM8250
maintainers:
- Jonathan Marek <jonathan@marek.ca>
description: |
- Qualcomm camera clock control module which supports the clocks, resets and
+ Qualcomm camera clock control module provides the clocks, resets and
power domains on SM8250.
- See also dt-bindings/clock/qcom,camcc-sm8250.h
+ See also:: include/dt-bindings/clock/qcom,camcc-sm8250.h
properties:
compatible:
@@ -21,12 +21,16 @@ properties:
clocks:
items:
+ - description: AHB
- description: Board XO source
+ - description: Board active XO source
- description: Sleep clock source
clock-names:
items:
+ - const: iface
- const: bi_tcxo
+ - const: bi_tcxo_ao
- const: sleep_clk
'#clock-cells':
@@ -38,9 +42,18 @@ properties:
'#power-domain-cells':
const: 1
+ power-domains:
+ items:
+ - description: MMCX power domain
+
reg:
maxItems: 1
+ required-opps:
+ maxItems: 1
+ description:
+ OPP node describing required MMCX performance point.
+
required:
- compatible
- reg
@@ -54,13 +67,16 @@ additionalProperties: false
examples:
- |
+ #include <dt-bindings/clock/qcom,gcc-sm8250.h>
#include <dt-bindings/clock/qcom,rpmh.h>
clock-controller@ad00000 {
compatible = "qcom,sm8250-camcc";
reg = <0x0ad00000 0x10000>;
- clocks = <&rpmhcc RPMH_CXO_CLK>,
+ clocks = <&gcc GCC_CAMERA_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>,
+ <&rpmhcc RPMH_CXO_CLK_A>,
<&sleep_clk>;
- clock-names = "bi_tcxo", "sleep_clk";
+ clock-names = "iface", "bi_tcxo", "bi_tcxo_ao", "sleep_clk";
#clock-cells = <1>;
#reset-cells = <1>;
#power-domain-cells = <1>;
diff --git a/Documentation/devicetree/bindings/clock/qcom,camcc.txt b/Documentation/devicetree/bindings/clock/qcom,camcc.txt
deleted file mode 100644
index c5eb6694fda9..000000000000
--- a/Documentation/devicetree/bindings/clock/qcom,camcc.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-Qualcomm Camera Clock & Reset Controller Binding
-------------------------------------------------
-
-Required properties :
-- compatible : shall contain "qcom,sdm845-camcc".
-- reg : shall contain base register location and length.
-- #clock-cells : from common clock binding, shall contain 1.
-- #reset-cells : from common reset binding, shall contain 1.
-- #power-domain-cells : from generic power domain binding, shall contain 1.
-
-Example:
- camcc: clock-controller@ad00000 {
- compatible = "qcom,sdm845-camcc";
- reg = <0xad00000 0x10000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- #power-domain-cells = <1>;
- };
diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sc8280xp.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sc8280xp.yaml
new file mode 100644
index 000000000000..3cb996b2c9d5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sc8280xp.yaml
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,dispcc-sc8280xp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller on SC8280XP
+
+maintainers:
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+
+description: |
+ The Qualcomm display clock control module provides the clocks, resets and
+ power domains for the two MDSS instances on SC8280XP.
+
+ See also:
+ include/dt-bindings/clock/qcom,dispcc-sc8280xp.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sc8280xp-dispcc0
+ - qcom,sc8280xp-dispcc1
+
+ clocks:
+ items:
+ - description: AHB interface clock
+ - description: SoC CXO clock
+ - description: SoC sleep clock
+ - description: DisplayPort 0 link clock
+ - description: DisplayPort 0 VCO div clock
+ - description: DisplayPort 1 link clock
+ - description: DisplayPort 1 VCO div clock
+ - description: DisplayPort 2 link clock
+ - description: DisplayPort 2 VCO div clock
+ - description: DisplayPort 3 link clock
+ - description: DisplayPort 3 VCO div clock
+ - description: DSI 0 PLL byte clock
+ - description: DSI 0 PLL DSI clock
+ - description: DSI 1 PLL byte clock
+ - description: DSI 1 PLL DSI clock
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+ power-domains:
+ items:
+ - description: MMCX power domain
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sc8280xp.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+ clock-controller@af00000 {
+ compatible = "qcom,sc8280xp-dispcc0";
+ reg = <0x0af00000 0x20000>;
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>,
+ <&sleep_clk>,
+ <&mdss0_dp_phy0 0>,
+ <&mdss0_dp_phy0 1>,
+ <&mdss0_dp_phy1 0>,
+ <&mdss0_dp_phy1 1>,
+ <&mdss0_dp_phy2 0>,
+ <&mdss0_dp_phy2 1>,
+ <&mdss0_dp_phy3 0>,
+ <&mdss0_dp_phy3 1>,
+ <&mdss0_dsi0_phy 0>,
+ <&mdss0_dsi0_phy 1>,
+ <&mdss0_dsi1_phy 0>,
+ <&mdss0_dsi1_phy 1>;
+ power-domains = <&rpmhpd SC8280XP_MMCX>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6125.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6125.yaml
new file mode 100644
index 000000000000..8a210c4c5f82
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6125.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,dispcc-sm6125.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock Controller on SM6125
+
+maintainers:
+ - Martin Botka <martin.botka@somainline.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks and power domains
+ on SM6125.
+
+ See also:: include/dt-bindings/clock/qcom,dispcc-sm6125.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm6125-dispcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Byte clock from DSI PHY0
+ - description: Pixel clock from DSI PHY0
+ - description: Pixel clock from DSI PHY1
+ - description: Link clock from DP PHY
+ - description: VCO DIV clock from DP PHY
+ - description: AHB config clock from GCC
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: dsi0_phy_pll_out_byteclk
+ - const: dsi0_phy_pll_out_dsiclk
+ - const: dsi1_phy_pll_out_dsiclk
+ - const: dp_phy_pll_link_clk
+ - const: dp_phy_pll_vco_div_clk
+ - const: cfg_ahb_clk
+
+ '#clock-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/clock/qcom,gcc-sm6125.h>
+ clock-controller@5f00000 {
+ compatible = "qcom,sm6125-dispcc";
+ reg = <0x5f00000 0x20000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&dsi0_phy 0>,
+ <&dsi0_phy 1>,
+ <&dsi1_phy 1>,
+ <&dp_phy 0>,
+ <&dp_phy 1>,
+ <&gcc GCC_DISP_AHB_CLK>;
+ clock-names = "bi_tcxo",
+ "dsi0_phy_pll_out_byteclk",
+ "dsi0_phy_pll_out_dsiclk",
+ "dsi1_phy_pll_out_dsiclk",
+ "dp_phy_pll_link_clk",
+ "dp_phy_pll_vco_div_clk",
+ "cfg_ahb_clk";
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml
new file mode 100644
index 000000000000..8efac3fb159f
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm6350.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,dispcc-sm6350.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller on SM6350
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@somainline.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SM6350.
+
+ See also:: include/dt-bindings/clock/qcom,dispcc-sm6350.h
+
+properties:
+ compatible:
+ const: qcom,sm6350-dispcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: GPLL0 source from GCC
+ - description: Byte clock from DSI PHY
+ - description: Pixel clock from DSI PHY
+ - description: Link clock from DP PHY
+ - description: VCO DIV clock from DP PHY
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: gcc_disp_gpll0_clk
+ - const: dsi0_phy_pll_out_byteclk
+ - const: dsi0_phy_pll_out_dsiclk
+ - const: dp_phy_pll_link_clk
+ - const: dp_phy_pll_vco_div_clk
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sm6350.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@af00000 {
+ compatible = "qcom,sm6350-dispcc";
+ reg = <0x0af00000 0x20000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&gcc GCC_DISP_GPLL0_CLK>,
+ <&dsi_phy 0>,
+ <&dsi_phy 1>,
+ <&dp_phy 0>,
+ <&dp_phy 1>;
+ clock-names = "bi_tcxo",
+ "gcc_disp_gpll0_clk",
+ "dsi0_phy_pll_out_byteclk",
+ "dsi0_phy_pll_out_dsiclk",
+ "dp_phy_pll_link_clk",
+ "dp_phy_pll_vco_div_clk";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml
index 6667261dc665..d6774db257f0 100644
--- a/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,dispcc-sm8x50.yaml
@@ -4,18 +4,19 @@
$id: http://devicetree.org/schemas/clock/qcom,dispcc-sm8x50.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Display Clock & Reset Controller Binding for SM8150/SM8250
+title: Qualcomm Display Clock & Reset Controller on SM8150/SM8250/SM8350
maintainers:
- Jonathan Marek <jonathan@marek.ca>
description: |
- Qualcomm display clock control module which supports the clocks, resets and
- power domains on SM8150 and SM8250.
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SM8150/SM8250/SM8350.
- See also:
- dt-bindings/clock/qcom,dispcc-sm8150.h
- dt-bindings/clock/qcom,dispcc-sm8250.h
+ See also::
+ include/dt-bindings/clock/qcom,dispcc-sm8150.h
+ include/dt-bindings/clock/qcom,dispcc-sm8250.h
+ include/dt-bindings/clock/qcom,dispcc-sm8350.h
properties:
compatible:
@@ -23,6 +24,7 @@ properties:
- qcom,sc8180x-dispcc
- qcom,sm8150-dispcc
- qcom,sm8250-dispcc
+ - qcom,sm8350-dispcc
clocks:
items:
@@ -56,6 +58,16 @@ properties:
reg:
maxItems: 1
+ power-domains:
+ description:
+ A phandle and PM domain specifier for the MMCX power domain.
+ maxItems: 1
+
+ required-opps:
+ description:
+ A phandle to an OPP node describing required MMCX performance point.
+ maxItems: 1
+
required:
- compatible
- reg
@@ -70,6 +82,7 @@ additionalProperties: false
examples:
- |
#include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
clock-controller@af00000 {
compatible = "qcom,sm8250-dispcc";
reg = <0x0af00000 0x10000>;
@@ -90,5 +103,7 @@ examples:
#clock-cells = <1>;
#reset-cells = <1>;
#power-domain-cells = <1>;
+ power-domains = <&rpmhpd SM8250_MMCX>;
+ required-opps = <&rpmhpd_opp_low_svs>;
};
...
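The new required-opps property names an OPP child of the rpmhpd provider, pinning a minimum MMCX performance level while dispcc is active. A provider-side sketch of where &rpmhpd_opp_low_svs would come from, following the usual SM8250 layout (node and label names here are assumptions):

    #include <dt-bindings/power/qcom-rpmpd.h>

    rpmhpd: power-controller {
        compatible = "qcom,sm8250-rpmhpd";
        #power-domain-cells = <1>;
        operating-points-v2 = <&rpmhpd_opp_table>;

        rpmhpd_opp_table: opp-table {
            compatible = "operating-points-v2";

            /* the OPP node that required-opps points at */
            rpmhpd_opp_low_svs: opp4 {
                opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS>;
            };
        };
    };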
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-apq8064.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-apq8064.yaml
index 8e2eac6cbfb9..09cd7a786871 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-apq8064.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-apq8064.yaml
@@ -4,39 +4,53 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-apq8064.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for APQ8064
+title: Qualcomm Global Clock & Reset Controller on APQ8064/MSM8960
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on APQ8064.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on APQ8064.
- See also:
- - dt-bindings/clock/qcom,gcc-msm8960.h
- - dt-bindings/reset/qcom,gcc-msm8960.h
+ See also::
+ include/dt-bindings/clock/qcom,gcc-msm8960.h
+ include/dt-bindings/reset/qcom,gcc-msm8960.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
properties:
compatible:
- const: qcom,gcc-apq8064
-
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
+ oneOf:
+ - items:
+ - enum:
+ - qcom,gcc-apq8064
+ - qcom,gcc-msm8960
+ - const: syscon
+ - enum:
+ - qcom,gcc-apq8064
+ - qcom,gcc-msm8960
+ deprecated: true
+
+ thermal-sensor:
+ description: child tsens device
+ $ref: /schemas/thermal/qcom-tsens.yaml#
+
+ clocks:
+ maxItems: 3
+
+ clock-names:
+ items:
+ - const: cxo
+ - const: pxo
+ - const: pll4
nvmem-cells:
minItems: 1
maxItems: 2
+ deprecated: true
description:
Qualcomm TSENS (thermal sensor device) on some devices can
be part of GCC and hence the TSENS properties can also be part
@@ -46,39 +60,39 @@ properties:
nvmem-cell-names:
minItems: 1
+ deprecated: true
items:
- const: calib
- const: calib_backup
'#thermal-sensor-cells':
const: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
+ deprecated: true
required:
- compatible
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
- - nvmem-cells
- - nvmem-cell-names
- - '#thermal-sensor-cells'
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
clock-controller@900000 {
- compatible = "qcom,gcc-apq8064";
+ compatible = "qcom,gcc-apq8064", "syscon";
reg = <0x00900000 0x4000>;
- nvmem-cells = <&tsens_calib>, <&tsens_backup>;
- nvmem-cell-names = "calib", "calib_backup";
#clock-cells = <1>;
#reset-cells = <1>;
#power-domain-cells = <1>;
- #thermal-sensor-cells = <1>;
+
+ thermal-sensor {
+ compatible = "qcom,msm8960-tsens";
+
+ nvmem-cells = <&tsens_calib>, <&tsens_backup>;
+ nvmem-cell-names = "calib", "calib_backup";
+ interrupts = <0 178 4>;
+ interrupt-names = "uplow";
+
+ #qcom,sensors = <11>;
+ #thermal-sensor-cells = <1>;
+ };
};
...
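Moving tsens under GCC does not change how thermal zones consume it: #thermal-sensor-cells = <1> still lets a zone select one of the 11 sensors by index. A sketch, assuming the child node carries a tsens label (the sensor index is arbitrary):

    thermal-zones {
        cpu-thermal {
            polling-delay-passive = <250>;
            polling-delay = <1000>;

            /* cell selects sensor 0 of the 11 advertised in #qcom,sensors */
            thermal-sensors = <&tsens 0>;
        };
    };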
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-apq8084.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-apq8084.yaml
new file mode 100644
index 000000000000..d84608269080
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-apq8084.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-apq8084.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on APQ8084
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on APQ8084.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-apq8084.h
+ include/dt-bindings/reset/qcom,gcc-apq8084.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ const: qcom,gcc-apq8084
+
+ clocks:
+ items:
+ - description: XO source
+ - description: Sleep clock source
+ - description: UFS RX symbol 0 clock
+ - description: UFS RX symbol 1 clock
+ - description: UFS TX symbol 0 clock
+ - description: UFS TX symbol 1 clock
+ - description: SATA ASIC0 clock
+ - description: SATA RX clock
+ - description: PCIe PIPE clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+ - const: ufs_rx_symbol_0_clk_src
+ - const: ufs_rx_symbol_1_clk_src
+ - const: ufs_tx_symbol_0_clk_src
+ - const: ufs_tx_symbol_1_clk_src
+ - const: sata_asic0_clk
+ - const: sata_rx_clk
+ - const: pcie_pipe
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ /* UFS PHY on APQ8084 is not supported (yet), so these bindings just serve as an example */
+ clock-controller@fc400000 {
+ compatible = "qcom,gcc-apq8084";
+ reg = <0xfc400000 0x4000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+
+ clocks = <&xo_board>,
+ <&sleep_clk>,
+ <&ufsphy 0>,
+ <&ufsphy 1>,
+ <&ufsphy 2>,
+ <&ufsphy 3>,
+ <&sata 0>,
+ <&sata 1>,
+ <&pcie_phy>;
+ clock-names = "xo",
+ "sleep_clk",
+ "ufs_rx_symbol_0_clk_src",
+ "ufs_rx_symbol_1_clk_src",
+ "ufs_tx_symbol_0_clk_src",
+ "ufs_tx_symbol_1_clk_src",
+ "sata_asic0_clk",
+ "sata_rx_clk",
+ "pcie_pipe";
+ };
+...
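Because the UFS PHY provider is missing, a real board cannot resolve the &ufsphy phandles above; the usual escape is the empty specifier <0>, which satisfies the fixed-length clocks list without naming a provider. A sketch of that variant (an assumption, not shown in this patch):

    clocks = <&xo_board>,
             <&sleep_clk>,
             <0>, <0>, <0>, <0>,   /* UFS symbol clocks left unconnected */
             <&sata 0>,
             <&sata 1>,
             <&pcie_phy>;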
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-ipq4019.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-ipq4019.yaml
new file mode 100644
index 000000000000..6ebaef2288fa
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-ipq4019.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-ipq4019.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on IPQ4019
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <tdas@codeaurora.org>
+ - Robert Marko <robert.marko@sartura.hr>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on IPQ4019.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-ipq4019.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ const: qcom,gcc-ipq4019
+
+ clocks:
+ items:
+ - description: board XO clock
+ - description: sleep clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@1800000 {
+ compatible = "qcom,gcc-ipq4019";
+ reg = <0x1800000 0x60000>;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ #reset-cells = <1>;
+ clocks = <&xo>, <&sleep_clk>;
+ clock-names = "xo", "sleep_clk";
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8064.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8064.yaml
new file mode 100644
index 000000000000..93f3084b97c1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8064.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-ipq8064.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on IPQ8064
+
+maintainers:
+ - Ansuel Smith <ansuelsmth@gmail.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on IPQ8064.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-ipq806x.h (qcom,gcc-ipq8064)
+ include/dt-bindings/reset/qcom,gcc-ipq806x.h (qcom,gcc-ipq8064)
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: qcom,gcc-ipq8064
+ - const: syscon
+
+ clocks:
+ minItems: 2
+ items:
+ - description: PXO source
+ - description: CXO source
+ - description: PLL4 from LCC
+
+ clock-names:
+ minItems: 2
+ items:
+ - const: pxo
+ - const: cxo
+ - const: pll4
+
+ thermal-sensor:
+ type: object
+
+ allOf:
+ - $ref: /schemas/thermal/qcom-tsens.yaml#
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,lcc-ipq806x.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ gcc: clock-controller@900000 {
+ compatible = "qcom,gcc-ipq8064", "syscon";
+ reg = <0x00900000 0x4000>;
+ clocks = <&pxo_board>, <&cxo_board>, <&lcc PLL4>;
+ clock-names = "pxo", "cxo", "pll4";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+
+ tsens: thermal-sensor {
+ compatible = "qcom,ipq8064-tsens";
+
+ nvmem-cells = <&tsens_calib>, <&tsens_calib_backup>;
+ nvmem-cell-names = "calib", "calib_backup";
+ interrupts = <GIC_SPI 178 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "uplow";
+
+ #qcom,sensors = <11>;
+ #thermal-sensor-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8074.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8074.yaml
index 98572b4a9b60..deef398a9872 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8074.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-ipq8074.yaml
@@ -4,43 +4,39 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-ipq8074.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Bindingfor IPQ8074
+title: Qualcomm Global Clock & Reset Controller on IPQ8074
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on IPQ8074.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on IPQ8074.
- See also:
- - dt-bindings/clock/qcom,gcc-ipq8074.h
+ See also:: include/dt-bindings/clock/qcom,gcc-ipq8074.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
properties:
compatible:
const: qcom,gcc-ipq8074
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- reg:
- maxItems: 1
+ clocks:
+ items:
+ - description: board XO clock
+ - description: sleep clock
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
required:
- compatible
- - reg
- - '#clock-cells'
- - '#reset-cells'
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
@@ -48,6 +44,7 @@ examples:
compatible = "qcom,gcc-ipq8074";
reg = <0x01800000 0x80000>;
#clock-cells = <1>;
+ #power-domain-cells = <1>;
#reset-cells = <1>;
};
...
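The properties being deleted in these conversions are not lost: each file now pulls them in through allOf -> qcom,gcc.yaml. Roughly, the shared schema contributes the following (a paraphrase of Documentation/devicetree/bindings/clock/qcom,gcc.yaml, not quoted from this patch):

    properties:
      reg:
        maxItems: 1
      '#clock-cells':
        const: 1
      '#reset-cells':
        const: 1
      '#power-domain-cells':
        const: 1
      protected-clocks:
        description: Protected clock specifier list as per common clock binding.

    required:
      - reg
      - '#clock-cells'
      - '#reset-cells'
      - '#power-domain-cells'

This is why each per-SoC file now only needs to require compatible plus its own clocks, and why they switch from additionalProperties to unevaluatedProperties.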
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8660.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8660.yaml
new file mode 100644
index 000000000000..c9e985548621
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8660.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8660.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on MSM8660
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks and resets on
+ MSM8660.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-msm8660.h
+ include/dt-bindings/reset/qcom,gcc-msm8660.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-msm8660
+
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: pxo
+ - const: cxo
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ # Example for GCC for MSM8660:
+ - |
+ clock-controller@900000 {
+ compatible = "qcom,gcc-msm8660";
+ reg = <0x900000 0x4000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ clocks = <&pxo_board>, <&cxo_board>;
+ clock-names = "pxo", "cxo";
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml
new file mode 100644
index 000000000000..b91462587df5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8909.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8909.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on MSM8909, MSM8917 and QM215
+
+maintainers:
+ - Stephan Gerhold <stephan@gerhold.net>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on MSM8909, MSM8917 or QM215.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-msm8909.h
+ include/dt-bindings/clock/qcom,gcc-msm8917.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-msm8909
+ - qcom,gcc-msm8917
+ - qcom,gcc-qm215
+
+ clocks:
+ items:
+ - description: XO source
+ - description: Sleep clock source
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+ - const: dsi0pll
+ - const: dsi0pllbyte
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ gcc: clock-controller@1800000 {
+ compatible = "qcom,gcc-msm8909";
+ reg = <0x01800000 0x80000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ clocks = <&xo_board>, <&sleep_clk>, <&dsi0_phy 1>, <&dsi0_phy 0>;
+ clock-names = "xo", "sleep_clk", "dsi0pll", "dsi0pllbyte";
+ };
+...
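Note the deliberate loop here: GCC consumes dsi0pll/dsi0pllbyte from the DSI PHY, while the PHY is itself clocked from GCC. A provider-side sketch of dsi0_phy, borrowing the MSM8916 28nm PHY binding purely as an illustration (the compatible, clock constant and register details are assumptions for MSM8909):

    dsi0_phy: phy@1a98300 {
        compatible = "qcom,dsi-phy-28nm-8916";
        /* reg regions omitted for brevity */

        /* cell 0 = byte clock (dsi0pllbyte), cell 1 = pixel clock (dsi0pll) */
        #clock-cells = <1>;
        #phy-cells = <0>;

        clocks = <&gcc GCC_MDSS_AHB_CLK>, <&xo_board>;
        clock-names = "iface", "ref";
    };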
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8916.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8916.yaml
new file mode 100644
index 000000000000..ad84c0f7680b
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8916.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8916.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on MSM8916 and MSM8939
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on MSM8916 or MSM8939.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-msm8916.h
+ include/dt-bindings/clock/qcom,gcc-msm8939.h
+ include/dt-bindings/reset/qcom,gcc-msm8916.h
+ include/dt-bindings/reset/qcom,gcc-msm8939.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-msm8916
+ - qcom,gcc-msm8939
+
+ clocks:
+ items:
+ - description: XO source
+ - description: Sleep clock source
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: External MCLK clock
+ - description: External Primary I2S clock
+ - description: External Secondary I2S clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: ext_mclk
+ - const: ext_pri_i2s
+ - const: ext_sec_i2s
+
+required:
+ - compatible
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@300000 {
+ compatible = "qcom,gcc-msm8916";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ reg = <0x300000 0x90000>;
+ };
+...
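On the consumer side, peripherals pick clocks and resets out of this node with the GCC_* constants from the headers above; MSM8916's ChipIdea USB controller is a representative shape (trimmed from the usual msm8916.dtsi layout, details approximate):

    #include <dt-bindings/clock/qcom,gcc-msm8916.h>

    usb@78d9000 {
        compatible = "qcom,ci-hdrc";
        reg = <0x78d9000 0x200>;
        clocks = <&gcc GCC_USB_HS_AHB_CLK>,
                 <&gcc GCC_USB_HS_SYSTEM_CLK>;
        clock-names = "iface", "core";
        resets = <&gcc GCC_USB_HS_BCR>;
        reset-names = "core";
    };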
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8974.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8974.yaml
new file mode 100644
index 000000000000..1927aecc86bc
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8974.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8974.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on MSM8974 (including Pro) and
+ MSM8226
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on MSM8974 (all variants) and MSM8226.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-msm8974.h (qcom,gcc-msm8226 and qcom,gcc-msm8974)
+ include/dt-bindings/reset/qcom,gcc-msm8974.h (qcom,gcc-msm8226 and qcom,gcc-msm8974)
+
+$ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-msm8226
+ - qcom,gcc-msm8974
+ - qcom,gcc-msm8974pro
+ - qcom,gcc-msm8974pro-ac
+
+ clocks:
+ items:
+ - description: XO source
+ - description: Sleep clock source
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@fc400000 {
+ compatible = "qcom,gcc-msm8974";
+ reg = <0xfc400000 0x4000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+
+ clock-names = "xo", "sleep_clk";
+ clocks = <&xo_board>,
+ <&sleep_clk>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8976.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8976.yaml
new file mode 100644
index 000000000000..d2186e25f55f
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8976.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8976.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on MSM8976
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <tdas@codeaurora.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on MSM8976.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-msm8976.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-msm8976
+ - qcom,gcc-msm8976-v1.1
+
+ clocks:
+ items:
+ - description: XO source
+ - description: Always-on XO source
+ - description: Pixel clock from DSI PHY0
+ - description: Byte clock from DSI PHY0
+ - description: Pixel clock from DSI PHY1
+ - description: Byte clock from DSI PHY1
+
+ clock-names:
+ items:
+ - const: xo
+ - const: xo_a
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: dsi1pll
+ - const: dsi1pllbyte
+
+ vdd_gfx-supply:
+ description:
+ Phandle to voltage regulator providing power to the GX domain.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - vdd_gfx-supply
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@1800000 {
+ compatible = "qcom,gcc-msm8976";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ reg = <0x1800000 0x80000>;
+
+ clocks = <&xo_board>,
+ <&xo_board>,
+ <&dsi0_phy 1>,
+ <&dsi0_phy 0>,
+ <&dsi1_phy 1>,
+ <&dsi1_phy 0>;
+
+ clock-names = "xo",
+ "xo_a",
+ "dsi0pll",
+ "dsi0pllbyte",
+ "dsi1pll",
+ "dsi1pllbyte";
+
+ vdd_gfx-supply = <&pm8004_s5>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml
new file mode 100644
index 000000000000..8f0f20c1442a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8994.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8994.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on MSM8994
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@somainline.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on MSM8994 and MSM8992.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-msm8994.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-msm8992
+ - qcom,gcc-msm8994
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Sleep clock source
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@300000 {
+ compatible = "qcom,gcc-msm8994";
+ reg = <0x00300000 0x90000>;
+ clocks = <&xo_board>, <&sleep_clk>;
+ clock-names = "xo", "sleep";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8996.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8996.yaml
index 5a5b2214f0ca..f77036ace31b 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8996.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8996.yaml
@@ -4,59 +4,57 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8996.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for MSM8996
+title: Qualcomm Global Clock & Reset Controller on MSM8996
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
+ Qualcomm global clock control module which provides the clocks, resets and
power domains on MSM8996.
- See also:
- - dt-bindings/clock/qcom,gcc-msm8996.h
+ See also:: include/dt-bindings/clock/qcom,gcc-msm8996.h
properties:
compatible:
const: qcom,gcc-msm8996
clocks:
+ minItems: 3
items:
- description: XO source
- description: Second XO source
- description: Sleep clock source
+ - description: PCIe 0 PIPE clock (optional)
+ - description: PCIe 1 PIPE clock (optional)
+ - description: PCIe 2 PIPE clock (optional)
+ - description: USB3 PIPE clock (optional)
+ - description: UFS RX symbol 0 clock (optional)
+ - description: UFS RX symbol 1 clock (optional)
+ - description: UFS TX symbol 0 clock (optional)
clock-names:
+ minItems: 3
items:
- const: cxo
- const: cxo2
- const: sleep_clk
-
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
+ - const: pcie_0_pipe_clk_src
+ - const: pcie_1_pipe_clk_src
+ - const: pcie_2_pipe_clk_src
+ - const: usb3_phy_pipe_clk_src
+ - const: ufs_rx_symbol_0_clk_src
+ - const: ufs_rx_symbol_1_clk_src
+ - const: ufs_tx_symbol_0_clk_src
required:
- compatible
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8998.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8998.yaml
index a0bb713929b0..3c9729050d6f 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-msm8998.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-msm8998.yaml
@@ -4,18 +4,17 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-msm8998.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for MSM8998
+title: Qualcomm Global Clock & Reset Controller on MSM8998
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on MSM8998.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on MSM8998.
- See also:
- - dt-bindings/clock/qcom,gcc-msm8998.h
+ See also:: include/dt-bindings/clock/qcom,gcc-msm8998.h
properties:
compatible:
@@ -25,48 +24,25 @@ properties:
items:
- description: Board XO source
- description: Sleep clock source
- - description: USB 3.0 phy pipe clock
- - description: UFS phy rx symbol clock for pipe 0
- - description: UFS phy rx symbol clock for pipe 1
- - description: UFS phy tx symbol clock
- - description: PCIE phy pipe clock
+ - description: Audio reference clock (Optional clock)
+ minItems: 2
clock-names:
items:
- const: xo
- const: sleep_clk
- - const: usb3_pipe
- - const: ufs_rx_symbol0
- - const: ufs_rx_symbol1
- - const: ufs_tx_symbol0
- - const: pcie0_pipe
-
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
+ - const: aud_ref_clk # Optional clock
+ minItems: 2
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
@@ -79,17 +55,9 @@ examples:
reg = <0x00100000 0xb0000>;
clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
<&sleep>,
- <0>,
- <0>,
- <0>,
- <0>,
<0>;
clock-names = "xo",
"sleep_clk",
- "usb3_pipe",
- "ufs_rx_symbol0",
- "ufs_rx_symbol1",
- "ufs_tx_symbol0",
- "pcie0_pipe";
+ "aud_ref_clk";
};
...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-other.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-other.yaml
new file mode 100644
index 000000000000..ae01e7749534
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-other.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-other.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <tdas@codeaurora.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-ipq6018.h
+ include/dt-bindings/reset/qcom,gcc-ipq6018.h
+ include/dt-bindings/clock/qcom,gcc-msm8953.h
+ include/dt-bindings/clock/qcom,gcc-mdm9607.h
+ include/dt-bindings/clock/qcom,gcc-mdm9615.h
+ include/dt-bindings/reset/qcom,gcc-mdm9615.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-ipq6018
+ - qcom,gcc-mdm9607
+ - qcom,gcc-msm8953
+ - qcom,gcc-mdm9615
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@900000 {
+ compatible = "qcom,gcc-mdm9607";
+ reg = <0x900000 0x4000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-qcm2290.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-qcm2290.yaml
new file mode 100644
index 000000000000..c9bec4656f6e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-qcm2290.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-qcm2290.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on QCM2290
+
+maintainers:
+ - Shawn Guo <shawn.guo@linaro.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on QCM2290.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-qcm2290.h
+
+properties:
+ compatible:
+ const: qcom,gcc-qcm2290
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Sleep clock source
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: sleep_clk
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ clock-controller@1400000 {
+ compatible = "qcom,gcc-qcm2290";
+ reg = <0x01400000 0x1f0000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ clock-names = "bi_tcxo", "sleep_clk";
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>, <&sleep_clk>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-qcs404.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-qcs404.yaml
index ce06f3f8c3e3..b2256f81b265 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-qcs404.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-qcs404.yaml
@@ -4,43 +4,47 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-qcs404.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Bindingfor QCS404
+title: Qualcomm Global Clock & Reset Controller on QCS404
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on QCS404.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on QCS404.
- See also:
- - dt-bindings/clock/qcom,gcc-qcs404.h
+ See also:: include/dt-bindings/clock/qcom,gcc-qcs404.h
properties:
compatible:
const: qcom,gcc-qcs404
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
+ clocks:
+ items:
+ - description: XO source
+ - description: Sleep clock source
+ - description: PCIe 0 PIPE clock (optional)
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: HDMI phy PLL clock
+
+ clock-names:
+ items:
+ - const: cxo
+ - const: sleep_clk
+ - const: pcie_0_pipe_clk_src
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: hdmi_pll
required:
- compatible
- - reg
- - '#clock-cells'
- - '#reset-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
@@ -49,5 +53,6 @@ examples:
reg = <0x01800000 0x80000>;
#clock-cells = <1>;
#reset-cells = <1>;
+ #power-domain-cells = <1>;
};
...
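Only cxo and sleep_clk are board-level sources here; the PCIe, DSI and HDMI entries are fed back from PHYs. A board without, say, PCIe can keep the slot but pass an empty specifier, along the lines of (phandle labels and cell counts are assumptions):

    clocks = <&xo_board>,
             <&sleep_clk>,
             <0>,               /* pcie_0_pipe_clk_src: no PCIe PHY fitted */
             <&dsi0_phy 1>,
             <&dsi0_phy 0>,
             <&hdmi_phy>;       /* assumed #clock-cells = <0> */
    clock-names = "cxo", "sleep_clk", "pcie_0_pipe_clk_src",
                  "dsi0pll", "dsi0pllbyte", "hdmi_pll";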
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sc7180.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sc7180.yaml
index a404c8fbee67..06dce0c6b7d0 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sc7180.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sc7180.yaml
@@ -4,18 +4,17 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sc7180.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SC7180
+title: Qualcomm Global Clock & Reset Controller on SC7180
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SC7180.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SC7180.
- See also:
- - dt-bindings/clock/qcom,gcc-sc7180.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sc7180.h
properties:
compatible:
@@ -33,32 +32,15 @@ properties:
- const: bi_tcxo_ao
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sc7280.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sc7280.yaml
index 5693b8997570..947b47168cec 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sc7280.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sc7280.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sc7280.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SC7280
+title: Qualcomm Global Clock & Reset Controller on SC7280
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SC7280.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SC7280.
- See also:
- - dt-bindings/clock/qcom,gcc-sc7280.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sc7280.h
properties:
compatible:
@@ -44,28 +43,15 @@ properties:
- const: ufs_phy_tx_symbol_0_clk
- const: usb3_phy_wrapper_gcc_usb30_pipe_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sc8180x.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sc8180x.yaml
index f03ef96e57fa..6c4846b34e4b 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sc8180x.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sc8180x.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sc8180x.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SC8180x
+title: Qualcomm Global Clock & Reset Controller on SC8180x
maintainers:
- Bjorn Andersson <bjorn.andersson@linaro.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SC8180x.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SC8180x.
- See also:
- - dt-bindings/clock/qcom,gcc-sc8180x.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sc8180x.h
properties:
compatible:
@@ -32,32 +31,15 @@ properties:
- const: bi_tcxo_ao
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sc8280xp.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sc8280xp.yaml
new file mode 100644
index 000000000000..5681e535fede
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sc8280xp.yaml
@@ -0,0 +1,121 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-sc8280xp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SC8280xp
+
+maintainers:
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and
+ power domains on SC8280xp.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-sc8280xp.h
+
+properties:
+ compatible:
+ const: qcom,gcc-sc8280xp
+
+ clocks:
+ items:
+ - description: XO reference clock
+ - description: Sleep clock
+ - description: UFS memory first RX symbol clock
+ - description: UFS memory second RX symbol clock
+ - description: UFS memory first TX symbol clock
+ - description: UFS card first RX symbol clock
+ - description: UFS card second RX symbol clock
+ - description: UFS card first TX symbol clock
+ - description: Primary USB SuperSpeed pipe clock
+ - description: USB4 PHY pipegmux clock source
+ - description: USB4 PHY DP gmux clock source
+ - description: USB4 PHY sys pipegmux clock source
+ - description: USB4 PHY PCIe pipe clock
+ - description: USB4 PHY router max pipe clock
+ - description: Primary USB4 RX0 clock
+ - description: Primary USB4 RX1 clock
+ - description: Secondary USB SuperSpeed pipe clock
+ - description: Second USB4 PHY pipegmux clock source
+ - description: Second USB4 PHY DP gmux clock source
+ - description: Second USB4 PHY sys pipegmux clock source
+ - description: Second USB4 PHY PCIe pipe clock
+ - description: Second USB4 PHY router max pipe clock
+ - description: Secondary USB4 RX0 clock
+ - description: Secondary USB4 RX1 clock
+ - description: Multiport USB first SuperSpeed pipe clock
+ - description: Multiport USB second SuperSpeed pipe clock
+ - description: PCIe 2a pipe clock
+ - description: PCIe 2b pipe clock
+ - description: PCIe 3a pipe clock
+ - description: PCIe 3b pipe clock
+ - description: PCIe 4 pipe clock
+ - description: First EMAC controller reference clock
+ - description: Second EMAC controller reference clock
+
+ power-domains:
+ items:
+ - description: CX domain
+
+ protected-clocks:
+ maxItems: 389
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ clock-controller@100000 {
+ compatible = "qcom,gcc-sc8280xp";
+ reg = <0x00100000 0x1f0000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&sleep_clk>,
+ <&ufs_phy_rx_symbol_0_clk>,
+ <&ufs_phy_rx_symbol_1_clk>,
+ <&ufs_phy_tx_symbol_0_clk>,
+ <&ufs_card_rx_symbol_0_clk>,
+ <&ufs_card_rx_symbol_1_clk>,
+ <&ufs_card_tx_symbol_0_clk>,
+ <&usb_0_ssphy>,
+ <&gcc_usb4_phy_pipegmux_clk_src>,
+ <&gcc_usb4_phy_dp_gmux_clk_src>,
+ <&gcc_usb4_phy_sys_pipegmux_clk_src>,
+ <&usb4_phy_gcc_usb4_pcie_pipe_clk>,
+ <&usb4_phy_gcc_usb4rtr_max_pipe_clk>,
+ <&qusb4phy_gcc_usb4_rx0_clk>,
+ <&qusb4phy_gcc_usb4_rx1_clk>,
+ <&usb_1_ssphy>,
+ <&gcc_usb4_1_phy_pipegmux_clk_src>,
+ <&gcc_usb4_1_phy_dp_gmux_clk_src>,
+ <&gcc_usb4_1_phy_sys_pipegmux_clk_src>,
+ <&usb4_1_phy_gcc_usb4_pcie_pipe_clk>,
+ <&usb4_1_phy_gcc_usb4rtr_max_pipe_clk>,
+ <&qusb4phy_1_gcc_usb4_rx0_clk>,
+ <&qusb4phy_1_gcc_usb4_rx1_clk>,
+ <&usb_2_ssphy>,
+ <&usb_3_ssphy>,
+ <&pcie2a_lane>,
+ <&pcie2b_lane>,
+ <&pcie3a_lane>,
+ <&pcie3b_lane>,
+ <&pcie4_lane>,
+ <&rxc0_ref_clk>,
+ <&rxc1_ref_clk>;
+ power-domains = <&rpmhpd SC8280XP_CX>;
+
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
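The unusually large protected-clocks bound (389 entries) reflects how much of this GCC the firmware keeps to itself on some boards; board files list such IDs so the OS never reparents or disables them. The usage shape is an override in the board dts (the constants below are placeholders, not taken from the sc8280xp header):

    &gcc {
        protected-clocks = <GCC_QSPI_CNOC_PERIPH_AHB_CLK>,
                           <GCC_QSPI_CORE_CLK>,
                           <GCC_QSPI_CORE_CLK_SRC>;
    };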
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sdm660.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sdm660.yaml
new file mode 100644
index 000000000000..52e7412aace5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sdm660.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-sdm660.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SDM660/SDM630/SDM636 Global Clock & Reset Controller
+
+maintainers:
+ - Stephen Boyd <sboyd@kernel.org>
+ - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SDM630, SDM636 and SDM660.
+
+ See also::
+ include/dt-bindings/clock/qcom,gcc-sdm660.h (qcom,gcc-sdm630 and qcom,gcc-sdm660)
+
+$ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ enum:
+ - qcom,gcc-sdm630
+ - qcom,gcc-sdm660
+
+ clocks:
+ items:
+ - description: XO source
+ - description: Sleep clock source
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+
+ power-domains:
+ maxItems: 1
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ # Example for GCC for SDM660:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@100000 {
+ compatible = "qcom,gcc-sdm660";
+ reg = <0x00100000 0x94000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+
+ clock-names = "xo", "sleep_clk";
+ clocks = <&xo_board>,
+ <&sleep_clk>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sdm845.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sdm845.yaml
index d902f137ab17..68e1b7822fe0 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sdm845.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sdm845.yaml
@@ -4,63 +4,81 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sdm845.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding
+title: Qualcomm Global Clock & Reset Controller on SDM670 and SDM845
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SDM845
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SDM670 and SDM845.
- See also:
- - dt-bindings/clock/qcom,gcc-sdm845.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sdm845.h
properties:
compatible:
- const: qcom,gcc-sdm845
+ enum:
+ - qcom,gcc-sdm670
+ - qcom,gcc-sdm845
clocks:
- items:
- - description: Board XO source
- - description: Board active XO source
- - description: Sleep clock source
- - description: PCIE 0 Pipe clock source
- - description: PCIE 1 Pipe clock source
+ minItems: 3
+ maxItems: 5
clock-names:
- items:
- - const: bi_tcxo
- - const: bi_tcxo_ao
- - const: sleep_clk
- - const: pcie_0_pipe_clk
- - const: pcie_1_pipe_clk
+ minItems: 3
+ maxItems: 5
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
+ power-domains:
maxItems: 1
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,gcc-sdm670
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board active XO source
+ - description: Sleep clock source
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+ - const: sleep_clk
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,gcc-sdm845
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board active XO source
+ - description: Sleep clock source
+ - description: PCIE 0 Pipe clock source
+ - description: PCIE 1 Pipe clock source
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+ - const: sleep_clk
+ - const: pcie_0_pipe_clk
+ - const: pcie_1_pipe_clk
+
+unevaluatedProperties: false
examples:
# Example for GCC for SDM845:
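For the SDM670 branch added above, a node validating against the three-clock variant would look roughly like this (the reg window is an assumption carried over from the SDM845 example):

    #include <dt-bindings/clock/qcom,rpmh.h>

    clock-controller@100000 {
        compatible = "qcom,gcc-sdm670";
        reg = <0x00100000 0x1f0000>;
        clocks = <&rpmhcc RPMH_CXO_CLK>,
                 <&rpmhcc RPMH_CXO_CLK_A>,
                 <&sleep_clk>;
        clock-names = "bi_tcxo", "bi_tcxo_ao", "sleep_clk";
        #clock-cells = <1>;
        #reset-cells = <1>;
        #power-domain-cells = <1>;
    };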
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sdx55.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sdx55.yaml
index b0d1c65aa354..428e954d7638 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sdx55.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sdx55.yaml
@@ -4,18 +4,17 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sdx55.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SDX55
+title: Qualcomm Global Clock & Reset Controller on SDX55
maintainers:
- Vinod Koul <vkoul@kernel.org>
- Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
+ Qualcomm global clock control module provides the clocks, resets and
 power domains on SDX55.
- See also:
- - dt-bindings/clock/qcom,gcc-sdx55.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sdx55.h
properties:
compatible:
@@ -25,38 +24,21 @@ properties:
items:
- description: Board XO source
- description: Sleep clock source
- - description: PLL test clock source (Optional clock)
- minItems: 2
clock-names:
items:
- const: bi_tcxo
- const: sleep_clk
- - const: core_bi_pll_test_se # Optional clock
- minItems: 2
-
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
@@ -65,8 +47,9 @@ examples:
compatible = "qcom,gcc-sdx55";
reg = <0x00100000 0x1f0000>;
clocks = <&rpmhcc RPMH_CXO_CLK>,
- <&sleep_clk>, <&pll_test_clk>;
- clock-names = "bi_tcxo", "sleep_clk", "core_bi_pll_test_se";
+ <&sleep_clk>;
+ clock-names = "bi_tcxo",
+ "sleep_clk";
#clock-cells = <1>;
#reset-cells = <1>;
#power-domain-cells = <1>;
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sdx65.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sdx65.yaml
new file mode 100644
index 000000000000..523e18d7f150
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sdx65.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-sdx65.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SDX65
+
+maintainers:
+ - Vamsi krishna Lanka <quic_vamslank@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SDX65.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-sdx65.h
+
+properties:
+ compatible:
+ const: qcom,gcc-sdx65
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board active XO source
+ - description: Sleep clock source
+ - description: PCIE Pipe clock source
+ - description: USB3 phy wrapper pipe clock source
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+ - const: sleep_clk
+ - const: pcie_pipe_clk
+ - const: usb3_phy_wrapper_gcc_usb30_pipe_clk
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@100000 {
+ compatible = "qcom,gcc-sdx65";
+ reg = <0x100000 0x1f7400>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>, <&rpmhcc RPMH_CXO_CLK_A>, <&sleep_clk>,
+ <&pcie_pipe_clk>, <&usb3_phy_wrapper_gcc_usb30_pipe_clk>;
+ clock-names = "bi_tcxo", "bi_tcxo_ao", "sleep_clk",
+ "pcie_pipe_clk", "usb3_phy_wrapper_gcc_usb30_pipe_clk";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6115.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6115.yaml
index 26050da844d5..a5ad0a3da397 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6115.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6115.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sm6115.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SM6115 and SM4250
+title: Qualcomm Global Clock & Reset Controller on SM6115 and SM4250
maintainers:
- Iskren Chernev <iskren.chernev@gmail.com>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SM4250/6115.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM4250/6115.
- See also:
- - dt-bindings/clock/qcom,gcc-sm6115.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sm6115.h
properties:
compatible:
@@ -30,32 +29,15 @@ properties:
- const: bi_tcxo
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml
index ab12b391effc..8e37623788bd 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6125.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sm6125.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SM6125
+title: Qualcomm Global Clock & Reset Controller on SM6125
maintainers:
- Konrad Dybcio <konrad.dybcio@somainline.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SM6125.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM6125.
- See also:
- - dt-bindings/clock/qcom,gcc-sm6125.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sm6125.h
properties:
compatible:
@@ -30,32 +29,15 @@ properties:
- const: bi_tcxo
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml
index 20926cd8293e..d1b26ab48eaf 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm6350.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sm6350.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SM6350
+title: Qualcomm Global Clock & Reset Controller on SM6350
maintainers:
- Konrad Dybcio <konrad.dybcio@somainline.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SM6350.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM6350.
- See also:
- - dt-bindings/clock/qcom,gcc-sm6350.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sm6350.h
properties:
compatible:
@@ -32,32 +31,15 @@ properties:
- const: bi_tcxo_ao
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8150.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8150.yaml
index 12766a866625..3ea0ff37a4cb 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8150.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8150.yaml
@@ -4,18 +4,17 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sm8150.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SM8150
+title: Qualcomm Global Clock & Reset Controller on SM8150
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SM8150.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM8150.
- See also:
- - dt-bindings/clock/qcom,gcc-sm8150.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sm8150.h
properties:
compatible:
@@ -31,32 +30,15 @@ properties:
- const: bi_tcxo
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8250.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8250.yaml
index 80bd6caf5bc9..b752542ee20c 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8250.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8250.yaml
@@ -4,18 +4,17 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sm8250.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SM8250
+title: Qualcomm Global Clock & Reset Controller on SM8250
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SM8250.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM8250.
- See also:
- - dt-bindings/clock/qcom,gcc-sm8250.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sm8250.h
properties:
compatible:
@@ -31,32 +30,15 @@ properties:
- const: bi_tcxo
- const: sleep_clk
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
- protected-clocks:
- description:
- Protected clock specifier list as per common clock binding.
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8350.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8350.yaml
index 1122700dcc2b..b4fdde71ef18 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8350.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8350.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc-sm8350.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding for SM8350
+title: Qualcomm Global Clock & Reset Controller on SM8350
maintainers:
- Vinod Koul <vkoul@kernel.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains on SM8350.
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM8350.
- See also:
- - dt-bindings/clock/qcom,gcc-sm8350.h
+ See also:: include/dt-bindings/clock/qcom,gcc-sm8350.h
properties:
compatible:
@@ -24,7 +23,6 @@ properties:
items:
- description: Board XO source
- description: Sleep clock source
- - description: PLL test clock source (Optional clock)
- description: PCIE 0 Pipe clock source (Optional clock)
- description: PCIE 1 Pipe clock source (Optional clock)
- description: UFS card Rx symbol 0 clock source (Optional clock)
@@ -41,7 +39,6 @@ properties:
items:
- const: bi_tcxo
- const: sleep_clk
- - const: core_bi_pll_test_se # Optional clock
- const: pcie_0_pipe_clk # Optional clock
- const: pcie_1_pipe_clk # Optional clock
- const: ufs_card_rx_symbol_0_clk # Optional clock
@@ -54,28 +51,15 @@ properties:
- const: usb3_uni_phy_sec_gcc_usb30_pipe_clk # Optional clock
minItems: 2
- '#clock-cells':
- const: 1
-
- '#reset-cells':
- const: 1
-
- '#power-domain-cells':
- const: 1
-
- reg:
- maxItems: 1
-
required:
- compatible
- clocks
- clock-names
- - reg
- - '#clock-cells'
- - '#reset-cells'
- - '#power-domain-cells'
-additionalProperties: false
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc-sm8450.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8450.yaml
new file mode 100644
index 000000000000..9a31981fbeb2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc-sm8450.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,gcc-sm8450.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SM8450
+
+maintainers:
+ - Vinod Koul <vkoul@kernel.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM8450.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-sm8450.h
+
+properties:
+ compatible:
+ const: qcom,gcc-sm8450
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Sleep clock source
+ - description: PCIE 0 Pipe clock source (Optional clock)
+ - description: PCIE 1 Pipe clock source (Optional clock)
+ - description: PCIE 1 Phy Auxiliary clock source (Optional clock)
+ - description: UFS Phy Rx symbol 0 clock source (Optional clock)
+ - description: UFS Phy Rx symbol 1 clock source (Optional clock)
+ - description: UFS Phy Tx symbol 0 clock source (Optional clock)
+ - description: USB3 Phy wrapper pipe clock source (Optional clock)
+ minItems: 2
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: sleep_clk
+ - const: pcie_0_pipe_clk # Optional clock
+ - const: pcie_1_pipe_clk # Optional clock
+ - const: pcie_1_phy_aux_clk # Optional clock
+ - const: ufs_phy_rx_symbol_0_clk # Optional clock
+ - const: ufs_phy_rx_symbol_1_clk # Optional clock
+ - const: ufs_phy_tx_symbol_0_clk # Optional clock
+ - const: usb3_phy_wrapper_gcc_usb30_pipe_clk # Optional clock
+ minItems: 2
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@100000 {
+ compatible = "qcom,gcc-sm8450";
+ reg = <0x00100000 0x001f4200>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>, <&sleep_clk>;
+ clock-names = "bi_tcxo", "sleep_clk";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,gcc.yaml
index 2f20f8aa932a..7129fbcf2b6c 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc.yaml
@@ -4,59 +4,17 @@
$id: http://devicetree.org/schemas/clock/qcom,gcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Global Clock & Reset Controller Binding
+title: Qualcomm Global Clock & Reset Controller Common Properties
maintainers:
- Stephen Boyd <sboyd@kernel.org>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm global clock control module which supports the clocks, resets and
- power domains.
-
- See also:
- - dt-bindings/clock/qcom,gcc-apq8084.h
- - dt-bindings/reset/qcom,gcc-apq8084.h
- - dt-bindings/clock/qcom,gcc-ipq4019.h
- - dt-bindings/clock/qcom,gcc-ipq6018.h
- - dt-bindings/reset/qcom,gcc-ipq6018.h
- - dt-bindings/clock/qcom,gcc-ipq806x.h (qcom,gcc-ipq8064)
- - dt-bindings/reset/qcom,gcc-ipq806x.h (qcom,gcc-ipq8064)
- - dt-bindings/clock/qcom,gcc-msm8939.h
- - dt-bindings/clock/qcom,gcc-msm8953.h
- - dt-bindings/reset/qcom,gcc-msm8939.h
- - dt-bindings/clock/qcom,gcc-msm8660.h
- - dt-bindings/reset/qcom,gcc-msm8660.h
- - dt-bindings/clock/qcom,gcc-msm8974.h (qcom,gcc-msm8226 and qcom,gcc-msm8974)
- - dt-bindings/reset/qcom,gcc-msm8974.h (qcom,gcc-msm8226 and qcom,gcc-msm8974)
- - dt-bindings/clock/qcom,gcc-msm8994.h
- - dt-bindings/clock/qcom,gcc-mdm9607.h
- - dt-bindings/clock/qcom,gcc-mdm9615.h
- - dt-bindings/reset/qcom,gcc-mdm9615.h
- - dt-bindings/clock/qcom,gcc-sdm660.h (qcom,gcc-sdm630 and qcom,gcc-sdm660)
+ Common bindings for Qualcomm global clock control module providing the
+ clocks, resets and power domains.
properties:
- compatible:
- enum:
- - qcom,gcc-apq8084
- - qcom,gcc-ipq4019
- - qcom,gcc-ipq6018
- - qcom,gcc-ipq8064
- - qcom,gcc-mdm9607
- - qcom,gcc-msm8226
- - qcom,gcc-msm8660
- - qcom,gcc-msm8916
- - qcom,gcc-msm8939
- - qcom,gcc-msm8953
- - qcom,gcc-msm8960
- - qcom,gcc-msm8974
- - qcom,gcc-msm8974pro
- - qcom,gcc-msm8974pro-ac
- - qcom,gcc-msm8994
- - qcom,gcc-mdm9615
- - qcom,gcc-sdm630
- - qcom,gcc-sdm660
-
'#clock-cells':
const: 1
@@ -74,22 +32,11 @@ properties:
Protected clock specifier list as per common clock binding.
required:
- - compatible
- reg
- '#clock-cells'
- '#reset-cells'
- '#power-domain-cells'
-additionalProperties: false
+additionalProperties: true
-examples:
- # Example for GCC for MSM8960:
- - |
- clock-controller@900000 {
- compatible = "qcom,gcc-msm8960";
- reg = <0x900000 0x4000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- #power-domain-cells = <1>;
- };
...
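
This hunk completes the refactor the series applies to every Qualcomm GCC binding: the shared qcom,gcc.yaml keeps only the common properties and switches to additionalProperties: true so schemas referencing it can add their own, while each per-SoC schema pulls the common set in via allOf and closes validation with unevaluatedProperties: false. A minimal sketch of a leaf schema built on this pattern (the qcom,gcc-example compatible and its clock list are hypothetical):

    # Hypothetical leaf schema: only the SoC-specific compatible and clock
    # inputs are spelled out; reg, #clock-cells, #reset-cells,
    # #power-domain-cells and protected-clocks are inherited from the
    # referenced common schema.
    properties:
      compatible:
        const: qcom,gcc-example
      clocks:
        items:
          - description: Board XO source
          - description: Sleep clock source
      clock-names:
        items:
          - const: bi_tcxo
          - const: sleep_clk

    required:
      - compatible
      - clocks
      - clock-names

    allOf:
      - $ref: qcom,gcc.yaml#

    unevaluatedProperties: false
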
diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc-sdm660.yaml b/Documentation/devicetree/bindings/clock/qcom,gpucc-sdm660.yaml
index 3f70eb59aae3..0518ea963cdd 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gpucc-sdm660.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc-sdm660.yaml
@@ -4,13 +4,13 @@
$id: http://devicetree.org/schemas/clock/qcom,gpucc-sdm660.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Graphics Clock & Reset Controller Binding for SDM630 and SDM660
+title: Qualcomm Graphics Clock & Reset Controller on SDM630 and SDM660
maintainers:
- AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
description: |
- Qualcomm graphics clock control module which supports the clocks, resets and
+ Qualcomm graphics clock control module provides the clocks, resets and
power domains on SDM630 and SDM660.
See also dt-bindings/clock/qcom,gpucc-sdm660.h.
diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml
index 46dff46d5760..1e3dc9deded9 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml
@@ -4,31 +4,39 @@
$id: http://devicetree.org/schemas/clock/qcom,gpucc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Graphics Clock & Reset Controller Binding
+title: Qualcomm Graphics Clock & Reset Controller
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm graphics clock control module which supports the clocks, resets and
- power domains on Qualcomm SoCs.
+ Qualcomm graphics clock control module provides the clocks, resets and power
+ domains on Qualcomm SoCs.
- See also:
- dt-bindings/clock/qcom,gpucc-sdm845.h
- dt-bindings/clock/qcom,gpucc-sc7180.h
- dt-bindings/clock/qcom,gpucc-sc7280.h
- dt-bindings/clock/qcom,gpucc-sm8150.h
- dt-bindings/clock/qcom,gpucc-sm8250.h
+ See also::
+ include/dt-bindings/clock/qcom,gpucc-sdm845.h
+ include/dt-bindings/clock/qcom,gpucc-sa8775p.h
+ include/dt-bindings/clock/qcom,gpucc-sc7180.h
+ include/dt-bindings/clock/qcom,gpucc-sc7280.h
+ include/dt-bindings/clock/qcom,gpucc-sc8280xp.h
+ include/dt-bindings/clock/qcom,gpucc-sm6350.h
+ include/dt-bindings/clock/qcom,gpucc-sm8150.h
+ include/dt-bindings/clock/qcom,gpucc-sm8250.h
+ include/dt-bindings/clock/qcom,gpucc-sm8350.h
properties:
compatible:
enum:
- qcom,sdm845-gpucc
+ - qcom,sa8775p-gpucc
- qcom,sc7180-gpucc
- qcom,sc7280-gpucc
- qcom,sc8180x-gpucc
+ - qcom,sc8280xp-gpucc
+ - qcom,sm6350-gpucc
- qcom,sm8150-gpucc
- qcom,sm8250-gpucc
+ - qcom,sm8350-gpucc
clocks:
items:
diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml
new file mode 100644
index 000000000000..718fe0625424
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq5332-gcc.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,ipq5332-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on IPQ5332
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on IPQ5332.
+
+ See also:: include/dt-bindings/clock/qcom,gcc-ipq5332.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ const: qcom,ipq5332-gcc
+
+ clocks:
+ items:
+ - description: Board XO clock source
+ - description: Sleep clock source
+ - description: PCIE 2lane PHY pipe clock source
+ - description: PCIE 2lane x1 PHY pipe clock source (For second lane)
+ - description: USB PCIE wrapper pipe clock source
+
+required:
+ - compatible
+ - clocks
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@1800000 {
+ compatible = "qcom,ipq5332-gcc";
+ reg = <0x01800000 0x80000>;
+ clocks = <&xo_board>,
+ <&sleep_clk>,
+ <&pcie_2lane_phy_pipe_clk>,
+ <&pcie_2lane_phy_pipe_clk_x1>,
+ <&usb_pcie_wrapper_pipe_clk>;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ #reset-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml
new file mode 100644
index 000000000000..afc68eb9d7cc
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq9574-gcc.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,ipq9574-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on IPQ9574
+
+maintainers:
+ - Anusha Rao <quic_anusha@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on IPQ9574.
+
+ See also::
+ include/dt-bindings/clock/qcom,ipq9574-gcc.h
+ include/dt-bindings/reset/qcom,ipq9574-gcc.h
+
+properties:
+ compatible:
+ const: qcom,ipq9574-gcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Sleep clock source
+ - description: Bias PLL ubi clock source
+ - description: PCIE30 PHY0 pipe clock source
+ - description: PCIE30 PHY1 pipe clock source
+ - description: PCIE30 PHY2 pipe clock source
+ - description: PCIE30 PHY3 pipe clock source
+ - description: USB3 PHY pipe clock source
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ clock-controller@1800000 {
+ compatible = "qcom,ipq9574-gcc";
+ reg = <0x01800000 0x80000>;
+ clocks = <&xo_board_clk>,
+ <&sleep_clk>,
+ <&bias_pll_ubi_nc_clk>,
+ <&pcie30_phy0_pipe_clk>,
+ <&pcie30_phy1_pipe_clk>,
+ <&pcie30_phy2_pipe_clk>,
+ <&pcie30_phy3_pipe_clk>,
+ <&usb3phy_0_cc_pipe_clk>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,kpss-acc-v1.yaml b/Documentation/devicetree/bindings/clock/qcom,kpss-acc-v1.yaml
new file mode 100644
index 000000000000..a466e4e8aacd
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,kpss-acc-v1.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,kpss-acc-v1.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Krait Processor Sub-system (KPSS) Application Clock Controller (ACC) v1
+
+maintainers:
+ - Christian Marangi <ansuelsmth@gmail.com>
+
+description:
+ The KPSS ACC provides clock, power domain, and reset control to a Krait CPU.
+ There is one ACC register region per CPU within the KPSS remapped region as
+ well as an alias register region that remaps accesses to the ACC associated
+ with the CPU accessing the region. ACC v1 is currently used as a
+ clock-controller for enabling the CPU and handling the aux clocks.
+
+properties:
+ compatible:
+ const: qcom,kpss-acc-v1
+
+ reg:
+ items:
+ - description: Base address and size of the register region
+ - description: Optional base address and size of the alias register region
+ minItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: pll8_vote
+ - const: pxo
+
+ clock-output-names:
+ description: Name of the aux clock. Krait can have at most 4 CPUs.
+ enum:
+ - acpu0_aux
+ - acpu1_aux
+ - acpu2_aux
+ - acpu3_aux
+
+ '#clock-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - clock-output-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-ipq806x.h>
+
+ clock-controller@2088000 {
+ compatible = "qcom,kpss-acc-v1";
+ reg = <0x02088000 0x1000>, <0x02008000 0x1000>;
+ clocks = <&gcc PLL8_VOTE>, <&pxo_board>;
+ clock-names = "pll8_vote", "pxo";
+ clock-output-names = "acpu0_aux";
+ #clock-cells = <0>;
+ };
+
+...
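
Because there is one ACC per Krait core, boards instantiate one such node per CPU, varying only the register base and the aux clock name. A sketch of a second instance for CPU1, assuming the IPQ8064 layout where the per-CPU ACC regions sit 0x10000 apart and share the alias region (addresses are illustrative):

    /* Hypothetical CPU1 instance; addresses follow the IPQ8064 layout. */
    clock-controller@2098000 {
        compatible = "qcom,kpss-acc-v1";
        reg = <0x02098000 0x1000>, <0x02008000 0x1000>;
        clocks = <&gcc PLL8_VOTE>, <&pxo_board>;
        clock-names = "pll8_vote", "pxo";
        clock-output-names = "acpu1_aux";
        #clock-cells = <0>;
    };
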
diff --git a/Documentation/devicetree/bindings/clock/qcom,kpss-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,kpss-gcc.yaml
new file mode 100644
index 000000000000..88b7672123a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,kpss-gcc.yaml
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,kpss-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Krait Processor Sub-system (KPSS) Global Clock Controller (GCC)
+
+maintainers:
+ - Christian Marangi <ansuelsmth@gmail.com>
+
+description:
+ Krait Processor Sub-system (KPSS) Global Clock Controller (GCC). Used
+ to control the L2 mux (in the current implementation) and to provide
+ access to the kpss-gcc registers.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - qcom,kpss-gcc-ipq8064
+ - qcom,kpss-gcc-apq8064
+ - qcom,kpss-gcc-msm8974
+ - qcom,kpss-gcc-msm8960
+ - qcom,kpss-gcc-msm8660
+ - qcom,kpss-gcc-mdm9615
+ - const: qcom,kpss-gcc
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: pll8_vote
+ - const: pxo
+
+ '#clock-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,kpss-gcc-ipq8064
+ - qcom,kpss-gcc-apq8064
+ - qcom,kpss-gcc-msm8974
+ - qcom,kpss-gcc-msm8960
+then:
+ required:
+ - clocks
+ - clock-names
+ - '#clock-cells'
+else:
+ properties:
+ clocks: false
+ clock-names: false
+ '#clock-cells': false
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-ipq806x.h>
+
+ clock-controller@2011000 {
+ compatible = "qcom,kpss-gcc-ipq8064", "qcom,kpss-gcc", "syscon";
+ reg = <0x2011000 0x1000>;
+ clocks = <&gcc PLL8_VOTE>, <&pxo_board>;
+ clock-names = "pll8_vote", "pxo";
+ #clock-cells = <0>;
+ };
+
+ - |
+ clock-controller@2011000 {
+ compatible = "qcom,kpss-gcc-mdm9615", "qcom,kpss-gcc", "syscon";
+ reg = <0x02011000 0x1000>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,lcc.txt b/Documentation/devicetree/bindings/clock/qcom,lcc.txt
deleted file mode 100644
index a3c78aa88038..000000000000
--- a/Documentation/devicetree/bindings/clock/qcom,lcc.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-Qualcomm LPASS Clock & Reset Controller Binding
-------------------------------------------------
-
-Required properties :
-- compatible : shall contain only one of the following:
-
- "qcom,lcc-msm8960"
- "qcom,lcc-apq8064"
- "qcom,lcc-ipq8064"
- "qcom,lcc-mdm9615"
-
-- reg : shall contain base register location and length
-- #clock-cells : shall contain 1
-- #reset-cells : shall contain 1
-
-Example:
- clock-controller@28000000 {
- compatible = "qcom,lcc-ipq8064";
- reg = <0x28000000 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
diff --git a/Documentation/devicetree/bindings/clock/qcom,lcc.yaml b/Documentation/devicetree/bindings/clock/qcom,lcc.yaml
new file mode 100644
index 000000000000..8c783823e93c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,lcc.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,lcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm LPASS Clock & Reset Controller
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - qcom,lcc-apq8064
+ - qcom,lcc-ipq8064
+ - qcom,lcc-mdm9615
+ - qcom,lcc-msm8960
+
+ clocks:
+ maxItems: 8
+
+ clock-names:
+ maxItems: 8
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - '#clock-cells'
+ - '#reset-cells'
+
+additionalProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,lcc-apq8064
+ - qcom,lcc-msm8960
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board PXO source
+ - description: PLL 4 Vote clock
+ - description: MI2S codec clock
+ - description: Mic I2S codec clock
+ - description: Mic I2S spare clock
+ - description: Speaker I2S codec clock
+ - description: Speaker I2S spare clock
+ - description: PCM codec clock
+
+ clock-names:
+ items:
+ - const: pxo
+ - const: pll4_vote
+ - const: mi2s_codec_clk
+ - const: codec_i2s_mic_codec_clk
+ - const: spare_i2s_mic_codec_clk
+ - const: codec_i2s_spkr_codec_clk
+ - const: spare_i2s_spkr_codec_clk
+ - const: pcm_codec_clk
+
+ required:
+ - clocks
+ - clock-names
+
+examples:
+ - |
+ clock-controller@28000000 {
+ compatible = "qcom,lcc-ipq8064";
+ reg = <0x28000000 0x1000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
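
For the qcom,lcc-apq8064 and qcom,lcc-msm8960 variants, the if/then branch above makes all eight inputs mandatory, in the order fixed by clock-names. A sketch of such a node (PLL4_VOTE is assumed to come from the MSM8960 GCC header; the codec clock phandles are board-specific placeholders):

    #include <dt-bindings/clock/qcom,gcc-msm8960.h>

    clock-controller@28000000 {
        compatible = "qcom,lcc-apq8064";
        reg = <0x28000000 0x1000>;
        clocks = <&pxo_board>, <&gcc PLL4_VOTE>,
                 <&mi2s_codec>, <&mic_codec>, <&mic_spare>,
                 <&spkr_codec>, <&spkr_spare>, <&pcm_codec>;
        clock-names = "pxo", "pll4_vote",
                      "mi2s_codec_clk", "codec_i2s_mic_codec_clk",
                      "spare_i2s_mic_codec_clk", "codec_i2s_spkr_codec_clk",
                      "spare_i2s_spkr_codec_clk", "pcm_codec_clk";
        #clock-cells = <1>;
        #reset-cells = <1>;
    };
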
diff --git a/Documentation/devicetree/bindings/clock/qcom,lpasscc.txt b/Documentation/devicetree/bindings/clock/qcom,lpasscc.txt
deleted file mode 100644
index b9e9787045b9..000000000000
--- a/Documentation/devicetree/bindings/clock/qcom,lpasscc.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Qualcomm LPASS Clock Controller Binding
------------------------------------------------
-
-Required properties :
-- compatible : shall contain "qcom,sdm845-lpasscc"
-- #clock-cells : from common clock binding, shall contain 1.
-- reg : shall contain base register address and size,
- in the order
- Index-0 maps to LPASS_CC register region
- Index-1 maps to LPASS_QDSP6SS register region
-
-Optional properties :
-- reg-names : register names of LPASS domain
- "cc", "qdsp6ss".
-
-Example:
-
-The below node has to be defined in the cases where the LPASS peripheral loader
-would bring the subsystem out of reset.
-
- lpasscc: clock-controller@17014000 {
- compatible = "qcom,sdm845-lpasscc";
- reg = <0x17014000 0x1f004>, <0x17300000 0x200>;
- reg-names = "cc", "qdsp6ss";
- #clock-cells = <1>;
- };
diff --git a/Documentation/devicetree/bindings/clock/qcom,mmcc.yaml b/Documentation/devicetree/bindings/clock/qcom,mmcc.yaml
index 68fdc3d4982a..acf0c923c24f 100644
--- a/Documentation/devicetree/bindings/clock/qcom,mmcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,mmcc.yaml
@@ -4,14 +4,14 @@
$id: http://devicetree.org/schemas/clock/qcom,mmcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Multimedia Clock & Reset Controller Binding
+title: Qualcomm Multimedia Clock & Reset Controller
maintainers:
- - Jeffrey Hugo <jhugo@codeaurora.org>
+ - Jeffrey Hugo <quic_jhugo@quicinc.com>
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm multimedia clock control module which supports the clocks, resets and
+ Qualcomm multimedia clock control module provides the clocks, resets and
power domains.
properties:
@@ -19,6 +19,7 @@ properties:
enum:
- qcom,mmcc-apq8064
- qcom,mmcc-apq8084
+ - qcom,mmcc-msm8226
- qcom,mmcc-msm8660
- qcom,mmcc-msm8960
- qcom,mmcc-msm8974
@@ -30,30 +31,12 @@ properties:
- qcom,mmcc-sdm660
clocks:
- items:
- - description: Board XO source
- - description: Board sleep source
- - description: Global PLL 0 clock
- - description: DSI phy instance 0 dsi clock
- - description: DSI phy instance 0 byte clock
- - description: DSI phy instance 1 dsi clock
- - description: DSI phy instance 1 byte clock
- - description: HDMI phy PLL clock
- - description: DisplayPort phy PLL vco clock
- - description: DisplayPort phy PLL link clock
+ minItems: 8
+ maxItems: 13
clock-names:
- items:
- - const: xo
- - const: sleep
- - const: gpll0
- - const: dsi0dsi
- - const: dsi0byte
- - const: dsi1dsi
- - const: dsi1byte
- - const: hdmipll
- - const: dpvco
- - const: dplink
+ minItems: 8
+ maxItems: 13
'#clock-cells':
const: 1
@@ -84,16 +67,255 @@ required:
additionalProperties: false
-if:
- properties:
- compatible:
- contains:
- const: qcom,mmcc-msm8998
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,mmcc-apq8064
+ - qcom,mmcc-msm8960
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board PXO source
+ - description: PLL 3 clock
+ - description: PLL 3 Vote clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: DSI phy instance 2 dsi clock
+ - description: DSI phy instance 2 byte clock
+ - description: HDMI phy PLL clock
-then:
- required:
- - clocks
- - clock-names
+ clock-names:
+ items:
+ - const: pxo
+ - const: pll3
+ - const: pll8_vote
+ - const: dsi1pll
+ - const: dsi1pllbyte
+ - const: dsi2pll
+ - const: dsi2pllbyte
+ - const: hdmipll
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,mmcc-msm8974
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: MMSS GPLL0 voted clock
+ - description: GPLL0 voted clock
+ - description: GPLL1 voted clock
+ - description: GFX3D clock source
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: HDMI phy PLL clock
+ - description: eDP phy PLL link clock
+ - description: eDP phy PLL vco clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: mmss_gpll0_vote
+ - const: gpll0_vote
+ - const: gpll1_vote
+ - const: gfx3d_clk_src
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: dsi1pll
+ - const: dsi1pllbyte
+ - const: hdmipll
+ - const: edp_link_clk
+ - const: edp_vco_div
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,mmcc-apq8084
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board sleep source
+ - description: MMSS GPLL0 voted clock
+ - description: GPLL0 clock
+ - description: GPLL0 voted clock
+ - description: GPLL1 clock
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: HDMI phy PLL clock
+ - description: eDP phy PLL link clock
+ - description: eDP phy PLL vco clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+ - const: mmss_gpll0_vote
+ - const: gpll0
+ - const: gpll0_vote
+ - const: gpll1
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: dsi1pll
+ - const: dsi1pllbyte
+ - const: hdmipll
+ - const: edp_link_clk
+ - const: edp_vco_div
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,mmcc-msm8994
+ - qcom,mmcc-msm8998
+ - qcom,mmcc-sdm630
+ - qcom,mmcc-sdm660
+ then:
+ required:
+ - clocks
+ - clock-names
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,mmcc-msm8994
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Global PLL 0 clock
+ - description: MMSS NoC AHB clock
+ - description: GFX3D clock
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: HDMI phy PLL clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: gpll0
+ - const: mmssnoc_ahb
+ - const: oxili_gfx3d_clk_src
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: dsi1pll
+ - const: dsi1pllbyte
+ - const: hdmipll
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,mmcc-msm8996
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Global PLL 0 clock
+ - description: MMSS NoC AHB clock
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: HDMI phy PLL clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: gpll0
+ - const: gcc_mmss_noc_cfg_ahb_clk
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: dsi1pll
+ - const: dsi1pllbyte
+ - const: hdmipll
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,mmcc-msm8998
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Global PLL 0 clock
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: HDMI phy PLL clock
+ - description: DisplayPort phy PLL link clock
+ - description: DisplayPort phy PLL vco clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: gpll0
+ - const: dsi0dsi
+ - const: dsi0byte
+ - const: dsi1dsi
+ - const: dsi1byte
+ - const: hdmipll
+ - const: dplink
+ - const: dpvco
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,mmcc-sdm630
+ - qcom,mmcc-sdm660
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board sleep source
+ - description: Global PLL 0 clock
+ - description: Global PLL 0 DIV clock
+ - description: DSI phy instance 0 dsi clock
+ - description: DSI phy instance 0 byte clock
+ - description: DSI phy instance 1 dsi clock
+ - description: DSI phy instance 1 byte clock
+ - description: DisplayPort phy PLL link clock
+ - description: DisplayPort phy PLL vco clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: sleep_clk
+ - const: gpll0
+ - const: gpll0_div
+ - const: dsi0pll
+ - const: dsi0pllbyte
+ - const: dsi1pll
+ - const: dsi1pllbyte
+ - const: dp_link_2x_clk_divsel_five
+ - const: dp_vco_divided_clk_src_mux
examples:
# Example for MMCC for MSM8960:
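
With the clock lists now keyed off the compatible string, each SoC must supply exactly the inputs of its if/then branch. A sketch for qcom,mmcc-msm8974 following the branch above (provider phandles are illustrative; on real boards the DSI/HDMI/eDP inputs come from the respective PHY nodes and the GPLL votes from the GCC):

    clock-controller@fd8c0000 {
        compatible = "qcom,mmcc-msm8974";
        reg = <0xfd8c0000 0x6000>;
        clocks = <&xo_board>,
                 <&gcc_mmss_gpll0_vote>, <&gcc_gpll0_vote>, <&gcc_gpll1_vote>,
                 <&gfx3d_clk_src>,
                 <&dsi0_phy 1>, <&dsi0_phy 0>,
                 <&dsi1_phy 1>, <&dsi1_phy 0>,
                 <&hdmi_phy>,
                 <&edp_phy 1>, <&edp_phy 0>;
        clock-names = "xo", "mmss_gpll0_vote", "gpll0_vote", "gpll1_vote",
                      "gfx3d_clk_src",
                      "dsi0pll", "dsi0pllbyte", "dsi1pll", "dsi1pllbyte",
                      "hdmipll", "edp_link_clk", "edp_vco_div";
        #clock-cells = <1>;
        #reset-cells = <1>;
        #power-domain-cells = <1>;
    };
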
diff --git a/Documentation/devicetree/bindings/clock/qcom,msm8996-apcc.yaml b/Documentation/devicetree/bindings/clock/qcom,msm8996-apcc.yaml
index a20cb10636dd..fcace96c72eb 100644
--- a/Documentation/devicetree/bindings/clock/qcom,msm8996-apcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,msm8996-apcc.yaml
@@ -26,22 +26,20 @@ properties:
clocks:
items:
- - description: Primary PLL clock for power cluster (little)
- - description: Primary PLL clock for perf cluster (big)
- - description: Alternate PLL clock for power cluster (little)
- - description: Alternate PLL clock for perf cluster (big)
+ - description: XO source
+ - description: SYS APCS AUX clock
clock-names:
items:
- - const: pwrcl_pll
- - const: perfcl_pll
- - const: pwrcl_alt_pll
- - const: perfcl_alt_pll
+ - const: xo
+ - const: sys_apcs_aux
required:
- compatible
- reg
- '#clock-cells'
+ - clocks
+ - clock-names
additionalProperties: false
@@ -51,4 +49,7 @@ examples:
compatible = "qcom,msm8996-apcc";
reg = <0x6400000 0x90000>;
#clock-cells = <1>;
+
+ clocks = <&xo_board>, <&apcs_glb>;
+ clock-names = "xo", "sys_apcs_aux";
};
diff --git a/Documentation/devicetree/bindings/clock/qcom,msm8996-cbf.yaml b/Documentation/devicetree/bindings/clock/qcom,msm8996-cbf.yaml
new file mode 100644
index 000000000000..3ffe69d8cdd5
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,msm8996-cbf.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,msm8996-cbf.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm MSM8996 Core Bus Fabric (CBF) clock controller
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+description: >
+ The clock controller for the Qualcomm MSM8996 CBF clock, which drives the
+ interconnect between two CPU clusters.
+
+properties:
+ compatible:
+ const: qcom,msm8996-cbf
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: XO source
+ - description: SYS APCS AUX clock
+
+ '#clock-cells':
+ const: 0
+
+ '#interconnect-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+ - '#interconnect-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ clock-controller@9a11000 {
+ compatible = "qcom,msm8996-cbf";
+ reg = <0x09a11000 0x10000>;
+ clocks = <&rpmcc RPM_SMD_BB_CLK1>, <&apcs_glb>;
+ #clock-cells = <0>;
+ #interconnect-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,msm8998-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,msm8998-gpucc.yaml
index d747bb58f0a7..2d8897991663 100644
--- a/Documentation/devicetree/bindings/clock/qcom,msm8998-gpucc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,msm8998-gpucc.yaml
@@ -4,16 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,msm8998-gpucc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Graphics Clock & Reset Controller Binding for MSM8998
+title: Qualcomm Graphics Clock & Reset Controller on MSM8998
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm graphics clock control module which supports the clocks, resets and
- power domains on MSM8998.
+ Qualcomm graphics clock control module provides the clocks, resets and power
+ domains on MSM8998.
- See also dt-bindings/clock/qcom,gpucc-msm8998.h.
+ See also:: include/dt-bindings/clock/qcom,gpucc-msm8998.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,q6sstopcc.yaml b/Documentation/devicetree/bindings/clock/qcom,q6sstopcc.yaml
index bbaaf1e2a203..03fa30fe9253 100644
--- a/Documentation/devicetree/bindings/clock/qcom,q6sstopcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,q6sstopcc.yaml
@@ -11,7 +11,7 @@ maintainers:
properties:
compatible:
- const: "qcom,qcs404-q6sstopcc"
+ const: qcom,qcs404-q6sstopcc
reg:
items:
diff --git a/Documentation/devicetree/bindings/clock/qcom,qcm2290-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,qcm2290-dispcc.yaml
new file mode 100644
index 000000000000..4a00f2d41684
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,qcm2290-dispcc.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,qcm2290-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller on QCM2290
+
+maintainers:
+ - Loic Poulain <loic.poulain@linaro.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on QCM2290.
+
+ See also:: include/dt-bindings/clock/qcom,dispcc-qcm2290.h
+
+properties:
+ compatible:
+ const: qcom,qcm2290-dispcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board active-only XO source
+ - description: GPLL0 source from GCC
+ - description: GPLL0 div source from GCC
+ - description: Byte clock from DSI PHY
+ - description: Pixel clock from DSI PHY
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+ - const: gcc_disp_gpll0_clk_src
+ - const: gcc_disp_gpll0_div_clk_src
+ - const: dsi0_phy_pll_out_byteclk
+ - const: dsi0_phy_pll_out_dsiclk
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-qcm2290.h>
+ #include <dt-bindings/clock/qcom,gcc-qcm2290.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ clock-controller@5f00000 {
+ compatible = "qcom,qcm2290-dispcc";
+ reg = <0x5f00000 0x20000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&rpmcc RPM_SMD_XO_A_CLK_SRC>,
+ <&gcc GCC_DISP_GPLL0_CLK_SRC>,
+ <&gcc GCC_DISP_GPLL0_DIV_CLK_SRC>,
+ <&dsi0_phy 0>,
+ <&dsi0_phy 1>;
+ clock-names = "bi_tcxo",
+ "bi_tcxo_ao",
+ "gcc_disp_gpll0_clk_src",
+ "gcc_disp_gpll0_div_clk_src",
+ "dsi0_phy_pll_out_byteclk",
+ "dsi0_phy_pll_out_dsiclk";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,qdu1000-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,qdu1000-gcc.yaml
new file mode 100644
index 000000000000..767a9d03aa32
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,qdu1000-gcc.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,qdu1000-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller for QDU1000 and QRU1000
+
+maintainers:
+ - Melody Olvera <quic_molvera@quicinc.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on QDU1000 and QRU1000.
+
+ See also:: include/dt-bindings/clock/qcom,qdu1000-gcc.h
+
+properties:
+ compatible:
+ const: qcom,qdu1000-gcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Sleep clock source
+ - description: PCIE 0 Pipe clock source
+ - description: PCIE 0 Phy Auxiliary clock source
+ - description: USB3 Phy wrapper pipe clock source
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@100000 {
+ compatible = "qcom,qdu1000-gcc";
+ reg = <0x00100000 0x001f4200>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>, <&sleep_clk>,
+ <&pcie_0_pipe_clk>, <&pcie_0_phy_aux_clk>,
+ <&usb3_phy_wrapper_pipe_clk>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt b/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt
deleted file mode 100644
index a4877881f1d8..000000000000
--- a/Documentation/devicetree/bindings/clock/qcom,rpmcc.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-Qualcomm RPM Clock Controller Binding
-------------------------------------------------
-The RPM is a dedicated hardware engine for managing the shared
-SoC resources in order to keep the lowest power profile. It
-communicates with other hardware subsystems via shared memory
-and accepts clock requests, aggregates the requests and turns
-the clocks on/off or scales them on demand.
-
-Required properties :
-- compatible : shall contain only one of the following. The generic
- compatible "qcom,rpmcc" should be also included.
-
- "qcom,rpmcc-mdm9607", "qcom,rpmcc"
- "qcom,rpmcc-msm8660", "qcom,rpmcc"
- "qcom,rpmcc-apq8060", "qcom,rpmcc"
- "qcom,rpmcc-msm8226", "qcom,rpmcc"
- "qcom,rpmcc-msm8916", "qcom,rpmcc"
- "qcom,rpmcc-msm8936", "qcom,rpmcc"
- "qcom,rpmcc-msm8953", "qcom,rpmcc"
- "qcom,rpmcc-msm8974", "qcom,rpmcc"
- "qcom,rpmcc-msm8976", "qcom,rpmcc"
- "qcom,rpmcc-apq8064", "qcom,rpmcc"
- "qcom,rpmcc-ipq806x", "qcom,rpmcc"
- "qcom,rpmcc-msm8992",·"qcom,rpmcc"
- "qcom,rpmcc-msm8994",·"qcom,rpmcc"
- "qcom,rpmcc-msm8996", "qcom,rpmcc"
- "qcom,rpmcc-msm8998", "qcom,rpmcc"
- "qcom,rpmcc-qcs404", "qcom,rpmcc"
- "qcom,rpmcc-sdm660", "qcom,rpmcc"
- "qcom,rpmcc-sm6115", "qcom,rpmcc"
- "qcom,rpmcc-sm6125", "qcom,rpmcc"
-
-- #clock-cells : shall contain 1
-
-The clock enumerators are defined in <dt-bindings/clock/qcom,rpmcc.h>
-and come in pairs: FOO_CLK followed by FOO_A_CLK. The latter clock
-is an "active" clock, which means that the consumer only care that the
-clock is available when the apps CPU subsystem is active, i.e. not
-suspended or in deep idle. If it is important that the clock keeps running
-during system suspend, you need to specify the non-active clock, the one
-not containing *_A_* in the enumerator name.
-
-Example:
- smd {
- compatible = "qcom,smd";
-
- rpm {
- interrupts = <0 168 1>;
- qcom,ipc = <&apcs 8 0>;
- qcom,smd-edge = <15>;
-
- rpm_requests {
- compatible = "qcom,rpm-msm8916";
- qcom,smd-channels = "rpm_requests";
-
- rpmcc: clock-controller {
- compatible = "qcom,rpmcc-msm8916", "qcom,rpmcc";
- #clock-cells = <1>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/clock/qcom,rpmcc.yaml b/Documentation/devicetree/bindings/clock/qcom,rpmcc.yaml
new file mode 100644
index 000000000000..3665dd30604a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,rpmcc.yaml
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,rpmcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm RPM Clock Controller
+
+maintainers:
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+ - Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+
+description: |
+ The clock enumerators are defined in <dt-bindings/clock/qcom,rpmcc.h> and
+ come in pairs:: FOO_CLK followed by FOO_A_CLK. The latter clock is
+ an "active" clock, which means that the consumer only care that the clock is
+ available when the apps CPU subsystem is active, i.e. not suspended or in
+ deep idle. If it is important that the clock keeps running during system
+ suspend, you need to specify the non-active clock, the one not containing
+ *_A_* in the enumerator name.
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - qcom,rpmcc-apq8060
+ - qcom,rpmcc-apq8064
+ - qcom,rpmcc-ipq806x
+ - qcom,rpmcc-mdm9607
+ - qcom,rpmcc-msm8226
+ - qcom,rpmcc-msm8660
+ - qcom,rpmcc-msm8909
+ - qcom,rpmcc-msm8916
+ - qcom,rpmcc-msm8917
+ - qcom,rpmcc-msm8936
+ - qcom,rpmcc-msm8953
+ - qcom,rpmcc-msm8974
+ - qcom,rpmcc-msm8976
+ - qcom,rpmcc-msm8992
+ - qcom,rpmcc-msm8994
+ - qcom,rpmcc-msm8996
+ - qcom,rpmcc-msm8998
+ - qcom,rpmcc-qcm2290
+ - qcom,rpmcc-qcs404
+ - qcom,rpmcc-sdm660
+ - qcom,rpmcc-sm6115
+ - qcom,rpmcc-sm6125
+ - qcom,rpmcc-sm6375
+ - const: qcom,rpmcc
+
+ '#clock-cells':
+ const: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ maxItems: 2
+
+required:
+ - compatible
+ - '#clock-cells'
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,rpmcc-apq8060
+ - qcom,rpmcc-ipq806x
+ - qcom,rpmcc-msm8660
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: pxo clock
+
+ clock-names:
+ items:
+ - const: pxo
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,rpmcc-apq8064
+ then:
+ properties:
+ clocks:
+ items:
+ - description: pxo clock
+ - description: cxo clock
+
+ clock-names:
+ items:
+ - const: pxo
+ - const: cxo
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,rpmcc-mdm9607
+ - qcom,rpmcc-msm8226
+ - qcom,rpmcc-msm8916
+ - qcom,rpmcc-msm8917
+ - qcom,rpmcc-msm8936
+ - qcom,rpmcc-msm8953
+ - qcom,rpmcc-msm8974
+ - qcom,rpmcc-msm8976
+ - qcom,rpmcc-msm8992
+ - qcom,rpmcc-msm8994
+ - qcom,rpmcc-msm8996
+ - qcom,rpmcc-msm8998
+ - qcom,rpmcc-qcm2290
+ - qcom,rpmcc-qcs404
+ - qcom,rpmcc-sdm660
+ - qcom,rpmcc-sm6115
+ - qcom,rpmcc-sm6125
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: xo clock
+
+ clock-names:
+ items:
+ - const: xo
+
+additionalProperties: false
+
+examples:
+ - |
+ rpm {
+ rpm-requests {
+ compatible = "qcom,rpm-msm8916";
+ qcom,smd-channels = "rpm_requests";
+
+ clock-controller {
+ compatible = "qcom,rpmcc-msm8916", "qcom,rpmcc";
+ #clock-cells = <1>;
+ };
+ };
+ };
+
+ - |
+ rpm {
+ clock-controller {
+ compatible = "qcom,rpmcc-ipq806x", "qcom,rpmcc";
+ #clock-cells = <1>;
+ clocks = <&pxo_board>;
+ clock-names = "pxo";
+ };
+ };
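
The FOO_CLK/FOO_A_CLK pairing in the description matters on the consumer side: a device that must keep its clock ticking across system suspend has to reference the non-active enumerator. A hedged consumer sketch (the node and its address are illustrative; the enumerators come from dt-bindings/clock/qcom,rpmcc.h):

    #include <dt-bindings/clock/qcom,rpmcc.h>

    serial@78b0000 {
        /* RPM_SMD_XO_CLK_SRC stays up across suspend; the active-only
         * RPM_SMD_XO_A_CLK_SRC is only guaranteed while the apps CPU
         * subsystem is running. */
        clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>;
        clock-names = "xo";
    };
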
diff --git a/Documentation/devicetree/bindings/clock/qcom,rpmhcc.yaml b/Documentation/devicetree/bindings/clock/qcom,rpmhcc.yaml
index 72212970e6f5..d5a250b7c2af 100644
--- a/Documentation/devicetree/bindings/clock/qcom,rpmhcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,rpmhcc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/qcom,rpmhcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Technologies, Inc. RPMh Clocks Bindings
+title: Qualcomm Technologies, Inc. RPMh Clocks
maintainers:
- Taniya Das <tdas@codeaurora.org>
@@ -17,15 +17,22 @@ description: |
properties:
compatible:
enum:
+ - qcom,qdu1000-rpmh-clk
+ - qcom,sa8775p-rpmh-clk
- qcom,sc7180-rpmh-clk
- qcom,sc7280-rpmh-clk
- qcom,sc8180x-rpmh-clk
+ - qcom,sc8280xp-rpmh-clk
+ - qcom,sdm670-rpmh-clk
- qcom,sdm845-rpmh-clk
- qcom,sdx55-rpmh-clk
+ - qcom,sdx65-rpmh-clk
- qcom,sm6350-rpmh-clk
- qcom,sm8150-rpmh-clk
- qcom,sm8250-rpmh-clk
- qcom,sm8350-rpmh-clk
+ - qcom,sm8450-rpmh-clk
+ - qcom,sm8550-rpmh-clk
clocks:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/clock/qcom,sa8775p-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sa8775p-gcc.yaml
new file mode 100644
index 000000000000..0f641c235b13
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sa8775p-gcc.yaml
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sa8775p-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on sa8775p
+
+maintainers:
+ - Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and
+ power domains on sa8775p.
+
+ See also:: include/dt-bindings/clock/qcom,sa8775p-gcc.h
+
+properties:
+ compatible:
+ const: qcom,sa8775p-gcc
+
+ clocks:
+ items:
+ - description: XO reference clock
+ - description: Sleep clock
+ - description: UFS memory first RX symbol clock
+ - description: UFS memory second RX symbol clock
+ - description: UFS memory first TX symbol clock
+ - description: UFS card first RX symbol clock
+ - description: UFS card second RX symbol clock
+ - description: UFS card first TX symbol clock
+ - description: Primary USB3 PHY wrapper pipe clock
+ - description: Secondary USB3 PHY wrapper pipe clock
+ - description: PCIe 0 pipe clock
+ - description: PCIe 1 pipe clock
+ - description: PCIe PHY clock
+ - description: First EMAC controller reference clock
+ - description: Second EMAC controller reference clock
+
+ protected-clocks:
+ maxItems: 240
+
+ power-domains:
+ maxItems: 1
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ gcc: clock-controller@100000 {
+ compatible = "qcom,sa8775p-gcc";
+ reg = <0x100000 0xc7018>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&sleep_clk>,
+ <&ufs_phy_rx_symbol_0_clk>,
+ <&ufs_phy_rx_symbol_1_clk>,
+ <&ufs_phy_tx_symbol_0_clk>,
+ <&ufs_card_rx_symbol_0_clk>,
+ <&ufs_card_rx_symbol_1_clk>,
+ <&ufs_card_tx_symbol_0_clk>,
+ <&usb_0_ssphy>,
+ <&usb_1_ssphy>,
+ <&pcie_0_pipe_clk>,
+ <&pcie_1_pipe_clk>,
+ <&pcie_phy_pipe_clk>,
+ <&rxc0_ref_clk>,
+ <&rxc1_ref_clk>;
+ power-domains = <&rpmhpd SA8775P_CX>;
+
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7180-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7180-camcc.yaml
index f49027edfc44..098c8acf4bad 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sc7180-camcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7180-camcc.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,sc7180-camcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Camera Clock & Reset Controller Binding for SC7180
+title: Qualcomm Camera Clock & Reset Controller on SC7180
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm camera clock control module which supports the clocks, resets and
- power domains on SC7180.
+ Qualcomm camera clock control module provides the clocks, resets and power
+ domains on SC7180.
- See also:
- - dt-bindings/clock/qcom,camcc-sc7180.h
+ See also:: include/dt-bindings/clock/qcom,camcc-sc7180.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7180-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7180-dispcc.yaml
index e94847f92770..95ad16d0abc3 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sc7180-dispcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7180-dispcc.yaml
@@ -4,16 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,sc7180-dispcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Display Clock & Reset Controller Binding for SC7180
+title: Qualcomm Display Clock & Reset Controller on SC7180
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm display clock control module which supports the clocks, resets and
- power domains on SC7180.
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SC7180.
- See also dt-bindings/clock/qcom,dispcc-sc7180.h.
+ See also:: include/dt-bindings/clock/qcom,dispcc-sc7180.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7180-lpasscorecc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7180-lpasscorecc.yaml
index c54172fbf29f..f297694ef8b8 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sc7180-lpasscorecc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7180-lpasscorecc.yaml
@@ -4,17 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,sc7180-lpasscorecc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm LPASS Core Clock Controller Binding for SC7180
+title: Qualcomm LPASS Core Clock Controller on SC7180
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm LPASS core clock control module which supports the clocks and
- power domains on SC7180.
+ Qualcomm LPASS core clock control module provides the clocks and power
+ domains on SC7180.
- See also:
- - dt-bindings/clock/qcom,lpasscorecc-sc7180.h
+ See also:: include/dt-bindings/clock/qcom,lpasscorecc-sc7180.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7180-mss.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7180-mss.yaml
index 970030986a86..1e856a8a996e 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sc7180-mss.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7180-mss.yaml
@@ -4,16 +4,15 @@
$id: http://devicetree.org/schemas/clock/qcom,sc7180-mss.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Modem Clock Controller Binding for SC7180
+title: Qualcomm Modem Clock Controller on SC7180
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm modem clock control module which supports the clocks on SC7180.
+ Qualcomm modem clock control module provides the clocks on SC7180.
- See also:
- - dt-bindings/clock/qcom,mss-sc7180.h
+ See also:: include/dt-bindings/clock/qcom,mss-sc7180.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7280-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7280-camcc.yaml
new file mode 100644
index 000000000000..b60adbad4590
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7280-camcc.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sc7280-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller on SC7280
+
+maintainers:
+ - Taniya Das <tdas@codeaurora.org>
+
+description: |
+ Qualcomm camera clock control module provides the clocks, resets and
+ power domains on SC7280.
+
+ See also:: include/dt-bindings/clock/qcom,camcc-sc7280.h
+
+properties:
+ compatible:
+ const: qcom,sc7280-camcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board XO active source
+ - description: Sleep clock source
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+ - const: sleep_clk
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@ad00000 {
+ compatible = "qcom,sc7280-camcc";
+ reg = <0x0ad00000 0x10000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&rpmhcc RPMH_CXO_CLK_A>,
+ <&sleep_clk>;
+ clock-names = "bi_tcxo", "bi_tcxo_ao", "sleep_clk";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7280-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7280-dispcc.yaml
index 2178666fb697..cfe6594a0a6b 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sc7280-dispcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7280-dispcc.yaml
@@ -4,16 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,sc7280-dispcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Display Clock & Reset Controller Binding for SC7280
+title: Qualcomm Display Clock & Reset Controller on SC7280
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm display clock control module which supports the clocks, resets and
- power domains on SC7280.
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SC7280.
- See also dt-bindings/clock/qcom,dispcc-sc7280.h.
+ See also:: include/dt-bindings/clock/qcom,dispcc-sc7280.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscc.yaml
new file mode 100644
index 000000000000..97c6bd96e0cb
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscc.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sc7280-lpasscc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm LPASS Core Clock Controller on SC7280
+
+maintainers:
+ - Taniya Das <tdas@codeaurora.org>
+
+description: |
+ Qualcomm LPASS core clock control module provides the clocks and power
+ domains on SC7280.
+
+ See also:: include/dt-bindings/clock/qcom,lpass-sc7280.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sc7280-lpasscc
+
+ clocks:
+ items:
+ - description: gcc_cfg_noc_lpass_clk from GCC
+
+ clock-names:
+ items:
+ - const: iface
+
+ '#clock-cells':
+ const: 1
+
+ reg:
+ items:
+ - description: LPASS qdsp6ss register
+ - description: LPASS top-cc register
+
+ reg-names:
+ items:
+ - const: qdsp6ss
+ - const: top_cc
+
+ qcom,adsp-pil-mode:
+ description:
+ Indicates if the LPASS would be brought out of reset using the
+ remoteproc peripheral loader.
+ type: boolean
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpass-sc7280.h>
+ clock-controller@3000000 {
+ compatible = "qcom,sc7280-lpasscc";
+ reg = <0x03000000 0x40>, <0x03c04000 0x4>;
+ reg-names = "qdsp6ss", "top_cc";
+ clocks = <&gcc GCC_CFG_NOC_LPASS_CLK>;
+ clock-names = "iface";
+ qcom,adsp-pil-mode;
+ #clock-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscorecc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscorecc.yaml
new file mode 100644
index 000000000000..447cdc447a0c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sc7280-lpasscorecc.yaml
@@ -0,0 +1,192 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sc7280-lpasscorecc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm LPASS Core & Audio Clock Controller on SC7280
+
+maintainers:
+ - Taniya Das <tdas@codeaurora.org>
+
+description: |
+ Qualcomm LPASS core and audio clock control module provides the clocks and
+ power domains on SC7280.
+
+ See also::
+ include/dt-bindings/clock/qcom,lpasscorecc-sc7280.h
+ include/dt-bindings/clock/qcom,lpassaudiocc-sc7280.h
+
+properties:
+ clocks: true
+
+ clock-names: true
+
+ reg: true
+
+ compatible:
+ enum:
+ - qcom,sc7280-lpassaoncc
+ - qcom,sc7280-lpassaudiocc
+ - qcom,sc7280-lpasscorecc
+ - qcom,sc7280-lpasshm
+
+ power-domains:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ qcom,adsp-pil-mode:
+ description:
+ Indicates if the LPASS would be brought out of reset using the
+ peripheral loader.
+ type: boolean
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,sc7280-lpassaudiocc
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: LPASS_AON_CC_MAIN_RCG_CLK_SRC
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: lpass_aon_cc_main_rcg_clk_src
+
+ reg:
+ items:
+ - description: lpass core cc register
+ - description: lpass audio csr register
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7280-lpassaoncc
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board XO active only source
+ - description: LPASS_AON_CC_MAIN_RCG_CLK_SRC
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+ - const: iface
+
+ reg:
+ maxItems: 1
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7280-lpasshm
+ - qcom,sc7280-lpasscorecc
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+
+ reg:
+ maxItems: 1
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpassaudiocc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpasscorecc-sc7280.h>
+ lpass_audiocc: clock-controller@3300000 {
+ compatible = "qcom,sc7280-lpassaudiocc";
+ reg = <0x3300000 0x30000>,
+ <0x32a9000 0x1000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&lpass_aon LPASS_AON_CC_MAIN_RCG_CLK_SRC>;
+ clock-names = "bi_tcxo", "lpass_aon_cc_main_rcg_clk_src";
+ power-domains = <&lpass_aon LPASS_AON_CC_LPASS_AUDIO_HM_GDSC>;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpassaudiocc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpasscorecc-sc7280.h>
+ lpass_hm: clock-controller@3c00000 {
+ compatible = "qcom,sc7280-lpasshm";
+ reg = <0x3c00000 0x28>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "bi_tcxo";
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpassaudiocc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpasscorecc-sc7280.h>
+ lpasscore: clock-controller@3900000 {
+ compatible = "qcom,sc7280-lpasscorecc";
+ reg = <0x3900000 0x50000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "bi_tcxo";
+ power-domains = <&lpass_hm LPASS_CORE_CC_LPASS_CORE_HM_GDSC>;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpassaudiocc-sc7280.h>
+ #include <dt-bindings/clock/qcom,lpasscorecc-sc7280.h>
+ lpass_aon: clock-controller@3380000 {
+ compatible = "qcom,sc7280-lpassaoncc";
+ reg = <0x3380000 0x30000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>, <&rpmhcc RPMH_CXO_CLK_A>,
+ <&lpasscore LPASS_CORE_CC_CORE_CLK>;
+ clock-names = "bi_tcxo", "bi_tcxo_ao","iface";
+ qcom,adsp-pil-mode;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sdm845-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sdm845-camcc.yaml
new file mode 100644
index 000000000000..91d1f7918037
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sdm845-camcc.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sdm845-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller on SDM845
+
+maintainers:
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+
+description: |
+ Qualcomm camera clock control module provides the clocks, resets and power
+ domains on SDM845.
+
+ See also:: include/dt-bindings/clock/qcom,camcc-sdm845.h
+
+properties:
+ compatible:
+ const: qcom,sdm845-camcc
+
+ clocks:
+ items:
+ - description: Board XO source
+
+ clock-names:
+ items:
+ - const: bi_tcxo
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@ad00000 {
+ compatible = "qcom,sdm845-camcc";
+ reg = <0x0ad00000 0x10000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "bi_tcxo";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sdm845-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sdm845-dispcc.yaml
index 4a3be733d042..76b53ce64e40 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sdm845-dispcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sdm845-dispcc.yaml
@@ -4,16 +4,16 @@
$id: http://devicetree.org/schemas/clock/qcom,sdm845-dispcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Display Clock & Reset Controller Binding for SDM845
+title: Qualcomm Display Clock & Reset Controller on SDM845
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm display clock control module which supports the clocks, resets and
- power domains on SDM845.
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SDM845.
- See also dt-bindings/clock/qcom,dispcc-sdm845.h.
+ See also:: include/dt-bindings/clock/qcom,dispcc-sdm845.h
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/clock/qcom,sdm845-lpasscc.yaml b/Documentation/devicetree/bindings/clock/qcom,sdm845-lpasscc.yaml
new file mode 100644
index 000000000000..a96fd837c70a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sdm845-lpasscc.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sdm845-lpasscc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SDM845 LPASS Clock Controller
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ Qualcomm SDM845 LPASS (Low Power Audio SubSystem) Clock Controller.
+
+ See also:: include/dt-bindings/clock/qcom,lpass-sdm845.h
+
+properties:
+ compatible:
+ const: qcom,sdm845-lpasscc
+
+ '#clock-cells':
+ const: 1
+
+ reg:
+ maxItems: 2
+
+ reg-names:
+ items:
+ - const: cc
+ - const: qdsp6ss
+
+required:
+ - compatible
+ - '#clock-cells'
+ - reg
+ - reg-names
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@17014000 {
+ compatible = "qcom,sdm845-lpasscc";
+ reg = <0x17014000 0x1f004>, <0x17300000 0x200>;
+ reg-names = "cc", "qdsp6ss";
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6115-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6115-dispcc.yaml
new file mode 100644
index 000000000000..f802a2e7f818
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6115-dispcc.yaml
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6115-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock Controller for SM6115
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks and power domains
+ on SM6115.
+
+ See also:: include/dt-bindings/clock/qcom,sm6115-dispcc.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm6115-dispcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board sleep clock
+ - description: Byte clock from DSI PHY0
+ - description: Pixel clock from DSI PHY0
+ - description: GPLL0 DISP DIV clock from GCC
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/clock/qcom,gcc-sm6115.h>
+ clock-controller@5f00000 {
+ compatible = "qcom,sm6115-dispcc";
+ reg = <0x5f00000 0x20000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&sleep_clk>,
+ <&dsi0_phy 0>,
+ <&dsi0_phy 1>,
+ <&gcc GCC_DISP_GPLL0_DIV_CLK_SRC>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml
new file mode 100644
index 000000000000..cf19f44af774
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6115-gpucc.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6115-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on SM6115
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+ Qualcomm graphics clock control module provides clocks, resets and power
+ domains on Qualcomm SoCs.
+
+ See also:: include/dt-bindings/clock/qcom,sm6115-gpucc.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm6115-gpucc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: GPLL0 main branch source
+ - description: GPLL0 main div source
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sm6115.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+
+ soc {
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ clock-controller@5990000 {
+ compatible = "qcom,sm6115-gpucc";
+ reg = <0x05990000 0x9000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&gcc GCC_GPU_GPLL0_CLK_SRC>,
+ <&gcc GCC_GPU_GPLL0_DIV_CLK_SRC>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml
new file mode 100644
index 000000000000..374a1844a159
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6125-gpucc.yaml
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6125-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on SM6125
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+ Qualcomm graphics clock control module provides clocks and power domains on
+ Qualcomm SoCs.
+
+ See also:: include/dt-bindings/clock/qcom,sm6125-gpucc.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm6125-gpucc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: GPLL0 main branch source
+
+ '#clock-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sm6125.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+
+ soc {
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ clock-controller@5990000 {
+ compatible = "qcom,sm6125-gpucc";
+ reg = <0x05990000 0x9000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&gcc GCC_GPU_GPLL0_CLK_SRC>;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml
new file mode 100644
index 000000000000..fd6658cb793d
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6350-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller on SM6350
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+ Qualcomm camera clock control module provides the clocks, resets and power
+ domains on SM6350.
+
+ See also:: include/dt-bindings/clock/qcom,sm6350-camcc.h
+
+properties:
+ compatible:
+ const: qcom,sm6350-camcc
+
+ clocks:
+ items:
+ - description: Board XO source
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@ad00000 {
+ compatible = "qcom,sm6350-camcc";
+ reg = <0x0ad00000 0x16000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml
new file mode 100644
index 000000000000..183b1c75dbdf
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-dispcc.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6375-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller on SM6375
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SM6375.
+
+ See also:: include/dt-bindings/clock/qcom,dispcc-sm6375.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm6375-dispcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: GPLL0 source from GCC
+ - description: Byte clock from DSI PHY
+ - description: Pixel clock from DSI PHY
+
+required:
+ - compatible
+ - clocks
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm6375-gcc.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+
+ clock-controller@5f00000 {
+ compatible = "qcom,sm6375-dispcc";
+ reg = <0x05f00000 0x20000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&gcc GCC_DISP_GPLL0_CLK_SRC>,
+ <&dsi_phy 0>,
+ <&dsi_phy 1>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml
new file mode 100644
index 000000000000..295d4bb1a966
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gcc.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6375-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SM6375
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@somainline.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM6375.
+
+ See also:: include/dt-bindings/clock/qcom,sm6375-gcc.h
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm6375-gcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board XO Active-Only source
+ - description: Sleep clock source
+
+required:
+ - compatible
+ - clocks
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ clock-controller@1400000 {
+ compatible = "qcom,sm6375-gcc";
+ reg = <0x01400000 0x1f0000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&rpmcc RPM_SMD_XO_A_CLK_SRC>,
+ <&sleep_clk>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml
new file mode 100644
index 000000000000..b480ead5bd69
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6375-gpucc.yaml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm6375-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on SM6375
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@linaro.org>
+
+description: |
+ Qualcomm graphics clock control module provides clocks, resets and power
+ domains on Qualcomm SoCs.
+
+ See also:: include/dt-bindings/clock/qcom,sm6375-gpucc.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm6375-gpucc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: GPLL0 main branch source
+ - description: GPLL0 div branch source
+ - description: SNoC DVM GFX source
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm6375-gcc.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ clock-controller@5990000 {
+ compatible = "qcom,sm6375-gpucc";
+ reg = <0 0x05990000 0 0x9000>;
+ clocks = <&rpmcc RPM_SMD_XO_CLK_SRC>,
+ <&gcc GCC_GPU_GPLL0_CLK_SRC>,
+ <&gcc GCC_GPU_GPLL0_DIV_CLK_SRC>,
+ <&gcc GCC_GPU_SNOC_DVM_GFX_CLK>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml
new file mode 100644
index 000000000000..0eb76d9d51c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm7150-gcc.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm7150-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SM7150
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+ - Danila Tikhonov <danila@jiaxyga.com>
+ - David Wronek <davidwronek@gmail.com>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM7150.
+
+ See also:: include/dt-bindings/clock/qcom,sm7150-gcc.h
+
+properties:
+ compatible:
+ const: qcom,sm7150-gcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board XO Active-Only source
+ - description: Sleep clock source
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@100000 {
+ compatible = "qcom,sm7150-gcc";
+ reg = <0x00100000 0x001f0000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&rpmhcc RPMH_CXO_CLK_A>,
+ <&sleep_clk>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml
new file mode 100644
index 000000000000..87ae74166807
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-camcc.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm8450-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller on SM8450
+
+maintainers:
+ - Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
+
+description: |
+ Qualcomm camera clock control module provides the clocks, resets and power
+ domains on SM8450.
+
+ See also:: include/dt-bindings/clock/qcom,sm8450-camcc.h
+
+properties:
+ compatible:
+ const: qcom,sm8450-camcc
+
+ clocks:
+ items:
+ - description: Camera AHB clock from GCC
+ - description: Board XO source
+ - description: Board active XO source
+ - description: Sleep clock source
+
+ power-domains:
+ maxItems: 1
+ description:
+ A phandle and PM domain specifier for the MMCX power domain.
+
+ required-opps:
+ maxItems: 1
+ description:
+ A phandle to an OPP node describing the required MMCX performance point.
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - power-domains
+ - required-opps
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sm8450.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+ clock-controller@ade0000 {
+ compatible = "qcom,sm8450-camcc";
+ reg = <0xade0000 0x20000>;
+ clocks = <&gcc GCC_CAMERA_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>,
+ <&rpmhcc RPMH_CXO_CLK_A>,
+ <&sleep_clk>;
+ power-domains = <&rpmhpd SM8450_MMCX>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-dispcc.yaml
new file mode 100644
index 000000000000..1dd1f696dcd3
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-dispcc.yaml
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm8450-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller for SM8450
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SM8450.
+
+ See also:: include/dt-bindings/clock/qcom,sm8450-dispcc.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm8450-dispcc
+
+ clocks:
+ minItems: 3
+ items:
+ - description: Board XO source
+ - description: Board Always On XO source
+ - description: Display's AHB clock
+ - description: sleep clock
+ - description: Byte clock from DSI PHY0
+ - description: Pixel clock from DSI PHY0
+ - description: Byte clock from DSI PHY1
+ - description: Pixel clock from DSI PHY1
+ - description: Link clock from DP PHY0
+ - description: VCO DIV clock from DP PHY0
+ - description: Link clock from DP PHY1
+ - description: VCO DIV clock from DP PHY1
+ - description: Link clock from DP PHY2
+ - description: VCO DIV clock from DP PHY2
+ - description: Link clock from DP PHY3
+ - description: VCO DIV clock from DP PHY3
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+ power-domains:
+ description:
+ A phandle and PM domain specifier for the MMCX power domain.
+ maxItems: 1
+
+ required-opps:
+ description:
+ A phandle to an OPP node describing the required MMCX performance point.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sm8450.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+ clock-controller@af00000 {
+ compatible = "qcom,sm8450-dispcc";
+ reg = <0x0af00000 0x10000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&rpmhcc RPMH_CXO_CLK_A>,
+ <&gcc GCC_DISP_AHB_CLK>,
+ <&sleep_clk>,
+ <&dsi0_phy 0>,
+ <&dsi0_phy 1>,
+ <&dsi1_phy 0>,
+ <&dsi1_phy 1>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ power-domains = <&rpmhpd SM8450_MMCX>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+...
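Because the clocks list above sets minItems: 3, the trailing DSI and DP PHY inputs may be dropped when those PHYs are not wired up; the example keeps only the DSI pairs. A sketch of the fully populated case follows; the dp0_phy..dp3_phy labels are assumptions for illustration, not names defined by this binding:

    clock-controller@af00000 {
        compatible = "qcom,sm8450-dispcc";
        reg = <0x0af00000 0x10000>;
        clocks = <&rpmhcc RPMH_CXO_CLK>,
                 <&rpmhcc RPMH_CXO_CLK_A>,
                 <&gcc GCC_DISP_AHB_CLK>,
                 <&sleep_clk>,
                 <&dsi0_phy 0>, <&dsi0_phy 1>,
                 <&dsi1_phy 0>, <&dsi1_phy 1>,
                 <&dp0_phy 0>, <&dp0_phy 1>,   /* hypothetical DP PHY labels */
                 <&dp1_phy 0>, <&dp1_phy 1>,
                 <&dp2_phy 0>, <&dp2_phy 1>,
                 <&dp3_phy 0>, <&dp3_phy 1>;
        power-domains = <&rpmhpd SM8450_MMCX>;
        required-opps = <&rpmhpd_opp_low_svs>;
        #clock-cells = <1>;
        #reset-cells = <1>;
        #power-domain-cells = <1>;
    };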
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8550-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8550-dispcc.yaml
new file mode 100644
index 000000000000..ab25f7cbaa2e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8550-dispcc.yaml
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm8550-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller for SM8550
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+ - Neil Armstrong <neil.armstrong@linaro.org>
+
+description: |
+ Qualcomm display clock control module provides the clocks, resets and power
+ domains on SM8550.
+
+ See also:: include/dt-bindings/clock/qcom,sm8550-dispcc.h
+
+properties:
+ compatible:
+ enum:
+ - qcom,sm8550-dispcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board Always On XO source
+ - description: Display's AHB clock
+ - description: sleep clock
+ - description: Byte clock from DSI PHY0
+ - description: Pixel clock from DSI PHY0
+ - description: Byte clock from DSI PHY1
+ - description: Pixel clock from DSI PHY1
+ - description: Link clock from DP PHY0
+ - description: VCO DIV clock from DP PHY0
+ - description: Link clock from DP PHY1
+ - description: VCO DIV clock from DP PHY1
+ - description: Link clock from DP PHY2
+ - description: VCO DIV clock from DP PHY2
+ - description: Link clock from DP PHY3
+ - description: VCO DIV clock from DP PHY3
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+ '#power-domain-cells':
+ const: 1
+
+ reg:
+ maxItems: 1
+
+ power-domains:
+ description:
+ A phandle and PM domain specifier for the MMCX power domain.
+ maxItems: 1
+
+ required-opps:
+ description:
+ A phandle to an OPP node describing the required MMCX performance point.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+ - '#reset-cells'
+ - '#power-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm8550-gcc.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+ clock-controller@af00000 {
+ compatible = "qcom,sm8550-dispcc";
+ reg = <0x0af00000 0x10000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&rpmhcc RPMH_CXO_CLK_A>,
+ <&gcc GCC_DISP_AHB_CLK>,
+ <&sleep_clk>,
+ <&dsi0_phy 0>,
+ <&dsi0_phy 1>,
+ <&dsi1_phy 0>,
+ <&dsi1_phy 1>,
+ <&dp0_phy 0>,
+ <&dp0_phy 1>,
+ <&dp1_phy 0>,
+ <&dp1_phy 1>,
+ <&dp2_phy 0>,
+ <&dp2_phy 1>,
+ <&dp3_phy 0>,
+ <&dp3_phy 1>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ power-domains = <&rpmhpd SM8550_MMCX>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8550-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8550-gcc.yaml
new file mode 100644
index 000000000000..0c706de31cf1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8550-gcc.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm8550-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on SM8550
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ Qualcomm global clock control module provides the clocks, resets and power
+ domains on SM8550.
+
+ See also:: include/dt-bindings/clock/qcom,sm8550-gcc.h
+
+properties:
+ compatible:
+ const: qcom,sm8550-gcc
+
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Sleep clock source
+ - description: PCIE 0 Pipe clock source
+ - description: PCIE 1 Pipe clock source
+ - description: PCIE 1 Phy Auxiliary clock source
+ - description: UFS Phy Rx symbol 0 clock source
+ - description: UFS Phy Rx symbol 1 clock source
+ - description: UFS Phy Tx symbol 0 clock source
+ - description: USB3 Phy wrapper pipe clock source
+
+required:
+ - compatible
+ - clocks
+
+allOf:
+ - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ clock-controller@100000 {
+ compatible = "qcom,sm8550-gcc";
+ reg = <0x00100000 0x001f4200>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>, <&sleep_clk>,
+ <&pcie0_phy>,
+ <&pcie1_phy>,
+ <&pcie_1_phy_aux_clk>,
+ <&ufs_mem_phy 0>,
+ <&ufs_mem_phy 1>,
+ <&ufs_mem_phy 2>,
+ <&usb_1_qmpphy>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ #power-domain-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml
new file mode 100644
index 000000000000..1bf1a41fd89c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8550-tcsr.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sm8550-tcsr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm TCSR Clock Controller on SM8550
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description: |
+ Qualcomm TCSR clock control module provides the clocks, resets and
+ power domains on SM8550.
+
+ See also:: include/dt-bindings/clock/qcom,sm8550-tcsr.h
+
+properties:
+ compatible:
+ items:
+ - const: qcom,sm8550-tcsr
+ - const: syscon
+
+ clocks:
+ items:
+ - description: TCXO pad clock
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+
+ clock-controller@1fc0000 {
+ compatible = "qcom,sm8550-tcsr", "syscon";
+ reg = <0x1fc0000 0x30000>;
+ clocks = <&rpmhcc RPMH_CXO_CLK>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.txt b/Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.txt
deleted file mode 100644
index 7474aba36607..000000000000
--- a/Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-Qualcomm Technologies, Inc. SPMI PMIC clock divider (clkdiv)
-
-clkdiv configures the clock frequency of a set of outputs on the PMIC.
-These clocks are typically wired through alternate functions on
-gpio pins.
-
-=======================
-Properties
-=======================
-
-- compatible
- Usage: required
- Value type: <string>
- Definition: must be "qcom,spmi-clkdiv".
-
-- reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: base address of CLKDIV peripherals.
-
-- qcom,num-clkdivs
- Usage: required
- Value type: <u32>
- Definition: number of CLKDIV peripherals.
-
-- clocks:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: reference to the xo clock.
-
-- clock-names:
- Usage: required
- Value type: <stringlist>
- Definition: must be "xo".
-
-- #clock-cells:
- Usage: required
- Value type: <u32>
- Definition: shall contain 1.
-
-=======
-Example
-=======
-
-pm8998_clk_divs: clock-controller@5b00 {
- compatible = "qcom,spmi-clkdiv";
- reg = <0x5b00>;
- #clock-cells = <1>;
- qcom,num-clkdivs = <3>;
- clocks = <&xo_board>;
- clock-names = "xo";
-
- assigned-clocks = <&pm8998_clk_divs 1>,
- <&pm8998_clk_divs 2>,
- <&pm8998_clk_divs 3>;
- assigned-clock-rates = <9600000>,
- <9600000>,
- <9600000>;
-};
diff --git a/Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.yaml b/Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.yaml
new file mode 100644
index 000000000000..16c95ad6c9d1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,spmi-clkdiv.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,spmi-clkdiv.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SPMI PMIC clock divider
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+ - Stephen Boyd <sboyd@kernel.org>
+
+description: |
+ Qualcomm SPMI PMIC clock divider configures the clock frequency of a set of
+ outputs on the PMIC. These clocks are typically wired through alternate
+ functions on GPIO pins.
+
+properties:
+ compatible:
+ const: qcom,spmi-clkdiv
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Board XO source
+
+ clock-names:
+ items:
+ - const: xo
+
+ "#clock-cells":
+ const: 1
+
+ qcom,num-clkdivs:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Number of CLKDIV peripherals.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - "#clock-cells"
+ - qcom,num-clkdivs
+
+additionalProperties: false
+
+examples:
+ - |
+ pmic {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ clock-controller@5b00 {
+ compatible = "qcom,spmi-clkdiv";
+ reg = <0x5b00>;
+ clocks = <&xo_board>;
+ clock-names = "xo";
+ #clock-cells = <1>;
+ qcom,num-clkdivs = <3>;
+
+ assigned-clocks = <&pm8998_clk_divs 1>,
+ <&pm8998_clk_divs 2>,
+ <&pm8998_clk_divs 3>;
+ assigned-clock-rates = <9600000>,
+ <9600000>,
+ <9600000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/clock/qcom,videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,videocc.yaml
index 0d224f114b5b..2b07146161b4 100644
--- a/Documentation/devicetree/bindings/clock/qcom,videocc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,videocc.yaml
@@ -4,21 +4,21 @@
$id: http://devicetree.org/schemas/clock/qcom,videocc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Qualcomm Video Clock & Reset Controller Binding
+title: Qualcomm Video Clock & Reset Controller
maintainers:
- Taniya Das <tdas@codeaurora.org>
description: |
- Qualcomm video clock control module which supports the clocks, resets and
- power domains on Qualcomm SoCs.
+ Qualcomm video clock control module provides the clocks, resets and power
+ domains on Qualcomm SoCs.
- See also:
- dt-bindings/clock/qcom,videocc-sc7180.h
- dt-bindings/clock/qcom,videocc-sc7280.h
- dt-bindings/clock/qcom,videocc-sdm845.h
- dt-bindings/clock/qcom,videocc-sm8150.h
- dt-bindings/clock/qcom,videocc-sm8250.h
+ See also::
+ include/dt-bindings/clock/qcom,videocc-sc7180.h
+ include/dt-bindings/clock/qcom,videocc-sc7280.h
+ include/dt-bindings/clock/qcom,videocc-sdm845.h
+ include/dt-bindings/clock/qcom,videocc-sm8150.h
+ include/dt-bindings/clock/qcom,videocc-sm8250.h
properties:
compatible:
@@ -30,12 +30,12 @@ properties:
- qcom,sm8250-videocc
clocks:
- items:
- - description: Board XO source
+ minItems: 1
+ maxItems: 3
clock-names:
- items:
- - const: bi_tcxo
+ minItems: 1
+ maxItems: 3
'#clock-cells':
const: 1
@@ -49,6 +49,16 @@ properties:
reg:
maxItems: 1
+ power-domains:
+ description:
+ A phandle and PM domain specifier for the MMCX power domain.
+ maxItems: 1
+
+ required-opps:
+ description:
+ A phandle to an OPP node describing the required MMCX performance point.
+ maxItems: 1
+
required:
- compatible
- reg
@@ -58,11 +68,63 @@ required:
- '#reset-cells'
- '#power-domain-cells'
+allOf:
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,sc7180-videocc
+ - qcom,sdm845-videocc
+ - qcom,sm8150-videocc
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ clock-names:
+ items:
+ - const: bi_tcxo
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,sc7280-videocc
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Board XO source
+ - description: Board active XO source
+ clock-names:
+ items:
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,sm8250-videocc
+ then:
+ properties:
+ clocks:
+ items:
+ - description: AHB
+ - description: Board XO source
+ - description: Board active XO source
+ clock-names:
+ items:
+ - const: iface
+ - const: bi_tcxo
+ - const: bi_tcxo_ao
+
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
clock-controller@ab00000 {
compatible = "qcom,sdm845-videocc";
reg = <0x0ab00000 0x10000>;
@@ -71,5 +133,7 @@ examples:
#clock-cells = <1>;
#reset-cells = <1>;
#power-domain-cells = <1>;
+ power-domains = <&rpmhpd SM8250_MMCX>;
+ required-opps = <&rpmhpd_opp_low_svs>;
};
...
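The new allOf blocks key the clock list off the compatible string, so the sm8250 variant takes three inputs (iface, bi_tcxo, bi_tcxo_ao) where sdm845 takes one. A sketch of the sm8250 case, assuming the usual &gcc and &rpmhcc provider labels and an AHB source such as GCC_VIDEO_AHB_CLK (illustrative, not fixed by this schema):

    videocc: clock-controller@abf0000 {
        compatible = "qcom,sm8250-videocc";
        reg = <0x0abf0000 0x10000>;
        clocks = <&gcc GCC_VIDEO_AHB_CLK>,
                 <&rpmhcc RPMH_CXO_CLK>,
                 <&rpmhcc RPMH_CXO_CLK_A>;
        clock-names = "iface", "bi_tcxo", "bi_tcxo_ao";
        power-domains = <&rpmhpd SM8250_MMCX>;
        required-opps = <&rpmhpd_opp_low_svs>;
        #clock-cells = <1>;
        #reset-cells = <1>;
        #power-domain-cells = <1>;
    };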
diff --git a/Documentation/devicetree/bindings/clock/qoriq-clock.txt b/Documentation/devicetree/bindings/clock/qoriq-clock.txt
index f7d48f23da44..10119d9ef4b1 100644
--- a/Documentation/devicetree/bindings/clock/qoriq-clock.txt
+++ b/Documentation/devicetree/bindings/clock/qoriq-clock.txt
@@ -44,6 +44,7 @@ Required properties:
* "fsl,ls1046a-clockgen"
* "fsl,ls1088a-clockgen"
* "fsl,ls2080a-clockgen"
+ * "fsl,lx2160a-clockgen"
Chassis-version clock strings include:
* "fsl,qoriq-clockgen-1.0": for chassis 1.0 clocks
* "fsl,qoriq-clockgen-2.0": for chassis 2.0 clocks
diff --git a/Documentation/devicetree/bindings/clock/renesas,9series.yaml b/Documentation/devicetree/bindings/clock/renesas,9series.yaml
new file mode 100644
index 000000000000..3afdebdb52ad
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/renesas,9series.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/renesas,9series.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas 9-series I2C PCIe clock generators
+
+description: |
+ The Renesas 9-series are I2C PCIe clock generators providing
+ from 1 to 20 output clocks.
+
+ When referencing the provided clock in the DT using phandle
+ and clock specifier, the following mapping applies:
+
+ - 9FGV0241:
+ 0 -- DIF0
+ 1 -- DIF1
+ - 9FGV0441:
+ 0 -- DIF0
+ 1 -- DIF1
+ 2 -- DIF2
+ 3 -- DIF3
+
+maintainers:
+ - Marek Vasut <marex@denx.de>
+
+properties:
+ compatible:
+ enum:
+ - renesas,9fgv0241
+ - renesas,9fgv0441
+
+ reg:
+ description: I2C device address
+ enum: [ 0x68, 0x6a ]
+
+ '#clock-cells':
+ const: 1
+
+ clocks:
+ items:
+ - description: XTal input clock
+
+ renesas,out-amplitude-microvolt:
+ enum: [ 600000, 700000, 800000, 900000 ]
+ description: Output clock signal amplitude
+
+ renesas,out-spread-spectrum:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [ 100000, 99750, 99500 ]
+ description: Output clock down spread in pcm (1/1000 of a percent)
+
+patternProperties:
+ "^DIF[0-19]$":
+ type: object
+ description:
+ Description of one of the outputs (DIF0..DIF19).
+
+ properties:
+ renesas,slew-rate:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [ 2000000, 3000000 ]
+ description: Output clock slew rate select in V/ns
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ /* 25MHz reference crystal */
+ ref25: ref25m {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <25000000>;
+ };
+
+ i2c@0 {
+ reg = <0x0 0x100>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ rs9: clock-generator@6a {
+ compatible = "renesas,9fgv0241";
+ reg = <0x6a>;
+ #clock-cells = <1>;
+
+ clocks = <&ref25m>;
+
+ DIF0 {
+ renesas,slew-rate = <3000000>;
+ };
+ };
+ };
+
+...
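Since #clock-cells is 1 and the cell selects an output per the DIF mapping in the description, a consumer picks an output by index. A minimal consumer sketch (the node and its use of the clock are hypothetical):

    /* Takes its reference clock from DIF1 of the rs9 generator above */
    ethernet-phy@1 {
        reg = <1>;
        clocks = <&rs9 1>;  /* specifier 1 = output DIF1 */
    };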
diff --git a/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clock.yaml b/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clock.yaml
index c55a7c494e01..2197c952e21d 100644
--- a/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,cpg-div6-clock.yaml
@@ -51,6 +51,18 @@ additionalProperties: false
examples:
- |
#include <dt-bindings/clock/r8a73a4-clock.h>
+
+ cpg_clocks: cpg_clocks@e6150000 {
+ compatible = "renesas,r8a73a4-cpg-clocks";
+ reg = <0xe6150000 0x10000>;
+ clocks = <&extal1_clk>, <&extal2_clk>;
+ #clock-cells = <1>;
+ clock-output-names = "main", "pll0", "pll1", "pll2",
+ "pll2s", "pll2h", "z", "z2",
+ "i", "m3", "b", "m1", "m2",
+ "zx", "zs", "hp";
+ };
+
sdhi2_clk: sdhi2_clk@e615007c {
compatible = "renesas,r8a73a4-div6-clock", "renesas,cpg-div6-clock";
reg = <0xe615007c 4>;
diff --git a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
index 9b414fbde6d7..9c3dc6c4fa94 100644
--- a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/clock/renesas,cpg-mssr.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/clock/renesas,cpg-mssr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas Clock Pulse Generator / Module Standby and Software Reset
@@ -48,6 +48,8 @@ properties:
- renesas,r8a77990-cpg-mssr # R-Car E3
- renesas,r8a77995-cpg-mssr # R-Car D3
- renesas,r8a779a0-cpg-mssr # R-Car V3U
+ - renesas,r8a779f0-cpg-mssr # R-Car S4-8
+ - renesas,r8a779g0-cpg-mssr # R-Car V4H
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt b/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt
deleted file mode 100644
index 399e0da22348..000000000000
--- a/Documentation/devicetree/bindings/clock/renesas,h8300-div-clock.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-* Renesas H8/300 divider clock
-
-Required Properties:
-
- - compatible: Must be "renesas,h8300-div-clock"
-
- - clocks: Reference to the parent clocks ("extal1" and "extal2")
-
- - #clock-cells: Must be 1
-
- - reg: Base address and length of the divide rate selector
-
- - renesas,width: bit width of selector
-
-Example
--------
-
- cclk: cclk {
- compatible = "renesas,h8300-div-clock";
- clocks = <&xclk>;
- #clock-cells = <0>;
- reg = <0xfee01b 2>;
- renesas,width = <2>;
- };
diff --git a/Documentation/devicetree/bindings/clock/renesas,h8s2678-pll-clock.txt b/Documentation/devicetree/bindings/clock/renesas,h8s2678-pll-clock.txt
deleted file mode 100644
index 500cdadbceb7..000000000000
--- a/Documentation/devicetree/bindings/clock/renesas,h8s2678-pll-clock.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-Renesas H8S2678 PLL clock
-
-This device is Clock multiplyer
-
-Required Properties:
-
- - compatible: Must be "renesas,h8s2678-pll-clock"
-
- - clocks: Reference to the parent clocks
-
- - #clock-cells: Must be 0
-
- - reg: Two rate selector (Multiply / Divide) register address
-
-Example
--------
-
- pllclk: pllclk {
- compatible = "renesas,h8s2678-pll-clock";
- clocks = <&xclk>;
- #clock-cells = <0>;
- reg = <0xfee03b 2>, <0xfee045 2>;
- };
diff --git a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.yaml b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.yaml
index 25dbb0fac065..99686085f751 100644
--- a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas RZ/N1D (R9A06G032) System Controller
maintainers:
- - Gareth Williams <gareth.williams.jx@renesas.com>
+ - Fabrizio Castro <fabrizio.castro.jz@renesas.com>
- Geert Uytterhoeven <geert+renesas@glider.be>
properties:
@@ -39,6 +39,17 @@ properties:
'#power-domain-cells':
const: 0
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 1
+
+patternProperties:
+ "^dma-router@[a-f0-9]+$":
+ type: object
+ $ref: "../dma/renesas,rzn1-dmamux.yaml#"
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/clock/renesas,rcar-usb2-clock-sel.yaml b/Documentation/devicetree/bindings/clock/renesas,rcar-usb2-clock-sel.yaml
index 6eaabb4d82ec..c84f29f1810f 100644
--- a/Documentation/devicetree/bindings/clock/renesas,rcar-usb2-clock-sel.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,rcar-usb2-clock-sel.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/clock/renesas,rcar-usb2-clock-sel.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/clock/renesas,rcar-usb2-clock-sel.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas R-Car USB 2.0 clock selector
@@ -47,7 +47,6 @@ properties:
maxItems: 1
clocks:
- minItems: 4
maxItems: 4
clock-names:
@@ -64,7 +63,6 @@ properties:
maxItems: 1
resets:
- minItems: 2
maxItems: 2
reset-names:
diff --git a/Documentation/devicetree/bindings/clock/renesas,rzg2l-cpg.yaml b/Documentation/devicetree/bindings/clock/renesas,rzg2l-cpg.yaml
index 30b2e3d0d25d..fe2fba18ae84 100644
--- a/Documentation/devicetree/bindings/clock/renesas,rzg2l-cpg.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,rzg2l-cpg.yaml
@@ -1,17 +1,18 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/clock/renesas,rzg2l-cpg.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/clock/renesas,rzg2l-cpg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Renesas RZ/G2L Clock Pulse Generator / Module Standby Mode
+title: Renesas RZ/{G2L,V2L,V2M} Clock Pulse Generator / Module Standby Mode
maintainers:
- Geert Uytterhoeven <geert+renesas@glider.be>
description: |
- On Renesas RZ/G2L SoC, the CPG (Clock Pulse Generator) and Module
- Standby Mode share the same register block.
+ On Renesas RZ/{G2L,V2L} and similar SoCs, the CPG (Clock Pulse Generator) and
+ Module Standby Mode share the same register block. On RZ/V2M, the functionality
+ is similar, but it does not have Clock Monitor Registers.
They provide the following functionalities:
- The CPG block generates various core clocks,
@@ -22,7 +23,11 @@ description: |
properties:
compatible:
- const: renesas,r9a07g044-cpg # RZ/G2{L,LC}
+ enum:
+ - renesas,r9a07g043-cpg # RZ/G2UL{Type-1,Type-2} and RZ/Five
+ - renesas,r9a07g044-cpg # RZ/G2{L,LC}
+ - renesas,r9a07g054-cpg # RZ/V2L
+ - renesas,r9a09g011-cpg # RZ/V2M
reg:
maxItems: 1
@@ -40,9 +45,9 @@ properties:
description: |
- For CPG core clocks, the two clock specifier cells must be "CPG_CORE"
and a core clock reference, as defined in
- <dt-bindings/clock/r9a07g044-cpg.h>
+ <dt-bindings/clock/r9a0*-cpg.h>,
- For module clocks, the two clock specifier cells must be "CPG_MOD" and
- a module number, as defined in the <dt-bindings/clock/r9a07g044-cpg.h>.
+ a module number, as defined in <dt-bindings/clock/r9a0*-cpg.h>.
const: 2
'#power-domain-cells':
@@ -56,7 +61,7 @@ properties:
'#reset-cells':
description:
The single reset specifier cell must be the module number, as defined in
- the <dt-bindings/clock/r9a07g044-cpg.h>.
+ <dt-bindings/clock/r9a0*-cpg.h>.
const: 1
required:
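A consumer combines the CPG_CORE/CPG_MOD selector with an index from the SoC header, and uses the same module number for resets. A sketch, assuming the R9A07G044 SCIF0 macros from <dt-bindings/clock/r9a07g044-cpg.h> (node name and address are illustrative):

    serial@1004b800 {
        compatible = "renesas,scif-r9a07g044";
        reg = <0x1004b800 0x400>;
        clocks = <&cpg CPG_MOD R9A07G044_SCIF0_CLK_PCK>;  /* module clock */
        clock-names = "fck";
        power-domains = <&cpg>;
        resets = <&cpg R9A07G044_SCIF0_RST_SYSTEM_N>;     /* same module numbering */
    };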
diff --git a/Documentation/devicetree/bindings/clock/renesas,versaclock7.yaml b/Documentation/devicetree/bindings/clock/renesas,versaclock7.yaml
new file mode 100644
index 000000000000..b339f1f9f072
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/renesas,versaclock7.yaml
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/renesas,versaclock7.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas Versaclock7 Programmable Clock
+
+maintainers:
+ - Alex Helms <alexander.helms.jy@renesas.com>
+
+description: |
+ Renesas Versaclock7 is a family of configurable clock generator and
+ jitter attenuator ICs with fractional and integer dividers.
+
+properties:
+ '#clock-cells':
+ const: 1
+
+ compatible:
+ enum:
+ - renesas,rc21008a
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: External crystal or oscillator
+
+ clock-names:
+ items:
+ - const: xin
+
+required:
+ - '#clock-cells'
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ vc7_xin: clock {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <49152000>;
+ };
+
+ i2c@0 {
+ reg = <0x0 0x100>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ vc7: clock-controller@9 {
+ compatible = "renesas,rc21008a";
+ reg = <0x9>;
+ #clock-cells = <1>;
+ clocks = <&vc7_xin>;
+ clock-names = "xin";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,px30-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,px30-cru.txt
deleted file mode 100644
index 55e78cddec8c..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,px30-cru.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-* Rockchip PX30 Clock and Reset Unit
-
-The PX30 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: PMU for CRU should be "rockchip,px30-pmu-cru"
-- compatible: CRU should be "rockchip,px30-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- clocks: A list of phandle + clock-specifier pairs for the clocks listed
- in clock-names
-- clock-names: Should contain the following:
- - "xin24m" for both PMUCRU and CRU
- - "gpll" for CRU (sourced from PMUCRU)
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing, pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/px30-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "xin32k" - rtc clock - optional,
- - "i2sx_clkin" - external I2S clock - optional,
- - "gmac_clkin" - external GMAC clock - optional
-
-Example: Clock controller node:
-
- pmucru: clock-controller@ff2bc000 {
- compatible = "rockchip,px30-pmucru";
- reg = <0x0 0xff2bc000 0x0 0x1000>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
- cru: clock-controller@ff2b0000 {
- compatible = "rockchip,px30-cru";
- reg = <0x0 0xff2b0000 0x0 0x1000>;
- rockchip,grf = <&grf>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@ff030000 {
- compatible = "rockchip,px30-uart", "snps,dw-apb-uart";
- reg = <0x0 0xff030000 0x0 0x100>;
- interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&pmucru SCLK_UART0_PMU>, <&pmucru PCLK_UART0_PMU>;
- clock-names = "baudclk", "apb_pclk";
- reg-shift = <2>;
- reg-io-width = <4>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,px30-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,px30-cru.yaml
new file mode 100644
index 000000000000..0f0f64b6f8cb
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,px30-cru.yaml
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,px30-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip PX30 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The PX30 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/px30-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with the following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "xin32k" - rtc clock - optional
+ - "i2sx_clkin" - external I2S clock - optional
+ - "gmac_clkin" - external GMAC clock - optional
+
+properties:
+ compatible:
+ enum:
+ - rockchip,px30-cru
+ - rockchip,px30-pmucru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ minItems: 1
+ items:
+ - description: Clock for both PMUCRU and CRU
+ - description: Clock for CRU (sourced from PMUCRU)
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: xin24m
+ - const: gpll
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, PLL rates are not changeable, due to the missing PLL
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - "#clock-cells"
+ - "#reset-cells"
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,px30-cru
+
+ then:
+ properties:
+ clocks:
+ minItems: 2
+
+ clock-names:
+ minItems: 2
+
+ else:
+ properties:
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ maxItems: 1
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/px30-cru.h>
+
+ pmucru: clock-controller@ff2bc000 {
+ compatible = "rockchip,px30-pmucru";
+ reg = <0xff2bc000 0x1000>;
+ clocks = <&xin24m>;
+ clock-names = "xin24m";
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+ cru: clock-controller@ff2b0000 {
+ compatible = "rockchip,px30-cru";
+ reg = <0xff2b0000 0x1000>;
+ clocks = <&xin24m>, <&pmucru PLL_GPLL>;
+ clock-names = "xin24m", "gpll";
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
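The external inputs named in the description are ordinary fixed-clock providers whose clock-output-names carry the expected strings. A sketch for the required crystal, assuming a 24 MHz part:

    xin24m: xin24m {
        compatible = "fixed-clock";
        clock-frequency = <24000000>;
        clock-output-names = "xin24m";
        #clock-cells = <0>;
    };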
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt
deleted file mode 100644
index 20df350b9ef3..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-* Rockchip RK3036 Clock and Reset Unit
-
-The RK3036 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: should be "rockchip,rk3036-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3036-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "ext_i2s" - external I2S clock - optional,
- - "rmii_clkin" - external EMAC clock - optional
-
-Example: Clock controller node:
-
- cru: cru@20000000 {
- compatible = "rockchip,rk3036-cru";
- reg = <0x20000000 0x1000>;
- rockchip,grf = <&grf>;
-
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@20060000 {
- compatible = "snps,dw-apb-uart";
- reg = <0x20060000 0x100>;
- interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
- reg-shift = <2>;
- reg-io-width = <4>;
- clocks = <&cru SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.yaml
new file mode 100644
index 000000000000..ba5b45464315
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3036-cru.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3036-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3036 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3036 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3036-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with the following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "ext_i2s" - external I2S clock - optional
+ - "rmii_clkin" - external EMAC clock - optional
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3036-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@20000000 {
+ compatible = "rockchip,rk3036-cru";
+ reg = <0x20000000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
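
For reference, the consumer example from the old text binding still applies: a
client selects a clock by the identifiers from dt-bindings/clock/rk3036-cru.h,
for instance:

    uart0: serial@20060000 {
        compatible = "snps,dw-apb-uart";
        reg = <0x20060000 0x100>;
        interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
        reg-shift = <2>;
        reg-io-width = <4>;
        clocks = <&cru SCLK_UART0>;
    };
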
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.txt
deleted file mode 100644
index 6f8744fd301b..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-* Rockchip RK3126/RK3128 Clock and Reset Unit
-
-The RK3126/RK3128 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: should be "rockchip,rk3126-cru" or "rockchip,rk3128-cru"
- "rockchip,rk3126-cru" - controller compatible with RK3126 SoC.
- "rockchip,rk3128-cru" - controller compatible with RK3128 SoC.
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3128-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "ext_i2s" - external I2S clock - optional,
- - "gmac_clkin" - external GMAC clock - optional
-
-Example: Clock controller node:
-
- cru: cru@20000000 {
- compatible = "rockchip,rk3128-cru";
- reg = <0x20000000 0x1000>;
- rockchip,grf = <&grf>;
-
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart2: serial@20068000 {
- compatible = "rockchip,serial";
- reg = <0x20068000 0x100>;
- interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>;
- clock-frequency = <24000000>;
- clocks = <&cru SCLK_UART2>, <&cru PCLK_UART2>;
- clock-names = "sclk_uart", "pclk_uart";
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.yaml
new file mode 100644
index 000000000000..b3d9c8eca989
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3128-cru.yaml
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3128-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3126/RK3128 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3126/RK3128 clock controller generates and supplies clock to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3128-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3126-cru
+ - rockchip,rk3128-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 3
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: xin24m
+ - enum:
+ - ext_i2s
+ - gmac_clkin
+ - enum:
+ - ext_i2s
+ - gmac_clkin
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@20000000 {
+ compatible = "rockchip,rk3128-cru";
+ reg = <0x20000000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
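
On RK3126/RK3128 a UART consumes both its baud clock and its bus clock from
the CRU, as in the example from the old text binding (IDs from
dt-bindings/clock/rk3128-cru.h):

    uart2: serial@20068000 {
        compatible = "rockchip,serial";
        reg = <0x20068000 0x100>;
        interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>;
        clock-frequency = <24000000>;
        clocks = <&cru SCLK_UART2>, <&cru PCLK_UART2>;
        clock-names = "sclk_uart", "pclk_uart";
    };
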
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt
deleted file mode 100644
index 7f368530a2e4..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-* Rockchip RK3188/RK3066 Clock and Reset Unit
-
-The RK3188/RK3066 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: should be "rockchip,rk3188-cru", "rockchip,rk3188a-cru" or
- "rockchip,rk3066a-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3188-cru.h and
-dt-bindings/clock/rk3066-cru.h headers and can be used in device tree sources.
-Similar macros exist for the reset sources in these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "xin32k" - rtc clock - optional,
- - "xin27m" - 27mhz crystal input on rk3066 - optional,
- - "ext_hsadc" - external HSADC clock - optional,
- - "ext_cif0" - external camera clock - optional,
- - "ext_rmii" - external RMII clock - optional,
- - "ext_jtag" - externalJTAG clock - optional
-
-Example: Clock controller node:
-
- cru: cru@20000000 {
- compatible = "rockchip,rk3188-cru";
- reg = <0x20000000 0x1000>;
- rockchip,grf = <&grf>;
-
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@10124000 {
- compatible = "snps,dw-apb-uart";
- reg = <0x10124000 0x400>;
- interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
- reg-shift = <2>;
- reg-io-width = <1>;
- clocks = <&cru SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.yaml
new file mode 100644
index 000000000000..ddd7e46af0f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3188-cru.yaml
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3188-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3188/RK3066 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3188/RK3066 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3188-cru.h and
+ dt-bindings/clock/rk3066-cru.h headers and can be used in device tree sources.
+ Similar macros exist for the reset sources in these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "xin32k" - RTC clock - optional
+ - "xin27m" - 27mhz crystal input on RK3066 - optional
+ - "ext_hsadc" - external HSADC clock - optional
+ - "ext_cif0" - external camera clock - optional
+ - "ext_rmii" - external RMII clock - optional
+ - "ext_jtag" - external JTAG clock - optional
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3066a-cru
+ - rockchip,rk3188-cru
+ - rockchip,rk3188a-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@20000000 {
+ compatible = "rockchip,rk3188-cru";
+ reg = <0x20000000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
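
The required "xin24m" input is expected to come from a standard fixed-rate
clock node; a minimal provider sketch (the 24 MHz rate is implied by the
clock name):

    xin24m: xin24m {
        compatible = "fixed-clock";
        clock-frequency = <24000000>;
        clock-output-names = "xin24m";
        #clock-cells = <0>;
    };
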
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt
deleted file mode 100644
index f323048127eb..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-* Rockchip RK3228 Clock and Reset Unit
-
-The RK3228 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: should be "rockchip,rk3228-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3228-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "ext_i2s" - external I2S clock - optional,
- - "ext_gmac" - external GMAC clock - optional
- - "ext_hsadc" - external HSADC clock - optional
- - "phy_50m_out" - output clock of the pll in the mac phy
-
-Example: Clock controller node:
-
- cru: cru@20000000 {
- compatible = "rockchip,rk3228-cru";
- reg = <0x20000000 0x1000>;
- rockchip,grf = <&grf>;
-
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@10110000 {
- compatible = "snps,dw-apb-uart";
- reg = <0x10110000 0x100>;
- interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
- reg-shift = <2>;
- reg-io-width = <4>;
- clocks = <&cru SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.yaml
new file mode 100644
index 000000000000..1050fff72ade
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3228-cru.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3228-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3228 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3228 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3228-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "ext_i2s" - external I2S clock - optional
+ - "ext_gmac" - external GMAC clock - optional
+ - "ext_hsadc" - external HSADC clock - optional
+ - "phy_50m_out" - output clock of the pll in the mac phy
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3228-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@20000000 {
+ compatible = "rockchip,rk3228-cru";
+ reg = <0x20000000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
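
Reset lines are referenced the same way as clocks, through the macros shipped
in the same dt-bindings header; a sketch based on the old text example (the
SRST_UART0 reset identifier is illustrative):

    uart0: serial@10110000 {
        compatible = "snps,dw-apb-uart";
        reg = <0x10110000 0x100>;
        interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
        reg-shift = <2>;
        reg-io-width = <4>;
        clocks = <&cru SCLK_UART0>;
        resets = <&cru SRST_UART0>; /* illustrative reset ID */
    };
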
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt
deleted file mode 100644
index bf3a9ec19241..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-* Rockchip RK3288 Clock and Reset Unit
-
-The RK3288 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-A revision of this SoC is available: rk3288w. The clock tree is a bit
-different so another dt-compatible is available. Noticed that it is only
-setting the difference but there is no automatic revision detection. This
-should be performed by bootloaders.
-
-Required Properties:
-
-- compatible: should be "rockchip,rk3288-cru" or "rockchip,rk3288w-cru" in
- case of this revision of Rockchip rk3288.
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3288-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "xin32k" - rtc clock - optional,
- - "ext_i2s" - external I2S clock - optional,
- - "ext_hsadc" - external HSADC clock - optional,
- - "ext_edp_24m" - external display port clock - optional,
- - "ext_vip" - external VIP clock - optional,
- - "ext_isp" - external ISP clock - optional,
- - "ext_jtag" - external JTAG clock - optional
-
-Example: Clock controller node:
-
- cru: cru@20000000 {
- compatible = "rockchip,rk3188-cru";
- reg = <0x20000000 0x1000>;
- rockchip,grf = <&grf>;
-
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@10124000 {
- compatible = "snps,dw-apb-uart";
- reg = <0x10124000 0x400>;
- interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
- reg-shift = <2>;
- reg-io-width = <1>;
- clocks = <&cru SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.yaml
new file mode 100644
index 000000000000..6655e97d52e4
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3288-cru.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3288-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3288 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3288 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+
+ A revision of this SoC is available: rk3288w. The clock tree is a bit
+ different, so a separate dt-compatible is available. Note that it only
+ encodes the difference; there is no automatic revision detection, so
+ this should be performed by boot loaders.
+
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3288-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with following
+ clock-output-names:
+ - "xin24m" - crystal input - required,
+ - "xin32k" - rtc clock - optional,
+ - "ext_i2s" - external I2S clock - optional,
+ - "ext_hsadc" - external HSADC clock - optional,
+ - "ext_edp_24m" - external display port clock - optional,
+ - "ext_vip" - external VIP clock - optional,
+ - "ext_isp" - external ISP clock - optional,
+ - "ext_jtag" - external JTAG clock - optional
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3288-cru
+ - rockchip,rk3288w-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@ff760000 {
+ compatible = "rockchip,rk3288-cru";
+ reg = <0xff760000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
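
A consumer sketch for RK3288, assuming the usual dw-apb-uart pairing of baud
and bus clocks (register and interrupt values are illustrative):

    uart0: serial@ff180000 {
        compatible = "snps,dw-apb-uart";
        reg = <0xff180000 0x100>;
        interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
        reg-shift = <2>;
        reg-io-width = <4>;
        clocks = <&cru SCLK_UART0>, <&cru PCLK_UART0>;
        clock-names = "baudclk", "apb_pclk";
    };
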
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.txt
deleted file mode 100644
index 9b151c5b0c90..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-* Rockchip RK3308 Clock and Reset Unit
-
-The RK3308 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: CRU should be "rockchip,rk3308-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing, pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3308-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "xin32k" - rtc clock - optional,
- - "mclk_i2s0_8ch_in", "mclk_i2s1_8ch_in", "mclk_i2s2_8ch_in",
- "mclk_i2s3_8ch_in", "mclk_i2s0_2ch_in",
- "mclk_i2s1_2ch_in" - external I2S or SPDIF clock - optional,
- - "mac_clkin" - external MAC clock - optional
-
-Example: Clock controller node:
-
- cru: clock-controller@ff500000 {
- compatible = "rockchip,rk3308-cru";
- reg = <0x0 0xff500000 0x0 0x1000>;
- rockchip,grf = <&grf>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@ff0a0000 {
- compatible = "rockchip,rk3308-uart", "snps,dw-apb-uart";
- reg = <0x0 0xff0a0000 0x0 0x100>;
- interrupts = <GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cru SCLK_UART0>, <&cru PCLK_UART0>;
- clock-names = "baudclk", "apb_pclk";
- reg-shift = <2>;
- reg-io-width = <4>;
- status = "disabled";
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.yaml
new file mode 100644
index 000000000000..fec37f5b80f6
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3308-cru.yaml
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3308-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3308 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3308 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3308-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "xin32k" - rtc clock - optional
+ - "mclk_i2s0_8ch_in", "mclk_i2s1_8ch_in",
+ "mclk_i2s2_8ch_in", "mclk_i2s3_8ch_in",
+ "mclk_i2s0_2ch_in", "mclk_i2s1_2ch_in" - external I2S or
+ SPDIF clock - optional
+ - "mac_clkin" - external MAC clock - optional
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3308-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@ff500000 {
+ compatible = "rockchip,rk3308-cru";
+ reg = <0xff500000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
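
The UART example from the old RK3308 text binding, with the 64-bit address
cells dropped to match the controller example above:

    uart0: serial@ff0a0000 {
        compatible = "rockchip,rk3308-uart", "snps,dw-apb-uart";
        reg = <0xff0a0000 0x100>;
        interrupts = <GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&cru SCLK_UART0>, <&cru PCLK_UART0>;
        clock-names = "baudclk", "apb_pclk";
        reg-shift = <2>;
        reg-io-width = <4>;
        status = "disabled";
    };
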
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.txt
deleted file mode 100644
index 7c8bbcfed8d2..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-* Rockchip RK3368 Clock and Reset Unit
-
-The RK3368 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: should be "rockchip,rk3368-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing, pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rk3368-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "xin32k" - rtc clock - optional,
- - "ext_i2s" - external I2S clock - optional,
- - "ext_gmac" - external GMAC clock - optional
- - "ext_hsadc" - external HSADC clock - optional,
- - "ext_isp" - external ISP clock - optional,
- - "ext_jtag" - external JTAG clock - optional
- - "ext_vip" - external VIP clock - optional,
- - "usbotg_out" - output clock of the pll in the otg phy
-
-Example: Clock controller node:
-
- cru: clock-controller@ff760000 {
- compatible = "rockchip,rk3368-cru";
- reg = <0x0 0xff760000 0x0 0x1000>;
- rockchip,grf = <&grf>;
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@10124000 {
- compatible = "snps,dw-apb-uart";
- reg = <0x10124000 0x400>;
- interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
- reg-shift = <2>;
- reg-io-width = <1>;
- clocks = <&cru SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.yaml
new file mode 100644
index 000000000000..90af242b41c1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3368-cru.yaml
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3368-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RK3368 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3368 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rk3368-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "xin32k" - rtc clock - optional
+ - "ext_i2s" - external I2S clock - optional
+ - "ext_gmac" - external GMAC clock - optional
+ - "ext_hsadc" - external HSADC clock - optional
+ - "ext_isp" - external ISP clock - optional
+ - "ext_jtag" - external JTAG clock - optional
+ - "ext_vip" - external VIP clock - optional
+ - "usbotg_out" - output clock of the pll in the otg phy
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3368-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@ff760000 {
+ compatible = "rockchip,rk3368-cru";
+ reg = <0xff760000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
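
Consumers select RK3368 clocks by the identifiers from
dt-bindings/clock/rk3368-cru.h, as in the old text example (node values are
illustrative):

    uart0: serial@10124000 {
        compatible = "snps,dw-apb-uart";
        reg = <0x10124000 0x400>;
        interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
        reg-shift = <2>;
        reg-io-width = <1>;
        clocks = <&cru SCLK_UART0>;
    };
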
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3399-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3399-cru.yaml
index 72b286a1beba..0b758e015ee3 100644
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3399-cru.yaml
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3399-cru.yaml
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0-only
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
%YAML 1.2
---
$id: http://devicetree.org/schemas/clock/rockchip,rk3399-cru.yaml#
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Rockchip RK3399 Clock and Reset Unit
maintainers:
- - Xing Zheng <zhengxing@rock-chips.com>
+ - Elaine Zhang <zhangqing@rock-chips.com>
- Heiko Stuebner <heiko@sntech.de>
description: |
@@ -22,11 +22,11 @@ description: |
There are several clocks that are generated outside the SoC. It is expected
that they are defined using standard clock bindings with following
clock-output-names:
- - "xin24m" - crystal input - required,
- - "xin32k" - rtc clock - optional,
- - "clkin_gmac" - external GMAC clock - optional,
- - "clkin_i2s" - external I2S clock - optional,
- - "pclkin_cif" - external ISP clock - optional,
+ - "xin24m" - crystal input - required,
+ - "xin32k" - rtc clock - optional,
+ - "clkin_gmac" - external GMAC clock - optional,
+ - "clkin_i2s" - external I2S clock - optional,
+ - "pclkin_cif" - external ISP clock - optional,
- "clk_usbphy0_480m" - output clock of the pll in the usbphy0
- "clk_usbphy1_480m" - output clock of the pll in the usbphy1
@@ -46,24 +46,15 @@ properties:
const: 1
clocks:
- minItems: 1
-
- assigned-clocks:
- minItems: 1
- maxItems: 64
-
- assigned-clock-parents:
- minItems: 1
- maxItems: 64
+ maxItems: 1
- assigned-clock-rates:
- minItems: 1
- maxItems: 64
+ clock-names:
+ const: xin24m
rockchip,grf:
$ref: /schemas/types.yaml#/definitions/phandle
- description: >
- phandle to the syscon managing the "general register files". It is used
+ description:
+ Phandle to the syscon managing the "general register files". It is used
for GRF muxes; if missing, any muxes present in the GRF will not be
available.
@@ -77,7 +68,7 @@ additionalProperties: false
examples:
- |
- pmucru: pmu-clock-controller@ff750000 {
+ pmucru: clock-controller@ff750000 {
compatible = "rockchip,rk3399-pmucru";
reg = <0xff750000 0x1000>;
#clock-cells = <1>;
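
With the tightened schema, the RK3399 CRU itself takes at most one input
clock, named "xin24m"; a sketch of the main CRU node under that constraint:

    cru: clock-controller@ff760000 {
        compatible = "rockchip,rk3399-cru";
        reg = <0xff760000 0x1000>;
        clocks = <&xin24m>;
        clock-names = "xin24m";
        #clock-cells = <1>;
        #reset-cells = <1>;
    };
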
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3568-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3568-cru.yaml
index b2c26097827f..f809c289445e 100644
--- a/Documentation/devicetree/bindings/clock/rockchip,rk3568-cru.yaml
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3568-cru.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/rockchip,rk3568-cru.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ROCKCHIP rk3568 Family Clock Control Module Binding
+title: ROCKCHIP rk3568 Family Clock Control Module
maintainers:
- Elaine Zhang <zhangqing@rock-chips.com>
@@ -34,6 +34,19 @@ properties:
"#reset-cells":
const: 1
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
required:
- compatible
- reg
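
A node using the properties added here might look like the following sketch
(the base address is illustrative):

    cru: clock-controller@fdd20000 {
        compatible = "rockchip,rk3568-cru";
        reg = <0xfdd20000 0x1000>;
        clocks = <&xin24m>;
        clock-names = "xin24m";
        rockchip,grf = <&grf>;
        #clock-cells = <1>;
        #reset-cells = <1>;
    };
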
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rk3588-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rk3588-cru.yaml
new file mode 100644
index 000000000000..74cd3f3f229a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rk3588-cru.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rk3588-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip rk3588 Family Clock and Reset Control Module
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RK3588 clock controller generates the clocks and also implements a reset
+ controller for SoC peripherals. For example, it provides SCLK_UART2 and
+ PCLK_UART2, as well as SRST_P_UART2 and SRST_S_UART2 for the second UART
+ module.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clock and reset IDs
+ are defined as preprocessor macros in dt-binding headers.
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3588-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: xin24m
+ - const: xin32k
+
+ assigned-clocks: true
+
+ assigned-clock-rates: true
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: >
+ Phandle to the syscon managing the "general register files". It is used
+ for GRF muxes; if missing, any muxes present in the GRF will not be
+ available.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@fd7c0000 {
+ compatible = "rockchip,rk3588-cru";
+ reg = <0xfd7c0000 0x5c000>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
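
Spelling out the UART2 case from the description as a consumer, using the
clock and reset identifiers named there (address and interrupt number are
illustrative):

    uart2: serial@feb50000 {
        compatible = "snps,dw-apb-uart";
        reg = <0xfeb50000 0x100>;
        interrupts = <GIC_SPI 333 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&cru SCLK_UART2>, <&cru PCLK_UART2>;
        clock-names = "baudclk", "apb_pclk";
        resets = <&cru SRST_S_UART2>, <&cru SRST_P_UART2>;
        reg-shift = <2>;
        reg-io-width = <4>;
    };
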
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt
deleted file mode 100644
index 161326a4f9c1..000000000000
--- a/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-* Rockchip RV1108 Clock and Reset Unit
-
-The RV1108 clock controller generates and supplies clock to various
-controllers within the SoC and also implements a reset controller for SoC
-peripherals.
-
-Required Properties:
-
-- compatible: should be "rockchip,rv1108-cru"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-- #reset-cells: should be 1.
-
-Optional Properties:
-
-- rockchip,grf: phandle to the syscon managing the "general register files"
- If missing pll rates are not changeable, due to the missing pll lock status.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. All available clocks are defined as
-preprocessor macros in the dt-bindings/clock/rv1108-cru.h headers and can be
-used in device tree sources. Similar macros exist for the reset sources in
-these files.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xin24m" - crystal input - required,
- - "ext_vip" - external VIP clock - optional
- - "ext_i2s" - external I2S clock - optional
- - "ext_gmac" - external GMAC clock - optional
- - "hdmiphy" - external clock input derived from HDMI PHY - optional
- - "usbphy" - external clock input derived from USB PHY - optional
-
-Example: Clock controller node:
-
- cru: cru@20200000 {
- compatible = "rockchip,rv1108-cru";
- reg = <0x20200000 0x1000>;
- rockchip,grf = <&grf>;
-
- #clock-cells = <1>;
- #reset-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller:
-
- uart0: serial@10230000 {
- compatible = "rockchip,rv1108-uart", "snps,dw-apb-uart";
- reg = <0x10230000 0x100>;
- interrupts = <GIC_SPI 44 IRQ_TYPE_LEVEL_HIGH>;
- reg-shift = <2>;
- reg-io-width = <4>;
- clocks = <&cru SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.yaml
new file mode 100644
index 000000000000..4611d920b8df
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.yaml
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rv1108-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RV1108 Clock and Reset Unit (CRU)
+
+maintainers:
+ - Elaine Zhang <zhangqing@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description: |
+ The RV1108 clock controller generates and supplies clocks to various
+ controllers within the SoC and also implements a reset controller for SoC
+ peripherals.
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All available clocks are defined as
+ preprocessor macros in the dt-bindings/clock/rv1108-cru.h headers and can be
+ used in device tree sources. Similar macros exist for the reset sources in
+ these files.
+ There are several clocks that are generated outside the SoC. It is expected
+ that they are defined using standard clock bindings with following
+ clock-output-names:
+ - "xin24m" - crystal input - required
+ - "ext_vip" - external VIP clock - optional
+ - "ext_i2s" - external I2S clock - optional
+ - "ext_gmac" - external GMAC clock - optional
+ - "hdmiphy" - external clock input derived from HDMI PHY - optional
+ - "usbphy" - external clock input derived from USB PHY - optional
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rv1108-cru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@20200000 {
+ compatible = "rockchip,rv1108-cru";
+ reg = <0x20200000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
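
The consumer example from the old RV1108 text binding:

    uart0: serial@10230000 {
        compatible = "rockchip,rv1108-uart", "snps,dw-apb-uart";
        reg = <0x10230000 0x100>;
        interrupts = <GIC_SPI 44 IRQ_TYPE_LEVEL_HIGH>;
        reg-shift = <2>;
        reg-io-width = <4>;
        clocks = <&cru SCLK_UART0>;
    };
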
diff --git a/Documentation/devicetree/bindings/clock/rockchip,rv1126-cru.yaml b/Documentation/devicetree/bindings/clock/rockchip,rv1126-cru.yaml
new file mode 100644
index 000000000000..0998f8b922bd
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/rockchip,rv1126-cru.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/rockchip,rv1126-cru.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip RV1126 Clock and Reset Unit
+
+maintainers:
+ - Jagan Teki <jagan@edgeble.ai>
+ - Finley Xiao <finley.xiao@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+description:
+ The RV1126 clock controller generates the clocks and also implements a
+ reset controller for SoC peripherals.
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rv1126-cru
+ - rockchip,rv1126-pmucru
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: xin24m
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "general register files" (GRF).
+ If missing, pll rates are not changeable due to the missing pll
+ lock status.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ cru: clock-controller@ff490000 {
+ compatible = "rockchip,rv1126-cru";
+ reg = <0xff490000 0x1000>;
+ rockchip,grf = <&grf>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
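
The same schema also covers the PMU variant; a sketch with an assumed base
address:

    pmucru: clock-controller@ff480000 {
        compatible = "rockchip,rv1126-pmucru";
        reg = <0xff480000 0x1000>;
        #clock-cells = <1>;
        #reset-cells = <1>;
    };
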
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos-audss-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos-audss-clock.yaml
index f14f1d39da36..d819dfaafff9 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynos-audss-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos-audss-clock.yaml
@@ -8,7 +8,7 @@ title: Samsung Exynos SoC Audio SubSystem clock controller
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
- Sylwester Nawrocki <s.nawrocki@samsung.com>
- Tomasz Figa <tomasz.figa@gmail.com>
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos-clock.yaml
index 4e8062860986..0589a63e273a 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynos-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos-clock.yaml
@@ -8,7 +8,7 @@ title: Samsung Exynos SoC clock controller
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
- Sylwester Nawrocki <s.nawrocki@samsung.com>
- Tomasz Figa <tomasz.figa@gmail.com>
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos-ext-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos-ext-clock.yaml
index 64d027dbe3b2..c98eff64f2b5 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynos-ext-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos-ext-clock.yaml
@@ -8,7 +8,7 @@ title: Samsung SoC external/osc/XXTI/XusbXTI clock
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
- Sylwester Nawrocki <s.nawrocki@samsung.com>
- Tomasz Figa <tomasz.figa@gmail.com>
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos4412-isp-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos4412-isp-clock.yaml
index 1ed64add4355..bee13436d1ea 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynos4412-isp-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos4412-isp-clock.yaml
@@ -8,7 +8,7 @@ title: Samsung Exynos4412 SoC ISP clock controller
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
- Sylwester Nawrocki <s.nawrocki@samsung.com>
- Tomasz Figa <tomasz.figa@gmail.com>
@@ -61,4 +61,3 @@ examples:
clocks = <&clock CLK_ACLK200>, <&clock CLK_ACLK400_MCUISP>;
clock-names = "aclk200", "aclk400_mcuisp";
};
-
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos5260-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos5260-clock.yaml
new file mode 100644
index 000000000000..b05f83533e3d
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos5260-clock.yaml
@@ -0,0 +1,382 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynos5260-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos5260 SoC clock controller
+
+maintainers:
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Expected external clocks, defined in DTS as fixed-rate clocks with a matching
+ name:
+ - "fin_pll" - PLL input clock from XXTI
+ - "xrtcxti" - input clock from XRTCXTI
+ - "ioclk_pcm_extclk" - pcm external operation clock
+ - "ioclk_spdif_extclk" - spdif external operation clock
+ - "ioclk_i2s_cdclk" - i2s0 codec clock
+
+ PHY clocks:
+ There are several clocks which are generated by specific PHYs. These clocks
+ are fed into the clock controller and then routed to the hardware blocks.
+ These clocks are defined as fixed clocks in the driver with the following
+ names:
+ - "phyclk_dptx_phy_ch3_txd_clk" - dp phy clock for channel 3
+ - "phyclk_dptx_phy_ch2_txd_clk" - dp phy clock for channel 2
+ - "phyclk_dptx_phy_ch1_txd_clk" - dp phy clock for channel 1
+ - "phyclk_dptx_phy_ch0_txd_clk" - dp phy clock for channel 0
+ - "phyclk_hdmi_phy_tmds_clko" - hdmi phy tmds clock
+ - "phyclk_hdmi_phy_pixel_clko" - hdmi phy pixel clock
+ - "phyclk_hdmi_link_o_tmds_clkhi" - hdmi phy for hdmi link
+ - "phyclk_dptx_phy_o_ref_clk_24m" - dp phy reference clock
+ - "phyclk_dptx_phy_clk_div2"
+ - "phyclk_mipi_dphy_4l_m_rxclkesc0"
+ - "phyclk_usbhost20_phy_phyclock" - usb 2.0 phy clock
+ - "phyclk_usbhost20_phy_freeclk"
+ - "phyclk_usbhost20_phy_clk48mohci"
+ - "phyclk_usbdrd30_udrd30_pipe_pclk"
+ - "phyclk_usbdrd30_udrd30_phyclock" - usb 3.0 phy clock
+
+ All available clocks are defined as preprocessor macros in
+ include/dt-bindings/clock/exynos5260-clk.h header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos5260-clock-top
+ - samsung,exynos5260-clock-peri
+ - samsung,exynos5260-clock-egl
+ - samsung,exynos5260-clock-kfc
+ - samsung,exynos5260-clock-g2d
+ - samsung,exynos5260-clock-mif
+ - samsung,exynos5260-clock-mfc
+ - samsung,exynos5260-clock-g3d
+ - samsung,exynos5260-clock-fsys
+ - samsung,exynos5260-clock-aud
+ - samsung,exynos5260-clock-isp
+ - samsung,exynos5260-clock-gscl
+ - samsung,exynos5260-clock-disp
+
+ clocks:
+ minItems: 1
+ maxItems: 19
+
+ clock-names:
+ minItems: 1
+ maxItems: 19
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#clock-cells"
+ - reg
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-top
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_mem_pll
+ - const: dout_bus_pll
+ - const: dout_media_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-peri
+ then:
+ properties:
+ clocks:
+ minItems: 13
+ maxItems: 13
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: ioclk_pcm_extclk
+ - const: ioclk_i2s_cdclk
+ - const: ioclk_spdif_extclk
+ - const: phyclk_hdmi_phy_ref_cko
+ - const: dout_aclk_peri_66
+ - const: dout_sclk_peri_uart0
+ - const: dout_sclk_peri_uart1
+ - const: dout_sclk_peri_uart2
+ - const: dout_sclk_peri_spi0_b
+ - const: dout_sclk_peri_spi1_b
+ - const: dout_sclk_peri_spi2_b
+ - const: dout_aclk_peri_aud
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-egl
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_bus_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-kfc
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_media_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-g2d
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_g2d_333
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-mif
+ then:
+ properties:
+ clocks:
+ minItems: 1
+ maxItems: 1
+ clock-names:
+ items:
+ - const: fin_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-mfc
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_mfc_333
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-g3d
+ then:
+ properties:
+ clocks:
+ minItems: 1
+ maxItems: 1
+ clock-names:
+ items:
+ - const: fin_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-fsys
+ then:
+ properties:
+ clocks:
+ minItems: 7
+ maxItems: 7
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: phyclk_usbhost20_phy_phyclock
+ - const: phyclk_usbhost20_phy_freeclk
+ - const: phyclk_usbhost20_phy_clk48mohci
+ - const: phyclk_usbdrd30_udrd30_pipe_pclk
+ - const: phyclk_usbdrd30_udrd30_phyclock
+ - const: dout_aclk_fsys_200
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-aud
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: fout_aud_pll
+ - const: ioclk_i2s_cdclk
+ - const: ioclk_pcm_extclk
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-isp
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_isp1_266
+ - const: dout_aclk_isp1_400
+ - const: mout_aclk_isp1_266
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-gscl
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_gscl_400
+ - const: dout_aclk_gscl_333
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5260-clock-disp
+ then:
+ properties:
+ clocks:
+ minItems: 19
+ maxItems: 19
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: phyclk_dptx_phy_ch3_txd_clk
+ - const: phyclk_dptx_phy_ch2_txd_clk
+ - const: phyclk_dptx_phy_ch1_txd_clk
+ - const: phyclk_dptx_phy_ch0_txd_clk
+ - const: phyclk_hdmi_phy_tmds_clko
+ - const: phyclk_hdmi_phy_ref_clko
+ - const: phyclk_hdmi_phy_pixel_clko
+ - const: phyclk_hdmi_link_o_tmds_clkhi
+ - const: phyclk_mipi_dphy_4l_m_txbyte_clkhs
+ - const: phyclk_dptx_phy_o_ref_clk_24m
+ - const: phyclk_dptx_phy_clk_div2
+ - const: phyclk_mipi_dphy_4l_m_rxclkesc0
+ - const: phyclk_hdmi_phy_ref_cko
+ - const: ioclk_spdif_extclk
+ - const: dout_aclk_peri_aud
+ - const: dout_aclk_disp_222
+ - const: dout_sclk_disp_pixel
+ - const: dout_aclk_disp_333
+ required:
+ - clock-names
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5260-clk.h>
+
+ fin_pll: clock {
+ compatible = "fixed-clock";
+ clock-output-names = "fin_pll";
+ #clock-cells = <0>;
+ clock-frequency = <24000000>;
+ };
+
+ clock-controller@10010000 {
+ compatible = "samsung,exynos5260-clock-top";
+ reg = <0x10010000 0x10000>;
+ #clock-cells = <1>;
+ clocks = <&fin_pll>,
+ <&clock_mif MIF_DOUT_MEM_PLL>,
+ <&clock_mif MIF_DOUT_BUS_PLL>,
+ <&clock_mif MIF_DOUT_MEDIA_PLL>;
+ clock-names = "fin_pll",
+ "dout_mem_pll",
+ "dout_bus_pll",
+ "dout_media_pll";
+ };
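
Per the if/then rules above, the MIF controller takes only "fin_pll"; a
minimal sketch (the base address is illustrative):

    clock_mif: clock-controller@10ce0000 {
        compatible = "samsung,exynos5260-clock-mif";
        reg = <0x10ce0000 0x10000>;
        #clock-cells = <1>;
        clocks = <&fin_pll>;
        clock-names = "fin_pll";
    };
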
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos5410-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos5410-clock.yaml
new file mode 100644
index 000000000000..b737c9d35a1c
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos5410-clock.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynos5410-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos5410 SoC clock controller
+
+maintainers:
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Expected external clocks, defined in DTS as fixed-rate clocks with a matching
+ name:
+ - "fin_pll" - PLL input clock from XXTI
+
+ All available clocks are defined as preprocessor macros in
+ include/dt-bindings/clock/exynos5410.h header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos5410-clock
+
+ clocks:
+ description:
+ Should contain an entry specifying the root clock from the external
+ oscillator supplied through the XXTI or XusbXTI pin. This clock should be
+ defined using standard clock bindings with "fin_pll" clock-output-name.
+ That clock is passed internally to the 9 PLLs.
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#clock-cells"
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5410.h>
+
+ fin_pll: osc-clock {
+ compatible = "fixed-clock";
+ clock-frequency = <24000000>;
+ clock-output-names = "fin_pll";
+ #clock-cells = <0>;
+ };
+
+ clock-controller@10010000 {
+ compatible = "samsung,exynos5410-clock";
+ reg = <0x10010000 0x30000>;
+ #clock-cells = <1>;
+ clocks = <&fin_pll>;
+ };
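
Peripherals then consume clocks by the identifiers from the exynos5410.h
header; a sketch of a UART consumer (the CLK_* names and node values are
illustrative):

    serial@12c00000 {
        compatible = "samsung,exynos4210-uart";
        reg = <0x12c00000 0x100>;
        interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&clock CLK_UART0>, <&clock CLK_SCLK_UART0>;
        clock-names = "uart", "clk_uart_baud0";
    };
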
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos5433-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos5433-clock.yaml
new file mode 100644
index 000000000000..3f9326e09f79
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos5433-clock.yaml
@@ -0,0 +1,524 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynos5433-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos5433 SoC clock controller
+
+maintainers:
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Expected external clocks, defined in DTS as fixed-rate clocks with a matching
+ name:
+ - "oscclk" - PLL input clock from XXTI
+
+ All available clocks are defined as preprocessor macros in
+ include/dt-bindings/clock/exynos5433.h header.
+
+properties:
+ compatible:
+ enum:
+ # CMU_TOP which generates clocks for
+ # IMEM/FSYS/G3D/GSCL/HEVC/MSCL/G2D/MFC/PERIC/PERIS domains and bus
+ # clocks
+ - samsung,exynos5433-cmu-top
+ # CMU_CPIF which generates clocks for LLI (Low Latency Interface) IP
+ - samsung,exynos5433-cmu-cpif
+ # CMU_MIF which generates clocks for DRAM Memory Controller domain
+ - samsung,exynos5433-cmu-mif
+ # CMU_PERIC which generates clocks for
+ # UART/I2C/SPI/I2S/PCM/SPDIF/PWM/SLIMBUS IPs
+ - samsung,exynos5433-cmu-peric
+ # CMU_PERIS which generates clocks for PMU/TMU/MCT/WDT/RTC/SECKEY/TZPC IPs
+ - samsung,exynos5433-cmu-peris
+ # CMU_FSYS which generates clocks for USB/UFS/SDMMC/TSI/PDMA IPs
+ - samsung,exynos5433-cmu-fsys
+ - samsung,exynos5433-cmu-g2d
+ # CMU_DISP which generates clocks for Display (DECON/HDMI/DSIM/MIXER) IPs
+ - samsung,exynos5433-cmu-disp
+ - samsung,exynos5433-cmu-aud
+ - samsung,exynos5433-cmu-bus0
+ - samsung,exynos5433-cmu-bus1
+ - samsung,exynos5433-cmu-bus2
+ - samsung,exynos5433-cmu-g3d
+ - samsung,exynos5433-cmu-gscl
+ - samsung,exynos5433-cmu-apollo
+ # CMU_ATLAS which generates clocks for Cortex-A57 Quad-core processor,
+ # CoreSight and L2 cache controller
+ - samsung,exynos5433-cmu-atlas
+ # CMU_MSCL which generates clocks for M2M (Memory to Memory) scaler and
+ # JPEG IPs
+ - samsung,exynos5433-cmu-mscl
+ - samsung,exynos5433-cmu-mfc
+ - samsung,exynos5433-cmu-hevc
+ # CMU_ISP which generates clocks for FIMC-ISP/DRC/SCLC/DIS/3DNR IPs
+ - samsung,exynos5433-cmu-isp
+ # CMU_CAM0 which generates clocks for
+ # MIPI_CSIS{0|1}/FIMC_LITE_{A|B|D}/FIMC_3AA{0|1} IPs
+ - samsung,exynos5433-cmu-cam0
+ # CMU_CAM1 which generates clocks for
+ # Cortex-A5/MIPI_CSIS2/FIMC-LITE_C/FIMC-FD IPs
+ - samsung,exynos5433-cmu-cam1
+ # CMU_IMEM which generates clocks for SSS (Security SubSystem) and
+ # SlimSSS IPs
+ - samsung,exynos5433-cmu-imem
+
+ clocks:
+ minItems: 1
+ maxItems: 10
+
+ clock-names:
+ minItems: 1
+ maxItems: 10
+
+ "#clock-cells":
+ const: 1
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#clock-cells"
+ - reg
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-top
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_mphy_pll
+ - const: sclk_mfc_pll
+ - const: sclk_bus_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-cpif
+ then:
+ properties:
+ clocks:
+ minItems: 1
+ maxItems: 1
+ clock-names:
+ items:
+ - const: oscclk
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-mif
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_mphy_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-fsys
+ then:
+ properties:
+ clocks:
+ minItems: 10
+ maxItems: 10
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_ufs_mphy
+ - const: aclk_fsys_200
+ - const: sclk_pcie_100_fsys
+ - const: sclk_ufsunipro_fsys
+ - const: sclk_mmc2_fsys
+ - const: sclk_mmc1_fsys
+ - const: sclk_mmc0_fsys
+ - const: sclk_usbhost30_fsys
+ - const: sclk_usbdrd30_fsys
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-g2d
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_g2d_266
+ - const: aclk_g2d_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-disp
+ then:
+ properties:
+ clocks:
+ minItems: 9
+ maxItems: 9
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_dsim1_disp
+ - const: sclk_dsim0_disp
+ - const: sclk_dsd_disp
+ - const: sclk_decon_tv_eclk_disp
+ - const: sclk_decon_vclk_disp
+ - const: sclk_decon_eclk_disp
+ - const: sclk_decon_tv_vclk_disp
+ - const: aclk_disp_333
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-aud
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: fout_aud_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-bus0
+ then:
+ properties:
+ clocks:
+ minItems: 1
+ maxItems: 1
+ clock-names:
+ items:
+ - const: aclk_bus0_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-bus1
+ then:
+ properties:
+ clocks:
+ minItems: 1
+ maxItems: 1
+ clock-names:
+ items:
+ - const: aclk_bus1_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-bus2
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_bus2_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-g3d
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_g3d_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-gscl
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_gscl_111
+ - const: aclk_gscl_333
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-apollo
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_bus_pll_apollo
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-atlas
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_bus_pll_atlas
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-mscl
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_jpeg_mscl
+ - const: aclk_mscl_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-mfc
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_mfc_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-hevc
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_hevc_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-isp
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_isp_dis_400
+ - const: aclk_isp_400
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-cam0
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_cam0_333
+ - const: aclk_cam0_400
+ - const: aclk_cam0_552
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-cam1
+ then:
+ properties:
+ clocks:
+ minItems: 7
+ maxItems: 7
+ clock-names:
+ items:
+ - const: oscclk
+ - const: sclk_isp_uart_cam1
+ - const: sclk_isp_spi1_cam1
+ - const: sclk_isp_spi0_cam1
+ - const: aclk_cam1_333
+ - const: aclk_cam1_400
+ - const: aclk_cam1_552
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-cmu-imem
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ clock-names:
+ items:
+ - const: oscclk
+ - const: aclk_imem_sssx_266
+ - const: aclk_imem_266
+ - const: aclk_imem_200
+ required:
+ - clock-names
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5433.h>
+ xxti: clock {
+ compatible = "fixed-clock";
+ clock-output-names = "oscclk";
+ #clock-cells = <0>;
+ clock-frequency = <24000000>;
+ };
+
+ clock-controller@10030000 {
+ compatible = "samsung,exynos5433-cmu-top";
+ reg = <0x10030000 0x1000>;
+ #clock-cells = <1>;
+
+ clock-names = "oscclk",
+ "sclk_mphy_pll",
+ "sclk_mfc_pll",
+ "sclk_bus_pll";
+ clocks = <&xxti>,
+ <&cmu_cpif CLK_SCLK_MPHY_PLL>,
+ <&cmu_mif CLK_SCLK_MFC_PLL>,
+ <&cmu_mif CLK_SCLK_BUS_PLL>;
+ };
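The allOf conditionals above map one-to-one onto node contents for each CMU. A hedged sketch for CMU_MIF, which per the schema takes exactly "oscclk" and "sclk_mphy_pll"; the register base and size are assumed for illustration, and CLK_SCLK_MPHY_PLL is the same macro already used in the CMU_TOP example:

    cmu_mif: clock-controller@105b0000 {
        compatible = "samsung,exynos5433-cmu-mif";
        reg = <0x105b0000 0x2000>;  /* base/size assumed */
        #clock-cells = <1>;

        clock-names = "oscclk", "sclk_mphy_pll";
        clocks = <&xxti>, <&cmu_cpif CLK_SCLK_MPHY_PLL>;
    };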
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos7-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos7-clock.yaml
new file mode 100644
index 000000000000..c137c6744ef9
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos7-clock.yaml
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynos7-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos7 SoC clock controller
+
+maintainers:
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Expected external clocks, defined in DTS as fixed-rate clocks with a matching
+ name:
+ - "fin_pll" - PLL input clock from XXTI
+
+ All available clocks are defined as preprocessor macros in
+ include/dt-bindings/clock/exynos7-clk.h header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos7-clock-topc
+ - samsung,exynos7-clock-top0
+ - samsung,exynos7-clock-top1
+ - samsung,exynos7-clock-ccore
+ - samsung,exynos7-clock-peric0
+ - samsung,exynos7-clock-peric1
+ - samsung,exynos7-clock-peris
+ - samsung,exynos7-clock-fsys0
+ - samsung,exynos7-clock-fsys1
+ - samsung,exynos7-clock-mscl
+ - samsung,exynos7-clock-aud
+
+ clocks:
+ minItems: 1
+ maxItems: 13
+
+ clock-names:
+ minItems: 1
+ maxItems: 13
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#clock-cells"
+ - reg
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-top0
+ then:
+ properties:
+ clocks:
+ minItems: 6
+ maxItems: 6
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_sclk_bus0_pll
+ - const: dout_sclk_bus1_pll
+ - const: dout_sclk_cc_pll
+ - const: dout_sclk_mfc_pll
+ - const: dout_sclk_aud_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-top1
+ then:
+ properties:
+ clocks:
+ minItems: 5
+ maxItems: 5
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_sclk_bus0_pll
+ - const: dout_sclk_bus1_pll
+ - const: dout_sclk_cc_pll
+ - const: dout_sclk_mfc_pll
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-ccore
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_ccore_133
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-peric0
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_peric0_66
+ - const: sclk_uart0
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-peric1
+ then:
+ properties:
+ clocks:
+ minItems: 13
+ maxItems: 13
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_peric1_66
+ - const: sclk_uart1
+ - const: sclk_uart2
+ - const: sclk_uart3
+ - const: sclk_spi0
+ - const: sclk_spi1
+ - const: sclk_spi2
+ - const: sclk_spi3
+ - const: sclk_spi4
+ - const: sclk_i2s1
+ - const: sclk_pcm1
+ - const: sclk_spdif
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-peris
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_peris_66
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-fsys0
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_fsys0_200
+ - const: dout_sclk_mmc2
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-fsys1
+ then:
+ properties:
+ clocks:
+ minItems: 7
+ maxItems: 7
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_aclk_fsys1_200
+ - const: dout_sclk_mmc0
+ - const: dout_sclk_mmc1
+ - const: dout_sclk_ufsunipro20
+ - const: dout_sclk_phy_fsys1
+ - const: dout_sclk_phy_fsys1_26m
+ required:
+ - clock-names
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7-clock-aud
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: fout_aud_pll
+ required:
+ - clock-names
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos7-clk.h>
+
+ fin_pll: clock {
+ compatible = "fixed-clock";
+ clock-output-names = "fin_pll";
+ #clock-cells = <0>;
+ clock-frequency = <24000000>;
+ };
+
+ clock-controller@105e0000 {
+ compatible = "samsung,exynos7-clock-top1";
+ reg = <0x105e0000 0xb000>;
+ #clock-cells = <1>;
+ clocks = <&fin_pll>,
+ <&clock_topc DOUT_SCLK_BUS0_PLL>,
+ <&clock_topc DOUT_SCLK_BUS1_PLL>,
+ <&clock_topc DOUT_SCLK_CC_PLL>,
+ <&clock_topc DOUT_SCLK_MFC_PLL>;
+ clock-names = "fin_pll",
+ "dout_sclk_bus0_pll",
+ "dout_sclk_bus1_pll",
+ "dout_sclk_cc_pll",
+ "dout_sclk_mfc_pll";
+ };
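The same pattern applies to any leaf CMU; a sketch for the PERIC0 block, with the register base assumed and the macro names taken as assumptions about exynos7-clk.h:

    clock-controller@13610000 {
        compatible = "samsung,exynos7-clock-peric0";
        reg = <0x13610000 0xd000>;  /* base/size assumed */
        #clock-cells = <1>;
        clocks = <&fin_pll>,
                 <&clock_top0 DOUT_ACLK_PERIC0_66>,  /* macro name assumed */
                 <&clock_top0 SCLK_UART0>;           /* macro name assumed */
        clock-names = "fin_pll",
                      "dout_aclk_peric0_66",
                      "sclk_uart0";
    };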
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml
new file mode 100644
index 000000000000..006d33a9e0f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos7885-clock.yaml
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynos7885-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos7885 SoC clock controller
+
+maintainers:
+ - Dávid Virág <virag.david003@gmail.com>
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Exynos7885 clock controller is comprised of several CMU units, generating
+ clocks for different domains. Those CMU units are modeled as separate device
+ tree nodes, and might depend on each other. The root clock in that clock tree
+ is an external clock: OSCCLK (26 MHz). This external clock must be defined
+ as a fixed-rate clock in dts.
+
+ CMU_TOP is a top-level CMU, where all base clocks are prepared using PLLs and
+ dividers; all other leaf clocks (other CMUs) are usually derived from CMU_TOP.
+
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All clocks available for usage
+ in clock consumer nodes are defined as preprocessor macros in
+ 'dt-bindings/clock/exynos7885.h' header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos7885-cmu-top
+ - samsung,exynos7885-cmu-core
+ - samsung,exynos7885-cmu-fsys
+ - samsung,exynos7885-cmu-peri
+
+ clocks:
+ minItems: 1
+ maxItems: 10
+
+ clock-names:
+ minItems: 1
+ maxItems: 10
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7885-cmu-top
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+
+ clock-names:
+ items:
+ - const: oscclk
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7885-cmu-core
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_CORE bus clock (from CMU_TOP)
+ - description: CCI clock (from CMU_TOP)
+ - description: G3D clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_core_bus
+ - const: dout_core_cci
+ - const: dout_core_g3d
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7885-cmu-fsys
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_FSYS bus clock (from CMU_TOP)
+ - description: MMC_CARD clock (from CMU_TOP)
+ - description: MMC_EMBD clock (from CMU_TOP)
+ - description: MMC_SDIO clock (from CMU_TOP)
+ - description: USB30DRD clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_fsys_bus
+ - const: dout_fsys_mmc_card
+ - const: dout_fsys_mmc_embd
+ - const: dout_fsys_mmc_sdio
+ - const: dout_fsys_usb30drd
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos7885-cmu-peri
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_PERI bus clock (from CMU_TOP)
+ - description: SPI0 clock (from CMU_TOP)
+ - description: SPI1 clock (from CMU_TOP)
+ - description: UART0 clock (from CMU_TOP)
+ - description: UART1 clock (from CMU_TOP)
+ - description: UART2 clock (from CMU_TOP)
+ - description: USI0 clock (from CMU_TOP)
+ - description: USI1 clock (from CMU_TOP)
+ - description: USI2 clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_peri_bus
+ - const: dout_peri_spi0
+ - const: dout_peri_spi1
+ - const: dout_peri_uart0
+ - const: dout_peri_uart1
+ - const: dout_peri_uart2
+ - const: dout_peri_usi0
+ - const: dout_peri_usi1
+ - const: dout_peri_usi2
+
+required:
+ - compatible
+ - "#clock-cells"
+ - clocks
+ - clock-names
+ - reg
+
+additionalProperties: false
+
+examples:
+ # Clock controller node for CMU_PERI
+ - |
+ #include <dt-bindings/clock/exynos7885.h>
+
+ cmu_peri: clock-controller@10010000 {
+ compatible = "samsung,exynos7885-cmu-peri";
+ reg = <0x10010000 0x8000>;
+ #clock-cells = <1>;
+
+ clocks = <&oscclk>,
+ <&cmu_top CLK_DOUT_PERI_BUS>,
+ <&cmu_top CLK_DOUT_PERI_SPI0>,
+ <&cmu_top CLK_DOUT_PERI_SPI1>,
+ <&cmu_top CLK_DOUT_PERI_UART0>,
+ <&cmu_top CLK_DOUT_PERI_UART1>,
+ <&cmu_top CLK_DOUT_PERI_UART2>,
+ <&cmu_top CLK_DOUT_PERI_USI0>,
+ <&cmu_top CLK_DOUT_PERI_USI1>,
+ <&cmu_top CLK_DOUT_PERI_USI2>;
+ clock-names = "oscclk",
+ "dout_peri_bus",
+ "dout_peri_spi0",
+ "dout_peri_spi1",
+ "dout_peri_uart0",
+ "dout_peri_uart1",
+ "dout_peri_uart2",
+ "dout_peri_usi0",
+ "dout_peri_usi1",
+ "dout_peri_usi2";
+ };
+
+...
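The CMU_FSYS conditional can be instantiated the same way; a sketch with an assumed register base, where the CLK_DOUT_FSYS_* macro names follow the CLK_DOUT_PERI_* pattern used above and are assumptions about exynos7885.h:

    cmu_fsys: clock-controller@13400000 {
        compatible = "samsung,exynos7885-cmu-fsys";
        reg = <0x13400000 0x8000>;  /* base assumed */
        #clock-cells = <1>;

        clocks = <&oscclk>,
                 <&cmu_top CLK_DOUT_FSYS_BUS>,       /* names assumed */
                 <&cmu_top CLK_DOUT_FSYS_MMC_CARD>,
                 <&cmu_top CLK_DOUT_FSYS_MMC_EMBD>,
                 <&cmu_top CLK_DOUT_FSYS_MMC_SDIO>,
                 <&cmu_top CLK_DOUT_FSYS_USB30DRD>;
        clock-names = "oscclk", "dout_fsys_bus",
                      "dout_fsys_mmc_card", "dout_fsys_mmc_embd",
                      "dout_fsys_mmc_sdio", "dout_fsys_usb30drd";
    };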
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml
new file mode 100644
index 000000000000..c752c8985a53
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynos850-clock.yaml
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynos850-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos850 SoC clock controller
+
+maintainers:
+ - Sam Protsenko <semen.protsenko@linaro.org>
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Exynos850 clock controller is comprised of several CMU units, generating
+ clocks for different domains. Those CMU units are modeled as separate device
+ tree nodes, and might depend on each other. Root clocks in that clock tree are
+ two external clocks: OSCCLK (26 MHz) and RTCCLK (32768 Hz). Those external
+ clocks must be defined as fixed-rate clocks in dts.
+
+ CMU_TOP is a top-level CMU, where all base clocks are prepared using PLLs and
+ dividers; all other leaf clocks (other CMUs) are usually derived from CMU_TOP.
+
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All clocks available for usage
+ in clock consumer nodes are defined as preprocessor macros in
+ 'dt-bindings/clock/exynos850.h' header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos850-cmu-top
+ - samsung,exynos850-cmu-apm
+ - samsung,exynos850-cmu-aud
+ - samsung,exynos850-cmu-cmgp
+ - samsung,exynos850-cmu-core
+ - samsung,exynos850-cmu-dpu
+ - samsung,exynos850-cmu-g3d
+ - samsung,exynos850-cmu-hsi
+ - samsung,exynos850-cmu-is
+ - samsung,exynos850-cmu-mfcmscl
+ - samsung,exynos850-cmu-peri
+
+ clocks:
+ minItems: 1
+ maxItems: 5
+
+ clock-names:
+ minItems: 1
+ maxItems: 5
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-top
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+
+ clock-names:
+ items:
+ - const: oscclk
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-apm
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_APM bus clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_apm_bus
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-aud
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: AUD clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_aud
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-cmgp
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_CMGP bus clock (from CMU_APM)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: gout_clkcmu_cmgp_bus
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-core
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_CORE bus clock (from CMU_TOP)
+ - description: CCI clock (from CMU_TOP)
+ - description: eMMC clock (from CMU_TOP)
+ - description: SSS clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_core_bus
+ - const: dout_core_cci
+ - const: dout_core_mmc_embd
+ - const: dout_core_sss
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-dpu
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: DPU clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_dpu
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-g3d
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: G3D clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_g3d_switch
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-hsi
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: External RTC clock (32768 Hz)
+ - description: CMU_HSI bus clock (from CMU_TOP)
+ - description: SD card clock (from CMU_TOP)
+ - description: USB 2.0 DRD clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: rtcclk
+ - const: dout_hsi_bus
+ - const: dout_hsi_mmc_card
+ - const: dout_hsi_usb20drd
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-is
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_IS bus clock (from CMU_TOP)
+ - description: Image Texture Processing core clock (from CMU_TOP)
+ - description: Visual Recognition Accelerator clock (from CMU_TOP)
+ - description: Geometric Distortion Correction clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_is_bus
+ - const: dout_is_itp
+ - const: dout_is_vra
+ - const: dout_is_gdc
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-mfcmscl
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: Multi-Format Codec clock (from CMU_TOP)
+ - description: Memory to Memory Scaler clock (from CMU_TOP)
+ - description: Multi-Channel Scaler clock (from CMU_TOP)
+ - description: JPEG codec clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_mfcmscl_mfc
+ - const: dout_mfcmscl_m2m
+ - const: dout_mfcmscl_mcsc
+ - const: dout_mfcmscl_jpeg
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos850-cmu-peri
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_PERI bus clock (from CMU_TOP)
+ - description: UART clock (from CMU_TOP)
+ - description: Parent clock for HSI2C and SPI (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_peri_bus
+ - const: dout_peri_uart
+ - const: dout_peri_ip
+
+required:
+ - compatible
+ - "#clock-cells"
+ - clocks
+ - clock-names
+ - reg
+
+additionalProperties: false
+
+examples:
+ # Clock controller node for CMU_PERI
+ - |
+ #include <dt-bindings/clock/exynos850.h>
+
+ cmu_peri: clock-controller@10030000 {
+ compatible = "samsung,exynos850-cmu-peri";
+ reg = <0x10030000 0x8000>;
+ #clock-cells = <1>;
+
+ clocks = <&oscclk>, <&cmu_top CLK_DOUT_PERI_BUS>,
+ <&cmu_top CLK_DOUT_PERI_UART>,
+ <&cmu_top CLK_DOUT_PERI_IP>;
+ clock-names = "oscclk", "dout_peri_bus",
+ "dout_peri_uart", "dout_peri_ip";
+ };
+
+...
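CMU_HSI is the one Exynos850 unit above that also consumes RTCCLK; a sketch with an assumed register base, where the CLK_DOUT_HSI_* macros are assumed by analogy with CLK_DOUT_PERI_*:

    cmu_hsi: clock-controller@13400000 {
        compatible = "samsung,exynos850-cmu-hsi";
        reg = <0x13400000 0x8000>;  /* base assumed */
        #clock-cells = <1>;

        clocks = <&oscclk>, <&rtcclk>,
                 <&cmu_top CLK_DOUT_HSI_BUS>,        /* names assumed */
                 <&cmu_top CLK_DOUT_HSI_MMC_CARD>,
                 <&cmu_top CLK_DOUT_HSI_USB20DRD>;
        clock-names = "oscclk", "rtcclk", "dout_hsi_bus",
                      "dout_hsi_mmc_card", "dout_hsi_usb20drd";
    };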
diff --git a/Documentation/devicetree/bindings/clock/samsung,exynosautov9-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynosautov9-clock.yaml
new file mode 100644
index 000000000000..55c4f94a14d1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,exynosautov9-clock.yaml
@@ -0,0 +1,263 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,exynosautov9-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos Auto v9 SoC clock controller
+
+maintainers:
+ - Chanho Park <chanho61.park@samsung.com>
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Exynos Auto v9 clock controller is comprised of several CMU units, generating
+ clocks for different domains. Those CMU units are modeled as separate device
+ tree nodes, and might depend on each other. Root clocks in that clock tree are
+ two external clocks: OSCCLK/XTCXO (26 MHz) and RTCCLK/XrtcXTI (32768 Hz).
+ The external OSCCLK must be defined as a fixed-rate clock in dts.
+
+ CMU_TOP is a top-level CMU, where all base clocks are prepared using PLLs and
+ dividers; all other clocks of function blocks (other CMUs) are usually
+ derived from CMU_TOP.
+
+ Each clock is assigned an identifier and client nodes can use this identifier
+ to specify the clock which they consume. All clocks available for usage
+ in clock consumer nodes are defined as preprocessor macros in
+ 'include/dt-bindings/clock/samsung,exynosautov9.h' header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynosautov9-cmu-top
+ - samsung,exynosautov9-cmu-busmc
+ - samsung,exynosautov9-cmu-core
+ - samsung,exynosautov9-cmu-fsys0
+ - samsung,exynosautov9-cmu-fsys1
+ - samsung,exynosautov9-cmu-fsys2
+ - samsung,exynosautov9-cmu-peric0
+ - samsung,exynosautov9-cmu-peric1
+ - samsung,exynosautov9-cmu-peris
+
+ clocks:
+ minItems: 1
+ maxItems: 5
+
+ clock-names:
+ minItems: 1
+ maxItems: 5
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-top
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+
+ clock-names:
+ items:
+ - const: oscclk
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-busmc
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_BUSMC bus clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_busmc_bus
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-core
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_CORE bus clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_core_bus
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-fsys0
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_FSYS0 bus clock (from CMU_TOP)
+ - description: CMU_FSYS0 pcie clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_fsys0_bus
+ - const: dout_clkcmu_fsys0_pcie
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-fsys1
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_FSYS1 bus clock (from CMU_TOP)
+ - description: CMU_FSYS1 mmc card clock (from CMU_TOP)
+ - description: CMU_FSYS1 usb clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_fsys1_bus
+ - const: gout_clkcmu_fsys1_mmc_card
+ - const: dout_clkcmu_fsys1_usbdrd
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-fsys2
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_FSYS2 bus clock (from CMU_TOP)
+ - description: UFS clock (from CMU_TOP)
+ - description: Ethernet clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_fsys2_bus
+ - const: dout_fsys2_clkcmu_ufs_embd
+ - const: dout_fsys2_clkcmu_ethernet
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-peric0
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_PERIC0 bus clock (from CMU_TOP)
+ - description: PERIC0 IP clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_peric0_bus
+ - const: dout_clkcmu_peric0_ip
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-peric1
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_PERIC1 bus clock (from CMU_TOP)
+ - description: PERIC1 IP clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_peric1_bus
+ - const: dout_clkcmu_peric1_ip
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynosautov9-cmu-peris
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (26 MHz)
+ - description: CMU_PERIS bus clock (from CMU_TOP)
+
+ clock-names:
+ items:
+ - const: oscclk
+ - const: dout_clkcmu_peris_bus
+
+required:
+ - compatible
+ - "#clock-cells"
+ - clocks
+ - clock-names
+ - reg
+
+additionalProperties: false
+
+examples:
+ # Clock controller node for CMU_FSYS2
+ - |
+ #include <dt-bindings/clock/samsung,exynosautov9.h>
+
+ cmu_fsys2: clock-controller@17c00000 {
+ compatible = "samsung,exynosautov9-cmu-fsys2";
+ reg = <0x17c00000 0x8000>;
+ #clock-cells = <1>;
+
+ clocks = <&xtcxo>,
+ <&cmu_top DOUT_CLKCMU_FSYS2_BUS>,
+ <&cmu_top DOUT_CLKCMU_FSYS2_UFS_EMBD>,
+ <&cmu_top DOUT_CLKCMU_FSYS2_ETHERNET>;
+ clock-names = "oscclk",
+ "dout_clkcmu_fsys2_bus",
+ "dout_fsys2_clkcmu_ufs_embd",
+ "dout_fsys2_clkcmu_ethernet";
+ };
+
+...
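Any of the other Exynos Auto v9 CMUs follows the same shape; a sketch for CMU_BUSMC, with the register base assumed and DOUT_CLKCMU_BUSMC_BUS assumed by analogy with the DOUT_CLKCMU_FSYS2_* naming seen in the example:

    cmu_busmc: clock-controller@1b200000 {
        compatible = "samsung,exynosautov9-cmu-busmc";
        reg = <0x1b200000 0x8000>;  /* base assumed */
        #clock-cells = <1>;

        clocks = <&xtcxo>,
                 <&cmu_top DOUT_CLKCMU_BUSMC_BUS>;   /* macro name assumed */
        clock-names = "oscclk", "dout_clkcmu_busmc_bus";
    };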
diff --git a/Documentation/devicetree/bindings/clock/samsung,s2mps11.txt b/Documentation/devicetree/bindings/clock/samsung,s2mps11.txt
deleted file mode 100644
index 2726c1d58a79..000000000000
--- a/Documentation/devicetree/bindings/clock/samsung,s2mps11.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-Binding for Samsung S2M and S5M family clock generator block
-============================================================
-
-This is a part of device tree bindings for S2M and S5M family multi-function
-devices.
-More information can be found in bindings/mfd/sec-core.txt file.
-
-The S2MPS11/13/15 and S5M8767 provide three(AP/CP/BT) buffered 32.768 kHz
-outputs. The S2MPS14 provides two (AP/BT) buffered 32.768 KHz outputs.
-
-To register these as clocks with common clock framework instantiate under
-main device node a sub-node named "clocks".
-
-It uses the common clock binding documented in:
- - Documentation/devicetree/bindings/clock/clock-bindings.txt
-
-
-Required properties of the "clocks" sub-node:
- - #clock-cells: should be 1.
- - compatible: Should be one of: "samsung,s2mps11-clk", "samsung,s2mps13-clk",
- "samsung,s2mps14-clk", "samsung,s5m8767-clk"
- The S2MPS15 uses the same compatible as S2MPS13, as both provides similar
- clocks.
-
-
-Each clock is assigned an identifier and client nodes use this identifier
-to specify the clock which they consume.
- Clock ID Devices
- ----------------------------------------------------------
- 32KhzAP 0 S2MPS11/13/14/15, S5M8767
- 32KhzCP 1 S2MPS11/13/15, S5M8767
- 32KhzBT 2 S2MPS11/13/14/15, S5M8767
-
-Include dt-bindings/clock/samsung,s2mps11.h file to use preprocessor defines
-in device tree sources.
-
-
-Example:
-
- s2mps11_pmic@66 {
- compatible = "samsung,s2mps11-pmic";
- reg = <0x66>;
-
- s2m_osc: clocks {
- compatible = "samsung,s2mps11-clk";
- #clock-cells = <1>;
- clock-output-names = "xx", "yy", "zz";
- };
- };
diff --git a/Documentation/devicetree/bindings/clock/samsung,s2mps11.yaml b/Documentation/devicetree/bindings/clock/samsung,s2mps11.yaml
new file mode 100644
index 000000000000..d5296e6053a1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,s2mps11.yaml
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,s2mps11.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung S2M and S5M family clock generator block
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description: |
+ This is a part of the device tree bindings for the S2M and S5M family of
+ Power Management ICs (PMIC).
+
+ The S2MPS11/13/15 and S5M8767 provide three (AP/CP/BT) buffered 32.768 kHz
+ outputs. The S2MPS14 provides two (AP/BT) buffered 32.768 kHz outputs.
+
+ All available clocks are defined as preprocessor macros in
+ dt-bindings/clock/samsung,s2mps11.h header.
+
+ See also Documentation/devicetree/bindings/mfd/samsung,s2mps11.yaml for
+ additional information and example.
+
+properties:
+ compatible:
+ enum:
+ - samsung,s2mps11-clk
+ - samsung,s2mps13-clk # S2MPS13 and S2MPS15
+ - samsung,s2mps14-clk
+ - samsung,s5m8767-clk
+
+ "#clock-cells":
+ const: 1
+
+ clock-output-names:
+ maxItems: 3
+ description: Names for AP, CP and BT clocks.
+
+required:
+ - compatible
+ - "#clock-cells"
+
+additionalProperties: false
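The converted schema carries no example of its own; for reference, the node shape from the removed txt binding still applies under the parent PMIC node (the output names are placeholders for the AP/CP/BT clocks):

    s2mps11_pmic@66 {
        compatible = "samsung,s2mps11-pmic";
        reg = <0x66>;

        s2m_osc: clocks {
            compatible = "samsung,s2mps11-clk";
            #clock-cells = <1>;
            /* placeholder names for the AP, CP and BT outputs */
            clock-output-names = "xx", "yy", "zz";
        };
    };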
diff --git a/Documentation/devicetree/bindings/clock/samsung,s3c2410-clock.txt b/Documentation/devicetree/bindings/clock/samsung,s3c2410-clock.txt
deleted file mode 100644
index 2632d3f13004..000000000000
--- a/Documentation/devicetree/bindings/clock/samsung,s3c2410-clock.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-* Samsung S3C2410 Clock Controller
-
-The S3C2410 clock controller generates and supplies clock to various controllers
-within the SoC. The clock binding described here is applicable to the s3c2410,
-s3c2440 and s3c2442 SoCs in the s3c24x family.
-
-Required Properties:
-
-- compatible: should be one of the following.
- - "samsung,s3c2410-clock" - controller compatible with S3C2410 SoC.
- - "samsung,s3c2440-clock" - controller compatible with S3C2440 SoC.
- - "samsung,s3c2442-clock" - controller compatible with S3C2442 SoC.
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. Some of the clocks are available only
-on a particular SoC.
-
-All available clocks are defined as preprocessor macros in
-dt-bindings/clock/s3c2410.h header and can be used in device
-tree sources.
-
-External clocks:
-
-The xti clock used as input for the plls is generated outside the SoC. It is
-expected that is are defined using standard clock bindings with a
-clock-output-names value of "xti".
-
-Example: Clock controller node:
-
- clocks: clock-controller@4c000000 {
- compatible = "samsung,s3c2410-clock";
- reg = <0x4c000000 0x20>;
- #clock-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller (refer to the standard clock bindings for information about
- "clocks" and "clock-names" properties):
-
- serial@50004000 {
- compatible = "samsung,s3c2440-uart";
- reg = <0x50004000 0x4000>;
- interrupts = <1 23 3 4>, <1 23 4 4>;
- clock-names = "uart", "clk_uart_baud2";
- clocks = <&clocks PCLK_UART0>, <&clocks PCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/samsung,s3c2412-clock.txt b/Documentation/devicetree/bindings/clock/samsung,s3c2412-clock.txt
deleted file mode 100644
index 21a8c23e658f..000000000000
--- a/Documentation/devicetree/bindings/clock/samsung,s3c2412-clock.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-* Samsung S3C2412 Clock Controller
-
-The S3C2412 clock controller generates and supplies clock to various controllers
-within the SoC. The clock binding described here is applicable to the s3c2412
-and s3c2413 SoCs in the s3c24x family.
-
-Required Properties:
-
-- compatible: should be "samsung,s3c2412-clock"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. Some of the clocks are available only
-on a particular SoC.
-
-All available clocks are defined as preprocessor macros in
-dt-bindings/clock/s3c2412.h header and can be used in device
-tree sources.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xti" - crystal input - required,
- - "ext" - external clock source - optional,
-
-Example: Clock controller node:
-
- clocks: clock-controller@4c000000 {
- compatible = "samsung,s3c2412-clock";
- reg = <0x4c000000 0x20>;
- #clock-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller (refer to the standard clock bindings for information about
- "clocks" and "clock-names" properties):
-
- serial@50004000 {
- compatible = "samsung,s3c2412-uart";
- reg = <0x50004000 0x4000>;
- interrupts = <1 23 3 4>, <1 23 4 4>;
- clock-names = "uart", "clk_uart_baud2", "clk_uart_baud3";
- clocks = <&clocks PCLK_UART0>, <&clocks PCLK_UART0>,
- <&clocks SCLK_UART>;
- };
diff --git a/Documentation/devicetree/bindings/clock/samsung,s3c2443-clock.txt b/Documentation/devicetree/bindings/clock/samsung,s3c2443-clock.txt
deleted file mode 100644
index 985c0f574e9a..000000000000
--- a/Documentation/devicetree/bindings/clock/samsung,s3c2443-clock.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-* Samsung S3C2443 Clock Controller
-
-The S3C2443 clock controller generates and supplies clock to various controllers
-within the SoC. The clock binding described here is applicable to all SoCs in
-the s3c24x family starting with the s3c2443.
-
-Required Properties:
-
-- compatible: should be one of the following.
- - "samsung,s3c2416-clock" - controller compatible with S3C2416 SoC.
- - "samsung,s3c2443-clock" - controller compatible with S3C2443 SoC.
- - "samsung,s3c2450-clock" - controller compatible with S3C2450 SoC.
-- reg: physical base address of the controller and length of memory mapped
- region.
-- #clock-cells: should be 1.
-
-Each clock is assigned an identifier and client nodes can use this identifier
-to specify the clock which they consume. Some of the clocks are available only
-on a particular SoC.
-
-All available clocks are defined as preprocessor macros in
-dt-bindings/clock/s3c2443.h header and can be used in device
-tree sources.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xti" - crystal input - required,
- - "ext" - external clock source - optional,
- - "ext_i2s" - external I2S clock - optional,
- - "ext_uart" - external uart clock - optional,
-
-Example: Clock controller node:
-
- clocks: clock-controller@4c000000 {
- compatible = "samsung,s3c2416-clock";
- reg = <0x4c000000 0x40>;
- #clock-cells = <1>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller (refer to the standard clock bindings for information about
- "clocks" and "clock-names" properties):
-
- serial@50004000 {
- compatible = "samsung,s3c2440-uart";
- reg = <0x50004000 0x4000>;
- interrupts = <1 23 3 4>, <1 23 4 4>;
- clock-names = "uart", "clk_uart_baud2",
- "clk_uart_baud3";
- clocks = <&clocks PCLK_UART0>, <&clocks PCLK_UART0>,
- <&clocks SCLK_UART>;
- };
diff --git a/Documentation/devicetree/bindings/clock/samsung,s5pv210-audss-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,s5pv210-audss-clock.yaml
index ae8f8fc93233..2659854ea1c0 100644
--- a/Documentation/devicetree/bindings/clock/samsung,s5pv210-audss-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,s5pv210-audss-clock.yaml
@@ -8,7 +8,7 @@ title: Samsung S5Pv210 SoC Audio SubSystem clock controller
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
- Sylwester Nawrocki <s.nawrocki@samsung.com>
- Tomasz Figa <tomasz.figa@gmail.com>
diff --git a/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt b/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt
deleted file mode 100644
index a86c83bf9d4e..000000000000
--- a/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-* Samsung S5P6442/S5PC110/S5PV210 Clock Controller
-
-Samsung S5P6442, S5PC110 and S5PV210 SoCs contain integrated clock
-controller, which generates and supplies clock to various controllers
-within the SoC.
-
-Required Properties:
-
-- compatible: should be one of following:
- - "samsung,s5pv210-clock" : for clock controller of Samsung
- S5PC110/S5PV210 SoCs,
- - "samsung,s5p6442-clock" : for clock controller of Samsung
- S5P6442 SoC.
-
-- reg: physical base address of the controller and length of memory mapped
- region.
-
-- #clock-cells: should be 1.
-
-All available clocks are defined as preprocessor macros in
-dt-bindings/clock/s5pv210.h header and can be used in device tree sources.
-
-External clocks:
-
-There are several clocks that are generated outside the SoC. It is expected
-that they are defined using standard clock bindings with following
-clock-output-names:
- - "xxti": external crystal oscillator connected to XXTI and XXTO pins of
-the SoC,
- - "xusbxti": external crystal oscillator connected to XUSBXTI and XUSBXTO
-pins of the SoC,
-
-A subset of above clocks available on given board shall be specified in
-board device tree, including the system base clock, as selected by XOM[0]
-pin of the SoC. Refer to generic fixed rate clock bindings
-documentation[1] for more information how to specify these clocks.
-
-[1] Documentation/devicetree/bindings/clock/fixed-clock.yaml
-
-Example: Clock controller node:
-
- clock: clock-controller@7e00f000 {
- compatible = "samsung,s5pv210-clock";
- reg = <0x7e00f000 0x1000>;
- #clock-cells = <1>;
- };
-
-Example: Required external clocks:
-
- xxti: clock-xxti {
- compatible = "fixed-clock";
- clock-output-names = "xxti";
- clock-frequency = <24000000>;
- #clock-cells = <0>;
- };
-
- xusbxti: clock-xusbxti {
- compatible = "fixed-clock";
- clock-output-names = "xusbxti";
- clock-frequency = <24000000>;
- #clock-cells = <0>;
- };
-
-Example: UART controller node that consumes the clock generated by the clock
- controller (refer to the standard clock bindings for information about
- "clocks" and "clock-names" properties):
-
- uart0: serial@e2900000 {
- compatible = "samsung,s5pv210-uart";
- reg = <0xe2900000 0x400>;
- interrupt-parent = <&vic1>;
- interrupts = <10>;
- clock-names = "uart", "clk_uart_baud0",
- "clk_uart_baud1";
- clocks = <&clocks UART0>, <&clocks UART0>,
- <&clocks SCLK_UART0>;
- };
diff --git a/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.yaml
new file mode 100644
index 000000000000..67a33665cf00
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/samsung,s5pv210-clock.yaml
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/samsung,s5pv210-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung S5P6442/S5PC110/S5PV210 SoC clock controller
+
+maintainers:
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+ - Sylwester Nawrocki <s.nawrocki@samsung.com>
+ - Tomasz Figa <tomasz.figa@gmail.com>
+
+description: |
+ Expected external clocks, defined in DTS as fixed-rate clocks with a matching
+ name:
+ - "xxti" - external crystal oscillator connected to XXTI and XXTO pins of
+ the SoC,
+ - "xusbxti" - external crystal oscillator connected to XUSBXTI and XUSBXTO
+ pins of the SoC,
+
+ All available clocks are defined as preprocessor macros in
+ include/dt-bindings/clock/s5pv210.h header.
+
+properties:
+ compatible:
+ enum:
+ - samsung,s5pv210-clock
+ - samsung,s5p6442-clock
+
+ clocks:
+ items:
+ - description: xxti clock
+ - description: xusbxti clock
+
+ clock-names:
+ items:
+ - const: xxti
+ - const: xusbxti
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#clock-cells"
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/s5pv210.h>
+
+ xxti: clock-0 {
+ compatible = "fixed-clock";
+ clock-frequency = <0>;
+ clock-output-names = "xxti";
+ #clock-cells = <0>;
+ };
+
+ xusbxti: clock-1 {
+ compatible = "fixed-clock";
+ clock-frequency = <0>;
+ clock-output-names = "xusbxti";
+ #clock-cells = <0>;
+ };
+
+ clock-controller@e0100000 {
+ compatible = "samsung,s5pv210-clock";
+ reg = <0xe0100000 0x10000>;
+ clock-names = "xxti", "xusbxti";
+ clocks = <&xxti>, <&xusbxti>;
+ #clock-cells = <1>;
+ };
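The consumer side is unchanged by the conversion; the UART example from the removed txt binding still applies, assuming the controller node above carries a `clocks` label:

    uart0: serial@e2900000 {
        compatible = "samsung,s5pv210-uart";
        reg = <0xe2900000 0x400>;
        interrupt-parent = <&vic1>;
        interrupts = <10>;
        clock-names = "uart", "clk_uart_baud0", "clk_uart_baud1";
        clocks = <&clocks UART0>, <&clocks UART0>, <&clocks SCLK_UART0>;
    };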
diff --git a/Documentation/devicetree/bindings/clock/sifive/fu540-prci.yaml b/Documentation/devicetree/bindings/clock/sifive/fu540-prci.yaml
index c3be1b600007..c79e752283aa 100644
--- a/Documentation/devicetree/bindings/clock/sifive/fu540-prci.yaml
+++ b/Documentation/devicetree/bindings/clock/sifive/fu540-prci.yaml
@@ -8,7 +8,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: SiFive FU540 Power Reset Clock Interrupt Controller (PRCI)
maintainers:
- - Sagar Kadam <sagar.kadam@sifive.com>
- Paul Walmsley <paul.walmsley@sifive.com>
description:
diff --git a/Documentation/devicetree/bindings/clock/sifive/fu740-prci.yaml b/Documentation/devicetree/bindings/clock/sifive/fu740-prci.yaml
index e17143cac316..252085a0cf65 100644
--- a/Documentation/devicetree/bindings/clock/sifive/fu740-prci.yaml
+++ b/Documentation/devicetree/bindings/clock/sifive/fu740-prci.yaml
@@ -42,6 +42,9 @@ properties:
"#clock-cells":
const: 1
+ "#reset-cells":
+ const: 1
+
required:
- compatible
- reg
@@ -57,4 +60,5 @@ examples:
reg = <0x10000000 0x1000>;
clocks = <&hfclk>, <&rtcclk>;
#clock-cells = <1>;
+ #reset-cells = <1>;
};
diff --git a/Documentation/devicetree/bindings/clock/silabs,si5351.txt b/Documentation/devicetree/bindings/clock/silabs,si5351.txt
index 8fe6f80afade..bfda6af76bee 100644
--- a/Documentation/devicetree/bindings/clock/silabs,si5351.txt
+++ b/Documentation/devicetree/bindings/clock/silabs,si5351.txt
@@ -2,7 +2,7 @@ Binding for Silicon Labs Si5351a/b/c programmable i2c clock generator.
Reference
[1] Si5351A/B/C Data Sheet
- https://www.silabs.com/Support%20Documents/TechnicalDocs/Si5351.pdf
+ https://www.skyworksinc.com/-/media/Skyworks/SL/documents/public/data-sheets/Si5351-B.pdf
The Si5351a/b/c are programmable i2c clock generators with up to 8 output
clocks. Si5351a also has a reduced pin-count package (MSOP10) where only
diff --git a/Documentation/devicetree/bindings/clock/skyworks,si521xx.yaml b/Documentation/devicetree/bindings/clock/skyworks,si521xx.yaml
new file mode 100644
index 000000000000..9e35e0e51ce8
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/skyworks,si521xx.yaml
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/skyworks,si521xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Skyworks Si521xx I2C PCIe clock generators
+
+description: |
+ The Skyworks Si521xx are I2C PCIe clock generators providing
+ 4 to 9 output clocks.
+
+maintainers:
+ - Marek Vasut <marex@denx.de>
+
+properties:
+ compatible:
+ enum:
+ - skyworks,si52144
+ - skyworks,si52146
+ - skyworks,si52147
+
+ reg:
+ const: 0x6b
+
+ '#clock-cells':
+ const: 1
+
+ clocks:
+ items:
+ - description: XTal input clock
+
+ skyworks,out-amplitude-microvolt:
+ enum: [ 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000 ]
+ description: Output clock signal amplitude
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ clock-generator@6b {
+ compatible = "skyworks,si52144";
+ reg = <0x6b>;
+ #clock-cells = <1>;
+ clocks = <&ref25m>;
+ };
+ };
+
+...
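The example references an otherwise undefined &ref25m input; a sketch of that node, with the frequency assumed from the label rather than taken from a datasheet:

    ref25m: clock-ref25m {
        compatible = "fixed-clock";
        #clock-cells = <0>;
        clock-frequency = <25000000>;  /* 25 MHz assumed from the label */
        clock-output-names = "ref25m";
    };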
diff --git a/Documentation/devicetree/bindings/clock/socionext,uniphier-clock.yaml b/Documentation/devicetree/bindings/clock/socionext,uniphier-clock.yaml
index c3930edc410f..4e82582fb2f3 100644
--- a/Documentation/devicetree/bindings/clock/socionext,uniphier-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/socionext,uniphier-clock.yaml
@@ -23,6 +23,7 @@ properties:
- socionext,uniphier-ld11-clock
- socionext,uniphier-ld20-clock
- socionext,uniphier-pxs3-clock
+ - socionext,uniphier-nx1-clock
- description: Media I/O (MIO) clock, SD clock
enum:
- socionext,uniphier-ld4-mio-clock
@@ -33,6 +34,7 @@ properties:
- socionext,uniphier-ld11-mio-clock
- socionext,uniphier-ld20-sd-clock
- socionext,uniphier-pxs3-sd-clock
+ - socionext,uniphier-nx1-sd-clock
- description: Peripheral clock
enum:
- socionext,uniphier-ld4-peri-clock
@@ -43,6 +45,10 @@ properties:
- socionext,uniphier-ld11-peri-clock
- socionext,uniphier-ld20-peri-clock
- socionext,uniphier-pxs3-peri-clock
+ - socionext,uniphier-nx1-peri-clock
+ - description: SoC-glue clock
+ enum:
+ - socionext,uniphier-pro4-sg-clock
"#clock-cells":
const: 1
@@ -55,40 +61,7 @@ required:
examples:
- |
- sysctrl@61840000 {
- compatible = "socionext,uniphier-sysctrl", "simple-mfd", "syscon";
- reg = <0x61840000 0x4000>;
-
- clock {
- compatible = "socionext,uniphier-ld11-clock";
- #clock-cells = <1>;
- };
-
- // other nodes ...
- };
-
- - |
- mioctrl@59810000 {
- compatible = "socionext,uniphier-mioctrl", "simple-mfd", "syscon";
- reg = <0x59810000 0x800>;
-
- clock {
- compatible = "socionext,uniphier-ld11-mio-clock";
- #clock-cells = <1>;
- };
-
- // other nodes ...
- };
-
- - |
- perictrl@59820000 {
- compatible = "socionext,uniphier-perictrl", "simple-mfd", "syscon";
- reg = <0x59820000 0x200>;
-
- clock {
- compatible = "socionext,uniphier-ld11-peri-clock";
- #clock-cells = <1>;
- };
-
- // other nodes ...
+ clock-controller {
+ compatible = "socionext,uniphier-ld11-clock";
+ #clock-cells = <1>;
};
diff --git a/Documentation/devicetree/bindings/clock/sprd,sc9863a-clk.yaml b/Documentation/devicetree/bindings/clock/sprd,sc9863a-clk.yaml
index 47e1ab08c95d..1703e305e6d8 100644
--- a/Documentation/devicetree/bindings/clock/sprd,sc9863a-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/sprd,sc9863a-clk.yaml
@@ -2,10 +2,10 @@
# Copyright 2019 Unisoc Inc.
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/clock/sprd,sc9863a-clk.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/clock/sprd,sc9863a-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: SC9863A Clock Control Unit Device Tree Bindings
+title: SC9863A Clock Control Unit
maintainers:
- Orson Zhai <orsonzhai@gmail.com>
diff --git a/Documentation/devicetree/bindings/clock/sprd,ums512-clk.yaml b/Documentation/devicetree/bindings/clock/sprd,ums512-clk.yaml
new file mode 100644
index 000000000000..43d2b6c31357
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/sprd,ums512-clk.yaml
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2022 Unisoc Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/sprd,ums512-clk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: UMS512 SoC clock controller
+
+maintainers:
+ - Orson Zhai <orsonzhai@gmail.com>
+ - Baolin Wang <baolin.wang7@gmail.com>
+ - Chunyan Zhang <zhang.lyra@gmail.com>
+
+properties:
+ compatible:
+ enum:
+ - sprd,ums512-apahb-gate
+ - sprd,ums512-ap-clk
+ - sprd,ums512-aonapb-clk
+ - sprd,ums512-pmu-gate
+ - sprd,ums512-g0-pll
+ - sprd,ums512-g2-pll
+ - sprd,ums512-g3-pll
+ - sprd,ums512-gc-pll
+ - sprd,ums512-aon-gate
+ - sprd,ums512-audcpapb-gate
+ - sprd,ums512-audcpahb-gate
+ - sprd,ums512-gpu-clk
+ - sprd,ums512-mm-clk
+ - sprd,ums512-mm-gate-clk
+ - sprd,ums512-apapb-gate
+
+ "#clock-cells":
+ const: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 4
+ description: |
+ The input parent clock(s) phandle for this clock; list only the
+ fixed clocks that are declared in the devicetree.
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: ext-26m
+ - const: ext-32k
+ - const: ext-4m
+ - const: rco-100m
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - '#clock-cells'
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ ap_clk: clock-controller@20200000 {
+ compatible = "sprd,ums512-ap-clk";
+ reg = <0x20200000 0x1000>;
+ clocks = <&ext_26m>;
+ clock-names = "ext-26m";
+ #clock-cells = <1>;
+ };
+...
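The example shows the single-parent form; a sketch of the full four-parent form permitted by clock-names, here for the AON APB unit with an assumed register base:

    aon_clk: clock-controller@32080000 {
        compatible = "sprd,ums512-aonapb-clk";
        reg = <0x32080000 0x1000>;  /* base assumed */
        clocks = <&ext_26m>, <&ext_32k>, <&ext_4m>, <&rco_100m>;
        clock-names = "ext-26m", "ext-32k", "ext-4m", "rco-100m";
        #clock-cells = <1>;
    };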
diff --git a/Documentation/devicetree/bindings/clock/st,stm32mp1-rcc.yaml b/Documentation/devicetree/bindings/clock/st,stm32mp1-rcc.yaml
index 8b1ecb2ecdd5..5194be0b410e 100644
--- a/Documentation/devicetree/bindings/clock/st,stm32mp1-rcc.yaml
+++ b/Documentation/devicetree/bindings/clock/st,stm32mp1-rcc.yaml
@@ -4,10 +4,10 @@
$id: http://devicetree.org/schemas/clock/st,stm32mp1-rcc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Reset Clock Controller Binding
+title: STMicroelectronics STM32MP1 Reset Clock Controller
maintainers:
- - Gabriel Fernandez <gabriel.fernandez@st.com>
+ - Gabriel Fernandez <gabriel.fernandez@foss.st.com>
description: |
The RCC IP is both a reset and a clock controller.
@@ -41,6 +41,7 @@ description: |
The list of valid indices for STM32MP1 is available in:
include/dt-bindings/reset-controller/stm32mp1-resets.h
+ include/dt-bindings/reset-controller/stm32mp13-resets.h
This file implements defines like:
#define LTDC_R 3072
@@ -57,7 +58,10 @@ properties:
- enum:
- st,stm32mp1-rcc-secure
- st,stm32mp1-rcc
+ - st,stm32mp13-rcc
- const: syscon
+ clocks: true
+ clock-names: true
reg:
maxItems: 1
@@ -68,14 +72,54 @@ required:
- compatible
- reg
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - st,stm32mp1-rcc-secure
+ - st,stm32mp13-rcc
+then:
+ properties:
+ clocks:
+ description: Specifies oscillators.
+ maxItems: 5
+
+ clock-names:
+ items:
+ - const: hse
+ - const: hsi
+ - const: csi
+ - const: lse
+ - const: lsi
+ required:
+ - clocks
+ - clock-names
+else:
+ properties:
+ clocks:
+ description:
+ Specifies the external RX clock for ethernet MAC.
+ maxItems: 1
+
+ clock-names:
+ const: ETH_RX_CLK/ETH_REF_CLK
+
additionalProperties: false
examples:
- |
+ #include <dt-bindings/clock/stm32mp1-clks.h>
rcc: rcc@50000000 {
compatible = "st,stm32mp1-rcc-secure", "syscon";
reg = <0x50000000 0x1000>;
#clock-cells = <1>;
#reset-cells = <1>;
+ clock-names = "hse", "hsi", "csi", "lse", "lsi";
+ clocks = <&scmi_clk CK_SCMI_HSE>,
+ <&scmi_clk CK_SCMI_HSI>,
+ <&scmi_clk CK_SCMI_CSI>,
+ <&scmi_clk CK_SCMI_LSE>,
+ <&scmi_clk CK_SCMI_LSI>;
};
...
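The example exercises the secure branch of the if/else above; the non-secure branch instead takes a single external ethernet reference clock. A sketch, with &eth_rx_clk as a hypothetical fixed clock feeding the MAC:

    rcc@50000000 {
        compatible = "st,stm32mp1-rcc", "syscon";
        reg = <0x50000000 0x1000>;
        #clock-cells = <1>;
        #reset-cells = <1>;
        clocks = <&eth_rx_clk>;  /* hypothetical external RX clock */
        clock-names = "ETH_RX_CLK/ETH_REF_CLK";
    };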
diff --git a/Documentation/devicetree/bindings/clock/st/st,flexgen.txt b/Documentation/devicetree/bindings/clock/st/st,flexgen.txt
index 55a18939bddd..c918075405ba 100644
--- a/Documentation/devicetree/bindings/clock/st/st,flexgen.txt
+++ b/Documentation/devicetree/bindings/clock/st/st,flexgen.txt
@@ -78,7 +78,7 @@ Required properties:
- #clock-cells : from common clock binding; shall be set to 1 (multiple clock
outputs).
-- clocks : must be set to the parent's phandle. it's could be output clocks of
+- clocks : must be set to the parent's phandle. it could be output clocks of
a quadsfs or/and a pll or/and clk_sysin (up to 7 clocks)
- clock-output-names : List of strings used to name the clock outputs.
diff --git a/Documentation/devicetree/bindings/clock/starfive,jh7100-audclk.yaml b/Documentation/devicetree/bindings/clock/starfive,jh7100-audclk.yaml
new file mode 100644
index 000000000000..8f49a1ae03f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/starfive,jh7100-audclk.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/starfive,jh7100-audclk.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH7100 Audio Clock Generator
+
+maintainers:
+ - Emil Renner Berthing <kernel@esmil.dk>
+
+properties:
+ compatible:
+ const: starfive,jh7100-audclk
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Audio source clock
+ - description: External 12.288MHz clock
+ - description: Domain 7 AHB bus clock
+
+ clock-names:
+ items:
+ - const: audio_src
+ - const: audio_12288
+ - const: dom7ahb_bus
+
+ '#clock-cells':
+ const: 1
+ description:
+ See <dt-bindings/clock/starfive-jh7100-audio.h> for valid indices.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/starfive-jh7100.h>
+
+ clock-controller@10480000 {
+ compatible = "starfive,jh7100-audclk";
+ reg = <0x10480000 0x10000>;
+ clocks = <&clkgen JH7100_CLK_AUDIO_SRC>,
+ <&clkgen JH7100_CLK_AUDIO_12288>,
+ <&clkgen JH7100_CLK_DOM7AHB_BUS>;
+ clock-names = "audio_src", "audio_12288", "dom7ahb_bus";
+ #clock-cells = <1>;
+ };
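Consumers select one of the generated audio clocks through the single clock
cell; a hedged sketch, assuming the controller above is labelled audclk (the
consumer node, compatible and index macro are illustrative only; see
<dt-bindings/clock/starfive-jh7100-audio.h> for the real indices):

    audio-device@10400000 {
        compatible = "starfive,jh7100-audio-device"; /* hypothetical */
        reg = <0x10400000 0x1000>;
        clocks = <&audclk JH7100_AUDCLK_ADC_MCLK>;   /* illustrative index */
        clock-names = "mclk";
    };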
diff --git a/Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml b/Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml
new file mode 100644
index 000000000000..12f17b60ecbe
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/starfive,jh7100-clkgen.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/starfive,jh7100-clkgen.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH7100 Clock Generator
+
+maintainers:
+ - Geert Uytterhoeven <geert@linux-m68k.org>
+ - Emil Renner Berthing <kernel@esmil.dk>
+
+properties:
+ compatible:
+ const: starfive,jh7100-clkgen
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Main clock source (25 MHz)
+ - description: Application-specific clock source (12-27 MHz)
+ - description: RMII reference clock (50 MHz)
+ - description: RGMII RX clock (125 MHz)
+
+ clock-names:
+ items:
+ - const: osc_sys
+ - const: osc_aud
+ - const: gmac_rmii_ref
+ - const: gmac_gr_mii_rxclk
+
+ '#clock-cells':
+ const: 1
+ description:
+ See <dt-bindings/clock/starfive-jh7100.h> for valid indices.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@11800000 {
+ compatible = "starfive,jh7100-clkgen";
+ reg = <0x11800000 0x10000>;
+ clocks = <&osc_sys>, <&osc_aud>, <&gmac_rmii_ref>, <&gmac_gr_mii_rxclk>;
+ clock-names = "osc_sys", "osc_aud", "gmac_rmii_ref", "gmac_gr_mii_rxclk";
+ #clock-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml b/Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml
new file mode 100644
index 000000000000..923680a44aef
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/starfive,jh7110-aoncrg.yaml
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/starfive,jh7110-aoncrg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH7110 Always-On Clock and Reset Generator
+
+maintainers:
+ - Emil Renner Berthing <kernel@esmil.dk>
+
+properties:
+ compatible:
+ const: starfive,jh7110-aoncrg
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ oneOf:
+ - items:
+ - description: Main Oscillator (24 MHz)
+ - description: GMAC0 RMII reference or GMAC0 RGMII RX
+ - description: STG AXI/AHB
+ - description: APB Bus
+ - description: GMAC0 GTX
+
+ - items:
+ - description: Main Oscillator (24 MHz)
+ - description: GMAC0 RMII reference or GMAC0 RGMII RX
+ - description: STG AXI/AHB or GMAC0 RGMII RX
+ - description: APB Bus or STG AXI/AHB
+ - description: GMAC0 GTX or APB Bus
+ - description: RTC Oscillator (32.768 kHz) or GMAC0 GTX
+
+ - items:
+ - description: Main Oscillator (24 MHz)
+ - description: GMAC0 RMII reference
+ - description: GMAC0 RGMII RX
+ - description: STG AXI/AHB
+ - description: APB Bus
+ - description: GMAC0 GTX
+ - description: RTC Oscillator (32.768 kHz)
+
+ clock-names:
+ oneOf:
+ - minItems: 5
+ items:
+ - const: osc
+ - enum:
+ - gmac0_rmii_refin
+ - gmac0_rgmii_rxin
+ - const: stg_axiahb
+ - const: apb_bus
+ - const: gmac0_gtxclk
+ - const: rtc_osc
+
+ - minItems: 6
+ items:
+ - const: osc
+ - const: gmac0_rmii_refin
+ - const: gmac0_rgmii_rxin
+ - const: stg_axiahb
+ - const: apb_bus
+ - const: gmac0_gtxclk
+ - const: rtc_osc
+
+ '#clock-cells':
+ const: 1
+ description:
+ See <dt-bindings/clock/starfive,jh7110-crg.h> for valid indices.
+
+ '#reset-cells':
+ const: 1
+ description:
+ See <dt-bindings/reset/starfive,jh7110-crg.h> for valid indices.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/starfive,jh7110-crg.h>
+
+ clock-controller@17000000 {
+ compatible = "starfive,jh7110-aoncrg";
+ reg = <0x17000000 0x10000>;
+ clocks = <&osc>, <&gmac0_rmii_refin>,
+ <&gmac0_rgmii_rxin>,
+ <&syscrg JH7110_SYSCLK_STG_AXIAHB>,
+ <&syscrg JH7110_SYSCLK_APB_BUS>,
+ <&syscrg JH7110_SYSCLK_GMAC0_GTXCLK>,
+ <&rtc_osc>;
+ clock-names = "osc", "gmac0_rmii_refin",
+ "gmac0_rgmii_rxin", "stg_axiahb",
+ "apb_bus", "gmac0_gtxclk",
+ "rtc_osc";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
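A peripheral in the always-on domain then consumes both the clock and the reset
cells; a sketch modelled on the JH7110 GMAC0, assuming the controller above is
labelled aoncrg (the macro names are taken, as an assumption, from the
starfive,jh7110-crg.h clock and reset headers):

    ethernet@16030000 {
        compatible = "starfive,jh7110-dwmac", "snps,dwmac-5.20";
        reg = <0x16030000 0x10000>;
        clocks = <&aoncrg JH7110_AONCLK_GMAC0_AXI>,
                 <&aoncrg JH7110_AONCLK_GMAC0_AHB>;
        clock-names = "stmmaceth", "pclk";
        resets = <&aoncrg JH7110_AONRST_GMAC0_AXI>,
                 <&aoncrg JH7110_AONRST_GMAC0_AHB>;
        reset-names = "stmmaceth", "ahb";
    };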
diff --git a/Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml b/Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml
new file mode 100644
index 000000000000..84373ae31644
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/starfive,jh7110-syscrg.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/starfive,jh7110-syscrg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH7110 System Clock and Reset Generator
+
+maintainers:
+ - Emil Renner Berthing <kernel@esmil.dk>
+
+properties:
+ compatible:
+ const: starfive,jh7110-syscrg
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ oneOf:
+ - items:
+ - description: Main Oscillator (24 MHz)
+ - description: GMAC1 RMII reference or GMAC1 RGMII RX
+ - description: External I2S TX bit clock
+ - description: External I2S TX left/right channel clock
+ - description: External I2S RX bit clock
+ - description: External I2S RX left/right channel clock
+ - description: External TDM clock
+ - description: External audio master clock
+
+ - items:
+ - description: Main Oscillator (24 MHz)
+ - description: GMAC1 RMII reference
+ - description: GMAC1 RGMII RX
+ - description: External I2S TX bit clock
+ - description: External I2S TX left/right channel clock
+ - description: External I2S RX bit clock
+ - description: External I2S RX left/right channel clock
+ - description: External TDM clock
+ - description: External audio master clock
+
+ clock-names:
+ oneOf:
+ - items:
+ - const: osc
+ - enum:
+ - gmac1_rmii_refin
+ - gmac1_rgmii_rxin
+ - const: i2stx_bclk_ext
+ - const: i2stx_lrck_ext
+ - const: i2srx_bclk_ext
+ - const: i2srx_lrck_ext
+ - const: tdm_ext
+ - const: mclk_ext
+
+ - items:
+ - const: osc
+ - const: gmac1_rmii_refin
+ - const: gmac1_rgmii_rxin
+ - const: i2stx_bclk_ext
+ - const: i2stx_lrck_ext
+ - const: i2srx_bclk_ext
+ - const: i2srx_lrck_ext
+ - const: tdm_ext
+ - const: mclk_ext
+
+ '#clock-cells':
+ const: 1
+ description:
+ See <dt-bindings/clock/starfive,jh7110-crg.h> for valid indices.
+
+ '#reset-cells':
+ const: 1
+ description:
+ See <dt-bindings/reset/starfive,jh7110-crg.h> for valid indices.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#clock-cells'
+ - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@13020000 {
+ compatible = "starfive,jh7110-syscrg";
+ reg = <0x13020000 0x10000>;
+ clocks = <&osc>, <&gmac1_rmii_refin>,
+ <&gmac1_rgmii_rxin>,
+ <&i2stx_bclk_ext>, <&i2stx_lrck_ext>,
+ <&i2srx_bclk_ext>, <&i2srx_lrck_ext>,
+ <&tdm_ext>, <&mclk_ext>;
+ clock-names = "osc", "gmac1_rmii_refin",
+ "gmac1_rgmii_rxin",
+ "i2stx_bclk_ext", "i2stx_lrck_ext",
+ "i2srx_bclk_ext", "i2srx_lrck_ext",
+ "tdm_ext", "mclk_ext";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/clock/stericsson,u8500-clks.yaml b/Documentation/devicetree/bindings/clock/stericsson,u8500-clks.yaml
new file mode 100644
index 000000000000..2150307219a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/stericsson,u8500-clks.yaml
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/stericsson,u8500-clks.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ST-Ericsson DB8500 (U8500) clocks
+
+maintainers:
+ - Ulf Hansson <ulf.hansson@linaro.org>
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description: While named "U8500 clocks" these clocks are inside the
+ DB8500 digital baseband system-on-chip and its siblings such as
+ DB8520. These bindings consider the clocks present in the SoC
+ itself, not off-chip clocks. There are four different on-chip
+ clocks - RTC (32 kHz), CPU clock (SMP TWD), PRCMU (power reset and
+ control management unit) clocks and PRCC (peripheral reset and
+ clock controller) clocks. For some reason PRCC 4 does not exist so
+ the itemization can be a bit unintuitive.
+
+properties:
+ compatible:
+ enum:
+ - stericsson,u8500-clks
+ - stericsson,u8540-clks
+ - stericsson,u9540-clks
+
+ reg:
+ items:
+ - description: PRCC 1 register area
+ - description: PRCC 2 register area
+ - description: PRCC 3 register area
+ - description: PRCC 5 register area
+ - description: PRCC 6 register area
+
+ prcmu-clock:
+ description: A subnode with one clock cell for PRCMU (power, reset, control
+ management unit) clocks. The cell indicates which PRCMU clock in the
+ prcmu-clock node the consumer wants to use.
+ type: object
+
+ properties:
+ '#clock-cells':
+ const: 1
+
+ additionalProperties: false
+
+ prcc-periph-clock:
+ description: A subnode with two clock cells for PRCC (peripheral
+ reset and clock controller) peripheral clocks. The first cell indicates
+ which PRCC block the consumer wants to use, possible values are 1, 2, 3,
+ 5, 6. The second cell indicates which clock inside the PRCC block it
+ wants, possible values are 0 thru 31.
+ type: object
+
+ properties:
+ '#clock-cells':
+ const: 2
+
+ additionalProperties: false
+
+ prcc-kernel-clock:
+ description: A subnode with two clock cells for PRCC (peripheral reset
+ and clock controller) kernel clocks. The first cell indicates which PRCC
+ block the consumer wants to use, possible values are 1, 2, 3, 5, 6. The
+ second cell indicates which clock inside the PRCC block it wants, possible
+ values are 0 thru 31.
+ type: object
+
+ properties:
+ '#clock-cells':
+ const: 2
+
+ additionalProperties: false
+
+ prcc-reset-controller:
+ description: A subnode with two reset cells for the reset portions of the
+ PRCC (peripheral reset and clock controller). The first cell indicates
+ which PRCC block the consumer wants to use, possible values are 1, 2, 3,
+ 5 and 6. The second cell indicates which reset line inside the PRCC block
+ it wants to control, possible values are 0 thru 31.
+ type: object
+
+ properties:
+ '#reset-cells':
+ const: 2
+
+ additionalProperties: false
+
+ rtc32k-clock:
+ description: A subnode with zero clock cells for the 32kHz RTC clock.
+ type: object
+
+ properties:
+ '#clock-cells':
+ const: 0
+
+ additionalProperties: false
+
+ smp-twd-clock:
+ description: A subnode for the ARM SMP Timer Watchdog cluster with zero
+ clock cells.
+ type: object
+
+ properties:
+ '#clock-cells':
+ const: 0
+
+ additionalProperties: false
+
+ clkout-clock:
+ description: A subnode with three clock cells for externally routed
+ (output) clocks. These are two PRCMU-internal clocks that can be divided and
+ muxed out on the pads of the DB8500 SoC.
+ type: object
+
+ properties:
+ '#clock-cells':
+ description:
+ The first cell indicates which output clock we are using,
+ possible values are 0 (CLKOUT1) and 1 (CLKOUT2).
+ The second cell indicates which clock we want to use as source,
+ possible values are 0 thru 7, see the defines for the different
+ source clocks.
+ The third cell is a divider, legal values are 1 thru 63.
+ const: 3
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - prcmu-clock
+ - prcc-periph-clock
+ - prcc-kernel-clock
+ - rtc32k-clock
+ - smp-twd-clock
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/ste-db8500-clkout.h>
clocks@8012f000 {
+ compatible = "stericsson,u8500-clks";
+ reg = <0x8012f000 0x1000>, <0x8011f000 0x1000>,
+ <0x8000f000 0x1000>, <0xa03ff000 0x1000>,
+ <0xa03cf000 0x1000>;
+
+ prcmu_clk: prcmu-clock {
+ #clock-cells = <1>;
+ };
+
+ prcc_pclk: prcc-periph-clock {
+ #clock-cells = <2>;
+ };
+
+ prcc_kclk: prcc-kernel-clock {
+ #clock-cells = <2>;
+ };
+
+ prcc_reset: prcc-reset-controller {
+ #reset-cells = <2>;
+ };
+
+ rtc_clk: rtc32k-clock {
+ #clock-cells = <0>;
+ };
+
+ smp_twd_clk: smp-twd-clock {
+ #clock-cells = <0>;
+ };
+
+ clkout_clk: clkout-clock {
+ #clock-cells = <3>;
+ };
+ };
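Consumers address PRCC clocks with the two-cell scheme: first the PRCC block
number (1, 2, 3, 5 or 6), then the clock bit within the block. A sketch
following the DB8500 UART convention (register address and bit values are
illustrative):

    serial@80120000 {
        compatible = "arm,pl011", "arm,primecell";
        reg = <0x80120000 0x1000>;
        clocks = <&prcc_kclk 1 0>, <&prcc_pclk 1 0>;
        clock-names = "uart", "apb_pclk";
    };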
diff --git a/Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml b/Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml
new file mode 100644
index 000000000000..bcc14088220a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/sunplus,sp7021-clkc.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) Sunplus Co., Ltd. 2021
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/sunplus,sp7021-clkc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sunplus SP7021 SoC Clock Controller
+
+maintainers:
+ - Qin Jian <qinjian@cqplus1.com>
+
+properties:
+ compatible:
+ const: sunplus,sp7021-clkc
+
+ reg:
+ maxItems: 3
+
+ clocks:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - "#clock-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ extclk: osc0 {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <27000000>;
+ clock-output-names = "extclk";
+ };
+
+ clkc: clock-controller@9c000004 {
+ compatible = "sunplus,sp7021-clkc";
+ reg = <0x9c000004 0x28>,
+ <0x9c000200 0x44>,
+ <0x9c000268 0x08>;
+ clocks = <&extclk>;
+ #clock-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/tesla,fsd-clock.yaml b/Documentation/devicetree/bindings/clock/tesla,fsd-clock.yaml
new file mode 100644
index 000000000000..dc808e2f8327
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/tesla,fsd-clock.yaml
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/tesla,fsd-clock.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Tesla FSD (Full Self-Driving) SoC clock controller
+
+maintainers:
+ - Alim Akhtar <alim.akhtar@samsung.com>
+ - linux-fsd@tesla.com
+
+description: |
+ The FSD clock controller consists of several clock management units
+ (CMUs), which generate clocks for various internal SoC blocks.
+ The root clock comes from external OSC clock (24 MHz).
+
+ All available clocks are defined as preprocessor macros in
+ 'dt-bindings/clock/fsd-clk.h' header.
+
+properties:
+ compatible:
+ enum:
+ - tesla,fsd-clock-cmu
+ - tesla,fsd-clock-imem
+ - tesla,fsd-clock-peric
+ - tesla,fsd-clock-fsys0
+ - tesla,fsd-clock-fsys1
+ - tesla,fsd-clock-mfc
+ - tesla,fsd-clock-cam_csi
+
+ clocks:
+ minItems: 1
+ maxItems: 6
+
+ clock-names:
+ minItems: 1
+ maxItems: 6
+
+ "#clock-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-cmu
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ clock-names:
+ items:
+ - const: fin_pll
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-imem
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ - description: IMEM TCU clock (from CMU_CMU)
+ - description: IMEM bus clock (from CMU_CMU)
+ - description: IMEM DMA clock (from CMU_CMU)
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_cmu_imem_tcuclk
+ - const: dout_cmu_imem_aclk
+ - const: dout_cmu_imem_dmaclk
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-peric
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ - description: Shared0 PLL div4 clock (from CMU_CMU)
+ - description: PERIC shared1 div36 clock (from CMU_CMU)
+ - description: PERIC shared0 div3 TBU clock (from CMU_CMU)
+ - description: PERIC shared0 div20 clock (from CMU_CMU)
+ - description: PERIC shared1 div4 DMAclock (from CMU_CMU)
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_cmu_pll_shared0_div4
+ - const: dout_cmu_peric_shared1div36
+ - const: dout_cmu_peric_shared0div3_tbuclk
+ - const: dout_cmu_peric_shared0div20
+ - const: dout_cmu_peric_shared1div4_dmaclk
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-fsys0
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ - description: Shared0 PLL div6 clock (from CMU_CMU)
+ - description: FSYS0 shared1 div4 clock (from CMU_CMU)
+ - description: FSYS0 shared0 div4 clock (from CMU_CMU)
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_cmu_pll_shared0_div6
+ - const: dout_cmu_fsys0_shared1div4
+ - const: dout_cmu_fsys0_shared0div4
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-fsys1
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ - description: FSYS1 shared0 div8 clock (from CMU_CMU)
+ - description: FSYS1 shared0 div4 clock (from CMU_CMU)
+ clock-names:
+ items:
+ - const: fin_pll
+ - const: dout_cmu_fsys1_shared0div8
+ - const: dout_cmu_fsys1_shared0div4
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-mfc
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ clock-names:
+ items:
+ - const: fin_pll
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: tesla,fsd-clock-cam_csi
+ then:
+ properties:
+ clocks:
+ items:
+ - description: External reference clock (24 MHz)
+ clock-names:
+ items:
+ - const: fin_pll
+
+required:
+ - compatible
+ - "#clock-cells"
+ - clocks
+ - clock-names
+ - reg
+
+additionalProperties: false
+
+examples:
+ # Clock controller node for CMU_FSYS1
+ - |
+ #include <dt-bindings/clock/fsd-clk.h>
+
+ clock_fsys1: clock-controller@16810000 {
+ compatible = "tesla,fsd-clock-fsys1";
+ reg = <0x16810000 0x3000>;
+ #clock-cells = <1>;
+
+ clocks = <&fin_pll>,
+ <&clock_cmu DOUT_CMU_FSYS1_SHARED0DIV8>,
+ <&clock_cmu DOUT_CMU_FSYS1_SHARED0DIV4>;
+ clock-names = "fin_pll",
+ "dout_cmu_fsys1_shared0div8",
+ "dout_cmu_fsys1_shared0div4";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/ti,am654-ehrpwm-tbclk.yaml b/Documentation/devicetree/bindings/clock/ti,am654-ehrpwm-tbclk.yaml
index 9b537bc876b5..66765116aff5 100644
--- a/Documentation/devicetree/bindings/clock/ti,am654-ehrpwm-tbclk.yaml
+++ b/Documentation/devicetree/bindings/clock/ti,am654-ehrpwm-tbclk.yaml
@@ -15,6 +15,7 @@ properties:
- enum:
- ti,am654-ehrpwm-tbclk
- ti,am64-epwm-tbclk
+ - ti,am62-epwm-tbclk
- const: syscon
"#clock-cells":
diff --git a/Documentation/devicetree/bindings/clock/ti,cdce925.txt b/Documentation/devicetree/bindings/clock/ti,cdce925.txt
deleted file mode 100644
index df42ab72718f..000000000000
--- a/Documentation/devicetree/bindings/clock/ti,cdce925.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-Binding for TI CDCE913/925/937/949 programmable I2C clock synthesizers.
-
-Reference
-This binding uses the common clock binding[1].
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-[2] https://www.ti.com/product/cdce913
-[3] https://www.ti.com/product/cdce925
-[4] https://www.ti.com/product/cdce937
-[5] https://www.ti.com/product/cdce949
-
-The driver provides clock sources for each output Y1 through Y5.
-
-Required properties:
- - compatible: Shall be one of the following:
- - "ti,cdce913": 1-PLL, 3 Outputs
- - "ti,cdce925": 2-PLL, 5 Outputs
- - "ti,cdce937": 3-PLL, 7 Outputs
- - "ti,cdce949": 4-PLL, 9 Outputs
- - reg: I2C device address.
- - clocks: Points to a fixed parent clock that provides the input frequency.
- - #clock-cells: From common clock bindings: Shall be 1.
-
-Optional properties:
- - xtal-load-pf: Crystal load-capacitor value to fine-tune performance on a
- board, or to compensate for external influences.
-- vdd-supply: A regulator node for Vdd
-- vddout-supply: A regulator node for Vddout
-
-For all PLL1, PLL2, ... an optional child node can be used to specify spread
-spectrum clocking parameters for a board.
- - spread-spectrum: SSC mode as defined in the data sheet.
- - spread-spectrum-center: Use "centered" mode instead of "max" mode. When
- present, the clock runs at the requested frequency on average. Otherwise
- the requested frequency is the maximum value of the SCC range.
-
-
-Example:
-
- clockgen: cdce925pw@64 {
- compatible = "cdce925";
- reg = <0x64>;
- clocks = <&xtal_27Mhz>;
- #clock-cells = <1>;
- xtal-load-pf = <5>;
- vdd-supply = <&1v8-reg>;
- vddout-supply = <&3v3-reg>;
- /* PLL options to get SSC 1% centered */
- PLL2 {
- spread-spectrum = <4>;
- spread-spectrum-center;
- };
- };
diff --git a/Documentation/devicetree/bindings/clock/ti,cdce925.yaml b/Documentation/devicetree/bindings/clock/ti,cdce925.yaml
new file mode 100644
index 000000000000..a4ec8dd5ddf1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/ti,cdce925.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/ti,cdce925.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TI CDCE913/925/937/949 programmable I2C clock synthesizers
+
+maintainers:
+ - Alexander Stein <alexander.stein@ew.tq-group.com>
+
+description: |
+ Flexible Low Power LVCMOS Clock Generator with SSC Support for EMI Reduction
+
+ - CDCE(L)913: 1-PLL, 3 Outputs https://www.ti.com/product/cdce913
+ - CDCE(L)925: 2-PLL, 5 Outputs https://www.ti.com/product/cdce925
+ - CDCE(L)937: 3-PLL, 7 Outputs https://www.ti.com/product/cdce937
+ - CDCE(L)949: 4-PLL, 9 Outputs https://www.ti.com/product/cdce949
+
+properties:
+ compatible:
+ enum:
+ - ti,cdce913
+ - ti,cdce925
+ - ti,cdce937
+ - ti,cdce949
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: fixed parent clock
+
+ "#clock-cells":
+ const: 1
+
+ vdd-supply:
+ description: Regulator that provides 1.8V Vdd power supply
+
+ vddout-supply:
+ description: |
+ Regulator that provides Vddout power supply.
+ 2.5V or 3.3V for the non-L variant
+ 1.8V for the L variant
+
+ xtal-load-pf:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ Crystal load-capacitor value to fine-tune performance on a
+ board, or to compensate for external influences.
+
+patternProperties:
+ "^PLL[1-4]$":
+ type: object
+ description: |
+ An optional child node can be used to specify spread
+ spectrum clocking parameters for a board
+
+ additionalProperties: false
+
+ properties:
+ spread-spectrum:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: SSC mode as defined in the data sheet
+
+ spread-spectrum-center:
+ type: boolean
+ description: |
+ Use "centered" mode instead of "max" mode. When
+ present, the clock runs at the requested frequency on average.
+ Otherwise the requested frequency is the maximum value of the
+ SSC range.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - "#clock-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cdce925: clock-controller@64 {
+ compatible = "ti,cdce925";
+ reg = <0x64>;
+ clocks = <&xtal_27Mhz>;
+ #clock-cells = <1>;
+ xtal-load-pf = <5>;
+ vdd-supply = <&reg_1v8>;
+ vddout-supply = <&reg_3v3>;
+ /* PLL options to get SSC 1% centered */
+ PLL2 {
+ spread-spectrum = <4>;
+ spread-spectrum-center;
+ };
+ };
+ };
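Each output Y1..Yn is then addressed through the single clock cell of the
synthesizer node; a hedged consumer sketch (the zero-based index-to-output
mapping is an assumption, and the consumer node is hypothetical):

    codec@1a {
        compatible = "acme,example-codec"; /* hypothetical consumer */
        reg = <0x1a>;
        clocks = <&cdce925 0>;             /* assumed to select output Y1 */
    };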
diff --git a/Documentation/devicetree/bindings/clock/ti,lmk04832.yaml b/Documentation/devicetree/bindings/clock/ti,lmk04832.yaml
index bd8173848253..13d7b3d03d84 100644
--- a/Documentation/devicetree/bindings/clock/ti,lmk04832.yaml
+++ b/Documentation/devicetree/bindings/clock/ti,lmk04832.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/ti,lmk04832.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Clock bindings for the Texas Instruments LMK04832
+title: Texas Instruments LMK04832 Clock Controller
maintainers:
- Liam Beguin <liambeguin@gmail.com>
@@ -160,7 +160,7 @@ examples:
};
};
- spi0 {
+ spi {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/clock/ti,sci-clk.yaml b/Documentation/devicetree/bindings/clock/ti,sci-clk.yaml
index 0e370289a053..63d976341696 100644
--- a/Documentation/devicetree/bindings/clock/ti,sci-clk.yaml
+++ b/Documentation/devicetree/bindings/clock/ti,sci-clk.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/clock/ti,sci-clk.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: TI-SCI clock controller node bindings
+title: TI-SCI clock controller
maintainers:
- Nishanth Menon <nm@ti.com>
diff --git a/Documentation/devicetree/bindings/clock/ti-clkctrl.txt b/Documentation/devicetree/bindings/clock/ti-clkctrl.txt
index 18af6b9409e3..d20db7974a38 100644
--- a/Documentation/devicetree/bindings/clock/ti-clkctrl.txt
+++ b/Documentation/devicetree/bindings/clock/ti-clkctrl.txt
@@ -21,6 +21,7 @@ Required properties :
"ti,clkctrl-l4-per"
"ti,clkctrl-l4-secure"
"ti,clkctrl-l4-wkup"
+- clock-output-names : from common clock binding
- #clock-cells : shall contain 2 with the first entry being the instance
offset from the clock domain base and the second being the
clock index
@@ -32,7 +33,8 @@ Example: Clock controller node on omap 4430:
l4per: cm@1400 {
cm_l4per@0 {
cm_l4per_clkctrl: clock@20 {
- compatible = "ti,clkctrl-l4-per", "ti,clkctrl";
+ compatible = "ti,clkctrl";
+ clock-output-names = "l4_per";
reg = <0x20 0x1b0>;
#clock-cells = <2>;
};
diff --git a/Documentation/devicetree/bindings/clock/ti/clockdomain.txt b/Documentation/devicetree/bindings/clock/ti/clockdomain.txt
index cb76b3f2b341..9c6199249ce5 100644
--- a/Documentation/devicetree/bindings/clock/ti/clockdomain.txt
+++ b/Documentation/devicetree/bindings/clock/ti/clockdomain.txt
@@ -17,6 +17,9 @@ Required properties:
- #clock-cells : from common clock binding; shall be set to 0.
- clocks : link phandles of clocks within this domain
+Optional properties:
+- clock-output-names : from common clock binding.
+
Examples:
dss_clkdm: dss_clkdm {
compatible = "ti,clockdomain";
diff --git a/Documentation/devicetree/bindings/clock/ti/composite.txt b/Documentation/devicetree/bindings/clock/ti/composite.txt
index 5f43c4706b09..33ac7c9ad053 100644
--- a/Documentation/devicetree/bindings/clock/ti/composite.txt
+++ b/Documentation/devicetree/bindings/clock/ti/composite.txt
@@ -27,6 +27,9 @@ Required properties:
- clocks : link phandles of component clocks
- #clock-cells : from common clock binding; shall be set to 0.
+Optional properties:
+- clock-output-names : from common clock binding.
+
Examples:
usb_l4_gate_ick: usb_l4_gate_ick {
diff --git a/Documentation/devicetree/bindings/clock/ti/davinci/pll.txt b/Documentation/devicetree/bindings/clock/ti/davinci/pll.txt
index 36998e184821..c9894538315b 100644
--- a/Documentation/devicetree/bindings/clock/ti/davinci/pll.txt
+++ b/Documentation/devicetree/bindings/clock/ti/davinci/pll.txt
@@ -15,7 +15,7 @@ Required properties:
- for "ti,da850-pll1", shall be "clksrc"
Optional properties:
-- ti,clkmode-square-wave: Indicates that the the board is supplying a square
+- ti,clkmode-square-wave: Indicates that the board is supplying a square
wave input on the OSCIN pin instead of using a crystal oscillator.
This property is only valid when compatible = "ti,da850-pll0".
diff --git a/Documentation/devicetree/bindings/clock/ti/dra7-atl.txt b/Documentation/devicetree/bindings/clock/ti/dra7-atl.txt
index 21c002d28b9b..68504079f99f 100644
--- a/Documentation/devicetree/bindings/clock/ti/dra7-atl.txt
+++ b/Documentation/devicetree/bindings/clock/ti/dra7-atl.txt
@@ -6,7 +6,7 @@ functional clock but can be configured to provide different clocks.
ATL can maintain a clock averages to some desired frequency based on the bws/aws
signals - can compensate the drift between the two ws signal.
-In order to provide the support for ATL and it's output clocks (which can be used
+In order to provide the support for ATL and its output clocks (which can be used
internally within the SoC or external components) two sets of bindings is needed:
Clock tree binding:
diff --git a/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt b/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt
index 662b36d53bf0..518e3c142276 100644
--- a/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt
+++ b/Documentation/devicetree/bindings/clock/ti/fixed-factor-clock.txt
@@ -16,6 +16,7 @@ Required properties:
- clocks: parent clock.
Optional properties:
+- clock-output-names : from common clock binding.
- ti,autoidle-shift: bit shift of the autoidle enable bit for the clock,
see [2]
- reg: offset for the autoidle register of this clock, see [2]
diff --git a/Documentation/devicetree/bindings/clock/ti/gate.txt b/Documentation/devicetree/bindings/clock/ti/gate.txt
index 56d603c1f716..4982615c01b9 100644
--- a/Documentation/devicetree/bindings/clock/ti/gate.txt
+++ b/Documentation/devicetree/bindings/clock/ti/gate.txt
@@ -10,7 +10,7 @@ will be controlled instead and the corresponding hw-ops for
that is used.
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-[2] Documentation/devicetree/bindings/clock/gpio-gate-clock.txt
+[2] Documentation/devicetree/bindings/clock/gpio-gate-clock.yaml
[3] Documentation/devicetree/bindings/clock/ti/clockdomain.txt
Required properties:
@@ -36,6 +36,7 @@ Required properties:
ti,clkdm-gate-clock type
Optional properties:
+- clock-output-names : from common clock binding.
- ti,bit-shift : bit shift for programming the clock gate, invalid for
ti,clkdm-gate-clock type
- ti,set-bit-to-disable : inverts default gate programming. Setting the bit
diff --git a/Documentation/devicetree/bindings/clock/ti/interface.txt b/Documentation/devicetree/bindings/clock/ti/interface.txt
index 3f4704040140..d3eb5ca92a7f 100644
--- a/Documentation/devicetree/bindings/clock/ti/interface.txt
+++ b/Documentation/devicetree/bindings/clock/ti/interface.txt
@@ -9,7 +9,7 @@ companion clock finding (match corresponding functional gate
clock) and hardware autoidle enable / disable.
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-[2] Documentation/devicetree/bindings/clock/gpio-gate-clock.txt
+[2] Documentation/devicetree/bindings/clock/gpio-gate-clock.yaml
Required properties:
- compatible : shall be one of:
@@ -28,6 +28,7 @@ Required properties:
- reg : base address for the control register
Optional properties:
+- clock-output-names : from common clock binding.
- ti,bit-shift : bit shift for the bit enabling/disabling the clock (default 0)
Examples:
diff --git a/Documentation/devicetree/bindings/clock/ti/mux.txt b/Documentation/devicetree/bindings/clock/ti/mux.txt
index eec8994b9be8..e17425a58621 100644
--- a/Documentation/devicetree/bindings/clock/ti/mux.txt
+++ b/Documentation/devicetree/bindings/clock/ti/mux.txt
@@ -42,6 +42,7 @@ Required properties:
- reg : register offset for register controlling adjustable mux
Optional properties:
+- clock-output-names : from common clock binding.
- ti,bit-shift : number of bits to shift the bit-mask, defaults to
0 if not present
- ti,index-starts-at-one : valid input select programming starts at 1, not
diff --git a/Documentation/devicetree/bindings/clock/ti/ti,clksel.yaml b/Documentation/devicetree/bindings/clock/ti/ti,clksel.yaml
new file mode 100644
index 000000000000..d525f96cf244
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/ti/ti,clksel.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/ti/ti,clksel.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TI clksel clock
+
+maintainers:
+ - Tony Lindgren <tony@atomide.com>
+
+description: |
+ The TI CLKSEL clocks consist of input clock mux bits, and in some
+ cases also have divider, multiplier and gate bits.
+
+properties:
+ compatible:
+ const: ti,clksel
+
+ reg:
+ maxItems: 1
+ description: The CLKSEL register range
+
+ '#address-cells':
+ enum: [ 0, 1, 2 ]
+
+ '#size-cells':
+ enum: [ 0, 1, 2 ]
+
+ ranges: true
+
+ "#clock-cells":
+ const: 2
+ description: The CLKSEL register and bit offset
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+
+additionalProperties:
+ type: object
+
+examples:
+ - |
+ clksel_gfx_fclk: clock@52c {
+ compatible = "ti,clksel";
+ reg = <0x52c 0x4>;
+ #clock-cells = <2>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml b/Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml
new file mode 100644
index 000000000000..d36558aa39f3
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pipllct.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/toshiba,tmpv770x-pipllct.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Toshiba Visconti5 TMPV770X PLL Controller
+
+maintainers:
+ - Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
+
+description:
+ Toshiba Visconti5 PLL controller which supports the PLLs on TMPV770X.
+
+properties:
+ compatible:
+ const: toshiba,tmpv7708-pipllct
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ clocks:
+ description: External reference clock (OSC2)
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+
+ osc2_clk: osc2-clk {
+ compatible = "fixed-clock";
+ clock-frequency = <20000000>;
+ #clock-cells = <0>;
+ };
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pipllct: clock-controller@24220000 {
+ compatible = "toshiba,tmpv7708-pipllct";
+ reg = <0 0x24220000 0 0x820>;
+ #clock-cells = <1>;
+ clocks = <&osc2_clk>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml b/Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml
new file mode 100644
index 000000000000..081f85b1eb88
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/toshiba,tmpv770x-pismu.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/toshiba,tmpv770x-pismu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Toshiba Visconti5 TMPV770x SMU controller
+
+maintainers:
+ - Nobuhiro Iwamatsu <nobuhiro1.iwamatsu@toshiba.co.jp>
+
+description:
+ Toshiba Visconti5 SMU (System Management Unit) which supports the clocks
+ and resets on TMPV770x.
+
+properties:
+ compatible:
+ items:
+ - const: toshiba,tmpv7708-pismu
+ - const: syscon
+
+ reg:
+ maxItems: 1
+
+ '#clock-cells':
+ const: 1
+
+ '#reset-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - "#reset-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ pismu: syscon@24200000 {
+ compatible = "toshiba,tmpv7708-pismu", "syscon";
+ reg = <0 0x24200000 0 0x2140>;
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/clock/ux500.txt b/Documentation/devicetree/bindings/clock/ux500.txt
deleted file mode 100644
index e52bd4b72348..000000000000
--- a/Documentation/devicetree/bindings/clock/ux500.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-Clock bindings for ST-Ericsson Ux500 clocks
-
-Required properties :
-- compatible : shall contain only one of the following:
- "stericsson,u8500-clks"
- "stericsson,u8540-clks"
- "stericsson,u9540-clks"
-- reg : shall contain base register location and length for
- CLKRST1, 2, 3, 5, and 6 in an array. Note the absence of
- CLKRST4, which does not exist.
-
-Required subnodes:
-- prcmu-clock: a subnode with one clock cell for PRCMU (power,
- reset, control unit) clocks. The cell indicates which PRCMU
- clock in the prcmu-clock node the consumer wants to use.
-- prcc-periph-clock: a subnode with two clock cells for
- PRCC (programmable reset- and clock controller) peripheral clocks.
- The first cell indicates which PRCC block the consumer
- wants to use, possible values are 1, 2, 3, 5, 6. The second
- cell indicates which clock inside the PRCC block it wants,
- possible values are 0 thru 31.
-- prcc-kernel-clock: a subnode with two clock cells for
- PRCC (programmable reset- and clock controller) kernel clocks
- The first cell indicates which PRCC block the consumer
- wants to use, possible values are 1, 2, 3, 5, 6. The second
- cell indicates which clock inside the PRCC block it wants,
- possible values are 0 thru 31.
-- rtc32k-clock: a subnode with zero clock cells for the 32kHz
- RTC clock.
-- smp-twd-clock: a subnode for the ARM SMP Timer Watchdog cluster
- with zero clock cells.
-
-Example:
-
-clocks {
- compatible = "stericsson,u8500-clks";
- /*
- * Registers for the CLKRST block on peripheral
- * groups 1, 2, 3, 5, 6,
- */
- reg = <0x8012f000 0x1000>, <0x8011f000 0x1000>,
- <0x8000f000 0x1000>, <0xa03ff000 0x1000>,
- <0xa03cf000 0x1000>;
-
- prcmu_clk: prcmu-clock {
- #clock-cells = <1>;
- };
-
- prcc_pclk: prcc-periph-clock {
- #clock-cells = <2>;
- };
-
- prcc_kclk: prcc-kernel-clock {
- #clock-cells = <2>;
- };
-
- rtc_clk: rtc32k-clock {
- #clock-cells = <0>;
- };
-
- smp_twd_clk: smp-twd-clock {
- #clock-cells = <0>;
- };
-};
diff --git a/Documentation/devicetree/bindings/clock/xlnx,clocking-wizard.yaml b/Documentation/devicetree/bindings/clock/xlnx,clocking-wizard.yaml
new file mode 100644
index 000000000000..c1f04830a832
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/xlnx,clocking-wizard.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/xlnx,clocking-wizard.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx clocking wizard
+
+maintainers:
+ - Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
+
+description:
+ The clocking wizard is a soft IP clocking block of Xilinx Versal. It
+ reads the required input clock frequencies from the devicetree and acts as a
+ clock output.
+
+properties:
+ compatible:
+ enum:
+ - xlnx,clocking-wizard
+ - xlnx,clocking-wizard-v5.2
+ - xlnx,clocking-wizard-v6.0
+
+
+ reg:
+ maxItems: 1
+
+ "#clock-cells":
+ const: 1
+
+ clocks:
+ items:
+ - description: clock input
+ - description: axi clock
+
+ clock-names:
+ items:
+ - const: clk_in1
+ - const: s_axi_aclk
+
+
+ xlnx,speed-grade:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [1, 2, 3]
+ description:
+ Speed grade of the device. The higher the speed grade, the faster the FPGA device.
+
+ xlnx,nr-outputs:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1
+ maximum: 8
+ description:
+ Number of outputs.
+
+required:
+ - compatible
+ - reg
+ - "#clock-cells"
+ - clocks
+ - clock-names
+ - xlnx,speed-grade
+ - xlnx,nr-outputs
+
+additionalProperties: false
+
+examples:
+ - |
+ clock-controller@b0000000 {
+ compatible = "xlnx,clocking-wizard";
+ reg = <0xb0000000 0x10000>;
+ #clock-cells = <1>;
+ xlnx,speed-grade = <1>;
+ xlnx,nr-outputs = <6>;
+ clock-names = "clk_in1", "s_axi_aclk";
+ clocks = <&clkc 15>, <&clkc 15>;
+ };
+...
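Downstream FPGA logic picks one of the xlnx,nr-outputs clocks through the
clock cell; a sketch assuming the controller above is labelled clk_wiz and
that the cell is the zero-based wizard output number (both assumptions):

    peripheral@a0000000 {
        compatible = "acme,fpga-peripheral"; /* hypothetical consumer */
        reg = <0xa0000000 0x1000>;
        clocks = <&clk_wiz 2>;               /* third wizard output */
    };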
diff --git a/Documentation/devicetree/bindings/connector/usb-connector.yaml b/Documentation/devicetree/bindings/connector/usb-connector.yaml
index 7eb8659fa610..ae515651fc6b 100644
--- a/Documentation/devicetree/bindings/connector/usb-connector.yaml
+++ b/Documentation/devicetree/bindings/connector/usb-connector.yaml
@@ -104,8 +104,7 @@ properties:
- "1.5A" and "3.0A", 5V 1.5A and 5V 3.0A respectively, as defined in USB
Type-C Cable and Connector specification, when Power Delivery is not
supported.
- allOf:
- - $ref: /schemas/types.yaml#/definitions/string
+ $ref: /schemas/types.yaml#/definitions/string
enum:
- default
- 1.5A
@@ -264,11 +263,11 @@ examples:
# Micro-USB connector with HS lines routed via controller (MUIC).
- |
muic-max77843 {
- usb_con1: connector {
- compatible = "usb-b-connector";
- label = "micro-USB";
- type = "micro";
- };
+ usb_con1: connector {
+ compatible = "usb-b-connector";
+ label = "micro-USB";
+ type = "micro";
+ };
};
# USB-C connector attached to CC controller (s2mm005), HS lines routed
@@ -276,34 +275,34 @@ examples:
# DisplayPort video lines are routed to the connector via SS mux in USB3 PHY.
- |
ccic: s2mm005 {
- usb_con2: connector {
- compatible = "usb-c-connector";
- label = "USB-C";
+ usb_con2: connector {
+ compatible = "usb-c-connector";
+ label = "USB-C";
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
- port@0 {
- reg = <0>;
- usb_con_hs: endpoint {
- remote-endpoint = <&max77865_usbc_hs>;
- };
- };
- port@1 {
- reg = <1>;
- usb_con_ss: endpoint {
- remote-endpoint = <&usbdrd_phy_ss>;
- };
- };
- port@2 {
- reg = <2>;
- usb_con_sbu: endpoint {
- remote-endpoint = <&dp_aux>;
+ port@0 {
+ reg = <0>;
+ usb_con_hs: endpoint {
+ remote-endpoint = <&max77865_usbc_hs>;
+ };
+ };
+ port@1 {
+ reg = <1>;
+ usb_con_ss: endpoint {
+ remote-endpoint = <&usbdrd_phy_ss>;
+ };
+ };
+ port@2 {
+ reg = <2>;
+ usb_con_sbu: endpoint {
+ remote-endpoint = <&dp_aux>;
+ };
+ };
};
- };
};
- };
};
# USB-C connector attached to a typec port controller(ptn5110), which has
@@ -311,16 +310,16 @@ examples:
- |
#include <dt-bindings/usb/pd.h>
typec: ptn5110 {
- usb_con3: connector {
- compatible = "usb-c-connector";
- label = "USB-C";
- power-role = "dual";
- try-power-role = "sink";
- source-pdos = <PDO_FIXED(5000, 2000, PDO_FIXED_USB_COMM)>;
- sink-pdos = <PDO_FIXED(5000, 2000, PDO_FIXED_USB_COMM)
- PDO_VAR(5000, 12000, 2000)>;
- op-sink-microwatt = <10000000>;
- };
+ usb_con3: connector {
+ compatible = "usb-c-connector";
+ label = "USB-C";
+ power-role = "dual";
+ try-power-role = "sink";
+ source-pdos = <PDO_FIXED(5000, 2000, PDO_FIXED_USB_COMM)>;
+ sink-pdos = <PDO_FIXED(5000, 2000, PDO_FIXED_USB_COMM)
+ PDO_VAR(5000, 12000, 2000)>;
+ op-sink-microwatt = <10000000>;
+ };
};
# USB-C connector attached to SoC and USB3 typec port controller(hd3ss3220)
@@ -333,20 +332,20 @@ examples:
data-role = "dual";
ports {
- #address-cells = <1>;
- #size-cells = <0>;
- port@0 {
- reg = <0>;
- hs_ep: endpoint {
- remote-endpoint = <&usb3_hs_ep>;
- };
+ #address-cells = <1>;
+ #size-cells = <0>;
+ port@0 {
+ reg = <0>;
+ hs_ep: endpoint {
+ remote-endpoint = <&usb3_hs_ep>;
};
- port@1 {
- reg = <1>;
- ss_ep: endpoint {
- remote-endpoint = <&hd3ss3220_in_ep>;
- };
+ };
+ port@1 {
+ reg = <1>;
+ ss_ep: endpoint {
+ remote-endpoint = <&hd3ss3220_in_ep>;
};
+ };
};
};
@@ -355,12 +354,12 @@ examples:
#include <dt-bindings/gpio/gpio.h>
usb {
- connector {
- compatible = "gpio-usb-b-connector", "usb-b-connector";
- type = "micro";
- id-gpios = <&pio 12 GPIO_ACTIVE_HIGH>;
- vbus-supply = <&usb_p0_vbus>;
- };
+ connector {
+ compatible = "gpio-usb-b-connector", "usb-b-connector";
+ type = "micro";
+ id-gpios = <&pio 12 GPIO_ACTIVE_HIGH>;
+ vbus-supply = <&usb_p0_vbus>;
+ };
};
# Micro-USB connector with HS lines routed via controller (MUIC) and MHL
@@ -368,27 +367,27 @@ examples:
# mobile phone
- |
muic-max77843 {
- usb_con4: connector {
- compatible = "samsung,usb-connector-11pin", "usb-b-connector";
- label = "micro-USB";
- type = "micro";
+ usb_con4: connector {
+ compatible = "samsung,usb-connector-11pin", "usb-b-connector";
+ label = "micro-USB";
+ type = "micro";
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
- port@0 {
- reg = <0>;
- muic_to_usb: endpoint {
- remote-endpoint = <&usb_to_muic>;
- };
- };
- port@3 {
- reg = <3>;
- usb_con_mhl: endpoint {
- remote-endpoint = <&sii8620_mhl>;
+ port@0 {
+ reg = <0>;
+ muic_to_usb: endpoint {
+ remote-endpoint = <&usb_to_muic>;
+ };
+ };
+ port@3 {
+ reg = <3>;
+ usb_con_mhl: endpoint {
+ remote-endpoint = <&sii8620_mhl>;
+ };
+ };
};
- };
};
- };
};
diff --git a/Documentation/devicetree/bindings/counter/ti,am62-ecap-capture.yaml b/Documentation/devicetree/bindings/counter/ti,am62-ecap-capture.yaml
new file mode 100644
index 000000000000..4e0b2d2b303e
--- /dev/null
+++ b/Documentation/devicetree/bindings/counter/ti,am62-ecap-capture.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/counter/ti,am62-ecap-capture.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments Enhanced Capture (eCAP) Module
+
+maintainers:
+ - Julien Panis <jpanis@baylibre.com>
+
+description: |
+ The eCAP module resources can be used to capture timestamps
+ on input signal events (falling/rising edges).
+
+properties:
+ compatible:
+ const: ti,am62-ecap-capture
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: fck
+
+ power-domains:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/soc/ti,sci_pm_domain.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ capture@23100000 { /* eCAP in capture mode on am62x */
+ compatible = "ti,am62-ecap-capture";
+ reg = <0x00 0x23100000 0x00 0x100>;
+ interrupts = <GIC_SPI 113 IRQ_TYPE_EDGE_RISING>;
+ power-domains = <&k3_pds 51 TI_SCI_PD_EXCLUSIVE>;
+ clocks = <&k3_clks 51 0>;
+ clock-names = "fck";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/arm/cpu-capacity.txt b/Documentation/devicetree/bindings/cpu/cpu-capacity.txt
index 380e21c5fc7e..f28e1adad428 100644
--- a/Documentation/devicetree/bindings/arm/cpu-capacity.txt
+++ b/Documentation/devicetree/bindings/cpu/cpu-capacity.txt
@@ -1,12 +1,12 @@
==========================================
-ARM CPUs capacity bindings
+CPU capacity bindings
==========================================
==========================================
1 - Introduction
==========================================
-ARM systems may be configured to have cpus with different power/performance
+Some systems may be configured to have cpus with different power/performance
characteristics within the same chip. In this case, additional information has
to be made available to the kernel for it to be aware of such differences and
take decisions accordingly.
@@ -62,8 +62,8 @@ Example 1 (ARM 64-bit, 6-cpu system, two clusters):
The capacities-dmips-mhz or DMIPS/MHz values (scaled to 1024)
are 1024 and 578 for cluster0 and cluster1. Further normalization
is done by the operating system based on cluster0@max-freq=1100 and
-custer1@max-freq=850, final capacities are 1024 for cluster0 and
-446 for cluster1 (576*850/1100).
+cluster1@max-freq=850, final capacities are 1024 for cluster0 and
+446 for cluster1 (578*850/1100).
cpus {
#address-cells = <2>;
diff --git a/Documentation/devicetree/bindings/cpu/idle-states.yaml b/Documentation/devicetree/bindings/cpu/idle-states.yaml
new file mode 100644
index 000000000000..b8cc826c9501
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpu/idle-states.yaml
@@ -0,0 +1,855 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cpu/idle-states.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Idle states
+
+maintainers:
+ - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+ - Anup Patel <anup@brainfault.org>
+
+description: |+
+ ==========================================
+ 1 - Introduction
+ ==========================================
+
+ ARM and RISC-V systems contain HW capable of managing power consumption
+ dynamically, where cores can be put in different low-power states (ranging
+ from simple wfi to power gating) according to OS PM policies. The CPU states,
+ representing the range of dynamic idle states that a processor can enter at
+ run-time, can be specified through device tree bindings representing the
+ parameters required to enter/exit specific idle states on a given processor.
+
+ ==========================================
+ 2 - ARM idle states
+ ==========================================
+
+ According to the Server Base System Architecture document (SBSA, [3]), the
+ power states an ARM CPU can be put into are identified by the following list:
+
+ - Running
+ - Idle_standby
+ - Idle_retention
+ - Sleep
+ - Off
+
+ The power states described in the SBSA document define the basic CPU states on
+ top of which ARM platforms implement power management schemes that allow an OS
+ PM implementation to put the processor in different idle states (which include
+ states listed above; "off" state is not an idle state since it does not have
+ wake-up capabilities, hence it is not considered in this document).
+
+ Idle state parameters (e.g. entry latency) are platform specific and need to
+ be characterized with bindings that provide the required information to OS PM
+ code so that it can build the required tables and use them at runtime.
+
+ The device tree binding definition for ARM idle states is the subject of this
+ document.
+
+ ==========================================
+ 3 - RISC-V idle states
+ ==========================================
+
+ On RISC-V systems, the HARTs (or CPUs) [6] can be put in platform specific
+ suspend (or idle) states (ranging from simple WFI to power gating). The
+ RISC-V SBI v0.3 (or higher) [7] hart state management extension provides a
+ standard mechanism for OS to request HART state transitions.
+
+ The platform specific suspend (or idle) states of a hart can be either
+ retentive or non-retentive in nature. A retentive suspend state will
+ preserve HART registers and CSR values for all privilege modes whereas
+ a non-retentive suspend state will not preserve HART registers and CSR
+ values.
+
+ ===========================================
+ 4 - idle-states definitions
+ ===========================================
+
+ Idle states are characterized for a specific system through a set of
+ timing and energy related properties that underline the HW behaviour
+ triggered upon idle states entry and exit.
+
+ The following diagram depicts the CPU execution phases and related timing
+ properties required to enter and exit an idle state:
+
+ ..__[EXEC]__|__[PREP]__|__[ENTRY]__|__[IDLE]__|__[EXIT]__|__[EXEC]__..
+ | | | | |
+
+ |<------ entry ------->|
+ | latency |
+ |<- exit ->|
+ | latency |
+ |<-------- min-residency -------->|
+ |<------- wakeup-latency ------->|
+
+ Diagram 1: CPU idle state execution phases
+
+ EXEC: Normal CPU execution.
+
+ PREP: Preparation phase before committing the hardware to idle mode
+ like cache flushing. This is abortable on pending wake-up
+ event conditions. The abort latency is assumed to be negligible
+ (i.e. less than the ENTRY + EXIT duration). If aborted, CPU
+ goes back to EXEC. This phase is optional. If not abortable,
+ this should be included in the ENTRY phase instead.
+
+ ENTRY: The hardware is committed to idle mode. This period must run
+ to completion up to IDLE before anything else can happen.
+
+ IDLE: This is the actual energy-saving idle period. This may last
+ between 0 and infinite time, until a wake-up event occurs.
+
+ EXIT: Period during which the CPU is brought back to operational
+ mode (EXEC).
+
+ entry-latency: Worst case latency required to enter the idle state. The
+ exit-latency may be guaranteed only after entry-latency has passed.
+
+ min-residency: Minimum period, including preparation and entry, for a given
+ idle state to be worthwhile energywise.
+
+ wakeup-latency: Maximum delay between the signaling of a wake-up event and the
+ CPU being able to execute normal code again. If not specified, this is assumed
+ to be entry-latency + exit-latency.
+
+ These timing parameters can be used by an OS in different circumstances.
+
+ An idle CPU requires the expected min-residency time to select the most
+ appropriate idle state based on the expected expiry time of the next IRQ
+ (i.e. wake-up) that causes the CPU to return to the EXEC phase.
+
+ An operating system scheduler may need to compute the shortest wake-up delay
+ for CPUs in the system by detecting how long it will take to get a CPU out
+ of an idle state, e.g.:
+
+ wakeup-delay = exit-latency + max(entry-latency - (now - entry-timestamp), 0)
+
+ In other words, the scheduler can make its scheduling decision by selecting
+ (e.g. waking-up) the CPU with the shortest wake-up delay.
+ The wake-up delay must take into account the entry latency if that period
+ has not expired. The abortable nature of the PREP period can be ignored
+ if it cannot be relied upon (e.g. the PREP deadline may occur much sooner than
+ the worst case since it depends on the CPU operating conditions, i.e. caches
+ state).
+
+ An OS has to reliably probe the wakeup-latency since some devices require
+ latency constraints to be guaranteed in order to work properly; the OS
+ therefore has to detect the worst case wake-up latency it can incur if a
+ CPU is allowed to enter an idle state, and possibly prevent entry into that
+ state to guarantee reliable device functioning.
+
+ The min-residency time parameter deserves further explanation since it is
+ expressed in time units but must factor in energy consumption coefficients.
+
+ The energy consumption of a cpu when it enters a power state can be roughly
+ characterised by the following graph:
+
+ |
+ |
+ |
+ e |
+ n | /---
+ e | /------
+ r | /------
+ g | /-----
+ y | /------
+ | ----
+ | /|
+ | / |
+ | / |
+ | / |
+ | / |
+ | / |
+ |/ |
+ -----|-------+----------------------------------
+ 0| 1 time(ms)
+
+ Graph 1: Energy vs time example
+
+ The graph is split in two parts delimited by time 1ms on the X-axis.
+ The graph curve with X-axis values = { x | 0 < x < 1ms } has a steep slope
+ and denotes the energy costs incurred while entering and leaving the idle
+ state.
+ The graph curve in the area delimited by X-axis values = {x | x > 1ms } has a
+ shallower slope and essentially represents the energy consumption of the idle
+ state.
+
+ min-residency is defined for a given idle state as the minimum expected
+ residency time for a state (inclusive of preparation and entry) after
+ which choosing that state becomes the most energy efficient option. A good
+ way to visualise this is to take the same graph above and compare the
+ energy consumption plots of several states.
+
+ For the sake of simplicity, let's consider a system with two idle states IDLE1,
+ and IDLE2:
+
+ |
+ |
+ |
+ | /-- IDLE1
+ e | /---
+ n | /----
+ e | /---
+ r | /-----/--------- IDLE2
+ g | /-------/---------
+ y | ------------ /---|
+ | / /---- |
+ | / /--- |
+ | / /---- |
+ | / /--- |
+ | --- |
+ | / |
+ | / |
+ |/ | time
+ ---/----------------------------+------------------------
+ |IDLE1-energy < IDLE2-energy | IDLE2-energy < IDLE1-energy
+ |
+ IDLE2-min-residency
+
+ Graph 2: idle states min-residency example
+
+ In graph 2 above, which takes into account idle state entry/exit energy
+ costs, it is clear that if the idle state residency time (i.e. time till next
+ wake-up IRQ) is less than IDLE2-min-residency, IDLE1 is the better idle state
+ choice energywise.
+
+ This is mainly down to the fact that IDLE1 entry/exit energy costs are lower
+ than IDLE2.
+
+ However, the lower power consumption (i.e. shallower energy curve slope) of
+ idle state IDLE2 implies that after a suitable time, IDLE2 becomes more energy
+ efficient.
+
+ The time at which IDLE2 becomes more energy efficient than IDLE1 (and other
+ shallower states in a system with multiple idle states) is defined as
+ IDLE2-min-residency and corresponds to the time when the energy consumption of
+ IDLE1 and IDLE2 states breaks even.
+
+ The definitions provided in this section underpin the idle states
+ properties specification that is the subject of the following sections.
+
+ ===========================================
+ 5 - idle-states node
+ ===========================================
+
+ The processor idle states are defined within the idle-states node, which is
+ a direct child of the cpus node [1] and provides a container where the
+ processor idle states, defined as device tree nodes, are listed.
+
+ On ARM systems, it is a container of processor idle state nodes. If the
+ system does not provide CPU power management capabilities, or the processor
+ just supports idle_standby, an idle-states node is not required.
+
+ ===========================================
+ 6 - References
+ ===========================================
+
+ [1] ARM Linux Kernel documentation - CPUs bindings
+ Documentation/devicetree/bindings/arm/cpus.yaml
+
+ [2] ARM Linux Kernel documentation - PSCI bindings
+ Documentation/devicetree/bindings/arm/psci.yaml
+
+ [3] ARM Server Base System Architecture (SBSA)
+ http://infocenter.arm.com/help/index.jsp
+
+ [4] ARM Architecture Reference Manuals
+ http://infocenter.arm.com/help/index.jsp
+
+ [5] ARM Linux Kernel documentation - Booting AArch64 Linux
+ Documentation/arm64/booting.rst
+
+ [6] RISC-V Linux Kernel documentation - CPUs bindings
+ Documentation/devicetree/bindings/riscv/cpus.yaml
+
+ [7] RISC-V Supervisor Binary Interface (SBI)
+ http://github.com/riscv/riscv-sbi-doc/riscv-sbi.adoc
+
+properties:
+ $nodename:
+ const: idle-states
+
+ entry-method:
+ description: |
+ Usage and definition depend on ARM architecture version.
+
+ On ARM v8 64-bit this property is required.
+ On ARM 32-bit systems this property is optional.
+
+ This assumes that the "enable-method" property is set to "psci" in the cpu
+ node[5] that is responsible for setting up CPU idle management in the OS
+ implementation.
+ const: psci
+
+patternProperties:
+ "^(cpu|cluster)-":
+ type: object
+ description: |
+ Each state node represents an idle state description and must be defined
+ as follows.
+
+ The idle state entered by executing the wfi instruction (idle_standby
+ SBSA [3][4]) is considered standard on all ARM and RISC-V platforms and
+ therefore must not be listed.
+
+ In addition to the properties listed above, a state node may require
+ additional properties specific to the entry-method defined in the
+ idle-states node. Please refer to the entry-method bindings
+ documentation for property definitions.
+
+ properties:
+ compatible:
+ enum:
+ - arm,idle-state
+ - riscv,idle-state
+
+ arm,psci-suspend-param:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ power_state parameter to pass to the ARM PSCI suspend call.
+
+ Device tree nodes that require usage of the PSCI CPU_SUSPEND function
+ (i.e. idle state nodes with the entry-method property set to "psci")
+ must specify this property.
+
+ riscv,sbi-suspend-param:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ suspend_type parameter to pass to the RISC-V SBI HSM suspend call.
+
+ This property is required in idle state nodes of device trees meant
+ for RISC-V systems. For more details on the suspend_type parameter
+ refer to the SBI specification v0.3 (or higher) [7].
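+
+ Note: per the SBI HSM extension [7], suspend_type values with bit 31
+ clear denote retentive suspend and values with bit 31 set denote
+ non-retentive suspend; the RISC-V example below (0x10000000 vs
+ 0x90000000) follows this convention.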
+
+ local-timer-stop:
+ description:
+ If present, the CPU local timer control logic is lost on state entry;
+ otherwise it is retained.
+ type: boolean
+
+ entry-latency-us:
+ description:
+ Worst case latency in microseconds required to enter the idle state.
+
+ exit-latency-us:
+ description:
+ Worst case latency in microseconds required to exit the idle state.
+ The exit-latency-us duration may be guaranteed only after
+ entry-latency-us has passed.
+
+ min-residency-us:
+ description:
+ Minimum residency duration in microseconds, inclusive of preparation
+ and entry, for this idle state to be considered worthwhile energy-wise
+ (refer to section 2 of this document for a complete description).
+
+ wakeup-latency-us:
+ description: |
+ Maximum delay between the signaling of a wake-up event and the CPU
+ being able to execute normal code again. If omitted, this is assumed
+ to be equal to:
+
+ entry-latency-us + exit-latency-us
+
+ It is important to supply this value on systems where the duration of
+ the PREP phase (see diagram 1, section 2) is non-negligible. In such
+ systems entry-latency-us + exit-latency-us will exceed
+ wakeup-latency-us by this duration.
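+
+ As an illustration (hypothetical numbers): a state with
+ entry-latency-us = <250> and exit-latency-us = <500> that omits this
+ property is assumed to have a worst-case wake-up delay of 750us.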
+
+ idle-state-name:
+ $ref: /schemas/types.yaml#/definitions/string
+ description:
+ A string used as a descriptive name for the idle state.
+
+ additionalProperties: false
+
+ required:
+ - compatible
+ - entry-latency-us
+ - exit-latency-us
+ - min-residency-us
+
+additionalProperties: false
+
+examples:
+ - |
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <2>;
+
+ cpu@0 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x0>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@1 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x1>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10000 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10000>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10001 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10001>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@10101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a57";
+ reg = <0x0 0x10101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_0_0>, <&CPU_SLEEP_0_0>,
+ <&CLUSTER_RETENTION_0>, <&CLUSTER_SLEEP_0>;
+ };
+
+ cpu@100000000 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x0>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100000001 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x1>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100000100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100000101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010000 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10000>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010001 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10001>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ cpu@100010101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x1 0x10101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_RETENTION_1_0>, <&CPU_SLEEP_1_0>,
+ <&CLUSTER_RETENTION_1>, <&CLUSTER_SLEEP_1>;
+ };
+
+ idle-states {
+ entry-method = "psci";
+
+ CPU_RETENTION_0_0: cpu-retention-0-0 {
+ compatible = "arm,idle-state";
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <20>;
+ exit-latency-us = <40>;
+ min-residency-us = <80>;
+ };
+
+ CLUSTER_RETENTION_0: cluster-retention-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <50>;
+ exit-latency-us = <100>;
+ min-residency-us = <250>;
+ wakeup-latency-us = <130>;
+ };
+
+ CPU_SLEEP_0_0: cpu-sleep-0-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <250>;
+ exit-latency-us = <500>;
+ min-residency-us = <950>;
+ };
+
+ CLUSTER_SLEEP_0: cluster-sleep-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <600>;
+ exit-latency-us = <1100>;
+ min-residency-us = <2700>;
+ wakeup-latency-us = <1500>;
+ };
+
+ CPU_RETENTION_1_0: cpu-retention-1-0 {
+ compatible = "arm,idle-state";
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <20>;
+ exit-latency-us = <40>;
+ min-residency-us = <90>;
+ };
+
+ CLUSTER_RETENTION_1: cluster-retention-1 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <50>;
+ exit-latency-us = <100>;
+ min-residency-us = <270>;
+ wakeup-latency-us = <100>;
+ };
+
+ CPU_SLEEP_1_0: cpu-sleep-1-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x0010000>;
+ entry-latency-us = <70>;
+ exit-latency-us = <100>;
+ min-residency-us = <300>;
+ wakeup-latency-us = <150>;
+ };
+
+ CLUSTER_SLEEP_1: cluster-sleep-1 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ arm,psci-suspend-param = <0x1010000>;
+ entry-latency-us = <500>;
+ exit-latency-us = <1200>;
+ min-residency-us = <3500>;
+ wakeup-latency-us = <1300>;
+ };
+ };
+ };
+
+ - |
+ // Example 2 (ARM 32-bit, 8-cpu system, two clusters):
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <1>;
+
+ cpu@0 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x0>;
+ cpu-idle-states = <&cpu_sleep_0_0>, <&cluster_sleep_0>;
+ };
+
+ cpu@1 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x1>;
+ cpu-idle-states = <&cpu_sleep_0_0>, <&cluster_sleep_0>;
+ };
+
+ cpu@2 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x2>;
+ cpu-idle-states = <&cpu_sleep_0_0>, <&cluster_sleep_0>;
+ };
+
+ cpu@3 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a15";
+ reg = <0x3>;
+ cpu-idle-states = <&cpu_sleep_0_0>, <&cluster_sleep_0>;
+ };
+
+ cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x100>;
+ cpu-idle-states = <&cpu_sleep_1_0>, <&cluster_sleep_1>;
+ };
+
+ cpu@101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x101>;
+ cpu-idle-states = <&cpu_sleep_1_0>, <&cluster_sleep_1>;
+ };
+
+ cpu@102 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x102>;
+ cpu-idle-states = <&cpu_sleep_1_0>, <&cluster_sleep_1>;
+ };
+
+ cpu@103 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a7";
+ reg = <0x103>;
+ cpu-idle-states = <&cpu_sleep_1_0>, <&cluster_sleep_1>;
+ };
+
+ idle-states {
+ cpu_sleep_0_0: cpu-sleep-0-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <200>;
+ exit-latency-us = <100>;
+ min-residency-us = <400>;
+ wakeup-latency-us = <250>;
+ };
+
+ cluster_sleep_0: cluster-sleep-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <500>;
+ exit-latency-us = <1500>;
+ min-residency-us = <2500>;
+ wakeup-latency-us = <1700>;
+ };
+
+ cpu_sleep_1_0: cpu-sleep-1-0 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <300>;
+ exit-latency-us = <500>;
+ min-residency-us = <900>;
+ wakeup-latency-us = <600>;
+ };
+
+ cluster_sleep_1: cluster-sleep-1 {
+ compatible = "arm,idle-state";
+ local-timer-stop;
+ entry-latency-us = <800>;
+ exit-latency-us = <2000>;
+ min-residency-us = <6500>;
+ wakeup-latency-us = <2300>;
+ };
+ };
+ };
+
+ - |
+ // Example 3 (RISC-V 64-bit, 4-cpu systems, two clusters):
+
+ cpus {
+ #size-cells = <0>;
+ #address-cells = <1>;
+
+ cpu@0 {
+ device_type = "cpu";
+ compatible = "riscv";
+ reg = <0x0>;
+ riscv,isa = "rv64imafdc";
+ mmu-type = "riscv,sv48";
+ cpu-idle-states = <&CPU_RET_0_0>, <&CPU_NONRET_0_0>,
+ <&CLUSTER_RET_0>, <&CLUSTER_NONRET_0>;
+
+ cpu_intc0: interrupt-controller {
+ #interrupt-cells = <1>;
+ compatible = "riscv,cpu-intc";
+ interrupt-controller;
+ };
+ };
+
+ cpu@1 {
+ device_type = "cpu";
+ compatible = "riscv";
+ reg = <0x1>;
+ riscv,isa = "rv64imafdc";
+ mmu-type = "riscv,sv48";
+ cpu-idle-states = <&CPU_RET_0_0>, <&CPU_NONRET_0_0>,
+ <&CLUSTER_RET_0>, <&CLUSTER_NONRET_0>;
+
+ cpu_intc1: interrupt-controller {
+ #interrupt-cells = <1>;
+ compatible = "riscv,cpu-intc";
+ interrupt-controller;
+ };
+ };
+
+ cpu@10 {
+ device_type = "cpu";
+ compatible = "riscv";
+ reg = <0x10>;
+ riscv,isa = "rv64imafdc";
+ mmu-type = "riscv,sv48";
+ cpu-idle-states = <&CPU_RET_1_0>, <&CPU_NONRET_1_0>,
+ <&CLUSTER_RET_1>, <&CLUSTER_NONRET_1>;
+
+ cpu_intc10: interrupt-controller {
+ #interrupt-cells = <1>;
+ compatible = "riscv,cpu-intc";
+ interrupt-controller;
+ };
+ };
+
+ cpu@11 {
+ device_type = "cpu";
+ compatible = "riscv";
+ reg = <0x11>;
+ riscv,isa = "rv64imafdc";
+ mmu-type = "riscv,sv48";
+ cpu-idle-states = <&CPU_RET_1_0>, <&CPU_NONRET_1_0>,
+ <&CLUSTER_RET_1>, <&CLUSTER_NONRET_1>;
+
+ cpu_intc11: interrupt-controller {
+ #interrupt-cells = <1>;
+ compatible = "riscv,cpu-intc";
+ interrupt-controller;
+ };
+ };
+
+ idle-states {
+ CPU_RET_0_0: cpu-retentive-0-0 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x10000000>;
+ entry-latency-us = <20>;
+ exit-latency-us = <40>;
+ min-residency-us = <80>;
+ };
+
+ CPU_NONRET_0_0: cpu-nonretentive-0-0 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x90000000>;
+ entry-latency-us = <250>;
+ exit-latency-us = <500>;
+ min-residency-us = <950>;
+ };
+
+ CLUSTER_RET_0: cluster-retentive-0 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x11000000>;
+ local-timer-stop;
+ entry-latency-us = <50>;
+ exit-latency-us = <100>;
+ min-residency-us = <250>;
+ wakeup-latency-us = <130>;
+ };
+
+ CLUSTER_NONRET_0: cluster-nonretentive-0 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x91000000>;
+ local-timer-stop;
+ entry-latency-us = <600>;
+ exit-latency-us = <1100>;
+ min-residency-us = <2700>;
+ wakeup-latency-us = <1500>;
+ };
+
+ CPU_RET_1_0: cpu-retentive-1-0 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x10000010>;
+ entry-latency-us = <20>;
+ exit-latency-us = <40>;
+ min-residency-us = <80>;
+ };
+
+ CPU_NONRET_1_0: cpu-nonretentive-1-0 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x90000010>;
+ entry-latency-us = <250>;
+ exit-latency-us = <500>;
+ min-residency-us = <950>;
+ };
+
+ CLUSTER_RET_1: cluster-retentive-1 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x11000010>;
+ local-timer-stop;
+ entry-latency-us = <50>;
+ exit-latency-us = <100>;
+ min-residency-us = <250>;
+ wakeup-latency-us = <130>;
+ };
+
+ CLUSTER_NONRET_1: cluster-nonretentive-1 {
+ compatible = "riscv,idle-state";
+ riscv,sbi-suspend-param = <0x91000010>;
+ local-timer-stop;
+ entry-latency-us = <600>;
+ exit-latency-us = <1100>;
+ min-residency-us = <2700>;
+ wakeup-latency-us = <1500>;
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/cpufreq/apple,cluster-cpufreq.yaml b/Documentation/devicetree/bindings/cpufreq/apple,cluster-cpufreq.yaml
new file mode 100644
index 000000000000..76cb9726660e
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/apple,cluster-cpufreq.yaml
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cpufreq/apple,cluster-cpufreq.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Apple SoC cluster cpufreq device
+
+maintainers:
+ - Hector Martin <marcan@marcan.st>
+
+description: |
+ Apple SoCs (e.g. M1) have a per-cpu-cluster DVFS controller that is part of
+ the cluster management register block. This binding uses the standard
+ operating-points-v2 table to define the CPU performance states, with the
+ opp-level property specifying the hardware p-state index for that level.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - apple,t8103-cluster-cpufreq
+ - apple,t8112-cluster-cpufreq
+ - const: apple,cluster-cpufreq
+ - items:
+ - const: apple,t6000-cluster-cpufreq
+ - const: apple,t8103-cluster-cpufreq
+ - const: apple,cluster-cpufreq
+
+ reg:
+ maxItems: 1
+
+ '#performance-domain-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+ - '#performance-domain-cells'
+
+additionalProperties: false
+
+examples:
+ - |
+ // This example shows a single CPU per domain and 2 domains,
+ // with two p-states per domain.
+ // Shipping hardware has 2-4 CPUs per domain and 2-6 domains.
+ cpus {
+ #address-cells = <2>;
+ #size-cells = <0>;
+
+ cpu@0 {
+ compatible = "apple,icestorm";
+ device_type = "cpu";
+ reg = <0x0 0x0>;
+ operating-points-v2 = <&ecluster_opp>;
+ performance-domains = <&cpufreq_e>;
+ };
+
+ cpu@10100 {
+ compatible = "apple,firestorm";
+ device_type = "cpu";
+ reg = <0x0 0x10100>;
+ operating-points-v2 = <&pcluster_opp>;
+ performance-domains = <&cpufreq_p>;
+ };
+ };
+
+ ecluster_opp: opp-table-0 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp01 {
+ opp-hz = /bits/ 64 <600000000>;
+ opp-level = <1>;
+ clock-latency-ns = <7500>;
+ };
+ opp02 {
+ opp-hz = /bits/ 64 <972000000>;
+ opp-level = <2>;
+ clock-latency-ns = <22000>;
+ };
+ };
+
+ pcluster_opp: opp-table-1 {
+ compatible = "operating-points-v2";
+ opp-shared;
+
+ opp01 {
+ opp-hz = /bits/ 64 <600000000>;
+ opp-level = <1>;
+ clock-latency-ns = <8000>;
+ };
+ opp02 {
+ opp-hz = /bits/ 64 <828000000>;
+ opp-level = <2>;
+ clock-latency-ns = <19000>;
+ };
+ };
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ cpufreq_e: performance-controller@210e20000 {
+ compatible = "apple,t8103-cluster-cpufreq", "apple,cluster-cpufreq";
+ reg = <0x2 0x10e20000 0 0x1000>;
+ #performance-domain-cells = <0>;
+ };
+
+ cpufreq_p: performance-controller@211e20000 {
+ compatible = "apple,t8103-cluster-cpufreq", "apple,cluster-cpufreq";
+ reg = <0x2 0x11e20000 0 0x1000>;
+ #performance-domain-cells = <0>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
index 73470ecd1f12..ce91a9197697 100644
--- a/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
+++ b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
@@ -16,7 +16,7 @@ has been processed. See [2] for more information on the brcm,l2-intc node.
firmware. On some SoCs, this firmware supports DFS and DVFS in addition to
Adaptive Voltage Scaling.
-[2] Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt
+[2] Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.yaml
Node brcm,avs-cpu-data-mem
diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek-hw.yaml
index 9cd42a64b13e..d0aecde2b89b 100644
--- a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek-hw.yaml
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek-hw.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/cpufreq/cpufreq-mediatek-hw.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: MediaTek's CPUFREQ Bindings
+title: MediaTek's CPUFREQ
maintainers:
- Hector Yuan <hector.yuan@mediatek.com>
diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt
index b8233ec91d3d..e0a4ba599abc 100644
--- a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt
@@ -20,6 +20,13 @@ Optional properties:
Vsram to fit SoC specific needs. When absent, the voltage scaling
flow is handled by hardware, hence no software "voltage tracking" is
needed.
+- mediatek,cci:
+ Used to confirm the link status between cpufreq and the MediaTek CCI,
+ since cpufreq and the MediaTek CCI may share the same regulator on some
+ MediaTek SoCs. To prevent a high-frequency, low-voltage condition, this
+ property is needed to make sure the MediaTek CCI is ready.
+ For details of the MediaTek CCI, please refer to
+ Documentation/devicetree/bindings/interconnect/mediatek,cci.yaml
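+ A typical use (illustrative only; the cci node label is assumed) is:
+ mediatek,cci = <&cci>;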
- #cooling-cells:
For details, please refer to
Documentation/devicetree/bindings/thermal/thermal-cooling-devices.yaml
diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt
deleted file mode 100644
index 9299028ee712..000000000000
--- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-Qualcomm Technologies, Inc. CPUFREQ Bindings
-
-CPUFREQ HW is a hardware engine used by some Qualcomm Technologies, Inc. (QTI)
-SoCs to manage frequency in hardware. It is capable of controlling frequency
-for multiple clusters.
-
-Properties:
-- compatible
- Usage: required
- Value type: <string>
- Definition: must be "qcom,cpufreq-hw" or "qcom,cpufreq-epss".
-
-- clocks
- Usage: required
- Value type: <phandle> From common clock binding.
- Definition: clock handle for XO clock and GPLL0 clock.
-
-- clock-names
- Usage: required
- Value type: <string> From common clock binding.
- Definition: must be "xo", "alternate".
-
-- reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: Addresses and sizes for the memory of the HW bases in
- each frequency domain.
-- reg-names
- Usage: Optional
- Value type: <string>
- Definition: Frequency domain name i.e.
- "freq-domain0", "freq-domain1".
-
-- #freq-domain-cells:
- Usage: required.
- Definition: Number of cells in a freqency domain specifier.
-
-* Property qcom,freq-domain
-Devices supporting freq-domain must set their "qcom,freq-domain" property with
-phandle to a cpufreq_hw followed by the Domain ID(0/1) in the CPU DT node.
-
-
-Example:
-
-Example 1: Dual-cluster, Quad-core per cluster. CPUs within a cluster switch
-DCVS state together.
-
-/ {
- cpus {
- #address-cells = <2>;
- #size-cells = <0>;
-
- CPU0: cpu@0 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x0>;
- enable-method = "psci";
- next-level-cache = <&L2_0>;
- qcom,freq-domain = <&cpufreq_hw 0>;
- L2_0: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- L3_0: l3-cache {
- compatible = "cache";
- };
- };
- };
-
- CPU1: cpu@100 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x100>;
- enable-method = "psci";
- next-level-cache = <&L2_100>;
- qcom,freq-domain = <&cpufreq_hw 0>;
- L2_100: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
-
- CPU2: cpu@200 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x200>;
- enable-method = "psci";
- next-level-cache = <&L2_200>;
- qcom,freq-domain = <&cpufreq_hw 0>;
- L2_200: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
-
- CPU3: cpu@300 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x300>;
- enable-method = "psci";
- next-level-cache = <&L2_300>;
- qcom,freq-domain = <&cpufreq_hw 0>;
- L2_300: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
-
- CPU4: cpu@400 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x400>;
- enable-method = "psci";
- next-level-cache = <&L2_400>;
- qcom,freq-domain = <&cpufreq_hw 1>;
- L2_400: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
-
- CPU5: cpu@500 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x500>;
- enable-method = "psci";
- next-level-cache = <&L2_500>;
- qcom,freq-domain = <&cpufreq_hw 1>;
- L2_500: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
-
- CPU6: cpu@600 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x600>;
- enable-method = "psci";
- next-level-cache = <&L2_600>;
- qcom,freq-domain = <&cpufreq_hw 1>;
- L2_600: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
-
- CPU7: cpu@700 {
- device_type = "cpu";
- compatible = "qcom,kryo385";
- reg = <0x0 0x700>;
- enable-method = "psci";
- next-level-cache = <&L2_700>;
- qcom,freq-domain = <&cpufreq_hw 1>;
- L2_700: l2-cache {
- compatible = "cache";
- next-level-cache = <&L3_0>;
- };
- };
- };
-
- soc {
- cpufreq_hw: cpufreq@17d43000 {
- compatible = "qcom,cpufreq-hw";
- reg = <0x17d43000 0x1400>, <0x17d45800 0x1400>;
- reg-names = "freq-domain0", "freq-domain1";
-
- clocks = <&rpmhcc RPMH_CXO_CLK>, <&gcc GPLL0>;
- clock-names = "xo", "alternate";
-
- #freq-domain-cells = <1>;
- };
-}
diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
new file mode 100644
index 000000000000..a6b3bb8fdf33
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
@@ -0,0 +1,362 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cpufreq/cpufreq-qcom-hw.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies, Inc. CPUFREQ
+
+maintainers:
+ - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+
+description: |
+
+ CPUFREQ HW is a hardware engine used by some Qualcomm Technologies, Inc. (QTI)
+ SoCs to manage frequency in hardware. It is capable of controlling frequency
+ for multiple clusters.
+
+properties:
+ compatible:
+ oneOf:
+ - description: v1 of CPUFREQ HW
+ items:
+ - enum:
+ - qcom,qcm2290-cpufreq-hw
+ - qcom,sc7180-cpufreq-hw
+ - qcom,sdm845-cpufreq-hw
+ - qcom,sm6115-cpufreq-hw
+ - qcom,sm6350-cpufreq-hw
+ - qcom,sm8150-cpufreq-hw
+ - const: qcom,cpufreq-hw
+
+ - description: v2 of CPUFREQ HW (EPSS)
+ items:
+ - enum:
+ - qcom,qdu1000-cpufreq-epss
+ - qcom,sa8775p-cpufreq-epss
+ - qcom,sc7280-cpufreq-epss
+ - qcom,sc8280xp-cpufreq-epss
+ - qcom,sm6375-cpufreq-epss
+ - qcom,sm8250-cpufreq-epss
+ - qcom,sm8350-cpufreq-epss
+ - qcom,sm8450-cpufreq-epss
+ - qcom,sm8550-cpufreq-epss
+ - const: qcom,cpufreq-epss
+
+ reg:
+ minItems: 1
+ items:
+ - description: Frequency domain 0 register region
+ - description: Frequency domain 1 register region
+ - description: Frequency domain 2 register region
+
+ reg-names:
+ minItems: 1
+ items:
+ - const: freq-domain0
+ - const: freq-domain1
+ - const: freq-domain2
+
+ clocks:
+ items:
+ - description: XO Clock
+ - description: GPLL0 Clock
+
+ clock-names:
+ items:
+ - const: xo
+ - const: alternate
+
+ interrupts:
+ minItems: 1
+ maxItems: 3
+
+ interrupt-names:
+ minItems: 1
+ items:
+ - const: dcvsh-irq-0
+ - const: dcvsh-irq-1
+ - const: dcvsh-irq-2
+
+ '#freq-domain-cells':
+ const: 1
+
+ '#clock-cells':
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - '#freq-domain-cells'
+
+additionalProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,qcm2290-cpufreq-hw
+ then:
+ properties:
+ reg:
+ minItems: 1
+ maxItems: 1
+
+ reg-names:
+ minItems: 1
+ maxItems: 1
+
+ interrupts:
+ minItems: 1
+ maxItems: 1
+
+ interrupt-names:
+ minItems: 1
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,qdu1000-cpufreq-epss
+ - qcom,sc7180-cpufreq-hw
+ - qcom,sc8280xp-cpufreq-epss
+ - qcom,sdm845-cpufreq-hw
+ - qcom,sm6115-cpufreq-hw
+ - qcom,sm6350-cpufreq-hw
+ - qcom,sm6375-cpufreq-epss
+ then:
+ properties:
+ reg:
+ minItems: 2
+ maxItems: 2
+
+ reg-names:
+ minItems: 2
+ maxItems: 2
+
+ interrupts:
+ minItems: 2
+ maxItems: 2
+
+ interrupt-names:
+ minItems: 2
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7280-cpufreq-epss
+ - qcom,sm8250-cpufreq-epss
+ - qcom,sm8350-cpufreq-epss
+ - qcom,sm8450-cpufreq-epss
+ - qcom,sm8550-cpufreq-epss
+ then:
+ properties:
+ reg:
+ minItems: 3
+ maxItems: 3
+
+ reg-names:
+ minItems: 3
+ maxItems: 3
+
+ interrupts:
+ minItems: 3
+ maxItems: 3
+
+ interrupt-names:
+ minItems: 3
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sm8150-cpufreq-hw
+ then:
+ properties:
+ reg:
+ minItems: 3
+ maxItems: 3
+
+ reg-names:
+ minItems: 3
+ maxItems: 3
+
+ # On some SoCs the Prime core shares the LMH irq with Big cores
+ interrupts:
+ minItems: 2
+ maxItems: 2
+
+ interrupt-names:
+ minItems: 2
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-sdm845.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+
+ // Example 1: Dual-cluster, Quad-core per cluster. CPUs within a cluster
+ // switch DCVS state together.
+ cpus {
+ #address-cells = <2>;
+ #size-cells = <0>;
+
+ CPU0: cpu@0 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x0>;
+ enable-method = "psci";
+ next-level-cache = <&L2_0>;
+ qcom,freq-domain = <&cpufreq_hw 0>;
+ clocks = <&cpufreq_hw 0>;
+ L2_0: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ L3_0: l3-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <3>;
+ };
+ };
+ };
+
+ CPU1: cpu@100 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x100>;
+ enable-method = "psci";
+ next-level-cache = <&L2_100>;
+ qcom,freq-domain = <&cpufreq_hw 0>;
+ clocks = <&cpufreq_hw 0>;
+ L2_100: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+
+ CPU2: cpu@200 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x200>;
+ enable-method = "psci";
+ next-level-cache = <&L2_200>;
+ qcom,freq-domain = <&cpufreq_hw 0>;
+ clocks = <&cpufreq_hw 0>;
+ L2_200: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+
+ CPU3: cpu@300 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x300>;
+ enable-method = "psci";
+ next-level-cache = <&L2_300>;
+ qcom,freq-domain = <&cpufreq_hw 0>;
+ clocks = <&cpufreq_hw 0>;
+ L2_300: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+
+ CPU4: cpu@400 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x400>;
+ enable-method = "psci";
+ next-level-cache = <&L2_400>;
+ qcom,freq-domain = <&cpufreq_hw 1>;
+ clocks = <&cpufreq_hw 1>;
+ L2_400: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+
+ CPU5: cpu@500 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x500>;
+ enable-method = "psci";
+ next-level-cache = <&L2_500>;
+ qcom,freq-domain = <&cpufreq_hw 1>;
+ clocks = <&cpufreq_hw 1>;
+ L2_500: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+
+ CPU6: cpu@600 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x600>;
+ enable-method = "psci";
+ next-level-cache = <&L2_600>;
+ qcom,freq-domain = <&cpufreq_hw 1>;
+ clocks = <&cpufreq_hw 1>;
+ L2_600: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+
+ CPU7: cpu@700 {
+ device_type = "cpu";
+ compatible = "qcom,kryo385";
+ reg = <0x0 0x700>;
+ enable-method = "psci";
+ next-level-cache = <&L2_700>;
+ qcom,freq-domain = <&cpufreq_hw 1>;
+ clocks = <&cpufreq_hw 1>;
+ L2_700: l2-cache {
+ compatible = "cache";
+ cache-unified;
+ cache-level = <2>;
+ next-level-cache = <&L3_0>;
+ };
+ };
+ };
+
+ soc {
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ cpufreq@17d43000 {
+ compatible = "qcom,sdm845-cpufreq-hw", "qcom,cpufreq-hw";
+ reg = <0x17d43000 0x1400>, <0x17d45800 0x1400>;
+ reg-names = "freq-domain0", "freq-domain1";
+
+ clocks = <&rpmhcc RPMH_CXO_CLK>, <&gcc GPLL0>;
+ clock-names = "xo", "alternate";
+
+ #freq-domain-cells = <1>;
+ #clock-cells = <1>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml b/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml
new file mode 100644
index 000000000000..6f5e7904181f
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml
@@ -0,0 +1,204 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cpufreq/qcom-cpufreq-nvmem.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies, Inc. NVMEM CPUFreq
+
+maintainers:
+ - Ilia Lin <ilia.lin@kernel.org>
+
+description: |
+ In certain Qualcomm Technologies, Inc. SoCs such as the QCS404, the CPU
+ supply voltage is dynamically configured by Core Power Reduction (CPR)
+ depending on the current CPU frequency and efuse values.
+ CPR provides a power domain with multiple levels that are selected depending
+ on the CPU OPP in use. The CPUFreq driver sets the CPR power domain level
+ according to the required OPPs defined in the CPU OPP tables.
+
+ For older implementations, efuses are parsed to select the correct OPP
+ table and voltage, and CPR is not supported/used.
+
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,apq8064
+ - qcom,apq8096
+ - qcom,ipq8064
+ - qcom,msm8939
+ - qcom,msm8960
+ - qcom,msm8974
+ - qcom,msm8996
+ - qcom,qcs404
+ required:
+ - compatible
+
+patternProperties:
+ '^opp-table(-[a-z0-9]+)?$':
+ allOf:
+ - if:
+ properties:
+ compatible:
+ const: operating-points-v2-kryo-cpu
+ then:
+ $ref: /schemas/opp/opp-v2-kryo-cpu.yaml#
+
+ - if:
+ properties:
+ compatible:
+ const: operating-points-v2-qcom-level
+ then:
+ $ref: /schemas/opp/opp-v2-qcom-level.yaml#
+
+ unevaluatedProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,qcs404
+
+ then:
+ properties:
+ cpus:
+ type: object
+
+ patternProperties:
+ '^cpu@[0-9a-f]+$':
+ type: object
+
+ properties:
+ power-domains:
+ maxItems: 1
+
+ power-domain-names:
+ items:
+ - const: cpr
+
+ required:
+ - power-domains
+ - power-domain-names
+
+ patternProperties:
+ '^opp-table(-[a-z0-9]+)?$':
+ if:
+ properties:
+ compatible:
+ const: operating-points-v2-kryo-cpu
+ then:
+ patternProperties:
+ '^opp-?[0-9]+$':
+ required:
+ - required-opps
+
+additionalProperties: true
+
+examples:
+ - |
+ / {
+ model = "Qualcomm Technologies, Inc. QCS404 EVB 1000";
+ compatible = "qcom,qcs404-evb-1000", "qcom,qcs404-evb", "qcom,qcs404";
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ CPU0: cpu@100 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x100>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_SLEEP_0>;
+ next-level-cache = <&L2_0>;
+ #cooling-cells = <2>;
+ clocks = <&apcs_glb>;
+ operating-points-v2 = <&cpu_opp_table>;
+ power-domains = <&cpr>;
+ power-domain-names = "cpr";
+ };
+
+ CPU1: cpu@101 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x101>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_SLEEP_0>;
+ next-level-cache = <&L2_0>;
+ #cooling-cells = <2>;
+ clocks = <&apcs_glb>;
+ operating-points-v2 = <&cpu_opp_table>;
+ power-domains = <&cpr>;
+ power-domain-names = "cpr";
+ };
+
+ CPU2: cpu@102 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x102>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_SLEEP_0>;
+ next-level-cache = <&L2_0>;
+ #cooling-cells = <2>;
+ clocks = <&apcs_glb>;
+ operating-points-v2 = <&cpu_opp_table>;
+ power-domains = <&cpr>;
+ power-domain-names = "cpr";
+ };
+
+ CPU3: cpu@103 {
+ device_type = "cpu";
+ compatible = "arm,cortex-a53";
+ reg = <0x103>;
+ enable-method = "psci";
+ cpu-idle-states = <&CPU_SLEEP_0>;
+ next-level-cache = <&L2_0>;
+ #cooling-cells = <2>;
+ clocks = <&apcs_glb>;
+ operating-points-v2 = <&cpu_opp_table>;
+ power-domains = <&cpr>;
+ power-domain-names = "cpr";
+ };
+ };
+
+ cpu_opp_table: opp-table-cpu {
+ compatible = "operating-points-v2-kryo-cpu";
+ opp-shared;
+
+ opp-1094400000 {
+ opp-hz = /bits/ 64 <1094400000>;
+ required-opps = <&cpr_opp1>;
+ };
+ opp-1248000000 {
+ opp-hz = /bits/ 64 <1248000000>;
+ required-opps = <&cpr_opp2>;
+ };
+ opp-1401600000 {
+ opp-hz = /bits/ 64 <1401600000>;
+ required-opps = <&cpr_opp3>;
+ };
+ };
+
+ cpr_opp_table: opp-table-cpr {
+ compatible = "operating-points-v2-qcom-level";
+
+ cpr_opp1: opp1 {
+ opp-level = <1>;
+ qcom,opp-fuse-level = <1>;
+ };
+ cpr_opp2: opp2 {
+ opp-level = <2>;
+ qcom,opp-fuse-level = <2>;
+ };
+ cpr_opp3: opp3 {
+ opp-level = <3>;
+ qcom,opp-fuse-level = <3>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/crypto/allwinner,sun4i-a10-crypto.yaml b/Documentation/devicetree/bindings/crypto/allwinner,sun4i-a10-crypto.yaml
index 0429fb774f10..0401c11da8d9 100644
--- a/Documentation/devicetree/bindings/crypto/allwinner,sun4i-a10-crypto.yaml
+++ b/Documentation/devicetree/bindings/crypto/allwinner,sun4i-a10-crypto.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/crypto/allwinner,sun4i-a10-crypto.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Security System Device Tree Bindings
+title: Allwinner A10 Security System
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -44,6 +44,16 @@ properties:
- const: ahb
- const: mod
+ dmas:
+ items:
+ - description: RX DMA Channel
+ - description: TX DMA Channel
+
+ dma-names:
+ items:
+ - const: rx
+ - const: tx
+
resets:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml
index 00648f9d9278..4287678aa79f 100644
--- a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml
+++ b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml
@@ -14,6 +14,7 @@ properties:
enum:
- allwinner,sun8i-h3-crypto
- allwinner,sun8i-r40-crypto
+ - allwinner,sun20i-d1-crypto
- allwinner,sun50i-a64-crypto
- allwinner,sun50i-h5-crypto
- allwinner,sun50i-h6-crypto
@@ -29,6 +30,7 @@ properties:
- description: Bus clock
- description: Module clock
- description: MBus clock
+ - description: TRNG clock (RC oscillator)
minItems: 2
clock-names:
@@ -36,6 +38,7 @@ properties:
- const: bus
- const: mod
- const: ram
+ - const: trng
minItems: 2
resets:
@@ -44,19 +47,33 @@ properties:
if:
properties:
compatible:
- const: allwinner,sun50i-h6-crypto
+ enum:
+ - allwinner,sun20i-d1-crypto
then:
properties:
clocks:
- minItems: 3
+ minItems: 4
clock-names:
- minItems: 3
+ minItems: 4
else:
- properties:
- clocks:
- maxItems: 2
- clock-names:
- maxItems: 2
+ if:
+ properties:
+ compatible:
+ const: allwinner,sun50i-h6-crypto
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ minItems: 3
+ maxItems: 3
+ else:
+ properties:
+ clocks:
+ maxItems: 2
+ clock-names:
+ maxItems: 2
required:
- compatible
@@ -82,4 +99,3 @@ examples:
clock-names = "bus", "mod";
resets = <&ccu RST_BUS_CE>;
};
-
diff --git a/Documentation/devicetree/bindings/crypto/aspeed,ast2500-hace.yaml b/Documentation/devicetree/bindings/crypto/aspeed,ast2500-hace.yaml
new file mode 100644
index 000000000000..a772d232de09
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/aspeed,ast2500-hace.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/aspeed,ast2500-hace.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED HACE hash and crypto Hardware Accelerator Engines
+
+maintainers:
+ - Neal Liu <neal_liu@aspeedtech.com>
+
+description: |
+ The Hash and Crypto Engine (HACE) is designed to accelerate the throughput
+ of hash data digest, encryption, and decryption. Basically, HACE can be
+ divided into two independent engines - the Hash Engine and the Crypto
+ Engine.
+
+properties:
+ compatible:
+ enum:
+ - aspeed,ast2500-hace
+ - aspeed,ast2600-hace
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - interrupts
+ - resets
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/ast2600-clock.h>
+ hace: crypto@1e6d0000 {
+ compatible = "aspeed,ast2600-hace";
+ reg = <0x1e6d0000 0x200>;
+ interrupts = <4>;
+ clocks = <&syscon ASPEED_CLK_GATE_YCLK>;
+ resets = <&syscon ASPEED_RESET_HACE>;
+ };
diff --git a/Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml b/Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml
new file mode 100644
index 000000000000..b18f178aac06
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/aspeed,ast2600-acry.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/aspeed,ast2600-acry.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED ACRY ECDSA/RSA Hardware Accelerator Engines
+
+maintainers:
+ - Neal Liu <neal_liu@aspeedtech.com>
+
+description:
+ The ACRY ECDSA/RSA engines are designed to accelerate the throughput
+ of ECDSA/RSA signature and verification. Basically, ACRY can be
+ divided into two independent engines - the ECC Engine and the RSA Engine.
+
+properties:
+ compatible:
+ enum:
+ - aspeed,ast2600-acry
+
+ reg:
+ items:
+ - description: acry base address & size
+ - description: acry sram base address & size
+
+ clocks:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/ast2600-clock.h>
+ acry: crypto@1e6fa000 {
+ compatible = "aspeed,ast2600-acry";
+ reg = <0x1e6fa000 0x400>, <0x1e710000 0x1800>;
+ interrupts = <160>;
+ clocks = <&syscon ASPEED_CLK_GATE_RSACLK>;
+ };
diff --git a/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-aes.yaml b/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-aes.yaml
new file mode 100644
index 000000000000..0b7383b3106b
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-aes.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) 2022 Microchip Technology, Inc. and its subsidiaries
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/atmel,at91sam9g46-aes.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Atmel Advanced Encryption Standard (AES) HW cryptographic accelerator
+
+maintainers:
+ - Tudor Ambarus <tudor.ambarus@linaro.org>
+
+properties:
+ compatible:
+ const: atmel,at91sam9g46-aes
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: aes_clk
+
+ dmas:
+ items:
+ - description: TX DMA Channel
+ - description: RX DMA Channel
+
+ dma-names:
+ items:
+ - const: tx
+ - const: rx
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - dmas
+ - dma-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/at91.h>
+ #include <dt-bindings/dma/at91.h>
+
+ aes: crypto@e1810000 {
+ compatible = "atmel,at91sam9g46-aes";
+ reg = <0xe1810000 0x100>;
+ interrupts = <GIC_SPI 27 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&pmc PMC_TYPE_PERIPHERAL 27>;
+ clock-names = "aes_clk";
+ dmas = <&dma0 AT91_XDMAC_DT_PERID(1)>,
+ <&dma0 AT91_XDMAC_DT_PERID(2)>;
+ dma-names = "tx", "rx";
+ };
diff --git a/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-sha.yaml b/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-sha.yaml
new file mode 100644
index 000000000000..ee2ffb034325
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-sha.yaml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) 2022 Microchip Technology, Inc. and its subsidiaries
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/atmel,at91sam9g46-sha.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Atmel Secure Hash Algorithm (SHA) HW cryptographic accelerator
+
+maintainers:
+ - Tudor Ambarus <tudor.ambarus@linaro.org>
+
+properties:
+ compatible:
+ const: atmel,at91sam9g46-sha
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: sha_clk
+
+ dmas:
+ maxItems: 1
+ description: TX DMA Channel
+
+ dma-names:
+ const: tx
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/at91.h>
+ #include <dt-bindings/dma/at91.h>
+
+ sha: crypto@e1814000 {
+ compatible = "atmel,at91sam9g46-sha";
+ reg = <0xe1814000 0x100>;
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&pmc PMC_TYPE_PERIPHERAL 83>;
+ clock-names = "sha_clk";
+ dmas = <&dma0 AT91_XDMAC_DT_PERID(48)>;
+ dma-names = "tx";
+ };
diff --git a/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-tdes.yaml b/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-tdes.yaml
new file mode 100644
index 000000000000..3d6ed24b1b00
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/atmel,at91sam9g46-tdes.yaml
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) 2022 Microchip Technology, Inc. and its subsidiaries
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/atmel,at91sam9g46-tdes.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Atmel Triple Data Encryption Standard (TDES) HW cryptographic accelerator
+
+maintainers:
+ - Tudor Ambarus <tudor.ambarus@linaro.org>
+
+properties:
+ compatible:
+ const: atmel,at91sam9g46-tdes
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: tdes_clk
+
+ dmas:
+ items:
+ - description: TX DMA Channel
+ - description: RX DMA Channel
+
+ dma-names:
+ items:
+ - const: tx
+ - const: rx
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/at91.h>
+ #include <dt-bindings/dma/at91.h>
+
+ tdes: crypto@e2014000 {
+ compatible = "atmel,at91sam9g46-tdes";
+ reg = <0xe2014000 0x100>;
+ interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&pmc PMC_TYPE_PERIPHERAL 96>;
+ clock-names = "tdes_clk";
+ dmas = <&dma0 AT91_XDMAC_DT_PERID(54)>,
+ <&dma0 AT91_XDMAC_DT_PERID(53)>;
+ dma-names = "tx", "rx";
+ };
diff --git a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
deleted file mode 100644
index f2aab3dc2b52..000000000000
--- a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-* Atmel HW cryptographic accelerators
-
-These are the HW cryptographic accelerators found on some Atmel products.
-
-* Advanced Encryption Standard (AES)
-
-Required properties:
-- compatible : Should be "atmel,at91sam9g46-aes".
-- reg: Should contain AES registers location and length.
-- interrupts: Should contain the IRQ line for the AES.
-- dmas: List of two DMA specifiers as described in
- atmel-dma.txt and dma.txt files.
-- dma-names: Contains one identifier string for each DMA specifier
- in the dmas property.
-
-Example:
-aes@f8038000 {
- compatible = "atmel,at91sam9g46-aes";
- reg = <0xf8038000 0x100>;
- interrupts = <43 4 0>;
- dmas = <&dma1 2 18>,
- <&dma1 2 19>;
- dma-names = "tx", "rx";
-
-* Triple Data Encryption Standard (Triple DES)
-
-Required properties:
-- compatible : Should be "atmel,at91sam9g46-tdes".
-- reg: Should contain TDES registers location and length.
-- interrupts: Should contain the IRQ line for the TDES.
-
-Optional properties:
-- dmas: List of two DMA specifiers as described in
- atmel-dma.txt and dma.txt files.
-- dma-names: Contains one identifier string for each DMA specifier
- in the dmas property.
-
-Example:
-tdes@f803c000 {
- compatible = "atmel,at91sam9g46-tdes";
- reg = <0xf803c000 0x100>;
- interrupts = <44 4 0>;
- dmas = <&dma1 2 20>,
- <&dma1 2 21>;
- dma-names = "tx", "rx";
-};
-
-* Secure Hash Algorithm (SHA)
-
-Required properties:
-- compatible : Should be "atmel,at91sam9g46-sha".
-- reg: Should contain SHA registers location and length.
-- interrupts: Should contain the IRQ line for the SHA.
-
-Optional properties:
-- dmas: One DMA specifiers as described in
- atmel-dma.txt and dma.txt files.
-- dma-names: Contains one identifier string for each DMA specifier
- in the dmas property. Only one "tx" string needed.
-
-Example:
-sha@f8034000 {
- compatible = "atmel,at91sam9g46-sha";
- reg = <0xf8034000 0x100>;
- interrupts = <42 4 0>;
- dmas = <&dma1 2 17>;
- dma-names = "tx";
-};
diff --git a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0-mon.yaml b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0-mon.yaml
new file mode 100644
index 000000000000..286dffa0671b
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0-mon.yaml
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2008-2011 Freescale Semiconductor Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/fsl,sec-v4.0-mon.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale Secure Non-Volatile Storage (SNVS)
+
+maintainers:
+ - '"Horia Geantă" <horia.geanta@nxp.com>'
+ - Pankaj Gupta <pankaj.gupta@nxp.com>
+ - Gaurav Jain <gaurav.jain@nxp.com>
+
+description:
+ Node defines the address range and the associated interrupt for the SNVS
+ function. This function monitors security state information and reports
+ security violations. It also includes the RTC, system power off and the
+ ON/OFF key.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: fsl,sec-v4.0-mon
+ - const: syscon
+ - const: simple-mfd
+ - items:
+ - const: fsl,sec-v5.0-mon
+ - const: fsl,sec-v4.0-mon
+ - items:
+ - enum:
+ - fsl,sec-v5.3-mon
+ - fsl,sec-v5.4-mon
+ - const: fsl,sec-v5.0-mon
+ - const: fsl,sec-v4.0-mon
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 2
+
+ snvs-rtc-lp:
+ type: object
+ additionalProperties: false
+ description:
+ Secure Non-Volatile Storage (SNVS) Low Power (LP) RTC Node
+
+ properties:
+ compatible:
+ const: fsl,sec-v4.0-mon-rtc-lp
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: snvs-rtc
+
+ interrupts:
+ # VFxxx has only one. What is the 2nd one?
+ minItems: 1
+ maxItems: 2
+
+ regmap:
+ description: Parent node containing registers
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ offset:
+ description: LP register offset
+ $ref: /schemas/types.yaml#/definitions/uint32
+ default: 0x34
+
+ required:
+ - compatible
+ - interrupts
+ - regmap
+
+ snvs-powerkey:
+ type: object
+ additionalProperties: false
+ description:
+ The snvs-pwrkey is designed to enable the POWER key function, which is
+ controlled by SNVS ONOFF; the driver can report the status of the POWER
+ key and wake up the system if it is pressed after system suspend.
+
+ properties:
+ compatible:
+ const: fsl,sec-v4.0-pwrkey
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: snvs-pwrkey
+
+ interrupts:
+ maxItems: 1
+
+ regmap:
+ description: Parent node containing registers
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ wakeup-source: true
+
+ linux,keycode:
+ default: 116
+
+ required:
+ - compatible
+ - interrupts
+ - regmap
+
+ snvs-lpgpr:
+ $ref: /schemas/nvmem/snvs-lpgpr.yaml#
+
+ snvs-poweroff:
+ description:
+ The SNVS can drive a signal to the PMIC to turn off system power by
+ setting the SNVS_LP LPCR register.
+ $ref: /schemas/power/reset/syscon-poweroff.yaml#
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/imx7d-clock.h>
+
+ sec_mon: sec-mon@314000 {
+ compatible = "fsl,sec-v4.0-mon", "syscon", "simple-mfd";
+ reg = <0x314000 0x1000>;
+
+ snvs-rtc-lp {
+ compatible = "fsl,sec-v4.0-mon-rtc-lp";
+ regmap = <&sec_mon>;
+ offset = <0x34>;
+ clocks = <&clks IMX7D_SNVS_CLK>;
+ clock-names = "snvs-rtc";
+ interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
+ snvs-powerkey {
+ compatible = "fsl,sec-v4.0-pwrkey";
+ regmap = <&sec_mon>;
+ clocks = <&clks IMX7D_SNVS_CLK>;
+ clock-names = "snvs-pwrkey";
+ interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;
+ linux,keycode = <116>; /* KEY_POWER */
+ wakeup-source;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml
new file mode 100644
index 000000000000..0a9ed2848b7c
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/fsl,sec-v4.0.yaml
@@ -0,0 +1,266 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2008-2011 Freescale Semiconductor Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/fsl,sec-v4.0.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale SEC 4
+
+maintainers:
+ - '"Horia Geantă" <horia.geanta@nxp.com>'
+ - Pankaj Gupta <pankaj.gupta@nxp.com>
+ - Gaurav Jain <gaurav.jain@nxp.com>
+
+description: |
+ NOTE: the SEC 4 is also known as Freescale's Cryptographic Accelerator
+ and Assurance Module (CAAM).
+
+ SEC 4 h/w can process requests from 2 types of sources.
+ 1. DPAA Queue Interface (HW interface between Queue Manager & SEC 4).
+ 2. Job Rings (HW interface between cores & SEC 4 registers).
+
+ High Speed Data Path Configuration:
+
+ HW interface between QM & SEC 4 and also BM & SEC 4, on DPAA-enabled parts
+ such as the P4080. The number of simultaneous dequeues the QI can make is
+ equal to the number of Descriptor Controller (DECO) engines in a particular
+ SEC version. E.g., the SEC 4.0 in the P4080 has 5 DECOs and can thus
+ dequeue from 5 subportals simultaneously.
+
+ Job Ring Data Path Configuration:
+
+ Each JR is located on a separate 4k page; they may (or may not) be made
+ visible in the memory partition devoted to a particular core. The P4080
+ has 4 JRs, so up to 4 JRs can be configured; and all 4 JRs process
+ requests in parallel.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: fsl,sec-v5.4
+ - const: fsl,sec-v5.0
+ - const: fsl,sec-v4.0
+ - items:
+ - enum:
+ - fsl,imx6ul-caam
+ - fsl,sec-v5.0
+ - const: fsl,sec-v4.0
+ - const: fsl,sec-v4.0
+
+ reg:
+ maxItems: 1
+
+ ranges:
+ maxItems: 1
+
+ '#address-cells':
+ enum: [1, 2]
+
+ '#size-cells':
+ enum: [1, 2]
+
+ clocks:
+ minItems: 1
+ maxItems: 4
+
+ clock-names:
+ minItems: 1
+ maxItems: 4
+ items:
+ enum: [mem, aclk, ipg, emi_slow]
+
+ dma-coherent: true
+
+ interrupts:
+ maxItems: 1
+
+ fsl,sec-era:
+ description: Defines the 'ERA' of the SEC device.
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+patternProperties:
+ '^jr@[0-9a-f]+$':
+ type: object
+ additionalProperties: false
+ description:
+ Job Ring (JR) Node. Defines data processing interface to SEC 4 across the
+ peripheral bus for purposes of processing cryptographic descriptors. The
+ specified address range can be made visible to one (or more) cores. The
+ interrupt defined for this node is controlled within the address range of
+ this node.
+
+ properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: fsl,sec-v5.4-job-ring
+ - const: fsl,sec-v5.0-job-ring
+ - const: fsl,sec-v4.0-job-ring
+ - items:
+ - const: fsl,sec-v5.0-job-ring
+ - const: fsl,sec-v4.0-job-ring
+ - const: fsl,sec-v4.0-job-ring
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ fsl,liodn:
+ description:
+ Specifies the LIODN to be used in conjunction with the ppid-to-liodn
+ table that specifies the PPID to LIODN mapping. Needed if the PAMU is
+ used. The value is a 12-bit LIODN ID for this JR. This property is
+ normally set by boot firmware.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ maximum: 0xfff
+
+ '^rtic@[0-9a-f]+$':
+ type: object
+ additionalProperties: false
+ description:
+ Run Time Integrity Check (RTIC) Node. Defines a register space that
+ contains up to 5 sets of addresses and their lengths (sizes) that will be
+ checked at run time. After an initial hash result is calculated, these
+ addresses are checked by HW to monitor any change. If any memory is
+ modified, a Security Violation is triggered (see SNVS definition).
+
+ properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: fsl,sec-v5.4-rtic
+ - const: fsl,sec-v5.0-rtic
+ - const: fsl,sec-v4.0-rtic
+ - const: fsl,sec-v4.0-rtic
+
+ reg:
+ maxItems: 1
+
+ ranges:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 1
+
+ patternProperties:
+ '^rtic-[a-z]@[0-9a-f]+$':
+ type: object
+ additionalProperties: false
+ description:
+ Run Time Integrity Check (RTIC) Memory Node defines individual RTIC
+ memory regions that are used to perform run-time integrity checks of
+ memory areas that should not be modified. The node defines a register
+ that contains the memory address & length (combined) and a second
+ register that contains the hash result in big endian format.
+
+ properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: fsl,sec-v5.4-rtic-memory
+ - const: fsl,sec-v5.0-rtic-memory
+ - const: fsl,sec-v4.0-rtic-memory
+ - const: fsl,sec-v4.0-rtic-memory
+
+ reg:
+ items:
+ - description: RTIC memory address
+ - description: RTIC hash result
+
+ fsl,liodn:
+ description:
+ Specifies the LIODN to be used in conjunction with the
+ ppid-to-liodn table that specifies the PPID to LIODN mapping.
+ Needed if the PAMU is used. The value is a 12-bit LIODN ID for
+ this RTIC memory region. This property is normally set by boot
+ firmware.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ maximum: 0xfff
+
+ fsl,rtic-region:
+ description:
+ Specifies the HW address (36 bit address) for this region
+ followed by the length of the HW partition to be checked;
+ the address is represented as a 64 bit quantity followed
+ by a 32 bit length.
+ $ref: /schemas/types.yaml#/definitions/uint32-array
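+ # Encoding sketch with hypothetical values: a region at 36-bit address
+ # 0x1_2345_0000 with a 64 KiB length would be
+ # fsl,rtic-region = <0x00000001 0x23450000 0x00010000>;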
+
+required:
+ - compatible
+ - reg
+ - ranges
+
+additionalProperties: false
+
+examples:
+ - |
+ crypto@300000 {
+ compatible = "fsl,sec-v4.0";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x300000 0x10000>;
+ ranges = <0 0x300000 0x10000>;
+ interrupts = <92 2>;
+
+ jr@1000 {
+ compatible = "fsl,sec-v4.0-job-ring";
+ reg = <0x1000 0x1000>;
+ interrupts = <88 2>;
+ };
+
+ jr@2000 {
+ compatible = "fsl,sec-v4.0-job-ring";
+ reg = <0x2000 0x1000>;
+ interrupts = <89 2>;
+ };
+
+ jr@3000 {
+ compatible = "fsl,sec-v4.0-job-ring";
+ reg = <0x3000 0x1000>;
+ interrupts = <90 2>;
+ };
+
+ jr@4000 {
+ compatible = "fsl,sec-v4.0-job-ring";
+ reg = <0x4000 0x1000>;
+ interrupts = <91 2>;
+ };
+
+ rtic@6000 {
+ compatible = "fsl,sec-v4.0-rtic";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ reg = <0x6000 0x100>;
+ ranges = <0x0 0x6100 0xe00>;
+
+ rtic-a@0 {
+ compatible = "fsl,sec-v4.0-rtic-memory";
+ reg = <0x00 0x20>, <0x100 0x80>;
+ };
+
+ rtic-b@20 {
+ compatible = "fsl,sec-v4.0-rtic-memory";
+ reg = <0x20 0x20>, <0x200 0x80>;
+ };
+
+ rtic-c@40 {
+ compatible = "fsl,sec-v4.0-rtic-memory";
+ reg = <0x40 0x20>, <0x300 0x80>;
+ };
+
+ rtic-d@60 {
+ compatible = "fsl,sec-v4.0-rtic-memory";
+ reg = <0x60 0x20>, <0x500 0x80>;
+ };
+ };
+ };
+...
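For i.MX parts, where the schema's clocks and clock-names entries apply, a
minimal node sketch based on the example carried in the old text binding
(clock indices from the i.MX6 dt-bindings header) looks like:

    crypto@300000 {
        compatible = "fsl,sec-v4.0";
        fsl,sec-era = <2>;
        #address-cells = <1>;
        #size-cells = <1>;
        reg = <0x300000 0x10000>;
        ranges = <0 0x300000 0x10000>;
        interrupts = <92 2>;
        clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
                 <&clks IMX6QDL_CLK_CAAM_ACLK>,
                 <&clks IMX6QDL_CLK_CAAM_IPG>,
                 <&clks IMX6QDL_CLK_EIM_SLOW>;
        clock-names = "mem", "aclk", "ipg", "emi_slow";
    };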
diff --git a/Documentation/devicetree/bindings/crypto/fsl-sec4.txt b/Documentation/devicetree/bindings/crypto/fsl-sec4.txt
deleted file mode 100644
index 8f359f473ada..000000000000
--- a/Documentation/devicetree/bindings/crypto/fsl-sec4.txt
+++ /dev/null
@@ -1,553 +0,0 @@
-=====================================================================
-SEC 4 Device Tree Binding
-Copyright (C) 2008-2011 Freescale Semiconductor Inc.
-
- CONTENTS
- -Overview
- -SEC 4 Node
- -Job Ring Node
- -Run Time Integrity Check (RTIC) Node
- -Run Time Integrity Check (RTIC) Memory Node
- -Secure Non-Volatile Storage (SNVS) Node
- -Secure Non-Volatile Storage (SNVS) Low Power (LP) RTC Node
- -Full Example
-
-NOTE: the SEC 4 is also known as Freescale's Cryptographic Accelerator
-and Assurance Module (CAAM).
-
-=====================================================================
-Overview
-
-DESCRIPTION
-
-SEC 4 h/w can process requests from 2 types of sources.
-1. DPAA Queue Interface (HW interface between Queue Manager & SEC 4).
-2. Job Rings (HW interface between cores & SEC 4 registers).
-
-High Speed Data Path Configuration:
-
-HW interface between QM & SEC 4 and also BM & SEC 4, on DPAA-enabled parts
-such as the P4080. The number of simultaneous dequeues the QI can make is
-equal to the number of Descriptor Controller (DECO) engines in a particular
-SEC version. E.g., the SEC 4.0 in the P4080 has 5 DECOs and can thus
-dequeue from 5 subportals simultaneously.
-
-Job Ring Data Path Configuration:
-
-Each JR is located on a separate 4k page; it may (or may not) be made visible
-in the memory partition devoted to a particular core. The P4080 has 4 JRs, so
-up to 4 JRs can be configured, and all 4 JRs process requests in parallel.
-
-=====================================================================
-SEC 4 Node
-
-Description
-
- Node defines the base address of the SEC 4 block.
- This block specifies the address range of all global
- configuration registers for the SEC 4 block. It
- also receives interrupts from the Run Time Integrity Check
- (RTIC) function within the SEC 4 block.
-
-PROPERTIES
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0"
-
- - fsl,sec-era
- Usage: optional
- Value type: <u32>
- Definition: A standard property. Define the 'ERA' of the SEC
- device.
-
- - #address-cells
- Usage: required
- Value type: <u32>
- Definition: A standard property. Defines the number of cells
- for representing physical addresses in child nodes.
-
- - #size-cells
- Usage: required
- Value type: <u32>
- Definition: A standard property. Defines the number of cells
- for representing the size of physical addresses in
- child nodes.
-
- - reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies the physical
- address and length of the SEC4 configuration registers.
-
- - ranges
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies the physical address
- range of the SEC 4.0 register space (SNVS not included). A
- triplet that includes the child address, parent address, &
- length.
-
- - interrupts
- Usage: required
- Value type: <prop_encoded-array>
- Definition: Specifies the interrupts generated by this
- device. The value of the interrupts property
- consists of one interrupt specifier. The format
- of the specifier is defined by the binding document
- describing the node's interrupt parent.
-
- - clocks
- Usage: required if SEC 4.0 requires explicit enablement of clocks
- Value type: <prop_encoded-array>
- Definition: A list of phandle and clock specifier pairs describing
- the clocks required for enabling and disabling SEC 4.0.
-
- - clock-names
- Usage: required if SEC 4.0 requires explicit enablement of clocks
- Value type: <string>
- Definition: A list of clock name strings in the same order as the
- clocks property.
-
- Note: All other standard properties (see the Devicetree Specification)
- are allowed but are optional.
-
-
-EXAMPLE
-
-iMX6QDL/SX requires four clocks
-
- crypto@300000 {
- compatible = "fsl,sec-v4.0";
- fsl,sec-era = <2>;
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x300000 0x10000>;
- ranges = <0 0x300000 0x10000>;
- interrupt-parent = <&mpic>;
- interrupts = <92 2>;
- clocks = <&clks IMX6QDL_CLK_CAAM_MEM>,
- <&clks IMX6QDL_CLK_CAAM_ACLK>,
- <&clks IMX6QDL_CLK_CAAM_IPG>,
- <&clks IMX6QDL_CLK_EIM_SLOW>;
- clock-names = "mem", "aclk", "ipg", "emi_slow";
- };
-
-
-iMX6UL only requires three clocks
-
- crypto: crypto@2140000 {
- compatible = "fsl,sec-v4.0";
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x2140000 0x3c000>;
- ranges = <0 0x2140000 0x3c000>;
- interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
-
- clocks = <&clks IMX6UL_CLK_CAAM_MEM>,
- <&clks IMX6UL_CLK_CAAM_ACLK>,
- <&clks IMX6UL_CLK_CAAM_IPG>;
- clock-names = "mem", "aclk", "ipg";
- };
-
-=====================================================================
-Job Ring (JR) Node
-
- Child of the crypto node defines data processing interface to SEC 4
- across the peripheral bus for purposes of processing
- cryptographic descriptors. The specified address
- range can be made visible to one (or more) cores.
- The interrupt defined for this node is controlled within
- the address range of this node.
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0-job-ring"
-
- - reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: Specifies two JR parameters: an offset from
- the parent physical address and the length of the JR registers.
-
- - fsl,liodn
- Usage: optional-but-recommended
- Value type: <prop-encoded-array>
- Definition:
- Specifies the LIODN to be used in conjunction with
- the ppid-to-liodn table that specifies the PPID to LIODN mapping.
- Needed if the PAMU is used. The value is a 12-bit LIODN ID
- for this JR. This property is
- normally set by boot firmware.
-
- - interrupts
- Usage: required
- Value type: <prop_encoded-array>
- Definition: Specifies the interrupts generated by this
- device. The value of the interrupts property
- consists of one interrupt specifier. The format
- of the specifier is defined by the binding document
- describing the node's interrupt parent.
-
-EXAMPLE
- jr@1000 {
- compatible = "fsl,sec-v4.0-job-ring";
- reg = <0x1000 0x1000>;
- fsl,liodn = <0x081>;
- interrupt-parent = <&mpic>;
- interrupts = <88 2>;
- };
-
-
-=====================================================================
-Run Time Integrity Check (RTIC) Node
-
- Child node of the crypto node. Defines a register space that
- contains up to 5 sets of addresses and their lengths (sizes) that
- will be checked at run time. After an initial hash result is
- calculated, these addresses are checked by HW to monitor any
- change. If any memory is modified, a Security Violation is
- triggered (see SNVS definition).
-
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0-rtic".
-
- - #address-cells
- Usage: required
- Value type: <u32>
- Definition: A standard property. Defines the number of cells
- for representing physical addresses in child nodes. Must
- have a value of 1.
-
- - #size-cells
- Usage: required
- Value type: <u32>
- Definition: A standard property. Defines the number of cells
- for representing the size of physical addresses in
- child nodes. Must have a value of 1.
-
- - reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies two parameters:
- an offset from the parent physical address and the length of
- the SEC4 registers.
-
- - ranges
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies the physical address
- range of the SEC 4 register space (SNVS not included). A
- triplet that includes the child address, parent address, &
- length.
-
-EXAMPLE
- rtic@6000 {
- compatible = "fsl,sec-v4.0-rtic";
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x6000 0x100>;
- ranges = <0x0 0x6100 0xe00>;
- };
-
-=====================================================================
-Run Time Integrity Check (RTIC) Memory Node
- A child node that defines individual RTIC memory regions that are used to
- perform a run-time integrity check of memory areas that should not be modified.
- The node defines a register that contains the memory address &
- length (combined) and a second register that contains the hash result
- in big endian format.
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0-rtic-memory".
-
- - reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies two parameters:
- an offset from the parent physical address and the length:
-
- 1. The location of the RTIC memory address & length registers.
- 2. The location of the RTIC hash result.
-
- - fsl,rtic-region
- Usage: optional-but-recommended
- Value type: <prop-encoded-array>
- Definition:
- Specifies the HW address (36 bit address) for this region
- followed by the length of the HW partition to be checked;
- the address is represented as a 64 bit quantity followed
- by a 32 bit length.
-
- - fsl,liodn
- Usage: optional-but-recommended
- Value type: <prop-encoded-array>
- Definition:
- Specifies the LIODN to be used in conjunction with
- the ppid-to-liodn table that specifies the PPID to LIODN
- mapping. Needed if the PAMU is used. The value is a 12-bit
- LIODN ID for this RTIC memory region. This
- property is normally set by boot firmware.
-
-EXAMPLE
- rtic-a@0 {
- compatible = "fsl,sec-v4.0-rtic-memory";
- reg = <0x00 0x20 0x100 0x80>;
- fsl,liodn = <0x03c>;
- fsl,rtic-region = <0x12345678 0x12345678 0x12345678>;
- };
-
-=====================================================================
-Secure Non-Volatile Storage (SNVS) Node
-
- Node defines address range and the associated
- interrupt for the SNVS function. This function
- monitors security state information & reports
- security violations. It also includes the RTC, system
- power off and the ON/OFF key.
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0-mon" and "syscon".
-
- - reg
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies the physical
- address and length of the SEC4 configuration
- registers.
-
- - #address-cells
- Usage: required
- Value type: <u32>
- Definition: A standard property. Defines the number of cells
- for representing physical addresses in child nodes. Must
- have a value of 1.
-
- - #size-cells
- Usage: required
- Value type: <u32>
- Definition: A standard property. Defines the number of cells
- for representing the size of physical addresses in
- child nodes. Must have a value of 1.
-
- - ranges
- Usage: required
- Value type: <prop-encoded-array>
- Definition: A standard property. Specifies the physical address
- range of the SNVS register space. A triplet that includes
- the child address, parent address, & length.
-
- - interrupts
- Usage: optional
- Value type: <prop_encoded-array>
- Definition: Specifies the interrupts generated by this
- device. The value of the interrupts property
- consists of one interrupt specifier. The format
- of the specifier is defined by the binding document
- describing the node's interrupt parent.
-
-EXAMPLE
- sec_mon@314000 {
- compatible = "fsl,sec-v4.0-mon", "syscon";
- reg = <0x314000 0x1000>;
- ranges = <0 0x314000 0x1000>;
- interrupt-parent = <&mpic>;
- interrupts = <93 2>;
- };
-
-=====================================================================
-Secure Non-Volatile Storage (SNVS) Low Power (LP) RTC Node
-
- A SNVS child node that defines SNVS LP RTC.
-
- - compatible
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0-mon-rtc-lp".
-
- - interrupts
- Usage: required
- Value type: <prop_encoded-array>
- Definition: Specifies the interrupts generated by this
- device. The value of the interrupts property
- consists of one interrupt specifier. The format
- of the specifier is defined by the binding document
- describing the node's interrupt parent.
-
- - regmap
- Usage: required
- Value type: <phandle>
- Definition: this is a phandle to the register map node.
-
- - offset
- Usage: optional
- Value type: <u32>
- Definition: LP register offset. By default it is 0x34.
-
- - clocks
- Usage: optional, required if SNVS LP RTC requires explicit
- enablement of clocks
- Value type: <prop_encoded-array>
- Definition: a clock specifier describing the clock required for
- enabling and disabling SNVS LP RTC.
-
- - clock-names
- Usage: optional, required if SNVS LP RTC requires explicit
- enablement of clocks
- Value type: <string>
- Definition: clock name string should be "snvs-rtc".
-
-EXAMPLE
- sec_mon_rtc_lp@1 {
- compatible = "fsl,sec-v4.0-mon-rtc-lp";
- interrupts = <93 2>;
- regmap = <&snvs>;
- offset = <0x34>;
- clocks = <&clks IMX7D_SNVS_CLK>;
- clock-names = "snvs-rtc";
- };
-
-=====================================================================
-System ON/OFF key driver
-
- The snvs-pwrkey is designed to enable the POWER key function, which is
- controlled by SNVS ONOFF; the driver can report the status of the POWER
- key and wake up the system if it is pressed after system suspend.
-
- - compatible:
- Usage: required
- Value type: <string>
- Definition: Must include "fsl,sec-v4.0-pwrkey".
-
- - interrupts:
- Usage: required
- Value type: <prop_encoded-array>
- Definition: The SNVS ON/OFF interrupt number to the CPU(s).
-
- - linux,keycode:
- Usage: optional
- Value type: <int>
- Definition: Keycode to emit, KEY_POWER by default.
-
- - wakeup-source:
- Usage: optional
- Value type: <bool>
- Definition: Button can wake-up the system.
-
- - regmap:
- Usage: required
- Value type: <phandle>
- Definition: this is a phandle to the register map node.
-
-EXAMPLE:
- snvs-pwrkey@020cc000 {
- compatible = "fsl,sec-v4.0-pwrkey";
- regmap = <&snvs>;
- interrupts = <0 4 0x4>;
- linux,keycode = <116>; /* KEY_POWER */
- wakeup-source;
- };
-
-=====================================================================
-FULL EXAMPLE
-
- crypto: crypto@300000 {
- compatible = "fsl,sec-v4.0";
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x300000 0x10000>;
- ranges = <0 0x300000 0x10000>;
- interrupt-parent = <&mpic>;
- interrupts = <92 2>;
-
- sec_jr0: jr@1000 {
- compatible = "fsl,sec-v4.0-job-ring";
- reg = <0x1000 0x1000>;
- interrupt-parent = <&mpic>;
- interrupts = <88 2>;
- };
-
- sec_jr1: jr@2000 {
- compatible = "fsl,sec-v4.0-job-ring";
- reg = <0x2000 0x1000>;
- interrupt-parent = <&mpic>;
- interrupts = <89 2>;
- };
-
- sec_jr2: jr@3000 {
- compatible = "fsl,sec-v4.0-job-ring";
- reg = <0x3000 0x1000>;
- interrupt-parent = <&mpic>;
- interrupts = <90 2>;
- };
-
- sec_jr3: jr@4000 {
- compatible = "fsl,sec-v4.0-job-ring";
- reg = <0x4000 0x1000>;
- interrupt-parent = <&mpic>;
- interrupts = <91 2>;
- };
-
- rtic@6000 {
- compatible = "fsl,sec-v4.0-rtic";
- #address-cells = <1>;
- #size-cells = <1>;
- reg = <0x6000 0x100>;
- ranges = <0x0 0x6100 0xe00>;
-
- rtic_a: rtic-a@0 {
- compatible = "fsl,sec-v4.0-rtic-memory";
- reg = <0x00 0x20 0x100 0x80>;
- };
-
- rtic_b: rtic-b@20 {
- compatible = "fsl,sec-v4.0-rtic-memory";
- reg = <0x20 0x20 0x200 0x80>;
- };
-
- rtic_c: rtic-c@40 {
- compatible = "fsl,sec-v4.0-rtic-memory";
- reg = <0x40 0x20 0x300 0x80>;
- };
-
- rtic_d: rtic-d@60 {
- compatible = "fsl,sec-v4.0-rtic-memory";
- reg = <0x60 0x20 0x500 0x80>;
- };
- };
- };
-
- sec_mon: sec_mon@314000 {
- compatible = "fsl,sec-v4.0-mon";
- reg = <0x314000 0x1000>;
- ranges = <0 0x314000 0x1000>;
-
- sec_mon_rtc_lp@34 {
- compatible = "fsl,sec-v4.0-mon-rtc-lp";
- regmap = <&sec_mon>;
- offset = <0x34>;
- interrupts = <93 2>;
- clocks = <&clks IMX7D_SNVS_CLK>;
- clock-names = "snvs-rtc";
- };
-
- snvs-pwrkey@020cc000 {
- compatible = "fsl,sec-v4.0-pwrkey";
- regmap = <&sec_mon>;
- interrupts = <0 4 0x4>;
- linux,keycode = <116>; /* KEY_POWER */
- wakeup-source;
- };
- };
-
-=====================================================================
diff --git a/Documentation/devicetree/bindings/crypto/intel,ixp4xx-crypto.yaml b/Documentation/devicetree/bindings/crypto/intel,ixp4xx-crypto.yaml
index 9c53c27bd20a..e0fe63957888 100644
--- a/Documentation/devicetree/bindings/crypto/intel,ixp4xx-crypto.yaml
+++ b/Documentation/devicetree/bindings/crypto/intel,ixp4xx-crypto.yaml
@@ -22,19 +22,28 @@ properties:
intel,npe-handle:
$ref: '/schemas/types.yaml#/definitions/phandle-array'
- maxItems: 1
+ items:
+ - items:
+ - description: phandle to the NPE this crypto engine is using
+ - description: the NPE instance number
description: phandle to the NPE this crypto engine is using, the cell
describing the NPE instance to be used.
queue-rx:
$ref: /schemas/types.yaml#/definitions/phandle-array
- maxItems: 1
+ items:
+ - items:
+ - description: phandle to the RX queue on the NPE
+ - description: the queue instance number
description: phandle to the RX queue on the NPE, the cell describing
the queue instance to be used.
queue-txready:
$ref: /schemas/types.yaml#/definitions/phandle-array
- maxItems: 1
+ items:
+ - items:
+ - description: phandle to the TX READY queue on the NPE
+ - description: the queue instance number
description: phandle to the TX READY queue on the NPE, the cell describing
the queue instance to be used.
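For context, a consumer node wiring up these two-cell specifiers might look
as follows (a sketch; the NPE and queue numbers mirror those used in typical
IXP4xx device trees and are illustrative):

    crypto {
        compatible = "intel,ixp4xx-crypto";
        intel,npe-handle = <&npe 2>;
        queue-rx = <&qmgr 30>;
        queue-txready = <&qmgr 29>;
    };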
diff --git a/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-aes.yaml b/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-aes.yaml
index ee2c099981b2..fedd8be56ad6 100644
--- a/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-aes.yaml
+++ b/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-aes.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/crypto/intel,keembay-ocs-aes.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Intel Keem Bay OCS AES Device Tree Bindings
+title: Intel Keem Bay OCS AES
maintainers:
- Daniele Alessandrelli <daniele.alessandrelli@intel.com>
diff --git a/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-ecc.yaml b/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-ecc.yaml
new file mode 100644
index 000000000000..2bb95247b64f
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-ecc.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/intel,keembay-ocs-ecc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Intel Keem Bay OCS ECC
+
+maintainers:
+ - Daniele Alessandrelli <daniele.alessandrelli@intel.com>
+ - Prabhjot Khurana <prabhjot.khurana@intel.com>
+
+description:
+ The Intel Keem Bay Offload and Crypto Subsystem (OCS) Elliptic Curve
+ Cryptography (ECC) device provides hardware acceleration for elliptic curve
+ cryptography using the NIST P-256 and NIST P-384 elliptic curves.
+
+properties:
+ compatible:
+ const: intel,keembay-ocs-ecc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ crypto@30001000 {
+ compatible = "intel,keembay-ocs-ecc";
+ reg = <0x30001000 0x1000>;
+ interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&scmi_clk 95>;
+ };
diff --git a/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml b/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml
index acb92706d280..46e2853ab8f4 100644
--- a/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml
+++ b/Documentation/devicetree/bindings/crypto/intel,keembay-ocs-hcu.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/crypto/intel,keembay-ocs-hcu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Intel Keem Bay OCS HCU Device Tree Bindings
+title: Intel Keem Bay OCS HCU
maintainers:
- Declan Murphy <declan.murphy@intel.com>
diff --git a/Documentation/devicetree/bindings/crypto/qcom,inline-crypto-engine.yaml b/Documentation/devicetree/bindings/crypto/qcom,inline-crypto-engine.yaml
new file mode 100644
index 000000000000..92e1d76e29ee
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/qcom,inline-crypto-engine.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/qcom,inline-crypto-engine.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies, Inc. (QTI) Inline Crypto Engine
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - qcom,sm8550-inline-crypto-engine
+ - const: qcom,inline-crypto-engine
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm8550-gcc.h>
+
+ crypto@1d88000 {
+ compatible = "qcom,sm8550-inline-crypto-engine",
+ "qcom,inline-crypto-engine";
+ reg = <0x01d88000 0x8000>;
+ clocks = <&gcc GCC_UFS_PHY_ICE_CORE_CLK>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/crypto/qcom,prng.txt b/Documentation/devicetree/bindings/crypto/qcom,prng.txt
deleted file mode 100644
index 7ee0e9eac973..000000000000
--- a/Documentation/devicetree/bindings/crypto/qcom,prng.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Qualcomm MSM pseudo random number generator.
-
-Required properties:
-
-- compatible : should be "qcom,prng" for 8916 etc
- : should be "qcom,prng-ee" for 8996 and later using EE
- (Execution Environment) slice of prng
-- reg : specifies base physical address and size of the registers map
-- clocks : phandle to clock-controller plus clock-specifier pair
-- clock-names : "core" clocks all registers, FIFO and circuits in PRNG IP block
-
-Example:
-
- rng@f9bff000 {
- compatible = "qcom,prng";
- reg = <0xf9bff000 0x200>;
- clocks = <&clock GCC_PRNG_AHB_CLK>;
- clock-names = "core";
- };
diff --git a/Documentation/devicetree/bindings/crypto/qcom,prng.yaml b/Documentation/devicetree/bindings/crypto/qcom,prng.yaml
new file mode 100644
index 000000000000..bb42f4588b40
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/qcom,prng.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/qcom,prng.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Pseudo Random Number Generator
+
+maintainers:
+ - Vinod Koul <vkoul@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - qcom,prng # 8916 etc.
+ - qcom,prng-ee # 8996 and later using EE
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: core
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ rng@f9bff000 {
+ compatible = "qcom,prng";
+ reg = <0xf9bff000 0x200>;
+ clocks = <&clk 125>;
+ clock-names = "core";
+ };
diff --git a/Documentation/devicetree/bindings/crypto/qcom-qce.txt b/Documentation/devicetree/bindings/crypto/qcom-qce.txt
deleted file mode 100644
index fdd53b184ba8..000000000000
--- a/Documentation/devicetree/bindings/crypto/qcom-qce.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-Qualcomm crypto engine driver
-
-Required properties:
-
-- compatible : should be "qcom,crypto-v5.1"
-- reg : specifies base physical address and size of the registers map
-- clocks : phandle to clock-controller plus clock-specifier pair
-- clock-names : "iface" clocks register interface
- "bus" clocks data transfer interface
- "core" clocks rest of the crypto block
-- dmas : DMA specifiers for tx and rx dma channels. For more see
- Documentation/devicetree/bindings/dma/dma.txt
-- dma-names : DMA request names should be "rx" and "tx"
-
-Example:
- crypto@fd45a000 {
- compatible = "qcom,crypto-v5.1";
- reg = <0xfd45a000 0x6000>;
- clocks = <&gcc GCC_CE2_AHB_CLK>,
- <&gcc GCC_CE2_AXI_CLK>,
- <&gcc GCC_CE2_CLK>;
- clock-names = "iface", "bus", "core";
- dmas = <&cryptobam 2>, <&cryptobam 3>;
- dma-names = "rx", "tx";
- };
diff --git a/Documentation/devicetree/bindings/crypto/qcom-qce.yaml b/Documentation/devicetree/bindings/crypto/qcom-qce.yaml
new file mode 100644
index 000000000000..e375bd981300
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/qcom-qce.yaml
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/qcom-qce.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm crypto engine driver
+
+maintainers:
+ - Bhupesh Sharma <bhupesh.sharma@linaro.org>
+
+description:
+ This document defines the binding for the QCE crypto
+ controller found on Qualcomm parts.
+
+properties:
+ compatible:
+ oneOf:
+ - const: qcom,crypto-v5.1
+ deprecated: true
+ description: Kept only for ABI backward compatibility
+
+ - const: qcom,crypto-v5.4
+ deprecated: true
+ description: Kept only for ABI backward compatibility
+
+ - items:
+ - enum:
+ - qcom,ipq6018-qce
+ - qcom,ipq8074-qce
+ - qcom,msm8996-qce
+ - qcom,sdm845-qce
+ - const: qcom,ipq4019-qce
+ - const: qcom,qce
+
+ - items:
+ - enum:
+ - qcom,sm8250-qce
+ - qcom,sm8350-qce
+ - qcom,sm8450-qce
+ - qcom,sm8550-qce
+ - const: qcom,sm8150-qce
+ - const: qcom,qce
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: iface clock, clocking the register interface.
+ - description: bus clock, clocking the data transfer interface.
+ - description: core clock, clocking the rest of the crypto block.
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: core
+
+ iommus:
+ minItems: 1
+ maxItems: 8
+ description:
+ phandle to apps_smmu node with sid mask.
+
+ interconnects:
+ maxItems: 1
+ description:
+ Interconnect path between qce crypto and main memory.
+
+ interconnect-names:
+ const: memory
+
+ dmas:
+ items:
+ - description: DMA specifiers for rx dma channel.
+ - description: DMA specifiers for tx dma channel.
+
+ dma-names:
+ items:
+ - const: rx
+ - const: tx
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,crypto-v5.1
+ - qcom,crypto-v5.4
+ - qcom,ipq4019-qce
+
+ then:
+ required:
+ - clocks
+ - clock-names
+
+required:
+ - compatible
+ - reg
+ - dmas
+ - dma-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-apq8084.h>
+ crypto-engine@fd45a000 {
+ compatible = "qcom,ipq6018-qce", "qcom,ipq4019-qce", "qcom,qce";
+ reg = <0xfd45a000 0x6000>;
+ clocks = <&gcc GCC_CE2_AHB_CLK>,
+ <&gcc GCC_CE2_AXI_CLK>,
+ <&gcc GCC_CE2_CLK>;
+ clock-names = "iface", "bus", "core";
+ dmas = <&cryptobam 2>, <&cryptobam 3>;
+ dma-names = "rx", "tx";
+ iommus = <&apps_smmu 0x584 0x0011>,
+ <&apps_smmu 0x586 0x0011>,
+ <&apps_smmu 0x594 0x0011>,
+ <&apps_smmu 0x596 0x0011>;
+ };
diff --git a/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml b/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml
new file mode 100644
index 000000000000..f1a9da8bff7a
--- /dev/null
+++ b/Documentation/devicetree/bindings/crypto/rockchip,rk3288-crypto.yaml
@@ -0,0 +1,127 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/crypto/rockchip,rk3288-crypto.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip Electronics Security Accelerator
+
+maintainers:
+ - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3288-crypto
+ - rockchip,rk3328-crypto
+ - rockchip,rk3399-crypto
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 3
+ maxItems: 4
+
+ clock-names:
+ minItems: 3
+ maxItems: 4
+
+ resets:
+ minItems: 1
+ maxItems: 3
+
+ reset-names:
+ minItems: 1
+ maxItems: 3
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3288-crypto
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ clock-names:
+ items:
+ - const: aclk
+ - const: hclk
+ - const: sclk
+ - const: apb_pclk
+ resets:
+ maxItems: 1
+ reset-names:
+ items:
+ - const: crypto-rst
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3328-crypto
+ then:
+ properties:
+ clocks:
+ maxItems: 3
+ clock-names:
+ items:
+ - const: hclk_master
+ - const: hclk_slave
+ - const: sclk
+ resets:
+ maxItems: 1
+ reset-names:
+ items:
+ - const: crypto-rst
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3399-crypto
+ then:
+ properties:
+ clocks:
+ maxItems: 3
+ clock-names:
+ items:
+ - const: hclk_master
+ - const: hclk_slave
+ - const: sclk
+ resets:
+ minItems: 3
+ reset-names:
+ items:
+ - const: master
+ - const: slave
+ - const: crypto-rst
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/rk3288-cru.h>
+ crypto@ff8a0000 {
+ compatible = "rockchip,rk3288-crypto";
+ reg = <0xff8a0000 0x4000>;
+ interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cru ACLK_CRYPTO>, <&cru HCLK_CRYPTO>,
+ <&cru SCLK_CRYPTO>, <&cru ACLK_DMAC1>;
+ clock-names = "aclk", "hclk", "sclk", "apb_pclk";
+ resets = <&cru SRST_CRYPTO>;
+ reset-names = "crypto-rst";
+ };
diff --git a/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt b/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt
deleted file mode 100644
index 5e2ba385b8c9..000000000000
--- a/Documentation/devicetree/bindings/crypto/rockchip-crypto.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-Rockchip Electronics Security Accelerator
-
-Required properties:
-- compatible: Should be "rockchip,rk3288-crypto"
-- reg: Base physical address of the engine and length of memory mapped
- region
-- interrupts: Interrupt number
-- clocks: Reference to the clocks about crypto
-- clock-names: "aclk" used to clock data
- "hclk" used to clock data
- "sclk" used to clock crypto accelerator
- "apb_pclk" used to clock dma
-- resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
-- reset-names: Must include the name "crypto-rst".
-
-Examples:
-
- crypto: crypto-controller@ff8a0000 {
- compatible = "rockchip,rk3288-crypto";
- reg = <0xff8a0000 0x4000>;
- interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cru ACLK_CRYPTO>, <&cru HCLK_CRYPTO>,
- <&cru SCLK_CRYPTO>, <&cru ACLK_DMAC1>;
- clock-names = "aclk", "hclk", "sclk", "apb_pclk";
- resets = <&cru SRST_CRYPTO>;
- reset-names = "crypto-rst";
- };
diff --git a/Documentation/devicetree/bindings/crypto/samsung-slimsss.yaml b/Documentation/devicetree/bindings/crypto/samsung-slimsss.yaml
index 676950bb7b37..5b31891c97fe 100644
--- a/Documentation/devicetree/bindings/crypto/samsung-slimsss.yaml
+++ b/Documentation/devicetree/bindings/crypto/samsung-slimsss.yaml
@@ -24,7 +24,6 @@ properties:
maxItems: 1
clocks:
- minItems: 2
maxItems: 2
clock-names:
diff --git a/Documentation/devicetree/bindings/crypto/st,stm32-crc.yaml b/Documentation/devicetree/bindings/crypto/st,stm32-crc.yaml
index cee624c14f07..50b2c2e0c3cd 100644
--- a/Documentation/devicetree/bindings/crypto/st,stm32-crc.yaml
+++ b/Documentation/devicetree/bindings/crypto/st,stm32-crc.yaml
@@ -4,10 +4,10 @@
$id: http://devicetree.org/schemas/crypto/st,stm32-crc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 CRC bindings
+title: STMicroelectronics STM32 CRC
maintainers:
- - Lionel Debieve <lionel.debieve@st.com>
+ - Lionel Debieve <lionel.debieve@foss.st.com>
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml b/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml
index a4574552502a..0ddeb8a9a7a0 100644
--- a/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml
+++ b/Documentation/devicetree/bindings/crypto/st,stm32-cryp.yaml
@@ -4,14 +4,20 @@
$id: http://devicetree.org/schemas/crypto/st,stm32-cryp.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 CRYP bindings
+title: STMicroelectronics STM32 CRYP
+
+description: The STM32 CRYP block is built on the CRYP block found in
+ the STn8820 SoC introduced in 2007, and subsequently used in the U8500
+ SoC in 2010.
maintainers:
- - Lionel Debieve <lionel.debieve@st.com>
+ - Lionel Debieve <lionel.debieve@foss.st.com>
properties:
compatible:
enum:
+ - st,stn8820-cryp
+ - stericsson,ux500-cryp
- st,stm32f756-cryp
- st,stm32mp1-cryp
@@ -27,6 +33,19 @@ properties:
resets:
maxItems: 1
+ dmas:
+ items:
+ - description: mem2cryp DMA channel
+ - description: cryp2mem DMA channel
+
+ dma-names:
+ items:
+ - const: mem2cryp
+ - const: cryp2mem
+
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
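A node using the new DMA properties might look like this (a sketch only; the
register address, interrupt, clock, reset, and DMA specifier cells are
placeholders whose real values depend on the SoC and its DMA controller
binding):

    cryp@54001000 {
        compatible = "st,stm32mp1-cryp";
        reg = <0x54001000 0x400>;
        interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&rcc CRYP1>;
        resets = <&rcc CRYP1_R>;
        dmas = <&mdma 0 0 0>, <&mdma 1 0 0>; /* placeholder specifiers */
        dma-names = "mem2cryp", "cryp2mem";
    };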
diff --git a/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml b/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml
index 6dd658f0912c..b767ec72a999 100644
--- a/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml
+++ b/Documentation/devicetree/bindings/crypto/st,stm32-hash.yaml
@@ -4,14 +4,20 @@
$id: http://devicetree.org/schemas/crypto/st,stm32-hash.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 HASH bindings
+title: STMicroelectronics STM32 HASH
+
+description: The STM32 HASH block is built on the HASH block found in
+ the STn8820 SoC introduced in 2007, and subsequently used in the U8500
+ SoC in 2010.
maintainers:
- - Lionel Debieve <lionel.debieve@st.com>
+ - Lionel Debieve <lionel.debieve@foss.st.com>
properties:
compatible:
enum:
+ - st,stn8820-hash
+ - stericsson,ux500-hash
- st,stm32f456-hash
- st,stm32f756-hash
@@ -41,11 +47,26 @@ properties:
maximum: 2
default: 0
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
- clocks
- - interrupts
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ items:
+ const: stericsson,ux500-hash
+ then:
+ properties:
+ interrupts: false
+ else:
+ required:
+ - interrupts
additionalProperties: false
diff --git a/Documentation/devicetree/bindings/crypto/ti,sa2ul.yaml b/Documentation/devicetree/bindings/crypto/ti,sa2ul.yaml
index a410d2cedde6..77ec8bc70bf7 100644
--- a/Documentation/devicetree/bindings/crypto/ti,sa2ul.yaml
+++ b/Documentation/devicetree/bindings/crypto/ti,sa2ul.yaml
@@ -15,6 +15,7 @@ properties:
- ti,j721e-sa2ul
- ti,am654-sa2ul
- ti,am64-sa2ul
+ - ti,am62-sa3ul
reg:
maxItems: 1
@@ -25,8 +26,8 @@ properties:
dmas:
items:
- description: TX DMA Channel
- - description: RX DMA Channel #1
- - description: RX DMA Channel #2
+ - description: 'RX DMA Channel #1'
+ - description: 'RX DMA Channel #2'
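+ # (quoted so the '#' is not parsed as the start of a YAML comment)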
dma-names:
items:
@@ -34,8 +35,6 @@ properties:
- const: rx1
- const: rx2
- dma-coherent: true
-
"#address-cells":
const: 2
@@ -71,16 +70,6 @@ required:
- dmas
- dma-names
-if:
- properties:
- compatible:
- enum:
- - ti,j721e-sa2ul
- - ti,am654-sa2ul
-then:
- required:
- - dma-coherent
-
additionalProperties: false
examples:
@@ -94,5 +83,4 @@ examples:
dmas = <&main_udmap 0xc000>, <&main_udmap 0x4000>,
<&main_udmap 0x4001>;
dma-names = "tx", "rx1", "rx2";
- dma-coherent;
};
diff --git a/Documentation/devicetree/bindings/crypto/xlnx,zynqmp-aes.yaml b/Documentation/devicetree/bindings/crypto/xlnx,zynqmp-aes.yaml
index 55dd6e3d270d..9e8fbd02b150 100644
--- a/Documentation/devicetree/bindings/crypto/xlnx,zynqmp-aes.yaml
+++ b/Documentation/devicetree/bindings/crypto/xlnx,zynqmp-aes.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/crypto/xlnx,zynqmp-aes.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Xilinx ZynqMP AES-GCM Hardware Accelerator Device Tree Bindings
+title: Xilinx ZynqMP AES-GCM Hardware Accelerator
maintainers:
- Kalyani Akula <kalyani.akula@xilinx.com>
diff --git a/Documentation/devicetree/bindings/ddr/lpddr2-timings.txt b/Documentation/devicetree/bindings/ddr/lpddr2-timings.txt
deleted file mode 100644
index 9ceb19e0c7fd..000000000000
--- a/Documentation/devicetree/bindings/ddr/lpddr2-timings.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-* AC timing parameters of LPDDR2(JESD209-2) memories for a given speed-bin
-
-Required properties:
-- compatible : Should be "jedec,lpddr2-timings"
-- min-freq : minimum DDR clock frequency for the speed-bin. Type is <u32>
-- max-freq : maximum DDR clock frequency for the speed-bin. Type is <u32>
-
-Optional properties:
-
-The following properties represent AC timing parameters from the memory
-data-sheet of the device for a given speed-bin. All these properties are
-of type <u32> and the default unit is ps (pico seconds). Parameters with
-a different unit have a suffix indicating the unit such as 'tRAS-max-ns'
-- tRCD
-- tWR
-- tRAS-min
-- tRRD
-- tWTR
-- tXP
-- tRTP
-- tDQSCK-max
-- tFAW
-- tZQCS
-- tZQinit
-- tRPab
-- tZQCL
-- tCKESR
-- tRAS-max-ns
-- tDQSCK-max-derated
-
-Example:
-
-timings_elpida_ECB240ABACN_400mhz: lpddr2-timings@0 {
- compatible = "jedec,lpddr2-timings";
- min-freq = <10000000>;
- max-freq = <400000000>;
- tRPab = <21000>;
- tRCD = <18000>;
- tWR = <15000>;
- tRAS-min = <42000>;
- tRRD = <10000>;
- tWTR = <7500>;
- tXP = <7500>;
- tRTP = <7500>;
- tCKESR = <15000>;
- tDQSCK-max = <5500>;
- tFAW = <50000>;
- tZQCS = <90000>;
- tZQCL = <360000>;
- tZQinit = <1000000>;
- tRAS-max-ns = <70000>;
-};
diff --git a/Documentation/devicetree/bindings/ddr/lpddr2.txt b/Documentation/devicetree/bindings/ddr/lpddr2.txt
deleted file mode 100644
index ddd40121e6f6..000000000000
--- a/Documentation/devicetree/bindings/ddr/lpddr2.txt
+++ /dev/null
@@ -1,102 +0,0 @@
-* LPDDR2 SDRAM memories compliant to JEDEC JESD209-2
-
-Required properties:
-- compatible : Should be one of - "jedec,lpddr2-nvm", "jedec,lpddr2-s2",
- "jedec,lpddr2-s4"
-
- "ti,jedec-lpddr2-s2" should be listed if the memory part is LPDDR2-S2 type
-
- "ti,jedec-lpddr2-s4" should be listed if the memory part is LPDDR2-S4 type
-
- "ti,jedec-lpddr2-nvm" should be listed if the memory part is LPDDR2-NVM type
-
-- density : <u32> representing density in Mb (Mega bits)
-
-- io-width : <u32> representing bus width. Possible values are 8, 16, and 32
-
-Optional properties:
-
-The following optional properties represent the minimum value of some AC
-timing parameters of the DDR device in terms of number of clock cycles.
-These values shall be obtained from the device data-sheet.
-- tRRD-min-tck
-- tWTR-min-tck
-- tXP-min-tck
-- tRTP-min-tck
-- tCKE-min-tck
-- tRPab-min-tck
-- tRCD-min-tck
-- tWR-min-tck
-- tRASmin-min-tck
-- tCKESR-min-tck
-- tFAW-min-tck
-
-Child nodes:
-- The lpddr2 node may have one or more child nodes of type "lpddr2-timings".
- "lpddr2-timings" provides AC timing parameters of the device for
- a given speed-bin. The user may provide the timings for as many
- speed-bins as is required. Please see Documentation/devicetree/
- bindings/ddr/lpddr2-timings.txt for more information on "lpddr2-timings"
-
-Example:
-
-elpida_ECB240ABACN : lpddr2 {
- compatible = "Elpida,ECB240ABACN","jedec,lpddr2-s4";
- density = <2048>;
- io-width = <32>;
-
- tRPab-min-tck = <3>;
- tRCD-min-tck = <3>;
- tWR-min-tck = <3>;
- tRASmin-min-tck = <3>;
- tRRD-min-tck = <2>;
- tWTR-min-tck = <2>;
- tXP-min-tck = <2>;
- tRTP-min-tck = <2>;
- tCKE-min-tck = <3>;
- tCKESR-min-tck = <3>;
- tFAW-min-tck = <8>;
-
- timings_elpida_ECB240ABACN_400mhz: lpddr2-timings@0 {
- compatible = "jedec,lpddr2-timings";
- min-freq = <10000000>;
- max-freq = <400000000>;
- tRPab = <21000>;
- tRCD = <18000>;
- tWR = <15000>;
- tRAS-min = <42000>;
- tRRD = <10000>;
- tWTR = <7500>;
- tXP = <7500>;
- tRTP = <7500>;
- tCKESR = <15000>;
- tDQSCK-max = <5500>;
- tFAW = <50000>;
- tZQCS = <90000>;
- tZQCL = <360000>;
- tZQinit = <1000000>;
- tRAS-max-ns = <70000>;
- };
-
- timings_elpida_ECB240ABACN_200mhz: lpddr2-timings@1 {
- compatible = "jedec,lpddr2-timings";
- min-freq = <10000000>;
- max-freq = <200000000>;
- tRPab = <21000>;
- tRCD = <18000>;
- tWR = <15000>;
- tRAS-min = <42000>;
- tRRD = <10000>;
- tWTR = <10000>;
- tXP = <7500>;
- tRTP = <7500>;
- tCKESR = <15000>;
- tDQSCK-max = <5500>;
- tFAW = <50000>;
- tZQCS = <90000>;
- tZQCL = <360000>;
- tZQinit = <1000000>;
- tRAS-max-ns = <70000>;
- };
-
-}
diff --git a/Documentation/devicetree/bindings/ddr/lpddr3-timings.txt b/Documentation/devicetree/bindings/ddr/lpddr3-timings.txt
deleted file mode 100644
index 84705e50a3fd..000000000000
--- a/Documentation/devicetree/bindings/ddr/lpddr3-timings.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-* AC timing parameters of LPDDR3 memories for a given speed-bin.
-
-The structures are based on LPDDR2 and extended where needed.
-
-Required properties:
-- compatible : Should be "jedec,lpddr3-timings"
-- min-freq : minimum DDR clock frequency for the speed-bin. Type is <u32>
-- reg : maximum DDR clock frequency for the speed-bin. Type is <u32>
-
-Optional properties:
-
-The following properties represent AC timing parameters from the memory
-data-sheet of the device for a given speed-bin. All these properties are
-of type <u32> and the default unit is ps (picoseconds).
-- tRFC
-- tRRD
-- tRPab
-- tRPpb
-- tRCD
-- tRC
-- tRAS
-- tWTR
-- tWR
-- tRTP
-- tW2W-C2C
-- tR2R-C2C
-- tFAW
-- tXSR
-- tXP
-- tCKE
-- tCKESR
-- tMRD
-
-Example:
-
-timings_samsung_K3QF2F20DB_800mhz: lpddr3-timings@800000000 {
- compatible = "jedec,lpddr3-timings";
- reg = <800000000>; /* workaround: it shows max-freq */
- min-freq = <100000000>;
- tRFC = <65000>;
- tRRD = <6000>;
- tRPab = <12000>;
- tRPpb = <12000>;
- tRCD = <10000>;
- tRC = <33750>;
- tRAS = <23000>;
- tWTR = <3750>;
- tWR = <7500>;
- tRTP = <3750>;
- tW2W-C2C = <0>;
- tR2R-C2C = <0>;
- tFAW = <25000>;
- tXSR = <70000>;
- tXP = <3750>;
- tCKE = <3750>;
- tCKESR = <3750>;
- tMRD = <7000>;
-};
diff --git a/Documentation/devicetree/bindings/ddr/lpddr3.txt b/Documentation/devicetree/bindings/ddr/lpddr3.txt
deleted file mode 100644
index b221e653d384..000000000000
--- a/Documentation/devicetree/bindings/ddr/lpddr3.txt
+++ /dev/null
@@ -1,106 +0,0 @@
-* LPDDR3 SDRAM memories compliant to JEDEC JESD209-3C
-
-Required properties:
-- compatible : Should be "<vendor>,<type>", and generic value "jedec,lpddr3".
- Example "<vendor>,<type>" values:
- "samsung,K3QF2F20DB"
-
-- density : <u32> representing density in Mb (Mega bits)
-- io-width : <u32> representing bus width. Possible values are 8, 16, 32, 64
-- #address-cells: Must be set to 1
-- #size-cells: Must be set to 0
-
-Optional properties:
-
-- manufacturer-id : <u32> Manufacturer ID value read from Mode Register 5
-- revision-id : <u32 u32> Revision IDs read from Mode Registers 6 and 7
-
-The following optional properties represent the minimum value of some AC
-timing parameters of the DDR device in terms of number of clock cycles.
-These values shall be obtained from the device data-sheet.
-- tRFC-min-tck
-- tRRD-min-tck
-- tRPab-min-tck
-- tRPpb-min-tck
-- tRCD-min-tck
-- tRC-min-tck
-- tRAS-min-tck
-- tWTR-min-tck
-- tWR-min-tck
-- tRTP-min-tck
-- tW2W-C2C-min-tck
-- tR2R-C2C-min-tck
-- tWL-min-tck
-- tDQSCK-min-tck
-- tRL-min-tck
-- tFAW-min-tck
-- tXSR-min-tck
-- tXP-min-tck
-- tCKE-min-tck
-- tCKESR-min-tck
-- tMRD-min-tck
-
-Child nodes:
-- The lpddr3 node may have one or more child nodes of type "lpddr3-timings".
- "lpddr3-timings" provides AC timing parameters of the device for
- a given speed-bin. Please see Documentation/devicetree/
- bindings/ddr/lpddr3-timings.txt for more information on "lpddr3-timings"
-
-Example:
-
-samsung_K3QF2F20DB: lpddr3 {
- compatible = "samsung,K3QF2F20DB", "jedec,lpddr3";
- density = <16384>;
- io-width = <32>;
- manufacturer-id = <1>;
- revision-id = <123 234>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- tRFC-min-tck = <17>;
- tRRD-min-tck = <2>;
- tRPab-min-tck = <2>;
- tRPpb-min-tck = <2>;
- tRCD-min-tck = <3>;
- tRC-min-tck = <6>;
- tRAS-min-tck = <5>;
- tWTR-min-tck = <2>;
- tWR-min-tck = <7>;
- tRTP-min-tck = <2>;
- tW2W-C2C-min-tck = <0>;
- tR2R-C2C-min-tck = <0>;
- tWL-min-tck = <8>;
- tDQSCK-min-tck = <5>;
- tRL-min-tck = <14>;
- tFAW-min-tck = <5>;
- tXSR-min-tck = <12>;
- tXP-min-tck = <2>;
- tCKE-min-tck = <2>;
- tCKESR-min-tck = <2>;
- tMRD-min-tck = <5>;
-
- timings_samsung_K3QF2F20DB_800mhz: lpddr3-timings@800000000 {
- compatible = "jedec,lpddr3-timings";
- /* workaround: 'reg' shows max-freq */
- reg = <800000000>;
- min-freq = <100000000>;
- tRFC = <65000>;
- tRRD = <6000>;
- tRPab = <12000>;
- tRPpb = <12000>;
- tRCD = <10000>;
- tRC = <33750>;
- tRAS = <23000>;
- tWTR = <3750>;
- tWR = <7500>;
- tRTP = <3750>;
- tW2W-C2C = <0>;
- tR2R-C2C = <0>;
- tFAW = <25000>;
- tXSR = <70000>;
- tXP = <3750>;
- tCKE = <3750>;
- tCKESR = <3750>;
- tMRD = <7000>;
- };
-}
diff --git a/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-nocp.yaml b/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-nocp.yaml
index d318fccf78f1..2bdd05af6079 100644
--- a/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-nocp.yaml
+++ b/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-nocp.yaml
@@ -8,7 +8,7 @@ title: Samsung Exynos NoC (Network on Chip) Probe
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
description: |
The Samsung Exynos542x SoC has a NoC (Network on Chip) Probe for NoC bus.
diff --git a/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-ppmu.yaml b/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-ppmu.yaml
index c9a8cb5fd555..e300df4b47f3 100644
--- a/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-ppmu.yaml
+++ b/Documentation/devicetree/bindings/devfreq/event/samsung,exynos-ppmu.yaml
@@ -8,7 +8,7 @@ title: Samsung Exynos SoC PPMU (Platform Performance Monitoring Unit)
maintainers:
- Chanwoo Choi <cw00.choi@samsung.com>
- - Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
description: |
The Samsung Exynos SoC has PPMU (Platform Performance Monitoring Unit) for
diff --git a/Documentation/devicetree/bindings/devfreq/exynos-bus.txt b/Documentation/devicetree/bindings/devfreq/exynos-bus.txt
deleted file mode 100644
index bcaa2c08ac11..000000000000
--- a/Documentation/devicetree/bindings/devfreq/exynos-bus.txt
+++ /dev/null
@@ -1,488 +0,0 @@
-* Generic Exynos Bus frequency device
-
-The Samsung Exynos SoC has many buses for data transfer between DRAM
-and the sub-blocks in the SoC. Most Exynos SoCs share a common bus
-architecture. Generally, each bus of an Exynos SoC includes a source
-clock and a power line, which allow the clock frequency of the bus to
-be changed at runtime. To monitor the usage of each bus at runtime,
-the driver uses the PPMU (Platform Performance Monitoring Unit), which
-is able to measure the current load of sub-blocks.
-
-The Exynos SoC includes various sub-blocks, each with its own AXI bus.
-Each AXI bus has its own source clock but not necessarily its own
-power line; a power line may be shared among several sub-blocks.
-So each sub-block can be classified by its role into one of two types
-of bus device:
-- parent bus device
-- passive bus device
-
-Basically, parent and passive bus devices share the same power line.
-Only the parent bus device can change the voltage of the shared power
-line; the remaining bus devices (passive bus devices) depend on the
-decision of the parent bus device. If three blocks share the VDD_xxx
-power line, only one block should be the parent device and the rest
-should depend on the parent device as passive devices.
-
- VDD_xxx |--- A block (parent)
- |--- B block (passive)
- |--- C block (passive)
-
-The composition differs slightly among Exynos SoCs because each SoC has
-different sub-blocks. Therefore, such differences should be specified in
-the devicetree file instead of in each device driver. As a result, this
-driver is able to support the bus frequency for all Exynos SoCs.
-
-Required properties for all bus devices:
-- compatible: Should be "samsung,exynos-bus".
-- clock-names : the name of clock used by the bus, "bus".
-- clocks : phandles for clock specified in "clock-names" property.
-- operating-points-v2: the OPP table including frequency/voltage information
- to support DVFS (Dynamic Voltage/Frequency Scaling) feature.
-
-Required properties only for parent bus device:
-- vdd-supply: the regulator to provide the buses with the voltage.
-- devfreq-events: the devfreq-event device to monitor the current utilization
- of buses.
-
-Required properties only for passive bus device:
-- devfreq: the parent bus device.
-
-Optional properties only for parent bus device:
-- exynos,saturation-ratio: the percentage value which is used to calibrate
- the performance count against total cycle count.
-
-Optional properties for the interconnect functionality (QoS frequency
-constraints):
-- #interconnect-cells: should be 0.
-- interconnects: as documented in ../interconnect.txt, describes a path at the
- higher level interconnects used by this interconnect provider.
- If this interconnect provider is directly linked to a top level interconnect
- provider the property contains only one phandle. The provider extends
- the interconnect graph by linking its node to a node registered by provider
- pointed to by first phandle in the 'interconnects' property.
-
-- samsung,data-clock-ratio: ratio of the data throughput in B/s to minimum data
- clock frequency in Hz, default value is 8 when this property is missing.
-
-Detailed correlation between sub-blocks and power line according to Exynos SoC:
-- In case of Exynos3250, there are two power lines, as follows:
- VDD_MIF |--- DMC
-
- VDD_INT |--- LEFTBUS (parent device)
- |--- PERIL
- |--- MFC
- |--- G3D
- |--- RIGHTBUS
- |--- PERIR
- |--- FSYS
- |--- LCD0
- |--- PERIR
- |--- ISP
- |--- CAM
-
-- In case of Exynos4210, there is one power line, as follows:
- VDD_INT |--- DMC (parent device)
- |--- LEFTBUS
- |--- PERIL
- |--- MFC(L)
- |--- G3D
- |--- TV
- |--- LCD0
- |--- RIGHTBUS
- |--- PERIR
- |--- MFC(R)
- |--- CAM
- |--- FSYS
- |--- GPS
- |--- LCD0
- |--- LCD1
-
-- In case of Exynos4x12, there are two power lines, as follows:
- VDD_MIF |--- DMC
-
- VDD_INT |--- LEFTBUS (parent device)
- |--- PERIL
- |--- MFC(L)
- |--- G3D
- |--- TV
- |--- IMAGE
- |--- RIGHTBUS
- |--- PERIR
- |--- MFC(R)
- |--- CAM
- |--- FSYS
- |--- GPS
- |--- LCD0
- |--- ISP
-
-- In case of Exynos5422, there are two power lines, as follows:
- VDD_MIF |--- DREX 0 (parent device, DRAM EXpress controller)
- |--- DREX 1
-
- VDD_INT |--- NoC_Core (parent device)
- |--- G2D
- |--- G3D
- |--- DISP1
- |--- NoC_WCORE
- |--- GSCL
- |--- MSCL
- |--- ISP
- |--- MFC
- |--- GEN
- |--- PERIS
- |--- PERIC
- |--- FSYS
- |--- FSYS2
-
-- In case of Exynos5433, there is the VDD_INT power line, as follows:
- VDD_INT |--- G2D (parent device)
- |--- MSCL
- |--- GSCL
- |--- JPEG
- |--- MFC
- |--- HEVC
- |--- BUS0
- |--- BUS1
- |--- BUS2
- |--- PERIS (Fixed clock rate)
- |--- PERIC (Fixed clock rate)
- |--- FSYS (Fixed clock rate)
-
-Example 1:
- Shows the AXI buses of the Exynos3250 SoC. Exynos3250 divides the buses by
- power line (regulator). The MIF (Memory Interface) AXI bus is used to
- transfer data between DRAM and CPU and uses the VDD_MIF regulator.
-
- - MIF (Memory Interface) block
- : VDD_MIF |--- DMC (Dynamic Memory Controller)
-
- - INT (Internal) block
- : VDD_INT |--- LEFTBUS (parent device)
- |--- PERIL
- |--- MFC
- |--- G3D
- |--- RIGHTBUS
- |--- FSYS
- |--- LCD0
- |--- PERIR
- |--- ISP
- |--- CAM
-
- - MIF bus's frequency/voltage table
- -----------------------
- |Lv| Freq | Voltage |
- -----------------------
- |L1| 50000 |800000 |
- |L2| 100000 |800000 |
- |L3| 134000 |800000 |
- |L4| 200000 |825000 |
- |L5| 400000 |875000 |
- -----------------------
-
- - INT bus's frequency/voltage table
- ----------------------------------------------------------
- |Block|LEFTBUS|RIGHTBUS|MCUISP |ISP |PERIL ||VDD_INT |
- | name| |LCD0 | | | || |
- | | |FSYS | | | || |
- | | |MFC | | | || |
- ----------------------------------------------------------
- |Mode |*parent|passive |passive|passive|passive|| |
- ----------------------------------------------------------
- |Lv |Frequency ||Voltage |
- ----------------------------------------------------------
- |L1 |50000 |50000 |50000 |50000 |50000 ||900000 |
- |L2 |80000 |80000 |80000 |80000 |80000 ||900000 |
- |L3 |100000 |100000 |100000 |100000 |100000 ||1000000 |
- |L4 |134000 |134000 |200000 |200000 | ||1000000 |
- |L5 |200000 |200000 |400000 |300000 | ||1000000 |
- ----------------------------------------------------------
-
-Example 2:
- The bus of DMC (Dynamic Memory Controller) block in exynos3250.dtsi
- is listed below:
-
- bus_dmc: bus_dmc {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu_dmc CLK_DIV_DMC>;
- clock-names = "bus";
- operating-points-v2 = <&bus_dmc_opp_table>;
- status = "disabled";
- };
-
- bus_dmc_opp_table: opp_table1 {
- compatible = "operating-points-v2";
- opp-shared;
-
- opp-50000000 {
- opp-hz = /bits/ 64 <50000000>;
- opp-microvolt = <800000>;
- };
- opp-100000000 {
- opp-hz = /bits/ 64 <100000000>;
- opp-microvolt = <800000>;
- };
- opp-134000000 {
- opp-hz = /bits/ 64 <134000000>;
- opp-microvolt = <800000>;
- };
- opp-200000000 {
- opp-hz = /bits/ 64 <200000000>;
- opp-microvolt = <825000>;
- };
- opp-400000000 {
- opp-hz = /bits/ 64 <400000000>;
- opp-microvolt = <875000>;
- };
- };
-
- bus_leftbus: bus_leftbus {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_GDL>;
- clock-names = "bus";
- operating-points-v2 = <&bus_leftbus_opp_table>;
- status = "disabled";
- };
-
- bus_rightbus: bus_rightbus {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_GDR>;
- clock-names = "bus";
- operating-points-v2 = <&bus_leftbus_opp_table>;
- status = "disabled";
- };
-
- bus_lcd0: bus_lcd0 {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_ACLK_160>;
- clock-names = "bus";
- operating-points-v2 = <&bus_leftbus_opp_table>;
- status = "disabled";
- };
-
- bus_fsys: bus_fsys {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_ACLK_200>;
- clock-names = "bus";
- operating-points-v2 = <&bus_leftbus_opp_table>;
- status = "disabled";
- };
-
- bus_mcuisp: bus_mcuisp {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_ACLK_400_MCUISP>;
- clock-names = "bus";
- operating-points-v2 = <&bus_mcuisp_opp_table>;
- status = "disabled";
- };
-
- bus_isp: bus_isp {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_ACLK_266>;
- clock-names = "bus";
- operating-points-v2 = <&bus_isp_opp_table>;
- status = "disabled";
- };
-
- bus_peril: bus_peril {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_DIV_ACLK_100>;
- clock-names = "bus";
- operating-points-v2 = <&bus_peril_opp_table>;
- status = "disabled";
- };
-
- bus_mfc: bus_mfc {
- compatible = "samsung,exynos-bus";
- clocks = <&cmu CLK_SCLK_MFC>;
- clock-names = "bus";
- operating-points-v2 = <&bus_leftbus_opp_table>;
- status = "disabled";
- };
-
- bus_leftbus_opp_table: opp_table1 {
- compatible = "operating-points-v2";
- opp-shared;
-
- opp-50000000 {
- opp-hz = /bits/ 64 <50000000>;
- opp-microvolt = <900000>;
- };
- opp-80000000 {
- opp-hz = /bits/ 64 <80000000>;
- opp-microvolt = <900000>;
- };
- opp-100000000 {
- opp-hz = /bits/ 64 <100000000>;
- opp-microvolt = <1000000>;
- };
- opp-134000000 {
- opp-hz = /bits/ 64 <134000000>;
- opp-microvolt = <1000000>;
- };
- opp-200000000 {
- opp-hz = /bits/ 64 <200000000>;
- opp-microvolt = <1000000>;
- };
- };
-
- bus_mcuisp_opp_table: opp_table2 {
- compatible = "operating-points-v2";
- opp-shared;
-
- opp-50000000 {
- opp-hz = /bits/ 64 <50000000>;
- };
- opp-80000000 {
- opp-hz = /bits/ 64 <80000000>;
- };
- opp-100000000 {
- opp-hz = /bits/ 64 <100000000>;
- };
- opp-200000000 {
- opp-hz = /bits/ 64 <200000000>;
- };
- opp-400000000 {
- opp-hz = /bits/ 64 <400000000>;
- };
- };
-
- bus_isp_opp_table: opp_table3 {
- compatible = "operating-points-v2";
- opp-shared;
-
- opp-50000000 {
- opp-hz = /bits/ 64 <50000000>;
- };
- opp-80000000 {
- opp-hz = /bits/ 64 <80000000>;
- };
- opp-100000000 {
- opp-hz = /bits/ 64 <100000000>;
- };
- opp-200000000 {
- opp-hz = /bits/ 64 <200000000>;
- };
- opp-300000000 {
- opp-hz = /bits/ 64 <300000000>;
- };
- };
-
- bus_peril_opp_table: opp_table4 {
- compatible = "operating-points-v2";
- opp-shared;
-
- opp-50000000 {
- opp-hz = /bits/ 64 <50000000>;
- };
- opp-80000000 {
- opp-hz = /bits/ 64 <80000000>;
- };
- opp-100000000 {
- opp-hz = /bits/ 64 <100000000>;
- };
- };
-
-
-	An example of handling the bus frequency and voltage at runtime
-	in exynos3250-rinato.dts is listed below:
-
- &bus_dmc {
- devfreq-events = <&ppmu_dmc0_3>, <&ppmu_dmc1_3>;
- vdd-supply = <&buck1_reg>; /* VDD_MIF */
- status = "okay";
- };
-
- &bus_leftbus {
- devfreq-events = <&ppmu_leftbus_3>, <&ppmu_rightbus_3>;
- vdd-supply = <&buck3_reg>;
- status = "okay";
- };
-
- &bus_rightbus {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
- &bus_lcd0 {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
- &bus_fsys {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
- &bus_mcuisp {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
- &bus_isp {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
- &bus_peril {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
- &bus_mfc {
- devfreq = <&bus_leftbus>;
- status = "okay";
- };
-
-Example 3:
-	An interconnect path "bus_display -- bus_leftbus -- bus_dmc" on the
-	Exynos4412 SoC, with the video mixer as an interconnect consumer device.
-
- soc {
- bus_dmc: bus_dmc {
- compatible = "samsung,exynos-bus";
- clocks = <&clock CLK_DIV_DMC>;
- clock-names = "bus";
- operating-points-v2 = <&bus_dmc_opp_table>;
- samsung,data-clock-ratio = <4>;
- #interconnect-cells = <0>;
- };
-
- bus_leftbus: bus_leftbus {
- compatible = "samsung,exynos-bus";
- clocks = <&clock CLK_DIV_GDL>;
- clock-names = "bus";
- operating-points-v2 = <&bus_leftbus_opp_table>;
- #interconnect-cells = <0>;
- interconnects = <&bus_dmc>;
- };
-
- bus_display: bus_display {
- compatible = "samsung,exynos-bus";
- clocks = <&clock CLK_ACLK160>;
- clock-names = "bus";
- operating-points-v2 = <&bus_display_opp_table>;
- #interconnect-cells = <0>;
- interconnects = <&bus_leftbus &bus_dmc>;
- };
-
- bus_dmc_opp_table: opp_table1 {
- compatible = "operating-points-v2";
- /* ... */
-	};
-
- bus_leftbus_opp_table: opp_table3 {
- compatible = "operating-points-v2";
- /* ... */
- };
-
- bus_display_opp_table: opp_table4 {
- compatible = "operating-points-v2";
-		/* ... */
- };
-
- &mixer {
- compatible = "samsung,exynos4212-mixer";
- interconnects = <&bus_display &bus_dmc>;
- /* ... */
- };
- };
diff --git a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
deleted file mode 100644
index 3fbeb3733c48..000000000000
--- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-* Rockchip rk3399 DMC (Dynamic Memory Controller) device
-
-Required properties:
-- compatible: Must be "rockchip,rk3399-dmc".
-- devfreq-events:	 Node used to get the DDR load. Refer to
- Documentation/devicetree/bindings/devfreq/event/
- rockchip-dfi.txt
-- clocks: Phandles for clock specified in "clock-names" property
-- clock-names :	 The name of clock used by the DMC, must be
-			 "dmc_clk".
-- operating-points-v2: Refer to Documentation/devicetree/bindings/opp/opp-v2.yaml
- for details.
-- center-supply: DMC supply node.
-- status: Marks the node enabled/disabled.
-- rockchip,pmu: Phandle to the syscon managing the "PMU general register
- files".
-
-Optional properties:
-- interrupts: The CPU interrupt number. The interrupt specifier
- format depends on the interrupt controller.
-			 It should be a DCF interrupt. When DDR DVFS finishes,
-			 a DCF interrupt is triggered.
-
-Following properties relate to DDR timing:
-
-- rockchip,ddr3_speed_bin :	 Values are defined in include/dt-bindings/clock/rk3399-ddr.h.
-				 It selects the DDR3 cl-trp-trcd type. It must be
-				 set according to "Speed Bin" in the DDR3 datasheet;
-				 DO NOT use a smaller "Speed Bin" than specified
-				 for the DDR3 being used.
-
-- rockchip,pd_idle : Configure the PD_IDLE value. Defines the
- power-down idle period in which memories are
- placed into power-down mode if bus is idle
- for PD_IDLE DFI clock cycles.
-
-- rockchip,sr_idle : Configure the SR_IDLE value. Defines the
- self-refresh idle period in which memories are
- placed into self-refresh mode if bus is idle
- for SR_IDLE * 1024 DFI clock cycles (DFI
- clocks freq is half of DRAM clock), default
- value is "0".
-
-- rockchip,sr_mc_gate_idle : Defines the memory self-refresh and controller
- clock gating idle period. Memories are placed
- into self-refresh mode and memory controller
-				 clock gating is started if the bus is idle for
- sr_mc_gate_idle*1024 DFI clock cycles.
-
-- rockchip,srpd_lite_idle : Defines the self-refresh power down idle
- period in which memories are placed into
- self-refresh power down mode if bus is idle
- for srpd_lite_idle * 1024 DFI clock cycles.
- This parameter is for LPDDR4 only.
-
-- rockchip,standby_idle : Defines the standby idle period in which
- memories are placed into self-refresh mode.
- The controller, pi, PHY and DRAM clock will
- be gated if bus is idle for standby_idle * DFI
- clock cycles.
-
-- rockchip,dram_dll_dis_freq : Defines the DDR3 DLL bypass frequency in MHz.
- When DDR frequency is less than DRAM_DLL_DISB_FREQ,
-				 DDR3 DLL will be bypassed. Note: if the DLL is
-				 bypassed, the ODT will also stop working.
-
-- rockchip,phy_dll_dis_freq : Defines the PHY dll bypass frequency in
- MHz (Mega Hz). When DDR frequency is less than
-				 PHY_DLL_DISB_FREQ, the PHY DLL will be bypassed.
- Note: PHY DLL and PHY ODT are independent.
-
-- rockchip,ddr3_odt_dis_freq : When the DRAM type is DDR3, this parameter defines
- the ODT disable frequency in MHz (Mega Hz).
-				 When the DDR frequency is less than ddr3_odt_dis_freq,
- the ODT on the DRAM side and controller side are
- both disabled.
-
-- rockchip,ddr3_drv : When the DRAM type is DDR3, this parameter defines
- the DRAM side driver strength in ohms. Default
- value is 40.
-
-- rockchip,ddr3_odt : When the DRAM type is DDR3, this parameter defines
- the DRAM side ODT strength in ohms. Default value
- is 120.
-
-- rockchip,phy_ddr3_ca_drv : When the DRAM type is DDR3, this parameter defines
-				 the PHY side CA line (including command line,
- address line and clock line) driver strength.
- Default value is 40.
-
-- rockchip,phy_ddr3_dq_drv : When the DRAM type is DDR3, this parameter defines
- the PHY side DQ line (including DQS/DQ/DM line)
- driver strength. Default value is 40.
-
-- rockchip,phy_ddr3_odt : When the DRAM type is DDR3, this parameter defines
- the PHY side ODT strength. Default value is 240.
-
-- rockchip,lpddr3_odt_dis_freq : When the DRAM type is LPDDR3, this parameter defines
-				 the ODT disable frequency in MHz (Mega Hz).
-				 When the DDR frequency is less than lpddr3_odt_dis_freq,
- the ODT on the DRAM side and controller side are
- both disabled.
-
-- rockchip,lpddr3_drv : When the DRAM type is LPDDR3, this parameter defines
- the DRAM side driver strength in ohms. Default
- value is 34.
-
-- rockchip,lpddr3_odt : When the DRAM type is LPDDR3, this parameter defines
- the DRAM side ODT strength in ohms. Default value
- is 240.
-
-- rockchip,phy_lpddr3_ca_drv : When the DRAM type is LPDDR3, this parameter defines
- the PHY side CA line (including command line,
- address line and clock line) driver strength.
- Default value is 40.
-
-- rockchip,phy_lpddr3_dq_drv : When the DRAM type is LPDDR3, this parameter defines
- the PHY side DQ line (including DQS/DQ/DM line)
- driver strength. Default value is 40.
-
-- rockchip,phy_lpddr3_odt :	 When the DRAM type is LPDDR3, this parameter defines
-				 the PHY side ODT strength. Default value is 240.
-
-- rockchip,lpddr4_odt_dis_freq : When the DRAM type is LPDDR4, this parameter
- defines the ODT disable frequency in
-				 MHz (Mega Hz). When the DDR frequency is less than
-				 lpddr4_odt_dis_freq, the ODT on the DRAM side and
- controller side are both disabled.
-
-- rockchip,lpddr4_drv : When the DRAM type is LPDDR4, this parameter defines
- the DRAM side driver strength in ohms. Default
- value is 60.
-
-- rockchip,lpddr4_dq_odt : When the DRAM type is LPDDR4, this parameter defines
- the DRAM side ODT on DQS/DQ line strength in ohms.
- Default value is 40.
-
-- rockchip,lpddr4_ca_odt : When the DRAM type is LPDDR4, this parameter defines
- the DRAM side ODT on CA line strength in ohms.
- Default value is 40.
-
-- rockchip,phy_lpddr4_ca_drv : When the DRAM type is LPDDR4, this parameter defines
- the PHY side CA line (including command address
- line) driver strength. Default value is 40.
-
-- rockchip,phy_lpddr4_ck_cs_drv : When the DRAM type is LPDDR4, this parameter defines
- the PHY side clock line and CS line driver
- strength. Default value is 80.
-
-- rockchip,phy_lpddr4_dq_drv : When the DRAM type is LPDDR4, this parameter defines
- the PHY side DQ line (including DQS/DQ/DM line)
- driver strength. Default value is 80.
-
-- rockchip,phy_lpddr4_odt : When the DRAM type is LPDDR4, this parameter defines
- the PHY side ODT strength. Default value is 60.
-
-Example:
- dmc_opp_table: dmc_opp_table {
- compatible = "operating-points-v2";
-
- opp00 {
- opp-hz = /bits/ 64 <300000000>;
- opp-microvolt = <900000>;
- };
- opp01 {
- opp-hz = /bits/ 64 <666000000>;
- opp-microvolt = <900000>;
- };
- };
-
- dmc: dmc {
- compatible = "rockchip,rk3399-dmc";
- devfreq-events = <&dfi>;
- interrupts = <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cru SCLK_DDRCLK>;
- clock-names = "dmc_clk";
- operating-points-v2 = <&dmc_opp_table>;
- center-supply = <&ppvar_centerlogic>;
- upthreshold = <15>;
- downdifferential = <10>;
- rockchip,ddr3_speed_bin = <21>;
- rockchip,pd_idle = <0x40>;
- rockchip,sr_idle = <0x2>;
- rockchip,sr_mc_gate_idle = <0x3>;
- rockchip,srpd_lite_idle = <0x4>;
- rockchip,standby_idle = <0x2000>;
- rockchip,dram_dll_dis_freq = <300>;
- rockchip,phy_dll_dis_freq = <125>;
- rockchip,auto_pd_dis_freq = <666>;
- rockchip,ddr3_odt_dis_freq = <333>;
- rockchip,ddr3_drv = <40>;
- rockchip,ddr3_odt = <120>;
- rockchip,phy_ddr3_ca_drv = <40>;
- rockchip,phy_ddr3_dq_drv = <40>;
- rockchip,phy_ddr3_odt = <240>;
- rockchip,lpddr3_odt_dis_freq = <333>;
- rockchip,lpddr3_drv = <34>;
- rockchip,lpddr3_odt = <240>;
- rockchip,phy_lpddr3_ca_drv = <40>;
- rockchip,phy_lpddr3_dq_drv = <40>;
- rockchip,phy_lpddr3_odt = <240>;
- rockchip,lpddr4_odt_dis_freq = <333>;
- rockchip,lpddr4_drv = <60>;
- rockchip,lpddr4_dq_odt = <40>;
- rockchip,lpddr4_ca_odt = <40>;
- rockchip,phy_lpddr4_ca_drv = <40>;
- rockchip,phy_lpddr4_ck_cs_drv = <80>;
- rockchip,phy_lpddr4_dq_drv = <80>;
- rockchip,phy_lpddr4_odt = <60>;
- };
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-backend.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-backend.yaml
index 3d8ea3c2d8dd..ba06d1857b7d 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-backend.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-backend.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun4i-a10-display-backend.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Display Engine Backend Device Tree Bindings
+title: Allwinner A10 Display Engine Backend
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml
index e77523b02fad..e6088f379f70 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-engine.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun4i-a10-display-engine.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Display Engine Pipeline Device Tree Bindings
+title: Allwinner A10 Display Engine Pipeline
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -62,6 +62,7 @@ properties:
- allwinner,sun8i-r40-display-engine
- allwinner,sun8i-v3s-display-engine
- allwinner,sun9i-a80-display-engine
+ - allwinner,sun20i-d1-display-engine
- allwinner,sun50i-a64-display-engine
- allwinner,sun50i-h6-display-engine
@@ -69,6 +70,8 @@ properties:
$ref: /schemas/types.yaml#/definitions/phandle-array
minItems: 1
maxItems: 2
+ items:
+ maxItems: 1
description: |
      Available display engine frontends (DE 1.0) or mixers
      (DE 2.0/3.0).
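
(As a sketch of how this constraint is consumed: a display-engine node
lists one phandle per available pipeline. The property name
"allwinner,pipelines", the H3 compatible string and the mixer labels are
illustrative assumptions, since only a fragment of the schema is visible
here.)

    display-engine {
        compatible = "allwinner,sun8i-h3-display-engine";
        /* One or two entries, each a single phandle, matching the
           minItems/maxItems/items limits above. */
        allwinner,pipelines = <&mixer0>, <&mixer1>;
    };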
@@ -91,6 +94,7 @@ if:
- allwinner,sun8i-a83t-display-engine
- allwinner,sun8i-r40-display-engine
- allwinner,sun9i-a80-display-engine
+ - allwinner,sun20i-d1-display-engine
- allwinner,sun50i-a64-display-engine
then:
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-frontend.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-frontend.yaml
index 055157fbf3bf..98e8240a05bd 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-frontend.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-display-frontend.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun4i-a10-display-frontend.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Display Engine Frontend Device Tree Bindings
+title: Allwinner A10 Display Engine Frontend
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-hdmi.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-hdmi.yaml
index 7f11452539f4..55703caacb9c 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-hdmi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun4i-a10-hdmi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 HDMI Controller Device Tree Bindings
+title: Allwinner A10 HDMI Controller
description: |
The HDMI Encoder supports the HDMI video and audio outputs, and does
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml
index 3a7d5d731712..724d93b9193b 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun4i-a10-tcon.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 Timings Controller (TCON) Device Tree Bindings
+title: Allwinner A10 Timings Controller (TCON)
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -33,6 +33,8 @@ properties:
- const: allwinner,sun8i-v3s-tcon
- const: allwinner,sun9i-a80-tcon-lcd
- const: allwinner,sun9i-a80-tcon-tv
+ - const: allwinner,sun20i-d1-tcon-lcd
+ - const: allwinner,sun20i-d1-tcon-tv
- items:
- enum:
@@ -231,6 +233,7 @@ allOf:
- allwinner,sun8i-a83t-tcon-lcd
- allwinner,sun8i-v3s-tcon
- allwinner,sun9i-a80-tcon-lcd
+ - allwinner,sun20i-d1-tcon-lcd
then:
properties:
@@ -250,6 +253,7 @@ allOf:
- allwinner,sun8i-a83t-tcon-tv
- allwinner,sun8i-r40-tcon-tv
- allwinner,sun9i-a80-tcon-tv
+ - allwinner,sun20i-d1-tcon-tv
then:
properties:
@@ -276,6 +280,7 @@ allOf:
- allwinner,sun9i-a80-tcon-lcd
- allwinner,sun4i-a10-tcon
- allwinner,sun8i-a83t-tcon-lcd
+ - allwinner,sun20i-d1-tcon-lcd
then:
required:
@@ -292,6 +297,7 @@ allOf:
- allwinner,sun8i-a23-tcon
- allwinner,sun8i-a33-tcon
- allwinner,sun8i-a83t-tcon-lcd
+ - allwinner,sun20i-d1-tcon-lcd
then:
properties:
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tv-encoder.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tv-encoder.yaml
index afc0ed799e0e..c39e90a5945f 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tv-encoder.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tv-encoder.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun4i-a10-tv-encoder.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 TV Encoder Device Tree Bindings
+title: Allwinner A10 TV Encoder
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-drc.yaml b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-drc.yaml
index 71cce5687580..895506d93f4c 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-drc.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-drc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun6i-a31-drc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A31 Dynamic Range Controller Device Tree Bindings
+title: Allwinner A31 Dynamic Range Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
index bf0bdf54e5f9..c731fbdc2fe0 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun6i-a31-mipi-dsi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A31 MIPI-DSI Controller Device Tree Bindings
+title: Allwinner A31 MIPI-DSI Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -12,9 +12,14 @@ maintainers:
properties:
compatible:
- enum:
- - allwinner,sun6i-a31-mipi-dsi
- - allwinner,sun50i-a64-mipi-dsi
+ oneOf:
+ - enum:
+ - allwinner,sun6i-a31-mipi-dsi
+ - allwinner,sun50i-a64-mipi-dsi
+ - allwinner,sun50i-a100-mipi-dsi
+ - items:
+ - const: allwinner,sun20i-d1-mipi-dsi
+ - const: allwinner,sun50i-a100-mipi-dsi
reg:
maxItems: 1
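
(A sketch of the new D1 entry above: the D1-specific string must be
paired with the A100 string as its fallback, matching the "items" list.
The node name, unit address and label are illustrative assumptions.)

    dsi0: dsi@5450000 {
        compatible = "allwinner,sun20i-d1-mipi-dsi",
                     "allwinner,sun50i-a100-mipi-dsi";
        /* reg, clocks, phys, resets and the output port follow,
           as required by the rest of this schema. */
    };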
@@ -59,7 +64,6 @@ required:
- phys
- phy-names
- resets
- - vcc-dsi-supply
- port
allOf:
@@ -68,7 +72,9 @@ allOf:
properties:
compatible:
contains:
- const: allwinner,sun6i-a31-mipi-dsi
+ enum:
+ - allwinner,sun6i-a31-mipi-dsi
+ - allwinner,sun50i-a100-mipi-dsi
then:
properties:
@@ -78,16 +84,22 @@ allOf:
required:
- clock-names
+ else:
+ properties:
+ clocks:
+ maxItems: 1
+
- if:
properties:
compatible:
contains:
- const: allwinner,sun50i-a64-mipi-dsi
+ enum:
+ - allwinner,sun6i-a31-mipi-dsi
+ - allwinner,sun50i-a64-mipi-dsi
then:
- properties:
- clocks:
- minItems: 1
+ required:
+ - vcc-dsi-supply
unevaluatedProperties: false
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-de2-mixer.yaml b/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-de2-mixer.yaml
index 4f91eec26de9..b75c1ec686ad 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-de2-mixer.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-de2-mixer.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun8i-a83t-de2-mixer.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner Display Engine 2.0 Mixer Device Tree Bindings
+title: Allwinner Display Engine 2.0 Mixer
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -19,6 +19,8 @@ properties:
- allwinner,sun8i-r40-de2-mixer-0
- allwinner,sun8i-r40-de2-mixer-1
- allwinner,sun8i-v3s-de2-mixer
+ - allwinner,sun20i-d1-de2-mixer-0
+ - allwinner,sun20i-d1-de2-mixer-1
- allwinner,sun50i-a64-de2-mixer-0
- allwinner,sun50i-a64-de2-mixer-1
- allwinner,sun50i-h6-de3-mixer-0
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-dw-hdmi.yaml b/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-dw-hdmi.yaml
index 4951b5ef5c6a..60fd927b5a06 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-dw-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-dw-hdmi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun8i-a83t-dw-hdmi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A83t DWC HDMI TX Encoder Device Tree Bindings
+title: Allwinner A83t DWC HDMI TX Encoder
description: |
The HDMI transmitter is a Synopsys DesignWare HDMI 1.4 TX controller
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-hdmi-phy.yaml b/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-hdmi-phy.yaml
index a97366aaf924..1b47f3d99a78 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-hdmi-phy.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun8i-a83t-hdmi-phy.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun8i-a83t-hdmi-phy.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A83t HDMI PHY Device Tree Bindings
+title: Allwinner A83t HDMI PHY
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun8i-r40-tcon-top.yaml b/Documentation/devicetree/bindings/display/allwinner,sun8i-r40-tcon-top.yaml
index 61ef7b337218..7d849c4095a3 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun8i-r40-tcon-top.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun8i-r40-tcon-top.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun8i-r40-tcon-top.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner R40 TCON TOP Device Tree Bindings
+title: Allwinner R40 TCON TOP
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -41,6 +41,7 @@ properties:
compatible:
enum:
- allwinner,sun8i-r40-tcon-top
+ - allwinner,sun20i-d1-tcon-top
- allwinner,sun50i-h6-tcon-top
reg:
@@ -48,31 +49,15 @@ properties:
clocks:
minItems: 2
- items:
- - description: The TCON TOP interface clock
- - description: The TCON TOP TV0 clock
- - description: The TCON TOP TVE0 clock
- - description: The TCON TOP TV1 clock
- - description: The TCON TOP TVE1 clock
- - description: The TCON TOP MIPI DSI clock
+ maxItems: 6
clock-names:
minItems: 2
- items:
- - const: bus
- - const: tcon-tv0
- - const: tve0
- - const: tcon-tv1
- - const: tve1
- - const: dsi
+ maxItems: 6
clock-output-names:
minItems: 1
maxItems: 3
- description: >
- The first item is the name of the clock created for the TV0
- channel, the second item is the name of the TCON TV1 channel
- clock and the third one is the name of the DSI channel clock.
resets:
maxItems: 1
@@ -129,32 +114,92 @@ required:
additionalProperties: false
-if:
- properties:
- compatible:
- contains:
- const: allwinner,sun50i-h6-tcon-top
-
-then:
- properties:
- clocks:
- maxItems: 2
-
- clock-output-names:
- maxItems: 1
-
-else:
- properties:
- clocks:
- minItems: 6
-
- clock-output-names:
- minItems: 3
-
- ports:
- required:
- - port@2
- - port@3
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-r40-tcon-top
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: The TCON TOP interface clock
+ - description: The TCON TOP TV0 clock
+ - description: The TCON TOP TVE0 clock
+ - description: The TCON TOP TV1 clock
+ - description: The TCON TOP TVE1 clock
+ - description: The TCON TOP MIPI DSI clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: tcon-tv0
+ - const: tve0
+ - const: tcon-tv1
+ - const: tve1
+ - const: dsi
+
+ clock-output-names:
+ items:
+ - description: TCON TV0 output clock name
+ - description: TCON TV1 output clock name
+ - description: DSI output clock name
+
+ ports:
+ required:
+ - port@2
+ - port@3
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun20i-d1-tcon-top
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: The TCON TOP interface clock
+ - description: The TCON TOP TV0 clock
+ - description: The TCON TOP TVE0 clock
+ - description: The TCON TOP MIPI DSI clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: tcon-tv0
+ - const: tve0
+ - const: dsi
+
+ clock-output-names:
+ items:
+ - description: TCON TV0 output clock name
+ - description: DSI output clock name
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun50i-h6-tcon-top
+
+ then:
+ properties:
+ clocks:
+ items:
+ - description: The TCON TOP interface clock
+ - description: The TCON TOP TV0 clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: tcon-tv0
+
+ clock-output-names:
+ items:
+ - description: TCON TV0 output clock name
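
(To make the D1 branch above concrete, a sketch of a matching node
follows. Only the clock names and their order come from the schema; the
unit address, phandles and clock indices are illustrative assumptions.)

    tcon-top@5460000 {
        compatible = "allwinner,sun20i-d1-tcon-top";
        reg = <0x5460000 0x1000>;
        clocks = <&ccu 0>, <&ccu 1>, <&ccu 2>, <&ccu 3>;
        clock-names = "bus", "tcon-tv0", "tve0", "dsi";
        clock-output-names = "tcon-top-tv0", "tcon-top-dsi";
        /* resets and the ports node omitted for brevity */
    };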
examples:
- |
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun9i-a80-deu.yaml b/Documentation/devicetree/bindings/display/allwinner,sun9i-a80-deu.yaml
index 637372ec4614..193afee2c3c1 100644
--- a/Documentation/devicetree/bindings/display/allwinner,sun9i-a80-deu.yaml
+++ b/Documentation/devicetree/bindings/display/allwinner,sun9i-a80-deu.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/allwinner,sun9i-a80-deu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A80 Detail Enhancement Unit Device Tree Bindings
+title: Allwinner A80 Detail Enhancement Unit
maintainers:
- Chen-Yu Tsai <wens@csie.org>
diff --git a/Documentation/devicetree/bindings/display/amlogic,meson-dw-hdmi.yaml b/Documentation/devicetree/bindings/display/amlogic,meson-dw-hdmi.yaml
index cf5a208f2f10..0c85894648d8 100644
--- a/Documentation/devicetree/bindings/display/amlogic,meson-dw-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/amlogic,meson-dw-hdmi.yaml
@@ -2,13 +2,16 @@
# Copyright 2019 BayLibre, SAS
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/amlogic,meson-dw-hdmi.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/amlogic,meson-dw-hdmi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Amlogic specific extensions to the Synopsys Designware HDMI Controller
maintainers:
- - Neil Armstrong <narmstrong@baylibre.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
+
+allOf:
+ - $ref: /schemas/sound/dai-common.yaml#
description: |
The Amlogic Meson Synopsys Designware Integration is composed of
@@ -99,6 +102,8 @@ properties:
"#sound-dai-cells":
const: 0
+ sound-name-prefix: true
+
required:
- compatible
- reg
@@ -145,4 +150,3 @@ examples:
};
};
};
-
diff --git a/Documentation/devicetree/bindings/display/amlogic,meson-vpu.yaml b/Documentation/devicetree/bindings/display/amlogic,meson-vpu.yaml
index 851cb0781217..0c72120acc4f 100644
--- a/Documentation/devicetree/bindings/display/amlogic,meson-vpu.yaml
+++ b/Documentation/devicetree/bindings/display/amlogic,meson-vpu.yaml
@@ -2,13 +2,13 @@
# Copyright 2019 BayLibre, SAS
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/amlogic,meson-vpu.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/amlogic,meson-vpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Amlogic Meson Display Controller
maintainers:
- - Neil Armstrong <narmstrong@baylibre.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
description: |
The Amlogic Meson Display controller is composed of several components
@@ -78,6 +78,10 @@ properties:
interrupts:
maxItems: 1
+ amlogic,canvas:
+ description: should point to a canvas provider node
+ $ref: /schemas/types.yaml#/definitions/phandle
+
power-domains:
maxItems: 1
description: phandle to the associated power domain
@@ -106,6 +110,7 @@ required:
- port@1
- "#address-cells"
- "#size-cells"
+ - amlogic,canvas
additionalProperties: false
@@ -118,6 +123,7 @@ examples:
interrupts = <3>;
#address-cells = <1>;
#size-cells = <0>;
+ amlogic,canvas = <&canvas>;
/* CVBS VDAC output port */
port@0 {
diff --git a/Documentation/devicetree/bindings/display/arm,hdlcd.txt b/Documentation/devicetree/bindings/display/arm,hdlcd.txt
deleted file mode 100644
index 78bc24296f3e..000000000000
--- a/Documentation/devicetree/bindings/display/arm,hdlcd.txt
+++ /dev/null
@@ -1,79 +0,0 @@
-ARM HDLCD
-
-This is a display controller found on several development platforms produced
-by ARM Ltd and in its more modern Fast Models. The HDLCD is an RGB
-streamer that reads the data from a framebuffer and sends it to a single
-digital encoder (DVI or HDMI).
-
-Required properties:
- - compatible: "arm,hdlcd"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: One interrupt used by the display controller to notify the
- interrupt controller when any of the interrupt sources programmed in
- the interrupt mask register have activated.
- - clocks: A list of phandle + clock-specifier pairs, one for each
- entry in 'clock-names'.
- - clock-names: A list of clock names. For HDLCD it should contain:
- - "pxlclk" for the clock feeding the output PLL of the controller.
-
-Required sub-nodes:
- - port: The HDLCD connection to an encoder chip. The connection is modeled
- using the OF graph bindings specified in
- Documentation/devicetree/bindings/graph.txt.
-
-Optional properties:
- - memory-region: phandle to a node describing memory (see
- Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt) to be
- used for the framebuffer; if not present, the framebuffer may be located
- anywhere in memory.
-
-
-Example:
-
-/ {
- ...
-
- hdlcd@2b000000 {
- compatible = "arm,hdlcd";
- reg = <0 0x2b000000 0 0x1000>;
- interrupts = <GIC_SPI 85 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&oscclk5>;
- clock-names = "pxlclk";
- port {
- hdlcd_output: endpoint@0 {
- remote-endpoint = <&hdmi_enc_input>;
- };
- };
- };
-
- /* HDMI encoder on I2C bus */
- i2c@7ffa0000 {
- ....
- hdmi-transmitter@70 {
- compatible = ".....";
- reg = <0x70>;
- port@0 {
- hdmi_enc_input: endpoint {
- remote-endpoint = <&hdlcd_output>;
- };
-
- hdmi_enc_output: endpoint {
- remote-endpoint = <&hdmi_1_port>;
- };
- };
- };
-
- };
-
- hdmi1: connector@1 {
- compatible = "hdmi-connector";
- type = "a";
- port {
- hdmi_1_port: endpoint {
- remote-endpoint = <&hdmi_enc_output>;
- };
- };
- };
-
- ...
-};
diff --git a/Documentation/devicetree/bindings/display/arm,hdlcd.yaml b/Documentation/devicetree/bindings/display/arm,hdlcd.yaml
new file mode 100644
index 000000000000..9a30e9005e8a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/arm,hdlcd.yaml
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/arm,hdlcd.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm HDLCD display controller
+
+maintainers:
+ - Liviu Dudau <Liviu.Dudau@arm.com>
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+ The Arm HDLCD is a display controller found on several development platforms
+ produced by ARM Ltd and in more modern of its Fast Models. The HDLCD is an
+  produced by ARM Ltd and in its more modern Fast Models. The HDLCD is an
+ digital encoder (DVI or HDMI).
+
+properties:
+ compatible:
+ const: arm,hdlcd
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clock-names:
+ const: pxlclk
+
+ clocks:
+ maxItems: 1
+ description: The input reference for the pixel clock.
+
+ memory-region:
+ maxItems: 1
+ description:
+ Phandle to a node describing memory to be used for the framebuffer.
+ If not present, the framebuffer may be located anywhere in memory.
+
+ iommus:
+ maxItems: 1
+
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+ unevaluatedProperties: false
+ description:
+ Output endpoint of the controller, connecting the LCD panel signals.
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - port
+
+examples:
+ - |
+ hdlcd@2b000000 {
+ compatible = "arm,hdlcd";
+ reg = <0x2b000000 0x1000>;
+ interrupts = <0 85 4>;
+ clocks = <&oscclk5>;
+ clock-names = "pxlclk";
+ port {
+ hdlcd_output: endpoint {
+ remote-endpoint = <&hdmi_enc_input>;
+ };
+ };
+ };
+
+ /* HDMI encoder on I2C bus */
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ hdmi-transmitter@70 {
+ compatible = "nxp,tda998x";
+ reg = <0x70>;
+ port {
+ hdmi_enc_input: endpoint {
+ remote-endpoint = <&hdlcd_output>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/arm,komeda.txt b/Documentation/devicetree/bindings/display/arm,komeda.txt
deleted file mode 100644
index 8513695ee47f..000000000000
--- a/Documentation/devicetree/bindings/display/arm,komeda.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-Device Tree bindings for Arm Komeda display driver
-
-Required properties:
-- compatible: Should be "arm,mali-d71"
-- reg: Physical base address and length of the registers in the system
-- interrupts: the interrupt line number of the device in the system
-- clocks: A list of phandle + clock-specifier pairs, one for each entry
- in 'clock-names'
-- clock-names: A list of clock names. It should contain:
- - "aclk": for the main processor clock
-- #address-cells: Must be 1
-- #size-cells: Must be 0
-- iommus: Configures the stream IDs for the IOMMU. Must be configured if you
-  want to enable the IOMMU for the display. For how to configure this node,
-  please refer to devicetree/bindings/iommu/arm,smmu-v3.txt and
-  devicetree/bindings/iommu/iommu.txt
-
-Required properties for sub-node: pipeline@N
-Each device contains one or two pipeline sub-nodes (at least one); each
-pipeline node should provide the following properties:
-- reg: Zero-indexed identifier for the pipeline
-- clocks: A list of phandle + clock-specifier pairs, one for each entry
- in 'clock-names'
-- clock-names: should contain:
- - "pxclk": pixel clock
-
-- port: each pipeline connect to an encoder input port. The connection is
- modeled using the OF graph bindings specified in
- Documentation/devicetree/bindings/graph.txt
-
-Optional properties:
- - memory-region: phandle to a node describing memory (see
- Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt)
- to be used for the framebuffer; if not present, the framebuffer may
- be located anywhere in memory.
-
-Example:
-/ {
- ...
-
- dp0: display@c00000 {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "arm,mali-d71";
- reg = <0xc00000 0x20000>;
- interrupts = <0 168 4>;
- clocks = <&dpu_aclk>;
- clock-names = "aclk";
- iommus = <&smmu 0>, <&smmu 1>, <&smmu 2>, <&smmu 3>,
- <&smmu 4>, <&smmu 5>, <&smmu 6>, <&smmu 7>,
- <&smmu 8>, <&smmu 9>;
-
- dp0_pipe0: pipeline@0 {
- clocks = <&fpgaosc2>;
- clock-names = "pxclk";
- reg = <0>;
-
- port {
- dp0_pipe0_out: endpoint {
- remote-endpoint = <&db_dvi0_in>;
- };
- };
- };
-
- dp0_pipe1: pipeline@1 {
- clocks = <&fpgaosc2>;
- clock-names = "pxclk";
- reg = <1>;
-
- port {
- dp0_pipe1_out: endpoint {
- remote-endpoint = <&db_dvi1_in>;
- };
- };
- };
- };
- ...
-};
diff --git a/Documentation/devicetree/bindings/display/arm,komeda.yaml b/Documentation/devicetree/bindings/display/arm,komeda.yaml
new file mode 100644
index 000000000000..3ad3eef89ca8
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/arm,komeda.yaml
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/arm,komeda.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm Komeda display processor
+
+maintainers:
+ - Liviu Dudau <Liviu.Dudau@arm.com>
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+ The Arm Mali D71 display processor supports up to two displays with up
+ to a 4K resolution each. Each pipeline can be composed of up to four
+ layers. It is typically connected to a digital display connector like HDMI.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: arm,mali-d32
+ - const: arm,mali-d71
+ - const: arm,mali-d71
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clock-names:
+ const: aclk
+
+ clocks:
+ maxItems: 1
+ description: The main DPU processor clock
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ memory-region:
+ maxItems: 1
+ description:
+ Phandle to a node describing memory to be used for the framebuffer.
+ If not present, the framebuffer may be located anywhere in memory.
+
+ iommus:
+ description:
+ The stream IDs for each of the used pipelines, each four IDs for the
+ four layers, plus one for the write-back stream.
+ minItems: 5
+ maxItems: 10
+
+patternProperties:
+ '^pipeline@[01]$':
+ type: object
+ additionalProperties: false
+    description:
+      One display output pipeline of the processor.
+
+ properties:
+ reg:
+ enum: [ 0, 1 ]
+
+ clock-names:
+ const: pxclk
+
+ clocks:
+ maxItems: 1
+ description: The input reference for the pixel clock.
+
+ port:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+
+additionalProperties: false
+
+required:
+ - "#address-cells"
+ - "#size-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clock-names
+ - clocks
+ - pipeline@0
+
+examples:
+ - |
+ display@c00000 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "arm,mali-d71";
+ reg = <0xc00000 0x20000>;
+ interrupts = <168>;
+ clocks = <&dpu_aclk>;
+ clock-names = "aclk";
+ iommus = <&smmu 0>, <&smmu 1>, <&smmu 2>, <&smmu 3>,
+ <&smmu 8>,
+ <&smmu 4>, <&smmu 5>, <&smmu 6>, <&smmu 7>,
+ <&smmu 9>;
+
+ dp0_pipe0: pipeline@0 {
+ clocks = <&fpgaosc2>;
+ clock-names = "pxclk";
+ reg = <0>;
+
+ port {
+ dp0_pipe0_out: endpoint {
+ remote-endpoint = <&db_dvi0_in>;
+ };
+ };
+ };
+
+ dp0_pipe1: pipeline@1 {
+ clocks = <&fpgaosc2>;
+ clock-names = "pxclk";
+ reg = <1>;
+
+ port {
+ dp0_pipe1_out: endpoint {
+ remote-endpoint = <&db_dvi1_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/arm,malidp.txt b/Documentation/devicetree/bindings/display/arm,malidp.txt
deleted file mode 100644
index 7a97a2b48c2a..000000000000
--- a/Documentation/devicetree/bindings/display/arm,malidp.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-ARM Mali-DP
-
-The following bindings apply to a family of Display Processors sold as
-licensable IP by ARM Ltd. The bindings describe the Mali DP500, DP550 and
-DP650 processors that offer multiple composition layers, support for
-rotation and scaling output.
-
-Required properties:
- - compatible: should be one of
- "arm,mali-dp500"
- "arm,mali-dp550"
- "arm,mali-dp650"
- depending on the particular implementation present in the hardware
- - reg: Physical base address and size of the block of registers used by
- the processor.
- - interrupts: Interrupt list, as defined in ../interrupt-controller/interrupts.txt,
- interrupt client nodes.
- - interrupt-names: name of the engine inside the processor that will
- use the corresponding interrupt. Should be one of "DE" or "SE".
- - clocks: A list of phandle + clock-specifier pairs, one for each entry
- in 'clock-names'
- - clock-names: A list of clock names. It should contain:
- - "pclk": for the APB interface clock
- - "aclk": for the AXI interface clock
- - "mclk": for the main processor clock
- - "pxlclk": for the pixel clock feeding the output PLL of the processor.
- - arm,malidp-output-port-lines: Array of u8 values describing the number
- of output lines per channel (R, G and B).
-
-Required sub-nodes:
- - port: The Mali DP connection to an encoder input port. The connection
- is modelled using the OF graph bindings specified in
- Documentation/devicetree/bindings/graph.txt
-
-Optional properties:
- - memory-region: phandle to a node describing memory (see
- Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt)
- to be used for the framebuffer; if not present, the framebuffer may
- be located anywhere in memory.
- - arm,malidp-arqos-high-level: integer of u32 value describing the ARQoS
- levels of DP500's QoS signaling.
-
-
-Example:
-
-/ {
- ...
-
- dp0: malidp@6f200000 {
- compatible = "arm,mali-dp650";
- reg = <0 0x6f200000 0 0x20000>;
- memory-region = <&display_reserved>;
- interrupts = <0 168 IRQ_TYPE_LEVEL_HIGH>,
- <0 168 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "DE", "SE";
- clocks = <&oscclk2>, <&fpgaosc0>, <&fpgaosc1>, <&fpgaosc1>;
- clock-names = "pxlclk", "mclk", "aclk", "pclk";
- arm,malidp-output-port-lines = /bits/ 8 <8 8 8>;
- arm,malidp-arqos-high-level = <0xd000d000>;
- port {
- dp0_output: endpoint {
- remote-endpoint = <&tda998x_2_input>;
- };
- };
- };
-
- ...
-};
diff --git a/Documentation/devicetree/bindings/display/arm,malidp.yaml b/Documentation/devicetree/bindings/display/arm,malidp.yaml
new file mode 100644
index 000000000000..91812573fd08
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/arm,malidp.yaml
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/arm,malidp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm Mali Display Processor (Mali-DP)
+
+maintainers:
+ - Liviu Dudau <Liviu.Dudau@arm.com>
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+ The following bindings apply to a family of Display Processors sold as
+ licensable IP by ARM Ltd. The bindings describe the Mali DP500, DP550 and
+ DP650 processors that offer multiple composition layers, support for
+ rotation and scaling output.
+
+properties:
+ compatible:
+ enum:
+ - arm,mali-dp500
+ - arm,mali-dp550
+ - arm,mali-dp650
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ items:
+ - description:
+ The interrupt used by the Display Engine (DE). Can be shared with
+ the interrupt for the Scaling Engine (SE), but it will have to be
+ listed individually.
+ - description:
+ The interrupt used by the Scaling Engine (SE). Can be shared with
+ the interrupt for the Display Engine (DE), but it will have to be
+ listed individually.
+
+ interrupt-names:
+ items:
+ - const: DE
+ - const: SE
+
+ clock-names:
+ items:
+ - const: pxlclk
+ - const: mclk
+ - const: aclk
+ - const: pclk
+
+ clocks:
+ items:
+ - description: the pixel clock feeding the output PLL of the processor
+ - description: the main processor clock
+ - description: the AXI interface clock
+ - description: the APB interface clock
+
+ memory-region:
+ maxItems: 1
+ description:
+ Phandle to a node describing memory to be used for the framebuffer.
+ If not present, the framebuffer may be located anywhere in memory.
+
+ arm,malidp-output-port-lines:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description:
+ Number of output lines/bits for each colour channel.
+ items:
+ - description: number of output lines for the red channel (R)
+ - description: number of output lines for the green channel (G)
+ - description: number of output lines for the blue channel (B)
+
+ arm,malidp-arqos-value:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Quality-of-Service value for the display engine FIFOs, to write
+ into the RQOS register of the DP500.
+ See the ARM Mali-DP500 TRM for details on the encoding.
+ If omitted, the RQOS register will not be changed.
+
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+ unevaluatedProperties: false
+ description:
+ Output endpoint of the controller, connecting the LCD panel signals.
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - interrupt-names
+ - clocks
+ - clock-names
+ - port
+ - arm,malidp-output-port-lines
+
+examples:
+ - |
+ dp0: malidp@6f200000 {
+ compatible = "arm,mali-dp650";
+ reg = <0x6f200000 0x20000>;
+ memory-region = <&display_reserved>;
+ interrupts = <168>, <168>;
+ interrupt-names = "DE", "SE";
+ clocks = <&oscclk2>, <&fpgaosc0>, <&fpgaosc1>, <&fpgaosc1>;
+ clock-names = "pxlclk", "mclk", "aclk", "pclk";
+ arm,malidp-output-port-lines = /bits/ 8 <8 8 8>;
+ arm,malidp-arqos-value = <0xd000d000>;
+
+ port {
+ dp0_output: endpoint {
+ remote-endpoint = <&tda998x_2_input>;
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/arm,pl11x.txt b/Documentation/devicetree/bindings/display/arm,pl11x.txt
deleted file mode 100644
index 3f977e72a200..000000000000
--- a/Documentation/devicetree/bindings/display/arm,pl11x.txt
+++ /dev/null
@@ -1,110 +0,0 @@
-* ARM PrimeCell Color LCD Controller PL110/PL111
-
-See also Documentation/devicetree/bindings/arm/primecell.yaml
-
-Required properties:
-
-- compatible: must be one of:
- "arm,pl110", "arm,primecell"
- "arm,pl111", "arm,primecell"
-
-- reg: base address and size of the control registers block
-
-- interrupt-names: either the single entry "combined" representing a
- combined interrupt output (CLCDINTR), or the four entries
- "mbe", "vcomp", "lnbu", "fuf" representing the individual
- CLCDMBEINTR, CLCDVCOMPINTR, CLCDLNBUINTR, CLCDFUFINTR interrupts
-
-- interrupts: contains an interrupt specifier for each entry in
- interrupt-names
-
-- clock-names: should contain "clcdclk" and "apb_pclk"
-
-- clocks: contains phandle and clock specifier pairs for the entries
- in the clock-names property. See
- Documentation/devicetree/bindings/clock/clock-bindings.txt
-
-Optional properties:
-
-- memory-region: phandle to a node describing memory (see
- Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt)
- to be used for the framebuffer; if not present, the framebuffer
- may be located anywhere in the memory
-
-- max-memory-bandwidth: maximum bandwidth in bytes per second that the
- cell's memory interface can handle; if not present, the memory
- interface is fast enough to handle all possible video modes
-
-Required sub-nodes:
-
-- port: describes LCD panel signals, following the common binding
- for video transmitter interfaces; see
- Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Deprecated properties:
-  The port's endpoint subnode used to have the following, now deprecated,
-  property. Drivers should be able to survive without it:
-
- - arm,pl11x,tft-r0g0b0-pads: an array of three 32-bit values,
- defining the way CLD pads are wired up; first value
- contains index of the "CLD" external pin (pad) used
- as R0 (first bit of the red component), second value
- index of the pad used as G0, third value index of the
- pad used as B0, see also "LCD panel signal multiplexing
- details" paragraphs in the PL110/PL111 Technical
- Reference Manuals; this implicitly defines available
- color modes, for example:
- - PL111 TFT 4:4:4 panel:
- arm,pl11x,tft-r0g0b0-pads = <4 15 20>;
- - PL110 TFT (1:)5:5:5 panel:
- arm,pl11x,tft-r0g0b0-pads = <1 7 13>;
- - PL111 TFT (1:)5:5:5 panel:
- arm,pl11x,tft-r0g0b0-pads = <3 11 19>;
- - PL111 TFT 5:6:5 panel:
- arm,pl11x,tft-r0g0b0-pads = <3 10 19>;
- - PL110 and PL111 TFT 8:8:8 panel:
- arm,pl11x,tft-r0g0b0-pads = <0 8 16>;
- - PL110 and PL111 TFT 8:8:8 panel, R & B components swapped:
- arm,pl11x,tft-r0g0b0-pads = <16 8 0>;
-
-
-Example:
-
- clcd@10020000 {
- compatible = "arm,pl111", "arm,primecell";
- reg = <0x10020000 0x1000>;
- interrupt-names = "combined";
- interrupts = <0 44 4>;
- clocks = <&oscclk1>, <&oscclk2>;
- clock-names = "clcdclk", "apb_pclk";
- max-memory-bandwidth = <94371840>; /* Bps, 1024x768@60 16bpp */
-
- port {
- clcd_pads: endpoint {
- remote-endpoint = <&clcd_panel>;
- };
- };
-
- };
-
- panel {
- compatible = "panel-dpi";
-
- port {
- clcd_panel: endpoint {
- remote-endpoint = <&clcd_pads>;
- };
- };
-
- panel-timing {
- clock-frequency = <25175000>;
- hactive = <640>;
- hback-porch = <40>;
- hfront-porch = <24>;
- hsync-len = <96>;
- vactive = <480>;
- vback-porch = <32>;
- vfront-porch = <11>;
- vsync-len = <2>;
- };
- };
diff --git a/Documentation/devicetree/bindings/display/arm,pl11x.yaml b/Documentation/devicetree/bindings/display/arm,pl11x.yaml
new file mode 100644
index 000000000000..6cc9045e5c68
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/arm,pl11x.yaml
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/arm,pl11x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm PrimeCell Color LCD Controller PL110/PL111
+
+maintainers:
+ - Liviu Dudau <Liviu.Dudau@arm.com>
+ - Andre Przywara <andre.przywara@arm.com>
+
+description:
+  The Arm PrimeCell PL110/PL111 is an LCD controller IP that scans out
+ a framebuffer region in system memory, and creates timed signals for
+ a variety of LCD panels.
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - arm,pl110
+ - arm,pl111
+ required:
+ - compatible
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - arm,pl110
+ - arm,pl111
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ interrupt-names:
+ oneOf:
+ - const: combined
+ description:
+ The IP provides four individual interrupt lines, but also one
+ combined line. If the integration only connects this line to the
+ interrupt controller, this single interrupt is noted here.
+ - items:
+ - const: mbe # CLCDMBEINTR
+ - const: vcomp # CLCDVCOMPINTR
+ - const: lnbu # CLCDLNBUINTR
+ - const: fuf # CLCDFUFINTR
+
+ interrupts:
+ minItems: 1
+ maxItems: 4
+
+ clock-names:
+ items:
+ - const: clcdclk
+ - const: apb_pclk
+
+ clocks:
+ items:
+ - description: The CLCDCLK reference clock for the controller.
+ - description: The HCLK AHB slave clock for the register access.
+
+ memory-region:
+ maxItems: 1
+ description:
+ Phandle to a node describing memory to be used for the framebuffer.
+ If not present, the framebuffer may be located anywhere in memory.
+
+ max-memory-bandwidth:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Maximum bandwidth in bytes per second that the cell's memory interface
+ can handle.
+ If not present, the memory interface is fast enough to handle all
+ possible video modes.
+
+ port:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ additionalProperties: false
+
+ description:
+ Output endpoint of the controller, connecting the LCD panel signals.
+
+ properties:
+ endpoint:
+ $ref: /schemas/graph.yaml#/$defs/endpoint-base
+ unevaluatedProperties: false
+
+ properties:
+ arm,pl11x,tft-r0g0b0-pads:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ - description: index of CLD pad used for first red bit (R0)
+ - description: index of CLD pad used for first green bit (G0)
+              - description: index of CLD pad used for first blue bit (B0)
+ deprecated: true
+ description: |
+ DEPRECATED. An array of three 32-bit values, defining the way
+ CLD[23:0] pads are wired up.
+ The first value contains the index of the "CLD" external pin (pad)
+ used as R0 (first bit of the red component), the second value for
+ green, the third value for blue.
+ See also "LCD panel signal multiplexing details" paragraphs in the
+ PL110/PL111 Technical Reference Manuals.
+ This implicitly defines available color modes, for example:
+ - PL111 TFT 4:4:4 panel:
+ arm,pl11x,tft-r0g0b0-pads = <4 15 20>;
+ - PL110 TFT (1:)5:5:5 panel:
+ arm,pl11x,tft-r0g0b0-pads = <1 7 13>;
+ - PL111 TFT (1:)5:5:5 panel:
+ arm,pl11x,tft-r0g0b0-pads = <3 11 19>;
+ - PL111 TFT 5:6:5 panel:
+ arm,pl11x,tft-r0g0b0-pads = <3 10 19>;
+ - PL110 and PL111 TFT 8:8:8 panel:
+ arm,pl11x,tft-r0g0b0-pads = <0 8 16>;
+ - PL110 and PL111 TFT 8:8:8 panel, R & B components swapped:
+ arm,pl11x,tft-r0g0b0-pads = <16 8 0>;
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clock-names
+ - clocks
+ - port
+
+allOf:
+ - if:
+ properties:
+ interrupts:
+ minItems: 2
+ required:
+ - interrupts
+ then:
+ required:
+ - interrupt-names
+
+examples:
+ - |
+ clcd@10020000 {
+ compatible = "arm,pl111", "arm,primecell";
+ reg = <0x10020000 0x1000>;
+ interrupt-names = "combined";
+ interrupts = <44>;
+ clocks = <&oscclk1>, <&oscclk2>;
+ clock-names = "clcdclk", "apb_pclk";
+ max-memory-bandwidth = <94371840>; /* Bps, 1024x768@60 16bpp */
+
+ port {
+ clcd_pads: endpoint {
+ remote-endpoint = <&clcd_panel>;
+ };
+ };
+ };
+
+ panel {
+ compatible = "arm,rtsm-display";
+
+ port {
+ clcd_panel: endpoint {
+ remote-endpoint = <&clcd_pads>;
+ };
+ };
+ };
+...
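
The example above uses the single combined interrupt. A board that wires
up the four individual lines would instead follow the second branch of
interrupt-names, roughly as in this sketch (the interrupt numbers and
panel label are illustrative assumptions):

    clcd@10020000 {
        compatible = "arm,pl110", "arm,primecell";
        reg = <0x10020000 0x1000>;
        /* one interrupt specifier per named line, in the same order */
        interrupt-names = "mbe", "vcomp", "lnbu", "fuf";
        interrupts = <41>, <42>, <43>, <44>;
        clocks = <&oscclk1>, <&oscclk2>;
        clock-names = "clcdclk", "apb_pclk";

        port {
            endpoint {
                remote-endpoint = <&panel_in>;
            };
        };
    };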
diff --git a/Documentation/devicetree/bindings/display/atmel,lcdc.txt b/Documentation/devicetree/bindings/display/atmel,lcdc.txt
index acb5a0132127..b5e355ada2fa 100644
--- a/Documentation/devicetree/bindings/display/atmel,lcdc.txt
+++ b/Documentation/devicetree/bindings/display/atmel,lcdc.txt
@@ -9,7 +9,6 @@ Required properties:
"atmel,at91sam9g45-lcdc" ,
"atmel,at91sam9g45es-lcdc" ,
"atmel,at91sam9rl-lcdc" ,
- "atmel,at32ap-lcdc"
- reg : Should contain 1 register ranges(address and length).
Can contain an additional register range(address and length)
for fixed framebuffer memory. Useful for dedicated memories.
diff --git a/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml b/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml
index a1d5a32660e0..5b35adf34c7b 100644
--- a/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/brcm,bcm2711-hdmi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom BCM2711 HDMI Controller Device Tree Bindings
+title: Broadcom BCM2711 HDMI Controller
maintainers:
- Eric Anholt <eric@anholt.net>
@@ -72,8 +72,7 @@ properties:
- const: hpd-removed
ddc:
- allOf:
- - $ref: /schemas/types.yaml#/definitions/phandle
+ $ref: /schemas/types.yaml#/definitions/phandle
description: >
Phandle of the I2C controller used for DDC EDID probing
diff --git a/Documentation/devicetree/bindings/display/brcm,bcm2835-dsi0.yaml b/Documentation/devicetree/bindings/display/brcm,bcm2835-dsi0.yaml
index 32608578a352..c8b2459d64f6 100644
--- a/Documentation/devicetree/bindings/display/brcm,bcm2835-dsi0.yaml
+++ b/Documentation/devicetree/bindings/display/brcm,bcm2835-dsi0.yaml
@@ -47,6 +47,9 @@ properties:
interrupts:
maxItems: 1
+ power-domains:
+ maxItems: 1
+
required:
- "#clock-cells"
- compatible
diff --git a/Documentation/devicetree/bindings/display/brcm,bcm2835-hdmi.yaml b/Documentation/devicetree/bindings/display/brcm,bcm2835-hdmi.yaml
index 031e35e76db2..48c8cad0d96d 100644
--- a/Documentation/devicetree/bindings/display/brcm,bcm2835-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/brcm,bcm2835-hdmi.yaml
@@ -51,6 +51,9 @@ properties:
dma-names:
const: audio-rx
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/display/brcm,bcm2835-v3d.yaml b/Documentation/devicetree/bindings/display/brcm,bcm2835-v3d.yaml
index 8a73780f573d..c55a8217de25 100644
--- a/Documentation/devicetree/bindings/display/brcm,bcm2835-v3d.yaml
+++ b/Documentation/devicetree/bindings/display/brcm,bcm2835-v3d.yaml
@@ -24,6 +24,9 @@ properties:
interrupts:
maxItems: 1
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/display/brcm,bcm2835-vec.yaml b/Documentation/devicetree/bindings/display/brcm,bcm2835-vec.yaml
index 9b24081a0dbd..5d921e30394e 100644
--- a/Documentation/devicetree/bindings/display/brcm,bcm2835-vec.yaml
+++ b/Documentation/devicetree/bindings/display/brcm,bcm2835-vec.yaml
@@ -24,6 +24,9 @@ properties:
interrupts:
maxItems: 1
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/display/bridge/adi,adv7511.yaml b/Documentation/devicetree/bindings/display/bridge/adi,adv7511.yaml
index d3dd7a79b909..5bbe81862c8f 100644
--- a/Documentation/devicetree/bindings/display/bridge/adi,adv7511.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/adi,adv7511.yaml
@@ -76,9 +76,8 @@ properties:
adi,input-depth:
description: Number of bits per color component at the input.
- allOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- - enum: [ 8, 10, 12 ]
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [ 8, 10, 12 ]
adi,input-colorspace:
description: Input color space.
@@ -118,23 +117,21 @@ properties:
ports:
description:
- The ADV7511(W)/13 has two video ports and one audio port. This node
- models their connections as documented in
- Documentation/devicetree/bindings/media/video-interfaces.txt
- Documentation/devicetree/bindings/graph.txt
- type: object
+ The ADV7511(W)/13 has two video ports and one audio port.
+ $ref: /schemas/graph.yaml#/properties/ports
+
properties:
port@0:
description: Video port for the RGB or YUV input.
- type: object
+ $ref: /schemas/graph.yaml#/properties/port
port@1:
description: Video port for the HDMI output.
- type: object
+ $ref: /schemas/graph.yaml#/properties/port
port@2:
description: Audio port for the HDMI output.
- type: object
+ $ref: /schemas/graph.yaml#/properties/port
# adi,input-colorspace and adi,input-clock are required except in
# "rgb 1x" and "yuv444 1x" modes, in which case they must not be
diff --git a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml
index f36209137c8a..987aa83c2649 100644
--- a/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/adi,adv7533.yaml
@@ -91,25 +91,23 @@ properties:
ports:
description:
- The ADV7533/35 has two video ports and one audio port. This node
- models their connections as documented in
- Documentation/devicetree/bindings/media/video-interfaces.txt
- Documentation/devicetree/bindings/graph.txt
- type: object
+ The ADV7533/35 has two video ports and one audio port.
+ $ref: /schemas/graph.yaml#/properties/ports
+
properties:
port@0:
description:
Video port for the DSI input. The remote endpoint phandle
should be a reference to a valid mipi_dsi_host_device.
- type: object
+ $ref: /schemas/graph.yaml#/properties/port
port@1:
description: Video port for the HDMI output.
- type: object
+ $ref: /schemas/graph.yaml#/properties/port
port@2:
description: Audio port for the HDMI output.
- type: object
+ $ref: /schemas/graph.yaml#/properties/port
required:
- compatible
diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
index ab48ab2f4240..a1ed1004651b 100644
--- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml
@@ -2,8 +2,8 @@
# Copyright 2019 Analogix Semiconductor, Inc.
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/bridge/analogix,anx7625.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/bridge/analogix,anx7625.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Analogix ANX7625 SlimPort (4K Mobile HD Transmitter)
@@ -16,8 +16,7 @@ description: |
properties:
compatible:
- items:
- - const: analogix,anx7625
+ const: analogix,anx7625
reg:
maxItems: 1
@@ -43,14 +42,73 @@ properties:
vdd33-supply:
description: Regulator that provides the supply 3.3V power.
+ analogix,lane0-swing:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ minItems: 1
+ maxItems: 20
+ description:
+ An array of swing register settings for the DP TX lane0 PHY.
+ Registers 0~9 are Swing0_Pre0, Swing1_Pre0, Swing2_Pre0,
+ Swing3_Pre0, Swing0_Pre1, Swing1_Pre1, Swing2_Pre1, Swing0_Pre2,
+ Swing1_Pre2, Swing0_Pre3; they hold the [Boost control] and
+ [Swing control] settings.
+ In registers 0~9, bits 3:0 are [Boost control]; these bits set
+ the post cursor manually. Increase [Boost control] to increase
+ the pre-emphasis value.
+ In registers 0~9, bits 6:4 are [Swing control]; these bits set
+ the swing manually. Increase the [Swing control] setting to add
+ Vp-p for each Swing/Pre combination.
+ Registers 10~19 are Swing0_Pre0, Swing1_Pre0, Swing2_Pre0,
+ Swing3_Pre0, Swing0_Pre1, Swing1_Pre1, Swing2_Pre1, Swing0_Pre2,
+ Swing1_Pre2, Swing0_Pre3; they hold the [R select control] and
+ [R termination control] settings.
+ In registers 10~19, bits 4:0 are [R select control]; these bits
+ set the compensation manually. Increasing them enhances the IO
+ drive strength and Vp-p.
+ In registers 10~19, bits 6:5 are [R termination control]; they
+ adjust the 50 ohm impedance of the DP TX termination. 00: 55 ohm,
+ 01: 50 ohm (default), 10: 45 ohm, 11: 40 ohm.
+
+ analogix,lane1-swing:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ minItems: 1
+ maxItems: 20
+ description:
+ An array of swing register settings for the DP TX lane1 PHY.
+ The lane1 swing registers use the same layout as the lane0 ones;
+ see the analogix,lane0-swing property description.
+
+ analogix,audio-enable:
+ type: boolean
+ description: Let the driver enable the HDMI audio codec function.
+
+ aux-bus:
+ $ref: /schemas/display/dp-aux-bus.yaml#
+
ports:
$ref: /schemas/graph.yaml#/properties/ports
properties:
port@0:
- $ref: /schemas/graph.yaml#/properties/port
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
description:
- Video port for MIPI DSI input.
+ MIPI DSI/DPI input.
+
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ type: object
+ additionalProperties: false
+
+ properties:
+ remote-endpoint: true
+
+ bus-type:
+ enum: [7]
+ default: 1
+
+ data-lanes: true
port@1:
$ref: /schemas/graph.yaml#/properties/port
@@ -75,7 +133,7 @@ examples:
- |
#include <dt-bindings/gpio/gpio.h>
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
@@ -87,6 +145,9 @@ examples:
vdd10-supply = <&pp1000_mipibrdg>;
vdd18-supply = <&pp1800_mipibrdg>;
vdd33-supply = <&pp3300_mipibrdg>;
+ analogix,audio-enable;
+ analogix,lane0-swing = /bits/ 8 <0x14 0x54 0x64 0x74>;
+ analogix,lane1-swing = /bits/ 8 <0x14 0x54 0x64 0x74>;
ports {
#address-cells = <1>;
@@ -96,6 +157,8 @@ examples:
reg = <0>;
anx7625_in: endpoint {
remote-endpoint = <&mipi_dsi>;
+ bus-type = <7>;
+ data-lanes = <0 1 2 3>;
};
};
@@ -106,5 +169,19 @@ examples:
};
};
};
+
+ aux-bus {
+ panel {
+ compatible = "innolux,n125hce-gn1";
+ power-supply = <&pp3300_disp_x>;
+ backlight = <&backlight_lcd0>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&anx7625_out>;
+ };
+ };
+ };
+ };
};
};
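
As a worked example of the swing register encoding described above, a single
byte such as 0x54 (a value chosen purely for illustration) decodes under that
layout as bits 3:0 = 0x4 for [Boost control] and bits 6:4 = 0x5 for
[Swing control]:

	/* 0x54: swing level 5 (bits 6:4), boost level 4 (bits 3:0) */
	analogix,lane0-swing = /bits/ 8 <0x54>;
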
diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml
index 8e13f27b28ed..4a5e5d9d6f90 100644
--- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml
@@ -7,7 +7,9 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Analogix ANX7814 SlimPort (Full-HD Transmitter)
maintainers:
- - Enric Balletbo i Serra <enric.balletbo@collabora.com>
+ - Andrzej Hajda <andrzej.hajda@intel.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
+ - Robert Foss <robert.foss@linaro.org>
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,dp.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,dp.yaml
new file mode 100644
index 000000000000..c9b06885cc63
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/analogix,dp.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/analogix,dp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analogix Display Port bridge
+
+maintainers:
+ - Rob Herring <robh@kernel.org>
+
+properties:
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks: true
+
+ clock-names: true
+
+ phys: true
+
+ phy-names:
+ const: dp
+
+ force-hpd:
+ description:
+ Indicates that the driver must force HPD when HPD detection
+ fails; used for some eDP screens that do not have an HPD signal.
+
+ hpd-gpios:
+ description:
+ Hotplug detect GPIO.
+ Indicates which GPIO should be used for hotplug detection.
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Input node to receive pixel data.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Port node with one endpoint connected to a dp-connector node.
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - reg
+ - interrupts
+ - clock-names
+ - clocks
+ - ports
+
+additionalProperties: true
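
The new schema carries no example of its own; a minimal node sketch that
satisfies its required properties, reusing values from the text binding
removed below (the phandles are hypothetical):

	dp-controller@145b0000 {
		/* compatible comes from the SoC-specific binding, e.g. */
		compatible = "samsung,exynos5-dp";
		reg = <0x145b0000 0x10000>;
		interrupts = <10 3>;
		interrupt-parent = <&combiner>;
		clocks = <&clock 342>;
		clock-names = "dp";
		phys = <&dp_phy>;
		phy-names = "dp";

		ports {
			#address-cells = <1>;
			#size-cells = <0>;

			port@0 {
				reg = <0>;
				dp_in: endpoint {
					remote-endpoint = <&fimd_out>;
				};
			};

			port@1 {
				reg = <1>;
				dp_out: endpoint {
					remote-endpoint = <&dp_connector_in>;
				};
			};
		};
	};
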
diff --git a/Documentation/devicetree/bindings/display/bridge/analogix_dp.txt b/Documentation/devicetree/bindings/display/bridge/analogix_dp.txt
deleted file mode 100644
index 027d76c27a41..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/analogix_dp.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-Analogix Display Port bridge bindings
-
-Required properties for dp-controller:
- -compatible:
- platform specific such as:
- * "samsung,exynos5-dp"
- * "rockchip,rk3288-dp"
- * "rockchip,rk3399-edp"
- -reg:
- physical base address of the controller and length
- of memory mapped region.
- -interrupts:
- interrupt combiner values.
- -clocks:
- from common clock binding: handle to dp clock.
- -clock-names:
- from common clock binding: Shall be "dp".
- -phys:
- from general PHY binding: the phandle for the PHY device.
- -phy-names:
- from general PHY binding: Should be "dp".
-
-Optional properties for dp-controller:
- -force-hpd:
- Indicate driver need force hpd when hpd detect failed, this
- is used for some eDP screen which don't have hpd signal.
- -hpd-gpios:
- Hotplug detect GPIO.
- Indicates which GPIO should be used for hotplug detection
- -port@[X]: SoC specific port nodes with endpoint definitions as defined
- in Documentation/devicetree/bindings/media/video-interfaces.txt,
- please refer to the SoC specific binding document:
- * Documentation/devicetree/bindings/display/exynos/exynos_dp.txt
- * Documentation/devicetree/bindings/display/rockchip/analogix_dp-rockchip.txt
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
--------------------------------------------------------------------------------
-
-Example:
-
- dp-controller {
- compatible = "samsung,exynos5-dp";
- reg = <0x145b0000 0x10000>;
- interrupts = <10 3>;
- interrupt-parent = <&combiner>;
- clocks = <&clock 342>;
- clock-names = "dp";
-
- phys = <&dp_phy>;
- phy-names = "dp";
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/anx6345.yaml b/Documentation/devicetree/bindings/display/bridge/anx6345.yaml
index 1c0406c38fe5..514f58852990 100644
--- a/Documentation/devicetree/bindings/display/bridge/anx6345.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/anx6345.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/bridge/anx6345.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Analogix ANX6345 eDP Transmitter Device Tree Bindings
+title: Analogix ANX6345 eDP Transmitter
maintainers:
- Torsten Duwe <duwe@lst.de>
@@ -61,7 +61,7 @@ additionalProperties: false
examples:
- |
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/display/bridge/cdns,dsi.txt b/Documentation/devicetree/bindings/display/bridge/cdns,dsi.txt
deleted file mode 100644
index 525a4bfd8634..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/cdns,dsi.txt
+++ /dev/null
@@ -1,112 +0,0 @@
-Cadence DSI bridge
-==================
-
-The Cadence DSI bridge is a DPI to DSI bridge supporting up to 4 DSI lanes.
-
-Required properties:
-- compatible: should be set to "cdns,dsi".
-- reg: physical base address and length of the controller's registers.
-- interrupts: interrupt line connected to the DSI bridge.
-- clocks: DSI bridge clocks.
-- clock-names: must contain "dsi_p_clk" and "dsi_sys_clk".
-- phys: phandle link to the MIPI D-PHY controller.
-- phy-names: must contain "dphy".
-- #address-cells: must be set to 1.
-- #size-cells: must be set to 0.
-
-Optional properties:
-- resets: DSI reset lines.
-- reset-names: can contain "dsi_p_rst".
-
-Required subnodes:
-- ports: Ports as described in Documentation/devicetree/bindings/graph.txt.
- 2 ports are available:
- * port 0: this port is only needed if some of your DSI devices are
- controlled through an external bus like I2C or SPI. Can have at
- most 4 endpoints. The endpoint number is directly encoding the
- DSI virtual channel used by this device.
- * port 1: represents the DPI input.
- Other ports will be added later to support the new kind of inputs.
-
-- one subnode per DSI device connected on the DSI bus. Each DSI device should
- contain a reg property encoding its virtual channel.
-
-Example:
- dsi0: dsi@fd0c0000 {
- compatible = "cdns,dsi";
- reg = <0x0 0xfd0c0000 0x0 0x1000>;
- clocks = <&pclk>, <&sysclk>;
- clock-names = "dsi_p_clk", "dsi_sys_clk";
- interrupts = <1>;
- phys = <&dphy0>;
- phy-names = "dphy";
- #address-cells = <1>;
- #size-cells = <0>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@1 {
- reg = <1>;
- dsi0_dpi_input: endpoint {
- remote-endpoint = <&xxx_dpi_output>;
- };
- };
- };
-
- panel: dsi-dev@0 {
- compatible = "<vendor,panel>";
- reg = <0>;
- };
- };
-
-or
-
- dsi0: dsi@fd0c0000 {
- compatible = "cdns,dsi";
- reg = <0x0 0xfd0c0000 0x0 0x1000>;
- clocks = <&pclk>, <&sysclk>;
- clock-names = "dsi_p_clk", "dsi_sys_clk";
- interrupts = <1>;
- phys = <&dphy1>;
- phy-names = "dphy";
- #address-cells = <1>;
- #size-cells = <0>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- dsi0_output: endpoint@0 {
- reg = <0>;
- remote-endpoint = <&dsi_panel_input>;
- };
- };
-
- port@1 {
- reg = <1>;
- dsi0_dpi_input: endpoint {
- remote-endpoint = <&xxx_dpi_output>;
- };
- };
- };
- };
-
- i2c@xxx {
- panel: panel@59 {
- compatible = "<vendor,panel>";
- reg = <0x59>;
-
- port {
- dsi_panel_input: endpoint {
- remote-endpoint = <&dsi0_output>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/cdns,dsi.yaml b/Documentation/devicetree/bindings/display/bridge/cdns,dsi.yaml
new file mode 100644
index 000000000000..23060324d16e
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/cdns,dsi.yaml
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/cdns,dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Cadence DSI bridge
+
+maintainers:
+ - Boris Brezillon <boris.brezillon@bootlin.com>
+
+description: |
+ CDNS DSI is a bridge device which converts DPI to DSI.
+
+properties:
+ compatible:
+ enum:
+ - cdns,dsi
+ - ti,j721e-dsi
+
+ reg:
+ minItems: 1
+ items:
+ - description:
+ Register block for controller's registers.
+ - description:
+ Register block for wrapper settings registers in case of TI J7 SoCs.
+
+ clocks:
+ items:
+ - description: PSM clock, used by the IP
+ - description: sys clock, used by the IP
+
+ clock-names:
+ items:
+ - const: dsi_p_clk
+ - const: dsi_sys_clk
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ const: dphy
+
+ interrupts:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: dsi_p_rst
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Output port representing the DSI output. It can have
+ at most 4 endpoints. The endpoint number is directly encoding
+ the DSI virtual channel used by this device.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Input port representing the DPI input.
+
+ required:
+ - port@1
+
+allOf:
+ - $ref: ../dsi-controller.yaml#
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: ti,j721e-dsi
+ then:
+ properties:
+ reg:
+ minItems: 2
+ maxItems: 2
+ power-domains:
+ maxItems: 1
+ else:
+ properties:
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - phys
+ - phy-names
+ - ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ bus {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ dsi@fd0c0000 {
+ compatible = "cdns,dsi";
+ reg = <0x0 0xfd0c0000 0x0 0x1000>;
+ clocks = <&pclk>, <&sysclk>;
+ clock-names = "dsi_p_clk", "dsi_sys_clk";
+ interrupts = <1>;
+ phys = <&dphy0>;
+ phy-names = "dphy";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&xxx_dpi_output>;
+ };
+ };
+ };
+
+ panel@0 {
+ compatible = "panasonic,vvx10f034n00";
+ reg = <0>;
+ power-supply = <&vcc_lcd_reg>;
+ };
+ };
+ };
+
+ - |
+ bus {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ dsi@fd0c0000 {
+ compatible = "cdns,dsi";
+ reg = <0x0 0xfd0c0000 0x0 0x1000>;
+ clocks = <&pclk>, <&sysclk>;
+ clock-names = "dsi_p_clk", "dsi_sys_clk";
+ interrupts = <1>;
+ phys = <&dphy1>;
+ phy-names = "dphy";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&dsi_panel_input>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&xxx_dpi_output>;
+ };
+ };
+ };
+ };
+ };
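
The ti,j721e-dsi branch of the conditional above requires a second register
range for the wrapper settings and allows a power domain; a sketch of that
variant (the addresses and the power-domain specifier are hypothetical):

	dsi@4000000 {
		compatible = "ti,j721e-dsi";
		/* controller registers, then the J7 wrapper settings block */
		reg = <0x00 0x04000000 0x00 0x1000>,
		      <0x00 0x04080000 0x00 0x100>;
		clocks = <&pclk>, <&sysclk>;
		clock-names = "dsi_p_clk", "dsi_sys_clk";
		interrupts = <1>;
		phys = <&dphy0>;
		phy-names = "dphy";
		power-domains = <&k3_pds 220>;

		ports {
			#address-cells = <1>;
			#size-cells = <0>;

			port@1 {
				reg = <1>;
				endpoint {
					remote-endpoint = <&dpi_output>;
				};
			};
		};
	};
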
diff --git a/Documentation/devicetree/bindings/display/bridge/cdns,mhdp8546.yaml b/Documentation/devicetree/bindings/display/bridge/cdns,mhdp8546.yaml
index b2e8bc6da9d0..c2b369456e4e 100644
--- a/Documentation/devicetree/bindings/display/bridge/cdns,mhdp8546.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/cdns,mhdp8546.yaml
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/bridge/cdns,mhdp8546.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/bridge/cdns,mhdp8546.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Cadence MHDP8546 bridge
diff --git a/Documentation/devicetree/bindings/display/bridge/chipone,icn6211.yaml b/Documentation/devicetree/bindings/display/bridge/chipone,icn6211.yaml
index 62c3bd4cb28d..5fb54375aeb6 100644
--- a/Documentation/devicetree/bindings/display/bridge/chipone,icn6211.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/chipone,icn6211.yaml
@@ -24,6 +24,15 @@ properties:
maxItems: 1
description: virtual channel number of a DSI peripheral
+ clock-names:
+ const: refclk
+
+ clocks:
+ maxItems: 1
+ description: |
+ Optional external clock connected to the REF_CLK input.
+ The clock rate must be in the 10..154 MHz range.
+
enable-gpios:
description: Bridge EN pin, chip is reset when EN is low.
@@ -41,17 +50,32 @@ properties:
properties:
port@0:
- $ref: /schemas/graph.yaml#/properties/port
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
description:
Video port for MIPI DSI input
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+
+ properties:
+ data-lanes:
+ description: array of physical DSI data lane indexes.
+ minItems: 1
+ items:
+ - const: 1
+ - const: 2
+ - const: 3
+ - const: 4
+
port@1:
$ref: /schemas/graph.yaml#/properties/port
description:
Video port for MIPI DPI output (panel or connector).
required:
- - port@0
- port@1
required:
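
A sketch of how the refclk clock and the DSI data-lanes remapping added
above could be used together (the register address, clock and endpoint
phandles are hypothetical):

	bridge@2c {
		compatible = "chipone,icn6211";
		reg = <0x2c>;
		clocks = <&refclk_26m>;
		clock-names = "refclk";

		ports {
			#address-cells = <1>;
			#size-cells = <0>;

			port@0 {
				reg = <0>;
				bridge_in: endpoint {
					remote-endpoint = <&dsi_out>;
					/* identity mapping of the four DSI data lanes */
					data-lanes = <1 2 3 4>;
				};
			};

			port@1 {
				reg = <1>;
				bridge_out: endpoint {
					remote-endpoint = <&panel_in>;
				};
			};
		};
	};
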
diff --git a/Documentation/devicetree/bindings/display/bridge/chrontel,ch7033.yaml b/Documentation/devicetree/bindings/display/bridge/chrontel,ch7033.yaml
index bb6289c7d375..b0589fa16736 100644
--- a/Documentation/devicetree/bindings/display/bridge/chrontel,ch7033.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/chrontel,ch7033.yaml
@@ -5,7 +5,7 @@
$id: http://devicetree.org/schemas/display/bridge/chrontel,ch7033.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Chrontel CH7033 Video Encoder Device Tree Bindings
+title: Chrontel CH7033 Video Encoder
maintainers:
- Lubomir Rintel <lkundrak@v3.sk>
diff --git a/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-ldb.yaml b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-ldb.yaml
new file mode 100644
index 000000000000..94543006f5de
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-ldb.yaml
@@ -0,0 +1,173 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/fsl,imx8qxp-ldb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX8qm/qxp LVDS Display Bridge
+
+maintainers:
+ - Liu Ying <victor.liu@nxp.com>
+
+description: |
+ The Freescale i.MX8qm/qxp LVDS Display Bridge (LDB) has two channels.
+
+ The i.MX8qm/qxp LDB is controlled by the Control and Status Registers (CSR)
+ module. The CSR module, as a system controller, contains the LDB's
+ configuration registers.
+
+ For the i.MX8qxp LDB, each channel supports up to 24bpp parallel input
+ color format and can map the input to the VESA or JEIDA standards. The two
+ channels cannot be used simultaneously, so the user should pick one of
+ them. Two LDB channels from two LDB instances can work together in LDB
+ split mode to support a dual-link LVDS display. The channel indexes have
+ to be different: channel0 outputs odd pixels and channel1 outputs even
+ pixels.
+
+ For the i.MX8qm LDB, each channel additionally supports up to 30bpp
+ parallel input color format. The two channels can be used simultaneously,
+ either in dual mode or split mode. In dual mode, the two channels output
+ identical data. In split mode, channel0 outputs odd pixels and channel1
+ outputs even pixels.
+
+ As a side note, the i.MX8qm/qxp LDB is officially called the pixel mapper
+ in the SoC reference manuals. The pixel mapper reuses the logic of the
+ LDBs embedded in i.MX6qdl/sx SoCs, i.e. it is essentially based on them.
+ To keep the naming consistent, this binding calls it LDB.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8qm-ldb
+ - fsl,imx8qxp-ldb
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ clocks:
+ items:
+ - description: pixel clock
+ - description: bypass clock
+
+ clock-names:
+ items:
+ - const: pixel
+ - const: bypass
+
+ power-domains:
+ maxItems: 1
+
+ fsl,companion-ldb:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: |
+ A phandle which points to companion LDB which is used in LDB split mode.
+
+patternProperties:
+ "^channel@[0-1]$":
+ type: object
+ description: Represents a channel of LDB.
+
+ properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ reg:
+ description: The channel index.
+ enum: [ 0, 1 ]
+
+ phys:
+ description: A phandle to the phy module representing the LVDS PHY.
+ maxItems: 1
+
+ phy-names:
+ const: lvds_phy
+
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Input port of the channel.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Output port of the channel.
+
+ required:
+ - "#address-cells"
+ - "#size-cells"
+ - reg
+ - phys
+ - phy-names
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - "#address-cells"
+ - "#size-cells"
+ - clocks
+ - clock-names
+ - power-domains
+ - channel@0
+ - channel@1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,imx8qm-ldb
+ then:
+ properties:
+ fsl,companion-ldb: false
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/firmware/imx/rsrc.h>
+ ldb {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "fsl,imx8qxp-ldb";
+ clocks = <&clk IMX_SC_R_LVDS_0 IMX_SC_PM_CLK_MISC2>,
+ <&clk IMX_SC_R_LVDS_0 IMX_SC_PM_CLK_BYPASS>;
+ clock-names = "pixel", "bypass";
+ power-domains = <&pd IMX_SC_R_LVDS_0>;
+
+ channel@0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0>;
+ phys = <&mipi_lvds_0_phy>;
+ phy-names = "lvds_phy";
+
+ port@0 {
+ reg = <0>;
+
+ mipi_lvds_0_ldb_ch0_mipi_lvds_0_pxl2dpi: endpoint {
+ remote-endpoint = <&mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch0>;
+ };
+ };
+ };
+
+ channel@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <1>;
+ phys = <&mipi_lvds_0_phy>;
+ phy-names = "lvds_phy";
+
+ port@0 {
+ reg = <0>;
+
+ mipi_lvds_0_ldb_ch1_mipi_lvds_0_pxl2dpi: endpoint {
+ remote-endpoint = <&mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch1>;
+ };
+ };
+ };
+ };
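
The fsl,companion-ldb property, which the example above does not exercise,
pairs two LDB instances for split mode; a sketch of the linkage (the labels
are invented for illustration, and the clocks, power-domains and channel
nodes follow the example above):

	ldb1: ldb-a {
		compatible = "fsl,imx8qxp-ldb";
		/* clocks, power-domains, channel@0 as in the example above */
		fsl,companion-ldb = <&ldb2>;
	};

	ldb2: ldb-b {
		compatible = "fsl,imx8qxp-ldb";
		/* second instance; its channel index must differ (channel@1) */
	};
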
diff --git a/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-combiner.yaml b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-combiner.yaml
new file mode 100644
index 000000000000..50bae2122183
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-combiner.yaml
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/fsl,imx8qxp-pixel-combiner.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX8qm/qxp Pixel Combiner
+
+maintainers:
+ - Liu Ying <victor.liu@nxp.com>
+
+description: |
+ The Freescale i.MX8qm/qxp Pixel Combiner takes two output streams from a
+ single display controller and manipulates the two streams to support a number
+ of modes (bypass, pixel combine, YUV444 to YUV422, split_RGB) configured as
+ either one screen, two screens, or virtual screens. The pixel combiner is
+ also responsible for generating some of the control signals for the pixel link
+ output channel.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8qm-pixel-combiner
+ - fsl,imx8qxp-pixel-combiner
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: apb
+
+ power-domains:
+ maxItems: 1
+
+patternProperties:
+ "^channel@[0-1]$":
+ type: object
+ description: Represents a display stream of pixel combiner.
+
+ properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ reg:
+ description: The display stream index.
+ enum: [ 0, 1 ]
+
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Input endpoint of the display stream.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Output endpoint of the display stream.
+
+ required:
+ - "#address-cells"
+ - "#size-cells"
+ - reg
+ - port@0
+ - port@1
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - "#address-cells"
+ - "#size-cells"
+ - reg
+ - clocks
+ - clock-names
+ - power-domains
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/imx8-lpcg.h>
+ #include <dt-bindings/firmware/imx/rsrc.h>
+ pixel-combiner@56020000 {
+ compatible = "fsl,imx8qxp-pixel-combiner";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0x56020000 0x10000>;
+ clocks = <&dc0_pixel_combiner_lpcg IMX_LPCG_CLK_4>;
+ clock-names = "apb";
+ power-domains = <&pd IMX_SC_R_DC_0>;
+
+ channel@0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0>;
+
+ port@0 {
+ reg = <0>;
+
+ dc0_pixel_combiner_ch0_dc0_dpu_disp0: endpoint {
+ remote-endpoint = <&dc0_dpu_disp0_dc0_pixel_combiner_ch0>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ dc0_pixel_combiner_ch0_dc0_pixel_link0: endpoint {
+ remote-endpoint = <&dc0_pixel_link0_dc0_pixel_combiner_ch0>;
+ };
+ };
+ };
+
+ channel@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <1>;
+
+ port@0 {
+ reg = <0>;
+
+ dc0_pixel_combiner_ch1_dc0_dpu_disp1: endpoint {
+ remote-endpoint = <&dc0_dpu_disp1_dc0_pixel_combiner_ch1>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ dc0_pixel_combiner_ch1_dc0_pixel_link1: endpoint {
+ remote-endpoint = <&dc0_pixel_link1_dc0_pixel_combiner_ch1>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-link.yaml b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-link.yaml
new file mode 100644
index 000000000000..38ecc7926fad
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pixel-link.yaml
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/fsl,imx8qxp-pixel-link.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX8qm/qxp Display Pixel Link
+
+maintainers:
+ - Liu Ying <victor.liu@nxp.com>
+
+description: |
+ The Freescale i.MX8qm/qxp Display Pixel Link (DPL) forms a standard
+ asynchronous linkage between pixel sources (display controller or
+ camera module) and pixel consumers (imaging or displays).
+ It consists of two distinct functions: a pixel transfer function and a
+ control interface. Multiple pixel channels can exist per control channel.
+ This binding documentation is only for pixel links whose pixel sources are
+ display controllers.
+
+ The i.MX8qm/qxp Display Pixel Link is accessed via the System Controller
+ Unit (SCU) firmware.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8qm-dc-pixel-link
+ - fsl,imx8qxp-dc-pixel-link
+
+ fsl,dc-id:
+ $ref: /schemas/types.yaml#/definitions/uint8
+ description: |
+ u8 value representing the display controller index that the pixel link
+ connects to.
+
+ fsl,dc-stream-id:
+ $ref: /schemas/types.yaml#/definitions/uint8
+ description: |
+ u8 value representing the display controller stream index that the pixel
+ link connects to.
+ enum: [0, 1]
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The pixel link input port node from upstream video source.
+
+ patternProperties:
+ "^port@[1-4]$":
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The pixel link output port node to downstream bridge.
+
+ required:
+ - port@0
+ - port@1
+ - port@2
+ - port@3
+ - port@4
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,imx8qxp-dc-pixel-link
+ then:
+ properties:
+ fsl,dc-id:
+ const: 0
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,imx8qm-dc-pixel-link
+ then:
+ properties:
+ fsl,dc-id:
+ enum: [0, 1]
+
+required:
+ - compatible
+ - fsl,dc-id
+ - fsl,dc-stream-id
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ dc0-pixel-link0 {
+ compatible = "fsl,imx8qxp-dc-pixel-link";
+ fsl,dc-id = /bits/ 8 <0>;
+ fsl,dc-stream-id = /bits/ 8 <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* from dc0 pixel combiner channel0 */
+ port@0 {
+ reg = <0>;
+
+ dc0_pixel_link0_dc0_pixel_combiner_ch0: endpoint {
+ remote-endpoint = <&dc0_pixel_combiner_ch0_dc0_pixel_link0>;
+ };
+ };
+
+ /* to PXL2DPIs in MIPI/LVDS combo subsystems */
+ port@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <1>;
+
+ dc0_pixel_link0_mipi_lvds_0_pxl2dpi: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&mipi_lvds_0_pxl2dpi_dc0_pixel_link0>;
+ };
+
+ dc0_pixel_link0_mipi_lvds_1_pxl2dpi: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&mipi_lvds_1_pxl2dpi_dc0_pixel_link0>;
+ };
+ };
+
+ /* unused */
+ port@2 {
+ reg = <2>;
+ };
+
+ /* unused */
+ port@3 {
+ reg = <3>;
+ };
+
+ /* to imaging subsystem */
+ port@4 {
+ reg = <4>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pxl2dpi.yaml b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pxl2dpi.yaml
new file mode 100644
index 000000000000..e4e77fad05f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/fsl,imx8qxp-pxl2dpi.yaml
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/fsl,imx8qxp-pxl2dpi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX8qxp Pixel Link to Display Pixel Interface
+
+maintainers:
+ - Liu Ying <victor.liu@nxp.com>
+
+description: |
+ The Freescale i.MX8qxp Pixel Link to Display Pixel Interface (PXL2DPI)
+ interfaces the pixel link 36-bit data output to the DSI controller's
+ MIPI-DPI 24-bit data input, and to the inputs of the LVDS Display Bridge
+ (LDB) module used in LVDS mode, remapping the pixel color codings between those modules.
+ This module is purely combinatorial.
+
+ The i.MX8qxp PXL2DPI is controlled by the Control and Status Registers (CSR) module.
+ The CSR module, as a system controller, contains the PXL2DPI's configuration
+ register.
+
+properties:
+ compatible:
+ const: fsl,imx8qxp-pxl2dpi
+
+ fsl,sc-resource:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: The SCU resource ID associated with this PXL2DPI instance.
+
+ power-domains:
+ maxItems: 1
+
+ fsl,companion-pxl2dpi:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: |
+ A phandle which points to companion PXL2DPI which is used by downstream
+ LVDS Display Bridge(LDB) in split mode.
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The PXL2DPI input port node from pixel link.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The PXL2DPI output port node to downstream bridge.
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - fsl,sc-resource
+ - power-domains
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/firmware/imx/rsrc.h>
+ pxl2dpi {
+ compatible = "fsl,imx8qxp-pxl2dpi";
+ fsl,sc-resource = <IMX_SC_R_MIPI_0>;
+ power-domains = <&pd IMX_SC_R_MIPI_0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0>;
+
+ mipi_lvds_0_pxl2dpi_dc_pixel_link0: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&dc_pixel_link0_mipi_lvds_0_pxl2dpi>;
+ };
+
+ mipi_lvds_0_pxl2dpi_dc_pixel_link1: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&dc_pixel_link1_mipi_lvds_0_pxl2dpi>;
+ };
+ };
+
+ port@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <1>;
+
+ mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch0: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&mipi_lvds_0_ldb_ch0_mipi_lvds_0_pxl2dpi>;
+ };
+
+ mipi_lvds_0_pxl2dpi_mipi_lvds_0_ldb_ch1: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&mipi_lvds_0_ldb_ch1_mipi_lvds_0_pxl2dpi>;
+ };
+ };
+ };
+ };
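
As with the LDB's companion property above, fsl,companion-pxl2dpi is not
exercised by the example; in LDB split mode a second instance would be
referenced (labels invented for illustration):

	pxl2dpi0: pxl2dpi-a {
		compatible = "fsl,imx8qxp-pxl2dpi";
		/* fsl,sc-resource, power-domains and ports as in the example */
		fsl,companion-pxl2dpi = <&pxl2dpi1>;
	};
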
diff --git a/Documentation/devicetree/bindings/display/bridge/fsl,ldb.yaml b/Documentation/devicetree/bindings/display/bridge/fsl,ldb.yaml
new file mode 100644
index 000000000000..6e0e3ba9b49e
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/fsl,ldb.yaml
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/fsl,ldb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX8MP DPI to LVDS bridge chip
+
+maintainers:
+ - Marek Vasut <marex@denx.de>
+
+description: |
+ The i.MX8MP mediamix contains two registers which are responsible
+ for configuring the on-SoC DPI-to-LVDS serializer. This binding
+ describes those registers as a bridge within the DT.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8mp-ldb
+ - fsl,imx93-ldb
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: ldb
+
+ reg:
+ maxItems: 2
+
+ reg-names:
+ items:
+ - const: ldb
+ - const: lvds
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Video port for DPI input.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Video port for LVDS Channel-A output (panel or bridge).
+
+ port@2:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Video port for LVDS Channel-B output (panel or bridge).
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - clocks
+ - ports
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,imx93-ldb
+ then:
+ properties:
+ ports:
+ properties:
+ port@2: false
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/imx8mp-clock.h>
+
+ blk-ctrl {
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ bridge@5c {
+ compatible = "fsl,imx8mp-ldb";
+ clocks = <&clk IMX8MP_CLK_MEDIA_LDB>;
+ clock-names = "ldb";
+ reg = <0x5c 0x4>, <0x128 0x4>;
+ reg-names = "ldb", "lvds";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+
+ ldb_from_lcdif2: endpoint {
+ remote-endpoint = <&lcdif2_to_ldb>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ ldb_lvds_ch0: endpoint {
+ remote-endpoint = <&ldb_to_lvdsx4panel>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+
+ ldb_lvds_ch1: endpoint {
+ };
+ };
+ };
+ };
+ };
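
For the fsl,imx93-ldb variant, the conditional above forbids port@2, so only
a single LVDS channel is described; a sketch (the register offsets, clock and
endpoint phandles are hypothetical):

	bridge@20 {
		compatible = "fsl,imx93-ldb";
		reg = <0x20 0x4>, <0x24 0x4>;
		reg-names = "ldb", "lvds";
		clocks = <&clk_ldb>;
		clock-names = "ldb";

		ports {
			#address-cells = <1>;
			#size-cells = <0>;

			port@0 {
				reg = <0>;
				imx93_ldb_from_lcdif: endpoint {
					remote-endpoint = <&lcdif_to_ldb>;
				};
			};

			port@1 {
				reg = <1>;
				imx93_ldb_lvds: endpoint {
					remote-endpoint = <&panel_lvds_in>;
				};
			};
		};
	};
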
diff --git a/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml b/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml
index 9f7cc6b757cb..a44d025d33bd 100644
--- a/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml
@@ -8,7 +8,6 @@ title: ChromeOS EC ANX7688 HDMI to DP Converter through Type-C Port
maintainers:
- Nicolas Boichat <drinkcat@chromium.org>
- - Enric Balletbo i Serra <enric.balletbo@collabora.com>
description: |
ChromeOS EC ANX7688 is a display bridge that converts HDMI 2.0 to
@@ -79,4 +78,3 @@ examples:
};
};
};
-
diff --git a/Documentation/devicetree/bindings/display/bridge/ingenic,jz4780-hdmi.yaml b/Documentation/devicetree/bindings/display/bridge/ingenic,jz4780-hdmi.yaml
new file mode 100644
index 000000000000..0b27df429bdc
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/ingenic,jz4780-hdmi.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/ingenic,jz4780-hdmi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ingenic JZ4780 HDMI Transmitter
+
+maintainers:
+ - H. Nikolaus Schaller <hns@goldelico.com>
+
+description: |
+ The HDMI Transmitter in the Ingenic JZ4780 is a Synopsys DesignWare HDMI 1.4
+ TX controller IP with accompanying PHY IP.
+
+allOf:
+ - $ref: synopsys,dw-hdmi.yaml#
+
+properties:
+ compatible:
+ const: ingenic,jz4780-dw-hdmi
+
+ reg-io-width:
+ const: 4
+
+ clocks:
+ maxItems: 2
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Input from LCD controller output.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Link to the HDMI connector.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - ports
+ - reg-io-width
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/ingenic,jz4780-cgu.h>
+
+ hdmi: hdmi@10180000 {
+ compatible = "ingenic,jz4780-dw-hdmi";
+ reg = <0x10180000 0x8000>;
+ reg-io-width = <4>;
+ interrupt-parent = <&intc>;
+ interrupts = <3>;
+ clocks = <&cgu JZ4780_CLK_AHB0>, <&cgu JZ4780_CLK_HDMI>;
+ clock-names = "iahb", "isfr";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ hdmi_in: port@0 {
+ reg = <0>;
+ dw_hdmi_in: endpoint {
+ remote-endpoint = <&jz4780_lcd_out>;
+ };
+ };
+ hdmi_out: port@1 {
+ reg = <1>;
+ dw_hdmi_out: endpoint {
+ remote-endpoint = <&hdmi_con>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml b/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml
index dcb1336ee2a5..958a073f4ff7 100644
--- a/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/bridge/intel,keembay-dsi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Devicetree bindings for Intel Keem Bay mipi dsi controller
+title: Intel Keem Bay MIPI DSI controller
maintainers:
- Anitha Chrisanthus <anitha.chrisanthus@intel.com>
diff --git a/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml b/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml
index 833d11b2303a..c9a882ee6d98 100644
--- a/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/bridge/ite,it6505.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ITE it6505 Device Tree Bindings
+title: ITE it6505
maintainers:
- Allen Chen <allen.chen@ite.com.tw>
@@ -52,9 +52,49 @@ properties:
maxItems: 1
description: extcon specifier for the Power Delivery
- port:
- $ref: /schemas/graph.yaml#/properties/port
- description: A port node pointing to DPI host port node
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: A port node pointing to DPI host port node
+
+ properties:
+ endpoint:
+ $ref: /schemas/graph.yaml#/$defs/endpoint-base
+ unevaluatedProperties: false
+
+ properties:
+ link-frequencies:
+ minItems: 1
+ maxItems: 1
+ description: Maximum allowed link frequency, in Hz
+
+ port@1:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: Video port for DP output
+
+ properties:
+ endpoint:
+ $ref: /schemas/graph.yaml#/$defs/endpoint-base
+ unevaluatedProperties: false
+
+ properties:
+ data-lanes:
+ minItems: 1
+ uniqueItems: true
+ items:
+ - enum: [ 0, 1 ]
+ - const: 1
+ - const: 2
+ - const: 3
+
+ required:
+ - port@0
+ - port@1
required:
- compatible
@@ -63,6 +103,7 @@ required:
- interrupts
- reset-gpios
- extcon
+ - ports
additionalProperties: false
@@ -85,9 +126,24 @@ examples:
reset-gpios = <&pio 179 1>;
extcon = <&usbc_extcon>;
- port {
- it6505_in: endpoint {
- remote-endpoint = <&dpi_out>;
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ it6505_in: endpoint {
+ remote-endpoint = <&dpi_out>;
+ link-frequencies = /bits/ 64 <150000000>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ it6505_out: endpoint {
+ remote-endpoint = <&dp_in>;
+ data-lanes = <0 1>;
+ };
};
};
};
diff --git a/Documentation/devicetree/bindings/display/bridge/ite,it66121.yaml b/Documentation/devicetree/bindings/display/bridge/ite,it66121.yaml
index 6ec1d5fbb8bc..a7eb2603691f 100644
--- a/Documentation/devicetree/bindings/display/bridge/ite,it66121.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ite,it66121.yaml
@@ -4,11 +4,11 @@
$id: http://devicetree.org/schemas/display/bridge/ite,it66121.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ITE it66121 HDMI bridge Device Tree Bindings
+title: ITE it66121 HDMI bridge
maintainers:
- Phong LE <ple@baylibre.com>
- - Neil Armstrong <narmstrong@baylibre.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
description: |
The IT66121 is a high-performance and low-power single channel HDMI
@@ -17,7 +17,9 @@ description: |
properties:
compatible:
- const: ite,it66121
+ enum:
+ - ite,it66121
+ - ite,it6610
reg:
maxItems: 1
@@ -38,6 +40,9 @@ properties:
interrupts:
maxItems: 1
+ "#sound-dai-cells":
+ const: 0
+
ports:
$ref: /schemas/graph.yaml#/properties/ports
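
With "#sound-dai-cells" set to 0 as added above, the bridge can be referenced
as the codec of an audio link; a sketch using a simple-audio-card (the i2s0
and it66121 labels are hypothetical):

	sound {
		compatible = "simple-audio-card";
		simple-audio-card,name = "HDMI";
		simple-audio-card,format = "i2s";

		simple-audio-card,cpu {
			sound-dai = <&i2s0>;
		};

		simple-audio-card,codec {
			sound-dai = <&it66121>;
		};
	};
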
diff --git a/Documentation/devicetree/bindings/display/bridge/lontium,lt8912b.yaml b/Documentation/devicetree/bindings/display/bridge/lontium,lt8912b.yaml
index 674891ee2f8e..f201ae4af4fb 100644
--- a/Documentation/devicetree/bindings/display/bridge/lontium,lt8912b.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/lontium,lt8912b.yaml
@@ -67,7 +67,7 @@ examples:
- |
#include <dt-bindings/gpio/gpio.h>
- i2c4 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/display/bridge/lontium,lt9211.yaml b/Documentation/devicetree/bindings/display/bridge/lontium,lt9211.yaml
new file mode 100644
index 000000000000..9a6e9b25d14a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/lontium,lt9211.yaml
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/lontium,lt9211.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Lontium LT9211 DSI/LVDS/DPI to DSI/LVDS/DPI bridge
+
+maintainers:
+ - Marek Vasut <marex@denx.de>
+
+description: |
+ The LT9211 is a bridge device which converts Single/Dual-Link DSI/LVDS
+ or Single DPI to Single/Dual-Link DSI/LVDS or Single DPI.
+
+properties:
+ compatible:
+ enum:
+ - lontium,lt9211
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ reset-gpios:
+ maxItems: 1
+ description: GPIO connected to active high RESET pin.
+
+ vccio-supply:
+ description: Regulator for 1.8V IO power.
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Primary MIPI DSI port-1 for MIPI input or
+ LVDS port-1 for LVDS input or DPI input.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Additional MIPI port-2 for MIPI input or LVDS port-2
+ for LVDS input. Used in combination with primary
+ port-1 to drive higher resolution displays
+
+ port@2:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Primary MIPI DSI port-1 for MIPI output or
+ LVDS port-1 for LVDS output or DPI output.
+
+ port@3:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Additional MIPI port-2 for MIPI output or LVDS port-2
+ for LVDS output. Used in combination with primary
+ port-1 to drive higher resolution displays.
+
+ required:
+ - port@0
+ - port@2
+
+required:
+ - compatible
+ - reg
+ - vccio-supply
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ hdmi-bridge@3b {
+ compatible = "lontium,lt9211";
+ reg = <0x3b>;
+
+ reset-gpios = <&tlmm 128 GPIO_ACTIVE_HIGH>;
+ interrupts-extended = <&tlmm 84 IRQ_TYPE_EDGE_FALLING>;
+
+ vccio-supply = <&lt9211_1v8>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+
+ endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+
+ endpoint {
+ remote-endpoint = <&panel_in_lvds>;
+ };
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml b/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml
index 304a1367faaa..84aafcbf0919 100644
--- a/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml
@@ -39,6 +39,7 @@ properties:
- const: lvds-encoder # Generic LVDS encoder compatible fallback
- items:
- enum:
+ - ti,ds90cf364a # For the DS90CF364A FPD-Link LVDS Receiver
- ti,ds90cf384a # For the DS90CF384A FPD-Link LVDS Receiver
- const: lvds-decoder # Generic LVDS decoders compatible fallback
- enum:
@@ -49,11 +50,27 @@ properties:
properties:
port@0:
- $ref: /schemas/graph.yaml#/properties/port
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
description: |
For LVDS encoders, port 0 is the parallel input
For LVDS decoders, port 0 is the LVDS input
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+
+ properties:
+ data-mapping:
+ enum:
+ - jeida-18
+ - jeida-24
+ - vesa-24
+ description: |
+ The color signals mapping order. See details in
+ Documentation/devicetree/bindings/display/lvds.yaml
+
port@1:
$ref: /schemas/graph.yaml#/properties/port
description: |
@@ -64,6 +81,14 @@ properties:
- port@0
- port@1
+ pclk-sample:
+ description:
+ Data sampling on rising or falling edge.
+ enum:
+ - 0 # Falling edge
+ - 1 # Rising edge
+ default: 0
+
powerdown-gpios:
description:
The GPIO used to control the power down line of this device.
@@ -71,6 +96,33 @@ properties:
power-supply: true
+allOf:
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ const: lvds-decoder
+ then:
+ properties:
+ ports:
+ properties:
+ port@0:
+ properties:
+ endpoint:
+ properties:
+ data-mapping: false
+
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ const: lvds-encoder
+ then:
+ properties:
+ pclk-sample: false
+
required:
- compatible
- ports
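
A sketch of a decoder node using the new data-mapping property on its LVDS
input endpoint (the remote endpoint labels are hypothetical):

	lvds-decoder {
		compatible = "ti,ds90cf364a", "lvds-decoder";

		ports {
			#address-cells = <1>;
			#size-cells = <0>;

			port@0 {
				reg = <0>;
				decoder_in: endpoint {
					data-mapping = "vesa-24";
					remote-endpoint = <&lvds_out>;
				};
			};

			port@1 {
				reg = <1>;
				decoder_out: endpoint {
					remote-endpoint = <&panel_in>;
				};
			};
		};
	};
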
diff --git a/Documentation/devicetree/bindings/display/bridge/nxp,ptn3460.yaml b/Documentation/devicetree/bindings/display/bridge/nxp,ptn3460.yaml
new file mode 100644
index 000000000000..70ec70922c13
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/nxp,ptn3460.yaml
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/nxp,ptn3460.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP PTN3460 eDP to LVDS bridge
+
+maintainers:
+ - Sean Paul <seanpaul@chromium.org>
+
+properties:
+ compatible:
+ const: nxp,ptn3460
+
+ reg:
+ description: I2C address of the bridge
+ maxItems: 1
+
+ edid-emulation:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ The EDID emulation entry to use
+ Value Resolution Description
+ 0 1024x768 NXP Generic
+ 1 1920x1080 NXP Generic
+ 2 1920x1080 NXP Generic
+ 3 1600x900 Samsung LTM200KT
+ 4 1920x1080 Samsung LTM230HT
+ 5 1366x768 NXP Generic
+ 6 1600x900 ChiMei M215HGE
+ enum: [0, 1, 2, 3, 4, 5, 6]
+
+ powerdown-gpios:
+ description: GPIO connected to the PD_N signal.
+ maxItems: 1
+
+ reset-gpios:
+ description: GPIO connected to the RST_N signal.
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port for LVDS output
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port for eDP input
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - edid-emulation
+ - powerdown-gpios
+ - reset-gpios
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ bridge@20 {
+ compatible = "nxp,ptn3460";
+ reg = <0x20>;
+ edid-emulation = <5>;
+ powerdown-gpios = <&gpy2 5 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpx1 5 GPIO_ACTIVE_LOW>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ bridge_out: endpoint {
+ remote-endpoint = <&panel_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ bridge_in: endpoint {
+ remote-endpoint = <&dp_out>;
+ };
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/bridge/nxp,tda998x.yaml b/Documentation/devicetree/bindings/display/bridge/nxp,tda998x.yaml
new file mode 100644
index 000000000000..c4bf54397473
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/nxp,tda998x.yaml
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/nxp,tda998x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP TDA998x HDMI transmitter
+
+maintainers:
+ - Russell King <linux@armlinux.org.uk>
+
+properties:
+ compatible:
+ const: nxp,tda998x
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ video-ports:
+ default: 0x230145
+ maximum: 0xffffff
+ description:
+ 24-bit value which defines how the video controller output is wired to
+ the TDA998x input.
+
+ audio-ports:
+ description:
+ Array of 8-bit values, 2 values per DAI (Documentation/sound/soc/dai.rst).
+ The implementation allows one or two DAIs.
+ If two DAIs are defined, they must be of different types.
+ $ref: /schemas/types.yaml#/definitions/uint32-matrix
+ items:
+ minItems: 1
+ items:
+ - description: |
+ The first value defines the DAI type: TDA998x_SPDIF or TDA998x_I2S
+ (see include/dt-bindings/display/tda998x.h).
+ - description:
+ The second value defines the tda998x AP_ENA reg content when the
+ DAI in question is used.
+
+ '#sound-dai-cells':
+ enum: [ 0, 1 ]
+
+ nxp,calib-gpios:
+ maxItems: 1
+ description:
+ Calibration GPIO, which must correspond with the gpio used for the
+ TDA998x interrupt pin.
+
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Parallel input port
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ type: object
+ description: Parallel input port
+
+ port@1:
+ type: object
+ description: HDMI output port
+
+required:
+ - compatible
+ - reg
+
+oneOf:
+ - required:
+ - port
+ - required:
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/display/tda998x.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ tda998x: hdmi-encoder@70 {
+ compatible = "nxp,tda998x";
+ reg = <0x70>;
+ interrupt-parent = <&gpio0>;
+ interrupts = <27 IRQ_TYPE_EDGE_FALLING>;
+ video-ports = <0x230145>;
+
+ #sound-dai-cells = <1>;
+ /* DAI-format / AP_ENA reg value */
+ audio-ports = <TDA998x_SPDIF 0x04>,
+ <TDA998x_I2S 0x03>;
+
+ port {
+ tda998x_in: endpoint {
+ remote-endpoint = <&lcdc_0>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/parade,ps8622.yaml b/Documentation/devicetree/bindings/display/bridge/parade,ps8622.yaml
new file mode 100644
index 000000000000..e6397ac2048b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/parade,ps8622.yaml
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/parade,ps8622.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Parade PS8622/PS8625 DisplayPort to LVDS Converter
+
+maintainers:
+ - Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+
+properties:
+ compatible:
+ enum:
+ - parade,ps8622
+ - parade,ps8625
+
+ reg:
+ maxItems: 1
+
+ lane-count:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [1, 2]
+ description: Number of DP lanes to use.
+
+ use-external-pwm:
+ type: boolean
+ description: Backlight will be controlled by an external PWM.
+
+ reset-gpios:
+ maxItems: 1
+ description: GPIO connected to RST_ pin.
+
+ sleep-gpios:
+ maxItems: 1
+ description: GPIO connected to PD_ pin.
+
+ vdd12-supply: true
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Video port for LVDS output.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Video port for DisplayPort input.
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - reset-gpios
+ - sleep-gpios
+ - ports
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ const: parade,ps8622
+ then:
+ properties:
+ lane-count:
+ const: 1
+ else:
+ properties:
+ lane-count:
+ const: 2
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ lvds-bridge@48 {
+ compatible = "parade,ps8625";
+ reg = <0x48>;
+ sleep-gpios = <&gpx3 5 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpy7 7 GPIO_ACTIVE_HIGH>;
+ lane-count = <2>;
+ use-external-pwm;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+
+ bridge_out: endpoint {
+ remote-endpoint = <&panel_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ bridge_in: endpoint {
+ remote-endpoint = <&dp_out>;
+ };
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/ps8622.txt b/Documentation/devicetree/bindings/display/bridge/ps8622.txt
deleted file mode 100644
index c989c3807f2b..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/ps8622.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-ps8622-bridge bindings
-
-Required properties:
- - compatible: "parade,ps8622" or "parade,ps8625"
- - reg: first i2c address of the bridge
- - sleep-gpios: OF device-tree gpio specification for PD_ pin.
- - reset-gpios: OF device-tree gpio specification for RST_ pin.
-
-Optional properties:
- - lane-count: number of DP lanes to use
- - use-external-pwm: backlight will be controlled by an external PWM
- - video interfaces: Device node can contain video interface port
- nodes for panel according to [1].
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
- lvds-bridge@48 {
- compatible = "parade,ps8622";
- reg = <0x48>;
- sleep-gpios = <&gpc3 6 1 0 0>;
- reset-gpios = <&gpc3 1 1 0 0>;
- lane-count = <1>;
- ports {
- port@0 {
- bridge_out: endpoint {
- remote-endpoint = <&panel_in>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml
index fce82b605c8b..5856450c5da7 100644
--- a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml
@@ -4,11 +4,10 @@
$id: http://devicetree.org/schemas/display/bridge/ps8640.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: MIPI DSI to eDP Video Format Converter Device Tree Bindings
+title: MIPI DSI to eDP Video Format Converter
maintainers:
- Nicolas Boichat <drinkcat@chromium.org>
- - Enric Balletbo i Serra <enric.balletbo@collabora.com>
description: |
The PS8640 is a low power MIPI-to-eDP video format converter supporting
@@ -40,6 +39,9 @@ properties:
vdd33-supply:
description: Regulator for 3.3V digital core power.
+ aux-bus:
+ $ref: /schemas/display/dp-aux-bus.yaml#
+
ports:
$ref: /schemas/graph.yaml#/properties/ports
@@ -71,7 +73,7 @@ additionalProperties: false
examples:
- |
#include <dt-bindings/gpio/gpio.h>
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
@@ -98,9 +100,22 @@ examples:
reg = <1>;
ps8640_out: endpoint {
remote-endpoint = <&panel_in>;
- };
+ };
+ };
+ };
+
+ aux-bus {
+ panel {
+ compatible = "boe,nv133fhm-n62";
+ power-supply = <&pp3300_dx_edp>;
+ backlight = <&backlight>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&ps8640_out>;
+ };
+ };
};
};
};
};
-
diff --git a/Documentation/devicetree/bindings/display/bridge/ptn3460.txt b/Documentation/devicetree/bindings/display/bridge/ptn3460.txt
deleted file mode 100644
index 361971ba104d..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/ptn3460.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-ptn3460 bridge bindings
-
-Required properties:
- - compatible: "nxp,ptn3460"
- - reg: i2c address of the bridge
- - powerdown-gpio: OF device-tree gpio specification for PD_N pin.
- - reset-gpio: OF device-tree gpio specification for RST_N pin.
- - edid-emulation: The EDID emulation entry to use
- +-------+------------+------------------+
- | Value | Resolution | Description |
- | 0 | 1024x768 | NXP Generic |
- | 1 | 1920x1080 | NXP Generic |
- | 2 | 1920x1080 | NXP Generic |
- | 3 | 1600x900 | Samsung LTM200KT |
- | 4 | 1920x1080 | Samsung LTM230HT |
- | 5 | 1366x768 | NXP Generic |
- | 6 | 1600x900 | ChiMei M215HGE |
- +-------+------------+------------------+
-
- - video interfaces: Device node can contain video interface port
- nodes for panel according to [1].
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
- lvds-bridge@20 {
- compatible = "nxp,ptn3460";
- reg = <0x20>;
- powerdown-gpio = <&gpy2 5 1 0 0>;
- reset-gpio = <&gpx1 5 1 0 0>;
- edid-emulation = <5>;
- ports {
- port@0 {
- bridge_out: endpoint {
- remote-endpoint = <&panel_in>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml b/Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml
new file mode 100644
index 000000000000..d33026f85e19
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,dsi-csi2-tx.yaml
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/renesas,dsi-csi2-tx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas R-Car MIPI DSI/CSI-2 Encoder
+
+maintainers:
+ - Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
+
+description: |
+ This binding describes the MIPI DSI/CSI-2 encoder embedded in the Renesas
+ R-Car Gen4 SoCs. The encoder can operate in either DSI or CSI-2 mode, with up
+ to four data lanes.
+
+properties:
+ compatible:
+ enum:
+ - renesas,r8a779a0-dsi-csi2-tx # for V3U
+ - renesas,r8a779g0-dsi-csi2-tx # for V4H
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Functional clock
+ - description: DSI (and CSI-2) functional clock
+ - description: PLL reference clock
+
+ clock-names:
+ items:
+ - const: fck
+ - const: dsi
+ - const: pll
+
+ power-domains:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Parallel input port
+
+ port@1:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: DSI/CSI-2 output port
+
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+
+ properties:
+ data-lanes:
+ minItems: 1
+ maxItems: 4
+
+ required:
+ - data-lanes
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - power-domains
+ - resets
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/r8a779a0-cpg-mssr.h>
+ #include <dt-bindings/power/r8a779a0-sysc.h>
+
+ dsi0: dsi-encoder@fed80000 {
+ compatible = "renesas,r8a779a0-dsi-csi2-tx";
+ reg = <0xfed80000 0x10000>;
+ power-domains = <&sysc R8A779A0_PD_ALWAYS_ON>;
+ clocks = <&cpg CPG_MOD 415>,
+ <&cpg CPG_CORE R8A779A0_CLK_DSI>,
+ <&cpg CPG_CORE R8A779A0_CLK_CP>;
+ clock-names = "fck", "dsi", "pll";
+ resets = <&cpg 415>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&du_out_dsi0>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ data-lanes = <1 2>;
+ remote-endpoint = <&sn65dsi86_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,dsi.yaml b/Documentation/devicetree/bindings/display/bridge/renesas,dsi.yaml
new file mode 100644
index 000000000000..e08c24633926
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,dsi.yaml
@@ -0,0 +1,183 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/renesas,dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/G2L MIPI DSI Encoder
+
+maintainers:
+ - Biju Das <biju.das.jz@bp.renesas.com>
+
+description: |
+ This binding describes the MIPI DSI encoder embedded in the Renesas
+ RZ/G2L alike family of SoC's. The encoder can operate in DSI mode, with
+ up to four data lanes.
+
+allOf:
+ - $ref: /schemas/display/dsi-controller.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - renesas,r9a07g044-mipi-dsi # RZ/G2{L,LC}
+ - renesas,r9a07g054-mipi-dsi # RZ/V2L
+ - const: renesas,rzg2l-mipi-dsi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ items:
+ - description: Sequence operation channel 0 interrupt
+ - description: Sequence operation channel 1 interrupt
+ - description: Video-Input operation channel 1 interrupt
+ - description: DSI Packet Receive interrupt
+ - description: DSI Fatal Error interrupt
+ - description: DSI D-PHY PPI interrupt
+ - description: Debug interrupt
+
+ interrupt-names:
+ items:
+ - const: seq0
+ - const: seq1
+ - const: vin1
+ - const: rcv
+ - const: ferr
+ - const: ppi
+ - const: debug
+
+ clocks:
+ items:
+ - description: DSI D-PHY PLL multiplied clock
+ - description: DSI D-PHY system clock
+ - description: DSI AXI bus clock
+ - description: DSI Register access clock
+ - description: DSI Video clock
+ - description: DSI D-PHY Escape mode transmit clock
+
+ clock-names:
+ items:
+ - const: pllclk
+ - const: sysclk
+ - const: aclk
+ - const: pclk
+ - const: vclk
+ - const: lpclk
+
+ resets:
+ items:
+ - description: MIPI_DSI_CMN_RSTB
+ - description: MIPI_DSI_ARESET_N
+ - description: MIPI_DSI_PRESET_N
+
+ reset-names:
+ items:
+ - const: rst
+ - const: arst
+ - const: prst
+
+ power-domains:
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Parallel input port
+
+ port@1:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: DSI output port
+
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+
+ properties:
+ data-lanes:
+ description: array of physical DSI data lane indexes.
+ minItems: 1
+ items:
+ - const: 1
+ - const: 2
+ - const: 3
+ - const: 4
+
+ required:
+ - data-lanes
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - interrupt-names
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/r9a07g044-cpg.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ dsi0: dsi@10850000 {
+ compatible = "renesas,r9a07g044-mipi-dsi", "renesas,rzg2l-mipi-dsi";
+ reg = <0x10850000 0x20000>;
+ interrupts = <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 144 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 145 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 146 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 147 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "seq0", "seq1", "vin1", "rcv",
+ "ferr", "ppi", "debug";
+ clocks = <&cpg CPG_MOD R9A07G044_MIPI_DSI_PLLCLK>,
+ <&cpg CPG_MOD R9A07G044_MIPI_DSI_SYSCLK>,
+ <&cpg CPG_MOD R9A07G044_MIPI_DSI_ACLK>,
+ <&cpg CPG_MOD R9A07G044_MIPI_DSI_PCLK>,
+ <&cpg CPG_MOD R9A07G044_MIPI_DSI_VCLK>,
+ <&cpg CPG_MOD R9A07G044_MIPI_DSI_LPCLK>;
+ clock-names = "pllclk", "sysclk", "aclk", "pclk", "vclk", "lpclk";
+ resets = <&cpg R9A07G044_MIPI_DSI_CMN_RSTB>,
+ <&cpg R9A07G044_MIPI_DSI_ARESET_N>,
+ <&cpg R9A07G044_MIPI_DSI_PRESET_N>;
+ reset-names = "rst", "arst", "prst";
+ power-domains = <&cpg>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&du_out_dsi0>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ data-lanes = <1 2 3 4>;
+ remote-endpoint = <&adv7535_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml b/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml
index 0c9785c8db51..e3ec697f89e7 100644
--- a/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.yaml
@@ -38,6 +38,9 @@ properties:
clock-names:
maxItems: 2
+ resets:
+ maxItems: 1
+
ports:
$ref: /schemas/graph.yaml#/properties/ports
@@ -67,6 +70,7 @@ required:
- reg
- clocks
- clock-names
+ - resets
- interrupts
- ports
@@ -85,6 +89,7 @@ examples:
clocks = <&cpg CPG_CORE R8A7795_CLK_S0D4>, <&cpg CPG_MOD 729>;
clock-names = "iahb", "isfr";
power-domains = <&sysc R8A7795_PD_ALWAYS_ON>;
+ resets = <&cpg 729>;
ports {
#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml
index acfc327f70a7..bb9dbfb9beaf 100644
--- a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.yaml
@@ -28,6 +28,7 @@ properties:
- renesas,r8a7793-lvds # for R-Car M2-N compatible LVDS encoders
- renesas,r8a7795-lvds # for R-Car H3 compatible LVDS encoders
- renesas,r8a7796-lvds # for R-Car M3-W compatible LVDS encoders
+ - renesas,r8a77961-lvds # for R-Car M3-W+ compatible LVDS encoders
- renesas,r8a77965-lvds # for R-Car M3-N compatible LVDS encoders
- renesas,r8a77970-lvds # for R-Car V3M compatible LVDS encoders
- renesas,r8a77980-lvds # for R-Car V3H compatible LVDS encoders
@@ -94,7 +95,6 @@ then:
properties:
clocks:
minItems: 1
- maxItems: 4
items:
- description: Functional clock
- description: EXTAL input clock
@@ -103,7 +103,6 @@ then:
clock-names:
minItems: 1
- maxItems: 4
items:
- const: fck
# The LVDS encoder can use the EXTAL or DU_DOTCLKINx clocks.
@@ -127,12 +126,10 @@ then:
else:
properties:
clocks:
- maxItems: 1
items:
- description: Functional clock
clock-names:
- maxItems: 1
items:
- const: fck
diff --git a/Documentation/devicetree/bindings/display/bridge/samsung,mipi-dsim.yaml b/Documentation/devicetree/bindings/display/bridge/samsung,mipi-dsim.yaml
new file mode 100644
index 000000000000..e841659e20cd
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/samsung,mipi-dsim.yaml
@@ -0,0 +1,255 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/samsung,mipi-dsim.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung MIPI DSIM bridge controller
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Jagan Teki <jagan@amarulasolutions.com>
+ - Marek Szyprowski <m.szyprowski@samsung.com>
+
+description: |
+ The Samsung MIPI DSIM bridge controller can be found on Exynos
+ and i.MX8M Mini/Nano/Plus SoCs.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - samsung,exynos3250-mipi-dsi
+ - samsung,exynos4210-mipi-dsi
+ - samsung,exynos5410-mipi-dsi
+ - samsung,exynos5422-mipi-dsi
+ - samsung,exynos5433-mipi-dsi
+ - fsl,imx8mm-mipi-dsim
+ - fsl,imx8mp-mipi-dsim
+ - items:
+ - const: fsl,imx8mn-mipi-dsim
+ - const: fsl,imx8mm-mipi-dsim
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
+ clocks:
+ minItems: 2
+ maxItems: 5
+
+ clock-names:
+ minItems: 2
+ maxItems: 5
+
+ samsung,phy-type:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Samsung DSIM PHY type selector.
+
+ power-domains:
+ maxItems: 1
+
+ samsung,power-domain:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: phandle to the associated samsung power domain
+
+ vddcore-supply:
+ description: MIPI DSIM Core voltage supply (e.g. 1.1V)
+
+ vddio-supply:
+ description: MIPI DSIM I/O and PLL voltage supply (e.g. 1.8V)
+
+ samsung,burst-clock-frequency:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ DSIM high speed burst mode frequency.
+
+ samsung,esc-clock-frequency:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ DSIM escape mode frequency.
+
+ samsung,pll-clock-frequency:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ DSIM oscillator clock frequency.
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ const: dsim
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Input port node to receive pixel data from the
+ display controller. Exactly one endpoint must be
+ specified.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ DSI output port node to the panel or the next bridge
+ in the chain.
+
+required:
+ - clock-names
+ - clocks
+ - compatible
+ - interrupts
+ - reg
+ - samsung,burst-clock-frequency
+ - samsung,esc-clock-frequency
+ - samsung,pll-clock-frequency
+
+allOf:
+ - $ref: ../dsi-controller.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ minItems: 5
+
+ clock-names:
+ items:
+ - const: bus_clk
+ - const: phyclk_mipidphy0_bitclkdiv8
+ - const: phyclk_mipidphy0_rxclkesc0
+ - const: sclk_rgb_vclk_to_dsim0
+ - const: sclk_mipi
+
+ ports:
+ required:
+ - port@0
+
+ required:
+ - ports
+ - vddcore-supply
+ - vddio-supply
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5410-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ minItems: 2
+
+ clock-names:
+ items:
+ - const: bus_clk
+ - const: pll_clk
+
+ required:
+ - vddcore-supply
+ - vddio-supply
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos4210-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ minItems: 2
+
+ clock-names:
+ items:
+ - const: bus_clk
+ - const: sclk_mipi
+
+ required:
+ - vddcore-supply
+ - vddio-supply
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos3250-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ minItems: 2
+
+ clock-names:
+ items:
+ - const: bus_clk
+ - const: pll_clk
+
+ required:
+ - vddcore-supply
+ - vddio-supply
+ - samsung,phy-type
+
+additionalProperties:
+ type: object
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5433.h>
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ dsi@13900000 {
+ compatible = "samsung,exynos5433-mipi-dsi";
+ reg = <0x13900000 0xC0>;
+ interrupts = <GIC_SPI 205 IRQ_TYPE_LEVEL_HIGH>;
+ phys = <&mipi_phy 1>;
+ phy-names = "dsim";
+ clocks = <&cmu_disp CLK_PCLK_DSIM0>,
+ <&cmu_disp CLK_PHYCLK_MIPIDPHY0_BITCLKDIV8>,
+ <&cmu_disp CLK_PHYCLK_MIPIDPHY0_RXCLKESC0>,
+ <&cmu_disp CLK_SCLK_RGB_VCLK_TO_DSIM0>,
+ <&cmu_disp CLK_SCLK_DSIM0>;
+ clock-names = "bus_clk",
+ "phyclk_mipidphy0_bitclkdiv8",
+ "phyclk_mipidphy0_rxclkesc0",
+ "sclk_rgb_vclk_to_dsim0",
+ "sclk_mipi";
+ power-domains = <&pd_disp>;
+ vddcore-supply = <&ldo6_reg>;
+ vddio-supply = <&ldo7_reg>;
+ samsung,burst-clock-frequency = <512000000>;
+ samsung,esc-clock-frequency = <16000000>;
+ samsung,pll-clock-frequency = <24000000>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&te_irq>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+
+ dsi_to_mic: endpoint {
+ remote-endpoint = <&mic_to_dsi>;
+ };
+ };
+ };
+ };
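Since the schema now also covers the i.MX8M DSIM instances, an i.MX8M Mini node might look like the sketch below. The register address, interrupt number, clock specifiers, and frequency values are assumptions based on typical i.MX8MM usage, not part of this patch:

    dsi@32e10000 {
        compatible = "fsl,imx8mm-mipi-dsim";
        reg = <0x32e10000 0x400>;
        interrupts = <GIC_SPI 18 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&clk IMX8MM_CLK_DSI_CORE>,
                 <&clk IMX8MM_CLK_DSI_PHY_REF>;
        clock-names = "bus_clk", "sclk_mipi";
        phys = <&dphy>;
        phy-names = "dsim";
        samsung,burst-clock-frequency = <891000000>;
        samsung,esc-clock-frequency = <54000000>;
        samsung,pll-clock-frequency = <12000000>;

        ports {
            #address-cells = <1>;
            #size-cells = <0>;

            port@0 {
                reg = <0>;
                dsim_from_lcdif: endpoint {
                    remote-endpoint = <&lcdif_to_dsim>;
                };
            };

            port@1 {
                reg = <1>;
                dsim_to_panel: endpoint {
                    remote-endpoint = <&panel_in>;
                };
            };
        };
    };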
diff --git a/Documentation/devicetree/bindings/display/bridge/sii902x.txt b/Documentation/devicetree/bindings/display/bridge/sii902x.txt
deleted file mode 100644
index 3bc760cc31cb..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/sii902x.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-sii902x HDMI bridge bindings
-
-Required properties:
- - compatible: "sil,sii9022"
- - reg: i2c address of the bridge
-
-Optional properties:
- - interrupts: describe the interrupt line used to inform the host
- about hotplug events.
- - reset-gpios: OF device-tree gpio specification for RST_N pin.
- - iovcc-supply: I/O Supply Voltage (1.8V or 3.3V)
- - cvcc12-supply: Digital Core Supply Voltage (1.2V)
-
- HDMI audio properties:
- - #sound-dai-cells: <0> or <1>. <0> if only i2s or spdif pin
- is wired, <1> if the both are wired. HDMI audio is
- configured only if this property is found.
- - sil,i2s-data-lanes: Array of up to 4 integers with values of 0-3
- Each integer indicates which i2s pin is connected to which
- audio fifo. The first integer selects i2s audio pin for the
- first audio fifo#0 (HDMI channels 1&2), second for fifo#1
- (HDMI channels 3&4), and so on. There is 4 fifos and 4 i2s
- pins (SD0 - SD3). Any i2s pin can be connected to any fifo,
- but there can be no gaps. E.g. an i2s pin must be mapped to
- fifo#0 and fifo#1 before mapping a channel to fifo#2. Default
- value is <0>, describing SD0 pin beiging routed to hdmi audio
- fifo #0.
- - clocks: phandle and clock specifier for each clock listed in
- the clock-names property
- - clock-names: "mclk"
- Describes SII902x MCLK input. MCLK can be used to produce
- HDMI audio CTS values. This property follows
- Documentation/devicetree/bindings/clock/clock-bindings.txt
- consumer binding.
-
- If HDMI audio is configured the sii902x device becomes an I2S
- and/or spdif audio codec component (e.g a digital audio sink),
- that can be used in configuring a full audio devices with
- simple-card or audio-graph-card binding. See their binding
- documents on how to describe the way the sii902x device is
- connected to the rest of the audio system:
- Documentation/devicetree/bindings/sound/simple-card.yaml
- Documentation/devicetree/bindings/sound/audio-graph-card.yaml
- Note: In case of the audio-graph-card binding the used port
- index should be 3.
-
-Optional subnodes:
- - video input: this subnode can contain a video input port node
- to connect the bridge to a display controller output (See this
- documentation [1]).
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
- hdmi-bridge@39 {
- compatible = "sil,sii9022";
- reg = <0x39>;
- reset-gpios = <&pioA 1 0>;
- iovcc-supply = <&v3v3_hdmi>;
- cvcc12-supply = <&v1v2_hdmi>;
-
- #sound-dai-cells = <0>;
- sil,i2s-data-lanes = < 0 1 2 >;
- clocks = <&mclk>;
- clock-names = "mclk";
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- bridge_in: endpoint {
- remote-endpoint = <&dc_out>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/sii9234.txt b/Documentation/devicetree/bindings/display/bridge/sii9234.txt
deleted file mode 100644
index a55bf77bd960..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/sii9234.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-Silicon Image SiI9234 HDMI/MHL bridge bindings
-
-Required properties:
- - compatible : "sil,sii9234".
- - reg : I2C address for TPI interface, use 0x39
- - avcc33-supply : MHL/USB Switch Supply Voltage (3.3V)
- - iovcc18-supply : I/O Supply Voltage (1.8V)
- - avcc12-supply : TMDS Analog Supply Voltage (1.2V)
- - cvcc12-supply : Digital Core Supply Voltage (1.2V)
- - interrupts: interrupt specifier of INT pin
- - reset-gpios: gpio specifier of RESET pin (active low)
- - video interfaces: Device node can contain two video interface port
- nodes for HDMI encoder and connector according to [1].
- - port@0 - MHL to HDMI
- - port@1 - MHL to connector
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-
-Example:
- sii9234@39 {
- compatible = "sil,sii9234";
- reg = <0x39>;
- avcc33-supply = <&vcc33mhl>;
- iovcc18-supply = <&vcc18mhl>;
- avcc12-supply = <&vsil12>;
- cvcc12-supply = <&vsil12>;
- reset-gpios = <&gpf3 4 GPIO_ACTIVE_LOW>;
- interrupt-parent = <&gpf3>;
- interrupts = <5 IRQ_TYPE_LEVEL_HIGH>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- mhl_to_hdmi: endpoint {
- remote-endpoint = <&hdmi_to_mhl>;
- };
- };
- port@1 {
- reg = <1>;
- mhl_to_connector: endpoint {
- remote-endpoint = <&connector_to_mhl>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/sil,sii8620.yaml b/Documentation/devicetree/bindings/display/bridge/sil,sii8620.yaml
new file mode 100644
index 000000000000..6d1a36b76fcb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/sil,sii8620.yaml
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/sil,sii8620.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Silicon Image SiI8620 HDMI/MHL bridge
+
+maintainers:
+ - Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+
+properties:
+ compatible:
+ const: sil,sii8620
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: xtal
+
+ cvcc10-supply:
+ description: Digital Core Supply Voltage (1.0V)
+
+ interrupts:
+ maxItems: 1
+
+ iovcc18-supply:
+ description: I/O Supply Voltage (1.8V)
+
+ reset-gpios:
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ unevaluatedProperties: false
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port for HDMI (encoder) input
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ MHL to connector port
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - cvcc10-supply
+ - interrupts
+ - iovcc18-supply
+ - reset-gpios
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ bridge@39 {
+ reg = <0x39>;
+ compatible = "sil,sii8620";
+ cvcc10-supply = <&ldo36_reg>;
+ iovcc18-supply = <&ldo34_reg>;
+ interrupt-parent = <&gpf0>;
+ interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
+ reset-gpios = <&gpv7 0 GPIO_ACTIVE_LOW>;
+ clocks = <&pmu_system_controller 0>;
+ clock-names = "xtal";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ mhl_to_hdmi: endpoint {
+ remote-endpoint = <&hdmi_to_mhl>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ mhl_to_musb_con: endpoint {
+ remote-endpoint = <&musb_con_to_mhl>;
+ };
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/sil,sii9022.yaml b/Documentation/devicetree/bindings/display/bridge/sil,sii9022.yaml
new file mode 100644
index 000000000000..5a69547ad3d7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/sil,sii9022.yaml
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/sil,sii9022.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Silicon Image sii902x HDMI bridge
+
+maintainers:
+ - Boris Brezillon <bbrezillon@kernel.org>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - sil,sii9022-cpi # CEC Programming Interface
+ - sil,sii9022-tpi # Transmitter Programming Interface
+ - const: sil,sii9022
+ - const: sil,sii9022
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+ description: Interrupt line used to inform the host about hotplug events.
+
+ reset-gpios:
+ maxItems: 1
+
+ iovcc-supply:
+ description: I/O Supply Voltage (1.8V or 3.3V)
+
+ cvcc12-supply:
+ description: Digital Core Supply Voltage (1.2V)
+
+ '#sound-dai-cells':
+ enum: [ 0, 1 ]
+ description: |
+ <0> if only I2S or S/PDIF pin is wired,
+ <1> if both are wired.
+ HDMI audio is configured only if this property is found.
+ If HDMI audio is configured, the sii902x device becomes an I2S and/or
+ S/PDIF audio codec component (e.g. a digital audio sink) that can be
+ used in configuring full audio devices with simple-card or
+ audio-graph-card bindings. See their binding documents on how to
+ describe the way the sii902x device is connected to the rest of the
+ audio system:
+ Documentation/devicetree/bindings/sound/simple-card.yaml
+ Documentation/devicetree/bindings/sound/audio-graph-card.yaml
+ Note: In case of the audio-graph-card binding the used port index should
+ be 3.
+
+ sil,i2s-data-lanes:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 1
+ maxItems: 4
+ uniqueItems: true
+ items:
+ enum: [ 0, 1, 2, 3 ]
+ description:
+ Each integer indicates which I2S pin is connected to which audio FIFO.
+ The first integer selects the I2S audio pin for the first audio FIFO#0
+ (HDMI channels 1&2), the second for FIFO#1 (HDMI channels 3&4), and so
+ on. There are 4 FIFOs and 4 I2S pins (SD0 - SD3). Any I2S pin can be
+ connected to any FIFO, but there can be no gaps. E.g. an I2S pin must be
+ mapped to FIFO#0 and FIFO#1 before mapping a channel to FIFO#2. The
+ default value is <0>, describing SD0 pin being routed to HDMI audio
+ FIFO#0.
+
+ clocks:
+ maxItems: 1
+ description: MCLK input. MCLK can be used to produce HDMI audio CTS values.
+
+ clock-names:
+ const: mclk
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Parallel RGB input port
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: HDMI output port
+
+ port@3:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Sound input port
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ hdmi-bridge@39 {
+ compatible = "sil,sii9022";
+ reg = <0x39>;
+ reset-gpios = <&pioA 1 0>;
+ iovcc-supply = <&v3v3_hdmi>;
+ cvcc12-supply = <&v1v2_hdmi>;
+
+ #sound-dai-cells = <0>;
+ sil,i2s-data-lanes = < 0 1 2 >;
+ clocks = <&mclk>;
+ clock-names = "mclk";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ bridge_in: endpoint {
+ remote-endpoint = <&dc_out>;
+ };
+ };
+ };
+ };
+ };
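The schema above also reserves port@3 as the sound input for audio-graph-card setups (hence the note that the port index used should be 3). Inside the bridge's ports node, that could look like the fragment below; the cpu_dai_out label is an assumption:

    port@3 {
        reg = <3>;
        sii_audio_in: endpoint {
            remote-endpoint = <&cpu_dai_out>;
        };
    };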
diff --git a/Documentation/devicetree/bindings/display/bridge/sil,sii9234.yaml b/Documentation/devicetree/bindings/display/bridge/sil,sii9234.yaml
new file mode 100644
index 000000000000..176181d25530
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/sil,sii9234.yaml
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/sil,sii9234.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Silicon Image SiI9234 HDMI/MHL bridge
+
+maintainers:
+ - Maciej Purski <m.purski@samsung.com>
+
+properties:
+ compatible:
+ const: sil,sii9234
+
+ reg:
+ description: I2C address for TPI interface
+ maxItems: 1
+
+ avcc12-supply:
+ description: TMDS Analog Supply Voltage, 1.2V
+
+ avcc33-supply:
+ description: MHL/USB Switch Supply Voltage, 3.3V
+
+ cvcc12-supply:
+ description: Digital Core Supply Voltage, 1.2V
+
+ iovcc18-supply:
+ description: I/O voltage supply, 1.8V
+
+ interrupts:
+ maxItems: 1
+
+ reset-gpios:
+ description: GPIO connected to the reset pin.
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port for HDMI (encoder) input
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ MHL to connector port
+
+ required:
+ - port@0
+
+required:
+ - compatible
+ - reg
+ - avcc12-supply
+ - avcc33-supply
+ - cvcc12-supply
+ - iovcc18-supply
+ - interrupts
+ - reset-gpios
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ bridge@39 {
+ compatible = "sil,sii9234";
+ reg = <0x39>;
+ avcc12-supply = <&vsil12>;
+ avcc33-supply = <&vcc33mhl>;
+ cvcc12-supply = <&vsil12>;
+ iovcc18-supply = <&vcc18mhl>;
+ interrupt-parent = <&gpf3>;
+ interrupts = <5 IRQ_TYPE_LEVEL_HIGH>;
+ reset-gpios = <&gpf3 4 GPIO_ACTIVE_LOW>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ mhl_to_hdmi: endpoint {
+ remote-endpoint = <&hdmi_to_mhl>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ mhl_to_connector: endpoint {
+ remote-endpoint = <&connector_to_mhl>;
+ };
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/bridge/sil-sii8620.txt b/Documentation/devicetree/bindings/display/bridge/sil-sii8620.txt
deleted file mode 100644
index b05052f7d62f..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/sil-sii8620.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Silicon Image SiI8620 HDMI/MHL bridge bindings
-
-Required properties:
- - compatible: "sil,sii8620"
- - reg: i2c address of the bridge
- - cvcc10-supply: Digital Core Supply Voltage (1.0V)
- - iovcc18-supply: I/O Supply Voltage (1.8V)
- - interrupts: interrupt specifier of INT pin
- - reset-gpios: gpio specifier of RESET pin
- - clocks, clock-names: specification and name of "xtal" clock
- - video interfaces: Device node can contain video interface port
- node for HDMI encoder according to [1].
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
- sii8620@39 {
- reg = <0x39>;
- compatible = "sil,sii8620";
- cvcc10-supply = <&ldo36_reg>;
- iovcc18-supply = <&ldo34_reg>;
- interrupt-parent = <&gpf0>;
- interrupts = <2 0>;
- reset-gpio = <&gpv7 0 0>;
- clocks = <&pmu_system_controller 0>;
- clock-names = "xtal";
-
- port {
- mhl_to_hdmi: endpoint {
- remote-endpoint = <&hdmi_to_mhl>;
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/snps,dw-mipi-dsi.yaml b/Documentation/devicetree/bindings/display/bridge/snps,dw-mipi-dsi.yaml
index 3c3e51af154b..0b51c64f141a 100644
--- a/Documentation/devicetree/bindings/display/bridge/snps,dw-mipi-dsi.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/snps,dw-mipi-dsi.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Synopsys DesignWare MIPI DSI host controller
maintainers:
- - Philippe CORNU <philippe.cornu@st.com>
+ - Philippe CORNU <philippe.cornu@foss.st.com>
description: |
This document defines device tree properties for the Synopsys DesignWare MIPI
@@ -26,19 +26,9 @@ properties:
reg:
maxItems: 1
- clocks:
- items:
- - description: Module clock
- - description: DSI bus clock for either AHB and APB
- - description: Pixel clock for the DPI/RGB input
- minItems: 2
-
- clock-names:
- items:
- - const: ref
- - const: pclk
- - const: px_clk
- minItems: 2
+ clocks: true
+
+ clock-names: true
resets:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/display/bridge/synopsys,dw-hdmi.yaml b/Documentation/devicetree/bindings/display/bridge/synopsys,dw-hdmi.yaml
index 9be44a682e67..4b7e54a8f037 100644
--- a/Documentation/devicetree/bindings/display/bridge/synopsys,dw-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/synopsys,dw-hdmi.yaml
@@ -26,9 +26,7 @@ properties:
reg-io-width:
description:
Width (in bytes) of the registers specified by the reg property.
- allOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- - enum: [1, 4]
+ enum: [1, 4]
default: 1
clocks:
diff --git a/Documentation/devicetree/bindings/display/bridge/tda998x.txt b/Documentation/devicetree/bindings/display/bridge/tda998x.txt
deleted file mode 100644
index f5a02f61dd36..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/tda998x.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-Device-Tree bindings for the NXP TDA998x HDMI transmitter
-
-Required properties;
- - compatible: must be "nxp,tda998x"
-
- - reg: I2C address
-
-Required node:
- - port: Input port node with endpoint definition, as described
- in Documentation/devicetree/bindings/graph.txt
-
-Optional properties:
- - interrupts: interrupt number and trigger type
- default: polling
-
- - pinctrl-0: pin control group to be used for
- screen plug/unplug interrupt.
-
- - pinctrl-names: must contain a "default" entry.
-
- - video-ports: 24 bits value which defines how the video controller
- output is wired to the TDA998x input - default: <0x230145>
-
- - audio-ports: array of 8-bit values, 2 values per one DAI[1].
- The first value defines the DAI type: TDA998x_SPDIF or TDA998x_I2S[2].
- The second value defines the tda998x AP_ENA reg content when the DAI
- in question is used. The implementation allows one or two DAIs. If two
- DAIs are defined, they must be of different type.
-
- - nxp,calib-gpios: calibration GPIO, which must correspond with the
- gpio used for the TDA998x interrupt pin.
-
-[1] Documentation/sound/soc/dai.rst
-[2] include/dt-bindings/display/tda998x.h
-
-Example:
-
-#include <dt-bindings/display/tda998x.h>
-
- tda998x: hdmi-encoder {
- compatible = "nxp,tda998x";
- reg = <0x70>;
- interrupt-parent = <&gpio0>;
- interrupts = <27 2>; /* falling edge */
- pinctrl-0 = <&pmx_camera>;
- pinctrl-names = "default";
- video-ports = <0x230145>;
-
- #sound-dai-cells = <2>;
- /* DAI-format AP_ENA reg value */
- audio-ports = < TDA998x_SPDIF 0x04
- TDA998x_I2S 0x03>;
-
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/ti,dlpc3433.yaml b/Documentation/devicetree/bindings/display/bridge/ti,dlpc3433.yaml
new file mode 100644
index 000000000000..d3f84d220723
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/ti,dlpc3433.yaml
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/ti,dlpc3433.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TI DLPC3433 MIPI DSI to DMD bridge
+
+maintainers:
+ - Jagan Teki <jagan@amarulasolutions.com>
+ - Christopher Vollo <chris@renewoutreach.org>
+
+description: |
+ TI DLPC3433 is a MIPI DSI based display controller bridge
+ for driving high resolution DMD based projectors.
+
+ It has a flexible configuration of MIPI DSI and DPI signal
+ input that produces a DMD output in RGB565, RGB666, RGB888
+ formats.
+
+ It supports up to 720p resolution with 60 and 120 Hz refresh
+ rates.
+
+properties:
+ compatible:
+ const: ti,dlpc3433
+
+ reg:
+ enum:
+ - 0x1b
+ - 0x1d
+
+ enable-gpios:
+ description: PROJ_ON pin; the chip powers up when PROJ_ON is high.
+
+ vcc_intf-supply:
+ description: A 1.8V/3.3V supply that powers the Host I/O.
+
+ vcc_flsh-supply:
+ description: A 1.8V/3.3V supply that powers the Flash I/O.
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: Video port for MIPI DSI input.
+
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+
+ properties:
+ data-lanes:
+ description: array of physical DSI data lane indexes.
+ minItems: 1
+ items:
+ - const: 1
+ - const: 2
+ - const: 3
+ - const: 4
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Video port for DMD output.
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - enable-gpios
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ bridge@1b {
+ compatible = "ti,dlpc3433";
+ reg = <0x1b>;
+ enable-gpios = <&gpio2 1 GPIO_ACTIVE_HIGH>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+
+ bridge_in_dsi: endpoint {
+ remote-endpoint = <&dsi_out_bridge>;
+ data-lanes = <1 2 3 4>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+
+ bridge_out_panel: endpoint {
+ remote-endpoint = <&panel_out_bridge>;
+ };
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml
index b446d0f0f1b4..48a97bb3e2e0 100644
--- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi83.yaml
@@ -32,6 +32,9 @@ properties:
maxItems: 1
description: GPIO specifier for bridge_en pin (active high).
+ vcc-supply:
+ description: A 1.8V power supply (see regulator/regulator.yaml).
+
ports:
$ref: /schemas/graph.yaml#/properties/ports
@@ -91,7 +94,6 @@ properties:
required:
- compatible
- reg
- - enable-gpios
- ports
allOf:
@@ -133,6 +135,7 @@ examples:
reg = <0x2d>;
enable-gpios = <&gpio2 1 GPIO_ACTIVE_HIGH>;
+ vcc-supply = <&reg_sn65dsi83_1v8>;
ports {
#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml
index 911564468c5e..6ec6d287bff4 100644
--- a/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/ti,sn65dsi86.yaml
@@ -90,7 +90,7 @@ properties:
properties:
endpoint:
- $ref: /schemas/graph.yaml#/$defs/endpoint-base
+ $ref: /schemas/media/video-interfaces.yaml#
unevaluatedProperties: false
properties:
@@ -106,7 +106,6 @@ properties:
description:
If you have 1 logical lane the bridge supports routing
to either port 0 or port 1. Port 0 is suggested.
- See ../../media/video-interface.txt for details.
- minItems: 2
maxItems: 2
@@ -118,7 +117,6 @@ properties:
description:
If you have 2 logical lanes the bridge supports
reordering but only on physical ports 0 and 1.
- See ../../media/video-interface.txt for details.
- minItems: 4
maxItems: 4
@@ -132,7 +130,6 @@ properties:
description:
If you have 4 logical lanes the bridge supports
reordering in any way.
- See ../../media/video-interface.txt for details.
lane-polarities:
minItems: 1
@@ -141,7 +138,6 @@ properties:
enum:
- 0
- 1
- description: See ../../media/video-interface.txt
dependencies:
lane-polarities: [data-lanes]
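As the retained descriptions state, data-lanes maps logical DP lanes onto the bridge's physical ports, and with four logical lanes any reordering is allowed. Purely as an illustration, a four-lane output endpoint that reverses the mapping could be written as below; the remote-endpoint label is an assumption:

    endpoint {
        remote-endpoint = <&dp_connector_in>;
        data-lanes = <3 2 1 0>;  /* logical lanes 0-3 routed to physical ports 3-0 */
    };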
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358762.yaml b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358762.yaml
index 5216c27fc0ad..81ca3cbc7abe 100644
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358762.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358762.yaml
@@ -39,7 +39,6 @@ properties:
Video port for MIPI DPI output (panel or connector).
required:
- - port@0
- port@1
required:
@@ -52,7 +51,7 @@ additionalProperties: false
examples:
- |
- i2c1 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt
deleted file mode 100644
index 8f9abf28a8fa..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-TC358764 MIPI-DSI to LVDS panel bridge
-
-Required properties:
- - compatible: "toshiba,tc358764"
- - reg: the virtual channel number of a DSI peripheral
- - vddc-supply: core voltage supply, 1.2V
- - vddio-supply: I/O voltage supply, 1.8V or 3.3V
- - vddlvds-supply: LVDS1/2 voltage supply, 3.3V
- - reset-gpios: a GPIO spec for the reset pin
-
-The device node can contain following 'port' child nodes,
-according to the OF graph bindings defined in [1]:
- 0: DSI Input, not required, if the bridge is DSI controlled
- 1: LVDS Output, mandatory
-
-[1]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
-
- bridge@0 {
- reg = <0>;
- compatible = "toshiba,tc358764";
- vddc-supply = <&vcc_1v2_reg>;
- vddio-supply = <&vcc_1v8_reg>;
- vddlvds-supply = <&vcc_3v3_reg>;
- reset-gpios = <&gpd1 6 GPIO_ACTIVE_LOW>;
- #address-cells = <1>;
- #size-cells = <0>;
- port@1 {
- reg = <1>;
- lvds_ep: endpoint {
- remote-endpoint = <&panel_ep>;
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.yaml b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.yaml
new file mode 100644
index 000000000000..866607400514
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358764.yaml
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/toshiba,tc358764.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Toshiba TC358764 MIPI-DSI to LVDS bridge
+
+maintainers:
+ - Andrzej Hajda <andrzej.hajda@intel.com>
+
+properties:
+ compatible:
+ const: toshiba,tc358764
+
+ reg:
+ description: Virtual channel number of a DSI peripheral
+ maxItems: 1
+
+ reset-gpios:
+ maxItems: 1
+
+ vddc-supply:
+ description: Core voltage supply, 1.2V
+
+ vddio-supply:
+ description: I/O voltage supply, 1.8V or 3.3V
+
+ vddlvds-supply:
+ description: LVDS1/2 voltage supply, 3.3V
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port for MIPI DSI input, used if the bridge is DSI-controlled.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port for LVDS output (panel or connector).
+
+ required:
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - reset-gpios
+ - vddc-supply
+ - vddio-supply
+ - vddlvds-supply
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ bridge@0 {
+ compatible = "toshiba,tc358764";
+ reg = <0>;
+
+ reset-gpios = <&gpd1 6 GPIO_ACTIVE_LOW>;
+ vddc-supply = <&vcc_1v2_reg>;
+ vddio-supply = <&vcc_1v8_reg>;
+ vddlvds-supply = <&vcc_3v3_reg>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@1 {
+ reg = <1>;
+ lvds_ep: endpoint {
+ remote-endpoint = <&panel_ep>;
+ };
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
deleted file mode 100644
index 583c5e9dbe6b..000000000000
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-Toshiba TC358767 eDP bridge bindings
-
-Required properties:
- - compatible: "toshiba,tc358767"
- - reg: i2c address of the bridge, 0x68 or 0x0f, depending on bootstrap pins
- - clock-names: should be "ref"
- - clocks: OF device-tree clock specification for refclk input. The reference
- clock rate must be 13 MHz, 19.2 MHz, 26 MHz, or 38.4 MHz.
-
-Optional properties:
- - shutdown-gpios: OF device-tree gpio specification for SD pin
- (active high shutdown input)
- - reset-gpios: OF device-tree gpio specification for RSTX pin
- (active low system reset)
- - toshiba,hpd-pin: TC358767 GPIO pin number to which HPD is connected to (0 or 1)
- - ports: the ports node can contain video interface port nodes to connect
- to a DPI/DSI source and to an eDP/DP sink according to [1][2]:
- - port@0: DSI input port
- - port@1: DPI input port
- - port@2: eDP/DP output port
-
-[1]: Documentation/devicetree/bindings/graph.txt
-[2]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
- edp-bridge@68 {
- compatible = "toshiba,tc358767";
- reg = <0x68>;
- shutdown-gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
- reset-gpios = <&gpio3 24 GPIO_ACTIVE_LOW>;
- clock-names = "ref";
- clocks = <&edp_refclk>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@1 {
- reg = <1>;
-
- bridge_in: endpoint {
- remote-endpoint = <&dpi_out>;
- };
- };
-
- port@2 {
- reg = <2>;
-
- bridge_out: endpoint {
- remote-endpoint = <&panel_in>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.yaml b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.yaml
new file mode 100644
index 000000000000..e1494b5007cb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.yaml
@@ -0,0 +1,174 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/bridge/toshiba,tc358767.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Toshiba TC358767 eDP bridge
+
+maintainers:
+ - Andrey Gusakov <andrey.gusakov@cogentembedded.com>
+
+description: The TC358767 is a bridge device which converts DSI/DPI to eDP/DP.
+
+properties:
+ compatible:
+ const: toshiba,tc358767
+
+ reg:
+ enum:
+ - 0x68
+ - 0x0f
+ description: |
+ I2C address of the bridge, 0x68 or 0x0f, depending on bootstrap pins.
+
+ clock-names:
+ const: ref
+
+ clocks:
+ maxItems: 1
+ description: |
+ OF device-tree clock specification for refclk input. The reference
+ clock rate must be 13 MHz, 19.2 MHz, 26 MHz, or 38.4 MHz.
+
+ shutdown-gpios:
+ maxItems: 1
+ description: |
+ OF device-tree GPIO specification for the SD pin (active-high shutdown input).
+
+ reset-gpios:
+ maxItems: 1
+ description: |
+ OF device-tree GPIO specification for the RSTX pin (active-low system reset).
+
+ toshiba,hpd-pin:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum:
+ - 0
+ - 1
+ description: TC358767 GPIO pin number to which HPD is connected (0 or 1)
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: |
+ DSI input port. The remote endpoint phandle should be a
+ reference to a valid DSI output endpoint node.
+
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+
+ properties:
+ data-lanes:
+ description: array of physical DSI data lane indexes.
+ minItems: 1
+ items:
+ - const: 1
+ - const: 2
+ - const: 3
+ - const: 4
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: |
+ DPI input/output port. The remote endpoint phandle should be a
+ reference to a valid DPI output or input endpoint node.
+
+ port@2:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: |
+ eDP/DP output port. The remote endpoint phandle should be a
+ reference to a valid eDP panel input endpoint node. This port is
+ optional; the output is treated as a DP panel if not defined.
+
+ oneOf:
+ - required:
+ - port@0
+ - required:
+ - port@1
+
+required:
+ - compatible
+ - reg
+ - clock-names
+ - clocks
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ /* DPI input and eDP output */
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ edp-bridge@68 {
+ compatible = "toshiba,tc358767";
+ reg = <0x68>;
+ shutdown-gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio3 24 GPIO_ACTIVE_LOW>;
+ clock-names = "ref";
+ clocks = <&edp_refclk>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@1 {
+ reg = <1>;
+
+ bridge_in_0: endpoint {
+ remote-endpoint = <&dpi_out>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+
+ bridge_out: endpoint {
+ remote-endpoint = <&panel_in>;
+ };
+ };
+ };
+ };
+ };
+ - |
+ /* DPI input and DP output */
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ edp-bridge@68 {
+ compatible = "toshiba,tc358767";
+ reg = <0x68>;
+ shutdown-gpios = <&gpio3 23 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio3 24 GPIO_ACTIVE_LOW>;
+ clock-names = "ref";
+ clocks = <&edp_refclk>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@1 {
+ reg = <1>;
+
+ bridge_in_1: endpoint {
+ remote-endpoint = <&dpi_out>;
+ };
+ };
+ };
+ };
+ };
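Both examples feed the bridge through its DPI port. A DSI-fed variant, satisfying the port@0 arm of the oneOf, might look like this sketch; the endpoint labels and lane count are assumptions:

    edp-bridge@68 {
        compatible = "toshiba,tc358767";
        reg = <0x68>;
        clock-names = "ref";
        clocks = <&edp_refclk>;

        ports {
            #address-cells = <1>;
            #size-cells = <0>;

            port@0 {
                reg = <0>;
                bridge_in_dsi: endpoint {
                    remote-endpoint = <&dsi_out>;
                    data-lanes = <1 2 3 4>;
                };
            };

            port@2 {
                reg = <2>;
                bridge_out_edp: endpoint {
                    remote-endpoint = <&panel_in>;
                };
            };
        };
    };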
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358768.yaml b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358768.yaml
index eacfe7165083..779d8c57f854 100644
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358768.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358768.yaml
@@ -58,6 +58,7 @@ properties:
properties:
data-lines:
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [ 16, 18, 24 ]
port@1:
@@ -77,17 +78,20 @@ required:
- vddio-supply
- ports
-additionalProperties: false
+allOf:
+ - $ref: ../dsi-controller.yaml#
+
+unevaluatedProperties: false
examples:
- |
#include <dt-bindings/gpio/gpio.h>
- i2c1 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
- dsi_bridge: dsi-bridge@e {
+ dsi_bridge: dsi@e {
compatible = "toshiba,tc358768";
reg = <0xe>;
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml
index 10471c6c1ff9..d879c700594a 100644
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/bridge/toshiba,tc358775.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Toshiba TC358775 DSI to LVDS bridge bindings
+title: Toshiba TC358775 DSI to LVDS bridge
maintainers:
- Vinay Simha BN <simhavcs@gmail.com>
diff --git a/Documentation/devicetree/bindings/display/dp-aux-bus.yaml b/Documentation/devicetree/bindings/display/dp-aux-bus.yaml
index 5e4afe9f98fb..0ece7b01790b 100644
--- a/Documentation/devicetree/bindings/display/dp-aux-bus.yaml
+++ b/Documentation/devicetree/bindings/display/dp-aux-bus.yaml
@@ -26,7 +26,7 @@ description:
properties:
$nodename:
- const: "aux-bus"
+ const: aux-bus
panel:
$ref: panel/panel-common.yaml#
diff --git a/Documentation/devicetree/bindings/display/dsi-controller.yaml b/Documentation/devicetree/bindings/display/dsi-controller.yaml
index ca21671f6bdd..67ce10307ee0 100644
--- a/Documentation/devicetree/bindings/display/dsi-controller.yaml
+++ b/Documentation/devicetree/bindings/display/dsi-controller.yaml
@@ -30,6 +30,15 @@ properties:
$nodename:
pattern: "^dsi(@.*)?$"
+ clock-master:
+ type: boolean
+ description:
+ Should be enabled if the host is being used in conjunction with
+ another DSI host to drive the same peripheral. Hardware supporting
+ such a configuration generally requires the data on both the busses
+ to be driven by the same clock. Only the DSI host instance
+ controlling this clock should contain this property.
+
"#address-cells":
const: 1
@@ -52,15 +61,6 @@ patternProperties:
case the reg property can take multiple entries, one for each virtual
channel that the peripheral responds to.
- clock-master:
- type: boolean
- description:
- Should be enabled if the host is being used in conjunction with
- another DSI host to drive the same peripheral. Hardware supporting
- such a configuration generally requires the data on both the busses
- to be driven by the same clock. Only the DSI host instance
- controlling this clock should contain this property.
-
enforce-video-mode:
type: boolean
description:
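With clock-master moved from the peripheral to the host level, a dual-link configuration marks only the clock-providing instance. A minimal sketch, with node names and addresses assumed:

    dsi0: dsi@ff450000 {
        /* this host drives the shared byte clock for both links */
        clock-master;
        /* ... remaining host properties and peripheral child node ... */
    };

    dsi1: dsi@ff460000 {
        /* companion host: deliberately carries no clock-master property */
        /* ... remaining host properties and peripheral child node ... */
    };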
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos-mic.txt b/Documentation/devicetree/bindings/display/exynos/exynos-mic.txt
deleted file mode 100644
index 0fba2ee6440a..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos-mic.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-Device-Tree bindings for Samsung Exynos SoC mobile image compressor (MIC)
-
-MIC (mobile image compressor) resides between decon and mipi dsi. Mipi dsi is
-not capable to transfer high resoltuion frame data as decon can send. MIC
-solves this problem by compressing the frame data by 1/2 before it is
-transferred through mipi dsi. The compressed frame data must be uncompressed in
-the panel PCB.
-
-Required properties:
-- compatible: value should be "samsung,exynos5433-mic".
-- reg: physical base address and length of the MIC registers set and system
- register of mic.
-- clocks: must include clock specifiers corresponding to entries in the
- clock-names property.
-- clock-names: list of clock names sorted in the same order as the clocks
- property. Must contain "pclk_mic0", "sclk_rgb_vclk_to_mic0".
-- samsung,disp-syscon: the reference node for syscon for DISP block.
-- ports: contains a port which is connected to decon node and dsi node.
- address-cells and size-cells must 1 and 0, respectively.
-- port: contains an endpoint node which is connected to the endpoint in the
- decon node or dsi node. The reg value must be 0 and 1 respectively.
-
-Example:
-SoC specific DT entry:
-mic: mic@13930000 {
- compatible = "samsung,exynos5433-mic";
- reg = <0x13930000 0x48>;
- clocks = <&cmu_disp CLK_PCLK_MIC0>,
- <&cmu_disp CLK_SCLK_RGB_VCLK_TO_MIC0>;
- clock-names = "pclk_mic0", "sclk_rgb_vclk_to_mic0";
- samsung,disp-syscon = <&syscon_disp>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- mic_to_decon: endpoint {
- remote-endpoint = <&decon_to_mic>;
- };
- };
-
- port@1 {
- reg = <1>;
- mic_to_dsi: endpoint {
- remote-endpoint = <&dsi_to_mic>;
- };
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt b/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt
deleted file mode 100644
index 775193e1c641..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Device-Tree bindings for Samsung Exynos SoC display controller (DECON)
-
-DECON (Display and Enhancement Controller) is the Display Controller for the
-Exynos series of SoCs which transfers the image data from a video memory
-buffer to an external LCD interface.
-
-Required properties:
-- compatible: value should be one of:
- "samsung,exynos5433-decon", "samsung,exynos5433-decon-tv";
-- reg: physical base address and length of the DECON registers set.
-- interrupt-names: should contain the interrupt names depending on mode of work:
- video mode: "vsync",
- command mode: "lcd_sys",
- command mode with software trigger: "lcd_sys", "te".
-- interrupts or interrupts-extended: list of interrupt specifiers corresponding
- to names privided in interrupt-names, as described in
- interrupt-controller/interrupts.txt
-- clocks: must include clock specifiers corresponding to entries in the
- clock-names property.
-- clock-names: list of clock names sorted in the same order as the clocks
- property. Must contain "pclk", "aclk_decon", "aclk_smmu_decon0x",
- "aclk_xiu_decon0x", "pclk_smmu_decon0x", "aclk_smmu_decon1x",
- "aclk_xiu_decon1x", "pclk_smmu_decon1x", clk_decon_vclk",
- "sclk_decon_eclk"
-- ports: contains a port which is connected to mic node. address-cells and
- size-cells must 1 and 0, respectively.
-- port: contains an endpoint node which is connected to the endpoint in the mic
- node. The reg value muset be 0.
-
-Example:
-SoC specific DT entry:
-decon: decon@13800000 {
- compatible = "samsung,exynos5433-decon";
- reg = <0x13800000 0x2104>;
- clocks = <&cmu_disp CLK_ACLK_DECON>, <&cmu_disp CLK_ACLK_SMMU_DECON0X>,
- <&cmu_disp CLK_ACLK_XIU_DECON0X>,
- <&cmu_disp CLK_PCLK_SMMU_DECON0X>,
- <&cmu_disp CLK_ACLK_SMMU_DECON1X>,
- <&cmu_disp CLK_ACLK_XIU_DECON1X>,
- <&cmu_disp CLK_PCLK_SMMU_DECON1X>,
- <&cmu_disp CLK_SCLK_DECON_VCLK>,
- <&cmu_disp CLK_SCLK_DECON_ECLK>;
- clock-names = "aclk_decon", "aclk_smmu_decon0x", "aclk_xiu_decon0x",
- "pclk_smmu_decon0x", "aclk_smmu_decon1x", "aclk_xiu_decon1x",
- "pclk_smmu_decon1x", "sclk_decon_vclk", "sclk_decon_eclk";
- interrupt-names = "vsync", "lcd_sys";
- interrupts = <0 202 0>, <0 203 0>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- decon_to_mic: endpoint {
- remote-endpoint = <&mic_to_decon>;
- };
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos7-decon.txt b/Documentation/devicetree/bindings/display/exynos/exynos7-decon.txt
deleted file mode 100644
index 53912c99ec38..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos7-decon.txt
+++ /dev/null
@@ -1,65 +0,0 @@
-Device-Tree bindings for Samsung Exynos7 SoC display controller (DECON)
-
-DECON (Display and Enhancement Controller) is the Display Controller for the
-Exynos7 series of SoCs which transfers the image data from a video memory
-buffer to an external LCD interface.
-
-Required properties:
-- compatible: value should be "samsung,exynos7-decon";
-
-- reg: physical base address and length of the DECON registers set.
-
-- interrupts: should contain a list of all DECON IP block interrupts in the
- order: FIFO Level, VSYNC, LCD_SYSTEM. The interrupt specifier
- format depends on the interrupt controller used.
-
-- interrupt-names: should contain the interrupt names: "fifo", "vsync",
- "lcd_sys", in the same order as they were listed in the interrupts
- property.
-
-- pinctrl-0: pin control group to be used for this controller.
-
-- pinctrl-names: must contain a "default" entry.
-
-- clocks: must include clock specifiers corresponding to entries in the
- clock-names property.
-
-- clock-names: list of clock names sorted in the same order as the clocks
- property. Must contain "pclk_decon0", "aclk_decon0",
- "decon0_eclk", "decon0_vclk".
-- i80-if-timings: timing configuration for lcd i80 interface support.
-
-Optional Properties:
-- power-domains: a phandle to DECON power domain node.
-- display-timings: timing settings for DECON, as described in document [1].
- Can be used in case timings cannot be provided otherwise
- or to override timings provided by the panel.
-
-[1]: Documentation/devicetree/bindings/display/panel/display-timing.txt
-
-Example:
-
-SoC specific DT entry:
-
- decon@13930000 {
- compatible = "samsung,exynos7-decon";
- interrupt-parent = <&combiner>;
- reg = <0x13930000 0x1000>;
- interrupt-names = "lcd_sys", "vsync", "fifo";
- interrupts = <0 188 0>, <0 189 0>, <0 190 0>;
- clocks = <&clock_disp PCLK_DECON_INT>,
- <&clock_disp ACLK_DECON_INT>,
- <&clock_disp SCLK_DECON_INT_ECLK>,
- <&clock_disp SCLK_DECON_INT_EXTCLKPLL>;
- clock-names = "pclk_decon0", "aclk_decon0", "decon0_eclk",
- "decon0_vclk";
- status = "disabled";
- };
-
-Board specific DT entry:
-
- decon@13930000 {
- pinctrl-0 = <&lcd_clk &pwm1_out>;
- pinctrl-names = "default";
- status = "okay";
- };
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt b/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt
index 9b6cba3f82af..3a401590320f 100644
--- a/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt
+++ b/Documentation/devicetree/bindings/display/exynos/exynos_dp.txt
@@ -50,7 +50,7 @@ Optional properties for dp-controller:
Documentation/devicetree/bindings/display/panel/display-timing.txt
For the below properties, please refer to Analogix DP binding document:
- * Documentation/devicetree/bindings/display/bridge/analogix_dp.txt
+ * Documentation/devicetree/bindings/display/bridge/analogix,dp.yaml
-phys (required)
-phy-names (required)
-hpd-gpios (optional)
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt b/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt
deleted file mode 100644
index be377786e8cd..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos_dsim.txt
+++ /dev/null
@@ -1,90 +0,0 @@
-Exynos MIPI DSI Master
-
-Required properties:
- - compatible: value should be one of the following
- "samsung,exynos3250-mipi-dsi" /* for Exynos3250/3472 SoCs */
- "samsung,exynos4210-mipi-dsi" /* for Exynos4 SoCs */
- "samsung,exynos5410-mipi-dsi" /* for Exynos5410/5420/5440 SoCs */
- "samsung,exynos5422-mipi-dsi" /* for Exynos5422/5800 SoCs */
- "samsung,exynos5433-mipi-dsi" /* for Exynos5433 SoCs */
- - reg: physical base address and length of the registers set for the device
- - interrupts: should contain DSI interrupt
- - clocks: list of clock specifiers, must contain an entry for each required
- entry in clock-names
- - clock-names: should include "bus_clk" and "sclk_mipi" entries;
- the use of "pll_clk" is deprecated
- - phys: list of phy specifiers, must contain an entry for each required
- entry in phy-names
- - phy-names: should include "dsim" entry
- - vddcore-supply: MIPI DSIM Core voltage supply (e.g. 1.1V)
- - vddio-supply: MIPI DSIM I/O and PLL voltage supply (e.g. 1.8V)
- - samsung,pll-clock-frequency: specifies frequency of the oscillator clock
- - #address-cells, #size-cells: should be set respectively to <1> and <0>
- according to DSI host bindings (see MIPI DSI bindings [1])
- - samsung,burst-clock-frequency: specifies DSI frequency in high-speed burst
- mode
- - samsung,esc-clock-frequency: specifies DSI frequency in escape mode
-
-Optional properties:
- - power-domains: a phandle to DSIM power domain node
-
-Child nodes:
- Should contain DSI peripheral nodes (see MIPI DSI bindings [1]).
-
-Video interfaces:
- Device node can contain following video interface port nodes according to [2]:
- 0: RGB input,
- 1: DSI output
-
-[1]: Documentation/devicetree/bindings/display/mipi-dsi-bus.txt
-[2]: Documentation/devicetree/bindings/media/video-interfaces.txt
-
-Example:
-
- dsi@11c80000 {
- compatible = "samsung,exynos4210-mipi-dsi";
- reg = <0x11C80000 0x10000>;
- interrupts = <0 79 0>;
- clocks = <&clock 286>, <&clock 143>;
- clock-names = "bus_clk", "sclk_mipi";
- phys = <&mipi_phy 1>;
- phy-names = "dsim";
- vddcore-supply = <&vusb_reg>;
- vddio-supply = <&vmipi_reg>;
- power-domains = <&pd_lcd0>;
- #address-cells = <1>;
- #size-cells = <0>;
- samsung,pll-clock-frequency = <24000000>;
-
- panel@0 {
- reg = <0>;
- ...
- port {
- panel_ep: endpoint {
- remote-endpoint = <&dsi_ep>;
- };
- };
- };
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- dsi_to_mic: endpoint {
- remote-endpoint = <&mic_to_dsi>;
- };
- };
-
- port@1 {
- reg = <1>;
- dsi_ep: endpoint {
- reg = <0>;
- samsung,burst-clock-frequency = <500000000>;
- samsung,esc-clock-frequency = <20000000>;
- remote-endpoint = <&panel_ep>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt b/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt
deleted file mode 100644
index 58b12e25bbb1..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos_hdmi.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-Device-Tree bindings for drm hdmi driver
-
-Required properties:
-- compatible: value should be one among the following:
- 1) "samsung,exynos4210-hdmi"
- 2) "samsung,exynos4212-hdmi"
- 3) "samsung,exynos5420-hdmi"
- 4) "samsung,exynos5433-hdmi"
-- reg: physical base address of the hdmi and length of memory mapped
- region.
-- interrupts: interrupt number to the cpu.
-- hpd-gpios: the following information about the hotplug gpio pin:
- a) phandle of the gpio controller node.
- b) pin number within the gpio controller.
- c) optional flags and pull up/down.
-- ddc: phandle to the hdmi ddc node
-- phy: phandle to the hdmi phy node
-- samsung,syscon-phandle: phandle for system controller node for PMU.
-- #sound-dai-cells: should be 0.
-
-Required properties for Exynos 4210, 4212, 5420 and 5433:
-- clocks: list of clock IDs from SoC clock driver.
- a) hdmi: Gate of HDMI IP bus clock.
- b) sclk_hdmi: Gate of HDMI special clock.
- c) sclk_pixel: Pixel special clock, one of the two possible inputs of
- HDMI clock mux.
- d) sclk_hdmiphy: HDMI PHY clock output, one of two possible inputs of
- HDMI clock mux.
- e) mout_hdmi: Mux required by the driver to switch between its two
- parents, i.e. sclk_pixel and sclk_hdmiphy. If the HDMI PHY is stable
- after configuration, the parent is set to sclk_hdmiphy, otherwise to
- sclk_pixel.
-- clock-names: aliases as per driver requirements for above clock IDs:
- "hdmi", "sclk_hdmi", "sclk_pixel", "sclk_hdmiphy" and "mout_hdmi".
-
-Required properties for Exynos 5433:
-- clocks: list of clock specifiers according to common clock bindings.
- a) hdmi_pclk: Gate of HDMI IP APB bus.
- b) hdmi_i_pclk: Gate of HDMI-PHY IP APB bus.
- c) i_tmds_clk: Gate of HDMI TMDS clock.
- d) i_pixel_clk: Gate of HDMI pixel clock.
- e) i_spdif_clk: Gate of HDMI SPDIF clock.
- f) oscclk: Oscillator clock, used as parent of the following *_user clocks
- in case the HDMI-PHY is not operational.
- g) tmds_clko: TMDS clock generated by the HDMI-PHY.
- h) tmds_clko_user: MUX used to switch between oscclk and tmds_clko,
- respectively when the HDMI-PHY is off and operational.
- i) pixel_clko: Pixel clock generated by the HDMI-PHY.
- j) pixel_clko_user: MUX used to switch between oscclk and pixel_clko,
- respectively when the HDMI-PHY is off and operational.
-- clock-names: aliases for the above clock specifiers.
-- samsung,sysreg: handle to syscon used to control the system registers.
-
-Example:
-
- hdmi {
- compatible = "samsung,exynos4212-hdmi";
- reg = <0x14530000 0x100000>;
- interrupts = <0 95 0>;
- hpd-gpios = <&gpx3 7 1>;
- ddc = <&hdmi_ddc_node>;
- phy = <&hdmi_phy_node>;
- samsung,syscon-phandle = <&pmu_system_controller>;
- };
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_hdmiddc.txt b/Documentation/devicetree/bindings/display/exynos/exynos_hdmiddc.txt
deleted file mode 100644
index 41eee971562b..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos_hdmiddc.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-Device-Tree bindings for hdmiddc driver
-
-Required properties:
-- compatible: value should be one of the following
- 1) "samsung,exynos5-hdmiddc" <DEPRECATED>
- 2) "samsung,exynos4210-hdmiddc"
-
-- reg: I2C address of the hdmiddc device.
-
-Example:
-
- hdmiddc {
- compatible = "samsung,exynos4210-hdmiddc";
- reg = <0x50>;
- };
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_hdmiphy.txt b/Documentation/devicetree/bindings/display/exynos/exynos_hdmiphy.txt
deleted file mode 100644
index 162f641f7639..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos_hdmiphy.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-Device-Tree bindings for hdmiphy driver
-
-Required properties:
-- compatible: value should be one of the following:
- 1) "samsung,exynos5-hdmiphy" <DEPRECATED>
- 2) "samsung,exynos4210-hdmiphy".
- 3) "samsung,exynos4212-hdmiphy".
-- reg: I2C address of the hdmiphy device.
-
-Example:
-
- hdmiphy {
- compatible = "samsung,exynos4210-hdmiphy";
- reg = <0x38>;
- };
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos_mixer.txt b/Documentation/devicetree/bindings/display/exynos/exynos_mixer.txt
deleted file mode 100644
index 3e38128f866b..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/exynos_mixer.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Device-Tree bindings for mixer driver
-
-Required properties:
-- compatible: value should be one of the following:
- 1) "samsung,exynos5-mixer" <DEPRECATED>
- 2) "samsung,exynos4210-mixer"
- 3) "samsung,exynos4212-mixer"
- 4) "samsung,exynos5250-mixer"
- 5) "samsung,exynos5420-mixer"
-
-- reg: physical base address of the mixer and length of memory mapped
- region.
-- interrupts: interrupt number to the cpu.
-- clocks: list of clock IDs from SoC clock driver.
- a) mixer: Gate of Mixer IP bus clock.
- b) sclk_hdmi: HDMI Special clock, one of the two possible inputs of
- mixer mux.
- c) hdmi: Gate of HDMI IP bus clock, needed together with sclk_hdmi.
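-
-The example below omits the clock hookup; a hedged sketch, with placeholder
-clock indices in the order listed above, would be:
-
-mixer {
-	/* a) mixer, b) sclk_hdmi, c) hdmi - indices are illustrative */
-	clocks = <&clock 343>, <&clock 344>, <&clock 345>;
-};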
-
-Example:
-
- mixer {
- compatible = "samsung,exynos5250-mixer";
- reg = <0x14450000 0x10000>;
- interrupts = <0 94 0>;
- };
diff --git a/Documentation/devicetree/bindings/display/exynos/samsung-fimd.txt b/Documentation/devicetree/bindings/display/exynos/samsung-fimd.txt
deleted file mode 100644
index b3096421d42b..000000000000
--- a/Documentation/devicetree/bindings/display/exynos/samsung-fimd.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-Device-Tree bindings for Samsung SoC display controller (FIMD)
-
-FIMD (Fully Interactive Mobile Display) is the Display Controller for the
-Samsung series of SoCs which transfers the image data from a video memory
-buffer to an external LCD interface.
-
-Required properties:
-- compatible: value should be one of the following
- "samsung,s3c2443-fimd"; /* for S3C24XX SoCs */
- "samsung,s3c6400-fimd"; /* for S3C64XX SoCs */
- "samsung,s5pv210-fimd"; /* for S5PV210 SoC */
- "samsung,exynos3250-fimd"; /* for Exynos3250/3472 SoCs */
- "samsung,exynos4210-fimd"; /* for Exynos4 SoCs */
- "samsung,exynos5250-fimd"; /* for Exynos5250 SoCs */
- "samsung,exynos5420-fimd"; /* for Exynos5420/5422/5800 SoCs */
-
-- reg: physical base address and length of the FIMD registers set.
-
-- interrupts: should contain a list of all FIMD IP block interrupts in the
- order: FIFO Level, VSYNC, LCD_SYSTEM. The interrupt specifier
- format depends on the interrupt controller used.
-
-- interrupt-names: should contain the interrupt names: "fifo", "vsync",
- "lcd_sys", in the same order as they were listed in the interrupts
- property.
-
-- pinctrl-0: pin control group to be used for this controller.
-
-- pinctrl-names: must contain a "default" entry.
-
-- clocks: must include clock specifiers corresponding to entries in the
- clock-names property.
-
-- clock-names: list of clock names sorted in the same order as the clocks
- property. Must contain "sclk_fimd" and "fimd".
-
-Optional Properties:
-- power-domains: a phandle to FIMD power domain node.
-- samsung,invert-vden: video enable signal is inverted
-- samsung,invert-vclk: video clock signal is inverted
-- display-timings: timing settings for FIMD, as described in document [1].
- Can be used in case timings cannot be provided otherwise
- or to override timings provided by the panel.
-- samsung,sysreg: handle to syscon used to control the system registers
-- i80-if-timings: timing configuration for lcd i80 interface support.
- - cs-setup: clock cycles for the active period of address signal is enabled
- until chip select is enabled.
- If not specified, the default value(0) will be used.
- - wr-setup: clock cycles for the active period of CS signal is enabled until
- write signal is enabled.
- If not specified, the default value(0) will be used.
- - wr-active: clock cycles for the active period of CS is enabled.
- If not specified, the default value(1) will be used.
- - wr-hold: clock cycles for the active period of CS is disabled until write
- signal is disabled.
- If not specified, the default value(0) will be used.
-
- The parameters are defined as:
-
- VCLK(internal)  __|¯¯¯¯¯¯|_____|¯¯¯¯¯¯|_____|¯¯¯¯¯¯|_____|¯¯¯¯¯¯|_____|¯¯
-                   :            :            :            :            :
- Address Output  --:<XXXXXXXXXXX:XXXXXXXXXXXX:XXXXXXXXXXXX:XXXXXXXXXXXX:XX
-                   | cs-setup+1 |            :            :            :
-                   |<---------->|            :            :            :
- Chip Select     ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|____________:____________:____________|¯¯
-                                | wr-setup+1 |            | wr-hold+1  |
-                                |<---------->|            |<---------->|
- Write Enable    ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|____________|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
-                                             | wr-active+1|
-                                             |<---------->|
- Video Data      ----------------------------<XXXXXXXXXXXXXXXXXXXXXXXXX>--
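-
-As a hedged sketch, an i80 interface configuration could then look as follows
-(the cycle counts are hypothetical and must be taken from the panel
-datasheet):
-
-	fimd@11c00000 {
-		i80-if-timings {
-			cs-setup = <0>;
-			wr-setup = <0>;
-			wr-active = <1>;
-			wr-hold = <0>;
-		};
-	};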
-
-The device node can contain 'port' child nodes according to the bindings defined
-in [2]. The following are properties specific to those nodes:
-- reg: (required) port index, can be:
- 0 - for CAMIF0 input,
- 1 - for CAMIF1 input,
- 2 - for CAMIF2 input,
- 3 - for parallel output,
- 4 - for write-back interface
-
-[1]: Documentation/devicetree/bindings/display/panel/display-timing.txt
-[2]: Documentation/devicetree/bindings/media/video-interfaces.txt
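-
-A hypothetical fragment for the parallel output port (port index 3 as listed
-above; the endpoint labels are illustrative):
-
-	fimd@11c00000 {
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		port@3 {
-			reg = <3>;
-			fimd_output: endpoint {
-				remote-endpoint = <&panel_input>;
-			};
-		};
-	};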
-
-Example:
-
-SoC specific DT entry:
-
- fimd@11c00000 {
- compatible = "samsung,exynos4210-fimd";
- interrupt-parent = <&combiner>;
- reg = <0x11c00000 0x20000>;
- interrupt-names = "fifo", "vsync", "lcd_sys";
- interrupts = <11 0>, <11 1>, <11 2>;
- clocks = <&clock 140>, <&clock 283>;
- clock-names = "sclk_fimd", "fimd";
- power-domains = <&pd_lcd0>;
- status = "disabled";
- };
-
-Board specific DT entry:
-
- fimd@11c00000 {
- pinctrl-0 = <&lcd_clk &lcd_data24 &pwm1_out>;
- pinctrl-names = "default";
- status = "okay";
- };
diff --git a/Documentation/devicetree/bindings/display/fsl,lcdif.yaml b/Documentation/devicetree/bindings/display/fsl,lcdif.yaml
index 900a56cae80e..75b4efd70ba8 100644
--- a/Documentation/devicetree/bindings/display/fsl,lcdif.yaml
+++ b/Documentation/devicetree/bindings/display/fsl,lcdif.yaml
@@ -20,6 +20,7 @@ properties:
- fsl,imx23-lcdif
- fsl,imx28-lcdif
- fsl,imx6sx-lcdif
+ - fsl,imx8mp-lcdif
- items:
- enum:
- fsl,imx6sl-lcdif
@@ -51,6 +52,9 @@ properties:
interrupts:
maxItems: 1
+ power-domains:
+ maxItems: 1
+
port:
$ref: /schemas/graph.yaml#/properties/port
description: The LCDIF output port
@@ -80,12 +84,48 @@ allOf:
maxItems: 3
required:
- clock-names
- else:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,imx8mp-lcdif
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ minItems: 3
+ maxItems: 3
+ required:
+ - clock-names
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - fsl,imx6sx-lcdif
+ - fsl,imx8mp-lcdif
+ then:
properties:
clocks:
maxItems: 1
clock-names:
maxItems: 1
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - fsl,imx6sl-lcdif
+ - fsl,imx6sx-lcdif
+ - fsl,imx8mm-lcdif
+ - fsl,imx8mn-lcdif
+ - fsl,imx8mp-lcdif
+ then:
+ required:
+ - power-domains
examples:
- |
@@ -100,6 +140,7 @@ examples:
<&clks IMX6SX_CLK_LCDIF_APB>,
<&clks IMX6SX_CLK_DISPLAY_AXI>;
clock-names = "pix", "axi", "disp_axi";
+ power-domains = <&pd_disp>;
port {
endpoint {
diff --git a/Documentation/devicetree/bindings/display/ilitek,ili9341.txt b/Documentation/devicetree/bindings/display/ilitek,ili9341.txt
deleted file mode 100644
index 169b32e4ee4e..000000000000
--- a/Documentation/devicetree/bindings/display/ilitek,ili9341.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Ilitek ILI9341 display panels
-
-This binding is for display panels using an Ilitek ILI9341 controller in SPI
-mode.
-
-Required properties:
-- compatible: "adafruit,yx240qv29", "ilitek,ili9341"
-- dc-gpios: D/C pin
-- reset-gpios: Reset pin
-
-The node for this driver must be a child node of a SPI controller, hence
-all mandatory properties described in ../spi/spi-bus.txt must be specified.
-
-Optional properties:
-- rotation: panel rotation in degrees counter-clockwise (0, 90, 180, 270)
-- backlight: phandle of the backlight device attached to the panel
-
-Example:
- display@0{
- compatible = "adafruit,yx240qv29", "ilitek,ili9341";
- reg = <0>;
- spi-max-frequency = <32000000>;
- dc-gpios = <&gpio0 9 GPIO_ACTIVE_HIGH>;
- reset-gpios = <&gpio0 8 GPIO_ACTIVE_HIGH>;
- rotation = <270>;
- backlight = <&backlight>;
- };
diff --git a/Documentation/devicetree/bindings/display/ilitek,ili9486.yaml b/Documentation/devicetree/bindings/display/ilitek,ili9486.yaml
index aecff34f505d..1f8f2182e2f1 100644
--- a/Documentation/devicetree/bindings/display/ilitek,ili9486.yaml
+++ b/Documentation/devicetree/bindings/display/ilitek,ili9486.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/ilitek,ili9486.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Ilitek ILI9486 display panels device tree bindings
+title: Ilitek ILI9486 display panels
maintainers:
- Kamlesh Gurudasani <kamlesh.gurudasani@gmail.com>
diff --git a/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt b/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt
deleted file mode 100644
index f4df9e83bcd2..000000000000
--- a/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-Freescale imx21 Framebuffer
-
-This framebuffer driver supports devices imx1, imx21, imx25, and imx27.
-
-Required properties:
-- compatible : "fsl,<chip>-fb", chip should be imx1 or imx21
-- reg : Should contain 1 register range (address and length)
-- interrupts : One interrupt of the fb dev
-
-Required nodes:
-- display: Phandle to a display node as described in
- Documentation/devicetree/bindings/display/panel/display-timing.txt
- Additionally, the display node has to define the following properties:
- - bits-per-pixel: Bits per pixel
- - fsl,pcr: LCDC PCR value
- A display node may optionally define
- - fsl,aus-mode: boolean to enable AUS mode (only for imx21)
-
-Optional properties:
-- lcd-supply: Regulator for LCD supply voltage.
-- fsl,dmacr: DMA Control Register value. This is optional. By default, the
- register is not modified as recommended by the datasheet.
-- fsl,lpccr: Contrast Control Register value. This property provides the
- default value for the contrast control register.
- If that property is omitted, the register is zeroed.
-- fsl,lscr1: LCDC Sharp Configuration Register value.
-
-Example:
-
- imxfb: fb@10021000 {
- compatible = "fsl,imx21-fb";
- interrupts = <61>;
- reg = <0x10021000 0x1000>;
- display = <&display0>;
- };
-
- ...
-
- display0: display0 {
- model = "Primeview-PD050VL1";
- bits-per-pixel = <16>;
- fsl,pcr = <0xf0c88080>; /* non-standard but required */
- display-timings {
- native-mode = <&timing_disp0>;
- timing_disp0: 640x480 {
- hactive = <640>;
- vactive = <480>;
- hback-porch = <112>;
- hfront-porch = <36>;
- hsync-len = <32>;
- vback-porch = <33>;
- vfront-porch = <33>;
- vsync-len = <2>;
- clock-frequency = <25000000>;
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/imx/fsl,imx-lcdc.yaml b/Documentation/devicetree/bindings/display/imx/fsl,imx-lcdc.yaml
new file mode 100644
index 000000000000..c2b29622bceb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/imx/fsl,imx-lcdc.yaml
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/imx/fsl,imx-lcdc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale i.MX LCD Controller, found on i.MX1, i.MX21, i.MX25 and i.MX27
+
+maintainers:
+ - Sascha Hauer <s.hauer@pengutronix.de>
+ - Pengutronix Kernel Team <kernel@pengutronix.de>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - fsl,imx1-fb
+ - fsl,imx21-fb
+ - items:
+ - enum:
+ - fsl,imx25-fb
+ - fsl,imx27-fb
+ - const: fsl,imx21-fb
+ - items:
+ - const: fsl,imx25-lcdc
+ - const: fsl,imx21-lcdc
+
+ clocks:
+ maxItems: 3
+
+ clock-names:
+ items:
+ - const: ipg
+ - const: ahb
+ - const: per
+
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+
+ display:
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ interrupts:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+ lcd-supply:
+ description:
+ Regulator for LCD supply voltage.
+
+ fsl,dmacr:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Override value for DMA Control Register
+
+ fsl,lpccr:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Contrast Control Register value.
+
+ fsl,lscr1:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ LCDC Sharp Configuration Register value.
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - fsl,imx1-lcdc
+ - fsl,imx21-lcdc
+ then:
+ properties:
+ display: false
+ fsl,dmacr: false
+ fsl,lpccr: false
+ fsl,lscr1: false
+
+ required:
+ - port
+
+ else:
+ properties:
+ port: false
+
+ required:
+ - display
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - interrupts
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ lcdc@53fbc000 {
+ compatible = "fsl,imx25-lcdc", "fsl,imx21-lcdc";
+ reg = <0x53fbc000 0x4000>;
+ interrupts = <39>;
+ clocks = <&clks 103>, <&clks 66>, <&clks 49>;
+ clock-names = "ipg", "ahb", "per";
+
+ port {
+ parallel_out: endpoint {
+ remote-endpoint = <&panel_in>;
+ };
+ };
+ };
+ - |
+ imxfb: fb@10021000 {
+ compatible = "fsl,imx21-fb";
+ interrupts = <61>;
+ reg = <0x10021000 0x1000>;
+ display = <&display0>;
+ clocks = <&clks 103>, <&clks 49>, <&clks 66>;
+ clock-names = "ipg", "ahb", "per";
+ };
+
+ display0: display0 {
+ model = "Primeview-PD050VL1";
+ bits-per-pixel = <16>;
+ fsl,pcr = <0xf0c88080>; /* non-standard but required */
+
+ display-timings {
+ native-mode = <&timing_disp0>;
+ timing_disp0: timing0 {
+ hactive = <640>;
+ vactive = <480>;
+ hback-porch = <112>;
+ hfront-porch = <36>;
+ hsync-len = <32>;
+ vback-porch = <33>;
+ vfront-porch = <33>;
+ vsync-len = <2>;
+ clock-frequency = <25000000>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/imx/nxp,imx8mq-dcss.yaml b/Documentation/devicetree/bindings/display/imx/nxp,imx8mq-dcss.yaml
index 0091df9dd73b..4ae6328cde64 100644
--- a/Documentation/devicetree/bindings/display/imx/nxp,imx8mq-dcss.yaml
+++ b/Documentation/devicetree/bindings/display/imx/nxp,imx8mq-dcss.yaml
@@ -2,8 +2,8 @@
# Copyright 2019 NXP
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/imx/nxp,imx8mq-dcss.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/imx/nxp,imx8mq-dcss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: iMX8MQ Display Controller Subsystem (DCSS)
@@ -105,4 +105,3 @@ examples:
};
};
};
-
diff --git a/Documentation/devicetree/bindings/display/ingenic,ipu.yaml b/Documentation/devicetree/bindings/display/ingenic,ipu.yaml
index e679f48a3886..319bd7c88fe3 100644
--- a/Documentation/devicetree/bindings/display/ingenic,ipu.yaml
+++ b/Documentation/devicetree/bindings/display/ingenic,ipu.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/ingenic,ipu.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Ingenic SoCs Image Processing Unit (IPU) devicetree bindings
+title: Ingenic SoCs Image Processing Unit (IPU)
maintainers:
- Paul Cercueil <paul@crapouillou.net>
@@ -45,7 +45,7 @@ additionalProperties: false
examples:
- |
- #include <dt-bindings/clock/jz4770-cgu.h>
+ #include <dt-bindings/clock/ingenic,jz4770-cgu.h>
ipu@13080000 {
compatible = "ingenic,jz4770-ipu", "ingenic,jz4760-ipu";
reg = <0x13080000 0x800>;
diff --git a/Documentation/devicetree/bindings/display/ingenic,lcd.yaml b/Documentation/devicetree/bindings/display/ingenic,lcd.yaml
index 50d2b0a50e8a..6d4c00f3fcc8 100644
--- a/Documentation/devicetree/bindings/display/ingenic,lcd.yaml
+++ b/Documentation/devicetree/bindings/display/ingenic,lcd.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/ingenic,lcd.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Ingenic SoCs LCD controller devicetree bindings
+title: Ingenic SoCs LCD controller
maintainers:
- Paul Cercueil <paul@crapouillou.net>
@@ -17,6 +17,8 @@ properties:
enum:
- ingenic,jz4740-lcd
- ingenic,jz4725b-lcd
+ - ingenic,jz4760-lcd
+ - ingenic,jz4760b-lcd
- ingenic,jz4770-lcd
- ingenic,jz4780-lcd
@@ -88,7 +90,7 @@ additionalProperties: false
examples:
- |
- #include <dt-bindings/clock/jz4740-cgu.h>
+ #include <dt-bindings/clock/ingenic,jz4740-cgu.h>
lcd-controller@13050000 {
compatible = "ingenic,jz4740-lcd";
reg = <0x13050000 0x1000>;
@@ -107,7 +109,7 @@ examples:
};
- |
- #include <dt-bindings/clock/jz4725b-cgu.h>
+ #include <dt-bindings/clock/ingenic,jz4725b-cgu.h>
lcd-controller@13050000 {
compatible = "ingenic,jz4725b-lcd";
reg = <0x13050000 0x1000>;
diff --git a/Documentation/devicetree/bindings/display/intel,keembay-display.yaml b/Documentation/devicetree/bindings/display/intel,keembay-display.yaml
index bc6622b010ca..2cf54ecc707a 100644
--- a/Documentation/devicetree/bindings/display/intel,keembay-display.yaml
+++ b/Documentation/devicetree/bindings/display/intel,keembay-display.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/intel,keembay-display.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Devicetree bindings for Intel Keem Bay display controller
+title: Intel Keem Bay display controller
maintainers:
- Anitha Chrisanthus <anitha.chrisanthus@intel.com>
diff --git a/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml b/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml
index a222b52d8b8f..cc7e1f318fe4 100644
--- a/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml
+++ b/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/intel,keembay-msscam.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Devicetree bindings for Intel Keem Bay MSSCAM
+title: Intel Keem Bay MSSCAM
maintainers:
- Anitha Chrisanthus <anitha.chrisanthus@intel.com>
diff --git a/Documentation/devicetree/bindings/display/panel/lvds.yaml b/Documentation/devicetree/bindings/display/lvds.yaml
index 49460c9dceea..7cd2ce7e9c33 100644
--- a/Documentation/devicetree/bindings/display/panel/lvds.yaml
+++ b/Documentation/devicetree/bindings/display/lvds.yaml
@@ -1,10 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
%YAML 1.2
---
-$id: http://devicetree.org/schemas/display/panel/lvds.yaml#
+$id: http://devicetree.org/schemas/display/lvds.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: LVDS Display Panel
+title: LVDS Display Common Properties
maintainers:
- Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
@@ -13,8 +13,8 @@ maintainers:
description: |+
LVDS is a physical layer specification defined in ANSI/TIA/EIA-644-A. Multiple
incompatible data link layers have been used over time to transmit image data
- to LVDS panels. This bindings supports display panels compatible with the
- following specifications.
+ to LVDS devices. These bindings support devices compatible with the
+ following specifications.
[JEIDA] "Digital Interface Standards for Monitor", JEIDA-59-1999, February
1999 (Version 1.0), Japan Electronic Industry Development Association (JEIDA)
@@ -26,18 +26,7 @@ description: |+
Device compatible with those specifications have been marketed under the
FPD-Link and FlatLink brands.
-allOf:
- - $ref: panel-common.yaml#
-
properties:
- compatible:
- contains:
- const: panel-lvds
- description:
- Shall contain "panel-lvds" in addition to a mandatory panel-specific
- compatible string defined in individual panel bindings. The "panel-lvds"
- value shall never be used on its own.
-
data-mapping:
enum:
- jeida-18
@@ -96,22 +85,6 @@ properties:
If set, reverse the bit order described in the data mappings below on all
data lanes, transmitting bits for slots 6 to 0 instead of 0 to 6.
- port: true
- ports: true
-
-required:
- - compatible
- - data-mapping
- - width-mm
- - height-mm
- - panel-timing
-
-oneOf:
- - required:
- - port
- - required:
- - ports
-
additionalProperties: true
...
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml
new file mode 100644
index 000000000000..92741486c24d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,aal.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display adaptive ambient light processor
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display adaptive ambient light processor, namely AAL,
+ is responsible for backlight power saving and for improving visibility in
+ sunlight. The AAL device node must be a sibling of the central MMSYS_CONFIG
+ node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8173-disp-aal
+ - mediatek,mt8183-disp-aal
+ - items:
+ - enum:
+ - mediatek,mt2712-disp-aal
+ - const: mediatek,mt8173-disp-aal
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-aal
+ - mediatek,mt8188-disp-aal
+ - mediatek,mt8192-disp-aal
+ - mediatek,mt8195-disp-aal
+ - const: mediatek,mt8183-disp-aal
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: AAL Clock
+
+ mediatek,gce-client-reg:
+ description: The register of client driver can be configured by gce with
+ 4 arguments defined in this property, such as phandle of gce, subsys id,
+ register offset and size. Each GCE subsys id is mapping to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ aal@14015000 {
+ compatible = "mediatek,mt8173-disp-aal";
+ reg = <0 0x14015000 0 0x1000>;
+ interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_AAL>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x5000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml
new file mode 100644
index 000000000000..8c2a737237f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,ccorr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display color correction
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display color correction, namely CCORR, reproduces correct color
+ on panels with different color gamut.
+ The CCORR device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8183-disp-ccorr
+ - mediatek,mt8192-disp-ccorr
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-ccorr
+ - mediatek,mt8188-disp-ccorr
+ - mediatek,mt8195-disp-ccorr
+ - const: mediatek,mt8192-disp-ccorr
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: CCORR Clock
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by GCE
+ with 4 arguments defined in this property, i.e. the GCE phandle, subsys
+ id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8183-clk.h>
+ #include <dt-bindings/power/mt8183-power.h>
+ #include <dt-bindings/gce/mt8183-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ ccorr0: ccorr@1400f000 {
+ compatible = "mediatek,mt8183-disp-ccorr";
+ reg = <0 0x1400f000 0 0x1000>;
+ interrupts = <GIC_SPI 232 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+ clocks = <&mmsys CLK_MM_DISP_CCORR0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xf000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,cec.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,cec.yaml
index 66288b9f0aa6..080cf321209e 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,cec.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,cec.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/mediatek/mediatek,cec.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Mediatek HDMI CEC Controller Device Tree Bindings
+title: Mediatek HDMI CEC Controller
maintainers:
- CK Hu <ck.hu@mediatek.com>
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml
new file mode 100644
index 000000000000..d0ea77fc4b06
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,color.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display color processor
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display color processor, namely COLOR, provides hue, luma and
+ saturation adjustments to get better picture quality and to have one panel
+ resemble the other in their output characteristics.
+ The COLOR device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt2701-disp-color
+ - mediatek,mt8167-disp-color
+ - mediatek,mt8173-disp-color
+ - items:
+ - enum:
+ - mediatek,mt7623-disp-color
+ - mediatek,mt2712-disp-color
+ - const: mediatek,mt2701-disp-color
+ - items:
+ - enum:
+ - mediatek,mt8183-disp-color
+ - mediatek,mt8186-disp-color
+ - mediatek,mt8188-disp-color
+ - mediatek,mt8192-disp-color
+ - mediatek,mt8195-disp-color
+ - const: mediatek,mt8173-disp-color
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: COLOR Clock
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by GCE
+ with 4 arguments defined in this property, i.e. the GCE phandle, subsys
+ id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ color0: color@14013000 {
+ compatible = "mediatek,mt8173-disp-color";
+ reg = <0 0x14013000 0 0x1000>;
+ interrupts = <GIC_SPI 187 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_COLOR0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x3000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,disp.txt b/Documentation/devicetree/bindings/display/mediatek/mediatek,disp.txt
deleted file mode 100644
index 78044c340e20..000000000000
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,disp.txt
+++ /dev/null
@@ -1,219 +0,0 @@
-Mediatek display subsystem
-==========================
-
-The Mediatek display subsystem consists of various DISP function blocks in the
-MMSYS register space. The connections between them can be configured by output
-and input selectors in the MMSYS_CONFIG register space. Pixel clock and start
-of frame signal are distributed to the other function blocks by a DISP_MUTEX
-function block.
-
-All DISP device tree nodes must be siblings to the central MMSYS_CONFIG node.
-For a description of the MMSYS_CONFIG binding, see
-Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml.
-
-DISP function blocks
-====================
-
-A display stream starts at a source function block that reads pixel data from
-memory and ends with a sink function block that drives pixels on a display
-interface, or writes pixels back to memory. All DISP function blocks have
-their own register space, interrupt, and clock gate. The blocks that can
-access memory additionally have to list the IOMMU and local arbiter they are
-connected to.
-
-For a description of the display interface sink function blocks, see
-Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.txt and
-Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml.
-
-Required properties (all function blocks):
-- compatible: "mediatek,<chip>-disp-<function>", one of
- "mediatek,<chip>-disp-ovl" - overlay (4 layers, blending, csc)
- "mediatek,<chip>-disp-ovl-2l" - overlay (2 layers, blending, csc)
- "mediatek,<chip>-disp-rdma" - read DMA / line buffer
- "mediatek,<chip>-disp-wdma" - write DMA
- "mediatek,<chip>-disp-ccorr" - color correction
- "mediatek,<chip>-disp-color" - color processor
- "mediatek,<chip>-disp-dither" - dither
- "mediatek,<chip>-disp-aal" - adaptive ambient light controller
- "mediatek,<chip>-disp-gamma" - gamma correction
- "mediatek,<chip>-disp-merge" - merge streams from two RDMA sources
- "mediatek,<chip>-disp-postmask" - control round corner for display frame
- "mediatek,<chip>-disp-split" - split stream to two encoders
- "mediatek,<chip>-disp-ufoe" - data compression engine
- "mediatek,<chip>-dsi" - DSI controller, see mediatek,dsi.txt
- "mediatek,<chip>-dpi" - DPI controller, see mediatek,dpi.txt
- "mediatek,<chip>-disp-mutex" - display mutex
- "mediatek,<chip>-disp-od" - overdrive
- the supported chips are mt2701, mt7623, mt2712, mt8167, mt8173, mt8183 and mt8192.
-- reg: Physical base address and length of the function block register space
-- interrupts: The interrupt signal from the function block (required, except for
- merge and split function blocks).
-- clocks: device clocks
- See Documentation/devicetree/bindings/clock/clock-bindings.txt for details.
- For most function blocks this is just a single clock input. Only the DSI and
- DPI controller nodes have multiple clock inputs. These are documented in
- mediatek,dsi.txt and mediatek,dpi.txt, respectively.
- An exception is that the mt8183 mutex is always free running with no clocks property.
-
-Required properties (DMA function blocks):
-- compatible: Should be one of
- "mediatek,<chip>-disp-ovl"
- "mediatek,<chip>-disp-rdma"
- "mediatek,<chip>-disp-wdma"
- the supported chips are mt2701, mt8167 and mt8173.
-- larb: Should contain a phandle pointing to the local arbiter device as defined
- in Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml
-- iommus: Should point to the respective IOMMU block with master port as
- argument, see Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
- for details.
-
-Optional properties (RDMA function blocks):
-- mediatek,rdma-fifo-size: the RDMA FIFO size may differ even within the same
- SoC; add this property to the corresponding RDMA node. The value is the
- maximum size defined in the hardware data sheet, e.g.:
- mediatek,rdma-fifo-size of mt8173-rdma0 is 8K
- mediatek,rdma-fifo-size of mt8183-rdma0 is 5K
- mediatek,rdma-fifo-size of mt8183-rdma1 is 2K
-
-Examples:
-
-mmsys: clock-controller@14000000 {
- compatible = "mediatek,mt8173-mmsys", "syscon";
- reg = <0 0x14000000 0 0x1000>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- #clock-cells = <1>;
-};
-
-ovl0: ovl@1400c000 {
- compatible = "mediatek,mt8173-disp-ovl";
- reg = <0 0x1400c000 0 0x1000>;
- interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_OVL0>;
- iommus = <&iommu M4U_PORT_DISP_OVL0>;
- mediatek,larb = <&larb0>;
-};
-
-ovl1: ovl@1400d000 {
- compatible = "mediatek,mt8173-disp-ovl";
- reg = <0 0x1400d000 0 0x1000>;
- interrupts = <GIC_SPI 181 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_OVL1>;
- iommus = <&iommu M4U_PORT_DISP_OVL1>;
- mediatek,larb = <&larb4>;
-};
-
-rdma0: rdma@1400e000 {
- compatible = "mediatek,mt8173-disp-rdma";
- reg = <0 0x1400e000 0 0x1000>;
- interrupts = <GIC_SPI 182 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_RDMA0>;
- iommus = <&iommu M4U_PORT_DISP_RDMA0>;
- mediatek,larb = <&larb0>;
- mediatek,rdma-fifo-size = <8192>;
-};
-
-rdma1: rdma@1400f000 {
- compatible = "mediatek,mt8173-disp-rdma";
- reg = <0 0x1400f000 0 0x1000>;
- interrupts = <GIC_SPI 183 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_RDMA1>;
- iommus = <&iommu M4U_PORT_DISP_RDMA1>;
- mediatek,larb = <&larb4>;
-};
-
-rdma2: rdma@14010000 {
- compatible = "mediatek,mt8173-disp-rdma";
- reg = <0 0x14010000 0 0x1000>;
- interrupts = <GIC_SPI 184 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_RDMA2>;
- iommus = <&iommu M4U_PORT_DISP_RDMA2>;
- mediatek,larb = <&larb4>;
-};
-
-wdma0: wdma@14011000 {
- compatible = "mediatek,mt8173-disp-wdma";
- reg = <0 0x14011000 0 0x1000>;
- interrupts = <GIC_SPI 185 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_WDMA0>;
- iommus = <&iommu M4U_PORT_DISP_WDMA0>;
- mediatek,larb = <&larb0>;
-};
-
-wdma1: wdma@14012000 {
- compatible = "mediatek,mt8173-disp-wdma";
- reg = <0 0x14012000 0 0x1000>;
- interrupts = <GIC_SPI 186 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_WDMA1>;
- iommus = <&iommu M4U_PORT_DISP_WDMA1>;
- mediatek,larb = <&larb4>;
-};
-
-color0: color@14013000 {
- compatible = "mediatek,mt8173-disp-color";
- reg = <0 0x14013000 0 0x1000>;
- interrupts = <GIC_SPI 187 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_COLOR0>;
-};
-
-color1: color@14014000 {
- compatible = "mediatek,mt8173-disp-color";
- reg = <0 0x14014000 0 0x1000>;
- interrupts = <GIC_SPI 188 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_COLOR1>;
-};
-
-aal@14015000 {
- compatible = "mediatek,mt8173-disp-aal";
- reg = <0 0x14015000 0 0x1000>;
- interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_AAL>;
-};
-
-gamma@14016000 {
- compatible = "mediatek,mt8173-disp-gamma";
- reg = <0 0x14016000 0 0x1000>;
- interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_GAMMA>;
-};
-
-ufoe@1401a000 {
- compatible = "mediatek,mt8173-disp-ufoe";
- reg = <0 0x1401a000 0 0x1000>;
- interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_UFOE>;
-};
-
-dsi0: dsi@1401b000 {
- /* See mediatek,dsi.txt for details */
-};
-
-dpi0: dpi@1401d000 {
- /* See mediatek,dpi.txt for details */
-};
-
-mutex: mutex@14020000 {
- compatible = "mediatek,mt8173-disp-mutex";
- reg = <0 0x14020000 0 0x1000>;
- interrupts = <GIC_SPI 169 IRQ_TYPE_LEVEL_LOW>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_MUTEX_32K>;
-};
-
-od@14023000 {
- compatible = "mediatek,mt8173-disp-od";
- reg = <0 0x14023000 0 0x1000>;
- power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
- clocks = <&mmsys CLK_MM_DISP_OD>;
-};
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml
new file mode 100644
index 000000000000..1588b3f7cec7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,dither.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display dither processor
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display dither processor, namely DITHER, works by approximating
+ unavailable colors with available colors and by mixing and matching available
+ colors to mimic unavailable ones.
+ The DITHER device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8183-disp-dither
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-dither
+ - mediatek,mt8188-disp-dither
+ - mediatek,mt8192-disp-dither
+ - mediatek,mt8195-disp-dither
+ - const: mediatek,mt8183-disp-dither
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: DITHER Clock
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by GCE
+ with 4 arguments defined in this property, i.e. the GCE phandle, subsys
+ id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8183-clk.h>
+ #include <dt-bindings/power/mt8183-power.h>
+ #include <dt-bindings/gce/mt8183-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ dither0: dither@14012000 {
+ compatible = "mediatek,mt8183-disp-dither";
+ reg = <0 0x14012000 0 0x1000>;
+ interrupts = <GIC_SPI 235 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+ clocks = <&mmsys CLK_MM_DISP_DITHER0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x2000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml
new file mode 100644
index 000000000000..ff781f2174a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dp.yaml
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,dp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Display Port Controller
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Jitao shi <jitao.shi@mediatek.com>
+
+description: |
+ MediaTek DP and eDP are different hardware blocks, and some features, such
+ as audio, are not supported on eDP. Therefore, two different compatibles
+ are needed to describe them. In addition, only the power domain of DP needs
+ to be enabled; the DP clock is generated by the block itself and no other
+ PLL is used to generate clocks.
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt8195-dp-tx
+ - mediatek,mt8195-edp-tx
+
+ reg:
+ maxItems: 1
+
+ nvmem-cells:
+ maxItems: 1
+ description: efuse data for display port calibration
+
+ nvmem-cell-names:
+ const: dp_calibration_data
+
+ power-domains:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Input endpoint of the controller, usually dp_intf
+
+ port@1:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: Output endpoint of the controller
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+ properties:
+ data-lanes:
+ description: |
+ number of lanes supported by the hardware.
+ The possible values:
+ 0 - for 1 lane enabled in the IP.
+ 0 1 - for 2 lanes enabled in the IP.
+ 0 1 2 3 - for 4 lanes enabled in the IP.
+ minItems: 1
+ maxItems: 4
+ required:
+ - data-lanes
+
+ required:
+ - port@0
+ - port@1
+
+ max-linkrate-mhz:
+ enum: [ 1620, 2700, 5400, 8100 ]
+ description: maximum link rate supported by the hardware.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - ports
+ - max-linkrate-mhz
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/mt8195-power.h>
+ dptx@1c600000 {
+ compatible = "mediatek,mt8195-dp-tx";
+ reg = <0x1c600000 0x8000>;
+ power-domains = <&spm MT8195_POWER_DOMAIN_DP_TX>;
+ interrupts = <GIC_SPI 458 IRQ_TYPE_LEVEL_HIGH 0>;
+ max-linkrate-mhz = <8100>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dptx_in: endpoint {
+ remote-endpoint = <&dp_intf0_out>;
+ };
+ };
+ port@1 {
+ reg = <1>;
+ dptx_out: endpoint {
+ data-lanes = <0 1 2 3>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
index dd2896a40ff0..d976380801e3 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
@@ -4,16 +4,16 @@
$id: http://devicetree.org/schemas/display/mediatek/mediatek,dpi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: mediatek DPI Controller Device Tree Bindings
+title: MediaTek DPI and DP_INTF Controller
maintainers:
- CK Hu <ck.hu@mediatek.com>
- Jitao shi <jitao.shi@mediatek.com>
description: |
- The Mediatek DPI function block is a sink of the display subsystem and
- provides 8-bit RGB/YUV444 or 8/10/10-bit YUV422 pixel data on a parallel
- output bus.
+ The MediaTek DPI and DP_INTF function blocks are sinks of the display
+ subsystem and provide 8-bit RGB/YUV444 or 8/10/10-bit YUV422 pixel data on
+ a parallel output bus.
properties:
compatible:
@@ -22,7 +22,10 @@ properties:
- mediatek,mt7623-dpi
- mediatek,mt8173-dpi
- mediatek,mt8183-dpi
+ - mediatek,mt8186-dpi
+ - mediatek,mt8188-dp-intf
- mediatek,mt8192-dpi
+ - mediatek,mt8195-dp-intf
reg:
maxItems: 1
@@ -54,7 +57,7 @@ properties:
$ref: /schemas/graph.yaml#/properties/port
description:
Output port node. This port should be connected to the input port of an
- attached HDMI or LVDS encoder chip.
+ attached HDMI, LVDS or DisplayPort encoder chip.
required:
- compatible
@@ -70,8 +73,7 @@ examples:
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/clock/mt8173-clk.h>
- #include <dt-bindings/interrupt-controller/arm-gic.h>
- #include <dt-bindings/interrupt-controller/irq.h>
+
dpi0: dpi@1401d000 {
compatible = "mediatek,mt8173-dpi";
reg = <0x1401d000 0x1000>;
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml
new file mode 100644
index 000000000000..2cbdd9ee449d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,dsc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek display DSC controller
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ The DSC standard is a specification of the algorithms used for
+ compressing and decompressing image display streams, including
+ the specification of the syntax and semantics of the compressed
+ video bit stream. DSC is designed for real-time systems with
+ real-time compression, transmission, decompression and display.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8195-disp-dsc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: DSC Wrapper Clock
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ mediatek,gce-client-reg:
+ description:
+ The register of the client driver can be configured by GCE with 4
+ arguments defined in this property, i.e. the GCE phandle, subsys id,
+ register offset and size.
+ Each subsys id maps to a base address of the display function block
+ registers, as defined in the GCE header
+ include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8195-clk.h>
+ #include <dt-bindings/power/mt8195-power.h>
+ #include <dt-bindings/gce/mt8195-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ dsc0: disp_dsc_wrap@1c009000 {
+ compatible = "mediatek,mt8195-disp-dsc";
+ reg = <0 0x1c009000 0 0x1000>;
+ interrupts = <GIC_SPI 645 IRQ_TYPE_LEVEL_HIGH 0>;
+ power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS0>;
+ clocks = <&vdosys0 CLK_VDO0_DSC_WRAP0>;
+ mediatek,gce-client-reg = <&gce1 SUBSYS_1c00XXXX 0x9000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.txt b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.txt
deleted file mode 100644
index d30428b9fb33..000000000000
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Mediatek DSI Device
-===================
-
-The Mediatek DSI function block is a sink of the display subsystem and can
-drive up to 4-lane MIPI DSI output. Two DSIs can be synchronized for dual-
-channel output.
-
-Required properties:
-- compatible: "mediatek,<chip>-dsi"
-- the supported chips are mt2701, mt7623, mt8167, mt8173 and mt8183.
-- reg: Physical base address and length of the controller's registers
-- interrupts: The interrupt signal from the function block.
-- clocks: device clocks
- See Documentation/devicetree/bindings/clock/clock-bindings.txt for details.
-- clock-names: must contain "engine", "digital", and "hs"
-- phys: phandle link to the MIPI D-PHY controller.
-- phy-names: must contain "dphy"
-- port: Output port node with endpoint definitions as described in
- Documentation/devicetree/bindings/graph.txt. This port should be connected
- to the input port of an attached DSI panel or DSI-to-eDP encoder chip.
-
-MIPI TX Configuration Module
-============================
-
-See phy/mediatek,dsi-phy.yaml
-
-Example:
-
-mipi_tx0: mipi-dphy@10215000 {
- compatible = "mediatek,mt8173-mipi-tx";
- reg = <0 0x10215000 0 0x1000>;
- clocks = <&clk26m>;
- clock-output-names = "mipi_tx0_pll";
- #clock-cells = <0>;
- #phy-cells = <0>;
- drive-strength-microamp = <4600>;
- nvmem-cells= <&mipi_tx_calibration>;
- nvmem-cell-names = "calibration-data";
-};
-
-dsi0: dsi@1401b000 {
- compatible = "mediatek,mt8173-dsi";
- reg = <0 0x1401b000 0 0x1000>;
- interrupts = <GIC_SPI 192 IRQ_TYPE_LEVEL_LOW>;
- clocks = <&mmsys MM_DSI0_ENGINE>, <&mmsys MM_DSI0_DIGITAL>,
- <&mipi_tx0>;
- clock-names = "engine", "digital", "hs";
- phys = <&mipi_tx0>;
- phy-names = "dphy";
-
- port {
- dsi0_out: endpoint {
- remote-endpoint = <&panel_in>;
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.yaml
new file mode 100644
index 000000000000..4707b60238b0
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsi.yaml
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek DSI Controller
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+ - Jitao Shi <jitao.shi@mediatek.com>
+ - Xinlei Lee <xinlei.lee@mediatek.com>
+
+description: |
+ The MediaTek DSI function block is a sink of the display subsystem and can
+ drive up to 4-lane MIPI DSI output. Two DSIs can be synchronized for dual-
+ channel output.
+
+allOf:
+ - $ref: /schemas/display/dsi-controller.yaml#
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt2701-dsi
+ - mediatek,mt7623-dsi
+ - mediatek,mt8167-dsi
+ - mediatek,mt8173-dsi
+ - mediatek,mt8183-dsi
+ - mediatek,mt8186-dsi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Engine Clock
+ - description: Digital Clock
+ - description: HS Clock
+
+ clock-names:
+ items:
+ - const: engine
+ - const: digital
+ - const: hs
+
+ resets:
+ maxItems: 1
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ items:
+ - const: dphy
+
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Output port node. This port should be connected to the input
+ port of an attached DSI panel or DSI-to-eDP encoder chip.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+ - clock-names
+ - phys
+ - phy-names
+ - port
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/mt8183-clk.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/power/mt8183-power.h>
+ #include <dt-bindings/phy/phy.h>
+ #include <dt-bindings/reset/mt8183-resets.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ dsi0: dsi@14014000 {
+ compatible = "mediatek,mt8183-dsi";
+ reg = <0 0x14014000 0 0x1000>;
+ interrupts = <GIC_SPI 236 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+ clocks = <&mmsys CLK_MM_DSI0_MM>,
+ <&mmsys CLK_MM_DSI0_IF>,
+ <&mipi_tx0>;
+ clock-names = "engine", "digital", "hs";
+ resets = <&mmsys MT8183_MMSYS_SW0_RST_B_DISP_DSI0>;
+ phys = <&mipi_tx0>;
+ phy-names = "dphy";
+ port {
+ dsi0_out: endpoint {
+ remote-endpoint = <&panel_in>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml
new file mode 100644
index 000000000000..801fa66ae615
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,ethdr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek Ethdr Device
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description:
+ ETHDR (ET High Dynamic Range) is a MediaTek internal HDR engine and is
+ designed for HDR video and graphics conversion in the external display path.
+ It handles multiple HDR input types, performs tone mapping and color
+ space/color format conversion, then combines the different layers and
+ outputs the required HDR or SDR signal to the subsequent display path.
+ This engine is composed of two video frontends, two graphic frontends,
+ one video backend and a mixer. ETHDR has two DMA function blocks, DS and ADL.
+ These two function blocks read the pre-programmed registers from DRAM and
+ apply them to the hardware during the v-blanking period.
+
+properties:
+ compatible:
+ const: mediatek,mt8195-disp-ethdr
+
+ reg:
+ maxItems: 7
+
+ reg-names:
+ items:
+ - const: mixer
+ - const: vdo_fe0
+ - const: vdo_fe1
+ - const: gfx_fe0
+ - const: gfx_fe1
+ - const: vdo_be
+ - const: adl_ds
+
+ interrupts:
+ maxItems: 1
+
+ iommus:
+ minItems: 1
+ maxItems: 2
+
+ clocks:
+ items:
+ - description: mixer clock
+ - description: video frontend 0 clock
+ - description: video frontend 1 clock
+ - description: graphic frontend 0 clock
+ - description: graphic frontend 1 clock
+ - description: video backend clock
+ - description: autodownload and menuload clock
+ - description: video frontend 0 async clock
+ - description: video frontend 1 async clock
+ - description: graphic frontend 0 async clock
+ - description: graphic frontend 1 async clock
+ - description: video backend async clock
+ - description: ethdr top clock
+
+ clock-names:
+ items:
+ - const: mixer
+ - const: vdo_fe0
+ - const: vdo_fe1
+ - const: gfx_fe0
+ - const: gfx_fe1
+ - const: vdo_be
+ - const: adl_ds
+ - const: vdo_fe0_async
+ - const: vdo_fe1_async
+ - const: gfx_fe0_async
+ - const: gfx_fe1_async
+ - const: vdo_be_async
+ - const: ethdr_top
+
+ power-domains:
+ maxItems: 1
+
+ resets:
+ items:
+ - description: video frontend 0 async reset
+ - description: video frontend 1 async reset
+ - description: graphic frontend 0 async reset
+ - description: graphic frontend 1 async reset
+ - description: video backend async reset
+
+ reset-names:
+ items:
+ - const: vdo_fe0_async
+ - const: vdo_fe1_async
+ - const: gfx_fe0_async
+ - const: gfx_fe1_async
+ - const: vdo_be_async
+
+ mediatek,gce-client-reg:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ minItems: 1
+ maxItems: 7
+ description: The register of the display function block to be set by gce.
+ There are 4 arguments in this property: gce node, subsys id, offset and
+ register size. The subsys id is defined in each chip's gce header,
+ include/dt-bindings/gce/<chip>-gce.h, and maps to the register of the
+ display function block.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - interrupts
+ - power-domains
+ - resets
+ - mediatek,gce-client-reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8195-clk.h>
+ #include <dt-bindings/gce/mt8195-gce.h>
+ #include <dt-bindings/memory/mt8195-memory-port.h>
+ #include <dt-bindings/power/mt8195-power.h>
+ #include <dt-bindings/reset/mt8195-resets.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ hdr-engine@1c114000 {
+ compatible = "mediatek,mt8195-disp-ethdr";
+ reg = <0 0x1c114000 0 0x1000>,
+ <0 0x1c115000 0 0x1000>,
+ <0 0x1c117000 0 0x1000>,
+ <0 0x1c119000 0 0x1000>,
+ <0 0x1c11a000 0 0x1000>,
+ <0 0x1c11b000 0 0x1000>,
+ <0 0x1c11c000 0 0x1000>;
+ reg-names = "mixer", "vdo_fe0", "vdo_fe1", "gfx_fe0", "gfx_fe1",
+ "vdo_be", "adl_ds";
+ mediatek,gce-client-reg = <&gce0 SUBSYS_1c11XXXX 0x4000 0x1000>,
+ <&gce0 SUBSYS_1c11XXXX 0x5000 0x1000>,
+ <&gce0 SUBSYS_1c11XXXX 0x7000 0x1000>,
+ <&gce0 SUBSYS_1c11XXXX 0x9000 0x1000>,
+ <&gce0 SUBSYS_1c11XXXX 0xa000 0x1000>,
+ <&gce0 SUBSYS_1c11XXXX 0xb000 0x1000>,
+ <&gce0 SUBSYS_1c11XXXX 0xc000 0x1000>;
+ clocks = <&vdosys1 CLK_VDO1_DISP_MIXER>,
+ <&vdosys1 CLK_VDO1_HDR_VDO_FE0>,
+ <&vdosys1 CLK_VDO1_HDR_VDO_FE1>,
+ <&vdosys1 CLK_VDO1_HDR_GFX_FE0>,
+ <&vdosys1 CLK_VDO1_HDR_GFX_FE1>,
+ <&vdosys1 CLK_VDO1_HDR_VDO_BE>,
+ <&vdosys1 CLK_VDO1_26M_SLOW>,
+ <&vdosys1 CLK_VDO1_HDR_VDO_FE0_DL_ASYNC>,
+ <&vdosys1 CLK_VDO1_HDR_VDO_FE1_DL_ASYNC>,
+ <&vdosys1 CLK_VDO1_HDR_GFX_FE0_DL_ASYNC>,
+ <&vdosys1 CLK_VDO1_HDR_GFX_FE1_DL_ASYNC>,
+ <&vdosys1 CLK_VDO1_HDR_VDO_BE_DL_ASYNC>,
+ <&topckgen CLK_TOP_ETHDR>;
+ clock-names = "mixer", "vdo_fe0", "vdo_fe1", "gfx_fe0", "gfx_fe1",
+ "vdo_be", "adl_ds", "vdo_fe0_async", "vdo_fe1_async",
+ "gfx_fe0_async", "gfx_fe1_async","vdo_be_async",
+ "ethdr_top";
+ power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>;
+ iommus = <&iommu_vpp M4U_PORT_L3_HDR_DS>,
+ <&iommu_vpp M4U_PORT_L3_HDR_ADL>;
+ interrupts = <GIC_SPI 517 IRQ_TYPE_LEVEL_HIGH 0>; /* disp mixer */
+ resets = <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_VDO_FE0_DL_ASYNC>,
+ <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_VDO_FE1_DL_ASYNC>,
+ <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_GFX_FE0_DL_ASYNC>,
+ <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_GFX_FE1_DL_ASYNC>,
+ <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_VDO_BE_DL_ASYNC>;
+ reset-names = "vdo_fe0_async", "vdo_fe1_async", "gfx_fe0_async",
+ "gfx_fe1_async", "vdo_be_async";
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml
new file mode 100644
index 000000000000..6c2be9d6840b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,gamma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display gamma correction
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display gamma correction, namely GAMMA, provides a nonlinear
+ operation used to adjust luminance in the display system.
+ The GAMMA device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8173-disp-gamma
+ - mediatek,mt8183-disp-gamma
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-gamma
+ - mediatek,mt8188-disp-gamma
+ - mediatek,mt8192-disp-gamma
+ - mediatek,mt8195-disp-gamma
+ - const: mediatek,mt8183-disp-gamma
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: GAMMA Clock
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by gce
+ with the 4 arguments defined in this property: phandle of the gce,
+ subsys id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ gamma@14016000 {
+ compatible = "mediatek,mt8173-disp-gamma";
+ reg = <0 0x14016000 0 0x1000>;
+ interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_GAMMA>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x6000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi-ddc.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi-ddc.yaml
index b6fcdfb99ab2..bd8f7b8ae0ff 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi-ddc.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi-ddc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/mediatek/mediatek,hdmi-ddc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Mediatek HDMI DDC Device Tree Bindings
+title: Mediatek HDMI DDC
maintainers:
- CK Hu <ck.hu@mediatek.com>
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi.yaml
index 111967efa999..b90b6d18a828 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,hdmi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/mediatek/mediatek,hdmi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Mediatek HDMI Encoder Device Tree Bindings
+title: Mediatek HDMI Encoder
maintainers:
- CK Hu <ck.hu@mediatek.com>
@@ -50,8 +50,11 @@ properties:
- const: hdmi
mediatek,syscon-hdmi:
- $ref: '/schemas/types.yaml#/definitions/phandle-array'
- maxItems: 1
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ - items:
+ - description: phandle to system configuration registers
+ - description: register offset in the system configuration registers
description: |
phandle link and register offset to the system configuration registers.
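The two-cell mediatek,syscon-hdmi property above (phandle plus register offset)
can be read with the standard syscon and OF helpers; a sketch assuming nothing
beyond those helpers:

    #include <linux/err.h>
    #include <linux/mfd/syscon.h>
    #include <linux/of.h>
    #include <linux/regmap.h>

    static int get_hdmi_syscon(struct device_node *np,
                               struct regmap **map, u32 *offset)
    {
            /* cell 0: phandle to the system configuration registers */
            *map = syscon_regmap_lookup_by_phandle(np, "mediatek,syscon-hdmi");
            if (IS_ERR(*map))
                    return PTR_ERR(*map);

            /* cell 1: register offset within that syscon */
            return of_property_read_u32_index(np, "mediatek,syscon-hdmi",
                                              1, offset);
    }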
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,mdp-rdma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,mdp-rdma.yaml
new file mode 100644
index 000000000000..dd12e2ff685c
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,mdp-rdma.yaml
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,mdp-rdma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek MDP RDMA
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description:
+ MediaTek MDP RDMA stands for MDP Read Direct Memory Access; it reads
+ pixel data from memory and provides real-time data to the back-end
+ panel driver, such as DSI, DPI and DP_INTF.
+ It contains one line buffer to store sufficient pixel data.
+ The RDMA device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml for details.
+
+properties:
+ compatible:
+ const: mediatek,mt8195-vdo1-rdma
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: RDMA Clock
+
+ iommus:
+ maxItems: 1
+
+ mediatek,gce-client-reg:
+ description:
+ The register of the display function block to be set by gce. There are
+ 4 arguments: gce node, subsys id, offset and register size. The subsys
+ id, which maps to the register of the display function block, is defined
+ in each chip's gce header, include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ items:
+ - description: phandle of GCE
+ - description: GCE subsys id
+ - description: register offset
+ - description: register size
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - power-domains
+ - clocks
+ - iommus
+ - mediatek,gce-client-reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8195-clk.h>
+ #include <dt-bindings/power/mt8195-power.h>
+ #include <dt-bindings/gce/mt8195-gce.h>
+ #include <dt-bindings/memory/mt8195-memory-port.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ rdma@1c104000 {
+ compatible = "mediatek,mt8195-vdo1-rdma";
+ reg = <0 0x1c104000 0 0x1000>;
+ interrupts = <GIC_SPI 495 IRQ_TYPE_LEVEL_HIGH 0>;
+ clocks = <&vdosys1 CLK_VDO1_MDP_RDMA0>;
+ power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>;
+ iommus = <&iommu_vdo M4U_PORT_L2_MDP_RDMA0>;
+ mediatek,gce-client-reg = <&gce0 SUBSYS_1c10XXXX 0x4000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml
new file mode 100644
index 000000000000..2f8e2f4dc3b8
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,merge.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display merge
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display merge, namely MERGE, is used to merge two slice-per-line
+ inputs into one side-by-side output.
+ The MERGE device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8173-disp-merge
+ - mediatek,mt8195-disp-merge
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ oneOf:
+ - items:
+ - const: merge
+ - items:
+ - const: merge
+ - const: merge_async
+
+ mediatek,merge-fifo-en:
+ description:
+ The merge fifo setting mainly provides a display latency buffer to
+ ensure that the back-end panel display data does not underrun; a
+ little more data is kept in the fifo for this purpose.
+ According to the merge fifo settings, when the water level is detected
+ to be insufficient, RDMA is triggered to send ultra and preultra
+ commands to SMI to speed up the data rate.
+ type: boolean
+
+ mediatek,merge-mute:
+ description: Enable the mute function, which mutes the content of the merge output.
+ type: boolean
+
+ mediatek,gce-client-reg:
+ description: The register of client driver can be configured by gce with
+ 4 arguments defined in this property, such as phandle of gce, subsys id,
+ register offset and size. Each GCE subsys id is mapping to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+ resets:
+ description: Reset controller.
+ See Documentation/devicetree/bindings/reset/reset.txt for details.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ merge@14017000 {
+ compatible = "mediatek,mt8173-disp-merge";
+ reg = <0 0x14017000 0 0x1000>;
+ power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_MERGE>;
+ clock-names = "merge";
+ };
+ };
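The example above shows only the single-clock mt8173 form. A sketch of an
mt8195 node exercising the two-clock variant and the fifo flag (node address
and clock macro names are illustrative):

    merge5: merge@1c110000 {
            compatible = "mediatek,mt8195-disp-merge";
            reg = <0 0x1c110000 0 0x1000>;
            power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>;
            clocks = <&vdosys1 CLK_VDO1_VPP_MERGE0>,
                     <&vdosys1 CLK_VDO1_MERGE0_DL_ASYNC>;
            clock-names = "merge", "merge_async";
            mediatek,merge-fifo-en;
    };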
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml
new file mode 100644
index 000000000000..29f9fa8f8219
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,od.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display overdrive
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display overdrive, namely OD, increases the transition values
+ of pixels between consecutive frames to make the LCD transition faster.
+ The OD device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt2712-disp-od
+ - mediatek,mt8173-disp-od
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: OD Clock
+
+required:
+ - compatible
+ - reg
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/mt8173-clk.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ od@14023000 {
+ compatible = "mediatek,mt8173-disp-od";
+ reg = <0 0x14023000 0 0x1000>;
+ clocks = <&mmsys CLK_MM_DISP_OD>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml
new file mode 100644
index 000000000000..c7dd0ef02dcf
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,ovl-2l.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display overlay 2 layer
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display overlay 2 layer, namely OVL-2L, provides 2 more layers
+ for OVL.
+ The OVL-2L device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8183-disp-ovl-2l
+ - mediatek,mt8192-disp-ovl-2l
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-ovl-2l
+ - const: mediatek,mt8192-disp-ovl-2l
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: OVL-2L Clock
+
+ iommus:
+ description:
+ This property should point to the respective IOMMU block with master port as argument,
+ see Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml for details.
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by gce
+ with the 4 arguments defined in this property: phandle of the gce,
+ subsys id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+ - iommus
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8183-clk.h>
+ #include <dt-bindings/power/mt8183-power.h>
+ #include <dt-bindings/gce/mt8183-gce.h>
+ #include <dt-bindings/memory/mt8183-larb-port.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ ovl_2l0: ovl@14009000 {
+ compatible = "mediatek,mt8183-disp-ovl-2l";
+ reg = <0 0x14009000 0 0x1000>;
+ interrupts = <GIC_SPI 226 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+ clocks = <&mmsys CLK_MM_DISP_OVL0_2L>;
+ iommus = <&iommu M4U_PORT_DISP_2L_OVL0_LARB0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0x9000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
new file mode 100644
index 000000000000..92e320d54ba2
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,ovl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display overlay
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display overlay, namely OVL, can do alpha blending of layers
+ fetched from memory.
+ The OVL device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt2701-disp-ovl
+ - mediatek,mt8173-disp-ovl
+ - mediatek,mt8183-disp-ovl
+ - mediatek,mt8192-disp-ovl
+ - items:
+ - enum:
+ - mediatek,mt7623-disp-ovl
+ - mediatek,mt2712-disp-ovl
+ - const: mediatek,mt2701-disp-ovl
+ - items:
+ - enum:
+ - mediatek,mt8188-disp-ovl
+ - mediatek,mt8195-disp-ovl
+ - const: mediatek,mt8183-disp-ovl
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-ovl
+ - const: mediatek,mt8192-disp-ovl
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: OVL Clock
+
+ iommus:
+ description:
+ This property should point to the respective IOMMU block with master port as argument,
+ see Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml for details.
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by gce
+ with the 4 arguments defined in this property: phandle of the gce,
+ subsys id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+ - iommus
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+ #include <dt-bindings/memory/mt8173-larb-port.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ ovl0: ovl@1400c000 {
+ compatible = "mediatek,mt8173-disp-ovl";
+ reg = <0 0x1400c000 0 0x1000>;
+ interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_OVL0>;
+ iommus = <&iommu M4U_PORT_DISP_OVL0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xc000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml
new file mode 100644
index 000000000000..11fe32e50a59
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,postmask.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display postmask
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display postmask, namely POSTMASK, provides rounded-corner
+ pattern generation.
+ The POSTMASK device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8192-disp-postmask
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-postmask
+ - mediatek,mt8188-disp-postmask
+ - const: mediatek,mt8192-disp-postmask
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: POSTMASK Clock
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by gce
+ with the 4 arguments defined in this property: phandle of the gce,
+ subsys id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8192-clk.h>
+ #include <dt-bindings/power/mt8192-power.h>
+ #include <dt-bindings/gce/mt8192-gce.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ postmask0: postmask@1400d000 {
+ compatible = "mediatek,mt8192-disp-postmask";
+ reg = <0 0x1400d000 0 0x1000>;
+ interrupts = <GIC_SPI 262 IRQ_TYPE_LEVEL_HIGH 0>;
+ power-domains = <&scpsys MT8192_POWER_DOMAIN_DISP>;
+ clocks = <&mmsys CLK_MM_DISP_POSTMASK0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xd000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml
new file mode 100644
index 000000000000..42059efad45d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,rdma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek Read Direct Memory Access
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ The Mediatek Read Direct Memory Access (RDMA) component is used to read
+ pixel data from memory. It provides real-time data to the back-end panel
+ driver, such as DSI, DPI and DP_INTF.
+ It contains one line buffer to store sufficient pixel data.
+ The RDMA device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt2701-disp-rdma
+ - mediatek,mt8173-disp-rdma
+ - mediatek,mt8183-disp-rdma
+ - mediatek,mt8195-disp-rdma
+ - items:
+ - enum:
+ - mediatek,mt8188-disp-rdma
+ - const: mediatek,mt8195-disp-rdma
+ - items:
+ - enum:
+ - mediatek,mt7623-disp-rdma
+ - mediatek,mt2712-disp-rdma
+ - const: mediatek,mt2701-disp-rdma
+ - items:
+ - enum:
+ - mediatek,mt8186-disp-rdma
+ - mediatek,mt8192-disp-rdma
+ - const: mediatek,mt8183-disp-rdma
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: RDMA Clock
+
+ iommus:
+ description:
+ This property should point to the respective IOMMU block with master port as argument,
+ see Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml for details.
+
+ mediatek,rdma-fifo-size:
+ description:
+ The rdma fifo size may differ between instances even on the same SoC;
+ add this property to the corresponding rdma node.
+ The values below are the maximum values defined in the hardware data sheet:
+ mediatek,rdma-fifo-size of mt8173-rdma0 is 8K
+ mediatek,rdma-fifo-size of mt8183-rdma0 is 5K
+ mediatek,rdma-fifo-size of mt8183-rdma1 is 2K
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [8192, 5120, 2048]
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by gce
+ with the 4 arguments defined in this property: phandle of the gce,
+ subsys id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+ - iommus
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+ #include <dt-bindings/memory/mt8173-larb-port.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ rdma0: rdma@1400e000 {
+ compatible = "mediatek,mt8173-disp-rdma";
+ reg = <0 0x1400e000 0 0x1000>;
+ interrupts = <GIC_SPI 182 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_RDMA0>;
+ iommus = <&iommu M4U_PORT_DISP_RDMA0>;
+ mediatek,rdma-fifo-size = <8192>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xe000 0x1000>;
+ };
+ };
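Since mediatek,rdma-fifo-size is not in the required list, a driver would
typically read it with of_property_read_u32 and fall back to a default when the
property is absent; a sketch (the fallback value is illustrative):

    u32 fifo_size;

    if (of_property_read_u32(node, "mediatek,rdma-fifo-size", &fifo_size))
            fifo_size = SZ_8K;    /* illustrative fallback */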
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
new file mode 100644
index 000000000000..21a4e96ecd93
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,split.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display split
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display split, namely SPLIT, is used to split a stream to two
+ encoders.
+ The SPLIT device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8173-disp-split
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: SPLIT Clock
+
+required:
+ - compatible
+ - reg
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ split0: split@14018000 {
+ compatible = "mediatek,mt8173-disp-split";
+ reg = <0 0x14018000 0 0x1000>;
+ power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_SPLIT0>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml
new file mode 100644
index 000000000000..62fad23a26f5
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,ufoe.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek display UFOe
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ Mediatek display UFOe stands for Unified Frame Optimization engine.
+ UFOe can cut the data rate of the DSI port, which may reduce power
+ consumption.
+ The UFOe device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8173-disp-ufoe
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: UFOe Clock
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ ufoe@1401a000 {
+ compatible = "mediatek,mt8173-disp-ufoe";
+ reg = <0 0x1401a000 0 0x1000>;
+ interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_UFOE>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml
new file mode 100644
index 000000000000..991183165d29
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/mediatek/mediatek,wdma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek Write Direct Memory Access
+
+maintainers:
+ - Chun-Kuang Hu <chunkuang.hu@kernel.org>
+ - Philipp Zabel <p.zabel@pengutronix.de>
+
+description: |
+ The Mediatek Write Direct Memory Access (WDMA) component is used to
+ write pixel data back into memory.
+ The WDMA device node must be a sibling of the central MMSYS_CONFIG node.
+ For a description of the MMSYS_CONFIG binding, see
+ Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.yaml
+ for details.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - mediatek,mt8173-disp-wdma
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ description: A phandle and PM domain specifier as defined by bindings of
+ the power controller specified by phandle. See
+ Documentation/devicetree/bindings/power/power-domain.yaml for details.
+
+ clocks:
+ items:
+ - description: WDMA Clock
+
+ iommus:
+ description:
+ This property should point to the respective IOMMU block with master port as argument,
+ see Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml for details.
+
+ mediatek,gce-client-reg:
+ description: The register of the client driver can be configured by gce
+ with the 4 arguments defined in this property: phandle of the gce,
+ subsys id, register offset and size. Each GCE subsys id maps to a client
+ defined in the header include/dt-bindings/gce/<chip>-gce.h.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - power-domains
+ - clocks
+ - iommus
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt8173-clk.h>
+ #include <dt-bindings/power/mt8173-power.h>
+ #include <dt-bindings/gce/mt8173-gce.h>
+ #include <dt-bindings/memory/mt8173-larb-port.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ wdma0: wdma@14011000 {
+ compatible = "mediatek,mt8173-disp-wdma";
+ reg = <0 0x14011000 0 0x1000>;
+ interrupts = <GIC_SPI 185 IRQ_TYPE_LEVEL_LOW>;
+ power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+ clocks = <&mmsys CLK_MM_DISP_WDMA0>;
+ iommus = <&iommu M4U_PORT_DISP_WDMA0>;
+ mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x1000 0x1000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/msm/dp-controller.yaml b/Documentation/devicetree/bindings/display/msm/dp-controller.yaml
index 64d8d9e5e47a..f0c2237d5f82 100644
--- a/Documentation/devicetree/bindings/display/msm/dp-controller.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dp-controller.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: MSM Display Port Controller
maintainers:
- - Kuogee Hsieh <khsieh@codeaurora.org>
+ - Kuogee Hsieh <quic_khsieh@quicinc.com>
description: |
Device tree bindings for DisplayPort host controller for MSM targets
@@ -15,11 +15,30 @@ description: |
properties:
compatible:
- enum:
- - qcom,sc7180-dp
+ oneOf:
+ - enum:
+ - qcom,sc7180-dp
+ - qcom,sc7280-dp
+ - qcom,sc7280-edp
+ - qcom,sc8180x-dp
+ - qcom,sc8180x-edp
+ - qcom,sc8280xp-dp
+ - qcom,sc8280xp-edp
+ - qcom,sdm845-dp
+ - qcom,sm8350-dp
+ - items:
+ - enum:
+ - qcom,sm8450-dp
+ - const: qcom,sm8350-dp
reg:
- maxItems: 1
+ minItems: 4
+ items:
+ - description: ahb register block
+ - description: aux register block
+ - description: link register block
+ - description: p0 register block
+ - description: p1 register block
interrupts:
maxItems: 1
@@ -57,15 +76,32 @@ properties:
items:
- const: dp
- operating-points-v2:
- maxItems: 1
+ operating-points-v2: true
+
+ opp-table: true
power-domains:
maxItems: 1
+ aux-bus:
+ $ref: /schemas/display/dp-aux-bus.yaml#
+
+ data-lanes:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ deprecated: true
+ minItems: 1
+ maxItems: 4
+ items:
+ maximum: 3
+
"#sound-dai-cells":
const: 0
+ vdda-0p9-supply:
+ deprecated: true
+ vdda-1p2-supply:
+ deprecated: true
+
ports:
$ref: /schemas/graph.yaml#/properties/ports
properties:
@@ -74,8 +110,28 @@ properties:
description: Input endpoint of the controller
port@1:
- $ref: /schemas/graph.yaml#/properties/port
+ $ref: /schemas/graph.yaml#/$defs/port-base
description: Output endpoint of the controller
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+ properties:
+ data-lanes:
+ minItems: 1
+ maxItems: 4
+ items:
+ enum: [ 0, 1, 2, 3 ]
+
+ link-frequencies:
+ minItems: 1
+ maxItems: 4
+ items:
+ enum: [ 1620000000, 2700000000, 5400000000, 8100000000 ]
+
+ required:
+ - port@0
+ - port@1
required:
- compatible
@@ -85,22 +141,47 @@ required:
- clock-names
- phys
- phy-names
- - "#sound-dai-cells"
- power-domains
- ports
+allOf:
+ # The AUX bus does not exist on DP controllers.
+ # Audio output is present only on DP outputs.
+ # The p1 region is present on DP, but not on eDP.
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7280-edp
+ - qcom,sc8180x-edp
+ - qcom,sc8280xp-edp
+ then:
+ properties:
+ "#sound-dai-cells": false
+ else:
+ properties:
+ aux-bus: false
+ reg:
+ minItems: 5
+ required:
+ - "#sound-dai-cells"
+
additionalProperties: false
examples:
- |
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/clock/qcom,dispcc-sc7180.h>
- #include <dt-bindings/power/qcom-aoss-qmp.h>
#include <dt-bindings/power/qcom-rpmpd.h>
displayport-controller@ae90000 {
compatible = "qcom,sc7180-dp";
- reg = <0xae90000 0x1400>;
+ reg = <0xae90000 0x200>,
+ <0xae90200 0x200>,
+ <0xae90400 0xc00>,
+ <0xae91000 0x400>,
+ <0xae91400 0x400>;
interrupt-parent = <&mdss>;
interrupts = <12>;
clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
@@ -139,6 +220,8 @@ examples:
reg = <1>;
endpoint {
remote-endpoint = <&typec>;
+ data-lanes = <0 1>;
+ link-frequencies = /bits/ 64 <1620000000 2700000000 5400000000 8100000000>;
};
};
};
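The example above covers the DP case. Under the if/then rules added earlier, an
eDP instance instead has four reg regions (no p1), an aux-bus child and no
#sound-dai-cells. A sketch, with illustrative addresses and with the clocks,
phys and ports that the schema still requires omitted for brevity:

    displayport-controller@aea0000 {
            compatible = "qcom,sc7280-edp";
            reg = <0xaea0000 0x200>,    /* ahb */
                  <0xaea0200 0x200>,    /* aux */
                  <0xaea0400 0xc00>,    /* link */
                  <0xaea1000 0x400>;    /* p0; no p1 region on eDP */

            aux-bus {
                    panel {
                            compatible = "edp-panel";
                    };
            };
    };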
diff --git a/Documentation/devicetree/bindings/display/msm/dpu-common.yaml b/Documentation/devicetree/bindings/display/msm/dpu-common.yaml
new file mode 100644
index 000000000000..3f953aa5e694
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/dpu-common.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/dpu-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU common properties
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+ - Rob Clark <robdclark@gmail.com>
+
+description: |
+ Common properties for QCom DPU display controller.
+
+# Do not select this by default, otherwise it is also selected for all
+# display-controller@ nodes
+select:
+ false
+
+properties:
+ $nodename:
+ pattern: '^display-controller@[0-9a-f]+$'
+
+ interrupts:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ operating-points-v2: true
+ opp-table:
+ type: object
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description: |
+ Contains the list of output ports from the DPU device. These ports
+ connect to interfaces that are external to the DPU hardware,
+ such as DSI, DP, etc.
+
+ patternProperties:
+ "^port@[0-9a-f]+$":
+ $ref: /schemas/graph.yaml#/properties/port
+
+ # at least one port is required
+ required:
+ - port@0
+
+required:
+ - interrupts
+ - power-domains
+ - operating-points-v2
+ - ports
+
+additionalProperties: true
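Because dpu-common.yaml sets select: false, its properties only apply where a
per-SoC schema references it explicitly; a sketch of such a reference (the file
and compatible shown are illustrative):

    # in a per-SoC schema such as dpu-sc7180.yaml
    allOf:
      - $ref: /schemas/display/msm/dpu-common.yaml#

    properties:
      compatible:
        const: qcom,sc7180-dpu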
diff --git a/Documentation/devicetree/bindings/display/msm/dpu-sc7180.yaml b/Documentation/devicetree/bindings/display/msm/dpu-sc7180.yaml
deleted file mode 100644
index 12a86b1ec1bc..000000000000
--- a/Documentation/devicetree/bindings/display/msm/dpu-sc7180.yaml
+++ /dev/null
@@ -1,228 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/display/msm/dpu-sc7180.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Qualcomm Display DPU dt properties for SC7180 target
-
-maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
-
-description: |
- Device tree bindings for MSM Mobile Display Subsystem(MDSS) that encapsulates
- sub-blocks like DPU display controller, DSI and DP interfaces etc. Device tree
- bindings of MDSS and DPU are mentioned for SC7180 target.
-
-properties:
- compatible:
- items:
- - const: qcom,sc7180-mdss
-
- reg:
- maxItems: 1
-
- reg-names:
- const: mdss
-
- power-domains:
- maxItems: 1
-
- clocks:
- items:
- - description: Display AHB clock from gcc
- - description: Display AHB clock from dispcc
- - description: Display core clock
-
- clock-names:
- items:
- - const: iface
- - const: ahb
- - const: core
-
- interrupts:
- maxItems: 1
-
- interrupt-controller: true
-
- "#address-cells": true
-
- "#size-cells": true
-
- "#interrupt-cells":
- const: 1
-
- iommus:
- items:
- - description: Phandle to apps_smmu node with SID mask for Hard-Fail port0
-
- ranges: true
-
- interconnects:
- items:
- - description: Interconnect path specifying the port ids for data bus
-
- interconnect-names:
- const: mdp0-mem
-
-patternProperties:
- "^display-controller@[0-9a-f]+$":
- type: object
- description: Node containing the properties of DPU.
-
- properties:
- compatible:
- items:
- - const: qcom,sc7180-dpu
-
- reg:
- items:
- - description: Address offset and size for mdp register set
- - description: Address offset and size for vbif register set
-
- reg-names:
- items:
- - const: mdp
- - const: vbif
-
- clocks:
- items:
- - description: Display hf axi clock
- - description: Display ahb clock
- - description: Display rotator clock
- - description: Display lut clock
- - description: Display core clock
- - description: Display vsync clock
-
- clock-names:
- items:
- - const: bus
- - const: iface
- - const: rot
- - const: lut
- - const: core
- - const: vsync
-
- interrupts:
- maxItems: 1
-
- power-domains:
- maxItems: 1
-
- operating-points-v2: true
-
- ports:
- $ref: /schemas/graph.yaml#/properties/ports
- description: |
- Contains the list of output ports from DPU device. These ports
- connect to interfaces that are external to the DPU hardware,
- such as DSI, DP etc. Each output port contains an endpoint that
- describes how it is connected to an external interface.
-
- properties:
- port@0:
- $ref: /schemas/graph.yaml#/properties/port
- description: DPU_INTF1 (DSI1)
-
- port@2:
- $ref: /schemas/graph.yaml#/properties/port
- description: DPU_INTF0 (DP)
-
- required:
- - port@0
-
- required:
- - compatible
- - reg
- - reg-names
- - clocks
- - interrupts
- - power-domains
- - operating-points-v2
- - ports
-
-required:
- - compatible
- - reg
- - reg-names
- - power-domains
- - clocks
- - interrupts
- - interrupt-controller
- - iommus
- - ranges
-
-additionalProperties: false
-
-examples:
- - |
- #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
- #include <dt-bindings/clock/qcom,gcc-sc7180.h>
- #include <dt-bindings/interrupt-controller/arm-gic.h>
- #include <dt-bindings/interconnect/qcom,sdm845.h>
- #include <dt-bindings/power/qcom-rpmpd.h>
-
- display-subsystem@ae00000 {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "qcom,sc7180-mdss";
- reg = <0xae00000 0x1000>;
- reg-names = "mdss";
- power-domains = <&dispcc MDSS_GDSC>;
- clocks = <&gcc GCC_DISP_AHB_CLK>,
- <&dispcc DISP_CC_MDSS_AHB_CLK>,
- <&dispcc DISP_CC_MDSS_MDP_CLK>;
- clock-names = "iface", "ahb", "core";
-
- interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-controller;
- #interrupt-cells = <1>;
-
- interconnects = <&mmss_noc MASTER_MDP0 &mc_virt SLAVE_EBI1>;
- interconnect-names = "mdp0-mem";
-
- iommus = <&apps_smmu 0x800 0x2>;
- ranges;
-
- display-controller@ae01000 {
- compatible = "qcom,sc7180-dpu";
- reg = <0x0ae01000 0x8f000>,
- <0x0aeb0000 0x2008>;
-
- reg-names = "mdp", "vbif";
-
- clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
- <&dispcc DISP_CC_MDSS_AHB_CLK>,
- <&dispcc DISP_CC_MDSS_ROT_CLK>,
- <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
- <&dispcc DISP_CC_MDSS_MDP_CLK>,
- <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
- clock-names = "bus", "iface", "rot", "lut", "core",
- "vsync";
-
- interrupt-parent = <&mdss>;
- interrupts = <0>;
- power-domains = <&rpmhpd SC7180_CX>;
- operating-points-v2 = <&mdp_opp_table>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- dpu_intf1_out: endpoint {
- remote-endpoint = <&dsi0_in>;
- };
- };
-
- port@2 {
- reg = <2>;
- dpu_intf0_out: endpoint {
- remote-endpoint = <&dp_in>;
- };
- };
- };
- };
- };
-...
diff --git a/Documentation/devicetree/bindings/display/msm/dpu-sdm845.yaml b/Documentation/devicetree/bindings/display/msm/dpu-sdm845.yaml
deleted file mode 100644
index b4ea7c92fb3d..000000000000
--- a/Documentation/devicetree/bindings/display/msm/dpu-sdm845.yaml
+++ /dev/null
@@ -1,212 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/display/msm/dpu-sdm845.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Qualcomm Display DPU dt properties for SDM845 target
-
-maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
-
-description: |
- Device tree bindings for MSM Mobile Display Subsystem(MDSS) that encapsulates
- sub-blocks like DPU display controller, DSI and DP interfaces etc. Device tree
- bindings of MDSS and DPU are mentioned for SDM845 target.
-
-properties:
- compatible:
- items:
- - const: qcom,sdm845-mdss
-
- reg:
- maxItems: 1
-
- reg-names:
- const: mdss
-
- power-domains:
- maxItems: 1
-
- clocks:
- items:
- - description: Display AHB clock from gcc
- - description: Display AXI clock
- - description: Display core clock
-
- clock-names:
- items:
- - const: iface
- - const: bus
- - const: core
-
- interrupts:
- maxItems: 1
-
- interrupt-controller: true
-
- "#address-cells": true
-
- "#size-cells": true
-
- "#interrupt-cells":
- const: 1
-
- iommus:
- items:
- - description: Phandle to apps_smmu node with SID mask for Hard-Fail port0
- - description: Phandle to apps_smmu node with SID mask for Hard-Fail port1
-
- ranges: true
-
-patternProperties:
- "^display-controller@[0-9a-f]+$":
- type: object
- description: Node containing the properties of DPU.
-
- properties:
- compatible:
- items:
- - const: qcom,sdm845-dpu
-
- reg:
- items:
- - description: Address offset and size for mdp register set
- - description: Address offset and size for vbif register set
-
- reg-names:
- items:
- - const: mdp
- - const: vbif
-
- clocks:
- items:
- - description: Display ahb clock
- - description: Display axi clock
- - description: Display core clock
- - description: Display vsync clock
-
- clock-names:
- items:
- - const: iface
- - const: bus
- - const: core
- - const: vsync
-
- interrupts:
- maxItems: 1
-
- power-domains:
- maxItems: 1
-
- operating-points-v2: true
- ports:
- $ref: /schemas/graph.yaml#/properties/ports
- description: |
- Contains the list of output ports from DPU device. These ports
- connect to interfaces that are external to the DPU hardware,
- such as DSI, DP etc. Each output port contains an endpoint that
- describes how it is connected to an external interface.
-
- properties:
- port@0:
- $ref: /schemas/graph.yaml#/properties/port
- description: DPU_INTF1 (DSI1)
-
- port@1:
- $ref: /schemas/graph.yaml#/properties/port
- description: DPU_INTF2 (DSI2)
-
- required:
- - port@0
- - port@1
-
- required:
- - compatible
- - reg
- - reg-names
- - clocks
- - interrupts
- - power-domains
- - operating-points-v2
- - ports
-
-required:
- - compatible
- - reg
- - reg-names
- - power-domains
- - clocks
- - interrupts
- - interrupt-controller
- - iommus
- - ranges
-
-additionalProperties: false
-
-examples:
- - |
- #include <dt-bindings/clock/qcom,dispcc-sdm845.h>
- #include <dt-bindings/clock/qcom,gcc-sdm845.h>
- #include <dt-bindings/interrupt-controller/arm-gic.h>
- #include <dt-bindings/power/qcom-rpmpd.h>
-
- display-subsystem@ae00000 {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "qcom,sdm845-mdss";
- reg = <0x0ae00000 0x1000>;
- reg-names = "mdss";
- power-domains = <&dispcc MDSS_GDSC>;
-
- clocks = <&gcc GCC_DISP_AHB_CLK>,
- <&gcc GCC_DISP_AXI_CLK>,
- <&dispcc DISP_CC_MDSS_MDP_CLK>;
- clock-names = "iface", "bus", "core";
-
- interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-controller;
- #interrupt-cells = <1>;
-
- iommus = <&apps_smmu 0x880 0x8>,
- <&apps_smmu 0xc80 0x8>;
- ranges;
-
- display-controller@ae01000 {
- compatible = "qcom,sdm845-dpu";
- reg = <0x0ae01000 0x8f000>,
- <0x0aeb0000 0x2008>;
- reg-names = "mdp", "vbif";
-
- clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
- <&dispcc DISP_CC_MDSS_AXI_CLK>,
- <&dispcc DISP_CC_MDSS_MDP_CLK>,
- <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
- clock-names = "iface", "bus", "core", "vsync";
-
- interrupt-parent = <&mdss>;
- interrupts = <0>;
- power-domains = <&rpmhpd SDM845_CX>;
- operating-points-v2 = <&mdp_opp_table>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- dpu_intf1_out: endpoint {
- remote-endpoint = <&dsi0_in>;
- };
- };
-
- port@1 {
- reg = <1>;
- dpu_intf2_out: endpoint {
- remote-endpoint = <&dsi1_in>;
- };
- };
- };
- };
- };
-...
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-controller-main.yaml b/Documentation/devicetree/bindings/display/msm/dsi-controller-main.yaml
index 35426fde8610..e6c1ebfe8a32 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-controller-main.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-controller-main.yaml
@@ -7,15 +7,35 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Display DSI controller
maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
-
-allOf:
- - $ref: "../dsi-controller.yaml#"
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
properties:
compatible:
- items:
- - const: qcom,mdss-dsi-ctrl
+ oneOf:
+ - items:
+ - enum:
+ - qcom,apq8064-dsi-ctrl
+ - qcom,msm8916-dsi-ctrl
+ - qcom,msm8953-dsi-ctrl
+ - qcom,msm8974-dsi-ctrl
+ - qcom,msm8996-dsi-ctrl
+ - qcom,msm8998-dsi-ctrl
+ - qcom,qcm2290-dsi-ctrl
+ - qcom,sc7180-dsi-ctrl
+ - qcom,sc7280-dsi-ctrl
+ - qcom,sdm660-dsi-ctrl
+ - qcom,sdm845-dsi-ctrl
+ - qcom,sm6115-dsi-ctrl
+ - qcom,sm8150-dsi-ctrl
+ - qcom,sm8250-dsi-ctrl
+ - qcom,sm8350-dsi-ctrl
+ - qcom,sm8450-dsi-ctrl
+ - qcom,sm8550-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+ - enum:
+ - qcom,dsi-ctrl-6g-qcm2290
+ - qcom,mdss-dsi-ctrl # This should always come with an SoC-specific compatible
+ deprecated: true
reg:
maxItems: 1
@@ -27,36 +47,34 @@ properties:
maxItems: 1
clocks:
- items:
- - description: Display byte clock
- - description: Display byte interface clock
- - description: Display pixel clock
- - description: Display escape clock
- - description: Display AHB clock
- - description: Display AXI clock
+    description: |
+      Several clocks are used, depending on the variant. Typical ones are:
+       - bus: Display AXI clock.
+       - byte: Display byte clock.
+       - byte_intf: Display byte interface clock.
+       - core: Display core clock.
+       - core_mmss: Core MultiMedia SubSystem clock.
+       - iface: Display AHB clock.
+       - mdp_core: MDP Core clock.
+       - mnoc: MNOC clock.
+       - pixel: Display pixel clock.
+ minItems: 3
+ maxItems: 9
clock-names:
- items:
- - const: byte
- - const: byte_intf
- - const: pixel
- - const: core
- - const: iface
- - const: bus
+ minItems: 3
+ maxItems: 9
phys:
maxItems: 1
phy-names:
+ deprecated: true
const: dsi
- "#address-cells": true
-
- "#size-cells": true
-
syscon-sfpb:
description: A phandle to mmss_sfpb syscon node (only for DSIv2).
- $ref: "/schemas/types.yaml#/definitions/phandle"
+ $ref: /schemas/types.yaml#/definitions/phandle
qcom,dual-dsi-mode:
type: boolean
@@ -66,13 +84,15 @@ properties:
assigned-clocks:
minItems: 2
- maxItems: 2
+ maxItems: 4
description: |
Parents of "byte" and "pixel" for the given platform.
+ For DSIv2 platforms this should contain "byte", "esc", "src" and
+ "pixel_src" clocks.
assigned-clock-parents:
minItems: 2
- maxItems: 2
+ maxItems: 4
description: |
The Byte clock and Pixel clock PLL outputs provided by a DSI PHY block.
@@ -81,15 +101,18 @@ properties:
operating-points-v2: true
+ opp-table:
+ type: object
+
ports:
- $ref: "/schemas/graph.yaml#/properties/ports"
+ $ref: /schemas/graph.yaml#/properties/ports
description: |
Contains DSI controller input and output ports as children, each
containing one endpoint subnode.
properties:
port@0:
- $ref: "/schemas/graph.yaml#/$defs/port-base"
+ $ref: /schemas/graph.yaml#/$defs/port-base
unevaluatedProperties: false
description: |
Input endpoints of the controller.
@@ -100,12 +123,12 @@ properties:
properties:
data-lanes:
maxItems: 4
- minItems: 4
+ minItems: 1
items:
enum: [ 0, 1, 2, 3 ]
port@1:
- $ref: "/schemas/graph.yaml#/$defs/port-base"
+ $ref: /schemas/graph.yaml#/$defs/port-base
unevaluatedProperties: false
description: |
Output endpoints of the controller.
@@ -116,7 +139,7 @@ properties:
properties:
data-lanes:
maxItems: 4
- minItems: 4
+ minItems: 1
items:
enum: [ 0, 1, 2, 3 ]
@@ -124,6 +147,26 @@ properties:
- port@0
- port@1
+  avdd-supply:
+    description:
+      AVDD regulator
+
+  vcca-supply:
+    description:
+      VCCA regulator
+
+ vdd-supply:
+ description:
+ VDD regulator
+
+ vddio-supply:
+ description:
+ VDD-IO regulator
+
+ vdda-supply:
+ description:
+ VDDA regulator
+
required:
- compatible
- reg
@@ -132,14 +175,197 @@ required:
- clocks
- clock-names
- phys
- - phy-names
- assigned-clocks
- assigned-clock-parents
- - power-domains
- - operating-points-v2
- ports
-additionalProperties: false
+allOf:
+ - $ref: ../dsi-controller.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,apq8064-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 7
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: core_mmss
+ - const: src
+ - const: byte
+ - const: pixel
+ - const: core
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,msm8916-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 6
+ clock-names:
+ items:
+ - const: mdp_core
+ - const: iface
+ - const: bus
+ - const: byte
+ - const: pixel
+ - const: core
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,msm8953-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 6
+ clock-names:
+ items:
+ - const: mdp_core
+ - const: iface
+ - const: bus
+ - const: byte
+ - const: pixel
+ - const: core
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,msm8974-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 7
+ clock-names:
+ items:
+ - const: mdp_core
+ - const: iface
+ - const: bus
+ - const: byte
+ - const: pixel
+ - const: core
+ - const: core_mmss
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,msm8996-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 7
+ clock-names:
+ items:
+ - const: mdp_core
+ - const: byte
+ - const: iface
+ - const: bus
+ - const: core_mmss
+ - const: pixel
+ - const: core
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,msm8998-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 6
+ clock-names:
+ items:
+ - const: byte
+ - const: byte_intf
+ - const: pixel
+ - const: core
+ - const: iface
+ - const: bus
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sc7180-dsi-ctrl
+ - qcom,sc7280-dsi-ctrl
+ - qcom,sm8150-dsi-ctrl
+ - qcom,sm8250-dsi-ctrl
+ - qcom,sm8350-dsi-ctrl
+ - qcom,sm8450-dsi-ctrl
+ - qcom,sm8550-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 6
+ clock-names:
+ items:
+ - const: byte
+ - const: byte_intf
+ - const: pixel
+ - const: core
+ - const: iface
+ - const: bus
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sdm660-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 9
+ clock-names:
+ items:
+ - const: mdp_core
+ - const: byte
+ - const: byte_intf
+ - const: mnoc
+ - const: iface
+ - const: bus
+ - const: core_mmss
+ - const: pixel
+ - const: core
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,sdm845-dsi-ctrl
+ - qcom,sm6115-dsi-ctrl
+ then:
+ properties:
+ clocks:
+ maxItems: 6
+ clock-names:
+ items:
+ - const: byte
+ - const: byte_intf
+ - const: pixel
+ - const: core
+ - const: iface
+ - const: bus
+
+unevaluatedProperties: false
examples:
- |
@@ -149,7 +375,7 @@ examples:
#include <dt-bindings/power/qcom-rpmpd.h>
dsi@ae94000 {
- compatible = "qcom,mdss-dsi-ctrl";
+ compatible = "qcom,sc7180-dsi-ctrl", "qcom,mdss-dsi-ctrl";
reg = <0x0ae94000 0x400>;
reg-names = "dsi_ctrl";
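
The per-SoC if/then clock rules above are easier to follow next to a concrete node. Below is a minimal sketch of an SDM845 controller under the new compatible scheme, assuming the dispcc provider and DISP_CC_* constants from the example headers; the unit address is illustrative, and interrupts, phys and ports are omitted:

    dsi@ae94000 {
        compatible = "qcom,sdm845-dsi-ctrl", "qcom,mdss-dsi-ctrl";
        reg = <0x0ae94000 0x400>;
        reg-names = "dsi_ctrl";

        /* six clocks, in the exact order the sdm845 branch requires */
        clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
                 <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
                 <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
                 <&dispcc DISP_CC_MDSS_ESC0_CLK>,
                 <&dispcc DISP_CC_MDSS_AHB_CLK>,
                 <&dispcc DISP_CC_MDSS_AXI_CLK>;
        clock-names = "byte", "byte_intf", "pixel", "core", "iface", "bus";

        /* parents of "byte" and "pixel", provided by the DSI PHY */
        assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
                          <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
        assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
    };
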
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml b/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
index 4399715953e1..e6b00d7387ce 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Display DSI 10nm PHY
maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
allOf:
- $ref: dsi-phy-common.yaml#
@@ -35,11 +35,40 @@ properties:
Connected to DSI0_MIPI_DSI_PLL_VDDA0P9 pin for sc7180 target and
connected to VDDA_MIPI_DSI_0_PLL_0P9 pin for sdm845 target
+ qcom,phy-rescode-offset-top:
+ $ref: /schemas/types.yaml#/definitions/int8-array
+ maxItems: 5
+    description:
+      Integer array of offsets for the pull-up legs rescode of all five lanes,
+      used to offset the drive strength upwards from its calibrated value;
+      -32 is the weakest and +31 the strongest setting.
+ items:
+ minimum: -32
+ maximum: 31
+
+ qcom,phy-rescode-offset-bot:
+ $ref: /schemas/types.yaml#/definitions/int8-array
+ maxItems: 5
+    description:
+      Integer array of offsets for the pull-down legs rescode of all five
+      lanes, used to offset the drive strength downwards from its calibrated
+      value; -32 is the weakest and +31 the strongest setting.
+ items:
+ minimum: -32
+ maximum: 31
+
+ qcom,phy-drive-ldo-level:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ The PHY LDO has an amplitude tuning feature to adjust the LDO output
+ for the HSTX drive. Use supported levels (mV) to offset the drive level
+ from the default value.
+ enum: [ 375, 400, 425, 450, 475, 500 ]
+
required:
- compatible
- reg
- reg-names
- - vdds-supply
unevaluatedProperties: false
@@ -64,5 +93,9 @@ examples:
clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
<&rpmhcc RPMH_CXO_CLK>;
clock-names = "iface", "ref";
+
+ qcom,phy-rescode-offset-top = /bits/ 8 <0 0 0 0 0>;
+ qcom,phy-rescode-offset-bot = /bits/ 8 <0 0 0 0 0>;
+ qcom,phy-drive-ldo-level = <400>;
};
...
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-14nm.yaml b/Documentation/devicetree/bindings/display/msm/dsi-phy-14nm.yaml
index 064df50e21a5..a43e11d3b00d 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-phy-14nm.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-14nm.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Display DSI 14nm PHY
maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
allOf:
- $ref: dsi-phy-common.yaml#
@@ -16,7 +16,9 @@ properties:
compatible:
enum:
- qcom,dsi-phy-14nm
+ - qcom,dsi-phy-14nm-2290
- qcom,dsi-phy-14nm-660
+ - qcom,dsi-phy-14nm-8953
reg:
items:
@@ -37,7 +39,6 @@ required:
- compatible
- reg
- reg-names
- - vcca-supply
unevaluatedProperties: false
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-20nm.yaml b/Documentation/devicetree/bindings/display/msm/dsi-phy-20nm.yaml
index b8de785ce815..9c1f9140c731 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-phy-20nm.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-20nm.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Display DSI 20nm PHY
maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
allOf:
- $ref: dsi-phy-common.yaml#
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-28nm.yaml b/Documentation/devicetree/bindings/display/msm/dsi-phy-28nm.yaml
index 69eecaa64b18..cf4a338c4661 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-phy-28nm.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-28nm.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Display DSI 28nm PHY
maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
allOf:
- $ref: dsi-phy-common.yaml#
@@ -16,6 +16,7 @@ properties:
compatible:
enum:
- qcom,dsi-phy-28nm-hpm
+ - qcom,dsi-phy-28nm-hpm-fam-b
- qcom,dsi-phy-28nm-lp
- qcom,dsi-phy-28nm-8960
@@ -34,6 +35,10 @@ properties:
vddio-supply:
description: Phandle to vdd-io regulator device node.
+ qcom,dsi-phy-regulator-ldo-mode:
+ type: boolean
+    description: Indicates that the PHY regulator should operate in LDO mode.
+
required:
- compatible
- reg
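
Using the new LDO-mode flag is a single boolean line on the PHY node. A sketch follows, assuming the usual 28nm PHY register layout; the unit address, register ranges, clock providers and supply phandles are placeholders, not values from a real board:

    dsi-phy@fd922a00 {
        compatible = "qcom,dsi-phy-28nm-hpm-fam-b";
        reg = <0xfd922a00 0xd4>,
              <0xfd922b00 0x280>,
              <0xfd922d80 0x30>;
        reg-names = "dsi_pll", "dsi_phy", "dsi_phy_regulator";

        #clock-cells = <1>;
        #phy-cells = <0>;

        clocks = <&mmss_ahb_clk>, <&xo_board>;
        clock-names = "iface", "ref";

        vddio-supply = <&vreg_dsi_phy_io>;

        /* run the built-in PHY regulator block in LDO mode */
        qcom,dsi-phy-regulator-ldo-mode;
    };
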
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-7nm.yaml b/Documentation/devicetree/bindings/display/msm/dsi-phy-7nm.yaml
index c851770bbdf2..8e9031bbde73 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-phy-7nm.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-7nm.yaml
@@ -18,6 +18,10 @@ properties:
- qcom,dsi-phy-7nm
- qcom,dsi-phy-7nm-8150
- qcom,sc7280-dsi-phy-7nm
+ - qcom,sm6375-dsi-phy-7nm
+ - qcom,sm8350-dsi-phy-5nm
+ - qcom,sm8450-dsi-phy-5nm
+ - qcom,sm8550-dsi-phy-4nm
reg:
items:
@@ -44,7 +48,6 @@ required:
- compatible
- reg
- reg-names
- - vdds-supply
unevaluatedProperties: false
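
The newly added 5nm and 4nm variants keep the common PHY node shape. A sketch for the SM8550 compatible, with placeholder unit address, register ranges and clock provider phandles; vdds-supply is shown even though the schema no longer requires it:

    dsi-phy@ae95000 {
        compatible = "qcom,sm8550-dsi-phy-4nm";
        reg = <0x0ae95000 0x200>,
              <0x0ae95200 0x280>,
              <0x0ae95500 0x400>;
        reg-names = "dsi_phy", "dsi_phy_lane", "dsi_pll";

        #clock-cells = <1>;
        #phy-cells = <0>;

        clocks = <&dispcc_ahb_clk>, <&rpmhcc_xo_clk>;
        clock-names = "iface", "ref";

        /* optional in the schema, but list the regulator when known */
        vdds-supply = <&vreg_dsi_phy>;
    };
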
diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-common.yaml b/Documentation/devicetree/bindings/display/msm/dsi-phy-common.yaml
index 502bdda90235..0f6f08890e7e 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi-phy-common.yaml
+++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-common.yaml
@@ -4,14 +4,13 @@
$id: http://devicetree.org/schemas/display/msm/dsi-phy-common.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Description of Qualcomm Display DSI PHY common dt properties
+title: Qualcomm Display DSI PHY Common Properties
maintainers:
- - Krishna Manikandan <mkrishn@codeaurora.org>
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
-description: |
- This defines the DSI PHY dt properties which are common for all
- dsi phy versions.
+description:
+ Common properties for Qualcomm Display DSI PHY.
properties:
"#clock-cells":
diff --git a/Documentation/devicetree/bindings/display/msm/edp.txt b/Documentation/devicetree/bindings/display/msm/edp.txt
deleted file mode 100644
index eff9daff418c..000000000000
--- a/Documentation/devicetree/bindings/display/msm/edp.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Qualcomm Technologies Inc. adreno/snapdragon eDP output
-
-Required properties:
-- compatible:
- * "qcom,mdss-edp"
-- reg: Physical base address and length of the registers of controller and PLL
-- reg-names: The names of register regions. The following regions are required:
- * "edp"
- * "pll_base"
-- interrupts: The interrupt signal from the eDP block.
-- power-domains: Should be <&mmcc MDSS_GDSC>.
-- clocks: device clocks
- See Documentation/devicetree/bindings/clock/clock-bindings.txt for details.
-- clock-names: the following clocks are required:
- * "core"
- * "iface"
- * "mdp_core"
- * "pixel"
- * "link"
-- #clock-cells: The value should be 1.
-- vdda-supply: phandle to vdda regulator device node
-- lvl-vdd-supply: phandle to regulator device node which is used to supply power
- to HPD receiving chip
-- panel-en-gpios: GPIO pin to supply power to panel.
-- panel-hpd-gpios: GPIO pin used for eDP hpd.
-
-
-Example:
- mdss_edp: qcom,mdss_edp@fd923400 {
- compatible = "qcom,mdss-edp";
- reg-names =
- "edp",
- "pll_base";
- reg = <0xfd923400 0x700>,
- <0xfd923a00 0xd4>;
- interrupt-parent = <&mdss_mdp>;
- interrupts = <12 0>;
- power-domains = <&mmcc MDSS_GDSC>;
- clock-names =
- "core",
- "pixel",
- "iface",
- "link",
- "mdp_core";
- clocks =
- <&mmcc MDSS_EDPAUX_CLK>,
- <&mmcc MDSS_EDPPIXEL_CLK>,
- <&mmcc MDSS_AHB_CLK>,
- <&mmcc MDSS_EDPLINK_CLK>,
- <&mmcc MDSS_MDP_CLK>;
- #clock-cells = <1>;
- vdda-supply = <&pma8084_l12>;
- lvl-vdd-supply = <&lvl_vreg>;
- panel-en-gpios = <&tlmm 137 0>;
- panel-hpd-gpios = <&tlmm 103 0>;
- };
diff --git a/Documentation/devicetree/bindings/display/msm/gmu.yaml b/Documentation/devicetree/bindings/display/msm/gmu.yaml
index fe55611d2603..029d72822d8b 100644
--- a/Documentation/devicetree/bindings/display/msm/gmu.yaml
+++ b/Documentation/devicetree/bindings/display/msm/gmu.yaml
@@ -3,10 +3,10 @@
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/msm/gmu.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/msm/gmu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Devicetree bindings for the GMU attached to certain Adreno GPUs
+title: GMU attached to certain Adreno GPUs
maintainers:
- Rob Clark <robdclark@gmail.com>
@@ -20,35 +20,24 @@ description: |
properties:
compatible:
items:
- - enum:
- - qcom,adreno-gmu-630.2
+ - pattern: '^qcom,adreno-gmu-6[0-9][0-9]\.[0-9]$'
- const: qcom,adreno-gmu
reg:
- items:
- - description: Core GMU registers
- - description: GMU PDC registers
- - description: GMU PDC sequence registers
+ minItems: 3
+ maxItems: 4
reg-names:
- items:
- - const: gmu
- - const: gmu_pdc
- - const: gmu_pdc_seq
+ minItems: 3
+ maxItems: 4
clocks:
- items:
- - description: GMU clock
- - description: GPU CX clock
- - description: GPU AXI clock
- - description: GPU MEMNOC clock
+ minItems: 4
+ maxItems: 7
clock-names:
- items:
- - const: gmu
- - const: cxo
- - const: axi
- - const: memnoc
+ minItems: 4
+ maxItems: 7
interrupts:
items:
@@ -76,6 +65,9 @@ properties:
operating-points-v2: true
+ opp-table:
+ type: object
+
required:
- compatible
- reg
@@ -91,6 +83,140 @@ required:
additionalProperties: false
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,adreno-gmu-618.0
+ - qcom,adreno-gmu-630.2
+ then:
+ properties:
+ reg:
+ items:
+ - description: Core GMU registers
+ - description: GMU PDC registers
+ - description: GMU PDC sequence registers
+ reg-names:
+ items:
+ - const: gmu
+ - const: gmu_pdc
+ - const: gmu_pdc_seq
+ clocks:
+ items:
+ - description: GMU clock
+ - description: GPU CX clock
+ - description: GPU AXI clock
+ - description: GPU MEMNOC clock
+ clock-names:
+ items:
+ - const: gmu
+ - const: cxo
+ - const: axi
+ - const: memnoc
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,adreno-gmu-635.0
+ then:
+ properties:
+ reg:
+ items:
+ - description: Core GMU registers
+ - description: Resource controller registers
+ - description: GMU PDC registers
+ reg-names:
+ items:
+ - const: gmu
+ - const: rscc
+ - const: gmu_pdc
+ clocks:
+ items:
+ - description: GMU clock
+ - description: GPU CX clock
+ - description: GPU AXI clock
+ - description: GPU MEMNOC clock
+ - description: GPU AHB clock
+ - description: GPU HUB CX clock
+ - description: GPU SMMU vote clock
+ clock-names:
+ items:
+ - const: gmu
+ - const: cxo
+ - const: axi
+ - const: memnoc
+ - const: ahb
+ - const: hub
+ - const: smmu_vote
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,adreno-gmu-640.1
+ then:
+ properties:
+ reg:
+ items:
+ - description: Core GMU registers
+ - description: GMU PDC registers
+ - description: GMU PDC sequence registers
+ reg-names:
+ items:
+ - const: gmu
+ - const: gmu_pdc
+ - const: gmu_pdc_seq
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,adreno-gmu-650.2
+ then:
+ properties:
+ reg:
+ items:
+ - description: Core GMU registers
+ - description: Resource controller registers
+ - description: GMU PDC registers
+ - description: GMU PDC sequence registers
+ reg-names:
+ items:
+ - const: gmu
+ - const: rscc
+ - const: gmu_pdc
+ - const: gmu_pdc_seq
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,adreno-gmu-640.1
+ - qcom,adreno-gmu-650.2
+ then:
+ properties:
+ clocks:
+ items:
+ - description: GPU AHB clock
+ - description: GMU clock
+ - description: GPU CX clock
+ - description: GPU AXI clock
+ - description: GPU MEMNOC clock
+ clock-names:
+ items:
+ - const: ahb
+ - const: gmu
+ - const: cxo
+ - const: axi
+ - const: memnoc
+
examples:
- |
#include <dt-bindings/clock/qcom,gpucc-sdm845.h>
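
A node matching the adreno-gmu-630.2 branch of the if/then rules above looks roughly as follows; the register addresses, interrupt numbers and provider phandles follow the SDM845 layout and are assumptions here, not schema requirements:

    gmu: gmu@506a000 {
        compatible = "qcom,adreno-gmu-630.2", "qcom,adreno-gmu";

        reg = <0x506a000 0x30000>,
              <0xb280000 0x10000>,
              <0xb480000 0x10000>;
        reg-names = "gmu", "gmu_pdc", "gmu_pdc_seq";

        clocks = <&gpucc GPU_CC_CX_GMU_CLK>,
                 <&gpucc GPU_CC_CXO_CLK>,
                 <&gcc GCC_DDRSS_GPU_AXI_CLK>,
                 <&gcc GCC_GPU_MEMNOC_GFX_CLK>;
        clock-names = "gmu", "cxo", "axi", "memnoc";

        interrupts = <GIC_SPI 304 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 305 IRQ_TYPE_LEVEL_HIGH>;
        interrupt-names = "hfi", "gmu";

        power-domains = <&gpucc GPU_CX_GDSC>,
                        <&gpucc GPU_GX_GDSC>;
        power-domain-names = "cx", "gx";

        iommus = <&adreno_smmu 5>;

        operating-points-v2 = <&gmu_opp_table>;
    };
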
diff --git a/Documentation/devicetree/bindings/display/msm/gpu.txt b/Documentation/devicetree/bindings/display/msm/gpu.txt
deleted file mode 100644
index 090dcb3fc34d..000000000000
--- a/Documentation/devicetree/bindings/display/msm/gpu.txt
+++ /dev/null
@@ -1,157 +0,0 @@
-Qualcomm adreno/snapdragon GPU
-
-Required properties:
-- compatible: "qcom,adreno-XYZ.W", "qcom,adreno" or
- "amd,imageon-XYZ.W", "amd,imageon"
- for example: "qcom,adreno-306.0", "qcom,adreno"
- Note that you need to list the less specific "qcom,adreno" (since this
- is what the device is matched on), in addition to the more specific
- with the chip-id.
- If "amd,imageon" is used, there should be no top level msm device.
-- reg: Physical base address and length of the controller's registers.
-- interrupts: The interrupt signal from the gpu.
-- clocks: device clocks (if applicable)
- See ../clocks/clock-bindings.txt for details.
-- clock-names: the following clocks are required by a3xx, a4xx and a5xx
- cores:
- * "core"
- * "iface"
- * "mem_iface"
- For GMU attached devices the GPU clocks are not used and are not required. The
- following devices should not list clocks:
- - qcom,adreno-630.2
-- iommus: optional phandle to an adreno iommu instance
-- operating-points-v2: optional phandle to the OPP operating points
-- interconnects: optional phandle to an interconnect provider. See
- ../interconnect/interconnect.txt for details. Some A3xx and all A4xx platforms
- will have two paths; all others will have one path.
-- interconnect-names: The names of the interconnect paths that correspond to the
- interconnects property. Values must be gfx-mem and ocmem.
-- qcom,gmu: For GMU attached devices a phandle to the GMU device that will
- control the power for the GPU. Applicable targets:
- - qcom,adreno-630.2
-- zap-shader: For a5xx and a6xx devices this node contains a memory-region that
- points to reserved memory to store the zap shader that can be used to help
- bring the GPU out of secure mode.
-- firmware-name: optional property of the 'zap-shader' node, listing the
- relative path of the device specific zap firmware.
-- sram: phandle to the On Chip Memory (OCMEM) that's present on some a3xx and
- a4xx Snapdragon SoCs. See
- Documentation/devicetree/bindings/sram/qcom,ocmem.yaml.
-
-Optional properties:
-- #cooling-cells: The value must be 2. For details, please refer
- Documentation/devicetree/bindings/thermal/thermal-cooling-devices.yaml.
-
-Example 3xx/4xx:
-
-/ {
- ...
-
- gpu: adreno@fdb00000 {
- compatible = "qcom,adreno-330.2",
- "qcom,adreno";
- reg = <0xfdb00000 0x10000>;
- reg-names = "kgsl_3d0_reg_memory";
- interrupts = <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "kgsl_3d0_irq";
- clock-names = "core",
- "iface",
- "mem_iface";
- clocks = <&mmcc OXILI_GFX3D_CLK>,
- <&mmcc OXILICX_AHB_CLK>,
- <&mmcc OXILICX_AXI_CLK>;
- sram = <&gpu_sram>;
- power-domains = <&mmcc OXILICX_GDSC>;
- operating-points-v2 = <&gpu_opp_table>;
- iommus = <&gpu_iommu 0>;
- #cooling-cells = <2>;
- };
-
- gpu_sram: ocmem@fdd00000 {
- compatible = "qcom,msm8974-ocmem";
-
- reg = <0xfdd00000 0x2000>,
- <0xfec00000 0x180000>;
- reg-names = "ctrl",
- "mem";
-
- clocks = <&rpmcc RPM_SMD_OCMEMGX_CLK>,
- <&mmcc OCMEMCX_OCMEMNOC_CLK>;
- clock-names = "core",
- "iface";
-
- #address-cells = <1>;
- #size-cells = <1>;
-
- gpu_sram: gpu-sram@0 {
- reg = <0x0 0x100000>;
- ranges = <0 0 0xfec00000 0x100000>;
- };
- };
-};
-
-Example a6xx (with GMU):
-
-/ {
- ...
-
- gpu@5000000 {
- compatible = "qcom,adreno-630.2", "qcom,adreno";
- #stream-id-cells = <16>;
-
- reg = <0x5000000 0x40000>, <0x509e000 0x10>;
- reg-names = "kgsl_3d0_reg_memory", "cx_mem";
-
- #cooling-cells = <2>;
-
- /*
- * Look ma, no clocks! The GPU clocks and power are
- * controlled entirely by the GMU
- */
-
- interrupts = <GIC_SPI 300 IRQ_TYPE_LEVEL_HIGH>;
-
- iommus = <&adreno_smmu 0>;
-
- operating-points-v2 = <&gpu_opp_table>;
-
- interconnects = <&rsc_hlos MASTER_GFX3D &rsc_hlos SLAVE_EBI1>;
- interconnect-names = "gfx-mem";
-
- gpu_opp_table: opp-table {
- compatible = "operating-points-v2";
-
- opp-430000000 {
- opp-hz = /bits/ 64 <430000000>;
- opp-level = <RPMH_REGULATOR_LEVEL_SVS_L1>;
- opp-peak-kBps = <5412000>;
- };
-
- opp-355000000 {
- opp-hz = /bits/ 64 <355000000>;
- opp-level = <RPMH_REGULATOR_LEVEL_SVS>;
- opp-peak-kBps = <3072000>;
- };
-
- opp-267000000 {
- opp-hz = /bits/ 64 <267000000>;
- opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS>;
- opp-peak-kBps = <3072000>;
- };
-
- opp-180000000 {
- opp-hz = /bits/ 64 <180000000>;
- opp-level = <RPMH_REGULATOR_LEVEL_MIN_SVS>;
- opp-peak-kBps = <1804000>;
- };
- };
-
- qcom,gmu = <&gmu>;
-
- zap-shader {
- memory-region = <&zap_shader_region>;
- firmware-name = "qcom/LENOVO/81JL/qcdxkmsuc850.mbn"
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/display/msm/gpu.yaml b/Documentation/devicetree/bindings/display/msm/gpu.yaml
new file mode 100644
index 000000000000..5dabe7b6794b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/gpu.yaml
@@ -0,0 +1,294 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: http://devicetree.org/schemas/display/msm/gpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Adreno or Snapdragon GPUs
+
+maintainers:
+ - Rob Clark <robdclark@gmail.com>
+
+properties:
+ compatible:
+ oneOf:
+      - description: |
+          The driver parses the Adreno compatible string to
+          determine the GPU ID and patch level.
+ items:
+ - pattern: '^qcom,adreno-[3-6][0-9][0-9]\.[0-9]$'
+ - const: qcom,adreno
+      - description: |
+          The driver parses the Imageon compatible string to
+          determine the GPU ID and patch level.
+ items:
+ - pattern: '^amd,imageon-200\.[0-1]$'
+ - const: amd,imageon
+
+ clocks: true
+
+ clock-names: true
+
+ reg:
+ minItems: 1
+ maxItems: 3
+
+ reg-names:
+ minItems: 1
+ items:
+ - const: kgsl_3d0_reg_memory
+ - const: cx_mem
+ - const: cx_dbgc
+
+ interrupts:
+ maxItems: 1
+
+ interrupt-names:
+ maxItems: 1
+
+ interconnects:
+ minItems: 1
+ maxItems: 2
+
+ interconnect-names:
+ minItems: 1
+ items:
+ - const: gfx-mem
+ - const: ocmem
+
+ iommus:
+ minItems: 1
+ maxItems: 64
+
+ sram:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ minItems: 1
+ maxItems: 4
+ items:
+ maxItems: 1
+    description: |
+      Phandles to one or more reserved on-chip SRAM regions, such as the
+      On Chip Memory (OCMEM) present on some a3xx and a4xx Snapdragon SoCs.
+      See Documentation/devicetree/bindings/sram/qcom,ocmem.yaml
+
+ operating-points-v2: true
+ opp-table:
+ type: object
+
+ power-domains:
+ maxItems: 1
+
+ zap-shader:
+ type: object
+ additionalProperties: false
+    description: |
+      For a5xx and a6xx devices, this node contains a memory-region
+      pointing to reserved memory that stores the zap shader used to
+      bring the GPU out of secure mode.
+ properties:
+ memory-region:
+ maxItems: 1
+
+ firmware-name:
+ description: |
+ Default name of the firmware to load to the remote processor.
+
+ "#cooling-cells":
+ const: 2
+
+ nvmem-cell-names:
+ maxItems: 1
+
+ nvmem-cells:
+ description: efuse registers
+ maxItems: 1
+
+ qcom,gmu:
+ $ref: /schemas/types.yaml#/definitions/phandle
+    description: |
+      For GMU-attached devices, a phandle to the GMU device that
+      controls power for the GPU.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ pattern: '^qcom,adreno-[3-5][0-9][0-9]\.[0-9]$'
+
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 7
+
+ clock-names:
+ items:
+ anyOf:
+ - const: core
+ description: GPU Core clock
+ - const: iface
+ description: GPU Interface clock
+ - const: mem
+ description: GPU Memory clock
+ - const: mem_iface
+ description: GPU Memory Interface clock
+ - const: alt_mem_iface
+ description: GPU Alternative Memory Interface clock
+ - const: gfx3d
+ description: GPU 3D engine clock
+ - const: rbbmtimer
+ description: GPU RBBM Timer for Adreno 5xx series
+ - const: rbcpr
+ description: GPU RB Core Power Reduction clock
+ minItems: 2
+ maxItems: 7
+
+ required:
+ - clocks
+ - clock-names
+ - if:
+ properties:
+ compatible:
+ contains:
+ pattern: '^qcom,adreno-6[0-9][0-9]\.[0-9]$'
+
+    then: # Adreno 6xx series clocks are defined in the GMU node
+ properties:
+ clocks: false
+ clock-names: false
+
+examples:
+ - |
+
+ // Example a3xx/4xx:
+
+ #include <dt-bindings/clock/qcom,mmcc-msm8974.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ gpu: gpu@fdb00000 {
+ compatible = "qcom,adreno-330.2", "qcom,adreno";
+
+ reg = <0xfdb00000 0x10000>;
+ reg-names = "kgsl_3d0_reg_memory";
+
+ clock-names = "core", "iface", "mem_iface";
+ clocks = <&mmcc OXILI_GFX3D_CLK>,
+ <&mmcc OXILICX_AHB_CLK>,
+ <&mmcc OXILICX_AXI_CLK>;
+
+ interrupts = <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "kgsl_3d0_irq";
+
+ sram = <&gpu_sram>;
+ power-domains = <&mmcc OXILICX_GDSC>;
+ operating-points-v2 = <&gpu_opp_table>;
+ iommus = <&gpu_iommu 0>;
+ #cooling-cells = <2>;
+ };
+
+ ocmem@fdd00000 {
+ compatible = "qcom,msm8974-ocmem";
+
+ reg = <0xfdd00000 0x2000>,
+ <0xfec00000 0x180000>;
+ reg-names = "ctrl", "mem";
+
+ clocks = <&rpmcc RPM_SMD_OCMEMGX_CLK>,
+ <&mmcc OCMEMCX_OCMEMNOC_CLK>;
+ clock-names = "core", "iface";
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0 0xfec00000 0x100000>;
+
+ gpu_sram: gpu-sram@0 {
+ reg = <0x0 0x100000>;
+ };
+ };
+ - |
+
+ // Example a6xx (with GMU):
+
+ #include <dt-bindings/clock/qcom,gpucc-sdm845.h>
+ #include <dt-bindings/clock/qcom,gcc-sdm845.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sdm845.h>
+
+ reserved-memory {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ zap_shader_region: gpu@8f200000 {
+ compatible = "shared-dma-pool";
+ reg = <0x0 0x90b00000 0x0 0xa00000>;
+ no-map;
+ };
+ };
+
+ gpu@5000000 {
+ compatible = "qcom,adreno-630.2", "qcom,adreno";
+
+ reg = <0x5000000 0x40000>, <0x509e000 0x10>;
+ reg-names = "kgsl_3d0_reg_memory", "cx_mem";
+
+ #cooling-cells = <2>;
+
+ interrupts = <GIC_SPI 300 IRQ_TYPE_LEVEL_HIGH>;
+
+ iommus = <&adreno_smmu 0>;
+
+ operating-points-v2 = <&gpu_opp_table>;
+
+ interconnects = <&rsc_hlos MASTER_GFX3D &rsc_hlos SLAVE_EBI1>;
+ interconnect-names = "gfx-mem";
+
+ qcom,gmu = <&gmu>;
+
+ gpu_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-430000000 {
+ opp-hz = /bits/ 64 <430000000>;
+ opp-level = <RPMH_REGULATOR_LEVEL_SVS_L1>;
+ opp-peak-kBps = <5412000>;
+ };
+
+ opp-355000000 {
+ opp-hz = /bits/ 64 <355000000>;
+ opp-level = <RPMH_REGULATOR_LEVEL_SVS>;
+ opp-peak-kBps = <3072000>;
+ };
+
+ opp-267000000 {
+ opp-hz = /bits/ 64 <267000000>;
+ opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS>;
+ opp-peak-kBps = <3072000>;
+ };
+
+ opp-180000000 {
+ opp-hz = /bits/ 64 <180000000>;
+ opp-level = <RPMH_REGULATOR_LEVEL_MIN_SVS>;
+ opp-peak-kBps = <1804000>;
+ };
+ };
+
+ zap-shader {
+ memory-region = <&zap_shader_region>;
+ firmware-name = "qcom/LENOVO/81JL/qcdxkmsuc850.mbn";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/msm/hdmi.txt b/Documentation/devicetree/bindings/display/msm/hdmi.txt
deleted file mode 100644
index 5f90a40da51b..000000000000
--- a/Documentation/devicetree/bindings/display/msm/hdmi.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-Qualcomm adreno/snapdragon hdmi output
-
-Required properties:
-- compatible: one of the following
- * "qcom,hdmi-tx-8996"
- * "qcom,hdmi-tx-8994"
- * "qcom,hdmi-tx-8084"
- * "qcom,hdmi-tx-8974"
- * "qcom,hdmi-tx-8660"
- * "qcom,hdmi-tx-8960"
-- reg: Physical base address and length of the controller's registers
-- reg-names: "core_physical"
-- interrupts: The interrupt signal from the hdmi block.
-- power-domains: Should be <&mmcc MDSS_GDSC>.
-- clocks: device clocks
- See ../clocks/clock-bindings.txt for details.
-- core-vdda-supply: phandle to supply regulator
-- hdmi-mux-supply: phandle to mux regulator
-- phys: the phandle for the HDMI PHY device
-- phy-names: the name of the corresponding PHY device
-
-Optional properties:
-- hpd-gpios: hpd pin
-- qcom,hdmi-tx-mux-en-gpios: hdmi mux enable pin
-- qcom,hdmi-tx-mux-sel-gpios: hdmi mux select pin
-- qcom,hdmi-tx-mux-lpm-gpios: hdmi mux lpm pin
-- power-domains: reference to the power domain(s), if available.
-- pinctrl-names: the pin control state names; should contain "default"
-- pinctrl-0: the default pinctrl state (active)
-- pinctrl-1: the "sleep" pinctrl state
-
-HDMI PHY:
-Required properties:
-- compatible: Could be the following
- * "qcom,hdmi-phy-8660"
- * "qcom,hdmi-phy-8960"
- * "qcom,hdmi-phy-8974"
- * "qcom,hdmi-phy-8084"
- * "qcom,hdmi-phy-8996"
-- #phy-cells: Number of cells in a PHY specifier; Should be 0.
-- reg: Physical base address and length of the registers of the PHY sub blocks.
-- reg-names: The names of register regions. The following regions are required:
- * "hdmi_phy"
- * "hdmi_pll"
- For HDMI PHY on msm8996, these additional register regions are required:
- * "hdmi_tx_l0"
- * "hdmi_tx_l1"
- * "hdmi_tx_l3"
- * "hdmi_tx_l4"
-- power-domains: Should be <&mmcc MDSS_GDSC>.
-- clocks: device clocks
- See Documentation/devicetree/bindings/clock/clock-bindings.txt for details.
-- core-vdda-supply: phandle to vdda regulator device node
-
-Example:
-
-/ {
- ...
-
- hdmi: hdmi@4a00000 {
- compatible = "qcom,hdmi-tx-8960";
- reg-names = "core_physical";
- reg = <0x04a00000 0x2f0>;
- interrupts = <GIC_SPI 79 0>;
- power-domains = <&mmcc MDSS_GDSC>;
- clock-names =
- "core",
- "master_iface",
- "slave_iface";
- clocks =
- <&mmcc HDMI_APP_CLK>,
- <&mmcc HDMI_M_AHB_CLK>,
- <&mmcc HDMI_S_AHB_CLK>;
- qcom,hdmi-tx-ddc-clk = <&msmgpio 70 GPIO_ACTIVE_HIGH>;
- qcom,hdmi-tx-ddc-data = <&msmgpio 71 GPIO_ACTIVE_HIGH>;
- qcom,hdmi-tx-hpd = <&msmgpio 72 GPIO_ACTIVE_HIGH>;
- core-vdda-supply = <&pm8921_hdmi_mvs>;
- hdmi-mux-supply = <&ext_3p3v>;
- pinctrl-names = "default", "sleep";
- pinctrl-0 = <&hpd_active &ddc_active &cec_active>;
- pinctrl-1 = <&hpd_suspend &ddc_suspend &cec_suspend>;
-
- phys = <&hdmi_phy>;
- phy-names = "hdmi_phy";
- };
-
- hdmi_phy: phy@4a00400 {
- compatible = "qcom,hdmi-phy-8960";
- reg-names = "hdmi_phy",
- "hdmi_pll";
- reg = <0x4a00400 0x60>,
- <0x4a00500 0x100>;
- #phy-cells = <0>;
- power-domains = <&mmcc MDSS_GDSC>;
- clock-names = "slave_iface";
- clocks = <&mmcc HDMI_S_AHB_CLK>;
- core-vdda-supply = <&pm8921_hdmi_mvs>;
- };
-};
diff --git a/Documentation/devicetree/bindings/display/msm/hdmi.yaml b/Documentation/devicetree/bindings/display/msm/hdmi.yaml
new file mode 100644
index 000000000000..47e97669821c
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/hdmi.yaml
@@ -0,0 +1,232 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: http://devicetree.org/schemas/display/msm/hdmi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Adreno/Snapdragon HDMI output
+
+maintainers:
+ - Rob Clark <robdclark@gmail.com>
+
+properties:
+ compatible:
+ enum:
+ - qcom,hdmi-tx-8084
+ - qcom,hdmi-tx-8660
+ - qcom,hdmi-tx-8960
+ - qcom,hdmi-tx-8974
+ - qcom,hdmi-tx-8994
+ - qcom,hdmi-tx-8996
+
+ clocks:
+ minItems: 1
+ maxItems: 5
+
+ clock-names:
+ minItems: 1
+ maxItems: 5
+
+ reg:
+ minItems: 1
+ maxItems: 3
+
+ reg-names:
+ minItems: 1
+ items:
+ - const: core_physical
+ - const: qfprom_physical
+ - const: hdcp_physical
+
+ interrupts:
+ maxItems: 1
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ enum:
+ - hdmi_phy
+ - hdmi-phy
+ deprecated: true
+
+ core-vdda-supply:
+ description: phandle to VDDA supply regulator
+
+ hdmi-mux-supply:
+ description: phandle to mux regulator
+ deprecated: true
+
+ core-vcc-supply:
+ description: phandle to VCC supply regulator
+
+ hpd-gpios:
+ maxItems: 1
+ description: hpd pin
+
+ qcom,hdmi-tx-mux-en-gpios:
+ maxItems: 1
+ deprecated: true
+ description: HDMI mux enable pin
+
+ qcom,hdmi-tx-mux-sel-gpios:
+ maxItems: 1
+ deprecated: true
+ description: HDMI mux select pin
+
+ qcom,hdmi-tx-mux-lpm-gpios:
+ maxItems: 1
+ deprecated: true
+ description: HDMI mux lpm pin
+
+ '#sound-dai-cells':
+ const: 1
+
+ ports:
+ type: object
+ $ref: /schemas/graph.yaml#/properties/ports
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ description: |
+ Input endpoints of the controller.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ description: |
+ Output endpoints of the controller.
+
+ required:
+ - port@0
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - reg
+ - reg-names
+ - interrupts
+ - phys
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,hdmi-tx-8960
+ - qcom,hdmi-tx-8660
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ clock-names:
+ items:
+ - const: core
+ - const: master_iface
+ - const: slave_iface
+        core-vcc-supply: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,hdmi-tx-8974
+ - qcom,hdmi-tx-8084
+ - qcom,hdmi-tx-8994
+ - qcom,hdmi-tx-8996
+ then:
+ properties:
+ clocks:
+ minItems: 5
+ clock-names:
+ items:
+ - const: mdp_core
+ - const: iface
+ - const: core
+ - const: alt_iface
+ - const: extp
+        hdmi-mux-supply: false
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ hdmi: hdmi@4a00000 {
+ compatible = "qcom,hdmi-tx-8960";
+ reg-names = "core_physical";
+ reg = <0x04a00000 0x2f0>;
+ interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>;
+ clock-names = "core",
+ "master_iface",
+ "slave_iface";
+ clocks = <&clk 61>,
+ <&clk 72>,
+ <&clk 98>;
+ hpd-gpios = <&msmgpio 72 GPIO_ACTIVE_HIGH>;
+ core-vdda-supply = <&pm8921_hdmi_mvs>;
+ hdmi-mux-supply = <&ext_3p3v>;
+ pinctrl-names = "default", "sleep";
+ pinctrl-0 = <&hpd_active &ddc_active &cec_active>;
+ pinctrl-1 = <&hpd_suspend &ddc_suspend &cec_suspend>;
+
+ phys = <&hdmi_phy>;
+ };
+ - |
+ #include <dt-bindings/clock/qcom,gcc-msm8996.h>
+ #include <dt-bindings/clock/qcom,mmcc-msm8996.h>
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ hdmi@9a0000 {
+ compatible = "qcom,hdmi-tx-8996";
+ reg = <0x009a0000 0x50c>,
+ <0x00070000 0x6158>,
+ <0x009e0000 0xfff>;
+ reg-names = "core_physical",
+ "qfprom_physical",
+ "hdcp_physical";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <8 IRQ_TYPE_LEVEL_HIGH>;
+
+ clocks = <&mmcc MDSS_MDP_CLK>,
+ <&mmcc MDSS_AHB_CLK>,
+ <&mmcc MDSS_HDMI_CLK>,
+ <&mmcc MDSS_HDMI_AHB_CLK>,
+ <&mmcc MDSS_EXTPCLK_CLK>;
+ clock-names = "mdp_core",
+ "iface",
+ "core",
+ "alt_iface",
+ "extp";
+
+ phys = <&hdmi_phy>;
+ #sound-dai-cells = <1>;
+
+ pinctrl-names = "default", "sleep";
+ pinctrl-0 = <&hdmi_hpd_active &hdmi_ddc_active>;
+ pinctrl-1 = <&hdmi_hpd_suspend &hdmi_ddc_suspend>;
+
+ core-vdda-supply = <&vreg_l12a_1p8>;
+ core-vcc-supply = <&vreg_s4a_1p8>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&mdp5_intf3_out>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/mdp4.txt b/Documentation/devicetree/bindings/display/msm/mdp4.txt
deleted file mode 100644
index b07eeb38f709..000000000000
--- a/Documentation/devicetree/bindings/display/msm/mdp4.txt
+++ /dev/null
@@ -1,114 +0,0 @@
-Qualcomm adreno/snapdragon MDP4 display controller
-
-Description:
-
-This is the bindings documentation for the MDP4 display controller found in
-SoCs like MSM8960, APQ8064 and MSM8660.
-
-Required properties:
-- compatible:
- * "qcom,mdp4" - mdp4
-- reg: Physical base address and length of the controller's registers.
-- interrupts: The interrupt signal from the display controller.
-- clocks: device clocks
- See ../clocks/clock-bindings.txt for details.
-- clock-names: the following clocks are required.
- * "core_clk"
- * "iface_clk"
- * "bus_clk"
- * "lut_clk"
- * "hdmi_clk"
- * "tv_clk"
-- ports: contains the list of output ports from MDP. These connect to interfaces
- that are external to the MDP hardware, such as HDMI, DSI, EDP etc (LVDS is a
- special case since it is a part of the MDP block itself).
-
- Each output port contains an endpoint that describes how it is connected to an
- external interface. These are described by the standard properties documented
- here:
- Documentation/devicetree/bindings/graph.txt
- Documentation/devicetree/bindings/media/video-interfaces.txt
-
- The output port mappings are:
- Port 0 -> LCDC/LVDS
- Port 1 -> DSI1 Cmd/Video
- Port 2 -> DSI2 Cmd/Video
- Port 3 -> DTV
-
-Optional properties:
-- clock-names: the following clocks are optional:
- * "lut_clk"
-- qcom,lcdc-align-lsb: Boolean value indicating that LSB alignment should be
- used for LCDC. This is only valid for 18bpp panels.
-
-Example:
-
-/ {
- ...
-
- hdmi: hdmi@4a00000 {
- ...
- ports {
- ...
- port@0 {
- reg = <0>;
- hdmi_in: endpoint {
- remote-endpoint = <&mdp_dtv_out>;
- };
- };
- ...
- };
- ...
- };
-
- ...
-
- mdp: mdp@5100000 {
- compatible = "qcom,mdp4";
- reg = <0x05100000 0xf0000>;
- interrupts = <GIC_SPI 75 0>;
- clock-names =
- "core_clk",
- "iface_clk",
- "lut_clk",
- "hdmi_clk",
- "tv_clk";
- clocks =
- <&mmcc MDP_CLK>,
- <&mmcc MDP_AHB_CLK>,
- <&mmcc MDP_AXI_CLK>,
- <&mmcc MDP_LUT_CLK>,
- <&mmcc HDMI_TV_CLK>,
- <&mmcc MDP_TV_CLK>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- mdp_lvds_out: endpoint {
- };
- };
-
- port@1 {
- reg = <1>;
- mdp_dsi1_out: endpoint {
- };
- };
-
- port@2 {
- reg = <2>;
- mdp_dsi2_out: endpoint {
- };
- };
-
- port@3 {
- reg = <3>;
- mdp_dtv_out: endpoint {
- remote-endpoint = <&hdmi_in>;
- };
- };
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/display/msm/mdp4.yaml b/Documentation/devicetree/bindings/display/msm/mdp4.yaml
new file mode 100644
index 000000000000..35204a287579
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/mdp4.yaml
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/mdp4.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Adreno/Snapdragon MDP4 display controller
+
+description: >
+ MDP4 display controller found in SoCs like MSM8960, APQ8064 and MSM8660.
+
+maintainers:
+ - Rob Clark <robdclark@gmail.com>
+
+properties:
+ compatible:
+ const: qcom,mdp4
+
+ clocks:
+ minItems: 6
+ maxItems: 6
+
+ clock-names:
+ items:
+ - const: core_clk
+ - const: iface_clk
+ - const: bus_clk
+ - const: lut_clk
+ - const: hdmi_clk
+ - const: tv_clk
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ iommus:
+ maxItems: 4
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: LCDC/LVDS
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: DSI1 Cmd / Video
+
+ port@2:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: DSI2 Cmd / Video
+
+ port@3:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Digital TV
+
+ qcom,lcdc-align-lsb:
+ type: boolean
+ description: >
+ Indication that LSB alignment should be used for LCDC.
+ This is only valid for 18bpp panels.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ mdp: mdp@5100000 {
+ compatible = "qcom,mdp4";
+ reg = <0x05100000 0xf0000>;
+ interrupts = <0 75 0>;
+ clock-names =
+ "core_clk",
+ "iface_clk",
+ "bus_clk",
+ "lut_clk",
+ "hdmi_clk",
+ "tv_clk";
+ clocks =
+ <&mmcc 77>,
+ <&mmcc 86>,
+ <&mmcc 102>,
+ <&mmcc 75>,
+ <&mmcc 97>,
+ <&mmcc 12>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ mdp_lvds_out: endpoint {
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ mdp_dsi1_out: endpoint {
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+ mdp_dsi2_out: endpoint {
+ };
+ };
+
+ port@3 {
+ reg = <3>;
+ mdp_dtv_out: endpoint {
+ remote-endpoint = <&hdmi_in>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/msm/mdp5.txt b/Documentation/devicetree/bindings/display/msm/mdp5.txt
deleted file mode 100644
index 43d11279c925..000000000000
--- a/Documentation/devicetree/bindings/display/msm/mdp5.txt
+++ /dev/null
@@ -1,160 +0,0 @@
-Qualcomm adreno/snapdragon MDP5 display controller
-
-Description:
-
-This is the bindings documentation for the Mobile Display Subsytem(MDSS) that
-encapsulates sub-blocks like MDP5, DSI, HDMI, eDP etc, and the MDP5 display
-controller found in SoCs like MSM8974, APQ8084, MSM8916, MSM8994 and MSM8996.
-
-MDSS:
-Required properties:
-- compatible:
- * "qcom,mdss" - MDSS
-- reg: Physical base address and length of the controller's registers.
-- reg-names: The names of register regions. The following regions are required:
- * "mdss_phys"
- * "vbif_phys"
-- interrupts: The interrupt signal from MDSS.
-- interrupt-controller: identifies the node as an interrupt controller.
-- #interrupt-cells: specifies the number of cells needed to encode an interrupt
- source, should be 1.
-- power-domains: a power domain consumer specifier according to
- Documentation/devicetree/bindings/power/power_domain.txt
-- clocks: device clocks. See ../clocks/clock-bindings.txt for details.
-- clock-names: the following clocks are required.
- * "iface"
- * "bus"
- * "vsync"
-- #address-cells: number of address cells for the MDSS children. Should be 1.
-- #size-cells: Should be 1.
-- ranges: parent bus address space is the same as the child bus address space.
-
-Optional properties:
-- clock-names: the following clocks are optional:
- * "lut"
-
-MDP5:
-Required properties:
-- compatible:
- * "qcom,mdp5" - MDP5
-- reg: Physical base address and length of the controller's registers.
-- reg-names: The names of register regions. The following regions are required:
- * "mdp_phys"
-- interrupts: Interrupt line from MDP5 to MDSS interrupt controller.
-- clocks: device clocks. See ../clocks/clock-bindings.txt for details.
-- clock-names: the following clocks are required.
-- * "bus"
-- * "iface"
-- * "core"
-- * "vsync"
-- ports: contains the list of output ports from MDP. These connect to interfaces
- that are external to the MDP hardware, such as HDMI, DSI, EDP etc (LVDS is a
- special case since it is a part of the MDP block itself).
-
- Each output port contains an endpoint that describes how it is connected to an
- external interface. These are described by the standard properties documented
- here:
- Documentation/devicetree/bindings/graph.txt
- Documentation/devicetree/bindings/media/video-interfaces.txt
-
- The availability of output ports can vary across SoC revisions:
-
- For MSM8974 and APQ8084:
- Port 0 -> MDP_INTF0 (eDP)
- Port 1 -> MDP_INTF1 (DSI1)
- Port 2 -> MDP_INTF2 (DSI2)
- Port 3 -> MDP_INTF3 (HDMI)
-
- For MSM8916:
- Port 0 -> MDP_INTF1 (DSI1)
-
- For MSM8994 and MSM8996:
- Port 0 -> MDP_INTF1 (DSI1)
- Port 1 -> MDP_INTF2 (DSI2)
- Port 2 -> MDP_INTF3 (HDMI)
-
-Optional properties:
-- clock-names: the following clocks are optional:
- * "lut"
- * "tbu"
- * "tbu_rt"
-
-Example:
-
-/ {
- ...
-
- mdss: mdss@1a00000 {
- compatible = "qcom,mdss";
- reg = <0x1a00000 0x1000>,
- <0x1ac8000 0x3000>;
- reg-names = "mdss_phys", "vbif_phys";
-
- power-domains = <&gcc MDSS_GDSC>;
-
- clocks = <&gcc GCC_MDSS_AHB_CLK>,
- <&gcc GCC_MDSS_AXI_CLK>,
- <&gcc GCC_MDSS_VSYNC_CLK>;
- clock-names = "iface",
- "bus",
- "vsync"
-
- interrupts = <0 72 0>;
-
- interrupt-controller;
- #interrupt-cells = <1>;
-
- #address-cells = <1>;
- #size-cells = <1>;
- ranges;
-
- mdp: mdp@1a01000 {
- compatible = "qcom,mdp5";
- reg = <0x1a01000 0x90000>;
- reg-names = "mdp_phys";
-
- interrupt-parent = <&mdss>;
- interrupts = <0 0>;
-
- clocks = <&gcc GCC_MDSS_AHB_CLK>,
- <&gcc GCC_MDSS_AXI_CLK>,
- <&gcc GCC_MDSS_MDP_CLK>,
- <&gcc GCC_MDSS_VSYNC_CLK>;
- clock-names = "iface",
- "bus",
- "core",
- "vsync";
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- mdp5_intf1_out: endpoint {
- remote-endpoint = <&dsi0_in>;
- };
- };
- };
- };
-
- dsi0: dsi@1a98000 {
- ...
- ports {
- ...
- port@0 {
- reg = <0>;
- dsi0_in: endpoint {
- remote-endpoint = <&mdp5_intf1_out>;
- };
- };
- ...
- };
- ...
- };
-
- dsi_phy0: dsi-phy@1a98300 {
- ...
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/display/msm/mdss-common.yaml b/Documentation/devicetree/bindings/display/msm/mdss-common.yaml
new file mode 100644
index 000000000000..ccd7d6417523
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/mdss-common.yaml
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/mdss-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display MDSS common properties
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+ - Rob Clark <robdclark@gmail.com>
+
+description:
+  Device tree bindings for MSM Mobile Display Subsystem (MDSS) that
+  encapsulates sub-blocks like the DPU display controller, DSI and DP
+  interfaces, etc.
+
+# Do not select this by default, otherwise it is also selected for qcom,mdss
+# devices.
+select:
+ false
+
+properties:
+ $nodename:
+ pattern: "^display-subsystem@[0-9a-f]+$"
+
+ reg:
+ maxItems: 1
+
+ reg-names:
+ const: mdss
+
+ power-domains:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 4
+
+ clock-names:
+ minItems: 2
+ maxItems: 4
+
+ interrupts:
+ maxItems: 1
+
+ interrupt-controller: true
+
+ "#address-cells": true
+
+ "#size-cells": true
+
+ "#interrupt-cells":
+ const: 1
+
+ iommus:
+ minItems: 1
+ items:
+ - description: Phandle to apps_smmu node with SID mask for Hard-Fail port0
+ - description: Phandle to apps_smmu node with SID mask for Hard-Fail port1
+
+ ranges: true
+
+ interconnects:
+ minItems: 1
+ items:
+ - description: Interconnect path from mdp0 (or a single mdp) port to the data bus
+ - description: Interconnect path from mdp1 port to the data bus
+
+ interconnect-names:
+ minItems: 1
+ items:
+ - const: mdp0-mem
+ - const: mdp1-mem
+
+ resets:
+ items:
+ - description: MDSS_CORE reset
+
+required:
+ - reg
+ - reg-names
+ - power-domains
+ - clocks
+ - interrupts
+ - interrupt-controller
+ - iommus
+ - ranges
+
+additionalProperties: true
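
This common schema deliberately carries no example of its own. The node shape it implies is sketched below, borrowing the compatible, clock list and addresses from the MSM8998 variant later in the series; all values are illustrative, and each SoC schema constrains them further:

    display-subsystem@c900000 {
        compatible = "qcom,msm8998-mdss";
        reg = <0x0c900000 0x1000>;
        reg-names = "mdss";

        power-domains = <&mmcc MDSS_GDSC>;

        clocks = <&mmcc MDSS_AHB_CLK>,
                 <&mmcc MDSS_AXI_CLK>,
                 <&mmcc MDSS_MDP_CLK>;
        clock-names = "iface", "bus", "core";

        interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
        interrupt-controller;
        #interrupt-cells = <1>;

        iommus = <&mmss_smmu 0>;

        #address-cells = <1>;
        #size-cells = <1>;
        ranges;

        /* DPU, DSI and PHY child nodes go here */
    };
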
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml b/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml
new file mode 100644
index 000000000000..a763cf8da122
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,mdp5.yaml
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,mdp5.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Adreno/Snapdragon Mobile Display controller (MDP5)
+
+description:
+ MDP5 display controller found in SoCs like MSM8974, APQ8084, MSM8916, MSM8994
+ and MSM8996.
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+ - Rob Clark <robdclark@gmail.com>
+
+properties:
+ compatible:
+ oneOf:
+ - const: qcom,mdp5
+ deprecated: true
+ - items:
+ - enum:
+ - qcom,apq8084-mdp5
+ - qcom,msm8916-mdp5
+ - qcom,msm8917-mdp5
+ - qcom,msm8953-mdp5
+ - qcom,msm8974-mdp5
+ - qcom,msm8976-mdp5
+ - qcom,msm8994-mdp5
+ - qcom,msm8996-mdp5
+ - qcom,sdm630-mdp5
+ - qcom,sdm660-mdp5
+ - const: qcom,mdp5
+
+ $nodename:
+ pattern: '^display-controller@[0-9a-f]+$'
+
+ reg:
+ maxItems: 1
+
+ reg-names:
+ items:
+ - const: mdp_phys
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 4
+ maxItems: 7
+
+ clock-names:
+ oneOf:
+ - minItems: 4
+ items:
+ - const: iface
+ - const: bus
+ - const: core
+ - const: vsync
+ - const: lut
+ - const: tbu
+ - const: tbu_rt
+ # MSM8996 has additional iommu clock
+ - items:
+ - const: iface
+ - const: bus
+ - const: core
+ - const: iommu
+ - const: vsync
+
+ interconnects:
+ minItems: 1
+ items:
+ - description: Interconnect path from mdp0 (or a single mdp) port to the data bus
+ - description: Interconnect path from mdp1 port to the data bus
+ - description: Interconnect path from rotator port to the data bus
+
+ interconnect-names:
+ minItems: 1
+ items:
+ - const: mdp0-mem
+ - const: mdp1-mem
+ - const: rotator-mem
+
+ iommus:
+ items:
+ - description: apps SMMU with the Stream-ID mask for Hard-Fail port0
+
+ power-domains:
+ maxItems: 1
+
+ operating-points-v2: true
+ opp-table:
+ type: object
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description: >
+      Contains the list of output ports from the MDP5 device. These ports
+      connect to interfaces that are external to the MDP5 hardware,
+      such as DSI, HDMI etc. MDP5 devices support up to 4 ports:
+      one or two DSI ports, HDMI and eDP.
+
+ patternProperties:
+ "^port@[0-3]+$":
+ $ref: /schemas/graph.yaml#/properties/port
+
+ # at least one port is required
+ required:
+ - port@0
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-msm8916.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ display-controller@1a01000 {
+        compatible = "qcom,msm8916-mdp5", "qcom,mdp5";
+ reg = <0x1a01000 0x90000>;
+ reg-names = "mdp_phys";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ clocks = <&gcc GCC_MDSS_AHB_CLK>,
+ <&gcc GCC_MDSS_AXI_CLK>,
+ <&gcc GCC_MDSS_MDP_CLK>,
+ <&gcc GCC_MDSS_VSYNC_CLK>;
+ clock-names = "iface",
+ "bus",
+ "core",
+ "vsync";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,mdss.yaml
new file mode 100644
index 000000000000..b0100105e428
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,mdss.yaml
@@ -0,0 +1,211 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Mobile Display SubSystem (MDSS)
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+ - Rob Clark <robdclark@gmail.com>
+
+description:
+  This is the bindings documentation for the Mobile Display Subsystem (MDSS)
+  that encapsulates sub-blocks like MDP5, DSI, HDMI, eDP, etc.
+
+properties:
+ $nodename:
+ pattern: "^display-subsystem@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - qcom,mdss
+
+ reg:
+ minItems: 2
+ maxItems: 3
+
+ reg-names:
+ minItems: 2
+ items:
+ - const: mdss_phys
+ - const: vbif_phys
+ - const: vbif_nrt_phys
+
+ interrupts:
+ maxItems: 1
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 1
+
+ power-domains:
+ maxItems: 1
+ description: |
+ The MDSS power domain provided by GCC
+
+ clocks:
+ oneOf:
+ - minItems: 3
+ items:
+          - description: Display ahb clock
+ - description: Display axi clock
+ - description: Display vsync clock
+ - description: Display core clock
+ - minItems: 1
+ items:
+          - description: Display ahb clock
+ - description: Display core clock
+
+ clock-names:
+ oneOf:
+ - minItems: 3
+ items:
+ - const: iface
+ - const: bus
+ - const: vsync
+ - const: core
+ - minItems: 1
+ items:
+ - const: iface
+ - const: core
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 1
+
+ ranges: true
+
+ resets:
+ items:
+ - description: MDSS_CORE reset
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - interrupts
+ - interrupt-controller
+ - "#interrupt-cells"
+ - power-domains
+ - clocks
+ - clock-names
+ - "#address-cells"
+ - "#size-cells"
+ - ranges
+
+patternProperties:
+ "^display-controller@[1-9a-f][0-9a-f]*$":
+ type: object
+ additionalProperties: true
+ properties:
+ compatible:
+ contains:
+ const: qcom,mdp5
+
+ "^dsi@[1-9a-f][0-9a-f]*$":
+ type: object
+ additionalProperties: true
+ properties:
+ compatible:
+ contains:
+ const: qcom,mdss-dsi-ctrl
+
+ "^phy@[1-9a-f][0-9a-f]*$":
+ type: object
+ additionalProperties: true
+ properties:
+ compatible:
+ enum:
+ - qcom,dsi-phy-14nm
+ - qcom,dsi-phy-14nm-660
+ - qcom,dsi-phy-14nm-8953
+ - qcom,dsi-phy-20nm
+ - qcom,dsi-phy-28nm-hpm
+ - qcom,dsi-phy-28nm-lp
+ - qcom,hdmi-phy-8084
+ - qcom,hdmi-phy-8660
+ - qcom,hdmi-phy-8960
+ - qcom,hdmi-phy-8974
+ - qcom,hdmi-phy-8996
+
+ "^hdmi-tx@[1-9a-f][0-9a-f]*$":
+ type: object
+ additionalProperties: true
+ properties:
+ compatible:
+ enum:
+ - qcom,hdmi-tx-8084
+ - qcom,hdmi-tx-8660
+ - qcom,hdmi-tx-8960
+ - qcom,hdmi-tx-8974
+ - qcom,hdmi-tx-8994
+ - qcom,hdmi-tx-8996
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-msm8916.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ display-subsystem@1a00000 {
+ compatible = "qcom,mdss";
+ reg = <0x1a00000 0x1000>,
+ <0x1ac8000 0x3000>;
+ reg-names = "mdss_phys", "vbif_phys";
+
+ power-domains = <&gcc MDSS_GDSC>;
+
+ clocks = <&gcc GCC_MDSS_AHB_CLK>,
+ <&gcc GCC_MDSS_AXI_CLK>,
+ <&gcc GCC_MDSS_VSYNC_CLK>;
+ clock-names = "iface",
+ "bus",
+ "vsync";
+
+ interrupts = <GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>;
+
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@1a01000 {
+ compatible = "qcom,msm8916-mdp5", "qcom,mdp5";
+ reg = <0x01a01000 0x89000>;
+ reg-names = "mdp_phys";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ clocks = <&gcc GCC_MDSS_AHB_CLK>,
+ <&gcc GCC_MDSS_AXI_CLK>,
+ <&gcc GCC_MDSS_MDP_CLK>,
+ <&gcc GCC_MDSS_VSYNC_CLK>;
+ clock-names = "iface",
+ "bus",
+ "core",
+ "vsync";
+
+ iommus = <&apps_iommu 4>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ mdp5_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,msm8998-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,msm8998-dpu.yaml
new file mode 100644
index 000000000000..8d3cd46260fb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,msm8998-dpu.yaml
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,msm8998-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU on MSM8998
+
+maintainers:
+ - AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,msm8998-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for regdma register set
+ - description: Address offset and size for vbif register set
+ - description: Address offset and size for non-realtime vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: regdma
+ - const: vbif
+ - const: vbif_nrt
+
+ clocks:
+ items:
+ - description: Display ahb clock
+ - description: Display axi clock
+ - description: Display mem-noc clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: mnoc
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,mmcc-msm8998.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@c901000 {
+ compatible = "qcom,msm8998-dpu";
+ reg = <0x0c901000 0x8f000>,
+ <0x0c9a8e00 0xf0>,
+ <0x0c9b0000 0x2008>,
+ <0x0c9b8000 0x1040>;
+ reg-names = "mdp", "regdma", "vbif", "vbif_nrt";
+
+ clocks = <&mmcc MDSS_AHB_CLK>,
+ <&mmcc MDSS_AXI_CLK>,
+ <&mmcc MNOC_AHB_CLK>,
+ <&mmcc MDSS_MDP_CLK>,
+ <&mmcc MDSS_VSYNC_CLK>;
+ clock-names = "iface", "bus", "mnoc", "core", "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmpd MSM8998_VDDMX>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+ };
+...
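
The example above hands operating-points-v2 an &mdp_opp_table that is defined elsewhere in the SoC dtsi. A sketch of the shape such a table takes follows; the two frequencies and the rpmpd corner labels (rpmpd_opp_low_svs, rpmpd_opp_svs_l1, following the usual qcom,rpmpd naming) are illustrative assumptions, not values from this binding:

    mdp_opp_table: opp-table {
        compatible = "operating-points-v2";

        opp-150000000 {
            opp-hz = /bits/ 64 <150000000>;
            required-opps = <&rpmpd_opp_low_svs>;
        };

        opp-412500000 {
            opp-hz = /bits/ 64 <412500000>;
            required-opps = <&rpmpd_opp_svs_l1>;
        };
    };
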
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,msm8998-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,msm8998-mdss.yaml
new file mode 100644
index 000000000000..3c2b6ed98a56
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,msm8998-mdss.yaml
@@ -0,0 +1,272 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,msm8998-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm MSM8998 Display MDSS
+
+maintainers:
+ - AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI and
+ DP interfaces. These bindings describe the MDSS found on the MSM8998 SoC.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,msm8998-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock
+ - description: Display AXI clock
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: core
+
+ iommus:
+ maxItems: 1
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,msm8998-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,msm8998-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-10nm-8998
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,mmcc-msm8998.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@c900000 {
+ compatible = "qcom,msm8998-mdss";
+ reg = <0x0c900000 0x1000>;
+ reg-names = "mdss";
+
+ clocks = <&mmcc MDSS_AHB_CLK>,
+ <&mmcc MDSS_AXI_CLK>,
+ <&mmcc MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "core";
+
+ #address-cells = <1>;
+ #interrupt-cells = <1>;
+ #size-cells = <1>;
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ iommus = <&mmss_smmu 0>;
+
+ power-domains = <&mmcc MDSS_GDSC>;
+ ranges;
+
+ display-controller@c901000 {
+ compatible = "qcom,msm8998-dpu";
+ reg = <0x0c901000 0x8f000>,
+ <0x0c9a8e00 0xf0>,
+ <0x0c9b0000 0x2008>,
+ <0x0c9b8000 0x1040>;
+ reg-names = "mdp", "regdma", "vbif", "vbif_nrt";
+
+ clocks = <&mmcc MDSS_AHB_CLK>,
+ <&mmcc MDSS_AXI_CLK>,
+ <&mmcc MNOC_AHB_CLK>,
+ <&mmcc MDSS_MDP_CLK>,
+ <&mmcc MDSS_VSYNC_CLK>;
+ clock-names = "iface", "bus", "mnoc", "core", "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmpd MSM8998_VDDMX>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+ };
+
+ dsi@c994000 {
+ compatible = "qcom,msm8998-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0c994000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&mmcc MDSS_BYTE0_CLK>,
+ <&mmcc MDSS_BYTE0_INTF_CLK>,
+ <&mmcc MDSS_PCLK0_CLK>,
+ <&mmcc MDSS_ESC0_CLK>,
+ <&mmcc MDSS_AHB_CLK>,
+ <&mmcc MDSS_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+ assigned-clocks = <&mmcc BYTE0_CLK_SRC>, <&mmcc PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmpd MSM8998_VDDCX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi0_phy: phy@c994400 {
+ compatible = "qcom,dsi-phy-10nm-8998";
+ reg = <0x0c994400 0x200>,
+ <0x0c994600 0x280>,
+ <0x0c994a00 0x1e0>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&mmcc MDSS_AHB_CLK>,
+ <&rpmcc RPM_SMD_XO_CLK_SRC>;
+ clock-names = "iface", "ref";
+
+ vdds-supply = <&pm8998_l1>;
+ };
+
+ dsi@c996000 {
+ compatible = "qcom,msm8998-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0c996000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <5>;
+
+ clocks = <&mmcc MDSS_BYTE1_CLK>,
+ <&mmcc MDSS_BYTE1_INTF_CLK>,
+ <&mmcc MDSS_PCLK1_CLK>,
+ <&mmcc MDSS_ESC1_CLK>,
+ <&mmcc MDSS_AHB_CLK>,
+ <&mmcc MDSS_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+ assigned-clocks = <&mmcc BYTE1_CLK_SRC>, <&mmcc PCLK1_CLK_SRC>;
+ assigned-clock-parents = <&dsi1_phy 0>, <&dsi1_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmpd MSM8998_VDDCX>;
+
+ phys = <&dsi1_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi1_in: endpoint {
+ remote-endpoint = <&dpu_intf2_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi1_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi1_phy: phy@c996400 {
+ compatible = "qcom,dsi-phy-10nm-8998";
+ reg = <0x0c996400 0x200>,
+ <0x0c996600 0x280>,
+ <0x0c996a00 0x10e>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&mmcc MDSS_AHB_CLK>,
+ <&rpmcc RPM_SMD_XO_CLK_SRC>;
+ clock-names = "iface", "ref";
+
+ vdds-supply = <&pm8998_l1>;
+ };
+ };
+...
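
One detail of the example worth calling out: each DSI PHY node declares #clock-cells = <1>, i.e. the PHY is itself a clock provider, which is what lets the controller reparent the MMCC byte/pixel RCGs onto the PHY PLL outputs. The fragment below restates the pairing from the example; the "output 0 = byte clock, output 1 = pixel clock" indexing is a driver convention, not something this schema spells out:

    /* From the dsi@c994000 node above */
    assigned-clocks = <&mmcc BYTE0_CLK_SRC>, <&mmcc PCLK0_CLK_SRC>;
    assigned-clock-parents = <&dsi0_phy 0>,   /* PHY byte clock output */
                             <&dsi0_phy 1>;   /* PHY pixel clock output */
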
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,qcm2290-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,qcm2290-dpu.yaml
new file mode 100644
index 000000000000..414f4e7ebdf1
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,qcm2290-dpu.yaml
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,qcm2290-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU on QCM2290
+
+maintainers:
+ - Loic Poulain <loic.poulain@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,qcm2290-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display AXI clock from gcc
+ - description: Display AHB clock from dispcc
+ - description: Display core clock from dispcc
+ - description: Display lut clock from dispcc
+ - description: Display vsync clock from dispcc
+
+ clock-names:
+ items:
+ - const: bus
+ - const: iface
+ - const: core
+ - const: lut
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-qcm2290.h>
+ #include <dt-bindings/clock/qcom,gcc-qcm2290.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@5e01000 {
+ compatible = "qcom,qcm2290-dpu";
+ reg = <0x05e01000 0x8f000>,
+ <0x05eb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus", "iface", "core", "lut", "vsync";
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmpd QCM2290_VDDCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+ };
+...
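
As in the other DPU examples, the controller's interrupt comes from the MDSS wrapper rather than the GIC: the wrapper is an interrupt controller with #interrupt-cells = <1> that demuxes its single GIC SPI to the sub-blocks, with a fixed line number per sub-block. In the surrounding examples the DPU sits on line 0 and the DSI controllers on lines 4 and 5:

    /* Fragment from the example above */
    interrupt-parent = <&mdss>;
    interrupts = <0>;    /* DPU line inside the MDSS interrupt demux */
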
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,qcm2290-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,qcm2290-mdss.yaml
new file mode 100644
index 000000000000..2995b84b2cd4
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,qcm2290-mdss.yaml
@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,qcm2290-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm QCM2290 Display MDSS
+
+maintainers:
+ - Loic Poulain <loic.poulain@linaro.org>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI
+ interface. These bindings describe the MDSS found on the QCM2290 SoC.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,qcm2290-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display AXI clock
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: core
+
+ iommus:
+ maxItems: 2
+
+ interconnects:
+ maxItems: 1
+
+ interconnect-names:
+ maxItems: 1
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,qcm2290-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-ctrl-6g-qcm2290
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-14nm-2290
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-qcm2290.h>
+ #include <dt-bindings/clock/qcom,gcc-qcm2290.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,qcm2290.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@5e00000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "qcom,qcm2290-mdss";
+ reg = <0x05e00000 0x1000>;
+ reg-names = "mdss";
+ power-domains = <&dispcc MDSS_GDSC>;
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "core";
+
+ interrupts = <GIC_SPI 186 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ interconnects = <&mmrt_virt MASTER_MDP0 &bimc SLAVE_EBI1>;
+ interconnect-names = "mdp0-mem";
+
+ iommus = <&apps_smmu 0x420 0x2>,
+ <&apps_smmu 0x421 0x0>;
+ ranges;
+
+ display-controller@5e01000 {
+ compatible = "qcom,qcm2290-dpu";
+ reg = <0x05e01000 0x8f000>,
+ <0x05eb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus", "iface", "core", "lut", "vsync";
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmpd QCM2290_VDDCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+ };
+
+ dsi@5e94000 {
+ compatible = "qcom,dsi-ctrl-6g-qcm2290";
+ reg = <0x05e94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>, <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmpd QCM2290_VDDCX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi0_phy: phy@5e94400 {
+ compatible = "qcom,dsi-phy-14nm-2290";
+ reg = <0x05e94400 0x100>,
+ <0x05e94500 0x300>,
+ <0x05e94800 0x188>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>, <&rpmcc RPM_SMD_XO_CLK_SRC>;
+ clock-names = "iface", "ref";
+ vcca-supply = <&vreg_dsi_phy>;
+ };
+ };
+...
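
A note on the two-cell iommus entries in this example: with the ARM SMMU's #iommu-cells = <2> format, the first cell is the stream ID and the second an SMR mask whose set bits are ignored when matching, so one entry can cover several stream IDs:

    iommus = <&apps_smmu 0x420 0x2>,   /* matches SIDs 0x420 and 0x422 */
             <&apps_smmu 0x421 0x0>;   /* matches SID 0x421 only */
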
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sc7180-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sc7180-dpu.yaml
new file mode 100644
index 000000000000..1fb8321d9ee8
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sc7180-dpu.yaml
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sc7180-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU on SC7180
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sc7180-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display hf axi clock
+ - description: Display ahb clock
+ - description: Display rotator clock
+ - description: Display lut clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: iface
+ - const: rot
+ - const: lut
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7180.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sc7180-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_ROT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus", "iface", "rot", "lut", "core",
+ "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ power-domains = <&rpmhpd SC7180_CX>;
+ operating-points-v2 = <&mdp_opp_table>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+ endpoint {
+ remote-endpoint = <&dp_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sc7180-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sc7180-mdss.yaml
new file mode 100644
index 000000000000..42ef06edddc4
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sc7180-mdss.yaml
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sc7180-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SC7180 Display MDSS
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI and
+ DP interfaces. These bindings describe the MDSS found on the SC7180 SoC.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sc7180-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display AHB clock from dispcc
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: ahb
+ - const: core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 1
+
+ interconnect-names:
+ maxItems: 1
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sc7180-dpu
+
+ "^displayport-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sc7180-dp
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sc7180-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-10nm
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sc7180.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7180.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sdm845.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "qcom,sc7180-mdss";
+ reg = <0xae00000 0x1000>;
+ reg-names = "mdss";
+ power-domains = <&dispcc MDSS_GDSC>;
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "ahb", "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ interconnects = <&mmss_noc MASTER_MDP0 &mc_virt SLAVE_EBI1>;
+ interconnect-names = "mdp0-mem";
+
+ iommus = <&apps_smmu 0x800 0x2>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sc7180-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_ROT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus", "iface", "rot", "lut", "core",
+ "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ power-domains = <&rpmhpd SC7180_CX>;
+ operating-points-v2 = <&mdp_opp_table>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+ dpu_intf0_out: endpoint {
+ remote-endpoint = <&dp_in>;
+ };
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sc7180-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>, <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi_phy 0>, <&dsi_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SC7180_CX>;
+
+ phys = <&dsi_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+
+ dsi_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-187500000 {
+ opp-hz = /bits/ 64 <187500000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-358000000 {
+ opp-hz = /bits/ 64 <358000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
+
+ dsi_phy: phy@ae94400 {
+ compatible = "qcom,dsi-phy-10nm";
+ reg = <0x0ae94400 0x200>,
+ <0x0ae94600 0x280>,
+ <0x0ae94a00 0x1e0>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+
+ displayport-controller@ae90000 {
+ compatible = "qcom,sc7180-dp";
+
+ reg = <0xae90000 0x200>,
+ <0xae90200 0x200>,
+ <0xae90400 0xc00>,
+ <0xae91000 0x400>,
+ <0xae91400 0x400>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <12>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_AUX_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_LINK_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_LINK_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_PIXEL_CLK>;
+ clock-names = "core_iface", "core_aux", "ctrl_link",
+ "ctrl_link_iface", "stream_pixel";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_DP_LINK_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_DP_PIXEL_CLK_SRC>;
+ assigned-clock-parents = <&dp_phy 0>, <&dp_phy 1>;
+ phys = <&dp_phy>;
+ phy-names = "dp";
+
+ operating-points-v2 = <&dp_opp_table>;
+ power-domains = <&rpmhpd SC7180_CX>;
+
+ #sound-dai-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ port@0 {
+ reg = <0>;
+ dp_in: endpoint {
+ remote-endpoint = <&dpu_intf0_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dp_out: endpoint { };
+ };
+ };
+
+ dp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-160000000 {
+ opp-hz = /bits/ 64 <160000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-270000000 {
+ opp-hz = /bits/ 64 <270000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-540000000 {
+ opp-hz = /bits/ 64 <540000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-810000000 {
+ opp-hz = /bits/ 64 <810000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+ };
+...
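
The required-opps phandles in the DSI and DP OPP tables above (&rpmhpd_opp_low_svs and friends) resolve to corner OPPs exported by the RPMh power-domain provider, which lives outside this snippet. A sketch of that provider's shape, following the usual qcom,rpmhpd layout; the opp-level values are illustrative RPMh corner numbers, not taken from this binding:

    rpmhpd: power-controller {
        compatible = "qcom,sc7180-rpmhpd";
        #power-domain-cells = <1>;
        operating-points-v2 = <&rpmhpd_opp_table>;

        rpmhpd_opp_table: opp-table {
            compatible = "operating-points-v2";

            rpmhpd_opp_low_svs: opp-64 {
                opp-level = <64>;     /* LOW_SVS corner */
            };

            rpmhpd_opp_svs: opp-128 {
                opp-level = <128>;    /* SVS corner */
            };
        };
    };
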
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sc7280-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sc7280-dpu.yaml
new file mode 100644
index 000000000000..26dc073bd19a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sc7280-dpu.yaml
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sc7280-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU on SC7280
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sc7280-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display hf axi clock
+ - description: Display sf axi clock
+ - description: Display ahb clock
+ - description: Display lut clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: nrt_bus
+ - const: iface
+ - const: lut
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sc7280-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ power-domains = <&rpmhpd SC7280_CX>;
+ operating-points-v2 = <&mdp_opp_table>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&edp_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sc7280-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sc7280-mdss.yaml
new file mode 100644
index 000000000000..078e1d1a7d2f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sc7280-mdss.yaml
@@ -0,0 +1,427 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sc7280-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SC7280 Display MDSS
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI,
+ DP and eDP interfaces. These bindings describe the MDSS found on the
+ SC7280 SoC.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sc7280-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display AHB clock from dispcc
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: ahb
+ - const: core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 1
+
+ interconnect-names:
+ maxItems: 1
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sc7280-dpu
+
+ "^displayport-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sc7280-dp
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sc7280-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^edp@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sc7280-edp
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ enum:
+ - qcom,sc7280-dsi-phy-7nm
+ - qcom,sc7280-edp-phy
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,gcc-sc7280.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sc7280.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "qcom,sc7280-mdss";
+ reg = <0xae00000 0x1000>;
+ reg-names = "mdss";
+ power-domains = <&dispcc DISP_CC_MDSS_CORE_GDSC>;
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface",
+ "ahb",
+ "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ interconnects = <&mmss_noc MASTER_MDP0 &mc_virt SLAVE_EBI1>;
+ interconnect-names = "mdp0-mem";
+
+ iommus = <&apps_smmu 0x900 0x402>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sc7280-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ power-domains = <&rpmhpd SC7280_CX>;
+ operating-points-v2 = <&mdp_opp_table>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf5_out: endpoint {
+ remote-endpoint = <&edp_in>;
+ };
+ };
+
+ port@2 {
+ reg = <2>;
+ dpu_intf0_out: endpoint {
+ remote-endpoint = <&dp_in>;
+ };
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sc7280-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&mdss_dsi_phy 0>, <&mdss_dsi_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SC7280_CX>;
+
+ phys = <&mdss_dsi_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+
+ dsi_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-187500000 {
+ opp-hz = /bits/ 64 <187500000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-358000000 {
+ opp-hz = /bits/ 64 <358000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
+
+ mdss_dsi_phy: phy@ae94400 {
+ compatible = "qcom,sc7280-dsi-phy-7nm";
+ reg = <0x0ae94400 0x200>,
+ <0x0ae94600 0x280>,
+ <0x0ae94900 0x280>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+
+ vdds-supply = <&vreg_dsi_supply>;
+ };
+
+ edp@aea0000 {
+ compatible = "qcom,sc7280-edp";
+ pinctrl-names = "default";
+ pinctrl-0 = <&edp_hot_plug_det>;
+
+ reg = <0xaea0000 0x200>,
+ <0xaea0200 0x200>,
+ <0xaea0400 0xc00>,
+ <0xaea1000 0x400>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <14>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_EDP_AUX_CLK>,
+ <&dispcc DISP_CC_MDSS_EDP_LINK_CLK>,
+ <&dispcc DISP_CC_MDSS_EDP_LINK_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_EDP_PIXEL_CLK>;
+ clock-names = "core_iface",
+ "core_aux",
+ "ctrl_link",
+ "ctrl_link_iface",
+ "stream_pixel";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_EDP_LINK_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_EDP_PIXEL_CLK_SRC>;
+ assigned-clock-parents = <&mdss_edp_phy 0>, <&mdss_edp_phy 1>;
+
+ phys = <&mdss_edp_phy>;
+ phy-names = "dp";
+
+ operating-points-v2 = <&edp_opp_table>;
+ power-domains = <&rpmhpd SC7280_CX>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ edp_in: endpoint {
+ remote-endpoint = <&dpu_intf5_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ mdss_edp_out: endpoint { };
+ };
+ };
+
+ edp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-160000000 {
+ opp-hz = /bits/ 64 <160000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-270000000 {
+ opp-hz = /bits/ 64 <270000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-540000000 {
+ opp-hz = /bits/ 64 <540000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+
+ opp-810000000 {
+ opp-hz = /bits/ 64 <810000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+
+ mdss_edp_phy: phy@aec2a00 {
+ compatible = "qcom,sc7280-edp-phy";
+
+ reg = <0xaec2a00 0x19c>,
+ <0xaec2200 0xa0>,
+ <0xaec2600 0xa0>,
+ <0xaec2000 0x1c0>;
+
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&gcc GCC_EDP_CLKREF_EN>;
+ clock-names = "aux",
+ "cfg_ahb";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+ };
+
+ displayport-controller@ae90000 {
+ compatible = "qcom,sc7280-dp";
+
+ reg = <0xae90000 0x200>,
+ <0xae90200 0x200>,
+ <0xae90400 0xc00>,
+ <0xae91000 0x400>,
+ <0xae91400 0x400>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <12>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_AUX_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_LINK_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_LINK_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_DP_PIXEL_CLK>;
+ clock-names = "core_iface",
+ "core_aux",
+ "ctrl_link",
+ "ctrl_link_iface",
+ "stream_pixel";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_DP_LINK_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_DP_PIXEL_CLK_SRC>;
+ assigned-clock-parents = <&dp_phy 0>, <&dp_phy 1>;
+ phys = <&dp_phy>;
+ phy-names = "dp";
+
+ operating-points-v2 = <&dp_opp_table>;
+ power-domains = <&rpmhpd SC7280_CX>;
+
+ #sound-dai-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dp_in: endpoint {
+ remote-endpoint = <&dpu_intf0_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dp_out: endpoint { };
+ };
+ };
+
+ dp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-160000000 {
+ opp-hz = /bits/ 64 <160000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-270000000 {
+ opp-hz = /bits/ 64 <270000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-540000000 {
+ opp-hz = /bits/ 64 <540000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-810000000 {
+ opp-hz = /bits/ 64 <810000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-dpu.yaml
new file mode 100644
index 000000000000..f2c8e16cf067
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-dpu.yaml
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sc8280xp-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SC8280XP Display Processing Unit
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description:
+ Device tree bindings for SC8280XP Display Processing Unit.
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sc8280xp-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display hf axi clock
+ - description: Display sf axi clock
+ - description: Display ahb clock
+ - description: Display lut clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: nrt_bus
+ - const: iface
+ - const: lut
+ - const: core
+ - const: vsync
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sc8280xp.h>
+ #include <dt-bindings/clock/qcom,gcc-sc8280xp.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sc8280xp.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sc8280xp-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc0 DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc0 DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc0 DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc0 DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc0 DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc0 DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <460000000>,
+ <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SC8280XP_MMCX>;
+
+ interrupt-parent = <&mdss0>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp0_in>;
+ };
+ };
+
+ port@4 {
+ reg = <4>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp1_in>;
+ };
+ };
+
+ port@5 {
+ reg = <5>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp3_in>;
+ };
+ };
+
+ port@6 {
+ reg = <6>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp2_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-mdss.yaml
new file mode 100644
index 000000000000..c239544bc37f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sc8280xp-mdss.yaml
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sc8280xp-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SC8280XP Mobile Display Subsystem
+
+maintainers:
+ - Bjorn Andersson <andersson@kernel.org>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI and
+ DP interfaces.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sc8280xp-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display AHB clock from dispcc
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: ahb
+ - const: core
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sc8280xp-dpu
+
+ "^displayport-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ enum:
+ - qcom,sc8280xp-dp
+ - qcom,sc8280xp-edp
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sc8280xp.h>
+ #include <dt-bindings/clock/qcom,gcc-sc8280xp.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sc8280xp.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ compatible = "qcom,sc8280xp-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+
+ power-domains = <&dispcc0 MDSS_GDSC>;
+
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&dispcc0 DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc0 DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface",
+ "ahb",
+ "core";
+
+ resets = <&dispcc0 DISP_CC_MDSS_CORE_BCR>;
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ interconnects = <&mmss_noc MASTER_MDP0 0 &mc_virt SLAVE_EBI1 0>,
+ <&mmss_noc MASTER_MDP1 0 &mc_virt SLAVE_EBI1 0>;
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
+ iommus = <&apps_smmu 0x1000 0x402>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sc8280xp-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc0 DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc0 DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc0 DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc0 DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc0 DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdss0_mdp_opp_table>;
+ power-domains = <&rpmhpd SC8280XP_MMCX>;
+
+ interrupt-parent = <&mdss0>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp0_in>;
+ };
+ };
+
+ port@4 {
+ reg = <4>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp1_in>;
+ };
+ };
+
+ port@5 {
+ reg = <5>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp3_in>;
+ };
+ };
+
+ port@6 {
+ reg = <6>;
+ endpoint {
+ remote-endpoint = <&mdss0_dp2_in>;
+ };
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sdm845-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sdm845-dpu.yaml
new file mode 100644
index 000000000000..0f7765d832e7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sdm845-dpu.yaml
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sdm845-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU on SDM845
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sdm845-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display GCC bus clock
+ - description: Display ahb clock
+ - description: Display axi clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: gcc-bus
+ - const: iface
+ - const: bus
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sdm845.h>
+ #include <dt-bindings/clock/qcom,gcc-sdm845.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sdm845-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "gcc-bus", "iface", "bus", "core", "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ power-domains = <&rpmhpd SDM845_CX>;
+ operating-points-v2 = <&mdp_opp_table>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sdm845-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sdm845-mdss.yaml
new file mode 100644
index 000000000000..6ecb00920d7f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sdm845-mdss.yaml
@@ -0,0 +1,280 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sdm845-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SDM845 Display MDSS
+
+maintainers:
+ - Krishna Manikandan <quic_mkrishn@quicinc.com>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI and
+ DP interfaces. These bindings describe the MDSS found on the SDM845 SoC.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sdm845-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: core
+
+ iommus:
+ maxItems: 2
+
+ interconnects:
+ maxItems: 2
+
+ interconnect-names:
+ maxItems: 2
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sdm845-dpu
+
+ "^displayport-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sdm845-dp
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sdm845-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-10nm
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sdm845.h>
+ #include <dt-bindings/clock/qcom,gcc-sdm845.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "qcom,sdm845-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+ power-domains = <&dispcc MDSS_GDSC>;
+
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ iommus = <&apps_smmu 0x880 0x8>,
+ <&apps_smmu 0xc80 0x8>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sdm845-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "gcc-bus", "iface", "bus", "core", "vsync";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+ power-domains = <&rpmhpd SDM845_CX>;
+ operating-points-v2 = <&mdp_opp_table>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sdm845-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SDM845_CX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi0_phy: phy@ae94400 {
+ compatible = "qcom,dsi-phy-10nm";
+ reg = <0x0ae94400 0x200>,
+ <0x0ae94600 0x280>,
+ <0x0ae94a00 0x1e0>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+
+ dsi@ae96000 {
+ compatible = "qcom,sdm845-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae96000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <5>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE1_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC1_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK_SRC>;
+ assigned-clock-parents = <&dsi1_phy 0>, <&dsi1_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SDM845_CX>;
+
+ phys = <&dsi1_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi1_in: endpoint {
+ remote-endpoint = <&dpu_intf2_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi1_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi1_phy: phy@ae96400 {
+ compatible = "qcom,dsi-phy-10nm";
+ reg = <0x0ae96400 0x200>,
+ <0x0ae96600 0x280>,
+ <0x0ae96a00 0x10e>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+ };
+...
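
In both DSI controller examples the output port (port@1: dsi0_out, dsi1_out) is left dangling; the board DTS is what ties it to a panel. A minimal sketch of that hookup, assuming the SoC dtsi labels the first controller dsi0 and using a placeholder panel compatible:

    &dsi0_out {
        remote-endpoint = <&panel_in>;
        data-lanes = <0 1 2 3>;
    };

    &dsi0 {
        panel@0 {
            compatible = "vendor,example-panel";   /* placeholder */
            reg = <0>;

            port {
                panel_in: endpoint {
                    remote-endpoint = <&dsi0_out>;
                };
            };
        };
    };
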
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm6115-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm6115-dpu.yaml
new file mode 100644
index 000000000000..bf62c2f5325a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm6115-dpu.yaml
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm6115-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display DPU on SM6115
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm6115-dpu
+
+ reg:
+ items:
+ - description: MDP register set
+ - description: VBIF register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display AXI
+ - description: Display AHB
+ - description: Display core
+ - description: Display lut
+ - description: Display rotator
+ - description: Display vsync
+
+ clock-names:
+ items:
+ - const: bus
+ - const: iface
+ - const: core
+ - const: lut
+ - const: rot
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm6115-dispcc.h>
+ #include <dt-bindings/clock/qcom,gcc-sm6115.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@5e01000 {
+ compatible = "qcom,sm6115-dpu";
+ reg = <0x05e01000 0x8f000>,
+ <0x05eb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_ROT_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus", "iface", "core", "lut", "rot", "vsync";
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmpd SM6115_VDDCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm6115-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm6115-mdss.yaml
new file mode 100644
index 000000000000..b9f83088f370
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm6115-mdss.yaml
@@ -0,0 +1,187 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm6115-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM6115 Display MDSS
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+description:
+ Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+ encapsulates sub-blocks such as the DPU display controller and the DSI
+ interface. These bindings describe the MDSS found on the SM6115 SoC.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm6115-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display AXI clock
+ - description: Display core clock
+
+ iommus:
+ maxItems: 2
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm6115-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: qcom,sm6115-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+ - description: Old binding, please don't use
+ deprecated: true
+ const: qcom,dsi-ctrl-6g-qcm2290
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-14nm-2290
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm6115-dispcc.h>
+ #include <dt-bindings/clock/qcom,gcc-sm6115.h>
+ #include <dt-bindings/clock/qcom,rpmcc.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@5e00000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "qcom,sm6115-mdss";
+ reg = <0x05e00000 0x1000>;
+ reg-names = "mdss";
+ power-domains = <&dispcc MDSS_GDSC>;
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+
+ interrupts = <GIC_SPI 186 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ iommus = <&apps_smmu 0x420 0x2>,
+ <&apps_smmu 0x421 0x0>;
+ ranges;
+
+ display-controller@5e01000 {
+ compatible = "qcom,sm6115-dpu";
+ reg = <0x05e01000 0x8f000>,
+ <0x05eb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_ROT_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus", "iface", "core", "lut", "rot", "vsync";
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmpd SM6115_VDDCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+ };
+
+ dsi@5e94000 {
+ compatible = "qcom,sm6115-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x05e94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>, <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmpd SM6115_VDDCX>;
+ phys = <&dsi0_phy>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi0_phy: phy@5e94400 {
+ compatible = "qcom,dsi-phy-14nm-2290";
+ reg = <0x05e94400 0x100>,
+ <0x05e94500 0x300>,
+ <0x05e94800 0x188>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>, <&rpmcc RPM_SMD_XO_CLK_SRC>;
+ clock-names = "iface", "ref";
+ };
+ };
+...
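
The dsi@ pattern above carries a deprecated alternative compatible: SM6115 device trees originally reused the QCM2290 DSI controller string. Side by side, the two spellings the oneOf accepts:

    /* Preferred */
    compatible = "qcom,sm6115-dsi-ctrl", "qcom,mdss-dsi-ctrl";

    /* Accepted but deprecated (flagged "Old binding, please don't use") */
    compatible = "qcom,dsi-ctrl-6g-qcm2290";
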
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-dpu.yaml
new file mode 100644
index 000000000000..2b3f3fe9bdf7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-dpu.yaml
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8150-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8150 Display DPU
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8150-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display ahb clock
+ - description: Display hf axi clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: core
+ - const: vsync
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sm8150.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8150.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8150.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8150-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "iface", "bus", "core", "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8150_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml
new file mode 100644
index 000000000000..5182e958e069
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8150-mdss.yaml
@@ -0,0 +1,332 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8150-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8150 Display MDSS
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+description:
+  Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+  encapsulates sub-blocks such as the DPU display controller and the DSI and
+  DP interfaces. These bindings describe the MDSS found on the SM8150 target.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: qcom,sm8150-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display hf axi clock
+ - description: Display sf axi clock
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: nrt_bus
+ - const: core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 2
+
+ interconnect-names:
+ maxItems: 2
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8150-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sm8150-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-7nm
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sm8150.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8150.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8150.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ compatible = "qcom,sm8150-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+
+ interconnects = <&mmss_noc MASTER_MDP_PORT0 &mc_virt SLAVE_EBI_CH0>,
+ <&mmss_noc MASTER_MDP_PORT1 &mc_virt SLAVE_EBI_CH0>;
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
+ power-domains = <&dispcc MDSS_GDSC>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "nrt_bus", "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ iommus = <&apps_smmu 0x800 0x420>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8150-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "iface", "bus", "core", "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8150_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-171428571 {
+ opp-hz = /bits/ 64 <171428571>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-345000000 {
+ opp-hz = /bits/ 64 <345000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-460000000 {
+ opp-hz = /bits/ 64 <460000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sm8150-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8150_MMCX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+
+ dsi_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-187500000 {
+ opp-hz = /bits/ 64 <187500000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-358000000 {
+ opp-hz = /bits/ 64 <358000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
+
+ dsi0_phy: phy@ae94400 {
+ compatible = "qcom,dsi-phy-7nm";
+ reg = <0x0ae94400 0x200>,
+ <0x0ae94600 0x280>,
+ <0x0ae94900 0x260>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+
+ dsi@ae96000 {
+ compatible = "qcom,sm8150-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae96000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <5>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE1_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC1_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK_SRC>;
+ assigned-clock-parents = <&dsi1_phy 0>, <&dsi1_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8150_MMCX>;
+
+ phys = <&dsi1_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi1_in: endpoint {
+ remote-endpoint = <&dpu_intf2_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi1_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi1_phy: phy@ae96400 {
+ compatible = "qcom,dsi-phy-7nm";
+ reg = <0x0ae96400 0x200>,
+ <0x0ae96600 0x280>,
+ <0x0ae96900 0x260>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+ };
+...
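
A pattern worth calling out in the example above: the MDSS node consumes a single GIC SPI and acts as the interrupt controller for its sub-blocks, which then address its lines with one cell each. Distilled to the essentials (all unrelated properties elided):

    mdss: display-subsystem@ae00000 {
        interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
        interrupt-controller;
        #interrupt-cells = <1>;

        display-controller@ae01000 {
            interrupt-parent = <&mdss>;
            interrupts = <0>;   /* DPU line within the MDSS demux */
        };
    };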
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8250-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8250-dpu.yaml
new file mode 100644
index 000000000000..687c8c170cd4
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8250-dpu.yaml
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8250-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8250 Display DPU
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8250-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display ahb clock
+ - description: Display hf axi clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sm8250.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8250.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8250.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8250-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "iface", "bus", "core", "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8250_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8250-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8250-mdss.yaml
new file mode 100644
index 000000000000..368d3db0ce96
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8250-mdss.yaml
@@ -0,0 +1,334 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8250-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8250 Display MDSS
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+description:
+  Device tree bindings for the MSM Mobile Display Subsystem (MDSS), which
+  encapsulates sub-blocks such as the DPU display controller and the DSI and
+  DP interfaces. These bindings describe the MDSS found on the SM8250 target.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8250-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display hf axi clock
+ - description: Display sf axi clock
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: nrt_bus
+ - const: core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 2
+
+ interconnect-names:
+ maxItems: 2
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8250-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sm8250-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-7nm
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sm8250.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8250.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8250.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ compatible = "qcom,sm8250-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+
+ interconnects = <&mmss_noc MASTER_MDP_PORT0 &mc_virt SLAVE_EBI_CH0>,
+ <&mmss_noc MASTER_MDP_PORT1 &mc_virt SLAVE_EBI_CH0>;
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
+ power-domains = <&dispcc MDSS_GDSC>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "nrt_bus", "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ iommus = <&apps_smmu 0x820 0x402>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8250-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "iface", "bus", "core", "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8250_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-345000000 {
+ opp-hz = /bits/ 64 <345000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-460000000 {
+ opp-hz = /bits/ 64 <460000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sm8250-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8250_MMCX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+
+ dsi_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-187500000 {
+ opp-hz = /bits/ 64 <187500000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-358000000 {
+ opp-hz = /bits/ 64 <358000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
+
+ dsi0_phy: phy@ae94400 {
+ compatible = "qcom,dsi-phy-7nm";
+ reg = <0x0ae94400 0x200>,
+ <0x0ae94600 0x280>,
+ <0x0ae94900 0x260>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+
+ dsi@ae96000 {
+ compatible = "qcom,sm8250-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae96000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <5>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE1_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC1_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK_SRC>;
+ assigned-clock-parents = <&dsi1_phy 0>, <&dsi1_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8250_MMCX>;
+
+ phys = <&dsi1_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi1_in: endpoint {
+ remote-endpoint = <&dpu_intf2_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi1_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi1_phy: phy@ae96400 {
+ compatible = "qcom,dsi-phy-7nm";
+ reg = <0x0ae96400 0x200>,
+ <0x0ae96600 0x280>,
+ <0x0ae96900 0x260>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8350-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8350-dpu.yaml
new file mode 100644
index 000000000000..120500395c9a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8350-dpu.yaml
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8350-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8350 Display DPU
+
+maintainers:
+ - Robert Foss <robert.foss@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8350-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display hf axi clock
+ - description: Display sf axi clock
+ - description: Display ahb clock
+ - description: Display lut clock
+ - description: Display core clock
+ - description: Display vsync clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: nrt_bus
+ - const: iface
+ - const: lut
+ - const: core
+ - const: vsync
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sm8350.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8350.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8350.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8350-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8350_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-345000000 {
+ opp-hz = /bits/ 64 <345000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-460000000 {
+ opp-hz = /bits/ 64 <460000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8350-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8350-mdss.yaml
new file mode 100644
index 000000000000..4d94dbff3054
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8350-mdss.yaml
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8350-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8350 Display MDSS
+
+maintainers:
+ - Robert Foss <robert.foss@linaro.org>
+
+description:
+  MSM Mobile Display Subsystem (MDSS), which encapsulates sub-blocks such as
+  the DPU display controller and the DSI and DP interfaces.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - const: qcom,sm8350-mdss
+
+ clocks:
+ items:
+ - description: Display AHB clock from gcc
+ - description: Display hf axi clock
+ - description: Display sf axi clock
+ - description: Display core clock
+
+ clock-names:
+ items:
+ - const: iface
+ - const: bus
+ - const: nrt_bus
+ - const: core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 2
+
+ interconnect-names:
+ items:
+ - const: mdp0-mem
+ - const: mdp1-mem
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8350-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sm8350-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,dsi-phy-5nm-8350
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,dispcc-sm8350.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8350.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8350.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ compatible = "qcom,sm8350-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+
+ interconnects = <&mmss_noc MASTER_MDP0 0 &mc_virt SLAVE_EBI1 0>,
+ <&mmss_noc MASTER_MDP1 0 &mc_virt SLAVE_EBI1 0>;
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
+ power-domains = <&dispcc MDSS_GDSC>;
+ resets = <&dispcc DISP_CC_MDSS_CORE_BCR>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "nrt_bus", "core";
+
+ iommus = <&apps_smmu 0x820 0x402>;
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8350-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8350_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-345000000 {
+ opp-hz = /bits/ 64 <345000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-460000000 {
+ opp-hz = /bits/ 64 <460000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+
+ dsi0: dsi@ae94000 {
+ compatible = "qcom,sm8350-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&mdss_dsi0_phy 0>,
+ <&mdss_dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8350_MMCX>;
+
+ phys = <&mdss_dsi0_phy>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+ };
+ };
+...
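
The sm8350 example references mdss_dsi0_phy and dsi_opp_table without defining them. A minimal sketch of the referenced PHY node is shown below; the register offsets are carried over from the DSI PHY nodes in the sibling examples and should be read as illustrative, not as verified SM8350 values:

    mdss_dsi0_phy: phy@ae94400 {
        compatible = "qcom,dsi-phy-5nm-8350";
        reg = <0x0ae94400 0x200>,   /* dsi_phy - illustrative */
              <0x0ae94600 0x280>,   /* dsi_phy_lane - illustrative */
              <0x0ae94900 0x260>;   /* dsi_pll - illustrative */
        reg-names = "dsi_phy", "dsi_phy_lane", "dsi_pll";

        #clock-cells = <1>;
        #phy-cells = <0>;
    };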
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8450-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8450-dpu.yaml
new file mode 100644
index 000000000000..0d17ece1c453
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8450-dpu.yaml
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8450-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8450 Display DPU
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8450-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display hf axi
+ - description: Display sf axi
+ - description: Display ahb
+ - description: Display lut
+ - description: Display core
+ - description: Display vsync
+
+ clock-names:
+ items:
+ - const: bus
+ - const: nrt_bus
+ - const: iface
+ - const: lut
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm8450-dispcc.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8450.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8450.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8450-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8450_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-172000000 {
+ opp-hz = /bits/ 64 <172000000>;
+ required-opps = <&rpmhpd_opp_low_svs_d1>;
+ };
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-325000000 {
+ opp-hz = /bits/ 64 <325000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-375000000 {
+ opp-hz = /bits/ 64 <375000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-500000000 {
+ opp-hz = /bits/ 64 <500000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8450-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8450-mdss.yaml
new file mode 100644
index 000000000000..f26eb5643aed
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8450-mdss.yaml
@@ -0,0 +1,345 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8450-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8450 Display MDSS
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+description:
+  SM8450 MSM Mobile Display Subsystem (MDSS), which encapsulates sub-blocks
+  such as the DPU display controller and the DSI and DP interfaces.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8450-mdss
+
+ clocks:
+ items:
+ - description: Display AHB
+ - description: Display hf AXI
+ - description: Display sf AXI
+ - description: Display core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 2
+
+ interconnect-names:
+ maxItems: 2
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8450-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sm8450-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8450-dsi-phy-5nm
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm8450-dispcc.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8450.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8450.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ compatible = "qcom,sm8450-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+
+ interconnects = <&mmss_noc MASTER_MDP_DISP 0 &mc_virt SLAVE_EBI1_DISP 0>,
+ <&mmss_noc MASTER_MDP_DISP 0 &mc_virt SLAVE_EBI1_DISP 0>;
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
+ resets = <&dispcc DISP_CC_MDSS_CORE_BCR>;
+
+ power-domains = <&dispcc MDSS_GDSC>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "nrt_bus", "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ iommus = <&apps_smmu 0x2800 0x402>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8450-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&gcc GCC_DISP_SF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8450_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-172000000 {
+ opp-hz = /bits/ 64 <172000000>;
+ required-opps = <&rpmhpd_opp_low_svs_d1>;
+ };
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-325000000 {
+ opp-hz = /bits/ 64 <325000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-375000000 {
+ opp-hz = /bits/ 64 <375000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-500000000 {
+ opp-hz = /bits/ 64 <500000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sm8450-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8450_MMCX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+
+ dsi_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-160310000 {
+ opp-hz = /bits/ 64 <160310000>;
+ required-opps = <&rpmhpd_opp_low_svs_d1>;
+ };
+
+ opp-187500000 {
+ opp-hz = /bits/ 64 <187500000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-358000000 {
+ opp-hz = /bits/ 64 <358000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
+
+ dsi0_phy: phy@ae94400 {
+ compatible = "qcom,sm8450-dsi-phy-5nm";
+ reg = <0x0ae94400 0x200>,
+ <0x0ae94600 0x280>,
+ <0x0ae94900 0x260>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+
+ dsi@ae96000 {
+ compatible = "qcom,sm8450-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae96000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <5>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE1_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC1_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK_SRC>;
+ assigned-clock-parents = <&dsi1_phy 0>, <&dsi1_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8450_MMCX>;
+
+ phys = <&dsi1_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi1_in: endpoint {
+ remote-endpoint = <&dpu_intf2_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi1_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi1_phy: phy@ae96400 {
+ compatible = "qcom,sm8450-dsi-phy-5nm";
+ reg = <0x0ae96400 0x200>,
+ <0x0ae96600 0x280>,
+ <0x0ae96900 0x260>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ vdds-supply = <&vreg_dsi_phy>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8550-dpu.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8550-dpu.yaml
new file mode 100644
index 000000000000..ff58a747bb6f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8550-dpu.yaml
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8550-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8550 Display DPU
+
+maintainers:
+ - Neil Armstrong <neil.armstrong@linaro.org>
+
+$ref: /schemas/display/msm/dpu-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8550-dpu
+
+ reg:
+ items:
+ - description: Address offset and size for mdp register set
+ - description: Address offset and size for vbif register set
+
+ reg-names:
+ items:
+ - const: mdp
+ - const: vbif
+
+ clocks:
+ items:
+ - description: Display AHB
+ - description: Display hf axi
+ - description: Display MDSS ahb
+ - description: Display lut
+ - description: Display core
+ - description: Display vsync
+
+ clock-names:
+ items:
+ - const: bus
+ - const: nrt_bus
+ - const: iface
+ - const: lut
+ - const: core
+ - const: vsync
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm8550-dispcc.h>
+ #include <dt-bindings/clock/qcom,sm8550-gcc.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8550-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8550_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-325000000 {
+ opp-hz = /bits/ 64 <325000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-375000000 {
+ opp-hz = /bits/ 64 <375000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-514000000 {
+ opp-hz = /bits/ 64 <514000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/msm/qcom,sm8550-mdss.yaml b/Documentation/devicetree/bindings/display/msm/qcom,sm8550-mdss.yaml
new file mode 100644
index 000000000000..887be33ba108
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/msm/qcom,sm8550-mdss.yaml
@@ -0,0 +1,333 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/msm/qcom,sm8550-mdss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm SM8550 Display MDSS
+
+maintainers:
+ - Neil Armstrong <neil.armstrong@linaro.org>
+
+description:
+  SM8550 MSM Mobile Display Subsystem (MDSS), which encapsulates sub-blocks
+  such as the DPU display controller and the DSI and DP interfaces.
+
+$ref: /schemas/display/msm/mdss-common.yaml#
+
+properties:
+ compatible:
+ const: qcom,sm8550-mdss
+
+ clocks:
+ items:
+ - description: Display MDSS AHB
+ - description: Display AHB
+ - description: Display hf AXI
+ - description: Display core
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 2
+
+ interconnect-names:
+ maxItems: 2
+
+patternProperties:
+ "^display-controller@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8550-dpu
+
+ "^dsi@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ items:
+ - const: qcom,sm8550-dsi-ctrl
+ - const: qcom,mdss-dsi-ctrl
+
+ "^phy@[0-9a-f]+$":
+ type: object
+ properties:
+ compatible:
+ const: qcom,sm8550-dsi-phy-4nm
+
+required:
+ - compatible
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,sm8550-dispcc.h>
+ #include <dt-bindings/clock/qcom,sm8550-gcc.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interconnect/qcom,sm8550-rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ display-subsystem@ae00000 {
+ compatible = "qcom,sm8550-mdss";
+ reg = <0x0ae00000 0x1000>;
+ reg-names = "mdss";
+
+ interconnects = <&mmss_noc MASTER_MDP 0 &gem_noc SLAVE_LLCC 0>,
+ <&mc_virt MASTER_LLCC 0 &mc_virt SLAVE_EBI1 0>;
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
+ resets = <&dispcc DISP_CC_MDSS_CORE_BCR>;
+
+ power-domains = <&dispcc MDSS_GDSC>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>;
+ clock-names = "iface", "bus", "nrt_bus", "core";
+
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+
+ iommus = <&apps_smmu 0x1c00 0x2>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ display-controller@ae01000 {
+ compatible = "qcom,sm8550-dpu";
+ reg = <0x0ae01000 0x8f000>,
+ <0x0aeb0000 0x2008>;
+ reg-names = "mdp", "vbif";
+
+ clocks = <&gcc GCC_DISP_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_LUT_CLK>,
+ <&dispcc DISP_CC_MDSS_MDP_CLK>,
+ <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ clock-names = "bus",
+ "nrt_bus",
+ "iface",
+ "lut",
+ "core",
+ "vsync";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_VSYNC_CLK>;
+ assigned-clock-rates = <19200000>;
+
+ operating-points-v2 = <&mdp_opp_table>;
+ power-domains = <&rpmhpd SM8550_MMCX>;
+
+ interrupt-parent = <&mdss>;
+ interrupts = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dpu_intf1_out: endpoint {
+ remote-endpoint = <&dsi0_in>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dpu_intf2_out: endpoint {
+ remote-endpoint = <&dsi1_in>;
+ };
+ };
+ };
+
+ mdp_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-200000000 {
+ opp-hz = /bits/ 64 <200000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-325000000 {
+ opp-hz = /bits/ 64 <325000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-375000000 {
+ opp-hz = /bits/ 64 <375000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+
+ opp-514000000 {
+ opp-hz = /bits/ 64 <514000000>;
+ required-opps = <&rpmhpd_opp_nom>;
+ };
+ };
+ };
+
+ dsi@ae94000 {
+ compatible = "qcom,sm8550-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae94000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <4>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE0_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC0_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE0_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK0_CLK_SRC>;
+ assigned-clock-parents = <&dsi0_phy 0>, <&dsi0_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8550_MMCX>;
+
+ phys = <&dsi0_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi0_in: endpoint {
+ remote-endpoint = <&dpu_intf1_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi0_out: endpoint {
+ };
+ };
+ };
+
+ dsi_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-187500000 {
+ opp-hz = /bits/ 64 <187500000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-300000000 {
+ opp-hz = /bits/ 64 <300000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-358000000 {
+ opp-hz = /bits/ 64 <358000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
+
+ dsi0_phy: phy@ae95000 {
+ compatible = "qcom,sm8550-dsi-phy-4nm";
+ reg = <0x0ae95000 0x200>,
+ <0x0ae95200 0x280>,
+ <0x0ae95500 0x400>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ };
+
+ dsi@ae96000 {
+ compatible = "qcom,sm8550-dsi-ctrl", "qcom,mdss-dsi-ctrl";
+ reg = <0x0ae96000 0x400>;
+ reg-names = "dsi_ctrl";
+
+ interrupt-parent = <&mdss>;
+ interrupts = <5>;
+
+ clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK>,
+ <&dispcc DISP_CC_MDSS_BYTE1_INTF_CLK>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK>,
+ <&dispcc DISP_CC_MDSS_ESC1_CLK>,
+ <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&gcc GCC_DISP_HF_AXI_CLK>;
+ clock-names = "byte",
+ "byte_intf",
+ "pixel",
+ "core",
+ "iface",
+ "bus";
+
+ assigned-clocks = <&dispcc DISP_CC_MDSS_BYTE1_CLK_SRC>,
+ <&dispcc DISP_CC_MDSS_PCLK1_CLK_SRC>;
+ assigned-clock-parents = <&dsi1_phy 0>, <&dsi1_phy 1>;
+
+ operating-points-v2 = <&dsi_opp_table>;
+ power-domains = <&rpmhpd SM8550_MMCX>;
+
+ phys = <&dsi1_phy>;
+ phy-names = "dsi";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi1_in: endpoint {
+ remote-endpoint = <&dpu_intf2_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi1_out: endpoint {
+ };
+ };
+ };
+ };
+
+ dsi1_phy: phy@ae97000 {
+ compatible = "qcom,sm8550-dsi-phy-4nm";
+ reg = <0x0ae97000 0x200>,
+ <0x0ae97200 0x280>,
+ <0x0ae97500 0x400>;
+ reg-names = "dsi_phy",
+ "dsi_phy_lane",
+ "dsi_pll";
+
+ #clock-cells = <1>;
+ #phy-cells = <0>;
+
+ clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "ref";
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml
index a108029ecfab..acd2f3faa6b9 100644
--- a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml
+++ b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Asia Better Technology 3.0" (320x480 pixels) 24-bit IPS LCD panel
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Paul Cercueil <paul@crapouillou.net>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/display/panel/advantech,idk-1110wr.yaml b/Documentation/devicetree/bindings/display/panel/advantech,idk-1110wr.yaml
index 93878c2cd370..f6fea9085aab 100644
--- a/Documentation/devicetree/bindings/display/panel/advantech,idk-1110wr.yaml
+++ b/Documentation/devicetree/bindings/display/panel/advantech,idk-1110wr.yaml
@@ -11,13 +11,23 @@ maintainers:
- Thierry Reding <thierry.reding@gmail.com>
allOf:
- - $ref: lvds.yaml#
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/display/lvds.yaml#
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: advantech,idk-1110wr
+
+ required:
+ - compatible
properties:
compatible:
items:
- const: advantech,idk-1110wr
- - {} # panel-lvds, but not listed here to avoid false select
+ - const: panel-lvds
data-mapping:
const: jeida-24
@@ -35,6 +45,11 @@ additionalProperties: false
required:
- compatible
+ - data-mapping
+ - width-mm
+ - height-mm
+ - panel-timing
+ - port
examples:
- |+
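
The select clause added above changes when this schema is applied: without it, naming panel-lvds in the compatible items would make the schema match every panel-lvds node in a tree, which is why the second entry used to be an anonymous {} placeholder. With select keyed on advantech,idk-1110wr, the schema only triggers for this panel, so panel-lvds can now be listed explicitly. A conforming node, with the newly required properties stubbed out:

    panel {
        compatible = "advantech,idk-1110wr", "panel-lvds";
        data-mapping = "jeida-24";
        /* width-mm, height-mm, panel-timing and port as required above */
    };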
diff --git a/Documentation/devicetree/bindings/display/panel/arm,rtsm-display.yaml b/Documentation/devicetree/bindings/display/panel/arm,rtsm-display.yaml
new file mode 100644
index 000000000000..4ad484f09ba3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/arm,rtsm-display.yaml
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/arm,rtsm-display.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm RTSM Virtual Platforms Display
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: arm,rtsm-display
+
+ port: true
+
+required:
+ - compatible
+ - port
+
+additionalProperties: false
+
+...
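
Unlike most panel schemas in this series, this one carries no example. A minimal node satisfying the two required properties would look like the sketch below; the remote endpoint label is a placeholder:

    panel {
        compatible = "arm,rtsm-display";

        port {
            panel_in: endpoint {
                remote-endpoint = <&display_pads>;  /* placeholder */
            };
        };
    };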
diff --git a/Documentation/devicetree/bindings/display/panel/arm,versatile-tft-panel.yaml b/Documentation/devicetree/bindings/display/panel/arm,versatile-tft-panel.yaml
index be69e0cc50fc..c9958f824d9a 100644
--- a/Documentation/devicetree/bindings/display/panel/arm,versatile-tft-panel.yaml
+++ b/Documentation/devicetree/bindings/display/panel/arm,versatile-tft-panel.yaml
@@ -37,9 +37,6 @@ examples:
compatible = "arm,versatile-sysreg", "syscon", "simple-mfd";
reg = <0x00000 0x1000>;
- #address-cells = <1>;
- #size-cells = <0>;
-
panel {
compatible = "arm,versatile-tft-panel";
diff --git a/Documentation/devicetree/bindings/display/panel/auo,a030jtn01.yaml b/Documentation/devicetree/bindings/display/panel/auo,a030jtn01.yaml
new file mode 100644
index 000000000000..86c834eb4d98
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/auo,a030jtn01.yaml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/auo,a030jtn01.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AUO A030JTN01 3.0" (320x480 pixels) 24-bit TFT LCD panel
+
+description: |
+  Delta RGB 8-bit panel found in some retro-game handhelds.
+
+maintainers:
+ - Paul Cercueil <paul@crapouillou.net>
+ - Christophe Branchereau <cbranchereau@gmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ const: auo,a030jtn01
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - power-supply
+ - reset-gpios
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "auo,a030jtn01";
+ reg = <0>;
+
+ spi-max-frequency = <10000000>;
+
+ reset-gpios = <&gpe 4 GPIO_ACTIVE_LOW>;
+ power-supply = <&lcd_power>;
+
+ backlight = <&backlight>;
+
+ port {
+ panel_input: endpoint {
+ remote-endpoint = <&panel_output>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/boe,bf060y8m-aj0.yaml b/Documentation/devicetree/bindings/display/panel/boe,bf060y8m-aj0.yaml
new file mode 100644
index 000000000000..a8f3afa922c8
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/boe,bf060y8m-aj0.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/boe,bf060y8m-aj0.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: BOE BF060Y8M-AJ0 5.99" 1080x2160 AMOLED Panel
+
+maintainers:
+ - AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
+
+description: |
+  This is a 5.99" 1080x2160 16.7M-color active-matrix AMOLED
+  video-mode panel module with a 4-lane MIPI-DSI interface, a
+  GGRB pixel arrangement and a 63 micrometer pitch, and an
+  active area of 68.04 x 136.08 millimeters.
+  Each pixel is divided into red and green dots, or blue and
+  green dots; two pixels share the red or blue dots, which are
+  arranged in a vertical stripe.
+  The driver IC for this panel module is the SW43404.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: boe,bf060y8m-aj0
+
+ elvdd-supply:
+ description: EL Driving positive (VDD) supply (4.40-4.80V)
+ elvss-supply:
+ description: EL Driving negative (VSS) supply (-5.00V to -1.40V)
+ vcc-supply:
+ description: Core (TSP) voltage supply (2.70-3.60V)
+ vci-supply:
+ description: DriverIC Operation supply (2.60-3.60V)
+ vddio-supply:
+ description: I/O voltage supply (1.62-1.98V)
+
+ port: true
+ reg: true
+ reset-gpios: true
+
+required:
+ - compatible
+ - elvdd-supply
+ - elvss-supply
+ - vcc-supply
+ - vci-supply
+ - vddio-supply
+ - reg
+ - reset-gpios
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "boe,bf060y8m-aj0";
+ reg = <0>;
+
+ reset-gpios = <&tlmm 94 GPIO_ACTIVE_HIGH>;
+
+ vcc-supply = <&disp_vcc_vreg>;
+ vddio-supply = <&disp_vddio_vreg>;
+ vci-supply = <&disp_vci_vreg>;
+ elvdd-supply = <&disp_elvdd_vreg>;
+ elvss-supply = <&disp_elvss_vreg>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/boe,tv101wum-nl6.yaml b/Documentation/devicetree/bindings/display/panel/boe,tv101wum-nl6.yaml
index b87a2e28c866..aed55608ebf6 100644
--- a/Documentation/devicetree/bindings/display/panel/boe,tv101wum-nl6.yaml
+++ b/Documentation/devicetree/bindings/display/panel/boe,tv101wum-nl6.yaml
@@ -26,6 +26,12 @@ properties:
- auo,b101uan08.3
# BOE TV105WUM-NW0 10.5" WUXGA TFT LCD panel
- boe,tv105wum-nw0
+ # BOE TV110C9M-LL3 10.95" WUXGA TFT LCD panel
+ - boe,tv110c9m-ll3
+ # INX HJ110IZ-01A 10.95" WUXGA TFT LCD panel
+ - innolux,hj110iz-01a
+ # STARRY 2081101QFH032011-53G 10.1" WUXGA TFT LCD panel
+ - starry,2081101qfh032011-53g
reg:
description: the virtual channel number of a DSI peripheral
@@ -36,6 +42,9 @@ properties:
pp1800-supply:
description: core voltage supply
+ pp3300-supply:
+ description: core voltage supply
+
avdd-supply:
description: phandle of the regulator that provides positive voltage
@@ -46,6 +55,7 @@ properties:
description: phandle of the backlight device attached to the panel
port: true
+ rotation: true
required:
- compatible
diff --git a/Documentation/devicetree/bindings/display/panel/display-timings.yaml b/Documentation/devicetree/bindings/display/panel/display-timings.yaml
index 56903ded005e..dc5f7e36e30b 100644
--- a/Documentation/devicetree/bindings/display/panel/display-timings.yaml
+++ b/Documentation/devicetree/bindings/display/panel/display-timings.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/panel/display-timings.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: display timings bindings
+title: display timings
maintainers:
- Thierry Reding <thierry.reding@gmail.com>
@@ -31,8 +31,7 @@ properties:
patternProperties:
"^timing":
type: object
- allOf:
- - $ref: panel-timing.yaml#
+ $ref: panel-timing.yaml#
additionalProperties: false
diff --git a/Documentation/devicetree/bindings/display/panel/ebbg,ft8719.yaml b/Documentation/devicetree/bindings/display/panel/ebbg,ft8719.yaml
new file mode 100644
index 000000000000..80deedc01c7c
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/ebbg,ft8719.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/ebbg,ft8719.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: EBBG FT8719 MIPI-DSI LCD panel
+
+maintainers:
+ - Joel Selvaraj <jo@jsfamily.in>
+
+description: |
+  The FT8719 panel from EBBG is an FHD+ LCD panel with a resolution of
+  1080x2246. It is a video-mode DSI panel. The backlight is managed
+  through the QCOM WLED driver.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: ebbg,ft8719
+
+ reg:
+ maxItems: 1
+ description: DSI virtual channel of the peripheral
+
+ vddio-supply:
+ description: power IC supply regulator
+
+ vddpos-supply:
+ description: positive boost supply regulator
+
+ vddneg-supply:
+ description: negative boost supply regulator
+
+required:
+ - compatible
+ - reg
+ - vddio-supply
+ - vddpos-supply
+ - vddneg-supply
+ - reset-gpios
+ - port
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "ebbg,ft8719";
+ reg = <0>;
+
+ vddio-supply = <&vreg_l14a_1p88>;
+ vddpos-supply = <&lab>;
+ vddneg-supply = <&ibb>;
+
+ reset-gpios = <&tlmm 6 GPIO_ACTIVE_LOW>;
+
+ backlight = <&pmi8998_wled>;
+
+ port {
+ ebbg_ft8719_in_0: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/elida,kd35t133.yaml b/Documentation/devicetree/bindings/display/panel/elida,kd35t133.yaml
index 7adb83e2e8d9..265ab6d30572 100644
--- a/Documentation/devicetree/bindings/display/panel/elida,kd35t133.yaml
+++ b/Documentation/devicetree/bindings/display/panel/elida,kd35t133.yaml
@@ -17,7 +17,9 @@ properties:
const: elida,kd35t133
reg: true
backlight: true
+ port: true
reset-gpios: true
+ rotation: true
iovcc-supply:
description: regulator that supplies the iovcc voltage
vdd-supply:
@@ -27,6 +29,7 @@ required:
- compatible
- reg
- backlight
+ - port
- iovcc-supply
- vdd-supply
@@ -43,6 +46,12 @@ examples:
backlight = <&backlight>;
iovcc-supply = <&vcc_1v8>;
vdd-supply = <&vcc3v3_lcd>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
};
};
diff --git a/Documentation/devicetree/bindings/display/panel/feiyang,fy07024di26a30d.yaml b/Documentation/devicetree/bindings/display/panel/feiyang,fy07024di26a30d.yaml
index 95acf9e96f1c..92df69e80a82 100644
--- a/Documentation/devicetree/bindings/display/panel/feiyang,fy07024di26a30d.yaml
+++ b/Documentation/devicetree/bindings/display/panel/feiyang,fy07024di26a30d.yaml
@@ -26,6 +26,7 @@ properties:
dvdd-supply:
description: 3v3 digital regulator
+ port: true
reset-gpios: true
backlight: true
@@ -35,7 +36,7 @@ required:
- reg
- avdd-supply
- dvdd-supply
- - reset-gpios
+ - port
additionalProperties: false
@@ -54,5 +55,11 @@ examples:
dvdd-supply = <&reg_dldo2>;
reset-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* LCD-RST: PD24 */
backlight = <&backlight>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
};
};
diff --git a/Documentation/devicetree/bindings/display/panel/focaltech,gpt3.yaml b/Documentation/devicetree/bindings/display/panel/focaltech,gpt3.yaml
new file mode 100644
index 000000000000..d54e96b2a9e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/focaltech,gpt3.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/focaltech,gpt3.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Focaltech GPT3 3.0" (640x480 pixels) IPS LCD panel
+
+maintainers:
+ - Christophe Branchereau <cbranchereau@gmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ const: focaltech,gpt3
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - power-supply
+ - reset-gpios
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "focaltech,gpt3";
+ reg = <0>;
+
+ spi-max-frequency = <3125000>;
+
+ reset-gpios = <&gpe 2 GPIO_ACTIVE_LOW>;
+
+ backlight = <&backlight>;
+ power-supply = <&vcc>;
+
+ port {
+ panel_input: endpoint {
+ remote-endpoint = <&panel_output>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/himax,hx8394.yaml b/Documentation/devicetree/bindings/display/panel/himax,hx8394.yaml
new file mode 100644
index 000000000000..1b2a1baa26f9
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/himax,hx8394.yaml
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/himax,hx8394.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Himax HX8394 MIPI-DSI LCD panel controller
+
+maintainers:
+ - Ondrej Jirman <megi@xff.cz>
+ - Javier Martinez Canillas <javierm@redhat.com>
+
+description:
+ Device tree bindings for panels based on the Himax HX8394 controller,
+ such as the HannStar HSD060BHW4 720x1440 TFT LCD panel connected with
+ a MIPI-DSI video interface.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - hannstar,hsd060bhw4
+ - const: himax,hx8394
+
+ reg: true
+
+ reset-gpios: true
+
+ backlight: true
+
+ port: true
+
+ vcc-supply:
+ description: Panel power supply
+
+ iovcc-supply:
+ description: I/O voltage supply
+
+required:
+ - compatible
+ - reg
+ - reset-gpios
+ - backlight
+ - port
+ - vcc-supply
+ - iovcc-supply
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "hannstar,hsd060bhw4", "himax,hx8394";
+ reg = <0>;
+ vcc-supply = <&reg_2v8_p>;
+ iovcc-supply = <&reg_1v8_p>;
+ reset-gpios = <&gpio3 13 GPIO_ACTIVE_LOW>;
+ backlight = <&backlight>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9163.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9163.yaml
new file mode 100644
index 000000000000..90e323e19edb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9163.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/ilitek,ili9163.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ilitek ILI9163 display panels
+
+maintainers:
+ - Daniel Mack <daniel@zonque.org>
+
+description:
+ This binding is for display panels using an Ilitek ILI9163 controller in SPI
+ mode.
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - newhaven,1.8-128160EF
+ - const: ilitek,ili9163
+
+ spi-max-frequency:
+ maximum: 32000000
+
+ dc-gpios:
+ maxItems: 1
+ description: Display data/command selection (D/CX)
+
+ backlight: true
+ reg: true
+ reset-gpios: true
+ rotation: true
+
+required:
+ - compatible
+ - reg
+ - dc-gpios
+ - reset-gpios
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ backlight: backlight {
+ compatible = "gpio-backlight";
+ gpios = <&gpio 22 GPIO_ACTIVE_HIGH>;
+ };
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ display@0 {
+ compatible = "newhaven,1.8-128160EF", "ilitek,ili9163";
+ reg = <0>;
+ spi-max-frequency = <32000000>;
+ dc-gpios = <&gpio0 24 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio0 25 GPIO_ACTIVE_HIGH>;
+ rotation = <180>;
+ backlight = <&backlight>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml
index e89c1ea62ffa..7d221ef35443 100644
--- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml
+++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml
@@ -15,11 +15,9 @@ description: |
960 TFT source driver pins and 240 TFT gate driver pins, VCOM, VCOML and
VCOMH outputs.
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml
index 20ce88ab4b3a..94f169ea065a 100644
--- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml
+++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9341.yaml
@@ -16,13 +16,16 @@ description: |
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
items:
- enum:
+ - adafruit,yx240qv29
# ili9341 240*320 Color on stm32f429-disco board
- st,sf-tc240t-9370-t
+ - canaan,kd233-tft
- const: ilitek,ili9341
reg: true
@@ -47,32 +50,50 @@ properties:
vddi-led-supply:
description: Voltage supply for the LED driver (1.65 .. 3.3 V)
-additionalProperties: false
+unevaluatedProperties: false
required:
- compatible
- reg
- dc-gpios
- - port
+
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - st,sf-tc240t-9370-t
+then:
+ required:
+ - port
examples:
- |+
+ #include <dt-bindings/gpio/gpio.h>
spi {
#address-cells = <1>;
#size-cells = <0>;
panel: display@0 {
- compatible = "st,sf-tc240t-9370-t",
- "ilitek,ili9341";
- reg = <0>;
- spi-3wire;
- spi-max-frequency = <10000000>;
- dc-gpios = <&gpiod 13 0>;
- port {
- panel_in: endpoint {
- remote-endpoint = <&display_out>;
- };
- };
- };
+ compatible = "st,sf-tc240t-9370-t",
+ "ilitek,ili9341";
+ reg = <0>;
+ spi-3wire;
+ spi-max-frequency = <10000000>;
+ dc-gpios = <&gpiod 13 0>;
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&display_out>;
+ };
+ };
};
+ display@1 {
+ compatible = "adafruit,yx240qv29", "ilitek,ili9341";
+ reg = <1>;
+ spi-max-frequency = <10000000>;
+ dc-gpios = <&gpio0 9 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio0 8 GPIO_ACTIVE_HIGH>;
+ rotation = <270>;
+ backlight = <&backlight>;
+ };
+ };
...
-
diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9881c.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9881c.yaml
index b2fcec4f22fd..c5d1df680858 100644
--- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9881c.yaml
+++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9881c.yaml
@@ -9,24 +9,28 @@ title: Ilitek ILI9881c based MIPI-DSI panels
maintainers:
- Maxime Ripard <mripard@kernel.org>
+allOf:
+ - $ref: panel-common.yaml#
+
properties:
compatible:
items:
- enum:
- bananapi,lhr050h41
- feixin,k101-im2byl02
+ - wanchanglong,w552946aba
- const: ilitek,ili9881c
backlight: true
power-supply: true
reg: true
reset-gpios: true
+ rotation: true
required:
- compatible
- power-supply
- reg
- - reset-gpios
additionalProperties: false
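With the relaxed schema above, an ILI9881c panel node may omit reset-gpios and can declare its mounting orientation through rotation. A minimal sketch of such a node, assuming the wanchanglong panel added above; the regulator and GPIO phandles are hypothetical:

    dsi {
        #address-cells = <1>;
        #size-cells = <0>;

        panel@0 {
            compatible = "wanchanglong,w552946aba", "ilitek,ili9881c";
            reg = <0>;
            power-supply = <&reg_display>;  /* hypothetical regulator label */
            backlight = <&backlight>;
            rotation = <180>;               /* panel mounted upside down */
        };
    };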
diff --git a/Documentation/devicetree/bindings/display/panel/innolux,ee101ia-01d.yaml b/Documentation/devicetree/bindings/display/panel/innolux,ee101ia-01d.yaml
index a69681e724cb..ab6b7be88341 100644
--- a/Documentation/devicetree/bindings/display/panel/innolux,ee101ia-01d.yaml
+++ b/Documentation/devicetree/bindings/display/panel/innolux,ee101ia-01d.yaml
@@ -11,15 +11,26 @@ maintainers:
- Thierry Reding <thierry.reding@gmail.com>
allOf:
- - $ref: lvds.yaml#
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/display/lvds.yaml#
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: innolux,ee101ia-01d
+
+ required:
+ - compatible
properties:
compatible:
items:
- const: innolux,ee101ia-01d
- - {} # panel-lvds, but not listed here to avoid false select
+ - const: panel-lvds
backlight: true
+ data-mapping: true
enable-gpios: true
power-supply: true
width-mm: true
@@ -27,5 +38,13 @@ properties:
panel-timing: true
port: true
+required:
+ - compatible
+ - data-mapping
+ - width-mm
+ - height-mm
+ - panel-timing
+ - port
+
additionalProperties: false
...
diff --git a/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml b/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml
index cda36c04e85c..72788e3e6c59 100644
--- a/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml
+++ b/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Innolux EJ030NA 3.0" (320x480 pixels) 24-bit TFT LCD panel
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Paul Cercueil <paul@crapouillou.net>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.yaml b/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.yaml
deleted file mode 100644
index 243dac2416f3..000000000000
--- a/Documentation/devicetree/bindings/display/panel/innolux,p120zdg-bf1.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/display/panel/innolux,p120zdg-bf1.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Innolux P120ZDG-BF1 12.02 inch eDP 2K display panel
-
-maintainers:
- - Sandeep Panda <spanda@codeaurora.org>
- - Douglas Anderson <dianders@chromium.org>
-
-allOf:
- - $ref: panel-common.yaml#
-
-properties:
- compatible:
- const: innolux,p120zdg-bf1
-
- enable-gpios: true
- power-supply: true
- backlight: true
- no-hpd: true
-
-required:
- - compatible
- - power-supply
-
-additionalProperties: false
-
-examples:
- - |
- #include <dt-bindings/gpio/gpio.h>
-
- panel_edp: panel-edp {
- compatible = "innolux,p120zdg-bf1";
- enable-gpios = <&msmgpio 31 GPIO_ACTIVE_LOW>;
- power-supply = <&pm8916_l2>;
- backlight = <&backlight>;
- no-hpd;
- };
-
-...
diff --git a/Documentation/devicetree/bindings/display/panel/jadard,jd9365da-h3.yaml b/Documentation/devicetree/bindings/display/panel/jadard,jd9365da-h3.yaml
new file mode 100644
index 000000000000..41eb7fbf7715
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/jadard,jd9365da-h3.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/jadard,jd9365da-h3.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Jadard JD9365DA-HE WXGA DSI panel
+
+maintainers:
+ - Jagan Teki <jagan@edgeble.ai>
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - chongzhou,cz101b4001
+ - radxa,display-10hd-ad001
+ - radxa,display-8hd-ad002
+ - const: jadard,jd9365da-h3
+
+ reg: true
+
+ vdd-supply:
+ description: supply regulator for VDD, usually 3.3V
+
+ vccio-supply:
+ description: supply regulator for VCCIO, usually 1.8V
+
+ reset-gpios: true
+
+ backlight: true
+
+ port: true
+
+required:
+ - compatible
+ - reg
+ - vdd-supply
+ - vccio-supply
+ - reset-gpios
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/pinctrl/rockchip.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "chongzhou,cz101b4001", "jadard,jd9365da-h3";
+ reg = <0>;
+ vdd-supply = <&lcd_3v3>;
+ vccio-supply = <&vcca_1v8>;
+ reset-gpios = <&gpio1 RK_PC2 GPIO_ACTIVE_HIGH>;
+ backlight = <&backlight>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/jdi,lt070me05000.yaml b/Documentation/devicetree/bindings/display/panel/jdi,lt070me05000.yaml
index 4f92365e888a..63c82a4378ff 100644
--- a/Documentation/devicetree/bindings/display/panel/jdi,lt070me05000.yaml
+++ b/Documentation/devicetree/bindings/display/panel/jdi,lt070me05000.yaml
@@ -35,6 +35,8 @@ properties:
phandle of the gpio for power ic line
Power IC supply enable, High active
+ port: true
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml b/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml
index c45c92a3d41f..b4be9bd8ddde 100644
--- a/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml
+++ b/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: King Display KD035G6-54NT 3.5" (320x240 pixels) 24-bit TFT LCD panel
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Paul Cercueil <paul@crapouillou.net>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -26,6 +23,8 @@ properties:
reg: true
reset-gpios: true
+ spi-3wire: true
+
required:
- compatible
- power-supply
diff --git a/Documentation/devicetree/bindings/display/panel/leadtek,ltk035c5444t.yaml b/Documentation/devicetree/bindings/display/panel/leadtek,ltk035c5444t.yaml
new file mode 100644
index 000000000000..ebdca5f5a001
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/leadtek,ltk035c5444t.yaml
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/leadtek,ltk035c5444t.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Leadtek ltk035c5444t 3.5" (640x480 pixels) 24-bit IPS LCD panel
+
+maintainers:
+ - Paul Cercueil <paul@crapouillou.net>
+ - Christophe Branchereau <cbranchereau@gmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ const: leadtek,ltk035c5444t
+
+ backlight: true
+ port: true
+ power-supply: true
+ reg: true
+ reset-gpios: true
+
+ spi-3wire: true
+
+required:
+ - compatible
+ - power-supply
+ - reset-gpios
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "leadtek,ltk035c5444t";
+ reg = <0>;
+
+ spi-3wire;
+ spi-max-frequency = <3125000>;
+
+ reset-gpios = <&gpe 2 GPIO_ACTIVE_LOW>;
+
+ backlight = <&backlight>;
+ power-supply = <&vcc>;
+
+ port {
+ panel_input: endpoint {
+ remote-endpoint = <&panel_output>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/leadtek,ltk050h3146w.yaml b/Documentation/devicetree/bindings/display/panel/leadtek,ltk050h3146w.yaml
index 3715882b63b6..3f6efbb942da 100644
--- a/Documentation/devicetree/bindings/display/panel/leadtek,ltk050h3146w.yaml
+++ b/Documentation/devicetree/bindings/display/panel/leadtek,ltk050h3146w.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Leadtek LTK050H3146W 5.0in 720x1280 DSI panel
maintainers:
- - Heiko Stuebner <heiko.stuebner@theobroma-systems.com>
+ - Quentin Schulz <quentin.schulz@theobroma-systems.com>
allOf:
- $ref: panel-common.yaml#
diff --git a/Documentation/devicetree/bindings/display/panel/lg,lg4573.yaml b/Documentation/devicetree/bindings/display/panel/lg,lg4573.yaml
index b4314ce7b411..ee357e139ac0 100644
--- a/Documentation/devicetree/bindings/display/panel/lg,lg4573.yaml
+++ b/Documentation/devicetree/bindings/display/panel/lg,lg4573.yaml
@@ -15,13 +15,13 @@ maintainers:
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
const: lg,lg4573
reg: true
- spi-max-frequency: true
required:
- compatible
diff --git a/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml b/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml
index 830e335ddb53..628c4b898111 100644
--- a/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml
+++ b/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: LG.Philips LB035Q02 Panel
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Tomi Valkeinen <tomi.valkeinen@ti.com>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -24,6 +21,9 @@ properties:
enable-gpios: true
port: true
+ spi-cpha: true
+ spi-cpol: true
+
required:
- compatible
- enable-gpios
diff --git a/Documentation/devicetree/bindings/display/panel/mitsubishi,aa104xd12.yaml b/Documentation/devicetree/bindings/display/panel/mitsubishi,aa104xd12.yaml
index b5e7ee230fa6..3623ffa6518d 100644
--- a/Documentation/devicetree/bindings/display/panel/mitsubishi,aa104xd12.yaml
+++ b/Documentation/devicetree/bindings/display/panel/mitsubishi,aa104xd12.yaml
@@ -11,13 +11,23 @@ maintainers:
- Thierry Reding <thierry.reding@gmail.com>
allOf:
- - $ref: lvds.yaml#
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/display/lvds.yaml#
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: mitsubishi,aa104xd12
+
+ required:
+ - compatible
properties:
compatible:
items:
- const: mitsubishi,aa104xd12
- - {} # panel-lvds, but not listed here to avoid false select
+ - const: panel-lvds
vcc-supply:
description: Reference to the regulator powering the panel VCC pins.
@@ -39,6 +49,11 @@ additionalProperties: false
required:
- compatible
- vcc-supply
+ - data-mapping
+ - width-mm
+ - height-mm
+ - panel-timing
+ - port
examples:
- |+
diff --git a/Documentation/devicetree/bindings/display/panel/mitsubishi,aa121td01.yaml b/Documentation/devicetree/bindings/display/panel/mitsubishi,aa121td01.yaml
index 977c50a85b67..37f01d847aac 100644
--- a/Documentation/devicetree/bindings/display/panel/mitsubishi,aa121td01.yaml
+++ b/Documentation/devicetree/bindings/display/panel/mitsubishi,aa121td01.yaml
@@ -11,13 +11,23 @@ maintainers:
- Thierry Reding <thierry.reding@gmail.com>
allOf:
- - $ref: lvds.yaml#
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/display/lvds.yaml#
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: mitsubishi,aa121td01
+
+ required:
+ - compatible
properties:
compatible:
items:
- const: mitsubishi,aa121td01
- - {} # panel-lvds, but not listed here to avoid false select
+ - const: panel-lvds
vcc-supply:
description: Reference to the regulator powering the panel VCC pins.
@@ -39,6 +49,11 @@ additionalProperties: false
required:
- compatible
- vcc-supply
+ - data-mapping
+ - width-mm
+ - height-mm
+ - panel-timing
+ - port
examples:
- |+
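The reworked select clauses above make the panel-lvds fallback explicit for both Mitsubishi panels (and for the Innolux EE101IA-01D), and promote the common LVDS properties to required. A sketch of a conforming aa104xd12 node; the timing values are illustrative placeholders rather than datasheet numbers, and the phandles are hypothetical:

    panel {
        compatible = "mitsubishi,aa104xd12", "panel-lvds";
        vcc-supply = <&vcc_3v3>;
        width-mm = <211>;
        height-mm = <158>;
        data-mapping = "jeida-24";

        panel-timing {
            /* illustrative XGA timing only - consult the datasheet */
            clock-frequency = <65000000>;
            hactive = <1024>;
            vactive = <768>;
            hsync-len = <136>;
            hfront-porch = <24>;
            hback-porch = <160>;
            vsync-len = <6>;
            vfront-porch = <3>;
            vback-porch = <29>;
        };

        port {
            panel_in: endpoint {
                remote-endpoint = <&lvds_out>;
            };
        };
    };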
diff --git a/Documentation/devicetree/bindings/display/panel/nec,nl8048hl11.yaml b/Documentation/devicetree/bindings/display/panel/nec,nl8048hl11.yaml
index aa788eaa2f71..accf933d6e46 100644
--- a/Documentation/devicetree/bindings/display/panel/nec,nl8048hl11.yaml
+++ b/Documentation/devicetree/bindings/display/panel/nec,nl8048hl11.yaml
@@ -15,6 +15,7 @@ maintainers:
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -34,13 +35,13 @@ required:
- reset-gpios
- port
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
#include <dt-bindings/gpio/gpio.h>
- spi0 {
+ spi {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/display/panel/newvision,nv3051d.yaml b/Documentation/devicetree/bindings/display/panel/newvision,nv3051d.yaml
new file mode 100644
index 000000000000..116c1b6030a2
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/newvision,nv3051d.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/newvision,nv3051d.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NewVision NV3051D based LCD panel
+
+description: |
+ The NewVision NV3051D is a driver chip used to drive DSI panels. For now,
+ this driver only supports the 640x480 panels found in the Anbernic RG353
+ based devices.
+
+maintainers:
+ - Chris Morgan <macromorgan@hotmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - anbernic,rg353p-panel
+ - anbernic,rg353v-panel
+ - const: newvision,nv3051d
+
+ reg: true
+ backlight: true
+ port: true
+ reset-gpios:
+ description: Active low reset GPIO
+ vdd-supply: true
+
+required:
+ - compatible
+ - reg
+ - backlight
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "anbernic,rg353p-panel", "newvision,nv3051d";
+ reg = <0>;
+ backlight = <&backlight>;
+ reset-gpios = <&gpio4 0 GPIO_ACTIVE_LOW>;
+ vdd-supply = <&vcc3v3_lcd>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/novatek,nt35950.yaml b/Documentation/devicetree/bindings/display/panel/novatek,nt35950.yaml
new file mode 100644
index 000000000000..377a05d48a02
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/novatek,nt35950.yaml
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/novatek,nt35950.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Novatek NT35950-based display panels
+
+maintainers:
+ - AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
+
+description: |
+ The NT35950 IC from Novatek is a driver IC used to drive MIPI-DSI panels.
+ It has static RAM for content retention in command mode and also supports
+ video mode with VESA Frame Buffer Compression or Display Stream Compression
+ on a single DSI port or on dual DSI ports.
+ This DDIC is also capable of upscaling an input image to the panel's native
+ resolution; for example, it can upscale a 1920x1080 input to 3840x2160 with
+ either bilinear interpolation or pixel duplication.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - sharp,ls055d1sx04
+ - const: novatek,nt35950
+ description: This indicates the panel manufacturer of the panel
+ that is in turn using the NT35950 panel driver. The compatible
+ string determines how the NT35950 panel driver shall be configured
+ to work with the indicated panel. The novatek,nt35950 compatible shall
+ always be provided as a fallback.
+
+ reset-gpios:
+ maxItems: 1
+ description: phandle of the GPIO for the reset line. This should be 8mA;
+ the GPIO can be configured using mux, pinctrl, pinctrl-names (active high)
+
+ avdd-supply:
+ description: positive boost supply regulator
+ avee-supply:
+ description: negative boost supply regulator
+ dvdd-supply:
+ description: regulator that supplies the digital voltage
+ vddio-supply:
+ description: regulator that supplies the I/O voltage
+
+ backlight: true
+ ports: true
+ reg: true
+
+required:
+ - compatible
+ - reg
+ - reset-gpios
+ - avdd-supply
+ - avee-supply
+ - dvdd-supply
+ - vddio-supply
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "sharp,ls055d1sx04", "novatek,nt35950";
+ reg = <0>;
+
+ backlight = <&pmi8998_wled>;
+ reset-gpios = <&tlmm 94 GPIO_ACTIVE_HIGH>;
+
+ avdd-supply = <&lab>;
+ avee-supply = <&ibb>;
+ dvdd-supply = <&disp_dvdd_vreg>;
+ vddio-supply = <&vreg_l14a_1p85>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ panel_in0: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ panel_in1: endpoint {
+ remote-endpoint = <&dsi1_out>;
+ };
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/novatek,nt36523.yaml b/Documentation/devicetree/bindings/display/panel/novatek,nt36523.yaml
new file mode 100644
index 000000000000..0039561ef04c
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/novatek,nt36523.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/novatek,nt36523.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Novatek NT36523 based DSI display Panels
+
+maintainers:
+ - Jianhua Lu <lujianhua000@gmail.com>
+
+description: |
+ The Novatek NT36523 is a generic DSI panel IC used to drive DSI
+ panels. It supports video mode panels from China Star Optoelectronics
+ Technology (CSOT) and BOE Technology.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - xiaomi,elish-boe-nt36523
+ - xiaomi,elish-csot-nt36523
+ - const: novatek,nt36523
+
+ reset-gpios:
+ maxItems: 1
+ description: phandle of the GPIO for the reset line. This should be 8mA
+
+ vddio-supply:
+ description: regulator that supplies the I/O voltage
+
+ reg: true
+ ports: true
+ backlight: true
+
+required:
+ - compatible
+ - reg
+ - vddio-supply
+ - reset-gpios
+ - ports
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "xiaomi,elish-csot-nt36523", "novatek,nt36523";
+ reg = <0>;
+
+ vddio-supply = <&vreg_l14a_1p88>;
+ reset-gpios = <&tlmm 75 GPIO_ACTIVE_LOW>;
+ backlight = <&backlight>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ panel_in_0: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ panel_in_1: endpoint {
+ remote-endpoint = <&dsi1_out>;
+ };
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml b/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml
index ef4c0a24512d..ae821f465e1c 100644
--- a/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml
+++ b/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml
@@ -46,11 +46,12 @@ properties:
reg: true
port: true
+ backlight: true
required:
- compatible
- reg
- - vddi0-supply
+ - vddio-supply
- vddpos-supply
- vddneg-supply
- reset-gpios
@@ -69,14 +70,13 @@ examples:
panel@0 {
compatible = "tianma,fhd-video", "novatek,nt36672a";
reg = <0>;
- vddi0-supply = <&vreg_l14a_1p88>;
+ vddio-supply = <&vreg_l14a_1p88>;
vddpos-supply = <&lab>;
vddneg-supply = <&ibb>;
+ backlight = <&pmi8998_wled>;
reset-gpios = <&tlmm 6 GPIO_ACTIVE_HIGH>;
- #address-cells = <1>;
- #size-cells = <0>;
port {
tianma_nt36672a_in_0: endpoint {
remote-endpoint = <&dsi0_out>;
diff --git a/Documentation/devicetree/bindings/display/panel/olimex,lcd-olinuxino.yaml b/Documentation/devicetree/bindings/display/panel/olimex,lcd-olinuxino.yaml
index 2329d9610f83..9f97598efdfa 100644
--- a/Documentation/devicetree/bindings/display/panel/olimex,lcd-olinuxino.yaml
+++ b/Documentation/devicetree/bindings/display/panel/olimex,lcd-olinuxino.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/panel/olimex,lcd-olinuxino.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Binding for Olimex Ltd. LCD-OLinuXino bridge panel.
+title: Olimex Ltd. LCD-OLinuXino bridge panel
maintainers:
- Stefan Mavrodiev <stefan@olimex.com>
diff --git a/Documentation/devicetree/bindings/display/panel/orisetech,otm8009a.yaml b/Documentation/devicetree/bindings/display/panel/orisetech,otm8009a.yaml
index 4b6dda6dbc0f..ad7d3575190e 100644
--- a/Documentation/devicetree/bindings/display/panel/orisetech,otm8009a.yaml
+++ b/Documentation/devicetree/bindings/display/panel/orisetech,otm8009a.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Orise Tech OTM8009A 3.97" 480x800 TFT LCD panel (MIPI-DSI video mode)
maintainers:
- - Philippe CORNU <philippe.cornu@st.com>
+ - Philippe CORNU <philippe.cornu@foss.st.com>
description: |
The Orise Tech OTM8009A is a 3.97" 480x800 TFT LCD panel connected using
@@ -50,4 +50,3 @@ examples:
};
};
...
-
diff --git a/Documentation/devicetree/bindings/display/panel/panel-edp.yaml b/Documentation/devicetree/bindings/display/panel/panel-edp.yaml
new file mode 100644
index 000000000000..bb0cf6827e79
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/panel-edp.yaml
@@ -0,0 +1,188 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/panel-edp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Probeable (via DP AUX / EDID) eDP Panels with simple power-on sequences
+
+maintainers:
+ - Douglas Anderson <dianders@chromium.org>
+
+description: |
+ This binding file can be used to indicate that an eDP panel is connected
+ to an Embedded DisplayPort AUX bus (see display/dp-aux-bus.yaml) without
+ actually specifying exactly what panel is connected. This is useful for
+ the case that more than one different panel could be connected to the
+ board, either for second-sourcing purposes or to support multiple SKUs
+ with different LCDs that hook up to a common board.
+
+ As per above, a requirement for using this binding is that the panel is
+ represented under the DP AUX bus. This means that we can use any
+ information provided by the DP AUX bus (including the EDID) to identify
+ the panel. We can use this to identify display size, resolution, and
+ timings among other things.
+
+ One piece of information about eDP panels that is typically _not_
+ provided anywhere on the DP AUX bus is the power sequencing timings.
+ This is the reason why, historically, we've always had to explicitly
+ list eDP panels. We solve that here with two tricks. The "worst case"
+ power on timings for any panels expected to be connected to a board are
+ specified in these bindings. Once we've powered on, it's expected that
+ the operating system will look up the panel in a table (based on EDID
+ information) to figure out other power sequencing timings.
+
+ eDP panels in general can have somewhat arbitrary power sequencing
+ requirements. However, even though it's arbitrary in general, the
+ vast majority of panel datasheets have a power sequence diagram that
+ looks exactly the same as every other panel's. Each panel datasheet
+ cares about different timings in this diagram but the fact that the
+ diagram is so similar means we can come up with a single driver to
+ handle it.
+
+ These diagrams all look roughly like this, sometimes labeled with
+ slightly different numbers / lines but all pretty much the same
+ sequence. This is because much of this diagram comes straight from
+ the eDP Standard.
+
+ __________________________________________________
+ Vdd ___/: :\____ /
+ _/ : : \_____/
+ :<T1>:<T2>: :<--T10-->:<T11>:<T12>:
+ : +-----------------------+---------+---------+
+ eDP -----------+ Black video | Src vid | Blk vid +
+ Display : +-----------------------+---------+---------+
+ : _______________________:_________:_________:
+ HPD :<T3>| : : |
+ ___________| : : |_____________
+ : : : :
+ Sink +-----------------------:---------:---------+
+ AUX CH -----------+ AUX Ch operational : : +-------------
+ +-----------------------:---------:---------+
+ : : : :
+ :<T4>: :<T7>: : :
+ Src main +------+------+--------------+---------+
+ lnk data----------------+LnkTrn| Idle |Valid vid data| Idle/off+-------------
+ +------+------+--------------+---------+
+ : <T5> :<-T6->:<-T8->: :
+ :__:<T9>:
+ LED_EN | |
+ _____________________________________| |____________________________
+ : :
+ __________:__:_
+ PWM | : : |
+ __________________________| : : |__________________________
+ : : : :
+ _____________:__________:__:_:______
+ Bklight ____/: : : : : :\____
+ power _______/ :<---T13---->: : : :<T16>: \______________
+ (Vbl) :<T17>:<---------T14--------->: :<-T15->:<T18>:
+
+ The above looks fairly complex but, as per above, each panel only cares
+ about a subset of those timings.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: edp-panel
+
+ hpd-reliable-delay-ms:
+ description:
+ A fixed amount of time that must be waited after powering on the
+ panel's power-supply before the HPD signal is a reliable way to know
+ when the AUX channel is ready. This is useful for panels that glitch
+ the HPD at the start of power-on. This value is not needed if HPD is
+ always reliable for all panels that might be connected.
+
+ hpd-absent-delay-ms:
+ description:
+ The panel specifies that HPD will be asserted this many milliseconds
+ from power on (timing T3 in the diagram above). If we have no way to
+ measure HPD then a fixed delay of this many milliseconds can be used.
+ This can also be used as a timeout when waiting for HPD. Does not
+ include the hpd-reliable-delay, so if hpd-reliable-delay was 80 ms
+ and hpd-absent-delay was 200 ms then we'd do a fixed 80 ms delay and
+ then we know HPD would assert in the next 120 ms. This value is not
+ needed if HPD is hooked up, either through a GPIO in the panel node or
+ hooked up directly to the eDP controller.
+
+ backlight: true
+ enable-gpios: true
+ port: true
+ power-supply: true
+ no-hpd: true
+ hpd-gpios: true
+
+additionalProperties: false
+
+required:
+ - compatible
+ - power-supply
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ bridge@2d {
+ compatible = "ti,sn65dsi86";
+ reg = <0x2d>;
+
+ interrupt-parent = <&tlmm>;
+ interrupts = <10 IRQ_TYPE_LEVEL_HIGH>;
+
+ enable-gpios = <&tlmm 102 GPIO_ACTIVE_HIGH>;
+
+ vpll-supply = <&src_pp1800_s4a>;
+ vccio-supply = <&src_pp1800_s4a>;
+ vcca-supply = <&src_pp1200_l2a>;
+ vcc-supply = <&src_pp1200_l2a>;
+
+ clocks = <&rpmhcc RPMH_LN_BB_CLK2>;
+ clock-names = "refclk";
+
+ no-hpd;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ sn65dsi86_out: endpoint {
+ remote-endpoint = <&panel_in_edp>;
+ };
+ };
+ };
+
+ aux-bus {
+ panel {
+ compatible = "edp-panel";
+ power-supply = <&pp3300_dx_edp>;
+ backlight = <&backlight>;
+ hpd-gpios = <&sn65dsi86_bridge 2 GPIO_ACTIVE_HIGH>;
+ hpd-reliable-delay-ms = <15>;
+
+ port {
+ panel_in_edp: endpoint {
+ remote-endpoint = <&sn65dsi86_out>;
+ };
+ };
+ };
+ };
+ };
+ };
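To make the delay interplay concrete: with the values below, the OS does a fixed 80 ms wait after enabling power-supply and then knows HPD must assert within the remaining 120 ms of the 200 ms T3 budget. A trimmed sketch of just the aux-bus panel node; the regulator and backlight labels are hypothetical:

    aux-bus {
        panel {
            compatible = "edp-panel";
            power-supply = <&pp3300_disp>;   /* hypothetical label */
            backlight = <&backlight>;
            hpd-reliable-delay-ms = <80>;    /* HPD may glitch for the first 80 ms */
            hpd-absent-delay-ms = <200>;     /* T3: HPD asserts within 200 ms */
        };
    };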
diff --git a/Documentation/devicetree/bindings/display/panel/panel-lvds.yaml b/Documentation/devicetree/bindings/display/panel/panel-lvds.yaml
new file mode 100644
index 000000000000..929fe046d1e7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/panel-lvds.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/panel-lvds.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Generic LVDS Display Panel
+
+maintainers:
+ - Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
+ - Thierry Reding <thierry.reding@gmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/display/lvds.yaml#
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: panel-lvds
+
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - advantech,idk-1110wr
+ - advantech,idk-2121wr
+ - innolux,ee101ia-01d
+ - mitsubishi,aa104xd12
+ - mitsubishi,aa121td01
+ - sgd,gktw70sdae4se
+
+ required:
+ - compatible
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - auo,b101ew05
+ - tbs,a711-panel
+
+ - const: panel-lvds
+
+unevaluatedProperties: false
+
+required:
+ - compatible
+ - data-mapping
+ - width-mm
+ - height-mm
+ - panel-timing
+ - port
+
+...
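The select clause above matches a node only when its fallback compatible is panel-lvds and its vendor compatible is not one of the entries that already have a dedicated binding. A sketch of a conforming node with all of the mandatory properties; the timing numbers are illustrative, not datasheet values, and the phandles are hypothetical:

    panel {
        compatible = "tbs,a711-panel", "panel-lvds";
        backlight = <&backlight>;
        power-supply = <&reg_dc1sw>;        /* hypothetical regulator */
        width-mm = <153>;
        height-mm = <90>;
        data-mapping = "vesa-24";

        panel-timing {
            /* illustrative 1024x600 timing */
            clock-frequency = <51000000>;
            hactive = <1024>;
            vactive = <600>;
            hsync-len = <1>;
            hfront-porch = <160>;
            hback-porch = <160>;
            vsync-len = <1>;
            vfront-porch = <12>;
            vback-porch = <23>;
        };

        port {
            panel_in: endpoint {
                remote-endpoint = <&lvds_out>;
            };
        };
    };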
diff --git a/Documentation/devicetree/bindings/display/panel/panel-mipi-dbi-spi.yaml b/Documentation/devicetree/bindings/display/panel/panel-mipi-dbi-spi.yaml
new file mode 100644
index 000000000000..9b701df5e9d2
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/panel-mipi-dbi-spi.yaml
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/panel-mipi-dbi-spi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MIPI DBI SPI Panel
+
+maintainers:
+ - Noralf Trønnes <noralf@tronnes.org>
+
+description: |
+ This binding is for display panels using a MIPI DBI compatible controller
+ in SPI mode.
+
+ The MIPI Alliance Standard for Display Bus Interface defines the electrical
+ and logical interfaces for display controllers historically used in mobile
+ phones. The standard defines 4 display architecture types and this binding is
+ for type 1 which has full frame memory. There are 3 interface types in the
+ standard and type C is the serial interface.
+
+ The standard defines the following interface signals for type C:
+ - Power:
+ - Vdd: Power supply for display module
+ Called power-supply in this binding.
+ - Vddi: Logic level supply for interface signals
+ Called io-supply in this binding.
+ - Interface:
+ - CSx: Chip select
+ - SCL: Serial clock
+ - Dout: Serial out
+ - Din: Serial in
+ - SDA: Bidirectional in/out
+ - D/CX: Data/command selection, high=data, low=command
+ Called dc-gpios in this binding.
+ - RESX: Reset when low
+ Called reset-gpios in this binding.
+
+ The type C interface has 3 options:
+
+ - Option 1: 9-bit mode and D/CX as the 9th bit
+ | Command | the next command or following data |
+ |<0><D7><D6><D5><D4><D3><D2><D1><D0>|<D/CX><D7><D6><D5><D4><D3><D2><D1><D0>|
+
+ - Option 2: 16-bit mode and D/CX as a 9th bit
+ | Command or data |
+ |<X><X><X><X><X><X><X><D/CX><D7><D6><D5><D4><D3><D2><D1><D0>|
+
+ - Option 3: 8-bit mode and D/CX as a separate interface line
+ | Command or data |
+ |<D7><D6><D5><D4><D3><D2><D1><D0>|
+
+ The panel resolution is specified using the panel-timing node properties
+ hactive (width) and vactive (height). The other mandatory panel-timing
+ properties should be set to zero, except clock-frequency, which can
+ optionally be set to inform about the actual pixel clock frequency.
+
+ If the panel is wired to the controller at an offset, specify this using
+ hback-porch (x-offset) and vback-porch (y-offset).
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - sainsmart18
+ - const: panel-mipi-dbi-spi
+
+ write-only:
+ type: boolean
+ description:
+ Controller is not readable (i.e. Din (MISO on the SPI interface) is not
+ wired up).
+
+ dc-gpios:
+ maxItems: 1
+ description: |
+ Controller data/command selection (D/CX) in 4-line SPI mode.
+ If not set, the controller is in 3-line SPI mode.
+
+ io-supply:
+ description: |
+ Logic level supply for interface signals (Vddi).
+ No need to set if this is the same as power-supply.
+
+required:
+ - compatible
+ - reg
+ - width-mm
+ - height-mm
+ - panel-timing
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ display@0 {
+ compatible = "sainsmart18", "panel-mipi-dbi-spi";
+ reg = <0>;
+ spi-max-frequency = <40000000>;
+
+ dc-gpios = <&gpio 24 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio 25 GPIO_ACTIVE_HIGH>;
+ write-only;
+
+ backlight = <&backlight>;
+
+ width-mm = <35>;
+ height-mm = <28>;
+
+ panel-timing {
+ hactive = <160>;
+ vactive = <128>;
+ hback-porch = <0>;
+ vback-porch = <0>;
+ clock-frequency = <0>;
+ hfront-porch = <0>;
+ hsync-len = <0>;
+ vfront-porch = <0>;
+ vsync-len = <0>;
+ };
+ };
+ };
+
+...
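The binding above expresses a wiring offset through the back-porch properties. A hedged variant of the example's panel-timing node, assuming (hypothetically) that the glass is wired at x-offset 2 and y-offset 1 into the controller's frame memory:

    panel-timing {
        hactive = <160>;
        vactive = <128>;
        hback-porch = <2>;      /* x-offset into frame memory */
        vback-porch = <1>;      /* y-offset into frame memory */
        clock-frequency = <0>;
        hfront-porch = <0>;
        hsync-len = <0>;
        vfront-porch = <0>;
        vsync-len = <0>;
    };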
diff --git a/Documentation/devicetree/bindings/display/panel/panel-simple-dsi.yaml b/Documentation/devicetree/bindings/display/panel/panel-simple-dsi.yaml
index fbd71669248f..90c04cff8281 100644
--- a/Documentation/devicetree/bindings/display/panel/panel-simple-dsi.yaml
+++ b/Documentation/devicetree/bindings/display/panel/panel-simple-dsi.yaml
@@ -19,9 +19,6 @@ description: |
If the panel is more advanced a dedicated binding file is required.
-allOf:
- - $ref: panel-common.yaml#
-
properties:
compatible:
@@ -35,6 +32,8 @@ properties:
- boe,tv080wum-nl0
# Innolux P079ZCA 7.85" 768x1024 TFT LCD panel
- innolux,p079zca
+ # JDI FHD_R63452 1080x1920 5.2" IPS LCD Panel
+ - jdi,fhd-r63452
# Khadas TS050 5" 1080x1920 LCD panel
- khadas,ts050
# Kingdisplay KD097D04 9.7" 1536x2048 TFT LCD panel
@@ -65,12 +64,31 @@ properties:
reset-gpios: true
port: true
power-supply: true
+ vddio-supply: true
+
+allOf:
+ - $ref: panel-common.yaml#
+ - if:
+ properties:
+ compatible:
+ enum:
+ - samsung,s6e3fc2x01
+ - samsung,sofef00
+ then:
+ properties:
+ power-supply: false
+ required:
+ - vddio-supply
+ else:
+ properties:
+ vddio-supply: false
+ required:
+ - power-supply
additionalProperties: false
required:
- compatible
- - power-supply
- reg
examples:
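The if/then/else block above swaps the required supply: the two Samsung panels take vddio-supply and reject power-supply, while every other panel keeps the old rule. A minimal sketch of a samsung,sofef00 node under the new schema; the phandles are hypothetical:

    #include <dt-bindings/gpio/gpio.h>

    dsi {
        #address-cells = <1>;
        #size-cells = <0>;

        panel@0 {
            compatible = "samsung,sofef00";
            reg = <0>;
            vddio-supply = <&vreg_l14a_1p88>;  /* power-supply would fail validation */
            reset-gpios = <&tlmm 6 GPIO_ACTIVE_HIGH>;
        };
    };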
diff --git a/Documentation/devicetree/bindings/display/panel/panel-simple.yaml b/Documentation/devicetree/bindings/display/panel/panel-simple.yaml
index 335776c45474..01560fe226dd 100644
--- a/Documentation/devicetree/bindings/display/panel/panel-simple.yaml
+++ b/Documentation/devicetree/bindings/display/panel/panel-simple.yaml
@@ -35,6 +35,8 @@ properties:
- ampire,am-480272h3tmqw-t01h
# Ampire AM-800480R3TMQW-A1H 7.0" WVGA TFT LCD panel
- ampire,am800480r3tmqwa1h
+ # Ampire AM-800600P5TMQW-TB8H 8.0" SVGA TFT LCD panel
+ - ampire,am800600p5tmqw-tb8h
# AU Optronics Corporation 10.1" WSVGA TFT LCD panel
- auo,b101aw03
# AU Optronics Corporation 10.1" WSVGA TFT LCD panel
@@ -105,6 +107,10 @@ properties:
- chunghwa,claa101wb01
# Chunghwa Picture Tubes Ltd. 10.1" WXGA TFT LCD panel
- chunghwa,claa101wb03
+ # DataImage, Inc. 4.3" WQVGA (480x272) TFT LCD panel with 24-bit parallel interface.
+ - dataimage,fg040346dsswbg04
+ # DataImage, Inc. 10.1" WXGA (1280x800) TFT LCD panel
+ - dataimage,fg1001l0dsswmg01
# DataImage, Inc. 7" WVGA (800x480) TFT LCD panel with 24-bit parallel interface.
- dataimage,scf0700c48ggu18
# DLC Display Co. DLC1010GIG 10.1" WXGA TFT LCD Panel
@@ -135,6 +141,8 @@ properties:
# Emerging Display Technology Corp. WVGA TFT Display with capacitive touch
- edt,etm0700g0dh6
- edt,etm0700g0edh6
+ # Emerging Display Technology Corp. LVDS WSVGA TFT Display with capacitive touch
+ - edt,etml0700y5dha
# Emerging Display Technology Corp. 5.7" VGA TFT LCD panel with
# capacitive touch
- edt,etmv570g2dhu
@@ -156,6 +164,8 @@ properties:
- hannstar,hsd070pww1
# HannStar Display Corp. HSD100PXN1 10.1" XGA LVDS panel
- hannstar,hsd100pxn1
+ # HannStar Display Corp. HSD101PWW2 10.1" WXGA (1280x800) LVDS panel
+ - hannstar,hsd101pww2
# Hitachi Ltd. Corporation 9" WVGA (800x480) TFT LCD panel
- hit,tx23d38vm0caa
# InfoVision Optoelectronics M133NWF4 R0 13.3" FHD (1920x1080) TFT LCD panel
@@ -166,6 +176,8 @@ properties:
- innolux,at070tn92
# Innolux G070Y2-L01 7" WVGA (800x480) TFT LCD panel
- innolux,g070y2-l01
+ # Innolux G070Y2-T02 7" WVGA (800x480) TFT LCD TTL panel
+ - innolux,g070y2-t02
# Innolux Corporation 10.1" G101ICE-L01 WXGA (1280x800) LVDS panel
- innolux,g101ice-l01
# Innolux Corporation 12.1" WXGA (1280x800) TFT LCD panel
@@ -180,6 +192,8 @@ properties:
- innolux,n125hce-gn1
# InnoLux 15.6" WXGA TFT LCD panel
- innolux,n156bge-l21
+ # Innolux P120ZDG-BF1 12.02 inch eDP 2K display panel
+ - innolux,p120zdg-bf1
# Innolux Corporation 7.0" WSVGA (1024x600) TFT LCD panel
- innolux,zj070na-01p
# King & Display KD116N21-30NV-A010 eDP TFT LCD panel
@@ -220,6 +234,10 @@ properties:
- logictechno,lttd800480070-l6wh-rt
# Mitsubishi "AA070MC01 7.0" WVGA TFT LCD panel
- mitsubishi,aa070mc01-ca1
+ # Multi-Inno Technology Co.,Ltd MI0700S4T-6 7" 800x480 TFT Resistive Touch Module
+ - multi-inno,mi0700s4t-6
+ # Multi-Inno Technology Co.,Ltd MI0800FT-9 8" 800x600 TFT Resistive Touch Module
+ - multi-inno,mi0800ft-9
# Multi-Inno Technology Co.,Ltd MI1010AIT-1CP 10.1" 1280x800 LVDS IPS Cap Touch Mod.
- multi-inno,mi1010ait-1cp
# NEC LCD Technologies, Ltd. 12.1" WXGA (1280x800) LVDS TFT LCD panel
@@ -266,6 +284,8 @@ properties:
- samsung,atna33xc20
# Samsung 12.2" (2560x1600 pixels) TFT LCD panel
- samsung,lsn122dl01-c01
+ # Samsung Electronics 10.1" WXGA (1280x800) TFT LCD panel
+ - samsung,ltl101al01
# Samsung Electronics 10.1" WSVGA TFT LCD panel
- samsung,ltn101nt05
# Samsung Electronics 14" WXGA (1366x768) TFT LCD panel
@@ -280,6 +300,8 @@ properties:
- sharp,lq101k1ly04
# Sharp 12.3" (2400x1600 pixels) TFT LCD panel
- sharp,lq123p1jx31
+ # Sharp 14" (1920x1080 pixels) TFT LCD panel
+ - sharp,lq140m1jw46
# Sharp LS020B1DD01D 2.0" HQVGA TFT LCD panel
- sharp,ls020b1dd01d
# Shelly SCA07010-BFN-LNN 7.0" WVGA TFT LCD panel
@@ -288,6 +310,10 @@ properties:
- starry,kr070pe2t
# Starry 12.2" (1920x1200 pixels) TFT LCD panel
- starry,kr122ea0sra
+ # Startek KD070WVFPA043-C069A 7" TFT LCD panel
+ - startek,kd070wvfpa
+ # Team Source Display Technology TST043015CMHX 4.3" WQVGA TFT LCD panel
+ - team-source-display,tst043015cmhx
# Tianma Micro-electronics TM070JDHG30 7.0" WXGA TFT LCD panel
- tianma,tm070jdhg30
# Tianma Micro-electronics TM070JVHG33 7.0" WXGA TFT LCD panel
@@ -309,6 +335,8 @@ properties:
- urt,umsh-8596md-11t
- urt,umsh-8596md-19t
- urt,umsh-8596md-20t
+ # Vivax TPC-9150 tablet 9.0" WSVGA TFT LCD panel
+ - vivax,tpc9150-panel
# VXT 800x480 color TFT LCD panel
- vxt,vl050-8048nt-c01
# Winstar Display Corporation 3.5" QVGA (320x240) TFT LCD panel
@@ -317,6 +345,7 @@ properties:
- yes-optoelectronics,ytc700tlag-05-201c
backlight: true
+ ddc-i2c-bus: true
enable-gpios: true
port: true
power-supply: true
diff --git a/Documentation/devicetree/bindings/display/panel/panel-timing.yaml b/Documentation/devicetree/bindings/display/panel/panel-timing.yaml
index 9bf592dc3033..aea69b84ca5d 100644
--- a/Documentation/devicetree/bindings/display/panel/panel-timing.yaml
+++ b/Documentation/devicetree/bindings/display/panel/panel-timing.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/panel/panel-timing.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: panel timing bindings
+title: panel timing
maintainers:
- Thierry Reding <thierry.reding@gmail.com>
@@ -17,29 +17,29 @@ description: |
The parameters are defined as seen in the following illustration.
- +----------+-------------------------------------+----------+-------+
- | | ^ | | |
- | | |vback_porch | | |
- | | v | | |
- +----------#######################################----------+-------+
- | # ^ # | |
- | # | # | |
- | hback # | # hfront | hsync |
- | porch # | hactive # porch | len |
- |<-------->#<-------+--------------------------->#<-------->|<----->|
- | # | # | |
- | # |vactive # | |
- | # | # | |
- | # v # | |
- +----------#######################################----------+-------+
- | | ^ | | |
- | | |vfront_porch | | |
- | | v | | |
- +----------+-------------------------------------+----------+-------+
- | | ^ | | |
- | | |vsync_len | | |
- | | v | | |
- +----------+-------------------------------------+----------+-------+
+ +-------+----------+-------------------------------------+----------+
+ | | | ^ | |
+ | | | |vsync_len | |
+ | | | v | |
+ +-------+----------+-------------------------------------+----------+
+ | | | ^ | |
+ | | | |vback_porch | |
+ | | | v | |
+ +-------+----------#######################################----------+
+ | | # ^ # |
+ | | # | # |
+ | hsync | hback # | # hfront |
+ | len | porch # | hactive # porch |
+ |<----->|<-------->#<-------+--------------------------->#<-------->|
+ | | # | # |
+ | | # |vactive # |
+ | | # | # |
+ | | # v # |
+ +-------+----------#######################################----------+
+ | | | ^ | |
+ | | | |vfront_porch | |
+ | | | v | |
+ +-------+----------+-------------------------------------+----------+
The following is the panel timings shown with time on the x-axis.
@@ -71,78 +71,72 @@ properties:
hfront-porch:
description: Horizontal front porch panel timing
+ $ref: /schemas/types.yaml#/definitions/uint32-array
oneOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- maxItems: 1
+ - maxItems: 1
items:
description: typical number of pixels
- - $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 3
+ - minItems: 3
maxItems: 3
items:
description: min, typ, max number of pixels
hback-porch:
description: Horizontal back porch timing
+ $ref: /schemas/types.yaml#/definitions/uint32-array
oneOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- maxItems: 1
+ - maxItems: 1
items:
description: typical number of pixels
- - $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 3
+ - minItems: 3
maxItems: 3
items:
description: min, typ, max number of pixels
hsync-len:
description: Horizontal sync length panel timing
+ $ref: /schemas/types.yaml#/definitions/uint32-array
oneOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- maxItems: 1
+ - maxItems: 1
items:
description: typical number of pixels
- - $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 3
+ - minItems: 3
maxItems: 3
items:
description: min, typ, max number of pixels
vfront-porch:
description: Vertical front porch panel timing
+ $ref: /schemas/types.yaml#/definitions/uint32-array
oneOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- maxItems: 1
+ - maxItems: 1
items:
description: typical number of lines
- - $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 3
+ - minItems: 3
maxItems: 3
items:
description: min, typ, max number of lines
vback-porch:
description: Vertical back porch panel timing
+ $ref: /schemas/types.yaml#/definitions/uint32-array
oneOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- maxItems: 1
+ - maxItems: 1
items:
description: typical number of lines
- - $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 3
+ - minItems: 3
maxItems: 3
items:
description: min, typ, max number of lines
vsync-len:
description: Vertical sync length panel timing
+ $ref: /schemas/types.yaml#/definitions/uint32-array
oneOf:
- - $ref: /schemas/types.yaml#/definitions/uint32
- maxItems: 1
+ - maxItems: 1
items:
description: typical number of lines
- - $ref: /schemas/types.yaml#/definitions/uint32-array
- minItems: 3
+ - minItems: 3
maxItems: 3
items:
description: min, typ, max number of lines
@@ -152,6 +146,7 @@ properties:
Horizontal sync pulse.
0 selects active low, 1 selects active high.
If omitted then it is not used by the hardware
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
vsync-active:
@@ -159,6 +154,7 @@ properties:
Vertical sync pulse.
0 selects active low, 1 selects active high.
If omitted then it is not used by the hardware
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
de-active:
@@ -166,6 +162,7 @@ properties:
Data enable.
0 selects active low, 1 selects active high.
If omitted then it is not used by the hardware
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
pixelclk-active:
@@ -175,6 +172,7 @@ properties:
sample data on rising edge.
Use 1 to drive pixel data on rising edge and
sample data on falling edge
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
syncclk-active:
@@ -185,6 +183,7 @@ properties:
sample sync on rising edge of pixel clock.
Use 1 to drive sync on rising edge and
sample sync on falling edge of pixel clock
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
interlaced:
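After the refactor above, every timing property is typed once as a uint32-array and the oneOf merely constrains its length: one cell gives the typical value, three cells give min/typ/max. A sketch mixing both forms; the numbers are illustrative only:

    panel-timing {
        clock-frequency = <33500000>;
        hactive = <800>;
        vactive = <480>;
        hfront-porch = <40>;          /* single typical value */
        hback-porch = <40 88 128>;    /* min, typ, max */
        hsync-len = <48 128 128>;
        vfront-porch = <13>;
        vback-porch = <31>;
        vsync-len = <3 3 5>;
        hsync-active = <0>;           /* active low */
    };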
diff --git a/Documentation/devicetree/bindings/display/panel/raydium,rm67191.yaml b/Documentation/devicetree/bindings/display/panel/raydium,rm67191.yaml
index 745dd247c409..d62fd692bf10 100644
--- a/Documentation/devicetree/bindings/display/panel/raydium,rm67191.yaml
+++ b/Documentation/devicetree/bindings/display/panel/raydium,rm67191.yaml
@@ -24,6 +24,7 @@ properties:
dsi-lanes:
description: Number of DSI lanes to be used must be <3> or <4>
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [3, 4]
v3p3-supply:
@@ -37,6 +38,7 @@ properties:
0 - burst-mode
1 - non-burst with sync event
2 - non-burst with sync pulse
+ $ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1, 2]
required:
diff --git a/Documentation/devicetree/bindings/display/panel/raydium,rm68200.yaml b/Documentation/devicetree/bindings/display/panel/raydium,rm68200.yaml
index 39477793d289..e8ce2315631a 100644
--- a/Documentation/devicetree/bindings/display/panel/raydium,rm68200.yaml
+++ b/Documentation/devicetree/bindings/display/panel/raydium,rm68200.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Raydium Semiconductor Corporation RM68200 5.5" 720p MIPI-DSI TFT LCD panel
maintainers:
- - Philippe CORNU <philippe.cornu@st.com>
+ - Philippe CORNU <philippe.cornu@foss.st.com>
description: |
The Raydium Semiconductor Corporation RM68200 is a 5.5" 720x1280 TFT LCD
diff --git a/Documentation/devicetree/bindings/display/panel/ronbo,rb070d30.yaml b/Documentation/devicetree/bindings/display/panel/ronbo,rb070d30.yaml
index d67617f6f74a..95ce22c6787a 100644
--- a/Documentation/devicetree/bindings/display/panel/ronbo,rb070d30.yaml
+++ b/Documentation/devicetree/bindings/display/panel/ronbo,rb070d30.yaml
@@ -37,7 +37,7 @@ properties:
backlight:
description: Backlight used by the panel
- $ref: "/schemas/types.yaml#/definitions/phandle"
+ $ref: /schemas/types.yaml#/definitions/phandle
required:
- compatible
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,ams495qa01.yaml b/Documentation/devicetree/bindings/display/panel/samsung,ams495qa01.yaml
new file mode 100644
index 000000000000..58fa073ce258
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/samsung,ams495qa01.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/samsung,ams495qa01.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung AMS495QA01 panel with Magnachip D53E6EA8966 controller
+
+maintainers:
+ - Chris Morgan <macromorgan@hotmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: samsung,ams495qa01
+
+ reg: true
+ reset-gpios:
+ description: reset gpio, must be GPIO_ACTIVE_LOW
+ elvdd-supply:
+ description: regulator that supplies voltage to the panel display
+ enable-gpios: true
+ port: true
+ vdd-supply:
+ description: regulator that supplies voltage to panel logic
+
+required:
+ - compatible
+ - reg
+ - reset-gpios
+ - vdd-supply
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "samsung,ams495qa01";
+ reg = <0>;
+ reset-gpios = <&gpio4 0 GPIO_ACTIVE_LOW>;
+ vdd-supply = <&vcc_3v3>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml b/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml
index 060ee27a4749..c0fabeb38628 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Samsung LD9040 AMOLED LCD parallel RGB panel with SPI control bus
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Andrzej Hajda <a.hajda@samsung.com>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -45,6 +42,9 @@ properties:
panel-height-mm:
description: physical panel height [mm]
+ spi-cpha: true
+ spi-cpol: true
+
required:
- compatible
- reg
@@ -63,8 +63,6 @@ examples:
lcd@0 {
compatible = "samsung,ld9040";
- #address-cells = <1>;
- #size-cells = <0>;
reg = <0>;
vdd3-supply = <&ldo7_reg>;
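This ld9040 conversion (and the similar ones that follow) replaces the prose pointer to spi-controller.yaml with an allOf reference to the shared spi-peripheral-props.yaml schema, so the SPI mode flags are validated centrally and the #address-cells/#size-cells bookkeeping stays on the parent controller. A minimal sketch of the resulting usage (regulator label taken from the binding's own example, the SPI controller node abbreviated):

    spi {
        #address-cells = <1>;
        #size-cells = <0>;

        lcd@0 {
            compatible = "samsung,ld9040";
            reg = <0>;
            spi-cpha;               /* now validated via spi-peripheral-props.yaml */
            spi-cpol;
            vdd3-supply = <&ldo7_reg>;
        };
    };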
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,lms380kf01.yaml b/Documentation/devicetree/bindings/display/panel/samsung,lms380kf01.yaml
index 251f0c7115aa..70ffc88d2a08 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,lms380kf01.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,lms380kf01.yaml
@@ -9,14 +9,13 @@ title: Samsung LMS380KF01 display panel
description: The LMS380KF01 is a 480x800 DPI display panel from Samsung Mobile
Displays (SMD) utilizing the WideChips WS2401 display controller. It can be
used with internal or external backlight control.
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -59,7 +58,7 @@ required:
- spi-cpol
- port
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,lms397kf04.yaml b/Documentation/devicetree/bindings/display/panel/samsung,lms397kf04.yaml
index cd62968426fb..5e77cee93f83 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,lms397kf04.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,lms397kf04.yaml
@@ -14,6 +14,7 @@ maintainers:
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -51,7 +52,7 @@ required:
- spi-cpol
- port
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6d27a1.yaml b/Documentation/devicetree/bindings/display/panel/samsung,s6d27a1.yaml
new file mode 100644
index 000000000000..d273faf4442a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/samsung,s6d27a1.yaml
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/samsung,s6d27a1.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung S6D27A1 display panel
+
+description: The S6D27A1 is a 480x800 DPI display panel from Samsung Mobile
+ Displays (SMD).
+
+maintainers:
+ - Markuss Broks <markuss.broks@gmail.com>
+
+allOf:
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ const: samsung,s6d27a1
+
+ reg: true
+
+ interrupts:
+ description: provides an optional ESD (electrostatic discharge)
+ interrupt that signals abnormalities in the display hardware.
+ This can also be raised for other reasons like erroneous
+ configuration.
+ maxItems: 1
+
+ reset-gpios: true
+
+ vci-supply:
+ description: regulator that supplies the VCI analog voltage
+ usually around 3.0 V
+
+ vccio-supply:
+ description: regulator that supplies the VCCIO voltage usually
+ around 1.8 V
+
+ backlight: true
+
+ spi-cpha: true
+
+ spi-cpol: true
+
+ spi-max-frequency:
+ maximum: 1200000
+
+ port: true
+
+required:
+ - compatible
+ - reg
+ - vci-supply
+ - vccio-supply
+ - spi-cpha
+ - spi-cpol
+ - port
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ spi {
+ compatible = "spi-gpio";
+ sck-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>;
+ miso-gpios = <&gpio 1 GPIO_ACTIVE_HIGH>;
+ mosi-gpios = <&gpio 2 GPIO_ACTIVE_HIGH>;
+ cs-gpios = <&gpio 3 GPIO_ACTIVE_HIGH>;
+ num-chipselects = <1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "samsung,s6d27a1";
+ spi-max-frequency = <1200000>;
+ spi-cpha;
+ spi-cpol;
+ reg = <0>;
+ vci-supply = <&lcd_3v0_reg>;
+ vccio-supply = <&lcd_1v8_reg>;
+ reset-gpios = <&gpio 4 GPIO_ACTIVE_LOW>;
+ interrupt-parent = <&gpio>;
+ interrupts = <5 IRQ_TYPE_EDGE_RISING>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&display_out>;
+ };
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml
index ea58df49263a..6f1fc7469f07 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml
@@ -12,6 +12,7 @@ maintainers:
allOf:
- $ref: panel-common.yaml#
- $ref: /schemas/leds/backlight/common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -23,6 +24,10 @@ properties:
default-brightness: true
max-brightness: true
+ spi-3wire: true
+ spi-cpha: true
+ spi-cpol: true
+
vdd3-supply:
description: VDD regulator
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e88a0-ams452ef01.yaml b/Documentation/devicetree/bindings/display/panel/samsung,s6e88a0-ams452ef01.yaml
index 44ce98f68705..b749e9e906b7 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,s6e88a0-ams452ef01.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e88a0-ams452ef01.yaml
@@ -16,6 +16,7 @@ properties:
compatible:
const: samsung,s6e88a0-ams452ef01
reg: true
+ port: true
reset-gpios: true
vdd3-supply:
description: core voltage supply
@@ -25,6 +26,7 @@ properties:
required:
- compatible
- reg
+ - port
- vdd3-supply
- vci-supply
- reset-gpios
@@ -46,5 +48,11 @@ examples:
vdd3-supply = <&pm8916_l17>;
vci-supply = <&reg_vlcd_vci>;
reset-gpios = <&msmgpio 25 GPIO_ACTIVE_HIGH>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
};
};
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e8aa0.yaml b/Documentation/devicetree/bindings/display/panel/samsung,s6e8aa0.yaml
index ca959451557e..1cdc91b3439f 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,s6e8aa0.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e8aa0.yaml
@@ -36,6 +36,7 @@ properties:
init-delay:
description: delay after initialization sequence [ms]
+ $ref: /schemas/types.yaml#/definitions/uint32
panel-width-mm:
description: physical panel width [mm]
diff --git a/Documentation/devicetree/bindings/display/panel/seiko,43wvf1g.yaml b/Documentation/devicetree/bindings/display/panel/seiko,43wvf1g.yaml
index cfaa50cf5f5d..1df3cbb51ff9 100644
--- a/Documentation/devicetree/bindings/display/panel/seiko,43wvf1g.yaml
+++ b/Documentation/devicetree/bindings/display/panel/seiko,43wvf1g.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Seiko Instruments Inc. 4.3" WVGA (800 x RGB x 480) TFT with Touch-Panel
maintainers:
- - Marco Franchi <marco.franchi@nxp.com>
+ - Fabio Estevam <festevam@gmail.com>
allOf:
- $ref: panel-common.yaml#
@@ -25,6 +25,8 @@ properties:
avdd-supply:
description: 5v analog regulator
+ enable-gpios: true
+
required:
- compatible
- dvdd-supply
diff --git a/Documentation/devicetree/bindings/display/panel/sgd,gktw70sdae4se.yaml b/Documentation/devicetree/bindings/display/panel/sgd,gktw70sdae4se.yaml
index e63a570ae59d..e32d9188a3e0 100644
--- a/Documentation/devicetree/bindings/display/panel/sgd,gktw70sdae4se.yaml
+++ b/Documentation/devicetree/bindings/display/panel/sgd,gktw70sdae4se.yaml
@@ -7,17 +7,27 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Solomon Goldentek Display GKTW70SDAE4SE 7" WVGA LVDS Display Panel
maintainers:
- - Neil Armstrong <narmstrong@baylibre.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
- Thierry Reding <thierry.reding@gmail.com>
allOf:
- - $ref: lvds.yaml#
+ - $ref: panel-common.yaml#
+ - $ref: /schemas/display/lvds.yaml#
+
+select:
+ properties:
+ compatible:
+ contains:
+ const: sgd,gktw70sdae4se
+
+ required:
+ - compatible
properties:
compatible:
items:
- const: sgd,gktw70sdae4se
- - {} # panel-lvds, but not listed here to avoid false select
+ - const: panel-lvds
data-mapping:
const: jeida-18
@@ -35,6 +45,11 @@ additionalProperties: false
required:
- compatible
+ - port
+ - data-mapping
+ - width-mm
+ - height-mm
+ - panel-timing
examples:
- |+
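With the bare "{}" list item replaced by an explicit "panel-lvds" const, and a select block keyed on the vendor string so the schema never matches plain panel-lvds nodes, a conforming node spells out both compatibles. A minimal sketch, with the other now-required properties noted:

    panel {
        compatible = "sgd,gktw70sdae4se", "panel-lvds";
        data-mapping = "jeida-18";
        /* width-mm, height-mm, panel-timing and port are now required too */
    };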
diff --git a/Documentation/devicetree/bindings/display/panel/sharp,lq101r1sx01.yaml b/Documentation/devicetree/bindings/display/panel/sharp,lq101r1sx01.yaml
index a679d3647dbd..57b44a0e763d 100644
--- a/Documentation/devicetree/bindings/display/panel/sharp,lq101r1sx01.yaml
+++ b/Documentation/devicetree/bindings/display/panel/sharp,lq101r1sx01.yaml
@@ -30,7 +30,12 @@ allOf:
properties:
compatible:
- const: sharp,lq101r1sx01
+ oneOf:
+ - items:
+ - const: sharp,lq101r1sx03
+ - const: sharp,lq101r1sx01
+ - enum:
+ - sharp,lq101r1sx01
reg: true
power-supply: true
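The oneOf above lets the LQ101R1SX03 be declared with the original LQ101R1SX01 as a fallback, so existing drivers keep matching. A sketch of the new form (supply label hypothetical):

    panel@0 {
        compatible = "sharp,lq101r1sx03", "sharp,lq101r1sx01";
        reg = <0>;
        power-supply = <&panel_vdd>;
    };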
diff --git a/Documentation/devicetree/bindings/display/panel/sharp,ls060t1sx01.yaml b/Documentation/devicetree/bindings/display/panel/sharp,ls060t1sx01.yaml
new file mode 100644
index 000000000000..271c097cc9a4
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/sharp,ls060t1sx01.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/sharp,ls060t1sx01.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sharp Microelectronics 6.0" FullHD TFT LCD panel
+
+maintainers:
+ - Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: sharp,ls060t1sx01
+
+ reg: true
+ backlight: true
+ reset-gpios: true
+ port: true
+
+ avdd-supply:
+ description: phandle of the regulator that provides the positive supply voltage
+ avee-supply:
+ description: phandle of the regulator that provides the negative supply voltage
+ vddi-supply:
+ description: phandle of the regulator that provides the I/O supply voltage
+ vddh-supply:
+ description: phandle of the regulator that provides the analog supply voltage
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "sharp,ls060t1sx01";
+ reg = <0>;
+ avdd-supply = <&pm8941_l22>;
+ backlight = <&backlight>;
+ reset-gpios = <&pm8916_gpios 25 GPIO_ACTIVE_LOW>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/panel/sitronix,st7701.yaml b/Documentation/devicetree/bindings/display/panel/sitronix,st7701.yaml
index 6dff59fe4be1..4dc0cd4a6a77 100644
--- a/Documentation/devicetree/bindings/display/panel/sitronix,st7701.yaml
+++ b/Documentation/devicetree/bindings/display/panel/sitronix,st7701.yaml
@@ -17,6 +17,9 @@ description: |
Techstar TS8550B is 480x854, 2-lane MIPI DSI LCD panel which has
inbuilt ST7701 chip.
+ Densitron DMT028VGHMCMI-1A is a 480x640, 2-lane MIPI DSI LCD panel
+ which has a built-in ST7701 chip.
+

+
allOf:
- $ref: panel-common.yaml#
@@ -24,6 +27,8 @@ properties:
compatible:
items:
- enum:
+ - densitron,dmt028vghmcmi-1a
+ - elida,kd50t048a
- techstar,ts8550b
- const: sitronix,st7701
@@ -37,7 +42,9 @@ properties:
IOVCC-supply:
description: I/O system regulator
+ port: true
reset-gpios: true
+ rotation: true
backlight: true
@@ -46,6 +53,7 @@ required:
- reg
- VCC-supply
- IOVCC-supply
+ - port
- reset-gpios
additionalProperties: false
@@ -65,5 +73,11 @@ examples:
IOVCC-supply = <&reg_dldo2>;
reset-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>; /* LCD-RST: PD24 */
backlight = <&backlight>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
};
};
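Each vendor panel built around the ST7701 pairs its own string with the "sitronix,st7701" fallback, and with this change the port node becomes mandatory. A hedged sketch for the newly added Densitron panel (regulator and endpoint labels hypothetical; the reset GPIO mirrors the binding's existing example):

    panel@0 {
        compatible = "densitron,dmt028vghmcmi-1a", "sitronix,st7701";
        reg = <0>;
        VCC-supply = <&reg_panel_vcc>;
        IOVCC-supply = <&reg_panel_iovcc>;
        reset-gpios = <&pio 3 24 GPIO_ACTIVE_HIGH>;

        port {
            panel_in: endpoint {
                remote-endpoint = <&dsi_out>;
            };
        };
    };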
diff --git a/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml b/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml
index fa46d151e7b3..fa6556363cca 100644
--- a/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml
+++ b/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Sitronix ST7789V RGB panel with SPI control bus
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Maxime Ripard <mripard@kernel.org>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -26,6 +23,13 @@ properties:
backlight: true
port: true
+ spi-cpha: true
+ spi-cpol: true
+
+ dc-gpios:
+ maxItems: 1
+ description: DCX pin, the display data/command selection pin of the parallel interface
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/display/panel/sony,acx424akp.yaml b/Documentation/devicetree/bindings/display/panel/sony,acx424akp.yaml
index 78d060097052..059cc6dbcfca 100644
--- a/Documentation/devicetree/bindings/display/panel/sony,acx424akp.yaml
+++ b/Documentation/devicetree/bindings/display/panel/sony,acx424akp.yaml
@@ -4,7 +4,12 @@
$id: http://devicetree.org/schemas/display/panel/sony,acx424akp.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Sony ACX424AKP 4" 480x864 AMOLED panel
+title: Sony ACX424AKP/ACX424AKM 4" 480x864/480x854 AMOLED panel
+
+description: The Sony ACX424AKP and ACX424AKM are panels built around
+ the Novatek NT35560 display controller. The only difference is that
+ the AKM is configured to use 10 pixels less in the Y axis than the
+ AKP.
maintainers:
- Linus Walleij <linus.walleij@linaro.org>
@@ -14,7 +19,9 @@ allOf:
properties:
compatible:
- const: sony,acx424akp
+ enum:
+ - sony,acx424akp
+ - sony,acx424akm
reg: true
reset-gpios: true
vddi-supply:
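Since the AKM differs from the AKP only in the number of active lines, the binding keeps a single property set and simply widens the compatible enum. A minimal node for the new variant (regulator and GPIO labels hypothetical):

    panel@0 {
        compatible = "sony,acx424akm";
        reg = <0>;
        vddi-supply = <&panel_vddi>;
        reset-gpios = <&gpio0 4 GPIO_ACTIVE_LOW>;
    };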
diff --git a/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml b/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml
index 95d053c548ab..98abdf4ddeac 100644
--- a/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml
+++ b/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml
@@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Sony ACX565AKM SDI Panel
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Tomi Valkeinen <tomi.valkeinen@ti.com>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml b/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml
new file mode 100644
index 000000000000..b6b885b4c22d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/sony,td4353-jdi.yaml
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/sony,td4353-jdi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sony TD4353 JDI 5 / 5.7" 2160x1080 MIPI-DSI Panel
+
+maintainers:
+ - Konrad Dybcio <konrad.dybcio@somainline.org>
+
+description: |
+ The Sony TD4353 JDI is a 5 (XZ2c) / 5.7 (XZ2) inch 2160x1080
+ MIPI-DSI panel, used in Xperia XZ2 and XZ2 Compact smartphones.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: sony,td4353-jdi-tama
+
+ reg: true
+
+ backlight: true
+
+ vddio-supply:
+ description: VDDIO 1.8V supply
+
+ vsp-supply:
+ description: Positive 5.5V supply
+
+ vsn-supply:
+ description: Negative 5.5V supply
+
+ panel-reset-gpios:
+ description: Display panel reset pin
+
+ touch-reset-gpios:
+ description: Touch panel reset pin
+
+ port: true
+
+required:
+ - compatible
+ - reg
+ - vddio-supply
+ - vsp-supply
+ - vsn-supply
+ - panel-reset-gpios
+ - touch-reset-gpios
+ - port
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel: panel@0 {
+ compatible = "sony,td4353-jdi-tama";
+ reg = <0>;
+
+ backlight = <&pmi8998_wled>;
+ vddio-supply = <&vreg_l14a_1p8>;
+ vsp-supply = <&lab>;
+ vsn-supply = <&ibb>;
+ panel-reset-gpios = <&tlmm 6 GPIO_ACTIVE_HIGH>;
+ touch-reset-gpios = <&tlmm 99 GPIO_ACTIVE_HIGH>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/panel/sony,tulip-truly-nt35521.yaml b/Documentation/devicetree/bindings/display/panel/sony,tulip-truly-nt35521.yaml
new file mode 100644
index 000000000000..967972939598
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/sony,tulip-truly-nt35521.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/sony,tulip-truly-nt35521.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sony Tulip Truly NT35521 5.24" 1280x720 MIPI-DSI Panel
+
+maintainers:
+ - Shawn Guo <shawn.guo@linaro.org>
+
+description: |
+ The Sony Tulip Truly NT35521 is a 5.24" 1280x720 MIPI-DSI panel, which
+ can be found on the Sony Xperia M4 phone. The panel backlight is managed
+ through the DSI link.
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: sony,tulip-truly-nt35521
+
+ reg: true
+
+ positive5-supply:
+ description: Positive 5V supply
+
+ negative5-supply:
+ description: Negative 5V supply
+
+ reset-gpios: true
+
+ enable-gpios: true
+
+ port: true
+
+required:
+ - compatible
+ - reg
+ - positive5-supply
+ - negative5-supply
+ - reset-gpios
+ - enable-gpios
+ - port
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "sony,tulip-truly-nt35521";
+ reg = <0>;
+ positive5-supply = <&positive5_reg>;
+ negative5-supply = <&negative5_reg>;
+ reset-gpios = <&msmgpio 25 GPIO_ACTIVE_LOW>;
+ enable-gpios = <&msmgpio 10 GPIO_ACTIVE_HIGH>;
+
+ port {
+ panel_in: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/panel/tpo,td.yaml b/Documentation/devicetree/bindings/display/panel/tpo,td.yaml
index 4aa605613445..e8c8ee8d7c88 100644
--- a/Documentation/devicetree/bindings/display/panel/tpo,td.yaml
+++ b/Documentation/devicetree/bindings/display/panel/tpo,td.yaml
@@ -6,16 +6,13 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Toppoly TD Panels
-description: |
- The panel must obey the rules for a SPI slave device as specified in
- spi/spi-controller.yaml
-
maintainers:
- Marek Belisko <marek@goldelico.com>
- H. Nikolaus Schaller <hns@goldelico.com>
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -31,6 +28,9 @@ properties:
backlight: true
port: true
+ spi-cpha: true
+ spi-cpol: true
+
required:
- compatible
- port
diff --git a/Documentation/devicetree/bindings/display/panel/tpo,tpg110.yaml b/Documentation/devicetree/bindings/display/panel/tpo,tpg110.yaml
index 6f1f02044b4b..f0243d196191 100644
--- a/Documentation/devicetree/bindings/display/panel/tpo,tpg110.yaml
+++ b/Documentation/devicetree/bindings/display/panel/tpo,tpg110.yaml
@@ -41,6 +41,7 @@ description: |+
allOf:
- $ref: panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/display/panel/visionox,rm69299.yaml b/Documentation/devicetree/bindings/display/panel/visionox,rm69299.yaml
index 076b057b4af5..444ac2a4772d 100644
--- a/Documentation/devicetree/bindings/display/panel/visionox,rm69299.yaml
+++ b/Documentation/devicetree/bindings/display/panel/visionox,rm69299.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/panel/visionox,rm69299.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Visionox model RM69299 Panels Device Tree Bindings.
+title: Visionox model RM69299 Panels
maintainers:
- Harigovindan P <harigovi@codeaurora.org>
@@ -19,6 +19,8 @@ properties:
compatible:
const: visionox,rm69299-1080p-display
+ reg: true
+
vdda-supply:
description: |
Phandle of the regulator that provides the vdda supply voltage.
@@ -34,6 +36,7 @@ additionalProperties: false
required:
- compatible
+ - reg
- vdda-supply
- vdd3p3-supply
- reset-gpios
@@ -41,16 +44,22 @@ required:
examples:
- |
- panel {
- compatible = "visionox,rm69299-1080p-display";
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "visionox,rm69299-1080p-display";
+ reg = <0>;
- vdda-supply = <&src_pp1800_l8c>;
- vdd3p3-supply = <&src_pp2800_l18a>;
+ vdda-supply = <&src_pp1800_l8c>;
+ vdd3p3-supply = <&src_pp2800_l18a>;
- reset-gpios = <&pm6150l_gpio 3 0>;
- port {
- panel0_in: endpoint {
- remote-endpoint = <&dsi0_out>;
+ reset-gpios = <&pm6150l_gpio 3 0>;
+ port {
+ panel0_in: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
};
};
};
diff --git a/Documentation/devicetree/bindings/display/panel/visionox,vtdr6130.yaml b/Documentation/devicetree/bindings/display/panel/visionox,vtdr6130.yaml
new file mode 100644
index 000000000000..84562a5b710a
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/visionox,vtdr6130.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/panel/visionox,vtdr6130.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Visionox VTDR6130 AMOLED DSI Panel
+
+maintainers:
+ - Neil Armstrong <neil.armstrong@linaro.org>
+
+allOf:
+ - $ref: panel-common.yaml#
+
+properties:
+ compatible:
+ const: visionox,vtdr6130
+
+ reg:
+ maxItems: 1
+ description: DSI virtual channel
+
+ vddio-supply: true
+ vci-supply: true
+ vdd-supply: true
+ port: true
+ reset-gpios: true
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - vddio-supply
+ - vci-supply
+ - vdd-supply
+ - reset-gpios
+ - port
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ dsi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ panel@0 {
+ compatible = "visionox,vtdr6130";
+ reg = <0>;
+
+ vddio-supply = <&vreg_l12b_1p8>;
+ vci-supply = <&vreg_l13b_3p0>;
+ vdd-supply = <&vreg_l11b_1p2>;
+
+ reset-gpios = <&tlmm 133 GPIO_ACTIVE_LOW>;
+
+ port {
+ panel0_in: endpoint {
+ remote-endpoint = <&dsi0_out>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/display/panel/xinpeng,xpp055c272.yaml b/Documentation/devicetree/bindings/display/panel/xinpeng,xpp055c272.yaml
index d5c46a3cc2b0..c407deb6afb1 100644
--- a/Documentation/devicetree/bindings/display/panel/xinpeng,xpp055c272.yaml
+++ b/Documentation/devicetree/bindings/display/panel/xinpeng,xpp055c272.yaml
@@ -17,6 +17,7 @@ properties:
const: xinpeng,xpp055c272
reg: true
backlight: true
+ port: true
reset-gpios: true
iovcc-supply:
description: regulator that supplies the iovcc voltage
@@ -27,6 +28,7 @@ required:
- compatible
- reg
- backlight
+ - port
- iovcc-supply
- vci-supply
@@ -44,6 +46,12 @@ examples:
backlight = <&backlight>;
iovcc-supply = <&vcc_1v8>;
vci-supply = <&vcc3v3_lcd>;
+
+ port {
+ mipi_in_panel: endpoint {
+ remote-endpoint = <&mipi_out_panel>;
+ };
+ };
};
};
diff --git a/Documentation/devicetree/bindings/display/renesas,du.yaml b/Documentation/devicetree/bindings/display/renesas,du.yaml
index e3ca5389c17d..c5b9e6812bce 100644
--- a/Documentation/devicetree/bindings/display/renesas,du.yaml
+++ b/Documentation/devicetree/bindings/display/renesas,du.yaml
@@ -39,6 +39,8 @@ properties:
- renesas,du-r8a77980 # for R-Car V3H compatible DU
- renesas,du-r8a77990 # for R-Car E3 compatible DU
- renesas,du-r8a77995 # for R-Car D3 compatible DU
+ - renesas,du-r8a779a0 # for R-Car V3U compatible DU
+ - renesas,du-r8a779g0 # for R-Car V4H compatible DU
reg:
maxItems: 1
@@ -74,18 +76,22 @@ properties:
unevaluatedProperties: false
renesas,cmms:
- $ref: "/schemas/types.yaml#/definitions/phandle-array"
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ maxItems: 1
description:
A list of phandles to the CMM instances present in the SoC, one for each
available DU channel.
renesas,vsps:
- $ref: "/schemas/types.yaml#/definitions/phandle-array"
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ items:
+ - description: phandle to VSP instance that serves the DU channel
+ - description: Channel index identifying the LIF instance in that VSP
description:
A list of phandle and channel index tuples to the VSPs that handle the
- memory interfaces for the DU channels. The phandle identifies the VSP
- instance that serves the DU channel, and the channel index identifies
- the LIF instance in that VSP.
+ memory interfaces for the DU channels.
required:
- compatible
@@ -104,7 +110,6 @@ allOf:
properties:
clocks:
minItems: 1
- maxItems: 3
items:
- description: Functional clock
- description: DU_DOTCLKIN0 input clock
@@ -112,7 +117,6 @@ allOf:
clock-names:
minItems: 1
- maxItems: 3
items:
- const: du.0
- pattern: '^dclkin\.[01]$'
@@ -154,7 +158,6 @@ allOf:
properties:
clocks:
minItems: 2
- maxItems: 4
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -163,7 +166,6 @@ allOf:
clock-names:
minItems: 2
- maxItems: 4
items:
- const: du.0
- const: du.1
@@ -211,7 +213,6 @@ allOf:
properties:
clocks:
minItems: 2
- maxItems: 4
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -220,7 +221,6 @@ allOf:
clock-names:
minItems: 2
- maxItems: 4
items:
- const: du.0
- const: du.1
@@ -266,7 +266,6 @@ allOf:
properties:
clocks:
minItems: 2
- maxItems: 4
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -275,7 +274,6 @@ allOf:
clock-names:
minItems: 2
- maxItems: 4
items:
- const: du.0
- const: du.1
@@ -322,7 +320,6 @@ allOf:
properties:
clocks:
minItems: 2
- maxItems: 4
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -331,7 +328,6 @@ allOf:
clock-names:
minItems: 2
- maxItems: 4
items:
- const: du.0
- const: du.1
@@ -381,7 +377,6 @@ allOf:
properties:
clocks:
minItems: 3
- maxItems: 6
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -392,7 +387,6 @@ allOf:
clock-names:
minItems: 3
- maxItems: 6
items:
- const: du.0
- const: du.1
@@ -443,7 +437,6 @@ allOf:
properties:
clocks:
minItems: 4
- maxItems: 8
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -456,7 +449,6 @@ allOf:
clock-names:
minItems: 4
- maxItems: 8
items:
- const: du.0
- const: du.1
@@ -520,7 +512,6 @@ allOf:
properties:
clocks:
minItems: 3
- maxItems: 6
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -531,7 +522,6 @@ allOf:
clock-names:
minItems: 3
- maxItems: 6
items:
- const: du.0
- const: du.1
@@ -591,7 +581,6 @@ allOf:
properties:
clocks:
minItems: 3
- maxItems: 6
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -602,7 +591,6 @@ allOf:
clock-names:
minItems: 3
- maxItems: 6
items:
- const: du.0
- const: du.1
@@ -661,14 +649,12 @@ allOf:
properties:
clocks:
minItems: 1
- maxItems: 2
items:
- description: Functional clock for DU0
- description: DU_DOTCLKIN0 input clock
clock-names:
minItems: 1
- maxItems: 2
items:
- const: du.0
- const: dclkin.0
@@ -718,7 +704,6 @@ allOf:
properties:
clocks:
minItems: 2
- maxItems: 4
items:
- description: Functional clock for DU0
- description: Functional clock for DU1
@@ -727,7 +712,6 @@ allOf:
clock-names:
minItems: 2
- maxItems: 4
items:
- const: du.0
- const: du.1
@@ -773,6 +757,56 @@ allOf:
- reset-names
- renesas,vsps
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - renesas,du-r8a779a0
+ - renesas,du-r8a779g0
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Functional clock
+
+ clock-names:
+ items:
+ - const: du.0
+
+ interrupts:
+ maxItems: 2
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ items:
+ - const: du.0
+
+ ports:
+ properties:
+ port@0:
+ description: DSI 0
+ port@1:
+ description: DSI 1
+ port@2: false
+ port@3: false
+
+ required:
+ - port@0
+ - port@1
+
+ renesas,vsps:
+ minItems: 2
+
+ required:
+ - clock-names
+ - interrupts
+ - resets
+ - reset-names
+ - renesas,vsps
+
additionalProperties: false
examples:
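For the new V3U/V4H entries, the added conditional pins the DU down to a single functional clock, one reset, two DSI output ports, and at least two VSP tuples; renesas,vsps itself is now schema-described as <phandle channel> pairs. A sketch of an R-Car V3U node under those constraints (addresses, clock/reset specifiers, interrupt numbers and labels all hypothetical):

    display@feb00000 {
        compatible = "renesas,du-r8a779a0";
        reg = <0xfeb00000 0x40000>;
        interrupts = <GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 257 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&cpg CPG_MOD 411>;
        clock-names = "du.0";
        resets = <&cpg 411>;
        reset-names = "du.0";
        renesas,vsps = <&vspd0 0>, <&vspd1 0>;  /* one <phandle channel> tuple per DU channel */

        ports {
            #address-cells = <1>;
            #size-cells = <0>;

            port@0 {
                reg = <0>;
                du_out_dsi0: endpoint {
                    remote-endpoint = <&dsi0_in>;
                };
            };
            port@1 {
                reg = <1>;
                du_out_dsi1: endpoint {
                    remote-endpoint = <&dsi1_in>;
                };
            };
        };
    };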
diff --git a/Documentation/devicetree/bindings/display/rockchip/analogix_dp-rockchip.txt b/Documentation/devicetree/bindings/display/rockchip/analogix_dp-rockchip.txt
deleted file mode 100644
index 43561584c13a..000000000000
--- a/Documentation/devicetree/bindings/display/rockchip/analogix_dp-rockchip.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-Rockchip RK3288 specific extensions to the Analogix Display Port
-================================
-
-Required properties:
-- compatible: "rockchip,rk3288-dp",
- "rockchip,rk3399-edp";
-
-- reg: physical base address of the controller and length
-
-- clocks: from common clock binding: handle to dp clock.
- of memory mapped region.
-
-- clock-names: from common clock binding:
- Required elements: "dp" "pclk"
-
-- resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
-
-- pinctrl-names: Names corresponding to the chip hotplug pinctrl states.
-- pinctrl-0: pin-control mode. should be <&edp_hpd>
-
-- reset-names: Must include the name "dp"
-
-- rockchip,grf: this soc should set GRF regs, so need get grf here.
-
-- ports: there are 2 port nodes with endpoint definitions as defined in
- Documentation/devicetree/bindings/media/video-interfaces.txt.
- Port 0: contained 2 endpoints, connecting to the output of vop.
- Port 1: contained 1 endpoint, connecting to the input of panel.
-
-Optional property for different chips:
-- clocks: from common clock binding: handle to grf_vio clock.
-
-- clock-names: from common clock binding:
- Required elements: "grf"
-
-For the below properties, please refer to Analogix DP binding document:
- * Documentation/devicetree/bindings/display/bridge/analogix_dp.txt
-- phys (required)
-- phy-names (required)
-- hpd-gpios (optional)
-- force-hpd (optional)
--------------------------------------------------------------------------------
-
-Example:
- dp-controller: dp@ff970000 {
- compatible = "rockchip,rk3288-dp";
- reg = <0xff970000 0x4000>;
- interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cru SCLK_EDP>, <&cru PCLK_EDP_CTRL>;
- clock-names = "dp", "pclk";
- phys = <&dp_phy>;
- phy-names = "dp";
-
- rockchip,grf = <&grf>;
- resets = <&cru 111>;
- reset-names = "dp";
-
- pinctrl-names = "default";
- pinctrl-0 = <&edp_hpd>;
-
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
- edp_in: port@0 {
- reg = <0>;
- #address-cells = <1>;
- #size-cells = <0>;
- edp_in_vopb: endpoint@0 {
- reg = <0>;
- remote-endpoint = <&vopb_out_edp>;
- };
- edp_in_vopl: endpoint@1 {
- reg = <1>;
- remote-endpoint = <&vopl_out_edp>;
- };
- };
-
- edp_out: port@1 {
- reg = <1>;
- #address-cells = <1>;
- #size-cells = <0>;
- edp_out_panel: endpoint {
- reg = <0>;
- remote-endpoint = <&panel_in_edp>
- };
- };
- };
- };
-
- pinctrl {
- edp {
- edp_hpd: edp-hpd {
- rockchip,pins = <7 11 RK_FUNC_2 &pcfg_pull_none>;
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt b/Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt
deleted file mode 100644
index 39792f051d2d..000000000000
--- a/Documentation/devicetree/bindings/display/rockchip/dw_mipi_dsi_rockchip.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Rockchip specific extensions to the Synopsys Designware MIPI DSI
-================================
-
-Required properties:
-- #address-cells: Should be <1>.
-- #size-cells: Should be <0>.
-- compatible: one of
- "rockchip,px30-mipi-dsi", "snps,dw-mipi-dsi"
- "rockchip,rk3288-mipi-dsi", "snps,dw-mipi-dsi"
- "rockchip,rk3399-mipi-dsi", "snps,dw-mipi-dsi"
-- reg: Represent the physical address range of the controller.
-- interrupts: Represent the controller's interrupt to the CPU(s).
-- clocks, clock-names: Phandles to the controller's pll reference
- clock(ref) when using an internal dphy and APB clock(pclk).
- For RK3399, a phy config clock (phy_cfg) and a grf clock(grf)
- are required. As described in [1].
-- rockchip,grf: this soc should set GRF regs to mux vopl/vopb.
-- ports: contain a port node with endpoint definitions as defined in [2].
- For vopb,set the reg = <0> and set the reg = <1> for vopl.
-- video port 0 for the VOP input, the remote endpoint maybe vopb or vopl
-- video port 1 for either a panel or subsequent encoder
-
-Optional properties:
-- phys: from general PHY binding: the phandle for the PHY device.
-- phy-names: Should be "dphy" if phys references an external phy.
-- #phy-cells: Defined when used as ISP phy, should be 0.
-- power-domains: a phandle to mipi dsi power domain node.
-- resets: list of phandle + reset specifier pairs, as described in [3].
-- reset-names: string reset name, must be "apb".
-
-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
-[2] Documentation/devicetree/bindings/media/video-interfaces.txt
-[3] Documentation/devicetree/bindings/reset/reset.txt
-
-Example:
- mipi_dsi: mipi@ff960000 {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "rockchip,rk3288-mipi-dsi", "snps,dw-mipi-dsi";
- reg = <0xff960000 0x4000>;
- interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cru SCLK_MIPI_24M>, <&cru PCLK_MIPI_DSI0>;
- clock-names = "ref", "pclk";
- resets = <&cru SRST_MIPIDSI0>;
- reset-names = "apb";
- rockchip,grf = <&grf>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- mipi_in: port@0 {
- reg = <0>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- mipi_in_vopb: endpoint@0 {
- reg = <0>;
- remote-endpoint = <&vopb_out_mipi>;
- };
- mipi_in_vopl: endpoint@1 {
- reg = <1>;
- remote-endpoint = <&vopl_out_mipi>;
- };
- };
-
- mipi_out: port@1 {
- reg = <1>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- mipi_out_panel: endpoint {
- remote-endpoint = <&panel_in_mipi>;
- };
- };
- };
-
- panel {
- compatible ="boe,tv080wum-nl0";
- reg = <0>;
-
- enable-gpios = <&gpio7 3 GPIO_ACTIVE_HIGH>;
- pinctrl-names = "default";
- pinctrl-0 = <&lcd_en>;
- backlight = <&backlight>;
-
- port {
- panel_in_mipi: endpoint {
- remote-endpoint = <&mipi_out_panel>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml
new file mode 100644
index 000000000000..60dedf9b2be7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,analogix-dp.yaml
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/rockchip/rockchip,analogix-dp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip specific extensions to the Analogix Display Port
+
+maintainers:
+ - Sandy Huang <hjc@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3288-dp
+ - rockchip,rk3399-edp
+
+ clocks:
+ minItems: 2
+ maxItems: 3
+
+ clock-names:
+ minItems: 2
+ items:
+ - const: dp
+ - const: pclk
+ - const: grf
+
+ power-domains:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: dp
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ This SoC makes use of GRF regs.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - rockchip,grf
+
+allOf:
+ - $ref: /schemas/display/bridge/analogix,dp.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/rk3288-cru.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ dp@ff970000 {
+ compatible = "rockchip,rk3288-dp";
+ reg = <0xff970000 0x4000>;
+ interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cru SCLK_EDP>, <&cru PCLK_EDP_CTRL>;
+ clock-names = "dp", "pclk";
+ phys = <&dp_phy>;
+ phy-names = "dp";
+ resets = <&cru 111>;
+ reset-names = "dp";
+ rockchip,grf = <&grf>;
+ pinctrl-0 = <&edp_hpd>;
+ pinctrl-names = "default";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ edp_in: port@0 {
+ reg = <0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ edp_in_vopb: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&vopb_out_edp>;
+ };
+ edp_in_vopl: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&vopl_out_edp>;
+ };
+ };
+
+ edp_out: port@1 {
+ reg = <1>;
+
+ edp_out_panel: endpoint {
+ remote-endpoint = <&panel_in_edp>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-hdmi.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-hdmi.yaml
index da3b889ad8fc..7e59dee15a5f 100644
--- a/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-hdmi.yaml
@@ -23,10 +23,22 @@ properties:
- rockchip,rk3288-dw-hdmi
- rockchip,rk3328-dw-hdmi
- rockchip,rk3399-dw-hdmi
+ - rockchip,rk3568-dw-hdmi
reg-io-width:
const: 4
+ avdd-0v9-supply:
+ description:
+ A 0.9V supply that powers up the SoC internal circuitry. The actual pin name
+ varies between the different SoCs and is usually HDMI_TX_AVDD_0V9 or sometimes
+ HDMI_AVDD_1V0.
+
+ avdd-1v8-supply:
+ description:
+ A 1.8V supply that powers up the SoC internal circuitry. The pin name on the
+ SoC usually is HDMI_TX_AVDD_1V8.
+
clocks:
minItems: 2
items:
@@ -36,7 +48,8 @@ properties:
# order when present.
- description: The HDMI CEC controller main clock
- description: Power for GRF IO
- - description: External clock for some HDMI PHY
+ - description: External clock for some HDMI PHY (old clock name, deprecated)
+ - description: External clock for some HDMI PHY (new name)
clock-names:
minItems: 2
@@ -47,10 +60,14 @@ properties:
- cec
- grf
- vpll
+ - ref
- enum:
- grf
- vpll
- - const: vpll
+ - ref
+ - enum:
+ - vpll
+ - ref
ddc-i2c-bus:
$ref: /schemas/types.yaml#/definitions/phandle
@@ -72,6 +89,7 @@ properties:
The unwedge pinctrl entry shall drive the DDC SDA line low. This is
intended to work around a hardware errata that can cause the DDC I2C
bus to be wedged.
+ minItems: 1
items:
- const: default
- const: unwedge
@@ -79,27 +97,21 @@ properties:
ports:
$ref: /schemas/graph.yaml#/properties/ports
- properties:
- port:
- $ref: /schemas/graph.yaml#/$defs/port-base
- unevaluatedProperties: false
+ patternProperties:
+ "^port(@0)?$":
+ $ref: /schemas/graph.yaml#/properties/port
description: Input of the DWC HDMI TX
-
properties:
+ endpoint:
+ description: Connection to the VOP
endpoint@0:
- $ref: /schemas/graph.yaml#/properties/endpoint
description: Connection to the VOPB
-
endpoint@1:
- $ref: /schemas/graph.yaml#/properties/endpoint
description: Connection to the VOPL
-
- required:
- - endpoint@0
- - endpoint@1
-
- required:
- - port
+ properties:
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: Output of the DWC HDMI TX
rockchip,grf:
$ref: /schemas/types.yaml#/definitions/phandle
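The reworked clock-names list keeps accepting the old "vpll" spelling while steering new device trees toward "ref". A sketch of an RK3568 node using the new name (register range, clock specifiers and regulator labels hypothetical; "iahb"/"isfr" are assumed from the base Synopsys DW HDMI binding):

    hdmi: hdmi@fe0a0000 {
        compatible = "rockchip,rk3568-dw-hdmi";
        reg = <0xfe0a0000 0x20000>;
        reg-io-width = <4>;
        clocks = <&cru PCLK_HDMI_HOST>, <&cru CLK_SYS_HDMI>, <&cru CLK_REF_HDMI>;
        clock-names = "iahb", "isfr", "ref";   /* "ref" replaces the deprecated "vpll" */
        avdd-0v9-supply = <&vdda0v9_image>;
        avdd-1v8-supply = <&vcca1v8_image>;
        rockchip,grf = <&grf>;
    };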
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-mipi-dsi.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-mipi-dsi.yaml
new file mode 100644
index 000000000000..8e8a40879140
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,dw-mipi-dsi.yaml
@@ -0,0 +1,166 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/rockchip/rockchip,dw-mipi-dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip specific extensions to the Synopsys Designware MIPI DSI
+
+maintainers:
+ - Sandy Huang <hjc@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - rockchip,px30-mipi-dsi
+ - rockchip,rk3288-mipi-dsi
+ - rockchip,rk3399-mipi-dsi
+ - rockchip,rk3568-mipi-dsi
+ - const: snps,dw-mipi-dsi
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 4
+
+ clock-names:
+ oneOf:
+ - minItems: 2
+ items:
+ - const: ref
+ - const: pclk
+ - const: phy_cfg
+ - const: grf
+ - const: pclk
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ This SoC uses GRF regs to switch between vopl/vopb.
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ const: dphy
+
+ "#phy-cells":
+ const: 0
+ description:
+ Defined when in use as ISP phy.
+
+ power-domains:
+ maxItems: 1
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - rockchip,grf
+
+allOf:
+ - $ref: /schemas/display/bridge/snps,dw-mipi-dsi.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - rockchip,px30-mipi-dsi
+ - rockchip,rk3568-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ maxItems: 1
+
+ required:
+ - phys
+ - phy-names
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3288-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ maxItems: 2
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3399-mipi-dsi
+
+ then:
+ properties:
+ clocks:
+ minItems: 4
+
+ clock-names:
+ minItems: 4
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/rk3288-cru.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ mipi_dsi: dsi@ff960000 {
+ compatible = "rockchip,rk3288-mipi-dsi", "snps,dw-mipi-dsi";
+ reg = <0xff960000 0x4000>;
+ interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cru SCLK_MIPIDSI_24M>, <&cru PCLK_MIPI_DSI0>;
+ clock-names = "ref", "pclk";
+ resets = <&cru SRST_MIPIDSI0>;
+ reset-names = "apb";
+ rockchip,grf = <&grf>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mipi_in: port@0 {
+ reg = <0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mipi_in_vopb: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&vopb_out_mipi>;
+ };
+ mipi_in_vopl: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&vopl_out_mipi>;
+ };
+ };
+
+ mipi_out: port@1 {
+ reg = <1>;
+
+ mipi_out_panel: endpoint {
+ remote-endpoint = <&panel_in_mipi>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,lvds.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,lvds.yaml
new file mode 100644
index 000000000000..03b002a05c47
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,lvds.yaml
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/rockchip/rockchip,lvds.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip low-voltage differential signal (LVDS) transmitter
+
+maintainers:
+ - Sandy Huang <hjc@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+ compatible:
+ enum:
+ - rockchip,px30-lvds
+ - rockchip,rk3288-lvds
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: pclk_lvds
+
+ avdd1v0-supply:
+ description: 1.0V analog power.
+
+ avdd1v8-supply:
+ description: 1.8V analog power.
+
+ avdd3v3-supply:
+ description: 3.3V analog power.
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: Phandle to the general register files syscon.
+
+ rockchip,output:
+ $ref: /schemas/types.yaml#/definitions/string
+ enum: [rgb, lvds, duallvds]
+ description: This describes the output interface.
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ const: dphy
+
+ pinctrl-names:
+ const: lcdc
+
+ pinctrl-0: true
+
+ power-domains:
+ maxItems: 1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port 0 for the VOP input.
+ The remote endpoint maybe vopb or vopl.
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Video port 1 for either a panel or subsequent encoder.
+
+ required:
+ - port@0
+ - port@1
+
+required:
+ - compatible
+ - rockchip,grf
+ - rockchip,output
+ - ports
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,px30-lvds
+
+ then:
+ properties:
+ reg: false
+ clocks: false
+ clock-names: false
+ avdd1v0-supply: false
+ avdd1v8-supply: false
+ avdd3v3-supply: false
+
+ required:
+ - phys
+ - phy-names
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3288-lvds
+
+ then:
+ properties:
+ phys: false
+ phy-names: false
+
+ required:
+ - reg
+ - clocks
+ - clock-names
+ - avdd1v0-supply
+ - avdd1v8-supply
+ - avdd3v3-supply
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/rk3288-cru.h>
+
+ lvds: lvds@ff96c000 {
+ compatible = "rockchip,rk3288-lvds";
+ reg = <0xff96c000 0x4000>;
+ clocks = <&cru PCLK_LVDS_PHY>;
+ clock-names = "pclk_lvds";
+ avdd1v0-supply = <&vdd10_lcd>;
+ avdd1v8-supply = <&vcc18_lcd>;
+ avdd3v3-supply = <&vcca_33>;
+ pinctrl-names = "lcdc";
+ pinctrl-0 = <&lcdc_ctl>;
+ rockchip,grf = <&grf>;
+ rockchip,output = "rgb";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ lvds_in: port@0 {
+ reg = <0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ lvds_in_vopb: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&vopb_out_lvds>;
+ };
+ lvds_in_vopl: endpoint@1 {
+ reg = <1>;
+ remote-endpoint = <&vopl_out_lvds>;
+ };
+ };
+
+ lvds_out: port@1 {
+ reg = <1>;
+
+ lvds_out_panel: endpoint {
+ remote-endpoint = <&panel_in_lvds>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml
index 008c144257cb..1a68a940d165 100644
--- a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml
@@ -26,14 +26,6 @@ properties:
clock-names:
const: hclk
- pinctrl-0:
- maxItems: 2
-
- pinctrl-names:
- const: default
- description:
- Switch the iomux for the HPD/I2C pins to HDMI function.
-
power-domains:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip-drm.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip-drm.yaml
index 7204da5eb4c5..a8d18a37cb23 100644
--- a/Documentation/devicetree/bindings/display/rockchip/rockchip-drm.yaml
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip-drm.yaml
@@ -21,6 +21,8 @@ properties:
ports:
$ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ maxItems: 1
description: |
Should contain a list of phandles pointing to display interface port
of vop devices. vop definitions as defined in
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip-lvds.txt b/Documentation/devicetree/bindings/display/rockchip/rockchip-lvds.txt
deleted file mode 100644
index aaf8c44cf90f..000000000000
--- a/Documentation/devicetree/bindings/display/rockchip/rockchip-lvds.txt
+++ /dev/null
@@ -1,92 +0,0 @@
-Rockchip RK3288 LVDS interface
-================================
-
-Required properties:
-- compatible: matching the soc type, one of
- - "rockchip,rk3288-lvds";
- - "rockchip,px30-lvds";
-
-- reg: physical base address of the controller and length
- of memory mapped region.
-- clocks: must include clock specifiers corresponding to entries in the
- clock-names property.
-- clock-names: must contain "pclk_lvds"
-
-- avdd1v0-supply: regulator phandle for 1.0V analog power
-- avdd1v8-supply: regulator phandle for 1.8V analog power
-- avdd3v3-supply: regulator phandle for 3.3V analog power
-
-- rockchip,grf: phandle to the general register files syscon
-- rockchip,output: "rgb", "lvds" or "duallvds", This describes the output interface
-
-- phys: LVDS/DSI DPHY (px30 only)
-- phy-names: name of the PHY, must be "dphy" (px30 only)
-
-Optional properties:
-- pinctrl-names: must contain a "lcdc" entry.
-- pinctrl-0: pin control group to be used for this controller.
-
-Required nodes:
-
-The lvds has two video ports as described by
- Documentation/devicetree/bindings/media/video-interfaces.txt
-Their connections are modeled using the OF graph bindings specified in
- Documentation/devicetree/bindings/graph.txt.
-
-- video port 0 for the VOP input, the remote endpoint maybe vopb or vopl
-- video port 1 for either a panel or subsequent encoder
-
-Example:
-
-lvds_panel: lvds-panel {
- compatible = "auo,b101ean01";
- enable-gpios = <&gpio7 21 GPIO_ACTIVE_HIGH>;
- data-mapping = "jeida-24";
-
- ports {
- panel_in_lvds: endpoint {
- remote-endpoint = <&lvds_out_panel>;
- };
- };
-};
-
-For Rockchip RK3288:
-
- lvds: lvds@ff96c000 {
- compatible = "rockchip,rk3288-lvds";
- rockchip,grf = <&grf>;
- reg = <0xff96c000 0x4000>;
- clocks = <&cru PCLK_LVDS_PHY>;
- clock-names = "pclk_lvds";
- pinctrl-names = "lcdc";
- pinctrl-0 = <&lcdc_ctl>;
- avdd1v0-supply = <&vdd10_lcd>;
- avdd1v8-supply = <&vcc18_lcd>;
- avdd3v3-supply = <&vcca_33>;
- rockchip,output = "rgb";
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- lvds_in: port@0 {
- reg = <0>;
-
- lvds_in_vopb: endpoint@0 {
- reg = <0>;
- remote-endpoint = <&vopb_out_lvds>;
- };
- lvds_in_vopl: endpoint@1 {
- reg = <1>;
- remote-endpoint = <&vopl_out_lvds>;
- };
- };
-
- lvds_out: port@1 {
- reg = <1>;
-
- lvds_out_panel: endpoint {
- remote-endpoint = <&panel_in_lvds>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop2.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop2.yaml
new file mode 100644
index 000000000000..fba45091d909
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop2.yaml
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/rockchip/rockchip-vop2.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip SoC display controller (VOP2)
+
+description:
+ VOP2 (Video Output Processor v2) is the display controller for the Rockchip
+ series of SoCs which transfers the image data from a video memory
+ buffer to an external LCD interface.
+
+maintainers:
+ - Sandy Huang <hjc@rock-chips.com>
+ - Heiko Stuebner <heiko@sntech.de>
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3566-vop
+ - rockchip,rk3568-vop
+
+ reg:
+ items:
+ - description:
+ Must contain one entry corresponding to the base address and length
+ of the register space.
+ - description:
+ Can optionally contain a second entry corresponding to
+ the CRTC gamma LUT address.
+
+ reg-names:
+ items:
+ - const: vop
+ - const: gamma-lut
+
+ interrupts:
+ maxItems: 1
+ description:
+ The VOP interrupt is shared by several interrupt sources, such as
+ frame start (VSYNC), line flag and other status interrupts.
+
+ clocks:
+ items:
+ - description: Clock for ddr buffer transfer.
+ - description: Clock for the ahb bus to R/W the phy regs.
+ - description: Pixel clock for video port 0.
+ - description: Pixel clock for video port 1.
+ - description: Pixel clock for video port 2.
+
+ clock-names:
+ items:
+ - const: aclk
+ - const: hclk
+ - const: dclk_vp0
+ - const: dclk_vp1
+ - const: dclk_vp2
+
+ rockchip,grf:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to GRF regs used for misc control
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Output endpoint of VP0
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Output endpoint of VP1
+
+ port@2:
+ $ref: /schemas/graph.yaml#/properties/port
+ description:
+ Output endpoint of VP2
+
+ iommus:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - interrupts
+ - clocks
+ - clock-names
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/rk3568-cru.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/rk3568-power.h>
+ bus {
+ #address-cells = <2>;
+ #size-cells = <2>;
+ vop: vop@fe040000 {
+ compatible = "rockchip,rk3568-vop";
+ reg = <0x0 0xfe040000 0x0 0x3000>, <0x0 0xfe044000 0x0 0x1000>;
+ reg-names = "vop", "gamma-lut";
+ interrupts = <GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cru ACLK_VOP>,
+ <&cru HCLK_VOP>,
+ <&cru DCLK_VOP0>,
+ <&cru DCLK_VOP1>,
+ <&cru DCLK_VOP2>;
+ clock-names = "aclk",
+ "hclk",
+ "dclk_vp0",
+ "dclk_vp1",
+ "dclk_vp2";
+ power-domains = <&power RK3568_PD_VO>;
+ iommus = <&vop_mmu>;
+ vop_out: ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ vp0: port@0 {
+ reg = <0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+ vp1: port@1 {
+ reg = <1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+ vp2: port@2 {
+ reg = <2>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi-ddc.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi-ddc.yaml
new file mode 100644
index 000000000000..458d399cb025
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi-ddc.yaml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,exynos-hdmi-ddc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos SoC HDMI DDC
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ oneOf:
+ - const: samsung,exynos4210-hdmiddc
+ - const: samsung,exynos5-hdmiddc
+ deprecated: true
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ddc@50 {
+ compatible = "samsung,exynos4210-hdmiddc";
+ reg = <0x50>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi.yaml
new file mode 100644
index 000000000000..e4a68c5a1a09
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,exynos-hdmi.yaml
@@ -0,0 +1,226 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,exynos-hdmi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos SoC HDMI
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos4210-hdmi
+ - samsung,exynos4212-hdmi
+ - samsung,exynos5420-hdmi
+ - samsung,exynos5433-hdmi
+
+ clocks:
+ minItems: 5
+ maxItems: 10
+
+ clock-names:
+ minItems: 5
+ maxItems: 10
+
+ ddc:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the HDMI DDC node.
+
+ hdmi-en-supply:
+ description:
+ Provides the voltage source for the DDC lines available on the HDMI
+ connector. When no power is provided for the DDC EEPROM, some TV sets
+ do not pull up the HPD (hot plug detect) line, which causes the HDMI
+ block to stay turned off. When provided, the regulator allows the TV
+ set to correctly signal the HPD event.
+
+ hpd-gpios:
+ maxItems: 1
+ description:
+ A GPIO line connected to HPD
+
+ interrupts:
+ maxItems: 1
+
+ phy:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: Phandle to the HDMI PHY node.
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description:
+ Contains a port which is connected to the mic node.
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+ samsung,syscon-phandle:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the PMU system controller node.
+
+ samsung,sysreg-phandle:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to DISP system controller interface.
+
+ '#sound-dai-cells':
+ const: 0
+
+ vdd-supply:
+ description:
+ VDD 1.0V HDMI TX.
+
+ vdd_osc-supply:
+ description:
+ VDD 1.8V HDMI OSC.
+
+ vdd_pll-supply:
+ description:
+ VDD 1.0V HDMI PLL.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - ddc
+ - hpd-gpios
+ - interrupts
+ - phy
+ - reg
+ - samsung,syscon-phandle
+ - '#sound-dai-cells'
+ - vdd-supply
+ - vdd_osc-supply
+ - vdd_pll-supply
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5433-hdmi
+ then:
+ properties:
+ clocks:
+ items:
+ - description: Gate of HDMI IP APB bus.
+ - description: Gate of HDMI-PHY IP APB bus.
+ - description: Gate of HDMI TMDS clock.
+ - description: Gate of HDMI pixel clock.
+ - description: TMDS clock generated by HDMI-PHY.
+ - description: MUX used to switch between oscclk and tmds_clko,
+ depending on whether the HDMI-PHY is off or operational.
+ - description: Pixel clock generated by HDMI-PHY.
+ - description: MUX used to switch between oscclk and pixel_clko,
+ depending on whether the HDMI-PHY is off or operational.
+ - description: Oscillator clock, used as the parent of the following
+ *_user clocks when the HDMI-PHY is not operational.
+ - description: Gate of HDMI SPDIF clock.
+ clock-names:
+ items:
+ - const: hdmi_pclk
+ - const: hdmi_i_pclk
+ - const: i_tmds_clk
+ - const: i_pixel_clk
+ - const: tmds_clko
+ - const: tmds_clko_user
+ - const: pixel_clko
+ - const: pixel_clko_user
+ - const: oscclk
+ - const: i_spdif_clk
+ required:
+ - samsung,sysreg-phandle
+ else:
+ properties:
+ clocks:
+ items:
+ - description: Gate of HDMI IP bus clock.
+ - description: Gate of HDMI special clock.
+ - description: Pixel special clock, one of the two possible inputs
+ of HDMI clock mux.
+ - description: HDMI PHY clock output, one of two possible inputs of
+ HDMI clock mux.
+ - description: HDMI mux clock, required by the driver to switch
+ between its two parents, i.e. sclk_pixel and sclk_hdmiphy. If the
+ HDMI PHY is stable after configuration, the parent is set to
+ sclk_hdmiphy, otherwise to sclk_pixel.
+ clock-names:
+ items:
+ - const: hdmi
+ - const: sclk_hdmi
+ - const: sclk_pixel
+ - const: sclk_hdmiphy
+ - const: mout_hdmi
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5433.h>
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ hdmi@13970000 {
+ compatible = "samsung,exynos5433-hdmi";
+ reg = <0x13970000 0x70000>;
+ interrupts = <GIC_SPI 208 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cmu_disp CLK_PCLK_HDMI>,
+ <&cmu_disp CLK_PCLK_HDMIPHY>,
+ <&cmu_disp CLK_PHYCLK_HDMIPHY_TMDS_CLKO>,
+ <&cmu_disp CLK_PHYCLK_HDMI_PIXEL>,
+ <&cmu_disp CLK_PHYCLK_HDMIPHY_TMDS_CLKO_PHY>,
+ <&cmu_disp CLK_MOUT_PHYCLK_HDMIPHY_TMDS_CLKO_USER>,
+ <&cmu_disp CLK_PHYCLK_HDMIPHY_PIXEL_CLKO_PHY>,
+ <&cmu_disp CLK_MOUT_PHYCLK_HDMIPHY_PIXEL_CLKO_USER>,
+ <&xxti>,
+ <&cmu_disp CLK_SCLK_HDMI_SPDIF>;
+ clock-names = "hdmi_pclk",
+ "hdmi_i_pclk",
+ "i_tmds_clk",
+ "i_pixel_clk",
+ "tmds_clko",
+ "tmds_clko_user",
+ "pixel_clko",
+ "pixel_clko_user",
+ "oscclk",
+ "i_spdif_clk";
+ phy = <&hdmiphy>;
+ ddc = <&hsi2c_11>;
+ samsung,syscon-phandle = <&pmu_system_controller>;
+ samsung,sysreg-phandle = <&syscon_disp>;
+ #sound-dai-cells = <0>;
+
+ hpd-gpios = <&gpa3 0 GPIO_ACTIVE_HIGH>;
+ vdd-supply = <&ldo6_reg>;
+ vdd_osc-supply = <&ldo7_reg>;
+ vdd_pll-supply = <&ldo6_reg>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ hdmi_to_tv: endpoint {
+ remote-endpoint = <&tv_to_hdmi>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ hdmi_to_mhl: endpoint {
+ remote-endpoint = <&mhl_to_hdmi>;
+ };
+ };
+ };
+ };
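
Note: the example above omits the optional hdmi-en-supply regulator described earlier. On a board where the DDC EEPROM needs its own supply, it would sit alongside the other regulator properties; a minimal sketch, with a purely illustrative regulator label:

    hdmi-en-supply = <&vdd_hdmi_5v0>;
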
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,exynos-mixer.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,exynos-mixer.yaml
new file mode 100644
index 000000000000..25d53fde92e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,exynos-mixer.yaml
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,exynos-mixer.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos SoC Mixer
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description:
+ Samsung Exynos SoC Mixer is responsible for mixing and blending multiple data
+ inputs before passing them to an output device. The output is passed to HDMI.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - samsung,exynos4210-mixer
+ - samsung,exynos4212-mixer
+ - samsung,exynos5250-mixer
+ - samsung,exynos5420-mixer
+ - const: samsung,exynos5-mixer
+ deprecated: true
+
+ clocks:
+ minItems: 3
+ items:
+ - description: Gate of Mixer IP bus clock.
+ - description: Gate of HDMI IP bus clock, needed together with sclk_hdmi.
+ - description: HDMI Special clock, one of the two possible inputs of
+ mixer mux.
+ - description: Video Processor clock.
+ - description: Mixer mux clock.
+ - description: Mixer Special clock.
+
+ clock-names:
+ minItems: 3
+ items:
+ - const: mixer
+ - const: hdmi
+ - const: sclk_hdmi
+ - const: vp
+ - const: mout_mixer
+ - const: sclk_mixer
+
+ interconnects:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ minItems: 1
+ items:
+ - description: Mixer memory region.
+ - description: Video Processor memory region.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - interrupts
+ - reg
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos4210-mixer
+ then:
+ properties:
+ clocks:
+ minItems: 6
+ maxItems: 6
+ reg:
+ minItems: 2
+ maxItems: 2
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos4212-mixer
+ then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+ reg:
+ minItems: 2
+ maxItems: 2
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - samsung,exynos5-mixer
+ - samsung,exynos5250-mixer
+ - samsung,exynos5420-mixer
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+ reg:
+ minItems: 1
+ maxItems: 1
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5250.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ mixer@14450000 {
+ compatible = "samsung,exynos5250-mixer";
+ reg = <0x14450000 0x10000>;
+ interrupts = <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&clock CLK_MIXER>,
+ <&clock CLK_HDMI>,
+ <&clock CLK_SCLK_HDMI>;
+ clock-names = "mixer",
+ "hdmi",
+ "sclk_hdmi";
+ iommus = <&sysmmu_tv>;
+ power-domains = <&pd_disp1>;
+ };
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-decon.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-decon.yaml
new file mode 100644
index 000000000000..6380eeebb073
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-decon.yaml
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,exynos5433-decon.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos5433 SoC Display and Enhancement Controller (DECON)
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description: |
+ DECON (Display and Enhancement Controller) is the Display Controller for the
+ Exynos5433 series of SoCs which transfers the image data from a video memory
+ buffer to an external LCD interface.
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos5433-decon
+ - samsung,exynos5433-decon-tv
+
+ clocks:
+ maxItems: 11
+
+ clock-names:
+ items:
+ - const: pclk
+ - const: aclk_decon
+ - const: aclk_smmu_decon0x
+ - const: aclk_xiu_decon0x
+ - const: pclk_smmu_decon0x
+ - const: aclk_smmu_decon1x
+ - const: aclk_xiu_decon1x
+ - const: pclk_smmu_decon1x
+ - const: sclk_decon_vclk
+ - const: sclk_decon_eclk
+ - const: dsd
+
+ interrupts:
+ minItems: 3
+ maxItems: 4
+ description: |
+ Interrupts depend on the mode of operation:
+ - video mode: vsync
+ - command mode: lcd_sys
+ - command mode with software trigger: lcd_sys, te
+
+ interrupt-names:
+ minItems: 3
+ items:
+ - const: fifo
+ - const: vsync
+ - const: lcd_sys
+ - const: te
+
+ iommus:
+ maxItems: 2
+
+ iommu-names:
+ items:
+ - const: m0
+ - const: m1
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description:
+ Contains a port which is connected to the mic node.
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+ samsung,disp-sysreg:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to DISP system controller interface.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - interrupts
+ - interrupt-names
+ - ports
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5433.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ display-controller@13800000 {
+ compatible = "samsung,exynos5433-decon";
+ reg = <0x13800000 0x2104>;
+ clocks = <&cmu_disp CLK_PCLK_DECON>,
+ <&cmu_disp CLK_ACLK_DECON>,
+ <&cmu_disp CLK_ACLK_SMMU_DECON0X>,
+ <&cmu_disp CLK_ACLK_XIU_DECON0X>,
+ <&cmu_disp CLK_PCLK_SMMU_DECON0X>,
+ <&cmu_disp CLK_ACLK_SMMU_DECON1X>,
+ <&cmu_disp CLK_ACLK_XIU_DECON1X>,
+ <&cmu_disp CLK_PCLK_SMMU_DECON1X>,
+ <&cmu_disp CLK_SCLK_DECON_VCLK>,
+ <&cmu_disp CLK_SCLK_DECON_ECLK>,
+ <&cmu_disp CLK_SCLK_DSD>;
+ clock-names = "pclk",
+ "aclk_decon",
+ "aclk_smmu_decon0x",
+ "aclk_xiu_decon0x",
+ "pclk_smmu_decon0x",
+ "aclk_smmu_decon1x",
+ "aclk_xiu_decon1x",
+ "pclk_smmu_decon1x",
+ "sclk_decon_vclk",
+ "sclk_decon_eclk",
+ "dsd";
+ power-domains = <&pd_disp>;
+ interrupt-names = "fifo", "vsync", "lcd_sys";
+ interrupts = <GIC_SPI 201 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 202 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 203 IRQ_TYPE_LEVEL_HIGH>;
+ samsung,disp-sysreg = <&syscon_disp>;
+ iommus = <&sysmmu_decon0x>, <&sysmmu_decon1x>;
+ iommu-names = "m0", "m1";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ decon_to_mic: endpoint {
+ remote-endpoint = <&mic_to_decon>;
+ };
+ };
+ };
+ };
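
For panels driven in command mode with a software trigger, the binding above additionally expects the te interrupt; a sketch of the four-interrupt variant (the fourth GIC number is illustrative, not taken from a datasheet):

    interrupt-names = "fifo", "vsync", "lcd_sys", "te";
    interrupts = <GIC_SPI 201 IRQ_TYPE_LEVEL_HIGH>,
                 <GIC_SPI 202 IRQ_TYPE_LEVEL_HIGH>,
                 <GIC_SPI 203 IRQ_TYPE_LEVEL_HIGH>,
                 <GIC_SPI 204 IRQ_TYPE_LEVEL_HIGH>;
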
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-mic.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-mic.yaml
new file mode 100644
index 000000000000..26e5017737a3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,exynos5433-mic.yaml
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,exynos5433-mic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos5433 SoC Mobile Image Compressor (MIC)
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description: |
+ MIC (Mobile Image Compressor) resides between DECON and MIPI DSI. MIPI DSI is
+ not capable of transferring frame data at as high a resolution as DECON can
+ send. MIC solves this problem by compressing the frame data by half before it
+ is transferred through MIPI DSI. The compressed frame data must be
+ decompressed on the panel PCB.
+
+properties:
+ compatible:
+ const: samsung,exynos5433-mic
+
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: pclk_mic0
+ - const: sclk_rgb_vclk_to_mic0
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description:
+ Contains ports which are connected to the DECON and DSI nodes.
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+ samsung,disp-syscon:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to DISP system controller interface.
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - ports
+ - reg
+ - samsung,disp-syscon
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos5433.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ image-processor@13930000 {
+ compatible = "samsung,exynos5433-mic";
+ reg = <0x13930000 0x48>;
+ clocks = <&cmu_disp CLK_PCLK_MIC0>,
+ <&cmu_disp CLK_SCLK_RGB_VCLK_TO_MIC0>;
+ clock-names = "pclk_mic0",
+ "sclk_rgb_vclk_to_mic0";
+ power-domains = <&pd_disp>;
+ samsung,disp-syscon = <&syscon_disp>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ mic_to_decon: endpoint {
+ remote-endpoint = <&decon_to_mic>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ mic_to_dsi: endpoint {
+ remote-endpoint = <&dsi_to_mic>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,exynos7-decon.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,exynos7-decon.yaml
new file mode 100644
index 000000000000..992c23ca7a4e
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,exynos7-decon.yaml
@@ -0,0 +1,119 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,exynos7-decon.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos7 SoC Display and Enhancement Controller (DECON)
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description: |
+ DECON (Display and Enhancement Controller) is the Display Controller for the
+ Exynos7 series of SoCs which transfers the image data from a video memory
+ buffer to an external LCD interface.
+
+properties:
+ compatible:
+ const: samsung,exynos7-decon
+
+ clocks:
+ maxItems: 4
+
+ clock-names:
+ items:
+ - const: pclk_decon0
+ - const: aclk_decon0
+ - const: decon0_eclk
+ - const: decon0_vclk
+
+ display-timings:
+ $ref: ../panel/display-timings.yaml#
+
+ i80-if-timings:
+ type: object
+ additionalProperties: false
+ description: Timing configuration for the LCD i80 interface.
+ properties:
+ cs-setup:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles from when the address signal is enabled until
+ chip select is enabled.
+ default: 0
+
+ wr-active:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles during which chip select stays active.
+ default: 1
+
+ wr-hold:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles from when chip select is disabled until the
+ write signal is disabled.
+ default: 0
+
+ wr-setup:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles from when the chip select signal is enabled
+ until the write signal is enabled.
+ default: 0
+
+ interrupts:
+ items:
+ - description: FIFO level
+ - description: VSYNC
+ - description: LCD system
+
+ interrupt-names:
+ items:
+ - const: fifo
+ - const: vsync
+ - const: lcd_sys
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - interrupts
+ - interrupt-names
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos7-clk.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ display-controller@13930000 {
+ compatible = "samsung,exynos7-decon";
+ reg = <0x13930000 0x1000>;
+ interrupt-names = "fifo", "vsync", "lcd_sys";
+ interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 189 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 188 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&clock_disp 100>, /* PCLK_DECON_INT */
+ <&clock_disp 101>, /* ACLK_DECON_INT */
+ <&clock_disp 102>, /* SCLK_DECON_INT_ECLK */
+ <&clock_disp 103>; /* SCLK_DECON_INT_EXTCLKPLL */
+ clock-names = "pclk_decon0",
+ "aclk_decon0",
+ "decon0_eclk",
+ "decon0_vclk";
+ pinctrl-0 = <&lcd_clk &pwm1_out>;
+ pinctrl-names = "default";
+ };
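
The i80-if-timings subnode documented above does not appear in the example; on a board using the i80 bus it would be added inside the display-controller node. A minimal sketch (the values shown are the documented defaults, so each property may equally be omitted):

    i80-if-timings {
        cs-setup = <0>;
        wr-setup = <0>;
        wr-active = <1>;
        wr-hold = <0>;
    };
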
diff --git a/Documentation/devicetree/bindings/display/samsung/samsung,fimd.yaml b/Documentation/devicetree/bindings/display/samsung/samsung,fimd.yaml
new file mode 100644
index 000000000000..075231716b2f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/samsung/samsung,fimd.yaml
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: GPL-2.0-only
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/samsung/samsung,fimd.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung S3C/S5P/Exynos SoC Fully Interactive Mobile Display (FIMD)
+
+maintainers:
+ - Inki Dae <inki.dae@samsung.com>
+ - Seung-Woo Kim <sw0312.kim@samsung.com>
+ - Kyungmin Park <kyungmin.park@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - samsung,s3c2443-fimd
+ - samsung,s3c6400-fimd
+ - samsung,s5pv210-fimd
+ - samsung,exynos3250-fimd
+ - samsung,exynos4210-fimd
+ - samsung,exynos5250-fimd
+ - samsung,exynos5420-fimd
+
+ '#address-cells':
+ const: 1
+
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: sclk_fimd
+ - const: fimd
+
+ display-timings:
+ $ref: ../panel/display-timings.yaml#
+
+ i80-if-timings:
+ type: object
+ additionalProperties: false
+ description: |
+ Timing configuration for the LCD i80 interface.
+ The parameters are defined as::
+
+   VCLK(internal)  __|¯¯¯¯¯¯|_____|¯¯¯¯¯¯|_____|¯¯¯¯¯¯|_____|¯¯¯¯¯¯|_____|¯¯
+                     :            :            :            :            :
+   Address Output  --:<XXXXXXXXXXX:XXXXXXXXXXXX:XXXXXXXXXXXX:XXXXXXXXXXXX:XX
+                     | cs-setup+1 |            :            :            :
+                     |<---------->|            :            :            :
+   Chip Select     ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|____________:____________:____________|¯¯
+                                  | wr-setup+1 |            | wr-hold+1  |
+                                  |<---------->|            |<---------->|
+   Write Enable    ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|____________|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
+                                               | wr-active+1|
+                                               |<---------->|
+   Video Data      ----------------------------<XXXXXXXXXXXXXXXXXXXXXXXXX>--
+
+ properties:
+ cs-setup:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles from when the address signal is enabled until
+ chip select is enabled.
+ default: 0
+
+ wr-active:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles during which chip select stays active.
+ default: 1
+
+ wr-hold:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles from when chip select is disabled until the
+ write signal is disabled.
+ default: 0
+
+ wr-setup:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Number of clock cycles from when the chip select signal is enabled
+ until the write signal is enabled.
+ default: 0
+
+ iommus:
+ minItems: 1
+ maxItems: 2
+
+ iommu-names:
+ items:
+ - const: m0
+ - const: m1
+
+ interrupts:
+ items:
+ - description: FIFO level
+ - description: VSYNC
+ - description: LCD system
+
+ interrupt-names:
+ items:
+ - const: fifo
+ - const: vsync
+ - const: lcd_sys
+
+ power-domains:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+ samsung,invert-vden:
+ type: boolean
+ description:
+ Video enable signal is inverted.
+
+ samsung,invert-vclk:
+ type: boolean
+ description:
+ Video clock signal is inverted.
+
+ samsung,sysreg:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to System Register syscon.
+
+ '#size-cells':
+ const: 0
+
+patternProperties:
+ "^port@[0-4]+$":
+ $ref: /schemas/graph.yaml#/properties/port
+ description: |
+ Contains a port; the port index selects the interface::
+ 0 - for CAMIF0 input,
+ 1 - for CAMIF1 input,
+ 2 - for CAMIF2 input,
+ 3 - for parallel output,
+ 4 - for write-back interface
+
+required:
+ - compatible
+ - clocks
+ - clock-names
+ - interrupts
+ - interrupt-names
+ - reg
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: samsung,exynos5420-fimd
+ then:
+ properties:
+ iommus:
+ minItems: 2
+ maxItems: 2
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/exynos4.h>
+
+ fimd@11c00000 {
+ compatible = "samsung,exynos4210-fimd";
+ interrupt-parent = <&combiner>;
+ reg = <0x11c00000 0x20000>;
+ interrupt-names = "fifo", "vsync", "lcd_sys";
+ interrupts = <11 0>, <11 1>, <11 2>;
+ clocks = <&clock CLK_SCLK_FIMD0>, <&clock CLK_FIMD0>;
+ clock-names = "sclk_fimd", "fimd";
+ power-domains = <&pd_lcd0>;
+ iommus = <&sysmmu_fimd0>;
+ samsung,sysreg = <&sys_reg>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ samsung,invert-vden;
+ samsung,invert-vclk;
+
+ pinctrl-0 = <&lcd_clk>, <&lcd_data24>;
+ pinctrl-names = "default";
+
+ port@3 {
+ reg = <3>;
+
+ fimd_dpi_ep: endpoint {
+ remote-endpoint = <&lcd_ep>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/simple-framebuffer.yaml b/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
index c2499a7906f5..296500f9da05 100644
--- a/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
+++ b/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
@@ -4,10 +4,9 @@
$id: http://devicetree.org/schemas/display/simple-framebuffer.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Simple Framebuffer Device Tree Bindings
+title: Simple Framebuffer
maintainers:
- - Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
- Hans de Goede <hdegoede@redhat.com>
description: |+
@@ -27,6 +26,11 @@ description: |+
over control to a driver for the real hardware. The bindings for the
hw nodes must specify which node is considered the primary node.
+ If a panel node is given, then the driver uses this to configure the
+ physical width and height of the display. If no panel node is given,
+ then the driver uses the width and height properties of the simplefb
+ node to estimate it.
+
It is advised to add display# aliases to help the OS determine how
to number things. If display# aliases are used, then if the simplefb
node contains a display property then the /aliases/display# path
@@ -52,16 +56,23 @@ description: |+
properties:
compatible:
- items:
- - enum:
- - apple,simple-framebuffer
- - allwinner,simple-framebuffer
- - amlogic,simple-framebuffer
+ oneOf:
+ - items:
+ - enum:
+ - apple,simple-framebuffer
+ - allwinner,simple-framebuffer
+ - amlogic,simple-framebuffer
+ - const: simple-framebuffer
- const: simple-framebuffer
reg:
description: Location and size of the framebuffer memory
+ memory-region:
+ maxItems: 1
+ description: Phandle to a node describing the memory to be used for the
+ framebuffer. If present, overrides the "reg" property (if one exists).
+
clocks:
description: List of clocks used by the framebuffer.
@@ -83,20 +94,38 @@ properties:
format:
description: >
Format of the framebuffer:
+ * `a1r5g5b5` - 16-bit pixels, d[15]=a, d[14:10]=r, d[9:5]=g, d[4:0]=b
+ * `a2r10g10b10` - 32-bit pixels, d[31:30]=a, d[29:20]=r, d[19:10]=g, d[9:0]=b
* `a8b8g8r8` - 32-bit pixels, d[31:24]=a, d[23:16]=b, d[15:8]=g, d[7:0]=r
+ * `a8r8g8b8` - 32-bit pixels, d[31:24]=a, d[23:16]=r, d[15:8]=g, d[7:0]=b
* `r5g6b5` - 16-bit pixels, d[15:11]=r, d[10:5]=g, d[4:0]=b
+ * `r5g5b5a1` - 16-bit pixels, d[15:11]=r, d[10:6]=g, d[5:1]=b, d[0]=a
+ * `r8g8b8` - 24-bit pixels, d[23:16]=r, d[15:8]=g, d[7:0]=b
+ * `x1r5g5b5` - 16-bit pixels, d[14:10]=r, d[9:5]=g, d[4:0]=b
* `x2r10g10b10` - 32-bit pixels, d[29:20]=r, d[19:10]=g, d[9:0]=b
* `x8r8g8b8` - 32-bit pixels, d[23:16]=r, d[15:8]=g, d[7:0]=b
+ * `x8b8g8r8` - 32-bit pixels, d[23:16]=b, d[15:8]=g, d[7:0]=r
enum:
+ - a1r5g5b5
+ - a2r10g10b10
- a8b8g8r8
+ - a8r8g8b8
- r5g6b5
+ - r5g5b5a1
+ - r8g8b8
+ - x1r5g5b5
- x2r10g10b10
- x8r8g8b8
+ - x8b8g8r8
display:
$ref: /schemas/types.yaml#/definitions/phandle
description: Primary display hardware node
+ panel:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: Display panel node
+
allwinner,pipeline:
description: Pipeline used by the framebuffer on Allwinner SoCs
enum:
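
As a sketch of how the newly documented properties combine: memory-region, when present, takes precedence over reg, and the panel phandle supplies the physical display dimensions. All phandle labels and values below are illustrative:

    framebuffer@1d385000 {
        compatible = "simple-framebuffer";
        memory-region = <&fb_reserved>;
        panel = <&lcd_panel>;
        format = "x8r8g8b8";
        width = <800>;
        height = <480>;
        stride = <3200>;
    };
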
diff --git a/Documentation/devicetree/bindings/display/sitronix,st7735r.yaml b/Documentation/devicetree/bindings/display/sitronix,st7735r.yaml
index 0cebaaefda03..621f27148419 100644
--- a/Documentation/devicetree/bindings/display/sitronix,st7735r.yaml
+++ b/Documentation/devicetree/bindings/display/sitronix,st7735r.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/display/sitronix,st7735r.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Sitronix ST7735R Display Panels Device Tree Bindings
+title: Sitronix ST7735R Display Panels
maintainers:
- David Lechner <david@lechnology.com>
@@ -15,6 +15,7 @@ description:
allOf:
- $ref: panel/panel-common.yaml#
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
properties:
compatible:
@@ -32,15 +33,13 @@ properties:
- okaya,rh128128t
- const: sitronix,st7715r
- spi-max-frequency:
- maximum: 32000000
-
dc-gpios:
maxItems: 1
description: Display data/command selection (D/CX)
backlight: true
reg: true
+ spi-max-frequency: true
reset-gpios: true
rotation: true
@@ -48,7 +47,6 @@ required:
- compatible
- reg
- dc-gpios
- - reset-gpios
additionalProperties: false
@@ -72,6 +70,7 @@ examples:
dc-gpios = <&gpio 43 GPIO_ACTIVE_HIGH>;
reset-gpios = <&gpio 80 GPIO_ACTIVE_HIGH>;
rotation = <270>;
+ backlight = <&backlight>;
};
};
diff --git a/Documentation/devicetree/bindings/display/solomon,ssd1307fb.yaml b/Documentation/devicetree/bindings/display/solomon,ssd1307fb.yaml
index 2ed2a7d0ca2f..94bb5ef567c6 100644
--- a/Documentation/devicetree/bindings/display/solomon,ssd1307fb.yaml
+++ b/Documentation/devicetree/bindings/display/solomon,ssd1307fb.yaml
@@ -8,14 +8,24 @@ title: Solomon SSD1307 OLED Controller Framebuffer
maintainers:
- Maxime Ripard <mripard@kernel.org>
+ - Javier Martinez Canillas <javierm@redhat.com>
properties:
compatible:
- enum:
- - solomon,ssd1305fb-i2c
- - solomon,ssd1306fb-i2c
- - solomon,ssd1307fb-i2c
- - solomon,ssd1309fb-i2c
+ oneOf:
+ # Deprecated compatible strings
+ - enum:
+ - solomon,ssd1305fb-i2c
+ - solomon,ssd1306fb-i2c
+ - solomon,ssd1307fb-i2c
+ - solomon,ssd1309fb-i2c
+ deprecated: true
+ - enum:
+ - sinowealth,sh1106
+ - solomon,ssd1305
+ - solomon,ssd1306
+ - solomon,ssd1307
+ - solomon,ssd1309
reg:
maxItems: 1
@@ -26,6 +36,14 @@ properties:
reset-gpios:
maxItems: 1
+ # Only required for SPI
+ dc-gpios:
+ description:
+ GPIO connected to the controller's D/C# (Data/Command) pin,
+ that is needed for 4-wire SPI to tell the controller if the
+ data sent is for a command register or the display data RAM
+ maxItems: 1
+
vbat-supply:
description: The supply for VBAT
@@ -130,11 +148,27 @@ required:
- reg
allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: sinowealth,sh1106
+ then:
+ properties:
+ solomon,dclk-div:
+ default: 1
+ solomon,dclk-frq:
+ default: 5
+
- if:
properties:
compatible:
contains:
- const: solomon,ssd1305fb-i2c
+ enum:
+ - solomon,ssd1305-i2c
+ - solomon,ssd1305
then:
properties:
solomon,dclk-div:
@@ -146,7 +180,9 @@ allOf:
properties:
compatible:
contains:
- const: solomon,ssd1306fb-i2c
+ enum:
+ - solomon,ssd1306-i2c
+ - solomon,ssd1306
then:
properties:
solomon,dclk-div:
@@ -158,7 +194,9 @@ allOf:
properties:
compatible:
contains:
- const: solomon,ssd1307fb-i2c
+ enum:
+ - solomon,ssd1307-i2c
+ - solomon,ssd1307
then:
properties:
solomon,dclk-div:
@@ -172,7 +210,9 @@ allOf:
properties:
compatible:
contains:
- const: solomon,ssd1309fb-i2c
+ enum:
+ - solomon,ssd1309-i2c
+ - solomon,ssd1309
then:
properties:
solomon,dclk-div:
@@ -180,26 +220,53 @@ allOf:
solomon,dclk-frq:
default: 10
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
- i2c1 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
- ssd1307: oled@3c {
- compatible = "solomon,ssd1307fb-i2c";
+ ssd1307_i2c: oled@3c {
+ compatible = "solomon,ssd1307";
reg = <0x3c>;
pwms = <&pwm 4 3000>;
reset-gpios = <&gpio2 7>;
};
- ssd1306: oled@3d {
- compatible = "solomon,ssd1306fb-i2c";
- reg = <0x3c>;
+ ssd1306_i2c: oled@3d {
+ compatible = "solomon,ssd1306";
+ reg = <0x3d>;
+ pwms = <&pwm 4 3000>;
+ reset-gpios = <&gpio2 7>;
+ solomon,com-lrremap;
+ solomon,com-invdir;
+ solomon,com-offset = <32>;
+ solomon,lookup-table = /bits/ 8 <0x3f 0x3f 0x3f 0x3f>;
+ };
+ };
+ - |
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ssd1307_spi: oled@0 {
+ compatible = "solomon,ssd1307";
+ reg = <0x0>;
+ pwms = <&pwm 4 3000>;
+ reset-gpios = <&gpio2 7>;
+ dc-gpios = <&gpio2 8>;
+ spi-max-frequency = <10000000>;
+ };
+
+ ssd1306_spi: oled@1 {
+ compatible = "solomon,ssd1306";
+ reg = <0x1>;
pwms = <&pwm 4 3000>;
reset-gpios = <&gpio2 7>;
+ dc-gpios = <&gpio2 8>;
+ spi-max-frequency = <10000000>;
solomon,com-lrremap;
solomon,com-invdir;
solomon,com-offset = <32>;
diff --git a/Documentation/devicetree/bindings/display/sprd/sprd,display-subsystem.yaml b/Documentation/devicetree/bindings/display/sprd/sprd,display-subsystem.yaml
new file mode 100644
index 000000000000..b3d5e1b96fae
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/sprd/sprd,display-subsystem.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/sprd/sprd,display-subsystem.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc DRM master device
+
+maintainers:
+ - Kevin Tang <kevin.tang@unisoc.com>
+
+description: |
+ The Unisoc DRM master device is a virtual device needed to list all
+ DPU devices or other display interface nodes that comprise the
+ graphics subsystem.
+
+ Unisoc's display pipeline has several components, described below:
+ multiple display controllers and their corresponding physical interfaces.
+ For different display scenarios, dpu0 and dpu1 may be bound to different
+ encoders.
+
+ E.g.:
+ dpu0 and dpu1 both bound to DSI for a dual mipi-dsi display;
+ dpu0 bound to DSI for the primary display, and dpu1 bound to DP for an external display.
+
+ +-----------------------------------------+
+ | |
+ | +---------+ |
+ +----+ | +----+ +---------+ |DPHY/CPHY| | +------+
+ | +----->+dpu0+--->+MIPI|DSI +--->+Combo +----->+Panel0|
+ |AXI | | +----+ +---------+ +---------+ | +------+
+ | | | ^ |
+ | | | | |
+ | | | +-----------+ |
+ | | | | |
+ |APB | | +--+-+ +-----------+ +---+ | +------+
+ | +----->+dpu1+--->+DisplayPort+--->+PHY+--------->+Panel1|
+ | | | +----+ +-----------+ +---+ | +------+
+ +----+ | |
+ +-----------------------------------------+
+
+properties:
+ compatible:
+ const: sprd,display-subsystem
+
+ ports:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ maxItems: 1
+ description:
+ Should contain a list of phandles pointing to the display interface
+ ports of DPU devices.
+
+required:
+ - compatible
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ display-subsystem {
+ compatible = "sprd,display-subsystem";
+ ports = <&dpu_out>;
+ };
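
For the dual-DPU scenarios described above, the ports list simply carries one phandle per DPU output port; a sketch with illustrative labels:

    display-subsystem {
        compatible = "sprd,display-subsystem";
        ports = <&dpu0_out>, <&dpu1_out>;
    };
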
diff --git a/Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dpu.yaml b/Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dpu.yaml
new file mode 100644
index 000000000000..4ebea60b8c5b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dpu.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/sprd/sprd,sharkl3-dpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc Sharkl3 Display Processor Unit (DPU)
+
+maintainers:
+ - Kevin Tang <kevin.tang@unisoc.com>
+
+description: |
+ DPU (Display Processor Unit) is the Display Controller for the Unisoc SoCs
+ which transfers the image data from a video memory buffer to an internal
+ LCD interface.
+
+properties:
+ compatible:
+ const: sprd,sharkl3-dpu
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+
+ clock-names:
+ items:
+ - const: clk_src_128m
+ - const: clk_src_384m
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ port:
+ type: object
+ description:
+ A port node with endpoint definitions as defined in
+ Documentation/devicetree/bindings/media/video-interfaces.txt.
+ That port should be the output endpoint, usually output to
+ the associated DSI.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - port
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/sprd,sc9860-clk.h>
+ dpu: dpu@63000000 {
+ compatible = "sprd,sharkl3-dpu";
+ reg = <0x63000000 0x1000>;
+ interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
+ clock-names = "clk_src_128m", "clk_src_384m";
+
+ clocks = <&pll CLK_TWPLL_128M>,
+ <&pll CLK_TWPLL_384M>;
+
+ dpu_port: port {
+ dpu_out: endpoint {
+ remote-endpoint = <&dsi_in>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dsi-host.yaml b/Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dsi-host.yaml
new file mode 100644
index 000000000000..bc5594d18643
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/sprd/sprd,sharkl3-dsi-host.yaml
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/sprd/sprd,sharkl3-dsi-host.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc MIPI DSI Controller
+
+maintainers:
+ - Kevin Tang <kevin.tang@unisoc.com>
+
+properties:
+ compatible:
+ const: sprd,sharkl3-dsi-host
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 2
+
+ clocks:
+ minItems: 1
+
+ clock-names:
+ items:
+ - const: clk_src_96m
+
+ power-domains:
+ maxItems: 1
+
+ ports:
+ type: object
+
+ properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ port@0:
+ type: object
+ description:
+ A port node with endpoint definitions as defined in
+ Documentation/devicetree/bindings/media/video-interfaces.txt.
+ That port should be the input endpoint, usually coming from
+ the associated DPU.
+
+ required:
+ - "#address-cells"
+ - "#size-cells"
+ - port@0
+
+ additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - ports
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/sprd,sc9860-clk.h>
+ dsi: dsi@63100000 {
+ compatible = "sprd,sharkl3-dsi-host";
+ reg = <0x63100000 0x1000>;
+ interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
+ clock-names = "clk_src_96m";
+ clocks = <&pll CLK_TWPLL_96M>;
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ port@0 {
+ reg = <0>;
+ dsi_in: endpoint {
+ remote-endpoint = <&dpu_out>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/st,stm32-dsi.yaml b/Documentation/devicetree/bindings/display/st,stm32-dsi.yaml
index ed310bbe3afe..c488308d7be1 100644
--- a/Documentation/devicetree/bindings/display/st,stm32-dsi.yaml
+++ b/Documentation/devicetree/bindings/display/st,stm32-dsi.yaml
@@ -7,8 +7,8 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: STMicroelectronics STM32 DSI host controller
maintainers:
- - Philippe Cornu <philippe.cornu@st.com>
- - Yannick Fertre <yannick.fertre@st.com>
+ - Philippe Cornu <philippe.cornu@foss.st.com>
+ - Yannick Fertre <yannick.fertre@foss.st.com>
description:
The STMicroelectronics STM32 DSI controller uses the Synopsys DesignWare MIPI-DSI host controller.
@@ -58,9 +58,20 @@ properties:
DSI input port node, connected to the ltdc rgb output port.
port@1:
- $ref: /schemas/graph.yaml#/properties/port
- description:
- DSI output port node, connected to a panel or a bridge input port"
+ $ref: /schemas/graph.yaml#/$defs/port-base
+ unevaluatedProperties: false
+ description: |
+ DSI output port node, connected to a panel or a bridge input port.
+ properties:
+ endpoint:
+ $ref: /schemas/media/video-interfaces.yaml#
+ unevaluatedProperties: false
+ properties:
+ data-lanes:
+ minItems: 1
+ items:
+ - const: 1
+ - const: 2
required:
- "#address-cells"
@@ -110,7 +121,7 @@ examples:
};
};
- panel-dsi@0 {
+ panel@0 {
compatible = "orisetech,otm8009a";
reg = <0>;
reset-gpios = <&gpioe 4 GPIO_ACTIVE_LOW>;
@@ -125,4 +136,3 @@ examples:
};
...
-
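
With the endpoint schema added above, a board using both DSI data lanes would list them in the output endpoint; a minimal sketch (node labels illustrative):

    dsi_out: endpoint {
        remote-endpoint = <&panel_in>;
        data-lanes = <1 2>;
    };
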
diff --git a/Documentation/devicetree/bindings/display/st,stm32-ltdc.yaml b/Documentation/devicetree/bindings/display/st,stm32-ltdc.yaml
index 4ae3d75492d3..d6ea4d62a2cf 100644
--- a/Documentation/devicetree/bindings/display/st,stm32-ltdc.yaml
+++ b/Documentation/devicetree/bindings/display/st,stm32-ltdc.yaml
@@ -7,8 +7,8 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: STMicroelectronics STM32 lcd-tft display controller
maintainers:
- - Philippe Cornu <philippe.cornu@st.com>
- - Yannick Fertre <yannick.fertre@st.com>
+ - Philippe Cornu <philippe.cornu@foss.st.com>
+ - Yannick Fertre <yannick.fertre@foss.st.com>
properties:
compatible:
@@ -75,4 +75,3 @@ examples:
};
...
-
diff --git a/Documentation/devicetree/bindings/display/ste,mcde.yaml b/Documentation/devicetree/bindings/display/ste,mcde.yaml
index de0c678b3c29..564ea845c82e 100644
--- a/Documentation/devicetree/bindings/display/ste,mcde.yaml
+++ b/Documentation/devicetree/bindings/display/ste,mcde.yaml
@@ -58,8 +58,8 @@ patternProperties:
"^dsi@[0-9a-f]+$":
description: subnodes for the three DSI host adapters
type: object
- allOf:
- - $ref: dsi-controller.yaml#
+ $ref: dsi-controller.yaml#
+
properties:
compatible:
const: ste,mcde-dsi
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.txt b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.txt
deleted file mode 100644
index e4a25cedc5cf..000000000000
--- a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-NVIDIA Tegra MIPI pad calibration controller
-
-Required properties:
-- compatible: "nvidia,tegra<chip>-mipi"
-- reg: Physical base address and length of the controller's registers.
-- clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
-- clock-names: Must include the following entries:
- - mipi-cal
-- #nvidia,mipi-calibrate-cells: Should be 1. The cell is a bitmask of the pads
- that need to be calibrated for a given device.
-
-User nodes need to contain an nvidia,mipi-calibrate property that has a
-phandle to refer to the calibration controller node and a bitmask of the pads
-that need to be calibrated.
-
-Example:
-
- mipi: mipi@700e3000 {
- compatible = "nvidia,tegra114-mipi";
- reg = <0x700e3000 0x100>;
- clocks = <&tegra_car TEGRA114_CLK_MIPI_CAL>;
- clock-names = "mipi-cal";
- #nvidia,mipi-calibrate-cells = <1>;
- };
-
- ...
-
- host1x@50000000 {
- ...
-
- dsi@54300000 {
- ...
-
- nvidia,mipi-calibrate = <&mipi 0x060>;
-
- ...
- };
-
- ...
- };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.yaml
new file mode 100644
index 000000000000..f448624dd779
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra114-mipi.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra114-mipi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra MIPI pad calibration controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^mipi@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra114-mipi
+ - nvidia,tegra210-mipi
+ - nvidia,tegra186-mipi
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+
+ clock-names:
+ items:
+ - const: mipi-cal
+
+ power-domains:
+ maxItems: 1
+
+ "#nvidia,mipi-calibrate-cells":
+ description: The number of cells in a MIPI calibration specifier.
+ Should be 1. The single cell specifies a bitmask of the pads that
+ need to be calibrated for a given device.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ const: 1
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - "#nvidia,mipi-calibrate-cells"
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra114-car.h>
+
+ mipi@700e3000 {
+ compatible = "nvidia,tegra114-mipi";
+ reg = <0x700e3000 0x100>;
+ clocks = <&tegra_car TEGRA114_CLK_MIPI_CAL>;
+ clock-names = "mipi-cal";
+ #nvidia,mipi-calibrate-cells = <1>;
+ };
+
+ dsia: dsi@54300000 {
+ compatible = "nvidia,tegra114-dsi";
+ reg = <0x54300000 0x00040000>;
+ clocks = <&tegra_car TEGRA114_CLK_DSIA>,
+ <&tegra_car TEGRA114_CLK_DSIALP>,
+ <&tegra_car TEGRA114_CLK_PLL_D_OUT0>;
+ clock-names = "dsi", "lp", "parent";
+ resets = <&tegra_car 48>;
+ reset-names = "dsi";
+ nvidia,mipi-calibrate = <&mipi 0x060>; /* DSIA & DSIB pads */
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-dpaux.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-dpaux.yaml
new file mode 100644
index 000000000000..5cdbc527a560
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-dpaux.yaml
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra124-dpaux.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra DisplayPort AUX Interface
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+description: |
+ The Tegra Display Port Auxiliary (DPAUX) pad controller manages two
+ pins which can be assigned to either the DPAUX channel or to an I2C
+ controller.
+
+ When configured for DisplayPort AUX operation, the DPAUX controller
+ can also be used to communicate with a DisplayPort device using the
+ AUX channel.
+
+properties:
+ $nodename:
+ pattern: "^dpaux@[0-9a-f]+$"
+
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra124-dpaux
+ - nvidia,tegra210-dpaux
+ - nvidia,tegra186-dpaux
+ - nvidia,tegra194-dpaux
+
+ - items:
+ - const: nvidia,tegra132-dpaux
+ - const: nvidia,tegra124-dpaux
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: clock input for the DPAUX hardware
+ - description: reference clock
+
+ clock-names:
+ items:
+ - const: dpaux
+ - const: parent
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: dpaux
+
+ power-domains:
+ maxItems: 1
+
+ i2c-bus:
+ description: Subnode where I2C slave devices are listed. This
+ subnode must always be present. If there are no I2C slave
+ devices, an empty node should be added. See ../../i2c/i2c.yaml
+ for more information.
+ type: object
+
+ aux-bus:
+ $ref: /schemas/display/dp-aux-bus.yaml#
+
+ vdd-supply:
+ description: phandle of a supply that powers the DisplayPort
+ link
+
+patternProperties:
+ "^pinmux-[a-z0-9]+$":
+ description:
+ Since only three configurations are possible, only three child
+ nodes are needed to describe the pin mux'ing options for the
+ DPAUX pads. Furthermore, given that the pad functions are only
+ applicable to a single set of pads, the child nodes only need
+ to describe the pad group the functions are being applied to
+ rather than the individual pads.
+ type: object
+ properties:
+ groups:
+ const: dpaux-io
+
+ function:
+ enum:
+ - aux
+ - i2c
+ - off
+
+ additionalProperties: false
+
+ required:
+ - groups
+ - function
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra210-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ dpaux: dpaux@545c0000 {
+ compatible = "nvidia,tegra210-dpaux";
+ reg = <0x545c0000 0x00040000>;
+ interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA210_CLK_DPAUX>,
+ <&tegra_car TEGRA210_CLK_PLL_DP>;
+ clock-names = "dpaux", "parent";
+ resets = <&tegra_car 181>;
+ reset-names = "dpaux";
+ power-domains = <&pd_sor>;
+
+ state_dpaux_aux: pinmux-aux {
+ groups = "dpaux-io";
+ function = "aux";
+ };
+
+ state_dpaux_i2c: pinmux-i2c {
+ groups = "dpaux-io";
+ function = "i2c";
+ };
+
+ state_dpaux_off: pinmux-off {
+ groups = "dpaux-io";
+ function = "off";
+ };
+
+ i2c-bus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-sor.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-sor.yaml
new file mode 100644
index 000000000000..70f0e45c71d6
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-sor.yaml
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra124-sor.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra SOR Output Encoder
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+description: |
+ The Serial Output Resource (SOR) can be used to drive HDMI, LVDS, eDP
+ and DP outputs.
+
+properties:
+ $nodename:
+ pattern: "^sor@[0-9a-f]+$"
+
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra124-sor
+ - nvidia,tegra210-sor
+ - nvidia,tegra210-sor1
+ - nvidia,tegra186-sor
+ - nvidia,tegra186-sor1
+ - nvidia,tegra194-sor
+
+ - items:
+ - const: nvidia,tegra132-sor
+ - const: nvidia,tegra124-sor
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 5
+ maxItems: 6
+
+ clock-names:
+ minItems: 5
+ maxItems: 6
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: sor
+
+ power-domains:
+ maxItems: 1
+
+ avdd-io-hdmi-dp-supply:
+ description: I/O supply for HDMI/DP
+
+ vdd-hdmi-dp-pll-supply:
+ description: PLL supply for HDMI/DP
+
+ hdmi-supply:
+ description: +5.0V HDMI connector supply, required for HDMI
+
+ # Tegra186 and later
+ nvidia,interface:
+ description: index of the SOR interface
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ nvidia,ddc-i2c-bus:
+ description: phandle of an I2C controller used for DDC EDID
+ probing
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ nvidia,hpd-gpio:
+ description: specifies a GPIO used for hotplug detection
+ maxItems: 1
+
+ nvidia,edid:
+ description: supplies a binary EDID blob
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+
+ nvidia,panel:
+ description: phandle of a display panel, required for eDP
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ nvidia,xbar-cfg:
+ description: 5 cells containing the crossbar configuration.
+ Each lane of the SOR, identified by the cell's index, is
+ mapped via the crossbar to the pad specified by the cell's
+ value.
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+
+ # optional when driving an eDP output
+ nvidia,dpaux:
+ description: phandle to a DisplayPort AUX interface
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra186-sor
+ - nvidia,tegra194-sor
+ then:
+ properties:
+ clocks:
+ items:
+ - description: clock input for the SOR hardware
+ - description: SOR output clock
+ - description: input for the pixel clock
+ - description: reference clock for the SOR clock
+ - description: safe reference clock for the SOR clock
+ during power up
+ - description: SOR pad output clock
+
+ clock-names:
+ items:
+ - const: sor
+ - enum:
+ - source # deprecated
+ - out
+ - const: parent
+ - const: dp
+ - const: safe
+ - const: pad
+ else:
+ properties:
+ clocks:
+ items:
+ - description: clock input for the SOR hardware
+ - description: SOR output clock
+ - description: input for the pixel clock
+ - description: reference clock for the SOR clock
+ - description: safe reference clock for the SOR clock
+ during power up
+
+ clock-names:
+ items:
+ - const: sor
+ - enum:
+ - source # deprecated
+ - out
+ - const: parent
+ - const: dp
+ - const: safe
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - avdd-io-hdmi-dp-supply
+ - vdd-hdmi-dp-pll-supply
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra210-car.h>
+ #include <dt-bindings/gpio/tegra-gpio.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ sor0: sor@54540000 {
+ compatible = "nvidia,tegra210-sor";
+ reg = <0x54540000 0x00040000>;
+ interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA210_CLK_SOR0>,
+ <&tegra_car TEGRA210_CLK_SOR0_OUT>,
+ <&tegra_car TEGRA210_CLK_PLL_D_OUT0>,
+ <&tegra_car TEGRA210_CLK_PLL_DP>,
+ <&tegra_car TEGRA210_CLK_SOR_SAFE>;
+ clock-names = "sor", "out", "parent", "dp", "safe";
+ resets = <&tegra_car 182>;
+ reset-names = "sor";
+ pinctrl-0 = <&state_dpaux_aux>;
+ pinctrl-1 = <&state_dpaux_i2c>;
+ pinctrl-2 = <&state_dpaux_off>;
+ pinctrl-names = "aux", "i2c", "off";
+ power-domains = <&pd_sor>;
+
+ avdd-io-hdmi-dp-supply = <&avdd_1v05>;
+ vdd-hdmi-dp-pll-supply = <&vdd_1v8>;
+ hdmi-supply = <&vdd_hdmi>;
+
+ nvidia,ddc-i2c-bus = <&hdmi_ddc>;
+ nvidia,hpd-gpio = <&gpio TEGRA_GPIO(CC, 1) GPIO_ACTIVE_LOW>;
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-vic.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-vic.yaml
new file mode 100644
index 000000000000..7200095ef19e
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra124-vic.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra124-vic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Video Image Composer
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^vic@[0-9a-f]+$"
+
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra124-vic
+ - nvidia,tegra210-vic
+ - nvidia,tegra186-vic
+ - nvidia,tegra194-vic
+ - nvidia,tegra234-vic
+
+ - items:
+ - const: nvidia,tegra132-vic
+ - const: nvidia,tegra124-vic
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: clock input for the VIC hardware
+
+ clock-names:
+ items:
+ - const: vic
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: vic
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ description: Description of the interconnect paths for the VIC;
+ see ../interconnect/interconnect.txt for details.
+ items:
+ - description: memory read client for VIC
+ - description: memory write client for VIC
+
+ interconnect-names:
+ items:
+ - const: dma-mem # read
+ - const: write
+
+ dma-coherent: true
+
+additionalProperties: false
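
This binding ships without an examples section; for orientation, a Tegra210 VIC node following the schema could look roughly like this. The register address, interrupt number, clock/reset indices and phandle labels are illustrative, not taken from a real DTS:

    vic@54340000 {
        compatible = "nvidia,tegra210-vic";
        reg = <0x54340000 0x00040000>;
        interrupts = <GIC_SPI 206 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&tegra_car TEGRA210_CLK_VIC03>;
        clock-names = "vic";
        resets = <&tegra_car 178>;
        reset-names = "vic";
        power-domains = <&pd_vic>;
    };
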
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dc.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dc.yaml
new file mode 100644
index 000000000000..ce4589466a18
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dc.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra186-dc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra186 (and later) Display Controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^display@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra186-dc
+ - nvidia,tegra194-dc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: display controller pixel clock
+
+ clock-names:
+ items:
+ - const: dc
+
+ resets:
+ items:
+ - description: display controller reset
+
+ reset-names:
+ items:
+ - const: dc
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ description: Description of the interconnect paths for the
+ display controller; see ../interconnect/interconnect.txt
+ for details.
+
+ interconnect-names:
+ items:
+ - const: dma-mem # read-0
+ - const: read-1
+
+ nvidia,outputs:
+ description: A list of phandles of outputs that this display
+ controller can drive.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+
+ nvidia,head:
+ description: The number of the display controller head. This
+ is used to set up the various types of output to receive
+ video data from the given head.
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+ - nvidia,outputs
+ - nvidia,head
+
+# see nvidia,tegra186-display.yaml for examples
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-display.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-display.yaml
new file mode 100644
index 000000000000..117c371ce24b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-display.yaml
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra186-display.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra186 (and later) Display Hub
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^display-hub@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra186-display
+ - nvidia,tegra194-display
+
+ '#address-cells':
+ enum: [ 1, 2 ]
+
+ '#size-cells':
+ enum: [ 1, 2 ]
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 3
+
+ clock-names:
+ minItems: 2
+ maxItems: 3
+
+ resets:
+ items:
+ - description: display hub reset
+ - description: window group 0 reset
+ - description: window group 1 reset
+ - description: window group 2 reset
+ - description: window group 3 reset
+ - description: window group 4 reset
+ - description: window group 5 reset
+
+ reset-names:
+ items:
+ - const: misc
+ - const: wgrp0
+ - const: wgrp1
+ - const: wgrp2
+ - const: wgrp3
+ - const: wgrp4
+ - const: wgrp5
+
+ power-domains:
+ maxItems: 1
+
+ ranges:
+ maxItems: 1
+
+patternProperties:
+ "^display@[0-9a-f]+$":
+ type: object
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra186-display
+ then:
+ properties:
+ clocks:
+ items:
+ - description: display core clock
+ - description: display stream compression clock
+ - description: display hub clock
+
+ clock-names:
+ items:
+ - const: disp
+ - const: dsc
+ - const: hub
+ else:
+ properties:
+ clocks:
+ items:
+ - description: display core clock
+ - description: display hub clock
+
+ clock-names:
+ items:
+ - const: disp
+ - const: hub
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+ - "#address-cells"
+ - "#size-cells"
+ - ranges
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra186-clock.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+ #include <dt-bindings/power/tegra186-powergate.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ display-hub@15200000 {
+ compatible = "nvidia,tegra186-display";
+ reg = <0x15200000 0x00040000>;
+ resets = <&bpmp TEGRA186_RESET_NVDISPLAY0_MISC>,
+ <&bpmp TEGRA186_RESET_NVDISPLAY0_WGRP0>,
+ <&bpmp TEGRA186_RESET_NVDISPLAY0_WGRP1>,
+ <&bpmp TEGRA186_RESET_NVDISPLAY0_WGRP2>,
+ <&bpmp TEGRA186_RESET_NVDISPLAY0_WGRP3>,
+ <&bpmp TEGRA186_RESET_NVDISPLAY0_WGRP4>,
+ <&bpmp TEGRA186_RESET_NVDISPLAY0_WGRP5>;
+ reset-names = "misc", "wgrp0", "wgrp1", "wgrp2",
+ "wgrp3", "wgrp4", "wgrp5";
+ clocks = <&bpmp TEGRA186_CLK_NVDISPLAY_DISP>,
+ <&bpmp TEGRA186_CLK_NVDISPLAY_DSC>,
+ <&bpmp TEGRA186_CLK_NVDISPLAYHUB>;
+ clock-names = "disp", "dsc", "hub";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_DISP>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ ranges = <0x15200000 0x15200000 0x40000>;
+
+ display@15200000 {
+ compatible = "nvidia,tegra186-dc";
+ reg = <0x15200000 0x10000>;
+ interrupts = <GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA186_CLK_NVDISPLAY_P0>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA186_RESET_NVDISPLAY0_HEAD0>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_DISP>;
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+ iommus = <&smmu TEGRA186_SID_NVDISPLAY>;
+
+ nvidia,outputs = <&dsia &dsib &sor0 &sor1>;
+ nvidia,head = <0>;
+ };
+
+ display@15210000 {
+ compatible = "nvidia,tegra186-dc";
+ reg = <0x15210000 0x10000>;
+ interrupts = <GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA186_CLK_NVDISPLAY_P1>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA186_RESET_NVDISPLAY0_HEAD1>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_DISPB>;
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+ iommus = <&smmu TEGRA186_SID_NVDISPLAY>;
+
+ nvidia,outputs = <&dsia &dsib &sor0 &sor1>;
+ nvidia,head = <1>;
+ };
+
+ display@15220000 {
+ compatible = "nvidia,tegra186-dc";
+ reg = <0x15220000 0x10000>;
+ interrupts = <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA186_CLK_NVDISPLAY_P2>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA186_RESET_NVDISPLAY0_HEAD2>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_DISPC>;
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+ iommus = <&smmu TEGRA186_SID_NVDISPLAY>;
+
+ nvidia,outputs = <&sor0 &sor1>;
+ nvidia,head = <2>;
+ };
+ };
+
+ - |
+ #include <dt-bindings/clock/tegra194-clock.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/memory/tegra194-mc.h>
+ #include <dt-bindings/power/tegra194-powergate.h>
+ #include <dt-bindings/reset/tegra194-reset.h>
+
+ display-hub@15200000 {
+ compatible = "nvidia,tegra194-display";
+ reg = <0x15200000 0x00040000>;
+ resets = <&bpmp TEGRA194_RESET_NVDISPLAY0_MISC>,
+ <&bpmp TEGRA194_RESET_NVDISPLAY0_WGRP0>,
+ <&bpmp TEGRA194_RESET_NVDISPLAY0_WGRP1>,
+ <&bpmp TEGRA194_RESET_NVDISPLAY0_WGRP2>,
+ <&bpmp TEGRA194_RESET_NVDISPLAY0_WGRP3>,
+ <&bpmp TEGRA194_RESET_NVDISPLAY0_WGRP4>,
+ <&bpmp TEGRA194_RESET_NVDISPLAY0_WGRP5>;
+ reset-names = "misc", "wgrp0", "wgrp1", "wgrp2",
+ "wgrp3", "wgrp4", "wgrp5";
+ clocks = <&bpmp TEGRA194_CLK_NVDISPLAY_DISP>,
+ <&bpmp TEGRA194_CLK_NVDISPLAYHUB>;
+ clock-names = "disp", "hub";
+
+ power-domains = <&bpmp TEGRA194_POWER_DOMAIN_DISP>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ ranges = <0x15200000 0x15200000 0x40000>;
+
+ display@15200000 {
+ compatible = "nvidia,tegra194-dc";
+ reg = <0x15200000 0x10000>;
+ interrupts = <GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA194_CLK_NVDISPLAY_P0>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA194_RESET_NVDISPLAY0_HEAD0>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA194_POWER_DOMAIN_DISP>;
+ interconnects = <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+
+ nvidia,outputs = <&sor0 &sor1 &sor2 &sor3>;
+ nvidia,head = <0>;
+ };
+
+ display@15210000 {
+ compatible = "nvidia,tegra194-dc";
+ reg = <0x15210000 0x10000>;
+ interrupts = <GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA194_CLK_NVDISPLAY_P1>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA194_RESET_NVDISPLAY0_HEAD1>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA194_POWER_DOMAIN_DISPB>;
+ interconnects = <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+
+ nvidia,outputs = <&sor0 &sor1 &sor2 &sor3>;
+ nvidia,head = <1>;
+ };
+
+ display@15220000 {
+ compatible = "nvidia,tegra194-dc";
+ reg = <0x15220000 0x10000>;
+ interrupts = <GIC_SPI 155 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA194_CLK_NVDISPLAY_P2>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA194_RESET_NVDISPLAY0_HEAD2>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA194_POWER_DOMAIN_DISPC>;
+ interconnects = <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+
+ nvidia,outputs = <&sor0 &sor1 &sor2 &sor3>;
+ nvidia,head = <2>;
+ };
+
+ display@15230000 {
+ compatible = "nvidia,tegra194-dc";
+ reg = <0x15230000 0x10000>;
+ interrupts = <GIC_SPI 242 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA194_CLK_NVDISPLAY_P3>;
+ clock-names = "dc";
+ resets = <&bpmp TEGRA194_RESET_NVDISPLAY0_HEAD3>;
+ reset-names = "dc";
+
+ power-domains = <&bpmp TEGRA194_POWER_DOMAIN_DISPC>;
+ interconnects = <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR &emc>,
+ <&mc TEGRA194_MEMORY_CLIENT_NVDISPLAYR1 &emc>;
+ interconnect-names = "dma-mem", "read-1";
+
+ nvidia,outputs = <&sor0 &sor1 &sor2 &sor3>;
+ nvidia,head = <3>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dsi-padctl.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dsi-padctl.yaml
new file mode 100644
index 000000000000..da75b71e8ece
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra186-dsi-padctl.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra186-dsi-padctl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra MIPI DSI pad controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^padctl@[0-9a-f]+$"
+
+ compatible:
+ const: nvidia,tegra186-dsi-padctl
+
+ reg:
+ maxItems: 1
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: dsi
+
+allOf:
+ - $ref: /schemas/reset/reset.yaml
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ padctl@15880000 {
+ compatible = "nvidia,tegra186-dsi-padctl";
+ reg = <0x15880000 0x10000>;
+ resets = <&bpmp TEGRA186_RESET_DSI>;
+ reset-names = "dsi";
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dc.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dc.yaml
new file mode 100644
index 000000000000..69be95afd562
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dc.yaml
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-dc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Display Controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^dc@[0-9a-f]+$"
+
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra20-dc
+ - nvidia,tegra30-dc
+ - nvidia,tegra114-dc
+ - nvidia,tegra124-dc
+ - nvidia,tegra210-dc
+
+ - items:
+ - const: nvidia,tegra132-dc
+ - const: nvidia,tegra124-dc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ items:
+ - description: display controller pixel clock
+ - description: parent clock # optional
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: dc
+ - const: parent # optional
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: dc
+
+ interconnect-names: true
+ interconnects: true
+
+ iommus:
+ maxItems: 1
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the core power domain
+
+ memory-region: true
+
+ nvidia,head:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: The number of the display controller head. This is used to set up the
+ various types of output to receive video data from the given head.
+
+ nvidia,outputs:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ description: A list of phandles of outputs that this display controller can drive.
+
+ rgb:
+ type: object
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra20-dc
+ - nvidia,tegra30-dc
+ - nvidia,tegra114-dc
+ then:
+ properties:
+ interconnects:
+ items:
+ - description: window A memory client
+ - description: window B memory client
+ - description: window B memory client (vertical filter)
+ - description: window C memory client
+ - description: cursor memory client
+
+ interconnect-names:
+ items:
+ - const: wina
+ - const: winb
+ - const: winb-vfilter
+ - const: winc
+ - const: cursor
+
+ rgb:
+ description: Each display controller node has a child node, named "rgb", that represents
+ the RGB output associated with the controller.
+ type: object
+ properties:
+ nvidia,ddc-i2c-bus:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: phandle of an I2C controller used for DDC EDID probing
+
+ nvidia,hpd-gpio:
+ description: specifies a GPIO used for hotplug detection
+ maxItems: 1
+
+ nvidia,edid:
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+ description: supplies a binary EDID blob
+
+ nvidia,panel:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: phandle of a display panel
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra124-dc
+ then:
+ properties:
+ interconnects:
+ minItems: 4
+ items:
+ - description: window A memory client
+ - description: window B memory client
+ - description: window C memory client
+ - description: cursor memory client
+ - description: window D memory client
+ - description: window T memory client
+
+ interconnect-names:
+ minItems: 4
+ items:
+ - const: wina
+ - const: winb
+ - const: winc
+ - const: cursor
+ - const: wind
+ - const: wint
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ dc@54200000 {
+ compatible = "nvidia,tegra20-dc";
+ reg = <0x54200000 0x00040000>;
+ interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_DISP1>;
+ clock-names = "dc";
+ resets = <&tegra_car 27>;
+ reset-names = "dc";
+ };
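+
+  # A hedged sketch of the second head, extending the example above with the
+  # memory clients, head number and RGB output described by this schema; the
+  # panel phandle is hypothetical, while clock, reset, interrupt and memory
+  # client IDs follow the Tegra20 DISP2 instance from the old text binding.
+  - |
+    #include <dt-bindings/clock/tegra20-car.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/memory/tegra20-mc.h>
+
+    dc@54240000 {
+        compatible = "nvidia,tegra20-dc";
+        reg = <0x54240000 0x00040000>;
+        interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>;
+        clocks = <&tegra_car TEGRA20_CLK_DISP2>,
+                 <&tegra_car TEGRA20_CLK_PLL_P>;
+        clock-names = "dc", "parent";
+        resets = <&tegra_car 26>;
+        reset-names = "dc";
+
+        interconnects = <&mc TEGRA20_MC_DISPLAY0AB &emc>,
+                        <&mc TEGRA20_MC_DISPLAY0BB &emc>,
+                        <&mc TEGRA20_MC_DISPLAY1BB &emc>,
+                        <&mc TEGRA20_MC_DISPLAY0CB &emc>,
+                        <&mc TEGRA20_MC_DISPLAYHCB &emc>;
+        interconnect-names = "wina", "winb", "winb-vfilter", "winc", "cursor";
+
+        nvidia,head = <1>;
+
+        rgb {
+            nvidia,panel = <&panel>;
+        };
+    };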
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dsi.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dsi.yaml
new file mode 100644
index 000000000000..59e1dc0813e7
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-dsi.yaml
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Display Serial Interface
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra20-dsi
+ - nvidia,tegra30-dsi
+ - nvidia,tegra114-dsi
+ - nvidia,tegra124-dsi
+ - nvidia,tegra210-dsi
+ - nvidia,tegra186-dsi
+
+ - items:
+ - const: nvidia,tegra132-dsi
+ - const: nvidia,tegra124-dsi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 3
+
+ clock-names:
+ minItems: 2
+ maxItems: 3
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: dsi
+
+ operating-points-v2: true
+
+ power-domains:
+ maxItems: 1
+
+ avdd-dsi-csi-supply:
+ description: phandle of a supply that powers the DSI controller
+
+ nvidia,mipi-calibrate:
+ description: Should contain a phandle and a specifier specifying
+ which pads are used by this DSI output and need to be
+ calibrated. See nvidia,tegra114-mipi.yaml for details.
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+
+ nvidia,ddc-i2c-bus:
+ description: phandle of an I2C controller used for DDC EDID
+ probing
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ nvidia,hpd-gpio:
+ description: specifies a GPIO used for hotplug detection
+ maxItems: 1
+
+ nvidia,edid:
+ description: supplies a binary EDID blob
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+
+ nvidia,panel:
+ description: phandle of a display panel
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ nvidia,ganged-mode:
+ description: contains a phandle to a second DSI controller to
+ gang up with in order to support up to 8 data lanes
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+allOf:
+ - $ref: ../dsi-controller.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra20-dsi
+ - nvidia,tegra30-dsi
+ then:
+ properties:
+ clocks:
+ items:
+ - description: DSI module clock
+ - description: input for the pixel clock
+
+ clock-names:
+ items:
+ - const: dsi
+ - const: parent
+ else:
+ properties:
+ clocks:
+ items:
+ - description: DSI module clock
+ - description: low-power module clock
+ - description: input for the pixel clock
+
+ clock-names:
+ items:
+ - const: dsi
+ - const: lp
+ - const: parent
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra186-dsi
+ then:
+ required:
+ - interrupts
+
+unevaluatedProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra186-clock.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/power/tegra186-powergate.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ dsi@15300000 {
+ compatible = "nvidia,tegra186-dsi";
+ reg = <0x15300000 0x10000>;
+ interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&bpmp TEGRA186_CLK_DSI>,
+ <&bpmp TEGRA186_CLK_DSIA_LP>,
+ <&bpmp TEGRA186_CLK_PLLD>;
+ clock-names = "dsi", "lp", "parent";
+ resets = <&bpmp TEGRA186_RESET_DSI>;
+ reset-names = "dsi";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_DISP>;
+ };
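+
+  # A hedged sketch of ganged-mode operation on Tegra114, where two DSI
+  # controllers drive one panel over up to 8 data lanes; the supply and
+  # panel phandles are hypothetical, while addresses, clock and reset IDs
+  # loosely follow tegra114.dtsi.
+  - |
+    #include <dt-bindings/clock/tegra114-car.h>
+
+    dsi@54300000 {
+        compatible = "nvidia,tegra114-dsi";
+        reg = <0x54300000 0x00040000>;
+        clocks = <&tegra_car TEGRA114_CLK_DSIA>,
+                 <&tegra_car TEGRA114_CLK_DSIALP>,
+                 <&tegra_car TEGRA114_CLK_PLL_D_OUT0>;
+        clock-names = "dsi", "lp", "parent";
+        resets = <&tegra_car 48>;
+        reset-names = "dsi";
+
+        avdd-dsi-csi-supply = <&avdd_dsi_csi>;
+        nvidia,ganged-mode = <&dsib>;
+        nvidia,panel = <&panel>;
+    };
+
+    dsib: dsi@54400000 {
+        compatible = "nvidia,tegra114-dsi";
+        reg = <0x54400000 0x00040000>;
+        clocks = <&tegra_car TEGRA114_CLK_DSIB>,
+                 <&tegra_car TEGRA114_CLK_DSIBLP>,
+                 <&tegra_car TEGRA114_CLK_PLL_D_OUT0>;
+        clock-names = "dsi", "lp", "parent";
+        resets = <&tegra_car 82>;
+        reset-names = "dsi";
+
+        avdd-dsi-csi-supply = <&avdd_dsi_csi>;
+    };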
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-epp.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-epp.yaml
new file mode 100644
index 000000000000..3c095a5491fe
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-epp.yaml
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-epp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Encoder Pre-Processor
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^epp@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra20-epp
+ - nvidia,tegra30-epp
+ - nvidia,tegra114-epp
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: epp
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 4
+
+ interconnect-names:
+ maxItems: 4
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the core power domain
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ epp@540c0000 {
+ compatible = "nvidia,tegra20-epp";
+ reg = <0x540c0000 0x00040000>;
+ interrupts = <GIC_SPI 70 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_EPP>;
+ resets = <&tegra_car 19>;
+ reset-names = "epp";
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr2d.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr2d.yaml
new file mode 100644
index 000000000000..1026b0bc3dc8
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr2d.yaml
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-gr2d.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA 2D graphics engine
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^gr2d@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra20-gr2d
+ - nvidia,tegra30-gr2d
+ - nvidia,tegra114-gr2d
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+
+ resets:
+ items:
+ - description: module reset
+ - description: memory client hotflush reset
+
+ reset-names:
+ items:
+ - const: 2d
+ - const: mc
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 4
+
+ interconnect-names:
+ maxItems: 4
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the HEG or core power domain
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/memory/tegra20-mc.h>
+
+ gr2d@54140000 {
+ compatible = "nvidia,tegra20-gr2d";
+ reg = <0x54140000 0x00040000>;
+ interrupts = <GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_GR2D>;
+ resets = <&tegra_car 21>, <&mc TEGRA20_MC_RESET_2D>;
+ reset-names = "2d", "mc";
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr3d.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr3d.yaml
new file mode 100644
index 000000000000..59a52e732ca3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-gr3d.yaml
@@ -0,0 +1,213 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-gr3d.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA 3D graphics engine
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^gr3d@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra20-gr3d
+ - nvidia,tegra30-gr3d
+ - nvidia,tegra114-gr3d
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ maxItems: 2
+
+ resets:
+ minItems: 2
+ maxItems: 4
+
+ reset-names:
+ minItems: 2
+ maxItems: 4
+
+ iommus:
+ minItems: 1
+ maxItems: 2
+
+ interconnects:
+ minItems: 4
+ maxItems: 10
+
+ interconnect-names:
+ minItems: 4
+ maxItems: 10
+
+ operating-points-v2: true
+
+ power-domains:
+ minItems: 1
+ maxItems: 2
+
+ power-domain-names:
+ maxItems: 2
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra20-gr3d
+ then:
+ properties:
+ clocks:
+ items:
+ - description: module clock
+
+ clock-names:
+ items:
+ - const: 3d
+
+ resets:
+ items:
+ - description: module reset
+ - description: memory client hotflush reset
+
+ reset-names:
+ items:
+ - const: 3d
+ - const: mc
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ minItems: 4
+ maxItems: 4
+
+ interconnect-names:
+ minItems: 4
+ maxItems: 4
+
+ power-domains:
+ items:
+ - description: phandle to the TD power domain
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra30-gr3d
+ then:
+ properties:
+ clocks:
+ items:
+ - description: primary module clock
+ - description: secondary module clock
+
+ clock-names:
+ items:
+ - const: 3d
+ - const: 3d2
+
+ resets:
+ items:
+ - description: primary module reset
+ - description: secondary module reset
+ - description: primary memory client hotflush reset
+ - description: secondary memory client hotflush reset
+
+ reset-names:
+ items:
+ - const: 3d
+ - const: 3d2
+ - const: mc
+ - const: mc2
+
+ iommus:
+ minItems: 2
+ maxItems: 2
+
+ interconnects:
+ minItems: 8
+ maxItems: 8
+
+ interconnect-names:
+ minItems: 8
+ maxItems: 8
+
+ power-domains:
+ items:
+ - description: phandle to the TD power domain
+ - description: phandle to the TD2 power domain
+
+ power-domain-names:
+ items:
+ - const: 3d0
+ - const: 3d1
+
+ dependencies:
+ power-domains: [ power-domain-names ]
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra114-gr3d
+ then:
+ properties:
+ clocks:
+ items:
+ - description: module clock
+
+ clock-names:
+ items:
+ - const: 3d
+
+ resets:
+ items:
+ - description: module reset
+ - description: memory client hotflush reset
+
+ reset-names:
+ items:
+ - const: 3d
+ - const: mc
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ minItems: 10
+ maxItems: 10
+
+ interconnect-names:
+ minItems: 10
+ maxItems: 10
+
+ power-domains:
+ items:
+ - description: phandle to the TD power domain
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/memory/tegra20-mc.h>
+
+ gr3d@54180000 {
+ compatible = "nvidia,tegra20-gr3d";
+ reg = <0x54180000 0x00040000>;
+ clocks = <&tegra_car TEGRA20_CLK_GR3D>;
+ resets = <&tegra_car 24>, <&mc TEGRA20_MC_RESET_3D>;
+ reset-names = "3d", "mc";
+ };
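+
+  # A hedged sketch of the Tegra30 variant with its second 3D unit; the
+  # power-domain phandles are hypothetical, while clock, CAR reset and MC
+  # hotflush reset IDs loosely follow tegra30.dtsi.
+  - |
+    #include <dt-bindings/clock/tegra30-car.h>
+    #include <dt-bindings/memory/tegra30-mc.h>
+
+    gr3d@54180000 {
+        compatible = "nvidia,tegra30-gr3d";
+        reg = <0x54180000 0x00040000>;
+        clocks = <&tegra_car TEGRA30_CLK_GR3D>,
+                 <&tegra_car TEGRA30_CLK_GR3D2>;
+        clock-names = "3d", "3d2";
+        resets = <&tegra_car 24>, <&tegra_car 98>,
+                 <&mc TEGRA30_MC_RESET_3D>, <&mc TEGRA30_MC_RESET_3D2>;
+        reset-names = "3d", "3d2", "mc", "mc2";
+
+        power-domains = <&pd_3d0>, <&pd_3d1>;
+        power-domain-names = "3d0", "3d1";
+    };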
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-hdmi.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-hdmi.yaml
new file mode 100644
index 000000000000..f77197e4869f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-hdmi.yaml
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-hdmi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra HDMI Output Encoder
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^hdmi@[0-9a-f]+$"
+
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra20-hdmi
+ - nvidia,tegra30-hdmi
+ - nvidia,tegra114-hdmi
+ - nvidia,tegra124-hdmi
+
+ - items:
+ - const: nvidia,tegra132-hdmi
+ - const: nvidia,tegra124-hdmi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+ - description: parent clock
+
+ clock-names:
+ items:
+ - const: hdmi
+ - const: parent
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: hdmi
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the core power domain
+
+ hdmi-supply:
+ description: supply for the +5V HDMI connector pin
+
+ vdd-supply:
+ description: regulator for supply voltage
+
+ pll-supply:
+ description: regulator for PLL
+
+ nvidia,ddc-i2c-bus:
+ description: phandle of an I2C controller used for DDC EDID
+ probing
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ nvidia,hpd-gpio:
+ description: specifies a GPIO used for hotplug detection
+ maxItems: 1
+
+ nvidia,edid:
+ description: supplies a binary EDID blob
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+
+ nvidia,panel:
+ description: phandle of a display panel
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+ "#sound-dai-cells":
+ const: 0
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - pll-supply
+ - vdd-supply
+ - nvidia,ddc-i2c-bus
+ - nvidia,hpd-gpio
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra124-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/gpio/tegra-gpio.h>
+
+ hdmi@54280000 {
+ compatible = "nvidia,tegra124-hdmi";
+ reg = <0x54280000 0x00040000>;
+ interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA124_CLK_HDMI>,
+ <&tegra_car TEGRA124_CLK_PLL_D2_OUT0>;
+ clock-names = "hdmi", "parent";
+ resets = <&tegra_car 51>;
+ reset-names = "hdmi";
+
+ hdmi-supply = <&vdd_5v0_hdmi>;
+ pll-supply = <&vdd_hdmi_pll>;
+ vdd-supply = <&vdd_3v3_hdmi>;
+
+ nvidia,ddc-i2c-bus = <&hdmi_ddc>;
+ nvidia,hpd-gpio = <&gpio TEGRA_GPIO(N, 7) GPIO_ACTIVE_HIGH>;
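+
+        /* optional: expose the encoder as an audio DAI for HDMI audio;
+           the schema fixes this at zero cells */
+        #sound-dai-cells = <0>;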
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
deleted file mode 100644
index 8a6d3e1ee306..000000000000
--- a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt
+++ /dev/null
@@ -1,622 +0,0 @@
-NVIDIA Tegra host1x
-
-Required properties:
-- compatible: "nvidia,tegra<chip>-host1x"
-- reg: Physical base address and length of the controller's registers.
- For pre-Tegra186, one entry describing the whole register area.
- For Tegra186, one entry for each entry in reg-names:
- "vm" - VM region assigned to Linux
- "hypervisor" - Hypervisor region (only if Linux acts as hypervisor)
-- interrupts: The interrupt outputs from the controller.
-- #address-cells: The number of cells used to represent physical base addresses
- in the host1x address space. Should be 1.
-- #size-cells: The number of cells used to represent the size of an address
- range in the host1x address space. Should be 1.
-- ranges: The mapping of the host1x address space to the CPU address space.
-- clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
-- resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
-- reset-names: Must include the following entries:
- - host1x
-
-Each host1x client module having to perform DMA through the Memory Controller
-should have the interconnect endpoints set to the Memory Client and External
-Memory respectively.
-
-The host1x top-level node defines a number of children, each representing one
-of the following host1x client modules:
-
-- mpe: video encoder
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-mpe"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - mpe
-
- Optional properties:
- - interconnects: Must contain entry for the MPE memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- vi: video input
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-vi"
- - reg: Physical base address and length of the controller registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
- - Tegra20/Tegra30/Tegra114/Tegra124:
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - vi
- - Tegra210:
- - power-domains: Must include venc powergate node as vi is in VE partition.
-
- ports (optional node)
- vi can have optional ports node and max 6 ports are supported. Each port
- should have single 'endpoint' child node. All port nodes are grouped under
- ports node. Please refer to the bindings defined in
- Documentation/devicetree/bindings/media/video-interfaces.txt
-
- csi (required node)
- Tegra210 has CSI part of VI sharing same host interface and register space.
- So, VI device node should have CSI child node.
-
- - csi: mipi csi interface to vi
-
- Required properties:
- - compatible: "nvidia,tegra210-csi"
- - reg: Physical base address offset to parent and length of the controller
- registers.
- - clocks: Must contain entries csi, cilab, cilcd, cile, csi_tpg clocks.
- See ../clocks/clock-bindings.txt for details.
- - power-domains: Must include sor powergate node as csicil is in
- SOR partition.
-
- channel (optional nodes)
- Maximum 6 channels are supported with each csi brick as either x4 or x2
- based on hw connectivity to sensor.
-
- Required properties:
- - reg: csi port number. Valid port numbers are 0 through 5.
- - nvidia,mipi-calibrate: Should contain a phandle and a specifier
- specifying which pads are used by this CSI port and need to be
- calibrated. See also ../display/tegra/nvidia,tegra114-mipi.txt.
-
- Each channel node must contain 2 port nodes which can be grouped
- under 'ports' node and each port should have a single child 'endpoint'
- node.
-
- ports node
- Please refer to the bindings defined in
- Documentation/devicetree/bindings/media/video-interfaces.txt
-
- ports node must contain below 2 port nodes.
- port@0 with single child 'endpoint' node always a sink.
- port@1 with single child 'endpoint' node always a source.
-
- port@0 (required node)
- Required properties:
- - reg: 0
-
- endpoint (required node)
- Required properties:
- - data-lanes: an array of data lane from 1 to 8. Valid array
- lengths are 1/2/4/8.
- - remote-endpoint: phandle to sensor 'endpoint' node.
-
- port@1 (required node)
- Required properties:
- - reg: 1
-
- endpoint (required node)
- Required properties:
- - remote-endpoint: phandle to vi port 'endpoint' node.
-
- Optional properties:
- - interconnects: Must contain entry for the VI memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- epp: encoder pre-processor
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-epp"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - epp
-
- Optional properties:
- - interconnects: Must contain entry for the EPP memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- isp: image signal processor
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-isp"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - isp
-
- Optional properties:
- - interconnects: Must contain entry for the ISP memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- gr2d: 2D graphics engine
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-gr2d"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - 2d
-
- Optional properties:
- - interconnects: Must contain entry for the GR2D memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- gr3d: 3D graphics engine
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-gr3d"
- - reg: Physical base address and length of the controller's registers.
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- (This property may be omitted if the only clock in the list is "3d")
- - 3d
- This MUST be the first entry.
- - 3d2 (Only required on SoCs with two 3D clocks)
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - 3d
- - 3d2 (Only required on SoCs with two 3D clocks)
-
- Optional properties:
- - interconnects: Must contain entry for the GR3D memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- dc: display controller
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-dc"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- - dc
- This MUST be the first entry.
- - parent
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - dc
- - nvidia,head: The number of the display controller head. This is used to
- setup the various types of output to receive video data from the given
- head.
-
- Each display controller node has a child node, named "rgb", that represents
- the RGB output associated with the controller. It can take the following
- optional properties:
- - nvidia,ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing
- - nvidia,hpd-gpio: specifies a GPIO used for hotplug detection
- - nvidia,edid: supplies a binary EDID blob
- - nvidia,panel: phandle of a display panel
- - interconnects: Must contain entry for the DC memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-- hdmi: High Definition Multimedia Interface
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-hdmi"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - hdmi-supply: supply for the +5V HDMI connector pin
- - vdd-supply: regulator for supply voltage
- - pll-supply: regulator for PLL
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- - hdmi
- This MUST be the first entry.
- - parent
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - hdmi
-
- Optional properties:
- - nvidia,ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing
- - nvidia,hpd-gpio: specifies a GPIO used for hotplug detection
- - nvidia,edid: supplies a binary EDID blob
- - nvidia,panel: phandle of a display panel
-
-- tvo: TV encoder output
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-tvo"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain one entry, for the module clock.
- See ../clocks/clock-bindings.txt for details.
-
-- dsi: display serial interface
-
- Required properties:
- - compatible: "nvidia,tegra<chip>-dsi"
- - reg: Physical base address and length of the controller's registers.
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- - dsi
- This MUST be the first entry.
- - lp
- - parent
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - dsi
- - avdd-dsi-supply: phandle of a supply that powers the DSI controller
- - nvidia,mipi-calibrate: Should contain a phandle and a specifier specifying
- which pads are used by this DSI output and need to be calibrated. See also
- ../display/tegra/nvidia,tegra114-mipi.txt.
-
- Optional properties:
- - nvidia,ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing
- - nvidia,hpd-gpio: specifies a GPIO used for hotplug detection
- - nvidia,edid: supplies a binary EDID blob
- - nvidia,panel: phandle of a display panel
- - nvidia,ganged-mode: contains a phandle to a second DSI controller to gang
- up with in order to support up to 8 data lanes
-
-- sor: serial output resource
-
- Required properties:
- - compatible: Should be:
- - "nvidia,tegra124-sor": for Tegra124 and Tegra132
- - "nvidia,tegra132-sor": for Tegra132
- - "nvidia,tegra210-sor": for Tegra210
- - "nvidia,tegra210-sor1": for Tegra210
- - "nvidia,tegra186-sor": for Tegra186
- - "nvidia,tegra186-sor1": for Tegra186
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- - sor: clock input for the SOR hardware
- - out: SOR output clock
- - parent: input for the pixel clock
- - dp: reference clock for the SOR clock
- - safe: safe reference for the SOR clock during power up
-
- For Tegra186 and later:
- - pad: SOR pad output clock (on Tegra186 and later)
-
- Obsolete:
- - source: source clock for the SOR clock (obsolete, use "out" instead)
-
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - sor
-
- Required properties on Tegra186 and later:
- - nvidia,interface: index of the SOR interface
-
- Optional properties:
- - nvidia,ddc-i2c-bus: phandle of an I2C controller used for DDC EDID probing
- - nvidia,hpd-gpio: specifies a GPIO used for hotplug detection
- - nvidia,edid: supplies a binary EDID blob
- - nvidia,panel: phandle of a display panel
- - nvidia,xbar-cfg: 5 cells containing the crossbar configuration. Each lane
- of the SOR, identified by the cell's index, is mapped via the crossbar to
- the pad specified by the cell's value.
-
- Optional properties when driving an eDP output:
- - nvidia,dpaux: phandle to a DispayPort AUX interface
-
-- dpaux: DisplayPort AUX interface
- - compatible : Should contain one of the following:
- - "nvidia,tegra124-dpaux": for Tegra124 and Tegra132
- - "nvidia,tegra210-dpaux": for Tegra210
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- - dpaux: clock input for the DPAUX hardware
- - parent: reference clock
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - dpaux
- - vdd-supply: phandle of a supply that powers the DisplayPort link
- - i2c-bus: Subnode where I2C slave devices are listed. This subnode
- must be always present. If there are no I2C slave devices, an empty
- node should be added. See ../../i2c/i2c.txt for more information.
-
- See ../pinctrl/nvidia,tegra124-dpaux-padctl.txt for information
- regarding the DPAUX pad controller bindings.
-
-- vic: Video Image Compositor
- - compatible : "nvidia,tegra<chip>-vic"
- - reg: Physical base address and length of the controller's registers.
- - interrupts: The interrupt outputs from the controller.
- - clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
- - clock-names: Must include the following entries:
- - vic: clock input for the VIC hardware
- - resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: Must include the following entries:
- - vic
-
- Optional properties:
- - interconnects: Must contain entry for the VIC memory clients.
- - interconnect-names: Must include name of the interconnect path for each
- interconnect entry. Consult TRM documentation for information about
- available memory clients, see MEMORY CONTROLLER section.
-
-Example:
-
-/ {
- ...
-
- host1x {
- compatible = "nvidia,tegra20-host1x", "simple-bus";
- reg = <0x50000000 0x00024000>;
- interrupts = <0 65 0x04 /* mpcore syncpt */
- 0 67 0x04>; /* mpcore general */
- clocks = <&tegra_car TEGRA20_CLK_HOST1X>;
- resets = <&tegra_car 28>;
- reset-names = "host1x";
-
- #address-cells = <1>;
- #size-cells = <1>;
-
- ranges = <0x54000000 0x54000000 0x04000000>;
-
- mpe {
- compatible = "nvidia,tegra20-mpe";
- reg = <0x54040000 0x00040000>;
- interrupts = <0 68 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_MPE>;
- resets = <&tegra_car 60>;
- reset-names = "mpe";
- };
-
- vi@54080000 {
- compatible = "nvidia,tegra210-vi";
- reg = <0x0 0x54080000 0x0 0x700>;
- interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
- assigned-clocks = <&tegra_car TEGRA210_CLK_VI>;
- assigned-clock-parents = <&tegra_car TEGRA210_CLK_PLL_C4_OUT0>;
-
- clocks = <&tegra_car TEGRA210_CLK_VI>;
- power-domains = <&pd_venc>;
-
- #address-cells = <1>;
- #size-cells = <1>;
-
- ranges = <0x0 0x0 0x54080000 0x2000>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- imx219_vi_in0: endpoint {
- remote-endpoint = <&imx219_csi_out0>;
- };
- };
- };
-
- csi@838 {
- compatible = "nvidia,tegra210-csi";
- reg = <0x838 0x1300>;
- assigned-clocks = <&tegra_car TEGRA210_CLK_CILAB>,
- <&tegra_car TEGRA210_CLK_CILCD>,
- <&tegra_car TEGRA210_CLK_CILE>,
- <&tegra_car TEGRA210_CLK_CSI_TPG>;
- assigned-clock-parents = <&tegra_car TEGRA210_CLK_PLL_P>,
- <&tegra_car TEGRA210_CLK_PLL_P>,
- <&tegra_car TEGRA210_CLK_PLL_P>;
- assigned-clock-rates = <102000000>,
- <102000000>,
- <102000000>,
- <972000000>;
-
- clocks = <&tegra_car TEGRA210_CLK_CSI>,
- <&tegra_car TEGRA210_CLK_CILAB>,
- <&tegra_car TEGRA210_CLK_CILCD>,
- <&tegra_car TEGRA210_CLK_CILE>,
- <&tegra_car TEGRA210_CLK_CSI_TPG>;
- clock-names = "csi", "cilab", "cilcd", "cile", "csi_tpg";
- power-domains = <&pd_sor>;
-
- #address-cells = <1>;
- #size-cells = <0>;
-
- channel@0 {
- reg = <0>;
- nvidia,mipi-calibrate = <&mipi 0x001>;
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- reg = <0>;
- imx219_csi_in0: endpoint {
- data-lanes = <1 2>;
- remote-endpoint = <&imx219_out0>;
- };
- };
-
- port@1 {
- reg = <1>;
- imx219_csi_out0: endpoint {
- remote-endpoint = <&imx219_vi_in0>;
- };
- };
- };
- };
- };
- };
-
- epp {
- compatible = "nvidia,tegra20-epp";
- reg = <0x540c0000 0x00040000>;
- interrupts = <0 70 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_EPP>;
- resets = <&tegra_car 19>;
- reset-names = "epp";
- };
-
- isp {
- compatible = "nvidia,tegra20-isp";
- reg = <0x54100000 0x00040000>;
- interrupts = <0 71 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_ISP>;
- resets = <&tegra_car 23>;
- reset-names = "isp";
- };
-
- gr2d {
- compatible = "nvidia,tegra20-gr2d";
- reg = <0x54140000 0x00040000>;
- interrupts = <0 72 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_GR2D>;
- resets = <&tegra_car 21>;
- reset-names = "2d";
- };
-
- gr3d {
- compatible = "nvidia,tegra20-gr3d";
- reg = <0x54180000 0x00040000>;
- clocks = <&tegra_car TEGRA20_CLK_GR3D>;
- resets = <&tegra_car 24>;
- reset-names = "3d";
- };
-
- dc@54200000 {
- compatible = "nvidia,tegra20-dc";
- reg = <0x54200000 0x00040000>;
- interrupts = <0 73 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_DISP1>,
- <&tegra_car TEGRA20_CLK_PLL_P>;
- clock-names = "dc", "parent";
- resets = <&tegra_car 27>;
- reset-names = "dc";
-
- interconnects = <&mc TEGRA20_MC_DISPLAY0A &emc>,
- <&mc TEGRA20_MC_DISPLAY0B &emc>,
- <&mc TEGRA20_MC_DISPLAY0C &emc>,
- <&mc TEGRA20_MC_DISPLAYHC &emc>;
- interconnect-names = "wina",
- "winb",
- "winc",
- "cursor";
-
- rgb {
- status = "disabled";
- };
- };
-
- dc@54240000 {
- compatible = "nvidia,tegra20-dc";
- reg = <0x54240000 0x00040000>;
- interrupts = <0 74 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_DISP2>,
- <&tegra_car TEGRA20_CLK_PLL_P>;
- clock-names = "dc", "parent";
- resets = <&tegra_car 26>;
- reset-names = "dc";
-
- interconnects = <&mc TEGRA20_MC_DISPLAY0AB &emc>,
- <&mc TEGRA20_MC_DISPLAY0BB &emc>,
- <&mc TEGRA20_MC_DISPLAY0CB &emc>,
- <&mc TEGRA20_MC_DISPLAYHCB &emc>;
- interconnect-names = "wina",
- "winb",
- "winc",
- "cursor";
-
- rgb {
- status = "disabled";
- };
- };
-
- hdmi {
- compatible = "nvidia,tegra20-hdmi";
- reg = <0x54280000 0x00040000>;
- interrupts = <0 75 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_HDMI>,
- <&tegra_car TEGRA20_CLK_PLL_D_OUT0>;
- clock-names = "hdmi", "parent";
- resets = <&tegra_car 51>;
- reset-names = "hdmi";
- status = "disabled";
- };
-
- tvo {
- compatible = "nvidia,tegra20-tvo";
- reg = <0x542c0000 0x00040000>;
- interrupts = <0 76 0x04>;
- clocks = <&tegra_car TEGRA20_CLK_TVO>;
- status = "disabled";
- };
-
- dsi {
- compatible = "nvidia,tegra20-dsi";
- reg = <0x54300000 0x00040000>;
- clocks = <&tegra_car TEGRA20_CLK_DSI>,
- <&tegra_car TEGRA20_CLK_PLL_D_OUT0>;
- clock-names = "dsi", "parent";
- resets = <&tegra_car 48>;
- reset-names = "dsi";
- status = "disabled";
- };
- };
-
- ...
-};
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.yaml
new file mode 100644
index 000000000000..94c5242c03b2
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.yaml
@@ -0,0 +1,430 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-host1x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra host1x controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+description: The host1x top-level node defines a number of children, each
+ representing one of the host1x client modules defined in this binding.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra20-host1x
+ - nvidia,tegra30-host1x
+ - nvidia,tegra114-host1x
+ - nvidia,tegra124-host1x
+ - nvidia,tegra210-host1x
+ - nvidia,tegra186-host1x
+ - nvidia,tegra194-host1x
+ - nvidia,tegra234-host1x
+
+ - items:
+ - const: nvidia,tegra132-host1x
+ - const: nvidia,tegra124-host1x
+
+ reg:
+ minItems: 1
+ maxItems: 3
+
+ reg-names:
+ minItems: 1
+ maxItems: 3
+
+ interrupts:
+ minItems: 1
+ maxItems: 9
+
+ interrupt-names:
+ minItems: 1
+ maxItems: 9
+
+ '#address-cells':
+ description: The number of cells used to represent physical base addresses
+ in the host1x address space.
+ enum: [1, 2]
+
+ '#size-cells':
+ description: The number of cells used to represent the size of an address
+ range in the host1x address space.
+ enum: [1, 2]
+
+ ranges:
+ maxItems: 1
+
+ clocks:
+ description: Must contain one entry, for the module clock. See
+ ../clocks/clock-bindings.txt for details.
+
+ clock-names:
+ items:
+ - const: host1x
+
+ resets:
+ minItems: 1 # MC reset is optional on Tegra186 and later
+ items:
+ - description: module reset
+ - description: memory client hotflush reset
+
+ reset-names:
+ minItems: 1 # MC reset is optional on Tegra186 and later
+ items:
+ - const: host1x
+ - const: mc
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ items:
+ - description: memory read client for host1x
+
+ interconnect-names:
+ items:
+ - const: dma-mem # read
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the HEG or core power domain
+
+required:
+ - compatible
+ - interrupts
+ - interrupt-names
+ - '#address-cells'
+ - '#size-cells'
+ - ranges
+ - reg
+ - clocks
+ - clock-names
+
+unevaluatedProperties:
+ type: object
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra20-host1x
+ - nvidia,tegra30-host1x
+ - nvidia,tegra114-host1x
+ - nvidia,tegra124-host1x
+ - nvidia,tegra210-host1x
+ then:
+ properties:
+ interrupts:
+ items:
+ - description: host1x syncpoint interrupt
+ - description: host1x general interrupt
+
+ interrupt-names:
+ items:
+ - const: syncpt
+ - const: host1x
+ required:
+ - resets
+ - reset-names
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra186-host1x
+ - nvidia,tegra194-host1x
+ then:
+ properties:
+ reg-names:
+ items:
+ - const: hypervisor
+ - const: vm
+
+ reg:
+ items:
+ - description: region used by the hypervisor
+ - description: region assigned to the virtual machine
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ maxItems: 1
+
+ interrupts:
+ items:
+ - description: host1x syncpoint interrupt
+ - description: host1x general interrupt
+
+ interrupt-names:
+ items:
+ - const: syncpt
+ - const: host1x
+
+ iommu-map:
+ description: Specification of stream IDs available for memory context device
+ use. Should be a mapping of IDs 0..n to IOMMU entries corresponding to
+ usable stream IDs.
+
+ required:
+ - reg-names
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra234-host1x
+ then:
+ properties:
+ reg-names:
+ items:
+ - const: common
+ - const: hypervisor
+ - const: vm
+
+ reg:
+ items:
+ - description: region used by host1x server
+ - description: region used by the hypervisor
+ - description: region assigned to the virtual machine
+
+ interrupts:
+ items:
+ - description: host1x syncpoint interrupt 0
+ - description: host1x syncpoint interrupt 1
+ - description: host1x syncpoint interrupt 2
+ - description: host1x syncpoint interrupt 3
+ - description: host1x syncpoint interrupt 4
+ - description: host1x syncpoint interrupt 5
+ - description: host1x syncpoint interrupt 6
+ - description: host1x syncpoint interrupt 7
+ - description: host1x general interrupt
+
+ interrupt-names:
+ items:
+ - const: syncpt0
+ - const: syncpt1
+ - const: syncpt2
+ - const: syncpt3
+ - const: syncpt4
+ - const: syncpt5
+ - const: syncpt6
+ - const: syncpt7
+ - const: host1x
+
+ iommu-map:
+ description: Specification of stream IDs available for memory context device
+ use. Should be a mapping of IDs 0..n to IOMMU entries corresponding to
+ usable stream IDs.
+
+ required:
+ - reg-names
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/gpio/tegra-gpio.h>
+ #include <dt-bindings/memory/tegra20-mc.h>
+
+ host1x@50000000 {
+ compatible = "nvidia,tegra20-host1x";
+ reg = <0x50000000 0x00024000>;
+ interrupts = <0 65 0x04>, /* mpcore syncpt */
+ <0 67 0x04>; /* mpcore general */
+ interrupt-names = "syncpt", "host1x";
+ clocks = <&tegra_car TEGRA20_CLK_HOST1X>;
+ clock-names = "host1x";
+ resets = <&tegra_car 28>, <&mc TEGRA20_MC_RESET_HC>;
+ reset-names = "host1x", "mc";
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ ranges = <0x54000000 0x54000000 0x04000000>;
+
+ mpe@54040000 {
+ compatible = "nvidia,tegra20-mpe";
+ reg = <0x54040000 0x00040000>;
+ interrupts = <0 68 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_MPE>;
+ resets = <&tegra_car 60>;
+ reset-names = "mpe";
+ };
+
+ vi@54080000 {
+ compatible = "nvidia,tegra20-vi";
+ reg = <0x54080000 0x00040000>;
+ interrupts = <0 69 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_VI>;
+ resets = <&tegra_car 100>;
+ reset-names = "vi";
+ };
+
+ epp@540c0000 {
+ compatible = "nvidia,tegra20-epp";
+ reg = <0x540c0000 0x00040000>;
+ interrupts = <0 70 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_EPP>;
+ resets = <&tegra_car 19>;
+ reset-names = "epp";
+ };
+
+ isp@54100000 {
+ compatible = "nvidia,tegra20-isp";
+ reg = <0x54100000 0x00040000>;
+ interrupts = <0 71 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_ISP>;
+ resets = <&tegra_car 23>;
+ reset-names = "isp";
+ };
+
+ gr2d@54140000 {
+ compatible = "nvidia,tegra20-gr2d";
+ reg = <0x54140000 0x00040000>;
+ interrupts = <0 72 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_GR2D>;
+ resets = <&tegra_car 21>, <&mc TEGRA20_MC_RESET_2D>;
+ reset-names = "2d", "mc";
+ };
+
+ gr3d@54180000 {
+ compatible = "nvidia,tegra20-gr3d";
+ reg = <0x54180000 0x00040000>;
+ clocks = <&tegra_car TEGRA20_CLK_GR3D>;
+ resets = <&tegra_car 24>, <&mc TEGRA20_MC_RESET_3D>;
+ reset-names = "3d", "mc";
+ };
+
+ dc@54200000 {
+ compatible = "nvidia,tegra20-dc";
+ reg = <0x54200000 0x00040000>;
+ interrupts = <0 73 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_DISP1>;
+ clock-names = "dc";
+ resets = <&tegra_car 27>;
+ reset-names = "dc";
+
+ rgb {
+ };
+ };
+
+ dc@54240000 {
+ compatible = "nvidia,tegra20-dc";
+ reg = <0x54240000 0x00040000>;
+ interrupts = <0 74 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_DISP2>;
+ clock-names = "dc";
+ resets = <&tegra_car 26>;
+ reset-names = "dc";
+
+ rgb {
+ };
+ };
+
+ hdmi@54280000 {
+ compatible = "nvidia,tegra20-hdmi";
+ reg = <0x54280000 0x00040000>;
+ interrupts = <0 75 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_HDMI>,
+ <&tegra_car TEGRA20_CLK_PLL_D_OUT0>;
+ clock-names = "hdmi", "parent";
+ resets = <&tegra_car 51>;
+ reset-names = "hdmi";
+
+ hdmi-supply = <&vdd_5v0_hdmi>;
+ pll-supply = <&vdd_hdmi_pll>;
+ vdd-supply = <&vdd_3v3_hdmi>;
+
+ nvidia,ddc-i2c-bus = <&hdmi_ddc>;
+ nvidia,hpd-gpio = <&gpio TEGRA_GPIO(N, 7) GPIO_ACTIVE_HIGH>;
+ };
+
+ tvo@542c0000 {
+ compatible = "nvidia,tegra20-tvo";
+ reg = <0x542c0000 0x00040000>;
+ interrupts = <0 76 0x04>;
+ clocks = <&tegra_car TEGRA20_CLK_TVO>;
+ };
+
+ dsi@54300000 {
+ compatible = "nvidia,tegra20-dsi";
+ reg = <0x54300000 0x00040000>;
+ clocks = <&tegra_car TEGRA20_CLK_DSI>,
+ <&tegra_car TEGRA20_CLK_PLL_D_OUT0>;
+ clock-names = "dsi", "parent";
+ resets = <&tegra_car 48>;
+ reset-names = "dsi";
+ };
+ };
+
+ - |
+ #include <dt-bindings/clock/tegra210-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/memory/tegra210-mc.h>
+
+ host1x@50000000 {
+ compatible = "nvidia,tegra210-host1x";
+ reg = <0x50000000 0x00024000>;
+ interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* mpcore syncpt */
+ <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* mpcore general */
+ interrupt-names = "syncpt", "host1x";
+ clocks = <&tegra_car TEGRA210_CLK_HOST1X>;
+ clock-names = "host1x";
+ resets = <&tegra_car 28>;
+ reset-names = "host1x";
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ ranges = <0x54000000 0x54000000 0x01000000>;
+ iommus = <&mc TEGRA_SWGROUP_HC>;
+
+ vi@54080000 {
+ compatible = "nvidia,tegra210-vi";
+ reg = <0x54080000 0x00000700>;
+ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
+ assigned-clocks = <&tegra_car TEGRA210_CLK_VI>;
+ assigned-clock-parents = <&tegra_car TEGRA210_CLK_PLL_C4_OUT0>;
+
+ clocks = <&tegra_car TEGRA210_CLK_VI>;
+ power-domains = <&pd_venc>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ ranges = <0x0 0x54080000 0x2000>;
+
+ csi@838 {
+ compatible = "nvidia,tegra210-csi";
+ reg = <0x838 0x1300>;
+ assigned-clocks = <&tegra_car TEGRA210_CLK_CILAB>,
+ <&tegra_car TEGRA210_CLK_CILCD>,
+ <&tegra_car TEGRA210_CLK_CILE>,
+ <&tegra_car TEGRA210_CLK_CSI_TPG>;
+ assigned-clock-parents = <&tegra_car TEGRA210_CLK_PLL_P>,
+ <&tegra_car TEGRA210_CLK_PLL_P>,
+ <&tegra_car TEGRA210_CLK_PLL_P>;
+ assigned-clock-rates = <102000000>,
+ <102000000>,
+ <102000000>,
+ <972000000>;
+
+ clocks = <&tegra_car TEGRA210_CLK_CSI>,
+ <&tegra_car TEGRA210_CLK_CILAB>,
+ <&tegra_car TEGRA210_CLK_CILCD>,
+ <&tegra_car TEGRA210_CLK_CILE>,
+ <&tegra_car TEGRA210_CLK_CSI_TPG>;
+ clock-names = "csi", "cilab", "cilcd", "cile", "csi_tpg";
+ power-domains = <&pd_sor>;
+ };
+ };
+ };
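+
+  # A hedged sketch of the Tegra186 two-region ("hypervisor"/"vm") layout,
+  # including a stream ID mapping for memory context devices; register,
+  # interrupt and stream ID values loosely follow tegra186.dtsi and are
+  # illustrative rather than authoritative.
+  - |
+    #include <dt-bindings/clock/tegra186-clock.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/memory/tegra186-mc.h>
+    #include <dt-bindings/reset/tegra186-reset.h>
+
+    host1x@13e00000 {
+        compatible = "nvidia,tegra186-host1x";
+        reg = <0x13e00000 0x10000>,
+              <0x13e10000 0x10000>;
+        reg-names = "hypervisor", "vm";
+        interrupts = <GIC_SPI 265 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */
+                     <GIC_SPI 263 IRQ_TYPE_LEVEL_HIGH>; /* general */
+        interrupt-names = "syncpt", "host1x";
+        clocks = <&bpmp TEGRA186_CLK_HOST1X>;
+        clock-names = "host1x";
+        resets = <&bpmp TEGRA186_RESET_HOST1X>;
+        reset-names = "host1x";
+
+        iommus = <&smmu TEGRA186_SID_HOST1X>;
+        iommu-map = <0 &smmu TEGRA186_SID_HOST1X_CTX0 8>;
+
+        #address-cells = <1>;
+        #size-cells = <1>;
+
+        ranges = <0x15000000 0x15000000 0x01000000>;
+    };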
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-isp.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-isp.yaml
new file mode 100644
index 000000000000..3bc3b22e98e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-isp.yaml
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-isp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra ISP processor
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ compatible:
+ enum:
+ - nvidia,tegra20-isp
+ - nvidia,tegra30-isp
+ - nvidia,tegra210-isp
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: isp
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ items:
+ - description: memory write client
+
+ interconnect-names:
+ items:
+ - const: dma-mem # write
+
+ power-domains:
+ items:
+ - description: phandle to the VENC or core power domain
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ isp@54100000 {
+ compatible = "nvidia,tegra20-isp";
+ reg = <0x54100000 0x00040000>;
+ interrupts = <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_ISP>;
+ resets = <&tegra_car 23>;
+ reset-names = "isp";
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-mpe.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-mpe.yaml
new file mode 100644
index 000000000000..2cd3e60cd0a8
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-mpe.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-mpe.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Video Encoder
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^mpe@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra20-mpe
+ - nvidia,tegra30-mpe
+ - nvidia,tegra114-mpe
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: mpe
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ maxItems: 6
+
+ interconnect-names:
+ maxItems: 6
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the MPE power domain
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ mpe@54040000 {
+ compatible = "nvidia,tegra20-mpe";
+ reg = <0x54040000 0x00040000>;
+ interrupts = <GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_MPE>;
+ resets = <&tegra_car 60>;
+ reset-names = "mpe";
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-tvo.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-tvo.yaml
new file mode 100644
index 000000000000..6c84d8b7eb7b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-tvo.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-tvo.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra TV Encoder Output
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^tvo@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra20-tvo
+ - nvidia,tegra30-tvo
+ - nvidia,tegra114-tvo
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the core power domain
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ tvo@542c0000 {
+ compatible = "nvidia,tegra20-tvo";
+ reg = <0x542c0000 0x00040000>;
+ interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_TVO>;
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-vi.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-vi.yaml
new file mode 100644
index 000000000000..a42bf33d1e7d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-vi.yaml
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra20-vi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Video Input controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^vi@[0-9a-f]+$"
+
+ compatible:
+ oneOf:
+ - const: nvidia,tegra20-vi
+ - const: nvidia,tegra30-vi
+ - const: nvidia,tegra114-vi
+ - const: nvidia,tegra124-vi
+ - items:
+ - const: nvidia,tegra132-vi
+ - const: nvidia,tegra124-vi
+ - const: nvidia,tegra210-vi
+ - const: nvidia,tegra186-vi
+ - const: nvidia,tegra194-vi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ resets:
+ items:
+ - description: module reset
+
+ reset-names:
+ items:
+ - const: vi
+
+ iommus:
+ maxItems: 1
+
+ interconnects:
+ minItems: 4
+ maxItems: 5
+
+ interconnect-names:
+ minItems: 4
+ maxItems: 5
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the VENC power domain
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 1
+
+ ranges:
+ maxItems: 1
+
+ avdd-dsi-csi-supply:
+ description: DSI/CSI power supply. Must supply 1.2 V.
+
+patternProperties:
+ "^csi@[0-9a-f]+$":
+ type: object
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra20-vi
+ - nvidia,tegra30-vi
+ - nvidia,tegra114-vi
+ - nvidia,tegra124-vi
+ then:
+ required:
+ - resets
+ - reset-names
+ else:
+ required:
+ - power-domains
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ vi@54080000 {
+ compatible = "nvidia,tegra20-vi";
+ reg = <0x54080000 0x00040000>;
+ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&tegra_car TEGRA20_CLK_VI>;
+ resets = <&tegra_car 100>;
+ reset-names = "vi";
+ };
+
+ - |
+ #include <dt-bindings/clock/tegra210-car.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ vi@54080000 {
+ compatible = "nvidia,tegra210-vi";
+ reg = <0x54080000 0x00000700>;
+ interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>;
+ assigned-clocks = <&tegra_car TEGRA210_CLK_VI>;
+ assigned-clock-parents = <&tegra_car TEGRA210_CLK_PLL_C4_OUT0>;
+
+ clocks = <&tegra_car TEGRA210_CLK_VI>;
+ power-domains = <&pd_venc>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ ranges = <0x0 0x54080000 0x2000>;
+
+ csi@838 {
+ compatible = "nvidia,tegra210-csi";
+ reg = <0x838 0x1300>;
+ assigned-clocks = <&tegra_car TEGRA210_CLK_CILAB>,
+ <&tegra_car TEGRA210_CLK_CILCD>,
+ <&tegra_car TEGRA210_CLK_CILE>,
+ <&tegra_car TEGRA210_CLK_CSI_TPG>;
+ assigned-clock-parents = <&tegra_car TEGRA210_CLK_PLL_P>,
+ <&tegra_car TEGRA210_CLK_PLL_P>,
+ <&tegra_car TEGRA210_CLK_PLL_P>;
+ assigned-clock-rates = <102000000>,
+ <102000000>,
+ <102000000>,
+ <972000000>;
+
+ clocks = <&tegra_car TEGRA210_CLK_CSI>,
+ <&tegra_car TEGRA210_CLK_CILAB>,
+ <&tegra_car TEGRA210_CLK_CILCD>,
+ <&tegra_car TEGRA210_CLK_CILE>,
+ <&tegra_car TEGRA210_CLK_CSI_TPG>;
+ clock-names = "csi", "cilab", "cilcd", "cile", "csi_tpg";
+ power-domains = <&pd_sor>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra210-csi.yaml b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra210-csi.yaml
new file mode 100644
index 000000000000..fa07a40d1004
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra210-csi.yaml
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/tegra/nvidia,tegra210-csi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra CSI controller
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^csi@[0-9a-f]+$"
+
+ compatible:
+ enum:
+ - nvidia,tegra210-csi
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: module clock
+ - description: A/B lanes clock
+ - description: C/D lanes clock
+ - description: E lane clock
+ - description: test pattern generator clock
+
+ clock-names:
+ items:
+ - const: csi
+ - const: cilab
+ - const: cilcd
+ - const: cile
+ - const: csi_tpg
+
+ power-domains:
+ maxItems: 1
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - power-domains
+
+# see nvidia,tegra20-vi.yaml for an example
diff --git a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
index 781c1868b0b8..b6b402f16161 100644
--- a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
+++ b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml
@@ -2,8 +2,8 @@
# Copyright 2019 Texas Instruments Incorporated
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/ti/ti,am65x-dss.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/ti/ti,am65x-dss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Texas Instruments AM65x Display Subsystem
@@ -88,8 +88,7 @@ properties:
The DSS DPI output port node from video port 2
ti,am65x-oldi-io-ctrl:
- $ref: "/schemas/types.yaml#/definitions/phandle-array"
- maxItems: 1
+ $ref: /schemas/types.yaml#/definitions/phandle
description:
phandle to syscon device node mapping OLDI IO_CTRL registers.
The mapped range should point to OLDI_DAT0_IO_CTRL, map it and
diff --git a/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
index 2986f9acc9f0..fad7cba58d39 100644
--- a/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
+++ b/Documentation/devicetree/bindings/display/ti/ti,j721e-dss.yaml
@@ -2,8 +2,8 @@
# Copyright 2019 Texas Instruments Incorporated
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/ti/ti,j721e-dss.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/ti/ti,j721e-dss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Texas Instruments J721E Display Subsystem
diff --git a/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
index 7ce7bbad5780..96b1439f88e3 100644
--- a/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
+++ b/Documentation/devicetree/bindings/display/ti/ti,k2g-dss.yaml
@@ -2,8 +2,8 @@
# Copyright 2019 Texas Instruments Incorporated
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/display/ti/ti,k2g-dss.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/display/ti/ti,k2g-dss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Texas Instruments K2G Display Subsystem
diff --git a/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt b/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt
index 3e64075ac7ec..3b3d0bbfcfff 100644
--- a/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt
+++ b/Documentation/devicetree/bindings/display/tilcdc/tilcdc.txt
@@ -60,7 +60,7 @@ Example:
blue-and-red-wiring = "crossed";
port {
- lcdc_0: endpoint@0 {
+ lcdc_0: endpoint {
remote-endpoint = <&hdmi_0>;
};
};
@@ -75,7 +75,7 @@ Example:
pinctrl-1 = <&nxp_hdmi_bonelt_off_pins>;
port {
- hdmi_0: endpoint@0 {
+ hdmi_0: endpoint {
remote-endpoint = <&lcdc_0>;
};
};
diff --git a/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml b/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml
index d88bd93f4b80..554f9d5809d4 100644
--- a/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml
+++ b/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml
@@ -117,6 +117,45 @@ properties:
- const: dp-phy0
- const: dp-phy1
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description: |
+ Connections to the programmable logic and the DisplayPort PHYs. Each port
+ shall have a single endpoint.
+
+ properties:
+ port@0:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The live video input from the programmable logic
+
+ port@1:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The live graphics input from the programmable logic
+
+ port@2:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The live audio input from the programmable logic
+
+ port@3:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The blended video output to the programmable logic
+
+ port@4:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The mixed audio output to the programmable logic
+
+ port@5:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: The DisplayPort output
+
+ required:
+ - port@0
+ - port@1
+ - port@2
+ - port@3
+ - port@4
+ - port@5
+
required:
- compatible
- reg
@@ -130,6 +169,7 @@ required:
- dma-names
- phys
- phy-names
+ - ports
additionalProperties: false
@@ -160,10 +200,37 @@ examples:
<&xlnx_dpdma 2>,
<&xlnx_dpdma 3>;
- phys = <&psgtr 1 PHY_TYPE_DP 0 3 27000000>,
- <&psgtr 0 PHY_TYPE_DP 1 3 27000000>;
+ phys = <&psgtr 1 PHY_TYPE_DP 0 3>,
+ <&psgtr 0 PHY_TYPE_DP 1 3>;
phy-names = "dp-phy0", "dp-phy1";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ };
+ port@1 {
+ reg = <1>;
+ };
+ port@2 {
+ reg = <2>;
+ };
+ port@3 {
+ reg = <3>;
+ };
+ port@4 {
+ reg = <4>;
+ };
+ port@5 {
+ reg = <5>;
+ dpsub_dp_out: endpoint {
+ remote-endpoint = <&dp_connector>;
+ };
+ };
+ };
};
...
diff --git a/Documentation/devicetree/bindings/display/xylon,logicvc-display.yaml b/Documentation/devicetree/bindings/display/xylon,logicvc-display.yaml
new file mode 100644
index 000000000000..76b804b7c880
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/xylon,logicvc-display.yaml
@@ -0,0 +1,301 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# Copyright 2019 Bootlin
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/xylon,logicvc-display.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xylon LogiCVC display controller
+
+maintainers:
+ - Paul Kocialkowski <paul.kocialkowski@bootlin.com>
+
+description: |
+ The Xylon LogiCVC is a display controller that supports multiple layers.
+ It is usually implemented as programmable logic and was optimized for use
+ with Xilinx Zynq-7000 SoCs and Xilinx FPGAs.
+
+ Because the controller is intended for use in an FPGA, most of the
+ configuration of the controller takes place at logic configuration bitstream
+ synthesis time. As a result, many of the device-tree bindings are meant to
+ reflect the synthesis configuration and must not be configured differently.
+ Matching synthesis parameters are provided when applicable.
+
+ Layers are declared in the "layers" sub-node and have dedicated configuration.
+ In version 3 of the controller, each layer's framebuffer lives at a fixed
+ offset from the video memory base address. In version 4, framebuffers are
+ configured with a direct memory address instead.
+
+properties:
+ compatible:
+ enum:
+ - xylon,logicvc-3.02.a-display
+ - xylon,logicvc-4.01.a-display
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 4
+
+ clock-names:
+ minItems: 1
+ items:
+ # vclk is required and must be provided as first item.
+ - const: vclk
+ # Other clocks are optional and can be provided in any order.
+ - enum:
+ - vclk2
+ - lvdsclk
+ - lvdsclkn
+ - enum:
+ - vclk2
+ - lvdsclk
+ - lvdsclkn
+ - enum:
+ - vclk2
+ - lvdsclk
+ - lvdsclkn
+
+ interrupts:
+ maxItems: 1
+
+ memory-region:
+ maxItems: 1
+
+ xylon,display-interface:
+ enum:
+ # Parallel RGB interface (C_DISPLAY_INTERFACE == 0)
+ - parallel-rgb
+ # ITU-T BR656 interface (C_DISPLAY_INTERFACE == 1)
+ - bt656
+ # 4-bit LVDS interface (C_DISPLAY_INTERFACE == 2)
+ - lvds-4bits
+ # 3-bit LVDS interface (C_DISPLAY_INTERFACE == 4)
+ - lvds-3bits
+ # DVI interface (C_DISPLAY_INTERFACE == 5)
+ - dvi
+ description: Display output interface (C_DISPLAY_INTERFACE).
+
+ xylon,display-colorspace:
+ enum:
+ # RGB colorspace (C_DISPLAY_COLOR_SPACE == 0)
+ - rgb
+ # YUV 4:2:2 colorspace (C_DISPLAY_COLOR_SPACE == 1)
+ - yuv422
+ # YUV 4:4:4 colorspace (C_DISPLAY_COLOR_SPACE == 2)
+ - yuv444
+ description: Display output colorspace (C_DISPLAY_COLOR_SPACE).
+
+ xylon,display-depth:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Display output depth (C_PIXEL_DATA_WIDTH).
+
+ xylon,row-stride:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Fixed number of pixels in a framebuffer row (C_ROW_STRIDE).
+
+ xylon,dithering:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description: Dithering module is enabled (C_XCOLOR).
+
+ xylon,background-layer:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description: |
+ The last layer is used to display a black background (C_USE_BACKGROUND).
+ The layer must still be registered.
+
+ xylon,layers-configurable:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description: |
+ Configuration of layers' size, position and offset is enabled
+ (C_USE_SIZE_POSITION).
+
+ layers:
+ type: object
+
+ properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ patternProperties:
+ "^layer@[0-9]+$":
+ type: object
+
+ properties:
+ reg:
+ maxItems: 1
+
+ xylon,layer-depth:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Layer depth (C_LAYER_X_DATA_WIDTH).
+
+ xylon,layer-colorspace:
+ enum:
+ # RGB colorspace (C_LAYER_X_TYPE == 0)
+ - rgb
+ # YUV packed colorspace (C_LAYER_X_TYPE == 1)
+ - yuv
+ description: Layer colorspace (C_LAYER_X_TYPE).
+
+ xylon,layer-alpha-mode:
+ enum:
+ # Alpha is configured layer-wide (C_LAYER_X_ALPHA_MODE == 0)
+ - layer
+ # Alpha is configured per-pixel (C_LAYER_X_ALPHA_MODE == 1)
+ - pixel
+ description: Alpha mode for the layer (C_LAYER_X_ALPHA_MODE).
+
+ xylon,layer-base-offset:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ Offset in number of lines (C_LAYER_X_OFFSET) starting from the
+ video RAM base (C_VMEM_BASEADDR), only for version 3.
+
+ xylon,layer-buffer-offset:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ Offset in number of lines (C_BUFFER_*_OFFSET) starting from the
+ layer base offset for the second buffer used in double-buffering.
+
+ xylon,layer-primary:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description: |
+ Layer should be registered as a primary plane (exactly one is
+ required).
+
+ additionalProperties: false
+
+ required:
+ - reg
+ - xylon,layer-depth
+ - xylon,layer-colorspace
+ - xylon,layer-alpha-mode
+
+ required:
+ - "#address-cells"
+ - "#size-cells"
+ - layer@0
+
+ additionalProperties: false
+
+ description: |
+ The description of the display controller layers, containing layer
+ sub-nodes that each describe a registered layer.
+
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+ description: |
+ Video output port, typically connected to a panel or bridge.
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - interrupts
+ - xylon,display-interface
+ - xylon,display-colorspace
+ - xylon,display-depth
+ - xylon,row-stride
+ - layers
+ - port
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ logicvc: logicvc@43c00000 {
+ compatible = "xylon,logicvc-3.02.a", "syscon", "simple-mfd";
+ reg = <0x43c00000 0x6000>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ logicvc_display: display@0 {
+ compatible = "xylon,logicvc-3.02.a-display";
+ reg = <0x0 0x6000>;
+
+ memory-region = <&logicvc_cma>;
+
+ clocks = <&logicvc_vclk 0>, <&logicvc_lvdsclk 0>;
+ clock-names = "vclk", "lvdsclk";
+
+ interrupt-parent = <&intc>;
+ interrupts = <0 34 IRQ_TYPE_LEVEL_HIGH>;
+
+ xylon,display-interface = "lvds-4bits";
+ xylon,display-colorspace = "rgb";
+ xylon,display-depth = <16>;
+ xylon,row-stride = <1024>;
+
+ xylon,layers-configurable;
+
+ layers {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ layer@0 {
+ reg = <0>;
+ xylon,layer-depth = <16>;
+ xylon,layer-colorspace = "rgb";
+ xylon,layer-alpha-mode = "layer";
+ xylon,layer-base-offset = <0>;
+ xylon,layer-buffer-offset = <480>;
+ xylon,layer-primary;
+ };
+
+ layer@1 {
+ reg = <1>;
+ xylon,layer-depth = <16>;
+ xylon,layer-colorspace = "rgb";
+ xylon,layer-alpha-mode = "layer";
+ xylon,layer-base-offset = <2400>;
+ xylon,layer-buffer-offset = <480>;
+ };
+
+ layer@2 {
+ reg = <2>;
+ xylon,layer-depth = <16>;
+ xylon,layer-colorspace = "rgb";
+ xylon,layer-alpha-mode = "layer";
+ xylon,layer-base-offset = <960>;
+ xylon,layer-buffer-offset = <480>;
+ };
+
+ layer@3 {
+ reg = <3>;
+ xylon,layer-depth = <16>;
+ xylon,layer-colorspace = "rgb";
+ xylon,layer-alpha-mode = "layer";
+ xylon,layer-base-offset = <480>;
+ xylon,layer-buffer-offset = <480>;
+ };
+
+ layer@4 {
+ reg = <4>;
+ xylon,layer-depth = <16>;
+ xylon,layer-colorspace = "rgb";
+ xylon,layer-alpha-mode = "layer";
+ xylon,layer-base-offset = <8192>;
+ xylon,layer-buffer-offset = <480>;
+ };
+ };
+
+ port {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ logicvc_output: endpoint@0 {
+ reg = <0>;
+ remote-endpoint = <&panel_input>;
+ };
+ };
+ };
+ };
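For the version 3 example above, each layer's framebuffer address can be worked
out from the synthesis parameters. A sketch of the arithmetic, assuming one line
occupies row-stride pixels times depth/8 bytes (an assumption, not text from the
binding):

    /* xylon,row-stride = <1024> pixels, xylon,layer-depth = <16> -> 2 bytes/pixel */
    offset(layer@1) = 2400 lines * 1024 pixels/line * 2 bytes/pixel
                    = 4915200 bytes from C_VMEM_BASEADDR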
diff --git a/Documentation/devicetree/bindings/dma/allwinner,sun4i-a10-dma.yaml b/Documentation/devicetree/bindings/dma/allwinner,sun4i-a10-dma.yaml
index 83808199657b..02d5bd035409 100644
--- a/Documentation/devicetree/bindings/dma/allwinner,sun4i-a10-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/allwinner,sun4i-a10-dma.yaml
@@ -4,14 +4,14 @@
$id: http://devicetree.org/schemas/dma/allwinner,sun4i-a10-dma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A10 DMA Controller Device Tree Bindings
+title: Allwinner A10 DMA Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
"#dma-cells":
diff --git a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
index b6e1ebfaf366..ec2d7a789ffe 100644
--- a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
@@ -4,14 +4,14 @@
$id: http://devicetree.org/schemas/dma/allwinner,sun50i-a64-dma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A64 DMA Controller Device Tree Bindings
+title: Allwinner A64 DMA Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
"#dma-cells":
@@ -20,9 +20,11 @@ properties:
compatible:
oneOf:
- - const: allwinner,sun50i-a64-dma
- - const: allwinner,sun50i-a100-dma
- - const: allwinner,sun50i-h6-dma
+ - enum:
+ - allwinner,sun20i-d1-dma
+ - allwinner,sun50i-a64-dma
+ - allwinner,sun50i-a100-dma
+ - allwinner,sun50i-h6-dma
- items:
- const: allwinner,sun8i-r40-dma
- const: allwinner,sun50i-a64-dma
@@ -58,13 +60,14 @@ if:
properties:
compatible:
enum:
+ - allwinner,sun20i-d1-dma
- allwinner,sun50i-a100-dma
- allwinner,sun50i-h6-dma
then:
properties:
clocks:
- maxItems: 2
+ minItems: 2
required:
- clock-names
diff --git a/Documentation/devicetree/bindings/dma/allwinner,sun6i-a31-dma.yaml b/Documentation/devicetree/bindings/dma/allwinner,sun6i-a31-dma.yaml
index a6df6f8b54db..5d554bcfab3d 100644
--- a/Documentation/devicetree/bindings/dma/allwinner,sun6i-a31-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/allwinner,sun6i-a31-dma.yaml
@@ -4,14 +4,14 @@
$id: http://devicetree.org/schemas/dma/allwinner,sun6i-a31-dma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A31 DMA Controller Device Tree Bindings
+title: Allwinner A31 DMA Controller
maintainers:
- Chen-Yu Tsai <wens@csie.org>
- Maxime Ripard <mripard@kernel.org>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
"#dma-cells":
diff --git a/Documentation/devicetree/bindings/dma/altr,msgdma.yaml b/Documentation/devicetree/bindings/dma/altr,msgdma.yaml
index b193ee2db4a7..391bf5838602 100644
--- a/Documentation/devicetree/bindings/dma/altr,msgdma.yaml
+++ b/Documentation/devicetree/bindings/dma/altr,msgdma.yaml
@@ -7,14 +7,14 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Altera mSGDMA IP core
maintainers:
- - Olivier Dautricourt <olivier.dautricourt@orolia.com>
+ - Olivier Dautricourt <olivierdautricourt@gmail.com>
description: |
Altera / Intel modular Scatter-Gather Direct Memory Access (mSGDMA)
intellectual property (IP)
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/apple,admac.yaml b/Documentation/devicetree/bindings/dma/apple,admac.yaml
new file mode 100644
index 000000000000..ab193bc8bdbb
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/apple,admac.yaml
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/apple,admac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Apple Audio DMA Controller (ADMAC)
+
+description: |
+ Apple's Audio DMA Controller (ADMAC) is used to fetch and store audio samples
+ on SoCs from the "Apple Silicon" family.
+
+ The controller has been seen with up to 24 channels. Even-numbered channels
+ are TX-only, odd-numbered are RX-only. Individual channels are coupled to
+ fixed device endpoints.
+
+maintainers:
+ - Martin Povišer <povik+lin@cutebit.org>
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - apple,t6000-admac
+ - apple,t8103-admac
+ - apple,t8112-admac
+ - const: apple,admac
+
+ reg:
+ maxItems: 1
+
+ '#dma-cells':
+ const: 1
+ description:
+ Clients specify a single cell with channel number.
+
+ dma-channels:
+ maximum: 24
+
+ interrupts:
+ minItems: 4
+ maxItems: 4
+ description:
+ Interrupts that correspond to the 4 IRQ outputs of the controller. Usually
+ only one of the controller outputs will be connected as an usable interrupt
+ source. The remaining interrupts will be left without a valid value, e.g.
+ in an interrupts-extended list the disconnected positions will contain
+ an empty phandle reference <0>.
+
+ iommus:
+ minItems: 1
+ maxItems: 2
+
+ power-domains:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - '#dma-cells'
+ - dma-channels
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/apple-aic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ aic: interrupt-controller {
+ interrupt-controller;
+ #interrupt-cells = <3>;
+ };
+
+ admac: dma-controller@238200000 {
+ compatible = "apple,t8103-admac", "apple,admac";
+ reg = <0x38200000 0x34000>;
+ dma-channels = <24>;
+ interrupts-extended = <0>,
+ <&aic AIC_IRQ 626 IRQ_TYPE_LEVEL_HIGH>,
+ <0>,
+ <0>;
+ #dma-cells = <1>;
+ };
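Since "#dma-cells" is 1, a client references the controller with just a channel
number; even channels transmit, odd channels receive. A minimal consumer sketch
(the node name and channel numbers are illustrative, not taken from a shipping
device tree):

    audio-controller@238400000 {
        dmas = <&admac 0>, <&admac 1>;  /* even channel: TX-only, odd channel: RX-only */
        dma-names = "tx", "rx";
    };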
diff --git a/Documentation/devicetree/bindings/dma/arm,pl330.yaml b/Documentation/devicetree/bindings/dma/arm,pl330.yaml
new file mode 100644
index 000000000000..4a3dd6f5309b
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/arm,pl330.yaml
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/arm,pl330.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM PrimeCell PL330 DMA Controller
+
+maintainers:
+ - Vinod Koul <vkoul@kernel.org>
+
+description:
+ The ARM PrimeCell PL330 DMA controller can move blocks of memory contents
+ between memory and peripherals or memory to memory.
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,pl330
+ required:
+ - compatible
+
+allOf:
+ - $ref: dma-controller.yaml#
+ - $ref: /schemas/arm/primecell.yaml#
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - arm,pl330
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ minItems: 1
+ maxItems: 32
+ description: A single combined interrupt or an interrupt per event
+
+ '#dma-cells':
+ const: 1
+ description: Contains the DMA request number for the consumer
+
+ arm,pl330-broken-no-flushp:
+ type: boolean
+ description: quirk to avoid executing the DMAFLUSHP instruction
+
+ arm,pl330-periph-burst:
+ type: boolean
+ description: quirk to use burst transfers only
+
+ dma-coherent: true
+
+ iommus:
+ minItems: 1
+ maxItems: 9
+ description: Up to 1 IOMMU entry per DMA channel for writes and 1
+ IOMMU entry for reads.
+
+ power-domains:
+ maxItems: 1
+
+ resets:
+ minItems: 1
+ maxItems: 2
+
+ reset-names:
+ minItems: 1
+ items:
+ - const: dma
+ - const: dma-ocp
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ dma-controller@12680000 {
+ compatible = "arm,pl330", "arm,primecell";
+ reg = <0x12680000 0x1000>;
+ interrupts = <99>;
+ #dma-cells = <1>;
+ };
+...
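The client format documented in the removed arm-pl330.txt (deleted further
below) carries over unchanged: one request-number cell per entry, matched 1:1
with dma-names. A trimmed sketch with a hypothetical client node:

    serial@12c20000 {
        dmas = <&pdma0 12>, <&pdma1 11>;  /* request IDs wired to this client */
        dma-names = "tx", "rx";
    };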
diff --git a/Documentation/devicetree/bindings/dma/arm-pl08x.yaml b/Documentation/devicetree/bindings/dma/arm-pl08x.yaml
index 3bd9eea543ca..ab25ae63d2c3 100644
--- a/Documentation/devicetree/bindings/dma/arm-pl08x.yaml
+++ b/Documentation/devicetree/bindings/dma/arm-pl08x.yaml
@@ -10,7 +10,8 @@ maintainers:
- Vinod Koul <vkoul@kernel.org>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: /schemas/arm/primecell.yaml#
+ - $ref: dma-controller.yaml#
# We need a select here so we don't match all nodes with 'arm,primecell'
select:
@@ -89,6 +90,9 @@ properties:
- 64
description: bus width used for memcpy in bits. FTDMAC020 also accept 64 bits
+ resets:
+ maxItems: 1
+
required:
- reg
- interrupts
diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt b/Documentation/devicetree/bindings/dma/arm-pl330.txt
deleted file mode 100644
index 315e90122afa..000000000000
--- a/Documentation/devicetree/bindings/dma/arm-pl330.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-* ARM PrimeCell PL330 DMA Controller
-
-The ARM PrimeCell PL330 DMA controller can move blocks of memory contents
-between memory and peripherals or memory to memory.
-
-Required properties:
- - compatible: should include both "arm,pl330" and "arm,primecell".
- - reg: physical base address of the controller and length of memory mapped
- region.
- - interrupts: interrupt number to the cpu.
-
-Optional properties:
- - dma-coherent : Present if dma operations are coherent
- - #dma-cells: must be <1>. used to represent the number of integer
- cells in the dmas property of client device.
- - dma-channels: contains the total number of DMA channels supported by the DMAC
- - dma-requests: contains the total number of DMA requests supported by the DMAC
- - arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP
- - arm,pl330-periph-burst: quirk for performing burst transfer only
- - resets: contains an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
- - reset-names: must contain at least "dma", and optional is "dma-ocp".
-
-Example:
-
- pdma0: pdma@12680000 {
- compatible = "arm,pl330", "arm,primecell";
- reg = <0x12680000 0x1000>;
- interrupts = <99>;
- #dma-cells = <1>;
- #dma-channels = <8>;
- #dma-requests = <32>;
- };
-
-Client drivers (device nodes requiring dma transfers from dev-to-mem or
-mem-to-dev) should specify the DMA channel numbers and dma channel names
-as shown below.
-
- [property name] = <[phandle of the dma controller] [dma request id]>;
- [property name] = <[dma channel name]>
-
- where 'dma request id' is the dma request number which is connected
- to the client controller. The 'property name' 'dmas' and 'dma-names'
- as required by the generic dma device tree binding helpers. The dma
- names correspond 1:1 with the dma request ids in the dmas property.
-
- Example: dmas = <&pdma0 12
- &pdma1 11>;
- dma-names = "tx", "rx";
diff --git a/Documentation/devicetree/bindings/dma/dma-common.yaml b/Documentation/devicetree/bindings/dma/dma-common.yaml
index ad06d36af208..ea700f8ee6c6 100644
--- a/Documentation/devicetree/bindings/dma/dma-common.yaml
+++ b/Documentation/devicetree/bindings/dma/dma-common.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/dma/dma-common.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: DMA Engine Generic Binding
+title: DMA Engine Common Properties
maintainers:
- Vinod Koul <vkoul@kernel.org>
diff --git a/Documentation/devicetree/bindings/dma/dma-controller.yaml b/Documentation/devicetree/bindings/dma/dma-controller.yaml
index 0043b91da95e..04d150d4d15d 100644
--- a/Documentation/devicetree/bindings/dma/dma-controller.yaml
+++ b/Documentation/devicetree/bindings/dma/dma-controller.yaml
@@ -4,13 +4,13 @@
$id: http://devicetree.org/schemas/dma/dma-controller.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: DMA Controller Generic Binding
+title: DMA Controller Common Properties
maintainers:
- Vinod Koul <vkoul@kernel.org>
allOf:
- - $ref: "dma-common.yaml#"
+ - $ref: dma-common.yaml#
# Everything else is described in the common file
properties:
@@ -24,10 +24,10 @@ examples:
dma: dma-controller@48000000 {
compatible = "ti,omap-sdma";
reg = <0x48000000 0x1000>;
- interrupts = <0 12 0x4
- 0 13 0x4
- 0 14 0x4
- 0 15 0x4>;
+ interrupts = <0 12 0x4>,
+ <0 13 0x4>,
+ <0 14 0x4>,
+ <0 15 0x4>;
#dma-cells = <1>;
dma-channels = <32>;
dma-requests = <127>;
diff --git a/Documentation/devicetree/bindings/dma/dma-router.yaml b/Documentation/devicetree/bindings/dma/dma-router.yaml
index e72748496fd9..346fe0fa4460 100644
--- a/Documentation/devicetree/bindings/dma/dma-router.yaml
+++ b/Documentation/devicetree/bindings/dma/dma-router.yaml
@@ -4,13 +4,13 @@
$id: http://devicetree.org/schemas/dma/dma-router.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: DMA Router Generic Binding
+title: DMA Router Common Properties
maintainers:
- Vinod Koul <vkoul@kernel.org>
allOf:
- - $ref: "dma-common.yaml#"
+ - $ref: dma-common.yaml#
description:
DMA routers are transparent IP blocks used to route DMA request
@@ -24,6 +24,8 @@ properties:
dma-masters:
$ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ maxItems: 1
description:
Array of phandles to the DMA controllers the router can direct
the signal to.
diff --git a/Documentation/devicetree/bindings/dma/fsl,edma.yaml b/Documentation/devicetree/bindings/dma/fsl,edma.yaml
new file mode 100644
index 000000000000..5fd8fc604261
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/fsl,edma.yaml
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/fsl,edma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale enhanced Direct Memory Access (eDMA) Controller
+
+description: |
+ The eDMA channels are multiplexed through programmable memory-mapped
+ registers. Channels are split into two groups, DMAMUX0 and DMAMUX1;
+ a specific DMA request source can only be multiplexed by the channels
+ of one group, either DMAMUX0 or DMAMUX1, but not both.
+
+maintainers:
+ - Peng Fan <peng.fan@nxp.com>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - fsl,vf610-edma
+ - fsl,imx7ulp-edma
+ - items:
+ - const: fsl,ls1028a-edma
+ - const: fsl,vf610-edma
+
+ reg:
+ minItems: 2
+ maxItems: 3
+
+ interrupts:
+ minItems: 2
+ maxItems: 17
+
+ interrupt-names:
+ minItems: 2
+ maxItems: 17
+
+ "#dma-cells":
+ const: 2
+
+ dma-channels:
+ const: 32
+
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ maxItems: 2
+
+ big-endian:
+ description: |
+ If present, the registers and the hardware scatter/gather descriptors
+ of the eDMA are implemented in big endian mode; otherwise, little
+ endian mode is assumed.
+ type: boolean
+
+required:
+ - "#dma-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - dma-channels
+
+allOf:
+ - $ref: dma-controller.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,vf610-edma
+ then:
+ properties:
+ clock-names:
+ items:
+ - const: dmamux0
+ - const: dmamux1
+ interrupts:
+ maxItems: 2
+ interrupt-names:
+ items:
+ - const: edma-tx
+ - const: edma-err
+ reg:
+ maxItems: 3
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: fsl,imx7ulp-edma
+ then:
+ properties:
+ clock-names:
+ items:
+ - const: dma
+ - const: dmamux0
+ interrupts:
+ maxItems: 17
+ reg:
+ maxItems: 2
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/vf610-clock.h>
+
+ edma0: dma-controller@40018000 {
+ #dma-cells = <2>;
+ compatible = "fsl,vf610-edma";
+ reg = <0x40018000 0x2000>,
+ <0x40024000 0x1000>,
+ <0x40025000 0x1000>;
+ interrupts = <0 8 IRQ_TYPE_LEVEL_HIGH>,
+ <0 9 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "edma-tx", "edma-err";
+ dma-channels = <32>;
+ clock-names = "dmamux0", "dmamux1";
+ clocks = <&clks VF610_CLK_DMAMUX0>, <&clks VF610_CLK_DMAMUX1>;
+ };
+
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/imx7ulp-clock.h>
+
+ edma1: dma-controller@40080000 {
+ #dma-cells = <2>;
+ compatible = "fsl,imx7ulp-edma";
+ reg = <0x40080000 0x2000>,
+ <0x40210000 0x1000>;
+ dma-channels = <32>;
+ interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>,
+ /* last is eDMA2-ERR interrupt */
+ <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
+ clock-names = "dma", "dmamux0";
+ clocks = <&pcc2 IMX7ULP_CLK_DMA1>, <&pcc2 IMX7ULP_CLK_DMA_MUX1>;
+ };
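Clients use the two-cell specifier described above: the first cell selects the
DMAMUX group, the second the request source (slot) ID. A trimmed version of the
SAI client example from the removed fsl-edma.txt (deleted further below):

    sai2: sai@40031000 {
        compatible = "fsl,vf610-sai";
        dmas = <&edma0 0 21>, <&edma0 0 20>;  /* DMAMUX0, slots 21 (tx) and 20 (rx) */
        dma-names = "tx", "rx";
    };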
diff --git a/Documentation/devicetree/bindings/dma/fsl,imx-sdma.yaml b/Documentation/devicetree/bindings/dma/fsl,imx-sdma.yaml
new file mode 100644
index 000000000000..b95dd8db5a30
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/fsl,imx-sdma.yaml
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/fsl,imx-sdma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale Smart Direct Memory Access (SDMA) Controller for i.MX
+
+maintainers:
+ - Joy Zou <joy.zou@nxp.com>
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - fsl,imx50-sdma
+ - fsl,imx51-sdma
+ - fsl,imx53-sdma
+ - fsl,imx6q-sdma
+ - fsl,imx7d-sdma
+ - const: fsl,imx35-sdma
+ - items:
+ - enum:
+ - fsl,imx6sx-sdma
+ - fsl,imx6sl-sdma
+ - const: fsl,imx6q-sdma
+ - items:
+ - const: fsl,imx6ul-sdma
+ - const: fsl,imx6q-sdma
+ - const: fsl,imx35-sdma
+ - items:
+ - const: fsl,imx6sll-sdma
+ - const: fsl,imx6ul-sdma
+ - items:
+ - const: fsl,imx8mq-sdma
+ - const: fsl,imx7d-sdma
+ - items:
+ - enum:
+ - fsl,imx8mp-sdma
+ - fsl,imx8mn-sdma
+ - fsl,imx8mm-sdma
+ - const: fsl,imx8mq-sdma
+ - items:
+ - enum:
+ - fsl,imx25-sdma
+ - fsl,imx31-sdma
+ - fsl,imx35-sdma
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ fsl,sdma-ram-script-name:
+ $ref: /schemas/types.yaml#/definitions/string
+ description: Should contain the full path of SDMA RAM scripts firmware.
+
+ "#dma-cells":
+ const: 3
+ description: |
+ The first cell: request/event ID
+
+ The second cell: peripheral type ID
+ enum:
+ - MCU domain SSI: 0
+ - Shared SSI: 1
+ - MMC: 2
+ - SDHC: 3
+ - MCU domain UART: 4
+ - Shared UART: 5
+ - FIRI: 6
+ - MCU domain CSPI: 7
+ - Shared CSPI: 8
+ - SIM: 9
+ - ATA: 10
+ - CCM: 11
+ - External peripheral: 12
+ - Memory Stick Host Controller: 13
+ - Shared Memory Stick Host Controller: 14
+ - DSP: 15
+ - Memory: 16
+ - FIFO type Memory: 17
+ - SPDIF: 18
+ - IPU Memory: 19
+ - ASRC: 20
+ - ESAI: 21
+ - SSI Dual FIFO: 22
+ description: needs firmware version 2 or higher
+ - Shared ASRC: 23
+ - SAI: 24
+ - HDMI Audio: 25
+
+ The third cell: transfer priority ID
+ enum:
+ - High: 0
+ - Medium: 1
+ - Low: 2
+
+ gpr:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: The phandle to the General Purpose Register (GPR) node
+
+ fsl,sdma-event-remap:
+ $ref: /schemas/types.yaml#/definitions/uint32-matrix
+ maxItems: 2
+ items:
+ items:
+ - description: GPR register offset
+ - description: GPR register shift
+ - description: GPR register value
+ description: |
+ Register bits of sdma event remap, the format is <reg shift val>.
+ The order is <RX>, <TX>.
+
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: ipg
+ - const: ahb
+
+ iram:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description: The phandle to the On-chip RAM (OCRAM) node.
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - fsl,sdma-ram-script-name
+
+additionalProperties: false
+
+examples:
+ - |
+ sdma: dma-controller@83fb0000 {
+ compatible = "fsl,imx51-sdma", "fsl,imx35-sdma";
+ reg = <0x83fb0000 0x4000>;
+ interrupts = <6>;
+ #dma-cells = <3>;
+ fsl,sdma-ram-script-name = "sdma-imx51.bin";
+ };
+
+...
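A consumer fills in the three cells documented above: request/event ID,
peripheral type, and priority. The SSI client from the removed fsl-imx-sdma.txt
(deleted further below), trimmed to the DMA properties; events 24/25 with type 1
(Shared SSI) and priority 0 (High):

    ssi2: ssi@70014000 {
        compatible = "fsl,imx51-ssi", "fsl,imx21-ssi";
        dmas = <&sdma 24 1 0>, <&sdma 25 1 0>;
        dma-names = "rx", "tx";
    };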
diff --git a/Documentation/devicetree/bindings/dma/fsl,mxs-dma.yaml b/Documentation/devicetree/bindings/dma/fsl,mxs-dma.yaml
new file mode 100644
index 000000000000..add9c77e8b52
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/fsl,mxs-dma.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/fsl,mxs-dma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Freescale Direct Memory Access (DMA) Controller from i.MX23/i.MX28
+
+maintainers:
+ - Marek Vasut <marex@denx.de>
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - fsl,imx6q-dma-apbh
+ - fsl,imx6sx-dma-apbh
+ - fsl,imx7d-dma-apbh
+ - const: fsl,imx28-dma-apbh
+ - enum:
+ - fsl,imx23-dma-apbh
+ - fsl,imx23-dma-apbx
+ - fsl,imx28-dma-apbh
+ - fsl,imx28-dma-apbx
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ interrupts:
+ minItems: 4
+ maxItems: 16
+
+ "#dma-cells":
+ const: 1
+
+ dma-channels:
+ enum: [4, 8, 16]
+
+required:
+ - compatible
+ - reg
+ - "#dma-cells"
+ - dma-channels
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ interrupt-parent = <&irqc>;
+
+ dma-controller@80004000 {
+ compatible = "fsl,imx28-dma-apbh";
+ reg = <0x80004000 0x2000>;
+ interrupts = <82 83 84 85
+ 88 88 88 88
+ 88 88 88 88
+ 87 86 0 0>;
+ #dma-cells = <1>;
+ dma-channels = <16>;
+ };
+
+ dma-controller@80024000 {
+ compatible = "fsl,imx28-dma-apbx";
+ reg = <0x80024000 0x2000>;
+ interrupts = <78 79 66 0
+ 80 81 68 69
+ 70 71 72 73
+ 74 75 76 77>;
+ #dma-cells = <1>;
+ dma-channels = <16>;
+ };
+
+...
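Clients reference a channel with the single cell defined by "#dma-cells". A
trimmed version of the AUART client from the removed fsl-mxs-dma.txt (deleted
further below):

    auart0: serial@8006a000 {
        compatible = "fsl,imx28-auart", "fsl,imx23-auart";
        dmas = <&dma_apbx 8>, <&dma_apbx 9>;  /* channels 8 (rx) and 9 (tx) */
        dma-names = "rx", "tx";
    };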
diff --git a/Documentation/devicetree/bindings/dma/fsl-edma.txt b/Documentation/devicetree/bindings/dma/fsl-edma.txt
deleted file mode 100644
index ee1754739b4b..000000000000
--- a/Documentation/devicetree/bindings/dma/fsl-edma.txt
+++ /dev/null
@@ -1,111 +0,0 @@
-* Freescale enhanced Direct Memory Access(eDMA) Controller
-
- The eDMA channels have multiplex capability by programmble memory-mapped
-registers. channels are split into two groups, called DMAMUX0 and DMAMUX1,
-specific DMA request source can only be multiplexed by any channel of certain
-group, DMAMUX0 or DMAMUX1, but not both.
-
-* eDMA Controller
-Required properties:
-- compatible :
- - "fsl,vf610-edma" for eDMA used similar to that on Vybrid vf610 SoC
- - "fsl,imx7ulp-edma" for eDMA2 used similar to that on i.mx7ulp
- - "fsl,ls1028a-edma" followed by "fsl,vf610-edma" for eDMA used on the
- LS1028A SoC.
-- reg : Specifies base physical address(s) and size of the eDMA registers.
- The 1st region is eDMA control register's address and size.
- The 2nd and the 3rd regions are programmable channel multiplexing
- control register's address and size.
-- interrupts : A list of interrupt-specifiers, one for each entry in
- interrupt-names on vf610 similar SoC. But for i.mx7ulp per channel
- per transmission interrupt, total 16 channel interrupt and 1
- error interrupt(located in the last), no interrupt-names list on
- i.mx7ulp for clean on dts.
-- #dma-cells : Must be <2>.
- The 1st cell specifies the DMAMUX(0 for DMAMUX0 and 1 for DMAMUX1).
- Specific request source can only be multiplexed by specific channels
- group called DMAMUX.
- The 2nd cell specifies the request source(slot) ID.
- See the SoC's reference manual for all the supported request sources.
-- dma-channels : Number of channels supported by the controller
-- clock-names : A list of channel group clock names. Should contain:
- "dmamux0" - clock name of mux0 group
- "dmamux1" - clock name of mux1 group
- Note: No dmamux0 on i.mx7ulp, but another 'dma' clk added on i.mx7ulp.
-- clocks : A list of phandle and clock-specifier pairs, one for each entry in
- clock-names.
-
-Optional properties:
-- big-endian: If present registers and hardware scatter/gather descriptors
- of the eDMA are implemented in big endian mode, otherwise in little
- mode.
-- interrupt-names : Should contain the below on vf610 similar SoC but not used
- on i.mx7ulp similar SoC:
- "edma-tx" - the transmission interrupt
- "edma-err" - the error interrupt
-
-
-Examples:
-
-edma0: dma-controller@40018000 {
- #dma-cells = <2>;
- compatible = "fsl,vf610-edma";
- reg = <0x40018000 0x2000>,
- <0x40024000 0x1000>,
- <0x40025000 0x1000>;
- interrupts = <0 8 IRQ_TYPE_LEVEL_HIGH>,
- <0 9 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "edma-tx", "edma-err";
- dma-channels = <32>;
- clock-names = "dmamux0", "dmamux1";
- clocks = <&clks VF610_CLK_DMAMUX0>,
- <&clks VF610_CLK_DMAMUX1>;
-}; /* vf610 */
-
-edma1: dma-controller@40080000 {
- #dma-cells = <2>;
- compatible = "fsl,imx7ulp-edma";
- reg = <0x40080000 0x2000>,
- <0x40210000 0x1000>;
- dma-channels = <32>;
- interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>,
- /* last is eDMA2-ERR interrupt */
- <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
- clock-names = "dma", "dmamux0";
- clocks = <&pcc2 IMX7ULP_CLK_DMA1>,
- <&pcc2 IMX7ULP_CLK_DMA_MUX1>;
-}; /* i.mx7ulp */
-
-* DMA clients
-DMA client drivers that uses the DMA function must use the format described
-in the dma.txt file, using a two-cell specifier for each channel: the 1st
-specifies the channel group(DMAMUX) in which this request can be multiplexed,
-and the 2nd specifies the request source.
-
-Examples:
-
-sai2: sai@40031000 {
- compatible = "fsl,vf610-sai";
- reg = <0x40031000 0x1000>;
- interrupts = <0 86 IRQ_TYPE_LEVEL_HIGH>;
- clock-names = "sai";
- clocks = <&clks VF610_CLK_SAI2>;
- dma-names = "tx", "rx";
- dmas = <&edma0 0 21>,
- <&edma0 0 20>;
-};
diff --git a/Documentation/devicetree/bindings/dma/fsl-imx-dma.txt b/Documentation/devicetree/bindings/dma/fsl-imx-dma.txt
index 7bd8847d6394..1c9929d53727 100644
--- a/Documentation/devicetree/bindings/dma/fsl-imx-dma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-imx-dma.txt
@@ -13,8 +13,10 @@ Required properties:
- #dma-cells : Has to be 1. imx-dma does not support anything else.
Optional properties:
-- #dma-channels : Number of DMA channels supported. Should be 16.
-- #dma-requests : Number of DMA requests supported.
+- dma-channels : Number of DMA channels supported. Should be 16.
+- #dma-channels : deprecated
+- dma-requests : Number of DMA requests supported.
+- #dma-requests : deprecated
Example:
@@ -23,7 +25,7 @@ Example:
reg = <0x10001000 0x1000>;
interrupts = <32 33>;
#dma-cells = <1>;
- #dma-channels = <16>;
+ dma-channels = <16>;
};
diff --git a/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt b/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
deleted file mode 100644
index 12c316ff4834..000000000000
--- a/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
+++ /dev/null
@@ -1,118 +0,0 @@
-* Freescale Smart Direct Memory Access (SDMA) Controller for i.MX
-
-Required properties:
-- compatible : Should be one of
- "fsl,imx25-sdma"
- "fsl,imx31-sdma", "fsl,imx31-to1-sdma", "fsl,imx31-to2-sdma"
- "fsl,imx35-sdma", "fsl,imx35-to1-sdma", "fsl,imx35-to2-sdma"
- "fsl,imx51-sdma"
- "fsl,imx53-sdma"
- "fsl,imx6q-sdma"
- "fsl,imx7d-sdma"
- "fsl,imx6ul-sdma"
- "fsl,imx8mq-sdma"
- "fsl,imx8mm-sdma"
- "fsl,imx8mn-sdma"
- "fsl,imx8mp-sdma"
- The -to variants should be preferred since they allow to determine the
- correct ROM script addresses needed for the driver to work without additional
- firmware.
-- reg : Should contain SDMA registers location and length
-- interrupts : Should contain SDMA interrupt
-- #dma-cells : Must be <3>.
- The first cell specifies the DMA request/event ID. See details below
- about the second and third cell.
-- fsl,sdma-ram-script-name : Should contain the full path of SDMA RAM
- scripts firmware
-
-The second cell of dma phandle specifies the peripheral type of DMA transfer.
-The full ID of peripheral types can be found below.
-
- ID transfer type
- ---------------------
- 0 MCU domain SSI
- 1 Shared SSI
- 2 MMC
- 3 SDHC
- 4 MCU domain UART
- 5 Shared UART
- 6 FIRI
- 7 MCU domain CSPI
- 8 Shared CSPI
- 9 SIM
- 10 ATA
- 11 CCM
- 12 External peripheral
- 13 Memory Stick Host Controller
- 14 Shared Memory Stick Host Controller
- 15 DSP
- 16 Memory
- 17 FIFO type Memory
- 18 SPDIF
- 19 IPU Memory
- 20 ASRC
- 21 ESAI
- 22 SSI Dual FIFO (needs firmware ver >= 2)
- 23 Shared ASRC
- 24 SAI
-
-The third cell specifies the transfer priority as below.
-
- ID transfer priority
- -------------------------
- 0 High
- 1 Medium
- 2 Low
-
-Optional properties:
-
-- gpr : The phandle to the General Purpose Register (GPR) node.
-- fsl,sdma-event-remap : Register bits of sdma event remap, the format is
- <reg shift val>.
- reg is the GPR register offset.
- shift is the bit position inside the GPR register.
- val is the value of the bit (0 or 1).
-
-Examples:
-
-sdma@83fb0000 {
- compatible = "fsl,imx51-sdma", "fsl,imx35-sdma";
- reg = <0x83fb0000 0x4000>;
- interrupts = <6>;
- #dma-cells = <3>;
- fsl,sdma-ram-script-name = "sdma-imx51.bin";
-};
-
-DMA clients connected to the i.MX SDMA controller must use the format
-described in the dma.txt file.
-
-Examples:
-
-ssi2: ssi@70014000 {
- compatible = "fsl,imx51-ssi", "fsl,imx21-ssi";
- reg = <0x70014000 0x4000>;
- interrupts = <30>;
- clocks = <&clks 49>;
- dmas = <&sdma 24 1 0>,
- <&sdma 25 1 0>;
- dma-names = "rx", "tx";
- fsl,fifo-depth = <15>;
-};
-
-Using the fsl,sdma-event-remap property:
-
-If we want to use SDMA on the SAI1 port on a MX6SX:
-
-&sdma {
- gpr = <&gpr>;
- /* SDMA events remap for SAI1_RX and SAI1_TX */
- fsl,sdma-event-remap = <0 15 1>, <0 16 1>;
-};
-
-The fsl,sdma-event-remap property in this case has two values:
-- <0 15 1> means that the offset is 0, so GPR0 is the register of the
-SDMA remap. Bit 15 of GPR0 selects between UART4_RX and SAI1_RX.
-Setting bit 15 to 1 selects SAI1_RX.
-- <0 16 1> means that the offset is 0, so GPR0 is the register of the
-SDMA remap. Bit 16 of GPR0 selects between UART4_TX and SAI1_TX.
-Setting bit 16 to 1 selects SAI1_TX.
diff --git a/Documentation/devicetree/bindings/dma/fsl-mxs-dma.txt b/Documentation/devicetree/bindings/dma/fsl-mxs-dma.txt
deleted file mode 100644
index e30e184f50c7..000000000000
--- a/Documentation/devicetree/bindings/dma/fsl-mxs-dma.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-* Freescale MXS DMA
-
-Required properties:
-- compatible : Should be "fsl,<chip>-dma-apbh" or "fsl,<chip>-dma-apbx"
-- reg : Should contain registers location and length
-- interrupts : Should contain the interrupt numbers of DMA channels.
- If a channel is empty/reserved, 0 should be filled in place.
-- #dma-cells : Must be <1>. The number cell specifies the channel ID.
-- dma-channels : Number of channels supported by the DMA controller
-
-Optional properties:
-- interrupt-names : Name of DMA channel interrupts
-
-Supported chips:
-imx23, imx28.
-
-Examples:
-
-dma_apbh: dma-apbh@80004000 {
- compatible = "fsl,imx28-dma-apbh";
- reg = <0x80004000 0x2000>;
- interrupts = <82 83 84 85
- 88 88 88 88
- 88 88 88 88
- 87 86 0 0>;
- interrupt-names = "ssp0", "ssp1", "ssp2", "ssp3",
- "gpmi0", "gmpi1", "gpmi2", "gmpi3",
- "gpmi4", "gmpi5", "gpmi6", "gmpi7",
- "hsadc", "lcdif", "empty", "empty";
- #dma-cells = <1>;
- dma-channels = <16>;
-};
-
-dma_apbx: dma-apbx@80024000 {
- compatible = "fsl,imx28-dma-apbx";
- reg = <0x80024000 0x2000>;
- interrupts = <78 79 66 0
- 80 81 68 69
- 70 71 72 73
- 74 75 76 77>;
- interrupt-names = "auart4-rx", "auart4-tx", "spdif-tx", "empty",
- "saif0", "saif1", "i2c0", "i2c1",
- "auart0-rx", "auart0-tx", "auart1-rx", "auart1-tx",
- "auart2-rx", "auart2-tx", "auart3-rx", "auart3-tx";
- #dma-cells = <1>;
- dma-channels = <16>;
-};
-
-DMA clients connected to the MXS DMA controller must use the format
-described in the dma.txt file.
-
-Examples:
-
-auart0: serial@8006a000 {
- compatible = "fsl,imx28-auart", "fsl,imx23-auart";
- reg = <0x8006a000 0x2000>;
- interrupts = <112>;
- dmas = <&dma_apbx 8>, <&dma_apbx 9>;
- dma-names = "rx", "tx";
-};
diff --git a/Documentation/devicetree/bindings/dma/ingenic,dma.yaml b/Documentation/devicetree/bindings/dma/ingenic,dma.yaml
index ac4d59494fc8..37400496e086 100644
--- a/Documentation/devicetree/bindings/dma/ingenic,dma.yaml
+++ b/Documentation/devicetree/bindings/dma/ingenic,dma.yaml
@@ -4,25 +4,34 @@
$id: http://devicetree.org/schemas/dma/ingenic,dma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Ingenic SoCs DMA Controller DT bindings
+title: Ingenic SoCs DMA Controller
maintainers:
- Paul Cercueil <paul@crapouillou.net>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
- enum:
- - ingenic,jz4740-dma
- - ingenic,jz4725b-dma
- - ingenic,jz4760-dma
- - ingenic,jz4760b-dma
- - ingenic,jz4770-dma
- - ingenic,jz4780-dma
- - ingenic,x1000-dma
- - ingenic,x1830-dma
+ oneOf:
+ - enum:
+ - ingenic,jz4740-dma
+ - ingenic,jz4725b-dma
+ - ingenic,jz4755-dma
+ - ingenic,jz4760-dma
+ - ingenic,jz4760-bdma
+ - ingenic,jz4760-mdma
+ - ingenic,jz4760b-dma
+ - ingenic,jz4760b-bdma
+ - ingenic,jz4760b-mdma
+ - ingenic,jz4770-dma
+ - ingenic,jz4780-dma
+ - ingenic,x1000-dma
+ - ingenic,x1830-dma
+ - items:
+ - const: ingenic,jz4770-bdma
+ - const: ingenic,jz4760b-bdma
reg:
items:
@@ -36,13 +45,19 @@ properties:
maxItems: 1
"#dma-cells":
- const: 2
+ enum: [2, 3]
description: >
DMA clients must use the format described in dma.txt, giving a phandle
- to the DMA controller plus the following 2 integer cells:
+ to the DMA controller plus the following integer cells:
- - Request type: The DMA request type for transfers to/from the
- device on the allocated channel, as defined in the SoC documentation.
+ - Request type: The DMA request type specifies the device endpoint that
+ will be the source or destination of the DMA transfer.
+ If "#dma-cells" is 2, the request type is a single cell, and the
+ direction will be unidirectional (either RX or TX but not both).
+ If "#dma-cells" is 3, the request type has two cells; the first
+ one corresponds to the host to device direction (TX), the second one
+ corresponds to the device to host direction (RX). The DMA channel is
+ then bidirectional.
- Channel: If set to 0xffffffff, any available channel will be allocated
for the client. Otherwise, the exact channel specified will be used.
@@ -68,7 +83,7 @@ unevaluatedProperties: false
examples:
- |
- #include <dt-bindings/clock/jz4780-cgu.h>
+ #include <dt-bindings/clock/ingenic,jz4780-cgu.h>
dma: dma-controller@13420000 {
compatible = "ingenic,jz4780-dma";
reg = <0x13420000 0x400>, <0x13421000 0x40>;
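With the new three-cell form, a bidirectional client passes the TX request
type, the RX request type, and the channel (0xffffffff for any). A hedged
sketch; the node and the request-type values are SoC-specific placeholders,
not taken from the binding:

    mmc@13450000 {
        /* #dma-cells = <3>: TX type, RX type, channel */
        dmas = <&dma 0x1a 0x19 0xffffffff>;
        dma-names = "tx-rx";  /* one bidirectional channel */
    };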
diff --git a/Documentation/devicetree/bindings/dma/intel,ldma.yaml b/Documentation/devicetree/bindings/dma/intel,ldma.yaml
index a5c4be783593..d6bb553a2c6f 100644
--- a/Documentation/devicetree/bindings/dma/intel,ldma.yaml
+++ b/Documentation/devicetree/bindings/dma/intel,ldma.yaml
@@ -11,7 +11,7 @@ maintainers:
- mallikarjunax.reddy@intel.com
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/mediatek,uart-dma.yaml b/Documentation/devicetree/bindings/dma/mediatek,uart-dma.yaml
new file mode 100644
index 000000000000..dab468a88942
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/mediatek,uart-dma.yaml
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/mediatek,uart-dma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek UART APDMA controller
+
+maintainers:
+ - Long Cheng <long.cheng@mediatek.com>
+
+description: |
+ The MediaTek UART APDMA controller provides DMA capabilities
+ for the UART peripheral bus.
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - mediatek,mt2712-uart-dma
+ - mediatek,mt6795-uart-dma
+ - mediatek,mt8365-uart-dma
+ - mediatek,mt8516-uart-dma
+ - const: mediatek,mt6577-uart-dma
+ - enum:
+ - mediatek,mt6577-uart-dma
+
+ reg:
+ minItems: 1
+ maxItems: 16
+
+ interrupts:
+ description: |
+ TX, RX interrupt lines for each UART APDMA channel
+ minItems: 1
+ maxItems: 16
+
+ clocks:
+ description: Must contain one entry for the APDMA main clock
+ maxItems: 1
+
+ clock-names:
+ const: apdma
+
+ "#dma-cells":
+ const: 1
+ description: |
+ The first cell specifies the UART APDMA channel number
+
+ dma-requests:
+ description: |
+ Number of virtual channels of the UART APDMA controller
+ maximum: 16
+
+ mediatek,dma-33bits:
+ type: boolean
+ description: Enable 33-bits UART APDMA support
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+if:
+ not:
+ required:
+ - dma-requests
+then:
+ properties:
+ interrupts:
+ maxItems: 8
+ reg:
+ maxItems: 8
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt2712-clk.h>
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ apdma: dma-controller@11000400 {
+ compatible = "mediatek,mt2712-uart-dma",
+ "mediatek,mt6577-uart-dma";
+ reg = <0 0x11000400 0 0x80>,
+ <0 0x11000480 0 0x80>,
+ <0 0x11000500 0 0x80>,
+ <0 0x11000580 0 0x80>,
+ <0 0x11000600 0 0x80>,
+ <0 0x11000680 0 0x80>,
+ <0 0x11000700 0 0x80>,
+ <0 0x11000780 0 0x80>,
+ <0 0x11000800 0 0x80>,
+ <0 0x11000880 0 0x80>,
+ <0 0x11000900 0 0x80>,
+ <0 0x11000980 0 0x80>;
+ interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 104 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 105 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 106 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 107 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 108 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 109 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 110 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 112 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 113 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 114 IRQ_TYPE_LEVEL_LOW>;
+ dma-requests = <12>;
+ clocks = <&pericfg CLK_PERI_AP_DMA>;
+ clock-names = "apdma";
+ mediatek,dma-33bits;
+ #dma-cells = <1>;
+ };
+ };
+
+...
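Since "#dma-cells" is 1, a UART client selects each virtual channel with a
single cell; a sketch, assuming channels 0/1 are the TX/RX pair of the first
UART (the actual channel assignment is SoC-specific):

    dmas = <&apdma 0>, <&apdma 1>;
    dma-names = "tx", "rx";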
diff --git a/Documentation/devicetree/bindings/dma/mmp-dma.txt b/Documentation/devicetree/bindings/dma/mmp-dma.txt
index 8f7364a7b349..ec18bf0a802a 100644
--- a/Documentation/devicetree/bindings/dma/mmp-dma.txt
+++ b/Documentation/devicetree/bindings/dma/mmp-dma.txt
@@ -10,10 +10,12 @@ Required properties:
or one irq for pdma device
Optional properties:
-- #dma-channels: Number of DMA channels supported by the controller (defaults
+- dma-channels: Number of DMA channels supported by the controller (defaults
to 32 when not specified)
-- #dma-requests: Number of DMA requestor lines supported by the controller
+- #dma-channels: deprecated
+- dma-requests: Number of DMA requestor lines supported by the controller
(defaults to 32 when not specified)
+- #dma-requests: deprecated
"marvell,pdma-1.0"
Used platforms: pxa25x, pxa27x, pxa3xx, pxa93x, pxa168, pxa910, pxa688.
@@ -33,7 +35,7 @@ pdma: dma-controller@d4000000 {
reg = <0xd4000000 0x10000>;
interrupts = <0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15>;
interrupt-parent = <&intcmux32>;
- #dma-channels = <16>;
+ dma-channels = <16>;
};
/*
@@ -45,7 +47,7 @@ pdma: dma-controller@d4000000 {
compatible = "marvell,pdma-1.0";
reg = <0xd4000000 0x10000>;
interrupts = <47>;
- #dma-channels = <16>;
+ dma-channels = <16>;
};
diff --git a/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt b/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt
index 8a9f3559335b..7e14e26676ec 100644
--- a/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt
+++ b/Documentation/devicetree/bindings/dma/moxa,moxart-dma.txt
@@ -34,8 +34,8 @@ Example:
Use specific request line passing from dma
For example, MMC request line is 5
- sdhci: sdhci@98e00000 {
- compatible = "moxa,moxart-sdhci";
+ mmc: mmc@98e00000 {
+ compatible = "moxa,moxart-mmc";
reg = <0x98e00000 0x5C>;
interrupts = <5 0>;
clocks = <&clk_apb>;
diff --git a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
deleted file mode 100644
index fef9c1eeb264..000000000000
--- a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-* Mediatek UART APDMA Controller
-
-Required properties:
-- compatible should contain:
- * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA
- * "mediatek,mt6577-uart-dma" for MT6577 and all of the above
- * "mediatek,mt8516-uart-dma", "mediatek,mt6577" for MT8516 SoC
-
-- reg: The base address of the APDMA register bank.
-
-- interrupts: A single interrupt specifier.
- One interrupt per dma-requests, or 8 if no dma-requests property is present
-
-- dma-requests: The number of DMA channels
-
-- clocks : Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
-- clock-names: The APDMA clock for register accesses
-
-- mediatek,dma-33bits: Present if the DMA requires support
-
-Examples:
-
- apdma: dma-controller@11000400 {
- compatible = "mediatek,mt2712-uart-dma",
- "mediatek,mt6577-uart-dma";
- reg = <0 0x11000400 0 0x80>,
- <0 0x11000480 0 0x80>,
- <0 0x11000500 0 0x80>,
- <0 0x11000580 0 0x80>,
- <0 0x11000600 0 0x80>,
- <0 0x11000680 0 0x80>,
- <0 0x11000700 0 0x80>,
- <0 0x11000780 0 0x80>,
- <0 0x11000800 0 0x80>,
- <0 0x11000880 0 0x80>,
- <0 0x11000900 0 0x80>,
- <0 0x11000980 0 0x80>;
- interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 104 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 105 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 106 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 107 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 108 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 109 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 110 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 112 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 113 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 114 IRQ_TYPE_LEVEL_LOW>;
- dma-requests = <12>;
- clocks = <&pericfg CLK_PERI_AP_DMA>;
- clock-names = "apdma";
- mediatek,dma-33bits;
- #dma-cells = <1>;
- };
diff --git a/Documentation/devicetree/bindings/dma/nvidia,tegra186-gpc-dma.yaml b/Documentation/devicetree/bindings/dma/nvidia,tegra186-gpc-dma.yaml
new file mode 100644
index 000000000000..a790e5687844
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/nvidia,tegra186-gpc-dma.yaml
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/nvidia,tegra186-gpc-dma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra GPC DMA Controller
+
+description: |
+ The Tegra General Purpose Central (GPC) DMA controller is used for faster
+ data transfers between memory to memory, memory to device and device to
+ memory.
+
+maintainers:
+ - Jon Hunter <jonathanh@nvidia.com>
+ - Rajesh Gumasta <rgumasta@nvidia.com>
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ compatible:
+ oneOf:
+ - const: nvidia,tegra186-gpcdma
+ - items:
+ - enum:
+ - nvidia,tegra234-gpcdma
+ - nvidia,tegra194-gpcdma
+ - const: nvidia,tegra186-gpcdma
+
+ "#dma-cells":
+ const: 1
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ description:
+ Should contain all of the per-channel DMA interrupts in
+ ascending order with respect to the DMA channel index.
+ minItems: 1
+ maxItems: 32
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: gpcdma
+
+ iommus:
+ maxItems: 1
+
+ dma-coherent: true
+
+ dma-channel-mask:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - resets
+ - reset-names
+ - "#dma-cells"
+ - iommus
+ - dma-channel-mask
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ dma-controller@2600000 {
+ compatible = "nvidia,tegra186-gpcdma";
+ reg = <0x2600000 0x210000>;
+ resets = <&bpmp TEGRA186_RESET_GPCDMA>;
+ reset-names = "gpcdma";
+ interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 78 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 80 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 85 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 88 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 91 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 95 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
+ #dma-cells = <1>;
+ iommus = <&smmu TEGRA186_SID_GPCDMA_0>;
+ dma-coherent;
+ dma-channel-mask = <0xfffffffe>;
+ };
+...
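On the client side the single cell selects the DMA request; a hedged sketch
for a UART, assuming request 8 (a placeholder; consult the SoC documentation
for real request numbers):

    dmas = <&gpcdma 8>, <&gpcdma 8>;
    dma-names = "rx", "tx";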
diff --git a/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml b/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml
index 5c2e2f156e31..4003dbe94940 100644
--- a/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml
+++ b/Documentation/devicetree/bindings/dma/nvidia,tegra210-adma.yaml
@@ -14,7 +14,7 @@ maintainers:
- Jon Hunter <jonathanh@nvidia.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
@@ -23,7 +23,9 @@ properties:
- nvidia,tegra210-adma
- nvidia,tegra186-adma
- items:
- - const: nvidia,tegra194-adma
+ - enum:
+ - nvidia,tegra234-adma
+ - nvidia,tegra194-adma
- const: nvidia,tegra186-adma
reg:
diff --git a/Documentation/devicetree/bindings/dma/owl-dma.yaml b/Documentation/devicetree/bindings/dma/owl-dma.yaml
index 93b4847554fb..ec8b3dc37ca4 100644
--- a/Documentation/devicetree/bindings/dma/owl-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/owl-dma.yaml
@@ -15,7 +15,7 @@ maintainers:
- Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/qcom,adm.yaml b/Documentation/devicetree/bindings/dma/qcom,adm.yaml
new file mode 100644
index 000000000000..6a9d7bc74aff
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/qcom,adm.yaml
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/qcom,adm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm ADM DMA Controller
+
+maintainers:
+ - Christian Marangi <ansuelsmth@gmail.com>
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+
+description: |
+ QCOM ADM DMA controller provides DMA capabilities for
+ peripheral buses such as NAND and SPI.
+
+properties:
+ compatible:
+ const: qcom,adm
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ "#dma-cells":
+ const: 1
+
+ clocks:
+ items:
+ - description: phandle to the core clock
+ - description: phandle to the iface clock
+
+ clock-names:
+ items:
+ - const: core
+ - const: iface
+
+ resets:
+ items:
+ - description: phandle to the clk reset
+ - description: phandle to the pbus reset
+ - description: phandle to the c0 reset
+ - description: phandle to the c1 reset
+ - description: phandle to the c2 reset
+
+ reset-names:
+ items:
+ - const: clk
+ - const: pbus
+ - const: c0
+ - const: c1
+ - const: c2
+
+ qcom,ee:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: indicates the security domain identifier used in the secure world.
+ minimum: 0
+ maximum: 255
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - "#dma-cells"
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - qcom,ee
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-ipq806x.h>
+ #include <dt-bindings/reset/qcom,gcc-ipq806x.h>
+
+ adm_dma: dma-controller@18300000 {
+ compatible = "qcom,adm";
+ reg = <0x18300000 0x100000>;
+ interrupts = <0 170 0>;
+ #dma-cells = <1>;
+
+ clocks = <&gcc ADM0_CLK>,
+ <&gcc ADM0_PBUS_CLK>;
+ clock-names = "core", "iface";
+
+ resets = <&gcc ADM0_RESET>,
+ <&gcc ADM0_PBUS_RESET>,
+ <&gcc ADM0_C0_RESET>,
+ <&gcc ADM0_C1_RESET>,
+ <&gcc ADM0_C2_RESET>;
+ reset-names = "clk", "pbus", "c0", "c1", "c2";
+ qcom,ee = <0>;
+ };
+
+...
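With "#dma-cells" now 1, clients pass only the channel number; a sketch that
reuses the channel numbers from the SPI example of the removed text binding
(see qcom_adm.txt below):

    dmas = <&adm_dma 6>, <&adm_dma 5>;
    dma-names = "rx", "tx";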
diff --git a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml
new file mode 100644
index 000000000000..f1ddcf672261
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/qcom,bam-dma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies Inc BAM DMA controller
+
+maintainers:
+ - Andy Gross <agross@kernel.org>
+ - Bjorn Andersson <andersson@kernel.org>
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ compatible:
+ enum:
+ # APQ8064, IPQ8064 and MSM8960
+ - qcom,bam-v1.3.0
+ # MSM8974, APQ8074 and APQ8084
+ - qcom,bam-v1.4.0
+ # MSM8916 and SDM845
+ - qcom,bam-v1.7.0
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: bam_clk
+
+ "#dma-cells":
+ const: 1
+
+ interrupts:
+ maxItems: 1
+
+ iommus:
+ minItems: 1
+ maxItems: 4
+
+ num-channels:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Indicates supported number of DMA channels in a remotely controlled bam.
+
+ qcom,controlled-remotely:
+ type: boolean
+ description:
+ Indicates that the bam is controlled by a remote processor, i.e. an
+ execution environment.
+
+ qcom,ee:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 7
+ description:
+ Indicates the active Execution Environment identifier (0-7) used in the
+ secure world.
+
+ qcom,num-ees:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Indicates supported number of Execution Environments in a remotely
+ controlled bam.
+
+ qcom,powered-remotely:
+ type: boolean
+ description:
+ Indicates that the bam is powered up by a remote processor but must be
+ initialized by the local processor.
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - "#dma-cells"
+ - interrupts
+ - qcom,ee
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/qcom,gcc-msm8974.h>
+
+ dma-controller@f9944000 {
+ compatible = "qcom,bam-v1.4.0";
+ reg = <0xf9944000 0x19000>;
+ interrupts = <GIC_SPI 239 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&gcc GCC_BLSP2_AHB_CLK>;
+ clock-names = "bam_clk";
+ #dma-cells = <1>;
+ qcom,ee = <0>;
+ };
+...
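Client usage is unchanged from the removed text binding: one cell per dmas
entry naming the BAM channel. A sketch, assuming the controller above is
labelled "bam":

    dmas = <&bam 0>, <&bam 1>;
    dma-names = "rx", "tx";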
diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml
index e614fe3187bb..f61145c91b6d 100644
--- a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml
+++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml
@@ -14,14 +14,32 @@ description: |
peripheral buses such as I2C, UART, and SPI.
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
- enum:
- - qcom,sdm845-gpi-dma
- - qcom,sm8150-gpi-dma
- - qcom,sm8250-gpi-dma
+ oneOf:
+ - enum:
+ - qcom,sdm845-gpi-dma
+ - qcom,sm6350-gpi-dma
+ - items:
+ - enum:
+ - qcom,qcm2290-gpi-dma
+ - qcom,qdu1000-gpi-dma
+ - qcom,sc7280-gpi-dma
+ - qcom,sm6115-gpi-dma
+ - qcom,sm6375-gpi-dma
+ - qcom,sm8350-gpi-dma
+ - qcom,sm8450-gpi-dma
+ - qcom,sm8550-gpi-dma
+ - const: qcom,sm6350-gpi-dma
+ - items:
+ - enum:
+ - qcom,sdm670-gpi-dma
+ - qcom,sm6125-gpi-dma
+ - qcom,sm8150-gpi-dma
+ - qcom,sm8250-gpi-dma
+ - const: qcom,sdm845-gpi-dma
reg:
maxItems: 1
@@ -29,6 +47,7 @@ properties:
interrupts:
description:
Interrupt lines for each GPI instance
+ minItems: 1
maxItems: 13
"#dma-cells":
diff --git a/Documentation/devicetree/bindings/dma/qcom_adm.txt b/Documentation/devicetree/bindings/dma/qcom_adm.txt
deleted file mode 100644
index 9d3b2f917b7b..000000000000
--- a/Documentation/devicetree/bindings/dma/qcom_adm.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-QCOM ADM DMA Controller
-
-Required properties:
-- compatible: must contain "qcom,adm" for IPQ/APQ8064 and MSM8960
-- reg: Address range for DMA registers
-- interrupts: Should contain one interrupt shared by all channels
-- #dma-cells: must be <2>. First cell denotes the channel number. Second cell
- denotes CRCI (client rate control interface) flow control assignment.
-- clocks: Should contain the core clock and interface clock.
-- clock-names: Must contain "core" for the core clock and "iface" for the
- interface clock.
-- resets: Must contain an entry for each entry in reset names.
-- reset-names: Must include the following entries:
- - clk
- - c0
- - c1
- - c2
-- qcom,ee: indicates the security domain identifier used in the secure world.
-
-Example:
- adm_dma: dma@18300000 {
- compatible = "qcom,adm";
- reg = <0x18300000 0x100000>;
- interrupts = <0 170 0>;
- #dma-cells = <2>;
-
- clocks = <&gcc ADM0_CLK>, <&gcc ADM0_PBUS_CLK>;
- clock-names = "core", "iface";
-
- resets = <&gcc ADM0_RESET>,
- <&gcc ADM0_C0_RESET>,
- <&gcc ADM0_C1_RESET>,
- <&gcc ADM0_C2_RESET>;
- reset-names = "clk", "c0", "c1", "c2";
- qcom,ee = <0>;
- };
-
-DMA clients must use the format descripted in the dma.txt file, using a three
-cell specifier for each channel.
-
-Each dmas request consists of 3 cells:
- 1. phandle pointing to the DMA controller
- 2. channel number
- 3. CRCI assignment, if applicable. If no CRCI flow control is required, use 0.
- The CRCI is used for flow control. It identifies the peripheral device that
- is the source/destination for the transferred data.
-
-Example:
-
- spi4: spi@1a280000 {
- spi-max-frequency = <50000000>;
-
- pinctrl-0 = <&spi_pins>;
- pinctrl-names = "default";
-
- cs-gpios = <&qcom_pinmux 20 0>;
-
- dmas = <&adm_dma 6 9>,
- <&adm_dma 5 10>;
- dma-names = "rx", "tx";
- };
diff --git a/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt b/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt
deleted file mode 100644
index cf5b9e44432c..000000000000
--- a/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-QCOM BAM DMA controller
-
-Required properties:
-- compatible: must be one of the following:
- * "qcom,bam-v1.4.0" for MSM8974, APQ8074 and APQ8084
- * "qcom,bam-v1.3.0" for APQ8064, IPQ8064 and MSM8960
- * "qcom,bam-v1.7.0" for MSM8916
-- reg: Address range for DMA registers
-- interrupts: Should contain the one interrupt shared by all channels
-- #dma-cells: must be <1>, the cell in the dmas property of the client device
- represents the channel number
-- clocks: required clock
-- clock-names: must contain "bam_clk" entry
-- qcom,ee : indicates the active Execution Environment identifier (0-7) used in
- the secure world.
-- qcom,controlled-remotely : optional, indicates that the bam is controlled by
- remote proccessor i.e. execution environment.
-- num-channels : optional, indicates supported number of DMA channels in a
- remotely controlled bam.
-- qcom,num-ees : optional, indicates supported number of Execution Environments
- in a remotely controlled bam.
-
-Example:
-
- uart-bam: dma@f9984000 = {
- compatible = "qcom,bam-v1.4.0";
- reg = <0xf9984000 0x15000>;
- interrupts = <0 94 0>;
- clocks = <&gcc GCC_BAM_DMA_AHB_CLK>;
- clock-names = "bam_clk";
- #dma-cells = <1>;
- qcom,ee = <0>;
- };
-
-DMA clients must use the format described in the dma.txt file, using a two cell
-specifier for each channel.
-
-Example:
- serial@f991e000 {
- compatible = "qcom,msm-uart";
- reg = <0xf991e000 0x1000>
- <0xf9944000 0x19000>;
- interrupts = <0 108 0>;
- clocks = <&gcc GCC_BLSP1_UART2_APPS_CLK>,
- <&gcc GCC_BLSP1_AHB_CLK>;
- clock-names = "core", "iface";
-
- dmas = <&uart-bam 0>, <&uart-bam 1>;
- dma-names = "rx", "tx";
- };
diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml
index d8142cbd13d3..03aa067b1229 100644
--- a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml
+++ b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml
@@ -10,7 +10,7 @@ maintainers:
- Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
@@ -42,7 +42,11 @@ properties:
- const: renesas,rcar-dmac
- items:
- - const: renesas,dmac-r8a779a0 # R-Car V3U
+ - enum:
+ - renesas,dmac-r8a779a0 # R-Car V3U
+ - renesas,dmac-r8a779f0 # R-Car S4-8
+ - renesas,dmac-r8a779g0 # R-Car V4H
+ - const: renesas,rcar-gen4-dmac # R-Car Gen4
reg: true
@@ -117,7 +121,7 @@ if:
compatible:
contains:
enum:
- - renesas,dmac-r8a779a0
+ - renesas,rcar-gen4-dmac
then:
properties:
reg:
diff --git a/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml b/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml
index 7a4f415d74dc..c284abc6784a 100644
--- a/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml
+++ b/Documentation/devicetree/bindings/dma/renesas,rz-dmac.yaml
@@ -4,19 +4,21 @@
$id: http://devicetree.org/schemas/dma/renesas,rz-dmac.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Renesas RZ/G2L DMA Controller
+title: Renesas RZ/{G2L,G2UL,V2L} DMA Controller
maintainers:
- Biju Das <biju.das.jz@bp.renesas.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
items:
- enum:
+ - renesas,r9a07g043-dmac # RZ/G2UL
- renesas,r9a07g044-dmac # RZ/G2{L,LC}
+ - renesas,r9a07g054-dmac # RZ/V2L
- const: renesas,rz-dmac
reg:
@@ -52,6 +54,11 @@ properties:
- description: DMA main clock
- description: DMA register access clock
+ clock-names:
+ items:
+ - const: main
+ - const: register
+
'#dma-cells':
const: 1
description:
@@ -75,16 +82,23 @@ properties:
- description: Reset for DMA ARESETN reset terminal
- description: Reset for DMA RST_ASYNC reset terminal
+ reset-names:
+ items:
+ - const: arst
+ - const: rst_async
+
required:
- compatible
- reg
- interrupts
- interrupt-names
- clocks
+ - clock-names
- '#dma-cells'
- dma-channels
- power-domains
- resets
+ - reset-names
additionalProperties: false
@@ -122,9 +136,11 @@ examples:
"ch12", "ch13", "ch14", "ch15";
clocks = <&cpg CPG_MOD R9A07G044_DMAC_ACLK>,
<&cpg CPG_MOD R9A07G044_DMAC_PCLK>;
+ clock-names = "main", "register";
power-domains = <&cpg>;
resets = <&cpg R9A07G044_DMAC_ARESETN>,
<&cpg R9A07G044_DMAC_RST_ASYNC>;
+ reset-names = "arst", "rst_async";
#dma-cells = <1>;
dma-channels = <16>;
};
diff --git a/Documentation/devicetree/bindings/dma/renesas,rzn1-dmamux.yaml b/Documentation/devicetree/bindings/dma/renesas,rzn1-dmamux.yaml
new file mode 100644
index 000000000000..ee9833dcc36c
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/renesas,rzn1-dmamux.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/renesas,rzn1-dmamux.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Renesas RZ/N1 DMA mux
+
+maintainers:
+ - Miquel Raynal <miquel.raynal@bootlin.com>
+
+allOf:
+ - $ref: dma-router.yaml#
+
+properties:
+ compatible:
+ const: renesas,rzn1-dmamux
+
+ reg:
+ maxItems: 1
+ description: DMA mux first register offset within the system control parent.
+
+ '#dma-cells':
+ const: 6
+ description:
+ The first four cells are dedicated to the master DMA controller. The fifth
+ cell gives the DMA mux bit index that must be set, starting from 0. The
+ sixth cell gives the binary value that must be written there, i.e. 0 or 1.
+
+ dma-masters:
+ minItems: 1
+ maxItems: 2
+
+ dma-requests:
+ const: 32
+
+required:
+ - reg
+ - dma-requests
+
+additionalProperties: false
+
+examples:
+ - |
+ dma-router@a0 {
+ compatible = "renesas,rzn1-dmamux";
+ reg = <0xa0 4>;
+ #dma-cells = <6>;
+ dma-masters = <&dma0 &dma1>;
+ dma-requests = <32>;
+ };
diff --git a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.yaml b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.yaml
index ab287c652b2c..17813599fccb 100644
--- a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.yaml
+++ b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.yaml
@@ -10,7 +10,7 @@ maintainers:
- Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml b/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml
index d32a71b975fe..a1af0b906365 100644
--- a/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml
+++ b/Documentation/devicetree/bindings/dma/sifive,fu540-c000-pdma.yaml
@@ -22,10 +22,21 @@ description: |
https://static.dev.sifive.com/FU540-C000-v1.0.pdf
+allOf:
+ - $ref: dma-controller.yaml#
+
properties:
compatible:
items:
- - const: sifive,fu540-c000-pdma
+ - enum:
+ - sifive,fu540-c000-pdma
+ - const: sifive,pdma0
+ description:
+ Should be "sifive,<chip>-pdma" and "sifive,pdma<version>".
+ Supported compatible strings are:
+ "sifive,fu540-c000-pdma" for the SiFive PDMA v0 as integrated onto the
+ SiFive FU540 chip, and "sifive,pdma0" for the SiFive PDMA v0 IP block
+ with no chip integration tweaks.
reg:
maxItems: 1
@@ -34,6 +45,12 @@ properties:
minItems: 1
maxItems: 8
+ dma-channels:
+ description: For backwards-compatibility, the default value is 4
+ minimum: 1
+ maximum: 4
+ default: 4
+
'#dma-cells':
const: 1
@@ -41,16 +58,16 @@ required:
- compatible
- reg
- interrupts
- - '#dma-cells'
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
- dma@3000000 {
- compatible = "sifive,fu540-c000-pdma";
+ dma-controller@3000000 {
+ compatible = "sifive,fu540-c000-pdma", "sifive,pdma0";
reg = <0x3000000 0x8000>;
- interrupts = <23 24 25 26 27 28 29 30>;
+ dma-channels = <4>;
+ interrupts = <23>, <24>, <25>, <26>, <27>, <28>, <29>, <30>;
#dma-cells = <1>;
};
diff --git a/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml b/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml
index 6b35089ac017..5da8291a7de0 100644
--- a/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml
+++ b/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml
@@ -11,11 +11,17 @@ maintainers:
- Andy Shevchenko <andriy.shevchenko@linux.intel.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
- const: snps,dma-spear1340
+ oneOf:
+ - const: snps,dma-spear1340
+ - items:
+ - enum:
+ - renesas,r9a06g032-dma
+ - const: renesas,rzn1-dma
+
"#dma-cells":
minimum: 3
diff --git a/Documentation/devicetree/bindings/dma/snps,dw-axi-dmac.yaml b/Documentation/devicetree/bindings/dma/snps,dw-axi-dmac.yaml
index 79e241498e25..363cf8bd150d 100644
--- a/Documentation/devicetree/bindings/dma/snps,dw-axi-dmac.yaml
+++ b/Documentation/devicetree/bindings/dma/snps,dw-axi-dmac.yaml
@@ -8,19 +8,19 @@ title: Synopsys DesignWare AXI DMA Controller
maintainers:
- Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
- - Jee Heng Sia <jee.heng.sia@intel.com>
description:
Synopsys DesignWare AXI DMA Controller DT Binding
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
enum:
- snps,axi-dma-1.01a
- intel,kmb-axi-dma
+ - starfive,jh7110-axi-dma
reg:
minItems: 1
@@ -34,7 +34,12 @@ properties:
- const: axidma_apb_regs
interrupts:
- maxItems: 1
+ description:
+ If the IP-core synthesis parameter DMAX_INTR_IO_TYPE is set to 1, these
+ are per-channel interrupts. Otherwise, this is a single combined IRQ
+ for all channels.
+ minItems: 1
+ maxItems: 8
clocks:
items:
@@ -53,6 +58,10 @@ properties:
minimum: 1
maximum: 8
+ resets:
+ minItems: 1
+ maxItems: 2
+
snps,dma-masters:
description: |
Number of AXI masters supported by the hardware.
@@ -102,25 +111,44 @@ required:
- snps,priority
- snps,block-size
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - starfive,jh7110-axi-dma
+then:
+ properties:
+ resets:
+ minItems: 2
+ items:
+ - description: AXI reset line
+ - description: AHB reset line
+ - description: module reset
+else:
+ properties:
+ resets:
+ maxItems: 1
+
additionalProperties: false
examples:
- |
- #include <dt-bindings/interrupt-controller/arm-gic.h>
- #include <dt-bindings/interrupt-controller/irq.h>
- /* example with snps,dw-axi-dmac */
- dmac: dma-controller@80000 {
- compatible = "snps,axi-dma-1.01a";
- reg = <0x80000 0x400>;
- clocks = <&core_clk>, <&cfgr_clk>;
- clock-names = "core-clk", "cfgr-clk";
- interrupt-parent = <&intc>;
- interrupts = <27>;
- #dma-cells = <1>;
- dma-channels = <4>;
- snps,dma-masters = <2>;
- snps,data-width = <3>;
- snps,block-size = <4096 4096 4096 4096>;
- snps,priority = <0 1 2 3>;
- snps,axi-max-burst-len = <16>;
- };
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ /* example with snps,dw-axi-dmac */
+ dma-controller@80000 {
+ compatible = "snps,axi-dma-1.01a";
+ reg = <0x80000 0x400>;
+ clocks = <&core_clk>, <&cfgr_clk>;
+ clock-names = "core-clk", "cfgr-clk";
+ interrupt-parent = <&intc>;
+ interrupts = <27>;
+ #dma-cells = <1>;
+ dma-channels = <4>;
+ snps,dma-masters = <2>;
+ snps,data-width = <3>;
+ snps,block-size = <4096 4096 4096 4096>;
+ snps,priority = <0 1 2 3>;
+ snps,axi-max-burst-len = <16>;
+ };
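For the StarFive branch of the schema above, a hedged node sketch showing the
AXI and AHB reset lines; the unit address, clock and reset specifiers are
placeholders, not taken from a datasheet:

    dma-controller@16008000 {
        compatible = "starfive,jh7110-axi-dma";
        reg = <0x16008000 0x10000>;
        clocks = <&stgcrg 1>, <&stgcrg 2>;
        clock-names = "core-clk", "cfgr-clk";
        resets = <&stgcrg 3>, <&stgcrg 4>; /* AXI, AHB */
        interrupts = <73>;
        #dma-cells = <1>;
        dma-channels = <4>;
        snps,dma-masters = <1>;
        snps,data-width = <3>;
        snps,block-size = <4096 4096 4096 4096>;
        snps,priority = <0 1 2 3>;
    };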
diff --git a/Documentation/devicetree/bindings/dma/socionext,uniphier-mio-dmac.yaml b/Documentation/devicetree/bindings/dma/socionext,uniphier-mio-dmac.yaml
index e7bf6dd7da29..23c8a7bf24de 100644
--- a/Documentation/devicetree/bindings/dma/socionext,uniphier-mio-dmac.yaml
+++ b/Documentation/devicetree/bindings/dma/socionext,uniphier-mio-dmac.yaml
@@ -14,7 +14,7 @@ maintainers:
- Masahiro Yamada <yamada.masahiro@socionext.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/socionext,uniphier-xdmac.yaml b/Documentation/devicetree/bindings/dma/socionext,uniphier-xdmac.yaml
index 371f18773198..da61d1ddc9c3 100644
--- a/Documentation/devicetree/bindings/dma/socionext,uniphier-xdmac.yaml
+++ b/Documentation/devicetree/bindings/dma/socionext,uniphier-xdmac.yaml
@@ -15,7 +15,7 @@ maintainers:
- Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/sprd-dma.txt b/Documentation/devicetree/bindings/dma/sprd-dma.txt
index adccea9941f1..c7e9b5fd50e7 100644
--- a/Documentation/devicetree/bindings/dma/sprd-dma.txt
+++ b/Documentation/devicetree/bindings/dma/sprd-dma.txt
@@ -8,10 +8,13 @@ Required properties:
- interrupts: Should contain one interrupt shared by all channels.
- #dma-cells: must be <1>. Used to represent the number of integer
 cells in the dmas property of the client device.
-- #dma-channels : Number of DMA channels supported. Should be 32.
+- dma-channels : Number of DMA channels supported. Should be 32.
- clock-names: Should contain the clock of the DMA controller.
- clocks: Should contain a clock specifier for each entry in clock-names.
+Deprecated properties:
+- #dma-channels : Number of DMA channels supported. Should be 32.
+
Example:
Controller:
@@ -20,7 +23,7 @@ apdma: dma-controller@20100000 {
reg = <0x20100000 0x4000>;
interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
#dma-cells = <1>;
- #dma-channels = <32>;
+ dma-channels = <32>;
clock-names = "enable";
clocks = <&clk_ap_ahb_gates 5>;
};
diff --git a/Documentation/devicetree/bindings/dma/st,stm32-dma.yaml b/Documentation/devicetree/bindings/dma/st,stm32-dma.yaml
index 4bf676fd25dc..329847ef096a 100644
--- a/Documentation/devicetree/bindings/dma/st,stm32-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/st,stm32-dma.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/dma/st,stm32-dma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 DMA Controller bindings
+title: STMicroelectronics STM32 DMA Controller
description: |
The STM32 DMA is a general-purpose direct memory access controller capable of
@@ -50,10 +50,10 @@ description: |
maintainers:
- - Amelie Delaunay <amelie.delaunay@st.com>
+ - Amelie Delaunay <amelie.delaunay@foss.st.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
"#dma-cells":
diff --git a/Documentation/devicetree/bindings/dma/st,stm32-dmamux.yaml b/Documentation/devicetree/bindings/dma/st,stm32-dmamux.yaml
index c8d2b51d8410..e722fbcd8a5f 100644
--- a/Documentation/devicetree/bindings/dma/st,stm32-dmamux.yaml
+++ b/Documentation/devicetree/bindings/dma/st,stm32-dmamux.yaml
@@ -4,13 +4,13 @@
$id: http://devicetree.org/schemas/dma/st,stm32-dmamux.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 DMA MUX (DMA request router) bindings
+title: STMicroelectronics STM32 DMA MUX (DMA request router)
maintainers:
- - Amelie Delaunay <amelie.delaunay@st.com>
+ - Amelie Delaunay <amelie.delaunay@foss.st.com>
allOf:
- - $ref: "dma-router.yaml#"
+ - $ref: dma-router.yaml#
properties:
"#dma-cells":
@@ -46,9 +46,8 @@ examples:
#dma-cells = <3>;
dma-requests = <128>;
dma-channels = <16>;
- dma-masters = <&dma1 &dma2>;
+ dma-masters = <&dma1>, <&dma2>;
clocks = <&timer_clk>;
};
...
-
diff --git a/Documentation/devicetree/bindings/dma/st,stm32-mdma.yaml b/Documentation/devicetree/bindings/dma/st,stm32-mdma.yaml
index c30be840be1c..3874544dfa74 100644
--- a/Documentation/devicetree/bindings/dma/st,stm32-mdma.yaml
+++ b/Documentation/devicetree/bindings/dma/st,stm32-mdma.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/dma/st,stm32-mdma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 MDMA Controller bindings
+title: STMicroelectronics STM32 MDMA Controller
description: |
The STM32 MDMA is a general-purpose direct memory access controller capable of
@@ -50,10 +50,10 @@ description: |
if no HW ack signal is used by the MDMA client
maintainers:
- - Amelie Delaunay <amelie.delaunay@st.com>
+ - Amelie Delaunay <amelie.delaunay@foss.st.com>
allOf:
- - $ref: "dma-controller.yaml#"
+ - $ref: dma-controller.yaml#
properties:
"#dma-cells":
@@ -104,4 +104,3 @@ examples:
};
...
-
diff --git a/Documentation/devicetree/bindings/dma/ste-dma40.txt b/Documentation/devicetree/bindings/dma/ste-dma40.txt
deleted file mode 100644
index 99ab5c4d331e..000000000000
--- a/Documentation/devicetree/bindings/dma/ste-dma40.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-* DMA40 DMA Controller
-
-Required properties:
-- compatible: "stericsson,dma40"
-- reg: Address range of the DMAC registers
-- reg-names: Names of the above areas to use during resource look-up
-- interrupt: Should contain the DMAC interrupt number
-- #dma-cells: must be <3>
-- memcpy-channels: Channels to be used for memcpy
-
-Optional properties:
-- dma-channels: Number of channels supported by hardware - if not present
- the driver will attempt to obtain the information from H/W
-- disabled-channels: Channels which can not be used
-
-Example:
-
- dma: dma-controller@801c0000 {
- compatible = "stericsson,db8500-dma40", "stericsson,dma40";
- reg = <0x801C0000 0x1000 0x40010000 0x800>;
- reg-names = "base", "lcpa";
- interrupt-parent = <&intc>;
- interrupts = <0 25 0x4>;
-
- #dma-cells = <2>;
- memcpy-channels = <56 57 58 59 60>;
- disabled-channels = <12>;
- dma-channels = <8>;
- };
-
-Clients
-Required properties:
-- dmas: Comma separated list of dma channel requests
-- dma-names: Names of the aforementioned requested channels
-
-Each dmas request consists of 4 cells:
- 1. A phandle pointing to the DMA controller
- 2. Device signal number, the signal line for single and burst requests
- connected from the device to the DMA40 engine
- 3. The DMA request line number (only when 'use fixed channel' is set)
- 4. A 32bit mask specifying; mode, direction and endianness
- [NB: This list will grow]
- 0x00000001: Mode:
- Logical channel when unset
- Physical channel when set
- 0x00000002: Direction:
- Memory to Device when unset
- Device to Memory when set
- 0x00000004: Endianness:
- Little endian when unset
- Big endian when set
- 0x00000008: Use fixed channel:
- Use automatic channel selection when unset
- Use DMA request line number when set
- 0x00000010: Set channel as high priority:
- Normal priority when unset
- High priority when set
-
-Existing signal numbers for the DB8500 ASIC. Unless specified, the signals are
-bidirectional, i.e. the same for RX and TX operations:
-
-0: SPI controller 0
-1: SD/MMC controller 0 (unused)
-2: SD/MMC controller 1 (unused)
-3: SD/MMC controller 2 (unused)
-4: I2C port 1
-5: I2C port 3
-6: I2C port 2
-7: I2C port 4
-8: Synchronous Serial Port SSP0
-9: Synchronous Serial Port SSP1
-10: Multi-Channel Display Engine MCDE RX
-11: UART port 2
-12: UART port 1
-13: UART port 0
-14: Multirate Serial Port MSP2
-15: I2C port 0
-16: USB OTG in/out endpoints 7 & 15
-17: USB OTG in/out endpoints 6 & 14
-18: USB OTG in/out endpoints 5 & 13
-19: USB OTG in/out endpoints 4 & 12
-20: SLIMbus or HSI channel 0
-21: SLIMbus or HSI channel 1
-22: SLIMbus or HSI channel 2
-23: SLIMbus or HSI channel 3
-24: Multimedia DSP SXA0
-25: Multimedia DSP SXA1
-26: Multimedia DSP SXA2
-27: Multimedia DSP SXA3
-28: SD/MM controller 2
-29: SD/MM controller 0
-30: MSP port 1 on DB8500 v1, MSP port 3 on DB8500 v2
-31: MSP port 0 or SLIMbus channel 0
-32: SD/MM controller 1
-33: SPI controller 2
-34: i2c3 RX2 TX2
-35: SPI controller 1
-36: USB OTG in/out endpoints 3 & 11
-37: USB OTG in/out endpoints 2 & 10
-38: USB OTG in/out endpoints 1 & 9
-39: USB OTG in/out endpoints 8
-40: SPI controller 3
-41: SD/MM controller 3
-42: SD/MM controller 4
-43: SD/MM controller 5
-44: Multimedia DSP SXA4
-45: Multimedia DSP SXA5
-46: SLIMbus channel 8 or Multimedia DSP SXA6
-47: SLIMbus channel 9 or Multimedia DSP SXA7
-48: Crypto Accelerator 1
-49: Crypto Accelerator 1 TX or Hash Accelerator 1 TX
-50: Hash Accelerator 1 TX
-51: memcpy TX (to be used by the DMA driver for memcpy operations)
-52: SLIMbus or HSI channel 4
-53: SLIMbus or HSI channel 5
-54: SLIMbus or HSI channel 6
-55: SLIMbus or HSI channel 7
-56: memcpy (to be used by the DMA driver for memcpy operations)
-57: memcpy (to be used by the DMA driver for memcpy operations)
-58: memcpy (to be used by the DMA driver for memcpy operations)
-59: memcpy (to be used by the DMA driver for memcpy operations)
-60: memcpy (to be used by the DMA driver for memcpy operations)
-61: Crypto Accelerator 0
-62: Crypto Accelerator 0 TX or Hash Accelerator 0 TX
-63: Hash Accelerator 0 TX
-
-Example:
-
- uart@80120000 {
- compatible = "arm,pl011", "arm,primecell";
- reg = <0x80120000 0x1000>;
- interrupts = <0 11 0x4>;
-
- dmas = <&dma 13 0 0x2>, /* Logical - DevToMem */
- <&dma 13 0 0x0>; /* Logical - MemToDev */
- dma-names = "rx", "rx";
-
- };
diff --git a/Documentation/devicetree/bindings/dma/stericsson,dma40.yaml b/Documentation/devicetree/bindings/dma/stericsson,dma40.yaml
new file mode 100644
index 000000000000..64845347f44d
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/stericsson,dma40.yaml
@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/stericsson,dma40.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ST-Ericsson DMA40 DMA Engine
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+allOf:
+ - $ref: dma-controller.yaml#
+
+properties:
+ "#dma-cells":
+ const: 3
+ description: |
+ The first cell is the unique device channel number as indicated by this
+ table for DB8500, which is the only ASIC known to use DMA40:
+
+ 0: SPI controller 0
+ 1: SD/MMC controller 0 (unused)
+ 2: SD/MMC controller 1 (unused)
+ 3: SD/MMC controller 2 (unused)
+ 4: I2C port 1
+ 5: I2C port 3
+ 6: I2C port 2
+ 7: I2C port 4
+ 8: Synchronous Serial Port SSP0
+ 9: Synchronous Serial Port SSP1
+ 10: Multi-Channel Display Engine MCDE RX
+ 11: UART port 2
+ 12: UART port 1
+ 13: UART port 0
+ 14: Multirate Serial Port MSP2
+ 15: I2C port 0
+ 16: USB OTG in/out endpoints 7 & 15
+ 17: USB OTG in/out endpoints 6 & 14
+ 18: USB OTG in/out endpoints 5 & 13
+ 19: USB OTG in/out endpoints 4 & 12
+ 20: SLIMbus or HSI channel 0
+ 21: SLIMbus or HSI channel 1
+ 22: SLIMbus or HSI channel 2
+ 23: SLIMbus or HSI channel 3
+ 24: Multimedia DSP SXA0
+ 25: Multimedia DSP SXA1
+ 26: Multimedia DSP SXA2
+ 27: Multimedia DSP SXA3
+ 28: SD/MMC controller 2
+ 29: SD/MMC controller 0
+ 30: MSP port 1 on DB8500 v1, MSP port 3 on DB8500 v2
+ 31: MSP port 0 or SLIMbus channel 0
+ 32: SD/MMC controller 1
+ 33: SPI controller 2
+ 34: i2c3 RX2 TX2
+ 35: SPI controller 1
+ 36: USB OTG in/out endpoints 3 & 11
+ 37: USB OTG in/out endpoints 2 & 10
+ 38: USB OTG in/out endpoints 1 & 9
+ 39: USB OTG in/out endpoints 8
+ 40: SPI controller 3
+ 41: SD/MMC controller 3
+ 42: SD/MMC controller 4
+ 43: SD/MMC controller 5
+ 44: Multimedia DSP SXA4
+ 45: Multimedia DSP SXA5
+ 46: SLIMbus channel 8 or Multimedia DSP SXA6
+ 47: SLIMbus channel 9 or Multimedia DSP SXA7
+ 48: Crypto Accelerator 1
+ 49: Crypto Accelerator 1 TX or Hash Accelerator 1 TX
+ 50: Hash Accelerator 1 TX
+ 51: memcpy TX (to be used by the DMA driver for memcpy operations)
+ 52: SLIMbus or HSI channel 4
+ 53: SLIMbus or HSI channel 5
+ 54: SLIMbus or HSI channel 6
+ 55: SLIMbus or HSI channel 7
+ 56: memcpy (to be used by the DMA driver for memcpy operations)
+ 57: memcpy (to be used by the DMA driver for memcpy operations)
+ 58: memcpy (to be used by the DMA driver for memcpy operations)
+ 59: memcpy (to be used by the DMA driver for memcpy operations)
+ 60: memcpy (to be used by the DMA driver for memcpy operations)
+ 61: Crypto Accelerator 0
+ 62: Crypto Accelerator 0 TX or Hash Accelerator 0 TX
+ 63: Hash Accelerator 0 TX
+
+ The second cell is the DMA request line number. This is only used when
+ a fixed channel is allocated, and indicated by setting bit 3 in the
+ flags field (see below).
+
+ The third cell is a 32bit flags bitfield with the following possible
+ bits set:
+ 0x00000001 (bit 0) - mode:
+ Logical channel when unset
+ Physical channel when set
+ 0x00000002 (bit 1) - direction:
+ Memory to Device when unset
+ Device to Memory when set
+ 0x00000004 (bit 2) - endianness:
+ Little endian when unset
+ Big endian when set
+ 0x00000008 (bit 3) - use fixed channel:
+ Use automatic channel selection when unset
+ Use DMA request line number when set
+ 0x00000010 (bit 4) - set channel as high priority:
+ Normal priority when unset
+ High priority when set
+
+ compatible:
+ items:
+ - const: stericsson,db8500-dma40
+ - const: stericsson,dma40
+
+ reg:
+ items:
+ - description: DMA40 memory base
+ - description: LCPA memory base
+
+ reg-names:
+ items:
+ - const: base
+ - const: lcpa
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ memcpy-channels:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ description: Array of u32 elements indicating which channels on the DMA
+ engine are eligible for memcpy transfers
+
+required:
+ - "#dma-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - memcpy-channels
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/mfd/dbx500-prcmu.h>
+ dma-controller@801c0000 {
+ compatible = "stericsson,db8500-dma40", "stericsson,dma40";
+ reg = <0x801c0000 0x1000>, <0x40010000 0x800>;
+ reg-names = "base", "lcpa";
+ interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>;
+ #dma-cells = <3>;
+ memcpy-channels = <56 57 58 59 60>;
+ clocks = <&prcmu_clk PRCMU_DMACLK>;
+ };
+...
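The client convention from the removed ste-dma40.txt still applies; a sketch
for UART port 0 (signal 13 in the table above) using logical channels, with
the flags cell built from the bitfield described under "#dma-cells":

    uart0: serial@80120000 {
        compatible = "arm,pl011", "arm,primecell";
        reg = <0x80120000 0x1000>;
        interrupts = <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>;
        dmas = <&dma 13 0 0x2>, /* Logical - DevToMem */
               <&dma 13 0 0x0>; /* Logical - MemToDev */
        dma-names = "rx", "tx";
    };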
diff --git a/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt
index b849a1ed389d..47e477cce6d2 100644
--- a/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt
+++ b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt
@@ -4,7 +4,7 @@ Required properties:
- compatible: "ti,dra7-dma-crossbar" for DRA7xx DMA crossbar
"ti,am335x-edma-crossbar" for AM335x and AM437x
- reg: Memory map for accessing module
-- #dma-cells: Should be set to to match with the DMA controller's dma-cells
+- #dma-cells: Should be set to match with the DMA controller's dma-cells
for ti,dra7-dma-crossbar and <3> for ti,am335x-edma-crossbar.
- dma-requests: Number of DMA requests the crossbar can receive
- dma-masters: phandle pointing to the DMA controller
diff --git a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml
index df29d59d13a8..beecfe7a1732 100644
--- a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml
+++ b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml
@@ -6,7 +6,7 @@
$id: http://devicetree.org/schemas/dma/ti/k3-bcdma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Texas Instruments K3 DMSS BCDMA Device Tree Bindings
+title: Texas Instruments K3 DMSS BCDMA
maintainers:
- Peter Ujfalusi <peter.ujfalusi@gmail.com>
@@ -28,12 +28,19 @@ description: |
PDMAs can be configured via BCDMA split channel's peer registers to match with
the configuration of the legacy peripheral.
-allOf:
- - $ref: /schemas/dma/dma-controller.yaml#
-
properties:
compatible:
- const: ti,am64-dmss-bcdma
+ enum:
+ - ti,am62a-dmss-bcdma-csirx
+ - ti,am64-dmss-bcdma
+
+ reg:
+ minItems: 3
+ maxItems: 5
+
+ reg-names:
+ minItems: 3
+ maxItems: 5
"#dma-cells":
const: 3
@@ -64,19 +71,13 @@ properties:
cell 3: ASEL value for the channel
- reg:
- maxItems: 5
-
- reg-names:
- items:
- - const: gcfg
- - const: bchanrt
- - const: rchanrt
- - const: tchanrt
- - const: ringrt
-
msi-parent: true
+ power-domains:
+ description:
+ Power domain if available
+ maxItems: 1
+
ti,asel:
$ref: /schemas/types.yaml#/definitions/uint32
description: ASEL value for non slave channels
@@ -122,10 +123,51 @@ required:
- msi-parent
- ti,sci
- ti,sci-dev-id
- - ti,sci-rm-range-bchan
- - ti,sci-rm-range-tchan
- ti,sci-rm-range-rchan
+allOf:
+ - $ref: /schemas/dma/dma-controller.yaml#
+ - $ref: /schemas/arm/keystone/ti,k3-sci-common.yaml#
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: ti,am62a-dmss-bcdma-csirx
+ then:
+ properties:
+ ti,sci-rm-range-bchan: false
+ ti,sci-rm-range-tchan: false
+
+ reg:
+ maxItems: 3
+
+ reg-names:
+ items:
+ - const: gcfg
+ - const: rchanrt
+ - const: ringrt
+
+ required:
+ - power-domains
+
+ else:
+ properties:
+ reg:
+ minItems: 5
+
+ reg-names:
+ items:
+ - const: gcfg
+ - const: bchanrt
+ - const: rchanrt
+ - const: tchanrt
+ - const: ringrt
+
+ required:
+ - ti,sci-rm-range-bchan
+ - ti,sci-rm-range-tchan
+
unevaluatedProperties: false
examples:
diff --git a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml
index ea19d12a9337..a69f62f854d8 100644
--- a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml
+++ b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml
@@ -6,7 +6,7 @@
$id: http://devicetree.org/schemas/dma/ti/k3-pktdma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Texas Instruments K3 DMSS PKTDMA Device Tree Bindings
+title: Texas Instruments K3 DMSS PKTDMA
maintainers:
- Peter Ujfalusi <peter.ujfalusi@gmail.com>
@@ -25,6 +25,7 @@ description: |
allOf:
- $ref: /schemas/dma/dma-controller.yaml#
+ - $ref: /schemas/arm/keystone/ti,k3-sci-common.yaml#
properties:
compatible:
diff --git a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml
index 6a09bbf83d46..22f6c5e2f7f4 100644
--- a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml
+++ b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml
@@ -6,7 +6,7 @@
$id: http://devicetree.org/schemas/dma/ti/k3-udma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Texas Instruments K3 NAVSS Unified DMA Device Tree Bindings
+title: Texas Instruments K3 NAVSS Unified DMA
maintainers:
- Peter Ujfalusi <peter.ujfalusi@gmail.com>
@@ -43,7 +43,8 @@ description: |
configuration of the legacy peripheral.
allOf:
- - $ref: "../dma-controller.yaml#"
+ - $ref: ../dma-controller.yaml#
+ - $ref: /schemas/arm/keystone/ti,k3-sci-common.yaml#
properties:
"#dma-cells":
@@ -78,14 +79,6 @@ properties:
msi-parent: true
- ti,sci:
- description: phandle to TI-SCI compatible System controller node
- $ref: /schemas/types.yaml#/definitions/phandle
-
- ti,sci-dev-id:
- description: TI-SCI device id of UDMAP
- $ref: /schemas/types.yaml#/definitions/uint32
-
ti,ringacc:
description: phandle to the ring accelerator node
$ref: /schemas/types.yaml#/definitions/phandle
diff --git a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
index 325aca52cd43..d1700a5c36bf 100644
--- a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
+++ b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
@@ -110,7 +110,11 @@ axi_vdma_0: axivdma@40030000 {
Required properties:
- dmas: a list of <[Video DMA device phandle] [Channel ID]> pairs,
where Channel ID is '0' for write/tx and '1' for read/rx
- channel.
+ channel. For MCDMA, MM2S (write/tx) channel IDs start from
+ '0' and are in the [0-15] range. S2MM (read/rx) channel IDs
+ start from '16' and are in the [16-31] range. These channel
+ IDs are fixed irrespective of IP configuration.
+
- dma-names: a list of DMA channel names, one per "dmas" entry
Example:
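For the MCDMA case, a hypothetical client entry following the ID scheme above
(MM2S channel 0 for tx, S2MM channel 16 for rx):

	dmas = <&axi_mcdma_0 0>, <&axi_mcdma_0 16>;
	dma-names = "tx", "rx";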
diff --git a/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dma-1.0.yaml b/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dma-1.0.yaml
new file mode 100644
index 000000000000..23ada8f87526
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dma-1.0.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/xilinx/xlnx,zynqmp-dma-1.0.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx ZynqMP DMA Engine
+
+description: |
+ The Xilinx ZynqMP DMA engine supports memory to memory transfers,
+ memory to device and device to memory transfers. It also has flow
+ control and rate control support for slave/peripheral dma access.
+
+maintainers:
+ - Michael Tretter <m.tretter@pengutronix.de>
+
+allOf:
+ - $ref: ../dma-controller.yaml#
+
+properties:
+ "#dma-cells":
+ const: 1
+
+ compatible:
+ const: xlnx,zynqmp-dma-1.0
+
+ reg:
+ description: memory map for gdma/adma module access
+ maxItems: 1
+
+ interrupts:
+ description: DMA channel interrupt
+ maxItems: 1
+
+ clocks:
+ description: input clocks
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: clk_main
+ - const: clk_apb
+
+ xlnx,bus-width:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum:
+ - 64
+ - 128
+ description: AXI bus width in bits
+
+ iommus:
+ maxItems: 1
+
+ power-domains:
+ maxItems: 1
+
+ dma-coherent:
+ description: present if dma operations are coherent
+
+required:
+ - "#dma-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/xlnx-zynqmp-clk.h>
+
+ fpd_dma_chan1: dma-controller@fd500000 {
+ compatible = "xlnx,zynqmp-dma-1.0";
+ reg = <0xfd500000 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <0 117 0x4>;
+ #dma-cells = <1>;
+ clock-names = "clk_main", "clk_apb";
+ clocks = <&zynqmp_clk GDMA_REF>, <&zynqmp_clk LPD_LSBUS>;
+ xlnx,bus-width = <128>;
+ dma-coherent;
+ };
diff --git a/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml b/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml
index 2a595b18ff6c..d6cbd95ec26d 100644
--- a/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml
+++ b/Documentation/devicetree/bindings/dma/xilinx/xlnx,zynqmp-dpdma.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/dma/xilinx/xlnx,zynqmp-dpdma.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Xilinx ZynqMP DisplayPort DMA Controller Device Tree Bindings
+title: Xilinx ZynqMP DisplayPort DMA Controller
description: |
These bindings describe the DMA engine included in the Xilinx ZynqMP
@@ -16,7 +16,7 @@ maintainers:
- Laurent Pinchart <laurent.pinchart@ideasonboard.com>
allOf:
- - $ref: "../dma-controller.yaml#"
+ - $ref: ../dma-controller.yaml#
properties:
"#dma-cells":
diff --git a/Documentation/devicetree/bindings/dma/xilinx/zynqmp_dma.txt b/Documentation/devicetree/bindings/dma/xilinx/zynqmp_dma.txt
deleted file mode 100644
index 07a5a7aa9ea0..000000000000
--- a/Documentation/devicetree/bindings/dma/xilinx/zynqmp_dma.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Xilinx ZynqMP DMA engine, it does support memory to memory transfers,
-memory to device and device to memory transfers. It also has flow
-control and rate control support for slave/peripheral dma access.
-
-Required properties:
-- compatible : Should be "xlnx,zynqmp-dma-1.0"
-- reg : Memory map for gdma/adma module access.
-- interrupts : Should contain DMA channel interrupt.
-- xlnx,bus-width : Axi buswidth in bits. Should contain 128 or 64
-- clock-names : List of input clocks "clk_main", "clk_apb"
- (see clock bindings for details)
-
-Optional properties:
-- dma-coherent : Present if dma operations are coherent.
-
-Example:
-++++++++
-fpd_dma_chan1: dma@fd500000 {
- compatible = "xlnx,zynqmp-dma-1.0";
- reg = <0x0 0xFD500000 0x1000>;
- interrupt-parent = <&gic>;
- interrupts = <0 117 4>;
- clock-names = "clk_main", "clk_apb";
- xlnx,bus-width = <128>;
- dma-coherent;
-};
diff --git a/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml b/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml
index 7afc9f2be13a..9af40da5688e 100644
--- a/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml
+++ b/Documentation/devicetree/bindings/dsp/fsl,dsp.yaml
@@ -8,6 +8,7 @@ title: NXP i.MX8 DSP core
maintainers:
- Daniel Baluta <daniel.baluta@nxp.com>
+ - Shengjiu Wang <shengjiu.wang@nxp.com>
description: |
Some boards from i.MX8 family contain a DSP core used for
@@ -19,6 +20,11 @@ properties:
- fsl,imx8qxp-dsp
- fsl,imx8qm-dsp
- fsl,imx8mp-dsp
+ - fsl,imx8ulp-dsp
+ - fsl,imx8qxp-hifi4
+ - fsl,imx8qm-hifi4
+ - fsl,imx8mp-hifi4
+ - fsl,imx8ulp-hifi4
reg:
maxItems: 1
@@ -28,37 +34,53 @@ properties:
- description: ipg clock
- description: ocram clock
- description: core clock
+ - description: debug interface clock
+ - description: message unit clock
+ minItems: 3
clock-names:
items:
- const: ipg
- const: ocram
- const: core
+ - const: debug
+ - const: mu
+ minItems: 3
power-domains:
description:
List of phandle and PM domain specifier as documented in
Documentation/devicetree/bindings/power/power_domain.txt
+ minItems: 1
maxItems: 4
mboxes:
description:
List of <&phandle type channel> - 2 channels for TXDB, 2 channels for RXDB
+ or - 1 channel for TX, 1 channel for RX, 1 channel for RXDB
(see mailbox/fsl,mu.txt)
+ minItems: 3
maxItems: 4
mbox-names:
- items:
- - const: txdb0
- - const: txdb1
- - const: rxdb0
- - const: rxdb1
+ minItems: 3
+ maxItems: 4
memory-region:
description:
phandle to a node describing reserved memory (System RAM memory)
used by DSP (see bindings/reserved-memory/reserved-memory.txt)
- maxItems: 1
+ minItems: 1
+ maxItems: 4
+
+ firmware-name:
+ description: |
+ Default name of the firmware to load to the remote processor.
+
+ fsl,dsp-ctrl:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to syscon block which provides access for processor enablement
required:
- compatible
@@ -70,6 +92,58 @@ required:
- mbox-names
- memory-region
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - fsl,imx8qxp-dsp
+ - fsl,imx8qm-dsp
+ - fsl,imx8qxp-hifi4
+ - fsl,imx8qm-hifi4
+ then:
+ properties:
+ power-domains:
+ minItems: 4
+ else:
+ properties:
+ power-domains:
+ maxItems: 1
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - fsl,imx8qxp-hifi4
+ - fsl,imx8qm-hifi4
+ - fsl,imx8mp-hifi4
+ - fsl,imx8ulp-hifi4
+ then:
+ properties:
+ memory-region:
+ minItems: 4
+ mboxes:
+ maxItems: 3
+ mbox-names:
+ items:
+ - const: tx
+ - const: rx
+ - const: rxdb
+ else:
+ properties:
+ memory-region:
+ maxItems: 1
+ mboxes:
+ minItems: 4
+ mbox-names:
+ items:
+ - const: txdb0
+ - const: txdb1
+ - const: rxdb0
+ - const: rxdb1
+
additionalProperties: false
examples:
@@ -91,3 +165,41 @@ examples:
mboxes = <&lsio_mu13 2 0>, <&lsio_mu13 2 1>, <&lsio_mu13 3 0>, <&lsio_mu13 3 1>;
memory-region = <&dsp_reserved>;
};
+ - |
+ #include <dt-bindings/clock/imx8mp-clock.h>
+ dsp_reserved: dsp@92400000 {
+ reg = <0x92400000 0x1000000>;
+ no-map;
+ };
+ dsp_vdev0vring0: vdev0vring0@942f0000 {
+ reg = <0x942f0000 0x8000>;
+ no-map;
+ };
+ dsp_vdev0vring1: vdev0vring1@942f8000 {
+ reg = <0x942f8000 0x8000>;
+ no-map;
+ };
+ dsp_vdev0buffer: vdev0buffer@94300000 {
+ compatible = "shared-dma-pool";
+ reg = <0x94300000 0x100000>;
+ no-map;
+ };
+
+ dsp: dsp@3b6e8000 {
+ compatible = "fsl,imx8mp-hifi4";
+ reg = <0x3b6e8000 0x88000>;
+ clocks = <&audio_blk_ctrl IMX8MP_CLK_AUDIOMIX_DSP_ROOT>,
+ <&audio_blk_ctrl IMX8MP_CLK_AUDIOMIX_OCRAMA_IPG>,
+ <&audio_blk_ctrl IMX8MP_CLK_AUDIOMIX_DSP_ROOT>,
+ <&audio_blk_ctrl IMX8MP_CLK_AUDIOMIX_DSPDBG_ROOT>;
+ clock-names = "ipg", "ocram", "core", "debug";
+ firmware-name = "imx/dsp/hifi4.bin";
+ power-domains = <&audiomix_pd>;
+ mbox-names = "tx", "rx", "rxdb";
+ mboxes = <&mu2 0 0>,
+ <&mu2 1 0>,
+ <&mu2 3 0>;
+ memory-region = <&dsp_vdev0buffer>, <&dsp_vdev0vring0>,
+ <&dsp_vdev0vring1>, <&dsp_reserved>;
+ fsl,dsp-ctrl = <&audio_blk_ctrl>;
+ };
diff --git a/Documentation/devicetree/bindings/dsp/mediatek,mt8186-dsp.yaml b/Documentation/devicetree/bindings/dsp/mediatek,mt8186-dsp.yaml
new file mode 100644
index 000000000000..88575da1e6d5
--- /dev/null
+++ b/Documentation/devicetree/bindings/dsp/mediatek,mt8186-dsp.yaml
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dsp/mediatek,mt8186-dsp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek mt8186 DSP core
+
+maintainers:
+ - Tinghan Shen <tinghan.shen@mediatek.com>
+
+description: |
+ MediaTek mt8186 SoC contains a DSP core used for
+ advanced audio pre- and post-processing.
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt8186-dsp
+ - mediatek,mt8188-dsp
+
+ reg:
+ items:
+ - description: Address and size of the DSP config registers
+ - description: Address and size of the DSP SRAM
+ - description: Address and size of the DSP secure registers
+ - description: Address and size of the DSP bus registers
+
+ reg-names:
+ items:
+ - const: cfg
+ - const: sram
+ - const: sec
+ - const: bus
+
+ clocks:
+ items:
+ - description: mux for audio dsp clock
+ - description: mux for audio dsp local bus
+
+ clock-names:
+ items:
+ - const: audiodsp
+ - const: adsp_bus
+
+ power-domains:
+ maxItems: 1
+
+ mboxes:
+ items:
+ - description: mailbox for receiving audio DSP requests.
+ - description: mailbox for transmitting requests to audio DSP.
+
+ mbox-names:
+ items:
+ - const: rx
+ - const: tx
+
+ memory-region:
+ items:
+ - description: dma buffer between host and DSP.
+ - description: DSP system memory.
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+ - power-domains
+ - mbox-names
+ - mboxes
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/mt8186-clk.h>
+ dsp@10680000 {
+ compatible = "mediatek,mt8186-dsp";
+ reg = <0x10680000 0x2000>,
+ <0x10800000 0x100000>,
+ <0x1068b000 0x100>,
+ <0x1068f000 0x1000>;
+ reg-names = "cfg", "sram", "sec", "bus";
+ clocks = <&topckgen CLK_TOP_AUDIODSP>,
+ <&topckgen CLK_TOP_ADSP_BUS>;
+ clock-names = "audiodsp",
+ "adsp_bus";
+ power-domains = <&spm 6>;
+ mbox-names = "rx", "tx";
+ mboxes = <&adsp_mailbox0>, <&adsp_mailbox1>;
+ };
diff --git a/Documentation/devicetree/bindings/dsp/mediatek,mt8195-dsp.yaml b/Documentation/devicetree/bindings/dsp/mediatek,mt8195-dsp.yaml
new file mode 100644
index 000000000000..ca8d8661f872
--- /dev/null
+++ b/Documentation/devicetree/bindings/dsp/mediatek,mt8195-dsp.yaml
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dsp/mediatek,mt8195-dsp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek mt8195 DSP core
+
+maintainers:
+ - YC Hung <yc.hung@mediatek.com>
+
+description: |
+ Some boards from mt8195 contain a DSP core used for
+ advanced audio pre- and post-processing.
+
+properties:
+ compatible:
+ const: mediatek,mt8195-dsp
+
+ reg:
+ items:
+ - description: Address and size of the DSP Cfg registers
+ - description: Address and size of the DSP SRAM
+
+ reg-names:
+ items:
+ - const: cfg
+ - const: sram
+
+ clocks:
+ items:
+ - description: mux for audio dsp clock
+ - description: 26M clock
+ - description: mux for audio dsp local bus
+ - description: default audio dsp local bus clock source
+ - description: clock gate for audio dsp clock
+ - description: mux for audio dsp access external bus
+
+ clock-names:
+ items:
+ - const: adsp_sel
+ - const: clk26m_ck
+ - const: audio_local_bus
+ - const: mainpll_d7_d2
+ - const: scp_adsp_audiodsp
+ - const: audio_h
+
+ power-domains:
+ maxItems: 1
+
+ mboxes:
+ items:
+ - description: mailbox for receiving audio DSP requests.
+ - description: mailbox for transmitting requests to audio DSP.
+
+ mbox-names:
+ items:
+ - const: rx
+ - const: tx
+
+ memory-region:
+ items:
+ - description: dma buffer between host and DSP.
+ - description: DSP system memory.
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+ - clock-names
+ - memory-region
+ - power-domains
+ - mbox-names
+ - mboxes
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ dsp@10803000 {
+ compatible = "mediatek,mt8195-dsp";
+ reg = <0x10803000 0x1000>,
+ <0x10840000 0x40000>;
+ reg-names = "cfg", "sram";
+ clocks = <&topckgen 10>, //CLK_TOP_ADSP
+ <&clk26m>,
+ <&topckgen 107>, //CLK_TOP_AUDIO_LOCAL_BUS
+ <&topckgen 136>, //CLK_TOP_MAINPLL_D7_D2
+ <&scp_adsp 0>, //CLK_SCP_ADSP_AUDIODSP
+ <&topckgen 34>; //CLK_TOP_AUDIO_H
+ clock-names = "adsp_sel",
+ "clk26m_ck",
+ "audio_local_bus",
+ "mainpll_d7_d2",
+ "scp_adsp_audiodsp",
+ "audio_h";
+ memory-region = <&adsp_dma_mem_reserved>,
+ <&adsp_mem_reserved>;
+ power-domains = <&spm 6>; //MT8195_POWER_DOMAIN_ADSP
+ mbox-names = "rx", "tx";
+ mboxes = <&adsp_mailbox0>, <&adsp_mailbox1>;
+ };
diff --git a/Documentation/devicetree/bindings/dvfs/performance-domain.yaml b/Documentation/devicetree/bindings/dvfs/performance-domain.yaml
index c8b91207f34d..1dcb85a02a76 100644
--- a/Documentation/devicetree/bindings/dvfs/performance-domain.yaml
+++ b/Documentation/devicetree/bindings/dvfs/performance-domain.yaml
@@ -43,7 +43,6 @@ properties:
performance-domains:
$ref: '/schemas/types.yaml#/definitions/phandle-array'
- maxItems: 1
description:
A phandle and performance domain specifier as defined by bindings of the
performance controller/provider specified by phandle.
@@ -52,10 +51,16 @@ additionalProperties: true
examples:
- |
- performance: performance-controller@12340000 {
- compatible = "qcom,cpufreq-hw";
- reg = <0x12340000 0x1000>;
- #performance-domain-cells = <1>;
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ performance: performance-controller@11bc10 {
+ compatible = "mediatek,cpufreq-hw";
+ reg = <0 0x0011bc10 0 0x120>, <0 0x0011bd30 0 0x120>;
+
+ #performance-domain-cells = <1>;
+ };
};
// The node above defines a performance controller that is a performance
diff --git a/Documentation/devicetree/bindings/edac/dmc-520.yaml b/Documentation/devicetree/bindings/edac/dmc-520.yaml
index 3b6842e92d1b..84db3966662a 100644
--- a/Documentation/devicetree/bindings/edac/dmc-520.yaml
+++ b/Documentation/devicetree/bindings/edac/dmc-520.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/edac/dmc-520.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: ARM DMC-520 EDAC bindings
+title: ARM DMC-520 EDAC
maintainers:
- Lei Wang <lewan@microsoft.com>
diff --git a/Documentation/devicetree/bindings/eeprom/at24.txt b/Documentation/devicetree/bindings/eeprom/at24.txt
deleted file mode 100644
index c94acbb8cb0c..000000000000
--- a/Documentation/devicetree/bindings/eeprom/at24.txt
+++ /dev/null
@@ -1 +0,0 @@
-This file has been moved to at24.yaml.
diff --git a/Documentation/devicetree/bindings/eeprom/at24.yaml b/Documentation/devicetree/bindings/eeprom/at24.yaml
index 914a423ec449..84af0d5f52aa 100644
--- a/Documentation/devicetree/bindings/eeprom/at24.yaml
+++ b/Documentation/devicetree/bindings/eeprom/at24.yaml
@@ -10,6 +10,9 @@ title: I2C EEPROMs compatible with Atmel's AT24
maintainers:
- Bartosz Golaszewski <bgolaszewski@baylibre.com>
+allOf:
+ - $ref: /schemas/nvmem/nvmem.yaml
+
select:
properties:
compatible:
@@ -87,6 +90,10 @@ properties:
- items:
pattern: cs1024$
- items:
+ pattern: c1025$
+ - items:
+ pattern: cs1025$
+ - items:
pattern: c2048$
- items:
pattern: cs2048$
@@ -95,26 +102,31 @@ properties:
# These are special cases that don't conform to the above pattern.
# Each requires a standard at24 model as fallback.
- items:
- - const: nxp,se97b
- - const: atmel,24c02
+ - enum:
+ - rohm,br24g01
+ - rohm,br24t01
+ - const: atmel,24c01
- items:
- - const: renesas,r1ex24002
+ - enum:
+ - nxp,se97b
+ - renesas,r1ex24002
- const: atmel,24c02
- items:
+ - enum:
+ - onnn,cat24c04
+ - onnn,cat24c05
+ - const: atmel,24c04
+ - items:
- const: renesas,r1ex24016
- const: atmel,24c16
- items:
- const: giantec,gt24c32a
- const: atmel,24c32
- items:
- - const: renesas,r1ex24128
+ - enum:
+ - renesas,r1ex24128
+ - samsung,s524ad0xd1
- const: atmel,24c128
- - items:
- - const: rohm,br24g01
- - const: atmel,24c01
- - items:
- - const: rohm,br24t01
- - const: atmel,24c01
label:
description: Descriptive name of the EEPROM.
@@ -174,7 +186,7 @@ required:
- compatible
- reg
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/eeprom/at25.yaml b/Documentation/devicetree/bindings/eeprom/at25.yaml
index fbf99e346966..11e2a95a7bcb 100644
--- a/Documentation/devicetree/bindings/eeprom/at25.yaml
+++ b/Documentation/devicetree/bindings/eeprom/at25.yaml
@@ -44,8 +44,6 @@ properties:
reg:
maxItems: 1
- spi-max-frequency: true
-
pagesize:
$ref: /schemas/types.yaml#/definitions/uint32
enum: [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
@@ -105,6 +103,8 @@ required:
- spi-max-frequency
allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+ - $ref: /schemas/nvmem/nvmem.yaml
- if:
properties:
compatible:
@@ -117,12 +117,12 @@ allOf:
- size
- address-width
-additionalProperties: false
+unevaluatedProperties: false
examples:
- |
#include <dt-bindings/gpio/gpio.h>
- spi0 {
+ spi {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/eeprom/microchip,93lc46b.yaml b/Documentation/devicetree/bindings/eeprom/microchip,93lc46b.yaml
new file mode 100644
index 000000000000..144e86ce5c0a
--- /dev/null
+++ b/Documentation/devicetree/bindings/eeprom/microchip,93lc46b.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/eeprom/microchip,93lc46b.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip 93xx46 SPI compatible EEPROM family
+
+maintainers:
+ - Cory Tusar <cory.tusar@pid1solutions.com>
+
+properties:
+ compatible:
+ enum:
+ - atmel,at93c46
+ - atmel,at93c46d
+ - atmel,at93c56
+ - atmel,at93c66
+ - eeprom-93xx46
+ - microchip,93lc46b
+
+ data-size:
+ description: number of data bits per word
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [8, 16]
+
+ reg:
+ description: chip select of EEPROM
+ maxItems: 1
+
+ read-only:
+ description:
+ parameter-less property which disables writes to the EEPROM
+ type: boolean
+
+ select-gpios:
+ description:
+ specifies the GPIO that needs to be asserted prior to each access
+ of the EEPROM (e.g. for SPI bus multiplexing)
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - data-size
+ - spi-max-frequency
+
+allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+ - $ref: /schemas/nvmem/nvmem.yaml
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ eeprom@0 {
+ compatible = "eeprom-93xx46";
+ reg = <0>;
+ spi-max-frequency = <1000000>;
+ spi-cs-high;
+ data-size = <8>;
+ select-gpios = <&gpio4 4 GPIO_ACTIVE_HIGH>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/example-schema.yaml b/Documentation/devicetree/bindings/example-schema.yaml
index ff6ec65145cf..f4eec4c42fb3 100644
--- a/Documentation/devicetree/bindings/example-schema.yaml
+++ b/Documentation/devicetree/bindings/example-schema.yaml
@@ -11,7 +11,7 @@ $id: http://devicetree.org/schemas/example-schema.yaml#
# $schema is the meta-schema this schema should be validated with.
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: An example schema annotated with jsonschema details
+title: An Example Device
maintainers:
- Rob Herring <robh@kernel.org>
@@ -119,7 +119,7 @@ properties:
# valid for this binding.
clock-frequency:
- # The type is set in the core schema. Per device schema only need to set
+ # The type is set in the core schema. Per-device schemas only need to set
# constraints on the possible values.
minimum: 100
maximum: 400000
@@ -133,24 +133,24 @@ properties:
# *-supply is always a single phandle, so nothing more to define.
foo-supply: true
- # Vendor specific properties
+ # Vendor-specific properties
#
- # Vendor specific properties have slightly different schema requirements than
+ # Vendor-specific properties have slightly different schema requirements than
# common properties. They must have at least a type definition and
# 'description'.
vendor,int-property:
- description: Vendor specific properties must have a description
+ description: Vendor-specific properties must have a description
$ref: /schemas/types.yaml#/definitions/uint32
enum: [2, 4, 6, 8, 10]
vendor,bool-property:
- description: Vendor specific properties must have a description. Boolean
+ description: Vendor-specific properties must have a description. Boolean
properties are one case where the json-schema 'type' keyword can be used
directly.
type: boolean
vendor,string-array-property:
- description: Vendor specific properties should reference a type in the
+ description: Vendor-specific properties should reference a type in the
core schema.
$ref: /schemas/types.yaml#/definitions/string-array
items:
@@ -158,14 +158,26 @@ properties:
- enum: [baz, boo]
vendor,property-in-standard-units-microvolt:
- description: Vendor specific properties having a standard unit suffix
+ description: Vendor-specific properties having a standard unit suffix
don't need a type.
enum: [ 100, 200, 300 ]
+ vendor,int-array-variable-length-and-constrained-values:
+ description: An array may also constrain what elements can be used (e.g.
+ their range).
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ minItems: 2
+ maxItems: 3
+ items:
+ minimum: 0
+ maximum: 8
+
child-node:
description: Child nodes are just another property from a json-schema
perspective.
type: object # DT nodes are json objects
+ # Child nodes also need additionalProperties or unevaluatedProperties
+ additionalProperties: false
properties:
vendor,a-child-node-property:
description: Child node properties have all the same schema
@@ -207,6 +219,10 @@ allOf:
then:
required:
- foo-supply
+ else:
+ # Otherwise, the property is not allowed:
+ properties:
+ foo-supply: false
# Altering schema depending on presence of properties is usually done by
# dependencies (see above), however some adjustments might require if:
- if:
@@ -235,13 +251,13 @@ examples:
# be overridden or an appropriate parent bus node should be shown (such as on
# i2c buses).
#
- # Any includes used have to be explicitly included.
+ # Any includes used have to be explicitly included. Use 4-space indentation.
- |
node@1000 {
- compatible = "vendor,soc4-ip", "vendor,soc1-ip";
- reg = <0x1000 0x80>,
- <0x3000 0x80>;
- reg-names = "core", "aux";
- interrupts = <10>;
- interrupt-controller;
+ compatible = "vendor,soc4-ip", "vendor,soc1-ip";
+ reg = <0x1000 0x80>,
+ <0x3000 0x80>;
+ reg-names = "core", "aux";
+ interrupts = <10>;
+ interrupt-controller;
};
diff --git a/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml b/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml
index 20e1ccfc8630..e00c8072bae9 100644
--- a/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml
+++ b/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml
@@ -8,7 +8,6 @@ title: ChromeOS EC USB Type-C cable and accessories detection
maintainers:
- Benson Leung <bleung@chromium.org>
- - Enric Balletbo i Serra <enric.balletbo@collabora.com>
description: |
On ChromeOS systems with USB Type C ports, the ChromeOS Embedded Controller is
@@ -35,12 +34,13 @@ additionalProperties: false
examples:
- |
- spi0 {
+ spi {
#address-cells = <1>;
#size-cells = <0>;
cros-ec@0 {
compatible = "google,cros-ec-spi";
reg = <0>;
+ interrupts = <44 0>;
usbc_extcon0: extcon0 {
compatible = "google,extcon-usbc-cros-ec";
diff --git a/Documentation/devicetree/bindings/extcon/extcon-usbc-tusb320.yaml b/Documentation/devicetree/bindings/extcon/extcon-usbc-tusb320.yaml
index 9875b4d5c356..126107dd57b1 100644
--- a/Documentation/devicetree/bindings/extcon/extcon-usbc-tusb320.yaml
+++ b/Documentation/devicetree/bindings/extcon/extcon-usbc-tusb320.yaml
@@ -11,7 +11,9 @@ maintainers:
properties:
compatible:
- const: ti,tusb320
+ enum:
+ - ti,tusb320
+ - ti,tusb320l
reg:
maxItems: 1
@@ -28,7 +30,7 @@ additionalProperties: false
examples:
- |
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
tusb320@61 {
diff --git a/Documentation/devicetree/bindings/extcon/maxim,max77843.yaml b/Documentation/devicetree/bindings/extcon/maxim,max77843.yaml
new file mode 100644
index 000000000000..128960545640
--- /dev/null
+++ b/Documentation/devicetree/bindings/extcon/maxim,max77843.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/extcon/maxim,max77843.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Maxim MAX77843 MicroUSB and Companion Power Management IC Extcon
+
+maintainers:
+ - Chanwoo Choi <cw00.choi@samsung.com>
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+description: |
+ This is a part of device tree bindings for Maxim MAX77843 MicroUSB
+ Integrated Circuit (MUIC).
+
+ See also Documentation/devicetree/bindings/mfd/maxim,max77843.yaml for
+ additional information and example.
+
+properties:
+ compatible:
+ const: maxim,max77843-muic
+
+ connector:
+ $ref: /schemas/connector/usb-connector.yaml#
+
+ ports:
+ $ref: /schemas/graph.yaml#/properties/ports
+ description:
+ Any connector to the data bus of this controller should be modelled using
+ the OF graph bindings specified in bindings/graph.txt.
+ properties:
+ port:
+ $ref: /schemas/graph.yaml#/properties/port
+
+required:
+ - compatible
+ - connector
+
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/extcon/siliconmitus,sm5502-muic.yaml b/Documentation/devicetree/bindings/extcon/siliconmitus,sm5502-muic.yaml
index fd2e55088888..7a224b2f0977 100644
--- a/Documentation/devicetree/bindings/extcon/siliconmitus,sm5502-muic.yaml
+++ b/Documentation/devicetree/bindings/extcon/siliconmitus,sm5502-muic.yaml
@@ -20,11 +20,12 @@ properties:
enum:
- siliconmitus,sm5502-muic
- siliconmitus,sm5504-muic
+ - siliconmitus,sm5703-muic
reg:
maxItems: 1
- description: I2C slave address of the device. Usually 0x25 for SM5502,
- 0x14 for SM5504.
+ description: I2C slave address of the device. Usually 0x25 for SM5502
+ and SM5703, 0x14 for SM5504.
interrupts:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/firmware/amlogic,meson-gxbb-sm.yaml b/Documentation/devicetree/bindings/firmware/amlogic,meson-gxbb-sm.yaml
new file mode 100644
index 000000000000..8f50e698760e
--- /dev/null
+++ b/Documentation/devicetree/bindings/firmware/amlogic,meson-gxbb-sm.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/firmware/amlogic,meson-gxbb-sm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Amlogic Secure Monitor (SM)
+
+description:
+ In the Amlogic SoCs the Secure Monitor code is used to provide access to the
+ NVMEM, enable JTAG, set USB boot, etc.
+
+maintainers:
+ - Neil Armstrong <neil.armstrong@linaro.org>
+
+properties:
+ compatible:
+ oneOf:
+ - const: amlogic,meson-gxbb-sm
+ - items:
+ - const: amlogic,meson-gx-sm
+ - const: amlogic,meson-gxbb-sm
+
+ power-controller:
+ type: object
+ $ref: /schemas/power/amlogic,meson-sec-pwrc.yaml#
+
+required:
+ - compatible
+
+additionalProperties: false
+
+examples:
+ - |
+ firmware {
+ secure-monitor {
+ compatible = "amlogic,meson-gxbb-sm";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/firmware/arm,scmi.yaml b/Documentation/devicetree/bindings/firmware/arm,scmi.yaml
index 5c4c6782e052..5824c43e9893 100644
--- a/Documentation/devicetree/bindings/firmware/arm,scmi.yaml
+++ b/Documentation/devicetree/bindings/firmware/arm,scmi.yaml
@@ -5,7 +5,7 @@
$id: http://devicetree.org/schemas/firmware/arm,scmi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: System Control and Management Interface (SCMI) Message Protocol bindings
+title: System Control and Management Interface (SCMI) Message Protocol
maintainers:
- Sudeep Holla <sudeep.holla@arm.com>
@@ -38,6 +38,9 @@ properties:
The virtio transport only supports a single device.
items:
- const: arm,scmi-virtio
+ - description: SCMI compliant firmware with OP-TEE transport
+ items:
+ - const: linaro,scmi-optee
interrupts:
description:
@@ -53,17 +56,38 @@ properties:
description:
Specifies the mailboxes used to communicate with SCMI compliant
firmware.
- items:
- - const: tx
- - const: rx
+ oneOf:
+ - items:
+ - const: tx
+ - const: rx
+ minItems: 1
+ - items:
+ - const: tx
+ - const: tx_reply
+ - const: rx
+ minItems: 2
mboxes:
description:
List of phandle and mailbox channel specifiers. It should contain
- exactly one or two mailboxes, one for transmitting messages("tx")
- and another optional for receiving the notifications("rx") if supported.
+ exactly one, two or three mailboxes; the first one or two for transmitting
+ messages ("tx") and another optional ("rx") for receiving notifications
+ and delayed responses, if supported by the platform.
+ The number of mailboxes needed for transmitting messages depends on the
+ type of channels exposed by the specific underlying mailbox controller;
+ a single channel descriptor is enough if the channel is bidirectional,
+ while two channel descriptors are needed to represent the SCMI ("tx")
+ channel if the underlying mailbox channels are unidirectional.
+ The effective combination of mboxes and shmem descriptors lets the
+ SCMI subsystem determine unambiguously which type of SCMI channels are
+ made available by the underlying mailbox controller and how to use them.
+ 1 mbox / 1 shmem => SCMI TX over 1 mailbox bidirectional channel
+ 2 mbox / 2 shmem => SCMI TX and RX over 2 mailbox bidirectional channels
+ 2 mbox / 1 shmem => SCMI TX over 2 mailbox unidirectional channels
+ 3 mbox / 2 shmem => SCMI TX and RX over 3 mailbox unidirectional channels
+ Any other combination of mboxes and shmem is invalid.
minItems: 1
- maxItems: 2
+ maxItems: 3
shmem:
description:
@@ -78,13 +102,28 @@ properties:
'#size-cells':
const: 0
+ atomic-threshold-us:
+ description:
+ An optional time value, expressed in microseconds, representing, on this
+ platform, the threshold above which any SCMI command, advertised to have
+ a higher-than-threshold execution latency, should not be considered for
+ atomic mode of operation, even if requested.
+ default: 0
+
arm,smc-id:
$ref: /schemas/types.yaml#/definitions/uint32
description:
SMC id required when using smc or hvc transports
+ linaro,optee-channel-id:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Channel specifier required when using OP-TEE transport.
+
protocol@11:
- type: object
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
properties:
reg:
const: 0x11
@@ -96,7 +135,9 @@ properties:
- '#power-domain-cells'
protocol@13:
- type: object
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
properties:
reg:
const: 0x13
@@ -108,7 +149,9 @@ properties:
- '#clock-cells'
protocol@14:
- type: object
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
properties:
reg:
const: 0x14
@@ -120,7 +163,9 @@ properties:
- '#clock-cells'
protocol@15:
- type: object
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
properties:
reg:
const: 0x15
@@ -132,7 +177,9 @@ properties:
- '#thermal-sensor-cells'
protocol@16:
- type: object
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
properties:
reg:
const: 0x16
@@ -144,20 +191,31 @@ properties:
- '#reset-cells'
protocol@17:
- type: object
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
properties:
reg:
const: 0x17
regulators:
type: object
+ additionalProperties: false
description:
The list of all regulators provided by this SCMI controller.
+ properties:
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
patternProperties:
- '^regulators@[0-9a-f]+$':
+ '^regulator@[0-9a-f]+$':
type: object
$ref: "../regulator/regulator.yaml#"
+ unevaluatedProperties: false
properties:
reg:
@@ -167,10 +225,18 @@ properties:
required:
- reg
+ protocol@18:
+ $ref: '#/$defs/protocol-node'
+ unevaluatedProperties: false
+
+ properties:
+ reg:
+ const: 0x18
+
additionalProperties: false
-patternProperties:
- '^protocol@[0-9a-f]+$':
+$defs:
+ protocol-node:
type: object
description:
Each sub-node represents a protocol supported. If the platform
@@ -183,18 +249,31 @@ patternProperties:
maxItems: 1
mbox-names:
- items:
- - const: tx
- - const: rx
+ oneOf:
+ - items:
+ - const: tx
+ - const: rx
+ minItems: 1
+ - items:
+ - const: tx
+ - const: tx_reply
+ - const: rx
+ minItems: 2
mboxes:
minItems: 1
- maxItems: 2
+ maxItems: 3
shmem:
minItems: 1
maxItems: 2
+ linaro,optee-channel-id:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Channel specifier required when using the OP-TEE transport and the
+ protocol has a dedicated communication channel.
+
required:
- reg
@@ -226,6 +305,16 @@ else:
- arm,smc-id
- shmem
+ else:
+ if:
+ properties:
+ compatible:
+ contains:
+ const: linaro,scmi-optee
+ then:
+ required:
+ - linaro,optee-channel-id
+
examples:
- |
firmware {
@@ -240,6 +329,8 @@ examples:
#address-cells = <1>;
#size-cells = <0>;
+ atomic-threshold-us = <10000>;
+
scmi_devpd: protocol@11 {
reg = <0x11>;
#power-domain-cells = <1>;
@@ -289,6 +380,10 @@ examples:
};
};
};
+
+ scmi_powercap: protocol@18 {
+ reg = <0x18>;
+ };
};
};
@@ -330,7 +425,7 @@ examples:
firmware {
scmi {
compatible = "arm,scmi-smc";
- shmem = <&cpu_scp_lpri0 &cpu_scp_lpri1>;
+ shmem = <&cpu_scp_lpri0>, <&cpu_scp_lpri1>;
arm,smc-id = <0xc3000001>;
#address-cells = <1>;
@@ -340,7 +435,48 @@ examples:
reg = <0x11>;
#power-domain-cells = <1>;
};
+ };
+ };
+
+ - |
+ firmware {
+ scmi {
+ compatible = "linaro,scmi-optee";
+ linaro,optee-channel-id = <0>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ scmi_dvfs1: protocol@13 {
+ reg = <0x13>;
+ linaro,optee-channel-id = <1>;
+ shmem = <&cpu_optee_lpri0>;
+ #clock-cells = <1>;
+ };
+ scmi_clk0: protocol@14 {
+ reg = <0x14>;
+ #clock-cells = <1>;
+ };
+ };
+ };
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ sram@51000000 {
+ compatible = "mmio-sram";
+ reg = <0x0 0x51000000 0x0 0x10000>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0 0x0 0x51000000 0x10000>;
+
+ cpu_optee_lpri0: optee-sram-section@0 {
+ compatible = "arm,scmi-shmem";
+ reg = <0x0 0x80>;
+ };
};
};
diff --git a/Documentation/devicetree/bindings/firmware/arm,scpi.yaml b/Documentation/devicetree/bindings/firmware/arm,scpi.yaml
index 23b346bd1252..241317239ffc 100644
--- a/Documentation/devicetree/bindings/firmware/arm,scpi.yaml
+++ b/Documentation/devicetree/bindings/firmware/arm,scpi.yaml
@@ -5,7 +5,7 @@
$id: http://devicetree.org/schemas/firmware/arm,scpi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: System Control and Power Interface (SCPI) Message Protocol bindings
+title: System Control and Power Interface (SCPI) Message Protocol
maintainers:
- Sudeep Holla <sudeep.holla@arm.com>
@@ -43,6 +43,7 @@ properties:
by remote SCP firmware for use by SCPI message protocol should be
specified in any order.
minItems: 1
+ maxItems: 4
shmem:
description:
@@ -51,6 +52,7 @@ properties:
be any memory reserved for the purpose of this communication between the
processors.
minItems: 1
+ maxItems: 4
power-controller:
type: object
@@ -235,8 +237,8 @@ examples:
firmware {
scpi {
compatible = "amlogic,meson-gxbb-scpi", "arm,scpi-pre-1.0";
- mboxes = <&mailbox 1 &mailbox 2>;
- shmem = <&cpu_scp_lpri &cpu_scp_hpri>;
+ mboxes = <&mailbox 1>, <&mailbox 2>;
+ shmem = <&cpu_scp_lpri>, <&cpu_scp_hpri>;
scpi_sensors1: sensors {
compatible = "amlogic,meson-gxbb-scpi-sensors", "arm,scpi-sensors";
diff --git a/Documentation/devicetree/bindings/firmware/fsl,scu.yaml b/Documentation/devicetree/bindings/firmware/fsl,scu.yaml
new file mode 100644
index 000000000000..557e524786c2
--- /dev/null
+++ b/Documentation/devicetree/bindings/firmware/fsl,scu.yaml
@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/firmware/fsl,scu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP i.MX System Controller Firmware (SCFW)
+
+maintainers:
+ - Dong Aisheng <aisheng.dong@nxp.com>
+
+description:
+ The System Controller Firmware (SCFW) is a low-level system function
+ which runs on a dedicated Cortex-M core to provide power, clock, and
+ resource management. It exists on some i.MX8 processors, e.g. i.MX8QM
+ (QM, QP) and i.MX8QX (QXP, DX).
+ The AP communicates with the SC using a multi-ported MU module found
+ in the LSIO subsystem. The current definition of this MU module provides
+ 5 remote AP connections to the SC to support up to 5 execution environments
+ (TZ, HV, standard Linux, etc.). The SC side of this MU module interfaces
+ with the LSIO DSC IP bus. The SC firmware will communicate with this MU
+ using the MSI bus.
+
+properties:
+ compatible:
+ const: fsl,imx-scu
+
+ clock-controller:
+ description:
+ Clock controller node that provides the clocks controlled by the SCU
+ $ref: /schemas/clock/fsl,scu-clk.yaml
+
+ gpio:
+ description:
+ Control the GPIO PINs on SCU domain over the firmware APIs
+ $ref: /schemas/gpio/fsl,imx8qxp-sc-gpio.yaml
+
+ ocotp:
+ description:
+ OCOTP controller node provided by the SCU
+ $ref: /schemas/nvmem/fsl,scu-ocotp.yaml
+
+ keys:
+ description:
+ Keys provided by the SCU
+ $ref: /schemas/input/fsl,scu-key.yaml
+
+ mboxes:
+ description:
+ A list of phandles of TX MU channels followed by a list of phandles of
+ RX MU channels. The list may include one more optional MU channel for
+ the general interrupt at the end. The expected number of channels is
+ 1 TX and 1 RX channel if the MU instance is "fsl,imx8-mu-scu"
+ compatible, and 4 TX and 4 RX channels otherwise. All MU channels must
+ be within the same MU instance; cross-instance use is not allowed. The
+ MU instance can only be one of LSIO MU0~MU4 for imx8qxp and imx8qm.
+ Users need to ensure the instance used does not conflict with other
+ execution environments such as ATF.
+ oneOf:
+ - items:
+ - description: TX0 MU channel
+ - description: RX0 MU channel
+ - items:
+ - description: TX0 MU channel
+ - description: RX0 MU channel
+ - description: optional MU channel for general interrupt
+ - items:
+ - description: TX0 MU channel
+ - description: TX1 MU channel
+ - description: TX2 MU channel
+ - description: TX3 MU channel
+ - description: RX0 MU channel
+ - description: RX1 MU channel
+ - description: RX2 MU channel
+ - description: RX3 MU channel
+ - items:
+ - description: TX0 MU channel
+ - description: TX1 MU channel
+ - description: TX2 MU channel
+ - description: TX3 MU channel
+ - description: RX0 MU channel
+ - description: RX1 MU channel
+ - description: RX2 MU channel
+ - description: RX3 MU channel
+ - description: optional MU channel for general interrupt
+
+ mbox-names:
+ oneOf:
+ - items:
+ - const: tx0
+ - const: rx0
+ - items:
+ - const: tx0
+ - const: rx0
+ - const: gip3
+ - items:
+ - const: tx0
+ - const: tx1
+ - const: tx2
+ - const: tx3
+ - const: rx0
+ - const: rx1
+ - const: rx2
+ - const: rx3
+ - items:
+ - const: tx0
+ - const: tx1
+ - const: tx2
+ - const: tx3
+ - const: rx0
+ - const: rx1
+ - const: rx2
+ - const: rx3
+ - const: gip3
+
+ pinctrl:
+ description:
+ Pin controller provided by the SCU
+ $ref: /schemas/pinctrl/fsl,scu-pinctrl.yaml
+
+ power-controller:
+ description:
+ Power domains controller node that provides the power domains
+ controlled by the SCU
+ $ref: /schemas/power/fsl,scu-pd.yaml
+
+ rtc:
+ description:
+ RTC controller provided by the SCU
+ $ref: /schemas/rtc/fsl,scu-rtc.yaml
+
+ thermal-sensor:
+ description:
+ Thermal sensor provided by the SCU
+ $ref: /schemas/thermal/fsl,scu-thermal.yaml
+
+ watchdog:
+ description:
+ Watchdog controller provided by the SCU
+ $ref: /schemas/watchdog/fsl,scu-wdt.yaml
+
+required:
+ - compatible
+ - mbox-names
+ - mboxes
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/firmware/imx/rsrc.h>
+ #include <dt-bindings/input/input.h>
+ #include <dt-bindings/pinctrl/pads-imx8qxp.h>
+
+ firmware {
+ system-controller {
+ compatible = "fsl,imx-scu";
+ mbox-names = "tx0", "tx1", "tx2", "tx3",
+ "rx0", "rx1", "rx2", "rx3",
+ "gip3";
+ mboxes = <&lsio_mu1 0 0>, <&lsio_mu1 0 1>, <&lsio_mu1 0 2>,
+ <&lsio_mu1 0 3>, <&lsio_mu1 1 0>, <&lsio_mu1 1 1>,
+ <&lsio_mu1 1 2>, <&lsio_mu1 1 3>, <&lsio_mu1 3 3>;
+
+ clock-controller {
+ compatible = "fsl,imx8qxp-clk", "fsl,scu-clk";
+ #clock-cells = <2>;
+ };
+
+ pinctrl {
+ compatible = "fsl,imx8qxp-iomuxc";
+
+ pinctrl_lpuart0: lpuart0grp {
+ fsl,pins = <
+ IMX8QXP_UART0_RX_ADMA_UART0_RX 0x06000020
+ IMX8QXP_UART0_TX_ADMA_UART0_TX 0x06000020
+ >;
+ };
+ };
+
+ ocotp {
+ compatible = "fsl,imx8qxp-scu-ocotp";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ fec_mac0: mac@2c4 {
+ reg = <0x2c4 6>;
+ };
+ };
+
+ power-controller {
+ compatible = "fsl,imx8qxp-scu-pd", "fsl,scu-pd";
+ #power-domain-cells = <1>;
+ };
+
+ rtc {
+ compatible = "fsl,imx8qxp-sc-rtc";
+ };
+
+ keys {
+ compatible = "fsl,imx8qxp-sc-key", "fsl,imx-sc-key";
+ linux,keycodes = <KEY_POWER>;
+ };
+
+ watchdog {
+ compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
+ timeout-sec = <60>;
+ };
+
+ thermal-sensor {
+ compatible = "fsl,imx8qxp-sc-thermal", "fsl,imx-sc-thermal";
+ #thermal-sensor-cells = <1>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml b/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
index c435c9f369a4..9a785bbaafb7 100644
--- a/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
+++ b/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
@@ -37,6 +37,20 @@ properties:
should be named with the instance number of the NPE engine used for
the crypto engine.
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+patternProperties:
+ hss@[0-9]+$:
+ $ref: /schemas/net/intel,ixp4xx-hss.yaml#
+ type: object
+ description: Optional node for the High Speed Serial link (HSS); the
+ node should be named with the instance number of the NPE engine
+ used for the HSS.
+
required:
- compatible
- reg
@@ -45,9 +59,30 @@ additionalProperties: false
examples:
- |
+ #include <dt-bindings/gpio/gpio.h>
+
npe: npe@c8006000 {
compatible = "intel,ixp4xx-network-processing-engine";
reg = <0xc8006000 0x1000>, <0xc8007000 0x1000>, <0xc8008000 0x1000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ hss@0 {
+ compatible = "intel,ixp4xx-hss";
+ reg = <0>;
+ intel,npe-handle = <&npe 0>;
+ intel,queue-chl-rxtrig = <&qmgr 12>;
+ intel,queue-chl-txready = <&qmgr 34>;
+ intel,queue-pkt-rx = <&qmgr 13>;
+ intel,queue-pkt-tx = <&qmgr 14>, <&qmgr 15>, <&qmgr 16>, <&qmgr 17>;
+ intel,queue-pkt-rxfree = <&qmgr 18>, <&qmgr 19>, <&qmgr 20>, <&qmgr 21>;
+ intel,queue-pkt-txdone = <&qmgr 22>;
+ cts-gpios = <&gpio0 10 GPIO_ACTIVE_LOW>;
+ rts-gpios = <&gpio0 14 GPIO_ACTIVE_LOW>;
+ dcd-gpios = <&gpio0 6 GPIO_ACTIVE_LOW>;
+ dtr-gpios = <&gpio_74 2 GPIO_ACTIVE_LOW>;
+ clk-internal-gpios = <&gpio_74 0 GPIO_ACTIVE_HIGH>;
+ };
crypto {
compatible = "intel,ixp4xx-crypto";
diff --git a/Documentation/devicetree/bindings/firmware/meson/meson_sm.txt b/Documentation/devicetree/bindings/firmware/meson/meson_sm.txt
deleted file mode 100644
index c248cd44f727..000000000000
--- a/Documentation/devicetree/bindings/firmware/meson/meson_sm.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-* Amlogic Secure Monitor
-
-In the Amlogic SoCs the Secure Monitor code is used to provide access to the
-NVMEM, enable JTAG, set USB boot, etc...
-
-Required properties for the secure monitor node:
-- compatible: Should be "amlogic,meson-gxbb-sm"
-
-Example:
-
- firmware {
- sm: secure-monitor {
- compatible = "amlogic,meson-gxbb-sm";
- };
- };
diff --git a/Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.txt b/Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.txt
deleted file mode 100644
index e44a13bc06ed..000000000000
--- a/Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-NVIDIA Tegra Boot and Power Management Processor (BPMP)
-
-The BPMP is a specific processor in Tegra chip, which is designed for
-booting process handling and offloading the power management, clock
-management, and reset control tasks from the CPU. The binding document
-defines the resources that would be used by the BPMP firmware driver,
-which can create the interprocessor communication (IPC) between the CPU
-and BPMP.
-
-Required properties:
-- compatible
- Array of strings
- One of:
- - "nvidia,tegra186-bpmp"
-- mboxes : The phandle of mailbox controller and the mailbox specifier.
-- shmem : List of the phandle of the TX and RX shared memory area that
- the IPC between CPU and BPMP is based on.
-- #clock-cells : Should be 1.
-- #power-domain-cells : Should be 1.
-- #reset-cells : Should be 1.
-
-This node is a mailbox consumer. See the following files for details of
-the mailbox subsystem, and the specifiers implemented by the relevant
-provider(s):
-
-- .../mailbox/mailbox.txt
-- .../mailbox/nvidia,tegra186-hsp.txt
-
-This node is a clock, power domain, and reset provider. See the following
-files for general documentation of those features, and the specifiers
-implemented by this node:
-
-- .../clock/clock-bindings.txt
-- <dt-bindings/clock/tegra186-clock.h>
-- ../power/power-domain.yaml
-- <dt-bindings/power/tegra186-powergate.h>
-- .../reset/reset.txt
-- <dt-bindings/reset/tegra186-reset.h>
-
-The BPMP implements some services which must be represented by separate nodes.
-For example, it can provide access to certain I2C controllers, and the I2C
-bindings represent each I2C controller as a device tree node. Such nodes should
-be nested directly inside the main BPMP node.
-
-Software can determine whether a child node of the BPMP node represents a device
-by checking for a compatible property. Any node with a compatible property
-represents a device that can be instantiated. Nodes without a compatible
-property may be used to provide configuration information regarding the BPMP
-itself, although no such configuration nodes are currently defined by this
-binding.
-
-The BPMP firmware defines no single global name-/numbering-space for such
-services. Put another way, the numbering scheme for I2C buses is distinct from
-the numbering scheme for any other service the BPMP may provide (e.g. a future
-hypothetical SPI bus service). As such, child device nodes will have no reg
-property, and the BPMP node will have no #address-cells or #size-cells property.
-
-The shared memory bindings for BPMP
------------------------------------
-
-The shared memory area for the IPC TX and RX between CPU and BPMP are
-predefined and work on top of sysram, which is an SRAM inside the chip.
-
-See ".../sram/sram.txt" for the bindings.
-
-Example:
-
-hsp_top0: hsp@3c00000 {
- ...
- #mbox-cells = <2>;
-};
-
-sysram@30000000 {
- compatible = "nvidia,tegra186-sysram", "mmio-sram";
- reg = <0x0 0x30000000 0x0 0x50000>;
- #address-cells = <2>;
- #size-cells = <2>;
- ranges = <0 0x0 0x0 0x30000000 0x0 0x50000>;
-
- cpu_bpmp_tx: shmem@4e000 {
- compatible = "nvidia,tegra186-bpmp-shmem";
- reg = <0x0 0x4e000 0x0 0x1000>;
- label = "cpu-bpmp-tx";
- pool;
- };
-
- cpu_bpmp_rx: shmem@4f000 {
- compatible = "nvidia,tegra186-bpmp-shmem";
- reg = <0x0 0x4f000 0x0 0x1000>;
- label = "cpu-bpmp-rx";
- pool;
- };
-};
-
-bpmp {
- compatible = "nvidia,tegra186-bpmp";
- mboxes = <&hsp_top0 TEGRA_HSP_MBOX_TYPE_DB TEGRA_HSP_DB_MASTER_BPMP>;
- shmem = <&cpu_bpmp_tx &cpu_bpmp_rx>;
- #clock-cells = <1>;
- #power-domain-cells = <1>;
- #reset-cells = <1>;
-
- i2c {
- compatible = "...";
- ...
- };
-};
diff --git a/Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.yaml b/Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.yaml
new file mode 100644
index 000000000000..833c07f1685c
--- /dev/null
+++ b/Documentation/devicetree/bindings/firmware/nvidia,tegra186-bpmp.yaml
@@ -0,0 +1,186 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/firmware/nvidia,tegra186-bpmp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra Boot and Power Management Processor (BPMP)
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+description: |
+ The BPMP is a dedicated processor in the Tegra chip, designed to
+ handle the boot process and to offload the power management, clock
+ management, and reset control tasks from the CPU. This binding
+ document defines the resources used by the BPMP firmware driver,
+ which creates the interprocessor communication (IPC) between the
+ CPU and BPMP.
+
+ This node is a mailbox consumer. See the following files for details
+ of the mailbox subsystem, and the specifiers implemented by the
+ relevant provider(s):
+
+ - .../mailbox/mailbox.txt
+ - .../mailbox/nvidia,tegra186-hsp.yaml
+
+ This node is a clock, power domain, and reset provider. See the
+ following files for general documentation of those features, and the
+ specifiers implemented by this node:
+
+ - .../clock/clock-bindings.txt
+ - <dt-bindings/clock/tegra186-clock.h>
+ - ../power/power-domain.yaml
+ - <dt-bindings/power/tegra186-powergate.h>
+ - .../reset/reset.txt
+ - <dt-bindings/reset/tegra186-reset.h>
+
+ The BPMP implements some services which must be represented by
+ separate nodes. For example, it can provide access to certain I2C
+ controllers, and the I2C bindings represent each I2C controller as a
+ device tree node. Such nodes should be nested directly inside the main
+ BPMP node.
+
+ Software can determine whether a child node of the BPMP node
+ represents a device by checking for a compatible property. Any node
+ with a compatible property represents a device that can be
+ instantiated. Nodes without a compatible property may be used to
+ provide configuration information regarding the BPMP itself, although
+ no such configuration nodes are currently defined by this binding.
+
+ The BPMP firmware defines no single global name-/numbering-space for
+ such services. Put another way, the numbering scheme for I2C buses is
+ distinct from the numbering scheme for any other service the BPMP may
+ provide (e.g. a future hypothetical SPI bus service). As such, child
+ device nodes will have no reg property, and the BPMP node will have no
+ "#address-cells" or "#size-cells" property.
+
+ The shared memory area for the IPC TX and RX between CPU and BPMP are
+ predefined and work on top of sysram, which is an SRAM inside the
+ chip. See ".../sram/sram.yaml" for the bindings.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - nvidia,tegra194-bpmp
+ - nvidia,tegra234-bpmp
+ - const: nvidia,tegra186-bpmp
+ - const: nvidia,tegra186-bpmp
+
+ mboxes:
+ description: A phandle and channel specifier for the mailbox used to
+ communicate with the BPMP.
+ maxItems: 1
+
+ shmem:
+ description: List of phandles to the TX and RX shared memory areas
+ that the IPC between the CPU and BPMP is based on.
+ minItems: 2
+ maxItems: 2
+
+ "#clock-cells":
+ const: 1
+
+ "#power-domain-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ interconnects:
+ items:
+ - description: memory read client
+ - description: memory write client
+ - description: DMA read client
+ - description: DMA write client
+
+ interconnect-names:
+ items:
+ - const: read
+ - const: write
+ - const: dma-mem # dma-read
+ - const: dma-write
+
+ iommus:
+ maxItems: 1
+
+ i2c:
+ type: object
+
+ thermal:
+ type: object
+
+additionalProperties: false
+
+required:
+ - compatible
+ - mboxes
+ - shmem
+ - "#clock-cells"
+ - "#power-domain-cells"
+ - "#reset-cells"
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/mailbox/tegra186-hsp.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+
+ hsp_top0: hsp@3c00000 {
+ compatible = "nvidia,tegra186-hsp";
+ reg = <0x03c00000 0xa0000>;
+ interrupts = <GIC_SPI 176 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "doorbell";
+ #mbox-cells = <2>;
+ };
+
+ sram@30000000 {
+ compatible = "nvidia,tegra186-sysram", "mmio-sram";
+ reg = <0x30000000 0x50000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0x0 0x30000000 0x50000>;
+
+ cpu_bpmp_tx: sram@4e000 {
+ reg = <0x4e000 0x1000>;
+ label = "cpu-bpmp-tx";
+ pool;
+ };
+
+ cpu_bpmp_rx: sram@4f000 {
+ reg = <0x4f000 0x1000>;
+ label = "cpu-bpmp-rx";
+ pool;
+ };
+ };
+
+ bpmp {
+ compatible = "nvidia,tegra186-bpmp";
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_BPMPR &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_BPMPW &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_BPMPDMAR &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_BPMPDMAW &emc>;
+ interconnect-names = "read", "write", "dma-mem", "dma-write";
+ iommus = <&smmu TEGRA186_SID_BPMP>;
+ mboxes = <&hsp_top0 TEGRA_HSP_MBOX_TYPE_DB
+ TEGRA_HSP_DB_MASTER_BPMP>;
+ shmem = <&cpu_bpmp_tx>, <&cpu_bpmp_rx>;
+ #clock-cells = <1>;
+ #power-domain-cells = <1>;
+ #reset-cells = <1>;
+
+ i2c {
+ compatible = "nvidia,tegra186-bpmp-i2c";
+ nvidia,bpmp-bus-id = <5>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ thermal {
+ compatible = "nvidia,tegra186-bpmp-thermal";
+ #thermal-sensor-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/firmware/qcom,scm.txt b/Documentation/devicetree/bindings/firmware/qcom,scm.txt
deleted file mode 100644
index a7333ad938d2..000000000000
--- a/Documentation/devicetree/bindings/firmware/qcom,scm.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-QCOM Secure Channel Manager (SCM)
-
-Qualcomm processors include an interface to communicate to the secure firmware.
-This interface allows for clients to request different types of actions. These
-can include CPU power up/down, HDCP requests, loading of firmware, and other
-assorted actions.
-
-Required properties:
-- compatible: must contain one of the following:
- * "qcom,scm-apq8064"
- * "qcom,scm-apq8084"
- * "qcom,scm-ipq4019"
- * "qcom,scm-ipq806x"
- * "qcom,scm-ipq8074"
- * "qcom,scm-mdm9607"
- * "qcom,scm-msm8660"
- * "qcom,scm-msm8916"
- * "qcom,scm-msm8960"
- * "qcom,scm-msm8974"
- * "qcom,scm-msm8994"
- * "qcom,scm-msm8996"
- * "qcom,scm-msm8998"
- * "qcom,scm-sc7180"
- * "qcom,scm-sc7280"
- * "qcom,scm-sdm845"
- * "qcom,scm-sdx55"
- * "qcom,scm-sm8150"
- * "qcom,scm-sm8250"
- * "qcom,scm-sm8350"
- and:
- * "qcom,scm"
-- clocks: Specifies clocks needed by the SCM interface, if any:
- * core clock required for "qcom,scm-apq8064", "qcom,scm-msm8660" and
- "qcom,scm-msm8960"
- * core, iface and bus clocks required for "qcom,scm-apq8084",
- "qcom,scm-msm8916" and "qcom,scm-msm8974"
-- clock-names: Must contain "core" for the core clock, "iface" for the interface
- clock and "bus" for the bus clock per the requirements of the compatible.
-- qcom,dload-mode: phandle to the TCSR hardware block and offset of the
- download mode control register (optional)
-
-Example for MSM8916:
-
- firmware {
- scm {
- compatible = "qcom,msm8916", "qcom,scm";
- clocks = <&gcc GCC_CRYPTO_CLK> ,
- <&gcc GCC_CRYPTO_AXI_CLK>,
- <&gcc GCC_CRYPTO_AHB_CLK>;
- clock-names = "core", "bus", "iface";
- };
- };
diff --git a/Documentation/devicetree/bindings/firmware/qcom,scm.yaml b/Documentation/devicetree/bindings/firmware/qcom,scm.yaml
new file mode 100644
index 000000000000..367d04ad1923
--- /dev/null
+++ b/Documentation/devicetree/bindings/firmware/qcom,scm.yaml
@@ -0,0 +1,212 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/firmware/qcom,scm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: QCOM Secure Channel Manager (SCM)
+
+description: |
+ Qualcomm processors include an interface to communicate with the secure firmware.
+ This interface allows clients to request different types of actions.
+ These can include CPU power up/down, HDCP requests, loading of firmware,
+ and other assorted actions.
+
+maintainers:
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+ - Robert Marko <robimarko@gmail.com>
+ - Guru Das Srinagesh <quic_gurus@quicinc.com>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - qcom,scm-apq8064
+ - qcom,scm-apq8084
+ - qcom,scm-ipq4019
+ - qcom,scm-ipq5332
+ - qcom,scm-ipq6018
+ - qcom,scm-ipq806x
+ - qcom,scm-ipq8074
+ - qcom,scm-ipq9574
+ - qcom,scm-mdm9607
+ - qcom,scm-msm8226
+ - qcom,scm-msm8660
+ - qcom,scm-msm8916
+ - qcom,scm-msm8953
+ - qcom,scm-msm8960
+ - qcom,scm-msm8974
+ - qcom,scm-msm8976
+ - qcom,scm-msm8994
+ - qcom,scm-msm8996
+ - qcom,scm-msm8998
+ - qcom,scm-qcm2290
+ - qcom,scm-qcs404
+ - qcom,scm-qdu1000
+ - qcom,scm-sa8775p
+ - qcom,scm-sc7180
+ - qcom,scm-sc7280
+ - qcom,scm-sc8180x
+ - qcom,scm-sc8280xp
+ - qcom,scm-sdm670
+ - qcom,scm-sdm845
+ - qcom,scm-sdx55
+ - qcom,scm-sdx65
+ - qcom,scm-sm6115
+ - qcom,scm-sm6125
+ - qcom,scm-sm6350
+ - qcom,scm-sm6375
+ - qcom,scm-sm8150
+ - qcom,scm-sm8250
+ - qcom,scm-sm8350
+ - qcom,scm-sm8450
+ - qcom,scm-sm8550
+ - const: qcom,scm
+
+ clocks:
+ minItems: 1
+ maxItems: 3
+
+ clock-names:
+ minItems: 1
+ maxItems: 3
+
+ interconnects:
+ maxItems: 1
+
+ interconnect-names:
+ maxItems: 1
+
+ '#reset-cells':
+ const: 1
+
+ interrupts:
+ description:
+ The wait-queue interrupt that the firmware raises as part of the
+ handshake protocol to handle sleeping SCM calls.
+ maxItems: 1
+
+ qcom,dload-mode:
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+ items:
+ - items:
+ - description: phandle to TCSR hardware block
+ - description: offset of the download mode control register
+ description: TCSR hardware block
+
+allOf:
+ # Clocks
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,scm-apq8064
+ - qcom,scm-apq8084
+ - qcom,scm-mdm9607
+ - qcom,scm-msm8226
+ - qcom,scm-msm8660
+ - qcom,scm-msm8916
+ - qcom,scm-msm8953
+ - qcom,scm-msm8960
+ - qcom,scm-msm8974
+ - qcom,scm-msm8976
+ - qcom,scm-qcm2290
+ - qcom,scm-sm6375
+ then:
+ required:
+ - clocks
+ - clock-names
+ else:
+ properties:
+ clock-names: false
+ clocks: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,scm-apq8064
+ - qcom,scm-msm8660
+ - qcom,scm-msm8960
+ - qcom,scm-qcm2290
+ - qcom,scm-sm6375
+ then:
+ properties:
+ clock-names:
+ items:
+ - const: core
+
+ clocks:
+ maxItems: 1
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,scm-apq8084
+ - qcom,scm-mdm9607
+ - qcom,scm-msm8226
+ - qcom,scm-msm8916
+ - qcom,scm-msm8953
+ - qcom,scm-msm8974
+ - qcom,scm-msm8976
+ then:
+ properties:
+ clock-names:
+ items:
+ - const: core
+ - const: bus
+ - const: iface
+
+ clocks:
+ minItems: 3
+ maxItems: 3
+
+ # Interconnects
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,scm-qdu1000
+ - qcom,scm-sm8450
+ - qcom,scm-sm8550
+ then:
+ properties:
+ interconnects: false
+
+ # Interrupts
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - qcom,scm-sm8450
+ - qcom,scm-sm8550
+ then:
+ properties:
+ interrupts: false
+
+required:
+ - compatible
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/qcom,gcc-msm8916.h>
+
+ firmware {
+ scm {
+ compatible = "qcom,scm-msm8916", "qcom,scm";
+ clocks = <&gcc GCC_CRYPTO_CLK>,
+ <&gcc GCC_CRYPTO_AXI_CLK>,
+ <&gcc GCC_CRYPTO_AHB_CLK>;
+ clock-names = "core", "bus", "iface";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/firmware/qemu,fw-cfg-mmio.yaml b/Documentation/devicetree/bindings/firmware/qemu,fw-cfg-mmio.yaml
new file mode 100644
index 000000000000..3faae3236665
--- /dev/null
+++ b/Documentation/devicetree/bindings/firmware/qemu,fw-cfg-mmio.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/firmware/qemu,fw-cfg-mmio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: QEMU Firmware Configuration
+
+maintainers:
+ - Rob Herring <robh@kernel.org>
+
+description: |
+ Various QEMU emulation / virtualization targets provide the following
+ Firmware Configuration interface on the "virt" machine type:
+
+ - A write-only, 16-bit wide selector (or control) register,
+ - a read-write, 64-bit wide data register.
+
+ QEMU exposes the control and data registers to guests as memory-mapped
+ registers; their location is communicated to the guest's UEFI firmware in the
+ DTB that QEMU places at the bottom of the guest's DRAM.
+
+ The authoritative guest-side hardware interface documentation for the fw_cfg
+ device can be found in "docs/specs/fw_cfg.txt" in the QEMU source tree.
+
+properties:
+ compatible:
+ const: qemu,fw-cfg-mmio
+
+ reg:
+ maxItems: 1
+ description: |
+ * Bytes 0x0 to 0x7 cover the data register.
+ * Bytes 0x8 to 0x9 cover the selector register.
+ * Further registers may be appended to the region in case of future interface
+ revisions / feature bits.
+
+ dma-coherent: true
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ fw-cfg@9020000 {
+ compatible = "qemu,fw-cfg-mmio";
+ reg = <0x9020000 0xa>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/fpga/fpga-region.txt b/Documentation/devicetree/bindings/fpga/fpga-region.txt
index 7d3515264838..6694ef29a267 100644
--- a/Documentation/devicetree/bindings/fpga/fpga-region.txt
+++ b/Documentation/devicetree/bindings/fpga/fpga-region.txt
@@ -330,7 +330,7 @@ succeeded.
The Device Tree Overlay will contain:
* "target-path" or "target"
- The insertion point where the the contents of the overlay will go into the
+ The insertion point where the contents of the overlay will go into the
live tree. target-path is a full path, while target is a phandle.
* "ranges"
The address space mapping from processor to FPGA bus(ses).
diff --git a/Documentation/devicetree/bindings/fpga/lattice,sysconfig.yaml b/Documentation/devicetree/bindings/fpga/lattice,sysconfig.yaml
new file mode 100644
index 000000000000..4fb05eb84e2a
--- /dev/null
+++ b/Documentation/devicetree/bindings/fpga/lattice,sysconfig.yaml
@@ -0,0 +1,81 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/fpga/lattice,sysconfig.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Lattice Slave SPI sysCONFIG FPGA manager
+
+maintainers:
+ - Ivan Bornyakov <i.bornyakov@metrotek.ru>
+
+description: |
+ The Lattice sysCONFIG port, which is used for FPGA configuration, has,
+ among others, a Slave Serial Peripheral Interface. Only full
+ reconfiguration is supported.
+
+ Programming of the ECP5 is done by writing an uncompressed bitstream
+ image in .bit format into the FPGA's SRAM configuration memory.
+
+properties:
+ compatible:
+ enum:
+ - lattice,sysconfig-ecp5
+
+ reg:
+ maxItems: 1
+
+ program-gpios:
+ description:
+ A GPIO line connected to PROGRAMN (active low) pin of the device.
+ Initiates configuration sequence.
+ maxItems: 1
+
+ init-gpios:
+ description:
+ A GPIO line connected to INITN (active low) pin of the device.
+ Indicates that the FPGA is ready to be configured.
+ maxItems: 1
+
+ done-gpios:
+ description:
+ A GPIO line connected to DONE (active high) pin of the device.
+ Indicates that the configuration sequence is complete.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: lattice,sysconfig-ecp5
+ then:
+ properties:
+ spi-max-frequency:
+ maximum: 60000000
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ fpga-mgr@0 {
+ compatible = "lattice,sysconfig-ecp5";
+ reg = <0>;
+ spi-max-frequency = <20000000>;
+ program-gpios = <&gpio3 4 GPIO_ACTIVE_LOW>;
+ init-gpios = <&gpio3 3 GPIO_ACTIVE_LOW>;
+ done-gpios = <&gpio3 2 GPIO_ACTIVE_HIGH>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/fpga/microchip,mpf-spi-fpga-mgr.yaml b/Documentation/devicetree/bindings/fpga/microchip,mpf-spi-fpga-mgr.yaml
new file mode 100644
index 000000000000..527532f039ce
--- /dev/null
+++ b/Documentation/devicetree/bindings/fpga/microchip,mpf-spi-fpga-mgr.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/fpga/microchip,mpf-spi-fpga-mgr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip Polarfire FPGA manager
+
+maintainers:
+ - Ivan Bornyakov <i.bornyakov@metrotek.ru>
+
+description:
+ Device Tree Bindings for Microchip Polarfire FPGA Manager using slave SPI to
+ load the bitstream in .dat format.
+
+properties:
+ compatible:
+ enum:
+ - microchip,mpf-spi-fpga-mgr
+
+ reg:
+ description: SPI chip select
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ fpga_mgr@0 {
+ compatible = "microchip,mpf-spi-fpga-mgr";
+ spi-max-frequency = <20000000>;
+ reg = <0>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/fpga/xilinx-pr-decoupler.txt b/Documentation/devicetree/bindings/fpga/xilinx-pr-decoupler.txt
deleted file mode 100644
index 0acdfa6d62a4..000000000000
--- a/Documentation/devicetree/bindings/fpga/xilinx-pr-decoupler.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-Xilinx LogiCORE Partial Reconfig Decoupler Softcore
-
-The Xilinx LogiCORE Partial Reconfig Decoupler manages one or more
-decouplers / fpga bridges.
-The controller can decouple/disable the bridges which prevents signal
-changes from passing through the bridge. The controller can also
-couple / enable the bridges which allows traffic to pass through the
-bridge normally.
-
-Xilinx LogiCORE Dynamic Function eXchange(DFX) AXI shutdown manager
-Softcore is compatible with the Xilinx LogiCORE pr-decoupler.
-
-The Dynamic Function eXchange AXI shutdown manager prevents AXI traffic
-from passing through the bridge. The controller safely handles AXI4MM
-and AXI4-Lite interfaces on a Reconfigurable Partition when it is
-undergoing dynamic reconfiguration, preventing the system deadlock
-that can occur if AXI transactions are interrupted by DFX
-
-The Driver supports only MMIO handling. A PR region can have multiple
-PR Decouplers which can be handled independently or chained via decouple/
-decouple_status signals.
-
-Required properties:
-- compatible : Should contain "xlnx,pr-decoupler-1.00" followed by
- "xlnx,pr-decoupler" or
- "xlnx,dfx-axi-shutdown-manager-1.00" followed by
- "xlnx,dfx-axi-shutdown-manager"
-- regs : base address and size for decoupler module
-- clocks : input clock to IP
-- clock-names : should contain "aclk"
-
-See Documentation/devicetree/bindings/fpga/fpga-region.txt and
-Documentation/devicetree/bindings/fpga/fpga-bridge.txt for generic bindings.
-
-Example:
-Partial Reconfig Decoupler:
- fpga-bridge@100000450 {
- compatible = "xlnx,pr-decoupler-1.00",
- "xlnx-pr-decoupler";
- regs = <0x10000045 0x10>;
- clocks = <&clkc 15>;
- clock-names = "aclk";
- bridge-enable = <0>;
- };
-
-Dynamic Function eXchange AXI shutdown manager:
- fpga-bridge@100000450 {
- compatible = "xlnx,dfx-axi-shutdown-manager-1.00",
- "xlnx,dfx-axi-shutdown-manager";
- regs = <0x10000045 0x10>;
- clocks = <&clkc 15>;
- clock-names = "aclk";
- bridge-enable = <0>;
- };
diff --git a/Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt b/Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt
deleted file mode 100644
index 5ef659c1394d..000000000000
--- a/Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-Xilinx Slave Serial SPI FPGA Manager
-
-Xilinx Spartan-6 and 7 Series FPGAs support a method of loading the
-bitstream over what is referred to as "slave serial" interface.
-The slave serial link is not technically SPI, and might require extra
-circuits in order to play nicely with other SPI slaves on the same bus.
-
-See:
-- https://www.xilinx.com/support/documentation/user_guides/ug380.pdf
-- https://www.xilinx.com/support/documentation/user_guides/ug470_7Series_Config.pdf
-- https://www.xilinx.com/support/documentation/application_notes/xapp583-fpga-configuration.pdf
-
-Required properties:
-- compatible: should contain "xlnx,fpga-slave-serial"
-- reg: spi chip select of the FPGA
-- prog_b-gpios: config pin (referred to as PROGRAM_B in the manual)
-- done-gpios: config status pin (referred to as DONE in the manual)
-
-Optional properties:
-- init-b-gpios: initialization status and configuration error pin
- (referred to as INIT_B in the manual)
-
-Example for full FPGA configuration:
-
- fpga-region0 {
- compatible = "fpga-region";
- fpga-mgr = <&fpga_mgr_spi>;
- #address-cells = <0x1>;
- #size-cells = <0x1>;
- };
-
- spi1: spi@10680 {
- compatible = "marvell,armada-xp-spi", "marvell,orion-spi";
- pinctrl-0 = <&spi0_pins>;
- pinctrl-names = "default";
- #address-cells = <1>;
- #size-cells = <0>;
- cell-index = <1>;
- interrupts = <92>;
- clocks = <&coreclk 0>;
-
- fpga_mgr_spi: fpga-mgr@0 {
- compatible = "xlnx,fpga-slave-serial";
- spi-max-frequency = <60000000>;
- spi-cpha;
- reg = <0>;
- prog_b-gpios = <&gpio0 29 GPIO_ACTIVE_LOW>;
- init-b-gpios = <&gpio0 28 GPIO_ACTIVE_LOW>;
- done-gpios = <&gpio0 9 GPIO_ACTIVE_HIGH>;
- };
- };
diff --git a/Documentation/devicetree/bindings/fpga/xilinx-zynq-fpga-mgr.yaml b/Documentation/devicetree/bindings/fpga/xilinx-zynq-fpga-mgr.yaml
index 29daca4be47f..f47b6140a742 100644
--- a/Documentation/devicetree/bindings/fpga/xilinx-zynq-fpga-mgr.yaml
+++ b/Documentation/devicetree/bindings/fpga/xilinx-zynq-fpga-mgr.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/fpga/xilinx-zynq-fpga-mgr.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Xilinx Zynq FPGA Manager Device Tree Bindings
+title: Xilinx Zynq FPGA Manager
maintainers:
- Michal Simek <michal.simek@xilinx.com>
diff --git a/Documentation/devicetree/bindings/fpga/xlnx,fpga-slave-serial.yaml b/Documentation/devicetree/bindings/fpga/xlnx,fpga-slave-serial.yaml
new file mode 100644
index 000000000000..614d86ad825f
--- /dev/null
+++ b/Documentation/devicetree/bindings/fpga/xlnx,fpga-slave-serial.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/fpga/xlnx,fpga-slave-serial.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx Slave Serial SPI FPGA
+
+maintainers:
+ - Nava kishore Manne <nava.kishore.manne@amd.com>
+
+description: |
+ Xilinx Spartan-6 and 7 Series FPGAs support a method of loading the bitstream
+ over what is referred to as slave serial interface. The slave serial link is
+ not technically SPI, and might require extra circuits in order to play nicely
+ with other SPI slaves on the same bus.
+
+ Datasheets:
+ https://www.xilinx.com/support/documentation/user_guides/ug380.pdf
+ https://www.xilinx.com/support/documentation/user_guides/ug470_7Series_Config.pdf
+ https://www.xilinx.com/support/documentation/application_notes/xapp583-fpga-configuration.pdf
+
+allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+properties:
+ compatible:
+ enum:
+ - xlnx,fpga-slave-serial
+
+ spi-cpha: true
+
+ spi-max-frequency:
+ maximum: 60000000
+
+ reg:
+ maxItems: 1
+
+ prog_b-gpios:
+ description:
+ config pin (referred to as PROGRAM_B in the manual)
+ maxItems: 1
+
+ done-gpios:
+ description:
+ config status pin (referred to as DONE in the manual)
+ maxItems: 1
+
+ init-b-gpios:
+ description:
+ initialization status and configuration error pin
+ (referred to as INIT_B in the manual)
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - prog_b-gpios
+ - done-gpios
+ - init-b-gpios
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ fpga_mgr_spi: fpga-mgr@0 {
+ compatible = "xlnx,fpga-slave-serial";
+ spi-max-frequency = <60000000>;
+ spi-cpha;
+ reg = <0>;
+ prog_b-gpios = <&gpio0 29 GPIO_ACTIVE_LOW>;
+ init-b-gpios = <&gpio0 28 GPIO_ACTIVE_LOW>;
+ done-gpios = <&gpio0 9 GPIO_ACTIVE_HIGH>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/fpga/xlnx,pr-decoupler.yaml b/Documentation/devicetree/bindings/fpga/xlnx,pr-decoupler.yaml
new file mode 100644
index 000000000000..a7d4b8e59e19
--- /dev/null
+++ b/Documentation/devicetree/bindings/fpga/xlnx,pr-decoupler.yaml
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/fpga/xlnx,pr-decoupler.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx LogiCORE Partial Reconfig Decoupler/AXI shutdown manager Softcore
+
+maintainers:
+ - Nava kishore Manne <nava.kishore.manne@amd.com>
+
+description: |
+ The Xilinx LogiCORE Partial Reconfig(PR) Decoupler manages one or more
+ decouplers/fpga bridges. The controller can decouple/disable the bridges
+ which prevents signal changes from passing through the bridge. The controller
+ can also couple / enable the bridges which allows traffic to pass through the
+ bridge normally.
+ Xilinx LogiCORE Dynamic Function eXchange(DFX) AXI shutdown manager Softcore
+ is compatible with the Xilinx LogiCORE pr-decoupler. The Dynamic Function
+ eXchange AXI shutdown manager prevents AXI traffic from passing through the
+ bridge. The controller safely handles AXI4MM and AXI4-Lite interfaces on a
+ Reconfigurable Partition when it is undergoing dynamic reconfiguration,
+ preventing the system deadlock that can occur if AXI transactions are
+ interrupted by DFX.
+ Please refer to fpga-region.txt and fpga-bridge.txt in this directory for
+ the common binding parts and usage.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: xlnx,pr-decoupler-1.00
+ - const: xlnx,pr-decoupler
+ - items:
+ - const: xlnx,dfx-axi-shutdown-manager-1.00
+ - const: xlnx,dfx-axi-shutdown-manager
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: aclk
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ fpga-bridge@100000450 {
+ compatible = "xlnx,pr-decoupler-1.00", "xlnx,pr-decoupler";
+ reg = <0x10000045 0x10>;
+ clocks = <&clkc 15>;
+ clock-names = "aclk";
+ };
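+
+ # The DFX AXI shutdown manager variant uses the same properties; this
+ # second example is carried over from the removed text binding, minus
+ # the bridge-enable property, which is not part of this schema.
+ - |
+ fpga-bridge@100000450 {
+ compatible = "xlnx,dfx-axi-shutdown-manager-1.00",
+ "xlnx,dfx-axi-shutdown-manager";
+ reg = <0x10000045 0x10>;
+ clocks = <&clkc 15>;
+ clock-names = "aclk";
+ };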
+...
diff --git a/Documentation/devicetree/bindings/fpga/xlnx,zynqmp-pcap-fpga.yaml b/Documentation/devicetree/bindings/fpga/xlnx,zynqmp-pcap-fpga.yaml
index 6cd2bdc06b5f..00a8d92ff736 100644
--- a/Documentation/devicetree/bindings/fpga/xlnx,zynqmp-pcap-fpga.yaml
+++ b/Documentation/devicetree/bindings/fpga/xlnx,zynqmp-pcap-fpga.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/fpga/xlnx,zynqmp-pcap-fpga.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Xilinx Zynq Ultrascale MPSoC FPGA Manager Device Tree Bindings
+title: Xilinx Zynq Ultrascale MPSoC FPGA Manager
maintainers:
- Nava kishore Manne <navam@xilinx.com>
diff --git a/Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.txt b/Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.txt
deleted file mode 100644
index b109911669e4..000000000000
--- a/Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-NVIDIA Tegra20/Tegra30/Tegr114/Tegra124 fuse block.
-
-Required properties:
-- compatible : For Tegra20, must contain "nvidia,tegra20-efuse". For Tegra30,
- must contain "nvidia,tegra30-efuse". For Tegra114, must contain
- "nvidia,tegra114-efuse". For Tegra124, must contain "nvidia,tegra124-efuse".
- For Tegra132 must contain "nvidia,tegra132-efuse", "nvidia,tegra124-efuse".
- For Tegra210 must contain "nvidia,tegra210-efuse". For Tegra186 must contain
- "nvidia,tegra186-efuse". For Tegra194 must contain "nvidia,tegra194-efuse".
- For Tegra234 must contain "nvidia,tegra234-efuse".
- Details:
- nvidia,tegra20-efuse: Tegra20 requires using APB DMA to read the fuse data
- due to a hardware bug. Tegra20 also lacks certain information which is
- available in later generations such as fab code, lot code, wafer id,..
- nvidia,tegra30-efuse, nvidia,tegra114-efuse and nvidia,tegra124-efuse:
- The differences between these SoCs are the size of the efuse array,
- the location of the spare (OEM programmable) bits and the location of
- the speedo data.
-- reg: Should contain 1 entry: the entry gives the physical address and length
- of the fuse registers.
-- clocks: Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
-- clock-names: Must include the following entries:
- - fuse
-- resets: Must contain an entry for each entry in reset-names.
- See ../reset/reset.txt for details.
-- reset-names: Must include the following entries:
- - fuse
-
-Example:
-
- fuse@7000f800 {
- compatible = "nvidia,tegra20-efuse";
- reg = <0x7000f800 0x400>,
- <0x70000000 0x400>;
- clocks = <&tegra_car TEGRA20_CLK_FUSE>;
- clock-names = "fuse";
- resets = <&tegra_car 39>;
- reset-names = "fuse";
- };
-
-
diff --git a/Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.yaml b/Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.yaml
new file mode 100644
index 000000000000..02f0b0462377
--- /dev/null
+++ b/Documentation/devicetree/bindings/fuse/nvidia,tegra20-fuse.yaml
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/fuse/nvidia,tegra20-fuse.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra FUSE block
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra20-efuse
+ - nvidia,tegra30-efuse
+ - nvidia,tegra114-efuse
+ - nvidia,tegra124-efuse
+ - nvidia,tegra210-efuse
+ - nvidia,tegra186-efuse
+ - nvidia,tegra194-efuse
+ - nvidia,tegra234-efuse
+
+ - items:
+ - const: nvidia,tegra132-efuse
+ - const: nvidia,tegra124-efuse
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: fuse
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ items:
+ - const: fuse
+
+ operating-points-v2: true
+
+ power-domains:
+ items:
+ - description: phandle to the core power domain
+
+additionalProperties: false
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra20-efuse
+ - nvidia,tegra30-efuse
+ - nvidia,tegra114-efuse
+ - nvidia,tegra124-efuse
+ - nvidia,tegra132-efuse
+ - nvidia,tegra210-efuse
+then:
+ required:
+ - resets
+ - reset-names
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra20-car.h>
+
+ fuse@7000f800 {
+ compatible = "nvidia,tegra20-efuse";
+ reg = <0x7000f800 0x400>;
+ clocks = <&tegra_car TEGRA20_CLK_FUSE>;
+ clock-names = "fuse";
+ resets = <&tegra_car 39>;
+ reset-names = "fuse";
+ };
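+
+ # A hedged sketch for one of the newer SoCs, for which the schema above
+ # does not require resets; the unit address, register size and clock
+ # specifier are illustrative only, not taken from a real device tree.
+ - |
+ fuse@3810000 {
+ compatible = "nvidia,tegra234-efuse";
+ reg = <0x3810000 0x600>;
+ clocks = <&bpmp 40>; /* illustrative clock specifier */
+ clock-names = "fuse";
+ };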
diff --git a/Documentation/devicetree/bindings/gnss/brcm,bcm4751.yaml b/Documentation/devicetree/bindings/gnss/brcm,bcm4751.yaml
new file mode 100644
index 000000000000..c21549e0fba6
--- /dev/null
+++ b/Documentation/devicetree/bindings/gnss/brcm,bcm4751.yaml
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gnss/brcm,bcm4751.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom BCM4751 family GNSS Receiver
+
+maintainers:
+ - Johan Hovold <johan@kernel.org>
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description:
+ Broadcom GPS chips can be used over the UART or I2C bus. The UART
+ bus requires CTS/RTS support. The part number on the capsule is more
+ elaborate than the compatible string; a BCM4751 may for example be
+ printed as BCM4751IFBG.
+
+allOf:
+ - $ref: gnss-common.yaml#
+
+properties:
+ compatible:
+ enum:
+ - brcm,bcm4751
+ - brcm,bcm4752
+ - brcm,bcm4753
+
+ reg:
+ description:
+ The I2C address; not required on UART buses.
+
+ vdd-auxin-supply:
+ description:
+ Main voltage supply, pin name VDD_AUXIN, typically connected directly
+ to a battery such as a 3.8 V LiIon cell, or to a 2.6 V supply.
+
+ vddio-supply:
+ description:
+ IO voltage supply, pin name VDDIO, typically 1.8V
+
+ reset-gpios:
+ maxItems: 1
+ description: An optional active-low reset line; it should be flagged
+ with GPIO_ACTIVE_LOW.
+
+ enable-gpios:
+ description: Enable GPIO line, connected to pins named REGPU or NSTANDBY.
+ If the line is active low such as NSTANDBY, it should be tagged
+ GPIO_ACTIVE_LOW.
+
+required:
+ - compatible
+ - enable-gpios
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ serial {
+ gnss {
+ compatible = "brcm,bcm4751";
+ vdd-auxin-supply = <&vbat>;
+ reset-gpios = <&gpio0 15 GPIO_ACTIVE_LOW>;
+ enable-gpios = <&gpio0 16 GPIO_ACTIVE_HIGH>;
+ current-speed = <38400>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gnss/gnss-common.yaml b/Documentation/devicetree/bindings/gnss/gnss-common.yaml
new file mode 100644
index 000000000000..963b926e30a7
--- /dev/null
+++ b/Documentation/devicetree/bindings/gnss/gnss-common.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gnss/gnss-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Common Properties for Global Navigation Satellite Systems (GNSS)
+ receiver devices
+
+maintainers:
+ - Johan Hovold <johan@kernel.org>
+
+description: |
+ This document defines device tree properties common to Global Navigation
+ Satellite System receivers.
+
+properties:
+ $nodename:
+ pattern: "^gnss(@.*)?$"
+
+ lna-supply:
+ description: A separate regulator supplying power for the Low Noise
+ Amplifier (LNA). This is an amplifier connected between the GNSS
+ device and the receiver antenna.
+
+ enable-gpios:
+ description: A GPIO line that will enable the GNSS receiver when
+ asserted. If this line is active low, the GPIO phandle should
+ consequently be tagged with the GPIO_ACTIVE_LOW flag so the operating
+ system can rely on asserting the line to enable the GNSS device.
+ maxItems: 1
+
+ timepulse-gpios:
+ description: The GPIO line used for the time pulse signal, when the
+ device provides its time pulse over a GPIO line.
+ maxItems: 1
+
+ current-speed:
+ description: The current active speed, i.e. the baud rate in bits per
+ second at which the device comes online.
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+additionalProperties: true
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ serial {
+ gnss {
+ compatible = "u-blox,neo-8";
+ vcc-supply = <&gnss_reg>;
+ timepulse-gpios = <&gpio0 16 GPIO_ACTIVE_HIGH>;
+ current-speed = <4800>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gnss/gnss.txt b/Documentation/devicetree/bindings/gnss/gnss.txt
deleted file mode 100644
index d6dc9c0d8249..000000000000
--- a/Documentation/devicetree/bindings/gnss/gnss.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-GNSS Receiver DT binding
-
-This documents the binding structure and common properties for GNSS receiver
-devices.
-
-A GNSS receiver node is a node named "gnss" and typically resides on a serial
-bus (e.g. UART, I2C or SPI).
-
-Please refer to the following documents for generic properties:
-
- Documentation/devicetree/bindings/serial/serial.yaml
- Documentation/devicetree/bindings/spi/spi-bus.txt
-
-Required properties:
-
-- compatible : A string reflecting the vendor and specific device the node
- represents
-
-Optional properties:
-- lna-supply : Separate supply for an LNA
-- enable-gpios : GPIO used to enable the device
-- timepulse-gpios : Time pulse GPIO
-
-Example:
-
-serial@1234 {
- compatible = "ns16550a";
-
- gnss {
- compatible = "u-blox,neo-8";
-
- vcc-supply = <&gnss_reg>;
- timepulse-gpios = <&gpio0 16 GPIO_ACTIVE_HIGH>;
-
- current-speed = <4800>;
- };
-};
diff --git a/Documentation/devicetree/bindings/gnss/mediatek.txt b/Documentation/devicetree/bindings/gnss/mediatek.txt
deleted file mode 100644
index 80cb802813c5..000000000000
--- a/Documentation/devicetree/bindings/gnss/mediatek.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-Mediatek-based GNSS Receiver DT binding
-
-Mediatek chipsets are used in GNSS-receiver modules produced by several
-vendors and can use a UART interface.
-
-Please see Documentation/devicetree/bindings/gnss/gnss.txt for generic
-properties.
-
-Required properties:
-
-- compatible : Must be
-
- "globaltop,pa6h"
-
-- vcc-supply : Main voltage regulator (pin name: VCC)
-
-Optional properties:
-
-- current-speed : Default UART baud rate
-- gnss-fix-gpios : GPIO used to determine device position fix state
- (pin name: FIX, 3D_FIX)
-- reset-gpios : GPIO used to reset the device (pin name: RESET, NRESET)
-- timepulse-gpios : Time pulse GPIO (pin name: PPS1, 1PPS)
-- vbackup-supply : Backup voltage regulator (pin name: VBAT, VBACKUP)
-
-Example:
-
-serial@1234 {
- compatible = "ns16550a";
-
- gnss {
- compatible = "globaltop,pa6h";
- vcc-supply = <&vcc_3v3>;
- };
-};
diff --git a/Documentation/devicetree/bindings/gnss/mediatek.yaml b/Documentation/devicetree/bindings/gnss/mediatek.yaml
new file mode 100644
index 000000000000..c0eb35beb2ef
--- /dev/null
+++ b/Documentation/devicetree/bindings/gnss/mediatek.yaml
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gnss/mediatek.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek GNSS Receiver
+
+maintainers:
+ - Johan Hovold <johan@kernel.org>
+
+description:
+ Mediatek chipsets are used in GNSS-receiver modules produced by several
+ vendors and can use a UART interface.
+
+allOf:
+ - $ref: gnss-common.yaml#
+
+properties:
+ compatible:
+ const: globaltop,pa6h
+
+ vcc-supply:
+ description:
+ Main voltage regulator, pin name VCC.
+
+ reset-gpios:
+ maxItems: 1
+ description: An optional reset line, with names such as RESET or NRESET.
+ If the line is active low it should be flagged with GPIO_ACTIVE_LOW.
+
+ timepulse-gpios:
+ description: Time pulse GPIO, with pin names such as PPS1 or 1PPS.
+
+ gnss-fix-gpios:
+ maxItems: 1
+ description: GPIO used to determine device position fix state, pin names
+ FIX or 3D_FIX.
+
+ vbackup-supply:
+ description:
+ Regulator providing backup voltage, pin names such as VBAT or VBACKUP.
+
+required:
+ - compatible
+ - vcc-supply
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ serial {
+ gnss {
+ compatible = "globaltop,pa6h";
+ vcc-supply = <&vcc_3v3>;
+ reset-gpios = <&gpio 1 GPIO_ACTIVE_LOW>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gnss/sirfstar.txt b/Documentation/devicetree/bindings/gnss/sirfstar.txt
deleted file mode 100644
index f4252b6b660b..000000000000
--- a/Documentation/devicetree/bindings/gnss/sirfstar.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-SiRFstar-based GNSS Receiver DT binding
-
-SiRFstar chipsets are used in GNSS-receiver modules produced by several
-vendors and can use UART, SPI or I2C interfaces.
-
-Please see Documentation/devicetree/bindings/gnss/gnss.txt for generic
-properties.
-
-Required properties:
-
-- compatible : Must be one of
-
- "fastrax,uc430"
- "linx,r4"
- "wi2wi,w2sg0004"
- "wi2wi,w2sg0008i"
- "wi2wi,w2sg0084i"
-
-- vcc-supply : Main voltage regulator (pin name: 3V3_IN, VCC, VDD)
-
-Required properties (I2C):
-- reg : I2C slave address
-
-Required properties (SPI):
-- reg : SPI chip select address
-
-Optional properties:
-
-- sirf,onoff-gpios : GPIO used to power on and off device (pin name: ON_OFF)
-- sirf,wakeup-gpios : GPIO used to determine device power state
- (pin name: RFPWRUP, WAKEUP)
-- timepulse-gpios : Time pulse GPIO (pin name: 1PPS, TM)
-
-Example:
-
-serial@1234 {
- compatible = "ns16550a";
-
- gnss {
- compatible = "wi2wi,w2sg0084i";
-
- vcc-supply = <&gnss_reg>;
- sirf,onoff-gpios = <&gpio0 16 GPIO_ACTIVE_HIGH>;
- sirf,wakeup-gpios = <&gpio0 17 GPIO_ACTIVE_HIGH>;
- };
-};
diff --git a/Documentation/devicetree/bindings/gnss/sirfstar.yaml b/Documentation/devicetree/bindings/gnss/sirfstar.yaml
new file mode 100644
index 000000000000..0bbe684d82e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/gnss/sirfstar.yaml
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gnss/sirfstar.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SiRFstar GNSS Receiver
+
+maintainers:
+ - Johan Hovold <johan@kernel.org>
+
+description:
+ The SiRFstar GNSS receivers have appeared over the years in a number of
+ different chips, starting with the SiRFstarIII, which was introduced in
+ 2004 and used in a lot of dedicated GPS devices. In 2009 SiRF was acquired
+ by CSR (Cambridge Silicon Radio) and in 2012 the CSR GPS business was
+ acquired by Samsung, while some products remained with CSR. In 2014 CSR
+ was acquired by Qualcomm, who still sell some of the SiRF products.
+
+ SiRF chips can be used over UART, I2C or SPI buses.
+
+allOf:
+ - $ref: gnss-common.yaml#
+
+properties:
+ compatible:
+ enum:
+ - csr,gsd4t
+ - csr,csrg05ta03-icje-r
+ - fastrax,uc430
+ - linx,r4
+ - wi2wi,w2sg0004
+ - wi2wi,w2sg0008i
+ - wi2wi,w2sg0084i
+
+ reg:
+ description:
+ The I2C address or SPI chip select address; not required on UART buses.
+
+ vcc-supply:
+ description:
+ Main voltage regulator, pin names such as 3V3_IN, VCC, VDD.
+
+ reset-gpios:
+ maxItems: 1
+ description: An optional active-low reset line; it should be flagged
+ with GPIO_ACTIVE_LOW.
+
+ sirf,onoff-gpios:
+ maxItems: 1
+ description: GPIO used to power on and off device, pin name ON_OFF.
+
+ sirf,wakeup-gpios:
+ maxItems: 1
+ description: GPIO used to determine device power state, pin names such
+ as RFPWRUP, WAKEUP.
+
+required:
+ - compatible
+ - vcc-supply
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ serial {
+ gnss {
+ compatible = "wi2wi,w2sg0084i";
+ vcc-supply = <&gnss_vcc_reg>;
+ reset-gpios = <&gpio0 15 GPIO_ACTIVE_LOW>;
+ sirf,onoff-gpios = <&gpio0 16 GPIO_ACTIVE_HIGH>;
+ sirf,wakeup-gpios = <&gpio0 17 GPIO_ACTIVE_HIGH>;
+ current-speed = <38400>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gnss/u-blox,neo-6m.yaml b/Documentation/devicetree/bindings/gnss/u-blox,neo-6m.yaml
new file mode 100644
index 000000000000..4835a280b3bf
--- /dev/null
+++ b/Documentation/devicetree/bindings/gnss/u-blox,neo-6m.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gnss/u-blox,neo-6m.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: U-blox GNSS Receiver
+
+allOf:
+ - $ref: gnss-common.yaml#
+
+maintainers:
+ - Johan Hovold <johan@kernel.org>
+
+description: >
+ The U-blox GNSS receivers can use UART, DDC (I2C), SPI and USB interfaces.
+
+properties:
+ compatible:
+ enum:
+ - u-blox,neo-6m
+ - u-blox,neo-8
+ - u-blox,neo-m8
+
+ reg:
+ description: >
+ The DDC (I2C) slave address, the SPI chip select address, or the number of
+ the USB hub port or USB host-controller port to which this device is
+ attached, depending on the bus used. Required on the DDC, SPI and USB buses.
+
+ vcc-supply:
+ description: >
+ Main voltage regulator
+
+ u-blox,extint-gpios:
+ maxItems: 1
+ description: >
+ GPIO connected to the "external interrupt" input pin
+
+ v-bckp-supply:
+ description: >
+ Backup voltage regulator
+
+required:
+ - compatible
+ - vcc-supply
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ serial {
+ gnss {
+ compatible = "u-blox,neo-8";
+ v-bckp-supply = <&gnss_v_bckp_reg>;
+ vcc-supply = <&gnss_vcc_reg>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gnss/u-blox.txt b/Documentation/devicetree/bindings/gnss/u-blox.txt
deleted file mode 100644
index 7cdefd058fe0..000000000000
--- a/Documentation/devicetree/bindings/gnss/u-blox.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-u-blox GNSS Receiver DT binding
-
-The u-blox GNSS receivers can use UART, DDC (I2C), SPI and USB interfaces.
-
-Please see Documentation/devicetree/bindings/gnss/gnss.txt for generic
-properties.
-
-Required properties:
-
-- compatible : Must be one of
-
- "u-blox,neo-6m"
- "u-blox,neo-8"
- "u-blox,neo-m8"
-
-- vcc-supply : Main voltage regulator
-
-Required properties (DDC):
-- reg : DDC (I2C) slave address
-
-Required properties (SPI):
-- reg : SPI chip select address
-
-Required properties (USB):
-- reg : Number of the USB hub port or the USB host-controller port
- to which this device is attached
-
-Optional properties:
-
-- timepulse-gpios : Time pulse GPIO
-- u-blox,extint-gpios : GPIO connected to the "external interrupt" input pin
-- v-bckp-supply : Backup voltage regulator
-
-Example:
-
-serial@1234 {
- compatible = "ns16550a";
-
- gnss {
- compatible = "u-blox,neo-8";
-
- v-bckp-supply = <&gnss_v_bckp_reg>;
- vcc-supply = <&gnss_vcc_reg>;
- };
-};
diff --git a/Documentation/devicetree/bindings/gpio/airoha,en7523-gpio.yaml b/Documentation/devicetree/bindings/gpio/airoha,en7523-gpio.yaml
new file mode 100644
index 000000000000..7c41d8e814cd
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/airoha,en7523-gpio.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/airoha,en7523-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Airoha EN7523 GPIO controller
+
+maintainers:
+ - John Crispin <john@phrozen.org>
+
+description: |
+ Airoha's GPIO controller on their ARM EN7523 SoCs consists of two banks of 32
+ GPIOs.
+
+properties:
+ $nodename:
+ pattern: "^gpio@[0-9a-f]+$"
+
+ compatible:
+ items:
+ - const: airoha,en7523-gpio
+
+ reg:
+ description: |
+ The first tuple points to the input register.
+ The second and third tuples point to the direction registers.
+ The fourth tuple points to the output register.
+ maxItems: 4
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-controller: true
+
+required:
+ - compatible
+ - reg
+ - "#gpio-cells"
+ - gpio-controller
+
+additionalProperties: false
+
+examples:
+ - |
+ gpio0: gpio@1fbf0200 {
+ compatible = "airoha,en7523-gpio";
+ reg = <0x1fbf0204 0x4>,
+ <0x1fbf0200 0x4>,
+ <0x1fbf0220 0x4>,
+ <0x1fbf0214 0x4>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ };
+
+ gpio1: gpio@1fbf0270 {
+ compatible = "airoha,en7523-gpio";
+ reg = <0x1fbf0270 0x4>,
+ <0x1fbf0260 0x4>,
+ <0x1fbf0264 0x4>,
+ <0x1fbf0278 0x4>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt b/Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
deleted file mode 100644
index 5d468ecd1809..000000000000
--- a/Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-Broadcom STB "UPG GIO" GPIO controller
-
-The controller's registers are organized as sets of eight 32-bit
-registers with each set controlling a bank of up to 32 pins. A single
-interrupt is shared for all of the banks handled by the controller.
-
-Required properties:
-
-- compatible:
- Must be "brcm,brcmstb-gpio"
-
-- reg:
- Define the base and range of the I/O address space containing
- the brcmstb GPIO controller registers
-
-- #gpio-cells:
- Should be <2>. The first cell is the pin number (within the controller's
- pin space), and the second is used for the following:
- bit[0]: polarity (0 for active-high, 1 for active-low)
-
-- gpio-controller:
- Specifies that the node is a GPIO controller.
-
-- brcm,gpio-bank-widths:
- Number of GPIO lines for each bank. Number of elements must
- correspond to number of banks suggested by the 'reg' property.
-
-Optional properties:
-
-- interrupts:
- The interrupt shared by all GPIO lines for this controller.
-
-- interrupts-extended:
- Alternate form of specifying interrupts and parents that allows for
- multiple parents. This takes precedence over 'interrupts' and
- 'interrupt-parent'. Wakeup-capable GPIO controllers often route their
- wakeup interrupt lines through a different interrupt controller than the
- primary interrupt line, making this property necessary.
-
-- #interrupt-cells:
- Should be <2>. The first cell is the GPIO number, the second should specify
- flags. The following subset of flags is supported:
- - bits[3:0] trigger type and level flags
- 1 = low-to-high edge triggered
- 2 = high-to-low edge triggered
- 4 = active high level-sensitive
- 8 = active low level-sensitive
- Valid combinations are 1, 2, 3, 4, 8.
- See also Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
-- interrupt-controller:
- Marks the device node as an interrupt controller
-
-- wakeup-source:
- GPIOs for this controller can be used as a wakeup source
-
-Example:
- upg_gio: gpio@f040a700 {
- #gpio-cells = <2>;
- #interrupt-cells = <2>;
- compatible = "brcm,bcm7445-gpio", "brcm,brcmstb-gpio";
- gpio-controller;
- interrupt-controller;
- reg = <0xf040a700 0x80>;
- interrupt-parent = <&irq0_intc>;
- interrupts = <0x6>;
- brcm,gpio-bank-widths = <32 32 32 24>;
- };
-
- upg_gio_aon: gpio@f04172c0 {
- #gpio-cells = <2>;
- #interrupt-cells = <2>;
- compatible = "brcm,bcm7445-gpio", "brcm,brcmstb-gpio";
- gpio-controller;
- interrupt-controller;
- reg = <0xf04172c0 0x40>;
- interrupt-parent = <&irq0_aon_intc>;
- interrupts = <0x6>;
- interrupts-extended = <&irq0_aon_intc 0x6>,
- <&aon_pm_l2_intc 0x5>;
- wakeup-source;
- brcm,gpio-bank-widths = <18 4>;
- };
diff --git a/Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml b/Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml
new file mode 100644
index 000000000000..4a896ff7edc5
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/brcm,brcmstb-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom STB "UPG GIO" GPIO controller
+
+description: >
+ The controller's registers are organized as sets of eight 32-bit
+ registers with each set controlling a bank of up to 32 pins. A single
+ interrupt is shared for all of the banks handled by the controller.
+
+maintainers:
+ - Doug Berger <opendmb@gmail.com>
+ - Florian Fainelli <f.fainelli@gmail.com>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - brcm,bcm7445-gpio
+ - const: brcm,brcmstb-gpio
+
+ reg:
+ maxItems: 1
+ description: >
+ Define the base and range of the I/O address space containing
+ the brcmstb GPIO controller registers
+
+ "#gpio-cells":
+ const: 2
+ description: >
+ The first cell is the pin number (within the controller's
+ pin space), and the second is used for the following:
+ bit[0]: polarity (0 for active-high, 1 for active-low)
+
+ gpio-controller: true
+
+ brcm,gpio-bank-widths:
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ description: >
+ Number of GPIO lines for each bank. The number of elements must
+ correspond to the number of banks suggested by the 'reg' property.
+
+ interrupts:
+ maxItems: 1
+ description: >
+ The interrupt shared by all GPIO lines for this controller.
+
+ "#interrupt-cells":
+ const: 2
+ description: |
+ The first cell is the GPIO number, the second should specify
+ flags. The following subset of flags is supported:
+ - bits[3:0] trigger type and level flags
+ 1 = low-to-high edge triggered
+ 2 = high-to-low edge triggered
+ 4 = active high level-sensitive
+ 8 = active low level-sensitive
+ Valid combinations are 1, 2, 3, 4, 8.
+
+ interrupt-controller: true
+
+ wakeup-source:
+ type: boolean
+ description: >
+ GPIOs for this controller can be used as a wakeup source
+
+required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+ - "brcm,gpio-bank-widths"
+
+additionalProperties: false
+
+examples:
+ - |
+ upg_gio: gpio@f040a700 {
+ #gpio-cells = <2>;
+ #interrupt-cells = <2>;
+ compatible = "brcm,bcm7445-gpio", "brcm,brcmstb-gpio";
+ gpio-controller;
+ interrupt-controller;
+ reg = <0xf040a700 0x80>;
+ interrupt-parent = <&irq0_intc>;
+ interrupts = <0x6>;
+ brcm,gpio-bank-widths = <32 32 32 24>;
+ };
+
+ upg_gio_aon: gpio@f04172c0 {
+ #gpio-cells = <2>;
+ #interrupt-cells = <2>;
+ compatible = "brcm,bcm7445-gpio", "brcm,brcmstb-gpio";
+ gpio-controller;
+ interrupt-controller;
+ reg = <0xf04172c0 0x40>;
+ interrupt-parent = <&irq0_aon_intc>;
+ interrupts = <0x6>;
+ wakeup-source;
+ brcm,gpio-bank-widths = <18 4>;
+ };
diff --git a/Documentation/devicetree/bindings/gpio/delta,tn48m-gpio.yaml b/Documentation/devicetree/bindings/gpio/delta,tn48m-gpio.yaml
new file mode 100644
index 000000000000..e3e668a12091
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/delta,tn48m-gpio.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/delta,tn48m-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Delta Networks TN48M CPLD GPIO controller
+
+maintainers:
+ - Robert Marko <robert.marko@sartura.hr>
+
+description: |
+ This module is part of the Delta TN48M multi-function device. For more
+ details see ../mfd/delta,tn48m-cpld.yaml.
+
+ The Delta TN48M has an onboard Lattice CPLD that is used as a GPIO
+ expander. It provides 12 pins in total; each of them is either input-only
+ or output-only.
+
+properties:
+ compatible:
+ enum:
+ - delta,tn48m-gpo
+ - delta,tn48m-gpi
+
+ reg:
+ maxItems: 1
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-controller: true
+
+required:
+ - compatible
+ - reg
+ - "#gpio-cells"
+ - gpio-controller
+
+additionalProperties: false
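+
+examples:
+ # A minimal sketch only; the parent bus and the 0x31 register offset are
+ # illustrative assumptions, not taken from TN48M documentation.
+ - |
+ gpio@31 {
+ compatible = "delta,tn48m-gpi";
+ reg = <0x31>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ };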
diff --git a/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml b/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml
index 5fe19fa5f67c..c0ad70e66f76 100644
--- a/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml
+++ b/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml
@@ -26,14 +26,13 @@ properties:
const: 2
registers-number:
+ $ref: /schemas/types.yaml#/definitions/uint32
description: Number of daisy-chained shift registers
enable-gpios:
description: GPIO connected to the OE (Output Enable) pin.
maxItems: 1
- spi-max-frequency: true
-
patternProperties:
"^(hog-[0-9]+|.+-hog(-[0-9]+)?)$":
type: object
@@ -58,7 +57,10 @@ required:
- '#gpio-cells'
- registers-number
-additionalProperties: false
+allOf:
+ - $ref: /schemas/spi/spi-peripheral-props.yaml#
+
+unevaluatedProperties: false
examples:
- |
diff --git a/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt b/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt
deleted file mode 100644
index d04236558619..000000000000
--- a/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Faraday Technology FTGPIO010 GPIO Controller
-
-Required properties:
-
-- compatible : Should be one of
- "cortina,gemini-gpio", "faraday,ftgpio010"
- "moxa,moxart-gpio", "faraday,ftgpio010"
- "faraday,ftgpio010"
-- reg : Should contain registers location and length
-- interrupts : Should contain the interrupt line for the GPIO block
-- gpio-controller : marks this as a GPIO controller
-- #gpio-cells : Should be 2, see gpio/gpio.txt
-- interrupt-controller : marks this as an interrupt controller
-- #interrupt-cells : a standard two-cell interrupt flag, see
- interrupt-controller/interrupts.txt
-
-Example:
-
-gpio@4d000000 {
- compatible = "cortina,gemini-gpio", "faraday,ftgpio010";
- reg = <0x4d000000 0x100>;
- interrupts = <22 IRQ_TYPE_LEVEL_HIGH>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
-};
diff --git a/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.yaml b/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.yaml
new file mode 100644
index 000000000000..640da5b9b0cc
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/faraday,ftgpio010.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Faraday Technology FTGPIO010 GPIO Controller
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: cortina,gemini-gpio
+ - const: faraday,ftgpio010
+ - items:
+ - const: moxa,moxart-gpio
+ - const: faraday,ftgpio010
+ - const: faraday,ftgpio010
+
+ reg:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+ description: Should contain the interrupt line for the GPIO block
+
+ gpio-controller: true
+ "#gpio-cells":
+ const: 2
+
+ interrupt-controller: true
+ "#interrupt-cells":
+ const: 2
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - "#gpio-cells"
+ - interrupt-controller
+ - "#interrupt-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ gpio@4d000000 {
+ compatible = "cortina,gemini-gpio", "faraday,ftgpio010";
+ reg = <0x4d000000 0x100>;
+ interrupts = <22 IRQ_TYPE_LEVEL_HIGH>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/gpio/fcs,fxl6408.yaml b/Documentation/devicetree/bindings/gpio/fcs,fxl6408.yaml
new file mode 100644
index 000000000000..65b6970e42fb
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/fcs,fxl6408.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/fcs,fxl6408.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Fairchild FXL6408 I2C GPIO Expander
+
+maintainers:
+ - Emanuele Ghidoli <emanuele.ghidoli@toradex.com>
+
+properties:
+ compatible:
+ enum:
+ - fcs,fxl6408
+
+ reg:
+ maxItems: 1
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-controller: true
+
+ gpio-line-names:
+ minItems: 1
+ maxItems: 8
+
+patternProperties:
+ "^(hog-[0-9]+|.+-hog(-[0-9]+)?)$":
+ required:
+ - gpio-hog
+
+required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ gpio_expander_43: gpio-expander@43 {
+ compatible = "fcs,fxl6408";
+ reg = <0x43>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ gpio-line-names = "Wi-Fi_W_DISABLE", "Wi-Fi_WKUP_WLAN",
+ "PWR_EN_+V3.3_WiFi_N", "PCIe_REF_CLK_EN",
+ "USB_RESET_N", "USB_BYPASS_N", "Wi-Fi_PDn",
+ "Wi-Fi_WKUP_BT";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gpio/fsl,imx8qxp-sc-gpio.yaml b/Documentation/devicetree/bindings/gpio/fsl,imx8qxp-sc-gpio.yaml
new file mode 100644
index 000000000000..b7b32220935d
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/fsl,imx8qxp-sc-gpio.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/fsl,imx8qxp-sc-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: GPIO driver over IMX SCU firmware API
+
+maintainers:
+ - Shenwei Wang <shenwei.wang@nxp.com>
+
+description: |
+ This module provides the standard interface to control the
+ resource pins in SCU domain on i.MX8 platforms.
+
+properties:
+ compatible:
+ enum:
+ - fsl,imx8qxp-sc-gpio
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-controller: true
+
+required:
+ - compatible
+ - "#gpio-cells"
+ - gpio-controller
+
+additionalProperties: false
+
+examples:
+ - |
+ gpio0: gpio {
+ compatible = "fsl,imx8qxp-sc-gpio";
+ gpio-controller;
+ #gpio-cells = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.yaml b/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.yaml
index f57d22d1ebd6..ae18603697d7 100644
--- a/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.yaml
@@ -37,6 +37,8 @@ properties:
- fsl,imx8mp-gpio
- fsl,imx8mq-gpio
- fsl,imx8qxp-gpio
+ - fsl,imxrt1050-gpio
+ - fsl,imxrt1170-gpio
- const: fsl,imx35-gpio
reg:
diff --git a/Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.txt b/Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.txt
deleted file mode 100644
index bef353f370d8..000000000000
--- a/Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Fujitsu MB86S7x GPIO Controller
--------------------------------
-
-Required properties:
-- compatible: Should be "fujitsu,mb86s70-gpio"
-- reg: Base address and length of register space
-- clocks: Specify the clock
-- gpio-controller: Marks the device node as a gpio controller.
-- #gpio-cells: Should be <2>. The first cell is the pin number and the
- second cell is used to specify optional parameters:
- - bit 0 specifies polarity (0 for normal, 1 for inverted).
-
-Examples:
- gpio0: gpio@31000000 {
- compatible = "fujitsu,mb86s70-gpio";
- reg = <0 0x31000000 0x10000>;
- gpio-controller;
- #gpio-cells = <2>;
- clocks = <&clk 0 2 1>;
- };
diff --git a/Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.yaml b/Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.yaml
new file mode 100644
index 000000000000..d18d95285465
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/fujitsu,mb86s70-gpio.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/fujitsu,mb86s70-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Fujitsu MB86S7x GPIO Controller
+
+maintainers:
+ - Jassi Brar <jaswinder.singh@linaro.org>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: socionext,synquacer-gpio
+ - const: fujitsu,mb86s70-gpio
+ - const: fujitsu,mb86s70-gpio
+
+ reg:
+ maxItems: 1
+
+ '#gpio-cells':
+ const: 2
+
+ gpio-controller: true
+ gpio-line-names: true
+
+ clocks:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - '#gpio-cells'
+ - gpio-controller
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ gpio@31000000 {
+ compatible = "fujitsu,mb86s70-gpio";
+ reg = <0x31000000 0x10000>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ clocks = <&clk 0 2 1>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/gpio/gpio-altera.txt b/Documentation/devicetree/bindings/gpio/gpio-altera.txt
index 146e554b3c67..2a80e272cd66 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-altera.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-altera.txt
@@ -9,8 +9,9 @@ Required properties:
- The second cell is reserved and is currently unused.
- gpio-controller : Marks the device node as a GPIO controller.
- interrupt-controller: Mark the device node as an interrupt controller
-- #interrupt-cells : Should be 1. The interrupt type is fixed in the hardware.
+- #interrupt-cells : Should be 2. The interrupt type is fixed in the hardware.
- The first cell is the GPIO offset number within the GPIO controller.
+ - The second cell is the interrupt trigger type and level flags.
- interrupts: Specify the interrupt.
- altr,interrupt-type: Specifies the interrupt trigger type the GPIO
hardware is synthesized. This field is required if the Altera GPIO controller
@@ -38,6 +39,6 @@ gpio_altr: gpio@ff200000 {
altr,interrupt-type = <IRQ_TYPE_EDGE_RISING>;
#gpio-cells = <2>;
gpio-controller;
- #interrupt-cells = <1>;
+ #interrupt-cells = <2>;
interrupt-controller;
};
diff --git a/Documentation/devicetree/bindings/gpio/gpio-axp209.txt b/Documentation/devicetree/bindings/gpio/gpio-axp209.txt
deleted file mode 100644
index fc42b2caa06d..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-axp209.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-AXP209 GPIO & pinctrl controller
-
-This driver follows the usual GPIO bindings found in
-Documentation/devicetree/bindings/gpio/gpio.txt
-
-This driver follows the usual pinctrl bindings found in
-Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
-
-This driver employs the per-pin muxing pattern.
-
-Required properties:
-- compatible: Should be one of:
- - "x-powers,axp209-gpio"
- - "x-powers,axp813-gpio"
-- #gpio-cells: Should be two. The first cell is the pin number and the
- second is the GPIO flags.
-- gpio-controller: Marks the device node as a GPIO controller.
-
-This node must be a subnode of the axp20x PMIC, documented in
-Documentation/devicetree/bindings/mfd/axp20x.txt
-
-Example:
-
-axp209: pmic@34 {
- compatible = "x-powers,axp209";
- reg = <0x34>;
- interrupt-parent = <&nmi_intc>;
- interrupts = <0 IRQ_TYPE_LEVEL_LOW>;
- interrupt-controller;
- #interrupt-cells = <1>;
-
- axp_gpio: gpio {
- compatible = "x-powers,axp209-gpio";
- gpio-controller;
- #gpio-cells = <2>;
- };
-};
-
-The GPIOs can be muxed to other functions and therefore, must be a subnode of
-axp_gpio.
-
-Example:
-
-&axp_gpio {
- gpio0_adc: gpio0-adc {
- pins = "GPIO0";
- function = "adc";
- };
-};
-
-&example_node {
- pinctrl-names = "default";
- pinctrl-0 = <&gpio0_adc>;
-};
-
-GPIOs and their functions
--------------------------
-
-Each GPIO is independent from the other (i.e. GPIO0 in gpio_in function does
-not force GPIO1 and GPIO2 to be in gpio_in function as well).
-
-axp209
-------
-GPIO | Functions
-------------------------
-GPIO0 | gpio_in, gpio_out, ldo, adc
-GPIO1 | gpio_in, gpio_out, ldo, adc
-GPIO2 | gpio_in, gpio_out
-
-axp813
-------
-GPIO | Functions
-------------------------
-GPIO0 | gpio_in, gpio_out, ldo, adc
-GPIO1 | gpio_in, gpio_out, ldo
diff --git a/Documentation/devicetree/bindings/gpio/gpio-consumer-common.yaml b/Documentation/devicetree/bindings/gpio/gpio-consumer-common.yaml
new file mode 100644
index 000000000000..40d0be31e200
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-consumer-common.yaml
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-consumer-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Common GPIO lines
+
+maintainers:
+ - Bartosz Golaszewski <brgl@bgdev.pl>
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description:
+ Pay attention to using the proper GPIO flag (e.g. GPIO_ACTIVE_LOW) for
+ GPIOs that use an inverted signal (e.g. RESETN).
+
+select: true
+
+properties:
+ enable-gpios:
+ maxItems: 1
+ description:
+ GPIO connected to the enable control pin.
+
+ reset-gpios:
+ description:
+ GPIO (or GPIOs for power sequence) connected to the device reset pin
+ (e.g. RESET or RESETN).
+
+ powerdown-gpios:
+ maxItems: 1
+ description:
+ GPIO connected to the power down pin (hardware power down or power cut,
+ e.g. PD or PWDN).
+
+ pwdn-gpios:
+ maxItems: 1
+ description: Use powerdown-gpios instead.
+ deprecated: true
+
+ wakeup-gpios:
+ maxItems: 1
+ description:
+ GPIO connected to the pin waking up the device from suspend or other
+ power-saving modes.
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - mmc-pwrseq-simple
+ then:
+ properties:
+ reset-gpios:
+ minItems: 1
+ maxItems: 32
+ else:
+ properties:
+ reset-gpios:
+ maxItems: 1
+
+additionalProperties: true
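+
+examples:
+ # Illustrative consumer node only; the compatible string and GPIO numbers
+ # are hypothetical. The active-low RESETN pin is flagged GPIO_ACTIVE_LOW,
+ # as the description above asks.
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ sensor {
+ compatible = "vendor,sensor"; /* hypothetical */
+ powerdown-gpios = <&gpio0 3 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio0 4 GPIO_ACTIVE_LOW>;
+ };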
diff --git a/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml b/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml
index f32e09ef937c..10e56cf306db 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml
+++ b/Documentation/devicetree/bindings/gpio/gpio-davinci.yaml
@@ -35,7 +35,7 @@ properties:
gpio-line-names:
description: strings describing the names of each gpio line.
minItems: 1
- maxItems: 100
+ maxItems: 144
"#gpio-cells":
const: 2
diff --git a/Documentation/devicetree/bindings/gpio/gpio-eic-sprd.txt b/Documentation/devicetree/bindings/gpio/gpio-eic-sprd.txt
deleted file mode 100644
index 54040a2bfe3a..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-eic-sprd.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-Spreadtrum EIC controller bindings
-
-The EIC is the abbreviation of external interrupt controller, which can
-be used only in input mode. The Spreadtrum platform has 2 EIC controllers,
-one is in digital chip, and another one is in PMIC. The digital chip EIC
-controller contains 4 sub-modules: EIC-debounce, EIC-latch, EIC-async and
-EIC-sync. But the PMIC EIC controller contains only one EIC-debounce sub-
-module.
-
-The EIC-debounce sub-module provides up to 8 source input signal
-connections. A debounce mechanism is used to capture the input signals'
-stable status (millisecond resolution) and a single-trigger mechanism
-is introduced into this sub-module to enhance the input event detection
-reliability. In addition, this sub-module's clock can be shut off
-automatically to reduce power dissipation. Moreover the debounce range
-is from 1ms to 4s with a step size of 1ms. The input signal will be
-ignored if it is asserted for less than 1 ms.
-
-The EIC-latch sub-module is used to latch some special power down signals
-and generate interrupts, since the EIC-latch does not depend on the APB
-clock to capture signals.
-
-The EIC-async sub-module uses a 32kHz clock to capture the short signals
-(microsecond resolution) to generate interrupts by level or edge trigger.
-
-The EIC-sync is similar with GPIO's input function, which is a synchronized
-signal input register. It can generate interrupts by level or edge trigger
-when detecting input signals.
-
-Required properties:
-- compatible: Should be one of the following:
- "sprd,sc9860-eic-debounce",
- "sprd,sc9860-eic-latch",
- "sprd,sc9860-eic-async",
- "sprd,sc9860-eic-sync",
- "sprd,sc2731-eic".
-- reg: Define the base and range of the I/O address space containing
- the GPIO controller registers.
-- gpio-controller: Marks the device node as a GPIO controller.
-- #gpio-cells: Should be <2>. The first cell is the gpio number and
- the second cell is used to specify optional parameters.
-- interrupt-controller: Marks the device node as an interrupt controller.
-- #interrupt-cells: Should be <2>. Specifies the number of cells needed
- to encode interrupt source.
-- interrupts: Should be the port interrupt shared by all the gpios.
-
-Example:
- eic_debounce: gpio@40210000 {
- compatible = "sprd,sc9860-eic-debounce";
- reg = <0 0x40210000 0 0x80>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>;
- };
-
- eic_latch: gpio@40210080 {
- compatible = "sprd,sc9860-eic-latch";
- reg = <0 0x40210080 0 0x20>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>;
- };
-
- eic_async: gpio@402100a0 {
- compatible = "sprd,sc9860-eic-async";
- reg = <0 0x402100a0 0 0x20>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>;
- };
-
- eic_sync: gpio@402100c0 {
- compatible = "sprd,sc9860-eic-sync";
- reg = <0 0x402100c0 0 0x20>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>;
- };
-
- pmic_eic: gpio@300 {
- compatible = "sprd,sc2731-eic";
- reg = <0x300>;
- interrupt-parent = <&sc2731_pmic>;
- interrupts = <5 IRQ_TYPE_LEVEL_HIGH>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-latch.yaml b/Documentation/devicetree/bindings/gpio/gpio-latch.yaml
new file mode 100644
index 000000000000..1ed82a2cebda
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-latch.yaml
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-latch.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: GPIO latch controller
+
+maintainers:
+ - Sascha Hauer <s.hauer@pengutronix.de>
+
+description: |
+ This binding describes a GPIO multiplexer based on latches connected to
+ other GPIOs, like this:
+
+ CLK0 ----------------------. ,--------.
+ CLK1 -------------------. `--------|> #0 |
+ | | |
+ OUT0 ----------------+--|-----------|D0 Q0|-----|<
+ OUT1 --------------+-|--|-----------|D1 Q1|-----|<
+ OUT2 ------------+-|-|--|-----------|D2 Q2|-----|<
+ OUT3 ----------+-|-|-|--|-----------|D3 Q3|-----|<
+ OUT4 --------+-|-|-|-|--|-----------|D4 Q4|-----|<
+ OUT5 ------+-|-|-|-|-|--|-----------|D5 Q5|-----|<
+ OUT6 ----+-|-|-|-|-|-|--|-----------|D6 Q6|-----|<
+ OUT7 --+-|-|-|-|-|-|-|--|-----------|D7 Q7|-----|<
+ | | | | | | | | | `--------'
+ | | | | | | | | |
+ | | | | | | | | | ,--------.
+ | | | | | | | | `-----------|> #1 |
+ | | | | | | | | | |
+ | | | | | | | `--------------|D0 Q0|-----|<
+ | | | | | | `----------------|D1 Q1|-----|<
+ | | | | | `------------------|D2 Q2|-----|<
+ | | | | `--------------------|D3 Q3|-----|<
+ | | | `----------------------|D4 Q4|-----|<
+ | | `------------------------|D5 Q5|-----|<
+ | `--------------------------|D6 Q6|-----|<
+ `----------------------------|D7 Q7|-----|<
+ `--------'
+
+ The number of clk-gpios and latched-gpios is not fixed. The actual
+ number of latches and the number of inputs per latch are derived from
+ the number of GPIOs given in the corresponding device tree properties.
+
+properties:
+ compatible:
+ const: gpio-latch
+ "#gpio-cells":
+ const: 2
+
+ clk-gpios:
+ description: Array of GPIOs to be used to clock a latch
+
+ latched-gpios:
+ description: Array of GPIOs to be used as inputs per latch
+
+ setup-duration-ns:
+ description: Delay in nanoseconds to wait after the latch inputs have been
+ set up
+
+ clock-duration-ns:
+ description: Delay in nanoseconds to wait between clock output changes
+
+ gpio-controller: true
+
+ gpio-line-names: true
+
+required:
+ - compatible
+ - "#gpio-cells"
+ - gpio-controller
+ - clk-gpios
+ - latched-gpios
+
+additionalProperties: false
+
+examples:
+ - |
+ gpio-latch {
+ #gpio-cells = <2>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_di_do_leds>;
+ compatible = "gpio-latch";
+ gpio-controller;
+ setup-duration-ns = <100>;
+ clock-duration-ns = <100>;
+
+ clk-gpios = <&gpio3 7 0>, <&gpio3 8 0>;
+ latched-gpios = <&gpio3 21 0>, <&gpio3 22 0>,
+ <&gpio3 23 0>, <&gpio3 24 0>,
+ <&gpio3 25 0>, <&gpio3 26 0>,
+ <&gpio3 27 0>, <&gpio3 28 0>;
+ };
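
A consumer then addresses the multiplexed outputs through the usual
two-cell specifier. A minimal sketch, assuming a gpio_latch label on the
node above, <dt-bindings/gpio/gpio.h> included, and the driver numbering
the virtual lines latch by latch from offset 0 (so line 10 would be
latch #1, output Q2):

    leds {
        compatible = "gpio-leds";

        led-status {
            /* virtual line 10 = latch #1, output Q2 (assumed mapping) */
            gpios = <&gpio_latch 10 GPIO_ACTIVE_HIGH>;
        };
    };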
diff --git a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
deleted file mode 100644
index 0fc6700ed800..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-* Marvell EBU GPIO controller
-
-Required properties:
-
-- compatible : Should be "marvell,orion-gpio", "marvell,mv78200-gpio",
- "marvell,armadaxp-gpio" or "marvell,armada-8k-gpio".
-
- "marvell,orion-gpio" should be used for Orion, Kirkwood, Dove,
- Discovery (except MV78200) and Armada 370. "marvell,mv78200-gpio"
- should be used for the Discovery MV78200.
-
- "marvel,armadaxp-gpio" should be used for all Armada XP SoCs
- (MV78230, MV78260, MV78460).
-
- "marvell,armada-8k-gpio" should be used for the Armada 7K and 8K
- SoCs (either from AP or CP), see
- Documentation/devicetree/bindings/arm/marvell/ap80x-system-controller.txt
- for specific details about the offset property.
-
-- reg: Address and length of the register set for the device. Only one
- entry is expected, except for the "marvell,armadaxp-gpio" variant
- for which two entries are expected: one for the general registers,
- one for the per-cpu registers. Not used for marvell,armada-8k-gpio.
-
-- interrupts: The list of interrupts that are used for all the pins
- managed by this GPIO bank. There can be more than one interrupt
- (example: 1 interrupt per 8 pins on Armada XP, which means 4
- interrupts per bank of 32 GPIOs).
-
-- interrupt-controller: identifies the node as an interrupt controller
-
-- #interrupt-cells: specifies the number of cells needed to encode an
- interrupt source. Should be two.
- The first cell is the GPIO number.
- The second cell is used to specify flags:
- bits[3:0] trigger type and level flags:
- 1 = low-to-high edge triggered.
- 2 = high-to-low edge triggered.
- 4 = active high level-sensitive.
- 8 = active low level-sensitive.
-
-- gpio-controller: marks the device node as a gpio controller
-
-- ngpios: number of GPIOs this controller has
-
-- #gpio-cells: Should be two. The first cell is the pin number. The
- second cell is reserved for flags, unused at the moment.
-
-Optional properties:
-
-In order to use the GPIO lines in PWM mode, some additional optional
-properties are required.
-
-- compatible: Must contain "marvell,armada-370-gpio"
-
-- reg: an additional register set is needed, for the GPIO Blink
- Counter on/off registers.
-
-- reg-names: Must contain an entry "pwm" corresponding to the
- additional register range needed for PWM operation.
-
-- #pwm-cells: Should be two. The first cell is the GPIO line number. The
- second cell is the period in nanoseconds.
-
-- clocks: Must be a phandle to the clock for the GPIO controller.
-
-Example:
-
- gpio0: gpio@d0018100 {
- compatible = "marvell,armadaxp-gpio";
- reg = <0xd0018100 0x40>,
- <0xd0018800 0x30>;
- ngpios = <32>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <16>, <17>, <18>, <19>;
- };
-
- gpio1: gpio@18140 {
- compatible = "marvell,armada-370-gpio";
- reg = <0x18140 0x40>, <0x181c8 0x08>;
- reg-names = "gpio", "pwm";
- ngpios = <17>;
- gpio-controller;
- #gpio-cells = <2>;
- #pwm-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <87>, <88>, <89>;
- clocks = <&coreclk 0>;
- };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-mvebu.yaml b/Documentation/devicetree/bindings/gpio/gpio-mvebu.yaml
new file mode 100644
index 000000000000..f1bd1e6b2e1f
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-mvebu.yaml
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-mvebu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell EBU GPIO controller
+
+maintainers:
+ - Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+ - Andrew Lunn <andrew@lunn.ch>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - marvell,armada-8k-gpio
+ - marvell,orion-gpio
+
+ - items:
+ - enum:
+ - marvell,mv78200-gpio
+ - marvell,armada-370-gpio
+ - const: marvell,orion-gpio
+
+ - description: Deprecated binding
+ items:
+ - const: marvell,armadaxp-gpio
+ - const: marvell,orion-gpio
+ deprecated: true
+
+ reg:
+ description: |
+ Address and length of the register set for the device. Not used for
+ marvell,armada-8k-gpio.
+
+ A second entry can be provided, for the PWM function using the GPIO Blink
+ Counter on/off registers.
+ minItems: 1
+ maxItems: 2
+
+ reg-names:
+ items:
+ - const: gpio
+ - const: pwm
+ minItems: 1
+
+ offset:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Offset in the register map for the gpio registers (in bytes)
+
+ interrupts:
+ description: |
+ The list of interrupts that are used for all the pins managed by this
+ GPIO bank. There can be more than one interrupt (example: 1 interrupt
+ per 8 pins on Armada XP, which means 4 interrupts per bank of 32
+ GPIOs).
+ minItems: 1
+ maxItems: 4
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 2
+
+ gpio-controller: true
+
+ ngpios:
+ minimum: 1
+ maximum: 32
+
+ "#gpio-cells":
+ const: 2
+
+ marvell,pwm-offset:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Offset in the register map for the pwm registers (in bytes)
+
+ "#pwm-cells":
+ description:
+ The first cell is the GPIO line number. The second cell is the period
+ in nanoseconds.
+ const: 2
+
+ clocks:
+ description:
+ Clock(s) used for PWM function.
+ items:
+ - description: Core clock
+ - description: AXI bus clock
+ minItems: 1
+
+ clock-names:
+ items:
+ - const: core
+ - const: axi
+ minItems: 1
+
+required:
+ - compatible
+ - gpio-controller
+ - ngpios
+ - "#gpio-cells"
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: marvell,armada-8k-gpio
+ then:
+ required:
+ - offset
+ else:
+ required:
+ - reg
+
+unevaluatedProperties: true
+
+examples:
+ - |
+ gpio@d0018100 {
+ compatible = "marvell,armadaxp-gpio", "marvell,orion-gpio";
+ reg = <0xd0018100 0x40>, <0xd0018800 0x30>;
+ ngpios = <32>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ interrupts = <16>, <17>, <18>, <19>;
+ };
+
+ - |
+ gpio@18140 {
+ compatible = "marvell,armada-370-gpio", "marvell,orion-gpio";
+ reg = <0x18140 0x40>, <0x181c8 0x08>;
+ reg-names = "gpio", "pwm";
+ ngpios = <17>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ #pwm-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ interrupts = <87>, <88>, <89>;
+ clocks = <&coreclk 0>;
+ };
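
Note that neither example exercises the marvell,armada-8k-gpio branch of
the allOf clause above, which requires "offset" instead of "reg". A
sketch of such a node, nested under a system-controller syscon as on the
AP/CP dies (the parent node, addresses and offset value are
illustrative):

    system-controller@6f4000 {
        compatible = "syscon", "simple-mfd";
        reg = <0x6f4000 0x1000>;

        ap_gpio: gpio@1040 {
            compatible = "marvell,armada-8k-gpio";
            offset = <0x1040>; /* GPIO registers inside the syscon */
            ngpios = <20>;
            gpio-controller;
            #gpio-cells = <2>;
        };
    };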
diff --git a/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml b/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml
index 338c5312a106..5b0134304e51 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml
+++ b/Documentation/devicetree/bindings/gpio/gpio-pca9570.yaml
@@ -12,7 +12,9 @@ maintainers:
properties:
compatible:
enum:
+ - dlg,slg7xl45106
- nxp,pca9570
+ - nxp,pca9571
reg:
maxItems: 1
@@ -32,7 +34,7 @@ additionalProperties: false
examples:
- |
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
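
The two added compatibles reuse the same node shape as the existing
pca9570 example. A minimal sketch for the 8-bit nxp,pca9571 (the I2C
address 0x25 is illustrative):

    i2c {
        #address-cells = <1>;
        #size-cells = <0>;

        gpio@25 {
            compatible = "nxp,pca9571";
            reg = <0x25>;
            gpio-controller;
            #gpio-cells = <2>;
        };
    };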
diff --git a/Documentation/devicetree/bindings/gpio/gpio-pca95xx.yaml b/Documentation/devicetree/bindings/gpio/gpio-pca95xx.yaml
index b6a6e742b66d..fa116148ee90 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-pca95xx.yaml
+++ b/Documentation/devicetree/bindings/gpio/gpio-pca95xx.yaml
@@ -15,51 +15,59 @@ description: |+
properties:
compatible:
- enum:
- - exar,xra1202
- - maxim,max7310
- - maxim,max7312
- - maxim,max7313
- - maxim,max7315
- - maxim,max7319
- - maxim,max7320
- - maxim,max7321
- - maxim,max7322
- - maxim,max7323
- - maxim,max7324
- - maxim,max7325
- - maxim,max7326
- - maxim,max7327
- - nxp,pca6416
- - nxp,pca9505
- - nxp,pca9506
- - nxp,pca9534
- - nxp,pca9535
- - nxp,pca9536
- - nxp,pca9537
- - nxp,pca9538
- - nxp,pca9539
- - nxp,pca9554
- - nxp,pca9555
- - nxp,pca9556
- - nxp,pca9557
- - nxp,pca9574
- - nxp,pca9575
- - nxp,pca9698
- - nxp,pcal6416
- - nxp,pcal6524
- - nxp,pcal9535
- - nxp,pcal9554b
- - nxp,pcal9555a
- - onnn,cat9554
- - onnn,pca9654
- - ti,pca6107
- - ti,pca9536
- - ti,tca6408
- - ti,tca6416
- - ti,tca6424
- - ti,tca9539
- - ti,tca9554
+ oneOf:
+ - items:
+ - const: diodes,pi4ioe5v6534q
+ - const: nxp,pcal6534
+ - items:
+ - enum:
+ - exar,xra1202
+ - maxim,max7310
+ - maxim,max7312
+ - maxim,max7313
+ - maxim,max7315
+ - maxim,max7319
+ - maxim,max7320
+ - maxim,max7321
+ - maxim,max7322
+ - maxim,max7323
+ - maxim,max7324
+ - maxim,max7325
+ - maxim,max7326
+ - maxim,max7327
+ - nxp,pca6408
+ - nxp,pca6416
+ - nxp,pca9505
+ - nxp,pca9506
+ - nxp,pca9534
+ - nxp,pca9535
+ - nxp,pca9536
+ - nxp,pca9537
+ - nxp,pca9538
+ - nxp,pca9539
+ - nxp,pca9554
+ - nxp,pca9555
+ - nxp,pca9556
+ - nxp,pca9557
+ - nxp,pca9574
+ - nxp,pca9575
+ - nxp,pca9698
+ - nxp,pcal6408
+ - nxp,pcal6416
+ - nxp,pcal6524
+ - nxp,pcal6534
+ - nxp,pcal9535
+ - nxp,pcal9554b
+ - nxp,pcal9555a
+ - onnn,cat9554
+ - onnn,pca9654
+ - ti,pca6107
+ - ti,pca9536
+ - ti,tca6408
+ - ti,tca6416
+ - ti,tca6424
+ - ti,tca9539
+ - ti,tca9554
reg:
maxItems: 1
@@ -143,7 +151,7 @@ examples:
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/interrupt-controller/irq.h>
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
@@ -169,7 +177,7 @@ examples:
- |
#include <dt-bindings/interrupt-controller/irq.h>
- i2c1 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
@@ -190,20 +198,12 @@ examples:
"chg-status+red", "green", "blue", "en-esata",
"fault1", "p26", "p27";
};
-
- ts3a227@3b {
- compatible = "ti,ts3a227e";
- reg = <0x3b>;
- interrupt-parent = <&gpio99>;
- interrupts = <14 IRQ_TYPE_EDGE_RISING>;
- ti,micbias = <0>; /* 2.1V */
- };
};
- |
#include <dt-bindings/interrupt-controller/irq.h>
- i2c2 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
@@ -221,7 +221,7 @@ examples:
};
- |
- i2c3 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
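
Per the new oneOf construct, the diodes,pi4ioe5v6534q entry is only
valid together with the nxp,pcal6534 fallback. A minimal sketch (the I2C
address 0x22 is illustrative):

    i2c {
        #address-cells = <1>;
        #size-cells = <0>;

        gpio@22 {
            compatible = "diodes,pi4ioe5v6534q", "nxp,pcal6534";
            reg = <0x22>;
            gpio-controller;
            #gpio-cells = <2>;
        };
    };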
diff --git a/Documentation/devicetree/bindings/gpio/gpio-pisosr.txt b/Documentation/devicetree/bindings/gpio/gpio-pisosr.txt
index 414a01cdf715..fba3c61f6a5b 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-pisosr.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-pisosr.txt
@@ -14,7 +14,7 @@ Optional properties:
- ngpios : Number of used GPIO lines (0..n-1), default is 8.
- load-gpios : GPIO pin specifier attached to load enable, this
pin is pulsed before reading from the device to
- load input pin values into the the device.
+ load input pin values into the device.
For other required and optional properties of SPI slave
nodes please refer to ../spi/spi-bus.txt.
diff --git a/Documentation/devicetree/bindings/gpio/gpio-samsung.txt b/Documentation/devicetree/bindings/gpio/gpio-samsung.txt
deleted file mode 100644
index 5375625e8cd2..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-samsung.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-Samsung Exynos4 GPIO Controller
-
-Required properties:
-- compatible: Compatible property value should be "samsung,exynos4-gpio>".
-
-- reg: Physical base address of the controller and length of memory mapped
- region.
-
-- #gpio-cells: Should be 4. The syntax of the gpio specifier used by client nodes
- should be the following with values derived from the SoC user manual.
- <[phandle of the gpio controller node]
- [pin number within the gpio controller]
- [mux function]
- [flags and pull up/down]
- [drive strength]>
-
- Values for gpio specifier:
- - Pin number: is a value between 0 to 7.
- - Flags and Pull Up/Down: 0 - Pull Up/Down Disabled.
- 1 - Pull Down Enabled.
- 3 - Pull Up Enabled.
- Bit 16 (0x00010000) - Input is active low.
- - Drive Strength: 0 - 1x,
- 1 - 3x,
- 2 - 2x,
- 3 - 4x
-
-- gpio-controller: Specifies that the node is a gpio controller.
-- #address-cells: should be 1.
-- #size-cells: should be 1.
-
-Example:
-
- gpa0: gpio-controller@11400000 {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "samsung,exynos4-gpio";
- reg = <0x11400000 0x20>;
- #gpio-cells = <4>;
- gpio-controller;
- };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-sprd.txt b/Documentation/devicetree/bindings/gpio/gpio-sprd.txt
deleted file mode 100644
index eca97d45388f..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-sprd.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-Spreadtrum GPIO controller bindings
-
-The controller's registers are organized as sets of sixteen 16-bit
-registers with each set controlling a bank of up to 16 pins. A single
-interrupt is shared for all of the banks handled by the controller.
-
-Required properties:
-- compatible: Should be "sprd,sc9860-gpio".
-- reg: Define the base and range of the I/O address space containing
-the GPIO controller registers.
-- gpio-controller: Marks the device node as a GPIO controller.
-- #gpio-cells: Should be <2>. The first cell is the gpio number and
-the second cell is used to specify optional parameters.
-- interrupt-controller: Marks the device node as an interrupt controller.
-- #interrupt-cells: Should be <2>. Specifies the number of cells needed
-to encode interrupt source.
-- interrupts: Should be the port interrupt shared by all the gpios.
-
-Example:
- ap_gpio: gpio@40280000 {
- compatible = "sprd,sc9860-gpio";
- reg = <0 0x40280000 0 0x1000>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
- interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
- };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-stmpe.txt b/Documentation/devicetree/bindings/gpio/gpio-stmpe.txt
index a0e4cf885213..b33f8f02c0d7 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-stmpe.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-stmpe.txt
@@ -8,8 +8,7 @@ Optional properties:
- st,norequest-mask: bitmask specifying which GPIOs should _not_ be requestable
due to different usage (e.g. touch, keypad)
-Node name must be stmpe_gpio and should be child node of stmpe node to which it
-belongs.
+Node should be child node of stmpe node to which it belongs.
Example:
stmpe_gpio {
diff --git a/Documentation/devicetree/bindings/gpio/gpio-tpic2810.txt b/Documentation/devicetree/bindings/gpio/gpio-tpic2810.txt
deleted file mode 100644
index 1afc2de7a537..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-tpic2810.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-TPIC2810 GPIO controller bindings
-
-Required properties:
- - compatible : Should be "ti,tpic2810".
- - reg : The I2C address of the device
- - gpio-controller : Marks the device node as a GPIO controller.
- - #gpio-cells : Should be two. For consumer use see gpio.txt.
-
-Example:
-
- gpio@60 {
- compatible = "ti,tpic2810";
- reg = <0x60>;
- gpio-controller;
- #gpio-cells = <2>;
- };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-tpic2810.yaml b/Documentation/devicetree/bindings/gpio/gpio-tpic2810.yaml
new file mode 100644
index 000000000000..157969bc4c46
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio-tpic2810.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/gpio-tpic2810.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TPIC2810 GPIO controller
+
+maintainers:
+ - Aswath Govindraju <a-govindraju@ti.com>
+
+properties:
+ compatible:
+ enum:
+ - ti,tpic2810
+
+ reg:
+ maxItems: 1
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-line-names:
+ minItems: 1
+ maxItems: 32
+
+required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ gpio@60 {
+ compatible = "ti,tpic2810";
+ reg = <0x60>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ gpio-line-names = "LED A", "LED B", "LED C";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-vf610.yaml b/Documentation/devicetree/bindings/gpio/gpio-vf610.yaml
index 19738a457a58..d2c39dba56ad 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-vf610.yaml
+++ b/Documentation/devicetree/bindings/gpio/gpio-vf610.yaml
@@ -24,6 +24,11 @@ properties:
- items:
- const: fsl,imx7ulp-gpio
- const: fsl,vf610-gpio
+ - items:
+ - enum:
+ - fsl,imx93-gpio
+ - fsl,imx8ulp-gpio
+ - const: fsl,imx7ulp-gpio
reg:
description: The first reg tuple represents the PORT module, the second tuple
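
The new entries must carry the fsl,imx7ulp-gpio fallback, so an i.MX93
port would be described roughly as below; the addresses and interrupt
number are illustrative, not taken from the i.MX93 reference manual:

    gpio@43810080 {
        compatible = "fsl,imx93-gpio", "fsl,imx7ulp-gpio";
        reg = <0x43810080 0x1000>, <0x43810000 0x40>;
        interrupts = <GIC_SPI 57 IRQ_TYPE_LEVEL_HIGH>;
        gpio-controller;
        #gpio-cells = <2>;
        interrupt-controller;
        #interrupt-cells = <2>;
    };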
diff --git a/Documentation/devicetree/bindings/gpio/gpio-xilinx.txt b/Documentation/devicetree/bindings/gpio/gpio-xilinx.txt
deleted file mode 100644
index e506f30e1a95..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-xilinx.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-Xilinx plb/axi GPIO controller
-
-Dual channel GPIO controller with configurable number of pins
-(from 1 to 32 per channel). Every pin can be configured as
-input/output/tristate. Both channels share the same global IRQ but
-local interrupts can be enabled on channel basis.
-
-Required properties:
-- compatible : Should be "xlnx,xps-gpio-1.00.a"
-- reg : Address and length of the register set for the device
-- #gpio-cells : Should be two. The first cell is the pin number and the
- second cell is used to specify optional parameters (currently unused).
-- gpio-controller : Marks the device node as a GPIO controller.
-
-Optional properties:
-- clocks : Input clock specifier. Refer to common clock bindings.
-- interrupts : Interrupt mapping for GPIO IRQ.
-- xlnx,all-inputs : if n-th bit is setup, GPIO-n is input
-- xlnx,dout-default : if n-th bit is 1, GPIO-n default value is 1
-- xlnx,gpio-width : gpio width
-- xlnx,tri-default : if n-th bit is 1, GPIO-n is in tristate mode
-- xlnx,is-dual : if 1, controller also uses the second channel
-- xlnx,all-inputs-2 : as above but for the second channel
-- xlnx,dout-default-2 : as above but the second channel
-- xlnx,gpio2-width : as above but for the second channel
-- xlnx,tri-default-2 : as above but for the second channel
-
-
-Example:
-gpio: gpio@40000000 {
- #gpio-cells = <2>;
- compatible = "xlnx,xps-gpio-1.00.a";
- clocks = <&clkc25>;
- gpio-controller ;
- interrupt-parent = <&microblaze_0_intc>;
- interrupts = < 6 2 >;
- reg = < 0x40000000 0x10000 >;
- xlnx,all-inputs = <0x0>;
- xlnx,all-inputs-2 = <0x0>;
- xlnx,dout-default = <0x0>;
- xlnx,dout-default-2 = <0x0>;
- xlnx,gpio-width = <0x2>;
- xlnx,gpio2-width = <0x2>;
- xlnx,interrupt-present = <0x1>;
- xlnx,is-dual = <0x1>;
- xlnx,tri-default = <0xffffffff>;
- xlnx,tri-default-2 = <0xffffffff>;
-} ;
diff --git a/Documentation/devicetree/bindings/gpio/gpio-xlp.txt b/Documentation/devicetree/bindings/gpio/gpio-xlp.txt
deleted file mode 100644
index 47fc64922fe0..000000000000
--- a/Documentation/devicetree/bindings/gpio/gpio-xlp.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-Netlogic XLP Family GPIO
-========================
-
-This GPIO driver is used for following Netlogic XLP SoCs:
- XLP832, XLP316, XLP208, XLP980, XLP532
-This GPIO driver is also compatible with GPIO controller found on
-Broadcom Vulcan ARM64.
-
-Required properties:
--------------------
-
-- compatible: Should be one of the following:
- - "netlogic,xlp832-gpio": For Netlogic XLP832
- - "netlogic,xlp316-gpio": For Netlogic XLP316
- - "netlogic,xlp208-gpio": For Netlogic XLP208
- - "netlogic,xlp980-gpio": For Netlogic XLP980
- - "netlogic,xlp532-gpio": For Netlogic XLP532
- - "brcm,vulcan-gpio": For Broadcom Vulcan ARM64
-- reg: Physical base address and length of the controller's registers.
-- #gpio-cells: Should be two. The first cell is the pin number and the second
- cell is used to specify optional parameters (currently unused).
-- gpio-controller: Marks the device node as a GPIO controller.
-- nr-gpios: Number of GPIO pins supported by the controller.
-- interrupt-cells: Should be two. The first cell is the GPIO Number. The
- second cell is used to specify flags. The following subset of flags is
- supported:
- - trigger type:
- 1 = low to high edge triggered.
- 2 = high to low edge triggered.
- 4 = active high level-sensitive.
- 8 = active low level-sensitive.
-- interrupts: Interrupt number for this device.
-- interrupt-controller: Identifies the node as an interrupt controller.
-
-Example:
-
- gpio: xlp_gpio@34000 {
- compatible = "netlogic,xlp316-gpio";
- reg = <0 0x34100 0x1000
- 0 0x35100 0x1000>;
- #gpio-cells = <2>;
- gpio-controller;
- nr-gpios = <57>;
-
- #interrupt-cells = <2>;
- interrupt-parent = <&pic>;
- interrupts = <39>;
- interrupt-controller;
- };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-zynq.yaml b/Documentation/devicetree/bindings/gpio/gpio-zynq.yaml
index 378da2649e66..572e1718f501 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-zynq.yaml
+++ b/Documentation/devicetree/bindings/gpio/gpio-zynq.yaml
@@ -4,14 +4,18 @@
$id: http://devicetree.org/schemas/gpio/gpio-zynq.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Xilinx Zynq GPIO controller Device Tree Bindings
+title: Xilinx Zynq GPIO controller
maintainers:
- Michal Simek <michal.simek@xilinx.com>
properties:
compatible:
- const: xlnx,zynq-gpio-1.0
+ enum:
+ - xlnx,zynq-gpio-1.0
+ - xlnx,zynqmp-gpio-1.0
+ - xlnx,versal-gpio-1.0
+ - xlnx,pmc-gpio-1.0
reg:
maxItems: 1
@@ -24,6 +28,11 @@ properties:
gpio-controller: true
+ gpio-line-names:
+ description: strings describing the names of each gpio line
+ minItems: 58
+ maxItems: 174
+
interrupt-controller: true
"#interrupt-cells":
@@ -32,6 +41,54 @@ properties:
clocks:
maxItems: 1
+ power-domains:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ enum:
+ - xlnx,zynqmp-gpio-1.0
+ then:
+ properties:
+ gpio-line-names:
+ minItems: 174
+ maxItems: 174
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - xlnx,zynq-gpio-1.0
+ then:
+ properties:
+ gpio-line-names:
+ minItems: 118
+ maxItems: 118
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - xlnx,versal-gpio-1.0
+ then:
+ properties:
+ gpio-line-names:
+ minItems: 58
+ maxItems: 58
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - xlnx,pmc-gpio-1.0
+ then:
+ properties:
+ gpio-line-names:
+ minItems: 116
+ maxItems: 116
+
required:
- compatible
- reg
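
With the added compatibles and the per-variant gpio-line-names
constraints, a Versal bank could be described as below. gpio-line-names
is omitted since this variant would require exactly 58 entries; the
address, interrupt, clock and power-domain specifiers are illustrative:

    gpio@ff0b0000 {
        compatible = "xlnx,versal-gpio-1.0";
        reg = <0xff0b0000 0x1000>;
        interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>;
        gpio-controller;
        #gpio-cells = <2>;
        interrupt-controller;
        #interrupt-cells = <2>;
        clocks = <&lpd_lsbus>;
        power-domains = <&firmware 0>;
    };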
diff --git a/Documentation/devicetree/bindings/gpio/gpio.txt b/Documentation/devicetree/bindings/gpio/gpio.txt
index a8895d339bfe..d82c32217fff 100644
--- a/Documentation/devicetree/bindings/gpio/gpio.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio.txt
@@ -154,18 +154,35 @@ of the GPIOs that can't be used.
Optionally, a GPIO controller may have a "gpio-line-names" property. This is
an array of strings defining the names of the GPIO lines going out of the
-GPIO controller. This name should be the most meaningful producer name
-for the system, such as a rail name indicating the usage. Package names
-such as pin name are discouraged: such lines have opaque names (since they
-are by definition generic purpose) and such names are usually not very
-helpful. For example "MMC-CD", "Red LED Vdd" and "ethernet reset" are
-reasonable line names as they describe what the line is used for. "GPIO0"
-is not a good name to give to a GPIO line. Placeholders are discouraged:
-rather use the "" (blank string) if the use of the GPIO line is undefined
-in your design. The names are assigned starting from line offset 0 from
-left to right from the passed array. An incomplete array (where the number
-of passed named are less than ngpios) will still be used up until the last
-provided valid line index.
+GPIO controller.
+
+For lines which are routed to on-board devices, this name should be
+the most meaningful producer name for the system, such as a rail name
+indicating the usage. Package names, such as a pin name, are discouraged:
+such lines have opaque names (since they are by definition general-purpose)
+and such names are usually not very helpful. For example "MMC-CD", "Red LED
+Vdd" and "ethernet reset" are reasonable line names as they describe what
+the line is used for. "GPIO0" is not a good name to give to a GPIO line
+that is hard-wired to a specific device.
+
+However, in the case of lines that are routed to a general purpose header
+(e.g. the Raspberry Pi 40-pin header), and therefore are not hard-wired to
+specific devices, using a pin number or the names on the header is fine
+provided these are real (preferably unique) names. Using an SoC's pad name
+or package name, or names made up from kernel-internal software constructs,
+are strongly discouraged. For example "pin8 [gpio14/uart0_txd]" is fine
+if the board's documentation labels pin 8 as such. However "PortB_24" (an
+example of a name from an SoC's reference manual) would not be desirable.
+
+In either case placeholders are discouraged: rather use the "" (blank
+string) if the use of the GPIO line is undefined in your design. Ideally,
+try to add comments to the dts file describing the naming convention
+you have chosen, and specifying from where the names are derived.
+
+The names are assigned starting from line offset 0, from left to right,
+from the passed array. An incomplete array (where the number of passed
+names is less than ngpios) will be used up until the last provided valid
+line index.
Example:
@@ -213,7 +230,7 @@ Example of two SOC GPIO banks defined as gpio-controller nodes:
gpio-controller;
#gpio-cells = <2>;
- line_b {
+ line_b-hog {
gpio-hog;
gpios = <6 0>;
output-low;
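
Pulling the revised naming guidance together, a bank feeding both
on-board devices and a general purpose header might be named as in this
sketch (the labels are assumed to come from the board documentation):

    gpio-line-names = "MMC-CD",                  /* wired to the SD slot */
                      "",                        /* unused in this design */
                      "pin8 [gpio14/uart0_txd]", /* routed to the header */
                      "Red LED Vdd";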
diff --git a/Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml b/Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml
new file mode 100644
index 000000000000..735d97d645a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/hisilicon,ascend910-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: HiSilicon common GPIO controller
+
+maintainers:
+ - Jay Fang <f.fangjian@huawei.com>
+
+description:
+ The HiSilicon common GPIO controller can be used for many different
+ types of SoCs, such as the Huawei Ascend AI series chips.
+
+properties:
+ compatible:
+ const: hisilicon,ascend910-gpio
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+
+ ngpios:
+ minimum: 1
+ maximum: 32
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - gpio-controller
+ - "#gpio-cells"
+ - ngpios
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ gpio@840d0000 {
+ compatible = "hisilicon,ascend910-gpio";
+ reg = <0x840d0000 0x1000>;
+ ngpios = <32>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupts = <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>;
+ };
diff --git a/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml b/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml
new file mode 100644
index 000000000000..fb86e8ce6349
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/loongson,ls-gpio.yaml
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/loongson,ls-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson GPIO controller
+
+maintainers:
+ - Yinbo Zhu <zhuyinbo@loongson.cn>
+
+properties:
+ compatible:
+ enum:
+ - loongson,ls2k-gpio
+ - loongson,ls7a-gpio
+
+ reg:
+ maxItems: 1
+
+ ngpios:
+ minimum: 1
+ maximum: 64
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-controller: true
+
+ gpio-ranges: true
+
+ interrupts:
+ minItems: 1
+ maxItems: 64
+
+required:
+ - compatible
+ - reg
+ - ngpios
+ - "#gpio-cells"
+ - gpio-controller
+ - gpio-ranges
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ gpio0: gpio@1fe00500 {
+ compatible = "loongson,ls2k-gpio";
+ reg = <0x1fe00500 0x38>;
+ ngpios = <64>;
+ #gpio-cells = <2>;
+ gpio-controller;
+ gpio-ranges = <&pctrl 0 0 15>,
+ <&pctrl 16 16 15>,
+ <&pctrl 32 32 10>,
+ <&pctrl 44 44 20>;
+ interrupt-parent = <&liointc1>;
+ interrupts = <28 IRQ_TYPE_LEVEL_LOW>,
+ <29 IRQ_TYPE_LEVEL_LOW>,
+ <30 IRQ_TYPE_LEVEL_LOW>,
+ <30 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <26 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <>,
+ <>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>,
+ <27 IRQ_TYPE_LEVEL_LOW>;
+ };
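
The loongson,ls7a-gpio variant follows the same shape. A minimal sketch
with a single shared interrupt (the base address, register range, range
width and interrupt number are illustrative):

    gpio@100e0000 {
        compatible = "loongson,ls7a-gpio";
        reg = <0x100e0000 0x900>;
        ngpios = <57>;
        gpio-controller;
        #gpio-cells = <2>;
        gpio-ranges = <&pch_pinctrl 0 0 57>;
        interrupt-parent = <&pch_pic>;
        interrupts = <26 IRQ_TYPE_LEVEL_LOW>;
    };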
diff --git a/Documentation/devicetree/bindings/gpio/loongson,ls1x-gpio.yaml b/Documentation/devicetree/bindings/gpio/loongson,ls1x-gpio.yaml
new file mode 100644
index 000000000000..1a472c05697c
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/loongson,ls1x-gpio.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/loongson,ls1x-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson-1 GPIO controller
+
+maintainers:
+ - Keguang Zhang <keguang.zhang@gmail.com>
+
+properties:
+ compatible:
+ const: loongson,ls1x-gpio
+
+ reg:
+ maxItems: 1
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+
+ ngpios:
+ minimum: 1
+ maximum: 32
+
+required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+ - ngpios
+
+additionalProperties: false
+
+examples:
+ - |
+ gpio0: gpio@1fd010c0 {
+ compatible = "loongson,ls1x-gpio";
+ reg = <0x1fd010c0 0x4>;
+
+ gpio-controller;
+ #gpio-cells = <2>;
+
+ ngpios = <32>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/gpio/microchip,mpfs-gpio.yaml b/Documentation/devicetree/bindings/gpio/microchip,mpfs-gpio.yaml
new file mode 100644
index 000000000000..d481e78958a7
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/microchip,mpfs-gpio.yaml
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/microchip,mpfs-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip MPFS GPIO Controller
+
+maintainers:
+ - Conor Dooley <conor.dooley@microchip.com>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - microchip,mpfs-gpio
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ description:
+ Interrupt mapping, one per GPIO. Maximum 32 GPIOs.
+ minItems: 1
+ maxItems: 32
+
+ interrupt-controller: true
+
+ clocks:
+ maxItems: 1
+
+ "#gpio-cells":
+ const: 2
+
+ "#interrupt-cells":
+ const: 1
+
+ ngpios:
+ description:
+ The number of GPIOs available.
+ minimum: 1
+ maximum: 32
+ default: 32
+
+ gpio-controller: true
+
+patternProperties:
+ "^.+-hog(-[0-9]+)?$":
+ type: object
+
+ additionalProperties: false
+
+ properties:
+ gpio-hog: true
+ gpios: true
+ input: true
+ output-high: true
+ output-low: true
+ line-name: true
+
+ required:
+ - gpio-hog
+ - gpios
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - "#interrupt-cells"
+ - interrupt-controller
+ - "#gpio-cells"
+ - gpio-controller
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ gpio@20122000 {
+ compatible = "microchip,mpfs-gpio";
+ reg = <0x20122000 0x1000>;
+ clocks = <&clkcfg 25>;
+ interrupt-parent = <&plic>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <1>;
+ interrupts = <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>,
+ <53>, <53>, <53>, <53>;
+ };
+...
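
The schema also admits GPIO hog children matching "^.+-hog(-[0-9]+)?$",
which the example above does not exercise. A sketch of such a child
inside the gpio@20122000 node (the line number and name are
illustrative):

    sd-sel-hog {
        gpio-hog;
        gpios = <12 0>;
        output-high;
        line-name = "SD card select";
    };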
diff --git a/Documentation/devicetree/bindings/gpio/mstar,msc313-gpio.yaml b/Documentation/devicetree/bindings/gpio/mstar,msc313-gpio.yaml
index fe1e1c63ffe3..18fe90387b87 100644
--- a/Documentation/devicetree/bindings/gpio/mstar,msc313-gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/mstar,msc313-gpio.yaml
@@ -14,7 +14,9 @@ properties:
pattern: "^gpio@[0-9a-f]+$"
compatible:
- const: mstar,msc313-gpio
+ enum:
+ - mstar,msc313-gpio
+ - sstar,ssd20xd-gpio
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.txt b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.txt
deleted file mode 100644
index adff16c71d21..000000000000
--- a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.txt
+++ /dev/null
@@ -1,165 +0,0 @@
-NVIDIA Tegra186 GPIO controllers
-
-Tegra186 contains two GPIO controllers; a main controller and an "AON"
-controller. This binding document applies to both controllers. The register
-layouts for the controllers share many similarities, but also some significant
-differences. Hence, this document describes closely related but different
-bindings and compatible values.
-
-The Tegra186 GPIO controller allows software to set the IO direction of, and
-read/write the value of, numerous GPIO signals. Routing of GPIO signals to
-package balls is under the control of a separate pin controller HW block. Two
-major sets of registers exist:
-
-a) Security registers, which allow configuration of allowed access to the GPIO
-register set. These registers exist in a single contiguous block of physical
-address space. The size of this block, and the security features available,
-varies between the different GPIO controllers.
-
-Access to this set of registers is not necessary in all circumstances. Code
-that wishes to configure access to the GPIO registers needs access to these
-registers to do so. Code which simply wishes to read or write GPIO data does not
-need access to these registers.
-
-b) GPIO registers, which allow manipulation of the GPIO signals. In some GPIO
-controllers, these registers are exposed via multiple "physical aliases" in
-address space, each of which access the same underlying state. See the hardware
-documentation for rationale. Any particular GPIO client is expected to access
-just one of these physical aliases.
-
-Tegra HW documentation describes a unified naming convention for all GPIOs
-implemented by the SoC. Each GPIO is assigned to a port, and a port may control
-a number of GPIOs. Thus, each GPIO is named according to an alphabetical port
-name and an integer GPIO name within the port. For example, GPIO_PA0, GPIO_PN6,
-or GPIO_PCC3.
-
-The number of ports implemented by each GPIO controller varies. The number of
-implemented GPIOs within each port varies. GPIO registers within a controller
-are grouped and laid out according to the port they affect.
-
-The mapping from port name to the GPIO controller that implements that port, and
-the mapping from port name to register offset within a controller, are both
-extremely non-linear. The header file <dt-bindings/gpio/tegra186-gpio.h>
-describes the port-level mapping. In that file, the naming convention for ports
-matches the HW documentation. The values chosen for the names are alphabetically
-sorted within a particular controller. Drivers need to map between the DT GPIO
-IDs and HW register offsets using a lookup table.
-
-Each GPIO controller can generate a number of interrupt signals. Each signal
-represents the aggregate status for all GPIOs within a set of ports. Thus, the
-number of interrupt signals generated by a controller varies as a rough function
-of the number of ports it implements. Note that the HW documentation refers to
-both the overall controller HW module and the sets-of-ports as "controllers".
-
-Each GPIO controller in fact generates multiple interrupts signals for each set
-of ports. Each GPIO may be configured to feed into a specific one of the
-interrupt signals generated by a set-of-ports. The intent is for each generated
-signal to be routed to a different CPU, thus allowing different CPUs to each
-handle subsets of the interrupts within a port. The status of each of these
-per-port-set signals is reported via a separate register. Thus, a driver needs
-to know which status register to observe. This binding currently defines no
-configuration mechanism for this. By default, drivers should use register
-GPIO_${port}_INTERRUPT_STATUS_G1_0. Future revisions to the binding could
-define a property to configure this.
-
-Required properties:
-- compatible
- Array of strings.
- One of:
- - "nvidia,tegra186-gpio".
- - "nvidia,tegra186-gpio-aon".
- - "nvidia,tegra194-gpio".
- - "nvidia,tegra194-gpio-aon".
-- reg-names
- Array of strings.
- Contains a list of names for the register spaces described by the reg
- property. May contain the following entries, in any order:
- - "gpio": Mandatory. GPIO control registers. This may cover either:
- a) The single physical alias that this OS should use.
- b) All physical aliases that exist in the controller. This is
- appropriate when the OS is responsible for managing assignment of
- the physical aliases.
- - "security": Optional. Security configuration registers.
- Users of this binding MUST look up entries in the reg property by name,
- using this reg-names property to do so.
-- reg
- Array of (physical base address, length) tuples.
- Must contain one entry per entry in the reg-names property, in a matching
- order.
-- interrupts
- Array of interrupt specifiers.
- The interrupt outputs from the HW block, one per set of ports, in the
- order the HW manual describes them. The number of entries required varies
- depending on compatible value:
- - "nvidia,tegra186-gpio": 6 entries.
- - "nvidia,tegra186-gpio-aon": 1 entry.
- - "nvidia,tegra194-gpio": 6 entries.
- - "nvidia,tegra194-gpio-aon": 1 entry.
-- gpio-controller
- Boolean.
- Marks the device node as a GPIO controller/provider.
-- #gpio-cells
- Single-cell integer.
- Must be <2>.
- Indicates how many cells are used in a consumer's GPIO specifier.
- In the specifier:
- - The first cell is the pin number.
- See <dt-bindings/gpio/tegra186-gpio.h>.
- - The second cell contains flags:
- - Bit 0 specifies polarity
- - 0: Active-high (normal).
- - 1: Active-low (inverted).
-- interrupt-controller
- Boolean.
- Marks the device node as an interrupt controller/provider.
-- #interrupt-cells
- Single-cell integer.
- Must be <2>.
- Indicates how many cells are used in a consumer's interrupt specifier.
- In the specifier:
- - The first cell is the GPIO number.
- See <dt-bindings/gpio/tegra186-gpio.h>.
- - The second cell is contains flags:
- - Bits [3:0] indicate trigger type and level:
- - 1: Low-to-high edge triggered.
- - 2: High-to-low edge triggered.
- - 4: Active high level-sensitive.
- - 8: Active low level-sensitive.
- Valid combinations are 1, 2, 3, 4, 8.
-
-Example:
-
-#include <dt-bindings/interrupt-controller/irq.h>
-
-gpio@2200000 {
- compatible = "nvidia,tegra186-gpio";
- reg-names = "security", "gpio";
- reg =
- <0x0 0x2200000 0x0 0x10000>,
- <0x0 0x2210000 0x0 0x10000>;
- interrupts =
- <0 47 IRQ_TYPE_LEVEL_HIGH>,
- <0 50 IRQ_TYPE_LEVEL_HIGH>,
- <0 53 IRQ_TYPE_LEVEL_HIGH>,
- <0 56 IRQ_TYPE_LEVEL_HIGH>,
- <0 59 IRQ_TYPE_LEVEL_HIGH>,
- <0 180 IRQ_TYPE_LEVEL_HIGH>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
-};
-
-gpio@c2f0000 {
- compatible = "nvidia,tegra186-gpio-aon";
- reg-names = "security", "gpio";
- reg =
- <0x0 0xc2f0000 0x0 0x1000>,
- <0x0 0xc2f1000 0x0 0x1000>;
- interrupts =
- <0 60 IRQ_TYPE_LEVEL_HIGH>;
- gpio-controller;
- #gpio-cells = <2>;
- interrupt-controller;
- #interrupt-cells = <2>;
-};
diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml
new file mode 100644
index 000000000000..4ef06b2ff1ff
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/nvidia,tegra186-gpio.yaml
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/nvidia,tegra186-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra GPIO Controller (Tegra186 and later)
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+description: |
+ Tegra186 contains two GPIO controllers; a main controller and an "AON"
+ controller. This binding document applies to both controllers. The register
+ layouts for the controllers share many similarities, but also some
+ significant differences. Hence, this document describes closely related but
+ different bindings and compatible values.
+
+ The Tegra186 GPIO controller allows software to set the IO direction of,
+ and read/write the value of, numerous GPIO signals. Routing of GPIO signals
+ to package balls is under the control of a separate pin controller hardware
+ block. Two major sets of registers exist:
+
+ a) Security registers, which allow configuration of allowed access to the
+ GPIO register set. These registers exist in a single contiguous block
+ of physical address space. The size of this block, and the security
+ features available, varies between the different GPIO controllers.
+
+ Access to this set of registers is not necessary in all circumstances.
+ Code that wishes to configure access to the GPIO registers needs access
+ to these registers to do so. Code which simply wishes to read or write
+ GPIO data does not need access to these registers.
+
+ b) GPIO registers, which allow manipulation of the GPIO signals. In some
+ GPIO controllers, these registers are exposed via multiple "physical
+ aliases" in address space, each of which access the same underlying
+ state. See the hardware documentation for rationale. Any particular
+ GPIO client is expected to access just one of these physical aliases.
+
+ Tegra HW documentation describes a unified naming convention for all GPIOs
+ implemented by the SoC. Each GPIO is assigned to a port, and a port may
+ control a number of GPIOs. Thus, each GPIO is named according to an
+ alphabetical port name and an integer GPIO name within the port. For
+ example, GPIO_PA0, GPIO_PN6, or GPIO_PCC3.
+
+ The number of ports implemented by each GPIO controller varies. The number
+ of implemented GPIOs within each port varies. GPIO registers within a
+ controller are grouped and laid out according to the port they affect.
+
+ The mapping from port name to the GPIO controller that implements that
+ port, and the mapping from port name to register offset within a
+ controller, are both extremely non-linear. The header file
+ <dt-bindings/gpio/tegra186-gpio.h> describes the port-level mapping. In
+ that file, the naming convention for ports matches the HW documentation.
+ The values chosen for the names are alphabetically sorted within a
+ particular controller. Drivers need to map between the DT GPIO IDs and HW
+ register offsets using a lookup table.
+
+ Each GPIO controller can generate a number of interrupt signals. Each
+ signal represents the aggregate status for all GPIOs within a set of
+ ports. Thus, the number of interrupt signals generated by a controller
+ varies as a rough function of the number of ports it implements. Note
+ that the HW documentation refers to both the overall controller HW
+ module and the sets-of-ports as "controllers".
+
+ Each GPIO controller in fact generates multiple interrupt signals for
+ each set of ports. Each GPIO may be configured to feed into a specific
+ one of the interrupt signals generated by a set-of-ports. The intent is
+ for each generated signal to be routed to a different CPU, thus allowing
+ different CPUs to each handle subsets of the interrupts within a port.
+ The status of each of these per-port-set signals is reported via a
+ separate register. Thus, a driver needs to know which status register to
+ observe. This binding currently defines no configuration mechanism for
+ this. By default, drivers should use register
+ GPIO_${port}_INTERRUPT_STATUS_G1_0. Future revisions to the binding could
+ define a property to configure this.
+
+properties:
+ compatible:
+ enum:
+ - nvidia,tegra186-gpio
+ - nvidia,tegra186-gpio-aon
+ - nvidia,tegra194-gpio
+ - nvidia,tegra194-gpio-aon
+ - nvidia,tegra234-gpio
+ - nvidia,tegra234-gpio-aon
+
+ reg-names:
+ items:
+ - const: security
+ - const: gpio
+ minItems: 1
+
+ reg:
+ items:
+ - description: Security configuration registers.
+ - description: |
+ GPIO control registers. This may cover either:
+
+ a) The single physical alias that this OS should use.
+ b) All physical aliases that exist in the controller. This is
+ appropriate when the OS is responsible for managing assignment
+ of the physical aliases.
+ minItems: 1
+
+ interrupts:
+ description: The interrupt outputs from the HW block, one per set of
+ ports, in the order the HW manual describes them. The number of entries
+ required varies depending on compatible value.
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ description: |
+ Indicates how many cells are used in a consumer's GPIO specifier. In the
+ specifier:
+
+ - The first cell is the pin number.
+ See <dt-bindings/gpio/tegra186-gpio.h>.
+ - The second cell contains flags:
+ - Bit 0 specifies polarity
+ - 0: Active-high (normal).
+ - 1: Active-low (inverted).
+ const: 2
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ description: |
+ Indicates how many cells are used in a consumer's interrupt specifier.
+ In the specifier:
+
+ - The first cell is the GPIO number.
+ See <dt-bindings/gpio/tegra186-gpio.h>.
+ - The second cell contains flags:
+ - Bits [3:0] indicate trigger type and level:
+ - 1: Low-to-high edge triggered.
+ - 2: High-to-low edge triggered.
+ - 4: Active high level-sensitive.
+ - 8: Active low level-sensitive.
+
+ Valid combinations are 1, 2, 3, 4, 8.
+ const: 2
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra186-gpio
+ - nvidia,tegra194-gpio
+ - nvidia,tegra234-gpio
+ then:
+ properties:
+ interrupts:
+ minItems: 6
+ maxItems: 48
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - nvidia,tegra186-gpio-aon
+ - nvidia,tegra194-gpio-aon
+ - nvidia,tegra234-gpio-aon
+ then:
+ properties:
+ interrupts:
+ minItems: 1
+ maxItems: 4
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ gpio@2200000 {
+ compatible = "nvidia,tegra186-gpio";
+ reg-names = "security", "gpio";
+ reg = <0x2200000 0x10000>,
+ <0x2210000 0x10000>;
+ interrupts = <0 47 IRQ_TYPE_LEVEL_HIGH>,
+ <0 50 IRQ_TYPE_LEVEL_HIGH>,
+ <0 53 IRQ_TYPE_LEVEL_HIGH>,
+ <0 56 IRQ_TYPE_LEVEL_HIGH>,
+ <0 59 IRQ_TYPE_LEVEL_HIGH>,
+ <0 180 IRQ_TYPE_LEVEL_HIGH>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ };
+
+ gpio@c2f0000 {
+ compatible = "nvidia,tegra186-gpio-aon";
+ reg-names = "security", "gpio";
+ reg = <0xc2f0000 0x1000>,
+ <0xc2f1000 0x1000>;
+ interrupts = <0 60 IRQ_TYPE_LEVEL_HIGH>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.txt b/Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.txt
deleted file mode 100644
index 023c9526e5f8..000000000000
--- a/Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-NVIDIA Tegra GPIO controller
-
-Required properties:
-- compatible : "nvidia,tegra<chip>-gpio"
-- reg : Physical base address and length of the controller's registers.
-- interrupts : The interrupt outputs from the controller. For Tegra20,
- there should be 7 interrupts specified, and for Tegra30, there should
- be 8 interrupts specified.
-- #gpio-cells : Should be two. The first cell is the pin number and the
- second cell is used to specify optional parameters:
- - bit 0 specifies polarity (0 for normal, 1 for inverted)
-- gpio-controller : Marks the device node as a GPIO controller.
-- #interrupt-cells : Should be 2.
- The first cell is the GPIO number.
- The second cell is used to specify flags:
- bits[3:0] trigger type and level flags:
- 1 = low-to-high edge triggered.
- 2 = high-to-low edge triggered.
- 4 = active high level-sensitive.
- 8 = active low level-sensitive.
- Valid combinations are 1, 2, 3, 4, 8.
-- interrupt-controller : Marks the device node as an interrupt controller.
-
-Example:
-
-gpio: gpio@6000d000 {
- compatible = "nvidia,tegra20-gpio";
- reg = < 0x6000d000 0x1000 >;
- interrupts = < 0 32 0x04
- 0 33 0x04
- 0 34 0x04
- 0 35 0x04
- 0 55 0x04
- 0 87 0x04
- 0 89 0x04 >;
- #gpio-cells = <2>;
- gpio-controller;
- #interrupt-cells = <2>;
- interrupt-controller;
-};
diff --git a/Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.yaml b/Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.yaml
new file mode 100644
index 000000000000..94b51749ee76
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/nvidia,tegra20-gpio.yaml
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/nvidia,tegra20-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra GPIO Controller (Tegra20 - Tegra210)
+
+maintainers:
+ - Thierry Reding <thierry.reding@gmail.com>
+ - Jon Hunter <jonathanh@nvidia.com>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - nvidia,tegra20-gpio
+ - nvidia,tegra30-gpio
+
+ - items:
+ - enum:
+ - nvidia,tegra114-gpio
+ - nvidia,tegra124-gpio
+ - nvidia,tegra210-gpio
+ - const: nvidia,tegra30-gpio
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ description: The interrupt outputs from the controller. For Tegra20,
+ there should be 7 interrupts specified, and for Tegra30, there should
+ be 8 interrupts specified.
+
+ "#gpio-cells":
+ description: The first cell is the pin number and the second cell is used
+ to specify the GPIO polarity (0 = active high, 1 = active low).
+ const: 2
+
+ gpio-controller: true
+
+ gpio-ranges:
+ maxItems: 1
+
+ "#interrupt-cells":
+ description: |
+ Should be 2. The first cell is the GPIO number. The second cell is
+ used to specify flags:
+
+ bits[3:0] trigger type and level flags:
+ 1 = low-to-high edge triggered.
+ 2 = high-to-low edge triggered.
+ 4 = active high level-sensitive.
+ 8 = active low level-sensitive.
+
+ Valid combinations are 1, 2, 3, 4, 8.
+ const: 2
+
+ interrupt-controller: true
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: nvidia,tegra30-gpio
+ then:
+ properties:
+ interrupts:
+ minItems: 8
+ maxItems: 8
+ else:
+ properties:
+ interrupts:
+ minItems: 7
+ maxItems: 7
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - "#gpio-cells"
+ - gpio-controller
+ - "#interrupt-cells"
+ - interrupt-controller
+
+additionalProperties:
+ type: object
+ required:
+ - gpio-hog
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ gpio: gpio@6000d000 {
+ compatible = "nvidia,tegra20-gpio";
+ reg = <0x6000d000 0x1000>;
+ interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>;
+ #gpio-cells = <2>;
+ gpio-controller;
+ #interrupt-cells = <2>;
+ interrupt-controller;
+ };
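
For nvidia,tegra30-gpio and the chips falling back to it, the if/then
clause above requires eight interrupts instead of seven. A sketch for
Tegra114; the eighth interrupt number is an assumption, not taken from
the TRM:

    gpio@6000d000 {
        compatible = "nvidia,tegra114-gpio", "nvidia,tegra30-gpio";
        reg = <0x6000d000 0x1000>;
        interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>;
        #gpio-cells = <2>;
        gpio-controller;
        #interrupt-cells = <2>;
        interrupt-controller;
    };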
diff --git a/Documentation/devicetree/bindings/gpio/nxp,pcf8575.yaml b/Documentation/devicetree/bindings/gpio/nxp,pcf8575.yaml
index f0ff66c4c74e..3718103e966a 100644
--- a/Documentation/devicetree/bindings/gpio/nxp,pcf8575.yaml
+++ b/Documentation/devicetree/bindings/gpio/nxp,pcf8575.yaml
@@ -39,6 +39,10 @@ properties:
reg:
maxItems: 1
+ gpio-line-names:
+ minItems: 1
+ maxItems: 16
+
gpio-controller: true
'#gpio-cells':
diff --git a/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml b/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml
index 100f20cebd76..39fd959c45d2 100644
--- a/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml
@@ -28,10 +28,11 @@ properties:
- enum:
- realtek,rtl8380-gpio
- realtek,rtl8390-gpio
+ - realtek,rtl9300-gpio
+ - realtek,rtl9310-gpio
- const: realtek,otto-gpio
- reg:
- maxItems: 1
+ reg: true
"#gpio-cells":
const: 2
@@ -50,6 +51,23 @@ properties:
interrupts:
maxItems: 1
+if:
+ properties:
+ compatible:
+ contains:
+ const: realtek,rtl9300-gpio
+then:
+ properties:
+ reg:
+ items:
+ - description: GPIO and interrupt control
+ - description: interrupt CPU map
+else:
+ properties:
+ reg:
+ items:
+ - description: GPIO and interrupt control
+
required:
- compatible
- reg
@@ -74,5 +92,17 @@ examples:
interrupt-parent = <&rtlintc>;
interrupts = <23>;
};
+ - |
+ gpio@3300 {
+ compatible = "realtek,rtl9300-gpio", "realtek,otto-gpio";
+ reg = <0x3300 0x1c>, <0x3338 0x8>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ ngpios = <24>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ interrupt-parent = <&rtlintc>;
+ interrupts = <13>;
+ };
...
diff --git a/Documentation/devicetree/bindings/gpio/renesas,rcar-gpio.yaml b/Documentation/devicetree/bindings/gpio/renesas,rcar-gpio.yaml
index f2541739ee3b..aa424e2b95f8 100644
--- a/Documentation/devicetree/bindings/gpio/renesas,rcar-gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/renesas,rcar-gpio.yaml
@@ -49,7 +49,11 @@ properties:
- const: renesas,rcar-gen3-gpio # R-Car Gen3 or RZ/G2
- items:
- - const: renesas,gpio-r8a779a0 # R-Car V3U
+ - enum:
+ - renesas,gpio-r8a779a0 # R-Car V3U
+ - renesas,gpio-r8a779f0 # R-Car S4-8
+ - renesas,gpio-r8a779g0 # R-Car V4H
+ - const: renesas,rcar-gen4-gpio # R-Car Gen4
reg:
maxItems: 1
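
Each new R-Car Gen4 entry pairs the SoC-specific compatible with the
common renesas,rcar-gen4-gpio fallback. A sketch for an S4-8 bank; the
register address, interrupt number and the clock/power-domain/reset
specifiers are illustrative:

    gpio@e6050180 {
        compatible = "renesas,gpio-r8a779f0", "renesas,rcar-gen4-gpio";
        reg = <0xe6050180 0x54>;
        interrupts = <GIC_SPI 992 IRQ_TYPE_LEVEL_HIGH>;
        clocks = <&cpg CPG_MOD 915>;
        power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>;
        resets = <&cpg 915>;
        gpio-controller;
        #gpio-cells = <2>;
        gpio-ranges = <&pfc 0 0 21>;
        interrupt-controller;
        #interrupt-cells = <2>;
    };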
diff --git a/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml b/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml
index 0d62c28fb58d..affd823c881d 100644
--- a/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml
+++ b/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml
@@ -27,8 +27,12 @@ properties:
- description: APB interface clock source
- description: GPIO debounce reference clock source
+ gpio-ranges: true
+
gpio-controller: true
+ gpio-line-names: true
+
"#gpio-cells":
const: 2
diff --git a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml
index c2902aac2514..fc095646adea 100644
--- a/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/sifive,gpio.yaml
@@ -7,7 +7,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: SiFive GPIO controller
maintainers:
- - Yash Shah <yash.shah@sifive.com>
- Paul Walmsley <paul.walmsley@sifive.com>
properties:
@@ -47,6 +46,10 @@ properties:
maximum: 32
default: 16
+ gpio-line-names:
+ minItems: 1
+ maxItems: 32
+
gpio-controller: true
required:
@@ -77,9 +80,10 @@ examples:
gpio@10060000 {
compatible = "sifive,fu540-c000-gpio", "sifive,gpio0";
interrupt-parent = <&plic>;
- interrupts = <7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22>;
+ interrupts = <7>, <8>, <9>, <10>, <11>, <12>, <13>, <14>, <15>, <16>,
+ <17>, <18>, <19>, <20>, <21>, <22>;
reg = <0x10060000 0x1000>;
- clocks = <&tlclk PRCI_CLK_TLCLK>;
+ clocks = <&tlclk FU540_PRCI_CLK_TLCLK>;
gpio-controller;
#gpio-cells = <2>;
interrupt-controller;
diff --git a/Documentation/devicetree/bindings/gpio/socionext,uniphier-gpio.yaml b/Documentation/devicetree/bindings/gpio/socionext,uniphier-gpio.yaml
index bcafa494ed7a..228fa27ffdc3 100644
--- a/Documentation/devicetree/bindings/gpio/socionext,uniphier-gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/socionext,uniphier-gpio.yaml
@@ -52,6 +52,23 @@ properties:
<child-interrupt-base parent-interrupt-base length> triplets.
$ref: /schemas/types.yaml#/definitions/uint32-matrix
+patternProperties:
+ "^.+-hog(-[0-9]+)?$":
+ type: object
+ properties:
+ gpio-hog: true
+ gpios: true
+ input: true
+ output-high: true
+ output-low: true
+ line-name: true
+
+ required:
+ - gpio-hog
+ - gpios
+
+ additionalProperties: false
+
required:
- compatible
- reg
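Note: a hog child node matching the new "^.+-hog(-[0-9]+)?$" pattern only
needs gpio-hog and gpios; the offset and line name in this sketch are
hypothetical:

    vbus-hog {
        gpio-hog;
        gpios = <54 0>;    /* line offset 54, no flags; hypothetical */
        output-high;
        line-name = "usb-vbus";
    };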
diff --git a/Documentation/devicetree/bindings/gpio/sprd,gpio-eic.yaml b/Documentation/devicetree/bindings/gpio/sprd,gpio-eic.yaml
new file mode 100644
index 000000000000..99fcf970773a
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/sprd,gpio-eic.yaml
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2022 Unisoc Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/sprd,gpio-eic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc EIC controller
+
+maintainers:
+ - Orson Zhai <orsonzhai@gmail.com>
+ - Baolin Wang <baolin.wang7@gmail.com>
+ - Chunyan Zhang <zhang.lyra@gmail.com>
+
+description: |
+  EIC is an abbreviation for external interrupt controller; it can be
+  used only in input mode. The Spreadtrum platform has two EIC controllers:
+  one in the digital chip and one in the PMIC. The digital chip EIC
+  controller contains 4 sub-modules, i.e. EIC-debounce, EIC-latch, EIC-async
+  and EIC-sync, while the PMIC EIC controller contains only the EIC-debounce
+  sub-module.
+
+ The EIC-debounce sub-module provides up to 8 source input signal
+ connections. A debounce mechanism is used to capture the input signals'
+ stable status (millisecond resolution) and a single-trigger mechanism
+ is introduced into this sub-module to enhance the input event detection
+  reliability. In addition, this sub-module's clock can be shut off
+  automatically to reduce power dissipation. Moreover, the debounce range
+  is from 1 ms to 4 s with a step size of 1 ms. The input signal will be
+  ignored if it is asserted for less than 1 ms.
+
+ The EIC-latch sub-module is used to latch some special power down signals
+ and generate interrupts, since the EIC-latch does not depend on the APB
+ clock to capture signals.
+
+ The EIC-async sub-module uses a 32kHz clock to capture the short signals
+ (microsecond resolution) to generate interrupts by level or edge trigger.
+
+  The EIC-sync sub-module is similar to the GPIO input function: it is a
+  synchronized signal input register that can generate interrupts by level
+  or edge trigger when detecting input signals.
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - sprd,sc9860-eic-debounce
+ - sprd,sc9860-eic-latch
+ - sprd,sc9860-eic-async
+ - sprd,sc9860-eic-sync
+ - sprd,sc2731-eic
+ - items:
+ - enum:
+ - sprd,ums512-eic-debounce
+ - const: sprd,sc9860-eic-debounce
+ - items:
+ - enum:
+ - sprd,ums512-eic-latch
+ - const: sprd,sc9860-eic-latch
+ - items:
+ - enum:
+ - sprd,ums512-eic-async
+ - const: sprd,sc9860-eic-async
+ - items:
+ - enum:
+ - sprd,ums512-eic-sync
+ - const: sprd,sc9860-eic-sync
+ - items:
+ - enum:
+ - sprd,sc2730-eic
+ - const: sprd,sc2731-eic
+
+ reg:
+ minItems: 1
+ maxItems: 3
+ description:
+      The EIC controller supports up to 3 banks, each with its own
+      address base.
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 2
+
+ interrupts:
+ maxItems: 1
+ description:
+ The interrupt shared by all GPIO lines for this controller.
+
+required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+ - interrupt-controller
+ - "#interrupt-cells"
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ eic_debounce: gpio@40210000 {
+ compatible = "sprd,sc9860-eic-debounce";
+ reg = <0 0x40210000 0 0x80>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ interrupts = <GIC_SPI 52 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
+...
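Note: the schema's example covers only the digital-chip debounce bank. A
PMIC-side instance would use the sc2731-eic compatible and take its interrupt
from the PMIC; a hedged sketch with hypothetical register offset and interrupt
number, assuming dt-bindings/interrupt-controller/irq.h is included:

    pmic_eic: gpio@300 {
        compatible = "sprd,sc2731-eic";
        reg = <0x300>;
        gpio-controller;
        #gpio-cells = <2>;
        interrupt-controller;
        #interrupt-cells = <2>;
        interrupts = <5 IRQ_TYPE_LEVEL_HIGH>;
    };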
diff --git a/Documentation/devicetree/bindings/gpio/sprd,gpio.yaml b/Documentation/devicetree/bindings/gpio/sprd,gpio.yaml
new file mode 100644
index 000000000000..483168838128
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/sprd,gpio.yaml
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright 2022 Unisoc Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/sprd,gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Unisoc GPIO controller
+
+maintainers:
+ - Orson Zhai <orsonzhai@gmail.com>
+ - Baolin Wang <baolin.wang7@gmail.com>
+ - Chunyan Zhang <zhang.lyra@gmail.com>
+
+description: |
+ The controller's registers are organized as sets of sixteen 16-bit
+ registers with each set controlling a bank of up to 16 pins. A single
+ interrupt is shared for all of the banks handled by the controller.
+
+properties:
+ compatible:
+ oneOf:
+ - const: sprd,sc9860-gpio
+ - items:
+ - enum:
+ - sprd,ums512-gpio
+ - const: sprd,sc9860-gpio
+
+ reg:
+ maxItems: 1
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 2
+
+ interrupts:
+ maxItems: 1
+ description: The interrupt shared by all GPIO lines for this controller.
+
+required:
+ - compatible
+ - reg
+ - gpio-controller
+ - "#gpio-cells"
+ - interrupt-controller
+ - "#interrupt-cells"
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ ap_gpio: gpio@40280000 {
+ compatible = "sprd,sc9860-gpio";
+ reg = <0 0x40280000 0 0x1000>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
+...
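Note: both cells of #gpio-cells follow the common GPIO binding (line number,
then flags), so a consumer can reference a line directly; in this sketch the
consumer node and line number are hypothetical:

    vibrator {
        compatible = "gpio-vibrator";
        /* line 56 of the controller from the example above */
        enable-gpios = <&ap_gpio 56 GPIO_ACTIVE_HIGH>;
    };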
diff --git a/Documentation/devicetree/bindings/gpio/ti,omap-gpio.yaml b/Documentation/devicetree/bindings/gpio/ti,omap-gpio.yaml
index 7087e4a5013f..bd721c839059 100644
--- a/Documentation/devicetree/bindings/gpio/ti,omap-gpio.yaml
+++ b/Documentation/devicetree/bindings/gpio/ti,omap-gpio.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/gpio/ti,omap-gpio.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: OMAP GPIO controller bindings
+title: OMAP GPIO controller
maintainers:
- Grygorii Strashko <grygorii.strashko@ti.com>
diff --git a/Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml b/Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml
index 9ad470e01953..b085450b527f 100644
--- a/Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml
+++ b/Documentation/devicetree/bindings/gpio/toshiba,gpio-visconti.yaml
@@ -43,7 +43,6 @@ required:
- gpio-controller
- interrupt-controller
- "#interrupt-cells"
- - interrupt-parent
additionalProperties: false
diff --git a/Documentation/devicetree/bindings/gpio/x-powers,axp209-gpio.yaml b/Documentation/devicetree/bindings/gpio/x-powers,axp209-gpio.yaml
new file mode 100644
index 000000000000..31906c253940
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/x-powers,axp209-gpio.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/gpio/x-powers,axp209-gpio.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: X-Powers AXP209 GPIO
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+
+properties:
+ "#gpio-cells":
+ const: 2
+ description: >
+ The first cell is the pin number and the second is the GPIO flags.
+
+ compatible:
+ oneOf:
+ - enum:
+ - x-powers,axp209-gpio
+ - x-powers,axp221-gpio
+ - x-powers,axp813-gpio
+ - items:
+ - enum:
+ - x-powers,axp223-gpio
+ - x-powers,axp809-gpio
+ - const: x-powers,axp221-gpio
+ - items:
+ - const: x-powers,axp803-gpio
+ - const: x-powers,axp813-gpio
+
+ gpio-controller: true
+
+patternProperties:
+ "^.*-pins?$":
+ $ref: /schemas/pinctrl/pinmux-node.yaml#
+ additionalProperties: false
+
+ properties:
+ pins:
+ items:
+ enum:
+ - GPIO0
+ - GPIO1
+ - GPIO2
+
+ function:
+ enum:
+ - adc
+ - ldo
+ - gpio_in
+ - gpio_out
+
+required:
+ - compatible
+ - "#gpio-cells"
+ - gpio-controller
+
+additionalProperties: false
+
+...
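Note: this schema ships without an example. A node conforming to it,
including a pinmux child matching "^.*-pins?$", might look like the following
sketch (the child node name is hypothetical):

    axp_gpio: gpio {
        compatible = "x-powers,axp209-gpio";
        gpio-controller;
        #gpio-cells = <2>;

        gpio0-adc-pin {
            pins = "GPIO0";
            function = "adc";
        };
    };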
diff --git a/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml b/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml
new file mode 100644
index 000000000000..f333ee2288e7
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/xlnx,gpio-xilinx.yaml
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/xlnx,gpio-xilinx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Xilinx AXI GPIO controller
+
+maintainers:
+ - Neeli Srinivas <srinivas.neeli@xilinx.com>
+
+description:
+  The AXI GPIO design provides a general-purpose input/output interface
+  on an AXI4-Lite bus. The AXI GPIO can be configured as either
+ a single or a dual-channel device. The width of each channel is
+ independently configurable. The channels can be configured to
+ generate an interrupt when a transition on any of their inputs occurs.
+
+properties:
+ compatible:
+ enum:
+ - xlnx,xps-gpio-1.00.a
+
+ reg:
+ maxItems: 1
+
+ "#gpio-cells":
+ const: 2
+
+ interrupts:
+ maxItems: 1
+
+ gpio-controller: true
+
+ gpio-line-names:
+ description: strings describing the names of each gpio line
+ minItems: 1
+ maxItems: 64
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 2
+
+ clocks:
+ maxItems: 1
+
+ interrupt-names: true
+
+ xlnx,all-inputs:
+ $ref: /schemas/types.yaml#/definitions/uint32
+    description: This option sets all bits of GPIO channel 1 to input mode.
+
+ xlnx,all-inputs-2:
+ $ref: /schemas/types.yaml#/definitions/uint32
+    description: This option sets all bits of GPIO channel 2 to input mode.
+
+ xlnx,all-outputs:
+ $ref: /schemas/types.yaml#/definitions/uint32
+    description: This option sets all bits of GPIO channel 1 to output mode.
+
+ xlnx,all-outputs-2:
+ $ref: /schemas/types.yaml#/definitions/uint32
+    description: This option sets all bits of GPIO channel 2 to output mode.
+
+ xlnx,dout-default:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Sets the default value of all the enabled bits of
+ channel1.
+ default: 0
+
+ xlnx,dout-default-2:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Sets the default value of all the enabled bits of
+ channel2.
+ default: 0
+
+ xlnx,gpio-width:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: The value defines the bit width of the GPIO channel1.
+ minimum: 1
+ maximum: 32
+ default: 32
+
+ xlnx,gpio2-width:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: The value defines the bit width of the GPIO channel2.
+ minimum: 1
+ maximum: 32
+ default: 32
+
+ xlnx,interrupt-present:
+ $ref: /schemas/types.yaml#/definitions/uint32
+    description: This parameter enables the interrupt control logic
+      and interrupt registers in the GPIO module.
+ minimum: 0
+ maximum: 1
+ default: 0
+
+ xlnx,is-dual:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: This parameter enables a second GPIO channel (GPIO2).
+ minimum: 0
+ maximum: 1
+ default: 0
+
+ xlnx,tri-default:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: This value configures the input or output mode
+ of each bit of GPIO channel1.
+
+ xlnx,tri-default-2:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: This value configures the input or output mode
+ of each bit of GPIO channel2.
+
+required:
+ - reg
+ - compatible
+ - gpio-controller
+ - "#gpio-cells"
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ gpio@e000a000 {
+ compatible = "xlnx,xps-gpio-1.00.a";
+ reg = <0xa0020000 0x10000>;
+ #gpio-cells = <2>;
+ #interrupt-cells = <0x2>;
+ clocks = <&zynqmp_clk 71>;
+ gpio-controller;
+ interrupt-controller;
+ interrupt-names = "ip2intc_irpt";
+ interrupt-parent = <&gic>;
+ interrupts = <0 89 4>;
+ xlnx,all-inputs = <0x0>;
+ xlnx,all-inputs-2 = <0x0>;
+ xlnx,all-outputs = <0x0>;
+ xlnx,all-outputs-2 = <0x0>;
+ xlnx,dout-default = <0x0>;
+ xlnx,dout-default-2 = <0x0>;
+ xlnx,gpio-width = <0x20>;
+ xlnx,gpio2-width = <0x20>;
+ xlnx,interrupt-present = <0x1>;
+ xlnx,is-dual = <0x1>;
+ xlnx,tri-default = <0xFFFFFFFF>;
+ xlnx,tri-default-2 = <0xFFFFFFFF>;
+ };
+
+...
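Note: when xlnx,is-dual = <0>, the channel-2 knobs (xlnx,gpio2-width,
xlnx,all-inputs-2 and friends) can simply be omitted, since they default per
the schema. A trimmed single-channel sketch with a hypothetical address:

    gpio@a0010000 {
        compatible = "xlnx,xps-gpio-1.00.a";
        reg = <0xa0010000 0x10000>;
        #gpio-cells = <2>;
        gpio-controller;
        xlnx,gpio-width = <8>;
        xlnx,all-inputs = <0x1>;
        xlnx,is-dual = <0>;
    };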
diff --git a/Documentation/devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml b/Documentation/devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml
new file mode 100644
index 000000000000..31c0fc345903
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/xlnx,zynqmp-gpio-modepin.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/gpio/xlnx,zynqmp-gpio-modepin.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ZynqMP Mode Pin GPIO controller
+
+description:
+  PS_MODE is a 4-bit bank of boot-mode pins sampled on POR deassertion.
+  The Mode Pin GPIO controller exposes these pins (numbered 0 to 3 within
+  PS_MODE) as GPIOs, and every pin can be configured as input or output.
+
+maintainers:
+ - Piyush Mehta <piyush.mehta@xilinx.com>
+
+properties:
+ compatible:
+ const: xlnx,zynqmp-gpio-modepin
+
+ gpio-controller: true
+
+ "#gpio-cells":
+ const: 2
+
+required:
+ - compatible
+ - gpio-controller
+ - "#gpio-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ zynqmp-firmware {
+ gpio {
+ compatible = "xlnx,zynqmp-gpio-modepin";
+ gpio-controller;
+ #gpio-cells = <2>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml b/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml
index 6f98dd55fb4c..0400a361875d 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-bifrost.yaml
@@ -14,32 +14,53 @@ properties:
pattern: '^gpu@[a-f0-9]+$'
compatible:
- items:
- - enum:
- - amlogic,meson-g12a-mali
- - mediatek,mt8183-mali
- - realtek,rtd1619-mali
- - rockchip,px30-mali
- - rockchip,rk3568-mali
- - const: arm,mali-bifrost # Mali Bifrost GPU model/revision is fully discoverable
+ oneOf:
+ - items:
+ - enum:
+ - amlogic,meson-g12a-mali
+ - mediatek,mt8183-mali
+ - mediatek,mt8183b-mali
+ - mediatek,mt8186-mali
+ - realtek,rtd1619-mali
+ - renesas,r9a07g044-mali
+ - renesas,r9a07g054-mali
+ - rockchip,px30-mali
+ - rockchip,rk3568-mali
+ - const: arm,mali-bifrost # Mali Bifrost GPU model/revision is fully discoverable
+ - items:
+ - enum:
+ - mediatek,mt8195-mali
+ - const: mediatek,mt8192-mali
+ - const: arm,mali-valhall-jm # Mali Valhall GPU model/revision is fully discoverable
+ - items:
+ - enum:
+ - mediatek,mt8192-mali
+ - const: arm,mali-valhall-jm # Mali Valhall GPU model/revision is fully discoverable
reg:
maxItems: 1
interrupts:
+ minItems: 3
items:
- description: Job interrupt
- description: MMU interrupt
- description: GPU interrupt
+ - description: Event interrupt
interrupt-names:
+ minItems: 3
items:
- const: job
- const: mmu
- const: gpu
+ - const: event
clocks:
- maxItems: 1
+ minItems: 1
+ maxItems: 3
+
+ clock-names: true
mali-supply: true
@@ -49,10 +70,17 @@ properties:
power-domains:
minItems: 1
- maxItems: 3
+ maxItems: 5
+
+ power-domain-names:
+ minItems: 2
+ maxItems: 5
resets:
- maxItems: 2
+ minItems: 1
+ maxItems: 3
+
+ reset-names: true
"#cooling-cells":
const: 2
@@ -76,6 +104,13 @@ properties:
dma-coherent: true
+ nvmem-cell-names:
+ items:
+ - const: speed-bin
+
+ nvmem-cells:
+ maxItems: 1
+
required:
- compatible
- reg
@@ -92,8 +127,47 @@ allOf:
contains:
const: amlogic,meson-g12a-mali
then:
+ properties:
+ power-domains:
+ maxItems: 1
+ power-domain-names: false
+ required:
+ - resets
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - renesas,r9a07g044-mali
+ - renesas,r9a07g054-mali
+ then:
+ properties:
+ interrupts:
+ minItems: 4
+ interrupt-names:
+ minItems: 4
+ clocks:
+ minItems: 3
+ clock-names:
+ items:
+ - const: gpu
+ - const: bus
+ - const: bus_ace
+ power-domains:
+ maxItems: 1
+ power-domain-names: false
+ resets:
+ minItems: 3
+ reset-names:
+ items:
+ - const: rst
+ - const: axi_rst
+ - const: ace_rst
required:
+ - clock-names
+ - power-domains
- resets
+ - reset-names
- if:
properties:
compatible:
@@ -103,6 +177,7 @@ allOf:
properties:
power-domains:
minItems: 3
+ maxItems: 3
power-domain-names:
items:
- const: core0
@@ -115,9 +190,79 @@ allOf:
- power-domain-names
else:
properties:
+ sram-supply: false
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: mediatek,mt8183b-mali
+ then:
+ properties:
+ power-domains:
+ minItems: 3
+ maxItems: 3
+ power-domain-names:
+ items:
+ - const: core0
+ - const: core1
+ - const: core2
+ required:
+ - power-domains
+ - power-domain-names
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: mediatek,mt8186-mali
+ then:
+ properties:
+ power-domains:
+ minItems: 2
+ maxItems: 2
+ power-domain-names:
+ items:
+ - const: core0
+ - const: core1
+ required:
+ - power-domains
+ - power-domain-names
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: mediatek,mt8192-mali
+ then:
+ properties:
+ power-domains:
+ minItems: 5
+ power-domain-names:
+ items:
+ - const: core0
+ - const: core1
+ - const: core2
+ - const: core3
+ - const: core4
+ required:
+ - power-domains
+ - power-domain-names
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: rockchip,rk3568-mali
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ clock-names:
+ items:
+ - const: gpu
+ - const: bus
power-domains:
maxItems: 1
- sram-supply: false
+ power-domain-names: false
+ required:
+ - clock-names
examples:
- |
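Note: under the new rockchip,rk3568-mali conditional, a conforming node needs
"gpu" and "bus" clocks and at most one power domain. A hedged sketch; the
interrupt numbers and provider phandles are placeholders, and arm-gic.h is
assumed to be included:

    gpu@fde60000 {
        compatible = "rockchip,rk3568-mali", "arm,mali-bifrost";
        reg = <0xfde60000 0x4000>;
        interrupts = <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 68 IRQ_TYPE_LEVEL_HIGH>,
                     <GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH>;
        interrupt-names = "job", "mmu", "gpu";
        clocks = <&scmi_clk 1>, <&cru 2>;
        clock-names = "gpu", "bus";
        power-domains = <&power 1>;
        mali-supply = <&vdd_gpu>;
    };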
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-midgard.yaml b/Documentation/devicetree/bindings/gpu/arm,mali-midgard.yaml
index d209f272625d..2a25384ca3ef 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-midgard.yaml
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-midgard.yaml
@@ -74,7 +74,8 @@ properties:
- const: bus
mali-supply: true
- opp-table: true
+ opp-table:
+ type: object
power-domains:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.yaml b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.yaml
index eceaa176bd57..318122d95eb5 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.yaml
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.yaml
@@ -101,7 +101,8 @@ properties:
mali-supply: true
- opp-table: true
+ opp-table:
+ type: object
power-domains:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml b/Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml
index e6485f7b046f..dae55b8a267b 100644
--- a/Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml
+++ b/Documentation/devicetree/bindings/gpu/brcm,bcm-v3d.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/gpu/brcm,bcm-v3d.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Broadcom V3D GPU Bindings
+title: Broadcom V3D GPU
maintainers:
- Eric Anholt <eric@anholt.net>
@@ -16,6 +16,7 @@ properties:
compatible:
enum:
+ - brcm,2711-v3d
- brcm,7268-v3d
- brcm,7278-v3d
diff --git a/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvdec.yaml b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvdec.yaml
new file mode 100644
index 000000000000..ba4c6473ff92
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvdec.yaml
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpu/host1x/nvidia,tegra210-nvdec.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra NVDEC
+
+description: |
+ NVDEC is the hardware video decoder present on NVIDIA Tegra210
+ and newer chips. It is located on the Host1x bus and typically
+ programmed through Host1x channels.
+
+maintainers:
+ - Thierry Reding <treding@gmail.com>
+ - Mikko Perttunen <mperttunen@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^nvdec@[0-9a-f]*$"
+
+ compatible:
+ enum:
+ - nvidia,tegra210-nvdec
+ - nvidia,tegra186-nvdec
+ - nvidia,tegra194-nvdec
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: nvdec
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ items:
+ - const: nvdec
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ dma-coherent: true
+
+ interconnects:
+ items:
+ - description: DMA read memory client
+ - description: DMA read 2 memory client
+ - description: DMA write memory client
+
+ interconnect-names:
+ items:
+ - const: dma-mem
+ - const: read-1
+ - const: write
+
+ nvidia,host1x-class:
+ description: |
+ Host1x class of the engine, used to specify the targeted engine
+ when programming the engine through Host1x channels or when
+ configuring engine-specific behavior in Host1x.
+ default: 0xf0
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra186-clock.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+ #include <dt-bindings/power/tegra186-powergate.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ nvdec@15480000 {
+ compatible = "nvidia,tegra186-nvdec";
+ reg = <0x15480000 0x40000>;
+ clocks = <&bpmp TEGRA186_CLK_NVDEC>;
+ clock-names = "nvdec";
+ resets = <&bpmp TEGRA186_RESET_NVDEC>;
+ reset-names = "nvdec";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_NVDEC>;
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_NVDECSRD &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVDECSRD1 &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVDECSWR &emc>;
+ interconnect-names = "dma-mem", "read-1", "write";
+ iommus = <&smmu TEGRA186_SID_NVDEC>;
+ };
diff --git a/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvenc.yaml b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvenc.yaml
new file mode 100644
index 000000000000..c23dae713eb8
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvenc.yaml
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpu/host1x/nvidia,tegra210-nvenc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra NVENC
+
+description: |
+ NVENC is the hardware video encoder present on NVIDIA Tegra210
+ and newer chips. It is located on the Host1x bus and typically
+ programmed through Host1x channels.
+
+maintainers:
+ - Thierry Reding <treding@gmail.com>
+ - Mikko Perttunen <mperttunen@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^nvenc@[0-9a-f]*$"
+
+ compatible:
+ enum:
+ - nvidia,tegra210-nvenc
+ - nvidia,tegra186-nvenc
+ - nvidia,tegra194-nvenc
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: nvenc
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ items:
+ - const: nvenc
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ dma-coherent: true
+
+ interconnects:
+ minItems: 2
+ maxItems: 3
+
+ interconnect-names:
+ minItems: 2
+ maxItems: 3
+
+ nvidia,host1x-class:
+ description: |
+ Host1x class of the engine, used to specify the targeted engine
+ when programming the engine through Host1x channels or when
+ configuring engine-specific behavior in Host1x.
+ default: 0x21
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ enum:
+ - nvidia,tegra210-nvenc
+ - nvidia,tegra186-nvenc
+ then:
+ properties:
+ interconnects:
+ items:
+ - description: DMA read memory client
+ - description: DMA write memory client
+ interconnect-names:
+ items:
+ - const: dma-mem
+ - const: write
+ - if:
+ properties:
+ compatible:
+ enum:
+ - nvidia,tegra194-nvenc
+ then:
+ properties:
+ interconnects:
+ items:
+ - description: DMA read memory client
+ - description: DMA read 2 memory client
+ - description: DMA write memory client
+ interconnect-names:
+ items:
+ - const: dma-mem
+ - const: read-1
+ - const: write
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra186-clock.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+ #include <dt-bindings/power/tegra186-powergate.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ nvenc@154c0000 {
+ compatible = "nvidia,tegra186-nvenc";
+ reg = <0x154c0000 0x40000>;
+ clocks = <&bpmp TEGRA186_CLK_NVENC>;
+ clock-names = "nvenc";
+ resets = <&bpmp TEGRA186_RESET_NVENC>;
+ reset-names = "nvenc";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_MPE>;
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_NVENCSRD &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVENCSWR &emc>;
+ interconnect-names = "dma-mem", "write";
+ iommus = <&smmu TEGRA186_SID_NVENC>;
+ };
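Note: for nvidia,tegra194-nvenc the conditional above adds a third, "read-1",
interconnect. Relative to the Tegra186 example, only the interconnect lines
change, roughly as below (macro names assumed from the tegra194-mc binding
header):

    interconnects = <&mc TEGRA194_MEMORY_CLIENT_NVENCSRD &emc>,
                    <&mc TEGRA194_MEMORY_CLIENT_NVENCSRD1 &emc>,
                    <&mc TEGRA194_MEMORY_CLIENT_NVENCSWR &emc>;
    interconnect-names = "dma-mem", "read-1", "write";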
diff --git a/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvjpg.yaml b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvjpg.yaml
new file mode 100644
index 000000000000..99a33a5eac3f
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra210-nvjpg.yaml
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpu/host1x/nvidia,tegra210-nvjpg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra NVJPG
+
+description: |
+ NVJPG is the hardware JPEG decoder and encoder present on NVIDIA Tegra210
+ and newer chips. It is located on the Host1x bus and typically programmed
+ through Host1x channels.
+
+maintainers:
+ - Thierry Reding <treding@gmail.com>
+ - Mikko Perttunen <mperttunen@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^nvjpg@[0-9a-f]*$"
+
+ compatible:
+ enum:
+ - nvidia,tegra210-nvjpg
+ - nvidia,tegra186-nvjpg
+ - nvidia,tegra194-nvjpg
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: nvjpg
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ items:
+ - const: nvjpg
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ dma-coherent: true
+
+ interconnects:
+ items:
+ - description: DMA read memory client
+ - description: DMA write memory client
+
+ interconnect-names:
+ items:
+ - const: dma-mem
+ - const: write
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra186-clock.h>
+ #include <dt-bindings/memory/tegra186-mc.h>
+ #include <dt-bindings/power/tegra186-powergate.h>
+ #include <dt-bindings/reset/tegra186-reset.h>
+
+ nvjpg@15380000 {
+ compatible = "nvidia,tegra186-nvjpg";
+ reg = <0x15380000 0x40000>;
+ clocks = <&bpmp TEGRA186_CLK_NVJPG>;
+ clock-names = "nvjpg";
+ resets = <&bpmp TEGRA186_RESET_NVJPG>;
+ reset-names = "nvjpg";
+
+ power-domains = <&bpmp TEGRA186_POWER_DOMAIN_NVJPG>;
+ interconnects = <&mc TEGRA186_MEMORY_CLIENT_NVJPGSRD &emc>,
+ <&mc TEGRA186_MEMORY_CLIENT_NVJPGSWR &emc>;
+ interconnect-names = "dma-mem", "write";
+ iommus = <&smmu TEGRA186_SID_NVJPG>;
+ };
diff --git a/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra234-nvdec.yaml b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra234-nvdec.yaml
new file mode 100644
index 000000000000..0b7561c8b9bb
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpu/host1x/nvidia,tegra234-nvdec.yaml
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpu/host1x/nvidia,tegra234-nvdec.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NVIDIA Tegra234 NVDEC
+
+description: |
+ NVDEC is the hardware video decoder present on NVIDIA Tegra210
+ and newer chips. It is located on the Host1x bus and typically
+ programmed through Host1x channels.
+
+maintainers:
+ - Thierry Reding <treding@gmail.com>
+ - Mikko Perttunen <mperttunen@nvidia.com>
+
+properties:
+ $nodename:
+ pattern: "^nvdec@[0-9a-f]*$"
+
+ compatible:
+ enum:
+ - nvidia,tegra234-nvdec
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 3
+
+ clock-names:
+ items:
+ - const: nvdec
+ - const: fuse
+ - const: tsec_pka
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ items:
+ - const: nvdec
+
+ power-domains:
+ maxItems: 1
+
+ iommus:
+ maxItems: 1
+
+ dma-coherent: true
+
+ interconnects:
+ items:
+ - description: DMA read memory client
+ - description: DMA write memory client
+
+ interconnect-names:
+ items:
+ - const: dma-mem
+ - const: write
+
+ nvidia,memory-controller:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+      Phandle to the memory controller, used to determine information about
+      the NVDEC firmware secure carveout. This carveout is configured by the
+      bootloader and is not accessible to the CPU.
+
+  nvidia,bl-manifest-offset:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Offset of the bootloader manifest from the beginning of the firmware,
+      as configured by the bootloader.
+
+  nvidia,bl-code-offset:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Offset of the bootloader code section from the beginning of the
+      firmware, as configured by the bootloader.
+
+  nvidia,bl-data-offset:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Offset of the bootloader data section from the beginning of the
+      firmware, as configured by the bootloader.
+
+  nvidia,os-manifest-offset:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Offset of the operating system manifest from the beginning of the
+      firmware, as configured by the bootloader.
+
+  nvidia,os-code-offset:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Offset of the operating system code section from the beginning of the
+      firmware, as configured by the bootloader.
+
+  nvidia,os-data-offset:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description:
+      Offset of the operating system data section from the beginning of the
+      firmware, as configured by the bootloader.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - power-domains
+ - nvidia,memory-controller
+ - nvidia,bl-manifest-offset
+ - nvidia,bl-code-offset
+ - nvidia,bl-data-offset
+ - nvidia,os-manifest-offset
+ - nvidia,os-code-offset
+ - nvidia,os-data-offset
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/tegra234-clock.h>
+ #include <dt-bindings/memory/tegra234-mc.h>
+ #include <dt-bindings/power/tegra234-powergate.h>
+ #include <dt-bindings/reset/tegra234-reset.h>
+
+ nvdec@15480000 {
+ compatible = "nvidia,tegra234-nvdec";
+ reg = <0x15480000 0x00040000>;
+ clocks = <&bpmp TEGRA234_CLK_NVDEC>,
+ <&bpmp TEGRA234_CLK_FUSE>,
+ <&bpmp TEGRA234_CLK_TSEC_PKA>;
+ clock-names = "nvdec", "fuse", "tsec_pka";
+ resets = <&bpmp TEGRA234_RESET_NVDEC>;
+ reset-names = "nvdec";
+ power-domains = <&bpmp TEGRA234_POWER_DOMAIN_NVDEC>;
+ interconnects = <&mc TEGRA234_MEMORY_CLIENT_NVDECSRD &emc>,
+ <&mc TEGRA234_MEMORY_CLIENT_NVDECSWR &emc>;
+ interconnect-names = "dma-mem", "write";
+ iommus = <&smmu_niso1 TEGRA234_SID_NVDEC>;
+ dma-coherent;
+
+ nvidia,memory-controller = <&mc>;
+
+ /* Placeholder values, to be replaced with values from overlay */
+ nvidia,bl-manifest-offset = <0>;
+ nvidia,bl-data-offset = <0>;
+ nvidia,bl-code-offset = <0>;
+ nvidia,os-manifest-offset = <0>;
+ nvidia,os-data-offset = <0>;
+ nvidia,os-code-offset = <0>;
+ };
diff --git a/Documentation/devicetree/bindings/gpu/samsung-rotator.yaml b/Documentation/devicetree/bindings/gpu/samsung-rotator.yaml
index 62486f55177d..d60626ffb28e 100644
--- a/Documentation/devicetree/bindings/gpu/samsung-rotator.yaml
+++ b/Documentation/devicetree/bindings/gpu/samsung-rotator.yaml
@@ -53,4 +53,3 @@ examples:
clocks = <&clock 278>;
clock-names = "rotator";
};
-
diff --git a/Documentation/devicetree/bindings/gpu/vivante,gc.yaml b/Documentation/devicetree/bindings/gpu/vivante,gc.yaml
index 93e7244cdc0e..b1b10ea70ad9 100644
--- a/Documentation/devicetree/bindings/gpu/vivante,gc.yaml
+++ b/Documentation/devicetree/bindings/gpu/vivante,gc.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/gpu/vivante,gc.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Vivante GPU Bindings
+title: Vivante GPU
description: Vivante GPU core devices
diff --git a/Documentation/devicetree/bindings/h8300/cpu.txt b/Documentation/devicetree/bindings/h8300/cpu.txt
deleted file mode 100644
index 70cd58608f4b..000000000000
--- a/Documentation/devicetree/bindings/h8300/cpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-* H8/300 CPU bindings
-
-Required properties:
-
-- compatible: Compatible property value should be "renesas,h8300".
-- clock-frequency: Contains the clock frequency for CPU, in Hz.
-
-Example:
-
- cpu@0 {
- compatible = "renesas,h8300";
- clock-frequency = <20000000>;
- };
diff --git a/Documentation/devicetree/bindings/hwinfo/loongson,ls2k-chipid.yaml b/Documentation/devicetree/bindings/hwinfo/loongson,ls2k-chipid.yaml
new file mode 100644
index 000000000000..9d0c36ec1982
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwinfo/loongson,ls2k-chipid.yaml
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwinfo/loongson,ls2k-chipid.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson-2 SoC ChipID
+
+maintainers:
+ - Yinbo Zhu <zhuyinbo@loongson.cn>
+
+description: |
+  The Loongson-2 SoC contains many groups of global utilities register
+  blocks, of which the ChipID group registers record SoC version,
+  feature, vendor and ID information.
+
+properties:
+ compatible:
+ const: loongson,ls2k-chipid
+
+ reg:
+ maxItems: 1
+
+ little-endian: true
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ chipid: chipid@1fe00000 {
+ compatible = "loongson,ls2k-chipid";
+ reg = <0x1fe00000 0x3ffc>;
+ little-endian;
+ };
diff --git a/Documentation/devicetree/bindings/arm/renesas,prr.yaml b/Documentation/devicetree/bindings/hwinfo/renesas,prr.yaml
index 1f80767da38b..792f371cec03 100644
--- a/Documentation/devicetree/bindings/arm/renesas,prr.yaml
+++ b/Documentation/devicetree/bindings/hwinfo/renesas,prr.yaml
@@ -1,7 +1,7 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: http://devicetree.org/schemas/arm/renesas,prr.yaml#
+$id: http://devicetree.org/schemas/hwinfo/renesas,prr.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas Product Register
diff --git a/Documentation/devicetree/bindings/hwinfo/samsung,exynos-chipid.yaml b/Documentation/devicetree/bindings/hwinfo/samsung,exynos-chipid.yaml
new file mode 100644
index 000000000000..95cbdcb56efe
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwinfo/samsung,exynos-chipid.yaml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwinfo/samsung,exynos-chipid.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung Exynos SoC series ChipID
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - samsung,exynos4210-chipid
+ - samsung,exynos850-chipid
+
+ reg:
+ maxItems: 1
+
+ samsung,asv-bin:
+ description:
+      Adaptive Supply Voltage bin selection. This can be used to
+      determine the ASV bin of an SoC if the respective information is
+      missing from the CHIPID registers or the OTP memory.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [0, 1, 2, 3]
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ chipid@10000000 {
+ compatible = "samsung,exynos4210-chipid";
+ reg = <0x10000000 0x100>;
+ samsung,asv-bin = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/hwinfo/samsung,s5pv210-chipid.yaml b/Documentation/devicetree/bindings/hwinfo/samsung,s5pv210-chipid.yaml
new file mode 100644
index 000000000000..563ded4fca83
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwinfo/samsung,s5pv210-chipid.yaml
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwinfo/samsung,s5pv210-chipid.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Samsung S5PV210 SoC ChipID
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ const: samsung,s5pv210-chipid
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ chipid@e0000000 {
+ compatible = "samsung,s5pv210-chipid";
+ reg = <0xe0000000 0x1000>;
+ };
diff --git a/Documentation/devicetree/bindings/hwinfo/ti,k3-socinfo.yaml b/Documentation/devicetree/bindings/hwinfo/ti,k3-socinfo.yaml
new file mode 100644
index 000000000000..dada28b47ea0
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwinfo/ti,k3-socinfo.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwinfo/ti,k3-socinfo.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments K3 Multicore SoC platforms chipid module
+
+maintainers:
+ - Tero Kristo <t-kristo@ti.com>
+ - Nishanth Menon <nm@ti.com>
+
+description: |
+  On Texas Instruments (ARM64) K3 Multicore SoC platforms, the chipid
+  module is represented by the CTRLMMR_xxx_JTAGID register, which contains
+  information about the SoC ID and revision.
+
+properties:
+ $nodename:
+ pattern: "^chipid@[0-9a-f]+$"
+
+ compatible:
+ items:
+ - const: ti,am654-chipid
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ chipid@43000014 {
+ compatible = "ti,am654-chipid";
+ reg = <0x43000014 0x4>;
+ };
diff --git a/Documentation/devicetree/bindings/hwlock/allwinner,sun6i-a31-hwspinlock.yaml b/Documentation/devicetree/bindings/hwlock/allwinner,sun6i-a31-hwspinlock.yaml
index 10e5a53e447b..38478dad8b25 100644
--- a/Documentation/devicetree/bindings/hwlock/allwinner,sun6i-a31-hwspinlock.yaml
+++ b/Documentation/devicetree/bindings/hwlock/allwinner,sun6i-a31-hwspinlock.yaml
@@ -26,11 +26,15 @@ properties:
resets:
maxItems: 1
+ '#hwlock-cells':
+ const: 1
+
required:
- compatible
- reg
- clocks
- resets
+ - "#hwlock-cells"
additionalProperties: false
@@ -44,5 +48,6 @@ examples:
reg = <0x01c18000 0x1000>;
clocks = <&ccu CLK_BUS_SPINLOCK>;
resets = <&ccu RST_BUS_SPINLOCK>;
+ #hwlock-cells = <1>;
};
...
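Note: consumers then reference a lock through the single cell declared by
#hwlock-cells; a sketch with a hypothetical consumer node and label:

    some-device {
        /* claim hardware spinlock number 2 */
        hwlocks = <&hwspinlock 2>;
    };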
diff --git a/Documentation/devicetree/bindings/hwlock/qcom-hwspinlock.yaml b/Documentation/devicetree/bindings/hwlock/qcom-hwspinlock.yaml
index 1c7149f7d171..ee2726149cf3 100644
--- a/Documentation/devicetree/bindings/hwlock/qcom-hwspinlock.yaml
+++ b/Documentation/devicetree/bindings/hwlock/qcom-hwspinlock.yaml
@@ -15,9 +15,22 @@ description:
properties:
compatible:
- enum:
- - qcom,sfpb-mutex
- - qcom,tcsr-mutex
+ oneOf:
+ - enum:
+ - qcom,sfpb-mutex
+ - qcom,tcsr-mutex
+ - items:
+ - enum:
+ - qcom,apq8084-tcsr-mutex
+ - qcom,ipq6018-tcsr-mutex
+ - qcom,msm8226-tcsr-mutex
+ - qcom,msm8994-tcsr-mutex
+ - const: qcom,tcsr-mutex
+ - items:
+ - enum:
+ - qcom,msm8974-tcsr-mutex
+ - const: qcom,tcsr-mutex
+ - const: syscon
reg:
maxItems: 1
@@ -34,9 +47,9 @@ additionalProperties: false
examples:
- |
- tcsr_mutex: hwlock@1f40000 {
- compatible = "qcom,tcsr-mutex";
- reg = <0x01f40000 0x40000>;
- #hwlock-cells = <1>;
- };
+ hwlock@1f40000 {
+ compatible = "qcom,tcsr-mutex";
+ reg = <0x01f40000 0x40000>;
+ #hwlock-cells = <1>;
+ };
...
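Note: among the new entries, only qcom,msm8974-tcsr-mutex also requires the
syscon compatible, so a node for it chains three strings; the register
address in this sketch is hypothetical:

    hwlock@fd484000 {
        compatible = "qcom,msm8974-tcsr-mutex", "qcom,tcsr-mutex", "syscon";
        reg = <0xfd484000 0x1000>;
        #hwlock-cells = <1>;
    };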
diff --git a/Documentation/devicetree/bindings/hwlock/st,stm32-hwspinlock.yaml b/Documentation/devicetree/bindings/hwlock/st,stm32-hwspinlock.yaml
index 47cf9c8d97e9..829d1fdf4c67 100644
--- a/Documentation/devicetree/bindings/hwlock/st,stm32-hwspinlock.yaml
+++ b/Documentation/devicetree/bindings/hwlock/st,stm32-hwspinlock.yaml
@@ -4,11 +4,10 @@
$id: http://devicetree.org/schemas/hwlock/st,stm32-hwspinlock.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: STMicroelectronics STM32 Hardware Spinlock bindings
+title: STMicroelectronics STM32 Hardware Spinlock
maintainers:
- - Benjamin Gaignard <benjamin.gaignard@st.com>
- - Fabien Dessenne <fabien.dessenne@st.com>
+ - Fabien Dessenne <fabien.dessenne@foss.st.com>
properties:
"#hwlock-cells":
diff --git a/Documentation/devicetree/bindings/hwlock/ti,omap-hwspinlock.yaml b/Documentation/devicetree/bindings/hwlock/ti,omap-hwspinlock.yaml
index ae1b37dbee75..0a955c7b9706 100644
--- a/Documentation/devicetree/bindings/hwlock/ti,omap-hwspinlock.yaml
+++ b/Documentation/devicetree/bindings/hwlock/ti,omap-hwspinlock.yaml
@@ -39,39 +39,8 @@ additionalProperties: false
examples:
- |
- /* OMAP4 SoCs */
- hwspinlock: spinlock@4a0f6000 {
+ spinlock@4a0f6000 {
compatible = "ti,omap4-hwspinlock";
reg = <0x4a0f6000 0x1000>;
#hwlock-cells = <1>;
};
-
- - |
- / {
- /* K3 AM65x SoCs */
- model = "Texas Instruments K3 AM654 SoC";
- compatible = "ti,am654-evm", "ti,am654";
- #address-cells = <2>;
- #size-cells = <2>;
-
- bus@100000 {
- compatible = "simple-bus";
- #address-cells = <2>;
- #size-cells = <2>;
- ranges = <0x00 0x00100000 0x00 0x00100000 0x00 0x00020000>, /* ctrl mmr */
- <0x00 0x30800000 0x00 0x30800000 0x00 0x0bc00000>; /* Main NavSS */
-
- bus@30800000 {
- compatible = "simple-mfd";
- #address-cells = <2>;
- #size-cells = <2>;
- ranges = <0x00 0x30800000 0x00 0x30800000 0x00 0x0bc00000>;
-
- spinlock@30e00000 {
- compatible = "ti,am654-hwspinlock";
- reg = <0x00 0x30e00000 0x00 0x1000>;
- #hwlock-cells = <1>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml b/Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml
index 154bee851139..ca2b47320689 100644
--- a/Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adi,adm1177.yaml
@@ -8,7 +8,6 @@ title: Analog Devices ADM1177 Hot Swap Controller and Digital Power Monitor
maintainers:
- Michael Hennerich <michael.hennerich@analog.com>
- - Beniamin Bia <beniamin.bia@analog.com>
description: |
Analog Devices ADM1177 Hot Swap Controller and Digital Power Monitor
@@ -53,16 +52,16 @@ examples:
- |
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/interrupt-controller/irq.h>
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
pwmon@5a {
- compatible = "adi,adm1177";
- reg = <0x5a>;
- shunt-resistor-micro-ohms = <50000>; /* 50 mOhm */
- adi,shutdown-threshold-microamp = <1059000>; /* 1.059 A */
- adi,vrange-high-enable;
+ compatible = "adi,adm1177";
+ reg = <0x5a>;
+ shunt-resistor-micro-ohms = <50000>; /* 50 mOhm */
+ adi,shutdown-threshold-microamp = <1059000>; /* 1.059 A */
+ adi,vrange-high-enable;
};
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/adi,adm1266.yaml b/Documentation/devicetree/bindings/hwmon/adi,adm1266.yaml
index 43b4f4f57b49..4f8e11bd5142 100644
--- a/Documentation/devicetree/bindings/hwmon/adi,adm1266.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adi,adm1266.yaml
@@ -39,13 +39,13 @@ additionalProperties: false
examples:
- |
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
adm1266@40 {
- compatible = "adi,adm1266";
- reg = <0x40>;
+ compatible = "adi,adm1266";
+ reg = <0x40>;
};
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml b/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml
index 223393d7cafd..ab87f51c5aef 100644
--- a/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adi,adm1275.yaml
@@ -37,6 +37,72 @@ properties:
description:
Shunt resistor value in micro-Ohm.
+ adi,volt-curr-sample-average:
+ description: |
+ Number of samples to be used to report voltage and current values.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [1, 2, 4, 8, 16, 32, 64, 128]
+
+ adi,power-sample-average:
+ description: |
+ Number of samples to be used to report power values.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [1, 2, 4, 8, 16, 32, 64, 128]
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adm1075
+ - adi,adm1276
+ then:
+ properties:
+ adi,volt-curr-sample-average:
+ default: 128
+ adi,power-sample-average: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adm1275
+ then:
+ properties:
+ adi,volt-curr-sample-average:
+ default: 16
+ adi,power-sample-average: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adm1272
+ then:
+ properties:
+ adi,volt-curr-sample-average:
+ default: 128
+ adi,power-sample-average:
+ default: 128
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adm1278
+ - adi,adm1293
+ - adi,adm1294
+ then:
+ properties:
+ adi,volt-curr-sample-average:
+ default: 128
+ adi,power-sample-average:
+ default: 1
+
required:
- compatible
- reg
@@ -53,5 +119,7 @@ examples:
compatible = "adi,adm1272";
reg = <0x10>;
shunt-resistor-micro-ohms = <500>;
+ adi,volt-curr-sample-average = <128>;
+ adi,power-sample-average = <128>;
};
};
diff --git a/Documentation/devicetree/bindings/hwmon/adi,axi-fan-control.yaml b/Documentation/devicetree/bindings/hwmon/adi,axi-fan-control.yaml
index 6747b870f297..0cf3ed6212a6 100644
--- a/Documentation/devicetree/bindings/hwmon/adi,axi-fan-control.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adi,axi-fan-control.yaml
@@ -5,7 +5,7 @@
$id: http://devicetree.org/schemas/hwmon/adi,axi-fan-control.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Analog Devices AXI FAN Control Device Tree Bindings
+title: Analog Devices AXI FAN Control
maintainers:
- Nuno Sá <nuno.sa@analog.com>
@@ -49,15 +49,15 @@ additionalProperties: false
examples:
- |
fpga_axi: fpga-axi {
- #address-cells = <0x2>;
- #size-cells = <0x1>;
-
- axi_fan_control: axi-fan-control@80000000 {
- compatible = "adi,axi-fan-control-1.00.a";
- reg = <0x0 0x80000000 0x10000>;
- clocks = <&clk 71>;
- interrupts = <0 110 0>;
- pulses-per-revolution = <2>;
- };
+ #address-cells = <0x2>;
+ #size-cells = <0x1>;
+
+ axi_fan_control: axi-fan-control@80000000 {
+ compatible = "adi,axi-fan-control-1.00.a";
+ reg = <0x0 0x80000000 0x10000>;
+ clocks = <&clk 71>;
+ interrupts = <0 110 0>;
+ pulses-per-revolution = <2>;
+ };
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/adi,ltc2945.yaml b/Documentation/devicetree/bindings/hwmon/adi,ltc2945.yaml
new file mode 100644
index 000000000000..5cb66e97e816
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/adi,ltc2945.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/adi,ltc2945.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices LTC2945 wide range I2C power monitor
+
+maintainers:
+ - Guenter Roeck <linux@roeck-us.net>
+
+description: |
+  Analog Devices LTC2945 wide-range power monitor, accessed over I2C.
+
+ https://www.analog.com/media/en/technical-documentation/data-sheets/LTC2945.pdf
+
+properties:
+ compatible:
+ enum:
+ - adi,ltc2945
+
+ reg:
+ maxItems: 1
+
+ shunt-resistor-micro-ohms:
+ description:
+ Shunt resistor value in micro-Ohms
+ default: 1000
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ power-monitor@6e {
+ compatible = "adi,ltc2945";
+ reg = <0x6e>;
+ /* 10 milli-Ohm shunt resistor */
+ shunt-resistor-micro-ohms = <10000>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/hwmon/adi,ltc2947.yaml b/Documentation/devicetree/bindings/hwmon/adi,ltc2947.yaml
index bf04151b63d2..152935334c76 100644
--- a/Documentation/devicetree/bindings/hwmon/adi,ltc2947.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adi,ltc2947.yaml
@@ -87,15 +87,15 @@ additionalProperties: false
examples:
- |
spi {
- #address-cells = <1>;
- #size-cells = <0>;
-
- ltc2947_spi: ltc2947@0 {
- compatible = "adi,ltc2947";
- reg = <0>;
- /* accumulation takes place always for energ1/charge1. */
- /* accumulation only on positive current for energy2/charge2. */
- adi,accumulator-ctl-pol = <0 1>;
- };
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ltc2947_spi: ltc2947@0 {
+ compatible = "adi,ltc2947";
+ reg = <0>;
+ /* accumulation takes place always for energ1/charge1. */
+ /* accumulation only on positive current for energy2/charge2. */
+ adi,accumulator-ctl-pol = <0 1>;
+ };
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/adi,ltc2992.yaml b/Documentation/devicetree/bindings/hwmon/adi,ltc2992.yaml
index 64a8fcb7bc46..b39c632956e8 100644
--- a/Documentation/devicetree/bindings/hwmon/adi,ltc2992.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adi,ltc2992.yaml
@@ -32,6 +32,7 @@ properties:
patternProperties:
"^channel@([0-1])$":
type: object
+ additionalProperties: false
description: |
Represents the two supplies to be monitored.
@@ -55,26 +56,26 @@ additionalProperties: false
examples:
- |
- i2c1 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
- ltc2992@6F {
- #address-cells = <1>;
- #size-cells = <0>;
+ ltc2992@6f {
+ #address-cells = <1>;
+ #size-cells = <0>;
- compatible = "adi,ltc2992";
- reg = <0x6F>;
+ compatible = "adi,ltc2992";
+ reg = <0x6f>;
- channel@0 {
- reg = <0x0>;
- shunt-resistor-micro-ohms = <10000>;
- };
+ channel@0 {
+ reg = <0x0>;
+ shunt-resistor-micro-ohms = <10000>;
+ };
- channel@1 {
- reg = <0x1>;
- shunt-resistor-micro-ohms = <10000>;
- };
+ channel@1 {
+ reg = <0x1>;
+ shunt-resistor-micro-ohms = <10000>;
+ };
};
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/adi,max31760.yaml b/Documentation/devicetree/bindings/hwmon/adi,max31760.yaml
new file mode 100644
index 000000000000..9f2d08d7b978
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/adi,max31760.yaml
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/adi,max31760.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices MAX31760 Fan-Speed Controller
+
+maintainers:
+ - Ibrahim Tilki <Ibrahim.Tilki@analog.com>
+
+description: |
+ Analog Devices MAX31760 Fan-Speed Controller
+ https://datasheets.maximintegrated.com/en/ds/MAX31760.pdf
+
+properties:
+ compatible:
+ enum:
+ - adi,max31760
+
+ reg:
+ description: I2C address of slave device.
+ minimum: 0x50
+ maximum: 0x57
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ fan-controller@50 {
+ reg = <0x50>;
+ compatible = "adi,max31760";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/adt7475.yaml b/Documentation/devicetree/bindings/hwmon/adt7475.yaml
index 7d9c083632b9..051c976ab711 100644
--- a/Documentation/devicetree/bindings/hwmon/adt7475.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adt7475.yaml
@@ -57,10 +57,30 @@ patternProperties:
Configures bypassing the individual voltage input attenuator. If
set to 1 the attenuator is bypassed if set to 0 the attenuator is
not bypassed. If the property is absent then the attenuator
- retains it's configuration from the bios/bootloader.
+ retains its configuration from the bios/bootloader.
$ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
+ "^adi,pin(5|10)-function$":
+ description: |
+      Configures the function for pin 5 on the adi,adt7473 and adi,adt7475,
+      or pin 10 on the adi,adt7476 and adi,adt7490.
+ $ref: /schemas/types.yaml#/definitions/string
+ enum:
+ - pwm2
+ - smbalert#
+
+ "^adi,pin(9|14)-function$":
+ description: |
+      Configures the function for pin 9 on the adi,adt7473 and adi,adt7475,
+      or pin 14 on the adi,adt7476 and adi,adt7490.
+ $ref: /schemas/types.yaml#/definitions/string
+ enum:
+ - tach4
+ - therm#
+ - smbalert#
+ - gpio
+
required:
- compatible
- reg
@@ -79,6 +99,7 @@ examples:
adi,bypass-attenuator-in0 = <1>;
adi,bypass-attenuator-in1 = <0>;
adi,pwm-active-state = <1 0 1>;
+ adi,pin10-function = "smbalert#";
+ adi,pin14-function = "tach4";
};
};
-
diff --git a/Documentation/devicetree/bindings/hwmon/amd,sbrmi.yaml b/Documentation/devicetree/bindings/hwmon/amd,sbrmi.yaml
index 7598b083979c..353d81d89bf5 100644
--- a/Documentation/devicetree/bindings/hwmon/amd,sbrmi.yaml
+++ b/Documentation/devicetree/bindings/hwmon/amd,sbrmi.yaml
@@ -41,13 +41,13 @@ additionalProperties: false
examples:
- |
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
sbrmi@3c {
- compatible = "amd,sbrmi";
- reg = <0x3c>;
+ compatible = "amd,sbrmi";
+ reg = <0x3c>;
};
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/amd,sbtsi.yaml b/Documentation/devicetree/bindings/hwmon/amd,sbtsi.yaml
index 446b09f1ce94..75088244a274 100644
--- a/Documentation/devicetree/bindings/hwmon/amd,sbtsi.yaml
+++ b/Documentation/devicetree/bindings/hwmon/amd,sbtsi.yaml
@@ -42,13 +42,13 @@ additionalProperties: false
examples:
- |
- i2c0 {
+ i2c {
#address-cells = <1>;
#size-cells = <0>;
sbtsi@4c {
- compatible = "amd,sbtsi";
- reg = <0x4c>;
+ compatible = "amd,sbtsi";
+ reg = <0x4c>;
};
};
...
diff --git a/Documentation/devicetree/bindings/hwmon/dps650ab.txt b/Documentation/devicetree/bindings/hwmon/dps650ab.txt
deleted file mode 100644
index 76780e795899..000000000000
--- a/Documentation/devicetree/bindings/hwmon/dps650ab.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Bindings for Delta Electronics DPS-650-AB power supply
-
-Required properties:
-- compatible : "delta,dps650ab"
-- reg : I2C address, one of 0x58, 0x59.
-
-Example:
- dps650ab@58 {
- compatible = "delta,dps650ab";
- reg = <0x58>;
- };
diff --git a/Documentation/devicetree/bindings/hwmon/hih6130.txt b/Documentation/devicetree/bindings/hwmon/hih6130.txt
deleted file mode 100644
index 2c43837af4c2..000000000000
--- a/Documentation/devicetree/bindings/hwmon/hih6130.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Honeywell Humidicon HIH-6130 humidity/temperature sensor
---------------------------------------------------------
-
-Requires node properties:
-- compatible : "honeywell,hi6130"
-- reg : the I2C address of the device. This is 0x27.
-
-Example:
- hih6130@27 {
- compatible = "honeywell,hih6130";
- reg = <0x27>;
- };
diff --git a/Documentation/devicetree/bindings/hwmon/hpe,gxp-fan-ctrl.yaml b/Documentation/devicetree/bindings/hwmon/hpe,gxp-fan-ctrl.yaml
new file mode 100644
index 000000000000..4a52aac6be72
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/hpe,gxp-fan-ctrl.yaml
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/hpe,gxp-fan-ctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: HPE GXP Fan Controller
+
+maintainers:
+ - Nick Hawkins <nick.hawkins@hpe.com>
+
+description: |
+  The HPE GXP fan controller drives the system fans through an external
+  CPLD device.
+
+properties:
+ compatible:
+ const: hpe,gxp-fan-ctrl
+
+ reg:
+ items:
+ - description: Fan controller PWM
+ - description: Programmable logic
+ - description: Function 2
+
+ reg-names:
+ items:
+ - const: base
+ - const: pl
+ - const: fn2
+
+required:
+ - compatible
+ - reg
+ - reg-names
+
+additionalProperties: false
+
+examples:
+ - |
+ fan-controller@1000c00 {
+ compatible = "hpe,gxp-fan-ctrl";
+ reg = <0x1000c00 0x200>, <0xd1000000 0xff>, <0x80200000 0x100000>;
+ reg-names = "base", "pl", "fn2";
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt b/Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt
deleted file mode 100644
index d9a2719f9243..000000000000
--- a/Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-Device-tree bindings for IBM Common Form Factor Power Supply Versions 1 and 2
------------------------------------------------------------------------------
-
-Required properties:
- - compatible : Must be one of the following:
- "ibm,cffps1"
- "ibm,cffps2"
- or "ibm,cffps" if the system
- must support any version of the
- power supply
- - reg = < I2C bus address >; : Address of the power supply on the
- I2C bus.
-
-Example:
-
- i2c-bus@100 {
- #address-cells = <1>;
- #size-cells = <0>;
- #interrupt-cells = <1>;
- < more properties >
-
- power-supply@68 {
- compatible = "ibm,cffps1";
- reg = <0x68>;
- };
- };
diff --git a/Documentation/devicetree/bindings/hwmon/ibm,occ-hwmon.yaml b/Documentation/devicetree/bindings/hwmon/ibm,occ-hwmon.yaml
new file mode 100644
index 000000000000..3dbdc5af2804
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ibm,occ-hwmon.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ibm,occ-hwmon.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: IBM On-Chip Controller (OCC) accessed from a service processor
+
+maintainers:
+ - Eddie James <eajames@linux.ibm.com>
+
+description: |
+ The POWER processor On-Chip Controller (OCC) helps manage power and
+ thermals for the system. A service processor or baseboard management
+ controller can query the OCC for its power and thermal data to report
+ through hwmon.
+
+properties:
+ compatible:
+ enum:
+ - ibm,p9-occ-hwmon
+ - ibm,p10-occ-hwmon
+
+ ibm,no-poll-on-init:
+ description: When set, the OCC will not be polled during driver
+ initialization.
+ type: boolean
+
+required:
+ - compatible
+
+additionalProperties: false
+
+examples:
+ - |
+ hwmon {
+ compatible = "ibm,p10-occ-hwmon";
+ ibm,no-poll-on-init;
+ };
diff --git a/Documentation/devicetree/bindings/i2c/ibm,p8-occ-hwmon.txt b/Documentation/devicetree/bindings/hwmon/ibm,p8-occ-hwmon.txt
index 5dc5d2e2573d..5dc5d2e2573d 100644
--- a/Documentation/devicetree/bindings/i2c/ibm,p8-occ-hwmon.txt
+++ b/Documentation/devicetree/bindings/hwmon/ibm,p8-occ-hwmon.txt
diff --git a/Documentation/devicetree/bindings/hwmon/iio-hwmon.yaml b/Documentation/devicetree/bindings/hwmon/iio-hwmon.yaml
new file mode 100644
index 000000000000..c54b5986b365
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/iio-hwmon.yaml
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/hwmon/iio-hwmon.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: ADC-attached Hardware Sensor
+
+maintainers:
+ - Jonathan Cameron <jic23@kernel.org>
+
+description: >
+ Bindings for hardware monitoring devices connected to ADC controllers
+ supporting the Industrial I/O bindings.
+
+properties:
+ compatible:
+ const: iio-hwmon
+
+ io-channels:
+ minItems: 1
+ maxItems: 8 # Should be enough
+ description: >
+ List of phandles to ADC channels to read the monitoring values
+
+required:
+ - compatible
+ - io-channels
+
+additionalProperties: false
+
+examples:
+ - |
+ iio-hwmon {
+ compatible = "iio-hwmon";
+ io-channels = <&adc 1>, <&adc 2>;
+ };
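Note: the io-channels phandles above resolve against an IIO provider node that
declares #io-channel-cells. A minimal sketch of the producer/consumer pairing,
assuming a one-cell channel specifier (the provider node and its compatible are
illustrative, not part of this patch):

    adc: adc@48 {
        compatible = "ti,ads1015";      /* illustrative ADC provider */
        reg = <0x48>;
        #io-channel-cells = <1>;        /* one cell selects the channel */
    };

    iio-hwmon {
        compatible = "iio-hwmon";
        io-channels = <&adc 1>, <&adc 2>;  /* channels 1 and 2 become hwmon inputs */
    };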
diff --git a/Documentation/devicetree/bindings/hwmon/jc42.txt b/Documentation/devicetree/bindings/hwmon/jc42.txt
deleted file mode 100644
index f569db58f64a..000000000000
--- a/Documentation/devicetree/bindings/hwmon/jc42.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Properties for Jedec JC-42.4 compatible temperature sensors
-
-Required properties:
-- compatible: May include a device-specific string consisting of the
- manufacturer and the name of the chip. A list of supported
- chip names follows.
- Must include "jedec,jc-42.4-temp" for any Jedec JC-42.4
- compatible temperature sensor.
-
- Supported chip names:
- adi,adt7408
- atmel,at30ts00
- atmel,at30tse004
- onnn,cat6095
- onnn,cat34ts02
- maxim,max6604
- microchip,mcp9804
- microchip,mcp9805
- microchip,mcp9808
- microchip,mcp98243
- microchip,mcp98244
- microchip,mcp9843
- nxp,se97
- nxp,se98
- st,stts2002
- st,stts2004
- st,stts3000
- st,stts424
- st,stts424e
- idt,tse2002
- idt,tse2004
- idt,ts3000
- idt,ts3001
-
-- reg: I2C address
-
-Optional properties:
-- smbus-timeout-disable: When set, the smbus timeout function will be disabled.
- This is not supported on all chips.
-
-Example:
-
-temp-sensor@1a {
- compatible = "jedec,jc-42.4-temp";
- reg = <0x1a>;
-};
diff --git a/Documentation/devicetree/bindings/hwmon/jedec,jc42.yaml b/Documentation/devicetree/bindings/hwmon/jedec,jc42.yaml
new file mode 100644
index 000000000000..0e49b3901161
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/jedec,jc42.yaml
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/jedec,jc42.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Jedec JC-42.4 compatible temperature sensors
+
+maintainers:
+ - Jean Delvare <jdelvare@suse.com>
+ - Guenter Roeck <linux@roeck-us.net>
+
+select:
+ properties:
+ compatible:
+ const: jedec,jc-42.4-temp
+
+ required:
+ - compatible
+
+properties:
+ compatible:
+ oneOf:
+ - const: jedec,jc-42.4-temp
+ - items:
+ - enum:
+ - adi,adt7408
+ - atmel,at30ts00
+ - atmel,at30tse004
+ - idt,tse2002
+ - idt,tse2004
+ - idt,ts3000
+ - idt,ts3001
+ - maxim,max6604
+ - microchip,mcp9804
+ - microchip,mcp9805
+ - microchip,mcp9808
+ - microchip,mcp98243
+ - microchip,mcp98244
+ - microchip,mcp9843
+ - nxp,se97
+ - nxp,se97b
+ - nxp,se98
+ - onnn,cat6095
+ - onnn,cat34ts02
+ - st,stts2002
+ - st,stts2004
+ - st,stts3000
+ - st,stts424
+ - st,stts424e
+ - const: jedec,jc-42.4-temp
+
+ reg:
+ maxItems: 1
+
+ smbus-timeout-disable:
+ description: |
+ When set, the smbus timeout function will be disabled. This is not
+ supported on all chips.
+ type: boolean
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ temp-sensor@1a {
+ compatible = "jedec,jc-42.4-temp";
+ reg = <0x1a>;
+ };
+ };
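Note: per the oneOf above, a chip-specific compatible must be followed by the
"jedec,jc-42.4-temp" fallback. A sketch of that two-entry form (chip and
address are illustrative):

    temp-sensor@18 {
        compatible = "microchip,mcp9808", "jedec,jc-42.4-temp";
        reg = <0x18>;
    };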
diff --git a/Documentation/devicetree/bindings/hwmon/lltc,ltc4151.yaml b/Documentation/devicetree/bindings/hwmon/lltc,ltc4151.yaml
new file mode 100644
index 000000000000..b1a4c235376e
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/lltc,ltc4151.yaml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/lltc,ltc4151.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: LTC4151 High Voltage I2C Current and Voltage Monitor
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ const: lltc,ltc4151
+
+ reg:
+ maxItems: 1
+
+ shunt-resistor-micro-ohms:
+ description:
+ Shunt resistor value in micro-Ohms
+ default: 1000
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@6e {
+ compatible = "lltc,ltc4151";
+ reg = <0x6e>;
+ shunt-resistor-micro-ohms = <1500>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/lm70.txt b/Documentation/devicetree/bindings/hwmon/lm70.txt
deleted file mode 100644
index ea417a0d32af..000000000000
--- a/Documentation/devicetree/bindings/hwmon/lm70.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-* LM70/TMP121/LM71/LM74 thermometer.
-
-Required properties:
-- compatible: one of
- "ti,lm70"
- "ti,tmp121"
- "ti,tmp122"
- "ti,lm71"
- "ti,lm74"
-
-See Documentation/devicetree/bindings/spi/spi-bus.txt for more required and
-optional properties.
-
-Example:
-
-spi_master {
- temperature-sensor@0 {
- compatible = "ti,lm70";
- reg = <0>;
- spi-max-frequency = <1000000>;
- };
-};
diff --git a/Documentation/devicetree/bindings/hwmon/lm75.yaml b/Documentation/devicetree/bindings/hwmon/lm75.yaml
index 72980d083c21..8226e3b5d028 100644
--- a/Documentation/devicetree/bindings/hwmon/lm75.yaml
+++ b/Documentation/devicetree/bindings/hwmon/lm75.yaml
@@ -14,6 +14,7 @@ properties:
compatible:
enum:
- adi,adt75
+ - atmel,at30ts74
- dallas,ds1775
- dallas,ds75
- dallas,ds7505
diff --git a/Documentation/devicetree/bindings/hwmon/lm90.txt b/Documentation/devicetree/bindings/hwmon/lm90.txt
deleted file mode 100644
index 398dcb965751..000000000000
--- a/Documentation/devicetree/bindings/hwmon/lm90.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-* LM90 series thermometer.
-
-Required node properties:
-- compatible: manufacturer and chip name, one of
- "adi,adm1032"
- "adi,adt7461"
- "adi,adt7461a"
- "gmt,g781"
- "national,lm90"
- "national,lm86"
- "national,lm89"
- "national,lm99"
- "dallas,max6646"
- "dallas,max6647"
- "dallas,max6649"
- "dallas,max6657"
- "dallas,max6658"
- "dallas,max6659"
- "dallas,max6680"
- "dallas,max6681"
- "dallas,max6695"
- "dallas,max6696"
- "onnn,nct1008"
- "winbond,w83l771"
- "nxp,sa56004"
- "ti,tmp451"
-
-- reg: I2C bus address of the device
-
-- vcc-supply: vcc regulator for the supply voltage.
-
-Optional properties:
-- interrupts: Contains a single interrupt specifier which describes the
- LM90 "-ALERT" pin output.
- See interrupt-controller/interrupts.txt for the format.
-
-- #thermal-sensor-cells: should be set to 1. See thermal/thermal-sensor.yaml
- for details. See <include/dt-bindings/thermal/lm90.h> for the
- definition of the local, remote and 2nd remote sensor index
- constants.
-
-Example LM90 node:
-
-temp-sensor {
- compatible = "onnn,nct1008";
- reg = <0x4c>;
- vcc-supply = <&palmas_ldo6_reg>;
- interrupt-parent = <&gpio>;
- interrupts = <TEGRA_GPIO(O, 4) IRQ_TYPE_LEVEL_LOW>;
- #thermal-sensor-cells = <1>;
-}
diff --git a/Documentation/devicetree/bindings/hwmon/ltc4151.txt b/Documentation/devicetree/bindings/hwmon/ltc4151.txt
deleted file mode 100644
index d008a5ef525a..000000000000
--- a/Documentation/devicetree/bindings/hwmon/ltc4151.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-LTC4151 High Voltage I2C Current and Voltage Monitor
-
-Required properties:
-- compatible: Must be "lltc,ltc4151"
-- reg: I2C address
-
-Optional properties:
-- shunt-resistor-micro-ohms
- Shunt resistor value in micro-Ohms
- Defaults to <1000> if unset.
-
-Example:
-
-ltc4151@6e {
- compatible = "lltc,ltc4151";
- reg = <0x6e>;
- shunt-resistor-micro-ohms = <1500>;
-};
diff --git a/Documentation/devicetree/bindings/hwmon/mcp3021.txt b/Documentation/devicetree/bindings/hwmon/mcp3021.txt
deleted file mode 100644
index 294318ba6914..000000000000
--- a/Documentation/devicetree/bindings/hwmon/mcp3021.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-mcp3021 properties
-
-Required properties:
-- compatible: Must be one of the following:
- - "microchip,mcp3021" for mcp3021
- - "microchip,mcp3221" for mcp3221
-- reg: I2C address
-
-Optional properties:
-
-- reference-voltage-microvolt
- Reference voltage in microvolt (uV)
-
-Example:
-
-mcp3021@4d {
- compatible = "microchip,mcp3021";
- reg = <0x4d>;
-
- reference-voltage-microvolt = <4500000>; /* 4.5 V */
-};
diff --git a/Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml b/Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml
new file mode 100644
index 000000000000..390dd6755ff5
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/microchip,lan966x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip LAN966x Hardware Monitor
+
+maintainers:
+ - Michael Walle <michael@walle.cc>
+
+description: |
+ Microchip LAN966x temperature monitor and fan controller
+
+properties:
+ compatible:
+ enum:
+ - microchip,lan9668-hwmon
+
+ reg:
+ items:
+ - description: PVT registers
+ - description: FAN registers
+
+ reg-names:
+ items:
+ - const: pvt
+ - const: fan
+
+ clocks:
+ maxItems: 1
+
+ '#thermal-sensor-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ hwmon: hwmon@e2010180 {
+ compatible = "microchip,lan9668-hwmon";
+ reg = <0xe2010180 0xc>,
+ <0xe20042a8 0xc>;
+ reg-names = "pvt", "fan";
+ clocks = <&sys_clk>;
+ #thermal-sensor-cells = <0>;
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/microchip,mcp3021.yaml b/Documentation/devicetree/bindings/hwmon/microchip,mcp3021.yaml
new file mode 100644
index 000000000000..028d6e570131
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/microchip,mcp3021.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/microchip,mcp3021.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip MCP3021 A/D converter
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - microchip,mcp3021
+ - microchip,mcp3221
+
+ reg:
+ maxItems: 1
+
+ reference-voltage-microvolt:
+ description:
+ VDD supply power and reference voltage
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ adc@4d {
+ compatible = "microchip,mcp3021";
+ reg = <0x4d>;
+
+ reference-voltage-microvolt = <4500000>; /* 4.5 V */
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/microchip,sparx5-temp.yaml b/Documentation/devicetree/bindings/hwmon/microchip,sparx5-temp.yaml
index 76be625d5646..51e8619dbf3c 100644
--- a/Documentation/devicetree/bindings/hwmon/microchip,sparx5-temp.yaml
+++ b/Documentation/devicetree/bindings/hwmon/microchip,sparx5-temp.yaml
@@ -22,7 +22,7 @@ properties:
clocks:
items:
- - description: AHB reference clock
+ - description: System reference clock
'#thermal-sensor-cells':
const: 0
@@ -40,5 +40,5 @@ examples:
compatible = "microchip,sparx5-temp";
reg = <0x10508110 0xc>;
#thermal-sensor-cells = <0>;
- clocks = <&ahb_clk>;
+ clocks = <&sys_clk>;
};
diff --git a/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml b/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml
index b79f069a04c2..ae4f68d4e696 100644
--- a/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml
+++ b/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml
@@ -4,11 +4,37 @@
$id: http://devicetree.org/schemas/hwmon/moortec,mr75203.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Moortec Semiconductor MR75203 PVT Controller bindings
+title: Moortec Semiconductor MR75203 PVT Controller
maintainers:
- Rahul Tanwar <rtanwar@maxlinear.com>
+description: |
+ A Moortec PVT (Process, Voltage, Temperature) monitoring logic design can
+ include many different units.
+ Such a design usually consists of several of Moortec's embedded analog IPs,
+ and a single Moortec controller (mr75203) to configure and control the IPs.
+
+ Some of Moortec's analog hard IPs that can be used in a design:
+ *) Temperature Sensor (TS) - used to monitor core temperature (e.g. mr74137).
+ *) Voltage Monitor (VM) - used to monitor voltage levels (e.g. mr74138).
+ *) Process Detector (PD) - used to assess silicon speed (e.g. mr74139).
+ *) Delay Chain - ring oscillator connected to the PD, used to measure IO
+ based transistors (e.g. mr76008 ring oscillator at 1.1V, mr76007 ring
+ oscillator at 1.8V).
+ *) Pre Scaler - provides divide-by-X scaling of input voltage, which can then
+ be presented for VM for measurement within its range (e.g. mr76006 -
+ divide by 2 pre-scaler).
+
+ TS, VM & PD also include a digital interface, which consists of configuration
+ inputs and measurement outputs.
+
+ Some of the units come in a number of series; each series can have
+ slightly different characteristics.
+
+ The mr75203 binding describes configuration for the controller unit, but also
+ for some of the analog IPs.
+
properties:
compatible:
const: moortec,mr75203
@@ -44,13 +70,76 @@ properties:
"#thermal-sensor-cells":
const: 1
+ moortec,vm-active-channels:
+ description:
+ Defines the number of channels per VM that are actually used and are
+ connected to some input source.
+ Maximum number of items - number of VMs.
+ Maximum value of each item - number of channels.
+ Minimum value of each item - 0 (which means the entire VM sensor is not used).
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+
+ moortec,vm-pre-scaler-x2:
+ description:
+ Defines the channels that use a mr76006 pre-scaler to divide the input
+ source by 2.
+ The pre-scaler is used for input sources that exceed the VM input range.
+ The driver uses this information to present the actual value of the
+ voltage source to the user.
+ For channels that are not listed, no pre-scaler is assumed.
+ Maximum number of items - total number of channels in all VMs.
+ Each channel should not appear more than once.
+ $ref: /schemas/types.yaml#/definitions/uint8-array
+
+ moortec,ts-series:
+ description:
+ Definition of the temperature equation and coefficients that shall be
+ used to convert the digital output to a value in milli-Celsius.
+ minimum: 5
+ maximum: 6
+ default: 5
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ moortec,ts-coeff-g:
+ description:
+ G coefficient for temperature equation.
+ Default for series 5 = 60000
+ Default for series 6 = 57400
+ multipleOf: 1000
+ minimum: 1000
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ moortec,ts-coeff-h:
+ description:
+ H coefficient for temperature equation.
+ Default for series 5 = 200000
+ Default for series 6 = 249400
+ multipleOf: 1000
+ minimum: 1000
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ moortec,ts-coeff-cal5:
+ description:
+ cal5 coefficient for temperature equation.
+ Default for series 5 = 4094
+ Default for series 6 = 4096
+ minimum: 1
+ $ref: /schemas/types.yaml#/definitions/uint32
+
+ moortec,ts-coeff-j:
+ description:
+ J coefficient for temperature equation.
+ Default for series 5 = -100
+ Default for series 6 = 0
+ multipleOf: 1000
+ maximum: 0
+ $ref: /schemas/types.yaml#/definitions/int32
+
required:
- compatible
- reg
- reg-names
- - intel,vm-map
- clocks
- - resets
- "#thermal-sensor-cells"
additionalProperties: false
@@ -67,5 +156,9 @@ examples:
intel,vm-map = [03 01 04 ff ff];
clocks = <&osc0>;
resets = <&rcu0 0x40 7>;
+ moortec,vm-active-channels = /bits/ 8 <0x10 0x05>;
+ moortec,vm-pre-scaler-x2 = /bits/ 8 <5 6 20>;
+ moortec,ts-coeff-g = <61400>;
+ moortec,ts-coeff-h = <253700>;
#thermal-sensor-cells = <1>;
};
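Note: reading the new example properties against the descriptions above:
/bits/ 8 <0x10 0x05> marks 16 active channels on the first VM and 5 on the
second, and <5 6 20> lists the absolute channel indices sitting behind a
mr76006 divide-by-2 pre-scaler. Annotated for illustration:

    moortec,vm-active-channels = /bits/ 8 <0x10 0x05>; /* VM0: 16 channels, VM1: 5 */
    moortec,vm-pre-scaler-x2 = /bits/ 8 <5 6 20>;      /* channels 5, 6 and 20 are divided by 2 */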
diff --git a/Documentation/devicetree/bindings/hwmon/national,lm90.yaml b/Documentation/devicetree/bindings/hwmon/national,lm90.yaml
new file mode 100644
index 000000000000..7b9d48d6d6da
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/national,lm90.yaml
@@ -0,0 +1,227 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/national,lm90.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: LM90 series thermometer
+
+maintainers:
+ - Jean Delvare <jdelvare@suse.com>
+ - Guenter Roeck <linux@roeck-us.net>
+
+properties:
+ compatible:
+ enum:
+ - adi,adm1032
+ - adi,adt7461
+ - adi,adt7461a
+ - adi,adt7481
+ - dallas,max6646
+ - dallas,max6647
+ - dallas,max6649
+ - dallas,max6657
+ - dallas,max6658
+ - dallas,max6659
+ - dallas,max6680
+ - dallas,max6681
+ - dallas,max6695
+ - dallas,max6696
+ - gmt,g781
+ - national,lm86
+ - national,lm89
+ - national,lm90
+ - national,lm99
+ - nxp,sa56004
+ - onnn,nct1008
+ - ti,tmp451
+ - ti,tmp461
+ - winbond,w83l771
+
+ interrupts:
+ items:
+ - description: |
+ Single interrupt specifier which describes the LM90 "-ALERT" pin
+ output.
+
+ reg:
+ maxItems: 1
+
+ "#thermal-sensor-cells":
+ const: 1
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
+ vcc-supply:
+ description: phandle to the regulator that provides the +VCC supply
+
+ ti,extended-range-enable:
+ description: Set to enable extended range temperature.
+ type: boolean
+
+required:
+ - compatible
+ - reg
+
+patternProperties:
+ "^channel@([0-2])$":
+ type: object
+ description: Represents channels of the device and their specific configuration.
+
+ properties:
+ reg:
+ description: The channel number. 0 is local channel, 1-2 are remote channels.
+ items:
+ minimum: 0
+ maximum: 2
+
+ label:
+ description: A descriptive name for this channel, like "ambient" or "psu".
+
+ temperature-offset-millicelsius:
+ description: Temperature offset to be added to or subtracted from remote temperature measurements.
+
+ required:
+ - reg
+
+ additionalProperties: false
+
+allOf:
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adt7461
+ - adi,adt7461a
+ - adi,adt7481
+ - ti,tmp451
+ - ti,tmp461
+ then:
+ properties:
+ ti,extended-range-enable: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - dallas,max6646
+ - dallas,max6647
+ - dallas,max6649
+ - dallas,max6657
+ - dallas,max6658
+ - dallas,max6659
+ - dallas,max6695
+ - dallas,max6696
+ then:
+ patternProperties:
+ "^channel@([0-2])$":
+ properties:
+ temperature-offset-millicelsius: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adt7461
+ - adi,adt7461a
+ - adi,adt7481
+ - onnn,nct1008
+ then:
+ patternProperties:
+ "^channel@([0-2])$":
+ properties:
+ temperature-offset-millicelsius:
+ maximum: 127750
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adm1032
+ - dallas,max6680
+ - dallas,max6681
+ - gmt,g781
+ - national,lm86
+ - national,lm89
+ - national,lm90
+ - national,lm99
+ - nxp,sa56004
+ - winbond,w83l771
+ then:
+ patternProperties:
+ "^channel@([0-2])$":
+ properties:
+ temperature-offset-millicelsius:
+ maximum: 127875
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - ti,tmp451
+ - ti,tmp461
+ then:
+ patternProperties:
+ "^channel@([0-2])$":
+ properties:
+ temperature-offset-millicelsius:
+ maximum: 127937
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "onnn,nct1008";
+ reg = <0x4c>;
+ vcc-supply = <&palmas_ldo6_reg>;
+ interrupts = <4 IRQ_TYPE_LEVEL_LOW>;
+ #thermal-sensor-cells = <1>;
+ };
+ };
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "adi,adt7481";
+ reg = <0x4c>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ channel@0 {
+ reg = <0x0>;
+ label = "local";
+ };
+
+ channel@1 {
+ reg = <0x1>;
+ label = "front";
+ temperature-offset-millicelsius = <4000>;
+ };
+
+ channel@2 {
+ reg = <0x2>;
+ label = "back";
+ temperature-offset-millicelsius = <750>;
+ };
+ };
+ };
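Note: the three temperature-offset-millicelsius maxima in the allOf blocks
track the offset register resolution of each chip family (a reading of the
limits, assuming the usual integer-plus-fraction offset registers):

    127750 mC = 127 + 3/4   degC  (0.25   degC steps)
    127875 mC = 127 + 7/8   degC  (0.125  degC steps)
    127937 mC = 127 + 15/16 degC  (0.0625 degC steps, truncated to whole millidegrees)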
diff --git a/Documentation/devicetree/bindings/hwmon/ntc-thermistor.yaml b/Documentation/devicetree/bindings/hwmon/ntc-thermistor.yaml
new file mode 100644
index 000000000000..3d0146e20d3e
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ntc-thermistor.yaml
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ntc-thermistor.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NTC thermistor temperature sensors
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+description: |
+ Thermistors with negative temperature coefficient (NTC) are resistors that
+ vary in resistance in an often non-linear way in relation to temperature.
+ The negative temperature coefficient means that the resistance decreases
+ as the temperature rises. Since the relationship between resistance and
+ temperature is non-linear, software drivers most often need to use a look
+ up table and interpolation to get from resistance to temperature.
+
+ When used in practice, a thermistor is often connected between ground, a
+ pull-up resistor and/or a pull-down resistor, and a fixed voltage like this:
+
+ + e.g. 5V = pull-up voltage (puv)
+ |
+ +-+
+ | |
+ | | Pull-up resistor
+ | | (puo)
+ +-+
+ |-------------------------o
+ +-+ | ^
+ | |/ |
+ | / |
+ |/| Thermistor | Measured voltage (mv)
+ / | | "connected ground"
+ /| | |
+ +-+ |
+ |-------------------------o
+ +-+ ^
+ | | |
+ | | Pull-down resistor | Measured voltage (mv)
+ | | (pdo) | "connected positive"
+ +-+ |
+ | |
+ | v
+ + GND GND
+
+ The arrangements of where we measure the voltage over the thermistor are
+ called "connected ground" and "connected positive" and shall be understood as
+ the cases when either pull-up or pull-down resistance is zero.
+
+ If the pull-up resistance is 0 one end of the thermistor is connected to the
+ positive voltage and we get the thermistor on top of a pull-down resistor
+ and we take the measure between the thermistor and the pull-down resistor.
+
+ Conversely if the pull-down resistance is zero, one end of the thermistor is
+ connected to ground and we get the thermistor under the pull-up resistor
+ and we take the measure between the pull-up resistor and the thermistor.
+
+ We can use both pull-up and pull-down resistors at the same time, and then
+ the figure illustrates where the voltage will be measured for the "connected
+ ground" and "connected positive" cases.
+
+properties:
+ $nodename:
+ pattern: "^thermistor(.*)?$"
+
+ compatible:
+ oneOf:
+ - const: epcos,b57330v2103
+ - const: epcos,b57891s0103
+ - const: murata,ncp15wb473
+ - const: murata,ncp18wb473
+ - const: murata,ncp21wb473
+ - const: murata,ncp03wb473
+ - const: murata,ncp15wl333
+ - const: murata,ncp03wf104
+ - const: murata,ncp15xh103
+ - const: samsung,1404-001221
+ # Deprecated "ntp," compatible strings
+ - const: ntc,ncp15wb473
+ deprecated: true
+ - const: ntc,ncp18wb473
+ deprecated: true
+ - const: ntc,ncp21wb473
+ deprecated: true
+ - const: ntc,ncp03wb473
+ deprecated: true
+ - const: ntc,ncp15wl333
+ deprecated: true
+
+ "#thermal-sensor-cells":
+ description: Thermal sensor cells if used for thermal sensing.
+ const: 0
+
+ pullup-uv:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Pull-up voltage in microvolts. Must always be specified.
+
+ pullup-ohm:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Pull-up resistance in ohms. Must always be specified, even
+ if zero.
+
+ pulldown-ohm:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Pull-down resistance in ohms. Must always be specified, even
+ if zero.
+
+ connected-positive:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description: Indicates how the thermistor is connected in series with
+ a pull-up and/or a pull-down resistor. See the description above for
+ an illustration. If this flag is NOT specified, the thermistor is assumed
+ to be connected-ground, which usually means a pull-down resistance of
+ zero but complex arrangements are possible.
+
+ # See /schemas/iio/adc/adc.yaml
+ io-channels:
+ maxItems: 1
+ description: IIO ADC channel to read the voltage over the resistor. Must
+ always be specified.
+
+required:
+ - compatible
+ - pullup-uv
+ - pullup-ohm
+ - pulldown-ohm
+ - io-channels
+
+additionalProperties: false
+
+examples:
+ - |
+ thermistor {
+ compatible = "murata,ncp18wb473";
+ io-channels = <&gpadc 0x06>;
+ pullup-uv = <1800000>;
+ pullup-ohm = <220000>;
+ pulldown-ohm = <0>;
+ #thermal-sensor-cells = <0>;
+ };
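Note: for the pull-down-of-zero ("connected ground") arrangement assumed by
the example above, the thermistor resistance follows from the voltage
divider. A worked calculation using the example's pullup-uv = 1800000 and
pullup-ohm = 220000, with an assumed ADC reading of 600000 uV:

    mv = puv * Rt / (puo + Rt)
    Rt = puo * mv / (puv - mv)
       = 220000 * 600000 / (1800000 - 600000)
       = 110000 ohms, then mapped to temperature via the chip's R/T table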
diff --git a/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt b/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt
deleted file mode 100644
index 4c5c3712970e..000000000000
--- a/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-NTC Thermistor hwmon sensors
--------------------------------
-
-Requires node properties:
-- "compatible" value : one of
- "epcos,b57330v2103"
- "epcos,b57891s0103"
- "murata,ncp15wb473"
- "murata,ncp18wb473"
- "murata,ncp21wb473"
- "murata,ncp03wb473"
- "murata,ncp15wl333"
- "murata,ncp03wf104"
- "murata,ncp15xh103"
-
-/* Usage of vendor name "ntc" is deprecated */
-<DEPRECATED> "ntc,ncp15wb473"
-<DEPRECATED> "ntc,ncp18wb473"
-<DEPRECATED> "ntc,ncp21wb473"
-<DEPRECATED> "ntc,ncp03wb473"
-<DEPRECATED> "ntc,ncp15wl333"
-
-- "pullup-uv" Pull up voltage in micro volts
-- "pullup-ohm" Pull up resistor value in ohms
-- "pulldown-ohm" Pull down resistor value in ohms
-- "connected-positive" Always ON, If not specified.
- Status change is possible.
-- "io-channels" Channel node of ADC to be used for
- conversion.
-
-Optional node properties:
-- "#thermal-sensor-cells" Used to expose itself to thermal fw.
-
-Read more about iio bindings at
- https://github.com/devicetree-org/dt-schema/blob/master/schemas/iio/
-
-Example:
- ncp15wb473@0 {
- compatible = "murata,ncp15wb473";
- pullup-uv = <1800000>;
- pullup-ohm = <47000>;
- pulldown-ohm = <0>;
- io-channels = <&adc 3>;
- };
diff --git a/Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml b/Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml
new file mode 100644
index 000000000000..358b262431fc
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: http://devicetree.org/schemas/hwmon/nuvoton,nct6775.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Nuvoton NCT6775 and compatible Super I/O chips
+
+maintainers:
+ - Zev Weiss <zev@bewilderbeest.net>
+
+properties:
+ compatible:
+ enum:
+ - nuvoton,nct6106
+ - nuvoton,nct6116
+ - nuvoton,nct6775
+ - nuvoton,nct6776
+ - nuvoton,nct6779
+ - nuvoton,nct6791
+ - nuvoton,nct6792
+ - nuvoton,nct6793
+ - nuvoton,nct6795
+ - nuvoton,nct6796
+ - nuvoton,nct6797
+ - nuvoton,nct6798
+
+ reg:
+ maxItems: 1
+
+ nuvoton,tsi-channel-mask:
+ description:
+ Bitmask indicating which TSI temperature sensor channels are
+ active. LSB is TSI0, bit 1 is TSI1, etc.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ maximum: 0xff
+ default: 0
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ superio@4d {
+ compatible = "nuvoton,nct6779";
+ reg = <0x4d>;
+ nuvoton,tsi-channel-mask = <0x03>;
+ };
+ };
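Note: per the LSB-first layout of nuvoton,tsi-channel-mask, the example's
0x03 activates TSI0 and TSI1. Non-adjacent channels work the same way; an
illustrative value:

    nuvoton,tsi-channel-mask = <0x05>;  /* bit 0 = TSI0, bit 2 = TSI2 */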
diff --git a/Documentation/devicetree/bindings/hwmon/nuvoton,nct7802.yaml b/Documentation/devicetree/bindings/hwmon/nuvoton,nct7802.yaml
new file mode 100644
index 000000000000..cd8dcd797031
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/nuvoton,nct7802.yaml
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: http://devicetree.org/schemas/hwmon/nuvoton,nct7802.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Nuvoton NCT7802Y Hardware Monitoring IC
+
+maintainers:
+ - Guenter Roeck <linux@roeck-us.net>
+
+description: |
+ The NCT7802Y is a hardware monitor IC which supports one on-die and up to
+ 5 remote temperature sensors with SMBus interface.
+
+ Datasheets:
+ https://www.nuvoton.com/export/resource-files/Nuvoton_NCT7802Y_Datasheet_V12.pdf
+
+additionalProperties: false
+
+properties:
+ compatible:
+ enum:
+ - nuvoton,nct7802
+
+ reg:
+ maxItems: 1
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+patternProperties:
+ "^channel@[0-3]$":
+ type: object
+
+ additionalProperties: false
+
+ properties:
+ reg:
+ items:
+ - enum:
+ - 0 # Local Temperature Sensor ("LTD")
+ - 1 # Remote Temperature Sensor or Voltage Sensor 1 ("RTD1")
+ - 2 # Remote Temperature Sensor or Voltage Sensor 2 ("RTD2")
+ - 3 # Remote Temperature Sensor or Voltage Sensor 3 ("RTD3")
+
+ sensor-type:
+ items:
+ - enum:
+ - temperature
+ - voltage
+
+ temperature-mode:
+ items:
+ - enum:
+ - thermistor
+ - thermal-diode
+
+ required:
+ - reg
+
+ allOf:
+ # For channels RTD1, RTD2 and RTD3, require sensor-type to be set.
+ # Otherwise (for all other channels), do not allow temperature-mode to be
+ # set.
+ - if:
+ properties:
+ reg:
+ items:
+ - enum:
+ - 1
+ - 2
+ - 3
+ then:
+ required:
+ - sensor-type
+ else:
+ not:
+ required:
+ - sensor-type
+
+ # For channels RTD1 and RTD2 and if sensor-type is "temperature", require
+ # temperature-mode to be set. Otherwise (for all other channels or
+ # sensor-type settings), do not allow temperature-mode to be set
+ - if:
+ properties:
+ reg:
+ items:
+ - enum:
+ - 1
+ - 2
+ sensor-type:
+ items:
+ - enum:
+ - temperature
+ then:
+ required:
+ - temperature-mode
+ else:
+ not:
+ required:
+ - temperature-mode
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ nct7802@28 {
+ compatible = "nuvoton,nct7802";
+ reg = <0x28>;
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ channel@0 { /* LTD */
+ reg = <0>;
+ };
+
+ channel@1 { /* RTD1 */
+ reg = <1>;
+ sensor-type = "voltage";
+ };
+
+ channel@2 { /* RTD2 */
+ reg = <2>;
+ sensor-type = "temperature";
+ temperature-mode = "thermal-diode";
+ };
+
+ channel@3 { /* RTD3 */
+ reg = <3>;
+ sensor-type = "temperature";
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/nxp,mc34vr500.yaml b/Documentation/devicetree/bindings/hwmon/nxp,mc34vr500.yaml
new file mode 100644
index 000000000000..306f67315835
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/nxp,mc34vr500.yaml
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/nxp,mc34vr500.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP MC34VR500 hwmon sensor
+
+maintainers:
+ - Mario Kicherer <dev@kicherer.org>
+
+properties:
+ compatible:
+ enum:
+ - nxp,mc34vr500
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ pmic@8 {
+ compatible = "nxp,mc34vr500";
+ reg = <0x08>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/pmbus/ti,lm25066.yaml b/Documentation/devicetree/bindings/hwmon/pmbus/ti,lm25066.yaml
new file mode 100644
index 000000000000..da8292bc32f5
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/pmbus/ti,lm25066.yaml
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: http://devicetree.org/schemas/hwmon/pmbus/ti,lm25066.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: National Semiconductor/Texas Instruments LM250x6/LM506x power-management ICs
+
+maintainers:
+ - Zev Weiss <zev@bewilderbeest.net>
+
+description: |
+ The LM25066 family of power-management ICs (a.k.a. hot-swap
+ controllers or eFuses in various contexts) consists of PMBus devices
+ that offer temperature, current, voltage, and power monitoring.
+
+ Datasheet: https://www.ti.com/lit/ds/symlink/lm25066.pdf
+
+properties:
+ compatible:
+ enum:
+ - ti,lm25056
+ - ti,lm25066
+ - ti,lm5064
+ - ti,lm5066
+ - ti,lm5066i
+
+ reg:
+ maxItems: 1
+
+ shunt-resistor-micro-ohms:
+ description:
+ Shunt (sense) resistor value in micro-Ohms
+ default: 1000
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ pmic@40 {
+ compatible = "ti,lm25066";
+ reg = <0x40>;
+ shunt-resistor-micro-ohms = <675>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/pwm-fan.txt b/Documentation/devicetree/bindings/hwmon/pwm-fan.txt
index 4509e688623a..48886f0ce415 100644
--- a/Documentation/devicetree/bindings/hwmon/pwm-fan.txt
+++ b/Documentation/devicetree/bindings/hwmon/pwm-fan.txt
@@ -1,67 +1 @@
-Bindings for a fan connected to the PWM lines
-
-Required properties:
-- compatible : "pwm-fan"
-- pwms : the PWM that is used to control the PWM fan
-- cooling-levels : PWM duty cycle values in a range from 0 to 255
- which correspond to thermal cooling states
-
-Optional properties:
-- fan-supply : phandle to the regulator that provides power to the fan
-- interrupts : This contains an interrupt specifier for each fan
- tachometer output connected to an interrupt source.
- The output signal must generate a defined number of
- interrupts per fan revolution, which require that
- it must be self resetting edge interrupts. See
- interrupt-controller/interrupts.txt for the format.
-- pulses-per-revolution : define the number of pulses per fan revolution for
- each tachometer input as an integer (default is 2
- interrupts per revolution). The value must be
- greater than zero.
-
-Example:
- fan0: pwm-fan {
- compatible = "pwm-fan";
- #cooling-cells = <2>;
- pwms = <&pwm 0 10000 0>;
- cooling-levels = <0 102 170 230>;
- };
-
- thermal-zones {
- cpu_thermal: cpu-thermal {
- thermal-sensors = <&tmu 0>;
- polling-delay-passive = <0>;
- polling-delay = <0>;
- trips {
- cpu_alert1: cpu-alert1 {
- temperature = <100000>; /* millicelsius */
- hysteresis = <2000>; /* millicelsius */
- type = "passive";
- };
- };
- cooling-maps {
- map0 {
- trip = <&cpu_alert1>;
- cooling-device = <&fan0 0 1>;
- };
- };
- };
-
-Example 2:
- fan0: pwm-fan {
- compatible = "pwm-fan";
- pwms = <&pwm 0 40000 0>;
- fan-supply = <&reg_fan>;
- interrupt-parent = <&gpio5>;
- interrupts = <1 IRQ_TYPE_EDGE_FALLING>;
- pulses-per-revolution = <2>;
- };
-
-Example 3:
- fan0: pwm-fan {
- compatible = "pwm-fan";
- pwms = <&pwm1 0 25000 0>;
- interrupts-extended = <&gpio1 1 IRQ_TYPE_EDGE_FALLING>,
- <&gpio2 5 IRQ_TYPE_EDGE_FALLING>;
- pulses-per-revolution = <2>, <1>;
- };
+This file has moved to pwm-fan.yaml.
diff --git a/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml b/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml
new file mode 100644
index 000000000000..4e5abf7580cc
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/pwm-fan.yaml
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/pwm-fan.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Fan connected to PWM lines
+
+maintainers:
+ - Jean Delvare <jdelvare@suse.com>
+ - Guenter Roeck <linux@roeck-us.net>
+
+properties:
+ compatible:
+ const: pwm-fan
+
+ cooling-levels:
+ description: PWM duty cycle values corresponding to thermal cooling states.
+ $ref: /schemas/types.yaml#/definitions/uint32-array
+ items:
+ maximum: 255
+
+ fan-supply:
+ description: Phandle to the regulator that provides power to the fan.
+
+ interrupts:
+ description:
+ This contains an interrupt specifier for each fan tachometer output
+ connected to an interrupt source. The output signal must generate a
+ defined number of interrupts per fan revolution, which requires
+ self-resetting edge interrupts.
+ maxItems: 1
+
+ pulses-per-revolution:
+ description:
+ Define the number of pulses per fan revolution for each tachometer
+ input as an integer.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1
+ maximum: 4
+ default: 2
+
+ pwms:
+ description: The PWM that is used to control the fan.
+ maxItems: 1
+
+ "#cooling-cells": true
+
+required:
+ - compatible
+ - pwms
+
+additionalProperties: false
+
+examples:
+ - |
+ pwm-fan {
+ compatible = "pwm-fan";
+ cooling-levels = <0 102 170 230>;
+ pwms = <&pwm 0 10000 0>;
+ #cooling-cells = <2>;
+ };
+
+ thermal-zones {
+ cpu_thermal: cpu-thermal {
+ thermal-sensors = <&tmu 0>;
+ polling-delay-passive = <0>;
+ polling-delay = <0>;
+
+ trips {
+ cpu_alert1: cpu-alert1 {
+ temperature = <100000>; /* millicelsius */
+ hysteresis = <2000>; /* millicelsius */
+ type = "passive";
+ };
+ };
+
+ cooling-maps {
+ map0 {
+ trip = <&cpu_alert1>;
+ cooling-device = <&fan0 0 1>;
+ };
+ };
+ };
+ };
+
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ pwm-fan {
+ compatible = "pwm-fan";
+ pwms = <&pwm 0 40000 0>;
+ fan-supply = <&reg_fan>;
+ interrupt-parent = <&gpio5>;
+ interrupts = <1 IRQ_TYPE_EDGE_FALLING>;
+ pulses-per-revolution = <2>;
+ };
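Note: cooling-levels maps thermal cooling states to PWM duty cycles out of
255, so the first example's <0 102 170 230> yields four states. The duty
each state requests (arithmetic added for illustration):

    state 0:   0/255 =  0 %  (fan off)
    state 1: 102/255 = 40 %
    state 2: 170/255 = 67 %
    state 3: 230/255 = 90 %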
diff --git a/Documentation/devicetree/bindings/hwmon/sensirion,sht15.yaml b/Documentation/devicetree/bindings/hwmon/sensirion,sht15.yaml
new file mode 100644
index 000000000000..80df7182ea28
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/sensirion,sht15.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/sensirion,sht15.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sensirion SHT15 humidity and temperature sensor
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ const: sensirion,sht15
+
+ clk-gpios:
+ maxItems: 1
+
+ data-gpios:
+ maxItems: 1
+
+ vcc-supply:
+ description: regulator that drives the VCC pin
+
+required:
+ - compatible
+ - clk-gpios
+ - data-gpios
+ - vcc-supply
+
+additionalProperties: false
+
+examples:
+ - |
+ sensor {
+ compatible = "sensirion,sht15";
+ clk-gpios = <&gpio4 12 0>;
+ data-gpios = <&gpio4 13 0>;
+ vcc-supply = <&reg_sht15>;
+
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_sensor>;
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml b/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml
index 7d49478d9668..159238efa9ed 100644
--- a/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml
+++ b/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml
@@ -10,7 +10,7 @@ maintainers:
- Christopher Ruehl chris.ruehl@gtsys.com.hk
description: |
- The SHTC1, SHTW1 and SHTC3 are digital humidity and temperature sensor
+ The SHTC1, SHTW1 and SHTC3 are digital humidity and temperature sensors
designed especially for battery-driven high-volume consumer electronics
applications.
For further information refer to Documentation/hwmon/shtc1.rst
@@ -31,13 +31,13 @@ properties:
sensirion,blocking-io:
$ref: /schemas/types.yaml#/definitions/flag
description:
- If set, the driver hold the i2c bus until measurement is finished.
+ If set, the driver holds the i2c bus until the measurement is finished.
sensirion,low-precision:
$ref: /schemas/types.yaml#/definitions/flag
description:
- If set, the sensor aquire data with low precision (not recommended).
- The driver aquire data with high precision by default.
+ If set, the sensor acquires data with low precision (not recommended).
+ The driver acquires data with high precision by default.
required:
- compatible
diff --git a/Documentation/devicetree/bindings/hwmon/sht15.txt b/Documentation/devicetree/bindings/hwmon/sht15.txt
deleted file mode 100644
index 6a80277cc426..000000000000
--- a/Documentation/devicetree/bindings/hwmon/sht15.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Sensirion SHT15 Humidity and Temperature Sensor
-
-Required properties:
-
- - "compatible": must be "sensirion,sht15".
- - "data-gpios": GPIO connected to the data line.
- - "clk-gpios": GPIO connected to the clock line.
- - "vcc-supply": regulator that drives the VCC pin.
-
-Example:
-
- sensor {
- pinctrl-names = "default";
- pinctrl-0 = <&pinctrl_sensor>;
- compatible = "sensirion,sht15";
- clk-gpios = <&gpio4 12 0>;
- data-gpios = <&gpio4 13 0>;
- vcc-supply = <&reg_sht15>;
- };
diff --git a/Documentation/devicetree/bindings/hwmon/starfive,jh71x0-temp.yaml b/Documentation/devicetree/bindings/hwmon/starfive,jh71x0-temp.yaml
new file mode 100644
index 000000000000..f5b34528928d
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/starfive,jh71x0-temp.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/starfive,jh71x0-temp.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: StarFive JH71x0 Temperature Sensor
+
+maintainers:
+ - Emil Renner Berthing <kernel@esmil.dk>
+
+description: |
+ StarFive Technology Co. JH71x0 embedded temperature sensor
+
+properties:
+ compatible:
+ enum:
+ - starfive,jh7100-temp
+ - starfive,jh7110-temp
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: "sense"
+ - const: "bus"
+
+ '#thermal-sensor-cells':
+ const: 0
+
+ resets:
+ minItems: 2
+ maxItems: 2
+
+ reset-names:
+ items:
+ - const: "sense"
+ - const: "bus"
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/starfive-jh7100.h>
+ #include <dt-bindings/reset/starfive-jh7100.h>
+
+ temperature-sensor@124a0000 {
+ compatible = "starfive,jh7100-temp";
+ reg = <0x124a0000 0x10000>;
+ clocks = <&clkgen JH7100_CLK_TEMP_SENSE>,
+ <&clkgen JH7100_CLK_TEMP_APB>;
+ clock-names = "sense", "bus";
+ #thermal-sensor-cells = <0>;
+ resets = <&rstgen JH7100_RSTN_TEMP_SENSE>,
+ <&rstgen JH7100_RSTN_TEMP_APB>;
+ reset-names = "sense", "bus";
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
index 6f0443322a36..8648877d2d01 100644
--- a/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
+++ b/Documentation/devicetree/bindings/hwmon/ti,ina2xx.yaml
@@ -26,6 +26,7 @@ properties:
- ti,ina226
- ti,ina230
- ti,ina231
+ - ti,ina238
reg:
maxItems: 1
@@ -35,6 +36,31 @@ properties:
Shunt resistor value in micro-Ohm.
$ref: /schemas/types.yaml#/definitions/uint32
+ ti,shunt-gain:
+ description: |
+ Programmable gain divisor for the shunt voltage accuracy and range. This
+ property only applies to devices that have configurable PGA/ADCRANGE. The
+ gain value is used to configure the gain and to convert the shunt voltage,
+ current and power register values when reading measurements from the
+ device.
+
+ For devices that have a configurable PGA (e.g. INA209, INA219, INA220),
+ the gain value maps directly with the PG bits of the config register.
+
+ For devices that have ADCRANGE configuration (e.g. INA238) a shunt-gain
+ value of 1 maps to ADCRANGE=1 where no gain divisor is applied to the
+ shunt voltage, and a value of 4 maps to ADCRANGE=0 such that a wider
+ voltage range is used.
+
+ The default value is device dependent, and is defined by the reset value
+ of PGA/ADCRANGE in the respective configuration registers.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [1, 2, 4, 8]
+
+ vs-supply:
+ description: phandle to the regulator that provides the VS supply,
+ typically in the range of 2.7 V to 5.5 V.
+
required:
- compatible
- reg
@@ -51,5 +77,6 @@ examples:
compatible = "ti,ina220";
reg = <0x44>;
shunt-resistor = <1000>;
+ vs-supply = <&vdd_3v0>;
};
};
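Note: the new ti,shunt-gain property selects the PGA divisor or, on the
INA238, the ADCRANGE setting described above. A sketch for the newly added
ti,ina238 compatible (node name and values illustrative):

    power-sensor@44 {
        compatible = "ti,ina238";
        reg = <0x44>;
        shunt-resistor = <1000>;
        ti,shunt-gain = <4>;    /* ADCRANGE = 0: divide-by-4, wider shunt voltage range */
        vs-supply = <&vdd_3v0>;
    };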
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml
new file mode 100644
index 000000000000..c5a889e3e27b
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp102.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ti,tmp102.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TMP102 temperature sensor
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - ti,tmp102
+
+ interrupts:
+ maxItems: 1
+
+ reg:
+ maxItems: 1
+
+ "#thermal-sensor-cells":
+ const: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@48 {
+ compatible = "ti,tmp102";
+ reg = <0x48>;
+ interrupt-parent = <&gpio7>;
+ interrupts = <16 IRQ_TYPE_LEVEL_LOW>;
+ #thermal-sensor-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp108.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp108.yaml
new file mode 100644
index 000000000000..dcbc6fbc3b48
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp108.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ti,tmp108.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TMP108 temperature sensor
+
+maintainers:
+ - Krzysztof Kozlowski <krzk@kernel.org>
+
+properties:
+ compatible:
+ enum:
+ - ti,tmp108
+
+ interrupts:
+ items:
+ - description: alert interrupt
+
+ reg:
+ maxItems: 1
+
+ "#thermal-sensor-cells":
+ const: 0
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@48 {
+ compatible = "ti,tmp108";
+ reg = <0x48>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <7 IRQ_TYPE_LEVEL_LOW>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&tmp_alrt>;
+ #thermal-sensor-cells = <0>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml
new file mode 100644
index 000000000000..0e8ddf0ad789
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ti,tmp401.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TMP401, TMP411 and TMP43x temperature sensor
+
+maintainers:
+ - Guenter Roeck <linux@roeck-us.net>
+
+description: |
+ ±1°C Remote and Local temperature sensor
+
+ Datasheets:
+ https://www.ti.com/lit/ds/symlink/tmp401.pdf
+ https://www.ti.com/lit/ds/symlink/tmp411.pdf
+ https://www.ti.com/lit/ds/symlink/tmp431.pdf
+ https://www.ti.com/lit/ds/symlink/tmp435.pdf
+
+properties:
+ compatible:
+ enum:
+ - ti,tmp401
+ - ti,tmp411
+ - ti,tmp431
+ - ti,tmp432
+ - ti,tmp435
+
+ reg:
+ maxItems: 1
+
+ ti,extended-range-enable:
+ description:
+ When set, this sensor measures over an extended temperature range.
+ type: boolean
+
+ ti,n-factor:
+ description:
+ value to be used for converting remote channel measurements to
+ temperature.
+ $ref: /schemas/types.yaml#/definitions/int32
+ minimum: -128
+ maximum: 127
+
+ ti,beta-compensation:
+ description:
+ value to select beta correction range.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 15
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - ti,tmp401
+ then:
+ properties:
+ ti,n-factor: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - ti,tmp401
+ - ti,tmp411
+ then:
+ properties:
+ ti,beta-compensation: false
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "ti,tmp401";
+ reg = <0x4c>;
+ };
+ };
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "ti,tmp431";
+ reg = <0x4c>;
+ ti,extended-range-enable;
+ ti,n-factor = <0x3b>;
+ ti,beta-compensation = <0x7>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp421.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp421.yaml
new file mode 100644
index 000000000000..a6f1fa75a67c
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp421.yaml
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ti,tmp421.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TMP42x/TMP44x temperature sensor
+
+maintainers:
+ - Guenter Roeck <linux@roeck-us.net>
+
+description: |
+ ±1°C Remote and Local temperature sensor
+ https://www.ti.com/lit/ds/symlink/tmp422.pdf
+
+properties:
+ compatible:
+ enum:
+ - ti,tmp421
+ - ti,tmp422
+ - ti,tmp423
+ - ti,tmp441
+ - ti,tmp442
+ reg:
+ maxItems: 1
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+patternProperties:
+ "^channel@([0-3])$":
+ type: object
+ description: |
+ Represents channels of the device and their specific configuration.
+
+ properties:
+ reg:
+ description: |
+ The channel number. 0 is local channel, 1-3 are remote channels
+ items:
+ minimum: 0
+ maximum: 3
+
+ label:
+ description: |
+ A descriptive name for this channel, like "ambient" or "psu".
+
+ ti,n-factor:
+ description: |
+ The value (two's complement) to be programmed in the channel specific N correction register.
+ For remote channels only.
+ $ref: /schemas/types.yaml#/definitions/int32
+ minimum: -128
+ maximum: 127
+
+ required:
+ - reg
+
+ additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "ti,tmp422";
+ reg = <0x4c>;
+ };
+ };
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "ti,tmp422";
+ reg = <0x4c>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ channel@0 {
+ reg = <0x0>;
+ ti,n-factor = <0x1>;
+ label = "local";
+ };
+
+ channel@1 {
+ reg = <0x1>;
+ ti,n-factor = <0x0>;
+ label = "somelabel";
+ };
+
+ channel@2 {
+ reg = <0x2>;
+ status = "disabled";
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp464.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp464.yaml
new file mode 100644
index 000000000000..f9c00cbb2806
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp464.yaml
@@ -0,0 +1,113 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ti,tmp464.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TMP464 and TMP468 temperature sensors
+
+maintainers:
+ - Guenter Roeck <linux@roeck-us.net>
+
+description: |
+ ±0.0625°C Remote and Local temperature sensor
+ https://www.ti.com/lit/ds/symlink/tmp464.pdf
+ https://www.ti.com/lit/ds/symlink/tmp468.pdf
+
+properties:
+ compatible:
+ enum:
+ - ti,tmp464
+ - ti,tmp468
+
+ reg:
+ maxItems: 1
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+patternProperties:
+ "^channel@([0-8])$":
+ type: object
+ description: |
+ Represents channels of the device and their specific configuration.
+
+ properties:
+ reg:
+ description: |
+ The channel number. 0 is local channel, 1-8 are remote channels.
+ items:
+ minimum: 0
+ maximum: 8
+
+ label:
+ description: |
+ A descriptive name for this channel, like "ambient" or "psu".
+
+ ti,n-factor:
+ description: |
+ The value (two's complement) to be programmed in the channel specific N correction register.
+ For remote channels only.
+ $ref: /schemas/types.yaml#/definitions/int32
+ minimum: -128
+ maximum: 127
+
+ required:
+ - reg
+
+ additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4b {
+ compatible = "ti,tmp464";
+ reg = <0x4b>;
+ };
+ };
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4b {
+ compatible = "ti,tmp464";
+ reg = <0x4b>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ channel@0 {
+ reg = <0x0>;
+ label = "local";
+ };
+
+ channel@1 {
+ reg = <0x1>;
+ ti,n-factor = <(-10)>;
+ label = "external";
+ };
+
+ channel@2 {
+ reg = <0x2>;
+ ti,n-factor = <0x10>;
+ label = "somelabel";
+ };
+
+ channel@3 {
+ reg = <0x3>;
+ status = "disabled";
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml
index 1502b22c77cc..fde5225ce012 100644
--- a/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml
@@ -77,15 +77,15 @@ additionalProperties: false
examples:
- |
i2c {
- #address-cells = <1>;
- #size-cells = <0>;
-
- tmp513@5c {
- compatible = "ti,tmp513";
- reg = <0x5C>;
- shunt-resistor-micro-ohms = <330000>;
- ti,bus-range-microvolt = <32000000>;
- ti,pga-gain = <8>;
- ti,nfactor = <0x1 0xF3 0x00>;
- };
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ tmp513@5c {
+ compatible = "ti,tmp513";
+ reg = <0x5c>;
+ shunt-resistor-micro-ohms = <330000>;
+ ti,bus-range-microvolt = <32000000>;
+ ti,pga-gain = <8>;
+ ti,nfactor = <0x1 0xf3 0x00>;
+ };
};
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tps23861.yaml b/Documentation/devicetree/bindings/hwmon/ti,tps23861.yaml
index 3bc8e73dfbf0..bce68a326919 100644
--- a/Documentation/devicetree/bindings/hwmon/ti,tps23861.yaml
+++ b/Documentation/devicetree/bindings/hwmon/ti,tps23861.yaml
@@ -40,12 +40,12 @@ additionalProperties: false
examples:
- |
i2c {
- #address-cells = <1>;
- #size-cells = <0>;
-
- tps23861@30 {
- compatible = "ti,tps23861";
- reg = <0x30>;
- shunt-resistor-micro-ohms = <255000>;
- };
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ tps23861@30 {
+ compatible = "ti,tps23861";
+ reg = <0x30>;
+ shunt-resistor-micro-ohms = <255000>;
+ };
};
diff --git a/Documentation/devicetree/bindings/hwmon/tmp108.txt b/Documentation/devicetree/bindings/hwmon/tmp108.txt
deleted file mode 100644
index 54d4beed4ee5..000000000000
--- a/Documentation/devicetree/bindings/hwmon/tmp108.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-TMP108 temperature sensor
--------------------------
-
-This device supports I2C only.
-
-Requires node properties:
-- compatible : "ti,tmp108"
-- reg : the I2C address of the device. This is 0x48, 0x49, 0x4a, or 0x4b.
-
-Optional properties:
-- interrupts: Reference to the TMP108 alert interrupt.
-- #thermal-sensor-cells: should be set to 0.
-
-Example:
- tmp108@48 {
- compatible = "ti,tmp108";
- reg = <0x48>;
- };
diff --git a/Documentation/devicetree/bindings/hwmon/vexpress.txt b/Documentation/devicetree/bindings/hwmon/vexpress.txt
index 9c27ed694bbb..4a4df4ffc460 100644
--- a/Documentation/devicetree/bindings/hwmon/vexpress.txt
+++ b/Documentation/devicetree/bindings/hwmon/vexpress.txt
@@ -9,7 +9,7 @@ Requires node properties:
"arm,vexpress-power"
"arm,vexpress-energy"
- "arm,vexpress-sysreg,func" when controlled via vexpress-sysreg
- (see Documentation/devicetree/bindings/arm/vexpress-sysreg.txt
+ (see Documentation/devicetree/bindings/arm/vexpress-config.yaml
for more details)
Optional node properties:
diff --git a/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml b/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
index 6097e8ac46c1..5a799246a373 100644
--- a/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
+++ b/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
@@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/i2c/allwinner,sun6i-a31-p2wi.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: Allwinner A31 P2WI (Push/Pull 2 Wires Interface) Device Tree Bindings
+title: Allwinner A31 P2WI (Push/Pull 2 Wires Interface)
maintainers:
- Chen-Yu Tsai <wens@csie.org>
@@ -55,7 +55,7 @@ examples:
#size-cells = <0>;
axp221: pmic@68 {
- compatible = "x-powers,axp221";
+ /* compatible = "x-powers,axp221"; */
reg = <0x68>;
};
};
diff --git a/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml b/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml
index 6ecb0270d88d..26bed558c6b8 100644
--- a/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml
@@ -2,13 +2,13 @@
# Copyright 2019 BayLibre, SAS
%YAML 1.2
---
-$id: "http://devicetree.org/schemas/i2c/amlogic,meson6-i2c.yaml#"
-$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+$id: http://devicetree.org/schemas/i2c/amlogic,meson6-i2c.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Amlogic Meson I2C Controller
maintainers:
- - Neil Armstrong <narmstrong@baylibre.com>
+ - Neil Armstrong <neil.armstrong@linaro.org>
- Beniamino Galvani <b.galvani@gmail.com>
allOf:
diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
new file mode 100644
index 000000000000..077d2a539c83
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_STATFS_H
#define _ASM_X86_STATFS_H
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
new file mode 100644
index 000000000000..650e3256ea7d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI__SVM_H
+#define _UAPI__SVM_H
+
+#define SVM_EXIT_READ_CR0 0x000
+#define SVM_EXIT_READ_CR2 0x002
+#define SVM_EXIT_READ_CR3 0x003
+#define SVM_EXIT_READ_CR4 0x004
+#define SVM_EXIT_READ_CR8 0x008
+#define SVM_EXIT_WRITE_CR0 0x010
+#define SVM_EXIT_WRITE_CR2 0x012
+#define SVM_EXIT_WRITE_CR3 0x013
+#define SVM_EXIT_WRITE_CR4 0x014
+#define SVM_EXIT_WRITE_CR8 0x018
+#define SVM_EXIT_READ_DR0 0x020
+#define SVM_EXIT_READ_DR1 0x021
+#define SVM_EXIT_READ_DR2 0x022
+#define SVM_EXIT_READ_DR3 0x023
+#define SVM_EXIT_READ_DR4 0x024
+#define SVM_EXIT_READ_DR5 0x025
+#define SVM_EXIT_READ_DR6 0x026
+#define SVM_EXIT_READ_DR7 0x027
+#define SVM_EXIT_WRITE_DR0 0x030
+#define SVM_EXIT_WRITE_DR1 0x031
+#define SVM_EXIT_WRITE_DR2 0x032
+#define SVM_EXIT_WRITE_DR3 0x033
+#define SVM_EXIT_WRITE_DR4 0x034
+#define SVM_EXIT_WRITE_DR5 0x035
+#define SVM_EXIT_WRITE_DR6 0x036
+#define SVM_EXIT_WRITE_DR7 0x037
+#define SVM_EXIT_EXCP_BASE 0x040
+#define SVM_EXIT_LAST_EXCP 0x05f
+#define SVM_EXIT_INTR 0x060
+#define SVM_EXIT_NMI 0x061
+#define SVM_EXIT_SMI 0x062
+#define SVM_EXIT_INIT 0x063
+#define SVM_EXIT_VINTR 0x064
+#define SVM_EXIT_CR0_SEL_WRITE 0x065
+#define SVM_EXIT_IDTR_READ 0x066
+#define SVM_EXIT_GDTR_READ 0x067
+#define SVM_EXIT_LDTR_READ 0x068
+#define SVM_EXIT_TR_READ 0x069
+#define SVM_EXIT_IDTR_WRITE 0x06a
+#define SVM_EXIT_GDTR_WRITE 0x06b
+#define SVM_EXIT_LDTR_WRITE 0x06c
+#define SVM_EXIT_TR_WRITE 0x06d
+#define SVM_EXIT_RDTSC 0x06e
+#define SVM_EXIT_RDPMC 0x06f
+#define SVM_EXIT_PUSHF 0x070
+#define SVM_EXIT_POPF 0x071
+#define SVM_EXIT_CPUID 0x072
+#define SVM_EXIT_RSM 0x073
+#define SVM_EXIT_IRET 0x074
+#define SVM_EXIT_SWINT 0x075
+#define SVM_EXIT_INVD 0x076
+#define SVM_EXIT_PAUSE 0x077
+#define SVM_EXIT_HLT 0x078
+#define SVM_EXIT_INVLPG 0x079
+#define SVM_EXIT_INVLPGA 0x07a
+#define SVM_EXIT_IOIO 0x07b
+#define SVM_EXIT_MSR 0x07c
+#define SVM_EXIT_TASK_SWITCH 0x07d
+#define SVM_EXIT_FERR_FREEZE 0x07e
+#define SVM_EXIT_SHUTDOWN 0x07f
+#define SVM_EXIT_VMRUN 0x080
+#define SVM_EXIT_VMMCALL 0x081
+#define SVM_EXIT_VMLOAD 0x082
+#define SVM_EXIT_VMSAVE 0x083
+#define SVM_EXIT_STGI 0x084
+#define SVM_EXIT_CLGI 0x085
+#define SVM_EXIT_SKINIT 0x086
+#define SVM_EXIT_RDTSCP 0x087
+#define SVM_EXIT_ICEBP 0x088
+#define SVM_EXIT_WBINVD 0x089
+#define SVM_EXIT_MONITOR 0x08a
+#define SVM_EXIT_MWAIT 0x08b
+#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
+#define SVM_EXIT_RDPRU 0x08e
+#define SVM_EXIT_EFER_WRITE_TRAP 0x08f
+#define SVM_EXIT_CR0_WRITE_TRAP 0x090
+#define SVM_EXIT_CR1_WRITE_TRAP 0x091
+#define SVM_EXIT_CR2_WRITE_TRAP 0x092
+#define SVM_EXIT_CR3_WRITE_TRAP 0x093
+#define SVM_EXIT_CR4_WRITE_TRAP 0x094
+#define SVM_EXIT_CR5_WRITE_TRAP 0x095
+#define SVM_EXIT_CR6_WRITE_TRAP 0x096
+#define SVM_EXIT_CR7_WRITE_TRAP 0x097
+#define SVM_EXIT_CR8_WRITE_TRAP 0x098
+#define SVM_EXIT_CR9_WRITE_TRAP 0x099
+#define SVM_EXIT_CR10_WRITE_TRAP 0x09a
+#define SVM_EXIT_CR11_WRITE_TRAP 0x09b
+#define SVM_EXIT_CR12_WRITE_TRAP 0x09c
+#define SVM_EXIT_CR13_WRITE_TRAP 0x09d
+#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
+#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
+#define SVM_EXIT_INVPCID 0x0a2
+#define SVM_EXIT_BUS_LOCK 0x0a5
+#define SVM_EXIT_IDLE_HLT 0x0a6
+#define SVM_EXIT_NPF 0x400
+#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
+#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
+#define SVM_EXIT_VMGEXIT 0x403
+
+/* SEV-ES software-defined VMGEXIT events */
+#define SVM_VMGEXIT_MMIO_READ 0x80000001
+#define SVM_VMGEXIT_MMIO_WRITE 0x80000002
+#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003
+#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004
+#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
+#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
+#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
+#define SVM_VMGEXIT_PSC 0x80000010
+#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011
+#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012
+#define SVM_VMGEXIT_AP_CREATION 0x80000013
+#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
+#define SVM_VMGEXIT_AP_CREATE 1
+#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
+#define SVM_VMGEXIT_SAVIC 0x8000001a
+#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0
+#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1
+#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL
+#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
+#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
+#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
+ /* SW_EXITINFO1[3:0] */ \
+ (((((u64)reason_set) & 0xf)) | \
+ /* SW_EXITINFO1[11:4] */ \
+ ((((u64)reason_code) & 0xff) << 4))
+#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
+
+/* Exit code reserved for hypervisor/software use */
+#define SVM_EXIT_SW 0xf0000000
+
+#define SVM_EXIT_ERR -1
+
+#define SVM_EXIT_REASONS \
+ { SVM_EXIT_READ_CR0, "read_cr0" }, \
+ { SVM_EXIT_READ_CR2, "read_cr2" }, \
+ { SVM_EXIT_READ_CR3, "read_cr3" }, \
+ { SVM_EXIT_READ_CR4, "read_cr4" }, \
+ { SVM_EXIT_READ_CR8, "read_cr8" }, \
+ { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
+ { SVM_EXIT_WRITE_CR2, "write_cr2" }, \
+ { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
+ { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
+ { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
+ { SVM_EXIT_READ_DR0, "read_dr0" }, \
+ { SVM_EXIT_READ_DR1, "read_dr1" }, \
+ { SVM_EXIT_READ_DR2, "read_dr2" }, \
+ { SVM_EXIT_READ_DR3, "read_dr3" }, \
+ { SVM_EXIT_READ_DR4, "read_dr4" }, \
+ { SVM_EXIT_READ_DR5, "read_dr5" }, \
+ { SVM_EXIT_READ_DR6, "read_dr6" }, \
+ { SVM_EXIT_READ_DR7, "read_dr7" }, \
+ { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
+ { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
+ { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
+ { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
+ { SVM_EXIT_WRITE_DR4, "write_dr4" }, \
+ { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
+ { SVM_EXIT_WRITE_DR6, "write_dr6" }, \
+ { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
+ { SVM_EXIT_EXCP_BASE + DE_VECTOR, "DE excp" }, \
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
+ { SVM_EXIT_EXCP_BASE + OF_VECTOR, "OF excp" }, \
+ { SVM_EXIT_EXCP_BASE + BR_VECTOR, "BR excp" }, \
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + DF_VECTOR, "DF excp" }, \
+ { SVM_EXIT_EXCP_BASE + TS_VECTOR, "TS excp" }, \
+ { SVM_EXIT_EXCP_BASE + NP_VECTOR, "NP excp" }, \
+ { SVM_EXIT_EXCP_BASE + SS_VECTOR, "SS excp" }, \
+ { SVM_EXIT_EXCP_BASE + GP_VECTOR, "GP excp" }, \
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
+ { SVM_EXIT_EXCP_BASE + MF_VECTOR, "MF excp" }, \
+ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
+ { SVM_EXIT_EXCP_BASE + XM_VECTOR, "XF excp" }, \
+ { SVM_EXIT_INTR, "interrupt" }, \
+ { SVM_EXIT_NMI, "nmi" }, \
+ { SVM_EXIT_SMI, "smi" }, \
+ { SVM_EXIT_INIT, "init" }, \
+ { SVM_EXIT_VINTR, "vintr" }, \
+ { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \
+ { SVM_EXIT_IDTR_READ, "read_idtr" }, \
+ { SVM_EXIT_GDTR_READ, "read_gdtr" }, \
+ { SVM_EXIT_LDTR_READ, "read_ldtr" }, \
+ { SVM_EXIT_TR_READ, "read_rt" }, \
+ { SVM_EXIT_IDTR_WRITE, "write_idtr" }, \
+ { SVM_EXIT_GDTR_WRITE, "write_gdtr" }, \
+ { SVM_EXIT_LDTR_WRITE, "write_ldtr" }, \
+ { SVM_EXIT_TR_WRITE, "write_rt" }, \
+ { SVM_EXIT_RDTSC, "rdtsc" }, \
+ { SVM_EXIT_RDPMC, "rdpmc" }, \
+ { SVM_EXIT_PUSHF, "pushf" }, \
+ { SVM_EXIT_POPF, "popf" }, \
+ { SVM_EXIT_CPUID, "cpuid" }, \
+ { SVM_EXIT_RSM, "rsm" }, \
+ { SVM_EXIT_IRET, "iret" }, \
+ { SVM_EXIT_SWINT, "swint" }, \
+ { SVM_EXIT_INVD, "invd" }, \
+ { SVM_EXIT_PAUSE, "pause" }, \
+ { SVM_EXIT_HLT, "hlt" }, \
+ { SVM_EXIT_INVLPG, "invlpg" }, \
+ { SVM_EXIT_INVLPGA, "invlpga" }, \
+ { SVM_EXIT_IOIO, "io" }, \
+ { SVM_EXIT_MSR, "msr" }, \
+ { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
+ { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \
+ { SVM_EXIT_SHUTDOWN, "shutdown" }, \
+ { SVM_EXIT_VMRUN, "vmrun" }, \
+ { SVM_EXIT_VMMCALL, "hypercall" }, \
+ { SVM_EXIT_VMLOAD, "vmload" }, \
+ { SVM_EXIT_VMSAVE, "vmsave" }, \
+ { SVM_EXIT_STGI, "stgi" }, \
+ { SVM_EXIT_CLGI, "clgi" }, \
+ { SVM_EXIT_SKINIT, "skinit" }, \
+ { SVM_EXIT_RDTSCP, "rdtscp" }, \
+ { SVM_EXIT_ICEBP, "icebp" }, \
+ { SVM_EXIT_WBINVD, "wbinvd" }, \
+ { SVM_EXIT_MONITOR, "monitor" }, \
+ { SVM_EXIT_MWAIT, "mwait" }, \
+ { SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \
+ { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \
+ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
+ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
+ { SVM_EXIT_INVPCID, "invpcid" }, \
+ { SVM_EXIT_BUS_LOCK, "buslock" }, \
+ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \
+ { SVM_EXIT_NPF, "npf" }, \
+ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
+ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
+ { SVM_EXIT_VMGEXIT, "vmgexit" }, \
+ { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \
+ { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \
+ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
+ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
+ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
+ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \
+ { SVM_VMGEXIT_GUEST_REQUEST, "vmgexit_guest_request" }, \
+ { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \
+ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \
+ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \
+ { SVM_EXIT_ERR, "invalid_guest_state" }
+
+#endif /* _UAPI__SVM_H */
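
Note: SVM_VMGEXIT_TERM_REASON above packs the reason set into SW_EXITINFO1[3:0] and the reason code into bits [11:4]. A standalone sketch of the same packing, with u64 spelled out so it compiles outside the kernel:

#include <stdio.h>

typedef unsigned long long u64;

/* Same bit layout as SVM_VMGEXIT_TERM_REASON: set in [3:0], code in [11:4]. */
#define TERM_REASON(reason_set, reason_code) \
	((((u64)(reason_set)) & 0xf) | ((((u64)(reason_code)) & 0xff) << 4))

int main(void)
{
	/* reason set 1, reason code 2 -> SW_EXITINFO1 = 0x21 */
	printf("%#llx\n", TERM_REASON(1, 2));
	return 0;
}
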
diff --git a/arch/x86/include/asm/swab.h b/arch/x86/include/uapi/asm/swab.h
index 557cd9f00661..cd3fd8ddbe9a 100644
--- a/arch/x86/include/asm/swab.h
+++ b/arch/x86/include/uapi/asm/swab.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_X86_SWAB_H
#define _ASM_X86_SWAB_H
@@ -6,22 +7,7 @@
static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
{
-#ifdef __i386__
-# ifdef CONFIG_X86_BSWAP
- asm("bswap %0" : "=r" (val) : "0" (val));
-# else
- asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
- "rorl $16,%0\n\t" /* swap words */
- "xchgb %b0,%h0" /* swap higher bytes */
- : "=q" (val)
- : "0" (val));
-# endif
-
-#else /* __i386__ */
- asm("bswapl %0"
- : "=r" (val)
- : "0" (val));
-#endif
+ asm("bswapl %0" : "=r" (val) : "0" (val));
return val;
}
#define __arch_swab32 __arch_swab32
@@ -37,22 +23,12 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
__u64 u;
} v;
v.u = val;
-# ifdef CONFIG_X86_BSWAP
asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
: "=r" (v.s.a), "=r" (v.s.b)
: "0" (v.s.a), "1" (v.s.b));
-# else
- v.s.a = __arch_swab32(v.s.a);
- v.s.b = __arch_swab32(v.s.b);
- asm("xchgl %0,%1"
- : "=r" (v.s.a), "=r" (v.s.b)
- : "0" (v.s.a), "1" (v.s.b));
-# endif
return v.u;
#else /* __i386__ */
- asm("bswapq %0"
- : "=r" (val)
- : "0" (val));
+ asm("bswapq %0" : "=r" (val) : "0" (val));
return val;
#endif
}
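
Note: the removed CONFIG_X86_BSWAP branches were fallbacks for CPUs without the bswap instruction; with them gone, __arch_swab32() is a single bswapl. A portable reference for what that instruction computes, useful as a sanity check:

#include <stdint.h>
#include <stdio.h>

/* Portable equivalent of the bswapl used by __arch_swab32(). */
static uint32_t swab32(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
	       ((v << 8) & 0x00ff0000u) | (v << 24);
}

int main(void)
{
	printf("%#010x\n", swab32(0x12345678u));	/* 0x78563412 */
	return 0;
}
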
diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h
new file mode 100644
index 000000000000..5657b7a49f03
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ucontext.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_UCONTEXT_H
+#define _ASM_X86_UCONTEXT_H
+
+/*
+ * Indicates the presence of extended state information in the memory
+ * layout pointed by the fpstate pointer in the ucontext's sigcontext
+ * struct (uc_mcontext).
+ */
+#define UC_FP_XSTATE 0x1
+
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext. All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ * saved CS is not 64-bit)
+ * new SS = saved SS (will fail IRET and signal if invalid)
+ * else
+ * new SS = a flat 32-bit data segment
+ *
+ * This behavior serves three purposes:
+ *
+ * - Legacy programs that construct a 64-bit sigcontext from scratch
+ * with zero or garbage in the SS slot (e.g. old CRIU) and call
+ * sigreturn will still work.
+ *
+ * - Old DOSEMU versions sometimes catch a signal from a segmented
+ * context, delete the old SS segment (with modify_ldt), and change
+ * the saved CS to a 64-bit segment. These DOSEMU versions expect
+ * sigreturn to send them back to 64-bit mode without killing them,
+ * despite the fact that the SS selector when the signal was raised is
+ * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel
+ * will fix up SS for these DOSEMU versions.
+ *
+ * - Old and new programs that catch a signal and return without
+ * modifying the saved context will end up in exactly the state they
+ * started in, even if they were running in a segmented context when
+ * the signal was raised. Old kernels would lose track of the
+ * previous SS value.
+ */
+#define UC_SIGCONTEXT_SS 0x2
+#define UC_STRICT_RESTORE_SS 0x4
+#endif
+
+#include <asm-generic/ucontext.h>
+
+#endif /* _ASM_X86_UCONTEXT_H */
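
Note: a minimal sketch of probing these bits from userspace, assuming glibc's ucontext_t exposes uc_flags on x86; the flag values are duplicated locally so the example stays self-contained:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

#ifndef UC_SIGCONTEXT_SS
# define UC_SIGCONTEXT_SS	0x2	/* values from asm/ucontext.h */
# define UC_STRICT_RESTORE_SS	0x4
#endif

static volatile unsigned long seen_flags;

static void handler(int sig, siginfo_t *si, void *ctx)
{
	seen_flags = ((ucontext_t *)ctx)->uc_flags;
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);

	printf("UC_SIGCONTEXT_SS %s, UC_STRICT_RESTORE_SS %s\n",
	       (seen_flags & UC_SIGCONTEXT_SS) ? "set" : "clear",
	       (seen_flags & UC_STRICT_RESTORE_SS) ? "set" : "clear");
	return 0;
}
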
diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h
new file mode 100644
index 000000000000..be5e2e747f50
--- /dev/null
+++ b/arch/x86/include/uapi/asm/unistd.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_UNISTD_H
+#define _UAPI_ASM_X86_UNISTD_H
+
+/*
+ * x32 syscall flag bit. Some user programs expect syscall NR macros
+ * and __X32_SYSCALL_BIT to have type int, even though syscall numbers
+ * are, for practical purposes, unsigned long.
+ *
+ * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right
+ * thing regardless.
+ */
+#define __X32_SYSCALL_BIT 0x40000000
+
+#ifndef __KERNEL__
+# ifdef __i386__
+# include <asm/unistd_32.h>
+# elif defined(__ILP32__)
+# include <asm/unistd_x32.h>
+# else
+# include <asm/unistd_64.h>
+# endif
+#endif
+
+#endif /* _UAPI_ASM_X86_UNISTD_H */
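
Note: the header comment's point about int-typed macros can be checked directly: ~__X32_SYSCALL_BIT is a negative int, but the usual arithmetic conversions sign-extend it, so masking an unsigned long syscall number clears only bit 30. A small sketch; the syscall number used is illustrative:

#include <stdio.h>

#define __X32_SYSCALL_BIT 0x40000000	/* int, as in the header */

int main(void)
{
	unsigned long nr = __X32_SYSCALL_BIT + 1;	/* an illustrative x32 nr */

	/* ~__X32_SYSCALL_BIT is int 0xbfffffff (negative); conversion to
	 * unsigned long sign-extends it, so only bit 30 is cleared. */
	printf("%lu\n", nr & ~__X32_SYSCALL_BIT);	/* prints 1 */
	return 0;
}
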
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
new file mode 100644
index 000000000000..18909b8050bc
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_VM86_H
+#define _UAPI_ASM_X86_VM86_H
+
+/*
+ * I'm guessing at the VIF/VIP flag usage, but hope that this is how
+ * the Pentium uses them. Linux will return from vm86 mode when both
+ * VIF and VIP are set.
+ *
+ * On a Pentium, we could probably optimize the virtual flags directly
+ * in the eflags register instead of doing it "by hand" in vflags...
+ *
+ * Linus
+ */
+
+#include <asm/processor-flags.h>
+
+#define BIOSSEG 0x0f000
+
+#define CPU_086 0
+#define CPU_186 1
+#define CPU_286 2
+#define CPU_386 3
+#define CPU_486 4
+#define CPU_586 5
+
+/*
+ * Return values for the 'vm86()' system call
+ */
+#define VM86_TYPE(retval) ((retval) & 0xff)
+#define VM86_ARG(retval) ((retval) >> 8)
+
+#define VM86_SIGNAL 0 /* return due to signal */
+#define VM86_UNKNOWN 1 /* unhandled GP fault
+ - IO-instruction or similar */
+#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
+#define VM86_STI 3 /* sti/popf/iret instruction enabled
+ virtual interrupts */
+
+/*
+ * Additional return values when invoking new vm86()
+ */
+#define VM86_PICRETURN 4 /* return due to pending PIC request */
+#define VM86_TRAP 6 /* return due to DOS-debugger request */
+
+/*
+ * function codes when invoking new vm86()
+ */
+#define VM86_PLUS_INSTALL_CHECK 0
+#define VM86_ENTER 1
+#define VM86_ENTER_NO_BYPASS 2
+#define VM86_REQUEST_IRQ 3
+#define VM86_FREE_IRQ 4
+#define VM86_GET_IRQ_BITS 5
+#define VM86_GET_AND_RESET_IRQ 6
+
+/*
+ * This is the stack-layout seen by the user space program when we have
+ * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
+ * is 'kernel_vm86_regs' (see below).
+ */
+
+struct vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ long __null_ds;
+ long __null_es;
+ long __null_fs;
+ long __null_gs;
+ long orig_eax;
+ long eip;
+ unsigned short cs, __csh;
+ long eflags;
+ long esp;
+ unsigned short ss, __ssh;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct revectored_struct {
+ unsigned long __map[8]; /* 256 bits */
+};
+
+struct vm86_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap; /* unused, preserved by vm86() */
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+};
+
+/*
+ * flags masks
+ */
+#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
+
+struct vm86plus_info_struct {
+ unsigned long force_return_for_pic:1;
+ unsigned long vm86dbg_active:1; /* for debugger */
+ unsigned long vm86dbg_TFpendig:1; /* for debugger */
+ unsigned long unused:28;
+ unsigned long is_vm86pus:1; /* for vm86 internal use */
+ unsigned char vm86dbg_intxxtab[32]; /* for debugger */
+};
+struct vm86plus_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+};
+
+#endif /* _UAPI_ASM_X86_VM86_H */
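
Note: a vm86() return value carries both an event class and its argument in one int, split by VM86_TYPE()/VM86_ARG(). A sketch of the decoding, with the macros copied from the header; the retval is synthesized rather than obtained from a real vm86() call:

#include <stdio.h>

#define VM86_TYPE(retval) ((retval) & 0xff)
#define VM86_ARG(retval)  ((retval) >> 8)
#define VM86_INTx	2

int main(void)
{
	int retval = (0x21 << 8) | VM86_INTx;	/* as if vm86() reported "int 0x21" */

	if (VM86_TYPE(retval) == VM86_INTx)
		printf("guest executed int 0x%x\n", VM86_ARG(retval));
	return 0;
}
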
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
new file mode 100644
index 000000000000..1baa86dfe029
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+#ifndef _UAPIVMX_H
+#define _UAPIVMX_H
+
+
+#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+#define VMX_EXIT_REASONS_SGX_ENCLAVE_MODE 0x08000000
+
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT_SIGNAL 3
+#define EXIT_REASON_SIPI_SIGNAL 4
+#define EXIT_REASON_OTHER_SMI 6
+
+#define EXIT_REASON_INTERRUPT_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MSR_LOAD_FAIL 34
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_TRAP_FLAG 37
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
+#define EXIT_REASON_RDRAND 57
+#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_VMFUNC 59
+#define EXIT_REASON_ENCLS 60
+#define EXIT_REASON_RDSEED 61
+#define EXIT_REASON_PML_FULL 62
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_UMWAIT 67
+#define EXIT_REASON_TPAUSE 68
+#define EXIT_REASON_BUS_LOCK 74
+#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
+#define EXIT_REASON_TDCALL 77
+#define EXIT_REASON_MSR_READ_IMM 84
+#define EXIT_REASON_MSR_WRITE_IMM 85
+
+#define VMX_EXIT_REASONS \
+ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
+ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
+ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \
+ { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \
+ { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \
+ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
+ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
+ { EXIT_REASON_CPUID, "CPUID" }, \
+ { EXIT_REASON_HLT, "HLT" }, \
+ { EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVLPG, "INVLPG" }, \
+ { EXIT_REASON_RDPMC, "RDPMC" }, \
+ { EXIT_REASON_RDTSC, "RDTSC" }, \
+ { EXIT_REASON_VMCALL, "VMCALL" }, \
+ { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
+ { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
+ { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
+ { EXIT_REASON_VMPTRST, "VMPTRST" }, \
+ { EXIT_REASON_VMREAD, "VMREAD" }, \
+ { EXIT_REASON_VMRESUME, "VMRESUME" }, \
+ { EXIT_REASON_VMWRITE, "VMWRITE" }, \
+ { EXIT_REASON_VMOFF, "VMOFF" }, \
+ { EXIT_REASON_VMON, "VMON" }, \
+ { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
+ { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
+ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
+ { EXIT_REASON_MSR_READ, "MSR_READ" }, \
+ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
+ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
+ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
+ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \
+ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
+ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
+ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
+ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
+ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
+ { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \
+ { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \
+ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
+ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
+ { EXIT_REASON_INVEPT, "INVEPT" }, \
+ { EXIT_REASON_RDTSCP, "RDTSCP" }, \
+ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
+ { EXIT_REASON_INVVPID, "INVVPID" }, \
+ { EXIT_REASON_WBINVD, "WBINVD" }, \
+ { EXIT_REASON_XSETBV, "XSETBV" }, \
+ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
+ { EXIT_REASON_RDRAND, "RDRAND" }, \
+ { EXIT_REASON_INVPCID, "INVPCID" }, \
+ { EXIT_REASON_VMFUNC, "VMFUNC" }, \
+ { EXIT_REASON_ENCLS, "ENCLS" }, \
+ { EXIT_REASON_RDSEED, "RDSEED" }, \
+ { EXIT_REASON_PML_FULL, "PML_FULL" }, \
+ { EXIT_REASON_XSAVES, "XSAVES" }, \
+ { EXIT_REASON_XRSTORS, "XRSTORS" }, \
+ { EXIT_REASON_UMWAIT, "UMWAIT" }, \
+ { EXIT_REASON_TPAUSE, "TPAUSE" }, \
+ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \
+ { EXIT_REASON_NOTIFY, "NOTIFY" }, \
+ { EXIT_REASON_TDCALL, "TDCALL" }, \
+ { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \
+ { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" }
+
+#define VMX_EXIT_REASON_FLAGS \
+ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
+
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
+
+#endif /* _UAPIVMX_H */
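
Note: an exit reason read from the VMCS carries flag bits on top of the basic reason, which is why VMX_EXIT_REASONS_FAILED_VMENTRY is defined as a separate mask. A sketch of splitting the two, assuming the basic reason sits in the low 16 bits as on hardware:

#include <stdint.h>
#include <stdio.h>

#define VMX_EXIT_REASONS_FAILED_VMENTRY	0x80000000
#define EXIT_REASON_EPT_VIOLATION	48

int main(void)
{
	uint32_t exit_reason = VMX_EXIT_REASONS_FAILED_VMENTRY |
			       EXIT_REASON_EPT_VIOLATION;

	printf("basic reason %u, failed vmentry: %s\n",
	       exit_reason & 0xffff,
	       (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) ? "yes" : "no");
	return 0;
}
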
diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h
new file mode 100644
index 000000000000..75275f547444
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vsyscall.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_VSYSCALL_H
+#define _UAPI_ASM_X86_VSYSCALL_H
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+ __NR_vgetcpu,
+};
+
+#define VSYSCALL_ADDR (-10UL << 20)
+
+#endif /* _UAPI_ASM_X86_VSYSCALL_H */
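
Note: VSYSCALL_ADDR relies on unsigned wraparound: -10UL is 0xfffffffffffffff6, and shifting it left by 20 produces the fixed legacy vsyscall page address. A one-line check:

#include <stdio.h>

#define VSYSCALL_ADDR (-10UL << 20)

int main(void)
{
	printf("%#lx\n", VSYSCALL_ADDR);	/* 0xffffffffff600000 on x86-64 */
	return 0;
}
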
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
index 08f4fd731469..ef66569e7e22 100644
--- a/arch/x86/kernel/.gitignore
+++ b/arch/x86/kernel/.gitignore
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
vsyscall.lds
vsyscall_32.lds
vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..bc184dd38d99 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,131 +1,168 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+always-$(KBUILD_BUILTIN) += vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
-CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
+CFLAGS_REMOVE_head32.o = -pg
+CFLAGS_REMOVE_rethook.o = -pg
endif
-#
-# vsyscalls (which work on the user stack) should have
-# no stack-protector checks:
-#
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
-CFLAGS_hpet.o := $(nostackp)
-CFLAGS_tsc.o := $(nostackp)
-CFLAGS_paravirt.o := $(nostackp)
-GCOV_PROFILE_vsyscall_64.o := n
-GCOV_PROFILE_hpet.o := n
-GCOV_PROFILE_tsc.o := n
-GCOV_PROFILE_paravirt.o := n
-
-obj-y := process_$(BITS).o signal.o entry_$(BITS).o
-obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time.o ioport.o ldt.o dumpstack.o
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
+
+# With some compiler versions the generated code results in boot hangs, caused
+# by several compilation units. To be safe, disable all instrumentation.
+KCSAN_SANITIZE := n
+KMSAN_SANITIZE_head$(BITS).o := n
+KMSAN_SANITIZE_nmi.o := n
+
+# If instrumentation of the following files is enabled, the boot hangs
+# during the first second.
+KCOV_INSTRUMENT_head$(BITS).o := n
+# These are called from save_stack_trace() on debug paths,
+# and produce large amounts of uninteresting coverage.
+KCOV_INSTRUMENT_stacktrace.o := n
+KCOV_INSTRUMENT_dumpstack.o := n
+KCOV_INSTRUMENT_dumpstack_$(BITS).o := n
+KCOV_INSTRUMENT_unwind_orc.o := n
+KCOV_INSTRUMENT_unwind_frame.o := n
+KCOV_INSTRUMENT_unwind_guess.o := n
+
+CFLAGS_head32.o := -fno-stack-protector
+CFLAGS_head64.o := -fno-stack-protector
+CFLAGS_irq.o := -I $(src)/../include/asm/trace
+
+obj-y += head_$(BITS).o
+obj-y += head$(BITS).o
+obj-y += ebda.o
+obj-y += platform-quirks.o
+obj-y += process_$(BITS).o signal.o signal_$(BITS).o
+obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_X86_FRED) += fred.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
+obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o
obj-y += setup.o x86_init.o i8259.o irqinit.o
-obj-$(CONFIG_X86_VISWS) += visws_quirks.o
-obj-$(CONFIG_X86_32) += probe_roms_32.o
-obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
-obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-y += probe_roms.o
+obj-$(CONFIG_X86_32) += sys_ia32.o
+obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
-obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
-obj-y += tsc.o io_delay.o rtc.o
+obj-y += pci-dma.o quirks.o kdebugfs.o
+obj-y += alternative.o i8253.o hw_breakpoint.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
+obj-y += resource.o
+obj-y += irqflags.o
+obj-y += static_call.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
-obj-y += i387.o xsave.o
+obj-y += fpu/
obj-y += ptrace.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-$(CONFIG_ISA_DMA_API) += i8237.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
-obj-$(CONFIG_SFI) += sfi.o
obj-y += reboot.o
-obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o
+obj-$(CONFIG_SMP) += smpboot.o
+obj-$(CONFIG_X86_TSC) += tsc_sync.o
obj-$(CONFIG_SMP) += setup_percpu.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
-obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_RETHOOK) += rethook.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o
+obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
-obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_X86_32) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
-obj-$(CONFIG_APB_TIMER) += apb_timer.o
-obj-$(CONFIG_K8_NB) += k8.o
-obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
-obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_AMD_NODE) += amd_node.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
-obj-$(CONFIG_KVM_GUEST) += kvm.o
-obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
+
+obj-$(CONFIG_EISA) += eisa.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
-obj-$(CONFIG_SCx200) += scx200.o
-scx200-y += scx200_32.o
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
-obj-$(CONFIG_OLPC) += olpc.o
-obj-$(CONFIG_X86_MRST) += mrst.o
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_UMIP) += umip.o
-microcode-y := microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
-obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+obj-$(CONFIG_CFI) += cfi.o
+
+obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+
+obj-$(CONFIG_X86_CET) += cet.o
-obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
- obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
- obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
- obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
- obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
- obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+ obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3ef..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,12 @@
-subdir- := realmode
+# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
endif
-$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 000000000000..e21419e686eb
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Arch-specific APEI-related functions.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return a non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
+
+int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ return apei_smca_report_x86_error(ctx_info, lapic_id);
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872aa3ce0..9fa321a95eb3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1,84 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* boot.c - Architecture-Specific Low-Level ACPI Boot Support
*
* Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
+#define pr_fmt(fmt) "ACPI: " fmt
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/acpi_pmtmr.h>
#include <linux/efi.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/dmi.h>
#include <linux/irq.h>
#include <linux/slab.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/efi-bgrt.h>
+#include <linux/serial_core.h>
+#include <linux/pgtable.h>
+
+#include <xen/xen.h>
+#include <asm/e820/api.h>
+#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
-#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/io.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
+#include <asm/i8259.h>
+#include <asm/setup.h>
+#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
-u32 acpi_rsdt_forced;
int acpi_disabled;
EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
# include <asm/proto.h>
-# include <asm/numa_64.h>
#endif /* X86 */
-#define BAD_MADT_ENTRY(entry, end) ( \
- (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
- ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
-
-#define PREFIX "ACPI: "
-
int acpi_noirq; /* skip ACPI IRQ initialization */
+static int acpi_nobgrt; /* skip ACPI BGRT */
int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
EXPORT_SYMBOL(acpi_pci_disabled);
int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
+int acpi_disable_cmcff;
+bool acpi_int_src_ovr[NR_IRQS_LEGACY];
+/* ACPI SCI override configuration */
u8 acpi_sci_flags __initdata;
-int acpi_sci_override_gsi __initdata;
+u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ;
int acpi_skip_timer_override __initdata;
int acpi_use_timer_override __initdata;
+int acpi_fix_pin2_polarity __initdata;
#ifdef CONFIG_X86_LOCAL_APIC
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+static bool has_lapic_cpus __initdata;
+static bool acpi_support_online_capable;
#endif
-#ifndef __HAVE_ARCH_CMPXCHG
-#warning ACPI uses CMPXCHG, i486 and later hardware
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Locks related to IOAPIC hotplug
+ * Hotplug side:
+ * ->device_hotplug_lock
+ * ->acpi_ioapic_lock
+ * ->ioapic_lock
+ * Interrupt mapping side:
+ * ->acpi_ioapic_lock
+ * ->ioapic_mutex
+ * ->ioapic_lock
+ */
+static DEFINE_MUTEX(acpi_ioapic_lock);
#endif
/* --------------------------------------------------------------------------
@@ -100,71 +103,25 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
-static unsigned int gsi_to_irq(unsigned int gsi)
-{
- unsigned int irq = gsi + NR_IRQS_LEGACY;
- unsigned int i;
-
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- if (isa_irq_to_gsi[i] == gsi) {
- return i;
- }
- }
-
- /* Provide an identity mapping of gsi == irq
- * except on truly weird platforms that have
- * non isa irqs in the first 16 gsis.
- */
- if (gsi >= NR_IRQS_LEGACY)
- irq = gsi;
- else
- irq = gsi_top + gsi;
-
- return irq;
-}
-
-static u32 irq_to_gsi(int irq)
-{
- unsigned int gsi;
-
- if (irq < NR_IRQS_LEGACY)
- gsi = isa_irq_to_gsi[irq];
- else if (irq < gsi_top)
- gsi = irq;
- else if (irq < (gsi_top + NR_IRQS_LEGACY))
- gsi = irq - gsi_top;
- else
- gsi = 0xffffffff;
-
- return gsi;
-}
-
/*
- * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
- * to map the target physical address. The problem is that set_fixmap()
- * provides a single page, and it is possible that the page is not
- * sufficient.
- * By using this area, we can map up to MAX_IO_APICS pages temporarily,
- * i.e. until the next __va_range() call.
- *
- * Important Safety Note: The fixed I/O APIC page numbers are *subtracted*
- * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and
- * count idx down while incrementing the phys address.
+ * This is just a simple wrapper around early_memremap(),
+ * with sanity checks for phys == 0 and size == 0.
*/
-char *__init __acpi_map_table(unsigned long phys, unsigned long size)
+void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
{
if (!phys || !size)
return NULL;
- return early_ioremap(phys, size);
+ return early_memremap(phys, size);
}
-void __init __acpi_unmap_table(char *map, unsigned long size)
+
+void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
{
if (!map || !size)
return;
- early_iounmap(map, size);
+ early_memunmap(map, size);
}
#ifdef CONFIG_X86_LOCAL_APIC
@@ -172,56 +129,84 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
{
struct acpi_table_madt *madt = NULL;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -EINVAL;
madt = (struct acpi_table_madt *)table;
if (!madt) {
- printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ pr_warn("Unable to map MADT\n");
return -ENODEV;
}
if (madt->address) {
acpi_lapic_addr = (u64) madt->address;
- printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
- madt->address);
+ pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+ legacy_pic_pcat_compat();
+
+ /* ACPI 6.3 and newer support the online capable bit. */
+ if (acpi_gbl_FADT.header.revision > 6 ||
+ (acpi_gbl_FADT.header.revision == 6 &&
+ acpi_gbl_FADT.minor_revision >= 3))
+ acpi_support_online_capable = true;
+
default_acpi_madt_oem_check(madt->header.oem_id,
madt->header.oem_table_id);
return 0;
}
-static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+static bool __init acpi_is_processor_usable(u32 lapic_flags)
{
- unsigned int ver = 0;
-
- if (!enabled) {
- ++disabled_cpus;
- return;
- }
+ if (lapic_flags & ACPI_MADT_ENABLED)
+ return true;
- if (boot_cpu_physical_apicid != -1U)
- ver = apic_version[boot_cpu_physical_apicid];
+ if (!acpi_support_online_capable ||
+ (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return true;
- generic_processor_info(id, ver);
+ return false;
}
static int __init
-acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_local_x2apic *processor = NULL;
+#ifdef CONFIG_X86_X2APIC
+ u32 apic_id;
+ u8 enabled;
+#endif
processor = (struct acpi_madt_local_x2apic *)header;
if (BAD_MADT_ENTRY(processor, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
#ifdef CONFIG_X86_X2APIC
+ apic_id = processor->local_apic_id;
+ enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
+
+ /* Ignore invalid ID */
+ if (apic_id == 0xffffffff)
+ return 0;
+
+ /* don't register processors that cannot be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
+ /*
+ * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure
+ * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID
+ * in x2APIC must be equal or greater than 0xff.
+ */
+ if (has_lapic_cpus && apic_id < 0xff)
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -229,17 +214,22 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->local_apic_id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ if (!apic_id_valid(apic_id)) {
+ if (enabled)
+ pr_warn("x2apic entry ignored\n");
+ return 0;
+ }
+
+ topology_register_apic(apic_id, processor->uid, enabled);
#else
- printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ pr_warn("x2apic entry ignored\n");
#endif
return 0;
}
static int __init
-acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_local_apic *processor = NULL;
@@ -248,7 +238,37 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
if (BAD_MADT_ENTRY(processor, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /* Ignore processors that can not be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
+ has_lapic_cpus = true;
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
+{
+ struct acpi_madt_local_apic *processor = NULL;
+
+ processor = (struct acpi_madt_local_apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /* don't register processors that can not be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
/*
* We need to register disabled CPU as well to permit
@@ -257,14 +277,15 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic(processor->id, /* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
static int __init
-acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_local_sapic *processor = NULL;
@@ -273,16 +294,17 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
if (BAD_MADT_ENTRY(processor, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
- acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
static int __init
-acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
+acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header,
const unsigned long end)
{
struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
@@ -292,13 +314,15 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
return -EINVAL;
+ acpi_table_print_madt_entry(&header->common);
+
acpi_lapic_addr = lapic_addr_ovr->address;
return 0;
}
static int __init
-acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+acpi_parse_x2apic_nmi(union acpi_subtable_headers *header,
const unsigned long end)
{
struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
@@ -308,16 +332,16 @@ acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
if (BAD_MADT_ENTRY(x2apic_nmi, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
if (x2apic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
static int __init
-acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
@@ -326,32 +350,137 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
if (BAD_MADT_ENTRY(lapic_nmi, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
if (lapic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
-
-#endif /*CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi);
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi)
+{
+ /*
+ * Check bus_irq boundary.
+ */
+ if (bus_irq >= NR_IRQS_LEGACY) {
+ pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq);
+ return;
+ }
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0)
+ return;
+ /*
+ * Reset default identity mapping if gsi is also a legacy IRQ,
+ * otherwise there will be more than one entry with the same GSI
+ * and acpi_isa_irq_to_gsi() may give wrong result.
+ */
+ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+ isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;
+ isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
+ int ioapic;
+ u8 pin;
+
+ if (!acpi_ioapic)
+ return;
+ if (!dev || !dev_is_pci(dev))
+ return;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* Build the entry exactly as it would appear in an MP table. */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_save_irq(&mp_irq);
+#endif
+}
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi)
+{
+ struct mpc_intsrc mp_irq;
+ int ioapic, pin;
+
+ /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ pr_warn("Failed to find ioapic for gsi : %u\n", gsi);
+ return ioapic;
+ }
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq;
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = pin;
+
+ mp_save_irq(&mp_irq);
+
+ return 0;
+}
static int __init
-acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_ioapic(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_io_apic *ioapic = NULL;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
ioapic = (struct acpi_madt_io_apic *)header;
if (BAD_MADT_ENTRY(ioapic, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
- mp_register_ioapic(ioapic->id,
- ioapic->address, ioapic->global_irq_base);
+ /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+ if (ioapic->global_irq_base < nr_legacy_irqs())
+ cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+ mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+ &cfg);
return 0;
}
@@ -374,12 +503,12 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
- /*
- * mp_config_acpi_legacy_irqs() already setup IRQs < 16
- * If GSI is < 16, this will update its flags,
- * else it will create a new mp_irqs[] entry.
- */
- mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ if (bus_irq < NR_IRQS_LEGACY)
+ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ else
+ mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
+
+ acpi_penalize_sci_irq(bus_irq, trigger, polarity);
/*
* stash over-ride to indicate we've been here
@@ -390,7 +519,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
}
static int __init
-acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
const unsigned long end)
{
struct acpi_madt_interrupt_override *intsrc = NULL;
@@ -400,7 +529,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(intsrc, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
+
+ if (intsrc->source_irq < NR_IRQS_LEGACY)
+ acpi_int_src_ovr[intsrc->source_irq] = true;
if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
acpi_sci_ioapic_setup(intsrc->source_irq,
@@ -410,10 +542,17 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
return 0;
}
- if (acpi_skip_timer_override &&
- intsrc->source_irq == 0 && intsrc->global_irq == 2) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
- return 0;
+ if (intsrc->source_irq == 0) {
+ if (acpi_skip_timer_override) {
+ pr_warn("BIOS IRQ0 override ignored.\n");
+ return 0;
+ }
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+ intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
+ pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+ }
}
mp_override_legacy_irq(intsrc->source_irq,
@@ -425,7 +564,7 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
}
static int __init
-acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
+acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_nmi_source *nmi_src = NULL;
@@ -434,7 +573,7 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end
if (BAD_MADT_ENTRY(nmi_src, end))
return -EINVAL;
- acpi_table_print_madt_entry(header);
+ acpi_table_print_madt_entry(&header->common);
/* TBD: Support nmi_src entries? */
@@ -451,10 +590,10 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end
* If a PIC-mode SCI is not recognized or gives spurious IRQ7's
* it may require Edge Trigger -- use "acpi_sci=edge"
*
- * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
+ * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
* for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
- * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
- * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
*/
void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
@@ -463,7 +602,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
unsigned int old, new;
/* Real old ELCR mask */
- old = inb(0x4d0) | (inb(0x4d1) << 8);
+ old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8);
/*
* If we use ACPI to set PCI IRQs, then we should clear ELCR
@@ -488,202 +627,257 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
if (old == new)
return;
- printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
- outb(new, 0x4d0);
- outb(new >> 8, 0x4d1);
+ pr_warn("setting ELCR to %04x (from %04x)\n", new, old);
+ outb(new, PIC_ELCR1);
+ outb(new >> 8, PIC_ELCR2);
}
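acpi_pic_sci_set_trigger() treats the two 8-bit ELCR ports as a single 16-bit mask with bit[n] = 1 marking IRQ n level-triggered. A toy model of the read-modify-write it performs, on fake register values instead of real port I/O:

#include <stdio.h>

int main(void)
{
	unsigned int elcr = 0x0c00;	/* pretend IRQs 10 and 11 are level */
	unsigned int irq = 9;
	unsigned int trigger = 3;	/* level, per the ACPI MADT flag encoding */
	unsigned int mask = 1u << irq;

	if (trigger == 3)
		elcr |= mask;		/* mark IRQ 9 level-triggered */
	else
		elcr &= ~mask;		/* mark it edge-triggered */

	printf("new ELCR = %#06x\n", elcr);	/* prints 0x0e00 */
	return 0;
}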
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
{
- *irq = gsi_to_irq(gsi);
+ int rc, irq, trigger, polarity;
-#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
- setup_IO_APIC_irq_extra(gsi);
-#endif
+ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+ *irqp = gsi;
+ return 0;
+ }
+
+ rc = acpi_get_override_irq(gsi, &trigger, &polarity);
+ if (rc)
+ return rc;
+ trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
+ polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
+ irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
+ if (irq < 0)
+ return irq;
+
+ *irqp = irq;
return 0;
}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
- if (isa_irq >= 16)
- return -1;
- *gsi = irq_to_gsi(isa_irq);
- return 0;
+ if (isa_irq < nr_legacy_irqs() &&
+ isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) {
+ *gsi = isa_irq_to_gsi[isa_irq];
+ return 0;
+ }
+
+ return -1;
}
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
{
- unsigned int irq;
- unsigned int plat_gsi = gsi;
-
#ifdef CONFIG_PCI
/*
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (trigger == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
- }
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ elcr_set_level_irq(gsi);
#endif
+ return gsi;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ int irq = gsi;
#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
- }
+ int node;
+ struct irq_alloc_info info;
+
+ node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+ polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+ ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
+ mutex_lock(&acpi_ioapic_lock);
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+ mutex_unlock(&acpi_ioapic_lock);
#endif
- irq = gsi_to_irq(plat_gsi);
return irq;
}
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+ int irq;
+
+ mutex_lock(&acpi_ioapic_lock);
+ irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ if (irq > 0)
+ mp_unmap_irq(irq);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+}
+#endif
+
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
+
+#ifdef CONFIG_ACPI_SLEEP
+int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
+#else
+int (*acpi_suspend_lowlevel)(void);
+#endif
+
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ return __acpi_register_gsi(dev, gsi, trigger, polarity);
+}
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
+
+void acpi_unregister_gsi(u32 gsi)
+{
+ if (__acpi_unregister_gsi)
+ __acpi_unregister_gsi(gsi);
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+#endif
+
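The __acpi_register_gsi/__acpi_unregister_gsi pointers make the GSI path swappable at boot: the kernel starts out in PIC mode and acpi_set_irq_model_ioapic() rewires both hooks once the MADT IO-APIC entries have parsed cleanly. A minimal userspace sketch of the same dispatch pattern (the function names and the +16 offset are invented for illustration):

#include <stdio.h>

static int register_gsi_pic(unsigned int gsi)    { return (int)gsi; }
static int register_gsi_ioapic(unsigned int gsi) { return (int)gsi + 16; }

/* default handler, swapped out once the interrupt model is known */
static int (*register_gsi)(unsigned int) = register_gsi_pic;

int main(void)
{
	printf("PIC mode:    GSI 5 -> IRQ %d\n", register_gsi(5));
	register_gsi = register_gsi_ioapic;	/* like acpi_set_irq_model_ioapic() */
	printf("IOAPIC mode: GSI 5 -> IRQ %d\n", register_gsi(5));
	return 0;
}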
/*
* ACPI based hotplug support for CPU
*/
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
nid = acpi_get_node(handle);
- if (nid == -1 || !node_online(nid))
- return;
-#ifdef CONFIG_X86_64
- apicid_to_node[physid] = nid;
- numa_set_node(cpu, nid);
-#else /* CONFIG_X86_32 */
- apicid_2_node[physid] = nid;
- cpu_to_node_map[cpu] = nid;
-#endif
-
+ if (nid != NUMA_NO_NODE) {
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+ }
#endif
+ return 0;
}
-static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu)
{
- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
- union acpi_object *obj;
- struct acpi_madt_local_apic *lapic;
- cpumask_var_t tmp_map, new_map;
- u8 physid;
- int cpu;
- int retval = -ENOMEM;
-
- if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
- return -EINVAL;
-
- if (!buffer.length || !buffer.pointer)
- return -EINVAL;
-
- obj = buffer.pointer;
- if (obj->type != ACPI_TYPE_BUFFER ||
- obj->buffer.length < sizeof(*lapic)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
-
- lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
-
- if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
- !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
-
- physid = lapic->id;
-
- kfree(buffer.pointer);
- buffer.length = ACPI_ALLOCATE_BUFFER;
- buffer.pointer = NULL;
+ int cpu = topology_hotplug_apic(physid, acpi_id);
- if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
- goto out;
-
- if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
- goto free_tmp_map;
-
- cpumask_copy(tmp_map, cpu_present_mask);
- acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
-
- /*
- * If mp_register_lapic successfully generates a new logical cpu
- * number, then the following will get us exactly what was mapped
- */
- cpumask_andnot(new_map, cpu_present_mask, tmp_map);
- if (cpumask_empty(new_map)) {
- printk ("Unable to map lapic to logical cpu number\n");
- retval = -EINVAL;
- goto free_new_map;
+ if (cpu < 0) {
+ pr_info("Unable to map lapic to logical cpu number\n");
+ return cpu;
}
acpi_processor_set_pdc(handle);
-
- cpu = cpumask_first(new_map);
acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
- retval = 0;
-
-free_new_map:
- free_cpumask_var(new_map);
-free_tmp_map:
- free_cpumask_var(tmp_map);
-out:
- return retval;
+ return 0;
}
+EXPORT_SYMBOL(acpi_map_cpu);
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
+int acpi_unmap_cpu(int cpu)
{
- return _acpi_map_lsapic(handle, pcpu);
+#ifdef CONFIG_ACPI_NUMA
+ set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+ topology_hotunplug_apic(cpu);
+ return 0;
}
-EXPORT_SYMBOL(acpi_map_lsapic);
+EXPORT_SYMBOL(acpi_unmap_cpu);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
-int acpi_unmap_lsapic(int cpu)
+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
{
- per_cpu(x86_cpu_to_apicid, cpu) = -1;
- set_cpu_present(cpu, false);
- num_processors--;
-
- return (0);
-}
+ int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ int ioapic_id;
+ u64 addr;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
+ ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
+ if (ioapic_id < 0) {
+ unsigned long long uid;
+ acpi_status status;
+
+ status = acpi_evaluate_integer(handle, METHOD_NAME__UID,
+ NULL, &uid);
+ if (ACPI_FAILURE(status)) {
+ acpi_handle_warn(handle, "failed to get IOAPIC ID.\n");
+ return -EINVAL;
+ }
+ ioapic_id = (int)uid;
+ }
-EXPORT_SYMBOL(acpi_unmap_lsapic);
-#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
-int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
-{
- /* TBD */
- return -EINVAL;
+ return ret;
}
-
EXPORT_SYMBOL(acpi_register_ioapic);
int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
{
- /* TBD */
- return -EINVAL;
-}
+ int ret = -ENOSYS;
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_unregister_ioapic(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+ return ret;
+}
EXPORT_SYMBOL(acpi_unregister_ioapic);
-static int __init acpi_parse_sbf(struct acpi_table_header *table)
+/**
+ * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base
+ * has been registered
+ * @handle: ACPI handle of the IOAPIC device
+ * @gsi_base: GSI base associated with the IOAPIC
+ *
+ * Assume caller holds some type of lock to serialize acpi_ioapic_registered()
+ * with acpi_register_ioapic()/acpi_unregister_ioapic().
+ */
+int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
{
- struct acpi_table_boot *sb;
+ int ret = 0;
- sb = (struct acpi_table_boot *)table;
- if (!sb) {
- printk(KERN_WARNING PREFIX "Unable to map SBF\n");
- return -ENODEV;
- }
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_ioapic_registered(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
+
+static int __init acpi_parse_sbf(struct acpi_table_header *table)
+{
+ struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
sbf_port = sb->cmos_index; /* Save CMOS port */
@@ -693,21 +887,14 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
#ifdef CONFIG_HPET_TIMER
#include <asm/hpet.h>
-static struct __initdata resource *hpet_res;
+static struct resource *hpet_res __initdata;
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
- struct acpi_table_hpet *hpet_tbl;
-
- hpet_tbl = (struct acpi_table_hpet *)table;
- if (!hpet_tbl) {
- printk(KERN_WARNING PREFIX "Unable to map HPET\n");
- return -ENODEV;
- }
+ struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
- printk(KERN_WARNING PREFIX "HPET timers must be located in "
- "memory.\n");
+ pr_warn("HPET timers must be located in memory.\n");
return -1;
}
@@ -719,9 +906,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
* want to allocate a resource there.
*/
if (!hpet_address) {
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: %#lx is invalid\n",
- hpet_tbl->id, hpet_address);
+ pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address);
return 0;
}
#ifdef CONFIG_X86_64
@@ -732,28 +917,25 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
*/
if (hpet_address == 0xfed0000000000000UL) {
if (!hpet_force_user) {
- printk(KERN_WARNING PREFIX "HPET id: %#x "
- "base: 0xfed0000000000000 is bogus\n "
- "try hpet=force on the kernel command line to "
- "fix it up to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address = 0;
return 0;
}
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: 0xfed0000000000000 fixed up "
- "to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address >>= 32;
}
#endif
- printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, hpet_address);
+ pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address);
/*
* Allocate and initialize the HPET firmware resource for adding into
* the resource tree during the lateinit timeframe.
*/
#define HPET_RESOURCE_NAME_SIZE 9
- hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+ hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
+ SMP_CACHE_BYTES);
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
@@ -786,6 +968,27 @@ late_initcall(hpet_insert_resource);
static int __init acpi_parse_fadt(struct acpi_table_header *table)
{
+ if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
+ pr_debug("no legacy devices present\n");
+ x86_platform.legacy.devices.pnpbios = 0;
+ }
+
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+ x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+ pr_debug("i8042 controller is absent\n");
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+ }
+
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+ pr_debug("not registering RTC platform device\n");
+ x86_platform.legacy.rtc = 0;
+ }
+
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
+ pr_debug("probing for VGA not safe\n");
+ x86_platform.legacy.no_vga = 1;
+ }
#ifdef CONFIG_X86_PM_TIMER
/* detect the location of the ACPI PM Timer */
@@ -808,8 +1011,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
}
if (pmtmr_ioport)
- printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
- pmtmr_ioport);
+ pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport);
#endif
return 0;
}
@@ -820,94 +1022,81 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
* returns 0 on success, < 0 on error
*/
-static void __init acpi_register_lapic_address(unsigned long address)
-{
- mp_lapic_addr = address;
-
- set_fixmap_nocache(FIX_APIC_BASE, address);
- if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
- }
-}
-
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
{
int count;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
/*
* Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
+ pr_err("Error parsing LAPIC address override entry\n");
return count;
}
- acpi_register_lapic_address(acpi_lapic_addr);
+ register_lapic_address(acpi_lapic_addr);
return count;
}
static int __init acpi_parse_madt_lapic_entries(void)
{
- int count;
- int x2count = 0;
+ int count, x2count = 0;
+ struct acpi_subtable_proc madt_proc[2];
+ int ret;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
- /*
- * Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
- */
-
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
- if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
- return count;
- }
-
- acpi_register_lapic_address(acpi_lapic_addr);
-
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
- acpi_parse_sapic, MAX_APICS);
+ acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_APICS);
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_APICS);
+ /* Check if there are valid LAPIC entries */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC);
+
+ /*
+ * Enumerate the APIC IDs in the order that they appear in the
+ * MADT, no matter LAPIC entry or x2APIC entry is used.
+ */
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ pr_err("Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
- printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+ pr_err("No LAPIC entries present\n");
/* TBD: Cleanup to allow fallback to MPS */
return -ENODEV;
} else if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+ pr_err("Error parsing LAPIC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
- x2count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
- acpi_parse_x2apic_nmi, 0);
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+ acpi_parse_x2apic_nmi, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+ acpi_parse_lapic_nmi, 0);
if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+ pr_err("Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -916,100 +1105,25 @@ static int __init acpi_parse_madt_lapic_entries(void)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
-#define MP_ISA_BUS 0
-
-#ifdef CONFIG_X86_ES7000
-extern int es7000_plat;
-#endif
-
-static void assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static void save_mp_irq(struct mpc_intsrc *m)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- int ioapic;
- int pin;
- struct mpc_intsrc mp_irq;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = mp_find_ioapic_pin(ioapic, gsi);
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger << 2) | polarity;
- mp_irq.srcbus = MP_ISA_BUS;
- mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
- mp_irq.dstirq = pin; /* INTIN# */
-
- save_mp_irq(&mp_irq);
-
- isa_irq_to_gsi[bus_irq] = gsi;
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
+static void __init mp_config_acpi_legacy_irqs(void)
{
int i;
struct mpc_intsrc mp_irq;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
/*
* Fabricate the legacy ISA bus (bus #31).
*/
mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
#endif
set_bit(MP_ISA_BUS, mp_bus_not_pci);
- pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
-
-#ifdef CONFIG_X86_ES7000
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-#endif
+ pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs());
/*
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
*/
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < nr_legacy_irqs(); i++) {
int ioapic, pin;
unsigned int dstapic;
int idx;
@@ -1026,7 +1140,7 @@ void __init mp_config_acpi_legacy_irqs(void)
if (ioapic < 0)
continue;
pin = mp_find_ioapic_pin(ioapic, gsi);
- dstapic = mp_ioapics[ioapic].apicid;
+ dstapic = mpc_ioapic_id(ioapic);
for (idx = 0; idx < mp_irq_entries; idx++) {
struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1041,7 +1155,7 @@ void __init mp_config_acpi_legacy_irqs(void)
}
if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ pr_debug("ACPI: IRQ%d used by override.\n", i);
continue; /* IRQ already used */
}
@@ -1053,85 +1167,8 @@ void __init mp_config_acpi_legacy_irqs(void)
mp_irq.srcbusirq = i; /* Identity mapped */
mp_irq.dstirq = pin;
- save_mp_irq(&mp_irq);
- }
-}
-
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
- int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- struct pci_dev *pdev;
- unsigned char number;
- unsigned int devfn;
- int ioapic;
- u8 pin;
-
- if (!acpi_ioapic)
- return 0;
- if (!dev)
- return 0;
- if (dev->bus != &pci_bus_type)
- return 0;
-
- pdev = to_pci_dev(dev);
- number = pdev->bus->number;
- devfn = pdev->devfn;
- pin = pdev->pin;
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapics[ioapic].apicid;
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- save_mp_irq(&mp_irq);
-#endif
- return 0;
-}
-
-int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
-{
- int ioapic;
- int ioapic_pin;
- struct io_apic_irq_attr irq_attr;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapics[ioapic].apicid,
- ioapic_pin);
- return gsi;
+ mp_save_irq(&mp_irq);
}
-
- if (enable_update_mptable)
- mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
- set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
- trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
-
- return gsi;
}
/*
@@ -1151,35 +1188,32 @@ static int __init acpi_parse_madt_ioapic_entries(void)
if (acpi_disabled || acpi_noirq)
return -ENODEV;
- if (!cpu_has_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
/*
* if "noapic" boot option, don't look for IO-APICs
*/
- if (skip_ioapic_setup) {
- printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
- "due to 'noapic' option.\n");
+ if (ioapic_is_disabled) {
+ pr_info("Skipping IOAPIC probe due to 'noapic' option.\n");
return -ENODEV;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
- MAX_IO_APICS);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+ MAX_IO_APICS);
if (!count) {
- printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+ pr_err("No IOAPIC entries present\n");
return -ENODEV;
} else if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+ pr_err("Error parsing IOAPIC entry\n");
return count;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+ acpi_parse_int_src_ovr,
+ irq_get_nr_irqs());
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing interrupt source overrides entry\n");
+ pr_err("Error parsing interrupt source overrides entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1187,19 +1221,20 @@ static int __init acpi_parse_madt_ioapic_entries(void)
/*
* If BIOS did not supply an INT_SRC_OVR for the SCI
* pretend we got one so we can set the SCI flags.
+ * But ignore setting up SCI on hardware-reduced platforms.
*/
- if (!acpi_sci_override_gsi)
+ if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware)
acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
acpi_gbl_FADT.sci_interrupt);
/* Fill in identity legacy mappings where no override */
mp_config_acpi_legacy_irqs();
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+ acpi_parse_nmi_src,
+ irq_get_nr_irqs());
if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+ pr_err("Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1232,8 +1267,7 @@ static void __init early_acpi_process_madt(void)
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
}
@@ -1257,20 +1291,28 @@ static void __init acpi_process_madt(void)
/*
* Parse MADT IO-APIC entries
*/
+ mutex_lock(&acpi_ioapic_lock);
error = acpi_parse_madt_ioapic_entries();
+ mutex_unlock(&acpi_ioapic_lock);
if (!error) {
- acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
- acpi_ioapic = 1;
+ acpi_set_irq_model_ioapic();
smp_found_config = 1;
}
+
+#ifdef CONFIG_ACPI_MADT_WAKEUP
+ /*
+ * Parse MADT MP Wake entry.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
+ acpi_parse_mp_wake, 1);
+#endif
}
if (error == -EINVAL) {
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
} else {
@@ -1280,8 +1322,7 @@ static void __init acpi_process_madt(void)
* Boot with "acpi=off" to use MPS on such a system.
*/
if (smp_found_config) {
- printk(KERN_WARNING PREFIX
- "No APIC-table, disabling MPS\n");
+ pr_warn("No APIC-table, disabling MPS\n");
smp_found_config = 0;
}
}
@@ -1291,11 +1332,9 @@ static void __init acpi_process_madt(void)
* processors, where MPS only supports physical.
*/
if (acpi_lapic && acpi_ioapic)
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ pr_info("Using ACPI (MADT) for SMP configuration information\n");
else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+ pr_info("Using ACPI for processor (LAPIC) configuration information\n");
#endif
return;
}
@@ -1303,8 +1342,7 @@ static void __init acpi_process_madt(void)
static int __init disable_acpi_irq(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
- d->ident);
+ pr_notice("%s detected: force use of acpi=noirq\n", d->ident);
acpi_noirq_set();
}
return 0;
@@ -1313,37 +1351,41 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d)
static int __init disable_acpi_pci(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
- d->ident);
+ pr_notice("%s detected: force use of pci=noacpi\n", d->ident);
acpi_disable_pci();
}
return 0;
}
+static int __init disable_acpi_xsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ pr_notice("%s detected: force use of acpi=rsdt\n", d->ident);
+ acpi_gbl_do_not_use_xsdt = TRUE;
+ } else {
+ pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n");
+ }
+ return 0;
+}
+
static int __init dmi_disable_acpi(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
+ pr_notice("%s detected: acpi off\n", d->ident);
disable_acpi();
} else {
- printk(KERN_NOTICE
- "Warning: DMI blacklist says broken, but acpi forced\n");
+ pr_notice("Warning: DMI blacklist says broken, but acpi forced\n");
}
return 0;
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1351,10 +1393,34 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
}
/*
+ * ACPI offers an alternative platform interface model that removes
+ * ACPI hardware requirements for platforms that do not implement
+ * the PC Architecture.
+ *
+ * We initialize the Hardware-reduced ACPI model here:
+ */
+void __init acpi_generic_reduced_hw_init(void)
+{
+ /*
+ * Override x86_init functions and bypass legacy PIC in
+ * hardware reduced ACPI mode.
+ */
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ legacy_pic = &null_legacy_pic;
+}
+
+static void __init acpi_reduced_hw_init(void)
+{
+ if (acpi_gbl_reduced_hardware)
+ x86_init.acpi.reduced_hw_early_init();
+}
+
+/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact linux-acpi@vger.kernel.org
*/
-static struct dmi_system_id __initdata acpi_dmi_table[] = {
+static const struct dmi_system_id acpi_dmi_table[] __initconst = {
/*
* Boxes that need ACPI disabled
*/
@@ -1425,11 +1491,24 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * Boxes that need ACPI XSDT use disabled due to corrupted tables
+ */
+ {
+ .callback = disable_acpi_xsdt,
+ .ident = "Advantech DAC-BJ01",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"),
+ DMI_MATCH(DMI_BIOS_VERSION, "V1.12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"),
+ },
+ },
{}
};
/* second table for DMI checks that should run after early-quirks */
-static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
+static const struct dmi_system_id acpi_dmi_table_late[] __initconst = {
/*
* HP laptops which use a DSDT reporting as HP/SB400/10000,
* which includes some code which overrides all temperature
@@ -1437,7 +1516,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
@@ -1472,6 +1551,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
@@ -1502,15 +1589,23 @@ void __init acpi_boot_table_init(void)
* If acpi_disabled, bail out
*/
if (acpi_disabled)
- return;
+ return;
/*
* Initialize the ACPI boot-time table parser.
*/
- if (acpi_table_init()) {
+ if (acpi_locate_initial_tables())
disable_acpi();
- return;
- }
+ else
+ acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_init_complete();
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1519,27 +1614,23 @@ void __init acpi_boot_table_init(void)
*/
if (acpi_blacklisted()) {
if (acpi_force) {
- printk(KERN_WARNING PREFIX "acpi=force override\n");
+ pr_warn("acpi=force override\n");
} else {
- printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+ pr_warn("Disabling ACPI support\n");
disable_acpi();
- return;
+ return 1;
}
}
-}
-int __init early_acpi_boot_init(void)
-{
/*
- * If acpi_disabled, bail out
+ * Process the Multiple APIC Description Table (MADT), if present
*/
- if (acpi_disabled)
- return 1;
+ early_acpi_process_madt();
/*
- * Process the Multiple APIC Description Table (MADT), if present
+ * Hardware-reduced ACPI mode initialization:
*/
- early_acpi_process_madt();
+ acpi_reduced_hw_init();
return 0;
}
@@ -1568,10 +1659,14 @@ int __init acpi_boot_init(void)
acpi_process_madt();
acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt)
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
if (!acpi_noirq)
x86_init.pci.init = pci_acpi_init;
+ /* Do not enable ACPI SPCR console by default */
+ acpi_parse_spcr(earlycon_acpi_spcr_enable, false);
return 0;
}
@@ -1595,15 +1690,19 @@ static int __init parse_acpi(char *arg)
}
/* acpi=rsdt use RSDT instead of XSDT */
else if (strcmp(arg, "rsdt") == 0) {
- acpi_rsdt_forced = 1;
+ acpi_gbl_do_not_use_xsdt = TRUE;
}
/* "acpi=noirq" disables ACPI interrupt routing */
else if (strcmp(arg, "noirq") == 0) {
acpi_noirq_set();
}
- /* "acpi=copy_dsdt" copys DSDT */
+ /* "acpi=copy_dsdt" copies DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
+ }
+ /* "acpi=nocmcff" disables FF mode for corrected errors */
+ else if (strcmp(arg, "nocmcff") == 0) {
+ acpi_disable_cmcff = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
@@ -1612,6 +1711,13 @@ static int __init parse_acpi(char *arg)
}
early_param("acpi", parse_acpi);
+static int __init parse_acpi_bgrt(char *arg)
+{
+ acpi_nobgrt = true;
+ return 0;
+}
+early_param("bgrt_disable", parse_acpi_bgrt);
+
/* FIXME: Using pci= for an ACPI parameter is a travesty. */
static int __init parse_pci(char *arg)
{
@@ -1625,10 +1731,17 @@ int __init acpi_mps_check(void)
{
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
/* mptable code is not built-in*/
+
+ /*
+ * Xen disables ACPI in PV DomU guests but it still emulates APIC and
+ * supports SMP. Returning early here ensures that APIC is not disabled
+ * unnecessarily and the guest is not limited to a single vCPU.
+ */
+ if (xen_pv_domain() && !xen_initial_domain())
+ return 0;
+
if (acpi_disabled || acpi_noirq) {
- printk(KERN_WARNING "MPS support code is not built-in.\n"
- "Using acpi=off or acpi=noirq or pci=noacpi "
- "may have problem\n");
+ pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
return 1;
}
#endif
@@ -1676,21 +1789,53 @@ early_param("acpi_sci", setup_acpi_sci);
int __acpi_acquire_global_lock(unsigned int *lock)
{
unsigned int old, new, val;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
- new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
- return (new < 3) ? -1 : 0;
+ val = (old >> 1) & 0x1;
+ new = (old & ~0x3) + 2 + val;
+ } while (!try_cmpxchg(lock, &old, new));
+
+ if (val)
+ return 0;
+
+ return -1;
}
int __acpi_release_global_lock(unsigned int *lock)
{
- unsigned int old, new, val;
+ unsigned int old, new;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
new = old & ~0x3;
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
+ } while (!try_cmpxchg(lock, &old, new));
return old & 0x1;
}
+
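The lock word manipulated above uses bit 0 as the pending flag and bit 1 as the owned flag. Note the inverted-looking return convention: -1 (non-zero) reports success to ACPI_ACQUIRE_GLOBAL_LOCK(), while 0 means the owned bit was already set, the pending bit is now raised, and the caller must wait for the release notification. A non-atomic toy model of both transitions (the kernel of course uses try_cmpxchg()):

#include <stdio.h>

static int acquire(unsigned int *lock)
{
	unsigned int was_owned = (*lock >> 1) & 0x1;

	*lock = (*lock & ~0x3) + 2 + was_owned;	/* set owned; pend if taken */
	return was_owned ? 0 : -1;		/* -1 == acquired */
}

static int release(unsigned int *lock)
{
	unsigned int pending = *lock & 0x1;

	*lock &= ~0x3;
	return pending;				/* 1: caller must signal waiters */
}

int main(void)
{
	unsigned int lock = 0;
	int ret;

	ret = acquire(&lock);
	printf("acquire -> %d, lock = %#x\n", ret, lock);	/* -1, 0x2 */
	ret = acquire(&lock);
	printf("acquire -> %d, lock = %#x\n", ret, lock);	/*  0, 0x3 */
	ret = release(&lock);
	printf("release -> pending = %d, lock = %#x\n", ret, lock);
	return 0;
}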
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+ e820__range_add(addr, size, E820_TYPE_NVS);
+ e820__update_table_print();
+}
+
+void x86_default_set_root_pointer(u64 addr)
+{
+ boot_params.acpi_rsdp_addr = addr;
+}
+
+u64 x86_default_get_root_pointer(void)
+{
+ return boot_params.acpi_rsdp_addr;
+}
+
+#ifdef CONFIG_XEN_PV
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
+{
+ return ioremap_cache(phys, size);
+}
+
+void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) =
+ x86_acpi_os_ioremap;
+EXPORT_SYMBOL_GPL(acpi_os_ioremap);
+#endif
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
new file mode 100644
index 000000000000..d7c8ef1e354d
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * cppc.c: CPPC Interface for x86
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <linux/bitfield.h>
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/topology.h>
+
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_supported_by_cpu(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) ||
+ (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
+ return true;
+ else if (boot_cpu_data.x86 == 0x17 &&
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
+ return true;
+ return boot_cpu_has(X86_FEATURE_CPPC);
+ }
+ return false;
+}
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrq_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrq_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrq_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
+
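cpc_read_ffh() and cpc_write_ffh() isolate one bitfield of an MSR with a GENMASK_ULL()-built mask. A standalone check of the same mask arithmetic, assuming a hypothetical 8-bit field at bit offset 8 and made-up register contents:

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	unsigned int bit_offset = 8, bit_width = 8;
	uint64_t msr = 0x1122334455667788ULL;	/* fake current MSR value */
	uint64_t val = 0xab;			/* new field contents */
	uint64_t mask = GENMASK_ULL(bit_offset + bit_width - 1, bit_offset);

	msr = (msr & ~mask) | ((val << bit_offset) & mask);
	printf("msr = %#llx\n", (unsigned long long)msr);
	/* prints 0x112233445566ab88: only bits 15..8 changed */
	return 0;
}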
+static void amd_set_max_freq_ratio(void)
+{
+ struct cppc_perf_caps perf_caps;
+ u64 numerator, nominal_perf;
+ u64 perf_ratio;
+ int rc;
+
+ rc = cppc_get_perf_caps(0, &perf_caps);
+ if (rc) {
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
+ return;
+ }
+
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!nominal_perf) {
+ pr_warn("Could not retrieve nominal performance\n");
+ return;
+ }
+
+ /* midpoint between max_boost and max_P */
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
+
+ freq_invariance_set_perf_ratio(perf_ratio, false);
+}
+
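As a worked example with purely illustrative numbers: for numerator = 166 and nominal_perf = 120, div_u64(166 * 1024, 120) = 1416, so perf_ratio = (1416 + 1024) >> 1 = 1220, i.e. roughly 1.19x SCHED_CAPACITY_SCALE: the midpoint between the boost ratio (about 1.38x) and nominal (1.0x).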
+static DEFINE_MUTEX(freq_invariance_lock);
+
+static inline void init_freq_invariance_cppc(void)
+{
+ static bool init_done;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
+ mutex_lock(&freq_invariance_lock);
+ if (!init_done)
+ amd_set_max_freq_ratio();
+ init_done = true;
+ mutex_unlock(&freq_invariance_lock);
+}
+
+void acpi_processor_init_invariance_cppc(void)
+{
+ init_freq_invariance_cppc();
+}
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrq_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_online_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
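The loop in amd_detect_prefcore() only needs to observe two distinct highest-perf values to conclude that preferred cores exist. A compact userspace model of that loop, over made-up per-CPU readings:

#include <stdio.h>

int main(void)
{
	unsigned int perfs[] = { 166, 166, 196, 166 };	/* fake per-CPU reads */
	unsigned int highest[2] = { 0 };
	unsigned int count = 0;

	for (unsigned int i = 0; i < sizeof(perfs) / sizeof(perfs[0]); i++) {
		unsigned int tmp = perfs[i];

		if (!count || (count == 1 && tmp != highest[0]))
			highest[count++] = tmp;
		if (count == 2)
			break;
	}

	printf("preferred cores %ssupported (highest perf: %u)\n",
	       count == 2 ? "" : "un", highest[0]);
	return 0;
}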
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will be the highest performance register value.
+ *
+ * If booting the system with amd-pstate enabled but preferred cores disabled, then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu));
+ bool prefcore;
+ int ret;
+ u32 tmp;
+
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
+ return 0;
+ }
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
+
+ /* detect if running on heterogeneous design */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) {
+ switch (core_type) {
+ case TOPO_CPU_TYPE_UNKNOWN:
+ pr_warn("Undefined core type found for cpu %d\n", cpu);
+ break;
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ /* use the max scale for performance cores */
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ /* use the highest perf value for efficiency cores */
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+ *numerator = tmp;
+ return 0;
+ }
+ }
+
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb7a5f052e2b..0281703da5e2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2005 Intel Corporation
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
@@ -5,14 +6,18 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <acpi/processor.h>
-#include <asm/acpi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpuid/api.h>
+#include <asm/mwait.h>
+#include <asm/special_insns.h>
+#include <asm/smp.h>
/*
* Initialize bm_flags based on the CPU cache properties
@@ -44,12 +49,61 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
/*
* On all recent Intel platforms, ARB_DISABLE is a nop.
* So, set bm_control to zero to indicate that ARB_DISABLE
- * is not required while entering C3 type state on
- * P4, Core and beyond CPUs
+ * is not required while entering C3 type state.
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
- (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
+ (c->x86 > 15 || (c->x86_vfm >= INTEL_CORE2_MEROM && c->x86_vfm <= INTEL_FAM6_LAST)))
+ flags->bm_control = 0;
+
+ if (c->x86_vendor == X86_VENDOR_CENTAUR) {
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f &&
+ c->x86_stepping >= 0x0e)) {
+ /*
+ * For all recent Centaur CPUs, the ucode will make sure that each
+ * core can keep cache coherence with each other while entering C3
+ * type state. So, set bm_check to 1 to indicate that the kernel
+ * doesn't need to execute a cache flush operation (WBINVD) when
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ /*
+ * For all recent Centaur platforms, ARB_DISABLE is a nop.
+ * Set bm_control to zero to indicate that ARB_DISABLE is
+ * not required while entering C3 type state.
+ */
flags->bm_control = 0;
+ }
+ }
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All Zhaoxin CPUs that support C3 share cache.
+ * And caches should not be flushed by software while
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ /*
+ * On all recent Zhaoxin platforms, ARB_DISABLE is a nop.
+ * So, set bm_control to zero to indicate that ARB_DISABLE
+ * is not required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
+ if (cpu_feature_enabled(X86_FEATURE_ZEN)) {
+ /*
+ * For all AMD Zen or newer CPUs that support C3, caches
+ * should not be flushed by software while entering C3
+ * type state. Set bm_check to 1 so that the kernel doesn't
+ * need to execute a cache flush operation.
+ */
+ flags->bm_check = 1;
+ /*
+ * In the current AMD C-state implementation ARB_DIS is no longer
+ * used. So set bm_control to zero to indicate ARB_DIS is not
+ * required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
@@ -61,20 +115,10 @@ struct cstate_entry {
unsigned int ecx;
} states[ACPI_PROCESSOR_MAX_POWER];
};
-static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_CSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
#define NATIVE_CSTATE_BEYOND_HALT (2)
static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
@@ -86,16 +130,19 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
- cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
- MWAIT_CSTATE_MASK) + 1;
+ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
retval = 0;
- if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
+ /* If the HW does not support any sub-states in this C-state */
+ if (num_cstate_subtype == 0) {
+ pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n",
+ cx->address, edx_part);
retval = -1;
goto out;
}
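The probe above decodes the _CST FFH address as an MWAIT hint: the high nibble, plus one and wrapped back into the 4-bit range, selects the C-state type, and CPUID leaf 5's EDX carries the sub-state count for that type in the matching 4-bit group. A standalone decode with a hypothetical hint and a fabricated EDX value:

#include <stdio.h>

#define MWAIT_SUBSTATE_MASK	0xf
#define MWAIT_CSTATE_MASK	0xf
#define MWAIT_SUBSTATE_SIZE	4

int main(void)
{
	unsigned int address = 0x20;	/* hypothetical MWAIT hint from _CST */
	unsigned int edx = 0x2000;	/* fabricated CPUID.5 EDX: two C3 sub-states */
	unsigned int cstate_type =
		(((address >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1) &
		MWAIT_CSTATE_MASK;
	unsigned int num_sub =
		(edx >> (cstate_type * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK;

	printf("type C-%u, %u sub-state(s)\n", cstate_type, num_sub);
	return 0;
}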
@@ -110,11 +157,11 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
if (!mwait_supported[cstate_type]) {
mwait_supported[cstate_type] = 1;
printk(KERN_DEBUG
- "Monitor-Mwait will be used to enter C-%d "
- "state\n", cx->type);
+ "Monitor-Mwait will be used to enter C-%d state\n",
+ cx->type);
}
snprintf(cx->desc,
- ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
+ ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x",
cx->address);
out:
return retval;
@@ -127,7 +174,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
- if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
@@ -139,7 +186,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
/* Make sure we are running on right CPU */
- retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
+ retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx,
+ false);
if (retval == 0) {
/* Use the hint in CST */
percpu_entry->states[cx->index].eax = cx->address;
@@ -158,7 +206,17 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cstate_entry *percpu_entry;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ mwait_play_dead(percpu_entry->states[cx->index].eax);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead);
+
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
struct cstate_entry *percpu_entry;
@@ -172,7 +230,10 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
static int __init ffh_cstate_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
- if (c->x86_vendor != X86_VENDOR_INTEL)
+
+ if (c->x86_vendor != X86_VENDOR_INTEL &&
+ c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..aefb9cb583ad
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ ANNOTATE_NOENDBR
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
new file mode 100644
index 000000000000..6d7603511f52
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
+#include <asm/apic.h>
+#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
+#include <asm/processor.h>
+#include <asm/reboot.h>
+
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
+
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use TEST mailbox command to prove that BIOS got control over
+ * the CPU before declaring it dead.
+ *
+ * BIOS has to clear 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ unsigned long mstart, mend;
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ mstart = PAGE_ALIGN_DOWN(reset_vector);
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ /*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping
+ * at the same place as in the kernel page tables.
+ * asm_acpi_mp_play_dead() switches to the identity mapping and the
+ * function must be present at the same spot in the virtual address space
+ * before and after switching page tables.
+ */
+ info.offset = __START_KERNEL_map - phys_base;
+ mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead));
+ mend = mstart + PAGE_SIZE;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu)
+{
+ if (!acpi_mp_wake_mailbox_paddr) {
+ pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ if (!acpi_mp_wake_mailbox)
+ return -ENOMEM;
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on the mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The values of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+ * woken up. It should not take long for it to wake up and acknowledge
+ * by zeroing out ->command.
+ *
+ * The ACPI specification doesn't provide any guidance on how long the
+ * kernel has to wait for a wake-up acknowledgment. It also doesn't
+ * provide a way to cancel a wake-up request if it takes too long.
+ *
+ * In a TDX environment, the VMM has control over how long it takes to
+ * wake up a secondary CPU. It can postpone scheduling the secondary
+ * vCPU indefinitely. Giving up on the wake-up request and reporting an
+ * error opens a possible attack vector for the VMM: it can wake up a
+ * secondary CPU when the kernel doesn't expect it. Wait until the
+ * wake-up request succeeds.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
+
+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+ cpu_hotplug_disable_offlining();
+
+ /*
+ * ACPI MADT doesn't allow a CPU to be offlined after it has been
+ * onlined. This limits kexec: the second kernel won't be able to use
+ * more than one CPU.
+ *
+ * To prevent a kexec kernel from onlining secondary CPUs, invalidate
+ * the mailbox address in the ACPI MADT wakeup structure so that the
+ * kexec kernel cannot use it.
+ *
+ * This is safe as the booting kernel has the mailbox address cached
+ * already and acpi_wakeup_cpu() uses the cached value to bring up the
+ * secondary CPUs.
+ *
+ * Note: This is a Linux-specific convention and not covered by the
+ * ACPI specification.
+ */
+ mp_wake->mailbox_address = 0;
+}
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+
+ /*
+ * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+ * entry. 'sizeof(struct acpi_madt_multiproc_wakeup)' can be larger
+ * than the actual size of the MP wakeup entry in the ACPI table because
+ * 'reset_vector' is only available in the V1 MP wakeup structure.
+ */
+ if (!mp_wake)
+ return -EINVAL;
+ if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+ if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
+
+ if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+ mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+ if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+ pr_warn("Failed to setup MADT reset vector\n");
+ acpi_mp_disable_offlining(mp_wake);
+ }
+ } else {
+ /*
+ * CPU offlining requires version 1 of the ACPI MADT wakeup
+ * structure.
+ */
+ acpi_mp_disable_offlining(mp_wake);
+ }
+
+ apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
deleted file mode 100644
index 58f1f48a58f8..000000000000
--- a/arch/x86/kernel/acpi/realmode/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-wakeup.bin
-wakeup.elf
-wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef5..000000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always := wakeup.bin
-targets := wakeup.elf wakeup.lds
-
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter. In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y += video-vga.o
-wakeup-y += video-vesa.o
-wakeup-y += video-bios.o
-
-targets += $(wakeup-y)
-
-bootsrc := $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
- -I$(srctree)/$(bootsrc) \
- $(cflags-y) \
- -Wall -Wstrict-prototypes \
- -march=i386 -mregparm=3 \
- -include $(srctree)/$(bootsrc)/code16gcc.h \
- -fno-strict-aliasing -fomit-frame-pointer \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
-KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf := -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin := -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
- $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56ce..000000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d8..000000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba202..000000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a27..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f113..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226a..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f374..000000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
deleted file mode 100644
index 580b4e296010..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-
- .code16
- .section ".header", "a"
-
-/* This should match the structure in wakeup.h */
- .globl wakeup_header
-wakeup_header:
-video_mode: .short 0 /* Video mode number */
-pmode_return: .byte 0x66, 0xea /* ljmpl */
- .long 0 /* offset goes here */
- .short __KERNEL_CS
-pmode_cr0: .long 0 /* Saved %cr0 */
-pmode_cr3: .long 0 /* Saved %cr3 */
-pmode_cr4: .long 0 /* Saved %cr4 */
-pmode_efer: .quad 0 /* Saved EFER */
-pmode_gdt: .quad 0
-realmode_flags: .long 0
-real_magic: .long 0
-trampoline_segment: .word 0
-_pad1: .byte 0
-wakeup_jmp: .byte 0xea /* ljmpw */
-wakeup_jmp_off: .word 3f
-wakeup_jmp_seg: .word 0
-wakeup_gdt: .quad 0, 0, 0
-signature: .long 0x51ee1111
-
- .text
- .globl _start
- .code16
-wakeup_code:
-_start:
- cli
- cld
-
- /* Apparently some dimwit BIOS programmers don't know how to
- program a PM to RM transition, and we might end up here with
- junk in the data segment descriptor registers. The only way
- to repair that is to go into PM and fix it ourselves... */
- movw $16, %cx
- lgdtl %cs:wakeup_gdt
- movl %cr0, %eax
- orb $X86_CR0_PE, %al
- movl %eax, %cr0
- jmp 1f
-1: ljmpw $8, $2f
-2:
- movw %cx, %ds
- movw %cx, %es
- movw %cx, %ss
- movw %cx, %fs
- movw %cx, %gs
-
- andb $~X86_CR0_PE, %al
- movl %eax, %cr0
- jmp wakeup_jmp
-3:
- /* Set up segments */
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- lidtl wakeup_idt
-
- movl $wakeup_stack_end, %esp
-
- /* Clear the EFLAGS */
- pushl $0
- popfl
-
- /* Check header signature... */
- movl signature, %eax
- cmpl $0x51ee1111, %eax
- jne bogus_real_magic
-
- /* Check we really have everything... */
- movl end_signature, %eax
- cmpl $0x65a22c82, %eax
- jne bogus_real_magic
-
- /* Call the C code */
- calll main
-
- /* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
- /* This could also be done in C code... */
- movl pmode_cr3, %eax
- movl %eax, %cr3
-
- movl pmode_cr4, %ecx
- jecxz 1f
- movl %ecx, %cr4
-1:
- movl pmode_efer, %eax
- movl pmode_efer + 4, %edx
- movl %eax, %ecx
- orl %edx, %ecx
- jz 1f
- movl $0xc0000080, %ecx
- wrmsr
-1:
-
- lgdtl pmode_gdt
-
- /* This really couldn't... */
- movl pmode_cr0, %eax
- movl %eax, %cr0
- jmp pmode_return
-#else
- pushw $0
- pushw trampoline_segment
- pushw $0
- lret
-#endif
-
-bogus_real_magic:
-1:
- hlt
- jmp 1b
-
- .data
- .balign 8
-
- /* This is the standard real-mode IDT */
-wakeup_idt:
- .word 0xffff /* limit */
- .long 0 /* address */
- .word 0
-
- .globl HEAP, heap_end
-HEAP:
- .long wakeup_heap
-heap_end:
- .long wakeup_stack
-
- .bss
-wakeup_heap:
- .space 2048
-wakeup_stack:
- .space 2048
-wakeup_stack_end:
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index 060fff8f5c5b..000000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
- . = 0;
- .text : {
- *(.text*)
- }
-
- . = ALIGN(16);
- .rodata : {
- *(.rodata*)
- }
-
- .videocards : {
- video_cards = .;
- *(.videocards)
- video_cards_end = .;
- }
-
- . = ALIGN(16);
- .data : {
- *(.data*)
- }
-
- .signature : {
- end_signature = .;
- LONG(0x65a22c82)
- }
-
- . = ALIGN(16);
- .bss : {
- __bss_start = .;
- *(.bss)
- __bss_end = .;
- }
-
- . = HEADER_OFFSET;
- .header : {
- *(.header)
- }
-
- . = ALIGN(16);
- _end = .;
-
- /DISCARD/ : {
- *(.note*)
- }
-
- /*
- * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
- */
- . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fcc3c61fdecc..91fa262f0e30 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -1,150 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* sleep.c - x86-specific ACPI sleep support.
*
* Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
*/
#include <linux/acpi.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
+#include <linux/pgtable.h>
#include <asm/segment.h>
#include <asm/desc.h>
-
-#include "realmode/wakeup.h"
+#include <asm/cacheflush.h>
+#include <asm/realmode.h>
+#include <asm/hypervisor.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
+
+#include <linux/ftrace.h>
+#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
-unsigned long acpi_wakeup_address;
unsigned long acpi_realmode_flags;
-/* address in low memory of the wakeup routine. */
-static unsigned long acpi_realmode;
-
#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
static char temp_stack[4096];
#endif
/**
- * acpi_save_state_mem - save kernel state
+ * acpi_get_wakeup_address - provide physical address for S3 wakeup
*
- * Create an identity mapped page table and copy the wakeup routine to
- * low memory.
+ * Returns the physical address where the kernel should be resumed after the
+ * system awakes from S3, e.g. for programming into the firmware waking vector.
+ */
+unsigned long acpi_get_wakeup_address(void)
+{
+ return ((unsigned long)(real_mode_header->wakeup_start));
+}
+
+/**
+ * x86_acpi_enter_sleep_state - enter sleep state
+ * @state: Sleep state to enter.
*
- * Note that this is too late to change acpi_wakeup_address.
+ * Wrapper around acpi_enter_sleep_state() to be called by assembly.
*/
-int acpi_save_state_mem(void)
+asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state)
{
- struct wakeup_header *header;
+ return acpi_enter_sleep_state(state);
+}
- if (!acpi_realmode) {
- printk(KERN_ERR "Could not allocate memory during boot, "
- "S3 disabled\n");
- return -ENOMEM;
- }
- memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
+/**
+ * x86_acpi_suspend_lowlevel - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int x86_acpi_suspend_lowlevel(void)
+{
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
- header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
- if (header->signature != 0x51ee1111) {
+ if (header->signature != WAKEUP_HEADER_SIGNATURE) {
printk(KERN_ERR "wakeup header does not match\n");
return -EINVAL;
}
header->video_mode = saved_video_mode;
- header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
- /*
- * Set up the wakeup GDT. We set these up as Big Real Mode,
- * that is, with limits set to 4 GB. At least the Lenovo
- * Thinkpad X61 is known to need this for the video BIOS
- * initialization quirk to work; this is likely to also
- * be the case for other laptops or integrated video devices.
- */
-
- /* GDT[0]: GDT self-pointer */
- header->wakeup_gdt[0] =
- (u64)(sizeof(header->wakeup_gdt) - 1) +
- ((u64)(acpi_wakeup_address +
- ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
- << 16);
- /* GDT[1]: big real mode-like code segment */
- header->wakeup_gdt[1] =
- GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
- /* GDT[2]: big real mode-like data segment */
- header->wakeup_gdt[2] =
- GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
+ header->pmode_behavior = 0;
#ifndef CONFIG_64BIT
- store_gdt((struct desc_ptr *)&header->pmode_gdt);
+ native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
- if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
- &header->pmode_efer_high))
- header->pmode_efer_low = header->pmode_efer_high = 0;
+ /*
+ * We have to check that we can write back the value, and not
+ * just read it. At least on 90 nm Pentium M (Family 6, Model
+ * 13), reading an invalid MSR is not guaranteed to trap, see
+ * Erratum X4 in "Intel Pentium M Processor on 90 nm Process
+ * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
+ * nm process with 512-KB L2 Cache Specification Update".
+ */
+ if (!rdmsr_safe(MSR_EFER,
+ &header->pmode_efer_low,
+ &header->pmode_efer_high) &&
+ !wrmsr_safe(MSR_EFER,
+ header->pmode_efer_low,
+ header->pmode_efer_high))
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4_safe();
+ if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
+ header->pmode_cr4 = __read_cr4();
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
+ }
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high) &&
+ !wrmsr_safe(MSR_IA32_MISC_ENABLE,
+ header->pmode_misc_en_low,
+ header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
- header->trampoline_segment = setup_trampoline() >> 4;
#ifdef CONFIG_SMP
- stack_start.sp = temp_stack + sizeof(temp_stack);
- early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_table(smp_processor_id());
- initial_gs = per_cpu_offset(smp_processor_id());
+ /*
+ * As each CPU starts up, it will find its own stack pointer
+ * from its current_task->thread.sp. Typically that will be
+ * the idle thread for a newly-started AP, or even the boot
+ * CPU which will find it set to &init_task in the static
+ * per-cpu data.
+ *
+ * Make the resuming CPU use the temporary stack at startup
+ * by setting current->thread.sp to point to that. The true
+ * %rsp will be restored with the rest of the CPU context,
+ * by do_suspend_lowlevel(). And unwinders don't care about
+ * the abuse of ->thread.sp because it's a dead variable
+ * while the thread is running on the CPU anyway; the true
+ * value is in the actual %rsp register.
+ */
+ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
+ /*
+ * Ensure the CPU knows which one it is when it comes back, if
+ * it isn't in parallel mode and expected to work that out for
+ * itself.
+ */
+ if (!(smpboot_control & STARTUP_PARALLEL_MASK))
+ smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0L;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
+ do_suspend_lowlevel();
+ unpause_graph_tracing();
return 0;
}
-/*
- * acpi_restore_state - undo effects of acpi_save_state_mem
- */
-void acpi_restore_state_mem(void)
-{
-}
-
-
-/**
- * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
- *
- * We allocate a page from the first 1MB of memory for the wakeup
- * routine for when we come back from a sleep state. The
- * runtime allocator allows specification of <16MB pages, but not
- * <1MB pages.
- */
-void __init acpi_reserve_wakeup_memory(void)
-{
- unsigned long mem;
-
- if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
- printk(KERN_ERR
- "ACPI: Wakeup code way too big, S3 disabled.\n");
- return;
- }
-
- mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
-
- if (mem == -1L) {
- printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
- return;
- }
- acpi_realmode = (unsigned long) phys_to_virt(mem);
- acpi_wakeup_address = mem;
- reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
-}
-
-
static int __init acpi_sleep_setup(char *str)
{
while ((str != NULL) && (*str != '\0')) {
@@ -155,18 +161,19 @@ static int __init acpi_sleep_setup(char *str)
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_hwsig", 8) == 0)
+ acpi_check_s4_hw_signature = 1;
if (strncmp(str, "s4_nohwsig", 10) == 0)
- acpi_no_s4_hw_signature();
- if (strncmp(str, "s4_nonvs", 8) == 0) {
- pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
- "please use acpi_sleep=nonvs instead");
- acpi_nvs_nosave();
- }
+ acpi_check_s4_hw_signature = 0;
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
+ if (strncmp(str, "nonvs_s3", 8) == 0)
+ acpi_nvs_nosave_s3();
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
+ if (strncmp(str, "nobl", 4) == 0)
+ acpi_sleep_no_blacklist();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
@@ -175,3 +182,21 @@ static int __init acpi_sleep_setup(char *str)
}
__setup("acpi_sleep=", acpi_sleep_setup);
+
+#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST)
+static int __init init_s4_sigcheck(void)
+{
+ /*
+ * If running on a hypervisor, honour the ACPI specification
+ * by default and trigger a clean reboot when the hardware
+ * signature in FACS is changed after hibernation.
+ */
+ if (acpi_check_s4_hw_signature == -1 &&
+ !hypervisor_is_type(X86_HYPER_NATIVE))
+ acpi_check_s4_hw_signature = 1;
+
+ return 0;
+}
+/* This must happen before acpi_init() which is a subsys initcall */
+arch_initcall(init_s4_sigcheck);
+#endif
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..054c15a2f860 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -1,16 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Variables and functions used by the code in sleep.c
*/
-#include <asm/trampoline.h>
-
-extern char wakeup_code_start, wakeup_code_end;
+#include <linux/linkage.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
extern int wakeup_pmode_return;
-extern char swsusp_pg_dir[PAGE_SIZE];
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+extern u8 wake_sleep_flags;
+
extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
+
+extern int x86_acpi_suspend_lowlevel(void);
+
+asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 13ab720573e3..cf69081073b5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,24 +1,25 @@
- .section .text..page_aligned
+/* SPDX-License-Identifier: GPL-2.0-only */
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
-# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>
.code32
ALIGN
-ENTRY(wakeup_pmode_return)
-wakeup_pmode_return:
+SYM_CODE_START(wakeup_pmode_return)
movw $__KERNEL_DS, %ax
movw %ax, %ss
- movw %ax, %ds
- movw %ax, %es
movw %ax, %fs
movw %ax, %gs
+ movw $__USER_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
# reload the gdt, as we need the full 32 bit address
- lgdt saved_gdt
lidt saved_idt
lldt saved_ldt
ljmp $(__KERNEL_CS), $1f
@@ -37,6 +38,7 @@ wakeup_pmode_return:
# jump to place where we left off
movl saved_eip, %eax
jmp *%eax
+SYM_CODE_END(wakeup_pmode_return)
bogus_magic:
jmp bogus_magic
@@ -44,7 +46,6 @@ bogus_magic:
save_registers:
- sgdt saved_gdt
sidt saved_idt
sldt saved_ldt
str saved_tss
@@ -59,7 +60,7 @@ save_registers:
popl saved_context_eflags
movl $ret_point, saved_eip
- ret
+ RET
restore_registers:
@@ -69,13 +70,13 @@ restore_registers:
movl saved_context_edi, %edi
pushl saved_context_eflags
popfl
- ret
+ RET
-ENTRY(do_suspend_lowlevel)
+SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
addl $4, %esp
# In case of S3 failure, we'll emerge here. Jump
@@ -85,15 +86,15 @@ ENTRY(do_suspend_lowlevel)
ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
+SYM_CODE_END(do_suspend_lowlevel)
.data
ALIGN
-ENTRY(saved_magic) .long 0
-ENTRY(saved_eip) .long 0
+SYM_DATA(saved_magic, .long 0)
+saved_eip: .long 0
# saved registers
-saved_gdt: .long 0,0
saved_idt: .long 0,0
saved_ldt: .long 0
saved_tss: .long 0
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164cbd04..04f561f75e99 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,44 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
.text
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
-# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+# Copyright 2003 Pavel Machek <pavel@suse.cz>
.code64
/*
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
-ENTRY(wakeup_long64)
- movq saved_magic, %rax
+SYM_FUNC_START(wakeup_long64)
+ ANNOTATE_NOENDBR
+ movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
- jne bogus_64_magic
+ je 2f
+ /* stop here on a saved_magic mismatch */
+ movq $0xbad6d61676963, %rcx
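+ /* 0xbad followed by ASCII "magic" (0x6d61676963), easy to spot in a register dump */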
+1:
+ jmp 1b
+2:
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %ds
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
- movq saved_rsp, %rsp
+ movq saved_rsp(%rip), %rsp
- movq saved_rbx, %rbx
- movq saved_rdi, %rdi
- movq saved_rsi, %rsi
- movq saved_rbp, %rbp
+ movq saved_rbx(%rip), %rbx
+ movq saved_rdi(%rip), %rdi
+ movq saved_rsi(%rip), %rsi
+ movq saved_rbp(%rip), %rbp
- movq saved_rip, %rax
+ movq saved_rip(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
jmp *%rax
-ENDPROC(wakeup_long64)
+SYM_FUNC_END(wakeup_long64)
-bogus_64_magic:
- jmp bogus_64_magic
-
-ENTRY(do_suspend_lowlevel)
+SYM_FUNC_START(do_suspend_lowlevel)
+ FRAME_BEGIN
subq $8, %rsp
xorl %eax, %eax
call save_processor_state
@@ -62,23 +71,24 @@ ENTRY(do_suspend_lowlevel)
pushfq
popq pt_regs_flags(%rax)
- movq $resume_point, saved_rip(%rip)
+ movq $.Lresume_point, saved_rip(%rip)
- movq %rsp, saved_rsp
- movq %rbp, saved_rbp
- movq %rbx, saved_rbx
- movq %rdi, saved_rdi
- movq %rsi, saved_rsi
+ movq %rsp, saved_rsp(%rip)
+ movq %rbp, saved_rbp(%rip)
+ movq %rbx, saved_rbx(%rip)
+ movq %rdi, saved_rdi(%rip)
+ movq %rsi, saved_rsi(%rip)
addq $8, %rsp
movl $3, %edi
xorl %eax, %eax
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
/* in case something went wrong, restore the machine status and go on */
- jmp resume_point
+ jmp .Lresume_point
.align 4
-resume_point:
+.Lresume_point:
+ ANNOTATE_NOENDBR
/* We don't restore %rax, it must be 0 anyway */
movq $saved_context, %rax
movq saved_context_cr4(%rax), %rbx
@@ -107,18 +117,29 @@ resume_point:
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
xorl %eax, %eax
addq $8, %rsp
+ FRAME_END
jmp restore_processor_state
-ENDPROC(do_suspend_lowlevel)
+SYM_FUNC_END(do_suspend_lowlevel)
+STACK_FRAME_NON_STANDARD do_suspend_lowlevel
.data
-ENTRY(saved_rbp) .quad 0
-ENTRY(saved_rsi) .quad 0
-ENTRY(saved_rdi) .quad 0
-ENTRY(saved_rbx) .quad 0
+saved_rbp: .quad 0
+saved_rsi: .quad 0
+saved_rdi: .quad 0
+saved_rbx: .quad 0
-ENTRY(saved_rip) .quad 0
-ENTRY(saved_rsp) .quad 0
+saved_rip: .quad 0
+saved_rsp: .quad 0
-ENTRY(saved_magic) .quad 0
+SYM_DATA(saved_magic, .quad 0)
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 6ff3b5730575..000000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
- .section ".rodata","a"
- .globl wakeup_code_start, wakeup_code_end
-wakeup_code_start:
- .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
-wakeup_code_end:
- .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..28518371d8bf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,45 +1,42 @@
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/stringify.h>
-#include <linux/kprobes.h>
-#include <linux/mm.h>
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
+#include <linux/mmu_context.h>
+#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
-#include <linux/stop_machine.h>
-#include <linux/slab.h>
-#include <asm/alternative.h>
-#include <asm/sections.h>
-#include <asm/pgtable.h>
-#include <asm/mce.h>
+#include <linux/execmem.h>
+
+#include <asm/text-patching.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ibt.h>
+#include <asm/set_memory.h>
#include <asm/nmi.h>
-#include <asm/vsyscall.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#define MAX_PATCH_LEN (255-1)
+int __read_mostly alternatives_patched;
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
+EXPORT_SYMBOL_GPL(alternatives_patched);
-static int __init bootonly(char *str)
-{
- smp_alt_once = 1;
- return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
+#define MAX_PATCH_LEN (255-1)
+
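+/* Debug classes selected by the "debug-alternative=" boot parameter. */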
+#define DA_ALL (~0)
+#define DA_ALT 0x01
+#define DA_RET 0x02
+#define DA_RETPOLINE 0x04
+#define DA_ENDBR 0x08
+#define DA_SMP 0x10
-static int __initdata_or_module debug_alternative;
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
- debug_alternative = 1;
+ if (str && *str == '=')
+ str++;
+
+ if (!str || kstrtouint(str, 0, &debug_alternative))
+ debug_alternative = DA_ALL;
+
return 1;
}
__setup("debug-alternative", debug_alt);
@@ -53,194 +50,2021 @@ static int __init setup_noreplace_smp(char *str)
}
__setup("noreplace-smp", setup_noreplace_smp);
-#ifdef CONFIG_PARAVIRT
-static int __initdata_or_module noreplace_paravirt = 0;
-
-static int __init setup_noreplace_paravirt(char *str)
+#define DPRINTK(type, fmt, args...) \
+do { \
+ if (debug_alternative & DA_##type) \
+ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
+} while (0)
+
+#define DUMP_BYTES(type, buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative & DA_##type)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
+} while (0)
+
+static const unsigned char x86nops[] =
{
- noreplace_paravirt = 1;
- return 1;
-}
-__setup("noreplace-paravirt", setup_noreplace_paravirt);
+ BYTES_NOP1,
+ BYTES_NOP2,
+ BYTES_NOP3,
+ BYTES_NOP4,
+ BYTES_NOP5,
+ BYTES_NOP6,
+ BYTES_NOP7,
+ BYTES_NOP8,
+#ifdef CONFIG_64BIT
+ BYTES_NOP9,
+ BYTES_NOP10,
+ BYTES_NOP11,
#endif
+};
-#define DPRINTK(fmt, args...) if (debug_alternative) \
- printk(KERN_DEBUG fmt, args)
-
-#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-/* Use inline assembly to define this because the nops are defined
- as inline assembly strings in the include files and we cannot
- get them easily into strings. */
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
- GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8
- "\t.previous");
-extern const unsigned char intelnops[];
-static const unsigned char *const __initconst_or_module
-intel_nops[ASM_NOP_MAX+1] = {
+const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
+{
NULL,
- intelnops,
- intelnops + 1,
- intelnops + 1 + 2,
- intelnops + 1 + 2 + 3,
- intelnops + 1 + 2 + 3 + 4,
- intelnops + 1 + 2 + 3 + 4 + 5,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ x86nops,
+ x86nops + 1,
+ x86nops + 1 + 2,
+ x86nops + 1 + 2 + 3,
+ x86nops + 1 + 2 + 3 + 4,
+ x86nops + 1 + 2 + 3 + 4 + 5,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+#ifdef CONFIG_64BIT
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
+#endif
};
+
+#ifdef CONFIG_FINEIBT
+static bool cfi_paranoid __ro_after_init;
#endif
-#ifdef K8_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
- K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8
- "\t.previous");
-extern const unsigned char k8nops[];
-static const unsigned char *const __initconst_or_module
-k8_nops[ASM_NOP_MAX+1] = {
- NULL,
- k8nops,
- k8nops + 1,
- k8nops + 1 + 2,
- k8nops + 1 + 2 + 3,
- k8nops + 1 + 2 + 3 + 4,
- k8nops + 1 + 2 + 3 + 4 + 5,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
+#ifdef CONFIG_MITIGATION_ITS
+
+#ifdef CONFIG_MODULES
+static struct module *its_mod;
#endif
+static void *its_page;
+static unsigned int its_offset;
+struct its_array its_pages;
-#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
- K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8
- "\t.previous");
-extern const unsigned char k7nops[];
-static const unsigned char *const __initconst_or_module
-k7_nops[ASM_NOP_MAX+1] = {
- NULL,
- k7nops,
- k7nops + 1,
- k7nops + 1 + 2,
- k7nops + 1 + 2 + 3,
- k7nops + 1 + 2 + 3 + 4,
- k7nops + 1 + 2 + 3 + 4 + 5,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
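+/*
+ * Allocate one page of RW executable memory and record it in @pages so
+ * that it can later be made ROX or freed; the __free(execmem) cleanup
+ * releases the page automatically on the error path.
+ */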
+static void *__its_alloc(struct its_array *pages)
+{
+ void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+ GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ pages->pages = tmp;
+ pages->pages[pages->num++] = page;
+
+ return no_free_ptr(page);
+}
+
+/* Initialize a thunk with the "jmp *reg; int3" instructions. */
+static void *its_init_thunk(void *thunk, int reg)
+{
+ u8 *bytes = thunk;
+ int offset = 0;
+ int i = 0;
+
+#ifdef CONFIG_FINEIBT
+ if (cfi_paranoid) {
+ /*
+ * When ITS uses an indirect branch thunk, the fineibt_paranoid
+ * caller sequence doesn't fit at the caller site. So put the
+ * remaining part of the sequence (UDB + JNE) into the ITS
+ * thunk.
+ */
+ bytes[i++] = 0xd6; /* UDB */
+ bytes[i++] = 0x75; /* JNE */
+ bytes[i++] = 0xfd;
+
+ offset = 1;
+ }
#endif
-#ifdef P6_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
- P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8
- "\t.previous");
-extern const unsigned char p6nops[];
-static const unsigned char *const __initconst_or_module
-p6_nops[ASM_NOP_MAX+1] = {
- NULL,
- p6nops,
- p6nops + 1,
- p6nops + 1 + 2,
- p6nops + 1 + 2 + 3,
- p6nops + 1 + 2 + 3 + 4,
- p6nops + 1 + 2 + 3 + 4 + 5,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+ bytes[i++] = 0xff;
+ bytes[i++] = 0xe0 + reg; /* JMP *reg */
+ bytes[i++] = 0xcc;
+
+ return thunk + offset;
+}
+
+static void its_pages_protect(struct its_array *pages)
+{
+ for (int i = 0; i < pages->num; i++) {
+ void *page = pages->pages[i];
+ execmem_restore_rox(page, PAGE_SIZE);
+ }
+}
+
+static void its_fini_core(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ its_pages_protect(&its_pages);
+ kfree(its_pages.pages);
+}
+
+#ifdef CONFIG_MODULES
+void its_init_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ mutex_lock(&text_mutex);
+ its_mod = mod;
+ its_page = NULL;
+}
+
+void its_fini_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ WARN_ON_ONCE(its_mod != mod);
+
+ its_mod = NULL;
+ its_page = NULL;
+ mutex_unlock(&text_mutex);
+
+ if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ its_pages_protect(&mod->arch.its_pages);
+}
+
+void its_free_mod(struct module *mod)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return;
+
+ for (int i = 0; i < mod->arch.its_pages.num; i++) {
+ void *page = mod->arch.its_pages.pages[i];
+ execmem_free(page);
+ }
+ kfree(mod->arch.its_pages.pages);
+}
+#endif /* CONFIG_MODULES */
+
+static void *its_alloc(void)
+{
+ struct its_array *pages = &its_pages;
+ void *page;
+
+#ifdef CONFIG_MODULES
+ if (its_mod)
+ pages = &its_mod->arch.its_pages;
#endif
-#ifdef CONFIG_X86_64
+ page = __its_alloc(pages);
+ if (!page)
+ return NULL;
+
+ if (pages == &its_pages)
+ set_memory_x((unsigned long)page, 1);
+
+ return page;
+}
+
+static void *its_allocate_thunk(int reg)
+{
+ int size = 3 + (reg / 8);
+ void *thunk;
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * The ITS thunk contains an indirect jump and an int3 instruction so
+ * its size is 3 or 4 bytes depending on the register used. If CFI
+ * paranoid is used then 3 extra bytes are added in the ITS thunk to
+ * complete the fineibt_paranoid caller sequence.
+ */
+ if (cfi_paranoid)
+ size += 3;
+#endif
+
+ if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) {
+ its_page = its_alloc();
+ if (!its_page) {
+ pr_err("ITS page allocation failed\n");
+ return NULL;
+ }
+ memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE);
+ its_offset = 32;
+ }
-extern char __vsyscall_0;
-static const unsigned char *const *__init_or_module find_nop_table(void)
+ /*
+ * If the indirect branch instruction will be in the lower half
+ * of a cacheline, then update the offset to reach the upper half.
+ */
+ if ((its_offset + size - 1) % 64 < 32)
+ its_offset = ((its_offset - 1) | 0x3F) + 33;
+
+ thunk = its_page + its_offset;
+ its_offset += size;
+
+ return its_init_thunk(thunk, reg);
+}
+
+u8 *its_static_thunk(int reg)
{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return k8_nops;
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+
+#ifdef CONFIG_FINEIBT
+ /* Paranoid thunk starts 2 bytes before */
+ if (cfi_paranoid)
+ return thunk - 2;
+#endif
+ return thunk;
}
-#else /* CONFIG_X86_64 */
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
+
+/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
-static const unsigned char *const *__init_or_module find_nop_table(void)
+/*
+ * Fill the buffer with a single effective instruction of size @len.
+ *
+ * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ * for every single-byte NOP, try to generate the maximally available NOP of
+ * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ * each single-byte NOP). If the @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ * *jump* over instead of executing long and daft NOPs.
+ */
+static void add_nop(u8 *buf, unsigned int len)
{
- if (boot_cpu_has(X86_FEATURE_K8))
- return k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- return k7_nops;
- else if (boot_cpu_has(X86_FEATURE_NOPL))
- return p6_nops;
- else
- return intel_nops;
+ u8 *target = buf + len;
+
+ if (!len)
+ return;
+
+ if (len <= ASM_NOP_MAX) {
+ memcpy(buf, x86_nops[len], len);
+ return;
+ }
+
+ if (len < 128) {
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
+ } else {
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
+ }
+
+ for (; buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
}
-#endif /* CONFIG_X86_64 */
+/*
+ * Find the offset of the first non-NOP instruction starting at @offset
+ * but no further than @len.
+ */
+static int skip_nops(u8 *buf, int offset, int len)
+{
+ struct insn insn;
+
+ for (; offset < len; offset += insn.length) {
+ if (insn_decode_kernel(&insn, &buf[offset]))
+ break;
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
+ if (!insn_is_nop(&insn))
+ break;
+ }
+
+ return offset;
+}
+
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
- const unsigned char *const *noptable = find_nop_table();
+ for (int next, i = 0; i < len; i = next) {
+ struct insn insn;
+
+ if (insn_decode_kernel(&insn, &buf[i]))
+ return;
+
+ next = i + insn.length;
+
+ if (insn_is_nop(&insn)) {
+ int nop = i;
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, noptable[noplen], noplen);
- insns += noplen;
- len -= noplen;
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
+
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
}
}
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern s32 __smp_locks[], __smp_locks_end[];
-static void *text_poke_early(void *addr, const void *opcode, size_t len);
+/*
+ * In this context, "source" is where the instructions are placed in the
+ * section .altinstr_replacement, for example during kernel build by the
+ * toolchain.
+ * "Destination" is where the instructions are being patched in by this
+ * machinery.
+ *
+ * The source offset is:
+ *
+ * src_imm = target - src_next_ip (1)
+ *
+ * and the target offset is:
+ *
+ * dst_imm = target - dst_next_ip (2)
+ *
+ * so rework (1) as an expression for target like:
+ *
+ * target = src_imm + src_next_ip (1a)
+ *
+ * and substitute in (2) to get:
+ *
+ * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ *
+ * Now, since the instruction stream is 'identical' at src and dst (it
+ * is being copied after all) it can be stated that:
+ *
+ * src_next_ip = src + ip_offset
+ * dst_next_ip = dst + ip_offset (4)
+ *
+ * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ * obtain:
+ *
+ * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ * = src_imm + src - dst + ip_offset - ip_offset
+ * = src_imm + src - dst (5)
+ *
+ * IOW, only the relative displacement of the code block matters.
+ */
+
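+/*
+ * Add @d_ to the n_-bit signed displacement at @p_, BUGging if the
+ * result no longer fits in n_ bits.
+ */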
+#define apply_reloc_n(n_, p_, d_) \
+ do { \
+ s32 v = *(s##n_ *)(p_); \
+ v += (d_); \
+ BUG_ON((v >> 31) != (v >> (n_-1))); \
+ *(s##n_ *)(p_) = (s##n_)v; \
+ } while (0)
+
+
+static __always_inline
+void apply_reloc(int n, void *ptr, uintptr_t diff)
+{
+ switch (n) {
+ case 1: apply_reloc_n(8, ptr, diff); break;
+ case 2: apply_reloc_n(16, ptr, diff); break;
+ case 4: apply_reloc_n(32, ptr, diff); break;
+ default: BUG();
+ }
+}
+
+static __always_inline
+bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+{
+ u8 *target = src + offset;
+ /*
+ * If the target is inside the patched block, it's relative to the
+ * block itself and does not need relocation.
+ */
+ return (target < src || target > src + src_len);
+}
+
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ for (int next, i = 0; i < instrlen; i = next) {
+ struct insn insn;
+
+ if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+ return;
+
+ next = i + insn.length;
+
+ switch (insn.opcode.bytes[0]) {
+ case 0x0f:
+ if (insn.opcode.bytes[1] < 0x80 ||
+ insn.opcode.bytes[1] > 0x8f)
+ break;
+
+ fallthrough; /* Jcc.d32 */
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ case JMP8_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case CALL_INSN_OPCODE:
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
+ apply_reloc(insn.immediate.nbytes,
+ buf + i + insn_offset_immediate(&insn),
+ repl - instr);
+ }
+
+ /*
+ * Where possible, convert JMP.d32 into JMP.d8.
+ */
+ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+ s32 imm = insn.immediate.value;
+ imm += repl - instr;
+ imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+ if ((imm >> 31) == (imm >> 7)) {
+ buf[i+0] = JMP8_INSN_OPCODE;
+ buf[i+1] = (s8)imm;
+
+ memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
+ }
+ }
+ break;
+ }
+
+ if (insn_rip_relative(&insn)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
+ apply_reloc(insn.displacement.nbytes,
+ buf + i + insn_offset_displacement(&insn),
+ repl - instr);
+ }
+ }
+ }
+}
+
+void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, instrlen);
+}
-/* Replace instructions with better alternatives for this CPU type.
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that assymetric systems where
- APs have less capabilities than the boot processor are not handled.
- Tough. Make sure you disable such features by hand. */
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
-void __init_or_module apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+noinstr void BUG_func(void)
{
- struct alt_instr *a;
- u8 insnbuf[MAX_PATCH_LEN];
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+{
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
+
+static inline u8 * instr_va(struct alt_instr *i)
+{
+ return (u8 *)&i->instr_offset + i->instr_offset;
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have fewer capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
+ */
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a, *b;
+
+ DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
- DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ /*
+ * The scan order should be from start to end. Later scanned
+ * alternative code can overwrite previously scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
for (a = start; a < end; a++) {
- u8 *instr = a->instr;
- BUG_ON(a->replacementlen > a->instrlen);
- BUG_ON(a->instrlen > sizeof(insnbuf));
- if (!boot_cpu_has(a->cpuid))
+ unsigned int insn_buff_sz = 0;
+
+ /*
+ * In case of nested ALTERNATIVE()s the outer alternative might
+ * add more padding. To ensure consistent patching find the max
+ * padding for all alt_instr entries for this site (nested
+ * alternatives result in consecutive entries).
+ */
+ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
+ u8 len = max(a->instrlen, b->instrlen);
+ a->instrlen = b->instrlen = len;
+ }
+
+ instr = instr_va(a);
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ BUG_ON(a->instrlen > sizeof(insn_buff));
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
+
+ /*
+ * Patch if either:
+ * - feature is present
+ * - feature is not present but ALT_FLAG_NOT is set, meaning:
+ * patch if the feature is *NOT* present.
+ */
+ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
-#ifdef CONFIG_X86_64
- /* vsyscall code is not mapped yet. resolve it manually. */
- if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
- instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- DPRINTK("%s: vsyscall fixup: %p => %p\n",
- __func__, a->instr, instr);
}
+
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, instr, a->instrlen,
+ replacement, a->replacementlen, a->flags);
+
+ memcpy(insn_buff, replacement, a->replacementlen);
+ insn_buff_sz = a->replacementlen;
+
+ if (a->flags & ALT_FLAG_DIRECT_CALL)
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+
+ for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
+ insn_buff[insn_buff_sz] = 0x90;
+
+ text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
+
+ DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+
+ text_poke_early(instr, insn_buff, insn_buff_sz);
+ }
+
+ kasan_enable_current();
+}
+
+static inline bool is_jcc32(struct insn *insn)
+{
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
+
+/*
+ * [CS]{,3} CALL/JMP *%\reg [INT3]*
+ */
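+/*
+ * CALLs may be padded in front with up to three CS segment-override
+ * prefixes; JMPs are padded behind with INT3s. Any bytes still left
+ * over are NOP-padded by the caller.
+ */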
+static int emit_indirect(int op, int reg, u8 *bytes, int len)
+{
+ int cs = 0, bp = 0;
+ int i = 0;
+ u8 modrm;
+
+ /*
+ * Set @len to the excess bytes after writing the instruction.
+ */
+ len -= 2 + (reg >= 8);
+ WARN_ON_ONCE(len < 0);
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ modrm = 0x10; /* Reg = 2; CALL r/m */
+ /*
+ * Additional NOP is better than prefix decode penalty.
+ */
+ if (len <= 3)
+ cs = len;
+ break;
+
+ case JMP32_INSN_OPCODE:
+ modrm = 0x20; /* Reg = 4; JMP r/m */
+ bp = len;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ while (cs--)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ if (reg >= 8) {
+ bytes[i++] = 0x41; /* REX.B prefix */
+ reg -= 8;
+ }
+
+ modrm |= 0xc0; /* Mod = 3 */
+ modrm += reg;
+
+ bytes[i++] = 0xff; /* opcode */
+ bytes[i++] = modrm;
+
+ while (bp--)
+ bytes[i++] = 0xcc; /* INT3 */
+
+ return i;
+}
+
+static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes,
+ void *call_dest, void *jmp_dest)
+{
+ u8 op = insn->opcode.bytes[0];
+ int i = 0;
+
+ /*
+ * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+ * tail-calls. Deal with them.
+ */
+ if (is_jcc32(insn)) {
+ bytes[i++] = op;
+ op = insn->opcode.bytes[1];
+ goto clang_jcc;
+ }
+
+ if (insn->length == 6)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ __text_gen_insn(bytes+i, op, addr+i,
+ call_dest,
+ CALL_INSN_SIZE);
+ i += CALL_INSN_SIZE;
+ break;
+
+ case JMP32_INSN_OPCODE:
+clang_jcc:
+ __text_gen_insn(bytes+i, op, addr+i,
+ jmp_dest,
+ JMP32_INSN_SIZE);
+ i += JMP32_INSN_SIZE;
+ break;
+
+ default:
+ WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+ return -1;
+ }
+
+ WARN_ON_ONCE(i != insn->length);
+
+ return i;
+}
+
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ return __emit_trampoline(addr, insn, bytes,
+ __x86_indirect_call_thunk_array[reg],
+ __x86_indirect_jump_thunk_array[reg]);
+}
+
+#ifdef CONFIG_MITIGATION_ITS
+static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = __x86_indirect_its_thunk_array[reg];
+ u8 *tmp = its_allocate_thunk(reg);
+
+ if (tmp)
+ thunk = tmp;
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+/* Check if an indirect branch is at an ITS-unsafe address */
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
+ return false;
+
+ /* Indirect branch opcode is 2 or 3 bytes depending on reg */
+ addr += 1 + reg / 8;
+
+ /* Lower-half of the cacheline? */
+ return !(addr & 0x20);
+}
+#else /* CONFIG_MITIGATION_ITS */
+
+#ifdef CONFIG_FINEIBT
+static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg)
+{
+ return false;
+}
#endif
- memcpy(insnbuf, a->replacement, a->replacementlen);
- if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
- text_poke_early(instr, insnbuf, a->instrlen);
+
+#endif /* CONFIG_MITIGATION_ITS */
+
+/*
+ * Rewrite the compiler generated retpoline thunk calls.
+ *
+ * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
+ * indirect instructions, avoiding the extra indirection.
+ *
+ * For example, convert:
+ *
+ * CALL __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * CALL *%\reg
+ *
+ * It also tries to inline spectre_v2=retpoline,lfence when size permits.
+ */
+static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
+{
+ retpoline_thunk_t *target;
+ int reg, ret, i = 0;
+ u8 op, cc;
+
+ target = addr + insn->length + insn->immediate.value;
+ reg = target - __x86_indirect_thunk_array;
+
+ if (WARN_ON_ONCE(reg & ~0xf))
+ return -1;
+
+ /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
+ BUG_ON(reg == 4);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return emit_call_track_retpoline(addr, insn, reg, bytes);
+
+ return -1;
}
+
+ op = insn->opcode.bytes[0];
+
+ /*
+ * Convert:
+ *
+ * Jcc.d32 __x86_indirect_thunk_\reg
+ *
+ * into:
+ *
+ * Jncc.d8 1f
+ * [ LFENCE ]
+ * JMP *%\reg
+ * [ NOP ]
+ * 1:
+ */
+ if (is_jcc32(insn)) {
+ cc = insn->opcode.bytes[1] & 0xf;
+ cc ^= 1; /* invert condition */
+
+ bytes[i++] = 0x70 + cc; /* Jcc.d8 */
+ bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
+
+ /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
+ op = JMP32_INSN_OPCODE;
+ }
+
+ /*
+ * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ bytes[i++] = 0x0f;
+ bytes[i++] = 0xae;
+ bytes[i++] = 0xe8; /* LFENCE */
+ }
+
+#ifdef CONFIG_MITIGATION_ITS
+ /*
+ * Check if the address of the last byte of the emitted indirect branch
+ * is in the lower half of the cacheline. Such branches need ITS mitigation.
+ */
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg))
+ return emit_its_trampoline(addr, insn, reg, bytes);
+#endif
+
+ ret = emit_indirect(op, reg, bytes + i, insn->length - i);
+ if (ret < 0)
+ return ret;
+ i += ret;
+
+ for (; i < insn->length;)
+ bytes[i++] = BYTES_NOP1;
+
+ return i;
}
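+
+/*
+ * Worked example (illustrative): with retpolines disabled, a 6-byte
+ * "jne.d32 __x86_indirect_thunk_rax" (0f 85 <rel32>) becomes:
+ *
+ *   74 04    je  .+4    (inverted Jcc.d8, skipping the indirect branch)
+ *   ff e0    jmp *%rax
+ *   90 90    nop; nop   (padding back to the original length)
+ */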
-#ifdef CONFIG_SMP
+/*
+ * Generated by 'objtool --retpoline'.
+ */
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op1, op2;
+ u8 *dest;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op1 = insn.opcode.bytes[0];
+ op2 = insn.opcode.bytes[1];
+
+ switch (op1) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ /* See cfi_paranoid. */
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ /* Check for cfi_paranoid + ITS */
+ dest = addr + insn.length + insn.immediate.value;
+ if (dest[-1] == 0xd6 && (dest[0] & 0xf0) == 0x70) {
+ WARN_ON_ONCE(cfi_mode != CFI_FINEIBT);
+ continue;
+ }
+ break;
+
+ case 0x0f: /* escape */
+ if (op2 >= 0x80 && op2 <= 0x8f)
+ break;
+ fallthrough;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_retpoline(addr, &insn, bytes);
+ if (len == insn.length) {
+ optimize_nops(addr, bytes, len);
+ DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+
+#ifdef CONFIG_MITIGATION_RETHUNK
+
+bool cpu_wants_rethunk(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_RETHUNK);
+}
+
+bool cpu_wants_rethunk_at(void *addr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ return false;
+ if (x86_return_thunk != its_return_thunk)
+ return true;
+
+ return !((unsigned long)addr & 0x20);
+}
+
+/*
+ * Rewrite the compiler generated return thunk tail-calls.
+ *
+ * For example, convert:
+ *
+ * JMP __x86_return_thunk
+ *
+ * into:
+ *
+ * RET
+ */
+static int patch_return(void *addr, struct insn *insn, u8 *bytes)
+{
+ int i = 0;
+
+ /* Patch the custom return thunks... */
+ if (cpu_wants_rethunk_at(addr)) {
+ i = JMP32_INSN_SIZE;
+ __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
+ } else {
+ /* ... or patch them out if not needed. */
+ bytes[i++] = RET_INSN_OPCODE;
+ }
+
+ for (; i < insn->length;)
+ bytes[i++] = INT3_INSN_OPCODE;
+ return i;
+}
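+
+/*
+ * Example (illustrative): a 5-byte "jmp __x86_return_thunk" (e9 <rel32>)
+ * becomes "c3 cc cc cc cc" (RET plus INT3 padding) when no return thunk
+ * is required, or a JMP32 to x86_return_thunk when one is.
+ */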
+
+void __init_or_module noinline apply_returns(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ if (cpu_wants_rethunk())
+ static_call_force_reinit();
+
+ for (s = start; s < end; s++) {
+ void *dest = NULL, *addr = (void *)s + *s;
+ struct insn insn;
+ int len, ret;
+ u8 bytes[16];
+ u8 op;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op == JMP32_INSN_OPCODE)
+ dest = addr + insn.length + insn.immediate.value;
+
+ if (__static_call_fixup(addr, op, dest) ||
+ WARN_ONCE(dest != &__x86_return_thunk,
+ "missing return thunk: %pS-%pS: %*ph",
+ addr, dest, 5, addr))
+ continue;
+
+ DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
+ addr, addr, insn.length,
+ addr + insn.length + insn.immediate.value);
+
+ len = patch_return(addr, &insn, bytes);
+ if (len == insn.length) {
+ DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
+ text_poke_early(addr, bytes, len);
+ }
+ }
+}
+#else /* !CONFIG_MITIGATION_RETHUNK: */
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+#endif /* !CONFIG_MITIGATION_RETHUNK */
+
+#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
+
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+
+#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
+
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr bool is_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return __is_endbr(endbr);
+
+Efault:
+ return false;
+}
+
+#ifdef CONFIG_FINEIBT
+
+static __noendbr bool exact_endbr(u32 *val)
+{
+ u32 endbr;
+
+ __get_kernel_nofault(&endbr, val, u32, Efault);
+ return endbr == gen_endbr();
+
+Efault:
+ return false;
+}
+
+#endif
+
+static void poison_cfi(void *addr);
+
+static void __init_or_module poison_endbr(void *addr)
+{
+ u32 poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(!is_endbr(addr)))
+ return;
+
+ DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
+
+ /*
+ * When we have IBT, the lack of ENDBR will trigger #CP
+ */
+ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
+ text_poke_early(addr, &poison, 4);
+}
+
+/*
+ * Generated by: objtool --ibt
+ *
+ * Seal the functions for indirect calls by clobbering the ENDBR instructions
+ * and the kCFI hash value.
+ */
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ poison_endbr(addr);
+ if (IS_ENABLED(CONFIG_FINEIBT))
+ poison_cfi(addr - 16);
+ }
+}
+
+#else /* !CONFIG_X86_KERNEL_IBT: */
+
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
+
+#endif /* !CONFIG_X86_KERNEL_IBT */
+
+#ifdef CONFIG_CFI_AUTO_DEFAULT
+# define __CFI_DEFAULT CFI_AUTO
+#elif defined(CONFIG_CFI)
+# define __CFI_DEFAULT CFI_KCFI
+#else
+# define __CFI_DEFAULT CFI_OFF
+#endif
+
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+static bool cfi_debug __ro_after_init;
+
+#ifdef CONFIG_FINEIBT_BHI
+bool cfi_bhi __ro_after_init = false;
+#endif
+
+#ifdef CONFIG_CFI
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+
+int cfi_get_func_arity(void *func)
+{
+ bhi_thunk *target;
+ s32 disp;
+
+ if (cfi_mode != CFI_FINEIBT && !cfi_bhi)
+ return 0;
+
+ if (get_kernel_nofault(disp, func - 4))
+ return 0;
+
+ target = func + disp;
+ return target - __bhi_args;
+}
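+
+/*
+ * Example (illustrative): when the preamble ends with "cs cs call
+ * __bhi_args[N]", the rel32 occupies the 4 bytes before func+0, so
+ * func + disp gives the thunk address and its index in __bhi_args[]
+ * recovers the arity N.
+ */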
+#endif
+
+#ifdef CONFIG_FINEIBT
+
+static bool cfi_rand __ro_after_init = true;
+static u32 cfi_seed __ro_after_init;
+
+/*
+ * Re-hash the CFI hash with a boot-time seed while making sure the result is
+ * not a valid ENDBR instruction.
+ */
+static u32 cfi_rehash(u32 hash)
+{
+ hash ^= cfi_seed;
+ while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) {
+ bool lsb = hash & 1;
+ hash >>= 1;
+ if (lsb)
+ hash ^= 0x80200003;
+ }
+ return hash;
+}
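+
+/*
+ * Example (illustrative, assuming __is_endbr() matches the raw endbr64
+ * dword): if hash ^ cfi_seed came out as 0xfa1e0ff3 (f3 0f 1e fa), the
+ * low bit is set, so one iteration yields (0xfa1e0ff3 >> 1) ^ 0x80200003
+ * == 0xfd2f07fa, which no longer looks like an ENDBR.
+ */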
+
+static __init int cfi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "auto")) {
+ cfi_mode = CFI_AUTO;
+ } else if (!strcmp(str, "off")) {
+ cfi_mode = CFI_OFF;
+ cfi_rand = false;
+ } else if (!strcmp(str, "debug")) {
+ cfi_debug = true;
+ } else if (!strcmp(str, "kcfi")) {
+ cfi_mode = CFI_KCFI;
+ } else if (!strcmp(str, "fineibt")) {
+ cfi_mode = CFI_FINEIBT;
+ } else if (!strcmp(str, "norand")) {
+ cfi_rand = false;
+ } else if (!strcmp(str, "warn")) {
+ pr_alert("CFI: mismatch non-fatal!\n");
+ cfi_warn = true;
+ } else if (!strcmp(str, "paranoid")) {
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_paranoid = true;
+ } else {
+ pr_err("CFI: ignoring paranoid; depends on fineibt.\n");
+ }
+ } else if (!strcmp(str, "bhi")) {
+#ifdef CONFIG_FINEIBT_BHI
+ if (cfi_mode == CFI_FINEIBT) {
+ cfi_bhi = true;
+ } else {
+ pr_err("CFI: ignoring bhi; depends on fineibt.\n");
+ }
+#else
+ pr_err("CFI: ignoring bhi; depends on FINEIBT_BHI=y.\n");
+#endif
+ } else {
+ pr_err("CFI: Ignoring unknown option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("cfi", cfi_parse_cmdline);
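+
+/*
+ * Example (illustrative): booting with "cfi=fineibt,paranoid,norand"
+ * selects FineIBT with the extra caller-side hash check and disables
+ * boot-time hash randomization.
+ */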
+
+/*
+ * kCFI                                      FineIBT
+ *
+ * __cfi_\func:                              __cfi_\func:
+ *      movl $0x12345678,%eax       // 5         endbr64                 // 4
+ *      nop                                      subl $0x12345678,%eax   // 5
+ *      nop                                      jne.d32,pn \func+3      // 7
+ *      nop
+ *      nop
+ *      nop
+ *      nop
+ *      nop
+ *      nop
+ *      nop
+ *      nop
+ *      nop
+ * \func:                                    \func:
+ *      endbr64                                  nopl -42(%rax)
+ *
+ *
+ * caller:                                   caller:
+ *      movl $(-0x12345678),%r10d   // 6         movl $0x12345678,%eax   // 5
+ *      addl -15(%r11),%r10d        // 4         lea  -0x10(%r11),%r11   // 4
+ *      je   1f                     // 2         nop5                    // 5
+ *      ud2                         // 2
+ * 1:   cs call __x86_indirect_thunk_r11 // 6    call *%r11; nop3;       // 6
+ *
+ *
+ * Notably, the FineIBT sequences are crafted such that branches are presumed
+ * non-taken. This is based on Agner Fog's optimization manual, which states:
+ *
+ * "Make conditional jumps most often not taken: The efficiency and throughput
+ * for not-taken branches is better than for taken branches on most
+ * processors. Therefore, it is good to place the most frequent branch first"
+ */
+
+/*
+ * <fineibt_preamble_start>:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 0f 85 03 00 00 00 jne,pn 13 <fineibt_preamble_start+0x13>
+ * 10: 0f 1f 40 d6 nopl -0x2a(%rax)
+ *
+ * Note that the JNE target is the 0xD6 byte inside the NOPL; on x86_64 that
+ * byte decodes as UDB and raises #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %eax \n"
+ "fineibt_preamble_bhi: \n"
+ " cs jne.d32 fineibt_preamble_start+0x13 \n"
+ "#fineibt_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_preamble_end: \n"
+ ".popsection\n"
+);
+
+extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_bhi[];
+extern u8 fineibt_preamble_end[];
+
+#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start)
+#define fineibt_preamble_ud 0x13
+#define fineibt_preamble_hash 5
+
+/*
+ * <fineibt_caller_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 4d 8d 5b f0 lea -0x10(%r11), %r11
+ * 9: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_caller_start: \n"
+ " movl $0x12345678, %eax \n"
+ " lea -0x10(%r11), %r11 \n"
+ ASM_NOP5
+ "fineibt_caller_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_caller_start[];
+extern u8 fineibt_caller_end[];
+
+#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
+#define fineibt_caller_hash 1
+
+#define fineibt_caller_jmp (fineibt_caller_size - 2)
+
+/*
+ * Since FineIBT does hash validation on the callee side it is prone to
+ * circumvention attacks where a 'naked' ENDBR instruction exists that
+ * is not part of the fineibt_preamble sequence.
+ *
+ * Notably the x86 entry points must be ENDBR and equally cannot be
+ * fineibt_preamble.
+ *
+ * The fineibt_paranoid caller sequence adds additional caller side
+ * hash validation. This stops such circumvention attacks dead, but at the cost
+ * of adding a load.
+ *
+ * <fineibt_paranoid_start>:
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -0xb(%r11), %eax
+ * 9: 2e 4d 8d 5b <f0> cs lea -0x10(%r11), %r11
+ * e: 75 fd jne d <fineibt_paranoid_start+0xd>
+ * 10: 41 ff d3 call *%r11
+ * 13: 90 nop
+ *
+ * Notably, the LEA does not modify flags and can therefore be reordered with
+ * the CMP, avoiding a dependency. Again, the failure case uses a non-taken
+ * (backwards) branch and abuses the LEA's trailing displacement byte 0xf0 as
+ * a LOCK prefix for the Jcc.d8, causing #UD.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_paranoid_start: \n"
+ " mov $0x12345678, %eax \n"
+ " cmpl -11(%r11), %eax \n"
+ " cs lea -0x10(%r11), %r11 \n"
+ "#fineibt_caller_size: \n"
+ " jne fineibt_paranoid_start+0xd \n"
+ "fineibt_paranoid_ind: \n"
+ " cs call *%r11 \n"
+ "fineibt_paranoid_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_paranoid_start[];
+extern u8 fineibt_paranoid_ind[];
+extern u8 fineibt_paranoid_end[];
+
+#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start)
+#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start)
+#define fineibt_paranoid_ud 0xd
+
+static u32 decode_preamble_hash(void *addr, int *reg)
+{
+ u8 *p = addr;
+
+ /* b8+reg 78 56 34 12 movl $0x12345678,\reg */
+ if (p[0] >= 0xb8 && p[0] < 0xc0) {
+ if (reg)
+ *reg = p[0] - 0xb8;
+ return *(u32 *)(addr + 1);
+ }
+
+ return 0; /* invalid hash value */
+}
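+
+/*
+ * Example (illustrative): "b9 78 56 34 12" is "movl $0x12345678, %ecx",
+ * so *reg is set to 1 (ecx) and the hash 0x12345678 is returned.
+ */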
+
+static u32 decode_caller_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */
+ if (p[0] == 0x41 && p[1] == 0xba)
+ return -*(u32 *)(addr + 2);
+
+ /* eb 0c 88 a9 cb ed jmp.d8 +12 */
+ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
+ return -*(u32 *)(addr + 2);
+
+ return 0; /* invalid hash value */
+}
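+
+/*
+ * Example (illustrative): "41 ba 88 a9 cb ed" is
+ * "mov $(-0x12345678), %r10d"; negating the immediate 0xedcba988
+ * returns the hash 0x12345678.
+ */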
+
+/* .retpoline_sites */
+static int cfi_disable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
+ * in tact for later usage. Also see decode_caller_hash() and
+ * cfi_rewrite_callers().
+ */
+ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
+ continue;
+
+ text_poke_early(addr, jmp, 2);
+ }
+
+ return 0;
+}
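+
+/*
+ * Example (illustrative): "41 ba" becomes "eb 0c", a JMP.d8 over the
+ * remaining 12 bytes of the 14-byte caller sequence, landing on the
+ * indirect CALL with the hash immediate still in place.
+ */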
+
+static int cfi_enable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Re-enable kCFI, undo what cfi_disable_callers() did.
+ */
+ const u8 mov[] = { 0x41, 0xba };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
+ continue;
+
+ text_poke_early(addr, mov, 2);
+ }
+
+ return 0;
+}
+
+/* .cfi_sites */
+static int cfi_rand_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr, NULL);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ hash = cfi_rehash(hash);
+ text_poke_early(addr + 1, &hash, 4);
+ }
+
+ return 0;
+}
+
+/*
+ * Inline the bhi-arity 1 case:
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 49 0f 45 fa cmovne %r10, %rdi
+ * d: 2e 75 03 jne,pn foo+0x3
+ *
+ * foo:
+ * 10: 0f 1f 40 <d6> nopl -42(%rax)
+ *
+ * Notably, this scheme is incompatible with permissive CFI: on a hash
+ * mismatch the CMOVNE has already clobbered RDI, so execution cannot
+ * simply be allowed to continue into the function.
+ */
+asm( ".pushsection .rodata \n"
+ "fineibt_bhi1_start: \n"
+ " cmovne %rax, %rdi \n"
+ " cs jne fineibt_bhi1_func + 0x3 \n"
+ "fineibt_bhi1_func: \n"
+ " nopl -42(%rax) \n"
+ "fineibt_bhi1_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_bhi1_start[];
+extern u8 fineibt_bhi1_end[];
+
+#define fineibt_bhi1_size (fineibt_bhi1_end - fineibt_bhi1_start)
+
+static void cfi_fineibt_bhi_preamble(void *addr, int arity)
+{
+ u8 bytes[MAX_INSN_SIZE];
+
+ if (!arity)
+ return;
+
+ if (!cfi_warn && arity == 1) {
+ text_poke_early(addr + fineibt_preamble_bhi,
+ fineibt_bhi1_start, fineibt_bhi1_size);
+ return;
+ }
+
+ /*
+ * Replace the bytes at fineibt_preamble_bhi with a CALL instruction
+ * that lines up exactly with the end of the preamble, such that the
+ * return address will be foo+0.
+ *
+ * __cfi_foo:
+ * 0: f3 0f 1e fa endbr64
+ * 4: 2d 78 56 34 12 sub $0x12345678, %eax
+ * 9: 2e 2e e8 DD DD DD DD cs cs call __bhi_args[arity]
+ */
+ bytes[0] = 0x2e;
+ bytes[1] = 0x2e;
+ __text_gen_insn(bytes + 2, CALL_INSN_OPCODE,
+ addr + fineibt_preamble_bhi + 2,
+ __bhi_args[arity], CALL_INSN_SIZE);
+ text_poke_early(addr + fineibt_preamble_bhi, bytes, 7);
+}
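+
+/*
+ * Example (illustrative): for arity 2 the 7-byte "cs jne.d32" at
+ * preamble+9 is replaced by "2e 2e e8 <rel32>", a double CS-prefixed
+ * CALL of the same length whose return address is exactly foo+0.
+ */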
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ int arity;
+ u32 hash;
+
+ /*
+ * When the function doesn't start with ENDBR the compiler will
+ * have determined there are no indirect calls to it and we
+ * don't need CFI either.
+ */
+ if (!is_endbr(addr + 16))
+ continue;
+
+ hash = decode_preamble_hash(addr, &arity);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+
+ WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity,
+ "kCFI preamble has wrong register at: %pS %*ph\n",
+ addr, 5, addr);
+
+ if (cfi_bhi)
+ cfi_fineibt_bhi_preamble(addr, arity);
+ }
+
+ return 0;
+}
+
+static void cfi_rewrite_endbr(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+
+ if (!exact_endbr(addr + 16))
+ continue;
+
+ poison_endbr(addr + 16);
+ }
+}
+
+/* .retpoline_sites */
+static int cfi_rand_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ hash = -cfi_rehash(hash);
+ text_poke_early(addr + 2, &hash, 4);
+ }
+ }
+
+ return 0;
+}
+
+static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2;
+
+#ifdef CONFIG_MITIGATION_ITS
+ u8 *tmp = its_allocate_thunk(reg);
+ if (tmp)
+ thunk = tmp;
+#endif
+
+ return __emit_trampoline(addr, insn, bytes, thunk, thunk);
+}
+
+static int cfi_rewrite_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ struct insn insn;
+ u8 bytes[20];
+ u32 hash;
+ int ret;
+ u8 op;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash)
+ continue;
+
+ if (!cfi_paranoid) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ /* rely on apply_retpolines() */
+ continue;
+ }
+
+ /* cfi_paranoid */
+ ret = insn_decode_kernel(&insn, addr + fineibt_caller_size);
+ if (WARN_ON_ONCE(ret < 0))
+ continue;
+
+ op = insn.opcode.bytes[0];
+ if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) {
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size);
+ memcpy(bytes + fineibt_caller_hash, &hash, 4);
+
+ if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) {
+ emit_paranoid_trampoline(addr + fineibt_caller_size,
+ &insn, 11, bytes + fineibt_caller_size);
+ } else {
+ int len = fineibt_paranoid_size - fineibt_paranoid_ind;
+ ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind, len);
+ if (WARN_ON_ONCE(ret != len))
+ continue;
+ }
+
+ text_poke_early(addr, bytes, fineibt_paranoid_size);
+ }
+
+ return 0;
+}
+
+#define pr_cfi_debug(X...) do { if (cfi_debug) pr_info(X); } while (0)
+
+#define FINEIBT_WARN(_f, _v) \
+ WARN_ONCE((_f) != (_v), "FineIBT: " #_f " %ld != %d\n", _f, _v)
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ int ret;
+
+ if (FINEIBT_WARN(fineibt_preamble_size, 20) ||
+ FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) ||
+ FINEIBT_WARN(fineibt_caller_size, 14) ||
+ FINEIBT_WARN(fineibt_paranoid_size, 20))
+ return;
+
+ if (cfi_mode == CFI_AUTO) {
+ cfi_mode = CFI_KCFI;
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) {
+ /*
+ * FRED has much saner context on exception entry and
+ * is less easy to take advantage of.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ cfi_paranoid = true;
+ cfi_mode = CFI_FINEIBT;
+ }
+ }
+
+ /*
+ * Rewrite the callers to not use the __cfi_ stubs, such that we might
+ * rewrite them. This disables all CFI. If this succeeds but any of the
+ * later stages fails, we're without CFI.
+ */
+ pr_cfi_debug("CFI: disabling all indirect call checking\n");
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (cfi_rand) {
+ if (builtin) {
+ cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
+ pr_cfi_debug("CFI: cfi_seed: 0x%08x\n", cfi_seed);
+
+ pr_cfi_debug("CFI: rehashing all preambles\n");
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ pr_cfi_debug("CFI: rehashing all indirect calls\n");
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+ } else {
+ pr_cfi_debug("CFI: rehashing disabled\n");
+ }
+
+ switch (cfi_mode) {
+ case CFI_OFF:
+ if (builtin)
+ pr_info("CFI: disabled\n");
+ return;
+
+ case CFI_KCFI:
+ pr_cfi_debug("CFI: re-enabling all indirect call checking\n");
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("CFI: Using %sretpoline kCFI\n",
+ cfi_rand ? "rehashed " : "");
+ return;
+
+ case CFI_FINEIBT:
+ pr_cfi_debug("CFI: adding FineIBT to all preambles\n");
+ /* place the FineIBT preamble at func()-16 */
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ /* rewrite the callers to target func()-16 */
+ pr_cfi_debug("CFI: rewriting indirect call sites to use FineIBT\n");
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ /* now that nobody targets func()+0, remove ENDBR there */
+ pr_cfi_debug("CFI: removing old endbr insns\n");
+ cfi_rewrite_endbr(start_cfi, end_cfi);
+
+ if (builtin) {
+ pr_info("Using %sFineIBT%s CFI\n",
+ cfi_paranoid ? "paranoid " : "",
+ cfi_bhi ? "+BHI" : "");
+ }
+ return;
+
+ default:
+ break;
+ }
+
+err:
+ pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
+}
+
+static inline void poison_hash(void *addr)
+{
+ *(u32 *)addr = 0;
+}
+
+static void poison_cfi(void *addr)
+{
+ /*
+ * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes,
+ * some (static) functions for which they can determine the address
+ * is never taken do not get a __cfi prefix, but *DO* get an ENDBR.
+ *
+ * As such, these functions will get sealed, but we need to be careful
+ * to not unconditionally scribble the previous function.
+ */
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ /*
+ * FineIBT prefix should start with an ENDBR.
+ */
+ if (!is_endbr(addr))
+ break;
+
+ /*
+ * __cfi_\func:
+ * nopl -42(%rax)
+ * sub $0, %eax
+ * jne \func+3
+ * \func:
+ * nopl -42(%rax)
+ */
+ poison_endbr(addr);
+ poison_hash(addr + fineibt_preamble_hash);
+ break;
+
+ case CFI_KCFI:
+ /*
+ * kCFI prefix should start with a valid hash.
+ */
+ if (!decode_preamble_hash(addr, NULL))
+ break;
+
+ /*
+ * __cfi_\func:
+ * movl $0, %eax
+ * .skip 11, 0x90
+ */
+ poison_hash(addr + 1);
+ break;
+
+ default:
+ break;
+ }
+}
+
+#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE)
+
+/*
+ * When regs->ip points to a 0xD6 byte in the FineIBT preamble,
+ * return true and fill out target and type.
+ *
+ * We identify the preamble by checking for the ENDBR instruction at its
+ * known offset relative to the UDB instruction.
+ */
+static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_preamble_ud;
+ u32 hash;
+
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ *target = addr + fineibt_prefix_size;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * Since regs->ip points to the middle of an instruction; it cannot
+ * continue with the normal fixup.
+ */
+ regs->ip = *target;
+
+ return true;
+
+Efault:
+ return false;
+}
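+
+/*
+ * Example (illustrative): the caller loaded %eax with its hash H_c and
+ * the preamble did "subl $H_f, %eax", so on a mismatch %eax holds
+ * H_c - H_f; adding the preamble hash H_f back recovers the caller's
+ * type H_c for reporting.
+ */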
+
+/*
+ * regs->ip points to one of the UD2 in __bhi_args[].
+ */
+static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr;
+ u32 hash;
+
+ if (!cfi_bhi)
+ return false;
+
+ if (regs->ip < (unsigned long)__bhi_args ||
+ regs->ip >= (unsigned long)__bhi_args_end)
+ return false;
+
+ /*
+ * Fetch the return address from the stack; this points to the
+ * FineIBT preamble. Since the CALL instruction is in the last 5
+ * bytes of the preamble, the return address is in fact the target
+ * address.
+ */
+ __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault);
+ *target = addr;
+
+ addr -= fineibt_prefix_size;
+ if (!exact_endbr((void *)addr))
+ return false;
+
+ __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault);
+ *type = (u32)regs->ax + hash;
+
+ /*
+ * The UD2 sites are constructed with a RET immediately following,
+ * as such the non-fatal case can use the regular fixup.
+ */
+ return true;
+
+Efault:
+ return false;
+}
+
+static bool is_paranoid_thunk(unsigned long addr)
+{
+ u32 thunk;
+
+ __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault);
+ return (thunk & 0x00FFFFFF) == 0xfd75d6;
+
+Efault:
+ return false;
+}
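+
+/*
+ * Example (illustrative): the thunk tail bytes "d6 75 fd" (udb; jne -3)
+ * load as the little-endian u32 0xXXfd75d6; masking off the top byte
+ * ignores whatever instruction follows.
+ */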
+
+/*
+ * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[]
+ * sequence, or to UDB + Jcc.d8 for cfi_paranoid + ITS thunk.
+ */
+static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ unsigned long addr = regs->ip - fineibt_paranoid_ud;
+
+ if (!cfi_paranoid)
+ return false;
+
+ if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ /*
+ * Since the trapping instruction is the exact, but LOCK prefixed,
+ * Jcc.d8 that got us here, the normal fixup will work.
+ */
+ return true;
+ }
+
+ /*
+ * The cfi_paranoid + ITS thunk combination results in:
+ *
+ * 0: b8 78 56 34 12 mov $0x12345678, %eax
+ * 5: 41 3b 43 f5 cmp -0xb(%r11), %eax
+ * 9: 2e 4d 8d 5b f0 cs lea -0x10(%r11), %r11
+ * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11
+ *
+ * Where the paranoid_thunk looks like:
+ *
+ * 1d: <d6> udb
+ * __x86_indirect_paranoid_thunk_r11:
+ * 1e: 75 fd jne 1d
+ * __x86_indirect_its_thunk_r11:
+ * 20: 41 ff e3 jmp *%r11
+ * 23: cc int3
+ *
+ */
+ if (is_paranoid_thunk(regs->ip)) {
+ *target = regs->r11 + fineibt_prefix_size;
+ *type = regs->ax;
+
+ regs->ip = *target;
+ return true;
+ }
+
+ return false;
+}
+
+bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type)
+{
+ if (decode_fineibt_paranoid(regs, target, type))
+ return true;
+
+ if (decode_fineibt_bhi(regs, target, type))
+ return true;
+
+ return decode_fineibt_preamble(regs, target, type);
+}
+
+#else /* !CONFIG_FINEIBT: */
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ if (IS_ENABLED(CONFIG_CFI) && builtin)
+ pr_info("CFI: Using standard kCFI\n");
+}
+
+#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_cfi(void *addr) { }
+#endif
+
+#endif /* !CONFIG_FINEIBT */
+
+void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi)
+{
+ return __apply_fineibt(start_retpoline, end_retpoline,
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
+}
+
+#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -249,8 +2073,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
/* turn DS segment override prefix into lock prefix */
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
- };
- mutex_unlock(&text_mutex);
+ }
}
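
/*
 * Example (illustrative): "3e 0f b1 0a" (ds cmpxchg %ecx,(%rdx)) becomes
 * "f0 0f b1 0a" (lock cmpxchg), and alternatives_smp_unlock() undoes it.
 */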
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
@@ -258,10 +2081,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- if (noreplace_smp)
- return;
-
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -270,8 +2089,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
/* turn lock prefix into DS segment override prefix */
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
- };
- mutex_unlock(&text_mutex);
+ }
}
struct smp_alt_module {
@@ -290,8 +2108,7 @@ struct smp_alt_module {
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static int smp_mode = 1; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by text_mutex */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
@@ -300,19 +2117,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
{
struct smp_alt_module *smp;
- if (noreplace_smp)
- return;
+ mutex_lock(&text_mutex);
+ if (!uniproc_patched)
+ goto unlock;
- if (smp_alt_once) {
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(locks, locks_end,
- text, text_end);
- return;
- }
+ if (num_possible_cpus() == 1)
+ /* Don't bother remembering, we'll never have to undo it. */
+ goto smp_unlock;
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
- return; /* we'll run the (safe but slow) SMP code then ... */
+ /* we'll run the (safe but slow) SMP code then ... */
+ goto unlock;
smp->mod = mod;
smp->name = name;
@@ -320,85 +2136,58 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __func__, smp->locks, smp->locks_end,
+ DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- mutex_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(smp->locks, smp->locks_end,
- smp->text, smp->text_end);
- mutex_unlock(&smp_alt);
+smp_unlock:
+ alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
+ mutex_unlock(&text_mutex);
}
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- if (smp_alt_once || noreplace_smp)
- return;
-
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- mutex_unlock(&smp_alt);
- DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
- return;
+ break;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-void alternatives_smp_switch(int smp)
+void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
-#ifdef CONFIG_LOCKDEP
- /*
- * Older binutils section handling bug prevented
- * alternatives-replacement from working reliably.
- *
- * If this still occurs then you should see a hang
- * or crash shortly after this line:
- */
- printk("lockdep: fixing up alternatives.\n");
-#endif
-
- if (noreplace_smp || smp_alt_once)
- return;
- BUG_ON(!smp && (num_online_cpus() > 1));
+ /* Why bother if there are no other CPUs? */
+ BUG_ON(num_possible_cpus() == 1);
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
- /*
- * Avoid unnecessary switches because it forces JIT based VMs to
- * throw away all cached translations, which can be quite costly.
- */
- if (smp == smp_mode) {
- /* nothing */
- } else if (smp) {
- printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+ if (uniproc_patched) {
+ pr_info("switching to SMP code\n");
+ BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
- } else {
- printk(KERN_INFO "SMP alternatives: switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
- list_for_each_entry(mod, &smp_alt_modules, next)
- alternatives_smp_unlock(mod->locks, mod->locks_end,
- mod->text, mod->text_end);
+ uniproc_patched = false;
}
- smp_mode = smp;
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-/* Return 1 if the address range is reserved for smp-alternatives */
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
@@ -406,6 +2195,8 @@ int alternatives_text_reserved(void *start, void *end)
u8 *text_start = start;
u8 *text_end = end;
+ lockdep_assert_held(&text_mutex);
+
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
@@ -419,94 +2210,206 @@ int alternatives_text_reserved(void *start, void *end)
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
+
+/*
+ * Self-test for the INT3 based CALL emulation code.
+ *
+ * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
+ * properly and that there is a stack gap between the INT3 frame and the
+ * previous context. Without this gap doing a virtual PUSH on the interrupted
+ * stack would corrupt the INT3 IRET frame.
+ *
+ * See entry_{32,64}.S for more details.
+ */
-#ifdef CONFIG_PARAVIRT
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
+extern void int3_selftest_asm(unsigned int *ptr);
+
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_asm, @function\n"
+"int3_selftest_asm:\n"
+ ANNOTATE_NOENDBR "\n"
+ /*
+ * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
+ * exception, then the int3_exception_nb notifier emulates a call to
+ * int3_selftest_callee().
+ */
+" int3; nop; nop; nop; nop\n"
+ ASM_RET
+" .size int3_selftest_asm, . - int3_selftest_asm\n"
+" .popsection\n"
+);
+
+extern void int3_selftest_callee(unsigned int *ptr);
+
+asm (
+" .pushsection .init.text, \"ax\", @progbits\n"
+" .type int3_selftest_callee, @function\n"
+"int3_selftest_callee:\n"
+ ANNOTATE_NOENDBR "\n"
+" movl $0x1234, (%" _ASM_ARG1 ")\n"
+ ASM_RET
+" .size int3_selftest_callee, . - int3_selftest_callee\n"
+" .popsection\n"
+);
+
+
+static int __init
+int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
- struct paravirt_patch_site *p;
- char insnbuf[MAX_PATCH_LEN];
+ unsigned long selftest = (unsigned long)&int3_selftest_asm;
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
- if (noreplace_paravirt)
- return;
+ OPTIMIZER_HIDE_VAR(selftest);
- for (p = start; p < end; p++) {
- unsigned int used;
+ if (!regs || user_mode(regs))
+ return NOTIFY_DONE;
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insnbuf, p->instr, p->len);
- used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
- (unsigned long)p->instr, p->len);
+ if (val != DIE_INT3)
+ return NOTIFY_DONE;
- BUG_ON(used > p->len);
+ if (regs->ip - INT3_INSN_SIZE != selftest)
+ return NOTIFY_DONE;
- /* Pad the rest with nops */
- add_nops(insnbuf + used, p->len - used);
- text_poke_early(p->instr, insnbuf, p->len);
- }
+ int3_emulate_call(regs, (unsigned long)&int3_selftest_callee);
+ return NOTIFY_STOP;
+}
+
+/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
+static noinline void __init int3_selftest(void)
+{
+ static __initdata struct notifier_block int3_exception_nb = {
+ .notifier_call = int3_exception_notify,
+ .priority = INT_MAX-1, /* last */
+ };
+ unsigned int val = 0;
+
+ BUG_ON(register_die_notifier(&int3_exception_nb));
+
+ /*
+ * Basically: int3_selftest_callee(&val); but really complicated :-)
+ */
+ int3_selftest_asm(&val);
+
+ BUG_ON(val != 0x1234);
+
+ unregister_die_notifier(&int3_exception_nb);
+}
+
+static __initdata int __alt_reloc_selftest_addr;
+
+extern void __init __alt_reloc_selftest(void *arg);
+__visible noinline void __init __alt_reloc_selftest(void *arg)
+{
+ WARN_ON(arg != &__alt_reloc_selftest_addr);
+}
+
+static noinline void __init alt_reloc_selftest(void)
+{
+ /*
+ * Tests text_poke_apply_relocation().
+ *
+ * This has a relative immediate (CALL) in a place other than the first
+ * instruction and additionally on x86_64 we get a RIP-relative LEA:
+ *
+ * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
+ * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
+ *
+ * Getting this wrong will either crash and burn or tickle the WARN
+ * above.
+ */
+ asm_inline volatile (
+ ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+ : ASM_CALL_CONSTRAINT
+ : [mem] "m" (__alt_reloc_selftest_addr)
+ : _ASM_ARG1
+ );
}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
void __init alternative_instructions(void)
{
- /* The patching is not fully atomic, so try to avoid local interruptions
- that might execute the to be patched code.
- Other CPUs are not running. */
+ u64 ibt;
+
+ int3_selftest();
+
+ /*
+ * The patching is not fully atomic, so try to avoid local
+ * interruptions that might execute the to be patched code.
+ * Other CPUs are not running.
+ */
stop_nmi();
/*
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
- * Ignoring it is worse than a unlikely patching race.
+ * Ignoring it is worse than an unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check the others follow quickly, so we don't
* expect a machine check to cause undue problems during code
* patching.
*/
+ /*
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
+ */
+ paravirt_set_cap();
+
+ /* Keep CET-IBT disabled until caller/callee are patched */
+ ibt = ibt_save(/*disable*/ true);
+
+ __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
+ __cfi_sites, __cfi_sites_end, true);
+ cfi_debug = false;
+
+ /*
+ * Rewrite the retpolines, must be done before alternatives since
+ * those can rewrite the retpoline thunks.
+ */
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end);
+ apply_returns(__return_sites, __return_sites_end);
+
+ its_fini_core();
+
+ /*
+ * Adjust all CALL instructions to point to func()-10, including
+ * those in .altinstr_replacement.
+ */
+ callthunks_patch_builtin_calls();
+
apply_alternatives(__alt_instructions, __alt_instructions_end);
- /* switch to patch-once-at-boottime-only mode and free the
- * tables in case we know the number of CPUs will never ever
- * change */
-#ifdef CONFIG_HOTPLUG_CPU
- if (num_possible_cpus() < 2)
- smp_alt_once = 1;
-#endif
+ /*
+ * Seal all functions that do not have their address taken.
+ */
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+
+ ibt_restore(ibt);
#ifdef CONFIG_SMP
- if (smp_alt_once) {
- if (1 == num_possible_cpus()) {
- printk(KERN_INFO "SMP alternatives: switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-
- alternatives_smp_unlock(__smp_locks, __smp_locks_end,
- _text, _etext);
- }
- } else {
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
-
- /* Only switch to UP mode if we don't immediately boot others */
- if (num_present_cpus() == 1 || setup_max_cpus <= 1)
- alternatives_smp_switch(0);
}
-#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
- if (smp_alt_once)
+ if (!uniproc_patched || num_possible_cpus() == 1) {
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+ }
+#endif
restart_nmi();
+ alternatives_patched = 1;
+
+ alt_reloc_selftest();
}
/**
@@ -518,19 +2421,169 @@ void __init alternative_instructions(void)
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
- * instructions. And on the local CPU you need to be protected again NMI or MCE
- * handlers seeing an inconsistent instruction while you patch.
+ * instructions. And on the local CPU you need to be protected against NMI or
+ * MCE handlers seeing an inconsistent instruction while you patch.
+ */
+void __init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_NX) &&
+ is_module_text_address((unsigned long)addr)) {
+ /*
+ * Modules text is marked initially as non-executable, so the
+ * code cannot be running and speculative code-fetches are
+ * prevented. Just change the code.
+ */
+ memcpy(addr, opcode, len);
+ } else {
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ sync_core();
+ local_irq_restore(flags);
+
+ /*
+ * Could also do a CLFLUSH here to speed up CPU recovery; but
+ * that causes hangs on some VIA CPUs.
+ */
+ }
+}
+
+__ro_after_init struct mm_struct *text_poke_mm;
+__ro_after_init unsigned long text_poke_mm_addr;
+
+/*
+ * Text poking creates and uses a mapping in the lower half of the
+ * address space. Relax LASS enforcement when accessing the poking
+ * address.
+ *
+ * objtool enforces a strict policy of "no function calls within AC=1
+ * regions". Adhere to the policy by using inline versions of
+ * memcpy()/memset() that will never result in a function call.
*/
-static void *__init_or_module text_poke_early(void *addr, const void *opcode,
- size_t len)
+
+static void text_poke_memcpy(void *dst, const void *src, size_t len)
+{
+ lass_stac();
+ __inline_memcpy(dst, src, len);
+ lass_clac();
+}
+
+static void text_poke_memset(void *dst, const void *src, size_t len)
+{
+ int c = *(const int *)src;
+
+ lass_stac();
+ __inline_memset(dst, c, len);
+ lass_clac();
+}
+
+typedef void text_poke_f(void *dst, const void *src, size_t len);
+
+static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
+ bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
+ struct page *pages[2] = {NULL};
+ struct mm_struct *prev_mm;
unsigned long flags;
+ pte_t pte, *ptep;
+ spinlock_t *ptl;
+ pgprot_t pgprot;
+
+ /*
+ * While boot memory allocator is running we cannot use struct pages as
+ * they are not yet initialized. There is no way to recover.
+ */
+ BUG_ON(!after_bootmem);
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ if (cross_page_boundary)
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ if (cross_page_boundary)
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ /*
+ * If something went wrong, crash and burn since recovery paths are not
+ * implemented.
+ */
+ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
+
+ /*
+ * Map the page without the global bit, as TLB flushing is done with
+ * flush_tlb_mm_range(), which is intended for non-global PTEs.
+ */
+ pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
+
+ /*
+ * The lock is not really needed, but using get_locked_pte() lets us
+ * avoid open-coding the PTE lookup.
+ */
+ ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
+
+ /*
+ * This must not fail; preallocated in poking_init().
+ */
+ VM_BUG_ON(!ptep);
+
local_irq_save(flags);
- memcpy(addr, opcode, len);
- sync_core();
+
+ pte = mk_pte(pages[0], pgprot);
+ set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte);
+
+ if (cross_page_boundary) {
+ pte = mk_pte(pages[1], pgprot);
+ set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte);
+ }
+
+ /*
+ * Loading the temporary mm behaves as a compiler barrier, which
+ * guarantees that the PTE will be set at the time memcpy() is done.
+ */
+ prev_mm = use_temporary_mm(text_poke_mm);
+
+ kasan_disable_current();
+ func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len);
+ kasan_enable_current();
+
+ /*
+ * Ensure that the PTE is only cleared after the instructions of memcpy
+ * were issued by using a compiler barrier.
+ */
+ barrier();
+
+ pte_clear(text_poke_mm, text_poke_mm_addr, ptep);
+ if (cross_page_boundary)
+ pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1);
+
+ /*
+ * Loading the previous page-table hierarchy requires a serializing
+ * instruction that already allows the core to see the updated version.
+ * Xen-PV is assumed to serialize execution in a similar manner.
+ */
+ unuse_temporary_mm(prev_mm);
+
+ /*
+ * Flushing the TLB might involve IPIs, which would require enabled
+ * IRQs, but none are needed here since the mm is not in use at this point.
+ */
+ flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr +
+ (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
+ PAGE_SHIFT, false);
+
+ if (func == text_poke_memcpy) {
+ /*
+ * If the text does not match what we just wrote then something is
+ * fundamentally screwy; there's nothing we can really do about that.
+ */
+ BUG_ON(memcmp(addr, src, len));
+ }
+
local_irq_restore(flags);
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
+ pte_unmap_unlock(ptep, ptl);
return addr;
}
@@ -545,98 +2598,566 @@ static void *__init_or_module text_poke_early(void *addr, const void *opcode,
* in a way that permits an atomic write. It also makes sure we fit on a single
* page.
*
- * Note: Must be called under text_mutex.
+ * Note that the caller must ensure that if the modified code is part of a
+ * module, the module would not be removed during poking. This can be achieved
+ * by registering a module notifier, and ordering module removal and patching
+ * through a mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
{
- unsigned long flags;
- char *vaddr;
- struct page *pages[2];
- int i;
+ lockdep_assert_held(&text_mutex);
- if (!core_kernel_text((unsigned long)addr)) {
- pages[0] = vmalloc_to_page(addr);
- pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
- } else {
- pages[0] = virt_to_page(addr);
- WARN_ON(!PageReserved(pages[0]));
- pages[1] = virt_to_page(addr + PAGE_SIZE);
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
+
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ * despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+ return __text_poke(text_poke_memcpy, addr, opcode, len);
+}
+
+void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
+ bool core_ok)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
+ return NULL;
+
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
+ patched += s;
}
- BUG_ON(!pages[0]);
- local_irq_save(flags);
- set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
- if (pages[1])
- set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
- vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
- memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
- clear_fixmap(FIX_TEXT_POKE0);
- if (pages[1])
- clear_fixmap(FIX_TEXT_POKE1);
- local_flush_tlb();
- sync_core();
- /* Could also do a CLFLUSH here to speed up CPU recovery; but
- that causes hangs on some VIA CPUs. */
- for (i = 0; i < len; i++)
- BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
- local_irq_restore(flags);
return addr;
}
-/*
- * Cross-modifying kernel text with stop_machine().
- * This code originally comes from immediate value.
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing functions
+ * pointers are live.
*/
-static atomic_t stop_machine_first;
-static int wrote_text;
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ mutex_lock(&text_mutex);
+ addr = text_poke_copy_locked(addr, opcode, len, false);
+ mutex_unlock(&text_mutex);
+ return addr;
+}
+
+/**
+ * text_poke_set - memset into (an unused part of) RX memory
+ * @addr: address to modify
+ * @c: the byte to fill the area with
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * This is useful to overwrite unused regions of RX memory with illegal
+ * instructions.
+ */
+void *text_poke_set(void *addr, int c, size_t len)
+{
+ unsigned long start = (unsigned long)addr;
+ size_t patched = 0;
+
+ if (WARN_ON_ONCE(core_kernel_text(start)))
+ return NULL;
+
+ mutex_lock(&text_mutex);
+ while (patched < len) {
+ unsigned long ptr = start + patched;
+ size_t s;
+
+ s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+ __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
+ patched += s;
+ }
+ mutex_unlock(&text_mutex);
+ return addr;
+}
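+
+/*
+ * Example (illustrative): text_poke_set(buf, 0xcc, size) fills a stale
+ * JIT region with INT3 so any errant jump into it traps.
+ */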
+
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+void smp_text_poke_sync_each_cpu(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
-struct text_poke_params {
- void *addr;
- const void *opcode;
- size_t len;
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
+struct smp_text_poke_loc {
+ /* addr := _stext + rel_addr */
+ s32 rel_addr;
+ s32 disp;
+ u8 len;
+ u8 opcode;
+ const u8 text[TEXT_POKE_MAX_OPCODE_SIZE];
+ /* see smp_text_poke_batch_finish() */
+ u8 old;
};
-static int __kprobes stop_machine_text_poke(void *data)
+#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc))
+
+static struct smp_text_poke_array {
+ struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX];
+ int nr_entries;
+} text_poke_array;
+
+static DEFINE_PER_CPU(atomic_t, text_poke_array_refs);
+
+/*
+ * These four __always_inline annotations imply noinstr, necessary
+ * due to smp_text_poke_int3_handler() being noinstr:
+ */
+
+static __always_inline bool try_get_text_poke_array(void)
+{
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
+
+ if (!raw_atomic_inc_not_zero(refs))
+ return false;
+
+ return true;
+}
+
+static __always_inline void put_text_poke_array(void)
+{
+ atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);
+
+ smp_mb__before_atomic();
+ raw_atomic_dec(refs);
+}
+
+static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl)
+{
+ return _stext + tpl->rel_addr;
+}
+
+static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b)
+{
+ if (tpl_a < text_poke_addr(tpl_b))
+ return -1;
+ if (tpl_a > text_poke_addr(tpl_b))
+ return 1;
+ return 0;
+}
+
+noinstr int smp_text_poke_int3_handler(struct pt_regs *regs)
{
- struct text_poke_params *tpp = data;
+ struct smp_text_poke_loc *tpl;
+ int ret = 0;
+ void *ip;
+
+ if (user_mode(regs))
+ return 0;
- if (atomic_dec_and_test(&stop_machine_first)) {
- text_poke(tpp->addr, tpp->opcode, tpp->len);
- smp_wmb(); /* Make sure other cpus see that this has run */
- wrote_text = 1;
+ /*
+ * Having observed our INT3 instruction, we now must observe
+ * text_poke_array with non-zero refcount:
+ *
+ * text_poke_array_refs = 1 INT3
+ * WMB RMB
+ * write INT3 if (text_poke_array_refs != 0)
+ */
+ smp_rmb();
+
+ if (!try_get_text_poke_array())
+ return 0;
+
+ /*
+ * Discount the INT3. See smp_text_poke_batch_finish().
+ */
+ ip = (void *) regs->ip - INT3_INSN_SIZE;
+
+ /*
+ * Skip the binary search if there is a single member in the vector.
+ */
+ if (unlikely(text_poke_array.nr_entries > 1)) {
+ tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries,
+ sizeof(struct smp_text_poke_loc),
+ patch_cmp);
+ if (!tpl)
+ goto out_put;
} else {
- while (!wrote_text)
- cpu_relax();
- smp_mb(); /* Load wrote_text before following execution */
+ tpl = text_poke_array.vec;
+ if (text_poke_addr(tpl) != ip)
+ goto out_put;
}
- flush_icache_range((unsigned long)tpp->addr,
- (unsigned long)tpp->addr + tpp->len);
- return 0;
+ ip += tpl->len;
+
+ switch (tpl->opcode) {
+ case INT3_INSN_OPCODE:
+ /*
+ * Someone poked an explicit INT3, they'll want to handle it,
+ * do not consume.
+ */
+ goto out_put;
+
+ case RET_INSN_OPCODE:
+ int3_emulate_ret(regs);
+ break;
+
+ case CALL_INSN_OPCODE:
+ int3_emulate_call(regs, (long)ip + tpl->disp);
+ break;
+
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ int3_emulate_jmp(regs, (long)ip + tpl->disp);
+ break;
+
+ case 0x70 ... 0x7f: /* Jcc */
+ int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp);
+ break;
+
+ default:
+ BUG();
+ }
+
+ ret = 1;
+
+out_put:
+ put_text_poke_array();
+ return ret;
}
/**
- * text_poke_smp - Update instructions on a live kernel on SMP
- * @addr: address to modify
- * @opcode: source of the copy
- * @len: length to copy
+ * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP
+ *
+ * Input state:
+ * text_poke_array.vec: vector of instructions to patch
+ * text_poke_array.nr_entries: number of entries in the vector
*
- * Modify multi-byte instruction by using stop_machine() on SMP. This allows
- * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
- * should be allowed, since stop_machine() does _not_ protect code against
- * NMI and MCE.
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
*
- * Note: Must be called under get_online_cpus() and text_mutex.
+ * The way it is done:
+ * - For each entry in the vector:
+ * - add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - For each entry in the vector:
+ * - update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - For each entry in the vector:
+ * - replace the first byte (INT3) by the first byte of the
+ * replacing opcode
+ * - SMP sync all CPUs
*/
-void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+void smp_text_poke_batch_finish(void)
{
- struct text_poke_params tpp;
+ unsigned char int3 = INT3_INSN_OPCODE;
+ unsigned int i;
+ int do_sync;
- tpp.addr = addr;
- tpp.opcode = opcode;
- tpp.len = len;
- atomic_set(&stop_machine_first, 1);
- wrote_text = 0;
- stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
- return addr;
+ if (!text_poke_array.nr_entries)
+ return;
+
+ lockdep_assert_held(&text_mutex);
+
+ /*
+ * Corresponds to the implicit memory barrier in try_get_text_poke_array() to
+ * ensure reading a non-zero refcount provides up to date text_poke_array data.
+ */
+ for_each_possible_cpu(i)
+ atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1);
+
+ /*
+ * Function tracing can enable thousands of places that need to be
+ * updated. This can take quite some time, and with full kernel debugging
+ * enabled, this could cause the softlockup watchdog to trigger.
+ * This function gets called every 256 entries added to be patched.
+ * Call cond_resched() here to make sure that other tasks can get scheduled
+ * while processing all the functions being patched.
+ */
+ cond_resched();
+
+ /*
+ * Corresponding read barrier in INT3 notifier for making sure the
+ * text_poke_array.nr_entries and handler are correctly ordered wrt. patching.
+ */
+ smp_wmb();
+
+ /*
+ * First step: add a INT3 trap to the address that will be patched.
+ */
+ for (i = 0; i < text_poke_array.nr_entries; i++) {
+ text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]);
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE);
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Second step: update all but the first byte of the patched range.
+ */
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, };
+ u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = text_poke_array.vec[i].text;
+ int len = text_poke_array.vec[i].len;
+
+ if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+
+ if (len == 6) {
+ _new[0] = 0x0f;
+ memcpy(_new + 1, new, 5);
+ new = _new;
+ }
+
+ text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
+
+ do_sync++;
+ }
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len);
+ }
+
+ if (do_sync) {
+ /*
+ * According to Intel, this core syncing is very likely
+ * not necessary and we'd be safe even without it. But
+ * better safe than sorry (plus there's not only Intel).
+ */
+ smp_text_poke_sync_each_cpu();
+ }
+
+ /*
+ * Third step: replace the first byte (INT3) by the first byte of the
+ * replacing opcode.
+ */
+ for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) {
+ u8 byte = text_poke_array.vec[i].text[0];
+
+ if (text_poke_array.vec[i].len == 6)
+ byte = 0x0f;
+
+ if (byte == INT3_INSN_OPCODE)
+ continue;
+
+ text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE);
+ do_sync++;
+ }
+
+ if (do_sync)
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Remove and wait for refs to be zero.
+ *
+ * Notably, if after step-3 above the INT3 got removed, then the
+ * smp_text_poke_sync_each_cpu() will have serialized against any running INT3
+ * handlers and the below spin-wait will not happen.
+ *
+ * IOW. unless the replacement instruction is INT3, this case goes
+ * unused.
+ */
+ for_each_possible_cpu(i) {
+ atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i);
+
+ if (unlikely(!atomic_dec_and_test(refs)))
+ atomic_cond_read_acquire(refs, !VAL);
+ }
+
+ /* They are all completed: */
+ text_poke_array.nr_entries = 0;
+}
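/*
 * For reference, a minimal sketch of the INT3-exception side that the
 * per-CPU refcount handling above pairs with. try_get_text_poke_array()
 * and text_poke_array_refs are the names used in the comments above;
 * the linear lookup and the emulation stub below are illustrative
 * assumptions, not the exact upstream implementation.
 */
static __always_inline bool try_get_text_poke_array(void)
{
	atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);

	if (!atomic_inc_not_zero(refs))
		return false;		/* no batch currently in flight */

	/* Implies a full barrier: text_poke_array is stable from here on. */
	return true;
}

static __always_inline void put_text_poke_array(void)
{
	atomic_t *refs = this_cpu_ptr(&text_poke_array_refs);

	smp_mb__before_atomic();
	atomic_dec(refs);
}

/*
 * Called from the INT3 handler: emulate the target instruction if the
 * trapping address is one of the queued patch sites.
 */
static bool smp_text_poke_int3_sketch(struct pt_regs *regs)
{
	void *ip = (void *)regs->ip - INT3_INSN_SIZE;
	bool handled = false;
	unsigned int i;

	if (user_mode(regs) || !try_get_text_poke_array())
		return false;

	for (i = 0; i < text_poke_array.nr_entries; i++) {
		if (text_poke_addr(&text_poke_array.vec[i]) == ip) {
			/* ...emulate vec[i].opcode/disp against regs... */
			handled = true;
			break;
		}
	}

	put_text_poke_array();
	return handled;
}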
+
+static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ struct smp_text_poke_loc *tpl;
+ struct insn insn;
+ int ret, i = 0;
+
+ tpl = &text_poke_array.vec[text_poke_array.nr_entries++];
+
+ if (len == 6)
+ i = 1;
+ memcpy((void *)tpl->text, opcode+i, len-i);
+ if (!emulate)
+ emulate = opcode;
+
+ ret = insn_decode_kernel(&insn, emulate);
+ BUG_ON(ret < 0);
+
+ tpl->rel_addr = addr - (void *)_stext;
+ tpl->len = len;
+ tpl->opcode = insn.opcode.bytes[0];
+
+ if (is_jcc32(&insn)) {
+ /*
+ * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+ */
+ tpl->opcode = insn.opcode.bytes[1] - 0x10;
+ }
+
+ switch (tpl->opcode) {
+ case RET_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ /*
+ * Control flow instructions without implied execution of the
+ * next instruction can be padded with INT3.
+ */
+ for (i = insn.length; i < len; i++)
+ BUG_ON(tpl->text[i] != INT3_INSN_OPCODE);
+ break;
+
+ default:
+ BUG_ON(len != insn.length);
+ }
+
+ switch (tpl->opcode) {
+ case INT3_INSN_OPCODE:
+ case RET_INSN_OPCODE:
+ break;
+
+ case CALL_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ case 0x70 ... 0x7f: /* Jcc */
+ tpl->disp = insn.immediate.value;
+ break;
+
+ default: /* assume NOP */
+ switch (len) {
+ case 2: /* NOP2 -- emulate as JMP8+0 */
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
+ tpl->opcode = JMP8_INSN_OPCODE;
+ tpl->disp = 0;
+ break;
+
+ case 5: /* NOP5 -- emulate as JMP32+0 */
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
+ tpl->opcode = JMP32_INSN_OPCODE;
+ tpl->disp = 0;
+ break;
+
+ default: /* unknown instruction */
+ BUG();
+ }
+ break;
+ }
}
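/*
 * Why the "insn.opcode.bytes[1] - 0x10" mapping above works: the
 * 32-bit-displacement conditional jumps encode as 0x0F 0x80..0x8F while
 * their 8-bit-displacement twins are 0x70..0x7F, so subtracting 0x10
 * from the second opcode byte yields the Jcc.d8 opcode, and ->len tells
 * the two forms apart. Likewise, for the 6-byte Jcc.d32 form only the
 * trailing 5 bytes are stored in ->text and the leading 0x0F is
 * re-materialized at patch time. A stand-alone check (plain user-space
 * C, opcode values from the Intel SDM):
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint8_t je_d32[2] = { 0x0f, 0x84 };	/* JE rel32 */
	uint8_t je_d8     = 0x74;		/* JE rel8  */

	assert(je_d32[1] - 0x10 == je_d8);

	/*
	 * NOP2/NOP5 are queued as JMP8+0/JMP32+0 above: a jump with a
	 * zero displacement falls through to the next instruction,
	 * which is behaviorally a NOP.
	 */
	return 0;
}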
+/*
+ * We rely hard on text_poke_array.vec being address-ordered; ensure this
+ * is so by flushing the queue early if needed.
+ */
+static bool text_poke_addr_ordered(void *addr)
+{
+ WARN_ON_ONCE(!addr);
+
+ if (!text_poke_array.nr_entries)
+ return true;
+
+ /*
+ * If the last current entry's address is higher than the
+ * new entry's address we'd like to add, then ordering
+ * is violated and we must first flush all pending patching
+ * requests:
+ */
+ if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr)
+ return false;
+
+ return true;
+}
+
+/**
+ * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Add a new instruction to the current queue of to-be-patched instructions
+ * the kernel maintains. The patching request will not be executed immediately,
+ * but becomes part of an array of patching requests, optimized for batched
+ * execution. All pending patching requests will be executed on the next
+ * smp_text_poke_batch_finish() call.
+ */
+void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr))
+ smp_text_poke_batch_finish();
+ __smp_text_poke_batch_add(addr, opcode, len, emulate);
+}
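/*
 * Illustrative use of the batching API above: queue a series of patch
 * sites, then flush once. "struct patch_site" and the caller are
 * hypothetical; real users (e.g. the jump-label machinery) follow the
 * same pattern and hold text_mutex around it.
 */
struct patch_site {
	void		*addr;
	const u8	*insn;
	size_t		len;
};

static void patch_sites_sketch(struct patch_site *site, int nr)
{
	int i;

	lockdep_assert_held(&text_mutex);

	for (i = 0; i < nr; i++) {
		/*
		 * Only queues; flushes early by itself if the array
		 * fills up or a site arrives out of address order.
		 */
		smp_text_poke_batch_add(site[i].addr, site[i].insn,
					site[i].len, NULL);
	}
	smp_text_poke_batch_finish();
}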
+
+/**
+ * smp_text_poke_single() -- update instruction on live kernel on SMP immediately
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @emulate: instruction to be emulated
+ *
+ * Update a single instruction immediately: the request is added to the
+ * kernel's queue of to-be-patched instructions and the queue is flushed
+ * right away. No memory needs to be allocated by the caller, and the
+ * patch takes effect before this function returns.
+ */
+void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ smp_text_poke_batch_add(addr, opcode, len, emulate);
+ smp_text_poke_batch_finish();
+}
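/*
 * Illustrative single-site use of the helper above: repoint one 5-byte
 * CALL immediately. Both names below are hypothetical; new_call holds a
 * pre-built CALL instruction.
 */
static void repoint_call_sketch(void *call_site, const u8 *new_call)
{
	/* Queue the one site and flush right away. */
	smp_text_poke_single(call_site, new_call, CALL_INSN_SIZE, NULL);
}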
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 0f7f130caa67..e8000a56732e 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Dynamic DMA mapping support for AMD Hammer.
*
@@ -5,10 +6,9 @@
* This allows using PCI devices that only support 32bit addresses on systems
* with more than 4GB.
*
- * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification.
+ * See Documentation/core-api/dma-api-howto.rst for the interface specification.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU General Public License v2 only.
*/
#include <linux/types.h>
@@ -17,29 +17,29 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
-#include <linux/module.h>
#include <linux/topology.h>
#include <linux/interrupt.h>
#include <linux/bitmap.h>
#include <linux/kdebug.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/io.h>
#include <linux/gfp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
#include <asm/mtrr.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/cacheflush.h>
-#include <asm/swiotlb.h>
+#include <asm/set_memory.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd/nb.h>
#include <asm/x86_init.h>
static unsigned long iommu_bus_base; /* GART remapping area (physical) */
@@ -48,14 +48,12 @@ static unsigned long iommu_pages; /* .. and in pages */
static u32 *iommu_gatt_base; /* Remapping table */
-static dma_addr_t bad_dma_addr;
-
/*
* If this is disabled the IOMMU will use an optimized flushing strategy
* of only flushing when a mapping is reused. With it true the GART is
* flushed for every mapping. Problem is that doing the lazy flush seems
* to trigger bugs with some popular PCI cards, in particular 3ware (but
- * has been also also seen with Qlogic at least).
+ * has been also seen with Qlogic at least).
*/
static int iommu_fullflush = 1;
@@ -72,14 +70,15 @@ static u32 gart_unmapped_entry;
(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
-#define EMERGENCY_PAGES 32 /* = 128KB */
-
#ifdef CONFIG_AGP
#define AGPEXTERN extern
#else
#define AGPEXTERN
#endif
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
/* backdoor interface to AGP driver */
AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
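/*
 * Stand-alone illustration of the GPTE format above: physical address
 * bits 12..31 stay in place and bits 32..39 are folded into PTE bits
 * 4..11, which is exactly why the GART can only target the first 1TB
 * of physical memory (GART_MAX_PHYS_ADDR). Compiles as plain user-space
 * C; GPTE_VALID/GPTE_COHERENT assumed to be 1 and 2 as in <asm/gart.h>.
 */
#include <assert.h>
#include <stdint.h>

#define GPTE_VALID	1u
#define GPTE_COHERENT	2u
#define ENC(x)	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define DEC(x)	(((x) & 0xfffff000) | (((uint64_t)(x) & 0xff0) << 28))

int main(void)
{
	uint64_t phys = 0x7654321000ULL;	/* page aligned, < 1TB */
	uint32_t pte  = ENC(phys);

	assert(DEC(pte) == phys);		/* round-trips below 1TB */
	return 0;
}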
@@ -96,8 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
+ boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT);
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
@@ -142,7 +140,7 @@ static void flush_gart(void)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
- k8_flush_garts();
+ amd_flush_garts();
need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -150,9 +148,6 @@ static void flush_gart(void)
#ifdef CONFIG_IOMMU_LEAK
/* Debugging aid for drivers that don't free their IOMMU tables */
-static int leak_trace;
-static int iommu_leak_pages = 20;
-
static void dump_leak(void)
{
static int dump;
@@ -161,7 +156,7 @@ static void dump_leak(void)
return;
dump = 1;
- show_stack(NULL, NULL);
+ show_stack(NULL, NULL, KERN_ERR);
debug_dma_dump_mappings(NULL);
}
#endif
@@ -179,14 +174,6 @@ static void iommu_full(struct device *dev, size_t size, int dir)
*/
dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
-
- if (size > PAGE_SIZE*EMERGENCY_PAGES) {
- if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
- panic("PCI-DMA: Memory would be corrupted\n");
- if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
- panic(KERN_ERR
- "PCI-DMA: Random memory would be DMAed\n");
- }
#ifdef CONFIG_IOMMU_LEAK
dump_leak();
#endif
@@ -195,13 +182,13 @@ static void iommu_full(struct device *dev, size_t size, int dir)
static inline int
need_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return force_iommu || !dma_capable(dev, addr, size);
+ return force_iommu || !dma_capable(dev, addr, size, true);
}
static inline int
nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
{
- return !dma_capable(dev, addr, size);
+ return !dma_capable(dev, addr, size, true);
}
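/*
 * For context: dma_capable() (from <linux/dma-direct.h>) decides
 * whether the device can reach [addr, addr + size) without remapping,
 * essentially (paraphrased, see the header for the exact test):
 *
 *	dma_addr_t end = addr + size - 1;
 *	return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
 *
 * The new fourth argument ("true" above) declares the range to be RAM,
 * which enables an extra low-memory sanity check on configurations with
 * a 32-bit dma_addr_t. So need_iommu() reads: route through the GART
 * whenever the device cannot address the buffer directly, or when
 * force_iommu overrides.
 */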
/* Map a single continuous physical area into the IOMMU.
@@ -211,16 +198,20 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
- unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
+ unsigned long iommu_page;
int i;
+ if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+ return DMA_MAPPING_ERROR;
+
+ iommu_page = alloc_iommu(dev, npages, align_mask);
if (iommu_page == -1) {
if (!nonforced_iommu(dev, phys_mem, size))
return phys_mem;
if (panic_on_overflow)
panic("dma_map_area overflow %lu bytes\n", size);
iommu_full(dev, size, dir);
- return bad_dma_addr;
+ return DMA_MAPPING_ERROR;
}
for (i = 0; i < npages; i++) {
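/*
 * With the driver-private bad_dma_addr sentinel gone, failures are
 * reported via the generic DMA_MAPPING_ERROR and the ->mapping_error
 * callback disappears further below. Callers check the standard way
 * (illustrative caller, hypothetical names):
 */
static int example_map_sketch(struct device *dev, struct page *page,
			      size_t len)
{
	dma_addr_t addr = dma_map_page(dev, page, 0, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, addr))
		return -ENOMEM;		/* no driver-specific sentinel */

	/* ... program the device with addr, later dma_unmap_page() ... */
	return 0;
}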
@@ -231,16 +222,14 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
}
/* Map a single area into the IOMMU */
-static dma_addr_t gart_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
+static dma_addr_t gart_map_phys(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
{
unsigned long bus;
- phys_addr_t paddr = page_to_phys(page) + offset;
- if (!dev)
- dev = &x86_dma_fallback_dev;
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return DMA_MAPPING_ERROR;
if (!need_iommu(dev, paddr, size))
return paddr;
@@ -254,15 +243,23 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page,
/*
* Free a DMA mapping.
*/
-static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+static void gart_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
unsigned long iommu_page;
int npages;
int i;
- if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+ if (WARN_ON_ONCE(dma_addr == DMA_MAPPING_ERROR))
+ return;
+
+ /*
+ * This driver will not always use a GART mapping, but might have
+ * created a direct mapping instead. If that is the case there is
+ * nothing to unmap here.
+ */
+ if (dma_addr < iommu_bus_base ||
dma_addr >= iommu_bus_base + iommu_size)
return;
@@ -278,7 +275,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
* Wrapper for pci_unmap_single working with scatterlists.
*/
static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s;
int i;
@@ -286,7 +283,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
- gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
+ gart_unmap_phys(dev, s->dma_address, s->dma_length, dir, 0);
}
}
@@ -306,9 +303,9 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir, 0);
- if (addr == bad_dma_addr) {
+ if (addr == DMA_MAPPING_ERROR) {
if (i > 0)
- gart_unmap_sg(dev, sg, i, dir, NULL);
+ gart_unmap_sg(dev, sg, i, dir, 0);
nents = 0;
sg[0].dma_length = 0;
break;
@@ -333,7 +330,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int i;
if (iommu_start == -1)
- return -1;
+ return -ENOMEM;
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -379,19 +376,16 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
* Merge chunks that have page aligned sizes into a continuous mapping.
*/
static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, struct dma_attrs *attrs)
+ enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s, *ps, *start_sg, *sgmap;
- int need = 0, nextneed, i, out, start;
+ int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
if (nents == 0)
- return 0;
-
- if (!dev)
- dev = &x86_dma_fallback_dev;
+ return -EINVAL;
out = 0;
start = 0;
@@ -419,8 +413,9 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
- if (dma_map_cont(dev, start_sg, i - start,
- sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
@@ -437,7 +432,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
- if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
flush_gart();
@@ -449,7 +445,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
error:
flush_gart();
- gart_unmap_sg(dev, sg, out, dir, NULL);
+ gart_unmap_sg(dev, sg, out, dir, 0);
/* When it was forced or merged try again in a dumb way */
if (force_iommu || iommu_merge) {
@@ -461,54 +457,39 @@ error:
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
- for_each_sg(sg, s, nents, i)
- s->dma_address = bad_dma_addr;
- return 0;
+ return ret;
}
/* allocate and map a coherent mapping */
static void *
gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
- gfp_t flag)
+ gfp_t flag, unsigned long attrs)
{
- dma_addr_t paddr;
- unsigned long align_mask;
- struct page *page;
-
- if (force_iommu && !(flag & GFP_DMA)) {
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- page = alloc_pages(flag | __GFP_ZERO, get_order(size));
- if (!page)
- return NULL;
-
- align_mask = (1UL << get_order(size)) - 1;
- paddr = dma_map_area(dev, page_to_phys(page), size,
- DMA_BIDIRECTIONAL, align_mask);
-
- flush_gart();
- if (paddr != bad_dma_addr) {
- *dma_addr = paddr;
- return page_address(page);
- }
- __free_pages(page, get_order(size));
- } else
- return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+ void *vaddr;
+
+ vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs);
+ if (!vaddr ||
+ !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24))
+ return vaddr;
+ *dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
+ DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1);
+ flush_gart();
+ if (unlikely(*dma_addr == DMA_MAPPING_ERROR))
+ goto out_free;
+ return vaddr;
+out_free:
+ dma_direct_free(dev, size, vaddr, *dma_addr, attrs);
return NULL;
}
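/*
 * Note on the align_mask passed above: dma_alloc_coherent() guarantees
 * a DMA address aligned to the smallest power-of-two order that covers
 * the requested size, so the GART mapping must be placed with the same
 * alignment as the backing allocation. Worked example, assuming 4K
 * pages:
 *
 *	size = 16K  =>  get_order(16K) = 2
 *	align_mask  =  (1UL << 2) - 1  =  0x3 pages
 *
 * i.e. alloc_iommu() places the 4-page remapping on a 4-page boundary.
 */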
/* free a coherent mapping */
static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr, unsigned long attrs)
{
- gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- return (dma_addr == bad_dma_addr);
+ gart_unmap_phys(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
+ dma_direct_free(dev, size, vaddr, dma_addr, attrs);
}
static int no_agp;
@@ -524,13 +505,12 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
}
a = aper + iommu_size;
- iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
+ iommu_size -= round_up(a, PMD_SIZE) - a;
if (iommu_size < 64*1024*1024) {
- pr_warning(
- "PCI-DMA: Warning: Small IOMMU %luMB."
+ pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."
" Consider increasing the AGP aperture in BIOS\n",
- iommu_size >> 20);
+ iommu_size >> 20);
}
return iommu_size;
@@ -560,14 +540,17 @@ static void enable_gart_translations(void)
{
int i;
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
enable_gart_translation(dev, __pa(agp_gatt_table));
}
/* Flush the GART-TLB to remove stale entries */
- k8_flush_garts();
+ amd_flush_garts();
}
/*
@@ -585,72 +568,66 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
aperture_alloc = aper_alloc;
}
-static void gart_fixup_northbridges(struct sys_device *dev)
+static void gart_fixup_northbridges(void)
{
int i;
if (!fix_up_north_bridges)
return;
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < num_k8_northbridges; i++) {
- struct pci_dev *dev = k8_northbridges[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
/*
* Don't enable translations just yet. That is the next
* step. Restore the pre-suspend aperture settings.
*/
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+ gart_set_size_and_enable(dev, aperture_order);
pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
}
}
-static int gart_resume(struct sys_device *dev)
+static void gart_resume(void *data)
{
pr_info("PCI-DMA: Resuming GART IOMMU\n");
- gart_fixup_northbridges(dev);
+ gart_fixup_northbridges();
enable_gart_translations();
-
- return 0;
-}
-
-static int gart_suspend(struct sys_device *dev, pm_message_t state)
-{
- return 0;
}
-static struct sysdev_class gart_sysdev_class = {
- .name = "gart",
- .suspend = gart_suspend,
+static const struct syscore_ops gart_syscore_ops = {
.resume = gart_resume,
};
-static struct sys_device device_gart = {
- .cls = &gart_sysdev_class,
+static struct syscore gart_syscore = {
+ .ops = &gart_syscore_ops,
};
/*
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
*/
-static __init int init_k8_gatt(struct agp_kern_info *info)
+static __init int init_amd_gatt(struct agp_kern_info *info)
{
unsigned aper_size, gatt_size, new_aper_size;
unsigned aper_base, new_aper_base;
struct pci_dev *dev;
void *gatt;
- int i, error;
+ int i;
pr_info("PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < num_k8_northbridges; i++) {
- dev = k8_northbridges[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -678,12 +655,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
agp_gatt_table = gatt;
- error = sysdev_class_register(&gart_sysdev_class);
- if (!error)
- error = sysdev_register(&device_gart);
- if (error)
- panic("Could not register gart_sysdev -- "
- "would corrupt data on next suspend");
+ register_syscore(&gart_syscore);
flush_gart();
@@ -694,19 +666,23 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
nommu:
/* Should not happen anymore */
- pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
- "falling back to iommu=soft.\n");
+ pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n");
return -1;
}
-static struct dma_map_ops gart_dma_ops = {
+static const struct dma_map_ops gart_dma_ops = {
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
- .map_page = gart_map_page,
- .unmap_page = gart_unmap_page,
- .alloc_coherent = gart_alloc_coherent,
- .free_coherent = gart_free_coherent,
- .mapping_error = gart_mapping_error,
+ .map_phys = gart_map_phys,
+ .unmap_phys = gart_unmap_phys,
+ .alloc = gart_alloc_coherent,
+ .free = gart_free_coherent,
+ .mmap = dma_common_mmap,
+ .get_sgtable = dma_common_get_sgtable,
+ .dma_supported = dma_direct_supported,
+ .get_required_mask = dma_direct_get_required_mask,
+ .alloc_pages_op = dma_direct_alloc_pages,
+ .free_pages = dma_direct_free_pages,
};
static void gart_iommu_shutdown(void)
@@ -718,10 +694,13 @@ static void gart_iommu_shutdown(void)
if (!no_agp)
return;
- for (i = 0; i < num_k8_northbridges; i++) {
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
u32 ctl;
- dev = k8_northbridges[i];
+ dev = node_to_amd_nb(i)->misc;
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
@@ -737,16 +716,15 @@ int __init gart_iommu_init(void)
unsigned long aper_base, aper_size;
unsigned long start_pfn, end_pfn;
unsigned long scratch;
- long i;
- if (num_k8_northbridges == 0)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
/* Makefile puts PCI initialization via subsys_initcall first. */
- /* Add other K8 AGP bridge drivers here */
+ /* Add other AMD AGP bridge drivers here */
no_agp = no_agp ||
(agp_amd64_init() < 0) ||
(agp_copy_info(agp_bridge, &info) < 0);
@@ -755,10 +733,10 @@ int __init gart_iommu_init(void)
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
- (no_agp && init_k8_gatt(&info) < 0)) {
+ (no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
- pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
- pr_warning("falling back to iommu=soft.\n");
+ pr_warn("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warn("falling back to iommu=soft.\n");
}
return 0;
}
@@ -768,10 +746,10 @@ int __init gart_iommu_init(void)
aper_base = info.aper_base;
end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
- if (end_pfn > max_low_pfn_mapped) {
- start_pfn = (aper_base>>PAGE_SHIFT);
- init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
- }
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT,
+ PAGE_KERNEL);
pr_info("PCI-DMA: using GART IOMMU.\n");
iommu_size = check_iommu_size(info.aper_base, aper_size);
@@ -782,29 +760,12 @@ int __init gart_iommu_init(void)
if (!iommu_gart_bitmap)
panic("Cannot allocate iommu bitmap\n");
-#ifdef CONFIG_IOMMU_LEAK
- if (leak_trace) {
- int ret;
-
- ret = dma_debug_resize_entries(iommu_pages);
- if (ret)
- pr_debug("PCI-DMA: Cannot trace all the entries\n");
- }
-#endif
-
- /*
- * Out of IOMMU space handling.
- * Reserve some invalid pages at the beginning of the GART.
- */
- bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
-
pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
iommu_size >> 20);
agp_memory_reserved = iommu_size;
iommu_start = aper_size - iommu_size;
iommu_bus_base = info.aper_base + iommu_start;
- bad_dma_addr = iommu_bus_base;
iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
/*
@@ -820,7 +781,7 @@ int __init gart_iommu_init(void)
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
- * so the CPU wont notice potential aliases and if the memory
+ * so the CPU won't notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
* with a stray writeout of a cacheline. So play it safe and
* do an explicit, full-scale wbinvd() _after_ having marked all
@@ -846,13 +807,11 @@ int __init gart_iommu_init(void)
if (!scratch)
panic("Cannot allocate iommu scratch page");
gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
- for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
- iommu_gatt_base[i] = gart_unmapped_entry;
flush_gart();
dma_ops = &gart_dma_ops;
x86_platform.iommu_shutdown = gart_iommu_shutdown;
- swiotlb = 0;
+ x86_swiotlb_enable = false;
return 0;
}
@@ -861,16 +820,6 @@ void __init gart_parse_options(char *p)
{
int arg;
-#ifdef CONFIG_IOMMU_LEAK
- if (!strncmp(p, "leak", 4)) {
- leak_trace = 1;
- p += 4;
- if (*p == '=')
- ++p;
- if (isdigit(*p) && get_option(&p, &arg))
- iommu_leak_pages = arg;
- }
-#endif
if (isdigit(*p) && get_option(&p, &arg))
iommu_size = arg;
if (!strncmp(p, "fullflush", 9))
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 0d20286d78c6..000000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2629 +0,0 @@
-/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define EXIT_LOOP_COUNT 10000000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-/*
- * Domain for untranslated devices - only allocated
- * if iommu=pt passed on kernel cmd line.
- */
-static struct protection_domain *pt_domain;
-
-static struct iommu_ops amd_iommu_ops;
-
-/*
- * general struct to manage commands sent to an IOMMU
- */
-struct iommu_cmd {
- u32 data[4];
-};
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu);
-static void update_domain(struct protection_domain *domain);
-
-/****************************************************************************
- *
- * Helper functions
- *
- ****************************************************************************/
-
-static inline u16 get_device_id(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
-
- return calc_devid(pdev->bus->number, pdev->devfn);
-}
-
-static struct iommu_dev_data *get_dev_data(struct device *dev)
-{
- return dev->archdata.iommu;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
- struct dma_ops_domain *entry, *ret = NULL;
- unsigned long flags;
- u16 alias = amd_iommu_alias_table[devid];
-
- if (list_empty(&iommu_pd_list))
- return NULL;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
- list_for_each_entry(entry, &iommu_pd_list, list) {
- if (entry->target_dev == devid ||
- entry->target_dev == alias) {
- ret = entry;
- break;
- }
- }
-
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- return ret;
-}
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
- u16 devid;
-
- if (!dev || !dev->dma_mask)
- return false;
-
- /* No device or no PCI device */
- if (dev->bus != &pci_bus_type)
- return false;
-
- devid = get_device_id(dev);
-
- /* Out of our scope? */
- if (devid > amd_iommu_last_bdf)
- return false;
-
- if (amd_iommu_rlookup_table[devid] == NULL)
- return false;
-
- return true;
-}
-
-static int iommu_init_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct pci_dev *pdev;
- u16 devid, alias;
-
- if (dev->archdata.iommu)
- return 0;
-
- dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
- if (!dev_data)
- return -ENOMEM;
-
- dev_data->dev = dev;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
- if (pdev)
- dev_data->alias = &pdev->dev;
-
- atomic_set(&dev_data->bind, 0);
-
- dev->archdata.iommu = dev_data;
-
-
- return 0;
-}
-
-static void iommu_uninit_device(struct device *dev)
-{
- kfree(dev->archdata.iommu);
-}
-
-void __init amd_iommu_uninit_devices(void)
-{
- struct pci_dev *pdev = NULL;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- iommu_uninit_device(&pdev->dev);
- }
-}
-
-int __init amd_iommu_init_devices(void)
-{
- struct pci_dev *pdev = NULL;
- int ret = 0;
-
- for_each_pci_dev(pdev) {
-
- if (!check_device(&pdev->dev))
- continue;
-
- ret = iommu_init_device(&pdev->dev);
- if (ret)
- goto out_free;
- }
-
- return 0;
-
-out_free:
-
- amd_iommu_uninit_devices();
-
- return ret;
-}
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
- if (stats_dir == NULL)
- return;
-
- cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
- &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
- stats_dir = debugfs_create_dir("amd-iommu", NULL);
- if (stats_dir == NULL)
- return;
-
- de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
- (u32 *)&amd_iommu_unmap_flush);
-
- amd_iommu_stats_add(&compl_wait);
- amd_iommu_stats_add(&cnt_map_single);
- amd_iommu_stats_add(&cnt_unmap_single);
- amd_iommu_stats_add(&cnt_map_sg);
- amd_iommu_stats_add(&cnt_unmap_sg);
- amd_iommu_stats_add(&cnt_alloc_coherent);
- amd_iommu_stats_add(&cnt_free_coherent);
- amd_iommu_stats_add(&cross_page);
- amd_iommu_stats_add(&domain_flush_single);
- amd_iommu_stats_add(&domain_flush_all);
- amd_iommu_stats_add(&alloced_io_mem);
- amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void dump_dte_entry(u16 devid)
-{
- int i;
-
- for (i = 0; i < 8; ++i)
- pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
- amd_iommu_dev_table[devid].data[i]);
-}
-
-static void dump_command(unsigned long phys_addr)
-{
- struct iommu_cmd *cmd = phys_to_virt(phys_addr);
- int i;
-
- for (i = 0; i < 4; ++i)
- pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
-}
-
-static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
-{
- u32 *event = __evt;
- int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
- int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
- int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
- int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
- u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
- printk(KERN_ERR "AMD-Vi: Event logged [");
-
- switch (type) {
- case EVENT_TYPE_ILL_DEV:
- printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- dump_dte_entry(devid);
- break;
- case EVENT_TYPE_IO_FAULT:
- printk("IO_PAGE_FAULT device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_DEV_TAB_ERR:
- printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- case EVENT_TYPE_PAGE_TAB_ERR:
- printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- domid, address, flags);
- break;
- case EVENT_TYPE_ILL_CMD:
- printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
- iommu->reset_in_progress = true;
- reset_iommu_command_buffer(iommu);
- dump_command(address);
- break;
- case EVENT_TYPE_CMD_HARD_ERR:
- printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
- "flags=0x%04x]\n", address, flags);
- break;
- case EVENT_TYPE_IOTLB_INV_TO:
- printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
- "address=0x%016llx]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address);
- break;
- case EVENT_TYPE_INV_DEV_REQ:
- printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
- PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
- address, flags);
- break;
- default:
- printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
- }
-}
-
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
- u32 head, tail;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- while (head != tail) {
- iommu_print_event(iommu, iommu->evt_buf + head);
- head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
- }
-
- writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
- spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_poll_events(iommu);
-
- return IRQ_HANDLED;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-/*
- * Writes the command to the IOMMUs command buffer and informs the
- * hardware about the new command. Must be called with iommu->lock held.
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- u32 tail, head;
- u8 *target;
-
- WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
- tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
- target = iommu->cmd_buf + tail;
- memcpy_toio(target, cmd, sizeof(*cmd));
- tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
- head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- if (tail == head)
- return -ENOMEM;
- writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- return 0;
-}
-
-/*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&iommu->lock, flags);
- ret = __iommu_queue_command(iommu, cmd);
- if (!ret)
- iommu->need_sync = true;
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- return ret;
-}
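/*
 * The command buffer being deleted here was a simple producer ring:
 * the CPU wrote commands at 'tail', the IOMMU consumed at 'head', and
 * the ring counted as full when advancing the tail would land on the
 * head. Stand-alone arithmetic (plain user-space C, sizes illustrative):
 */
#include <assert.h>

#define CMD_BUF_SIZE	4096
#define CMD_SIZE	16

static int ring_full(unsigned int head, unsigned int tail)
{
	return ((tail + CMD_SIZE) % CMD_BUF_SIZE) == head;
}

int main(void)
{
	assert(!ring_full(0, 16));			/* room left */
	assert(ring_full(0, CMD_BUF_SIZE - CMD_SIZE));	/* would catch up to head */
	return 0;
}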
-
-/*
- * This function waits until an IOMMU has completed a completion
- * wait command
- */
-static void __iommu_wait_for_completion(struct amd_iommu *iommu)
-{
- int ready = 0;
- unsigned status = 0;
- unsigned long i = 0;
-
- INC_STATS_COUNTER(compl_wait);
-
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
- if (unlikely(i == EXIT_LOOP_COUNT))
- iommu->reset_in_progress = true;
-}
-
-/*
- * This function queues a completion wait command into the command
- * buffer of an IOMMU
- */
-static int __iommu_completion_wait(struct amd_iommu *iommu)
-{
- struct iommu_cmd cmd;
-
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
-
- return __iommu_queue_command(iommu, &cmd);
-}
-
-/*
- * This function is called whenever we need to ensure that the IOMMU has
- * completed execution of all commands we sent. It sends a
- * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
- * us about that by writing a value to a physical address we pass with
- * the command.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
- int ret = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&iommu->lock, flags);
-
- if (!iommu->need_sync)
- goto out;
-
- ret = __iommu_completion_wait(iommu);
-
- iommu->need_sync = false;
-
- if (ret)
- goto out;
-
- __iommu_wait_for_completion(iommu);
-
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
-
- if (iommu->reset_in_progress)
- reset_iommu_command_buffer(iommu);
-
- return 0;
-}
-
-static void iommu_flush_complete(struct protection_domain *domain)
-{
- int i;
-
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need to wait for completion of all commands.
- */
- iommu_completion_wait(amd_iommus[i]);
- }
-}
-
-/*
- * Command send function for invalidating a device table entry
- */
-static int iommu_flush_device(struct device *dev)
-{
- struct amd_iommu *iommu;
- struct iommu_cmd cmd;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- /* Build command */
- memset(&cmd, 0, sizeof(cmd));
- CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
- cmd.data[0] = devid;
-
- return iommu_queue_command(iommu, &cmd);
-}
-
-static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
- u16 domid, int pde, int s)
-{
- memset(cmd, 0, sizeof(*cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- cmd->data[1] |= domid;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-/*
- * Generic command send function for invalidating TLB entries
- */
-static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
- u64 address, u16 domid, int pde, int s)
-{
- struct iommu_cmd cmd;
- int ret;
-
- __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
-
- ret = iommu_queue_command(iommu, &cmd);
-
- return ret;
-}
-
-/*
- * TLB invalidation function which is called from the mapping functions.
- * It invalidates a single PTE if the range to flush is within a single
- * page. Otherwise it flushes the whole TLB of the IOMMU.
- */
-static void __iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size, int pde)
-{
- int s = 0, i;
- unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
-
- address &= PAGE_MASK;
-
- if (pages > 1) {
- /*
- * If we have to flush more than one page, flush all
- * TLB entries for this domain
- */
- address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
- s = 1;
- }
-
-
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
-
- /*
- * Devices of this domain are behind this IOMMU
- * We need a TLB flush
- */
- iommu_queue_inv_iommu_pages(amd_iommus[i], address,
- domain->id, pde, s);
- }
-
- return;
-}
-
-static void iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size)
-{
- __iommu_flush_pages(domain, address, size, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct protection_domain *domain)
-{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct protection_domain *domain)
-{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void iommu_flush_domain_devices(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- iommu_flush_device(dev_data->dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-static void iommu_flush_all_domain_devices(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- iommu_flush_domain_devices(domain);
- iommu_flush_complete(domain);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-void amd_iommu_flush_all_devices(void)
-{
- iommu_flush_all_domain_devices();
-}
-
-/*
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
- */
-void amd_iommu_flush_all_domains(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- spin_lock(&domain->lock);
- iommu_flush_tlb_pde(domain);
- iommu_flush_complete(domain);
- spin_unlock(&domain->lock);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
-{
- pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
-
- if (iommu->reset_in_progress)
- panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
-
- amd_iommu_reset_cmd_buffer(iommu);
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-
- iommu->reset_in_progress = false;
-}
-
-/****************************************************************************
- *
- * The functions below are used to create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
- gfp_t gfp)
-{
- u64 *pte;
-
- if (domain->mode == PAGE_MODE_6_LEVEL)
- /* address space already 64 bit large */
- return false;
-
- pte = (void *)get_zeroed_page(gfp);
- if (!pte)
- return false;
-
- *pte = PM_LEVEL_PDE(domain->mode,
- virt_to_phys(domain->pt_root));
- domain->pt_root = pte;
- domain->mode += 1;
- domain->updated = true;
-
- return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
- unsigned long address,
- unsigned long page_size,
- u64 **pte_page,
- gfp_t gfp)
-{
- int level, end_lvl;
- u64 *pte, *page;
-
- BUG_ON(!is_power_of_2(page_size));
-
- while (address > PM_LEVEL_SIZE(domain->mode))
- increase_address_space(domain, gfp);
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
- address = PAGE_SIZE_ALIGN(address, page_size);
- end_lvl = PAGE_SIZE_LEVEL(page_size);
-
- while (level > end_lvl) {
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(gfp);
- if (!page)
- return NULL;
- *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
- }
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- pte = IOMMU_PTE_PAGE(*pte);
-
- if (pte_page && level == end_lvl)
- *pte_page = pte;
-
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-{
- int level;
- u64 *pte;
-
- if (address > PM_LEVEL_SIZE(domain->mode))
- return NULL;
-
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
- while (level > 0) {
-
- /* Not Present */
- if (!IOMMU_PTE_PRESENT(*pte))
- return NULL;
-
- /* Large PTE */
- if (PM_PTE_LEVEL(*pte) == 0x07) {
- unsigned long pte_mask, __pte;
-
- /*
- * If we have a series of large PTEs, make
- * sure to return a pointer to the first one.
- */
- pte_mask = PTE_PAGE_SIZE(*pte);
- pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
- __pte = ((unsigned long)pte) & pte_mask;
-
- return (u64 *)__pte;
- }
-
- /* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
- return NULL;
-
- level -= 1;
-
- /* Walk to the next level */
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[PM_LEVEL_INDEX(level, address)];
- }
-
- return pte;
-}
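/*
 * How the large-PTE rounding above works: this driver encoded a
 * power-of-two page size by replicating one level-7 entry across
 * consecutive slots of a table. Assuming the PAGE_SIZE_PTE_COUNT()
 * macro of this era, count = 2^((ffs(pgsize) - 12) % 9); e.g. a 32K
 * page occupies 2^((15 - 12) % 9) = 8 consecutive 8-byte entries, so
 *
 *	pte_mask = ~((8 << 3) - 1) = ~63
 *
 * rounds the PTE pointer down to the first entry of that series.
 */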
-
-/*
- * Generic mapping function: it maps a physical address into a DMA
- * address space, allocating the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_map_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot,
- unsigned long page_size)
-{
- u64 __pte, *pte;
- int i, count;
-
- if (!(prot & IOMMU_PROT_MASK))
- return -EINVAL;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
- count = PAGE_SIZE_PTE_COUNT(page_size);
- pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
-
- for (i = 0; i < count; ++i)
- if (IOMMU_PTE_PRESENT(pte[i]))
- return -EBUSY;
-
- if (page_size > PAGE_SIZE) {
- __pte = PAGE_SIZE_PTE(phys_addr, page_size);
- __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
- } else
- __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (prot & IOMMU_PROT_IR)
- __pte |= IOMMU_PTE_IR;
- if (prot & IOMMU_PROT_IW)
- __pte |= IOMMU_PTE_IW;
-
- for (i = 0; i < count; ++i)
- pte[i] = __pte;
-
- update_domain(dom);
-
- return 0;
-}
-
-static unsigned long iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long page_size)
-{
- unsigned long long unmap_size, unmapped;
- u64 *pte;
-
- BUG_ON(!is_power_of_2(page_size));
-
- unmapped = 0;
-
- while (unmapped < page_size) {
-
- pte = fetch_pte(dom, bus_addr);
-
- if (!pte) {
- /*
- * No PTE for this address
- * move forward in 4kb steps
- */
- unmap_size = PAGE_SIZE;
- } else if (PM_PTE_LEVEL(*pte) == 0) {
- /* 4kb PTE found for this address */
- unmap_size = PAGE_SIZE;
- *pte = 0ULL;
- } else {
- int count, i;
-
- /* Large PTE found which maps this address */
- unmap_size = PTE_PAGE_SIZE(*pte);
- count = PAGE_SIZE_PTE_COUNT(unmap_size);
- for (i = 0; i < count; i++)
- pte[i] = 0ULL;
- }
-
- bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
- unmapped += unmap_size;
- }
-
- BUG_ON(!is_power_of_2(unmapped));
-
- return unmapped;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
- struct unity_map_entry *entry)
-{
- u16 bdf, i;
-
- for (i = entry->devid_start; i <= entry->devid_end; ++i) {
- bdf = amd_iommu_alias_table[i];
- if (amd_iommu_rlookup_table[bdf] == iommu)
- return 1;
- }
-
- return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
- struct unity_map_entry *e)
-{
- u64 addr;
- int ret;
-
- for (addr = e->address_start; addr < e->address_end;
- addr += PAGE_SIZE) {
- ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
- PAGE_SIZE);
- if (ret)
- return ret;
- /*
- * if unity mapping is in aperture range mark the page
- * as allocated in the aperture
- */
- if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT,
- dma_dom->aperture[0]->bitmap);
- }
-
- return 0;
-}
-
-/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
- struct unity_map_entry *entry;
- int ret;
-
- list_for_each_entry(entry, &amd_iommu_unity_map, list) {
- if (!iommu_for_unity_map(iommu, entry))
- continue;
- ret = dma_ops_unity_map(iommu->default_dom, entry);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * Inits the unity mappings required for a specific device
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
- u16 devid)
-{
- struct unity_map_entry *e;
- int ret;
-
- list_for_each_entry(e, &amd_iommu_unity_map, list) {
- if (!(devid >= e->devid_start && devid <= e->devid_end))
- continue;
- ret = dma_ops_unity_map(dma_dom, e);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. It's basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges).
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
- unsigned long start_page,
- unsigned int pages)
-{
- unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
- if (start_page + pages > last_page)
- pages = last_page - start_page;
-
- for (i = start_page; i < start_page + pages; ++i) {
- int index = i / APERTURE_RANGE_PAGES;
- int page = i % APERTURE_RANGE_PAGES;
- __set_bit(page, dom->aperture[index]->bitmap);
- }
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
- bool populate, gfp_t gfp)
-{
- int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
- struct amd_iommu *iommu;
- unsigned long i;
-
-#ifdef CONFIG_IOMMU_STRESS
- populate = false;
-#endif
-
- if (index >= APERTURE_MAX_RANGES)
- return -ENOMEM;
-
- dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
- if (!dma_dom->aperture[index])
- return -ENOMEM;
-
- dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
- if (!dma_dom->aperture[index]->bitmap)
- goto out_free;
-
- dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
- if (populate) {
- unsigned long address = dma_dom->aperture_size;
- int i, num_ptes = APERTURE_RANGE_PAGES / 512;
- u64 *pte, *pte_page;
-
- for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
- &pte_page, gfp);
- if (!pte)
- goto out_free;
-
- dma_dom->aperture[index]->pte_pages[i] = pte_page;
-
- address += APERTURE_RANGE_SIZE / 64;
- }
- }
-
- dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
- /* Initialize the exclusion range if necessary */
- for_each_iommu(iommu) {
- if (iommu->exclusion_start &&
- iommu->exclusion_start >= dma_dom->aperture[index]->offset
- && iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- startpage = iommu->exclusion_start >> PAGE_SHIFT;
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
- }
-
- /*
- * Check for areas already mapped as present in the new aperture
- * range and mark those pages as reserved in the allocator. Such
- * mappings may already exist as a result of requested unity
- * mappings for devices.
- */
- for (i = dma_dom->aperture[index]->offset;
- i < dma_dom->aperture_size;
- i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i);
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- continue;
-
- dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
- }
-
- update_domain(&dma_dom->domain);
-
- return 0;
-
-out_free:
- update_domain(&dma_dom->domain);
-
- free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
- kfree(dma_dom->aperture[index]);
- dma_dom->aperture[index] = NULL;
-
- return -ENOMEM;
-}
-
-static unsigned long dma_ops_area_alloc(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask,
- unsigned long start)
-{
- unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
- int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
- int i = start >> APERTURE_RANGE_SHIFT;
- unsigned long boundary_size;
- unsigned long address = -1;
- unsigned long limit;
-
- next_bit >>= PAGE_SHIFT;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- for (;i < max_index; ++i) {
- unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
-
- if (dom->aperture[i]->offset >= dma_mask)
- break;
-
- limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
- dma_mask >> PAGE_SHIFT);
-
- address = iommu_area_alloc(dom->aperture[i]->bitmap,
- limit, next_bit, pages, 0,
- boundary_size, align_mask);
- if (address != -1) {
- address = dom->aperture[i]->offset +
- (address << PAGE_SHIFT);
- dom->next_address = address + (pages << PAGE_SHIFT);
- break;
- }
-
- next_bit = 0;
- }
-
- return address;
-}
-
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
- struct dma_ops_domain *dom,
- unsigned int pages,
- unsigned long align_mask,
- u64 dma_mask)
-{
- unsigned long address;
-
-#ifdef CONFIG_IOMMU_STRESS
- dom->next_address = 0;
- dom->need_flush = true;
-#endif
-
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, dom->next_address);
-
- if (address == -1) {
- dom->next_address = 0;
- address = dma_ops_area_alloc(dev, dom, pages, align_mask,
- dma_mask, 0);
- dom->need_flush = true;
- }
-
- if (unlikely(address == -1))
- address = DMA_ERROR_CODE;
-
- WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
-
- return address;
-}
-
-/*
- * The address free function.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
- unsigned long address,
- unsigned int pages)
-{
- unsigned i = address >> APERTURE_RANGE_SHIFT;
- struct aperture_range *range = dom->aperture[i];
-
- BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
- if (i < 4)
- return;
-#endif
-
- if (address >= dom->next_address)
- dom->need_flush = true;
-
- address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
- bitmap_clear(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device gets its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-/*
- * This function adds a protection domain to the global protection domain list
- */
-static void add_domain_to_list(struct protection_domain *domain)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- list_add(&domain->list, &amd_iommu_pd_list);
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-/*
- * This function removes a protection domain from the global
- * protection domain list
- */
-static void del_domain_from_list(struct protection_domain *domain)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
- list_del(&domain->list);
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static u16 domain_id_alloc(void)
-{
- unsigned long flags;
- int id;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
- BUG_ON(id == 0);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __set_bit(id, amd_iommu_pd_alloc_bitmap);
- else
- id = 0;
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return id;
-}
-
-static void domain_id_free(int id)
-{
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- if (id > 0 && id < MAX_DOMAIN_ID)
- __clear_bit(id, amd_iommu_pd_alloc_bitmap);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void free_pagetable(struct protection_domain *domain)
-{
- int i, j;
- u64 *p1, *p2, *p3;
-
- p1 = domain->pt_root;
-
- if (!p1)
- return;
-
- for (i = 0; i < 512; ++i) {
- if (!IOMMU_PTE_PRESENT(p1[i]))
- continue;
-
- p2 = IOMMU_PTE_PAGE(p1[i]);
- for (j = 0; j < 512; ++j) {
- if (!IOMMU_PTE_PRESENT(p2[j]))
- continue;
- p3 = IOMMU_PTE_PAGE(p2[j]);
- free_page((unsigned long)p3);
- }
-
- free_page((unsigned long)p2);
- }
-
- free_page((unsigned long)p1);
-
- domain->pt_root = NULL;
-}
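-
-/*
- * A coverage sketch for the three-level walk above, assuming 4KB pages and
- * 512 entries per level: each leaf table (p3) maps 512 * 4KB = 2MB, each
- * p2 page therefore covers 512 * 2MB = 1GB, and the root (p1) covers up to
- * 512 * 1GB = 512GB of DMA address space.
- */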
-
-/*
- * Free a domain, only used if something went wrong in the
- * allocation path and we need to free an already allocated page table
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
- int i;
-
- if (!dom)
- return;
-
- del_domain_from_list(&dom->domain);
-
- free_pagetable(&dom->domain);
-
- for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
- if (!dom->aperture[i])
- continue;
- free_page((unsigned long)dom->aperture[i]->bitmap);
- kfree(dom->aperture[i]);
- }
-
- kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(void)
-{
- struct dma_ops_domain *dma_dom;
-
- dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
- if (!dma_dom)
- return NULL;
-
- spin_lock_init(&dma_dom->domain.lock);
-
- dma_dom->domain.id = domain_id_alloc();
- if (dma_dom->domain.id == 0)
- goto free_dma_dom;
- INIT_LIST_HEAD(&dma_dom->domain.dev_list);
- dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
- dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- dma_dom->domain.flags = PD_DMA_OPS_MASK;
- dma_dom->domain.priv = dma_dom;
- if (!dma_dom->domain.pt_root)
- goto free_dma_dom;
-
- dma_dom->need_flush = false;
- dma_dom->target_dev = 0xffff;
-
- add_domain_to_list(&dma_dom->domain);
-
- if (alloc_new_range(dma_dom, true, GFP_KERNEL))
- goto free_dma_dom;
-
- /*
-	 * Mark the first page as allocated so we never return 0 as a
-	 * valid DMA address; this lets us use 0 as an error value.
- */
- dma_dom->aperture[0]->bitmap[0] = 1;
- dma_dom->next_address = 0;
-
- return dma_dom;
-
-free_dma_dom:
- dma_ops_domain_free(dma_dom);
-
- return NULL;
-}
-
-/*
- * little helper function to check whether a given protection domain is a
- * dma_ops domain
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
- return domain->flags & PD_DMA_OPS_MASK;
-}
-
-static void set_dte_entry(u16 devid, struct protection_domain *domain)
-{
- u64 pte_root = virt_to_phys(domain->pt_root);
-
- pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
- << DEV_ENTRY_MODE_SHIFT;
- pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
- amd_iommu_dev_table[devid].data[2] = domain->id;
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-}
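-
-/*
- * A small worked example for the DTE composition above (hypothetical
- * values; DEV_ENTRY_MODE_SHIFT is assumed to be 9 as in this driver's
- * headers): with pt_root at physical 0x12340000 and a 3-level page table,
- *
- *   pte_root = 0x12340000 | (3 << 9) | IR | IW | P | TV
- *
- * data[0]/data[1] then receive the low/high 32 bits and data[2] the
- * 16-bit domain id.
- */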
-
-static void clear_dte_entry(u16 devid)
-{
- /* remove entry from the device table seen by the hardware */
- amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[1] = 0;
- amd_iommu_dev_table[devid].data[2] = 0;
-
- amd_iommu_apply_erratum_63(devid);
-}
-
-static void do_attach(struct device *dev, struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
- dev_data = get_dev_data(dev);
-
- /* Update data structures */
- dev_data->domain = domain;
- list_add(&dev_data->list, &domain->dev_list);
- set_dte_entry(devid, domain);
-
- /* Do reference counting */
- domain->dev_iommu[iommu->index] += 1;
- domain->dev_cnt += 1;
-
- /* Flush the DTE entry */
- iommu_flush_device(dev);
-}
-
-static void do_detach(struct device *dev)
-{
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- u16 devid;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
- dev_data = get_dev_data(dev);
-
- /* decrease reference counters */
- dev_data->domain->dev_iommu[iommu->index] -= 1;
- dev_data->domain->dev_cnt -= 1;
-
- /* Update data structures */
- dev_data->domain = NULL;
- list_del(&dev_data->list);
- clear_dte_entry(devid);
-
- /* Flush the DTE entry */
- iommu_flush_device(dev);
-}
-
-/*
- * If a device is not yet associated with a domain, this function
- * assigns it to the domain and makes that assignment visible to the hardware
- */
-static int __attach_device(struct device *dev,
- struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *alias_data;
- int ret;
-
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
-
- if (!alias_data)
- return -EINVAL;
-
- /* lock domain */
- spin_lock(&domain->lock);
-
- /* Some sanity checks */
- ret = -EBUSY;
- if (alias_data->domain != NULL &&
- alias_data->domain != domain)
- goto out_unlock;
-
- if (dev_data->domain != NULL &&
- dev_data->domain != domain)
- goto out_unlock;
-
- /* Do real assignment */
- if (dev_data->alias != dev) {
- alias_data = get_dev_data(dev_data->alias);
- if (alias_data->domain == NULL)
- do_attach(dev_data->alias, domain);
-
- atomic_inc(&alias_data->bind);
- }
-
- if (dev_data->domain == NULL)
- do_attach(dev, domain);
-
- atomic_inc(&dev_data->bind);
-
- ret = 0;
-
-out_unlock:
-
- /* ready */
- spin_unlock(&domain->lock);
-
- return ret;
-}
-
-/*
- * If a device is not yet associated with a domain, this function
- * assigns it to the domain and makes that assignment visible to the hardware
- */
-static int attach_device(struct device *dev,
- struct protection_domain *domain)
-{
- unsigned long flags;
- int ret;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- ret = __attach_device(dev, domain);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- /*
- * We might boot into a crash-kernel here. The crashed kernel
- * left the caches in the IOMMU dirty. So we have to flush
- * here to evict all dirty stuff.
- */
- iommu_flush_tlb_pde(domain);
-
- return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct device *dev)
-{
- struct iommu_dev_data *dev_data = get_dev_data(dev);
- struct iommu_dev_data *alias_data;
- struct protection_domain *domain;
- unsigned long flags;
-
- BUG_ON(!dev_data->domain);
-
- domain = dev_data->domain;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- if (dev_data->alias != dev) {
- alias_data = get_dev_data(dev_data->alias);
- if (atomic_dec_and_test(&alias_data->bind))
- do_detach(dev_data->alias);
- }
-
- if (atomic_dec_and_test(&dev_data->bind))
- do_detach(dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- /*
- * If we run in passthrough mode the device must be assigned to the
- * passthrough domain if it is detached from any other domain.
- * Make sure we can deassign from the pt_domain itself.
- */
- if (iommu_pass_through &&
- (dev_data->domain == NULL && domain != pt_domain))
- __attach_device(dev, pt_domain);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct device *dev)
-{
- unsigned long flags;
-
- /* lock device table */
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
- __detach_device(dev);
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(struct device *dev)
-{
- struct protection_domain *dom;
- struct iommu_dev_data *dev_data, *alias_data;
- unsigned long flags;
- u16 devid, alias;
-
- devid = get_device_id(dev);
- alias = amd_iommu_alias_table[devid];
- dev_data = get_dev_data(dev);
- alias_data = get_dev_data(dev_data->alias);
- if (!alias_data)
- return NULL;
-
- read_lock_irqsave(&amd_iommu_devtable_lock, flags);
- dom = dev_data->domain;
- if (dom == NULL &&
- alias_data->domain != NULL) {
- __attach_device(dev, alias_data->domain);
- dom = alias_data->domain;
- }
-
- read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
- return dom;
-}
-
-static int device_change_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- u16 devid;
- struct protection_domain *domain;
- struct dma_ops_domain *dma_domain;
- struct amd_iommu *iommu;
- unsigned long flags;
-
- if (!check_device(dev))
- return 0;
-
- devid = get_device_id(dev);
- iommu = amd_iommu_rlookup_table[devid];
-
- switch (action) {
- case BUS_NOTIFY_UNBOUND_DRIVER:
-
- domain = domain_for_device(dev);
-
- if (!domain)
- goto out;
- if (iommu_pass_through)
- break;
- detach_device(dev);
- break;
- case BUS_NOTIFY_ADD_DEVICE:
-
- iommu_init_device(dev);
-
- domain = domain_for_device(dev);
-
- /* allocate a protection domain if a device is added */
- dma_domain = find_protection_domain(devid);
- if (dma_domain)
- goto out;
- dma_domain = dma_ops_domain_alloc();
- if (!dma_domain)
- goto out;
- dma_domain->target_dev = devid;
-
- spin_lock_irqsave(&iommu_pd_list_lock, flags);
- list_add_tail(&dma_domain->list, &iommu_pd_list);
- spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
- break;
- case BUS_NOTIFY_DEL_DEVICE:
-
- iommu_uninit_device(dev);
-
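-		/* fall through - the device is gone, so skip the flush below */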
- default:
- goto out;
- }
-
- iommu_flush_device(dev);
- iommu_completion_wait(iommu);
-
-out:
- return 0;
-}
-
-static struct notifier_block device_nb = {
- .notifier_call = device_change_notifier,
-};
-
-void amd_iommu_init_notifier(void)
-{
- bus_register_notifier(&pci_bus_type, &device_nb);
-}
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain, this is also
- * done in this function.
- */
-static struct protection_domain *get_domain(struct device *dev)
-{
- struct protection_domain *domain;
- struct dma_ops_domain *dma_dom;
- u16 devid = get_device_id(dev);
-
- if (!check_device(dev))
- return ERR_PTR(-EINVAL);
-
- domain = domain_for_device(dev);
- if (domain != NULL && !dma_ops_domain(domain))
- return ERR_PTR(-EBUSY);
-
- if (domain != NULL)
- return domain;
-
-	/* Device not bound yet - bind it */
- dma_dom = find_protection_domain(devid);
- if (!dma_dom)
- dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
- attach_device(dev, &dma_dom->domain);
- DUMP_printk("Using protection domain %d for device %s\n",
- dma_dom->domain.id, dev_name(dev));
-
- return &dma_dom->domain;
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data;
-
- list_for_each_entry(dev_data, &domain->dev_list, list) {
- u16 devid = get_device_id(dev_data->dev);
- set_dte_entry(devid, domain);
- }
-}
-
-static void update_domain(struct protection_domain *domain)
-{
- if (!domain->updated)
- return;
-
- update_device_table(domain);
- iommu_flush_domain_devices(domain);
- iommu_flush_tlb_pde(domain);
-
- domain->updated = false;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte, *pte_page;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return NULL;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte) {
- pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
- GFP_ATOMIC);
- aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
- } else
- pte += PM_LEVEL_INDEX(0, address);
-
- update_domain(&dom->domain);
-
- return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4KB page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
- unsigned long address,
- phys_addr_t paddr,
- int direction)
-{
- u64 *pte, __pte;
-
- WARN_ON(address > dom->aperture_size);
-
- paddr &= PAGE_MASK;
-
- pte = dma_ops_get_pte(dom, address);
- if (!pte)
- return DMA_ERROR_CODE;
-
- __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
- if (direction == DMA_TO_DEVICE)
- __pte |= IOMMU_PTE_IR;
- else if (direction == DMA_FROM_DEVICE)
- __pte |= IOMMU_PTE_IW;
- else if (direction == DMA_BIDIRECTIONAL)
- __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
- WARN_ON(*pte);
-
- *pte = __pte;
-
- return (dma_addr_t)address;
-}
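-
-/*
- * Note on the direction bits above: DMA_TO_DEVICE means the device reads
- * from memory, hence IOMMU_PTE_IR (read permission); DMA_FROM_DEVICE means
- * the device writes to memory, hence IOMMU_PTE_IW; DMA_BIDIRECTIONAL sets
- * both.
- */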
-
-/*
- * The generic unmapping function for one page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
- unsigned long address)
-{
- struct aperture_range *aperture;
- u64 *pte;
-
- if (address >= dom->aperture_size)
- return;
-
- aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
- if (!aperture)
- return;
-
- pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
- if (!pte)
- return;
-
- pte += PM_LEVEL_INDEX(0, address);
-
- WARN_ON(!*pte);
-
- *pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
- struct dma_ops_domain *dma_dom,
- phys_addr_t paddr,
- size_t size,
- int dir,
- bool align,
- u64 dma_mask)
-{
- dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start, ret;
- unsigned int pages;
- unsigned long align_mask = 0;
- int i;
-
- pages = iommu_num_pages(paddr, size, PAGE_SIZE);
- paddr &= PAGE_MASK;
-
- INC_STATS_COUNTER(total_map_requests);
-
- if (pages > 1)
- INC_STATS_COUNTER(cross_page);
-
- if (align)
- align_mask = (1UL << get_order(size)) - 1;
-
-retry:
- address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
- dma_mask);
- if (unlikely(address == DMA_ERROR_CODE)) {
- /*
-		 * Setting next_address here lets the address allocator
-		 * scan only the newly allocated range on its first pass.
-		 * This is a small optimization.
- */
- dma_dom->next_address = dma_dom->aperture_size;
-
- if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
- goto out;
-
- /*
- * aperture was successfully enlarged by 128 MB, try
- * allocation again
- */
- goto retry;
- }
-
- start = address;
- for (i = 0; i < pages; ++i) {
- ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
- if (ret == DMA_ERROR_CODE)
- goto out_unmap;
-
- paddr += PAGE_SIZE;
- start += PAGE_SIZE;
- }
- address += offset;
-
- ADD_STATS_COUNTER(alloced_io_mem, size);
-
- if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(&dma_dom->domain);
- dma_dom->need_flush = false;
- } else if (unlikely(amd_iommu_np_cache))
- iommu_flush_pages(&dma_dom->domain, address, size);
-
-out:
- return address;
-
-out_unmap:
-
- for (--i; i >= 0; --i) {
- start -= PAGE_SIZE;
- dma_ops_domain_unmap(dma_dom, start);
- }
-
- dma_ops_free_addresses(dma_dom, address, pages);
-
- return DMA_ERROR_CODE;
-}
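-
-/*
- * A worked example for the offset handling in __map_single (hypothetical
- * numbers, assuming 4KB pages): paddr = 0x12345678, size = 0x1000. Then
- * offset = 0x678, iommu_num_pages() yields 2 (the region crosses a page
- * boundary), the two pages at 0x12345000 and 0x12346000 are mapped, and
- * the returned DMA address is the allocated base plus 0x678.
- */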
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct dma_ops_domain *dma_dom,
- dma_addr_t dma_addr,
- size_t size,
- int dir)
-{
- dma_addr_t i, start;
- unsigned int pages;
-
- if ((dma_addr == DMA_ERROR_CODE) ||
- (dma_addr + size > dma_dom->aperture_size))
- return;
-
- pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- dma_addr &= PAGE_MASK;
- start = dma_addr;
-
- for (i = 0; i < pages; ++i) {
- dma_ops_domain_unmap(dma_dom, start);
- start += PAGE_SIZE;
- }
-
- SUB_STATS_COUNTER(alloced_io_mem, size);
-
- dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
- if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(&dma_dom->domain, dma_addr, size);
- dma_dom->need_flush = false;
- }
-}
-
-/*
- * The exported map_single function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- dma_addr_t addr;
- u64 dma_mask;
- phys_addr_t paddr = page_to_phys(page) + offset;
-
- INC_STATS_COUNTER(cnt_map_single);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return (dma_addr_t)paddr;
- else if (IS_ERR(domain))
- return DMA_ERROR_CODE;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- addr = __map_single(dev, domain->priv, paddr, size, dir, false,
- dma_mask);
- if (addr == DMA_ERROR_CODE)
- goto out;
-
- iommu_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return addr;
-}
-
-/*
- * The exported unmap_single function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
- enum dma_data_direction dir, struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_unmap_single);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, dir);
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used when we have to map a
- * device which is not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
- int nelems, int dir)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sglist, s, nelems, i) {
- s->dma_address = (dma_addr_t)sg_phys(s);
- s->dma_length = s->length;
- }
-
- return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- int i;
- struct scatterlist *s;
- phys_addr_t paddr;
- int mapped_elems = 0;
- u64 dma_mask;
-
- INC_STATS_COUNTER(cnt_map_sg);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL)
- return map_sg_no_iommu(dev, sglist, nelems, dir);
- else if (IS_ERR(domain))
- return 0;
-
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- paddr = sg_phys(s);
-
- s->dma_address = __map_single(dev, domain->priv,
- paddr, s->length, dir, false,
- dma_mask);
-
- if (s->dma_address) {
- s->dma_length = s->length;
- mapped_elems++;
- } else
- goto unmap;
- }
-
- iommu_flush_complete(domain);
-
-out:
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return mapped_elems;
-unmap:
- for_each_sg(sglist, s, mapped_elems, i) {
- if (s->dma_address)
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- mapped_elems = 0;
-
- goto out;
-}
-
-/*
- * The exported unmap_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- unsigned long flags;
- struct protection_domain *domain;
- struct scatterlist *s;
- int i;
-
- INC_STATS_COUNTER(cnt_unmap_sg);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- return;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- for_each_sg(sglist, s, nelems, i) {
- __unmap_single(domain->priv, s->dma_address,
- s->dma_length, dir);
- s->dma_address = s->dma_length = 0;
- }
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long flags;
- void *virt_addr;
- struct protection_domain *domain;
- phys_addr_t paddr;
- u64 dma_mask = dev->coherent_dma_mask;
-
- INC_STATS_COUNTER(cnt_alloc_coherent);
-
- domain = get_domain(dev);
- if (PTR_ERR(domain) == -EINVAL) {
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- *dma_addr = __pa(virt_addr);
- return virt_addr;
- } else if (IS_ERR(domain))
- return NULL;
-
- dma_mask = dev->coherent_dma_mask;
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- flag |= __GFP_ZERO;
-
- virt_addr = (void *)__get_free_pages(flag, get_order(size));
- if (!virt_addr)
- return NULL;
-
- paddr = virt_to_phys(virt_addr);
-
- if (!dma_mask)
- dma_mask = *dev->dma_mask;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- *dma_addr = __map_single(dev, domain->priv, paddr,
- size, DMA_BIDIRECTIONAL, true, dma_mask);
-
- if (*dma_addr == DMA_ERROR_CODE) {
- spin_unlock_irqrestore(&domain->lock, flags);
- goto out_free;
- }
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
- return virt_addr;
-
-out_free:
-
- free_pages((unsigned long)virt_addr, get_order(size));
-
- return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
- void *virt_addr, dma_addr_t dma_addr)
-{
- unsigned long flags;
- struct protection_domain *domain;
-
- INC_STATS_COUNTER(cnt_free_coherent);
-
- domain = get_domain(dev);
- if (IS_ERR(domain))
- goto free_mem;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
- iommu_flush_complete(domain);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
- free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
- return check_device(dev);
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * Once the driver core informs the DMA layer when a driver grabs a
- * device, we won't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
- struct pci_dev *dev = NULL;
- struct dma_ops_domain *dma_dom;
- u16 devid;
-
- for_each_pci_dev(dev) {
-
- /* Do we handle this device? */
- if (!check_device(&dev->dev))
- continue;
-
- /* Is there already any domain for it? */
- if (domain_for_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- dma_dom = dma_ops_domain_alloc();
- if (!dma_dom)
- continue;
- init_unity_mappings_for_device(dma_dom, devid);
- dma_dom->target_dev = devid;
-
- attach_device(&dev->dev, &dma_dom->domain);
-
- list_add_tail(&dma_dom->list, &iommu_pd_list);
- }
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
- .alloc_coherent = alloc_coherent,
- .free_coherent = free_coherent,
- .map_page = map_page,
- .unmap_page = unmap_page,
- .map_sg = map_sg,
- .unmap_sg = unmap_sg,
- .dma_supported = amd_iommu_dma_supported,
-};
-
-/*
- * The function which glues the AMD IOMMU driver into dma_ops.
- */
-
-void __init amd_iommu_init_api(void)
-{
- register_iommu(&amd_iommu_ops);
-}
-
-int __init amd_iommu_init_dma_ops(void)
-{
- struct amd_iommu *iommu;
- int ret;
-
- /*
-	 * First, allocate a default protection domain for every IOMMU we
- * found in the system. Devices not assigned to any other
- * protection domain will be assigned to the default one.
- */
- for_each_iommu(iommu) {
- iommu->default_dom = dma_ops_domain_alloc();
- if (iommu->default_dom == NULL)
- return -ENOMEM;
- iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
- ret = iommu_init_unity_mappings(iommu);
- if (ret)
- goto free_domains;
- }
-
- /*
- * Pre-allocate the protection domains for each device.
- */
- prealloc_protection_domains();
-
- iommu_detected = 1;
- swiotlb = 0;
-
-	/* Finally make our dma_ops visible to the drivers */
- dma_ops = &amd_iommu_dma_ops;
-
- amd_iommu_stats_init();
-
- return 0;
-
-free_domains:
-
- for_each_iommu(iommu) {
- if (iommu->default_dom)
- dma_ops_domain_free(iommu->default_dom);
- }
-
- return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignment of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
- struct iommu_dev_data *dev_data, *next;
- unsigned long flags;
-
- write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
- list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
- struct device *dev = dev_data->dev;
-
- __detach_device(dev);
- atomic_set(&dev_data->bind, 0);
- }
-
- write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void protection_domain_free(struct protection_domain *domain)
-{
- if (!domain)
- return;
-
- del_domain_from_list(domain);
-
- if (domain->id)
- domain_id_free(domain->id);
-
- kfree(domain);
-}
-
-static struct protection_domain *protection_domain_alloc(void)
-{
- struct protection_domain *domain;
-
- domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- if (!domain)
- return NULL;
-
- spin_lock_init(&domain->lock);
- mutex_init(&domain->api_lock);
- domain->id = domain_id_alloc();
- if (!domain->id)
- goto out_err;
- INIT_LIST_HEAD(&domain->dev_list);
-
- add_domain_to_list(domain);
-
- return domain;
-
-out_err:
- kfree(domain);
-
- return NULL;
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
- struct protection_domain *domain;
-
- domain = protection_domain_alloc();
- if (!domain)
- goto out_free;
-
- domain->mode = PAGE_MODE_3_LEVEL;
- domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
- if (!domain->pt_root)
- goto out_free;
-
- dom->priv = domain;
-
- return 0;
-
-out_free:
- protection_domain_free(domain);
-
- return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
- struct protection_domain *domain = dom->priv;
-
- if (!domain)
- return;
-
- if (domain->dev_cnt > 0)
- cleanup_domain(domain);
-
- BUG_ON(domain->dev_cnt != 0);
-
- free_pagetable(domain);
-
- protection_domain_free(domain);
-
- dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct iommu_dev_data *dev_data = dev->archdata.iommu;
- struct amd_iommu *iommu;
- u16 devid;
-
- if (!check_device(dev))
- return;
-
- devid = get_device_id(dev);
-
- if (dev_data->domain != NULL)
- detach_device(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return;
-
- iommu_flush_device(dev);
- iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
- struct device *dev)
-{
- struct protection_domain *domain = dom->priv;
- struct iommu_dev_data *dev_data;
- struct amd_iommu *iommu;
- int ret;
- u16 devid;
-
- if (!check_device(dev))
- return -EINVAL;
-
- dev_data = dev->archdata.iommu;
-
- devid = get_device_id(dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- return -EINVAL;
-
- if (dev_data->domain)
- detach_device(dev);
-
- ret = attach_device(dev, domain);
-
- iommu_completion_wait(iommu);
-
- return ret;
-}
-
-static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
- phys_addr_t paddr, int gfp_order, int iommu_prot)
-{
- unsigned long page_size = 0x1000UL << gfp_order;
- struct protection_domain *domain = dom->priv;
- int prot = 0;
- int ret;
-
- if (iommu_prot & IOMMU_READ)
- prot |= IOMMU_PROT_IR;
- if (iommu_prot & IOMMU_WRITE)
- prot |= IOMMU_PROT_IW;
-
- mutex_lock(&domain->api_lock);
- ret = iommu_map_page(domain, iova, paddr, prot, page_size);
- mutex_unlock(&domain->api_lock);
-
- return ret;
-}
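-
-/*
- * Page size sketch for the order-based interface above: page_size =
- * 0x1000UL << gfp_order, so gfp_order 0 maps a 4KB page and gfp_order 9
- * maps a 2MB large page (0x1000 << 9 = 0x200000).
- */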
-
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
- int gfp_order)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long page_size, unmap_size;
-
- page_size = 0x1000UL << gfp_order;
-
- mutex_lock(&domain->api_lock);
- unmap_size = iommu_unmap_page(domain, iova, page_size);
- mutex_unlock(&domain->api_lock);
-
- iommu_flush_tlb_pde(domain);
-
- return get_order(unmap_size);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
- unsigned long iova)
-{
- struct protection_domain *domain = dom->priv;
- unsigned long offset_mask;
- phys_addr_t paddr;
- u64 *pte, __pte;
-
- pte = fetch_pte(domain, iova);
-
- if (!pte || !IOMMU_PTE_PRESENT(*pte))
- return 0;
-
- if (PM_PTE_LEVEL(*pte) == 0)
- offset_mask = PAGE_SIZE - 1;
- else
- offset_mask = PTE_PAGE_SIZE(*pte) - 1;
-
- __pte = *pte & PM_ADDR_MASK;
- paddr = (__pte & ~offset_mask) | (iova & offset_mask);
-
- return paddr;
-}
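-
-/*
- * A worked example for the offset masking above (hypothetical values):
- * for a 2MB large-page PTE, offset_mask = 0x200000 - 1 = 0x1fffff, so
- * with iova = 0x12345678 and the PTE pointing at physical 0x40000000,
- * paddr = 0x40000000 | (0x12345678 & 0x1fffff) = 0x40145678.
- */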
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
- unsigned long cap)
-{
- return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
- .domain_init = amd_iommu_domain_init,
- .domain_destroy = amd_iommu_domain_destroy,
- .attach_dev = amd_iommu_attach_device,
- .detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map,
- .unmap = amd_iommu_unmap,
- .iova_to_phys = amd_iommu_iova_to_phys,
- .domain_has_cap = amd_iommu_domain_has_cap,
-};
-
-/*****************************************************************************
- *
- * The next functions do a basic initialization of IOMMU for pass through
- * mode
- *
- * In passthrough mode the IOMMU is initialized and enabled but not used for
- * DMA-API translation.
- *
- *****************************************************************************/
-
-int __init amd_iommu_init_passthrough(void)
-{
- struct amd_iommu *iommu;
- struct pci_dev *dev = NULL;
- u16 devid;
-
- /* allocate passthrough domain */
- pt_domain = protection_domain_alloc();
- if (!pt_domain)
- return -ENOMEM;
-
- pt_domain->mode |= PAGE_MODE_NONE;
-
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
- if (!check_device(&dev->dev))
- continue;
-
- devid = get_device_id(&dev->dev);
-
- iommu = amd_iommu_rlookup_table[devid];
- if (!iommu)
- continue;
-
- attach_device(&dev->dev, pt_domain);
- }
-
- pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
-
- return 0;
-}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index 3cc63e2b8dd4..000000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1430 +0,0 @@
-/*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- * Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE 0x10
-#define ACPI_IVMD_TYPE_ALL 0x20
-#define ACPI_IVMD_TYPE 0x21
-#define ACPI_IVMD_TYPE_RANGE 0x22
-
-#define IVHD_DEV_ALL 0x01
-#define IVHD_DEV_SELECT 0x02
-#define IVHD_DEV_SELECT_RANGE_START 0x03
-#define IVHD_DEV_RANGE_END 0x04
-#define IVHD_DEV_ALIAS 0x42
-#define IVHD_DEV_ALIAS_RANGE 0x43
-#define IVHD_DEV_EXT_SELECT 0x46
-#define IVHD_DEV_EXT_SELECT_RANGE 0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK 0x01
-#define IVHD_FLAG_PASSPW_EN_MASK 0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK 0x04
-#define IVHD_FLAG_ISOC_EN_MASK 0x08
-
-#define IVMD_FLAG_EXCL_RANGE 0x08
-#define IVMD_FLAG_UNITY_MAP 0x01
-
-#define ACPI_DEVFLAG_INITPASS 0x01
-#define ACPI_DEVFLAG_EXTINT 0x02
-#define ACPI_DEVFLAG_NMI 0x04
-#define ACPI_DEVFLAG_SYSMGT1 0x10
-#define ACPI_DEVFLAG_SYSMGT2 0x20
-#define ACPI_DEVFLAG_LINT0 0x40
-#define ACPI_DEVFLAG_LINT1 0x80
-#define ACPI_DEVFLAG_ATSDIS 0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 cap_ptr;
- u64 mmio_phys;
- u16 pci_seg;
- u16 info;
- u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
- u8 type;
- u16 devid;
- u8 flags;
- u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
- u8 type;
- u8 flags;
- u16 length;
- u16 devid;
- u16 aux;
- u64 resv;
- u64 range_start;
- u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf; /* largest PCI device id we have
- to handle */
-LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
- we find in ACPI */
-bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
- system */
-
-/* Array to assign indices to IOMMUs */
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size; /* size of the device table */
-static u32 alias_table_size; /* size of the alias table */
-static u32 rlookup_table_size;	/* size of the rlookup table */
-
-static inline void update_last_devid(u16 devid)
-{
- if (devid > amd_iommu_last_bdf)
- amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
- unsigned shift = PAGE_SHIFT +
- get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
- return 1UL << shift;
-}
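-
-/*
- * A worked example for tbl_size(), assuming 4KB pages and the 32-byte
- * device table entries this driver uses: with amd_iommu_last_bdf = 0xffff,
- * (0xffff + 1) * 32 = 2MB, get_order(2MB) = 9, so the shift is
- * 12 + 9 = 21 and the table size comes out as 2MB.
- */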
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function sets the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
- u64 start = iommu->exclusion_start & PAGE_MASK;
- u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
- u64 entry;
-
- if (!iommu->exclusion_start)
- return;
-
- entry = start | MMIO_EXCL_ENABLE_MASK;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
- &entry, sizeof(entry));
-
- entry = limit;
- memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->mmio_base == NULL);
-
- entry = virt_to_phys(amd_iommu_dev_table);
- entry |= (dev_table_size >> 12) - 1;
- memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
- &entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl |= (1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
- u32 ctrl;
-
- ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
- ctrl &= ~(1 << bit);
- writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
- printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
- dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
- iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
- /* Disable command buffer */
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- /* Disable event logging and event interrupts */
- iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
- iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
- /* Disable IOMMU hardware itself */
- iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
- u8 *ret;
-
- if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
- pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
- address);
- pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
- return NULL;
- }
-
- ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
- if (ret != NULL)
- return ret;
-
- release_mem_region(address, MMIO_REGION_LENGTH);
-
- return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
- if (iommu->mmio_base)
- iounmap(iommu->mmio_base);
- release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Based on this information, the size of the shared
- * data structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
- return 0x04 << (*ivhd >> 6);
-}
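-
-/*
- * Example for ivhd_entry_length(): the top two bits of the type byte
- * select the entry size, 0x04 << (type >> 6). A 4-byte entry such as
- * IVHD_DEV_SELECT (0x02) gives 0x04 << 0 = 4; an 8-byte entry such as
- * IVHD_DEV_ALIAS (0x42) gives 0x04 << 1 = 8.
- */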
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
- u32 cap;
-
- cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
- update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
- return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header,
- * this function checks whether a higher device id is defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
- u8 *p = (void *)h, *end = (void *)h;
- struct ivhd_entry *dev;
-
- p += sizeof(*h);
- end += h->length;
-
- find_last_devid_on_pci(PCI_BUS(h->devid),
- PCI_SLOT(h->devid),
- PCI_FUNC(h->devid),
- h->cap_ptr);
-
- while (p < end) {
- dev = (struct ivhd_entry *)p;
- switch (dev->type) {
- case IVHD_DEV_SELECT:
- case IVHD_DEV_RANGE_END:
- case IVHD_DEV_ALIAS:
- case IVHD_DEV_EXT_SELECT:
- /* all the above subfield types refer to device ids */
- update_last_devid(dev->devid);
- break;
- default:
- break;
- }
- p += ivhd_entry_length(p);
- }
-
- WARN_ON(p != end);
-
- return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
- int i;
- u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
-
- /*
- * Validate checksum here so we don't need to do it when
- * we actually parse the table
- */
- for (i = 0; i < table->length; ++i)
- checksum += p[i];
- if (checksum != 0) {
- /* ACPI table corrupt */
- amd_iommu_init_err = -ENODEV;
- return 0;
- }
-
- p += IVRS_HEADER_LENGTH;
-
- end += table->length;
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (h->type) {
- case ACPI_IVHD_TYPE:
- find_last_devid_from_ivhd(h);
- break;
- default:
- break;
- }
- p += h->length;
- }
- WARN_ON(p != end);
-
- return 0;
-}
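-
-/*
- * Checksum sketch: an ACPI table is valid when the byte-wise sum of the
- * whole table is 0 modulo 256. E.g. for a hypothetical table whose bytes
- * sum to 0x3ff, the u8 checksum above accumulates to 0xff != 0 and the
- * table is rejected with -ENODEV.
- */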
-
-/****************************************************************************
- *
- * The following functions belong to the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(CMD_BUFFER_SIZE));
-
- if (cmd_buf == NULL)
- return NULL;
-
- iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
- return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
- iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
- writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->cmd_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->cmd_buf);
- entry |= MMIO_CMD_SIZE_512;
-
- memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
-
- amd_iommu_reset_cmd_buffer(iommu);
- iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->cmd_buf,
- get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
- iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(EVT_BUFFER_SIZE));
-
- if (iommu->evt_buf == NULL)
- return NULL;
-
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
- u64 entry;
-
- BUG_ON(iommu->evt_buf == NULL);
-
- entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
- memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
- &entry, sizeof(entry));
-
- /* set head and tail to zero manually */
- writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
- writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
- free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
- int i = (bit >> 5) & 0x07;
- int _bit = bit & 0x1f;
-
- return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
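-
-/*
- * Indexing sketch for the two helpers above: bit numbers address the DTE
- * as an array of 32-bit words, so bit B lands in data[(B >> 5) & 0x07],
- * bit (B & 0x1f). For example, a hypothetical bit 0x3e selects data[1],
- * bit 30.
- */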
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
- int sysmgt;
-
- sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
- (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
- if (sysmgt == 0x01)
- set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
- amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
- u16 devid, u32 flags, u32 ext_flags)
-{
- if (flags & ACPI_DEVFLAG_INITPASS)
- set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
- if (flags & ACPI_DEVFLAG_EXTINT)
- set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
- if (flags & ACPI_DEVFLAG_NMI)
- set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
- if (flags & ACPI_DEVFLAG_SYSMGT1)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
- if (flags & ACPI_DEVFLAG_SYSMGT2)
- set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
- if (flags & ACPI_DEVFLAG_LINT0)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
- if (flags & ACPI_DEVFLAG_LINT1)
- set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
- amd_iommu_apply_erratum_63(devid);
-
- set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initializes the IOMMU
- * with it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
- if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
- return;
-
- if (iommu) {
- /*
-		 * We can only configure exclusion ranges per IOMMU, not
-		 * per device. But we can enable the exclusion range per
-		 * device; this is done here.
- */
- set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
- iommu->exclusion_start = m->range_start;
- iommu->exclusion_length = m->range_length;
- }
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
- int cap_ptr = iommu->cap_ptr;
- u32 range, misc;
-
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
- &iommu->cap);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
- &range);
- pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
- &misc);
-
- iommu->first_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_FD(range));
- iommu->last_device = calc_devid(MMIO_GET_BUS(range),
- MMIO_GET_LD(range));
- iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
- struct ivhd_header *h)
-{
- u8 *p = (u8 *)h;
- u8 *end = p, flags = 0;
- u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
- u32 ext_flags = 0;
- bool alias = false;
- struct ivhd_entry *e;
-
- /*
- * First set the recommended feature enable bits from ACPI
- * into the IOMMU control registers
- */
- h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
- iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
- h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
- h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
- iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
- h->flags & IVHD_FLAG_ISOC_EN_MASK ?
- iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
- iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
- /*
- * make IOMMU memory accesses cache coherent
- */
- iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-
- /*
- * Done. Now parse the device entries
- */
- p += sizeof(struct ivhd_header);
- end += h->length;
-
-
- while (p < end) {
- e = (struct ivhd_entry *)p;
- switch (e->type) {
- case IVHD_DEV_ALL:
-
- DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
- " last device %02x:%02x.%x flags: %02x\n",
- PCI_BUS(iommu->first_device),
- PCI_SLOT(iommu->first_device),
- PCI_FUNC(iommu->first_device),
- PCI_BUS(iommu->last_device),
- PCI_SLOT(iommu->last_device),
- PCI_FUNC(iommu->last_device),
- e->flags);
-
- for (dev_i = iommu->first_device;
- dev_i <= iommu->last_device; ++dev_i)
- set_dev_entry_from_acpi(iommu, dev_i,
- e->flags, 0);
- break;
- case IVHD_DEV_SELECT:
-
- DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
- "flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
- break;
- case IVHD_DEV_SELECT_RANGE_START:
-
- DUMP_printk(" DEV_SELECT_RANGE_START\t "
- "devid: %02x:%02x.%x flags: %02x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = 0;
- alias = false;
- break;
- case IVHD_DEV_ALIAS:
-
- DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
- "flags: %02x devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid = e->devid;
- devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
- set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
- amd_iommu_alias_table[devid] = devid_to;
- break;
- case IVHD_DEV_ALIAS_RANGE:
-
- DUMP_printk(" DEV_ALIAS_RANGE\t\t "
- "devid: %02x:%02x.%x flags: %02x "
- "devid_to: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags,
- PCI_BUS(e->ext >> 8),
- PCI_SLOT(e->ext >> 8),
- PCI_FUNC(e->ext >> 8));
-
- devid_start = e->devid;
- flags = e->flags;
- devid_to = e->ext >> 8;
- ext_flags = 0;
- alias = true;
- break;
- case IVHD_DEV_EXT_SELECT:
-
- DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
- "flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid = e->devid;
- set_dev_entry_from_acpi(iommu, devid, e->flags,
- e->ext);
- break;
- case IVHD_DEV_EXT_SELECT_RANGE:
-
- DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
- "%02x:%02x.%x flags: %02x ext: %08x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid),
- e->flags, e->ext);
-
- devid_start = e->devid;
- flags = e->flags;
- ext_flags = e->ext;
- alias = false;
- break;
- case IVHD_DEV_RANGE_END:
-
- DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
- PCI_BUS(e->devid),
- PCI_SLOT(e->devid),
- PCI_FUNC(e->devid));
-
- devid = e->devid;
- for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
- if (alias) {
- amd_iommu_alias_table[dev_i] = devid_to;
- set_dev_entry_from_acpi(iommu,
- devid_to, flags, ext_flags);
- }
- set_dev_entry_from_acpi(iommu, dev_i,
- flags, ext_flags);
- }
- break;
- default:
- break;
- }
-
- p += ivhd_entry_length(p);
- }
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
- u16 i;
-
- for (i = iommu->first_device; i <= iommu->last_device; ++i)
- set_iommu_for_device(iommu, i);
-
- return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
- free_command_buffer(iommu);
- free_event_buffer(iommu);
- iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
- struct amd_iommu *iommu, *next;
-
- for_each_iommu_safe(iommu, next) {
- list_del(&iommu->list);
- free_iommu_one(iommu);
- kfree(iommu);
- }
-}
-
-/*
- * This function glues the initialization for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
- spin_lock_init(&iommu->lock);
-
- /* Add IOMMU to internal data structures */
- list_add_tail(&iommu->list, &amd_iommu_list);
- iommu->index = amd_iommus_present++;
-
- if (unlikely(iommu->index >= MAX_IOMMUS)) {
- WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
- return -ENOSYS;
- }
-
- /* Index is fine - add IOMMU to the array */
- amd_iommus[iommu->index] = iommu;
-
- /*
- * Copy data from ACPI table entry to the iommu struct
- */
- iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
- if (!iommu->dev)
- return 1;
-
- iommu->cap_ptr = h->cap_ptr;
- iommu->pci_seg = h->pci_seg;
- iommu->mmio_phys = h->mmio_phys;
- iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
- if (!iommu->mmio_base)
- return -ENOMEM;
-
- iommu->cmd_buf = alloc_command_buffer(iommu);
- if (!iommu->cmd_buf)
- return -ENOMEM;
-
- iommu->evt_buf = alloc_event_buffer(iommu);
- if (!iommu->evt_buf)
- return -ENOMEM;
-
- iommu->int_enabled = false;
-
- init_iommu_from_pci(iommu);
- init_iommu_from_acpi(iommu, h);
- init_iommu_devices(iommu);
-
- if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
- amd_iommu_np_cache = true;
-
- return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivhd_header *h;
- struct amd_iommu *iommu;
- int ret;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- h = (struct ivhd_header *)p;
- switch (*p) {
- case ACPI_IVHD_TYPE:
-
- DUMP_printk("device: %02x:%02x.%01x cap: %04x "
- "seg: %d flags: %01x info %04x\n",
- PCI_BUS(h->devid), PCI_SLOT(h->devid),
- PCI_FUNC(h->devid), h->cap_ptr,
- h->pci_seg, h->flags, h->info);
- DUMP_printk(" mmio-addr: %016llx\n",
- h->mmio_phys);
-
- iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
- if (iommu == NULL) {
- amd_iommu_init_err = -ENOMEM;
- return 0;
- }
-
- ret = init_iommu_one(iommu, h);
- if (ret) {
- amd_iommu_init_err = ret;
- return 0;
- }
- break;
- default:
- break;
- }
- p += h->length;
-
- }
- WARN_ON(p != end);
-
- return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. It's a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
- int r;
-
- if (pci_enable_msi(iommu->dev))
- return 1;
-
- r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD-Vi",
- NULL);
-
- if (r) {
- pci_disable_msi(iommu->dev);
- return 1;
- }
-
- iommu->int_enabled = true;
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
- return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
- if (iommu->int_enabled)
- return 0;
-
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
- return iommu_setup_msi(iommu);
-
- return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping ranges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
- struct unity_map_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
- list_del(&entry->list);
- kfree(entry);
- }
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
- int i;
-
- switch (m->type) {
- case ACPI_IVMD_TYPE:
- set_device_exclusion_range(m->devid, m);
- break;
- case ACPI_IVMD_TYPE_ALL:
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- set_device_exclusion_range(i, m);
- break;
- case ACPI_IVMD_TYPE_RANGE:
- for (i = m->devid; i <= m->aux; ++i)
- set_device_exclusion_range(i, m);
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
- struct unity_map_entry *e = 0;
- char *s;
-
- e = kzalloc(sizeof(*e), GFP_KERNEL);
- if (e == NULL)
- return -ENOMEM;
-
- switch (m->type) {
- default:
- kfree(e);
- return 0;
- case ACPI_IVMD_TYPE:
- s = "IVMD_TYPEi\t\t\t";
- e->devid_start = e->devid_end = m->devid;
- break;
- case ACPI_IVMD_TYPE_ALL:
- s = "IVMD_TYPE_ALL\t\t";
- e->devid_start = 0;
- e->devid_end = amd_iommu_last_bdf;
- break;
- case ACPI_IVMD_TYPE_RANGE:
- s = "IVMD_TYPE_RANGE\t\t";
- e->devid_start = m->devid;
- e->devid_end = m->aux;
- break;
- }
- e->address_start = PAGE_ALIGN(m->range_start);
- e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
- e->prot = m->flags >> 1;
-
- DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
- " range_start: %016llx range_end: %016llx flags: %x\n", s,
- PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
- PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
- PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
- e->address_start, e->address_end, m->flags);
-
- list_add_tail(&e->list, &amd_iommu_unity_map);
-
- return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
- u8 *p = (u8 *)table, *end = (u8 *)table;
- struct ivmd_header *m;
-
- end += table->length;
- p += IVRS_HEADER_LENGTH;
-
- while (p < end) {
- m = (struct ivmd_header *)p;
- if (m->flags & IVMD_FLAG_EXCL_RANGE)
- init_exclusion_range(m);
- else if (m->flags & IVMD_FLAG_UNITY_MAP)
- init_unity_map_range(m);
-
- p += m->length;
- }
-
- return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
- u16 devid;
-
- for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
- set_dev_entry_bit(devid, DEV_ENTRY_VALID);
- set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
- }
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu) {
- iommu_disable(iommu);
- iommu_set_device_table(iommu);
- iommu_enable_command_buffer(iommu);
- iommu_enable_event_buffer(iommu);
- iommu_set_exclusion_range(iommu);
- iommu_init_msi(iommu);
- iommu_enable(iommu);
- }
-}
-
-static void disable_iommus(void)
-{
- struct amd_iommu *iommu;
-
- for_each_iommu(iommu)
- iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static int amd_iommu_resume(struct sys_device *dev)
-{
- /* re-load the hardware */
- enable_iommus();
-
- /*
- * we have to flush after the IOMMUs are enabled because a
- * disabled IOMMU will never execute the commands we send
- */
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
-
- return 0;
-}
-
-static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* disable IOMMUs to go out of the way for BIOS */
- disable_iommus();
-
- return 0;
-}
-
-static struct sysdev_class amd_iommu_sysdev_class = {
- .name = "amd_iommu",
- .suspend = amd_iommu_suspend,
- .resume = amd_iommu_resume,
-};
-
-static struct sys_device device_amd_iommu = {
- .id = 0,
- .cls = &amd_iommu_sysdev_class,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- * 1 pass) Find the highest PCI device id the driver has to handle.
- *         Based on this information, the sizes of the data
- *         structures that need to be allocated are determined.
- *
- * 2 pass) Initialize the data structures just allocated with the
- * information in the ACPI table about available AMD IOMMUs
- * in the system. It also maps the PCI devices in the
- * system to specific IOMMUs
- *
- * 3 pass) After the basic data structures are allocated and
- * initialized we update them with information about memory
- * remapping requirements parsed out of the ACPI table in
- * this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
- int i, ret = 0;
-
- /*
- * First parse ACPI tables to find the largest Bus/Dev/Func
-	 * we need to handle. Based on this information, the shared data
-	 * structures for the IOMMUs in the system will be allocated.
- */
- if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
- return -ENODEV;
-
- ret = amd_iommu_init_err;
- if (ret)
- goto out;
-
- dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
- alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
- rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
- ret = -ENOMEM;
-
- /* Device table - directly used by all IOMMUs */
- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(dev_table_size));
- if (amd_iommu_dev_table == NULL)
- goto out;
-
- /*
- * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
-	 * IOMMU sees for that device
- */
- amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
- get_order(alias_table_size));
- if (amd_iommu_alias_table == NULL)
- goto free;
-
- /* IOMMU rlookup table - find the IOMMU for a specific device */
- amd_iommu_rlookup_table = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(rlookup_table_size));
- if (amd_iommu_rlookup_table == NULL)
- goto free;
-
- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
- GFP_KERNEL | __GFP_ZERO,
- get_order(MAX_DOMAIN_ID/8));
- if (amd_iommu_pd_alloc_bitmap == NULL)
- goto free;
-
- /* init the device table */
- init_device_table();
-
- /*
-	 * let all alias entries point to themselves
- */
- for (i = 0; i <= amd_iommu_last_bdf; ++i)
- amd_iommu_alias_table[i] = i;
-
- /*
-	 * never allocate domain 0 because it's used as the non-allocated and
- * error value placeholder
- */
- amd_iommu_pd_alloc_bitmap[0] = 1;
-
- spin_lock_init(&amd_iommu_pd_lock);
-
- /*
-	 * now that the data structures are allocated and basically
-	 * initialized, start the real ACPI table scan
- */
- ret = -ENODEV;
- if (acpi_table_parse("IVRS", init_iommu_all) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
- goto free;
-
- if (amd_iommu_init_err) {
- ret = amd_iommu_init_err;
- goto free;
- }
-
- ret = sysdev_class_register(&amd_iommu_sysdev_class);
- if (ret)
- goto free;
-
- ret = sysdev_register(&device_amd_iommu);
- if (ret)
- goto free;
-
- ret = amd_iommu_init_devices();
- if (ret)
- goto free;
-
- enable_iommus();
-
- if (iommu_pass_through)
- ret = amd_iommu_init_passthrough();
- else
- ret = amd_iommu_init_dma_ops();
-
- if (ret)
- goto free_disable;
-
- amd_iommu_init_api();
-
- amd_iommu_init_notifier();
-
- if (iommu_pass_through)
- goto out;
-
- if (amd_iommu_unmap_flush)
- printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
- else
- printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
- x86_platform.iommu_shutdown = disable_iommus;
-out:
- return ret;
-
-free_disable:
- disable_iommus();
-
-free:
- amd_iommu_uninit_devices();
-
- free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
- get_order(MAX_DOMAIN_ID/8));
-
- free_pages((unsigned long)amd_iommu_rlookup_table,
- get_order(rlookup_table_size));
-
- free_pages((unsigned long)amd_iommu_alias_table,
- get_order(alias_table_size));
-
- free_pages((unsigned long)amd_iommu_dev_table,
- get_order(dev_table_size));
-
- free_iommu_all();
-
- free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
- /*
- * We failed to initialize the AMD IOMMU - try fallback to GART
- * if possible.
- */
- gart_iommu_init();
-
-#endif
-
- goto out;
-}
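[Editor's note] The three table sizes near the top of amd_iommu_init() above come from the tbl_size() helper defined earlier in this file. A sketch of its contemporary definition, reproduced from memory, so treat it as illustrative rather than authoritative:

	static inline unsigned long tbl_size(int entry_size)
	{
		/* Round the table up to a whole power-of-two number of
		 * pages, based on the highest Bus/Dev/Func found in the
		 * first ACPI parsing pass. */
		unsigned shift = PAGE_SHIFT +
				 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);

		return 1UL << shift;
	}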
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
- return 0;
-}
-
-void __init amd_iommu_detect(void)
-{
- if (no_iommu || (iommu_detected && !gart_iommu_aperture))
- return;
-
- if (amd_iommu_disabled)
- return;
-
- if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
- iommu_detected = 1;
- amd_iommu_detected = 1;
- x86_init.iommu.iommu_init = amd_iommu_init;
-
- /* Make sure ACS will be enabled */
- pci_request_acs();
- }
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
- amd_iommu_dump = true;
-
- return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
- for (; *str; ++str) {
- if (strncmp(str, "fullflush", 9) == 0)
- amd_iommu_unmap_flush = true;
- if (strncmp(str, "off", 3) == 0)
- amd_iommu_disabled = true;
- }
-
- return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 000000000000..c1acead6227a
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/pci_ids.h>
+
+#include <asm/amd/nb.h>
+#include <asm/cpuid/api.h>
+
+static u32 *flush_words;
+
+static const struct pci_device_id amd_nb_misc_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ {}
+};
+
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
+
+static int amd_cache_northbridges(void)
+{
+ struct amd_northbridge *nb;
+ u16 i;
+
+ if (amd_northbridges.num)
+ return 0;
+
+ amd_northbridges.num = amd_num_nodes();
+
+ nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
+ return -ENOMEM;
+
+ amd_northbridges.nb = nb;
+
+ for (i = 0; i < amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
+
+ /*
+ * Each Northbridge must have a 'misc' device.
+ * If not, then uninitialize everything.
+ */
+ if (!node_to_amd_nb(i)->misc) {
+ amd_northbridges.num = 0;
+ kfree(nb);
+ return -ENODEV;
+ }
+
+ node_to_amd_nb(i)->link = amd_node_get_func(i, 4);
+ }
+
+ if (amd_gart_present())
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return 0;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_stepping >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ /* L3 cache partitioning is supported on family 0x15 */
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
+
+ return 0;
+}
+
+/*
+ * Ignores subdevice/subvendor but as far as I can figure out
+ * they're useless anyways
+ */
+bool __init early_is_amd_nb(u32 device)
+{
+ const struct pci_device_id *id;
+ u32 vendor = device & 0xffff;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return false;
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN))
+ return false;
+
+ device >>= 16;
+ for (id = amd_nb_misc_ids; id->vendor; id++)
+ if (vendor == id->vendor && device == id->device)
+ return true;
+ return false;
+}
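[Editor's note] Since early_is_amd_nb() takes the combined vendor/device dword (vendor in bits 15:0, device in bits 31:16), it pairs naturally with the early PCI accessors. A minimal sketch, assuming read_pci_config() from <asm/pci-direct.h>; scan_for_nb() itself is hypothetical:

	static void __init scan_for_nb(void)
	{
		int slot;

		/* Node devices sit on bus 0, slots 0x18-0x1f; function 3
		 * is the traditional northbridge "misc" function. */
		for (slot = 0x18; slot < 0x20; slot++) {
			u32 id = read_pci_config(0, slot, 3, 0x00);

			if (early_is_amd_nb(id))
				pr_info("NB misc device at 00:%02x.3\n", slot);
		}
	}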
+
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+ u64 base, msr;
+ unsigned int segn_busn_bits;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return NULL;
+
+ /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */
+ if (boot_cpu_data.x86 < 0x10 ||
+ rdmsrq_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr))
+ return NULL;
+
+ /* mmconfig is not enabled */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ res->flags = IORESOURCE_MEM;
+ res->start = base;
+ res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+ return res;
+}
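[Editor's note] The window size falls directly out of the decoded bus-range field: ECAM assigns each bus 1 MB of config space (32 devices x 8 functions x 4 KB), so 2^n buses need 2^(n+20) bytes. A standalone arithmetic check; the field value 8 is just an example:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		unsigned int segn_busn_bits = 8;	/* 2^8 = 256 buses */
		uint64_t size = 1ULL << (segn_busn_bits + 20);

		/* 256 buses * 1 MB of ECAM space per bus = 256 MB */
		printf("mmconfig window: %llu MB\n",
		       (unsigned long long)(size >> 20));
		return 0;
	}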
+
+int amd_get_subcaches(int cpu)
+{
+ struct pci_dev *link = node_to_amd_nb(topology_amd_node_id(cpu))->link;
+ unsigned int mask;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return 0;
+
+ pci_read_config_dword(link, 0x1d4, &mask);
+
+ return (mask >> (4 * cpu_data(cpu).topo.core_id)) & 0xf;
+}
+
+int amd_set_subcaches(int cpu, unsigned long mask)
+{
+ static unsigned int reset, ban;
+ struct amd_northbridge *nb = node_to_amd_nb(topology_amd_node_id(cpu));
+ unsigned int reg;
+ int cuid;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
+ return -EINVAL;
+
+ /* if necessary, collect reset state of L3 partitioning and BAN mode */
+ if (reset == 0) {
+ pci_read_config_dword(nb->link, 0x1d4, &reset);
+ pci_read_config_dword(nb->misc, 0x1b8, &ban);
+ ban &= 0x180000;
+ }
+
+ /* deactivate BAN mode if any subcaches are to be disabled */
+ if (mask != 0xf) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
+ }
+
+ cuid = cpu_data(cpu).topo.core_id;
+ mask <<= 4 * cuid;
+ mask |= (0xf ^ (1 << cuid)) << 26;
+
+ pci_write_config_dword(nb->link, 0x1d4, mask);
+
+ /* reset BAN mode if L3 partitioning returned to reset state */
+ pci_read_config_dword(nb->link, 0x1d4, &reg);
+ if (reg == reset) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ reg &= ~0x180000;
+ pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
+ }
+
+ return 0;
+}
+
+static void amd_cache_gart(void)
+{
+ u16 i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ pr_notice("Cannot initialize GART flush words, GART support disabled\n");
+ return;
+ }
+
+ for (i = 0; i != amd_northbridges.num; i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
+}
+
+void amd_flush_garts(void)
+{
+ int flushed, i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(gart_lock);
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
+ spin_lock_irqsave(&gart_lock, flags);
+ flushed = 0;
+ for (i = 0; i < amd_northbridges.num; i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
+ flushed++;
+ }
+ for (i = 0; i < amd_northbridges.num; i++) {
+ u32 w;
+		/* Make sure the hardware actually executed the flush */
+ for (;;) {
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
+ 0x9c, &w);
+ if (!(w & 1))
+ break;
+ cpu_relax();
+ }
+ }
+ spin_unlock_irqrestore(&gart_lock, flags);
+ if (!flushed)
+ pr_notice("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(amd_flush_garts);
+
+static void __fix_erratum_688(void *info)
+{
+#define MSR_AMD64_IC_CFG 0xC0011021
+
+ msr_set_bit(MSR_AMD64_IC_CFG, 3);
+ msr_set_bit(MSR_AMD64_IC_CFG, 14);
+}
+
+/* Apply erratum 688 fix so machines without a BIOS fix work. */
+static __init void fix_erratum_688(void)
+{
+ struct pci_dev *F4;
+ u32 val;
+
+ if (boot_cpu_data.x86 != 0x14)
+ return;
+
+ if (!amd_northbridges.num)
+ return;
+
+ F4 = node_to_amd_nb(0)->link;
+ if (!F4)
+ return;
+
+ if (pci_read_config_dword(F4, 0x164, &val))
+ return;
+
+ if (val & BIT(2))
+ return;
+
+ on_each_cpu(__fix_erratum_688, NULL, 0);
+
+ pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n");
+}
+
+static __init int init_amd_nbs(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return 0;
+
+ amd_cache_northbridges();
+ amd_cache_gart();
+
+ fix_erratum_688();
+
+ return 0;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
new file mode 100644
index 000000000000..3d0a4768d603
--- /dev/null
+++ b/arch/x86/kernel/amd_node.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Node helper functions and common defines
+ *
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include <linux/debugfs.h>
+#include <asm/amd/node.h>
+
+/*
+ * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one
+ * or more nodes per package.
+ *
+ * The nodes are software-visible through PCI config space. All nodes are enumerated
+ * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8
+ * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a
+ * multi-function device.
+ *
+ * On legacy systems, these node devices represent integrated Northbridge functionality.
+ * On Zen-based systems, these node devices represent Data Fabric functionality.
+ *
+ * See "Configuration Space Accesses" section in BKDGs or
+ * "Processor x86 Core" -> "Configuration Space" section in PPRs.
+ */
+struct pci_dev *amd_node_get_func(u16 node, u8 func)
+{
+ if (node >= MAX_AMD_NUM_NODES)
+ return NULL;
+
+ return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
+}
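[Editor's note] A hypothetical caller, to show the reference-counting contract: pci_get_domain_bus_and_slot() hands back a counted reference, so users of amd_node_get_func() must drop it with pci_dev_put(). The function name and register offset below are illustrative only:

	static int example_read_node0(void)
	{
		struct pci_dev *df_f3 = amd_node_get_func(0, 3);
		u32 val;
		int err;

		if (!df_f3)
			return -ENODEV;

		/* 0x44 is a made-up offset, used only for illustration */
		err = pci_read_config_dword(df_f3, 0x44, &val);
		pci_dev_put(df_f3);

		return err;
	}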
+
+static struct pci_dev **amd_roots;
+
+/* Protect the PCI config register pairs used for SMN. */
+static DEFINE_MUTEX(smn_mutex);
+static bool smn_exclusive;
+
+#define SMN_INDEX_OFFSET 0x60
+#define SMN_DATA_OFFSET 0x64
+
+#define HSMP_INDEX_OFFSET 0xc4
+#define HSMP_DATA_OFFSET 0xc8
+
+/*
+ * SMN accesses may fail in ways that are difficult to detect here in the called
+ * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
+ * their own checking based on what behavior they expect.
+ *
+ * For SMN reads, the returned value may be zero if the register is Read-as-Zero.
+ * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
+ * can be checked here, and a proper error code can be returned.
+ *
+ * But the Read-as-Zero response cannot be verified here. A value of 0 may be
+ * correct in some cases, so callers must check that this is correct for the
+ * register/fields they need.
+ *
+ * For SMN writes, success can be determined through a "write and read back".
+ * However, this is not robust when done here.
+ *
+ * Possible issues:
+ *
+ * 1) Bits that are "Write-1-to-Clear". In this case, the read value should
+ * *not* match the write value.
+ *
+ * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
+ * known here.
+ *
+ * 3) Bits that are "Reserved / Set to 1". Ditto above.
+ *
+ * Callers of amd_smn_write() should do the "write and read back" check
+ * themselves, if needed.
+ *
+ * For #1, they can see if their target bits got cleared.
+ *
+ * For #2 and #3, they can check if their target bits got set as intended.
+ *
+ * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
+ * the operation is considered a success, and the caller does their own
+ * checking.
+ */
+static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_num_nodes())
+ return err;
+
+ root = amd_roots[node];
+ if (!root)
+ return err;
+
+ if (!smn_exclusive)
+ return err;
+
+ guard(mutex)(&smn_mutex);
+
+ err = pci_write_config_dword(root, i_off, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ return pcibios_err_to_errno(err);
+ }
+
+ err = (write ? pci_write_config_dword(root, d_off, *value)
+ : pci_read_config_dword(root, d_off, value));
+
+ return pcibios_err_to_errno(err);
+}
+
+int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false);
+
+ if (PCI_POSSIBLE_ERROR(*value)) {
+ err = -ENODEV;
+ *value = 0;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int __must_check amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write)
+{
+ return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write);
+}
+EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr);
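[Editor's note] Per the comment block above, write verification is the caller's job. A minimal sketch of the "write and read back" pattern for a hypothetical register whose target bits are plain read/write (it would be wrong for W1C or read-as-zero fields); smn_write_checked() is not part of this patch:

	static int smn_write_checked(u16 node, u32 addr, u32 mask, u32 val)
	{
		u32 readback;
		int err;

		err = amd_smn_write(node, addr, val);
		if (err)
			return err;

		err = amd_smn_read(node, addr, &readback);
		if (err)
			return err;

		/* confirm the bits we care about actually stuck */
		return (readback & mask) == (val & mask) ? 0 : -EIO;
	}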
+
+static struct dentry *debugfs_dir;
+static u16 debug_node;
+static u32 debug_address;
+
+static ssize_t smn_node_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u16 node;
+ int ret;
+
+ ret = kstrtou16_from_user(userbuf, count, 0, &node);
+ if (ret)
+ return ret;
+
+ if (node >= amd_num_nodes())
+ return -ENODEV;
+
+ debug_node = node;
+ return count;
+}
+
+static int smn_node_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_node);
+ return 0;
+}
+
+static ssize_t smn_address_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &debug_address);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static int smn_address_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%08x\n", debug_address);
+ return 0;
+}
+
+static int smn_value_show(struct seq_file *m, void *v)
+{
+ u32 val;
+ int ret;
+
+ ret = amd_smn_read(debug_node, debug_address, &val);
+ if (ret)
+ return ret;
+
+ seq_printf(m, "0x%08x\n", val);
+ return 0;
+}
+
+static ssize_t smn_value_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ u32 val;
+ int ret;
+
+ ret = kstrtouint_from_user(userbuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ ret = amd_smn_write(debug_node, debug_address, val);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_node);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_address);
+DEFINE_SHOW_STORE_ATTRIBUTE(smn_value);
+
+static struct pci_dev *get_next_root(struct pci_dev *root)
+{
+ while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) {
+ /* Root device is Device 0 Function 0. */
+ if (root->devfn)
+ continue;
+
+ if (root->vendor != PCI_VENDOR_ID_AMD &&
+ root->vendor != PCI_VENDOR_ID_HYGON)
+ continue;
+
+ break;
+ }
+
+ return root;
+}
+
+static bool enable_dfs;
+
+static int __init amd_smn_enable_dfs(char *str)
+{
+ enable_dfs = true;
+ return 1;
+}
+__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs);
+
+static int __init amd_smn_init(void)
+{
+ u16 count, num_roots, roots_per_node, node, num_nodes;
+ struct pci_dev *root;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ guard(mutex)(&smn_mutex);
+
+ if (amd_roots)
+ return 0;
+
+ num_roots = 0;
+ root = NULL;
+ while ((root = get_next_root(root))) {
+ pci_dbg(root, "Reserving PCI config space\n");
+
+ /*
+ * There are a few SMN index/data pairs and other registers
+ * that shouldn't be accessed by user space. So reserve the
+ * entire PCI config space for simplicity rather than covering
+ * specific registers piecemeal.
+ */
+ if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) {
+ pci_err(root, "Failed to reserve config space\n");
+ return -EEXIST;
+ }
+
+ num_roots++;
+ }
+
+ pr_debug("Found %d AMD root devices\n", num_roots);
+
+ if (!num_roots)
+ return -ENODEV;
+
+ num_nodes = amd_num_nodes();
+ amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
+ if (!amd_roots)
+ return -ENOMEM;
+
+ roots_per_node = num_roots / num_nodes;
+
+ count = 0;
+ node = 0;
+ root = NULL;
+ while (node < num_nodes && (root = get_next_root(root))) {
+ /* Use one root for each node and skip the rest. */
+ if (count++ % roots_per_node)
+ continue;
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ amd_roots[node++] = root;
+ }
+
+ if (enable_dfs) {
+ debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir);
+
+ debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops);
+ debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops);
+ debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops);
+ }
+
+ smn_exclusive = true;
+
+ return 0;
+}
+
+fs_initcall(amd_smn_init);
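[Editor's note] To exercise the interface above from user space: boot with amd_smn_debugfs_enable, then drive the three files under /sys/kernel/debug/x86/amd_smn (arch_debugfs_dir maps to the x86 directory). A hedged sketch; the SMN address 0x50000 is a placeholder, not a documented register:

	#include <stdio.h>

	static int write_file(const char *path, const char *s)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(s, f);
		return fclose(f);
	}

	int main(void)
	{
		unsigned int val;
		FILE *f;

		if (write_file("/sys/kernel/debug/x86/amd_smn/node", "0\n") ||
		    write_file("/sys/kernel/debug/x86/amd_smn/address", "0x50000\n"))
			return 1;

		f = fopen("/sys/kernel/debug/x86/amd_smn/value", "r");
		if (!f)
			return 1;
		if (fscanf(f, "%x", &val) == 1)
			printf("SMN[0x50000] = 0x%08x\n", val);
		fclose(f);
		return 0;
	}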
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
deleted file mode 100644
index a35347501d36..000000000000
--- a/arch/x86/kernel/apb_timer.c
+++ /dev/null
@@ -1,785 +0,0 @@
-/*
- * apb_timer.c: Driver for Langwell APB timers
- *
- * (C) Copyright 2009 Intel Corporation
- * Author: Jacob Pan (jacob.jun.pan@intel.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Note:
- * Langwell is the south complex of Intel Moorestown MID platform. There are
- * eight external timers in total that can be used by the operating system.
- * The timer information, such as frequency and addresses, is provided to the
- * OS via SFI tables.
- * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
- * individual redirection table entries (RTE).
- * Unlike HPET, there is no master counter; therefore one of the timers is
- * used as the clocksource. The overall allocation looks like:
- *  - timers 0 .. NR_CPUs-1 for per cpu timers
- *  - one timer for the clocksource
- *  - one timer for the watchdog driver.
- * It is also worth noting that the APB timer does not support true one-shot
- * mode; free-running mode is used here to emulate it.
- * The APB timer can also be used as a broadcast timer along with the per cpu
- * local APIC timer, but by default it has a higher rating than the local
- * APIC timers.
- */
-
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/sysdev.h>
-#include <linux/slab.h>
-#include <linux/pm.h>
-#include <linux/pci.h>
-#include <linux/sfi.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/irq.h>
-
-#include <asm/fixmap.h>
-#include <asm/apb_timer.h>
-
-#define APBT_MASK CLOCKSOURCE_MASK(32)
-#define APBT_SHIFT 22
-#define APBT_CLOCKEVENT_RATING 150
-#define APBT_CLOCKSOURCE_RATING 250
-#define APBT_MIN_DELTA_USEC 200
-
-#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
-#define APBT_CLOCKEVENT0_NUM (0)
-#define APBT_CLOCKEVENT1_NUM (1)
-#define APBT_CLOCKSOURCE_NUM (2)
-
-static unsigned long apbt_address;
-static int apb_timer_block_enabled;
-static void __iomem *apbt_virt_address;
-static int phy_cs_timer_id;
-
-/*
- * Common DW APB timer info
- */
-static uint64_t apbt_freq;
-
-static void apbt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int apbt_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static cycle_t apbt_read_clocksource(struct clocksource *cs);
-static void apbt_restart_clocksource(struct clocksource *cs);
-
-struct apbt_dev {
- struct clock_event_device evt;
- unsigned int num;
- int cpu;
- unsigned int irq;
- unsigned int tick;
- unsigned int count;
- unsigned int flags;
- char name[10];
-};
-
-int disable_apbt_percpu __cpuinitdata;
-
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
-
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-static struct apbt_dev *apbt_devs;
-#endif
-
-static inline unsigned long apbt_readl_reg(unsigned long a)
-{
- return readl(apbt_virt_address + a);
-}
-
-static inline void apbt_writel_reg(unsigned long d, unsigned long a)
-{
- writel(d, apbt_virt_address + a);
-}
-
-static inline unsigned long apbt_readl(int n, unsigned long a)
-{
- return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
-
-static inline void apbt_writel(int n, unsigned long d, unsigned long a)
-{
- writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
-
-static inline void apbt_set_mapping(void)
-{
- struct sfi_timer_table_entry *mtmr;
-
- if (apbt_virt_address) {
- pr_debug("APBT base already mapped\n");
- return;
- }
- mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
- if (mtmr == NULL) {
- printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
- APBT_CLOCKEVENT0_NUM);
- return;
- }
- apbt_address = (unsigned long)mtmr->phys_addr;
- if (!apbt_address) {
- printk(KERN_WARNING "No timer base from SFI, use default\n");
- apbt_address = APBT_DEFAULT_BASE;
- }
- apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
- if (apbt_virt_address) {
- pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
- (void *)apbt_address, (void *)apbt_virt_address);
- } else {
- pr_debug("Failed mapping APBT phy address at %p\n",\
- (void *)apbt_address);
- goto panic_noapbt;
- }
- apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
- sfi_free_mtmr(mtmr);
-
- /* Now figure out the physical timer id for clocksource device */
- mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
- if (mtmr == NULL)
- goto panic_noapbt;
-
- /* Now figure out the physical timer id */
- phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
- / APBTMRS_REG_SIZE;
- pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
- return;
-
-panic_noapbt:
- panic("Failed to setup APB system timer\n");
-
-}
-
-static inline void apbt_clear_mapping(void)
-{
- iounmap(apbt_virt_address);
- apbt_virt_address = NULL;
-}
-
-/*
- * APBT timer interrupt enable / disable
- */
-static inline int is_apbt_capable(void)
-{
- return apbt_virt_address ? 1 : 0;
-}
-
-static struct clocksource clocksource_apbt = {
- .name = "apbt",
- .rating = APBT_CLOCKSOURCE_RATING,
- .read = apbt_read_clocksource,
- .mask = APBT_MASK,
- .shift = APBT_SHIFT,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
- .resume = apbt_restart_clocksource,
-};
-
-/* boot APB clock event device */
-static struct clock_event_device apbt_clockevent = {
- .name = "apbt0",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = apbt_set_mode,
- .set_next_event = apbt_next_event,
- .shift = APBT_SHIFT,
- .irq = 0,
- .rating = APBT_CLOCKEVENT_RATING,
-};
-
-/*
- * If the user does not want to use the per CPU APB timer, just give it a
- * lower rating than the local APIC timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp("apbt_only", arg) == 0)
- disable_apbt_percpu = 0;
- else if (strcmp("lapic_and_apbt", arg) == 0)
- disable_apbt_percpu = 1;
- else {
- pr_warning("X86 MRST timer option %s not recognised"
- " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
- arg);
- return -EINVAL;
- }
- return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
- * start counting down from 0xffff_ffff. this is done by toggling the enable
- * bit, then loading the initial load count of ~0.
- */
-static void apbt_start_counter(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
- /* enable, mask interrupt */
- ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
- ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- /* read it once to get cached counter value initialized */
- apbt_read_clocksource(&clocksource_apbt);
-}
-
-static irqreturn_t apbt_interrupt_handler(int irq, void *data)
-{
- struct apbt_dev *dev = (struct apbt_dev *)data;
- struct clock_event_device *aevt = &dev->evt;
-
- if (!aevt->event_handler) {
- printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
- dev->num);
- return IRQ_NONE;
- }
- aevt->event_handler(aevt);
- return IRQ_HANDLED;
-}
-
-static void apbt_restart_clocksource(struct clocksource *cs)
-{
- apbt_start_counter(phy_cs_timer_id);
-}
-
-/* Setup IRQ routing via IOAPIC */
-#ifdef CONFIG_SMP
-static void apbt_setup_irq(struct apbt_dev *adev)
-{
- struct irq_chip *chip;
- struct irq_desc *desc;
-
- /* timer0 irq has been setup early */
- if (adev->irq == 0)
- return;
- desc = irq_to_desc(adev->irq);
- chip = get_irq_chip(adev->irq);
- disable_irq(adev->irq);
- desc->status |= IRQ_MOVE_PCNTXT;
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
-	/* APB timer irqs are set up as mp_irqs, timer is edge triggered */
- set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
- enable_irq(adev->irq);
- if (system_state == SYSTEM_BOOTING)
- if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- adev->name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- adev->num);
- }
-}
-#endif
-
-static void apbt_enable_int(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
- /* clear pending intr */
- apbt_readl(n, APBTMR_N_EOI);
- ctrl &= ~APBTMR_CONTROL_INT;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-static void apbt_disable_int(int n)
-{
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
- ctrl |= APBTMR_CONTROL_INT;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-
-static int __init apbt_clockevent_register(void)
-{
- struct sfi_timer_table_entry *mtmr;
- struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
-
- mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
- if (mtmr == NULL) {
- printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
- APBT_CLOCKEVENT0_NUM);
- return -ENODEV;
- }
-
- /*
- * We need to calculate the scaled math multiplication factor for
- * nanosecond to apbt tick conversion.
- * mult = (nsec/cycle)*2^APBT_SHIFT
- */
- apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
- , NSEC_PER_SEC, APBT_SHIFT);
-
- /* Calculate the min / max delta */
- apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &apbt_clockevent);
- apbt_clockevent.min_delta_ns = clockevent_delta2ns(
- APBT_MIN_DELTA_USEC*apbt_freq,
- &apbt_clockevent);
- /*
- * Start apbt with the boot cpu mask and make it
- * global if not used for per cpu timer.
- */
- apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
- adev->num = smp_processor_id();
- memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
-
- if (disable_apbt_percpu) {
- apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
- global_clock_event = &adev->evt;
- printk(KERN_DEBUG "%s clockevent registered as global\n",
- global_clock_event->name);
- }
-
- if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- apbt_clockevent.name, adev)) {
- printk(KERN_ERR "Failed request IRQ for APBT%d\n",
- apbt_clockevent.irq);
- }
-
- clockevents_register_device(&adev->evt);
- /* Start APBT 0 interrupts */
- apbt_enable_int(APBT_CLOCKEVENT0_NUM);
-
- sfi_free_mtmr(mtmr);
- return 0;
-}
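[Editor's note] For intuition about the mult/shift scaled math used above: mult is the tick rate scaled by 2^APBT_SHIFT, so the nanosecond-to-tick conversion becomes a multiply and a shift with no division on the hot path. A standalone check, assuming a hypothetical 25 MHz timer clock:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC	1000000000ULL
	#define APBT_SHIFT	22

	int main(void)
	{
		uint64_t freq_hz = 25000000;	/* hypothetical 25 MHz clock */

		/* the same factor div_sc() computes in the driver */
		uint64_t mult = (freq_hz << APBT_SHIFT) / NSEC_PER_SEC;

		/* a 1 ms period converted to timer counts, as done for
		 * CLOCK_EVT_MODE_PERIODIC in apbt_set_mode() */
		uint64_t ticks = (1000000ULL * mult) >> APBT_SHIFT;

		printf("mult=%llu, 1ms = %llu ticks (expect ~25000)\n",
		       (unsigned long long)mult, (unsigned long long)ticks);
		return 0;
	}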
-
-#ifdef CONFIG_SMP
-/* Should be called on each secondary CPU, with that CPU's per cpu data */
-void apbt_setup_secondary_clock(void)
-{
- struct apbt_dev *adev;
- struct clock_event_device *aevt;
- int cpu;
-
- /* Don't register boot CPU clockevent */
- cpu = smp_processor_id();
- if (cpu == boot_cpu_id)
- return;
- /*
- * We need to calculate the scaled math multiplication factor for
- * nanosecond to apbt tick conversion.
- * mult = (nsec/cycle)*2^APBT_SHIFT
- */
- printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
- adev = &per_cpu(cpu_apbt_dev, cpu);
- aevt = &adev->evt;
-
- memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
- aevt->cpumask = cpumask_of(cpu);
- aevt->name = adev->name;
- aevt->mode = CLOCK_EVT_MODE_UNUSED;
-
- printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
- cpu, aevt->name, *(u32 *)aevt->cpumask);
-
- apbt_setup_irq(adev);
-
- clockevents_register_device(aevt);
-
- apbt_enable_int(cpu);
-
- return;
-}
-
-/*
- * This notifier handles CPU hotplug events. In case of S0i3, non-boot CPUs
- * are disabled/enabled frequently; for performance reasons, we keep the per
- * cpu timer irq registered so that we do not need to do free_irq/request_irq.
- *
- * TODO: it might be more reliable to directly disable percpu clockevent device
- * without the notifier chain. currently, cpu 0 may get interrupts from other
- * cpu timers during the offline process due to the ordering of notification.
- * the extra interrupt is harmless.
- */
-static int apbt_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- unsigned long cpu = (unsigned long)hcpu;
- struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
-
- switch (action & 0xf) {
- case CPU_DEAD:
- apbt_disable_int(cpu);
- if (system_state == SYSTEM_RUNNING)
- pr_debug("skipping APBT CPU %lu offline\n", cpu);
- else if (adev) {
- pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
- free_irq(adev->irq, adev);
- }
- break;
- default:
- pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
- }
- return NOTIFY_OK;
-}
-
-static __init int apbt_late_init(void)
-{
- if (disable_apbt_percpu || !apb_timer_block_enabled)
- return 0;
- /* This notifier should be called after workqueue is ready */
- hotcpu_notifier(apbt_cpuhp_notify, -20);
- return 0;
-}
-fs_initcall(apbt_late_init);
-#else
-
-void apbt_setup_secondary_clock(void) {}
-
-#endif /* CONFIG_SMP */
-
-static void apbt_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- unsigned long ctrl;
- uint64_t delta;
- int timer_num;
- struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
- timer_num = adev->num;
- pr_debug("%s CPU %d timer %d mode=%d\n",
- __func__, first_cpu(*evt->cpumask), timer_num, mode);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
- delta >>= apbt_clockevent.shift;
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /*
-		 * per DW APB databook p. 46, the timer has to be disabled
-		 * before loading the counter, or it may cause a sync problem.
- */
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- udelay(1);
- pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
- apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
- /* APB timer does not have one-shot mode, use free running mode */
- case CLOCK_EVT_MODE_ONESHOT:
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- /*
-		 * set free running mode: the timer reloads the maximum
-		 * timeout, which gives enough time (~3min on a 25MHz clock)
-		 * to rearm the next event, thereby emulating one-shot mode.
- */
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /* write again to set free running mode */
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-
- /*
- * DW APB p. 46, load counter with all 1s before starting free
- * running mode.
- */
- apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
- ctrl &= ~APBTMR_CONTROL_INT;
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- apbt_disable_int(timer_num);
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- apbt_enable_int(timer_num);
- break;
- }
-}
-
-static int apbt_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- unsigned long ctrl;
- int timer_num;
-
- struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
- timer_num = adev->num;
- /* Disable timer */
- ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- /* write new count */
- apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
- ctrl |= APBTMR_CONTROL_ENABLE;
- apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
- return 0;
-}
-
-/*
- * The APB timer clock is not in sync with pclk on Langwell, which translates
- * to unreliable read values caused by sampling error. The error does not add
- * up over time and only happens when sampling a 0 as a 1 by mistake, so the
- * time would go backwards. The following code tries to prevent time from
- * traveling backwards. A little bit paranoid.
- */
-static cycle_t apbt_read_clocksource(struct clocksource *cs)
-{
- unsigned long t0, t1, t2;
- static unsigned long last_read;
-
-bad_count:
- t1 = apbt_readl(phy_cs_timer_id,
- APBTMR_N_CURRENT_VALUE);
- t2 = apbt_readl(phy_cs_timer_id,
- APBTMR_N_CURRENT_VALUE);
- if (unlikely(t1 < t2)) {
- pr_debug("APBT: read current count error %lx:%lx:%lx\n",
- t1, t2, t2 - t1);
- goto bad_count;
- }
- /*
-	 * check against the cached last read to make sure time does not go
-	 * back. it could be a normal rollover but we will triple check anyway
- */
- if (unlikely(t2 > last_read)) {
- /* check if we have a normal rollover */
- unsigned long raw_intr_status =
- apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
- /*
- * cs timer interrupt is masked but raw intr bit is set if
- * rollover occurs. then we read EOI reg to clear it.
- */
- if (raw_intr_status & (1 << phy_cs_timer_id)) {
- apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
- goto out;
- }
- pr_debug("APB CS going back %lx:%lx:%lx ",
- t2, last_read, t2 - last_read);
-bad_count_x3:
- pr_debug(KERN_INFO "tripple check enforced\n");
- t0 = apbt_readl(phy_cs_timer_id,
- APBTMR_N_CURRENT_VALUE);
- udelay(1);
- t1 = apbt_readl(phy_cs_timer_id,
- APBTMR_N_CURRENT_VALUE);
- udelay(1);
- t2 = apbt_readl(phy_cs_timer_id,
- APBTMR_N_CURRENT_VALUE);
- if ((t2 > t1) || (t1 > t0)) {
- printk(KERN_ERR "Error: APB CS tripple check failed\n");
- goto bad_count_x3;
- }
- }
-out:
- last_read = t2;
- return (cycle_t)~t2;
-}
-
-static int apbt_clocksource_register(void)
-{
- u64 start, now;
- cycle_t t1;
-
- /* Start the counter, use timer 2 as source, timer 0/1 for event */
- apbt_start_counter(phy_cs_timer_id);
-
- /* Verify whether apbt counter works */
- t1 = apbt_read_clocksource(&clocksource_apbt);
- rdtscll(start);
-
- /*
- * We don't know the TSC frequency yet, but waiting for
- * 200000 TSC cycles is safe:
- * 4 GHz == 50us
- * 1 GHz == 200us
- */
- do {
- rep_nop();
- rdtscll(now);
- } while ((now - start) < 200000UL);
-
- /* APBT is the only always on clocksource, it has to work! */
- if (t1 == apbt_read_clocksource(&clocksource_apbt))
- panic("APBT counter not counting. APBT disabled\n");
-
- /*
- * initialize and register APBT clocksource
- * convert that to ns/clock cycle
- * mult = (ns/c) * 2^APBT_SHIFT
- */
- clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
- (unsigned long) apbt_freq, APBT_SHIFT);
- clocksource_register(&clocksource_apbt);
-
- return 0;
-}
-
-/*
- * Early setup of the APBT timer: only timer 0 is used for booting, then we
- * switch to per CPU timers if possible.
- * Panics if setup fails, since this is the only platform timer on Moorestown.
- */
-void __init apbt_time_init(void)
-{
-#ifdef CONFIG_SMP
- int i;
- struct sfi_timer_table_entry *p_mtmr;
- unsigned int percpu_timer;
- struct apbt_dev *adev;
-#endif
-
- if (apb_timer_block_enabled)
- return;
- apbt_set_mapping();
- if (apbt_virt_address) {
- pr_debug("Found APBT version 0x%lx\n",\
- apbt_readl_reg(APBTMRS_COMP_VERSION));
- } else
- goto out_noapbt;
- /*
- * Read the frequency and check for a sane value, for ESL model
- * we extend the possible clock range to allow time scaling.
- */
-
- if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
- pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
- goto out_noapbt;
- }
- if (apbt_clocksource_register()) {
- pr_debug("APBT has failed to register clocksource\n");
- goto out_noapbt;
- }
- if (!apbt_clockevent_register())
- apb_timer_block_enabled = 1;
- else {
- pr_debug("APBT has failed to register clockevent\n");
- goto out_noapbt;
- }
-#ifdef CONFIG_SMP
- /* kernel cmdline disable apb timer, so we will use lapic timers */
- if (disable_apbt_percpu) {
- printk(KERN_INFO "apbt: disabled per cpu timer\n");
- return;
- }
- pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
- if (num_possible_cpus() <= sfi_mtimer_num) {
- percpu_timer = 1;
- apbt_num_timers_used = num_possible_cpus();
- } else {
- percpu_timer = 0;
- apbt_num_timers_used = 1;
- adev = &per_cpu(cpu_apbt_dev, 0);
- adev->flags &= ~APBT_DEV_USED;
- }
- pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
-
- /* here we set up per CPU timer data structure */
- apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
- GFP_KERNEL);
- if (!apbt_devs) {
- printk(KERN_ERR "Failed to allocate APB timer devices\n");
- return;
- }
- for (i = 0; i < apbt_num_timers_used; i++) {
- adev = &per_cpu(cpu_apbt_dev, i);
- adev->num = i;
- adev->cpu = i;
- p_mtmr = sfi_get_mtmr(i);
- if (p_mtmr) {
- adev->tick = p_mtmr->freq_hz;
- adev->irq = p_mtmr->irq;
- } else
- printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
- adev->count = 0;
- sprintf(adev->name, "apbt%d", i);
- }
-#endif
-
- return;
-
-out_noapbt:
- apbt_clear_mapping();
- apb_timer_block_enabled = 0;
- panic("failed to enable APB timer\n");
-}
-
-static inline void apbt_disable(int n)
-{
- if (is_apbt_capable()) {
- unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
- ctrl &= ~APBTMR_CONTROL_ENABLE;
- apbt_writel(n, ctrl, APBTMR_N_CONTROL);
- }
-}
-
-/* called before apb_timer_enable, use early map */
-unsigned long apbt_quick_calibrate(void)
-{
- int i, scale;
- u64 old, new;
- cycle_t t1, t2;
- unsigned long khz = 0;
- u32 loop, shift;
-
- apbt_set_mapping();
- apbt_start_counter(phy_cs_timer_id);
-
- /* check if the timer can count down, otherwise return */
- old = apbt_read_clocksource(&clocksource_apbt);
- i = 10000;
- while (--i) {
- if (old != apbt_read_clocksource(&clocksource_apbt))
- break;
- }
- if (!i)
- goto failed;
-
- /* count 16 ms */
- loop = (apbt_freq * 1000) << 4;
-
- /* restart the timer to ensure it won't get to 0 in the calibration */
- apbt_start_counter(phy_cs_timer_id);
-
- old = apbt_read_clocksource(&clocksource_apbt);
- old += loop;
-
- t1 = __native_read_tsc();
-
- do {
- new = apbt_read_clocksource(&clocksource_apbt);
- } while (new < old);
-
- t2 = __native_read_tsc();
-
- shift = 5;
- if (unlikely(loop >> shift == 0)) {
- printk(KERN_INFO
- "APBT TSC calibration failed, not enough resolution\n");
- return 0;
- }
- scale = (int)div_u64((t2 - t1), loop >> shift);
- khz = (scale * apbt_freq * 1000) >> shift;
- printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
- return khz;
-failed:
- return 0;
-}
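[Editor's note] The arithmetic in apbt_quick_calibrate() reduces nicely: loop is 16 ms worth of timer ticks (apbt_freq is stored in MHz), scale is TSC cycles per (loop >> 5) ticks, and the final expression collapses to TSC cycles per millisecond, i.e. kHz. A standalone check with hypothetical numbers (25 MHz timer, 2 GHz TSC):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t apbt_freq = 25;		/* MHz, driver convention */
		uint64_t tsc_hz = 2000000000ULL;	/* hypothetical TSC */

		uint32_t loop = (apbt_freq * 1000) << 4;	/* ticks in 16 ms */
		uint64_t tsc_delta = tsc_hz / 1000 * 16;	/* cycles in 16 ms */

		unsigned int shift = 5;
		uint64_t scale = tsc_delta / (loop >> shift);
		uint64_t khz = (scale * apbt_freq * 1000) >> shift;

		/* algebraically khz == tsc_delta / 16: cycles per ms */
		printf("calibrated TSC: %llu kHz (expect 2000000)\n",
		       (unsigned long long)khz);
		return 0;
	}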
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..769321185a08 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Firmware replacement code.
*
@@ -10,25 +11,43 @@
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
+#define pr_fmt(fmt) "AGP: " fmt
+
#include <linux/kernel.h>
+#include <linux/kcore.h>
#include <linux/types.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
-#include <linux/ioport.h>
#include <linux/suspend.h>
-#include <linux/kmemleak.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd/nb.h>
#include <asm/x86_init.h>
+#include <linux/crash_dump.h>
+
+/*
+ * Using 512M as goal, in case kexec will load kernel_big
+ * that will do the in-place decompress, and could overlap with
+ * the gart aperture that is used.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area become e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ *  ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+ * So don't use 512M below as gart iommu, leave the space for kernel
+ * code for safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
int gart_iommu_aperture;
int gart_iommu_aperture_disabled __initdata;
@@ -39,29 +58,48 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-struct bus_dev_range {
- int bus;
- int dev_base;
- int dev_limit;
-};
+#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE)
+/*
+ * If the first kernel maps the aperture over e820 RAM, the kdump kernel will
+ * use the same range because it will remain configured in the northbridge.
+ * Trying to dump this area via /proc/vmcore may crash the machine, so exclude
+ * it from vmcore.
+ */
+static unsigned long aperture_pfn_start, aperture_page_count;
-static struct bus_dev_range bus_dev_ranges[] __initdata = {
- { 0x00, 0x18, 0x20},
- { 0xff, 0x00, 0x20},
- { 0xfe, 0x00, 0x20}
-};
+static int gart_mem_pfn_is_ram(unsigned long pfn)
+{
+ return likely((pfn < aperture_pfn_start) ||
+ (pfn >= aperture_pfn_start + aperture_page_count));
+}
-static struct resource gart_resource = {
- .name = "GART",
- .flags = IORESOURCE_MEM,
+#ifdef CONFIG_PROC_VMCORE
+static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn)
+{
+ return !!gart_mem_pfn_is_ram(pfn);
+}
+
+static struct vmcore_cb gart_vmcore_cb = {
+ .pfn_is_ram = gart_oldmem_pfn_is_ram,
};
+#endif
-static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
+static void __init exclude_from_core(u64 aper_base, u32 aper_order)
+{
+ aperture_pfn_start = aper_base >> PAGE_SHIFT;
+ aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
+#ifdef CONFIG_PROC_VMCORE
+ register_vmcore_cb(&gart_vmcore_cb);
+#endif
+#ifdef CONFIG_PROC_KCORE
+ WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
+}
+#else
+static void exclude_from_core(u64 aper_base, u32 aper_order)
{
- gart_resource.start = aper_base;
- gart_resource.end = aper_base + aper_size - 1;
- insert_resource(&iomem_resource, &gart_resource);
}
+#endif
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
@@ -69,7 +107,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
static u32 __init allocate_aperture(void)
{
u32 aper_size;
- void *p;
+ unsigned long addr;
/* aper_size should <= 1G */
if (fallback_aper_order > 5)
@@ -82,40 +120,19 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- /*
- * using 512M as goal, in case kexec will load kernel_big
- * that will do the on position decompress, and could overlap with
- * that positon with gart that is used.
- * sequende:
- * kernel_small
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kernel_small(gart area become e820_reserved)
- * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
- * ==> kerne_big (uncompressed size will be big than 64M or 128M)
- * so don't use 512M below as gart iommu, leave the space for kernel
- * code for safe
- */
- p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(p);
- if (!p || __pa(p)+aper_size > 0xffffffff) {
- printk(KERN_ERR
- "Cannot allocate aperture memory hole (%p,%uK)\n",
- p, aper_size>>10);
- if (p)
- free_bootmem(__pa(p), aper_size);
+ addr = memblock_phys_alloc_range(aper_size, aper_size,
+ GART_MIN_ADDR, GART_MAX_ADDR);
+ if (!addr) {
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
- printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
- aper_size >> 10, __pa(p));
- insert_aperture_resource((u32)__pa(p), aper_size);
- register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
- (u32)__pa(p+aper_size) >> PAGE_SHIFT);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
+ register_nosave_region(addr >> PAGE_SHIFT,
+ (addr+aper_size) >> PAGE_SHIFT);
- return (u32)__pa(p);
+ return (u32)addr;
}
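
The memblock call above replaces the old bootmem allocation with an explicit
physical-range request: aper_size bytes, naturally aligned (the second
argument is the alignment), somewhere in [GART_MIN_ADDR, GART_MAX_ADDR). A
standalone sketch of those constraints, assuming GART_MIN_ADDR is the same
512 MiB floor the removed __alloc_bootmem_nopanic() goal used:

    #include <stdio.h>
    #include <stdint.h>

    #define GART_MIN_ADDR (512ULL << 20)    /* assumed, defined next to GART_MAX_ADDR */
    #define GART_MAX_ADDR (1ULL << 32)

    int main(void)
    {
        for (unsigned int order = 0; order <= 5; order++) {
            uint64_t aper_size = (32ULL << 20) << order;
            /* first naturally aligned candidate at or above the floor */
            uint64_t first = (GART_MIN_ADDR + aper_size - 1) & ~(aper_size - 1);
            uint64_t last = GART_MAX_ADDR - aper_size;

            printf("order %u: %4llu MiB aperture, bases %#010llx..%#010llx\n",
                   order, (unsigned long long)(aper_size >> 20),
                   (unsigned long long)first, (unsigned long long)last);
        }
        return 0;
    }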
@@ -155,10 +172,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
u64 aper;
u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
- printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
return 0;
}
@@ -182,16 +200,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* On some sick chips, APSIZE is 0. This means it wants 4G, so
* double-check that order, and trust the AMD NB settings:
*/
- printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
- aper, 32 << old_order);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
- printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
- 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
- printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
- aper, 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
@@ -206,7 +226,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* Do a PCI bus scan by hand because we're running before the PCI
* subsystem.
*
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
* generically. It's probably overkill to always scan all slots because
* the AGP bridges should always be on their own bus in the HT hierarchy,
* but do it here for future safety.
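
Concretely, read_pci_config() and friends use the legacy type 1 configuration
mechanism (index port 0xCF8, data port 0xCFC), which needs nothing from the
PCI core. A sketch of the CONFIG_ADDRESS encoding involved, taking 00:18.3
(the first AMD NB misc device scanned below) and config offset 0x90, assumed
here to be AMD64_GARTAPERTURECTL, as the example:

    #include <stdio.h>
    #include <stdint.h>

    /* 0x80000000 enables the cycle; bus/device/function/register follow */
    static uint32_t conf_addr(int bus, int slot, int func, int reg)
    {
        return 0x80000000u | (uint32_t)bus << 16 | (uint32_t)slot << 11 |
               (uint32_t)func << 8 | (uint32_t)(reg & 0xfc);
    }

    int main(void)
    {
        printf("CONFIG_ADDRESS for 00:18.3 reg 0x90: %#010x\n",
               conf_addr(0x00, 0x18, 3, 0x90));
        return 0;
    }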
@@ -239,75 +259,74 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
order);
}
- /* No multi-function device? */
type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
break;
}
}
}
- printk(KERN_INFO "No AGP bridge found\n");
+ pr_info("No AGP bridge found\n");
return 0;
}
-static int gart_fix_e820 __initdata = 1;
+static bool gart_fix_e820 __initdata = true;
static int __init parse_gart_mem(char *p)
{
- if (!p)
- return -EINVAL;
-
- if (!strncmp(p, "off", 3))
- gart_fix_e820 = 0;
- else if (!strncmp(p, "on", 2))
- gart_fix_e820 = 1;
-
- return 0;
+ return kstrtobool(p, &gart_fix_e820);
}
early_param("gart_fix_e820", parse_gart_mem);
+/*
+ * With kexec/kdump, if the first kernel doesn't shut down the GART and the
+ * second kernel allocates a different GART region, there might be two
+ * overlapping GART regions present:
+ *
+ * - the first still used by the GART initialized in the first kernel.
+ * - (sub-)set of it used as normal RAM by the second kernel.
+ *
+ * which leads to memory corruptions and a kernel panic eventually.
+ *
+ * This can also happen if the BIOS has forgotten to mark the GART region
+ * as reserved.
+ *
+ * Try to update the e820 map to mark that new region as reserved.
+ */
void __init early_gart_iommu_check(void)
{
- /*
- * in case it is enabled before, esp for kexec/kdump,
- * previous kernel already enable that. memset called
- * by allocate_aperture/__alloc_bootmem_nopanic cause restart.
- * or second kernel have different position for GART hole. and new
- * kernel could use hole as RAM that is still used by GART set by
- * first kernel
- * or BIOS forget to put that in reserved.
- * try to update e820 to make that region as reserved.
- */
- u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 agp_aper_order = 0;
int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base = 0, last_aper_base = 0;
int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
+ if (!amd_gart_present())
+ return;
+
if (!early_pci_allowed())
return;
/* This is mostly duplicate of iommu_hole_init */
- agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+ search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- aper_enabled = ctl & AMD64_GARTEN;
+ aper_enabled = ctl & GARTEN;
aper_order = (ctl >> 1) & 7;
aper_size = (32 * 1024 * 1024) << aper_order;
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -336,12 +355,13 @@ void __init early_gart_iommu_check(void)
fix = 1;
if (gart_fix_e820 && !fix && aper_enabled) {
- if (e820_any_mapped(aper_base, aper_base + aper_size,
- E820_RAM)) {
+ if (e820__mapped_any(aper_base, aper_base + aper_size,
+ E820_TYPE_RAM)) {
/* reserve it, so we can reuse it in second kernel */
- printk(KERN_INFO "update e820 for GART\n");
- e820_add_region(aper_base, aper_size, E820_RESERVED);
- update_e820();
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
+ e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED);
+ e820__update_table_print();
}
}
@@ -349,20 +369,20 @@ void __init early_gart_iommu_check(void)
return;
/* disable them all at first */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
- ctl &= ~AMD64_GARTEN;
+ ctl &= ~GARTEN;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -379,28 +399,31 @@ void __init gart_iommu_hole_init(void)
int fix, slot, valid_agp = 0;
int i, node;
+ if (!amd_gart_present())
+ return;
+
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
return;
- printk(KERN_INFO "Checking aperture...\n");
+ pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
node = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
u32 ctl;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
iommu_detected = 1;
@@ -424,8 +447,9 @@ void __init gart_iommu_hole_init(void)
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -436,9 +460,9 @@ void __init gart_iommu_hole_init(void)
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
- printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
- printk(KERN_ERR "please increase GART size in your BIOS setup\n");
- printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
@@ -460,9 +484,12 @@ void __init gart_iommu_hole_init(void)
out:
if (!fix && !fallback_aper_force) {
if (last_aper_base) {
- unsigned long n = (32 * 1024 * 1024) << last_aper_order;
-
- insert_aperture_resource((u32)last_aper_base, n);
+ /*
+ * If this is the kdump kernel, the first kernel
+ * may have allocated the range over its e820 RAM
+ * and fixed up the northbridge
+ */
+ exclude_from_core(last_aper_base, last_aper_order);
}
return;
}
@@ -478,13 +505,10 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
- "Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
- "Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
- "This costs you %d MB of RAM\n",
- 32 << fallback_aper_order);
+ pr_info("Your BIOS doesn't leave an aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
@@ -503,22 +527,32 @@ out:
return;
}
- /* Fix up the north bridges */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
- int bus;
- int dev_base, dev_limit;
+ /*
+ * If this is the kdump kernel _and_ the first kernel did not
+ * configure the aperture in the northbridge, this range may
+ * overlap with the first kernel's memory. We can't access the
+ * range through vmcore even though it should be part of the dump.
+ */
+ exclude_from_core(aper_alloc, aper_order);
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ /* Fix up the north bridges */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = DISTLBWALKPRB | aper_order << 1;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
- /* Don't enable translation yet. That is done later.
- Assume this BIOS didn't initialise the GART so
- just overwrite all previous bits */
- write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
}
}
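
To make the final register writes concrete: with the bit layout this file
relies on (GARTEN in bit 0, the size order in bits [3:1], DISTLBWALKPRB
assumed to be bit 6, and the base register holding the physical address in
32 MiB units), here is a standalone decode of what gets programmed for a
hypothetical 64 MiB aperture at 512 MiB:

    #include <stdio.h>
    #include <stdint.h>

    #define GARTEN        (1u << 0)
    #define DISTLBWALKPRB (1u << 6)    /* assumed bit position */

    int main(void)
    {
        unsigned int aper_order = 1;           /* 64 MiB */
        uint64_t aper_alloc = 512ULL << 20;    /* hypothetical base */
        uint32_t ctl  = DISTLBWALKPRB | aper_order << 1;
        uint32_t base = (uint32_t)(aper_alloc >> 25);

        /* GARTEN stays clear: translation is enabled later, not here */
        printf("GARTAPERTURECTL  = %#010x (GARTEN=%u, order=%u -> %u MiB)\n",
               ctl, ctl & GARTEN, (ctl >> 1) & 7, 32u << ((ctl >> 1) & 7));
        printf("GARTAPERTUREBASE = %#010x (phys %#010llx)\n",
               base, (unsigned long long)((uint64_t)base << 25));
        return 0;
    }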
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..581db89477f9 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -1,19 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particular, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
+obj-y += hw_nmi.o
+
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_PCI_MSI) += msi.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
-obj-y += apic_flat_64.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
-obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
endif
-obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
-obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+# For 32bit, probe_32 needs to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a96489ee6cab..d93f87f29d03 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC handling, local APIC timers
*
@@ -18,84 +19,85 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
+#include <linux/bitmap.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/ftrace.h>
#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/timex.h>
+#include <linux/i8253.h>
#include <linux/dmar.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/dmi.h>
-#include <linux/nmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/kvm_types.h>
+#include <xen/xen.h>
+
+#include <asm/trace/irq_vectors.h>
+#include <asm/irq_remapping.h>
+#include <asm/pc-conf-reg.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
+#include <asm/barrier.h>
#include <asm/mpspec.h>
-#include <asm/i8253.h>
#include <asm/i8259.h>
#include <asm/proto.h>
+#include <asm/traps.h>
#include <asm/apic.h>
+#include <asm/acpi.h>
+#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/idle.h>
#include <asm/mtrr.h>
+#include <asm/time.h>
#include <asm/smp.h>
#include <asm/mce.h>
-#include <asm/kvm_para.h>
+#include <asm/msr.h>
#include <asm/tsc.h>
+#include <asm/hypervisor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/irq_regs.h>
+#include <asm/cpu.h>
-unsigned int num_processors;
-
-unsigned disabled_cpus __cpuinitdata;
+#include "local.h"
/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
+u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
-/*
- * The highest APIC ID seen during enumeration.
- */
-unsigned int max_physical_apicid;
+u8 boot_cpu_apic_version __ro_after_init;
/*
- * Bitmask of physically existing CPUs:
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
*/
-physid_mask_t phys_cpu_present_map;
+static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
/*
- * Map cpu index to physical APIC ID
+ * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID
*/
-DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+static bool virt_ext_dest_id __ro_after_init;
-#ifdef CONFIG_X86_32
-/*
- * Knob to control our willingness to enable the local APIC.
- *
- * +1=force-enable
- */
-static int force_enable_local_apic;
-/*
- * APIC command line parameters
- */
-static int __init parse_lapic(char *arg)
+/* For parallel bootup. */
+unsigned long apic_mmio_base __ro_after_init;
+
+static inline bool apic_accessible(void)
{
- force_enable_local_apic = 1;
- return 0;
+ return x2apic_mode || apic_mmio_base;
}
-early_param("lapic", parse_lapic);
+
+#ifdef CONFIG_X86_32
/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
+static int enabled_via_apicbase __ro_after_init;
/*
* Handle interrupt mode configuration register (IMCR).
@@ -107,103 +109,77 @@ static int enabled_via_apicbase;
*/
static inline void imcr_pic_to_apic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go through APIC */
- outb(0x01, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x01);
}
static inline void imcr_apic_to_pic(void)
{
- /* select IMCR register */
- outb(0x70, 0x22);
/* NMI and 8259 INTR go directly to BSP */
- outb(0x00, 0x23);
+ pc_conf_set(PC_CONF_MPS_IMCR, 0x00);
}
#endif
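
The pc_conf_set() helper boils down to the same two port writes the deleted
lines did by hand (and, in the kernel proper, callers serialize on a lock). A
userspace sketch with the actual port I/O stubbed out:

    #include <stdio.h>
    #include <stdint.h>

    #define PC_CONF_INDEX    0x22
    #define PC_CONF_DATA     0x23
    #define PC_CONF_MPS_IMCR 0x70

    /* stand-in for outb(); real code touches the I/O ports directly */
    static void outb_stub(uint8_t val, uint16_t port)
    {
        printf("outb(%#04x, %#06x)\n", val, port);
    }

    static void pc_conf_set(uint8_t reg, uint8_t data)
    {
        outb_stub(reg, PC_CONF_INDEX);    /* select the register */
        outb_stub(data, PC_CONF_DATA);    /* write its value */
    }

    int main(void)
    {
        pc_conf_set(PC_CONF_MPS_IMCR, 0x01);    /* route NMI/INTR via APIC */
        return 0;
    }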
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ if (IS_ENABLED(CONFIG_X86_32) && !arg)
+ force_enable_local_apic = 1;
+ else if (arg && !strncmp(arg, "notscdeadline", 13))
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return 0;
+}
+early_param("lapic", parse_lapic);
+
#ifdef CONFIG_X86_64
static int apic_calibrate_pmtmr __initdata;
static __init int setup_apicpmtimer(char *s)
{
apic_calibrate_pmtmr = 1;
notsc_setup(NULL);
- return 0;
+ return 1;
}
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-int x2apic_mode;
-#ifdef CONFIG_X86_X2APIC
-/* x2apic enabled before OS handover */
-static int x2apic_preenabled;
-static __init int setup_nox2apic(char *str)
-{
- if (x2apic_enabled()) {
- pr_warning("Bios already enabled x2apic, "
- "can't enforce nox2apic");
- return 0;
- }
-
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-#endif
-
-unsigned long mp_lapic_addr;
-int disable_apic;
+static unsigned long mp_lapic_addr __ro_after_init;
+bool apic_is_disabled __ro_after_init;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __initdata;
/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
+int local_apic_timer_c2_ok __ro_after_init;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-int first_system_vector = 0xfe;
-
/*
* Debug level, exported for io_apic.c
*/
-unsigned int apic_verbosity;
+int apic_verbosity __ro_after_init;
-int pic_mode;
+int pic_mode __ro_after_init;
/* Have we found an MP table */
-int smp_found_config;
+int smp_found_config __ro_after_init;
static struct resource lapic_resource = {
.name = "Local APIC",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
-static unsigned int calibration_result;
+/* Measured in ticks per HZ. */
+unsigned int lapic_timer_period = 0;
-static int lapic_next_event(unsigned long delta,
- struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static void lapic_timer_broadcast(const struct cpumask *mask);
static void apic_pm_activate(void);
/*
- * The local apic timer can be used for any function which is CPU local.
- */
-static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-static unsigned long apic_phys;
-
-/*
* Get the LAPIC version
*/
static inline int lapic_get_version(void)
@@ -216,11 +192,7 @@ static inline int lapic_get_version(void)
*/
static inline int lapic_is_integrated(void)
{
-#ifdef CONFIG_X86_64
- return 1;
-#else
return APIC_INTEGRATED(lapic_get_version());
-#endif
}
/*
@@ -232,6 +204,11 @@ static int modern_apic(void)
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
boot_cpu_data.x86 >= 0xf)
return 1;
+
+ /* Hygon systems use modern APIC */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return 1;
+
return lapic_get_version() >= 0x14;
}
@@ -239,38 +216,19 @@ static int modern_apic(void)
* right after this call apic become NOOP driven
* so apic->write/read doesn't do anything
*/
-void apic_disable(void)
-{
- pr_info("APIC: switched to apic NOOP\n");
- apic = &apic_noop;
-}
-
-void native_apic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 native_safe_apic_wait_icr_idle(void)
+static void __init apic_disable(void)
{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
+ apic_install_driver(&apic_noop);
}
void native_apic_icr_write(u32 low, u32 id)
{
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
+ local_irq_restore(flags);
}
u64 native_apic_icr_read(void)
@@ -284,45 +242,15 @@ u64 native_apic_icr_read(void)
}
/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
- unsigned int v;
-
- /* unmask and set to NMI */
- v = APIC_DM_NMI;
-
- /* Level triggered for 82489DX (32bit mode) */
- if (!lapic_is_integrated())
- v |= APIC_LVT_LEVEL_TRIGGER;
-
- apic_write(APIC_LVT0, v);
-}
-
-#ifdef CONFIG_X86_32
-/**
- * get_physical_broadcast - Get number of physical broadcast IDs
- */
-int get_physical_broadcast(void)
-{
- return modern_apic() ? 0xff : 0xf;
-}
-#endif
-
-/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
*/
int lapic_get_maxlvt(void)
{
- unsigned int v;
-
- v = apic_read(APIC_LVR);
/*
* - we always have APIC integrated on 64bit mode
* - 82489DXs do not report # of LVT entries
*/
- return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+ return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2;
}
/*
@@ -331,6 +259,10 @@ int lapic_get_maxlvt(void)
/* Clock divisor */
#define APIC_DIVISOR 16
+#define TSC_DIVISOR 8
+
+/* i82489DX specific */
+#define I82489DX_BASE_DIVIDER (((0x2) << 18))
/*
* This function sets up the local APIC timer, with a timeout of
@@ -349,14 +281,33 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
lvtt_value = LOCAL_TIMER_VECTOR;
if (!oneshot)
lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+
+ /*
+ * The i82489DX APIC uses bits 18 and 19 for the base divider. This
+ * overlaps with bit 18 on integrated APICs, but is not documented
+ * in the SDM. No problem though. i82489DX equipped systems do not
+ * have TSC deadline timer.
+ */
if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+ lvtt_value |= I82489DX_BASE_DIVIDER;
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
apic_write(APIC_LVTT, lvtt_value);
+ if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ /*
+ * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+ * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+ * According to Intel, MFENCE can do the serialization here.
+ */
+ asm volatile("mfence" : : : "memory");
+ return;
+ }
+
/*
* Divide PICLK by 16
*/
@@ -370,38 +321,93 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
}
/*
- * Setup extended LVT, AMD specific (K8, family 10h)
+ * Setup extended LVT, AMD specific
*
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
*
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determine the offsets, it is not
+ * necessarily a BIOS bug.
*/
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
- unsigned int v = (mask << 16) | (msg_type << 8) | vector;
-
- apic_write(reg, v);
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
}
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_MCE;
+ unsigned int rsvd, vector;
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]);
+ do {
+ vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */
+ if (vector && !eilvt_entry_is_changeable(vector, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new));
+
+ rsvd = new & ~APIC_EILVT_MASKED;
+ if (rsvd && rsvd != vector)
+ pr_info("LVT offset %d assigned for vector 0x%02x\n",
+ offset, rsvd);
+
+ return new;
}
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+/*
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
{
- setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
- return APIC_EILVT_LVTOFF_IBS;
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
+
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
+
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on another cpu\n",
+ smp_processor_id(), reg, offset, new, reserved);
+ return -EINVAL;
+ }
+
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on this cpu\n",
+ smp_processor_id(), reg, offset, new, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
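
A standalone sketch of the reservation rule encoded in
eilvt_entry_is_changeable(): an entry may be (re)written only if the old
value is masked, the new value is masked, or both carry the same payload.
The IBS-style values below are hypothetical:

    #include <stdio.h>

    #define APIC_EILVT_MASKED (1 << 16)

    static int changeable(unsigned int old, unsigned int new)
    {
        return (old & APIC_EILVT_MASKED) ||
               (new == APIC_EILVT_MASKED) ||
               ((new & ~APIC_EILVT_MASKED) == old);
    }

    int main(void)
    {
        unsigned int vec_a = 0x400 | 0xf8;    /* hypothetical NMI-type entry */
        unsigned int vec_b = 0x400 | 0xf9;    /* same offset, other vector */

        printf("%d\n", changeable(APIC_EILVT_MASKED, vec_a));  /* 1: was free */
        printf("%d\n", changeable(vec_a, vec_a));              /* 1: same vector */
        printf("%d\n", changeable(vec_a, vec_b));              /* 0: conflict */
        return 0;
    }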
/*
* Program the next event, relative to now
@@ -413,40 +419,65 @@ static int lapic_next_event(unsigned long delta,
return 0;
}
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int lapic_next_deadline(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ u64 tsc;
+
+ /* This MSR is special and need a special fence: */
+ weak_wrmsr_fence();
+
+ tsc = rdtsc();
+ wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ return 0;
+}
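
The unit conversion is worth spelling out: the deadline clockevent is
registered at tsc_khz * (1000 / TSC_DIVISOR) Hz (see setup_APIC_timer()
below), so one clockevent tick equals TSC_DIVISOR TSC cycles, and the MSR
write above scales delta back up to raw cycles. A worked example with a
hypothetical 2.8 GHz TSC:

    #include <stdio.h>
    #include <stdint.h>

    #define TSC_DIVISOR 8

    int main(void)
    {
        uint64_t tsc_khz = 2800000;    /* hypothetical 2.8 GHz TSC */
        uint64_t evt_hz  = tsc_khz * (1000 / TSC_DIVISOR);  /* 350 MHz */
        uint64_t delta   = evt_hz / 1000;    /* ticks for a 1 ms event */
        uint64_t now     = 1000000;          /* pretend rdtsc() value */

        printf("deadline MSR = %llu (now + %llu TSC cycles)\n",
               (unsigned long long)(now + delta * TSC_DIVISOR),
               (unsigned long long)(delta * TSC_DIVISOR));
        return 0;
    }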
+
+static int lapic_timer_shutdown(struct clock_event_device *evt)
{
- unsigned long flags;
unsigned int v;
/* Lapic used as dummy for broadcast ? */
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
+ return 0;
- local_irq_save(flags);
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(calibration_result,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
+ /*
+ * Setting APIC_LVT_MASKED (above) should be enough to tell
+ * the hardware that this timer will never fire. But AMD
+ * erratum 411 and some Intel CPU behavior circa 2024 say
+ * otherwise. Time for belt and suspenders programming: mask
+ * the timer _and_ zero the counter registers:
+ */
+ if (v & APIC_LVT_TIMER_TSCDEADLINE)
+ wrmsrq(MSR_IA32_TSC_DEADLINE, 0);
+ else
apic_write(APIC_TMICT, 0);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
- local_irq_restore(flags);
+ return 0;
+}
+
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ __setup_APIC_LVTT(lapic_timer_period, oneshot, 1);
+ return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, true);
}
/*
@@ -455,35 +486,147 @@ static void lapic_timer_setup(enum clock_event_mode mode,
static void lapic_timer_broadcast(const struct cpumask *mask)
{
#ifdef CONFIG_SMP
- apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+ __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
}
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_state_oneshot_stopped = lapic_timer_shutdown,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+static const struct x86_cpu_id deadline_match[] __initconst = {
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
+
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
+
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
+
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
+
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
+
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
+
+ {},
+};
+
+static __init bool apic_validate_deadline_timer(void)
+{
+ const struct x86_cpu_id *m;
+ u32 rev;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ m = x86_match_cpu(deadline_match);
+ if (!m)
+ return true;
+
+ rev = (u32)m->driver_data;
+
+ if (boot_cpu_data.microcode >= rev)
+ return true;
+
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
+ "please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
+}
+
/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
-static void __cpuinit setup_APIC_timer(void)
+static void setup_APIC_timer(void)
{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
- if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) {
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
- /* Make LAPIC timer preferrable over percpu HPET */
+ /* Make LAPIC timer preferable over percpu HPET */
lapic_clockevent.rating = 150;
}
memcpy(levt, &lapic_clockevent, sizeof(*levt));
levt->cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(levt);
+ if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->name = "lapic-deadline";
+ levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_DUMMY);
+ levt->set_next_event = lapic_next_deadline;
+ clockevents_config_and_register(levt,
+ tsc_khz * (1000 / TSC_DIVISOR),
+ 0xF, ~0UL);
+ } else
+ clockevents_register_device(levt);
+
+ apic_update_vector(smp_processor_id(), LOCAL_TIMER_VECTOR, true);
+}
+
+/*
+ * Install the updated TSC frequency from recalibration at the TSC
+ * deadline clockevent devices.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+ if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+ /*
+ * The clockevent device's ->mult and ->shift can both be
+ * changed. In order to avoid races, schedule the frequency
+ * update code on each CPU.
+ */
+ on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
}
/*
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
- * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
* frequency.
*
* This was previously done by reading the PIT/HPET and waiting for a wrap
@@ -505,20 +648,20 @@ static void __cpuinit setup_APIC_timer(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
- * Temporary interrupt handler.
+ * Temporary interrupt handler and polled calibration function.
*/
static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
- if (cpu_has_tsc)
- rdtscll(tsc);
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ tsc = rdtsc();
switch (lapic_cal_loops++) {
case 0:
@@ -540,7 +683,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -551,7 +694,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_pr_verbose("... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -561,14 +704,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ apic_pr_verbose("... PM-Timer result ok\n");
return 0;
}
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warning("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+ pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n",
+ (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -578,31 +721,111 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
*delta = (long)res;
/* Correct the tsc counter value */
- if (cpu_has_tsc) {
+ if (boot_cpu_has(X86_FEATURE_TSC)) {
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld)\n",
- (unsigned long)res, *deltatsc);
+ apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
return 0;
}
+static int __init lapic_init_clockevent(void)
+{
+ if (!lapic_timer_period)
+ return -1;
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR,
+ TICK_NSEC, lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
+
+ return 0;
+}
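
The fixed-point factor computed here follows the clockevents convention:
div_sc(a, b, shift) is (a << shift) / b, so with shift = 32 the core can
convert nanoseconds to timer ticks as (ns * mult) >> 32. A worked example,
assuming HZ=1000 (so TICK_NSEC is one million) and a hypothetical timer
period:

    #include <stdio.h>
    #include <stdint.h>

    #define APIC_DIVISOR 16
    #define TICK_NSEC    1000000    /* ns per tick for an assumed HZ=1000 */

    static uint32_t div_sc(uint32_t ticks, uint32_t nsec, int shift)
    {
        return (uint32_t)(((uint64_t)ticks << shift) / nsec);
    }

    int main(void)
    {
        uint32_t lapic_timer_period = 100000;    /* hypothetical ticks per HZ */
        uint32_t mult = div_sc(lapic_timer_period / APIC_DIVISOR,
                               TICK_NSEC, 32);
        uint64_t ns = 500000;                    /* program a 500 us event */

        printf("mult=%u -> %llu timer ticks for %llu ns\n", mult,
               (unsigned long long)((ns * mult) >> 32),
               (unsigned long long)ns);
        return 0;
    }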
+
+bool __init apic_needs_pit(void)
+{
+ /*
+ * If the frequencies are not known, PIT is required for both TSC
+ * and apic timer calibration.
+ */
+ if (!tsc_khz || !cpu_khz)
+ return true;
+
+ /* Is there an APIC at all or is it disabled? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled)
+ return true;
+
+ /*
+ * If interrupt delivery mode is legacy PIC or virtual wire without
+ * configuration, the local APIC timer won't be set up. Make sure
+ * that the PIT is initialized.
+ */
+ if (apic_intr_mode == APIC_PIC ||
+ apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG)
+ return true;
+
+ /* Virt guests may lack ARAT, but still have DEADLINE */
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ return true;
+
+ /* Deadline timer is based on TSC so no further PIT action required */
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+
+ /* APIC timer disabled? */
+ if (disable_apic_timer)
+ return true;
+ /*
+ * The APIC timer frequency is known already, no PIT calibration
+ * required. If unknown, let the PIT be initialized.
+ */
+ return lapic_timer_period == 0;
+}
+
static int __init calibrate_APIC_clock(void)
{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- void (*real_handler)(struct clock_event_device *dev);
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+ u64 tsc_perj = 0, tsc_start = 0;
+ long delta_tsc_khz, bus_khz;
+ unsigned long jif_start;
unsigned long deltaj;
long delta, deltatsc;
int pm_referenced = 0;
- local_irq_disable();
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return 0;
- /* Replace the global interrupt handler */
- real_handler = global_clock_event->event_handler;
- global_clock_event->event_handler = lapic_cal_handler;
+ /*
+ * Check if lapic timer has already been calibrated by platform
+ * specific routine, such as tsc calibration code. If so just fill
+ * in the clockevent structure and return.
+ */
+ if (!lapic_init_clockevent()) {
+ apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period);
+ /*
+ * Direct calibration methods must have an always running
+ * local APIC timer, no need for broadcast timer.
+ */
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ return 0;
+ }
+
+ apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n");
+
+ /*
+ * There are platforms w/o global clockevent devices. Instead of
+ * making the calibration conditional on that, use a polling based
+ * approach everywhere.
+ */
+ local_irq_disable();
/*
* Setup the APIC counter to maximum. There is no way the lapic
@@ -610,20 +833,55 @@ static int __init calibrate_APIC_clock(void)
*/
__setup_APIC_LVTT(0xffffffff, 0, 0);
- /* Let the interrupts run */
+ /*
+ * Methods to terminate the calibration loop:
+ * 1) Global clockevent if available (jiffies)
+ * 2) TSC if available and frequency is known
+ */
+ jif_start = READ_ONCE(jiffies);
+
+ if (tsc_khz) {
+ tsc_start = rdtsc();
+ tsc_perj = div_u64((u64)tsc_khz * 1000, HZ);
+ }
+
+ /*
+ * Enable interrupts so the tick can fire, if a global
+ * clockevent device is available
+ */
local_irq_enable();
- while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
- cpu_relax();
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) {
+ /* Wait for a tick to elapse */
+ while (1) {
+ if (tsc_khz) {
+ u64 tsc_now = rdtsc();
+ if ((tsc_now - tsc_start) >= tsc_perj) {
+ tsc_start += tsc_perj;
+ break;
+ }
+ } else {
+ unsigned long jif_now = READ_ONCE(jiffies);
- local_irq_disable();
+ if (time_after(jif_now, jif_start)) {
+ jif_start = jif_now;
+ break;
+ }
+ }
+ cpu_relax();
+ }
- /* Restore the real event handler */
- global_clock_event->event_handler = real_handler;
+ /* Invoke the calibration routine */
+ local_irq_disable();
+ lapic_cal_handler(NULL);
+ local_irq_enable();
+ }
+
+ local_irq_disable();
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
- apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ apic_pr_verbose("... lapic delta = %ld\n", delta);
deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
@@ -631,56 +889,48 @@ static int __init calibrate_APIC_clock(void)
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
&delta, &deltatsc);
- /* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
- lapic_clockevent.shift);
- lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
- lapic_clockevent.min_delta_ns =
- clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ lapic_init_clockevent();
- calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ apic_pr_verbose("..... delta %ld\n", delta);
+ apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult);
+ apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
- apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
- apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
- apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- calibration_result);
+ if (boot_cpu_has(X86_FEATURE_TSC)) {
+ delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS);
- if (cpu_has_tsc) {
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n",
+ delta_tsc_khz / 1000, delta_tsc_khz % 1000);
}
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%u.%04u MHz.\n",
- calibration_result / (1000000 / HZ),
- calibration_result % (1000000 / HZ));
+ bus_khz = (long)lapic_timer_period * HZ / 1000;
+ apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n",
+ bus_khz / 1000, bus_khz % 1000);
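
A worked example of the calibration arithmetic: delta is how far the APIC
count dropped across LAPIC_CAL_LOOPS ticks with the divider at APIC_DIVISOR,
so scaling it back up yields ticks per HZ, and from there the bus clock. The
numbers are hypothetical:

    #include <stdio.h>

    #define APIC_DIVISOR    16
    #define HZ              1000        /* assumed config */
    #define LAPIC_CAL_LOOPS (HZ / 10)   /* as defined earlier in apic.c */

    int main(void)
    {
        long delta = 625000;    /* hypothetical APIC count drop */
        unsigned int lapic_timer_period =
            (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
        long bus_khz = (long)lapic_timer_period * HZ / 1000;

        printf("%u ticks per HZ -> host bus %ld.%03ld MHz\n",
               lapic_timer_period, bus_khz / 1000, bus_khz % 1000);
        return 0;
    }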
/*
* Do a sanity check on the APIC calibration result
*/
- if (calibration_result < (1000000 / HZ)) {
+ if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
- pr_warning("APIC frequency too slow, disabling apic timer\n");
+ pr_warn("APIC frequency too slow, disabling apic timer\n");
return -1;
}
levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
/*
- * PM timer calibration failed or not turned on
- * so lets try APIC timer based calibration
+ * PM timer calibration failed or not turned on so lets try APIC
+ * timer based calibration, if a global clockevent device is
+ * available.
*/
- if (!pm_referenced) {
- apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+ if (!pm_referenced && global_clock_event) {
+ apic_pr_verbose("... verify APIC timer\n");
/*
* Setup the apic timer manually
*/
levt->event_handler = lapic_cal_handler;
- lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_timer_set_periodic(levt);
lapic_cal_loops = -1;
/* Let the interrupts run */
@@ -690,23 +940,24 @@ static int __init calibrate_APIC_clock(void)
cpu_relax();
/* Stop the lapic timer */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+ local_irq_disable();
+ lapic_timer_shutdown(levt);
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
- apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+ apic_pr_verbose("... jiffies delta = %lu\n", deltaj);
/* Check, if the jiffies result is consistent */
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
- apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ apic_pr_verbose("... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
- pr_warning("APIC timer disabled due to verification failure\n");
- return -1;
+ pr_warn("APIC timer disabled due to verification failure\n");
+ return -1;
}
return 0;
@@ -735,9 +986,6 @@ void __init setup_boot_APIC_clock(void)
return;
}
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
-
if (calibrate_APIC_clock()) {
/* No broadcast on UP ! */
if (num_possible_cpus() > 1)
@@ -750,19 +998,17 @@ void __init setup_boot_APIC_clock(void)
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- pr_warning("APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
-void __cpuinit setup_secondary_APIC_clock(void)
+void setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
/*
@@ -770,8 +1016,7 @@ void __cpuinit setup_secondary_APIC_clock(void)
*/
static void local_apic_timer_interrupt(void)
{
- int cpu = smp_processor_id();
- struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+ struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
/*
* Normally we should not be here till LAPIC has been initialized but
@@ -785,9 +1030,10 @@ static void local_apic_timer_interrupt(void)
* spurious.
*/
if (!evt->event_handler) {
- pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ pr_warn("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
/* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ lapic_timer_shutdown(evt);
return;
}
@@ -807,33 +1053,18 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- exit_idle();
- irq_enter();
+ apic_eoi();
+ trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
- irq_exit();
+ trace_local_timer_exit(LOCAL_TIMER_VECTOR);
set_irq_regs(old_regs);
}
-int setup_profiling_timer(unsigned int multiplier)
-{
- return -EINVAL;
-}
-
/*
* Local APIC start and shutdown
*/
@@ -850,8 +1081,7 @@ void clear_local_APIC(void)
int maxlvt;
u32 v;
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
+ if (!apic_accessible())
return;
maxlvt = lapic_get_maxlvt();
@@ -914,25 +1144,40 @@ void clear_local_APIC(void)
}
/**
- * disable_local_APIC - clear and disable the local APIC
+ * apic_soft_disable - Clears and software disables the local APIC on hotplug
+ *
+ * Contrary to disable_local_APIC() this does not touch the enable bit in
+ * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC
+ * bus would require a hardware reset as the APIC would lose track of bus
+ * arbitration. On systems with FSB delivery APICBASE could be disabled,
+ * but it has to be guaranteed that no interrupt is sent to the APIC while
+ * in that state and it's not clear from the SDM whether it still responds
+ * to INIT/SIPI messages. Stay on the safe side and use software disable.
*/
-void disable_local_APIC(void)
+void apic_soft_disable(void)
{
- unsigned int value;
-
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
- return;
+ u32 value;
clear_local_APIC();
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
+ /* Soft disable APIC (implies clearing of registers for 82489DX!). */
value = apic_read(APIC_SPIV);
value &= ~APIC_SPIV_APIC_ENABLED;
apic_write(APIC_SPIV, value);
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ if (!apic_accessible())
+ return;
+
+ if (apic->teardown)
+ apic->teardown();
+
+ apic_soft_disable();
#ifdef CONFIG_X86_32
/*
@@ -959,7 +1204,7 @@ void lapic_shutdown(void)
{
unsigned long flags;
- if (!cpu_has_apic && !apic_from_smp_config())
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
return;
local_irq_save(flags);
@@ -975,67 +1220,6 @@ void lapic_shutdown(void)
local_irq_restore(flags);
}
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
- reg1 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ apic->apic_id_mask))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
/**
* sync_Arb_IDs - synchronize APIC bus arbitration IDs
*/
@@ -1053,9 +1237,71 @@ void __init sync_Arb_IDs(void)
*/
apic_wait_icr_idle();
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_pr_debug("Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
+}
+
+enum apic_intr_mode_id apic_intr_mode __ro_after_init;
+
+static int __init __apic_intr_mode_select(void)
+{
+ /* Check kernel option */
+ if (apic_is_disabled) {
+ pr_info("APIC disabled via kernel command line\n");
+ return APIC_PIC;
+ }
+
+ /* Check BIOS */
+#ifdef CONFIG_X86_64
+ /* On 64-bit, the APIC must be integrated; check the local APIC only */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ apic_is_disabled = true;
+ pr_info("APIC disabled by BIOS\n");
+ return APIC_PIC;
+ }
+#else
+ /* On 32-bit, the APIC may be integrated APIC or 82489DX */
+
+ /* Neither 82489DX nor integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
+ apic_is_disabled = true;
+ return APIC_PIC;
+ }
+
+ /* If the BIOS pretends there is an integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) &&
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
+ apic_is_disabled = true;
+ pr_err(FW_BUG "Local APIC not detected, force emulation\n");
+ return APIC_PIC;
+ }
+#endif
+
+ /* Check MP table or ACPI MADT configuration */
+ if (!smp_found_config) {
+ disable_ioapic_support();
+ if (!acpi_lapic) {
+ pr_info("APIC: ACPI MADT or MP tables are not detected\n");
+ return APIC_VIRTUAL_WIRE_NO_CONFIG;
+ }
+ return APIC_VIRTUAL_WIRE;
+ }
+
+#ifdef CONFIG_SMP
+ /* If SMP should be disabled, then really disable it! */
+ if (!setup_max_cpus) {
+ pr_info("APIC: SMP mode deactivated\n");
+ return APIC_SYMMETRIC_IO_NO_ROUTING;
+ }
+#endif
+
+ return APIC_SYMMETRIC_IO;
+}
+
+/* Select the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_select(void)
+{
+ apic_intr_mode = __apic_intr_mode_select();
}
/*
@@ -1069,7 +1315,7 @@ void __init init_bsp_APIC(void)
* Don't do the setup now if we have a SMP BIOS as the
* through-I/O-APIC virtual wire mode might be active.
*/
- if (smp_found_config || !cpu_has_apic)
+ if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
return;
/*
@@ -1102,10 +1348,46 @@ void __init init_bsp_APIC(void)
value = APIC_DM_NMI;
if (!lapic_is_integrated()) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE)
+ value |= APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
}
-static void __cpuinit lapic_setup_esr(void)
+static void __init apic_bsp_setup(bool upmode);
+
+/* Init the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_init(void)
+{
+ bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ pr_info("APIC: Keep in PIC mode(8259)\n");
+ return;
+ case APIC_VIRTUAL_WIRE:
+ pr_info("APIC: Switch to virtual wire mode setup\n");
+ break;
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
+ upmode = true;
+ break;
+ case APIC_SYMMETRIC_IO:
+ pr_info("APIC: Switch to symmetric I/O mode setup\n");
+ break;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
+ break;
+ }
+
+ x86_64_probe_apic();
+
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
+
+ apic_bsp_setup(upmode);
+}
+
+static void lapic_setup_esr(void)
{
unsigned int oldvalue, value, maxlvt;
@@ -1140,31 +1422,105 @@ static void __cpuinit lapic_setup_esr(void)
if (maxlvt > 3)
apic_write(APIC_ESR, 0);
value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08x after: 0x%08x\n",
- oldvalue, value);
+ if (value != oldvalue) {
+ apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+ }
+}
+
+#define APIC_IR_REGS APIC_ISR_NR
+#define APIC_IR_BITS (APIC_IR_REGS * 32)
+#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG)
+
+union apic_ir {
+ unsigned long map[APIC_IR_MAPSIZE];
+ u32 regs[APIC_IR_REGS];
+};
+
+static bool apic_check_and_eoi_isr(union apic_ir *isr)
+{
+ int i, bit;
+
+ /* Read the ISRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ /* If the ISR map is empty, nothing to do here. */
+ if (bitmap_empty(isr->map, APIC_IR_BITS))
+ return true;
+
+ /*
+ * There can be multiple ISR bits set when a high priority
+ * interrupt preempted a lower priority one. Issue an EOI for each
+ * set bit. The priority traversal order does not matter as there
+ * can't be new ISR bits raised at this point. What matters is that
+ * an EOI is issued for each ISR bit.
+ */
+ for_each_set_bit(bit, isr->map, APIC_IR_BITS)
+ apic_eoi();
+
+ /* Reread the ISRs, they should be empty now */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ return bitmap_empty(isr->map, APIC_IR_BITS);
}
+/*
+ * If a CPU services an interrupt and crashes before issuing EOI to the
+ * local APIC, the corresponding ISR bit is still set when the crashing CPU
+ * jumps into a crash kernel. Read the ISR and issue an EOI for each set
+ * bit to acknowledge it as otherwise these slots would be locked forever
+ * waiting for an EOI.
+ *
+ * If there are pending bits in the IRR, then they won't be converted into
+ * ISR bits as the CPU has interrupts disabled. They will be delivered once
+ * the CPU enables interrupts and there is nothing which can prevent that.
+ *
+ * In the worst case this results in spurious interrupt warnings.
+ */
+static void apic_clear_isr(void)
+{
+ union apic_ir ir;
+ unsigned int i;
+
+ if (!apic_check_and_eoi_isr(&ir))
+ pr_warn("APIC: Stale ISR: %256pb\n", ir.map);
+
+ for (i = 0; i < APIC_IR_REGS; i++)
+ ir.regs[i] = apic_read(APIC_IRR + i * 0x10);
+
+ if (!bitmap_empty(ir.map, APIC_IR_BITS))
+ pr_warn("APIC: Stale IRR: %256pb\n", ir.map);
+}
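[Editor's note: the union above overlays the eight 32-bit ISR/IRR snapshots on an unsigned-long array so the generic bitmap helpers can scan them as one flat bitmap. A minimal userspace sketch of the same trick, assuming the x86 layout of eight 32-bit in-service registers; the kernel constants are replaced with local ones and this is illustrative only:]

    #include <limits.h>
    #include <stdio.h>

    #define IR_REGS    8                                /* APIC_ISR_NR on x86 */
    #define IR_BITS    (IR_REGS * 32)
    #define LONG_BITS  (sizeof(unsigned long) * CHAR_BIT)

    union ir {
        unsigned long map[IR_BITS / LONG_BITS];
        unsigned int  regs[IR_REGS];
    };

    int main(void)
    {
        union ir isr = { .map = { 0 } };
        int bit;

        /* Pretend vector 0x25 is in service: register 1, bit 5 */
        isr.regs[0x25 / 32] = 1u << (0x25 % 32);

        /* Scan the same storage as one bitmap, like for_each_set_bit() */
        for (bit = 0; bit < IR_BITS; bit++) {
            if (isr.map[bit / LONG_BITS] & (1ul << (bit % LONG_BITS)))
                printf("vector 0x%02x needs an EOI\n", bit);
        }
        return 0;
    }

[The overlay relies on x86 being little-endian; register n, bit m lands on flat bit 32*n + m, which is exactly the vector number.]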
/**
* setup_local_APIC - setup the local APIC
+ *
+ * Used to set up the local APIC while initializing the BSP or bringing up APs.
+ * Always called with preemption disabled.
*/
-void __cpuinit setup_local_APIC(void)
+static void setup_local_APIC(void)
{
- unsigned int value, queued;
- int i, j, acked = 0;
- unsigned long long tsc = 0, ntsc;
- long long max_loops = cpu_khz;
-
- if (cpu_has_tsc)
- rdtscll(tsc);
+ int cpu = smp_processor_id();
+ unsigned int value;
- if (disable_apic) {
- arch_disable_smp_support();
+ if (apic_is_disabled) {
+ disable_ioapic_support();
return;
}
+ if (apic->setup)
+ apic->setup();
+
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
#ifdef CONFIG_X86_32
/* Pound the ESR really hard over the head with a big hammer - mbligh */
if (lapic_is_integrated() && apic->disable_esr) {
@@ -1174,68 +1530,28 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- perf_events_lapic_init();
-
- preempt_disable();
-
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- BUG_ON(!apic->apic_id_registered());
-
/*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
+ * document number 292116).
+ *
+ * Except for APICs which operate in physical destination mode.
*/
- apic->init_apic_ldr();
+ if (apic->init_apic_ldr)
+ apic->init_apic_ldr();
/*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
+ * Set Task Priority to 'accept all except vectors 0-31'. An APIC
+ * vector in the 16-31 range could be delivered if TPR == 0, but we
+ * would think it's an exception and terrible things will happen. We
+ * never change this later on.
*/
value = apic_read(APIC_TASKPRI);
value &= ~APIC_TPRI_MASK;
+ value |= 0x10;
apic_write(APIC_TASKPRI, value);
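[Editor's note: a quick illustration of the TPR arithmetic behind the 0x10 above (a sketch, not kernel code): the task-priority class is the upper nibble of TPR, and fixed interrupts whose vector class is at or below it are held back, which is what keeps vectors 0-31 out:]

    #include <assert.h>

    /* With TPR = 0x10 the priority class is 1, so vector classes 0 and 1
     * -- i.e. vectors 0x00-0x1f, the exception range -- cannot be taken. */
    static int vector_inhibited(unsigned int tpr, unsigned int vector)
    {
        return (vector >> 4) <= (tpr >> 4);
    }

    int main(void)
    {
        assert(vector_inhibited(0x10, 0x1f));   /* exception range blocked */
        assert(!vector_inhibited(0x10, 0x20));  /* first usable vector ok */
        return 0;
    }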
- /*
- * After a crash, we no longer service the interrupts and a pending
- * interrupt from previous kernel might still have ISR bit set.
- *
- * Most probably by now CPU has serviced that pending interrupt and
- * it might not have done the ack_APIC_irq() because it thought,
- * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
- * does not clear the ISR bit and cpu thinks it has already serivced
- * the interrupt. Hence a vector might get locked. It was noticed
- * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
- */
- do {
- queued = 0;
- for (i = APIC_ISR_NR - 1; i >= 0; i--)
- queued |= apic_read(APIC_IRR + i*0x10);
-
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j)) {
- ack_APIC_irq();
- acked++;
- }
- }
- }
- if (acked > 256) {
- printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
- acked);
- break;
- }
- if (cpu_has_tsc) {
- rdtscll(ntsc);
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
- } while (queued && max_loops > 0);
- WARN_ON(max_loops <= 0);
+ apic_clear_isr();
/*
* Now that we are all set up, enable the APIC
@@ -1263,9 +1579,8 @@ void __cpuinit setup_local_APIC(void)
*/
/*
* Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
+ * frequent as it makes the interrupt distribution model be more
* like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
*/
/*
@@ -1282,10 +1597,12 @@ void __cpuinit setup_local_APIC(void)
value |= SPURIOUS_APIC_VECTOR;
apic_write(APIC_SPIV, value);
+ perf_events_lapic_init();
+
/*
* Set up LVT0, LVT1:
*
- * set up through-local-APIC on the BP's LINT0. This is not
+ * set up through-local-APIC on the boot CPU's LINT0. This is not
* strictly necessary in pure symmetric-IO mode, but sometimes
* we delegate interrupts to the 8259A.
*/
@@ -1293,38 +1610,38 @@ void __cpuinit setup_local_APIC(void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
+ if (!cpu && (pic_mode || !value || ioapic_is_disabled)) {
value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
+ apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu);
} else {
value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
+ apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu);
}
apic_write(APIC_LVT0, value);
/*
- * only the BP should see the LINT1 NMI signal, obviously.
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
*/
- if (!smp_processor_id())
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!lapic_is_integrated()) /* 82489DX */
+
+ /* Is this an 82489DX? */
+ if (!lapic_is_integrated())
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
- preempt_enable();
-
#ifdef CONFIG_X86_MCE_INTEL
/* Recheck CMCI information after local APIC is up on CPU #0 */
- if (smp_processor_id() == 0)
+ if (!cpu)
cmci_recheck();
#endif
}
-void __cpuinit end_local_APIC_setup(void)
+static void end_local_APIC_setup(void)
{
lapic_setup_esr();
@@ -1338,129 +1655,281 @@ void __cpuinit end_local_APIC_setup(void)
}
#endif
- setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
+/*
+ * APIC setup function for application processors. Called from smpboot.c
+ */
+void apic_ap_setup(void)
+{
+ setup_local_APIC();
+ end_local_APIC_setup();
+}
+
+static __init void apic_read_boot_cpu_id(bool x2apic)
+{
+ /*
+ * This can be invoked from check_x2apic() before the APIC has been
+ * selected. But that code knows for sure that the BIOS enabled
+ * X2APIC.
+ */
+ if (x2apic) {
+ boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID);
+ boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR));
+ } else {
+ boot_cpu_physical_apicid = read_apic_id();
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+ topology_register_boot_apic(boot_cpu_physical_apicid);
+}
+
#ifdef CONFIG_X86_X2APIC
-void check_x2apic(void)
+int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
+
+enum {
+ X2APIC_OFF,
+ X2APIC_DISABLED,
+ /* All states below here have X2APIC enabled */
+ X2APIC_ON,
+ X2APIC_ON_LOCKED
+};
+static int x2apic_state;
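[Editor's note: the ordering of the enum above is deliberate -- every state from X2APIC_ON upwards means the hardware is enabled, so later checks need only a single comparison. A small sketch of the idiom:]

    #include <assert.h>

    enum { X2OFF, X2DISABLED, X2ON, X2ON_LOCKED };  /* mirrors the states above */

    /* Every "x2apic_state < X2APIC_ON" test in the patch reduces to this */
    static int x2apic_active(int state)
    {
        return state >= X2ON;       /* true only for X2ON and X2ON_LOCKED */
    }

    int main(void)
    {
        assert(!x2apic_active(X2DISABLED));
        assert(x2apic_active(X2ON_LOCKED));
        return 0;
    }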
+
+static bool x2apic_hw_locked(void)
{
- if (x2apic_enabled()) {
- pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic_mode = 1;
+ u64 x86_arch_cap_msr;
+ u64 msr;
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
+ rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
+ return (msr & LEGACY_XAPIC_DISABLED);
}
+ return false;
}
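[Editor's note: the lock test above combines two MSRs. An illustrative sketch with the MSR reads stubbed out; the bit positions reflect my reading of msr-index.h and should be treated as assumptions:]

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define ARCH_CAP_XAPIC_DISABLE (1ull << 21)  /* assumed: ARCH_CAPABILITIES bit 21 */
    #define LEGACY_XAPIC_DISABLED  (1ull << 0)   /* assumed: XAPIC_DISABLE_STATUS bit 0 */

    /* Locked when the capability is advertised AND legacy xAPIC is fused off */
    static bool x2apic_locked(uint64_t arch_cap, uint64_t xapic_status)
    {
        if (!(arch_cap & ARCH_CAP_XAPIC_DISABLE))
            return false;
        return xapic_status & LEGACY_XAPIC_DISABLED;
    }

    int main(void)
    {
        /* Capability absent -> never locked, regardless of status bits */
        assert(!x2apic_locked(0, LEGACY_XAPIC_DISABLED));
        /* Capability present and legacy xAPIC fused off -> locked */
        assert(x2apic_locked(ARCH_CAP_XAPIC_DISABLE, LEGACY_XAPIC_DISABLED));
        return 0;
    }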
-void enable_x2apic(void)
+static void __x2apic_disable(void)
{
- int msr, msr2;
+ u64 msr;
- if (!x2apic_mode)
+ if (!boot_cpu_has(X86_FEATURE_APIC))
return;
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
- if (!(msr & X2APIC_ENABLE)) {
- printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
- }
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ if (!(msr & X2APIC_ENABLE))
+ return;
+ /* Disable xapic and x2apic first and then reenable xapic mode */
+ wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic disabled\n");
}
-#endif /* CONFIG_X86_X2APIC */
-int __init enable_IR(void)
+static void __x2apic_enable(void)
{
-#ifdef CONFIG_INTR_REMAP
- if (!intr_remapping_supported()) {
- pr_debug("intr-remapping not supported\n");
- return 0;
+ u64 msr;
+
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE)
+ return;
+ wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic enabled\n");
+}
+
+static int __init setup_nox2apic(char *str)
+{
+ if (x2apic_enabled()) {
+ u32 apicid = native_apic_msr_read(APIC_ID);
+
+ if (apicid >= 255) {
+ pr_warn("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+ if (x2apic_hw_locked()) {
+ pr_warn("APIC locked in x2apic mode, can't disable\n");
+ return 0;
+ }
+ pr_warn("x2apic already enabled.\n");
+ __x2apic_disable();
}
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling intr-remap because of skipping "
- "io-apic setup\n");
- return 0;
+/* Called from cpu_init() to enable x2apic on (secondary) cpus */
+void x2apic_setup(void)
+{
+ /*
+ * Try to make the AP's APIC state match that of the BSP, but if the
+ * BSP is unlocked and the AP is locked then there is a state mismatch.
+ * Warn about the mismatch in case a GP fault occurs due to a locked AP
+ * trying to be turned off.
+ */
+ if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked())
+ pr_warn("x2apic lock mismatch between BSP and AP.\n");
+ /*
+ * If x2apic is not in ON or LOCKED state, disable it if already enabled
+ * from BIOS.
+ */
+ if (x2apic_state < X2APIC_ON) {
+ __x2apic_disable();
+ return;
}
+ __x2apic_enable();
+}
- if (enable_intr_remapping(x2apic_supported()))
- return 0;
+static __init void apic_set_fixmap(bool read_apic);
- pr_info("Enabled Interrupt-remapping\n");
+static __init void x2apic_disable(void)
+{
+ u32 x2apic_id;
- return 1;
+ if (x2apic_state < X2APIC_ON)
+ return;
-#endif
- return 0;
+ x2apic_id = read_apic_id();
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ if (x2apic_hw_locked()) {
+ pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id);
+ return;
+ }
+
+ __x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
+ /*
+ * Don't reread the APIC ID as it was already done from
+ * check_x2apic() and the APIC driver still is a x2APIC variant,
+ * which fails to do the read after x2APIC was disabled.
+ */
+ apic_set_fixmap(false);
}
-void __init enable_IR_x2apic(void)
+static __init void x2apic_enable(void)
{
- unsigned long flags;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
- int ret, x2apic_enabled = 0;
- int dmar_table_init_ret;
-
- dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret && !x2apic_supported())
+ if (x2apic_state != X2APIC_OFF)
return;
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- pr_err("Allocate ioapic_entries failed\n");
- goto out;
- }
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ __x2apic_enable();
+}
- ret = save_IO_APIC_setup(ioapic_entries);
- if (ret) {
- pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto out;
- }
+static __init void try_to_enable_x2apic(int remap_mode)
+{
+ if (x2apic_state == X2APIC_DISABLED)
+ return;
- local_irq_save(flags);
- legacy_pic->mask_all();
- mask_IO_APIC_setup(ioapic_entries);
+ if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
+ u32 apic_limit = 255;
- if (dmar_table_init_ret)
- ret = 0;
- else
- ret = enable_IR();
+ /*
+ * Using X2APIC without IR is not architecturally supported
+ * on bare metal but may be supported in guests.
+ */
+ if (!x86_init.hyper.x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+ }
- if (!ret) {
- /* IR is required if there is APIC ID > 255 even when running
- * under KVM
+ /*
+ * If the hypervisor supports extended destination ID in
+ * MSI, that increases the maximum APIC ID that can be
+ * used for non-remapped IRQ domains.
*/
- if (max_physical_apicid > 255 || !kvm_para_available())
- goto nox2apic;
+ if (x86_init.hyper.msi_ext_dest_id()) {
+ virt_ext_dest_id = 1;
+ apic_limit = 32767;
+ }
+
/*
- * without IR all CPUs can be addressed by IOAPIC/MSI
- * only in physical mode
+ * Without IR, all CPUs can be addressed by IOAPIC/MSI only
+ * in physical mode, and CPUs with an APIC ID that cannot
+ * be addressed must not be brought online.
*/
- x2apic_force_phys();
+ x2apic_set_max_apicid(apic_limit);
+ x2apic_phys = 1;
}
+ x2apic_enable();
+}
- x2apic_enabled = 1;
-
- if (x2apic_supported() && !x2apic_mode) {
+void __init check_x2apic(void)
+{
+ if (x2apic_enabled()) {
+ pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
x2apic_mode = 1;
- enable_x2apic();
- pr_info("Enabled x2apic\n");
+ if (x2apic_hw_locked())
+ x2apic_state = X2APIC_ON_LOCKED;
+ else
+ x2apic_state = X2APIC_ON;
+ apic_read_boot_cpu_id(true);
+ } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
+ x2apic_state = X2APIC_DISABLED;
}
+}
+#else /* CONFIG_X86_X2APIC */
+void __init check_x2apic(void)
+{
+ if (!apic_is_x2apic_enabled())
+ return;
+ /*
+ * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC?
+ */
+ pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n");
+ pr_err("Disabling APIC, expect reduced performance and functionality.\n");
-nox2apic:
- if (!ret) /* IR enabling failed */
- restore_IO_APIC_setup(ioapic_entries);
- legacy_pic->restore_mask();
- local_irq_restore(flags);
+ apic_is_disabled = true;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+}
-out:
- if (ioapic_entries)
- free_ioapic_entries(ioapic_entries);
+static inline void try_to_enable_x2apic(int remap_mode) { }
+static inline void __x2apic_enable(void) { }
+#endif /* !CONFIG_X86_X2APIC */
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ int ret, ir_stat;
- if (x2apic_enabled)
+ if (ioapic_is_disabled) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
+ return;
+ }
+
+ ir_stat = irq_remapping_prepare();
+ if (ir_stat < 0 && !x2apic_supported())
+ return;
+
+ ret = save_ioapic_entries();
+ if (ret) {
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
return;
+ }
- if (x2apic_preenabled)
- panic("x2apic: enabled by BIOS but kernel init failed.");
- else if (cpu_has_x2apic)
- pr_info("Not enabling x2apic, Intr-remapping init failed.\n");
+ local_irq_save(flags);
+ legacy_pic->mask_all();
+ mask_ioapic_entries();
+
+ /* If irq_remapping_prepare() succeeded, try to enable it */
+ if (ir_stat >= 0)
+ ir_stat = irq_remapping_enable();
+ /* ir_stat contains the remap mode or an error code */
+ try_to_enable_x2apic(ir_stat);
+
+ if (ir_stat < 0)
+ restore_ioapic_entries();
+ legacy_pic->restore_mask();
+ local_irq_restore(flags);
}
#ifdef CONFIG_X86_64
@@ -1470,27 +1939,78 @@ out:
* On AMD64 we trust the BIOS - if it says no APIC it is likely
* not correctly set up (usually the APIC timer won't work etc.)
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
- if (!cpu_has_apic) {
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
pr_info("No local APIC present\n");
- return -1;
+ return false;
}
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- return 0;
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
+ return true;
}
#else
+
+static bool __init apic_verify(unsigned long addr)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warn("Could not enable APIC!\n");
+ return false;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+
+ /* The BIOS may have set up the APIC at some other address */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ addr = l & MSR_IA32_APICBASE_BASE;
+ }
+
+ register_lapic_address(addr);
+ pr_info("Found and enabled local APIC!\n");
+ return true;
+}
+
+bool __init apic_force_enable(unsigned long addr)
+{
+ u32 h, l;
+
+ if (apic_is_disabled)
+ return false;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ }
+ return apic_verify(addr);
+}
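[Editor's note: a small sketch of the APIC_BASE MSR fields that apic_force_enable() pokes, using the standard layout (bit 11 = global enable, bits 35:12 = base address); the sample value is the architectural default base:]

    #include <assert.h>
    #include <stdint.h>

    #define APICBASE_ENABLE  (1u << 11)
    #define APICBASE_BASE    0xfffff000u     /* low 32 bits of the base mask */
    #define DEFAULT_BASE     0xfee00000u     /* APIC_DEFAULT_PHYS_BASE */

    int main(void)
    {
        uint32_t l = 0;                      /* BIOS left the APIC disabled */

        if (!(l & APICBASE_ENABLE)) {        /* same test as the code above */
            l &= ~APICBASE_BASE;
            l |= APICBASE_ENABLE | DEFAULT_BASE;
        }
        assert((l & APICBASE_BASE) == DEFAULT_BASE);
        assert(l & APICBASE_ENABLE);
        return 0;
    }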
+
/*
* Detect and initialize APIC
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
- u32 h, l, features;
-
/* Disabled by kernel option? */
- if (disable_apic)
- return -1;
+ if (apic_is_disabled)
+ return false;
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -1498,16 +2018,18 @@ static int __init detect_init_APIC(void)
(boot_cpu_data.x86 >= 15))
break;
goto no_apic;
+ case X86_VENDOR_HYGON:
+ break;
case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && cpu_has_apic))
+ if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) ||
+ boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO)
break;
goto no_apic;
default:
goto no_apic;
}
- if (!cpu_has_apic) {
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
/*
* Over-ride BIOS and try to enable the local APIC only if
* "lapic" specified.
@@ -1515,70 +2037,22 @@ static int __init detect_init_APIC(void)
if (!force_enable_local_apic) {
pr_info("Local APIC disabled by BIOS -- "
"you can enable it with \"lapic\"\n");
- return -1;
- }
- /*
- * Some BIOSes disable the local APIC in the APIC_BASE
- * MSR. This can only be done in software for Intel P6 or later
- * and AMD K7 (Model > 1) or later.
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
+ return false;
}
+ if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return false;
+ } else {
+ if (!apic_verify(APIC_DEFAULT_PHYS_BASE))
+ return false;
}
- /*
- * The APIC feature bit should now be enabled
- * in `cpuid'
- */
- features = cpuid_edx(1);
- if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
- return -1;
- }
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
- pr_info("Found and enabled local APIC!\n");
apic_pm_activate();
- return 0;
+ return true;
no_apic:
pr_info("No local APIC present or hardware disabled\n");
- return -1;
-}
-#endif
-
-#ifdef CONFIG_X86_64
-void __init early_init_lapic_mapping(void)
-{
- /*
- * If no local APIC can be found then go out
- * : it means there is no mpatable and MADT
- */
- if (!smp_found_config)
- return;
-
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, mp_lapic_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
+ return false;
}
#endif
@@ -1587,203 +2061,146 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
- unsigned int new_apicid;
+ if (apic_validate_deadline_timer())
+ pr_info("TSC deadline timer available\n");
- if (x2apic_mode) {
- boot_cpu_physical_apicid = read_apic_id();
+ if (x2apic_mode)
return;
- }
- /* If no local APIC can be found return early */
- if (!smp_found_config && detect_init_APIC()) {
- /* lets NOP'ify apic operations */
- pr_info("APIC: disable apic facility\n");
- apic_disable();
- } else {
- apic_phys = mp_lapic_addr;
+ if (!smp_found_config) {
+ if (!detect_init_APIC()) {
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ }
+ }
+}
- /*
- * acpi lapic path already maps that address in
- * acpi_register_lapic_address()
- */
- if (!acpi_lapic)
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+static __init void apic_set_fixmap(bool read_apic)
+{
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+ apic_mmio_base = APIC_BASE;
+ apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr);
+ if (read_apic)
+ apic_read_boot_cpu_id(false);
+}
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
- APIC_BASE, apic_phys);
- }
+void __init register_lapic_address(unsigned long address)
+{
+ /* This should only happen once */
+ WARN_ON_ONCE(mp_lapic_addr);
+ mp_lapic_addr = address;
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- new_apicid = read_apic_id();
- if (boot_cpu_physical_apicid != new_apicid) {
- boot_cpu_physical_apicid = new_apicid;
- /*
- * yeah -- we lie about apic_version
- * in case if apic was disabled via boot option
- * but it's not a problem for SMP compiled kernel
- * since smp_sanity_check is prepared for such a case
- * and disable smp mode
- */
- apic_version[new_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
- }
+ if (!x2apic_mode)
+ apic_set_fixmap(true);
}
/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
+ * Local APIC interrupts
*/
-int apic_version[MAX_APICS];
-int __init APIC_init_uniprocessor(void)
+/*
+ * Common handling code for spurious_interrupt and spurious_vector entry
+ * points below. No point in allowing the compiler to inline it twice.
+ */
+static noinline void handle_spurious_interrupt(u8 vector)
{
- if (disable_apic) {
- pr_info("Apic disabled\n");
- return -1;
- }
-#ifdef CONFIG_X86_64
- if (!cpu_has_apic) {
- disable_apic = 1;
- pr_info("Apic disabled by BIOS\n");
- return -1;
- }
-#else
- if (!smp_found_config && !cpu_has_apic)
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
- boot_cpu_physical_apicid);
- return -1;
- }
-#endif
+ u32 v;
-#ifndef CONFIG_SMP
- enable_IR_x2apic();
- default_setup_apic_routing();
-#endif
+ trace_spurious_apic_entry(vector);
- verify_local_APIC();
- connect_bsp_APIC();
+ inc_irq_stat(irq_spurious_count);
-#ifdef CONFIG_X86_64
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-#else
/*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
+ * If this is a spurious interrupt then do not acknowledge
*/
-# ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = read_apic_id();
-# endif
-#endif
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- setup_local_APIC();
+ if (vector == SPURIOUS_APIC_VECTOR) {
+ /* See SDM vol 3 */
+ pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n",
+ smp_processor_id());
+ goto out;
+ }
-#ifdef CONFIG_X86_IO_APIC
/*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling error vector
+ * If it is a vectored one, verify it's set in the ISR. If set,
+ * acknowledge it.
*/
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-#endif
-
- end_local_APIC_setup();
-
-#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else {
- nr_ioapics = 0;
- localise_nmi_watchdog();
+ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+ if (v & (1 << (vector & 0x1f))) {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ vector, smp_processor_id());
+ apic_eoi();
+ } else {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ vector, smp_processor_id());
}
-#else
- localise_nmi_watchdog();
-#endif
-
- x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
- check_nmi_watchdog();
-#endif
-
- return 0;
+out:
+ trace_spurious_apic_exit(vector);
}
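[Editor's note: the (vector & ~0x1f) >> 1 above is compact but worth unpacking: ISR registers cover 32 vectors each and sit 0x10 bytes apart, so masking off the low five bits and halving yields the register offset. A minimal check:]

    #include <assert.h>

    static unsigned int isr_reg_offset(unsigned int vector)
    {
        return (vector & ~0x1fu) >> 1;       /* == (vector / 32) * 0x10 */
    }

    int main(void)
    {
        unsigned int v;

        for (v = 0; v < 256; v++)
            assert(isr_reg_offset(v) == (v / 32) * 0x10);
        assert(isr_reg_offset(0xff) == 0x70); /* SPURIOUS_APIC_VECTOR's register */
        return 0;
    }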
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+/**
+ * spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
*/
-void smp_spurious_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
{
- u32 v;
-
- exit_idle();
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- inc_irq_stat(irq_spurious_count);
+ handle_spurious_interrupt(vector);
+}
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- pr_info("spurious APIC interrupt on CPU#%d, "
- "should never happen.\n", smp_processor_id());
- irq_exit();
+DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
+{
+ handle_spurious_interrupt(SPURIOUS_APIC_VECTOR);
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
-void smp_error_interrupt(struct pt_regs *regs)
-{
- u32 v, v1;
+DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
+{
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
+ u32 v, i = 0;
+
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
- exit_idle();
- irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
+ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
+ apic_eoi();
atomic_inc(&irq_err_count);
- /*
- * Here is what the APIC error bits mean:
- * 0: Send CS error
- * 1: Receive CS error
- * 2: Send accept error
- * 3: Receive accept error
- * 4: Reserved
- * 5: Send illegal vector
- * 6: Received illegal vector
- * 7: Illegal register address
- */
- pr_debug("APIC error on CPU%d: %02x(%02x)\n",
- smp_processor_id(), v , v1);
- irq_exit();
+ apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v);
+
+ v &= 0xff;
+ while (v) {
+ if (v & 0x1)
+ apic_pr_debug_cont(" : %s", error_interrupt_reason[i]);
+ i++;
+ v >>= 1;
+ }
+
+ apic_pr_debug_cont("\n");
+
+ trace_error_apic_exit(ERROR_APIC_VECTOR);
}
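[Editor's note: a standalone sketch of the ESR decode loop above, runnable in userspace with a sample error-status value:]

    #include <stdio.h>

    static const char * const reason[] = {
        "Send CS error", "Receive CS error",
        "Send accept error", "Receive accept error",
        "Redirectable IPI", "Send illegal vector",
        "Received illegal vector", "Illegal register address",
    };

    int main(void)
    {
        unsigned int v = 0x44;   /* sample ESR: bits 2 and 6 set */
        unsigned int i = 0;

        printf("APIC error: %02x", v);
        v &= 0xff;
        while (v) {              /* same walk as the handler above */
            if (v & 0x1)
                printf(" : %s", reason[i]);
            i++;
            v >>= 1;
        }
        printf("\n");
        return 0;
    }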
/**
* connect_bsp_APIC - attach the APIC to the interrupt system
*/
-void __init connect_bsp_APIC(void)
+static void __init connect_bsp_APIC(void)
{
#ifdef CONFIG_X86_32
if (pic_mode) {
@@ -1795,13 +2212,10 @@ void __init connect_bsp_APIC(void)
* PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
* local APIC to INT and NMI lines.
*/
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
+ apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n");
imcr_pic_to_apic();
}
#endif
- if (apic->enable_apic_mode)
- apic->enable_apic_mode();
}
/**
@@ -1823,8 +2237,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
* IPIs, won't work beyond this point! The only exception are
* INIT IPIs.
*/
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
+ apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n");
imcr_apic_to_pic();
return;
}
@@ -1869,85 +2282,77 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-void __cpuinit generic_processor_info(int apicid, int version)
+void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar)
{
- int cpu;
-
- /*
- * Validate version
- */
- if (version == 0x0) {
- pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- version);
- version = 0x10;
- }
- apic_version[apicid] = version;
+ memset(msg, 0, sizeof(*msg));
- if (num_processors >= nr_cpu_ids) {
- int max = nr_cpu_ids;
- int thiscpu = max + disabled_cpus;
+ msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical;
+ msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF;
- pr_warning(
- "ACPI: NR_CPUS/possible_cpus limit of %i reached."
- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+ msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED;
+ msg->arch_data.vector = cfg->vector;
- disabled_cpus++;
- return;
- }
-
- num_processors++;
- cpu = cpumask_next_zero(-1, cpu_present_mask);
-
- if (version != apic_version[boot_cpu_physical_apicid])
- WARN_ONCE(1,
- "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
- apic_version[boot_cpu_physical_apicid], cpu, version);
-
- physid_set(apicid, phys_cpu_present_map);
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
+ msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
+ /*
+ * Only the IOMMU itself can use the trick of putting destination
+ * APIC ID into the high bits of the address. Anything else would
+ * just be writing to memory if it tried that, and needs IR to
+ * address APICs which can't be addressed in the normal 32-bit
+ * address range at 0xFFExxxxx. That is typically just 8 bits, but
+ * some hypervisors allow the extended destination ID field in bits
+ * 5-11 to be used, giving support for 15 bits of APIC IDs in total.
+ */
+ if (dmar)
+ msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8;
+ else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000)
+ msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8;
+ else
+ WARN_ON_ONCE(cfg->dest_apicid > 0xFF);
+}
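[Editor's note: a hedged sketch of the destination-ID packing described in the comment above: 8 bits ride in the classic low-address field, and with the hypervisor extension bits 8-14 go in the extended field, giving the 15-bit space that motivates the 32767 limit in try_to_enable_x2apic(). Field names are illustrative, not the kernel's:]

    #include <assert.h>

    struct dest {
        unsigned int destid_0_7;        /* classic MSI address bits */
        unsigned int virt_destid_8_14;  /* hypervisor extended-dest-ID bits */
    };

    static struct dest pack(unsigned int apicid)
    {
        assert(apicid < 0x8000);        /* mirrors the dest_apicid < 0x8000 test */
        return (struct dest){ apicid & 0xff, apicid >> 8 };
    }

    static unsigned int unpack(struct dest d)
    {
        return (d.virt_destid_8_14 << 8) | d.destid_0_7;
    }

    int main(void)
    {
        assert(unpack(pack(0x7ffe)) == 0x7ffe);  /* 15-bit IDs round-trip */
        return 0;
    }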
-#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
- early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-#endif
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
+{
+ u32 dest = msg->arch_addr_lo.destid_0_7;
- set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+ if (extid)
+ dest |= msg->arch_addr_hi.destid_8_31 << 8;
+ return dest;
}
+EXPORT_SYMBOL_FOR_KVM(x86_msi_msg_get_destid);
-int hard_smp_processor_id(void)
+static void __init apic_bsp_up_setup(void)
{
- return read_apic_id();
+ reset_phys_cpu_present_map(boot_cpu_physical_apicid);
}
-void default_init_apic_ldr(void)
+/**
+ * apic_bsp_setup - Setup function for local apic and io-apic
+ * @upmode: Force UP mode (for APIC_init_uniprocessor)
+ */
+static void __init apic_bsp_setup(bool upmode)
{
- unsigned long val;
+ connect_bsp_APIC();
+ if (upmode)
+ apic_bsp_up_setup();
+ setup_local_APIC();
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
- apic_write(APIC_LDR, val);
+ enable_IO_APIC();
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
+ lapic_update_legacy_vectors();
}
-#ifdef CONFIG_X86_32
-int default_apicid_to_node(int logical_apicid)
+#ifdef CONFIG_UP_LATE_INIT
+void __init up_late_init(void)
{
-#ifdef CONFIG_SMP
- return apicid_2_node[hard_smp_processor_id()];
-#else
- return 0;
-#endif
+ if (apic_intr_mode == APIC_PIC)
+ return;
+
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
}
#endif
@@ -1964,7 +2369,7 @@ static struct {
*/
int active;
/* r/w apic fields */
- unsigned int apic_id;
+ u32 apic_id;
unsigned int apic_taskpri;
unsigned int apic_ldr;
unsigned int apic_dfr;
@@ -1977,9 +2382,10 @@ static struct {
unsigned int apic_tmict;
unsigned int apic_tdcr;
unsigned int apic_thmr;
+ unsigned int apic_cmci;
} apic_pm_state;
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+static int lapic_suspend(void *data)
{
unsigned long flags;
int maxlvt;
@@ -2006,61 +2412,62 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
if (maxlvt >= 5)
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
local_irq_save(flags);
+
+ /*
+ * Mask IOAPIC before disabling the local APIC to prevent stale IRR
+ * entries on some implementations.
+ */
+ mask_ioapic_entries();
+
disable_local_APIC();
- if (intr_remapping_enabled)
- disable_intr_remapping();
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
}
-static int lapic_resume(struct sys_device *dev)
+static void lapic_resume(void *data)
{
unsigned int l, h;
unsigned long flags;
int maxlvt;
- int ret = 0;
- struct IO_APIC_route_entry **ioapic_entries = NULL;
if (!apic_pm_state.active)
- return 0;
+ return;
local_irq_save(flags);
- if (intr_remapping_enabled) {
- ioapic_entries = alloc_ioapic_entries();
- if (!ioapic_entries) {
- WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- ret = -ENOMEM;
- goto restore;
- }
-
- ret = save_IO_APIC_setup(ioapic_entries);
- if (ret) {
- WARN(1, "Saving IO-APIC state failed: %d\n", ret);
- free_ioapic_entries(ioapic_entries);
- goto restore;
- }
- mask_IO_APIC_setup(ioapic_entries);
- legacy_pic->mask_all();
- }
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
- if (x2apic_mode)
- enable_x2apic();
- else {
+ if (x2apic_mode) {
+ __x2apic_enable();
+ } else {
/*
* Make sure the APICBASE points to the right address
*
* FIXME! This will be wrong if we ever support suspend on
* SMP! We'll need to do this as part of the CPU restore!
*/
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
}
maxlvt = lapic_get_maxlvt();
@@ -2072,10 +2479,14 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5)
apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
if (maxlvt >= 4)
apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
@@ -2087,16 +2498,9 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled) {
- reenable_intr_remapping(x2apic_mode);
- legacy_pic->restore_mask();
- restore_IO_APIC_setup(ioapic_entries);
- free_ioapic_entries(ioapic_entries);
- }
-restore:
- local_irq_restore(flags);
+ irq_remapping_reenable(x2apic_mode);
- return ret;
+ local_irq_restore(flags);
}
/*
@@ -2104,34 +2508,27 @@ restore:
* are needed on every CPU up until machine_halt/restart/poweroff.
*/
-static struct sysdev_class lapic_sysclass = {
- .name = "lapic",
+static const struct syscore_ops lapic_syscore_ops = {
.resume = lapic_resume,
.suspend = lapic_suspend,
};
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
+static struct syscore lapic_syscore = {
+ .ops = &lapic_syscore_ops,
};
-static void __cpuinit apic_pm_activate(void)
+static void apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
static int __init init_lapic_sysfs(void)
{
- int error;
-
- if (!cpu_has_apic)
- return 0;
/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ register_syscore(&lapic_syscore);
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
+ return 0;
}
/* local apic needs to resume before other devices access its registers. */
@@ -2145,55 +2542,10 @@ static void apic_pm_activate(void) { }
#ifdef CONFIG_X86_64
-static int __cpuinit apic_cluster_num(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < nr_cpu_ids; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- } else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- } else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
+static int multi_checked;
+static int multi;
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- return clusters;
-}
-
-static int __cpuinitdata multi_checked;
-static int __cpuinitdata multi;
-
-static int __cpuinit set_multi(const struct dmi_system_id *d)
+static int set_multi(const struct dmi_system_id *d)
{
if (multi)
return 0;
@@ -2202,7 +2554,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d)
return 0;
}
-static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+static const struct dmi_system_id multi_dmi_table[] = {
{
.callback = set_multi,
.ident = "IBM System Summit2",
@@ -2214,7 +2566,7 @@ static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
{}
};
-static void __cpuinit dmi_check_multi(void)
+static void dmi_check_multi(void)
{
if (multi_checked)
return;
@@ -2231,42 +2583,22 @@ static void __cpuinit dmi_check_multi(void)
* multi-chassis.
* Use DMI to check them
*/
-__cpuinit int apic_is_clustered_box(void)
+int apic_is_clustered_box(void)
{
dmi_check_multi();
- if (multi)
- return 1;
-
- if (!is_vsmp_box())
- return 0;
-
- /*
- * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (apic_cluster_num() > 1)
- return 1;
-
- return 0;
+ return multi;
}
#endif
/*
* APIC command line parameters
*/
-static int __init setup_disableapic(char *arg)
+static int __init setup_nolapic(char *arg)
{
- disable_apic = 1;
+ apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static int __init setup_nolapic(char *arg)
-{
- return setup_disableapic(arg);
-}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
@@ -2293,22 +2625,24 @@ early_param("nolapic_timer", parse_nolapic_timer);
static int __init apic_set_verbosity(char *arg)
{
if (!arg) {
-#ifdef CONFIG_X86_64
- skip_ioapic_setup = 0;
+ if (IS_ENABLED(CONFIG_X86_32))
+ return -EINVAL;
+
+ ioapic_is_disabled = false;
return 0;
-#endif
- return -EINVAL;
}
if (strcmp("debug", arg) == 0)
apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
else {
- pr_warning("APIC Verbosity level %s not recognised"
+ pr_warn("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg);
return -EINVAL;
}
+#endif
return 0;
}
@@ -2316,11 +2650,11 @@ early_param("apic", apic_set_verbosity);
static int __init lapic_insert_resource(void)
{
- if (!apic_phys)
+ if (!apic_mmio_base)
return -1;
/* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
+ lapic_resource.start = apic_mmio_base;
lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
insert_resource(&iomem_resource, &lapic_resource);
@@ -2328,7 +2662,27 @@ static int __init lapic_insert_resource(void)
}
/*
- * need call insert after e820_reserve_resources()
+ * need call insert after e820__reserve_resources()
* that is using request_resource
*/
late_initcall(lapic_insert_resource);
+
+static int __init apic_set_extnmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strncmp("all", arg, 3))
+ apic_extnmi = APIC_EXTNMI_ALL;
+ else if (!strncmp("none", arg, 4))
+ apic_extnmi = APIC_EXTNMI_NONE;
+ else if (!strncmp("bsp", arg, 3))
+ apic_extnmi = APIC_EXTNMI_BSP;
+ else {
+ pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
new file mode 100644
index 000000000000..2ed3b5c88c7f
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -0,0 +1,43 @@
+/*
+ * Common functions shared between the various APIC flavours
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/irq.h>
+#include <linux/kvm_types.h>
+#include <asm/apic.h>
+
+#include "local.h"
+
+u32 apic_default_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+u32 apic_flat_calc_apicid(unsigned int cpu)
+{
+ return 1U << cpu;
+}
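[Editor's note: the 1U << cpu above is the whole story of flat logical addressing: each CPU owns one bit of the 8-bit logical destination, so multi-CPU destinations are a simple OR of per-CPU bits and the scheme caps out at eight CPUs. A tiny sketch:]

    #include <assert.h>

    static unsigned int flat_calc_apicid(unsigned int cpu)
    {
        return 1u << cpu;
    }

    int main(void)
    {
        /* A destination covering CPUs 0 and 3 is the OR of their bits */
        assert((flat_calc_apicid(0) | flat_calc_apicid(3)) == 0x09);
        return 0;
    }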
+
+u32 default_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid);
+
+/*
+ * Set up the logical destination ID when the APIC operates in logical
+ * destination mode.
+ */
+void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..e0308d8c4e6c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Flat APIC subarch code.
*
@@ -8,368 +8,61 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/errno.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/hardirq.h>
-#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 1;
-}
-
-static const struct cpumask *flat_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void flat_init_apic_ldr(void)
-{
- unsigned long val;
- unsigned long num, id;
-
- num = smp_processor_id();
- id = 1UL << num;
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
- local_irq_restore(flags);
-}
-
-static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void
- flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
- int cpu = smp_processor_id();
-
- if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void flat_send_IPI_allbutself(int vector)
-{
- int cpu = smp_processor_id();
-#ifdef CONFIG_HOTPLUG_CPU
- int hotplug = 1;
-#else
- int hotplug = 0;
-#endif
- if (hotplug || vector == NMI_VECTOR) {
- if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
- unsigned long mask = cpumask_bits(cpu_online_mask)[0];
-
- if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
- }
- } else if (num_online_cpus() > 1) {
- __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
- vector, apic->dest_logical);
- }
-}
-
-static void flat_send_IPI_all(int vector)
-{
- if (vector == NMI_VECTOR) {
- flat_send_IPI_mask(cpu_online_mask, vector);
- } else {
- __default_send_IPI_shortcut(APIC_DEST_ALLINC,
- vector, apic->dest_logical);
- }
-}
-
-static unsigned int flat_get_apic_id(unsigned long x)
-{
- unsigned int id;
-
- id = (((x)>>24) & 0xFFu);
+#include <linux/export.h>
- return id;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
-
- x = ((id & 0xFFu)<<24);
- return x;
-}
-
-static unsigned int read_xapic_id(void)
-{
- unsigned int id;
+#include <asm/apic.h>
- id = flat_get_apic_id(apic_read(APIC_ID));
- return id;
-}
+#include "local.h"
-static int flat_apic_id_registered(void)
+static u32 physflat_get_apic_id(u32 x)
{
- return physid_isset(read_xapic_id(), phys_cpu_present_map);
+ return (x >> 24) & 0xFF;
}
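[Editor's note: a one-liner, but worth a worked value: in xAPIC mode the 8-bit APIC ID sits in bits 31:24 of the APIC_ID register, which is all the helper above extracts. Sketch:]

    #include <assert.h>

    static unsigned int get_apic_id(unsigned int reg)
    {
        return (reg >> 24) & 0xff;
    }

    int main(void)
    {
        assert(get_apic_id(0x0b000000) == 0x0b);  /* ID 11 as read from APIC_ID */
        return 0;
    }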
-static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+static int physflat_probe(void)
{
- return initial_apic_id >> index_msb;
+ return 1;
}
-struct apic apic_flat = {
- .name = "flat",
- .probe = NULL,
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .apic_id_registered = flat_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- .irq_dest_mode = 1, /* logical */
-
- .target_cpus = flat_target_cpus,
- .disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
-
- .vector_allocation_domain = flat_vector_allocation_domain,
- .init_apic_ldr = flat_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
-
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
-
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = flat_send_IPI_allbutself,
- .send_IPI_all = flat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
-
-/*
- * Physflat mode is used when there are more than 8 CPUs on a system.
- * We cannot use logical delivery in this case because the mask
- * overflows, so use physical mode.
- */
static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
-#ifdef CONFIG_ACPI
- /*
- * Quirk: some x86_64 machines can only use physical APIC mode
- * regardless of how many processors are present (x86_64 ES7000
- * is an example).
- */
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
- printk(KERN_DEBUG "system APIC only can use physical flat");
- return 1;
- }
-
- if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
- printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
- return 1;
- }
-#endif
-
- return 0;
-}
-
-static const struct cpumask *physflat_target_cpus(void)
-{
- return cpu_online_mask;
-}
-
-static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- default_send_IPI_mask_sequence_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
- int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpumask, vector);
-}
-
-static void physflat_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void physflat_send_IPI_all(int vector)
-{
- physflat_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- cpu = cpumask_first(cpumask);
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int
-physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- return per_cpu(x86_cpu_to_apicid, cpu);
+ return 1;
}
-struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
- .probe = NULL,
+ .probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
- .target_cpus = physflat_target_cpus,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
- .vector_allocation_domain = physflat_vector_allocation_domain,
- /* not needed, but shouldn't hurt: */
- .init_apic_ldr = flat_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
-
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFu << 24,
- .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
+ .max_apic_id = 0xFE,
+ .get_apic_id = physflat_get_apic_id,
- .send_IPI_mask = physflat_send_IPI_mask,
- .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = physflat_send_IPI_allbutself,
- .send_IPI_all = physflat_send_IPI_all,
- .send_IPI_self = apic_send_IPI_self,
+ .calc_dest_apicid = apic_default_calc_apicid,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+ .nmi_to_offline_cpu = true,
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
+apic_driver(apic_physflat);
+
+struct apic *apic __ro_after_init = &apic_physflat;
+EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..58abb941c45b 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NOOP APIC driver.
*
@@ -7,173 +8,62 @@
* Though, if the apic is disabled (for some reason), we try not to
* uglify the caller's code and still allow calling (some) apic routines
* like self-ipi, etc...
+ *
+ * FIXME: Remove this gunk. The above argument which was intentionally left
+ * in place is silly to begin with because none of the callbacks except for
+ * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh...
*/
-
-#include <linux/threads.h>
#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
+#include <linux/thread_info.h>
-#include <linux/smp.h>
-#include <asm/ipi.h>
+#include <asm/apic.h>
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/e820.h>
+#include "local.h"
-static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
-static void noop_apic_wait_icr_idle(void) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip,
+ unsigned int cpu)
{
return -1;
}
-static u32 noop_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
-static u64 noop_apic_icr_read(void)
-{
- return 0;
-}
-
-static int noop_cpu_to_logical_apicid(int cpu)
-{
- return 0;
-}
-
-static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return 0;
-}
-
-static unsigned int noop_get_apic_id(unsigned long x)
-{
- return 0;
-}
-
-static int noop_probe(void)
-{
- /*
- * NOOP apic should not ever be
- * enabled via probe routine
- */
- return 0;
-}
-
-static int noop_apic_id_registered(void)
-{
- /*
- * if we would be really "pedantic"
- * we should pass read_apic_id() here
- * but since NOOP suppose APIC ID = 0
- * lets save a few cycles
- */
- return physid_isset(0, phys_cpu_present_map);
-}
-
-static const struct cpumask *noop_target_cpus(void)
-{
- /* only BSP here */
- return cpumask_of(0);
-}
-
-static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-static unsigned long noop_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- if (cpu != 0)
- pr_warning("APIC: Vector allocated for non-BSP cpu\n");
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-int noop_apicid_to_node(int logical_apicid)
-{
- /* we're always on node 0 */
- return 0;
-}
+static u64 noop_apic_icr_read(void) { return 0; }
+static u32 noop_get_apic_id(u32 apicid) { return 0; }
+static void noop_apic_eoi(void) { }
static u32 noop_apic_read(u32 reg)
{
- WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
return 0;
}
-static void noop_apic_write(u32 reg, u32 v)
+static void noop_apic_write(u32 reg, u32 val)
{
- WARN_ON_ONCE(cpu_has_apic && !disable_apic);
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
}
-struct apic apic_noop = {
+struct apic apic_noop __ro_after_init = {
.name = "noop",
- .probe = noop_probe,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = noop_apic_id_registered,
+ .dest_mode_logical = true,
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
-
- .target_cpus = noop_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = noop_check_apicid_used,
- .check_apicid_present = noop_check_apicid_present,
-
- .vector_allocation_domain = noop_vector_allocation_domain,
- .init_apic_ldr = noop_init_apic_ldr,
-
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = noop_apicid_to_node,
- .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
-
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
-
- .phys_pkg_id = noop_phys_pkg_id,
-
- .mps_oem_check = NULL,
+ .max_apic_id = 0xFE,
.get_apic_id = noop_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_flat_calc_apicid,
+ .send_IPI = noop_send_IPI,
.send_IPI_mask = noop_send_IPI_mask,
.send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
.send_IPI_allbutself = noop_send_IPI_allbutself,
@@ -182,19 +72,9 @@ struct apic apic_noop = {
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
- /* should be safe */
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = NULL,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
-
.read = noop_apic_read,
.write = noop_apic_write,
+ .eoi = noop_apic_eoi,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
- .wait_icr_idle = noop_apic_wait_icr_idle,
- .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
};
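The noop driver above is a null object: with every callback an empty stub, call sites can invoke the APIC unconditionally while it is disabled instead of guarding each call. A hedged illustration of what that buys (kernel context with asm/apic.h assumed; RESCHEDULE_VECTOR is just an example vector):

/* Without the null object, every caller would need a guard: */
static void reschedule_self_guarded(void)
{
	if (boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled)
		apic->send_IPI_self(RESCHEDULE_VECTOR);
}

/* With 'apic' pointing at apic_noop when disabled, the guard vanishes: */
static void reschedule_self(void)
{
	apic->send_IPI_self(RESCHEDULE_VECTOR);	/* no-op pre-APIC */
}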
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..5c5be2d58242
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,272 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pgtable.h>
+
+#include <asm/msr.h>
+#include <asm/numachip/numachip.h>
+#include <asm/numachip/numachip_csr.h>
+
+
+#include "local.h"
+
+u8 numachip_system __read_mostly;
+static const struct apic apic_numachip1;
+static const struct apic apic_numachip2;
+static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
+
+static u32 numachip1_get_apic_id(u32 x)
+{
+ unsigned long value;
+ unsigned int id = (x >> 24) & 0xff;
+
+ if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrq(MSR_FAM10H_NODE_ID, value);
+ id |= (value << 2) & 0xff00;
+ }
+
+ return id;
+}
+
+static u32 numachip2_get_apic_id(u32 x)
+{
+ u64 mcfg;
+
+ rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+ return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
+}
+
+static void numachip1_apic_icr_write(int apicid, unsigned int val)
+{
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
+}
+
+static void numachip2_apic_icr_write(int apicid, unsigned int val)
+{
+ numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
+}
+
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
+{
+ numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
+ numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
+ (start_rip >> 12));
+
+ return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+ int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned int dmode;
+
+ preempt_disable();
+ local_apicid = __this_cpu_read(x86_cpu_to_apicid);
+
+ /* Send via local APIC where non-local part matches */
+ if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(apicid, vector,
+ APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+ preempt_enable();
+ return;
+ }
+ preempt_enable();
+
+ dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED;
+ numachip_apic_icr_write(apicid, dmode | vector);
+}
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static int __init numachip1_probe(void)
+{
+ return apic == &apic_numachip1;
+}
+
+static int __init numachip2_probe(void)
+{
+ return apic == &apic_numachip2;
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ u64 val;
+ u32 nodes = 1;
+
+ c->topo.llc_id = node;
+
+ /* Account for nodes per socket in multi-core-module processors */
+ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrq(MSR_FAM10H_NODE_ID, val);
+ nodes = ((val >> 3) & 7) + 1;
+ }
+
+ c->topo.pkg_id = node / nodes;
+}
+
+static int __init numachip_system_init(void)
+{
+ /* Map the LCSR area and set up the apic_icr_write function */
+ switch (numachip_system) {
+ case 1:
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ numachip_apic_icr_write = numachip1_apic_icr_write;
+ break;
+ case 2:
+ init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
+ numachip_apic_icr_write = numachip2_apic_icr_write;
+ break;
+ default:
+ return 0;
+ }
+
+ x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
+
+ return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONNECT", 8) != 0))
+ return 0;
+
+ numachip_system = 1;
+
+ return 1;
+}
+
+static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONECT2", 8) != 0))
+ return 0;
+
+ numachip_system = 2;
+
+ return 1;
+}
+
+static const struct apic apic_numachip1 __refconst = {
+ .name = "NumaConnect system",
+ .probe = numachip1_probe,
+ .acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = numachip1_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+};
+
+apic_driver(apic_numachip1);
+
+static const struct apic apic_numachip2 __refconst = {
+ .name = "NumaConnect2 system",
+ .probe = numachip2_probe,
+ .acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = numachip2_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+};
+
+apic_driver(apic_numachip2);
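NumaChip1 widens the 8-bit xAPIC ID with node bits read from MSR_FAM10H_NODE_ID, which is why both drivers above set .max_apic_id to UINT_MAX. A standalone restatement of the composition done by numachip1_get_apic_id(), with made-up input values:

/* Combine the local xAPIC ID (bits 31:24 of the APIC ID register) with
 * the shifted node ID to form an extended APIC ID. Sketch only. */
static unsigned int numachip1_compose_id(u32 apic_reg, u64 node_msr)
{
	unsigned int id = (apic_reg >> 24) & 0xff;	/* local xAPIC ID */

	id |= (node_msr << 2) & 0xff00;			/* node -> bits 15:8 */
	return id;	/* e.g. apic_reg=0x05000000, node_msr=0x40 -> 0x105 */
}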
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
deleted file mode 100644
index cb804c5091b9..000000000000
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
- *
- * Drives the local APIC in "clustered mode".
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <linux/smp.h>
-
-#include <asm/apicdef.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-static unsigned bigsmp_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static int bigsmp_apic_id_registered(void)
-{
- return 1;
-}
-
-static const struct cpumask *bigsmp_target_cpus(void)
-{
-#ifdef CONFIG_SMP
- return cpu_online_mask;
-#else
- return cpumask_of(0);
-#endif
-}
-
-static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-static unsigned long bigsmp_check_apicid_present(int bit)
-{
- return 1;
-}
-
-static inline unsigned long calculate_ldr(int cpu)
-{
- unsigned long val, id;
-
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- id = per_cpu(x86_bios_cpu_apicid, cpu);
- val |= SET_APIC_LOGICAL_ID(id);
-
- return val;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void bigsmp_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void bigsmp_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int bigsmp_apicid_to_node(int logical_apicid)
-{
- return apicid_2_node[hard_smp_processor_id()];
-}
-
-static int bigsmp_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
-
- return BAD_APICID;
-}
-
-/* Mapping from cpu number to logical apicid */
-static inline int bigsmp_cpu_to_logical_apicid(int cpu)
-{
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_physical_id(cpu);
-}
-
-static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0xFFL, retmap);
-}
-
-static int bigsmp_check_phys_apicid_present(int phys_apicid)
-{
- return 1;
-}
-
-/* As we are using single CPU as destination, pick only one CPU here */
-static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask));
-}
-
-static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- return bigsmp_cpu_to_logical_apicid(cpu);
-}
-
-static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
-static void bigsmp_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void bigsmp_send_IPI_all(int vector)
-{
- bigsmp_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int dmi_bigsmp; /* can be set by dmi scanners */
-
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
-{
- printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
- dmi_bigsmp = 1;
-
- return 0;
-}
-
-
-static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
- }
- },
-
- { hp_ht_bigsmp, "HP ProLiant DL740",
- { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
- }
- },
- { } /* NULL entry stops DMI scanning */
-};
-
-static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
-}
-
-static int probe_bigsmp(void)
-{
- if (def_to_bigsmp)
- dmi_bigsmp = 1;
- else
- dmi_check_system(bigsmp_dmi_table);
-
- return dmi_bigsmp;
-}
-
-struct apic apic_bigsmp = {
-
- .name = "bigsmp",
- .probe = probe_bigsmp,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = bigsmp_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPU: */
- .irq_dest_mode = 0,
-
- .target_cpus = bigsmp_target_cpus,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = bigsmp_check_apicid_used,
- .check_apicid_present = bigsmp_check_apicid_present,
-
- .vector_allocation_domain = bigsmp_vector_allocation_domain,
- .init_apic_ldr = bigsmp_init_apic_ldr,
-
- .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
- .setup_apic_routing = bigsmp_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = bigsmp_apicid_to_node,
- .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
- .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = bigsmp_phys_pkg_id,
- .mps_oem_check = NULL,
-
- .get_apic_id = bigsmp_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = bigsmp_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
- .send_IPI_all = bigsmp_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
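One pattern from the deleted driver worth keeping in mind: logical destination setup writes the Destination Format Register before the Logical Destination Register, per the 82489DX guidance quoted in the removed comment. A condensed sketch using the same asm/apicdef.h macros the removed code used:

static void example_init_apic_ldr(unsigned long apicid)
{
	unsigned long val;

	apic_write(APIC_DFR, APIC_DFR_FLAT);		/* flat logical model */
	val  = apic_read(APIC_LDR) & ~APIC_LDR_MASK;	/* keep other bits */
	val |= SET_APIC_LOGICAL_ID(apicid);		/* ID into bits 31:24 */
	apic_write(APIC_LDR, val);
}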
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
deleted file mode 100644
index 425e53a87feb..000000000000
--- a/arch/x86/kernel/apic/es7000_32.c
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- *
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/notifier.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/gfp.h>
-#include <linux/nmi.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-
-#include <asm/apicdef.h>
-#include <asm/atomic.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_SW_APIC 0x1020b
-
-#define MIP_PORT(val) ((val >> 32) & 0xffff)
-
-#define MIP_RD_LO(val) (val & 0xffffffff)
-
-struct mip_reg {
- unsigned long long off_0x00;
- unsigned long long off_0x08;
- unsigned long long off_0x10;
- unsigned long long off_0x18;
- unsigned long long off_0x20;
- unsigned long long off_0x28;
- unsigned long long off_0x30;
- unsigned long long off_0x38;
-};
-
-struct mip_reg_info {
- unsigned long long mip_info;
- unsigned long long delivery_info;
- unsigned long long host_reg;
- unsigned long long mip_reg;
-};
-
-struct psai {
- unsigned long long entry_type;
- unsigned long long addr;
- unsigned long long bep_addr;
-};
-
-#ifdef CONFIG_ACPI
-
-struct es7000_oem_table {
- struct acpi_table_header Header;
- u32 OEMTableAddr;
- u32 OEMTableSize;
-};
-
-static unsigned long oem_addrX;
-static unsigned long oem_size;
-
-#endif
-
-/*
- * ES7000 Globals
- */
-
-static volatile unsigned long *psai;
-static struct mip_reg *mip_reg;
-static struct mip_reg *host_reg;
-static int mip_port;
-static unsigned long mip_addr;
-static unsigned long host_addr;
-
-int es7000_plat;
-
-/*
- * GSI override for ES7000 platforms.
- */
-
-static unsigned int base;
-
-static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-}
-
-static int es7000_apic_is_cluster(void)
-{
- /* MPENTIUMIII */
- if (boot_cpu_data.x86 == 6 &&
- (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
- return 1;
-
- return 0;
-}
-
-static void setup_unisys(void)
-{
- /*
- * Determine the generation of the ES7000 currently running.
- *
- * es7000_plat = 1 if the machine is a 5xx ES7000 box
- * es7000_plat = 2 if the machine is a x86_64 ES7000 box
- *
- */
- if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
- es7000_plat = ES7000_ZORRO;
- else
- es7000_plat = ES7000_CLASSIC;
-}
-
-/*
- * Parse the OEM Table:
- */
-static int parse_unisys_oem(char *oemptr)
-{
- int i;
- int success = 0;
- unsigned char type, size;
- unsigned long val;
- char *tp = NULL;
- struct psai *psaip = NULL;
- struct mip_reg_info *mi;
- struct mip_reg *host, *mip;
-
- tp = oemptr;
-
- tp += 8;
-
- for (i = 0; i <= 6; i++) {
- type = *tp++;
- size = *tp++;
- tp -= 2;
- switch (type) {
- case MIP_REG:
- mi = (struct mip_reg_info *)tp;
- val = MIP_RD_LO(mi->host_reg);
- host_addr = val;
- host = (struct mip_reg *)val;
- host_reg = __va(host);
- val = MIP_RD_LO(mi->mip_reg);
- mip_port = MIP_PORT(mi->mip_info);
- mip_addr = val;
- mip = (struct mip_reg *)val;
- mip_reg = __va(mip);
- pr_debug("host_reg = 0x%lx\n",
- (unsigned long)host_reg);
- pr_debug("mip_reg = 0x%lx\n",
- (unsigned long)mip_reg);
- success++;
- break;
- case MIP_PSAI_REG:
- psaip = (struct psai *)tp;
- if (tp != NULL) {
- if (psaip->addr)
- psai = __va(psaip->addr);
- else
- psai = NULL;
- success++;
- }
- break;
- default:
- break;
- }
- tp += size;
- }
-
- if (success < 2)
- es7000_plat = NON_UNISYS;
- else
- setup_unisys();
-
- return es7000_plat;
-}
-
-#ifdef CONFIG_ACPI
-static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
-{
- struct acpi_table_header *header = NULL;
- struct es7000_oem_table *table;
- acpi_size tbl_size;
- acpi_status ret;
- int i = 0;
-
- for (;;) {
- ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
- if (!ACPI_SUCCESS(ret))
- return -1;
-
- if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
- break;
-
- early_acpi_os_unmap_memory(header, tbl_size);
- }
-
- table = (void *)header;
-
- oem_addrX = table->OEMTableAddr;
- oem_size = table->OEMTableSize;
-
- early_acpi_os_unmap_memory(header, tbl_size);
-
- *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
-
- return 0;
-}
-
-static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
-{
- if (!oem_addr)
- return;
-
- __acpi_unmap_table((char *)oem_addr, oem_size);
-}
-
-static int es7000_check_dsdt(void)
-{
- struct acpi_table_header header;
-
- if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
- !strncmp(header.oem_id, "UNISYS", 6))
- return 1;
- return 0;
-}
-
-static int es7000_acpi_ret;
-
-/* Hook from generic ACPI tables.c */
-static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- unsigned long oem_addr = 0;
- int check_dsdt;
- int ret = 0;
-
- /* check dsdt at first to avoid clear fix_map for oem_addr */
- check_dsdt = es7000_check_dsdt();
-
- if (!find_unisys_acpi_oem_table(&oem_addr)) {
- if (check_dsdt) {
- ret = parse_unisys_oem((char *)oem_addr);
- } else {
- setup_unisys();
- ret = 1;
- }
- /*
- * we need to unmap it
- */
- unmap_unisys_acpi_oem_table(oem_addr);
- }
-
- es7000_acpi_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- int ret = es7000_acpi_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-#else /* !CONFIG_ACPI: */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-#endif /* !CONFIG_ACPI */
-
-static void es7000_spin(int n)
-{
- int i = 0;
-
- while (i++ < n)
- rep_nop();
-}
-
-static int es7000_mip_write(struct mip_reg *mip_reg)
-{
- int status = 0;
- int spin;
-
- spin = MIP_SPIN;
- while ((host_reg->off_0x38 & MIP_VALID) != 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for Host Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
- outb(1, mip_port);
-
- spin = MIP_SPIN;
-
- while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for MIP Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
- mip_reg->off_0x38 &= ~MIP_VALID;
-
- return status;
-}
-
-static void es7000_enable_apic_mode(void)
-{
- struct mip_reg es7000_mip_reg;
- int mip_status;
-
- if (!es7000_plat)
- return;
-
- pr_info("Enabling APIC mode.\n");
- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
- es7000_mip_reg.off_0x00 = MIP_SW_APIC;
- es7000_mip_reg.off_0x38 = MIP_VALID;
-
- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
- WARN(1, "Command failed, status = %x\n", mip_status);
-}
-
-static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt desitination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-
-static void es7000_wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
-}
-
-static unsigned int es7000_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
-static void es7000_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void es7000_send_IPI_all(int vector)
-{
- es7000_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int es7000_apic_id_registered(void)
-{
- return 1;
-}
-
-static const struct cpumask *target_cpus_cluster(void)
-{
- return cpu_all_mask;
-}
-
-static const struct cpumask *es7000_target_cpus(void)
-{
- return cpumask_of(smp_processor_id());
-}
-
-static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-static unsigned long es7000_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static unsigned long calculate_ldr(int cpu)
-{
- unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
-
- return SET_APIC_LOGICAL_ID(id);
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LdR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void es7000_init_apic_ldr_cluster(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_CLUSTER);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_setup_apic_routing(void)
-{
- int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-
- pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
- (apic_version[apic] == 0x14) ?
- "Physical Cluster" : "Logical Cluster",
- nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
-}
-
-static int es7000_apicid_to_node(int logical_apicid)
-{
- return 0;
-}
-
-
-static int es7000_cpu_present_to_apicid(int mps_cpu)
-{
- if (!mps_cpu)
- return boot_cpu_physical_apicid;
- else if (mps_cpu < nr_cpu_ids)
- return per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static int cpu_id;
-
-static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
-{
- physid_set_mask_of_physid(cpu_id, retmap);
- ++cpu_id;
-}
-
-/* Mapping from cpu number to logical apicid */
-static int es7000_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_2_logical_apicid[cpu];
-#else
- return logical_smp_processor_id();
-#endif
-}
-
-static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0xFFL, retmap);
-}
-
-static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
-{
- boot_cpu_physical_apicid = read_apic_id();
- return 1;
-}
-
-static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- unsigned int round = 0;
- int cpu, uninitialized_var(apicid);
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu(cpu, cpumask) {
- int new_apicid = es7000_cpu_to_logical_apicid(cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- WARN(1, "Not a valid mask!");
-
- return BAD_APICID;
- }
- apicid = new_apicid;
- round++;
- }
- return apicid;
-}
-
-static unsigned int
-es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask)
-{
- int apicid = es7000_cpu_to_logical_apicid(0);
- cpumask_var_t cpumask;
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return apicid;
-
- cpumask_and(cpumask, inmask, andmask);
- cpumask_and(cpumask, cpumask, cpu_online_mask);
- apicid = es7000_cpu_mask_to_apicid(cpumask);
-
- free_cpumask_var(cpumask);
-
- return apicid;
-}
-
-static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int probe_es7000(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static int es7000_mps_ret;
-static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = 0;
-
- if (mpc->oemptr) {
- struct mpc_oemtable *oem_table =
- (struct mpc_oemtable *)mpc->oemptr;
-
- if (!strncmp(oem, "UNISYS", 6))
- ret = parse_unisys_oem((char *)oem_table);
- }
-
- es7000_mps_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = es7000_mps_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-/* We've been warned by a false positive warning.Use __refdata to keep calm. */
-struct apic __refdata apic_es7000_cluster = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all procs: */
- .irq_dest_mode = 1,
-
- .target_cpus = target_cpus_cluster,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = es7000_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr_cluster,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = es7000_apicid_to_node,
- .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check_cluster,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = NULL,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
-
-struct apic __refdata apic_es7000 = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPUs: */
- .irq_dest_mode = 0,
-
- .target_cpus = es7000_target_cpus,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = es7000_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = es7000_apicid_to_node,
- .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = es7000_wait_for_init_deassert,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
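The removed wakeup path spoke a small mailbox protocol to the ES7000 system manager: wait for the host-side valid flag to clear, post the request, ring the doorbell port, then poll for the response valid flag. A condensed, illustrative restatement (the real es7000_mip_write() bounded both spins with MIP_SPIN and warned on timeout; the status extraction is simplified):

static int example_mip_transaction(struct mip_reg *request)
{
	while (host_reg->off_0x38 & MIP_VALID)		/* prior request busy */
		cpu_relax();

	memcpy(host_reg, request, sizeof(*request));	/* post request */
	outb(1, mip_port);				/* ring doorbell */

	while (!(mip_reg->off_0x38 & MIP_VALID))	/* await response */
		cpu_relax();

	mip_reg->off_0x38 &= ~MIP_VALID;		/* acknowledge */
	return (mip_reg->off_0x00 >> 48) & 0xff;	/* status field */
}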
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..45af535c44a0
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <linux/thread_info.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+
+#include "local.h"
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
+{
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
+}
+#endif
+
+#ifdef arch_trigger_cpumask_backtrace
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
+{
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
+{
+ nmi_trigger_cpumask_backtrace(mask, exclude_cpu,
+ nmi_raise_cpu_backtrace);
+}
+
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ if (nmi_cpu_backtrace(regs))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
+
+static int __init register_nmi_cpu_backtrace_handler(void)
+{
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
+ 0, "arch_bt");
+ return 0;
+}
+early_initcall(register_nmi_cpu_backtrace_handler);
+#endif
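The arithmetic in hw_nmi_get_sample_period() above converts the hardlockup threshold from seconds into CPU cycles for the perf event that drives the watchdog NMI. A worked example with assumed numbers:

/*
 * cpu_khz is the CPU frequency in kHz, so cpu_khz * 1000 is cycles per
 * second; multiplying by watchdog_thresh (seconds) gives the period:
 *
 *   cpu_khz = 2,400,000 (2.4 GHz), watchdog_thresh = 10
 *   period  = 2,400,000 * 1000 * 10 = 24,000,000,000 cycles  (10 s)
 *
 * The perf NMI fires once per period and the handler checks whether the
 * CPU made forward progress since the previous sample.
 */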
diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c
new file mode 100644
index 000000000000..821e2e536f19
--- /dev/null
+++ b/arch/x86/kernel/apic/init.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "APIC: " fmt
+
+#include <asm/apic.h>
+
+#include "local.h"
+
+/*
+ * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions
+ * for each callback. The callbacks are setup during boot and all except
+ * wait_icr_idle() must be initialized before usage. The IPI wrappers
+ * use static_call() and not static_call_cond() to catch any fails.
+ */
+#define DEFINE_APIC_CALL(__cb) \
+ DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb)
+
+DEFINE_APIC_CALL(eoi);
+DEFINE_APIC_CALL(native_eoi);
+DEFINE_APIC_CALL(icr_read);
+DEFINE_APIC_CALL(icr_write);
+DEFINE_APIC_CALL(read);
+DEFINE_APIC_CALL(send_IPI);
+DEFINE_APIC_CALL(send_IPI_mask);
+DEFINE_APIC_CALL(send_IPI_mask_allbutself);
+DEFINE_APIC_CALL(send_IPI_allbutself);
+DEFINE_APIC_CALL(send_IPI_all);
+DEFINE_APIC_CALL(send_IPI_self);
+DEFINE_APIC_CALL(wait_icr_idle);
+DEFINE_APIC_CALL(wakeup_secondary_cpu);
+DEFINE_APIC_CALL(wakeup_secondary_cpu_64);
+DEFINE_APIC_CALL(write);
+
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask);
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self);
+
+/* The container for function call overrides */
+struct apic_override __x86_apic_override __initdata;
+
+#define apply_override(__cb) \
+ if (__x86_apic_override.__cb) \
+ apic->__cb = __x86_apic_override.__cb
+
+static __init void restore_override_callbacks(void)
+{
+ apply_override(eoi);
+ apply_override(native_eoi);
+ apply_override(write);
+ apply_override(read);
+ apply_override(send_IPI);
+ apply_override(send_IPI_mask);
+ apply_override(send_IPI_mask_allbutself);
+ apply_override(send_IPI_allbutself);
+ apply_override(send_IPI_all);
+ apply_override(send_IPI_self);
+ apply_override(icr_read);
+ apply_override(icr_write);
+ apply_override(wakeup_secondary_cpu);
+ apply_override(wakeup_secondary_cpu_64);
+}
+
+#define update_call(__cb) \
+ static_call_update(apic_call_##__cb, *apic->__cb)
+
+static __init void update_static_calls(void)
+{
+ update_call(eoi);
+ update_call(native_eoi);
+ update_call(write);
+ update_call(read);
+ update_call(send_IPI);
+ update_call(send_IPI_mask);
+ update_call(send_IPI_mask_allbutself);
+ update_call(send_IPI_allbutself);
+ update_call(send_IPI_all);
+ update_call(send_IPI_self);
+ update_call(icr_read);
+ update_call(icr_write);
+ update_call(wait_icr_idle);
+ update_call(wakeup_secondary_cpu);
+ update_call(wakeup_secondary_cpu_64);
+}
+
+void __init apic_setup_apic_calls(void)
+{
+ /* Ensure that the default APIC has native_eoi populated */
+ apic->native_eoi = apic->eoi;
+ update_static_calls();
+ pr_info("Static calls initialized\n");
+}
+
+void __init apic_install_driver(struct apic *driver)
+{
+ if (apic == driver)
+ return;
+
+ apic = driver;
+
+ if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid)
+ apic->max_apic_id = x2apic_max_apicid;
+
+ /* Copy the original eoi() callback as KVM/HyperV might overwrite it */
+ if (!apic->native_eoi)
+ apic->native_eoi = apic->eoi;
+
+ /* Apply any already installed callback overrides */
+ restore_override_callbacks();
+ update_static_calls();
+
+ pr_info("Switched APIC routing to: %s\n", driver->name);
+}
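For readers new to the machinery used above: DEFINE_STATIC_CALL_NULL() emits a patchable call site that must be pointed at a real function with static_call_update() before use, which is exactly what update_static_calls() does. A minimal self-contained sketch of the idiom with illustrative names:

#include <linux/printk.h>
#include <linux/static_call.h>

static void real_handler(int cpu)
{
	pr_info("handling on CPU %d\n", cpu);
}

/* Starts out NULL: invoking it before the update below would trap. */
DEFINE_STATIC_CALL_NULL(my_handler, real_handler);

static void wire_up(void)
{
	/* Patches the call site(s) in the text; no indirect branch remains. */
	static_call_update(my_handler, real_handler);
}

static void caller(int cpu)
{
	static_call(my_handler)(cpu);	/* direct call once wired up */
}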
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..28f934f05a85 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Intel IO-APIC support for multi-Pentium hosts.
*
@@ -18,10 +19,21 @@
* and Rolf G. Tews
* for testing these extensively
* Paul Diefenbaugh : Added full ACPI support
+ *
+ * Historical information worth preserving:
+ *
+ * - SiS APIC rmw bug:
+ *
+ *	We used to have a workaround for a bug in SiS chips which
+ *	required rewriting the index register for a read-modify-write
+ *	operation because the chip lost the index information that was
+ *	already set up for the read. We cache the data now, so that
+ *	workaround has been removed.
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
@@ -29,22 +41,16 @@
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-#include <linux/bootmem.h>
-#include <linux/dmar.h>
-#include <linux/hpet.h>
+#include <linux/memblock.h>
+#include <linux/msi.h>
-#include <asm/idle.h>
+#include <asm/irqdomain.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/cpu.h>
@@ -53,295 +59,194 @@
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/timer.h>
+#include <asm/time.h>
#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
#include <asm/setup.h>
#include <asm/irq_remapping.h>
-#include <asm/hpet.h>
#include <asm/hw_irq.h>
-
#include <asm/apic.h>
-
-#define __apicdebuginit(type) static type __init
+#include <asm/pgtable.h>
+#include <asm/x86_init.h>
+
+#define for_each_ioapic(idx) \
+ for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define for_each_ioapic_reverse(idx) \
+ for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define for_each_pin(idx, pin) \
+ for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define for_each_ioapic_pin(idx, pin) \
+ for_each_ioapic((idx)) \
+ for_each_pin((idx), (pin))
#define for_each_irq_pin(entry, head) \
- for (entry = head; entry; entry = entry->next)
-
-/*
- * Is the SiS APIC rmw bug present ?
- * -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
+ list_for_each_entry(entry, &head, list)
static DEFINE_RAW_SPINLOCK(ioapic_lock);
-static DEFINE_RAW_SPINLOCK(vector_lock);
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/* I/O APIC entries */
-struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
-/* IO APIC gsi routing info */
-struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
-
-/* The one past the highest gsi number used */
-u32 gsi_top;
-
-/* MP IRQ source entries */
-struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* # of MP IRQ source entries */
-int mp_irq_entries;
+struct irq_pin_list {
+ struct list_head list;
+ int apic, pin;
+};
-/* GSI interrupts */
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
+struct mp_chip_data {
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
+ u32 count;
+};
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
+struct mp_ioapic_gsi {
+ u32 gsi_base;
+ u32 gsi_end;
+};
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+static struct ioapic {
+ /* # of IRQ routing registers */
+ int nr_registers;
+ /* Saved state during suspend/resume, or while enabling intr-remap. */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
+} ioapics[MAX_IO_APICS];
-int skip_ioapic_setup;
+#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
-void arch_disable_smp_support(void)
+int mpc_ioapic_id(int ioapic_idx)
{
-#ifdef CONFIG_PCI
- noioapicquirk = 1;
- noioapicreroute = -1;
-#endif
- skip_ioapic_setup = 1;
+ return ioapics[ioapic_idx].mp_config.apicid;
}
-static int __init parse_noapic(char *str)
+unsigned int mpc_ioapic_addr(int ioapic_idx)
{
- /* disable IO-APIC */
- arch_disable_smp_support();
- return 0;
+ return ioapics[ioapic_idx].mp_config.apicaddr;
}
-early_param("noapic", parse_noapic);
-struct irq_pin_list {
- int apic, pin;
- struct irq_pin_list *next;
-};
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+{
+ return &ioapics[ioapic_idx].gsi_config;
+}
-static struct irq_pin_list *get_one_free_irq_2_pin(int node)
+static inline int mp_ioapic_pin_count(int ioapic)
{
- struct irq_pin_list *pin;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
- pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+ return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+}
- return pin;
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+ return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
}
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-#ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
-#else
-static struct irq_cfg irq_cfgx[NR_IRQS];
-#endif
+static inline bool mp_is_legacy_irq(int irq)
+{
+ return irq >= 0 && irq < nr_legacy_irqs();
+}
-int __init arch_early_irq_init(void)
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
{
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- int count;
- int node;
- int i;
+ return ioapics[ioapic].irqdomain;
+}
- if (!legacy_pic->nr_legacy_irqs) {
- nr_irqs_gsi = 0;
- io_apic_irqs = ~0UL;
- }
+int nr_ioapics;
- cfg = irq_cfgx;
- count = ARRAY_SIZE(irq_cfgx);
- node= cpu_to_node(boot_cpu_id);
+/* The one past the highest gsi number used */
+u32 gsi_top;
- for (i = 0; i < count; i++) {
- desc = irq_to_desc(i);
- desc->chip_data = &cfg[i];
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
- /*
- * For legacy IRQ's, start with assigning irq0 to irq15 to
- * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
- */
- if (i < legacy_pic->nr_legacy_irqs) {
- cfg[i].vector = IRQ0_VECTOR + i;
- cpumask_set_cpu(0, cfg[i].domain);
- }
- }
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
- return 0;
-}
+/* # of MP IRQ source entries */
+int mp_irq_entries;
-#ifdef CONFIG_SPARSE_IRQ
-struct irq_cfg *irq_cfg(unsigned int irq)
-{
- struct irq_cfg *cfg = NULL;
- struct irq_desc *desc;
+#ifdef CONFIG_EISA
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
- desc = irq_to_desc(irq);
- if (desc)
- cfg = desc->chip_data;
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
- return cfg;
-}
+bool ioapic_is_disabled __ro_after_init;
-static struct irq_cfg *get_one_free_irq_cfg(int node)
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ */
+void disable_ioapic_support(void)
{
- struct irq_cfg *cfg;
-
- cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
- if (cfg) {
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
- kfree(cfg);
- cfg = NULL;
- } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
- GFP_ATOMIC, node)) {
- free_cpumask_var(cfg->domain);
- kfree(cfg);
- cfg = NULL;
- }
- }
-
- return cfg;
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ ioapic_is_disabled = true;
}
-int arch_init_chip_data(struct irq_desc *desc, int node)
+static int __init parse_noapic(char *str)
{
- struct irq_cfg *cfg;
-
- cfg = desc->chip_data;
- if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(node);
- if (!desc->chip_data) {
- printk(KERN_ERR "can not alloc irq_cfg\n");
- BUG_ON(1);
- }
- }
-
+ /* disable IO-APIC */
+ disable_ioapic_support();
return 0;
}
+early_param("noapic", parse_noapic);
-/* for move_irq_desc */
-static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
+/* Will be called in mpparse/ACPI code for saving IRQ info */
+void mp_save_irq(struct mpc_intsrc *m)
{
- struct irq_pin_list *old_entry, *head, *tail, *entry;
-
- cfg->irq_2_pin = NULL;
- old_entry = old_cfg->irq_2_pin;
- if (!old_entry)
- return;
+ int i;
- entry = get_one_free_irq_2_pin(node);
- if (!entry)
- return;
+ apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- head = entry;
- tail = entry;
- old_entry = old_entry->next;
- while (old_entry) {
- entry = get_one_free_irq_2_pin(node);
- if (!entry) {
- entry = head;
- while (entry) {
- head = entry->next;
- kfree(entry);
- entry = head;
- }
- /* still use the old one */
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
return;
- }
- entry->apic = old_entry->apic;
- entry->pin = old_entry->pin;
- tail->next = entry;
- tail = entry;
- old_entry = old_entry->next;
}
- tail->next = NULL;
- cfg->irq_2_pin = head;
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
}
-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+static void alloc_ioapic_saved_registers(int idx)
{
- struct irq_pin_list *entry, *next;
+ size_t size;
- if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+ if (ioapics[idx].saved_registers)
return;
- entry = old_cfg->irq_2_pin;
-
- while (entry) {
- next = entry->next;
- kfree(entry);
- entry = next;
- }
- old_cfg->irq_2_pin = NULL;
+ size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers;
+ ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL);
+ if (!ioapics[idx].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", idx);
}
-void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int node)
+static void free_ioapic_saved_registers(int idx)
{
- struct irq_cfg *cfg;
- struct irq_cfg *old_cfg;
-
- cfg = get_one_free_irq_cfg(node);
-
- if (!cfg)
- return;
-
- desc->chip_data = cfg;
-
- old_cfg = old_desc->chip_data;
-
- memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
-
- init_copy_irq_2_pin(old_cfg, cfg, node);
-}
-
-static void free_irq_cfg(struct irq_cfg *old_cfg)
-{
- kfree(old_cfg);
+ kfree(ioapics[idx].saved_registers);
+ ioapics[idx].saved_registers = NULL;
}
-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+int __init arch_early_ioapic_init(void)
{
- struct irq_cfg *old_cfg, *cfg;
-
- old_cfg = old_desc->chip_data;
- cfg = desc->chip_data;
+ int i;
- if (old_cfg == cfg)
- return;
+ if (!nr_legacy_irqs())
+ io_apic_irqs = ~0UL;
- if (old_cfg) {
- free_irq_2_pin(old_cfg, cfg);
- free_irq_cfg(old_cfg);
- old_desc->chip_data = NULL;
- }
-}
-/* end for move_irq_desc */
+ for_each_ioapic(i)
+ alloc_ioapic_saved_registers(i);
-#else
-struct irq_cfg *irq_cfg(unsigned int irq)
-{
- return irq < nr_irqs ? irq_cfgx + irq : NULL;
+ return 0;
}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -353,81 +258,47 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
}
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(vector, &io_apic->eoi);
}
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
- writel(reg, &io_apic->index);
- return readl(&io_apic->data);
-}
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
- writel(value, &io_apic->data);
+ return readl(&io_apic->data);
}
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
- if (sis_apic_bug)
- writel(reg, &io_apic->index);
+ writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
{
- struct irq_pin_list *entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- int pin;
+ struct IO_APIC_route_entry entry;
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- /* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return true;
- }
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ entry.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- return false;
+ return entry;
}
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ return __ioapic_read_entry(apic, pin);
}
/*
@@ -436,22 +307,16 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
* the interrupt, and we need to make sure the entry is fully populated
* before that happens.
*/
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- union entry_union eu = {{0, 0}};
-
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
}
-void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__ioapic_write_entry(apic, pin, e);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
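The two conversions above replace raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs with the scope-based guard() helper from linux/cleanup.h: the lock is released automatically when the enclosing scope exits, which is what lets ioapic_read_entry() return straight out of the locked region. A minimal sketch of the idiom:

#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static int shared_state;

static int read_shared_state(void)
{
	/* Acquired with IRQs saved; unlock/restore runs at scope exit. */
	guard(raw_spinlock_irqsave)(&example_lock);

	return shared_state;	/* returning releases the lock */
}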
/*
@@ -461,13 +326,11 @@ void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
*/
static void ioapic_mask_entry(int apic, int pin)
{
- unsigned long flags;
- union entry_union eu = { .entry.mask = 1 };
+ struct IO_APIC_route_entry e = { .masked = true };
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
}
/*
@@ -475,151 +338,131 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int
-add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
+static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin)
{
- struct irq_pin_list **last, *entry;
+ struct irq_pin_list *entry;
- /* don't allow duplicates */
- last = &cfg->irq_2_pin;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
+ /* Don't allow duplicates */
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return 0;
- last = &entry->next;
+ return true;
}
- entry = get_one_free_irq_2_pin(node);
+ entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
- printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
- node, apic, pin);
- return -ENOMEM;
+ pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin);
+ return false;
}
+
entry->apic = apic;
entry->pin = pin;
-
- *last = entry;
- return 0;
+ list_add_tail(&entry->list, &data->irq_2_pin);
+ return true;
}
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
- if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
- panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
-}
+ struct irq_pin_list *tmp, *entry;
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- /* every one is different, right? */
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) {
+ if (entry->apic == apic && entry->pin == pin) {
+ list_del(&entry->list);
+ kfree(entry);
return;
}
}
-
- /* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(cfg, node, newapic, newpin);
-}
-
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
- int mask_and, int mask_or,
- void (*final)(struct irq_pin_list *entry))
-{
- unsigned int reg, pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin * 2);
- reg &= mask_and;
- reg |= mask_or;
- io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
- if (final)
- final(entry);
}
-static void io_apic_modify_irq(struct irq_cfg *cfg,
- int mask_and, int mask_or,
+static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
void (*final)(struct irq_pin_list *entry))
{
struct irq_pin_list *entry;
- for_each_irq_pin(entry, cfg->irq_2_pin)
- __io_apic_modify_irq(entry, mask_and, mask_or, final);
-}
-
-static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
-{
- __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
- IO_APIC_REDIR_MASKED, NULL);
-}
+ data->entry.masked = masked;
-static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
-{
- __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
- IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
-}
-
-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
-{
- io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1);
+ if (final)
+ final(entry);
+ }
}
+/*
+ * Synchronize the IO-APIC and the CPU by doing a dummy read from the
+ * IO-APIC
+ */
static void io_apic_sync(struct irq_pin_list *entry)
{
- /*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
struct io_apic __iomem *io_apic;
+
io_apic = io_apic_base(entry->apic);
readl(&io_apic->data);
}
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
{
- io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ struct mp_chip_data *data = irq_data->chip_data;
+
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ io_apic_modify_irq(data, true, &io_apic_sync);
}
-static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void __unmask_ioapic(struct mp_chip_data *data)
{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
-
- BUG_ON(!cfg);
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __mask_IO_APIC_irq(cfg);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ io_apic_modify_irq(data, false, NULL);
}
-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
{
- struct irq_cfg *cfg = desc->chip_data;
- unsigned long flags;
+ struct mp_chip_data *data = irq_data->chip_data;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __unmask_IO_APIC_irq(cfg);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ __unmask_ioapic(data);
}
-static void mask_IO_APIC_irq(unsigned int irq)
+/*
+ * IO-APIC versions below 0x20 don't support the EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH specs (ICH2 to ICH5) document the io-apic
+ * version as 0x2. This is a documentation error: these ICH chips
+ * actually use io-apics of version 0x20.
+ *
+ * For IO-APICs with an EOI register, we use it to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ */
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ if (mpc_ioapic_ver(apic) >= 0x20) {
+ io_apic_eoi(apic, vector);
+ } else {
+ struct IO_APIC_route_entry entry, entry1;
+
+ entry = entry1 = __ioapic_read_entry(apic, pin);
+
+ /* Mask the entry and change the trigger mode to edge. */
+ entry1.masked = true;
+ entry1.is_level = false;
+
+ __ioapic_write_entry(apic, pin, entry1);
- mask_IO_APIC_irq_desc(desc);
+ /* Restore the previous level triggered entry. */
+ __ioapic_write_entry(apic, pin, entry);
+ }
}
-static void unmask_IO_APIC_irq(unsigned int irq)
+
+static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_pin_list *entry;
- unmask_IO_APIC_irq_desc(desc);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, vector);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -628,21 +471,50 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
+ if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI)
return;
+
+ /*
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remote-IRR is set.
+ */
+ if (!entry.masked) {
+ entry.masked = true;
+ ioapic_write_entry(apic, pin, entry);
+ entry = ioapic_read_entry(apic, pin);
+ }
+
+ if (entry.irr) {
+ /*
+ * Make sure the trigger mode is set to level. Explicit EOI
+ * doesn't clear the remote-IRR if the trigger mode is not
+ * set to level.
+ */
+ if (!entry.is_level) {
+ entry.is_level = true;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ __eoi_ioapic_pin(apic, pin, entry.vector);
+ }
+
/*
- * Disable it in the IO-APIC irq-routing table:
+ * Clear the rest of the bits in the IO-APIC RTE except for the mask
+ * bit.
*/
ioapic_mask_entry(apic, pin);
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.irr)
+ pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
+ mpc_ioapic_id(apic), pin);
}
-static void clear_IO_APIC (void)
+void clear_IO_APIC (void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- clear_IO_APIC_pin(apic, pin);
+ for_each_ioapic_pin(apic, pin)
+ clear_IO_APIC_pin(apic, pin);
}
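
for_each_ioapic_pin() is assumed here to be the usual nested iterator over every IO-APIC and each of its pins; a hypothetical expansion in terms of the ioapics[] array used elsewhere in this patch:

	#define for_each_ioapic_pin(idx, pin)					\
		for ((idx) = 0; (idx) < nr_ioapics; (idx)++)			\
			for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)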
#ifdef CONFIG_X86_32
@@ -658,101 +530,64 @@ static int pirq_entries[MAX_PIRQS] = {
static int __init ioapic_pirq_setup(char *str)
{
- int i, max;
- int ints[MAX_PIRQS+1];
+ int i, max, ints[MAX_PIRQS+1];
get_options(str, ARRAY_SIZE(ints), ints);
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
+ apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n");
+
max = MAX_PIRQS;
if (ints[0] < MAX_PIRQS)
max = ints[0];
for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
+ apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]);
+ /* PIRQs are mapped upside down, usually */
pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
}
return 1;
}
-
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
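
A worked example of the option parsing above (values illustrative): booting with pirq=0,9 makes get_options() fill ints[] = { 2, 0, 9 }, so the loop stores

	pirq_entries[MAX_PIRQS - 1] = 0;	/* PIRQ0 disabled */
	pirq_entries[MAX_PIRQS - 2] = 9;	/* PIRQ1 -> IRQ 9 */

which is the "mapped upside down" layout the comment refers to.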
-struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
- int apic;
- struct IO_APIC_route_entry **ioapic_entries;
-
- ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
- GFP_ATOMIC);
- if (!ioapic_entries)
- return 0;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- ioapic_entries[apic] =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- nr_ioapic_registers[apic], GFP_ATOMIC);
- if (!ioapic_entries[apic])
- goto nomem;
- }
-
- return ioapic_entries;
-
-nomem:
- while (--apic >= 0)
- kfree(ioapic_entries[apic]);
- kfree(ioapic_entries);
-
- return 0;
-}
-
/*
* Saves all the IO-APIC RTE's
*/
-int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int save_ioapic_entries(void)
{
int apic, pin;
+ int err = 0;
- if (!ioapic_entries)
- return -ENOMEM;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- ioapic_entries[apic][pin] =
- ioapic_read_entry(apic, pin);
+ for_each_pin(apic, pin)
+ ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin);
}
- return 0;
+ return err;
}
/*
* Mask all IO APIC entries.
*/
-void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+void mask_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- break;
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ for_each_pin(apic, pin) {
struct IO_APIC_route_entry entry;
- entry = ioapic_entries[apic][pin];
- if (!entry.mask) {
- entry.mask = 1;
+ entry = ioapics[apic].saved_registers[pin];
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -760,49 +595,36 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
}
/*
- * Restore IO APIC entries which was saved in ioapic_entries.
+ * Restore IO APIC entries which were saved in the ioapic structure.
*/
-int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int restore_ioapic_entries(void)
{
int apic, pin;
- if (!ioapic_entries)
- return -ENOMEM;
-
- for (apic = 0; apic < nr_ioapics; apic++) {
- if (!ioapic_entries[apic])
- return -ENOMEM;
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers)
+ continue;
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
- ioapic_write_entry(apic, pin,
- ioapic_entries[apic][pin]);
+ for_each_pin(apic, pin)
+ ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]);
}
return 0;
}
-void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
-{
- int apic;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- kfree(ioapic_entries[apic]);
-
- kfree(ioapic_entries);
-}
-
/*
* Find the IRQ entry number of a certain pin.
*/
-static int find_irq_entry(int apic, int pin, int type)
+static int find_irq_entry(int ioapic_idx, int pin, int type)
{
int i;
- for (i = 0; i < mp_irq_entries; i++)
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].irqtype == type &&
- (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+ (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
+ }
return -1;
}
@@ -817,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
-
return mp_irqs[i].dstirq;
}
return -1;
@@ -833,847 +653,549 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
break;
}
+
if (i < mp_irq_entries) {
- int apic;
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
- return apic;
+ int ioapic_idx;
+
+ for_each_ioapic(ioapic_idx) {
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+ return ioapic_idx;
}
}
return -1;
}
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < legacy_pic->nr_legacy_irqs) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (1)
-#define default_PCI_polarity(idx) (1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
-static int MPBIOS_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].irqflag & 3)
- {
- case 0: /* conforms, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- polarity = default_ISA_polarity(idx);
- else
- polarity = default_PCI_polarity(idx);
- break;
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
+ case MP_IRQPOL_DEFAULT:
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
+ case MP_IRQPOL_ACTIVE_HIGH:
+ return false;
+ case MP_IRQPOL_RESERVED:
+ pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ fallthrough;
+ case MP_IRQPOL_ACTIVE_LOW:
+ default: /* Pointless default required due to gcc stupidity */
+ return true;
}
- return polarity;
}
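
The irqflag decoding above assumes the standard MP table encoding, polarity in bits 1:0 and trigger mode in bits 3:2. A sketch of the constants (per the MP specification; the real definitions live in mpspec_def.h):

	#define MP_IRQPOL_MASK		0x03	/* bits 1:0 of irqflag */
	#define MP_IRQTRIG_MASK		0x0c	/* bits 3:2 of irqflag */

	/* e.g. irqflag == 0x0f decodes as active low, level triggered */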
-static int MPBIOS_trigger(int idx)
+#ifdef CONFIG_EISA
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
{
- int bus = mp_irqs[idx].srcbus;
- int trigger;
-
- /*
- * Determine IRQ trigger mode (edge or level sensitive):
- */
- switch ((mp_irqs[idx].irqflag>>2) & 3)
- {
- case 0: /* conforms, ie. bus-type dependent */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus]) {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
- break;
- case 1: /* edge */
- {
- trigger = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- case 3: /* level */
- {
- trigger = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
- break;
- }
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = PIC_ELCR1 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
}
- return trigger;
+ apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq);
+ return false;
}
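
A worked example of the ELCR arithmetic above: for irq = 10 the port is PIC_ELCR1 + (10 >> 3), i.e. the second ELCR byte (conventionally 0x4d1), and the trigger bit is (inb(port) >> (10 & 7)) & 1, i.e. bit 2; a set bit means the line is level triggered.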
-static inline int irq_polarity(int idx)
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
- return MPBIOS_polarity(idx);
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_PCI:
+ case MP_BUS_ISA:
+ return level;
+ case MP_BUS_EISA:
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
+ }
+ pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+ return true;
}
-
-static inline int irq_trigger(int idx)
+#else
+static inline int eisa_irq_is_level(int idx, int bus, bool level)
{
- return MPBIOS_trigger(idx);
+ return level;
}
+#endif
-static int pin_2_irq(int idx, int apic, int pin)
+static bool irq_is_level(int idx)
{
- int irq;
int bus = mp_irqs[idx].srcbus;
+ bool level;
/*
- * Debugging check, we are in big trouble if this message pops up!
- */
- if (mp_irqs[idx].dstirq != pin)
- printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].srcbusirq;
- } else {
- u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
-
- if (gsi >= NR_IRQS_LEGACY)
- irq = gsi;
- else
- irq = gsi_top + gsi;
- }
-
-#ifdef CONFIG_X86_32
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ * Determine IRQ trigger mode (edge or level sensitive):
*/
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
+ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
+ case MP_IRQTRIG_DEFAULT:
+ /*
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to level, ISA to edge.
+ */
+ level = !test_bit(bus, mp_bus_not_pci);
+ /* Take EISA into account */
+ return eisa_irq_is_level(idx, bus, level);
+ case MP_IRQTRIG_EDGE:
+ return false;
+ case MP_IRQTRIG_RESERVED:
+ pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ fallthrough;
+ case MP_IRQTRIG_LEVEL:
+ default: /* Pointless default required due to gcc stupidity */
+ return true;
}
-#endif
-
- return irq;
}
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
- struct io_apic_irq_attr *irq_attr)
+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
{
- int apic, i, best_guess = -1;
+ int ioapic, pin, idx;
- apic_printk(APIC_DEBUG,
- "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE,
- "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ if (ioapic_is_disabled)
return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].srcbus;
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
- break;
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
- if (pin == (mp_irqs[i].srcbusirq & 3)) {
- set_io_apic_irq_attr(irq_attr, apic,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- return irq;
- }
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0) {
- set_io_apic_irq_attr(irq_attr, apic,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- best_guess = irq;
- }
- }
- }
- return best_guess;
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-void lock_vector_lock(void)
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
{
- /* Used to the online set of cpus does not change
- * during assign_irq_vector.
- */
- raw_spin_lock(&vector_lock);
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
}
+#endif
-void unlock_vector_lock(void)
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+ int trigger, int polarity)
{
- raw_spin_unlock(&vector_lock);
+ init_irq_alloc_info(info, NULL);
+ info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info->ioapic.node = node;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
+ info->ioapic.valid = 1;
}
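
A hedged usage sketch of the helper above (the attribute values are illustrative, not taken from a specific caller):

	struct irq_alloc_info info;

	/* level triggered, active low, no NUMA preference */
	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 1);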
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src,
+ u32 gsi, int ioapic_idx, int pin)
{
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
- static int current_offset = VECTOR_OFFSET_START % 8;
- unsigned int old_vector;
- int cpu, err;
- cpumask_var_t tmp_mask;
-
- if (cfg->move_in_progress)
- return -EBUSY;
-
- if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
- return -ENOMEM;
-
- old_vector = cfg->vector;
- if (old_vector) {
- cpumask_and(tmp_mask, mask, cpu_online_mask);
- cpumask_and(tmp_mask, cfg->domain, tmp_mask);
- if (!cpumask_empty(tmp_mask)) {
- free_cpumask_var(tmp_mask);
- return 0;
- }
- }
-
- /* Only try and allocate irqs on cpus that are present */
- err = -ENOSPC;
- for_each_cpu_and(cpu, mask, cpu_online_mask) {
- int new_cpu;
- int vector, offset;
-
- apic->vector_allocation_domain(cpu, tmp_mask);
-
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 8;
- if (vector >= first_system_vector) {
- /* If out of vectors on large boxen, must share them. */
- offset = (offset + 1) % 8;
- vector = FIRST_EXTERNAL_VECTOR + offset;
- }
- if (unlikely(current_vector == vector))
- continue;
+ bool level, pol_low;
- if (test_bit(vector, used_vectors))
- goto next;
-
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- if (per_cpu(vector_irq, new_cpu)[vector] != -1)
- goto next;
- /* Found one! */
- current_vector = vector;
- current_offset = offset;
- if (old_vector) {
- cfg->move_in_progress = 1;
- cpumask_copy(cfg->old_domain, cfg->domain);
+ copy_irq_alloc_info(dst, src);
+ dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ dst->devid = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic.pin = pin;
+ dst->ioapic.valid = 1;
+ if (src && src->ioapic.valid) {
+ dst->ioapic.node = src->ioapic.node;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
+ } else {
+ dst->ioapic.node = NUMA_NO_NODE;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
+ } else {
+ /*
+ * PCI interrupts are always active low level
+ * triggered.
+ */
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq;
- cfg->vector = vector;
- cpumask_copy(cfg->domain, tmp_mask);
- err = 0;
- break;
}
- free_cpumask_var(tmp_mask);
- return err;
}
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
{
- int err;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
- return err;
+ return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}
-static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
+static void mp_register_handler(unsigned int irq, bool level)
{
- int cpu, vector;
+ irq_flow_handler_t hdl;
+ bool fasteoi;
- BUG_ON(!cfg->vector);
-
- vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = -1;
-
- cfg->vector = 0;
- cpumask_clear(cfg->domain);
-
- if (likely(!cfg->move_in_progress))
- return;
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != irq)
- continue;
- per_cpu(vector_irq, cpu)[vector] = -1;
- break;
- }
+ if (level) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
}
- cfg->move_in_progress = 0;
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
}
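
For context on the handler choice above: handle_fasteoi_irq() issues a single EOI at the end of the flow, which suits level-triggered IO-APIC pins, while handle_edge_irq() acks early so a new edge arriving mid-handler is not lost. Both are the kernel's standard flow handlers.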
-void __setup_vector_irq(int cpu)
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
{
- /* Initialize vector_irq on a new cpu */
- int irq, vector;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
+ struct mp_chip_data *data = irq_get_chip_data(irq);
/*
- * vector_lock will make sure that we don't run into irq vector
- * assignments that might be happening on another cpu in parallel,
- * while we setup our initial vector to irq mappings.
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+ * and polarity attributes. So allow the first user to reprogram the
+ * pin with real trigger and polarity attributes.
*/
- raw_spin_lock(&vector_lock);
- /* Mark the inuse vectors */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
-
- /*
- * If it is a legacy IRQ handled by the legacy PIC, this cpu
- * will be part of the irq_cfg's domain.
- */
- if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
- cpumask_set_cpu(cpu, cfg->domain);
-
- if (!cpumask_test_cpu(cpu, cfg->domain))
- continue;
- vector = cfg->vector;
- per_cpu(vector_irq, cpu)[vector] = irq;
+ if (irq < nr_legacy_irqs() && data->count == 1) {
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.is_level = data->is_level = info->ioapic.is_level;
+ data->entry.active_low = data->active_low = info->ioapic.active_low;
}
- /* Mark the free vectors */
- for (vector = 0; vector < NR_VECTORS; ++vector) {
- irq = per_cpu(vector_irq, cpu)[vector];
- if (irq < 0)
- continue;
- cfg = irq_cfg(irq);
- if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = -1;
- }
- raw_spin_unlock(&vector_lock);
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}
-static struct irq_chip ioapic_chip;
-static struct irq_chip ir_ioapic_chip;
-
-#define IOAPIC_AUTO -1
-#define IOAPIC_EDGE 0
-#define IOAPIC_LEVEL 1
-
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+ struct irq_alloc_info *info)
{
- int apic, idx, pin;
+ int type = ioapics[ioapic].irqdomain_cfg.type;
+ bool legacy = false;
+ int irq = -1;
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
+ switch (type) {
+ case IOAPIC_DOMAIN_LEGACY:
+ /*
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first
+ * 16 GSIs on some weird platforms.
+ */
+ if (!ioapic_initialized || gsi >= nr_legacy_irqs())
+ irq = gsi;
+ legacy = mp_is_legacy_irq(irq);
+ break;
+ case IOAPIC_DOMAIN_STRICT:
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_DYNAMIC:
+ break;
+ default:
+ WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+ return -1;
}
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
+
+ return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info),
+ info, legacy, NULL);
}
-#endif
-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin,
+ struct irq_alloc_info *info)
{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ int node = ioapic_alloc_attr_node(info);
+ struct mp_chip_data *data;
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- desc->status |= IRQ_LEVEL;
- else
- desc->status &= ~IRQ_LEVEL;
-
- if (irq_remapped(irq)) {
- desc->status |= IRQ_MOVE_PCNTXT;
- if (trigger)
- set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
- handle_fasteoi_irq,
- "fasteoi");
- else
- set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
- handle_edge_irq, "edge");
- return;
+ /*
+ * Legacy ISA IRQ has already been allocated, just add pin to
+ * the pin list associated with this IRQ and program the IOAPIC
+ * entry.
+ */
+ if (irq_data && irq_data->parent_data) {
+ if (!mp_check_pin_attr(irq, info))
+ return -EBUSY;
+ if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin))
+ return -ENOMEM;
+ } else {
+ info->flags |= X86_IRQ_ALLOC_LEGACY;
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL);
+ if (irq >= 0) {
+ irq_data = irq_domain_get_irq_data(domain, irq);
+ data = irq_data->chip_data;
+ data->isa_irq = true;
+ }
}
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_fasteoi_irq,
- "fasteoi");
- else
- set_irq_chip_and_handler_name(irq, &ioapic_chip,
- handle_edge_irq, "edge");
+ return irq;
}
-int setup_ioapic_entry(int apic_id, int irq,
- struct IO_APIC_route_entry *entry,
- unsigned int destination, int trigger,
- int polarity, int vector, int pin)
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+ unsigned int flags, struct irq_alloc_info *info)
{
- /*
- * add it to the IO-APIC irq-routing table:
- */
- memset(entry,0,sizeof(*entry));
-
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
- struct irte irte;
- struct IR_IO_APIC_route_entry *ir_entry =
- (struct IR_IO_APIC_route_entry *) entry;
- int index;
-
- if (!iommu)
- panic("No mapping iommu for ioapic %d\n", apic_id);
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
- panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ struct irq_alloc_info tmp;
+ struct mp_chip_data *data;
+ bool legacy = false;
+ int irq;
- memset(&irte, 0, sizeof(irte));
+ if (!domain)
+ return -ENOSYS;
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
+ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].srcbusirq;
+ legacy = mp_is_legacy_irq(irq);
/*
- * Trigger mode in the IRTE will always be edge, and the
- * actual level or edge trigger will be setup in the IO-APIC
- * RTE. This will help simplify level triggered irq migration.
- * For more details, see the comments above explainig IO-APIC
- * irq migration in the presence of interrupt-remapping.
+ * IRQ2 is unusable for historical reasons on systems which
+ * have a legacy PIC. See the comment vs. IRQ2 further down.
+ *
+ * If this gets removed at some point then the related code
+ * in lapic_assign_system_vectors() needs to be adjusted as
+ * well.
*/
- irte.trigger_mode = 0;
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = vector;
- irte.dest_id = IRTE_DEST(destination);
-
- /* Set source-id of interrupt request */
- set_ioapic_sid(&irte, apic_id);
-
- modify_irte(irq, &irte);
+ if (legacy && irq == PIC_CASCADE_IR)
+ return -EINVAL;
+ }
- ir_entry->index2 = (index >> 15) & 0x1;
- ir_entry->zero = 0;
- ir_entry->format = 1;
- ir_entry->index = (index & 0x7fff);
- /*
- * IO-APIC RTE will be configured with virtual vector.
- * irq handler will do the explicit EOI to the io-apic.
- */
- ir_entry->vector = pin;
+ guard(mutex)(&ioapic_mutex);
+ if (!(flags & IOAPIC_MAP_ALLOC)) {
+ if (!legacy) {
+ irq = irq_find_mapping(domain, pin);
+ if (irq == 0)
+ irq = -ENOENT;
+ }
} else {
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = destination;
- entry->vector = vector;
+ ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+ if (legacy)
+ irq = alloc_isa_irq_from_domain(domain, irq,
+ ioapic, pin, &tmp);
+ else if ((irq = irq_find_mapping(domain, pin)) == 0)
+ irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+ else if (!mp_check_pin_attr(irq, &tmp))
+ irq = -EBUSY;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ data->count++;
+ }
}
-
- entry->mask = 0; /* enable IRQ */
- entry->trigger = trigger;
- entry->polarity = polarity;
-
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (trigger)
- entry->mask = 1;
- return 0;
+ return irq;
}
-static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
- int trigger, int polarity)
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
{
- struct irq_cfg *cfg;
- struct IO_APIC_route_entry entry;
- unsigned int dest;
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
- if (!IO_APIC_IRQ(irq))
- return;
+ /* Debugging check, we are in big trouble if this message pops up! */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
- cfg = desc->chip_data;
+#ifdef CONFIG_X86_32
+ /* PCI IRQ command line redirection. Yes, limits are hardcoded. */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin - 16] != -1) {
+ if (!pirq_entries[pin - 16]) {
+ apic_pr_verbose("Disabling PIRQ%d\n", pin - 16);
+ } else {
+ int irq = pirq_entries[pin-16];
- /*
- * For legacy irqs, cfg->domain starts with cpu 0 for legacy
- * controllers like 8259. Now that IO-APIC can handle this irq, update
- * the cfg->domain.
- */
- if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
- apic->vector_allocation_domain(0, cfg->domain);
+ apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq);
+ return irq;
+ }
+ }
+ }
+#endif
- if (assign_irq_vector(irq, cfg, apic->target_cpus()))
- return;
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
+}
+
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
+{
+ int ioapic, pin, idx;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -ENODEV;
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+ return -ENODEV;
- apic_printk(APIC_VERBOSE,KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
- "IRQ %d Mode:%i Active:%i)\n",
- apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
- irq, trigger, polarity);
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
+}
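
A hedged usage sketch of the lookup above (the GSI value is hypothetical; the flags are the IOAPIC_MAP_* bits consumed by mp_map_pin_to_irq()):

	/* Translate GSI 20 to a Linux IRQ, allocating the mapping if needed */
	int irq = mp_map_gsi_to_irq(20, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK, NULL);

	if (irq < 0)
		pr_err("no IRQ for GSI 20: %d\n", irq);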
+void mp_unmap_irq(int irq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ struct mp_chip_data *data;
- if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
- dest, trigger, polarity, cfg->vector, pin)) {
- printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
- mp_ioapics[apic_id].apicid, pin);
- __clear_irq_vector(irq, cfg);
+ if (!irq_data || !irq_data->domain)
return;
- }
- ioapic_register_intr(irq, desc, trigger);
- if (irq < legacy_pic->nr_legacy_irqs)
- legacy_pic->chip->mask(irq);
+ data = irq_data->chip_data;
+ if (!data || data->isa_irq)
+ return;
- ioapic_write_entry(apic_id, pin, entry);
+ guard(mutex)(&ioapic_mutex);
+ if (--data->count == 0)
+ irq_domain_free_irqs(irq, 1);
}
-static struct {
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static void __init setup_IO_APIC_irqs(void)
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
- int apic_id, pin, idx, irq;
- int notcon = 0;
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int node = cpu_to_node(boot_cpu_id);
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
- for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
- idx = find_irq_entry(apic_id, pin, mp_INT);
- if (idx == -1) {
- if (!notcon) {
- notcon = 1;
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- } else
- apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- continue;
- }
- if (notcon) {
- apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
- notcon = 0;
- }
+ int irq, i, best_ioapic = -1, best_idx = -1;
- irq = pin_2_irq(idx, apic_id, pin);
+ apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+ int ioapic_idx, found = 0;
- if ((apic_id > 0) && (irq > 16))
+ if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+ slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
continue;
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(apic_id, irq))
+ for_each_ioapic(ioapic_idx)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) {
+ found = 1;
+ break;
+ }
+ if (!found)
continue;
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ /* Skip ISA IRQs */
+ irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+ if (irq > 0 && !IO_APIC_IRQ(irq))
continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ goto out;
}
- cfg = desc->chip_data;
- add_pin_to_irq_node(cfg, node, apic_id, pin);
+
/*
- * don't mark it in pin_programmed, so later acpi could
- * set it correctly when irq < 16
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
*/
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ if (best_idx < 0) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ }
}
+ if (best_idx < 0)
+ return -1;
- if (notcon)
- apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
+out:
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC);
}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-/*
- * for the gsit that is not in first ioapic
- * but could not use acpi_register_gsi()
- * like some special sci in IBM x3330
- */
-void setup_IO_APIC_irq_extra(u32 gsi)
-{
- int apic_id = 0, pin, idx, irq;
- int node = cpu_to_node(boot_cpu_id);
- struct irq_desc *desc;
- struct irq_cfg *cfg;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- apic_id = mp_find_ioapic(gsi);
- if (apic_id < 0)
- return;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
- pin = mp_find_ioapic_pin(apic_id, gsi);
- idx = find_irq_entry(apic_id, pin, mp_INT);
- if (idx == -1)
- return;
-
- irq = pin_2_irq(idx, apic_id, pin);
-#ifdef CONFIG_SPARSE_IRQ
- desc = irq_to_desc(irq);
- if (desc)
- return;
-#endif
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
- return;
- }
+static void __init setup_IO_APIC_irqs(void)
+{
+ unsigned int ioapic, pin;
+ int idx;
- cfg = desc->chip_data;
- add_pin_to_irq_node(cfg, node, apic_id, pin);
+ apic_pr_verbose("Init IO_APIC IRQs\n");
- if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mp_ioapics[apic_id].apicid, pin);
- return;
+ for_each_ioapic_pin(ioapic, pin) {
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0) {
+ apic_pr_verbose("apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ } else {
+ pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
}
- set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+}
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+void ioapic_zap_locks(void)
+{
+ raw_spin_lock_init(&ioapic_lock);
}
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
- int vector)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
struct IO_APIC_route_entry entry;
+ char buf[256];
+ int i;
- if (intr_remapping_enabled)
- return;
-
- memset(&entry, 0, sizeof(entry));
-
- /*
- * We use logical delivery to get the timer IRQ
- * to the first CPU.
- */
- entry.dest_mode = apic->irq_dest_mode;
- entry.mask = 0; /* don't mask IRQ for edge */
- entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
- entry.delivery_mode = apic->irq_delivery_mode;
- entry.polarity = 0;
- entry.trigger = 0;
- entry.vector = vector;
-
- /*
- * The timer IRQ doesn't have to know that behind the
- * scene we may have a 8259A-master in AEOI mode ...
- */
- set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
- ioapic_write_entry(apic_id, pin, entry);
+ apic_dbg("IOAPIC %d:\n", apic);
+ for (i = 0; i <= nr_entries; i++) {
+ entry = ioapic_read_entry(apic, i);
+ snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i, entry.masked ? "disabled" : "enabled ",
+ entry.is_level ? "level" : "edge ",
+ entry.active_low ? "low " : "high",
+ entry.vector, entry.irr, entry.delivery_status);
+ if (entry.ir_format) {
+ apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf,
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
+ } else {
+ apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physical",
+ entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
+ }
+ }
}
-
-__apicdebuginit(void) print_IO_APIC(void)
+static void __init print_IO_APIC(int ioapic_idx)
{
- int apic, i;
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
union IO_APIC_reg_03 reg_03;
- unsigned long flags;
- struct irq_cfg *cfg;
- struct irq_desc *desc;
- unsigned int irq;
-
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (i = 0; i < nr_ioapics; i++)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].apicid, nr_ioapic_registers[i]);
-
- /*
- * We are a bit conservative about what we expect. We have to
- * know about every hardware change ASAP.
- */
- printk(KERN_INFO "testing the IO APIC.......................\n");
-
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- reg_01.raw = io_apic_read(apic, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(apic, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(apic, 3);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
- printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ }
+
+ apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ apic_dbg(".... register #00: %08X\n", reg_00.raw);
+ apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS);
+ apic_dbg(".... register #01: %08X\n", *(int *)&reg_01);
+ apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries);
+ apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1681,8 +1203,8 @@ __apicdebuginit(void) print_IO_APIC(void)
* value, so ignore it if reg_02 == reg_01.
*/
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ apic_dbg(".... register #02: %08X\n", reg_02.raw);
+ apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
/*
@@ -1692,286 +1214,90 @@ __apicdebuginit(void) print_IO_APIC(void)
*/
if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ apic_dbg(".... register #03: %08X\n", reg_03.raw);
+ apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT);
}
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
- printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
- " Stat Dmod Deli Vect:\n");
-
- for (i = 0; i <= reg_01.bits.entries; i++) {
- struct IO_APIC_route_entry entry;
-
- entry = ioapic_read_entry(apic, i);
-
- printk(KERN_DEBUG " %02x %03X ",
- i,
- entry.dest
- );
-
- printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
- entry.mask,
- entry.trigger,
- entry.irr,
- entry.polarity,
- entry.delivery_status,
- entry.dest_mode,
- entry.delivery_mode,
- entry.vector
- );
- }
- }
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
- for_each_irq_desc(irq, desc) {
- struct irq_pin_list *entry;
-
- cfg = desc->chip_data;
- entry = cfg->irq_2_pin;
- if (!entry)
- continue;
- printk(KERN_DEBUG "IRQ%d ", irq);
- for_each_irq_pin(entry, cfg->irq_2_pin)
- printk("-> %d:%d", entry->apic, entry->pin);
- printk("\n");
- }
-
- printk(KERN_INFO ".................................... done.\n");
-
- return;
+ apic_dbg(".... IRQ redirection table:\n");
+ io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
-__apicdebuginit(void) print_APIC_field(int base)
+void __init print_IO_APICs(void)
{
- int i;
-
- printk(KERN_DEBUG);
-
- for (i = 0; i < 8; i++)
- printk(KERN_CONT "%08x", apic_read(base + i*0x10));
-
- printk(KERN_CONT "\n");
-}
+ int ioapic_idx;
+ unsigned int irq;
-__apicdebuginit(void) print_local_APIC(void *dummy)
-{
- unsigned int i, v, ver, maxlvt;
- u64 icr;
-
- printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
- v = apic_read(APIC_LVR);
- printk(KERN_INFO "... APIC VERSION: %08x\n", v);
- ver = GET_APIC_VERSION(v);
- maxlvt = lapic_get_maxlvt();
-
- v = apic_read(APIC_TASKPRI);
- printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (!APIC_XAPIC(ver)) {
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- }
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for_each_ioapic(ioapic_idx) {
+ apic_dbg("number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers);
}
/*
- * Remote read supported only in the 82489DX and local APIC for
- * Pentium processors.
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
*/
- if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
- }
-
- v = apic_read(APIC_LDR);
- printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- if (!x2apic_enabled()) {
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
- }
- v = apic_read(APIC_SPIV);
- printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
- printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_field(APIC_ISR);
- printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_field(APIC_TMR);
- printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_field(APIC_IRR);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
-
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
- }
-
- icr = apic_icr_read();
- printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
-
- v = apic_read(APIC_LVTT);
- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
- if (maxlvt > 3) { /* PC is LVT#4. */
- v = apic_read(APIC_LVTPC);
- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
- }
- v = apic_read(APIC_LVT0);
- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
- v = apic_read(APIC_LVT1);
- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
- if (maxlvt > 2) { /* ERR is LVT#3. */
- v = apic_read(APIC_LVTERR);
- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
- }
-
- v = apic_read(APIC_TMICT);
- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
- v = apic_read(APIC_TMCCT);
- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
- v = apic_read(APIC_TDCR);
- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
-
- if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
- v = apic_read(APIC_EFEAT);
- maxlvt = (v >> 16) & 0xff;
- printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
- v = apic_read(APIC_ECTRL);
- printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
- for (i = 0; i < maxlvt; i++) {
- v = apic_read(APIC_EILVTn(i));
- printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
- }
- }
- printk("\n");
-}
-
-__apicdebuginit(void) print_local_APICs(int maxcpu)
-{
- int cpu;
-
- if (!maxcpu)
- return;
-
- preempt_disable();
- for_each_online_cpu(cpu) {
- if (cpu >= maxcpu)
- break;
- smp_call_function_single(cpu, print_local_APIC, NULL, 1);
- }
- preempt_enable();
-}
-
-__apicdebuginit(void) print_PIC(void)
-{
- unsigned int v;
- unsigned long flags;
-
- if (!legacy_pic->nr_legacy_irqs)
- return;
-
- printk(KERN_DEBUG "\nprinting PIC contents\n");
-
- raw_spin_lock_irqsave(&i8259A_lock, flags);
-
- v = inb(0xa1) << 8 | inb(0x21);
- printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
-
- v = inb(0xa0) << 8 | inb(0x20);
- printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
-
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
- v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
-
- raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ printk(KERN_INFO "testing the IO APIC.......................\n");
- printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+ for_each_ioapic(ioapic_idx)
+ print_IO_APIC(ioapic_idx);
- v = inb(0x4d1) << 8 | inb(0x4d0);
- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
+ apic_dbg("IRQ to pin mappings:\n");
+ for_each_active_irq(irq) {
+ struct irq_pin_list *entry;
+ struct irq_chip *chip;
+ struct mp_chip_data *data;
-static int __initdata show_lapic = 1;
-static __init int setup_show_lapic(char *arg)
-{
- int num = -1;
+ chip = irq_get_chip(irq);
+ if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
+ continue;
+ data = irq_get_chip_data(irq);
+ if (!data)
+ continue;
+ if (list_empty(&data->irq_2_pin))
+ continue;
- if (strcmp(arg, "all") == 0) {
- show_lapic = CONFIG_NR_CPUS;
- } else {
- get_option(&arg, &num);
- if (num >= 0)
- show_lapic = num;
+ apic_dbg("IRQ%d ", irq);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ pr_cont("-> %d:%d", entry->apic, entry->pin);
+ pr_cont("\n");
}
- return 1;
-}
-__setup("show_lapic=", setup_show_lapic);
-
-__apicdebuginit(int) print_ICs(void)
-{
- if (apic_verbosity == APIC_QUIET)
- return 0;
-
- print_PIC();
-
- /* don't print out if apic is not there */
- if (!cpu_has_apic && !apic_from_smp_config())
- return 0;
-
- print_local_APICs(show_lapic);
- print_IO_APIC();
-
- return 0;
+ printk(KERN_INFO ".................................... done.\n");
}
-fs_initcall(print_ICs);
-
-
/* Where, if anywhere, is the i8259 connected in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
- int i8259_apic, i8259_pin;
- int apic;
+ int i8259_apic, i8259_pin, apic, pin;
+
+ if (ioapic_is_disabled)
+ nr_ioapics = 0;
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs() || !nr_ioapics)
return;
- for(apic = 0; apic < nr_ioapics; apic++) {
- int pin;
+ for_each_ioapic_pin(apic, pin) {
/* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
+ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
+ /*
+ * If the interrupt line is enabled and in ExtInt mode, we have
+ * found the pin where the i8259 is connected.
+ */
+ if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ break;
}
}
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
+
+ /*
+ * Look to see if the MP table has reported the ExtINT
+ *
+ * If we could not find the appropriate pin by looking at the ioapic,
 * the i8259 probably is not connected to the ioapic, but give the
* mptable a chance anyway.
*/
@@ -1979,72 +1305,52 @@ void __init enable_IO_APIC(void)
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ pr_warn("ExtINT not setup in hardware but reported by MP table\n");
ioapic_i8259.pin = i8259_pin;
ioapic_i8259.apic = i8259_apic;
}
/* Complain if the MP table and the hardware disagree */
if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ pr_warn("ExtINT in hardware and MP table differ\n");
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
+ /* Do not trust the IO-APIC being empty at bootup */
clear_IO_APIC();
}
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
+void native_restore_boot_irq_mode(void)
{
/*
- * Clear the IO-APIC before rebooting:
+ * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+ * virtual wire mode so legacy interrupts can be delivered.
*/
- clear_IO_APIC();
-
- if (!legacy_pic->nr_legacy_irqs)
- return;
-
- /*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
- *
- * With interrupt-remapping, for now we will use virtual wire A mode,
- * as virtual wire B is little complex (need to configure both
- * IOAPIC RTE aswell as interrupt-remapping table entry).
- * As this gets called during crash dump, keep this simple for now.
- */
- if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
+ u32 apic_id = read_apic_id();
memset(&entry, 0, sizeof(entry));
- entry.mask = 0; /* Enabled */
- entry.trigger = 0; /* Edge */
- entry.irr = 0;
- entry.polarity = 0; /* High */
- entry.delivery_status = 0;
- entry.dest_mode = 0; /* Physical */
- entry.delivery_mode = dest_ExtINT; /* ExtInt */
- entry.vector = 0;
- entry.dest = read_apic_id();
-
- /*
- * Add it to the IO-APIC irq-routing table:
- */
+ entry.masked = false;
+ entry.is_level = false;
+ entry.active_low = false;
+ entry.dest_mode_logical = false;
+ entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry.destid_0_7 = apic_id & 0xFF;
+ entry.virt_destid_8_14 = apic_id >> 8;
+
+ /* Add it to the IO-APIC irq-routing table */
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
- /*
- * Use virtual wire A mode when interrupt remapping is enabled.
- */
- if (cpu_has_apic || apic_from_smp_config())
- disconnect_bsp_APIC(!intr_remapping_enabled &&
- ioapic_i8259.pin != -1);
+ if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config())
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+void restore_boot_irq_mode(void)
+{
+ if (!nr_legacy_irqs())
+ return;
+
+ x86_apic_ops.restore();
}
#ifdef CONFIG_X86_32
@@ -2054,49 +1360,35 @@ void disable_IO_APIC(void)
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-
-void __init setup_ioapic_ids_from_mpc(void)
+static void __init setup_ioapic_ids_from_mpc_nocheck(void)
{
+ DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int apic_id;
- int i;
unsigned char old_id;
- unsigned long flags;
+ int ioapic_idx, i;
- if (acpi_ioapic)
- return;
- /*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
/*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+ copy_phys_cpu_present_map(phys_id_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-
+ for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic_id, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic_id].apicid;
-
- if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
- mp_ioapics[apic_id].apicid = reg_00.bits.ID;
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+
+ old_id = mpc_ioapic_id(ioapic_idx);
+
+ if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) {
+ pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID);
+ ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
}
/*
@@ -2104,64 +1396,71 @@ void __init setup_ioapic_ids_from_mpc(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(&phys_id_present_map,
- mp_ioapics[apic_id].apicid)) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic_id, mp_ioapics[apic_id].apicid);
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
+ if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) {
+ pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < broadcast_id; i++)
+ if (!test_bit(i, phys_id_present_map))
break;
- if (i >= get_physical_broadcast())
+ if (i >= broadcast_id)
panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
- mp_ioapics[apic_id].apicid = i;
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", i);
+ set_bit(i, phys_id_present_map);
+ ioapics[ioapic_idx].mp_config.apicid = i;
} else {
- physid_mask_t tmp;
- apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
- mp_ioapics[apic_id].apicid);
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ apic_pr_verbose("Setting %d in the phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
}
-
/*
- * We need to adjust the IRQ routing table
- * if the ID changed.
+ * We need to adjust the IRQ routing table if the ID
+ * changed.
*/
- if (old_id != mp_ioapics[apic_id].apicid)
- for (i = 0; i < mp_irq_entries; i++)
+ if (old_id != mpc_ioapic_id(ioapic_idx)) {
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].dstapic == old_id)
- mp_irqs[i].dstapic
- = mp_ioapics[apic_id].apicid;
+ mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx);
+ }
+ }
/*
- * Read the right value from the MPC table and
- * write it into the ID register.
+ * Update the ID register according to the right value from
+ * the MPC table if they are different.
*/
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic_id].apicid);
+ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+ continue;
- reg_00.bits.ID = mp_ioapics[apic_id].apicid;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic_id, 0, reg_00.raw);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
- /*
- * Sanity check
- */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic_id, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
- printk("could not set ID!\n");
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ }
+ /* Sanity check */
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+ pr_cont("could not set ID!\n");
else
- apic_printk(APIC_VERBOSE, " ok.\n");
+ apic_pr_verbose(" ok.\n");
}
}
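
The duplicate-ID fixup in setup_ioapic_ids_from_mpc_nocheck() boils down to a first-clear-bit scan below the broadcast ID. A runnable single-word model of that scan (my simplification of the DECLARE_BITMAP/test_bit version, assuming fewer than BITS_PER_LONG IDs):

#include <stdio.h>

#define BROADCAST_ID 0xF	/* xAPIC physical broadcast, as above */

int main(void)
{
	unsigned long used = (1UL << 0) | (1UL << 1) | (1UL << 2);
	int i;

	for (i = 0; i < BROADCAST_ID; i++)
		if (!(used & (1UL << i)))
			break;
	if (i >= BROADCAST_ID) {
		fprintf(stderr, "Max APIC ID exceeded!\n");
		return 1;
	}
	printf("... fixing up to %d. (tell your hw vendor)\n", i);	/* 3 */
	return 0;
}
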
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(boot_cpu_apic_version))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
#endif
int no_timer_check __initdata;
@@ -2173,6 +1472,42 @@ static int __init notimercheck(char *s)
}
__setup("no_timer_check", notimercheck);
+static void __init delay_with_tsc(void)
+{
+ unsigned long long start, now;
+ unsigned long end = jiffies + 4;
+
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 40000000000/HZ TSC cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ */
+ do {
+ native_pause();
+ now = rdtsc();
+ } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
+}
+
+static void __init delay_without_tsc(void)
+{
+ unsigned long end = jiffies + 4;
+ int band = 1;
+
+ /*
+ * We don't know any frequency yet, but waiting for
+ * 40940000000/HZ cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094
+ */
+ do {
+ __delay(((1U << band++) * 10000000UL) / HZ);
+ } while (band < 12 && time_before_eq(jiffies, end));
+}
+
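
Both delay helpers rely only on the arithmetic quoted in their comments. A throwaway check (mine, not part of the patch) that 40000000000 cycles at 4 GHz and 1 GHz really correspond to 10 and 40 jiffies of waiting, and that the doubling bands sum to the quoted 4094:

#include <stdio.h>

int main(void)
{
	/* delay_with_tsc() spins for 40000000000/HZ cycles, which is
	 * 40000000000/freq seconds * HZ = 40000000000/freq jiffies */
	printf("4 GHz: %llu jiffies\n", 40000000000ULL / 4000000000ULL); /* 10 */
	printf("1 GHz: %llu jiffies\n", 40000000000ULL / 1000000000ULL); /* 40 */

	/* delay_without_tsc(): 1<<1 + 1<<2 + ... + 1<<11 = 4094 */
	unsigned long long sum = 0;
	for (int band = 1; band < 12; band++)
		sum += 1ULL << band;
	printf("band sum: %llu\n", sum);	/* 4094 */
	return 0;
}
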
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -2184,16 +1519,15 @@ __setup("no_timer_check", notimercheck);
static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
- unsigned long flags;
if (no_timer_check)
return 1;
- local_save_flags(flags);
local_irq_enable();
- /* Let ten ticks pass... */
- mdelay((10 * 1000) / HZ);
- local_irq_restore(flags);
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ delay_with_tsc();
+ else
+ delay_without_tsc();
/*
* Expect a few ticks at least, to be sure some possible
@@ -2203,10 +1537,10 @@ static int __init timer_irq_works(void)
* least one tick may be lost due to delays.
*/
- /* jiffies wrap? */
- if (time_after(jiffies, t1 + 4))
- return 1;
- return 0;
+ local_irq_disable();
+
+ /* Did jiffies advance? */
+ return time_after(jiffies, t1 + 4);
}
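
The rewritten tail of timer_irq_works() can drop the old explicit "jiffies wrap?" remark because time_after() is wrap-safe by construction. A small demonstration of the signed-difference idiom it uses (the macro matches the include/linux/jiffies.h definition; the test values are mine):

#include <limits.h>
#include <stdio.h>

/* Same idiom as the kernel's time_after(a, b): true if a is after b,
 * even across an unsigned wrap, via signed subtraction. */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long t1 = ULONG_MAX - 2;	/* just before the wrap */
	unsigned long now = t1 + 7;		/* wrapped past zero */

	printf("advanced: %d\n", time_after(now, t1 + 4));	/* 1 */
	return 0;
}
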
/*
@@ -2215,409 +1549,120 @@ static int __init timer_irq_works(void)
* so we 'resend' these IRQs via IPIs, to the same CPU. It's much
* better to do it this way as thus we do not have to be aware of
* 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
*
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
+ *
+ * Edge triggered interrupt handling needs to resend any interrupt that
+ * was delayed, but this is now handled in the device independent code.
+ *
+ * Starting up an edge-triggered IO-APIC interrupt is nasty - we need to
+ * make sure that we get the edge. If it is already asserted for some
+ * reason, we need to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake an edge even if it
+ * isn't on the 8259A...
*/
-
-static unsigned int startup_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(struct irq_data *data)
{
- int was_pending = 0;
- unsigned long flags;
- struct irq_cfg *cfg;
+ int was_pending = 0, irq = data->irq;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < legacy_pic->nr_legacy_irqs) {
- legacy_pic->chip->mask(irq);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ if (irq < nr_legacy_irqs()) {
+ legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- cfg = irq_cfg(irq);
- __unmask_IO_APIC_irq(cfg);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
+ __unmask_ioapic(data->chip_data);
return was_pending;
}
-static int ioapic_retrigger_irq(unsigned int irq)
-{
-
- struct irq_cfg *cfg = irq_cfg(irq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- return 1;
-}
-
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-#ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
-{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
- cfg->move_in_progress = 0;
-}
+atomic_t irq_mis_count;
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
- int apic, pin;
struct irq_pin_list *entry;
- u8 vector = cfg->vector;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ struct IO_APIC_route_entry e;
+ int pin;
- apic = entry->apic;
pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(irq))
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
+ e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (e.irr)
+ return true;
}
+ return false;
}
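
io_apic_level_ack_pending() only reads w1, the low dword of the RTE, because Remote IRR sits at bit 14 there. A hedged user-space model of that dword (bit positions per the 82093AA datasheet; the union below is mine, not the kernel's IO_APIC_route_entry, and it assumes a little-endian bitfield layout):

#include <stdint.h>
#include <stdio.h>

union rte_lo {
	struct {
		uint32_t vector            : 8;	/* bits 0-7 */
		uint32_t delivery_mode     : 3;	/* bits 8-10 */
		uint32_t dest_mode_logical : 1;	/* bit 11 */
		uint32_t delivery_status   : 1;	/* bit 12 */
		uint32_t active_low        : 1;	/* bit 13 */
		uint32_t irr               : 1;	/* bit 14: Remote IRR */
		uint32_t is_level          : 1;	/* bit 15 */
		uint32_t masked            : 1;	/* bit 16 */
	};
	uint32_t w1;
};

int main(void)
{
	union rte_lo e = { .w1 = 0x0000c031 };	/* level, IRR set, vector 0x31 */

	printf("vector=%#x level=%u irr=%u\n",
	       (unsigned)e.vector, (unsigned)e.is_level, (unsigned)e.irr);
	return 0;
}
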
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves desc->affinity untouched.
- */
-unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
- unsigned int *dest_id)
-{
- struct irq_cfg *cfg;
- unsigned int irq;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -1;
-
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return -1;
-
- cpumask_copy(desc->affinity, mask);
-
- *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
- return 0;
-}
-
-static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = set_desc_affinity(desc, mask, &dest);
- if (!ret) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
+ /* If we are moving the IRQ we need to mask it */
+ if (unlikely(irqd_is_setaffinity_pending(data))) {
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
+ return true;
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return ret;
-}
-
-static int
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- return set_ioapic_affinity_irq_desc(desc, mask);
-}
-
-#ifdef CONFIG_INTR_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- */
-static int
-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- struct irte irte;
- unsigned int dest;
- unsigned int irq;
- int ret = -1;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return ret;
-
- irq = desc->irq;
- if (get_irte(irq, &irte))
- return ret;
-
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return ret;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * Modified the IRTE and flushes the Interrupt entry cache.
- */
- modify_irte(irq, &irte);
-
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- cpumask_copy(desc->affinity, mask);
-
- return 0;
-}
-
-/*
- * Migrates the IRQ destination in the process context.
- */
-static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return migrate_ioapic_irq_desc(desc, mask);
-}
-static int set_ir_ioapic_affinity_irq(unsigned int irq,
- const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- return set_ir_ioapic_affinity_irq_desc(desc, mask);
-}
-#else
-static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
- const struct cpumask *mask)
-{
- return 0;
+ return false;
}
-#endif
-asmlinkage void smp_irq_move_cleanup_interrupt(void)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- unsigned vector, me;
-
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- me = smp_processor_id();
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irq;
- unsigned int irr;
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- irq = __get_cpu_var(vector_irq)[vector];
-
- if (irq == -1)
- continue;
-
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
-
- cfg = irq_cfg(irq);
- raw_spin_lock(&desc->lock);
-
- /*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
- */
- if (cfg->move_in_progress)
- goto unlock;
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- goto unlock;
-
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (unlikely(moveit)) {
/*
- * Check if the vector that needs to be cleanedup is
- * registered at the cpu's IRR. If so, then this is not
- * the best time to clean it up. Lets clean it up in the
- * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
- * to myself.
+ * Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context, starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic, I don't trust the Remote IRR bit to be
+ * completely accurate.
+ *
+ * However, there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems, then it is a hardware
+ * bug and you can go talk to the chipset vendor about it.
*/
- if (irr & (1 << (vector % 32))) {
- apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
- goto unlock;
- }
- __get_cpu_var(vector_irq)[vector] = -1;
-unlock:
- raw_spin_unlock(&desc->lock);
+ if (!io_apic_level_ack_pending(data->chip_data))
+ irq_move_masked_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
-
- irq_exit();
-}
-
-static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
-{
- struct irq_desc *desc = *descp;
- struct irq_cfg *cfg = desc->chip_data;
- unsigned me;
-
- if (likely(!cfg->move_in_progress))
- return;
-
- me = smp_processor_id();
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- send_cleanup_vector(cfg);
-}
-
-static void irq_complete_move(struct irq_desc **descp)
-{
- __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
-}
-
-void irq_force_complete_move(int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
-
- if (!cfg)
- return;
-
- __irq_complete_move(&desc, cfg->vector);
}
#else
-static inline void irq_complete_move(struct irq_desc **descp) {}
-#endif
-
-static void ack_apic_edge(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- irq_complete_move(&desc);
- move_native_irq(irq);
- ack_APIC_irq();
-}
-
-atomic_t irq_mis_count;
-
-/*
- * IO-APIC versions below 0x20 don't support EOI register.
- * For the record, here is the information about various versions:
- * 0Xh 82489DX
- * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
- * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
- * 30h-FFh Reserved
- *
- * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
- * version as 0x2. This is an error with documentation and these ICH chips
- * use io-apic's of version 0x20.
- *
- * For IO-APIC's with EOI register, we use that to do an explicit EOI.
- * Otherwise, we simulate the EOI message manually by changing the trigger
- * mode to edge and then back to level, with RTE being masked during this.
-*/
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- if (mp_ioapics[entry->apic].apicver >= 0x20) {
- /*
- * Intr-remapping uses pin number as the virtual vector
- * in the RTE. Actual vector is programmed in
- * intr-remapping table entry. Hence for the io-apic
- * EOI we use the pin number.
- */
- if (irq_remapped(irq))
- io_apic_eoi(entry->apic, entry->pin);
- else
- io_apic_eoi(entry->apic, cfg->vector);
- } else {
- __mask_and_edge_IO_APIC_irq(entry);
- __unmask_and_level_IO_APIC_irq(entry);
- }
- }
+ return false;
}
-
-static void eoi_ioapic_irq(struct irq_desc *desc)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
+#endif
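
The prepare/finish split above exists so a pending affinity change only reprograms the entry while it is masked and after the level ack has visibly landed. A toy model of that ordering (all names and state here are mine; the real check is io_apic_level_ack_pending() against the Remote IRR bits):

#include <stdbool.h>
#include <stdio.h>

struct toy_irq {
	bool move_pending;	/* affinity change requested */
	bool masked;
	bool remote_irr;	/* "hardware" still owes us an EOI */
};

static bool prepare_move(struct toy_irq *irq)
{
	if (!irq->move_pending)
		return false;
	irq->masked = true;	/* mask before the EOI/retarget dance */
	return true;
}

static void finish_move(struct toy_irq *irq, bool moveit)
{
	if (!moveit)
		return;
	if (!irq->remote_irr) {		/* ack delivered: safe to move */
		irq->move_pending = false;
		printf("entry retargeted\n");
	}
	irq->masked = false;	/* the real code also honours a core-level mask */
}

int main(void)
{
	struct toy_irq irq = { .move_pending = true };
	bool moveit = prepare_move(&irq);

	/* ...the EOI happens between the two phases... */
	finish_move(&irq, moveit);
	return 0;
}
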
-static void ack_apic_level(unsigned int irq)
+static void ioapic_ack_level(struct irq_data *irq_data)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
+ bool moveit;
int i;
- struct irq_cfg *cfg;
- int do_unmask_irq = 0;
- irq_complete_move(&desc);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- /* If we are moving the irq we need to mask it */
- if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
- do_unmask_irq = 1;
- mask_IO_APIC_irq_desc(desc);
- }
-#endif
+ irq_complete_move(cfg);
+ moveit = ioapic_prepare_move(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -2651,7 +1696,6 @@ static void ack_apic_level(unsigned int irq)
* we use the above logic (mask+edge followed by unmask+level) from
* Manfred Spraul to clear the remote IRR.
*/
- cfg = desc->chip_data;
i = cfg->vector;
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
@@ -2659,129 +1703,199 @@ static void ack_apic_level(unsigned int irq)
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
*/
- ack_APIC_irq();
+ apic_eoi();
/*
* Tail end of clearing remote IRR bit (either by delivering the EOI
* message via io-apic EOI register write or simulating it using
- * mask+edge followed by unnask+level logic) manually when the
+ * mask+edge followed by unmask+level logic) manually when the
* level triggered interrupt is seen as the edge triggered interrupt
* at the cpu.
*/
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
-
- eoi_ioapic_irq(desc);
+ eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
- /* Now we can move and renable the irq */
- if (unlikely(do_unmask_irq)) {
- /* Only migrate the irq if the ack has been received.
- *
- * On rare occasions the broadcast level triggered ack gets
- * delayed going to ioapics, and if we reprogram the
- * vector while Remote IRR is still set the irq will never
- * fire again.
- *
- * To prevent this scenario we read the Remote IRR bit
- * of the ioapic. This has two effects.
- * - On any sane system the read of the ioapic will
- * flush writes (and acks) going to the ioapic from
- * this cpu.
- * - We get to see if the ACK has actually been delivered.
- *
- * Based on failed experiments of reprogramming the
- * ioapic entry from outside of irq context starting
- * with masking the ioapic entry and then polling until
- * Remote IRR was clear before reprogramming the
- * ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
- *
- * However there appears to be no other way to plug
- * this race, so if the Remote IRR bit is not
- * accurate and is causing problems then it is a hardware bug
- * and you can go talk to the chipset vendor about it.
- */
- cfg = desc->chip_data;
- if (!io_apic_level_ack_pending(cfg))
- move_masked_irq(irq);
- unmask_IO_APIC_irq_desc(desc);
- }
+ ioapic_finish_move(irq_data, moveit);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ apic_ack_irq(irq_data);
+ eoi_ioapic_pin(data->entry.vector, data);
}
-#ifdef CONFIG_INTR_REMAP
-static void ir_ack_apic_edge(unsigned int irq)
+/*
+ * The I/OAPIC is just a device for generating MSI messages from legacy
+ * interrupt pins. Various fields of the RTE translate into bits of the
+ * resulting MSI which had a historical meaning.
+ *
+ * With interrupt remapping, many of those bits have different meanings
+ * in the underlying MSI, but the way that the I/OAPIC transforms them
+ * from its RTE to the MSI message is the same. This function allows
+ * the parent IRQ domain to compose the MSI message, then takes the
+ * relevant bits to put them in the appropriate places in the RTE in
+ * order to generate that message when the IRQ happens.
+ *
+ * The setup here relies on a preconfigured route entry (is_level,
+ * active_low, masked) because the parent domain is merely composing the
+ * generic message routing information which is used for the MSI.
+ */
+static void ioapic_setup_msg_from_msi(struct irq_data *irq_data,
+ struct IO_APIC_route_entry *entry)
{
- ack_APIC_irq();
+ struct msi_msg msg;
+
+ /* Let the parent domain compose the MSI message */
+ irq_chip_compose_msi_msg(irq_data, &msg);
+
+ /*
+ * - Real vector
+ * - DMAR/IR: 8bit subhandle (ioapic.pin)
+ * - AMD/IR: 8bit IRTE index
+ */
+ entry->vector = msg.arch_data.vector;
+ /* Delivery mode (for DMAR/IR all 0) */
+ entry->delivery_mode = msg.arch_data.delivery_mode;
+ /* Destination mode or DMAR/IR index bit 15 */
+ entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical;
+ /* DMAR/IR: 1, 0 for all other modes */
+ entry->ir_format = msg.arch_addr_lo.dmar_format;
+ /*
+ * - DMAR/IR: index bit 0-14.
+ *
+ * - Virt: If the host supports x2apic without a virtualized IR
+ * unit then bit 0-6 of dmar_index_0_14 are providing bit
+ * 8-14 of the destination id.
+ *
+ * All other modes have bit 0-6 of dmar_index_0_14 cleared and the
+ * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7).
+ */
+ entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14;
}
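
For the non-remapped case, the fields copied in ioapic_setup_msg_from_msi() have fixed homes in the MSI address: 0xFEE in the top twelve bits, the destination ID in bits 19:12, and the destination mode at bit 2. A small composer/decoder along those lines (mine, mirroring the documented x86 MSI address layout rather than the kernel's struct x86_msi_addr_lo):

#include <stdint.h>
#include <stdio.h>

#define MSI_BASE	0xFEE00000u

static uint32_t msi_addr_lo(uint8_t destid, int logical)
{
	return MSI_BASE |
	       ((uint32_t)destid << 12) |	/* destination ID, bits 19:12 */
	       ((logical ? 1u : 0u) << 2);	/* destination mode, bit 2 */
}

int main(void)
{
	uint32_t addr = msi_addr_lo(5, 0);	/* physical dest, APIC ID 5 */

	printf("address_lo = %#010x\n", (unsigned)addr);	/* 0xfee05000 */
	printf("destid     = %u\n", (unsigned)((addr >> 12) & 0xff));	/* 5 */
	return 0;
}
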
-static void ir_ack_apic_level(unsigned int irq)
+static void ioapic_configure_entry(struct irq_data *irqd)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct mp_chip_data *mpd = irqd->chip_data;
+ struct irq_pin_list *entry;
+
+ ioapic_setup_msg_from_msi(irqd, &mpd->entry);
- ack_APIC_irq();
- eoi_ioapic_irq(desc);
+ for_each_irq_pin(entry, mpd->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
+}
+
+static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
+ ioapic_configure_entry(irq_data);
+
+ return ret;
+}
+
+/*
+ * Interrupt shutdown masks the ioapic pin, but the interrupt might already
+ * be in flight, but not yet serviced by the target CPU. That means
+ * __synchronize_hardirq() would return and claim that everything is calmed
+ * down. So free_irq() would proceed and deactivate the interrupt and free
+ * resources.
+ *
+ * Once the target CPU comes around to service it, it will find a cleared
+ * vector and complain. While the spurious interrupt is harmless, the full
+ * release of resources might prevent the interrupt from being acknowledged
+ * which keeps the hardware in a weird state.
+ *
+ * Verify that the corresponding Remote-IRR bits are clear.
+ */
+static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct mp_chip_data *mcd = irqd->chip_data;
+ struct IO_APIC_route_entry rentry;
+ struct irq_pin_list *p;
+
+ if (which != IRQCHIP_STATE_ACTIVE)
+ return -EINVAL;
+
+ *state = false;
+
+ guard(raw_spinlock)(&ioapic_lock);
+ for_each_irq_pin(p, mcd->irq_2_pin) {
+ rentry = __ioapic_read_entry(p->apic, p->pin);
+ /*
+ * The remote IRR is only valid in level trigger mode. Its
+ * meaning is undefined for edge triggered interrupts and
+ * irrelevant because the IO-APIC treats them as fire and
+ * forget.
+ */
+ if (rentry.irr && rentry.is_level) {
+ *state = true;
+ break;
+ }
+ }
+ return 0;
}
-#endif /* CONFIG_INTR_REMAP */
static struct irq_chip ioapic_chip __read_mostly = {
- .name = "IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
- .ack = ack_apic_edge,
- .eoi = ack_apic_level,
-#ifdef CONFIG_SMP
- .set_affinity = set_ioapic_affinity_irq,
-#endif
- .retrigger = ioapic_retrigger_irq,
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
-static struct irq_chip ir_ioapic_chip __read_mostly = {
- .name = "IR-IO-APIC",
- .startup = startup_ioapic_irq,
- .mask = mask_IO_APIC_irq,
- .unmask = unmask_IO_APIC_irq,
-#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
- .eoi = ir_ack_apic_level,
-#ifdef CONFIG_SMP
- .set_affinity = set_ir_ioapic_affinity_irq,
-#endif
-#endif
- .retrigger = ioapic_retrigger_irq,
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ir_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
};
static inline void init_IO_APIC_traps(void)
{
- int irq;
- struct irq_desc *desc;
struct irq_cfg *cfg;
+ unsigned int irq;
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- for_each_irq_desc(irq, desc) {
- cfg = desc->chip_data;
+ for_each_active_irq(irq) {
+ cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
+ * Hmm.. We don't have an entry for this, so
+ * default to an old-fashioned 8259 interrupt if we
+ * can. Otherwise set the dummy interrupt chip.
*/
- if (irq < legacy_pic->nr_legacy_irqs)
+ if (irq < nr_legacy_irqs())
legacy_pic->make_irq(irq);
else
- /* Strange. Oh, well.. */
- desc->chip = &no_irq_chip;
+ irq_set_chip(irq, &no_irq_chip);
}
}
}
@@ -2789,58 +1903,36 @@ static inline void init_IO_APIC_traps(void)
/*
* The local APIC irq-chip implementation:
*/
-
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
-static void ack_lapic_irq(unsigned int irq)
+static void ack_lapic_irq(struct irq_data *data)
{
- ack_APIC_irq();
+ apic_eoi();
}
static struct irq_chip lapic_chip __read_mostly = {
.name = "local-APIC",
- .mask = mask_lapic_irq,
- .unmask = unmask_lapic_irq,
- .ack = ack_lapic_irq,
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
};
-static void lapic_register_intr(int irq, struct irq_desc *desc)
+static void lapic_register_intr(int irq)
{
- desc->status &= ~IRQ_LEVEL;
- set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
-}
-
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge");
}
/*
@@ -2852,9 +1944,10 @@ static void __init setup_nmi(void)
*/
static inline void __init unlock_ExtINT_logic(void)
{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ struct IO_APIC_route_entry entry0, entry1;
+ int apic, pin, i;
+ u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1) {
@@ -2870,14 +1963,16 @@ static inline void __init unlock_ExtINT_logic(void)
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
+ apic_id = read_apic_id();
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = 0; /* physical delivery */
- entry1.mask = 0; /* unmask IRQ now */
- entry1.dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = 0;
+ entry1.dest_mode_logical = true;
+ entry1.masked = false;
+ entry1.destid_0_7 = apic_id & 0xFF;
+ entry1.virt_destid_8_14 = apic_id >> 8;
+ entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry1.active_low = entry0.active_low;
+ entry1.is_level = false;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
@@ -2911,32 +2006,66 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-int timer_through_8259 __initdata;
+static int __init mp_alloc_timer_irq(int ioapic, int pin)
+{
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ int irq = -1;
+
+ if (domain) {
+ struct irq_alloc_info info;
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+ info.devid = mpc_ioapic_id(ioapic);
+ info.ioapic.pin = pin;
+ guard(mutex)(&ioapic_mutex);
+ irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+ }
+
+ return irq;
+}
+
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ return;
+ }
+ }
+
+ /* Old apic/pin didn't exist, so just add a new one */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
- struct irq_desc *desc = irq_to_desc(0);
- struct irq_cfg *cfg = desc->chip_data;
- int node = cpu_to_node(boot_cpu_id);
+ struct irq_data *irq_data = irq_get_irq_data(0);
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
- unsigned long flags;
int no_pin1 = 0;
- local_irq_save(flags);
+ if (!global_clock_event)
+ return;
+
+ local_irq_disable();
/*
* get/set the timer IRQ vector:
*/
- legacy_pic->chip->mask(0);
- assign_irq_vector(0, cfg, apic->target_cpus());
+ legacy_pic->mask(0);
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2949,24 +2078,14 @@ static inline void __init check_timer(void)
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
legacy_pic->init(1);
-#ifdef CONFIG_X86_32
- {
- unsigned int ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
- }
-#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@@ -2976,8 +2095,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- if (intr_remapping_enabled)
- panic("BIOS bug: timer not connected to IO-APIC");
+ panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2987,113 +2105,90 @@ static inline void __init check_timer(void)
}
if (pin1 != -1) {
- /*
- * Ok, does IRQ0 through the IOAPIC work?
- */
+ /* Ok, does IRQ0 through the IOAPIC work? */
if (no_pin1) {
- add_pin_to_irq_node(cfg, node, apic1, pin1);
- setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ mp_alloc_timer_irq(apic1, pin1);
} else {
- /* for edge trigger, setup_IO_APIC_irq already
- * leave it unmasked.
+ /*
+ * For edge trigger it is already unmasked, so we only
+ * need to unmask if it is level-triggered - do we
+ * really have a level-triggered timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
- unmask_IO_APIC_irq_desc(desc);
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
+ unmask_ioapic_irq(irq_get_irq_data(0));
}
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- legacy_pic->chip->unmask(0);
- }
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (intr_remapping_enabled)
- panic("timer doesn't work through Interrupt-remapped IO-APIC");
- local_irq_disable();
+ panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
+ pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n");
+ pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
- setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
- legacy_pic->chip->unmask(0);
+ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
- setup_nmi();
- legacy_pic->chip->unmask(0);
- }
+ pr_info("....... works.\n");
goto out;
}
/*
* Cleanup, just in case ...
*/
- local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ pr_info("....... failed.\n");
}
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
+ pr_info("...trying to set up timer as Virtual Wire IRQ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
-
- lapic_register_intr(0, desc);
+ lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
- legacy_pic->chip->unmask(0);
+ legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- local_irq_disable();
- legacy_pic->chip->mask(0);
+ legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+ pr_info("..... failed.\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
+ pr_info("...trying to set up timer as ExtINT IRQ...\n");
legacy_pic->init(0);
legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ legacy_pic->unmask(0);
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- local_irq_disable();
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+
+ pr_info("..... failed :\n");
+ if (apic_is_x2apic_enabled()) {
+ pr_info("Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ }
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
- local_irq_restore(flags);
+ local_irq_enable();
}
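
check_timer() reads most easily as a fixed probe ladder with timer_irq_works() as the oracle at each rung. Summarized as data (the labels are mine):

#include <stdio.h>

/* The fallback ladder check_timer() walks until timer_irq_works()
 * finally reports ticks; the 8259A steps only exist with a legacy PIC. */
static const char * const timer_probe[] = {
	"IRQ0 through IO-APIC pin1 (route from the MP table)",
	"IRQ0 through IO-APIC pin2 (the i8259 ExtINT pin)",
	"timer as local APIC Virtual Wire IRQ (LVT0, fixed mode)",
	"timer as ExtINT IRQ through the 8259A",
	"panic: IO-APIC + timer doesn't work",
};

int main(void)
{
	for (unsigned i = 0; i < sizeof(timer_probe) / sizeof(timer_probe[0]); i++)
		printf("%u. %s\n", i + 1, timer_probe[i]);
	return 0;
}
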
/*
@@ -3115,1049 +2210,319 @@ out:
*/
#define PIC_IRQS (1UL << PIC_CASCADE_IR)
-void __init setup_IO_APIC(void)
-{
-
- /*
- * calling enable_IO_APIC() is moved to setup_local_APIC for BP
- */
- io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
-
- apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
- /*
- * Set up IO-APIC IRQ routing.
- */
- x86_init.mpparse.setup_ioapic_ids();
-
- sync_Arb_IDs();
- setup_IO_APIC_irqs();
- init_IO_APIC_traps();
- if (legacy_pic->nr_legacy_irqs)
- check_timer();
-}
-
-/*
- * Called after all the initialization is done. If we didnt find any
- * APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
+static int mp_irqdomain_create(int ioapic)
{
- if (sis_apic_bug == -1)
- sis_apic_bug = 0;
- return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
-struct sysfs_ioapic_data {
- struct sys_device dev;
- struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
-
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
- *entry = ioapic_read_entry(dev->id, i);
-
- return 0;
-}
-
-static int ioapic_resume(struct sys_device *dev)
-{
- struct IO_APIC_route_entry *entry;
- struct sysfs_ioapic_data *data;
- unsigned long flags;
- union IO_APIC_reg_00 reg_00;
- int i;
-
- data = container_of(dev, struct sysfs_ioapic_data, dev);
- entry = data->entry;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ int hwirqs = mp_ioapic_pin_count(ioapic);
+ struct ioapic *ip = &ioapics[ioapic];
+ struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+ struct irq_domain *parent;
+ struct fwnode_handle *fn;
+ struct irq_fwspec fwspec;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].apicid;
- io_apic_write(dev->id, 0, reg_00.raw);
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
- ioapic_write_entry(dev->id, i, entry[i]);
-
- return 0;
-}
-
-static struct sysdev_class ioapic_sysdev_class = {
- .name = "ioapic",
- .suspend = ioapic_suspend,
- .resume = ioapic_resume,
-};
+ if (cfg->type == IOAPIC_DOMAIN_INVALID)
+ return 0;
-static int __init ioapic_init_sysfs(void)
-{
- struct sys_device * dev;
- int i, size, error;
-
- error = sysdev_class_register(&ioapic_sysdev_class);
- if (error)
- return error;
-
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
- * sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
- if (!mp_ioapic_data[i]) {
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
- dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
- dev->cls = &ioapic_sysdev_class;
- error = sysdev_register(dev);
- if (error) {
- kfree(mp_ioapic_data[i]);
- mp_ioapic_data[i] = NULL;
- printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
- continue;
- }
+ /* Handle device tree enumerated APICs properly */
+ if (cfg->dev) {
+ fn = of_fwnode_handle(cfg->dev);
+ } else {
+ fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
+ if (!fn)
+ return -ENOMEM;
}
- return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocate and deallocation
- */
-unsigned int create_irq_nr(unsigned int irq_want, int node)
-{
- /* Allocate an unused irq */
- unsigned int irq;
- unsigned int new;
- unsigned long flags;
- struct irq_cfg *cfg_new = NULL;
- struct irq_desc *desc_new = NULL;
-
- irq = 0;
- if (irq_want < nr_irqs_gsi)
- irq_want = nr_irqs_gsi;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_node(new, node);
- if (!desc_new) {
- printk(KERN_INFO "can not get irq_desc for %d\n", new);
- continue;
- }
- cfg_new = desc_new->chip_data;
-
- if (cfg_new->vector != 0)
- continue;
-
- desc_new = move_irq_desc(desc_new, node);
- cfg_new = desc_new->chip_data;
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = mpc_ioapic_id(ioapic);
- if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
- irq = new;
- break;
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
+ if (!parent) {
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENODEV;
}
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- if (irq > 0)
- dynamic_irq_init_keep_chip_data(irq);
-
- return irq;
-}
-
-int create_irq(void)
-{
- int node = cpu_to_node(boot_cpu_id);
- unsigned int irq_want;
- int irq;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want, node);
-
- if (irq == 0)
- irq = -1;
-
- return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
- unsigned long flags;
-
- dynamic_irq_cleanup_keep_chip_data(irq);
-
- free_irte(irq);
- raw_spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, get_irq_chip_data(irq));
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
- struct msi_msg *msg, u8 hpet_id)
-{
- struct irq_cfg *cfg;
- int err;
- unsigned dest;
-
- if (disable_apic)
- return -ENXIO;
-
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (err)
- return err;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
-
- if (irq_remapped(irq)) {
- struct irte irte;
- int ir_index;
- u16 sub_handle;
-
- ir_index = map_irq_to_irte_handle(irq, &sub_handle);
- BUG_ON(ir_index == -1);
- memset (&irte, 0, sizeof(irte));
-
- irte.present = 1;
- irte.dst_mode = apic->irq_dest_mode;
- irte.trigger_mode = 0; /* edge */
- irte.dlvry_mode = apic->irq_delivery_mode;
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /* Set source-id of interrupt request */
- if (pdev)
- set_msi_sid(&irte, pdev);
- else
- set_hpet_sid(&irte, hpet_id);
-
- modify_irte(irq, &irte);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(ir_index) |
- MSI_ADDR_IR_INDEX2(ir_index);
- } else {
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
+ ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops,
+ (void *)(long)ioapic);
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENOMEM;
}
- return err;
-}
-
-#ifdef CONFIG_SMP
-static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
-
- if (set_desc_affinity(desc, mask, &dest))
- return -1;
-
- cfg = desc->chip_data;
-
- read_msi_msg_desc(desc, &msg);
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- write_msi_msg_desc(desc, &msg);
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1);
return 0;
}
-#ifdef CONFIG_INTR_REMAP
-/*
- * Migrate the MSI irq to another cpumask. This migration is
- * done in the process context using interrupt-remapping hardware.
- */
-static int
-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
- struct irte irte;
-
- if (get_irte(irq, &irte))
- return -1;
-
- if (set_desc_affinity(desc, mask, &dest))
- return -1;
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * atomically update the IRTE with the new destination and vector.
- */
- modify_irte(irq, &irte);
-
- /*
- * After this point, all the interrupts will start arriving
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
- */
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- return 0;
-}
-
-#endif
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = set_msi_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-static struct irq_chip msi_ir_chip = {
- .name = "IR-PCI-MSI",
- .unmask = unmask_msi_irq,
- .mask = mask_msi_irq,
-#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
-#endif
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
+static void ioapic_destroy_irqdomain(int idx)
{
- struct intel_iommu *iommu;
- int index;
-
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- printk(KERN_ERR
- "Unable to map PCI %s to iommu\n", pci_name(dev));
- return -ENOENT;
- }
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
- index = alloc_irte(iommu, irq, nvec);
- if (index < 0) {
- printk(KERN_ERR
- "Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
- return -ENOSPC;
+ if (ioapics[idx].irqdomain) {
+ irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ ioapics[idx].irqdomain = NULL;
}
- return index;
}
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+void __init setup_IO_APIC(void)
{
- int ret;
- struct msi_msg msg;
-
- ret = msi_compose_msg(dev, irq, &msg, -1);
- if (ret < 0)
- return ret;
+ int ioapic;
- set_irq_msi(irq, msidesc);
- write_msi_msg(irq, &msg);
-
- if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_to_desc(irq);
- /*
- * irq migration in process context
- */
- desc->status |= IRQ_MOVE_PCNTXT;
- set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
- } else
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
-
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
-
- return 0;
-}
+ if (ioapic_is_disabled || !nr_ioapics)
+ return;
-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- unsigned int irq;
- int ret, sub_handle;
- struct msi_desc *msidesc;
- unsigned int irq_want;
- struct intel_iommu *iommu = NULL;
- int index = 0;
- int node;
-
- /* x86 doesn't support multiple MSI yet */
- if (type == PCI_CAP_ID_MSI && nvec > 1)
- return 1;
+ io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
- node = dev_to_node(&dev->dev);
- irq_want = nr_irqs_gsi;
- sub_handle = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want, node);
- if (irq == 0)
- return -1;
- irq_want = irq + 1;
- if (!intr_remapping_enabled)
- goto no_ir;
+ apic_pr_verbose("ENABLING IO-APIC IRQs\n");
+ for_each_ioapic(ioapic)
+ BUG_ON(mp_irqdomain_create(ioapic));
- if (!sub_handle) {
- /*
- * allocate the consecutive block of IRTE's
- * for 'nvec'
- */
- index = msi_alloc_irte(dev, irq, nvec);
- if (index < 0) {
- ret = index;
- goto error;
- }
- } else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
- goto error;
- }
- /*
- * setup the mapping between the irq and the IRTE
- * base index, the sub_handle pointing to the
- * appropriate interrupt remap table entry.
- */
- set_irte_irq(irq, iommu, index, sub_handle);
- }
-no_ir:
- ret = setup_msi_irq(dev, msidesc, irq);
- if (ret < 0)
- goto error;
- sub_handle++;
- }
- return 0;
+ /* Set up IO-APIC IRQ routing. */
+ x86_init.mpparse.setup_ioapic_ids();
-error:
- destroy_irq(irq);
- return ret;
-}
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ if (nr_legacy_irqs())
+ check_timer();
-void arch_teardown_msi_irq(unsigned int irq)
-{
- destroy_irq(irq);
+ ioapic_initialized = 1;
}
-#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
-#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static void resume_ioapic_id(int ioapic_idx)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
-
- if (set_desc_affinity(desc, mask, &dest))
- return -1;
-
- cfg = desc->chip_data;
-
- dmar_msi_read(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- dmar_msi_write(irq, &msg);
+ union IO_APIC_reg_00 reg_00;
- return 0;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ }
}
-#endif /* CONFIG_SMP */
-
-static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .unmask = dmar_msi_unmask,
- .mask = dmar_msi_mask,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = dmar_msi_set_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
+static int ioapic_suspend(void)
{
- int ret;
- struct msi_msg msg;
-
- ret = msi_compose_msg(NULL, irq, &msg, -1);
- if (ret < 0)
- return ret;
- dmar_msi_write(irq, &msg);
- set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
- "edge");
- return 0;
+ return save_ioapic_entries();
}
-#endif
-
-#ifdef CONFIG_HPET_TIMER
-#ifdef CONFIG_SMP
-static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static void ioapic_resume(void)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- struct msi_msg msg;
- unsigned int dest;
-
- if (set_desc_affinity(desc, mask, &dest))
- return -1;
+ int ioapic_idx;
- cfg = desc->chip_data;
+ for_each_ioapic_reverse(ioapic_idx)
+ resume_ioapic_id(ioapic_idx);
- hpet_msi_read(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- hpet_msi_write(irq, &msg);
-
- return 0;
+ restore_ioapic_entries();
}
-#endif /* CONFIG_SMP */
-
-static struct irq_chip ir_hpet_msi_type = {
- .name = "IR-HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
-#ifdef CONFIG_INTR_REMAP
- .ack = ir_ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = ir_set_msi_irq_affinity,
-#endif
-#endif
- .retrigger = ioapic_retrigger_irq,
+static struct syscore_ops ioapic_syscore_ops = {
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
};
-static struct irq_chip hpet_msi_type = {
- .name = "HPET_MSI",
- .unmask = hpet_msi_unmask,
- .mask = hpet_msi_mask,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = hpet_msi_set_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
-{
- int ret;
- struct msi_msg msg;
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_hpet_to_ir(id);
- int index;
-
- if (!iommu)
- return -1;
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
- return -1;
- }
-
- ret = msi_compose_msg(NULL, irq, &msg, id);
- if (ret < 0)
- return ret;
-
- hpet_msi_write(irq, &msg);
- desc->status |= IRQ_MOVE_PCNTXT;
- if (irq_remapped(irq))
- set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
- handle_edge_irq, "edge");
- else
- set_irq_chip_and_handler_name(irq, &hpet_msi_type,
- handle_edge_irq, "edge");
-
- return 0;
-}
-#endif
-
-#endif /* CONFIG_PCI_MSI */
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
- struct ht_irq_msg msg;
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
- msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
- write_ht_irq_msg(irq, &msg);
-}
-
-static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int __init ioapic_init_ops(void)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- unsigned int dest;
-
- if (set_desc_affinity(desc, mask, &dest))
- return -1;
-
- cfg = desc->chip_data;
-
- target_ht_irq(irq, dest, cfg->vector);
+ register_syscore_ops(&ioapic_syscore_ops);
return 0;
}
-#endif
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .mask = mask_ht_irq,
- .unmask = unmask_ht_irq,
- .ack = ack_apic_edge,
-#ifdef CONFIG_SMP
- .set_affinity = set_ht_irq_affinity,
-#endif
- .retrigger = ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
- struct irq_cfg *cfg;
- int err;
-
- if (disable_apic)
- return -ENXIO;
-
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (!err) {
- struct ht_irq_msg msg;
- unsigned dest;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus());
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((apic->irq_dest_mode == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
-
- write_ht_irq_msg(irq, &msg);
-
- set_irq_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
-
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
- }
- return err;
-}
-#endif /* CONFIG_HT_IRQ */
+device_initcall(ioapic_init_ops);
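
/*
 * A minimal, self-contained sketch of the syscore pattern used above,
 * assuming only the documented <linux/syscore_ops.h> API; the
 * "example_" names are hypothetical. The ops structure itself is what
 * gets registered (there is no wrapper object), and the callbacks run
 * with a single CPU online and interrupts disabled, so they must not
 * sleep or take sleeping locks.
 */
#include <linux/syscore_ops.h>
#include <linux/init.h>

static int example_suspend(void)
{
	/* Save hardware state; runs late in the suspend sequence. */
	return 0;
}

static void example_resume(void)
{
	/* Restore the state saved by example_suspend(). */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_suspend,
	.resume  = example_resume,
};

static int __init example_syscore_init(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}
device_initcall(example_syscore_init);
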
-int __init io_apic_get_redir_entries (int ioapic)
+static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /* The register returns the maximum index redir index
- * supported, which is one less than the total number of redir
- * entries.
- */
- return reg_01.bits.entries + 1;
-}
-
-void __init probe_nr_irqs_gsi(void)
-{
- int nr;
-
- nr = gsi_top + NR_IRQS_LEGACY;
- if (nr > nr_irqs_gsi)
- nr_irqs_gsi = nr;
-
- printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
-}
-#ifdef CONFIG_SPARSE_IRQ
-int __init arch_probe_nr_irqs(void)
-{
- int nr;
-
- if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
- nr_irqs = NR_VECTORS * nr_cpu_ids;
-
- nr = nr_irqs_gsi + 8 * nr_cpu_ids;
-#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
/*
- * for MSI and HT dyn irq
+ * The register returns the maximum redirection entry index it
+ * supports, which is one less than the total number of entries.
*/
- nr += nr_irqs_gsi * 16;
-#endif
- if (nr < nr_irqs)
- nr_irqs = nr;
-
- return 0;
+ return reg_01.bits.entries + 1;
}
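
/*
 * A standalone illustration of the reg_01 decode above: bits 16-23 of
 * I/O APIC register 1 hold the highest valid redirection-table index
 * (MAXREDIR), so the entry count is that value plus one. The sample
 * register value below mirrors the classic 24-pin 82093AA layout.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int redir_entries(uint32_t reg_01_raw)
{
	unsigned int max_index = (reg_01_raw >> 16) & 0xff;	/* MAXREDIR field */

	return max_index + 1;
}

int main(void)
{
	printf("%u entries\n", redir_entries(0x00170011));	/* prints "24 entries" */
	return 0;
}
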
-#endif
-static int __io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr)
+unsigned int arch_dynirq_lower_bound(unsigned int from)
{
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int node;
- int ioapic, pin;
- int trigger, polarity;
-
- ioapic = irq_attr->ioapic;
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- if (dev)
- node = dev_to_node(dev);
- else
- node = cpu_to_node(boot_cpu_id);
-
- desc = irq_to_desc_alloc_node(irq, node);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
- return 0;
- }
-
- pin = irq_attr->ioapic_pin;
- trigger = irq_attr->trigger;
- polarity = irq_attr->polarity;
+ unsigned int ret;
/*
- * IRQs < 16 are already in the irq_2_pin[] map
+ * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+ * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- if (irq >= legacy_pic->nr_legacy_irqs) {
- cfg = desc->chip_data;
- if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
- printk(KERN_INFO "can not add pin %d for irq %d\n",
- pin, irq);
- return 0;
- }
- }
-
- setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
-
- return 0;
-}
+ ret = ioapic_dynirq_base ? : gsi_top;
-int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr)
-{
- int ioapic, pin;
/*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and
+ * always 0. gsi_top can be 0 if there is no IO/APIC registered.
+ * 0 is an invalid interrupt number for dynamic allocations. Return
+ * @from instead.
*/
- ioapic = irq_attr->ioapic;
- pin = irq_attr->ioapic_pin;
- if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mp_ioapics[ioapic].apicid, pin);
- return 0;
- }
- set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
-
- return __io_apic_set_pci_routing(dev, irq, irq_attr);
-}
-
-u8 __init io_apic_unique_id(u8 id)
-{
-#ifdef CONFIG_X86_32
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
-
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
-#endif
+ return ret ? : from;
}
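
/*
 * A small sketch of the fallback chain above. "x ? : y" is the GNU C
 * conditional with an omitted middle operand: it evaluates to x when x
 * is non-zero and to y otherwise, so arch_dynirq_lower_bound() falls
 * back from ioapic_dynirq_base to gsi_top to @from. The helper below is
 * a hypothetical standalone mirror of that logic.
 */
#include <stdio.h>

static unsigned int lower_bound(unsigned int dynirq_base,
				unsigned int gsi_top, unsigned int from)
{
	unsigned int ret = dynirq_base ? : gsi_top;

	return ret ? : from;
}

int main(void)
{
	printf("%u\n", lower_bound(0, 0, 8));	/* no IO/APIC at all: 8 */
	printf("%u\n", lower_bound(0, 24, 8));	/* before setup_IO_APIC(): 24 */
	printf("%u\n", lower_bound(56, 24, 8));	/* fully initialized: 56 */
	return 0;
}
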
#ifdef CONFIG_X86_32
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
+static int io_apic_get_unique_id(int ioapic, int apic_id)
{
+ static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
int i = 0;
- /*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
- * supports up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
- */
-
- if (physids_empty(apic_id_map))
- apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
+ /* Initialize the ID map */
+ if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC))
+ copy_phys_cpu_present_map(apic_id_map);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic, 0);
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ if (apic_id >= broadcast_id) {
+ pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n",
+ ioapic, apic_id, reg_00.bits.ID);
apic_id = reg_00.bits.ID;
}
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (apic->check_apicid_used(&apic_id_map, apic_id)) {
-
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!apic->check_apicid_used(&apic_id_map, i))
+ /* Every APIC in a system must have a unique ID */
+ if (test_bit(apic_id, apic_id_map)) {
+ for (i = 0; i < broadcast_id; i++) {
+ if (!test_bit(i, apic_id_map))
break;
}
- if (i == get_physical_broadcast())
+ if (i == broadcast_id)
panic("Max apic_id exceeded!\n");
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
-
+ pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i);
apic_id = i;
}
- apic->apicid_to_cpu_present(apic_id, &tmp);
- physids_or(apic_id_map, apic_id_map, tmp);
+ set_bit(apic_id, apic_id_map);
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ }
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
- printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
return -1;
}
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+ apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
return apic_id;
}
-#endif
-int __init io_apic_get_version(int ioapic)
+static u8 io_apic_unique_id(int idx, u8 id)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.version;
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version))
+ return io_apic_get_unique_id(idx, id);
+ return id;
}
-
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+#else
+static u8 io_apic_unique_id(int idx, u8 id)
{
- int ioapic, pin, idx;
+ union IO_APIC_reg_00 reg_00;
+ DECLARE_BITMAP(used, 256);
+ u8 new_id;
+ int i;
- if (skip_ioapic_setup)
- return -1;
+ bitmap_zero(used, 256);
+ for_each_ioapic(i)
+ __set_bit(mpc_ioapic_id(i), used);
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -1;
+ /* Hand out the requested id if available */
+ if (!test_bit(id, used))
+ return id;
- pin = mp_find_ioapic_pin(ioapic, gsi);
- if (pin < 0)
- return -1;
+ /*
+ * Read the current id from the ioapic and keep it if
+ * available.
+ */
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(idx, 0);
- idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- return -1;
+ new_id = reg_00.bits.ID;
+ if (!test_bit(new_id, used)) {
+ apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
+ return new_id;
+ }
- *trigger = irq_trigger(idx);
- *polarity = irq_polarity(idx);
- return 0;
+ /* Get the next free id and write it to the ioapic. */
+ new_id = find_first_zero_bit(used, 256);
+ reg_00.bits.ID = new_id;
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ }
+ /* Sanity check */
+ BUG_ON(reg_00.bits.ID != new_id);
+
+ return new_id;
}
+#endif
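
/*
 * A standalone sketch of the allocation policy shared by both
 * io_apic_unique_id() variants above: hand out the requested ID when it
 * is free, otherwise fall back to the first free one. A plain C array
 * stands in for the kernel bitmap helpers.
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_IDS 256

static bool used[MAX_IDS];

static int alloc_unique_id(unsigned int wanted)
{
	unsigned int i;

	if (wanted < MAX_IDS && !used[wanted]) {
		used[wanted] = true;
		return wanted;		/* requested ID was available */
	}
	for (i = 0; i < MAX_IDS; i++) {
		if (!used[i]) {
			used[i] = true;
			return i;	/* first free ID instead */
		}
	}
	return -1;			/* ID space exhausted */
}

int main(void)
{
	printf("%d\n", alloc_unique_id(2));	/* 2 */
	printf("%d\n", alloc_unique_id(2));	/* 0: 2 is already taken */
	return 0;
}
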
-/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be apic->target_cpus()
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
+static int io_apic_get_version(int ioapic)
{
- int pin, ioapic, irq, irq_entry;
- struct irq_desc *desc;
- const struct cpumask *mask;
-
- if (skip_ioapic_setup == 1)
- return;
-
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
-
- if ((ioapic > 0) && (irq > 16))
- continue;
-
- desc = irq_to_desc(irq);
-
- /*
- * Honour affinities which have been set in early boot
- */
- if (desc->status &
- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
- else
- mask = apic->target_cpus();
+ union IO_APIC_reg_01 reg_01;
- if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
- else
- set_ioapic_affinity_irq_desc(desc, mask);
- }
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ return reg_01.bits.version;
}
-#endif
+/*
+ * This function updates target affinity of IOAPIC interrupts to include
+ * the CPUs which came online during SMP bringup.
+ */
#define IOAPIC_RESOURCE_NAME_SIZE 11
static struct resource *ioapic_resources;
-static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+static struct resource * __init ioapic_setup_resources(void)
{
- unsigned long n;
struct resource *res;
+ unsigned long n;
char *mem;
int i;
- if (nr_ioapics <= 0)
+ if (nr_ioapics == 0)
return NULL;
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
n *= nr_ioapics;
- mem = alloc_bootmem(n);
+ mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES);
res = (void *)mem;
mem += sizeof(struct resource) * nr_ioapics;
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
res[i].name = mem;
res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
+ ioapics[i].iomem_res = &res[i];
}
ioapic_resources = res;
@@ -4165,24 +2530,40 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
return res;
}
-void __init ioapic_init_mappings(void)
+static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
+{
+ pgprot_t flags = FIXMAP_PAGE_NOCACHE;
+
+ /*
+ * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot
+ * bits, just like normal ioremap():
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ if (x86_platform.hyper.is_private_mmio(phys))
+ flags = pgprot_encrypted(flags);
+ else
+ flags = pgprot_decrypted(flags);
+ }
+
+ __set_fixmap(idx, phys, flags);
+}
+
+void __init io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
int i;
- ioapic_res = ioapic_setup_resources(nr_ioapics);
- for (i = 0; i < nr_ioapics; i++) {
+ ioapic_res = ioapic_setup_resources();
+ for_each_ioapic(i) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].apicaddr;
+ ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
+ pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, "
"disabling IO/APIC support!\n");
smp_found_config = 0;
- skip_ioapic_setup = 1;
+ ioapic_is_disabled = true;
goto fake_ioapic_page;
}
#endif
@@ -4190,13 +2571,13 @@ void __init ioapic_init_mappings(void)
#ifdef CONFIG_X86_32
fake_ioapic_page:
#endif
- ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE,
+ PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
- set_fixmap_nocache(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
- ioapic_phys);
+ io_apic_set_fixmap(idx, ioapic_phys);
+ apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys);
idx++;
ioapic_res->start = ioapic_phys;
@@ -4207,17 +2588,16 @@ fake_ioapic_page:
void __init ioapic_insert_resources(void)
{
- int i;
struct resource *r = ioapic_resources;
+ int i;
if (!r) {
if (nr_ioapics > 0)
- printk(KERN_ERR
- "IO APIC resources couldn't be allocated.\n");
+ pr_err("IO APIC resources couldn't be allocated.\n");
return;
}
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
insert_resource(&iomem_resource, r);
r++;
}
@@ -4225,103 +2605,355 @@ void __init ioapic_insert_resources(void)
int mp_find_ioapic(u32 gsi)
{
- int i = 0;
+ int i;
+
+ if (nr_ioapics == 0)
+ return -1;
/* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_gsi_routing[i].gsi_base)
- && (gsi <= mp_gsi_routing[i].gsi_end))
+ for_each_ioapic(i) {
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+
+ if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
return -1;
}
int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
- if (WARN_ON(ioapic == -1))
+ struct mp_ioapic_gsi *gsi_cfg;
+
+ if (WARN_ON(ioapic < 0))
return -1;
- if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if (WARN_ON(gsi > gsi_cfg->gsi_end))
return -1;
- return gsi - mp_gsi_routing[ioapic].gsi_base;
+ return gsi - gsi_cfg->gsi_base;
}
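
/*
 * A standalone sketch of the two lookups above: every IOAPIC owns a
 * contiguous GSI window [gsi_base, gsi_end], so finding the IOAPIC is a
 * range scan and the pin number is simply the offset into the window.
 * The ranges here are hypothetical sample data.
 */
#include <stdio.h>

struct gsi_range { unsigned int base, end; };

static int find_pin(const struct gsi_range *r, int nr, unsigned int gsi)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (gsi >= r[i].base && gsi <= r[i].end)
			return (int)(gsi - r[i].base);	/* pin within IOAPIC i */
	}
	return -1;	/* no IOAPIC manages this GSI */
}

int main(void)
{
	struct gsi_range ranges[] = { { 0, 23 }, { 24, 55 } };

	printf("%d\n", find_pin(ranges, 2, 30));	/* pin 6 of the second IOAPIC */
	return 0;
}
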
-static int bad_ioapic(unsigned long address)
+static int bad_ioapic_register(int idx)
{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded "
- "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
- return 1;
- }
- if (!address) {
- printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+
+ reg_00.raw = io_apic_read(idx, 0);
+ reg_01.raw = io_apic_read(idx, 1);
+ reg_02.raw = io_apic_read(idx, 2);
+
+ if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+ pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+ mpc_ioapic_addr(idx));
return 1;
}
+
return 0;
}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int find_free_ioapic_entry(void)
{
- int idx = 0;
- int entries;
+ for (int idx = 0; idx < MAX_IO_APICS; idx++) {
+ if (ioapics[idx].nr_registers == 0)
+ return idx;
+ }
+ return MAX_IO_APICS;
+}
- if (bad_ioapic(address))
- return;
+/**
+ * mp_register_ioapic - Register an IOAPIC device
+ * @id: hardware IOAPIC ID
+ * @address: physical address of IOAPIC register area
+ * @gsi_base: base of GSI associated with the IOAPIC
+ * @cfg: configuration information for the IOAPIC
+ */
+int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg)
+{
+ bool hotplug = !!ioapic_initialized;
+ struct mp_ioapic_gsi *gsi_cfg;
+ int idx, ioapic, entries;
+ u32 gsi_end;
- idx = nr_ioapics;
+ if (!address) {
+ pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
+ return -EINVAL;
+ }
- mp_ioapics[idx].type = MP_IOAPIC;
- mp_ioapics[idx].flags = MPC_APIC_USABLE;
- mp_ioapics[idx].apicaddr = address;
+ for_each_ioapic(ioapic) {
+ if (ioapics[ioapic].mp_config.apicaddr == address) {
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic);
+ return -EEXIST;
+ }
+ }
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].apicid = io_apic_unique_id(id);
- mp_ioapics[idx].apicver = io_apic_get_version(idx);
+ idx = find_free_ioapic_entry();
+ if (idx >= MAX_IO_APICS) {
+ pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+ MAX_IO_APICS, idx);
+ return -ENOSPC;
+ }
+
+ ioapics[idx].mp_config.type = MP_IOAPIC;
+ ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+ ioapics[idx].mp_config.apicaddr = address;
+
+ io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address);
+ if (bad_ioapic_register(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENODEV;
+ }
+
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id);
+ ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
entries = io_apic_get_redir_entries(idx);
- mp_gsi_routing[idx].gsi_base = gsi_base;
- mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+ gsi_end = gsi_base + entries - 1;
+ for_each_ioapic(ioapic) {
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if ((gsi_base >= gsi_cfg->gsi_base &&
+ gsi_base <= gsi_cfg->gsi_end) ||
+ (gsi_end >= gsi_cfg->gsi_base &&
+ gsi_end <= gsi_cfg->gsi_end)) {
+ pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
+ gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOSPC;
+ }
+ }
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ gsi_cfg->gsi_base = gsi_base;
+ gsi_cfg->gsi_end = gsi_end;
+
+ ioapics[idx].irqdomain = NULL;
+ ioapics[idx].irqdomain_cfg = *cfg;
/*
- * The number of IO-APIC IRQ registers (== #pins):
+ * If mp_register_ioapic() is called during the early boot stage while
+ * walking the ACPI/DT tables, it is too early to create the irqdomain
+ * because the bootmem allocator is still in use, so defer creation to
+ * setup_IO_APIC().
*/
- nr_ioapic_registers[idx] = entries;
+ if (hotplug) {
+ if (mp_irqdomain_create(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOMEM;
+ }
+ alloc_ioapic_saved_registers(idx);
+ }
- if (mp_gsi_routing[idx].gsi_end >= gsi_top)
- gsi_top = mp_gsi_routing[idx].gsi_end + 1;
+ if (gsi_cfg->gsi_end >= gsi_top)
+ gsi_top = gsi_cfg->gsi_end + 1;
+ if (nr_ioapics <= idx)
+ nr_ioapics = idx + 1;
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
- mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
- mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+ /* Set nr_registers to mark entry present */
+ ioapics[idx].nr_registers = entries;
- nr_ioapics++;
+ pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+ idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+
+ return 0;
}
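
/*
 * The registration path above rejects a new IOAPIC whose GSI window
 * collides with an existing one by testing whether either endpoint of
 * the new window lands inside a registered window. For reference, the
 * compact interval test below is equivalent for the endpoint cases and
 * additionally flags full containment of an old window by a new one;
 * it is a standalone sketch, not the in-tree code.
 */
#include <stdio.h>
#include <stdbool.h>

static bool gsi_windows_conflict(unsigned int a_base, unsigned int a_end,
				 unsigned int b_base, unsigned int b_end)
{
	/* Closed intervals overlap iff each starts no later than the other ends. */
	return a_base <= b_end && b_base <= a_end;
}

int main(void)
{
	printf("%d\n", gsi_windows_conflict(0, 23, 24, 55));	/* 0: disjoint */
	printf("%d\n", gsi_windows_conflict(0, 23, 16, 40));	/* 1: overlap */
	printf("%d\n", gsi_windows_conflict(0, 60, 24, 55));	/* 1: containment */
	return 0;
}
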
-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+int mp_unregister_ioapic(u32 gsi_base)
{
- struct irq_cfg *cfg;
- struct irq_desc *desc;
+ int ioapic, pin;
+ int found = 0;
- printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
-#endif
- desc = irq_to_desc_alloc_node(0, 0);
+ for_each_ioapic(ioapic) {
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
+ return -ENODEV;
+ }
+
+ for_each_pin(ioapic, pin) {
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+ int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ struct mp_chip_data *data;
+
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ if (data && data->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic);
+ return -EBUSY;
+ }
+ }
+ }
+
+ /* Mark entry not present */
+ ioapics[ioapic].nr_registers = 0;
+ ioapic_destroy_irqdomain(ioapic);
+ free_ioapic_saved_registers(ioapic);
+ if (ioapics[ioapic].iomem_res)
+ release_resource(ioapics[ioapic].iomem_res);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic);
+ memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic]));
+
+ return 0;
+}
+
+int mp_ioapic_registered(u32 gsi_base)
+{
+ int ioapic;
+
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base)
+ return 1;
+
+ return 0;
+}
- setup_local_APIC();
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+ struct irq_alloc_info *info)
+{
+ if (info && info->ioapic.valid) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) {
+ /* PCI interrupts are always active low level triggered. */
+ data->is_level = true;
+ data->active_low = true;
+ }
+}
+
+/*
+ * Configure the I/O-APIC specific fields in the routing entry.
+ *
+ * This is important to setup the I/O-APIC specific bits (is_level,
+ * active_low, masked) because the underlying parent domain will only
+ * provide the routing information and is oblivious of the I/O-APIC
+ * specific bits.
+ *
+ * The entry is just preconfigured at this point and not written into the
+ * RTE. This happens later during activation which will fill in the actual
+ * routing information.
+ */
+static void mp_preconfigure_entry(struct mp_chip_data *data)
+{
+ struct IO_APIC_route_entry *entry = &data->entry;
+
+ memset(entry, 0, sizeof(*entry));
+ entry->is_level = data->is_level;
+ entry->active_low = data->active_low;
+ /*
+ * Mask level triggered irqs. Edge triggered irqs are masked
+ * by the irq core code in case they fire.
+ */
+ entry->masked = data->is_level;
+}
+
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct mp_chip_data *data;
+ struct irq_data *irq_data;
+ int ret, ioapic, pin;
+ unsigned long flags;
+
+ if (!info || nr_irqs > 1)
+ return -EINVAL;
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data)
+ return -EINVAL;
+
+ ioapic = mp_irqdomain_ioapic_idx(domain);
+ pin = info->ioapic.pin;
+ if (irq_resolve_mapping(domain, (irq_hw_number_t)pin))
+ return -EEXIST;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0)
+ goto free_data;
+
+ INIT_LIST_HEAD(&data->irq_2_pin);
+ irq_data->hwirq = info->ioapic.pin;
+ irq_data->chip = (domain->parent == x86_vector_domain) ?
+ &ioapic_chip : &ioapic_ir_chip;
+ irq_data->chip_data = data;
+ mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
- cfg = irq_cfg(0);
- add_pin_to_irq_node(cfg, 0, 0, 0);
- set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+ if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) {
+ ret = -ENOMEM;
+ goto free_irqs;
+ }
+
+ mp_preconfigure_entry(data);
+ mp_register_handler(virq, data->is_level);
+
+ local_irq_save(flags);
+ if (virq < nr_legacy_irqs())
+ legacy_pic->mask(virq);
+ local_irq_restore(flags);
+
+ apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low);
+ return 0;
- setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+free_irqs:
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+free_data:
+ kfree(data);
+ return ret;
+}
+
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
+
+ BUG_ON(nr_irqs != 1);
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
+ WARN_ON(!list_empty(&data->irq_2_pin));
+ kfree(irq_data->chip_data);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
+
+int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve)
+{
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ ioapic_configure_entry(irq_data);
+ return 0;
+}
+
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ /* It won't be called for IRQ with multiple IOAPIC pins associated */
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
+}
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+ return (int)(long)domain->host_data;
+}
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .alloc = mp_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..98a57cb4aa86 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,167 +1,291 @@
-#include <linux/cpumask.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
+#include <linux/cpumask.h>
#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/apic.h>
-#include <asm/proto.h>
-#include <asm/ipi.h>
+#include <linux/smp.h>
+#include <linux/string_choices.h>
-void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
+#include <asm/io_apic.h>
+
+#include "local.h"
+
+DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+#ifdef CONFIG_SMP
+static int apic_ipi_shorthand_off __ro_after_init;
+
+static __init int apic_ipi_shorthand(char *str)
{
- unsigned long query_cpu;
- unsigned long flags;
+ get_option(&str, &apic_ipi_shorthand_off);
+ return 1;
+}
+__setup("no_ipi_broadcast=", apic_ipi_shorthand);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("IPI shorthand broadcast: %s\n",
+ str_disabled_enabled(apic_ipi_shorthand_off));
+ return 0;
+}
+late_initcall(print_ipi_mode);
+void apic_smt_update(void)
+{
/*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicast to each CPU instead.
- * - mbligh
+ * Do not switch to broadcast mode if:
+ * - Disabled on the command line
+ * - Only a single CPU is online
+ * - Not all present CPUs have been at least booted once
+ *
+ * The latter is important as the local APIC might be in some
+ * random state and a broadcast might cause havoc. That's
+ * especially true for NMI broadcasting.
*/
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+ if (apic_ipi_shorthand_off || num_online_cpus() == 1 ||
+ !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) {
+ static_branch_disable(&apic_use_ipi_shorthand);
+ } else {
+ static_branch_enable(&apic_use_ipi_shorthand);
}
- local_irq_restore(flags);
}
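
/*
 * A kernel-style sketch of the static-key mechanism used above, with
 * hypothetical "example_" names: the branch in the hot path is patched
 * at runtime by static_branch_enable()/static_branch_disable(), so the
 * shorthand decision costs no memory load or conditional once patched.
 */
#include <linux/jump_label.h>
#include <linux/types.h>

static DEFINE_STATIC_KEY_FALSE(example_fast_path);
static unsigned long fast_hits, slow_hits;

static void example_hot_path(void)
{
	if (static_branch_likely(&example_fast_path))
		fast_hits++;	/* patched-in branch, e.g. shorthand IPI */
	else
		slow_hits++;	/* fallback, e.g. per-CPU unicast IPIs */
}

static void example_reconfigure(bool fast_ok)
{
	/* Called from slow-path code such as CPU hotplug notifications. */
	if (fast_ok)
		static_branch_enable(&example_fast_path);
	else
		static_branch_disable(&example_fast_path);
}
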
-void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
- int vector)
+void apic_send_IPI_allbutself(unsigned int vector)
{
- unsigned int this_cpu = smp_processor_id();
- unsigned int query_cpu;
- unsigned long flags;
+ if (num_online_cpus() < 2)
+ return;
- /* See Hack comment above */
+ if (static_branch_likely(&apic_use_ipi_shorthand))
+ __apic_send_IPI_allbutself(vector);
+ else
+ __apic_send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
- continue;
- __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+/*
+ * Send a 'reschedule' IPI to another CPU. It goes straight through and
+ * wastes no time serializing anything. Worst case is that we lose a
+ * reschedule ...
+ */
+void native_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
+ return;
}
- local_irq_restore(flags);
+ __apic_send_IPI(cpu, RESCHEDULE_VECTOR);
}
-void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
- int vector)
+void native_send_call_func_single_ipi(int cpu)
{
- unsigned long flags;
- unsigned int query_cpu;
+ __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+void native_send_call_func_ipi(const struct cpumask *mask)
+{
+ if (static_branch_likely(&apic_use_ipi_shorthand)) {
+ unsigned int cpu = smp_processor_id();
+
+ if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask))
+ goto sendmask;
+
+ if (cpumask_test_cpu(cpu, mask))
+ __apic_send_IPI_all(CALL_FUNCTION_VECTOR);
+ else if (num_online_cpus() > 1)
+ __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ return;
+ }
+
+sendmask:
+ __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+}
+
+void apic_send_nmi_to_offline_cpu(unsigned int cpu)
+{
+ if (WARN_ON_ONCE(!apic->nmi_to_offline_cpu))
+ return;
+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, &cpus_booted_once_mask)))
+ return;
+ apic->send_IPI(cpu, NMI_VECTOR);
+}
+#endif /* CONFIG_SMP */
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_XAPIC_DEST_FIELD(mask);
+}
+
+u32 apic_mem_wait_icr_idle_timeout(void)
+{
+ int cnt;
+
+ for (cnt = 0; cnt < 1000; cnt++) {
+ if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY))
+ return 0;
+ inc_irq_stat(icr_read_retry_count);
+ udelay(100);
+ }
+ return APIC_ICR_BUSY;
+}
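
/*
 * A standalone sketch of the bounded-poll idiom above: retry a
 * readiness check a fixed number of times with a fixed delay, and hand
 * the stuck busy bit back to the caller if the deadline passes (1000
 * iterations of 100us gives roughly 100ms here). device_busy() and
 * delay_us() are hypothetical stand-ins for the APIC_ICR read and
 * udelay().
 */
#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_BUSY_BIT 0x1000u

extern bool device_busy(void);		/* hypothetical hardware poll */
extern void delay_us(unsigned int us);	/* hypothetical delay */

uint32_t wait_idle_timeout(void)
{
	int cnt;

	for (cnt = 0; cnt < 1000; cnt++) {
		if (!device_busy())
			return 0;		/* went idle in time */
		delay_us(100);
	}
	return EXAMPLE_BUSY_BIT;		/* caller decides how to react */
}
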
+
+void apic_mem_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+/*
+ * This is safe against interruption because it only writes the lower 32
+ * bits of the APIC_ICR register. The destination field is ignored for
+ * short hand IPIs.
+ *
+ * wait_icr_idle()
+ * write(ICR2, dest)
+ * NMI
+ * wait_icr_idle()
+ * write(ICR)
+ * wait_icr_idle()
+ * write(ICR)
+ *
+ * This function does not need to disable interrupts as there is no ICR2
+ * interaction. The memory write is direct except when the machine is
+ * affected by the 11AP Pentium erratum, which turns the plain write into
+ * an XCHG operation.
+ */
+static void __default_send_IPI_shortcut(unsigned int shortcut, int vector)
+{
/*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
+ * Wait for the previous ICR command to complete. Use
+ * safe_apic_wait_icr_idle() for the NMI vector as there have been
+ * issues where otherwise the system hangs when the panic CPU tries
+ * to stop the others before launching the kdump kernel.
*/
+ if (unlikely(vector == NMI_VECTOR))
+ apic_mem_wait_icr_idle_timeout();
+ else
+ apic_mem_wait_icr_idle();
+
+ /* Destination field (ICR2) and the destination mode are ignored */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0));
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int dest_mask, int vector,
+ unsigned int dest_mode)
+{
+ /* See comment in __default_send_IPI_shortcut() */
+ if (unlikely(vector == NMI_VECTOR))
+ apic_mem_wait_icr_idle_timeout();
+ else
+ apic_mem_wait_icr_idle();
+
+ /* Set the IPI destination field in the ICR */
+ native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask));
+ /* Send it with the proper destination mode */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode));
+}
+
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __default_send_IPI_dest_field(
- apic->cpu_to_logical_apicid(query_cpu), vector,
- apic->dest_logical);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
local_irq_restore(flags);
}
-void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
- int vector)
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
unsigned long flags;
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- /* See Hack comment above */
+ unsigned long cpu;
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
- continue;
- __default_send_IPI_dest_field(
- apic->cpu_to_logical_apicid(query_cpu), vector,
- apic->dest_logical);
- }
+ for_each_cpu(cpu, mask) {
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ cpu), vector, APIC_DEST_PHYSICAL);
+ }
local_irq_restore(flags);
}
-#ifdef CONFIG_X86_32
-
-/*
- * This is only used on smaller machines.
- */
-void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector)
{
- unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned int cpu, this_cpu = smp_processor_id();
unsigned long flags;
- if (WARN_ONCE(!mask, "empty IPI mask"))
- return;
-
local_irq_save(flags);
- WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ cpu), vector, APIC_DEST_PHYSICAL);
+ }
local_irq_restore(flags);
}
-void default_send_IPI_allbutself(int vector)
+/*
+ * Helper function for APICs which insist on cpumasks
+ */
+void default_send_IPI_single(int cpu, int vector)
{
- /*
- * if there are no other CPUs in the system then we get an APIC send
- * error if we try to broadcast, thus avoid sending IPIs in this case.
- */
- if (!(num_online_cpus() > 1))
- return;
+ __apic_send_IPI_mask(cpumask_of(cpu), vector);
+}
- __default_local_send_IPI_allbutself(vector);
+void default_send_IPI_allbutself(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
}
void default_send_IPI_all(int vector)
{
- __default_local_send_IPI_all(vector);
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector);
}
void default_send_IPI_self(int vector)
{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector);
}
-/* must come after the send_IPI functions above for inlining */
-static int convert_apicid_to_cpu(int apic_id)
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector)
{
- int i;
+ unsigned long flags;
+ unsigned int cpu;
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
+ local_irq_save(flags);
+ for_each_cpu(cpu, mask)
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
+ local_irq_restore(flags);
}
-int safe_smp_processor_id(void)
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector)
{
- int apicid, cpuid;
+ unsigned int cpu, this_cpu = smp_processor_id();
+ unsigned long flags;
- if (!cpu_has_apic)
- return 0;
+ local_irq_save(flags);
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
+ }
+ local_irq_restore(flags);
+}
- apicid = hard_smp_processor_id();
- if (apicid == BAD_APICID)
- return 0;
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned long flags;
- cpuid = convert_apicid_to_cpu(apicid);
+ if (!mask)
+ return;
- return cpuid >= 0 ? cpuid : 0;
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
+ local_irq_restore(flags);
}
#endif
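
/*
 * A standalone illustration of the flat logical addressing used by the
 * 32-bit helpers above: each CPU owns one bit of the 8-bit logical
 * destination field, so the low word of a cpumask can be written into
 * the ICR destination directly. That is also why flat mode tops out at
 * eight CPUs. The values below are hypothetical sample data.
 */
#include <stdio.h>

int main(void)
{
	unsigned long online = 0x0ful;			/* CPUs 0-3 online */
	unsigned long mask = (1ul << 1) | (1ul << 3);	/* target CPUs 1 and 3 */

	if (mask & ~online)
		printf("mask contains offline CPUs\n");
	printf("logical dest field: 0x%02lx\n", mask);	/* prints 0x0a */
	return 0;
}
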
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
new file mode 100644
index 000000000000..bdcf609eb283
--- /dev/null
+++ b/arch/x86/kernel/apic/local.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Historical copyright notices:
+ *
+ * Copyright 2004 James Cleverdon, IBM.
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/jump_label.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+
+/* X2APIC */
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+u32 x2apic_get_apic_id(u32 id);
+
+void x2apic_send_IPI_all(int vector);
+void x2apic_send_IPI_allbutself(int vector);
+void x2apic_send_IPI_self(int vector);
+extern u32 x2apic_max_apicid;
+
+/* IPI */
+
+DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand);
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
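
/*
 * A standalone mirror of __prepare_ICR() above, with the xAPIC constant
 * values spelled out: the low ICR word is shortcut | destination bits |
 * delivery mode | vector, and NMI delivery ignores the vector number.
 * The "EX_" constants are local stand-ins for the kernel definitions.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_APIC_DM_FIXED	0x00000u	/* delivery mode 000b */
#define EX_APIC_DM_NMI		0x00400u	/* delivery mode 100b */
#define EX_NMI_VECTOR		0x02

static uint32_t prepare_icr(uint32_t shortcut, int vector, uint32_t dest)
{
	uint32_t icr = shortcut | dest;

	if (vector == EX_NMI_VECTOR)
		icr |= EX_APIC_DM_NMI;		/* vector field is ignored */
	else
		icr |= EX_APIC_DM_FIXED | (uint32_t)vector;
	return icr;
}

int main(void)
{
	printf("0x%08x\n", prepare_icr(0, 0xfd, 0));		/* 0x000000fd */
	printf("0x%08x\n", prepare_icr(0, EX_NMI_VECTOR, 0));	/* 0x00000400 */
	return 0;
}
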
+
+void default_init_apic_ldr(void);
+
+void apic_mem_wait_icr_idle(void);
+u32 apic_mem_wait_icr_idle_timeout(void);
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
+
+void default_send_IPI_single(int cpu, int vector);
+void default_send_IPI_single_phys(int cpu, int vector);
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector);
+void default_send_IPI_allbutself(int vector);
+void default_send_IPI_all(int vector);
+void default_send_IPI_self(int vector);
+
+#ifdef CONFIG_X86_32
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
+void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
new file mode 100644
index 000000000000..66bc5d3e79db
--- /dev/null
+++ b/arch/x86/kernel/apic/msi.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support of MSI, HPET and DMAR interrupts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Convert to hierarchical irqdomain
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
+#include <linux/msi.h>
+#include <asm/irqdomain.h>
+#include <asm/hpet.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+#include <asm/xen/hypervisor.h>
+
+struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg, false);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done carefully to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path).
+ * The reservation mode bit is set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The interrupt is not yet started up
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_can_reserve(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ !irqd_is_started(irqd) ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in
+ * common_interrupt().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
+/**
+ * pci_dev_has_default_msi_parent_domain - Check whether the device has the default
+ * MSI parent domain associated
+ * @dev: Pointer to the PCI device
+ */
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
+
+ if (!domain)
+ domain = dev_get_msi_domain(&dev->bus->dev);
+ if (!domain)
+ return false;
+
+ return domain == x86_vector_domain;
+}
+
+/**
+ * x86_msi_prepare - Setup of msi_alloc_info_t for allocations
+ * @domain: The domain for which this setup happens
+ * @dev: The device for which interrupts are allocated
+ * @nvec: The number of vectors to allocate
+ * @alloc: The allocation info structure to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * functionality is determined via the @domain's bus token which allows to
+ * map the X86 specific allocation type.
+ */
+static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *alloc)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ init_irq_alloc_info(alloc, NULL);
+
+ switch (info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * x86_init_dev_msi_info - Domain info setup for MSI domains
+ * @dev: The device for which the domain should be created
+ * @domain: The (root) domain providing this callback
+ * @real_parent: The real parent domain of the to initialize domain
+ * @info: The domain info for the to initialize domain
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. The domain specific functionality
+ * is determined via the @real_parent.
+ */
+static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+ /* MSI parent domain specific settings */
+ switch (real_parent->bus_token) {
+ case DOMAIN_BUS_ANY:
+ /* Only the vector domain can have the ANY token */
+ if (WARN_ON_ONCE(domain != real_parent))
+ return false;
+ info->chip->irq_set_affinity = msi_set_affinity;
+ info->chip->flags |= IRQCHIP_MOVE_DEFERRED;
+ break;
+ case DOMAIN_BUS_DMAR:
+ case DOMAIN_BUS_AMDVI:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Is the target supported? */
+ switch(info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Mask out the domain specific MSI feature flags which are not
+ * supported by the real parent.
+ */
+ info->flags &= pops->supported_flags;
+ /* Enforce the required flags */
+ info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED;
+
+ /* This is always invoked from the top level MSI domain! */
+ info->ops->msi_prepare = x86_msi_prepare;
+
+ info->chip->irq_ack = irq_chip_ack_parent;
+ info->chip->irq_retrigger = irq_chip_retrigger_hierarchy;
+ info->chip->flags |= IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP;
+
+ info->handler = handle_edge_irq;
+ info->handler_name = "edge";
+
+ return true;
+}
+
+static const struct msi_parent_ops x86_vector_msi_parent_ops = {
+ .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED,
+ .init_dev_msi_info = x86_init_dev_msi_info,
+};
+
+struct irq_domain * __init native_create_pci_msi_domain(void)
+{
+ if (apic_is_disabled)
+ return NULL;
+
+ x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops;
+ return x86_vector_domain;
+}
+
+void __init x86_create_pci_msi_domain(void)
+{
+ x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
+}
+
+/* Keep around for hyperV */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
+{
+ init_irq_alloc_info(arg, NULL);
+
+ if (to_pci_dev(dev)->msix_enabled)
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ else
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
+
+#ifdef CONFIG_DMAR_TABLE
+/*
+ * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the
+ * high bits of the destination APIC ID. This can't be done in the general
+ * case for MSIs as it would be targeting real memory above 4GiB not the
+ * APIC.
+ */
+static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, true);
+}
+
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ dmar_msi_write(data->irq, msg);
+}
+
+static struct irq_chip dmar_msi_controller = {
+ .name = "DMAR-MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = dmar_msi_compose_msg,
+ .irq_write_msi_msg = dmar_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static int dmar_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
+
+ return 0;
+}
+
+static struct msi_domain_ops dmar_msi_domain_ops = {
+ .msi_init = dmar_msi_init,
+};
+
+static struct msi_domain_info dmar_msi_domain_info = {
+ .ops = &dmar_msi_domain_ops,
+ .chip = &dmar_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
+};
+
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+ static struct irq_domain *dmar_domain;
+ static DEFINE_MUTEX(dmar_lock);
+ struct fwnode_handle *fn;
+
+ mutex_lock(&dmar_lock);
+ if (dmar_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("DMAR-MSI");
+ if (fn) {
+ dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
+ x86_vector_domain);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
+ }
+out:
+ mutex_unlock(&dmar_lock);
+ return dmar_domain;
+}
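
/*
 * A kernel-style sketch of the create-once pattern above, using a
 * hypothetical object type: a function-local static pointer guarded by
 * a mutex, so concurrent first callers cannot both allocate, and every
 * later call returns the cached object.
 */
#include <linux/mutex.h>
#include <linux/slab.h>

struct example_obj {
	int dummy;
};

static struct example_obj *get_example_obj(void)
{
	static struct example_obj *obj;
	static DEFINE_MUTEX(example_lock);

	mutex_lock(&example_lock);
	if (!obj)
		obj = kzalloc(sizeof(*obj), GFP_KERNEL);	/* first caller creates */
	mutex_unlock(&example_lock);

	return obj;	/* NULL only if the allocation failed */
}
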
+
+int dmar_alloc_hwirq(int id, int node, void *arg)
+{
+ struct irq_domain *domain = dmar_get_irq_domain();
+ struct irq_alloc_info info;
+
+ if (!domain)
+ return -1;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+ info.devid = id;
+ info.hwirq = id;
+ info.data = arg;
+
+ return irq_domain_alloc_irqs(domain, 1, node, &info);
+}
+
+void dmar_free_hwirq(int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+#endif
+
+bool arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ return xen_initdom_restore_msi(dev);
+}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index 1edaf15c0b8e..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,574 +0,0 @@
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- * be enabled
- * 0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
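As a summary of how the code below drives those three states (a reading of this file, not new behaviour): setup_apic_nmi_watchdog() moves the count from 0 towards +n with one atomic_inc() per CPU that arms its watchdog; stop_apic_nmi_watchdog() walks it back down with atomic_dec(); and check_nmi_watchdog() does atomic_set(&nmi_active, -1) when every CPU's watchdog turns out broken, disabling it permanently.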
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
- return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
- return atomic_read(&mce_entry) > 0;
-#endif
- return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active when highres/dyntick is on.
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
- return per_cpu(irq_stat, cpu).apic_timer_irqs +
- per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test, make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
- local_irq_enable_in_hardirq();
- /*
- * Intentionally don't use cpu_relax here. This is
- * to make sure that the performance counter really ticks,
- * even if there is a simulator or similar that catches the
- * pause instruction. On a real HT machine this is fine because
- * all other CPUs are busy with "useless" delay loops and don't
- * care if they get somewhat fewer cycles.
- */
- while (endflag == 0)
- mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING
- "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
- cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
- printk(KERN_WARNING
- "Please report this to bugzilla.kernel.org,\n");
- printk(KERN_WARNING
- "and attach the output of the 'dmesg' command.\n");
-
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
- unsigned int *prev_nmi_count;
- int cpu;
-
- if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
- return 0;
-
- prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
- if (!prev_nmi_count)
- goto error;
-
- printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
- if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
- for_each_possible_cpu(cpu)
- prev_nmi_count[cpu] = get_nmi_count(cpu);
- local_irq_enable();
- mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
- for_each_online_cpu(cpu) {
- if (!per_cpu(wd_enabled, cpu))
- continue;
- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
- report_broken_nmi(cpu, prev_nmi_count);
- }
- endflag = 1;
- if (!atomic_read(&nmi_active)) {
- kfree(prev_nmi_count);
- atomic_set(&nmi_active, -1);
- goto error;
- }
- printk("OK.\n");
-
- /*
- * now that we know it works, we can reduce the NMI frequency to
- * something more reasonable; this makes a difference in some configs
- */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = lapic_adjust_nmi_hz(1);
-
- kfree(prev_nmi_count);
- return 0;
-error:
- if (nmi_watchdog == NMI_IO_APIC) {
- if (!timer_through_8259)
- legacy_pic->chip->mask(0);
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
- }
-
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
- return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
- unsigned int nmi;
-
- if (!strncmp(str, "panic", 5)) {
- panic_on_timeout = 1;
- str = strchr(str, ',');
- if (!str)
- return 1;
- ++str;
- }
-
- if (!strncmp(str, "lapic", 5))
- nmi_watchdog = NMI_LOCAL_APIC;
- else if (!strncmp(str, "ioapic", 6))
- nmi_watchdog = NMI_IO_APIC;
- else {
- get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
- return 0;
- nmi_watchdog = nmi;
- }
-
- return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- nmi_pm_active = atomic_read(&nmi_active);
- stop_apic_nmi_watchdog(NULL);
- BUG_ON(atomic_read(&nmi_active) != 0);
- return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- if (nmi_pm_active > 0) {
- setup_apic_nmi_watchdog(NULL);
- touch_nmi_watchdog();
- }
- return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
- .name = "lapic_nmi",
- .resume = lapic_nmi_resume,
- .suspend = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
- .id = 0,
- .cls = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
- int error;
-
- /*
- * should really be a BUG_ON but because this is an
- * init call, it just doesn't work. -dcz
- */
- if (nmi_watchdog != NMI_LOCAL_APIC)
- return 0;
-
- if (atomic_read(&nmi_active) < 0)
- return 0;
-
- error = sysdev_class_register(&nmi_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic_nmi);
- return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon as the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
- __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
- if (__get_cpu_var(wd_enabled))
- return;
-
- /* cheap hack to support suspend/resume */
- /* if cpu0 is not active, neither should the other cpus be */
- if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
- return;
-
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- if (lapic_watchdog_init(nmi_hz) < 0) {
- __get_cpu_var(wd_enabled) = 0;
- return;
- }
- /* FALL THROUGH */
- case NMI_IO_APIC:
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
- /* only support LOCAL and IO APICs for now */
- if (!nmi_watchdog_active())
- return;
- if (__get_cpu_var(wd_enabled) == 0)
- return;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- lapic_watchdog_stop();
- else
- __acpi_nmi_disable(NULL);
- __get_cpu_var(wd_enabled) = 0;
- atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check its local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
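Distilled, the per-CPU check implemented below amounts to the following simplified pseudo-C (the helper names are invented for illustration):

	/* on each watchdog NMI, on the current CPU: */
	if (timer_irq_sum(cpu) == last_seen_sum(cpu)) {	/* no forward progress */
		if (++alert_counter == 5 * nmi_hz)	/* ~5 seconds stuck */
			die_nmi("LOCKUP", regs, panic_on_timeout);
	} else {
		remember_sum(cpu);			/* progress was made */
		alert_counter = 0;
	}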
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
- if (nmi_watchdog_active()) {
- unsigned cpu;
-
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
-
- /*
- * Tickle the softlockup detector too:
- */
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
- /*
- * Since current_thread_info() is always on the stack, and we
- * always switch the stack NMI-atomically, it's safe to use
- * smp_processor_id().
- */
- unsigned int sum;
- int touched = 0;
- int cpu = smp_processor_id();
- int rc = 0;
-
- /* check for other users first */
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- rc = 1;
- touched = 1;
- }
-
- sum = get_timer_irqs(cpu);
-
- if (__get_cpu_var(nmi_touch)) {
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
-
- /* We can be called before check_nmi_watchdog, hence NULL check. */
- if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
- raw_spin_lock(&lock);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- show_regs(regs);
- dump_stack();
- raw_spin_unlock(&lock);
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
- rc = 1;
- }
-
- /* Could check oops_in_progress here too, but it's safer not to */
- if (mce_in_progress())
- touched = 1;
-
- /* if none of the timers is firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- __this_cpu_inc(alert_counter);
- if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
- /*
- * die_nmi will return ONLY if NOTIFY_STOP happens..
- */
- die_nmi("BUG: NMI Watchdog detected LOCKUP",
- regs, panic_on_timeout);
- } else {
- __get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(alert_counter, 0);
- }
-
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /*
- * Don't know how to accurately check for this;
- * just assume it was a watchdog timer interrupt.
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
- touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
- unknown_nmi_panic = 1;
- return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int old_state;
-
- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
- old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, buffer, length, ppos);
- if (!!old_state == !!nmi_watchdog_enabled)
- return 0;
-
- if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
- printk(KERN_WARNING
- "NMI watchdog is permanently disabled\n");
- return -EIO;
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- if (nmi_watchdog_enabled)
- enable_lapic_nmi_watchdog();
- else
- disable_lapic_nmi_watchdog();
- } else if (nmi_watchdog == NMI_IO_APIC) {
- if (nmi_watchdog_enabled)
- enable_ioapic_nmi_watchdog();
- else
- disable_ioapic_nmi_watchdog();
- } else {
- printk(KERN_WARNING
- "NMI watchdog doesn't know what hardware to touch\n");
- return -EIO;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
- int i;
-
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
-
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(to_cpumask(backtrace_mask)))
- break;
- mdelay(1);
- }
-}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
deleted file mode 100644
index 3e28401f161c..000000000000
--- a/arch/x86/kernel/apic/numaq_32.c
+++ /dev/null
@@ -1,549 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-#include <linux/nodemask.h>
-#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/numa.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/numaq.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/ipi.h>
-
-#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
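For example, with 4 KiB pages (PAGE_SHIFT == 12) the macro shifts by 8, so MB_TO_PAGES(1) == 256 and MB_TO_PAGES(64) == 16384.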
-
-int found_numaq;
-
-/*
- * Have to match translation table entries to main table entries by counter,
- * hence the mpc_record variable ... can't see a less disgusting way of
- * doing this ...
- */
-struct mpc_trans {
- unsigned char mpc_type;
- unsigned char trans_len;
- unsigned char trans_type;
- unsigned char trans_quad;
- unsigned char trans_global;
- unsigned char trans_local;
- unsigned short trans_reserved;
-};
-
-static int mpc_record;
-
-static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id[NR_CPUS/4][4];
-
-
-static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
-{
- struct eachquadmem *eq = scd->eq + node;
-
- node_set_online(node);
-
- /* Convert to pages */
- node_start_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
-
- node_end_pfn[node] =
- MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
- e820_register_active_regions(node, node_start_pfn[node],
- node_end_pfn[node]);
-
- memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
- node_remap_size[node] = node_memmap_size_bytes(node,
- node_start_pfn[node],
- node_end_pfn[node]);
-}
-
-/*
- * Function: smp_dump_qct()
- *
- * Description: gets memory layout from the quad config table. This
- * function also updates node_online_map with the nodes (quads) present.
- */
-static void __init smp_dump_qct(void)
-{
- struct sys_cfg_data *scd;
- int node;
-
- scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
-
- nodes_clear(node_online_map);
- for_each_node(node) {
- if (scd->quads_present31_0 & (1 << node))
- numaq_register_node(node, scd);
- }
-}
-
-void __cpuinit numaq_tsc_disable(void)
-{
- if (!found_numaq)
- return;
-
- if (num_online_nodes() > 1) {
- printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
- setup_clear_cpu_cap(X86_FEATURE_TSC);
- }
-}
-
-static void __init numaq_tsc_init(void)
-{
- numaq_tsc_disable();
-}
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
- return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
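Two worked values for this encoding (illustrative): quad 2 with physical APIC ID 3 gives (2 << 4) + (3 << 1) = 0x26, while the special-cased physical ID 0 gives (quad << 4) + 1, e.g. 0x21 on quad 2.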
-
-/* x86_quirks member */
-static int mpc_apic_id(struct mpc_cpu *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int logical_apicid = generate_logical_apicid(quad, m->apicid);
-
- printk(KERN_DEBUG
- "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4,
- m->apicver, quad, logical_apicid);
-
- return logical_apicid;
-}
-
-/* x86_quirks member */
-static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- mp_bus_id_to_node[m->busid] = quad;
- mp_bus_id_to_local[m->busid] = local;
-
- printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
-}
-
-/* x86_quirks member */
-static void mpc_oem_pci_bus(struct mpc_bus *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- quad_local_to_mp_bus_id[quad][local] = m->busid;
-}
-
-/*
- * Called from mpparse code.
- * mode = 0: prescan
- * mode = 1: one mpc entry scanned
- */
-static void numaq_mpc_record(unsigned int mode)
-{
- if (!mode)
- mpc_record = 0;
- else
- mpc_record++;
-}
-
-static void __init MP_translation_info(struct mpc_trans *m)
-{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
-
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
-
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
-}
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-static void __init smp_read_mpc_oem(struct mpc_table *mpc)
-{
- struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
- mpc_record = 0;
- printk(KERN_INFO
- "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
-
- if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->signature[0], oemtable->signature[1],
- oemtable->signature[2], oemtable->signature[3]);
- return;
- }
-
- if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
-
- while (count < oemtable->length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_trans *m = (void *)oemptr;
-
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
-}
-
-static __init void early_check_numaq(void)
-{
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-
- if (found_numaq) {
- x86_init.mpparse.mpc_record = numaq_mpc_record;
- x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
- x86_init.mpparse.mpc_apic_id = mpc_apic_id;
- x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
- x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
- x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
- x86_init.timers.tsc_pre_init = numaq_tsc_init;
- x86_init.pci.init = pci_numaq_init;
- }
-}
-
-int __init get_memcfg_numaq(void)
-{
- early_check_numaq();
- if (!found_numaq)
- return 0;
- smp_dump_qct();
-
- return 1;
-}
-
-#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static inline unsigned int numaq_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0x0F;
-}
-
-static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static inline void numaq_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static inline void numaq_send_IPI_all(int vector)
-{
- numaq_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
-#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
-
-/*
- * Because we use NMIs rather than the INIT-STARTUP sequence to
- * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
- */
-static inline void numaq_smp_callin_clear_local_apic(void)
-{
- clear_local_APIC();
-}
-
-static inline const struct cpumask *numaq_target_cpus(void)
-{
- return cpu_all_mask;
-}
-
-static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-static inline unsigned long numaq_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static inline int numaq_apic_id_registered(void)
-{
- return 1;
-}
-
-static inline void numaq_init_apic_ldr(void)
-{
- /* Already done in NUMA-Q firmware */
-}
-
-static inline void numaq_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-/*
- * Skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum.
- */
-static inline int numaq_multi_timer_check(int apic, int irq)
-{
- return apic != 0 && irq == 0;
-}
-
-static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL, retmap);
-}
-
-static inline int numaq_cpu_to_logical_apicid(int cpu)
-{
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_2_logical_apicid[cpu];
-}
-
-/*
- * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
- * cpu to APIC ID relation to properly interact with the intelligent
- * mode of the cluster controller.
- */
-static inline int numaq_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < 60)
- return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
- else
- return BAD_APICID;
-}
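Worked example (illustrative): mps_cpu 5 sits on quad 1 as local CPU 1, so this evaluates to ((5 >> 2) << 4) | (1 << (5 & 0x3)) = 0x10 | 0x02 = 0x12, matching the quad-in-high-nibble, one-hot-CPU-in-low-nibble layout produced by generate_logical_apicid() above.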
-
-static inline int numaq_apicid_to_node(int logical_apicid)
-{
- return logical_apicid >> 4;
-}
-
-static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
-{
- int node = numaq_apicid_to_node(logical_apicid);
- int cpu = __ffs(logical_apicid & 0xf);
-
- physid_set_mask_of_physid(cpu + 4*node, retmap);
-}
-
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-
-static inline int numaq_check_phys_apicid_present(int phys_apicid)
-{
- return 1;
-}
-
-/*
- * We use physical apicids here, not logical, so just return the default
- * physical broadcast to stop people from breaking us
- */
-static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- return 0x0F;
-}
-
-static inline unsigned int
-numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- return 0x0F;
-}
-
-/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
-static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int
-numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- if (strncmp(oem, "IBM NUMA", 8))
- printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
- else
- found_numaq = 1;
-
- return found_numaq;
-}
-
-static int probe_numaq(void)
-{
- /* already know from get_memcfg_numaq() */
- return found_numaq;
-}
-
-static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt destination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-static void numaq_setup_portio_remap(void)
-{
- int num_quads = num_online_nodes();
-
- if (num_quads <= 1)
- return;
-
- printk(KERN_INFO
- "Remapping cross-quad port I/O for %d quads\n", num_quads);
-
- xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
-
- printk(KERN_INFO
- "xquad_portio vaddr 0x%08lx, len %08lx\n",
- (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
-}
-
-/* Use __refdata to keep false positive warning calm. */
-struct apic __refdata apic_numaq = {
-
- .name = "NUMAQ",
- .probe = probe_numaq,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = numaq_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* physical delivery on LOCAL quad: */
- .irq_dest_mode = 0,
-
- .target_cpus = numaq_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = numaq_check_apicid_used,
- .check_apicid_present = numaq_check_apicid_present,
-
- .vector_allocation_domain = numaq_vector_allocation_domain,
- .init_apic_ldr = numaq_init_apic_ldr,
-
- .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
- .setup_apic_routing = numaq_setup_apic_routing,
- .multi_timer_check = numaq_multi_timer_check,
- .apicid_to_node = numaq_apicid_to_node,
- .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
- .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
- .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
- .setup_portio_remap = numaq_setup_portio_remap,
- .check_phys_apicid_present = numaq_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = numaq_phys_pkg_id,
- .mps_oem_check = numaq_mps_oem_check,
-
- .get_apic_id = numaq_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
-
- .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = numaq_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = numaq_send_IPI_allbutself,
- .send_IPI_all = numaq_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
- .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
-
- /* We don't do anything here because we use NMI's to boot instead */
- .wait_for_init_deassert = NULL,
-
- .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
- .inquire_remote_apic = NULL,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..87bc9e7ca5d6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -1,104 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Default generic APIC driver. This handles up to 8 CPUs.
*
* Copyright 2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License, v.2
*
* Generic x86 APIC driver probe layer.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
+#include <linux/export.h>
#include <linux/errno.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-
#include <linux/smp.h>
-#include <asm/ipi.h>
-
-#include <linux/interrupt.h>
-#include <asm/acpi.h>
-#include <asm/e820.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define DEFAULT_SEND_IPI (1)
-#else
-#define DEFAULT_SEND_IPI (0)
-#endif
-
-int no_broadcast = DEFAULT_SEND_IPI;
-
-static __init int no_ipi_broadcast(char *str)
-{
- get_option(&str, &no_broadcast);
- pr_info("Using %s mode\n",
- no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
- return 1;
-}
-__setup("no_ipi_broadcast=", no_ipi_broadcast);
-
-static int __init print_ipi_mode(void)
-{
- pr_info("Using IPI %s mode\n",
- no_broadcast ? "No-Shortcut" : "Shortcut");
- return 0;
-}
-late_initcall(print_ipi_mode);
-void __init default_setup_apic_routing(void)
-{
- int version = apic_version[boot_cpu_physical_apicid];
+#include <xen/xen.h>
- if (num_possible_cpus() > 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above, fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
-
-#ifdef CONFIG_X86_BIGSMP
- generic_bigsmp_probe();
-#endif
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/acpi.h>
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
-}
+#include "local.h"
-static void setup_apic_flat_routing(void)
+static u32 default_get_apic_id(u32 x)
{
-#ifdef CONFIG_X86_IO_APIC
- printk(KERN_INFO
- "Enabling APIC mode: Flat. Using %d I/O APICs\n",
- nr_ioapics);
-#endif
-}
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /*
- * Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt destination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
}
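An illustrative reading (not part of the patch): an APIC_ID register value of 0x1B000000 decodes to ID 0x1B on xAPIC or when X86_FEATURE_EXTD_APICID is set, but masks down to 0x0B on an older part where only bits 27:24 are architected.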
/* should be called last. */
@@ -107,106 +34,55 @@ static int probe_default(void)
return 1;
}
-struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
- .acpi_madt_oem_check = NULL,
- .apic_id_registered = default_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .dest_mode_logical = true,
- .target_cpus = default_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
- .check_apicid_present = default_check_apicid_present,
- .vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
-
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = setup_apic_flat_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = default_apicid_to_node,
- .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = default_phys_pkg_id,
- .mps_oem_check = NULL,
+ .max_apic_id = 0xFE,
.get_apic_id = default_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
- .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_flat_calc_apicid,
+ .send_IPI = default_send_IPI_single,
.send_IPI_mask = default_send_IPI_mask_logical,
.send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
.send_IPI_allbutself = default_send_IPI_allbutself,
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
-extern struct apic apic_numaq;
-extern struct apic apic_summit;
-extern struct apic apic_bigsmp;
-extern struct apic apic_es7000;
-extern struct apic apic_es7000_cluster;
+apic_driver(apic_default);
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
EXPORT_SYMBOL_GPL(apic);
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_NUMAQ
- &apic_numaq,
-#endif
-#ifdef CONFIG_X86_SUMMIT
- &apic_summit,
-#endif
-#ifdef CONFIG_X86_BIGSMP
- &apic_bigsmp,
-#endif
-#ifdef CONFIG_X86_ES7000
- &apic_es7000,
- &apic_es7000_cluster,
-#endif
- &apic_default, /* must be last */
- NULL,
-};
-
static int cmdline_apic __initdata;
static int __init parse_apic(char *arg)
{
- int i;
+ struct apic **drv;
if (!arg)
return -EINVAL;
- for (i = 0; apic_probe[i]; i++) {
- if (!strcmp(apic_probe[i]->name, arg)) {
- apic = apic_probe[i];
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!strcmp((*drv)->name, arg)) {
+ apic_install_driver(*drv);
cmdline_apic = 1;
return 0;
}
@@ -217,82 +93,19 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init generic_bigsmp_probe(void)
-{
-#ifdef CONFIG_X86_BIGSMP
- /*
- * This routine is used to switch to bigsmp mode when
- * - There is no apic= option specified by the user
- * - generic_apic_probe() has chosen apic_default as the sub_arch
- * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
- */
-
- if (!cmdline_apic && apic == &apic_default) {
- if (apic_bigsmp.probe()) {
- apic = &apic_bigsmp;
- printk(KERN_INFO "Overriding APIC driver with %s\n",
- apic->name);
- }
- }
-#endif
-}
-
-void __init generic_apic_probe(void)
+void __init x86_32_probe_apic(void)
{
if (!cmdline_apic) {
- int i;
- for (i = 0; apic_probe[i]; i++) {
- if (apic_probe[i]->probe()) {
- apic = apic_probe[i];
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe()) {
+ apic_install_driver(*drv);
break;
}
}
/* Not visible without early console */
- if (!apic_probe[i])
+ if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
- printk(KERN_INFO "Using APIC driver %s\n", apic->name);
-}
-
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init
-generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- int i;
-
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->mps_oem_check)
- continue;
- if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
- continue;
-
- if (!cmdline_apic) {
- apic = apic_probe[i];
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
-}
-
-int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- int i;
-
- for (i = 0; apic_probe[i]; ++i) {
- if (!apic_probe[i]->acpi_madt_oem_check)
- continue;
- if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
- continue;
-
- if (!cmdline_apic) {
- apic = apic_probe[i];
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..ecdf0c4121e1 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
*
* Generic APIC sub-arch probe layer.
*
@@ -8,99 +8,33 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/hardirq.h>
-#include <linux/dmar.h>
-
-#include <asm/smp.h>
+#include <linux/thread_info.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/setup.h>
-
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2apic_uv_x;
-extern struct apic apic_x2apic_phys;
-extern struct apic apic_x2apic_cluster;
-
-struct apic __read_mostly *apic = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_UV
- &apic_x2apic_uv_x,
-#endif
-#ifdef CONFIG_X86_X2APIC
- &apic_x2apic_phys,
- &apic_x2apic_cluster,
-#endif
- &apic_physflat,
- NULL,
-};
-static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
+#include "local.h"
-/*
- * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
- */
-void __init default_setup_apic_routing(void)
+/* Select the appropriate APIC driver */
+void __init x86_64_probe_apic(void)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_mode
-#ifdef CONFIG_X86_UV
- && apic != &apic_x2apic_uv_x
-#endif
- ) {
- if (x2apic_phys)
- apic = &apic_x2apic_phys;
- else
- apic = &apic_x2apic_cluster;
- }
-#endif
-
- if (apic == &apic_flat && num_possible_cpus() > 8)
- apic = &apic_physflat;
+ struct apic **drv;
- printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+ enable_IR_x2apic();
- if (is_vsmp_box()) {
- /* need to update phys_pkg_id */
- apic->phys_pkg_id = apicid_phys_pkg_id;
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe && (*drv)->probe()) {
+ apic_install_driver(*drv);
+ break;
+ }
}
-
- /*
- * Now that apic routing model is selected, configure the
- * fault handling for intr remapping.
- */
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
-}
-
-/* Same for both flat and physical. */
-
-void apic_send_IPI_self(int vector)
-{
- __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int i;
+ struct apic **drv;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
- apic = apic_probe[i];
- printk(KERN_INFO "Setting APIC routing to %s.\n",
- apic->name);
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ apic_install_driver(*drv);
return 1;
}
}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
deleted file mode 100644
index 9b419263d90d..000000000000
--- a/arch/x86/kernel/apic/summit_32.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/*
- * IBM Summit-Specific Code
- *
- * Written By: Matthew Dobson, IBM Corporation
- *
- * Copyright (c) 2003 IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- *
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/bios_ebda.h>
-
-/*
- * APIC driver for the IBM "Summit" chipset.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/smp.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/ipi.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/smp.h>
-
-static unsigned summit_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static void summit_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static void summit_send_IPI_all(int vector)
-{
- summit_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#include <asm/tsc.h>
-
-extern int use_cyclone;
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static void setup_summit(void);
-#else
-static inline void setup_summit(void) {}
-#endif
-
-static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- if (!strncmp(oem, "IBM ENSW", 8) &&
- (!strncmp(productid, "VIGIL SMP", 9)
- || !strncmp(productid, "EXA", 3)
- || !strncmp(productid, "RUTHLESS SMP", 12))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-/* Hook from generic ACPI tables.c */
-static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- if (!strncmp(oem_id, "IBM", 3) &&
- (!strncmp(oem_table_id, "SERVIGIL", 8)
- || !strncmp(oem_table_id, "EXA", 3))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-struct rio_table_hdr {
- unsigned char version; /* Version number of this data structure */
- /* Version 3 adds chassis_num & WP_index */
- unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
- unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
-} __attribute__((packed));
-
-struct scal_detail {
- unsigned char node_id; /* Scalability Node ID */
- unsigned long CBAR; /* Address of 1MB register space */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF = None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port2node; /* Node ID port connected to: 0xFF = None */
- unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
-} __attribute__((packed));
-
-struct rio_detail {
- unsigned char node_id; /* RIO Node ID */
- unsigned long BBAR; /* Address of 1MB register space */
- unsigned char type; /* Type of device */
- unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
- /* For CYC: Node ID of Twister that owns this CYC */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF=None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
- /* For CYC: 0 */
- unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
- /* = 0 : the XAPIC is not used, ie:*/
- /* ints fwded to another XAPIC */
- /* Bits1:7 Reserved */
- /* For CYC: Bits0:7 Reserved */
- unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
- /* lower slot numbers/PCI bus numbers */
- /* For CYC: No meaning */
- unsigned char chassis_num; /* 1 based Chassis number */
- /* For LookOut WPEGs this field indicates the */
- /* Expansion Chassis #, enumerated from Boot */
- /* Node WPEG external port, then Boot Node CYC */
- /* external port, then Next Vigil chassis WPEG */
- /* external port, etc. */
- /* Shared Lookouts have only 1 chassis number (the */
- /* first one assigned) */
-} __attribute__((packed));
-
-
-typedef enum {
- CompatTwister = 0, /* Compatibility Twister */
- AltTwister = 1, /* Alternate Twister of internal 8-way */
- CompatCyclone = 2, /* Compatibility Cyclone */
- AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
- CompatWPEG = 4, /* Compatibility WPEG */
- AltWPEG = 5, /* Second Planar WPEG */
- LookOutAWPEG = 6, /* LookOut WPEG */
- LookOutBWPEG = 7, /* LookOut WPEG */
-} node_type;
-
-static inline int is_WPEG(struct rio_detail *rio){
- return (rio->type == CompatWPEG || rio->type == AltWPEG ||
- rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
-}
-
-#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static const struct cpumask *summit_target_cpus(void)
-{
- /* CPU_MASK_ALL (0xff) has undefined behaviour with
- * dest_LowestPrio mode logical clustered apic interrupt routing.
- * Just start on cpu 0; IRQ balancing will spread the load.
- */
- return cpumask_of(0);
-}
-
-static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
-static unsigned long summit_check_apicid_present(int bit)
-{
- return 1;
-}
-
-static void summit_init_apic_ldr(void)
-{
- unsigned long val, id;
- int count = 0;
- u8 my_id = (u8)hard_smp_processor_id();
- u8 my_cluster = APIC_CLUSTER(my_id);
-#ifdef CONFIG_SMP
- u8 lid;
- int i;
-
- /* Create logical APIC IDs by counting CPUs already in cluster. */
- for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
- lid = cpu_2_logical_apicid[i];
- if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
- ++count;
- }
-#endif
- /* We only have a 4-wide bitmap in cluster mode. If a deranged
- * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
- BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
- id = my_cluster | (1UL << count);
- apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static int summit_apic_id_registered(void)
-{
- return 1;
-}
-
-static void summit_setup_apic_routing(void)
-{
- printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int summit_apicid_to_node(int logical_apicid)
-{
-#ifdef CONFIG_SMP
- return apicid_2_node[hard_smp_processor_id()];
-#else
- return 0;
-#endif
-}
-
-/* Mapping from cpu number to logical apicid */
-static inline int summit_cpu_to_logical_apicid(int cpu)
-{
-#ifdef CONFIG_SMP
- if (cpu >= nr_cpu_ids)
- return BAD_APICID;
- return cpu_2_logical_apicid[cpu];
-#else
- return logical_smp_processor_id();
-#endif
-}
-
-static int summit_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0x0FL, retmap);
-}
-
-static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
-{
- physid_set_mask_of_physid(0, retmap);
-}
-
-static int summit_check_phys_apicid_present(int physical_apicid)
-{
- return 1;
-}
-
-static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- unsigned int round = 0;
- int cpu, apicid = 0;
-
- /*
- * The cpus in the mask must all be in the same APIC cluster.
- */
- for_each_cpu(cpu, cpumask) {
- int new_apicid = summit_cpu_to_logical_apicid(cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- printk("%s: Not a valid mask!\n", __func__);
- return BAD_APICID;
- }
- apicid |= new_apicid;
- round++;
- }
- return apicid;
-}
-
-static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask)
-{
- int apicid = summit_cpu_to_logical_apicid(0);
- cpumask_var_t cpumask;
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return apicid;
-
- cpumask_and(cpumask, inmask, andmask);
- cpumask_and(cpumask, cpumask, cpu_online_mask);
- apicid = summit_cpu_mask_to_apicid(cpumask);
-
- free_cpumask_var(cpumask);
-
- return apicid;
-}
-
-/*
- * cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value. For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
-static int probe_summit(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
- /* Careful. Some cpus do not strictly honor the set of cpus
- * specified in the interrupt destination when using lowest
- * priority interrupt delivery mode.
- *
- * In particular there was a hyperthreading cpu observed to
- * deliver interrupts to the wrong hyperthread when only one
- * hyperthread was specified in the interrupt destination.
- */
- cpumask_clear(retmask);
- cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
-}
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static struct rio_table_hdr *rio_table_hdr;
-static struct scal_detail *scal_devs[MAX_NUMNODES];
-static struct rio_detail *rio_devs[MAX_NUMNODES*4];
-
-#ifndef CONFIG_X86_NUMAQ
-static int mp_bus_id_to_node[MAX_MP_BUSSES];
-#endif
-
-static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
-{
- int twister = 0, node = 0;
- int i, bus, num_buses;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
- twister = rio_devs[i]->owner_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_rio_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
- return last_bus;
- }
-
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
- if (scal_devs[i]->node_id == twister) {
- node = scal_devs[i]->node_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_scal_dev) {
- printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
- return last_bus;
- }
-
- switch (rio_devs[wpeg_num]->type) {
- case CompatWPEG:
- /*
- * The Compatibility Winnipeg controls the 2 legacy buses,
- * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
- * a PCI-PCI bridge card is used in either slot: total 5 buses.
- */
- num_buses = 5;
- break;
- case AltWPEG:
- /*
- * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
- * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
- * the "extra" buses for each of those slots: total 7 buses.
- */
- num_buses = 7;
- break;
- case LookOutAWPEG:
- case LookOutBWPEG:
- /*
- * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
- * & the "extra" buses for each of those slots: total 9 buses.
- */
- num_buses = 9;
- break;
- default:
- printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
- return last_bus;
- }
-
- for (bus = last_bus; bus < last_bus + num_buses; bus++)
- mp_bus_id_to_node[bus] = node;
- return bus;
-}
-
-static int build_detail_arrays(void)
-{
- unsigned long ptr;
- int i, scal_detail_size, rio_detail_size;
-
- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
- printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
- return 0;
- }
-
- switch (rio_table_hdr->version) {
- default:
- printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
- return 0;
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- }
-
- ptr = (unsigned long)rio_table_hdr + 3;
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 1;
-}
-
-void setup_summit(void)
-{
- unsigned long ptr;
- unsigned short offset;
- int i, next_wpeg, next_bus = 0;
-
- /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = get_bios_ebda();
- ptr = (unsigned long)phys_to_virt(ptr);
-
- rio_table_hdr = NULL;
- offset = 0x180;
- while (offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- /* The next offset is stored in the 1st word. 0 means no more */
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
- return;
- }
-
- if (!build_detail_arrays())
- return;
-
- /* The first Winnipeg we're looking for has an index of 0 */
- next_wpeg = 0;
- do {
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
- /* It's the Winnipeg we're looking for! */
- next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
- next_wpeg++;
- break;
- }
- }
- /*
- * If we go through all Rio devices and don't find one with
- * the next index, it means we've found all the Winnipegs,
- * and thus all the PCI buses.
- */
- if (i == rio_table_hdr->num_rio_dev)
- next_wpeg = 0;
- } while (next_wpeg != 0);
-}
-#endif
-
-struct apic apic_summit = {
-
- .name = "summit",
- .probe = probe_summit,
- .acpi_madt_oem_check = summit_acpi_madt_oem_check,
- .apic_id_registered = summit_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
-
- .target_cpus = summit_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = summit_check_apicid_used,
- .check_apicid_present = summit_check_apicid_present,
-
- .vector_allocation_domain = summit_vector_allocation_domain,
- .init_apic_ldr = summit_init_apic_ldr,
-
- .ioapic_phys_id_map = summit_ioapic_phys_id_map,
- .setup_apic_routing = summit_setup_apic_routing,
- .multi_timer_check = NULL,
- .apicid_to_node = summit_apicid_to_node,
- .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
- .cpu_present_to_apicid = summit_cpu_present_to_apicid,
- .apicid_to_cpu_present = summit_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = summit_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = summit_phys_pkg_id,
- .mps_oem_check = summit_mps_oem_check,
-
- .get_apic_id = summit_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = summit_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = summit_send_IPI_allbutself,
- .send_IPI_all = summit_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-};
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
new file mode 100644
index 000000000000..bddc54465399
--- /dev/null
+++ b/arch/x86/kernel/apic/vector.c
@@ -0,0 +1,1387 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Local APIC related interfaces to support IOAPIC, MSI, etc.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Enable support of hierarchical irqdomains
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/irq_remapping.h>
+
+#include <asm/trace/irq_vectors.h>
+
+struct apic_chip_data {
+ struct irq_cfg hw_irq_cfg;
+ unsigned int vector;
+ unsigned int prev_vector;
+ unsigned int cpu;
+ unsigned int prev_cpu;
+ unsigned int irq;
+ struct hlist_node clist;
+ unsigned int move_in_progress : 1,
+ is_managed : 1,
+ can_reserve : 1,
+ has_reserved : 1;
+};
+
+struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
+static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_searchmask;
+static struct irq_chip lapic_controller;
+static struct irq_matrix *vector_matrix;
+#ifdef CONFIG_SMP
+
+static void vector_cleanup_callback(struct timer_list *tmr);
+
+struct vector_cleanup {
+ struct hlist_head head;
+ struct timer_list timer;
+};
+
+static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
+ .head = HLIST_HEAD_INIT,
+ .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
+};
+#endif
+
+void lock_vector_lock(void)
+{
+ /*
+ * Used to ensure that the online set of cpus does not change
+ * during assign_irq_vector().
+ */
+ raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ raw_spin_unlock(&vector_lock);
+}
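+
+/*
+ * Illustrative sketch (not part of this patch): a CPU bringup path is
+ * expected to bracket its update of the online mask with this lock, so
+ * that assign_irq_vector() never observes a half-updated CPU set:
+ *
+ *	lock_vector_lock();
+ *	set_cpu_online(smp_processor_id(), true);
+ *	lapic_online();
+ *	unlock_vector_lock();
+ */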
+
+void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask)
+{
+ memset(info, 0, sizeof(*info));
+ info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+ if (src)
+ *dst = *src;
+ else
+ memset(dst, 0, sizeof(*dst));
+}
+
+static struct apic_chip_data *apic_chip_data(struct irq_data *irqd)
+{
+ if (!irqd)
+ return NULL;
+
+ while (irqd->parent_data)
+ irqd = irqd->parent_data;
+
+ return irqd->chip_data;
+}
+
+struct irq_cfg *irqd_cfg(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ return apicd ? &apicd->hw_irq_cfg : NULL;
+}
+EXPORT_SYMBOL_GPL(irqd_cfg);
+
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irqd_cfg(irq_get_irq_data(irq));
+}
+
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node);
+ if (apicd)
+ INIT_HLIST_NODE(&apicd->clist);
+ return apicd;
+}
+
+static void free_apic_chip_data(struct apic_chip_data *apicd)
+{
+ kfree(apicd);
+}
+
+static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
+ unsigned int cpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ apicd->hw_irq_cfg.vector = vector;
+ apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu);
+
+ apic_update_vector(cpu, vector, true);
+
+ irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
+ trace_vector_config(irqd->irq, vector, cpu, apicd->hw_irq_cfg.dest_apicid);
+}
+
+static void apic_free_vector(unsigned int cpu, unsigned int vector, bool managed)
+{
+ apic_update_vector(cpu, vector, false);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+}
+
+static void chip_data_update(struct irq_data *irqd, unsigned int newvec, unsigned int newcpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ struct irq_desc *desc = irq_data_to_desc(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
+ apicd->cpu);
+
+ /*
+ * If there is no vector associated or if the associated vector is
+ * the shutdown vector, which is associated to make PCI/MSI
+ * shutdown mode work, then there is nothing to release. Clear out
+ * prev_vector for this and the offlined target case.
+ */
+ apicd->prev_vector = 0;
+ if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR)
+ goto setnew;
+ /*
+ * If the target CPU of the previous vector is online, then mark
+ * the vector as move in progress and store it for cleanup when the
+ * first interrupt on the new vector arrives. If the target CPU is
+ * offline then the regular release mechanism via the cleanup
+ * vector is not possible and the vector can be immediately freed
+ * in the underlying matrix allocator.
+ */
+ if (cpu_online(apicd->cpu)) {
+ apicd->move_in_progress = true;
+ apicd->prev_vector = apicd->vector;
+ apicd->prev_cpu = apicd->cpu;
+ WARN_ON_ONCE(apicd->cpu == newcpu);
+ } else {
+ apic_free_vector(apicd->cpu, apicd->vector, managed);
+ }
+
+setnew:
+ apicd->vector = newvec;
+ apicd->cpu = newcpu;
+ BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
+ per_cpu(vector_irq, newcpu)[newvec] = desc;
+ apic_update_irq_cfg(irqd, newvec, newcpu);
+}
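+
+/*
+ * Illustrative state transition (values invented): an interrupt that
+ * was on vector 33/CPU 2 and is moved to CPU 5 leaves here with
+ * cpu == 5, vector == <newly allocated>, prev_cpu == 2,
+ * prev_vector == 33 and move_in_progress set, until the first
+ * interrupt on the new vector triggers the cleanup.
+ */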
+
+static void vector_assign_managed_shutdown(struct irq_data *irqd)
+{
+ unsigned int cpu = cpumask_first(cpu_online_mask);
+
+ apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu);
+}
+
+static int reserve_managed_vector(struct irq_data *irqd)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ apicd->is_managed = true;
+ ret = irq_matrix_reserve_managed(vector_matrix, affmsk);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ trace_vector_reserve_managed(irqd->irq, ret);
+ return ret;
+}
+
+static void reserve_irq_vector_locked(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ irq_matrix_reserve(vector_matrix);
+ apicd->can_reserve = true;
+ apicd->has_reserved = true;
+ irqd_set_can_reserve(irqd);
+ trace_vector_reserve(irqd->irq, 0);
+ vector_assign_managed_shutdown(irqd);
+}
+
+static int reserve_irq_vector(struct irq_data *irqd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ reserve_irq_vector_locked(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return 0;
+}
+
+static int
+assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool resvd = apicd->has_reserved;
+ unsigned int cpu = apicd->cpu;
+ int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /*
+ * If the current target CPU is online and in the new requested
+ * affinity mask, there is no point in moving the interrupt from
+ * one CPU to another.
+ */
+ if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
+ return 0;
+
+ /*
+ * Careful here. @apicd might either have move_in_progress set or
+ * be enqueued for cleanup. Assigning a new vector would either
+ * leave a stale vector on some CPU around or in case of a pending
+ * cleanup corrupt the hlist.
+ */
+ if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist))
+ return -EBUSY;
+
+ vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
+ trace_vector_alloc(irqd->irq, vector, resvd, vector);
+ if (vector < 0)
+ return vector;
+ chip_data_update(irqd, vector, cpu);
+
+ return 0;
+}
+
+static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ ret = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
+}
+
+static int assign_irq_vector_any_locked(struct irq_data *irqd)
+{
+ /* Get the affinity mask - either irq_default_affinity or (user) set */
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ int node = irq_data_get_node(irqd);
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the intersection of @affmsk and node mask */
+ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ }
+
+ /* Try the full affinity mask */
+ cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the node mask */
+ if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+ return 0;
+ }
+
+ /* Try the full online mask */
+ return assign_vector_locked(irqd, cpu_online_mask);
+}
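+
+/*
+ * The fallback cascade above (node-affine subset of the affinity mask,
+ * then the online affinity mask, then the node mask, then all online
+ * CPUs) is an instance of a generic first-match search. A minimal
+ * sketch of the same pattern, assuming a hypothetical try_assign()
+ * helper:
+ *
+ *	static int assign_first_match(struct irq_data *irqd,
+ *				      const struct cpumask **masks,
+ *				      unsigned int n)
+ *	{
+ *		unsigned int i;
+ *
+ *		for (i = 0; i < n; i++) {
+ *			if (!try_assign(irqd, masks[i]))
+ *				return 0;
+ *		}
+ *		return -ENOSPC;
+ *	}
+ */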
+
+static int
+assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info)
+{
+ if (irqd_affinity_is_managed(irqd))
+ return reserve_managed_vector(irqd);
+ if (info->mask)
+ return assign_irq_vector(irqd, info->mask);
+ /*
+ * Make only a global reservation with no guarantee. A real vector
+ * is associated at activation time.
+ */
+ return reserve_irq_vector(irqd);
+}
+
+static int
+assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int vector, cpu;
+
+ cpumask_and(vector_searchmask, dest, affmsk);
+
+ /* set_affinity might be called here with nothing to do */
+ if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
+ return 0;
+ vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask,
+ &cpu);
+ trace_vector_alloc_managed(irqd->irq, vector, vector);
+ if (vector < 0)
+ return vector;
+ chip_data_update(irqd, vector, cpu);
+
+ return 0;
+}
+
+static void clear_irq_vector(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+ unsigned int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ if (!vector)
+ return;
+
+ trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
+ apicd->prev_cpu);
+
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
+ apic_free_vector(apicd->cpu, vector, managed);
+ apicd->vector = 0;
+
+ /* Clean up move in progress */
+ vector = apicd->prev_vector;
+ if (!vector)
+ return;
+
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
+ apic_free_vector(apicd->prev_cpu, vector, managed);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+ hlist_del_init(&apicd->clist);
+}
+
+static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ trace_vector_deactivate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, false);
+
+ /* Regular fixed assigned interrupt */
+ if (!apicd->is_managed && !apicd->can_reserve)
+ return;
+ /* If the interrupt has a global reservation, nothing to do */
+ if (apicd->has_reserved)
+ return;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ if (apicd->can_reserve)
+ reserve_irq_vector_locked(irqd);
+ else
+ vector_assign_managed_shutdown(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+static int activate_reserved(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int ret;
+
+ ret = assign_irq_vector_any_locked(irqd);
+ if (!ret) {
+ apicd->has_reserved = false;
+ /*
+ * Core might have disabled reservation mode after
+ * allocating the irq descriptor. Ideally this should
+ * happen before allocation time, but that would require
+ * completely convoluted ways of transporting that
+ * information.
+ */
+ if (!irqd_can_reserve(irqd))
+ apicd->can_reserve = false;
+ }
+
+ /*
+ * Check to ensure that the effective affinity mask is a subset of
+ * the user supplied affinity mask, and warn the user if it is not.
+ */
+ if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd),
+ irq_data_get_affinity_mask(irqd))) {
+ pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n",
+ irqd->irq);
+ }
+
+ return ret;
+}
+
+static int activate_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ int ret;
+
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) {
+ /* Something in the core code broke! Survive gracefully */
+ pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq);
+ return -EINVAL;
+ }
+
+ ret = assign_managed_vector(irqd, vector_searchmask);
+ /*
+ * This should not happen. The vector reservation got buggered. Handle
+ * it gracefully.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ pr_err("Managed startup irq %u, no vector available\n",
+ irqd->irq);
+ }
+ return ret;
+}
+
+static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
+ bool reserve)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret = 0;
+
+ trace_vector_activate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, reserve);
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!apicd->can_reserve && !apicd->is_managed)
+ assign_irq_vector_any_locked(irqd);
+ else if (reserve || irqd_is_managed_and_shutdown(irqd))
+ vector_assign_managed_shutdown(irqd);
+ else if (apicd->is_managed)
+ ret = activate_managed(irqd);
+ else if (apicd->has_reserved)
+ ret = activate_reserved(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
+}
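+
+/*
+ * For illustration, the dispatch above reduces to:
+ *
+ *	!can_reserve && !is_managed	-> assign a real vector now
+ *	reserve or managed-and-shutdown	-> park on the shutdown vector
+ *	is_managed			-> activate_managed()
+ *	has_reserved			-> activate_reserved()
+ */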
+
+static void vector_free_reserved_and_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ trace_vector_teardown(irqd->irq, apicd->is_managed,
+ apicd->has_reserved);
+
+ if (apicd->has_reserved)
+ irq_matrix_remove_reserved(vector_matrix);
+ if (apicd->is_managed)
+ irq_matrix_remove_managed(vector_matrix, dest);
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+ if (irqd && irqd->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ vector_free_reserved_and_managed(irqd);
+ apicd = irqd->chip_data;
+ irq_domain_reset_irq_data(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_apic_chip_data(apicd);
+ }
+ }
+}
+
+static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
+ struct apic_chip_data *apicd)
+{
+ unsigned long flags;
+ bool realloc = false;
+
+ apicd->vector = ISA_IRQ_VECTOR(virq);
+ apicd->cpu = 0;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ /*
+ * If the interrupt is activated, then it must stay at this vector
+ * position. That's usually the timer interrupt (0).
+ */
+ if (irqd_is_activated(irqd)) {
+ trace_vector_setup(virq, true, 0);
+ apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+ } else {
+ /* Release the vector */
+ apicd->can_reserve = true;
+ irqd_set_can_reserve(irqd);
+ clear_irq_vector(irqd);
+ realloc = true;
+ }
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return realloc;
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ int i, err, node;
+
+ if (apic_is_disabled)
+ return -ENXIO;
+
+ /*
+ * Catch any attempt to touch the cascade interrupt on a PIC
+ * equipped system.
+ */
+ if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY &&
+ virq == PIC_CASCADE_IR))
+ return -EINVAL;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irqd);
+ node = irq_data_get_node(irqd);
+ WARN_ON_ONCE(irqd->chip_data);
+ apicd = alloc_apic_chip_data(node);
+ if (!apicd) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ apicd->irq = virq + i;
+ irqd->chip = &lapic_controller;
+ irqd->chip_data = apicd;
+ irqd->hwirq = virq + i;
+ irqd_set_single_target(irqd);
+ /*
+ * Prevent any of these interrupts from being invoked in
+ * non-interrupt context via e.g. generic_handle_irq(),
+ * as that can corrupt the affinity move state.
+ */
+ irqd_set_handle_enforce_irqctx(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
+ /*
+ * Legacy vectors are already assigned when the IOAPIC
+ * takes them over. They stay on the same vector. This is
+ * required for check_timer() to work correctly as it might
+ * switch back to legacy mode. Only update the hardware
+ * config.
+ */
+ if (info->flags & X86_IRQ_ALLOC_LEGACY) {
+ if (!vector_configure_legacy(virq + i, irqd, apicd))
+ continue;
+ }
+
+ err = assign_irq_vector_policy(irqd, info);
+ trace_vector_setup(virq + i, false, err);
+ if (err) {
+ irqd->chip_data = NULL;
+ free_apic_chip_data(apicd);
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ x86_vector_free_irqs(domain, virq, i);
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct apic_chip_data apicd;
+ unsigned long flags;
+ int irq;
+
+ if (!irqd) {
+ irq_matrix_debug_show(m, vector_matrix, ind);
+ return;
+ }
+
+ irq = irqd->irq;
+ if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) {
+ seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq));
+ seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, "");
+ return;
+ }
+
+ if (!irqd->chip_data) {
+ seq_printf(m, "%*sVector: Not assigned\n", ind, "");
+ return;
+ }
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ memcpy(&apicd, irqd->chip_data, sizeof(apicd));
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector);
+ seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu);
+ if (apicd.prev_vector) {
+ seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector);
+ seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu);
+ }
+ seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0);
+ seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0);
+ seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0);
+ seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0);
+ seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist));
+}
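+
+/*
+ * Hypothetical example of the resulting debugfs output (values are
+ * invented for illustration):
+ *
+ *	Vector:              33
+ *	Target:               4
+ *	move_in_progress:     0
+ *	is_managed:           0
+ *	can_reserve:          1
+ *	has_reserved:         0
+ *	cleanup_pending:      0
+ */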
+#endif
+
+int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "IO-APIC-", 8) &&
+ simple_strtol(fwname+8, NULL, 10) == fwspec->param[0];
+ }
+ return to_of_node(fwspec->fwnode) &&
+ of_device_is_compatible(to_of_node(fwspec->fwnode),
+ "intel,ce4100-ioapic");
+}
+
+int x86_fwspec_is_hpet(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "HPET-MSI-", 9) &&
+ simple_strtol(fwname+9, NULL, 10) == fwspec->param[0];
+ }
+ return 0;
+}
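+
+/*
+ * For illustration: with the naming convention above, an fwnode named
+ * "IO-APIC-0" with fwspec->param[0] == 0 matches x86_fwspec_is_ioapic(),
+ * and one named "HPET-MSI-2" with fwspec->param[0] == 2 matches
+ * x86_fwspec_is_hpet().
+ */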
+
+static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /*
+ * HPET and I/OAPIC cannot be parented in the vector domain
+ * if IRQ remapping is enabled. APIC IDs above 15 bits are
+ * only permitted if IRQ remapping is enabled, so check that.
+ */
+ if (apic_id_valid(32768))
+ return 0;
+
+ return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec);
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+ .select = x86_vector_select,
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+ .activate = x86_vector_activate,
+ .deactivate = x86_vector_deactivate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = x86_vector_debug_show,
+#endif
+};
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids)
+ irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids);
+
+ nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI)
+ /* For MSI and HT dynamic irqs */
+ if (gsi_top <= NR_IRQS_LEGACY)
+ nr += 8 * nr_cpu_ids;
+ else
+ nr += gsi_top * 16;
+#endif
+ if (nr < irq_get_nr_irqs())
+ irq_set_nr_irqs(nr);
+
+ /*
+ * We don't know if PIC is present at this point so we need to do
+ * probe() to get the right number of legacy IRQs.
+ */
+ return legacy_pic->probe();
+}
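+
+/*
+ * Worked example with invented numbers: for nr_cpu_ids == 8,
+ * gsi_top == 24 and nr_legacy_irqs() == 16, the code above computes
+ * nr = (24 + 16) + 8 * 8 = 104; since gsi_top > NR_IRQS_LEGACY, the
+ * PCI_MSI branch adds 24 * 16 = 384, giving nr = 488 before the final
+ * clamp against the current number of irqs.
+ */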
+
+void lapic_assign_legacy_vector(unsigned int irq, bool replace)
+{
+ /*
+ * Use assign system here so it won't get accounted as allocated
+ * and movable in the cpu hotplug check, and it prevents managed
+ * irq reservation from touching it.
+ */
+ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
+}
+
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IR has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
+void __init lapic_assign_system_vectors(void)
+{
+ unsigned int i, vector;
+
+ for_each_set_bit(vector, system_vectors, NR_VECTORS)
+ irq_matrix_assign_system(vector_matrix, vector, false);
+
+ if (nr_legacy_irqs() > 1)
+ lapic_assign_legacy_vector(PIC_CASCADE_IR, false);
+
+ /* System vectors are reserved; bring the vector matrix online */
+ irq_matrix_online(vector_matrix);
+
+ /* Mark the preallocated legacy interrupts */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ /*
+ * Don't touch the cascade interrupt. It's unusable
+ * on PIC equipped machines. See the large comment
+ * in the IO/APIC code.
+ */
+ if (i != PIC_CASCADE_IR)
+ irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
+ }
+}
+
+int __init arch_early_irq_init(void)
+{
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("VECTOR");
+ BUG_ON(!fn);
+ x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
+ NULL);
+ BUG_ON(x86_vector_domain == NULL);
+ irq_set_default_domain(x86_vector_domain);
+
+ BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+
+ /*
+ * Allocate the vector matrix allocator data structure and limit the
+ * search area.
+ */
+ vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR,
+ FIRST_SYSTEM_VECTOR);
+ BUG_ON(!vector_matrix);
+
+ return arch_early_ioapic_init();
+}
+
+#ifdef CONFIG_SMP
+
+static struct irq_desc *__setup_vector_irq(int vector)
+{
+ int isairq = vector - ISA_IRQ_VECTOR(0);
+
+ /* Check whether the irq is in the legacy space */
+ if (isairq < 0 || isairq >= nr_legacy_irqs())
+ return VECTOR_UNUSED;
+ /* Check whether the irq is handled by the IOAPIC */
+ if (test_bit(isairq, &io_apic_irqs))
+ return VECTOR_UNUSED;
+ return irq_to_desc(isairq);
+}
+
+/* Online the local APIC infrastructure and initialize the vectors */
+void lapic_online(void)
+{
+ unsigned int vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /* Online the vector matrix array for this CPU */
+ irq_matrix_online(vector_matrix);
+
+ /*
+ * The interrupt affinity logic never targets interrupts to offline
+ * CPUs. The exceptions are the legacy PIC interrupts. In general
+ * they are only targeted to CPU0, but depending on the platform
+ * they can be distributed to any online CPU in hardware. The
+ * kernel has no influence on that. So all active legacy vectors
+ * must be installed on all CPUs. All non legacy interrupts can be
+ * cleared.
+ */
+ for (vector = 0; vector < NR_VECTORS; vector++)
+ this_cpu_write(vector_irq[vector], __setup_vector_irq(vector));
+}
+
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr);
+
+void lapic_offline(void)
+{
+ struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
+
+ lock_vector_lock();
+
+ /* In case the vector cleanup timer has not expired */
+ __vector_cleanup(cl, false);
+
+ irq_matrix_offline(vector_matrix);
+ WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0);
+ WARN_ON_ONCE(!hlist_empty(&cl->head));
+
+ unlock_vector_lock();
+}
+
+static int apic_set_affinity(struct irq_data *irqd,
+ const struct cpumask *dest, bool force)
+{
+ int err;
+
+ if (WARN_ON_ONCE(!irqd_is_activated(irqd)))
+ return -EIO;
+
+ raw_spin_lock(&vector_lock);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (irqd_affinity_is_managed(irqd))
+ err = assign_managed_vector(irqd, vector_searchmask);
+ else
+ err = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock(&vector_lock);
+ return err ? err : IRQ_SET_MASK_OK;
+}
+
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
+
+ /*
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
+ */
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ apic_free_vector(cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+ unsigned int cpu = smp_processor_id();
+ struct apic_chip_data *apicd;
+ unsigned int vector;
+
+ guard(raw_spinlock)(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ return;
+
+ /*
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU, no action is required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+ return;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non-issue
+ * because if the affinity update happens right before all
+ * cpus rendezvous in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loop first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theoretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+}
+
+#else
+# define apic_set_affinity NULL
+# define apic_force_complete_move NULL
+#endif
+
+static int apic_retrigger_irq(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __apic_send_IPI(apicd->cpu, apicd->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+void apic_ack_irq(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+ apic_eoi();
+}
+
+void apic_ack_edge(struct irq_data *irqd)
+{
+ irq_complete_move(irqd_cfg(irqd));
+ apic_ack_irq(irqd);
+}
+
+static void x86_vector_msi_compose_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, false);
+}
+
+static struct irq_chip lapic_controller = {
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_compose_msi_msg = x86_vector_msi_compose_msg,
+ .irq_force_complete_move = apic_force_complete_move,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
+#ifdef CONFIG_SMP
+
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
+{
+ struct apic_chip_data *apicd;
+ struct hlist_node *tmp;
+ bool rearm = false;
+
+ lockdep_assert_held(&vector_lock);
+
+ hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
+ unsigned int vector = apicd->prev_vector;
+
+ /*
+ * Paranoia: Check if the vector that needs to be cleaned
+ * up is registered in the APIC's IRR. That's clearly a
+ * hardware issue if the vector arrived on the old target
+ * _after_ interrupts were disabled above. Keep @apicd
+ * on the list and schedule the timer again to give the CPU
+ * a chance to handle the pending interrupt.
+ *
+ * Do not check IRR when called from lapic_offline(), because
+ * fixup_irqs() was just called to scan IRR for set bits and
+ * forward them to new destination CPUs via IPIs.
+ */
+ if (check_irr && is_vector_pending(vector)) {
+ pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
+ rearm = true;
+ continue;
+ }
+ free_moved_vector(apicd);
+ }
+
+ /*
+ * Must happen under vector_lock to make the timer_pending() check
+ * in __vector_schedule_cleanup() race free against the rearm here.
+ */
+ if (rearm)
+ mod_timer(&cl->timer, jiffies + 1);
+}
+
+static void vector_cleanup_callback(struct timer_list *tmr)
+{
+ struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
+
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock_irq(&vector_lock);
+ __vector_cleanup(cl, true);
+ raw_spin_unlock_irq(&vector_lock);
+}
+
+static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
+{
+ unsigned int cpu = apicd->prev_cpu;
+
+ raw_spin_lock(&vector_lock);
+ apicd->move_in_progress = 0;
+ if (cpu_online(cpu)) {
+ struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
+
+ hlist_add_head(&apicd->clist, &cl->head);
+
+ /*
+ * The lockless timer_pending() check is safe here. If it
+ * returns true, then the callback will observe this new
+ * apic data in the hlist as everything is serialized by
+ * vector lock.
+ *
+ * If it returns false then the timer is either not armed
+ * or the other CPU executes the callback, which again
+ * would be blocked on vector lock. Rearming it in the
+ * latter case makes it fire for nothing.
+ *
+ * This is also safe against the callback rearming the timer
+ * because that's serialized via vector lock too.
+ */
+ if (!timer_pending(&cl->timer)) {
+ cl->timer.expires = jiffies + 1;
+ add_timer_on(&cl->timer, cpu);
+ }
+ } else {
+ pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
+ free_moved_vector(apicd);
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+void vector_schedule_cleanup(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (apicd->move_in_progress)
+ __vector_schedule_cleanup(apicd);
+}
+
+void irq_complete_move(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (likely(!apicd->move_in_progress))
+ return;
+
+ /*
+ * If the interrupt arrived on the new target CPU, cleanup the
+ * vector on the old target CPU. A vector check is not required
+ * because an interrupt can never move from one vector to another
+ * on the same CPU.
+ */
+ if (apicd->cpu == smp_processor_id())
+ __vector_schedule_cleanup(apicd);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Note, this is not accurate accounting, but at least good enough to
+ * prevent that the actual interrupt move will run out of vectors.
+ */
+int lapic_can_unplug_cpu(void)
+{
+ unsigned int rsvd, avl, tomove, cpu = smp_processor_id();
+ int ret = 0;
+
+ raw_spin_lock(&vector_lock);
+ tomove = irq_matrix_allocated(vector_matrix);
+ avl = irq_matrix_available(vector_matrix, true);
+ if (avl < tomove) {
+ pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n",
+ cpu, tomove, avl);
+ ret = -ENOSPC;
+ goto out;
+ }
+ rsvd = irq_matrix_reserved(vector_matrix);
+ if (avl < rsvd) {
+ pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n",
+ rsvd, avl);
+ }
+out:
+ raw_spin_unlock(&vector_lock);
+ return ret;
+}
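+
+/*
+ * Worked example with invented numbers: if the outgoing CPU still has
+ * tomove == 12 vectors allocated but the remaining CPUs only have
+ * avl == 9 vectors available, the check above fails with -ENOSPC and
+ * the CPU cannot be unplugged.
+ */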
+#endif /* HOTPLUG_CPU */
+#endif /* SMP */
+
+static void __init print_APIC_field(int base)
+{
+ int i;
+
+ printk(KERN_DEBUG);
+
+ for (i = 0; i < 8; i++)
+ pr_cont("%08x", apic_read(base + i*0x10));
+
+ pr_cont("\n");
+}
+
+static void __init print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ pr_debug("printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), read_apic_id());
+ v = apic_read(APIC_ID);
+ pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ pr_info("... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ pr_debug("... APIC ARBPRI: %08x (%02x)\n",
+ v, v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ pr_debug("... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ pr_debug("... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ pr_debug("... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ pr_debug("... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ pr_debug("... APIC SPIV: %08x\n", v);
+
+ pr_debug("... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ pr_debug("... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ pr_debug("... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ pr_debug("... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ pr_debug("... APIC ICR: %08x\n", (u32)icr);
+ pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ pr_debug("... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) {
+ /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ pr_debug("... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ pr_debug("... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ pr_debug("... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) {
+ /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ pr_debug("... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ pr_debug("... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ pr_debug("... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ pr_debug("... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ pr_debug("... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ pr_debug("... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ pr_debug("... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ pr_cont("\n");
+}
+
+static void __init print_local_APICs(int maxcpu)
+{
+ int cpu;
+
+ if (!maxcpu)
+ return;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
+ preempt_enable();
+}
+
+static void __init print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!nr_legacy_irqs())
+ return;
+
+ pr_debug("\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ pr_debug("... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ pr_debug("... PIC IRR: %04x\n", v);
+
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ pr_debug("... PIC ISR: %04x\n", v);
+
+ v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1);
+ pr_debug("... PIC ELCR: %04x\n", v);
+}
+
+static int show_lapic __initdata = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+static int __init print_ICs(void)
+{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
+ print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
+ return 0;
+
+ print_local_APICs(show_lapic);
+ print_IO_APICs();
+
+ return 0;
+}
+
+late_initcall(print_ICs);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..7db83212effb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -1,244 +1,261 @@
-#include <linux/threads.h>
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/dmar.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
-#include <asm/smp.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+#include "local.h"
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return x2apic_enabled();
-}
+#define apic_cluster(apicid) ((apicid) >> 4)
/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
+ * __x2apic_send_IPI_mask() possibly needs to read
+ * x86_cpu_to_logical_apicid for all online cpus in a sequential way.
+ * Using a per-cpu variable would cost one cache line per cpu.
*/
-static const struct cpumask *x2apic_target_cpus(void)
-{
- return cpu_online_mask;
-}
+static u32 *x86_cpu_to_logical_apicid __read_mostly;
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
+static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ return x2apic_enabled();
}
-static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+static void x2apic_send_IPI(int cpu, int vector)
{
- unsigned long cfg;
+ u32 dest = x86_cpu_to_logical_apicid[cpu];
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- native_x2apic_icr_write(cfg, apicid);
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
}
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- unsigned long query_cpu;
+ unsigned int cpu, clustercpu;
+ struct cpumask *tmpmsk;
unsigned long flags;
+ u32 dest;
- x2apic_wrmsr_fence();
-
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
-}
-static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
-{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
+ tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
+ cpumask_copy(tmpmsk, mask);
+ /* If IPI should not be sent to self, clear current CPU */
+ if (apic_dest != APIC_DEST_ALLINC)
+ __cpumask_clear_cpu(smp_processor_id(), tmpmsk);
- x2apic_wrmsr_fence();
+ /* Collapse cpus in a cluster so a single IPI per cluster is sent */
+ for_each_cpu(cpu, tmpmsk) {
+ struct cpumask *cmsk = per_cpu(cluster_masks, cpu);
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
+ dest = 0;
+ for_each_cpu_and(clustercpu, tmpmsk, cmsk)
+ dest |= x86_cpu_to_logical_apicid[clustercpu];
+
+ if (!dest)
continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+ /* Remove cluster CPUs from tmpmask */
+ cpumask_andnot(tmpmsk, tmpmsk, cmsk);
}
+
local_irq_restore(flags);
}
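+
+/*
+ * Worked example (illustrative): three CPUs in cluster 2 with logical
+ * IDs 0x20001, 0x20002 and 0x20008 collapse above to a single
+ * destination of 0x20001 | 0x20002 | 0x20008 == 0x2000b, so one ICR
+ * write reaches all three instead of three separate IPIs.
+ */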
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
- continue;
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
- }
- local_irq_restore(flags);
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static void x2apic_send_IPI_all(int vector)
+static u32 x2apic_calc_apicid(unsigned int cpu)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
+ return x86_cpu_to_logical_apicid[cpu];
}
-static int x2apic_apic_id_registered(void)
+static void init_x2apic_ldr(void)
{
- return 1;
+ struct cpumask *cmsk = this_cpu_read(cluster_masks);
+
+ BUG_ON(!cmsk);
+
+ cpumask_set_cpu(smp_processor_id(), cmsk);
}
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+/*
+ * As an optimisation during boot, set the cluster_mask for all present
+ * CPUs at once, to prevent each of them having to iterate over the others
+ * to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster)
{
- /*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
+ int cpu_i;
+
+ for_each_present_cpu(cpu_i) {
+ struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i);
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster)
+ continue;
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
- else
- return BAD_APICID;
+ if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
+ continue;
+
+ BUG_ON(*cpu_cmsk);
+ *cpu_cmsk = cmsk;
+ }
}
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
{
- int cpu;
+ struct cpumask *cmsk = NULL;
+ unsigned int cpu_i;
/*
- * We're using fixed IRQ delivery, can only return one logical APIC ID.
- * May as well be the first.
+ * At boot time, the CPU present mask is stable. The cluster mask is
+ * allocated for the first CPU in the cluster and propagated to all
+ * present siblings in the cluster. If the cluster mask is already set
+ * on entry to this function for a given CPU, there is nothing to do.
*/
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
+ if (per_cpu(cluster_masks, cpu))
+ return 0;
- return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ if (system_state < SYSTEM_RUNNING)
+ goto alloc;
+
+ /*
+ * On post boot hotplug for a CPU which was not present at boot time,
+ * iterate over all possible CPUs (even those which are not present
+ * any more) to find any existing cluster mask.
+ */
+ for_each_possible_cpu(cpu_i) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+ cmsk = per_cpu(cluster_masks, cpu_i);
+ /*
+ * If the cluster is already initialized, just store
+ * the mask and return. There's no need to propagate.
+ */
+ if (cmsk) {
+ per_cpu(cluster_masks, cpu) = cmsk;
+ return 0;
+ }
+ }
+ }
+ /*
+ * No CPU in the cluster has ever been initialized, so fall through to
+ * the boot time code which will also populate the cluster mask for any
+ * other CPU in the cluster which is (now) present.
+ */
+alloc:
+ cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+ if (!cmsk)
+ return -ENOMEM;
+ per_cpu(cluster_masks, cpu) = cmsk;
+ prefill_clustermask(cmsk, cpu, cluster);
+
+ return 0;
}
-static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
+static int x2apic_prepare_cpu(unsigned int cpu)
{
- unsigned int id;
+ u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+ u32 cluster = apic_cluster(phys_apicid);
+ u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
- id = x;
- return id;
-}
+ x86_cpu_to_logical_apicid[cpu] = logical_apicid;
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
+ if (alloc_clustermask(cpu, cluster, node) < 0)
+ return -ENOMEM;
- x = id;
- return x;
-}
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
+ return -ENOMEM;
-static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return initial_apicid >> index_msb;
+ return 0;
}
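+
+/*
+ * Worked example: a CPU with physical APIC ID 0x23 lands in cluster 2
+ * (0x23 >> 4) and gets the logical ID
+ * (2 << 16) | (1 << (0x23 & 0xf)) == 0x20008.
+ */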
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_dead_cpu(unsigned int dead_cpu)
{
- apic_write(APIC_SELF_IPI, vector);
+ struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu);
+
+ if (cmsk)
+ cpumask_clear_cpu(dead_cpu, cmsk);
+ free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
+ return 0;
}
-static void init_x2apic_ldr(void)
+static int x2apic_cluster_probe(void)
{
- int cpu = smp_processor_id();
-
- per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+ u32 slots;
+
+ if (!x2apic_mode)
+ return 0;
+
+ slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids);
+ x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL);
+ if (!x86_cpu_to_logical_apicid)
+ return 0;
+
+ if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
+ x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
+ pr_err("Failed to register X2APIC_PREPARE\n");
+ kfree(x86_cpu_to_logical_apicid);
+ x86_cpu_to_logical_apicid = NULL;
+ return 0;
+ }
+ init_x2apic_ldr();
+ return 1;
}
-struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
- .probe = NULL,
+ .probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_LowestPrio,
- .irq_dest_mode = 1, /* logical */
+ .dest_mode_logical = true,
- .target_cpus = x2apic_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
- .vector_allocation_domain = x2apic_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = x2apic_cluster_phys_pkg_id,
- .mps_oem_check = NULL,
- .get_apic_id = x2apic_cluster_phys_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = x2apic_calc_apicid,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..12d4c35547a6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -1,105 +1,70 @@
-#include <linux/threads.h>
+// SPDX-License-Identifier: GPL-2.0
+
#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <linux/dmar.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include "local.h"
int x2apic_phys;
-static int set_x2apic_phys_mode(char *arg)
-{
- x2apic_phys = 1;
- return 0;
-}
-early_param("x2apic_phys", set_x2apic_phys_mode);
+static struct apic apic_x2apic_phys;
+u32 x2apic_max_apicid __ro_after_init = UINT_MAX;
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+void __init x2apic_set_max_apicid(u32 apicid)
{
- if (x2apic_phys)
- return x2apic_enabled();
- else
- return 0;
+ x2apic_max_apicid = apicid;
+ if (apic->x2apic_set_max_apicid)
+ apic->max_apic_id = apicid;
}
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
+static int __init set_x2apic_phys_mode(char *arg)
{
- return cpu_online_mask;
+ x2apic_phys = 1;
+ return 0;
}
+early_param("x2apic_phys", set_x2apic_phys_mode);
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static bool x2apic_fadt_phys(void)
{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+#ifdef CONFIG_ACPI
+ if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "System requires x2apic physical mode\n");
+ return true;
+ }
+#endif
+ return false;
}
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
- unsigned int dest)
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- unsigned long cfg;
-
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * send the IPI.
- */
- native_x2apic_icr_write(cfg, apicid);
+ return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void x2apic_send_IPI(int cpu, int vector)
{
- unsigned long query_cpu;
- unsigned long flags;
-
- x2apic_wrmsr_fence();
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
- local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
- unsigned long this_cpu = smp_processor_id();
unsigned long query_cpu;
+ unsigned long this_cpu;
unsigned long flags;
- x2apic_wrmsr_fence();
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu != this_cpu)
- __x2apic_send_IPI_dest(
- per_cpu(x86_cpu_to_apicid, query_cpu),
- vector, APIC_DEST_PHYSICAL);
- }
- local_irq_restore(flags);
-}
-
-static void x2apic_send_IPI_allbutself(int vector)
-{
- unsigned long this_cpu = smp_processor_id();
- unsigned long query_cpu;
- unsigned long flags;
- x2apic_wrmsr_fence();
-
- local_irq_save(flags);
- for_each_online_cpu(query_cpu) {
- if (query_cpu == this_cpu)
+ this_cpu = smp_processor_id();
+ for_each_cpu(query_cpu, mask) {
+ if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
continue;
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
@@ -107,127 +72,94 @@ static void x2apic_send_IPI_allbutself(int vector)
local_irq_restore(flags);
}
-static void x2apic_send_IPI_all(int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- x2apic_send_IPI_mask(cpu_online_mask, vector);
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
}
-static int x2apic_apic_id_registered(void)
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- return 1;
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+static void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
-
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
+ unsigned long cfg = __prepare_ICR(which, vector, 0);
+
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ native_x2apic_icr_write(cfg, 0);
}
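+
+/*
+ * Illustrative note: with the destination shorthands, one MSR write
+ * such as
+ *
+ *	__x2apic_send_IPI_shorthand(RESCHEDULE_VECTOR, APIC_DEST_ALLBUT);
+ *
+ * replaces a loop of per-CPU ICR writes (RESCHEDULE_VECTOR serves here
+ * purely as an example vector).
+ */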
-static unsigned int
-x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
+void x2apic_send_IPI_allbutself(int vector)
{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
-
- return per_cpu(x86_cpu_to_apicid, cpu);
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
-static unsigned int x2apic_phys_get_apic_id(unsigned long x)
+void x2apic_send_IPI_all(int vector)
{
- return x;
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
-static unsigned long set_apic_id(unsigned int id)
+void x2apic_send_IPI_self(int vector)
{
- return id;
+ apic_write(APIC_SELF_IPI, vector);
}
-static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
- return initial_apicid >> index_msb;
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
}
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_phys_probe(void)
{
- apic_write(APIC_SELF_IPI, vector);
+ if (!x2apic_mode)
+ return 0;
+
+ if (x2apic_phys || x2apic_fadt_phys())
+ return 1;
+
+ return apic == &apic_x2apic_phys;
}
-static void init_x2apic_ldr(void)
+u32 x2apic_get_apic_id(u32 id)
{
+ return id;
}
-struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
- .probe = NULL,
+ .probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
- .target_cpus = x2apic_target_cpus,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
-
- .vector_allocation_domain = x2apic_vector_allocation_domain,
- .init_apic_ldr = init_x2apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
+
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = x2apic_phys_pkg_id,
- .mps_oem_check = NULL,
- .get_apic_id = x2apic_phys_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
- .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_default_calc_apicid,
+ .send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
+ .nmi_to_offline_cpu = true,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
+
+apic_driver(apic_x2apic_phys);
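/*
 * Illustrative sketch, not from the patch above: the IPI paths in this
 * file all funnel into native_x2apic_icr_write(cfg, apicid). This is a
 * minimal sketch of the 64-bit value that write produces, assuming the
 * architectural x2APIC ICR layout (destination APIC ID in bits 63:32,
 * command word in bits 31:0). The helper name is hypothetical.
 */
static inline unsigned long long compose_x2apic_icr(unsigned int apicid,
						    unsigned int cfg)
{
	return ((unsigned long long)apicid << 32) | cfg;
}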
diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c
new file mode 100644
index 000000000000..dbc5678bc3b6
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_savic.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure AVIC Support (SEV-SNP Guests)
+ *
+ * Copyright (C) 2024 Advanced Micro Devices, Inc.
+ *
+ * Author: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
+ */
+
+#include <linux/cc_platform.h>
+#include <linux/cpumask.h>
+#include <linux/percpu-defs.h>
+#include <linux/align.h>
+
+#include <asm/apic.h>
+#include <asm/sev.h>
+
+#include "local.h"
+
+struct secure_avic_page {
+ u8 regs[PAGE_SIZE];
+} __aligned(PAGE_SIZE);
+
+static struct secure_avic_page __percpu *savic_page __ro_after_init;
+
+static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC);
+}
+
+static inline void *get_reg_bitmap(unsigned int cpu, unsigned int offset)
+{
+ return &per_cpu_ptr(savic_page, cpu)->regs[offset];
+}
+
+static inline void update_vector(unsigned int cpu, unsigned int offset,
+ unsigned int vector, bool set)
+{
+ void *bitmap = get_reg_bitmap(cpu, offset);
+
+ if (set)
+ apic_set_vector(vector, bitmap);
+ else
+ apic_clear_vector(vector, bitmap);
+}
+
+#define SAVIC_ALLOWED_IRR 0x204
+
+/*
+ * When Secure AVIC is enabled, RDMSR/WRMSR of the APIC registers
+ * result in a #VC exception (for non-accelerated register accesses)
+ * with the VMEXIT_AVIC_NOACCEL error code. The #VC exception handler
+ * can read/write the x2APIC register in the guest APIC backing page.
+ *
+ * Since doing this would increase the latency of accessing x2APIC
+ * registers, instead of doing RDMSR/WRMSR based accesses and
+ * handling the APIC register reads/writes in the #VC exception handler,
+ * the read() and write() callbacks directly read/write the APIC register
+ * from/to the vCPU's APIC backing page.
+ */
+static u32 savic_read(u32 reg)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TMCCT:
+ case APIC_TDCR:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ return savic_ghcb_msr_read(reg);
+ case APIC_ID:
+ case APIC_LVR:
+ case APIC_TASKPRI:
+ case APIC_ARBPRI:
+ case APIC_PROCPRI:
+ case APIC_LDR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_EFEAT:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ return apic_get_reg(ap, reg);
+ case APIC_ICR:
+ return (u32)apic_get_reg64(ap, reg);
+ case APIC_ISR ... APIC_ISR + 0x70:
+ case APIC_TMR ... APIC_TMR + 0x70:
+ if (WARN_ONCE(!IS_ALIGNED(reg, 16),
+ "APIC register read offset 0x%x not aligned at 16 bytes", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ /* IRR and ALLOWED_IRR offset range */
+ case APIC_IRR ... APIC_IRR + 0x74:
+ /*
+ * Valid APIC_IRR/SAVIC_ALLOWED_IRR registers sit at 16-byte strides from
+ * their respective base offset. APIC_IRRs are in the range
+ *
+ * (0x200, 0x210, ..., 0x270)
+ *
+ * while the SAVIC_ALLOWED_IRR range starts 4 bytes later, in the range
+ *
+ * (0x204, 0x214, ..., 0x274).
+ *
+ * Filter out everything else.
+ */
+ if (WARN_ONCE(!(IS_ALIGNED(reg, 16) ||
+ IS_ALIGNED(reg - 4, 16)),
+ "Misaligned APIC_IRR/ALLOWED_IRR APIC register read offset 0x%x", reg))
+ return 0;
+ return apic_get_reg(ap, reg);
+ default:
+ pr_err("Error reading unknown Secure AVIC reg offset 0x%x\n", reg);
+ return 0;
+ }
+}
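/*
 * Illustrative sketch, not from the patch above: the IRR/ISR/TMR
 * families handled in savic_read() are 256-bit vectors stored as eight
 * 32-bit words placed at 16-byte strides from their base offset, so a
 * vector maps to a (register, bit) pair as sketched below. The helper
 * names are hypothetical.
 */
static inline unsigned int savic_vec_reg(unsigned int base, unsigned int vec)
{
	return base + (vec / 32) * 16;	/* register word holding 'vec' */
}

static inline unsigned int savic_vec_bit(unsigned int vec)
{
	return vec % 32;		/* bit within that word */
}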
+
+#define SAVIC_NMI_REQ 0x278
+
+/*
+ * On a WRMSR to the APIC_SELF_IPI register by the guest, Secure AVIC hardware
+ * updates the APIC_IRR in the APIC backing page of the vCPU. In addition,
+ * hardware evaluates the new APIC_IRR update for interrupt injection to
+ * the vCPU. So, self IPIs are hardware-accelerated.
+ */
+static inline void self_ipi_reg_write(unsigned int vector)
+{
+ native_apic_msr_write(APIC_SELF_IPI, vector);
+}
+
+static void send_ipi_dest(unsigned int cpu, unsigned int vector, bool nmi)
+{
+ if (nmi)
+ apic_set_reg(per_cpu_ptr(savic_page, cpu), SAVIC_NMI_REQ, 1);
+ else
+ update_vector(cpu, APIC_IRR, vector, true);
+}
+
+static void send_ipi_allbut(unsigned int vector, bool nmi)
+{
+ unsigned int cpu, src_cpu;
+
+ guard(irqsave)();
+
+ src_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ if (cpu == src_cpu)
+ continue;
+ send_ipi_dest(cpu, vector, nmi);
+ }
+}
+
+static inline void self_ipi(unsigned int vector, bool nmi)
+{
+ u32 icr_low = APIC_SELF_IPI | vector;
+
+ if (nmi)
+ icr_low |= APIC_DM_NMI;
+
+ native_x2apic_icr_write(icr_low, 0);
+}
+
+static void savic_icr_write(u32 icr_low, u32 icr_high)
+{
+ unsigned int dsh, vector;
+ u64 icr_data;
+ bool nmi;
+
+ dsh = icr_low & APIC_DEST_ALLBUT;
+ vector = icr_low & APIC_VECTOR_MASK;
+ nmi = ((icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI);
+
+ switch (dsh) {
+ case APIC_DEST_SELF:
+ self_ipi(vector, nmi);
+ break;
+ case APIC_DEST_ALLINC:
+ self_ipi(vector, nmi);
+ fallthrough;
+ case APIC_DEST_ALLBUT:
+ send_ipi_allbut(vector, nmi);
+ break;
+ default:
+ send_ipi_dest(icr_high, vector, nmi);
+ break;
+ }
+
+ icr_data = ((u64)icr_high) << 32 | icr_low;
+ if (dsh != APIC_DEST_SELF)
+ savic_ghcb_msr_write(APIC_ICR, icr_data);
+ apic_set_reg64(this_cpu_ptr(savic_page), APIC_ICR, icr_data);
+}
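/*
 * Illustrative sketch, not from the patch above: the field extraction
 * that savic_icr_write() performs, spelled out with the <asm/apicdef.h>
 * masks. For example, icr_low = 0x000c04f2 decodes as the
 * APIC_DEST_ALLBUT shorthand with NMI delivery mode. The function name
 * is hypothetical.
 */
static inline void sketch_decode_icr_low(u32 icr_low)
{
	u32 dsh = icr_low & APIC_DEST_ALLBUT;		/* shorthand bits   */
	u32 vector = icr_low & APIC_VECTOR_MASK;	/* vector, bits 7:0 */
	bool nmi = (icr_low & APIC_DM_FIXED_MASK) == APIC_DM_NMI;

	(void)dsh; (void)vector; (void)nmi;
}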
+
+static void savic_write(u32 reg, u32 data)
+{
+ void *ap = this_cpu_ptr(savic_page);
+
+ switch (reg) {
+ case APIC_LVTT:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVTERR:
+ savic_ghcb_msr_write(reg, data);
+ break;
+ case APIC_TASKPRI:
+ case APIC_EOI:
+ case APIC_SPIV:
+ case SAVIC_NMI_REQ:
+ case APIC_ESR:
+ case APIC_ECTRL:
+ case APIC_SEOI:
+ case APIC_IER:
+ case APIC_EILVTn(0) ... APIC_EILVTn(3):
+ apic_set_reg(ap, reg, data);
+ break;
+ case APIC_ICR:
+ savic_icr_write(data, 0);
+ break;
+ case APIC_SELF_IPI:
+ self_ipi_reg_write(data);
+ break;
+ /* ALLOWED_IRR offsets are writable */
+ case SAVIC_ALLOWED_IRR ... SAVIC_ALLOWED_IRR + 0x70:
+ if (IS_ALIGNED(reg - 4, 16)) {
+ apic_set_reg(ap, reg, data);
+ break;
+ }
+ fallthrough;
+ default:
+ pr_err("Error writing unknown Secure AVIC reg offset 0x%x\n", reg);
+ }
+}
+
+static void send_ipi(u32 dest, unsigned int vector, unsigned int dsh)
+{
+ unsigned int icr_low;
+
+ icr_low = __prepare_ICR(dsh, vector, APIC_DEST_PHYSICAL);
+ savic_icr_write(icr_low, dest);
+}
+
+static void savic_send_ipi(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ send_ipi(dest, vector, 0);
+}
+
+static void send_ipi_mask(const struct cpumask *mask, unsigned int vector, bool excl_self)
+{
+ unsigned int cpu, this_cpu;
+
+ guard(irqsave)();
+
+ this_cpu = raw_smp_processor_id();
+
+ for_each_cpu(cpu, mask) {
+ if (excl_self && cpu == this_cpu)
+ continue;
+ send_ipi(per_cpu(x86_cpu_to_apicid, cpu), vector, 0);
+ }
+}
+
+static void savic_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, false);
+}
+
+static void savic_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ send_ipi_mask(mask, vector, true);
+}
+
+static void savic_send_ipi_allbutself(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLBUT);
+}
+
+static void savic_send_ipi_all(int vector)
+{
+ send_ipi(0, vector, APIC_DEST_ALLINC);
+}
+
+static void savic_send_ipi_self(int vector)
+{
+ self_ipi_reg_write(vector);
+}
+
+static void savic_update_vector(unsigned int cpu, unsigned int vector, bool set)
+{
+ update_vector(cpu, SAVIC_ALLOWED_IRR, vector, set);
+}
+
+static void savic_eoi(void)
+{
+ unsigned int cpu;
+ int vec;
+
+ cpu = raw_smp_processor_id();
+ vec = apic_find_highest_vector(get_reg_bitmap(cpu, APIC_ISR));
+ if (WARN_ONCE(vec == -1, "EOI write while no active interrupt in APIC_ISR"))
+ return;
+
+ /* Is this a level-triggered interrupt? */
+ if (apic_test_vector(vec, get_reg_bitmap(cpu, APIC_TMR))) {
+ update_vector(cpu, APIC_ISR, vec, false);
+ /*
+ * Propagate the EOI write to the hypervisor for level-triggered
+ * interrupts. Returning to the guest from the GHCB protocol event
+ * takes care of re-evaluating the interrupt state.
+ */
+ savic_ghcb_msr_write(APIC_EOI, 0);
+ } else {
+ /*
+ * Hardware clears APIC_ISR and re-evaluates the interrupt state
+ * to determine if there is any pending interrupt which can be
+ * delivered to CPU.
+ */
+ native_apic_msr_eoi();
+ }
+}
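/*
 * Illustrative sketch, not from the patch above: the behaviour
 * savic_eoi() assumes from apic_find_highest_vector(), i.e. scan the
 * 256-bit bitmap from the top and return the highest vector whose bit
 * is set, or -1 if none. This sketch takes a plain 8-word array; the
 * real accessor walks words at 16-byte strides in the backing page.
 * The name is hypothetical.
 */
static int sketch_find_highest_vector(const u32 words[8])
{
	int i;

	for (i = 7; i >= 0; i--) {
		if (words[i])
			return i * 32 + fls(words[i]) - 1;
	}
	return -1;
}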
+
+static void savic_teardown(void)
+{
+ /* Disable Secure AVIC */
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL, 0);
+ savic_unregister_gpa(NULL);
+}
+
+static void savic_setup(void)
+{
+ void *ap = this_cpu_ptr(savic_page);
+ enum es_result res;
+ unsigned long gpa;
+
+ /*
+ * Before Secure AVIC is enabled, APIC MSR reads are intercepted.
+ * APIC_ID MSR read returns the value from the hypervisor.
+ */
+ apic_set_reg(ap, APIC_ID, native_apic_msr_read(APIC_ID));
+
+ gpa = __pa(ap);
+
+ /*
+ * The NPT entry for a vCPU's APIC backing page must always be
+ * present when the vCPU is running in order for Secure AVIC to
+ * function. A VMEXIT_BUSY is returned on VMRUN and the vCPU cannot
+ * be resumed if the NPT entry for the APIC backing page is not
+ * present. Notify the hypervisor of the GPA of the vCPU's APIC
+ * backing page by calling savic_register_gpa(). Before executing
+ * VMRUN, the hypervisor makes use of this information to make sure
+ * the APIC backing page is mapped in NPT.
+ */
+ res = savic_register_gpa(gpa);
+ if (res != ES_OK)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ native_wrmsrq(MSR_AMD64_SAVIC_CONTROL,
+ gpa | MSR_AMD64_SAVIC_EN | MSR_AMD64_SAVIC_ALLOWEDNMI);
+}
+
+static int savic_probe(void)
+{
+ if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return 0;
+
+ if (!x2apic_mode) {
+ pr_err("Secure AVIC enabled in non x2APIC mode\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+ /* unreachable */
+ }
+
+ savic_page = alloc_percpu(struct secure_avic_page);
+ if (!savic_page)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SAVIC_FAIL);
+
+ return 1;
+}
+
+static struct apic apic_x2apic_savic __ro_after_init = {
+
+ .name = "secure avic x2apic",
+ .probe = savic_probe,
+ .acpi_madt_oem_check = savic_acpi_madt_oem_check,
+ .setup = savic_setup,
+ .teardown = savic_teardown,
+
+ .dest_mode_logical = false,
+
+ .disable_esr = 0,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
+ .get_apic_id = x2apic_get_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = savic_send_ipi,
+ .send_IPI_mask = savic_send_ipi_mask,
+ .send_IPI_mask_allbutself = savic_send_ipi_mask_allbutself,
+ .send_IPI_allbutself = savic_send_ipi_allbutself,
+ .send_IPI_all = savic_send_ipi_all,
+ .send_IPI_self = savic_send_ipi_self,
+
+ .nmi_to_offline_cpu = true,
+
+ .read = savic_read,
+ .write = savic_write,
+ .eoi = savic_eoi,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = savic_icr_write,
+
+ .update_vector = savic_update_vector,
+};
+
+apic_driver(apic_x2apic_savic);
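/*
 * Editorial sketch, not from the patch above: apic_driver() places a
 * pointer to the driver in a dedicated section that the APIC core scans
 * at boot; the first driver whose ->probe() returns nonzero is selected.
 * A sketch of the shape of that selection loop, with hypothetical names:
 */
static struct apic *sketch_probe_apic(struct apic **drivers, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (drivers[i]->probe && drivers[i]->probe())
			return drivers[i];	/* first match wins */
	}
	return NULL;
}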
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..15209f220e1f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,48 +5,89 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
+#include <linux/crash_dump.h>
+#include <linux/cpuhotplug.h>
#include <linux/cpumask.h>
-#include <linux/hardirq.h>
#include <linux/proc_fs.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/memory.h>
+#include <linux/export.h>
#include <linux/pci.h>
-#include <linux/kdebug.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <asm/e820/api.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
-#include <asm/current.h>
-#include <asm/pgtable.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv.h>
#include <asm/apic.h>
-#include <asm/ipi.h>
-#include <asm/smp.h>
-#include <asm/x86_init.h>
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+#include "local.h"
+
+static enum uv_system_type uv_system_type;
+static int uv_hubbed_system;
+static int uv_hubless_system;
+static u64 gru_start_paddr, gru_end_paddr;
+static union uvh_apicid uvh_apicid;
+static int uv_node_id;
+
+/* Unpack AT/OEM/TABLE IDs into NUL-terminated strings */
+static u8 uv_archtype[UV_AT_SIZE + 1];
+static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
+static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+
+/* Information derived from CPUID and some UV MMRs */
+static struct {
+ unsigned int apicid_shift;
+ unsigned int apicid_mask;
+ unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */
+ unsigned int pnode_mask;
+ unsigned int nasid_shift;
+ unsigned int gpa_shift;
+ unsigned int gnode_shift;
+ unsigned int m_skt;
+ unsigned int n_skt;
+} uv_cpuid;
+
+static int uv_min_hub_revision_id;
+
+static struct apic apic_x2apic_uv_x;
+static struct uv_hub_info_s uv_hub_info_node0;
+
+/* Set this to use hardware error handler instead of kernel panic: */
+static int disable_uv_undefined_panic = 1;
+
+unsigned long uv_undefined(char *str)
+{
+ if (likely(!disable_uv_undefined_panic))
+ panic("UV: error: undefined MMR: %s\n", str);
+ else
+ pr_crit("UV: error: undefined MMR: %s\n", str);
+
+ /* Cause a machine fault: */
+ return ~0ul;
+}
+EXPORT_SYMBOL(uv_undefined);
-#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+ unsigned long val, *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
-static enum uv_system_type uv_system_type;
-static u64 gru_start_paddr, gru_end_paddr;
-int uv_min_hub_revision_id;
-EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
-static DEFINE_SPINLOCK(uv_nmi_lock);
+ return val;
+}
static inline bool is_GRU_range(u64 start, u64 end)
{
+ if (!gru_start_paddr)
+ return false;
+
return start >= gru_start_paddr && end <= gru_end_paddr;
}
@@ -55,65 +96,415 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
return is_ISA_range(start, end) || is_GRU_range(start, end);
}
-static int early_get_nodeid(void)
+static void __init early_get_pnodeid(void)
+{
+ int pnode;
+
+ uv_cpuid.m_skt = 0;
+ if (UVH_RH10_GAM_ADDR_MAP_CONFIG) {
+ union uvh_rh10_gam_addr_map_config_u m_n_config;
+
+ m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG);
+ uv_cpuid.n_skt = m_n_config.s.n_skt;
+ uv_cpuid.nasid_shift = 0;
+ } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) {
+ union uvh_rh_gam_addr_map_config_u m_n_config;
+
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG);
+ uv_cpuid.n_skt = m_n_config.s.n_skt;
+ if (is_uv(UV3))
+ uv_cpuid.m_skt = m_n_config.s3.m_skt;
+ if (is_uv(UV2))
+ uv_cpuid.m_skt = m_n_config.s2.m_skt;
+ uv_cpuid.nasid_shift = 1;
+ } else {
+ unsigned long GAM_ADDR_MAP_CONFIG = 0;
+
+ WARN(GAM_ADDR_MAP_CONFIG == 0,
+ "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n");
+ uv_cpuid.n_skt = 0;
+ uv_cpuid.nasid_shift = 0;
+ }
+
+ if (is_uv(UV4|UVY))
+ uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
+
+ uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1;
+ pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask;
+ uv_cpuid.gpa_shift = 46; /* Default unless changed */
+
+ pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n",
+ uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode);
+}
+
+/* Running on a UV Hubbed system, determine which UV Hub Type it is */
+static int __init early_set_hub_type(void)
{
union uvh_node_id_u node_id;
- unsigned long *mmr;
- mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
- node_id.v = *mmr;
- early_iounmap(mmr, sizeof(*mmr));
+ /*
+ * The NODE_ID MMR is always at offset 0.
+ * Contains the chip part # + revision.
+ * The node_id field started out as 15 bits;
+ * it is now 7, with the upper 8 masked to 0.
+ * All blades/nodes have the same part # and hub revision.
+ */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ uv_node_id = node_id.sx.node_id;
+
+ switch (node_id.s.part_number) {
- /* Currently, all blades have same revision number */
- uv_min_hub_revision_id = node_id.s.revision;
+ case UV5_HUB_PART_NUMBER:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV5_HUB_REVISION_BASE;
+ uv_hub_type_set(UV5);
+ break;
- return node_id.s.node_id;
+ /* UV4/4A only have a revision difference */
+ case UV4_HUB_PART_NUMBER:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV4_HUB_REVISION_BASE - 1;
+ uv_hub_type_set(UV4);
+ if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE)
+ uv_hub_type_set(UV4|UV4A);
+ break;
+
+ case UV3_HUB_PART_NUMBER:
+ case UV3_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV3_HUB_REVISION_BASE;
+ uv_hub_type_set(UV3);
+ break;
+
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id = node_id.s.revision
+ + UV2_HUB_REVISION_BASE - 1;
+ uv_hub_type_set(UV2);
+ break;
+
+ default:
+ return 0;
+ }
+
+ pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n",
+ node_id.s.part_number, node_id.s.revision,
+ uv_min_hub_revision_id, is_uv(~0));
+
+ return 1;
}
-static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static void __init uv_tsc_check_sync(void)
{
- int nodeid;
+ u64 mmr;
+ int sync_state;
+ int mmr_shift;
+ char *state;
+
+ /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */
+ if (!is_uv(UV2|UV3|UV4)) {
+ mark_tsc_async_resets("UV5+");
+ return;
+ }
- if (!strcmp(oem_id, "SGI")) {
- nodeid = early_get_nodeid();
- x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
- x86_platform.nmi_init = uv_nmi_init;
- if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
- else if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
- else if (!strcmp(oem_table_id, "UVH")) {
- __get_cpu_var(x2apic_extra_bits) =
- nodeid << (UV_APIC_PNODE_SHIFT - 1);
- uv_system_type = UV_NON_UNIQUE_APIC;
- return 1;
- }
+ /* UV2,3,4, UV BIOS TSC sync state available */
+ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
+ mmr_shift =
+ is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
+
+ /* Check if TSC is valid for all sockets */
+ switch (sync_state) {
+ case UVH_TSC_SYNC_VALID:
+ state = "in sync";
+ mark_tsc_async_resets("UV BIOS");
+ break;
+
+ /* If BIOS state unknown, don't do anything */
+ case UVH_TSC_SYNC_UNKNOWN:
+ state = "unknown";
+ break;
+
+ /* Otherwise, BIOS indicates problem with TSC */
+ default:
+ state = "unstable";
+ mark_tsc_unstable("UV BIOS");
+ break;
+ }
+ pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state);
+}
+
+/* Selector for (4|4A|5) structs */
+#define uvxy_field(sname, field, undef) ( \
+ is_uv(UV4A) ? sname.s4a.field : \
+ is_uv(UV4) ? sname.s4.field : \
+ is_uv(UV3) ? sname.s3.field : \
+ undef)
+
+static void __init early_get_apic_socketid_shift(void)
+{
+ unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN);
+
+ if (is_uv2_hub() || is_uv3_hub())
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
+
+ if (sid_shift) {
+ uv_cpuid.apicid_shift = 0;
+ uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+ uv_cpuid.socketid_shift = sid_shift;
+ } else {
+ pr_info("UV: CPU does not have valid CPUID.11\n");
+ }
+
+ pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+ pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+}
+
+static void __init uv_stringify(int len, char *to, char *from)
+{
+ strscpy(to, from, len);
+
+ /* Trim trailing spaces */
+ (void)strim(to);
+}
+
+/* Find UV arch type entry in UVsystab */
+static unsigned long __init early_find_archtype(struct uv_systab *st)
+{
+ int i;
+
+ for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+ unsigned long ptr = st->entry[i].offset;
+
+ if (!ptr)
+ continue;
+ ptr += (unsigned long)st;
+ if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE)
+ return ptr;
+ }
+ return 0;
+}
+
+/* Validate UV arch type field in UVsystab */
+static int __init decode_arch_type(unsigned long ptr)
+{
+ struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr;
+ int n = strlen(uv_ate->archtype);
+
+ if (n > 0 && n < sizeof(uv_ate->archtype)) {
+ pr_info("UV: UVarchtype received from BIOS\n");
+ uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);
+ return 1;
}
return 0;
}
+/* Determine if UV arch type entry might exist in UVsystab */
+static int __init early_get_arch_type(void)
+{
+ unsigned long uvst_physaddr, uvst_size, ptr;
+ struct uv_systab *st;
+ u32 rev;
+ int ret;
+
+ uvst_physaddr = get_uv_systab_phys(0);
+ if (!uvst_physaddr)
+ return 0;
+
+ st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab));
+ if (!st) {
+ pr_err("UV: Cannot access UVsystab, remap failed\n");
+ return 0;
+ }
+
+ rev = st->revision;
+ if (rev < UV_SYSTAB_VERSION_UV5) {
+ early_memunmap(st, sizeof(struct uv_systab));
+ return 0;
+ }
+
+ uvst_size = st->size;
+ early_memunmap(st, sizeof(struct uv_systab));
+ st = early_memremap_ro(uvst_physaddr, uvst_size);
+ if (!st) {
+ pr_err("UV: Cannot access UVarchtype, remap failed\n");
+ return 0;
+ }
+
+ ptr = early_find_archtype(st);
+ if (!ptr) {
+ early_memunmap(st, uvst_size);
+ return 0;
+ }
+
+ ret = decode_arch_type(ptr);
+ early_memunmap(st, uvst_size);
+ return ret;
+}
+
+/* UV system found, check which APIC mode the BIOS already selected */
+static void __init early_set_apic_mode(void)
+{
+ if (x2apic_enabled())
+ uv_system_type = UV_X2APIC;
+ else
+ uv_system_type = UV_LEGACY_APIC;
+}
+
+static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
+{
+ /* Save OEM_ID passed from ACPI MADT */
+ uv_stringify(sizeof(oem_id), oem_id, _oem_id);
+
+ /* Check if BIOS sent us a UVarchtype */
+ if (!early_get_arch_type())
+
+ /* If not, use the OEM ID for UVarchtype */
+ uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);
+
+ /* Check if not hubbed */
+ if (strncmp(uv_archtype, "SGI", 3) != 0) {
+
+ /* (Not hubbed), check if not hubless */
+ if (strncmp(uv_archtype, "NSGI", 4) != 0)
+
+ /* (Not hubless), not a UV */
+ return 0;
+
+ /* Is UV hubless system */
+ uv_hubless_system = 0x01;
+
+ /* UV5 Hubless */
+ if (strncmp(uv_archtype, "NSGI5", 5) == 0)
+ uv_hubless_system |= 0x20;
+
+ /* UV4 Hubless: CH */
+ else if (strncmp(uv_archtype, "NSGI4", 5) == 0)
+ uv_hubless_system |= 0x10;
+
+ /* UV3 Hubless: UV300/MC990X w/o hub */
+ else
+ uv_hubless_system |= 0x8;
+
+ /* Copy OEM Table ID */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
+ pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
+ oem_id, oem_table_id, uv_system_type, uv_hubless_system);
+
+ return 0;
+ }
+
+ if (numa_off) {
+ pr_err("UV: NUMA is off, disabling UV support\n");
+ return 0;
+ }
+
+ /* Set hubbed type if true */
+ uv_hub_info->hub_revision =
+ !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE :
+ !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
+ !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(uv_archtype, "SGI2") ? UV2_HUB_REVISION_BASE : 0;
+
+ switch (uv_hub_info->hub_revision) {
+ case UV5_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x21;
+ uv_hub_type_set(UV5);
+ break;
+
+ case UV4_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x11;
+ uv_hub_type_set(UV4);
+ break;
+
+ case UV3_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x9;
+ uv_hub_type_set(UV3);
+ break;
+
+ case UV2_HUB_REVISION_BASE:
+ uv_hubbed_system = 0x5;
+ uv_hub_type_set(UV2);
+ break;
+
+ default:
+ return 0;
+ }
+
+ /* Get UV hub chip part number & revision */
+ early_set_hub_type();
+
+ /* Other UV setup functions */
+ early_set_apic_mode();
+ early_get_pnodeid();
+ early_get_apic_socketid_shift();
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+ uv_tsc_check_sync();
+
+ return 1;
+}
+
+/* Called early to probe for the correct APIC driver */
+static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
+{
+ /* Set up early hub info fields for Node 0 */
+ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
+
+ /* If not UV, return. */
+ if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
+ return 0;
+
+ /* Save for display of the OEM Table ID */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
+
+ pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n",
+ oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY),
+ uv_min_hub_revision_id);
+
+ return 0;
+}
+
enum uv_system_type get_uv_system_type(void)
{
return uv_system_type;
}
+int uv_get_hubless_system(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(uv_get_hubless_system);
+
+ssize_t uv_get_archtype(char *buf, int len)
+{
+ return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id);
+}
+EXPORT_SYMBOL_GPL(uv_get_archtype);
+
int is_uv_system(void)
{
return uv_system_type != UV_NONE;
}
EXPORT_SYMBOL_GPL(is_uv_system);
-DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
+int is_uv_hubbed(int uvtype)
+{
+ return (uv_hubbed_system & uvtype);
+}
+EXPORT_SYMBOL_GPL(is_uv_hubbed);
-struct uv_blade_info *uv_blade_info;
-EXPORT_SYMBOL_GPL(uv_blade_info);
+static int is_uv_hubless(int uvtype)
+{
+ return (uv_hubless_system & uvtype);
+}
-short *uv_node_to_blade;
-EXPORT_SYMBOL_GPL(uv_node_to_blade);
+void **__uv_hub_info_list;
+EXPORT_SYMBOL_GPL(__uv_hub_info_list);
-short *uv_cpu_to_blade;
-EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
+DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info);
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info);
short uv_possible_blades;
EXPORT_SYMBOL_GPL(uv_possible_blades);
@@ -121,50 +512,202 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
-static const struct cpumask *uv_target_cpus(void)
+/* The following values are used for the per node hub info struct */
+static __initdata unsigned short _min_socket, _max_socket;
+static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
+static __initdata struct uv_gam_range_entry *uv_gre_table;
+static __initdata struct uv_gam_parameters *uv_gp_table;
+static __initdata unsigned short *_socket_to_node;
+static __initdata unsigned short *_socket_to_pnode;
+static __initdata unsigned short *_pnode_to_socket;
+static __initdata unsigned short *_node_to_socket;
+
+static __initdata struct uv_gam_range_s *_gr_table;
+
+#define SOCK_EMPTY ((unsigned short)~0)
+
+/* Default UV memory block size is 2GB */
+static unsigned long mem_block_size __initdata = (2UL << 30);
+
+/* Kernel parameter to specify UV mem block size */
+static int __init parse_mem_block_size(char *ptr)
{
- return cpu_online_mask;
+ unsigned long size = memparse(ptr, NULL);
+
+ /* Size will be rounded down by set_block_size() below */
+ mem_block_size = size;
+ return 0;
}
+early_param("uv_memblksize", parse_mem_block_size);
-static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
+static __init int adj_blksize(u32 lgre)
{
- cpumask_clear(retmask);
- cpumask_set_cpu(cpu, retmask);
+ unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT;
+ unsigned long size;
+
+ for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1)
+ if (IS_ALIGNED(base, size))
+ break;
+
+ if (size >= mem_block_size)
+ return 0;
+
+ mem_block_size = size;
+ return 1;
}
-static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static __init void set_block_size(void)
+{
+ unsigned int order = ffs(mem_block_size);
+
+ if (order) {
+ /* adjust for ffs return of 1..64 */
+ set_memory_block_size_order(order - 1);
+ pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size);
+ } else {
+ /* bad or zero value, default to 1UL << 31 (2GB) */
+ pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size);
+ set_memory_block_size_order(31);
+ }
+}
+
+/* Build GAM range lookup table: */
+static __init void build_uv_gr_table(void)
+{
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ struct uv_gam_range_s *grt;
+ unsigned long last_limit = 0, ram_limit = 0;
+ int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
+
+ if (!gre)
+ return;
+
+ bytes = _gr_table_len * sizeof(struct uv_gam_range_s);
+ grt = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!grt))
+ return;
+ _gr_table = grt;
+
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE) {
+ if (!ram_limit) {
+ /* Mark hole between RAM/non-RAM: */
+ ram_limit = last_limit;
+ last_limit = gre->limit;
+ lsid++;
+ continue;
+ }
+ last_limit = gre->limit;
+ pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table));
+ continue;
+ }
+ if (_max_socket < gre->sockid) {
+ pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table));
+ continue;
+ }
+ sid = gre->sockid - _min_socket;
+ if (lsid < sid) {
+ /* New range: */
+ grt = &_gr_table[indx];
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid = sid;
+ lindx = indx++;
+ continue;
+ }
+ /* Update range: */
+ if (lsid == sid && !ram_limit) {
+ /* .. if contiguous: */
+ if (grt->limit == last_limit) {
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ }
+ /* Non-contiguous RAM range: */
+ if (!ram_limit) {
+ grt++;
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ /* Non-contiguous/non-RAM: */
+ grt++;
+ /* base is this entry */
+ grt->base = grt - _gr_table;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid++;
+ }
+
+ /* Shorten table if possible */
+ grt++;
+ i = grt - _gr_table;
+ if (i < _gr_table_len) {
+ void *ret;
+
+ bytes = i * sizeof(struct uv_gam_range_s);
+ ret = krealloc(_gr_table, bytes, GFP_KERNEL);
+ if (ret) {
+ _gr_table = ret;
+ _gr_table_len = i;
+ }
+ }
+
+ /* Display resultant GAM range table: */
+ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) {
+ unsigned long start, end;
+ int gb = grt->base;
+
+ start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
+ end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
+
+ pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb);
+ }
+}
+
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
-#ifdef CONFIG_SMP
unsigned long val;
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
+
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_INIT;
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
- mdelay(10);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_STARTUP;
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
- atomic_set(&init_deasserted, 1);
-#endif
return 0;
}
static void uv_send_IPI_one(int cpu, int vector)
{
- unsigned long apicid;
- int pnode;
+ unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ int pnode = uv_apicid_to_pnode(apicid);
+ unsigned long dmode, val;
+
+ if (vector == NMI_VECTOR)
+ dmode = APIC_DELIVERY_MODE_NMI;
+ else
+ dmode = APIC_DELIVERY_MODE_FIXED;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
- pnode = uv_apicid_to_pnode(apicid);
- uv_hub_send_ipi(pnode, apicid, vector);
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -202,183 +745,74 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static int uv_apic_id_registered(void)
-{
- return 1;
-}
-
-static void uv_init_apic_ldr(void)
-{
-}
-
-static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- int cpu = cpumask_first(cpumask);
-
- if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
- else
- return BAD_APICID;
-}
-
-static unsigned int
-uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask)
-{
- int cpu;
-
- /*
- * We're using fixed IRQ delivery, can only return one phys APIC ID.
- * May as well be the first.
- */
- for_each_cpu_and(cpu, cpumask, andmask) {
- if (cpumask_test_cpu(cpu, cpu_online_mask))
- break;
- }
- return per_cpu(x86_cpu_to_apicid, cpu);
-}
-
-static unsigned int x2apic_get_apic_id(unsigned long x)
-{
- unsigned int id;
-
- WARN_ON(preemptible() && num_online_cpus() > 1);
- id = x | __get_cpu_var(x2apic_extra_bits);
-
- return id;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
- unsigned long x;
-
- /* maskout x2apic_extra_bits ? */
- x = id;
- return x;
-}
-
-static unsigned int uv_read_apic_id(void)
-{
-
- return x2apic_get_apic_id(apic_read(APIC_ID));
-}
-
-static int uv_phys_pkg_id(int initial_apicid, int index_msb)
-{
- return uv_read_apic_id() >> index_msb;
-}
-
-static void uv_send_IPI_self(int vector)
+static int uv_probe(void)
{
- apic_write(APIC_SELF_IPI, vector);
+ return apic == &apic_x2apic_uv_x;
}
-struct apic __refdata apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
- .probe = NULL,
+ .probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .apic_id_registered = uv_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .dest_mode_logical = false,
- .target_cpus = uv_target_cpus,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
- .check_apicid_present = NULL,
-
- .vector_allocation_domain = uv_vector_allocation_domain,
- .init_apic_ldr = uv_init_apic_ldr,
-
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
- .multi_timer_check = NULL,
- .apicid_to_node = NULL,
- .cpu_to_logical_apicid = NULL,
+
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = uv_phys_pkg_id,
- .mps_oem_check = NULL,
+ .max_apic_id = UINT_MAX,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = set_apic_id,
- .apic_id_mask = 0xFFFFFFFFu,
- .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
- .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+ .calc_dest_apicid = apic_default_calc_apicid,
+ .send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_all = uv_send_IPI_all,
- .send_IPI_self = uv_send_IPI_self,
+ .send_IPI_self = x2apic_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
-static __cpuinit void set_x2apic_extra_bits(int pnode)
-{
- __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
-}
-
-/*
- * Called on boot cpu.
- */
-static __init int boot_pnode_to_blade(int pnode)
-{
- int blade;
-
- for (blade = 0; blade < uv_num_possible_blades(); blade++)
- if (pnode == uv_blade_info[blade].pnode)
- return blade;
- BUG();
-}
-
-struct redir_addr {
- unsigned long redirect;
- unsigned long alias;
-};
-
-#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
-
-static __initdata struct redir_addr redir_addrs[] = {
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
-};
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
+#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
- union uvh_si_alias0_overlay_config_u alias;
- union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ union uvh_rh_gam_alias_2_overlay_config_u alias;
+ union uvh_rh_gam_alias_2_redirect_config_u redirect;
+ unsigned long m_redirect;
+ unsigned long m_overlay;
int i;
- for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
- alias.v = uv_read_local_mmr(redir_addrs[i].alias);
+ for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) {
+ switch (i) {
+ case 0:
+ m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG;
+ break;
+ case 1:
+ m_redirect = UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG;
+ break;
+ case 2:
+ m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG;
+ m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG;
+ break;
+ }
+ alias.v = uv_read_local_mmr(m_overlay);
if (alias.s.enable && alias.s.base == 0) {
*size = (1UL << alias.s.m_alias);
- redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
+ redirect.v = uv_read_local_mmr(m_redirect);
*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
return;
}
@@ -387,61 +821,291 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
}
enum map_type {map_wb, map_uc};
+static const char * const mt[] = { "WB", "UC" };
-static __init void map_high(char *id, unsigned long base, int pshift,
- int bshift, int max_pnode, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
{
unsigned long bytes, paddr;
paddr = base << pshift;
bytes = (1UL << bshift) * (max_pnode + 1);
- printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
- paddr + bytes);
+ if (!paddr) {
+ pr_info("UV: Map %s_HI base address NULL\n", id);
+ return;
+ }
if (map_type == map_uc)
init_extra_mapping_uc(paddr, bytes);
else
init_extra_mapping_wb(paddr, bytes);
+ pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n",
+ id, paddr, paddr + bytes, mt[map_type], max_pnode + 1);
}
+
static __init void map_gru_high(int max_pnode)
{
- union uvh_rh_gam_gru_overlay_config_mmr_u gru;
- int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
- gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
- if (gru.s.enable) {
- map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
- gru_start_paddr = ((u64)gru.s.base << shift);
- gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+ union uvh_rh_gam_gru_overlay_config_u gru;
+ unsigned long mask, base;
+ int shift;
+
+ if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) {
+ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG);
+ shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT;
+ mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK;
+ } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) {
+ gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG);
+ shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT;
+ mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK;
+ } else {
+ pr_err("UV: GRU unavailable (no MMR)\n");
+ return;
+ }
+ if (!gru.s.enable) {
+ pr_info("UV: GRU disabled (by BIOS)\n");
+ return;
}
+
+ base = (gru.v & mask) >> shift;
+ map_high("GRU", base, shift, shift, max_pnode, map_wb);
+ gru_start_paddr = ((u64)base << shift);
+ gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
}
static __init void map_mmr_high(int max_pnode)
{
- union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
- int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ unsigned long base;
+ int shift;
+ bool enable;
+
+ if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) {
+ union uvh_rh10_gam_mmr_overlay_config_u mmr;
+
+ mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG);
+ enable = mmr.s.enable;
+ base = mmr.s.base;
+ shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT;
+ } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) {
+ union uvh_rh_gam_mmr_overlay_config_u mmr;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG);
+ enable = mmr.s.enable;
+ base = mmr.s.base;
+ shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT;
+ } else {
+ pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n",
+ __func__);
+ return;
+ }
- mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
- if (mmr.s.enable)
- map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
+ if (enable)
+ map_high("MMR", base, shift, shift, max_pnode, map_uc);
+ else
+ pr_info("UV: MMR disabled\n");
}
-static __init void map_mmioh_high(int max_pnode)
+/* Arch specific ENUM cases */
+enum mmioh_arch {
+ UV2_MMIOH = -1,
+ UVY_MMIOH0, UVY_MMIOH1,
+ UVX_MMIOH0, UVX_MMIOH1,
+};
+
+/* Calculate and Map MMIOH Regions */
+static void __init calc_mmioh_map(enum mmioh_arch index,
+ int min_pnode, int max_pnode,
+ int shift, unsigned long base, int m_io, int n_io)
{
- union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
- int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ unsigned long mmr, nasid_mask;
+ int nasid, min_nasid, max_nasid, lnasid, mapped;
+ int i, fi, li, n, max_io;
+ char id[8];
+
+ /* One (UV2) mapping */
+ if (index == UV2_MMIOH) {
+ strscpy(id, "MMIOH", sizeof(id));
+ max_io = max_pnode;
+ mapped = 0;
+ goto map_exit;
+ }
- mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
- if (mmioh.s.enable)
- map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
- max_pnode, map_uc);
+ /* small and large MMIOH mappings */
+ switch (index) {
+ case UVY_MMIOH0:
+ mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
+ n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
+ min_nasid = min_pnode;
+ max_nasid = max_pnode;
+ mapped = 1;
+ break;
+ case UVY_MMIOH1:
+ mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1;
+ nasid_mask = UVYH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
+ n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
+ min_nasid = min_pnode;
+ max_nasid = max_pnode;
+ mapped = 1;
+ break;
+ case UVX_MMIOH0:
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH;
+ min_nasid = min_pnode * 2;
+ max_nasid = max_pnode * 2;
+ mapped = 1;
+ break;
+ case UVX_MMIOH1:
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH;
+ min_nasid = min_pnode * 2;
+ max_nasid = max_pnode * 2;
+ mapped = 1;
+ break;
+ default:
+ pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index);
+ return;
+ }
+
+ /* enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */
+ snprintf(id, sizeof(id), "MMIOH%d", index%2);
+
+ max_io = lnasid = fi = li = -1;
+ for (i = 0; i < n; i++) {
+ unsigned long m_redirect = mmr + i * 8;
+ unsigned long redirect = uv_read_local_mmr(m_redirect);
+
+ nasid = redirect & nasid_mask;
+ if (i == 0)
+ pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
+ id, redirect, m_redirect, nasid);
+
+ /* Invalid NASID check */
+ if (nasid < min_nasid || max_nasid < nasid) {
+ /* Not an error: unused table entries get "poison" values */
+ pr_debug("UV:%s:Invalid NASID(%x):%x (range:%x..%x)\n",
+ __func__, index, nasid, min_nasid, max_nasid);
+ nasid = -1;
+ }
+
+ if (nasid == lnasid) {
+ li = i;
+ /* Last entry check: */
+ if (i != n-1)
+ continue;
+ }
+
+ /* Check if we have a cached (or last) redirect to print: */
+ if (lnasid != -1 || (i == n-1 && nasid != -1)) {
+ unsigned long addr1, addr2;
+ int f, l;
+
+ if (lnasid == -1) {
+ f = l = i;
+ lnasid = nasid;
+ } else {
+ f = fi;
+ l = li;
+ }
+ addr1 = (base << shift) + f * (1ULL << m_io);
+ addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
+ id, fi, li, lnasid, addr1, addr2);
+ if (max_io < l)
+ max_io = l;
+ }
+ fi = li = i;
+ lnasid = nasid;
+ }
+
+map_exit:
+ pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n",
+ id, base, shift, m_io, max_io, max_pnode);
+
+ if (max_io >= 0 && !mapped)
+ map_high(id, base, shift, m_io, max_io, map_uc);
+}
+
+static __init void map_mmioh_high(int min_pnode, int max_pnode)
+{
+ /* UVY flavor */
+ if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) {
+ union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0;
+ union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1;
+
+ mmioh0.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0);
+ if (unlikely(mmioh0.s.enable == 0))
+ pr_info("UV: MMIOH0 disabled\n");
+ else
+ calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode,
+ UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT,
+ mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io);
+
+ mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1);
+ if (unlikely(mmioh1.s.enable == 0))
+ pr_info("UV: MMIOH1 disabled\n");
+ else
+ calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode,
+ UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT,
+ mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io);
+ return;
+ }
+ /* UVX flavor */
+ if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) {
+ union uvh_rh_gam_mmioh_overlay_config0_u mmioh0;
+ union uvh_rh_gam_mmioh_overlay_config1_u mmioh1;
+
+ mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0);
+ if (unlikely(mmioh0.s.enable == 0))
+ pr_info("UV: MMIOH0 disabled\n");
+ else {
+ unsigned long base = uvxy_field(mmioh0, base, 0);
+ int m_io = uvxy_field(mmioh0, m_io, 0);
+ int n_io = uvxy_field(mmioh0, n_io, 0);
+
+ calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode,
+ UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT,
+ base, m_io, n_io);
+ }
+
+ mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1);
+ if (unlikely(mmioh1.s.enable == 0))
+ pr_info("UV: MMIOH1 disabled\n");
+ else {
+ unsigned long base = uvxy_field(mmioh1, base, 0);
+ int m_io = uvxy_field(mmioh1, m_io, 0);
+ int n_io = uvxy_field(mmioh1, n_io, 0);
+
+ calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode,
+ UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT,
+ base, m_io, n_io);
+ }
+ return;
+ }
+
+ /* UV2 flavor */
+ if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) {
+ union uvh_rh_gam_mmioh_overlay_config_u mmioh;
+
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG);
+ if (unlikely(mmioh.s2.enable == 0))
+ pr_info("UV: MMIOH disabled\n");
+ else
+ calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode,
+ UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT,
+ mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io);
+ return;
+ }
}
static __init void map_low_mmrs(void)
{
- init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
- init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+ if (UV_GLOBAL_MMR32_BASE)
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+
+ if (UV_LOCAL_MMR_BASE)
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
}
static __init void uv_rtc_init(void)
@@ -449,315 +1113,708 @@ static __init void uv_rtc_init(void)
long status;
u64 ticks_per_sec;
- status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
- &ticks_per_sec);
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec);
+
if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
- printk(KERN_WARNING
- "unable to determine platform RTC clock frequency, "
- "guessing.\n");
- /* BIOS gives wrong value for clock freq. so guess */
+ pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n");
+
+ /* BIOS gives wrong value for clock frequency, so guess: */
sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
- } else
+ } else {
sn_rtc_cycles_per_second = ticks_per_sec;
+ }
}
-/*
- * percpu heartbeat timer
- */
-static void uv_heartbeat(unsigned long ignored)
+/* Direct Legacy VGA I/O traffic to designated IOH */
+static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
{
- struct timer_list *timer = &uv_hub_info->scir.timer;
- unsigned char bits = uv_hub_info->scir.state;
+ int domain, bus, rc;
- /* flip heartbeat bit */
- bits ^= SCIR_CPU_HEARTBEAT;
+ if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
+ return 0;
- /* is this cpu idle? */
- if (idle_cpu(raw_smp_processor_id()))
- bits &= ~SCIR_CPU_ACTIVITY;
- else
- bits |= SCIR_CPU_ACTIVITY;
+ if ((command_bits & PCI_COMMAND_IO) == 0)
+ return 0;
- /* update system controller interface reg */
- uv_set_scir_bits(bits);
+ domain = pci_domain_nr(pdev->bus);
+ bus = pdev->bus->number;
- /* enable next timer period */
- mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+ rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
+
+ return rc;
}
-static void __cpuinit uv_heartbeat_enable(int cpu)
+/*
+ * Called on each CPU to initialize the per_cpu UV data area.
+ * FIXME: hotplug not supported yet
+ */
+void uv_cpu_init(void)
{
- while (!uv_cpu_hub_info(cpu)->scir.enabled) {
- struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+ /* CPU 0 initialization will be done via uv_system_init. */
+ if (smp_processor_id() == 0)
+ return;
- uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
- setup_timer(timer, uv_heartbeat, cpu);
- timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
- add_timer_on(timer, cpu);
- uv_cpu_hub_info(cpu)->scir.enabled = 1;
+ uv_hub_info->nr_online_cpus++;
+}
+
+struct mn {
+ unsigned char m_val;
+ unsigned char n_val;
+ unsigned char m_shift;
+ unsigned char n_lshift;
+};
- /* also ensure that boot cpu is enabled */
- cpu = 0;
+/* Initialize caller's MN struct and fill in values */
+static void get_mn(struct mn *mnp)
+{
+ memset(mnp, 0, sizeof(*mnp));
+ mnp->n_val = uv_cpuid.n_skt;
+ if (is_uv(UV4|UVY)) {
+ mnp->m_val = 0;
+ mnp->n_lshift = 0;
+ } else if (is_uv3_hub()) {
+ union uvyh_gr0_gam_gr_config_u m_gr_config;
+
+ mnp->m_val = uv_cpuid.m_skt;
+ m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG);
+ mnp->n_lshift = m_gr_config.s3.m_skt;
+ } else if (is_uv2_hub()) {
+ mnp->m_val = uv_cpuid.m_skt;
+ mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
}
+ mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-static void __cpuinit uv_heartbeat_disable(int cpu)
+static void __init uv_init_hub_info(struct uv_hub_info_s *hi)
{
- if (uv_cpu_hub_info(cpu)->scir.enabled) {
- uv_cpu_hub_info(cpu)->scir.enabled = 0;
- del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ struct mn mn;
+
+ get_mn(&mn);
+ hi->gpa_mask = mn.m_val ?
+ (1UL << (mn.m_val + mn.n_val)) - 1 :
+ (1UL << uv_cpuid.gpa_shift) - 1;
+
+ hi->m_val = mn.m_val;
+ hi->n_val = mn.n_val;
+ hi->m_shift = mn.m_shift;
+ hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
+ hi->hub_revision = uv_hub_info->hub_revision;
+ hi->hub_type = uv_hub_info->hub_type;
+ hi->pnode_mask = uv_cpuid.pnode_mask;
+ hi->nasid_shift = uv_cpuid.nasid_shift;
+ hi->min_pnode = _min_pnode;
+ hi->min_socket = _min_socket;
+ hi->node_to_socket = _node_to_socket;
+ hi->pnode_to_socket = _pnode_to_socket;
+ hi->socket_to_node = _socket_to_node;
+ hi->socket_to_pnode = _socket_to_pnode;
+ hi->gr_table_len = _gr_table_len;
+ hi->gr_table = _gr_table;
+
+ uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
+ hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+ if (mn.m_val)
+ hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val;
+
+ if (uv_gp_table) {
+ hi->global_mmr_base = uv_gp_table->mmr_base;
+ hi->global_mmr_shift = uv_gp_table->mmr_shift;
+ hi->global_gru_base = uv_gp_table->gru_base;
+ hi->global_gru_shift = uv_gp_table->gru_shift;
+ hi->gpa_shift = uv_gp_table->gpa_shift;
+ hi->gpa_mask = (1UL << hi->gpa_shift) - 1;
+ } else {
+ hi->global_mmr_base =
+ uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) &
+ ~UV_MMR_ENABLE;
+ hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
}
- uv_set_cpu_scir_bits(cpu, 0xff);
+
+ get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top);
+
+ hi->apic_pnode_shift = uv_cpuid.socketid_shift;
+
+ /* Show system specific info: */
+ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
+ pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
+ pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift);
+ if (hi->global_gru_base)
+ pr_info("UV: gru_base/shift:0x%lx/%ld\n",
+ hi->global_gru_base, hi->global_gru_shift);
+
+ pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
}
-/*
- * cpu hotplug notifier
- */
-static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static void __init decode_gam_params(unsigned long ptr)
{
- long cpu = (long)hcpu;
+ uv_gp_table = (struct uv_gam_parameters *)ptr;
- switch (action) {
- case CPU_ONLINE:
- uv_heartbeat_enable(cpu);
- break;
- case CPU_DOWN_PREPARE:
- uv_heartbeat_disable(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ pr_info("UV: GAM Params...\n");
+ pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n",
+ uv_gp_table->mmr_base, uv_gp_table->mmr_shift,
+ uv_gp_table->gru_base, uv_gp_table->gru_shift,
+ uv_gp_table->gpa_shift);
}
-static __init void uv_scir_register_cpu_notifier(void)
+static void __init decode_gam_rng_tbl(unsigned long ptr)
{
- hotcpu_notifier(uv_scir_cpu_notify, 0);
-}
+ struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
+ unsigned long lgre = 0, gend = 0;
+ int index = 0;
+ int sock_min = INT_MAX, pnode_min = INT_MAX;
+ int sock_max = -1, pnode_max = -1;
+
+ uv_gre_table = gre;
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ unsigned long size = ((unsigned long)(gre->limit - lgre)
+ << UV_GAM_RANGE_SHFT);
+ int order = 0;
+ char suffix[] = " KMGTPE";
+ int flag = ' ';
+
+ while (size > 9999 && order < sizeof(suffix)) {
+ size /= 1024;
+ order++;
+ }
-#else /* !CONFIG_HOTPLUG_CPU */
+ /* adjust max block size to current range start */
+ if (gre->type == 1 || gre->type == 2)
+ if (adj_blksize(lgre))
+ flag = '*';
-static __init void uv_scir_register_cpu_notifier(void)
-{
+ if (!index) {
+ pr_info("UV: GAM Range Table...\n");
+ pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
+ }
+ pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n",
+ index++,
+ (unsigned long)lgre << UV_GAM_RANGE_SHFT,
+ (unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
+ flag, size, suffix[order],
+ gre->type, gre->nasid, gre->sockid, gre->pnode);
+
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT;
+
+ /* update to next range start */
+ lgre = gre->limit;
+ if (sock_min > gre->sockid)
+ sock_min = gre->sockid;
+ if (sock_max < gre->sockid)
+ sock_max = gre->sockid;
+ if (pnode_min > gre->pnode)
+ pnode_min = gre->pnode;
+ if (pnode_max < gre->pnode)
+ pnode_max = gre->pnode;
+ }
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ _min_pnode = pnode_min;
+ _max_pnode = pnode_max;
+ _gr_table_len = index;
+
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n",
+ index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend));
}
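
The size-suffix loop above folds a byte count down by powers of 1024 and picks the matching " KMGTPE" character. A minimal stand-alone sketch of the same idea (print_human_size is a hypothetical helper; the loop bound is tightened by one so the string's terminating NUL can never be indexed):

#include <stdio.h>

static void print_human_size(unsigned long size)
{
	static const char suffix[] = " KMGTPE";
	int order = 0;

	/* Divide until the value fits in four digits, as above */
	while (size > 9999 && order < (int)sizeof(suffix) - 2) {
		size /= 1024;
		order++;
	}
	printf("%lu%c\n", size, suffix[order]);
}

int main(void)
{
	print_human_size(123456789UL);	/* prints "117M" */
	return 0;
}
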
-static __init int uv_init_heartbeat(void)
+/* Walk through UVsystab decoding the fields */
+static int __init decode_uv_systab(void)
{
- int cpu;
+ struct uv_systab *st;
+ int i;
- if (is_uv_system())
- for_each_online_cpu(cpu)
- uv_heartbeat_enable(cpu);
+ /* Get mapped UVsystab pointer */
+ st = uv_systab;
+
+ /* If UVsystab is version 1, there is no extended UVsystab */
+ if (st && st->revision == UV_SYSTAB_VERSION_1)
+ return 0;
+
+ if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) {
+ int rev = st ? st->revision : 0;
+
+ pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n",
+ rev, UV_SYSTAB_VERSION_UV4_LATEST);
+ pr_err("UV: Does not support UV, switch to non-UV x86_64\n");
+ uv_system_type = UV_NONE;
+
+ return -EINVAL;
+ }
+
+ for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+ unsigned long ptr = st->entry[i].offset;
+
+ if (!ptr)
+ continue;
+
+ /* point to payload */
+ ptr += (unsigned long)st;
+
+ switch (st->entry[i].type) {
+ case UV_SYSTAB_TYPE_GAM_PARAMS:
+ decode_gam_params(ptr);
+ break;
+
+ case UV_SYSTAB_TYPE_GAM_RNG_TBL:
+ decode_gam_rng_tbl(ptr);
+ break;
+
+ case UV_SYSTAB_TYPE_ARCH_TYPE:
+ /* already processed in early startup */
+ break;
+
+ default:
+ pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n",
+ __func__, st->entry[i].type);
+ break;
+ }
+ }
return 0;
}
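
decode_uv_systab() is a classic offset-table walk: each entry stores its payload's offset relative to the start of the table, so the absolute address is the table base plus that offset. A minimal sketch of the pattern with hypothetical types (UV_SYSTAB_TYPE_UNUSED is modeled as 0 here):

#include <stdio.h>
#include <stdint.h>

struct entry { int type; uint32_t offset; };
struct systab { struct entry entry[4]; };

static void walk(struct systab *st)
{
	for (int i = 0; st->entry[i].type != 0; i++) {
		if (!st->entry[i].offset)
			continue;		/* entry has no payload */
		/* Point to payload: table base plus stored offset */
		void *ptr = (char *)st + st->entry[i].offset;
		printf("type %d payload at %p\n", st->entry[i].type, ptr);
	}
}

int main(void)
{
	struct systab st = { .entry = { { 1, sizeof(st) }, { 0, 0 } } };

	walk(&st);
	return 0;
}
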
-late_initcall(uv_init_heartbeat);
+/*
+ * Given a bitmask 'bits' representing present blades, numbered
+ * starting at 'base', masking off unused high bits of the blade
+ * number with 'mask', update the minimum and maximum blade numbers
+ * found so far. (Masking with 'mask' is necessary because of the
+ * BIOS's treatment of system partitioning when creating the table
+ * we are interpreting.)
+ */
+static inline void blade_update_min_max(unsigned long bits, int base, int mask, int *min, int *max)
+{
+ int first, last;
+
+ if (!bits)
+ return;
+ first = (base + __ffs(bits)) & mask;
+ last = (base + __fls(bits)) & mask;
-#endif /* !CONFIG_HOTPLUG_CPU */
+ if (*min > first)
+ *min = first;
+ if (*max < last)
+ *max = last;
+}
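
A stand-alone illustration of the min/max update, assuming a 64-bit unsigned long and modeling the kernel's __ffs()/__fls() with GCC builtins (all names here are hypothetical):

#include <stdio.h>
#include <limits.h>

static void update_min_max(unsigned long bits, int base, int mask,
			   int *min, int *max)
{
	int first, last;

	if (!bits)
		return;
	first = (base + __builtin_ctzl(bits)) & mask;		/* __ffs() */
	last = (base + 63 - __builtin_clzl(bits)) & mask;	/* __fls() */
	if (*min > first)
		*min = first;
	if (*max < last)
		*max = last;
}

int main(void)
{
	int min = INT_MAX, max = -1;

	update_min_max(0x14UL, 0, 0xff, &min, &max);	/* bits 2 and 4 set */
	printf("min=%d max=%d\n", min, max);		/* min=2 max=4 */
	return 0;
}
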
-/* Direct Legacy VGA I/O traffic to designated IOH */
-int uv_set_vga_state(struct pci_dev *pdev, bool decode,
- unsigned int command_bits, bool change_bridge)
+/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */
+static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
{
- int domain, bus, rc;
+ unsigned long np;
+ int i, uv_pb = 0;
+ int sock_min = INT_MAX, sock_max = -1, s_mask;
+
+ s_mask = (1 << uv_cpuid.n_skt) - 1;
+
+ if (UVH_NODE_PRESENT_TABLE) {
+ pr_info("UV: NODE_PRESENT_DEPTH = %d\n",
+ UVH_NODE_PRESENT_TABLE_DEPTH);
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+ pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np);
+ blade_update_min_max(np, i * 64, s_mask, &sock_min, &sock_max);
+ }
+ }
+ if (UVH_NODE_PRESENT_0) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_0);
+ pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np);
+ blade_update_min_max(np, 0, s_mask, &sock_min, &sock_max);
+ }
+ if (UVH_NODE_PRESENT_1) {
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_1);
+ pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np);
+ blade_update_min_max(np, 64, s_mask, &sock_min, &sock_max);
+ }
- PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
- pdev->devfn, decode, command_bits, change_bridge);
+ /* Only update if we actually found some bits indicating blades present */
+ if (sock_max >= sock_min) {
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ uv_pb = sock_max - sock_min + 1;
+ }
+ if (uv_possible_blades != uv_pb)
+ uv_possible_blades = uv_pb;
- if (!change_bridge)
- return 0;
+ pr_info("UV: number nodes/possible blades %d (%d - %d)\n",
+ uv_pb, sock_min, sock_max);
+}
- if ((command_bits & PCI_COMMAND_IO) == 0)
- return 0;
+static int __init alloc_conv_table(int num_elem, unsigned short **table)
+{
+ int i;
+ size_t bytes;
+
+ bytes = num_elem * sizeof(*table[0]);
+ *table = kmalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!*table))
+ return -ENOMEM;
+ for (i = 0; i < num_elem; i++)
+ ((unsigned short *)*table)[i] = SOCK_EMPTY;
+ return 0;
+}
- domain = pci_domain_nr(pdev->bus);
- bus = pdev->bus->number;
+/* Remove conversion table if it's 1:1 */
+#define FREE_1_TO_1_TABLE(tbl, min, max, max2) free_1_to_1_table(&tbl, #tbl, min, max, max2)
- rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
- PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
+static void __init free_1_to_1_table(unsigned short **tp, char *tname, int min, int max, int max2)
+{
+ int i;
+ unsigned short *table = *tp;
- return rc;
+ if (table == NULL)
+ return;
+ if (max != max2)
+ return;
+ for (i = 0; i < max; i++) {
+ if (i != table[i])
+ return;
+ }
+ kfree(table);
+ *tp = NULL;
+ pr_info("UV: %s is 1:1, conversion table removed\n", tname);
}
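
The point of free_1_to_1_table() is that an identity table carries no information, so it can be dropped and lookups can fall back to the index itself. A hypothetical userspace sketch of the same trade:

#include <stdio.h>
#include <stdlib.h>

/* If every slot maps to its own index, drop the table */
static unsigned short *maybe_free_identity(unsigned short *tbl, int n)
{
	for (int i = 0; i < n; i++)
		if (tbl[i] != i)
			return tbl;	/* not 1:1, keep it */
	free(tbl);
	return NULL;			/* caller treats NULL as identity */
}

static int lookup(const unsigned short *tbl, int i)
{
	return tbl ? tbl[i] : i;	/* NULL table == identity map */
}

int main(void)
{
	unsigned short *t = malloc(4 * sizeof(*t));

	for (int i = 0; i < 4; i++)
		t[i] = i;
	t = maybe_free_identity(t, 4);
	printf("lookup(2) = %d\n", lookup(t, 2));	/* 2, via identity */
	return 0;
}
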
/*
- * Called on each cpu to initialize the per_cpu UV data area.
- * FIXME: hotplug not supported yet
+ * Build Socket Tables
+ * If there is more than one node per socket, the socket-to-node table
+ * will contain the lowest node number on that socket.
*/
-void __cpuinit uv_cpu_init(void)
+static void __init build_socket_tables(void)
{
- /* CPU 0 initilization will be done via uv_system_init. */
- if (!uv_blade_info)
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ int nums, numn, nump;
+ int i, lnid, apicid;
+ int minsock = _min_socket;
+ int maxsock = _max_socket;
+ int minpnode = _min_pnode;
+ int maxpnode = _max_pnode;
+
+ if (!gre) {
+ if (is_uv2_hub() || is_uv3_hub()) {
+ pr_info("UV: No UVsystab socket table, ignoring\n");
+ return;
+ }
+ pr_err("UV: Error: UVsystab address translations not available!\n");
+ WARN_ON_ONCE(!gre);
return;
+ }
+
+ numn = num_possible_nodes();
+ nump = maxpnode - minpnode + 1;
+ nums = maxsock - minsock + 1;
+
+ /* Allocate and clear tables */
+ if ((alloc_conv_table(nump, &_pnode_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_pnode) < 0)
+ || (alloc_conv_table(numn, &_node_to_socket) < 0)
+ || (alloc_conv_table(nums, &_socket_to_node) < 0)) {
+ kfree(_pnode_to_socket);
+ kfree(_socket_to_pnode);
+ kfree(_node_to_socket);
+ return;
+ }
+
+ /* Fill in pnode/node/addr conversion list values: */
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ continue;
+ i = gre->sockid - minsock;
+ if (_socket_to_pnode[i] == SOCK_EMPTY)
+ _socket_to_pnode[i] = gre->pnode;
+
+ i = gre->pnode - minpnode;
+ if (_pnode_to_socket[i] == SOCK_EMPTY)
+ _pnode_to_socket[i] = gre->sockid;
+
+ pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
+ gre->sockid, gre->type, gre->nasid,
+ _socket_to_pnode[gre->sockid - minsock],
+ _pnode_to_socket[gre->pnode - minpnode]);
+ }
+
+ /* Set socket -> node values: */
+ lnid = NUMA_NO_NODE;
+ for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) {
+ int nid = __apicid_to_node[apicid];
+ int sockid;
+
+ if ((nid == NUMA_NO_NODE) || (lnid == nid))
+ continue;
+ lnid = nid;
+
+ sockid = apicid >> uv_cpuid.socketid_shift;
+
+ if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
+ _socket_to_node[sockid - minsock] = nid;
+
+ if (_node_to_socket[nid] == SOCK_EMPTY)
+ _node_to_socket[nid] = sockid;
+
+ pr_info("UV: sid:%02x: apicid:%04x socket:%02d node:%03x s2n:%03x\n",
+ sockid,
+ apicid,
+ _node_to_socket[nid],
+ nid,
+ _socket_to_node[sockid - minsock]);
+ }
- uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+ /*
+ * If, e.g., socket id == pnode for all pnodes, the system runs
+ * faster when the corresponding conversion table is removed.
+ */
+ FREE_1_TO_1_TABLE(_socket_to_node, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_node_to_socket, _min_socket, nums, numn);
+ FREE_1_TO_1_TABLE(_socket_to_pnode, _min_pnode, nums, nump);
+ FREE_1_TO_1_TABLE(_pnode_to_socket, _min_pnode, nums, nump);
+}
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
+/* Check which reboot to use */
+static void check_efi_reboot(void)
+{
+ /* If EFI reboot not available, use ACPI reboot */
+ if (!efi_enabled(EFI_BOOT))
+ reboot_type = BOOT_ACPI;
}
/*
- * When NMI is received, print a stack trace.
+ * User /proc file handling is now deprecated.
+ * Recommend using /sys/firmware/sgi_uv/... instead.
*/
-int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
+static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
{
- if (reason != DIE_NMI_IPI)
- return NOTIFY_OK;
- /*
- * Use a lock so only one cpu prints at a time
- * to prevent intermixed output.
- */
- spin_lock(&uv_nmi_lock);
- pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
- dump_stack();
- spin_unlock(&uv_nmi_lock);
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n",
+ current->comm);
+ seq_printf(file, "0x%x\n", uv_hubbed_system);
+ return 0;
+}
- return NOTIFY_STOP;
+static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n",
+ current->comm);
+ seq_printf(file, "0x%x\n", uv_hubless_system);
+ return 0;
}
-static struct notifier_block uv_dump_stack_nmi_nb = {
- .notifier_call = uv_handle_nmi
-};
+static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data)
+{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n",
+ current->comm);
+ seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id);
+ return 0;
+}
-void uv_register_nmi_notifier(void)
+static __init void uv_setup_proc_files(int hubless)
{
- if (register_die_notifier(&uv_dump_stack_nmi_nb))
- printk(KERN_WARNING "UV NMI handler failed to register\n");
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(UV_PROC_NODE, NULL);
+ proc_create_single("archtype", 0, pde, proc_archtype_show);
+ if (hubless)
+ proc_create_single("hubless", 0, pde, proc_hubless_show);
+ else
+ proc_create_single("hubbed", 0, pde, proc_hubbed_show);
}
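
uv_setup_proc_files() leans on proc_create_single(), which wires a single seq_file show callback to a read-only /proc entry with no file_operations boilerplate. A minimal sketch against the in-kernel API (the entry names and the printed value are made up):

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "0x%x\n", 0x5);	/* hypothetical value */
	return 0;
}

static int __init demo_proc_init(void)
{
	struct proc_dir_entry *pde = proc_mkdir("demo_uv", NULL);

	if (!pde)
		return -ENOMEM;
	proc_create_single("archtype", 0, pde, demo_show);
	return 0;
}
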
-void uv_nmi_init(void)
+/* Initialize UV hubless systems */
+static __init int uv_system_init_hubless(void)
{
- unsigned int value;
+ int rc;
- /*
- * Unmask NMI on all cpus
- */
- value = apic_read(APIC_LVT1) | APIC_DM_NMI;
- value &= ~APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
+ /* Setup PCH NMI handler */
+ uv_nmi_setup_hubless();
+
+ /* Init kernel/BIOS interface */
+ rc = uv_bios_init();
+ if (rc < 0)
+ return rc;
+
+ /* Process UVsystab */
+ rc = decode_uv_systab();
+ if (rc < 0)
+ return rc;
+
+ /* Set section block size for current node memory */
+ set_block_size();
+
+ /* Create user access node */
+ if (rc >= 0)
+ uv_setup_proc_files(1);
+
+ check_efi_reboot();
+
+ return rc;
}
-void __init uv_system_init(void)
+static void __init uv_system_init_hub(void)
{
- union uvh_si_addr_map_config_u m_n_config;
- union uvh_node_id_u node_id;
- unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
- int gnode_extra, max_pnode = 0;
- unsigned long mmr_base, present, paddr;
- unsigned short pnode_mask;
+ struct uv_hub_info_s hub_info = {0};
+ int bytes, cpu, nodeid, bid;
+ unsigned short min_pnode = USHRT_MAX, max_pnode = 0;
+ char *hub = is_uv5_hub() ? "UV500" :
+ is_uv4_hub() ? "UV400" :
+ is_uv3_hub() ? "UV300" :
+ is_uv2_hub() ? "UV2000/3000" : NULL;
+ struct uv_hub_info_s **uv_hub_info_list_blade;
+
+ if (!hub) {
+ pr_err("UV: Unknown/unsupported UV hub\n");
+ return;
+ }
+ pr_info("UV: Found %s hub\n", hub);
map_low_mmrs();
- m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
- m_val = m_n_config.s.m_skt;
- n_val = m_n_config.s.n_skt;
- mmr_base =
- uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
- ~UV_MMR_ENABLE;
- pnode_mask = (1 << n_val) - 1;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
- gnode_upper = ((unsigned long)gnode_extra << m_val);
- printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
- n_val, m_val, gnode_upper, gnode_extra);
-
- printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
-
- for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
- uv_possible_blades +=
- hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
- printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
-
- bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
- uv_blade_info = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!uv_blade_info);
- for (blade = 0; blade < uv_num_possible_blades(); blade++)
- uv_blade_info[blade].memory_nid = -1;
-
- get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
-
- bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
- uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!uv_node_to_blade);
- memset(uv_node_to_blade, 255, bytes);
-
- bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
- uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
- BUG_ON(!uv_cpu_to_blade);
- memset(uv_cpu_to_blade, 255, bytes);
-
- blade = 0;
- for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
- present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
- for (j = 0; j < 64; j++) {
- if (!test_bit(j, &present))
- continue;
- uv_blade_info[blade].pnode = (i * 64 + j);
- uv_blade_info[blade].nr_possible_cpus = 0;
- uv_blade_info[blade].nr_online_cpus = 0;
- blade++;
- }
+ /* Get uv_systab for decoding, setup UV BIOS calls */
+ uv_bios_init();
+
+	/* If there's a UVsystab problem, abort UV init: */
+ if (decode_uv_systab() < 0) {
+ pr_err("UV: Mangled UVsystab format\n");
+ return;
}
- uv_bios_init();
- uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
- &sn_region_size, &system_serial_number);
+ build_socket_tables();
+ build_uv_gr_table();
+ set_block_size();
+ uv_init_hub_info(&hub_info);
+	/* On UV2 or UV3 we may need to get the blade count from HW */
+ if (is_uv(UV2|UV3) && !uv_gre_table)
+ boot_init_possible_blades(&hub_info);
+ else
+ /* min/max sockets set in decode_gam_rng_tbl */
+ uv_possible_blades = (_max_socket - _min_socket) + 1;
+
+ /* uv_num_possible_blades() is really the hub count: */
+ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
+
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number);
+ hub_info.coherency_domain_number = sn_coherency_id;
uv_rtc_init();
- for_each_present_cpu(cpu) {
+ /*
+ * __uv_hub_info_list[] is indexed by node, but there is only
+ * one hub_info structure per blade. First, allocate one
+ * structure per blade. Further down we create a per-node
+ * table (__uv_hub_info_list[]) pointing to hub_info
+ * structures for the correct blade.
+ */
+
+ bytes = sizeof(void *) * uv_num_possible_blades();
+ uv_hub_info_list_blade = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!uv_hub_info_list_blade))
+ return;
+
+ bytes = sizeof(struct uv_hub_info_s);
+ for_each_possible_blade(bid) {
+ struct uv_hub_info_s *new_hub;
+
+ /* Allocate & fill new per hub info list */
+ new_hub = (bid == 0) ? &uv_hub_info_node0
+ : kzalloc_node(bytes, GFP_KERNEL, uv_blade_to_node(bid));
+ if (WARN_ON_ONCE(!new_hub)) {
+ /* do not kfree() bid 0, which is statically allocated */
+ while (--bid > 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
+ }
+
+ uv_hub_info_list_blade[bid] = new_hub;
+ *new_hub = hub_info;
+
+ /* Use information from GAM table if available: */
+ if (uv_gre_table)
+ new_hub->pnode = uv_blade_to_pnode(bid);
+ else /* Or fill in during CPU loop: */
+ new_hub->pnode = 0xffff;
+
+ new_hub->numa_blade_id = bid;
+ new_hub->memory_nid = NUMA_NO_NODE;
+ new_hub->nr_possible_cpus = 0;
+ new_hub->nr_online_cpus = 0;
+ }
+
+ /*
+ * Now populate __uv_hub_info_list[] for each node with the
+ * pointer to the struct for the blade it resides on.
+ */
+
+ bytes = sizeof(void *) * num_possible_nodes();
+ __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
+ if (WARN_ON_ONCE(!__uv_hub_info_list)) {
+ for_each_possible_blade(bid)
+ /* bid 0 is statically allocated */
+ if (bid != 0)
+ kfree(uv_hub_info_list_blade[bid]);
+ kfree(uv_hub_info_list_blade);
+ return;
+ }
+
+ for_each_node(nodeid)
+ __uv_hub_info_list[nodeid] = uv_hub_info_list_blade[uv_node_to_blade_id(nodeid)];
+
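
The net effect of the loops above is a many-to-one indirection: one hub_info structure per blade, plus a per-node pointer table so every NUMA node on a blade shares that blade's single structure. A toy model of the layout (all names hypothetical):

#include <stdio.h>

struct hub_info { int pnode; int nr_cpus; };

int main(void)
{
	struct hub_info blade[2] = { { .pnode = 0 }, { .pnode = 1 } };
	/* nodes 0,1 live on blade 0; nodes 2,3 on blade 1 */
	struct hub_info *node_to_hub[4] = {
		&blade[0], &blade[0], &blade[1], &blade[1],
	};

	node_to_hub[3]->nr_cpus++;	/* visible to all of blade 1's nodes */
	printf("node 2 sees nr_cpus=%d\n", node_to_hub[2]->nr_cpus);
	return 0;
}
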
+ /* Initialize per CPU info: */
+ for_each_possible_cpu(cpu) {
int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned short bid;
+ unsigned short pnode;
- nid = cpu_to_node(cpu);
pnode = uv_apicid_to_pnode(apicid);
- blade = boot_pnode_to_blade(pnode);
- lcpu = uv_blade_info[blade].nr_possible_cpus;
- uv_blade_info[blade].nr_possible_cpus++;
-
- /* Any node on the blade, else will contain -1. */
- uv_blade_info[blade].memory_nid = nid;
-
- uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
- uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
- uv_cpu_hub_info(cpu)->m_val = m_val;
- uv_cpu_hub_info(cpu)->n_val = n_val;
- uv_cpu_hub_info(cpu)->numa_blade_id = blade;
- uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
- uv_cpu_hub_info(cpu)->pnode = pnode;
- uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
- uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
- uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
- uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
- uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
- uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
- uv_node_to_blade[nid] = blade;
- uv_cpu_to_blade[cpu] = blade;
- max_pnode = max(pnode, max_pnode);
+ bid = uv_pnode_to_socket(pnode) - _min_socket;
+
+ uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list_blade[bid];
+ uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
+ uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
+
+ if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
+ uv_cpu_hub_info(cpu)->pnode = pnode;
}
- /* Add blade/pnode info for nodes without cpus */
- for_each_online_node(nid) {
- if (uv_node_to_blade[nid] >= 0)
+ for_each_possible_blade(bid) {
+ unsigned short pnode = uv_hub_info_list_blade[bid]->pnode;
+
+ if (pnode == 0xffff)
continue;
- paddr = node_start_pfn(nid) << PAGE_SHIFT;
- paddr = uv_soc_phys_ram_to_gpa(paddr);
- pnode = (paddr >> m_val) & pnode_mask;
- blade = boot_pnode_to_blade(pnode);
- uv_node_to_blade[nid] = blade;
+
+ min_pnode = min(pnode, min_pnode);
max_pnode = max(pnode, max_pnode);
+ pr_info("UV: HUB:%2d pn:%02x nrcpus:%d\n",
+ bid,
+ uv_hub_info_list_blade[bid]->pnode,
+ uv_hub_info_list_blade[bid]->nr_possible_cpus);
}
+ pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode);
map_gru_high(max_pnode);
map_mmr_high(max_pnode);
- map_mmioh_high(max_pnode);
+ map_mmioh_high(min_pnode, max_pnode);
+ kfree(uv_hub_info_list_blade);
+ uv_hub_info_list_blade = NULL;
+
+ uv_nmi_setup();
uv_cpu_init();
- uv_scir_register_cpu_notifier();
- uv_register_nmi_notifier();
- proc_mkdir("sgi_uv", NULL);
+ uv_setup_proc_files(0);
- /* register Legacy VGA I/O redirection handler */
+ /* Register Legacy VGA I/O redirection handler: */
pci_register_set_vga_state(uv_set_vga_state);
+
+ check_efi_reboot();
}
+
+/*
+ * A different code path is needed to initialize a UV system that does
+ * not have a "UV HUB" (referred to as "hubless").
+ */
+void __init uv_system_init(void)
+{
+ if (likely(!is_uv_system() && !is_uv_hubless(1)))
+ return;
+
+ if (is_uv_system())
+ uv_system_init_hub();
+ else
+ uv_system_init_hubless();
+}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c4f9182ca3ac..b37ab1095707 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*-
* APM BIOS driver for Linux
* Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
@@ -5,16 +6,6 @@
* Initial development of this driver was funded by NEC Australia P/L
* and NEC Corporation
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
* October 1995, Rik Faith (faith@cs.unc.edu):
* Minor enhancements and updates (to the patch set) for 1.3.x
* Documentation
@@ -66,7 +57,7 @@
* 1.5: Fix segment register reloading (in case of bad segments saved
* across BIOS call).
* Stephen Rothwell
- * 1.6: Cope with complier/assembler differences.
+ * 1.6: Cope with compiler/assembler differences.
* Only try to turn off the first display device.
* Fix OOPS at power off with no APM BIOS by Jan Echternach
* <echter@informatik.uni-rostock.de>
@@ -103,7 +94,7 @@
* Remove APM dependencies in arch/i386/kernel/process.c
* Remove APM dependencies in drivers/char/sysrq.c
* Reset time across standby.
- * Allow more inititialisation on SMP.
+ * Allow more initialisation on SMP.
* Remove CONFIG_APM_POWER_OFF and make it boot time
* configurable (default on).
* Make debug only a boot time parameter (remove APM_DEBUG).
@@ -140,7 +131,7 @@
* is now the way life works).
* Fix thinko in suspend() (wrong return).
* Notify drivers on critical suspend.
- * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
* modified by sfr).
* Disable interrupts while we are suspended (Andy Henroid
* <andy_henroid@yahoo.com> fixed by sfr).
@@ -189,8 +180,8 @@
* Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
*
* [This document is available free from Intel by calling 800.628.8686 (fax
- * 916.356.6100) or 800.548.4725; or via anonymous ftp from
- * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also
+ * 916.356.6100) or 800.548.4725; or from
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx. It is also
* available from Microsoft by calling 206.882.8080.]
*
* APM 1.2 Reference:
@@ -201,6 +192,8 @@
* http://www.microsoft.com/whdc/archive/amp_12.mspx]
*/
+#define pr_fmt(fmt) "apm: " fmt
+
#include <linux/module.h>
#include <linux/poll.h>
@@ -216,7 +209,8 @@
#include <linux/apm_bios.h>
#include <linux/init.h>
#include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/pm.h>
#include <linux/capability.h>
#include <linux/device.h>
@@ -227,28 +221,24 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
+#include <linux/cpuidle.h>
-#include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
-#include <asm/i8253.h>
#include <asm/olpc.h>
#include <asm/paravirt.h>
#include <asm/reboot.h>
+#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
extern int (*console_blank_hook)(int);
#endif
/*
- * The apm_bios device is one of the misc char devices.
- * This is its minor number.
- */
-#define APM_MINOR_DEV 134
-
-/*
- * See Documentation/Config.help for the configuration options.
- *
* Various options can be changed at boot time as follows:
* (We allow underscores for compatibility with the modules code)
* apm=on/off enable/disable APM
@@ -365,38 +355,58 @@ struct apm_user {
#endif
#define DEFAULT_IDLE_PERIOD (100 / 3)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
+
+static struct cpuidle_driver apm_idle_driver = {
+ .name = "apm_idle",
+ .owner = THIS_MODULE,
+ .states = {
+ { /* entry 0 is for polling */ },
+ { /* entry 1 is for APM idle */
+ .name = "APM",
+ .desc = "APM idle",
+ .exit_latency = 250, /* WAG */
+ .target_residency = 500, /* WAG */
+ .enter = &apm_cpu_idle
+ },
+ },
+ .state_count = 2,
+};
+
+static struct cpuidle_device apm_cpuidle_device;
+
/*
* Local variables
*/
-static struct {
+__visible struct {
unsigned long offset;
unsigned short segment;
} apm_bios_entry;
static int clock_slowed;
static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
-static int set_pm_idle;
static int suspends_pending;
static int standbys_pending;
static int ignore_sys_suspend;
static int ignore_normal_resume;
static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
static int apm_disabled = -1;
#ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
#else
-static int power_off = 1;
+static bool power_off = 1;
#endif
-static int realmode_power_off;
+static bool realmode_power_off;
#ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
#else
-static int allow_ints;
+static bool allow_ints;
#endif
-static int broken_psr;
+static bool broken_psr;
static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
@@ -410,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex);
* This is for buggy BIOS's that refer to (real mode) segment 0x40
* even though they are called in protected mode.
*/
-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS,
(unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
static const char driver_version[] = "1.16ac"; /* no spaces */
@@ -485,11 +495,11 @@ static void apm_error(char *str, int err)
if (error_table[i].key == err)
break;
if (i < ERROR_COUNT)
- printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+ pr_notice("%s: %s\n", str, error_table[i].msg);
else if (err < 0)
- printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
+ pr_notice("%s: linux error code %i\n", str, err);
else
- printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+ pr_notice("%s: unknown error code %#2.2x\n",
str, err);
}
@@ -583,19 +593,24 @@ static long __apm_bios_call(void *_call)
struct desc_struct save_desc_40;
struct desc_struct *gdt;
struct apm_bios_call *call = _call;
+ u64 ibt;
cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
apm_bios_call_asm(call->func, call->ebx, call->ecx,
&call->eax, &call->ebx, &call->ecx, &call->edx,
&call->esi);
APM_DO_RESTORE_SEGS;
+ ibt_restore(ibt);
+ firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
put_cpu();
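
__apm_bios_call() brackets the BIOS entry with strict save/substitute/call/restore steps (the segment 0x40 descriptor, speculation controls, IBT), unwound in reverse order. A generic, userspace-only sketch of that bracket shape (no real descriptors are touched here; everything is illustrative):

#include <stdio.h>

struct desc { unsigned long base, limit; };

static struct desc gdt40;	/* stand-in for gdt[0x40 / 8] */
static const struct desc bad_bios = { 0x400, 0x1000 - 0x400 - 1 };

static void firmware_call(void (*fn)(void))
{
	struct desc saved = gdt40;	/* save clobberable state */

	gdt40 = bad_bios;		/* substitute a safe mapping */
	fn();				/* the untrusted BIOS entry */
	gdt40 = saved;			/* restore in reverse order */
}

static void fake_bios(void)
{
	puts("in BIOS");
}

int main(void)
{
	firmware_call(fake_bios);
	return 0;
}
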
@@ -659,18 +674,23 @@ static long __apm_bios_call_simple(void *_call)
struct desc_struct save_desc_40;
struct desc_struct *gdt;
struct apm_bios_call *call = _call;
+ u64 ibt;
cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
&call->eax);
APM_DO_RESTORE_SEGS;
+ ibt_restore(ibt);
+ firmware_restrict_branch_speculation_end();
apm_irq_restore(flags);
gdt[0x40 / 8] = save_desc_40;
put_cpu();
@@ -747,7 +767,7 @@ static int apm_driver_version(u_short *val)
* not cleared until it is acknowledged.
*
* Additional information is returned in the info pointer, providing
- * that APM 1.2 is in use. If no messges are pending the value 0x80
+ * that APM 1.2 is in use. If no messages are pending the value 0x80
* is returned (No power management events pending).
*/
static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
@@ -818,24 +838,12 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
- polling = !!(current_thread_info()->status & TS_POLLING);
- if (polling) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- }
if (!need_resched()) {
idled = 1;
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
}
- if (polling)
- current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
@@ -882,8 +890,6 @@ static void apm_do_busy(void)
#define IDLE_CALC_LIMIT (HZ * 100)
#define IDLE_LEAKY_MAX 16
-static void (*original_pm_idle)(void) __read_mostly;
-
/**
* apm_cpu_idle - cpu idling for APM capable Linux
*
@@ -892,34 +898,36 @@ static void (*original_pm_idle)(void) __read_mostly;
* Furthermore it calls the system default idle routine.
*/
-static void apm_cpu_idle(void)
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
- static unsigned int last_stime; /* = 0 */
+ static u64 last_stime; /* = 0 */
+ u64 stime, utime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
recalc:
+ task_cputime(current, &utime, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
} else if (jiffies_since_last_check > idle_period) {
unsigned int idle_percentage;
- idle_percentage = current->stime - last_stime;
+ idle_percentage = nsecs_to_jiffies(stime - last_stime);
idle_percentage *= 100;
idle_percentage /= jiffies_since_last_check;
use_apm_idle = (idle_percentage > idle_threshold);
if (apm_info.forbid_idle)
use_apm_idle = 0;
- last_jiffies = jiffies;
- last_stime = current->stime;
}
+ last_jiffies = jiffies;
+ last_stime = stime;
+
bucket = IDLE_LEAKY_MAX;
while (!need_resched()) {
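
The recalc block above decides between the APM BIOS idle call and default_idle() from the share of the check window accounted as system time (the kernel converts the stime delta from nanoseconds with nsecs_to_jiffies()). A worked example with hypothetical numbers, taking the delta as already converted:

#include <stdio.h>

int main(void)
{
	unsigned int jiffies_since_last_check = 40;	/* window length */
	unsigned int stime_jiffies = 38;	/* stime delta, in jiffies */
	unsigned int idle_threshold = 95;	/* DEFAULT_IDLE_THRESHOLD */
	unsigned int idle_percentage;

	idle_percentage = stime_jiffies * 100 / jiffies_since_last_check;
	/* 95% is not strictly above the 95% threshold: APM idle stays off */
	printf("idle %u%% -> use_apm_idle=%d\n", idle_percentage,
	       idle_percentage > idle_threshold);
	return 0;
}
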
@@ -947,10 +955,7 @@ recalc:
break;
}
}
- if (original_pm_idle)
- original_pm_idle();
- else
- default_idle();
+ default_idle();
local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
@@ -960,7 +965,7 @@ recalc:
if (apm_idle_done)
apm_do_busy();
- local_irq_enable();
+ return index;
}
/**
@@ -975,20 +980,10 @@ recalc:
static void apm_power_off(void)
{
- unsigned char po_bios_call[] = {
- 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
- 0x8e, 0xd0, /* movw ax,ss */
- 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
- 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
- 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
- 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
- 0xcd, 0x15 /* int $0x15 */
- };
-
/* Some bioses don't like being called from CPU != 0 */
if (apm_info.realmode_power_off) {
set_cpus_allowed_ptr(current, cpumask_of(0));
- machine_real_restart(po_bios_call, sizeof(po_bios_call));
+ machine_real_restart(MRR_APM);
} else {
(void)set_system_power_state(APM_STATE_OFF);
}
@@ -1031,7 +1026,7 @@ static int apm_enable_power_management(int enable)
* status which gives the rough battery status, and current power
* source. The bat value returned give an estimate as a percentage
* of life and a status value for the battery. The estimated life
- * if reported is a lifetime in secodnds/minutes at current powwer
+ * if reported is a lifetime in seconds/minutes at current power
* consumption.
*/
@@ -1045,8 +1040,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
if (apm_info.get_power_status_broken)
return APM_32_UNSUPPORTED;
- if (apm_bios_call(&call))
+ if (apm_bios_call(&call)) {
+ if (!call.err)
+ return APM_NO_ERROR;
return call.err;
+ }
*status = call.ebx;
*bat = call.ecx;
if (apm_info.get_power_status_swabinminutes) {
@@ -1057,41 +1055,12 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
return APM_SUCCESS;
}
-#if 0
-static int apm_get_battery_status(u_short which, u_short *status,
- u_short *bat, u_short *life, u_short *nbat)
-{
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
-
- if (apm_info.connection_version < 0x0102) {
- /* pretend we only have one battery. */
- if (which != 1)
- return APM_BAD_DEVICE;
- *nbat = 1;
- return apm_get_power_status(status, bat, life);
- }
-
- if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
- &ebx, &ecx, &edx, &esi))
- return (eax >> 8) & 0xff;
- *status = ebx;
- *bat = ecx;
- *life = edx;
- *nbat = esi;
- return APM_SUCCESS;
-}
-#endif
-
/**
* apm_engage_power_management - enable PM on a device
* @device: identity of device
* @enable: on/off
*
- * Activate or deactive power management on either a specific device
+ * Activate or deactivate power management on either a specific device
* or the entire system (%APM_DEVICE_ALL).
*/
@@ -1193,7 +1162,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
static int notified;
if (notified++ == 0)
- printk(KERN_ERR "apm: an event queue overflowed\n");
+ pr_err("an event queue overflowed\n");
if (++as->event_tail >= APM_MAX_EVENTS)
as->event_tail = 0;
}
@@ -1226,11 +1195,11 @@ static void reinit_timer(void)
raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
- outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
- outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
udelay(10);
- outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
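
reinit_timer() reprograms PIT channel 0 back to the kernel tick rate, writing the LATCH divisor LSB first, then MSB. The worked numbers below assume the canonical 1193182 Hz PIT input clock and HZ=100 (both assumptions, typical for i386 configs):

#include <stdio.h>

#define PIT_TICK_RATE	1193182UL
#define HZ		100

int main(void)
{
	/* Rounded divisor, as in the kernel's LATCH definition */
	unsigned long latch = (PIT_TICK_RATE + HZ / 2) / HZ;

	printf("LATCH=%lu -> LSB=0x%02lx MSB=0x%02lx\n",
	       latch, latch & 0xff, latch >> 8);	/* 11932 -> 0x9c 0x2e */
	return 0;
}
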
@@ -1242,11 +1211,10 @@ static int suspend(int vetoable)
struct apm_user *as;
dpm_suspend_start(PMSG_SUSPEND);
-
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
@@ -1264,12 +1232,12 @@ static int suspend(int vetoable)
apm_error("suspend", err);
err = (err == APM_SUCCESS) ? 0 : -EIO;
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
-
+ dpm_resume_start(PMSG_RESUME);
dpm_resume_end(PMSG_RESUME);
+
queue_event(APM_NORMAL_RESUME, NULL);
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
@@ -1285,10 +1253,10 @@ static void standby(void)
{
int err;
- dpm_suspend_noirq(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
local_irq_disable();
- sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
err = set_system_power_state(APM_STATE_STANDBY);
@@ -1296,10 +1264,10 @@ static void standby(void)
apm_error("standby", err);
local_irq_disable();
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
- dpm_resume_noirq(PMSG_RESUME);
+ dpm_resume_start(PMSG_RESUME);
}
static apm_event_t get_event(void)
@@ -1457,7 +1425,7 @@ static void apm_mainloop(void)
static int check_apm_user(struct apm_user *as, const char *func)
{
if (as == NULL || as->magic != APM_BIOS_MAGIC) {
- printk(KERN_ERR "apm: %s passed bad filp\n", func);
+ pr_err("%s passed bad filp\n", func);
return 1;
}
return 0;
@@ -1506,7 +1474,7 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
return 0;
}
-static unsigned int do_poll(struct file *fp, poll_table *wait)
+static __poll_t do_poll(struct file *fp, poll_table *wait)
{
struct apm_user *as;
@@ -1515,7 +1483,7 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
return 0;
poll_wait(fp, &apm_waitqueue, wait);
if (!queue_empty(as))
- return POLLIN | POLLRDNORM;
+ return EPOLLIN | EPOLLRDNORM;
return 0;
}
@@ -1596,7 +1564,7 @@ static int do_release(struct inode *inode, struct file *filp)
as1 = as1->next)
;
if (as1 == NULL)
- printk(KERN_ERR "apm: filp not in user list\n");
+ pr_err("filp not in user list\n");
else
as1->next = as->next;
}
@@ -1610,11 +1578,9 @@ static int do_open(struct inode *inode, struct file *filp)
struct apm_user *as;
as = kmalloc(sizeof(*as), GFP_KERNEL);
- if (as == NULL) {
- printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
- sizeof(*as));
+ if (as == NULL)
return -ENOMEM;
- }
+
as->magic = APM_BIOS_MAGIC;
as->event_tail = as->event_head = 0;
as->suspends_pending = as->standbys_pending = 0;
@@ -1637,6 +1603,7 @@ static int do_open(struct inode *inode, struct file *filp)
return 0;
}
+#ifdef CONFIG_PROC_FS
static int proc_apm_show(struct seq_file *m, void *v)
{
unsigned short bx;
@@ -1716,19 +1683,7 @@ static int proc_apm_show(struct seq_file *m, void *v)
units);
return 0;
}
-
-static int proc_apm_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_apm_show, NULL);
-}
-
-static const struct file_operations apm_file_ops = {
- .owner = THIS_MODULE,
- .open = proc_apm_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+#endif
static int apm(void *unused)
{
@@ -1926,6 +1881,7 @@ static const struct file_operations apm_bios_fops = {
.unlocked_ioctl = do_ioctl,
.open = do_open,
.release = do_release,
+ .llseek = noop_llseek,
};
static struct miscdevice apm_device = {
@@ -2044,7 +2000,7 @@ static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
return 0;
}
-static struct dmi_system_id __initdata apm_dmi_table[] = {
+static const struct dmi_system_id apm_dmi_table[] __initconst = {
{
print_if_true,
KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
@@ -2272,7 +2228,7 @@ static int __init apm_init(void)
dmi_check_system(apm_dmi_table);
- if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
+ if (apm_info.bios.version == 0 || machine_is_olpc()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}
@@ -2322,20 +2278,19 @@ static int __init apm_init(void)
}
if (apm_info.disabled) {
- printk(KERN_NOTICE "apm: disabled on user request.\n");
+ pr_notice("disabled on user request.\n");
return -ENODEV;
}
if ((num_online_cpus() > 1) && !power_off && !smp) {
- printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
+ pr_notice("disabled - APM is not SMP safe.\n");
apm_info.disabled = 1;
return -ENODEV;
}
- if (pm_flags & PM_ACPI) {
- printk(KERN_NOTICE "apm: overridden by ACPI.\n");
+ if (!acpi_disabled) {
+ pr_notice("overridden by ACPI.\n");
apm_info.disabled = 1;
return -ENODEV;
}
- pm_flags |= PM_APM;
/*
* Set up the long jump entry point to the APM BIOS, which is called
@@ -2354,7 +2309,7 @@ static int __init apm_init(void)
* Note we only set APM segments on CPU zero, since we pin the APM
* code to that CPU.
*/
- gdt = get_cpu_gdt_table(0);
+ gdt = get_cpu_gdt_rw(0);
set_desc_base(&gdt[APM_CS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
set_desc_base(&gdt[APM_CS_16 >> 3],
@@ -2362,12 +2317,11 @@ static int __init apm_init(void)
set_desc_base(&gdt[APM_DS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
- proc_create("apm", 0, NULL, &apm_file_ops);
+ proc_create_single("apm", 0, NULL, proc_apm_show);
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {
- printk(KERN_ERR "apm: disabled - Unable to start kernel "
- "thread.\n");
+ pr_err("disabled - Unable to start kernel thread\n");
err = PTR_ERR(kapmd_task);
kapmd_task = NULL;
remove_proc_entry("apm", NULL);
@@ -2392,9 +2346,10 @@ static int __init apm_init(void)
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
- original_pm_idle = pm_idle;
- pm_idle = apm_cpu_idle;
- set_pm_idle = 1;
+ cpuidle_poll_state_init(&apm_idle_driver);
+ if (!cpuidle_register_driver(&apm_idle_driver))
+ if (cpuidle_register_device(&apm_cpuidle_device))
+ cpuidle_unregister_driver(&apm_idle_driver);
}
return 0;
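
The idle hook is now installed through cpuidle rather than by swapping pm_idle. A sketch of the same register-with-unwind pattern against the in-kernel cpuidle API (the driver and device names are made up):

#include <linux/init.h>
#include <linux/cpuidle.h>

static struct cpuidle_driver demo_idle_driver = {
	.name = "demo_idle",
};
static struct cpuidle_device demo_cpuidle_device;

static int __init demo_idle_init(void)
{
	int ret = cpuidle_register_driver(&demo_idle_driver);

	if (ret)
		return ret;
	ret = cpuidle_register_device(&demo_cpuidle_device);
	if (ret)
		cpuidle_unregister_driver(&demo_idle_driver);	/* unwind */
	return ret;
}
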
@@ -2404,15 +2359,9 @@ static void __exit apm_exit(void)
{
int error;
- if (set_pm_idle) {
- pm_idle = original_pm_idle;
- /*
- * We are about to unload the current idle thread pm callback
- * (pm_idle), Wait for all processors to update cached/local
- * copies of pm_idle before proceeding.
- */
- cpu_idle_wait();
- }
+ cpuidle_unregister_device(&apm_cpuidle_device);
+ cpuidle_unregister_driver(&apm_idle_driver);
+
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
error = apm_engage_power_management(APM_DEVICE_ALL, 0);
@@ -2427,7 +2376,6 @@ static void __exit apm_exit(void)
kthread_stop(kapmd_task);
kapmd_task = NULL;
}
- pm_flags &= ~PM_APM;
}
module_init(apm_init);
@@ -2455,7 +2403,7 @@ MODULE_PARM_DESC(idle_threshold,
"System idle percentage above which to make APM BIOS idle calls");
module_param(idle_period, int, 0444);
MODULE_PARM_DESC(idle_period,
- "Period (in sec/100) over which to caculate the idle percentage");
+ "Period (in sec/100) over which to calculate the idle percentage");
module_param(smp, bool, 0444);
MODULE_PARM_DESC(smp,
"Set this to enable APM use on an SMP platform. Use with caution on older systems");
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..25fcde525c68 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/crypto.h>
+#include <crypto/aria.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <linux/kbuild.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/sigframe.h>
+#include <asm/bootparam.h>
+#include <asm/suspend.h>
+#include <asm/tlbflush.h>
+#include <asm/tdx.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#endif
+
#ifdef CONFIG_X86_32
# include "asm-offsets_32.c"
#else
# include "asm-offsets_64.c"
#endif
+
+static void __used common(void)
+{
+ OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+ OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+ OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+ OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
+ OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+ OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+ OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+
+ BLANK();
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
+ BLANK();
+ OFFSET(pbe_address, pbe, address);
+ OFFSET(pbe_orig_address, pbe, orig_address);
+ OFFSET(pbe_next, pbe, next);
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ BLANK();
+ OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax);
+ OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx);
+ OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx);
+ OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx);
+ OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si);
+ OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di);
+ OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp);
+ OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp);
+ OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip);
+
+ BLANK();
+ OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+ OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
+#endif
+
+ BLANK();
+ OFFSET(TDX_MODULE_rcx, tdx_module_args, rcx);
+ OFFSET(TDX_MODULE_rdx, tdx_module_args, rdx);
+ OFFSET(TDX_MODULE_r8, tdx_module_args, r8);
+ OFFSET(TDX_MODULE_r9, tdx_module_args, r9);
+ OFFSET(TDX_MODULE_r10, tdx_module_args, r10);
+ OFFSET(TDX_MODULE_r11, tdx_module_args, r11);
+ OFFSET(TDX_MODULE_r12, tdx_module_args, r12);
+ OFFSET(TDX_MODULE_r13, tdx_module_args, r13);
+ OFFSET(TDX_MODULE_r14, tdx_module_args, r14);
+ OFFSET(TDX_MODULE_r15, tdx_module_args, r15);
+ OFFSET(TDX_MODULE_rbx, tdx_module_args, rbx);
+ OFFSET(TDX_MODULE_rdi, tdx_module_args, rdi);
+ OFFSET(TDX_MODULE_rsi, tdx_module_args, rsi);
+
+ BLANK();
+ OFFSET(BP_scratch, boot_params, scratch);
+ OFFSET(BP_secure_boot, boot_params, secure_boot);
+ OFFSET(BP_loadflags, boot_params, hdr.loadflags);
+ OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
+ OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_init_size, boot_params, hdr.init_size);
+ OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+ OFFSET(C_PTREGS_SIZE, pt_regs, orig_ax);
+
+ /* TLB state for the entry code */
+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
+ DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
+
+ /* Offset for fields in tss_struct */
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+ OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
+#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
+ /* Offset for fields in aria_ctx */
+ BLANK();
+ OFFSET(ARIA_CTX_enc_key, aria_ctx, enc_key);
+ OFFSET(ARIA_CTX_dec_key, aria_ctx, dec_key);
+ OFFSET(ARIA_CTX_rounds, aria_ctx, rounds);
+#endif
+
+ BLANK();
+ DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr));
+ DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry));
+}
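
For readers unfamiliar with asm-offsets: OFFSET()/DEFINE() never run at runtime. Each expands to an asm statement whose immediate operand is a compile-time constant; Kbuild compiles this file with -S and a sed pass turns the "->" marker lines in the generated assembly into #defines consumable from .S files. A userspace mock of the mechanism (the macro bodies are modeled on include/linux/kbuild.h and should be treated as an approximation):

#include <stddef.h>

#define DEFINE(sym, val) \
	asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

struct pt_regs_demo { long bx, cx; };

void foo(void);
void foo(void)
{
	OFFSET(DEMO_pt_regs_cx, pt_regs_demo, cx);	/* -> 8 on LP64 */
	DEFINE(DEMO_PTREGS_SIZE, sizeof(struct pt_regs_demo));
}

Compile with "cc -S mock.c" and grep the output for "->" to see the constants that the sed script would harvest.
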
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..e0a292db97b2 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,71 +1,17 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#include <linux/crypto.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/personality.h>
-#include <linux/suspend.h>
-#include <linux/kbuild.h>
-#include <asm/ucontext.h>
-#include <asm/sigframe.h>
-#include <asm/pgtable.h>
-#include <asm/fixmap.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/bootparam.h>
-#include <asm/elf.h>
-#include <asm/suspend.h>
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
-#include <xen/interface/xen.h>
+#include <linux/efi.h>
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
+#include <asm/ucontext.h>
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
void foo(void)
{
- OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
- OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
- OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
- OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
- OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
- OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
- OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
- OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
- OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
- BLANK();
-
- OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
- OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
- OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
- OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
- OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math);
- OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
- OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
- OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
- BLANK();
-
- OFFSET(TI_task, thread_info, task);
- OFFSET(TI_exec_domain, thread_info, exec_domain);
- OFFSET(TI_flags, thread_info, flags);
- OFFSET(TI_status, thread_info, status);
- OFFSET(TI_preempt_count, thread_info, preempt_count);
- OFFSET(TI_addr_limit, thread_info, addr_limit);
- OFFSET(TI_restart_block, thread_info, restart_block);
- OFFSET(TI_sysenter_return, thread_info, sysenter_return);
- OFFSET(TI_cpu, thread_info, cpu);
- BLANK();
-
- OFFSET(GDS_size, desc_ptr, size);
- OFFSET(GDS_address, desc_ptr, address);
- BLANK();
-
OFFSET(PT_EBX, pt_regs, bx);
OFFSET(PT_ECX, pt_regs, cx);
OFFSET(PT_EDX, pt_regs, dx);
@@ -85,67 +31,19 @@ void foo(void)
OFFSET(PT_OLDSS, pt_regs, ss);
BLANK();
- OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
- OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
- BLANK();
-
- OFFSET(pbe_address, pbe, address);
- OFFSET(pbe_orig_address, pbe, orig_address);
- OFFSET(pbe_next, pbe, next);
-
- /* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- sizeof(struct tss_struct));
-
- DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
- DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
- DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
- DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
- DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
-
- OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
-#ifdef CONFIG_PARAVIRT
+ OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
- OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
- OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
- OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
- OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
- OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
- OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
- OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
-#endif
-#ifdef CONFIG_XEN
- BLANK();
- OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
- OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#endif
-
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
- BLANK();
- OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
- OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
- OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
-
- BLANK();
- OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
- OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
- OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
- OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
- OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
- OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
- OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
- OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
- OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
- OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
+ /*
+ * Offset from the entry stack to task stack stored in TSS. Kernel entry
+ * happens on the per-cpu entry-stack, and the asm code switches to the
+ * task-stack pointer stored in x86_tss.sp1, which is a copy of
+ * task->thread.sp0 where entry code can find it.
+ */
+ DEFINE(TSS_entry2task_stack,
+ offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
+ offsetofend(struct cpu_entry_area, entry_stack_page.stack));
BLANK();
- OFFSET(BP_scratch, boot_params, scratch);
- OFFSET(BP_loadflags, boot_params, hdr.loadflags);
- OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
- OFFSET(BP_version, boot_params, hdr.version);
- OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map));
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..590b6cd0eac0 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,94 +1,31 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed to extract
- * and format the required data.
- */
-#define COMPILE_OFFSETS
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
-#include <linux/crypto.h>
-#include <linux/sched.h>
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/hardirq.h>
-#include <linux/suspend.h>
-#include <linux/kbuild.h>
-#include <asm/processor.h>
-#include <asm/segment.h>
-#include <asm/thread_info.h>
#include <asm/ia32.h>
-#include <asm/bootparam.h>
-#include <asm/suspend.h>
-
-#include <xen/interface/xen.h>
-
-#include <asm/sigframe.h>
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_UNISTD_64_H
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
-};
+#if defined(CONFIG_KVM_GUEST)
+#include <asm/kvm_para.h>
+#endif
int main(void)
{
-#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
- ENTRY(state);
- ENTRY(flags);
- ENTRY(pid);
- BLANK();
-#undef ENTRY
-#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
- ENTRY(flags);
- ENTRY(addr_limit);
- ENTRY(preempt_count);
- ENTRY(status);
-#ifdef CONFIG_IA32_EMULATION
- ENTRY(sysenter_return);
-#endif
- BLANK();
-#undef ENTRY
#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
+#endif
+#endif
BLANK();
- OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
- OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
- OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
- OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
- OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
- OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
- OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
- OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
- OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
- OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
- OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
#endif
-
-#ifdef CONFIG_IA32_EMULATION
-#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
- ENTRY(ax);
- ENTRY(bx);
- ENTRY(cx);
- ENTRY(dx);
- ENTRY(si);
- ENTRY(di);
- ENTRY(bp);
- ENTRY(sp);
- ENTRY(ip);
- BLANK();
-#undef ENTRY
- DEFINE(IA32_RT_SIGFRAME_sigcontext,
- offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
+#if defined(CONFIG_KVM_GUEST)
+ OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK();
#endif
- DEFINE(pbe_address, offsetof(struct pbe, address));
- DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
- DEFINE(pbe_next, offsetof(struct pbe, next));
- BLANK();
-#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
- ENTRY(bx);
+
+#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
ENTRY(bx);
ENTRY(cx);
ENTRY(dx);
@@ -107,34 +44,15 @@ int main(void)
ENTRY(flags);
BLANK();
#undef ENTRY
-#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
+
+#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
ENTRY(cr0);
ENTRY(cr2);
ENTRY(cr3);
ENTRY(cr4);
- ENTRY(cr8);
+ ENTRY(gdt_desc);
BLANK();
#undef ENTRY
- DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
- BLANK();
- DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
- BLANK();
- DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
-
- BLANK();
- OFFSET(BP_scratch, boot_params, scratch);
- OFFSET(BP_loadflags, boot_params, hdr.loadflags);
- OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
- OFFSET(BP_version, boot_params, hdr.version);
- OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
- BLANK();
- DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-#ifdef CONFIG_XEN
- BLANK();
- OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
- OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
-#undef ENTRY
-#endif
return 0;
}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..190c120f4285 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -1,7 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <asm/unistd.h>
+#include <asm/audit.h>
static unsigned dir_class[] = {
#include <asm-generic/audit_dir_write.h>
@@ -40,30 +42,27 @@ int audit_classify_arch(int arch)
int audit_classify_syscall(int abi, unsigned syscall)
{
#ifdef CONFIG_IA32_EMULATION
- extern int ia32_classify_syscall(unsigned);
if (abi == AUDIT_ARCH_I386)
return ia32_classify_syscall(syscall);
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_execve:
- return 5;
+ case __NR_execveat:
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
- extern __u32 ia32_dir_class[];
- extern __u32 ia32_write_class[];
- extern __u32 ia32_read_class[];
- extern __u32 ia32_chattr_class[];
- extern __u32 ia32_signal_class[];
audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
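
The bare 2/3/5 return values give way to named classes; the shared enum in include/linux/audit.h looks roughly like this (a sketch, not a verbatim quote):

enum auditsc_class_t {
	AUDITSC_NATIVE = 0,
	AUDITSC_COMPAT,
	AUDITSC_OPEN,
	AUDITSC_OPENAT,
	AUDITSC_SOCKETCALL,
	AUDITSC_EXECVE,
	AUDITSC_OPENAT2,

	AUDITSC_NVALS /* count */
};
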
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
deleted file mode 100644
index 8bc57baaa9ad..000000000000
--- a/arch/x86/kernel/bios_uv.c
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * BIOS run time interface routines.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson <rja@sgi.com>
- */
-
-#include <linux/efi.h>
-#include <asm/efi.h>
-#include <linux/io.h>
-#include <asm/uv/bios.h>
-#include <asm/uv/uv_hub.h>
-
-static struct uv_systab uv_systab;
-
-s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
-{
- struct uv_systab *tab = &uv_systab;
- s64 ret;
-
- if (!tab->function)
- /*
- * BIOS does not support UV systab
- */
- return BIOS_STATUS_UNIMPLEMENTED;
-
- ret = efi_call6((void *)__va(tab->function), (u64)which,
- a1, a2, a3, a4, a5);
- return ret;
-}
-EXPORT_SYMBOL_GPL(uv_bios_call);
-
-s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
- u64 a4, u64 a5)
-{
- unsigned long bios_flags;
- s64 ret;
-
- local_irq_save(bios_flags);
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
- local_irq_restore(bios_flags);
-
- return ret;
-}
-
-s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
- u64 a4, u64 a5)
-{
- s64 ret;
-
- preempt_disable();
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
- preempt_enable();
-
- return ret;
-}
-
-
-long sn_partition_id;
-EXPORT_SYMBOL_GPL(sn_partition_id);
-long sn_coherency_id;
-EXPORT_SYMBOL_GPL(sn_coherency_id);
-long sn_region_size;
-EXPORT_SYMBOL_GPL(sn_region_size);
-long system_serial_number;
-EXPORT_SYMBOL_GPL(system_serial_number);
-int uv_type;
-EXPORT_SYMBOL_GPL(uv_type);
-
-
-s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
- long *region, long *ssn)
-{
- s64 ret;
- u64 v0, v1;
- union partition_info_u part;
-
- ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
- (u64)(&v0), (u64)(&v1), 0, 0);
- if (ret != BIOS_STATUS_SUCCESS)
- return ret;
-
- part.val = v0;
- if (uvtype)
- *uvtype = part.hub_version;
- if (partid)
- *partid = part.partition_id;
- if (coher)
- *coher = part.coherence_id;
- if (region)
- *region = part.region_size;
- if (ssn)
- *ssn = v1;
- return ret;
-}
-EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
-
-int
-uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
- unsigned long *intr_mmr_offset)
-{
- u64 watchlist;
- s64 ret;
-
- /*
- * bios returns watchlist number or negative error number.
- */
- ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
- mq_size, (u64)intr_mmr_offset,
- (u64)&watchlist, 0);
- if (ret < BIOS_STATUS_SUCCESS)
- return ret;
-
- return watchlist;
-}
-EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
-
-int
-uv_bios_mq_watchlist_free(int blade, int watchlist_num)
-{
- return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
- blade, watchlist_num, 0, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
-
-s64
-uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
-{
- return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
- perms, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
-
-s64
-uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
-{
- s64 ret;
-
- ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
- (u64)addr, buf, (u64)len, 0);
- return ret;
-}
-EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
-
-s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
-{
- return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
- (u64)ticks_per_second, 0, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_freq_base);
-
-/*
- * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
- * @decode: true to enable target, false to disable target
- * @domain: PCI domain number
- * @bus: PCI bus number
- *
- * Returns:
- * 0: Success
- * -EINVAL: Invalid domain or bus number
- * -ENOSYS: Capability not available
- * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
- */
-int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
-{
- return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
- (u64)decode, (u64)domain, (u64)bus, 0, 0);
-}
-EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
-
-
-#ifdef CONFIG_EFI
-void uv_bios_init(void)
-{
- struct uv_systab *tab;
-
- if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
- (efi.uv_systab == (unsigned long)NULL)) {
- printk(KERN_CRIT "No EFI UV System Table.\n");
- uv_systab.function = (unsigned long)NULL;
- return;
- }
-
- tab = (struct uv_systab *)ioremap(efi.uv_systab,
- sizeof(struct uv_systab));
- if (strncmp(tab->signature, "UVST", 4) != 0)
- printk(KERN_ERR "bad signature in UV system table!");
-
- /*
- * Copy table to permanent spot for later use.
- */
- memcpy(&uv_systab, tab, sizeof(struct uv_systab));
- iounmap(tab);
-
- printk(KERN_INFO "EFI UV System Table Revision %d\n",
- uv_systab.revision);
-}
-#else /* !CONFIG_EFI */
-
-void uv_bios_init(void) { }
-#endif
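
The deleted uv_bios_get_sn_info() shows a pattern worth keeping in mind: one 64-bit BIOS return value is unpacked through a value/bitfield union (partition_info_u). A runnable sketch of the idiom — the field layout below is illustrative only, not the real UV ABI:

#include <stdint.h>
#include <stdio.h>

/* Illustrative overlay; the real layout lives in asm/uv/bios.h. */
union partition_info_u {
	uint64_t val;
	struct {
		uint64_t hub_version  : 8;
		uint64_t partition_id : 16;
		uint64_t coherence_id : 16;
		uint64_t region_size  : 24;
	};
};

int main(void)
{
	union partition_info_u part = { .val = 0x203 };  /* made-up value */

	printf("hub %u part %u\n",
	       (unsigned)part.hub_version, (unsigned)part.partition_id);
	return 0;
}
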
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 5de7f4c56971..73274d76ce16 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Implement 'Simple Boot Flag Specification 2.0'
*/
@@ -7,6 +8,7 @@
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/acpi.h>
+#include <linux/bitops.h>
#include <asm/io.h>
#include <linux/mc146818rtc.h>
@@ -19,27 +21,13 @@
int sbf_port __initdata = -1; /* set via acpi_boot_init() */
-static int __init parity(u8 v)
-{
- int x = 0;
- int i;
-
- for (i = 0; i < 8; i++) {
- x ^= (v & 1);
- v >>= 1;
- }
-
- return x;
-}
-
static void __init sbf_write(u8 v)
{
unsigned long flags;
if (sbf_port != -1) {
- v &= ~SBF_PARITY;
- if (!parity(v))
- v |= SBF_PARITY;
+ if (!parity8(v))
+ v ^= SBF_PARITY;
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
sbf_port, v);
@@ -65,14 +53,14 @@ static u8 __init sbf_read(void)
return v;
}
-static int __init sbf_value_valid(u8 v)
+static bool __init sbf_value_valid(u8 v)
{
if (v & SBF_RESERVED) /* Reserved bits */
- return 0;
- if (!parity(v))
- return 0;
+ return false;
+ if (!parity8(v))
+ return false;
- return 1;
+ return true;
}
static int __init sbf_init(void)
@@ -98,4 +86,4 @@ static int __init sbf_init(void)
return 0;
}
-module_init(sbf_init);
+arch_initcall(sbf_init);
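
Two things changed in sbf_write(): the open-coded loop became parity8() from <linux/bitops.h>, and the clear-then-set sequence became a single conditional XOR, which works because flipping SBF_PARITY always toggles the overall parity: if parity over the whole byte (parity bit included) is even, one flip makes it odd. The helper itself is a classic fold-and-lookup; a runnable sketch mirroring the kernel implementation:

#include <assert.h>
#include <stdint.h>

/* Fold 8 bits to 4 by XOR, then index 0x6996, whose bit n is the
 * parity of n — the same trick as parity8() in <linux/bitops.h>. */
static int parity8(uint8_t v)
{
	return (0x6996 >> (((v >> 4) ^ v) & 0xf)) & 1;
}

int main(void)
{
	assert(parity8(0x96) == 0);	/* four bits set: even */
	assert(parity8(0x01) == 1);	/* one bit set: odd */
	return 0;
}
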
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
new file mode 100644
index 000000000000..a951333c5995
--- /dev/null
+++ b/arch/x86/kernel/callthunks.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "callthunks: " fmt
+
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/moduleloader.h>
+#include <linux/static_call.h>
+
+#include <asm/alternative.h>
+#include <asm/asm-offsets.h>
+#include <asm/cpu.h>
+#include <asm/ftrace.h>
+#include <asm/insn.h>
+#include <asm/kexec.h>
+#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
+#include <asm/sections.h>
+#include <asm/switch_to.h>
+#include <asm/sync_core.h>
+#include <asm/text-patching.h>
+#include <asm/xen/hypercall.h>
+
+static int __initdata_or_module debug_callthunks;
+
+#define MAX_PATCH_LEN (255-1)
+
+#define prdbg(fmt, args...) \
+do { \
+ if (debug_callthunks) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+} while(0)
+
+static int __init debug_thunks(char *str)
+{
+ debug_callthunks = 1;
+ return 1;
+}
+__setup("debug-callthunks", debug_thunks);
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DEFINE_PER_CPU(u64, __x86_call_count);
+DEFINE_PER_CPU(u64, __x86_ret_count);
+DEFINE_PER_CPU(u64, __x86_stuffs_count);
+DEFINE_PER_CPU(u64, __x86_ctxsw_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count);
+#endif
+
+extern s32 __call_sites[], __call_sites_end[];
+
+struct core_text {
+ unsigned long base;
+ unsigned long end;
+ const char *name;
+};
+
+static bool thunks_initialized __ro_after_init;
+
+static const struct core_text builtin_coretext = {
+ .base = (unsigned long)_text,
+ .end = (unsigned long)_etext,
+ .name = "builtin",
+};
+
+asm (
+ ".pushsection .rodata \n"
+ ".global skl_call_thunk_template \n"
+ "skl_call_thunk_template: \n"
+ __stringify(INCREMENT_CALL_DEPTH)" \n"
+ ".global skl_call_thunk_tail \n"
+ "skl_call_thunk_tail: \n"
+ ".popsection \n"
+);
+
+extern u8 skl_call_thunk_template[];
+extern u8 skl_call_thunk_tail[];
+
+#define SKL_TMPL_SIZE \
+ ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template))
+
+extern void error_entry(void);
+extern void xen_error_entry(void);
+extern void paranoid_entry(void);
+
+static inline bool within_coretext(const struct core_text *ct, void *addr)
+{
+ unsigned long p = (unsigned long)addr;
+
+ return ct->base <= p && p < ct->end;
+}
+
+static inline bool within_module_coretext(void *addr)
+{
+ bool ret = false;
+
+#ifdef CONFIG_MODULES
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_address((unsigned long)addr);
+ if (mod && within_module_core((unsigned long)addr, mod))
+ ret = true;
+#endif
+ return ret;
+}
+
+static bool is_coretext(const struct core_text *ct, void *addr)
+{
+ if (ct && within_coretext(ct, addr))
+ return true;
+ if (within_coretext(&builtin_coretext, addr))
+ return true;
+ return within_module_coretext(addr);
+}
+
+static bool skip_addr(void *dest)
+{
+ if (dest == error_entry)
+ return true;
+ if (dest == paranoid_entry)
+ return true;
+ if (dest == xen_error_entry)
+ return true;
+ /* Does FILL_RSB... */
+ if (dest == __switch_to_asm)
+ return true;
+ /* Accounts directly */
+ if (dest == ret_from_fork)
+ return true;
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
+ if (dest == soft_restart_cpu)
+ return true;
+#endif
+#ifdef CONFIG_FUNCTION_TRACER
+ if (dest == __fentry__)
+ return true;
+#endif
+#ifdef CONFIG_KEXEC_CORE
+# ifdef CONFIG_X86_64
+ if (dest >= (void *)__relocate_kernel_start &&
+ dest < (void *)__relocate_kernel_end)
+ return true;
+# else
+ if (dest >= (void *)relocate_kernel &&
+ dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
+ return true;
+# endif
+#endif
+ return false;
+}
+
+static __init_or_module void *call_get_dest(void *addr)
+{
+ struct insn insn;
+ void *dest;
+ int ret;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Patched out call? */
+ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE)
+ return NULL;
+
+ dest = addr + insn.length + insn.immediate.value;
+ if (skip_addr(dest))
+ return NULL;
+ return dest;
+}
+
+static const u8 nops[] = {
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+};
+
+static void *patch_dest(void *dest, bool direct)
+{
+ unsigned int tsize = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
+ u8 *pad = dest - tsize;
+
+ memcpy(insn_buff, skl_call_thunk_template, tsize);
+ text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
+
+ /* Already patched? */
+ if (!bcmp(pad, insn_buff, tsize))
+ return pad;
+
+ /* Ensure there are nops */
+ if (bcmp(pad, nops, tsize)) {
+ pr_warn_once("Invalid padding area for %pS\n", dest);
+ return NULL;
+ }
+
+ if (direct)
+ memcpy(pad, insn_buff, tsize);
+ else
+ text_poke_copy_locked(pad, insn_buff, tsize, true);
+ return pad;
+}
+
+static __init_or_module void patch_call(void *addr, const struct core_text *ct)
+{
+ void *pad, *dest;
+ u8 bytes[8];
+
+ if (!within_coretext(ct, addr))
+ return;
+
+ dest = call_get_dest(addr);
+ if (!dest || WARN_ON_ONCE(IS_ERR(dest)))
+ return;
+
+ if (!is_coretext(ct, dest))
+ return;
+
+ pad = patch_dest(dest, within_coretext(ct, dest));
+ if (!pad)
+ return;
+
+ prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr,
+ dest, dest, pad);
+ __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE);
+ text_poke_early(addr, bytes, CALL_INSN_SIZE);
+}
+
+static __init_or_module void
+patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++)
+ patch_call((void *)s + *s, ct);
+}
+
+static __init_or_module void
+callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
+{
+ prdbg("Patching call sites %s\n", ct->name);
+ patch_call_sites(cs->call_start, cs->call_end, ct);
+ prdbg("Patching call sites done%s\n", ct->name);
+}
+
+void __init callthunks_patch_builtin_calls(void)
+{
+ struct callthunk_sites cs = {
+ .call_start = __call_sites,
+ .call_end = __call_sites_end,
+ };
+
+ if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return;
+
+ pr_info("Setting up call depth tracking\n");
+ mutex_lock(&text_mutex);
+ callthunks_setup(&cs, &builtin_coretext);
+ thunks_initialized = true;
+ mutex_unlock(&text_mutex);
+}
+
+void *callthunks_translate_call_dest(void *dest)
+{
+ void *target;
+
+ lockdep_assert_held(&text_mutex);
+
+ if (!thunks_initialized || skip_addr(dest))
+ return dest;
+
+ if (!is_coretext(NULL, dest))
+ return dest;
+
+ target = patch_dest(dest, false);
+ return target ? : dest;
+}
+
+#ifdef CONFIG_BPF_JIT
+static bool is_callthunk(void *addr)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
+ unsigned long dest;
+ u8 *pad;
+
+ dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
+ if (!thunks_initialized || skip_addr((void *)dest))
+ return false;
+
+ pad = (void *)(dest - tmpl_size);
+
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
+
+ return !bcmp(pad, insn_buff, tmpl_size);
+}
+
+int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
+
+ if (!thunks_initialized)
+ return 0;
+
+ /* Is function call target a thunk? */
+ if (func && is_callthunk(func))
+ return 0;
+
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
+
+ memcpy(*pprog, insn_buff, tmpl_size);
+ *pprog += tmpl_size;
+ return tmpl_size;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+void noinline callthunks_patch_module_calls(struct callthunk_sites *cs,
+ struct module *mod)
+{
+ struct core_text ct = {
+ .base = (unsigned long)mod->mem[MOD_TEXT].base,
+ .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size,
+ .name = mod->name,
+ };
+
+ if (!thunks_initialized)
+ return;
+
+ mutex_lock(&text_mutex);
+ callthunks_setup(cs, &ct);
+ mutex_unlock(&text_mutex);
+}
+#endif /* CONFIG_MODULES */
+
+#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int callthunks_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+
+ seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,",
+ per_cpu(__x86_call_count, cpu),
+ per_cpu(__x86_ret_count, cpu),
+ per_cpu(__x86_stuffs_count, cpu),
+ per_cpu(__x86_ctxsw_count, cpu));
+ return 0;
+}
+
+static int callthunks_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, callthunks_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = callthunks_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init callthunks_debugfs_init(void)
+{
+ struct dentry *dir;
+ unsigned long cpu;
+
+ dir = debugfs_create_dir("callthunks", NULL);
+ for_each_possible_cpu(cpu) {
+ void *arg = (void *)cpu;
+		char name[10];
+
+ sprintf(name, "cpu%lu", cpu);
+ debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
+ }
+ return 0;
+}
+__initcall(callthunks_debugfs_init);
+#endif
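
One non-obvious detail in patch_call_sites(): __call_sites[] stores self-relative s32 entries, so each call site is recovered as (void *)s + *s, keeping the table small and relocation-free. A userspace toy of the arithmetic (all names made up):

#include <stdint.h>
#include <stdio.h>

/* Each slot stores (target - &slot); adding the slot's own address
 * back yields the target again. */
static int32_t make_rel(const void *target, const int32_t *slot)
{
	return (int32_t)((intptr_t)target - (intptr_t)slot);
}

int main(void)
{
	static char text[64];
	static int32_t table[1];

	table[0] = make_rel(&text[10], &table[0]);

	void *site = (char *)&table[0] + table[0];
	printf("recovered %p, expected %p\n", site, (void *)&text[10]);
	return 0;
}
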
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..99444409c026
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <asm/bugs.h>
+#include <asm/msr.h>
+#include <asm/traps.h>
+
+enum cp_error_code {
+ CP_EC = (1 << 15) - 1,
+
+ CP_RET = 1,
+ CP_IRET = 2,
+ CP_ENDBR = 3,
+ CP_RSTRORSSP = 4,
+ CP_SETSSBSY = 5,
+
+ CP_ENCL = 1 << 15,
+};
+
+static const char cp_err[][10] = {
+ [0] = "unknown",
+ [1] = "near ret",
+ [2] = "far/iret",
+ [3] = "endbranch",
+ [4] = "rstorssp",
+ [5] = "setssbsy",
+};
+
+static const char *cp_err_string(unsigned long error_code)
+{
+ unsigned int cpec = error_code & CP_EC;
+
+ if (cpec >= ARRAY_SIZE(cp_err))
+ cpec = 0;
+ return cp_err[cpec];
+}
+
+static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code)
+{
+ WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n",
+ user_mode(regs) ? "user mode" : "kernel mode",
+ cp_err_string(error_code));
+}
+
+static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ struct task_struct *tsk;
+ unsigned long ssp;
+
+ /*
+ * An exception was just taken from userspace. Since interrupts are disabled
+ * here, no scheduling should have messed with the registers yet and they
+ * will be whatever is live in userspace. So read the SSP before enabling
+ * interrupts so locking the fpregs to do it later is not required.
+ */
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+
+ cond_local_irq_enable(regs);
+
+ tsk = current;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+
+ /* Ratelimit to prevent log spamming. */
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ __ratelimit(&cpf_rate)) {
+ pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, ssp, error_code,
+ cp_err_string(error_code),
+ error_code & CP_ENCL ? " in enclave" : "");
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0);
+ cond_local_irq_disable(regs);
+}
+
+static __ro_after_init bool ibt_fatal = true;
+
+/*
+ * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR.
+ *
+ * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered,
+ * the WFE state of the interrupted context needs to be cleared to let execution
+ * continue. Otherwise when the CPU resumes from the instruction that just
+ * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU
+ * enters a dead loop.
+ *
+ * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't
+ * set WFE. But FRED provides space on the entry stack (in an expanded CS area)
+ * to save and restore the WFE state, thus the WFE state is no longer clobbered,
+ * so software must clear it.
+ */
+static void ibt_clear_fred_wfe(struct pt_regs *regs)
+{
+ /*
+ * No need to do any FRED checks.
+ *
+ * For IDT event delivery, the high-order 48 bits of CS are pushed
+ * as 0s into the stack, and later IRET ignores these bits.
+ *
+ * For FRED, a test to check if fred_cs.wfe is set would be dropped
+ * by compilers.
+ */
+ regs->fred_cs.wfe = 0;
+}
+
+static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ if ((error_code & CP_EC) != CP_ENDBR) {
+ do_unexpected_cp(regs, error_code);
+ return;
+ }
+
+ if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) {
+ regs->ax = 0;
+ ibt_clear_fred_wfe(regs);
+ return;
+ }
+
+ pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ if (!ibt_fatal) {
+ printk(KERN_DEFAULT CUT_HERE);
+ __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ ibt_clear_fred_wfe(regs);
+ return;
+ }
+ BUG();
+}
+
+static int __init ibt_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+
+ if (!strcmp(str, "warn"))
+ ibt_fatal = false;
+
+ return 1;
+}
+
+__setup("ibt=", ibt_setup);
+
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+ if (user_mode(regs)) {
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ do_user_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ } else {
+ if (cpu_feature_enabled(X86_FEATURE_IBT))
+ do_kernel_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ }
+}
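
To make the error-code split concrete: the low 15 bits (CP_EC) select the fault reason and bit 15 (CP_ENCL) marks an SGX-enclave origin. A runnable decoder in the spirit of cp_err_string(), fed a made-up code:

#include <stdio.h>

#define CP_EC	((1u << 15) - 1)
#define CP_ENCL	(1u << 15)

static const char * const cp_err[] = {
	"unknown", "near ret", "far/iret", "endbranch", "rstorssp", "setssbsy",
};

int main(void)
{
	unsigned int code = 3 | CP_ENCL;  /* hypothetical: ENDBR fault in an enclave */
	unsigned int cpec = code & CP_EC;

	if (cpec >= sizeof(cp_err) / sizeof(cp_err[0]))
		cpec = 0;  /* out-of-range reasons fall back to "unknown" */
	printf("#CP: %s%s\n", cp_err[cpec],
	       (code & CP_ENCL) ? " in enclave" : "");
	return 0;
}
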
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
new file mode 100644
index 000000000000..638eb5c933e0
--- /dev/null
+++ b/arch/x86/kernel/cfi.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <linux/string.h>
+#include <linux/cfi.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+
+/*
+ * Returns the target address and the expected type when regs->ip points
+ * to a compiler-generated CFI trap.
+ */
+static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
+ u32 *type)
+{
+ char buffer[MAX_INSN_SIZE];
+ struct insn insn;
+ int offset = 0;
+
+ *target = *type = 0;
+
+ /*
+ * The compiler generates the following instruction sequence
+ * for indirect call checks:
+ *
+ *   movl    -<id>, %r10d        ; 6 bytes
+ *   addl    -<pos>(%reg), %r10d ; 4 bytes
+ *   je      .Ltmp1              ; 2 bytes
+ *   ud2                         ; <- regs->ip
+ *   .Ltmp1:
+ *
+ * We can decode the expected type and the target address from the
+ * movl/addl instructions.
+ */
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE))
+ return false;
+ if (insn_decode_kernel(&insn, &buffer[offset]))
+ return false;
+ if (insn.opcode.value != 0xBA)
+ return false;
+
+ *type = -(u32)insn.immediate.value;
+
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE))
+ return false;
+ if (insn_decode_kernel(&insn, &buffer[offset]))
+ return false;
+ if (insn.opcode.value != 0x3)
+ return false;
+
+ /* Read the target address from the register. */
+ offset = insn_get_modrm_rm_off(&insn, regs);
+ if (offset < 0)
+ return false;
+
+ *target = *(unsigned long *)((void *)regs + offset);
+
+ return true;
+}
+
+/*
+ * Checks if a ud2 trap is because of a CFI failure, and handles the trap
+ * if needed. Returns a bug_trap_type value similarly to report_bug.
+ */
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+ unsigned long target, addr = regs->ip;
+ u32 type;
+
+ switch (cfi_mode) {
+ case CFI_KCFI:
+ if (!is_cfi_trap(addr))
+ return BUG_TRAP_TYPE_NONE;
+
+ if (!decode_cfi_insn(regs, &target, &type))
+ return report_cfi_failure_noaddr(regs, addr);
+
+ break;
+
+ case CFI_FINEIBT:
+ if (!decode_fineibt_insn(regs, &target, &type))
+ return BUG_TRAP_TYPE_NONE;
+
+ break;
+
+ default:
+ return BUG_TRAP_TYPE_NONE;
+ }
+
+ return report_cfi_failure(regs, addr, &target, type);
+}
+
+/*
+ * Ensure that __kcfi_typeid_ symbols are emitted for functions that may
+ * not be indirectly called with all configurations.
+ */
+__ADDRESSABLE(__memcpy)
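
Note how decode_cfi_insn() recovers the type: the movl immediate carries the negated KCFI hash, so the raw hash never appears verbatim in the instruction stream, and decoding is a 32-bit negation. A tiny runnable check with a made-up hash value:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t type = 0xdeadbeef;	/* made-up KCFI type hash */
	uint32_t imm = 0u - type;	/* what the movl immediate holds */

	/* mirrors: *type = -(u32)insn.immediate.value; */
	assert(0u - imm == type);
	return 0;
}
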
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..5136e6818da8 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,9 +1,15 @@
-#include <linux/module.h>
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
-#include <asm/e820.h>
+#include <linux/memblock.h>
+
#include <asm/proto.h>
+#include <asm/setup.h>
/*
* Some BIOSes seem to corrupt the low 64k of memory during events
@@ -18,27 +24,48 @@ static int __read_mostly memory_corruption_check = -1;
static unsigned __read_mostly corruption_check_size = 64*1024;
static unsigned __read_mostly corruption_check_period = 60; /* seconds */
-static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static struct scan_area {
+ u64 addr;
+ u64 size;
+} scan_areas[MAX_SCAN_AREAS];
static int num_scan_areas;
-
static __init int set_corruption_check(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check config string not provided\n");
+ return -EINVAL;
+ }
+
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- memory_corruption_check = simple_strtol(arg, &end, 10);
+ memory_corruption_check = val;
- return (*end == 0) ? 0 : -EINVAL;
+ return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check_period config string not provided\n");
+ return -EINVAL;
+ }
- corruption_check_period = simple_strtoul(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ corruption_check_period = val;
+ return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
@@ -47,6 +74,11 @@ static __init int set_corruption_check_size(char *arg)
char *end;
unsigned size;
+ if (!arg) {
+ pr_err("memory_corruption_check_size config string not provided\n");
+ return -EINVAL;
+ }
+
size = memparse(arg, &end);
if (*end == '\0')
@@ -59,7 +91,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
void __init setup_bios_corruption_check(void)
{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+ phys_addr_t start, end;
+ u64 i;
if (memory_corruption_check == -1) {
memory_corruption_check =
@@ -79,37 +112,32 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
+ start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ if (start >= end)
+ continue;
- if (!(addr + 1))
- break;
-
- if (addr >= corruption_check_size)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
-
- e820_update_range(addr, size, E820_RAM, E820_RESERVED);
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
+ memblock_reserve(start, end - start);
+ scan_areas[num_scan_areas].addr = start;
+ scan_areas[num_scan_areas].size = end - start;
/* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
+ memset(__va(start), 0, end - start);
- addr += size;
+ if (++num_scan_areas >= MAX_SCAN_AREAS)
+ break;
}
- printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
- num_scan_areas);
- update_e820();
+ if (num_scan_areas)
+ pr_info("Scanning %d areas for low memory corruption\n", num_scan_areas);
}
-void check_for_bios_corruption(void)
+static void check_for_bios_corruption(void)
{
int i;
int corruption = 0;
@@ -124,8 +152,7 @@ void check_for_bios_corruption(void)
for (; size; addr++, size -= sizeof(unsigned long)) {
if (!*addr)
continue;
- printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
- addr, __pa(addr), *addr);
+ pr_err("Corrupted low memory at %p (%lx phys) = %08lx\n", addr, __pa(addr), *addr);
corruption = 1;
*addr = 0;
}
@@ -141,21 +168,20 @@ static void check_corruption(struct work_struct *dummy)
{
check_for_bios_corruption();
schedule_delayed_work(&bios_check_work,
- round_jiffies_relative(corruption_check_period*HZ));
+ round_jiffies_relative(corruption_check_period*HZ));
}
static int start_periodic_check_for_corruption(void)
{
- if (!memory_corruption_check || corruption_check_period == 0)
+ if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
return 0;
- printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
- corruption_check_period);
+ pr_info("Scanning for low memory corruption every %d seconds\n", corruption_check_period);
/* First time we run the checks right away */
schedule_delayed_work(&bios_check_work, 0);
+
return 0;
}
-
-module_init(start_periodic_check_for_corruption);
+device_initcall(start_periodic_check_for_corruption);
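
The strtol-to-kstrtoul conversions above are behavioral fixes, not just style: kstrtoul() reports overflow and rejects trailing junk before anything is written, whereas the old code stored the parsed value first and simple_strtoul() wraps silently on overflow. A userspace analogue of the stricter semantics (helper name is made up):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Reject empty input, trailing junk and overflow, like kstrtoul(). */
static int parse_ulong_strict(const char *s, unsigned long *out)
{
	char *end;
	unsigned long v;

	errno = 0;
	v = strtoul(s, &end, 10);
	if (errno || end == s || *end != '\0')
		return -EINVAL;
	*out = v;
	return 0;
}

int main(void)
{
	unsigned long v;

	printf("%d\n", parse_ulong_strict("60", &v));	/* 0 */
	printf("%d\n", parse_ulong_strict("60s", &v));	/* -EINVAL */
	return 0;
}
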
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
index 667df55a4399..0bca7ef7426a 100644
--- a/arch/x86/kernel/cpu/.gitignore
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..2f8a58ef690e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for x86-compatible CPU details, features and quirks
#
@@ -8,37 +9,68 @@ CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
-# Make sure load_percpu_segment has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_common.o := $(nostackp)
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+KMSAN_SANITIZE_common.o := n
-obj-y := intel_cacheinfo.o addon_cpuid_features.o
-obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o sched.o mshyperv.o
+# As above, instrumenting secondary CPU boot code causes boot hangs.
+KCSAN_SANITIZE_common.o := n
-obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
-obj-$(CONFIG_X86_64) += bugs_64.o
+obj-y := cacheinfo.o scattered.o
+obj-y += topology_common.o topology_ext.o topology_amd.o
+obj-y += common.o
+obj-y += rdrand.o
+obj-y += match.o
+obj-y += bugs.o
+obj-y += aperfmperf.o
+obj-y += cpuid-deps.o cpuid_0x2_table.o
+obj-y += umwait.o
+obj-y += capflags.o powerflags.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
+obj-$(CONFIG_X86_LOCAL_APIC) += topology.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
+ifdef CONFIG_CPU_SUP_INTEL
+obj-y += intel.o tsx.o
+obj-$(CONFIG_PM) += intel_epb.o
+endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy)
+obj-y += amd_cache_disable.o
+endif
+obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
+obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-
-obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_MICROCODE) += microcode/
+obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_SGX) += sgx/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+obj-$(CONFIG_BHYVE_GUEST) += bhyve.o
+obj-$(CONFIG_ACRN_GUEST) += acrn.o
+
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
+
+obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o
+
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
-cpufeature = $(src)/../../include/asm/cpufeature.h
+cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-targets += capflags.c
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
+targets += capflags.c
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
new file mode 100644
index 000000000000..2c5b51aad91a
--- /dev/null
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN detection support
+ *
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ *
+ * Jason Chen CJ <jason.cj.chen@intel.com>
+ * Zhao Yakui <yakui.zhao@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+
+#include <asm/acrn.h>
+#include <asm/apic.h>
+#include <asm/cpufeatures.h>
+#include <asm/desc.h>
+#include <asm/hypervisor.h>
+#include <asm/idtentry.h>
+#include <asm/irq_regs.h>
+
+static u32 __init acrn_detect(void)
+{
+ return acrn_cpuid_base();
+}
+
+static void __init acrn_init_platform(void)
+{
+ /* Install system interrupt handler for ACRN hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+
+ x86_platform.calibrate_tsc = acrn_get_tsc_khz;
+ x86_platform.calibrate_cpu = acrn_get_tsc_khz;
+}
+
+static bool acrn_x2apic_available(void)
+{
+ return boot_cpu_has(X86_FEATURE_X2APIC);
+}
+
+static void (*acrn_intr_handler)(void);
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * The hypervisor requires that the APIC EOI should be acked.
+ * If the APIC EOI is not acked, the APIC ISR bit for the
+ * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it
+ * will block the interrupt whose vector is lower than
+ * HYPERVISOR_CALLBACK_VECTOR.
+ */
+ apic_eoi();
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (acrn_intr_handler)
+ acrn_intr_handler();
+
+ set_irq_regs(old_regs);
+}
+
+void acrn_setup_intr_handler(void (*handler)(void))
+{
+ acrn_intr_handler = handler;
+}
+EXPORT_SYMBOL_GPL(acrn_setup_intr_handler);
+
+void acrn_remove_intr_handler(void)
+{
+ acrn_intr_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(acrn_remove_intr_handler);
+
+const __initconst struct hypervisor_x86 x86_hyper_acrn = {
+ .name = "ACRN",
+ .detect = acrn_detect,
+ .type = X86_HYPER_ACRN,
+ .init.init_platform = acrn_init_platform,
+ .init.x2apic_available = acrn_x2apic_available,
+};
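
acrn_detect() delegates to acrn_cpuid_base(), which boils down to the generic hypervisor-leaf scan: walk the 0x40000000 CPUID region in steps of 0x100 until EBX:ECX:EDX spell the vendor signature ("ACRNACRNACRN" for ACRN). A runnable userspace approximation using GCC's <cpuid.h> (prints 0 unless actually running under ACRN):

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

/* Rough analogue of the kernel's hypervisor_cpuid_base(). */
static unsigned int find_hv_base(const char *sig)
{
	unsigned int base, eax, s[3];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, s[0], s[1], s[2]);
		if (!memcmp(sig, s, 12))
			return base;
	}
	return 0;
}

int main(void)
{
	printf("ACRN base: %#x\n", find_hv_base("ACRNACRNACRN"));
	return 0;
}
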
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
deleted file mode 100644
index 10fa5684a662..000000000000
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Routines to indentify additional cpu features that are scattered in
- * cpuid space.
- */
-#include <linux/cpu.h>
-
-#include <asm/pat.h>
-#include <asm/processor.h>
-
-#include <asm/apic.h>
-
-struct cpuid_bit {
- u16 feature;
- u8 reg;
- u8 bit;
- u32 level;
-};
-
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
- u32 max_level;
- u32 regs[4];
- const struct cpuid_bit *cb;
-
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
- { 0, 0, 0, 0 }
- };
-
- for (cb = cpuid_bits; cb->feature; cb++) {
-
- /* Verify that the level is valid */
- max_level = cpuid_eax(cb->level & 0xffff0000);
- if (max_level < cb->level ||
- max_level > (cb->level | 0xffff))
- continue;
-
- cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
- &regs[CR_ECX], &regs[CR_EDX]);
-
- if (regs[cb->reg] & (1 << cb->bit))
- set_cpu_cap(c, cb->feature);
- }
-}
-
-/* leaf 0xb SMT level */
-#define SMT_LEVEL 0
-
-/* leaf 0xb sub-leaf types */
-#define INVALID_TYPE 0
-#define SMT_TYPE 1
-#define CORE_TYPE 2
-
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
-
-/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
- */
-void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
- static bool printed;
-
- if (c->cpuid_level < 0xb)
- return;
-
- cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
-
- /*
- * check if the cpuid leaf 0xb is actually implemented.
- */
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
- return;
-
- set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
-
- /*
- * initial apic id, which also represents 32-bit extended x2apic id.
- */
- c->initial_apicid = edx;
-
- /*
- * Populate HT related information from sub-leaf level 0.
- */
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
-
- sub_index = 1;
- do {
- cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
-
- /*
- * Check for the Core type in the implemented sub leaves.
- */
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- break;
- }
-
- sub_index++;
- } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
-
- core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
-
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
- & core_select_mask;
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
- /*
- * Reinit the apicid, now that we have extended initial_apicid.
- */
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-
- c->x86_max_cores = (core_level_siblings / smp_num_siblings);
-
- if (!printed) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
- return;
-#endif
-}
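
Most of the deleted detect_extended_topology() is shift-and-mask plumbing: leaf 0xb reports bit widths, and core_select_mask isolates the core bits of the APIC ID once the SMT bits are shifted out. The mask arithmetic, runnable with made-up widths:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical: 2 SMT siblings (1 bit), 4 cores incl. SMT (3 bits). */
	unsigned int ht_mask_width = 1, core_plus_mask_width = 3;
	/* Same expression as the deleted code. */
	unsigned int core_select_mask =
		(~(-1 << core_plus_mask_width)) >> ht_mask_width;

	assert(core_select_mask == 0x3);

	unsigned int apicid = 5;	/* binary 101: core 2, SMT thread 1 */
	printf("core id: %u\n", (apicid >> ht_mask_width) & core_select_mask);
	return 0;
}
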
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..bc94ff1e250a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,43 +1,100 @@
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
#include <linux/bitops.h>
+#include <linux/elf.h>
#include <linux/mm.h>
-
+#include <linux/kvm_types.h>
#include <linux/io.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/random.h>
+#include <linux/topology.h>
+#include <linux/platform_data/x86/amd-fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/pci-direct.h>
+#include <asm/delay.h>
+#include <asm/debugreg.h>
+#include <asm/resctrl.h>
+#include <asm/msr.h>
+#include <asm/sev.h>
#ifdef CONFIG_X86_64
-# include <asm/numa_64.h>
# include <asm/mmconfig.h>
-# include <asm/cacheflush.h>
#endif
#include "cpu.h"
-#ifdef CONFIG_X86_32
+u16 invlpgb_count_max __ro_after_init = 1;
+
+static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
+{
+ u32 gprs[8] = { 0 };
+ int err;
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[1] = msr;
+ gprs[7] = 0x9c5a203a;
+
+ err = rdmsr_safe_regs(gprs);
+
+ *p = gprs[0] | ((u64)gprs[2] << 32);
+
+ return err;
+}
+
+static inline int wrmsrq_amd_safe(unsigned msr, u64 val)
+{
+ u32 gprs[8] = { 0 };
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[0] = (u32)val;
+ gprs[1] = msr;
+ gprs[2] = val >> 32;
+ gprs[7] = 0x9c5a203a;
+
+ return wrmsr_safe_regs(gprs);
+}
+
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
* contact AMD for precise details and a CPU swap.
*
* See http://www.multimania.com/poulot/k6bug.html
- * http://www.amd.com/K6/k6docs/revgd.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
*
* The following test is erm.. interesting. AMD neglected to up
* the chip setting when fixing the bug but they also tweaked some
* performance at the same time..
*/
-extern void vide(void);
-__asm__(".align 4\nvide: ret");
+#ifdef CONFIG_X86_32
+extern __visible void vide(void);
+__asm__(".text\n"
+ ".globl vide\n"
+ ".type vide, @function\n"
+ ".align 4\n"
+ "vide: ret\n");
+#endif
-static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
+static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
- * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
@@ -48,13 +105,14 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
-static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
+static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
- int mbytes = num_physpages >> (20-PAGE_SHIFT);
+ int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
if (c->x86_model < 6) {
/* Based on AMD doc 20734R - June 2000 */
@@ -65,13 +123,13 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
return;
}
- if (c->x86_model == 6 && c->x86_mask == 1) {
+ if (c->x86_model == 6 && c->x86_stepping == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
- unsigned long d, d2;
+ u64 d, d2;
- printk(KERN_INFO "AMD K6 stepping B detected - ");
+ pr_info("AMD K6 stepping B detected - ");
/*
* It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -80,23 +138,22 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
n = K6_BUG_LOOP;
f_vide = vide;
- rdtscl(d);
+ OPTIMIZER_HIDE_VAR(f_vide);
+ d = rdtsc();
while (n--)
f_vide();
- rdtscl(d2);
+ d2 = rdtsc();
d = d2-d;
if (d > 20*K6_BUG_LOOP)
- printk(KERN_CONT
- "system stability may be impaired when more than 32 MB are used.\n");
+ pr_cont("system stability may be impaired when more than 32 MB are used.\n");
else
- printk(KERN_CONT "probably OK (after B9730xxxx).\n");
- printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
+ pr_cont("probably OK (after B9730xxxx).\n");
}
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
- (c->x86_model == 8 && c->x86_mask < 8)) {
+ (c->x86_model == 8 && c->x86_stepping < 8)) {
/* We can only write allocate on the low 508Mb */
if (mbytes > 508)
mbytes = 508;
@@ -109,13 +166,13 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+ pr_info("Enabling old style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
- if ((c->x86_model == 8 && c->x86_mask > 7) ||
+ if ((c->x86_model == 8 && c->x86_stepping > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
@@ -130,7 +187,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+ pr_info("Enabling new style K6 write allocation for %d Mb\n",
mbytes);
}
@@ -142,13 +199,43 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+	 * Bit 15 of Athlon-specific MSR 15 needs to be 0
+	 * to enable SSE on Palomino/Morgan/Barton CPUs.
+	 * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ pr_info("Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
@@ -156,13 +243,13 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* but they are not certified as MP capable.
*/
/* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
- goto valid_k7;
+ if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+ (c->x86_stepping == 1)))
+ return;
/* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
- goto valid_k7;
+ if ((c->x86_model == 7) && (c->x86_stepping == 0))
+ return;
/*
* Athlon 662, Duron 671, and Athlon >model 7 have capability
@@ -171,11 +258,11 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
* more.
*/
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
- goto valid_k7;
+ if (cpu_has(c, X86_FEATURE_MP))
+ return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -185,66 +272,26 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
- if (!test_taint(TAINT_UNSAFE_SMP))
- add_taint(TAINT_UNSAFE_SMP);
-
-valid_k7:
- ;
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
#endif
}
-static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
-#endif
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-static int __cpuinit nearby_node(int apicid)
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
{
int i, node;
for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
+ node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
+ node = __apicid_to_node[i];
if (node != NUMA_NO_NODE && node_online(node))
return node;
}
@@ -252,138 +299,325 @@ static int __cpuinit nearby_node(int apicid)
}
#endif
-/*
- * Fixup core topology information for AMD multi-node processors.
- * Assumption: Number of cores in each internal node is the same.
- */
-#ifdef CONFIG_X86_HT
-static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
- unsigned long long value;
- u32 nodes, cores_per_node;
+#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
+ int node;
+ unsigned apicid = c->topo.apicid;
- if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
- return;
-
- /* fixup topology information only once for a core */
- if (cpu_has(c, X86_FEATURE_AMD_DCM))
- return;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
-
- nodes = ((value >> 3) & 7) + 1;
- if (nodes == 1)
- return;
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = per_cpu_llc_id(cpu);
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cores_per_node = c->x86_max_cores / nodes;
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = value & 7;
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs
+ * which the K8 northbridge parsing fills in. Assume
+ * they are all increased by a constant offset, but in
+ * the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->topo.initial_apicid;
- /* fixup core id to be in range from 0 to (cores_per_node - 1) */
- c->cpu_core_id = c->cpu_core_id % cores_per_node;
-}
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
#endif
+}
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+static void bsp_determine_snp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
- unsigned bits;
- int cpu = smp_processor_id();
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+ cc_vendor = CC_VENDOR_AMD;
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- /* fixup topology information on multi-node processors */
- if ((c->x86 == 0x10) && (c->x86_model == 9))
- amd_fixup_dcm(c);
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * RMP table entry format is not architectural and is defined by the
+ * per-processor PPR. Restrict SNP support on the known CPU models
+ * for which the RMP table entry format is currently defined or for
+ * processors which support the architecturally defined RMPREAD
+ * instruction.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+ cpu_feature_enabled(X86_FEATURE_ZEN4) ||
+ cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
+ snp_probe_rmptable_info()) {
+ cc_platform_set(CC_ATTR_HOST_SEV_SNP);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ }
+ }
#endif
}
-int amd_get_nb_id(int cpu)
+#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \
+ step, step, ucode)
+
+static const struct x86_cpu_id amd_tsa_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216),
+ {},
+};
+
+static void tsa_init(struct cpuinfo_x86 *c)
{
- int id = 0;
-#ifdef CONFIG_SMP
- id = per_cpu(cpu_llc_id, cpu);
-#endif
- return id;
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_ZEN3) ||
+ cpu_has(c, X86_FEATURE_ZEN4)) {
+ if (x86_match_min_microcode_rev(amd_tsa_microcode))
+ setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+ else
+ pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+ setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
+ }
}
-EXPORT_SYMBOL_GPL(amd_get_nb_id);
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void bsp_init_amd(struct cpuinfo_x86 *c)
{
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
- int cpu = smp_processor_id();
- int node;
- unsigned apicid = c->apicid;
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
- node = per_cpu(cpu_llc_id, cpu);
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
- if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
+ rdmsrq(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
}
- numa_set_node(cpu, node);
-#endif
+
+ if (c->x86 == 0x15) {
+ unsigned long upperbit;
+ u32 cpuid, assoc;
+
+ cpuid = cpuid_edx(0x80000005);
+ assoc = cpuid >> 16 & 0xff;
+ upperbit = ((cpuid >> 24) << 10) / assoc;
+
+ va_align.mask = (upperbit - 1) & PAGE_MASK;
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_u32() & va_align.mask;
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
+ c->x86 >= 0x15 && c->x86 <= 0x17) {
+ unsigned int bit;
+
+ switch (c->x86) {
+ case 0x15: bit = 54; break;
+ case 0x16: bit = 33; break;
+ case 0x17: bit = 10; break;
+ default: return;
+ }
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+ }
+ }
+
+ resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x60 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ case 0x50 ... 0x5f:
+ case 0x80 ... 0xaf:
+ case 0xc0 ... 0xcf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN6);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ bsp_determine_snp(c);
+ tsa_init(c);
+
+ if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID))
+ setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT);
+
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
}
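
Since bsp_init_amd() keys everything off c->x86 and c->x86_model, it may help to see how those values are derived. The split into base and extended family/model fields is architectural (CPUID Fn0000_0001_EAX); this standalone sketch decodes a real Zen3 example value.

#include <stdint.h>
#include <stdio.h>

static void decode_fms(uint32_t eax, unsigned int *fam, unsigned int *mod,
		       unsigned int *step)
{
	*fam  = (eax >> 8) & 0xf;
	*mod  = (eax >> 4) & 0xf;
	*step = eax & 0xf;

	/* AMD uses the extended fields once the base family saturates */
	if (*fam == 0xf) {
		*fam += (eax >> 20) & 0xff;
		*mod |= ((eax >> 16) & 0xf) << 4;
	}
}

int main(void)
{
	unsigned int fam, mod, step;

	decode_fms(0x00a20f12, &fam, &mod, &step);	/* Zen3 Vermeer */
	printf("family 0x%x model 0x%x stepping 0x%x\n", fam, mod, step);
	return 0;	/* prints family 0x19 model 0x21 stepping 0x2 */
}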
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
- unsigned bits, ecx;
+ u64 msr;
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
+ /*
+	 * Mark that WBINVD is needed during kexec on processors that
+	 * support SME. This allows a successful kexec when going from
+	 * SME inactive to SME active (or vice versa).
+ *
+ * The cache must be cleared so that if there are entries with the
+ * same physical address, both with and without the encryption bit,
+ * they don't race each other when flushed and potentially end up
+ * with the wrong entry being committed to memory.
+ *
+ * Test the CPUID bit directly because with mem_encrypt=off the
+ * BSP will clear the X86_FEATURE_SME bit and the APs will not
+ * see it set after that.
+ */
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ __this_cpu_write(cache_state_incoherent, true);
- ecx = cpuid_ecx(0x80000008);
+ /*
+ * BIOS support is required for SME and SEV.
+ * For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+ * the SME physical address space reduction value.
+ * If BIOS has not enabled SME then don't advertise the
+ * SME feature (set in scattered.c).
+ * If the kernel has not enabled SME via any means then
+ * don't advertise the SME feature.
+ * For SEV: If BIOS has not enabled SEV then don't advertise SEV and
+ * any additional functionality based on it.
+ *
+ * In all cases, since support for SME and SEV requires long mode,
+ * don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+ /* Check if memory encryption is enabled */
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ goto clear_all;
+
+ /*
+ * Always adjust physical address bits. Even though this
+ * will be a value above 32-bits this is still done for
+ * CONFIG_X86_32 so that accurate values are reported.
+ */
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
- c->x86_max_cores = (ecx & 0xff) + 1;
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto clear_all;
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
+ if (!sme_me_mask)
+ setup_clear_cpu_cap(X86_FEATURE_SME);
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
+ rdmsrq(MSR_K7_HWCR, msr);
+ if (!(msr & MSR_K7_HWCR_SMMLOCK))
+ goto clear_sev;
- c->x86_coreid_bits = bits;
-#endif
+ return;
+
+clear_all:
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+clear_sev:
+ setup_clear_cpu_cap(X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ }
}
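
The EBX layout used in the x86_phys_bits adjustment above is: bits 5:0 give the C-bit (encryption bit) position and bits 11:6 the physical-address-width reduction that applies when SME is enabled. A worked example with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned int ebx = 0x0000016f;	/* illustrative: C-bit 47, reduction 5 */
	unsigned int cbit = ebx & 0x3f;
	unsigned int reduction = (ebx >> 6) & 0x3f;
	unsigned int phys_bits = 48u;	/* typical value before adjustment */

	phys_bits -= reduction;
	printf("C-bit %u, usable physical address bits %u\n", cbit, phys_bits);
	return 0;	/* prints C-bit 47, usable physical address bits 43 */
}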
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void early_init_amd(struct cpuinfo_x86 *c)
{
- early_init_amd_mc(c);
+ u32 dummy;
+
+ if (c->x86 >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
@@ -394,31 +628,94 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
/* Set MTRR capability flag if appropriate */
if (c->x86 == 5)
if (c->x86_model == 13 || c->x86_model == 9 ||
- (c->x86_model == 8 && c->x86_mask >= 8))
+ (c->x86_model == 8 && c->x86_stepping >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
- /* check CPU config space for extended APIC ID */
- if (cpu_has_apic && c->x86 >= 0xf) {
- unsigned int val;
- val = read_pci_config(0, 24, 0, 0x68);
- if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+ /*
+ * ApicID can always be treated as an 8-bit value for AMD APIC versions
+ * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+ * after 16h.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ if (c->x86 > 0x16)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ else if (c->x86 >= 0xf) {
+ /* check CPU config space for extended APIC ID */
+ unsigned int val;
+
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val >> 17 & 0x3) == 0x3)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
}
#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
+
+ early_detect_mem_encrypt(c);
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
}
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
- unsigned long long value;
+ u32 level;
+ u64 value;
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
+ */
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrq_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrq_amd_safe(0xc001100d, value);
+ }
+ }
+
+ if (!c->x86_model_id[0])
+ strscpy(c->x86_model_id, "Hammer");
+
+#ifdef CONFIG_SMP
/*
* Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
@@ -426,171 +723,544 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K7_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K7_HWCR, value);
- }
+ msr_set_bit(MSR_K7_HWCR, 6);
#endif
+ set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
- early_init_amd(c);
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MMCONF_FAM10H
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
- clear_cpu_cap(c, 0*32+31);
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- * (AMD Erratum #110, docId: 25759).
- */
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- u64 val;
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &val)) {
- val &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, val);
- }
- }
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
+ /*
+ * Apply erratum 665 fix unconditionally so machines without a BIOS
+ * fix work.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
- }
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+static bool rdrand_force;
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-#else
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
*/
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
+ /*
+ * The self-test can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
}
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
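
The same direct probe can be reproduced from userspace. Here is a small sketch using the compiler's cpuid.h helper; leaf 1 ECX bit 30 is the architectural RDRAND bit that msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62) hides (bit 62 of the MSR maps to ECX bit 30, since the MSR covers EDX:ECX).

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("RDRAND %s\n", (ecx & (1u << 30)) ? "present" : "hidden");
	return 0;
}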
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
+
+ /*
+	 * The Way Access Filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
+ value |= 0x1E;
+ wrmsrq_safe(MSR_F15H_IC_CFG, value);
}
}
- cpu_detect_cache_sizes(c);
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ {}
+};
+
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
+ */
+ if (x86_match_min_microcode_rev(erratum_1386_microcode))
+ return;
+
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
+void init_spectral_chicken(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
+ u64 value;
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008) {
- amd_detect_cmp(c);
- srat_detect_node(c);
+ /*
+ * On Zen2 we offer this chicken (bit) on the altar of Speculation.
+ *
+ * This suppresses speculation from the middle of a basic block, i.e. it
+ * suppresses non-branch predictions.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
+ value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
+ wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
+ }
}
+#endif
+}
-#ifdef CONFIG_X86_32
- detect_ht(c);
+static void init_amd_zen_common(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
+#ifdef CONFIG_NUMA
+ node_reclaim_distance = 32;
#endif
+}
- if (c->extended_cpuid_level >= 0x80000006) {
- if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ fix_erratum_1386(c);
+
+ /* Fix up CPUID bits, but only if not virtualised. */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+
+ /* Erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
}
- if (c->x86 >= 0xf && c->x86 <= 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
- if (cpu_has_xmm2) {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
}
+}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
+static bool cpu_has_zenbleed_microcode(void)
+{
+ u32 good_rev = 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
+
+ default:
+ return false;
+ }
- fam10h_check_enable_mmcfg();
+ if (boot_cpu_data.microcode < good_rev)
+ return false;
+
+ return true;
+}
+
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_AVX))
+ return;
+
+ if (!cpu_has_zenbleed_microcode()) {
+ pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
+ msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ } else {
+ msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
}
+}
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+
+ /* Disable RDSEED on AMD Cyan Skillfish because of an error. */
+ if (c->x86_model == 0x47 && c->x86_stepping == 0x0) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg("RDSEED is not reliable on this platform; disabling.\n");
+ }
+ /* Correct misconfigured CPUID on some clients. */
+ clear_cpu_cap(c, X86_FEATURE_INVLPGB);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
/*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
*/
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if ((tseg>>PMD_SHIFT) <
- (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
- ((tseg>>PMD_SHIFT) <
- (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
- (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
- set_memory_4k((unsigned long)__va(tseg), 1);
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
+}
+
+static const struct x86_cpu_id zen5_rdseed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
+ {},
+};
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+ if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n");
+ }
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u64 vm_cr;
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* AMD FSRM also implies FSRS */
+ if (cpu_has(c, X86_FEATURE_FSRM))
+ set_cpu_cap(c, X86_FEATURE_FSRS);
+
+	/* K6s report MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
+ case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
+ }
+
+ /*
+ * Save up on some future enablement work and do common Zen
+ * settings.
+ */
+ if (c->x86 >= 0x17)
+ init_amd_zen_common();
+
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
+ /*
+ * Enable workaround for FXSAVE leak on CPUs
+ * without a XSaveErPtr feature
+ */
+ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
+
+ cpu_detect_cache_sizes(c);
+
+ srat_detect_node(c);
+
+ init_amd_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
}
}
-#endif
+
+ if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
+
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
+
+ /*
+ * Family 0x12 and above processors have APIC timer
+ * running in deep C states.
+ */
+ if (c->x86 > 0x11)
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
+
+ /*
+ * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up
+ * using the trampoline code and as part of it, MSR_EFER gets prepared there in
+ * order to be replicated onto them. Regardless, set it here again, if not set,
+ * to protect against any future refactoring/code reorganization which might
+ * miss setting this important bit.
+ */
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ cpu_has(c, X86_FEATURE_AUTOIBRS))
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);
+
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
- unsigned int size)
+static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
- if ((c->x86 == 6)) {
+ if (c->x86 == 6) {
/* Duron Rev A0 */
- if (c->x86_model == 3 && c->x86_mask == 0)
+ if (c->x86_model == 3 && c->x86_stepping == 0)
size = 64;
/* Tbird rev A1/A2 */
if (c->x86_model == 4 &&
- (c->x86_mask == 0 || c->x86_mask == 1))
+ (c->x86_stepping == 0 || c->x86_stepping == 1))
size = 256;
}
return size;
}
#endif
-static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
+static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m = tlb_lld_2m >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m = 1024;
+ } else {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m = eax & 0xff;
+ }
+ } else
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
+}
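
For reference, the EBX layout consumed at the top of this function: bits 11:0 are the L2 ITLB 4K entry count and bits 27:16 the L2 DTLB 4K entry count (the nibbles above each field encode associativity, which is why the 12-bit mask is applied). A decoding sketch with a made-up register value:

#include <stdio.h>

int main(void)
{
	unsigned int ebx = 0x68006200;	/* made-up leaf 0x80000006 EBX */
	unsigned int lld_4k = (ebx >> 16) & 0xfff;
	unsigned int lli_4k = ebx & 0xfff;

	printf("L2 DTLB 4K: %u entries, L2 ITLB 4K: %u entries\n",
	       lld_4k, lli_4k);	/* 2048 and 512 */
	return 0;
}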
+
+static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
@@ -601,11 +1271,136 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
}
},
},
- .c_size_cache = amd_size_cache,
+ .legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
+ .c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
};
cpu_dev_register(amd_cpu_dev);
+
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
+
+static unsigned int amd_msr_dr_addr_masks[] = {
+ MSR_F16H_DR0_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK + 1,
+ MSR_F16H_DR1_ADDR_MASK + 2
+};
+
+void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
+{
+ int cpu = smp_processor_id();
+
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return;
+
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return;
+
+ if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
+ return;
+
+ wrmsrq(amd_msr_dr_addr_masks[dr], mask);
+ per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
+}
+
+unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return 0;
+
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return 0;
+
+ return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
+}
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
+
+static void zenbleed_check_cpu(void *unused)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ zen2_zenbleed_check(c);
+}
+
+void amd_check_microcode(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2))
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
+}
+
+static const char * const s5_reset_reason_txt[] = {
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
+ [1] = "power button was pressed for 4 seconds",
+ [2] = "shutdown pin was tripped",
+ [4] = "remote ASF power off command was received",
+ [9] = "internal CPU thermal limit was tripped",
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
+ [17] = "software issued PCI reset",
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
+ [20] = "software wrote 0xE to reset control register 0xCF9",
+ [21] = "ACPI power state transition occurred",
+ [22] = "keyboard reset pin KB_RST_L was tripped",
+ [23] = "internal CPU shutdown event occurred",
+ [24] = "system failed to boot before failed boot timer expired",
+ [25] = "hardware watchdog timer expired",
+ [26] = "remote ASF reset command was received",
+ [27] = "an uncorrected error caused a data fabric sync flood event",
+ [29] = "FCH and MP1 failed warm reset handshake",
+ [30] = "a parity error occurred",
+ [31] = "a software sync flood event occurred",
+};
+
+static __init int print_s5_reset_status_mmio(void)
+{
+ void __iomem *addr;
+ u32 value;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
+ if (!addr)
+ return 0;
+
+ value = ioread32(addr);
+
+ /* Value with "all bits set" is an error response and should be ignored. */
+ if (value == U32_MAX) {
+ iounmap(addr);
+ return 0;
+ }
+
+ /*
+ * Clear all reason bits so they won't be retained if the next reset
+ * does not update the register. Besides, some bits are never cleared by
+ * hardware so it's software's responsibility to clear them.
+ *
+ * Writing the value back effectively clears all reason bits as they are
+ * write-1-to-clear.
+ */
+ iowrite32(value, addr);
+ iounmap(addr);
+
+ for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
+ if (!(value & BIT(i)))
+ continue;
+
+ if (s5_reset_reason_txt[i]) {
+ pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n",
+ value, s5_reset_reason_txt[i]);
+ }
+ }
+
+ return 0;
+}
+late_initcall(print_s5_reset_status_mmio);
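
The register is write-1-to-clear, which the clearing logic above relies on. A minimal self-contained model of that idiom, with the hardware register simulated by a variable:

#include <stdint.h>
#include <stdio.h>

/* simulated S5/reset status register; bits latch until cleared */
static uint32_t fake_reg = (1u << 21) | (1u << 25);

static uint32_t reg_read(void)
{
	return fake_reg;
}

static void reg_write(uint32_t v)
{
	fake_reg &= ~v;		/* write-1-to-clear semantics */
}

int main(void)
{
	uint32_t value = reg_read();
	int i;

	reg_write(value);	/* writing the value back clears every set bit */

	for (i = 0; i < 32; i++)
		if (value & (1u << i))
			printf("reason bit %d was latched\n", i);

	printf("register after clear: 0x%08x\n", reg_read());
	return 0;
}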
diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c
new file mode 100644
index 000000000000..8843b9557aea
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_cache_disable.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD L3 cache_disable_{0,1} sysfs handling
+ * Documentation/ABI/testing/sysfs-devices-system-cpu
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/capability.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+
+#include <asm/amd/nb.h>
+
+#include "cpu.h"
+
+/*
+ * L3 cache descriptors
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) {
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @nb: northbridge containing the L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used, or a negative value if the slot is free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot)
+{
+ int index;
+ struct amd_northbridge *nb = ci->priv;
+
+ index = amd_get_l3_disable_slot(nb, slot);
+ if (index >= 0)
+ return sysfs_emit(buf, "%d\n", index);
+
+ return sysfs_emit(buf, "FREE\n");
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return show_cache_disable(ci, buf, slot); \
+}
+
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+		 * cache whose indices we disable; therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable an L3 cache index by using a disable-slot
+ *
+ * @nb: northbridge containing the L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long index)
+{
+ int ret = 0;
+
+ /* check if @slot is already used or the index is already disabled */
+ ret = amd_get_l3_disable_slot(nb, slot);
+ if (ret >= 0)
+ return -EEXIST;
+
+ if (index > nb->l3_cache.indices)
+ return -EINVAL;
+
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(nb, !slot))
+ return -EEXIST;
+
+ amd_l3_disable_index(nb, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf,
+ size_t count, unsigned int slot)
+{
+ struct amd_northbridge *nb = ci->priv;
+ unsigned long val = 0;
+ int cpu, err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cpu = cpumask_first(&ci->shared_cpu_map);
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ pr_warn("L3 slot %d in use/index already disabled!\n",
+ slot);
+ return err;
+ }
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return store_cache_disable(ci, buf, count, slot); \
+}
+
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+
+ return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+ unsigned long val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoul(buf, 16, &val) < 0)
+ return -EINVAL;
+
+ if (amd_set_subcaches(cpu, val))
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
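
A usage sketch for the attributes just defined, assuming the documented sysfs layout where the L3 appears as index3 under a CPU's cacheinfo directory and the attribute is visible (i.e. the northbridge reports AMD_NB_L3_INDEX_DISABLE):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0";
	const char *idx = "42\n";	/* cache index to disable */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, idx, strlen(idx)) < 0)
		perror("write");	/* -EEXIST if the slot is in use */
	close(fd);
	return 0;
}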
+
+static umode_t cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!ci->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ static struct attribute **amd_l3_attrs;
+ int n = 1;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci)
+{
+ struct amd_northbridge *nb = ci->priv;
+
+ if (ci->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ struct amd_northbridge *nb;
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return NULL;
+
+ node = topology_amd_node_id(smp_processor_id());
+ nb = node_to_amd_nb(node);
+ if (nb && !nb->l3_cache.indices)
+ amd_calc_l3_indices(nb);
+
+ return nb;
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000000..7ffc78d5ebf2
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ */
+#include <linux/cpufreq.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
+
+static void init_counter_refs(void *data)
+{
+ u64 aperf, mperf;
+
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
+
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by a micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atoms are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
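
A worked example of the bit-field extraction this function performs, with made-up MSR contents; on these parts the ratios are in units of the 100 MHz bus clock:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t platform_info = 0x2400;	/* base ratio in bits 15:8 */
	uint64_t turbo_limit   = 0x2c2d2e2f;	/* 1C..4C ratios in bytes 0..3 */

	unsigned int base  = (platform_info >> 8) & 0xff;	/* 0x24 = 36 */
	unsigned int turbo = (turbo_limit >> 24) & 0xff;	/* 4C: 0x2c = 44 */

	if (!turbo)
		turbo = turbo_limit & 0xff;	/* <4 cores: use 1C turbo */

	printf("base %u00 MHz, 4C turbo %u00 MHz\n", base, turbo);
	return 0;
}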
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+	 * but then fill all MSRs with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
+ */
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static const struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static struct syscore freq_invariance_syscore = {
+ .ops = &freq_invariance_syscore_ops,
+};
+
+static void register_freq_invariance_syscore(void)
+{
+ register_syscore(&freq_invariance_syscore);
+}
+#else
+static inline void register_freq_invariance_syscore(void) {}
+#endif
+
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
+ register_freq_invariance_syscore();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
+ freq_invariance_enable();
+ }
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ static_branch_disable(&arch_scale_freq_key);
+
+ /*
+ * Set arch_freq_scale to a default value on all cpus
+ * This negates the effect of scaling
+ */
+ for_each_possible_cpu(cpu)
+ per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
+{
+ u64 freq_scale, freq_ratio;
+
+ if (!arch_scale_freq_invariant())
+ return;
+
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
+ goto error;
+
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
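
A worked example of the Q10 fixed-point math in scale_freq_tick(), standalone and with made-up counter deltas. With SCHED_CAPACITY_SHIFT = 10, a result of 1024 means the CPU ran at the reference maximum over the interval:

#include <stdint.h>
#include <stdio.h>

#define SHIFT	10			/* SCHED_CAPACITY_SHIFT */
#define SCALE	(1u << SHIFT)		/* 1024 == running at freq_max */

int main(void)
{
	uint64_t acnt = 2000000;	/* APERF delta over one tick */
	uint64_t mcnt = 2000000;	/* MPERF delta over one tick */
	uint64_t freq_ratio = 1331;	/* freq_max/freq_base ~= 1.3 in Q10 */

	/* freq_scale = (acnt/mcnt) / (freq_ratio/1024), kept in Q10 */
	uint64_t scale = (acnt << (2 * SHIFT)) / (mcnt * freq_ratio);

	if (scale > SCALE)
		scale = SCALE;

	/* busy at exactly base frequency -> 1024*1024/1331 = 787 */
	printf("arch_freq_scale = %llu/%u\n", (unsigned long long)scale, SCALE);
	return 0;
}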
+
+void arch_scale_freq_tick(void)
+{
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
+
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
+
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
+
+ scale_freq_tick(acnt, mcnt);
+}
+
+/*
+ * Discard samples older than the defined maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
+int arch_freq_get_on_cpu(int cpu)
+{
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
+
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
+
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
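
The read side above pairs with the sequence counter written in arch_scale_freq_tick(). Below is a compact standalone model of that writer/reader protocol using C11 atomics; note that a production version needs the acquire/release ordering that the kernel's seqcount API supplies, which this sketch glosses over.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int seq;
static uint64_t acnt, mcnt;

static void writer_tick(uint64_t a, uint64_t m)
{
	atomic_fetch_add(&seq, 1);	/* odd: update in progress */
	acnt = a;
	mcnt = m;
	atomic_fetch_add(&seq, 1);	/* even: update complete */
}

static void reader(uint64_t *a, uint64_t *m)
{
	unsigned int s;

	do {
		while ((s = atomic_load(&seq)) & 1)
			;		/* writer active: wait */
		*a = acnt;
		*m = mcnt;
	} while (atomic_load(&seq) != s);	/* torn read: retry */
}

int main(void)
{
	uint64_t a, m;

	writer_tick(300, 200);
	reader(&a, &m);
	printf("acnt=%llu mcnt=%llu\n",
	       (unsigned long long)a, (unsigned long long)m);
	return 0;
}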
+
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ init_counter_refs(NULL);
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
+
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs(NULL);
+}
diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c
new file mode 100644
index 000000000000..f1a8ca3dd1ed
--- /dev/null
+++ b/arch/x86/kernel/cpu/bhyve.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FreeBSD Bhyve guest enlightenments
+ *
+ * Copyright © 2025 Amazon.com, Inc. or its affiliates.
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+
+static uint32_t bhyve_cpuid_base;
+static uint32_t bhyve_cpuid_max;
+
+#define BHYVE_SIGNATURE "bhyve bhyve "
+
+#define CPUID_BHYVE_FEATURES 0x40000001
+
+/* Features advertised in CPUID_BHYVE_FEATURES %eax */
+
+/* MSI Extended Dest ID */
+#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0)
+
+static uint32_t __init bhyve_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0);
+ if (!bhyve_cpuid_base)
+ return 0;
+
+ bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base);
+ return bhyve_cpuid_max;
+}
+
+static uint32_t bhyve_features(void)
+{
+ unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES;
+
+ if (bhyve_cpuid_max < cpuid_leaf)
+ return 0;
+
+ return cpuid_eax(cpuid_leaf);
+}
+
+static bool __init bhyve_ext_dest_id(void)
+{
+ return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID);
+}
+
+static bool __init bhyve_x2apic_available(void)
+{
+ return true;
+}
+
+const struct hypervisor_x86 x86_hyper_bhyve __refconst = {
+ .name = "Bhyve",
+ .detect = bhyve_detect,
+ .init.init_platform = x86_init_noop,
+ .init.x2apic_available = bhyve_x2apic_available,
+ .init.msi_ext_dest_id = bhyve_ext_dest_id,
+};
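
The detection above rests on the hypervisor CPUID range: leaf 0x40000000 returns the vendor signature in EBX/ECX/EDX and the maximum hypervisor leaf in EAX. A userspace sketch, only meaningful when the hypervisor bit is set, which is why bhyve_detect() checks X86_FEATURE_HYPERVISOR first:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char sig[13] = { 0 };

	__cpuid(0x40000000, eax, ebx, ecx, edx);
	memcpy(sig + 0, &ebx, 4);
	memcpy(sig + 4, &ecx, 4);
	memcpy(sig + 8, &edx, 4);

	printf("signature \"%s\", max hypervisor leaf 0x%x\n", sig, eax);
	printf("bhyve? %s\n", !strcmp(sig, "bhyve bhyve ") ? "yes" : "no");
	return 0;
}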
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb3018..d0a2847a4bb0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1994 Linus Torvalds
*
@@ -8,162 +9,3730 @@
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
-#include <linux/utsname.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/smt.h>
+#include <linux/pgtable.h>
+#include <linux/bpf.h>
+#include <linux/kvm_types.h>
+
+#include <asm/spec-ctrl.h>
+#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
#include <asm/msr.h>
+#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/alternative.h>
+#include <asm/cpu_device_id.h>
+#include <asm/e820/api.h>
+#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
+#include "cpu.h"
+
+/*
+ * Speculation Vulnerability Handling
+ *
+ * Each vulnerability is handled with the following functions:
+ * <vuln>_select_mitigation() -- Selects a mitigation to use. This should
+ * take into account all relevant command line
+ * options.
+ * <vuln>_update_mitigation() -- This is called after all vulnerabilities have
+ * selected a mitigation, in case the selection
+ * may want to change based on other choices
+ * made. This function is optional.
+ * <vuln>_apply_mitigation() -- Enable the selected mitigation.
+ *
+ * The compile-time mitigation in all cases should be AUTO. An explicit
+ * command-line option can override AUTO. If no such option is
+ * provided, <vuln>_select_mitigation() will override AUTO to the best
+ * mitigation option.
+ */
+
+/* The base value of the SPEC_CTRL MSR without task-specific bits set */
+u64 x86_spec_ctrl_base;
+
+/* The current value of the SPEC_CTRL MSR with task-specific bits set */
+DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
-static int __init no_halt(char *s)
+/*
+ * Set when the CPU has run a potentially malicious guest. An IBPB will
+ * be needed before running userspace. That IBPB will flush the branch
+ * predictor content.
+ */
+DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user);
+
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+
+static u64 __ro_after_init x86_arch_cap_msr;
+
+static DEFINE_MUTEX(spec_ctrl_mutex);
+
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
+
+static void __init set_return_thunk(void *thunk)
{
- boot_cpu_data.hlt_works_ok = 0;
- return 1;
+ x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
}
-__setup("no-hlt", no_halt);
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
+/*
+ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
+ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
+ */
+void update_spec_ctrl_cond(u64 val)
+{
+ if (this_cpu_read(x86_spec_ctrl_current) == val)
+ return;
+
+ this_cpu_write(x86_spec_ctrl_current, val);
+
+ /*
+ * With KERNEL_IBRS this MSR is written on return-to-user; unless
+ * forced, the update can be delayed until that time.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
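
The idiom above, updating the per-CPU cache unconditionally but skipping the hardware write when nothing changed, is worth seeing in isolation. A minimal sketch, with a stand-in for the WRMSR:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t cached_val;
	static unsigned int hw_writes;

	/* Stand-in for the expensive WRMSR. */
	static void expensive_write(uint64_t val)
	{
		(void)val;
		hw_writes++;
	}

	/* Skip the hardware write when the cached value already matches. */
	static void write_cond(uint64_t val)
	{
		if (cached_val == val)
			return;
		cached_val = val;
		expensive_write(val);
	}

	int main(void)
	{
		write_cond(1);
		write_cond(1);	/* elided: cache already holds 1 */
		write_cond(2);
		printf("hardware writes: %u (of 3 requests)\n", hw_writes);
		return 0;
	}
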
+
+noinstr u64 spec_ctrl_current(void)
+{
+ return this_cpu_read(x86_spec_ctrl_current);
+}
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+
+/* Control conditional STIBP in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
+
+/* Control CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
-static int __init no_387(char *s)
+/*
+ * Controls whether L1D-flush-based mitigations are enabled, based on
+ * hardware features and the admin setting via the "l1d_flush" boot
+ * parameter; defaults to false.
+ */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "mitigations: " fmt
+
+static void __init cpu_print_attack_vectors(void)
{
- boot_cpu_data.hard_math = 0;
- write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
- return 1;
+ pr_info("Enabled attack vectors: ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ pr_cont("user_kernel, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER))
+ pr_cont("user_user, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST))
+ pr_cont("guest_host, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST))
+ pr_cont("guest_guest, ");
+
+ pr_cont("SMT mitigations: ");
+
+ switch (smt_mitigations) {
+ case SMT_MITIGATIONS_OFF:
+ pr_cont("off\n");
+ break;
+ case SMT_MITIGATIONS_AUTO:
+ pr_cont("auto\n");
+ break;
+ case SMT_MITIGATIONS_ON:
+ pr_cont("on\n");
+ }
}
-__setup("no387", no_387);
+/*
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
+ */
+void
+x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
+{
+ u64 guestval, hostval;
+ struct thread_info *ti = current_thread_info();
+
+ /*
+ * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+ * MSR_AMD64_LS_CFG or MSR_VIRT_SPEC_CTRL if supported.
+ */
+ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ return;
+
+ /*
+ * If the host has SSBD mitigation enabled, force it in the host's
+ * virtual MSR value. If it's not permanently enabled, evaluate
+ * current's TIF_SSBD thread flag.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+ hostval = SPEC_CTRL_SSBD;
+ else
+ hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Sanitize the guest value */
+ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+ if (hostval != guestval) {
+ unsigned long tif;
+
+ tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+ ssbd_spec_ctrl_to_tif(hostval);
+
+ speculation_ctrl_update(tif);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+ u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ wrmsrq(MSR_AMD64_LS_CFG, msrval);
+}
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
+#undef pr_fmt
+#define pr_fmt(fmt) "MDS: " fmt
/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
+ * Returns true if vulnerability should be mitigated based on the
+ * selected attack vector controls.
*
- * We should really only care about bugs here
- * anyway. Not features.
+ * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
*/
-static void __init check_fpu(void)
+static bool __init should_mitigate_vuln(unsigned int bug)
{
- s32 fdiv_bug;
+ switch (bug) {
+ /*
+ * The only runtime-selected spectre_v1 mitigations in the kernel are
+ * related to SWAPGS protection on kernel entry. Therefore, protection
+ * is only required for the user->kernel attack vector.
+ */
+ case X86_BUG_SPECTRE_V1:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL);
+
+ case X86_BUG_SPECTRE_V2:
+ case X86_BUG_RETBLEED:
+ case X86_BUG_L1TF:
+ case X86_BUG_ITS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ case X86_BUG_SPECTRE_V2_USER:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ /*
+ * All the vulnerabilities below allow potentially leaking data
+ * across address spaces. Therefore, mitigation is required for
+ * any of these 4 attack vectors.
+ */
+ case X86_BUG_MDS:
+ case X86_BUG_TAA:
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_RFDS:
+ case X86_BUG_SRBDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ case X86_BUG_GDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
+ (smt_mitigations != SMT_MITIGATIONS_OFF);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER);
+
+ case X86_BUG_VMSCAPE:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ default:
+ WARN(1, "Unknown bug %x\n", bug);
+ return false;
+ }
+}
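
Conceptually, should_mitigate_vuln() intersects each bug's relevant attack vectors with the set the administrator enabled. A self-contained model using a hypothetical bitmask encoding:

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical bit encoding of the four attack vectors. */
	#define VEC_USER_KERNEL	(1u << 0)
	#define VEC_USER_USER	(1u << 1)
	#define VEC_GUEST_HOST	(1u << 2)
	#define VEC_GUEST_GUEST	(1u << 3)

	/* Which vectors are enabled on this boot (would come from the cmdline). */
	static unsigned int enabled_vectors = VEC_USER_KERNEL | VEC_GUEST_HOST;

	/* Mitigate when the bug's relevant vectors intersect the enabled set. */
	static bool should_mitigate(unsigned int relevant)
	{
		return enabled_vectors & relevant;
	}

	int main(void)
	{
		/* Data-sampling bugs such as MDS care about all four vectors. */
		printf("MDS-style bug: %s\n",
		       should_mitigate(VEC_USER_KERNEL | VEC_USER_USER |
				       VEC_GUEST_HOST | VEC_GUEST_GUEST) ?
		       "mitigate" : "skip");
		/* SSB is only selected for the user->user vector. */
		printf("SSB-style bug: %s\n",
		       should_mitigate(VEC_USER_USER) ? "mitigate" : "skip");
		return 0;
	}
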
+
+/* Default mitigation for MDS-affected CPUs */
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
+static bool mds_nosmt __ro_after_init = false;
+
+static const char * const mds_strings[] = {
+ [MDS_MITIGATION_OFF] = "Vulnerable",
+ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+};
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
+/*
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to
+ * userspace *and* on entry to KVM guests.
+ */
+static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
+
+static void __init mds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MDS))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else
+ mds_mitigation = MDS_MITIGATION_OFF;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return;
+
+ /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
+ if (mds_mitigation == MDS_MITIGATION_FULL) {
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ mds_mitigation = MDS_MITIGATION_VMWERV;
+ }
+
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+
+static void __init mds_apply_mitigation(void)
+{
+ if (mds_mitigation == MDS_MITIGATION_FULL ||
+ mds_mitigation == MDS_MITIGATION_VMWERV) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
+ (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init mds_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ mds_mitigation = MDS_MITIGATION_OFF;
+ else if (!strcmp(str, "full"))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else if (!strcmp(str, "full,nosmt")) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mds", mds_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static bool __init taa_vulnerable(void)
+{
+ return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM);
+}
+
+static void __init taa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ return;
+ }
+
+ /* Microcode will be checked in taa_update_mitigation(). */
+ if (taa_mitigation == TAA_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_TAA))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_OFF;
+ }
+
+ if (taa_mitigation != TAA_MITIGATION_OFF)
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init taa_update_mitigation(void)
+{
+ if (!taa_vulnerable())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ taa_mitigation = TAA_MITIGATION_VERW;
+
+ if (taa_mitigation == TAA_MITIGATION_VERW) {
+ /* Check if the requisite ucode is available. */
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
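
The microcode check above reduces to a small predicate over two enumeration bits. A self-contained model, where the bit positions are stand-ins rather than the real ARCH_CAP_* encodings:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in bit positions; the real ARCH_CAP_* values are defined elsewhere. */
	#define CAP_MDS_NO		(1ull << 0)
	#define CAP_TSX_CTRL_MSR	(1ull << 1)

	/* VERW-based TAA mitigation needs new ucode on MDS_NO parts without TSX_CTRL. */
	static bool taa_needs_ucode(uint64_t arch_cap, bool has_md_clear)
	{
		if (!has_md_clear)
			return true;
		return (arch_cap & CAP_MDS_NO) && !(arch_cap & CAP_TSX_CTRL_MSR);
	}

	int main(void)
	{
		printf("%d\n", taa_needs_ucode(CAP_MDS_NO, true));			/* 1: ucode needed */
		printf("%d\n", taa_needs_ucode(CAP_MDS_NO | CAP_TSX_CTRL_MSR, true));	/* 0: ok */
		return 0;
	}
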
+
+static void __init taa_apply_mitigation(void)
+{
+ if (taa_mitigation == TAA_MITIGATION_VERW ||
+ taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) {
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on the host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+
+ if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ /* Microcode will be checked in mmio_update_mitigation(). */
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mmio_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
+ if (mmio_mitigation == MMIO_MITIGATION_VERW) {
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))))
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init mmio_apply_mitigation(void)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ /*
+ * Only enable the MMIO-specific VMM mitigation if the full CPU buffer
+ * clear mitigation is not already in use.
+ */
+ if (verw_clear_cpu_buf_mitigation_selected) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO);
+ }
+
+ /*
+ * If the Processor MMIO Stale Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&cpu_buf_idle_clear);
+
+ if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
+ cpu_smt_disable(false);
+}
+
+static int __init mmio_stale_data_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static inline bool __init verw_clears_cpu_reg_file(void)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR);
+}
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_RFDS))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ else
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ }
+
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (verw_clears_cpu_reg_file())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init rfds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ if (!verw_clears_cpu_reg_file())
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static void __init rfds_apply_mitigation(void)
+{
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ }
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_AUTO,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+ /*
+ * An MDS_NO CPU for which the SRBDS mitigation is not needed (TSX is
+ * disabled) may not have received the microcode that adds the SRBDS MSR.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
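
update_srbds_msr() is a read-modify-write of MSR_IA32_MCU_OPT_CTRL. The same register can be inspected from user space through the msr driver; a hedged sketch follows, where the 0x123 MSR address and the bit-0 position of RNGDS_MITG_DIS are assumptions to verify against the SDM:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define MCU_OPT_CTRL	0x123		/* MSR_IA32_MCU_OPT_CTRL (assumed) */
	#define RNGDS_MITG_DIS	(1ull << 0)	/* SRBDS mitigation disable bit (assumed) */

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* needs the msr module + root */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (pread(fd, &val, sizeof(val), MCU_OPT_CTRL) != sizeof(val)) {
			perror("pread");
			return 1;
		}
		printf("SRBDS mitigation %s\n",
		       (val & RNGDS_MITG_DIS) ? "disabled" : "enabled");
		close(fd);
		return 0;
	}
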
+
+static void __init srbds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS)) {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRBDS))
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
+ else {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+ }
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+ * are only exposed to SRBDS when TSX is enabled or when CPU is affected
+ * by Processor MMIO Stale Data vulnerability.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static void __init srbds_apply_mitigation(void)
+{
+ update_srbds_msr();
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1D Flush : " fmt
+
+enum l1d_flush_mitigations {
+ L1D_FLUSH_OFF = 0,
+ L1D_FLUSH_ON,
+};
- if (!boot_cpu_data.hard_math) {
-#ifndef CONFIG_MATH_EMULATION
- printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
- printk(KERN_EMERG "Giving up.\n");
- for (;;) ;
+static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF;
+
+static void __init l1d_flush_select_mitigation(void)
+{
+ if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ return;
+
+ static_branch_enable(&switch_mm_cond_l1d_flush);
+ pr_info("Conditional flush on switch_mm() enabled\n");
+}
+
+static int __init l1d_flush_parse_cmdline(char *str)
+{
+ if (!strcmp(str, "on"))
+ l1d_flush_mitigation = L1D_FLUSH_ON;
+
+ return 0;
+}
+early_param("l1d_flush", l1d_flush_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_AUTO,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF;
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ case GDS_MITIGATION_AUTO:
+ return;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
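
The WARN_ON_ONCE() readback above matters because on a GDS-locked part the WRMSR is silently ignored, so the only way to detect a dropped write is to read the register back. The pattern in the abstract:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t fake_reg;
	static bool reg_locked;	/* models a part that ignores writes */

	static void reg_write(uint64_t val)
	{
		if (!reg_locked)
			fake_reg = val;
	}

	/* Write, then read back and report whether the value actually stuck. */
	static bool write_verified(uint64_t val)
	{
		reg_write(val);
		return fake_reg == val;
	}

	int main(void)
	{
		reg_locked = true;
		if (!write_verified(0x10))
			fprintf(stderr, "write ignored: register is locked\n");
		return 0;
	}
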
+
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ return;
+ }
+
+ /* Will verify below that mitigation _can_ be disabled */
+ if (gds_mitigation == GDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_GDS))
+ gds_mitigation = GDS_MITIGATION_FULL;
+ else
+ gds_mitigation = GDS_MITIGATION_OFF;
+ }
+
+ /* No microcode */
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation != GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ return;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+ * The mitigation is selected from the boot CPU. All other CPUs
+ * _should_ have the same state. If the boot CPU isn't locked
+ * but others are, then update_gds_msr() will WARN() about the state
+ * mismatch. If the boot CPU is locked update_gds_msr() will
+ * ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+}
+
+static void __init gds_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ /* Microcode is present */
+ if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)
+ update_gds_msr();
+ else if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU, so do it
+ * here rather than in update_gds_msr().
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ }
+
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V1 : " fmt
+
+enum spectre_v1_mitigation {
+ SPECTRE_V1_MITIGATION_NONE,
+ SPECTRE_V1_MITIGATION_AUTO,
+};
+
+static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
+
+static const char * const spectre_v1_strings[] = {
+ [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
+ [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization",
+};
+
+/*
+ * Does SMAP provide full mitigation against speculative kernel access to
+ * userspace?
+ */
+static bool smap_works_speculatively(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ /*
+ * On CPUs which are vulnerable to Meltdown, SMAP does not
+ * prevent speculative access to user data in the L1 cache.
+ * Consider SMAP to be non-functional as a mitigation on these
+ * CPUs.
+ */
+ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN))
+ return false;
+
+ return true;
+}
+
+static void __init spectre_v1_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+}
+
+static void __init spectre_v1_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ return;
+
+ if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
+ /*
+ * With Spectre v1, a user can speculatively control either
+ * path of a conditional swapgs with a user-controlled GS
+ * value. The mitigation is to add lfences to both code paths.
+ *
+ * If FSGSBASE is enabled, the user can put a kernel address in
+ * GS, in which case SMAP provides no protection.
+ *
+ * If FSGSBASE is disabled, the user can only put a user space
+ * address in GS. That makes an attack harder, but still
+ * possible if there's no SMAP protection.
+ */
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ !smap_works_speculatively()) {
+ /*
+ * Mitigation can be provided from SWAPGS itself or
+ * PTI as the CR3 write in the Meltdown mitigation
+ * is serializing.
+ *
+ * If neither is there, mitigate with an LFENCE to
+ * stop speculation through swapgs.
+ */
+ if (boot_cpu_has_bug(X86_BUG_SWAPGS) &&
+ !boot_cpu_has(X86_FEATURE_PTI))
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER);
+
+ /*
+ * Enable lfences in the kernel entry (non-swapgs)
+ * paths, to prevent user entry from speculatively
+ * skipping swapgs.
+ */
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL);
+ }
+ }
+
+ pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+}
+
+static int __init nospectre_v1_cmdline(char *str)
+{
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return 0;
+}
+early_param("nospectre_v1", nospectre_v1_cmdline);
+
+enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
+
+/* Depends on spectre_v2 mitigation selected already */
+static inline bool cdt_possible(enum spectre_v2_mitigation mode)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE))
+ return false;
+
+ if (mode == SPECTRE_V2_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE)
+ return true;
+
+ return false;
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RETBleed: " fmt
+
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_AUTO,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+ ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static enum its_mitigation its_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF;
+
+enum retbleed_mitigation {
+ RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_AUTO,
+ RETBLEED_MITIGATION_UNRET,
+ RETBLEED_MITIGATION_IBPB,
+ RETBLEED_MITIGATION_IBRS,
+ RETBLEED_MITIGATION_EIBRS,
+ RETBLEED_MITIGATION_STUFF,
+};
+
+static const char * const retbleed_strings[] = {
+ [RETBLEED_MITIGATION_NONE] = "Vulnerable",
+ [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk",
+ [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
+ [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+ [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing",
+};
+
+static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE;
+
+static int __ro_after_init retbleed_nosmt = false;
+
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_NOSMT,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
+static int __init retbleed_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "off")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (!strcmp(str, "auto")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!strcmp(str, "unret")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ } else if (!strcmp(str, "ibpb")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ } else if (!strcmp(str, "stuff")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ } else if (!strcmp(str, "nosmt")) {
+ retbleed_nosmt = true;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ } else {
+ pr_err("Ignoring unknown retbleed option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("retbleed", retbleed_parse_cmdline);
+
+#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
+#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
+
+static void __init retbleed_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
+ }
+ break;
+ case RETBLEED_MITIGATION_IBPB:
+ if (!boot_cpu_has(X86_FEATURE_IBPB)) {
+ pr_err("WARNING: CPU does not support IBPB.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ case RETBLEED_MITIGATION_STUFF:
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
+ /* Intel mitigation selected in retbleed_update_mitigation() */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ /* Final mitigation depends on spectre-v2 selection */
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ else if (boot_cpu_has(X86_FEATURE_IBRS))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+}
+
+static void __init retbleed_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED))
+ return;
+
+ /* ITS can also enable stuffing */
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF)
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+
+ /* If SRSO is using IBPB, that works for retbleed too */
+ if (srso_mitigation == SRSO_MITIGATION_IBPB)
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF &&
+ !cdt_possible(spectre_v2_enabled)) {
+ pr_err("WARNING: retbleed=stuff depends on retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option except for call depth based stuffing
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_NONE)
+ pr_err(RETBLEED_INTEL_MSG);
+
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+ }
+ }
+
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static void __init retbleed_apply_mitigation(void)
+{
+ bool mitigate_smt = false;
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_NONE:
+ return;
+
+ case RETBLEED_MITIGATION_UNRET:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ set_return_thunk(retbleed_return_thunk);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ pr_err(RETBLEED_UNTRAIN_MSG);
+
+ mitigate_smt = true;
+ break;
+
+ case RETBLEED_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like SRSO has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
+ break;
+
+ case RETBLEED_MITIGATION_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+
+ set_return_thunk(call_depth_return_thunk);
+ break;
+
+ default:
+ break;
+ }
+
+ if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
+ (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
+ cpu_smt_disable(false);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+ [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ } else if (!strcmp(str, "force")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ } else if (!strcmp(str, "stuff")) {
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS)) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_ITS))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ else
+ its_mitigation = ITS_MITIGATION_OFF;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_OFF)
+ return;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("RSB stuff mitigation not supported, using default\n");
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY &&
+ !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+}
+
+static void __init its_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ return;
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ if (its_mitigation != ITS_MITIGATION_OFF)
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ /* Retpoline+CDT mitigates ITS */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ break;
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ default:
+ break;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !cdt_possible(spectre_v2_enabled))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+static void __init its_apply_mitigation(void)
+{
+ switch (its_mitigation) {
+ case ITS_MITIGATION_OFF:
+ case ITS_MITIGATION_AUTO:
+ case ITS_MITIGATION_VMEXIT_ONLY:
+ break;
+ case ITS_MITIGATION_ALIGNED_THUNKS:
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_MITIGATION_RETPOLINE_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+ TSA_MITIGATION_NONE,
+ TSA_MITIGATION_AUTO,
+ TSA_MITIGATION_UCODE_NEEDED,
+ TSA_MITIGATION_USER_KERNEL,
+ TSA_MITIGATION_VM,
+ TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+ [TSA_MITIGATION_NONE] = "Vulnerable",
+ [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary",
+ [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM",
+ [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ else if (!strcmp(str, "on"))
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ else if (!strcmp(str, "user"))
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ else if (!strcmp(str, "vm"))
+ tsa_mitigation = TSA_MITIGATION_VM;
+ else
+ pr_err("Ignoring unknown tsa=%s option.\n", str);
+
+ return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
+
+static void __init tsa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TSA)) {
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ return;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_AUTO) {
+ bool vm = false, uk = false;
+
+ tsa_mitigation = TSA_MITIGATION_NONE;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) {
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ uk = true;
+ }
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ tsa_mitigation = TSA_MITIGATION_VM;
+ vm = true;
+ }
+
+ if (uk && vm)
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_NONE)
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
+ tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * No need to set verw_clear_cpu_buf_mitigation_selected: it doesn't
+ * fit all cases here, and it is not needed because TSA is the only
+ * VERW-based mitigation on AMD.
+ */
+ pr_info("%s\n", tsa_strings[tsa_mitigation]);
+}
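
The AUTO branch above composes the final TSA mode from two independent booleans, and the resulting truth table is small enough to check exhaustively:

	#include <stdio.h>

	enum tsa { TSA_NONE, TSA_USER_KERNEL, TSA_VM, TSA_FULL };

	/* uk: user->kernel or user->user mitigated; vm: guest->host or guest->guest. */
	static enum tsa tsa_auto(int uk, int vm)
	{
		if (uk && vm)
			return TSA_FULL;
		if (vm)
			return TSA_VM;
		if (uk)
			return TSA_USER_KERNEL;
		return TSA_NONE;
	}

	int main(void)
	{
		static const char * const names[] = { "none", "user/kernel", "vm", "full" };
		int uk, vm;

		for (uk = 0; uk <= 1; uk++)
			for (vm = 0; vm <= 1; vm++)
				printf("uk=%d vm=%d -> %s\n",
				       uk, vm, names[tsa_auto(uk, vm)]);
		return 0;
	}
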
+
+static void __init tsa_apply_mitigation(void)
+{
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ break;
+ case TSA_MITIGATION_VM:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ case TSA_MITIGATION_FULL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static bool spectre_v2_bad_module;
+
+bool retpoline_module_ok(bool has_retpoline)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
+ return true;
+
+ pr_err("System may be vulnerable to spectre v2\n");
+ spectre_v2_bad_module = true;
+ return false;
+}
+
+static inline const char *spectre_v2_module_string(void)
+{
+ return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
+}
+#else
+static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
+
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE,
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE,
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
+ SPECTRE_V2_CMD_IBRS,
+};
+
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
+
+enum spectre_v2_user_mitigation_cmd {
+ SPECTRE_V2_USER_CMD_NONE,
+ SPECTRE_V2_USER_CMD_AUTO,
+ SPECTRE_V2_USER_CMD_FORCE,
+ SPECTRE_V2_USER_CMD_PRCTL,
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
+ SPECTRE_V2_USER_CMD_SECCOMP,
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
+};
+
+static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+static int __init spectre_v2_user_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "auto"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO;
+ else if (!strcmp(str, "off"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE;
+ else if (!strcmp(str, "on"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE;
+ else if (!strcmp(str, "prctl"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL;
+ else if (!strcmp(str, "prctl,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB;
+ else if (!strcmp(str, "seccomp"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP;
+ else if (!strcmp(str, "seccomp,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB;
+ else
+ pr_err("Ignoring unknown spectre_v2_user option (%s).", str);
+
+ return 0;
+}
+early_param("spectre_v2_user", spectre_v2_user_parse_cmdline);
+
+static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
+}
+
+static void __init spectre_v2_user_select_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ switch (spectre_v2_user_cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ return;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER))
+ break;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ if (smt_mitigations == SMT_MITIGATIONS_OFF)
+ break;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = spectre_v2_user_ibpb;
+ break;
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
}
/*
- * trap_init() enabled FXSR and company _before_ testing for FP
- * problems here.
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+}
+
+static void __init spectre_v2_user_update_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ /* The spectre_v2 cmd line can override spectre_v2_user options */
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ }
+
+ /*
+ * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP
+ * is not required.
*
- * Test for the divl bug..
+ * Intel's Enhanced IBRS also protects against cross-thread branch target
+ * injection in user-mode as the IBRS bit remains always set which
+ * implicitly enables cross-thread protections. However, in legacy IBRS
+ * mode, the IBRS bit is set only on kernel entry and cleared on return
+ * to userspace. AMD Automatic IBRS also does not protect userspace.
+ * These modes therefore disable the implicit cross-thread protection,
+ * so allow for STIBP to be selected in those cases.
*/
- __asm__("fninit\n\t"
- "fldl %1\n\t"
- "fdivl %2\n\t"
- "fmull %2\n\t"
- "fldl %1\n\t"
- "fsubp %%st,%%st(1)\n\t"
- "fistpl %0\n\t"
- "fwait\n\t"
- "fninit"
- : "=m" (*&fdiv_bug)
- : "m" (*&x), "m" (*&y));
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !cpu_smt_possible() ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))) {
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ return;
+ }
+
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE &&
+ (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) {
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT &&
+ spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED)
+ pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+ }
+ pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]);
+}
+
+static void __init spectre_v2_user_apply_mitigation(void)
+{
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) {
+ static_branch_enable(&switch_vcpu_ibpb);
+
+ switch (spectre_v2_user_ibpb) {
+ case SPECTRE_V2_USER_STRICT:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+}
+
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+ [SPECTRE_V2_IBRS] = "Mitigation: IBRS",
+};
+
+static bool nospectre_v2 __ro_after_init;
- boot_cpu_data.fdiv_bug = fdiv_bug;
- if (boot_cpu_data.fdiv_bug)
- printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
+static int __init nospectre_v2_parse_cmdline(char *str)
+{
+ nospectre_v2 = true;
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return 0;
}
+early_param("nospectre_v2", nospectre_v2_parse_cmdline);
-static void __init check_hlt(void)
+static int __init spectre_v2_parse_cmdline(char *str)
{
- if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
+ if (!str)
+ return -EINVAL;
+
+ if (nospectre_v2)
+ return 0;
+
+ if (!strcmp(str, "off")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ } else if (!strcmp(str, "on")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_FORCE;
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ } else if (!strcmp(str, "retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE;
+ } else if (!strcmp(str, "retpoline,amd") ||
+ !strcmp(str, "retpoline,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE;
+ } else if (!strcmp(str, "retpoline,generic")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+ } else if (!strcmp(str, "eibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS;
+ } else if (!strcmp(str, "eibrs,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE;
+ } else if (!strcmp(str, "eibrs,retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE;
+ } else if (!strcmp(str, "auto")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ } else if (!strcmp(str, "ibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_IBRS;
+ } else {
+ pr_err("Ignoring unknown spectre_v2 option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("spectre_v2", spectre_v2_parse_cmdline);
+
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
+static bool __ro_after_init rrsba_disabled;
+
+/* Disable in-kernel use of non-RSB RET predictors */
+static void __init spec_ctrl_disable_kernel_rrsba(void)
+{
+ if (rrsba_disabled)
return;
- printk(KERN_INFO "Checking 'hlt' instruction... ");
- if (!boot_cpu_data.hlt_works_ok) {
- printk("disabled\n");
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
return;
}
- halt();
- halt();
- halt();
- halt();
- printk(KERN_CONT "OK.\n");
+
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
+}
+
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
+{
+ /*
+ * WARNING! There are many subtleties to consider when changing *any*
+ * code related to RSB-related mitigations. Before doing so, carefully
+ * read the following document, and update if necessary:
+ *
+ * Documentation/admin-guide/hw-vuln/rsb.rst
+ *
+ * In an overly simplified nutshell:
+ *
+ * - User->user RSB attacks are conditionally mitigated during
+ * context switches by cond_mitigation -> write_ibpb().
+ *
+ * - User->kernel and guest->host attacks are mitigated by eIBRS or
+ * RSB filling.
+ *
+ * Note that, depending on the config, other alternative mitigations
+ * may end up being used instead, e.g., IBPB on entry/vmexit, call
+ * depth tracking, or return thunks.
+ */
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ break;
+
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
+ }
+ break;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
}
/*
- * Most 386 processors have a bug where a POPAD can lock the
- * machine even from user space.
+ * Set BHI_DIS_S to prevent indirect branches in the kernel from being
+ * influenced by branch history in userspace. Not needed if BHI_NO is set.
*/
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_AUTO,
+ BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (bhi_mitigation != BHI_MITIGATION_AUTO)
+ return;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ } else {
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ }
+}
+
+static void __init bhi_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+}
+
+static void __init bhi_apply_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
-static void __init check_popad(void)
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+ return;
+ }
+
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+}
+
+static void __init spectre_v2_select_mitigation(void)
+{
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
+ pr_err("IBRS selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
+ pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return;
+ }
+
+ switch (spectre_v2_cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return;
+
+ case SPECTRE_V2_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2))
+ break;
+ fallthrough;
+ case SPECTRE_V2_CMD_FORCE:
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
+ break;
+ }
+
+ spectre_v2_enabled = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ spectre_v2_enabled = SPECTRE_V2_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+ spectre_v2_enabled = SPECTRE_V2_RETPOLINE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE:
+ spectre_v2_enabled = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_IBRS:
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE;
+ break;
+ }
+}
+
+static void __init spectre_v2_update_mitigation(void)
{
-#ifndef CONFIG_X86_POPAD_OK
- int res, inp = (int) &res;
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO &&
+ !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_mitigation != RETBLEED_MITIGATION_NONE &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ }
+ }
+
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
+
+static void __init spectre_v2_apply_mitigation(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
+ msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+ }
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ return;
+
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_IBRS:
+ setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ fallthrough;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
+ }
- printk(KERN_INFO "Checking for popad bug... ");
- __asm__ __volatile__(
- "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
- : "=&a" (res)
- : "d" (inp)
- : "ecx", "edi");
/*
- * If this fails, it means that any user program may lock the
- * CPU hard. Too bad.
+ * Disable alternate RSB predictions in kernel when indirect CALLs and
+ * JMPs get protection against BHI and Intramode-BTI, but RET
+ * prediction from a non-RSB predictor is still a risk.
*/
- if (res != 12345678)
- printk(KERN_CONT "Buggy.\n");
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE ||
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE ||
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE)
+ spec_ctrl_disable_kernel_rrsba();
+
+ spectre_v2_select_rsb_mitigation(spectre_v2_enabled);
+
+ /*
+ * Retpoline protects the kernel, but doesn't protect firmware. IBRS
+ * and Enhanced IBRS protect firmware too, so enable IBRS around
+ * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
+ * otherwise enabled.
+ *
+ * Use "spectre_v2_enabled" to check Enhanced IBRS instead of
+ * boot_cpu_has(), because the user might select retpoline on the kernel
+ * command line, and if the CPU supports Enhanced IBRS the kernel might
+ * unintentionally not enable IBRS around firmware calls.
+ */
+ if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ boot_cpu_has(X86_FEATURE_IBPB) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
+
+ if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
+ pr_info("Enabling Speculation Barrier for firmware calls\n");
+ }
+
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) &&
+ !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+ pr_info("Enabling Restricted Speculation for firmware calls\n");
+ }
+}
+
+static void update_stibp_msr(void * __unused)
+{
+ u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
+ update_spec_ctrl(val);
+}
+
+/* Update x86_spec_ctrl_base in case SMT state changed. */
+static void update_stibp_strict(void)
+{
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+
+ if (sched_smt_active())
+ mask |= SPEC_CTRL_STIBP;
+
+ if (mask == x86_spec_ctrl_base)
+ return;
+
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
+static void update_indir_branch_cond(void)
+{
+ if (sched_smt_active())
+ static_branch_enable(&switch_to_cond_stibp);
else
- printk(KERN_CONT "OK.\n");
+ static_branch_disable(&switch_to_cond_stibp);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+/* Update the static key controlling the MDS CPU buffer clear in idle */
+static void update_mds_branch_idle(void)
+{
+ /*
+ * Enable the idle clearing if SMT is active on CPUs which are
+ * affected only by MSBDS and not any other MDS variant.
+ *
+ * The other variants cannot be mitigated when SMT is enabled, so
+ * clearing the buffers on idle just to prevent the Store Buffer
+ * repartitioning leak would be a window dressing exercise.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
+ return;
+
+ if (sched_smt_active()) {
+ static_branch_enable(&cpu_buf_idle_clear);
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
+ static_branch_disable(&cpu_buf_idle_clear);
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE;
+
+static const char * const ssb_strings[] = {
+ [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
+ [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
+ [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
+ [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+static bool nossb __ro_after_init;
+
+static int __init nossb_parse_cmdline(char *str)
+{
+ nossb = true;
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return 0;
+}
+early_param("nospec_store_bypass_disable", nossb_parse_cmdline);
+
+static int __init ssb_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (nossb)
+ return 0;
+
+ if (!strcmp(str, "auto"))
+ ssb_mode = SPEC_STORE_BYPASS_AUTO;
+ else if (!strcmp(str, "on"))
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
+ else if (!strcmp(str, "off"))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ else if (!strcmp(str, "prctl"))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else if (!strcmp(str, "seccomp"))
+ ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ?
+ SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL;
+ else
+ pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n",
+ str);
+
+ return 0;
+}
+early_param("spec_store_bypass_disable", ssb_parse_cmdline);
+
+static void __init ssb_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) {
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return;
+ }
+
+ if (ssb_mode == SPEC_STORE_BYPASS_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_SSBD))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+static void __init ssb_apply_mitigation(void)
+{
+ /*
+ * We have three CPU feature flags that are in play here:
+ * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+ * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+ * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+ */
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) {
+ setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+ /*
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+ * use a completely different MSR and bit dependent on family.
+ */
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_amd_ssb_disable();
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculation prctl: " fmt
+
+static void task_update_spec_tif(struct task_struct *tsk)
+{
+ /* Force the update of the real TIF bits */
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+
+ /*
+ * Immediately update the speculation control MSRs for the current
+ * task, but for a non-current task delay setting the CPU
+ * mitigation until it is scheduled next.
+ *
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
+ * always the current task.
+ */
+ if (tsk == current)
+ speculation_ctrl_update_current();
+}
+
+static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return -EPERM;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ case PR_SPEC_DISABLE:
+ clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ default:
+ return -ERANGE;
+ }
+}
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+ ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
+ return -ENXIO;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* If speculation is force disabled, enable is not allowed */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_clear_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_force_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE_NOEXEC:
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return 0;
+
+ /*
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
+ */
+ if (!is_spec_ib_user_controlled() ||
+ task_spec_ib_force_disable(task))
+ return -EPERM;
+
+ task_clear_spec_ib_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ case PR_SPEC_FORCE_DISABLE:
+ /*
+ * Indirect branch speculation is always allowed when
+ * mitigation is force disabled.
+ */
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return -EPERM;
+
+ if (!is_spec_ib_user_controlled())
+ return 0;
+
+ task_set_spec_ib_disable(task);
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
+ task_set_spec_ib_force_disable(task);
+ task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_set(task, ctrl);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_set(task, ctrl);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
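+
+/*
+ * A minimal userspace sketch (not kernel code) of how this handler is
+ * reached, via the speculation-control prctl:
+ *
+ *   prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
+ *         PR_SPEC_DISABLE, 0, 0);
+ *
+ * which lands in ssb_prctl_set() above for the calling task.
+ */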
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+ if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+ ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
#endif
+
+static int l1d_flush_prctl_get(struct task_struct *task)
+{
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return PR_SPEC_FORCE_DISABLE;
+
+ if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH))
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ else
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+}
+
+static int ssb_prctl_get(struct task_struct *task)
+{
+ switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_NONE:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
+ case SPEC_STORE_BYPASS_DISABLE:
+ return PR_SPEC_DISABLE;
+ case SPEC_STORE_BYPASS_SECCOMP:
+ case SPEC_STORE_BYPASS_PRCTL:
+ case SPEC_STORE_BYPASS_AUTO:
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ssb_noexec(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ }
+ BUG();
}
+static int ib_prctl_get(struct task_struct *task)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ return PR_SPEC_NOT_AFFECTED;
+
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return PR_SPEC_ENABLE;
+ else if (is_spec_ib_user_controlled()) {
+ if (task_spec_ib_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ib_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
+ return PR_SPEC_NOT_AFFECTED;
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_get(task);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_get(task);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
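+
+/*
+ * The matching query side, again as a userspace sketch:
+ *
+ *   int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
+ *                     0, 0, 0);
+ *
+ * which returns the PR_SPEC_* flags assembled by ib_prctl_get() above.
+ */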
+
+void x86_spec_ctrl_setup_ap(void)
+{
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ update_spec_ctrl(x86_spec_ctrl_base);
+
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+ x86_amd_ssb_disable();
+}
+
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
+
/*
- * Check whether we are able to run this kernel safely on SMP.
+ * These CPUs all support a 44-bit physical address space internally in the
+ * cache, but CPUID can report a smaller number of physical address bits.
*
- * - In order to run on a i386, we need to be compiled for i386
- * (for due to lack of "invlpg" and working WP on a i386)
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
+ * The L1TF mitigation uses the topmost address bit for the inversion of
+ * non-present PTEs. When the installed memory reaches into the topmost
+ * address bit due to memory holes (observed on machines which report 36
+ * physical address bits but have 32G RAM installed), the mitigation range
+ * check in l1tf_select_mitigation() triggers. This is a false positive
+ * because the mitigation is still possible: the cache uses 44 bits
+ * internally. Use the cache bits instead of the reported physical bits,
+ * and adjust them to 44 on the affected machines if the reported bits are
+ * less than 44.
*/
+static void override_cache_bits(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 6)
+ return;
-static void __init check_config(void)
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ if (c->x86_cache_bits < 44)
+ c->x86_cache_bits = 44;
+ break;
+ }
+}
+
+static void __init l1tf_select_mitigation(void)
{
-/*
- * We'd better not be a i386 if we're configured to use some
- * i486+ only features! (WP works in supervisor mode and the
- * new "invlpg" and "bswap" instructions)
- */
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
- defined(CONFIG_X86_BSWAP)
- if (boot_cpu_data.x86 == 3)
- panic("Kernel requires i486+ for 'invlpg' and other features");
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (l1tf_mitigation != L1TF_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (smt_mitigations == SMT_MITIGATIONS_ON)
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+}
+
+static void __init l1tf_apply_mitigation(void)
+{
+ u64 half_pa;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
+ override_cache_bits(&boot_cpu_data);
+
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_AUTO:
+ break;
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ cpu_smt_disable(false);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ cpu_smt_disable(true);
+ break;
+ }
+
+#if CONFIG_PGTABLE_LEVELS == 2
+ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+ return;
#endif
+
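+ /*
+ * Worked example, assuming the 44 cache bits set up above:
+ * l1tf_pfn_limit() << PAGE_SHIFT == 1ULL << 43, i.e. half of the
+ * 2^44 byte physical address space, or 8 TiB.
+ */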
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
+ e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+ pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
+ half_pa);
+ pr_info("However, doing so will make a part of your RAM unusable.\n");
+ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
+ return;
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
}
+early_param("l1tf", l1tf_cmdline);
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
-void __init check_bugs(void)
+static const char * const srso_strings[] = {
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
+ [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
+};
+
+static int __init srso_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ else if (!strcmp(str, "microcode"))
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ else if (!strcmp(str, "safe-ret"))
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ else if (!strcmp(str, "ibpb"))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ else if (!strcmp(str, "ibpb-vmexit"))
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ else
+ pr_err("Ignoring unknown SRSO option (%s).", str);
+
+ return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+
+ if (srso_mitigation == SRSO_MITIGATION_AUTO) {
+ /*
+ * Use safe-RET if user->kernel or guest->host protection is
+ * required. Otherwise the 'microcode' mitigation is sufficient
+ * to protect the user->user and guest->guest vectors.
+ */
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) &&
+ !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) {
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ } else {
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+ }
+
+ /* Zen1/2 with SMT off aren't vulnerable to SRSO. */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ srso_mitigation = SRSO_MITIGATION_NOSMT;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+
+ /*
+ * Safe-RET provides partial protection even without the
+ * microcode, but the other mitigations require the microcode
+ * to have any effect at all.
+ */
+ if (srso_mitigation == SRSO_MITIGATION_SAFE_RET)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ else
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ goto ibpb_on_vmexit;
+ }
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+ break;
+ibpb_on_vmexit:
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+ fallthrough;
+ case SRSO_MITIGATION_IBPB:
+ if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void __init srso_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO))
+ return;
+
+ /* If retbleed is using IBPB, that works for SRSO as well */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ pr_info("%s\n", srso_strings[srso_mitigation]);
+}
+
+static void __init srso_apply_mitigation(void)
+{
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ set_return_thunk(srso_alias_return_thunk);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ set_return_thunk(srso_return_thunk);
+ }
+ break;
+ case SRSO_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining, so clear those flags in case some
+ * other mitigation such as Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+ fallthrough;
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "VMSCAPE: " fmt
+
+enum vmscape_mitigations {
+ VMSCAPE_MITIGATION_NONE,
+ VMSCAPE_MITIGATION_AUTO,
+ VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER,
+ VMSCAPE_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+static const char * const vmscape_strings[] = {
+ [VMSCAPE_MITIGATION_NONE] = "Vulnerable",
+ /* [VMSCAPE_MITIGATION_AUTO] */
+ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace",
+ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT",
+};
+
+static enum vmscape_mitigations vmscape_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE;
+
+static int __init vmscape_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ } else if (!strcmp(str, "ibpb")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+ vmscape_mitigation = VMSCAPE_MITIGATION_AUTO;
+ } else {
+ pr_err("Ignoring unknown vmscape=%s option.\n", str);
+ }
+
+ return 0;
+}
+early_param("vmscape", vmscape_parse_cmdline);
+
+static void __init vmscape_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
+ !boot_cpu_has(X86_FEATURE_IBPB)) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ return;
+ }
+
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_VMSCAPE))
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ else
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ }
+}
+
+static void __init vmscape_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE))
+ return;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB ||
+ srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT)
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT;
+
+ pr_info("%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static void __init vmscape_apply_mitigation(void)
+{
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER)
+ setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
+#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n"
+
+void cpu_bugs_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_AUTO:
+ case TSA_MITIGATION_FULL:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ break;
+ }
+
+ switch (vmscape_mitigation) {
+ case VMSCAPE_MITIGATION_NONE:
+ case VMSCAPE_MITIGATION_AUTO:
+ break;
+ case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT:
+ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER:
+ /*
+ * Hypervisors can be attacked across SMT threads; warn for SMT
+ * when STIBP is not already enabled system-wide.
+ *
+ * Intel eIBRS (!AUTOIBRS) implies STIBP on.
+ */
+ if (!sched_smt_active() ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ break;
+ pr_warn_once(VMSCAPE_MSG_SMT);
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
+void __init cpu_select_mitigations(void)
+{
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /*
+ * A previously running kernel (kexec) may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ cpu_print_attack_vectors();
+
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
+ spectre_v2_select_mitigation();
+ retbleed_select_mitigation();
+ spectre_v2_user_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
+ srbds_select_mitigation();
+ l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
+ tsa_select_mitigation();
+ vmscape_select_mitigation();
+
+ /*
+ * After mitigations are selected, some may need to update their
+ * choices.
+ */
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+ vmscape_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
+ tsa_apply_mitigation();
+ vmscape_apply_mitigation();
+}
+
+#ifdef CONFIG_SYSFS
+
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char * const l1tf_vmx_states[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = "auto",
+ [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
+ [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
+ [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+ (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+ sched_smt_active())) {
+ return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX))
+ return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n");
+ else if (!(cr4_read_shadow() & X86_CR4_VMXE))
+ return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n");
+ else if (itlb_multihit_kvm_mitigation)
+ return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sysfs_emit(buf, "KVM: Vulnerable\n");
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sysfs_emit(buf, "Processor vulnerable\n");
+}
+#endif
+
+static ssize_t mds_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
+ }
+
+ if (boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) {
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t mmio_stale_data_show_state(char *buf)
{
- identify_boot_cpu();
-#ifndef CONFIG_SMP
- printk(KERN_INFO "CPU: ");
- print_cpu_info(&boot_cpu_data);
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static ssize_t old_microcode_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return sysfs_emit(buf, "Unknown: running under hypervisor");
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
+static char *stibp_state(void)
+{
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ return "";
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ return "; STIBP: disabled";
+ case SPECTRE_V2_USER_STRICT:
+ return "; STIBP: forced";
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ return "; STIBP: always-on";
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (static_key_enabled(&switch_to_cond_stibp))
+ return "; STIBP: conditional";
+ }
+ return "";
+}
+
+static char *ibpb_state(void)
+{
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ if (static_key_enabled(&switch_mm_always_ibpb))
+ return "; IBPB: always-on";
+ if (static_key_enabled(&switch_mm_cond_ibpb))
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
+ }
+ return "";
+}
+
+static char *pbrsb_eibrs_state(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
+ boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
+ return "; PBRSB-eIBRS: SW sequence";
+ else
+ return "; PBRSB-eIBRS: Vulnerable";
+ } else {
+ return "; PBRSB-eIBRS: Not affected";
+ }
+}
+
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
+ spectre_v2_module_string());
+}
+
+static ssize_t srbds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static ssize_t retbleed_show_state(char *buf)
+{
+ if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
+
+ return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
+ }
+
+ return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static ssize_t srso_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
+static ssize_t tsa_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static ssize_t vmscape_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+ char *buf, unsigned int bug)
+{
+ if (!boot_cpu_has_bug(bug))
+ return sysfs_emit(buf, "Not affected\n");
+
+ switch (bug) {
+ case X86_BUG_CPU_MELTDOWN:
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ return sysfs_emit(buf, "Mitigation: PTI\n");
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV))
+ return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+
+ break;
+
+ case X86_BUG_SPECTRE_V1:
+ return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+
+ case X86_BUG_SPECTRE_V2:
+ return spectre_v2_show_state(buf);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]);
+
+ case X86_BUG_L1TF:
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
+
+ case X86_BUG_MDS:
+ return mds_show_state(buf);
+
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
+ case X86_BUG_MMIO_STALE_DATA:
+ return mmio_stale_data_show_state(buf);
+
+ case X86_BUG_RETBLEED:
+ return retbleed_show_state(buf);
+
+ case X86_BUG_SRSO:
+ return srso_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
+ case X86_BUG_OLD_MICROCODE:
+ return old_microcode_show_state(buf);
+
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
+ case X86_BUG_TSA:
+ return tsa_show_state(buf);
+
+ case X86_BUG_VMSCAPE:
+ return vmscape_show_state(buf);
+
+ default:
+ break;
+ }
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
+
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
+
+ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
+}
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
+
+ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE);
+}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
+
+ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE);
+}
#endif
- check_config();
- check_fpu();
- check_hlt();
- check_popad();
- init_utsname()->machine[1] =
- '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
- alternative_instructions();
+
+void __warn_thunk(void)
+{
+ WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index 04f0fe5af83e..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- * Copyright (C) 2000 SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-#if !defined(CONFIG_SMP)
- printk(KERN_INFO "CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
new file mode 100644
index 000000000000..dbc99a47be45
--- /dev/null
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+ sld_ratelimit,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detection, unless there is a command-line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
+
+/*
+ * With a name like MSR_TEST_CTRL it should go without saying, but don't touch
+ * MSR_TEST_CTRL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
+};
+
+static struct ratelimit_state bld_ratelimit;
+
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static const struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
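+
+/*
+ * The knob registered above is then reachable at runtime, e.g.:
+ *
+ *   echo 0 > /proc/sys/kernel/split_lock_mitigate
+ *
+ * 0 warns only; 1 (the default here) also applies the slowdown mitigation.
+ */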
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
+}
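+
+/*
+ * Illustrative boot parameters accepted by the table and parser above:
+ *
+ *   split_lock_detect=warn          # the default on capable hardware
+ *   split_lock_detect=ratelimit:10  # allow at most 10 bus locks/sec
+ */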
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrq_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrq(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init sld_state_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+ sld_state = state;
+}
+
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ /* Restore the MSR to its cached value. */
+ wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrq(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+void split_lock_init(void)
+{
+ /*
+	 * The #DB handler for bus lock deals with the ratelimit case; #AC
+	 * for split lock is disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void __split_lock_reenable_unlock(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, the delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU it probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
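+/*
+ * Warn once per offending task, optionally apply the "misery" mitigation,
+ * then disable split lock detection on this CPU so the instruction can
+ * complete, and schedule delayed work (2 jiffies) to re-enable it.
+ */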
+static void split_lock_warn(unsigned long ip)
+{
+ struct delayed_work *work;
+ int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
+
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ if (saved_sld_mitigate) {
+ /*
+		 * Misery factor #1:
+		 * sleep 10ms before trying to execute the split lock again.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+		 * only allow one core at a time to run with split lock
+		 * detection disabled.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ }
+
+ cpu = get_cpu();
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
+ schedule_delayed_work_on(cpu, work, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
+ sld_update_msr(false);
+ put_cpu();
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
+
+void bus_lock_init(void)
+{
+ u64 val;
+
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, val);
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+		 * The warn and fatal modes are handled by #AC when #AC for
+		 * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
+/*
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ {}
+};
+
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /* Check for CPUs that have support but do not enumerate it: */
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (m)
+ goto supported;
+
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
+ cpu_model_supports_sld = true;
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
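
A quick way to exercise the new paths from userspace: the sketch below (not
part of the patch; buf and p are illustrative names) performs a LOCK-prefixed
increment that straddles a 64-byte cache line, the access pattern that
split_lock_warn() reports. The misaligned cast is deliberate and formally
undefined behavior. Compiled with gcc and run with split_lock_detect=warn it
should trigger the "#AC: ... took a split_lock trap" message above; with
ratelimit:N the same access should instead go through the #DB throttling in
handle_bus_lock().

    #include <stdint.h>
    #include <stdio.h>

    static _Alignas(64) unsigned char buf[128];

    int main(void)
    {
            /* A 4-byte counter at offset 62 straddles the first 64-byte line. */
            uint32_t *p = (uint32_t *)(buf + 62);

            for (int i = 0; i < 3; i++)
                    __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST); /* lock addl */

            printf("count = %u\n", *p);
            return 0;
    }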
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
new file mode 100644
index 000000000000..51a95b07831f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 CPU caches detection and configuration
+ *
+ * Previous changes
+ * - Venkatesh Pallipadi: Cache identification through CPUID(0x4)
+ * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure
+ * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
+#include <linux/stop_machine.h>
+
+#include <asm/amd/nb.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/cpuid/api.h>
+#include <asm/mtrr.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+
+#include "cpu.h"
+
+/* Shared last level cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+
+/* Shared L2 cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
+static cpumask_var_t cpu_cacheinfo_mask;
+
+/* Kernel controls MTRR and/or PAT MSRs. */
+unsigned int memory_caching_control __ro_after_init;
+
+enum _cache_type {
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
+};
+
+union _cpuid4_leaf_eax {
+ struct {
+ enum _cache_type type :5;
+ unsigned int level :3;
+ unsigned int is_self_initializing :1;
+ unsigned int is_fully_associative :1;
+ unsigned int reserved :4;
+ unsigned int num_threads_sharing :12;
+ unsigned int num_cores_on_die :6;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ebx {
+ struct {
+ unsigned int coherency_line_size :12;
+ unsigned int physical_line_partition :10;
+ unsigned int ways_of_associativity :10;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ecx {
+ struct {
+ unsigned int number_of_sets :32;
+ } split;
+ u32 full;
+};
+
+struct _cpuid4_info {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
+ unsigned long size;
+};
+
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+/*
+ * Fallback AMD CPUID(0x4) emulation
+ * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d)
+ *
+ * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should
+ * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006).
+ */
+
+#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff
+#define AMD_L2_L3_INVALID_ASSOC 0x9
+
+union l1_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :8;
+ unsigned assoc :8;
+ unsigned size_in_kb :8;
+ };
+ unsigned int val;
+};
+
+union l2_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned size_in_kb :16;
+ };
+ unsigned int val;
+};
+
+union l3_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned res :2;
+ unsigned size_encoded :14;
+ };
+ unsigned int val;
+};
+
+/* L2/L3 associativity mapping */
+static const unsigned short assocs[] = {
+ [1] = 1,
+ [2] = 2,
+ [3] = 3,
+ [4] = 4,
+ [5] = 6,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
+ [0xc] = 64,
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
+
+static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx)
+{
+ unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d, *l1;
+ union l2_cache l2;
+ union l3_cache l3;
+
+ eax->full = 0;
+ ebx->full = 0;
+ ecx->full = 0;
+
+ cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+ cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
+
+ l1 = &l1d;
+ switch (index) {
+ case 1:
+ l1 = &l1i;
+ fallthrough;
+ case 0:
+ if (!l1->val)
+ return;
+
+ assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc;
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
+ break;
+ case 2:
+ if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC)
+ return;
+
+ /* Use x86_cache_size as it might have K7 errata fixes */
+ assoc = assocs[l2.assoc];
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+ break;
+ case 3:
+ if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC)
+ return;
+
+ assoc = assocs[l3.assoc];
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
+ }
+ break;
+ default:
+ return;
+ }
+
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[index];
+ eax->split.level = levels[index];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = topology_num_cores_per_package();
+
+ if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE)
+ eax->split.is_fully_associative = 1;
+
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
+}
+
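+/*
+ * CPUID(0x4) encodes sets, ways, partitions and line size minus one.
+ * E.g. a 32 KB, 8-way L1d with 64-byte lines reports sets=63, ways=7,
+ * partitions=0, line_size=63: (63+1) * (63+1) * (0+1) * (7+1) = 32768.
+ */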
+static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax,
+ union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx)
+{
+ if (eax.split.type == CTYPE_NULL)
+ return -EIO;
+
+ id4->eax = eax;
+ id4->ebx = ebx;
+ id4->ecx = ecx;
+ id4->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
+
+ return 0;
+}
+
+static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+ else
+ legacy_amd_cpuid4(index, &eax, &ebx, &ecx);
+
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
+}
+
+static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
+
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
+}
+
+static int fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+
+ return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ?
+ amd_fill_cpuid4_info(index, id4) :
+ intel_fill_cpuid4_info(index, id4);
+}
+
+static int find_num_cache_leaves(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, ebx, ecx, edx, op;
+ union _cpuid4_leaf_eax cache_eax;
+ int i = -1;
+
+ /* Do a CPUID(op) loop to calculate num_cache_leaves */
+ op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4;
+ do {
+ ++i;
+ cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
+ cache_eax.full = eax;
+ } while (cache_eax.split.type != CTYPE_NULL);
+ return i;
+}
+
+/*
+ * The maximum number of threads sharing this cache comes from CPUID(0x4)
+ * EAX[25:14], with ECX selecting the cache index. Right-shifting the APIC ID
+ * by that number's order yields the cache ID for this cache node. E.g. when
+ * 16 threads share the cache, get_count_order(16) == 4, so the cache ID is
+ * apicid >> 4.
+ */
+static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4)
+{
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+
+ return apicid >> index_msb;
+}
+
+/*
+ * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist.
+ */
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
+{
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return;
+
+ if (c->x86 < 0x17) {
+ /* Pre-Zen: LLC is at the node level */
+ c->topo.llc_id = die_id;
+ } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
+ /*
+ * Family 17h up to 1F models: LLC is at the core
+ * complex level. Core complex ID is ApicId[3].
+ */
+ c->topo.llc_id = c->topo.apicid >> 3;
+ } else {
+ /*
+ * Newer families: LLC ID is calculated from the number
+ * of threads sharing the L3 cache.
+ */
+ u32 llc_index = find_num_cache_leaves(c) - 1;
+ struct _cpuid4_info id4 = {};
+
+ if (!amd_fill_cpuid4_info(llc_index, &id4))
+ c->topo.llc_id = get_cache_id(c->topo.apicid, &id4);
+ }
+}
+
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
+{
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return;
+
+ /*
+ * Hygons are similar to AMD Family 17h up to 1F models: LLC is
+ * at the core complex level. Core complex ID is ApicId[3].
+ */
+ c->topo.llc_id = c->topo.apicid >> 3;
+}
+
+void init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
+ ci->num_leaves = find_num_cache_leaves(c);
+ else if (c->extended_cpuid_level >= 0x80000006)
+ ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3;
+}
+
+void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ ci->num_leaves = find_num_cache_leaves(c);
+}
+
+static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3,
+ unsigned int l2, unsigned int l1i, unsigned int l1d)
+{
+ /*
+ * If llc_id is still unset, then cpuid_level < 4, which implies
+ * that the only possibility left is SMT. Since CPUID(0x2) doesn't
+ * specify any shared caches and SMT shares all caches, we can
+ * unconditionally set LLC ID to the package ID so that all
+ * threads share it.
+ */
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
+
+ c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d);
+
+ if (!l2)
+ cpu_detect_cache_sizes(c);
+}
+
+/*
+ * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available.
+ */
+static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c)
+{
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc) {
+ switch (desc->c_type) {
+ case CACHE_L1_INST: l1i += desc->c_size; break;
+ case CACHE_L1_DATA: l1d += desc->c_size; break;
+ case CACHE_L2: l2 += desc->c_size; break;
+ case CACHE_L3: l3 += desc->c_size; break;
+ }
+ }
+
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+}
+
+static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4)
+{
+ unsigned int num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ return c->topo.apicid & ~((1 << index_msb) - 1);
+}
+
+static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+ unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID;
+ unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0;
+
+ if (c->cpuid_level < 4)
+ return false;
+
+ /*
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been previously initialized.
+ */
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
+
+ if (!ci->num_leaves)
+ return false;
+
+ for (int i = 0; i < ci->num_leaves; i++) {
+ struct _cpuid4_info id4 = {};
+ int ret;
+
+ ret = intel_fill_cpuid4_info(i, &id4);
+ if (ret < 0)
+ continue;
+
+ switch (id4.eax.split.level) {
+ case 1:
+ if (id4.eax.split.type == CTYPE_DATA)
+ l1d = id4.size / 1024;
+ else if (id4.eax.split.type == CTYPE_INST)
+ l1i = id4.size / 1024;
+ break;
+ case 2:
+ l2 = id4.size / 1024;
+ l2_id = calc_cache_topo_id(c, &id4);
+ break;
+ case 3:
+ l3 = id4.size / 1024;
+ l3_id = calc_cache_topo_id(c, &id4);
+ break;
+ default:
+ break;
+ }
+ }
+
+ c->topo.l2c_id = l2_id;
+ c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+ return true;
+}
+
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */
+ if (intel_cacheinfo_0x4(c))
+ return;
+
+ intel_cacheinfo_0x2(c);
+}
+
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon
+ */
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ const struct _cpuid4_info *id4)
+{
+ struct cpu_cacheinfo *this_cpu_ci;
+ struct cacheinfo *ci;
+ int i, sibling;
+
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ ci = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ unsigned int apicid, nshared, first, last;
+
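+		/* CPUs whose APIC IDs fall in [first, last] share this cache. */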
+ nshared = id4->eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).topo.apicid;
+ first = apicid - (apicid % nshared);
+ last = first + nshared - 1;
+
+ for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ apicid = cpu_data(i).topo.apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+
+ ci = this_cpu_ci->info_list + index;
+
+ for_each_online_cpu(sibling) {
+ apicid = cpu_data(sibling).topo.apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
+ }
+ }
+ } else
+ return 0;
+
+ return 1;
+}
+
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon
+ */
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ const struct _cpuid4_info *id4)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cacheinfo *ci, *sibling_ci;
+ unsigned long num_threads_sharing;
+ int index_msb, i;
+
+ if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
+ if (__cache_amd_cpumap_setup(cpu, index, id4))
+ return;
+ }
+
+ ci = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+
+ cpumask_set_cpu(cpu, &ci->shared_cpu_map);
+ if (num_threads_sharing == 1)
+ return;
+
+ index_msb = get_count_order(num_threads_sharing);
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+
+ /* Skip if itself or no cacheinfo */
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;
+
+ sibling_ci = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &ci->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map);
+ }
+}
+
+static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4,
+ struct amd_northbridge *nb)
+{
+ ci->id = id4->id;
+ ci->attributes = CACHE_ID;
+ ci->level = id4->eax.split.level;
+ ci->type = cache_type_map[id4->eax.split.type];
+ ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1;
+ ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1;
+ ci->size = id4->size;
+ ci->number_of_sets = id4->ecx.split.number_of_sets + 1;
+ ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1;
+ ci->priv = nb;
+}
+
+int init_cache_level(unsigned int cpu)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+
+ /* There should be at least one leaf. */
+ if (!ci->num_leaves)
+ return -ENOENT;
+
+ return 0;
+}
+
+int populate_cache_leaves(unsigned int cpu)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *ci = this_cpu_ci->info_list;
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+ u32 apicid = cpu_data(cpu).topo.apicid;
+ struct amd_northbridge *nb = NULL;
+ struct _cpuid4_info id4 = {};
+ int idx, ret;
+
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = fill_cpuid4_info(idx, &id4);
+ if (ret)
+ return ret;
+
+ id4.id = get_cache_id(apicid, &id4);
+
+ if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON)
+ nb = amd_init_l3_cache(idx);
+
+ ci_info_init(ci++, &id4, nb);
+ __cache_cpumap_setup(cpu, idx, &id4);
+ }
+
+ this_cpu_ci->cpu_map_populated = true;
+ return 0;
+}
+
+/*
+ * Disable and enable caches. Needed for changing MTRRs and the PAT MSR.
+ *
+ * Since we are disabling the cache, don't allow any interrupts:
+ * they would run extremely slowly and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after cache_enable() has been called.
+ */
+static unsigned long saved_cr4;
+static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+
+/*
+ * Cache flushing is the most time-consuming step when programming the
+ * MTRRs. On many Intel CPUs without known errata, it can be skipped
+ * if the CPU declares cache self-snooping support.
+ */
+static void maybe_flush_caches(void)
+{
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
+void cache_disable(void) __acquires(cache_disable_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * This is not ideal since the cache is only flushed/disabled
+ * for this CPU while the MTRRs are changed, but changing this
+ * requires more invasive changes to the way the kernel boots.
+ */
+ raw_spin_lock(&cache_disable_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+
+ maybe_flush_caches();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
+ saved_cr4 = __read_cr4();
+ __write_cr4(saved_cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_disable();
+
+ maybe_flush_caches();
+}
+
+void cache_enable(void) __releases(cache_disable_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_enable();
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (cpu_feature_enabled(X86_FEATURE_PGE))
+ __write_cr4(saved_cr4);
+
+ raw_spin_unlock(&cache_disable_lock);
+}
+
+static void cache_cpu_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (memory_caching_control & CACHE_MTRR) {
+ cache_disable();
+ mtrr_generic_set_state();
+ cache_enable();
+ }
+
+ if (memory_caching_control & CACHE_PAT)
+ pat_cpu_init();
+
+ local_irq_restore(flags);
+}
+
+static bool cache_aps_delayed_init = true;
+
+void set_cache_aps_delayed_init(bool val)
+{
+ cache_aps_delayed_init = val;
+}
+
+bool get_cache_aps_delayed_init(void)
+{
+ return cache_aps_delayed_init;
+}
+
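+/*
+ * Runs on each CPU via stop_machine() so that MTRR/PAT state is
+ * programmed consistently across all CPUs.
+ */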
+static int cache_rendezvous_handler(void *unused)
+{
+ if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id()))
+ cache_cpu_init();
+
+ return 0;
+}
+
+void __init cache_bp_init(void)
+{
+ mtrr_bp_init();
+ pat_bp_init();
+
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+void cache_bp_restore(void)
+{
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+static int cache_ap_online(unsigned int cpu)
+{
+ cpumask_set_cpu(cpu, cpu_cacheinfo_mask);
+
+ if (!memory_caching_control || get_cache_aps_delayed_init())
+ return 0;
+
+ /*
+	 * Ideally we would hold mtrr_mutex here to prevent MTRR entries
+	 * from being changed, but this routine is called at CPU boot time
+	 * and holding the lock would break that.
+	 *
+	 * This routine is called in two cases:
+	 *
+	 * 1. very early in software resume, when there are absolutely
+	 * no MTRR entry changes;
+	 *
+	 * 2. at CPU hotadd time. We let mtrr_add/del_page hold the
+	 * cpuhotplug lock to prevent MTRR entry changes.
+ */
+ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
+ cpu_cacheinfo_mask);
+
+ return 0;
+}
+
+static int cache_ap_offline(unsigned int cpu)
+{
+ cpumask_clear_cpu(cpu, cpu_cacheinfo_mask);
+ return 0;
+}
+
+/*
+ * Delayed cache initialization for all APs
+ */
+void cache_aps_init(void)
+{
+ if (!memory_caching_control || !get_cache_aps_delayed_init())
+ return;
+
+ stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask);
+ set_cache_aps_delayed_init(false);
+}
+
+static int __init cache_ap_register(void)
+{
+ zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL);
+ cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask);
+
+ cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
+ "x86/cachectrl:starting",
+ cache_ap_online, cache_ap_offline);
+ return 0;
+}
+early_initcall(cache_ap_register);
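
A userspace sketch of the same CPUID(0x4) walk that find_num_cache_leaves()
and intel_fill_cpuid4_info() perform (not part of the patch; assumes GCC's
<cpuid.h>): enumerate subleaves until CTYPE_NULL and recover each cache's
size from the minus-one encoded fields. AMD/Hygon parts with TOPOEXT would
use leaf 0x8000001d instead.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            for (unsigned int i = 0; ; i++) {
                    if (!__get_cpuid_count(4, i, &eax, &ebx, &ecx, &edx))
                            break;
                    if ((eax & 0x1f) == 0)  /* CTYPE_NULL: no more leaves */
                            break;

                    unsigned int level = (eax >> 5) & 0x7;
                    unsigned int line  = (ebx & 0xfff) + 1;
                    unsigned int parts = ((ebx >> 12) & 0x3ff) + 1;
                    unsigned int ways  = ((ebx >> 22) & 0x3ff) + 1;

                    printf("index %u: L%u cache, %u KB\n", i, level,
                           ways * parts * line * (ecx + 1) / 1024);
            }
            return 0;
    }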
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978e0758..a3b55db35c96 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,244 +1,16 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <asm/processor.h>
-#include <asm/e820.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "cpu.h"
-#ifdef CONFIG_X86_OOSTORE
-
-static u32 __cpuinit power2(u32 x)
-{
- u32 s = 1;
-
- while (s <= x)
- s <<= 1;
-
- return s >>= 1;
-}
-
-
-/*
- * Set up an actual MCR
- */
-static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
-{
- u32 lo, hi;
-
- hi = base & ~0xFFF;
- lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
- lo &= ~0xFFF; /* Remove the ctrl value bits */
- lo |= key; /* Attribute we wish to set */
- wrmsr(reg+MSR_IDT_MCR0, lo, hi);
- mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
-}
-
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 __cpuinit ramtop(void)
-{
- u32 clip = 0xFFFFFFFFUL;
- u32 top = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
-
- if (e820.map[i].addr > 0xFFFFFFFFUL)
- continue;
- /*
- * Don't MCR over reserved space. Ignore the ISA hole
- * we frob around that catastrophe already
- */
- if (e820.map[i].type == E820_RESERVED) {
- if (e820.map[i].addr >= 0x100000UL &&
- e820.map[i].addr < clip)
- clip = e820.map[i].addr;
- continue;
- }
- start = e820.map[i].addr;
- end = e820.map[i].addr + e820.map[i].size;
- if (start >= end)
- continue;
- if (end > top)
- top = end;
- }
- /*
- * Everything below 'top' should be RAM except for the ISA hole.
- * Because of the limited MCR's we want to map NV/ACPI into our
- * MCR range for gunk in RAM
- *
- * Clip might cause us to MCR insufficient RAM but that is an
- * acceptable failure mode and should only bite obscure boxes with
- * a VESA hole at 15Mb
- *
- * The second case Clip sometimes kicks in is when the EBDA is marked
- * as reserved. Again we fail safe with reasonable results
- */
- if (top > clip)
- top = clip;
-
- return top;
-}
-
-/*
- * Compute a set of MCR's to give maximum coverage
- */
-static int __cpuinit centaur_mcr_compute(int nr, int key)
-{
- u32 mem = ramtop();
- u32 root = power2(mem);
- u32 base = root;
- u32 top = root;
- u32 floor = 0;
- int ct = 0;
-
- while (ct < nr) {
- u32 fspace = 0;
- u32 high;
- u32 low;
-
- /*
- * Find the largest block we will fill going upwards
- */
- high = power2(mem-top);
-
- /*
- * Find the largest block we will fill going downwards
- */
- low = base/2;
-
- /*
- * Don't fill below 1Mb going downwards as there
- * is an ISA hole in the way.
- */
- if (base <= 1024*1024)
- low = 0;
-
- /*
- * See how much space we could cover by filling below
- * the ISA hole
- */
-
- if (floor == 0)
- fspace = 512*1024;
- else if (floor == 512*1024)
- fspace = 128*1024;
-
- /* And forget ROM space */
-
- /*
- * Now install the largest coverage we get
- */
- if (fspace > high && fspace > low) {
- centaur_mcr_insert(ct, floor, fspace, key);
- floor += fspace;
- } else if (high > low) {
- centaur_mcr_insert(ct, top, high, key);
- top += high;
- } else if (low > 0) {
- base -= low;
- centaur_mcr_insert(ct, base, low, key);
- } else
- break;
- ct++;
- }
- /*
- * We loaded ct values. We now need to set the mask. The caller
- * must do this bit.
- */
- return ct;
-}
-
-static void __cpuinit centaur_create_optimal_mcr(void)
-{
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining and weak write ordered.
- *
- * To experiment with: Linux never uses stack operations for
- * mmio spaces so we could globally enable stack operation wc
- *
- * Load the registers with type 31 - full write combining, all
- * writes weakly ordered.
- */
- used = centaur_mcr_compute(6, 31);
-
- /*
- * Wipe unused MCRs
- */
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-static void __cpuinit winchip2_create_optimal_mcr(void)
-{
- u32 lo, hi;
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining, weak store ordered.
- *
- * Load the registers with type 25
- * 8 - weak write ordering
- * 16 - weak read ordering
- * 1 - write combining
- */
- used = centaur_mcr_compute(6, 25);
-
- /*
- * Mark the registers we are using.
- */
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- for (i = 0; i < used; i++)
- lo |= 1<<(9+i);
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
- /*
- * Wipe unused MCRs
- */
-
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-/*
- * Handle the MCR key on the Winchip 2.
- */
-static void __cpuinit winchip2_unprotect_mcr(void)
-{
- u32 lo, hi;
- u32 key;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- key = (lo>>17) & 7;
- lo |= key<<6; /* replace with unlock key */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-static void __cpuinit winchip2_protect_mcr(void)
-{
- u32 lo, hi;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-#endif /* CONFIG_X86_OOSTORE */
-
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
@@ -247,7 +19,7 @@ static void __cpuinit winchip2_protect_mcr(void)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-static void __cpuinit init_c3(struct cpuinfo_x86 *c)
+static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -260,7 +32,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
wrmsr(MSR_VIA_FCR, lo, hi);
- printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+ pr_info("CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
@@ -268,17 +40,17 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
wrmsr(MSR_VIA_RNG, lo, hi);
- printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+ pr_info("CPU: Enabled h/w RNG\n");
}
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
- c->x86_capability[5] = cpuid_edx(0xC0000001);
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
- if (c->x86_model >= 6 && c->x86_model <= 9) {
+ if (c->x86_model >= 6 && c->x86_model <= 13) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
wrmsr(MSR_VIA_FCR, lo, hi);
@@ -294,7 +66,8 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
- cpu_detect_cache_sizes(c);
+ if (c->x86 >= 7)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
enum {
@@ -318,26 +91,27 @@ enum {
EAMD3D = 1<<20,
};
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+static void early_init_centaur(struct cpuinfo_x86 *c)
{
- switch (c->x86) {
#ifdef CONFIG_X86_32
- case 5:
- /* Emulate MTRRs using Centaur's MCR. */
+ /* Emulate MTRRs using Centaur's MCR. */
+ if (c->x86 == 5)
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
- break;
#endif
- case 6:
- if (c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- break;
- }
+ if ((c->x86 == 6 && c->x86_model >= 0xf) ||
+ (c->x86 >= 7))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
}
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
char *name;
@@ -353,33 +127,32 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
- switch (c->x86) {
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+		 * Check the version and the number of counters:
+		 * the version (eax[7:0]) can't be 0 and the number of
+		 * counters (eax[15:8]) should be greater than 1.
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
#ifdef CONFIG_X86_32
- case 5:
+ if (c->x86 == 5) {
switch (c->x86_model) {
case 4:
name = "C6";
fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
fcr_clr = DPDC;
- printk(KERN_NOTICE "Disabling bugged TSC.\n");
+ pr_notice("Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
-#ifdef CONFIG_X86_OOSTORE
- centaur_create_optimal_mcr();
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- *
- * The C6 original lacks weak read order
- *
- * Note 0x120 is write only on Winchip 1
- */
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
break;
case 8:
- switch (c->x86_mask) {
+ switch (c->x86_stepping) {
default:
name = "2";
break;
@@ -393,40 +166,12 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
default:
name = "??";
@@ -436,11 +181,11 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
newlo = (lo|fcr_set) & (~fcr_clr);
if (newlo != lo) {
- printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+ pr_info("Centaur FCR was 0x%X now 0x%X\n",
lo, newlo);
wrmsr(MSR_IDT_FCR1, newlo, hi);
} else {
- printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+ pr_info("Centaur FCR is 0x%X\n", lo);
}
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
@@ -457,21 +202,21 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
c->x86_cache_size = (cc>>24)+(dd>>24);
}
sprintf(c->x86_model_id, "WinChip %s", name);
- break;
+ }
#endif
- case 6:
+ if (c->x86 == 6 || c->x86 >= 7)
init_c3(c);
- break;
- }
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
+
+ init_ia32_feat_ctl(c);
}
-static unsigned int __cpuinit
+#ifdef CONFIG_X86_32
+static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
-#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -482,18 +227,20 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* - Note, it seems this may only be in engineering samples.
*/
if ((c->x86 == 6) && (c->x86_model == 9) &&
- (c->x86_mask == 1) && (size == 65))
+ (c->x86_stepping == 1) && (size == 65))
size -= 1;
-#endif
return size;
}
+#endif
-static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
+static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
- .c_size_cache = centaur_size_cache,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
- u8 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u8 *)ptr;
- if (prev == old)
- *(u8 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
- u16 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u16 *)ptr;
- if (prev == old)
- *(u16 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
- u32 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u32 *)ptr;
- if (prev == old)
- *(u32 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
- u64 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u64 *)ptr;
- if (prev == old)
- *(u64 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..e7ab22fce3b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,64 +1,185 @@
-#include <linux/bootmem.h>
+// SPDX-License-Identifier: GPL-2.0-only
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/memblock.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/string.h>
+#include <linux/ctype.h>
#include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
+#include <linux/mem_encrypt.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
-
-#include <asm/stackprotector.h>
+#include <linux/syscore_ops.h>
+#include <linux/pgtable.h>
+#include <linux/stackprotector.h>
+#include <linux/utsname.h>
+#include <linux/efi.h>
+
+#include <asm/alternative.h>
+#include <asm/cmdline.h>
+#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
+#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/debugreg.h>
#include <asm/sections.h>
+#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
-#include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
#include <asm/mtrr.h>
+#include <asm/hwcap2.h>
#include <linux/numa.h>
+#include <asm/numa.h>
#include <asm/asm.h>
+#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/cacheinfo.h>
+#include <asm/memtype.h>
+#include <asm/microcode.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
-#endif
+#include <asm/ia32.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/posted_intr.h>
+#include <asm/runtime-const.h>
#include "cpu.h"
-/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_initialized_mask;
-cpumask_var_t cpu_callout_mask;
-cpumask_var_t cpu_callin_mask;
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+/* Used for modules: built-in code uses runtime constants */
+unsigned long USER_PTR_MAX;
+EXPORT_SYMBOL(USER_PTR_MAX);
+
+u32 elf_hwcap2 __read_mostly;
+
+/* Number of siblings per CPU package */
+unsigned int __max_threads_per_core __ro_after_init = 1;
+EXPORT_SYMBOL(__max_threads_per_core);
+
+unsigned int __max_dies_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__max_dies_per_package);
+
+unsigned int __max_logical_packages __ro_after_init = 1;
+EXPORT_SYMBOL(__max_logical_packages);
+
+unsigned int __num_cores_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_cores_per_package);
+
+unsigned int __num_threads_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_threads_per_package);
+
+static struct ppin_info {
+ int feature;
+ int msr_ppin_ctl;
+ int msr_ppin;
+} ppin_info[] = {
+ [X86_VENDOR_INTEL] = {
+ .feature = X86_FEATURE_INTEL_PPIN,
+ .msr_ppin_ctl = MSR_PPIN_CTL,
+ .msr_ppin = MSR_PPIN
+ },
+ [X86_VENDOR_AMD] = {
+ .feature = X86_FEATURE_AMD_PPIN,
+ .msr_ppin_ctl = MSR_AMD_PPIN_CTL,
+ .msr_ppin = MSR_AMD_PPIN
+ },
+};
-/* representing cpus for which sibling maps can be computed */
-cpumask_var_t cpu_sibling_setup_mask;
+static const struct x86_cpu_id ppin_cpuids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]),
+ X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
+
+ /* Legacy models without CPUID enumeration */
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+
+ {}
+};
-/* correctly size the local cpu masks */
-void __init setup_cpu_local_masks(void)
+static void ppin_init(struct cpuinfo_x86 *c)
{
- alloc_bootmem_cpumask_var(&cpu_initialized_mask);
- alloc_bootmem_cpumask_var(&cpu_callin_mask);
- alloc_bootmem_cpumask_var(&cpu_callout_mask);
- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+ const struct x86_cpu_id *id;
+ unsigned long long val;
+ struct ppin_info *info;
+
+ id = x86_match_cpu(ppin_cpuids);
+ if (!id)
+ return;
+
+ /*
+	 * Testing the presence of the MSR is not enough. We also need to
+	 * check that PPIN_CTL allows reading of the PPIN.
+ */
+ info = (struct ppin_info *)id->driver_data;
+
+ if (rdmsrq_safe(info->msr_ppin_ctl, &val))
+ goto clear_ppin;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN locked in disabled mode */
+ goto clear_ppin;
+ }
+
+ /* If PPIN is disabled, try to enable */
+ if (!(val & 2UL)) {
+ wrmsrq_safe(info->msr_ppin_ctl, val | 2UL);
+ rdmsrq_safe(info->msr_ppin_ctl, &val);
+ }
+
+ /* Is the enable bit set? */
+ if (val & 2UL) {
+ c->ppin = native_rdmsrq(info->msr_ppin);
+ set_cpu_cap(c, info->feature);
+ return;
+ }
+
+clear_ppin:
+ setup_clear_cpu_cap(info->feature);
}
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
+static void default_init(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
cpu_detect_cache_sizes(c);
@@ -75,13 +196,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst default_cpu = {
+static const struct cpu_dev default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
};
-static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+static const struct cpu_dev *this_cpu = &default_cpu;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -93,87 +214,83 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
	 * The code segments and data segments have fixed 64k limits;
	 * the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- GDT_STACK_CANARY_INIT
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+SYM_PIC_ALIAS(gdt_page);
-static int __init x86_xsave_setup(char *s)
+#ifdef CONFIG_X86_64
+static int __init x86_nopcid_setup(char *s)
{
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- return 1;
-}
-__setup("noxsave", x86_xsave_setup);
+ /* nopcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
-#ifdef CONFIG_X86_32
-static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_serial_nr __cpuinitdata = 1;
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 0;
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 0;
}
-__setup("cachesize=", cachesize_setup);
+early_param("nopcid", x86_nopcid_setup);
+#endif
-static int __init x86_fxsr_setup(char *s)
+static int __init x86_noinvpcid_setup(char *s)
{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
}
-__setup("nosep", x86_sep_setup);
+early_param("noinvpcid", x86_noinvpcid_setup);
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -196,16 +313,27 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
+}
+
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
}
+__setup("cachesize=", cachesize_setup);
/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
+bool cpuid_feature(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -218,7 +346,7 @@ static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
lo |= 0x200000;
wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
+ pr_notice("CPU serial number disabled.\n");
clear_cpu_cap(c, X86_FEATURE_PN);
/* Disabling the serial number may affect the cpuid level */
@@ -232,20 +360,300 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ cr4_set_bits(X86_CR4_SMEP);
+}
+
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+{
+ unsigned long eflags = native_save_fl();
+
+ /* This should have been cleared long ago */
+ BUG_ON(eflags & X86_EFLAGS_AC);
+
+ if (cpu_has(c, X86_FEATURE_SMAP))
+ cr4_set_bits(X86_CR4_SMAP);
+}
+
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ pr_info_once("x86/cpu: User Mode Instruction Prevention (UMIP) activated\n");
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
+static __always_inline void setup_lass(struct cpuinfo_x86 *c)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LASS))
+ return;
+
+ /*
+ * Legacy vsyscall page access causes a #GP when LASS is active.
+ * Disable LASS because the #GP handler doesn't support vsyscall
+ * emulation.
+ *
+ * Also disable LASS when running under EFI, as some runtime and
+ * boot services rely on 1:1 mappings in the lower half.
+ */
+ if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) ||
+ IS_ENABLED(CONFIG_EFI)) {
+ setup_clear_cpu_cap(X86_FEATURE_LASS);
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_LASS);
+}
+
+/* These bits should not change their value after CPU init is finished. */
+static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+ X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
+static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
+static unsigned long cr4_pinned_bits __ro_after_init;
+
+void native_write_cr0(unsigned long val)
+{
+ unsigned long bits_missing = 0;
+
+set_register:
+ asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
+
+ if (static_branch_likely(&cr_pinning)) {
+ if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) {
+ bits_missing = X86_CR0_WP;
+ val |= bits_missing;
+ goto set_register;
+ }
+ /* Warn after we've set the missing bits. */
+ WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n");
+ }
+}
+EXPORT_SYMBOL(native_write_cr0);
+
+void __no_profile native_write_cr4(unsigned long val)
+{
+ unsigned long bits_changed = 0;
+
+set_register:
+ asm volatile("mov %0,%%cr4": "+r" (val) : : "memory");
+
+ if (static_branch_likely(&cr_pinning)) {
+ if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) {
+ bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits;
+ val = (val & ~cr4_pinned_mask) | cr4_pinned_bits;
+ goto set_register;
+ }
+ /* Warn after we've corrected the changed bits. */
+ WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n",
+ bits_changed);
+ }
+}
+#if IS_MODULE(CONFIG_LKDTM)
+EXPORT_SYMBOL_GPL(native_write_cr4);
+#endif
+
+void cr4_update_irqsoff(unsigned long set, unsigned long clear)
+{
+ unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+ lockdep_assert_irqs_disabled();
+
+ newval = (cr4 & ~clear) | set;
+ if (newval != cr4) {
+ this_cpu_write(cpu_tlbstate.cr4, newval);
+ __write_cr4(newval);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
+
+/* Read the CR4 shadow. */
+unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
+
+void cr4_init(void)
+{
+ unsigned long cr4 = __read_cr4();
+
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4 |= X86_CR4_PCIDE;
+ if (static_branch_likely(&cr_pinning))
+ cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits;
+
+ __write_cr4(cr4);
+
+ /* Initialize cr4 shadow for this CPU. */
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+}
+
+/*
+ * Once CPU feature detection is finished (and boot params have been
+ * parsed), record any of the sensitive CR bits that are set, and
+ * enable CR pinning.
+ */
+static void __init setup_cr_pinning(void)
+{
+ cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask;
+ static_key_enable(&cr_pinning.key);
+}
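For illustration (editor's sketch, not part of the patch): once setup_cr_pinning() has run, clearing a pinned bit through the native write path is transparently undone by the re-write loop in native_write_cr4() above:

	/* hypothetical rogue write: try to clear SMEP after boot */
	native_write_cr4(native_read_cr4() & ~X86_CR4_SMEP);
	/*
	 * bits_changed == X86_CR4_SMEP, so the code jumps back to
	 * set_register with SMEP restored and emits the one-time
	 * "pinned CR4 bits changed: 0x100000!?" warning.
	 */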
+
+static __init int x86_nofsgsbase_setup(char *arg)
+{
+ /* Require an exact match without trailing characters. */
+ if (strlen(arg))
+ return 0;
+
+ /* Do not emit a message if the feature is not present. */
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+ pr_info("FSGSBASE disabled via kernel command line\n");
return 1;
}
-/* Probe for the CPUID instruction */
-static inline int have_cpuid_p(void)
+__setup("nofsgsbase", x86_nofsgsbase_setup);
+
+/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
+ if (c == &boot_cpu_data) {
+ if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+ return;
+ /*
+ * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+ * bit to be set. Enforce it.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSPKE);
+
+ } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_PKE);
+ /* Load the default PKRU value */
+ pkru_write_default();
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
+{
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
return 1;
}
-static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+__setup("nopku", setup_disable_pku);
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr u64 ibt_save(bool disable)
+{
+ u64 msr = 0;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, msr);
+ if (disable)
+ wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ }
+
+ return msr;
+}
+
+__noendbr void ibt_restore(u64 save)
{
+ u64 msr;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, msr);
+ msr &= ~CET_ENDBR_EN;
+ msr |= (save & CET_ENDBR_EN);
+ wrmsrq(MSR_IA32_S_CET, msr);
+ }
}
+
#endif
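A hedged usage sketch for the ibt_save()/ibt_restore() pair above; the calling context is illustrative (think of a suspend- or kexec-style path that must run code without ENDBR landing pads):

	u64 saved;

	saved = ibt_save(true);	/* snapshot MSR_IA32_S_CET, clear ENDBR_EN */
	/* ... run code that is not IBT-instrumented ... */
	ibt_restore(saved);	/* put CET_ENDBR_EN back exactly as it was */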
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ bool user_shstk, kernel_ibt;
+
+ if (!IS_ENABLED(CONFIG_X86_CET))
+ return;
+
+ kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT);
+ user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK);
+
+ if (!kernel_ibt && !user_shstk)
+ return;
+
+ if (user_shstk)
+ set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
+
+ if (kernel_ibt)
+ wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN);
+ else
+ wrmsrq(MSR_IA32_S_CET, 0);
+
+ cr4_set_bits(X86_CR4_CET);
+
+ if (kernel_ibt && ibt_selftest()) {
+ pr_err("IBT selftest: Failed!\n");
+ wrmsrq(MSR_IA32_S_CET, 0);
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+ }
+}
+
+__noendbr void cet_disable(void)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_IBT) ||
+ cpu_feature_enabled(X86_FEATURE_SHSTK)))
+ return;
+
+ wrmsrq(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_U_CET, 0);
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -256,15 +664,15 @@ struct cpuid_dependent_feature {
u32 level;
};
-static const struct cpuid_dependent_feature __cpuinitconst
+static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
- { X86_FEATURE_MWAIT, 0x00000005 },
- { X86_FEATURE_DCA, 0x00000009 },
- { X86_FEATURE_XSAVE, 0x0000000d },
+ { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
+ { X86_FEATURE_DCA, CPUID_LEAF_DCA },
+ { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
-static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
{
const struct cpuid_dependent_feature *df;
@@ -288,9 +696,8 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- printk(KERN_WARNING
- "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
- x86_cap_flags[df->feature], df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -302,9 +709,10 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
*/
/* Look up CPU names by table lookup. */
-static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
+static const char *table_lookup_model(struct cpuinfo_x86 *c)
{
- const struct cpu_model_info *info;
+#ifdef CONFIG_X86_32
+ const struct legacy_cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -312,52 +720,95 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
if (!this_cpu)
return NULL;
- info = this_cpu->c_models;
+ info = this_cpu->legacy_models;
- while (info && info->family) {
+ while (info->family) {
if (info->family == c->x86)
return info->model_names[c->x86_model];
info++;
}
+#endif
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
-__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- loadsegment(gs, 0);
- wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
- load_stack_canary_segment();
+
+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
}
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
-/*
- * Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one.
- */
-void switch_to_new_gdt(int cpu)
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
{
struct desc_ptr gdt_descr;
- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- /* Reload the per-cpu base */
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
+/**
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
+ * @cpu: The CPU number for which this is invoked
+ *
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the percpu base
+ * switch is implicit by loading the direct GDT. On 64bit this requires
+ * to update GSBASE.
+ */
+void __init switch_gdt_and_percpu_base(int cpu)
+{
+ load_direct_gdt(cpu);
- load_percpu_segment(cpu);
+#ifdef CONFIG_X86_64
+ /*
+ * No need to load %gs. It is already correct.
+ *
+ * Writing %gs on 64-bit would zero GSBASE, which would make any per
+ * CPU operation fault up to the point of the wrmsrq() below.
+ *
+ * Set GSBASE to the new offset. Until the wrmsrq() happens the
+ * early mapping is still valid. That means the GSBASE update will
+ * lose any prior per CPU data which was not copied over in
+ * setup_per_cpu_areas().
+ *
+ * This works even with stackprotector enabled because the
+ * per CPU stack canary is 0 in both per CPU areas.
+ */
+ wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+ /*
+ * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+ * it is required to load FS again so that the 'hidden' part is
+ * updated from the new GDT. Up to this point the early per CPU
+ * translation is active. Any content of the early per CPU data
+ * which was not copied over in setup_per_cpu_areas() is lost.
+ */
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
-static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static void get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
- char *p, *q;
+ char *p, *q, *s;
if (c->extended_cpuid_level < 0x80000004)
return;
@@ -368,22 +819,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /*
- * Intel chips right-justify this string for some dumb reason;
- * undo that brain damage:
- */
- p = q = &c->x86_model_id[0];
+ /* Trim whitespace */
+ p = q = s = &c->x86_model_id[0];
+
while (*p == ' ')
p++;
- if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+
+ while (*p) {
+ /* Note the last non-whitespace index */
+ if (!isspace(*p))
+ s = q;
+
+ *q++ = *p++;
}
+
+ *(s + 1) = '\0';
}
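Worked example for the new trim loop (editor's illustration; the brand string is made up): s always points at the last non-whitespace character copied, so leading and trailing blanks are dropped in a single pass:

	/*
	 * In:  "      Intel(R) Xeon(R) CPU   "
	 *      p skips the leading spaces, q copies, s remembers the
	 *      last non-space destination; *(s + 1) = '\0' terminates.
	 * Out: "Intel(R) Xeon(R) CPU"
	 */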
-void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
+void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -408,8 +861,8 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
#else
/* do processor-specific cache resizing */
- if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c, l2size);
+ if (this_cpu->legacy_cache_size)
+ l2size = this_cpu->legacy_cache_size(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
@@ -422,65 +875,27 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_HT
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
- static bool printed;
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
-
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
- goto out;
- }
-
- if (smp_num_siblings <= 1)
- goto out;
-
- if (smp_num_siblings > nr_cpu_ids) {
- pr_warning("CPU: Unsupported number of siblings %d",
- smp_num_siblings);
- smp_num_siblings = 1;
- return;
- }
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
+{
+ if (this_cpu->c_detect_tlb)
+ this_cpu->c_detect_tlb(c);
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
-out:
- if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
-#endif
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -499,15 +914,14 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
}
}
- printk_once(KERN_ERR
- "CPU: vendor_id '%s' unknown, using generic init.\n" \
- "CPU: Your system may be unstable.\n", v);
+ pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+ "CPU: Your system may be unstable.\n", v);
c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
}
-void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
+void cpu_detect(struct cpuinfo_x86 *c)
{
/* Get vendor name */
cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -521,14 +935,9 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
u32 junk, tfms, cap0, misc;
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
-
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xf) << 4;
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_stepping = x86_stepping(tfms);
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -537,50 +946,172 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+static void apply_forced_caps(struct cpuinfo_x86 *c)
{
- u32 tfms, xlvl;
- u32 ebx;
+ int i;
+
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+}
+
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+ /*
+ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+ * and they also have a different bit for STIBP support. Also,
+ * a hypervisor might have set the individual AMD bits even on
+ * Intel CPUs, for finer-grained selection of what's available.
+ */
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+ cpu_has(c, X86_FEATURE_VIRT_SSBD))
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+
+ if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
+}
+
+void get_cpu_cap(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
}
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
-
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+ c->x86_capability[CPUID_7_ECX] = ecx;
+ c->x86_capability[CPUID_7_EDX] = edx;
+
+ /* Check valid sub-leaf index before accessing it */
+ if (eax >= 1) {
+ cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_1_EAX] = eax;
}
}
- if (c->extended_cpuid_level >= 0x80000008) {
- u32 eax = cpuid_eax(0x80000008);
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
+ c->x86_capability[CPUID_D_1_EAX] = eax;
+ }
+
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
+
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+ }
+
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
+ if (c->extended_cpuid_level >= 0x80000021)
+ c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021);
+
+ init_scattered_cpuid_features(c);
+ init_speculation_control(c);
+
+ /*
+ * Clear/Set all flags overridden by options, after probe.
+ * This needs to happen each time we re-probe, which may happen
+ * several times during CPU initialization.
+ */
+ apply_forced_caps(c);
}
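Editor's note on the extended-leaf check above: leaf 0x80000000 reports the maximum extended leaf in EAX, but CPUs without extended leaves may echo something else there, so only values whose high word is 0x8000 are trusted. For example:

	/*
	 * cpuid_eax(0x80000000) == 0x80000021 -> extended_cpuid_level = 0x80000021
	 * cpuid_eax(0x80000000) == 0x0000000d -> extended_cpuid_level = 0 (none)
	 */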
-static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+void get_cpu_address_sizes(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008)) {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
+ }
+
+ c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
+}
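The address-size fallbacks above, tabulated (editor's summary of the code):

	/*
	 * No usable leaf 0x80000008:
	 *   64-bit: clflush 64, phys 36 bits, virt 48 bits
	 *   32-bit: clflush 32, phys 32 bits (36 with PAE/PSE36), virt 32 bits
	 * Leaf present: EAX[7:0] = phys bits, EAX[15:8] = virt bits,
	 * clflush defaulted to 32 if not enumerated.
	 */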
+
+static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -601,7 +1132,629 @@ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
+}
+
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_SPECTRE_V2 BIT(8)
+#define NO_MMIO BIT(9)
+#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
+
+#define VULNWL(vendor, family, model, whitelist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
+
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
+
+#define VULNWL_AMD(family, whitelist) \
+ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
+
+#define VULNWL_HYGON(family, whitelist) \
+ VULNWL(HYGON, family, X86_MODEL_ANY, whitelist)
+
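For readability (editor's illustration): an entry such as VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB) expands through the helpers above to

	/*
	 * X86_MATCH_VENDOR_FAM_MODEL(AMD, 0x12, X86_MODEL_ANY,
	 *                            NO_MELTDOWN | NO_SSB)
	 * i.e. a vendor+family match with the whitelist flags stashed in
	 * the entry's driver_data, which cpu_matches() below tests.
	 */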
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
+
+ /* Intel Family 6 */
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
+
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
+
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+
+ /*
+ * Technically, swapgs isn't serializing on AMD (despite it previously
+ * being documented as such in the APM). But according to AMD, %gs is
+ * updated non-speculatively, and the issuing of %gs-relative memory
+ * operands will be blocked until the %gs update completes, which is
+ * good enough for our purposes.
+ */
+
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+
+ /* AMD Family 0xf - 0x12 */
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+
+ /* Zhaoxin Family 7 */
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ {}
+};
+
+#define VULNBL(vendor, family, model, blacklist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
+
+#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
+ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
+
+#define VULNBL_AMD(family, blacklist) \
+ VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
+
+#define VULNBL_HYGON(family, blacklist) \
+ VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
+
+#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
+/* CPU is affected by RETbleed, speculating where you would not expect it */
+#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA BIT(10)
+/* CPU is affected by VMSCAPE */
+#define VMSCAPE BIT(11)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE),
+
+ VULNBL_AMD(0x15, RETBLEED),
+ VULNBL_AMD(0x16, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE),
+ VULNBL_AMD(0x1a, SRSO | VMSCAPE),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(table);
+
+ return m && !!(m->driver_data & which);
+}
+
+u64 x86_read_arch_cap_msr(void)
+{
+ u64 x86_arch_cap_msr = 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
+
+ return x86_arch_cap_msr;
+}
+
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
+}
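Editor's note: MMIO immunity requires all three enumeration bits, so the helper above reads as

	/*
	 * immune  <=>  FBSDP_NO && PSDP_NO && SBDR_SSDP_NO
	 * (any single bit clear leaves the MMIO Stale Data bug possible)
	 */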
+
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that mitigation is needed because the guest is running on
+ * vulnerable hardware or may migrate to such hardware:
+ */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
+}
+
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
+static struct x86_cpu_id cpu_latest_microcode[] = {
+#include "microcode/intel-ucode-defs.h"
+ {}
+};
+
+static bool __init cpu_has_old_microcode(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode);
+
+ /* Give unknown CPUs a pass: */
+ if (!m) {
+ /* Intel CPUs should be in the list. Warn if not: */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ pr_info("x86/CPU: Model not found in latest microcode list\n");
+ return false;
+ }
+
+ /*
+ * Hosts usually lie to guests with a super high microcode
+ * version. Just ignore what hosts tell guests:
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ /* Consider all debug microcode to be old: */
+ if (boot_cpu_data.microcode & BIT(31))
+ return true;
+
+ /* Give new microcode a pass: */
+ if (boot_cpu_data.microcode >= m->driver_data)
+ return false;
+
+ /* Uh oh, too old: */
+ return true;
+}
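Decision order of cpu_has_old_microcode(), summarized (editor's note):

	/*
	 * 1. model not in the table      -> not old (pr_info on Intel)
	 * 2. running under a hypervisor  -> not old (hosts lie about versions)
	 * 3. microcode bit 31 set        -> old (debug microcode)
	 * 4. otherwise                   -> old iff version < table entry
	 */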
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ if (cpu_has_old_microcode()) {
+ pr_warn("x86/CPU: Running old microcode\n");
+ setup_force_cpu_bug(X86_BUG_OLD_MICROCODE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+ /*
+ * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
+ * flag and protect from vendor-specific bugs via the whitelist.
+ *
+ * Don't use AutoIBRS when SNP is enabled because it degrades host
+ * userspace indirect branch performance.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) ||
+ (cpu_has(c, X86_FEATURE_AUTOIBRS) &&
+ !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
+ setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+ if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
+ setup_force_cpu_bug(X86_BUG_MDS);
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
+ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
+ setup_force_cpu_bug(X86_BUG_SWAPGS);
+
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * The TSX_CTRL check is needed for cases when TSX could have been
+ * disabled before the kernel booted, e.g. by kexec.
+ * The TSX_CTRL check alone is not sufficient when the microcode update
+ * is not present or when running as a guest that doesn't get TSX_CTRL.
+ */
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+ * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
+ * SRBDS.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+ * Affected CPU list is generally enough to enumerate the vulnerability,
+ * but for the virtualization case also check the ARCH_CAP MSR bits; a
+ * VMM may not want the guest to enumerate the bug.
+ */
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
+ if (cpu_matches(cpu_vuln_blacklist, MMIO))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ }
+
+ if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ }
+
+ if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
+ setup_force_cpu_bug(X86_BUG_SMT_RSB);
+
+ if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, SRSO))
+ setup_force_cpu_bug(X86_BUG_SRSO);
+ }
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent intra-mode
+ * attacks. When virtualized, eIBRS could be hidden; assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+ setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+ !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+ /* Enable bug on Zen guests to allow for live migration. */
+ (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+ setup_force_cpu_bug(X86_BUG_TSA);
+ }
+ }
+
+ /*
+ * Set the bug only on bare-metal. A nested hypervisor should already be
+ * deploying IBPB to isolate itself from nested guests.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) &&
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
+ return;
+
+ /* Rogue Data Cache Load? No! */
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
+ return;
+
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
+ * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
+ */
+static void detect_nopl(void)
+{
+#ifdef CONFIG_X86_32
+ setup_clear_cpu_cap(X86_FEATURE_NOPL);
+#else
+ setup_force_cpu_cap(X86_FEATURE_NOPL);
+#endif
+}
+
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
+{
+ char *opt;
+ int taint = 0;
+
+ while (arg) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
+
+ opt = strsep(&arg, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
+ else
+ pr_cont(" %s\n", x86_cap_flags[bit]);
+
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names consisting only
+ * of numbers, so go to the next argument.
+ */
+ continue;
+ }
+
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
+ continue;
+
+ if (strcmp(flag, opt))
+ continue;
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
+ taint++;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
+ }
+
+ return taint;
+}
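Accepted forms, per the parser above (editor's examples; the flag names are only illustrative):

	/*
	 * clearcpuid=440          bare feature-bit number (bit < NCAPINTS*32)
	 * clearcpuid=smep,smap    comma-separated feature names
	 * setcpuid=ibrs           force-enable a named feature
	 * Bug flags can only be matched by name, never by number, and any
	 * successful set/clear taints the kernel.
	 */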
+
+
+/*
+ * We parse cpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init cpu_parse_early_param(void)
+{
+ bool cpuid_taint = false;
+ char arg[128];
+ int arglen;
+
+#ifdef CONFIG_X86_32
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
+ /* Minimize the window between FRED being available and it being available but disabled. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
+
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
+
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
}
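Note on the strict fred= parsing above (editor's example): only the exact two-character value "on" keeps FRED; anything else, including omitting the option, clears the capability early:

	/*
	 * fred=on                          -> keep X86_FEATURE_FRED
	 * fred=off, fred=1, option absent  -> setup_clear_cpu_cap(X86_FEATURE_FRED)
	 */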
/*
@@ -610,56 +1763,82 @@ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
* cache alignment.
* The others are not touched to avoid unwanted side effects.
*
- * WARNING: this function is only called on the BP. Don't add code here
- * that is supposed to run on all CPUs.
+ * WARNING: this function is only called on the boot CPU. Don't add code
+ * here that is supposed to run on all CPUs.
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
-
- cpu_detect(c);
+ if (cpuid_feature()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
+ get_cpu_cap(c);
+ setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_parse_early_param();
+
+ cpu_init_topology(c);
+
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
+
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
+
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_init_topology(c);
+ }
- get_cpu_vendor(c);
+ setup_force_cpu_cap(X86_FEATURE_ALWAYS);
- get_cpu_cap(c);
+ cpu_set_bug_bits(c);
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ sld_setup(c);
-#ifdef CONFIG_SMP
- c->cpu_index = boot_cpu_id;
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
#endif
- filter_cpuid_features(c, false);
+
+ /*
+ * Later in the boot process pgtable_l5_enabled() relies on
+ * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+ * enabled by this point, we need to clear the feature bit to avoid
+ * false positives at a later stage.
+ *
+ * pgtable_l5_enabled() can be false here for several reasons:
+ * - 5-level paging is disabled at compile time;
+ * - it's a 32-bit kernel;
+ * - machine doesn't support 5-level paging;
+ * - user specified 'no5lvl' in kernel command line.
+ */
+ if (!pgtable_l5_enabled())
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
+
+ detect_nopl();
+ mca_bsp_init(c);
}
-void __init early_cpu_init(void)
+void __init init_cpu_devs(void)
{
const struct cpu_dev *const *cdev;
int count = 0;
-#ifdef PROCESSOR_SELECT
- printk(KERN_INFO "KERNEL supported cpus:\n");
-#endif
-
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
@@ -667,89 +1846,144 @@ void __init early_cpu_init(void)
break;
cpu_devs[count] = cpudev;
count++;
+ }
+}
-#ifdef PROCESSOR_SELECT
- {
- unsigned int j;
+void __init early_cpu_init(void)
+{
+#ifdef CONFIG_PROCESSOR_SELECT
+ unsigned int i, j;
- for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
- continue;
- printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
- }
- }
+ pr_info("KERNEL supported cpus:\n");
#endif
+
+ init_cpu_devs();
+
+#ifdef CONFIG_PROCESSOR_SELECT
+ for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devs[i]->c_ident[j])
+ continue;
+ pr_info(" %s %s\n", cpu_devs[i]->c_vendor,
+ cpu_devs[i]->c_ident[j]);
+ }
}
+#endif
+
early_identify_cpu(&boot_cpu_data);
}
-/*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect. In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work. Disable it completely
- * unless we can find a reliable way to detect all the broken cases.
- */
-static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
- clear_cpu_cap(c, X86_FEATURE_NOPL);
+ /*
+ * Empirically, writing zero to a segment selector on AMD does
+ * not clear the base, whereas writing zero to a segment
+ * selector on Intel does clear the base. Intel's behavior
+ * allows slightly faster context switches in the common case
+ * where GS is unused by the prev and next threads.
+ *
+ * Since neither vendor documents this anywhere that I can see,
+ * detect it directly instead of hard-coding the choice by
+ * vendor.
+ *
+ * I've designated AMD's behavior as the "bug" because it's
+ * counterintuitive and less friendly.
+ */
+
+ unsigned long old_base, tmp;
+ rdmsrq(MSR_FS_BASE, old_base);
+ wrmsrq(MSR_FS_BASE, 1);
+ loadsegment(fs, 0);
+ rdmsrq(MSR_FS_BASE, tmp);
+ wrmsrq(MSR_FS_BASE, old_base);
+ return tmp == 0;
+}
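Probe summary (editor's note); it relies on MSR_FS_BASE being directly writable on 64-bit:

	/*
	 * wrmsrq(MSR_FS_BASE, 1);    force a known non-zero base
	 * loadsegment(fs, 0);        write a NULL selector to %fs
	 * rdmsrq(MSR_FS_BASE, tmp);  tmp == 0 -> Intel-style: the NULL
	 *                            write cleared the base
	 *                            tmp == 1 -> AMD-style: base survived
	 */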
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as an HV guest, then the HV has decided not to advertise
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
-static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+static void generic_identify(struct cpuinfo_x86 *c)
{
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
+ if (!cpuid_feature())
return;
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
- if (c->cpuid_level >= 0x00000001) {
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-# else
- c->apicid = c->initial_apicid;
-# endif
-#endif
-
-#ifdef CONFIG_X86_HT
- c->phys_proc_id = c->initial_apicid;
-#endif
- }
+ get_cpu_address_sizes(c);
get_model_name(c); /* Default name */
- init_scattered_cpuid_features(c);
- detect_nopl(c);
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
}
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void identify_cpu(struct cpuinfo_x86 *c)
{
int i;
c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
+ c->x86_cache_size = 0;
c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_model = c->x86_stepping = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -761,22 +1995,26 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
+ cpu_parse_topology(c);
+
if (this_cpu->c_identify)
this_cpu->c_identify(c);
- /* Clear/Set all flags overriden by options, after probe */
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ /* Clear/Set all flags overridden by options, after probe */
+ apply_forced_caps(c);
-#ifdef CONFIG_X86_64
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-#endif
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
/*
* Vendor-specific initialization. In this section we
@@ -791,9 +2029,22 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
+ bus_lock_init();
+
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
+ setup_smep(c);
+ setup_smap(c);
+ setup_umip(c);
+ setup_lass(c);
+
+ /* Enable FSGSBASE instructions if available. */
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -802,6 +2053,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -814,20 +2068,15 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86, c->x86_model);
}
-#ifdef CONFIG_X86_64
- detect_ht(c);
-#endif
-
- init_hypervisor(c);
+ x86_init_rdrand(c);
+ setup_pku(c);
+ setup_cet(c);
/*
- * Clear/Set all flags overriden by options, need do it
+ * Clear/Set all flags overridden by options, need do it
* before following smp all cpus cap AND.
*/
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ apply_forced_caps(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
@@ -839,104 +2088,89 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* AND the already accumulated flags with these */
for (i = 0; i < NCAPINTS; i++)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+
+ /* OR, i.e. replicate the bug flags */
+ for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++)
+ c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
}
+ ppin_init(c);
+
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
- select_idle_routine(c);
-
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
numa_add_cpu(smp_processor_id());
-#endif
}
-#ifdef CONFIG_X86_64
-static void vgetcpu_set_mode(void)
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
{
- if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
- vgetcpu_mode = VGETCPU_RDTSCP;
- else
- vgetcpu_mode = VGETCPU_LSL;
+ struct tss_struct *tss;
+ int cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ return;
+
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1);
+ wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32);
+
+ put_cpu();
}
#endif
-void __init identify_boot_cpu(void)
+static __init void identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_c1e_mask();
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
- sysenter_setup();
enable_sep_cpu();
-#else
- vgetcpu_set_mode();
#endif
- init_hw_perf_events();
+ cpu_detect_tlb(&boot_cpu_data);
+ setup_cr_pinning();
+
+ tsx_init();
+ tdx_init();
+ lkgs_init();
}
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- mtrr_ap_init();
-}
-
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static const struct msr_range msr_range_array[] __cpuinitconst = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
-
-static void __cpuinit print_cpu_msr(void)
-{
- unsigned index_min, index_max;
- unsigned index;
- u64 val;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
+ x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_amd_safe(index, &val))
- continue;
- printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr __cpuinitdata;
-
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
+ tsx_ap_init();
+ c->initialized = true;
}
-__setup("noclflush", setup_noclflush);
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+void print_cpu_info(struct cpuinfo_x86 *c)
{
const char *vendor = NULL;
@@ -948,316 +2182,456 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
}
if (vendor && !strstr(c->x86_model_id, vendor))
- printk(KERN_CONT "%s ", vendor);
+ pr_cont("%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
+ pr_cont("%s", c->x86_model_id);
else
- printk(KERN_CONT "%d86", c->x86);
+ pr_cont("%d86", c->x86);
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
- else
- printk(KERN_CONT "\n");
+ pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
-#ifdef CONFIG_SMP
- if (c->cpu_index < show_msr)
- print_cpu_msr();
-#else
- if (show_msr)
- print_cpu_msr();
-#endif
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
+ else
+ pr_cont(")\n");
}
-static __init int setup_disablecpuid(char *arg)
-{
- int bit;
+/*
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming environment variables for
+ * init.
+ */
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
+static __init int setup_clearcpuid(char *arg)
+{
+ return 1;
+}
+__setup("clearcpuid=", setup_clearcpuid);
+static __init int setup_setcpuid(char *arg)
+{
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("setcpuid=", setup_setcpuid);
-#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU_FIRST(union irq_stack_union,
- irq_stack_union) __aligned(PAGE_SIZE);
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+#ifdef CONFIG_X86_64
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * Note: Do not make this dependent on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
*/
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
-DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+static void wrmsrq_cstar(unsigned long val)
+{
+ /*
+ * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
+ * is so far ignored by the CPU, but raises a #VE trap in a TDX
+ * guest. Avoid the pointless write on all Intel CPUs.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ wrmsrq(MSR_CSTAR, val);
+}
-DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+static inline void idt_syscall_init(void)
+{
+ wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
+ if (ia32_enabled()) {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+	 * On AMD CPUs these MSRs are 32-bit, so the CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ } else {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ }
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+ /*
+ * Flags to clear on syscall; clear as much as possible
+ * to minimize user space-kernel interference.
+ */
+ wrmsrq(MSR_SYSCALL_MASK,
+ X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
+ X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
+ X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
+ X86_EFLAGS_AC|X86_EFLAGS_ID);
+}
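On SYSCALL the CPU saves RFLAGS into R11 and then clears every flag set in this mask (architecturally named IA32_FMASK), i.e. the kernel entry runs with rflags &= ~fmask. A small illustration using a subset of the architectural RFLAGS bits:

	#include <stdio.h>
	#include <stdint.h>

	#define X86_EFLAGS_TF (1u << 8)
	#define X86_EFLAGS_IF (1u << 9)
	#define X86_EFLAGS_DF (1u << 10)

	int main(void)
	{
		uint64_t user_rflags = 0x246 | X86_EFLAGS_DF;	/* e.g. IF|ZF|PF set, DF set */
		uint64_t fmask = X86_EFLAGS_TF | X86_EFLAGS_IF | X86_EFLAGS_DF;

		/* What the kernel entry code observes after SYSCALL */
		printf("%#llx -> %#llx\n", (unsigned long long)user_rflags,
		       (unsigned long long)(user_rflags & ~fmask));
		return 0;
	}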
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
+ /* The default user and kernel segments */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
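The single STAR write above packs both selector bases: bits 47:32 seed CS/SS on SYSCALL, bits 63:48 seed them on SYSRET. A sketch of the resulting layout, using illustrative selector values (the real ones come from asm/segment.h):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t kernel_cs = 0x10, user32_cs = 0x23;	/* illustrative only */
		uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

		/* SYSCALL:         CS = STAR[47:32],       SS = CS + 8            */
		/* SYSRET (64-bit): CS = STAR[63:48] + 16,  SS = STAR[63:48] + 8   */
		printf("STAR = %#018llx\n", (unsigned long long)star);
		return 0;
	}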
/*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to
- * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+	 * Except for the IA32_STAR MSR, there is no need to set up the SYSCALL
+	 * and SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED
+	 * entry point for SYSCALL and SYSENTER, and ERETU is the only legitimate
+ * instruction to return to ring 3 (both sysexit and sysret cause
+ * #UD when FRED is enabled).
*/
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_syscall_init();
+}
+#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+#ifdef CONFIG_STACKPROTECTOR
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
+EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
+#endif
#endif
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+static void initialize_debug_regs(void)
+{
+ /* Control register first -- to make sure everything is disabled. */
+ set_debugreg(DR7_FIXED_1, 7);
+ set_debugreg(DR6_RESERVED, 6);
+ /* dr5 and dr4 don't exist */
+ set_debugreg(0, 3);
+ set_debugreg(0, 2);
+ set_debugreg(0, 1);
+ set_debugreg(0, 0);
}
-unsigned long kernel_eflags;
-
+#ifdef CONFIG_KGDB
/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
*/
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
+static void dbg_restore_debug_regs(void)
+{
+ if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+ arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
-#else /* CONFIG_X86_64 */
+static inline void setup_getcpu(int cpu)
+{
+ unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
+ struct desc_struct d = { };
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+ wrmsrq(MSR_TSC_AUX, cpudata);
-#ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
-#endif
+ /* Store CPU and node number in limit. */
+ d.limit0 = cpudata;
+ d.limit1 = cpudata >> 16;
+
+ d.type = 5; /* RO data, expand down, accessed */
+ d.dpl = 3; /* Visible to user code */
+ d.s = 1; /* Not a system segment */
+ d.p = 1; /* Present */
+ d.d = 1; /* 32-bit */
-/* Make sure %fs and %gs are initialized properly in idle threads */
-struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
+}
+
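User space can recover the same cpu/node pair the way the vDSO does, with LSL on this segment. A hedged sketch, assuming the conventional GDT_ENTRY_CPUNODE slot 15 (selector 15*8 + 3 = 123) and the 12-bit CPU field used by vdso_encode_cpunode(); verify both against your kernel before relying on it:

	#include <stdio.h>

	#define VDSO_CPUNODE_BITS 12
	#define VDSO_CPUNODE_MASK 0xfff

	int main(void)
	{
		unsigned long p = 0;

		/* LSL reads the segment limit; ZF checking omitted for brevity */
		asm volatile("lsl %1, %0" : "=r"(p) : "r"(123UL));
		printf("cpu %lu, node %lu\n",
		       p & VDSO_CPUNODE_MASK, p >> VDSO_CPUNODE_BITS);
		return 0;
	}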
+#ifdef CONFIG_X86_64
+static inline void tss_setup_ist(struct tss_struct *tss)
{
- memset(regs, 0, sizeof(struct pt_regs));
- regs->fs = __KERNEL_PERCPU;
- regs->gs = __KERNEL_STACK_CANARY;
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ /* Only mapped when SEV-ES is active */
+ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
+}
+#else /* CONFIG_X86_64 */
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+#endif /* !CONFIG_X86_64 */
+
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
- return regs;
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+	 * Invalidate the extra array entry past the end of the all-permission
+	 * bitmap, as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
+#endif
}
-#endif /* CONFIG_X86_64 */
/*
- * Clear all 6 debug registers:
+ * Set up everything needed to handle exceptions from the IDT, including the IST
+ * exceptions that use paranoid_entry().
*/
-static void clear_all_debug_regs(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
- int i;
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ int cpu = raw_smp_processor_id();
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
+ /* paranoid_entry() gets the CPU number from the GDT */
+ setup_getcpu(cpu);
+
+ /* For IDT mode, IST vectors need to be set in TSS. */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
+ load_TR_desc();
+
+ /* GHCB needs to be setup to handle #VC. */
+ setup_ghcb();
- set_debugreg(0, i);
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
}
}
-#ifdef CONFIG_KGDB
-/*
- * Restore debug regs if using kgdbwait and you have a kernel debugger
- * connection established.
- */
-static void dbg_restore_debug_regs(void)
+void __init cpu_init_replace_early_idt(void)
{
- if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
- arch_kgdb_ops.correct_hw_break();
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ cpu_init_fred_exceptions();
+ else
+ idt_setup_early_pf();
}
-#else /* ! CONFIG_KGDB */
-#define dbg_restore_debug_regs()
-#endif /* ! CONFIG_KGDB */
/*
* cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init for 64 bit
+ * initialized (naturally) in the bootstrap process, such as the GDT. We
+ * reload it nevertheless; this function acts as a 'CPU state barrier':
+ * nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
-void __cpuinit cpu_init(void)
+void cpu_init(void)
{
- struct orig_ist *oist;
- struct task_struct *me;
- struct tss_struct *t;
- unsigned long v;
- int cpu;
- int i;
-
- cpu = stack_smp_processor_id();
- t = &per_cpu(init_tss, cpu);
- oist = &per_cpu(orig_ist, cpu);
+ struct task_struct *cur = current;
+ int cpu = raw_smp_processor_id();
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ if (this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
+ pr_debug("Initializing CPU#%d\n", cpu);
- me = current;
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
- panic("CPU#%d already initialized!\n", cpu);
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- pr_debug("Initializing CPU#%d\n", cpu);
+ wrmsrq(MSR_FS_BASE, 0);
+ wrmsrq(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ x2apic_setup();
+
+ intel_posted_msi_init();
+ }
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ mmgrab(&init_mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, cur);
/*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
*/
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
- switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
-
- load_idt((const struct desc_ptr *)&idt_descr);
+ load_mm_ldt(&init_mm);
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
+ initialize_debug_regs();
+ dbg_restore_debug_regs();
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ doublefault_init_cpu_tss();
- x86_configure_nx();
- if (cpu != 0)
- enable_x2apic();
+ if (is_uv_system())
+ uv_cpu_init();
- /*
- * set up and load the per-CPU TSS
- */
- if (!oist->ist[0]) {
- char *estacks = per_cpu(exception_stacks, cpu);
+ load_fixmap_gdt(cpu);
+}
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- estacks += exception_stack_sizes[v];
- oist->ist[v] = t->x86_tss.ist[v] =
- (unsigned long)estacks;
- }
- }
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/**
+ * store_cpu_caps() - Store a snapshot of CPU capabilities
+ * @curr_info: Pointer where to store it
+ *
+ * Return: None
+ */
+void store_cpu_caps(struct cpuinfo_x86 *curr_info)
+{
+ /* Reload CPUID max function as it might've changed. */
+ curr_info->cpuid_level = cpuid_eax(0);
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /* Copy all capability leafs and pick up the synthetic ones. */
+ memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability,
+ sizeof(curr_info->x86_capability));
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
+ /* Get the hardware CPUID leafs */
+ get_cpu_cap(curr_info);
+}
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
- enter_lazy_tlb(&init_mm, me);
+/**
+ * microcode_check() - Check if any CPU capabilities changed after an update.
+ * @prev_info: CPU capabilities stored before an update.
+ *
+ * The microcode loader calls this upon late microcode load to recheck features,
+ * but only when the microcode actually has been updated. The caller holds the
+ * CPU hotplug lock.
+ *
+ * Return: None
+ */
+void microcode_check(struct cpuinfo_x86 *prev_info)
+{
+ struct cpuinfo_x86 curr_info;
- load_sp0(t, &current->thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
+ perf_check_microcode();
- clear_all_debug_regs();
- dbg_restore_debug_regs();
+ amd_check_microcode();
- fpu_init();
+ store_cpu_caps(&curr_info);
- raw_local_save_flags(kernel_eflags);
+ if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
+ sizeof(prev_info->x86_capability)))
+ return;
- if (is_uv_system())
- uv_cpu_init();
+ pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
+ pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+#endif
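A hedged sketch of how a late-loading sequence might use this pair; apply_microcode_update() stands in for the real loader machinery and is hypothetical:

	struct cpuinfo_x86 prev_info;

	store_cpu_caps(&prev_info);	/* snapshot CPUID-derived capabilities */
	apply_microcode_update();	/* hypothetical: write the new patch   */
	microcode_check(&prev_info);	/* re-read caps and warn on any delta  */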
-#else
-
-void __cpuinit cpu_init(void)
+/*
+ * Invoked from core CPU hotplug code after hotplug operations
+ */
+void arch_smt_update(void)
{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &curr->thread;
-
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
- printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;)
- local_irq_enable();
- }
+ /* Handle the speculative execution misfeatures */
+ cpu_bugs_smt_update();
+ /* Check whether IPI broadcasting can be enabled */
+ apic_smt_update();
+}
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+void __init arch_cpu_finalize_init(void)
+{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ identify_boot_cpu();
- load_idt(&idt_descr);
- switch_to_new_gdt(cpu);
+ select_idle_routine();
/*
- * Set up and load the per-CPU TSS and LDT
+	 * identify_boot_cpu() initialized the SMT support information; let the
+	 * core code know.
*/
- atomic_inc(&init_mm.mm_count);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- enter_lazy_tlb(&init_mm, curr);
+ cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core);
- load_sp0(t, thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ cpu_select_mitigations();
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+ arch_smt_update();
- clear_all_debug_regs();
- dbg_restore_debug_regs();
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /*
+		 * Check whether this is a real i386, which is no longer
+		 * supported, and fix up the utsname.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ }
+
+ /*
+ * Must be before alternatives because it might set or clear
+ * feature bits.
+ */
+ fpu__init_system();
+ fpu__init_cpu();
/*
- * Force FPU initialization:
+	 * This needs to follow the FPU initialization, since EFI depends on it.
*/
- current_thread_info()->status = 0;
- clear_used_math();
- mxcsr_feature_mask_init();
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
/*
- * Boot processor to setup the FP and extended state context info.
+	 * Ensure that access to the per-CPU representation has the initial
+ * boot CPU configuration.
*/
- if (smp_processor_id() == boot_cpu_id)
- init_thread_xstate();
+ *c = boot_cpu_data;
+ c->initialized = true;
+
+ alternative_instructions();
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ USER_PTR_MAX = TASK_SIZE_MAX;
- xsave_init();
+ /*
+ * Enable this when LAM is gated on LASS support
+ if (cpu_feature_enabled(X86_FEATURE_LAM))
+ USER_PTR_MAX = (1ul << 63) - PAGE_SIZE;
+ */
+ runtime_const_init(ptr, USER_PTR_MAX);
+
+ /*
+		 * Make sure the first 2MB area is not mapped with huge pages.
+		 * There are typically fixed-size MTRRs in there, and overlapping
+		 * MTRRs with large pages causes slowdowns.
+		 *
+		 * Right now we don't do that for gbpages because there seems to be
+		 * very little benefit in that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+ } else {
+ fpu__init_check_bugs();
+ }
+
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
}
-#endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..5c7a3a71191a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,12 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH_X86_CPU_H
-
#define ARCH_X86_CPU_H
-struct cpu_model_info {
- int vendor;
- int family;
- const char *model_names[16];
-};
+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
/* attempt to consolidate cpu attributes */
struct cpu_dev {
@@ -15,23 +14,81 @@ struct cpu_dev {
/* some have two possibilities for cpuid string */
const char *c_ident[2];
- struct cpu_model_info c_models[4];
-
void (*c_early_init)(struct cpuinfo_x86 *);
+ void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
- unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+ void (*c_detect_tlb)(struct cpuinfo_x86 *);
int c_x86_vendor;
+#ifdef CONFIG_X86_32
+	/* Optional vendor-specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
+
+ /* Family/stepping-based lookup table for model names. */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
};
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
- __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ __section(".x86_cpu_dev.init") = \
&cpu_devX;
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+extern void __init tsx_init(void);
+void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
+#else
+static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
+extern void init_spectral_chicken(struct cpuinfo_x86 *c);
+
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
+extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+struct amd_northbridge *amd_init_l3_cache(int index);
+#else
+static inline struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ return NULL;
+}
#endif
+
+unsigned int aperfmperf_get_khz(int cpu);
+void cpu_select_mitigations(void);
+
+extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
+
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
+
+#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad28..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
-#
-# CPU Frequency scaling
-#
-
-menu "CPU Frequency scaling"
-
-source "drivers/cpufreq/Kconfig"
-
-if CPU_FREQ
-
-comment "CPUFreq processor drivers"
-
-config X86_PCC_CPUFREQ
- tristate "Processor Clocking Control interface driver"
- depends on ACPI && ACPI_PROCESSOR
- help
- This driver adds support for the PCC interface.
-
- For details, take a look at:
- <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
-
- To compile this driver as a module, choose M here: the
- module will be called pcc-cpufreq.
-
- If in doubt, say N.
-
-config X86_ACPI_CPUFREQ
- tristate "ACPI Processor P-States driver"
- select CPU_FREQ_TABLE
- depends on ACPI_PROCESSOR
- help
- This driver adds a CPUFreq driver which utilizes the ACPI
- Processor Performance States.
- This driver also supports Intel Enhanced Speedstep.
-
- To compile this driver as a module, choose M here: the
- module will be called acpi-cpufreq.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config ELAN_CPUFREQ
- tristate "AMD Elan SC400 and SC410"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC400 and SC410
- processors.
-
- You need to specify the processor maximum speed as boot
- parameter: elanfreq=maxspeed (in kHz) or as module
- parameter "max_freq".
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config SC520_CPUFREQ
- tristate "AMD Elan SC520"
- select CPU_FREQ_TABLE
- depends on X86_ELAN
- ---help---
- This adds the CPUFreq driver for AMD Elan SC520 processor.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-
-config X86_POWERNOW_K6
- tristate "AMD Mobile K6-2/K6-3 PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
- AMD K6-3+ processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7
- tristate "AMD Mobile Athlon/Duron PowerNow!"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for mobile AMD K7 mobile processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_POWERNOW_K7_ACPI
- bool
- depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
- depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
- depends on X86_32
- default y
-
-config X86_POWERNOW_K8
- tristate "AMD Opteron/Athlon64 PowerNow!"
- select CPU_FREQ_TABLE
- depends on ACPI && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
-
- To compile this driver as a module, choose M here: the
- module will be called powernow-k8.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
-config X86_GX_SUSPMOD
- tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
- depends on X86_32 && PCI
- help
- This add the CPUFreq driver for NatSemi Geode processors which
- support suspend modulation.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO
- tristate "Intel Enhanced SpeedStep (deprecated)"
- select CPU_FREQ_TABLE
- select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
- depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
- help
- This is deprecated and this functionality is now merged into
- acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
- speedstep_centrino.
- This adds the CPUFreq driver for Enhanced SpeedStep enabled
- mobile CPUs. This means Intel Pentium M (Centrino) CPUs
- or 64bit enabled Intel Xeons.
-
- To compile this driver as a module, choose M here: the
- module will be called speedstep-centrino.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO_TABLE
- bool "Built-in tables for Banias CPUs"
- depends on X86_32 && X86_SPEEDSTEP_CENTRINO
- default y
- help
- Use built-in tables for Banias CPUs if ACPI encoding
- is not available.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_ICH
- tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
- select CPU_FREQ_TABLE
- depends on X86_32
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
- mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
- ICH3 or ICH4 southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_SPEEDSTEP_SMI
- tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for certain mobile Intel Pentium III
- (Coppermine), all mobile Intel Pentium III-M (Tualatin)
- on systems which have an Intel 440BX/ZX/MX southbridge.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_P4_CLOCKMOD
- tristate "Intel Pentium 4 clock modulation"
- select CPU_FREQ_TABLE
- help
- This adds the CPUFreq driver for Intel Pentium 4 / XEON
- processors. When enabled it will lower CPU temperature by skipping
- clocks.
-
- This driver should be only used in exceptional
- circumstances when very low power is needed because it causes severe
- slowdowns and noticeable latencies. Normally Speedstep should be used
- instead.
-
- To compile this driver as a module, choose M here: the
- module will be called p4-clockmod.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- Unless you are absolutely sure say N.
-
-config X86_CPUFREQ_NFORCE2
- tristate "nVidia nForce2 FSB changing"
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for FSB changing on nVidia nForce2
- platforms.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGRUN
- tristate "Transmeta LongRun"
- depends on X86_32
- help
- This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
- which support LongRun.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_LONGHAUL
- tristate "VIA Cyrix III Longhaul"
- select CPU_FREQ_TABLE
- depends on X86_32 && ACPI_PROCESSOR
- help
- This adds the CPUFreq driver for VIA Samuel/CyrixIII,
- VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
- processors.
-
- For details, take a look at <file:Documentation/cpu-freq/>.
-
- If in doubt, say N.
-
-config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
- select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
- help
- This adds the CPUFreq driver for VIA C7 processors. However, this driver
- does not have any safeguards to prevent operating the CPU out of spec
- and is thus considered dangerous. Please use the regular ACPI cpufreq
- driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
-
- If in doubt, say N.
-
-comment "shared options"
-
-config X86_SPEEDSTEP_LIB
- tristate
- default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
-
-config X86_SPEEDSTEP_RELAXED_CAP_CHECK
- bool "Relaxed speedstep capability checks"
- depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
- help
- Don't perform all checks for a speedstep capable system which would
- normally be done. Some ancient or strange systems, though speedstep
- capable, don't always indicate that they are speedstep capable. This
- option lets the probing code bypass some of those checks if the
- parameter "relaxed_check=1" is passed to the module.
-
-endif # CPU_FREQ
-
-endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6fb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
-# K8 systems. ACPI is preferred to all other hardware-specific drivers.
-# speedstep-* is preferred over p4-clockmod.
-
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
-obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
-obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
-obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
-obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
-obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
-obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
-obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
-obj-$(CONFIG_X86_LONGRUN) += longrun.o
-obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
-obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
-obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
-obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
-obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
-obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
-obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index 1d3cddaa40ee..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,778 +0,0 @@
-/*
- * acpi-cpufreq.c - ACPI Processor P-States Driver
- *
- * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
- * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
- * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-#include <trace/events/power.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include "mperf.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "acpi-cpufreq", msg)
-
-MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-enum {
- UNDEFINED_CAPABLE = 0,
- SYSTEM_INTEL_MSR_CAPABLE,
- SYSTEM_IO_CAPABLE,
-};
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct acpi_cpufreq_data {
- struct acpi_processor_performance *acpi_data;
- struct cpufreq_frequency_table *freq_table;
- unsigned int resume;
- unsigned int cpu_feature;
-};
-
-static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-
-/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-static unsigned int acpi_pstate_strict;
-
-static int check_est_cpu(unsigned int cpuid)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
-
- return cpu_has(cpu, X86_FEATURE_EST);
-}
-
-static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-{
- struct acpi_processor_performance *perf;
- int i;
-
- perf = data->acpi_data;
-
- for (i = 0; i < perf->state_count; i++) {
- if (value == perf->states[i].status)
- return data->freq_table[i].frequency;
- }
- return 0;
-}
-
-static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
-{
- int i;
- struct acpi_processor_performance *perf;
-
- msr &= INTEL_MSR_RANGE;
- perf = data->acpi_data;
-
- for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
- if (msr == perf->states[data->freq_table[i].index].status)
- return data->freq_table[i].frequency;
- }
- return data->freq_table[0].frequency;
-}
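For the record, the MSR path being deleted here matched the low 16 bits of MSR_IA32_PERF_STATUS (INTEL_MSR_RANGE) against the ACPI _PSS status values. A standalone worked example with hypothetical table contents:

	#include <stdio.h>
	#include <stdint.h>

	#define INTEL_MSR_RANGE 0xffff

	int main(void)
	{
		/* Hypothetical _PSS status words and frequencies in kHz */
		uint32_t status[] = { 0x0a1f, 0x081c, 0x0614 };
		unsigned freq[]   = { 2600000, 2100000, 1600000 };
		uint64_t msr = 0xdead081cULL;	/* upper bits are noise */
		uint32_t val = msr & INTEL_MSR_RANGE;

		for (int i = 0; i < 3; i++)
			if (val == status[i])
				printf("current frequency: %u kHz\n", freq[i]);
		return 0;
	}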
-
-static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
-{
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- return extract_msr(val, data);
- case SYSTEM_IO_CAPABLE:
- return extract_io(val, data);
- default:
- return 0;
- }
-}
-
-struct msr_addr {
- u32 reg;
-};
-
-struct io_addr {
- u16 port;
- u8 bit_width;
-};
-
-struct drv_cmd {
- unsigned int type;
- const struct cpumask *mask;
- union {
- struct msr_addr msr;
- struct io_addr io;
- } addr;
- u32 val;
-};
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void do_drv_read(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 h;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, cmd->val, h);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
- &cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-/* Called via smp_call_function_many(), on the target CPUs */
-static void do_drv_write(void *_cmd)
-{
- struct drv_cmd *cmd = _cmd;
- u32 lo, hi;
-
- switch (cmd->type) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- rdmsr(cmd->addr.msr.reg, lo, hi);
- lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
- wrmsr(cmd->addr.msr.reg, lo, hi);
- break;
- case SYSTEM_IO_CAPABLE:
- acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
- cmd->val,
- (u32)cmd->addr.io.bit_width);
- break;
- default:
- break;
- }
-}
-
-static void drv_read(struct drv_cmd *cmd)
-{
- int err;
- cmd->val = 0;
-
- err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
- WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
-}
-
-static void drv_write(struct drv_cmd *cmd)
-{
- int this_cpu;
-
- this_cpu = get_cpu();
- if (cpumask_test_cpu(this_cpu, cmd->mask))
- do_drv_write(cmd);
- smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
- put_cpu();
-}
-
-static u32 get_cur_val(const struct cpumask *mask)
-{
- struct acpi_processor_performance *perf;
- struct drv_cmd cmd;
-
- if (unlikely(cpumask_empty(mask)))
- return 0;
-
- switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- break;
- default:
- return 0;
- }
-
- cmd.mask = mask;
- drv_read(&cmd);
-
- dprintk("get_cur_val = %u\n", cmd.val);
-
- return cmd.val;
-}
-
-static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
- unsigned int freq;
- unsigned int cached_freq;
-
- dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return 0;
- }
-
- cached_freq = data->freq_table[data->acpi_data->state].frequency;
- freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
- if (freq != cached_freq) {
- /*
- * The dreaded BIOS frequency change behind our back.
- * Force set the frequency on next target call.
- */
- data->resume = 1;
- }
-
- dprintk("cur freq = %u\n", freq);
-
- return freq;
-}
-
-static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
- struct acpi_cpufreq_data *data)
-{
- unsigned int cur_freq;
- unsigned int i;
-
- for (i = 0; i < 100; i++) {
- cur_freq = extract_freq(get_cur_val(mask), data);
- if (cur_freq == freq)
- return 1;
- udelay(10);
- }
- return 0;
-}
-
-static int acpi_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
- struct acpi_processor_performance *perf;
- struct cpufreq_freqs freqs;
- struct drv_cmd cmd;
- unsigned int next_state = 0; /* Index into freq_table */
- unsigned int next_perf_state = 0; /* Index into perf table */
- unsigned int i;
- int result = 0;
-
- dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
-
- if (unlikely(data == NULL ||
- data->acpi_data == NULL || data->freq_table == NULL)) {
- return -ENODEV;
- }
-
- perf = data->acpi_data;
- result = cpufreq_frequency_table_target(policy,
- data->freq_table,
- target_freq,
- relation, &next_state);
- if (unlikely(result)) {
- result = -ENODEV;
- goto out;
- }
-
- next_perf_state = data->freq_table[next_state].index;
- if (perf->state == next_perf_state) {
- if (unlikely(data->resume)) {
- dprintk("Called after resume, resetting to P%d\n",
- next_perf_state);
- data->resume = 0;
- } else {
- dprintk("Already at target state (P%d)\n",
- next_perf_state);
- goto out;
- }
- }
-
- trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
-
- switch (data->cpu_feature) {
- case SYSTEM_INTEL_MSR_CAPABLE:
- cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
- cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- case SYSTEM_IO_CAPABLE:
- cmd.type = SYSTEM_IO_CAPABLE;
- cmd.addr.io.port = perf->control_register.address;
- cmd.addr.io.bit_width = perf->control_register.bit_width;
- cmd.val = (u32) perf->states[next_perf_state].control;
- break;
- default:
- result = -ENODEV;
- goto out;
- }
-
- /* cpufreq holds the hotplug lock, so we are safe from here on */
- if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
- cmd.mask = policy->cpus;
- else
- cmd.mask = cpumask_of(policy->cpu);
-
- freqs.old = perf->states[perf->state].core_frequency * 1000;
- freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, cmd.mask) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- drv_write(&cmd);
-
- if (acpi_pstate_strict) {
- if (!check_freqs(cmd.mask, freqs.new, data)) {
- dprintk("acpi_cpufreq_target failed (%d)\n",
- policy->cpu);
- result = -EAGAIN;
- goto out;
- }
- }
-
- for_each_cpu(i, cmd.mask) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- perf->state = next_perf_state;
-
-out:
- return result;
-}
-
-static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_verify\n");
-
- return cpufreq_frequency_table_verify(policy, data->freq_table);
-}
-
-static unsigned long
-acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
-{
- struct acpi_processor_performance *perf = data->acpi_data;
-
- if (cpu_khz) {
- /* search the closest match to cpu_khz */
- unsigned int i;
- unsigned long freq;
- unsigned long freqn = perf->states[0].core_frequency * 1000;
-
- for (i = 0; i < (perf->state_count-1); i++) {
- freq = freqn;
- freqn = perf->states[i+1].core_frequency * 1000;
- if ((2 * cpu_khz) > (freqn + freq)) {
- perf->state = i;
- return freq;
- }
- }
- perf->state = perf->state_count-1;
- return freqn;
- } else {
- /* assume CPU is at P0... */
- perf->state = 0;
- return perf->states[0].core_frequency * 1000;
- }
-}
-
-static void free_acpi_perf_data(void)
-{
- unsigned int i;
-
- /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
- for_each_possible_cpu(i)
- free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
- ->shared_cpu_map);
- free_percpu(acpi_perf_data);
-}
-
-/*
- * acpi_cpufreq_early_init - initialize ACPI P-States library
- *
- * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
- * in order to determine correct frequency and voltage pairings. We can
- * do _PDC and _PSD and find out the processor dependency for the
- * actual init that will happen later...
- */
-static int __init acpi_cpufreq_early_init(void)
-{
- unsigned int i;
- dprintk("acpi_cpufreq_early_init\n");
-
- acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
- if (!acpi_perf_data) {
- dprintk("Memory allocation error for acpi_perf_data.\n");
- return -ENOMEM;
- }
- for_each_possible_cpu(i) {
- if (!zalloc_cpumask_var_node(
- &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
- GFP_KERNEL, cpu_to_node(i))) {
-
- /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
- free_acpi_perf_data();
- return -ENOMEM;
- }
- }
-
- /* Do initialization in ACPI core */
- acpi_processor_preregister_performance(acpi_perf_data);
- return 0;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Some BIOSes do SW_ANY coordination internally, either set it up in hw
- * or do it in BIOS firmware and won't inform about it to OS. If not
- * detected, this has a side effect of making CPU run at a different speed
- * than OS intended it to run at. Detect it and handle it cleanly.
- */
-static int bios_with_sw_any_bug;
-
-static int sw_any_bug_found(const struct dmi_system_id *d)
-{
- bios_with_sw_any_bug = 1;
- return 0;
-}
-
-static const struct dmi_system_id sw_any_bug_dmi_table[] = {
- {
- .callback = sw_any_bug_found,
- .ident = "Supermicro Server X6DLP",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
- DMI_MATCH(DMI_BIOS_VERSION, "080010"),
- DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
- },
- },
- { }
-};
-
-static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
-{
- /* Intel Xeon Processor 7100 Series Specification Update
- * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
- * AL30: A Machine Check Exception (MCE) Occurring during an
- * Enhanced Intel SpeedStep Technology Ratio Change May Cause
- * Both Processor Cores to Lock Up. */
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- if ((c->x86 == 15) &&
- (c->x86_model == 6) &&
- (c->x86_mask == 8)) {
- printk(KERN_INFO "acpi-cpufreq: Intel(R) "
- "Xeon(R) 7100 Errata AL30, processors may "
- "lock up on frequency changes: disabling "
- "acpi-cpufreq.\n");
- return -ENODEV;
- }
- }
- return 0;
-}
-#endif
-
-static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- unsigned int valid_states = 0;
- unsigned int cpu = policy->cpu;
- struct acpi_cpufreq_data *data;
- unsigned int result = 0;
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- struct acpi_processor_performance *perf;
-#ifdef CONFIG_SMP
- static int blacklisted;
-#endif
-
- dprintk("acpi_cpufreq_cpu_init\n");
-
-#ifdef CONFIG_SMP
- if (blacklisted)
- return blacklisted;
- blacklisted = acpi_cpufreq_blacklist(c);
- if (blacklisted)
- return blacklisted;
-#endif
-
- data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
- per_cpu(acfreq_data, cpu) = data;
-
- if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
- acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- result = acpi_processor_register_performance(data->acpi_data, cpu);
- if (result)
- goto err_free;
-
- perf = data->acpi_data;
- policy->shared_type = perf->shared_type;
-
- /*
- * Will let policy->cpus know about dependency only when software
- * coordination is required.
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
- policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
- cpumask_copy(policy->cpus, perf->shared_cpu_map);
- }
- cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
-
-#ifdef CONFIG_SMP
- dmi_check_system(sw_any_bug_dmi_table);
- if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
- policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
- cpumask_copy(policy->cpus, cpu_core_mask(cpu));
- }
-#endif
-
- /* capability check */
- if (perf->state_count <= 1) {
- dprintk("No P-States\n");
- result = -ENODEV;
- goto err_unreg;
- }
-
- if (perf->control_register.space_id != perf->status_register.space_id) {
- result = -ENODEV;
- goto err_unreg;
- }
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- dprintk("SYSTEM IO addr space\n");
- data->cpu_feature = SYSTEM_IO_CAPABLE;
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- dprintk("HARDWARE addr space\n");
- if (!check_est_cpu(cpu)) {
- result = -ENODEV;
- goto err_unreg;
- }
- data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
- break;
- default:
- dprintk("Unknown addr space %d\n",
- (u32) (perf->control_register.space_id));
- result = -ENODEV;
- goto err_unreg;
- }
-
- data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
- (perf->state_count+1), GFP_KERNEL);
- if (!data->freq_table) {
- result = -ENOMEM;
- goto err_unreg;
- }
-
- /* detect transition latency */
- policy->cpuinfo.transition_latency = 0;
- for (i = 0; i < perf->state_count; i++) {
- if ((perf->states[i].transition_latency * 1000) >
- policy->cpuinfo.transition_latency)
- policy->cpuinfo.transition_latency =
- perf->states[i].transition_latency * 1000;
- }
-
- /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
- if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
- policy->cpuinfo.transition_latency > 20 * 1000) {
- policy->cpuinfo.transition_latency = 20 * 1000;
- printk_once(KERN_INFO
- "P-state transition latency capped at 20 uS\n");
- }
-
- /* table init */
- for (i = 0; i < perf->state_count; i++) {
- if (i > 0 && perf->states[i].core_frequency >=
- data->freq_table[valid_states-1].frequency / 1000)
- continue;
-
- data->freq_table[valid_states].index = i;
- data->freq_table[valid_states].frequency =
- perf->states[i].core_frequency * 1000;
- valid_states++;
- }
- data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
- perf->state = 0;
-
- result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
- if (result)
- goto err_freqfree;
-
- if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
- printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
-
- switch (perf->control_register.space_id) {
- case ACPI_ADR_SPACE_SYSTEM_IO:
- /* Current speed is unknown and not detectable by IO port */
- policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
- break;
- case ACPI_ADR_SPACE_FIXED_HARDWARE:
- acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
- policy->cur = get_cur_freq_on_cpu(cpu);
- break;
- default:
- break;
- }
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- /* Check for APERF/MPERF support in hardware */
- if (cpu_has(c, X86_FEATURE_APERFMPERF))
- acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
-
- dprintk("CPU%u - ACPI performance management activated.\n", cpu);
- for (i = 0; i < perf->state_count; i++)
- dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
- (i == perf->state ? '*' : ' '), i,
- (u32) perf->states[i].core_frequency,
- (u32) perf->states[i].power,
- (u32) perf->states[i].transition_latency);
-
- cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
-
- /*
- * the first call to ->target() should result in us actually
- * writing something to the appropriate registers.
- */
- data->resume = 1;
-
- return result;
-
-err_freqfree:
- kfree(data->freq_table);
-err_unreg:
- acpi_processor_unregister_performance(perf, cpu);
-err_free:
- kfree(data);
- per_cpu(acfreq_data, cpu) = NULL;
-
- return result;
-}
-
-static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_cpu_exit\n");
-
- if (data) {
- cpufreq_frequency_table_put_attr(policy->cpu);
- per_cpu(acfreq_data, policy->cpu) = NULL;
- acpi_processor_unregister_performance(data->acpi_data,
- policy->cpu);
- kfree(data);
- }
-
- return 0;
-}
-
-static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
-{
- struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
- dprintk("acpi_cpufreq_resume\n");
-
- data->resume = 1;
-
- return 0;
-}
-
-static struct freq_attr *acpi_cpufreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
- .verify = acpi_cpufreq_verify,
- .target = acpi_cpufreq_target,
- .bios_limit = acpi_processor_get_bios_limit,
- .init = acpi_cpufreq_cpu_init,
- .exit = acpi_cpufreq_cpu_exit,
- .resume = acpi_cpufreq_resume,
- .name = "acpi-cpufreq",
- .owner = THIS_MODULE,
- .attr = acpi_cpufreq_attr,
-};
-
-static int __init acpi_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- dprintk("acpi_cpufreq_init\n");
-
- ret = acpi_cpufreq_early_init();
- if (ret)
- return ret;
-
- ret = cpufreq_register_driver(&acpi_cpufreq_driver);
- if (ret)
- free_acpi_perf_data();
-
- return ret;
-}
-
-static void __exit acpi_cpufreq_exit(void)
-{
- dprintk("acpi_cpufreq_exit\n");
-
- cpufreq_unregister_driver(&acpi_cpufreq_driver);
-
- free_percpu(acpi_perf_data);
-}
-
-module_param(acpi_pstate_strict, uint, 0644);
-MODULE_PARM_DESC(acpi_pstate_strict,
- "value 0 or non-zero. non-zero -> strict ACPI checks are "
- "performed during frequency changes.");
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
-
-MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 733093d60436..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#define NFORCE2_XTAL 25
-#define NFORCE2_BOOTFSB 0x48
-#define NFORCE2_PLLENABLE 0xa8
-#define NFORCE2_PLLREG 0xa4
-#define NFORCE2_PLLADR 0xa0
-#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
-
-#define NFORCE2_MIN_FSB 50
-#define NFORCE2_SAFE_DISTANCE 50
-
-/* Delay in ms between FSB changes */
-/* #define NFORCE2_DELAY 10 */
-
-/*
- * nforce2_chipset:
- * FSB is changed using the chipset
- */
-static struct pci_dev *nforce2_dev;
-
-/* fid:
- * multiplier * 10
- */
-static int fid;
-
-/* min_fsb, max_fsb:
- * minimum and maximum FSB (= FSB at boot time)
- */
-static int min_fsb;
-static int max_fsb;
-
-MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
-MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
-MODULE_LICENSE("GPL");
-
-module_param(fid, int, 0444);
-module_param(min_fsb, int, 0444);
-
-MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
-MODULE_PARM_DESC(min_fsb,
- "Minimum FSB to use, if not defined: current FSB - 50");
-
-#define PFX "cpufreq-nforce2: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "cpufreq-nforce2", msg)
-
-/**
- * nforce2_calc_fsb - calculate FSB
- * @pll: PLL value
- *
- * Calculates FSB from PLL value
- */
-static int nforce2_calc_fsb(int pll)
-{
- unsigned char mul, div;
-
- mul = (pll >> 8) & 0xff;
- div = pll & 0xff;
-
- if (div > 0)
- return NFORCE2_XTAL * mul / div;
-
- return 0;
-}
-
-/**
- * nforce2_calc_pll - calculate PLL value
- * @fsb: FSB
- *
- * Calculate PLL value for given FSB
- */
-static int nforce2_calc_pll(unsigned int fsb)
-{
- unsigned char xmul, xdiv;
- unsigned char mul = 0, div = 0;
- int tried = 0;
-
- /* Try to calculate multiplier and divider up to 4 times */
- while (((mul == 0) || (div == 0)) && (tried <= 3)) {
- for (xdiv = 2; xdiv <= 0x80; xdiv++)
- for (xmul = 1; xmul <= 0xfe; xmul++)
- if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
- fsb + tried) {
- mul = xmul;
- div = xdiv;
- }
- tried++;
- }
-
- if ((mul == 0) || (div == 0))
- return -1;
-
- return NFORCE2_PLL(mul, div);
-}
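The arithmetic above is self-contained: the PLL word packs an 8-bit multiplier and divider around the 25 MHz crystal, so FSB = 25 * mul / div, and nforce2_calc_pll() simply brute-forces every (mul, div) pair, retrying at fsb+1..fsb+3 MHz when no exact pair exists. A minimal user-space sketch of the same round trip, with purely illustrative values:

#include <stdio.h>

#define XTAL		25
#define PLL(mul, div)	(0x100000 | ((mul) << 8) | (div))

/* Mirror of nforce2_calc_fsb(): FSB = 25 MHz * mul / div */
static int calc_fsb(int pll)
{
	unsigned char mul = (pll >> 8) & 0xff;
	unsigned char div = pll & 0xff;

	return div ? XTAL * mul / div : 0;
}

int main(void)
{
	int pll = PLL(40, 5);	/* 25 * 40 / 5 = 200 MHz FSB */

	printf("pll=0x%06x -> fsb=%d MHz\n", pll, calc_fsb(pll));
	return 0;
}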
-
-/**
- * nforce2_write_pll - write PLL value to chipset
- * @pll: PLL value
- *
- * Writes new FSB PLL value to chipset
- */
-static void nforce2_write_pll(int pll)
-{
- int temp;
-
- /* Set the pll addr. to 0x00 */
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
-
- /* Now write the value in all 64 registers */
- for (temp = 0; temp <= 0x3f; temp++)
- pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
-
- return;
-}
-
-/**
- * nforce2_fsb_read - Read FSB
- *
- * Read FSB from chipset
- * If bootfsb != 0, return FSB at boot-time
- */
-static unsigned int nforce2_fsb_read(int bootfsb)
-{
- struct pci_dev *nforce2_sub5;
- u32 fsb, temp = 0;
-
- /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
- nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
- if (!nforce2_sub5)
- return 0;
-
- pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
- fsb /= 1000000;
-
- /* Check if PLL register is already set */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
-
- if (bootfsb || !temp)
- return fsb;
-
- /* Use PLL register FSB value */
- pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
- fsb = nforce2_calc_fsb(temp);
-
- return fsb;
-}
-
-/**
- * nforce2_set_fsb - set new FSB
- * @fsb: New FSB
- *
- * Sets new FSB
- */
-static int nforce2_set_fsb(unsigned int fsb)
-{
- u32 temp = 0;
- unsigned int tfsb;
- int diff;
- int pll = 0;
-
- if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
- printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
- return -EINVAL;
- }
-
- tfsb = nforce2_fsb_read(0);
- if (!tfsb) {
- printk(KERN_ERR PFX "Error while reading the FSB\n");
- return -EINVAL;
- }
-
- /* First write? Then set actual value */
- pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
- if (!temp) {
- pll = nforce2_calc_pll(tfsb);
-
- if (pll < 0)
- return -EINVAL;
-
- nforce2_write_pll(pll);
- }
-
- /* Enable write access */
- temp = 0x01;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
-
- diff = tfsb - fsb;
-
- if (!diff)
- return 0;
-
- while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
- if (diff < 0)
- tfsb++;
- else
- tfsb--;
-
- /* Calculate the PLL reg. value */
- pll = nforce2_calc_pll(tfsb);
- if (pll == -1)
- return -EINVAL;
-
- nforce2_write_pll(pll);
-#ifdef NFORCE2_DELAY
- mdelay(NFORCE2_DELAY);
-#endif
- }
-
- temp = 0x40;
- pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
-
- return 0;
-}
-
-/**
- * nforce2_get - get the CPU frequency
- * @cpu: CPU number
- *
- * Returns the CPU frequency
- */
-static unsigned int nforce2_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return nforce2_fsb_read(0) * fid * 100;
-}
-
-/**
- * nforce2_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int nforce2_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
-/* unsigned long flags; */
- struct cpufreq_freqs freqs;
- unsigned int target_fsb;
-
- if ((target_freq > policy->max) || (target_freq < policy->min))
- return -EINVAL;
-
- target_fsb = target_freq / (fid * 100);
-
- freqs.old = nforce2_get(policy->cpu);
- freqs.new = target_fsb * fid * 100;
- freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
-
- if (freqs.old == freqs.new)
- return 0;
-
- dprintk("Old CPU frequency %d kHz, new %d kHz\n",
- freqs.old, freqs.new);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Disable IRQs */
- /* local_irq_save(flags); */
-
- if (nforce2_set_fsb(target_fsb) < 0)
- printk(KERN_ERR PFX "Changing FSB to %d failed\n",
- target_fsb);
- else
- dprintk("Changed FSB successfully to %d\n",
- target_fsb);
-
- /* Enable IRQs */
- /* local_irq_restore(flags); */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
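Worked example of the fid scaling above, with hypothetical figures: fid = 115 encodes an 11.5x multiplier, so on a 133 MHz FSB the reported clock is 133 * 115 * 100 = 1,529,500 kHz (about 1.53 GHz), and a request for 1,400,000 kHz maps back to target_fsb = 1400000 / (115 * 100) = 121 MHz by integer division.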
-
-/**
- * nforce2_verify - verifies a new CPUFreq policy
- * @policy: new policy
- */
-static int nforce2_verify(struct cpufreq_policy *policy)
-{
- unsigned int fsb_pol_max;
-
- fsb_pol_max = policy->max / (fid * 100);
-
- if (policy->min < (fsb_pol_max * fid * 100))
- policy->max = (fsb_pol_max + 1) * fid * 100;
-
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static int nforce2_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int fsb;
- unsigned int rfid;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Get current FSB */
- fsb = nforce2_fsb_read(0);
-
- if (!fsb)
- return -EIO;
-
- /* FIX: Get FID from CPU */
- if (!fid) {
- if (!cpu_khz) {
- printk(KERN_WARNING PFX
- "cpu_khz not set, can't calculate multiplier!\n");
- return -ENODEV;
- }
-
- fid = cpu_khz / (fsb * 100);
- rfid = fid % 5;
-
- if (rfid) {
- if (rfid > 2)
- fid += 5 - rfid;
- else
- fid -= rfid;
- }
- }
-
- printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
- fid / 10, fid % 10);
-
- /* Set maximum FSB to FSB at boot time */
- max_fsb = nforce2_fsb_read(1);
-
- if (!max_fsb)
- return -EIO;
-
- if (!min_fsb)
- min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
-
- if (min_fsb < NFORCE2_MIN_FSB)
- min_fsb = NFORCE2_MIN_FSB;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = min_fsb * fid * 100;
- policy->cpuinfo.max_freq = max_fsb * fid * 100;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = nforce2_get(policy->cpu);
- policy->min = policy->cpuinfo.min_freq;
- policy->max = policy->cpuinfo.max_freq;
-
- return 0;
-}
-
-static int nforce2_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver nforce2_driver = {
- .name = "nforce2",
- .verify = nforce2_verify,
- .target = nforce2_target,
- .get = nforce2_get,
- .init = nforce2_cpu_init,
- .exit = nforce2_cpu_exit,
- .owner = THIS_MODULE,
-};
-
-/**
- * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
- *
- * Detects nForce2 A2 and C1 stepping
- *
- */
-static int nforce2_detect_chipset(void)
-{
- nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
- PCI_DEVICE_ID_NVIDIA_NFORCE2,
- PCI_ANY_ID, PCI_ANY_ID, NULL);
-
- if (nforce2_dev == NULL)
- return -ENODEV;
-
- printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
- nforce2_dev->revision);
- printk(KERN_INFO PFX
- "FSB changing is maybe unstable and can lead to "
- "crashes and data loss.\n");
-
- return 0;
-}
-
-/**
- * nforce2_init - initializes the nForce2 CPUFreq driver
- *
- * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initialization, and zero on
- * success.
- */
-static int __init nforce2_init(void)
-{
- /* TODO: do we need to detect the processor? */
-
- /* detect chipset */
- if (nforce2_detect_chipset()) {
- printk(KERN_INFO PFX "No nForce2 chipset.\n");
- return -ENODEV;
- }
-
- return cpufreq_register_driver(&nforce2_driver);
-}
-
-/**
- * nforce2_exit - unregisters cpufreq module
- *
- * Unregisters nForce2 FSB change support.
- */
-static void __exit nforce2_exit(void)
-{
- cpufreq_unregister_driver(&nforce2_driver);
-}
-
-module_init(nforce2_init);
-module_exit(nforce2_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Based on documentation provided by Dave Jones. Thanks!
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-
-#define EPS_BRAND_C7M 0
-#define EPS_BRAND_C7 1
-#define EPS_BRAND_EDEN 2
-#define EPS_BRAND_C3 3
-#define EPS_BRAND_C7D 4
-
-struct eps_cpu_data {
- u32 fsb;
- struct cpufreq_frequency_table freq_table[];
-};
-
-static struct eps_cpu_data *eps_cpu[NR_CPUS];
-
-
-static unsigned int eps_get(unsigned int cpu)
-{
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (cpu)
- return 0;
- centaur = eps_cpu[cpu];
- if (centaur == NULL)
- return 0;
-
- /* Return current frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- return centaur->fsb * ((lo >> 8) & 0xff);
-}
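eps_get() depends on the PERF_STATUS encoding used throughout this driver: the low byte of the low word is the VID (voltage in mV = VID * 16 + 700) and bits 8..15 hold the current multiplier. A small user-space sketch of that decode, with a made-up register value and FSB:

#include <stdio.h>

int main(void)
{
	unsigned int lo = 0x0f08;	/* made up: multiplier 15, VID 8 */
	unsigned int mult = (lo >> 8) & 0xff;
	unsigned int mv = (lo & 0xff) * 16 + 700;
	unsigned int fsb = 100000;	/* kHz, as stored in eps_cpu_data */

	printf("freq=%u kHz, voltage=%u mV\n", fsb * mult, mv);
	return 0;
}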
-
-static int eps_set_state(struct eps_cpu_data *centaur,
- unsigned int cpu,
- u32 dest_state)
-{
- struct cpufreq_freqs freqs;
- u32 lo, hi;
- int err = 0;
- int i;
-
- freqs.old = eps_get(cpu);
- freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Wait while CPU is busy */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i = 0;
- while (lo & ((1 << 16) | (1 << 17))) {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- }
- /* Set new multiplier and voltage */
- wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
- /* Wait until transition end */
- i = 0;
- do {
- udelay(16);
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- i++;
- if (unlikely(i > 64)) {
- err = -ENODEV;
- goto postchange;
- }
- } while (lo & ((1 << 16) | (1 << 17)));
-
- /* Return current frequency */
-postchange:
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
-
-#ifdef DEBUG
- {
- u8 current_multiplier, current_voltage;
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n",
- current_multiplier);
- }
-#endif
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- return err;
-}
-
-static int eps_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct eps_cpu_data *centaur;
- unsigned int newstate = 0;
- unsigned int cpu = policy->cpu;
- unsigned int dest_state;
- int ret;
-
- if (unlikely(eps_cpu[cpu] == NULL))
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- &eps_cpu[cpu]->freq_table[0],
- target_freq,
- relation,
- &newstate))) {
- return -EINVAL;
- }
-
- /* Make frequency transition */
- dest_state = centaur->freq_table[newstate].index & 0xffff;
- ret = eps_set_state(centaur, cpu, dest_state);
- if (ret)
- printk(KERN_ERR "eps: Timeout!\n");
- return ret;
-}
-
-static int eps_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- &eps_cpu[policy->cpu]->freq_table[0]);
-}
-
-static int eps_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i;
- u32 lo, hi;
- u64 val;
- u8 current_multiplier, current_voltage;
- u8 max_multiplier, max_voltage;
- u8 min_multiplier, min_voltage;
- u8 brand = 0;
- u32 fsb;
- struct eps_cpu_data *centaur;
- struct cpuinfo_x86 *c = &cpu_data(0);
- struct cpufreq_frequency_table *f_table;
- int k, step, voltage;
- int ret;
- int states;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* Check brand */
- printk(KERN_INFO "eps: Detected VIA ");
-
- switch (c->x86_model) {
- case 10:
- rdmsr(0x1153, lo, hi);
- brand = (((lo >> 2) ^ lo) >> 18) & 3;
- printk(KERN_CONT "Model A ");
- break;
- case 13:
- rdmsr(0x1154, lo, hi);
- brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
- printk(KERN_CONT "Model D ");
- break;
- }
-
- switch (brand) {
- case EPS_BRAND_C7M:
- printk(KERN_CONT "C7-M\n");
- break;
- case EPS_BRAND_C7:
- printk(KERN_CONT "C7\n");
- break;
- case EPS_BRAND_EDEN:
- printk(KERN_CONT "Eden\n");
- break;
- case EPS_BRAND_C7D:
- printk(KERN_CONT "C7-D\n");
- break;
- case EPS_BRAND_C3:
- printk(KERN_CONT "C3\n");
- return -ENODEV;
- }
- /* Enable Enhanced PowerSaver */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- wrmsrl(MSR_IA32_MISC_ENABLE, val);
- /* Can be locked at 0 */
- rdmsrl(MSR_IA32_MISC_ENABLE, val);
- if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
- return -ENODEV;
- }
- }
-
- /* Print voltage and multiplier */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- current_voltage = lo & 0xff;
- printk(KERN_INFO "eps: Current voltage = %dmV\n",
- current_voltage * 16 + 700);
- current_multiplier = (lo >> 8) & 0xff;
- printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
-
- /* Print limits */
- max_voltage = hi & 0xff;
- printk(KERN_INFO "eps: Highest voltage = %dmV\n",
- max_voltage * 16 + 700);
- max_multiplier = (hi >> 8) & 0xff;
- printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
- min_voltage = (hi >> 16) & 0xff;
- printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
- min_voltage * 16 + 700);
- min_multiplier = (hi >> 24) & 0xff;
- printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
-
- /* Sanity checks */
- if (current_multiplier == 0 || max_multiplier == 0
- || min_multiplier == 0)
- return -EINVAL;
- if (current_multiplier > max_multiplier
- || max_multiplier <= min_multiplier)
- return -EINVAL;
- if (current_voltage > 0x1f || max_voltage > 0x1f)
- return -EINVAL;
- if (max_voltage < min_voltage)
- return -EINVAL;
-
- /* Calc FSB speed */
- fsb = cpu_khz / current_multiplier;
- /* Calc number of p-states supported */
- if (brand == EPS_BRAND_C7M)
- states = max_multiplier - min_multiplier + 1;
- else
- states = 2;
-
- /* Allocate private data and frequency table for current cpu */
- centaur = kzalloc(sizeof(struct eps_cpu_data)
- + (states + 1) * sizeof(struct cpufreq_frequency_table),
- GFP_KERNEL);
- if (!centaur)
- return -ENOMEM;
- eps_cpu[0] = centaur;
-
- /* Copy basic values */
- centaur->fsb = fsb;
-
- /* Fill frequency and MSR value table */
- f_table = &centaur->freq_table[0];
- if (brand != EPS_BRAND_C7M) {
- f_table[0].frequency = fsb * min_multiplier;
- f_table[0].index = (min_multiplier << 8) | min_voltage;
- f_table[1].frequency = fsb * max_multiplier;
- f_table[1].index = (max_multiplier << 8) | max_voltage;
- f_table[2].frequency = CPUFREQ_TABLE_END;
- } else {
- k = 0;
- step = ((max_voltage - min_voltage) * 256)
- / (max_multiplier - min_multiplier);
- for (i = min_multiplier; i <= max_multiplier; i++) {
- voltage = (k * step) / 256 + min_voltage;
- f_table[k].frequency = fsb * i;
- f_table[k].index = (i << 8) | voltage;
- k++;
- }
- f_table[k].frequency = CPUFREQ_TABLE_END;
- }
-
- policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV transition, in ns */
- policy->cur = fsb * current_multiplier;
-
- ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
- if (ret) {
- kfree(centaur);
- return ret;
- }
-
- cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
- return 0;
-}
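On the C7-M branch above, the VID for each intermediate multiplier is interpolated linearly in fixed point: step = ((max_voltage - min_voltage) * 256) / (max_multiplier - min_multiplier). A self-contained sketch with invented limits shows the resulting ramp, including the integer truncation at the top end that the driver shares:

#include <stdio.h>

int main(void)
{
	int min_mult = 6, max_mult = 16, min_v = 4, max_v = 20;	/* invented */
	int step = ((max_v - min_v) * 256) / (max_mult - min_mult);
	int i, k = 0;

	for (i = min_mult; i <= max_mult; i++, k++)
		printf("mult %2d -> vid %d\n", i, (k * step) / 256 + min_v);
	return 0;
}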
-
-static int eps_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- struct eps_cpu_data *centaur;
- u32 lo, hi;
-
- if (eps_cpu[cpu] == NULL)
- return -ENODEV;
- centaur = eps_cpu[cpu];
-
- /* Get max frequency */
- rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
- /* Set max frequency */
- eps_set_state(centaur, cpu, hi & 0xffff);
- /* Bye */
- cpufreq_frequency_table_put_attr(policy->cpu);
- kfree(eps_cpu[cpu]);
- eps_cpu[cpu] = NULL;
- return 0;
-}
-
-static struct freq_attr *eps_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver eps_driver = {
- .verify = eps_verify,
- .target = eps_target,
- .init = eps_cpu_init,
- .exit = eps_cpu_exit,
- .get = eps_get,
- .name = "e_powersaver",
- .owner = THIS_MODULE,
- .attr = eps_attr,
-};
-
-static int __init eps_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* This driver will work only on Centaur C7 processors with
- * Enhanced SpeedStep/PowerSaver registers */
- if (c->x86_vendor != X86_VENDOR_CENTAUR
- || c->x86 != 6 || c->x86_model < 10)
- return -ENODEV;
- if (!cpu_has(c, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpufreq_register_driver(&eps_driver))
- return -EINVAL;
- return 0;
-}
-
-static void __exit eps_exit(void)
-{
- cpufreq_unregister_driver(&eps_driver);
-}
-
-MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
-MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
-MODULE_LICENSE("GPL");
-
-module_init(eps_init);
-module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a75..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * elanfreq: cpufreq driver for the AMD ELAN family
- *
- * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
- *
- * Parts of this code are (c) Sven Geggus <sven@geggus.net>
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
-#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
-
-/* Module parameter */
-static int max_freq;
-
-struct s_elan_multiplier {
- int clock; /* frequency in kHz */
- int val40h; /* PMU Force Mode register */
- int val80h; /* CPU Clock Speed Register */
-};
-
-/*
- * It is important that the frequencies
- * are listed in ascending order here!
- */
-static struct s_elan_multiplier elan_multiplier[] = {
- {1000, 0x02, 0x18},
- {2000, 0x02, 0x10},
- {4000, 0x02, 0x08},
- {8000, 0x00, 0x00},
- {16000, 0x00, 0x02},
- {33000, 0x00, 0x04},
- {66000, 0x01, 0x04},
- {99000, 0x01, 0x05}
-};
-
-static struct cpufreq_frequency_table elanfreq_table[] = {
- {0, 1000},
- {1, 2000},
- {2, 4000},
- {3, 8000},
- {4, 16000},
- {5, 33000},
- {6, 66000},
- {7, 99000},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-/**
- * elanfreq_get_cpu_frequency: determine current cpu speed
- *
- * Finds out at which frequency the CPU of the Elan SOC runs
- * at the moment. Frequencies from 1 to 33 MHz are generated
- * the normal way; 66 and 99 MHz are called "Hyperspeed Mode"
- * and have the rest of the chip running at 33 MHz.
- */
-
-static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg; /* Clock Speed Register */
-
- local_irq_disable();
- outb_p(0x80, REG_CSCIR);
- clockspeed_reg = inb_p(REG_CSCDR);
- local_irq_enable();
-
- if ((clockspeed_reg & 0xE0) == 0xE0)
- return 0;
-
- /* Are we in CPU clock multiplied mode (66/99 MHz)? */
- if ((clockspeed_reg & 0xE0) == 0xC0) {
- if ((clockspeed_reg & 0x01) == 0)
- return 66000;
- else
- return 99000;
- }
-
- /* 33 MHz is not 32 MHz... */
- if ((clockspeed_reg & 0xE0) == 0xA0)
- return 33000;
-
- return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
-}
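The decode above keys entirely off bits 7..5 of the Clock Speed Register: 0xE0 means invalid, 0xC0 selects Hyperspeed Mode with bit 0 choosing 66 vs 99 MHz, 0xA0 is the 33 MHz special case, and everything else is a power of two in MHz. A user-space replica with invented sample values:

#include <stdio.h>

static unsigned int decode(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;				/* invalid */
	if ((reg & 0xE0) == 0xC0)			/* hyperspeed */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)
		return 33000;				/* 33, not 32 */
	return (1 << ((reg & 0xE0) >> 5)) * 1000;	/* 1..16 MHz */
}

int main(void)
{
	unsigned char samples[] = { 0x00, 0x80, 0xA0, 0xC0, 0xC1 };
	int i;

	for (i = 0; i < 5; i++)
		printf("reg=0x%02x -> %u kHz\n", samples[i], decode(samples[i]));
	return 0;
}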
-
-
-/**
- * elanfreq_set_cpu_frequency: Change the CPU core frequency
- * @cpu: cpu number
- * @freq: frequency in kHz
- *
- * This function takes a frequency value and changes the CPU frequency
- * according to this. Note that the frequency has to be checked by
- * elanfreq_verify() for correctness!
- *
- * There is no return value.
- */
-
-static void elanfreq_set_cpu_state(unsigned int state)
-{
- struct cpufreq_freqs freqs;
-
- freqs.old = elanfreq_get_cpu_frequency(0);
- freqs.new = elan_multiplier[state].clock;
- freqs.cpu = 0; /* elanfreq is a UP-only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
- elan_multiplier[state].clock);
-
-
- /*
- * Access to the Elan's internal registers is indexed via
- * 0x22: Chip Setup & Control Register Index Register (CSCI)
- * 0x23: Chip Setup & Control Register Data Register (CSCD)
- *
- */
-
- /*
- * 0x40 is the Power Management Unit's Force Mode Register.
- * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
- */
-
- local_irq_disable();
- outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
- outb_p(0x00, REG_CSCDR);
- local_irq_enable(); /* wait till internal pipelines and */
- udelay(1000); /* buffers have cleaned up */
-
- local_irq_disable();
-
- /* now, set the CPU clock speed register (0x80) */
- outb_p(0x80, REG_CSCIR);
- outb_p(elan_multiplier[state].val80h, REG_CSCDR);
-
- /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
- outb_p(0x40, REG_CSCIR);
- outb_p(elan_multiplier[state].val40h, REG_CSCDR);
- udelay(10000);
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-
-/**
- * elanfreq_verify: test if frequency range is valid
- * @policy: the policy to validate
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int elanfreq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
-}
-
-static int elanfreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- elanfreq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int elanfreq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int i;
- int result;
-
- /* capability check */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10))
- return -ENODEV;
-
- /* max freq */
- if (!max_freq)
- max_freq = elanfreq_get_cpu_frequency(0);
-
- /* table init */
- for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if (elanfreq_table[i].frequency > max_freq)
- elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = elanfreq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
- return 0;
-}
-
-
-static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-#ifndef MODULE
-/**
- * elanfreq_setup - elanfreq command line parameter parsing
- *
- * elanfreq command line parameter. Use:
- * elanfreq=66000
- * to set the maximum CPU frequency to 66 MHz. Note that in
- * case you do not give this boot parameter, the maximum
- * frequency will fall back to _current_ CPU frequency which
- * might be lower. If you build this as a module, use the
- * max_freq module parameter instead.
- */
-static int __init elanfreq_setup(char *str)
-{
- max_freq = simple_strtoul(str, &str, 0);
- printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
- return 1;
-}
-__setup("elanfreq=", elanfreq_setup);
-#endif
-
-
-static struct freq_attr *elanfreq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver elanfreq_driver = {
- .get = elanfreq_get_cpu_frequency,
- .verify = elanfreq_verify,
- .target = elanfreq_target,
- .init = elanfreq_cpu_init,
- .exit = elanfreq_cpu_exit,
- .name = "elanfreq",
- .owner = THIS_MODULE,
- .attr = elanfreq_attr,
-};
-
-
-static int __init elanfreq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- /* Test if we have the right hardware */
- if ((c->x86_vendor != X86_VENDOR_AMD) ||
- (c->x86 != 4) || (c->x86_model != 10)) {
- printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
- return -ENODEV;
- }
- return cpufreq_register_driver(&elanfreq_driver);
-}
-
-
-static void __exit elanfreq_exit(void)
-{
- cpufreq_unregister_driver(&elanfreq_driver);
-}
-
-
-module_param(max_freq, int, 0444);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
- "Sven Geggus <sven@geggus.net>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
-
-module_init(elanfreq_init);
-module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 16e3483be9e3..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * Cyrix MediaGX and NatSemi Geode Suspend Modulation
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Hiroshi Miura <miura@da-cha.org>
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Theoretical note:
- *
- * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
- *
- * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
- * are based on Suspend Modulation.
- *
- * Suspend Modulation works by asserting and de-asserting the SUSP# pin
- * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP#
- * the CPU enters an idle state. GX1 stops its core clock when SUSP# is
- * asserted then power consumption is reduced.
- *
- * Suspend Modulation's OFF/ON duration are configurable
- * with 'Suspend Modulation OFF Count Register'
- * and 'Suspend Modulation ON Count Register'.
- * These registers are 8bit counters that represent the number of
- * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
- * to the processor.
- *
- * These counters define a ratio which is the effective frequency
- * of operation of the system.
- *
- * F_eff = Fgx * OFF Count / (OFF Count + ON Count)
- *
- * 0 <= On Count, Off Count <= 255
- *
- * From these limits, we can get register values
- *
- * off_duration + on_duration <= MAX_DURATION
- * on_duration = off_duration * (stock_freq - freq) / freq
- *
- * off_duration = (freq * DURATION) / stock_freq
- * on_duration = DURATION - off_duration
- *
- *
- *---------------------------------------------------------------------------
- *
- * ChangeLog:
- * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
- * - fix on/off register mistake
- * - fix cpu_khz calc when it stops cpu modulation.
- *
- * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
- * - rewrite for Cyrix MediaGX Cx5510/5520 and
- * NatSemi Geode Cs5530(A).
- *
- * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * - cs5530_mod patch for 2.4.19-rc1.
- *
- *---------------------------------------------------------------------------
- *
- * Todo
- * Test on machines with 5510, 5530, 5530A
- */
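A standalone sketch of the duty-cycle arithmetic described in the note above, assuming an illustrative 233 MHz stock clock, a 100 MHz request, and the default maximum duration of 255 slots of 32 us each:

#include <stdio.h>

int main(void)
{
	int stock_khz = 233000, target_khz = 100000, duration = 255;
	int off, on, eff;

	off = (target_khz * duration) / stock_khz;	/* SUSP# de-asserted slots */
	on = duration - off;				/* SUSP# asserted slots */
	eff = (stock_khz * off) / (off + on);		/* effective frequency */

	printf("off=%d on=%d -> F_eff=%d kHz\n", off, on, eff);
	return 0;
}

With these numbers off = 109, on = 146, and F_eff comes out near 99,600 kHz, slightly under the request because of integer truncation.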
-
-/************************************************************************
- * Suspend Modulation - Definitions *
- ************************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-
-#include <asm/processor-cyrix.h>
-
-/* PCI config registers, all at F0 */
-#define PCI_PMER1 0x80 /* power management enable register 1 */
-#define PCI_PMER2 0x81 /* power management enable register 2 */
-#define PCI_PMER3 0x82 /* power management enable register 3 */
-#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
-#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
-#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
-#define PCI_MODON 0x95 /* suspend modulation ON counter register */
-#define PCI_SUSCFG 0x96 /* suspend configuration register */
-
-/* PMER1 bits */
-#define GPM (1<<0) /* global power management */
-#define GIT (1<<1) /* globally enable PM device idle timers */
-#define GTR (1<<2) /* globally enable IO traps */
-#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
-#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
-
-/* SUSCFG bits */
-#define SUSMOD (1<<0) /* enable/disable suspend modulation */
-/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
-#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
- /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
-#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
-/* the below is supported only with cs5530A */
-#define PWRSVE_ISA (1<<3) /* stop ISA clock */
-#define PWRSVE (1<<4) /* active idle */
-
-struct gxfreq_params {
- u8 on_duration;
- u8 off_duration;
- u8 pci_suscfg;
- u8 pci_pmer1;
- u8 pci_pmer2;
- struct pci_dev *cs55x0;
-};
-
-static struct gxfreq_params *gx_params;
-static int stock_freq;
-
-/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk;
-module_param(pci_busclk, int, 0444);
-
-/* maximum duration for which the cpu may be suspended
- * (32us * max_duration). If no parameter is given, this defaults
- * to 255.
- * Note that this leads to a maximum of 8 ms(!) where the CPU clock
- * is suspended -- processing power is just 0.39% of what it used to be,
- * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
-static int max_duration = 255;
-module_param(max_duration, int, 0444);
-
-/* For the default policy, we want at least some processing power
- * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
- */
-#define POLICY_MIN_DIV 20
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "gx-suspmod", msg)
-
-/**
- * We can detect the core multiplier from dir0_lsb
- * from GX1 datasheet p.56,
- * MULT[3:0]:
- * 0000 = SYSCLK multiplied by 4 (test only)
- * 0001 = SYSCLK multiplied by 10
- * 0010 = SYSCLK multiplied by 4
- * 0011 = SYSCLK multiplied by 6
- * 0100 = SYSCLK multiplied by 9
- * 0101 = SYSCLK multiplied by 5
- * 0110 = SYSCLK multiplied by 7
- * 0111 = SYSCLK multiplied by 8
- * where SYSCLK = 33.3 MHz
- **/
-static int gx_freq_mult[16] = {
- 4, 10, 4, 6, 9, 5, 7, 8,
- 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-
-/****************************************************************
- * Low Level chipset interface *
- ****************************************************************/
-static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
- PCI_ANY_ID, PCI_ANY_ID },
- { 0, },
-};
-
-static void gx_write_byte(int reg, int value)
-{
- pci_write_config_byte(gx_params->cs55x0, reg, value);
-}
-
-/**
- * gx_detect_chipset:
- *
- **/
-static __init struct pci_dev *gx_detect_chipset(void)
-{
- struct pci_dev *gx_pci = NULL;
-
- /* check if CPU is a MediaGX or a Geode. */
- if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
- (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
- dprintk("error: no MediaGX/Geode processor found!\n");
- return NULL;
- }
-
- /* detect which companion chip is used */
- while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
- if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
- return gx_pci;
- }
-
- dprintk("error: no supported chipset found!\n");
- return NULL;
-}
-
-/**
- * gx_get_cpuspeed:
- *
- * Finds out at which effective frequency the Cyrix MediaGX/NatSemi
- * Geode CPU runs.
- */
-static unsigned int gx_get_cpuspeed(unsigned int cpu)
-{
- if ((gx_params->pci_suscfg & SUSMOD) == 0)
- return stock_freq;
-
- return (stock_freq * gx_params->off_duration)
- / (gx_params->on_duration + gx_params->off_duration);
-}
-
-/**
- * gx_validate_speed:
- * find the closest supported speed and the matching on/off durations
- *
- **/
-
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
- u8 *off_duration)
-{
- unsigned int i;
- u8 tmp_on, tmp_off;
- int old_tmp_freq = stock_freq;
- int tmp_freq;
-
- *off_duration = 1;
- *on_duration = 0;
-
- for (i = max_duration; i > 0; i--) {
- tmp_off = ((khz * i) / stock_freq) & 0xff;
- tmp_on = i - tmp_off;
- tmp_freq = (stock_freq * tmp_off) / i;
- /* if this relation is closer to khz, use this. If it's equal,
- * prefer it, too - lower latency */
- if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
- *on_duration = tmp_on;
- *off_duration = tmp_off;
- old_tmp_freq = tmp_freq;
- }
- }
-
- return old_tmp_freq;
-}
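Worked example for the search above, with illustrative numbers: for stock_freq = 233,000 kHz, a request of 116,500 kHz and max_duration = 255, every even i yields tmp_off = i/2 and tmp_freq = 116,500 exactly; since ties prefer later (smaller i) candidates, the loop ends with on = off = 1, the shortest modulation period and hence the lowest-latency choice the comment refers to.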
-
-
-/**
- * gx_set_cpuspeed:
- * set cpu speed in khz.
- **/
-
-static void gx_set_cpuspeed(unsigned int khz)
-{
- u8 suscfg, pmer1;
- unsigned int new_khz;
- unsigned long flags;
- struct cpufreq_freqs freqs;
-
- freqs.cpu = 0;
- freqs.old = gx_get_cpuspeed(0);
-
- new_khz = gx_validate_speed(khz, &gx_params->on_duration,
- &gx_params->off_duration);
-
- freqs.new = new_khz;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- local_irq_save(flags);
-
-
-
- if (new_khz != stock_freq) {
- /* the new_khz == stock_freq (100% speed) case is handled below */
- switch (gx_params->cs55x0->device) {
- case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
- pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
- /* FIXME: need to test other values -- Zwane,Miura */
- /* typical 2 to 4ms */
- gx_write_byte(PCI_IRQTC, 4);
- /* typical 50 to 100ms */
- gx_write_byte(PCI_VIDTC, 100);
- gx_write_byte(PCI_PMER1, pmer1);
-
- if (gx_params->cs55x0->revision < 0x10) {
- /* CS5530(rev 1.2, 1.3) */
- suscfg = gx_params->pci_suscfg|SUSMOD;
- } else {
- /* CS5530A,B.. */
- suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
- }
- break;
- case PCI_DEVICE_ID_CYRIX_5520:
- case PCI_DEVICE_ID_CYRIX_5510:
- suscfg = gx_params->pci_suscfg | SUSMOD;
- break;
- default:
- local_irq_restore(flags);
- dprintk("fatal: try to set unknown chipset.\n");
- return;
- }
- } else {
- suscfg = gx_params->pci_suscfg & ~(SUSMOD);
- gx_params->off_duration = 0;
- gx_params->on_duration = 0;
- dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
- }
-
- gx_write_byte(PCI_MODOFF, gx_params->off_duration);
- gx_write_byte(PCI_MODON, gx_params->on_duration);
-
- gx_write_byte(PCI_SUSCFG, suscfg);
- pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
-
- local_irq_restore(flags);
-
- gx_params->pci_suscfg = suscfg;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
- gx_params->on_duration * 32, gx_params->off_duration * 32);
- dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
-}
-
-/****************************************************************
- * High level functions *
- ****************************************************************/
-
-/*
- * cpufreq_gx_verify: test if frequency range is valid
- *
- * This function checks if a given frequency range in kHz is valid
- * for the hardware supported by the driver.
- */
-
-static int cpufreq_gx_verify(struct cpufreq_policy *policy)
-{
- unsigned int tmp_freq = 0;
- u8 tmp1, tmp2;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- /* It needs to be ensured that at least one supported frequency is
- * within policy->min and policy->max. If it is not, policy->max
- * needs to be increased until one frequency is supported.
- * policy->min may not be decreased, though. This way we guarantee a
- * specific processing capacity.
- */
- tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
- if (tmp_freq < policy->min)
- tmp_freq += stock_freq / max_duration;
- policy->min = tmp_freq;
- if (policy->min > policy->max)
- policy->max = tmp_freq;
- tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
- if (tmp_freq > policy->max)
- tmp_freq -= stock_freq / max_duration;
- policy->max = tmp_freq;
- if (policy->max < policy->min)
- policy->max = policy->min;
- cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
- stock_freq);
-
- return 0;
-}
-
-/*
- * cpufreq_gx_target:
- *
- */
-static int cpufreq_gx_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- u8 tmp1, tmp2;
- unsigned int tmp_freq;
-
- if (!stock_freq || !policy)
- return -EINVAL;
-
- policy->cpu = 0;
-
- tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
- while (tmp_freq < policy->min) {
- tmp_freq += stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
- while (tmp_freq > policy->max) {
- tmp_freq -= stock_freq / max_duration;
- tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
- }
-
- gx_set_cpuspeed(tmp_freq);
-
- return 0;
-}
-
-static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int maxfreq, curfreq;
-
- if (!policy || policy->cpu != 0)
- return -ENODEV;
-
- /* determine maximum frequency */
- if (pci_busclk)
- maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
- else if (cpu_khz)
- maxfreq = cpu_khz;
- else
- maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-
- stock_freq = maxfreq;
- curfreq = gx_get_cpuspeed(0);
-
- dprintk("cpu max frequency is %d.\n", maxfreq);
- dprintk("cpu current frequency is %dkHz.\n", curfreq);
-
- /* setup basic struct for cpufreq API */
- policy->cpu = 0;
-
- if (max_duration < POLICY_MIN_DIV)
- policy->min = maxfreq / max_duration;
- else
- policy->min = maxfreq / POLICY_MIN_DIV;
- policy->max = maxfreq;
- policy->cur = curfreq;
- policy->cpuinfo.min_freq = maxfreq / max_duration;
- policy->cpuinfo.max_freq = maxfreq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-
- return 0;
-}
-
-/*
- * cpufreq_gx_init:
- * MediaGX/Geode GX initialize cpufreq driver
- */
-static struct cpufreq_driver gx_suspmod_driver = {
- .get = gx_get_cpuspeed,
- .verify = cpufreq_gx_verify,
- .target = cpufreq_gx_target,
- .init = cpufreq_gx_cpu_init,
- .name = "gx-suspmod",
- .owner = THIS_MODULE,
-};
-
-static int __init cpufreq_gx_init(void)
-{
- int ret;
- struct gxfreq_params *params;
- struct pci_dev *gx_pci;
-
- /* Test if we have the right hardware */
- gx_pci = gx_detect_chipset();
- if (gx_pci == NULL)
- return -ENODEV;
-
- /* check whether module parameters are sane */
- if (max_duration > 0xff)
- max_duration = 0xff;
-
- dprintk("geode suspend modulation available.\n");
-
- params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
- if (params == NULL)
- return -ENOMEM;
-
- params->cs55x0 = gx_pci;
- gx_params = params;
-
- /* keep cs55x0 configurations */
- pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
- pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
- pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
- pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
- pci_read_config_byte(params->cs55x0, PCI_MODOFF,
- &(params->off_duration));
-
- ret = cpufreq_register_driver(&gx_suspmod_driver);
- if (ret) {
- kfree(params);
- return ret; /* register error! */
- }
-
- return 0;
-}
-
-static void __exit cpufreq_gx_exit(void)
-{
- cpufreq_unregister_driver(&gx_suspmod_driver);
- pci_dev_put(gx_params->cs55x0);
- kfree(gx_params);
-}
-
-MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE("GPL");
-
-module_init(cpufreq_gx_init);
-module_exit(cpufreq_gx_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index 7e7eea4f8261..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- * (C) 2001-2004 Dave Jones. <davej@redhat.com>
- * (C) 2002 Padraig Brady. <padraig@antefacto.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by VIA.
- *
- * VIA have currently 3 different versions of Longhaul.
- * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
- * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
- * Version 2 of longhaul is backward compatible with v1, but adds
- * LONGHAUL MSR for purpose of both frequency and voltage scaling.
- * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
- * Version 3 of longhaul got renamed to Powersaver and redesigned
- * to use only the POWERSAVER MSR at 0x110a.
- * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
- * Feature-wise it is much the same as longhaul v2, though there is
- * provision for scaling the FSB too; that doesn't work well in
- * practice, so we don't even try to use it.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/acpi.h>
-
-#include <asm/msr.h>
-#include <acpi/processor.h>
-
-#include "longhaul.h"
-
-#define PFX "longhaul: "
-
-#define TYPE_LONGHAUL_V1 1
-#define TYPE_LONGHAUL_V2 2
-#define TYPE_POWERSAVER 3
-
-#define CPU_SAMUEL 1
-#define CPU_SAMUEL2 2
-#define CPU_EZRA 3
-#define CPU_EZRA_T 4
-#define CPU_NEHEMIAH 5
-#define CPU_NEHEMIAH_C 6
-
-/* Flags */
-#define USE_ACPI_C3 (1 << 1)
-#define USE_NORTHBRIDGE (1 << 2)
-
-static int cpu_model;
-static unsigned int numscales = 16;
-static unsigned int fsb;
-
-static const struct mV_pos *vrm_mV_table;
-static const unsigned char *mV_vrm_table;
-
-static unsigned int highest_speed, lowest_speed; /* kHz */
-static unsigned int minmult, maxmult;
-static int can_scale_voltage;
-static struct acpi_processor *pr;
-static struct acpi_processor_cx *cx;
-static u32 acpi_regs_addr;
-static u8 longhaul_flags;
-static unsigned int longhaul_index;
-
-/* Module parameters */
-static int scale_voltage;
-static int disable_acpi_c3;
-static int revid_errata;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longhaul", msg)
-
-
-/* Clock ratios multiplied by 10 */
-static int mults[32];
-static int eblcr[32];
-static int longhaul_version;
-static struct cpufreq_frequency_table *longhaul_table;
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-static char speedbuffer[8];
-
-static char *print_speed(int speed)
-{
- if (speed < 1000) {
- snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
- return speedbuffer;
- }
-
- if (speed%1000 == 0)
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%dGHz", speed/1000);
- else
- snprintf(speedbuffer, sizeof(speedbuffer),
- "%d.%dGHz", speed/1000, (speed%1000)/100);
-
- return speedbuffer;
-}
-#endif
-
-
-static unsigned int calc_speed(int mult)
-{
- int khz;
- khz = (mult/10)*fsb;
- if (mult%10)
- khz += fsb/2;
- khz *= 1000;
- return khz;
-}
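Worked example: with fsb = 133 and mult = 75 (ratios are stored times ten, so this is 7.5x), calc_speed() computes (75/10) * 133 = 931, adds fsb/2 = 66 for the half step, and scales by 1000, returning 997,000 kHz — 7.5 x 133 MHz in integer arithmetic.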
-
-
-static int longhaul_get_cpu_mult(void)
-{
- unsigned long invalue = 0, lo, hi;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
- invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
- if (longhaul_version == TYPE_LONGHAUL_V2 ||
- longhaul_version == TYPE_POWERSAVER) {
- if (lo & (1<<27))
- invalue += 16;
- }
- return eblcr[invalue];
-}
-
-/* For processor with BCR2 MSR */
-
-static void do_longhaul1(unsigned int mults_index)
-{
- union msr_bcr2 bcr2;
-
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Enable software clock multiplier */
- bcr2.bits.ESOFTBF = 1;
- bcr2.bits.CLOCKMUL = mults_index & 0xff;
-
- /* Sync to timer tick */
- safe_halt();
- /* Change frequency on next halt or sleep */
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
- /* Invoke transition */
- ACPI_FLUSH_CPU_CACHE();
- halt();
-
- /* Disable software clock multiplier */
- local_irq_disable();
- rdmsrl(MSR_VIA_BCR2, bcr2.val);
- bcr2.bits.ESOFTBF = 0;
- wrmsrl(MSR_VIA_BCR2, bcr2.val);
-}
-
-/* For processor with Longhaul MSR */
-
-static void do_powersaver(int cx_address, unsigned int mults_index,
- unsigned int dir)
-{
- union msr_longhaul longhaul;
- u32 t;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Setup new frequency */
- if (!revid_errata)
- longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
- else
- longhaul.bits.RevisionKey = 0;
- longhaul.bits.SoftBusRatio = mults_index & 0xf;
- longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
- /* Setup new voltage */
- if (can_scale_voltage)
- longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
- /* Sync to timer tick */
- safe_halt();
- /* Raise voltage if necessary */
- if (can_scale_voltage && dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-
- /* Change frequency on next halt or sleep */
- longhaul.bits.EnableSoftBusRatio = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3 read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- /* Disable bus ratio bit */
- longhaul.bits.EnableSoftBusRatio = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-
- /* Reduce voltage if necessary */
- if (can_scale_voltage && !dir) {
- longhaul.bits.EnableSoftVID = 1;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- /* Change voltage */
- if (!cx_address) {
- ACPI_FLUSH_CPU_CACHE();
- halt();
- } else {
- ACPI_FLUSH_CPU_CACHE();
- /* Invoke C3 */
- inb(cx_address);
- /* Dummy op - must do something useless after P_LVL3
- * read */
- t = inl(acpi_gbl_FADT.xpm_timer_block.address);
- }
- longhaul.bits.EnableSoftVID = 0;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- }
-}
-
-/**
- * longhaul_set_cpu_frequency()
- * @mults_index : bitpattern of the new multiplier.
- *
- * Sets a new clock ratio.
- */
-
-static void longhaul_setstate(unsigned int table_index)
-{
- unsigned int mults_index;
- int speed, mult;
- struct cpufreq_freqs freqs;
- unsigned long flags;
- unsigned int pic1_mask, pic2_mask;
- u16 bm_status = 0;
- u32 bm_timeout = 1000;
- unsigned int dir = 0;
-
- mults_index = longhaul_table[table_index].index;
- /* Safety precautions */
- mult = mults[mults_index & 0x1f];
- if (mult == -1)
- return;
- speed = calc_speed(mult);
- if ((speed > highest_speed) || (speed < lowest_speed))
- return;
- /* Voltage transition before frequency transition? */
- if (can_scale_voltage && longhaul_index < table_index)
- dir = 1;
-
- freqs.old = calc_speed(longhaul_get_cpu_mult());
- freqs.new = speed;
- freqs.cpu = 0; /* longhaul is a UP-only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
- fsb, mult/10, mult%10, print_speed(speed/1000));
-retry_loop:
- preempt_disable();
- local_irq_save(flags);
-
- pic2_mask = inb(0xA1);
- pic1_mask = inb(0x21); /* works on C3. save mask. */
- outb(0xFF, 0xA1); /* Overkill */
- outb(0xFE, 0x21); /* TMR0 only */
-
- /* Wait while PCI bus is busy. */
- if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
- || ((pr != NULL) && pr->flags.bm_control))) {
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- while (bm_status && bm_timeout) {
- outw(1 << 4, acpi_regs_addr);
- bm_timeout--;
- bm_status = inw(acpi_regs_addr);
- bm_status &= 1 << 4;
- }
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Disable AGP and PCI arbiters */
- outb(3, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Disable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
- }
- switch (longhaul_version) {
-
- /*
- * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
- * Software controlled multipliers only.
- */
- case TYPE_LONGHAUL_V1:
- do_longhaul1(mults_index);
- break;
-
- /*
- * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
- *
- * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
- * Nehemiah can do FSB scaling too, but this has never been proven
- * to work in practice.
- */
- case TYPE_LONGHAUL_V2:
- case TYPE_POWERSAVER:
- if (longhaul_flags & USE_ACPI_C3) {
- /* Don't allow wakeup */
- acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
- do_powersaver(cx->address, mults_index, dir);
- } else {
- do_powersaver(0, mults_index, dir);
- }
- break;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE) {
- /* Enable arbiters */
- outb(0, 0x22);
- } else if ((pr != NULL) && pr->flags.bm_control) {
- /* Enable bus master arbitration */
- acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
- }
- outb(pic2_mask, 0xA1); /* restore mask */
- outb(pic1_mask, 0x21);
-
- local_irq_restore(flags);
- preempt_enable();
-
- freqs.new = calc_speed(longhaul_get_cpu_mult());
- /* Check if requested frequency is set. */
- if (unlikely(freqs.new != speed)) {
- printk(KERN_INFO PFX "Failed to set requested frequency!\n");
- /* Revision ID = 1 but processor is expecting revision key
- * equal to 0. Jumpers at the bottom of processor will change
- * multiplier and FSB, but will not change bits in Longhaul
- * MSR nor enable voltage scaling. */
- if (!revid_errata) {
- printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
- "option.\n");
- revid_errata = 1;
- msleep(200);
- goto retry_loop;
- }
- /* Why ACPI C3 sometimes doesn't work is a mystery to me.
- * But it does happen. Processor is entering ACPI C3 state,
- * but it doesn't change frequency. I tried poking various
- * bits in northbridge registers, but without success. */
- if (longhaul_flags & USE_ACPI_C3) {
- printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
- longhaul_flags &= ~USE_ACPI_C3;
- if (revid_errata) {
- printk(KERN_INFO PFX "Disabling \"Ignore "
- "Revision ID\" option.\n");
- revid_errata = 0;
- }
- msleep(200);
- goto retry_loop;
- }
- /* This shouldn't happen. Longhaul ver. 2 was reported not
- * working on processors without voltage scaling, but with
- * RevID = 1. RevID errata will make things right. Just
- * to be 100% sure. */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
- longhaul_version = TYPE_LONGHAUL_V1;
- msleep(200);
- goto retry_loop;
- }
- }
- /* Report true CPU frequency */
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- if (!bm_timeout)
- printk(KERN_INFO PFX "Warning: Timeout while waiting for "
- "idle PCI bus.\n");
-}
-
-/*
- * Centaur decided to make life a little more tricky.
- * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
- * Samuel2 and above have to try and guess what the FSB is.
- * We do this by assuming we booted at maximum multiplier, and interpolate
- * between that value multiplied by possible FSBs and cpu_mhz which
- * was calculated at boot time. Really ugly, but no other way to do this.
- */
-
-#define ROUNDING 0xf
-
-static int guess_fsb(int mult)
-{
- int speed = cpu_khz / 1000;
- int i;
- int speeds[] = { 666, 1000, 1333, 2000 };
- int f_max, f_min;
-
- for (i = 0; i < 4; i++) {
- f_max = ((speeds[i] * mult) + 50) / 100;
- f_max += (ROUNDING / 2);
- f_min = f_max - ROUNDING;
- if ((speed <= f_max) && (speed >= f_min))
- return speeds[i] / 10;
- }
- return 0;
-}
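Worked example with hypothetical figures: for mult = 130 (a 13x ratio) and cpu_khz near 1,733,000, speed = 1733; testing the 133.3 MHz candidate (speeds[] entry 1333) gives f_max = (1333 * 130 + 50) / 100 + 7 = 1740 and f_min = 1725, so 1733 falls inside the window and guess_fsb() returns 1333 / 10 = 133.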
-
-
-static int __init longhaul_get_ranges(void)
-{
- unsigned int i, j, k = 0;
- unsigned int ratio;
- int mult;
-
- /* Get current frequency */
- mult = longhaul_get_cpu_mult();
- if (mult == -1) {
- printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
- return -EINVAL;
- }
- fsb = guess_fsb(mult);
- if (fsb == 0) {
- printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
- return -EINVAL;
- }
- /* Get max multiplier - as we always did.
- * Longhaul MSR is useful only when voltage scaling is enabled.
- * C3 is booting at max anyway. */
- maxmult = mult;
- /* Get min multiplier */
- switch (cpu_model) {
- case CPU_NEHEMIAH:
- minmult = 50;
- break;
- case CPU_NEHEMIAH_C:
- minmult = 40;
- break;
- default:
- minmult = 30;
- break;
- }
-
- dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
- minmult/10, minmult%10, maxmult/10, maxmult%10);
-
- highest_speed = calc_speed(maxmult);
- lowest_speed = calc_speed(minmult);
- dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
- print_speed(lowest_speed/1000),
- print_speed(highest_speed/1000));
-
- if (lowest_speed == highest_speed) {
- printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
- return -EINVAL;
- }
- if (lowest_speed > highest_speed) {
- printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
- lowest_speed, highest_speed);
- return -EINVAL;
- }
-
- longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
- GFP_KERNEL);
- if (!longhaul_table)
- return -ENOMEM;
-
- for (j = 0; j < numscales; j++) {
- ratio = mults[j];
- if (ratio == -1)
- continue;
- if (ratio > maxmult || ratio < minmult)
- continue;
- longhaul_table[k].frequency = calc_speed(ratio);
- longhaul_table[k].index = j;
- k++;
- }
- if (k <= 1) {
- kfree(longhaul_table);
- return -ENODEV;
- }
- /* Sort */
- for (j = 0; j < k - 1; j++) {
- unsigned int min_f, min_i;
- min_f = longhaul_table[j].frequency;
- min_i = j;
- for (i = j + 1; i < k; i++) {
- if (longhaul_table[i].frequency < min_f) {
- min_f = longhaul_table[i].frequency;
- min_i = i;
- }
- }
- if (min_i != j) {
- swap(longhaul_table[j].frequency,
- longhaul_table[min_i].frequency);
- swap(longhaul_table[j].index,
- longhaul_table[min_i].index);
- }
- }
-
- longhaul_table[k].frequency = CPUFREQ_TABLE_END;
-
- /* Find index we are running on */
- for (j = 0; j < k; j++) {
- if (mults[longhaul_table[j].index & 0x1f] == mult) {
- longhaul_index = j;
- break;
- }
- }
- return 0;
-}
-
-
-static void __init longhaul_setup_voltagescaling(void)
-{
- union msr_longhaul longhaul;
- struct mV_pos minvid, maxvid, vid;
- unsigned int j, speed, pos, kHz_step, numvscales;
- int min_vid_speed;
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
- if (!(longhaul.bits.RevisionID & 1)) {
- printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
- return;
- }
-
- if (!longhaul.bits.VRMRev) {
- printk(KERN_INFO PFX "VRM 8.5\n");
- vrm_mV_table = &vrm85_mV[0];
- mV_vrm_table = &mV_vrm85[0];
- } else {
- printk(KERN_INFO PFX "Mobile VRM\n");
- if (cpu_model < CPU_NEHEMIAH)
- return;
- vrm_mV_table = &mobilevrm_mV[0];
- mV_vrm_table = &mV_mobilevrm[0];
- }
-
- minvid = vrm_mV_table[longhaul.bits.MinimumVID];
- maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
-
- if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
- printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
- "Voltage scaling disabled.\n",
- minvid.mV/1000, minvid.mV%1000,
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- if (minvid.mV == maxvid.mV) {
- printk(KERN_INFO PFX "Claims to support voltage scaling but "
- "min & max are both %d.%03d. "
- "Voltage scaling disabled\n",
- maxvid.mV/1000, maxvid.mV%1000);
- return;
- }
-
- /* How many voltage steps */
- numvscales = maxvid.pos - minvid.pos + 1;
- printk(KERN_INFO PFX
- "Max VID=%d.%03d "
- "Min VID=%d.%03d, "
- "%d possible voltage scales\n",
- maxvid.mV/1000, maxvid.mV%1000,
- minvid.mV/1000, minvid.mV%1000,
- numvscales);
-
- /* Calculate max frequency at min voltage */
- j = longhaul.bits.MinMHzBR;
- if (longhaul.bits.MinMHzBR4)
- j += 16;
- min_vid_speed = eblcr[j];
- if (min_vid_speed == -1)
- return;
- switch (longhaul.bits.MinMHzFSB) {
- case 0:
- min_vid_speed *= 13333;
- break;
- case 1:
- min_vid_speed *= 10000;
- break;
- case 3:
- min_vid_speed *= 6666;
- break;
- default:
- return;
- }
- if (min_vid_speed >= highest_speed)
- return;
- /* Calculate kHz for one voltage step */
- kHz_step = (highest_speed - min_vid_speed) / numvscales;
-
- j = 0;
- while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
- speed = longhaul_table[j].frequency;
- if (speed > min_vid_speed)
- pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
- else
- pos = minvid.pos;
- longhaul_table[j].index |= mV_vrm_table[pos] << 8;
- vid = vrm_mV_table[mV_vrm_table[pos]];
- printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
- speed, j, vid.mV);
- j++;
- }
-
- can_scale_voltage = 1;
- printk(KERN_INFO PFX "Voltage scaling enabled.\n");
-}
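
The interpolation just built maps each table frequency onto a VID position between minvid.pos and maxvid.pos. Below is a minimal standalone sketch of the same arithmetic (plain userspace C with invented numbers: a 532..1200 MHz range over VID positions 4..12; the real values come from the Longhaul MSR):

#include <stdio.h>

#define MIN_VID_SPEED 532000   /* kHz: max frequency at minimum voltage */
#define HIGHEST_SPEED 1200000  /* kHz */
#define MIN_VID_POS   4
#define MAX_VID_POS   12

int main(void)
{
    unsigned int numvscales = MAX_VID_POS - MIN_VID_POS + 1; /* 9 steps */
    unsigned int khz_step = (HIGHEST_SPEED - MIN_VID_SPEED) / numvscales;
    unsigned int speeds[] = { 400000, 532000, 800000, 1100000 };
    unsigned int i;

    for (i = 0; i < sizeof(speeds) / sizeof(speeds[0]); i++) {
        unsigned int speed = speeds[i], pos;

        /* At or below min_vid_speed the CPU runs at minimum VID;
         * above it, the VID position rises linearly with speed. */
        if (speed > MIN_VID_SPEED)
            pos = (speed - MIN_VID_SPEED) / khz_step + MIN_VID_POS;
        else
            pos = MIN_VID_POS;
        printf("%7u kHz -> VID position %u\n", speed, pos);
    }
    return 0;
}
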
-
-
-static int longhaul_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, longhaul_table);
-}
-
-
-static int longhaul_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int table_index = 0;
- unsigned int i;
- unsigned int dir = 0;
- u8 vid, current_vid;
-
- if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
- relation, &table_index))
- return -EINVAL;
-
- /* Don't set same frequency again */
- if (longhaul_index == table_index)
- return 0;
-
- if (!can_scale_voltage)
- longhaul_setstate(table_index);
- else {
- /* On the test system, voltage transitions exceeding a
- * single step up or down were turning the motherboard
- * off. Both "ondemand" and "userspace" governors are
- * unsafe. The C7 does this in hardware; the C3 is older,
- * so we need to do it in software. */
- i = longhaul_index;
- current_vid = (longhaul_table[longhaul_index].index >> 8);
- current_vid &= 0x1f;
- if (table_index > longhaul_index)
- dir = 1;
- while (i != table_index) {
- vid = (longhaul_table[i].index >> 8) & 0x1f;
- if (vid != current_vid) {
- longhaul_setstate(i);
- current_vid = vid;
- msleep(200);
- }
- if (dir)
- i++;
- else
- i--;
- }
- longhaul_setstate(table_index);
- }
- longhaul_index = table_index;
- return 0;
-}
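
The walk above is easy to model outside the kernel. A standalone sketch with a hypothetical six-entry table (only the per-entry VIDs matter here; set_state_stub() stands in for longhaul_setstate()) showing that no issued transition ever spans more than one voltage step:

#include <stdio.h>

/* Hypothetical per-entry voltage IDs, as carried in bits 8..12 of
 * longhaul_table[].index once voltage scaling is set up. */
static const unsigned int table_vid[] = { 3, 3, 4, 5, 5, 6 };

static void set_state_stub(int i)
{
    printf("transition to table index %d (vid %u)\n", i, table_vid[i]);
}

int main(void)
{
    int cur = 0, target = 5, i = cur;
    int dir = target > cur ? 1 : -1;
    unsigned int vid, current_vid = table_vid[cur];

    /* Walk one table entry at a time and only issue a transition
     * when the VID changes, so no single jump exceeds one step. */
    while (i != target) {
        vid = table_vid[i];
        if (vid != current_vid) {
            set_state_stub(i);
            current_vid = vid;
            /* the driver also msleep()s 200 ms here */
        }
        i += dir;
    }
    set_state_stub(target);
    return 0;
}
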
-
-
-static unsigned int longhaul_get(unsigned int cpu)
-{
- if (cpu)
- return 0;
- return calc_speed(longhaul_get_cpu_mult());
-}
-
-static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
- u32 nesting_level,
- void *context, void **return_value)
-{
- struct acpi_device *d;
-
- if (acpi_bus_get_device(obj_handle, &d))
- return 0;
-
- *return_value = acpi_driver_data(d);
- return 1;
-}
-
-/* VIA doesn't support the PM2 reg, but has something similar */
-static int enable_arbiter_disable(void)
-{
- struct pci_dev *dev;
- int status = 1;
- int reg;
- u8 pci_cmd;
-
- /* Find PLE133 host bridge */
- reg = 0x78;
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
- NULL);
- /* Find PM133/VT8605 host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8605_0, NULL);
- /* Find CLE266 host bridge */
- if (dev == NULL) {
- reg = 0x76;
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_862X_0, NULL);
- /* Find CN400 V-Link host bridge */
- if (dev == NULL)
- dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
- }
- if (dev != NULL) {
- /* Enable access to port 0x22 */
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- pci_cmd |= 1<<7;
- pci_write_config_byte(dev, reg, pci_cmd);
- pci_read_config_byte(dev, reg, &pci_cmd);
- if (!(pci_cmd & 1<<7)) {
- printk(KERN_ERR PFX
- "Can't enable access to port 0x22.\n");
- status = 0;
- }
- }
- pci_dev_put(dev);
- return status;
- }
- return 0;
-}
-
-static int longhaul_setup_southbridge(void)
-{
- struct pci_dev *dev;
- u8 pci_cmd;
-
- /* Find VT8235 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
- if (dev == NULL)
- /* Find VT8237 southbridge */
- dev = pci_get_device(PCI_VENDOR_ID_VIA,
- PCI_DEVICE_ID_VIA_8237, NULL);
- if (dev != NULL) {
- /* Set transition time to max */
- pci_read_config_byte(dev, 0xec, &pci_cmd);
- pci_cmd &= ~(1 << 2);
- pci_write_config_byte(dev, 0xec, pci_cmd);
- pci_read_config_byte(dev, 0xe4, &pci_cmd);
- pci_cmd &= ~(1 << 7);
- pci_write_config_byte(dev, 0xe4, pci_cmd);
- pci_read_config_byte(dev, 0xe5, &pci_cmd);
- pci_cmd |= 1 << 7;
- pci_write_config_byte(dev, 0xe5, pci_cmd);
- /* Get address of ACPI registers block */
- pci_read_config_byte(dev, 0x81, &pci_cmd);
- if (pci_cmd & 1 << 7) {
- pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
- acpi_regs_addr &= 0xff00;
- printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
- acpi_regs_addr);
- }
-
- pci_dev_put(dev);
- return 1;
- }
- return 0;
-}
-
-static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- char *cpuname = NULL;
- int ret;
- u32 lo, hi;
-
- /* Check what we have on this motherboard */
- switch (c->x86_model) {
- case 6:
- cpu_model = CPU_SAMUEL;
- cpuname = "C3 'Samuel' [C5A]";
- longhaul_version = TYPE_LONGHAUL_V1;
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
- break;
-
- case 7:
- switch (c->x86_mask) {
- case 0:
- longhaul_version = TYPE_LONGHAUL_V1;
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- /* Note: this is not a typo; early Samuel2s had
- * Samuel1 ratios. */
- memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
- memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
- break;
- case 1 ... 15:
- longhaul_version = TYPE_LONGHAUL_V2;
- if (c->x86_mask < 8) {
- cpu_model = CPU_SAMUEL2;
- cpuname = "C3 'Samuel 2' [C5B]";
- } else {
- cpu_model = CPU_EZRA;
- cpuname = "C3 'Ezra' [C5C]";
- }
- memcpy(mults, ezra_mults, sizeof(ezra_mults));
- memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
- break;
- }
- break;
-
- case 8:
- cpu_model = CPU_EZRA_T;
- cpuname = "C3 'Ezra-T' [C5M]";
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
- memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
- break;
-
- case 9:
- longhaul_version = TYPE_POWERSAVER;
- numscales = 32;
- memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
- memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
- switch (c->x86_mask) {
- case 0 ... 1:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah A' [C5XLOE]";
- break;
- case 2 ... 4:
- cpu_model = CPU_NEHEMIAH;
- cpuname = "C3 'Nehemiah B' [C5XLOH]";
- break;
- case 5 ... 15:
- cpu_model = CPU_NEHEMIAH_C;
- cpuname = "C3 'Nehemiah C' [C5P]";
- break;
- }
- break;
-
- default:
- cpuname = "Unknown";
- break;
- }
- /* Check Longhaul ver. 2 */
- if (longhaul_version == TYPE_LONGHAUL_V2) {
- rdmsr(MSR_VIA_LONGHAUL, lo, hi);
- if (lo == 0 && hi == 0)
- /* Looks like MSR isn't present */
- longhaul_version = TYPE_LONGHAUL_V1;
- }
-
- printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
- switch (longhaul_version) {
- case TYPE_LONGHAUL_V1:
- case TYPE_LONGHAUL_V2:
- printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
- break;
- case TYPE_POWERSAVER:
- printk(KERN_CONT "Powersaver supported.\n");
- break;
- }
-
- /* Doesn't hurt */
- longhaul_setup_southbridge();
-
- /* Find ACPI data for processor */
- acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
- ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
- NULL, (void *)&pr);
-
- /* Check ACPI support for C3 state */
- if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
- cx = &pr->power.states[ACPI_STATE_C3];
- if (cx->address > 0 && cx->latency <= 1000)
- longhaul_flags |= USE_ACPI_C3;
- }
- /* Disable if it isn't working */
- if (disable_acpi_c3)
- longhaul_flags &= ~USE_ACPI_C3;
- /* Check if northbridge is friendly */
- if (enable_arbiter_disable())
- longhaul_flags |= USE_NORTHBRIDGE;
-
- /* Check ACPI support for bus master arbiter disable */
- if (!(longhaul_flags & USE_ACPI_C3
- || longhaul_flags & USE_NORTHBRIDGE)
- && ((pr == NULL) || !(pr->flags.bm_control))) {
- printk(KERN_ERR PFX
- "No ACPI support. Unsupported northbridge.\n");
- return -ENODEV;
- }
-
- if (longhaul_flags & USE_NORTHBRIDGE)
- printk(KERN_INFO PFX "Using northbridge support.\n");
- if (longhaul_flags & USE_ACPI_C3)
- printk(KERN_INFO PFX "Using ACPI support.\n");
-
- ret = longhaul_get_ranges();
- if (ret != 0)
- return ret;
-
- if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
- longhaul_setup_voltagescaling();
-
- policy->cpuinfo.transition_latency = 200000; /* nsec */
- policy->cur = calc_speed(longhaul_get_cpu_mult());
-
- ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
- if (ret)
- return ret;
-
- cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
-
- return 0;
-}
-
-static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *longhaul_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver longhaul_driver = {
- .verify = longhaul_verify,
- .target = longhaul_target,
- .get = longhaul_get,
- .init = longhaul_cpu_init,
- .exit = __devexit_p(longhaul_cpu_exit),
- .name = "longhaul",
- .owner = THIS_MODULE,
- .attr = longhaul_attr,
-};
-
-
-static int __init longhaul_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
- return -ENODEV;
-
-#ifdef CONFIG_SMP
- if (num_online_cpus() > 1) {
- printk(KERN_ERR PFX "More than 1 CPU detected, "
- "longhaul disabled.\n");
- return -ENODEV;
- }
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (cpu_has_apic) {
- printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
- "broken in this configuration.\n");
- return -ENODEV;
- }
-#endif
- switch (c->x86_model) {
- case 6 ... 9:
- return cpufreq_register_driver(&longhaul_driver);
- case 10:
- printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
- default:
- ;
- }
-
- return -ENODEV;
-}
-
-
-static void __exit longhaul_exit(void)
-{
- int i;
-
- for (i = 0; i < numscales; i++) {
- if (mults[i] == maxmult) {
- longhaul_setstate(i);
- break;
- }
- }
-
- cpufreq_unregister_driver(&longhaul_driver);
- kfree(longhaul_table);
-}
-
-/* Even if the BIOS exports an ACPI C3 state, and it is used
- * successfully when the CPU is idle, this state doesn't
- * trigger frequency transitions in some cases. */
-module_param(disable_acpi_c3, int, 0644);
-MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
-/* Change CPU voltage with frequency. Very useful for saving
- * power, but most VIA C3 processors don't support it. */
-module_param(scale_voltage, int, 0644);
-MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
-/* Force revision key to 0 for processors which don't
- * support voltage scaling, but present themselves as
- * if they did. */
-module_param(revid_errata, int, 0644);
-MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(longhaul_init);
-module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index e2360a469f79..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * longhaul.h
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * VIA-specific information
- */
-
-union msr_bcr2 {
- struct {
- unsigned Reserved:19, // 18:0
- ESOFTBF:1, // 19
- Reserved2:3, // 22:20
- CLOCKMUL:4, // 26:23
- Reserved3:5; // 31:27
- } bits;
- unsigned long val;
-};
-
-union msr_longhaul {
- struct {
- unsigned RevisionID:4, // 3:0
- RevisionKey:4, // 7:4
- EnableSoftBusRatio:1, // 8
- EnableSoftVID:1, // 9
- EnableSoftBSEL:1, // 10
- Reserved:3, // 13:11
- SoftBusRatio4:1, // 14
- VRMRev:1, // 15
- SoftBusRatio:4, // 19:16
- SoftVID:5, // 24:20
- Reserved2:3, // 27:25
- SoftBSEL:2, // 29:28
- Reserved3:2, // 31:30
- MaxMHzBR:4, // 35:32
- MaximumVID:5, // 40:36
- MaxMHzFSB:2, // 42:41
- MaxMHzBR4:1, // 43
- Reserved4:4, // 47:44
- MinMHzBR:4, // 51:48
- MinimumVID:5, // 56:52
- MinMHzFSB:2, // 58:57
- MinMHzBR4:1, // 59
- Reserved5:4; // 63:60
- } bits;
- unsigned long long val;
-};
-
-/*
- * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr values specify the ratio read from the CPU.
- * The mults values specify what to write to the CPU.
- */
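
The encoding is easy to exercise in isolation. A minimal standalone sketch (plain userspace C with made-up sample entries, not kernel code) of the div/mod-by-10 decoding that the driver's dprintk lines perform on these tables:

#include <stdio.h>

/* Hypothetical sample entries in the same encoding as mults[]/eblcr[]:
 * the ratio multiplied by 10, with -1 marking RESERVED slots. */
static const int sample_mults[] = { 45, 30, 100, -1 };

int main(void)
{
    unsigned int i;

    for (i = 0; i < sizeof(sample_mults) / sizeof(sample_mults[0]); i++) {
        int m = sample_mults[i];

        if (m == -1) {
            printf("entry %u: RESERVED\n", i);
            continue;
        }
        /* Div/Mod by 10: 45 -> "4.5x", 100 -> "10.0x" */
        printf("entry %u: %d.%dx\n", i, m / 10, m % 10);
    }
    return 0;
}
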
-
-/*
- * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
- */
-static const int __initdata samuel1_mults[16] = {
- -1, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- -1, /* 0100 -> RESERVED */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- -1, /* 1111 -> RESERVED */
-};
-
-static const int __initdata samuel1_eblcr[16] = {
- 50, /* 0000 -> RESERVED */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- -1, /* 0011 -> RESERVED */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- -1, /* 0111 -> RESERVED */
- -1, /* 1000 -> RESERVED */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- -1, /* 1100 -> RESERVED */
- 75, /* 1101 -> 7.5x */
- -1, /* 1110 -> RESERVED */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Samuel2 Stepping 1->15
- */
-static const int __initdata samuel2_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 110, /* 0111 -> 11.0x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 130, /* 1110 -> 13.0x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 Ezra
- */
-static const int __initdata ezra_mults[16] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-};
-
-static const int __initdata ezra_eblcr[16] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-};
-
-/*
- * VIA C3 (Ezra-T) [C5M].
- */
-static const int __initdata ezrat_mults[32] = {
- 100, /* 0000 -> 10.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
-
- -1, /* 0000 -> RESERVED (10.0x) */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (9.0x) */
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> RESERVED (12.0x) */
-};
-
-static const int __initdata ezrat_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 30, /* 0001 -> 3.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- 35, /* 0101 -> 3.5x */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
-
- -1, /* 0000 -> RESERVED (9.0x) */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- -1, /* 0011 -> RESERVED (10.0x) */
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- -1, /* 1100 -> RESERVED (12.0x) */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145, /* 1111 -> 14.5x */
-};
-
-/*
- * VIA C3 Nehemiah
- */
-
-static const int __initdata nehemiah_mults[32] = {
- 100, /* 0000 -> 10.0x */
- -1, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 90, /* 0011 -> 9.0x */
- 95, /* 0100 -> 9.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 55, /* 0111 -> 5.5x */
- 60, /* 1000 -> 6.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 50, /* 1011 -> 5.0x */
- 65, /* 1100 -> 6.5x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 120, /* 1111 -> 12.0x */
- -1, /* 0000 -> 10.0x */
- 110, /* 0001 -> 11.0x */
- -1, /* 0010 -> 12.0x */
- -1, /* 0011 -> 9.0x */
- 105, /* 0100 -> 10.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 135, /* 0111 -> 13.5x */
- 140, /* 1000 -> 14.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 130, /* 1011 -> 13.0x */
- 145, /* 1100 -> 14.5x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- -1, /* 1111 -> 12.0x */
-};
-
-static const int __initdata nehemiah_eblcr[32] = {
- 50, /* 0000 -> 5.0x */
- 160, /* 0001 -> 16.0x */
- 40, /* 0010 -> 4.0x */
- 100, /* 0011 -> 10.0x */
- 55, /* 0100 -> 5.5x */
- -1, /* 0101 -> RESERVED */
- 45, /* 0110 -> 4.5x */
- 95, /* 0111 -> 9.5x */
- 90, /* 1000 -> 9.0x */
- 70, /* 1001 -> 7.0x */
- 80, /* 1010 -> 8.0x */
- 60, /* 1011 -> 6.0x */
- 120, /* 1100 -> 12.0x */
- 75, /* 1101 -> 7.5x */
- 85, /* 1110 -> 8.5x */
- 65, /* 1111 -> 6.5x */
- 90, /* 0000 -> 9.0x */
- 110, /* 0001 -> 11.0x */
- 120, /* 0010 -> 12.0x */
- 100, /* 0011 -> 10.0x */
- 135, /* 0100 -> 13.5x */
- 115, /* 0101 -> 11.5x */
- 125, /* 0110 -> 12.5x */
- 105, /* 0111 -> 10.5x */
- 130, /* 1000 -> 13.0x */
- 150, /* 1001 -> 15.0x */
- 160, /* 1010 -> 16.0x */
- 140, /* 1011 -> 14.0x */
- 120, /* 1100 -> 12.0x */
- 155, /* 1101 -> 15.5x */
- -1, /* 1110 -> RESERVED (13.0x) */
- 145 /* 1111 -> 14.5x */
-};
-
-/*
- * Voltage scales. Div/Mod by 1000 to get actual voltage.
- * Which scale to use depends on the VRM type in use.
- */
-
-struct mV_pos {
- unsigned short mV;
- unsigned short pos;
-};
-
-static const struct mV_pos __initdata vrm85_mV[32] = {
- {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
- {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
- {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
- {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
- {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
- {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
- {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
- {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
-};
-
-static const unsigned char __initdata mV_vrm85[32] = {
- 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
- 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
- 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
- 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
-};
-
-static const struct mV_pos __initdata mobilevrm_mV[32] = {
- {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
- {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
- {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
- {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
- {975, 15}, {950, 14}, {925, 13}, {900, 12},
- {875, 11}, {850, 10}, {825, 9}, {800, 8},
- {775, 7}, {750, 6}, {725, 5}, {700, 4},
- {675, 3}, {650, 2}, {625, 1}, {600, 0}
-};
-
-static const unsigned char __initdata mV_mobilevrm[32] = {
- 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
- 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
- 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
- 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
-};
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index e7b559d74c52..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "longrun", msg)
-
-static struct cpufreq_driver longrun_driver;
-
-/**
- * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
- * values into per cent values. In TMTA microcode, the following is valid:
- * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int longrun_low_freq, longrun_high_freq;
-
-
-/**
- * longrun_get_policy - get the current LongRun policy
- * @policy: struct cpufreq_policy where current policy is written into
- *
- * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
- * and MSR_TMTA_LONGRUN_CTRL
- */
-static void __init longrun_get_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
-
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
- if (msr_lo & 0x01)
- policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
- policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
- msr_lo &= 0x0000007F;
- msr_hi &= 0x0000007F;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- policy->min = policy->max = longrun_high_freq;
- } else {
- policy->min = longrun_low_freq + msr_lo *
- ((longrun_high_freq - longrun_low_freq) / 100);
- policy->max = longrun_low_freq + msr_hi *
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
- policy->cpu = 0;
-}
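
A standalone sketch of the percentage-window arithmetic used above, with invented bounds (300 MHz low, 933 MHz high); in the driver the real bounds come from longrun_determine_freqs():

#include <stdio.h>

/* Invented bounds: a 300..933 MHz window. The MSR fields hold
 * 0..100 percentages of this window. */
#define LOW_FREQ  300000 /* kHz */
#define HIGH_FREQ 933000 /* kHz */

/* same arithmetic as longrun_get_policy() above */
static unsigned int pctg_to_khz(unsigned int pctg)
{
    return LOW_FREQ + pctg * ((HIGH_FREQ - LOW_FREQ) / 100);
}

/* inverse mapping, as used by longrun_set_policy() below */
static unsigned int khz_to_pctg(unsigned int khz)
{
    return (khz - LOW_FREQ) / ((HIGH_FREQ - LOW_FREQ) / 100);
}

int main(void)
{
    printf("50%% -> %u kHz\n", pctg_to_khz(50));         /* 616500 */
    printf("700000 kHz -> %u%%\n", khz_to_pctg(700000)); /* 63 */
    return 0;
}
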
-
-
-/**
- * longrun_set_policy - sets a new CPUFreq policy
- * @policy: new policy
- *
- * Sets a new CPUFreq policy on LongRun-capable processors. This function
- * has to be called with cpufreq_driver locked.
- */
-static int longrun_set_policy(struct cpufreq_policy *policy)
-{
- u32 msr_lo, msr_hi;
- u32 pctg_lo, pctg_hi;
-
- if (!policy)
- return -EINVAL;
-
- if (longrun_high_freq <= longrun_low_freq) {
- /* Assume degenerate Longrun table */
- pctg_lo = pctg_hi = 100;
- } else {
- pctg_lo = (policy->min - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- pctg_hi = (policy->max - longrun_low_freq) /
- ((longrun_high_freq - longrun_low_freq) / 100);
- }
-
- if (pctg_hi > 100)
- pctg_hi = 100;
- if (pctg_lo > pctg_hi)
- pctg_lo = pctg_hi;
-
- /* performance or economy mode */
- rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFFFE;
- switch (policy->policy) {
- case CPUFREQ_POLICY_PERFORMANCE:
- msr_lo |= 0x00000001;
- break;
- case CPUFREQ_POLICY_POWERSAVE:
- break;
- }
- wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-
- /* lower and upper boundary */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_lo |= pctg_lo;
- msr_hi |= pctg_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- return 0;
-}
-
-
-/**
- * longrun_verify_policy - verifies a new CPUFreq policy
- * @policy: the policy to verify
- *
- * Validates a new CPUFreq policy. This function has to be called with
- * cpufreq_driver locked.
- */
-static int longrun_verify_policy(struct cpufreq_policy *policy)
-{
- if (!policy)
- return -EINVAL;
-
- policy->cpu = 0;
- cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
-
- if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
- (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
- return -EINVAL;
-
- return 0;
-}
-
-static unsigned int longrun_get(unsigned int cpu)
-{
- u32 eax, ebx, ecx, edx;
-
- if (cpu)
- return 0;
-
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- dprintk("cpuid eax is %u\n", eax);
-
- return eax * 1000;
-}
-
-/**
- * longrun_determine_freqs - determines the lowest and highest possible core frequency
- * @low_freq: an int to put the lowest frequency into
- * @high_freq: an int to put the highest frequency into
- *
- * Determines the lowest and highest possible core frequencies on this CPU.
- * This is necessary to calculate the performance percentage according to
- * TMTA rules:
- * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
-{
- u32 msr_lo, msr_hi;
- u32 save_lo, save_hi;
- u32 eax, ebx, ecx, edx;
- u32 try_hi;
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (!low_freq || !high_freq)
- return -EINVAL;
-
- if (cpu_has(c, X86_FEATURE_LRTI)) {
- /* if the LongRun Table Interface is present, the
- * detection is a bit easier:
- * For minimum frequency, read out the maximum
- * level (msr_hi), write that into "currently
- * selected level", and read out the frequency.
- * For maximum frequency, read out level zero.
- */
- /* minimum */
- rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
- wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *low_freq = msr_lo * 1000; /* to kHz */
-
- /* maximum */
- wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
- rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
- *high_freq = msr_lo * 1000; /* to kHz */
-
- dprintk("longrun table interface told %u - %u kHz\n",
- *low_freq, *high_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
- return 0;
- }
-
- /* set the upper border to the value determined during TSC init */
- *high_freq = (cpu_khz / 1000);
- *high_freq = *high_freq * 1000;
- dprintk("high frequency is %u kHz\n", *high_freq);
-
- /* get current borders */
- rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
- save_lo = msr_lo & 0x0000007F;
- save_hi = msr_hi & 0x0000007F;
-
- /* if current perf_pctg is larger than 90%, we need to decrease the
- * upper limit to make the calculation more accurate.
- */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
- /* try decreasing in 10% steps; some processors react only
- * to certain barrier values */
- for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
- /* set to 0 to try_hi perf_pctg */
- msr_lo &= 0xFFFFFF80;
- msr_hi &= 0xFFFFFF80;
- msr_hi |= try_hi;
- wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
- /* read out current core MHz and current perf_pctg */
- cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-
- /* restore values */
- wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
- }
- dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
-
- /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- * equals
- * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
- *
- * high_freq * perf_pctg is stored temporarily into "ebx".
- */
- ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
-
- if ((ecx > 95) || (ecx == 0) || (eax < ebx))
- return -EIO;
-
- edx = ((eax - ebx) * 100) / (100 - ecx);
- *low_freq = edx * 1000; /* back to kHz */
-
- dprintk("low frequency is %u kHz\n", *low_freq);
-
- if (*low_freq > *high_freq)
- *low_freq = *high_freq;
-
- return 0;
-}
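
The closing calculation deserves a worked example. Assuming a hypothetical reading of 700 MHz at 60% performance with a 1000 MHz ceiling, the same integer arithmetic recovers a 250 MHz lower bound (check: (700 - 250) / (1000 - 250) = 0.6). A standalone sketch:

#include <stdio.h>

int main(void)
{
    /* Hypothetical CPUID 0x80860007 readings: current core clock in
     * MHz (eax) and current performance percentage (ecx). */
    unsigned int eax = 700;       /* current MHz */
    unsigned int ecx = 60;        /* perf_pctg */
    unsigned int high_mhz = 1000; /* from cpu_khz / 1000 */

    /* Rearranging perf = (cur - low) / (high - low):
     *   low * (1 - perf) = cur - high * perf
     * With perf as an integer percentage this becomes: */
    unsigned int ebx = high_mhz * ecx / 100; /* high * perf */
    unsigned int low_mhz = (eax - ebx) * 100 / (100 - ecx);

    printf("low frequency ~ %u MHz\n", low_mhz); /* 250 here */
    return 0;
}
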
-
-
-static int __init longrun_cpu_init(struct cpufreq_policy *policy)
-{
- int result = 0;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* detect low and high frequency */
- result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
- if (result)
- return result;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.min_freq = longrun_low_freq;
- policy->cpuinfo.max_freq = longrun_high_freq;
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- longrun_get_policy(policy);
-
- return 0;
-}
-
-
-static struct cpufreq_driver longrun_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .verify = longrun_verify_policy,
- .setpolicy = longrun_set_policy,
- .get = longrun_get,
- .init = longrun_cpu_init,
- .name = "longrun",
- .owner = THIS_MODULE,
-};
-
-
-/**
- * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
- *
- * Initializes the LongRun support.
- */
-static int __init longrun_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
- !cpu_has(c, X86_FEATURE_LONGRUN))
- return -ENODEV;
-
- return cpufreq_register_driver(&longrun_driver);
-}
-
-
-/**
- * longrun_exit - unregisters LongRun support
- */
-static void __exit longrun_exit(void)
-{
- cpufreq_unregister_driver(&longrun_driver);
-}
-
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
- "Efficeon processors.");
-MODULE_LICENSE("GPL");
-
-module_init(longrun_init);
-module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018ae..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-
-#include "mperf.h"
-
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
- struct aperfmperf *am = _cur;
-
- get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only the IA32_APERF/IA32_MPERF ratio is architecturally defined;
- * no meaning should be associated with the absolute values of these MSRs.
- */
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu)
-{
- struct aperfmperf perf;
- unsigned long ratio;
- unsigned int retval;
-
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
- return 0;
-
- ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
- per_cpu(acfreq_old_perf, cpu) = perf;
-
- retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
-MODULE_LICENSE("GPL");
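
The comment block above describes the whole algorithm; the kernel helper calc_aperfmperf_ratio() (defined elsewhere) implements the fixed-point division. A standalone sketch of the underlying delta-ratio idea, with made-up counter values:

#include <stdio.h>
#include <stdint.h>

struct aperfmperf {
    uint64_t aperf;
    uint64_t mperf;
};

int main(void)
{
    /* Hypothetical counter snapshots from two reads of IA32_APERF
     * and IA32_MPERF; only the delta ratio is meaningful. */
    struct aperfmperf old = { .aperf = 1000000, .mperf = 2000000 };
    struct aperfmperf cur = { .aperf = 1600000, .mperf = 3000000 };
    unsigned int max_freq = 2400000; /* kHz, advertised maximum */

    uint64_t da = cur.aperf - old.aperf; /* actual-frequency ticks */
    uint64_t dm = cur.mperf - old.mperf; /* max-frequency ticks */

    /* Average C0 frequency = max_freq * dAPERF / dMPERF */
    unsigned int measured = (unsigned int)(max_freq * da / dm);

    printf("measured C0 frequency: %u kHz\n", measured); /* 1440000 */
    return 0;
}
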
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc22..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * (c) 2010 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 7b8a8ba67b07..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
- * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
- * (C) 2002 Tora T. Engstad
- * All Rights Reserved
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * The author(s) of this software shall not be held liable for damages
- * of any nature resulting due to the use of this software. This
- * software is provided AS-IS with no warranties.
- *
- * Date Errata Description
- * 20020525 N44, O17 12.5% or 25% DC causes lockup
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/cpumask.h>
-#include <linux/timex.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/timer.h>
-
-#include "speedstep-lib.h"
-
-#define PFX "p4-clockmod: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "p4-clockmod", msg)
-
-/*
- * Duty Cycle (3 bits). Note that DC_DISABLE is not specified in
- * Intel docs; it is used here simply to mean "modulation disabled".
- */
-enum {
- DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
- DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
-};
-
-#define DC_ENTRIES 8
-
-
-static int has_N44_O17_errata[NR_CPUS];
-static unsigned int stock_freq;
-static struct cpufreq_driver p4clockmod_driver;
-static unsigned int cpufreq_p4_get(unsigned int cpu);
-
-static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
-{
- u32 l, h;
-
- if (!cpu_online(cpu) ||
- (newstate > DC_DISABLE) || (newstate == DC_RESV))
- return -EINVAL;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
-
- if (l & 0x01)
- dprintk("CPU#%d currently thermal throttled\n", cpu);
-
- if (has_N44_O17_errata[cpu] &&
- (newstate == DC_25PT || newstate == DC_DFLT))
- newstate = DC_38PT;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
- if (newstate == DC_DISABLE) {
- dprintk("CPU#%d disabling modulation\n", cpu);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
- } else {
- dprintk("CPU#%d setting duty cycle to %d%%\n",
- cpu, ((125 * newstate) / 10));
- /* bits 63 - 5 : reserved
- * bit 4 : enable/disable
- * bits 3-1 : duty cycle
- * bit 0 : reserved
- */
- l = (l & ~14);
- l = l | (1<<4) | ((newstate & 0x7)<<1);
- wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
- }
-
- return 0;
-}
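
A standalone sketch of the THERM_CONTROL encoding done above, for a hypothetical 2.8 GHz part: each duty-cycle state gives state/8 of full speed (12.5% per step), and DC_DISABLE clears the enable bit:

#include <stdio.h>

int main(void)
{
    unsigned int stock_freq = 2800000; /* kHz, hypothetical P4 */
    unsigned int l = 0;                /* IA32_THERM_CONTROL image */
    unsigned int state;

    for (state = 1; state <= 8; state++) {
        if (state == 8) { /* DC_DISABLE */
            l &= ~(1u << 4); /* clear enable bit: modulation off */
            printf("state %u: modulation off, %u kHz\n",
                   state, stock_freq);
            continue;
        }
        /* clear duty-cycle field and enable bit, then set
         * bit 4 = enable, bits 3..1 = duty-cycle state */
        l = (l & ~0x1eu) | (1u << 4) | ((state & 0x7u) << 1);
        printf("state %u: reg 0x%02x, %u kHz (%u.%u%%)\n",
               state, l, stock_freq * state / 8,
               125 * state / 10, 125 * state % 10);
    }
    return 0;
}
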
-
-
-static struct cpufreq_frequency_table p4clockmod_table[] = {
- {DC_RESV, CPUFREQ_ENTRY_INVALID},
- {DC_DFLT, 0},
- {DC_25PT, 0},
- {DC_38PT, 0},
- {DC_50PT, 0},
- {DC_64PT, 0},
- {DC_75PT, 0},
- {DC_88PT, 0},
- {DC_DISABLE, 0},
- {DC_RESV, CPUFREQ_TABLE_END},
-};
-
-
-static int cpufreq_p4_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = DC_RESV;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = cpufreq_p4_get(policy->cpu);
- freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
-
- if (freqs.new == freqs.old)
- return 0;
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- /* run on each logical CPU,
- * see section 13.15.3 of IA32 Intel Architecture Software
- * Developer's Manual, Volume 3
- */
- for_each_cpu(i, policy->cpus)
- cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
-
- /* notifiers */
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-static int cpufreq_p4_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
-}
-
-
-static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x06) {
- if (cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Warning: EST-capable CPU "
- "detected. The acpi-cpufreq module offers "
- "voltage scaling in addition of frequency "
- "scaling. You should use that instead of "
- "p4-clockmod, if possible.\n");
- switch (c->x86_model) {
- case 0x0E: /* Core */
- case 0x0F: /* Core Duo */
- case 0x16: /* Celeron Core */
- case 0x1C: /* Atom */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
- case 0x0D: /* Pentium M (Dothan) */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
- /* fall through */
- case 0x09: /* Pentium M (Banias) */
- return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
- }
- }
-
- if (c->x86 != 0xF) {
- if (!cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Unknown CPU. "
- "Please send an e-mail to "
- "<cpufreq@vger.kernel.org>\n");
- return 0;
- }
-
- /* on P-4s, the TSC runs at a constant frequency independent of
- * whether throttling is active or not. */
- p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
- printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
- "The speedstep-ich or acpi cpufreq modules offer "
- "voltage scaling in addition of frequency scaling. "
- "You should use either one instead of p4-clockmod, "
- "if possible.\n");
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
- }
-
- return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
-}
-
-
-
-static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
- int cpuid = 0;
- unsigned int i;
-
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
-
- /* Errata workaround */
- cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
- switch (cpuid) {
- case 0x0f07:
- case 0x0f0a:
- case 0x0f11:
- case 0x0f12:
- has_N44_O17_errata[policy->cpu] = 1;
- dprintk("has errata -- disabling low frequencies\n");
- }
-
- if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
- c->x86_model < 2) {
- /* switch to maximum frequency and measure result */
- cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
- recalibrate_cpu_khz();
- }
- /* get max frequency */
- stock_freq = cpufreq_p4_get_frequency(c);
- if (!stock_freq)
- return -EINVAL;
-
- /* table init */
- for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
- if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
- p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- p4clockmod_table[i].frequency = (stock_freq * i)/8;
- }
- cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
-
- /* cpuinfo and default policy values */
-
- /* the transition latency is set to be 1 higher than the maximum
- * transition latency of the ondemand governor */
- policy->cpuinfo.transition_latency = 10000001;
- policy->cur = stock_freq;
-
- return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
-}
-
-
-static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int cpufreq_p4_get(unsigned int cpu)
-{
- u32 l, h;
-
- rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
-
- if (l & 0x10) {
- l = l >> 1;
- l &= 0x7;
- } else
- l = DC_DISABLE;
-
- if (l != DC_DISABLE)
- return stock_freq * l / 8;
-
- return stock_freq;
-}
-
-static struct freq_attr *p4clockmod_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver p4clockmod_driver = {
- .verify = cpufreq_p4_verify,
- .target = cpufreq_p4_target,
- .init = cpufreq_p4_cpu_init,
- .exit = cpufreq_p4_cpu_exit,
- .get = cpufreq_p4_get,
- .name = "p4-clockmod",
- .owner = THIS_MODULE,
- .attr = p4clockmod_attr,
-};
-
-
-static int __init cpufreq_p4_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int ret;
-
- /*
- * THERM_CONTROL is architectural for IA32 now, so
- * we can rely on the capability checks
- */
- if (c->x86_vendor != X86_VENDOR_INTEL)
- return -ENODEV;
-
- if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
- !test_cpu_cap(c, X86_FEATURE_ACC))
- return -ENODEV;
-
- ret = cpufreq_register_driver(&p4clockmod_driver);
- if (!ret)
- printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
- "Modulation available\n");
-
- return ret;
-}
-
-
-static void __exit cpufreq_p4_exit(void)
-{
- cpufreq_unregister_driver(&p4clockmod_driver);
-}
-
-
-MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
-MODULE_LICENSE("GPL");
-
-late_initcall(cpufreq_p4_init);
-module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index a36de5bbb622..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
- *
- * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
- * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
- * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
- * INFRINGEMENT. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#define PCC_VERSION "1.00.00"
-#define POLL_LOOPS 300
-
-#define CMD_COMPLETE 0x1
-#define CMD_GET_FREQ 0x0
-#define CMD_SET_FREQ 0x1
-
-#define BUF_SZ 4
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "pcc-cpufreq", msg)
-
-struct pcc_register_resource {
- u8 descriptor;
- u16 length;
- u8 space_id;
- u8 bit_width;
- u8 bit_offset;
- u8 access_size;
- u64 address;
-} __attribute__ ((packed));
-
-struct pcc_memory_resource {
- u8 descriptor;
- u16 length;
- u8 space_id;
- u8 resource_usage;
- u8 type_specific;
- u64 granularity;
- u64 minimum;
- u64 maximum;
- u64 translation_offset;
- u64 address_length;
-} __attribute__ ((packed));
-
-static struct cpufreq_driver pcc_cpufreq_driver;
-
-struct pcc_header {
- u32 signature;
- u16 length;
- u8 major;
- u8 minor;
- u32 features;
- u16 command;
- u16 status;
- u32 latency;
- u32 minimum_time;
- u32 maximum_time;
- u32 nominal;
- u32 throttled_frequency;
- u32 minimum_frequency;
-};
-
-static void __iomem *pcch_virt_addr;
-static struct pcc_header __iomem *pcch_hdr;
-
-static DEFINE_SPINLOCK(pcc_lock);
-
-static struct acpi_generic_address doorbell;
-
-static u64 doorbell_preserve;
-static u64 doorbell_write;
-
-static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
- 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
-
-struct pcc_cpu {
- u32 input_offset;
- u32 output_offset;
-};
-
-static struct pcc_cpu *pcc_cpu_info;
-
-static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
-{
- cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
- return 0;
-}
-
-static inline void pcc_cmd(void)
-{
- u64 doorbell_value;
- int i;
-
- acpi_read(&doorbell_value, &doorbell);
- acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
- &doorbell);
-
- for (i = 0; i < POLL_LOOPS; i++) {
- if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
- break;
- }
-}
-
-static inline void pcc_clear_mapping(void)
-{
- if (pcch_virt_addr)
- iounmap(pcch_virt_addr);
- pcch_virt_addr = NULL;
-}
-
-static unsigned int pcc_get_freq(unsigned int cpu)
-{
- struct pcc_cpu *pcc_cpu_data;
- unsigned int curr_freq;
- unsigned int freq_limit;
- u16 status;
- u32 input_buffer;
- u32 output_buffer;
-
- spin_lock(&pcc_lock);
-
- dprintk("get: get_freq for CPU %d\n", cpu);
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- input_buffer = 0x1;
- iowrite32(input_buffer,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
- iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
-
- pcc_cmd();
-
- output_buffer =
- ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
-
- /* Clear the input buffer - we are done with the current command */
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
- status = ioread16(&pcch_hdr->status);
- if (status != CMD_COMPLETE) {
- dprintk("get: FAILED: for CPU %d, status is %d\n",
- cpu, status);
- goto cmd_incomplete;
- }
- iowrite16(0, &pcch_hdr->status);
- curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
- / 100) * 1000);
-
- dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
- "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
- cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
- output_buffer, curr_freq);
-
- freq_limit = (output_buffer >> 8) & 0xff;
- if (freq_limit != 0xff) {
- dprintk("get: frequency for cpu %d is being temporarily"
- " capped at %d\n", cpu, curr_freq);
- }
-
- spin_unlock(&pcc_lock);
- return curr_freq;
-
-cmd_incomplete:
- iowrite16(0, &pcch_hdr->status);
- spin_unlock(&pcc_lock);
- return -EINVAL;
-}
-
-static int pcc_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- struct pcc_cpu *pcc_cpu_data;
- struct cpufreq_freqs freqs;
- u16 status;
- u32 input_buffer;
- int cpu;
-
- spin_lock(&pcc_lock);
- cpu = policy->cpu;
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- dprintk("target: CPU %d should go to target freq: %d "
- "(virtual) input_offset is 0x%x\n",
- cpu, target_freq,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
-
- freqs.new = target_freq;
- freqs.cpu = cpu;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- input_buffer = 0x1 | (((target_freq * 100)
- / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
- iowrite32(input_buffer,
- (pcch_virt_addr + pcc_cpu_data->input_offset));
- iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
-
- pcc_cmd();
-
- /* Clear the input buffer - we are done with the current command */
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
- status = ioread16(&pcch_hdr->status);
- if (status != CMD_COMPLETE) {
- dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
- cpu, status);
- goto cmd_incomplete;
- }
- iowrite16(0, &pcch_hdr->status);
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
- spin_unlock(&pcc_lock);
-
- return 0;
-
-cmd_incomplete:
- iowrite16(0, &pcch_hdr->status);
- spin_unlock(&pcc_lock);
- return -EINVAL;
-}
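
The input-buffer layout deserves a worked example. A standalone sketch (invented platform values) of the percent-of-nominal encoding built above: bit 0 marks the request valid, bits 8..15 carry the desired frequency as a percentage of the nominal frequency:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical platform: 2000 MHz nominal, asking for 1500 MHz. */
    uint32_t nominal_mhz = 2000; /* from pcch_hdr->nominal */
    uint32_t target_khz = 1500000;

    /* Same arithmetic as pcc_cpufreq_target() above. */
    uint32_t pct = target_khz * 100 / (nominal_mhz * 1000);
    uint32_t input_buffer = 0x1 | (pct << 8);

    /* prints 0x00004b01 (75% of nominal) */
    printf("input buffer: 0x%08x (%u%% of nominal)\n",
           input_buffer, pct);
    return 0;
}
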
-
-static int pcc_get_offset(int cpu)
-{
- acpi_status status;
- struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
- union acpi_object *pccp, *offset;
- struct pcc_cpu *pcc_cpu_data;
- struct acpi_processor *pr;
- int ret = 0;
-
- pr = per_cpu(processors, cpu);
- pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
- status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- pccp = buffer.pointer;
- if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
- ret = -ENODEV;
- goto out_free;
- }
-
- offset = &(pccp->package.elements[0]);
- if (!offset || offset->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcc_cpu_data->input_offset = offset->integer.value;
-
- offset = &(pccp->package.elements[1]);
- if (!offset || offset->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcc_cpu_data->output_offset = offset->integer.value;
-
- memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
- memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
-
- dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
- "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
- cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
-out_free:
- kfree(buffer.pointer);
- return ret;
-}
-
-static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
-{
- acpi_status status;
- struct acpi_object_list input;
- struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
- union acpi_object in_params[4];
- union acpi_object *out_obj;
- u32 capabilities[2];
- u32 errors;
- u32 supported;
- int ret = 0;
-
- input.count = 4;
- input.pointer = in_params;
- in_params[0].type = ACPI_TYPE_BUFFER;
- in_params[0].buffer.length = 16;
- in_params[0].buffer.pointer = OSC_UUID;
- in_params[1].type = ACPI_TYPE_INTEGER;
- in_params[1].integer.value = 1;
- in_params[2].type = ACPI_TYPE_INTEGER;
- in_params[2].integer.value = 2;
- in_params[3].type = ACPI_TYPE_BUFFER;
- in_params[3].buffer.length = 8;
- in_params[3].buffer.pointer = (u8 *)&capabilities;
-
- capabilities[0] = OSC_QUERY_ENABLE;
- capabilities[1] = 0x1;
-
- status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- if (!output.length)
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
-
- supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
-
- kfree(output.pointer);
- capabilities[0] = 0x0;
- capabilities[1] = 0x1;
-
- status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- if (!output.length)
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
-
- supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
-
-out_free:
- kfree(output.pointer);
- return ret;
-}
-
-static int __init pcc_cpufreq_probe(void)
-{
- acpi_status status;
- struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
- struct pcc_memory_resource *mem_resource;
- struct pcc_register_resource *reg_resource;
- union acpi_object *out_obj, *member;
- acpi_handle handle, osc_handle, pcch_handle;
- int ret = 0;
-
- status = acpi_get_handle(NULL, "\\_SB", &handle);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- status = acpi_get_handle(handle, "PCCH", &pcch_handle);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- status = acpi_get_handle(handle, "_OSC", &osc_handle);
- if (ACPI_SUCCESS(status)) {
- ret = pcc_cpufreq_do_osc(&osc_handle);
- if (ret)
- dprintk("probe: _OSC evaluation did not succeed\n");
- /* Firmware's use of _OSC is optional */
- ret = 0;
- }
-
- status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
- if (ACPI_FAILURE(status))
- return -ENODEV;
-
- out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_PACKAGE) {
- ret = -ENODEV;
- goto out_free;
- }
-
- member = &out_obj->package.elements[0];
- if (member->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
-
- mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
-
- dprintk("probe: mem_resource descriptor: 0x%x,"
- " length: %d, space_id: %d, resource_usage: %d,"
- " type_specific: %d, granularity: 0x%llx,"
- " minimum: 0x%llx, maximum: 0x%llx,"
- " translation_offset: 0x%llx, address_length: 0x%llx\n",
- mem_resource->descriptor, mem_resource->length,
- mem_resource->space_id, mem_resource->resource_usage,
- mem_resource->type_specific, mem_resource->granularity,
- mem_resource->minimum, mem_resource->maximum,
- mem_resource->translation_offset,
- mem_resource->address_length);
-
- if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
- ret = -ENODEV;
- goto out_free;
- }
-
- pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
- mem_resource->address_length);
- if (pcch_virt_addr == NULL) {
- dprintk("probe: could not map shared mem region\n");
- ret = -ENOMEM;
- goto out_free;
- }
- pcch_hdr = pcch_virt_addr;
-
- dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
- dprintk("probe: PCCH header is at physical address: 0x%llx,"
- " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
- " supported features: 0x%x, command field: 0x%x,"
- " status field: 0x%x, nominal latency: %d us\n",
- mem_resource->minimum, ioread32(&pcch_hdr->signature),
- ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
- ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
- ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
- ioread32(&pcch_hdr->latency));
-
- dprintk("probe: min time between commands: %d us,"
- " max time between commands: %d us,"
- " nominal CPU frequency: %d MHz,"
- " minimum CPU frequency: %d MHz,"
- " minimum CPU frequency without throttling: %d MHz\n",
- ioread32(&pcch_hdr->minimum_time),
- ioread32(&pcch_hdr->maximum_time),
- ioread32(&pcch_hdr->nominal),
- ioread32(&pcch_hdr->throttled_frequency),
- ioread32(&pcch_hdr->minimum_frequency));
-
- member = &out_obj->package.elements[1];
- if (member->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
-
- doorbell.space_id = reg_resource->space_id;
- doorbell.bit_width = reg_resource->bit_width;
- doorbell.bit_offset = reg_resource->bit_offset;
- doorbell.access_width = 64;
- doorbell.address = reg_resource->address;
-
- dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
- "bit_offset is %d, access_width is %d, address is 0x%llx\n",
- doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
- doorbell.access_width, reg_resource->address);
-
- member = &out_obj->package.elements[2];
- if (member->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- doorbell_preserve = member->integer.value;
-
- member = &out_obj->package.elements[3];
- if (member->type != ACPI_TYPE_INTEGER) {
- ret = -ENODEV;
- goto pcch_free;
- }
-
- doorbell_write = member->integer.value;
-
- dprintk("probe: doorbell_preserve: 0x%llx,"
- " doorbell_write: 0x%llx\n",
- doorbell_preserve, doorbell_write);
-
- pcc_cpu_info = alloc_percpu(struct pcc_cpu);
- if (!pcc_cpu_info) {
- ret = -ENOMEM;
- goto pcch_free;
- }
-
- printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
- " limits: %d MHz, %d MHz\n", PCC_VERSION,
- ioread32(&pcch_hdr->minimum_frequency),
- ioread32(&pcch_hdr->nominal));
- kfree(output.pointer);
- return ret;
-pcch_free:
- pcc_clear_mapping();
-out_free:
- kfree(output.pointer);
- return ret;
-}
-
-static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
- unsigned int result = 0;
-
- if (!pcch_virt_addr) {
- result = -1;
- goto out;
- }
-
- result = pcc_get_offset(cpu);
- if (result) {
- dprintk("init: PCCP evaluation failed\n");
- goto out;
- }
-
- policy->max = policy->cpuinfo.max_freq =
- ioread32(&pcch_hdr->nominal) * 1000;
- policy->min = policy->cpuinfo.min_freq =
- ioread32(&pcch_hdr->minimum_frequency) * 1000;
- policy->cur = pcc_get_freq(cpu);
-
- if (!policy->cur) {
- dprintk("init: Unable to get current CPU frequency\n");
- result = -EINVAL;
- goto out;
- }
-
- dprintk("init: policy->max is %d, policy->min is %d\n",
- policy->max, policy->min);
-out:
- return result;
-}
-
-static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
- return 0;
-}
-
-static struct cpufreq_driver pcc_cpufreq_driver = {
- .flags = CPUFREQ_CONST_LOOPS,
- .get = pcc_get_freq,
- .verify = pcc_cpufreq_verify,
- .target = pcc_cpufreq_target,
- .init = pcc_cpufreq_cpu_init,
- .exit = pcc_cpufreq_cpu_exit,
- .name = "pcc-cpufreq",
- .owner = THIS_MODULE,
-};
-
-static int __init pcc_cpufreq_init(void)
-{
- int ret;
-
- if (acpi_disabled)
- return 0;
-
- ret = pcc_cpufreq_probe();
- if (ret) {
- dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
- return ret;
- }
-
- ret = cpufreq_register_driver(&pcc_cpufreq_driver);
-
- return ret;
-}
-
-static void __exit pcc_cpufreq_exit(void)
-{
- cpufreq_unregister_driver(&pcc_cpufreq_driver);
-
- pcc_clear_mapping();
-
- free_percpu(pcc_cpu_info);
-}
-
-MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
-MODULE_VERSION(PCC_VERSION);
-MODULE_DESCRIPTION("Processor Clocking Control interface driver");
-MODULE_LICENSE("GPL");
-
-late_initcall(pcc_cpufreq_init);
-module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c57..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
- * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
- * Dominik Brodowski.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
- as it is unused */
-
-#define PFX "powernow-k6: "
-static unsigned int busfreq; /* FSB, in 10 kHz */
-static unsigned int max_multiplier;
-
-
-/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
-static struct cpufreq_frequency_table clock_ratio[] = {
- {45, /* 000 -> 4.5x */ 0},
- {50, /* 001 -> 5.0x */ 0},
- {40, /* 010 -> 4.0x */ 0},
- {55, /* 011 -> 5.5x */ 0},
- {20, /* 100 -> 2.0x */ 0},
- {30, /* 101 -> 3.0x */ 0},
- {60, /* 110 -> 6.0x */ 0},
- {35, /* 111 -> 3.5x */ 0},
- {0, CPUFREQ_TABLE_END}
-};
-
-
-/**
- * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
- *
- * Returns the current setting of the frequency multiplier. Core clock
- * speed is the Front-Side Bus frequency multiplied by this value.
- */
-static int powernow_k6_get_cpu_multiplier(void)
-{
- u64 invalue = 0;
- u32 msrval;
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- return clock_ratio[(invalue >> 5)&7].index;
-}
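
For concreteness, a minimal userspace sketch of the decode step above: the 3-bit ratio code sits in bits 7:5 of the value read from the PowerNow port and indexes the clock_ratio[] table. The sample port value and the 100 MHz bus are assumptions:

#include <stdio.h>

/* clock ratio * 10, indexed by the 3-bit code in bits 7:5 (mirrors clock_ratio[]) */
static const int ratio_x10[8] = { 45, 50, 40, 55, 20, 30, 60, 35 };

int main(void)
{
	unsigned long invalue = 0xc5;	/* assumed sample read from POWERNOW_IOPORT + 0x8 */
	unsigned int busfreq = 10000;	/* FSB in 10 kHz units, i.e. 100 MHz (assumed) */
	unsigned int code = (invalue >> 5) & 7;

	printf("code %u -> %d.%dx, core clock %u kHz\n", code,
	       ratio_x10[code] / 10, ratio_x10[code] % 10,
	       busfreq * ratio_x10[code]);
	return 0;
}

With 0xc5 this prints "code 6 -> 6.0x, core clock 600000 kHz".
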
-
-
-/**
- * powernow_k6_set_state - set the PowerNow! multiplier
- * @best_i: clock_ratio[best_i] is the target multiplier
- *
- * Tries to change the PowerNow! multiplier
- */
-static void powernow_k6_set_state(unsigned int best_i)
-{
- unsigned long outvalue = 0, invalue = 0;
- unsigned long msrval;
- struct cpufreq_freqs freqs;
-
- if (clock_ratio[best_i].index > max_multiplier) {
- printk(KERN_ERR PFX "invalid target frequency\n");
- return;
- }
-
- freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
- freqs.new = busfreq * clock_ratio[best_i].index;
-	freqs.cpu = 0; /* powernow-k6.c is a UP-only driver */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* we now need to transform best_i to the BVC format, see AMD#23446 */
-
- outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
-
- msrval = POWERNOW_IOPORT + 0x1;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
- invalue = inl(POWERNOW_IOPORT + 0x8);
- invalue = invalue & 0xf;
- outvalue = outvalue | invalue;
- outl(outvalue , (POWERNOW_IOPORT + 0x8));
- msrval = POWERNOW_IOPORT + 0x0;
- wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return;
-}
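
As a hedged illustration of the BVC word assembled above (bit positions taken from the expression itself, not re-checked against AMD#23446):

#include <stdio.h>

int main(void)
{
	unsigned int best_i = 3;		/* index into clock_ratio[] (011 -> 5.5x) */
	unsigned long invalue = 0xc5 & 0xf;	/* low nibble preserved from the port */

	/* bits 12, 10 and 9 are set; the new ratio code lands in bits 7:5 */
	unsigned long outvalue = (1 << 12) | (1 << 10) | (1 << 9) | (best_i << 5)
				 | invalue;
	printf("BVC word: 0x%04lx\n", outvalue);
	return 0;
}
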
-
-
-/**
- * powernow_k6_verify - verifies a new CPUfreq policy
- * @policy: new policy
- *
- * Policy must be within lowest and highest possible CPU Frequency,
- * and at least one possible state must be within min and max.
- */
-static int powernow_k6_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
-}
-
-
-/**
- * powernow_k6_target - sets a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * sets a new CPUFreq policy
- */
-static int powernow_k6_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- powernow_k6_set_state(newstate);
-
- return 0;
-}
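
A toy model of the L/H relation semantics mentioned in the comment above; this is a sketch, not the real cpufreq_frequency_table_target(), which additionally honours policy->min/max and skips invalid entries:

#include <stdio.h>

#define RELATION_L 0	/* lowest frequency at or above target */
#define RELATION_H 1	/* highest frequency at or below target */

static int pick(const unsigned int *tbl, int n, unsigned int target, int rel)
{
	int best = -1;
	for (int i = 0; i < n; i++) {
		if (rel == RELATION_L && tbl[i] >= target &&
		    (best < 0 || tbl[i] < tbl[best]))
			best = i;
		if (rel == RELATION_H && tbl[i] <= target &&
		    (best < 0 || tbl[i] > tbl[best]))
			best = i;
	}
	return best;
}

int main(void)
{
	unsigned int freqs[] = { 200000, 300000, 450000, 600000 };	/* kHz */

	printf("L: %u kHz\n", freqs[pick(freqs, 4, 400000, RELATION_L)]);	/* 450000 */
	printf("H: %u kHz\n", freqs[pick(freqs, 4, 400000, RELATION_H)]);	/* 300000 */
	return 0;
}
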
-
-
-static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
-{
- unsigned int i, f;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- /* get frequencies */
- max_multiplier = powernow_k6_get_cpu_multiplier();
- busfreq = cpu_khz / max_multiplier;
-
- /* table init */
- for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
- f = clock_ratio[i].index;
- if (f > max_multiplier)
- clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
- else
- clock_ratio[i].frequency = busfreq * f;
- }
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 200000;
- policy->cur = busfreq * max_multiplier;
-
- result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
-
- return 0;
-}
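
A worked example of the bus-frequency derivation above (the 550 MHz part is an assumed sample): busfreq = cpu_khz / max_multiplier lands in 10 kHz units because the multipliers are scaled by 10:

#include <stdio.h>

int main(void)
{
	unsigned int cpu_khz = 550000;		/* assumed: a 550 MHz K6-2+ */
	unsigned int max_multiplier = 55;	/* 5.5x, scaled by 10 as in clock_ratio[] */

	unsigned int busfreq = cpu_khz / max_multiplier;	/* 10000 -> 100 MHz FSB */
	printf("busfreq %u (10 kHz units) = %u MHz\n", busfreq, busfreq / 100);
	return 0;
}
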
-
-
-static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int i;
- for (i = 0; i < 8; i++) {
- if (i == max_multiplier)
- powernow_k6_set_state(i);
- }
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int powernow_k6_get(unsigned int cpu)
-{
- unsigned int ret;
- ret = (busfreq * powernow_k6_get_cpu_multiplier());
- return ret;
-}
-
-static struct freq_attr *powernow_k6_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_k6_driver = {
- .verify = powernow_k6_verify,
- .target = powernow_k6_target,
- .init = powernow_k6_cpu_init,
- .exit = powernow_k6_cpu_exit,
- .get = powernow_k6_get,
- .name = "powernow-k6",
- .owner = THIS_MODULE,
- .attr = powernow_k6_attr,
-};
-
-
-/**
- * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
- *
- * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
- * devices, -EIO or -EINVAL on problems during initialization, and zero
- * on success.
- */
-static int __init powernow_k6_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
- ((c->x86_model != 12) && (c->x86_model != 13)))
- return -ENODEV;
-
- if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
- printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
- return -EIO;
- }
-
- if (cpufreq_register_driver(&powernow_k6_driver)) {
- release_region(POWERNOW_IOPORT, 16);
- return -EINVAL;
- }
-
- return 0;
-}
-
-
-/**
- * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
- *
- * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
- */
-static void __exit powernow_k6_exit(void)
-{
- cpufreq_unregister_driver(&powernow_k6_driver);
- release_region(POWERNOW_IOPORT, 16);
-}
-
-
-MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE("GPL");
-
-module_init(powernow_k6_init);
-module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 9a97116f89e5..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
-/*
- * AMD K7 Powernow driver.
- * (C) 2003 Dave Jones on behalf of SuSE Labs.
- * (C) 2003-2004 Dave Jones <davej@redhat.com>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Errata 5:
- * CPU may fail to execute a FID/VID change in presence of interrupt.
- * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
- * Errata 15:
- * CPU with half frequency multipliers may hang upon wakeup from disconnect.
- * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/dmi.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
-#include <asm/msr.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-#endif
-
-#include "powernow-k7.h"
-
-#define PFX "powernow: "
-
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags;
- u16 settlingtime;
- u8 reserved1;
- u8 numpst;
-};
-
-struct pst_s {
- u32 cpuid;
- u8 fsbspeed;
- u8 maxfid;
- u8 startvid;
- u8 numpstates;
-};
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-union powernow_acpi_control_t {
- struct {
- unsigned long fid:5,
- vid:5,
- sgtc:20,
- res1:2;
- } bits;
- unsigned long val;
-};
-#endif
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-/* divide by 1000 to get VCore voltage in V. */
-static const int mobile_vid_table[32] = {
- 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
- 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
- 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
- 1075, 1050, 1025, 1000, 975, 950, 925, 0,
-};
-#endif
-
-/* divide by 10 to get FID. */
-static const int fid_codes[32] = {
- 110, 115, 120, 125, 50, 55, 60, 65,
- 70, 75, 80, 85, 90, 95, 100, 105,
- 30, 190, 40, 200, 130, 135, 140, 210,
- 150, 225, 160, 165, 170, 180, -1, -1,
-};
-
-/* This parameter is used to force ACPI instead of the legacy method for
- * configuration purposes.
- */
-
-static int acpi_force;
-
-static struct cpufreq_frequency_table *powernow_table;
-
-static unsigned int can_scale_bus;
-static unsigned int can_scale_vid;
-static unsigned int minimum_speed = -1;
-static unsigned int maximum_speed;
-static unsigned int number_scales;
-static unsigned int fsb;
-static unsigned int latency;
-static char have_a0;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "powernow-k7", msg)
-
-static int check_fsb(unsigned int fsbspeed)
-{
- int delta;
- unsigned int f = fsb / 1000;
-
- delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
- return delta < 5;
-}
-
-static int check_powernow(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- unsigned int maxei, eax, ebx, ecx, edx;
-
- if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
-#ifdef MODULE
- printk(KERN_INFO PFX "This module only works with "
- "AMD K7 CPUs\n");
-#endif
- return 0;
- }
-
- /* Get maximum capabilities */
- maxei = cpuid_eax(0x80000000);
- if (maxei < 0x80000007) { /* Any powernow info ? */
-#ifdef MODULE
- printk(KERN_INFO PFX "No powernow capabilities detected\n");
-#endif
- return 0;
- }
-
- if ((c->x86_model == 6) && (c->x86_mask == 0)) {
- printk(KERN_INFO PFX "K7 660[A0] core detected, "
- "enabling errata workarounds\n");
- have_a0 = 1;
- }
-
- cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
- /* Check we can actually do something before we say anything.*/
- if (!(edx & (1 << 1 | 1 << 2)))
- return 0;
-
- printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
-
- if (edx & 1 << 1) {
- printk("frequency");
- can_scale_bus = 1;
- }
-
- if ((edx & (1 << 1 | 1 << 2)) == 0x6)
- printk(" and ");
-
- if (edx & 1 << 2) {
- printk("voltage");
- can_scale_vid = 1;
- }
-
- printk(".\n");
- return 1;
-}
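
The capability test above boils down to two EDX bits of CPUID leaf 0x80000007; a minimal sketch with an assumed sample value:

#include <stdio.h>

int main(void)
{
	unsigned int edx = 0x6;		/* assumed sample CPUID 0x80000007 EDX */

	if (edx & (1 << 1))		/* frequency ID control */
		printf("can scale frequency\n");
	if (edx & (1 << 2))		/* voltage ID control */
		printf("can scale voltage\n");
	return 0;
}
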
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-static void invalidate_entry(unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-#endif
-
-static int get_ranges(unsigned char *pst)
-{
- unsigned int j;
- unsigned int speed;
- u8 fid, vid;
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table)
- return -ENOMEM;
-
- for (j = 0 ; j < number_scales; j++) {
- fid = *pst++;
-
- powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
- powernow_table[j].index = fid; /* lower 8 bits */
-
- speed = powernow_table[j].frequency;
-
- if ((fid_codes[fid] % 10) == 5) {
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (have_a0 == 1)
- invalidate_entry(j);
-#endif
- }
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
-
- vid = *pst++;
- powernow_table[j].index |= (vid << 8); /* upper 8 bits */
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed/1000, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
- }
- powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
- powernow_table[number_scales].index = 0;
-
- return 0;
-}
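
The fid/vid packing used above (fid in the low 8 bits of the table index, vid in the upper 8) round-trips like this; the sample codes are assumptions:

#include <stdio.h>

int main(void)
{
	unsigned char fid = 0x02, vid = 0x11;	/* assumed sample codes */
	unsigned int index = fid | (vid << 8);	/* fid: low 8 bits, vid: upper 8 */

	printf("packed 0x%04x -> fid 0x%x, vid 0x%x\n",
	       index, index & 0xFF, (index >> 8) & 0xFF);
	return 0;
}
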
-
-
-static void change_FID(int fid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.FID != fid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.FID = fid;
- fidvidctl.bits.VIDC = 0;
- fidvidctl.bits.FIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_VID(int vid)
-{
- union msr_fidvidctl fidvidctl;
-
- rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- if (fidvidctl.bits.VID != vid) {
- fidvidctl.bits.SGTC = latency;
- fidvidctl.bits.VID = vid;
- fidvidctl.bits.FIDC = 0;
- fidvidctl.bits.VIDC = 1;
- wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
- }
-}
-
-
-static void change_speed(unsigned int index)
-{
- u8 fid, vid;
- struct cpufreq_freqs freqs;
- union msr_fidvidstatus fidvidstatus;
- int cfid;
-
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in powernow_decode_bios,
- * vid are the upper 8 bits.
- */
-
- fid = powernow_table[index].index & 0xFF;
- vid = (powernow_table[index].index & 0xFF00) >> 8;
-
- freqs.cpu = 0;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
- freqs.old = fsb * fid_codes[cfid] / 10;
-
- freqs.new = powernow_table[index].frequency;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- /* Now do the magic poking into the MSRs. */
-
- if (have_a0 == 1) /* A0 errata 5 */
- local_irq_disable();
-
- if (freqs.old > freqs.new) {
- /* Going down, so change FID first */
- change_FID(fid);
- change_VID(vid);
- } else {
- /* Going up, so change VID first */
- change_VID(vid);
- change_FID(fid);
- }
-
-
- if (have_a0 == 1)
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-}
-
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-
-static struct acpi_processor_performance *acpi_processor_perf;
-
-static int powernow_acpi_init(void)
-{
- int i;
- int retval = 0;
- union powernow_acpi_control_t pc;
-
- if (acpi_processor_perf != NULL && powernow_table != NULL) {
- retval = -EINVAL;
- goto err0;
- }
-
- acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
- GFP_KERNEL);
- if (!acpi_processor_perf) {
- retval = -ENOMEM;
- goto err0;
- }
-
- if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
- GFP_KERNEL)) {
- retval = -ENOMEM;
- goto err05;
- }
-
- if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
- retval = -EIO;
- goto err1;
- }
-
- if (acpi_processor_perf->control_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- if (acpi_processor_perf->status_register.space_id !=
- ACPI_ADR_SPACE_FIXED_HARDWARE) {
- retval = -ENODEV;
- goto err2;
- }
-
- number_scales = acpi_processor_perf->state_count;
-
- if (number_scales < 2) {
- retval = -ENODEV;
- goto err2;
- }
-
- powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
- (number_scales + 1)), GFP_KERNEL);
- if (!powernow_table) {
- retval = -ENOMEM;
- goto err2;
- }
-
- pc.val = (unsigned long) acpi_processor_perf->states[0].control;
- for (i = 0; i < number_scales; i++) {
- u8 fid, vid;
- struct acpi_processor_px *state =
- &acpi_processor_perf->states[i];
- unsigned int speed, speed_mhz;
-
- pc.val = (unsigned long) state->control;
- dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
- i,
- (u32) state->core_frequency,
- (u32) state->power,
- (u32) state->transition_latency,
- (u32) state->control,
- pc.bits.sgtc);
-
- vid = pc.bits.vid;
- fid = pc.bits.fid;
-
- powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
- powernow_table[i].index = fid; /* lower 8 bits */
- powernow_table[i].index |= (vid << 8); /* upper 8 bits */
-
- speed = powernow_table[i].frequency;
- speed_mhz = speed / 1000;
-
- /* processor_perflib will multiply the MHz value by 1000 to
- * get a KHz value (e.g. 1266000). However, powernow-k7 works
- * with true KHz values (e.g. 1266768). To ensure that all
- * powernow frequencies are available, we must ensure that
- * ACPI doesn't restrict them, so we round up the MHz value
- * to ensure that perflib's computed KHz value is greater than
- * or equal to powernow's KHz value.
- */
- if (speed % 1000 > 0)
- speed_mhz++;
-
- if ((fid_codes[fid] % 10) == 5) {
- if (have_a0 == 1)
- invalidate_entry(i);
- }
-
- dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
- "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
- fid_codes[fid] % 10, speed_mhz, vid,
- mobile_vid_table[vid]/1000,
- mobile_vid_table[vid]%1000);
-
- if (state->core_frequency != speed_mhz) {
- state->core_frequency = speed_mhz;
- dprintk(" Corrected ACPI frequency to %d\n",
- speed_mhz);
- }
-
- if (latency < pc.bits.sgtc)
- latency = pc.bits.sgtc;
-
- if (speed < minimum_speed)
- minimum_speed = speed;
- if (speed > maximum_speed)
- maximum_speed = speed;
- }
-
- powernow_table[i].frequency = CPUFREQ_TABLE_END;
- powernow_table[i].index = 0;
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- return 0;
-
-err2:
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
-err1:
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
-err05:
- kfree(acpi_processor_perf);
-err0:
- printk(KERN_WARNING PFX "ACPI perflib can not be used on "
- "this platform\n");
- acpi_processor_perf = NULL;
- return retval;
-}
-#else
-static int powernow_acpi_init(void)
-{
-	printk(KERN_INFO PFX "no ACPI processor support found."
-		" Please recompile your kernel with ACPI processor support\n");
- return -EINVAL;
-}
-#endif
-
-static void print_pst_entry(struct pst_s *pst, unsigned int j)
-{
- dprintk("PST:%d (@%p)\n", j, pst);
- dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
- pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
-}
-
-static int powernow_decode_bios(int maxfid, int startvid)
-{
- struct psb_s *psb;
- struct pst_s *pst;
- unsigned int i, j;
- unsigned char *p;
- unsigned int etuple;
- unsigned int ret;
-
- etuple = cpuid_eax(0x80000001);
-
- for (i = 0xC0000; i < 0xffff0 ; i += 16) {
-
- p = phys_to_virt(i);
-
- if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
- dprintk("Found PSB header at %p\n", p);
- psb = (struct psb_s *) p;
- dprintk("Table version: 0x%x\n", psb->tableversion);
- if (psb->tableversion != 0x12) {
- printk(KERN_INFO PFX "Sorry, only v1.2 tables"
- " supported right now\n");
- return -ENODEV;
- }
-
- dprintk("Flags: 0x%x\n", psb->flags);
- if ((psb->flags & 1) == 0)
- dprintk("Mobile voltage regulator\n");
- else
- dprintk("Desktop voltage regulator\n");
-
- latency = psb->settlingtime;
- if (latency < 100) {
- printk(KERN_INFO PFX "BIOS set settling time "
- "to %d microseconds. "
- "Should be at least 100. "
- "Correcting.\n", latency);
- latency = 100;
- }
- dprintk("Settling Time: %d microseconds.\n",
- psb->settlingtime);
- dprintk("Has %d PST tables. (Only dumping ones "
- "relevant to this CPU).\n",
- psb->numpst);
-
- p += sizeof(struct psb_s);
-
- pst = (struct pst_s *) p;
-
- for (j = 0; j < psb->numpst; j++) {
- pst = (struct pst_s *) p;
- number_scales = pst->numpstates;
-
- if ((etuple == pst->cpuid) &&
- check_fsb(pst->fsbspeed) &&
- (maxfid == pst->maxfid) &&
- (startvid == pst->startvid)) {
- print_pst_entry(pst, j);
- p = (char *)pst + sizeof(struct pst_s);
- ret = get_ranges(p);
- return ret;
- } else {
- unsigned int k;
- p = (char *)pst + sizeof(struct pst_s);
- for (k = 0; k < number_scales; k++)
- p += 2;
- }
- }
- printk(KERN_INFO PFX "No PST tables match this cpuid "
- "(0x%x)\n", etuple);
- printk(KERN_INFO PFX "This is indicative of a broken "
- "BIOS.\n");
-
- return -EINVAL;
- }
- p++;
- }
-
- return -ENODEV;
-}
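
The BIOS scan above looks for the "AMDK7PNOW!" signature in the 0xC0000-0xFFFF0 window; a simplified in-memory sketch of the same search (the buffer contents are assumed, and the real loop walks the window in 16-byte steps):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* assumed stand-in for the mapped BIOS window */
	static const unsigned char bios[64] = "......AMDK7PNOW!........";

	for (size_t i = 0; i + 10 <= sizeof(bios); i++) {
		if (memcmp(bios + i, "AMDK7PNOW!", 10) == 0) {
			printf("found PSB header at offset %zu\n", i);
			break;
		}
	}
	return 0;
}
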
-
-
-static int powernow_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate;
-
- if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
- relation, &newstate))
- return -EINVAL;
-
- change_speed(newstate);
-
- return 0;
-}
-
-
-static int powernow_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, powernow_table);
-}
-
-/*
- * We rely on the bus frequency being (roughly) a multiple of
- * 100000/3 kHz, and compute the SGTC from that multiple.
- * That way we more closely match how AMD intends this to work,
- * and get the same kind of behaviour already tested under
- * the "well-known" other OS.
- */
-static int __init fixup_sgtc(void)
-{
- unsigned int sgtc;
- unsigned int m;
-
- m = fsb / 3333;
- if ((m % 10) >= 5)
- m += 5;
-
- m /= 10;
-
- sgtc = 100 * m * latency;
- sgtc = sgtc / 3;
- if (sgtc > 0xfffff) {
- printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
- sgtc = 0xfffff;
- }
- return sgtc;
-}
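
A worked run of fixup_sgtc() with assumed inputs: a 100 MHz bus (fsb = 100000 kHz) gives m = 30, which rounds and divides to 3, so with the minimum 100 us settling time SGTC = 100 * 3 * 100 / 3 = 10000:

#include <stdio.h>

int main(void)
{
	unsigned int fsb = 100000;	/* assumed: bus clock in kHz (100 MHz) */
	unsigned int latency = 100;	/* assumed: settling time in microseconds */
	unsigned int m, sgtc;

	m = fsb / 3333;			/* multiples of 100000/3 kHz, scaled by 10 */
	if ((m % 10) >= 5)
		m += 5;			/* round to the nearest multiple */
	m /= 10;

	sgtc = 100 * m * latency / 3;
	if (sgtc > 0xfffff)		/* SGTC is a 20-bit field */
		sgtc = 0xfffff;
	printf("fsb %u kHz, latency %u us -> SGTC %u\n", fsb, latency, sgtc);
	return 0;
}
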
-
-static unsigned int powernow_get(unsigned int cpu)
-{
- union msr_fidvidstatus fidvidstatus;
- unsigned int cfid;
-
- if (cpu)
- return 0;
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
- cfid = fidvidstatus.bits.CFID;
-
- return fsb * fid_codes[cfid] / 10;
-}
-
-
-static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
-{
- printk(KERN_WARNING PFX
- "%s laptop with broken PST tables in BIOS detected.\n",
- d->ident);
- printk(KERN_WARNING PFX
- "You need to downgrade to 3A21 (09/09/2002), or try a newer "
- "BIOS than 3A71 (01/20/2003)\n");
- printk(KERN_WARNING PFX
- "cpufreq scaling has been disabled as a result of this.\n");
- return 0;
-}
-
-/*
- * Some Athlon laptops have hopelessly broken PST tables.
- * A BIOS update is all that can save them.
- * Mention this, and disable cpufreq.
- */
-static struct dmi_system_id __initdata powernow_dmi_table[] = {
- {
- .callback = acer_cpufreq_pst,
- .ident = "Acer Aspire",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
- DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
- },
- },
- { }
-};
-
-static int __init powernow_cpu_init(struct cpufreq_policy *policy)
-{
- union msr_fidvidstatus fidvidstatus;
- int result;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-
- recalibrate_cpu_khz();
-
- fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
- if (!fsb) {
- printk(KERN_WARNING PFX "can not determine bus frequency\n");
- return -EINVAL;
- }
- dprintk("FSB: %3dMHz\n", fsb/1000);
-
- if (dmi_check_system(powernow_dmi_table) || acpi_force) {
- printk(KERN_INFO PFX "PSB/PST known to be broken. "
- "Trying ACPI instead\n");
- result = powernow_acpi_init();
- } else {
- result = powernow_decode_bios(fidvidstatus.bits.MFID,
- fidvidstatus.bits.SVID);
- if (result) {
- printk(KERN_INFO PFX "Trying ACPI perflib\n");
- maximum_speed = 0;
- minimum_speed = -1;
- latency = 0;
- result = powernow_acpi_init();
- if (result) {
- printk(KERN_INFO PFX
- "ACPI and legacy methods failed\n");
- }
- } else {
- /* SGTC use the bus clock as timer */
- latency = fixup_sgtc();
- printk(KERN_INFO PFX "SGTC: %d\n", latency);
- }
- }
-
- if (result)
- return result;
-
- printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
- minimum_speed/1000, maximum_speed/1000);
-
- policy->cpuinfo.transition_latency =
- cpufreq_scale(2000000UL, fsb, latency);
-
- policy->cur = powernow_get(0);
-
- cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
-
- return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
-}
-
-static int powernow_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- if (acpi_processor_perf) {
- acpi_processor_unregister_performance(acpi_processor_perf, 0);
- free_cpumask_var(acpi_processor_perf->shared_cpu_map);
- kfree(acpi_processor_perf);
- }
-#endif
-
- kfree(powernow_table);
- return 0;
-}
-
-static struct freq_attr *powernow_table_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver powernow_driver = {
- .verify = powernow_verify,
- .target = powernow_target,
- .get = powernow_get,
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
- .bios_limit = acpi_processor_get_bios_limit,
-#endif
- .init = powernow_cpu_init,
- .exit = powernow_cpu_exit,
- .name = "powernow-k7",
- .owner = THIS_MODULE,
- .attr = powernow_table_attr,
-};
-
-static int __init powernow_init(void)
-{
- if (check_powernow() == 0)
- return -ENODEV;
- return cpufreq_register_driver(&powernow_driver);
-}
-
-
-static void __exit powernow_exit(void)
-{
- cpufreq_unregister_driver(&powernow_driver);
-}
-
-module_param(acpi_force, int, 0444);
-MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernow_init);
-module_exit(powernow_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * (C) 2003 Dave Jones.
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * AMD-specific information
- *
- */
-
-union msr_fidvidctl {
- struct {
- unsigned FID:5, // 4:0
- reserved1:3, // 7:5
- VID:5, // 12:8
- reserved2:3, // 15:13
- FIDC:1, // 16
- VIDC:1, // 17
- reserved3:2, // 19:18
- FIDCHGRATIO:1, // 20
- reserved4:11, // 31-21
- SGTC:20, // 32:51
- reserved5:12; // 63:52
- } bits;
- unsigned long long val;
-};
-
-union msr_fidvidstatus {
- struct {
- unsigned CFID:5, // 4:0
- reserved1:3, // 7:5
- SFID:5, // 12:8
- reserved2:3, // 15:13
- MFID:5, // 20:16
- reserved3:11, // 31:21
- CVID:5, // 36:32
- reserved4:3, // 39:37
- SVID:5, // 44:40
- reserved5:3, // 47:45
- MVID:5, // 52:48
- reserved6:11; // 63:53
- } bits;
- unsigned long long val;
-};
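
A userspace sketch of decoding a raw FID/VID status MSR value through the union above; the raw value is an assumed sample and the layout is copied from the definition (unsigned long long bitfields are a common compiler extension):

#include <stdio.h>

/* minimal copy of union msr_fidvidstatus above, for a standalone demo */
union msr_fidvidstatus {
	struct {
		unsigned long long CFID:5, r1:3, SFID:5, r2:3, MFID:5, r3:11,
				   CVID:5, r4:3, SVID:5, r5:3, MVID:5, r6:11;
	} bits;
	unsigned long long val;
};

int main(void)
{
	union msr_fidvidstatus st;

	st.val = 0x000C0004ULL;		/* assumed sample raw MSR value */
	printf("CFID 0x%llx, MFID 0x%llx\n",
	       (unsigned long long)st.bits.CFID,
	       (unsigned long long)st.bits.MFID);
	return 0;
}
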
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 3e90cce3dc8b..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1599 +0,0 @@
-/*
- * (c) 2003-2010 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Support : mark.langsdorf@amd.com
- *
- * Based on the powernow-k7.c module written by Dave Jones.
- * (C) 2003 Dave Jones on behalf of SuSE Labs
- * (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@suse.cz>
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Valuable input gratefully received from Dave Jones, Pavel Machek,
- * Dominik Brodowski, Jacob Shin, and others.
- * Originally developed by Paul Devriendt.
- * Processor information obtained from Chapter 9 (Power and Thermal Management)
- * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
- * Opteron Processors" available for download from www.amd.com
- *
- * Tables for specific CPUs can be inferred from
- * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/cpumask.h>
-#include <linux/sched.h> /* for current / set_cpus_allowed() */
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-
-#include <linux/acpi.h>
-#include <linux/mutex.h>
-#include <acpi/processor.h>
-
-#define PFX "powernow-k8: "
-#define VERSION "version 2.20.00"
-#include "powernow-k8.h"
-#include "mperf.h"
-
-/* serialize freq changes */
-static DEFINE_MUTEX(fidvid_mutex);
-
-static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
-
-static int cpu_family = CPU_OPTERON;
-
-/* core performance boost */
-static bool cpb_capable, cpb_enabled;
-static struct msr __percpu *msrs;
-
-static struct cpufreq_driver cpufreq_amd64_driver;
-
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
- return cpumask_of(0);
-}
-#endif
-
-/* Return a frequency in MHz, given an input fid */
-static u32 find_freq_from_fid(u32 fid)
-{
- return 800 + (fid * 100);
-}
-
-/* Return a frequency in KHz, given an input fid */
-static u32 find_khz_freq_from_fid(u32 fid)
-{
- return 1000 * find_freq_from_fid(fid);
-}
-
-static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
- u32 pstate)
-{
- return data[pstate].frequency;
-}
-
-/* Return the vco fid for an input fid
- *
- * Each "low" fid has a corresponding "high" fid, and you can get to "low"
- * fids only from the corresponding high fids. This returns the "high" fid
- * corresponding to a "low" one.
- */
-static u32 convert_fid_to_vco_fid(u32 fid)
-{
- if (fid < HI_FID_TABLE_BOTTOM)
- return 8 + (2 * fid);
- else
- return fid;
-}
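
To make the fid arithmetic concrete, a sketch of both helpers above; HI_FID_TABLE_BOTTOM is defined in powernow-k8.h, which is not part of this hunk, so the value here is an assumption:

#include <stdio.h>

#define HI_FID_TABLE_BOTTOM 0x08	/* assumed; the real value lives in powernow-k8.h */

static unsigned int find_freq_from_fid(unsigned int fid)
{
	return 800 + (fid * 100);	/* MHz, as above */
}

static unsigned int convert_fid_to_vco_fid(unsigned int fid)
{
	return (fid < HI_FID_TABLE_BOTTOM) ? 8 + (2 * fid) : fid;
}

int main(void)
{
	/* a "low" fid maps up into the vco range; a "high" fid maps to itself */
	printf("fid 0x02: %u MHz, vco fid 0x%x\n",
	       find_freq_from_fid(0x02), convert_fid_to_vco_fid(0x02));
	printf("fid 0x0a: %u MHz, vco fid 0x%x\n",
	       find_freq_from_fid(0x0a), convert_fid_to_vco_fid(0x0a));
	return 0;
}
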
-
-/*
- * Return 1 if the pending bit is set. Unless we just instructed the processor
- * to transition to a new state, seeing this bit set is really bad news.
- */
-static int pending_bit_stuck(void)
-{
- u32 lo, hi;
-
- if (cpu_family == CPU_HW_PSTATE)
- return 0;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
-}
-
-/*
- * Update the global current fid / vid values from the status msr.
- * Returns 1 on error.
- */
-static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
-{
- u32 lo, hi;
- u32 i = 0;
-
- if (cpu_family == CPU_HW_PSTATE) {
- rdmsr(MSR_PSTATE_STATUS, lo, hi);
- i = lo & HW_PSTATE_MASK;
- data->currpstate = i;
-
- /*
- * a workaround for family 11h erratum 311 might cause
-		 * an "out-of-range" Pstate if the core is in Pstate-0
- */
- if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
- data->currpstate = HW_PSTATE_0;
-
- return 0;
- }
- do {
- if (i++ > 10000) {
- dprintk("detected change pending stuck\n");
- return 1;
- }
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- } while (lo & MSR_S_LO_CHANGE_PENDING);
-
- data->currvid = hi & MSR_S_HI_CURRENT_VID;
- data->currfid = lo & MSR_S_LO_CURRENT_FID;
-
- return 0;
-}
-
-/* the isochronous relief time */
-static void count_off_irt(struct powernow_k8_data *data)
-{
- udelay((1 << data->irt) * 10);
- return;
-}
-
-/* the voltage stabilization time */
-static void count_off_vst(struct powernow_k8_data *data)
-{
- udelay(data->vstable * VST_UNITS_20US);
- return;
-}
-
-/* need to init the control msr to a safe value (for each cpu) */
-static void fidvid_msr_init(void)
-{
- u32 lo, hi;
- u8 fid, vid;
-
- rdmsr(MSR_FIDVID_STATUS, lo, hi);
- vid = hi & MSR_S_HI_CURRENT_VID;
- fid = lo & MSR_S_LO_CURRENT_FID;
- lo = fid | (vid << MSR_C_LO_VID_SHIFT);
- hi = MSR_C_HI_STP_GNT_BENIGN;
- dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
- wrmsr(MSR_FIDVID_CTL, lo, hi);
-}
-
-/* write the new fid value along with the other control fields to the msr */
-static int write_new_fid(struct powernow_k8_data *data, u32 fid)
-{
- u32 lo;
- u32 savevid = data->currvid;
- u32 i = 0;
-
- if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on fid write\n");
- return 1;
- }
-
- lo = fid;
- lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
- fid, lo, data->plllock * PLL_LOCK_CONVERSION);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
- if (i++ > 100) {
- printk(KERN_ERR PFX
- "Hardware error - pending bit very stuck - "
- "no further pstate changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- count_off_irt(data);
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX
- "vid change on fid trans, old 0x%x, new 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- if (fid != data->currfid) {
- printk(KERN_ERR PFX
- "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
- data->currfid);
- return 1;
- }
-
- return 0;
-}
-
-/* Write a new vid to the hardware */
-static int write_new_vid(struct powernow_k8_data *data, u32 vid)
-{
- u32 lo;
- u32 savefid = data->currfid;
- int i = 0;
-
- if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
- printk(KERN_ERR PFX "internal error - overflow on vid write\n");
- return 1;
- }
-
- lo = data->currfid;
- lo |= (vid << MSR_C_LO_VID_SHIFT);
- lo |= MSR_C_LO_INIT_FID_VID;
-
- dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
- vid, lo, STOP_GRANT_5NS);
-
- do {
- wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
- if (i++ > 100) {
- printk(KERN_ERR PFX "internal error - pending bit "
- "very stuck - no further pstate "
- "changes possible\n");
- return 1;
- }
- } while (query_current_values_with_pending_wait(data));
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "fid changed on vid trans, old "
- "0x%x new 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (vid != data->currvid) {
- printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
- "curr 0x%x\n",
- vid, data->currvid);
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Reduce the vid toward reqvid, stepping down by at most "step".
- * Decreasing vid codes represent increasing voltages:
- * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
- */
-static int decrease_vid_code_by_step(struct powernow_k8_data *data,
- u32 reqvid, u32 step)
-{
- if ((data->currvid - reqvid) > step)
- reqvid = data->currvid - step;
-
- if (write_new_vid(data, reqvid))
- return 1;
-
- count_off_vst(data);
-
- return 0;
-}
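
The vid-to-voltage relationship stated in the comment above (vid 0 is 1.550V, vid 0x1e is 0.800V) works out to 25 mV per step, assuming a linear scale between those two endpoints:

#include <stdio.h>

int main(void)
{
	/* 1550 - 30 * 25 = 800, so each vid step is 25 mV downward */
	for (unsigned int vid = 0; vid <= 0x1e; vid += 0x0f)
		printf("vid 0x%02x -> %u mV\n", vid, 1550 - 25 * vid);
	return 0;
}
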
-
-/* Change hardware pstate by single MSR write */
-static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
-{
- wrmsr(MSR_PSTATE_CTRL, pstate, 0);
- data->currpstate = pstate;
- return 0;
-}
-
-/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
-static int transition_fid_vid(struct powernow_k8_data *data,
- u32 reqfid, u32 reqvid)
-{
- if (core_voltage_pre_transition(data, reqvid, reqfid))
- return 1;
-
- if (core_frequency_transition(data, reqfid))
- return 1;
-
- if (core_voltage_post_transition(data, reqvid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
- printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
- "curr 0x%x 0x%x\n",
- smp_processor_id(),
- reqfid, reqvid, data->currfid, data->currvid);
- return 1;
- }
-
- dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
- smp_processor_id(), data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 1 - core voltage transition ... setup voltage */
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 reqfid)
-{
- u32 rvosteps = data->rvo;
- u32 savefid = data->currfid;
- u32 maxvid, lo, rvomult = 1;
-
- dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
- "reqvid 0x%x, rvo 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqvid, data->rvo);
-
- if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
- rvomult = 2;
- rvosteps *= rvomult;
- rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
- maxvid = 0x1f & (maxvid >> 16);
- dprintk("ph1 maxvid=0x%x\n", maxvid);
- if (reqvid < maxvid) /* lower numbers are higher voltages */
- reqvid = maxvid;
-
- while (data->currvid > reqvid) {
- dprintk("ph1: curr 0x%x, req vid 0x%x\n",
- data->currvid, reqvid);
- if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
- return 1;
- }
-
- while ((rvosteps > 0) &&
- ((rvomult * data->rvo + data->currvid) > reqvid)) {
- if (data->currvid == maxvid) {
- rvosteps = 0;
- } else {
- dprintk("ph1: changing vid for rvo, req 0x%x\n",
- data->currvid - 1);
- if (decrease_vid_code_by_step(data, data->currvid-1, 1))
- return 1;
- rvosteps--;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 2 - core frequency transition */
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
-{
- u32 vcoreqfid, vcocurrfid, vcofiddiff;
- u32 fid_interval, savevid = data->currvid;
-
- if (data->currfid == reqfid) {
- printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
- data->currfid);
- return 0;
- }
-
- dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
- "reqfid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid, reqfid);
-
- vcoreqfid = convert_fid_to_vco_fid(reqfid);
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
-
- if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
- vcofiddiff = 0;
-
- while (vcofiddiff > 2) {
- (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
-
- if (reqfid > data->currfid) {
- if (data->currfid > LO_FID_TABLE_TOP) {
- if (write_new_fid(data,
- data->currfid + fid_interval))
- return 1;
- } else {
- if (write_new_fid
- (data,
- 2 + convert_fid_to_vco_fid(data->currfid)))
- return 1;
- }
- } else {
- if (write_new_fid(data, data->currfid - fid_interval))
- return 1;
- }
-
- vcocurrfid = convert_fid_to_vco_fid(data->currfid);
- vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
- : vcoreqfid - vcocurrfid;
- }
-
- if (write_new_fid(data, reqfid))
- return 1;
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (data->currfid != reqfid) {
- printk(KERN_ERR PFX
- "ph2: mismatch, failed fid transition, "
- "curr 0x%x, req 0x%x\n",
- data->currfid, reqfid);
- return 1;
- }
-
- if (savevid != data->currvid) {
- printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
- savevid, data->currvid);
- return 1;
- }
-
- dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-/* Phase 3 - core voltage transition flow ... jump to the final vid. */
-static int core_voltage_post_transition(struct powernow_k8_data *data,
- u32 reqvid)
-{
- u32 savefid = data->currfid;
- u32 savereqvid = reqvid;
-
- dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
- smp_processor_id(),
- data->currfid, data->currvid);
-
- if (reqvid != data->currvid) {
- if (write_new_vid(data, reqvid))
- return 1;
-
- if (savefid != data->currfid) {
- printk(KERN_ERR PFX
- "ph3: bad fid change, save 0x%x, curr 0x%x\n",
- savefid, data->currfid);
- return 1;
- }
-
- if (data->currvid != reqvid) {
- printk(KERN_ERR PFX
-			       "ph3: failed vid transition, "
-			       "req 0x%x, curr 0x%x\n",
- reqvid, data->currvid);
- return 1;
- }
- }
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if (savereqvid != data->currvid) {
- dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
- return 1;
- }
-
- if (savefid != data->currfid) {
- dprintk("ph3 failed, currfid changed 0x%x\n",
- data->currfid);
- return 1;
- }
-
- dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
- data->currfid, data->currvid);
-
- return 0;
-}
-
-static void check_supported_cpu(void *_rc)
-{
- u32 eax, ebx, ecx, edx;
- int *rc = _rc;
-
- *rc = -ENODEV;
-
- if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return;
-
- eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
- ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
- return;
-
- if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
- if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
- ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
- printk(KERN_INFO PFX
- "Processor cpuid %x not supported\n", eax);
- return;
- }
-
- eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
- if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
- printk(KERN_INFO PFX
- "No frequency change capabilities detected\n");
- return;
- }
-
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & P_STATE_TRANSITION_CAPABLE)
- != P_STATE_TRANSITION_CAPABLE) {
- printk(KERN_INFO PFX
- "Power state transitions not supported\n");
- return;
- }
- } else { /* must be a HW Pstate capable processor */
- cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
- if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
- cpu_family = CPU_HW_PSTATE;
- else
- return;
- }
-
- *rc = 0;
-}
-
-static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
- u8 maxvid)
-{
- unsigned int j;
- u8 lastfid = 0xff;
-
- for (j = 0; j < data->numps; j++) {
- if (pst[j].vid > LEAST_VID) {
- printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
- j, pst[j].vid);
- return -EINVAL;
- }
- if (pst[j].vid < data->rvo) {
- /* vid + rvo >= 0 */
- printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].vid < maxvid + data->rvo) {
- /* vid + rvo >= maxvid */
- printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (pst[j].fid > MAX_FID) {
- printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
- " %d\n", j);
- return -ENODEV;
- }
- if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
- /* Only first fid is allowed to be in "low" range */
- printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
- "0x%x\n", j, pst[j].fid);
- return -EINVAL;
- }
- if (pst[j].fid < lastfid)
- lastfid = pst[j].fid;
- }
- if (lastfid & 1) {
- printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
- return -EINVAL;
- }
- if (lastfid > LO_FID_TABLE_TOP)
- printk(KERN_INFO FW_BUG PFX
- "first fid not from lo freq table\n");
-
- return 0;
-}
-
-static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
- unsigned int entry)
-{
- powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-
-static void print_basics(struct powernow_k8_data *data)
-{
- int j;
- for (j = 0; j < data->numps; j++) {
- if (data->powernow_table[j].frequency !=
- CPUFREQ_ENTRY_INVALID) {
- if (cpu_family == CPU_HW_PSTATE) {
- printk(KERN_INFO PFX
- " %d : pstate %d (%d MHz)\n", j,
- data->powernow_table[j].index,
- data->powernow_table[j].frequency/1000);
- } else {
- printk(KERN_INFO PFX
- " %d : fid 0x%x (%d MHz), vid 0x%x\n",
- j,
- data->powernow_table[j].index & 0xff,
- data->powernow_table[j].frequency/1000,
- data->powernow_table[j].index >> 8);
- }
- }
- }
- if (data->batps)
- printk(KERN_INFO PFX "Only %d pstates on battery\n",
- data->batps);
-}
-
-static u32 freq_from_fid_did(u32 fid, u32 did)
-{
- u32 mhz = 0;
-
- if (boot_cpu_data.x86 == 0x10)
- mhz = (100 * (fid + 0x10)) >> did;
- else if (boot_cpu_data.x86 == 0x11)
- mhz = (100 * (fid + 8)) >> did;
- else
- BUG();
-
- return mhz * 1000;
-}
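
A worked example of the family 0x10 frequency formula above; the fid and did values are assumed samples:

#include <stdio.h>

int main(void)
{
	unsigned int fid = 6, did = 1;	/* assumed sample P-state fields */

	/* family 0x10: (100 * (fid + 0x10)) >> did; each did step halves the clock */
	unsigned int mhz = (100 * (fid + 0x10)) >> did;
	printf("fid %u, did %u -> %u MHz (%u kHz)\n", fid, did, mhz, mhz * 1000);
	return 0;
}
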
-
-static int fill_powernow_table(struct powernow_k8_data *data,
- struct pst_s *pst, u8 maxvid)
-{
- struct cpufreq_frequency_table *powernow_table;
- unsigned int j;
-
- if (data->batps) {
- /* use ACPI support to get full speed on mains power */
- printk(KERN_WARNING PFX
- "Only %d pstates usable (use ACPI driver for full "
-		       "range)\n", data->batps);
- data->numps = data->batps;
- }
-
- for (j = 1; j < data->numps; j++) {
- if (pst[j-1].fid >= pst[j].fid) {
- printk(KERN_ERR PFX "PST out of sequence\n");
- return -EINVAL;
- }
- }
-
- if (data->numps < 2) {
- printk(KERN_ERR PFX "no p states to transition\n");
- return -ENODEV;
- }
-
- if (check_pst_table(data, pst, maxvid))
- return -EINVAL;
-
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->numps + 1)), GFP_KERNEL);
- if (!powernow_table) {
- printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
- return -ENOMEM;
- }
-
- for (j = 0; j < data->numps; j++) {
- int freq;
- powernow_table[j].index = pst[j].fid; /* lower 8 bits */
- powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
- freq = find_khz_freq_from_fid(pst[j].fid);
- powernow_table[j].frequency = freq;
- }
- powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
- powernow_table[data->numps].index = 0;
-
- if (query_current_values_with_pending_wait(data)) {
- kfree(powernow_table);
- return -EIO;
- }
-
- dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
- data->powernow_table = powernow_table;
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- for (j = 0; j < data->numps; j++)
- if ((pst[j].fid == data->currfid) &&
- (pst[j].vid == data->currvid))
- return 0;
-
- dprintk("currfid/vid do not match PST, ignoring\n");
- return 0;
-}
-
-/* Find and validate the PSB/PST table in BIOS. */
-static int find_psb_table(struct powernow_k8_data *data)
-{
- struct psb_s *psb;
- unsigned int i;
- u32 mvs;
- u8 maxvid;
- u32 cpst = 0;
- u32 thiscpuid;
-
- for (i = 0xc0000; i < 0xffff0; i += 0x10) {
- /* Scan BIOS looking for the signature. */
-		/* It cannot be at 0xffff0 - the table is too big. */
-
- psb = phys_to_virt(i);
- if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
- continue;
-
- dprintk("found PSB header at 0x%p\n", psb);
-
- dprintk("table vers: 0x%x\n", psb->tableversion);
- if (psb->tableversion != PSB_VERSION_1_4) {
- printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
- return -ENODEV;
- }
-
- dprintk("flags: 0x%x\n", psb->flags1);
- if (psb->flags1) {
- printk(KERN_ERR FW_BUG PFX "unknown flags\n");
- return -ENODEV;
- }
-
- data->vstable = psb->vstable;
- dprintk("voltage stabilization time: %d(*20us)\n",
- data->vstable);
-
- dprintk("flags2: 0x%x\n", psb->flags2);
- data->rvo = psb->flags2 & 3;
- data->irt = ((psb->flags2) >> 2) & 3;
- mvs = ((psb->flags2) >> 4) & 3;
- data->vidmvs = 1 << mvs;
- data->batps = ((psb->flags2) >> 6) & 3;
-
- dprintk("ramp voltage offset: %d\n", data->rvo);
- dprintk("isochronous relief time: %d\n", data->irt);
- dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
-
- dprintk("numpst: 0x%x\n", psb->num_tables);
- cpst = psb->num_tables;
- if ((psb->cpuid == 0x00000fc0) ||
- (psb->cpuid == 0x00000fe0)) {
- thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- if ((thiscpuid == 0x00000fc0) ||
- (thiscpuid == 0x00000fe0))
- cpst = 1;
- }
- if (cpst != 1) {
- printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
- return -ENODEV;
- }
-
- data->plllock = psb->plllocktime;
- dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
- dprintk("maxfid: 0x%x\n", psb->maxfid);
- dprintk("maxvid: 0x%x\n", psb->maxvid);
- maxvid = psb->maxvid;
-
- data->numps = psb->numps;
- dprintk("numpstates: 0x%x\n", data->numps);
- return fill_powernow_table(data,
- (struct pst_s *)(psb+1), maxvid);
- }
- /*
-	 * If you see this message, complain to the BIOS manufacturer.
-	 * If they tell you "we do not support Linux" or some similar
-	 * nonsense, remember that Windows 2000 uses the same legacy
-	 * mechanism that the old Linux PSB driver uses. Tell them it
-	 * is broken with Windows 2000.
- *
- * The reference to the AMD documentation is chapter 9 in the
- * BIOS and Kernel Developer's Guide, which is available on
- * www.amd.com
- */
- printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
- return -ENODEV;
-}
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
- unsigned int index)
-{
- u64 control;
-
- if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
- return;
-
- control = data->acpi_data.states[index].control;
- data->irt = (control >> IRT_SHIFT) & IRT_MASK;
- data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
- data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
- data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
- data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
- data->vstable = (control >> VST_SHIFT) & VST_MASK;
-}
-
-static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
-{
- struct cpufreq_frequency_table *powernow_table;
- int ret_val = -ENODEV;
- u64 control, status;
-
- if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
- dprintk("register performance failed: bad ACPI data\n");
- return -EIO;
- }
-
- /* verify the data contained in the ACPI structures */
- if (data->acpi_data.state_count <= 1) {
- dprintk("No ACPI P-States\n");
- goto err_out;
- }
-
- control = data->acpi_data.control_register.space_id;
- status = data->acpi_data.status_register.space_id;
-
- if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
- dprintk("Invalid control/status registers (%x - %x)\n",
- control, status);
- goto err_out;
- }
-
- /* fill in data->powernow_table */
- powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->acpi_data.state_count + 1)), GFP_KERNEL);
- if (!powernow_table) {
- dprintk("powernow_table memory alloc failure\n");
- goto err_out;
- }
-
- /* fill in data */
- data->numps = data->acpi_data.state_count;
- powernow_k8_acpi_pst_values(data, 0);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret_val = fill_powernow_table_pstate(data, powernow_table);
- else
- ret_val = fill_powernow_table_fidvid(data, powernow_table);
- if (ret_val)
- goto err_out_mem;
-
- powernow_table[data->acpi_data.state_count].frequency =
- CPUFREQ_TABLE_END;
- powernow_table[data->acpi_data.state_count].index = 0;
- data->powernow_table = powernow_table;
-
- if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
- print_basics(data);
-
- /* notify BIOS that we exist */
- acpi_processor_notify_smm(THIS_MODULE);
-
- if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
- printk(KERN_ERR PFX
- "unable to alloc powernow_k8_data cpumask\n");
- ret_val = -ENOMEM;
- goto err_out_mem;
- }
-
- return 0;
-
-err_out_mem:
- kfree(powernow_table);
-
-err_out:
- acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
- /* data->acpi_data.state_count informs us at ->exit()
- * whether ACPI was used */
- data->acpi_data.state_count = 0;
-
- return ret_val;
-}
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
- u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
- data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 index;
-
- index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
- if (index > data->max_hw_pstate) {
- printk(KERN_ERR PFX "invalid pstate %d - "
- "bad value %d.\n", i, index);
- printk(KERN_ERR PFX "Please report to BIOS "
- "manufacturer\n");
- invalidate_entry(powernow_table, i);
- continue;
- }
- rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
- if (!(hi & HW_PSTATE_VALID_MASK)) {
- dprintk("invalid pstate %d, ignoring\n", index);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- powernow_table[i].index = index;
-
- /* Frequency may be rounded for these */
- if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
- || boot_cpu_data.x86 == 0x11) {
- powernow_table[i].frequency =
- freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
- } else
- powernow_table[i].frequency =
- data->acpi_data.states[i].core_frequency * 1000;
- }
- return 0;
-}
-
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
- struct cpufreq_frequency_table *powernow_table)
-{
- int i;
-
- for (i = 0; i < data->acpi_data.state_count; i++) {
- u32 fid;
- u32 vid;
- u32 freq, index;
- u64 status, control;
-
- if (data->exttype) {
- status = data->acpi_data.states[i].status;
- fid = status & EXT_FID_MASK;
- vid = (status >> VID_SHIFT) & EXT_VID_MASK;
- } else {
- control = data->acpi_data.states[i].control;
- fid = control & FID_MASK;
- vid = (control >> VID_SHIFT) & VID_MASK;
- }
-
- dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
-
- index = fid | (vid<<8);
- powernow_table[i].index = index;
-
- freq = find_khz_freq_from_fid(fid);
- powernow_table[i].frequency = freq;
-
- /* verify frequency is OK */
- if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
- dprintk("invalid freq %u kHz, ignoring\n", freq);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- /* verify voltage is OK -
- * BIOSs are using "off" to indicate invalid */
- if (vid == VID_OFF) {
- dprintk("invalid vid %u, ignoring\n", vid);
- invalidate_entry(powernow_table, i);
- continue;
- }
-
- if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
- printk(KERN_INFO PFX "invalid freq entries "
- "%u kHz vs. %u kHz\n", freq,
- (unsigned int)
- (data->acpi_data.states[i].core_frequency
- * 1000));
- invalidate_entry(powernow_table, i);
- continue;
- }
- }
- return 0;
-}
-
-static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
-{
- if (data->acpi_data.state_count)
- acpi_processor_unregister_performance(&data->acpi_data,
- data->cpu);
- free_cpumask_var(data->acpi_data.shared_cpu_map);
-}
-
-static int get_transition_latency(struct powernow_k8_data *data)
-{
- int max_latency = 0;
- int i;
- for (i = 0; i < data->acpi_data.state_count; i++) {
- int cur_latency = data->acpi_data.states[i].transition_latency
- + data->acpi_data.states[i].bus_master_latency;
- if (cur_latency > max_latency)
- max_latency = cur_latency;
- }
- if (max_latency == 0) {
- /*
- * Fam 11h and later may return 0 as transition latency. This
-		 * is intended and means "very fast". While the cpufreq core and
-		 * governors can currently handle that gracefully, it is better
-		 * to set it to 1 to avoid problems in the future.
- */
- if (boot_cpu_data.x86 < 0x11)
- printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
- "latency\n");
- max_latency = 1;
- }
- /* value in usecs, needs to be in nanoseconds */
- return 1000 * max_latency;
-}
-
-/* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency_fidvid(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 fid = 0;
- u32 vid = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* fid/vid correctness check for k8 */
- /* fid are the lower 8 bits of the index we stored into
- * the cpufreq frequency table in find_psb_table, vid
- * are the upper 8 bits.
- */
- fid = data->powernow_table[index].index & 0xFF;
- vid = (data->powernow_table[index].index & 0xFF00) >> 8;
-
- dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
-
- if (query_current_values_with_pending_wait(data))
- return 1;
-
- if ((data->currvid == vid) && (data->currfid == fid)) {
- dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
- fid, vid);
- return 0;
- }
-
- dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
- smp_processor_id(), fid, vid);
- freqs.old = find_khz_freq_from_fid(data->currfid);
- freqs.new = find_khz_freq_from_fid(fid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_fid_vid(data, fid, vid);
- freqs.new = find_khz_freq_from_fid(data->currfid);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Take a frequency, and issue the hardware pstate transition command */
-static int transition_frequency_pstate(struct powernow_k8_data *data,
- unsigned int index)
-{
- u32 pstate = 0;
- int res, i;
- struct cpufreq_freqs freqs;
-
- dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
- /* get MSR index for hardware pstate transition */
- pstate = index & HW_PSTATE_MASK;
- if (pstate > data->max_hw_pstate)
- return 0;
- freqs.old = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- res = transition_pstate(data, pstate);
- freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
- for_each_cpu(i, data->available_cores) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- return res;
-}
-
-/* Driver entry point to switch to the target frequency */
-static int powernowk8_target(struct cpufreq_policy *pol,
- unsigned targfreq, unsigned relation)
-{
- cpumask_var_t oldmask;
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
- u32 checkfid;
- u32 checkvid;
- unsigned int newstate;
- int ret = -EIO;
-
- if (!data)
- return -EINVAL;
-
- checkfid = data->currfid;
- checkvid = data->currvid;
-
- /* only run on specific CPU from here on. */
- /* This is poor form: use a workqueue or smp_call_function_single */
- if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_copy(oldmask, tsk_cpus_allowed(current));
- set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
-
- if (smp_processor_id() != pol->cpu) {
- printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
- goto err_out;
- }
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing targ, change pending bit set\n");
- goto err_out;
- }
-
- dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
- pol->cpu, targfreq, pol->min, pol->max, relation);
-
- if (query_current_values_with_pending_wait(data))
- goto err_out;
-
- if (cpu_family != CPU_HW_PSTATE) {
- dprintk("targ: curr fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- if ((checkvid != data->currvid) ||
- (checkfid != data->currfid)) {
- printk(KERN_INFO PFX
- "error - out of sync, fix 0x%x 0x%x, "
- "vid 0x%x 0x%x\n",
- checkfid, data->currfid,
- checkvid, data->currvid);
- }
- }
-
- if (cpufreq_frequency_table_target(pol, data->powernow_table,
- targfreq, relation, &newstate))
- goto err_out;
-
- mutex_lock(&fidvid_mutex);
-
- powernow_k8_acpi_pst_values(data, newstate);
-
- if (cpu_family == CPU_HW_PSTATE)
- ret = transition_frequency_pstate(data, newstate);
- else
- ret = transition_frequency_fidvid(data, newstate);
- if (ret) {
- printk(KERN_ERR PFX "transition frequency failed\n");
- ret = 1;
- mutex_unlock(&fidvid_mutex);
- goto err_out;
- }
- mutex_unlock(&fidvid_mutex);
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- newstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- ret = 0;
-
-err_out:
- set_cpus_allowed_ptr(current, oldmask);
- free_cpumask_var(oldmask);
- return ret;
-}
-
-/* Driver entry point to verify the policy and range of frequencies */
-static int powernowk8_verify(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- return cpufreq_frequency_table_verify(pol, data->powernow_table);
-}
-
-struct init_on_cpu {
- struct powernow_k8_data *data;
- int rc;
-};
-
-static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
-{
- struct init_on_cpu *init_on_cpu = _init_on_cpu;
-
- if (pending_bit_stuck()) {
- printk(KERN_ERR PFX "failing init, change pending bit set\n");
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (query_current_values_with_pending_wait(init_on_cpu->data)) {
- init_on_cpu->rc = -ENODEV;
- return;
- }
-
- if (cpu_family == CPU_OPTERON)
- fidvid_msr_init();
-
- init_on_cpu->rc = 0;
-}
-
-/* per CPU init entry point to the driver */
-static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
-{
- static const char ACPI_PSS_BIOS_BUG_MSG[] =
- KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
- FW_BUG PFX "Try again with latest BIOS.\n";
- struct powernow_k8_data *data;
- struct init_on_cpu init_on_cpu;
- int rc;
- struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
-
- if (!cpu_online(pol->cpu))
- return -ENODEV;
-
- smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
- if (rc)
- return -ENODEV;
-
- data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
- if (!data) {
- printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
- return -ENOMEM;
- }
-
- data->cpu = pol->cpu;
- data->currpstate = HW_PSTATE_INVALID;
-
- if (powernow_k8_cpu_init_acpi(data)) {
- /*
- * Use the PSB BIOS structure. This is only available on
- * a UP version, and is deprecated by AMD.
- */
- if (num_online_cpus() != 1) {
- printk_once(ACPI_PSS_BIOS_BUG_MSG);
- goto err_out;
- }
- if (pol->cpu != 0) {
- printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
- "CPU other than CPU0. Complain to your BIOS "
- "vendor.\n");
- goto err_out;
- }
- rc = find_psb_table(data);
- if (rc)
- goto err_out;
-
- /* Take a crude guess here.
- * The guess is in microseconds, so multiply by 1000 */
- pol->cpuinfo.transition_latency = (
- ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
- ((1 << data->irt) * 30)) * 1000;
- } else /* ACPI _PSS objects available */
- pol->cpuinfo.transition_latency = get_transition_latency(data);
-
- /* only run on specific CPU from here on */
- init_on_cpu.data = data;
- smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
- &init_on_cpu, 1);
- rc = init_on_cpu.rc;
- if (rc != 0)
- goto err_out_exit_acpi;
-
- if (cpu_family == CPU_HW_PSTATE)
- cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
- else
- cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
- data->available_cores = pol->cpus;
-
- if (cpu_family == CPU_HW_PSTATE)
- pol->cur = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- pol->cur = find_khz_freq_from_fid(data->currfid);
- dprintk("policy current frequency %d kHz\n", pol->cur);
-
- /* min/max the cpu is capable of */
- if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
- printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
- powernow_k8_cpu_exit_acpi(data);
- kfree(data->powernow_table);
- kfree(data);
- return -EINVAL;
- }
-
- /* Check for APERF/MPERF support in hardware */
- if (cpu_has(c, X86_FEATURE_APERFMPERF))
- cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
-
- cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
-
- if (cpu_family == CPU_HW_PSTATE)
- dprintk("cpu_init done, current pstate 0x%x\n",
- data->currpstate);
- else
- dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
- data->currfid, data->currvid);
-
- per_cpu(powernow_data, pol->cpu) = data;
-
- return 0;
-
-err_out_exit_acpi:
- powernow_k8_cpu_exit_acpi(data);
-
-err_out:
- kfree(data);
- return -ENODEV;
-}
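
To make the crude PSB latency estimate above concrete, the same arithmetic can be evaluated for sample field values; the rvo, vstable and irt numbers below are assumptions chosen only for illustration:

#include <stdio.h>

#define VST_UNITS_20US 20 /* vstable is expressed in 20 us units */

int main(void)
{
        unsigned int rvo = 0, vstable = 5, irt = 3; /* assumed sample values */

        /* ((rvo + 8) * vstable * 20 us + 2^irt * 30 us) in ns:
         * (8 * 5 * 20 + 8 * 30) * 1000 = 1040000 ns, about 1 ms */
        unsigned int latency_ns =
                ((rvo + 8) * vstable * VST_UNITS_20US +
                 (1 << irt) * 30) * 1000;

        printf("estimated transition latency: %u ns\n", latency_ns);
        return 0;
}
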
-
-static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
- if (!data)
- return -EINVAL;
-
- powernow_k8_cpu_exit_acpi(data);
-
- cpufreq_frequency_table_put_attr(pol->cpu);
-
- kfree(data->powernow_table);
- kfree(data);
- per_cpu(powernow_data, pol->cpu) = NULL;
-
- return 0;
-}
-
-static void query_values_on_cpu(void *_err)
-{
- int *err = _err;
- struct powernow_k8_data *data = __get_cpu_var(powernow_data);
-
- *err = query_current_values_with_pending_wait(data);
-}
-
-static unsigned int powernowk8_get(unsigned int cpu)
-{
- struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
- unsigned int khz = 0;
- int err;
-
- if (!data)
- return 0;
-
- smp_call_function_single(cpu, query_values_on_cpu, &err, true);
- if (err)
- goto out;
-
- if (cpu_family == CPU_HW_PSTATE)
- khz = find_khz_freq_from_pstate(data->powernow_table,
- data->currpstate);
- else
- khz = find_khz_freq_from_fid(data->currfid);
-
-out:
- return khz;
-}
-
-static void _cpb_toggle_msrs(bool t)
-{
- int cpu;
-
- get_online_cpus();
-
- rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- for_each_cpu(cpu, cpu_online_mask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- if (t)
- reg->l &= ~BIT(25);
- else
- reg->l |= BIT(25);
- }
- wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- put_online_cpus();
-}
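
The boost toggle above comes down to one bit in the low word of MSR_K7_HWCR: clearing bit 25 permits boosting, setting it disables boosting. A minimal sketch of that bit manipulation; hwcr_lo_for_boost() is an illustrative name, not a driver function:

#include <stdbool.h>
#include <stdio.h>

#define CPB_DIS (1u << 25) /* boost-disable bit in the low word of MSR_K7_HWCR */

/* Illustrative helper: new low word of MSR_K7_HWCR for a boost state. */
static unsigned int hwcr_lo_for_boost(unsigned int lo, bool enable)
{
        return enable ? (lo & ~CPB_DIS) : (lo | CPB_DIS);
}

int main(void)
{
        printf("enable:  0x%08x\n", hwcr_lo_for_boost(0x02000000, true));
        printf("disable: 0x%08x\n", hwcr_lo_for_boost(0x00000000, false));
        return 0;
}
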
-
-/*
- * Switch on/off core performance boosting.
- *
- * 0 = disable
- * 1 = enable
- */
-static void cpb_toggle(bool t)
-{
- if (!cpb_capable)
- return;
-
- if (t && !cpb_enabled) {
- cpb_enabled = true;
- _cpb_toggle_msrs(t);
- printk(KERN_INFO PFX "Core Boosting enabled.\n");
- } else if (!t && cpb_enabled) {
- cpb_enabled = false;
- _cpb_toggle_msrs(t);
- printk(KERN_INFO PFX "Core Boosting disabled.\n");
- }
-}
-
-static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
- size_t count)
-{
- int ret = -EINVAL;
- unsigned long val = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (!ret && (val == 0 || val == 1) && cpb_capable)
- cpb_toggle(val);
- else
- return -EINVAL;
-
- return count;
-}
-
-static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
-{
- return sprintf(buf, "%u\n", cpb_enabled);
-}
-
-#define define_one_rw(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_rw(cpb);
-
-static struct freq_attr *powernow_k8_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- &cpb,
- NULL,
-};
-
-static struct cpufreq_driver cpufreq_amd64_driver = {
- .verify = powernowk8_verify,
- .target = powernowk8_target,
- .bios_limit = acpi_processor_get_bios_limit,
- .init = powernowk8_cpu_init,
- .exit = __devexit_p(powernowk8_cpu_exit),
- .get = powernowk8_get,
- .name = "powernow-k8",
- .owner = THIS_MODULE,
- .attr = powernow_k8_attr,
-};
-
-/*
- * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
- * cannot block the remaining ones from boosting. On the CPU_UP path we
- * simply keep the boost-disable flag in sync with the current global
- * state.
- */
-static int cpb_notify(struct notifier_block *nb, unsigned long action,
- void *hcpu)
-{
- unsigned cpu = (long)hcpu;
- u32 lo, hi;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
-
- if (!cpb_enabled) {
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- lo |= BIT(25);
- wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
- }
- break;
-
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- lo &= ~BIT(25);
- wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpb_nb = {
- .notifier_call = cpb_notify,
-};
-
-/* driver entry point for init */
-static int __cpuinit powernowk8_init(void)
-{
- unsigned int i, supported_cpus = 0, cpu;
-
- for_each_online_cpu(i) {
- int rc;
- smp_call_function_single(i, check_supported_cpu, &rc, 1);
- if (rc == 0)
- supported_cpus++;
- }
-
- if (supported_cpus != num_online_cpus())
- return -ENODEV;
-
- printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
- num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
-
- if (boot_cpu_has(X86_FEATURE_CPB)) {
-
- cpb_capable = true;
-
- register_cpu_notifier(&cpb_nb);
-
- msrs = msrs_alloc();
- if (!msrs) {
- printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
- return -ENOMEM;
- }
-
- rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
- for_each_cpu(cpu, cpu_online_mask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- cpb_enabled |= !(reg->l & BIT(25));
- }
-
- printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
- (cpb_enabled ? "on" : "off"));
- }
-
- return cpufreq_register_driver(&cpufreq_amd64_driver);
-}
-
-/* driver entry point for term */
-static void __exit powernowk8_exit(void)
-{
- dprintk("exit\n");
-
- if (boot_cpu_has(X86_FEATURE_CPB)) {
- msrs_free(msrs);
- msrs = NULL;
-
- unregister_cpu_notifier(&cpb_nb);
- }
-
- cpufreq_unregister_driver(&cpufreq_amd64_driver);
-}
-
-MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
- "Mark Langsdorf <mark.langsdorf@amd.com>");
-MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernowk8_init);
-module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- */
-
-enum pstate {
- HW_PSTATE_INVALID = 0xff,
- HW_PSTATE_0 = 0,
- HW_PSTATE_1 = 1,
- HW_PSTATE_2 = 2,
- HW_PSTATE_3 = 3,
- HW_PSTATE_4 = 4,
- HW_PSTATE_5 = 5,
- HW_PSTATE_6 = 6,
- HW_PSTATE_7 = 7,
-};
-
-struct powernow_k8_data {
- unsigned int cpu;
-
- u32 numps; /* number of p-states */
- u32 batps; /* number of p-states supported on battery */
- u32 max_hw_pstate; /* maximum legal hardware pstate */
-
- /* these values are constant when the PSB is used to determine
- * vid/fid pairings, but are modified during the ->target() call
- * when ACPI is used */
- u32 rvo; /* ramp voltage offset */
- u32 irt; /* isochronous relief time */
- u32 vidmvs; /* usable value calculated from mvs */
- u32 vstable; /* voltage stabilization time, units 20 us */
- u32 plllock; /* pll lock time, units 1 us */
- u32 exttype; /* extended interface = 1 */
-
- /* keep track of the current fid / vid or pstate */
- u32 currvid;
- u32 currfid;
- enum pstate currpstate;
-
- /* the powernow_table includes all frequency and vid/fid pairings:
- * fid are the lower 8 bits of the index, vid are the upper 8 bits.
- * frequency is in kHz */
- struct cpufreq_frequency_table *powernow_table;
-
- /* the acpi table needs to be kept. it's only available if ACPI was
- * used to determine valid frequency/vid/fid states */
- struct acpi_processor_performance acpi_data;
-
- /* we need to keep track of associated cores, but let cpufreq
- * handle hotplug events - so just point at cpufreq pol->cpus
- * structure */
- struct cpumask *available_cores;
-};
-
-/* processor's cpuid instruction support */
-#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
-#define CPUID_XFAM 0x0ff00000 /* extended family */
-#define CPUID_XFAM_K8 0
-#define CPUID_XMOD 0x000f0000 /* extended model */
-#define CPUID_XMOD_REV_MASK 0x000c0000
-#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
-#define CPUID_USE_XFAM_XMOD 0x00000f00
-#define CPUID_GET_MAX_CAPABILITIES 0x80000000
-#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
-#define P_STATE_TRANSITION_CAPABLE 6
-
-/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
-/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
-/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
-/* the register number is placed in ecx, and the data is returned in edx:eax. */
-
-#define MSR_FIDVID_CTL 0xc0010041
-#define MSR_FIDVID_STATUS 0xc0010042
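
The rdmsr/wrmsr convention described above is what the kernel's rdmsr_on_cpu()/wrmsr_on_cpu() helpers wrap. From userspace, the same registers can be read through the msr(4) character device, where the MSR number doubles as the file offset. A sketch, assuming root privileges and a loaded msr module:

#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint64_t val;
        /* each MSR is exposed at its register number as a file offset */
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0 || pread(fd, &val, sizeof(val), 0xc0010042) != sizeof(val)) {
                perror("rdmsr MSR_FIDVID_STATUS");
                return 1;
        }
        printf("MSR_FIDVID_STATUS: 0x%016" PRIx64 "\n", val);
        close(fd);
        return 0;
}
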
-
-/* Field definitions within the FID VID Low Control MSR : */
-#define MSR_C_LO_INIT_FID_VID 0x00010000
-#define MSR_C_LO_NEW_VID 0x00003f00
-#define MSR_C_LO_NEW_FID 0x0000003f
-#define MSR_C_LO_VID_SHIFT 8
-
-/* Field definitions within the FID VID High Control MSR : */
-#define MSR_C_HI_STP_GNT_TO 0x000fffff
-
-/* Field definitions within the FID VID Low Status MSR : */
-#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
-#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
-#define MSR_S_LO_MAX_FID 0x003f0000
-#define MSR_S_LO_START_FID 0x00003f00
-#define MSR_S_LO_CURRENT_FID 0x0000003f
-
-/* Field definitions within the FID VID High Status MSR : */
-#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
-#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
-#define MSR_S_HI_START_VID 0x00003f00
-#define MSR_S_HI_CURRENT_VID 0x0000003f
-#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
-
-
-/* Hardware Pstate _PSS and MSR definitions */
-#define USE_HW_PSTATE 0x00000080
-#define HW_PSTATE_MASK 0x00000007
-#define HW_PSTATE_VALID_MASK 0x80000000
-#define HW_PSTATE_MAX_MASK 0x000000f0
-#define HW_PSTATE_MAX_SHIFT 4
-#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
-#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
-#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
-#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
-
-/* define the two driver architectures */
-#define CPU_OPTERON 0
-#define CPU_HW_PSTATE 1
-
-
-/*
- * There are restrictions frequencies have to follow:
- * - only 1 entry in the low fid table ( <=1.4GHz )
- * - lowest entry in the high fid table must be >= 2 * the entry in the
- * low fid table
- * - lowest entry in the high fid table must be <= 200MHz + 2 * the entry
- * in the low fid table
- * - the parts can only step at <= 200 MHz intervals; odd fid values are
- * supported only in revision G and later.
- * - lowest frequency must be >= interprocessor hypertransport link speed
- * (only applies to MP systems obviously)
- */
-
-/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
-#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
-#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
-
-#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
-#define HI_VCOFREQ_TABLE_BOTTOM 1600
-
-#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
-
-#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
-#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
-
-#define MIN_FREQ 800 /* Min and max freqs, per spec */
-#define MAX_FREQ 5000
-
-#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
-#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
-
-#define VID_OFF 0x3f
-
-#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
-
-#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
-
-#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
-#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
-
-/*
- * Most values of interest are encoded in a single field of the _PSS
- * entries: the "control" value.
- */
-
-#define IRT_SHIFT 30
-#define RVO_SHIFT 28
-#define EXT_TYPE_SHIFT 27
-#define PLL_L_SHIFT 20
-#define MVS_SHIFT 18
-#define VST_SHIFT 11
-#define VID_SHIFT 6
-#define IRT_MASK 3
-#define RVO_MASK 3
-#define EXT_TYPE_MASK 1
-#define PLL_L_MASK 0x7f
-#define MVS_MASK 3
-#define VST_MASK 0x7f
-#define VID_MASK 0x1f
-#define FID_MASK 0x1f
-#define EXT_VID_MASK 0x3f
-#define EXT_FID_MASK 0x3f
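
Putting the shifts and masks above together, a _PSS control word decodes as below; the sample value is arbitrary, chosen only to exercise each field:

#include <stdio.h>

int main(void)
{
        unsigned int control = 0xe1b28c0a; /* arbitrary sample value */

        printf("irt %u rvo %u exttype %u plllock %u mvs %u vst %u vid %u fid %u\n",
               (control >> 30) & 3,    /* IRT */
               (control >> 28) & 3,    /* RVO */
               (control >> 27) & 1,    /* EXT_TYPE */
               (control >> 20) & 0x7f, /* PLL_L */
               (control >> 18) & 3,    /* MVS */
               (control >> 11) & 0x7f, /* VST */
               (control >> 6) & 0x1f,  /* VID */
               control & 0x1f);        /* FID */
        return 0;
}
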
-
-
-/*
- * Version 1.4 of the PSB table. This table is constructed by the BIOS
- * and tells the OS's power management driver which VIDs and FIDs are
- * supported by this particular processor.
- * If the data in the PSB / PST is wrong, then this driver will program the
- * wrong values into hardware, which is very likely to lead to a crash.
- */
-
-#define PSB_ID_STRING "AMDK7PNOW!"
-#define PSB_ID_STRING_LEN 10
-
-#define PSB_VERSION_1_4 0x14
-
-struct psb_s {
- u8 signature[10];
- u8 tableversion;
- u8 flags1;
- u16 vstable;
- u8 flags2;
- u8 num_tables;
- u32 cpuid;
- u8 plllocktime;
- u8 maxfid;
- u8 maxvid;
- u8 numps;
-};
-
-/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
-struct pst_s {
- u8 fid;
- u8 vid;
-};
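
The driver's find_psb_table() (in a hunk not shown here) locates this structure by scanning BIOS memory for the signature. A hedged sketch of that search over an already-mapped buffer; the function name, stride and buffer handling are assumptions for illustration:

#include <stddef.h>
#include <string.h>

#define PSB_ID_STRING "AMDK7PNOW!"
#define PSB_ID_STRING_LEN 10

/* Illustrative: return the offset of a PSB signature in a ROM image,
 * or (size_t)-1 if none is found. */
static size_t find_psb_offset(const unsigned char *rom, size_t len)
{
        size_t i;

        for (i = 0; i + PSB_ID_STRING_LEN <= len; i += 16)
                if (!memcmp(rom + i, PSB_ID_STRING, PSB_ID_STRING_LEN))
                        return i;
        return (size_t)-1;
}

int main(void)
{
        unsigned char rom[32] = "AMDK7PNOW!";

        return find_psb_offset(rom, sizeof(rom)) == 0 ? 0 : 1;
}
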
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
-
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
- u32 reqvid, u32 reqfid);
-static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * sc520_freq.c: cpufreq driver for the AMD Elan sc520
- *
- * Copyright (C) 2005 Sean Young <sean@mess.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Based on elanfreq.c
- *
- * 2005-03-30: - initial revision
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define MMCR_BASE 0xfffef000 /* The default base address */
-#define OFFS_CPUCTL 0x2 /* CPU Control Register */
-
-static __u8 __iomem *cpuctl;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "sc520_freq", msg)
-#define PFX "sc520_freq: "
-
-static struct cpufreq_frequency_table sc520_freq_table[] = {
- {0x01, 100000},
- {0x02, 133000},
- {0, CPUFREQ_TABLE_END},
-};
-
-static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
-{
- u8 clockspeed_reg = *cpuctl;
-
- switch (clockspeed_reg & 0x03) {
- default:
- printk(KERN_ERR PFX "error: cpuctl register has unexpected "
- "value %02x\n", clockspeed_reg);
- case 0x01:
- return 100000;
- case 0x02:
- return 133000;
- }
-}
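
The two low bits of the SC520 CPUCTL byte select the clock, as the switch above shows: 0x01 means 100 MHz, 0x02 means 133 MHz. Setting a speed is a read-modify-write that preserves the upper bits; a tiny sketch with the register byte as a plain variable:

#include <stdio.h>

int main(void)
{
        unsigned char cpuctl = 0xa1; /* assumed register value: 100 MHz */
        unsigned char want = 0x02;   /* select 133 MHz */

        cpuctl = (cpuctl & ~0x03) | want; /* keep upper bits intact */
        printf("cpuctl now 0x%02x\n", cpuctl); /* 0xa2 */
        return 0;
}
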
-
-static void sc520_freq_set_cpu_state(unsigned int state)
-{
-
- struct cpufreq_freqs freqs;
- u8 clockspeed_reg;
-
- freqs.old = sc520_freq_get_cpu_frequency(0);
- freqs.new = sc520_freq_table[state].frequency;
- freqs.cpu = 0; /* AMD Elan is UP */
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
- dprintk("attempting to set frequency to %i kHz\n",
- sc520_freq_table[state].frequency);
-
- local_irq_disable();
-
- clockspeed_reg = *cpuctl & ~0x03;
- *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
-
- local_irq_enable();
-
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-}
-
-static int sc520_freq_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
-}
-
-static int sc520_freq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
-
- if (cpufreq_frequency_table_target(policy, sc520_freq_table,
- target_freq, relation, &newstate))
- return -EINVAL;
-
- sc520_freq_set_cpu_state(newstate);
-
- return 0;
-}
-
-
-/*
- * Module init and exit code
- */
-
-static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int result;
-
- /* capability check */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9)
- return -ENODEV;
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = 1000000; /* 1ms */
- policy->cur = sc520_freq_get_cpu_frequency(0);
-
- result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
-
- return 0;
-}
-
-
-static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-
-static struct freq_attr *sc520_freq_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver sc520_freq_driver = {
- .get = sc520_freq_get_cpu_frequency,
- .verify = sc520_freq_verify,
- .target = sc520_freq_target,
- .init = sc520_freq_cpu_init,
- .exit = sc520_freq_cpu_exit,
- .name = "sc520_freq",
- .owner = THIS_MODULE,
- .attr = sc520_freq_attr,
-};
-
-
-static int __init sc520_freq_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int err;
-
- /* Test if we have the right hardware */
- if (c->x86_vendor != X86_VENDOR_AMD ||
- c->x86 != 4 || c->x86_model != 9) {
- dprintk("no Elan SC520 processor found!\n");
- return -ENODEV;
- }
- cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
- if (!cpuctl) {
- printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
- return -ENOMEM;
- }
-
- err = cpufreq_register_driver(&sc520_freq_driver);
- if (err)
- iounmap(cpuctl);
-
- return err;
-}
-
-
-static void __exit sc520_freq_exit(void)
-{
- cpufreq_unregister_driver(&sc520_freq_driver);
- iounmap(cpuctl);
-}
-
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Sean Young <sean@mess.org>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
-
-module_init(sc520_freq_init);
-module_exit(sc520_freq_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
-/*
- * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
- * M (part of the Centrino chipset).
- *
- * Since the original Pentium M, most new Intel CPUs support Enhanced
- * SpeedStep.
- *
- * Despite the "SpeedStep" in the name, this is almost entirely unlike
- * traditional SpeedStep.
- *
- * Modelled on speedstep.c
- *
- * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h> /* current */
-#include <linux/delay.h>
-#include <linux/compiler.h>
-#include <linux/gfp.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-#define PFX "speedstep-centrino: "
-#define MAINTAINER "cpufreq@vger.kernel.org"
-
-#define dprintk(msg...) \
- cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
-
-#define INTEL_MSR_RANGE (0xffff)
-
-struct cpu_id
-{
- __u8 x86; /* CPU family */
- __u8 x86_model; /* model */
- __u8 x86_mask; /* stepping */
-};
-
-enum {
- CPU_BANIAS,
- CPU_DOTHAN_A1,
- CPU_DOTHAN_A2,
- CPU_DOTHAN_B0,
- CPU_MP4HT_D0,
- CPU_MP4HT_E0,
-};
-
-static const struct cpu_id cpu_ids[] = {
- [CPU_BANIAS] = { 6, 9, 5 },
- [CPU_DOTHAN_A1] = { 6, 13, 1 },
- [CPU_DOTHAN_A2] = { 6, 13, 2 },
- [CPU_DOTHAN_B0] = { 6, 13, 6 },
- [CPU_MP4HT_D0] = {15, 3, 4 },
- [CPU_MP4HT_E0] = {15, 4, 1 },
-};
-#define N_IDS ARRAY_SIZE(cpu_ids)
-
-struct cpu_model
-{
- const struct cpu_id *cpu_id;
- const char *model_name;
- unsigned max_freq; /* max clock in kHz */
-
- struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
-};
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x);
-
-/* Operating points for current CPU */
-static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
-static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
-
-static struct cpufreq_driver centrino_driver;
-
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
-
-/* Computes the correct form for IA32_PERF_CTL MSR for a particular
- frequency/voltage operating point; frequency in MHz, volts in mV.
- This is stored as "index" in the structure. */
-#define OP(mhz, mv) \
- { \
- .frequency = (mhz) * 1000, \
- .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
- }
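
The index encoding in OP() can be checked by hand. For the 1.30GHz/1388mV operating point listed below, (1300/100) << 8 is 0x0d00 and (1388 - 700)/16 is 0x2b, giving a PERF_CTL index of 0x0d2b:

#include <stdio.h>

#define OP_INDEX(mhz, mv) ((((mhz) / 100) << 8) | (((mv) - 700) / 16))

int main(void)
{
        printf("PERF_CTL index: 0x%04x\n", OP_INDEX(1300, 1388)); /* 0x0d2b */
        return 0;
}
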
-
-/*
- * These voltage tables were derived from the Intel Pentium M
- * datasheet, document 25261202.pdf, Table 5. I have verified they
- * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
- * M.
- */
-
-/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
-static struct cpufreq_frequency_table banias_900[] =
-{
- OP(600, 844),
- OP(800, 988),
- OP(900, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
-static struct cpufreq_frequency_table banias_1000[] =
-{
- OP(600, 844),
- OP(800, 972),
- OP(900, 988),
- OP(1000, 1004),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
-static struct cpufreq_frequency_table banias_1100[] =
-{
- OP( 600, 956),
- OP( 800, 1020),
- OP( 900, 1100),
- OP(1000, 1164),
- OP(1100, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-
-/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
-static struct cpufreq_frequency_table banias_1200[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP( 900, 1020),
- OP(1000, 1100),
- OP(1100, 1164),
- OP(1200, 1180),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.30GHz (Banias) */
-static struct cpufreq_frequency_table banias_1300[] =
-{
- OP( 600, 956),
- OP( 800, 1260),
- OP(1000, 1292),
- OP(1200, 1356),
- OP(1300, 1388),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.40GHz (Banias) */
-static struct cpufreq_frequency_table banias_1400[] =
-{
- OP( 600, 956),
- OP( 800, 1180),
- OP(1000, 1308),
- OP(1200, 1436),
- OP(1400, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.50GHz (Banias) */
-static struct cpufreq_frequency_table banias_1500[] =
-{
- OP( 600, 956),
- OP( 800, 1116),
- OP(1000, 1228),
- OP(1200, 1356),
- OP(1400, 1452),
- OP(1500, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.60GHz (Banias) */
-static struct cpufreq_frequency_table banias_1600[] =
-{
- OP( 600, 956),
- OP( 800, 1036),
- OP(1000, 1164),
- OP(1200, 1276),
- OP(1400, 1420),
- OP(1600, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.70GHz (Banias) */
-static struct cpufreq_frequency_table banias_1700[] =
-{
- OP( 600, 956),
- OP( 800, 1004),
- OP(1000, 1116),
- OP(1200, 1228),
- OP(1400, 1308),
- OP(1700, 1484),
- { .frequency = CPUFREQ_TABLE_END }
-};
-#undef OP
-
-#define _BANIAS(cpuid, max, name) \
-{ .cpu_id = cpuid, \
- .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
- .max_freq = (max)*1000, \
- .op_points = banias_##max, \
-}
-#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
-
-/* CPU models, their operating frequency range, and freq/voltage
- operating points */
-static struct cpu_model models[] =
-{
- _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
- BANIAS(1000),
- BANIAS(1100),
- BANIAS(1200),
- BANIAS(1300),
- BANIAS(1400),
- BANIAS(1500),
- BANIAS(1600),
- BANIAS(1700),
-
- /* NULL model_name is a wildcard */
- { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
- { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
- { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
-
- { NULL, }
-};
-#undef _BANIAS
-#undef BANIAS
-
-static int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- struct cpu_model *model;
-
- for(model = models; model->cpu_id != NULL; model++)
- if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
- (model->model_name == NULL ||
- strcmp(cpu->x86_model_id, model->model_name) == 0))
- break;
-
- if (model->cpu_id == NULL) {
- /* No match at all */
- dprintk("no support for CPU model \"%s\": "
- "send /proc/cpuinfo to " MAINTAINER "\n",
- cpu->x86_model_id);
- return -ENOENT;
- }
-
- if (model->op_points == NULL) {
- /* Matched a non-match */
- dprintk("no table support for CPU model \"%s\"\n",
- cpu->x86_model_id);
- dprintk("try using the acpi-cpufreq driver\n");
- return -ENOENT;
- }
-
- per_cpu(centrino_model, policy->cpu) = model;
-
- dprintk("found \"%s\": max frequency: %dkHz\n",
- model->model_name, model->max_freq);
-
- return 0;
-}
-
-#else
-static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
- return -ENODEV;
-}
-#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
-
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
- const struct cpu_id *x)
-{
- if ((c->x86 == x->x86) &&
- (c->x86_model == x->x86_model) &&
- (c->x86_mask == x->x86_mask))
- return 1;
- return 0;
-}
-
-/* To be called only after centrino_model is initialized */
-static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
-{
- int i;
-
- /*
- * Extract clock in kHz from PERF_CTL value
- * for centrino, as some DSDTs are buggy.
- * Ideally, this can be done using the acpi_data structure.
- */
- if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
- (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
- msr = (msr >> 8) & 0xff;
- return msr * 100000;
- }
-
- if ((!per_cpu(centrino_model, cpu)) ||
- (!per_cpu(centrino_model, cpu)->op_points))
- return 0;
-
- msr &= 0xffff;
- for (i = 0;
- per_cpu(centrino_model, cpu)->op_points[i].frequency
- != CPUFREQ_TABLE_END;
- i++) {
- if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
- return per_cpu(centrino_model, cpu)->
- op_points[i].frequency;
- }
- if (failsafe)
- return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
- else
- return 0;
-}
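
For the Banias/Dothan-A1/Dothan-B0 fast path above, the bus-ratio byte of the MSR value times 100 MHz gives the clock directly; for example, a low word of 0x0d2b decodes as ratio 0x0d, i.e. 1300000 kHz:

#include <stdio.h>

int main(void)
{
        unsigned int msr = 0x0d2b; /* sample PERF_STATUS low word */

        printf("clock: %u kHz\n", ((msr >> 8) & 0xff) * 100000); /* 1300000 */
        return 0;
}
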
-
-/* Return the current CPU frequency in kHz */
-static unsigned int get_cur_freq(unsigned int cpu)
-{
- unsigned l, h;
- unsigned clock_freq;
-
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
- clock_freq = extract_clock(l, cpu, 0);
-
- if (unlikely(clock_freq == 0)) {
- /*
- * On some CPUs, we can see transient MSR values (which are
- * not present in _PSS), while CPU is doing some automatic
- * P-state transition (like TM2). Get the last freq set
- * in PERF_CTL.
- */
- rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
- clock_freq = extract_clock(l, cpu, 1);
- }
- return clock_freq;
-}
-
-
-static int centrino_cpu_init(struct cpufreq_policy *policy)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
- unsigned freq;
- unsigned l, h;
- int ret;
- int i;
-
- /* Only Intel makes Enhanced Speedstep-capable CPUs */
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
- centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
-
- if (policy->cpu != 0)
- return -ENODEV;
-
- for (i = 0; i < N_IDS; i++)
- if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
- break;
-
- if (i != N_IDS)
- per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
-
- if (!per_cpu(centrino_cpu, policy->cpu)) {
- dprintk("found unsupported CPU with "
- "Enhanced SpeedStep: send /proc/cpuinfo to "
- MAINTAINER "\n");
- return -ENODEV;
- }
-
- if (centrino_cpu_init_table(policy)) {
- return -ENODEV;
- }
-
- /* Check to see if Enhanced SpeedStep is enabled, and try to
- enable it if not. */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
- dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
- wrmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- /* check to see if it stuck */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
- printk(KERN_INFO PFX
- "couldn't enable Enhanced SpeedStep\n");
- return -ENODEV;
- }
- }
-
- freq = get_cur_freq(policy->cpu);
- policy->cpuinfo.transition_latency = 10000;
- /* 10uS transition latency */
- policy->cur = freq;
-
- dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
-
- ret = cpufreq_frequency_table_cpuinfo(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
- if (ret)
- return (ret);
-
- cpufreq_frequency_table_get_attr(
- per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
-
- return 0;
-}
-
-static int centrino_cpu_exit(struct cpufreq_policy *policy)
-{
- unsigned int cpu = policy->cpu;
-
- if (!per_cpu(centrino_model, cpu))
- return -ENODEV;
-
- cpufreq_frequency_table_put_attr(cpu);
-
- per_cpu(centrino_model, cpu) = NULL;
-
- return 0;
-}
-
-/**
- * centrino_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within this model's frequency range, with at least
- * one border included.
- */
-static int centrino_verify (struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy,
- per_cpu(centrino_model, policy->cpu)->op_points);
-}
-
-/**
- * centrino_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int centrino_target (struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0;
- unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
- struct cpufreq_freqs freqs;
- int retval = 0;
- unsigned int j, k, first_cpu, tmp;
- cpumask_var_t covered_cpus;
-
- if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
- return -ENOMEM;
-
- if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
- retval = -ENODEV;
- goto out;
- }
-
- if (unlikely(cpufreq_frequency_table_target(policy,
- per_cpu(centrino_model, cpu)->op_points,
- target_freq,
- relation,
- &newstate))) {
- retval = -EINVAL;
- goto out;
- }
-
- first_cpu = 1;
- for_each_cpu(j, policy->cpus) {
- int good_cpu;
-
- /* cpufreq holds the hotplug lock, so we are safe here */
- if (!cpu_online(j))
- continue;
-
- /*
- * Support for SMP systems.
- * Make sure we are running on CPU that wants to change freq
- */
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- good_cpu = cpumask_any_and(policy->cpus,
- cpu_online_mask);
- else
- good_cpu = j;
-
- if (good_cpu >= nr_cpu_ids) {
- dprintk("couldn't limit to CPUs in this domain\n");
- retval = -EAGAIN;
- if (first_cpu) {
- /* We haven't started the transition yet. */
- goto out;
- }
- break;
- }
-
- msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
-
- if (first_cpu) {
- rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
- if (msr == (oldmsr & 0xffff)) {
- dprintk("no change needed - msr was and needs "
- "to be %x\n", oldmsr);
- retval = 0;
- goto out;
- }
-
- freqs.old = extract_clock(oldmsr, cpu, 0);
- freqs.new = extract_clock(msr, cpu, 0);
-
- dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
- target_freq, freqs.old, freqs.new, msr);
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs,
- CPUFREQ_PRECHANGE);
- }
-
- first_cpu = 0;
- /* all but 16 LSB are reserved, treat them with care */
- oldmsr &= ~0xffff;
- msr &= 0xffff;
- oldmsr |= msr;
- }
-
- wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
- break;
-
- cpumask_set_cpu(j, covered_cpus);
- }
-
- for_each_cpu(k, policy->cpus) {
- if (!cpu_online(k))
- continue;
- freqs.cpu = k;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- if (unlikely(retval)) {
- /*
- * We have failed halfway through the frequency change.
- * We have sent callbacks to policy->cpus and
- * MSRs have already been written on covered_cpus.
- * Best effort undo..
- */
-
- for_each_cpu(j, covered_cpus)
- wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
-
- tmp = freqs.new;
- freqs.new = freqs.old;
- freqs.old = tmp;
- for_each_cpu(j, policy->cpus) {
- if (!cpu_online(j))
- continue;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
- }
- retval = 0;
-
-out:
- free_cpumask_var(covered_cpus);
- return retval;
-}
-
-static struct freq_attr* centrino_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver centrino_driver = {
- .name = "centrino", /* should be speedstep-centrino,
- but there's a 16 char limit */
- .init = centrino_cpu_init,
- .exit = centrino_cpu_exit,
- .verify = centrino_verify,
- .target = centrino_target,
- .get = get_cur_freq,
- .attr = centrino_attr,
- .owner = THIS_MODULE,
-};
-
-
-/**
- * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
- *
- * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
- * unsupported devices, -ENOENT if there's no voltage table for this
- * particular CPU model, -EINVAL on problems during initialization,
- * and zero on success.
- *
- * This is quite picky. Not only does the CPU have to advertise the
- * "est" flag in the cpuid capability flags, we look for a specific
- * CPU model and stepping, and we need to have the exact model name in
- * our voltage tables. That is, be paranoid about not releasing
- * someone's valuable magic smoke.
- */
-static int __init centrino_init(void)
-{
- struct cpuinfo_x86 *cpu = &cpu_data(0);
-
- if (!cpu_has(cpu, X86_FEATURE_EST))
- return -ENODEV;
-
- return cpufreq_register_driver(&centrino_driver);
-}
-
-static void __exit centrino_exit(void)
-{
- cpufreq_unregister_driver(&centrino_driver);
-}
-
-MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
-MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
-MODULE_LICENSE ("GPL");
-
-late_initcall(centrino_init);
-module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e95180..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * (C) 2001 Dave Jones, Arjan van de ven.
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- * Based upon reverse engineered information, and on Intel documentation
- * for chipsets ICH2-M and ICH3-M.
- *
- * Many thanks to Ducrot Bruno for finding and fixing the last
- * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
- * for extensive testing.
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/sched.h>
-
-#include "speedstep-lib.h"
-
-
-/* speedstep_chipset:
- * It is necessary to know which chipset is used. As accesses to
- * this device occur at various places in this module, we need a
- * static struct pci_dev * pointing to that device.
- */
-static struct pci_dev *speedstep_chipset_dev;
-
-
-/* speedstep_processor
- */
-static enum speedstep_processor speedstep_processor;
-
-static u32 pmbase;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-ich", msg)
-
-
-/**
- * speedstep_find_register - read the PMBASE address
- *
- * Returns: -ENODEV if no register could be found
- */
-static int speedstep_find_register(void)
-{
- if (!speedstep_chipset_dev)
- return -ENODEV;
-
- /* get PMBASE */
- pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
- if (!(pmbase & 0x01)) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- pmbase &= 0xFFFFFFFE;
- if (!pmbase) {
- printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
- return -ENODEV;
- }
-
- dprintk("pmbase is 0x%x\n", pmbase);
- return 0;
-}
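
The masking in speedstep_find_register() reflects the PMBASE layout: bit 0 must be set as an I/O-space indicator, and the remaining bits form the port base, so the SpeedStep state register sits at base + 0x50. A small sketch with an assumed raw value:

#include <stdio.h>

int main(void)
{
        unsigned int raw = 0x00001001; /* assumed config-space dword at 0x40 */

        if (!(raw & 0x01)) /* bit 0 must indicate I/O space */
                return 1;

        unsigned int pmbase = raw & 0xFFFFFFFE;
        printf("pmbase 0x%x, state port 0x%x\n", pmbase, pmbase + 0x50);
        return 0;
}
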
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- * Tries to change the SpeedStep state. Can be called from
- * smp_call_function_single.
- */
-static void speedstep_set_state(unsigned int state)
-{
- u8 pm2_blk;
- u8 value;
- unsigned long flags;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- /* read state */
- value = inb(pmbase + 0x50);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- /* write new state */
- value &= 0xFE;
- value |= state;
-
- dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
-
- /* Disable bus master arbitration */
- pm2_blk = inb(pmbase + 0x20);
- pm2_blk |= 0x01;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* Actual transition */
- outb(value, (pmbase + 0x50));
-
- /* Restore bus master arbitration */
- pm2_blk &= 0xfe;
- outb(pm2_blk, (pmbase + 0x20));
-
- /* check if transition was successful */
- value = inb(pmbase + 0x50);
-
- /* Enable IRQs */
- local_irq_restore(flags);
-
- dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
- if (state == (value & 0x1))
- dprintk("change to %u MHz succeeded\n",
- speedstep_get_frequency(speedstep_processor) / 1000);
- else
- printk(KERN_ERR "cpufreq: change failed - I/O error\n");
-
- return;
-}
-
-/* Wrapper for smp_call_function_single. */
-static void _speedstep_set_state(void *_state)
-{
- speedstep_set_state(*(unsigned int *)_state);
-}
-
-/**
- * speedstep_activate - activate SpeedStep control in the chipset
- *
- * Tries to activate the SpeedStep status and control registers.
- * Returns -EINVAL on an unsupported chipset, and zero on success.
- */
-static int speedstep_activate(void)
-{
- u16 value = 0;
-
- if (!speedstep_chipset_dev)
- return -EINVAL;
-
- pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
- if (!(value & 0x08)) {
- value |= 0x08;
- dprintk("activating SpeedStep (TM) registers\n");
- pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
- *
- * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
- * the LPC bridge / PM module which contains all power-management
- * functions. Returns the generation number (2, 3 or 4 for ICH2-M,
- * ICH3-M or ICH4-M) of the detected chipset, or zero on failure.
- */
-static unsigned int speedstep_detect_chipset(void)
-{
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801DB_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 4; /* 4-M */
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801CA_12,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev)
- return 3; /* 3-M */
-
-
- speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801BA_10,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
- if (speedstep_chipset_dev) {
- /* speedstep.c causes lockups on Dell Inspirons 8000 and
- * 8100 which use a pretty old revision of the 82815
- * host bridge. Abort on these systems.
- */
- static struct pci_dev *hostbridge;
-
- hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82815_MC,
- PCI_ANY_ID, PCI_ANY_ID,
- NULL);
-
- if (!hostbridge)
- return 2; /* 2-M */
-
- if (hostbridge->revision < 5) {
- dprintk("hostbridge does not support speedstep\n");
- speedstep_chipset_dev = NULL;
- pci_dev_put(hostbridge);
- return 0;
- }
-
- pci_dev_put(hostbridge);
- return 2; /* 2-M */
- }
-
- return 0;
-}
-
-static void get_freq_data(void *_speed)
-{
- unsigned int *speed = _speed;
-
- *speed = speedstep_get_frequency(speedstep_processor);
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- unsigned int speed;
-
- /* You're supposed to ensure CPU is online. */
- if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
- BUG();
-
- dprintk("detected %u kHz as current frequency\n", speed);
- return speed;
-}
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- unsigned int newstate = 0, policy_cpu;
- struct cpufreq_freqs freqs;
- int i;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
- freqs.old = speedstep_get(policy_cpu);
- freqs.new = speedstep_freqs[newstate].frequency;
- freqs.cpu = policy->cpu;
-
- dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
-
- /* no transition necessary */
- if (freqs.old == freqs.new)
- return 0;
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- }
-
- smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
- true);
-
- for_each_cpu(i, policy->cpus) {
- freqs.cpu = i;
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- }
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-struct get_freqs {
- struct cpufreq_policy *policy;
- int ret;
-};
-
-static void get_freqs_on_cpu(void *_get_freqs)
-{
- struct get_freqs *get_freqs = _get_freqs;
-
- get_freqs->ret =
- speedstep_get_freqs(speedstep_processor,
- &speedstep_freqs[SPEEDSTEP_LOW].frequency,
- &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
- &get_freqs->policy->cpuinfo.transition_latency,
- &speedstep_set_state);
-}
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int policy_cpu, speed;
- struct get_freqs gf;
-
- /* only run on CPU to be set, or on its sibling */
-#ifdef CONFIG_SMP
- cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
- policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
-
- /* detect low and high frequency and transition latency */
- gf.policy = policy;
- smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
- if (gf.ret)
- return gf.ret;
-
- /* get current speed setting */
- speed = speedstep_get(policy_cpu);
- if (!speed)
- return -EIO;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-ich",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initialization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- /* detect processor */
- speedstep_processor = speedstep_detect_processor();
- if (!speedstep_processor) {
- dprintk("Intel(R) SpeedStep(TM) capable processor "
- "not found\n");
- return -ENODEV;
- }
-
- /* detect chipset */
- if (!speedstep_detect_chipset()) {
- dprintk("Intel(R) SpeedStep(TM) for this chipset not "
- "(yet) available.\n");
- return -ENODEV;
- }
-
- /* activate speedstep support */
- if (speedstep_activate()) {
- pci_dev_put(speedstep_chipset_dev);
- return -EINVAL;
- }
-
- if (speedstep_find_register())
- return -ENODEV;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- pci_dev_put(speedstep_chipset_dev);
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
- "Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
- "with ICH-M southbridges.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69fa..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-#include "speedstep-lib.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-lib", msg)
-
-#define PFX "speedstep-lib: "
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-static int relaxed_check;
-#else
-#define relaxed_check 0
-#endif
-
-/*********************************************************************
- * GET PROCESSOR CORE SPEED IN KHZ *
- *********************************************************************/
-
-static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
-{
- /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
- struct {
- unsigned int ratio; /* Frequency Multiplier (x10) */
- u8 bitmap; /* power on configuration bits
- [27, 25:22] (in MSR 0x2a) */
- } msr_decode_mult[] = {
- { 30, 0x01 },
- { 35, 0x05 },
- { 40, 0x02 },
- { 45, 0x06 },
- { 50, 0x00 },
- { 55, 0x04 },
- { 60, 0x0b },
- { 65, 0x0f },
- { 70, 0x09 },
- { 75, 0x0d },
- { 80, 0x0a },
- { 85, 0x26 },
- { 90, 0x20 },
- { 100, 0x2b },
- { 0, 0xff } /* error or unknown value */
- };
-
- /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
- struct {
- unsigned int value; /* Front Side Bus speed in MHz */
- u8 bitmap; /* power on configuration bits [19:18]
- (in MSR 0x2a) */
- } msr_decode_fsb[] = {
- { 66, 0x0 },
- { 100, 0x2 },
- { 133, 0x1 },
- { 0, 0xff}
- };
-
- u32 msr_lo, msr_tmp;
- int i = 0, j = 0;
-
- /* read MSR 0x2a - we only need the low 32 bits */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
- msr_tmp = msr_lo;
-
- /* decode the FSB */
- msr_tmp &= 0x00c0000;
- msr_tmp >>= 18;
- while (msr_tmp != msr_decode_fsb[i].bitmap) {
- if (msr_decode_fsb[i].bitmap == 0xff)
- return 0;
- i++;
- }
-
- /* decode the multiplier */
- if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
- dprintk("workaround for early PIIIs\n");
- msr_lo &= 0x03c00000;
- } else
- msr_lo &= 0x0bc00000;
- msr_lo >>= 22;
- while (msr_lo != msr_decode_mult[j].bitmap) {
- if (msr_decode_mult[j].bitmap == 0xff)
- return 0;
- j++;
- }
-
- dprintk("speed is %u\n",
- (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
-
- return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
-}
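
A worked instance of the decode above: a PIII reporting multiplier bitmap 0x2b (ratio 100, i.e. a 10.0x multiplier) on a 133 MHz FSB works out to 1330000 kHz:

#include <stdio.h>

int main(void)
{
        unsigned int ratio = 100; /* frequency multiplier x10 (10.0x) */
        unsigned int fsb = 133;   /* front side bus in MHz */

        printf("core speed: %u kHz\n", ratio * fsb * 100); /* 1330000 */
        return 0;
}
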
-
-
-static unsigned int pentiumM_get_frequency(void)
-{
- u32 msr_lo, msr_tmp;
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
-
- /* see table B-2 of 24547212.pdf */
- if (msr_lo & 0x00040000) {
- printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
- return 0;
- }
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * 100 * 1000));
-
- return msr_tmp * 100 * 1000;
-}
-
-static unsigned int pentium_core_get_frequency(void)
-{
- u32 fsb = 0;
- u32 msr_lo, msr_tmp;
- int ret;
-
- rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
- /* see table B-2 of 25366920.pdf */
- switch (msr_lo & 0x07) {
- case 5:
- fsb = 100000;
- break;
- case 1:
- fsb = 133333;
- break;
- case 3:
- fsb = 166667;
- break;
- case 2:
- fsb = 200000;
- break;
- case 0:
- fsb = 266667;
- break;
- case 4:
- fsb = 333333;
- break;
- default:
- printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
- }
-
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
- dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
- msr_lo, msr_tmp);
-
- msr_tmp = (msr_lo >> 22) & 0x1f;
- dprintk("bits 22-26 are 0x%x, speed is %u\n",
- msr_tmp, (msr_tmp * fsb));
-
- ret = (msr_tmp * fsb);
- return ret;
-}
-
-
-static unsigned int pentium4_get_frequency(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- u32 msr_lo, msr_hi, mult;
- unsigned int fsb = 0;
- unsigned int ret;
- u8 fsb_code;
-
- /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
- * to System Bus Frequency Ratio Field in the Processor Frequency
- * Configuration Register of the MSR. Therefore the current
- * frequency cannot be calculated and has to be measured.
- */
- if (c->x86_model < 2)
- return cpu_khz;
-
- rdmsr(0x2c, msr_lo, msr_hi);
-
- dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
-
- /* decode the FSB: see IA-32 Intel (C) Architecture Software
- * Developer's Manual, Volume 3: System Programming Guide,
- * revision #12 in Table B-1: MSRs in the Pentium 4 and
- * Intel Xeon Processors, on page B-4 and B-5.
- */
- fsb_code = (msr_lo >> 16) & 0x7;
- switch (fsb_code) {
- case 0:
- fsb = 100 * 1000;
- break;
- case 1:
- fsb = 13333 * 10;
- break;
- case 2:
- fsb = 200 * 1000;
- break;
- }
-
- if (!fsb)
- printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
- "Please send an e-mail to <linux@brodo.de>\n");
-
- /* Multiplier. */
- mult = msr_lo >> 24;
-
- dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
- fsb, mult, (fsb * mult));
-
- ret = (fsb * mult);
- return ret;
-}
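
The Pentium 4 path in numbers: fsb_code 1 maps to 13333 * 10 = 133330 kHz, so with an assumed multiplier of 18 in bits 31:24 the result is 2399940 kHz, a nominal 2.4 GHz part:

#include <stdio.h>

int main(void)
{
        unsigned int fsb = 13333 * 10; /* fsb_code 1: 133 MHz bus, in kHz */
        unsigned int mult = 18;        /* assumed value of msr_lo >> 24 */

        printf("core speed: %u kHz\n", fsb * mult); /* 2399940 */
        return 0;
}
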
-
-
-/* Warning: may get called from smp_call_function_single. */
-unsigned int speedstep_get_frequency(enum speedstep_processor processor)
-{
- switch (processor) {
- case SPEEDSTEP_CPU_PCORE:
- return pentium_core_get_frequency();
- case SPEEDSTEP_CPU_PM:
- return pentiumM_get_frequency();
- case SPEEDSTEP_CPU_P4D:
- case SPEEDSTEP_CPU_P4M:
- return pentium4_get_frequency();
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- return pentium3_get_frequency(processor);
- default:
- return 0;
- };
- return 0;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_frequency);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
- *********************************************************************/
-
-unsigned int speedstep_detect_processor(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- u32 ebx, msr_lo, msr_hi;
-
- dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
-
- if ((c->x86_vendor != X86_VENDOR_INTEL) ||
- ((c->x86 != 6) && (c->x86 != 0xF)))
- return 0;
-
- if (c->x86 == 0xF) {
- /* Intel Mobile Pentium 4-M
- * or Intel Mobile Pentium 4 with 533 MHz FSB */
- if (c->x86_model != 2)
- return 0;
-
- ebx = cpuid_ebx(0x00000001);
- ebx &= 0x000000FF;
-
- dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
-
- switch (c->x86_mask) {
- case 4:
- /*
- * B-stepping [M-P4-M]
- * sample has ebx = 0x0f, production has 0x0e.
- */
- if ((ebx == 0x0e) || (ebx == 0x0f))
- return SPEEDSTEP_CPU_P4M;
- break;
- case 7:
- /*
- * C-stepping [M-P4-M]
- * needs to have ebx=0x0e, else it's a celeron:
- * cf. 25130917.pdf / page 7, footnote 5 even
- * though 25072120.pdf / page 7 doesn't say
- * samples are only of B-stepping...
- */
- if (ebx == 0x0e)
- return SPEEDSTEP_CPU_P4M;
- break;
- case 9:
- /*
- * D-stepping [M-P4-M or M-P4/533]
- *
- * this is totally strange: CPUID 0x0F29 is
- * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
- * The latter need to be sorted out as they don't
- * support speedstep.
- * Celerons with CPUID 0x0F29 may have either
- * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
- * specific.
- * M-P4-Ms may have either ebx=0xe or 0xf [see above]
- * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
- * also, M-P4M HTs have ebx=0x8, too
- * For now, they are distinguished by the model_id
- * string
- */
- if ((ebx == 0x0e) ||
- (strstr(c->x86_model_id,
- "Mobile Intel(R) Pentium(R) 4") != NULL))
- return SPEEDSTEP_CPU_P4M;
- break;
- default:
- break;
- }
- return 0;
- }
-
- switch (c->x86_model) {
- case 0x0B: /* Intel PIII [Tualatin] */
- /* cpuid_ebx(1) is 0x04 for desktop PIII,
- * 0x06 for mobile PIII-M */
- ebx = cpuid_ebx(0x00000001);
- dprintk("ebx is %x\n", ebx);
-
- ebx &= 0x000000FF;
-
- if (ebx != 0x06)
- return 0;
-
- /* So far all PIII-M processors support SpeedStep. See
- * Intel's 24540640.pdf of June 2003
- */
- return SPEEDSTEP_CPU_PIII_T;
-
- case 0x08: /* Intel PIII [Coppermine] */
-
- /* all mobile PIII Coppermines have FSB 100 MHz
- * ==> sort out a few desktop PIIIs. */
- rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- msr_lo &= 0x00c0000;
- if (msr_lo != 0x0080000)
- return 0;
-
- /*
- * If the processor is a mobile version,
- * platform ID has bit 50 set
- * it has SpeedStep technology if either
- * bit 56 or 57 is set
- */
- rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
- dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
- msr_lo, msr_hi);
- if ((msr_hi & (1<<18)) &&
- (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
- if (c->x86_mask == 0x01) {
- dprintk("early PIII version\n");
- return SPEEDSTEP_CPU_PIII_C_EARLY;
- } else
- return SPEEDSTEP_CPU_PIII_C;
- }
-
- default:
- return 0;
- }
-}
-EXPORT_SYMBOL_GPL(speedstep_detect_processor);
-
-
-/*********************************************************************
- * DETECT SPEEDSTEP SPEEDS *
- *********************************************************************/
-
-unsigned int speedstep_get_freqs(enum speedstep_processor processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state))
-{
- unsigned int prev_speed;
- unsigned int ret = 0;
- unsigned long flags;
- struct timeval tv1, tv2;
-
- if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
- return -EINVAL;
-
- dprintk("trying to determine both speeds\n");
-
- /* get current speed */
- prev_speed = speedstep_get_frequency(processor);
- if (!prev_speed)
- return -EIO;
-
- dprintk("previous speed is %u\n", prev_speed);
-
- local_irq_save(flags);
-
- /* switch to low state */
- set_state(SPEEDSTEP_LOW);
- *low_speed = speedstep_get_frequency(processor);
- if (!*low_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("low speed is %u\n", *low_speed);
-
- /* start latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv1);
-
- /* switch to high state */
- set_state(SPEEDSTEP_HIGH);
-
- /* end latency measurement */
- if (transition_latency)
- do_gettimeofday(&tv2);
-
- *high_speed = speedstep_get_frequency(processor);
- if (!*high_speed) {
- ret = -EIO;
- goto out;
- }
-
- dprintk("high speed is %u\n", *high_speed);
-
- if (*low_speed == *high_speed) {
- ret = -ENODEV;
- goto out;
- }
-
- /* switch to previous state, if necessary */
- if (*high_speed != prev_speed)
- set_state(SPEEDSTEP_LOW);
-
- if (transition_latency) {
- *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
- tv2.tv_usec - tv1.tv_usec;
- dprintk("transition latency is %u uSec\n", *transition_latency);
-
- /* convert uSec to nSec and add 20% for safety reasons */
- *transition_latency *= 1200;
-
- /* check if the latency measurement is too high or too low
- * and set it to a safe value (500uSec) in that case
- */
- if (*transition_latency > 10000000 ||
- *transition_latency < 50000) {
-			printk(KERN_WARNING PFX "measured frequency "
-					"transition seems out of range "
-					"(%u nSec), falling back to a safe "
-					"value of %u nSec.\n",
-					*transition_latency, 500000);
- *transition_latency = 500000;
- }
- }
-
-out:
- local_irq_restore(flags);
- return ret;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_freqs);
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-module_param(relaxed_check, int, 0444);
-MODULE_PARM_DESC(relaxed_check,
- "Don't do all checks for speedstep capability.");
-#endif
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- * Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-
-/* processors */
-enum speedstep_processor {
- SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
- SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
- SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
- SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
-/* the following processors are not speedstep-capable and are not auto-detected
- * in speedstep_detect_processor(). However, their speed can be detected using
- * the speedstep_get_frequency() call. */
- SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
- SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
- SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
-};
-
-/* speedstep states -- only two of them */
-
-#define SPEEDSTEP_HIGH 0x00000000
-#define SPEEDSTEP_LOW 0x00000001
-
-
-/* detect a speedstep-capable processor */
-extern enum speedstep_processor speedstep_detect_processor(void);
-
-/* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
-
-
-/* detect the low and high speeds of the processor. The callback
- * set_state"'s first argument is either SPEEDSTEP_HIGH or
- * SPEEDSTEP_LOW; the second argument is zero so that no
- * cpufreq_notify_transition calls are initiated.
- */
-extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
- unsigned int *low_speed,
- unsigned int *high_speed,
- unsigned int *transition_latency,
- void (*set_state) (unsigned int state));
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 8abd869baabf..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Intel SpeedStep SMI driver.
- *
- * (C) 2003 Hiroshi Miura <miura@da-cha.org>
- *
- * Licensed under the terms of the GNU GPL License version 2.
- *
- */
-
-
-/*********************************************************************
- * SPEEDSTEP - DEFINITIONS *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <asm/ist.h>
-
-#include "speedstep-lib.h"
-
-/* SpeedStep system management interface port/command.
- *
- * These parameters are obtained from the IST-SMI BIOS call.
- * If the user supplies them as module parameters, those values
- * are used instead.
- */
-static int smi_port;
-static int smi_cmd;
-static unsigned int smi_sig;
-
-/* info about the processor */
-static enum speedstep_processor speedstep_processor;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
- {SPEEDSTEP_HIGH, 0},
- {SPEEDSTEP_LOW, 0},
- {0, CPUFREQ_TABLE_END},
-};
-
-#define GET_SPEEDSTEP_OWNER 0
-#define GET_SPEEDSTEP_STATE 1
-#define SET_SPEEDSTEP_STATE 2
-#define GET_SPEEDSTEP_FREQS 4
-
-/* How often should the SMI call be retried if it fails, e.g. because
- * of ongoing DMA activity? */
-#define SMI_TRIES 5
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
- "speedstep-smi", msg)
-
-/**
- * speedstep_smi_ownership - try to acquire ownership of the SMI interface
- */
-static int speedstep_smi_ownership(void)
-{
- u32 command, result, magic, dummy;
- u32 function = GET_SPEEDSTEP_OWNER;
- unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
- magic = virt_to_phys(magic_data);
-
- dprintk("trying to obtain ownership with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=D" (result),
- "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
- "=S" (dummy)
- : "a" (command), "b" (function), "c" (0), "d" (smi_port),
- "D" (0), "S" (magic)
- : "memory"
- );
-
- dprintk("result is %x\n", result);
-
- return result;
-}
-
-/**
- * speedstep_smi_get_freqs - get the low & high SpeedStep frequencies
- * @low: the low frequency value is placed here
- * @high: the high frequency value is placed here
- *
- * Only available on later SpeedStep-enabled systems, returns false results or
- * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
- * shows that the latter occurs if !(ist_info.event & 0xFFFF).
- */
-static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
-{
- u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
- u32 state = 0;
- u32 function = GET_SPEEDSTEP_FREQS;
-
- if (!(ist_info.event & 0xFFFF)) {
- dprintk("bug #1422 -- can't read freqs from BIOS\n");
- return -ENODEV;
- }
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine frequencies with command %x at port %x\n",
- command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=a" (result),
- "=b" (high_mhz),
- "=c" (low_mhz),
- "=d" (state), "=D" (edi), "=S" (dummy)
- : "a" (command),
- "b" (function),
- "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("result %x, low_freq %u, high_freq %u\n",
- result, low_mhz, high_mhz);
-
- /* abort if results are obviously incorrect... */
- if ((high_mhz + low_mhz) < 600)
- return -EINVAL;
-
- *high = high_mhz * 1000;
- *low = low_mhz * 1000;
-
- return result;
-}
-
-/**
- * speedstep_get_state - read the current SpeedStep state
- *
- * Returns SPEEDSTEP_LOW or SPEEDSTEP_HIGH.
- */
-static int speedstep_get_state(void)
-{
- u32 function = GET_SPEEDSTEP_STATE;
- u32 result, state, edi, command, dummy;
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to determine current setting with command %x "
- "at port %x\n", command, smi_port);
-
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp\n"
- : "=a" (result),
- "=b" (state), "=D" (edi),
- "=c" (dummy), "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (0),
- "d" (smi_port), "S" (0), "D" (0)
- );
-
- dprintk("state is %x, result is %x\n", state, result);
-
- return state & 1;
-}
-
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static void speedstep_set_state(unsigned int state)
-{
- unsigned int result = 0, command, new_state, dummy;
- unsigned long flags;
- unsigned int function = SET_SPEEDSTEP_STATE;
- unsigned int retry = 0;
-
- if (state > 0x1)
- return;
-
- /* Disable IRQs */
- local_irq_save(flags);
-
- command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
- dprintk("trying to set frequency to state %u "
- "with command %x at port %x\n",
- state, command, smi_port);
-
- do {
- if (retry) {
- dprintk("retry %u, previous result %u, waiting...\n",
- retry, result);
- mdelay(retry * 50);
- }
- retry++;
- __asm__ __volatile__(
- "push %%ebp\n"
- "out %%al, (%%dx)\n"
- "pop %%ebp"
- : "=b" (new_state), "=D" (result),
- "=c" (dummy), "=a" (dummy),
- "=d" (dummy), "=S" (dummy)
- : "a" (command), "b" (function), "c" (state),
- "d" (smi_port), "S" (0), "D" (0)
- );
- } while ((new_state != state) && (retry <= SMI_TRIES));
-
- /* enable IRQs */
- local_irq_restore(flags);
-
- if (new_state == state)
- dprintk("change to %u MHz succeeded after %u tries "
- "with result %u\n",
- (speedstep_freqs[new_state].frequency / 1000),
- retry, result);
- else
- printk(KERN_ERR "cpufreq: change to state %u "
- "failed with new_state %u and result %u\n",
- state, new_state, result);
-
- return;
-}
-
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: new freq
- * @relation: CPUFREQ_RELATION_* constant selecting how target_freq is rounded
- *
- * Sets a new CPUFreq policy/freq.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
- unsigned int target_freq, unsigned int relation)
-{
- unsigned int newstate = 0;
- struct cpufreq_freqs freqs;
-
- if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
- target_freq, relation, &newstate))
- return -EINVAL;
-
- freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
- freqs.new = speedstep_freqs[newstate].frequency;
-	freqs.cpu = 0; /* speedstep-smi is a UP-only driver */
-
- if (freqs.old == freqs.new)
- return 0;
-
- cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- speedstep_set_state(newstate);
- cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
- return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limits must be within the low and high SpeedStep frequencies, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
- return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
- int result;
- unsigned int speed, state;
- unsigned int *low, *high;
-
- /* capability check */
- if (policy->cpu != 0)
- return -ENODEV;
-
- result = speedstep_smi_ownership();
- if (result) {
- dprintk("fails in aquiring ownership of a SMI interface.\n");
- return -EINVAL;
- }
-
- /* detect low and high frequency */
- low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
- high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
-
- result = speedstep_smi_get_freqs(low, high);
- if (result) {
-		/* fall back to the speedstep-lib.c detection mechanism:
-		 * try both states out */
- dprintk("could not detect low and high frequencies "
- "by SMI call.\n");
- result = speedstep_get_freqs(speedstep_processor,
- low, high,
- NULL,
- &speedstep_set_state);
-
- if (result) {
- dprintk("could not detect two different speeds"
- " -- aborting.\n");
- return result;
- } else
- dprintk("workaround worked.\n");
- }
-
- /* get current speed setting */
- state = speedstep_get_state();
- speed = speedstep_freqs[state].frequency;
-
- dprintk("currently at %s speed setting - %i MHz\n",
- (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
- ? "low" : "high",
- (speed / 1000));
-
- /* cpuinfo and default policy values */
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
- policy->cur = speed;
-
- result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
- if (result)
- return result;
-
- cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
- return 0;
-}
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
- cpufreq_frequency_table_put_attr(policy->cpu);
- return 0;
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
- if (cpu)
- return -ENODEV;
- return speedstep_get_frequency(speedstep_processor);
-}
-
-
-static int speedstep_resume(struct cpufreq_policy *policy)
-{
- int result = speedstep_smi_ownership();
-
- if (result)
- dprintk("fails in re-aquiring ownership of a SMI interface.\n");
-
- return result;
-}
-
-static struct freq_attr *speedstep_attr[] = {
- &cpufreq_freq_attr_scaling_available_freqs,
- NULL,
-};
-
-static struct cpufreq_driver speedstep_driver = {
- .name = "speedstep-smi",
- .verify = speedstep_verify,
- .target = speedstep_target,
- .init = speedstep_cpu_init,
- .exit = speedstep_cpu_exit,
- .get = speedstep_get,
- .resume = speedstep_resume,
- .owner = THIS_MODULE,
- .attr = speedstep_attr,
-};
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- * Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * BIOS, -EINVAL on problems during initialization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
- speedstep_processor = speedstep_detect_processor();
-
- switch (speedstep_processor) {
- case SPEEDSTEP_CPU_PIII_T:
- case SPEEDSTEP_CPU_PIII_C:
- case SPEEDSTEP_CPU_PIII_C_EARLY:
- break;
- default:
- speedstep_processor = 0;
- }
-
- if (!speedstep_processor) {
- dprintk("No supported Intel CPU detected.\n");
- return -ENODEV;
- }
-
- dprintk("signature:0x%.8lx, command:0x%.8lx, "
- "event:0x%.8lx, perf_level:0x%.8lx.\n",
- ist_info.signature, ist_info.command,
- ist_info.event, ist_info.perf_level);
-
-	/* Error out if there is no IST-SMI BIOS and the user did not
-	 * supply both smi_port and smi_cmd;
-	 * sig = 'ISGE' aka 'Intel Speedstep Gate E' */
- if ((ist_info.signature != 0x47534943) && (
- (smi_port == 0) || (smi_cmd == 0)))
- return -ENODEV;
-
- if (smi_sig == 1)
- smi_sig = 0x47534943;
- else
- smi_sig = ist_info.signature;
-
-	/* set up smi_port from the module parameter or the BIOS */
- if ((smi_port > 0xff) || (smi_port < 0))
- return -EINVAL;
- else if (smi_port == 0)
- smi_port = ist_info.command & 0xff;
-
- if ((smi_cmd > 0xff) || (smi_cmd < 0))
- return -EINVAL;
- else if (smi_cmd == 0)
- smi_cmd = (ist_info.command >> 16) & 0xff;
-
- return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- * Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
- cpufreq_unregister_driver(&speedstep_driver);
-}
-
-module_param(smi_port, int, 0444);
-module_param(smi_cmd, int, 0444);
-module_param(smi_sig, uint, 0444);
-
-MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
- "-- Intel's default setting is 0xb2");
-MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
- "-- Intel's default setting is 0x82");
-MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
- "SMI interface.");
-
-MODULE_AUTHOR("Hiroshi Miura");
-MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..146f6f8b0650
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,192 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+static const struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_FXSR, X86_FEATURE_FPU },
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_APX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMX, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
+ { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 },
+ { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
+ { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
+ { X86_FEATURE_XFD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
+ { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
+ { X86_FEATURE_FRED, X86_FEATURE_LKGS },
+ { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL },
+ { X86_FEATURE_LASS, X86_FEATURE_SMAP },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
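+/*
+ * Clear @feature and, by iterating the dependency table until a fixed
+ * point is reached, every feature that directly or transitively
+ * depends on it. E.g. clearing X86_FEATURE_XSAVE also clears AVX and
+ * therefore AVX2, AVX512F and the rest of the AVX-512 family.
+ */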
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
+
+/*
+ * Return the feature "name" if available, otherwise return the
+ * X86_FEATURE_* encoding ("<word>*32+<bit>") so the feature can
+ * still be identified.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c
new file mode 100644
index 000000000000..89bc8db5e9c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sizes.h>
+
+#include <asm/cpuid/types.h>
+
+#include "cpu.h"
+
+#define CACHE_ENTRY(_desc, _type, _size) \
+ [_desc] = { \
+ .c_type = (_type), \
+ .c_size = (_size) / SZ_1K, \
+ }
+
+#define TLB_ENTRY(_desc, _type, _entries) \
+ [_desc] = { \
+ .t_type = (_type), \
+ .entries = (_entries), \
+ }
+
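+/*
+ * Indexed by the one-byte descriptors returned in CPUID leaf 0x2.
+ * Cache sizes are stored in KiB (hence the division by SZ_1K above);
+ * TLB coverage is stored as a plain entry count.
+ */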
+const struct leaf_0x2_table cpuid_0x2_table[256] = {
+ CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */
+
+ TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+	TLB_ENTRY( 0x02, TLB_INST_4M,		2  ),	/* TLB_INST 4 MByte pages, fully associative */
+ TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */
+ TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */
+ TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+	TLB_ENTRY( 0x61, TLB_INST_4K,		48 ),	/* TLB_INST 4 KByte pages, fully associative */
+ TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */
+ TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */
+ TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+	TLB_ENTRY( 0xb1, TLB_INST_2M_4M,	4  ),	/* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way, 4 entries */
+ TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */
+};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 4fbd384fb645..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,6 +1,7 @@
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/isa-dma.h>
#include <linux/pci.h>
#include <asm/dma.h>
#include <linux/io.h>
@@ -9,13 +10,16 @@
#include <linux/timer.h>
#include <asm/pci-direct.h>
#include <asm/tsc.h>
+#include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include "cpu.h"
/*
 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info about the CPU
*/
-static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
@@ -44,7 +48,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
}
}
-static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned long flags;
@@ -59,25 +63,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
-static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
+static unsigned char Cx86_dir0_msb = 0;
-static const char __cpuinitconst Cx86_model[][9] = {
+static const char Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static const char __cpuinitconst Cx486_name[][5] = {
+static const char Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static const char __cpuinitconst Cx486S_name[][4] = {
+static const char Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static const char __cpuinitconst Cx486D_name[][4] = {
+static const char Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
-static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
-static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
+static char Cx86_cb[] = "?.5x Core/Bus Clock";
+static const char cyrix_model_mult1[] = "12??43";
+static const char cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -87,7 +91,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
* FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
*/
-static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
+static void check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
@@ -104,7 +108,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
local_irq_restore(flags);
if (ccr5 & 2) { /* possible wrong calibration done */
- printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+ pr_info("Recalibrating delay loop with SLOP bit reset\n");
calibrate_delay();
c->loops_per_jiffy = loops_per_jiffy;
}
@@ -112,52 +116,52 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
}
-static void __cpuinit set_cx86_reorder(void)
+static void set_cx86_reorder(void)
{
u8 ccr3;
- printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
- setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
}
-static void __cpuinit set_cx86_memwb(void)
+static void set_cx86_memwb(void)
{
- printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
}
/*
* Configure later MediaGX and/or Geode processor.
*/
-static void __cpuinit geode_configure(void)
+static void geode_configure(void)
{
unsigned long flags;
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
@@ -166,7 +170,7 @@ static void __cpuinit geode_configure(void)
local_irq_restore(flags);
}
-static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
+static void early_init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir1 = 0;
@@ -185,7 +189,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
+static void init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
@@ -212,7 +216,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* common case step number/rev -- exceptions handled below */
c->x86_model = (dir1 >> 4) + 1;
- c->x86_mask = dir1 & 0xf;
+ c->x86_stepping = dir1 & 0xf;
/* Now cook; the original recipe is by Channing Corn, from Cyrix.
* We do the same thing for each generation: we work out
@@ -249,10 +253,11 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* Emulate MTRRs using Cyrix's ARRs. */
set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
/* 6x86's contain this bug */
- c->coma_bug = 1;
+ set_cpu_bug(c, X86_BUG_COMA);
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
@@ -269,7 +274,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
* VSA1 we work around however.
*/
- printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+ pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
/* We do this before the PCI layer is running. However we
@@ -287,12 +292,12 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
- c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+ c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -315,9 +320,10 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
} else {
- c->coma_bug = 1; /* 6x86MX, it has the bug. */
+ /* A 6x86MX - it has the bug. */
+ set_cpu_bug(c, X86_BUG_COMA);
}
tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
@@ -332,7 +338,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
switch (dir0_lsn) {
case 0xd: /* either a 486SLC or DLC w/o DEVID */
dir0_msn = 0;
- p = Cx486_name[(c->hard_math) ? 1 : 0];
+ p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)];
break;
case 0xe: /* a 486S A step */
@@ -355,7 +361,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/*
* Handle National Semiconductor branded processors
*/
-static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
+static void init_nsc(struct cpuinfo_x86 *c)
{
/*
* There may be GX1 processors in the wild that are branded
@@ -404,7 +410,7 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
+static void cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
if (c->x86 == 4 && test_cyrix_52div()) {
@@ -426,13 +432,13 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
- printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+ pr_info("Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
/* enable MAPEN */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
/* enable cpuid */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);
/* disable MAPEN */
setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
@@ -440,7 +446,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
}
}
-static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
+static const struct cpu_dev cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -451,7 +457,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
cpu_dev_register(cyrix_cpu_dev);
-static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
+static const struct cpu_dev nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000000..1976fef2dfe5
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+ struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+ seq_printf(m, "online: %d\n", cpu_online(cpu));
+ if (!c->initialized)
+ return 0;
+
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
+ seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
+ seq_printf(m, "die_id: %u\n", c->topo.die_id);
+ seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
+ seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
+ seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
+ seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
+ seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
+ seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
+ seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
+ seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg());
+ seq_printf(m, "num_threads: %u\n", __num_threads_per_package);
+ seq_printf(m, "num_cores: %u\n", __num_cores_per_package);
+ seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package);
+ seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core);
+ return 0;
+}
+
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+ .open = cpu_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dom_debug_show(struct seq_file *m, void *p)
+{
+ static const char *domain_names[TOPO_MAX_DOMAIN] = {
+ [TOPO_SMT_DOMAIN] = "Thread",
+ [TOPO_CORE_DOMAIN] = "Core",
+ [TOPO_MODULE_DOMAIN] = "Module",
+ [TOPO_TILE_DOMAIN] = "Tile",
+ [TOPO_DIE_DOMAIN] = "Die",
+ [TOPO_DIEGRP_DOMAIN] = "DieGrp",
+ [TOPO_PKG_DOMAIN] = "Package",
+ };
+ unsigned int dom, nthreads = 1;
+
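+	/*
+	 * The per-domain sizes multiply up the hierarchy, so nthreads
+	 * accumulates the maximum number of threads covered by each
+	 * successive domain level.
+	 */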
+ for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) {
+ nthreads *= x86_topo_system.dom_size[dom];
+ seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+ domain_names[dom], x86_topo_system.dom_shifts[dom],
+ x86_topo_system.dom_size[dom], nthreads);
+ }
+ return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+ .open = dom_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int cpu_init_debugfs(void)
+{
+ struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+ unsigned long id;
+ char name[24];
+
+ debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
+ dir = debugfs_create_dir("cpus", base);
+ for_each_possible_cpu(id) {
+ sprintf(name, "%lu", id);
+ debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+ }
+ return 0;
+}
+late_initcall(cpu_init_debugfs);
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 000000000000..d69757246bde
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ TERTIARY_CTLS_LOW,
+ TERTIARY_CTLS_HIGH,
+ NR_VMX_FEATURE_WORDS,
+};
+
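+/*
+ * VMX_FEATURE_* values encode both the feature word and the bit within
+ * it; masking with 0x1f extracts the bit position inside one 32-bit
+ * vmx_capability word.
+ */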
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign, low, high;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ /* All 64 bits of tertiary controls MSR are allowed-1 settings. */
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high);
+ c->vmx_capability[TERTIARY_CTLS_LOW] = low;
+ c->vmx_capability[TERTIARY_CTLS_HIGH] = high;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+
+static int __init nosgx(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
+
+ return 0;
+}
+
+early_param("nosgx", nosgx);
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
+ bool tboot = tboot_enabled();
+ bool enable_vmx;
+ u64 msr;
+
+ if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
+
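+	/*
+	 * If the BIOS already set the lock bit, the MSR is read-only
+	 * until reset and the enable bits below cannot be changed;
+	 * just report what was configured.
+	 */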
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (enable_vmx) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
+
+ wrmsrq(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ goto update_sgx;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+
+update_sgx:
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
new file mode 100644
index 000000000000..1fda6c3a2b65
--- /dev/null
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Hygon Processor Support for Linux
+ *
+ * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd.
+ *
+ * Author: Pu Wen <puwen@hygon.cn>
+ */
+#include <linux/io.h>
+
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/smp.h>
+#include <asm/numa.h>
+#include <asm/cacheinfo.h>
+#include <asm/spec-ctrl.h>
+#include <asm/delay.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+
+#include "cpu.h"
+
+#ifdef CONFIG_NUMA
+/*
+ * To work around a broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned int apicid = c->topo.apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = c->topo.llc_id;
+
+ /*
+	 * On a multi-fabric platform (e.g. Numascale NumaChip) a
+	 * platform-specific handler needs to be called to fix up some
+	 * of the CPU's IDs.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs.
+ * Assume they are all increased by a constant offset, but
+ * in the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->topo.initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void bsp_init_hygon(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+ u64 val;
+
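+		/* HWCR bit 24 (TscFreqSel): the TSC increments at the P0 frequency. */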
+ rdmsrq(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
+ }
+ }
+
+ resctrl_cpu_detect(c);
+}
+
+static void early_init_hygon(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /*
+	 * c->x86_power is CPUID 8000_0007 EDX. Bit 8 means the TSC runs at a
+	 * constant rate across P/T-state changes and does not stop in deep C-states.
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+	/* Bit 12 of 8000_0007 edx is the accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /*
+	 * The APIC ID can always be treated as an 8-bit value on Hygon APICs,
+	 * so we can safely set X86_FEATURE_EXTD_APICID unconditionally.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+}
+
+static void init_hygon(struct cpuinfo_x86 *c)
+{
+ u64 vm_cr;
+
+ early_init_hygon(c);
+
+ /*
+	 * Bit 31 of the normal CPUID leaf is used for a nonstandard 3DNow! ID;
+	 * 3DNow! is identified by bit 31 of the extended CPUID leaf (1*32+31) anyway.
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * XXX someone from Hygon needs to confirm this DTRT
+ *
+ init_spectral_chicken(c);
+ */
+
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+ set_cpu_cap(c, X86_FEATURE_CPB);
+
+ cpu_detect_cache_sizes(c);
+
+ srat_detect_node(c);
+
+ init_hygon_cacheinfo(c);
+
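+	/* The BIOS can disable (and lock out) SVM via MSR_VM_CR; honor that. */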
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
+ if (cpu_has(c, X86_FEATURE_XMM2)) {
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
+
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
+
+	/* Hygon processors keep the APIC timer running in deep C-states. */
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+}
+
+static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
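+	/*
+	 * CPUID 0x80000006 EBX reports the L2 4K dTLB entry count in bits
+	 * 27:16 and the L2 4K iTLB entry count in bits 11:0; the 0xfff mask
+	 * strips the associativity fields.
+	 */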
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m = tlb_lld_2m >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m = eax & 0xff;
+ } else
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
+}
+
+static const struct cpu_dev hygon_cpu_dev = {
+ .c_vendor = "Hygon",
+ .c_ident = { "HygonGenuine" },
+ .c_early_init = early_init_hygon,
+ .c_detect_tlb = cpu_detect_tlb_hygon,
+ .c_bsp_init = bsp_init_hygon,
+ .c_init = init_hygon,
+ .c_x86_vendor = X86_VENDOR_HYGON,
+};
+
+cpu_dev_register(hygon_cpu_dev);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..f3e9219845e8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,55 +21,92 @@
*
*/
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-/*
- * Hypervisor detect order. This is specified explicitly here because
- * some hypervisors might implement compatibility modes for other
- * hypervisors and therefore need to be detected in specific sequence.
- */
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
+#ifdef CONFIG_XEN_PV
+ &x86_hyper_xen_pv,
+#endif
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+ &x86_hyper_kvm,
+#endif
+#ifdef CONFIG_JAILHOUSE_GUEST
+ &x86_hyper_jailhouse,
+#endif
+#ifdef CONFIG_ACRN_GUEST
+ &x86_hyper_acrn,
+#endif
+#ifdef CONFIG_BHYVE_GUEST
+ &x86_hyper_bhyve,
+#endif
};
-const struct hypervisor_x86 *x86_hyper;
-EXPORT_SYMBOL(x86_hyper);
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
+
+bool __initdata nopv;
+static __init int parse_nopv(char *arg)
+{
+ nopv = true;
+ return 0;
+}
+early_param("nopv", parse_nopv);
-static inline void __init
+static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
- const struct hypervisor_x86 *h, * const *p;
+ const struct hypervisor_x86 *h = NULL, * const *p;
+ uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
- h = *p;
- if (h->detect()) {
- x86_hyper = h;
- printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
- break;
+ if (unlikely(nopv) && !(*p)->ignore_nopv)
+ continue;
+
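+		/*
+		 * detect() returns a priority rather than a boolean; the
+		 * highest priority wins, so a hypervisor that implements a
+		 * compatibility interface for another cannot shadow the
+		 * native one.
+		 */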
+ pri = (*p)->detect();
+ if (pri > max_pri) {
+ max_pri = pri;
+ h = *p;
}
}
+
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
}
-void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
+static void __init copy_array(const void *src, void *target, unsigned int size)
{
- if (x86_hyper && x86_hyper->set_cpu_features)
- x86_hyper->set_cpu_features(c);
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
+
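+	/* Copy only the ops the hypervisor provides; NULL slots keep the defaults. */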
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
}
void __init init_hypervisor_platform(void)
{
+ const struct hypervisor_x86 *h;
- detect_hypervisor_vendor();
+ h = detect_hypervisor_vendor();
- if (!x86_hyper)
+ if (!h)
return;
- init_hypervisor(&boot_cpu_data);
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
- if (x86_hyper->init_platform)
- x86_hyper->init_platform();
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 85f69cdeae10..98ae4c37c93e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,50 +1,226 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <linux/string.h>
#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/thread_info.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/msr.h>
-#include <asm/bugs.h>
-#include <asm/cpu.h>
+#include <linux/string.h>
+#include <linux/types.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
-#include <asm/numa_64.h>
#endif
+#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu.h>
+#include <asm/cpuid/api.h>
+#include <asm/hwcap2.h>
+#include <asm/intel-family.h>
+#include <asm/microcode.h>
+#include <asm/msr.h>
+#include <asm/numa.h>
+#include <asm/resctrl.h>
+#include <asm/thermal.h>
+#include <asm/uaccess.h>
+
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory types across CPUs by snooping their own caches. However, there
+ * exist CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known errata.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
+ setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+ }
+}
+
+static bool ring3mwait_disabled __read_mostly;
+
+static int __init ring3mwait_disable(char *__unused)
+{
+ ring3mwait_disabled = true;
+ return 1;
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
{
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- u64 misc_enable;
+ /*
+	 * The ring 3 MONITOR/MWAIT feature cannot be detected without
+	 * comparing the CPU family and model.
+ */
+ if (c->x86 != 6)
+ return;
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
+ break;
+ default:
+ return;
+ }
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (ring3mwait_disabled)
+ return;
- if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- c->cpuid_level = cpuid_eax(0);
- }
+ set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
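+	/*
+	 * Accumulate the enable bit in the per-CPU shadow; it is written to
+	 * MSR_MISC_FEATURES_ENABLES in init_intel_misc_features().
+	 */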
+ this_cpu_or(msr_misc_features_shadow,
+ 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
+
+ if (c == &boot_cpu_data)
+ ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
+/*
+ * Early microcode releases for the Spectre v2 mitigation were broken.
+ * Information taken from;
+ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf
+ * - https://kb.vmware.com/s/article/52345
+ * - Microcode revisions observed in the wild
+ * - Release note from 20180108 microcode release
+ */
+struct sku_microcode {
+ u32 vfm;
+ u8 stepping;
+ u32 microcode;
+};
+static const struct sku_microcode spectre_bad_microcodes[] = {
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
+ /* Observed in the wild */
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
+};
+
+static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ /*
+	 * We know that hypervisors lie to us about the microcode version, so
+	 * we may as well hope that they are running the correct version.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
+ c->x86_stepping == spectre_bad_microcodes[i].stepping)
+ return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
+ return false;
+}
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
+
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
+
+static void detect_tme_early(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate;
+ int keyid_bits;
+
+ rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ clear_cpu_cap(c, X86_FEATURE_TME);
+ return;
+ }
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
+
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
+
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
+ return;
+
+ /*
+	 * The BIOS may have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
+ */
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
+}
+
+static void early_init_intel(struct cpuinfo_x86 *c)
+{
+ u64 misc_enable;
+
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
+
+ /* Now if any of them are set, check the blacklist and clear the lot */
+ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
+ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
+ setup_clear_cpu_cap(X86_FEATURE_IBRS);
+ setup_clear_cpu_cap(X86_FEATURE_IBPB);
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SSBD);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
+ }
/*
* Atom erratum AAE44/AAF40/AAG38/AAH41:
@@ -54,17 +230,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
- u32 ucode, junk;
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- sync_core();
- rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
-
- if (ucode < 0x20e) {
- printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
- clear_cpu_cap(c, X86_FEATURE_PSE);
- }
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
+ c->microcode < 0x20e) {
+ pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
+ clear_cpu_cap(c, X86_FEATURE_PSE);
}
#ifdef CONFIG_X86_64
@@ -76,8 +245,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -86,49 +255,91 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (!check_tsc_unstable())
- sched_clock_stable = 1;
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ }
+
+	/* Penwell and Cloverview have a TSC that does not stop across S3 */
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
+ *
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
-#ifdef CONFIG_KMEMCHECK
/*
- * P4s have a "fast strings" feature which causes single-
- * stepping REP instructions to only generate a #DB on
- * cache-line boundaries.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
*
- * Ingo Molnar reported a Pentium D (model 6) and a Xeon
- * (model 2) with the same problem.
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 == 15) {
- u64 misc_enable;
-
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
+ rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
- printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
+ pr_info("Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
-#endif
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
+ * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
+ * to be modified.
+ */
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
+
+ check_memory_type_self_snoop_errata(c);
+
+ /*
+ * Adjust the number of physical bits early because it affects the
+ * valid bits of the MTRR mask registers.
+ */
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme_early(c);
+}
+
+static void bsp_init_intel(struct cpuinfo_x86 *c)
+{
+ resctrl_cpu_detect(c);
}
#ifdef CONFIG_X86_32
@@ -138,74 +349,60 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* This is called before we do cpu ident work
*/
-int __cpuinit ppro_with_ram_bug(void)
+int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask < 8) {
- printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
+ boot_cpu_data.x86_stepping < 8) {
+ pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
return 0;
}
-#ifdef CONFIG_X86_F00F_BUG
-static void __cpuinit trap_init_f00f_bug(void)
+static void intel_smp_check(struct cpuinfo_x86 *c)
{
- __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
- /*
- * Update the IDT descriptor and reload the IDT so that
- * it uses the read-only mapped virtual address.
- */
- idt_descr.address = fix_to_virt(FIX_F00F_IDT);
- load_idt(&idt_descr);
-}
-#endif
-
-static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
- if (c->cpu_index == boot_cpu_id)
+ if (!c->cpu_index)
return;
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
		WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
				"with B stepping processors.\n");
}
-#endif
}
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
{
- unsigned long lo, hi;
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_F00F_BUG
/*
- * All current models of Pentium and Pentium with MMX technology CPUs
+ * All models of Pentium and Pentium with MMX technology CPUs
* have the F0 0F bug, which lets nonprivileged users lock up the
- * system.
- * Note that the workaround only should be initialized once...
+ * system. Announce that the fault handler will be checking for it.
+ * The Quark is also family 5, but does not have the same bug.
*/
- c->f00f_bug = 0;
- if (!paravirt_enabled() && c->x86 == 5) {
+ clear_cpu_bug(c, X86_BUG_F00F);
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
- c->f00f_bug = 1;
+ set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
- trap_init_f00f_bug();
- printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+ pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
}
@@ -215,20 +412,30 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
- * P4 Xeon errata 037 workaround.
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ pr_warn("PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
+ * P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
- printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
- printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
- wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
}
}
@@ -238,54 +445,40 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
- (c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
-
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
+ (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- }
-#endif
-
-#ifdef CONFIG_X86_NUMAQ
- numaq_tsc_disable();
#endif
intel_smp_check(c);
}
#else
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
#endif
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+#ifdef CONFIG_NUMA
unsigned node;
int cpu = smp_processor_id();
- int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE)
- node = first_node(node_online_map);
- else if (!node_online(node)) {
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE || !node_online(node)) {
/* reuse the value from init_cpu_to_node() */
node = cpu_to_node(cpu);
}
@@ -293,78 +486,61 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
- unsigned int eax, ebx, ecx, edx;
+ u64 msr;
- if (c->cpuid_level < 4)
- return 1;
-
- /* Intel has a non-standard dependency on %ecx for this CPUID level. */
- cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
- if (eax & 0x1f)
- return (eax >> 26) + 1;
- else
- return 1;
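+	/*
+	 * MSR_PLATFORM_INFO bit 31 advertises CPUID faulting: once enabled,
+	 * CPUID executed at CPL > 0 raises #GP.
+	 */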
+ if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+ }
}
-static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
+ u64 msr;
+
+ if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ return;
+
+ /* Clear all MISC features */
+ this_cpu_write(msr_misc_features_shadow, 0);
+
+ /* Check features and update capabilities and shadow control bits */
+ init_cpuid_fault(c);
+ probe_xeon_phi_r3mwait(c);
+
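+	/* Commit the accumulated shadow value so the MSR matches the cached state. */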
+ msr = this_cpu_read(msr_misc_features_shadow);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msr);
}
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- unsigned int l2 = 0;
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
+static void init_intel(struct cpuinfo_x86 *c)
+{
early_init_intel(c);
intel_workarounds(c);
- /*
- * Detect the extended topology information if available. This
- * will reinitialise the initial_apicid which will be used
- * in init_intel_cacheinfo()
- */
- detect_extended_topology(c);
+ init_intel_cacheinfo(c);
- l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
@@ -372,25 +548,33 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- if (cpu_has_xmm2)
+ if (cpu_has(c, X86_FEATURE_XMM2))
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- if (cpu_has_ds) {
- unsigned int l1;
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
+ unsigned int l1, l2;
+
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
+
+ if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+ c->x86_vfm == INTEL_LUNARLAKE_M))
+ set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -398,22 +582,21 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
+ unsigned int l2 = c->x86_cache_size;
char *p = NULL;
switch (c->x86_model) {
case 5:
- if (c->x86_mask == 0) {
- if (l2 == 0)
- p = "Celeron (Covington)";
- else if (l2 == 256)
- p = "Mobile Pentium II (Dixon)";
- }
+ if (l2 == 0)
+ p = "Celeron (Covington)";
+ else if (l2 == 256)
+ p = "Mobile Pentium II (Dixon)";
break;
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
- else if (c->x86_mask == 0 || c->x86_mask == 5)
+ else if (c->x86_stepping == 0 || c->x86_stepping == 5)
p = "Celeron-A";
break;
@@ -426,33 +609,25 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
-
- if (c->x86 == 15)
- set_cpu_cap(c, X86_FEATURE_P4);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
+
+ init_intel_misc_features(c);
+
+ split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
* Intel PIII Tualatin. This comes in two flavours.
@@ -460,18 +635,98 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
+
+ /*
+ * Intel Quark SoC X1000 contains a 4-way set associative
+ * 16K cache with a 16 byte cache line and 256 lines per tag
+ */
+ if (c->x86_vfm == INTEL_QUARK_X1000)
+ size = 16;
return size;
}
#endif
-static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
+static void intel_tlb_lookup(const struct leaf_0x2_table *desc)
+{
+ short entries = desc->entries;
+
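+	/* Keep the largest entry count any descriptor reports for each page size. */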
+ switch (desc->t_type) {
+ case STLB_4K:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ break;
+ case STLB_4K_2M:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_INST_ALL:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ break;
+ case TLB_INST_4K:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ break;
+ case TLB_INST_4M:
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ break;
+ case TLB_INST_2M_4M:
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ break;
+ case TLB_DATA_4K:
+ case TLB_DATA0_4K:
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ break;
+ case TLB_DATA_4M:
+ case TLB_DATA0_4M:
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_DATA_2M_4M:
+ case TLB_DATA0_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_DATA_4K_4M:
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
+ break;
+ case TLB_DATA_1G_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
+ fallthrough;
+ case TLB_DATA_1G:
+ tlb_lld_1g = max(tlb_lld_1g, entries);
+ break;
+ }
+}
+
+static void intel_detect_tlb(struct cpuinfo_x86 *c)
+{
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
+
+ if (c->cpuid_level < 2)
+ return;
+
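+	/* Leaf 0x2 packs one-byte TLB/cache descriptors into EAX-EDX; translate each one. */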
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc)
+ intel_tlb_lookup(desc);
+}
+
+static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
@@ -484,7 +739,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[9] = "486 DX/4-WB"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
+ { .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
@@ -492,10 +747,11 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
- [8] = "Mobile Pentium MMX"
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
+ { .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
@@ -509,7 +765,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[11] = "Pentium III (Tualatin)",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
+ { .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
@@ -519,12 +775,13 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
}
},
},
- .c_size_cache = intel_size_cache,
+ .legacy_cache_size = intel_size_cache,
#endif
+ .c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
+ .c_bsp_init = bsp_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
deleted file mode 100644
index 33eae2062cf5..000000000000
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ /dev/null
@@ -1,1149 +0,0 @@
-/*
- * Routines to indentify caches on Intel CPU.
- *
- * Changes:
- * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
- * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/pci.h>
-
-#include <asm/processor.h>
-#include <linux/smp.h>
-#include <asm/k8.h>
-#include <asm/smp.h>
-
-#define LVL_1_INST 1
-#define LVL_1_DATA 2
-#define LVL_2 3
-#define LVL_3 4
-#define LVL_TRACE 5
-
-struct _cache_table {
- unsigned char descriptor;
- char cache_type;
- short size;
-};
-
-#define MB(x) ((x) * 1024)
-
-/* All the cache descriptor types we care about (no TLB or
- trace cache entries) */
-
-static const struct _cache_table __cpuinitconst cache_table[] =
-{
- { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
- { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
- { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
- { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
- { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
- { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
- { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
- { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
- { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
- { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
- { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
- { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
- { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
- { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
- { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
- { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
- { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
- { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
- { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
- { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
- { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
- { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
- { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
- { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
- { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
- { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
- { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
- { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
- { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
- { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
- { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
- { 0x00, 0, 0}
-};
-
-
-enum _cache_type {
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
-};
-
-union _cpuid4_leaf_eax {
- struct {
- enum _cache_type type:5;
- unsigned int level:3;
- unsigned int is_self_initializing:1;
- unsigned int is_fully_associative:1;
- unsigned int reserved:4;
- unsigned int num_threads_sharing:12;
- unsigned int num_cores_on_die:6;
- } split;
- u32 full;
-};
-
-union _cpuid4_leaf_ebx {
- struct {
- unsigned int coherency_line_size:12;
- unsigned int physical_line_partition:10;
- unsigned int ways_of_associativity:10;
- } split;
- u32 full;
-};
-
-union _cpuid4_leaf_ecx {
- struct {
- unsigned int number_of_sets:32;
- } split;
- u32 full;
-};
-
-struct amd_l3_cache {
- struct pci_dev *dev;
- bool can_disable;
- unsigned indices;
- u8 subcaches[4];
-};
-
-struct _cpuid4_info {
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned long size;
- struct amd_l3_cache *l3;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
-/* subset of above _cpuid4_info w/o shared_cpu_map */
-struct _cpuid4_info_regs {
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned long size;
- struct amd_l3_cache *l3;
-};
-
-unsigned short num_cache_leaves;
-
-/* AMD doesn't have CPUID4. Emulate it here to report the same
- information to the user. This makes some assumptions about the machine:
- L2 not shared, no SMT etc. that is currently true on AMD CPUs.
-
- In theory the TLBs could be reported as fake type (they are in "dummy").
- Maybe later */
-union l1_cache {
- struct {
- unsigned line_size:8;
- unsigned lines_per_tag:8;
- unsigned assoc:8;
- unsigned size_in_kb:8;
- };
- unsigned val;
-};
-
-union l2_cache {
- struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned size_in_kb:16;
- };
- unsigned val;
-};
-
-union l3_cache {
- struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned res:2;
- unsigned size_encoded:14;
- };
- unsigned val;
-};
-
-static const unsigned short __cpuinitconst assocs[] = {
- [1] = 1,
- [2] = 2,
- [4] = 4,
- [6] = 8,
- [8] = 16,
- [0xa] = 32,
- [0xb] = 48,
- [0xc] = 64,
- [0xd] = 96,
- [0xe] = 128,
- [0xf] = 0xffff /* fully associative - no way to show this currently */
-};
-
-static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
-static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
-
-static void __cpuinit
-amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
- union _cpuid4_leaf_ebx *ebx,
- union _cpuid4_leaf_ecx *ecx)
-{
- unsigned dummy;
- unsigned line_size, lines_per_tag, assoc, size_in_kb;
- union l1_cache l1i, l1d;
- union l2_cache l2;
- union l3_cache l3;
- union l1_cache *l1 = &l1d;
-
- eax->full = 0;
- ebx->full = 0;
- ecx->full = 0;
-
- cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
- cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
-
- switch (leaf) {
- case 1:
- l1 = &l1i;
- case 0:
- if (!l1->val)
- return;
- assoc = assocs[l1->assoc];
- line_size = l1->line_size;
- lines_per_tag = l1->lines_per_tag;
- size_in_kb = l1->size_in_kb;
- break;
- case 2:
- if (!l2.val)
- return;
- assoc = assocs[l2.assoc];
- line_size = l2.line_size;
- lines_per_tag = l2.lines_per_tag;
- /* cpu_data has errata corrections for K7 applied */
- size_in_kb = current_cpu_data.x86_cache_size;
- break;
- case 3:
- if (!l3.val)
- return;
- assoc = assocs[l3.assoc];
- line_size = l3.line_size;
- lines_per_tag = l3.lines_per_tag;
- size_in_kb = l3.size_encoded * 512;
- if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
- size_in_kb = size_in_kb >> 1;
- assoc = assoc >> 1;
- }
- break;
- default:
- return;
- }
-
- eax->split.is_self_initializing = 1;
- eax->split.type = types[leaf];
- eax->split.level = levels[leaf];
- eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
-
-
- if (assoc == 0xffff)
- eax->split.is_fully_associative = 1;
- ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assoc - 1;
- ebx->split.physical_line_partition = lines_per_tag - 1;
- ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
- (ebx->split.ways_of_associativity + 1) - 1;
-}
-
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
-};
-
-#ifdef CONFIG_CPU_SUP_AMD
-
-/*
- * L3 cache descriptors
- */
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
-static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
-{
- unsigned int sc0, sc1, sc2, sc3;
- u32 val = 0;
-
- pci_read_config_dword(l3->dev, 0x1C4, &val);
-
- /* calculate subcache sizes */
- l3->subcaches[0] = sc0 = !(val & BIT(0));
- l3->subcaches[1] = sc1 = !(val & BIT(4));
- l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
- l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
-
- l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
-}
-
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
- struct amd_l3_cache *l3;
- struct pci_dev *dev = node_to_k8_nb_misc(node);
-
- l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
- if (!l3) {
- printk(KERN_WARNING "Error allocating L3 struct\n");
- return NULL;
- }
-
- l3->dev = dev;
-
- amd_calc_l3_indices(l3);
-
- return l3;
-}
-
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
-{
- int node;
-
- if (boot_cpu_data.x86 != 0x10)
- return;
-
- if (index < 3)
- return;
-
- /* see errata #382 and #388 */
- if (boot_cpu_data.x86_model < 0x8)
- return;
-
- if ((boot_cpu_data.x86_model == 0x8 ||
- boot_cpu_data.x86_model == 0x9)
- &&
- boot_cpu_data.x86_mask < 0x1)
- return;
-
- /* not in virtualized environments */
- if (num_k8_northbridges == 0)
- return;
-
- /*
- * Strictly speaking, the amount in @size below is leaked since it is
- * never freed but this is done only on shutdown so it doesn't matter.
- */
- if (!l3_caches) {
- int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
-
- l3_caches = kzalloc(size, GFP_ATOMIC);
- if (!l3_caches)
- return;
- }
-
- node = amd_get_nb_id(smp_processor_id());
-
- if (!l3_caches[node]) {
- l3_caches[node] = amd_init_l3_cache(node);
- l3_caches[node]->can_disable = true;
- }
-
- WARN_ON(!l3_caches[node]);
-
- this_leaf->l3 = l3_caches[node];
-}
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int slot)
-{
- struct pci_dev *dev = this_leaf->l3->dev;
- unsigned int reg = 0;
-
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
- return -EINVAL;
-
- if (!dev)
- return -EINVAL;
-
- pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
- return sprintf(buf, "0x%08x\n", reg);
-}
-
-#define SHOW_CACHE_DISABLE(slot) \
-static ssize_t \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return show_cache_disable(this_leaf, buf, slot); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
- unsigned slot, unsigned long idx)
-{
- int i;
-
- idx |= BIT(30);
-
- /*
- * disable index in all 4 subcaches
- */
- for (i = 0; i < 4; i++) {
- u32 reg = idx | (i << 20);
-
- if (!l3->subcaches[i])
- continue;
-
- pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
-
- /*
- * We need to WBINVD on a core on the node containing the L3
- * cache which indices we disable therefore a simple wbinvd()
- * is not sufficient.
- */
- wbinvd_on_cpu(cpu);
-
- reg |= BIT(31);
- pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
- }
-}
-
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
-{
- struct pci_dev *dev = this_leaf->l3->dev;
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- unsigned long val = 0;
-
-#define SUBCACHE_MASK (3UL << 20)
-#define SUBCACHE_INDEX 0xfff
-
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!dev)
- return -EINVAL;
-
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- /* do not allow writes outside of allowed bits */
- if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
- return -EINVAL;
-
- amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
- return count;
-}
-
-#define STORE_CACHE_DISABLE(slot) \
-static ssize_t \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
-{ \
- return store_cache_disable(this_leaf, buf, count, slot); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-#else /* CONFIG_CPU_SUP_AMD */
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
-{
-};
-#endif /* CONFIG_CPU_SUP_AMD */
-
-static int
-__cpuinit cpuid4_cache_lookup_regs(int index,
- struct _cpuid4_info_regs *this_leaf)
-{
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned edx;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_check_l3_disable(index, this_leaf);
- } else {
- cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
- }
-
- if (eax.split.type == CACHE_TYPE_NULL)
- return -EIO; /* better error ? */
-
- this_leaf->eax = eax;
- this_leaf->ebx = ebx;
- this_leaf->ecx = ecx;
- this_leaf->size = (ecx.split.number_of_sets + 1) *
- (ebx.split.coherency_line_size + 1) *
- (ebx.split.physical_line_partition + 1) *
- (ebx.split.ways_of_associativity + 1);
- return 0;
-}
-
-static int __cpuinit find_num_cache_leaves(void)
-{
- unsigned int eax, ebx, ecx, edx;
- union _cpuid4_leaf_eax cache_eax;
- int i = -1;
-
- do {
- ++i;
- /* Do cpuid(4) loop to find out num_cache_leaves */
- cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
- cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
- return i;
-}
-
-unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
- unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
- unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
- unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
- unsigned int cpu = c->cpu_index;
-#endif
-
- if (c->cpuid_level > 3) {
- static int is_initialized;
-
- if (is_initialized == 0) {
- /* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves();
- is_initialized++;
- }
-
- /*
- * Whenever possible use cpuid(4), deterministic cache
- * parameters cpuid leaf to find the cache details
- */
- for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info_regs this_leaf;
- int retval;
-
- retval = cpuid4_cache_lookup_regs(i, &this_leaf);
- if (retval >= 0) {
- switch (this_leaf.eax.split.level) {
- case 1:
- if (this_leaf.eax.split.type ==
- CACHE_TYPE_DATA)
- new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type ==
- CACHE_TYPE_INST)
- new_l1i = this_leaf.size/1024;
- break;
- case 2:
- new_l2 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid >> index_msb;
- break;
- case 3:
- new_l3 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(
- num_threads_sharing);
- l3_id = c->apicid >> index_msb;
- break;
- default:
- break;
- }
- }
- }
- }
- /*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
- */
- if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
- /* supports eax=2 call */
- int j, n;
- unsigned int regs[4];
- unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (num_cache_leaves != 0 && c->x86 == 15)
- only_trace = 1;
-
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++) {
- unsigned char des = dp[j];
- unsigned char k = 0;
-
- /* look up this descriptor in the table */
- while (cache_table[k].descriptor != 0) {
- if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
- switch (cache_table[k].cache_type) {
- case LVL_1_INST:
- l1i += cache_table[k].size;
- break;
- case LVL_1_DATA:
- l1d += cache_table[k].size;
- break;
- case LVL_2:
- l2 += cache_table[k].size;
- break;
- case LVL_3:
- l3 += cache_table[k].size;
- break;
- case LVL_TRACE:
- trace += cache_table[k].size;
- break;
- }
-
- break;
- }
-
- k++;
- }
- }
- }
- }
-
- if (new_l1d)
- l1d = new_l1d;
-
- if (new_l1i)
- l1i = new_l1i;
-
- if (new_l2) {
- l2 = new_l2;
-#ifdef CONFIG_X86_HT
- per_cpu(cpu_llc_id, cpu) = l2_id;
-#endif
- }
-
- if (new_l3) {
- l3 = new_l3;
-#ifdef CONFIG_X86_HT
- per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
- }
-
- c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
-
- return l2;
-}
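
The l2_id/l3_id values above come from shifting off the low APIC-ID bits that enumerate threads inside a sharing domain, so all sharers collapse to one ID. A small sketch with made-up APIC IDs:

    #include <assert.h>

    int main(void)
    {
        /* Two SMT siblings sharing an L2: get_count_order(2) == 1. */
        unsigned int index_msb = 1;

        assert((6 >> index_msb) == (7 >> index_msb));  /* same L2 domain */
        assert((8 >> index_msb) != (6 >> index_msb));  /* different domain */
        return 0;
    }
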
-
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- unsigned long num_threads_sharing;
- int index_msb, i, sibling;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
- for_each_cpu(i, c->llc_shared_map) {
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
- for_each_cpu(sibling, c->llc_shared_map) {
- if (!cpu_online(sibling))
- continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
- }
- }
- return;
- }
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
-
- if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void __cpuinit free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
-{
- struct _cpuid4_info_regs *leaf_regs =
- (struct _cpuid4_info_regs *)this_leaf;
-
- return cpuid4_cache_lookup_regs(index, leaf_regs);
-}
-
-static void __cpuinit get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
-
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf;
- this_leaf = CPUID4_INFO_IDX(cpu, j);
- *retval = cpuid4_cache_lookup(j, this_leaf);
- if (unlikely(*retval < 0)) {
- int i;
-
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
- }
- cache_shared_cpu_map_setup(cpu, j);
- }
-}
-
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
-{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(ici_cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
- }
-
- return retval;
-}
-
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name \
- (struct _cpuid4_info *this_leaf, char *buf) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, eax.split.level, 0);
-show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
-{
- return sprintf(buf, "%luK\n", this_leaf->size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
- int n = 0;
-
- if (len > 1) {
- const struct cpumask *mask;
-
- mask = to_cpumask(this_leaf->shared_cpu_map);
- n = type ?
- cpulist_scnprintf(buf, len-2, mask) :
- cpumask_scnprintf(buf, len-2, mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- }
- return n;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf)
-{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
-
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
-{
- switch (this_leaf->eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-#define DEFAULT_SYSFS_CACHE_ATTRS \
- &type.attr, \
- &level.attr, \
- &coherency_line_size.attr, \
- &physical_line_partition.attr, \
- &ways_of_associativity.attr, \
- &number_of_sets.attr, \
- &size.attr, \
- &shared_cpu_map.attr, \
- &shared_cpu_list.attr
-
-static struct attribute *default_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS,
- NULL
-};
-
-static struct attribute *default_l3_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS,
-#ifdef CONFIG_CPU_SUP_AMD
- &cache_disable_0.attr,
- &cache_disable_1.attr,
-#endif
- NULL
-};
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count) :
- 0;
- return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(ici_cache_kobject, cpu));
- kfree(per_cpu(ici_index_kobject, cpu));
- per_cpu(ici_cache_kobject, cpu) = NULL;
- per_cpu(ici_index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(ici_cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(ici_index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
- goto err_out;
-
- return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
-}
-
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
-{
- unsigned int cpu = sys_dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- struct _cpuid4_info *this_leaf;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
- &ktype_percpu_entry,
- &sys_dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
-
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu, i);
- this_object->cpu = cpu;
- this_object->index = i;
-
- this_leaf = CPUID4_INFO_IDX(cpu, i);
-
- if (this_leaf->l3 && this_leaf->l3->can_disable)
- ktype_cache.default_attrs = default_l3_attrs;
- else
- ktype_cache.default_attrs = default_attrs;
-
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(ici_cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
- }
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
- return 0;
-}
-
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
-{
- unsigned int cpu = sys_dev->id;
- unsigned long i;
-
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
-
- sys_dev = get_cpu_sysdev(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(sys_dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(sys_dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = {
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __cpuinit cache_sysfs_init(void)
-{
- int i;
-
- if (num_cache_leaves == 0)
- return 0;
-
- for_each_online_cpu(i) {
- int err;
- struct sys_device *sys_dev = get_cpu_sysdev(i);
-
- err = cache_add_dev(sys_dev);
- if (err)
- return err;
- }
- register_hotcpu_notifier(&cacheinfo_cpu_notifier);
- return 0;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
new file mode 100644
index 000000000000..2c56f8730f59
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Performance and Energy Bias Hint support.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/syscore_ops.h>
+#include <linux/pm.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+/**
+ * DOC: overview
+ *
+ * The Performance and Energy Bias Hint (EPB) allows software to specify its
+ * preference with respect to the power-performance tradeoffs present in the
+ * processor. Generally, the EPB is expected to be set by user space (directly
+ * via sysfs or with the help of the x86_energy_perf_policy tool), but there are
+ * two reasons for the kernel to update it.
+ *
+ * First, there are systems where the platform firmware resets the EPB during
+ * system-wide transitions from sleep states back into the working state
+ * effectively causing the previous EPB updates by user space to be lost.
+ * Thus the kernel needs to save the current EPB values for all CPUs during
+ * system-wide transitions to sleep states and restore them on the way back to
+ * the working state. That can be achieved by saving EPB for secondary CPUs
+ * when they are taken offline during transitions into system sleep states and
+ * for the boot CPU in a syscore suspend operation, so that it can be restored
+ * for the boot CPU in a syscore resume operation and for the other CPUs when
+ * they are brought back online. However, CPUs that are already offline when
+ * a system-wide PM transition is started are not taken offline again, but their
+ * EPB values may still be reset by the platform firmware during the transition,
+ * so in fact it is necessary to save the EPB of any CPU taken offline and to
+ * restore it when the given CPU goes back online at all times.
+ *
+ * Second, on many systems the initial EPB value coming from the platform
+ * firmware is 0 ('performance') and at least on some of them that is because
+ * the platform firmware does not initialize EPB at all with the assumption that
+ * the OS will do that anyway. That sometimes is problematic, as it may cause
+ * the system battery to drain too fast, for example, so it is better to adjust
+ * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the
+ * kernel changes it to 6 ('normal').
+ */
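
For orientation, the hint managed here can also be inspected from user space through the msr driver. A minimal read-only sketch, assuming the msr module is loaded, root privileges, and the architectural MSR number 0x1b0 for MSR_IA32_ENERGY_PERF_BIAS:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define MSR_IA32_ENERGY_PERF_BIAS 0x1b0

    int main(void)
    {
        uint64_t epb;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0 || pread(fd, &epb, sizeof(epb),
                            MSR_IA32_ENERGY_PERF_BIAS) != (ssize_t)sizeof(epb))
            return 1;

        /* Only bits [3:0] carry the hint: 0 performance ... 15 powersave. */
        printf("EPB: %llu\n", (unsigned long long)(epb & 0xf));
        close(fd);
        return 0;
    }
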
+
+static DEFINE_PER_CPU(u8, saved_epb);
+
+#define EPB_MASK 0x0fULL
+#define EPB_SAVED 0x10ULL
+#define MAX_EPB EPB_MASK
+
+enum energy_perf_value_index {
+ EPB_INDEX_PERFORMANCE,
+ EPB_INDEX_BALANCE_PERFORMANCE,
+ EPB_INDEX_NORMAL,
+ EPB_INDEX_BALANCE_POWERSAVE,
+ EPB_INDEX_POWERSAVE,
+};
+
+static u8 energ_perf_values[] = {
+ [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE,
+ [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
+ [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL,
+ [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
+ [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
+};
+
+static int intel_epb_save(void)
+{
+ u64 epb;
+
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ /*
+ * Ensure that saved_epb will always be nonzero after this write even if
+ * the EPB value read from the MSR is 0.
+ */
+ this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED);
+
+ return 0;
+}
+
+static void intel_epb_restore(void)
+{
+ u64 val = this_cpu_read(saved_epb);
+ u64 epb;
+
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if (val) {
+ val &= EPB_MASK;
+ } else {
+ /*
+ * Because intel_epb_save() has not run for the current CPU yet,
+ * it is going online for the first time, so if its EPB value is
+ * 0 ('performance') at this point, assume that it has not been
+ * initialized by the platform firmware and set it to 6
+ * ('normal').
+ */
+ val = epb & EPB_MASK;
+ if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
+ val = energ_perf_values[EPB_INDEX_NORMAL];
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ }
+ }
+ wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
+}
+
+static struct syscore_ops intel_epb_syscore_ops = {
+ .suspend = intel_epb_save,
+ .resume = intel_epb_restore,
+};
+
+static const char * const energy_perf_strings[] = {
+ [EPB_INDEX_PERFORMANCE] = "performance",
+ [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
+ [EPB_INDEX_NORMAL] = "normal",
+ [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power",
+ [EPB_INDEX_POWERSAVE] = "power",
+};
+
+static ssize_t energy_perf_bias_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ unsigned int cpu = dev->id;
+ u64 epb;
+ int ret;
+
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%llu\n", epb);
+}
+
+static ssize_t energy_perf_bias_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int cpu = dev->id;
+ u64 epb, val;
+ int ret;
+
+ ret = __sysfs_match_string(energy_perf_strings,
+ ARRAY_SIZE(energy_perf_strings), buf);
+ if (ret >= 0)
+ val = energ_perf_values[ret];
+ else if (kstrtou64(buf, 0, &val) || val > MAX_EPB)
+ return -EINVAL;
+
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret < 0)
+ return ret;
+
+ ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
+ (epb & ~EPB_MASK) | val);
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(energy_perf_bias);
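
Once the attribute group below is merged under each CPU device's "power" group, the knob should appear as /sys/devices/system/cpu/cpuN/power/energy_perf_bias. A hedged user-space sketch of setting it, assuming that path:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *hint = "balance-power";   /* or a raw "0".."15" string */
        int fd = open("/sys/devices/system/cpu/cpu0/power/energy_perf_bias",
                      O_WRONLY);

        if (fd < 0)
            return 1;
        if (write(fd, hint, strlen(hint)) < 0)
            return 1;
        close(fd);
        return 0;
    }
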
+
+static struct attribute *intel_epb_attrs[] = {
+ &dev_attr_energy_perf_bias.attr,
+ NULL
+};
+
+static const struct attribute_group intel_epb_attr_group = {
+ .name = power_group_name,
+ .attrs = intel_epb_attrs
+};
+
+static int intel_epb_online(unsigned int cpu)
+{
+ struct device *cpu_dev = get_cpu_device(cpu);
+
+ intel_epb_restore();
+ if (!cpuhp_tasks_frozen)
+ sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group);
+
+ return 0;
+}
+
+static int intel_epb_offline(unsigned int cpu)
+{
+ struct device *cpu_dev = get_cpu_device(cpu);
+
+ if (!cpuhp_tasks_frozen)
+ sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group);
+
+ intel_epb_save();
+ return 0;
+}
+
+static const struct x86_cpu_id intel_epb_normal[] = {
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ {}
+};
+
+static __init int intel_epb_init(void)
+{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal);
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_EPB))
+ return -ENODEV;
+
+ if (id)
+ energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data;
+
+ ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
+ "x86/intel/epb:online", intel_epb_online,
+ intel_epb_offline);
+ if (ret < 0)
+ goto err_out_online;
+
+ register_syscore_ops(&intel_epb_syscore_ops);
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
+ return ret;
+}
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000000..6af1e8baeb0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+ /* Hybrid CPUs are special, they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ * {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ *
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
+ *
+ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
+ * for various common selections. The above can be shortened to:
+ *
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+ const struct x86_cpu_id *m;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
+ if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+ continue;
+ if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+ continue;
+ if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+ continue;
+ if (m->steppings != X86_STEPPING_ANY &&
+ !(BIT(c->x86_stepping) & m->steppings))
+ continue;
+ if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+ continue;
+ if (!x86_match_vendor_cpu_type(c, m))
+ continue;
+ return m;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
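
A sketch of the calling convention the kerneldoc above describes; the driver name and table contents are hypothetical, the macros are the ones named in the comment:

    #include <linux/module.h>
    #include <asm/cpu_device_id.h>

    static const struct x86_cpu_id mydrv_ids[] = {
        X86_MATCH_VFM(INTEL_BROADWELL, NULL),
        X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),  /* any family 0x17 part */
        {}
    };
    MODULE_DEVICE_TABLE(x86cpu, mydrv_ids);

    static int __init mydrv_init(void)
    {
        const struct x86_cpu_id *id = x86_match_cpu(mydrv_ids);

        if (!id)
            return -ENODEV;
        /* id->driver_data carries the matched entry's payload, if any. */
        return 0;
    }
    module_init(mydrv_init);
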
+
+bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
+{
+ const struct x86_cpu_id *res = x86_match_cpu(table);
+
+ if (!res || res->driver_data > boot_cpu_data.microcode)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
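
x86_match_min_microcode_rev() keys off ->driver_data as the lowest acceptable microcode revision, so a caller's table looks like the sketch below (model and revision are made up for illustration):

    static const struct x86_cpu_id ucode_min[] = {
        X86_MATCH_VFM(INTEL_SKYLAKE_X, 0x0b000021),  /* min rev, made up */
        {}
    };

    static bool my_feature_usable(void)
    {
        /* False if the CPU is not listed or runs older microcode. */
        return x86_match_min_microcode_rev(ucode_min);
    }
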
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
new file mode 100644
index 000000000000..015856abdbb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y = core.o severity.o genpool.o
+
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += intel.o
+obj-$(CONFIG_X86_MCE_AMD) += amd.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+
+mce-inject-y := inject.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_ACPI_APEI) += apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
new file mode 100644
index 000000000000..3f1dda355307
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -0,0 +1,1270 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ * Maintained by: Borislav Petkov <bp@alien8.de>
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+#define NR_BLOCKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_BLKPTR_LO 0xFF000000
+#define MCG_XBLK_ADDR 0xC0000400
+
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF 0xF000
+
+static bool thresholding_irq_en;
+
+struct mce_amd_cpu_data {
+ mce_banks_t thr_intr_banks;
+ mce_banks_t dfr_intr_banks;
+
+ u32 thr_intr_en: 1,
+ dfr_intr_en: 1,
+ __resv: 30;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
+
+static const char * const th_names[] = {
+ "load_store",
+ "insn_fetch",
+ "combined_unit",
+ "decode_unit",
+ "northbridge",
+ "execution_unit",
+};
+
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
+};
+
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+ u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */
+ __reserved :63;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
+};
+
+static const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t];
+}
+
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
+{
+ struct smca_bank *b;
+
+ if (bank >= MAX_NR_BANKS)
+ return N_SMCA_BANK_TYPES;
+
+ b = &per_cpu(smca_banks, cpu)[bank];
+ if (!b->hwid)
+ return N_SMCA_BANK_TYPES;
+
+ return b->hwid->bank_type;
+}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
+
+static const struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype } */
+
+ /* Reserved type */
+ { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) },
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
+
+ /* MPDMA MCA type */
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
+
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
+};
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+
+struct threshold_block {
+ /* This block's number within its bank. */
+ unsigned int block;
+ /* MCA bank number that contains this block. */
+ unsigned int bank;
+ /* CPU which controls this block's MCA bank. */
+ unsigned int cpu;
+ /* MCA_MISC MSR address for this block. */
+ u32 address;
+ /* Enable/Disable APIC interrupt. */
+ bool interrupt_enable;
+ /* Bank can generate an interrupt. */
+ bool interrupt_capable;
+ /* Value upon which threshold interrupt is generated. */
+ u16 threshold_limit;
+ /* sysfs object */
+ struct kobject kobj;
+ /* List of threshold blocks within this block's MCA bank. */
+ struct list_head miscj;
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ /* List of threshold blocks within this MCA bank. */
+ struct list_head miscj;
+};
+
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
+
+/*
+ * A list of the banks enabled on each logical CPU. Controls which respective
+ * descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(u64, bank_map);
+
+static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
+
+static void smca_configure(unsigned int bank, unsigned int cpu)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
+ unsigned int i, hwid_mcatype;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) {
+ __set_bit(bank, data->dfr_intr_banks);
+ high |= BIT(5);
+ }
+
+ /*
+ * SMCA Corrected Error Interrupt
+ *
+ * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can
+ * send an MCA Thresholding interrupt without the OS initializing
+ * this feature. This can be used if the threshold limit is managed
+ * by the platform.
+ *
+ * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR).
+ * The OS should set this to inform the platform that the OS is ready
+ * to handle the MCA Thresholding interrupt.
+ */
+ if ((low & BIT(10)) && data->thr_intr_en) {
+ __set_bit(bank, data->thr_intr_banks);
+ high |= BIT(8);
+ }
+
+ this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+
+ if (low & MCI_CONFIG_PADDRV)
+ this_cpu_ptr(smca_banks)[bank].paddrv = 1;
+
+ wrmsr(smca_config, low, high);
+ }
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
+ break;
+ }
+ }
+}
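
The lookup key built at the end of smca_configure() packs MCA_IPID[43:32] (HardwareID) and MCA_IPID[63:48] (McaType) into one 32-bit value. A stand-alone sketch of that decode, with the mask constants as I recall them from <asm/mce.h>:

    #include <assert.h>
    #include <stdint.h>

    #define MCI_IPID_MCATYPE 0xFFFF0000u              /* IPID[63:48] */
    #define MCI_IPID_HWID    0x00000FFFu              /* IPID[43:32] */
    #define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))

    int main(void)
    {
        uint32_t high = 0x00000096;  /* sample UMC bank: HWID 0x96, type 0 */
        uint32_t key  = HWID_MCATYPE(high & MCI_IPID_HWID,
                                     (high & MCI_IPID_MCATYPE) >> 16);

        assert(key == HWID_MCATYPE(0x96, 0x0));  /* matches the SMCA_UMC row */
        return 0;
    }
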
+
+struct thresh_restart {
+ struct threshold_block *b;
+ int set_lvt_off;
+ int lvt_off;
+ u16 old_limit;
+};
+
+static const char *bank4_names(const struct threshold_block *b)
+{
+ switch (b->address) {
+ /* MSR4_MISC0 */
+ case 0x00000413:
+ return "dram";
+
+ case 0xc0000408:
+ return "ht_links";
+
+ case 0xc0000409:
+ return "l3_cache";
+
+ default:
+ WARN(1, "Funny MSR: 0x%08x\n", b->address);
+ return "";
+ }
+};
+
+
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
+static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+ int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+ /*
+ * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+ * the BIOS provides the value. The original field where LVT offset
+ * was set is reserved. Return early here:
+ */
+ if (mce_flags.smca)
+ return false;
+
+ if (apic < 0) {
+ pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+ b->bank, b->block, b->address, hi, lo);
+ return false;
+ }
+
+ if (apic != msr) {
+ pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+ b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+ return false;
+ }
+
+ return true;
+};
+
+/* Reprogram MCx_MISC MSR behind this threshold block. */
+static void threshold_restart_block(void *_tr)
+{
+ struct thresh_restart *tr = _tr;
+ u32 hi, lo;
+
+ /* sysfs write might race against an offline operation */
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
+ return;
+
+ rdmsr(tr->b->address, lo, hi);
+
+ /*
+ * Reset error count and overflow bit.
+ * This is done during init or after handling an interrupt.
+ */
+ if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) {
+ hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+ hi |= THRESHOLD_MAX - tr->b->threshold_limit;
+ } else if (tr->old_limit) { /* change limit w/o reset */
+ int new_count = (hi & THRESHOLD_MAX) +
+ (tr->old_limit - tr->b->threshold_limit);
+
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
+
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+static void threshold_restart_bank(unsigned int bank, bool intr_en)
+{
+ struct threshold_bank **thr_banks = this_cpu_read(threshold_banks);
+ struct threshold_block *block, *tmp;
+ struct thresh_restart tr;
+
+ if (!thr_banks || !thr_banks[bank])
+ return;
+
+ memset(&tr, 0, sizeof(tr));
+
+ list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) {
+ tr.b = block;
+ tr.b->interrupt_enable = intr_en;
+ threshold_restart_block(&tr);
+ }
+}
+
+/* Try to use the threshold limit reported through APEI. */
+static u16 get_thr_limit(void)
+{
+ u32 thr_limit = mce_get_apei_thr_limit();
+
+ /* Fallback to old default if APEI limit is not available. */
+ if (!thr_limit)
+ return THRESHOLD_MAX;
+
+ return min(thr_limit, THRESHOLD_MAX);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+ struct thresh_restart tr = {
+ .b = b,
+ .set_lvt_off = 1,
+ .lvt_off = offset,
+ };
+
+ b->threshold_limit = get_thr_limit();
+ threshold_restart_block(&tr);
+};
+
+static int setup_APIC_mce_threshold(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block,
+ unsigned int cpu)
+{
+ u32 addr = 0, offset = 0;
+
+ if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ return addr;
+
+ if (mce_flags.smca) {
+ if (!block)
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+ if (!(low & MASK_BLKPTR_LO))
+ return 0;
+
+ return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = mca_msr_reg(bank, MCA_MISC);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
+static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
+{
+ unsigned int cpu = smp_processor_id();
+ struct threshold_block b;
+ int new;
+
+ if (!block)
+ per_cpu(bank_map, cpu) |= BIT_ULL(bank);
+
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = addr;
+ b.interrupt_capable = lvt_interrupt_supported(bank, misc_high);
+
+ if (!b.interrupt_capable)
+ goto done;
+
+ __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
+ b.interrupt_enable = 1;
+
+ if (mce_flags.smca)
+ goto done;
+
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce_threshold(offset, new);
+ if (offset == new)
+ thresholding_irq_en = true;
+
+done:
+ mce_threshold_block_init(&b, offset);
+
+ return offset;
+}
+
+bool amd_filter_mce(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* See Family 17h Models 10h-2Fh Erratum #1114. */
+ if (c->x86 == 0x17 &&
+ c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
+ return true;
+
+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Turn off thresholding banks for the following conditions:
+ * - MC4_MISC thresholding is not supported on Family 0x15.
+ * - Prevent possible spurious interrupts from the IF bank on Family 0x17
+ * Models 0x10-0x2F due to Erratum #1114.
+ */
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+{
+ int i, num_msrs;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[NR_BLOCKS];
+
+ if (c->x86 == 0x15 && bank == 4) {
+ msrs[0] = 0x00000413; /* MC4_MISC0 */
+ msrs[1] = 0xc0000408; /* MC4_MISC1 */
+ num_msrs = 2;
+ } else if (c->x86 == 0x17 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
+
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
+ return;
+
+ msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
+ num_msrs = 1;
+ } else {
+ return;
+ }
+
+ rdmsrq(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+ if (need_toggle)
+ wrmsrq(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < num_msrs; i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrq(MSR_K7_HWCR, hwcr);
+}
+
+static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
+}
+
+/*
+ * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is
+ * ready to send interrupts.
+ *
+ * Individual error sources are enabled later during per-bank init.
+ */
+static void smca_enable_interrupt_vectors(void)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u64 mca_intr_cfg, offset;
+
+ if (!mce_flags.smca || !mce_flags.succor)
+ return;
+
+ if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg))
+ return;
+
+ offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12;
+ if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->thr_intr_en = 1;
+
+ offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4;
+ if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->dfr_intr_en = 1;
+}
+
+/* cpu init entry point, called from mce.c with preempt off */
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ unsigned int bank, block, cpu = smp_processor_id();
+ u32 low = 0, high = 0, address = 0;
+ int offset = -1;
+
+ amd_apply_cpu_quirks(c);
+
+ mce_flags.amd_threshold = 1;
+
+ smca_enable_interrupt_vectors();
+
+ for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
+ if (mce_flags.smca) {
+ smca_configure(bank, cpu);
+
+ if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en)
+ continue;
+ }
+
+ disable_err_thresholding(c, bank);
+
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ address = get_block_address(address, low, high, bank, block, cpu);
+ if (!address)
+ break;
+
+ if (rdmsr_safe(address, &low, &high))
+ break;
+
+ if (!(high & MASK_VALID_HI))
+ continue;
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ continue;
+
+ offset = prepare_threshold_block(bank, block, address, offset, high);
+ }
+ }
+}
+
+void smca_bsp_init(void)
+{
+ mce_threshold_vector = amd_threshold_interrupt;
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+}
+
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+ return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
+{
+ enum smca_bank_types bank_type;
+
+ if (XEC(m->status, 0x3f))
+ return false;
+
+ bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+ return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
+ if (mce_flags.smca)
+ return smca_mce_is_memory_error(m);
+ else
+ return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a
+ * system physical address. Individual cases though, need to be detected for
+ * other systems. Future cases will be added as needed.
+ *
+ * 1) General case
+ * a) Assume address is not usable.
+ * 2) Poison errors
+ * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ * northbridge (bank 4).
+ * b) Refers to poison consumption in the core. Does not include "no action",
+ * "action optional", or "deferred" error severities.
+ * c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ * a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ * this bit should not be checked.
+ * 4) MCI_STATUS_PADDRVAL is set
+ * a) Will provide a valid system physical address.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+ /* Check special northbridge case 3) first. */
+ if (!mce_flags.smca) {
+ if (legacy_mce_is_memory_error(m))
+ return true;
+ else if (m->bank == 4)
+ return false;
+ }
+
+ if (this_cpu_ptr(smca_banks)[m->bank].paddrv)
+ return m->status & MCI_STATUS_PADDRV;
+
+ /* Check poison bit for all other bank types. */
+ if (m->status & MCI_STATUS_POISON)
+ return true;
+
+ /* Assume address is not usable for all others. */
+ return false;
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
+{
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ apic_eoi();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
+}
+
+void mce_amd_handle_storm(unsigned int bank, bool on)
+{
+ threshold_restart_bank(bank, on);
+}
+
+static void amd_reset_thr_limit(unsigned int bank)
+{
+ threshold_restart_bank(bank, true);
+}
+
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
+}
+
+void amd_clear_bank(struct mce *m)
+{
+ amd_reset_thr_limit(m->bank);
+
+ /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
+ mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
+ * Sysfs Interface
+ */
+
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
+};
+
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", (unsigned long) b->name); \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ b->interrupt_enable = !!new;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.b = b;
+
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
+ return -ENODEV;
+
+ return size;
+}
+
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (new > THRESHOLD_MAX)
+ new = THRESHOLD_MAX;
+ if (new < 1)
+ new = 1;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.old_limit = b->threshold_limit;
+ b->threshold_limit = new;
+ tr.b = b;
+
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
+ return -ENODEV;
+
+ return size;
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+ u32 lo, hi;
+
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
+
+ return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+ (THRESHOLD_MAX - b->threshold_limit)));
+}
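
The 12-bit counter is biased at (re)start: threshold_restart_block() programs THRESHOLD_MAX - threshold_limit so the interrupt fires after exactly threshold_limit errors, and show_error_count() removes that bias again. A worked round trip:

    #include <assert.h>

    #define THRESHOLD_MAX 0xFFF

    int main(void)
    {
        unsigned int limit = 10;
        unsigned int field = THRESHOLD_MAX - limit;  /* value programmed */

        field += 3;  /* hardware counted three errors since the restart */
        assert(field - (THRESHOLD_MAX - limit) == 3);  /* error_count view */
        /* The APIC interrupt is raised when the field overflows 0xFFF. */
        return 0;
    }
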
+
+static struct threshold_attr error_count = {
+ .attr = {.name = __stringify(error_count), .mode = 0444 },
+ .show = show_error_count,
+};
+
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
+};
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+
+static struct attribute *default_attrs[] = {
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
+};
+ATTRIBUTE_GROUPS(default);
+
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->show ? a->show(b, buf) : -EIO;
+
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->store ? a->store(b, buf, count) : -EIO;
+
+ return ret;
+}
+
+static const struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static void threshold_block_release(struct kobject *kobj);
+
+static const struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_groups = default_groups,
+ .release = threshold_block_release,
+};
+
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
+{
+ enum smca_bank_types bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ bank_type = smca_get_bank_type(cpu, bank);
+
+ if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ }
+
+ if (b && b->block) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block);
+ return buf_mcatype;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank);
+ return buf_mcatype;
+ }
+
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
+ return smca_get_name(bank_type);
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
+ return buf_mcatype;
+}
+
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+ unsigned int bank, unsigned int block,
+ u32 address)
+{
+ struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
+
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
+ return 0;
+
+ if (rdmsr_safe(address, &low, &high))
+ return 0;
+
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ goto recurse;
+ else
+ return 0;
+ }
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ goto recurse;
+
+ b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
+ b->threshold_limit = get_thr_limit();
+
+ if (b->interrupt_capable) {
+ default_attrs[2] = &interrupt_enable.attr;
+ b->interrupt_enable = 1;
+ } else {
+ default_attrs[2] = NULL;
+ }
+
+ list_add(&b->miscj, &tb->miscj);
+
+ mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
+
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
+ if (err)
+ goto out_free;
+recurse:
+ address = get_block_address(address, low, high, bank, ++block, cpu);
+ if (!address)
+ return 0;
+
+ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
+ if (err)
+ goto out_free;
+
+ if (b)
+ kobject_uevent(&b->kobj, KOBJ_ADD);
+
+ return 0;
+
+out_free:
+ if (b) {
+ list_del(&b->miscj);
+ kobject_put(&b->kobj);
+ }
+ return err;
+}
+
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
+{
+ struct device *dev = this_cpu_read(mce_device);
+ struct threshold_bank *b = NULL;
+ const char *name = get_name(cpu, bank, NULL);
+ int err = 0;
+
+ if (!dev)
+ return -ENODEV;
+
+ b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ if (!b) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Associate the bank with the per-CPU MCE device */
+ b->kobj = kobject_create_and_add(name, &dev->kobj);
+ if (!b->kobj) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ INIT_LIST_HEAD(&b->miscj);
+
+ err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
+ if (err)
+ goto out_kobj;
+
+ bp[bank] = b;
+ return 0;
+
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
+ kfree(b);
+out:
+ return err;
+}
+
+static void threshold_block_release(struct kobject *kobj)
+{
+ kfree(to_block(kobj));
+}
+
+static void threshold_remove_bank(struct threshold_bank *bank)
+{
+ struct threshold_block *pos, *tmp;
+
+ list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) {
+ list_del(&pos->miscj);
+ kobject_put(&pos->kobj);
+ }
+
+ kobject_put(bank->kobj);
+ kfree(bank);
+}
+
+static void __threshold_remove_device(struct threshold_bank **bp)
+{
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (!bp[bank])
+ continue;
+
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ kfree(bp);
+}
+
+void mce_threshold_remove_device(unsigned int cpu)
+{
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+
+ if (!bp)
+ return;
+
+ /*
+ * Clear the pointer before cleaning up, so that the interrupt
+ * handler won't touch any of this.
+ */
+ this_cpu_write(threshold_banks, NULL);
+
+ __threshold_remove_device(bp);
+}
+
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
+void mce_threshold_create_device(unsigned int cpu)
+{
+ unsigned int numbanks, bank;
+ struct threshold_bank **bp;
+
+ if (!mce_flags.amd_threshold)
+ return;
+
+ bp = this_cpu_read(threshold_banks);
+ if (bp)
+ return;
+
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
+ if (!bp)
+ return;
+
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
+ continue;
+ if (threshold_create_bank(bp, cpu, bank)) {
+ __threshold_remove_device(bp);
+ return;
+ }
+ }
+ this_cpu_write(threshold_banks, bp);
+
+ if (thresholding_irq_en)
+ mce_threshold_vector = amd_threshold_interrupt;
+}
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
new file mode 100644
index 000000000000..0a89947e47bc
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machines, corrected memory errors are reported via the APEI
+ * generic hardware error source (GHES) instead of a corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog by faking a corrected Machine Check, so that
+ * the erroring memory page can be offlined by /sbin/mcelog once the
+ * error count for that page exceeds the threshold.
+ *
+ * For a fatal MCE, save the MCE record into persistent storage via ERST,
+ * so that it can be retrieved and logged after reboot.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <acpi/ghes.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
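+/*
+ * Convert a CPER memory error section into a synthetic MCE record: the
+ * physical address becomes MCA_ADDR, the address mask (when valid)
+ * determines the recoverable granularity encoded in MCA_MISC, and the
+ * GHES severity is mapped onto the UC/PCC status bits.
+ */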
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
+{
+ struct mce_hw_err err;
+ struct mce *m;
+ int lsb;
+
+ if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+ return;
+
+ /*
+ * Even if ->validation_bits flags the address mask as valid, be
+ * extra safe: reject an error radius of '0' and fall back to the
+ * default page size.
+ */
+ if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
+ lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT);
+ else
+ lsb = PAGE_SHIFT;
+
+ mce_prep_record(&err);
+ m = &err.m;
+ m->bank = -1;
+ /* Fake a memory read error with unknown channel */
+ m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
+
+ if (severity >= GHES_SEV_RECOVERABLE)
+ m->status |= MCI_STATUS_UC;
+
+ if (severity >= GHES_SEV_PANIC) {
+ m->status |= MCI_STATUS_PCC;
+ m->tsc = rdtsc();
+ }
+
+ m->addr = mem_err->physical_addr;
+ mce_log(&err);
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ unsigned int cpu, num_regs;
+ bool apicid_found = false;
+ struct mce_hw_err err;
+ struct mce *m;
+
+ if (!boot_cpu_has(X86_FEATURE_SMCA))
+ return -EINVAL;
+
+ /*
+ * The starting address of the register array extracted from BERT must
+ * match the first expected register in the register layout of the
+ * SMCA address space. This address corresponds to the bank's MCA_STATUS
+ * register.
+ *
+ * Match any MCi_STATUS register by turning off bank numbers.
+ */
+ if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) !=
+ MSR_AMD64_SMCA_MC0_STATUS)
+ return -EINVAL;
+
+ /*
+ * The number of registers in the register array is determined by
+ * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
+ * Sanity-check the array size.
+ */
+ num_regs = ctx_info->reg_arr_size >> 3;
+ if (!num_regs)
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
+ apicid_found = true;
+ break;
+ }
+ }
+
+ if (!apicid_found)
+ return -EINVAL;
+
+ m = &err.m;
+ memset(&err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(cpu, m);
+
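+ /*
+ * SMCA MSR space allocates 16 registers (0x10 addresses) per bank,
+ * so bits [11:4] of the MCA_STATUS address give the bank number.
+ */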
+ m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
+
+ /*
+ * The SMCA register layout is fixed and includes 16 registers.
+ * The end of the array may be variable, but the beginning is known.
+ * Cap the number of registers at the expected maximum (15).
+ */
+ if (num_regs > 15)
+ num_regs = 15;
+
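+ /*
+ * Copy the registers from the tail of the array down: each case
+ * handles one register and falls through, so a larger num_regs
+ * implies that all lower-numbered registers are present too.
+ */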
+ switch (num_regs) {
+ /* MCA_SYND2 */
+ case 15:
+ err.vendor.amd.synd2 = *(i_mce + 14);
+ fallthrough;
+ /* MCA_SYND1 */
+ case 14:
+ err.vendor.amd.synd1 = *(i_mce + 13);
+ fallthrough;
+ /* MCA_MISC4 */
+ case 13:
+ /* MCA_MISC3 */
+ case 12:
+ /* MCA_MISC2 */
+ case 11:
+ /* MCA_MISC1 */
+ case 10:
+ /* MCA_DEADDR */
+ case 9:
+ /* MCA_DESTAT */
+ case 8:
+ /* reserved */
+ case 7:
+ /* MCA_SYND */
+ case 6:
+ m->synd = *(i_mce + 5);
+ fallthrough;
+ /* MCA_IPID */
+ case 5:
+ m->ipid = *(i_mce + 4);
+ fallthrough;
+ /* MCA_CONFIG */
+ case 4:
+ /* MCA_MISC0 */
+ case 3:
+ m->misc = *(i_mce + 2);
+ fallthrough;
+ /* MCA_ADDR */
+ case 2:
+ m->addr = *(i_mce + 1);
+ fallthrough;
+ /* MCA_STATUS */
+ case 1:
+ m->status = *i_mce;
+ }
+
+ mce_log(&err);
+
+ return 0;
+}
+
+#define CPER_CREATOR_MCE \
+ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * The CPER specification (UEFI specification 2.3, appendix N) requires
+ * the record to be byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+ struct cper_mce_record rcd;
+
+ memset(&rcd, 0, sizeof(rcd));
+ memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd.hdr.revision = CPER_RECORD_REV;
+ rcd.hdr.signature_end = CPER_SIG_END;
+ rcd.hdr.section_count = 1;
+ rcd.hdr.error_severity = CPER_SEV_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd.hdr.validation_bits = 0;
+ rcd.hdr.record_length = sizeof(rcd);
+ rcd.hdr.creator_id = CPER_CREATOR_MCE;
+ rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd.hdr.record_id = cper_next_record_id();
+ rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+ rcd.sec_hdr.section_length = sizeof(rcd.mce);
+ rcd.sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text are invalid */
+ rcd.sec_hdr.validation_bits = 0;
+ rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
+
+ memcpy(&rcd.mce, m, sizeof(*m));
+
+ return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ struct cper_mce_record rcd;
+ int rc, pos;
+
+ rc = erst_get_record_id_begin(&pos);
+ if (rc)
+ return rc;
+retry:
+ rc = erst_get_record_id_next(&pos, record_id);
+ if (rc)
+ goto out;
+ /* no more records */
+ if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+ goto out;
+ rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd),
+ &CPER_CREATOR_MCE);
+ /* someone else has cleared the record, try the next one */
+ if (rc == -ENOENT)
+ goto retry;
+ else if (rc < 0)
+ goto out;
+
+ memcpy(m, &rcd.mce, sizeof(*m));
+ rc = sizeof(*m);
+out:
+ erst_get_record_id_end();
+
+ return rc;
+}
+
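+/*
+ * Typical consumer flow (sketch): if apei_check_mce() reports pending
+ * records, drain them with repeated apei_read_mce() calls and hand each
+ * consumed record_id to apei_clear_mce() so its ERST slot is reclaimed.
+ */
+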
+/* Check whether there are records in ERST */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
new file mode 100644
index 000000000000..34440021e8cf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -0,0 +1,2970 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/ras.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
+#include <linux/set_memory.h>
+#include <linux/sync_core.h>
+#include <linux/task_work.h>
+#include <linux/hardirq.h>
+#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
+
+#include <asm/fred.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/reboot.h>
+#include <asm/tdx.h>
+
+#include "internal.h"
+
+/* sysfs synchronization */
+static DEFINE_MUTEX(mce_sysfs_mutex);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
+#define SPINUNIT 100 /* 100ns */
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+#define ATTR_LEN 16
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank_dev {
+ struct device_attribute attr; /* device attribute */
+ char attrname[ATTR_LEN]; /* attribute name */
+ u8 bank; /* bank number */
+};
+static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
+
+struct mce_vendor_flags mce_flags __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ .monarch_timeout = -1
+};
+
+static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
+static unsigned long mce_need_notify;
+
+/*
+ * MCA banks polled by the periodic polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
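+/* Fields common to all records: CPUID(1), vendor, MCG_CAP and wall time. */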
+void mce_prep_record_common(struct mce *m)
+{
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP);
+ /* need the internal __ version to avoid deadlocks */
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of struct mce_hw_err */
+void mce_prep_record(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ memset(err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+void mce_log(struct mce_hw_err *err)
+{
+ if (mce_gen_pool_add(err))
+ irq_work_queue(&mce_irq_work);
+}
+EXPORT_SYMBOL_GPL(mce_log);
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
+ nb->priority > MCE_PRIO_HIGHEST))
+ return;
+
+ blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
+static void __print_mce(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
+
+ if (m->ip) {
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
+ if (m->cs == __KERNEL_CS)
+ pr_cont("{%pS}", (void *)(unsigned long)m->ip);
+ pr_cont("\n");
+ }
+
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
+ if (m->addr)
+ pr_cont("ADDR %llx ", m->addr);
+ if (m->misc)
+ pr_cont("MISC %llx ", m->misc);
+ if (m->ppin)
+ pr_cont("PPIN %llx ", m->ppin);
+
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
+ pr_cont("\n");
+
+ /*
+ * Note this output is parsed by external tools and old fields
+ * should not be changed.
+ */
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+ m->microcode);
+}
+
+static void print_mce(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ __print_mce(err);
+
+ if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_panicked;
+
+static int fake_panic;
+static atomic_t mce_fake_panicked;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
+static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
+{
+ struct llist_node *pending;
+ struct mce_evt_llist *l;
+ int apei_err = 0;
+ const char *memmsg;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
+
+ if (!fake_panic) {
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_inc_return(&mce_panicked) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ } else {
+ /* Don't log too much for fake panic */
+ if (atomic_inc_return(&mce_fake_panicked) > 1)
+ goto out;
+ }
+ pending = mce_gen_pool_prepare_records();
+ /* First print corrected ones that are still unlogged */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
+ if (!(m->status & MCI_STATUS_UC)) {
+ print_mce(err);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ /* Now print uncorrected but with the final one last */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || mce_cmp(m, &final->m)) {
+ print_mce(err);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ if (final) {
+ print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(&final->m);
+ }
+ if (exp)
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(&final->m);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
+ if (!fake_panic) {
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->m.status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
+ panic(msg);
+ } else
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
+}
+
+/* Support code for software error injection */
+
+static int msr_to_offset(u32 msr)
+{
+ unsigned bank = __this_cpu_read(injectm.bank);
+
+ if (msr == mca_cfg.rip_msr)
+ return offsetof(struct mce, ip);
+ if (msr == mca_msr_reg(bank, MCA_STATUS))
+ return offsetof(struct mce, status);
+ if (msr == mca_msr_reg(bank, MCA_ADDR))
+ return offsetof(struct mce, addr);
+ if (msr == mca_msr_reg(bank, MCA_MISC))
+ return offsetof(struct mce, misc);
+ if (msr == MSR_IA32_MCG_STATUS)
+ return offsetof(struct mce, mcgstatus);
+ return -1;
+}
+
+void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
+{
+ if (wrmsr) {
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+ } else {
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ }
+
+ show_stack_regs(regs);
+
+ panic("MCA architectural violation!\n");
+
+ while (true)
+ cpu_relax();
+}
+
+/* MSR access wrappers used for error injection */
+noinstr u64 mce_rdmsrq(u32 msr)
+{
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset;
+ u64 ret;
+
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
+ if (offset < 0)
+ ret = 0;
+ else
+ ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+
+ instrumentation_end();
+
+ return ret;
+ }
+
+ /*
+ * RDMSR on MCA MSRs should not fault. If they do, this is very much an
+ * architectural violation and needs to be reported to the hardware
+ * vendor. Panic the box to not allow any further progress.
+ */
+ asm volatile("1: rdmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
+ : EAX_EDX_RET(val, low, high) : "c" (msr));
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+noinstr void mce_wrmsrq(u32 msr, u64 v)
+{
+ u32 low, high;
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset;
+
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
+ if (offset >= 0)
+ *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
+
+ instrumentation_end();
+
+ return;
+ }
+
+ low = (u32)v;
+ high = (u32)(v >> 32);
+
+ /* See comment in mce_rdmsrq() */
+ asm volatile("1: wrmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
+ : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
+{
+ struct mce *m;
+ /*
+ * Enable instrumentation around mce_prep_record() which calls external
+ * facilities.
+ */
+ instrumentation_begin();
+ mce_prep_record(err);
+ instrumentation_end();
+
+ m = &err->m;
+ m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrq(mca_cfg.rip_msr);
+ }
+}
+
+bool mce_available(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return false;
+ return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+ if (!mce_gen_pool_empty())
+ schedule_work(&mce_work);
+}
+
+static void mce_irq_work_cb(struct irq_work *entry)
+{
+ mce_schedule_work();
+}
+
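+/*
+ * Decide whether m->addr holds a usable physical address: MCA_STATUS must
+ * flag the address as valid, and vendor code may apply further checks.
+ */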
+bool mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_ADDRV))
+ return false;
+
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ return amd_mce_usable_address(m);
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ return intel_mce_usable_address(m);
+
+ default:
+ return true;
+ }
+}
+EXPORT_SYMBOL_GPL(mce_usable_address);
+
+bool mce_is_memory_error(struct mce *m)
+{
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ return amd_mce_is_memory_error(m);
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error.
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just give more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
+
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
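+/* Deferred errors on AMD/Hygon and any UC error are not correctable. */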
+bool mce_is_correctable(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
+static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce_hw_err *err = to_mce_hw_err(data);
+
+ if (!err)
+ return NOTIFY_DONE;
+
+ /* Emit the trace record: */
+ trace_mce_record(err);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block early_nb = {
+ .notifier_call = mce_early_notifier,
+ .priority = MCE_PRIO_EARLY,
+};
+
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned long pfn;
+
+ if (!mce || !mce_usable_address(mce))
+ return NOTIFY_DONE;
+
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0)) {
+ set_mce_nospec(pfn);
+ mce->kflags |= MCE_HANDLED_UC;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce_hw_err *err = to_mce_hw_err(data);
+
+ if (!err)
+ return NOTIFY_DONE;
+
+ if (mca_cfg.print_all || !(err->m.kflags))
+ __print_mce(err);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = MCE_PRIO_LOWEST,
+};
+
+/*
+ * Read ADDR and MISC registers.
+ */
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
+{
+ struct mce *m = &err->m;
+
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
+
+ if (m->status & MCI_STATUS_ADDRV) {
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+ else
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+
+ smca_extract_err_addr(m);
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV) {
+ m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
+ }
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT if no error was found in MCA_STATUS,
+ *    and log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+ if (m->status & MCI_STATUS_VAL)
+ return true;
+
+ m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Newer Intel systems that support software error
+ * recovery need to make additional checks. Other
+ * CPUs should skip over uncorrected errors, but log
+ * everything else.
+ */
+static bool ser_should_log_poll_error(struct mce *m)
+{
+ /* Log "not enabled" (speculative) errors */
+ if (!(m->status & MCI_STATUS_EN))
+ return true;
+
+ /*
+ * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
+ * UC == 1 && PCC == 0 && S == 0
+ */
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
+ return true;
+
+ return false;
+}
+
+static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ if (mce_flags.smca)
+ return smca_should_log_poll_error(m);
+
+ /* If this entry is not valid, ignore it. */
+ if (!(m->status & MCI_STATUS_VAL))
+ return false;
+
+ /*
+ * If we are logging everything (at CPU online) or this
+ * is a corrected error, then we must log it.
+ */
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
+ return true;
+
+ if (mca_cfg.ser)
+ return ser_should_log_poll_error(m);
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+
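+/* Vendor hook: AMD may need to clear more than MCA_STATUS (e.g. MCA_DESTAT). */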
+static void clear_bank(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD)
+ return amd_clear_bank(m);
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: the spec recommends panicking for fatal unsignalled
+ * errors here. However, this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between the exception handler
+ * and the poll handler -- so we skip this for now.
+ * These cases should not happen anyway, or only when the CPU
+ * is already totally confused. In that case it is likely the CPU
+ * will not fully execute the machine check handler either.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mce_hw_err err;
+ struct mce *m;
+ int i;
+
+ this_cpu_inc(mce_poll_count);
+
+ mce_gather_info(&err, NULL);
+ m = &err.m;
+
+ if (flags & MCP_TIMESTAMP)
+ m->tsc = rdtsc();
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (!mce_banks[i].ctl || !test_bit(i, *b))
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ barrier();
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(m);
+
+ /* Verify that the error should be logged based on hardware conditions. */
+ if (!should_log_poll_error(flags, &err))
+ continue;
+
+ mce_read_aux(&err, i);
+ m->severity = mce_severity(m, NULL, NULL, false);
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ if (mca_cfg.dont_log_ce && !mce_usable_address(m))
+ goto clear_it;
+
+ if (flags & MCP_QUEUE_LOG)
+ mce_gen_pool_add(&err);
+ else
+ mce_log(&err);
+
+clear_it:
+ clear_bank(m);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * During IFU recovery, Sandy Bridge-EP 4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static __always_inline void
+quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely, with the caveat of performance
+ * degradation on the affected CPU. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
+ u64 mc1_status;
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if (!(mcgstatus & MCG_STATUS_LMCES) ||
+ !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
+ return false;
+
+ mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0);
+
+ instrumentation_begin();
+ pr_err_once("Erratum detected, disable fast string copy instructions.\n");
+ instrumentation_end();
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
+ * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
+ *
+ * However, the context is still valid, so save the "cs" register for later use.
+ *
+ * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
+ *
+ * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
+ */
+static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 1)
+ return;
+ if (!(m->status & MCI_STATUS_POISON))
+ return;
+
+ m->cs = regs->cs;
+}
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
+{
+ struct mce *m = &err->m;
+ char *tmp = *msg;
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ arch___set_bit(i, validp);
+ if (mce_flags.snb_ifu_quirk)
+ quirk_sandybridge_ifu(i, m, regs);
+
+ if (mce_flags.zen_ifu_quirk)
+ quirk_zen_ifu(i, m, regs);
+
+ m->bank = i;
+ if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ mce_read_aux(err, i);
+ *msg = tmp;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU initially spins until mce_executing equals its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static noinstr int mce_timed_out(u64 *t, const char *msg)
+{
+ int ret = 0;
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_panicked))
+ wait_for_panic();
+ if (!mca_cfg.monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
+ mce_panic(msg, NULL, NULL);
+
+ ret = 1;
+ goto out;
+ }
+ *t -= SPINUNIT;
+
+out:
+ touch_nmi_watchdog();
+
+ instrumentation_end();
+
+ return ret;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal, panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable
+ * case and also make sure that all CPUs' errors are examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in that case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's OK to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ struct mce_hw_err *err = NULL;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ int cpu;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
+ struct mce *mtmp = &etmp->m;
+
+ if (mtmp->severity > global_worst) {
+ global_worst = mtmp->severity;
+ err = &per_cpu(hw_errs_seen, cpu);
+ m = &err->m;
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY) {
+ /* call mce_severity() to get "msg" for panic */
+ mce_severity(m, NULL, &msg, true);
+ mce_panic("Fatal machine check", err, msg);
+ }
+
+ /*
+ * For a UC error somewhere, we let the CPU that detected it handle
+ * it. We must also let the others continue, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (global_worst <= MCE_KEEP_SEVERITY)
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the hw_errs_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
+}
+
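+/* Accumulated "no way out" votes of all CPUs during the rendezvous. */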
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires a panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static noinstr int mce_start(int *no_way_out)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int order, ret = -1;
+
+ if (!timeout)
+ return ret;
+
+ raw_atomic_add(*no_way_out, &global_nwo);
+ /*
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
+ */
+ order = raw_atomic_inc_return(&mce_callin);
+ arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
+ /*
+ * Wait for everyone.
+ */
+ while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
+ raw_atomic_set(&global_nwo, 0);
+ goto out;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ raw_atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way, an error in a shared bank is seen by only one
+ * CPU before being cleared, avoiding duplicates.
+ */
+ while (raw_atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
+ raw_atomic_set(&global_nwo, 0);
+ goto out;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = raw_atomic_read(&global_nwo);
+
+ ret = order;
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static noinstr int mce_end(int order)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= num_online_cpus()) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+static __always_inline void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (arch_test_bit(i, toclear))
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+ }
+}
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static noinstr bool mce_check_crashing_cpu(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if (arch_cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ native_wrmsrq(MSR_IA32_MCG_STATUS, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
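+/*
+ * Scan all banks for this exception: log valid errors, track the worst
+ * severity in *worst/*final and mark banks to be cleared in *toclear.
+ * Returns the number of events that warrant tainting the kernel.
+ */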
+static __always_inline int
+__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
+ struct mce_hw_err *final, unsigned long *toclear,
+ unsigned long *valid_banks, int no_way_out, int *worst)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i, taint = 0;
+ struct mce *m = &err->m;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ arch___clear_bit(i, toclear);
+ if (!arch_test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ taint++;
+
+ severity = mce_severity(m, regs, NULL, true);
+
+ /*
+ * If the machine check was for a corrected/deferred error,
+ * leave it to the dedicated handler; don't touch it here
+ * unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ arch___set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(err, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ /*
+ * Enable instrumentation around the mce_log() call which is
+ * done in #MC context, where instrumentation is disabled.
+ */
+ instrumentation_begin();
+ mce_log(err);
+ instrumentation_end();
+
+ if (severity > *worst) {
+ *final = *err;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *err = *final;
+
+ return taint;
+}
+
+static void kill_me_now(struct callback_head *ch)
+{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
+ force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+ unsigned long pfn;
+ int ret;
+
+ p->mce_count = 0;
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+
+ if (!p->mce_ripv)
+ flags |= MF_MUST_KILL;
+
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ ret = memory_failure(pfn, flags);
+ if (!ret) {
+ set_mce_nospec(pfn);
+ sync_core();
+ return;
+ }
+
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info,
+ * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
+ *
+ * In both cases, no further processing is required.
+ */
+ if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
+ return;
+
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+}
+
+static void kill_me_never(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ unsigned long pfn;
+
+ p->mce_count = 0;
+ pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
+}
+
+static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
+{
+ int count = ++current->mce_count;
+ struct mce *m = &err->m;
+
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+ current->mce_kill_me.func = func;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data",
+ err, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", err, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
+
+ task_work_add(current, &current->mce_kill_me, TWA_RESUME);
+}
+
+/* Handle unconfigured int18 (should never happen) */
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+ instrumentation_end();
+}
+
+/*
+ * The actual machine check handler. This only handles real exceptions when
+ * something got corrupted coming in through int 18.
+ *
+ * This is executed in #MC context not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However, some CPUs might be broken beyond repair,
+ * so always be careful when synchronizing with the others.
+ *
+ * Tracing and kprobes are disabled: if we interrupted a kernel context
+ * with IF=1, we need to minimize stack usage. There are also recursion
+ * issues: if the machine check was due to a failure of the memory
+ * backing the user stack, tracing that reads the user stack will cause
+ * potentially infinite recursion.
+ *
+ * Currently, the #MC handler calls out to a number of external facilities
+ * and, therefore, allows instrumentation around them. The optimal thing to
+ * have would be to do the absolutely minimal work required in #MC context
+ * and have instrumentation disabled only around that. Further processing can
+ * then happen in process context where instrumentation is allowed. Achieving
+ * that requires careful auditing and modifications. Until then, the code
+ * allows instrumentation temporarily, where required.
+ */
+noinstr void do_machine_check(struct pt_regs *regs)
+{
+ int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
+ struct mce_hw_err *final;
+ struct mce_hw_err err;
+ char *msg = NULL;
+ struct mce *m;
+
+ if (unlikely(mce_flags.p5))
+ return pentium_machine_check(regs);
+ else if (unlikely(mce_flags.winchip))
+ return winchip_machine_check(regs);
+ else if (unlikely(!mca_cfg.initialized))
+ return unexpected_machine_check(regs);
+
+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ goto clear;
+
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ order = -1;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE.
+ */
+ no_way_out = 0;
+
+ /*
+ * If kill_current_task is not set, there might be a way to recover from this
+ * error.
+ */
+ kill_current_task = 0;
+
+ /*
+ * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
+ * on Intel.
+ */
+ lmce = 1;
+
+ this_cpu_inc(mce_exception_count);
+
+ mce_gather_info(&err, regs);
+ m = &err.m;
+ m->tsc = rdtsc();
+
+ final = this_cpu_ptr(&hw_errs_seen);
+ *final = err;
+
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
+
+ barrier();
+
+ /*
+ * When there is no restart IP we might need to kill the task or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
+ */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
+ kill_current_task = 1;
+ /*
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel, Zhaoxin only.
+ */
+ if (m->cpuvendor == X86_VENDOR_INTEL ||
+ m->cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m->mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start().
+ * Go through all banks to the exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it.
+ */
+ if (lmce) {
+ if (no_way_out)
+ mce_panic("Fatal local machine check", &err, msg);
+ } else {
+ order = mce_start(&no_way_out);
+ }
+
+ taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (!lmce) {
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &err, msg);
+ }
+ } else {
+ /*
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
+ */
+ if (worst >= MCE_PANIC_SEVERITY) {
+ mce_severity(m, regs, &msg, true);
+ mce_panic("Local fatal machine check!", &err, msg);
+ }
+ }
+
+ /*
+ * Enable instrumentation around the external facilities like task_work_add()
+ * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
+ * properly would need a lot more involved reorganization.
+ */
+ instrumentation_begin();
+
+ if (taint)
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ if (worst != MCE_AR_SEVERITY && !kill_current_task)
+ goto out;
+
+ /* Fault was in user mode and we need to take some action */
+ if ((m->cs & 3) == 3) {
+ /* If this triggers there is no way to recover. Die hard. */
+ BUG_ON(!on_thread_stack() || !user_mode(regs));
+
+ if (!mce_usable_address(m))
+ queue_task_work(&err, msg, kill_me_now);
+ else
+ queue_task_work(&err, msg, kill_me_maybe);
+
+ } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(m)) {
+ struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
+ } else {
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m->kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &err, msg);
+ }
+
+ if (m->kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&err, msg, kill_me_never);
+ }
+
+out:
+ /* Given it didn't panic, mark it as recoverable */
+ hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
+ instrumentation_end();
+
+clear:
+ mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int flags)
+{
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
+}
+#endif
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static void __start_timer(struct timer_list *t, unsigned long interval)
+{
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!timer_pending(t) || time_before(when, t->expires))
+ mod_timer(t, round_jiffies(when));
+
+ local_irq_restore(flags);
+}
+
+static void mc_poll_banks_default(void)
+{
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+}
+
+void (*mc_poll_banks)(void) = mc_poll_banks_default;
+
+static bool should_enable_timer(unsigned long iv)
+{
+ return !mca_cfg.ignore_ce && iv;
+}
+
+static void mce_timer_fn(struct timer_list *t)
+{
+ struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
+ unsigned long iv;
+
+ WARN_ON(cpu_t != t);
+
+ iv = __this_cpu_read(mce_next_interval);
+
+ if (mce_available(this_cpu_ptr(&cpu_info)))
+ mc_poll_banks();
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase it.
+ */
+ if (mce_notify_irq())
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ else
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else if (should_enable_timer(iv)) {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
+}
+
+/*
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
+ */
+void mce_timer_kick(bool storm)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_set_storm_mode(storm);
+
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
+}
+
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
+}
+
+static void __mcheck_cpu_mce_banks_init(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u8 n_banks = this_cpu_read(mce_num_banks);
+ int i;
+
+ for (i = 0; i < n_banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ /*
+ * Init them all by default.
+ *
+ * The required vendor quirks will be applied before
+ * __mcheck_cpu_init_prepare_banks() does the final bank setup.
+ */
+ b->ctl = -1ULL;
+ b->init = true;
+ }
+}
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static void __mcheck_cpu_cap_init(void)
+{
+ u64 cap;
+ u8 b;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ b = cap & MCG_BANKCNT_MASK;
+
+ if (b > MAX_NR_BANKS) {
+ pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
+ smp_processor_id(), MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
+ }
+
+ this_cpu_write(mce_num_banks, b);
+
+ __mcheck_cpu_mce_banks_init();
+}
+
+static void __mcheck_cpu_init_generic(void)
+{
+ u64 cap;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
+static void __mcheck_cpu_init_prepare_banks(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u64 msrval;
+ int i;
+
+ /*
+ * Log the machine checks left over from the previous reset. Log them
+ * only, do not start processing them. That will happen in mcheck_late_init()
+ * when all consumers have been registered on the notifier chain.
+ */
+ if (mca_cfg.bootlog) {
+ mce_banks_t all_banks;
+
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
+ }
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (!b->init)
+ continue;
+
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+
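+ /*
+ * Read CTL back: if none of the enable bits stuck, treat the
+ * bank as not initialized from here on.
+ */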
+ rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
+ b->init = !!msrval;
+ }
+}
+
+static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
+ /*
+ * Lots of broken BIOSes around that don't clear the banks
+ * by default and leave junk in there. Don't log:
+ */
+ mca_cfg.bootlog = 0;
+ }
+
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+}
+
+static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ /* Older CPUs (prior to family 6) don't need quirks. */
+ if (c->x86_vfm < INTEL_PENTIUM_PRO)
+ return;
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
+ mca_cfg.bootlog = 0;
+
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascade Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
+}
+
+static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 5)
+ return false;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ intel_p5_mcheck_init(c);
+ mce_flags.p5 = 1;
+ return true;
+ case X86_VENDOR_CENTAUR:
+ winchip_mcheck_init(c);
+ mce_flags.winchip = 1;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ /*
+ * All newer Centaur CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+ c->x86 > 6) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ mce_amd_feature_init(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ mce_centaur_feature_init(c);
+ break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_clear(c);
+ break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void mce_start_timer(struct timer_list *t)
+{
+ unsigned long iv = check_interval * HZ;
+
+ if (should_enable_timer(iv)) {
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+ mce_start_timer(t);
+}
+
+bool filter_mce(struct mce *m)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return amd_filter_mce(m);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return intel_filter_mce(m);
+
+ return false;
+}
+
+static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ WARN_ON_ONCE(user_mode(regs));
+
+ /*
+ * Only required when from kernel mode. See
+ * mce_check_crashing_cpu() for details.
+ */
+ if (mca_cfg.initialized && mce_check_crashing_cpu())
+ return;
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ do_machine_check(regs);
+
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+{
+ irqentry_enter_from_user_mode(regs);
+
+ do_machine_check(regs);
+
+ irqentry_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* MCE hit kernel mode */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+
+/* The user mode variant. */
+DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_user(regs);
+ local_db_restore(dr7);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * Depending on the ring level at which it occurred, i.e., from user or
+ * kernel context, a #MC needs to be handled on a different stack: a user
+ * #MC on the current task stack, a kernel #MC on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception handler:
+ * a ring 3 event on the level 0 stack, i.e., the current task stack; a
+ * ring 0 event on the #MC dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
+ * stub doesn't do any stack switching.
+ */
+DEFINE_FREDENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+#else
+/* 32bit unified entry point */
+DEFINE_IDTENTRY_RAW(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+
+void mca_bsp_init(struct cpuinfo_x86 *c)
+{
+ u64 cap;
+
+ if (!mce_available(c))
+ return;
+
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ mca_cfg.disabled = 1;
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return;
+ }
+
+ mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
+ mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR);
+ mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA);
+
+ if (mce_flags.smca)
+ smca_bsp_init();
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mca_cfg.ser = 1;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ amd_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_INTEL:
+ intel_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ zhaoxin_apply_global_quirks(c);
+ break;
+ }
+
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = 0;
+ if (mca_cfg.bootlog != 0)
+ mca_cfg.panic_timeout = 30;
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (__mcheck_cpu_ancient_init(c))
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ __mcheck_cpu_cap_init();
+
+ if (!mce_gen_pool_init()) {
+ mca_cfg.disabled = 1;
+ pr_emerg("Couldn't allocate MCE records pool!\n");
+ return;
+ }
+
+ mca_cfg.initialized = 1;
+
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_prepare_banks();
+ __mcheck_cpu_setup_timer();
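+ /*
+ * Finally set CR4.MCE: until this bit is set, a machine check puts
+ * the processor into shutdown state instead of raising #MC.
+ */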
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ /*
+ * If general x86 settings ever need clearing, a
+ * __mcheck_cpu_clear_generic(c) call would go here.
+ */
+ __mcheck_cpu_clear_vendor(c);
+}
+
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= this_cpu_read(mce_num_banks)) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=print_all Print all machine check logs to console
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=monarchtimeout (number)
+ *	monarchtimeout is how long to wait for other CPUs on machine
+ *	check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ *	and older.
+ * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable copy_mc_fragile()
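+ *
+ * Each "mce=" token carries a single option; repeat the parameter to
+ * combine several, e.g. "mce=no_cmci mce=dont_log_ce".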
+ */
+static int __init mcheck_enable(char *str)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ if (*str == 0) {
+ enable_p5_mce();
+ return 1;
+ }
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ cfg->disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ cfg->dont_log_ce = true;
+ else if (!strcmp(str, "print_all"))
+ cfg->print_all = true;
+ else if (!strcmp(str, "ignore_ce"))
+ cfg->ignore_ce = true;
+ else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = 1;
+ else if (!strcmp(str, "recovery"))
+ cfg->recovery = 1;
+ else if (isdigit(str[0]))
+ get_option(&str, &(cfg->monarch_timeout));
+ else {
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
+ return 0;
+ }
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+int __init mcheck_init(void)
+{
+ mce_register_decode_chain(&early_nb);
+ mce_register_decode_chain(&mce_uc_nb);
+ mce_register_decode_chain(&mce_default_nb);
+
+ INIT_WORK(&mce_work, mce_gen_pool_process);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
+ return 0;
+}
+
+/*
+ * mce_syscore: PM support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static void mce_disable_error_reporting(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
+ }
+}
+
+static void vendor_disable_error_reporting(void)
+{
+ /*
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
+ return;
+
+ mce_disable_error_reporting();
+}
+
+static int mce_syscore_suspend(void *data)
+{
+ vendor_disable_error_reporting();
+ return 0;
+}
+
+static void mce_syscore_shutdown(void *data)
+{
+ vendor_disable_error_reporting();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static void mce_syscore_resume(void *data)
+{
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
+ __mcheck_cpu_init_prepare_banks();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+static const struct syscore_ops mce_syscore_ops = {
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
+};
+
+static struct syscore mce_syscore = {
+ .ops = &mce_syscore_ops,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
+static void mce_cpu_restart(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_prepare_banks();
+ __mcheck_cpu_init_timer();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ mce_timer_delete_all();
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+ mce_schedule_work();
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_cmci(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_reenable();
+ cmci_recheck();
+ if (all)
+ __mcheck_cpu_init_timer();
+}
+
+static const struct bus_type mce_subsys = {
+ .name = "machinecheck",
+ .dev_name = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct device *, mce_device);
+
+static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
+{
+ return container_of(attr, struct mce_bank_dev, attr);
+}
+
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
+ char *buf)
+{
+ u8 bank = attr_to_bank(attr)->bank;
+ struct mce_bank *b;
+
+ if (bank >= per_cpu(mce_num_banks, s->id))
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks_array, s->id)[bank];
+
+ if (!b->init)
+ return -ENODEV;
+
+ return sprintf(buf, "%llx\n", b->ctl);
+}
+
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u8 bank = attr_to_bank(attr)->bank;
+ struct mce_bank *b;
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (bank >= per_cpu(mce_num_banks, s->id))
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks_array, s->id)[bank];
+ if (!b->init)
+ return -ENODEV;
+
+ b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.ignore_ce ^ !!new) {
+ if (new) {
+ /* disable ce features */
+ mce_timer_delete_all();
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.ignore_ce = true;
+ } else {
+ /* enable ce features */
+ mca_cfg.ignore_ce = false;
+ on_each_cpu(mce_enable_ce, (void *)1, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.cmci_disabled ^ !!new) {
+ if (new) {
+ /* disable cmci */
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.cmci_disabled = true;
+ } else {
+ /* enable cmci */
+ mca_cfg.cmci_disabled = false;
+ on_each_cpu(mce_enable_ce, NULL, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long old_check_interval = check_interval;
+ ssize_t ret = device_store_ulong(s, attr, buf, size);
+
+ if (check_interval == old_check_interval)
+ return ret;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return ret;
+}
+
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
+
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
+ &check_interval
+};
+
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
+};
+
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
+};
+
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
+ &dev_attr_trigger,
+#endif
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_print_all.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
+ NULL
+};
+
+static cpumask_var_t mce_device_initialized;
+
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/* Per CPU device init. All of the CPUs still share the same bank device: */
+static int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
+ int err;
+ int i, j;
+
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
+
+ err = device_register(dev);
+ if (err) {
+ put_device(dev);
+ return err;
+ }
+
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
+ err = device_create_file(dev, &mce_bank_devs[j].attr);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = dev;
+
+ return 0;
+error2:
+ while (--j >= 0)
+ device_remove_file(dev, &mce_bank_devs[j].attr);
+error:
+ while (--i >= 0)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ device_unregister(dev);
+
+ return err;
+}
+
+static void mce_device_remove(unsigned int cpu)
+{
+ struct device *dev = per_cpu(mce_device, cpu);
+ int i;
+
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
+ return;
+
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
+ device_remove_file(dev, &mce_bank_devs[i].attr);
+
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = NULL;
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_clear();
+
+ vendor_disable_error_reporting();
+}
+
+static void mce_reenable_cpu(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ int i;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_reenable();
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ }
+}
+
+static int mce_cpu_dead(unsigned int cpu)
+{
+ /* intentionally ignoring frozen here */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_device_create(cpu);
+ mce_threshold_create_device(cpu);
+ mce_reenable_cpu();
+ mce_start_timer(t);
+ return 0;
+}
+
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_disable_cpu();
+ timer_delete_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
+
+static __init void mce_init_banks(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_BANKS; i++) {
+ struct mce_bank_dev *b = &mce_bank_devs[i];
+ struct device_attribute *a = &b->attr;
+
+ b->bank = i;
+
+ sysfs_attr_init(&a->attr);
+ a->attr.name = b->attrname;
+ snprintf(b->attrname, ATTR_LEN, "bank%d", i);
+
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+}
+
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
+static __init int mcheck_init_device(void)
+{
+ int err;
+
+ /*
+ * Check if we have a spare virtual bit. This will only become
+ * a problem if/when we move beyond 5-level page tables.
+ */
+ MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mce_init_banks();
+
+ err = subsys_system_register(&mce_subsys, NULL);
+ if (err)
+ goto err_out_mem;
+
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
+
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+
+ register_syscore(&mce_syscore);
+
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
+
+ return err;
+}
+device_initcall_sync(mcheck_init_device);
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mca_cfg.disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *mce_get_debugfs_dir(void)
+{
+ static struct dentry *dmce;
+
+ if (!dmce)
+ dmce = debugfs_create_dir("mce", NULL);
+
+ return dmce;
+}
+
+static void mce_reset(void)
+{
+ atomic_set(&mce_fake_panicked, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
+}
+
+static int fake_panic_get(void *data, u64 *val)
+{
+ *val = fake_panic;
+ return 0;
+}
+
+static int fake_panic_set(void *data, u64 val)
+{
+ mce_reset();
+ fake_panic = val;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
+ "%llu\n");
+
+static void __init mcheck_debugfs_init(void)
+{
+ struct dentry *dmce;
+
+ dmce = mce_get_debugfs_dir();
+ debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
+ &fake_panic_fops);
+}
+#else
+static void __init mcheck_debugfs_init(void) { }
+#endif
+
+static int __init mcheck_late_init(void)
+{
+ if (mca_cfg.recovery)
+ enable_copy_mc_fragile();
+
+ mcheck_debugfs_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
new file mode 100644
index 000000000000..8d023239ce18
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -0,0 +1,365 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "internal.h"
+
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. It
+ * also keeps MCEs separate from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer *mcelog;
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int entry;
+
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ entry = mcelog->next;
+
+ /*
+ * When the buffer fills up discard new entries. Assume that the
+ * earlier errors are the more interesting ones:
+ */
+ if (entry >= mcelog->len) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags);
+ goto unlock;
+ }
+
+ mcelog->next = entry + 1;
+
+ memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
+ mcelog->entry[entry].finished = 1;
+ mcelog->entry[entry].kflags = 0;
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+unlock:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ mce->kflags |= MCE_HANDLED_MCELOG;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strscpy(mce_helper, buf, sizeof(mce_helper));
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * Ideally the record would be cleared only after it has been
+ * flushed to disk or sent over the network by /sbin/mcelog, but
+ * we have no interface to support that now, so just clear it
+ * here to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned next;
+ int i, err;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < mcelog->len * sizeof(struct mce))
+ goto out;
+
+ next = mcelog->next;
+ err = 0;
+
+ for (i = 0; i < next; i++) {
+ struct mce *m = &mcelog->entry[i];
+
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+ }
+
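+ /* Reads are destructive: drain the buffer and reset it for new entries. */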
+ memset(mcelog->entry, 0, next * sizeof(struct mce));
+ mcelog->next = 0;
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ return err ? err : buf - ubuf;
+}
+
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog->next))
+ return EPOLLIN | EPOLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(mcelog->len, p);
+ case MCE_GETCLEAR_FLAGS:
+ return put_user(xchg(&mcelog->flags, 0), p);
+ default:
+ return -ENOTTY;
+ }
+}
+
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
+
+void mce_unregister_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * Without MCE/MCA support there are cases where real MSR
+ * reads could slip through, so refuse injection outright.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ return -EINVAL;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffy or two later everywhere.
+ */
+ schedule_timeout_uninterruptible(2);
+
+ blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+ return usize;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int mce_log_len;
+ int err;
+
+ mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
+ mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL);
+ if (!mcelog)
+ return -ENOMEM;
+
+ memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ mcelog->len = mce_log_len;
+ mcelog->recordlen = sizeof(struct mce);
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ if (err == -EBUSY)
+ /* Xen dom0 might have registered the device already. */
+ pr_info("Unable to init device /dev/mcelog, already registered");
+ else
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
+ kfree(mcelog);
+ return err;
+ }
+
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
new file mode 100644
index 000000000000..3ca9c007a666
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
+ */
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+
+/*
+ * Compare the record "t" with each of the records on list "l" to see if
+ * an equivalent one is present in the list.
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+ struct mce_hw_err *err1, *err2;
+ struct mce_evt_llist *node;
+
+ err1 = &t->err;
+
+ llist_for_each_entry(node, &l->llnode, llnode) {
+ err2 = &node->err;
+
+ if (!mce_cmp(&err1->m, &err2->m))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * The system has panicked - we'd like to peruse the list of MCE records
+ * that have been queued, but not seen by anyone yet. The list is in
+ * reverse time order, so we need to reverse it. While doing that we can
+ * also drop duplicate records (these were logged because some banks are
+ * shared between cores or by all threads on a socket).
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+ struct llist_node *head;
+ LLIST_HEAD(new_head);
+ struct mce_evt_llist *node, *t;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return NULL;
+
+ /* squeeze out duplicates while reversing order */
+ llist_for_each_entry_safe(node, t, head, llnode) {
+ if (!is_duplicate_mce_record(node, t))
+ llist_add(&node->llnode, &new_head);
+ }
+
+ return new_head.first;
+}
+
+void mce_gen_pool_process(struct work_struct *__unused)
+{
+ struct mce_evt_llist *node, *tmp;
+ struct llist_node *head;
+ struct mce *mce;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return;
+
+ head = llist_reverse_order(head);
+ llist_for_each_entry_safe(node, tmp, head, llnode) {
+ mce = &node->err.m;
+ blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+ }
+}
+
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+bool mce_gen_pool_add(struct mce_hw_err *err)
+{
+ struct mce_evt_llist *node;
+
+ if (filter_mce(&err->m))
+ return false;
+
+ if (!mce_evt_pool)
+ return false;
+
+ node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+ if (!node) {
+ pr_warn_ratelimited("MCE records pool full!\n");
+ return false;
+ }
+
+ memcpy(&node->err, err, sizeof(*err));
+ llist_add(&node->llnode, &mce_event_llist);
+
+ return true;
+}
+
+static bool mce_gen_pool_create(void)
+{
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return false;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
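+ /* E.g. 128 possible CPUs: max(80, 2 * 128) = 256 records of 2^order bytes each. */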
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return false;
+ }
+
+ if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return false;
+ }
+
+ mce_evt_pool = gpool;
+
+ return true;
+}
+
+bool mce_gen_pool_init(void)
+{
+ /* Just init mce_gen_pool once. */
+ if (mce_evt_pool)
+ return true;
+
+ return mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
new file mode 100644
index 000000000000..d02c4f556cd0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -0,0 +1,805 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd/nb.h>
+#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "internal.h"
+
+static bool hw_injection_possible = true;
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+#define MAX_FLAG_OPT_SIZE 4
+#define NBCFG 0x44
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+MCE_INJECT_GET(ipid);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+/* Use the user provided IPID value on a sw injection. */
+static int inj_ipid_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ if (inj_type == SW_INJ)
+ m->ipid = val;
+ }
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->time = ktime_get_real_seconds();
+ m->cpuid = cpuid_eax(1);
+ m->microcode = boot_cpu_data.microcode;
+}
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure no one reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+static void raise_poll(struct mce *m)
+{
+ unsigned long flags;
+ mce_banks_t b;
+
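+ /* All bits set: poll every bank. */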
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ local_irq_save(flags);
+ machine_check_poll(0, &b);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
+{
+ struct pt_regs regs;
+ unsigned long flags;
+
+ if (!pregs) {
+ memset(&regs, 0, sizeof(struct pt_regs));
+ regs.ip = m->ip;
+ regs.cs = m->cs;
+ pregs = &regs;
+ }
+ /* do_machine_check() expects interrupts disabled -- at least */
+ local_irq_save(flags);
+ do_machine_check(pregs);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
+
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+ if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+ return NMI_DONE;
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ if (m->inject_flags & MCJ_EXCEPTION)
+ raise_exception(m, regs);
+ else if (m->status)
+ raise_poll(m);
+ return NMI_HANDLED;
+}
+
+static void mce_irq_ipi(void *info)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+
+ if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+ m->inject_flags & MCJ_EXCEPTION) {
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ raise_exception(m, NULL);
+ }
+}
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+ struct mce *m = this_cpu_ptr(&injectm);
+ int context = MCJ_CTX(m->inject_flags);
+ int ret = 0;
+ int cpu = m->extcpu;
+
+ if (m->inject_flags & MCJ_EXCEPTION) {
+ pr_info("Triggering MCE exception on CPU %d\n", cpu);
+ switch (context) {
+ case MCJ_CTX_IRQ:
+ /*
+ * Could do more to fake interrupts like
+ * calling irq_enter, but the necessary
+ * machinery isn't exported currently.
+ */
+ fallthrough;
+ case MCJ_CTX_PROCESS:
+ raise_exception(m, NULL);
+ break;
+ default:
+ pr_info("Invalid MCE context\n");
+ ret = -EINVAL;
+ }
+ pr_info("MCE exception done on CPU %d\n", cpu);
+ } else if (m->status) {
+ pr_info("Starting machine check poll CPU %d\n", cpu);
+ raise_poll(m);
+ pr_info("Machine check poll done on CPU %d\n", cpu);
+ } else
+ m->finished = 0;
+
+ return ret;
+}
+
+static void __maybe_unused raise_mce(struct mce *m)
+{
+ int context = MCJ_CTX(m->inject_flags);
+
+ inject_mce(m);
+
+ if (context == MCJ_CTX_RANDOM)
+ return;
+
+ if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
+ unsigned long start;
+ int cpu;
+
+ cpus_read_lock();
+ cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+ cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
+ for_each_online_cpu(cpu) {
+ struct mce *mcpu = &per_cpu(injectm, cpu);
+ if (!mcpu->finished ||
+ MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ }
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BROADCAST) {
+ /*
+ * Don't wait for completion: the remote handlers
+ * must run in sync with the raise_local() below.
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
+ }
+ start = jiffies;
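+ /* Allow up to 2s for every targeted CPU to consume its injected MCE. */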
+ while (!cpumask_empty(mce_inject_cpumask)) {
+ if (!time_before(jiffies, start + 2*HZ)) {
+ pr_err("Timeout waiting for mce inject %lx\n",
+ *cpumask_bits(mce_inject_cpumask));
+ break;
+ }
+ cpu_relax();
+ }
+ raise_local();
+ put_cpu();
+ cpus_read_unlock();
+ } else {
+ preempt_disable();
+ raise_local();
+ preempt_enable();
+ }
+}
+
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_inject_mutex);
+ raise_mce(m);
+ mutex_unlock(&mce_inject_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Caller needs to make sure this CPU doesn't disappear
+ * from under us, i.e. via get_cpu()/put_cpu().
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);
+
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ if (i > SW_INJ && !hw_injection_possible)
+ continue;
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+ return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ u32 cores_per_node;
+
+ cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg();
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ } else {
+ wrmsrq(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr);
+
+ if (m.misc)
+ wrmsrq(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+static void do_inject(void)
+{
+ unsigned int cpu = i_mce.extcpu;
+ struct mce_hw_err err;
+ u64 mcg_status = 0;
+ u8 b = i_mce.bank;
+
+ i_mce.tsc = rdtsc_ordered();
+
+ i_mce.status |= MCI_STATUS_VAL;
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
+ if (inj_type == SW_INJ) {
+ err.m = i_mce;
+ mce_log(&err);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
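+ /* PCC (processor context corrupt) clear => the restart IP is still valid. */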
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;
+
+ /*
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+ */
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status &= ~MCI_STATUS_UC;
+ }
+
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
+ toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu));
+ cpu = get_nbc_for_node(topology_amd_node_id(cpu));
+ }
+
+ cpus_read_lock();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+ toggle_hw_mce_inject(cpu, false);
+
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
+
+err:
+ cpus_read_unlock();
+}
+
+/*
+ * This denotes into which bank we're injecting and, at the same time,
+ * triggers the injection.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+ u8 n_banks;
+ u64 cap;
+
+ /* Get bank count on target CPU so we can handle non-uniform values. */
+ rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ if (val >= n_banks) {
+ pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+
+ /*
+ * SW-only injection allows writing arbitrary values into the MCA
+ * registers because it tests only the decoding paths.
+ */
+ if (inj_type == SW_INJ)
+ goto inject;
+
+ /*
+ * Read IPID value to determine if a bank is populated on the target
+ * CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ u64 ipid;
+
+ if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ pr_err("Error reading IPID on CPU%d\n", m->extcpu);
+ return -EINVAL;
+ }
+
+ if (!ipid) {
+ pr_err("Cannot inject into unpopulated bank %llu\n", val);
+ return -ENODEV;
+ }
+ }
+
+inject:
+ do_inject();
+
+ /* Reset injection struct */
+ setup_inj_struct(&i_mce);
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n"
+"ipid:\t IPID (AMD-specific)\n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static void __init debugfs_init(void)
+{
+ unsigned int i;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
+ debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
+ &i_mce, dfs_fls[i].fops);
+}
+
+static void check_hw_inj_possible(void)
+{
+ int cpu;
+ u8 bank;
+
+ /*
+ * This behavior exists only on SMCA systems, though it is not
+ * directly related to SMCA.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SMCA))
+ return;
+
+ cpu = get_cpu();
+
+ for (bank = 0; bank < MAX_NR_BANKS; ++bank) {
+ u64 status = MCI_STATUS_VAL, ipid;
+
+ /* Check whether bank is populated */
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ if (!ipid)
+ continue;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0);
+
+ if (!status) {
+ hw_injection_possible = false;
+ pr_warn("Platform does not allow *hardware* error injection."
+ "Try using APEI EINJ instead.\n");
+ }
+
+ toggle_hw_mce_inject(cpu, false);
+
+ break;
+ }
+
+ put_cpu();
+}
+
+static int __init inject_init(void)
+{
+ if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ check_hw_inj_possible();
+
+ debugfs_init();
+
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
+ pr_info("Machine check injector initialized\n");
+
+ return 0;
+}
+
+static void __exit inject_exit(void)
+{
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
+module_init(inject_init);
+module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
new file mode 100644
index 000000000000..4655223ba560
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+
+/*
+ * On systems that do support CMCI but it's disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by the CPUs in the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
+
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
+#define CMCI_THRESHOLD 1
+
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
+
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when someone asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
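+/* 32749 == 0x7fed: just below the architectural maximum of 0x7fff. */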
+
+static bool cmci_supported(int *banks)
+{
+ u64 cap;
+
+ if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
+ return false;
+
+ /*
+ * Vendor check is not strictly needed, but the initialization is
+ * vendor-keyed and this makes sure none of the backdoors are
+ * entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
+ return !!(cap & MCG_CMCI_P);
+}
+
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
+ */
+ rdmsrq(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
+
+ return tmp & FEAT_CTL_LMCE_ENABLED;
+}
+
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
+{
+ unsigned long flags;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+void mce_intel_handle_storm(int bank, bool on)
+{
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+}
+
+/*
+ * Check all the reasons why the current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
+{
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+
+ if (test_bit(bank, owned))
+ return true;
+
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
+
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), *val);
+
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set the Linux default.
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
+
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ return val;
+}
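+
+/*
+ * For illustration (threshold value assumed): without bios_cmci_threshold,
+ * a bank whose CTL2 threshold field reads 0x40 is rewritten to
+ * CMCI_THRESHOLD (1); with the option, 0x40 is honored and only a zero
+ * threshold is bumped up to CMCI_THRESHOLD.
+ */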
+
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
+ }
+
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
+
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
+
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
+ */
+static void cmci_discover(int banks)
+{
+ int bios_wrong_thresh = 0;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+ int bios_zero_thresh = 0;
+
+ if (cmci_skip_bank(i, &val))
+ continue;
+
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
+}
+
+/*
+ * Just in case we missed an event during initialization, check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
+ return;
+
+ local_irq_save(flags);
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
+ return;
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ unsigned long flags;
+ int i;
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_rediscover_work_func(void *arg)
+{
+ int banks;
+
+ /* Recheck banks in case CPUs don't all have the same number */
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+/* After a CPU went down cycle through all the others and rediscover */
+void cmci_rediscover(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ on_each_cpu(cmci_rediscover_work_func, NULL, 1);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+ spin_lock(&cmci_poll_lock);
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ spin_unlock(&cmci_poll_lock);
+}
+
+void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks)) {
+ mc_poll_banks = cmci_mc_poll_banks;
+ return;
+ }
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks);
+ /*
+ * For CPU #0 this runs with the APIC still disabled, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure we
+ * don't miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
+void intel_clear_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val);
+}
+
+/*
+ * Enable additional error logs from the integrated
+ * memory controller on processors that support this.
+ */
+static void intel_imc_init(struct cpuinfo_x86 *c)
+{
+ u64 error_control;
+
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
+ if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control))
+ return;
+ error_control |= 2;
+ wrmsrq_safe(MSR_ERROR_CONTROL, error_control);
+ break;
+ }
+}
+
+static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+.
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later; merely don't write CTL0.
+ *
+ * Older CPUs (prior to family 6) can't reach this point and already
+ * return early due to the check of __mcheck_cpu_ancient_init().
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ this_cpu_ptr(mce_banks_array)[0].init = false;
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+ intel_apply_cpu_quirks(c);
+ intel_init_cmci();
+ intel_init_lmce();
+ intel_imc_init(c);
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+ cmci_clear();
+}
+
+bool intel_filter_mce(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* MCE errata HSD131, HSM142, HSW131, BDM48 and SKX37 */
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
+ (m->bank == 0) &&
+ ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV))
+ return false;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return false;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
new file mode 100644
index 000000000000..a31cf984619c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
+#undef pr_fmt
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/device.h>
+#include <asm/mce.h>
+
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+extern struct blocking_notifier_head x86_mce_decoder_chain;
+
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
+
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce_hw_err err;
+};
+
+void mce_gen_pool_process(struct work_struct *__unused);
+bool mce_gen_pool_empty(void);
+bool mce_gen_pool_add(struct mce_hw_err *err);
+bool mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
+
+int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
+struct dentry *mce_get_debugfs_dir(void);
+
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_handle_storm(int bank, bool on);
+void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
+bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
+#else
+static inline void mce_intel_handle_storm(int bank, bool on) { }
+static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
+static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
+#endif
+
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+u32 mce_get_apei_thr_limit(void);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+static inline u32 mce_get_apei_thr_limit(void) { return 0; }
+#endif
+
+/*
+ * history: Bitmask tracking the occurrence of errors. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of a machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmask in the history
+ * field of struct storm_bank, the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
+ */
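+/* Note the inverted sense: returns true when the two records differ. */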
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+ return m1->bank != m2->bank ||
+ m1->status != m2->status ||
+ m1->addr != m2->addr ||
+ m1->misc != m2->misc;
+}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
+#else
+static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
+#endif
+
+struct mca_config {
+ __u64 lmce_disabled : 1,
+ disabled : 1,
+ ser : 1,
+ recovery : 1,
+ bios_cmci_threshold : 1,
+ /* Proper #MC exception handler is set */
+ initialized : 1,
+ __reserved : 58;
+
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+ bool print_all;
+
+ int monarch_timeout;
+ int panic_timeout;
+ u32 rip_msr;
+ s8 bootlog;
+};
+
+extern struct mca_config mca_cfg;
+DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
+struct mce_vendor_flags {
+ /*
+ * When set, indicates that overflow conditions are not fatal.
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+ * Recovery. It indicates support for data poisoning in HW and deferred
+ * error interrupts.
+ */
+ succor : 1,
+
+ /*
+ * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+ * the register space for each MCA bank and also increases the number of
+ * banks. Also, to accommodate the new banks and registers, the MCA
+ * register space is moved to a new MSR range.
+ */
+ smca : 1,
+
+ /* Zen IFU quirk */
+ zen_ifu_quirk : 1,
+
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ /* Pentium, family 5-style MCA */
+ p5 : 1,
+
+ /* Centaur Winchip C6-style MCA */
+ winchip : 1,
+
+ /* SandyBridge IFU quirk */
+ snb_ifu_quirk : 1,
+
+ /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 55;
+};
+
+extern struct mce_vendor_flags mce_flags;
+
+struct mce_bank {
+ /* subevents to enable */
+ u64 ctl;
+
+ /* initialise bank? */
+ __u64 init : 1,
+
+ /*
+ * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates
+ * the LSB field is found in MCA_STATUS and not in MCA_ADDR.
+ */
+ lsb_in_status : 1,
+
+ __reserved_1 : 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+enum mca_msr {
+ MCA_CTL,
+ MCA_STATUS,
+ MCA_ADDR,
+ MCA_MISC,
+};
+
+/* Decide whether to add MCE record to MCE event pool or filter it out. */
+extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
+
+#ifdef CONFIG_X86_MCE_AMD
+void mce_threshold_create_device(unsigned int cpu);
+void mce_threshold_remove_device(unsigned int cpu);
+void mce_amd_handle_storm(unsigned int bank, bool on);
+extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
+void amd_clear_bank(struct mce *m);
+
+/*
+ * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR.
+ */
+static __always_inline void smca_extract_err_addr(struct mce *m)
+{
+ u8 lsb;
+
+ if (!mce_flags.smca)
+ return;
+
+ if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+ lsb = (m->status >> 24) & 0x3f;
+
+ m->addr &= GENMASK_ULL(56, lsb);
+
+ return;
+ }
+
+ lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+}
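+
+/*
+ * Worked example (values assumed): with lsb_in_status set and
+ * MCA_STATUS[29:24] == 12, the address is valid only at 4K granularity,
+ * so bits [11:0] of m->addr are cleared and bits [56:12] are kept.
+ */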
+
+void smca_bsp_init(void);
+#else
+static inline void mce_threshold_create_device(unsigned int cpu) { }
+static inline void mce_threshold_remove_device(unsigned int cpu) { }
+static inline void mce_amd_handle_storm(unsigned int bank, bool on) { }
+static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
+static inline void amd_clear_bank(struct mce *m) { }
+static inline void smca_extract_err_addr(struct mce *m) { }
+static inline void smca_bsp_init(void) { }
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+noinstr void pentium_machine_check(struct pt_regs *regs);
+noinstr void winchip_machine_check(struct pt_regs *regs);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void enable_p5_mce(void) {}
+static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
+static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
+#endif
+
+noinstr u64 mce_rdmsrq(u32 msr);
+noinstr void mce_wrmsrq(u32 msr, u64 v);
+
+static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
+{
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ switch (reg) {
+ case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
+ }
+ }
+
+ switch (reg) {
+ case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
+ }
+
+ return 0;
+}
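+
+/*
+ * For illustration: mca_msr_reg(3, MCA_STATUS) resolves to the legacy
+ * MSR_IA32_MC3_STATUS (0x401 + 4 * bank = 0x40d) or, with SMCA enabled,
+ * to MSR_AMD64_SMCA_MC3_STATUS (0xc0002001 + 0x10 * bank = 0xc0002031).
+ */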
+
+extern void (*mc_poll_banks)(void);
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 5c0e6533d9bc..2272ad53fc33 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
@@ -5,36 +6,39 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/smp.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include "internal.h"
+
/* By default disabled */
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
+ instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
- printk(KERN_EMERG
- "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
- smp_processor_id(), loaddr, lotype);
+ pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
if (lotype & (1<<5)) {
- printk(KERN_EMERG
- "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
- smp_processor_id());
+ pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
}
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */
@@ -50,19 +54,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- machine_check_vector = pentium_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
- printk(KERN_INFO
- "Intel old style machine check architecture supported.\n");
+ pr_info("Intel old style machine check architecture supported.\n");
/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO
- "Intel old style machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
+ cr4_set_bits(X86_CR4_MCE);
+ pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
new file mode 100644
index 000000000000..2235a7477436
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include <asm/mce.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+
+#include "internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned short mcgmask;
+ unsigned short mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char excp;
+ unsigned char covered;
+ unsigned int cpu_vfm;
+ unsigned char cpu_minstepping;
+ unsigned char bank_lo, bank_hi;
+ char *msg;
+} severities[] = {
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define KERNEL_RECOV .context = IN_KERNEL_RECOV
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define EXCP .excp = EXCP_CONTEXT
+#define NOEXCP .excp = NO_EXCP
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
+
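+/*
+ * For illustration, the first entry below expands via the MCESEV/BITCLR
+ * helpers to roughly:
+ *
+ *	{ .sev = MCE_NO_SEVERITY, .msg = "Invalid",
+ *	  .mask = MCI_STATUS_VAL, .result = 0 }
+ *
+ * i.e. it matches any record whose VAL bit is clear.
+ */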
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ EXCP, BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
+ /* When MCIP is not set something is very confused */
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
+ /* Neither return nor error IP -- no chance to recover -> PANIC */
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
+ /*
+ * known AO MCACODs reported via MCE or CMC:
+ *
+ * SRAO could be signaled either via a machine check exception or
+ * CMCI with the corresponding bit S set to 1 or 0. So we don't need to
+ * check bit S for SRAO.
+ */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ ),
+ /*
+ * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured
+ * to report uncorrected errors using CMCI with a special signature.
+ * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported
+ * in one of the memory controller banks.
+ * Set severity to "AO" for same action as normal patrol scrub error.
+ */
+ MCESEV(
+ AO, "Uncorrected Patrol Scrub Error",
+ SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ ),
+
+ /* ignore OVER for UCNA */
+ MCESEV(
+ UCNA, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signaled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
+
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "Action required but unaffected thread is continuable",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+ ),
+ MCESEV(
+ AR, "Action required: data load in error recoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL_RECOV
+ ),
+ MCESEV(
+ AR, "Action required: data load error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
+ MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Data load in unrecoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
+#endif
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
+
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Uncorrected in kernel",
+ BITSET(MCI_STATUS_UC),
+ KERNEL
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
+};
+
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+ (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
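+/*
+ * Decode the instruction at the faulting IP to decide whether the machine
+ * check hit a copy-from-user style access (a MOV/MOVZX load or REP MOVS)
+ * and, if so, record the user virtual address being accessed.
+ */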
+static bool is_copy_from_user(struct pt_regs *regs)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ unsigned long addr;
+ struct insn insn;
+ int ret;
+
+ if (!regs)
+ return false;
+
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return false;
+
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return false;
+
+ switch (insn.opcode.value) {
+ /* MOV mem,reg */
+ case 0x8A: case 0x8B:
+ /* MOVZ mem,reg */
+ case 0xB60F: case 0xB70F:
+ addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ break;
+ /* REP MOVS */
+ case 0xA4: case 0xA5:
+ addr = regs->si;
+ break;
+ default:
+ return false;
+ }
+
+ if (fault_in_kernel_space(addr))
+ return false;
+
+ current->mce_vaddr = (void __user *)addr;
+
+ return true;
+}
+
+/*
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user mode from one
+ * taken in the kernel.
+ */
+static noinstr int error_context(struct mce *m, struct pt_regs *regs)
+{
+ int fixup_type;
+ bool copy_user;
+
+ if ((m->cs & 3) == 3)
+ return IN_USER;
+
+ if (!mc_recoverable(m->mcgstatus))
+ return IN_KERNEL;
+
+ /* Allow instrumentation around external facilities usage. */
+ instrumentation_begin();
+ fixup_type = ex_get_fixup_type(m->ip);
+ copy_user = is_copy_from_user(regs);
+ instrumentation_end();
+
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+
+ switch (fixup_type) {
+ case EX_TYPE_FAULT_MCE_SAFE:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ m->kflags |= MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+
+ default:
+ return IN_KERNEL;
+ }
+}
+
+/* See AMD PPR(s) section Machine Check Error Handling. */
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ char *panic_msg = NULL;
+ int ret;
+
+ /*
+ * Default return value: Action required, the error must be handled
+ * immediately.
+ */
+ ret = MCE_AR_SEVERITY;
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC) {
+ panic_msg = "Processor Context Corrupt";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (m->status & MCI_STATUS_DEFERRED) {
+ ret = MCE_DEFERRED_SEVERITY;
+ goto out;
+ }
+
+ /*
+ * If the UC bit is not set, the system either corrected or deferred
+ * the error. No action will be required after logging the error.
+ */
+ if (!(m->status & MCI_STATUS_UC)) {
+ ret = MCE_KEEP_SEVERITY;
+ goto out;
+ }
+
+ /*
+ * On MCA overflow, without the MCA overflow recovery feature the
+ * system will not be able to recover, panic.
+ */
+ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
+ panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (!mce_flags.succor) {
+ panic_msg = "Uncorrected error without MCA Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (error_context(m, regs) == IN_KERNEL) {
+ panic_msg = "Uncorrected unrecoverable error in kernel context";
+ ret = MCE_PANIC_SEVERITY;
+ }
+
+out:
+ if (msg && panic_msg)
+ *msg = panic_msg;
+
+ return ret;
+}
+
+static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
+ enum context ctx = error_context(m, regs);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((m->status & s->mask) != s->result)
+ continue;
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mca_cfg.ser)
+ continue;
+ if (s->ser == NO_SER && mca_cfg.ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (s->excp && excp != s->excp)
+ continue;
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
+ continue;
+ if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
+ continue;
+ if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi))
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+
+ return s->sev;
+ }
+}
+
+int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return mce_severity_amd(m, regs, msg, is_excp);
+ else
+ return mce_severity_intel(m, regs, msg, is_excp);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+ const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(severities); i++)
+ severities[i].covered = 0;
+ return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+ .llseek = seq_lseek,
+};
+
+static int __init severities_debugfs_init(void)
+{
+ struct dentry *dmce;
+
+ dmce = mce_get_debugfs_dir();
+
+ debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ return 0;
+}
+late_initcall(severities_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
new file mode 100644
index 000000000000..0d13c9ffcba0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+static u32 mce_apei_thr_limit;
+
+void mce_save_apei_thr_limit(u32 thr_limit)
+{
+ mce_apei_thr_limit = thr_limit;
+ pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
+}
+
+u32 mce_get_apei_thr_limit(void)
+{
+ return mce_apei_thr_limit;
+}
+
+static void default_threshold_interrupt(void)
+{
+ pr_err("Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
+{
+ trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+ apic_eoi();
+}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ case X86_VENDOR_AMD:
+ mce_amd_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ if (!mce_flags.amd_threshold)
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!--storm->stormy_bank_count)
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record roughly the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
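+	/*
+	 * Worked example (timing assumed): a CMCI arriving ~3 seconds after
+	 * the last check gives delta = 3*HZ, so shift = (3*HZ + HZ)/HZ = 4
+	 * and four stale bits of history are aged out.
+	 */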
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index 54060f565974..6c99f2941909 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
@@ -5,18 +6,23 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
-#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include "internal.h"
+
/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
+noinstr void winchip_machine_check(struct pt_regs *regs)
{
- printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
- add_taint(TAINT_MACHINE_CHECK);
+ instrumentation_begin();
+ pr_emerg("CPU0: Machine Check Exception.\n");
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */
@@ -24,17 +30,12 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
- printk(KERN_INFO
- "Winchip machine check reporting enabled on CPU#0.\n");
+ pr_info("Winchip machine check reporting enabled on CPU#0.\n");
}
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
deleted file mode 100644
index bb34b03af252..000000000000
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-obj-y = mce.o mce-severity.o
-
-obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
-obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
-obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
-obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
-obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
-obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
deleted file mode 100644
index 745b54f9be89..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Bridge between MCE and APEI
- *
- * On some machine, corrected memory errors are reported via APEI
- * generic hardware error source (GHES) instead of corrected Machine
- * Check. These corrected memory errors can be reported to user space
- * through /dev/mcelog via faking a corrected Machine Check, so that
- * the error memory page can be offlined by /sbin/mcelog if the error
- * count for one page is beyond the threshold.
- *
- * For fatal MCE, save MCE record into persistent storage via ERST, so
- * that the MCE record can be logged after reboot via ERST.
- *
- * Copyright 2010 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/kernel.h>
-#include <linux/acpi.h>
-#include <linux/cper.h>
-#include <acpi/apei.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
-{
- struct mce m;
-
- /* Only corrected MC is reported */
- if (!corrected)
- return;
-
- mce_setup(&m);
- m.bank = 1;
- /* Fake a memory read corrected error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
- m.addr = mem_err->physical_addr;
- mce_log(&m);
- mce_notify_irq();
-}
-EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
-
-#define CPER_CREATOR_MCE \
- UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
- 0x64, 0x90, 0xb8, 0x9d)
-#define CPER_SECTION_TYPE_MCE \
- UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
- 0x04, 0x4a, 0x38, 0xfc)
-
-/*
- * CPER specification (in UEFI specification 2.3 appendix N) requires
- * byte-packed.
- */
-struct cper_mce_record {
- struct cper_record_header hdr;
- struct cper_section_descriptor sec_hdr;
- struct mce mce;
-} __packed;
-
-int apei_write_mce(struct mce *m)
-{
- struct cper_mce_record rcd;
-
- memset(&rcd, 0, sizeof(rcd));
- memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
- rcd.hdr.revision = CPER_RECORD_REV;
- rcd.hdr.signature_end = CPER_SIG_END;
- rcd.hdr.section_count = 1;
- rcd.hdr.error_severity = CPER_SER_FATAL;
- /* timestamp, platform_id, partition_id are all invalid */
- rcd.hdr.validation_bits = 0;
- rcd.hdr.record_length = sizeof(rcd);
- rcd.hdr.creator_id = CPER_CREATOR_MCE;
- rcd.hdr.notification_type = CPER_NOTIFY_MCE;
- rcd.hdr.record_id = cper_next_record_id();
- rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
-
- rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
- rcd.sec_hdr.section_length = sizeof(rcd.mce);
- rcd.sec_hdr.revision = CPER_SEC_REV;
- /* fru_id and fru_text is invalid */
- rcd.sec_hdr.validation_bits = 0;
- rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
- rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
- rcd.sec_hdr.section_severity = CPER_SER_FATAL;
-
- memcpy(&rcd.mce, m, sizeof(*m));
-
- return erst_write(&rcd.hdr);
-}
-
-ssize_t apei_read_mce(struct mce *m, u64 *record_id)
-{
- struct cper_mce_record rcd;
- ssize_t len;
-
- len = erst_read_next(&rcd.hdr, sizeof(rcd));
- if (len <= 0)
- return len;
- /* Can not skip other records in storage via ERST unless clear them */
- else if (len != sizeof(rcd) ||
- uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
- if (printk_ratelimit())
- pr_warning(
- "MCE-APEI: Can not skip the unknown record in ERST");
- return -EIO;
- }
-
- memcpy(m, &rcd.mce, sizeof(*m));
- *record_id = rcd.hdr.record_id;
-
- return sizeof(*m);
-}
-
-/* Check whether there is record in ERST */
-int apei_check_mce(void)
-{
- return erst_get_record_count();
-}
-
-int apei_clear_mce(u64 record_id)
-{
- return erst_clear(record_id);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
deleted file mode 100644
index e7dbde7bfedb..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Machine check injection support.
- * Copyright 2008 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Authors:
- * Andi Kleen
- * Ying Huang
- */
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/timer.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/notifier.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/mce.h>
-#include <asm/apic.h>
-
-/* Update fake mce registers on current CPU. */
-static void inject_mce(struct mce *m)
-{
- struct mce *i = &per_cpu(injectm, m->extcpu);
-
- /* Make sure noone reads partially written injectm */
- i->finished = 0;
- mb();
- m->finished = 0;
- /* First set the fields after finished */
- i->extcpu = m->extcpu;
- mb();
- /* Now write record in order, finished last (except above) */
- memcpy(i, m, sizeof(struct mce));
- /* Finally activate it */
- mb();
- i->finished = 1;
-}
-
-static void raise_poll(struct mce *m)
-{
- unsigned long flags;
- mce_banks_t b;
-
- memset(&b, 0xff, sizeof(mce_banks_t));
- local_irq_save(flags);
- machine_check_poll(0, &b);
- local_irq_restore(flags);
- m->finished = 0;
-}
-
-static void raise_exception(struct mce *m, struct pt_regs *pregs)
-{
- struct pt_regs regs;
- unsigned long flags;
-
- if (!pregs) {
- memset(&regs, 0, sizeof(struct pt_regs));
- regs.ip = m->ip;
- regs.cs = m->cs;
- pregs = &regs;
- }
- /* in mcheck exeception handler, irq will be disabled */
- local_irq_save(flags);
- do_machine_check(pregs, 0);
- local_irq_restore(flags);
- m->finished = 0;
-}
-
-static cpumask_var_t mce_inject_cpumask;
-
-static int mce_raise_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct die_args *args = (struct die_args *)data;
- int cpu = smp_processor_id();
- struct mce *m = &__get_cpu_var(injectm);
- if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
- return NOTIFY_DONE;
- cpumask_clear_cpu(cpu, mce_inject_cpumask);
- if (m->inject_flags & MCJ_EXCEPTION)
- raise_exception(m, args->regs);
- else if (m->status)
- raise_poll(m);
- return NOTIFY_STOP;
-}
-
-static struct notifier_block mce_raise_nb = {
- .notifier_call = mce_raise_notify,
- .priority = 1000,
-};
-
-/* Inject mce on current CPU */
-static int raise_local(void)
-{
- struct mce *m = &__get_cpu_var(injectm);
- int context = MCJ_CTX(m->inject_flags);
- int ret = 0;
- int cpu = m->extcpu;
-
- if (m->inject_flags & MCJ_EXCEPTION) {
- printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
- switch (context) {
- case MCJ_CTX_IRQ:
- /*
- * Could do more to fake interrupts like
- * calling irq_enter, but the necessary
- * machinery isn't exported currently.
- */
- /*FALL THROUGH*/
- case MCJ_CTX_PROCESS:
- raise_exception(m, NULL);
- break;
- default:
- printk(KERN_INFO "Invalid MCE context\n");
- ret = -EINVAL;
- }
- printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
- } else if (m->status) {
- printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
- raise_poll(m);
- mce_notify_irq();
- printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
- } else
- m->finished = 0;
-
- return ret;
-}
-
-static void raise_mce(struct mce *m)
-{
- int context = MCJ_CTX(m->inject_flags);
-
- inject_mce(m);
-
- if (context == MCJ_CTX_RANDOM)
- return;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- if (m->inject_flags & MCJ_NMI_BROADCAST) {
- unsigned long start;
- int cpu;
- get_online_cpus();
- cpumask_copy(mce_inject_cpumask, cpu_online_mask);
- cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
- for_each_online_cpu(cpu) {
- struct mce *mcpu = &per_cpu(injectm, cpu);
- if (!mcpu->finished ||
- MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
- cpumask_clear_cpu(cpu, mce_inject_cpumask);
- }
- if (!cpumask_empty(mce_inject_cpumask))
- apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
- start = jiffies;
- while (!cpumask_empty(mce_inject_cpumask)) {
- if (!time_before(jiffies, start + 2*HZ)) {
- printk(KERN_ERR
- "Timeout waiting for mce inject NMI %lx\n",
- *cpumask_bits(mce_inject_cpumask));
- break;
- }
- cpu_relax();
- }
- raise_local();
- put_cpu();
- put_online_cpus();
- } else
-#endif
- raise_local();
-}
-
-/* Error injection interface */
-static ssize_t mce_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- struct mce m;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /*
- * There are some cases where real MSR reads could slip
- * through.
- */
- if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
- return -EIO;
-
- if ((unsigned long)usize > sizeof(struct mce))
- usize = sizeof(struct mce);
- if (copy_from_user(&m, ubuf, usize))
- return -EFAULT;
-
- if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
- return -EINVAL;
-
- /*
- * Need to give user space some time to set everything up,
- * so do it a jiffy or two later everywhere.
- */
- schedule_timeout(2);
- raise_mce(&m);
- return usize;
-}
-
-static int inject_init(void)
-{
- if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
- return -ENOMEM;
- printk(KERN_INFO "Machine check injector initialized\n");
- mce_chrdev_ops.write = mce_write;
- register_die_notifier(&mce_raise_nb);
- return 0;
-}
-
-module_init(inject_init);
-/*
- * Cannot tolerate unloading currently because we cannot
- * guarantee all openers of mce_chrdev will get a reference to us.
- */
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
deleted file mode 100644
index fefcc69ee8b5..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#include <linux/sysdev.h>
-#include <asm/mce.h>
-
-enum severity_level {
- MCE_NO_SEVERITY,
- MCE_KEEP_SEVERITY,
- MCE_SOME_SEVERITY,
- MCE_AO_SEVERITY,
- MCE_UC_SEVERITY,
- MCE_AR_SEVERITY,
- MCE_PANIC_SEVERITY,
-};
-
-#define ATTR_LEN 16
-
-/* One object for each MCE bank, shared by all CPUs */
-struct mce_bank {
- u64 ctl; /* subevents to enable */
- unsigned char init; /* initialise bank? */
- struct sysdev_attribute attr; /* sysdev attribute */
- char attrname[ATTR_LEN]; /* attribute name */
-};
-
-int mce_severity(struct mce *a, int tolerant, char **msg);
-struct dentry *mce_get_debugfs_dir(void);
-
-extern int mce_ser;
-
-extern struct mce_bank *mce_banks;
-
-#ifdef CONFIG_ACPI_APEI
-int apei_write_mce(struct mce *m);
-ssize_t apei_read_mce(struct mce *m, u64 *record_id);
-int apei_check_mce(void);
-int apei_clear_mce(u64 record_id);
-#else
-static inline int apei_write_mce(struct mce *m)
-{
- return -EINVAL;
-}
-static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
-{
- return 0;
-}
-static inline int apei_check_mce(void)
-{
- return 0;
-}
-static inline int apei_clear_mce(u64 record_id)
-{
- return -EINVAL;
-}
-#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
deleted file mode 100644
index 8a85dd1b1aa1..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * MCE grading rules.
- * Copyright 2008, 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Author: Andi Kleen
- */
-#include <linux/kernel.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-/*
- * Grade an mce by severity. In general, the most severe ones are processed
- * first. Since there are quite a lot of combinations, test the bits in a
- * table-driven way. The rules are simply processed in order; the first
- * match wins.
- *
- * Note this is only used for machine check exceptions, the corrected
- * errors use much simpler rules. The exceptions still check for the corrected
- * errors, but only to leave them alone for the CMCI handler (except for
- * panic situations)
- */
-
-enum context { IN_KERNEL = 1, IN_USER = 2 };
-enum ser { SER_REQUIRED = 1, NO_SER = 2 };
-
-static struct severity {
- u64 mask;
- u64 result;
- unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
- unsigned char ser;
- unsigned char context;
- unsigned char covered;
- char *msg;
-} severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
- { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
- { .mask = x, .result = y, SEV(s), .msg = m, ## r }
-#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
-#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
-#define MCACOD 0xffff
-
- BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
- BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
- BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
- /* When MCIP is not set, something is very confused */
- MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
- /* Neither restart nor error IP -- no chance to recover -> PANIC */
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
- "Neither restart nor error IP"),
- MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
- KERNEL),
- BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
- MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
- "Spurious not enabled", SER),
-
- /* ignore OVER for UCNA */
- MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
- "Uncorrected no action required", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
- "Illegal combination (UCNA with AR=1)", SER),
- MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
-
- /* AR add known MCACODs here */
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
- "Action required with lost events", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
- "Action required; unknown MCACOD", SER),
-
- /* known AO MCACODs: */
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
- "Action optional: memory scrubbing error", SER),
- MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
- "Action optional: last level cache writeback error", SER),
-
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
- "Action optional unknown MCACOD", SER),
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
- "Action optional with lost events", SER),
- BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
- BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
- BITSET(0, SOME, "No match") /* always matches. keep at end */
-};
-
-/*
- * If the EIPV bit is set, it means the saved IP points to the
- * instruction which caused the MCE.
- */
-static int error_context(struct mce *m)
-{
- if (m->mcgstatus & MCG_STATUS_EIPV)
- return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
- /* Unknown, assume kernel */
- return IN_KERNEL;
-}
-
-int mce_severity(struct mce *a, int tolerant, char **msg)
-{
- enum context ctx = error_context(a);
- struct severity *s;
-
- for (s = severities;; s++) {
- if ((a->status & s->mask) != s->result)
- continue;
- if ((a->mcgstatus & s->mcgmask) != s->mcgres)
- continue;
- if (s->ser == SER_REQUIRED && !mce_ser)
- continue;
- if (s->ser == NO_SER && mce_ser)
- continue;
- if (s->context && ctx != s->context)
- continue;
- if (msg)
- *msg = s->msg;
- s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
- return MCE_PANIC_SEVERITY;
- }
- return s->sev;
- }
-}
-
-#ifdef CONFIG_DEBUG_FS
-static void *s_start(struct seq_file *f, loff_t *pos)
-{
- if (*pos >= ARRAY_SIZE(severities))
- return NULL;
- return &severities[*pos];
-}
-
-static void *s_next(struct seq_file *f, void *data, loff_t *pos)
-{
- if (++(*pos) >= ARRAY_SIZE(severities))
- return NULL;
- return &severities[*pos];
-}
-
-static void s_stop(struct seq_file *f, void *data)
-{
-}
-
-static int s_show(struct seq_file *f, void *data)
-{
- struct severity *ser = data;
- seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
- return 0;
-}
-
-static const struct seq_operations severities_seq_ops = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
-static int severities_coverage_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &severities_seq_ops);
-}
-
-static ssize_t severities_coverage_write(struct file *file,
- const char __user *ubuf,
- size_t count, loff_t *ppos)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(severities); i++)
- severities[i].covered = 0;
- return count;
-}
-
-static const struct file_operations severities_coverage_fops = {
- .open = severities_coverage_open,
- .release = seq_release,
- .read = seq_read,
- .write = severities_coverage_write,
-};
-
-static int __init severities_debugfs_init(void)
-{
- struct dentry *dmce = NULL, *fseverities_coverage = NULL;
-
- dmce = mce_get_debugfs_dir();
- if (dmce == NULL)
- goto err_out;
- fseverities_coverage = debugfs_create_file("severities-coverage",
- 0444, dmce, NULL,
- &severities_coverage_fops);
- if (fseverities_coverage == NULL)
- goto err_out;
-
- return 0;
-
-err_out:
- return -ENOMEM;
-}
-late_initcall(severities_debugfs_init);
-#endif
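
To make the table-driven grading above concrete, here is a self-contained toy
version of the first-match walk. This is hypothetical demo code, not from the
patch; only the MCI_STATUS_VAL and MCI_STATUS_UC bit positions are taken from
the real <asm/mce.h>:

	#include <stdio.h>
	#include <stdint.h>

	struct rule { uint64_t mask, result; const char *msg; };

	#define VAL (1ULL << 63)	/* MCI_STATUS_VAL */
	#define UC  (1ULL << 61)	/* MCI_STATUS_UC */

	static const struct rule rules[] = {
		{ VAL, 0,  "Invalid" },		/* BITCLR: matches when VAL is clear */
		{ UC,  UC, "Uncorrected" },	/* BITSET: matches when UC is set */
		{ 0,   0,  "No match" },	/* always matches; keep at end */
	};

	static const char *grade(uint64_t status)
	{
		const struct rule *r;

		/* First match wins, exactly like the walk in mce_severity(). */
		for (r = rules; ; r++)
			if ((status & r->mask) == r->result)
				return r->msg;
	}

	int main(void)
	{
		printf("%s\n", grade(VAL | UC));	/* -> "Uncorrected" */
		printf("%s\n", grade(0));		/* -> "Invalid" */
		return 0;
	}
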
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
deleted file mode 100644
index 18cc42562250..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ /dev/null
@@ -1,2221 +0,0 @@
-/*
- * Machine check handler.
- *
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-#include <linux/thread_info.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
-#include <linux/interrupt.h>
-#include <linux/ratelimit.h>
-#include <linux/kallsyms.h>
-#include <linux/rcupdate.h>
-#include <linux/kobject.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/string.h>
-#include <linux/sysdev.h>
-#include <linux/delay.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kmod.h>
-#include <linux/poll.h>
-#include <linux/nmi.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/debugfs.h>
-#include <linux/edac_mce.h>
-
-#include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-#include "mce-internal.h"
-
-static DEFINE_MUTEX(mce_read_mutex);
-
-#define rcu_dereference_check_mce(p) \
- rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_read_mutex))
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/mce.h>
-
-int mce_disabled __read_mostly;
-
-#define MISC_MCELOG_MINOR 227
-
-#define SPINUNIT 100 /* 100ns */
-
-atomic_t mce_entry;
-
-DEFINE_PER_CPU(unsigned, mce_exception_count);
-
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant __read_mostly = 1;
-static int banks __read_mostly;
-static int rip_msr __read_mostly;
-static int mce_bootlog __read_mostly = -1;
-static int monarch_timeout __read_mostly = -1;
-static int mce_panic_timeout __read_mostly;
-static int mce_dont_log_ce __read_mostly;
-int mce_cmci_disabled __read_mostly;
-int mce_ignore_ce __read_mostly;
-int mce_ser __read_mostly;
-
-struct mce_bank *mce_banks __read_mostly;
-
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
-static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
-
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-
-static int default_decode_mce(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- pr_emerg("No human readable MCE decoding support on this CPU type.\n");
- pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
-
- return NOTIFY_STOP;
-}
-
-static struct notifier_block mce_dec_nb = {
- .notifier_call = default_decode_mce,
- .priority = -1,
-};
-
-/* MCA banks polled by the period polling timer for corrected events */
-DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
- [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
-};
-
-static DEFINE_PER_CPU(struct work_struct, mce_work);
-
-/* Do the initial setup of a struct mce */
-void mce_setup(struct mce *m)
-{
- memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
- rdtscll(m->tsc);
- /* We hope get_seconds stays lockless */
- m->time = get_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
- m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
- m->apicid = cpu_data(m->extcpu).initial_apicid;
- rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
-}
-
-DEFINE_PER_CPU(struct mce, injectm);
-EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break them. It also
- * separates MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
-{
- unsigned next, entry;
-
- /* Emit the trace record: */
- trace_mce_record(mce);
-
- mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference_check_mce(mcelog.next);
- for (;;) {
- /*
- * If edac_mce is enabled, it will check the error type
- * and process it if it is a known error.
- * Otherwise, the error will be sent through the mcelog
- * interface.
- */
- if (edac_mce_parse(mce))
- return;
-
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- mce->finished = 1;
- set_bit(0, &mce_need_notify);
-}
-
-static void print_mce(struct mce *m)
-{
- pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
-
- if (m->ip) {
- pr_emerg("RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
-
- if (m->cs == __KERNEL_CS)
- print_symbol("{%s}", m->ip);
- pr_cont("\n");
- }
-
- pr_emerg("TSC %llx ", m->tsc);
- if (m->addr)
- pr_cont("ADDR %llx ", m->addr);
- if (m->misc)
- pr_cont("MISC %llx ", m->misc);
-
- pr_cont("\n");
- pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
- m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
-
- /*
- * Print out human-readable details about the MCE error
- * (if the CPU has an implementation for that).
- */
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
-}
-
-static void print_mce_head(void)
-{
- pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
- pr_emerg("This is not a software problem!\n");
-}
-
-#define PANIC_TIMEOUT 5 /* 5 seconds */
-
-static atomic_t mce_paniced;
-
-static int fake_panic;
-static atomic_t mce_fake_paniced;
-
-/* Panic in progress. Enable interrupts and wait for final IPI */
-static void wait_for_panic(void)
-{
- long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
-
- preempt_disable();
- local_irq_enable();
- while (timeout-- > 0)
- udelay(1);
- if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
- panic("Panicing machine check CPU died");
-}
-
-static void mce_panic(char *msg, struct mce *final, char *exp)
-{
- int i, apei_err = 0;
-
- if (!fake_panic) {
- /*
- * Make sure only one CPU runs in machine check panic
- */
- if (atomic_inc_return(&mce_paniced) > 1)
- wait_for_panic();
- barrier();
-
- bust_spinlocks(1);
- console_verbose();
- } else {
- /* Don't log too much for fake panic */
- if (atomic_inc_return(&mce_fake_paniced) > 1)
- return;
- }
- print_mce_head();
- /* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC)) {
- print_mce(m);
- if (!apei_err)
- apei_err = apei_write_mce(m);
- }
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce))) {
- print_mce(m);
- if (!apei_err)
- apei_err = apei_write_mce(m);
- }
- }
- if (final) {
- print_mce(final);
- if (!apei_err)
- apei_err = apei_write_mce(final);
- }
- if (cpu_missing)
- printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
- print_mce_tail();
- if (exp)
- printk(KERN_EMERG "Machine check: %s\n", exp);
- if (!fake_panic) {
- if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
- panic(msg);
- } else
- printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
-}
-
-/* Support code for software error injection */
-
-static int msr_to_offset(u32 msr)
-{
- unsigned bank = __get_cpu_var(injectm.bank);
-
- if (msr == rip_msr)
- return offsetof(struct mce, ip);
- if (msr == MSR_IA32_MCx_STATUS(bank))
- return offsetof(struct mce, status);
- if (msr == MSR_IA32_MCx_ADDR(bank))
- return offsetof(struct mce, addr);
- if (msr == MSR_IA32_MCx_MISC(bank))
- return offsetof(struct mce, misc);
- if (msr == MSR_IA32_MCG_STATUS)
- return offsetof(struct mce, mcgstatus);
- return -1;
-}
-
-/* MSR access wrappers used for error injection */
-static u64 mce_rdmsrl(u32 msr)
-{
- u64 v;
-
- if (__get_cpu_var(injectm).finished) {
- int offset = msr_to_offset(msr);
-
- if (offset < 0)
- return 0;
- return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
- }
-
- if (rdmsrl_safe(msr, &v)) {
- WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
- /*
- * Return zero in case the access faulted. This should
- * not happen normally but can happen if the CPU does
- * something weird, or if the code is buggy.
- */
- v = 0;
- }
-
- return v;
-}
-
-static void mce_wrmsrl(u32 msr, u64 v)
-{
- if (__get_cpu_var(injectm).finished) {
- int offset = msr_to_offset(msr);
-
- if (offset >= 0)
- *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
- return;
- }
- wrmsrl(msr, v);
-}
-
-/*
- * Simple lockless ring to communicate PFNs from the exception handler to the
- * process-context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16 /* we use one entry less */
-
-struct mce_ring {
- unsigned short start;
- unsigned short end;
- unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
-
- return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
- struct mce_ring *r;
- int ret = 0;
-
- *pfn = 0;
- get_cpu();
- r = &__get_cpu_var(mce_ring);
- if (r->start == r->end)
- goto out;
- *pfn = r->ring[r->start];
- r->start = (r->start + 1) % MCE_RING_SIZE;
- ret = 1;
-out:
- put_cpu();
- return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
- unsigned next;
-
- next = (r->end + 1) % MCE_RING_SIZE;
- if (next == r->start)
- return -1;
- r->ring[r->end] = pfn;
- wmb();
- r->end = next;
- return 0;
-}
-
-int mce_available(struct cpuinfo_x86 *c)
-{
- if (mce_disabled)
- return 0;
- return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
-}
-
-static void mce_schedule_work(void)
-{
- if (!mce_ring_empty()) {
- struct work_struct *work = &__get_cpu_var(mce_work);
- if (!work_pending(work))
- schedule_work(work);
- }
-}
-
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
-
- if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled, when
- * an MCE happened during an interrupts-off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
-{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
- mce_notify_irq();
- mce_schedule_work();
- irq_exit();
-}
-#endif
-
-static void mce_report_event(struct pt_regs *regs)
-{
- if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
- mce_notify_irq();
- /*
- * Triggering the work queue here is just an insurance
- * policy in case the syscall exit notify handler
- * doesn't run soon enough or ends up running on the
- * wrong CPU (can happen when audit sleeps)
- */
- mce_schedule_work();
- return;
- }
-
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
-
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Instead, trigger a self interrupt
- * through the APIC to do the notification
- * after interrupts are reenabled.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
-
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non-idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
-}
-
-DEFINE_PER_CPU(unsigned, mce_poll_count);
-
-/*
- * Poll for corrected events or events that happened before reset.
- * Those are just logged through /dev/mcelog.
- *
- * This is executed in standard interrupt context.
- *
- * Note: the spec recommends panicking for fatal unsignalled
- * errors here. However this would be quite problematic --
- * we would need to reimplement the Monarch handling and
- * it would mess up the exclusion between the exception handler
- * and the poll handler -- so we skip this for now.
- * These cases should not happen anyway, or only when the CPU
- * is already totally confused. In this case it's likely it will
- * not fully execute the machine check handler either.
- */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
-{
- struct mce m;
- int i;
-
- percpu_inc(mce_poll_count);
-
- mce_setup(&m);
-
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- for (i = 0; i < banks; i++) {
- if (!mce_banks[i].ctl || !test_bit(i, *b))
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
- m.tsc = 0;
-
- barrier();
- m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (!(m.status & MCI_STATUS_VAL))
- continue;
-
- /*
- * Uncorrected or signalled events are handled by the exception
- * handler when it is enabled, so don't process those here.
- *
- * TBD do the same check for MCI_STATUS_EN here?
- */
- if (!(flags & MCP_UC) &&
- (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
- continue;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
-
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
- /*
- * Don't get the IP here because it's unlikely to
- * have anything to do with the actual error location.
- */
- if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
- mce_log(&m);
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- /*
- * Clear state for this bank.
- */
- mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
- }
-
- /*
- * Don't clear MCG_STATUS here because it's only defined for
- * exceptions.
- */
-
- sync_core();
-}
-EXPORT_SYMBOL_GPL(machine_check_poll);
-
-/*
- * Do a quick check if any of the events requires a panic.
- * This decides if we keep the events around or clear them.
- */
-static int mce_no_way_out(struct mce *m, char **msg)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
- }
- return 0;
-}
-
-/*
- * Variable to establish order between CPUs while scanning.
- * Each CPU spins initially until executing is equal its number.
- */
-static atomic_t mce_executing;
-
-/*
- * Defines order of CPUs on entry. First CPU becomes Monarch.
- */
-static atomic_t mce_callin;
-
-/*
- * Check if a timeout waiting for other CPUs happened.
- */
-static int mce_timed_out(u64 *t)
-{
- /*
- * The others already did panic for some reason.
- * Bail out like in a timeout.
- * rmb() to tell the compiler that system_state
- * might have been modified by someone else.
- */
- rmb();
- if (atomic_read(&mce_paniced))
- wait_for_panic();
- if (!monarch_timeout)
- goto out;
- if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (tolerant < 1)
- mce_panic("Timeout synchronizing machine check over CPUs",
- NULL, NULL);
- cpu_missing = 1;
- return 1;
- }
- *t -= SPINUNIT;
-out:
- touch_nmi_watchdog();
- return 0;
-}
-
-/*
- * The Monarch's reign. The Monarch is the CPU who entered
- * the machine check handler first. It waits for the others to
- * raise the exception too and then grades them. When any
- * error is fatal, panic. Only then let the others continue.
- *
- * The other CPUs entering the MCE handler will be controlled by the
- * Monarch. They are called Subjects.
- *
- * This way we prevent any potential data corruption in an unrecoverable case
- * and also make sure that all CPUs' errors are always examined.
- *
- * Also this detects the case of a machine check event coming from outer
- * space (not detected by any CPU). In this case some external agent wants
- * us to shut down, so panic too.
- *
- * The other CPUs might still decide to panic if the handler happens
- * in an unrecoverable place, but in this case the system is in a semi-stable
- * state and won't corrupt anything by itself. It's ok to let the others
- * continue for a bit first.
- *
- * All the spin loops have timeouts; when a timeout happens a CPU
- * typically elects itself to be Monarch.
- */
-static void mce_reign(void)
-{
- int cpu;
- struct mce *m = NULL;
- int global_worst = 0;
- char *msg = NULL;
- char *nmsg = NULL;
-
- /*
- * This CPU is the Monarch and the other CPUs have run
- * through their handlers.
- * Grade the severity of the errors of all the CPUs.
- */
- for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
- &nmsg);
- if (severity > global_worst) {
- msg = nmsg;
- global_worst = severity;
- m = &per_cpu(mces_seen, cpu);
- }
- }
-
- /*
- * Cannot recover? Panic here then.
- * This dumps all the mces in the log buffer and stops the
- * other CPUs.
- */
- if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
-
- /*
- * For UC somewhere we let the CPU who detects it handle it.
- * Also must let continue the others, otherwise the handling
- * CPU could deadlock on a lock.
- */
-
- /*
- * No machine check event found. Must be some external
- * source or one CPU is hung. Panic.
- */
- if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
-
- /*
- * Now clear all the mces_seen so that they don't reappear on
- * the next mce.
- */
- for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
-}
-
-static atomic_t global_nwo;
-
-/*
- * Start of Monarch synchronization. This waits until all CPUs have
- * entered the exception handler and then determines if any of them
- * saw a fatal event that requires panic. Then it executes them
- * in the entry order.
- * TBD double check parallel CPU hotunplug
- */
-static int mce_start(int *no_way_out)
-{
- int order;
- int cpus = num_online_cpus();
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- return -1;
-
- atomic_add(*no_way_out, &global_nwo);
- /*
- * global_nwo should be updated before mce_callin
- */
- smp_wmb();
- order = atomic_inc_return(&mce_callin);
-
- /*
- * Wait for everyone.
- */
- while (atomic_read(&mce_callin) != cpus) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
-
- /*
- * mce_callin should be read before global_nwo
- */
- smp_rmb();
-
- if (order == 1) {
- /*
- * Monarch: Starts executing now, the others wait.
- */
- atomic_set(&mce_executing, 1);
- } else {
- /*
- * Subject: Now start the scanning loop one by one in
- * the original callin order.
- * This way, any shared bank will be seen by only one
- * CPU before it is cleared, avoiding duplicates.
- */
- while (atomic_read(&mce_executing) < order) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
- }
-
- /*
- * Cache the global no_way_out state.
- */
- *no_way_out = atomic_read(&global_nwo);
-
- return order;
-}
-
-/*
- * Synchronize between CPUs after main scanning loop.
- * This invokes the bulk of the Monarch processing.
- */
-static int mce_end(int order)
-{
- int ret = -1;
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- goto reset;
- if (order < 0)
- goto reset;
-
- /*
- * Allow others to run.
- */
- atomic_inc(&mce_executing);
-
- if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
- /*
- * Monarch: Wait for everyone to go through their scanning
- * loops.
- */
- while (atomic_read(&mce_executing) <= cpus) {
- if (mce_timed_out(&timeout))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- mce_reign();
- barrier();
- ret = 0;
- } else {
- /*
- * Subject: Wait for Monarch to finish.
- */
- while (atomic_read(&mce_executing) != 0) {
- if (mce_timed_out(&timeout))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- /*
- * Don't reset anything. That's done by the Monarch.
- */
- return 0;
- }
-
- /*
- * Reset all global state.
- */
-reset:
- atomic_set(&global_nwo, 0);
- atomic_set(&mce_callin, 0);
- barrier();
-
- /*
- * Let others run again.
- */
- atomic_set(&mce_executing, 0);
- return ret;
-}
-
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-static int mce_usable_address(struct mce *m)
-{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
- return 0;
- if ((m->misc & 0x3f) > PAGE_SHIFT)
- return 0;
- if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
- return 0;
- return 1;
-}
-
-static void mce_clear_state(unsigned long *toclear)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- if (test_bit(i, toclear))
- mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
- }
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- *
- * On Intel systems this is entered on all CPUs in parallel through
- * MCE broadcast. However some CPUs might be broken beyond repair,
- * so always be careful when synchronizing with others.
- */
-void do_machine_check(struct pt_regs *regs, long error_code)
-{
- struct mce m, *final;
- int i;
- int worst = 0;
- int severity;
- /*
- * Establish sequential order between the CPUs entering the machine
- * check handler.
- */
- int order;
- /*
- * If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
- */
- int no_way_out = 0;
- /*
- * If kill_it gets set, there might be a way to recover from this
- * error.
- */
- int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- char *msg = "Unknown";
-
- atomic_inc(&mce_entry);
-
- percpu_inc(mce_exception_count);
-
- if (notify_die(DIE_NMI, "machine check", regs, error_code,
- 18, SIGKILL) == NOTIFY_STOP)
- goto out;
- if (!banks)
- goto out;
-
- mce_setup(&m);
-
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- final = &__get_cpu_var(mces_seen);
- *final = m;
-
- no_way_out = mce_no_way_out(&m, &msg);
-
- barrier();
-
- /*
- * When there is no restart IP, we must always kill or panic.
- */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_it = 1;
-
- /*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
- */
- order = mce_start(&no_way_out);
- for (i = 0; i < banks; i++) {
- __clear_bit(i, toclear);
- if (!mce_banks[i].ctl)
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
- * Non-uncorrected or non-signaled errors are handled by
- * machine_check_poll. Leave them alone, unless this panics.
- */
- if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
- !no_way_out)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK);
-
- severity = mce_severity(&m, tolerant, NULL);
-
- /*
- * When the machine check was for a corrected error, don't
- * touch it unless we're panicking.
- */
- if (severity == MCE_KEEP_SEVERITY && !no_way_out)
- continue;
- __set_bit(i, toclear);
- if (severity == MCE_NO_SEVERITY) {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
-
- /*
- * Action optional error. Queue address for later processing.
- * When the ring overflows we just ignore the AO error.
- * RED-PEN add some logging mechanism when
- * mce_usable_address() or mce_ring_add() fails.
- * RED-PEN don't ignore overflow for tolerant == 0
- */
- if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
- mce_ring_add(m.addr >> PAGE_SHIFT);
-
- mce_get_rip(&m, regs);
- mce_log(&m);
-
- if (severity > worst) {
- *final = m;
- worst = severity;
- }
- }
-
- if (!no_way_out)
- mce_clear_state(toclear);
-
- /*
- * Do most of the synchronization with other CPUs.
- * When there's any problem use only local no_way_out state.
- */
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
-
- /*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
- */
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
-
- if (worst > 0)
- mce_report_event(regs);
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-out:
- atomic_dec(&mce_entry);
- sync_core();
-}
-EXPORT_SYMBOL_GPL(do_machine_check);
-
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
-{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
-}
-
-/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
- */
-void mce_notify_process(void)
-{
- unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
-}
-
-static void mce_process_work(struct work_struct *dummy)
-{
- mce_notify_process();
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/**
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
- struct mce m;
-
- mce_setup(&m);
- m.bank = MCE_THERMAL_BANK;
- m.status = status;
- mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll 2x faster. When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-static int check_interval = 5 * 60; /* 5 minutes */
-
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
-
-static void mce_start_timer(unsigned long data)
-{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
-
- WARN_ON(smp_processor_id() != data);
-
- if (mce_available(&current_cpu_data)) {
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
- }
-
- /*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
- */
- n = &__get_cpu_var(mce_next_interval);
- if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
- else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
-
- t->expires = jiffies + *n;
- add_timer_on(t, smp_processor_id());
-}
-
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- clear_thread_flag(TIF_MCE_NOTIFY);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- wake_up_interruptible(&mce_wait);
-
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
- schedule_work(&mce_trigger_work);
-
- if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
-
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
-
-static int __cpuinit __mcheck_cpu_mce_banks_init(void)
-{
- int i;
-
- mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
- if (!mce_banks)
- return -ENOMEM;
- for (i = 0; i < banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- b->ctl = -1ULL;
- b->init = 1;
- }
- return 0;
-}
-
-/*
- * Initialize Machine Checks for a CPU.
- */
-static int __cpuinit __mcheck_cpu_cap_init(void)
-{
- unsigned b;
- u64 cap;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
-
- b = cap & MCG_BANKCNT_MASK;
- if (!banks)
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
-
- if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
- MAX_NR_BANKS, b);
- b = MAX_NR_BANKS;
- }
-
- /* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
- if (!mce_banks) {
- int err = __mcheck_cpu_mce_banks_init();
-
- if (err)
- return err;
- }
-
- /* Use accurate RIP reporting if available. */
- if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
-
- if (cap & MCG_SER_P)
- mce_ser = 1;
-
- return 0;
-}
-
-static void __mcheck_cpu_init_generic(void)
-{
- mce_banks_t all_banks;
- u64 cap;
- int i;
-
- /*
- * Log the machine checks left over from the previous reset.
- */
- bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
-
- set_in_cr4(X86_CR4_MCE);
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- if (cap & MCG_CTL_P)
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (!b->init)
- continue;
- wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
- wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
- }
-}
-
-/* Add per CPU specific workarounds here */
-static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
-{
- if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
- return -EOPNOTSUPP;
- }
-
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
- if (c->x86 <= 17 && mce_bootlog < 0) {
- /*
- * Lots of broken BIOSes around that don't clear them
- * by default and leave crap in there. Don't log:
- */
- mce_bootlog = 0;
- }
- /*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
- */
- if (c->x86 == 6 && banks > 0)
- mce_banks[0].ctl = 0;
- }
-
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS-controlled
- * register.
- * But it's not aliased anymore on model 0x1a+.
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
-
- if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
- mce_banks[0].init = 0;
-
- /*
- * All newer Intel systems support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- monarch_timeout < 0)
- monarch_timeout = USEC_PER_SEC;
-
- /*
- * There are also broken BIOSes on some Pentium M and
- * earlier systems:
- */
- if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
- mce_bootlog = 0;
- }
- if (monarch_timeout < 0)
- monarch_timeout = 0;
- if (mce_bootlog != 0)
- mce_panic_timeout = 30;
-
- return 0;
-}
-
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
-{
- if (c->x86 != 5)
- return;
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- intel_p5_mcheck_init(c);
- break;
- case X86_VENDOR_CENTAUR:
- winchip_mcheck_init(c);
- break;
- }
-}
-
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
-{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- break;
- case X86_VENDOR_AMD:
- mce_amd_feature_init(c);
- break;
- default:
- break;
- }
-}
-
-static void __mcheck_cpu_init_timer(void)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
-
- setup_timer(t, mce_start_timer, smp_processor_id());
-
- if (mce_ignore_ce)
- return;
-
- *n = check_interval * HZ;
- if (!*n)
- return;
- t->expires = round_jiffies(jiffies + *n);
- add_timer_on(t, smp_processor_id());
-}
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
- smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
-
-/*
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off:
- */
-void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
-{
- if (mce_disabled)
- return;
-
- __mcheck_cpu_ancient_init(c);
-
- if (!mce_available(c))
- return;
-
- if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
- mce_disabled = 1;
- return;
- }
-
- machine_check_vector = do_machine_check;
-
- __mcheck_cpu_init_generic();
- __mcheck_cpu_init_vendor(c);
- __mcheck_cpu_init_timer();
- INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
-}
-
-/*
- * Character device to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
-
-static int mce_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_state_lock);
-
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
-
- spin_unlock(&mce_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_state_lock);
-
- open_count--;
- open_exclu = 0;
-
- spin_unlock(&mce_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
- int rc;
- u64 record_id;
- struct mce m;
-
- if (usize < sizeof(struct mce))
- return -EINVAL;
-
- rc = apei_read_mce(&m, &record_id);
- /* Error or no more MCE record */
- if (rc <= 0) {
- mce_apei_read_done = 1;
- return rc;
- }
- rc = -EFAULT;
- if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
- return rc;
- /*
- * In fact, we should have cleared the record only after it has
- * been flushed to disk or sent over the network by
- * /sbin/mcelog, but we have no interface to support that now,
- * so just clear it to avoid duplication.
- */
- rc = apei_clear_mce(record_id);
- if (rc) {
- mce_apei_read_done = 1;
- return rc;
- }
- *ubuf += sizeof(struct mce);
-
- return 0;
-}
-
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_read_mutex);
-
- if (!mce_apei_read_done) {
- err = __mce_read_apei(&buf, usize);
- if (err || buf != ubuf)
- goto out;
- }
-
- next = rcu_dereference_check_mce(mcelog.next);
-
- /* Only supports full reads right now */
- err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
- goto out;
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
-
- while (!mcelog.entry[i].finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
- smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
- }
- }
-
- if (err)
- err = -EFAULT;
-
-out:
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_wait, wait);
- if (rcu_dereference_check_mce(mcelog.next))
- return POLLIN | POLLRDNORM;
- if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-/* Modified in mce-inject.c, so not static or const */
-struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
-};
-EXPORT_SYMBOL_GPL(mce_chrdev_ops);
-
-static struct miscdevice mce_log_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
-/*
- * mce=off Disables machine check
- * mce=no_cmci Disables CMCI
- * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
- * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
- * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
- * monarchtimeout is how long to wait for other CPUs on machine
- * check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
- * mce=nobootlog Don't log MCEs from before booting.
- */
-static int __init mcheck_enable(char *str)
-{
- if (*str == 0) {
- enable_p5_mce();
- return 1;
- }
- if (*str == '=')
- str++;
- if (!strcmp(str, "off"))
- mce_disabled = 1;
- else if (!strcmp(str, "no_cmci"))
- mce_cmci_disabled = 1;
- else if (!strcmp(str, "dont_log_ce"))
- mce_dont_log_ce = 1;
- else if (!strcmp(str, "ignore_ce"))
- mce_ignore_ce = 1;
- else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
- mce_bootlog = (str[0] == 'b');
- else if (isdigit(str[0])) {
- get_option(&str, &tolerant);
- if (*str == ',') {
- ++str;
- get_option(&str, &monarch_timeout);
- }
- } else {
- printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
- str);
- return 0;
- }
- return 1;
-}
-__setup("mce", mcheck_enable);
-
-int __init mcheck_init(void)
-{
- atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
-
- mcheck_intel_therm_init();
-
- return 0;
-}
-
-/*
- * Sysfs support
- */
-
-/*
- * Disable machine checks on suspend and shutdown. We can't really handle
- * them later.
- */
-static int mce_disable_error_reporting(void)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
- }
- return 0;
-}
-
-static int mce_suspend(struct sys_device *dev, pm_message_t state)
-{
- return mce_disable_error_reporting();
-}
-
-static int mce_shutdown(struct sys_device *dev)
-{
- return mce_disable_error_reporting();
-}
-
-/*
- * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
- * Only one CPU is active at this time, the others get re-added later using
- * CPU hotplug:
- */
-static int mce_resume(struct sys_device *dev)
-{
- __mcheck_cpu_init_generic();
- __mcheck_cpu_init_vendor(&current_cpu_data);
-
- return 0;
-}
-
-static void mce_cpu_restart(void *data)
-{
- del_timer_sync(&__get_cpu_var(mce_timer));
- if (!mce_available(&current_cpu_data))
- return;
- __mcheck_cpu_init_generic();
- __mcheck_cpu_init_timer();
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
- on_each_cpu(mce_cpu_restart, NULL, 1);
-}
-
-/* Toggle features for corrected errors */
-static void mce_disable_ce(void *all)
-{
- if (!mce_available(&current_cpu_data))
- return;
- if (all)
- del_timer_sync(&__get_cpu_var(mce_timer));
- cmci_clear();
-}
-
-static void mce_enable_ce(void *all)
-{
- if (!mce_available(&current_cpu_data))
- return;
- cmci_reenable();
- cmci_recheck();
- if (all)
- __mcheck_cpu_init_timer();
-}
-
-static struct sysdev_class mce_sysclass = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
- .name = "machinecheck",
-};
-
-DEFINE_PER_CPU(struct sys_device, mce_dev);
-
-__cpuinitdata
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
-{
- return container_of(attr, struct mce_bank, attr);
-}
-
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
-}
-
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- attr_to_bank(attr)->ctl = new;
- mce_restart();
-
- return size;
-}
-
-static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (mce_ignore_ce ^ !!new) {
- if (new) {
- /* disable ce features */
- on_each_cpu(mce_disable_ce, (void *)1, 1);
- mce_ignore_ce = 1;
- } else {
- /* enable ce features */
- mce_ignore_ce = 0;
- on_each_cpu(mce_enable_ce, (void *)1, 1);
- }
- }
- return size;
-}
-
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (mce_cmci_disabled ^ !!new) {
- if (new) {
- /* disable cmci */
- on_each_cpu(mce_disable_ce, NULL, 1);
- mce_cmci_disabled = 1;
- } else {
- /* enable cmci */
- mce_cmci_disabled = 0;
- on_each_cpu(mce_enable_ce, NULL, 1);
- }
- }
- return size;
-}
-
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
- mce_restart();
- return ret;
-}
-
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
-
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
- &check_interval
-};
-
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
- &mce_ignore_ce
-};
-
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
- &mce_cmci_disabled
-};
-
-static struct sysdev_attribute *mce_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
- NULL
-};
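The three ext-attribute definitions above follow one idiom: pair the generic integer show helper (and either the generic store or a custom one) with a pointer to the backing variable. A minimal sketch of the same idiom for a hypothetical knob, names invented:

static int my_knob;	/* hypothetical backing variable */

static struct sysdev_ext_attribute attr_my_knob = {
	_SYSDEV_ATTR(my_knob, 0644, sysdev_show_int, sysdev_store_int),
	&my_knob
};

The &attr_my_knob.attr pointer would then be listed in mce_attrs[] like the entries above.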
-
-static cpumask_var_t mce_dev_initialized;
-
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
-{
- int err;
- int i, j;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(mce_dev, cpu).id = cpu;
- per_cpu(mce_dev, cpu).cls = &mce_sysclass;
-
- err = sysdev_register(&per_cpu(mce_dev, cpu));
- if (err)
- return err;
-
- for (i = 0; mce_attrs[i]; i++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
- if (err)
- goto error;
- }
- for (j = 0; j < banks; j++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &mce_banks[j].attr);
- if (err)
- goto error2;
- }
- cpumask_set_cpu(cpu, mce_dev_initialized);
-
- return 0;
-error2:
- while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
-error:
- while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
-
- sysdev_unregister(&per_cpu(mce_dev, cpu));
-
- return err;
-}
-
-static __cpuinit void mce_remove_device(unsigned int cpu)
-{
- int i;
-
- if (!cpumask_test_cpu(cpu, mce_dev_initialized))
- return;
-
- for (i = 0; mce_attrs[i]; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
-
- for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
-
- sysdev_unregister(&per_cpu(mce_dev, cpu));
- cpumask_clear_cpu(cpu, mce_dev_initialized);
-}
-
-/* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuinit mce_disable_cpu(void *h)
-{
- unsigned long action = *(unsigned long *)h;
- int i;
-
- if (!mce_available(&current_cpu_data))
- return;
-
- if (!(action & CPU_TASKS_FROZEN))
- cmci_clear();
- for (i = 0; i < banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
- }
-}
-
-static void __cpuinit mce_reenable_cpu(void *h)
-{
- unsigned long action = *(unsigned long *)h;
- int i;
-
- if (!mce_available(&current_cpu_data))
- return;
-
- if (!(action & CPU_TASKS_FROZEN))
- cmci_reenable();
- for (i = 0; i < banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
- }
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct timer_list *t = &per_cpu(mce_timer, cpu);
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- break;
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!mce_ignore_ce && check_interval) {
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
- add_timer_on(t, cpu);
- }
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- break;
- case CPU_POST_DEAD:
- /* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
- .notifier_call = mce_cpu_callback,
-};
-
-static __init void mce_init_banks(void)
-{
- int i;
-
- for (i = 0; i < banks; i++) {
- struct mce_bank *b = &mce_banks[i];
- struct sysdev_attribute *a = &b->attr;
-
- sysfs_attr_init(&a->attr);
- a->attr.name = b->attrname;
- snprintf(b->attrname, ATTR_LEN, "bank%d", i);
-
- a->attr.mode = 0644;
- a->show = show_bank;
- a->store = set_bank;
- }
-}
-
-static __init int mcheck_init_device(void)
-{
- int err;
- int i = 0;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
-
- mce_init_banks();
-
- err = sysdev_class_register(&mce_sysclass);
- if (err)
- return err;
-
- for_each_online_cpu(i) {
- err = mce_create_device(i);
- if (err)
- return err;
- }
-
- register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
-
- return err;
-}
-
-device_initcall(mcheck_init_device);
-
-/*
- * Old-style boot option parsing. Kept only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
- mce_disabled = 1;
- return 1;
-}
-__setup("nomce", mcheck_disable);
-
-#ifdef CONFIG_DEBUG_FS
-struct dentry *mce_get_debugfs_dir(void)
-{
- static struct dentry *dmce;
-
- if (!dmce)
- dmce = debugfs_create_dir("mce", NULL);
-
- return dmce;
-}
-
-static void mce_reset(void)
-{
- cpu_missing = 0;
- atomic_set(&mce_fake_paniced, 0);
- atomic_set(&mce_executing, 0);
- atomic_set(&mce_callin, 0);
- atomic_set(&global_nwo, 0);
-}
-
-static int fake_panic_get(void *data, u64 *val)
-{
- *val = fake_panic;
- return 0;
-}
-
-static int fake_panic_set(void *data, u64 val)
-{
- mce_reset();
- fake_panic = val;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
- fake_panic_set, "%llu\n");
-
-static int __init mcheck_debugfs_init(void)
-{
- struct dentry *dmce, *ffake_panic;
-
- dmce = mce_get_debugfs_dir();
- if (!dmce)
- return -ENOMEM;
- ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
- &fake_panic_fops);
- if (!ffake_panic)
- return -ENOMEM;
-
- return 0;
-}
-late_initcall(mcheck_debugfs_init);
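A small user-space sketch reading the knob above, assuming the conventional debugfs mount point:

#include <stdio.h>

int main(void)
{
	/* DEFINE_SIMPLE_ATTRIBUTE above formats the value as "%llu\n". */
	FILE *f = fopen("/sys/kernel/debug/mce/fake_panic", "r");
	unsigned long long val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%llu", &val) == 1)
		printf("fake_panic = %llu\n", val);
	fclose(f);
	return 0;
}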
-#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
deleted file mode 100644
index 224392d8fe8c..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ /dev/null
@@ -1,707 +0,0 @@
-/*
- * (c) 2005, 2006 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Written by Jacob Shin - AMD, Inc.
- *
- * Support : jacob.shin@amd.com
- *
- * April 2006
- * - added support for AMD Family 0x10 processors
- *
- * All MC4_MISCi registers are shared between the cores of a multi-core CPU.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/sysdev.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-#define PFX "mce_threshold: "
-#define VERSION "version 1.1.1"
-#define NR_BANKS 6
-#define NR_BLOCKS 9
-#define THRESHOLD_MAX 0xFFF
-#define INT_TYPE_APIC 0x00020000
-#define MASK_VALID_HI 0x80000000
-#define MASK_CNTP_HI 0x40000000
-#define MASK_LOCKED_HI 0x20000000
-#define MASK_LVTOFF_HI 0x00F00000
-#define MASK_COUNT_EN_HI 0x00080000
-#define MASK_INT_TYPE_HI 0x00060000
-#define MASK_OVERFLOW_HI 0x00010000
-#define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_BLKPTR_LO 0xFF000000
-#define MCG_XBLK_ADDR 0xC0000400
-
-struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
-};
-
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
- .interrupt_enable = 0,
- .threshold_limit = THRESHOLD_MAX,
-};
-
-struct threshold_bank {
- struct kobject *kobj;
- struct threshold_block *blocks;
- cpumask_var_t cpus;
-};
-static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
-
-#ifdef CONFIG_SMP
-static unsigned char shared_bank[NR_BANKS] = {
- 0, 0, 0, 0, 1
-};
-#endif
-
-static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
-
-static void amd_threshold_interrupt(void);
-
-/*
- * CPU Initialization
- */
-
-struct thresh_restart {
- struct threshold_block *b;
- int reset;
- u16 old_limit;
-};
-
-/*
- * Must run on the CPU whose bank is being updated; called via
- * smp_call_function_single().
- */
-static void threshold_restart_bank(void *_tr)
-{
- struct thresh_restart *tr = _tr;
- u32 mci_misc_hi, mci_misc_lo;
-
- rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-
- if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
- tr->reset = 1; /* limit cannot be lower than err count */
-
- if (tr->reset) { /* reset err count and overflow bit */
- mci_misc_hi =
- (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - tr->b->threshold_limit);
- } else if (tr->old_limit) { /* change limit w/o reset */
- int new_count = (mci_misc_hi & THRESHOLD_MAX) +
- (tr->old_limit - tr->b->threshold_limit);
-
- mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
- (new_count & THRESHOLD_MAX);
- }
-
-	if (tr->b->interrupt_enable)
-		mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC;
-	else
-		mci_misc_hi &= ~MASK_INT_TYPE_HI;
-
- mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-}
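To make the bit manipulation above easier to follow, here is a small decoding sketch using the same MASK_* constants; it is an illustration, not part of the original file. Note that the hardware counter holds THRESHOLD_MAX - threshold_limit plus the number of errors seen, so it overflows exactly after threshold_limit errors:

static void __maybe_unused decode_misc_hi(u32 hi)
{
	unsigned int count = hi & MASK_ERR_COUNT_HI;	/* bits 11:0 */

	pr_debug("valid=%d cntp=%d locked=%d ovfl=%d en=%d count=0x%x\n",
		 !!(hi & MASK_VALID_HI), !!(hi & MASK_CNTP_HI),
		 !!(hi & MASK_LOCKED_HI), !!(hi & MASK_OVERFLOW_HI),
		 !!(hi & MASK_COUNT_EN_HI), count);
}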
-
-/* cpu init entry point, called from mce.c with preempt off */
-void mce_amd_feature_init(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
- struct thresh_restart tr;
- u8 lvt_off;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
- address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else
- ++address;
-
- if (rdmsr_safe(address, &low, &high))
- break;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
-
- if (!block)
- per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
- if (shared_bank[bank] && c->cpu_core_id)
- break;
-#endif
- lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0);
-
- high &= ~MASK_LVTOFF_HI;
- high |= lvt_off << 20;
- wrmsr(address, low, high);
-
- threshold_defaults.address = address;
- tr.b = &threshold_defaults;
- tr.reset = 0;
- tr.old_limit = 0;
- threshold_restart_bank(&tr);
-
- mce_threshold_vector = amd_threshold_interrupt;
- }
- }
-}
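The block-address walk above reappears verbatim in the interrupt handler below; the rule it implements can be isolated in one helper. A sketch, not part of the original file:

/*
 * MSR address of threshold block @block in @bank. @low is the low half
 * of the previous block's MSR; @prev is the previous block's address.
 * A return of 0 means there is no such block.
 */
static u32 __maybe_unused next_block_address(unsigned int bank,
					     unsigned int block,
					     u32 prev, u32 low)
{
	u32 blkptr;

	if (block == 0)
		return MSR_IA32_MC0_MISC + bank * 4;

	if (block == 1) {
		blkptr = (low & MASK_BLKPTR_LO) >> 21;
		return blkptr ? blkptr + MCG_XBLK_ADDR : 0;
	}

	return prev + 1;
}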
-
-/*
- * APIC Interrupt Handler
- */
-
-/*
- * The threshold interrupt handler services THRESHOLD_APIC_VECTOR.
- * The interrupt fires when error_count reaches threshold_limit.
- * The handler simply logs an mcelog entry with a software-defined bank
- * number.
- */
-static void amd_threshold_interrupt(void)
-{
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
- struct mce m;
-
- mce_setup(&m);
-
- /* assume first bank caused it */
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
- continue;
- for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0) {
- address = MSR_IA32_MC0_MISC + bank * 4;
- } else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
-
- if (rdmsr_safe(address, &low, &high))
- break;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
-
- /*
- * Log the machine check that caused the threshold
- * event.
- */
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
-
- if (high & MASK_OVERFLOW_HI) {
- rdmsrl(address, m.misc);
- rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
- m.status);
- m.bank = K8_MCE_THRESHOLD_BASE
- + bank * NR_BLOCKS
- + block;
- mce_log(&m);
- return;
- }
- }
- }
-}
-
-/*
- * Sysfs Interface
- */
-
-struct threshold_attr {
- struct attribute attr;
- ssize_t (*show) (struct threshold_block *, char *);
- ssize_t (*store) (struct threshold_block *, const char *, size_t count);
-};
-
-#define SHOW_FIELDS(name) \
-static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
-{ \
- return sprintf(buf, "%lx\n", (unsigned long) b->name); \
-}
-SHOW_FIELDS(interrupt_enable)
-SHOW_FIELDS(threshold_limit)
-
-static ssize_t
-store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
-{
- struct thresh_restart tr;
- unsigned long new;
-
- if (strict_strtoul(buf, 0, &new) < 0)
- return -EINVAL;
-
- b->interrupt_enable = !!new;
-
- tr.b = b;
- tr.reset = 0;
- tr.old_limit = 0;
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
- return size;
-}
-
-static ssize_t
-store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
-{
- struct thresh_restart tr;
- unsigned long new;
-
- if (strict_strtoul(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (new > THRESHOLD_MAX)
- new = THRESHOLD_MAX;
- if (new < 1)
- new = 1;
-
- tr.old_limit = b->threshold_limit;
- b->threshold_limit = new;
- tr.b = b;
- tr.reset = 0;
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
- return size;
-}
-
-struct threshold_block_cross_cpu {
- struct threshold_block *tb;
- long retval;
-};
-
-static void local_error_count_handler(void *_tbcc)
-{
- struct threshold_block_cross_cpu *tbcc = _tbcc;
- struct threshold_block *b = tbcc->tb;
- u32 low, high;
-
- rdmsr(b->address, low, high);
- tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
-}
-
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
-{
- struct threshold_block_cross_cpu tbcc = { .tb = b, };
-
- smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
- return sprintf(buf, "%lx\n", tbcc.retval);
-}
-
-static ssize_t store_error_count(struct threshold_block *b,
- const char *buf, size_t count)
-{
- struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- return 1;
-}
-
-#define RW_ATTR(val) \
-static struct threshold_attr val = { \
- .attr = {.name = __stringify(val), .mode = 0644 }, \
- .show = show_## val, \
- .store = store_## val, \
-};
-
-RW_ATTR(interrupt_enable);
-RW_ATTR(threshold_limit);
-RW_ATTR(error_count);
-
-static struct attribute *default_attrs[] = {
- &interrupt_enable.attr,
- &threshold_limit.attr,
- &error_count.attr,
- NULL
-};
-
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct threshold_block *b = to_block(kobj);
- struct threshold_attr *a = to_attr(attr);
- ssize_t ret;
-
- ret = a->show ? a->show(b, buf) : -EIO;
-
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct threshold_block *b = to_block(kobj);
- struct threshold_attr *a = to_attr(attr);
- ssize_t ret;
-
- ret = a->store ? a->store(b, buf, count) : -EIO;
-
- return ret;
-}
-
-static const struct sysfs_ops threshold_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type threshold_ktype = {
- .sysfs_ops = &threshold_ops,
- .default_attrs = default_attrs,
-};
-
-static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
- unsigned int bank,
- unsigned int block,
- u32 address)
-{
- struct threshold_block *b = NULL;
- u32 low, high;
- int err;
-
- if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
- return 0;
-
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
- return 0;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- goto recurse;
- else
- return 0;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- goto recurse;
-
- b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
- if (!b)
- return -ENOMEM;
-
- b->block = block;
- b->bank = bank;
- b->cpu = cpu;
- b->address = address;
- b->interrupt_enable = 0;
- b->threshold_limit = THRESHOLD_MAX;
-
- INIT_LIST_HEAD(&b->miscj);
-
- if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
- list_add(&b->miscj,
- &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- } else {
- per_cpu(threshold_banks, cpu)[bank]->blocks = b;
- }
-
- err = kobject_init_and_add(&b->kobj, &threshold_ktype,
- per_cpu(threshold_banks, cpu)[bank]->kobj,
- "misc%i", block);
- if (err)
- goto out_free;
-recurse:
- if (!block) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- return 0;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
-
- err = allocate_threshold_blocks(cpu, bank, ++block, address);
- if (err)
- goto out_free;
-
- if (b)
- kobject_uevent(&b->kobj, KOBJ_ADD);
-
- return err;
-
-out_free:
- if (b) {
- kobject_put(&b->kobj);
- kfree(b);
- }
- return err;
-}
-
-static __cpuinit long
-local_allocate_threshold_blocks(int cpu, unsigned int bank)
-{
- return allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
-}
-
-/* Symlink sibling shared banks to the first core; the first core owns the dir/files. */
-static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
-{
- int i, err = 0;
- struct threshold_bank *b = NULL;
- char name[32];
-#ifdef CONFIG_SMP
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-#endif
-
- sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
- if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = cpumask_first(c->llc_shared_map);
-
- /* first core not up yet */
- if (cpu_data(i).cpu_core_id)
- goto out;
-
- /* already linked */
- if (per_cpu(threshold_banks, cpu)[bank])
- goto out;
-
- b = per_cpu(threshold_banks, i)[bank];
-
- if (!b)
- goto out;
-
- err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
- b->kobj, name);
- if (err)
- goto out;
-
- cpumask_copy(b->cpus, c->llc_shared_map);
- per_cpu(threshold_banks, cpu)[bank] = b;
-
- goto out;
- }
-#endif
-
- b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
- if (!b) {
- err = -ENOMEM;
- goto out;
- }
- if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
- kfree(b);
- err = -ENOMEM;
- goto out;
- }
-
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
- if (!b->kobj)
- goto out_free;
-
-#ifndef CONFIG_SMP
- cpumask_setall(b->cpus);
-#else
- cpumask_copy(b->cpus, c->llc_shared_map);
-#endif
-
- per_cpu(threshold_banks, cpu)[bank] = b;
-
- err = local_allocate_threshold_blocks(cpu, bank);
- if (err)
- goto out_free;
-
- for_each_cpu(i, b->cpus) {
- if (i == cpu)
- continue;
-
- err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
- b->kobj, name);
- if (err)
- goto out;
-
- per_cpu(threshold_banks, i)[bank] = b;
- }
-
- goto out;
-
-out_free:
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- free_cpumask_var(b->cpus);
- kfree(b);
-out:
- return err;
-}
-
-/* create dir/files for all valid threshold banks */
-static __cpuinit int threshold_create_device(unsigned int cpu)
-{
- unsigned int bank;
- int err = 0;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- err = threshold_create_bank(cpu, bank);
- if (err)
- goto out;
- }
-out:
- return err;
-}
-
-/*
- * Let's be hotplug friendly.
- * On multi-core processors, the first core always takes ownership of the
- * shared sysfs dir/files, and the rest of the cores are symlinked to it.
- */
-
-static void deallocate_threshold_block(unsigned int cpu,
- unsigned int bank)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
-
- if (!head)
- return;
-
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
- kobject_put(&pos->kobj);
- list_del(&pos->miscj);
- kfree(pos);
- }
-
- kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
- per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
-}
-
-static void threshold_remove_bank(unsigned int cpu, int bank)
-{
- struct threshold_bank *b;
- char name[32];
- int i = 0;
-
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
- return;
- if (!b->blocks)
- goto free_out;
-
- sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
- /* sibling symlink */
- if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
-
- return;
- }
-#endif
-
- /* remove all sibling symlinks before unregistering */
- for_each_cpu(i, b->cpus) {
- if (i == cpu)
- continue;
-
- sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
- per_cpu(threshold_banks, i)[bank] = NULL;
- }
-
- deallocate_threshold_block(cpu, bank);
-
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- free_cpumask_var(b->cpus);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
-}
-
-static void threshold_remove_device(unsigned int cpu)
-{
- unsigned int bank;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
- }
-}
-
-/* get notified when a cpu comes on/off */
-static void __cpuinit
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
-{
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- threshold_create_device(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- threshold_remove_device(cpu);
- break;
- default:
- break;
- }
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
- /* to hit CPUs online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = threshold_create_device(lcpu);
-
- if (err)
- return err;
- }
- threshold_cpu_callback = amd_64_threshold_cpu_callback;
-
- return 0;
-}
-device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
deleted file mode 100644
index 62b48e40920a..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Intel specific MCE features.
- * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
- * Copyright (C) 2008, 2009 Intel Corporation
- * Author: Andi Kleen
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/percpu.h>
-#include <linux/sched.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-
-/*
- * Support for Intel Corrected Machine Check Interrupts (CMCI). This allows
- * the CPU to raise an interrupt when a corrected machine check occurs.
- * Normally we pick those up using a regular polling timer.
- * Also supports reliable discovery of shared banks.
- */
-
-static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
-
-/*
- * cmci_discover_lock protects against parallel discovery attempts
- * which could race against each other.
- */
-static DEFINE_SPINLOCK(cmci_discover_lock);
-
-#define CMCI_THRESHOLD 1
-
-static int cmci_supported(int *banks)
-{
- u64 cap;
-
- if (mce_cmci_disabled || mce_ignore_ce)
- return 0;
-
-	/*
-	 * The vendor check is not strictly needed, but the initialization is
-	 * vendor-keyed, and this makes sure none of the backdoors are entered
-	 * otherwise.
-	 */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return 0;
- if (!cpu_has_apic || lapic_get_maxlvt() < 6)
- return 0;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
- return !!(cap & MCG_CMCI_P);
-}
-
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- mce_notify_irq();
-}
-
-static void print_update(char *type, int *hdr, int num)
-{
- if (*hdr == 0)
- printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
- *hdr = 1;
- printk(KERN_CONT " %s:%d", type, num);
-}
-
-/*
- * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
- * on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
- */
-static void cmci_discover(int banks, int boot)
-{
- unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
- unsigned long flags;
- int hdr = 0;
- int i;
-
- spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- u64 val;
-
- if (test_bit(i, owned))
- continue;
-
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & CMCI_EN) {
- if (test_and_clear_bit(i, owned) && !boot)
- print_update("SHD", &hdr, i);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
- continue;
- }
-
- val |= CMCI_EN | CMCI_THRESHOLD;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & CMCI_EN) {
- if (!test_and_set_bit(i, owned) && !boot)
- print_update("CMCI", &hdr, i);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
- } else {
- WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
- }
- }
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
- if (hdr)
- printk(KERN_CONT "\n");
-}
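The discovery loop above rests on a single probe: the CMCI_EN bit sticks only for the CPU that owns a bank, so writing it and reading it back answers the ownership question. The probe in isolation, as a sketch:

/* Try to claim CMCI ownership of bank @i; returns true on success. */
static bool __maybe_unused claim_cmci_bank(int i)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(i), val);
	if (val & CMCI_EN)
		return false;			/* already owned elsewhere */

	wrmsrl(MSR_IA32_MCx_CTL2(i), val | CMCI_EN | CMCI_THRESHOLD);
	rdmsrl(MSR_IA32_MCx_CTL2(i), val);

	return !!(val & CMCI_EN);		/* did the enable bit stick? */
}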
-
-/*
- * Just in case we missed an event during initialization, check
- * all the CMCI-owned banks.
- */
-void cmci_recheck(void)
-{
- unsigned long flags;
- int banks;
-
- if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
- return;
- local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- local_irq_restore(flags);
-}
-
-/*
- * Disable CMCI on this CPU for all banks it owns when it goes down.
- * This allows other CPUs to claim the banks on rediscovery.
- */
-void cmci_clear(void)
-{
- unsigned long flags;
- int i;
- int banks;
- u64 val;
-
- if (!cmci_supported(&banks))
- return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
- continue;
- /* Disable CMCI */
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- __clear_bit(i, __get_cpu_var(mce_banks_owned));
- }
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-/*
- * After a CPU went down, cycle through all the others and rediscover.
- * Must run in process context.
- */
-void cmci_rediscover(int dying)
-{
- int banks;
- int cpu;
- cpumask_var_t old;
-
- if (!cmci_supported(&banks))
- return;
- if (!alloc_cpumask_var(&old, GFP_KERNEL))
- return;
- cpumask_copy(old, &current->cpus_allowed);
-
- for_each_online_cpu(cpu) {
- if (cpu == dying)
- continue;
- if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
- continue;
-		/* Recheck banks in case CPUs don't all have the same banks */
- if (cmci_supported(&banks))
- cmci_discover(banks, 0);
- }
-
- set_cpus_allowed_ptr(current, old);
- free_cpumask_var(old);
-}
-
-/*
- * Reenable CMCI on this CPU in case a CPU down failed.
- */
-void cmci_reenable(void)
-{
- int banks;
- if (cmci_supported(&banks))
- cmci_discover(banks, 0);
-}
-
-static void intel_init_cmci(void)
-{
- int banks;
-
- if (!cmci_supported(&banks))
- return;
-
- mce_threshold_vector = intel_threshold_interrupt;
- cmci_discover(banks, 1);
-	/*
-	 * For CPU #0 this runs with the APIC still disabled, but that's
-	 * OK because only the vector is set up. We still do another
-	 * check for the banks later for CPU #0 just to make sure we
-	 * don't miss any events.
-	 */
- apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
- cmci_recheck();
-}
-
-void mce_intel_feature_init(struct cpuinfo_x86 *c)
-{
- intel_init_thermal(c);
- intel_init_cmci();
-}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
deleted file mode 100644
index e1a0a3bf9716..000000000000
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- * Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/sysdev.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
-
-/*
- * Current thermal throttling state:
- */
-struct thermal_state {
- bool is_throttled;
-
- u64 next_check;
- unsigned long throttle_count;
- unsigned long last_throttle_count;
-};
-
-static DEFINE_PER_CPU(struct thermal_state, thermal_state);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-static u32 lvtthmr_init __read_mostly;
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
-
-#define define_therm_throt_sysdev_show_func(name) \
- \
-static ssize_t therm_throt_sysdev_show_##name( \
- struct sys_device *dev, \
- struct sysdev_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).name); \
- else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
-}
-
-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
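For readability, this is roughly what the two macro invocations above expand to for throttle_count (a mechanical expansion, shown only for illustration):

static ssize_t therm_throt_sysdev_show_throttle_count(struct sys_device *dev,
						      struct sysdev_attribute *attr,
						      char *buf)
{
	unsigned int cpu = dev->id;
	ssize_t ret;

	preempt_disable();	/* CPU hotplug */
	if (cpu_online(cpu))
		ret = sprintf(buf, "%lu\n",
			      per_cpu(thermal_state, cpu).throttle_count);
	else
		ret = 0;
	preempt_enable();

	return ret;
}
static SYSDEV_ATTR(throttle_count, 0444,
		   therm_throt_sysdev_show_throttle_count, NULL);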
-
-static struct attribute *thermal_throttle_attrs[] = {
- &attr_throttle_count.attr,
- NULL
-};
-
-static struct attribute_group thermal_throttle_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-/**
- * therm_throt_process - Process thermal throttling event from interrupt
- * @is_throttled: Whether the throttling condition is currently present
- *	(boolean); the thermal interrupt normally fires both when a thermal
- *	event begins and once it has ended.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be logged further, i.e. still in
- *              "timeout" from the previous log message.
- *          1 : Event should be logged further, and a message has been
- *              printed to the syslog.
- */
-static int therm_throt_process(bool is_throttled)
-{
- struct thermal_state *state;
- unsigned int this_cpu;
- bool was_throttled;
- u64 now;
-
- this_cpu = smp_processor_id();
- now = get_jiffies_64();
- state = &per_cpu(thermal_state, this_cpu);
-
- was_throttled = state->is_throttled;
- state->is_throttled = is_throttled;
-
- if (is_throttled)
- state->throttle_count++;
-
- if (time_before64(now, state->next_check) &&
- state->throttle_count != state->last_throttle_count)
- return 0;
-
- state->next_check = now + CHECK_INTERVAL;
- state->last_throttle_count = state->throttle_count;
-
- /* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
-
- add_taint(TAINT_MACHINE_CHECK);
- return 1;
- }
- if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
- return 1;
- }
-
- return 0;
-}
-
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
-{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
-}
-
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
-{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
-}
-
-/* Mutex protecting device creation against CPU hotplug: */
-static DEFINE_MUTEX(therm_cpu_lock);
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
- int err = 0;
-
- sys_dev = get_cpu_sysdev(cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev);
- mutex_unlock(&therm_cpu_lock);
- WARN_ON(err);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- mutex_lock(&therm_cpu_lock);
- thermal_throttle_remove_dev(sys_dev);
- mutex_unlock(&therm_cpu_lock);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
-{
- .notifier_call = thermal_throttle_cpu_callback,
-};
-
-static __init int thermal_throttle_init_device(void)
-{
- unsigned int cpu = 0;
- int err;
-
- if (!atomic_read(&therm_throt_en))
- return 0;
-
- register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
-
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_lock(&therm_cpu_lock);
-#endif
- /* connect live CPUs to sysfs */
- for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu));
- WARN_ON(err);
- }
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_unlock(&therm_cpu_lock);
-#endif
-
- return 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
- mce_log_therm_throt_event(msr_val);
-}
-
-static void unexpected_thermal_interrupt(void)
-{
- printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
- smp_processor_id());
- add_taint(TAINT_MACHINE_CHECK);
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
-{
- exit_idle();
- irq_enter();
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
-}
-
-/* Thermal monitoring depends on APIC, ACPI and clock modulation */
-static int intel_thermal_supported(struct cpuinfo_x86 *c)
-{
- if (!cpu_has_apic)
- return 0;
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
- return 0;
- return 1;
-}
-
-void __init mcheck_intel_therm_init(void)
-{
-	/*
-	 * This function is only called on the boot CPU. Save the initial
-	 * thermal LVT value on the BSP and use it later to restore the
-	 * thermal LVT entry that the BIOS programmed on the APs.
-	 */
- if (intel_thermal_supported(&boot_cpu_data))
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- int tm2 = 0;
- u32 l, h;
-
- if (!intel_thermal_supported(c))
- return;
-
-	/*
-	 * First check if it's enabled already, in which case there might
-	 * be some SMM goo which handles it, so we can't even put a handler
-	 * since it might be delivered via SMI already:
-	 */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
-	/*
-	 * The initial value of thermal LVT entries on all APs always reads
-	 * 0x10000 because APs are woken up by the BSP issuing an
-	 * INIT-SIPI-SIPI sequence to them, and LVT registers are reset to 0s
-	 * except for the mask bits, which are set to 1s when APs receive the
-	 * INIT IPI. Always restore the value that the BIOS programmed on the
-	 * AP, based on the BSP's info we saved, since the BIOS always sets
-	 * the same value for all threads/cores.
-	 */
- apic_write(APIC_LVTTHMR, lvtthmr_init);
-
- h = lvtthmr_init;
-
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG
- "CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- /* Check whether a vector already exists */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
-	/* early Pentium M models use a different method for enabling TM2 */
- if (cpu_has(c, X86_FEATURE_TM2)) {
- if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
- rdmsr(MSR_THERM2_CTL, l, h);
- if (l & MSR_THERM2_CTL_TM_SELECT)
- tm2 = 1;
- } else if (l & MSR_IA32_MISC_ENABLE_TM2)
- tm2 = 1;
- }
-
- /* We'll mask the thermal vector in the lapic till we're ready: */
- h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
- smp_thermal_vector = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- /* Unmask the thermal vector: */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
- printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
- tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
deleted file mode 100644
index d746df2909c9..000000000000
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Common corrected MCE threshold handler code:
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-
-#include <asm/irq_vectors.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-
-static void default_threshold_interrupt(void)
-{
- printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
- THRESHOLD_APIC_VECTOR);
-}
-
-void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-
-asmlinkage void smp_threshold_interrupt(void)
-{
- exit_idle();
- irq_enter();
- inc_irq_stat(irq_threshold_count);
- mce_threshold_vector();
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
-}
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..193d98b33a0a
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o
+microcode-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 000000000000..3821a985f4ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,1306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows upgrading microcode on AMD family 0x10 ("F10h")
+ * CPUs and later.
+ *
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2018 Borislav Petkov <bp@alien8.de>
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
+ *
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/bsearch.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include <crypto/sha2.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+#include <asm/tlb.h>
+
+#include "internal.h"
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(microcode_cache);
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
+
+struct equiv_cpu_entry {
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __packed;
+
+struct microcode_header_amd {
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __packed;
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[];
+};
+
+static struct equiv_cpu_table {
+ unsigned int num_entries;
+ struct equiv_cpu_entry *entry;
+} equiv_table;
+
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
+ */
+struct cont_desc {
+ struct microcode_amd *mc;
+ u32 psize;
+ u8 *data;
+ size_t size;
+};
+
+/*
+ * The microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/arch/x86/microcode.rst
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load, because the patch revision ID
+ *    already contains the f/m/s for which the microcode is destined.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static u32 cpuid_to_ucode_rev(unsigned int val)
+{
+ union zen_patch_rev p = {};
+ union cpuid_1_eax c;
+
+ c.full = val;
+
+ p.stepping = c.stepping;
+ p.model = c.model;
+ p.ext_model = c.ext_model;
+ p.ext_fam = c.ext_fam;
+
+ return p.ucode_rev;
+}
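A worked example of the packing, with concrete values chosen for illustration: CPUID(1).EAX = 0x00a00f11 decodes to ext_fam 0xa, ext_model 0, family 0xf, model 1, stepping 1, and repacking those fields through the zen_patch_rev layout yields 0x0a001100, i.e. the 0xa0011xx patch-ID range that also appears in the cutoff table below.

static void __maybe_unused rev_mapping_example(void)
{
	/* Hypothetical Zen3-class CPUID(1).EAX value. */
	u32 eax = 0x00a00f11;

	/* The f/m/s fields repacked into the patch-revision layout. */
	WARN_ON(cpuid_to_ucode_rev(eax) != 0x0a001100);
}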
+
+static u32 get_cutoff_revision(u32 rev)
+{
+	switch (rev >> 8) {
+	case 0x80012: return 0x8001277;
+	case 0x80082: return 0x800820f;
+	case 0x83010: return 0x830107c;
+	case 0x86001: return 0x860010e;
+	case 0x86081: return 0x8608108;
+	case 0x87010: return 0x8701034;
+	case 0x8a000: return 0x8a0000a;
+	case 0xa0010: return 0xa00107a;
+	case 0xa0011: return 0xa0011da;
+	case 0xa0012: return 0xa001243;
+	case 0xa0082: return 0xa00820e;
+	case 0xa1011: return 0xa101153;
+	case 0xa1012: return 0xa10124e;
+	case 0xa1081: return 0xa108109;
+	case 0xa2010: return 0xa20102f;
+	case 0xa2012: return 0xa201212;
+	case 0xa4041: return 0xa404109;
+	case 0xa5000: return 0xa500013;
+	case 0xa6012: return 0xa60120a;
+	case 0xa7041: return 0xa704109;
+	case 0xa7052: return 0xa705208;
+	case 0xa7080: return 0xa708009;
+	case 0xa70c0: return 0xa70c009;
+	case 0xaa001: return 0xaa00116;
+	case 0xaa002: return 0xaa00218;
+	case 0xb0021: return 0xb002146;
+	case 0xb0081: return 0xb008111;
+	case 0xb1010: return 0xb101046;
+	case 0xb2040: return 0xb204031;
+	case 0xb4040: return 0xb404031;
+	case 0xb4041: return 0xb404101;
+	case 0xb6000: return 0xb600031;
+	case 0xb6080: return 0xb608031;
+	case 0xb7000: return 0xb700031;
+	default: break;
+	}
+ return 0;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ u32 cutoff;
+
+ if (!cur_rev) {
+ cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+ pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev);
+ }
+
+ cutoff = get_cutoff_revision(cur_rev);
+ if (cutoff)
+ return cur_rev <= cutoff;
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
+
+static bool cpu_has_entrysign(void)
+{
+ unsigned int fam = x86_family(bsp_cpuid_1_eax);
+ unsigned int model = x86_model(bsp_cpuid_1_eax);
+
+ if (fam == 0x17 || fam == 0x19)
+ return true;
+
+ if (fam == 0x1a) {
+ if (model <= 0x2f ||
+ (0x40 <= model && model <= 0x4f) ||
+ (0x60 <= model && model <= 0x6f))
+ return true;
+ }
+
+ return false;
+}
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ int i;
+
+ if (!cpu_has_entrysign())
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256(data, len, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ int cpu = smp_processor_id();
+
+ if (!microcode_rev[cpu]) {
+ if (!base_rev)
+ base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+
+ microcode_rev[cpu] = base_rev;
+
+ ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev);
+ }
+
+ return microcode_rev[cpu];
+ }
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
+static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
+{
+ unsigned int i;
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
+ if (!et || !et->num_entries)
+ return 0;
+
+ for (i = 0; i < et->num_entries; i++) {
+ struct equiv_cpu_entry *e = &et->entry[i];
+
+ if (sig == e->installed_cpu)
+ return e->equiv_cpu;
+ }
+ return 0;
+}
+
+/*
+ * Check whether there is a valid microcode container file at the beginning
+ * of @buf of size @buf_size.
+ */
+static bool verify_container(const u8 *buf, size_t buf_size)
+{
+ u32 cont_magic;
+
+ if (buf_size <= CONTAINER_HDR_SZ) {
+ ucode_dbg("Truncated microcode container header.\n");
+ return false;
+ }
+
+ cont_magic = *(const u32 *)buf;
+ if (cont_magic != UCODE_MAGIC) {
+ ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Check whether there is a valid, non-truncated CPU equivalence table at the
+ * beginning of @buf of size @buf_size.
+ */
+static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
+{
+ const u32 *hdr = (const u32 *)buf;
+ u32 cont_type, equiv_tbl_len;
+
+ if (!verify_container(buf, buf_size))
+ return false;
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
+ cont_type = hdr[1];
+ if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
+ ucode_dbg("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
+ return false;
+ }
+
+ buf_size -= CONTAINER_HDR_SZ;
+
+ equiv_tbl_len = hdr[2];
+ if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
+ buf_size < equiv_tbl_len) {
+ ucode_dbg("Truncated equivalence table.\n");
+ return false;
+ }
+
+ return true;
+}
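The container header validated by the two functions above is simply three little-endian u32s (CONTAINER_HDR_SZ == 12). A layout sketch, with field names invented for illustration; the code itself indexes them as hdr[0..2]:

struct cont_hdr {
	u32 magic;	/* UCODE_MAGIC == 0x00414d44 */
	u32 type;	/* UCODE_EQUIV_CPU_TABLE_TYPE for the leading section */
	u32 size;	/* equivalence table length in bytes */
} __packed;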
+
+/*
+ * Check whether there is a valid, non-truncated microcode patch section at the
+ * beginning of @buf of size @buf_size.
+ *
+ * On success, @sh_psize returns the patch size according to the section header,
+ * to the caller.
+ */
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
+{
+ u32 p_type, p_size;
+ const u32 *hdr;
+
+ if (buf_size < SECTION_HDR_SIZE) {
+ ucode_dbg("Truncated patch section.\n");
+ return false;
+ }
+
+ hdr = (const u32 *)buf;
+ p_type = hdr[0];
+ p_size = hdr[1];
+
+ if (p_type != UCODE_UCODE_TYPE) {
+ ucode_dbg("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
+ return false;
+ }
+
+ if (p_size < sizeof(struct microcode_header_amd)) {
+ ucode_dbg("Patch of size %u too short.\n", p_size);
+ return false;
+ }
+
+ *sh_psize = p_size;
+
+ return true;
+}
+
+/*
+ * Check whether the passed remaining file @buf_size is large enough to contain
+ * a patch of the indicated @sh_psize (and also whether this size does not
+ * exceed the per-family maximum). @sh_psize is the size read from the section
+ * header.
+ */
+static bool __verify_patch_size(u32 sh_psize, size_t buf_size)
+{
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ u32 max_size;
+
+ if (family >= 0x15)
+ goto ret;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+
+ switch (family) {
+ case 0x10 ... 0x12:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ default:
+ WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
+ return false;
+ }
+
+ if (sh_psize > max_size)
+ return false;
+
+ret:
+	/* Working with the whole remaining buffer, so <= is the right check. */
+ return sh_psize <= buf_size;
+}
+
+/*
+ * Verify the patch in @buf.
+ *
+ * Returns:
+ * negative: on error
+ * positive: patch is not for this family, skip it
+ * 0: success
+ */
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
+{
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ struct microcode_header_amd *mc_hdr;
+ u32 cur_rev, cutoff, patch_rev;
+ u32 sh_psize;
+ u16 proc_id;
+ u8 patch_fam;
+
+ if (!__verify_patch_section(buf, buf_size, &sh_psize))
+ return -1;
+
+ /*
+ * The section header length is not included in this indicated size
+ * but is present in the leftover file length so we need to subtract
+ * it before passing this value to the function below.
+ */
+ buf_size -= SECTION_HDR_SIZE;
+
+ /*
+ * Check if the remaining buffer is big enough to contain a patch of
+ * size sh_psize, as the section claims.
+ */
+ if (buf_size < sh_psize) {
+ ucode_dbg("Patch of size %u truncated.\n", sh_psize);
+ return -1;
+ }
+
+ if (!__verify_patch_size(sh_psize, buf_size)) {
+ ucode_dbg("Per-family patch size mismatch.\n");
+ return -1;
+ }
+
+ *patch_size = sh_psize;
+
+ mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE);
+ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
+ return -1;
+ }
+
+ proc_id = mc_hdr->processor_rev_id;
+ patch_fam = 0xf + (proc_id >> 12);
+
+ if (patch_fam != family)
+ return 1;
+
+ cur_rev = get_patch_level();
+
+ /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */
+ cutoff = get_cutoff_revision(cur_rev);
+ if (!cutoff)
+ goto ok;
+
+ patch_rev = mc_hdr->patch_id;
+
+ ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n",
+ cur_rev, cutoff, patch_rev);
+
+ if (cur_rev <= cutoff && patch_rev <= cutoff)
+ goto ok;
+
+ if (cur_rev > cutoff && patch_rev > cutoff)
+ goto ok;
+
+ return 1;
+
+ok:
+ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
+
+ return 0;
+}
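The two goto-ok branches above encode one rule: the current revision and the candidate patch revision must lie on the same side of the cutoff. As a standalone predicate, equivalent to the checks above (a sketch):

static inline bool same_side_of_cutoff(u32 cur_rev, u32 patch_rev, u32 cutoff)
{
	/* Both at or below the cutoff, or both above it. */
	return (cur_rev <= cutoff) == (patch_rev <= cutoff);
}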
+
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
+/*
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ *
+ * Returns the number of bytes consumed while scanning. @desc contains all the
+ * data we're going to use in later stages of the application.
+ */
+static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ struct equiv_cpu_table table;
+ size_t orig_size = size;
+ u32 *hdr = (u32 *)ucode;
+ u16 eq_id;
+ u8 *buf;
+
+ if (!verify_equivalence_table(ucode, size))
+ return 0;
+
+ buf = ucode;
+
+ table.entry = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
+ table.num_entries = hdr[2] / sizeof(struct equiv_cpu_entry);
+
+ /*
+ * Find the equivalence ID of our CPU in this table. Even if this table
+ * doesn't contain a patch for the CPU, scan through the whole container
+ * so that it can be skipped in case there are other containers appended.
+ */
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
+
+ buf += hdr[2] + CONTAINER_HDR_SZ;
+ size -= hdr[2] + CONTAINER_HDR_SZ;
+
+ /*
+ * Scan through the rest of the container to find where it ends. We do
+ * some basic sanity-checking too.
+ */
+ while (size > 0) {
+ struct microcode_amd *mc;
+ u32 patch_size;
+ int ret;
+
+ ret = verify_patch(buf, size, &patch_size);
+ if (ret < 0) {
+			/*
+			 * Patch verification failed: skip to the next
+			 * container, if there is one. Before exiting, check
+			 * whether this container has already found a patch;
+			 * if so, use it.
+			 */
+ goto out;
+ } else if (ret > 0) {
+ goto skip;
+ }
+
+ mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
+
+ if (mc_patch_matches(mc, eq_id)) {
+ desc->psize = patch_size;
+ desc->mc = mc;
+
+ ucode_dbg(" match: size: %d\n", patch_size);
+ }
+
+skip:
+ /* Skip patch section header too: */
+ buf += patch_size + SECTION_HDR_SIZE;
+ size -= patch_size + SECTION_HDR_SIZE;
+ }
+
+out:
+	/*
+	 * If we have found a patch (desc->mc), we're looking at the container
+	 * that has a patch for this CPU, so return 0 to signal that @ucode
+	 * already points to the proper container. Otherwise, return the size
+	 * we scanned so that we can advance to the next container in the
+	 * buffer.
+	 */
+ if (desc->mc) {
+ desc->data = ucode;
+ desc->size = orig_size - size;
+
+ return 0;
+ }
+
+ return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ while (size) {
+ size_t s = parse_container(ucode, size, desc);
+ if (!s)
+ return;
+
+ /* catch wraparound */
+ if (size >= s) {
+ ucode += s;
+ size -= s;
+ } else {
+ return;
+ }
+ }
+}
+
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
+{
+ unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
+
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
+ return false;
+
+ native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr);
+
+ if (x86_family(bsp_cpuid_1_eax) == 0x17) {
+ unsigned long p_addr_end = p_addr + psize - 1;
+
+ invlpg(p_addr);
+
+ /*
+ * Flush next page too if patch image is crossing a page
+ * boundary.
+ */
+ if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT)
+ invlpg(p_addr_end);
+ }
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG))
+ microcode_rev[smp_processor_id()] = mc->hdr.patch_id;
+
+ /* verify patch application was successful */
+ *cur_rev = get_patch_level();
+
+ ucode_dbg("updated rev: 0x%x\n", *cur_rev);
+
+ if (*cur_rev != mc->hdr.patch_id)
+ return false;
+
+ return true;
+}
+
+static bool get_builtin_microcode(struct cpio_data *cp)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ struct firmware fw;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ if (family >= 0x15)
+ snprintf(fw_name, sizeof(fw_name),
+ "amd-ucode/microcode_amd_fam%02hhxh.bin", family);
+
+ if (firmware_request_builtin(&fw, fw_name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
+ return false;
+}
+
+static bool __init find_blobs_in_containers(struct cpio_data *ret)
+{
+ struct cpio_data cp;
+ bool found;
+
+ if (!get_builtin_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
+
+ found = cp.data && cp.size;
+ if (found)
+ *ret = cp;
+
+ return found;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse the equivalence CPU table, look
+ * for a matching microcode patch, and apply the update, all in place in
+ * initrd memory. When vmalloc() is available for use later -- on 64-bit
+ * during first AP load, and on 32-bit during save_microcode_in_initrd() --
+ * we can call load_microcode_amd() to save the equivalence CPU table and
+ * microcode patches in kernel heap memory.
+ */
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
+{
+ struct cont_desc desc = { };
+ struct microcode_amd *mc;
+ struct cpio_data cp = { };
+ char buf[4];
+ u32 rev;
+
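+ /* "microcode.amd_sha_check=off" disables SHA256 verification of the patch blobs. */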
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+ }
+
+ bsp_cpuid_1_eax = cpuid_1_eax;
+
+ rev = get_patch_level();
+ ed->old_rev = rev;
+
+ /* Needed in load_microcode_amd() */
+ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
+
+ if (!find_blobs_in_containers(&cp))
+ return;
+
+ scan_containers(cp.data, cp.size, &desc);
+
+ mc = desc.mc;
+ if (!mc)
+ return;
+
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
+
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
+}
+
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+ }
+
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
+{
+ struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ list_for_each_entry(p, &microcode_cache, plist)
+ if (patch_cpus_equivalent(p, &n, false))
+ return p;
+
+ return NULL;
+}
+
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ if (zn.stepping != zp.stepping)
+ return -1;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+ struct ucode_patch *p;
+ int ret;
+
+ list_for_each_entry(p, &microcode_cache, plist) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
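+ /* A negative return means a stepping mismatch - not a replacement candidate. */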
+ if (ret < 0)
+ continue;
+ else if (!ret) {
+ /* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
+ return;
+ }
+
+ list_replace(&p->plist, &new_patch->plist);
+ kfree(p->data);
+ kfree(p);
+ return;
+ }
+ }
+ /* no patch found, add it */
+ list_add_tail(&new_patch->plist, &microcode_cache);
+}
+
+static void free_cache(void)
+{
+ struct ucode_patch *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
+ __list_del(p->plist.prev, p->plist.next);
+ kfree(p->data);
+ kfree(p);
+ }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u16 equiv_id = 0;
+
+ uci->cpu_sig.rev = get_patch_level();
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
+
+ return cache_find_patch(uci, equiv_id);
+}
+
+void reload_ucode_amd(unsigned int cpu)
+{
+ u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
+ struct ucode_patch *p;
+
+ p = find_patch(cpu);
+ if (!p)
+ return;
+
+ mc = p->data;
+
+ rev = get_patch_level();
+ if (rev < mc->hdr.patch_id) {
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
+ }
+}
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct ucode_patch *p;
+
+ csig->sig = cpuid_eax(0x00000001);
+ csig->rev = get_patch_level();
+
+ /*
+ * A patch could have been loaded early; set uci->mc so that
+ * microcode_bsp_resume() can call apply_microcode().
+ */
+ p = find_patch(cpu);
+ if (p && (p->patch_id == csig->rev))
+ uci->mc = p->data;
+
+ return 0;
+}
+
+static enum ucode_state apply_microcode_amd(int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+ u32 rev;
+
+ BUG_ON(raw_smp_processor_id() != cpu);
+
+ uci = ucode_cpu_info + cpu;
+
+ p = find_patch(cpu);
+ if (!p)
+ return UCODE_NFOUND;
+
+ rev = uci->cpu_sig.rev;
+
+ mc_amd = p->data;
+ uci->mc = p->data;
+
+ /* need to apply patch? */
+ if (rev > mc_amd->hdr.patch_id) {
+ ret = UCODE_OK;
+ goto out;
+ }
+
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
+ pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+ cpu, mc_amd->hdr.patch_id);
+ return UCODE_ERROR;
+ }
+
+ rev = mc_amd->hdr.patch_id;
+ ret = UCODE_UPDATED;
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
+}
+
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax;
+ apply_microcode_amd(cpu);
+}
+
+static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
+{
+ u32 equiv_tbl_len;
+ const u32 *hdr;
+
+ if (!verify_equivalence_table(buf, buf_size))
+ return 0;
+
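+ /* The third container header dword holds the equivalence table length in bytes. */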
+ hdr = (const u32 *)buf;
+ equiv_tbl_len = hdr[2];
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
+ equiv_table.entry = vmalloc(equiv_tbl_len);
+ if (!equiv_table.entry) {
+ pr_err("failed to allocate equivalent CPU table\n");
+ return 0;
+ }
+
+ memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
+ equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+
+out:
+ /* add header length */
+ return equiv_tbl_len + CONTAINER_HDR_SZ;
+}
+
+static void free_equiv_cpu_table(void)
+{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
+ vfree(equiv_table.entry);
+ memset(&equiv_table, 0, sizeof(equiv_table));
+}
+
+static void cleanup(void)
+{
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * Return a non-negative value even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error, such as a failed memory allocation, meaning the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
+ unsigned int *patch_size)
+{
+ struct microcode_header_amd *mc_hdr;
+ struct ucode_patch *patch;
+ u16 proc_id;
+ int ret;
+
+ ret = verify_patch(fw, leftover, patch_size);
+ if (ret)
+ return ret;
+
+ patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+ if (!patch) {
+ pr_err("Patch allocation failure.\n");
+ return -EINVAL;
+ }
+
+ patch->data = kmemdup(fw + SECTION_HDR_SIZE, *patch_size, GFP_KERNEL);
+ if (!patch->data) {
+ pr_err("Patch data allocation failure.\n");
+ kfree(patch);
+ return -EINVAL;
+ }
+ patch->size = *patch_size;
+
+ mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+ proc_id = mc_hdr->processor_rev_id;
+
+ INIT_LIST_HEAD(&patch->plist);
+ patch->patch_id = mc_hdr->patch_id;
+ patch->equiv_cpu = proc_id;
+
+ ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
+ __func__, patch->patch_id, proc_id);
+
+ /* ... and add to cache. */
+ update_cache(patch);
+
+ return 0;
+}
+
+/* Scan the blob in @data and add microcode patches to the cache. */
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ u8 *fw = (u8 *)data;
+ size_t offset;
+
+ offset = install_equiv_cpu_table(data, size);
+ if (!offset)
+ return UCODE_ERROR;
+
+ fw += offset;
+ size -= offset;
+
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ free_equiv_cpu_table();
+ return UCODE_ERROR;
+ }
+
+ while (size > 0) {
+ unsigned int crnt_size = 0;
+ int ret;
+
+ ret = verify_and_add_patch(family, fw, size, &crnt_size);
+ if (ret < 0)
+ return UCODE_ERROR;
+
+ fw += crnt_size + SECTION_HDR_SIZE;
+ size -= (crnt_size + SECTION_HDR_SIZE);
+ }
+
+ return UCODE_OK;
+}
+
+static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ enum ucode_state ret;
+
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = __load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
+ cleanup();
+
+ return ret;
+}
+
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ ret = _load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
+ return ret;
+
+ for_each_node_with_cpus(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
+
+ p = find_patch(cpu);
+ if (!p)
+ continue;
+
+ if (c->microcode >= p->patch_id)
+ continue;
+
+ ret = UCODE_NEW;
+ }
+
+ return ret;
+}
+
+static int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
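+ /* Anything beyond UCODE_UPDATED in the state enum denotes failure. */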
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+early_initcall(save_microcode_in_initrd);
+
+/*
+ * AMD microcode firmware naming convention: up to family 15h, the patches
+ * are in the legacy file:
+ *
+ * amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ * amd-ucode/microcode_amd_fam15h.bin
+ * amd-ucode/microcode_amd_fam16h.bin
+ * ...
+ *
+ * These might be larger than 2K.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
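+ /* Minimal revision enforcement is not implemented here, so refuse a forced-minrev late load. */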
+ if (force_minrev)
+ return UCODE_NFOUND;
+
+ if (c->x86 >= 0x15)
+ snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+ if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+ ucode_dbg("failed to load file %s\n", fw_name);
+ goto out;
+ }
+
+ ret = UCODE_ERROR;
+ if (!verify_container(fw->data, fw->size))
+ goto fw_release;
+
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
+
+ fw_release:
+ release_firmware(fw);
+
+ out:
+ return ret;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->mc = NULL;
+}
+
+static void finalize_late_load_amd(int result)
+{
+ if (result)
+ cleanup();
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .finalize_late_load = finalize_late_load_amd,
+ .nmi_safe = true,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+ return &microcode_amd_ops;
+}
+
+void __exit exit_amd_microcode(void)
+{
+ cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..1fd349cfc802
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,556 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa0011d7, {
+ 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b,
+ 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12,
+ 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2,
+ 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00123b, {
+ 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2,
+ 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3,
+ 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28,
+ 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa00820d, {
+ 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4,
+ 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca,
+ 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1,
+ 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10114c, {
+ 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64,
+ 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74,
+ 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a,
+ 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa10124c, {
+ 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90,
+ 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46,
+ 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc,
+ 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa108109, {
+ 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa,
+ 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b,
+ 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35,
+ 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa20102e, {
+ 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd,
+ 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6,
+ 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5,
+ 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa201211, {
+ 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95,
+ 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27,
+ 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff,
+ 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa404108, {
+ 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc,
+ 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb,
+ 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19,
+ 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa500012, {
+ 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4,
+ 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16,
+ 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f,
+ 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa60120a, {
+ 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d,
+ 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b,
+ 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d,
+ 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa704108, {
+ 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93,
+ 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98,
+ 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49,
+ 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa705208, {
+ 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19,
+ 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2,
+ 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11,
+ 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa708008, {
+ 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46,
+ 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab,
+ 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16,
+ 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xa70c008, {
+ 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21,
+ 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e,
+ 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66,
+ 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+ { 0xaa00216, {
+ 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5,
+ 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08,
+ 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2,
+ 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
new file mode 100644
index 000000000000..68049f171860
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -0,0 +1,926 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * X86 CPU microcode early update for Linux:
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows updating the microcode on x86 processors.
+ */
+
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/stop_machine.h>
+#include <linux/device/faux.h>
+#include <linux/syscore_ops.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/firmware.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/msr.h>
+#include <asm/setup.h>
+
+#include "internal.h"
+
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr;
+
+bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
+
+/*
+ * The variables below should be behind CONFIG_MICROCODE_DBG ifdeffery
+ * but, in order to not uglify the code and to use IS_ENABLED() instead,
+ * they are left in unconditionally. When microcode debugging is not
+ * enabled, they are meaningless anyway.
+ */
+/* base microcode revision for debugging */
+u32 base_rev;
+u32 microcode_rev[NR_CPUS] = {};
+
+/*
+ * Synchronization.
+ *
+ * All non-CPU-hotplug-callback call sites use:
+ *
+ * - cpus_read_lock/unlock() to synchronize with
+ * the CPU-hotplug-callback call sites.
+ *
+ * We guarantee that only a single CPU is being
+ * updated at any given time.
+ */
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+struct early_load_data early_data;
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+ u32 lvl, dummy, i;
+ u32 *levels;
+
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i])
+ return true;
+ }
+ return false;
+}
+
+bool __init microcode_loader_disabled(void)
+{
+ if (dis_ucode_ldr)
+ return true;
+
+ /*
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is set, i.e. the kernel runs under a
+ * hypervisor; the bit is reserved for hypervisor use. This is still
+ * not completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
+ */
+ if (!cpuid_feature() ||
+ ((native_cpuid_ecx(1) & BIT(31)) &&
+ !IS_ENABLED(CONFIG_MICROCODE_DBG)) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
+
+ return dis_ucode_ldr;
+}
+
+static void __init early_parse_cmdline(void)
+{
+ char cmd_buf[64] = {};
+ char *s, *p = cmd_buf;
+
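+ /* "microcode=" takes a comma-separated list of options. */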
+ if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) {
+ while ((s = strsep(&p, ","))) {
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ if (strstr(s, "base_rev=")) {
+ /* advance to the option arg */
+ strsep(&s, "=");
+ if (kstrtouint(s, 16, &base_rev)) { ; }
+ }
+ }
+
+ if (!strcmp("force_minrev", s))
+ force_minrev = true;
+
+ if (!strcmp(s, "dis_ucode_ldr"))
+ dis_ucode_ldr = true;
+ }
+ }
+
+ /* old, compat option */
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+}
+
+void __init load_ucode_bsp(void)
+{
+ unsigned int cpuid_1_eax;
+ bool intel = true;
+
+ early_parse_cmdline();
+
+ if (microcode_loader_disabled())
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) < 6)
+ return;
+ break;
+
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) < 0x10)
+ return;
+ intel = false;
+ break;
+
+ default:
+ return;
+ }
+
+ if (intel)
+ load_ucode_intel_bsp(&early_data);
+ else
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
+}
+
+void load_ucode_ap(void)
+{
+ unsigned int cpuid_1_eax;
+
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't need to be used either - the BSP variant must've
+ * parsed the cmdline already anyway.
+ */
+ if (dis_ucode_ldr)
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) >= 0x10)
+ load_ucode_amd_ap(cpuid_1_eax);
+ break;
+ default:
+ break;
+ }
+}
+
+struct cpio_data __init find_microcode_in_initrd(const char *path)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ size = boot_params.hdr.ramdisk_size;
+ /* Early load on BSP has a temporary mapping. */
+ if (size)
+ start = initrd_start_early;
+
+#else /* CONFIG_X86_64 */
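+ /* Compose the 64-bit ramdisk size and address from the split boot_params fields. */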
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+ start += PAGE_OFFSET;
+ }
+#endif
+
+ /*
+ * Fixup the start address: after reserve_initrd() runs, initrd_start
+ * has the virtual address of the beginning of the initrd. reserve_initrd()
+ * also possibly relocates the ramdisk. In either case, initrd_start
+ * contains the updated address so use that instead.
+ */
+ if (initrd_start)
+ start = initrd_start;
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
+static void reload_early_microcode(unsigned int cpu)
+{
+ int vendor, family;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ reload_ucode_amd(cpu);
+ break;
+ default:
+ break;
+ }
+}
+
+/* fake device for request_firmware */
+static struct faux_device *microcode_fdev;
+
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/*
+ * Late loading dance. Why the heavy-handed stomp_machine effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ * is loading microcode in order to avoid any negative interactions caused by
+ * the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ * requirement can be relaxed in the future. Right now, this is conservative
+ * and good.
+ */
+enum sibling_ctrl {
+ /* Spinwait with timeout */
+ SCTRL_WAIT,
+ /* Invoke the apply_microcode() callback */
+ SCTRL_APPLY,
+ /* Proceed without invoking the apply_microcode() callback */
+ SCTRL_DONE,
+};
+
+struct microcode_ctrl {
+ enum sibling_ctrl ctrl;
+ enum ucode_state result;
+ unsigned int ctrl_cpu;
+ bool nmi_enabled;
+};
+
+DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
+static unsigned int loops_per_usec;
+static cpumask_t cpu_offline_mask;
+
+static noinstr bool wait_for_cpus(atomic_t *cnt)
+{
+ unsigned int timeout, loops;
+
+ WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0);
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (!raw_atomic_read(cnt))
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ /* Prevent latecomers from making progress and let them time out */
+ raw_atomic_inc(cnt);
+ return false;
+}
+
+static noinstr bool wait_for_ctrl(void)
+{
+ unsigned int timeout, loops;
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT)
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ return false;
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr bool load_secondary_wait(unsigned int ctrl_cpu)
+{
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ return false;
+ }
+
+ /*
+ * Wait for primary threads to complete. If one of them hangs due
+ * to the update, there is no way out. This is non-recoverable
+ * because the CPU might hold locks or resources and confuse the
+ * scheduler, watchdogs etc. There is no way to safely evacuate the
+ * machine.
+ */
+ if (wait_for_ctrl())
+ return true;
+
+ instrumentation_begin();
+ panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu);
+ instrumentation_end();
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr void load_secondary(unsigned int cpu)
+{
+ unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu);
+ enum ucode_state ret;
+
+ if (!load_secondary_wait(ctrl_cpu)) {
+ instrumentation_begin();
+ pr_err_once("load: %d CPUs timed out\n",
+ atomic_read(&late_cpus_in) - 1);
+ instrumentation_end();
+ return;
+ }
+
+ /* Primary thread completed. Allow to invoke instrumentable code */
+ instrumentation_begin();
+ /*
+ * If the primary succeeded then invoke the apply() callback,
+ * otherwise copy the state from the primary thread.
+ */
+ if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY)
+ ret = microcode_ops->apply_microcode(cpu);
+ else
+ ret = per_cpu(ucode_ctrl.result, ctrl_cpu);
+
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+ instrumentation_end();
+}
+
+static void __load_primary(unsigned int cpu)
+{
+ struct cpumask *secondaries = topology_sibling_cpumask(cpu);
+ enum sibling_ctrl ctrl;
+ enum ucode_state ret;
+ unsigned int sibling;
+
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1);
+ return;
+ }
+
+ ret = microcode_ops->apply_microcode(cpu);
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+
+ /*
+ * If the update was successful, let the siblings run the apply()
+ * callback. If not, tell them it's done. This also covers the
+ * case where the CPU has uniform loading at package or system
+ * scope implemented but does not advertise it.
+ */
+ if (ret == UCODE_UPDATED || ret == UCODE_OK)
+ ctrl = SCTRL_APPLY;
+ else
+ ctrl = SCTRL_DONE;
+
+ for_each_cpu(sibling, secondaries) {
+ if (sibling != cpu)
+ per_cpu(ucode_ctrl.ctrl, sibling) = ctrl;
+ }
+}
+
+static bool kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendezvous handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
+}
+
+static void release_offline_cpus(void)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void load_primary(unsigned int cpu)
+{
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
+
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = kick_offline_cpus(nr_offl);
+
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __load_primary(cpu);
+
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ release_offline_cpus();
+}
+
+/*
+ * Minimal stub rendezvous handler for soft-offlined CPUs which participate
+ * in the NMI rendezvous to protect against a concurrent NMI on affected
+ * CPUs.
+ */
+void noinstr microcode_offline_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
+
+static noinstr bool microcode_update_handler(void)
+{
+ unsigned int cpu = raw_smp_processor_id();
+
+ if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) {
+ instrumentation_begin();
+ load_primary(cpu);
+ instrumentation_end();
+ } else {
+ load_secondary(cpu);
+ }
+
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+
+ return true;
+}
+
+/*
+ * Protection against instrumentation is required for CPUs which are not
+ * safe against an NMI which is delivered to the secondary SMT sibling
+ * while the primary thread updates the microcode. Instrumentation can end
+ * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI
+ * which is the opposite of what the NMI rendezvous is trying to achieve.
+ *
+ * The primary thread is safe versus instrumentation as the actual
+ * microcode update handles this correctly. It's only the sibling code
+ * path which must be NMI safe until the primary thread completed the
+ * update.
+ */
+bool noinstr microcode_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return false;
+
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ return microcode_update_handler();
+}
+
+static int load_cpus_stopped(void *unused)
+{
+ if (microcode_ops->use_nmi) {
+ /* Enable the NMI handler and raise NMI */
+ this_cpu_write(ucode_ctrl.nmi_enabled, true);
+ apic->send_IPI(smp_processor_id(), NMI_VECTOR);
+ } else {
+ /* Just invoke the handler directly */
+ microcode_update_handler();
+ }
+ return 0;
+}
+
+static int load_late_stop_cpus(bool is_safe)
+{
+ unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
+ int old_rev = boot_cpu_data.microcode;
+ struct cpuinfo_x86 prev_info;
+
+ if (!is_safe) {
+ pr_err("Late microcode loading without minimal revision check.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+ }
+
+ /*
+ * Pre-load the microcode image into a staging device. This
+ * process is preemptible and does not require stopping CPUs.
+ * Successful staging simplifies the subsequent late-loading
+ * process, reducing rendezvous time.
+ *
+ * Even if the transfer fails, the update will proceed as usual.
+ */
+ if (microcode_ops->use_staging)
+ microcode_ops->stage_microcode();
+
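+ /* Arm the rendezvous counter with the number of online CPUs. */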
+ atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
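+ /* Scale loops_per_jiffy down to busy-wait iterations per microsecond. */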
+ loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
+
+ /*
+ * Take a snapshot before the microcode update in order to compare and
+ * check whether any bits changed after an update.
+ */
+ store_cpu_caps(&prev_info);
+
+ if (microcode_ops->use_nmi)
+ static_branch_enable_cpuslocked(&microcode_nmi_handler_enable);
+
+ stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask);
+
+ if (microcode_ops->use_nmi)
+ static_branch_disable_cpuslocked(&microcode_nmi_handler_enable);
+
+ /* Analyze the results */
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ switch (per_cpu(ucode_ctrl.result, cpu)) {
+ case UCODE_UPDATED: updated++; break;
+ case UCODE_TIMEOUT: timedout++; break;
+ case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
+ default: failed++; break;
+ }
+ }
+
+ if (microcode_ops->finalize_late_load)
+ microcode_ops->finalize_late_load(!updated);
+
+ if (!updated) {
+ /* Nothing changed. */
+ if (!failed && !timedout)
+ return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
+ pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
+ failed, timedout);
+ return -EIO;
+ }
+
+ if (!is_safe || failed || timedout)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings);
+ if (failed || timedout) {
+ pr_err("load incomplete. %u CPUs timed out or failed\n",
+ num_online_cpus() - (updated + siblings));
+ }
+ pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+
+ return updated + siblings == num_online_cpus() ? 0 : -EIO;
+}
+
+/*
+ * This function does two things:
+ *
+ * 1) Ensure that all required CPUs which are present and have been booted
+ * once are online.
+ *
+ * To pass this check, all primary threads must be online.
+ *
+ * If the microcode load is not safe against NMI then all SMT threads
+ * must be online as well because they still react to NMIs when they are
+ * soft-offlined and parked in one of the play_dead() variants. So if an
+ * NMI hits while the primary thread updates the microcode, the resulting
+ * behaviour is undefined. The default play_dead() implementation on
+ * modern CPUs uses MWAIT, which is also not guaranteed to be safe
+ * against a microcode update which affects MWAIT.
+ *
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendezvous and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvous in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains the "offline" secondary threads, so they can be
+ * handled correctly by a control CPU.
+ */
+static bool setup_cpus(void)
+{
+ struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
+ unsigned int cpu;
+
+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
+
+ cpumask_clear(&cpu_offline_mask);
+
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
+ if (!cpu_online(cpu)) {
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
+ return false;
+ }
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
+ }
+
+ /*
+ * Initialize the per CPU state. This is core scope for now,
+ * but prepared to take package or system scope into account.
+ */
+ ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu));
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ }
+ return true;
+}
+
+static int load_late_locked(void)
+{
+ if (!setup_cpus())
+ return -EBUSY;
+
+ switch (microcode_ops->request_microcode_fw(0, &microcode_fdev->dev)) {
+ case UCODE_NEW:
+ return load_late_stop_cpus(false);
+ case UCODE_NEW_SAFE:
+ return load_late_stop_cpus(true);
+ case UCODE_NFOUND:
+ return -ENOENT;
+ case UCODE_OK:
+ return 0;
+ default:
+ return -EBADFD;
+ }
+}
+
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret || val != 1)
+ return -EINVAL;
+
+ cpus_read_lock();
+ ret = load_late_locked();
+ cpus_read_unlock();
+
+ return ret ? : size;
+}
+
+static DEVICE_ATTR_WO(reload);
+#endif
+
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t processor_flags_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static DEVICE_ATTR_RO(version);
+static DEVICE_ATTR_RO(processor_flags);
+
+static struct attribute *mc_default_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
+ NULL
+};
+
+static const struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
+}
+
+/**
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
+ */
+void microcode_bsp_resume(void)
+{
+ int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->mc)
+ microcode_ops->apply_microcode(cpu);
+ else
+ reload_early_microcode(cpu);
+}
+
+static void microcode_bsp_syscore_resume(void *data)
+{
+ microcode_bsp_resume();
+}
+
+static const struct syscore_ops mc_syscore_ops = {
+ .resume = microcode_bsp_syscore_resume,
+};
+
+static struct syscore mc_syscore = {
+ .ops = &mc_syscore_ops,
+};
+
+static int mc_cpu_online(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct device *dev = get_cpu_device(cpu);
+
+ memset(uci, 0, sizeof(*uci));
+
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+ &dev_attr_reload.attr,
+#endif
+ NULL
+};
+
+static const struct attribute_group cpu_root_microcode_group = {
+ .name = "microcode",
+ .attrs = cpu_root_microcode_attrs,
+};
+
+static int __init microcode_init(void)
+{
+ struct device *dev_root;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int error;
+
+ if (microcode_loader_disabled())
+ return -EINVAL;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+ else
+ pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
+ return -ENODEV;
+
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
+
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
+
+ microcode_fdev = faux_device_create("microcode", NULL, NULL);
+ if (!microcode_fdev)
+ return -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group);
+ put_device(dev_root);
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_pdev;
+ }
+ }
+
+ register_syscore(&mc_syscore);
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
+
+ return 0;
+
+ out_pdev:
+ faux_device_destroy(microcode_fdev);
+ return error;
+}
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
new file mode 100644
index 000000000000..2d48e6593540
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
@@ -0,0 +1,160 @@
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 },
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
new file mode 100644
index 000000000000..8744f3adc2a0
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+
+#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+
+/* Defines for the microcode staging mailbox interface */
+#define MBOX_REG_NUM 4
+#define MBOX_REG_SIZE sizeof(u32)
+
+#define MBOX_CONTROL_OFFSET 0x0
+#define MBOX_STATUS_OFFSET 0x4
+#define MBOX_WRDATA_OFFSET 0x8
+#define MBOX_RDDATA_OFFSET 0xc
+
+#define MASK_MBOX_CTRL_ABORT BIT(0)
+#define MASK_MBOX_CTRL_GO BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR BIT(2)
+#define MASK_MBOX_STATUS_READY BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS BIT(0)
+#define MASK_MBOX_RESP_PROGRESS BIT(1)
+#define MASK_MBOX_RESP_ERROR BIT(2)
+
+#define MBOX_CMD_LOAD 0x3
+#define MBOX_OBJ_STAGING 0xb
+#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \
+ (MBOX_OBJ_STAGING << 16) | \
+ ((u64)((size) / sizeof(u32)) << 32))
+
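+/*
+ * Worked example (illustrative): a 4096-byte chunk plus the two 8-byte
+ * headers gives size = 4112 bytes = 1028 dwords, so MBOX_HEADER() packs
+ * 0x8086 (Intel vendor ID) into bits 15:0, 0xb (the staging object type)
+ * into bits 31:16 and 0x404 (the dword count) into bits 63:32.
+ */
+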
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE sizeof(u64)
+
+#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC)
+
+/* Current microcode patch used in early patching on the APs. */
+static struct microcode_intel *ucode_patch_va __read_mostly;
+static struct microcode_intel *ucode_patch_late __read_mostly;
+
+/* last level cache size per core */
+static unsigned int llc_size_per_core __ro_after_init;
+
+/* The microcode format is extended from Prescott processors onwards */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[];
+};
+
+/**
+ * struct staging_state - Track the current staging process state
+ *
+ * @mmio_base: MMIO base address for staging
+ * @ucode_len: Total size of the microcode image
+ * @chunk_size: Size of each data piece
+ * @bytes_sent: Total bytes transmitted so far
+ * @offset: Current offset in the microcode image
+ */
+struct staging_state {
+ void __iomem *mmio_base;
+ unsigned int ucode_len;
+ unsigned int chunk_size;
+ unsigned int bytes_sent;
+ unsigned int offset;
+};
+
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+
+static inline unsigned int get_totalsize(struct microcode_header_intel *hdr)
+{
+ return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE;
+}
+
+static inline unsigned int exttable_size(struct extended_sigtable *et)
+{
+ return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
+}
+
+void intel_collect_cpu_info(struct cpu_signature *sig)
+{
+ sig->sig = cpuid_eax(1);
+ sig->pf = 0;
+ sig->rev = intel_get_microcode_revision();
+
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
+ unsigned int val[2];
+
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ sig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+}
+EXPORT_SYMBOL_GPL(intel_collect_cpu_info);
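+
+/*
+ * Example (illustrative): CPUID(1).EAX = 0x000906ea decodes to family 6,
+ * model 0x9e, stepping 0xa; the pf value is one of eight platform flag
+ * bits selected by bits 52:50 of MSR_IA32_PLATFORM_ID.
+ */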
+
+static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2,
+ unsigned int pf2)
+{
+ if (s1->sig != sig2)
+ return false;
+
+ /* Processor flags are either both 0 or they intersect. */
+ return ((!s1->pf && !pf2) || (s1->pf & pf2));
+}
+
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_signature *ext_sig;
+ struct extended_sigtable *ext_hdr;
+ int i;
+
+ if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf))
+ return true;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return false;
+
+ ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf))
+ return true;
+ ext_sig++;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = intel_microcode_get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
+
+static void update_ucode_pointer(struct microcode_intel *mc)
+{
+ kvfree(ucode_patch_va);
+
+ /*
+ * Save the virtual address for early loading and for eventual free
+ * on late loading.
+ */
+ ucode_patch_va = mc;
+}
+
+static void save_microcode_patch(struct microcode_intel *patch)
+{
+ unsigned int size = get_totalsize(&patch->hdr);
+ struct microcode_intel *mc;
+
+ mc = kvmemdup(patch, size, GFP_KERNEL);
+ if (mc)
+ update_ucode_pointer(mc);
+ else
+ pr_err("Unable to allocate microcode memory size: %u\n", size);
+}
+
+/* Scan blob for microcode matching the boot CPU's family, model and stepping */
+static __init struct microcode_intel *scan_microcode(void *data, size_t size,
+ struct ucode_cpu_info *uci,
+ bool save)
+{
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *patch = NULL;
+ u32 cur_rev = uci->cpu_sig.rev;
+ unsigned int mc_size;
+
+ for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) {
+ mc_header = (struct microcode_header_intel *)data;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > size ||
+ intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
+ break;
+
+ if (!intel_find_matching_signature(data, &uci->cpu_sig))
+ continue;
+
+ /*
+ * For saving the early microcode, find the matching revision which
+ * was loaded on the BSP.
+ *
+ * On the BSP during early boot, find a newer revision than
+ * actually loaded in the CPU.
+ */
+ if (save) {
+ if (cur_rev != mc_header->rev)
+ continue;
+ } else if (cur_rev >= mc_header->rev) {
+ continue;
+ }
+
+ patch = data;
+ cur_rev = mc_header->rev;
+ }
+
+ return size ? NULL : patch;
+}
+
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+ u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+ /* Acknowledge read completion to the staging hardware */
+ writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+ return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+ writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+ u32 high, low;
+
+ low = read_mbox_dword(mmio_base);
+ high = read_mbox_dword(mmio_base);
+
+ return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+ write_mbox_dword(mmio_base, value);
+ write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes)
+{
+ int i;
+
+ /*
+ * The MMIO space is mapped as Uncached (UC). Each write arrives
+ * at the device as an individual transaction in program order.
+ * The device can then reassemble the sequence accordingly.
+ */
+ for (i = 0; i < chunk_bytes / sizeof(u32); i++)
+ write_mbox_dword(mmio_base, chunk[i]);
+}
+
+/*
+ * Prepare for a new microcode transfer: reset hardware and record the
+ * image size.
+ */
+static void init_stage(struct staging_state *ss)
+{
+ ss->ucode_len = get_totalsize(&ucode_patch_late->hdr);
+
+ /*
+ * Abort any ongoing process, effectively resetting the device.
+ * Unlike regular mailbox data processing requests, this
+ * operation does not require a status check.
+ */
+ writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
+}
+
+/*
+ * Update the chunk size and decide whether another chunk can be sent.
+ * This accounts for remaining data and retry limits.
+ */
+static bool can_send_next_chunk(struct staging_state *ss, int *err)
+{
+ /* One page, or the remaining bytes if this is the final chunk */
+ ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset);
+
+ /*
+ * Each microcode image is divided into chunks, each at most
+ * one page size. A 10-chunk image would typically require 10
+ * transactions.
+ *
+ * However, the hardware managing the mailbox has limited
+ * resources and may not cache the entire image, potentially
+ * requesting the same chunk multiple times.
+ *
+ * To tolerate this behavior, allow up to twice the expected
+ * number of transactions (i.e., a 10-chunk image can take up to
+ * 20 attempts).
+ *
+ * If the number of attempts exceeds this limit, treat it as
+ * exceeding the maximum allowed transfer size.
+ */
+ if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
+ *err = -EMSGSIZE;
+ return false;
+ }
+
+ *err = 0;
+ return true;
+}
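+
+/*
+ * Illustrative budget: a 64 KiB image is sent as sixteen 4 KiB chunks;
+ * the check above allows up to 128 KiB in total, i.e. roughly twice that
+ * many transactions, before giving up with -EMSGSIZE.
+ */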
+
+/*
+ * The hardware indicates completion by returning a sentinel end offset.
+ */
+static inline bool is_end_offset(u32 offset)
+{
+ return offset == UINT_MAX;
+}
+
+/*
+ * Determine whether staging is complete: either the hardware signaled
+ * the end offset, or no more transactions are permitted (retry limit
+ * reached).
+ */
+static inline bool staging_is_complete(struct staging_state *ss, int *err)
+{
+ return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err);
+}
+
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return 0 on success, or an error code on failure.
+ */
+static int wait_for_transaction(struct staging_state *ss)
+{
+ u32 timeout, status;
+
+ /* Allow time for hardware to complete the operation: */
+ for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+ msleep(1);
+
+ status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+ /* Break out early if the hardware is ready: */
+ if (status & MASK_MBOX_STATUS_READY)
+ break;
+ }
+
+ /* Check for explicit error response */
+ if (status & MASK_MBOX_STATUS_ERROR)
+ return -EIO;
+
+ /*
+ * Hardware has neither responded to the action nor signaled any
+ * error. Treat this as a timeout.
+ */
+ if (!(status & MASK_MBOX_STATUS_READY))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+/*
+ * Transmit a chunk of the microcode image to the hardware.
+ * Return 0 on success, or an error code on failure.
+ */
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr)
+{
+ u32 *src_chunk = ucode_ptr + ss->offset;
+ u16 mbox_size;
+
+ /*
+ * Write a 'request' mailbox object in this order:
+ * 1. Mailbox header includes total size
+ * 2. Command header specifies the load operation
+ * 3. Data section contains a microcode chunk
+ *
+ * Thus, the mailbox size is two headers plus the chunk size.
+ */
+ mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+ write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+ write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+ write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size);
+ ss->bytes_sent += ss->chunk_size;
+
+ /* Notify the hardware that the mailbox is ready for processing. */
+ writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+
+ return wait_for_transaction(ss);
+}
+
+/*
+ * Retrieve the next offset from the hardware response.
+ * Return 0 on success, or an error code on failure.
+ */
+static int fetch_next_offset(struct staging_state *ss)
+{
+ const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE);
+ u32 offset, status;
+ u64 header;
+
+ /*
+ * The 'response' mailbox returns three fields, in order:
+ * 1. Header
+ * 2. Next offset in the microcode image
+ * 3. Status flags
+ */
+ header = read_mbox_header(ss->mmio_base);
+ offset = read_mbox_dword(ss->mmio_base);
+ status = read_mbox_dword(ss->mmio_base);
+
+ /* All valid responses must start with the expected header. */
+ if (header != expected_header) {
+ pr_err_once("staging: invalid response header (0x%llx)\n", header);
+ return -EBADR;
+ }
+
+ /*
+ * Verify the offset: If not at the end marker, it must not
+ * exceed the microcode image length.
+ */
+ if (!is_end_offset(offset) && offset > ss->ucode_len) {
+ pr_err_once("staging: invalid offset (%u) past the image end (%u)\n",
+ offset, ss->ucode_len);
+ return -EINVAL;
+ }
+
+ /* Hardware may report errors explicitly in the status field */
+ if (status & MASK_MBOX_RESP_ERROR)
+ return -EPROTO;
+
+ ss->offset = offset;
+ return 0;
+}
+
+/*
+ * Handle the staging process using the mailbox MMIO interface. The
+ * microcode image is transferred in chunks until completion.
+ * Return 0 on success or an error code on failure.
+ */
+static int do_stage(u64 mmio_pa)
+{
+ struct staging_state ss = {};
+ int err;
+
+ ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
+ if (WARN_ON_ONCE(!ss.mmio_base))
+ return -EADDRNOTAVAIL;
+
+ init_stage(&ss);
+
+ /* Perform the staging process while within the retry limit */
+ while (!staging_is_complete(&ss, &err)) {
+ /* Send a chunk of microcode each time: */
+ err = send_data_chunk(&ss, ucode_patch_late);
+ if (err)
+ break;
+ /*
+ * Then, ask the hardware which piece of the image it
+ * needs next. The same piece may be sent more than once.
+ */
+ err = fetch_next_offset(&ss);
+ if (err)
+ break;
+ }
+
+ iounmap(ss.mmio_base);
+
+ return err;
+}
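+
+/*
+ * Note (illustrative): the mapped window is only MBOX_REG_NUM *
+ * MBOX_REG_SIZE = 16 bytes -- the four dword registers defined at the
+ * top of this file -- so the entire image is streamed through a tiny
+ * MMIO mailbox rather than a shared memory buffer.
+ */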
+
+static void stage_microcode(void)
+{
+ unsigned int pkg_id = UINT_MAX;
+ int cpu, err;
+ u64 mmio_pa;
+
+ if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) {
+ pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n",
+ get_totalsize(&ucode_patch_late->hdr));
+ return;
+ }
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * The MMIO address is unique per package, and all the SMT
+ * primary threads are online here. Find each MMIO space by
+ * their package IDs to avoid duplicate staging.
+ */
+ for_each_cpu(cpu, cpu_primary_thread_mask) {
+ if (topology_logical_package_id(cpu) == pkg_id)
+ continue;
+
+ pkg_id = topology_logical_package_id(cpu);
+
+ err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ err = do_stage(mmio_pa);
+ if (err) {
+ pr_err("Error: staging failed (%d) for CPU%d at package %u.\n",
+ err, cpu, pkg_id);
+ return;
+ }
+ }
+
+ pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev);
+}
+
+static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
+ struct microcode_intel *mc,
+ u32 *cur_rev)
+{
+ u32 rev;
+
+ if (!mc)
+ return UCODE_NFOUND;
+
+ /*
+ * Save us the MSR write below - which is a particularly expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ *cur_rev = intel_get_microcode_revision();
+ if (*cur_rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = *cur_rev;
+ return UCODE_OK;
+ }
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
+ return UCODE_ERROR;
+
+ uci->cpu_sig.rev = rev;
+ return UCODE_UPDATED;
+}
+
+static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc = uci->mc;
+ u32 cur_rev;
+
+ return __apply_microcode(uci, mc, &cur_rev);
+}
+
+static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
+{
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
+ char name[30];
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
+
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+ return false;
+}
+
+static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save)
+{
+ struct cpio_data cp;
+
+ intel_collect_cpu_info(&uci->cpu_sig);
+
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
+
+ if (!(cp.data && cp.size))
+ return NULL;
+
+ return scan_microcode(cp.data, cp.size, uci, save);
+}
+
+/*
+ * Invoked from an early init call to save the microcode blob which was
+ * selected during early boot when mm was not usable. The microcode must be
+ * saved because initrd is going away. It's an early init call so the APs
+ * just can use the pointer and do not have to scan initrd/builtin firmware
+ * again.
+ */
+static int __init save_builtin_microcode(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
+ return 0;
+
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ uci.mc = get_microcode_blob(&uci, true);
+ if (uci.mc)
+ save_microcode_patch(uci.mc);
+ return 0;
+}
+early_initcall(save_builtin_microcode);
+
+/* Load microcode on BSP from initrd or builtin blobs */
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
+{
+ struct ucode_cpu_info uci;
+
+ uci.mc = get_microcode_blob(&uci, false);
+ ed->old_rev = uci.cpu_sig.rev;
+
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
+ }
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct ucode_cpu_info uci;
+
+ uci.mc = ucode_patch_va;
+ if (uci.mc)
+ apply_microcode_early(&uci);
+}
+
+/* Reload microcode on resume */
+void reload_ucode_intel(void)
+{
+ struct ucode_cpu_info uci = { .mc = ucode_patch_va, };
+
+ if (uci.mc)
+ apply_microcode_early(&uci);
+}
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ intel_collect_cpu_info(csig);
+ return 0;
+}
+
+static enum ucode_state apply_microcode_late(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct microcode_intel *mc = ucode_patch_late;
+ enum ucode_state ret;
+ u32 cur_rev;
+
+ if (WARN_ON_ONCE(smp_processor_id() != cpu))
+ return UCODE_ERROR;
+
+ ret = __apply_microcode(uci, mc, &cur_rev);
+ if (ret != UCODE_UPDATED && ret != UCODE_OK)
+ return ret;
+
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ return ret;
+}
+
+static bool ucode_validate_minrev(struct microcode_header_intel *mc_header)
+{
+ int cur_rev = boot_cpu_data.microcode;
+
+ /*
+ * When late-loading, ensure the header declares a minimum revision
+ * required to perform a late-load. The previously reserved field
+ * is 0 in older microcode blobs.
+ */
+ if (!mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n");
+ return false;
+ }
+
+ /*
+ * Check whether the current revision is greater than or equal to the
+ * minimum revision specified in the header.
+ */
+ if (cur_rev < mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev);
+ pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver);
+ return false;
+ }
+ return true;
+}
+
+static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ bool is_safe, new_is_safe = false;
+ int cur_rev = uci->cpu_sig.rev;
+ unsigned int curr_mc_size = 0;
+ u8 *new_mc = NULL, *mc = NULL;
+
+ while (iov_iter_count(iter)) {
+ struct microcode_header_intel mc_header;
+ unsigned int mc_size, data_size;
+ u8 *data;
+
+ if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
+ pr_err("error! Truncated or inaccessible header in microcode data file\n");
+ goto fail;
+ }
+
+ mc_size = get_totalsize(&mc_header);
+ if (mc_size < sizeof(mc_header)) {
+ pr_err("error! Bad data in microcode data file (totalsize too small)\n");
+ goto fail;
+ }
+ data_size = mc_size - sizeof(mc_header);
+ if (data_size > iov_iter_count(iter)) {
+ pr_err("error! Bad data in microcode data file (truncated file?)\n");
+ goto fail;
+ }
+
+ /* For performance reasons, reuse mc area when possible */
+ if (!mc || mc_size > curr_mc_size) {
+ kvfree(mc);
+ mc = kvmalloc(mc_size, GFP_KERNEL);
+ if (!mc)
+ goto fail;
+ curr_mc_size = mc_size;
+ }
+
+ memcpy(mc, &mc_header, sizeof(mc_header));
+ data = mc + sizeof(mc_header);
+ if (!copy_from_iter_full(data, data_size, iter) ||
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
+ goto fail;
+
+ if (cur_rev >= mc_header.rev)
+ continue;
+
+ if (!intel_find_matching_signature(mc, &uci->cpu_sig))
+ continue;
+
+ is_safe = ucode_validate_minrev(&mc_header);
+ if (force_minrev && !is_safe)
+ continue;
+
+ kvfree(new_mc);
+ cur_rev = mc_header.rev;
+ new_mc = mc;
+ new_is_safe = is_safe;
+ mc = NULL;
+ }
+
+ if (iov_iter_count(iter))
+ goto fail;
+
+ kvfree(mc);
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ ucode_patch_late = (struct microcode_intel *)new_mc;
+ return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW;
+
+fail:
+ kvfree(mc);
+ kvfree(new_mc);
+ return UCODE_ERROR;
+}
+
+static bool is_blacklisted(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Late loading on model 79 with microcode revision less than 0x0b000021
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDX90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
+ */
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
+ c->x86_stepping == 0x01 &&
+ llc_size_per_core > 2621440 &&
+ c->microcode < 0x0b000021) {
+ pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+ return true;
+ }
+
+ return false;
+}
+
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ struct iov_iter iter;
+ enum ucode_state ret;
+ struct kvec kvec;
+ char name[30];
+
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_stepping);
+
+ if (request_firmware_direct(&firmware, name, device)) {
+ pr_debug("data file %s load failed\n", name);
+ return UCODE_NFOUND;
+ }
+
+ kvec.iov_base = (void *)firmware->data;
+ kvec.iov_len = firmware->size;
+ iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
+ ret = parse_microcode_blobs(cpu, &iter);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static void finalize_late_load(int result)
+{
+ if (!result)
+ update_ucode_pointer(ucode_patch_late);
+ else
+ kvfree(ucode_patch_late);
+ ucode_patch_late = NULL;
+}
+
+static struct microcode_ops microcode_intel_ops = {
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_late,
+ .finalize_late_load = finalize_late_load,
+ .stage_microcode = stage_microcode,
+ .use_nmi = IS_ENABLED(CONFIG_X86_64),
+};
+
+static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024ULL;
+
+ do_div(llc_size, topology_num_cores_per_package());
+ llc_size_per_core = (unsigned int)llc_size;
+}
+
+static __init bool staging_available(void)
+{
+ u64 val;
+
+ val = x86_read_arch_cap_msr();
+ if (!(val & ARCH_CAP_MCU_ENUM))
+ return false;
+
+ rdmsrq(MSR_IA32_MCU_ENUMERATION, val);
+ return !!(val & MCU_STAGING);
+}
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ if (staging_available()) {
+ microcode_intel_ops.use_staging = true;
+ pr_info("Enabled staging feature.\n");
+ }
+
+ calc_llc_size_per_core(c);
+
+ return &microcode_intel_ops;
+}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
new file mode 100644
index 000000000000..a10b547eda1e
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_MICROCODE_INTERNAL_H
+#define _X86_MICROCODE_INTERNAL_H
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/microcode.h>
+
+struct device;
+
+enum ucode_state {
+ UCODE_OK = 0,
+ UCODE_NEW,
+ UCODE_NEW_SAFE,
+ UCODE_UPDATED,
+ UCODE_NFOUND,
+ UCODE_ERROR,
+ UCODE_TIMEOUT,
+ UCODE_OFFLINE,
+};
+
+struct microcode_ops {
+ enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
+ void (*microcode_fini_cpu)(int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that the callbacks
+ * below run on a target CPU when they are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ enum ucode_state (*apply_microcode)(int cpu);
+ void (*stage_microcode)(void);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ void (*finalize_late_load)(int result);
+ unsigned int nmi_safe : 1,
+ use_nmi : 1,
+ use_staging : 1;
+};
+
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
+};
+
+extern struct early_load_data early_data;
+extern struct ucode_cpu_info ucode_cpu_info[];
+extern u32 microcode_rev[NR_CPUS];
+extern u32 base_rev;
+
+struct cpio_data find_microcode_in_initrd(const char *path);
+
+#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c))))
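+
+/*
+ * Example (illustrative): CPUID leaf 0 returns the vendor string in
+ * EBX, EDX, ECX; "GenuineIntel" appears as EBX = 0x756e6547 ("Genu"),
+ * EDX = 0x49656e69 ("ineI") and ECX = 0x6c65746e ("ntel"), matching
+ * the QCHAR() constants above.
+ */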
+
+/*
+ * In the early microcode loading phase on the BSP, boot_cpu_data is not set
+ * up yet, so x86_cpuid_vendor() is used to get the vendor ID.
+ *
+ * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To
+ * simplify the code, x86_cpuid_vendor() is used for the APs as well.
+ *
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_cpuid_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int x86_cpuid_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return x86_family(eax);
+}
+
+extern bool force_minrev;
+
+#ifdef CONFIG_CPU_SUP_AMD
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
+void load_ucode_amd_ap(unsigned int family);
+void reload_ucode_amd(unsigned int cpu);
+struct microcode_ops *init_amd_microcode(void);
+void exit_amd_microcode(void);
+#else /* CONFIG_CPU_SUP_AMD */
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
+static inline void load_ucode_amd_ap(unsigned int family) { }
+static inline void reload_ucode_amd(unsigned int cpu) { }
+static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
+static inline void exit_amd_microcode(void) { }
+#endif /* !CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+void load_ucode_intel_bsp(struct early_load_data *ed);
+void load_ucode_intel_ap(void);
+void reload_ucode_intel(void);
+struct microcode_ops *init_intel_microcode(void);
+#else /* CONFIG_CPU_SUP_INTEL */
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
+static inline void load_ucode_intel_ap(void) { }
+static inline void reload_ucode_intel(void) { }
+static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
+#endif /* !CONFIG_CPU_SUP_INTEL */
+
+#define ucode_dbg(fmt, ...) \
+({ \
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+})
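+
+/* Usage (illustrative): ucode_dbg("CPU%d: new rev: 0x%x\n", cpu, rev); */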
+
+#endif /* _X86_MICROCODE_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
deleted file mode 100644
index dfea390e1608..000000000000
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/perl
-#
-# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
-#
-
-($in, $out) = @ARGV;
-
-open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
-open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
-
-print OUT "#include <asm/cpufeature.h>\n\n";
-print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
-
-while (defined($line = <IN>)) {
- if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
- $macro = $1;
- $feature = $2;
- $tail = $3;
- if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
- $feature = $1;
- }
-
- if ($feature ne '') {
- printf OUT "\t%-32s = \"%s\",\n",
- "[$macro]", "\L$feature";
- }
- }
-}
-print OUT "};\n";
-
-close(IN);
-close(OUT);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
new file mode 100644
index 000000000000..68f537347466
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
+#
+
+set -e
+
+OUT=$1
+
+dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
+ IN=$5
+
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
+
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
+ while read i
+ do
+ # Name is everything up to the first whitespace
+ NAME="$(echo "$i" | sed 's/ .*//')"
+
+ # If the /* comment */ starts with a quote string, grab that.
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
+ [ ! "$VALUE" ] && continue
+
+ # Name is uppercase, VALUE is all lowercase
+ VALUE="$(echo "$VALUE" | tr A-Z a-z)"
+
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
+ done
+ echo "};"
+}
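+
+# Illustrative: given a header line such as
+#   #define X86_FEATURE_FPU (0*32+ 0) /* "fpu" Onboard FPU */
+# dump_array emits the (tab-aligned) entry
+#   [X86_FEATURE_FPU] = "fpu",
+# and silently skips entries whose comment carries no quoted name.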
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURES_H"
+ echo "#include <asm/cpufeatures.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
+) > $OUT
+
+trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..579fb2c64cfd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -1,55 +1,771 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* HyperV Detection code.
*
* Copyright (C) 2010, Novell, Inc.
* Author : K. Y. Srinivasan <ksrinivasan@novell.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
*/
#include <linux/types.h>
-#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/efi.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kexec.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
+#include <asm/desc.h>
+#include <asm/idtentry.h>
+#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/reboot.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <clocksource/hyperv_timer.h>
+#include <asm/numa.h>
+#include <asm/svm.h>
+/* Is Linux running on a nested Microsoft Hypervisor */
+bool hv_nested;
struct ms_hyperv_info ms_hyperv;
-static bool __init ms_hyperv_platform(void)
+#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with the paravisor, controls proxying the synthetic interrupts
+ * from the host
+ */
+static bool hv_para_sint_proxy;
+
+static inline unsigned int hv_get_nested_msr(unsigned int reg)
+{
+ if (hv_is_sint_msr(reg))
+ return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
+
+ switch (reg) {
+ case HV_X64_MSR_SIMP:
+ return HV_X64_MSR_NESTED_SIMP;
+ case HV_X64_MSR_SIEFP:
+ return HV_X64_MSR_NESTED_SIEFP;
+ case HV_X64_MSR_SVERSION:
+ return HV_X64_MSR_NESTED_SVERSION;
+ case HV_X64_MSR_SCONTROL:
+ return HV_X64_MSR_NESTED_SCONTROL;
+ case HV_X64_MSR_EOM:
+ return HV_X64_MSR_NESTED_EOM;
+ default:
+ return reg;
+ }
+}
+
+u64 hv_get_non_nested_msr(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
+ else
+ rdmsrq(reg, value);
+ return value;
+}
+EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
+
+void hv_set_non_nested_msr(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+ /* The hypervisor will get the intercept. */
+ hv_ivm_msr_write(reg, value);
+
+ /* Using native_wrmsrq() so the following goes to the paravisor. */
+ if (hv_is_sint_msr(reg)) {
+ union hv_synic_sint sint = { .as_uint64 = value };
+
+ sint.proxy = hv_para_sint_proxy;
+ native_wrmsrq(reg, sint.as_uint64);
+ }
+ } else {
+ native_wrmsrq(reg, value);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
+
+/*
+ * Enable or disable proxying synthetic interrupts
+ * to the paravisor.
+ */
+void hv_para_set_sint_proxy(bool enable)
+{
+ hv_para_sint_proxy = enable;
+}
+
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return ~0ULL;
+ return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value with the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return;
+ native_write_msr(reg, val);
+}
+
+u64 hv_get_msr(unsigned int reg)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ return hv_get_non_nested_msr(reg);
+}
+EXPORT_SYMBOL_GPL(hv_get_msr);
+
+void hv_set_msr(unsigned int reg, u64 value)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ hv_set_non_nested_msr(reg, value);
+}
+EXPORT_SYMBOL_GPL(hv_set_msr);
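
The hv_get_msr()/hv_set_msr() wrappers keep callers oblivious to nesting: on a nested (L1) hypervisor the SynIC accesses are transparently redirected to the NESTED_* MSR aliases. A minimal usage sketch, illustrative only:

    u64 sint;

    /* On a nested hypervisor this actually reads HV_X64_MSR_NESTED_SINT0. */
    sint = hv_get_msr(HV_X64_MSR_SINT0);
    hv_set_msr(HV_X64_MSR_SINT0, sint);
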
+
+static void (*mshv_handler)(void);
+static void (*vmbus_handler)(void);
+static void (*hv_stimer0_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
+ if (vmbus_handler)
+ vmbus_handler();
+
+ if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
+void hv_setup_vmbus_handler(void (*handler)(void))
+{
+ vmbus_handler = handler;
+}
+
+void hv_remove_vmbus_handler(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+
+/*
+ * Routines to do per-architecture handling of stimer0
+ * interrupts when in Direct Mode
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(hyperv_stimer0_count);
+ if (hv_stimer0_handler)
+ hv_stimer0_handler();
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+/* For x86/x64, override weak placeholders in hyperv_timer.c */
+void hv_setup_stimer0_handler(void (*handler)(void))
+{
+ hv_stimer0_handler = handler;
+}
+
+void hv_remove_stimer0_handler(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ hv_stimer0_handler = NULL;
+}
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+
+ /*
+ * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
+ * corrupts the old VP Assist Pages and can crash the kexec kernel.
+ */
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
+
+ /* The function calls stop_other_cpus(). */
+ native_machine_shutdown();
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ if (kexec_in_progress)
+ hyperv_cleanup();
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_DUMP
+static void hv_guest_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+
+ /* The function calls crash_smp_send_stop(). */
+ native_machine_crash_shutdown(regs);
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ hyperv_cleanup();
+}
+#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This makes sure that sched_clock, when using the hv tsc
+ * page based clocksource, proceeds from where it left off at suspend and
+ * shows correct timestamps for kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
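
A worked example of the adjustment above, with illustrative numbers (the Hyper-V reference counter ticks in 100 ns units):

    /*
     * Illustrative values only:
     *   at suspend:   hv_read_reference_counter() == 600,000,000
     *   after resume: hv_read_reference_counter() ==   1,000,000 (reset)
     *
     * restore_hv_clock_tsc_state() then calls
     * hv_adj_sched_clock_offset(600,000,000 - 1,000,000), shifting the
     * clocksource offset so sched_clock() resumes from roughly the
     * pre-suspend value instead of jumping backwards.
     */
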
+
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
+EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall);
+#define hypercall_update(hc) static_call_update(hv_hypercall, hc)
+#endif
+#endif /* CONFIG_HYPERV */
+
+#ifndef hypercall_update
+#define hypercall_update(hc) (void)hc
+#endif
+
+static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return false;
+ return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
- return eax >= HYPERV_CPUID_MIN &&
- eax <= HYPERV_CPUID_MAX &&
- !memcmp("Microsoft Hv", hyp_signature, 12);
+ if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX ||
+ memcmp("Microsoft Hv", hyp_signature, 12))
+ return 0;
+
+ /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */
+ eax = cpuid_eax(HYPERV_CPUID_FEATURES);
+ if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) {
+ pr_warn("x86/hyperv: HYPERCALL MSR not available.\n");
+ return 0;
+ }
+ if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) {
+ pr_warn("x86/hyperv: VP_INDEX MSR not available.\n");
+ return 0;
+ }
+
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+}
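
For clarity, the 12-byte signature check above compares against the three vendor registers laid out contiguously in memory. A hedged sketch of the layout:

    /*
     * CPUID 0x40000000 (HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) packs the
     * vendor string little-endian across EBX/ECX/EDX:
     *   hyp_signature[0] (EBX) = 'M' 'i' 'c' 'r'
     *   hyp_signature[1] (ECX) = 'o' 's' 'o' 'f'
     *   hyp_signature[2] (EDX) = 't' ' ' 'H' 'v'
     * so memcmp("Microsoft Hv", hyp_signature, 12) matches byte for byte.
     */
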
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016, Debug-VM sends NMIs to all CPUs, which makes it
+ * difficult to process CHANNELMSG_UNLOAD in case of a crash. Handle
+ * the unknown NMI on the first CPU that receives it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+#endif
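
Condensed, the claim pattern above works as follows (a sketch of the intended behavior, not new code):

    /*
     * nmi_cpu starts at -1. The first CPU's try_cmpxchg(-1 -> this_cpu)
     * succeeds, so that CPU returns NMI_DONE and the unknown-NMI path
     * (unknown_nmi_panic) runs exactly once, on that CPU. Every other
     * CPU fails the exchange and returns NMI_HANDLED, swallowing its
     * copy of the broadcast NMI.
     */
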
+
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
+#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV)
+static void __init hv_smp_prepare_boot_cpu(void)
+{
+ native_smp_prepare_boot_cpu();
+#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ hv_init_spinlocks();
+#endif
+}
+
+static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ int ret;
+#endif
+
+ native_smp_prepare_cpus(max_cpus);
+
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
+#ifdef CONFIG_X86_64
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ BUG_ON(ret);
+ }
+
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
+ BUG_ON(ret);
+ }
+#endif
+}
+#endif
+
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and eventually the serial console interrupts work properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
+int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
+{
+ unsigned int hv_max_functions;
+
+ hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+ if (hv_max_functions < HYPERV_CPUID_VERSION) {
+ pr_err("%s: Could not detect Hyper-V version\n", __func__);
+ return -ENODEV;
+ }
+
+ cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx);
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
+ int hv_max_functions_eax, eax;
+
+#ifdef CONFIG_PARAVIRT
+ pv_info.name = "Hyper-V";
+#endif
+
/*
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints);
+ hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
+ ms_hyperv.misc_features);
+
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
+
+ pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
+ ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
+
+ hv_identify_partition_type();
+
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+
+ if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
+ hv_nested = true;
+ pr_info("Hyper-V: running on a nested hypervisor\n");
+ }
+
+ /*
+ * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
+ * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
+ * root) will not get them because the nested (L1) hypervisor filters them out.
+ * These are handled through intercept processing by the Windows Hyper-V stack
+ * or the paravisor.
+ */
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ ms_hyperv.confidential_vmbus_available =
+ eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+ ms_hyperv.msi_ext_dest_id =
+ eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ }
+
+ if (ms_hyperv.priv_high & HV_ISOLATION) {
+ ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+
+ if (ms_hyperv.shared_gpa_boundary_active)
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+ ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ static_branch_enable(&isolation_type_snp);
+ if (!ms_hyperv.paravisor_present)
+ hypercall_update(hv_snp_hypercall);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ hypercall_update(hv_tdx_hypercall);
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
+ /* HV_MSR_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
+ }
+ }
+
+ if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ pr_info("Hyper-V: Nested features: 0x%x\n",
+ ms_hyperv.nested_features);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_period = hv_lapic_frequency;
+ pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_period);
+ }
+
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (hv_root_partition())
+ machine_ops.power_off = hv_machine_power_off;
+#if defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+#endif
+#if defined(CONFIG_CRASH_DUMP)
+ if (!hv_root_partition())
+ machine_ops.crash_shutdown = hv_guest_crash_shutdown;
+#endif
+#endif
+ /*
+ * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. The
+ * root partition doesn't need to write to the synthetic MSR to enable
+ * the invariant TSC feature: it sees what the hardware provides.
+ */
+ if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+ /*
+ * Writing to synthetic MSR 0x40000118 updates/changes the
+ * guest visible CPUIDs. Setting bit 0 of this MSR enables
+ * guests to report invariant TSC feature through CPUID
+ * instruction, CPUID 0x80000007/EDX, bit 8. See code in
+ * early_init_intel() where this bit is examined. The
+ * setting of this MSR bit should happen before init_intel()
+ * is called.
+ */
+ wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ }
+
+ /*
+ * Generation 2 instances don't support reading the NMI status from
+ * 0x61 port.
+ */
+ if (efi_enabled(EFI_BOOT))
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+ ms_hyperv.paravisor_present)
+ hv_vtom_init();
+ /*
+ * Setup the hook to get control post apic initialization.
+ */
+ x86_platform.apic_post_init = hyperv_init;
+ hyperv_setup_mmu_ops();
+
+ /* Install system interrupt handler for hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+
+ /* Install system interrupt handler for reenlightenment notifications */
+ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
+ sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+ }
+
+ /* Install system interrupt handler for stimer0 */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
+ sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
+ }
+
+# ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+ if (hv_root_partition() ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
+# endif
+
+ /*
+ * Hyper-V doesn't provide irq remapping for the IO-APIC. To enable
+ * x2apic, set the x2apic destination mode to physical mode when x2apic
+ * is available; the Hyper-V IOMMU driver makes sure CPUs assigned
+ * IO-APIC irqs have an 8-bit APIC id.
+ */
+# ifdef CONFIG_X86_X2APIC
+ if (x2apic_supported())
+ x2apic_phys = 1;
+# endif
+
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
+ hv_vtl_init_platform();
+#endif
+ /*
+ * TSC should be marked as unstable only after Hyper-V
+ * clocksource has been initialized. This ensures that the
+ * stability of the sched_clock is not altered.
+ *
+ * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. No
+ * need to check for it.
+ */
+ if (!hv_root_partition() &&
+ !(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ mark_tsc_unstable("running on Hyper-V");
+
+ hardlockup_detector_disable();
+}
+
+static bool __init ms_hyperv_x2apic_available(void)
+{
+ return x2apic_supported();
+}
+
+/*
+ * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping()
+ * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the
+ * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg().
+ *
+ * Note: for a VM on Hyper-V, the I/O-APIC is the only device which
+ * (logically) generates MSIs directly to the system APIC irq domain.
+ * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
+ * pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ */
+static bool __init ms_hyperv_msi_ext_dest_id(void)
+{
+ return ms_hyperv.msi_ext_dest_id;
+}
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
}
+#endif
-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
- .name = "Microsoft HyperV",
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
- .init_platform = ms_hyperv_init_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.x2apic_available = ms_hyperv_x2apic_available,
+ .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
+ .init.init_platform = ms_hyperv_init_platform,
+ .init.guest_late_init = ms_hyperv_late_init,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
};
-EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index ad9e5ed81181..aee4bc5ad496 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,4 @@
-obj-y := main.o if.o generic.o cleanup.o
-obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y := mtrr.o if.o generic.o cleanup.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 92ba9cd31c9a..ef3e8e42b782 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
@@ -108,17 +109,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static const struct mtrr_ops amd_mtrr_ops = {
- .vendor = X86_VENDOR_AMD,
+const struct mtrr_ops amd_mtrr_ops = {
+ .var_regs = 2,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init amd_init_mtrr(void)
-{
- set_mtrr_ops(&amd_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index 316fe3e60a97..6f6c3ae92943 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/mm.h>
@@ -44,15 +45,6 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
return -ENOSPC;
}
-/*
- * Report boot time MCR setups
- */
-void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
-{
- centaur_mcr[mcr].low = lo;
- centaur_mcr[mcr].high = hi;
-}
-
static void
centaur_get_mcr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
@@ -103,24 +95,18 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
*/
if (type != MTRR_TYPE_WRCOMB &&
(centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
- pr_warning("mtrr: only write-combining%s supported\n",
+ pr_warn("mtrr: only write-combining%s supported\n",
centaur_mcr_type ? " and uncacheable are" : " is");
return -EINVAL;
}
return 0;
}
-static const struct mtrr_ops centaur_mtrr_ops = {
- .vendor = X86_VENDOR_CENTAUR,
+const struct mtrr_ops centaur_mtrr_ops = {
+ .var_regs = 8,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init centaur_init_mtrr(void)
-{
- set_mtrr_ops(&centaur_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..763534d77f59 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: LGPL-2.0+
/*
* MTRR (Memory Type Range Register) cleanup
*
* Copyright (C) 2009 Yinghai Lu
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/smp.h>
@@ -28,7 +14,7 @@
#include <linux/range.h>
#include <asm/processor.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
@@ -56,10 +42,7 @@ static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-#define BIOS_BUG_MSG KERN_WARNING \
+#define BIOS_BUG_MSG \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
@@ -80,12 +63,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
base, base + size);
}
- if (debug_print) {
- printk(KERN_DEBUG "After WB checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+
+ Dprintk("After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
/* Take out UC ranges: */
for (i = 0; i < num_var_ranges; i++) {
@@ -98,9 +80,10 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
- (mtrr_state.enabled & 1)) {
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
- printk(BIOS_BUG_MSG, i);
+ pr_warn(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
continue;
size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -112,24 +95,22 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size);
- if (debug_print) {
- printk(KERN_DEBUG "After UC checking\n");
- for (i = 0; i < RANGE_NUM; i++) {
- if (!range[i].end)
- continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+ Dprintk("After UC checking\n");
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* sort the ranges */
nr_range = clean_sort_range(range, RANGE_NUM);
- if (debug_print) {
- printk(KERN_DEBUG "After sorting\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+
+ Dprintk("After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
return nr_range;
}
@@ -164,16 +145,9 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
static void __init
set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type, unsigned int address_bits)
+ unsigned char type)
{
u32 base_lo, base_hi, mask_lo, mask_hi;
u64 base, mask;
@@ -183,7 +157,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
return;
}
- mask = (1ULL << address_bits) - 1;
+ mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
mask &= ~((((u64)sizek) << 10) - 1);
base = ((u64)basek) << 10;
@@ -209,7 +183,7 @@ save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
range_state[reg].type = type;
}
-static void __init set_var_mtrr_all(unsigned int address_bits)
+static void __init set_var_mtrr_all(void)
{
unsigned long basek, sizek;
unsigned char type;
@@ -220,7 +194,7 @@ static void __init set_var_mtrr_all(unsigned int address_bits)
sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
type = range_state[reg].type;
- set_var_mtrr(reg, basek, sizek, type, address_bits);
+ set_var_mtrr(reg, basek, sizek, type);
}
}
@@ -258,16 +232,16 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
/* Compute the maximum size with which we can make a range: */
if (range_startk)
- max_align = ffs(range_startk) - 1;
+ max_align = __ffs(range_startk);
else
- max_align = 32;
+ max_align = BITS_PER_LONG - 1;
- align = fls(range_sizek) - 1;
+ align = __fls(range_sizek);
if (align > max_align)
align = max_align;
- sizek = 1 << align;
- if (debug_print) {
+ sizek = 1UL << align;
+ if (mtrr_debug) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
@@ -296,7 +270,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
unsigned long sizek)
{
unsigned long hole_basek, hole_sizek;
- unsigned long second_basek, second_sizek;
+ unsigned long second_sizek;
unsigned long range0_basek, range0_sizek;
unsigned long range_basek, range_sizek;
unsigned long chunk_sizek;
@@ -304,7 +278,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
hole_basek = 0;
hole_sizek = 0;
- second_basek = 0;
second_sizek = 0;
chunk_sizek = state->chunk_sizek;
gran_sizek = state->gran_sizek;
@@ -435,7 +408,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
state->range_sizek = sizek - second_sizek;
}
-/* Mininum size of mtrr block that can take hole: */
+/* Minimum size of mtrr block that can take hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
@@ -538,12 +511,12 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -592,9 +565,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct range range_new[RANGE_NUM];
+ /*
+ * range_new should really be an automatic variable, but
+ * putting 4096 bytes on the stack is frowned upon, to put it
+ * mildly. It is safe to make it a static __initdata variable,
+ * since mtrr_calc_range_state is only called during init and
+ * there's no way it will call itself recursively.
+ */
+ static struct range range_new[RANGE_NUM] __initdata;
unsigned long range_sums_new;
- static int nr_range_new;
+ int nr_range_new;
int num_reg;
/* Convert ranges to var ranges state: */
@@ -632,9 +612,9 @@ static void __init mtrr_print_out_one_result(int i)
unsigned long gran_base, chunk_base, lose_base;
char gran_factor, chunk_factor, lose_factor;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
result[i].bad ? "*BAD*" : " ",
@@ -674,7 +654,7 @@ static int __init mtrr_search_optimal_index(void)
return index_good;
}
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
unsigned long x_remove_base, x_remove_size;
unsigned long base, size, def, dummy;
@@ -683,7 +663,10 @@ int __init mtrr_cleanup(unsigned address_bits)
int index_good;
int i;
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ if (!mtrr_enabled())
+ return 0;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
@@ -705,7 +688,7 @@ int __init mtrr_cleanup(unsigned address_bits)
return 0;
/* Print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
+ Dprintk("original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
@@ -714,18 +697,18 @@ int __init mtrr_cleanup(unsigned address_bits)
if (mtrr_tom2)
x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
- nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size);
/*
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
+ nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
1ULL<<(20 - PAGE_SHIFT));
- /* Sort the ranges: */
- sort_range(range, nr_range);
+ /* add from var mtrr at last */
+ nr_range = x86_get_mtrr_mem_range(range, nr_range,
+ x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM covered: %ldM\n",
+ pr_info("total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
@@ -736,13 +719,12 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
if (!result[i].bad) {
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
- "will find optimal one\n");
+ pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
}
i = 0;
@@ -758,9 +740,9 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_calc_range_state(chunk_size, gran_size,
x_remove_base, x_remove_size, i);
- if (debug_print) {
+ if (mtrr_debug) {
mtrr_print_out_one_result(i);
- printk(KERN_INFO "\n");
+ pr_info("\n");
}
i++;
@@ -771,7 +753,7 @@ int __init mtrr_cleanup(unsigned address_bits)
index_good = mtrr_search_optimal_index();
if (index_good != -1) {
- printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ pr_info("Found optimal setting for mtrr clean up\n");
i = index_good;
mtrr_print_out_one_result(i);
@@ -781,8 +763,8 @@ int __init mtrr_cleanup(unsigned address_bits)
gran_size = result[i].gran_sizek;
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
@@ -791,13 +773,13 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
}
- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+ pr_info("mtrr_cleanup: can not find optimal value\n");
+ pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
return 0;
}
#else
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
return 0;
}
@@ -825,12 +807,13 @@ int __init amd_special_default_mtrr(void)
{
u32 l, h;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return 0;
- if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+ if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
@@ -854,7 +837,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
- return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
+ return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED);
}
/**
@@ -876,15 +859,18 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
+ if (!mtrr_enabled())
+ return 0;
+
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
*/
- if (!is_cpu(INTEL) || disable_mtrr_trim)
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
- def &= 0xff;
+ def &= MTRR_DEF_TYPE_TYPE;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
@@ -910,7 +896,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* kvm/qemu doesn't have mtrr set right, don't trim them all: */
if (!highest_pfn) {
- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+ pr_info("CPU MTRRs all blank - virtualized system.\n");
return 0;
}
@@ -965,13 +951,14 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
end_pfn);
if (total_trim_size) {
- pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
+ pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
if (!changed_by_mtrr_cleanup)
WARN_ON(1);
pr_info("update e820 for mtrr\n");
- update_e820();
+ e820__update_table_print();
return 1;
}
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 68a3343e5798..238dad57d4d6 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -97,6 +98,7 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
case 7:
if (size < 0x40)
break;
+ fallthrough;
case 6:
case 5:
case 4:
@@ -137,9 +139,9 @@ static void prepare_set(void)
u32 cr0;
/* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
@@ -167,11 +169,11 @@ static void post_set(void)
setCx86(CX86_CCR3, ccr3);
/* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
+ write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(cr4);
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
@@ -232,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
post_set();
}
-typedef struct {
- unsigned long base;
- unsigned long size;
- mtrr_type type;
-} arr_state_t;
-
-static arr_state_t arr_state[8] = {
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
-};
-
-static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
-
-static void cyrix_set_all(void)
-{
- int i;
-
- prepare_set();
-
- /* the CCRs are not contiguous */
- for (i = 0; i < 4; i++)
- setCx86(CX86_CCR0 + i, ccr_state[i]);
- for (; i < 7; i++)
- setCx86(CX86_CCR4 + i, ccr_state[i]);
-
- for (i = 0; i < 8; i++) {
- cyrix_set_arr(i, arr_state[i].base,
- arr_state[i].size, arr_state[i].type);
- }
-
- post_set();
-}
-
-static const struct mtrr_ops cyrix_mtrr_ops = {
- .vendor = X86_VENDOR_CYRIX,
- .set_all = cyrix_set_all,
+const struct mtrr_ops cyrix_mtrr_ops = {
+ .var_regs = 8,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init cyrix_init_mtrr(void)
-{
- set_mtrr_ops(&cyrix_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..0863733858dc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,21 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
- * because MTRRs can span upto 40 bits (36bits on most modern x86)
+ * because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
-#define DEBUG
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
-
+#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
-#include <asm/system.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
@@ -31,19 +35,70 @@ static struct fixed_range_block fixed_range_blocks[] = {
{}
};
+struct cache_map {
+ u64 start;
+ u64 end;
+ u64 flags;
+ u64 type:8;
+ u64 fixed:1;
+};
+
+bool mtrr_debug;
+
+static int __init mtrr_param_setup(char *str)
+{
+ int rc = 0;
+
+ if (!str)
+ return -EINVAL;
+ if (!strcmp(str, "debug"))
+ mtrr_debug = true;
+ else
+ rc = -EINVAL;
+
+ return rc;
+}
+early_param("mtrr", mtrr_param_setup);
+
+/*
+ * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where
+ * no 2 adjacent ranges have the same cache mode (those would be merged).
+ * The number is based on the worst case:
+ * - no two adjacent fixed MTRRs share the same cache mode
+ * - one variable MTRR is spanning a huge area with mode WB
+ * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2
+ * additional ranges each (result like "ababababa...aba" with a = WB, b = UC),
+ * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries
+ * - a TOP_MEM2 area (even when overlapping an UC MTRR it can't add 2 range entries
+ * to the possible maximum, as it always starts at 4GB, thus it can't be in
+ * the middle of that MTRR, unless that MTRR starts at 0, which would remove
+ * the initial "a" from the "abababa" pattern above)
+ * The map won't contain ranges with no matching MTRR (those fall back to the
+ * default cache mode).
+ */
+#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2)
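
As a worked example, with the values these macros usually have in the x86 headers (stated here as assumptions, not taken from this patch): MTRR_NUM_FIXED_RANGES == 88 and MTRR_MAX_VAR_RANGES == 256, giving CACHE_MAP_MAX == 88 + 256 * 2 == 600 map entries.
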
+
+static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata;
+static struct cache_map *cache_map __refdata = init_cache_map;
+static unsigned int cache_map_size = CACHE_MAP_MAX;
+static unsigned int cache_map_n;
+static unsigned int cache_map_fixed;
+
static unsigned long smp_changes_mask;
static int mtrr_state_set;
u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
-EXPORT_SYMBOL_GPL(mtrr_state);
+
+/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
+u32 phys_hi_rsvd;
/*
* BIOS is expected to clear MtrrFixDramModEn bit, see for example
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
- * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
@@ -54,116 +109,448 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
- rdmsr(MSR_K8_SYSCFG, lo, hi);
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
- printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
- mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi);
+ }
+}
+
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+ u64 size;
+
+ mask |= (u64)phys_hi_rsvd << 32;
+ size = -mask;
+
+ return size;
+}
+
+static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size)
+{
+ struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg;
+
+ if (!(mtrr->mask_lo & MTRR_PHYSMASK_V))
+ return MTRR_TYPE_INVALID;
+
+ *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK);
+ *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) +
+ (mtrr->mask_lo & PAGE_MASK));
+
+ return mtrr->base_lo & MTRR_PHYSBASE_TYPE;
+}
+
+static u8 get_effective_type(u8 type1, u8 type2)
+{
+ if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK))
+ return MTRR_TYPE_WRTHROUGH;
+
+ if (type1 != type2)
+ return MTRR_TYPE_UNCACHABLE;
+
+ return type1;
+}
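
The merge rules mirror MTRR precedence: UC wins over everything, WB and WT combine to WT, and any other conflicting pair degrades to UC. For example:

    /*
     * get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_UNCACHABLE) == MTRR_TYPE_UNCACHABLE
     * get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_WRTHROUGH)  == MTRR_TYPE_WRTHROUGH
     * get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_WRCOMB)     == MTRR_TYPE_UNCACHABLE
     * get_effective_type(MTRR_TYPE_WRBACK, MTRR_TYPE_WRBACK)     == MTRR_TYPE_WRBACK
     */
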
+
+static void rm_map_entry_at(int idx)
+{
+ cache_map_n--;
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx, cache_map + idx + 1,
+ sizeof(*cache_map) * (cache_map_n - idx));
}
}
/*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Add an entry into cache_map at a specific index. Merges adjacent entries if
+ * appropriate. Return the number of merges for correcting the scan index
+ * (this is needed as merging will reduce the number of entries, which will
+ * result in skipping entries in future iterations if the scan index isn't
+ * corrected).
+ * Note that the corrected index can never go below -1 (resulting in being 0 in
+ * the next scan iteration), as "2" is returned only if the current index is
+ * larger than zero.
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int add_map_entry_at(u64 start, u64 end, u8 type, int idx)
{
+ bool merge_prev = false, merge_next = false;
+
+ if (start >= end)
+ return 0;
+
+ if (idx > 0) {
+ struct cache_map *prev = cache_map + idx - 1;
+
+ if (!prev->fixed && start == prev->end && type == prev->type)
+ merge_prev = true;
+ }
+
+ if (idx < cache_map_n) {
+ struct cache_map *next = cache_map + idx;
+
+ if (!next->fixed && end == next->start && type == next->type)
+ merge_next = true;
+ }
+
+ if (merge_prev && merge_next) {
+ cache_map[idx - 1].end = cache_map[idx].end;
+ rm_map_entry_at(idx);
+ return 2;
+ }
+ if (merge_prev) {
+ cache_map[idx - 1].end = end;
+ return 1;
+ }
+ if (merge_next) {
+ cache_map[idx].start = start;
+ return 1;
+ }
+
+ /* Sanity check: the array should NEVER be too small! */
+ if (cache_map_n == cache_map_size) {
+ WARN(1, "MTRR cache mode memory map exhausted!\n");
+ cache_map_n = cache_map_fixed;
+ return 0;
+ }
+
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx + 1, cache_map + idx,
+ sizeof(*cache_map) * (cache_map_n - idx));
+ }
+
+ cache_map[idx].start = start;
+ cache_map[idx].end = end;
+ cache_map[idx].type = type;
+ cache_map[idx].fixed = 0;
+ cache_map_n++;
+
+ return 0;
+}
+
+/* Clear a part of an entry. Return 1 if start of entry is still valid. */
+static int clr_map_range_at(u64 start, u64 end, int idx)
+{
+ int ret = start != cache_map[idx].start;
+ u64 tmp;
+
+ if (start == cache_map[idx].start && end == cache_map[idx].end) {
+ rm_map_entry_at(idx);
+ } else if (start == cache_map[idx].start) {
+ cache_map[idx].start = end;
+ } else if (end == cache_map[idx].end) {
+ cache_map[idx].end = start;
+ } else {
+ tmp = cache_map[idx].end;
+ cache_map[idx].end = start;
+ add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1);
+ }
+
+ return ret;
+}
+
+/*
+ * Add MTRR to the map. The current map is scanned and each part of the MTRR
+ * either overlapping with an existing entry or with a hole in the map is
+ * handled separately.
+ */
+static void add_map_entry(u64 start, u64 end, u8 type)
+{
+ u8 new_type, old_type;
+ u64 tmp;
int i;
- u64 base, mask;
- u8 prev_match, curr_match;
-
- if (!mtrr_state_set)
- return 0xFF;
-
- if (!mtrr_state.enabled)
- return 0xFF;
-
- /* Make end inclusive end, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state.have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
+
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ if (start >= cache_map[i].end)
+ continue;
+
+ if (start < cache_map[i].start) {
+ /* Region start has no overlap. */
+ tmp = min(end, cache_map[i].start);
+ i -= add_map_entry_at(start, tmp, type, i);
+ start = tmp;
+ continue;
+ }
+
+ new_type = get_effective_type(type, cache_map[i].type);
+ old_type = cache_map[i].type;
+
+ if (cache_map[i].fixed || new_type == old_type) {
+ /* Cut off start of new entry. */
+ start = cache_map[i].end;
+ continue;
}
+
+ /* Handle only overlapping part of region. */
+ tmp = min(end, cache_map[i].end);
+ i += clr_map_range_at(start, tmp, i);
+ i -= add_map_entry_at(start, tmp, new_type, i);
+ start = tmp;
}
+ /* Add rest of region after last map entry (rest might be empty). */
+ add_map_entry_at(start, end, type, i);
+}
+
+/* Add variable MTRRs to cache map. */
+static void map_add_var(void)
+{
+ u64 start, size;
+ unsigned int i;
+ u8 type;
+
/*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
+ * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it
+ * needs to be added again when rebuilding the map due to potentially
+ * having moved as a result of variable MTRRs for memory below 4GB.
*/
- if (!(mtrr_state.enabled & 2))
- return mtrr_state.def_type;
+ if (mtrr_tom2) {
+ add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK);
+ cache_map[cache_map_n - 1].fixed = 1;
+ }
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
+ for (i = 0; i < num_var_ranges; i++) {
+ type = get_var_mtrr_state(i, &start, &size);
+ if (type != MTRR_TYPE_INVALID)
+ add_map_entry(start, start + size, type);
+ }
+}
- if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
- continue;
+/*
+ * Rebuild map by replacing variable entries. Needs to be called when MTRR
+ * registers are being changed after boot, as such changes could include
+ * removals of registers, which are complicated to handle without rebuild of
+ * the map.
+ */
+void generic_rebuild_map(void)
+{
+ if (mtrr_if != &generic_mtrr_ops)
+ return;
- base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
- (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
- (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
+ cache_map_n = cache_map_fixed;
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
+ map_add_var();
+}
- if ((start & mask) != (base & mask))
- continue;
+static unsigned int __init get_cache_map_size(void)
+{
+ return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0);
+}
- curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
+/* Build the cache_map containing the cache modes per memory range. */
+void __init mtrr_build_map(void)
+{
+ u64 start, end, size;
+ unsigned int i;
+ u8 type;
+
+ /* Add fixed MTRRs, optimize for adjacent entries with same type. */
+ if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) {
+ /*
+ * Start with 64k size fixed entries, preset 1st one (hence the
+ * loop below is starting with index 1).
+ */
+ start = 0;
+ end = size = 0x10000;
+ type = mtrr_state.fixed_ranges[0];
+
+ for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) {
+ /* 8 64k entries, then 16 16k ones, rest 4k. */
+ if (i == 8 || i == 24)
+ size >>= 2;
+
+ if (mtrr_state.fixed_ranges[i] != type) {
+ add_map_entry(start, end, type);
+ start = end;
+ type = mtrr_state.fixed_ranges[i];
+ }
+ end += size;
}
+ add_map_entry(start, end, type);
+ }
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE) {
- return MTRR_TYPE_UNCACHABLE;
+ /* Mark fixed, they take precedence. */
+ for (i = 0; i < cache_map_n; i++)
+ cache_map[i].fixed = 1;
+ cache_map_fixed = cache_map_n;
+
+ map_add_var();
+
+ pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n",
+ cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed,
+ get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0));
+
+ if (mtrr_debug) {
+ for (i = 0; i < cache_map_n; i++) {
+ pr_info("%3u: %016llx-%016llx %s\n", i,
+ cache_map[i].start, cache_map[i].end - 1,
+ mtrr_attrib_to_str(cache_map[i].type));
}
+ }
+}
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
+/* Copy the cache_map from __initdata memory to dynamically allocated one. */
+void __init mtrr_copy_map(void)
+{
+ unsigned int new_size = get_cache_map_size();
+
+ if (!mtrr_state.enabled || !new_size) {
+ cache_map = NULL;
+ return;
+ }
+
+ mutex_lock(&mtrr_mutex);
+
+ cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL);
+ if (cache_map) {
+ memmove(cache_map, init_cache_map,
+ cache_map_n * sizeof(*cache_map));
+ cache_map_size = new_size;
+ } else {
+ mtrr_state.enabled = 0;
+ pr_err("MTRRs disabled due to allocation failure for lookup map.\n");
+ }
+
+ mutex_unlock(&mtrr_mutex);
+}
+
+/**
+ * guest_force_mtrr_state - set static MTRR state for a guest
+ *
+ * Used to set MTRR state via different means (e.g. with data obtained from
+ * a hypervisor).
+ * Is allowed only for special cases when running virtualized. Must be called
+ * from the x86_init.hyper.init_platform() hook. It can be called only once.
+ * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
+ * is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
+ */
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type)
+{
+ unsigned int i;
+
+ /* Only allowed to be called once before mtrr_bp_init(). */
+ if (WARN_ON_ONCE(mtrr_state_set))
+ return;
+
+ /* Only allowed when running virtualized. */
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /*
+ * Only allowed for special virtualization cases:
+ * - when running as Hyper-V, SEV-SNP guest using vTOM
+ * - when running as Xen PV guest
+ * - when running as SEV-SNP or TDX guest to avoid unnecessary
+ * VMM communication/Virtualization exceptions (#VC, #VE)
+ */
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) &&
+ !hv_is_isolation_supported() &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV) &&
+ !cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return;
+
+ /* Disable MTRR in order to disable MTRR modifications. */
+ setup_clear_cpu_cap(X86_FEATURE_MTRR);
+
+ if (var) {
+ if (num_var > MTRR_MAX_VAR_RANGES) {
+ pr_warn("Trying to overwrite MTRR state with %u variable entries\n",
+ num_var);
+ num_var = MTRR_MAX_VAR_RANGES;
}
+ for (i = 0; i < num_var; i++)
+ mtrr_state.var_ranges[i] = var[i];
+ num_var_ranges = num_var;
+ }
+
+ mtrr_state.def_type = def_type;
+ mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED;
+
+ mtrr_state_set = 1;
+}
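
A hedged sketch of a caller, with an illustrative function name that is not part of this patch:

    /* Hypothetical x86_init.hyper.init_platform() hook forcing a
     * write-back default with no variable ranges. */
    static void __init example_init_platform(void)
    {
            guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
    }
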
+
+static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
+{
+ u8 effective_type;
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
+ if (type == MTRR_TYPE_INVALID)
+ return new_type;
+
+ effective_type = get_effective_type(type, new_type);
+ if (type != effective_type)
+ *uniform = 0;
+
+ return effective_type;
+}
+
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
+{
+ u8 type = MTRR_TYPE_INVALID;
+ unsigned int i;
+
+ if (!mtrr_state_set) {
+ /* Uniformity is unknown. */
+ *uniform = 0;
+ return MTRR_TYPE_UNCACHABLE;
}
- if (mtrr_tom2) {
- if (start >= (1ULL<<32) && (end < mtrr_tom2))
- return MTRR_TYPE_WRBACK;
+ *uniform = 1;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_UNCACHABLE;
+
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ /* Region after current map entry? -> continue with next one. */
+ if (start >= cache_map[i].end)
+ continue;
+
+ /* Start of region not covered by current map entry? */
+ if (start < cache_map[i].start) {
+ /* At least some part of region has default type. */
+ type = type_merge(type, mtrr_state.def_type, uniform);
+ /* End of region not covered, too? -> lookup done. */
+ if (end <= cache_map[i].start)
+ return type;
+ }
+
+ /* At least part of region covered by map entry. */
+ type = type_merge(type, cache_map[i].type, uniform);
+
+ start = cache_map[i].end;
}
- if (prev_match != 0xFF)
- return prev_match;
+ /* End of region past last entry in map? -> use default type. */
+ if (start < end)
+ type = type_merge(type, mtrr_state.def_type, uniform);
- return mtrr_state.def_type;
+ return type;
}
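A hypothetical caller, shown only to illustrate the contract (the helper name is made up):

	static bool range_is_uniform_wb(u64 start, u64 end)
	{
		u8 uniform;
		u8 type = mtrr_type_lookup(start, end, &uniform);

		/* Valid for the whole region only if uniform was left at 1. */
		return uniform && type == MTRR_TYPE_WRBACK;
	}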
/* Get the MSR pair relating to a var range */
@@ -205,7 +592,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (cpu_has_mtrr)
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
@@ -218,8 +605,8 @@ static void __init print_fixed_last(void)
if (!last_fixed_end)
return;
- pr_debug(" %05X-%05X %s\n", last_fixed_start,
- last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+ pr_info(" %05X-%05X %s\n", last_fixed_start,
+ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
last_fixed_end = 0;
}
@@ -252,19 +639,18 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
}
}
-static void prepare_set(void);
-static void post_set(void);
-
static void __init print_mtrr_state(void)
{
unsigned int i;
int high_width;
- pr_debug("MTRR default type: %s\n",
- mtrr_attrib_to_str(mtrr_state.def_type));
+ pr_info("MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- pr_debug("MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -276,44 +662,40 @@ static void __init print_mtrr_state(void)
/* tail */
print_fixed_last();
}
- pr_debug("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
- if (size_or_mask & 0xffffffffUL)
- high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
- else
- high_width = ffs(size_or_mask>>32) + 32 - 1;
- high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
+ high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
- pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
- i,
- high_width,
- mtrr_state.var_ranges[i].base_hi,
- mtrr_state.var_ranges[i].base_lo >> 12,
- high_width,
- mtrr_state.var_ranges[i].mask_hi,
- mtrr_state.var_ranges[i].mask_lo >> 12,
- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V)
+ pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo &
+ MTRR_PHYSBASE_TYPE));
else
- pr_debug(" %u disabled\n", i);
+ pr_info(" %u disabled\n", i);
}
if (mtrr_tom2)
- pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
+ pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
/* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
- unsigned long flags;
unsigned lo, dummy;
unsigned int i;
vrs = mtrr_state.var_ranges;
rdmsr(MSR_MTRRcap, lo, dummy);
- mtrr_state.have_fixed = (lo >> 8) & 1;
+ mtrr_state.have_fixed = lo & MTRR_CAP_FIX;
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
@@ -321,8 +703,8 @@ void __init get_mtrr_state(void)
get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MSR_MTRRdefType, lo, dummy);
- mtrr_state.def_type = (lo & 0xff);
- mtrr_state.enabled = (lo & 0xc00) >> 10;
+ mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE;
+ mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT;
if (amd_special_default_mtrr()) {
unsigned low, high;
@@ -335,18 +717,12 @@ void __init get_mtrr_state(void)
mtrr_tom2 &= 0xffffff800000ULL;
}
- print_mtrr_state();
+ if (mtrr_debug)
+ print_mtrr_state();
mtrr_state_set = 1;
- /* PAT setup for BP. We need to go through sync steps here */
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
@@ -357,14 +733,14 @@ void __init mtrr_state_warn(void)
if (!mask)
return;
if (mask & MTRR_CHANGE_MASK_FIXED)
- pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_VARIABLE)
- pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_DEFTYPE)
- pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
- printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
- printk(KERN_INFO "mtrr: corrected configuration.\n");
+ pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+ pr_info("mtrr: corrected configuration.\n");
}
/*
@@ -375,8 +751,7 @@ void __init mtrr_state_warn(void)
void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
{
if (wrmsr_safe(msr, a, b) < 0) {
- printk(KERN_ERR
- "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+ pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
}
}
@@ -431,19 +806,19 @@ generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
static void generic_get_mtrr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type)
{
- unsigned int mask_lo, mask_hi, base_lo, base_hi;
- unsigned int tmp, hi;
- int cpu;
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ unsigned int hi;
+ u64 tmp, mask;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
- cpu = get_cpu();
+ get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
- if ((mask_lo & 0x800) == 0) {
+ if (!(mask_lo & MTRR_PHYSMASK_V)) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
@@ -454,17 +829,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
- tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
- mask_lo = size_or_mask | tmp;
+ tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK);
+ mask = (u64)phys_hi_rsvd << 32 | tmp;
/* Expand tmp with high bits to all 1s: */
- hi = fls(tmp);
+ hi = fls64(tmp);
if (hi > 0) {
- tmp |= ~((1<<(hi - 1)) - 1);
+ tmp |= ~((1ULL<<(hi - 1)) - 1);
- if (tmp != mask_lo) {
- printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
- mask_lo = tmp;
+ if (tmp != mask) {
+ pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ mask = tmp;
}
}
@@ -472,9 +848,9 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
- *size = -mask_lo;
- *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
- *type = base_lo & 0xff;
+ *size = -mask >> PAGE_SHIFT;
+ *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ *type = base_lo & MTRR_PHYSBASE_TYPE;
out_put_cpu:
put_cpu();
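Worked example of the mask handling above, assuming 36 physical address bits (so phys_hi_rsvd == GENMASK(31, 4) == 0xFFFFFFF0): a correctly programmed 64 MiB range yields mask == 0xFFFFFFFFFC000000, hence -mask == 0x4000000 (64 MiB) and *size == -mask >> PAGE_SHIFT == 0x4000 pages. If the BIOS left the bits above the range clear, the fls64()-based expansion fills them in, and the kernel warns and taints with TAINT_FIRMWARE_WORKAROUND.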
@@ -512,9 +888,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
bool changed = false;
rdmsr(MTRRphysBase_MSR(index), lo, hi);
- if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
- || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD)
+ || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
changed = true;
@@ -522,9 +897,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), lo, hi);
- if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
- || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD)
+ || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
changed = true;
}
@@ -536,7 +910,10 @@ static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
- * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * NOTE: The CPU must already be in a safe state for MTRR changes, which
+ *	includes ensuring that only a single CPU can be active in
+ *	set_mtrr_state(), so that use of deftype_lo is not subject to races.
+ *	This is accomplished by taking cache_disable_lock.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
@@ -556,103 +933,46 @@ static unsigned long set_mtrr_state(void)
* Set_mtrr_restore restores the old value of MTRRdefType,
* so to set it we fiddle with the saved value:
*/
- if ((deftype_lo & 0xff) != mtrr_state.def_type
- || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
+ if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type ||
+ ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) {
- deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
- (mtrr_state.enabled << 10);
+ deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) |
+ mtrr_state.def_type |
+ (mtrr_state.enabled << MTRR_STATE_SHIFT);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
return change_mask;
}
-
-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts,
- * they would run extremely slow and would only increase the pain.
- *
- * The caller must ensure that local interrupts are disabled and
- * are reenabled after post_set() has been called.
- */
-static void prepare_set(void) __acquires(set_atomicity_lock)
+void mtrr_disable(void)
{
- unsigned long cr0;
-
- /*
- * Note that this is not ideal
- * since the cache is only flushed/disabled for this CPU while the
- * MTRRs are changed, but changing this requires more invasive
- * changes to the way the kernel boots
- */
-
- raw_spin_lock(&set_atomicity_lock);
-
- /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
- cr0 = read_cr0() | X86_CR0_CD;
- write_cr0(cr0);
- wbinvd();
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
- }
-
- /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- __flush_tlb();
-
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi);
}
-static void post_set(void) __releases(set_atomicity_lock)
+void mtrr_enable(void)
{
- /* Flush TLBs (no need to flush caches - they are disabled) */
- __flush_tlb();
-
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
-
- /* Enable caches */
- write_cr0(read_cr0() & 0xbfffffff);
-
- /* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(cr4);
- raw_spin_unlock(&set_atomicity_lock);
}
-static void generic_set_all(void)
+void mtrr_generic_set_state(void)
{
unsigned long mask, count;
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
- /* also set PAT */
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
/* Use the atomic bitops to update the global mask */
- for (count = 0; count < sizeof mask * 8; ++count) {
+ for (count = 0; count < sizeof(mask) * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
-
}
/**
@@ -674,7 +994,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
- prepare_set();
+ cache_disable();
if (size == 0) {
/*
@@ -685,15 +1005,15 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
memset(vr, 0, sizeof(struct mtrr_var_range));
} else {
vr->base_lo = base << PAGE_SHIFT | type;
- vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
- vr->mask_lo = -size << PAGE_SHIFT | 0x800;
- vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+ vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
+ vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V;
+ vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
- post_set();
+ cache_enable();
local_irq_restore(flags);
}
@@ -706,17 +1026,16 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
- if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask <= 7) {
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
+ boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
- pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+ pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
- pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+ pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
return -EINVAL;
}
}
@@ -730,7 +1049,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
lbase = lbase >> 1, last = last >> 1)
;
if (lbase != last) {
- pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+ pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
return -EINVAL;
}
return 0;
@@ -740,7 +1059,7 @@ static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
rdmsr(MSR_MTRRcap, config, dummy);
- return config & (1 << 10);
+ return config & MTRR_CAP_WC;
}
int positive_have_wrcomb(void)
@@ -752,8 +1071,6 @@ int positive_have_wrcomb(void)
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
- .use_intel_if = 1,
- .set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 79289632cb27..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
-#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -43,7 +43,7 @@ mtrr_file_add(unsigned long base, unsigned long size,
max = num_var_ranges;
if (fcount == NULL) {
- fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
+ fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL);
if (!fcount)
return -ENOMEM;
FILE_FCOUNT(file) = fcount;
@@ -99,28 +99,16 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
char *ptr;
char line[LINE_SIZE];
int length;
- size_t linelen;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
memset(line, 0, LINE_SIZE);
- length = len;
- length--;
-
- if (length > LINE_SIZE - 1)
- length = LINE_SIZE - 1;
-
+ len = min_t(size_t, len, LINE_SIZE - 1);
+ length = strncpy_from_user(line, buf, len);
if (length < 0)
- return -EINVAL;
+ return length;
- if (copy_from_user(line, buf, length))
- return -EFAULT;
-
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
@@ -149,17 +137,16 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
return -EINVAL;
ptr = skip_spaces(ptr + 5);
- for (i = 0; i < MTRR_NUM_TYPES; ++i) {
- if (strcmp(ptr, mtrr_strings[i]))
- continue;
- base >>= PAGE_SHIFT;
- size >>= PAGE_SHIFT;
- err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
- if (err < 0)
- return err;
- return len;
- }
- return -EINVAL;
+ i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr);
+ if (i < 0)
+ return i;
+
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
+ if (err < 0)
+ return err;
+ return len;
}
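The accepted write syntax is unchanged by the match_string() conversion; for reference, the userspace side looks like:

	echo "base=0xf8000000 size=0x400000 type=write-combining" > /proc/mtrr
	echo "disable=2" > /proc/mtrr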
static long
@@ -167,11 +154,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
{
int err = 0;
mtrr_type type;
+ unsigned long base;
unsigned long size;
struct mtrr_sentry sentry;
struct mtrr_gentry gentry;
void __user *arg = (void __user *) __arg;
+ memset(&gentry, 0, sizeof(gentry));
+
switch (cmd) {
case MTRRIOC_ADD_ENTRY:
case MTRRIOC_SET_ENTRY:
@@ -181,12 +171,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
case MTRRIOC_SET_PAGE_ENTRY:
case MTRRIOC_DEL_PAGE_ENTRY:
case MTRRIOC_KILL_PAGE_ENTRY:
- if (copy_from_user(&sentry, arg, sizeof sentry))
+ if (copy_from_user(&sentry, arg, sizeof(sentry)))
return -EFAULT;
break;
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
- if (copy_from_user(&gentry, arg, sizeof gentry))
+ if (copy_from_user(&gentry, arg, sizeof(gentry)))
return -EFAULT;
break;
#ifdef CONFIG_COMPAT
@@ -231,8 +221,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -241,24 +229,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -267,14 +249,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that go above 4GB */
- if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
|| size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
gentry.base = gentry.size = gentry.type = 0;
else {
- gentry.base <<= PAGE_SHIFT;
+ gentry.base = base << PAGE_SHIFT;
gentry.size = size << PAGE_SHIFT;
gentry.type = type;
}
@@ -284,8 +266,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -294,8 +274,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -303,16 +281,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -321,11 +295,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#endif
if (gentry.regnum >= num_var_ranges)
return -EINVAL;
- mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
/* Hide entries that would overflow */
if (size != (__typeof__(gentry.size))size)
gentry.base = gentry.size = gentry.type = 0;
else {
+ gentry.base = base;
gentry.size = size;
gentry.type = type;
}
@@ -338,7 +313,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
switch (cmd) {
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
- if (copy_to_user(arg, &gentry, sizeof gentry))
+ if (copy_to_user(arg, &gentry, sizeof(gentry)))
err = -EFAULT;
break;
#ifdef CONFIG_COMPAT
@@ -377,36 +352,13 @@ static int mtrr_close(struct inode *ino, struct file *file)
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -423,15 +375,37 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
- len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
- "(%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size,
- factor, mtrr_usage_table[i],
- mtrr_attrib_to_str(type));
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
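(struct proc_ops replaced struct file_operations for procfs entries in v5.6; .proc_compat_ioctl exists only under CONFIG_COMPAT, hence the #ifdef around it.)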
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -442,7 +416,7 @@ static int __init mtrr_if_init(void)
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
arch_initcall(mtrr_if_init);
diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c
new file mode 100644
index 000000000000..2415ffaaf02c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/legacy.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <asm/cpufeature.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+#include "mtrr.h"
+
+void mtrr_set_if(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ /* Pre-Athlon (K6) AMD CPU MTRRs */
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR))
+ mtrr_if = &amd_mtrr_ops;
+ break;
+ case X86_VENDOR_CENTAUR:
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR))
+ mtrr_if = &centaur_mtrr_ops;
+ break;
+ case X86_VENDOR_CYRIX:
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR))
+ mtrr_if = &cyrix_mtrr_ops;
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * The suspend/resume methods are only for CPUs without MTRRs. CPUs using the
+ * generic MTRR driver don't require this.
+ */
+struct mtrr_value {
+ mtrr_type ltype;
+ unsigned long lbase;
+ unsigned long lsize;
+};
+
+static struct mtrr_value *mtrr_value;
+
+static int mtrr_save(void)
+{
+ int i;
+
+ if (!mtrr_value)
+ return -ENOMEM;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
+ }
+ return 0;
+}
+
+static void mtrr_restore(void)
+{
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ if (mtrr_value[i].lsize) {
+ mtrr_if->set(i, mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
+ }
+ }
+}
+
+static struct syscore_ops mtrr_syscore_ops = {
+	.suspend	= mtrr_save,
+	.resume		= mtrr_restore,
+};
+
+void mtrr_register_syscore(void)
+{
+ mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL);
+
+ /*
+	 * These CPUs have no MTRRs and seem to not support SMP. They have
+	 * specific drivers; we use a tricky method to support
+	 * suspend/resume for them.
+ *
+ * TBD: is there any system with such CPU which supports
+ * suspend/resume? If no, we should remove the code.
+ */
+	register_syscore_ops(&mtrr_syscore_ops);
+}
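Note: syscore suspend/resume callbacks run late in the suspend sequence, on a single CPU with interrupts disabled, so no locking around mtrr_value is needed here.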
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 79556bd9b602..4b3d492afe17 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: LGPL-2.0+
/* Generic MTRR (Memory Type Range Register) driver.
Copyright (C) 1997-2000 Richard Gooch
Copyright (c) 2002 Patrick Mochel
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
Richard Gooch may be reached by email at rgooch@atnf.csiro.au
The postal address is:
Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
@@ -31,53 +18,50 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
-#define DEBUG
-
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/cpu.h>
#include <linux/pci.h>
#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
-#include <asm/processor.h>
-#include <asm/e820.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
+#include <asm/memtype.h>
#include "mtrr.h"
-u32 num_var_ranges;
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
-unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
-static DEFINE_MUTEX(mtrr_mutex);
+/* arch_phys_wc_add returns an MTRR register index plus this offset. */
+#define MTRR_TO_PHYS_WC_OFFSET 1000
-u64 size_or_mask, size_and_mask;
-static bool mtrr_aps_delayed_init;
+u32 num_var_ranges;
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+DEFINE_MUTEX(mtrr_mutex);
const struct mtrr_ops *mtrr_if;
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
-
-void set_mtrr_ops(const struct mtrr_ops *ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
-}
-
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
struct pci_dev *dev;
- u8 rev;
dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
if (dev != NULL) {
@@ -87,13 +71,11 @@ static int have_wrcomb(void)
* chipsets to be tagged
*/
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
+ dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+ dev->revision <= 5) {
+ pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
}
/*
* Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -101,7 +83,7 @@ static int have_wrcomb(void)
*/
if (dev->vendor == PCI_VENDOR_ID_INTEL &&
dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
- pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
+ pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
@@ -110,21 +92,6 @@ static int have_wrcomb(void)
return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
}
-/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
-{
- unsigned long config = 0, dummy;
-
- if (use_intel())
- rdmsr(MSR_MTRRcap, config, dummy);
- else if (is_cpu(AMD))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
-
- num_var_ranges = config & 0xff;
-}
-
static void __init init_table(void)
{
int i, max;
@@ -135,8 +102,6 @@ static void __init init_table(void)
}
struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
unsigned long smp_base;
unsigned long smp_size;
unsigned int smp_reg;
@@ -144,41 +109,19 @@ struct set_mtrr_data {
};
/**
- * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
* @info: pointer to mtrr configuration data
*
+ * Returns zero (the stop_machine() handler prototype requires an int return).
*/
-static void ipi_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
{
-#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
- unsigned long flags;
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
- cpu_relax();
- /* The master has cleared me to execute */
- if (data->smp_reg != ~0U) {
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init) {
- /*
- * Initialize the MTRRs inaddition to the synchronisation.
- */
- mtrr_if->set_all();
- }
-
- atomic_dec(&data->count);
- while (atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
-#endif
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
+ return 0;
}
static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +141,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
*
* This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
*
- * 1. Send IPI to do the following:
+ * 1. Queue work to do the following on all processors:
* 2. Disable Interrupts
* 3. Wait for all procs to do so
* 4. Enter no-fill cache mode
@@ -214,81 +157,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 14. Wait for buddies to catch up
* 15. Enable interrupts.
*
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be reset.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRRs
+ * (the CPU vendors may each do it differently, so we call the
+ * mtrr_if->set() callback and let them take care of it) and enabling
+ * interrupts.
*
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
*/
-static void
-set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
+static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size,
+ mtrr_type type)
{
- struct set_mtrr_data data;
- unsigned long flags;
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
- /* Make sure data.count is visible before unleashing other CPUs */
- smp_wmb();
- atomic_set(&data.gate, 0);
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
- /* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
-
- local_irq_save(flags);
-
- while (atomic_read(&data.count))
- cpu_relax();
-
- /* Ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 1);
-
- /* Do our MTRR business */
-
- /*
- * HACK!
- * We use this same function to initialize the mtrrs on boot.
- * The state of the boot cpu's mtrrs has been saved, and we want
- * to replicate across all the APs.
- * If we're doing that @reg is set to something special...
- */
- if (reg != ~0U)
- mtrr_if->set(reg, base, size, type);
- else if (!mtrr_aps_delayed_init)
- mtrr_if->set_all();
-
- /* Wait for the others */
- while (atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- smp_wmb();
- atomic_set(&data.gate, 0);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while (atomic_read(&data.count))
- cpu_relax();
-
- local_irq_restore(flags);
+ generic_rebuild_map();
}
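Callers must already hold the CPU hotplug lock, because stop_machine_cpuslocked() is used. A sketch of the calling convention (illustrative only; mtrr_add_page()/mtrr_del_page() below are the real callers):

	cpus_read_lock();
	mutex_lock(&mtrr_mutex);
	set_mtrr(reg, base, size, type);	/* rendezvous on all CPUs */
	mutex_unlock(&mtrr_mutex);
	cpus_read_unlock();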
/**
@@ -333,7 +222,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
int i, replace, error;
mtrr_type ltype;
- if (!mtrr_if)
+ if (!mtrr_enabled())
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
@@ -341,23 +230,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return error;
if (type >= MTRR_NUM_TYPES) {
- pr_warning("mtrr: type: %u invalid\n", type);
+ pr_warn("type: %u invalid\n", type);
return -EINVAL;
}
/* If the type is WC, check that this processor supports it */
if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- pr_warning("mtrr: your processor doesn't support write-combining\n");
+ pr_warn("your processor doesn't support write-combining\n");
return -ENOSYS;
}
if (!size) {
- pr_warning("mtrr: zero sized request\n");
+ pr_warn("zero sized request\n");
return -EINVAL;
}
- if (base & size_or_mask || size & size_or_mask) {
- pr_warning("mtrr: base or size exceeds the MTRR width\n");
+ if ((base | (base + size - 1)) >>
+ (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
+ pr_warn("base or size exceeds the MTRR width\n");
return -EINVAL;
}
@@ -365,7 +255,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
replace = -1;
/* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
+ cpus_read_lock();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
@@ -388,8 +278,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
} else if (types_compatible(type, ltype))
continue;
}
- pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%lx000\n", base, size, lbase,
+ pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
}
@@ -397,7 +286,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (ltype != type) {
if (types_compatible(type, ltype))
continue;
- pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+ pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
mtrr_attrib_to_str(type));
goto out;
@@ -423,20 +312,20 @@ int mtrr_add_page(unsigned long base, unsigned long size,
}
}
} else {
- pr_info("mtrr: no more MTRRs available\n");
+ pr_info("no more MTRRs available\n");
}
error = i;
out:
mutex_unlock(&mtrr_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return error;
}
static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
- pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
- pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ pr_warn("size and base must be multiples of 4 kiB\n");
+ Dprintk("size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
}
@@ -481,12 +370,13 @@ static int mtrr_check(unsigned long base, unsigned long size)
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
increment);
}
-EXPORT_SYMBOL(mtrr_add);
/**
* mtrr_del_page - delete a memory type region
@@ -509,12 +399,12 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
unsigned long lbase, lsize;
int error = -EINVAL;
- if (!mtrr_if)
- return -ENXIO;
+ if (!mtrr_enabled())
+ return -ENODEV;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
@@ -526,22 +416,21 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
}
}
if (reg < 0) {
- pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
- base, size);
+ Dprintk("no MTRR for %lx000,%lx000 found\n", base, size);
goto out;
}
}
if (reg >= max) {
- pr_warning("mtrr: register: %d too big\n", reg);
+ pr_warn("register: %d too big\n", reg);
goto out;
}
mtrr_if->get(reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
- pr_warning("mtrr: MTRR %d not used\n", reg);
+ pr_warn("MTRR %d not used\n", reg);
goto out;
}
if (mtrr_usage_table[reg] < 1) {
- pr_warning("mtrr: reg: %d has count=0\n", reg);
+ pr_warn("reg: %d has count=0\n", reg);
goto out;
}
if (--mtrr_usage_table[reg] < 1)
@@ -549,7 +438,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
error = reg;
out:
mutex_unlock(&mtrr_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return error;
}
@@ -569,240 +458,176 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}
-EXPORT_SYMBOL(mtrr_del);
-/*
- * HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
+/**
+ * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If PAT is available, this does nothing. If PAT is unavailable, it
+ * attempts to add a WC MTRR covering size bytes starting at base and
+ * logs an error if this fails.
+ *
+ * The caller should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
+ * Drivers must store the return value to pass to arch_phys_wc_del(),
+ * but drivers should not try to interpret that return value.
*/
-static void __init init_ifs(void)
+int arch_phys_wc_add(unsigned long base, unsigned long size)
{
-#ifndef CONFIG_X86_64
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-#endif
-}
-
-/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
- * MTRR driver doesn't require this
- */
-struct mtrr_value {
- mtrr_type ltype;
- unsigned long lbase;
- unsigned long lsize;
-};
+ int ret;
-static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
+ if (pat_enabled() || !mtrr_enabled())
+ return 0; /* Success! (We don't need to do anything.) */
-static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &mtrr_value[i].lbase,
- &mtrr_value[i].lsize,
- &mtrr_value[i].ltype);
+ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
+ if (ret < 0) {
+ pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
+ (void *)base, (void *)(base + size - 1));
+ return ret;
}
- return 0;
+ return ret + MTRR_TO_PHYS_WC_OFFSET;
}
+EXPORT_SYMBOL(arch_phys_wc_add);
-static int mtrr_restore(struct sys_device *sysdev)
+/*
+ * arch_phys_wc_del - undoes arch_phys_wc_add
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This cleans up after arch_phys_wc_add().
+ *
+ * The API guarantees that arch_phys_wc_del(error code) and
+ * arch_phys_wc_del(0) do nothing.
+ */
+void arch_phys_wc_del(int handle)
{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_value[i].lsize) {
- set_mtrr(i, mtrr_value[i].lbase,
- mtrr_value[i].lsize,
- mtrr_value[i].ltype);
- }
+ if (handle >= 1) {
+ WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
+ mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
}
- return 0;
}
+EXPORT_SYMBOL(arch_phys_wc_del);
-
-
-static struct sysdev_driver mtrr_sysdev_driver = {
- .suspend = mtrr_save,
- .resume = mtrr_restore,
-};
+/*
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This will turn the return value from arch_phys_wc_add into an mtrr
+ * index suitable for debugging.
+ *
+ * Note: There is no legitimate use for this function, except possibly
+ * in a printk line. Alas, there is an illegitimate use in some ancient
+ * drm ioctls.
+ */
+int arch_phys_wc_index(int handle)
+{
+ if (handle < MTRR_TO_PHYS_WC_OFFSET)
+ return -1;
+ else
+ return handle - MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
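A sketch of the intended driver pattern (all names hypothetical):

	static int example_wc_cookie;

	static void example_map_vram(unsigned long phys, unsigned long len)
	{
		/* Returns 0 and does nothing when PAT is available. */
		example_wc_cookie = arch_phys_wc_add(phys, len);
	}

	static void example_unmap_vram(void)
	{
		/* Safe to call with 0 or a negative error code. */
		arch_phys_wc_del(example_wc_cookie);
	}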
int __initdata changed_by_mtrr_cleanup;
/**
- * mtrr_bp_init - initialize mtrrs on the boot CPU
+ * mtrr_bp_init - initialize MTRRs on the boot CPU
*
* This needs to be called early; before any of the other CPUs are
* initialized (i.e. before smp_init()).
- *
*/
void __init mtrr_bp_init(void)
{
- u32 phys_addr;
+ bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR);
+ const char *why = "(not available)";
+ unsigned long config, dummy;
- init_ifs();
-
- phys_addr = 32;
-
- if (cpu_has_mtrr) {
- mtrr_if = &generic_mtrr_ops;
- size_or_mask = 0xff000000; /* 36 bits */
- size_and_mask = 0x00f00000;
- phys_addr = 36;
+ phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32);
+ if (!generic_mtrrs && mtrr_state.enabled) {
/*
- * This is an AMD specific MSR, but we assume(hope?) that
- * Intel will implement it to when they extend the address
- * bus of the Xeon.
+ * Software overwrite of MTRR state, only for generic case.
+ * Note that X86_FEATURE_MTRR has been reset in this case.
*/
- if (cpuid_eax(0x80000000) >= 0x80000008) {
- phys_addr = cpuid_eax(0x80000008) & 0xff;
- /* CPUID workaround for Intel 0F33/0F34 CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 0xF &&
- boot_cpu_data.x86_model == 0x3 &&
- (boot_cpu_data.x86_mask == 0x3 ||
- boot_cpu_data.x86_mask == 0x4))
- phys_addr = 36;
-
- size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
- size_and_mask = ~size_or_mask & 0xfffff00000ULL;
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
- boot_cpu_data.x86 == 6) {
- /*
- * VIA C* family have Intel style MTRRs,
- * but don't support PAE
- */
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- phys_addr = 32;
- }
- } else {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (cpu_has_k6_mtrr) {
- /* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CENTAUR:
- if (cpu_has_centaur_mcr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CYRIX:
- if (cpu_has_cyrix_arr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- default:
- break;
- }
+ init_table();
+ mtrr_build_map();
+ pr_info("MTRRs set to read-only\n");
+
+ return;
}
- if (mtrr_if) {
- set_num_var_ranges();
- init_table();
- if (use_intel()) {
- get_mtrr_state();
+ if (generic_mtrrs)
+ mtrr_if = &generic_mtrr_ops;
+ else
+ mtrr_set_if();
- if (mtrr_cleanup(phys_addr)) {
- changed_by_mtrr_cleanup = 1;
- mtrr_if->set_all();
+ if (mtrr_enabled()) {
+ /* Get the number of variable MTRR ranges. */
+ if (mtrr_if == &generic_mtrr_ops)
+ rdmsr(MSR_MTRRcap, config, dummy);
+ else
+ config = mtrr_if->var_regs;
+ num_var_ranges = config & MTRR_CAP_VCNT;
+
+ init_table();
+ if (mtrr_if == &generic_mtrr_ops) {
+ /* BIOS may override */
+ if (get_mtrr_state()) {
+ memory_caching_control |= CACHE_MTRR;
+ changed_by_mtrr_cleanup = mtrr_cleanup();
+ mtrr_build_map();
+ } else {
+ mtrr_if = NULL;
+ why = "by BIOS";
}
}
}
-}
-void mtrr_ap_init(void)
-{
- if (!use_intel() || mtrr_aps_delayed_init)
- return;
- /*
- * Ideally we should hold mtrr_mutex here to avoid mtrr entries
- * changed, but this routine will be called in cpu boot time,
- * holding the lock breaks it.
- *
- * This routine is called in two cases:
- *
- * 1. very earily time of software resume, when there absolutely
- * isn't mtrr entry changes;
- *
- * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
- * lock to prevent mtrr entry changes
- */
- set_mtrr(~0U, 0, 0, 0);
+ if (!mtrr_enabled())
+ pr_info("MTRRs disabled %s\n", why);
}
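Worked example for the phys_hi_rsvd computation above: with boot_cpu_data.x86_phys_bits == 36, GENMASK(31, 36 - 32) == GENMASK(31, 4) == 0xFFFFFFF0, i.e. address bits 36-63 (bits 4-31 of the MSR high dword) are treated as reserved.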
/**
- * Save current fixed-range MTRR state of the BSP
+ * mtrr_save_state - Save current fixed-range MTRR state of the first
+ * cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
- smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
-}
-
-void set_mtrr_aps_delayed_init(void)
-{
- if (!use_intel())
- return;
-
- mtrr_aps_delayed_init = true;
-}
+ int first_cpu;
-/*
- * MTRR initialization for all AP's
- */
-void mtrr_aps_init(void)
-{
- if (!use_intel())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
- set_mtrr(~0U, 0, 0, 0);
- mtrr_aps_delayed_init = false;
+ first_cpu = cpumask_first(cpu_online_mask);
+ smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
}
-void mtrr_bp_restore(void)
+static int __init mtrr_init_finalize(void)
{
- if (!use_intel())
- return;
-
- mtrr_if->set_all();
-}
+ /*
+ * Map might exist if guest_force_mtrr_state() has been called or if
+ * mtrr_enabled() returns true.
+ */
+ mtrr_copy_map();
-static int __init mtrr_init_finialize(void)
-{
- if (!mtrr_if)
+ if (!mtrr_enabled())
return 0;
- if (use_intel()) {
+ if (memory_caching_control & CACHE_MTRR) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
}
- /*
- * The CPU has no MTRR and seems to not support SMP. They have
- * specific drivers, we use a tricky method to support
- * suspend/resume for them.
- *
- * TBD: is there any system with such CPU which supports
- * suspend/resume? If no, we should remove the code.
- */
- sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver);
+ mtrr_register_syscore();
return 0;
}
-subsys_initcall(mtrr_init_finialize);
+subsys_initcall(mtrr_init_finalize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31a27..2de3bd2f95d1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* local MTRR defines.
*/
@@ -9,15 +10,15 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
+extern bool mtrr_debug;
+#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0)
+
extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
- u32 vendor;
- u32 use_intel_if;
+ u32 var_regs;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
- void (*set_all)(void);
-
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
@@ -45,34 +46,45 @@ struct set_mtrr_context {
u32 ccr3;
};
-void set_mtrr_done(struct set_mtrr_context *ctxt);
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
-
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
-
-extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
-
-#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
-#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
+extern struct mutex mtrr_mutex;
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
extern struct mtrr_state_type mtrr_state;
+extern u32 phys_hi_rsvd;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
-
-/* CPU specific mtrr init functions */
-int amd_init_mtrr(void);
-int cyrix_init_mtrr(void);
-int centaur_init_mtrr(void);
+#ifdef CONFIG_X86_32
+void mtrr_set_if(void);
+void mtrr_register_syscore(void);
+#else
+static inline void mtrr_set_if(void) { }
+static inline void mtrr_register_syscore(void) { }
+#endif
+void mtrr_build_map(void);
+void mtrr_copy_map(void);
+
+/* CPU specific mtrr_ops vectors. */
+extern const struct mtrr_ops amd_mtrr_ops;
+extern const struct mtrr_ops cyrix_mtrr_ops;
+extern const struct mtrr_ops centaur_mtrr_ops;
extern int changed_by_mtrr_cleanup;
-extern int mtrr_cleanup(unsigned address_bits);
+extern int mtrr_cleanup(void);
+
+/*
+ * Must be used by code which uses mtrr_if to call platform-specific
+ * MTRR manipulation functions.
+ */
+static inline bool mtrr_enabled(void)
+{
+ return !!mtrr_if;
+}
+void generic_rebuild_map(void);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
deleted file mode 100644
index 5db5b7d65a18..000000000000
--- a/arch/x86/kernel/cpu/perf_event.c
+++ /dev/null
@@ -1,1781 +0,0 @@
-/*
- * Performance events x86 architecture code
- *
- * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2009 Jaswinder Singh Rajput
- * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- * Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/cpu.h>
-#include <linux/bitops.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-#include <asm/compat.h>
-
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) \
-do { \
- trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
- (unsigned long)(val)); \
- native_write_msr((msr), (u32)((u64)(val)), \
- (u32)((u64)(val) >> 32)); \
-} while (0)
-#endif
-
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- int type = in_nmi() ? KM_NMI : KM_IRQ0;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
-
- map = kmap_atomic(page, type);
- memcpy(to, map+offset, size);
- kunmap_atomic(map, type);
- put_page(page);
-
- len += size;
- to += size;
- addr += size;
-
- } while (len < n);
-
- return len;
-}
-
-struct event_constraint {
- union {
- unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- u64 idxmsk64;
- };
- u64 code;
- u64 cmask;
- int weight;
-};
-
-struct amd_nb {
- int nb_id; /* NorthBridge id */
- int refcnt; /* reference count */
- struct perf_event *owners[X86_PMC_IDX_MAX];
- struct event_constraint event_constraints[X86_PMC_IDX_MAX];
-};
-
-#define MAX_LBR_ENTRIES 16
-
-struct cpu_hw_events {
- /*
- * Generic x86 PMC bits
- */
- struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
- unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int enabled;
-
- int n_events;
- int n_added;
- int n_txn;
- int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
- u64 tags[X86_PMC_IDX_MAX];
- struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
-
- unsigned int group_flag;
-
- /*
- * Intel DebugStore bits
- */
- struct debug_store *ds;
- u64 pebs_enabled;
-
- /*
- * Intel LBR bits
- */
- int lbr_users;
- void *lbr_context;
- struct perf_branch_stack lbr_stack;
- struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
-
- /*
- * AMD specific bits
- */
- struct amd_nb *amd_nb;
-};
-
-#define __EVENT_CONSTRAINT(c, n, m, w) {\
- { .idxmsk64 = (n) }, \
- .code = (c), \
- .cmask = (m), \
- .weight = (w), \
-}
-
-#define EVENT_CONSTRAINT(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
-
-/*
- * Constraint on the Event code.
- */
-#define INTEL_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
-
-/*
- * Constraint on the Event code + UMask + fixed-mask
- *
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- * - inv
- * - edge
- * - cnt-mask
- * The other filters are supported by fixed counters.
- * The any-thread option is supported starting with v3.
- */
-#define FIXED_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
-
-/*
- * Constraint on the Event code + UMask
- */
-#define PEBS_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-
-#define EVENT_CONSTRAINT_END \
- EVENT_CONSTRAINT(0, 0, 0)
-
-#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->weight; (e)++)
-
-union perf_capabilities {
- struct {
- u64 lbr_format : 6;
- u64 pebs_trap : 1;
- u64 pebs_arch_reg : 1;
- u64 pebs_format : 4;
- u64 smm_freeze : 1;
- };
- u64 capabilities;
-};
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
- /*
- * Generic x86 PMC bits
- */
- const char *name;
- int version;
- int (*handle_irq)(struct pt_regs *);
- void (*disable_all)(void);
- void (*enable_all)(int added);
- void (*enable)(struct perf_event *);
- void (*disable)(struct perf_event *);
- int (*hw_config)(struct perf_event *event);
- int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
- unsigned eventsel;
- unsigned perfctr;
- u64 (*event_map)(int);
- int max_events;
- int num_counters;
- int num_counters_fixed;
- int cntval_bits;
- u64 cntval_mask;
- int apic;
- u64 max_period;
- struct event_constraint *
- (*get_event_constraints)(struct cpu_hw_events *cpuc,
- struct perf_event *event);
-
- void (*put_event_constraints)(struct cpu_hw_events *cpuc,
- struct perf_event *event);
- struct event_constraint *event_constraints;
- void (*quirks)(void);
-
- int (*cpu_prepare)(int cpu);
- void (*cpu_starting)(int cpu);
- void (*cpu_dying)(int cpu);
- void (*cpu_dead)(int cpu);
-
- /*
- * Intel Arch Perfmon v2+
- */
- u64 intel_ctrl;
- union perf_capabilities intel_cap;
-
- /*
- * Intel DebugStore bits
- */
- int bts, pebs;
- int pebs_record_size;
- void (*drain_pebs)(struct pt_regs *regs);
- struct event_constraint *pebs_constraints;
-
- /*
- * Intel LBR
- */
- unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
- int lbr_nr; /* hardware stack size */
-};
-
-static struct x86_pmu x86_pmu __read_mostly;
-
-static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
- .enabled = 1,
-};
-
-static int x86_perf_event_set_period(struct perf_event *event);
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per-model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-static u64 __read_mostly hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-/*
- * Propagate event elapsed time into the generic event.
- * Can only be executed on the CPU where the event is active.
- * Returns the delta events processed.
- */
-static u64
-x86_perf_event_update(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int shift = 64 - x86_pmu.cntval_bits;
- u64 prev_raw_count, new_raw_count;
- int idx = hwc->idx;
- s64 delta;
-
- if (idx == X86_PMC_IDX_FIXED_BTS)
- return 0;
-
- /*
- * Careful: an NMI might modify the previous event value.
- *
- * Our tactic to handle this is to first atomically read and
- * exchange a new raw count - then add that new-prev delta
- * count to the generic event atomically:
- */
-again:
- prev_raw_count = atomic64_read(&hwc->prev_count);
- rdmsrl(hwc->event_base + idx, new_raw_count);
-
- if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
- goto again;
-
- /*
- * Now we have the new raw value and have updated the prev
- * timestamp already. We can now calculate the elapsed delta
- * (event-)time and add that to the generic event.
- *
- * Careful, not all hw sign-extends above the physical width
- * of the count.
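- *
- * Shifting the raw values up to bit 63 and arithmetic-shifting
- * them back down sign-extends a cntval_bits wide counter to 64 bits.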
- */
- delta = (new_raw_count << shift) - (prev_raw_count << shift);
- delta >>= shift;
-
- atomic64_add(delta, &event->count);
- atomic64_sub(delta, &hwc->period_left);
-
- return new_raw_count;
-}
-
-static atomic_t active_events;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-static bool reserve_pmc_hardware(void)
-{
- int i;
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- disable_lapic_nmi_watchdog();
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
- goto perfctr_fail;
- }
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
- goto eventsel_fail;
- }
-
- return true;
-
-eventsel_fail:
- for (i--; i >= 0; i--)
- release_evntsel_nmi(x86_pmu.eventsel + i);
-
- i = x86_pmu.num_counters;
-
-perfctr_fail:
- for (i--; i >= 0; i--)
- release_perfctr_nmi(x86_pmu.perfctr + i);
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-
- return false;
-}
-
-static void release_pmc_hardware(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- release_perfctr_nmi(x86_pmu.perfctr + i);
- release_evntsel_nmi(x86_pmu.eventsel + i);
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-}
-
-#else
-
-static bool reserve_pmc_hardware(void) { return true; }
-static void release_pmc_hardware(void) {}
-
-#endif
-
-static int reserve_ds_buffers(void);
-static void release_ds_buffers(void);
-
-static void hw_perf_event_destroy(struct perf_event *event)
-{
- if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
- release_pmc_hardware();
- release_ds_buffers();
- mutex_unlock(&pmc_reserve_mutex);
- }
-}
-
-static inline int x86_pmu_initialized(void)
-{
- return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
-{
- unsigned int cache_type, cache_op, cache_result;
- u64 config, val;
-
- config = attr->config;
-
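- /*
- * attr->config packs the cache event: type in bits 0-7,
- * op in bits 8-15 and result in bits 16-23.
- */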
- cache_type = (config >> 0) & 0xff;
- if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
- return -EINVAL;
-
- cache_op = (config >> 8) & 0xff;
- if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
- return -EINVAL;
-
- cache_result = (config >> 16) & 0xff;
- if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
- return -EINVAL;
-
- val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
- if (val == 0)
- return -ENOENT;
-
- if (val == -1)
- return -EINVAL;
-
- hwc->config |= val;
-
- return 0;
-}
-
-static int x86_setup_perfctr(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- struct hw_perf_event *hwc = &event->hw;
- u64 config;
-
- if (!hwc->sample_period) {
- hwc->sample_period = x86_pmu.max_period;
- hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
- }
-
- if (attr->type == PERF_TYPE_RAW)
- return 0;
-
- if (attr->type == PERF_TYPE_HW_CACHE)
- return set_ext_hw_attr(hwc, attr);
-
- if (attr->config >= x86_pmu.max_events)
- return -EINVAL;
-
- /*
- * The generic map:
- */
- config = x86_pmu.event_map(attr->config);
-
- if (config == 0)
- return -ENOENT;
-
- if (config == -1LL)
- return -EINVAL;
-
- /*
- * Branch tracing:
- */
- if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
- (hwc->sample_period == 1)) {
- /* BTS is not supported by this architecture. */
- if (!x86_pmu.bts)
- return -EOPNOTSUPP;
-
- /* BTS is currently only allowed for user-mode. */
- if (!attr->exclude_kernel)
- return -EOPNOTSUPP;
- }
-
- hwc->config |= config;
-
- return 0;
-}
-
-static int x86_pmu_hw_config(struct perf_event *event)
-{
- if (event->attr.precise_ip) {
- int precise = 0;
-
- /* Support for constant skid */
- if (x86_pmu.pebs)
- precise++;
-
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr)
- precise++;
-
- if (event->attr.precise_ip > precise)
- return -EOPNOTSUPP;
- }
-
- /*
- * Generate PMC IRQs:
- * (keep 'enabled' bit clear for now)
- */
- event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
-
- /*
- * Count user and OS events unless requested not to
- */
- if (!event->attr.exclude_user)
- event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
- if (!event->attr.exclude_kernel)
- event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
-
- if (event->attr.type == PERF_TYPE_RAW)
- event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
-
- return x86_setup_perfctr(event);
-}
-
-/*
- * Set up the hardware configuration for a given attr_type
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
- int err;
-
- if (!x86_pmu_initialized())
- return -ENODEV;
-
- err = 0;
- if (!atomic_inc_not_zero(&active_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_events) == 0) {
- if (!reserve_pmc_hardware())
- err = -EBUSY;
- else {
- err = reserve_ds_buffers();
- if (err)
- release_pmc_hardware();
- }
- }
- if (!err)
- atomic_inc(&active_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
-
- event->destroy = hw_perf_event_destroy;
-
- event->hw.idx = -1;
- event->hw.last_cpu = -1;
- event->hw.last_tag = ~0ULL;
-
- return x86_pmu.hw_config(event);
-}
-
-static void x86_pmu_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- u64 val;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- rdmsrl(x86_pmu.eventsel + idx, val);
- if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
- continue;
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(x86_pmu.eventsel + idx, val);
- }
-}
-
-void hw_perf_disable(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu_initialized())
- return;
-
- if (!cpuc->enabled)
- return;
-
- cpuc->n_added = 0;
- cpuc->enabled = 0;
- barrier();
-
- x86_pmu.disable_all();
-}
-
-static void x86_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_event *event = cpuc->events[idx];
- u64 val;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- val = event->hw.config;
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(x86_pmu.eventsel + idx, val);
- }
-}
-
-static const struct pmu pmu;
-
-static inline int is_x86_event(struct perf_event *event)
-{
- return event->pmu == &pmu;
-}
-
-static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
- struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int i, j, w, wmax, num = 0;
- struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
- for (i = 0; i < n; i++) {
- c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
- constraints[i] = c;
- }
-
- /*
- * fastpath, try to reuse previous register
- */
- for (i = 0; i < n; i++) {
- hwc = &cpuc->event_list[i]->hw;
- c = constraints[i];
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- /* not already used */
- if (test_bit(hwc->idx, used_mask))
- break;
-
- __set_bit(hwc->idx, used_mask);
- if (assign)
- assign[i] = hwc->idx;
- }
- if (i == n)
- goto done;
-
- /*
- * begin slow path
- */
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
- /*
- * weight = number of possible counters
- *
- * 1 = most constrained, only works on one counter
- * wmax = least constrained, works on any counter
- *
- * assign events to counters starting with most
- * constrained events.
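- *
- * e.g. an event that can only run on one specific counter
- * (weight 1) is assigned before an event that may use any
- * generic counter (weight == num_counters).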
- */
- wmax = x86_pmu.num_counters;
-
- /*
- * when fixed event counters are present,
- * wmax is incremented by 1 to account
- * for one more choice
- */
- if (x86_pmu.num_counters_fixed)
- wmax++;
-
- for (w = 1, num = n; num && w <= wmax; w++) {
- /* for each event */
- for (i = 0; num && i < n; i++) {
- c = constraints[i];
- hwc = &cpuc->event_list[i]->hw;
-
- if (c->weight != w)
- continue;
-
- for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_bit(j, used_mask))
- break;
- }
-
- if (j == X86_PMC_IDX_MAX)
- break;
-
- __set_bit(j, used_mask);
-
- if (assign)
- assign[i] = j;
- num--;
- }
- }
-done:
- /*
- * scheduling failed or is just a simulation,
- * free resources if necessary
- */
- if (!assign || num) {
- for (i = 0; i < n; i++) {
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
- }
- }
- return num ? -ENOSPC : 0;
-}
-
-/*
- * dogrp: true if we must collect sibling events (group)
- * returns the total number of events, or an error code
- */
-static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
-{
- struct perf_event *event;
- int n, max_count;
-
- max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
-
- /* current number of events already accepted */
- n = cpuc->n_events;
-
- if (is_x86_event(leader)) {
- if (n >= max_count)
- return -ENOSPC;
- cpuc->event_list[n] = leader;
- n++;
- }
- if (!dogrp)
- return n;
-
- list_for_each_entry(event, &leader->sibling_list, group_entry) {
- if (!is_x86_event(event) ||
- event->state <= PERF_EVENT_STATE_OFF)
- continue;
-
- if (n >= max_count)
- return -ENOSPC;
-
- cpuc->event_list[n] = event;
- n++;
- }
- return n;
-}
-
-static inline void x86_assign_hw_event(struct perf_event *event,
- struct cpu_hw_events *cpuc, int i)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- hwc->idx = cpuc->assign[i];
- hwc->last_cpu = smp_processor_id();
- hwc->last_tag = ++cpuc->tags[i];
-
- if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
- hwc->config_base = 0;
- hwc->event_base = 0;
- } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- /*
- * We set it so that event_base + idx in wrmsr/rdmsr maps to
- * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
- */
- hwc->event_base =
- MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
- } else {
- hwc->config_base = x86_pmu.eventsel;
- hwc->event_base = x86_pmu.perfctr;
- }
-}
-
-static inline int match_prev_assignment(struct hw_perf_event *hwc,
- struct cpu_hw_events *cpuc,
- int i)
-{
- return hwc->idx == cpuc->assign[i] &&
- hwc->last_cpu == smp_processor_id() &&
- hwc->last_tag == cpuc->tags[i];
-}
-
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
-
-void hw_perf_enable(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int i, added = cpuc->n_added;
-
- if (!x86_pmu_initialized())
- return;
-
- if (cpuc->enabled)
- return;
-
- if (cpuc->n_added) {
- int n_running = cpuc->n_events - cpuc->n_added;
- /*
- * apply assignment obtained either from
- * hw_perf_group_sched_in() or x86_pmu_enable()
- *
- * step1: save events moving to new counters
- * step2: reprogram moved events into new counters
- */
- for (i = 0; i < n_running; i++) {
- event = cpuc->event_list[i];
- hwc = &event->hw;
-
- /*
- * we can avoid reprogramming the counter if:
- * - assigned same counter as last time
- * - running on same CPU as last time
- * - no other event has used the counter since
- */
- if (hwc->idx == -1 ||
- match_prev_assignment(hwc, cpuc, i))
- continue;
-
- x86_pmu_stop(event);
- }
-
- for (i = 0; i < cpuc->n_events; i++) {
- event = cpuc->event_list[i];
- hwc = &event->hw;
-
- if (!match_prev_assignment(hwc, cpuc, i))
- x86_assign_hw_event(event, cpuc, i);
- else if (i < n_running)
- continue;
-
- x86_pmu_start(event);
- }
- cpuc->n_added = 0;
- perf_events_lapic_init();
- }
-
- cpuc->enabled = 1;
- barrier();
-
- x86_pmu.enable_all(added);
-}
-
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
- u64 enable_mask)
-{
- wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
-}
-
-static inline void x86_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- wrmsrl(hwc->config_base + hwc->idx, hwc->config);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the event disabled in hw:
- */
-static int
-x86_perf_event_set_period(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- s64 left = atomic64_read(&hwc->period_left);
- s64 period = hwc->sample_period;
- int ret = 0, idx = hwc->idx;
-
- if (idx == X86_PMC_IDX_FIXED_BTS)
- return 0;
-
- /*
- * If we are way outside a reasonable range then just skip forward:
- */
- if (unlikely(left <= -period)) {
- left = period;
- atomic64_set(&hwc->period_left, left);
- hwc->last_period = period;
- ret = 1;
- }
-
- if (unlikely(left <= 0)) {
- left += period;
- atomic64_set(&hwc->period_left, left);
- hwc->last_period = period;
- ret = 1;
- }
- /*
- * Quirk: certain CPUs don't like it if just 1 hw_event is left:
- */
- if (unlikely(left < 2))
- left = 2;
-
- if (left > x86_pmu.max_period)
- left = x86_pmu.max_period;
-
- per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
- /*
- * The hw event starts counting from this event offset;
- * mark it to be able to extract future deltas:
- */
- atomic64_set(&hwc->prev_count, (u64)-left);
-
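- /*
- * Write -left to the counter: it counts upwards and will
- * overflow after 'left' more increments.
- */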
- wrmsrl(hwc->event_base + idx,
- (u64)(-left) & x86_pmu.cntval_mask);
-
- perf_event_update_userpage(event);
-
- return ret;
-}
-
-static void x86_pmu_enable_event(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- if (cpuc->enabled)
- __x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * activate a single event
- *
- * The event is added to the group of enabled events
- * but only if it can be scheduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-static int x86_pmu_enable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc;
- int assign[X86_PMC_IDX_MAX];
- int n, n0, ret;
-
- hwc = &event->hw;
-
- n0 = cpuc->n_events;
- n = collect_events(cpuc, event, false);
- if (n < 0)
- return n;
-
- /*
- * If a group event scheduling transaction was started,
- * skip the schedulability test here; it will be performed
- * at commit time (->commit_txn) as a whole
- */
- if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
- goto out;
-
- ret = x86_pmu.schedule_events(cpuc, n, assign);
- if (ret)
- return ret;
- /*
- * copy the new assignment now that we know it is possible;
- * it will be used by hw_perf_enable()
- */
- memcpy(cpuc->assign, assign, n*sizeof(int));
-
-out:
- cpuc->n_events = n;
- cpuc->n_added += n - n0;
- cpuc->n_txn += n - n0;
-
- return 0;
-}
-
-static int x86_pmu_start(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx = event->hw.idx;
-
- if (idx == -1)
- return -EAGAIN;
-
- x86_perf_event_set_period(event);
- cpuc->events[idx] = event;
- __set_bit(idx, cpuc->active_mask);
- x86_pmu.enable(event);
- perf_event_update_userpage(event);
-
- return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
- int ret = x86_pmu_start(event);
- WARN_ON_ONCE(ret);
-}
-
-void perf_event_print_debug(void)
-{
- u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- u64 pebs;
- struct cpu_hw_events *cpuc;
- unsigned long flags;
- int cpu, idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_events, cpu);
-
- if (x86_pmu.version >= 2) {
- rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
- rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
- rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
-
- pr_info("\n");
- pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
- pr_info("CPU#%d: status: %016llx\n", cpu, status);
- pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
- pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
- }
- pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
- rdmsrl(x86_pmu.perfctr + idx, pmc_count);
-
- prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
- pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
- cpu, idx, pmc_ctrl);
- pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
- cpu, idx, pmc_count);
- pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
- cpu, idx, prev_left);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
- pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
- cpu, idx, pmc_count);
- }
- local_irq_restore(flags);
-}
-
-static void x86_pmu_stop(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- int idx = hwc->idx;
-
- if (!__test_and_clear_bit(idx, cpuc->active_mask))
- return;
-
- x86_pmu.disable(event);
-
- /*
- * Drain the remaining delta count out of an event
- * that we are disabling:
- */
- x86_perf_event_update(event);
-
- cpuc->events[idx] = NULL;
-}
-
-static void x86_pmu_disable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int i;
-
- /*
- * If we're called during a txn, we don't need to do anything.
- * The events never got scheduled and ->cancel_txn will truncate
- * the event_list.
- */
- if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
- return;
-
- x86_pmu_stop(event);
-
- for (i = 0; i < cpuc->n_events; i++) {
- if (event == cpuc->event_list[i]) {
-
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, event);
-
- while (++i < cpuc->n_events)
- cpuc->event_list[i-1] = cpuc->event_list[i];
-
- --cpuc->n_events;
- break;
- }
- }
- perf_event_update_userpage(event);
-}
-
-static int x86_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int idx, handled = 0;
- u64 val;
-
- perf_sample_data_init(&data, 0);
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- event = cpuc->events[idx];
- hwc = &event->hw;
-
- val = x86_perf_event_update(event);
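- /*
- * Counters are programmed with a negative value; while the
- * top implemented bit (cntval_bits - 1) is still set, the
- * counter has not wrapped, i.e. no overflow has occurred.
- */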
- if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
- continue;
-
- /*
- * event overflow
- */
- handled = 1;
- data.period = event->hw.last_period;
-
- if (!x86_perf_event_set_period(event))
- continue;
-
- if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
-}
-
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
- irq_enter();
- ack_APIC_irq();
- inc_irq_stat(apic_pending_irqs);
- perf_event_do_pending();
- irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
-void perf_events_lapic_init(void)
-{
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- /*
- * Always use NMI for PMU
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static int __kprobes
-perf_event_nmi_handler(struct notifier_block *self,
- unsigned long cmd, void *__args)
-{
- struct die_args *args = __args;
- struct pt_regs *regs;
-
- if (!atomic_read(&active_events))
- return NOTIFY_DONE;
-
- switch (cmd) {
- case DIE_NMI:
- case DIE_NMI_IPI:
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
- regs = args->regs;
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /*
- * Can't rely on the handled return value to say it was our NMI: two
- * events could trigger 'simultaneously', raising two back-to-back NMIs.
- *
- * If the first NMI handles both, the latter will be empty and daze
- * the CPU.
- */
- x86_pmu.handle_irq(regs);
-
- return NOTIFY_STOP;
-}
-
-static __read_mostly struct notifier_block perf_event_nmi_notifier = {
- .notifier_call = perf_event_nmi_handler,
- .next = NULL,
- .priority = 1
-};
-
-static struct event_constraint unconstrained;
-static struct event_constraint emptyconstraint;
-
-static struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct event_constraint *c;
-
- if (x86_pmu.event_constraints) {
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- if ((event->hw.config & c->cmask) == c->code)
- return c;
- }
- }
-
- return &unconstrained;
-}
-
-#include "perf_event_amd.c"
-#include "perf_event_p6.c"
-#include "perf_event_p4.c"
-#include "perf_event_intel_lbr.c"
-#include "perf_event_intel_ds.c"
-#include "perf_event_intel.c"
-
-static int __cpuinit
-x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- if (x86_pmu.cpu_prepare)
- ret = x86_pmu.cpu_prepare(cpu);
- break;
-
- case CPU_STARTING:
- if (x86_pmu.cpu_starting)
- x86_pmu.cpu_starting(cpu);
- break;
-
- case CPU_DYING:
- if (x86_pmu.cpu_dying)
- x86_pmu.cpu_dying(cpu);
- break;
-
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- if (x86_pmu.cpu_dead)
- x86_pmu.cpu_dead(cpu);
- break;
-
- default:
- break;
- }
-
- return ret;
-}
-
-static void __init pmu_check_apic(void)
-{
- if (cpu_has_apic)
- return;
-
- x86_pmu.apic = 0;
- pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
- pr_info("no hardware sampling interrupt available.\n");
-}
-
-void __init init_hw_perf_events(void)
-{
- struct event_constraint *c;
- int err;
-
- pr_info("Performance Events: ");
-
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- err = intel_pmu_init();
- break;
- case X86_VENDOR_AMD:
- err = amd_pmu_init();
- break;
- default:
- return;
- }
- if (err != 0) {
- pr_cont("no PMU driver, software events only.\n");
- return;
- }
-
- pmu_check_apic();
-
- pr_cont("%s PMU driver.\n", x86_pmu.name);
-
- if (x86_pmu.quirks)
- x86_pmu.quirks();
-
- if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
- x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
- }
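- /*
- * Enable mask for MSR_CORE_PERF_GLOBAL_CTRL (Intel v2+): one bit
- * per generic counter; the fixed counters are added below at bit
- * X86_PMC_IDX_FIXED.
- */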
- x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
- perf_max_events = x86_pmu.num_counters;
-
- if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
- }
-
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
-
- perf_events_lapic_init();
- register_die_notifier(&perf_event_nmi_notifier);
-
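- /* the default constraint: an event may use any generic counter */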
- unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
- 0, x86_pmu.num_counters);
-
- if (x86_pmu.event_constraints) {
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != X86_RAW_EVENT_MASK)
- continue;
-
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- c->weight += x86_pmu.num_counters;
- }
- }
-
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", x86_pmu.num_counters);
- pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
- pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
-
- perf_cpu_notifier(x86_pmu_notifier);
-}
-
-static inline void x86_pmu_read(struct perf_event *event)
-{
- x86_perf_event_update(event);
-}
-
-/*
- * Start group events scheduling transaction
- * Set the flag to make pmu::enable() not perform the
- * schedulability test; it will be performed at commit time
- */
-static void x86_pmu_start_txn(const struct pmu *pmu)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
- cpuc->n_txn = 0;
-}
-
-/*
- * Stop group events scheduling transaction
- * Clear the flag and pmu::enable() will perform the
- * schedulability test.
- */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
- /*
- * Truncate the collected events.
- */
- cpuc->n_added -= cpuc->n_txn;
- cpuc->n_events -= cpuc->n_txn;
-}
-
-/*
- * Commit group events scheduling transaction
- * Perform the group schedulability test as a whole
- * Return 0 on success
- */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int assign[X86_PMC_IDX_MAX];
- int n, ret;
-
- n = cpuc->n_events;
-
- if (!x86_pmu_initialized())
- return -EAGAIN;
-
- ret = x86_pmu.schedule_events(cpuc, n, assign);
- if (ret)
- return ret;
-
- /*
- * copy the new assignment now that we know it is possible;
- * it will be used by hw_perf_enable()
- */
- memcpy(cpuc->assign, assign, n*sizeof(int));
-
- /*
- * Clear out the txn count so that ->cancel_txn() which gets
- * run after ->commit_txn() doesn't undo things.
- */
- cpuc->n_txn = 0;
-
- return 0;
-}
-
-static const struct pmu pmu = {
- .enable = x86_pmu_enable,
- .disable = x86_pmu_disable,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
- .unthrottle = x86_pmu_unthrottle,
- .start_txn = x86_pmu_start_txn,
- .cancel_txn = x86_pmu_cancel_txn,
- .commit_txn = x86_pmu_commit_txn,
-};
-
-/*
- * validate that we can schedule this event
- */
-static int validate_event(struct perf_event *event)
-{
- struct cpu_hw_events *fake_cpuc;
- struct event_constraint *c;
- int ret = 0;
-
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- return -ENOMEM;
-
- c = x86_pmu.get_event_constraints(fake_cpuc, event);
-
- if (!c || !c->weight)
- ret = -ENOSPC;
-
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(fake_cpuc, event);
-
- kfree(fake_cpuc);
-
- return ret;
-}
-
-/*
- * validate a single event group
- *
- * validation includes:
- * - check events are compatible with each other
- * - events do not compete for the same counter
- * - number of events <= number of counters
- *
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int validate_group(struct perf_event *event)
-{
- struct perf_event *leader = event->group_leader;
- struct cpu_hw_events *fake_cpuc;
- int ret, n;
-
- ret = -ENOMEM;
- fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
- if (!fake_cpuc)
- goto out;
-
- /*
- * the event is not yet connected with its
- * siblings; therefore we must first collect
- * existing siblings, then add the new event
- * before we can simulate the scheduling
- */
- ret = -ENOSPC;
- n = collect_events(fake_cpuc, leader, true);
- if (n < 0)
- goto out_free;
-
- fake_cpuc->n_events = n;
- n = collect_events(fake_cpuc, event, false);
- if (n < 0)
- goto out_free;
-
- fake_cpuc->n_events = n;
-
- ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-
-out_free:
- kfree(fake_cpuc);
-out:
- return ret;
-}
-
-const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
- const struct pmu *tmp;
- int err;
-
- err = __hw_perf_event_init(event);
- if (!err) {
- /*
- * we temporarily connect the event to its pmu
- * such that validate_group() can classify
- * it as an x86 event using is_x86_event()
- */
- tmp = event->pmu;
- event->pmu = &pmu;
-
- if (event->group_leader != event)
- err = validate_group(event);
- else
- err = validate_event(event);
-
- event->pmu = tmp;
- }
- if (err) {
- if (event->destroy)
- event->destroy(event);
- return ERR_PTR(err);
- }
-
- return &pmu;
-}
-
-/*
- * callchain support
- */
-
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
- if (entry->nr < PERF_MAX_STACK_DEPTH)
- entry->ip[entry->nr++] = ip;
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-
-
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
- /* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
- /* Ignore warnings */
-}
-
-static int backtrace_stack(void *data, char *name)
-{
- return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct perf_callchain_entry *entry = data;
-
- callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .warning = backtrace_warning,
- .warning_symbol = backtrace_warning_symbol,
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack_bp,
-};
-
-#include "../dumpstack.h"
-
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->ip);
-
- dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
-}
-
-#ifdef CONFIG_COMPAT
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- /* 32-bit process in 64-bit kernel. */
- struct stack_frame_ia32 frame;
- const void __user *fp;
-
- if (!test_thread_flag(TIF_IA32))
- return 0;
-
- fp = compat_ptr(regs->bp);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
- unsigned long bytes;
- frame.next_frame = 0;
- frame.return_address = 0;
-
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
- break;
-
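- /* a sane frame pointer must lie above the current stack pointer */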
- if (fp < compat_ptr(regs->sp))
- break;
-
- callchain_store(entry, frame.return_address);
- fp = compat_ptr(frame.next_frame);
- }
- return 1;
-}
-#else
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- return 0;
-}
-#endif
-
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- struct stack_frame frame;
- const void __user *fp;
-
- if (!user_mode(regs))
- regs = task_pt_regs(current);
-
- fp = (void __user *)regs->bp;
-
- callchain_store(entry, PERF_CONTEXT_USER);
- callchain_store(entry, regs->ip);
-
- if (perf_callchain_user32(regs, entry))
- return;
-
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
- unsigned long bytes;
- frame.next_frame = NULL;
- frame.return_address = 0;
-
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
- break;
-
- if ((unsigned long)fp < regs->sp)
- break;
-
- callchain_store(entry, frame.return_address);
- fp = frame.next_frame;
- }
-}
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- int is_user;
-
- if (!regs)
- return;
-
- is_user = user_mode(regs);
-
- if (is_user && current->state != TASK_RUNNING)
- return;
-
- if (!is_user)
- perf_callchain_kernel(regs, entry);
-
- if (current->mm)
- perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- struct perf_callchain_entry *entry;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- /* TODO: We don't support guest os callchain now */
- return NULL;
- }
-
- if (in_nmi())
- entry = &__get_cpu_var(pmc_nmi_entry);
- else
- entry = &__get_cpu_var(pmc_irq_entry);
-
- entry->nr = 0;
-
- perf_do_callchain(regs, entry);
-
- return entry;
-}
-
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
- regs->ip = ip;
- /*
- * perf_arch_fetch_caller_regs adds another call; we need to increment
- * the skip level
- */
- regs->bp = rewind_frame_pointer(skip + 1);
- regs->cs = __KERNEL_CS;
- /*
- * We abuse bit 3 to pass exact information, see perf_misc_flags
- * and the comment with PERF_EFLAGS_EXACT.
- */
- regs->flags = 0;
-}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
- unsigned long ip;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- ip = perf_guest_cbs->get_guest_ip();
- else
- ip = instruction_pointer(regs);
-
- return ip;
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
- int misc = 0;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- if (perf_guest_cbs->is_user_mode())
- misc |= PERF_RECORD_MISC_GUEST_USER;
- else
- misc |= PERF_RECORD_MISC_GUEST_KERNEL;
- } else {
- if (user_mode(regs))
- misc |= PERF_RECORD_MISC_USER;
- else
- misc |= PERF_RECORD_MISC_KERNEL;
- }
-
- if (regs->flags & PERF_EFLAGS_EXACT)
- misc |= PERF_RECORD_MISC_EXACT_IP;
-
- return misc;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
deleted file mode 100644
index c2897b7b4a3b..000000000000
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ /dev/null
@@ -1,420 +0,0 @@
-#ifdef CONFIG_CPU_SUP_AMD
-
-static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-
-static __initconst const u64 amd_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
- [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
- [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
- [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
- [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
- [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
- return amd_perfmon_event_map[hw_event];
-}
-
-static int amd_pmu_hw_config(struct perf_event *event)
-{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.type != PERF_TYPE_RAW)
- return 0;
-
- event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
-
- return 0;
-}
-
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline int amd_is_nb_event(struct hw_perf_event *hwc)
-{
- return (hwc->config & 0xe0) == 0xe0;
-}
-
-static inline int amd_has_nb(struct cpu_hw_events *cpuc)
-{
- struct amd_nb *nb = cpuc->amd_nb;
-
- return nb && nb->nb_id != -1;
-}
-
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct amd_nb *nb = cpuc->amd_nb;
- int i;
-
- /*
- * only care about NB events
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
- return;
-
- /*
- * need to scan whole list because event may not have
- * been assigned during scheduling
- *
- * no race condition possible because event can only
- * be removed on one CPU at a time AND PMU is disabled
- * when we come here
- */
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (nb->owners[i] == event) {
- cmpxchg(nb->owners+i, event, NULL);
- break;
- }
- }
-}
-
- /*
- * AMD64 NorthBridge events need special treatment because
- * counter access needs to be synchronized across all cores
- * of a package. Refer to BKDG section 3.12
- *
- * NB events are events measuring L3 cache and HyperTransport
- * traffic. They are identified by an event code >= 0xe00.
- * They measure events on the NorthBridge, which is shared
- * by all cores on a package. NB events are counted on a
- * shared set of counters. When a NB event is programmed
- * in a counter, the data actually comes from a shared
- * counter. Thus, access to those counters needs to be
- * synchronized.
- *
- * We implement the synchronization such that no two cores
- * can be measuring NB events using the same counters. Thus,
- * we maintain a per-NB allocation table. The available slot
- * is propagated using the event_constraint structure.
- *
- * We provide only one choice for each NB event based on
- * the fact that only NB events have restrictions. Consequently,
- * if a counter is available, there is a guarantee the NB event
- * will be assigned to it. If no slot is available, an empty
- * constraint is returned and scheduling will eventually fail
- * for this event.
- *
- * Note that all cores attached to the same NB compete for the same
- * counters to host NB events, this is why we use atomic ops. Some
- * multi-chip CPUs may have more than one NB.
- *
- * Given that resources are allocated (cmpxchg), they must be
- * eventually freed for others to use. This is accomplished by
- * calling amd_put_event_constraints().
- *
- * Non NB events are not impacted by this restriction.
- */
-static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct amd_nb *nb = cpuc->amd_nb;
- struct perf_event *old = NULL;
- int max = x86_pmu.num_counters;
- int i, j, k = -1;
-
- /*
- * if not NB event or no NB, then no constraints
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
- return &unconstrained;
-
- /*
- * detect if already present, if so reuse
- *
- * cannot merge with actual allocation
- * because of possible holes
- *
- * event can already be present yet not assigned (in hwc->idx)
- * because of successive calls to x86_schedule_events() from
- * hw_perf_group_sched_in() without hw_perf_enable()
- */
- for (i = 0; i < max; i++) {
- /*
- * keep track of first free slot
- */
- if (k == -1 && !nb->owners[i])
- k = i;
-
- /* already present, reuse */
- if (nb->owners[i] == event)
- goto done;
- }
- /*
- * not present, so grab a new slot
- * starting either at:
- */
- if (hwc->idx != -1) {
- /* previous assignment */
- i = hwc->idx;
- } else if (k != -1) {
- /* start from free slot found */
- i = k;
- } else {
- /*
- * event not found, no slot found in
- * first pass, try again from the
- * beginning
- */
- i = 0;
- }
- j = i;
- do {
- old = cmpxchg(nb->owners+i, NULL, event);
- if (!old)
- break;
- if (++i == max)
- i = 0;
- } while (i != j);
-done:
- if (!old)
- return &nb->event_constraints[i];
-
- return &emptyconstraint;
-}
-
-static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
-{
- struct amd_nb *nb;
- int i;
-
- nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
- if (!nb)
- return NULL;
-
- memset(nb, 0, sizeof(*nb));
- nb->nb_id = nb_id;
-
- /*
- * initialize all possible NB constraints
- */
- for (i = 0; i < x86_pmu.num_counters; i++) {
- __set_bit(i, nb->event_constraints[i].idxmsk);
- nb->event_constraints[i].weight = 1;
- }
- return nb;
-}
-
-static int amd_pmu_cpu_prepare(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
- WARN_ON_ONCE(cpuc->amd_nb);
-
- if (boot_cpu_data.x86_max_cores < 2)
- return NOTIFY_OK;
-
- cpuc->amd_nb = amd_alloc_nb(cpu, -1);
- if (!cpuc->amd_nb)
- return NOTIFY_BAD;
-
- return NOTIFY_OK;
-}
-
-static void amd_pmu_cpu_starting(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- struct amd_nb *nb;
- int i, nb_id;
-
- if (boot_cpu_data.x86_max_cores < 2)
- return;
-
- nb_id = amd_get_nb_id(cpu);
- WARN_ON_ONCE(nb_id == BAD_APICID);
-
- raw_spin_lock(&amd_nb_lock);
-
- for_each_online_cpu(i) {
- nb = per_cpu(cpu_hw_events, i).amd_nb;
- if (WARN_ON_ONCE(!nb))
- continue;
-
- if (nb->nb_id == nb_id) {
- kfree(cpuc->amd_nb);
- cpuc->amd_nb = nb;
- break;
- }
- }
-
- cpuc->amd_nb->nb_id = nb_id;
- cpuc->amd_nb->refcnt++;
-
- raw_spin_unlock(&amd_nb_lock);
-}
-
-static void amd_pmu_cpu_dead(int cpu)
-{
- struct cpu_hw_events *cpuhw;
-
- if (boot_cpu_data.x86_max_cores < 2)
- return;
-
- cpuhw = &per_cpu(cpu_hw_events, cpu);
-
- raw_spin_lock(&amd_nb_lock);
-
- if (cpuhw->amd_nb) {
- struct amd_nb *nb = cpuhw->amd_nb;
-
- if (nb->nb_id == -1 || --nb->refcnt == 0)
- kfree(nb);
-
- cpuhw->amd_nb = NULL;
- }
-
- raw_spin_unlock(&amd_nb_lock);
-}
-
-static __initconst const struct x86_pmu amd_pmu = {
- .name = "AMD",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
- .disable = x86_pmu_disable_event,
- .hw_config = amd_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_K7_EVNTSEL0,
- .perfctr = MSR_K7_PERFCTR0,
- .event_map = amd_pmu_event_map,
- .max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = 4,
- .cntval_bits = 48,
- .cntval_mask = (1ULL << 48) - 1,
- .apic = 1,
- /* use highest bit to detect overflow */
- .max_period = (1ULL << 47) - 1,
- .get_event_constraints = amd_get_event_constraints,
- .put_event_constraints = amd_put_event_constraints,
-
- .cpu_prepare = amd_pmu_cpu_prepare,
- .cpu_starting = amd_pmu_cpu_starting,
- .cpu_dead = amd_pmu_cpu_dead,
-};
-
-static __init int amd_pmu_init(void)
-{
- /* Performance-monitoring supported from K7 and later: */
- if (boot_cpu_data.x86 < 6)
- return -ENODEV;
-
- x86_pmu = amd_pmu;
-
- /* Events are common for all AMDs */
- memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- return 0;
-}
-
-#else /* CONFIG_CPU_SUP_AMD */
-
-static int amd_pmu_init(void)
-{
- return 0;
-}
-
-#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
deleted file mode 100644
index 214ac860ebe0..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ /dev/null
@@ -1,1008 +0,0 @@
-#ifdef CONFIG_CPU_SUP_INTEL
-
-/*
- * Intel PerfMon, used on Core and later.
- */
-static const u64 intel_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
-};
-
-static struct event_constraint intel_core_event_constraints[] =
-{
- INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_core2_event_constraints[] =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /*
- * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
- * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
- * ratio between these counters.
- */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
- INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
- INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_nehalem_event_constraints[] =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
- INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
- INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
- INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
- INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
- INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
- INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
- INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
- INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_gen_event_constraints[] =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
- EVENT_CONSTRAINT_END
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
- return intel_perfmon_event_map[hw_event];
-}
-
-static __initconst const u64 westmere_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
- [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
- [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
- [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
- [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
- [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static __initconst const u64 nehalem_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
- [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
- [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
- [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static __initconst const u64 core2_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static __initconst const u64 atom_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static void intel_pmu_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
- intel_pmu_disable_bts();
-
- intel_pmu_pebs_disable_all();
- intel_pmu_lbr_disable_all();
-}
-
-static void intel_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- intel_pmu_pebs_enable_all();
- intel_pmu_lbr_enable_all();
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
-
- if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- struct perf_event *event =
- cpuc->events[X86_PMC_IDX_FIXED_BTS];
-
- if (WARN_ON_ONCE(!event))
- return;
-
- intel_pmu_enable_bts(event->hw.config);
- }
-}
-
-/*
- * Workaround for:
- * Intel Errata AAK100 (model 26)
- * Intel Errata AAP53 (model 30)
- * Intel Errata BD53 (model 44)
- *
- * These chips need to be 'reset' when adding counters by programming
- * the magic three (non-counting) events 0x4300D2, 0x4300B1 and 0x4300B5
- * either in sequence on the same PMC or on different PMCs.
- */
-static void intel_pmu_nhm_enable_all(int added)
-{
- if (added) {
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int i;
-
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
-
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
-
- for (i = 0; i < 3; i++) {
- struct perf_event *event = cpuc->events[i];
-
- if (!event)
- continue;
-
- __x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
- }
- }
- intel_pmu_enable_all(added);
-}
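Annotation (not in the source): the reset dance performed above, step by step.

/*
 * 1. program the three magic event codes into EVENTSEL0..2
 * 2. pulse GLOBAL_CTRL: write 0x3 (counters 0-1 on), then 0x0
 * 3. re-enable whatever real events sit on counters 0-2 via
 *    __x86_pmu_enable_event()
 */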
-
-static inline u64 intel_pmu_get_status(void)
-{
- u64 status;
-
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
- return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
-{
- int idx = hwc->idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, mask;
-
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
- intel_pmu_disable_bts();
- intel_pmu_drain_bts_buffer();
- return;
- }
-
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_disable_fixed(hwc);
- return;
- }
-
- x86_pmu_disable_event(event);
-
- if (unlikely(event->attr.precise_ip))
- intel_pmu_pebs_disable(event);
-}
-
-static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
-{
- int idx = hwc->idx - X86_PMC_IDX_FIXED;
- u64 ctrl_val, bits, mask;
-
- /*
- * Enable IRQ generation (0x8),
- * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
- * if requested:
- */
- bits = 0x8ULL;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
- bits |= 0x2;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
- bits |= 0x1;
-
- /*
- * ANY bit is supported in v3 and up
- */
- if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
- bits |= 0x4;
-
- bits <<= (idx * 4);
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- ctrl_val |= bits;
- wrmsrl(hwc->config_base, ctrl_val);
-}
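A sketch (annotation; the helper below is illustrative only, not part of the original file) of the 4-bit-per-counter layout in MSR_ARCH_PERFMON_FIXED_CTR_CTRL that intel_pmu_enable_fixed() and intel_pmu_disable_fixed() manipulate:

    /* Illustration only: build the control field for one fixed counter. */
    static u64 fixed_ctrl_field(int idx, bool usr, bool os, bool any)
    {
            u64 bits = 0x8ULL;              /* PMI on overflow */

            if (usr)
                    bits |= 0x2;            /* count in ring 3 */
            if (os)
                    bits |= 0x1;            /* count in ring 0 */
            if (any)
                    bits |= 0x4;            /* ANY-thread, v3 and up */

            return bits << (idx * 4);       /* idx 1, usr+os -> 0xb0 */
    }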
-
-static void intel_pmu_enable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
- if (!__get_cpu_var(cpu_hw_events).enabled)
- return;
-
- intel_pmu_enable_bts(hwc->config);
- return;
- }
-
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_enable_fixed(hwc);
- return;
- }
-
- if (unlikely(event->attr.precise_ip))
- intel_pmu_pebs_enable(event);
-
- __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Save and restart an expired event. Called from NMI context,
- * so it has to be careful about preempting normal event ops:
- */
-static int intel_pmu_save_and_restart(struct perf_event *event)
-{
- x86_perf_event_update(event);
- return x86_perf_event_set_period(event);
-}
-
-static void intel_pmu_reset(void)
-{
- struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
- unsigned long flags;
- int idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- printk("clearing PMU state on CPU#%d\n", smp_processor_id());
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
- checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
- checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-
- if (ds)
- ds->bts_index = ds->bts_buffer_base;
-
- local_irq_restore(flags);
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- int bit, loops;
- u64 ack, status;
-
- perf_sample_data_init(&data, 0);
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- intel_pmu_disable_all();
- intel_pmu_drain_bts_buffer();
- status = intel_pmu_get_status();
- if (!status) {
- intel_pmu_enable_all(0);
- return 0;
- }
-
- loops = 0;
-again:
- if (++loops > 100) {
- WARN_ONCE(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- intel_pmu_reset();
- goto done;
- }
-
- inc_irq_stat(apic_perf_irqs);
- ack = status;
-
- intel_pmu_lbr_read();
-
- /*
- * PEBS overflow sets bit 62 in the global status register
- */
- if (__test_and_clear_bit(62, (unsigned long *)&status))
- x86_pmu.drain_pebs(regs);
-
- for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_event *event = cpuc->events[bit];
-
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- if (!intel_pmu_save_and_restart(event))
- continue;
-
- data.period = event->hw.last_period;
-
- if (perf_event_overflow(event, 1, &data, regs))
- x86_pmu_stop(event);
- }
-
- intel_pmu_ack_status(ack);
-
- /*
- * Repeat if there is more work to be done:
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
-
-done:
- intel_pmu_enable_all(0);
- return 1;
-}
-
-static struct event_constraint *
-intel_bts_constraints(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int hw_event, bts_event;
-
- hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
- bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-
- if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
- return &bts_constraint;
-
- return NULL;
-}
-
-static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct event_constraint *c;
-
- c = intel_bts_constraints(event);
- if (c)
- return c;
-
- c = intel_pebs_constraints(event);
- if (c)
- return c;
-
- return x86_get_event_constraints(cpuc, event);
-}
-
-static int intel_pmu_hw_config(struct perf_event *event)
-{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.type != PERF_TYPE_RAW)
- return 0;
-
- if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
- return 0;
-
- if (x86_pmu.version < 3)
- return -EINVAL;
-
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
-
- return 0;
-}
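Annotation: the checks above gate the ANY bit of the raw event encoding.

/*
 * ARCH_PERFMON_EVENTSEL_ANY is bit 21 of the event select, so a
 * raw config of 0x00003c passes straight through, while 0x20003c
 * (the same event with ANY set) is rejected with -EINVAL on a v2
 * PMU and needs CAP_SYS_ADMIN when perf_paranoid_cpu() applies.
 */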
-
-static __initconst const struct x86_pmu core_pmu = {
- .name = "core",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
- .disable = x86_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .get_event_constraints = intel_get_event_constraints,
- .event_constraints = intel_core_event_constraints,
-};
-
-static void intel_pmu_cpu_starting(int cpu)
-{
- init_debug_store_on_cpu(cpu);
- /*
- * Deal with CPUs that don't clear their LBRs on power-up.
- */
- intel_pmu_lbr_reset();
-}
-
-static void intel_pmu_cpu_dying(int cpu)
-{
- fini_debug_store_on_cpu(cpu);
-}
-
-static __initconst const struct x86_pmu intel_pmu = {
- .name = "Intel",
- .handle_irq = intel_pmu_handle_irq,
- .disable_all = intel_pmu_disable_all,
- .enable_all = intel_pmu_enable_all,
- .enable = intel_pmu_enable_event,
- .disable = intel_pmu_disable_event,
- .hw_config = intel_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .get_event_constraints = intel_get_event_constraints,
-
- .cpu_starting = intel_pmu_cpu_starting,
- .cpu_dying = intel_pmu_cpu_dying,
-};
-
-static void intel_clovertown_quirks(void)
-{
- /*
- * PEBS is unreliable due to:
- *
- * AJ67 - PEBS may experience CPL leaks
- * AJ68 - PEBS PMI may be delayed by one event
- * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] is set
- * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
- *
- * AJ67 could be worked around by restricting the OS/USR flags.
- * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
- *
- * AJ106 could possibly be worked around by not allowing LBR
- * usage from PEBS, including the fixup.
- * AJ68 could possibly be worked around by always programming
- * a pebs_event_reset[0] value and coping with the lost events.
- *
- * But taken together it might just make sense to not enable PEBS on
- * these chips.
- */
- printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
- x86_pmu.pebs = 0;
- x86_pmu.pebs_constraints = NULL;
-}
-
-static __init int intel_pmu_init(void)
-{
- union cpuid10_edx edx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int ebx;
- int version;
-
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- switch (boot_cpu_data.x86) {
- case 0x6:
- return p6_pmu_init();
- case 0xf:
- return p4_pmu_init();
- }
- return -ENODEV;
- }
-
- /*
- * Check whether the Architectural PerfMon supports
- * Branch Misses Retired hw_event or not.
- */
- cpuid(10, &eax.full, &ebx, &unused, &edx.full);
- if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
- return -ENODEV;
-
- version = eax.split.version_id;
- if (version < 2)
- x86_pmu = core_pmu;
- else
- x86_pmu = intel_pmu;
-
- x86_pmu.version = version;
- x86_pmu.num_counters = eax.split.num_counters;
- x86_pmu.cntval_bits = eax.split.bit_width;
- x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
-
- /*
- * Quirk: v2 perfmon does not report fixed-purpose events, so
- * assume at least 3 events:
- */
- if (version > 1)
- x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
- /*
- * v2 and above have a perf capabilities MSR
- */
- if (version > 1) {
- u64 capabilities;
-
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
- x86_pmu.intel_cap.capabilities = capabilities;
- }
-
- intel_ds_init();
-
- /*
- * Install the hw-cache-events table:
- */
- switch (boot_cpu_data.x86_model) {
- case 14: /* 65 nm core solo/duo, "Yonah" */
- pr_cont("Core events, ");
- break;
-
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- x86_pmu.quirks = intel_clovertown_quirks;
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
- memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- intel_pmu_lbr_init_core();
-
- x86_pmu.event_constraints = intel_core2_event_constraints;
- pr_cont("Core2 events, ");
- break;
-
- case 26: /* 45 nm nehalem, "Bloomfield" */
- case 30: /* 45 nm nehalem, "Lynnfield" */
- case 46: /* 45 nm nehalem-ex, "Beckton" */
- memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- intel_pmu_lbr_init_nhm();
-
- x86_pmu.event_constraints = intel_nehalem_event_constraints;
- x86_pmu.enable_all = intel_pmu_nhm_enable_all;
- pr_cont("Nehalem events, ");
- break;
-
- case 28: /* Atom */
- memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- intel_pmu_lbr_init_atom();
-
- x86_pmu.event_constraints = intel_gen_event_constraints;
- pr_cont("Atom events, ");
- break;
-
- case 37: /* 32 nm nehalem, "Clarkdale" */
- case 44: /* 32 nm nehalem, "Gulftown" */
- memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- intel_pmu_lbr_init_nhm();
-
- x86_pmu.event_constraints = intel_westmere_event_constraints;
- x86_pmu.enable_all = intel_pmu_nhm_enable_all;
- pr_cont("Westmere events, ");
- break;
-
- default:
- /*
- * default constraints for v2 and up
- */
- x86_pmu.event_constraints = intel_gen_event_constraints;
- pr_cont("generic architected perfmon, ");
- }
- return 0;
-}
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static int intel_pmu_init(void)
-{
- return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
deleted file mode 100644
index 18018d1311cd..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ /dev/null
@@ -1,641 +0,0 @@
-#ifdef CONFIG_CPU_SUP_INTEL
-
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 4
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE 24
-
-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE PAGE_SIZE
-
-/*
- * pebs_record_32 for p4 and core not supported
-
-struct pebs_record_32 {
- u32 flags, ip;
- u32 ax, bc, cx, dx;
- u32 si, di, bp, sp;
-};
-
- */
-
-struct pebs_record_core {
- u64 flags, ip;
- u64 ax, bx, cx, dx;
- u64 si, di, bp, sp;
- u64 r8, r9, r10, r11;
- u64 r12, r13, r14, r15;
-};
-
-struct pebs_record_nhm {
- u64 flags, ip;
- u64 ax, bx, cx, dx;
- u64 si, di, bp, sp;
- u64 r8, r9, r10, r11;
- u64 r12, r13, r14, r15;
- u64 status, dla, dse, lat;
-};
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
- u64 bts_buffer_base;
- u64 bts_index;
- u64 bts_absolute_maximum;
- u64 bts_interrupt_threshold;
- u64 pebs_buffer_base;
- u64 pebs_index;
- u64 pebs_absolute_maximum;
- u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
-static void init_debug_store_on_cpu(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
- (u32)((u64)(unsigned long)ds),
- (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static void fini_debug_store_on_cpu(int cpu)
-{
- if (!per_cpu(cpu_hw_events, cpu).ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_ds_buffers(void)
-{
- int cpu;
-
- if (!x86_pmu.bts && !x86_pmu.pebs)
- return;
-
- get_online_cpus();
-
- for_each_online_cpu(cpu)
- fini_debug_store_on_cpu(cpu);
-
- for_each_possible_cpu(cpu) {
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- continue;
-
- per_cpu(cpu_hw_events, cpu).ds = NULL;
-
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
- kfree((void *)(unsigned long)ds->bts_buffer_base);
- kfree(ds);
- }
-
- put_online_cpus();
-}
-
-static int reserve_ds_buffers(void)
-{
- int cpu, err = 0;
-
- if (!x86_pmu.bts && !x86_pmu.pebs)
- return 0;
-
- get_online_cpus();
-
- for_each_possible_cpu(cpu) {
- struct debug_store *ds;
- void *buffer;
- int max, thresh;
-
- err = -ENOMEM;
- ds = kzalloc(sizeof(*ds), GFP_KERNEL);
- if (unlikely(!ds))
- break;
- per_cpu(cpu_hw_events, cpu).ds = ds;
-
- if (x86_pmu.bts) {
- buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
- thresh = max / 16;
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum = ds->bts_buffer_base +
- max * BTS_RECORD_SIZE;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
- thresh * BTS_RECORD_SIZE;
- }
-
- if (x86_pmu.pebs) {
- buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
- ds->pebs_index = ds->pebs_buffer_base;
- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
- max * x86_pmu.pebs_record_size;
- /*
- * Always use single record PEBS
- */
- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- x86_pmu.pebs_record_size;
- }
-
- err = 0;
- }
-
- if (err)
- release_ds_buffers();
- else {
- for_each_online_cpu(cpu)
- init_debug_store_on_cpu(cpu);
- }
-
- put_online_cpus();
-
- return err;
-}
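Annotation: the buffer sizing above, worked through assuming PAGE_SIZE == 4096.

/*
 * BTS_BUFFER_SIZE = PAGE_SIZE << 4 = 65536 bytes
 * max    = 65536 / BTS_RECORD_SIZE(24) = 2730 records
 * thresh = 2730 / 16                   =  170 records
 *
 * so the interrupt threshold sits 170 records (~4 KiB) below the
 * absolute maximum, leaving headroom to drain from the PMI.
 */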
-
-/*
- * BTS
- */
-
-static struct event_constraint bts_constraint =
- EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
-
-static void intel_pmu_enable_bts(u64 config)
-{
- unsigned long debugctlmsr;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr |= DEBUGCTLMSR_TR;
- debugctlmsr |= DEBUGCTLMSR_BTS;
- debugctlmsr |= DEBUGCTLMSR_BTINT;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_OS))
- debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_USR))
- debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long debugctlmsr;
-
- if (!cpuc->ds)
- return;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr &=
- ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
- DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_drain_bts_buffer(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct bts_record {
- u64 from;
- u64 to;
- u64 flags;
- };
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!event)
- return;
-
- if (!ds)
- return;
-
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
-
- if (top <= at)
- return;
-
- ds->bts_index = ds->bts_buffer_base;
-
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
- regs.ip = 0;
-
- /*
- * Prepare a generic sample, i.e. fill in the invariant fields.
- * We will overwrite the from and to address before we output
- * the sample.
- */
- perf_prepare_sample(&header, &data, event, &regs);
-
- if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
- return;
-
- for (; at < top; at++) {
- data.ip = at->from;
- data.addr = at->to;
-
- perf_output_sample(&handle, &header, &data, event);
- }
-
- perf_output_end(&handle);
-
- /* There's new data available. */
- event->hw.interrupts++;
- event->pending_kill = POLL_IN;
-}
-
-/*
- * PEBS
- */
-
-static struct event_constraint intel_core_pebs_events[] = {
- PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
- PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
- PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
- PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
- PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
- PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
- PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
- PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
- PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_nehalem_pebs_events[] = {
- PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
- PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
- PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
- PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
- PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
- PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
- PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
- PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
- PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
- EVENT_CONSTRAINT_END
-};
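Annotation: the second argument of PEBS_EVENT_CONSTRAINT() is the counter mask, which is the only difference between the two tables.

/*
 * e.g. PEBS_EVENT_CONSTRAINT(0x00c0, 0x1) pins INSTR_RETIRED.ANY
 * to PMC0 on Core, while the 0xf mask on Nehalem lets the
 * scheduler place it on any of PMC0-3 (MAX_PEBS_EVENTS == 4).
 */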
-
-static struct event_constraint *
-intel_pebs_constraints(struct perf_event *event)
-{
- struct event_constraint *c;
-
- if (!event->attr.precise_ip)
- return NULL;
-
- if (x86_pmu.pebs_constraints) {
- for_each_event_constraint(c, x86_pmu.pebs_constraints) {
- if ((event->hw.config & c->cmask) == c->code)
- return c;
- }
- }
-
- return &emptyconstraint;
-}
-
-static void intel_pmu_pebs_enable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
-
- cpuc->pebs_enabled |= 1ULL << hwc->idx;
- WARN_ON_ONCE(cpuc->enabled);
-
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
- intel_pmu_lbr_enable(event);
-}
-
-static void intel_pmu_pebs_disable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
- if (cpuc->enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-
- hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
- intel_pmu_lbr_disable(event);
-}
-
-static void intel_pmu_pebs_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->pebs_enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-}
-
-static void intel_pmu_pebs_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->pebs_enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
-#include <asm/insn.h>
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
- return ip > PAGE_OFFSET;
-#else
- return (long)ip < 0;
-#endif
-}
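Annotation: on 64-bit, kernel addresses occupy the upper canonical half, so the sign test suffices.

/*
 * kernel_ip(0xffffffff81000000) -> (long)ip < 0 -> true
 * kernel_ip(0x00007f1234560000) -> positive     -> false
 */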
-
-static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long from = cpuc->lbr_entries[0].from;
- unsigned long old_to, to = cpuc->lbr_entries[0].to;
- unsigned long ip = regs->ip;
-
- /*
- * We don't need a fixup if the PEBS assist is fault-like
- */
- if (!x86_pmu.intel_cap.pebs_trap)
- return 1;
-
- /*
- * No LBR entry, no basic block, no rewinding
- */
- if (!cpuc->lbr_stack.nr || !from || !to)
- return 0;
-
- /*
- * Basic blocks should never cross user/kernel boundaries
- */
- if (kernel_ip(ip) != kernel_ip(to))
- return 0;
-
- /*
- * unsigned math, either ip is before the start (impossible) or
- * the basic block is larger than 1 page (sanity)
- */
- if ((ip - to) > PAGE_SIZE)
- return 0;
-
- /*
- * We sampled a branch insn, rewind using the LBR stack
- */
- if (ip == to) {
- regs->ip = from;
- return 1;
- }
-
- do {
- struct insn insn;
- u8 buf[MAX_INSN_SIZE];
- void *kaddr;
-
- old_to = to;
- if (!kernel_ip(ip)) {
- int bytes, size = MAX_INSN_SIZE;
-
- bytes = copy_from_user_nmi(buf, (void __user *)to, size);
- if (bytes != size)
- return 0;
-
- kaddr = buf;
- } else
- kaddr = (void *)to;
-
- kernel_insn_init(&insn, kaddr);
- insn_get_length(&insn);
- to += insn.length;
- } while (to < ip);
-
- if (to == ip) {
- regs->ip = old_to;
- return 1;
- }
-
- /*
- * Even though we decoded the basic block, the instruction stream
- * never matched the given IP, either the TO or the IP got corrupted.
- */
- return 0;
-}
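Annotation: a walkthrough of the rewind loop above, with made-up addresses and instruction lengths.

/*
 * LBR: last branch went to 0x1000; PEBS reports ip = 0x100a.
 * Decoding forward from to = 0x1000:
 *
 *   insn at 0x1000, length 5 -> to = 0x1005 (old_to = 0x1000)
 *   insn at 0x1005, length 5 -> to = 0x100a (old_to = 0x1005)
 *
 * to == ip, so regs->ip is rewound to old_to = 0x1005: the
 * instruction that actually caused the PEBS assist.
 */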
-
-static int intel_pmu_save_and_restart(struct perf_event *event);
-
-static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs, void *__pebs)
-{
- /*
- * We cast to pebs_record_core since that is a subset of
- * both formats and we don't use the other fields in this
- * routine.
- */
- struct pebs_record_core *pebs = __pebs;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!intel_pmu_save_and_restart(event))
- return;
-
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
-
- /*
- * We use the interrupt regs as a base because the PEBS record
- * does not contain a full regs set, specifically it seems to
- * lack segment descriptors, which get used by things like
- * user_mode().
- *
- * In the simple case fix up only the IP and BP,SP regs, for
- * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
- * A possible PERF_SAMPLE_REGS will have to transfer all regs.
- */
- regs = *iregs;
- regs.ip = pebs->ip;
- regs.bp = pebs->bp;
- regs.sp = pebs->sp;
-
- if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
- regs.flags |= PERF_EFLAGS_EXACT;
- else
- regs.flags &= ~PERF_EFLAGS_EXACT;
-
- if (perf_event_overflow(event, 1, &data, &regs))
- x86_pmu_stop(event);
-}
-
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct perf_event *event = cpuc->events[0]; /* PMC0 only */
- struct pebs_record_core *at, *top;
- int n;
-
- if (!ds || !x86_pmu.pebs)
- return;
-
- at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
-
- /*
- * Whatever else happens, drain the thing
- */
- ds->pebs_index = ds->pebs_buffer_base;
-
- if (!test_bit(0, cpuc->active_mask))
- return;
-
- WARN_ON_ONCE(!event);
-
- if (!event->attr.precise_ip)
- return;
-
- n = top - at;
- if (n <= 0)
- return;
-
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ON_ONCE(n > 1);
- at += n - 1;
-
- __intel_pmu_pebs_event(event, iregs, at);
-}
-
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct pebs_record_nhm *at, *top;
- struct perf_event *event = NULL;
- u64 status = 0;
- int bit, n;
-
- if (!ds || !x86_pmu.pebs)
- return;
-
- at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
- ds->pebs_index = ds->pebs_buffer_base;
-
- n = top - at;
- if (n <= 0)
- return;
-
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
-
- for ( ; at < top; at++) {
- for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
- event = cpuc->events[bit];
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- WARN_ON_ONCE(!event);
-
- if (!event->attr.precise_ip)
- continue;
-
- if (__test_and_set_bit(bit, (unsigned long *)&status))
- continue;
-
- break;
- }
-
- if (!event || bit >= MAX_PEBS_EVENTS)
- continue;
-
- __intel_pmu_pebs_event(event, iregs, at);
- }
-}
-
-/*
- * BTS, PEBS probe and setup
- */
-
-static void intel_ds_init(void)
-{
- /*
- * No support for 32bit formats
- */
- if (!boot_cpu_has(X86_FEATURE_DTES64))
- return;
-
- x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
- x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
- if (x86_pmu.pebs) {
- char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
- int format = x86_pmu.intel_cap.pebs_format;
-
- switch (format) {
- case 0:
- printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
- x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
- x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
- x86_pmu.pebs_constraints = intel_core_pebs_events;
- break;
-
- case 1:
- printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
- x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
- x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
- x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
- break;
-
- default:
- printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
- x86_pmu.pebs = 0;
- break;
- }
- }
-}
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static int reserve_ds_buffers(void)
-{
- return 0;
-}
-
-static void release_ds_buffers(void)
-{
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
deleted file mode 100644
index d202c1bece1a..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ /dev/null
@@ -1,218 +0,0 @@
-#ifdef CONFIG_CPU_SUP_INTEL
-
-enum {
- LBR_FORMAT_32 = 0x00,
- LBR_FORMAT_LIP = 0x01,
- LBR_FORMAT_EIP = 0x02,
- LBR_FORMAT_EIP_FLAGS = 0x03,
-};
-
-/*
- * We only support LBR implementations that have FREEZE_LBRS_ON_PMI,
- * otherwise it becomes nearly impossible to get a reliable stack.
- */
-
-static void __intel_pmu_lbr_enable(void)
-{
- u64 debugctl;
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void __intel_pmu_lbr_disable(void)
-{
- u64 debugctl;
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void intel_pmu_lbr_reset_32(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++)
- wrmsrl(x86_pmu.lbr_from + i, 0);
-}
-
-static void intel_pmu_lbr_reset_64(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
- wrmsrl(x86_pmu.lbr_from + i, 0);
- wrmsrl(x86_pmu.lbr_to + i, 0);
- }
-}
-
-static void intel_pmu_lbr_reset(void)
-{
- if (!x86_pmu.lbr_nr)
- return;
-
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
- intel_pmu_lbr_reset_32();
- else
- intel_pmu_lbr_reset_64();
-}
-
-static void intel_pmu_lbr_enable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu.lbr_nr)
- return;
-
- WARN_ON_ONCE(cpuc->enabled);
-
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
-
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
-
- cpuc->lbr_users++;
-}
-
-static void intel_pmu_lbr_disable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu.lbr_nr)
- return;
-
- cpuc->lbr_users--;
- WARN_ON_ONCE(cpuc->lbr_users < 0);
-
- if (cpuc->enabled && !cpuc->lbr_users)
- __intel_pmu_lbr_disable();
-}
-
-static void intel_pmu_lbr_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->lbr_users)
- __intel_pmu_lbr_enable();
-}
-
-static void intel_pmu_lbr_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->lbr_users)
- __intel_pmu_lbr_disable();
-}
-
-static inline u64 intel_pmu_lbr_tos(void)
-{
- u64 tos;
-
- rdmsrl(x86_pmu.lbr_tos, tos);
-
- return tos;
-}
-
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
-{
- unsigned long mask = x86_pmu.lbr_nr - 1;
- u64 tos = intel_pmu_lbr_tos();
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
- unsigned long lbr_idx = (tos - i) & mask;
- union {
- struct {
- u32 from;
- u32 to;
- };
- u64 lbr;
- } msr_lastbranch;
-
- rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
-
- cpuc->lbr_entries[i].from = msr_lastbranch.from;
- cpuc->lbr_entries[i].to = msr_lastbranch.to;
- cpuc->lbr_entries[i].flags = 0;
- }
- cpuc->lbr_stack.nr = i;
-}
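Annotation: the (tos - i) & mask walk relies on lbr_nr being a power of two.

/*
 * with lbr_nr = 4 (mask = 3) and tos = 1 the ring is read in the
 * order 1, 0, 3, 2 -- most recent branch first, wrapping around.
 */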
-
-#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
-
-/*
- * Due to lack of segmentation in Linux the effective address (offset)
- * is the same as the linear address, allowing us to merge the LIP and EIP
- * LBR formats.
- */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
-{
- unsigned long mask = x86_pmu.lbr_nr - 1;
- int lbr_format = x86_pmu.intel_cap.lbr_format;
- u64 tos = intel_pmu_lbr_tos();
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
- unsigned long lbr_idx = (tos - i) & mask;
- u64 from, to, flags = 0;
-
- rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
- rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
-
- if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
- flags = !!(from & LBR_FROM_FLAG_MISPRED);
- from = (u64)((((s64)from) << 1) >> 1);
- }
-
- cpuc->lbr_entries[i].from = from;
- cpuc->lbr_entries[i].to = to;
- cpuc->lbr_entries[i].flags = flags;
- }
- cpuc->lbr_stack.nr = i;
-}
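Annotation: the shift pair in the EIP_FLAGS case strips the flag while keeping the address canonical.

/*
 * bit 63 carries the MISPRED flag, bit 62 the address sign:
 *
 *   0xffff880000001000 (flag set) and
 *   0x7fff880000001000 (flag clear)
 *
 * both run through ((s64)from << 1) >> 1, which drops bit 63 and
 * sign-extends bit 62, yielding the canonical 0xffff880000001000.
 */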
-
-static void intel_pmu_lbr_read(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!cpuc->lbr_users)
- return;
-
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
- intel_pmu_lbr_read_32(cpuc);
- else
- intel_pmu_lbr_read_64(cpuc);
-}
-
-static void intel_pmu_lbr_init_core(void)
-{
- x86_pmu.lbr_nr = 4;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x40;
- x86_pmu.lbr_to = 0x60;
-}
-
-static void intel_pmu_lbr_init_nhm(void)
-{
- x86_pmu.lbr_nr = 16;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x680;
- x86_pmu.lbr_to = 0x6c0;
-}
-
-static void intel_pmu_lbr_init_atom(void)
-{
- x86_pmu.lbr_nr = 8;
- x86_pmu.lbr_tos = 0x01c9;
- x86_pmu.lbr_from = 0x40;
- x86_pmu.lbr_to = 0x60;
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
deleted file mode 100644
index ae85d69644d1..000000000000
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ /dev/null
@@ -1,858 +0,0 @@
-/*
- * Netburst Performance Events (P4, old Xeon)
- *
- * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
- * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
- *
- * For licensing details see kernel-base/COPYING
- */
-
-#ifdef CONFIG_CPU_SUP_INTEL
-
-#include <asm/perf_event_p4.h>
-
-#define P4_CNTR_LIMIT 3
-/*
- * array indices: 0,1 - HT threads, used with an HT-enabled cpu
- */
-struct p4_event_bind {
- unsigned int opcode; /* Event code and ESCR selector */
- unsigned int escr_msr[2]; /* ESCR MSR for this event */
- char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
-};
-
-struct p4_cache_event_bind {
- unsigned int metric_pebs;
- unsigned int metric_vert;
-};
-
-#define P4_GEN_CACHE_EVENT_BIND(name) \
- [P4_CACHE__##name] = { \
- .metric_pebs = P4_PEBS__##name, \
- .metric_vert = P4_VERT__##name, \
- }
-
-static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
- P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
- P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
-};
-
-/*
- * Note that we don't use CCCR1 here; there is an
- * exception for P4_BSQ_ALLOCATION, but we simply have
- * no workaround for it
- *
- * consider this binding as the resources a particular
- * event may borrow; it doesn't contain the EventMask,
- * Tags and friends -- they are left to the caller
- */
-static struct p4_event_bind p4_event_bind_map[] = {
- [P4_EVENT_TC_DELIVER_MODE] = {
- .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
- .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_BPU_FETCH_REQUEST] = {
- .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
- .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_ITLB_REFERENCE] = {
- .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
- .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_MEMORY_CANCEL] = {
- .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
- .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_MEMORY_COMPLETE] = {
- .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
- .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_LOAD_PORT_REPLAY] = {
- .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
- .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_STORE_PORT_REPLAY] = {
- .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
- .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_MOB_LOAD_REPLAY] = {
- .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
- .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_PAGE_WALK_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
- .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_BSQ_CACHE_REFERENCE] = {
- .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
- .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_IOQ_ALLOCATION] = {
- .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
- .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
- .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
- .cntr = { {2, -1, -1}, {3, -1, -1} },
- },
- [P4_EVENT_FSB_DATA_ACTIVITY] = {
- .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
- .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
- .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
- .cntr = { {0, -1, -1}, {1, -1, -1} },
- },
- [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
- .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
- .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
- .cntr = { {2, -1, -1}, {3, -1, -1} },
- },
- [P4_EVENT_SSE_INPUT_ASSIST] = {
- .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_PACKED_SP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_PACKED_DP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_SCALAR_SP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_SCALAR_DP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_64BIT_MMX_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_128BIT_MMX_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_X87_FP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_TC_MISC] = {
- .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
- .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_GLOBAL_POWER_EVENTS] = {
- .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_TC_MS_XFER] = {
- .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
- .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_UOP_QUEUE_WRITES] = {
- .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
- .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
- .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_RETIRED_BRANCH_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
- .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_RESOURCE_STALL] = {
- .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
- .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_WC_BUFFER] = {
- .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
- .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_B2B_CYCLES] = {
- .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_BNR] = {
- .opcode = P4_OPCODE(P4_EVENT_BNR),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_SNOOP] = {
- .opcode = P4_OPCODE(P4_EVENT_SNOOP),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_RESPONSE] = {
- .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_FRONT_END_EVENT] = {
- .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_EXECUTION_EVENT] = {
- .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_REPLAY_EVENT] = {
- .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_INSTR_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_UOPS_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_UOP_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
- .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_BRANCH_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_X87_ASSIST] = {
- .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_MACHINE_CLEAR] = {
- .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_INSTR_COMPLETED] = {
- .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
-};
-
-#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
- p4_config_pack_escr(P4_ESCR_EVENT(event) | \
- P4_ESCR_EMASK_BIT(event, bit)) | \
- p4_config_pack_cccr(cache_event | \
- P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
-
-static __initconst const u64 p4_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__1stl_cache_load_miss_retired),
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__2ndl_cache_load_miss_retired),
- },
-},
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__dtlb_load_miss_retired),
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_CACHE__dtlb_store_miss_retired),
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
- P4_CACHE__itlb_reference_hit),
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
- P4_CACHE__itlb_reference_miss),
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
- /* non-halted CPU clocks */
- [PERF_COUNT_HW_CPU_CYCLES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
-
- /*
- * retired instructions
- * for the sake of simplicity we don't use the FSB tagging
- */
- [PERF_COUNT_HW_INSTRUCTIONS] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
-
- /* cache hits */
- [PERF_COUNT_HW_CACHE_REFERENCES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
-
- /* cache misses */
- [PERF_COUNT_HW_CACHE_MISSES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
-
- /* branch instructions retired */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
-
- /* mispredicted branches retired */
- [PERF_COUNT_HW_BRANCH_MISSES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
-
- /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
- [PERF_COUNT_HW_BUS_CYCLES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
- p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-};
-
-static struct p4_event_bind *p4_config_get_bind(u64 config)
-{
- unsigned int evnt = p4_config_unpack_event(config);
- struct p4_event_bind *bind = NULL;
-
- if (evnt < ARRAY_SIZE(p4_event_bind_map))
- bind = &p4_event_bind_map[evnt];
-
- return bind;
-}
-
-static u64 p4_pmu_event_map(int hw_event)
-{
- struct p4_event_bind *bind;
- unsigned int esel;
- u64 config;
-
- config = p4_general_events[hw_event];
- bind = p4_config_get_bind(config);
- esel = P4_OPCODE_ESEL(bind->opcode);
- config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-
- return config;
-}
-
-static int p4_hw_config(struct perf_event *event)
-{
- int cpu = get_cpu();
- int rc = 0;
- unsigned int evnt;
- u32 escr, cccr;
-
- /*
- * the reason we grab the cpu this early is that if we get scheduled
- * for the first time on the same cpu, we will not need to swap the
- * thread-specific flags in the config (and will save some cpu cycles)
- */
-
- cccr = p4_default_cccr_conf(cpu);
- escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
- event->attr.exclude_user);
- event->hw.config = p4_config_pack_escr(escr) |
- p4_config_pack_cccr(cccr);
-
- if (p4_ht_active() && p4_ht_thread(cpu))
- event->hw.config = p4_set_ht_bit(event->hw.config);
-
- if (event->attr.type == PERF_TYPE_RAW) {
-
- /* user data may have an out-of-bounds event index */
- evnt = p4_config_unpack_event(event->attr.config);
- if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
- rc = -EINVAL;
- goto out;
- }
-
- /*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on an HT machine but allow HT-compatible specifics to be
- * passed on)
- *
- * XXX: HT-wide things should check perf_paranoid_cpu() &&
- * CAP_SYS_ADMIN
- */
- event->hw.config |= event->attr.config &
- (p4_config_pack_escr(P4_ESCR_MASK_HT) |
- p4_config_pack_cccr(P4_CCCR_MASK_HT));
- }
-
- rc = x86_setup_perfctr(event);
-out:
- put_cpu();
- return rc;
-}
-
-static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-{
- int overflow = 0;
- u32 low, high;
-
- rdmsr(hwc->config_base + hwc->idx, low, high);
-
- /* we need to check high bit for unflagged overflows */
- if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
- overflow = 1;
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)low) & ~P4_CCCR_OVF);
- }
-
- return overflow;
-}
-
-static inline void p4_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- /*
- * If the event gets disabled while the counter is in an overflowed
- * state we need to clear P4_CCCR_OVF, otherwise the interrupt gets
- * asserted again and again
- */
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- (u64)(p4_config_unpack_cccr(hwc->config)) &
- ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
-}
-
-static void p4_pmu_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_event *event = cpuc->events[idx];
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- p4_pmu_disable_event(event);
- }
-}
-
-static void p4_pmu_enable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int thread = p4_ht_config_thread(hwc->config);
- u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
- unsigned int idx = p4_config_unpack_event(hwc->config);
- unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
- struct p4_event_bind *bind;
- struct p4_cache_event_bind *bind_cache;
- u64 escr_addr, cccr;
-
- bind = &p4_event_bind_map[idx];
- escr_addr = (u64)bind->escr_msr[thread];
-
- /*
- * - we don't support cascaded counters yet
- * - and counter 1 is broken (erratum)
- */
- WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
- WARN_ON_ONCE(hwc->idx == 1);
-
- /* we need a real Event value */
- escr_conf &= ~P4_ESCR_EVENT_MASK;
- escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
-
- cccr = p4_config_unpack_cccr(hwc->config);
-
- /*
- * it could be a cache event, so we need to
- * set the metrics in additional MSRs
- */
- BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
- if (idx_cache > P4_CACHE__NONE &&
- idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
- bind_cache = &p4_cache_event_bind_map[idx_cache];
- (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
- (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
- }
-
- (void)checking_wrmsrl(escr_addr, escr_conf);
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
-}
-
-static void p4_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_event *event = cpuc->events[idx];
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- p4_pmu_enable_event(event);
- }
-}
-
-static int p4_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int idx, handled = 0;
- u64 val;
-
- data.addr = 0;
- data.raw = NULL;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- event = cpuc->events[idx];
- hwc = &event->hw;
-
- WARN_ON_ONCE(hwc->idx != idx);
-
- /* it might be an unflagged overflow */
- handled = p4_pmu_clear_cccr_ovf(hwc);
-
- val = x86_perf_event_update(event);
- if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
- continue;
-
- /* event overflow for sure */
- data.period = event->hw.last_period;
-
- if (!x86_perf_event_set_period(event))
- continue;
- if (perf_event_overflow(event, 1, &data, regs))
- p4_pmu_disable_event(event);
- }
-
- if (handled) {
- /* p4 quirk: unmask it again */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
- inc_irq_stat(apic_perf_irqs);
- }
-
- return handled;
-}
-
-/*
- * swap thread specific fields according to a thread
- * we are going to run on
- */
-static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
-{
- u32 escr, cccr;
-
- /*
- * either we are lucky and continue on the same cpu, or there is no HT support
- */
- if (!p4_should_swap_ts(hwc->config, cpu))
- return;
-
- /*
- * the event is migrated from another logical
- * cpu, so we need to swap the thread-specific flags
- */
-
- escr = p4_config_unpack_escr(hwc->config);
- cccr = p4_config_unpack_cccr(hwc->config);
-
- if (p4_ht_thread(cpu)) {
- cccr &= ~P4_CCCR_OVF_PMI_T0;
- cccr |= P4_CCCR_OVF_PMI_T1;
- if (escr & P4_ESCR_T0_OS) {
- escr &= ~P4_ESCR_T0_OS;
- escr |= P4_ESCR_T1_OS;
- }
- if (escr & P4_ESCR_T0_USR) {
- escr &= ~P4_ESCR_T0_USR;
- escr |= P4_ESCR_T1_USR;
- }
- hwc->config = p4_config_pack_escr(escr);
- hwc->config |= p4_config_pack_cccr(cccr);
- hwc->config |= P4_CONFIG_HT;
- } else {
- cccr &= ~P4_CCCR_OVF_PMI_T1;
- cccr |= P4_CCCR_OVF_PMI_T0;
- if (escr & P4_ESCR_T1_OS) {
- escr &= ~P4_ESCR_T1_OS;
- escr |= P4_ESCR_T0_OS;
- }
- if (escr & P4_ESCR_T1_USR) {
- escr &= ~P4_ESCR_T1_USR;
- escr |= P4_ESCR_T0_USR;
- }
- hwc->config = p4_config_pack_escr(escr);
- hwc->config |= p4_config_pack_cccr(cccr);
- hwc->config &= ~P4_CONFIG_HT;
- }
-}
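
A self-contained model of the T0 -> T1 flag migration above. The bit positions here are made up purely for illustration; the move-if-set pattern is the point:

#include <stdint.h>
#include <stdio.h>

#define T0_OS (1u << 3)	/* hypothetical bit positions */
#define T1_OS (1u << 1)

static uint32_t move_flag(uint32_t escr, uint32_t from, uint32_t to)
{
	if (escr & from) {
		escr &= ~from;
		escr |= to;
	}
	return escr;
}

int main(void)
{
	uint32_t escr = move_flag(T0_OS, T0_OS, T1_OS);

	printf("T1_OS set: %d, T0_OS set: %d\n",
	       !!(escr & T1_OS), !!(escr & T0_OS));	/* 1, 0 */
	return 0;
}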
-
-/*
- * ESCR address hashing is tricky: ESCRs are not sequential
- * in the MSR space, but they all start from MSR_P4_BSU_ESCR0
- * (0x03a0) and the low byte of every ESCR address lies in the
- * range [0xa0,0xe1],
- * so we end up with a ~70% filled hash table
- */
-
-#define P4_ESCR_MSR_BASE 0x000003a0
-#define P4_ESCR_MSR_MAX 0x000003e1
-#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
-#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
-#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
-
-static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
-};
-
-static int p4_get_escr_idx(unsigned int addr)
-{
- unsigned int idx = P4_ESCR_MSR_IDX(addr);
-
- if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
- !p4_escr_table[idx] ||
- p4_escr_table[idx] != addr)) {
- WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
- return -1;
- }
-
- return idx;
-}
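
The "hash" is really a direct-mapped table over the MSR window, with unused slots left zero so a stale or out-of-window address is caught. A standalone sketch (0x3a0 is MSR_P4_BSU_ESCR0 per the comment above; the second probe address is arbitrary):

#include <stdio.h>

#define BASE 0x3a0
#define MAX  0x3e1
#define SIZE (MAX - BASE + 1)

static const unsigned int table[SIZE] = {
	[0x3a0 - BASE] = 0x3a0,	/* MSR_P4_BSU_ESCR0 */
	/* ... the other ESCRs fill ~70% of the slots ... */
};

static int escr_idx(unsigned int msr)
{
	unsigned int idx = msr - BASE;	/* underflows (huge) if msr < BASE */

	if (idx >= SIZE || table[idx] != msr)
		return -1;	/* hole, or address outside the window */
	return idx;
}

int main(void)
{
	printf("%d\n", escr_idx(0x3a0));	/* 0 */
	printf("%d\n", escr_idx(0x123));	/* -1 */
	return 0;
}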
-
-static int p4_next_cntr(int thread, unsigned long *used_mask,
- struct p4_event_bind *bind)
-{
- int i, j;
-
- for (i = 0; i < P4_CNTR_LIMIT; i++) {
- j = bind->cntr[thread][i];
- if (j != -1 && !test_bit(j, used_mask))
- return j;
- }
-
- return -1;
-}
-
-static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
- int cpu = smp_processor_id();
- struct hw_perf_event *hwc;
- struct p4_event_bind *bind;
- unsigned int i, thread, num;
- int cntr_idx, escr_idx;
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
-
- for (i = 0, num = n; i < n; i++, num--) {
-
- hwc = &cpuc->event_list[i]->hw;
- thread = p4_ht_thread(cpu);
- bind = p4_config_get_bind(hwc->config);
- escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
- if (unlikely(escr_idx == -1))
- goto done;
-
- if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
- cntr_idx = hwc->idx;
- if (assign)
- assign[i] = hwc->idx;
- goto reserve;
- }
-
- cntr_idx = p4_next_cntr(thread, used_mask, bind);
- if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
- goto done;
-
- p4_pmu_swap_config_ts(hwc, cpu);
- if (assign)
- assign[i] = cntr_idx;
-reserve:
- set_bit(cntr_idx, used_mask);
- set_bit(escr_idx, escr_mask);
- }
-
-done:
- return num ? -ENOSPC : 0;
-}
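
The scheduler above is greedy: each event must grab both a free counter and a free ESCR, and a single unresolvable conflict fails the whole group with -ENOSPC; there is no backtracking. A toy version of that double reservation (simplified: the real code also tries up to P4_CNTR_LIMIT alternate counters per event; the resource numbers below are hypothetical):

#include <stdio.h>

/* toy: event i wants counter cntr[i] and ESCR esc[i] */
static int schedule(const int *cntr, const int *esc, int n)
{
	unsigned long used_cntr = 0, used_escr = 0;
	int i;

	for (i = 0; i < n; i++) {
		if ((used_cntr >> cntr[i] & 1) || (used_escr >> esc[i] & 1))
			return -1;	/* -ENOSPC in the real code */
		used_cntr |= 1UL << cntr[i];
		used_escr |= 1UL << esc[i];
	}
	return 0;
}

int main(void)
{
	int c1[] = { 0, 1 }, e1[] = { 2, 3 };	/* disjoint: fits */
	int c2[] = { 0, 1 }, e2[] = { 2, 2 };	/* shared ESCR: fails */

	printf("%d %d\n", schedule(c1, e1, 2), schedule(c2, e2, 2));	/* 0 -1 */
	return 0;
}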
-
-static __initconst const struct x86_pmu p4_pmu = {
- .name = "Netburst P4/Xeon",
- .handle_irq = p4_pmu_handle_irq,
- .disable_all = p4_pmu_disable_all,
- .enable_all = p4_pmu_enable_all,
- .enable = p4_pmu_enable_event,
- .disable = p4_pmu_disable_event,
- .eventsel = MSR_P4_BPU_CCCR0,
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .event_map = p4_pmu_event_map,
- .max_events = ARRAY_SIZE(p4_general_events),
- .get_event_constraints = x86_get_event_constraints,
- /*
- * If HT is disabled we may need to use all
- * ARCH_P4_MAX_CCCR counters simultaneously,
- * though for the moment leave it restricted,
- * assuming HT is on
- */
- .num_counters = ARCH_P4_MAX_CCCR,
- .apic = 1,
- .cntval_bits = 40,
- .cntval_mask = (1ULL << 40) - 1,
- .max_period = (1ULL << 39) - 1,
- .hw_config = p4_hw_config,
- .schedule_events = p4_pmu_schedule_events,
-};
-
-static __init int p4_pmu_init(void)
-{
- unsigned int low, high;
-
- /* If we get stripped -- indexing fails */
- BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
-
- rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (!(low & (1 << 7))) {
- pr_cont("unsupported Netburst CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
-
- memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Netburst events, ");
-
- x86_pmu = p4_pmu;
-
- return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
deleted file mode 100644
index 34ba07be2cda..000000000000
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifdef CONFIG_CPU_SUP_INTEL
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
- return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT 0x0000002EULL
-
-static struct event_constraint p6_event_constraints[] =
-{
- INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
- INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
- INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT_END
-};
-
-static void p6_pmu_disable_all(void)
-{
- u64 val;
-
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void p6_pmu_enable_all(int added)
-{
- unsigned long val;
-
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static inline void
-p6_pmu_disable_event(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- u64 val = P6_NOP_EVENT;
-
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
-}
-
-static void p6_pmu_enable_event(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
- u64 val;
-
- val = hwc->config;
- if (cpuc->enabled)
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
- (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
-}
-
-static __initconst const struct x86_pmu p6_pmu = {
- .name = "p6",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = p6_pmu_disable_all,
- .enable_all = p6_pmu_enable_all,
- .enable = p6_pmu_enable_event,
- .disable = p6_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_P6_EVNTSEL0,
- .perfctr = MSR_P6_PERFCTR0,
- .event_map = p6_pmu_event_map,
- .max_events = ARRAY_SIZE(p6_perfmon_event_map),
- .apic = 1,
- .max_period = (1ULL << 31) - 1,
- .version = 0,
- .num_counters = 2,
- /*
- * Events have 40 bits implemented. However they are designed such
- * that bits [32-39] are sign extensions of bit 31. As such the
- * effective width of an event for a P6-like PMU is 32 bits only.
- *
- * See IA-32 Intel Architecture Software developer manual Vol 3B
- */
- .cntval_bits = 32,
- .cntval_mask = (1ULL << 32) - 1,
- .get_event_constraints = x86_get_event_constraints,
- .event_constraints = p6_event_constraints,
-};
-
-static __init int p6_pmu_init(void)
-{
- switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- case 9:
- case 13:
- /* Pentium M */
- break;
- default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
-
- x86_pmu = p6_pmu;
-
- return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
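
The 32-bit effective width noted in the p6_pmu comment above can be seen directly: writing a value with bit 31 set makes the hardware replicate that bit through bits 32-39. A quick model of the sign extension, assuming the 40-bit implemented width:

#include <stdint.h>
#include <stdio.h>

/* model: bits [32..39] mirror bit 31 of the written 32-bit value */
static uint64_t p6_counter_write(uint32_t val)
{
	return (uint64_t)(int64_t)(int32_t)val & ((1ULL << 40) - 1);
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)p6_counter_write(0x7fffffff));
	/* 0x7fffffff: bit 31 clear, stored as-is */
	printf("%#llx\n", (unsigned long long)p6_counter_write(0x80000000));
	/* 0xff80000000: bits 32-39 copied from bit 31 */
	return 0;
}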
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..7aecb2fc3186 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
- * with other users (like oprofile).
+ * with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -12,36 +13,16 @@
*/
#include <linux/percpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
#include <linux/kprobes.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
-struct nmi_watchdog_ctlblk {
- unsigned int cccr_msr;
- unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
- unsigned int evntsel_msr; /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
- int (*reserve)(void);
- void (*unreserve)(void);
- int (*setup)(unsigned nmi_hz);
- void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
- void (*stop)(void);
- unsigned perfctr;
- unsigned evntsel;
- u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
-
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0.
@@ -60,14 +41,15 @@ static const struct wd_ops *wd_ops;
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the performance counter register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
+ if (msr >= MSR_F15H_PERF_CTR)
+ return (msr - MSR_F15H_PERF_CTR) >> 1;
return msr - MSR_K7_PERFCTR0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -76,9 +58,15 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_PERFCTR0;
+ case 11:
+ return msr - MSR_KNC_PERFCTR0;
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_PERFCTR0;
}
return 0;
}
@@ -91,7 +79,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
+ if (msr >= MSR_F15H_PERF_CTL)
+ return (msr - MSR_F15H_PERF_CTL) >> 1;
return msr - MSR_K7_EVNTSEL0;
case X86_VENDOR_INTEL:
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -100,23 +91,20 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_EVNTSEL0;
+ case 11:
+ return msr - MSR_KNC_EVNTSEL0;
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_EVENTSEL0;
}
return 0;
}
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
@@ -172,624 +160,3 @@ void release_evntsel_nmi(unsigned int msr)
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
- if (wd_ops)
- wd_ops->unreserve();
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- /* are we already enabled */
- if (atomic_read(&nmi_active) != 0)
- return;
-
- /* are we lapic aware */
- if (!wd_ops)
- return;
- if (!wd_ops->reserve()) {
- printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
- return;
- }
-
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
- touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
- u64 counter_val;
- unsigned int retval = hz;
-
- /*
- * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So we can only program the counter with 31-bit values; the
- * 32nd bit must be 1 so that bits 33 and above sign-extend to 1.
- * Find the appropriate nmi_hz.
- */
- counter_val = (u64)cpu_khz * 1000;
- do_div(counter_val, retval);
- if (counter_val > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- retval = count + 1;
- }
- return retval;
-}
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsr(perfctr_msr, (u32)(-count), 0);
-}
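
A worked example of the helpers above, assuming a hypothetical 3 GHz CPU (cpu_khz = 3000000) and a requested rate of 1 NMI/s: the raw preset, 3e9 cycles, does not fit in 31 bits, so the rate is bumped exactly as adjust_for_32bit_ctr() does:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cpu_khz = 3000000;	/* assumed 3 GHz clock */
	unsigned int nmi_hz = 1;
	uint64_t count = cpu_khz * 1000 / nmi_hz;	/* 3e9: > 0x7fffffff */

	if (count > 0x7fffffffULL) {
		nmi_hz = cpu_khz * 1000 / 0x7fffffffULL + 1;	/* -> 2 */
		count = cpu_khz * 1000 / nmi_hz;	/* 1.5e9: fits */
	}
	printf("nmi_hz=%u preset=-0x%08llx\n",
	       nmi_hz, (unsigned long long)count);
	return 0;
}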
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
- if (!reserve_perfctr_nmi(wd_ops->perfctr))
- return 0;
-
- if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
- release_perfctr_nmi(wd_ops->perfctr);
- return 0;
- }
- return 1;
-}
-
-static void single_msr_unreserve(void)
-{
- release_evntsel_nmi(wd_ops->evntsel);
- release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- /* KVM doesn't implement this MSR */
- if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
- return 0;
-
- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /*
- * P6-based Pentium M needs to re-unmask
- * the apic vector, but it doesn't hurt
- * other P6 variants.
- * ArchPerfmon/Core Duo also needs this
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
-#define P4_ESCR_OS (1 << 3)
-#define P4_ESCR_USR (1 << 2)
-#define P4_CCCR_OVF_PMI0 (1 << 26)
-#define P4_CCCR_OVF_PMI1 (1 << 27)
-#define P4_CCCR_THRESHOLD(N) ((N) << 20)
-#define P4_CCCR_COMPLEMENT (1 << 19)
-#define P4_CCCR_COMPARE (1 << 18)
-#define P4_CCCR_REQUIRED (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE (1 << 12)
-#define P4_CCCR_OVF (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
- MSR_P4_BPU_CCCR0,
- MSR_P4_BPU_CCCR1,
- MSR_P4_BPU_CCCR2,
- MSR_P4_BPU_CCCR3,
- MSR_P4_MS_CCCR0,
- MSR_P4_MS_CCCR1,
- MSR_P4_MS_CCCR2,
- MSR_P4_MS_CCCR3,
- MSR_P4_FLAME_CCCR0,
- MSR_P4_FLAME_CCCR1,
- MSR_P4_FLAME_CCCR2,
- MSR_P4_FLAME_CCCR3,
- MSR_P4_IQ_CCCR0,
- MSR_P4_IQ_CCCR1,
- MSR_P4_IQ_CCCR2,
- MSR_P4_IQ_CCCR3,
- MSR_P4_IQ_CCCR4,
- MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr, cccr_msr;
- unsigned int evntsel, cccr_val;
- unsigned int misc_enable, dummy;
- unsigned int ht_num;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
-#ifdef CONFIG_SMP
- /* detect which hyperthread we are on */
- if (smp_num_siblings == 2) {
- unsigned int ebx, apicid;
-
- ebx = cpuid_ebx(1);
- apicid = (ebx >> 24) & 0xff;
- ht_num = apicid & 1;
- } else
-#endif
- ht_num = 0;
-
- /*
- * performance counters are shared resources;
- * assign each hyperthread its own set
- * (re-use the ESCR0 register, seems safe
- * and keeps the cccr_val the same)
- */
- if (!ht_num) {
- /* logical cpu 0 */
- perfctr_msr = MSR_P4_IQ_PERFCTR0;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR0;
- cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
- /*
- * If we're on the kdump kernel or other situation, we may
- * still have other performance counter registers set to
- * interrupt and they'll keep interrupting forever because
- * of the P4_CCCR_OVF quirk. So we need to ACK all the
- * pending interrupts and disable all the registers here,
- * before reenabling the NMI delivery. Refer to p4_rearm()
- * about the P4_CCCR_OVF quirk.
- */
- if (reset_devices) {
- unsigned int low, high;
- int i;
-
- for (i = 0; i < P4_CONTROLS; i++) {
- rdmsr(p4_controls[i], low, high);
- low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
- wrmsr(p4_controls[i], low, high);
- }
- }
- } else {
- /* logical cpu 1 */
- perfctr_msr = MSR_P4_IQ_PERFCTR1;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR1;
-
- /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
- if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
- cccr_val = P4_CCCR_OVF_PMI0;
- else
- cccr_val = P4_CCCR_OVF_PMI1;
- cccr_val |= P4_CCCR_ESCR_SELECT(4);
- }
-
- evntsel = P4_ESCR_EVENT_SELECT(0x3F)
- | P4_ESCR_OS
- | P4_ESCR_USR;
-
- cccr_val |= P4_CCCR_THRESHOLD(15)
- | P4_CCCR_COMPLEMENT
- | P4_CCCR_COMPARE
- | P4_CCCR_REQUIRED;
-
- wrmsr(evntsel_msr, evntsel, 0);
- wrmsr(cccr_msr, cccr_val, 0);
- write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
- return 1;
-}
-
-static void stop_p4_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- wrmsr(wd->cccr_msr, 0, 0);
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
- if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
- return 0;
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
- goto fail1;
-#endif
- if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
- goto fail2;
- /* RED-PEN why is ESCR1 not reserved here? */
- return 1;
- fail2:
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
- return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
- release_evntsel_nmi(MSR_P4_CRU_ESCR0);
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- unsigned dummy;
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- rdmsrl(wd->cccr_msr, dummy);
- dummy &= ~P4_CCCR_OVF;
- wrmsrl(wd->cccr_msr, dummy);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
- /* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length <
- (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = ARCH_PERFMON_EVENTSEL_INT
- | ARCH_PERFMON_EVENTSEL_OS
- | ARCH_PERFMON_EVENTSEL_USR
- | ARCH_PERFMON_NMI_EVENT_SEL
- | ARCH_PERFMON_NMI_EVENT_UMASK;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
- intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
- return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
- boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
- return;
- wd_ops = &k7_wd_ops;
- break;
- case X86_VENDOR_INTEL:
- /* Work around CPUs where perfctr1 doesn't have a working enable
- * bit, as described in the following errata:
- * AE49 Core Duo and Intel Core Solo 65 nm
- * AN49 Intel Pentium Dual-Core
- * AF49 Dual-Core Intel Xeon Processor LV
- */
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
- ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
- boot_cpu_data.x86_mask == 4))) {
- intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
- intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
- }
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- wd_ops = &intel_arch_wd_ops;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 13)
- return;
-
- wd_ops = &p6_wd_ops;
- break;
- case 15:
- wd_ops = &p4_wd_ops;
- break;
- default:
- return;
- }
- break;
- }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
- if (!wd_ops) {
- probe_nmi_watchdog();
- if (!wd_ops) {
- printk(KERN_INFO "NMI watchdog: CPU not supported\n");
- return -1;
- }
-
- if (!wd_ops->reserve()) {
- printk(KERN_ERR
- "NMI watchdog: cannot reserve perfctrs\n");
- return -1;
- }
- }
-
- if (!(wd_ops->setup(nmi_hz))) {
- printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
- raw_smp_processor_id());
- return -1;
- }
-
- return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
- if (wd_ops)
- wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
- hz = adjust_for_32bit_ctr(hz);
- return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 ctr;
-
- rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) /* perfctr still running? */
- return 0;
-
- wd_ops->rearm(wd, nmi_hz);
- return 1;
-}
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..fd6ec2aa0303 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Strings for the various x86 power flags
*
@@ -11,10 +12,13 @@ const char *const x86_power_flags[32] = {
"fid", /* frequency id control */
"vid", /* voltage id control */
"ttp", /* thermal trip */
- "tm",
- "stc",
- "100mhzsteps",
- "hwpstate",
+ "tm", /* hardware thermal control */
+ "stc", /* software thermal control */
+ "100mhzsteps", /* 100 MHz multiplier control */
+ "hwpstate", /* hardware P-state control */
"", /* tsc invariant mapped to constant_tsc */
- /* nothing */
+ "cpb", /* core performance boost */
+ "eff_freq_ro", /* Readonly aperf/mperf */
+ "proc_feedback", /* processor feedback interface */
+ "acc_power", /* accumulated power mechanism */
};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 62ac8cb6ba27..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -1,8 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <asm/prctl.h>
+#include <linux/proc_fs.h>
+
+#include "cpu.h"
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
/*
* Get CPU information for use by the procfs.
@@ -11,43 +20,33 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpumask_weight(cpu_core_mask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
- }
+ seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpumask_weight(topology_core_cpumask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
#endif
}
#ifdef CONFIG_X86_32
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
{
- /*
- * We use exception 16 if we have hardware math and we've either seen
- * it or the CPU claims it is internal
- */
- int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
seq_printf(m,
"fdiv_bug\t: %s\n"
- "hlt_bug\t\t: %s\n"
"f00f_bug\t: %s\n"
"coma_bug\t: %s\n"
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
- "wp\t\t: %s\n",
- c->fdiv_bug ? "yes" : "no",
- c->hlt_works_ok ? "no" : "yes",
- c->f00f_bug ? "yes" : "no",
- c->coma_bug ? "yes" : "no",
- c->hard_math ? "yes" : "no",
- fpu_exception ? "yes" : "no",
- c->cpuid_level,
- c->wp_works_ok ? "yes" : "no");
+ "wp\t\t: yes\n",
+ str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ c->cpuid_level);
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
@@ -64,12 +63,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- unsigned int cpu = 0;
+ unsigned int cpu;
int i;
-#ifdef CONFIG_SMP
cpu = c->cpu_index;
-#endif
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
@@ -81,32 +78,53 @@ static int show_cpuinfo(struct seq_file *m, void *v)
c->x86_model,
c->x86_model_id[0] ? c->x86_model_id : "unknown");
- if (c->x86_mask || c->cpuid_level >= 0)
- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
else
- seq_printf(m, "stepping\t: unknown\n");
+ seq_puts(m, "stepping\t: unknown\n");
+ if (c->microcode)
+ seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
- if (c->x86_cache_size >= 0)
- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+ if (c->x86_cache_size)
+ seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
- seq_printf(m, "flags\t\t:");
+ seq_puts(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
+ seq_puts(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
@@ -120,7 +138,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
- seq_printf(m, "power management:");
+ seq_puts(m, "power management:");
for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
@@ -133,17 +151,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
}
}
- seq_printf(m, "\n\n");
+ seq_puts(m, "\n\n");
return 0;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = cpumask_first(cpu_online_mask);
- else
- *pos = cpumask_next(*pos - 1, cpu_online_mask);
+ *pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
@@ -165,3 +180,24 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static void dump_x86_features(struct seq_file *m, unsigned long features)
+{
+ if (features & ARCH_SHSTK_SHSTK)
+ seq_puts(m, "shstk ");
+ if (features & ARCH_SHSTK_WRSS)
+ seq_puts(m, "wrss ");
+}
+
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task)
+{
+ seq_puts(m, "x86_Thread_features:\t");
+ dump_x86_features(m, task->thread.features);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "x86_Thread_features_locked:\t");
+ dump_x86_features(m, task->thread.features_locked);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
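
For a thread that has both features enabled and locked, the output produced by the two helpers above would read (illustrative; fields are tab-separated):

	x86_Thread_features:	shstk wrss
	x86_Thread_features_locked:	shstk wrss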
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
new file mode 100644
index 000000000000..eeac00d20926
--- /dev/null
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ */
+#include <linux/printk.h>
+
+#include <asm/processor.h>
+#include <asm/archrandom.h>
+#include <asm/sections.h>
+
+/*
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check. Also make sure
+ * it's not outputting the same value over and over, which has happened
+ * as a result of past CPU bugs.
+ *
+ * If it fails, it is simple to disable RDRAND and RDSEED here.
+ */
+
+void x86_init_rdrand(struct cpuinfo_x86 *c)
+{
+ enum { SAMPLES = 8, MIN_CHANGE = 5 };
+ unsigned long sample, prev;
+ bool failure = false;
+ size_t i, changed;
+
+ if (!cpu_has(c, X86_FEATURE_RDRAND))
+ return;
+
+ for (changed = 0, i = 0; i < SAMPLES; ++i) {
+ if (!rdrand_long(&sample)) {
+ failure = true;
+ break;
+ }
+ changed += i && sample != prev;
+ prev = sample;
+ }
+ if (changed < MIN_CHANGE)
+ failure = true;
+
+ if (failure) {
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ pr_emerg("RDRAND is not reliable on this platform; disabling.\n");
+ }
+}
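
A user-space analogue of the sanity check above: draw SAMPLES values and count how many consecutive pairs differ. A healthy source changes on nearly every draw, while a stuck source scores 0 of 7 and trips the MIN_CHANGE threshold. POSIX random() here merely stands in for the rdrand_long() probe:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum { SAMPLES = 8, MIN_CHANGE = 5 };

static bool source_looks_sane(long (*draw)(void))
{
	long sample, prev = 0;
	size_t i, changed = 0;

	for (i = 0; i < SAMPLES; ++i) {
		sample = draw();
		changed += i && sample != prev;	/* first draw has no pair */
		prev = sample;
	}
	return changed >= MIN_CHANGE;
}

static long stuck(void) { return 42; }	/* models a broken RNG */

int main(void)
{
	printf("random: %d\n", source_looks_sane(random));	/* almost surely 1 */
	printf("stuck:  %d\n", source_looks_sane(stuck));	/* 0 */
	return 0;
}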
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
new file mode 100644
index 000000000000..d8a04b195da2
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
+CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
new file mode 100644
index 000000000000..3792ab4819dc
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * More information about RDT can be found in the Intel(R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+#include "internal.h"
+
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * The cached resctrl_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
+
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
+
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
+
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
+ [RDT_RESOURCE_L3] =
+ {
+ .r_resctrl = {
+ .name = "L3",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_L2] =
+ {
+ .r_resctrl = {
+ .name = "L2",
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_MBA] =
+ {
+ .r_resctrl = {
+ .name = "MB",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .name = "SMBA",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+};
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->mon.num_rmid;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel Haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cache mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline void cache_alloc_hsw_probe(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &hw_res->r_resctrl;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
+
+ if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
+ return;
+
+ rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
+
+ /* If all the bits were set in MSR, return success */
+ if (l3_cbm_0 != max_cbm)
+ return;
+
+ hw_res->num_closid = 4;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
+ r->alloc_capable = true;
+
+ rdt_alloc_capable = true;
+}
+
+/*
+ * rdt_get_mb_table() - get a mapping between the bandwidth (b/w)
+ * percentage values exposed to the user interface and the
+ * h/w-understandable delay values.
+ *
+ * The non-linear delay values have power-of-two granularity, and the
+ * h/w does not guarantee a curve of configured delay values vs. the
+ * actual b/w enforced.
+ * Hence we need a pre-calibrated mapping so the user can express
+ * the memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+ * As of now there are no Intel SKUs that support non-linear delay.
+ */
+ pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx, max_delay;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ max_delay = eax.split.max_delay + 1;
+ r->membw.max_bw = MAX_MBA_BW;
+ r->membw.arch_needs_linear = true;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ r->membw.arch_needs_linear = false;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
+ r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
+ else
+ r->membw.throttle_mode = THREAD_THROTTLE_MAX;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 eax, ebx, ecx, edx, subleaf;
+
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
+
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->membw.max_bw = 1 << eax;
+
+ /* AMD does not use delay */
+ r->membw.delay_linear = false;
+ r->membw.arch_needs_linear = false;
+
+ /*
+ * AMD does not use memory delay throttle model to control
+ * the allocation like Intel does.
+ */
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ r->membw.min_bw = 0;
+ r->membw.bw_gran = 1;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, default_ctrl;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
+ r->alloc_capable = true;
+}
+
+static void rdt_get_cdp_config(int level)
+{
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ rdt_resources_all[level].cdp_enabled = false;
+ rdt_resources_all[level].r_resctrl.cdp_capable = true;
+}
+
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+ r->cache.io_alloc_capable = true;
+}
+
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3);
+}
+
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2);
+}
+
+static void mba_wrmsr_amd(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs which support non-linear delay values.
+ */
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+{
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+ pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+ return MAX_MBA_BW;
+}
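
On a linear SKU the mapping above is just a complement: assuming MAX_MBA_BW stands for 100 percent, a schemata request of 70 is written to the QOS MSR as a delay of 30. A trivial model:

#include <stdio.h>

#define MAX_MBA_BW 100	/* assumed: bandwidth expressed in percent */

static unsigned int delay_bw_map_linear(unsigned long bw)
{
	return MAX_MBA_BW - bw;	/* higher requested b/w => lower delay */
}

int main(void)
{
	printf("bw=70%%  -> delay=%u\n", delay_bw_map_linear(70));	/* 30 */
	printf("bw=100%% -> delay=%u\n", delay_bw_map_linear(100));	/* 0 */
	return 0;
}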
+
+static void mba_wrmsr_intel(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
+}
+
+static void cat_wrmsr(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->num_closid;
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct rdt_hw_resource *hw_res;
+ struct msr_param *m = arg;
+
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
+}
+
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ int i;
+
+ /*
+ * Initialize the Control MSRs so that they impose no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100%
+ */
+ for (i = 0; i < hw_res->num_closid; i++, dc++)
+ *dc = resctrl_get_default_ctrl(r);
+}
+
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
+{
+ int idx;
+
+ for_each_mbm_idx(idx)
+ kfree(hw_dom->arch_mbm_states[idx]);
+ kfree(hw_dom);
+}
+
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct msr_param m;
+ u32 *dc;
+
+ dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
+ GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ hw_dom->ctrl_val = dc;
+ setup_default_ctrlval(r, dc);
+
+ m.res = r;
+ m.dom = d;
+ m.low = 0;
+ m.high = hw_res->num_closid;
+ hw_res->msr_update(&m);
+ return 0;
+}
+
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid: The size of the MBM counter array
+ * @hw_dom: The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
+{
+ size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_states[idx])
+ goto cleanup;
+ }
+
+ return 0;
+cleanup:
+ for_each_mbm_idx(idx) {
+ kfree(hw_dom->arch_mbm_states[idx]);
+ hw_dom->arch_mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
+{
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct list_head *add_pos = NULL;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ rdt_domain_reconfigure_cdp(r);
+
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
+ return;
+ }
+ d->ci_id = ci->id;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+
+ /*
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
+ * its pointer from pseudo_lock_region struct.
+ */
+ if (d->plr)
+ d->plr->d = NULL;
+ ctrl_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
+static void clear_closid_rmid(int cpu)
+{
+ struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
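+	/*
+	 * IA32_PQR_ASSOC carries the active RMID in its low half and the
+	 * active CLOSID in its high half, matching wrmsr()'s (low, high)
+	 * argument order below.
+	 */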
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
+}
+
+static int resctrl_arch_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+ resctrl_online_cpu(cpu);
+
+ return 0;
+}
+
+static int resctrl_arch_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+
+ return 0;
+}
+
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_L2_CDP,
+ RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
+ RDT_FLAG_ABMC,
+ RDT_FLAG_SDCIAE,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __ro_after_init = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
+ RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
+ RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
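+
+/*
+ * Example: booting with "rdt=cmt,!l3cat" forces CMT on (overriding any
+ * quirk that would turn it off) and forces L3 CAT off. A feature that the
+ * CPU does not enumerate cannot be forced on - see rdt_cpu_has().
+ */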
+
+bool rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
+static __init bool get_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_MBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return __get_mem_config_intel(&hw_res->r_resctrl);
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_rdt_alloc_resources(void)
+{
+ struct rdt_resource *r;
+ bool ret = false;
+
+ if (rdt_alloc_capable)
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ rdt_get_cache_alloc_cfg(1, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+ rdt_get_cdp_l3_config();
+ if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+ rdt_set_io_alloc_capable(r);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
+		/* CPUID 0x10.2 fields have the same format as 0x10.1 */
+ r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
+ rdt_get_cache_alloc_cfg(2, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+ rdt_get_cdp_l2_config();
+ ret = true;
+ }
+
+ if (get_mem_config())
+ ret = true;
+
+ if (get_slow_mem_config())
+ ret = true;
+
+ return ret;
+}
+
+static __init bool get_rdt_mon_resources(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ bool ret = false;
+
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
+ resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_ABMC))
+ ret = true;
+
+ if (!ret)
+ return false;
+
+ return !rdt_get_mon_l3_config(r);
+}
+
+static __init void __check_quirks_intel(void)
+{
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_SKYLAKE_X:
+ if (boot_cpu_data.x86_stepping <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ else
+ set_rdt_options("!l3cat");
+ fallthrough;
+ case INTEL_BROADWELL_X:
+ intel_rdt_mbm_apply_quirk();
+ break;
+ }
+}
+
+static __init void check_quirks(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ __check_quirks_intel();
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
+static __init void rdt_init_res_defs_intel(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_per_cpu_cfg = false;
+ r->cache.min_cbm_bits = 1;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
+ hw_res->msr_update = mba_wrmsr_intel;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs_amd(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_sparse_bitmasks = true;
+ r->cache.arch_has_per_cpu_cfg = true;
+ r->cache.min_cbm_bits = 0;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ rdt_init_res_defs_intel();
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ rdt_init_res_defs_amd();
+}
+
+static enum cpuhp_state rdt_online;
+
+/* Runs once on the BSP during boot. */
+void resctrl_cpu_detect(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ c->x86_cache_mbm_width_offset = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
+ cpu_has(c, X86_FEATURE_ABMC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
+ }
+}
+
+static int __init resctrl_arch_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret, i;
+
+	/* for_each_rdt_resource() requires every rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
+
+ /*
+	 * Initialize functions (or definitions) that differ between
+	 * vendors here.
+ */
+ rdt_init_res_defs();
+
+ check_quirks();
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/resctrl/cat:online:",
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = resctrl_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+ rdt_online = state;
+
+ for_each_alloc_capable_rdt_resource(r)
+ pr_info("%s allocation detected\n", r->name);
+
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("%s monitoring detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(resctrl_arch_late_init);
+
+static void __exit resctrl_arch_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+
+ resctrl_exit();
+}
+
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
new file mode 100644
index 000000000000..b20e705606b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology (RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+
+#include "internal.h"
+
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 idx = resctrl_get_config_index(closid, t);
+ struct msr_param msr_param;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
+ return -EINVAL;
+
+ hw_dom->ctrl_val[idx] = cfg_val;
+
+ msr_param.res = r;
+ msr_param.dom = d;
+ msr_param.low = idx;
+ msr_param.high = idx + 1;
+ hw_res->msr_update(&msr_param);
+
+ return 0;
+}
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ struct resctrl_staged_config *cfg;
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ enum resctrl_conf_type t;
+ u32 idx;
+
+	/* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
+ for (t = 0; t < CDP_NUM_TYPES; t++) {
+ cfg = &hw_dom->d_resctrl.staged_config[t];
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ idx = resctrl_get_config_index(closid, t);
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
+ continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
+
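+			/*
+			 * Grow the [low, high) MSR index range to cover every
+			 * staged index so a single IPI per domain updates all
+			 * affected MSRs, e.g. staged indices 4 and 9 merge to
+			 * low = 4, high = 10.
+			 */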
+ if (!msr_param.res) {
+ msr_param.low = idx;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+ msr_param.dom = d;
+ } else {
+ msr_param.low = min(msr_param.low, idx);
+ msr_param.high = max(msr_param.high, idx + 1);
+ }
+ }
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+
+ return 0;
+}
+
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ u32 idx = resctrl_get_config_index(closid, type);
+
+ return hw_dom->ctrl_val[idx];
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_ctrl_domain *d;
+
+ /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+	/* Update MSR_IA32_L3_QOS_EXT_CFG on all the CPUs in all domains */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (hw_res->r_resctrl.cache.io_alloc_capable &&
+ hw_res->sdciae_enabled != enable) {
+ _resctrl_sdciae_enable(r, enable);
+ hw_res->sdciae_enabled = enable;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
new file mode 100644
index 000000000000..4a916c84a322
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RESCTRL_INTERNAL_H
+#define _ASM_X86_RESCTRL_INTERNAL_H
+
+#include <linux/resctrl.h>
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
+#define MBM_CNTR_WIDTH_BASE 24
+
+#define MBA_IS_LINEAR 0x4
+
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+/*
+ * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
+ * data to be returned. The counter width is discovered from the hardware
+ * as an offset from MBM_CNTR_WIDTH_BASE.
+ */
+#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
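+
+/* For example, an enumerated offset of 8 yields a 24 + 8 = 32 bit wide counter. */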
+
+/**
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()'s
+ * return value.
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to
+ * find this struct.
+ */
+struct arch_mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/* Setting bit 0 in MSR_IA32_L3_QOS_EXT_CFG enables the ABMC feature. */
+#define ABMC_ENABLE_BIT 0
+
+/*
+ * QoS event identifiers.
+ */
+#define ABMC_EXTENDED_EVT_ID BIT(31)
+#define ABMC_EVT_ID BIT(0)
+
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT 1
+
+/**
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @arch_mbm_states: Per-event pointer to the MBM event's saved state.
+ * An MBM event's state is an array of struct arch_mbm_state
+ * indexed by RMID on x86.
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
+ struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS];
+};
+
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
+{
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
+}
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @dom: The domain to update
+ * @low: Beginning index from base MSR
+ * @high:	End index (exclusive)
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
+ u32 low;
+ u32 high;
+};
+
+/**
+ * struct rdt_hw_resource - arch private attributes of a resctrl resource
+ * @r_resctrl: Attributes of the resource used directly by resctrl.
+ * @num_closid:	Maximum number of CLOSIDs this hardware can support,
+ * regardless of CDP. This is exposed via
+ * resctrl_arch_get_num_closid() to avoid confusion
+ * with struct resctrl_schema's property of the same name,
+ * which has been corrected for features like CDP.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @mbm_width: Monitor width, to detect and correct for overflow.
+ * @cdp_enabled: CDP state of this resource
+ * @mbm_cntr_assign_enabled: ABMC feature is enabled
+ * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled.
+ *
+ * Members of this structure are either private to the architecture,
+ * e.g. mbm_width, or accessed via helpers that provide abstraction,
+ * e.g. msr_update and msr_base.
+ */
+struct rdt_hw_resource {
+ struct rdt_resource r_resctrl;
+ u32 num_closid;
+ unsigned int msr_base;
+ void (*msr_update)(struct msr_param *m);
+ unsigned int mon_scale;
+ unsigned int mbm_width;
+ bool cdp_enabled;
+ bool mbm_cntr_assign_enabled;
+ bool sdciae_enabled;
+};
+
+static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
+{
+ return container_of(r, struct rdt_hw_resource, r_resctrl);
+}
+
+extern struct rdt_hw_resource rdt_resources_all[];
+
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG.
+ *
+ * @bw_type : Event configuration that represents the memory
+ * transactions being tracked by the @cntr_id.
+ * @bw_src : Bandwidth source (RMID or CLOSID).
+ * @reserved1 : Reserved.
+ * @is_clos : @bw_src field is a CLOSID (not an RMID).
+ * @cntr_id : Counter identifier.
+ * @reserved : Reserved.
+ * @cntr_en : Counting enable bit.
+ * @cfg_en : Configuration enable bit.
+ *
+ * Configuration and counting:
+ * Counter can be configured across multiple writes to MSR. Configuration
+ * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the
+ * configuration is applied.
+ * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not
+ * count events.
+ * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start
+ * counting events.
+ */
+union l3_qos_abmc_cfg {
+ struct {
+ unsigned long bw_type :32,
+ bw_src :12,
+ reserved1: 3,
+ is_clos : 1,
+ cntr_id : 5,
+ reserved : 9,
+ cntr_en : 1,
+ cfg_en : 1;
+ } split;
+ unsigned long full;
+};
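+
+/*
+ * For example, to assign counter 3 to RMID 10: set cfg_en = 1, cntr_en = 1,
+ * cntr_id = 3, bw_src = 10 and is_clos = 0. Applying this resets counter 3
+ * and starts it counting the transactions selected by bw_type; see
+ * resctrl_arch_config_cntr() for the code that builds such a value.
+ */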
+
+void rdt_ctrl_update(void *arg);
+
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+
+bool rdt_cpu_has(int flag);
+
+void __init intel_rdt_mbm_apply_quirk(void);
+
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
+
+#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
new file mode 100644
index 000000000000..dffcc8307500
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -0,0 +1,583 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology (RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the perf-based cqm.c, but reuses a lot of code and
+ * data structures originally written by Peter Zijlstra and Matt Fleming.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+static int snc_nodes_per_l3_cache = 1;
+
+/*
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified to simplify the code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so that no correction
+ *    is done for that case.
+ * 2. The MBM total and local correction tables are indexed by a core count
+ *    equal to (x86_cache_max_rmid + 1) / 8 - 1, ranging from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so the
+ *    corrected value can be calculated quickly by shifting:
+ *    corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initconst = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
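+
+/*
+ * For example, the factor 1.142857 is stored as CF(1.142857) == 1198372,
+ * so a raw count val is corrected as (val * 1198372) >> 20, i.e. it is
+ * multiplied by ~1.142857 using integer arithmetic only.
+ */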
+
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
+}
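+
+/*
+ * For example, with two SNC nodes per L3 cache and num_rmid == 128,
+ * logical RMID 5 on a CPU in the second SNC node maps to physical
+ * RMID 5 + 1 * 128 = 133.
+ */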
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
+ rdmsrq(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct arch_mbm_state *state;
+
+ if (!resctrl_is_mbm_event(eventid))
+ return NULL;
+
+ state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];
+
+ return state ? &state[rmid] : NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u32 prmid;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ /* Record any initial, non-zero count value. */
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
+ }
+}
+
+/*
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
+ */
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ memset(hw_dom->arch_mbm_states[idx], 0,
+ sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
+ }
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+{
+ u64 shift = 64 - width, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >> shift;
+}
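+
+/*
+ * For example, with a 24-bit wide counter (shift == 40), prev_msr == 0xfffffe
+ * and cur_msr == 0x4: the subtraction is done in the upper bits so the
+ * counter wraparound is absorbed, and shifting back down yields chunks == 6.
+ */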
+
+static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct arch_mbm_state *am;
+ u64 chunks;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
+
+ return chunks * hw_res->mon_scale;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u64 msr_val;
+ u32 prmid;
+ int ret;
+
+ resctrl_arch_rmid_read_context_check();
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
+
+ if (!ret) {
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+ } else if (ret == -EINVAL) {
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ am->prev_msr = 0;
+ }
+
+ return ret;
+}
+
+static int __cntr_id_read(u32 cntr_id, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * QM_EVTSEL Register definition:
+ * =======================================================
+ * Bits Mnemonic Description
+ * =======================================================
+ * 63:44 -- Reserved
+ * 43:32 RMID RMID or counter ID in ABMC mode
+ * when reading an MBM event
+ * 31 ExtendedEvtID Extended Event Identifier
+ * 30:8 -- Reserved
+ * 7:0 EvtID Event Identifier
+ * =======================================================
+ * The contents of a specific counter can be read by setting the
+ * following fields in QM_EVTSEL.ExtendedEvtID(=1) and
+ * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID
+ * to the desired counter ID. Reading the QM_CTR then returns the
+ * contents of the specified counter. The RMID_VAL_ERROR bit is set
+ * if the counter configuration is invalid, or if an invalid counter
+ * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit
+ * is set if the counter data is unavailable.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
+	rdmsrq(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct arch_mbm_state *am;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __cntr_id_read(cntr_id, &am->prev_msr);
+ }
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+ int ret;
+
+ ret = __cntr_id_read(cntr_id, &msr_val);
+ if (ret)
+ return ret;
+
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+
+ return 0;
+}
+
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if the system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+	/* Sanity check: the only valid results are 1, 2, 3, 4 and 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+		pr_warn("Ignoring improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
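+
+/*
+ * For example, if 96 CPUs share CPU0's L3 cache but CPU0's NUMA node
+ * holds only 48 of them, cpus_per_l3 / cpus_per_node == 2, i.e. two
+ * SNC nodes per L3 cache.
+ */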
+
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ unsigned int threshold;
+ u32 eax, ebx, ecx, edx;
+
+ snc_nodes_per_l3_cache = snc_get_config();
+
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
+ hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
+
+ if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
+ hw_res->mbm_width += mbm_offset;
+ else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
+ pr_warn("Ignoring impossible MBM counter offset\n");
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;
+
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
+
+ if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ }
+
+ /*
+ * resctrl assumes a system that supports assignable counters can
+ * switch to "default" mode. Ensure that there is a "default" mode
+ * to switch to. This enforces a dependency between the independent
+ * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
+ * hardware features.
+ */
+ if (rdt_cpu_has(X86_FEATURE_ABMC) &&
+ (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
+ rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
+ r->mon.mbm_cntr_assignable = true;
+ cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
+ r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
+ hw_res->mbm_cntr_assign_enabled = true;
+ }
+
+ r->mon_capable = true;
+
+ return 0;
+}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
+
+static void resctrl_abmc_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+}
+
+/*
+ * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
+ * associated with all monitor domains.
+ */
+static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
+ &enable, 1);
+ resctrl_arch_reset_rmid_all(r, d);
+ }
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (r->mon.mbm_cntr_assignable &&
+ hw_res->mbm_cntr_assign_enabled != enable) {
+ _resctrl_abmc_enable(r, enable);
+ hw_res->mbm_cntr_assign_enabled = enable;
+ }
+
+ return 0;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
+}
+
+static void resctrl_abmc_config_one_amd(void *info)
+{
+ union l3_qos_abmc_cfg *abmc_cfg = info;
+
+	wrmsrq(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+}
+
+/*
+ * Send an IPI to a CPU in the domain to assign the counter to an
+ * (RMID, event) pair.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ union l3_qos_abmc_cfg abmc_cfg = { 0 };
+ struct arch_mbm_state *am;
+
+ abmc_cfg.split.cfg_en = 1;
+ abmc_cfg.split.cntr_en = assign ? 1 : 0;
+ abmc_cfg.split.cntr_id = cntr_id;
+ abmc_cfg.split.bw_src = rmid;
+ if (assign)
+ abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);
+
+ /*
+ * The hardware counter is reset (because cfg_en == 1) so there is no
+ * need to record initial non-zero counts.
+ */
+ am = get_arch_mbm_state(hw_dom, rmid, evtid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..de580eca3363
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheflush.h>
+#include <linux/cpu.h>
+#include <linux/perf_event.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "../../events/perf_event.h" /* For X86_CONFIG() */
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "pseudo_lock_trace.h"
+
+/*
+ * The bits needed to disable hardware prefetching vary based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/**
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under a variety of load
+ * conditions, and that these pseudo-locked regions can maintain their low
+ * cache miss rates under a variety of load conditions for significant
+ * lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here also add support for its cache events to
+ * resctrl_arch_measure_l*_residency()
+ *
+ * Return:
+ * If platform is supported, the bits to disable hardware prefetchers, 0
+ * if platform is not supported.
+ */
+u64 resctrl_arch_get_prefetch_disable_bits(void)
+{
+ prefetch_disable_bits = 0;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W)
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 3 DCU IP Prefetcher Disable (R/W)
+ * 63:4 Reserved
+ */
+ prefetch_disable_bits = 0xF;
+ break;
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 Reserved
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 63:3 Reserved
+ */
+ prefetch_disable_bits = 0x5;
+ break;
+ }
+
+ return prefetch_disable_bits;
+}
+
+/**
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while the core is running
+ * with its class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete, no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_pseudo_lock_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+ u64 saved_msr;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+ register void *mem_r asm(_ASM_BX);
+#endif /* CONFIG_KASAN */
+
+ /*
+	 * Make sure none of the allocated memory is cached. If any of it is,
+	 * the loop below will get a cache hit from outside of the
+	 * pseudo-locked region.
+ * wbinvd (as opposed to clflush/clflushopt) is required to
+ * increase likelihood that allocated cache portion will be filled
+ * with associated memory.
+ */
+ wbinvd();
+
+ /*
+	 * Always called with interrupts enabled. Disabling interrupts
+	 * ensures that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
+ native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated plr->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
+ local_irq_enable();
+
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory: the speed of
+ * access indicates how close to the CPU the data was. Moreover, if the
+ * prefetcher is disabled and the memory is read at a stride of half the
+ * cache line, then a cache miss is easy to spot since the read of the
+ * first half will be significantly slower than the read of the second
+ * half.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 saved_low, saved_high;
+ unsigned long i;
+ u64 start, end;
+ void *mem_r;
+
+ local_irq_disable();
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ mem_r = READ_ONCE(plr->kmem);
+ /*
+	 * Do a dummy run of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
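+	/* The 32 byte stride is half of a (typical) 64 byte cache line. */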
+ for (i = 0; i < plr->size; i += 32) {
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/*
+ * Create a perf_event_attr for the hit and miss perf events that will
+ * be used during the performance measurement. A perf_event maintains
+ * a pointer to its perf_event_attr so a unique attribute structure is
+ * created for each perf_event.
+ *
+ * The actual configuration of the event is set right before use in order
+ * to use the X86_CONFIG macro.
+ */
+static struct perf_event_attr perf_miss_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+static struct perf_event_attr perf_hit_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+struct residency_counts {
+ u64 miss_before, hits_before;
+ u64 miss_after, hits_after;
+};
+
+static int measure_residency_fn(struct perf_event_attr *miss_attr,
+ struct perf_event_attr *hit_attr,
+ struct pseudo_lock_region *plr,
+ struct residency_counts *counts)
+{
+ u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
+ struct perf_event *miss_event, *hit_event;
+ int hit_pmcnum, miss_pmcnum;
+ u32 saved_low, saved_high;
+ unsigned int line_size;
+ unsigned int size;
+ unsigned long i;
+ void *mem_r;
+ u64 tmp;
+
+ miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(miss_event))
+ goto out;
+
+ hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(hit_event))
+ goto out_miss;
+
+ local_irq_disable();
+ /*
+ * Check any possible error state of events used by performing
+ * one local read.
+ */
+ if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+ if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+
+ /* Initialize rest of local variables */
+ /*
+ * Performance event has been validated right before this with
+ * interrupts disabled - it is thus safe to read the counter index.
+ */
+ miss_pmcnum = x86_perf_rdpmc_index(miss_event);
+ hit_pmcnum = x86_perf_rdpmc_index(hit_event);
+ line_size = READ_ONCE(plr->line_size);
+ mem_r = READ_ONCE(plr->kmem);
+ size = READ_ONCE(plr->size);
+
+ /*
+	 * Read the counter variables twice - first to load the needed
+	 * instructions into the L1 cache, second to capture an accurate value
+	 * that does not include cache misses incurred by the instruction loads.
+ */
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+	 * From the SDM: performing back-to-back fast reads is not guaranteed
+ * to be monotonic.
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_after = rdpmc(hit_pmcnum);
+ miss_after = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ /* Re-enable hardware prefetchers */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+out_hit:
+ perf_event_release_kernel(hit_event);
+out_miss:
+ perf_event_release_kernel(miss_event);
+out:
+ /*
+ * All counts will be zero on failure.
+ */
+ counts->miss_before = miss_before;
+ counts->hits_before = hits_before;
+ counts->miss_after = miss_after;
+ counts->hits_after = hits_after;
+ return 0;
+}
+
+int resctrl_arch_measure_l2_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L2_HIT 02H
+ * L2_MISS 10H
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x10);
+ perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x2);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+	 * If a failure prevented the measurements from succeeding,
+	 * tracepoints will still be written and all counts will be zero.
+ */
+ trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
+ counts.miss_after - counts.miss_before);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+int resctrl_arch_measure_l3_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform the following events are used instead:
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ */
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /* On BDW the hit event counts references, not hits */
+ perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x4f);
+ perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x41);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+	 * If a failure prevented the measurements from succeeding,
+	 * tracepoints will still be written and all counts will be zero.
+ */
+
+ counts.miss_after -= counts.miss_before;
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
+ /*
+ * On BDW references and misses are counted, need to adjust.
+ * Sometimes the "hits" counter is a bit more than the
+ * references, for example, x references but x + 1 hits.
+ * To not report invalid hit values in this case we treat
+ * that as misses equal to references.
+ */
+ /* First compute the number of cache references measured */
+ counts.hits_after -= counts.hits_before;
+ /* Next convert references to cache hits */
+ counts.hits_after -= min(counts.miss_after, counts.hits_after);
+ } else {
+ counts.hits_after -= counts.hits_before;
+ }
+
+ trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
new file mode 100644
index 000000000000..7c8aef08010f
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
new file mode 100644
index 000000000000..885026468440
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology (RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/msr.h>
+#include "internal.h"
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/*
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is protected against __switch_to() because
+ * preemption is disabled.
+ */
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+ struct resctrl_cpu_defaults *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
+ }
+
+ /*
+ * We cannot unconditionally write the MSR because the current
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ resctrl_arch_sched_in(current);
+}
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
+void resctrl_arch_mon_event_config_read(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+ u64 msrval;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval);
+
+ /* Report only the valid event configuration bits */
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+}
+
+void resctrl_arch_mon_event_config_write(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
+}
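+
+/*
+ * Illustrative example, not part of this patch: the index returned by
+ * mon_event_config_index_get() selects the per-event configuration MSR
+ * used by the two helpers above:
+ *
+ *	QOS_L3_MBM_TOTAL_EVENT_ID -> MSR_IA32_EVT_CFG_BASE + 0
+ *	QOS_L3_MBM_LOCAL_EVENT_ID -> MSR_IA32_EVT_CFG_BASE + 1
+ */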
+
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static void l2_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_cache_qos_cfg(int level, bool enable)
+{
+ void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
+ struct rdt_resource *r_l;
+ cpumask_var_t cpu_mask;
+ int cpu;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ if (level == RDT_RESOURCE_L3)
+ update = l3_qos_cfg_update;
+ else if (level == RDT_RESOURCE_L2)
+ update = l2_qos_cfg_update;
+ else
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ r_l = &rdt_resources_all[level].r_resctrl;
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
+ if (r_l->cache.arch_has_per_cpu_cfg)
+ /* Pick all the CPUs in the domain instance */
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
+ cpumask_set_cpu(cpu, cpu_mask);
+ else
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
+ }
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (!r->cdp_capable)
+ return;
+
+ if (r->rid == RDT_RESOURCE_L2)
+ l2_qos_cfg_update(&hw_res->cdp_enabled);
+
+ if (r->rid == RDT_RESOURCE_L3)
+ l3_qos_cfg_update(&hw_res->cdp_enabled);
+}
+
+static int cdp_enable(int level)
+{
+ struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
+ int ret;
+
+ if (!r_l->alloc_capable)
+ return -EINVAL;
+
+ ret = set_cache_qos_cfg(level, true);
+ if (!ret)
+ rdt_resources_all[level].cdp_enabled = true;
+
+ return ret;
+}
+
+static void cdp_disable(int level)
+{
+ struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
+
+ if (r_hw->cdp_enabled) {
+ set_cache_qos_cfg(level, false);
+ r_hw->cdp_enabled = false;
+ }
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
+
+ if (!hw_res->r_resctrl.cdp_capable)
+ return -EINVAL;
+
+ if (enable)
+ return cdp_enable(l);
+
+ cdp_disable(l);
+
+ return 0;
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
+{
+ return rdt_resources_all[l].cdp_enabled;
+}
+
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ int i;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = hw_res->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ for (i = 0; i < hw_res->num_closid; i++)
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..42c7eac0c387
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,93 @@
+/*
+ * Routines to identify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/memtype.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+ u32 sub_leaf;
+};
+
+/*
+ * Please keep the list sorted by cpuid_bit.level for faster search.
+ * X86_FEATURE_MBA is supported by both Intel and AMD, but the CPUID
+ * levels differ and there is a separate entry for each.
+ */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 },
+ { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 },
+ { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 },
+ { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 },
+ { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
+ { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 },
+ { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
+ { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
+ { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 },
+ { 0, 0, 0, 0, 0 }
+};
+
+void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_cpu_cap(c, cb->feature);
+ }
+}
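+
+/*
+ * Illustrative example, not part of this patch: for the X86_FEATURE_SMCA
+ * entry above (level 0x80000007) the loop first validates the leaf by
+ * reading the maximum supported level of its range:
+ *
+ *	u32 max = cpuid_eax(0x80000007 & 0xffff0000);  // cpuid_eax(0x80000000)
+ *
+ * The leaf is consulted only when max lies in [0x80000007, 0x8000ffff],
+ * i.e. the CPU actually implements extended leaf 0x80000007.
+ */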
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
deleted file mode 100644
index a640ae5ad201..000000000000
--- a/arch/x86/kernel/cpu/sched.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
- struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
- unsigned long ratio, flags;
-
- local_irq_save(flags);
- get_aperfmperf(&val);
- local_irq_restore(flags);
-
- ratio = calc_aperfmperf_ratio(old, &val);
- *old = val;
-
- return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
- /*
- * do aperf/mperf on the cpu level because it includes things
- * like turbo mode, which are relevant to full cores.
- */
- if (boot_cpu_has(X86_FEATURE_APERFMPERF))
- return scale_aperfmperf();
-
- /*
- * maybe have something cpufreq here
- */
-
- return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
- /*
- * aperf/mperf already includes the smt gain
- */
- if (boot_cpu_has(X86_FEATURE_APERFMPERF))
- return SCHED_LOAD_SCALE;
-
- return default_scale_smt_power(sd, cpu);
-}
-
-#endif
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
new file mode 100644
index 000000000000..9c1656779b2a
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -0,0 +1,6 @@
+obj-y += \
+ driver.o \
+ encl.o \
+ ioctl.o \
+ main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..a42c7180900b
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/acpi.h>
+#include <linux/miscdevice.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
+u64 sgx_attributes_reserved_mask;
+u64 sgx_xfrm_reserved_mask = ~0x3;
+u32 sgx_misc_reserved_mask;
+
+static int __sgx_open(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl;
+ int ret;
+
+ encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+ if (!encl)
+ return -ENOMEM;
+
+ kref_init(&encl->refcount);
+ xa_init(&encl->page_array);
+ mutex_init(&encl->lock);
+ INIT_LIST_HEAD(&encl->va_pages);
+ INIT_LIST_HEAD(&encl->mm_list);
+ spin_lock_init(&encl->mm_lock);
+
+ ret = init_srcu_struct(&encl->srcu);
+ if (ret) {
+ kfree(encl);
+ return ret;
+ }
+
+ file->private_data = encl;
+
+ return 0;
+}
+
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sgx_release(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl = file->private_data;
+ struct sgx_encl_mm *encl_mm;
+
+ /*
+ * Drain the remaining mm_list entries. At this point the list contains
+ * entries for processes that have closed the enclave file but have not
+ * exited yet. Processes that have exited were already removed from the
+ * list by sgx_mmu_notifier_release().
+ */
+ for ( ; ; ) {
+ spin_lock(&encl->mm_lock);
+
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list,
+ struct sgx_encl_mm, list);
+ list_del_rcu(&encl_mm->list);
+ }
+
+ spin_unlock(&encl->mm_lock);
+
+ /* The enclave is no longer mapped by any mm. */
+ if (!encl_mm)
+ break;
+
+ synchronize_srcu(&encl->srcu);
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+ kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
+ }
+
+ kref_put(&encl->refcount, sgx_encl_release);
+ return 0;
+}
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = file->private_data;
+ int ret;
+
+ ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
+ if (ret)
+ return ret;
+
+ ret = sgx_encl_mm_add(encl, vma->vm_mm);
+ if (ret)
+ return ret;
+
+ vma->vm_ops = &sgx_vm_ops;
+ vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
+ vma->vm_private_data = encl;
+
+ return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ if ((flags & MAP_TYPE) == MAP_PRIVATE)
+ return -EINVAL;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ return mm_get_unmapped_area(file, addr, len, pgoff, flags);
+}
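+
+/*
+ * Illustrative userspace sketch, not part of this patch: because
+ * MAP_PRIVATE is rejected above, an enclave runtime maps the device
+ * with MAP_SHARED (simplified; a real runtime first sizes and lays
+ * out the enclave range):
+ *
+ *	int fd = open("/dev/sgx_enclave", O_RDWR);
+ *	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ *		       MAP_SHARED, fd, 0);
+ */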
+
+#ifdef CONFIG_COMPAT
+static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static const struct file_operations sgx_encl_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_open,
+ .release = sgx_release,
+ .unlocked_ioctl = sgx_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sgx_compat_ioctl,
+#endif
+ .mmap = sgx_mmap,
+ .get_unmapped_area = sgx_get_unmapped_area,
+};
+
+static struct miscdevice sgx_dev_enclave = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_enclave",
+ .nodename = "sgx_enclave",
+ .fops = &sgx_encl_fops,
+};
+
+int __init sgx_drv_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 attr_mask;
+ u64 xfrm_mask;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
+ return -ENODEV;
+ }
+
+ cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+
+ if (!(eax & 1)) {
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
+ return -ENODEV;
+ }
+
+ sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+
+ cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
+ attr_mask = (((u64)ebx) << 32) + (u64)eax;
+ sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+ if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+ sgx_xfrm_reserved_mask = ~xfrm_mask;
+ }
+
+ ret = misc_register(&sgx_dev_enclave);
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..30f39f92c98f
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <uapi/asm/sgx.h>
+#include "sgx.h"
+
+#define SGX_EINIT_SPIN_COUNT 20
+#define SGX_EINIT_SLEEP_COUNT 50
+#define SGX_EINIT_SLEEP_TIME 20
+
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_misc_reserved_mask;
+
+extern const struct file_operations sgx_provision_fops;
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..cf149b9f4916
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,1326 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include <asm/sgx.h>
+#include "encl.h"
+#include "encls.h"
+#include "sgx.h"
+
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ * a PCMD page is in process of being reclaimed.
+ * @encl: Enclave to which PCMD page belongs
+ * @start_addr: Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ * 0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+ unsigned long start_addr)
+{
+ int reclaimed = 0;
+ int i;
+
+ /*
+ * PCMD_FIRST_MASK is based on number of PCMD entries within
+ * PCMD page being 32.
+ */
+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+ for (i = 0; i < PCMDS_PER_PAGE; i++) {
+ struct sgx_encl_page *entry;
+ unsigned long addr;
+
+ addr = start_addr + i * PAGE_SIZE;
+
+ /*
+ * Stop when reaching the SECS page - it does not
+ * have a page_array entry and its reclaim is
+ * started and completed with enclave mutex held so
+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+ * flag.
+ */
+ if (addr == encl->base + encl->size)
+ break;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ continue;
+
+ /*
+ * VA page slot ID uses same bit as the flag so it is important
+ * to ensure that the page is not already in backing store.
+ */
+ if (entry->epc_page &&
+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+ reclaimed = 1;
+ break;
+ }
+ }
+
+ return reclaimed;
+}
+
+/*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+ unsigned long page_index)
+{
+ pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+ return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
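+
+/*
+ * Illustrative example, not part of this patch: for a 16-page (64 KiB)
+ * enclave, the backing store holds the 16 encrypted pages, then the
+ * page-sized SECS slot, then the PCMD area, so page_index 3 has its
+ * PCMD entry at byte offset 65536 + 4096 + 3 * sizeof(struct sgx_pcmd).
+ */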
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+ struct inode *inode = file_inode(encl->backing);
+
+ shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
+/*
+ * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
+ * Pages" in the SDM.
+ */
+static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ pgoff_t page_index, page_pcmd_off;
+ unsigned long pcmd_first_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_backing b;
+ bool pcmd_page_empty;
+ u8 *pcmd_page;
+ int ret;
+
+ if (secs_page)
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ else
+ page_index = PFN_DOWN(encl->size);
+
+ /*
+ * Address of enclave page using the first entry within the PCMD page.
+ */
+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
+ page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
+ ret = sgx_encl_lookup_backing(encl, page_index, &b);
+ if (ret)
+ return ret;
+
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
+ pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
+
+ if (secs_page)
+ pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
+ else
+ pginfo.secs = 0;
+
+ ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
+ sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ELDU");
+
+ ret = -EFAULT;
+ }
+
+ memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+ set_page_dirty(b.pcmd);
+
+ /*
+ * The area for the PCMD in the page was zeroed above. Check if the
+ * whole page is now empty, meaning that all PCMDs have been zeroed:
+ */
+ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
+
+ get_page(b.pcmd);
+ sgx_encl_put_backing(&b);
+
+ sgx_encl_truncate_backing_page(encl, page_index);
+
+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
+ sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ pcmd_page = kmap_local_page(b.pcmd);
+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+ pr_warn("PCMD page not empty after truncate.\n");
+ kunmap_local(pcmd_page);
+ }
+
+ put_page(b.pcmd);
+
+ return ret;
+}
+
+static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *secs_page)
+{
+
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page))
+ return epc_page;
+
+ ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
+ if (ret) {
+ sgx_encl_free_epc_page(epc_page);
+ return ERR_PTR(ret);
+ }
+
+ sgx_free_va_slot(encl_page->va_page, va_offset);
+ list_move(&encl_page->va_page->list, &encl->va_pages);
+ encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ encl_page->epc_page = epc_page;
+
+ return epc_page;
+}
+
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
+static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
+ struct sgx_encl_page *entry)
+{
+ struct sgx_epc_page *epc_page;
+
+ /* Entry successfully located. */
+ if (entry->epc_page) {
+ if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
+ return ERR_PTR(-EBUSY);
+
+ return entry;
+ }
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ encl->secs_child_cnt++;
+ sgx_mark_page_reclaimable(entry->epc_page);
+
+ return entry;
+}
+
+static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
+ unsigned long addr,
+ vm_flags_t vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ /*
+ * Verify that the page has equal or higher build time
+ * permissions than the VMA permissions (i.e. the subset of {VM_READ,
+ * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
+ */
+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr)
+{
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+/**
+ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
+ * @vma: VMA obtained from fault info from where page is accessed
+ * @encl: enclave accessing the page
+ * @addr: address that triggered the page fault
+ *
+ * When an initialized enclave accesses a page with no backing EPC page
+ * on a SGX2 system then the EPC can be added dynamically via the SGX2
+ * ENCLS[EAUG] instruction.
+ *
+ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
+ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
+ */
+static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
+ struct sgx_encl *encl, unsigned long addr)
+{
+ vm_fault_t vmret = VM_FAULT_SIGBUS;
+ struct sgx_pageinfo pginfo = {0};
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ unsigned long phys_addr;
+ u64 secinfo_flags;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Ignore internal permission checking for dynamically added pages.
+ * They matter only for data added during the pre-initialization
+ * phase. The enclave decides the permissions by the means of
+ * EACCEPT, EACCEPTCOPY and EMODPE.
+ */
+ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
+ if (IS_ERR(encl_page))
+ return VM_FAULT_OOM;
+
+ mutex_lock(&encl->lock);
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ va_page = sgx_encl_grow(encl, false);
+ if (IS_ERR(va_page)) {
+ if (PTR_ERR(va_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_epc;
+ }
+
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ /*
+ * If ret == -EBUSY then the page was created in another flow while
+ * running without encl->lock.
+ */
+ if (ret)
+ goto err_out_shrink;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = 0;
+
+ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
+ if (ret)
+ goto err_out;
+
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = SGX_PAGE_TYPE_REG;
+ encl->secs_child_cnt++;
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+
+ phys_addr = sgx_get_epc_phys_addr(epc_page);
+ /*
+ * Do not undo everything when creating PTE entry fails - next #PF
+ * would find page ready for a PTE.
+ */
+ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (vmret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_NOPAGE;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+err_out_epc:
+ sgx_encl_free_epc_page(epc_page);
+err_out_unlock:
+ mutex_unlock(&encl->lock);
+ kfree(encl_page);
+
+ return vmret;
+}
+
+static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
+{
+ unsigned long addr = (unsigned long)vmf->address;
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_encl_page *entry;
+ unsigned long phys_addr;
+ struct sgx_encl *encl;
+ vm_fault_t ret;
+
+ encl = vma->vm_private_data;
+
+ /*
+ * It's very unlikely but possible that allocating memory for the
+ * mm_list entry of a forked process failed in sgx_vma_open(). When
+ * this happens, vm_private_data is set to NULL.
+ */
+ if (unlikely(!encl))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * The page_array keeps track of all enclave pages, whether they
+ * are swapped out or not. If there is no entry for this page and
+ * the system supports SGX2 then it is possible to dynamically add
+ * a new enclave page. This is possible only for an initialized
+ * enclave, which is checked for right away.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
+ (!xa_load(&encl->page_array, PFN_DOWN(addr))))
+ return sgx_encl_eaug_page(vma, encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
+ if (IS_ERR(entry)) {
+ mutex_unlock(&encl->lock);
+
+ if (PTR_ERR(entry) == -EBUSY)
+ return VM_FAULT_NOPAGE;
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
+
+ ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (ret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ sgx_encl_test_and_clear_young(vma->vm_mm, entry);
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_NOPAGE;
+}
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+
+ /*
+ * It's possible but unlikely that vm_private_data is NULL. This can
+ * happen in a grandchild of a process, when sgx_encl_mm_add() had
+ * failed to allocate memory in this callback.
+ */
+ if (unlikely(!encl))
+ return;
+
+ if (sgx_encl_mm_add(encl, vma->vm_mm))
+ vma->vm_private_data = NULL;
+}
+
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl: an enclave pointer
+ * @start: lower bound of the address range, inclusive
+ * @end: upper bound of the address range, exclusive
+ * @vm_flags: VMA flags
+ *
+ * Iterate through the enclave pages contained within [@start, @end) to verify
+ * that the requested permissions, a subset of {VM_READ, VM_WRITE, VM_EXEC},
+ * do not include any permission that is missing from the build time
+ * permissions of any of the enclave pages within the given address range.
+ *
+ * An enclave creator must declare the strongest permissions that will be
+ * needed for each enclave page. This ensures that mappings have the identical
+ * or weaker permissions than the earlier declared permissions.
+ *
+ * Return: 0 on success, -EACCES otherwise
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, vm_flags_t vm_flags)
+{
+ vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ struct sgx_encl_page *page;
+ unsigned long count = 0;
+ int ret = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+
+ /* Disallow mapping outside enclave's address range. */
+ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
+ (start < encl->base || end > encl->base + encl->size))
+ return -EACCES;
+
+ /*
+ * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
+ * conflict with the enclave page permissions.
+ */
+ if (current->personality & READ_IMPLIES_EXEC)
+ return -EACCES;
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
+ if (~page->vm_max_prot_bits & vm_prot_bits) {
+ ret = -EACCES;
+ break;
+ }
+
+ /* Reschedule on every XA_CHECK_SCHED iteration. */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ cond_resched();
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ return ret;
+}
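+
+/*
+ * Illustrative example, not part of this patch: if every page in the
+ * range was added with SECINFO permissions RW, vm_max_prot_bits covers
+ * VM_READ | VM_WRITE, so from userspace:
+ *
+ *	mmap(addr, len, PROT_READ, MAP_SHARED, fd, 0);   // allowed
+ *	mprotect(addr, len, PROT_READ | PROT_EXEC);      // fails, -EACCES
+ *
+ * because VM_EXEC is not contained in the build-time permissions.
+ */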
+
+static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
+}
+
+static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+
+ ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * Load an enclave page to EPC if required, and take encl->lock.
+ */
+static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
+ unsigned long addr,
+ vm_flags_t vm_flags)
+{
+ struct sgx_encl_page *entry;
+
+ for ( ; ; ) {
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
+ if (PTR_ERR(entry) != -EBUSY)
+ break;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ if (IS_ERR(entry))
+ mutex_unlock(&encl->lock);
+
+ return entry;
+}
+
+static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+ struct sgx_encl_page *entry = NULL;
+ char data[sizeof(unsigned long)];
+ unsigned long align;
+ int offset;
+ int cnt;
+ int ret = 0;
+ int i;
+
+ /*
+ * If process was forked, VMA is still there but vm_private_data is set
+ * to NULL.
+ */
+ if (!encl)
+ return -EFAULT;
+
+ if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
+ return -EFAULT;
+
+ for (i = 0; i < len; i += cnt) {
+ entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
+ vma->vm_flags);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ break;
+ }
+
+ align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
+ offset = (addr + i) & (sizeof(unsigned long) - 1);
+ cnt = sizeof(unsigned long) - offset;
+ cnt = min(cnt, len - i);
+
+ ret = sgx_encl_debug_read(encl, entry, align, data);
+ if (ret)
+ goto out;
+
+ if (write) {
+ memcpy(data + offset, buf + i, cnt);
+ ret = sgx_encl_debug_write(encl, entry, align, data);
+ if (ret)
+ goto out;
+ } else {
+ memcpy(buf + i, data + offset, cnt);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+
+ if (ret)
+ break;
+ }
+
+ return ret < 0 ? ret : i;
+}
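+
+/*
+ * Illustrative example, not part of this patch: EDBGRD/EDBGWR move
+ * naturally aligned 8-byte quantities, hence the chunking above.
+ * Reading len = 10 bytes at addr = 0x1003 proceeds as:
+ *
+ *	i = 0: align = 0x1000, offset = 3, cnt = 5   (0x1003..0x1007)
+ *	i = 5: align = 0x1008, offset = 0, cnt = 5   (0x1008..0x100c)
+ */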
+
+const struct vm_operations_struct sgx_vm_ops = {
+ .fault = sgx_vma_fault,
+ .mprotect = sgx_vma_mprotect,
+ .open = sgx_vma_open,
+ .access = sgx_vma_access,
+};
+
+/**
+ * sgx_encl_release - Destroy an enclave instance
+ * @ref: address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+ struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
+ struct sgx_va_page *va_page;
+ struct sgx_encl_page *entry;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
+
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
+ if (entry->epc_page) {
+ /*
+ * The page and its radix tree entry cannot be freed
+ * if the page is being held by the reclaimer.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page))
+ continue;
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ }
+
+ kfree(entry);
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+
+ xa_destroy(&encl->page_array);
+
+ if (!encl->secs_child_cnt && encl->secs.epc_page) {
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+ }
+
+ while (!list_empty(&encl->va_pages)) {
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ list_del(&va_page->list);
+ sgx_encl_free_epc_page(va_page->epc_page);
+ kfree(va_page);
+ }
+
+ if (encl->backing)
+ fput(encl->backing);
+
+ cleanup_srcu_struct(&encl->srcu);
+
+ WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+ /* Detect EPC page leaks. */
+ WARN_ON_ONCE(encl->secs_child_cnt);
+ WARN_ON_ONCE(encl->secs.epc_page);
+
+ kfree(encl);
+ sgx_dec_usage_count();
+}
+
+/*
+ * 'mm' is exiting and no longer needs mmu notifications.
+ */
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ struct sgx_encl_mm *tmp = NULL;
+ bool found = false;
+
+ /*
+ * The enclave itself can remove encl_mm. Note, objects can't be moved
+ * off an RCU protected list, but deletion is ok.
+ */
+ spin_lock(&encl_mm->encl->mm_lock);
+ list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+ if (tmp == encl_mm) {
+ list_del_rcu(&encl_mm->list);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&encl_mm->encl->mm_lock);
+
+ if (found) {
+ synchronize_srcu(&encl_mm->encl->srcu);
+ mmu_notifier_put(mn);
+ }
+}
+
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+
+ /* 'encl_mm' is going away, put encl_mm->encl reference: */
+ kref_put(&encl_mm->encl->refcount, sgx_encl_release);
+
+ kfree(encl_mm);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+ .release = sgx_mmu_notifier_release,
+ .free_notifier = sgx_mmu_notifier_free,
+};
+
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = NULL;
+ struct sgx_encl_mm *tmp;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
+ if (tmp->mm == mm) {
+ encl_mm = tmp;
+ break;
+ }
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return encl_mm;
+}
+
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm;
+ int ret;
+
+ /*
+ * Even though a single enclave may be mapped into an mm more than once,
+ * each 'mm' only appears once on encl->mm_list. This is guaranteed by
+ * holding the mm's mmap lock for write before an mm can be added to
+ * or removed from an encl->mm_list.
+ */
+ mmap_assert_write_locked(mm);
+
+ /*
+ * It's possible that an entry already exists in the mm_list, because it
+ * is removed only on VFS release or process exit.
+ */
+ if (sgx_encl_find_mm(encl, mm))
+ return 0;
+
+ encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+ if (!encl_mm)
+ return -ENOMEM;
+
+ /* Grab a refcount for the encl_mm->encl reference: */
+ kref_get(&encl->refcount);
+ encl_mm->encl = encl;
+ encl_mm->mm = mm;
+ encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+ ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+ if (ret) {
+ kfree(encl_mm);
+ return ret;
+ }
+
+ spin_lock(&encl->mm_lock);
+ list_add_rcu(&encl_mm->list, &encl->mm_list);
+ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
+ smp_wmb();
+ encl->mm_list_version++;
+ spin_unlock(&encl->mm_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
+ * @encl: the enclave
+ *
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. For example, ENCLS[EWB]
+ * copies a page from the enclave page cache to regular main memory but
+ * it fails if it cannot ensure that there are no cached
+ * linear-to-physical address mappings referring to the page.
+ *
+ * SGX hardware flushes all cached linear-to-physical mappings on a CPU
+ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
+ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
+ * address mappings are cleared but coordination with the tracking done within
+ * the SGX hardware is needed to support the SGX functions that depend on this
+ * cache clearing.
+ *
+ * When the ENCLS[ETRACK] function is issued on an enclave the hardware
+ * tracks threads operating inside the enclave at that time. The SGX
+ * hardware tracking requires that all the identified threads have
+ * exited the enclave in order to flush the mappings before a function
+ * such as ENCLS[EWB] is permitted.
+ *
+ * The following flow is used to support SGX functions that require that
+ * no cached linear-to-physical address mappings are present:
+ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
+ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
+ * accessing the enclave.
+ * 3) Send IPI to identified CPUs, kicking them out of the enclave and
+ * thus flushing all locally cached linear-to-physical address mappings.
+ * 4) Execute SGX function.
+ *
+ * Context: It is required to call this function after ENCLS[ETRACK].
+ * This will ensure that if any new mm appears (racing with
+ * sgx_encl_mm_add()) then the new mm will enter into the
+ * enclave with fresh linear-to-physical address mappings.
+ *
+ * It is required that all IPIs are completed before a new
+ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
+ * of the above flow with the enclave's mutex.
+ *
+ * Return: cpumask of CPUs that might be accessing @encl
+ */
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
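+
+/*
+ * Illustrative sketch, not part of this patch, of the four-step flow
+ * documented above (the IPI callback name is hypothetical):
+ *
+ *	mutex_lock(&encl->lock);
+ *	__etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));  // step 1
+ *	on_each_cpu_mask(sgx_encl_cpumask(encl),               // steps 2-3
+ *			 sgx_ipi_cb, NULL, 1);
+ *	// step 4: ENCLS[EWB] may now succeed
+ *	mutex_unlock(&encl->lock);
+ */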
+
+static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
+ pgoff_t index)
+{
+ struct address_space *mapping = encl->backing->f_mapping;
+ gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+ return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+
+/**
+ * __sgx_encl_get_backing() - Pin the backing storage
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Pin the backing storage pages for storing the encrypted contents and Paging
+ * Crypto MetaData (PCMD) of an enclave page.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+ struct page *contents;
+ struct page *pcmd;
+
+ contents = sgx_encl_get_backing_page(encl, page_index);
+ if (IS_ERR(contents))
+ return PTR_ERR(contents);
+
+ pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ if (IS_ERR(pcmd)) {
+ put_page(contents);
+ return PTR_ERR(pcmd);
+ }
+
+ backing->contents = contents;
+ backing->pcmd = pcmd;
+ backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
+
+ return 0;
+}
+
+/*
+ * When called from ksgxd, returns the mem_cgroup of a struct mm stored
+ * in the enclave's mm_list. When not called from ksgxd, just returns
+ * the mem_cgroup of the current task.
+ */
+static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ /*
+ * If called from normal task context, return the mem_cgroup
+ * of the current task's mm. The remainder of the handling is for
+ * ksgxd.
+ */
+ if (!current_is_ksgxd())
+ return get_mem_cgroup_from_mm(current->mm);
+
+ /*
+ * Search the enclave's mm_list to find an mm associated with
+ * this enclave to charge the allocation to.
+ */
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ memcg = get_mem_cgroup_from_mm(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ /*
+ * In the rare case that there isn't an mm associated with
+ * the enclave, set memcg to the current active mem_cgroup.
+ * This will be the root mem_cgroup if there is no active
+ * mem_cgroup.
+ */
+ if (!memcg)
+ return get_mem_cgroup_from_mm(NULL);
+
+ return memcg;
+}
+
+/**
+ * sgx_encl_alloc_backing() - create a new backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * When called from ksgxd, sets the active memcg from one of the
+ * mms in the enclave's mm_list prior to any backing page allocation,
+ * in order to ensure that shmem page allocations are charged to the
+ * enclave. Create a backing page for loading data back into an EPC page with
+ * ELDU. This function takes a reference on a new backing page which
+ * must be dropped with a corresponding call to sgx_encl_put_backing().
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
+ struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
+ int ret;
+
+ ret = __sgx_encl_get_backing(encl, page_index, backing);
+
+ set_active_memcg(memcg);
+ mem_cgroup_put(encl_memcg);
+
+ return ret;
+}
+
+/**
+ * sgx_encl_lookup_backing() - retrieve an existing backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Retrieve a backing page for loading data back into an EPC page with ELDU.
+ * It is the caller's responsibility to ensure that it is appropriate to use
+ * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
+ * not used correctly, this will cause an allocation which is not accounted for.
+ * This function takes a reference on an existing backing page which must be
+ * dropped with a corresponding call to sgx_encl_put_backing().
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ return __sgx_encl_get_backing(encl, page_index, backing);
+}
+
+/**
+ * sgx_encl_put_backing() - Unpin the backing storage
+ * @backing: data for accessing backing storage for the page
+ */
+void sgx_encl_put_backing(struct sgx_backing *backing)
+{
+ put_page(backing->pcmd);
+ put_page(backing->contents);
+}
+
+static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
+ void *data)
+{
+ pte_t pte;
+ int ret;
+
+ ret = pte_young(*ptep);
+ if (ret) {
+ pte = pte_mkold(*ptep);
+ set_pte_at((struct mm_struct *)data, addr, ptep, pte);
+ }
+
+ return ret;
+}
+
+/**
+ * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
+ * @mm: mm_struct that is checked
+ * @page: enclave page to be tested for recent access
+ *
+ * Checks the Access (A) bit from the PTE corresponding to the enclave page and
+ * clears it.
+ *
+ * Return: 1 if the page has been recently accessed and 0 if not.
+ */
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page)
+{
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = sgx_encl_find(mm, addr, &vma);
+ if (ret)
+ return 0;
+
+ if (encl != vma->vm_private_data)
+ return 0;
+
+ ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
+ sgx_encl_test_and_clear_young_cb, vma->vm_mm);
+ if (ret < 0)
+ return 0;
+
+ return ret;
+}
+
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+ * TCS pages must always have RW set for CPU access while the SECINFO
+ * permissions are *always* zero - the CPU ignores the user provided
+ * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+/**
+ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
+ * @encl: the enclave
+ * @addr: page aligned pointer to single page for which PTEs will be removed
+ *
+ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
+ * @addr from each VMA. Ensure that page fault handler is ready to handle
+ * new mappings of @addr before calling this function.
+ */
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
+{
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+}
+
+/**
+ * sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ * @reclaim: Reclaim EPC pages directly if none available. Enclave
+ * mutex should not be held if this is set.
+ *
+ * Allocate a free EPC page and convert it to a Version Array (VA) page.
+ *
+ * Return:
+ * a VA page,
+ * -errno otherwise
+ */
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
+{
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(NULL, reclaim);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ ret = __epa(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
+ sgx_encl_free_epc_page(epc_page);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return epc_page;
+}
+
+/**
+ * sgx_alloc_va_slot - allocate a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Allocates a slot from a &struct sgx_va_page instance.
+ *
+ * Return: offset of the slot inside the VA page
+ */
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ if (slot < SGX_VA_SLOT_COUNT)
+ set_bit(slot, va_page->slots);
+
+ return slot << 3;
+}
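+
+/*
+ * Illustrative example, not part of this patch: a VA page holds 512
+ * 8-byte version slots (512 * 8 = PAGE_SIZE), so bit number and byte
+ * offset convert by a 3-bit shift: slot bit 5 maps to byte offset
+ * 5 << 3 = 40, and sgx_free_va_slot() inverts this with offset >> 3.
+ */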
+
+/**
+ * sgx_free_va_slot - free a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ * @offset: offset of the slot inside the VA page
+ *
+ * Frees a slot from a &struct sgx_va_page instance.
+ */
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
+{
+ clear_bit(offset >> 3, va_page->slots);
+}
+
+/**
+ * sgx_va_page_full - is the VA page full?
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Return: true if all slots have been taken
+ */
+bool sgx_va_page_full(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ return slot == SGX_VA_SLOT_COUNT;
+}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate the page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..8ff47f6652b9
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains the software defined data structures for enclaves.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include "sgx.h"
+
+/* 'desc' bits holding the offset in the VA (version array) page. */
+#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3)
+
+/* 'desc' bit marking that the page is being reclaimed. */
+#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3)
+
+struct sgx_encl_page {
+ unsigned long desc;
+ unsigned long vm_max_prot_bits:8;
+ enum sgx_page_type type:16;
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl *encl;
+ struct sgx_va_page *va_page;
+};
+
+enum sgx_encl_flags {
+ SGX_ENCL_IOCTL = BIT(0),
+ SGX_ENCL_DEBUG = BIT(1),
+ SGX_ENCL_CREATED = BIT(2),
+ SGX_ENCL_INITIALIZED = BIT(3),
+};
+
+struct sgx_encl_mm {
+ struct sgx_encl *encl;
+ struct mm_struct *mm;
+ struct list_head list;
+ struct mmu_notifier mmu_notifier;
+};
+
+struct sgx_encl {
+ unsigned long base;
+ unsigned long size;
+ unsigned long flags;
+ unsigned int page_cnt;
+ unsigned int secs_child_cnt;
+ struct mutex lock;
+ struct xarray page_array;
+ struct sgx_encl_page secs;
+ unsigned long attributes;
+ unsigned long attributes_mask;
+
+ cpumask_t cpumask;
+ struct file *backing;
+ struct kref refcount;
+ struct list_head va_pages;
+ unsigned long mm_list_version;
+ struct list_head mm_list;
+ spinlock_t mm_lock;
+ struct srcu_struct srcu;
+};
+
+#define SGX_VA_SLOT_COUNT 512
+
+struct sgx_va_page {
+ struct sgx_epc_page *epc_page;
+ DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT);
+ struct list_head list;
+};
+
+struct sgx_backing {
+ struct page *contents;
+ struct page *pcmd;
+ unsigned long pcmd_offset;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **vma)
+{
+ struct vm_area_struct *result;
+
+ result = vma_lookup(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops)
+ return -EINVAL;
+
+ *vma = result;
+
+ return 0;
+}
+
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, vm_flags_t vm_flags);
+
+bool current_is_ksgxd(void);
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+void sgx_encl_put_backing(struct sgx_backing *backing);
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page);
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags);
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr);
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim);
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
+bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr);
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
new file mode 100644
index 000000000000..74be751199a4
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_ENCLS_H
+#define _X86_ENCLS_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/traps.h>
+#include "sgx.h"
+
+/* Retrieve the encoded trapnr from the specified return code. */
+#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG)
+
+/* Issue a WARN() about an ENCLS function. */
+#define ENCLS_WARN(r, name) \
+ do { \
+ int _r = (r); \
+ WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \
+ } while (0)
+
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & SGX_ENCLS_FAULT_FLAG;
+}
+
+/**
+ * encls_failed() - Check if an ENCLS function failed
+ * @ret: the return value of an ENCLS function call
+ *
+ * Check if an ENCLS function failed. This happens when the function causes a
+ * fault that is not caused by an EPCM conflict or when the function returns a
+ * non-zero value.
+ */
+static inline bool encls_failed(int ret)
+{
+ if (encls_faulted(ret))
+ return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
+
+ return !!ret;
+}
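+
+/*
+ * Typical caller pattern (an illustrative sketch): a wrapper's return value
+ * encodes either a fault or an SGX error code, and callers discriminate with
+ * the helpers above:
+ *
+ * ret = __einit(sigstruct, token, secs);
+ * if (encls_faulted(ret))
+ * trapnr = ENCLS_TRAPNR(ret);
+ * else if (ret)
+ * sgx_err = ret;
+ */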
+
+/**
+ * __encls_ret_N - encode an ENCLS function that returns an error code in EAX
+ * @rax: function number
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE.
+ * And because SGX isn't complex enough as it is, functions that return an error
+ * code also modify flags.
+ *
+ * Return:
+ * 0 on success,
+ * SGX error code on failure
+ */
+#define __encls_ret_N(rax, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: encls\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
+ : "=a"(ret) \
+ : "a"(rax), inputs \
+ : "memory", "cc"); \
+ ret; \
+ })
+
+#define __encls_ret_1(rax, rcx) \
+ ({ \
+ __encls_ret_N(rax, "c"(rcx)); \
+ })
+
+#define __encls_ret_2(rax, rbx, rcx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_ret_3(rax, rbx, rcx, rdx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \
+ })
+
+/**
+ * __encls_N - encode an ENCLS function that doesn't return an error code
+ * @rax: function number
+ * @rbx_out: optional output variable
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that does not return an error code, e.g.
+ * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an
+ * optional parameter for use by EDGBRD, which returns the requested value in
+ * RBX.
+ *
+ * Return:
+ * 0 on success,
+ * trapnr with SGX_ENCLS_FAULT_FLAG set on fault
+ */
+#define __encls_N(rax, rbx_out, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: encls\n\t" \
+ "xor %%eax,%%eax\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
+ : "=a"(ret), "=b"(rbx_out) \
+ : "a"(rax), inputs \
+ : "memory"); \
+ ret; \
+ })
+
+#define __encls_2(rax, rbx, rcx) \
+ ({ \
+ unsigned long ign_rbx_out; \
+ __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_1_1(rax, data, rcx) \
+ ({ \
+ unsigned long rbx_out; \
+ int ret = __encls_N(rax, rbx_out, "c"(rcx)); \
+ if (!ret) \
+ data = rbx_out; \
+ ret; \
+ })
+
+/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */
+static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
+{
+ return __encls_2(ECREATE, pginfo, secs);
+}
+
+/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */
+static inline int __eextend(void *secs, void *addr)
+{
+ return __encls_2(EEXTEND, secs, addr);
+}
+
+/*
+ * Associate an EPC page with an enclave, either as a REG or TCS page
+ * populated with the provided data.
+ */
+static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EADD, pginfo, addr);
+}
+
+/* Finalize enclave build, initialize enclave for user code execution. */
+static inline int __einit(void *sigstruct, void *token, void *secs)
+{
+ return __encls_ret_3(EINIT, sigstruct, secs, token);
+}
+
+/* Disassociate EPC page from its enclave and mark it as unused. */
+static inline int __eremove(void *addr)
+{
+ return __encls_ret_1(EREMOVE, addr);
+}
+
+/* Copy data to an EPC page belonging to a debug enclave. */
+static inline int __edbgwr(void *addr, unsigned long *data)
+{
+ return __encls_2(EDGBWR, *data, addr);
+}
+
+/* Copy data from an EPC page belonging to a debug enclave. */
+static inline int __edbgrd(void *addr, unsigned long *data)
+{
+ return __encls_1_1(EDGBRD, *data, addr);
+}
+
+/* Track that software has completed the required TLB address clears. */
+static inline int __etrack(void *addr)
+{
+ return __encls_ret_1(ETRACK, addr);
+}
+
+/* Load, verify, and unblock an EPC page. */
+static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(ELDU, pginfo, addr, va);
+}
+
+/* Make EPC page inaccessible to enclave, ready to be written to memory. */
+static inline int __eblock(void *addr)
+{
+ return __encls_ret_1(EBLOCK, addr);
+}
+
+/* Initialize an EPC page into a Version Array (VA) page. */
+static inline int __epa(void *addr)
+{
+ unsigned long rbx = SGX_PAGE_TYPE_VA;
+
+ return __encls_2(EPA, rbx, addr);
+}
+
+/* Invalidate an EPC page and write it out to main memory. */
+static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(EWB, pginfo, addr, va);
+}
+
+/* Restrict the EPCM permissions of an EPC page. */
+static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODPR, secinfo, addr);
+}
+
+/* Change the type of an EPC page. */
+static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODT, secinfo, addr);
+}
+
+/* Zero a page of EPC memory and add it to an initialized enclave. */
+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EAUG, pginfo, addr);
+}
+
+/* Attempt to update CPUSVN at runtime. */
+static inline int __eupdatesvn(void)
+{
+ return __encls_ret_1(EUPDATESVN, "");
+}
+#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
new file mode 100644
index 000000000000..66f1efa16fbb
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <asm/mman.h>
+#include <asm/sgx.h>
+#include <crypto/sha2.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim)
+{
+ struct sgx_va_page *va_page = NULL;
+ void *err;
+
+ BUILD_BUG_ON(SGX_VA_SLOT_COUNT !=
+ (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1);
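+ /*
+ * The BUILD_BUG_ON above holds because GENMASK_ULL(11, 3) >> 3 is
+ * 0x1ff, so the VA offset field in 'desc' can index exactly
+ * 0x1ff + 1 == 512 slots, matching SGX_VA_SLOT_COUNT.
+ */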
+
+ if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) {
+ va_page = kzalloc(sizeof(*va_page), GFP_KERNEL);
+ if (!va_page)
+ return ERR_PTR(-ENOMEM);
+
+ va_page->epc_page = sgx_alloc_va_page(reclaim);
+ if (IS_ERR(va_page->epc_page)) {
+ err = ERR_CAST(va_page->epc_page);
+ kfree(va_page);
+ return err;
+ }
+
+ WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT);
+ }
+ encl->page_cnt++;
+ return va_page;
+}
+
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+{
+ encl->page_cnt--;
+
+ if (va_page) {
+ sgx_encl_free_epc_page(va_page->epc_page);
+ list_del(&va_page->list);
+ kfree(va_page);
+ }
+}
+
+static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+ struct sgx_epc_page *secs_epc;
+ struct sgx_va_page *va_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_secinfo secinfo;
+ unsigned long encl_size;
+ struct file *backing;
+ long ret;
+
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
+ va_page = sgx_encl_grow(encl, true);
+ if (IS_ERR(va_page))
+ return PTR_ERR(va_page);
+ else if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+ /* else the tail page of the VA page list had free slots. */
+
+ /* The extra page goes to SECS. */
+ encl_size = secs->size + PAGE_SIZE;
+
+ backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
+ VM_NORESERVE);
+ if (IS_ERR(backing)) {
+ ret = PTR_ERR(backing);
+ goto err_out_shrink;
+ }
+
+ encl->backing = backing;
+
+ secs_epc = sgx_alloc_epc_page(&encl->secs, true);
+ if (IS_ERR(secs_epc)) {
+ ret = PTR_ERR(secs_epc);
+ goto err_out_backing;
+ }
+
+ encl->secs.epc_page = secs_epc;
+
+ pginfo.addr = 0;
+ pginfo.contents = (unsigned long)secs;
+ pginfo.metadata = (unsigned long)&secinfo;
+ pginfo.secs = 0;
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc));
+ if (ret) {
+ ret = -EIO;
+ goto err_out;
+ }
+
+ if (secs->attributes & SGX_ATTR_DEBUG)
+ set_bit(SGX_ENCL_DEBUG, &encl->flags);
+
+ encl->secs.encl = encl;
+ encl->secs.type = SGX_PAGE_TYPE_SECS;
+ encl->base = secs->base;
+ encl->size = secs->size;
+ encl->attributes = secs->attributes;
+ encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;
+
+ /* Set only after completion, as encl->lock has not been taken. */
+ set_bit(SGX_ENCL_CREATED, &encl->flags);
+
+ return 0;
+
+err_out:
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+err_out_backing:
+ fput(encl->backing);
+ encl->backing = NULL;
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @encl: An enclave pointer.
+ * @arg: The ioctl argument.
+ *
+ * Allocate kernel data structures for the enclave and invoke ECREATE.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EIO: ECREATE failed.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_create create_arg;
+ void *secs;
+ int ret;
+
+ if (test_bit(SGX_ENCL_CREATED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&create_arg, arg, sizeof(create_arg)))
+ return -EFAULT;
+
+ secs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!secs)
+ return -ENOMEM;
+
+ if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE))
+ ret = -EFAULT;
+ else
+ ret = sgx_encl_create(encl, secs);
+
+ kfree(secs);
+ return ret;
+}
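+
+/*
+ * Illustrative userspace call for the handler above (a sketch; 'encl_fd' and
+ * 'secs_buf' are hypothetical names for an open /dev/sgx_enclave fd and a
+ * page-sized SECS buffer):
+ *
+ * struct sgx_enclave_create create = { .src = (__u64)secs_buf };
+ *
+ * ioctl(encl_fd, SGX_IOC_ENCLAVE_CREATE, &create);
+ */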
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+ u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+ u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+ if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+ return -EINVAL;
+
+ if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+ return -EINVAL;
+
+ /*
+ * The CPU silently overwrites TCS permissions with zero, which means
+ * that we need to validate them ourselves.
+ */
+ if (pt == SGX_SECINFO_TCS && perm)
+ return -EINVAL;
+
+ if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+ return -EINVAL;
+
+ if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+ struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_secinfo *secinfo, unsigned long src)
+{
+ struct sgx_pageinfo pginfo;
+ struct vm_area_struct *vma;
+ struct page *src_page;
+ int ret;
+
+ /* Deny noexec. */
+ vma = find_vma(current->mm, src);
+ if (!vma)
+ return -EFAULT;
+
+ if (!(vma->vm_flags & VM_MAYEXEC))
+ return -EACCES;
+
+ ret = get_user_pages(src, 1, 0, &src_page);
+ if (ret < 1)
+ return -EFAULT;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = (unsigned long)secinfo;
+ pginfo.contents = (unsigned long)kmap_local_page(src_page);
+
+ ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
+
+ kunmap_local((void *)pginfo.contents);
+ put_page(src_page);
+
+ return ret ? -EIO : 0;
+}
+
+/*
+ * If the caller requires measurement of the page as proof of its content,
+ * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
+ * operation until the entire page is measured.
+ */
+static int __sgx_encl_extend(struct sgx_encl *encl,
+ struct sgx_epc_page *epc_page)
+{
+ unsigned long offset;
+ int ret;
+
+ for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) {
+ ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
+ sgx_get_epc_virt_addr(epc_page) + offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EEXTEND");
+
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
+ unsigned long offset, struct sgx_secinfo *secinfo,
+ unsigned long flags)
+{
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ int ret;
+
+ encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
+ if (IS_ERR(encl_page))
+ return PTR_ERR(encl_page);
+
+ epc_page = sgx_alloc_epc_page(encl_page, true);
+ if (IS_ERR(epc_page)) {
+ kfree(encl_page);
+ return PTR_ERR(epc_page);
+ }
+
+ va_page = sgx_encl_grow(encl, true);
+ if (IS_ERR(va_page)) {
+ ret = PTR_ERR(va_page);
+ goto err_out_free;
+ }
+
+ mmap_read_lock(current->mm);
+ mutex_lock(&encl->lock);
+
+ /*
+ * Adding to encl->va_pages must be done under encl->lock. Ditto for
+ * deleting (via sgx_encl_shrink()) in the error path.
+ */
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ /*
+ * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e.
+ * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+ * to userspace errors (or kernel/hardware bugs).
+ */
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ if (ret)
+ goto err_out_unlock;
+
+ ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+ src);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Complete the "add" before doing the "extend" so that the "add"
+ * isn't in a half-baked state in the extremely unlikely scenario
+ * that the enclave is destroyed in response to EEXTEND failure.
+ */
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8;
+ encl->secs_child_cnt++;
+
+ if (flags & SGX_PAGE_MEASURE) {
+ ret = __sgx_encl_extend(encl, epc_page);
+ if (ret)
+ goto err_out;
+ }
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+ return ret;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+ sgx_encl_shrink(encl, va_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+
+err_out_free:
+ sgx_encl_free_epc_page(epc_page);
+ kfree(encl_page);
+
+ return ret;
+}
+
+/*
+ * Ensure user provided offset and length values are valid for
+ * an enclave.
+ */
+static int sgx_validate_offset_length(struct sgx_encl *encl,
+ unsigned long offset,
+ unsigned long length)
+{
+ if (!IS_ALIGNED(offset, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!length || !IS_ALIGNED(length, PAGE_SIZE))
+ return -EINVAL;
+
+ if (offset + length < offset)
+ return -EINVAL;
+
+ if (offset + length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ return 0;
+}
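+
+/*
+ * Worked example (illustrative values): for a 2 MiB enclave, offset 0x1ff000
+ * with length 0x2000 passes the alignment and overflow checks but fails the
+ * last one, since offset + length - PAGE_SIZE == 0x200000 == encl->size, i.e.
+ * the final page would start outside the enclave.
+ */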
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl: an enclave pointer
+ * @arg: a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add one or more pages to an uninitialized enclave, and optionally extend the
+ * measurement with the contents of the page. The SECINFO and measurement mask
+ * are applied to all pages.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * the CPU silently zeroes them. Allowing anything else would cause a mismatch
+ * in the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * The function deinitializes kernel data structures for the enclave and
+ * returns -EIO in any of the following conditions:
+ *
+ * - Enclave Page Cache (EPC), the physical memory holding enclaves, has
+ * been invalidated. This will cause EADD and EEXTEND to fail.
+ * - If the source address is corrupted somehow when executing EADD.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EACCES: The source page is located in a noexec partition.
+ * - -ENOMEM: Out of EPC pages.
+ * - -EINTR: The call was interrupted before data was processed.
+ * - -EIO: Either EADD or EEXTEND failed because of an invalid source
+ * address or a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_add_pages add_arg;
+ struct sgx_secinfo secinfo;
+ unsigned long c;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE))
+ return -EINVAL;
+
+ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length))
+ return -EINVAL;
+
+ if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
+ sizeof(secinfo)))
+ return -EFAULT;
+
+ if (sgx_validate_secinfo(&secinfo))
+ return -EINVAL;
+
+ for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) {
+ if (signal_pending(current)) {
+ if (!c)
+ ret = -ERESTARTSYS;
+
+ break;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+ ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
+ &secinfo, add_arg.flags);
+ if (ret)
+ break;
+ }
+
+ add_arg.count = c;
+
+ if (copy_to_user(arg, &add_arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ return ret;
+}
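+
+/*
+ * Illustrative userspace usage of the ioctl above (UAPI field names; 'encl_fd'
+ * and 'page_buf' are hypothetical):
+ *
+ * struct sgx_enclave_add_pages add = {
+ * .src = (__u64)page_buf,
+ * .offset = 0,
+ * .length = PAGE_SIZE,
+ * .secinfo = (__u64)&secinfo,
+ * .flags = SGX_PAGE_MEASURE,
+ * };
+ *
+ * ioctl(encl_fd, SGX_IOC_ENCLAVE_ADD_PAGES, &add);
+ *
+ * On return, add.count reports how many bytes were processed.
+ */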
+
+static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+ void *token)
+{
+ u64 mrsigner[4];
+ int i, j;
+ void *addr;
+ int ret;
+
+ /*
+ * Deny initializing enclaves with attributes (namely provisioning)
+ * that have not been explicitly allowed.
+ */
+ if (encl->attributes & ~encl->attributes_mask)
+ return -EACCES;
+
+ /*
+ * Attributes should not be enforced *only* against what's available on
+ * the platform (done in sgx_encl_create) but also checked and enforced
+ * against the enforcement mask in the sigstruct. For example, an enclave
+ * could opt to sign with the AVX bit in xfrm, but still be loadable on a
+ * platform without it if sigstruct->body.attributes_mask does not turn
+ * that bit on.
+ */
+ if (sigstruct->body.attributes & sigstruct->body.attributes_mask &
+ sgx_attributes_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.miscselect & sigstruct->body.misc_mask &
+ sgx_misc_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask &
+ sgx_xfrm_reserved_mask)
+ return -EINVAL;
+
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
+
+ mutex_lock(&encl->lock);
+
+ /*
+ * ENCLS[EINIT] is interruptible because it has such a high latency,
+ * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending,
+ * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be
+ * serviced.
+ */
+ for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+ for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+ addr = sgx_get_epc_virt_addr(encl->secs.epc_page);
+
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(mrsigner);
+
+ ret = __einit(sigstruct, token, addr);
+
+ preempt_enable();
+
+ if (ret == SGX_UNMASKED_EVENT)
+ continue;
+ else
+ break;
+ }
+
+ if (ret != SGX_UNMASKED_EVENT)
+ break;
+
+ msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ goto err_out;
+ }
+ }
+
+ if (encls_faulted(ret)) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EINIT");
+
+ ret = -EIO;
+ } else if (ret) {
+ pr_debug("EINIT returned %d\n", ret);
+ ret = -EPERM;
+ } else {
+ set_bit(SGX_ENCL_INITIALIZED, &encl->flags);
+ }
+
+err_out:
+ mutex_unlock(&encl->lock);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_init instance
+ *
+ * Flush any outstanding enqueued EADD operations and perform EINIT. The
+ * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EPERM: Invalid SIGSTRUCT.
+ * - -EIO: EINIT failed because of a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_sigstruct *sigstruct;
+ struct sgx_enclave_init init_arg;
+ void *token;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
+ return -EFAULT;
+
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
+ return -ENOMEM;
+
+ token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
+ memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
+
+ if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct,
+ sizeof(*sigstruct))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * A legacy field used with Intel signed enclaves. These values used to
+ * distinguish regular and architectural enclaves. The CPU accepts only
+ * these values, but they do not have any other meaning.
+ *
+ * Thus, reject any other values.
+ */
+ if (sigstruct->header.vendor != 0x0000 &&
+ sigstruct->header.vendor != 0x8086) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sgx_encl_init(encl, sigstruct, token);
+
+out:
+ kfree(sigstruct);
+ return ret;
+}
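+
+/*
+ * Illustrative userspace call for the handler above (a sketch; 'sig_buf' is a
+ * hypothetical buffer holding the SIGSTRUCT):
+ *
+ * struct sgx_enclave_init init = { .sigstruct = (__u64)sig_buf };
+ *
+ * ioctl(encl_fd, SGX_IOC_ENCLAVE_INIT, &init);
+ */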
+
+/**
+ * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_provision instance
+ *
+ * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to
+ * /dev/sgx_provision.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_provision params;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
+}
+
+/*
+ * Ensure enclave is ready for SGX2 functions. Readiness is checked
+ * by ensuring the hardware supports SGX2 and the enclave is initialized
+ * and thus able to handle requests to modify pages within it.
+ */
+static int sgx_ioc_sgx2_ready(struct sgx_encl *encl)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_SGX2)))
+ return -ENODEV;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. Collaborate with
+ * hardware via ENCLS[ETRACK] to ensure that all cached
+ * linear-to-physical address mappings belonging to all threads of
+ * the enclave are cleared. See sgx_encl_cpumask() for details.
+ *
+ * Must be called with enclave's mutex held from the time the
+ * SGX function requiring that no cached linear-to-physical mappings
+ * are present is executed until this ETRACK flow is complete.
+ */
+static int sgx_enclave_etrack(struct sgx_encl *encl)
+{
+ void *epc_virt;
+ int ret;
+
+ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ /*
+ * ETRACK only fails when there is an OS issue. For
+ * example, two consecutive ETRACKs were issued without
+ * a completing IPI between them.
+ */
+ pr_err_once("ETRACK returned %d (0x%x)", ret, ret);
+ /*
+ * Send IPIs to kick CPUs out of the enclave and
+ * try ETRACK again.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ pr_err_once("ETRACK repeat returned %d (0x%x)",
+ ret, ret);
+ return -EFAULT;
+ }
+ }
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+
+ return 0;
+}
+
+/**
+ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions
+ * @encl: Enclave to which the pages belong.
+ * @modp: Checked parameters from user on which pages need modifying and
+ * their new permissions.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long
+sgx_enclave_restrict_permissions(struct sgx_encl *encl,
+ struct sgx_enclave_restrict_permissions *modp)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK;
+
+ for (c = 0 ; c < modp->length; c += PAGE_SIZE) {
+ addr = encl->base + modp->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Changing EPCM permissions is only supported on regular
+ * SGX pages. Attempting this change on other pages will
+ * result in #PF.
+ */
+ if (entry->type != SGX_PAGE_TYPE_REG) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Apart from ensuring that read-access remains, do not verify
+ * the permission bits requested. The kernel has no control over
+ * how EPCM permissions can be relaxed from within the enclave.
+ * ENCLS[EMODPR] can only remove existing EPCM permissions;
+ * attempts to set new permissions are ignored by the
+ * hardware.
+ */
+
+ /* Change EPCM permissions. */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * permissions of a regular page, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these
+ * are protected with mutex.
+ */
+ pr_err_once("EMODPR encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ if (encls_failed(ret)) {
+ modp->result = ret;
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modp->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_restrict_permissions() - handler for
+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions
+ * instance
+ *
+ * SGX2 distinguishes between relaxing and restricting the enclave page
+ * permissions maintained by the hardware (EPCM permissions) of pages
+ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT).
+ *
+ * EPCM permissions cannot be restricted from within the enclave; the enclave
+ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR]
+ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call
+ * will be ignored by the hardware.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_restrict_permissions params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK)
+ return -EINVAL;
+
+ /*
+ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR]
+ * from faulting later when the CPU does the same check.
+ */
+ if ((params.permissions & SGX_SECINFO_W) &&
+ !(params.permissions & SGX_SECINFO_R))
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_restrict_permissions(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_enclave_modify_types() - Modify type of SGX enclave pages
+ * @encl: Enclave to which the pages belong.
+ * @modt: Checked parameters from user about which pages need modifying
+ * and their new page type.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_enclave_modify_types(struct sgx_encl *encl,
+ struct sgx_enclave_modify_types *modt)
+{
+ unsigned long max_prot_restore;
+ enum sgx_page_type page_type;
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long prot;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ page_type = modt->page_type & SGX_PAGE_TYPE_MASK;
+
+ /*
+ * The only new page types allowed by hardware are PT_TCS and PT_TRIM.
+ */
+ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM)
+ return -EINVAL;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ secinfo.flags = page_type << 8;
+
+ for (c = 0 ; c < modt->length; c += PAGE_SIZE) {
+ addr = encl->base + modt->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Borrow the logic from the Intel SDM. Regular pages
+ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS
+ * or SGX_PAGE_TYPE_TRIM, but TCS pages can only be trimmed.
+ * CET pages are not supported yet.
+ */
+ if (!(entry->type == SGX_PAGE_TYPE_REG ||
+ (entry->type == SGX_PAGE_TYPE_TCS &&
+ page_type == SGX_PAGE_TYPE_TRIM))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ max_prot_restore = entry->vm_max_prot_bits;
+
+ /*
+ * Once a regular page becomes a TCS page it cannot be
+ * changed back. So the maximum allowed protection reflects
+ * the TCS page that is always RW from the kernel's perspective
+ * but inaccessible from within the enclave. Before doing
+ * so, make sure that the new page type continues to
+ * respect the originally vetted page permissions.
+ */
+ if (entry->type == SGX_PAGE_TYPE_REG &&
+ page_type == SGX_PAGE_TYPE_TCS) {
+ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ prot = PROT_READ | PROT_WRITE;
+ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ /*
+ * Prevent page from being reclaimed while mutex
+ * is released.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EAGAIN;
+ goto out_entry_changed;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_mark_page_reclaimable(entry->epc_page);
+ }
+
+ /* Change EPC type */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodt(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * valid page types, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these are
+ * protected with mutex.
+ */
+ pr_err_once("EMODT encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+ if (encls_failed(ret)) {
+ modt->result = ret;
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ entry->type = page_type;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_entry_changed:
+ entry->vm_max_prot_bits = max_prot_restore;
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modt->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance
+ *
+ * Ability to change the enclave page type supports the following use cases:
+ *
+ * * It is possible to add TCS pages to an enclave by changing the type of
+ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages.
+ * With this support the number of threads supported by an initialized
+ * enclave can be increased dynamically.
+ *
+ * * Regular or TCS pages can dynamically be removed from an initialized
+ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the
+ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual
+ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called
+ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the
+ * enclave.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_modify_types params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.page_type & ~SGX_PAGE_TYPE_MASK)
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_modify_types(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave
+ * @encl: Enclave to which the pages belong
+ * @params: Checked parameters from user on which pages need to be removed
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_encl_remove_pages(struct sgx_encl *encl,
+ struct sgx_enclave_remove_pages *params)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+
+ for (c = 0 ; c < params->length; c += PAGE_SIZE) {
+ addr = encl->base + params->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entry->type != SGX_PAGE_TYPE_TRIM) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ /*
+ * ENCLS[EMODPR] with RWX is used here as a no-op probe to learn
+ * whether ENCLU[EACCEPT] was run from within the enclave. If
+ * ENCLS[EMODPR] is run with RWX on a trimmed page that has not
+ * yet been accepted, it returns %SGX_PAGE_NOT_MODIFIABLE; after
+ * the trimmed page is accepted, the instruction encounters a
+ * page fault.
+ */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ xa_erase(&encl->page_array, PFN_DOWN(entry->desc));
+ sgx_encl_shrink(encl, NULL);
+ kfree(entry);
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ params->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance
+ *
+ * Final step of the flow removing pages from an initialized enclave. The
+ * complete flow is:
+ *
+ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM
+ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl().
+ * 2) User approves the page removal by running ENCLU[EACCEPT] from within
+ * the enclave.
+ * 3) User initiates actual page removal using the
+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here.
+ *
+ * First remove any page table entries pointing to the page and then proceed
+ * with the actual removal of the enclave page and data in support of it.
+ *
+ * VA pages are not affected by this removal. It is thus possible that the
+ * enclave may end up with more VA pages than needed to support all its
+ * pages.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_remove_pages params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.count)
+ return -EINVAL;
+
+ ret = sgx_encl_remove_pages(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
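+/*
+ * Typical build flow through this dispatcher (for orientation, not new
+ * semantics): SGX_IOC_ENCLAVE_CREATE once, SGX_IOC_ENCLAVE_ADD_PAGES per
+ * region, then SGX_IOC_ENCLAVE_INIT; the SGX2 ioctls apply only to an
+ * initialized enclave. The SGX_ENCL_IOCTL flag serializes these per enclave.
+ */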
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct sgx_encl *encl = filep->private_data;
+ int ret;
+
+ if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags))
+ return -EBUSY;
+
+ switch (cmd) {
+ case SGX_IOC_ENCLAVE_CREATE:
+ ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_ADD_PAGES:
+ ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_INIT:
+ ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_PROVISION:
+ ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS:
+ ret = sgx_ioc_enclave_restrict_permissions(encl,
+ (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_MODIFY_TYPES:
+ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_REMOVE_PAGES:
+ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ clear_bit(SGX_ENCL_IOCTL, &encl->flags);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
new file mode 100644
index 000000000000..dc73194416ac
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/kvm_types.h>
+#include <linux/miscdevice.h>
+#include <linux/node.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#include <asm/msr.h>
+#include <asm/sgx.h>
+#include <asm/archrandom.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+static int sgx_nr_epc_sections;
+static struct task_struct *ksgxd_tsk;
+static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
+
+/*
+ * These variables are part of the state of the reclaimer, and must be accessed
+ * with sgx_reclaimer_lock acquired.
+ */
+static LIST_HEAD(sgx_active_page_list);
+static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
+/*
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * that precede their child pages in the input list are left intact.
+ *
+ * Return 0 when sanitization was successful or kthread was stopped, and the
+ * number of unsanitized pages otherwise.
+ */
+static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
+{
+ unsigned long left_dirty = 0;
+ struct sgx_epc_page *page;
+ LIST_HEAD(dirty);
+ int ret;
+
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
+ if (kthread_should_stop())
+ return 0;
+
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
+ list_move_tail(&page->list, &dirty);
+ left_dirty++;
+ }
+
+ cond_resched();
+ }
+
+ list_splice(&dirty, dirty_page_list);
+ return left_dirty;
+}
+
+static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ struct sgx_encl *encl = page->encl;
+ struct sgx_encl_mm *encl_mm;
+ bool ret = true;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+ ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ if (!ret)
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ if (!ret)
+ return false;
+
+ return true;
+}
+
+static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ int ret;
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ ret = __eblock(sgx_get_epc_virt_addr(epc_page));
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EBLOCK");
+
+ mutex_unlock(&encl->lock);
+}
+
+static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
+ struct sgx_backing *backing)
+{
+ struct sgx_pageinfo pginfo;
+ int ret;
+
+ pginfo.addr = 0;
+ pginfo.secs = 0;
+
+ pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
+ backing->pcmd_offset;
+
+ ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
+
+ kunmap_local((void *)(unsigned long)(pginfo.metadata -
+ backing->pcmd_offset));
+ kunmap_local((void *)(unsigned long)pginfo.contents);
+
+ return ret;
+}
+
+void sgx_ipi_cb(void *info)
+{
+}
+
+/*
+ * Swap a page out to regular memory after it has been transformed to the
+ * blocked state with EBLOCK, which means that it can no longer be referenced
+ * (no new TLB entries).
+ *
+ * The first attempt just tries to write the page, assuming that some other
+ * thread has already reset the thread count for the enclave with ETRACK and
+ * the previous count has been zeroed out. The second attempt calls ETRACK
+ * before EWB. If that fails too, kick all the HW threads out, and then do EWB,
+ * which should be guaranteed to succeed.
+ */
+static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_va_page *va_page;
+ unsigned int va_offset;
+ void *va_slot;
+ int ret;
+
+ encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
+
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ va_offset = sgx_alloc_va_slot(va_page);
+ va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
+ if (sgx_va_page_full(va_page))
+ list_move_tail(&va_page->list, &encl->va_pages);
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ETRACK");
+ }
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ /*
+ * Slow path, send IPIs to kick cpus out of the
+ * enclave. Note, it's imperative that the cpu
+ * mask is generated *after* ETRACK, else we'll
+ * miss cpus that entered the enclave between
+ * generating the mask and incrementing epoch.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl),
+ sgx_ipi_cb, NULL, 1);
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ }
+ }
+
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EWB");
+
+ sgx_free_va_slot(va_page, va_offset);
+ } else {
+ encl_page->desc |= va_offset;
+ encl_page->va_page = va_page;
+ }
+}
+
+static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_backing secs_backing;
+ int ret;
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_ewb(epc_page, backing);
+ encl_page->epc_page = NULL;
+ encl->secs_child_cnt--;
+ sgx_encl_put_backing(backing);
+
+ if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
+ ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
+ &secs_backing);
+ if (ret)
+ goto out;
+
+ sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
+
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+ sgx_encl_put_backing(&secs_backing);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+}
+
+/*
+ * Take a fixed number of pages from the head of the active page pool and
+ * reclaim them to the enclave's private shmem files. Skip pages that have
+ * been accessed since the last scan, moving them to the tail of the active
+ * page pool so that pages get scanned in an LRU-like fashion.
+ *
+ * Batch-process a chunk of pages (at the moment 16) in order to reduce the
+ * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() amortizes
+ * the cost among the HW threads with a three-stage EWB pipeline (EWB, ETRACK
+ * + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a time
+ * would also be problematic, as it would increase lock contention too much
+ * and halt forward progress.
+ */
+static void sgx_reclaim_pages(void)
+{
+ struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
+ struct sgx_backing backing[SGX_NR_TO_SCAN];
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ pgoff_t page_index;
+ int cnt = 0;
+ int ret;
+ int i;
+
+ spin_lock(&sgx_reclaimer_lock);
+ for (i = 0; i < SGX_NR_TO_SCAN; i++) {
+ if (list_empty(&sgx_active_page_list))
+ break;
+
+ epc_page = list_first_entry(&sgx_active_page_list,
+ struct sgx_epc_page, list);
+ list_del_init(&epc_page->list);
+ encl_page = epc_page->owner;
+
+ if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
+ chunk[cnt++] = epc_page;
+ else
+ /* The owner is freeing the page. No need to add the
+ * page back to the list of reclaimable pages.
+ */
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ encl_page = epc_page->owner;
+
+ if (!sgx_reclaimer_age(epc_page))
+ goto skip;
+
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+
+ mutex_lock(&encl_page->encl->lock);
+ ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
+ if (ret) {
+ mutex_unlock(&encl_page->encl->lock);
+ goto skip;
+ }
+
+ encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
+ mutex_unlock(&encl_page->encl->lock);
+ continue;
+
+skip:
+ spin_lock(&sgx_reclaimer_lock);
+ list_add_tail(&epc_page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+
+ chunk[i] = NULL;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (epc_page)
+ sgx_reclaimer_block(epc_page);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (!epc_page)
+ continue;
+
+ encl_page = epc_page->owner;
+ sgx_reclaimer_write(epc_page, &backing[i]);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+
+ sgx_free_epc_page(epc_page);
+ }
+}
+
+static bool sgx_should_reclaim(unsigned long watermark)
+{
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+ !list_empty(&sgx_active_page_list);
+}
+
+/*
+ * sgx_reclaim_direct() should be called (without enclave's mutex held)
+ * in locations where SGX memory resources might be low and might be
+ * needed in order to make forward progress.
+ */
+void sgx_reclaim_direct(void)
+{
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ sgx_reclaim_pages();
+}
+
+static int ksgxd(void *p)
+{
+ set_freezable();
+
+ /*
+ * Sanitize pages in order to recover from kexec(). The 2nd pass is
+ * required for SECS pages, whose child pages blocked EREMOVE.
+ */
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_freezable(ksgxd_waitq,
+ kthread_should_stop() ||
+ sgx_should_reclaim(SGX_NR_HIGH_PAGES));
+
+ if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
+ sgx_reclaim_pages();
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static bool __init sgx_page_reclaimer_init(void)
+{
+ struct task_struct *tsk;
+
+ tsk = kthread_run(ksgxd, NULL, "ksgxd");
+ if (IS_ERR(tsk))
+ return false;
+
+ ksgxd_tsk = tsk;
+
+ return true;
+}
+
+bool current_is_ksgxd(void)
+{
+ return current == ksgxd_tsk;
+}
+
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
+{
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
+
+ spin_lock(&node->lock);
+
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
+ return NULL;
+ }
+
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
+ list_del_init(&page->list);
+ page->flags = 0;
+
+ spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
+
+ return page;
+}
+
+/**
+ * __sgx_alloc_epc_page() - Allocate an EPC page
+ *
+ * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
+ * from the NUMA node where the caller is executing.
+ *
+ * Return:
+ * - an EPC page: a free EPC page was available.
+ * - ERR_PTR(-ENOMEM): out of EPC pages.
+ */
+struct sgx_epc_page *__sgx_alloc_epc_page(void)
+{
+ struct sgx_epc_page *page;
+ int nid_of_current = numa_node_id();
+ int nid_start, nid;
+
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+
+ nid = nid_start;
+ do {
+ page = __sgx_alloc_epc_page_from_node(nid);
+ if (page)
+ return page;
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * sgx_mark_page_reclaimable() - Mark a page as reclaimable
+ * @page: EPC page
+ *
+ * Mark a page as reclaimable and add it to the active page list. Pages
+ * are automatically removed from the active list when freed.
+ */
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ list_add_tail(&page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+}
+
+/**
+ * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
+ * @page: EPC page
+ *
+ * Clear the reclaimable flag and remove the page from the active page list.
+ *
+ * Return:
+ * 0 on success,
+ * -EBUSY if the page is in the process of being reclaimed
+ */
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
+ /* The page is being reclaimed. */
+ if (list_empty(&page->list)) {
+ spin_unlock(&sgx_reclaimer_lock);
+ return -EBUSY;
+ }
+
+ list_del(&page->list);
+ page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_alloc_epc_page() - Allocate an EPC page
+ * @owner: the owner of the EPC page
+ * @reclaim: reclaim pages if necessary
+ *
+ * Iterate through EPC sections and borrow a free EPC page for the caller. When a
+ * page is no longer needed it must be released with sgx_free_epc_page(). If
+ * @reclaim is set to true, directly reclaim pages when we are out of pages. No
+ * mm's can be locked when @reclaim is set to true.
+ *
+ * Finally, wake up ksgxd when the number of pages goes below the watermark
+ * before returning to the caller.
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
+{
+ struct sgx_epc_page *page;
+
+ for ( ; ; ) {
+ page = __sgx_alloc_epc_page();
+ if (!IS_ERR(page)) {
+ page->owner = owner;
+ break;
+ }
+
+ if (list_empty(&sgx_active_page_list))
+ return ERR_PTR(-ENOMEM);
+
+ if (!reclaim) {
+ page = ERR_PTR(-EBUSY);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ page = ERR_PTR(-ERESTARTSYS);
+ break;
+ }
+
+ sgx_reclaim_pages();
+ cond_resched();
+ }
+
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ wake_up(&ksgxd_waitq);
+
+ return page;
+}
+
+/**
+ * sgx_free_epc_page() - Free an EPC page
+ * @page: an EPC page
+ *
+ * Put the EPC page back on the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in an uninitialized state. In
+ * other words, do EREMOVE, EWB or whatever operation is necessary before
+ * calling this function.
+ */
+void sgx_free_epc_page(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
+
+ spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
+}
+
+static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
+ unsigned long index,
+ struct sgx_epc_section *section)
+{
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long i;
+
+ section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
+ if (!section->virt_addr)
+ return false;
+
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
+ if (!section->pages) {
+ memunmap(section->virt_addr);
+ return false;
+ }
+
+ section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
+
+ for (i = 0; i < nr_pages; i++) {
+ section->pages[i].section = index;
+ section->pages[i].flags = 0;
+ section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
+ }
+
+ return true;
+}
+
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If the poison was consumed synchronously, send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ sgx_unmark_page_reclaimable(page);
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
+/*
+ * A section metric is concatenated from two registers: bits 12-31 of @low
+ * provide bits 12-31 of the metric, and bits 0-19 of @high provide bits
+ * 32-51 of the metric.
+ */
+static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
+{
+ return (low & GENMASK_ULL(31, 12)) +
+ ((high & GENMASK_ULL(19, 0)) << 32);
+}
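+
+/*
+ * Worked example (values are illustrative only): with @low = 0x40000000 and
+ * @high = 0x1, the masked @low contributes bits 12-31 (0x40000000) and @high
+ * contributes bit 32 (0x1 << 32), so the resulting metric is 0x140000000.
+ */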
+
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
+static bool __init sgx_page_cache_init(void)
+{
+ u32 eax, ebx, ecx, edx, type;
+ u64 pa, size;
+ int nid;
+ int i;
+
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
+ cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
+
+ type = eax & SGX_CPUID_EPC_MASK;
+ if (type == SGX_CPUID_EPC_INVALID)
+ break;
+
+ if (type != SGX_CPUID_EPC_SECTION) {
+ pr_err_once("Unknown EPC section type: %u\n", type);
+ break;
+ }
+
+ pa = sgx_calc_section_metric(eax, ebx);
+ size = sgx_calc_section_metric(ecx, edx);
+
+ pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+ if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
+ pr_err("No free memory for an EPC section\n");
+ break;
+ }
+
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Falling back to NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
+ node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
+
+ sgx_nr_epc_sections++;
+ }
+
+ if (!sgx_nr_epc_sections) {
+ pr_err("There are zero EPC sections.\n");
+ return false;
+ }
+
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
+ return true;
+}
+
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
+ * The bare-metal driver must update them to the hash of the enclave's
+ * signer before EINIT. KVM needs to update them to the guest's virtual
+ * MSR values before doing EINIT on behalf of the guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append the enclave attribute indicated by the file descriptor to the
+ * allowed attributes. Currently only SGX_ATTR_PROVISIONKEY, indicated by
+ * /dev/sgx_provision, is supported.
+ *
+ * Return:
+ * - 0:       SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * - -EINVAL: Invalid or unsupported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ CLASS(fd, f)(attribute_fd);
+
+ if (fd_empty(f))
+ return -EINVAL;
+
+ if (fd_file(f)->f_op != &sgx_provision_fops)
+ return -EINVAL;
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
+
+/* Counter to count the active SGX users */
+static int sgx_usage_count;
+
+/**
+ * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN].
+ *
+ * This instruction attempts to update CPUSVN to the
+ * currently loaded microcode update SVN and generate new
+ * cryptographic assets.
+ *
+ * Return:
+ * * %0: - Success or not supported
+ * * %-EAGAIN: - Can be safely retried; the failure is due to lack of
+ *   entropy in the RNG
+ * * %-EIO: - Unexpected error, retries are not advisable
+ */
+static int sgx_update_svn(void)
+{
+ int ret;
+
+ /*
+ * If EUPDATESVN is not available, it is ok to
+ * silently skip it to comply with legacy behavior.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN))
+ return 0;
+
+ /*
+ * EPC is guaranteed to be empty when there are no users.
+ * Warn if this is not the first user before proceeding further.
+ */
+ WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n");
+
+ for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) {
+ ret = __eupdatesvn();
+
+ /* Stop on success or unexpected errors: */
+ if (ret != SGX_INSUFFICIENT_ENTROPY)
+ break;
+ }
+
+ switch (ret) {
+ case 0:
+ /*
+ * SVN successfully updated.
+ * Let users know when the update was successful.
+ */
+ pr_info("SVN updated successfully\n");
+ return 0;
+ case SGX_NO_UPDATE:
+ /*
+ * SVN update failed since the current SVN is
+ * not newer than CPUSVN. This is the most
+ * common case and indicates no harm.
+ */
+ return 0;
+ case SGX_INSUFFICIENT_ENTROPY:
+ /*
+ * SVN update failed due to lack of entropy in DRNG.
+ * Indicate to userspace that it should retry.
+ */
+ return -EAGAIN;
+ default:
+ break;
+ }
+
+ /*
+ * EUPDATESVN was called when EPC is empty, all other error
+ * codes are unexpected.
+ */
+ ENCLS_WARN(ret, "EUPDATESVN");
+ return -EIO;
+}
+
+/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */
+static DEFINE_MUTEX(sgx_svn_lock);
+
+int sgx_inc_usage_count(void)
+{
+ int ret;
+
+ guard(mutex)(&sgx_svn_lock);
+
+ if (!sgx_usage_count) {
+ ret = sgx_update_svn();
+ if (ret)
+ return ret;
+ }
+
+ sgx_usage_count++;
+
+ return 0;
+}
+
+void sgx_dec_usage_count(void)
+{
+ guard(mutex)(&sgx_svn_lock);
+ sgx_usage_count--;
+}
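+
+/*
+ * These helpers are expected to be paired: each successful
+ * sgx_inc_usage_count() on an open() path must be balanced by exactly one
+ * sgx_dec_usage_count() on the corresponding release() path. See
+ * sgx_vepc_open() and sgx_vepc_release() in virt.c for the pattern.
+ */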
+
+static int __init sgx_init(void)
+{
+ int ret;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX))
+ return -ENODEV;
+
+ if (!sgx_page_cache_init())
+ return -ENOMEM;
+
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
+ goto err_page_cache;
+ }
+
+ ret = misc_register(&sgx_dev_provision);
+ if (ret)
+ goto err_kthread;
+
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
+ return 0;
+
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
+err_kthread:
+ kthread_stop(ksgxd_tsk);
+
+err_page_cache:
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ vfree(sgx_epc_sections[i].pages);
+ memunmap(sgx_epc_sections[i].virt_addr);
+ }
+
+ return ret;
+}
+
+device_initcall(sgx_init);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
new file mode 100644
index 000000000000..f5940393d9bd
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_SGX_H
+#define _X86_SGX_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/sgx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "sgx: " fmt
+
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/arch/x86/sgx.rst for more information."
+
+#define SGX_MAX_EPC_SECTIONS 8
+#define SGX_EEXTEND_BLOCK_SIZE 256
+#define SGX_NR_TO_SCAN 16
+#define SGX_NR_LOW_PAGES 32
+#define SGX_NR_HIGH_PAGES 64
+
+/* Pages that are being tracked by the page reclaimer. */
+#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
+struct sgx_epc_page {
+ unsigned int section;
+ u16 flags;
+ u16 poison;
+ struct sgx_encl_page *owner;
+ struct list_head list;
+};
+
+/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
+ spinlock_t lock;
+};
+
+/*
+ * The firmware can define multiple chunks of EPC in different areas of
+ * physical memory, e.g. one per NUMA node. This structure is used to store
+ * the EPC pages of one EPC section and the virtual memory area where the
+ * pages have been mapped.
+ */
+struct sgx_epc_section {
+ unsigned long phys_addr;
+ void *virt_addr;
+ struct sgx_epc_page *pages;
+ struct sgx_numa_node *node;
+};
+
+extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+
+static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->phys_addr + index * PAGE_SIZE;
+}
+
+static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->virt_addr + index * PAGE_SIZE;
+}
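+
+/*
+ * Both helpers above compute the page index by pointer arithmetic within
+ * the section's page array. Illustrative example: for the third entry
+ * (index 2) of a section with phys_addr 0x80000000 and 4K pages,
+ * sgx_get_epc_phys_addr() returns 0x80000000 + 2 * 4096 = 0x80002000.
+ */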
+
+struct sgx_epc_page *__sgx_alloc_epc_page(void);
+void sgx_free_epc_page(struct sgx_epc_page *page);
+
+void sgx_reclaim_direct(void);
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+
+void sgx_ipi_cb(void *info);
+
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+int sgx_inc_usage_count(void);
+void sgx_dec_usage_count(void);
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
+#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..8de1f1a755f2
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/kvm_types.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having children in
+ * other virtual EPC instances, and the lock to protect the list.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
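+
+/*
+ * A rough sketch of how userspace (typically a VMM) is expected to map a
+ * vEPC region; MAP_SHARED is mandatory because of the VM_SHARED check in
+ * sgx_vepc_mmap() above (error handling omitted):
+ *
+ *	int fd = open("/dev/sgx_vepc", O_RDWR);
+ *	void *epc = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ *			 MAP_SHARED, fd, 0);
+ */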
+
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
+{
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret = sgx_vepc_remove_page(epc_page);
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which happens when
+ * EREMOVE'ing an SECS that still has children. This can be
+ * handled by EREMOVE'ing the SECS again after all pages in
+ * the virtual EPC have been EREMOVE'd. See the comments below
+ * in sgx_vepc_release().
+ *
+ * The user of the virtual EPC (KVM) needs to guarantee that no
+ * logical processor is still running in the enclave in the
+ * guest, otherwise EREMOVE will get SGX_ENCLAVE_ACT which
+ * cannot be handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+ return 0;
+}
+
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+ struct sgx_epc_page *entry;
+ unsigned long index;
+ long failures = 0;
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ int ret = sgx_vepc_remove_page(entry);
+ if (ret) {
+ if (ret == SGX_CHILD_PRESENT) {
+ /* The page is a SECS, userspace will retry. */
+ failures++;
+ } else {
+ /*
+ * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+ * WARN, as userspace can induce said failures by
+ * calling the ioctl concurrently on multiple vEPCs or
+ * while one or more CPUs is running the enclave. Only
+ * a #PF on EREMOVE indicates a kernel/hardware issue.
+ */
+ WARN_ON_ONCE(encls_faulted(ret) &&
+ ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+ return -EBUSY;
+ }
+ }
+ cond_resched();
+ }
+
+ /*
+ * Return the number of SECS pages that failed to be removed, so
+ * userspace knows that it has to retry.
+ */
+ return failures;
+}
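+
+/*
+ * Since the return value counts the SECS pages which could not yet be
+ * removed, a userspace caller would typically loop on the ioctl until it
+ * stops reporting failures, e.g. (sketch, "fd" being an open vEPC
+ * descriptor):
+ *
+ *	while (ioctl(fd, SGX_IOC_VEPC_REMOVE_ALL, 0) > 0)
+ *		;
+ */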
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ cond_resched();
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ cond_resched();
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ cond_resched();
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ xa_destroy(&vepc->page_array);
+ kfree(vepc);
+
+ sgx_dec_usage_count();
+ return 0;
+}
+
+static int __sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_vepc_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
+static long sgx_vepc_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ switch (cmd) {
+ case SGX_IOC_VEPC_REMOVE_ALL:
+ if (arg)
+ return -EINVAL;
+ return sgx_vepc_remove_all(vepc);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .unlocked_ioctl = sgx_vepc_ioctl,
+ .compat_ioctl = sgx_vepc_ioctl,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of the guest after KVM traps ECREATE for the purpose
+ * of enforcing policies on the guest's enclaves, and return the trap number
+ * which should be injected into the guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and invoke SGX or other fault handlers
+ * when the userspace mapping for @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is a valid userspace pointer from
+ * the caller (KVM), which should already have handled the invalid
+ * pointer case (for instance, one made up by a malicious guest). All
+ * other checks, such as the alignment of @secs, are deferred to ENCLS
+ * itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from the caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of the guest after KVM traps EINIT. If SGX_LC is
+ * available in the host, the SGX driver may rewrite the hardware values at
+ * will, therefore KVM needs to update the hardware values to the guest's
+ * virtual MSR values in order to ensure EINIT is executed with the expected
+ * hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
new file mode 100644
index 000000000000..f55ea3cdbf88
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CPU/APIC topology
+ *
+ * The APIC IDs describe the system topology in multiple domain levels.
+ * The CPUID topology parser provides the information which part of the
+ * APIC ID is associated to the individual levels:
+ *
+ * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
+ *
+ * The root space contains the package (socket) IDs.
+ *
+ * Levels which are not enumerated consume 0 bits in the APIC ID, but
+ * conceptually they are always represented. If e.g. only CORE and THREAD
+ * levels are enumerated then DIE, MODULE and TILE have the same physical
+ * ID as the PACKAGE.
+ *
+ * If SMT is not supported, then the THREAD domain is still used. It then
+ * has the same physical ID as the CORE domain and is the only child of
+ * the core domain.
+ *
+ * This allows a unified view on the system independent of the enumerated
+ * domain levels without requiring any conditionals in the code.
+ */
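+
+/*
+ * As an illustrative example, assume an SMT shift of 1 and a core shift
+ * of 4 with no further levels enumerated: a hypothetical APIC ID 0x2b
+ * then decodes to thread 1 (bit 0), core 5 (bits 1-3) and package 2
+ * (bits 4 and up).
+ */
+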
+#define pr_fmt(fmt) "CPU topo: " fmt
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/apic.h>
+#include <asm/hypervisor.h>
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
+
+/* Bitmap of physically present CPUs. */
+DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
+
+/* Used for CPU number allocation and parallel CPU bringup */
+u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
+
+/* Bitmaps to mark registered APICs at each topology domain */
+static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
+
+/*
+ * Keep track of assigned, disabled and rejected CPUs. The assigned count
+ * is preset to 1 because CPU #0 is reserved for the boot CPU.
+ */
+static struct {
+ unsigned int nr_assigned_cpus;
+ unsigned int nr_disabled_cpus;
+ unsigned int nr_rejected_cpus;
+ u32 boot_cpu_apic_id;
+ u32 real_bsp_apic_id;
+} topo_info __ro_after_init = {
+ .nr_assigned_cpus = 1,
+ .boot_cpu_apic_id = BAD_APICID,
+ .real_bsp_apic_id = BAD_APICID,
+};
+
+#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
+
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == (u64)cpuid_to_apicid[cpu];
+}
+
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
+{
+ if (!(apicid & (__max_threads_per_core - 1)))
+ cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
+}
+
+/*
+ * Convert the APIC ID to a domain level ID by masking out the low bits
+ * below the domain level @dom.
+ */
+static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
+}
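+
+/*
+ * Continuing the illustrative example above (SMT shift 1, core shift 4):
+ * topo_apicid(0x2b, TOPO_CORE_DOMAIN) clears the SMT bit and yields 0x2a,
+ * while topo_apicid(0x2b, TOPO_PKG_DOMAIN) clears everything below the
+ * package level and yields 0x20.
+ */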
+
+static int topo_lookup_cpuid(u32 apic_id)
+{
+ int i;
+
+ /* CPU# to APICID mapping is persistent once it is established */
+ for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
+ if (cpuid_to_apicid[i] == apic_id)
+ return i;
+ }
+ return -ENODEV;
+}
+
+static __init int topo_get_cpunr(u32 apic_id)
+{
+ int cpu = topo_lookup_cpuid(apic_id);
+
+ if (cpu >= 0)
+ return cpu;
+
+ return topo_info.nr_assigned_cpus++;
+}
+
+static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
+#endif
+ set_cpu_present(cpu, true);
+}
+
+static __init bool check_for_real_bsp(u32 apic_id)
+{
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
+
+ /*
+ * There is no really good way to detect whether this is a kdump
+ * kernel, but except for the Voyager SMP monstrosity, which is no
+ * longer supported, the real BSP APIC ID is the first one which is
+ * enumerated by firmware. That allows detecting whether the boot
+ * CPU is the real BSP. If it is not, then do not register the APIC
+ * because sending INIT to the real BSP would reset the whole
+ * system.
+ *
+ * The first APIC ID which is enumerated by firmware is detectable
+ * because the boot CPU APIC ID is registered before that without
+ * invoking this code.
+ */
+ if (topo_info.real_bsp_apic_id != BAD_APICID)
+ return false;
+
+ /*
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
+ */
+ if (has_apic_base) {
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
+ if (apic_id == topo_info.boot_cpu_apic_id) {
+ /*
+ * If the boot CPU has the APIC BSP bit set then the
+ * firmware enumeration agrees. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
+ */
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
+ }
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
+ }
+
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
+ topo_info.boot_cpu_apic_id, apic_id);
+
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
+ }
+
+ pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
+
+ topo_info.real_bsp_apic_id = apic_id;
+ return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
+}
+
+static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
+ unsigned long *map)
+{
+ unsigned int id, end, cnt = 0;
+
+ /* Calculate the exclusive end */
+ end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
+
+ /* Unfortunately there is no bitmap_weight_range() */
+ for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
+ cnt++;
+ return cnt;
+}
+
+static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ int cpu, dom;
+
+ if (present) {
+ set_bit(apic_id, phys_cpu_present_map);
+
+ /*
+ * Double registration is valid in case of the boot CPU
+ * APIC because that is registered before the enumeration
+ * of the APICs via firmware parsers or VM guest
+ * mechanisms.
+ */
+ if (apic_id == topo_info.boot_cpu_apic_id)
+ cpu = 0;
+ else
+ cpu = topo_get_cpunr(apic_id);
+
+ cpuid_to_apicid[cpu] = apic_id;
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ } else {
+ u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
+
+ /*
+ * Check for present APICs in the same package when running
+ * on bare metal. Allow the bogosity in a guest.
+ */
+ if (hypervisor_is_type(X86_HYPER_NATIVE) &&
+ topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
+ pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
+ apic_id);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_info.nr_disabled_cpus++;
+ }
+
+ /*
+ * Register present and possible CPUs in the domain
+ * maps. cpu_possible_map will be updated in
+ * topology_init_possible_cpus() after enumeration is done.
+ */
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
+ set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
+}
+
+/**
+ * topology_register_apic - Register an APIC in early topology maps
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ * @present: True if the corresponding CPU is present
+ */
+void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ if (apic_id >= MAX_LOCAL_APIC) {
+ pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ if (check_for_real_bsp(apic_id)) {
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ /* CPU numbers exhausted? */
+ if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
+ pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_register_apic(apic_id, acpi_id, present);
+}
+
+/**
+ * topology_register_boot_apic - Register the boot CPU APIC
+ * @apic_id: The APIC ID to set up
+ *
+ * Separate so CPU #0 can be assigned
+ */
+void __init topology_register_boot_apic(u32 apic_id)
+{
+ WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
+
+ topo_info.boot_cpu_apic_id = apic_id;
+ topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+}
+
+/**
+ * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
+ * @apicid: The APIC ID for which to lookup the logical ID
+ * @at_level: The topology domain level to use
+ *
+ * @apicid must be a full APIC ID, not the normalized variant. It's valid for
+ * all bits below the domain level specified by @at_level to be clear. So both
+ * real APIC IDs and backshifted normalized APIC IDs work correctly.
+ *
+ * Returns:
+ * - >= 0: The requested logical ID
+ * - -ERANGE: @apicid is out of range
+ * - -ENODEV: @apicid is not registered
+ */
+int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return -ERANGE;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return -ENODEV;
+ /* Get the number of set bits before @lvlid. */
+ return bitmap_weight(apic_maps[at_level].map, lvlid);
+}
+EXPORT_SYMBOL_GPL(topology_get_logical_id);
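+
+/*
+ * The logical ID is the rank of the level ID among all registered IDs at
+ * that domain. Illustrative example: if only package IDs 0x00, 0x20 and
+ * 0x40 are registered, topology_get_logical_id(0x40, TOPO_PKG_DOMAIN)
+ * returns 2, because two set bits precede level ID 0x40 in the map.
+ */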
+
+/**
+ * topology_unit_count - Retrieve the count of specified units at a given topology domain level
+ * @apicid: The APIC ID which specifies the search range
+ * @which_units: The domain level specifying the units to count
+ * @at_level: The domain level at which @which_units have to be counted
+ *
+ * This returns the number of possible units according to the enumerated
+ * information.
+ *
+ * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
+ * counts the number of possible cores in the package to which @apicid
+ * belongs.
+ *
+ * @at_level must obviously be greater than @which_units to produce useful
+ * results. If @at_level is equal to @which_units the result is
+ * unsurprisingly 1. If @at_level is less than @which_units the result
+ * is by definition undefined and the function returns 0.
+ */
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return 0;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return 0;
+ if (which_units > at_level)
+ return 0;
+ if (which_units == at_level)
+ return 1;
+ return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
+}
+
+#ifdef CONFIG_SMP
+int topology_get_primary_thread(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ /*
+ * Get the core domain level APIC ID, which belongs to the primary
+ * thread, and return the CPU number assigned to it.
+ */
+ return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN));
+}
+#endif
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+/**
+ * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ */
+int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
+{
+ int cpu;
+
+ if (apic_id >= MAX_LOCAL_APIC)
+ return -EINVAL;
+
+ /* Reject if the APIC ID was not registered during enumeration. */
+ if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
+ return -ENODEV;
+
+ cpu = topo_lookup_cpuid(apic_id);
+ if (cpu < 0)
+ return -ENOSPC;
+
+ set_bit(apic_id, phys_cpu_present_map);
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ cpu_mark_primary_thread(cpu, apic_id);
+ return cpu;
+}
+
+/**
+ * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
+ * @cpu: The CPU number for which the APIC ID is removed
+ */
+void topology_hotunplug_apic(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ if (apic_id == BAD_APICID)
+ return;
+
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
+ clear_bit(apic_id, phys_cpu_present_map);
+ set_cpu_present(cpu, false);
+}
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int max_possible_cpus __initdata = NR_CPUS;
+
+/**
+ * topology_apply_cmdline_limits_early - Apply topology command line limits early
+ *
+ * Ensure that command line limits are in effect before firmware parsing
+ * takes place.
+ */
+void __init topology_apply_cmdline_limits_early(void)
+{
+ unsigned int possible = nr_cpu_ids;
+
+ /* 'maxcpus=0' 'nosmp' 'nolapic' */
+ if (!setup_max_cpus || apic_is_disabled)
+ possible = 1;
+
+ /* 'possible_cpus=N' */
+ possible = min_t(unsigned int, max_possible_cpus, possible);
+
+ if (possible < nr_cpu_ids) {
+ pr_info("Limiting to %u possible CPUs\n", possible);
+ set_nr_cpu_ids(possible);
+ }
+}
+
+static __init bool restrict_to_up(void)
+{
+ if (!smp_found_config)
+ return true;
+ /*
+ * XEN PV is special as it does not advertise the local APIC
+ * properly, but provides a fake topology for it so that the
+ * infrastructure works. So don't apply the restrictions vs. APIC
+ * here.
+ */
+ if (xen_pv_domain())
+ return false;
+
+ return apic_is_disabled;
+}
+
+void __init topology_init_possible_cpus(void)
+{
+ unsigned int assigned = topo_info.nr_assigned_cpus;
+ unsigned int disabled = topo_info.nr_disabled_cpus;
+ unsigned int cnta, cntb, cpu, allowed = 1;
+ unsigned int total = assigned + disabled;
+ u32 apicid, firstid;
+
+ /*
+ * If there was no APIC registered, then fake one so that the
+ * topology bitmap is populated. That ensures that the code below
+ * is valid and the various query interfaces can be used
+ * unconditionally. This does not affect the actual APIC code in
+ * any way because either the local APIC address has not been
+ * registered or the local APIC was disabled on the command line.
+ */
+ if (topo_info.boot_cpu_apic_id == BAD_APICID)
+ topology_register_boot_apic(0);
+
+ if (!restrict_to_up()) {
+ if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
+ disabled += assigned - nr_cpu_ids;
+ assigned = nr_cpu_ids;
+ }
+ allowed = min_t(unsigned int, total, nr_cpu_ids);
+ }
+
+ if (total > allowed)
+ pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
+
+ assigned = min_t(unsigned int, allowed, assigned);
+ disabled = allowed - assigned;
+
+ topo_info.nr_assigned_cpus = assigned;
+ topo_info.nr_disabled_cpus = disabled;
+
+ total_cpus = allowed;
+ set_nr_cpu_ids(allowed);
+
+ cnta = domain_weight(TOPO_PKG_DOMAIN);
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
+ __max_logical_packages = cnta;
+ __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
+
+ pr_info("Max. logical packages: %3u\n", cnta);
+ pr_info("Max. logical dies: %3u\n", cntb);
+ pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
+
+ cnta = domain_weight(TOPO_CORE_DOMAIN);
+ cntb = domain_weight(TOPO_SMT_DOMAIN);
+ /*
+ * Can't use the order delta here as order(cnta) can be equal to
+ * order(cntb) even if cnta != cntb.
+ */
+ __max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
+ pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
+
+ firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
+ __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
+ __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
+
+ pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
+ if (topo_info.nr_rejected_cpus)
+ pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
+
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ /* Assign CPU numbers to non-present CPUs */
+ for (apicid = 0; disabled; disabled--, apicid++) {
+ apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
+ MAX_LOCAL_APIC, apicid);
+ if (apicid >= MAX_LOCAL_APIC)
+ break;
+ cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
+ }
+
+ for (cpu = 0; cpu < allowed; cpu++) {
+ apicid = cpuid_to_apicid[cpu];
+
+ set_cpu_possible(cpu, true);
+
+ if (apicid == BAD_APICID)
+ continue;
+
+ cpu_mark_primary_thread(cpu, apicid);
+ set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
+ }
+}
+
+/*
+ * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
+ */
+void __init topology_reset_possible_cpus_up(void)
+{
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
+ if (topo_info.boot_cpu_apic_id != BAD_APICID)
+ set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
+}
+
+static int __init setup_possible_cpus(char *str)
+{
+ get_option(&str, &max_possible_cpus);
+ return 0;
+}
+early_param("possible_cpus", setup_possible_cpus);
+#endif
diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h
new file mode 100644
index 000000000000..37326297f80c
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+ struct cpuinfo_x86 *c;
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_ncpus[TOPO_MAX_DOMAIN];
+
+ /* Legacy CPUID[1]:EBX[23:16] number of logical processors */
+ unsigned int ebx1_nproc_shift;
+
+ /* AMD specific node ID which cannot be mapped into APIC space. */
+ u16 amd_nodes_per_pkg;
+ u16 amd_node_id;
+};
+
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus);
+bool cpu_parse_topology_ext(struct topo_scan *tscan);
+void cpu_parse_topology_amd(struct topo_scan *tscan);
+void cpu_topology_fixup_amd(struct topo_scan *tscan);
+
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom != TOPO_SMT_DOMAIN)
+ apicid >>= x86_topo_system.dom_shifts[dom - 1];
+ return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
+static inline u32 topo_domain_mask(enum x86_topology_domains dom)
+{
+ return (1U << x86_topo_system.dom_shifts[dom]) - 1;
+}
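+
+/*
+ * Illustrative example, assuming an SMT shift of 1 and a core shift of 4:
+ * topo_shift_apicid(0x2b, TOPO_CORE_DOMAIN) yields 0x15,
+ * topo_relative_domain_id(0x2b, TOPO_CORE_DOMAIN) yields 5, and
+ * topo_domain_mask(TOPO_SMT_DOMAIN) yields 0x1.
+ */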
+
+/*
+ * Update a domain level after the fact without propagating. Used to fix up
+ * broken CPUID enumerations.
+ */
+static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level);
+#else
+static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ return 1;
+}
+#endif
+
+#endif /* ARCH_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
new file mode 100644
index 000000000000..6ac097e13106
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static bool parse_8000_0008(struct topo_scan *tscan)
+{
+ struct {
+ // ecx
+ u32 cpu_nthreads : 8, // Number of physical threads - 1
+ : 4, // Reserved
+ apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID
+ perf_tsc_len : 2, // Performance time-stamp counter size
+ : 14; // Reserved
+ } ecx;
+ unsigned int sft;
+
+ if (tscan->c->extended_cpuid_level < 0x80000008)
+ return false;
+
+ cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx);
+
+ /* If the core ID bits are 0, then derive the shift from ecx.cpu_nthreads */
+ sft = ecx.apicid_coreid_len;
+ if (!sft)
+ sft = get_count_order(ecx.cpu_nthreads + 1);
+
+ /*
+ * cpu_nthreads describes the number of threads in the package;
+ * sft is the number of APIC ID bits per package.
+ *
+ * As the number of actual threads per core is not described in
+ * this leaf, just set the CORE domain shift and let the later
+ * parsers set SMT shift. Assume one thread per core by default
+ * which is correct if there are no other CPUID leafs to parse.
+ */
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1);
+ return true;
+}
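+
+/*
+ * Illustrative example: for a part reporting cpu_nthreads = 15 and
+ * apicid_coreid_len = 0, the shift is derived as get_count_order(16) = 4,
+ * i.e. the 16 CPUs of the package are packed into 4 APIC ID bits.
+ */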
+
+static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
+{
+ /*
+ * Starting with Fam 17h the DIE domain could probably be used to
+ * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps
+ * suggests it's the topmost bit(s) of the CPU cores area, but
+ * that's guesswork and neither enumerated nor documented.
+ *
+ * Up to Fam 16h this does not work at all and the legacy node ID
+ * has to be used.
+ */
+ tscan->amd_nodes_per_pkg = nr_nodes;
+ tscan->amd_node_id = node_id;
+}
+
+static bool parse_8000_001e(struct topo_scan *tscan)
+{
+ struct {
+ // eax
+ u32 ext_apic_id : 32; // Extended APIC ID
+ // ebx
+ u32 core_id : 8, // Unique per-socket logical core unit ID
+ core_nthreads : 8, // #Threads per core (zero-based)
+ : 16; // Reserved
+ // ecx
+ u32 node_id : 8, // Node (die) ID of invoking logical CPU
+ nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket
+ : 21; // Reserved
+ // edx
+ u32 : 32; // Reserved
+ } leaf;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return false;
+
+ cpuid_leaf(0x8000001e, &leaf);
+
+ /*
+ * If leaf 0xb/0x26 is available, then the APIC ID and the domain
+ * shifts are set already.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) {
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
+ /*
+ * Leaf 0x80000008 sets the CORE domain shift but not the
+ * SMT domain shift. On CPUs with family >= 0x17, there
+ * might be hyperthreads.
+ */
+ if (tscan->c->x86 >= 0x17) {
+ /* Update the SMT domain, but do not propagate it. */
+ unsigned int nthreads = leaf.core_nthreads + 1;
+
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN,
+ get_count_order(nthreads), nthreads);
+ }
+ }
+
+ store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
+
+ if (tscan->c->x86_vendor == X86_VENDOR_AMD) {
+ if (tscan->c->x86 == 0x15)
+ tscan->c->topo.cu_id = leaf.core_id;
+
+ cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id);
+ } else {
+ /*
+ * Package ID is ApicId[6..] on certain Hygon CPUs. See
+ * commit e0ceeae708ce for explanation. The topology info
+ * is screwed up: The package shift is always 6 and the
+ * node ID is in bits [4:5].
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) {
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6,
+ tscan->dom_ncpus[TOPO_CORE_DOMAIN]);
+ }
+ cacheinfo_hygon_init_llc_id(tscan->c);
+ }
+ return true;
+}
+
+static void parse_fam10h_node_id(struct topo_scan *tscan)
+{
+ union {
+ struct {
+ u64 node_id : 3,
+ nodes_per_pkg : 3,
+ unused : 58;
+ };
+ u64 msr;
+ } nid;
+
+ if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
+ return;
+
+ rdmsrq(MSR_FAM10H_NODE_ID, nid.msr);
+ store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
+ tscan->c->topo.llc_id = nid.node_id;
+}
+
+static void legacy_set_llc(struct topo_scan *tscan)
+{
+ unsigned int apicid = tscan->c->topo.initial_apicid;
+
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+}
+
+static void topoext_fixup(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u64 msrval;
+
+ /* Try to re-enable TopologyExtensions if switched off by BIOS */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD ||
+ c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
+ return;
+
+ if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT,
+ MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0)
+ return;
+
+ rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval);
+ if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+}
+
+static void parse_topology_amd(struct topo_scan *tscan)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
+
+ /*
+ * Try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb. If either leaf is
+ * available, cpu_parse_topology_ext() will return true.
+ *
+ * If XTOPOLOGY leaves (0x26/0xb) are not available, try to
+ * get the CORE shift from leaf 0x8000_0008 first.
+ */
+ if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan))
+ return;
+
+ /*
+ * Prefer leaf 0x8000001e if available to get the SMT shift and
+ * the initial APIC ID if XTOPOLOGY leaves are not available.
+ */
+ if (parse_8000_001e(tscan))
+ return;
+
+ /* Try the NODEID MSR */
+ parse_fam10h_node_id(tscan);
+}
+
+void cpu_parse_topology_amd(struct topo_scan *tscan)
+{
+ tscan->amd_nodes_per_pkg = 1;
+ topoext_fixup(tscan);
+ parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
+
+ if (tscan->amd_nodes_per_pkg > 1)
+ set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
+}
+
+void cpu_topology_fixup_amd(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+
+ /*
+ * Adjust the core_id relative to the node when there is more than
+ * one node.
+ */
+ if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1)
+ c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
new file mode 100644
index 000000000000..71625795d711
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/intel-family.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+EXPORT_SYMBOL_GPL(x86_topo_system);
+
+unsigned int __amd_nodes_per_pkg __ro_after_init;
+EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ topology_update_dom(tscan, dom, shift, ncpus);
+
+ /* Propagate to the upper levels */
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+ tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+ }
+}
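+
+/*
+ * Illustrative example: topology_set_dom(tscan, TOPO_CORE_DOMAIN, 4, 8)
+ * stores shift 4 / ncpus 8 at the CORE level and propagates the same
+ * values to MODULE, TILE, DIE, DIEGRP and PKG, so levels which are never
+ * enumerated inherit the values of the last enumerated level below them.
+ */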
+
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
+static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
+{
+ struct {
+ u32 cache_type : 5,
+ unused : 21,
+ ncores : 6;
+ } eax;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax);
+ if (!eax.cache_type)
+ return 1;
+
+ return eax.ncores + 1;
+}
+
+static void parse_legacy(struct topo_scan *tscan)
+{
+ unsigned int cores, core_shift, smt_shift = 0;
+ struct cpuinfo_x86 *c = tscan->c;
+
+ cores = parse_num_cores_legacy(c);
+ core_shift = get_count_order(cores);
+
+ if (cpu_has(c, X86_FEATURE_HT)) {
+ if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift))
+ smt_shift = tscan->ebx1_nproc_shift - core_shift;
+ /*
+ * The parser expects leaf 0xb/0x1f format, which means
+ * the number of logical processors at core level is
+ * counting threads.
+ */
+ core_shift += smt_shift;
+ cores <<= smt_shift;
+ }
+
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores);
+}
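+
+/*
+ * Illustrative example: with CPUID leaf 4 reporting 4 cores and leaf 1
+ * reporting 8 logical processors (ebx1_nproc_shift = 3), the SMT shift
+ * becomes 3 - 2 = 1, so the core level is recorded as shift 3 with
+ * 4 << 1 = 8 threads, matching the leaf 0xb/0x1f convention.
+ */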
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+ /*
+ * Preset the CORE level shift for CPUID-less systems and XEN_PV,
+ * which has useless CPUID information.
+ */
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1);
+
+ return tscan->c->cpuid_level < 1;
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+ const struct cpuinfo_topology topo_defaults = {
+ .cu_id = 0xff,
+ .llc_id = BAD_APICID,
+ .l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
+ };
+ struct cpuinfo_x86 *c = tscan->c;
+ struct {
+ u32 unused0 : 16,
+ nproc : 8,
+ apicid : 8;
+ } ebx;
+
+ c->topo = topo_defaults;
+
+ if (fake_topology(tscan))
+ return;
+
+ /* Preset Initial APIC ID from CPUID leaf 1 */
+ cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+ c->topo.initial_apicid = ebx.apicid;
+
+ /*
+ * The initial invocation from early_identify_cpu() happens before
+ * the APIC is mapped or X2APIC enabled. For establishing the
+ * topology, that's not required. Use the initial APIC ID.
+ */
+ if (early)
+ c->topo.apicid = c->topo.initial_apicid;
+ else
+ c->topo.apicid = read_apic_id();
+
+ /* The above is sufficient for UP */
+ if (!IS_ENABLED(CONFIG_SMP))
+ return;
+
+ tscan->ebx1_nproc_shift = get_count_order(ebx.nproc);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD))
+ cpu_parse_topology_amd(tscan);
+ break;
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_ZHAOXIN:
+ parse_legacy(tscan);
+ break;
+ case X86_VENDOR_INTEL:
+ if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
+ parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
+ break;
+ case X86_VENDOR_HYGON:
+ if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))
+ cpu_parse_topology_amd(tscan);
+ break;
+ }
+}
+
+static void topo_set_ids(struct topo_scan *tscan, bool early)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u32 apicid = c->topo.apicid;
+
+ c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
+ c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+ if (!early) {
+ c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+ c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
+ }
+
+ /* Package relative core ID */
+ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
+ x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+
+ c->topo.amd_node_id = tscan->amd_node_id;
+
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ cpu_topology_fixup_amd(tscan);
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+ unsigned int dom, cpu = smp_processor_id();
+ struct topo_scan tscan = { .c = c, };
+
+ parse_topology(&tscan, false);
+
+ if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (c->topo.initial_apicid != c->topo.apicid) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n",
+ cpu, c->topo.initial_apicid, c->topo.apicid);
+ }
+
+ if (c->topo.apicid != cpuid_to_apicid[cpu]) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n",
+ cpu, cpuid_to_apicid[cpu], c->topo.apicid);
+ }
+ }
+
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+ if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+ continue;
+ pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+ tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+ }
+
+ topo_set_ids(&tscan, false);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+ struct topo_scan tscan = { .c = c, };
+ unsigned int dom, sft;
+
+ parse_topology(&tscan, true);
+
+ /* Copy the shift values and calculate the unit sizes. */
+ memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+ dom = TOPO_SMT_DOMAIN;
+ x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+ x86_topo_system.dom_size[dom] = 1U << sft;
+ }
+
+ topo_set_ids(&tscan, true);
+
+ /*
+ * AMD systems have Nodes per package which cannot be mapped to
+ * APIC ID.
+ */
+ __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg;
+}
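
The sizes computed above are relative: each domain's dom_size is two to the power of its shift minus the next lower domain's shift. A minimal sketch with invented shifts, assuming the intermediate domains collapse into the package level as the propagation loop in topology_set_dom() arranges:

#include <stdio.h>

int main(void)
{
	/* Invented cumulative shifts: SMT ends at bit 1, CORE at bit 4 */
	unsigned int shifts[] = { 1, 4 };
	unsigned int size[2];

	size[0] = 1u << shifts[0];			/* threads per core: 2 */
	size[1] = 1u << (shifts[1] - shifts[0]);	/* cores per package: 8 */

	printf("threads/core=%u cores/pkg=%u\n", size[0], size[1]);
	return 0;
}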
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
new file mode 100644
index 000000000000..467b0326bf1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+enum topo_types {
+ INVALID_TYPE = 0,
+ SMT_TYPE = 1,
+ CORE_TYPE = 2,
+ MAX_TYPE_0B = 3,
+ MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
+ TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
+ DIE_TYPE = 5,
+ DIEGRP_TYPE = 6,
+ MAX_TYPE_1F = 7,
+};
+
+/*
+ * Use a lookup table for the case that there are future types > 6 which
+ * describe an intermediate domain level which does not exist today.
+ */
+static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [MODULE_TYPE] = TOPO_MODULE_DOMAIN,
+ [TILE_TYPE] = TOPO_TILE_DOMAIN,
+ [DIE_TYPE] = TOPO_DIE_DOMAIN,
+ [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
+};
+
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
+static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
+ unsigned int *last_dom)
+{
+ unsigned int dom, maxtype;
+ const unsigned int *map;
+ struct {
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ : 16; // Reserved
+ // ecx
+ u32 level : 8, // Current topology level. Same as sub leaf number
+ type : 8, // Level type. If 0, invalid
+ : 16; // Reserved
+ // edx
+ u32 x2apic_id : 32; // X2APIC ID of the current logical processor
+ } sl;
+
+ switch (leaf) {
+ case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
+ case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
+ default: return false;
+ }
+
+ cpuid_subleaf(leaf, subleaf, &sl);
+
+ if (!sl.num_processors || sl.type == INVALID_TYPE)
+ return false;
+
+ if (sl.type >= maxtype) {
+ pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
+ leaf, subleaf, sl.type);
+ /*
+ * It really would have been too obvious to make the domain
+ * type space sparse and leave a few reserved types between
+ * the points which might change instead of following the
+ * usual "this can be fixed in software" principle.
+ */
+ dom = *last_dom + 1;
+ } else {
+ dom = map[sl.type];
+ *last_dom = dom;
+ }
+
+ if (!dom) {
+ tscan->c->topo.initial_apicid = sl.x2apic_id;
+ } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
+ leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
+ }
+
+ topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
+ return true;
+}
+
+static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
+{
+ unsigned int last_dom;
+ u32 subleaf;
+
+ /* Read all available subleafs and populate the levels */
+ for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++);
+
+ /* If subleaf 0 failed to parse, give up */
+ if (!subleaf)
+ return false;
+
+ /*
+ * There are machines in the wild which have shift 0 in subleaf 0,
+ * but advertise 2 logical processors at that level. They are truly
+ * SMT.
+ */
+ if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
+ unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n",
+ leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ }
+
+ set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
+ return true;
+}
+
+bool cpu_parse_topology_ext(struct topo_scan *tscan)
+{
+ /* Intel: Try leaf 0x1F first. */
+ if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
+ return true;
+
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
+ /* Intel/AMD: Fall back to leaf 0xB if available */
+ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
+}
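
The same enumeration can be observed from userspace with the compiler's CPUID intrinsics. A hedged sketch mirroring the field layout of the struct above (type in ECX[15:8], count in EBX[15:0], shift in EAX[4:0]); this is an illustration, not the kernel code path, and it bails out on CPUs without leaf 0x1F:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, subleaf;

	if (__get_cpuid_max(0, NULL) < 0x1f) {
		puts("CPUID leaf 0x1f not supported here");
		return 1;
	}

	for (subleaf = 0; ; subleaf++) {
		__cpuid_count(0x1f, subleaf, eax, ebx, ecx, edx);

		unsigned int type  = (ecx >> 8) & 0xff;	/* 0 = invalid */
		unsigned int nproc = ebx & 0xffff;
		unsigned int shift = eax & 0x1f;

		if (!nproc || !type)
			break;
		printf("subleaf %u: type %u shift %u nproc %u x2apic_id 0x%x\n",
		       subleaf, type, shift, nproc, edx);
	}
	return 0;
}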
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 28000743bbb0..42c939827621 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,11 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include <asm/msr.h>
#include "cpu.h"
-static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+static void early_init_transmeta(struct cpuinfo_x86 *c)
{
u32 xlvl;
@@ -13,11 +15,11 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
xlvl = cpuid_eax(0x80860000);
if ((xlvl & 0xffff0000) == 0x80860000) {
if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
+ c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
}
}
-static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
+static void init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
@@ -34,7 +36,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860001) {
cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
if (cpu_rev != 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+ pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
(cpu_rev >> 24) & 0xff,
(cpu_rev >> 16) & 0xff,
(cpu_rev >> 8) & 0xff,
@@ -45,10 +47,10 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860002) {
cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
if (cpu_rev == 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+ pr_info("CPU: Processor revision %08X, %u MHz\n",
new_cpu_rev, cpu_freq);
}
- printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+ pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
(cms_rev1 >> 24) & 0xff,
(cms_rev1 >> 16) & 0xff,
(cms_rev1 >> 8) & 0xff,
@@ -77,13 +79,13 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
(void *)&cpu_info[56],
(void *)&cpu_info[60]);
cpu_info[64] = '\0';
- printk(KERN_INFO "CPU: %s\n", cpu_info);
+ pr_info("CPU: %s\n", cpu_info);
}
/* Unhide possibly hidden capability flags */
rdmsr(0x80860004, cap_mask, uk);
wrmsr(0x80860004, ~0, uk);
- c->x86_capability[0] = cpuid_edx(0x00000001);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
@@ -98,7 +100,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
+static const struct cpu_dev transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..209b5a22d880
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019-2021 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
+enum tsx_ctrl_states {
+ TSX_CTRL_AUTO,
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init =
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO :
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
+
+static void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods is present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing is late microcode loading,
+ * which, when done, may cause the X86_FEATURE_RTM_ALWAYS_ABORT CPUID
+ * bit to be set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause the supported CPUID feature bits to be re-detected. If RTM and
+ * HLE suddenly read as cleared, but userspace consulted them before the
+ * update, funny explosions will happen. Long story short: the kernel
+ * doesn't modify CPUID feature bits after booting.
+ *
+ * That's why this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
+{
+ u64 msr;
+
+ /*
+ * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID
+ * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ rdmsrq(MSR_TSX_FORCE_ABORT, msr);
+ msr |= MSR_TFA_TSX_CPUID_CLEAR;
+ wrmsrq(MSR_TSX_FORCE_ABORT, msr);
+ } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
+ rdmsrq(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrq(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * not recommended for production deployments. In particular, applying MD_CLEAR
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) ||
+ !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
+ }
+}
+
+static int __init tsx_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(str, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(str, "auto")) {
+ tsx_ctrl_state = TSX_CTRL_AUTO;
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+
+ return 0;
+}
+early_param("tsx", tsx_parse_cmdline);
+
+void __init tsx_init(void)
+{
+ tsx_dev_mode_disable();
+
+ /*
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
+ tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
+ tsx_clear_cpuid();
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ return;
+ }
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) {
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+ } else {
+ tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
+ return;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_AUTO)
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}
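
For reference, the two bits toggled throughout this file sit in MSR_IA32_TSX_CTRL (0x122): bit 0 (TSX_CTRL_RTM_DISABLE) forces transactions to abort and bit 1 (TSX_CTRL_CPUID_CLEAR) hides RTM/HLE from CPUID. A rough userspace decode via the msr driver (assumes root, a loaded msr module, and a CPU that actually has the MSR; error handling kept minimal):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	/* MSR 0x122 = IA32_TSX_CTRL; the read fails (EIO) if it is absent */
	if (fd < 0 || pread(fd, &val, sizeof(val), 0x122) != sizeof(val)) {
		perror("MSR_IA32_TSX_CTRL");
		return 1;
	}
	printf("RTM_DISABLE=%u CPUID_CLEAR=%u\n",
	       (unsigned int)(val & 1), (unsigned int)((val >> 1) & 1));
	close(fd);
	return 0;
}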
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index fd2c37bf7acb..65a58a390fc3 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include "cpu.h"
@@ -8,11 +8,11 @@
* so no special init takes place.
*/
-static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
+static const struct cpu_dev umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
- .c_models = {
- { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[1] = "U5D",
[2] = "U5S",
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
new file mode 100644
index 000000000000..e4a31c536642
--- /dev/null
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/syscore_ops.h>
+#include <linux/suspend.h>
+#include <linux/cpu.h>
+
+#include <asm/msr.h>
+#include <asm/mwait.h>
+
+#define UMWAIT_C02_ENABLE 0
+
+#define UMWAIT_CTRL_VAL(max_time, c02_disable) \
+ (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \
+ ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))
+
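A quick sanity check of the packing macro above: 100000 is 0x186a0, whose low two bits are already zero, so the default control value is 0x186a0 with C0.2 enabled. A standalone mirror of the macro, with mask values copied from the kernel definitions:

#include <stdio.h>
#include <stdint.h>

#define TIME_MASK	(~0x03u)	/* MSR_IA32_UMWAIT_CONTROL_TIME_MASK */
#define C02_DISABLE	0x01u		/* MSR_IA32_UMWAIT_CONTROL_C02_DISABLE */

static uint32_t umwait_ctrl_val(uint32_t max_time, uint32_t c02_disable)
{
	return (max_time & TIME_MASK) | (c02_disable & C02_DISABLE);
}

int main(void)
{
	printf("0x%x\n", umwait_ctrl_val(100000, 0));	/* 0x186a0, C0.2 on */
	printf("0x%x\n", umwait_ctrl_val(100000, 1));	/* 0x186a1, C0.2 off */
	return 0;
}
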
+/*
+ * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
+ * umwait max time is 100000 in TSC-quanta and C0.2 is enabled
+ */
+static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+
+/*
+ * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
+ * hardware or BIOS before kernel boot.
+ */
+static u32 orig_umwait_control_cached __ro_after_init;
+
+/*
+ * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
+ * the sysfs write functions.
+ */
+static DEFINE_MUTEX(umwait_lock);
+
+static void umwait_update_control_msr(void *unused)
+{
+ lockdep_assert_irqs_disabled();
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached));
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of umwait_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value and after return from IPI the previous
+ * value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int umwait_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ umwait_update_control_msr(NULL);
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the original control
+ * value.
+ */
+static int umwait_cpu_offline(unsigned int cpu)
+{
+ /*
+ * This code is protected by the CPU hotplug already and
+ * orig_umwait_control_cached is never changed after it caches
+ * the original control MSR value in umwait_init(). So there
+ * is no race condition here.
+ */
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+
+ return 0;
+}
+
+/*
+ * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should be not required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void umwait_syscore_resume(void *data)
+{
+ umwait_update_control_msr(NULL);
+}
+
+static const struct syscore_ops umwait_syscore_ops = {
+ .resume = umwait_syscore_resume,
+};
+
+static struct syscore umwait_syscore = {
+ .ops = &umwait_syscore_ops,
+};
+
+/* sysfs interface */
+
+/*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled.
+ */
+static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
+{
+ return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
+}
+
+static inline u32 umwait_ctrl_max_time(u32 ctrl)
+{
+ return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+}
+
+static inline void umwait_update_control(u32 maxtime, bool c02_enable)
+{
+ u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+
+ if (!c02_enable)
+ ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
+
+ WRITE_ONCE(umwait_control_cached, ctrl);
+ /* Propagate to all CPUs */
+ on_each_cpu(umwait_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool c02_enable;
+ u32 ctrl;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
+ umwait_update_control(ctrl, c02_enable);
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 max_time, ctrl;
+ int ret;
+
+ ret = kstrtou32(buf, 0, &max_time);
+ if (ret)
+ return ret;
+
+ /* bits[1:0] must be zero */
+ if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
+ return -EINVAL;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (max_time != umwait_ctrl_max_time(ctrl))
+ umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(max_time);
+
+static struct attribute *umwait_attrs[] = {
+ &dev_attr_enable_c02.attr,
+ &dev_attr_max_time.attr,
+ NULL
+};
+
+static struct attribute_group umwait_attr_group = {
+ .attrs = umwait_attrs,
+ .name = "umwait_control",
+};
+
+static int __init umwait_init(void)
+{
+ struct device *dev;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_WAITPKG))
+ return -ENODEV;
+
+ /*
+ * Cache the original control MSR value before the control MSR is
+ * changed. This is the only place where orig_umwait_control_cached
+ * is modified.
+ */
+ rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
+ umwait_cpu_online, umwait_cpu_offline);
+ if (ret < 0) {
+ /*
+ * On failure, the control MSR on all CPUs has the
+ * original control value.
+ */
+ return ret;
+ }
+
+ register_syscore(&umwait_syscore);
+
+ /*
+ * Add umwait control interface. Ignore failure, so at least the
+ * default values are set up in case the machine manages to boot.
+ */
+ dev = bus_get_dev_root(&cpu_subsys);
+ if (dev) {
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ put_device(dev);
+ }
+ return ret;
+}
+device_initcall(umwait_init);
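
With the attribute group registered on the cpu subsystem root, the files typically land under /sys/devices/system/cpu/umwait_control/. A small reader sketch (path inferred from the group name above; the directory only exists on WAITPKG-capable hardware):

#include <stdio.h>

static void show(const char *path)
{
	char buf[32];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	show("/sys/devices/system/cpu/umwait_control/max_time");
	show("/sys/devices/system/cpu/umwait_control/enable_c02");
	return 0;
}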
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,106 +22,572 @@
*/
#include <linux/dmi.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/efi.h>
+#include <linux/reboot.h>
+#include <linux/static_call.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
+#include <asm/timer.h>
+#include <asm/apic.h>
+#include <asm/vmware.h>
+#include <asm/svm.h>
-#define CPUID_VMWARE_INFO_LEAF 0x40000000
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-#define VMWARE_HYPERVISOR_PORT 0x5658
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
-#define VMWARE_PORT_CMD_GETVERSION 10
-#define VMWARE_PORT_CMD_GETHZ 45
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx)" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "0"(VMWARE_HYPERVISOR_MAGIC), \
- "1"(VMWARE_PORT_CMD_##cmd), \
- "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
- "memory");
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
+
+#define STEALCLOCK_NOT_AVAILABLE (-1)
+#define STEALCLOCK_DISABLED 0
+#define STEALCLOCK_ENABLED 1
+
+struct vmware_steal_time {
+ union {
+ u64 clock; /* stolen time counter in units of vtsc */
+ struct {
+ /* only for little-endian */
+ u32 clock_low;
+ u32 clock_high;
+ };
+ };
+ u64 reserved[7];
+};
+
+static unsigned long vmware_tsc_khz __ro_after_init;
+static u8 vmware_hypercall_mode __ro_after_init;
+
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
- uint32_t eax, ebx, ecx, edx;
+ return vmware_tsc_khz;
+}
+
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static bool vmw_sched_clock __initdata = true;
+static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64);
+static bool has_steal_clock;
+static bool steal_acc __initdata = true; /* steal time accounting */
+
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = false;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static __init int parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static noinstr u64 vmware_sched_clock(void)
+{
+ unsigned long long ns;
+
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
+}
+
+static void __init vmware_cyc2ns_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
+{
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
+}
+
+static bool stealclock_enable(phys_addr_t pa)
+{
+ return vmware_cmd_stealclock(upper_32_bits(pa),
+ lower_32_bits(pa)) == STEALCLOCK_ENABLED;
+}
+
+static int __stealclock_disable(void)
+{
+ return vmware_cmd_stealclock(0, 1);
+}
+
+static void stealclock_disable(void)
+{
+ __stealclock_disable();
+}
+
+static bool vmware_is_stealclock_available(void)
+{
+ return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE;
+}
+
+/**
+ * vmware_steal_clock() - read the per-cpu steal clock
+ * @cpu: the cpu number whose steal clock we want to read
+ *
+ * The function reads the steal clock if we are on a 64-bit system, otherwise
+ * reads it in parts, checking that the high part didn't change in the
+ * meantime.
+ *
+ * Return:
+ * The steal clock reading in ns.
+ */
+static u64 vmware_steal_clock(int cpu)
+{
+ struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
+ u64 clock;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ clock = READ_ONCE(steal->clock);
+ else {
+ u32 initial_high, low, high;
+
+ do {
+ initial_high = READ_ONCE(steal->clock_high);
+ /* Do not reorder initial_high and high readings */
+ virt_rmb();
+ low = READ_ONCE(steal->clock_low);
+ /* Keep low reading in between */
+ virt_rmb();
+ high = READ_ONCE(steal->clock_high);
+ } while (initial_high != high);
+
+ clock = ((u64)high << 32) | low;
+ }
+
+ return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+}
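
The 32-bit branch above is the classic torn-read guard for a 64-bit counter updated behind the reader's back: sample high, then low, then high again, and retry if the high word moved. Stripped of the VMware specifics, the pattern looks roughly like this (a sketch; the acquire fences stand in for virt_rmb()):

#include <stdint.h>

/* 64-bit counter updated by another agent, stored as two 32-bit halves */
struct split64 {
	volatile uint32_t low;
	volatile uint32_t high;
};

/* Retry until the high word is stable across the low-word read */
static uint64_t read_split64(const struct split64 *c)
{
	uint32_t initial_high, low, high;

	do {
		initial_high = c->high;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		low = c->low;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		high = c->high;
	} while (initial_high != high);

	return ((uint64_t)high << 32) | low;
}

int main(void)
{
	struct split64 c = { .low = 0x89abcdefu, .high = 0x01234567u };

	return read_split64(&c) != 0x0123456789abcdefULL;
}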
+
+static void vmware_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ if (!stealclock_enable(slow_virt_to_phys(st))) {
+ has_steal_clock = false;
+ return;
+ }
+
+ pr_info("vmware-stealtime: cpu %d, pa %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+static void vmware_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ stealclock_disable();
+}
+
+static void vmware_guest_cpu_init(void)
+{
+ if (has_steal_clock)
+ vmware_register_steal_time();
+}
+
+static void vmware_pv_guest_cpu_reboot(void *unused)
+{
+ vmware_disable_steal_time();
+}
+
+static int vmware_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vmware_pv_reboot_nb = {
+ .notifier_call = vmware_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init vmware_smp_prepare_boot_cpu(void)
+{
+ vmware_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
+
+static int vmware_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_guest_cpu_init();
+ local_irq_enable();
+ return 0;
+}
+
+static int vmware_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_disable_steal_time();
+ local_irq_enable();
+ return 0;
+}
+#endif
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_ops.cpu.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz == 0)
+ return;
+
+ vmware_cyc2ns_setup();
+
+ if (vmw_sched_clock)
+ paravirt_set_sched_clock(vmware_sched_clock);
+
+ if (vmware_is_stealclock_available()) {
+ has_steal_clock = true;
+ static_call_update(pv_steal_clock, vmware_steal_clock);
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+ /* We use reboot notifier only to disable steal clock */
+ register_reboot_notifier(&vmware_pv_reboot_nb);
- tsc_hz = eax | (((uint64_t)ebx) << 32);
- do_div(tsc_hz, 1000);
- BUG_ON(tsc_hz >> 32);
- printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
- (unsigned long) tsc_hz / 1000,
- (unsigned long) tsc_hz % 1000);
- return tsc_hz;
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu =
+ vmware_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "x86/vmware:online",
+ vmware_cpu_online,
+ vmware_cpu_down_prepare) < 0)
+ pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ vmware_guest_cpu_init();
+#endif
+ }
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
+/*
+ * VMware hypervisor takes care of exporting a reliable TSC to the guest.
+ * Still, due to timing difference when running on virtual cpus, the TSC can
+ * be marked as unstable in some cases. For example, the TSC sync check at
+ * bootup can fail due to a marginal offset between vcpus' TSCs (though the
+ * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
+ * is not suitable as a watchdog when running on a hypervisor because the
+ * kernel may miss a wrap of the counter if the vcpu is descheduled for a
+ * long time. To skip these checks at runtime we set these capability bits,
+ * so that the kernel could just trust the hypervisor with providing a
+ * reliable virtual TSC that is suitable for timekeeping.
+ */
+static void __init vmware_set_capabilities(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMCALL);
+ else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL);
}
static void __init vmware_platform_setup(void)
{
- uint32_t eax, ebx, ecx, edx;
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
- if (ebx != UINT_MAX)
+ if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
+
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
+ vmware_tsc_khz = tsc_khz;
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
- printk(KERN_WARNING
- "Failed to get TSC freq from the hypervisor\n");
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Skip lapic calibration since we know the bus frequency. */
+ lapic_timer_period = ecx / HZ;
+ pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+ ecx);
+#endif
+ } else {
+ pr_warn("Failed to get TSC freq from the hypervisor\n");
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
+ vmware_paravirt_ops_setup();
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+ vmware_set_capabilities();
+}
+
+static u8 __init vmware_select_hypercall(void)
+{
+ int eax, ebx, ecx, edx;
+
+ cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
+ return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL |
+ CPUID_VMWARE_FEATURES_ECX_VMCALL));
}
/*
- * While checking the dmi string infomation, just checking the product
+ * While checking the dmi string information, just checking the product
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
+ * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
+ * intentionally defaults to 0.
*/
-static bool __init vmware_platform(void)
+static u32 __init vmware_platform(void)
{
- if (cpu_has_hypervisor) {
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
unsigned int eax;
unsigned int hyper_vendor_id[3];
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
- if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
- return true;
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) {
+ if (eax >= CPUID_VMWARE_FEATURES_LEAF)
+ vmware_hypercall_mode =
+ vmware_select_hypercall();
+
+ pr_info("hypercall mode: 0x%02x\n",
+ (unsigned int) vmware_hypercall_mode);
+
+ return CPUID_VMWARE_INFO_LEAF;
+ }
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
- return true;
+ return 1;
- return false;
+ return 0;
}
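
The signature test has a direct userspace analogue: when CPUID.1:ECX bit 31 is set, leaf 0x40000000 returns a 12-byte vendor string in EBX/ECX/EDX ("VMwareVMware" here). A minimal sketch using the compiler's intrinsics, illustration only:

#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char sig[13] = { 0 };

	__cpuid(1, eax, ebx, ecx, edx);
	if (!(ecx & (1u << 31))) {	/* hypervisor present bit */
		puts("bare metal");
		return 0;
	}

	__cpuid(0x40000000, eax, ebx, ecx, edx);
	memcpy(sig, &ebx, 4);
	memcpy(sig + 4, &ecx, 4);
	memcpy(sig + 8, &edx, 4);
	printf("hypervisor signature: %.12s (max leaf 0x%x)\n", sig, eax);
	return 0;
}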
+/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
+}
+
+#ifdef CONFIG_INTEL_TDX_GUEST
/*
- * VMware hypervisor takes care of exporting a reliable TSC to the guest.
- * Still, due to timing difference when running on virtual cpus, the TSC can
- * be marked as unstable in some cases. For example, the TSC sync check at
- * bootup can fail due to a marginal offset between vcpus' TSCs (though the
- * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
- * is not suitable as a watchdog when running on a hypervisor because the
- * kernel may miss a wrap of the counter if the vcpu is descheduled for a
- * long time. To skip these checks at runtime we set these capability bits,
- * so that the kernel could just trust the hypervisor with providing a
- * reliable virtual TSC that is suitable for timekeeping.
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
*/
-static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
+ return args.r12;
+}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
+ struct pt_regs *regs)
{
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+	/* Copy VMware-specific hypercall parameters to the GHCB */
+ ghcb_set_rip(ghcb, regs->ip);
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+ ghcb_set_rdi(ghcb, regs->di);
+ ghcb_set_rbp(ghcb, regs->bp);
+}
+
+static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ if (!(ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb) &&
+ ghcb_rsi_is_valid(ghcb) &&
+ ghcb_rdi_is_valid(ghcb) &&
+ ghcb_rbp_is_valid(ghcb)))
+ return false;
+
+ regs->bx = ghcb_get_rbx(ghcb);
+ regs->cx = ghcb_get_rcx(ghcb);
+ regs->dx = ghcb_get_rdx(ghcb);
+ regs->si = ghcb_get_rsi(ghcb);
+ regs->di = ghcb_get_rdi(ghcb);
+ regs->bp = ghcb_get_rbp(ghcb);
+
+ return true;
}
+#endif
-const __refconst struct hypervisor_x86 x86_hyper_vmware = {
- .name = "VMware",
- .detect = vmware_platform,
- .set_cpu_features = vmware_set_cpu_features,
- .init_platform = vmware_platform_setup,
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
+#endif
};
-EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c
new file mode 100644
index 000000000000..e2685470ba94
--- /dev/null
+++ b/arch/x86/kernel/cpu/vortex.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * No special init required for Vortex processors.
+ */
+
+static const struct cpu_dev vortex_cpu_dev = {
+ .c_vendor = "Vortex",
+ .c_ident = { "Vortex86 SoC" },
+ .legacy_models = {
+ {
+ .family = 5,
+ .model_names = {
+ [2] = "Vortex86DX",
+ [8] = "Vortex86MX",
+ },
+ },
+ {
+ .family = 6,
+ .model_names = {
+ /*
+ * Both the Vortex86EX and the Vortex86EX2
+ * have the same family and model id.
+ *
+ * However, the -EX2 supports the product name
+ * CPUID call, so this name will only be used
+ * for the -EX, which does not.
+ */
+ [0] = "Vortex86EX",
+ },
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_VORTEX,
+};
+
+cpu_dev_register(vortex_cpu_dev);
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
new file mode 100644
index 000000000000..89b1c8a70fe8
--- /dev/null
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#define MSR_ZHAOXIN_FCR57 0x00001257
+
+#define ACE_PRESENT (1 << 6)
+#define ACE_ENABLED (1 << 7)
+#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */
+
+#define RNG_PRESENT (1 << 2)
+#define RNG_ENABLED (1 << 3)
+#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
+
+static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ /* Test for Extended Feature Flags presence */
+ if (cpuid_eax(0xC0000000) >= 0xC0000001) {
+ u32 tmp = cpuid_edx(0xC0000001);
+
+ /* Enable ACE unit, if present and disabled */
+ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable ACE unit */
+ lo |= ACE_FCR;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled ACE h/w crypto\n");
+ }
+
+ /* Enable RNG unit, if present and disabled */
+ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable RNG unit */
+ lo |= RNG_ENABLE;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled h/w RNG\n");
+ }
+
+ /*
+ * Store Extended Feature Flags as word 5 of the CPU
+ * capability bit array
+ */
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
+ }
+
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+}
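
The test `(tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT` above is the usual "present but not yet enabled" idiom: mask out both bits and require exactly the present bit. A tiny illustration:

#include <stdio.h>

#define ACE_PRESENT	(1 << 6)
#define ACE_ENABLED	(1 << 7)

/* True only when the unit is advertised but not yet switched on */
static int needs_enabling(unsigned int flags)
{
	return (flags & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT;
}

int main(void)
{
	printf("%d\n", needs_enabling(ACE_PRESENT));			/* 1 */
	printf("%d\n", needs_enabling(ACE_PRESENT | ACE_ENABLED));	/* 0 */
	printf("%d\n", needs_enabling(0));				/* 0 */
	return 0;
}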
+
+static void early_init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+}
+
+static void init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ early_init_zhaoxin(c);
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+		 * Check the version and the number of counters:
+		 * Version (eax[7:0]) must not be 0;
+		 * Counters (eax[15:8]) must be greater than 1.
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (c->x86 >= 0x6)
+ init_zhaoxin_cap(c);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
+
+ init_ia32_feat_ctl(c);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int
+zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ return size;
+}
+#endif
+
+static const struct cpu_dev zhaoxin_cpu_dev = {
+ .c_vendor = "zhaoxin",
+ .c_ident = { " Shanghai " },
+ .c_early_init = early_init_zhaoxin,
+ .c_init = init_zhaoxin,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = zhaoxin_size_cache,
+#endif
+ .c_x86_vendor = X86_VENDOR_ZHAOXIN,
+};
+
+cpu_dev_register(zhaoxin_cpu_dev);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..dae436253de4 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* ----------------------------------------------------------------------- *
*
* Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- * USA; either version 2 of the License, or (at your option) any later
- * version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -33,7 +28,6 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
@@ -41,53 +35,35 @@
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
+#include <linux/completion.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/system.h>
-static struct class *cpuid_class;
+static enum cpuhp_state cpuhp_cpuid_state;
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
+struct cpuid_regs_done {
+ struct cpuid_regs regs;
+ struct completion done;
};
static void cpuid_smp_cpuid(void *cmd_block)
{
- struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
+ struct cpuid_regs_done *cmd = cmd_block;
- cpuid_count(cmd->eax, cmd->ecx,
- &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
-}
+ cpuid_count(cmd->regs.eax, cmd->regs.ecx,
+ &cmd->regs.eax, &cmd->regs.ebx,
+ &cmd->regs.ecx, &cmd->regs.edx);
-static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
+ complete(&cmd->done);
}
static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
char __user *tmp = buf;
- struct cpuid_regs cmd;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ struct cpuid_regs_done cmd;
+ int cpu = iminor(file_inode(file));
u64 pos = *ppos;
ssize_t bytes = 0;
int err = 0;
@@ -95,19 +71,27 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
if (count % 16)
return -EINVAL; /* Invalid chunk size */
+ init_completion(&cmd.done);
for (; count; count -= 16) {
- cmd.eax = pos;
- cmd.ecx = pos >> 32;
- err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
+ call_single_data_t csd;
+
+ INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
+
+ cmd.regs.eax = pos;
+ cmd.regs.ecx = pos >> 32;
+
+ err = smp_call_function_single_async(cpu, &csd);
if (err)
break;
- if (copy_to_user(tmp, &cmd, 16)) {
+ wait_for_completion(&cmd.done);
+ if (copy_to_user(tmp, &cmd.regs, 16)) {
err = -EFAULT;
break;
}
tmp += 16;
bytes += 16;
*ppos = ++pos;
+ reinit_completion(&cmd.done);
}
return bytes ? bytes : err;
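
From userspace, the driver's encoding means the file offset selects the query: the low 32 bits are the leaf (EAX), the high 32 bits the subleaf (ECX), and every read returns 16-byte EAX/EBX/ECX/EDX records. A hedged sketch (assumes the cpuid driver is loaded and the node is readable):

#define _FILE_OFFSET_BITS 64	/* 64-bit offsets even on 32-bit builds */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];
	uint32_t leaf = 0, subleaf = 0;
	off_t pos = ((off_t)subleaf << 32) | leaf;
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0 || pread(fd, regs, sizeof(regs), pos) != sizeof(regs)) {
		perror("cpuid");
		return 1;
	}
	printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}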
@@ -118,7 +102,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
- cpu = iminor(file->f_path.dentry->d_inode);
+ cpu = iminor(file_inode(file));
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
@@ -134,107 +118,72 @@ static int cpuid_open(struct inode *inode, struct file *file)
*/
static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
- .llseek = cpuid_seek,
+ .llseek = no_seek_end_llseek,
.read = cpuid_read,
.open = cpuid_open,
};
-static __cpuinit int cpuid_device_create(int cpu)
+static char *cpuid_devnode(const struct device *dev, umode_t *mode)
{
- struct device *dev;
-
- dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
- "cpu%d", cpu);
- return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+ return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
-static void cpuid_device_destroy(int cpu)
-{
- device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
-}
+static const struct class cpuid_class = {
+ .name = "cpuid",
+ .devnode = cpuid_devnode,
+};
-static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int cpuid_device_create(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
+ struct device *dev;
- switch (action) {
- case CPU_UP_PREPARE:
- err = cpuid_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- cpuid_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
+ dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
+ "cpu%d", cpu);
+ return PTR_ERR_OR_ZERO(dev);
}
-static struct notifier_block __refdata cpuid_class_cpu_notifier =
-{
- .notifier_call = cpuid_class_cpu_callback,
-};
-
-static char *cpuid_devnode(struct device *dev, mode_t *mode)
+static int cpuid_device_destroy(unsigned int cpu)
{
- return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+ device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
}
static int __init cpuid_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
"cpu/cpuid", &cpuid_fops)) {
printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
CPUID_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
- cpuid_class = class_create(THIS_MODULE, "cpuid");
- if (IS_ERR(cpuid_class)) {
- err = PTR_ERR(cpuid_class);
+ err = class_register(&cpuid_class);
+ if (err)
goto out_chrdev;
- }
- cpuid_class->devnode = cpuid_devnode;
- for_each_online_cpu(i) {
- err = cpuid_device_create(i);
- if (err != 0)
- goto out_class;
- }
- register_hotcpu_notifier(&cpuid_class_cpu_notifier);
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
+
+ cpuhp_cpuid_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i) {
- cpuid_device_destroy(i);
- }
- class_destroy(cpuid_class);
+ class_unregister(&cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
-out:
return err;
}
+module_init(cpuid_init);
static void __exit cpuid_exit(void)
{
- int cpu = 0;
-
- for_each_online_cpu(cpu)
- cpuid_device_destroy(cpu);
- class_destroy(cpuid_class);
+ cpuhp_remove_state(cpuhp_cpuid_state);
+ class_unregister(&cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
- unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
}
-
-module_init(cpuid_init);
module_exit(cpuid_exit);
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..335fd2ee9766 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -1,13 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Architecture specific (i386/x86_64) functions for kexec based crash dumps.
*
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
-#include <linux/init.h>
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,58 +21,76 @@
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/memblock.h>
+#include <asm/bootparam.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/e820/types.h>
+#include <asm/io_apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
+#include <asm/intel_pt.h>
+#include <asm/crash.h>
+#include <asm/cmdline.h>
+#include <asm/sev.h>
+
+/* Used while preparing memory map entries for the second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
-static void kdump_nmi_callback(int cpu, struct die_args *args)
+static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
- struct pt_regs *regs;
-#ifdef CONFIG_X86_32
- struct pt_regs fixed_regs;
-#endif
-
- regs = args->regs;
-
-#ifdef CONFIG_X86_32
- if (!user_mode_vm(regs)) {
- crash_fixup_ss_esp(&fixed_regs, regs);
- regs = &fixed_regs;
- }
-#endif
crash_save_cpu(regs, cpu);
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
+ /*
+ * Disable Intel PT to stop its logging
*/
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_stop_pt();
+
+ kdump_sev_callback();
disable_local_APIC();
}
-static void kdump_nmi_shootdown_cpus(void)
+void kdump_nmi_shootdown_cpus(void)
{
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
}
+/* Override the weak function in kernel/panic.c */
+void crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ if (cpus_stopped)
+ return;
+
+ if (smp_ops.crash_stop_other_cpus)
+ smp_ops.crash_stop_other_cpus();
+ else
+ smp_send_stop();
+
+ cpus_stopped = 1;
+}
+
#else
-static void kdump_nmi_shootdown_cpus(void)
+void crash_smp_send_stop(void)
{
/* There are no cpus to shootdown */
}
@@ -86,21 +109,463 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();
- kdump_nmi_shootdown_cpus();
+ crash_smp_send_stop();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
+ cpu_emergency_disable_virtualization();
+
+ /*
+ * Disable Intel PT to stop its logging
*/
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_stop_pt();
- lapic_shutdown();
-#if defined(CONFIG_X86_IO_APIC)
- disable_IO_APIC();
+#ifdef CONFIG_X86_IO_APIC
+ /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
+ ioapic_zap_locks();
+ clear_IO_APIC();
#endif
+ lapic_shutdown();
+ restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
- crash_save_cpu(regs, safe_smp_processor_id());
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() gets called after all but one CPU have been shut
+ * down and interrupts have been disabled. This allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin();
+ x86_platform.guest.enc_kexec_finish();
+
+ crash_save_cpu(regs, smp_processor_id());
+}
+
+#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
+{
+ unsigned int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static struct crash_mem *fill_up_crash_elf_data(void)
+{
+ unsigned int nr_ranges = 0;
+ struct crash_mem *cmem;
+
+ walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
+ if (!nr_ranges)
+ return NULL;
+
+ /*
+ * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
+ * may cause range splits. So add extra slots here.
+ *
+ * Excluding the low 1M should not cause another range split: a new
+ * region is split off only when both the start and the end of the
+ * excluded range fall strictly inside an existing region in cmem,
+ * without touching that region's start or end, and the excluded
+ * range here is [0, 1M], whose start of 0 can never meet that
+ * condition.
+ *
+ * But in case the low 1M range changes in the future (e.g. becomes
+ * [start, 1M]), add an extra slot anyway.
+ */
+ nr_ranges += 3 + crashk_cma_cnt;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
+ if (!cmem)
+ return NULL;
+
+ cmem->max_nr_ranges = nr_ranges;
+
+ return cmem;
+}
+
+/*
+ * Look for any unwanted ranges between mstart and mend and remove them. This
+ * might split existing ranges; the resulting split ranges are put in the
+ * cmem->ranges[] array.
+ */
+static int elf_header_exclude_ranges(struct crash_mem *cmem)
+{
+ int ret = 0;
+ int i;
+
+ /* Exclude the low 1M because it is always reserved */
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
+ if (ret)
+ return ret;
+
+ /* Exclude crashkernel region */
+ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ if (crashk_low_res.end)
+ ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
+ crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
+{
+ struct crash_mem *cmem = arg;
+
+ cmem->ranges[cmem->nr_ranges].start = res->start;
+ cmem->ranges[cmem->nr_ranges].end = res->end;
+ cmem->nr_ranges++;
+
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
+{
+ struct crash_mem *cmem;
+ int ret;
+
+ cmem = fill_up_crash_elf_data();
+ if (!cmem)
+ return -ENOMEM;
+
+ ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
+ if (ret)
+ goto out;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(cmem);
+ if (ret)
+ goto out;
+
+ /* Return the computed number of memory ranges, for hotplug usage */
+ *nr_mem_ranges = cmem->nr_ranges;
+
+ /* By default prepare 64bit headers */
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+
+out:
+ vfree(cmem);
+ return ret;
+}
+#endif
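
The extra-slot arithmetic in fill_up_crash_elf_data() is easier to see with a toy model of crash_exclude_mem_range(): punching a hole strictly inside an existing region yields two regions, so every exclusion can grow the array by one entry. A simplified userspace sketch (it assumes the hole lies within the region; the real helper also handles partial overlaps):

#include <stdio.h>

struct range { unsigned long start, end; };

/*
 * Exclude [estart, eend] from the single range r, writing the surviving
 * pieces to out[]. A hole strictly inside r produces two pieces -- the
 * reason the kernel reserves an extra slot per possible exclusion.
 */
static int exclude(struct range r, unsigned long estart, unsigned long eend,
		   struct range *out)
{
	int n = 0;

	if (estart > r.start)
		out[n++] = (struct range){ r.start, estart - 1 };
	if (eend < r.end)
		out[n++] = (struct range){ eend + 1, r.end };
	return n;
}

int main(void)
{
	struct range out[2];
	int n = exclude((struct range){ 0, 0xffff }, 0x4000, 0x7fff, out);

	for (int i = 0; i < n; i++)	/* prints the two split ranges */
		printf("[%#lx-%#lx]\n", out[i].start, out[i].end);
	return 0;
}
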
+
+#ifdef CONFIG_KEXEC_FILE
+static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
+ return 1;
+
+ memcpy(&params->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry));
+ params->e820_entries++;
+ return 0;
}
+
+static int memmap_entry_callback(struct resource *res, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820_entry ei;
+
+ ei.addr = res->start;
+ ei.size = resource_size(res);
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
+}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude elf header region */
+ start = image->elf_load_addr;
+ end = start + image->elf_headers_sz - 1;
+ ret = crash_exclude_mem_range(cmem, start, end);
+
+ if (ret)
+ return ret;
+
+ /* Exclude dm crypt keys region */
+ if (image->dm_crypt_keys_addr) {
+ start = image->dm_crypt_keys_addr;
+ end = start + image->dm_crypt_keys_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+ }
+
+ return ret;
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ unsigned int nr_ranges = 0;
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820_entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ /*
+ * In the current x86 code, the elf header is always allocated at
+ * crashk_res.start, but the resulting ranges depend on where within
+ * crashk_res it ends up. To avoid a potential out-of-bounds access
+ * in the future, add an extra slot.
+ *
+ * Using an arbitrarily placed kexec_buf for passing dm-crypt keys may
+ * cause a range split too, so add another extra slot here.
+ */
+ nr_ranges = 3;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
+ if (!cmem)
+ return -ENOMEM;
+
+ cmem->max_nr_ranges = nr_ranges;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add the low 1M */
+ cmd.type = E820_TYPE_RAM;
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI tables */
+ cmd.type = E820_TYPE_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_TYPE_NVS;
+ walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add e820 reserved ranges */
+ cmd.type = E820_TYPE_RESERVED;
+ flags = IORESOURCE_MEM;
+ walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = resource_size(&crashk_low_res);
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ei.addr = crashk_cma_ranges[i].start;
+ ei.size = crashk_cma_ranges[i].end -
+ crashk_cma_ranges[i].start + 1;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
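
crash_setup_memmap_entries() is built around the resource-walker callback shape: a small context struct (here crash_memmap_data) carries the destination and the e820 type to stamp, and the walker calls back once per matching resource. A hedged sketch of that visitor pattern, with invented stand-ins for walk_iomem_res_desc():

#include <stdio.h>

struct res { unsigned long start, size; };

typedef int (*res_cb)(const struct res *r, void *arg);

struct memmap_ctx {
	unsigned int type;	/* tag stamped on each emitted entry */
};

/* plays the role of memmap_entry_callback() */
static int emit_entry(const struct res *r, void *arg)
{
	struct memmap_ctx *ctx = arg;

	printf("entry: start=%#lx size=%#lx type=%u\n",
	       r->start, r->size, ctx->type);
	return 0;	/* non-zero would abort the walk */
}

/* plays the role of walk_iomem_res_desc(): iterate, calling back */
static void walk(const struct res *table, int n, res_cb cb, void *arg)
{
	for (int i = 0; i < n; i++)
		if (cb(&table[i], arg))
			break;
}

int main(void)
{
	struct res ram[] = { { 0x0, 0x100000 }, { 0x100000, 0x7ff00000 } };
	struct memmap_ctx ctx = { .type = 1 };	/* pretend E820_TYPE_RAM */

	walk(ram, 2, emit_entry, &ctx);
	return 0;
}
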
+
+int crash_load_segments(struct kimage *image)
+{
+ int ret;
+ unsigned long pnum = 0;
+ struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+ .buf_max = ULONG_MAX, .top_down = false };
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
+ if (ret)
+ return ret;
+
+ image->elf_headers = kbuf.buffer;
+ image->elf_headers_sz = kbuf.bufsz;
+ kbuf.memsz = kbuf.bufsz;
+
+#ifdef CONFIG_CRASH_HOTPLUG
+ /*
+ * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map,
+ * maximum CPUs and maximum memory ranges.
+ */
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
+ else
+ pnum += 2 + CONFIG_NR_CPUS_DEFAULT;
+
+ if (pnum < (unsigned long)PN_XNUM) {
+ kbuf.memsz = pnum * sizeof(Elf64_Phdr);
+ kbuf.memsz += sizeof(Elf64_Ehdr);
+
+ image->elfcorehdr_index = image->nr_segments;
+
+ /* Mark as usable by the crash kernel, else the crash kernel fails to boot */
+ image->elf_headers_sz = kbuf.memsz;
+ } else {
+ pr_err("number of Phdrs %lu exceeds max\n", pnum);
+ }
+#endif
+
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ return ret;
+ image->elf_load_addr = kbuf.mem;
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ return ret;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+#ifdef CONFIG_CRASH_HOTPLUG
+
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
+{
+#ifdef CONFIG_KEXEC_FILE
+ if (image->file_mode)
+ return 1;
+#endif
+ /*
+ * Initially, crash hotplug support for kexec_load was added
+ * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
+ * functionality was expanded to accommodate multiple kexec
+ * segment updates, leading to the introduction of the
+ * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
+ * when the kexec tool sends either of these flags, it indicates
+ * that the required kexec segment (elfcorehdr) is excluded from
+ * the SHA calculation.
+ */
+ return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+ kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
+}
+
+unsigned int arch_crash_get_elfcorehdr_size(void)
+{
+ unsigned int sz;
+
+ /* kernel_map, VMCOREINFO and maximum CPUs */
+ sz = 2 + CONFIG_NR_CPUS_DEFAULT;
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ sz += CONFIG_CRASH_MAX_MEMORY_RANGES;
+ sz *= sizeof(Elf64_Phdr);
+ return sz;
+}
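
For a sense of scale: sizeof(Elf64_Phdr) is 56 bytes, so with illustrative (not upstream-default) values of CONFIG_NR_CPUS_DEFAULT=64 and CONFIG_CRASH_MAX_MEMORY_RANGES=8192, the function returns (2 + 64 + 8192) * 56 = 462448 bytes, a bit under half a megabyte of elfcorehdr headroom:

#include <elf.h>
#include <stdio.h>

int main(void)
{
	/* illustrative values, not taken from any particular .config */
	unsigned int nr_cpus = 64, max_mem_ranges = 8192;
	unsigned int sz = (2 + nr_cpus + max_mem_ranges) * sizeof(Elf64_Phdr);

	printf("elfcorehdr phdr area: %u bytes\n", sz);	/* 462448 */
	return 0;
}
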
+
+/**
+ * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
+ * @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and
+ * NULL for CPU hotplug case.
+ *
+ * Prepare the new elfcorehdr and replace the existing elfcorehdr.
+ */
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
+{
+ void *elfbuf = NULL, *old_elfcorehdr;
+ unsigned long nr_mem_ranges;
+ unsigned long mem, memsz;
+ unsigned long elfsz = 0;
+
+ /*
+ * As crash_prepare_elf64_headers() has already described all
+ * possible CPUs, there is no need to update the elfcorehdr
+ * for additional CPU changes.
+ */
+ if ((image->file_mode || image->elfcorehdr_updated) &&
+ ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) ||
+ (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU)))
+ return;
+
+ /*
+ * Create the new elfcorehdr reflecting the changes to CPU and/or
+ * memory resources.
+ */
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
+ pr_err("unable to create new elfcorehdr");
+ goto out;
+ }
+
+ /*
+ * Obtain address and size of the elfcorehdr segment, and
+ * check it against the new elfcorehdr buffer.
+ */
+ mem = image->segment[image->elfcorehdr_index].mem;
+ memsz = image->segment[image->elfcorehdr_index].memsz;
+ if (elfsz > memsz) {
+ pr_err("update elfcorehdr elfsz %lu > memsz %lu",
+ elfsz, memsz);
+ goto out;
+ }
+
+ /*
+ * Copy new elfcorehdr over the old elfcorehdr at destination.
+ */
+ old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (!old_elfcorehdr) {
+ pr_err("mapping elfcorehdr segment failed\n");
+ goto out;
+ }
+
+ /*
+ * Temporarily invalidate the crash image while the
+ * elfcorehdr is updated.
+ */
+ xchg(&kexec_crash_image, NULL);
+ memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz);
+ xchg(&kexec_crash_image, image);
+ kunmap_local(old_elfcorehdr);
+ pr_debug("updated elfcorehdr\n");
+
+out:
+ vfree(elfbuf);
+}
+#endif
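
The xchg() pair in arch_crash_handle_hotplug_event() is a publish/unpublish dance: hide the globally visible image pointer so a crash landing mid-update finds no half-written elfcorehdr, rewrite the segment, then republish. A hedged userspace rendering of the same idea with C11 atomics (struct and names invented):

#include <stdatomic.h>
#include <string.h>

struct image { char hdr[64]; };

static _Atomic(struct image *) live_image;

/* update the header while hiding the image from concurrent consumers */
static void update_hdr(struct image *img, const char *new_hdr, size_t len)
{
	/* unpublish: a reader during the memcpy sees no image at all */
	atomic_exchange(&live_image, NULL);
	memcpy(img->hdr, new_hdr, len);
	/* republish the now-consistent image */
	atomic_exchange(&live_image, img);
}

int main(void)
{
	static struct image img;

	update_hdr(&img, "ELF", 3);
	return 0;
}
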
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3cc..5f4ae5476e19 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Memory preserving reboot related code.
*
@@ -9,13 +10,7 @@
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/crash_dump.h>
-
-#include <asm/uaccess.h>
-
-static void *kdump_buf_page;
-
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+#include <linux/uio.h>
static inline bool is_crashed_pfn_valid(unsigned long pfn)
{
@@ -33,25 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn)
#endif
}
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
void *vaddr;
@@ -61,38 +39,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!is_crashed_pfn_valid(pfn))
return -EFAULT;
- vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
-
- if (!userbuf) {
- memcpy(buf, (vaddr + offset), csize);
- kunmap_atomic(vaddr, KM_PTE0);
- } else {
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Kdump buffer page not"
- " allocated\n");
- kunmap_atomic(vaddr, KM_PTE0);
- return -EFAULT;
- }
- copy_page(kdump_buf_page, vaddr);
- kunmap_atomic(vaddr, KM_PTE0);
- if (copy_to_user(buf, (kdump_buf_page + offset), csize))
- return -EFAULT;
- }
+ vaddr = kmap_local_pfn(pfn);
+ csize = copy_to_iter(vaddr + offset, csize, iter);
+ kunmap_local(vaddr);
return csize;
}
-
-static int __init kdump_buf_page_init(void)
-{
- int ret = 0;
-
- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
- " page\n");
- ret = -ENOMEM;
- }
-
- return ret;
-}
-arch_initcall(kdump_buf_page_init);
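
The conversion above replaces copy_oldmem_page()'s user/kernel branching (and its bounce page) with one copy_to_iter() call: the iov_iter destination abstracts over where the bytes land. A rough plain-C analogue of that destination abstraction, with a function pointer standing in for the iterator (all names invented):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* toy stand-in for iov_iter: one destination type, pluggable backends */
struct sink {
	size_t (*copy)(struct sink *s, const void *src, size_t n);
	void *dst;
};

static size_t copy_to_mem(struct sink *s, const void *src, size_t n)
{
	memcpy(s->dst, src, n);	/* "kernel buffer" backend */
	return n;
}

/* backend-agnostic caller, like the rewritten copy_oldmem_page() */
static size_t copy_page_out(struct sink *s, const void *page,
			    size_t off, size_t n)
{
	return s->copy(s, (const char *)page + off, n);
}

int main(void)
{
	char page[4096] = "oldmem contents", buf[32];
	struct sink s = { .copy = copy_to_mem, .dst = buf };

	copy_page_out(&s, page, 0, 16);
	printf("%s\n", buf);
	return 0;
}
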
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..32d710f7eb84 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Memory preserving reboot related code.
*
@@ -7,45 +8,57 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
#include <linux/io.h>
+#include <linux/cc_platform.h>
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- * space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- * otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn,
+ size_t csize, unsigned long offset,
+ bool encrypted)
{
void *vaddr;
if (!csize)
return 0;
- vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (encrypted)
+ vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+ else
+ vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+
if (!vaddr)
return -ENOMEM;
- if (userbuf) {
- if (copy_to_user(buf, vaddr + offset, csize)) {
- iounmap(vaddr);
- return -EFAULT;
- }
- } else
- memcpy(buf, vaddr + offset, csize);
+ csize = copy_to_iter(vaddr + offset, csize, iter);
- iounmap(vaddr);
+ iounmap((void __iomem *)vaddr);
return csize;
}
+
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
+{
+ return __copy_oldmem_page(iter, pfn, csize, offset, false);
+}
+
+/*
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above, but ioremaps
+ * the memory with the encryption mask set, to accommodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn,
+ size_t csize, unsigned long offset)
+{
+ return __copy_oldmem_page(iter, pfn, csize, offset, true);
+}
+
+ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+
+ return read_from_oldmem(&iter, count, ppos,
+ cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
+}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..dd8748c45529
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Architecture specific OF callbacks.
+ */
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/of_irq.h>
+#include <linux/libfdt.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/of_pci.h>
+#include <linux/initrd.h>
+
+#include <asm/irqdomain.h>
+#include <asm/hpet.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/pci_x86.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/numa.h>
+#include <asm/prom.h>
+
+__initdata u64 initial_dtb;
+char __initdata cmd_line[COMMAND_LINE_SIZE];
+
+int __initdata of_ioapic;
+
+void __init add_dtb(u64 data)
+{
+ initial_dtb = data + offsetof(struct setup_data, data);
+}
+
+/*
+ * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
+ */
+static struct of_device_id __initdata ce4100_ids[] = {
+ { .compatible = "intel,ce4100-cp", },
+ { .compatible = "isa", },
+ { .compatible = "pci", },
+ {},
+};
+
+static int __init add_bus_probe(void)
+{
+ if (!of_have_populated_dt())
+ return 0;
+
+ return of_platform_bus_probe(NULL, ce4100_ids, NULL);
+}
+device_initcall(add_bus_probe);
+
+#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
+static int x86_of_pci_irq_enable(struct pci_dev *dev)
+{
+ u32 virq;
+ int ret;
+ u8 pin;
+
+ ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (ret)
+ return pcibios_err_to_errno(ret);
+ if (!pin)
+ return 0;
+
+ virq = of_irq_parse_and_map_pci(dev, 0, 0);
+ if (virq == 0)
+ return -EINVAL;
+ dev->irq = virq;
+ return 0;
+}
+
+static void x86_of_pci_irq_disable(struct pci_dev *dev)
+{
+}
+
+void x86_of_pci_init(void)
+{
+ pcibios_enable_irq = x86_of_pci_irq_enable;
+ pcibios_disable_irq = x86_of_pci_irq_disable;
+}
+#endif
+
+static void __init dtb_setup_hpet(void)
+{
+#ifdef CONFIG_HPET_TIMER
+ struct device_node *dn;
+ struct resource r;
+ int ret;
+
+ dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
+ if (!dn)
+ return;
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ WARN_ON(1);
+ return;
+ }
+ hpet_address = r.start;
+#endif
+}
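
dtb_setup_hpet() is the canonical probe-by-compatible shape: find the node, translate its first reg entry to an address, and return quietly when the hardware simply isn't described. The same control flow against a toy in-memory "device tree" (data and names invented for illustration):

#include <stdio.h>
#include <string.h>

struct dt_node { const char *compatible; unsigned long base; };

static const struct dt_node nodes[] = {
	{ "intel,ce4100-lapic", 0xfee00000 },
	{ "intel,ce4100-hpet",  0xfed00000 },
};

/* analogous to of_find_compatible_node() + of_address_to_resource() */
static int lookup(const char *compat, unsigned long *base)
{
	for (size_t i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
		if (!strcmp(nodes[i].compatible, compat)) {
			*base = nodes[i].base;
			return 0;
		}
	}
	return -1;	/* absent hardware: caller just returns */
}

int main(void)
{
	unsigned long hpet;

	if (!lookup("intel,ce4100-hpet", &hpet))
		printf("hpet at %#lx\n", hpet);
	return 0;
}
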
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static void __init dtb_cpu_setup(void)
+{
+ struct device_node *dn;
+ u32 apic_id;
+
+ for_each_of_cpu_node(dn) {
+ apic_id = of_get_cpu_hwid(dn, 0);
+ if (apic_id == ~0U) {
+ pr_warn("%pOF: missing local APIC ID\n", dn);
+ continue;
+ }
+ topology_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+ set_apicid_to_node(apic_id, of_node_to_nid(dn));
+ }
+}
+
+static void __init dtb_lapic_setup(void)
+{
+ struct device_node *dn;
+ struct resource r;
+ unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ int ret;
+
+ dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
+ if (dn) {
+ ret = of_address_to_resource(dn, 0, &r);
+ if (WARN_ON(ret))
+ return;
+ lapic_addr = r.start;
+ }
+
+ /* Did the boot loader set up the local APIC? */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ /* Try force enabling, which registers the APIC address */
+ if (!apic_force_enable(lapic_addr))
+ return;
+ } else {
+ register_lapic_address(lapic_addr);
+ }
+ smp_found_config = 1;
+ pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode");
+ pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire");
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+static unsigned int ioapic_id;
+
+struct of_ioapic_type {
+ u32 out_type;
+ u32 is_level;
+ u32 active_low;
+};
+
+static struct of_ioapic_type of_ioapic_type[] =
+{
+ {
+ .out_type = IRQ_TYPE_EDGE_FALLING,
+ .is_level = 0,
+ .active_low = 1,
+ },
+ {
+ .out_type = IRQ_TYPE_LEVEL_HIGH,
+ .is_level = 1,
+ .active_low = 0,
+ },
+ {
+ .out_type = IRQ_TYPE_LEVEL_LOW,
+ .is_level = 1,
+ .active_low = 1,
+ },
+ {
+ .out_type = IRQ_TYPE_EDGE_RISING,
+ .is_level = 0,
+ .active_low = 0,
+ },
+};
+
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_fwspec *fwspec = (struct irq_fwspec *)arg;
+ struct of_ioapic_type *it;
+ struct irq_alloc_info tmp;
+ int type_index;
+
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+
+ type_index = fwspec->param[1];
+ if (type_index >= ARRAY_SIZE(of_ioapic_type))
+ return -EINVAL;
+
+ it = &of_ioapic_type[type_index];
+ ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low);
+ tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+ tmp.ioapic.pin = fwspec->param[0];
+
+ return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
+}
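
dt_irqdomain_alloc() decodes a two-cell devicetree interrupt specifier: param[0] names the IO-APIC pin and param[1] indexes the four-entry trigger/polarity table above. A small sketch of just that decode step, mirroring the table order shown (edge-falling, level-high, level-low, edge-rising):

#include <stdio.h>

struct irq_type { int is_level, active_low; };

static const struct irq_type types[] = {
	{ 0, 1 }, { 1, 0 }, { 1, 1 }, { 0, 0 },
};

static int decode(const unsigned int *param, int count)
{
	if (count < 2 || param[1] >= sizeof(types) / sizeof(types[0]))
		return -1;	/* malformed specifier */

	printf("pin %u: level=%d active_low=%d\n", param[0],
	       types[param[1]].is_level, types[param[1]].active_low);
	return 0;
}

int main(void)
{
	unsigned int spec[2] = { 9, 2 };	/* pin 9, level-low */

	return decode(spec, 2) ? 1 : 0;
}
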
+
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .alloc = dt_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
+
+static void __init dtb_add_ioapic(struct device_node *dn)
+{
+ struct resource r;
+ int ret;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &ioapic_irq_domain_ops,
+ .dev = dn,
+ };
+
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ pr_err("Can't obtain address from device node %pOF.\n", dn);
+ return;
+ }
+ mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
+}
+
+static void __init dtb_ioapic_setup(void)
+{
+ struct device_node *dn;
+
+ for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+ dtb_add_ioapic(dn);
+
+ if (nr_ioapics) {
+ of_ioapic = 1;
+ return;
+ }
+ pr_err("Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
+
+static void __init dtb_apic_setup(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ dtb_lapic_setup();
+ dtb_cpu_setup();
+#endif
+ dtb_ioapic_setup();
+}
+
+static void __init x86_dtb_parse_smp_config(void)
+{
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
+
+void __init x86_flattree_get_config(void)
+{
+#ifdef CONFIG_OF_EARLY_FLATTREE
+ u32 size, map_len;
+ void *dt;
+
+ if (initial_dtb) {
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+ dt = early_memremap(initial_dtb, map_len);
+ size = fdt_totalsize(dt);
+ if (map_len < size) {
+ early_memunmap(dt, map_len);
+ dt = early_memremap(initial_dtb, size);
+ map_len = size;
+ }
+
+ early_init_dt_verify(dt, __pa(dt));
+ }
+
+ unflatten_and_copy_device_tree();
+
+ if (initial_dtb)
+ early_memunmap(dt, map_len);
+#endif
+ if (acpi_disabled && of_have_populated_dt())
+ x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
+}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 37250fe490b1..6eaf9a6bc02f 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -1,69 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/sched.h>
-#include <linux/init.h>
+#include <linux/sched/debug.h>
#include <linux/init_task.h>
#include <linux/fs.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/desc.h>
-
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#include <asm/traps.h>
+#include <asm/doublefault.h>
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-static void doublefault_fn(void)
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
+
+static void set_df_gdt_entry(unsigned int cpu);
+
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
+ * we're running the doublefault task. Cannot return.
+ */
+asmlinkage noinstr void __noreturn doublefault_shim(void)
{
- struct desc_ptr gdt_desc = {0, 0};
- unsigned long gdt, tss;
+ unsigned long cr2;
+ struct pt_regs regs;
- store_gdt(&gdt_desc);
- gdt = gdt_desc.address;
+ BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+ cr2 = native_read_cr2();
- if (ptr_ok(gdt)) {
- gdt += GDT_ENTRY_TSS << 3;
- tss = get_desc_base((struct desc_struct *)gdt);
- printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+ /* Reset back to the normal kernel task. */
+ force_reload_TR();
+ set_df_gdt_entry(smp_processor_id());
- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ trace_hardirqs_off();
- printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
- t->ip, t->sp);
+ /*
+ * Fill in pt_regs. A downside of doing this in C is that the unwinder
+ * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+ * won't successfully unwind to the source of the double fault.
+ * The main dump from exc_double_fault() is fine, though, since it
+ * uses these regs directly.
+ *
+ * If anyone ever cares, this could be moved to asm.
+ */
+ regs.ss = TSS(ss);
+ regs.__ssh = 0;
+ regs.sp = TSS(sp);
+ regs.flags = TSS(flags);
+ regs.cs = TSS(cs);
+ /* We won't go through the entry asm, so we can leave __csh as 0. */
+ regs.__csh = 0;
+ regs.ip = TSS(ip);
+ regs.orig_ax = 0;
+ regs.gs = TSS(gs);
+ regs.__gsh = 0;
+ regs.fs = TSS(fs);
+ regs.__fsh = 0;
+ regs.es = TSS(es);
+ regs.__esh = 0;
+ regs.ds = TSS(ds);
+ regs.__dsh = 0;
+ regs.ax = TSS(ax);
+ regs.bp = TSS(bp);
+ regs.di = TSS(di);
+ regs.si = TSS(si);
+ regs.dx = TSS(dx);
+ regs.cx = TSS(cx);
+ regs.bx = TSS(bx);
- printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
- t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
- }
- }
+ exc_double_fault(&regs, 0, cr2);
- for (;;)
- cpu_relax();
+ /*
+ * x86_32 does not save the original CR3 anywhere on a task switch.
+ * This means that, even if we wanted to return, we would need to find
+ * some way to reconstruct CR3. We could make a credible guess based
+ * on cpu_tlbstate, but that would be racy and would not account for
+ * PTI.
+ */
+ panic("cannot return from double fault\n");
}
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+ .tss = {
+ /*
+ * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+ * active. sp is filled in later.
+ */
.ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
+ .ip = (unsigned long) asm_exc_double_fault,
+ .flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
+ .gs = 0,
.__cr3 = __pa_nodebug(swapper_pg_dir),
- }
+ },
};
+
+static void set_df_gdt_entry(unsigned int cpu)
+{
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+ &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+}
+
+void doublefault_init_cpu_tss(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+ /*
+ * The linker isn't smart enough to initialize percpu variables that
+ * point to other places in percpu space.
+ */
+ this_cpu_write(doublefault_stack.tss.sp,
+ (unsigned long)&cea->doublefault_stack.stack +
+ sizeof(doublefault_stack.stack));
+
+ set_df_gdt_entry(cpu);
+}
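
doublefault_init_cpu_tss() exists because, as its comment notes, a static initializer cannot make one percpu object point into another percpu area: each CPU's copy needs its own address, known only at runtime. The same constraint reproduced with C11 thread-local storage (purely illustrative):

#include <stdio.h>

static _Thread_local char df_stack[256];
/* = df_stack + sizeof(df_stack) would not compile: not a constant */
static _Thread_local char *df_sp;

/* runtime fixup, analogous to doublefault_init_cpu_tss() */
static void init_cpu(void)
{
	df_sp = df_stack + sizeof(df_stack);
}

int main(void)
{
	init_cpu();
	printf("sp offset: %td\n", df_sp - df_stack);	/* 256 */
	return 0;
}
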
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..b10684dedc58 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,216 +10,343 @@
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/ftrace.h>
#include <linux/kexec.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
+#include <linux/kasan.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-#include "dumpstack.h"
-
-int panic_on_unrecovered_nmi;
-int panic_on_io_nmi;
-unsigned int code_bytes = 64;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
-void printk_address(unsigned long address, int reliable)
-{
- printk(" [<%p>] %s%pS\n", (void *) address,
- reliable ? "" : "? ", (void *) address);
-}
+static struct pt_regs exec_summary_regs;
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct thread_info *tinfo, int *graph)
+bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
{
- struct task_struct *task = tinfo->task;
- unsigned long ret_addr;
- int index = task->curr_ret_stack;
+ unsigned long *begin = task_stack_page(task);
+ unsigned long *end = task_stack_page(task) + THREAD_SIZE;
- if (addr != (unsigned long)return_to_handler)
- return;
+ if (stack < begin || stack >= end)
+ return false;
- if (!task->ret_stack || index < *graph)
- return;
+ info->type = STACK_TYPE_TASK;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
- index -= *graph;
- ret_addr = task->ret_stack[index].ret;
+ return true;
+}
- ops->address(data, ret_addr, 1);
+/* Called from get_stack_info_noinstr - so must be noinstr too */
+bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
+{
+ struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
- (*graph)++;
-}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct thread_info *tinfo, int *graph)
-{ }
-#endif
+ void *begin = ss;
+ void *end = ss + 1;
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
-static inline int valid_stack_ptr(struct thread_info *tinfo,
- void *p, unsigned int size, void *end)
-{
- void *t = tinfo;
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p > t && p < t + THREAD_SIZE - size;
+ info->type = STACK_TYPE_ENTRY;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
}
-unsigned long
-print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
+static void printk_stack_address(unsigned long address, int reliable,
+ const char *log_lvl)
{
- struct stack_frame *frame = (struct stack_frame *)bp;
-
- while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, 0);
- }
- print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
- }
- stack++;
- }
- return bp;
+ touch_nmi_watchdog();
+ printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
-EXPORT_SYMBOL_GPL(print_context_stack);
-unsigned long
-print_context_stack_bp(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
+static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
+ unsigned int nbytes)
{
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long *ret_addr = &frame->return_address;
+ if (!user_mode(regs))
+ return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
- while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
- unsigned long addr = *ret_addr;
+ /* The user space code from other tasks cannot be accessed. */
+ if (regs != task_pt_regs(current))
+ return -EPERM;
- if (!__kernel_text_address(addr))
- break;
+ /*
+ * Even if named copy_from_user_nmi() this can be invoked from
+ * other contexts and will not try to resolve a pagefault, which is
+ * the correct thing to do here as this code can be called from any
+ * context.
+ */
+ return copy_from_user_nmi(buf, (void __user *)src, nbytes);
+}
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- ret_addr = &frame->return_address;
- print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+/*
+ * There are a couple of reasons for the 2/3rds prologue, courtesy of Linus:
+ *
+ * In the case where we don't have the exact kernel image (which, if we did, we
+ * could simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus in making sense of the register dump.
+ *
+ * What is more, the additional complication of a variable-length insn arch
+ * like x86 warrants having a longer byte sequence before rIP so that the
+ * disassembler can "sync" up properly and find instruction boundaries when
+ * decoding the opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
+ * guesstimate in an attempt to achieve all of the above.
+ */
+void show_opcodes(struct pt_regs *regs, const char *loglvl)
+{
+#define PROLOGUE_SIZE 42
+#define EPILOGUE_SIZE 21
+#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
+ u8 opcodes[OPCODE_BUFSIZE];
+ unsigned long prologue = regs->ip - PROLOGUE_SIZE;
+
+ switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
+ case 0:
+ printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
+ __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
+ opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
+ break;
+ case -EPERM:
+ /* No access to the user space stack of other tasks. Ignore. */
+ break;
+ default:
+ printk("%sCode: Unable to access opcode bytes at 0x%lx.\n",
+ loglvl, prologue);
+ break;
}
-
- return (unsigned long)frame;
}
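
The resulting Code: line (42 prologue bytes, the byte at regs->ip in angle brackets, then 21 epilogue bytes) leans on printk's %ph extension; outside the kernel the same layout takes an explicit loop. A hedged userspace rendering:

#include <stdio.h>

#define PROLOGUE 42
#define EPILOGUE 21

/* print buf as "Code: .. .. <fault-byte> .. .." */
static void show_code(const unsigned char *buf)
{
	int i;

	printf("Code: ");
	for (i = 0; i < PROLOGUE; i++)
		printf("%02x ", buf[i]);
	printf("<%02x> ", buf[PROLOGUE]);	/* byte at the faulting ip */
	for (i = 0; i < EPILOGUE; i++)
		printf("%02x ", buf[PROLOGUE + 1 + i]);
	printf("\n");
}

int main(void)
{
	unsigned char buf[PROLOGUE + 1 + EPILOGUE] = { 0x0f, 0x0b /* ud2 */ };

	show_code(buf);
	return 0;
}
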
-EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+void show_ip(struct pt_regs *regs, const char *loglvl)
{
- printk(data);
- print_symbol(msg, symbol);
- printk("\n");
+#ifdef CONFIG_X86_32
+ printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
+#else
+ printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+#endif
+ show_opcodes(regs, loglvl);
}
-static void print_trace_warning(void *data, char *msg)
+void show_iret_regs(struct pt_regs *regs, const char *log_lvl)
{
- printk("%s%s\n", (char *)data, msg);
+ show_ip(regs, log_lvl);
+ printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss,
+ regs->sp, regs->flags);
}
-static int print_trace_stack(void *data, char *name)
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+ bool partial, const char *log_lvl)
{
- printk("%s <%s> ", (char *)data, name);
- return 0;
+ /*
+ * These on_stack() checks aren't strictly necessary: the unwind code
+ * has already validated the 'regs' pointer. The checks are done for
+ * ordering reasons: if the registers are on the next stack, we don't
+ * want to print them out yet. Otherwise they'll be shown as part of
+ * the wrong stack. Later, when show_trace_log_lvl() switches to the
+ * next stack, this function will be called again with the same regs so
+ * they can be printed in the right context.
+ */
+ if (!partial && on_stack(info, regs, sizeof(*regs))) {
+ __show_regs(regs, SHOW_REGS_SHORT, log_lvl);
+
+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
+ /*
+ * When an interrupt or exception occurs in entry code, the
+ * full pt_regs might not have been saved yet. In that case
+ * just print the iret frame.
+ */
+ show_iret_regs(regs, log_lvl);
+ }
}
/*
- * Print one address/symbol entries per line.
+ * This function reads pointers from the stack and dereferences them. The
+ * pointers may not have their KMSAN shadow set up properly, which may result
+ * in false positive reports. Disable instrumentation to avoid those.
*/
-static void print_trace_address(void *data, unsigned long addr, int reliable)
+__no_kmsan_checks
+static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
- touch_nmi_watchdog();
- printk(data);
- printk_address(addr, reliable);
-}
+ struct unwind_state state;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+ int graph_idx = 0;
+ bool partial = false;
-static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
- .walk_stack = print_context_stack,
-};
-
-void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ unwind_start(&state, task, regs, stack);
+ stack = stack ?: get_stack_pointer(task, regs);
+ regs = unwind_get_entry_regs(&state, &partial);
+
+ /*
+ * Iterate through the stacks, starting with the current stack pointer.
+ * Each stack has a pointer to the next one.
+ *
+ * x86-64 can have several stacks:
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
+ * - entry stack
+ *
+ * x86-32 can have up to four stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
+ * - entry stack
+ */
+ for (; stack; stack = stack_info.next_sp) {
+ const char *stack_name;
+
+ stack = PTR_ALIGN(stack, sizeof(long));
+
+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
+ }
+
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
+
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+ * same time, follow proper stack frames with the unwinder.
+ *
+ * Addresses found during the scan which are not reported by
+ * the unwinder are considered to be additional clues which are
+ * sometimes useful for debugging and are prefixed with '?'.
+ * This also serves as a failsafe option in case the unwinder
+ * goes off in the weeds.
+ */
+ for (; stack < stack_info.end; stack++) {
+ unsigned long real_addr;
+ int reliable = 0;
+ unsigned long addr = READ_ONCE_NOCHECK(*stack);
+ unsigned long *ret_addr_p =
+ unwind_get_return_address_ptr(&state);
+
+ if (!__kernel_text_address(addr))
+ continue;
+
+ /*
+ * Don't print regs->ip again if it was already printed
+ * by show_regs_if_on_stack().
+ */
+ if (regs && stack == &regs->ip)
+ goto next;
+
+ if (stack == ret_addr_p)
+ reliable = 1;
+
+ /*
+ * When function graph tracing is enabled for a
+ * function, its return address on the stack is
+ * replaced with the address of an ftrace handler
+ * (return_to_handler). In that case, before printing
+ * the "real" address, we want to print the handler
+ * address as an "unreliable" hint that function graph
+ * tracing was involved.
+ */
+ real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+ addr, stack);
+ if (real_addr != addr)
+ printk_stack_address(addr, 0, log_lvl);
+ printk_stack_address(real_addr, reliable, log_lvl);
+
+ if (!reliable)
+ continue;
+
+next:
+ /*
+ * Get the next frame from the unwinder. No need to
+ * check for an error: if anything goes wrong, the rest
+ * of the addresses will just be printed as unreliable.
+ */
+ unwind_next_frame(&state);
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state, &partial);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
+ }
+
+ if (stack_name)
+ printk("%s </%s>\n", log_lvl, stack_name);
+ }
}
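
The loop above pairs a brute-force scan for text-looking words with the unwinder's authoritative frame list: everything plausible is printed, but only addresses the unwinder vouches for lose the '?' prefix. A toy version of that cross-check, with the "unwinder" simulated by a fixed list (all values invented):

#include <stdio.h>

/* pretend anything in [0x1000, 0x2000) is kernel text */
static int is_text(unsigned long a) { return a >= 0x1000 && a < 0x2000; }

int main(void)
{
	/* fake stack words: data mixed with return addresses */
	unsigned long stack[] = { 0xdead, 0x1234, 0x42, 0x1a00, 0x1234 };
	/* frames the simulated unwinder reports, in order */
	unsigned long reliable[] = { 0x1234, 0x1a00 };
	int r = 0;

	for (size_t i = 0; i < sizeof(stack) / sizeof(stack[0]); i++) {
		if (!is_text(stack[i]))
			continue;	/* skip plain data words */
		int ok = r < 2 && stack[i] == reliable[r];
		if (ok)
			r++;
		printf(" %s%#lx\n", ok ? "" : "? ", stack[i]);
	}
	return 0;
}
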
-void show_stack(struct task_struct *task, unsigned long *sp)
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, const char *log_lvl)
{
- show_stack_log_lvl(task, NULL, sp, 0, "");
+ /*
+ * Disable KASAN to avoid false positives during walking another
+ * task's stacks, as values on these stacks may change concurrently
+ * with task execution.
+ */
+ bool disable_kasan = task && task != current;
+
+ if (disable_kasan)
+ kasan_disable_current();
+
+ __show_trace_log_lvl(task, regs, stack, log_lvl);
+
+ if (disable_kasan)
+ kasan_enable_current();
}
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
+void show_stack(struct task_struct *task, unsigned long *sp,
+ const char *loglvl)
{
- unsigned long bp = 0;
- unsigned long stack;
+ task = task ? : current;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
+ /*
+ * Stack frames below this one aren't interesting. Don't show them
+ * if we're printing for %current.
+ */
+ if (!sp && task == current)
+ sp = get_stack_pointer(current, NULL);
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
+ show_trace_log_lvl(task, NULL, sp, loglvl);
+}
+
+void show_stack_regs(struct pt_regs *regs)
+{
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}
-EXPORT_SYMBOL(dump_stack);
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
{
int cpu;
unsigned long flags;
@@ -241,15 +368,18 @@ unsigned __kprobes long oops_begin(void)
bust_spinlocks(1);
return flags;
}
+NOKPROBE_SYMBOL(oops_begin);
+
+void __noreturn rewind_stack_and_make_dead(int signr);
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
die_owner = -1;
- add_taint(TAINT_DIE);
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
die_nest_count--;
if (!die_nest_count)
/* Nest count reaches zero, release the lock. */
@@ -257,57 +387,64 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
raw_local_irq_restore(flags);
oops_exit();
+ /* Executive summary in case the oops scrolled away */
+ __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT);
+
if (!signr)
return;
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops)
panic("Fatal exception");
- do_exit(signr);
+
+ /*
+ * We're not going to return, but we might be on an IST stack or
+ * have very little stack space left. Rewind the stack and kill
+ * the task.
+ * Before we rewind the stack, we have to tell KASAN that we're going to
+ * reuse the task stack and that existing poisons are invalid.
+ */
+ kasan_unpoison_task_stack(current);
+ rewind_stack_and_make_dead(signr);
}
+NOKPROBE_SYMBOL(oops_end);
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+static void __die_header(const char *str, struct pt_regs *regs, long err)
{
-#ifdef CONFIG_X86_32
- unsigned short ss;
- unsigned long sp;
-#endif
- printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
-#endif
- printk("\n");
- sysfs_printk_last_file();
+ /* Save the regs of the first oops for the executive summary later. */
+ if (!die_counter)
+ exec_summary_regs = *regs;
+
+ printk(KERN_DEFAULT
+ "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter,
+ IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
+ debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+ IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ?
+ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+}
+NOKPROBE_SYMBOL(__die_header);
+
+static int __die_body(const char *str, struct pt_regs *regs, long err)
+{
+ show_regs(regs);
+ print_modules();
+
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+ current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- show_registers(regs);
-#ifdef CONFIG_X86_32
- if (user_mode_vm(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- } else {
- sp = kernel_stack_pointer(regs);
- savesegment(ss, ss);
- }
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
-#else
- /* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
- printk(" RSP <%016lx>\n", regs->sp);
-#endif
return 0;
}
+NOKPROBE_SYMBOL(__die_body);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+ __die_header(str, regs, err);
+ return __die_body(str, regs, err);
+}
+NOKPROBE_SYMBOL(__die);
/*
* This is gone through when something in the kernel has done something bad
@@ -318,64 +455,36 @@ void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
int sig = SIGSEGV;
- if (!user_mode_vm(regs))
- report_bug(regs->ip, regs);
-
if (__die(str, regs, err))
sig = 0;
oops_end(flags, regs, sig);
}
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
+void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
{
- unsigned long flags;
-
- if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
- */
- flags = oops_begin();
- printk(KERN_EMERG "%s", str);
- printk(" on CPU%d, ip %08lx, registers:\n",
- smp_processor_id(), regs->ip);
- show_registers(regs);
- oops_end(flags, regs, 0);
- if (do_panic || panic_on_oops)
- panic("Non maskable interrupt");
- nmi_exit();
- local_irq_enable();
- do_exit(SIGBUS);
+ __die_header(str, regs, err);
+ if (gp_addr)
+ kasan_non_canonical_hook(gp_addr);
+ if (__die_body(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
}
-static int __init oops_setup(char *s)
+void show_regs(struct pt_regs *regs)
{
- if (!s)
- return -EINVAL;
- if (!strcmp(s, "panic"))
- panic_on_oops = 1;
- return 0;
-}
-early_param("oops", oops_setup);
+ enum show_regs_mode print_kernel_regs;
-static int __init kstack_setup(char *s)
-{
- if (!s)
- return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
- return 0;
-}
-early_param("kstack", kstack_setup);
+ show_regs_print_info(KERN_DEFAULT);
-static int __init code_bytes_setup(char *s)
-{
- code_bytes = simple_strtoul(s, NULL, 0);
- if (code_bytes > 8192)
- code_bytes = 8192;
+ print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL;
+ __show_regs(regs, print_kernel_regs, KERN_DEFAULT);
- return 1;
+ /*
+ * When in-kernel, we also print out the stack at the time of the fault.
+ */
+ if (!user_mode(regs))
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- */
-
-#ifndef DUMPSTACK_H
-#define DUMPSTACK_H
-
-#ifdef CONFIG_X86_32
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-#else
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-#endif
-
-#include <linux/uaccess.h>
-
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
-
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
-
-extern unsigned int code_bytes;
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-struct stack_frame_ia32 {
- u32 next_frame;
- u32 return_address;
-};
-
-static inline unsigned long rewind_frame_pointer(int n)
-{
- struct stack_frame *frame;
-
- get_bp(frame);
-
-#ifdef CONFIG_FRAME_POINTER
- while (n--) {
- if (probe_kernel_address(&frame->next_frame, frame))
- break;
- }
-#endif
-
- return (unsigned long)frame;
-}
-
-#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..722fd712e1cf 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -1,13 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
+#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
#include <linux/sysfs.h>
@@ -16,138 +18,138 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
+const char *stack_type_name(enum stack_type type)
+{
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ if (type == STACK_TYPE_ENTRY)
+ return "ENTRY_TRAMPOLINE";
+ if (type == STACK_TYPE_EXCEPTION)
+ return "#DF";
+
+ return NULL;
+}
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- int graph = 0;
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- if (!task)
- task = current;
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
- if (!stack) {
- unsigned long dummy;
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- stack = &dummy;
- if (task && task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ /*
+ * See irq_32.c -- the next stack pointer is stored at the beginning of
+ * the stack.
+ */
+ info->next_sp = (unsigned long *)*begin;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
+ return true;
+}
- for (;;) {
- struct thread_info *context;
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
+{
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
- stack = (unsigned long *)context->previous_esp;
- if (!stack)
- break;
- if (ops->stack(data, "IRQ") < 0)
- break;
- touch_nmi_watchdog();
- }
+ info->type = STACK_TYPE_SOFTIRQ;
+ info->begin = begin;
+ info->end = end;
+
+ /*
+ * The next stack pointer is stored at the beginning of the stack.
+ * See irq_32.c.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *stack;
- int i;
-
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+ struct doublefault_stack *ss = &cea->doublefault_stack;
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %08lx", *stack++);
- touch_nmi_watchdog();
- }
- printk("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ void *begin = ss->stack;
+ void *end = begin + sizeof(ss->stack);
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_EXCEPTION;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+ return true;
}
-void show_registers(struct pt_regs *regs)
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
{
- int i;
+ if (!stack)
+ goto unknown;
+
+ task = task ? : current;
+
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- print_modules();
- __show_regs(regs, 0);
+ if (task != current)
+ goto unknown;
- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
- TASK_COMM_LEN, current->comm, task_pid_nr(current),
- current_thread_info(), current, task_thread_info(current));
+ if (in_entry_stack(stack, info))
+ goto recursion_check;
+
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
+
+ if (in_softirq_stack(stack, info))
+ goto recursion_check;
+
+ if (in_doublefault_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
/*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- if (!user_mode_vm(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp,
- 0, KERN_EMERG);
-
- printk(KERN_EMERG "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- printk(" Bad EIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ goto unknown;
}
+ *visit_mask |= 1UL << info->type;
}
- printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
- if (ip < PAGE_OFFSET)
- return 0;
- if (probe_kernel_address((unsigned short *)ip, ud2))
- return 0;
+ return 0;
- return ud2 == 0x0b0f;
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
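
The visit_mask bookkeeping above is just a per-type bitmask; a standalone model (with hypothetical type values, not the kernel enum) shows how a second visit of the same stack type is rejected:

	#include <stdio.h>

	enum stack_type { STACK_TYPE_TASK = 1, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ };

	/* Returns 0 the first time a type is seen, -1 on recursion. */
	static int visit(unsigned long *visit_mask, int type)
	{
		if (*visit_mask & (1UL << type))
			return -1;	/* same stack seen twice: bail */
		*visit_mask |= 1UL << type;
		return 0;
	}

	int main(void)
	{
		unsigned long mask = 0;

		printf("%d\n", visit(&mask, STACK_TYPE_IRQ));	/* 0  */
		printf("%d\n", visit(&mask, STACK_TYPE_TASK));	/* 0  */
		printf("%d\n", visit(&mask, STACK_TYPE_IRQ));	/* -1 */
		return 0;
	}
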
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..6c5defd6569a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -1,345 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
+#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
-#include "dumpstack.h"
-
-#define N_EXCEPTION_STACKS_END \
- (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
-
-static char x86_stack_ids[][8] = {
- [ DEBUG_STACK-1 ] = "#DB",
- [ NMI_STACK-1 ] = "NMI",
- [ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ STACKFAULT_STACK-1 ] = "#SS",
- [ MCE_STACK-1 ] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [ N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS_END ] = "#DB[?]"
-#endif
+static const char * const exception_stack_names[] = {
+ [ ESTACK_DF ] = "#DF",
+ [ ESTACK_NMI ] = "NMI",
+ [ ESTACK_DB ] = "#DB",
+ [ ESTACK_MCE ] = "#MC",
+ [ ESTACK_VC ] = "#VC",
+ [ ESTACK_VC2 ] = "#VC2",
};
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
+const char *stack_type_name(enum stack_type type)
{
- unsigned k;
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = x86_stack_ids[k];
- return (unsigned long *)end;
- }
+ if (type == STACK_TYPE_TASK)
+ return "TASK";
+
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ if (type == STACK_TYPE_ENTRY) {
/*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
+ * On 64-bit, we have a generic entry stack that we
+ * use for all the kernel entry points, including
+ * SYSENTER.
*/
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- x86_stack_ids[j][4] = '1' +
- (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = x86_stack_ids[j];
- return (unsigned long *)end;
- }
-#endif
+ return "ENTRY_TRAMPOLINE";
}
+
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
return NULL;
}
-static inline int
-in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
- unsigned long *irq_stack_end)
-{
- return (stack >= irq_stack && stack < irq_stack_end);
-}
+/**
+ * struct estack_pages - Page descriptor for exception stacks
+ * @offs: Offset from the start of the exception stack area
+ * @size: Size of the exception stack
+ * @type: Type to store in the stack_info struct
+ */
+struct estack_pages {
+ u32 offs;
+ u16 size;
+ u16 type;
+};
+
+#define EPAGERANGE(st) \
+ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \
+ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \
+ .offs = CEA_ESTACK_OFFS(st), \
+ .size = CEA_ESTACK_SIZE(st), \
+ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }
/*
- * We are returning from the irq stack and go to the previous one.
- * If the previous stack is also in the irq stack, then bp in the first
- * frame of the irq stack points to the previous, interrupted one.
- * Otherwise we have another level of indirection: We first save
- * the bp of the previous stack, then we switch the stack to the irq one
- * and save a new bp that links to the previous one.
- * (See save_args())
+ * Array of exception stack page descriptors. If the stack is larger than
+ * PAGE_SIZE, all pages covering a particular stack will have the same
+ * info. The guard pages, including the unmapped DB2 stack, are zeroed
+ * out.
*/
-static inline unsigned long
-fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
- unsigned long *irq_stack, unsigned long *irq_stack_end)
+static const
+struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
+ EPAGERANGE(DF),
+ EPAGERANGE(NMI),
+ EPAGERANGE(DB),
+ EPAGERANGE(MCE),
+ EPAGERANGE(VC),
+ EPAGERANGE(VC2),
+};
+
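
EPAGERANGE() uses GCC's designated range initializers so that every page of a given exception stack maps to one shared descriptor, while untouched guard pages stay zeroed. A standalone sketch of the same trick; the offset and size (0x3000/0x2000) are made-up illustrative values:

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

	struct desc { unsigned int offs; unsigned short size; unsigned short type; };

	/* Assume a hypothetical 8KB stack at offset 0x3000 inside the area: */
	#define STK_OFFS	0x3000UL
	#define STK_SIZE	0x2000UL

	static const struct desc pages[8] = {
		[PFN_DOWN(STK_OFFS) ... PFN_DOWN(STK_OFFS + STK_SIZE - 1)] = {
			.offs = STK_OFFS, .size = STK_SIZE, .type = 42,
		},
	};

	int main(void)
	{
		/* Pages 3 and 4 share one descriptor; the rest are zeroed guards. */
		for (int i = 0; i < 8; i++)
			printf("page %d: size=%u\n", i, (unsigned)pages[i].size);
		return 0;
	}
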
+static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
-#ifdef CONFIG_FRAME_POINTER
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long next;
-
- if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (!probe_kernel_address(&frame->next_frame, next))
- return next;
- else
- WARN_ONCE(1, "Perf: bad frame pointer = %p in "
- "callchain\n", &frame->next_frame);
- }
-#endif
- return bp;
-}
+ unsigned long begin, end, stk = (unsigned long)stack;
+ const struct estack_pages *ep;
+ struct pt_regs *regs;
+ unsigned int k;
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+ begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+ /*
+	 * Handle the case where the stack trace is collected _before_
+	 * cea_exception_stacks has been initialized.
+ */
+ if (!begin)
+ return false;
+
+ end = begin + sizeof(struct cea_exception_stacks);
+ /* Bail if @stack is outside the exception stack area. */
+ if (stk < begin || stk >= end)
+ return false;
+
+ /* Calc page offset from start of exception stacks */
+ k = (stk - begin) >> PAGE_SHIFT;
+ /* Lookup the page descriptor */
+ ep = &estack_pages[k];
+ /* Guard page? */
+ if (!ep->size)
+ return false;
+
+ begin += (unsigned long)ep->offs;
+ end = begin + (unsigned long)ep->size;
+ regs = (struct pt_regs *)end - 1;
+
+ info->type = ep->type;
+ info->begin = (unsigned long *)begin;
+ info->end = (unsigned long *)end;
+ info->next_sp = (unsigned long *)regs->sp;
+ return true;
+}
+
+static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- unsigned long *irq_stack_end =
- (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned used = 0;
- struct thread_info *tinfo;
- int graph = 0;
-
- if (!task)
- task = current;
-
- if (!stack) {
- unsigned long dummy;
- stack = &dummy;
- if (task && task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *begin;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
+ /*
+	 * @end points directly to the topmost stack entry to avoid a -8

+ * adjustment in the stack switch hotpath. Adjust it back before
+ * calculating @begin.
+ */
+ end++;
+ begin = end - (IRQ_STACK_SIZE / sizeof(long));
/*
- * Print function call entries in all stacks, starting at the
- * current stack address. If the stacks consist of nested
- * exceptions
+	 * Due to the switching logic, RSP can never be == @end because the
+ * final operation is 'popq %rsp' which means after that RSP points
+ * to the original stack and not to @end.
*/
- tinfo = task_thread_info(task);
- for (;;) {
- char *id;
- unsigned long *estack_end;
- estack_end = in_exception_stack(cpu, (unsigned long)stack,
- &used, &id);
-
- if (estack_end) {
- if (ops->stack(data, id) < 0)
- break;
-
- bp = ops->walk_stack(tinfo, stack, bp, ops,
- data, estack_end, &graph);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) estack_end[-2];
- continue;
- }
- if (irq_stack_end) {
- unsigned long *irq_stack;
- irq_stack = irq_stack_end -
- (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
-
- if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = ops->walk_stack(tinfo, stack, bp,
- ops, data, irq_stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (irq_stack_end[-1]);
- bp = fixup_bp_irq_link(bp, stack, irq_stack,
- irq_stack_end);
- irq_stack_end = NULL;
- ops->stack(data, "EOI");
- continue;
- }
- }
- break;
- }
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
/*
- * This handles the process stack:
+ * The next stack pointer is stored at the top of the irq stack
+ * before switching to the irq stack. Actual stack entries are all
+ * below that.
*/
- bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
- put_cpu();
+ info->next_sp = (unsigned long *)*(end - 1);
+
+ return true;
}
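
The end adjustment and the *(end - 1) read above can be mirrored in userspace; the sizes and addresses here are arbitrary stand-ins:

	#include <stdio.h>

	#define IRQ_WORDS 512

	static unsigned long irq_stack[IRQ_WORDS];

	int main(void)
	{
		unsigned long task_stack[8];

		/* hardirq_stack_ptr points at the topmost entry (end - 1): */
		unsigned long *ptr = &irq_stack[IRQ_WORDS - 1];

		/* Entry code parks the old RSP in that top slot: */
		*ptr = (unsigned long)&task_stack[7];

		/* The unwinder adjusts back to one-past-the-end, then reads it: */
		unsigned long *end = ptr + 1;
		unsigned long *begin = end - IRQ_WORDS;
		unsigned long *next_sp = (unsigned long *)*(end - 1);

		printf("spans whole array: %d, next_sp recovered: %d\n",
		       begin == irq_stack, next_sp == &task_stack[7]);
		return 0;
	}
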
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
{
- unsigned long *irq_stack_end;
- unsigned long *irq_stack;
- unsigned long *stack;
- int cpu;
- int i;
+ if (in_task_stack(stack, task, info))
+ return true;
- preempt_disable();
- cpu = smp_processor_id();
+ if (task != current)
+ return false;
- irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
- irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+ if (in_exception_stack(stack, info))
+ return true;
- /*
- * Debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu:
- */
- if (sp == NULL) {
- if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ if (in_irq_stack(stack, info))
+ return true;
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (stack >= irq_stack && stack <= irq_stack_end) {
- if (stack == irq_stack_end) {
- stack = (unsigned long *) (irq_stack_end[-1]);
- printk(" <EOI> ");
- }
- } else {
- if (((long) stack & (THREAD_SIZE-1)) == 0)
- break;
- }
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- printk("\n%s", log_lvl);
- printk(" %016lx", *stack++);
- touch_nmi_watchdog();
- }
- preempt_enable();
+ if (in_entry_stack(stack, info))
+ return true;
- printk("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ return false;
}
-void show_registers(struct pt_regs *regs)
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
{
- int i;
- unsigned long sp;
- const int cpu = smp_processor_id();
- struct task_struct *cur = current;
-
- sp = regs->sp;
- printk("CPU %d ", cpu);
- print_modules();
- __show_regs(regs, 1);
- printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
- cur->comm, cur->pid, task_thread_info(cur), cur);
+ task = task ? : current;
+
+ if (!stack)
+ goto unknown;
+
+ if (!get_stack_info_noinstr(stack, task, info))
+ goto unknown;
/*
- * When in-kernel, we also print out the stack and code at the
- * time of the fault..
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- if (!user_mode(regs)) {
- unsigned int code_prologue = code_bytes * 43 / 64;
- unsigned int code_len = code_bytes;
- unsigned char c;
- u8 *ip;
-
- printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- regs->bp, KERN_EMERG);
-
- printk(KERN_EMERG "Code: ");
-
- ip = (u8 *)regs->ip - code_prologue;
- if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
- /* try starting at IP */
- ip = (u8 *)regs->ip;
- code_len = code_len - code_prologue + 1;
- }
- for (i = 0; i < code_len; i++, ip++) {
- if (ip < (u8 *)PAGE_OFFSET ||
- probe_kernel_address(ip, c)) {
- printk(" Bad RIP value.");
- break;
- }
- if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
- else
- printk("%02x ", c);
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type)) {
+ if (task == current)
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ goto unknown;
}
+ *visit_mask |= 1UL << info->type;
}
- printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
- if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
- return 0;
+ return 0;
- return ud2 == 0x0b0f;
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb1..b15b97d3cb52 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1,41 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
+ * Low level x86 E820 memory map handling functions.
*
- * Getting sanitize_e820_map() in sync with i386 version by applying change:
- * - Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * The firmware and bootloader pass us the "E820 table", which is the primary
+ * physical memory layout description available on x86 systems.
*
+ * The kernel takes the E820 memory layout and optionally modifies it with
+ * quirks and other tweaks, and feeds that into the generic Linux memory
+ * allocation code routines via a platform independent interface (memblock, etc.).
*/
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/pfn.h>
+#include <linux/crash_dump.h>
+#include <linux/memblock.h>
#include <linux/suspend.h>
+#include <linux/acpi.h>
#include <linux/firmware-map.h>
+#include <linux/sort.h>
+#include <linux/memory_hotplug.h>
+#include <linux/kvm_types.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
+#include <asm/e820/api.h>
#include <asm/setup.h>
/*
- * The e820 map is the map that gets modified e.g. with command line parameters
- * and that is also registered with modifications in the kernel resource tree
- * with the iomem_resource as parent.
+ * We organize the E820 table into three main data structures:
*
- * The e820_saved is directly saved after the BIOS-provided memory map is
- * copied. It doesn't get modified afterwards. It's registered for the
- * /sys/firmware/memmap interface.
+ * - 'e820_table_firmware': the original firmware version passed to us by the
+ * bootloader - not modified by the kernel. It is composed of two parts:
+ * the first 128 E820 memory entries in boot_params.e820_table and the remaining
+ * (if any) entries of the SETUP_E820_EXT nodes. We use this to:
*
- * That memory map is not modified and is used as base for kexec. The kexec'd
- * kernel should get the same memory map as the firmware provides. Then the
- * user can e.g. boot the original kernel with mem=1G while still booting the
- * next kernel with full memory.
+ * - the hibernation code uses it to generate a kernel-independent CRC32
+ * checksum of the physical memory layout of a system.
+ *
+ * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
+ * passed to us by the bootloader - the major difference between
+ * e820_table_firmware[] and this one is that e820_table_kexec[]
+ * might be modified by the kexec itself to fake an mptable.
+ * We use this to:
+ *
+ * - kexec, which is a bootloader in disguise, uses the original E820
+ * layout to pass to the kexec-ed kernel. This way the original kernel
+ * can have a restricted E820 map while the kexec()-ed kexec-kernel
+ * can have access to full memory - etc.
+ *
+ * Export the memory layout via /sys/firmware/memmap. kexec-tools uses
+ * the entries to create an E820 table for the kexec kernel.
+ *
+ * kexec_file_load in-kernel code uses the table for the kexec kernel.
+ *
+ * - 'e820_table': this is the main E820 table that is massaged by the
+ * low level x86 platform code, or modified by boot parameters, before
+ * passed on to higher level MM layers.
+ *
+ * Once the E820 map has been converted to the standard Linux memory layout
+ * information, its role stops - modifying it has no effect and does not get
+ * re-propagated. So its main role is temporary bootstrap storage of
+ * firmware-specific memory layout data during early bootup.
*/
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820_table e820_table_init __initdata;
+static struct e820_table e820_table_kexec_init __initdata;
+static struct e820_table e820_table_firmware_init __initdata;
+
+struct e820_table *e820_table __refdata = &e820_table_init;
+struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init;
+struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
@@ -47,141 +75,166 @@ EXPORT_SYMBOL(pci_mem_start);
* This function checks if any part of the range <start,end> is mapped
* with type.
*/
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
+static bool _e820__mapped_any(struct e820_table *table,
+ u64 start, u64 end, enum e820_type type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < table->nr_entries; i++) {
+ struct e820_entry *entry = &table->entries[i];
- if (type && ei->type != type)
+ if (type && entry->type != type)
continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
+ if (entry->addr >= end || entry->addr + entry->size <= start)
continue;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type)
+{
+ return _e820__mapped_any(e820_table_firmware, start, end, type);
+}
+EXPORT_SYMBOL_FOR_KVM(e820__mapped_raw_any);
+
+bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
+{
+ return _e820__mapped_any(e820_table, start, end, type);
+}
+EXPORT_SYMBOL_GPL(e820__mapped_any);
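
The skip test above — entry->addr >= end || entry->addr + entry->size <= start — treats entries as half-open intervals. A small self-contained check of the edge cases (the values are arbitrary):

	#include <stdbool.h>
	#include <stdio.h>

	/* Half-open [addr, addr+size) vs. [start, end) — negation of the skip test: */
	static bool overlaps(unsigned long long addr, unsigned long long size,
			     unsigned long long start, unsigned long long end)
	{
		return !(addr >= end || addr + size <= start);
	}

	int main(void)
	{
		/* Entry [0x1000, 0x3000) vs. three query ranges: */
		printf("%d\n", overlaps(0x1000, 0x2000, 0x0000, 0x1000)); /* 0: meets only at edge */
		printf("%d\n", overlaps(0x1000, 0x2000, 0x2fff, 0x5000)); /* 1: one byte shared  */
		printf("%d\n", overlaps(0x1000, 0x2000, 0x3000, 0x4000)); /* 0: starts at entry end */
		return 0;
	}
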
/*
- * This function checks if the entire range <start,end> is mapped with type.
+ * This function checks if the entire <start,end> range is mapped with 'type'.
*
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
+ * Note: this function only works correctly once the E820 table is sorted and
+ * non-overlapping (at least for the range specified), which is normally the case.
*/
-int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+ enum e820_type type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
- if (type && ei->type != type)
+ if (type && entry->type != type)
continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
+
+ /* Is the region (part) in overlap with the current region? */
+ if (entry->addr >= end || entry->addr + entry->size <= start)
continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
+ /*
+ * If the region is at the beginning of <start,end> we move
+ * 'start' to the end of the region since it's ok until there
*/
- if (ei->addr <= start)
- start = ei->addr + ei->size;
+ if (entry->addr <= start)
+ start = entry->addr + entry->size;
+
/*
- * if start is now at or beyond end, we're done, full
- * coverage
+ * If 'start' is now at or beyond 'end', we're done, full
+ * coverage of the desired range exists:
*/
if (start >= end)
- return 1;
+ return entry;
}
- return 0;
+
+ return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+ return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+ struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+ return entry ? entry->type : -EINVAL;
}
/*
- * Add a memory region to the kernel e820 map.
+ * Add a memory region to the kernel E820 map.
*/
-static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
- int type)
+static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
{
- int x = e820x->nr_map;
+ int x = table->nr_entries;
- if (x >= ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ if (x >= ARRAY_SIZE(table->entries)) {
+ pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ start, start + size - 1);
return;
}
- e820x->map[x].addr = start;
- e820x->map[x].size = size;
- e820x->map[x].type = type;
- e820x->nr_map++;
+ table->entries[x].addr = start;
+ table->entries[x].size = size;
+ table->entries[x].type = type;
+ table->nr_entries++;
}
-void __init e820_add_region(u64 start, u64 size, int type)
+void __init e820__range_add(u64 start, u64 size, enum e820_type type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820__range_add(e820_table, start, size, type);
}
-static void __init e820_print_type(u32 type)
+static void __init e820_print_type(enum e820_type type)
{
switch (type) {
- case E820_RAM:
- case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)");
- break;
- case E820_RESERVED:
- printk(KERN_CONT "(reserved)");
- break;
- case E820_ACPI:
- printk(KERN_CONT "(ACPI data)");
- break;
- case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)");
- break;
- case E820_UNUSABLE:
- printk(KERN_CONT "(unusable)");
- break;
- default:
- printk(KERN_CONT "type %u", type);
- break;
+ case E820_TYPE_RAM: pr_cont("usable"); break;
+ case E820_TYPE_RESERVED: pr_cont("reserved"); break;
+ case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
+ case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
+ case E820_TYPE_NVS: pr_cont("ACPI NVS"); break;
+ case E820_TYPE_UNUSABLE: pr_cont("unusable"); break;
+ case E820_TYPE_PMEM: /* Fall through: */
+ case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break;
+ default: pr_cont("type %u", type); break;
}
}
-void __init e820_print_map(char *who)
+void __init e820__print_table(char *who)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
- (unsigned long long) e820.map[i].addr,
- (unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
- e820_print_type(e820.map[i].type);
- printk(KERN_CONT "\n");
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ pr_info("%s: [mem %#018Lx-%#018Lx] ",
+ who,
+ e820_table->entries[i].addr,
+ e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+
+ e820_print_type(e820_table->entries[i].type);
+ pr_cont("\n");
}
}
/*
- * Sanitize the BIOS e820 map.
+ * Sanitize an E820 map.
*
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps,
+ * Some E820 layouts include overlapping entries. The following
+ * replaces the original E820 map with a new one, removing overlaps,
* and resolving conflicting memory types in favor of highest
* numbered type.
*
- * The input parameter biosmap points to an array of 'struct
- * e820entry' which on entry has elements in the range [0, *pnr_map)
- * valid, and which has space for up to max_nr_map entries.
- * On return, the resulting sanitized e820 map entries will be in
- * overwritten in the same location, starting at biosmap.
+ * The input parameter 'entries' points to an array of 'struct
+ * e820_entry' which on entry has elements in the range [0, *nr_entries)
+ * valid, and which has space for up to max_nr_entries entries.
+ * On return, the resulting sanitized E820 map entries will be
+ * overwritten in the same location, starting at 'entries'.
*
- * The integer pointed to by pnr_map must be valid on entry (the
- * current number of valid entries located at biosmap) and will
- * be updated on return, with the new number of valid entries
- * (something no more than max_nr_map.)
+ * The integer pointed to by nr_entries must be valid on entry (the
+ * current number of valid entries located at 'entries'). If the
+ * sanitizing succeeds the *nr_entries will be updated with the new
+ * number of valid entries (something no more than max_nr_entries).
*
- * The return value from sanitize_e820_map() is zero if it
+ * The return value from e820__update_table() is zero if it
* successfully 'sanitized' the map entries passed in, and is -1
* if it did nothing, which can happen if either of (1) it was
* only passed one map entry, or (2) any of the input map entries
@@ -223,188 +276,173 @@ void __init e820_print_map(char *who)
* ____________________33__
* ______________________4_
*/
+struct change_member {
+ /* Pointer to the original entry: */
+ struct e820_entry *entry;
+ /* Address for this change point: */
+ unsigned long long addr;
+};
+
+static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
+static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
+static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
+static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
+
+static int __init cpcompare(const void *a, const void *b)
+{
+ struct change_member * const *app = a, * const *bpp = b;
+ const struct change_member *ap = *app, *bp = *bpp;
+
+ /*
+ * Inputs are pointers to two elements of change_point[]. If their
+ * addresses are not equal, their difference dominates. If the addresses
+ * are equal, then consider one that represents the end of its region
+ * to be greater than one that does not.
+ */
+ if (ap->addr != bp->addr)
+ return ap->addr > bp->addr ? 1 : -1;
+
+ return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
+}
+
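
The comparator's tie-break — at equal addresses a region start sorts before a region end — is what keeps adjacent regions from producing a spurious zero-size segment between them. A runnable model of the same ordering, using qsort() in place of the kernel's sort():

	#include <stdio.h>
	#include <stdlib.h>

	struct entry { unsigned long long addr, size; };
	struct cp { struct entry *entry; unsigned long long addr; };

	/* Same ordering rule as cpcompare() above: */
	static int cpcmp(const void *a, const void *b)
	{
		const struct cp *ap = *(struct cp * const *)a;
		const struct cp *bp = *(struct cp * const *)b;

		if (ap->addr != bp->addr)
			return ap->addr > bp->addr ? 1 : -1;
		/* Equal addresses: a region end sorts after a region start. */
		return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
	}

	int main(void)
	{
		struct entry e1 = { 0x1000, 0x1000 };	/* [0x1000, 0x2000) */
		struct entry e2 = { 0x2000, 0x1000 };	/* [0x2000, 0x3000) */
		struct cp pts[4] = {
			{ &e2, 0x3000 }, { &e1, 0x1000 },
			{ &e2, 0x2000 }, { &e1, 0x2000 },
		};
		struct cp *p[4] = { &pts[0], &pts[1], &pts[2], &pts[3] };

		qsort(p, 4, sizeof(*p), cpcmp);
		for (int i = 0; i < 4; i++)	/* 1000 start, 2000 start, 2000 end, 3000 end */
			printf("%llx %s\n", p[i]->addr,
			       p[i]->addr == p[i]->entry->addr ? "start" : "end");
		return 0;
	}
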
+static bool e820_nomerge(enum e820_type type)
+{
+ /*
+ * These types may indicate distinct platform ranges aligned to
+ * numa node, protection domain, performance domain, or other
+ * boundaries. Do not merge them.
+ */
+ if (type == E820_TYPE_PRAM)
+ return true;
+ if (type == E820_TYPE_SOFT_RESERVED)
+ return true;
+ return false;
+}
-int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
- u32 *pnr_map)
+int __init e820__update_table(struct e820_table *table)
{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
- static struct change_member change_point_list[2*E820_X_MAX] __initdata;
- static struct change_member *change_point[2*E820_X_MAX] __initdata;
- static struct e820entry *overlap_list[E820_X_MAX] __initdata;
- static struct e820entry new_bios[E820_X_MAX] __initdata;
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
+ struct e820_entry *entries = table->entries;
+ u32 max_nr_entries = ARRAY_SIZE(table->entries);
+ enum e820_type current_type, last_type;
unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
+ u32 new_nr_entries, overlap_entries;
+ u32 i, chg_idx, chg_nr;
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2)
+ /* If there's only one memory region, don't bother: */
+ if (table->nr_entries < 2)
return -1;
- old_nr = *pnr_map;
- BUG_ON(old_nr > max_nr_map);
+ BUG_ON(table->nr_entries > max_nr_entries);
- /* bail out if we find any unreasonable addresses in bios map */
- for (i = 0; i < old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ /* Bail out if we find any unreasonable addresses in the map: */
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].addr + entries[i].size < entries[i].addr)
return -1;
+ }
- /* create pointers for initial change-point information (for sorting) */
- for (i = 0; i < 2 * old_nr; i++)
+ /* Create pointers for initial change-point information (for sorting): */
+ for (i = 0; i < 2 * table->nr_entries; i++)
change_point[i] = &change_point_list[i];
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i = 0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr +
- biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx;
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
+ /*
+ * Record all known change-points (starting and ending addresses),
+ * omitting empty memory regions:
+ */
+ chg_idx = 0;
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].size != 0) {
+ change_point[chg_idx]->addr = entries[i].addr;
+ change_point[chg_idx++]->entry = &entries[i];
+ change_point[chg_idx]->addr = entries[i].addr + entries[i].size;
+ change_point[chg_idx++]->entry = &entries[i];
}
}
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries = 0; /* number of entries in the overlap table */
- new_bios_entry = 0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
-
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx = 0; chgidx < chg_nr; chgidx++) {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr ==
- change_point[chgidx]->pbios->addr) {
- /*
- * add map entry to overlap list (> 1 entry
- * implies an overlap)
- */
- overlap_list[overlap_entries++] =
- change_point[chgidx]->pbios;
+ chg_nr = chg_idx;
+
+ /* Sort change-point list by memory addresses (low -> high): */
+ sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
+
+ /* Create a new memory map, removing overlaps: */
+ overlap_entries = 0; /* Number of entries in the overlap table */
+ new_nr_entries = 0; /* Index for creating new map entries */
+ last_type = 0; /* Start with undefined memory type */
+ last_addr = 0; /* Start with 0 as last starting address */
+
+ /* Loop through change-points, determining effect on the new map: */
+ for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
+ /* Keep track of all overlapping entries */
+ if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
+ /* Add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
} else {
- /*
- * remove entry from list (order independent,
- * so swap with last)
- */
+ /* Remove entry from list (order independent, so swap with last): */
for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i] ==
- change_point[chgidx]->pbios)
- overlap_list[i] =
- overlap_list[overlap_entries-1];
+ if (overlap_list[i] == change_point[chg_idx]->entry)
+ overlap_list[i] = overlap_list[overlap_entries-1];
}
overlap_entries--;
}
/*
- * if there are overlapping entries, decide which
+ * If there are overlapping entries, decide which
* "type" to use (larger value takes precedence --
* 1=usable, 2,3,4,4+=unusable)
*/
current_type = 0;
- for (i = 0; i < overlap_entries; i++)
+ for (i = 0; i < overlap_entries; i++) {
if (overlap_list[i]->type > current_type)
current_type = overlap_list[i]->type;
- /*
- * continue building up new bios map based on this
- * information
- */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /*
- * move forward only if the new size
- * was non-zero
- */
- if (new_bios[new_bios_entry].size != 0)
- /*
- * no more space left for new
- * bios entries ?
- */
- if (++new_bios_entry >= max_nr_map)
+ }
+
+ /* Continue building up new map based on this information: */
+ if (current_type != last_type || e820_nomerge(current_type)) {
+ if (last_type) {
+ new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
+ /* Move forward only if the new size was non-zero: */
+ if (new_entries[new_nr_entries].size != 0)
+ /* No more space left for new entries? */
+ if (++new_nr_entries >= max_nr_entries)
break;
}
- if (current_type != 0) {
- new_bios[new_bios_entry].addr =
- change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr = change_point[chgidx]->addr;
+ if (current_type) {
+ new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
+ new_entries[new_nr_entries].type = current_type;
+ last_addr = change_point[chg_idx]->addr;
}
last_type = current_type;
}
}
- /* retain count for new bios entries */
- new_nr = new_bios_entry;
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
- *pnr_map = new_nr;
+ /* Copy the new entries into the original location: */
+ memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
+ table->nr_entries = new_nr_entries;
return 0;
}
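
Worked through on a hypothetical two-entry overlapping map (the higher type number wins inside the overlap):

	Input (types: 1=RAM, 2=RESERVED):

	    entry A: [0x0000, 0x4000) type 1
	    entry B: [0x2000, 0x6000) type 2

	Sorted change points:  0x0000 (A start), 0x2000 (B start),
	                       0x4000 (A end),   0x6000 (B end)
	Overlap list walk:     {A} -> {A,B} -> {B} -> {}
	Type in force:          1       2       2     -

	Output:

	    [0x0000, 0x2000) type 1
	    [0x2000, 0x6000) type 2
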
-static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
- while (nr_map) {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
+ struct boot_e820_entry *entry = entries;
+
+ while (nr_entries) {
+ u64 start = entry->addr;
+ u64 size = entry->size;
+ u64 end = start + size - 1;
+ u32 type = entry->type;
+
+ /* Ignore the entry on 64-bit overflow: */
+ if (start > end && likely(size))
return -1;
- e820_add_region(start, size, type);
+ e820__range_add(start, size, type);
- biosmap++;
- nr_map--;
+ entry++;
+ nr_entries--;
}
return 0;
}
/*
- * Copy the BIOS e820 map into a safe place.
+ * Copy the BIOS E820 map into a safe place.
*
* Sanity-check it while we're at it..
*
@@ -412,18 +450,17 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
* will have given us a memory map that we can use to properly
* set up memory. If we aren't, we'll fake a memory map.
*/
-static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
/* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
+ if (nr_entries < 2)
return -1;
- return __append_e820_map(biosmap, nr_map);
+ return __append_e820_table(entries, nr_entries);
}
-static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
- u64 size, unsigned old_type,
- unsigned new_type)
+static u64 __init
+__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
u64 end;
unsigned int i;
@@ -435,78 +472,74 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
e820_print_type(old_type);
- printk(KERN_CONT " ==> ");
+ pr_cont(" ==> ");
e820_print_type(new_type);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
- for (i = 0; i < e820x->nr_map; i++) {
- struct e820entry *ei = &e820x->map[i];
+ for (i = 0; i < table->nr_entries; i++) {
+ struct e820_entry *entry = &table->entries[i];
u64 final_start, final_end;
- u64 ei_end;
+ u64 entry_end;
- if (ei->type != old_type)
+ if (entry->type != old_type)
continue;
- ei_end = ei->addr + ei->size;
- /* totally covered by new range? */
- if (ei->addr >= start && ei_end <= end) {
- ei->type = new_type;
- real_updated_size += ei->size;
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered by new range? */
+ if (entry->addr >= start && entry_end <= end) {
+ entry->type = new_type;
+ real_updated_size += entry->size;
continue;
}
- /* new range is totally covered? */
- if (ei->addr < start && ei_end > end) {
- __e820_add_region(e820x, start, size, new_type);
- __e820_add_region(e820x, end, ei_end - end, ei->type);
- ei->size = start - ei->addr;
+ /* New range is completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ __e820__range_add(table, start, size, new_type);
+ __e820__range_add(table, end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
real_updated_size += size;
continue;
}
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(end, ei_end);
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
if (final_start >= final_end)
continue;
- __e820_add_region(e820x, final_start, final_end - final_start,
- new_type);
+ __e820__range_add(table, final_start, final_end - final_start, new_type);
real_updated_size += final_end - final_start;
/*
- * left range could be head or tail, so need to update
- * size at first.
+ * Left range could be head or tail, so need to update
+ * its size first:
*/
- ei->size -= final_end - final_start;
- if (ei->addr < final_start)
+ entry->size -= final_end - final_start;
+ if (entry->addr < final_start)
continue;
- ei->addr = final_end;
+
+ entry->addr = final_end;
}
return real_updated_size;
}
-u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
+u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
- return __e820_update_range(&e820, start, size, old_type, new_type);
+ return __e820__range_update(e820_table, start, size, old_type, new_type);
}
-static u64 __init e820_update_range_saved(u64 start, u64 size,
- unsigned old_type, unsigned new_type)
+u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
+ enum e820_type old_type, enum e820_type new_type)
{
- return __e820_update_range(&e820_saved, start, size, old_type,
- new_type);
+ return __e820__range_update(t, start, size, old_type, new_type);
}
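
The three coverage cases handled by __e820__range_update() — entry fully inside the range, entry containing the range, and partial overlap — behave as follows on hypothetical entries, for an update of [0x2000, 0x5000) from RAM to RESERVED:

	entry [0x2500, 0x3000) RAM   fully inside     -> retyped to RESERVED in place
	entry [0x1000, 0x6000) RAM   contains range   -> [0x2000, 0x5000) RESERVED and
	                                                 [0x5000, 0x6000) RAM added,
	                                                 entry shrunk to [0x1000, 0x2000)
	entry [0x4000, 0x7000) RAM   partial overlap  -> [0x4000, 0x5000) RESERVED added,
	                                                 entry trimmed to [0x5000, 0x7000)
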
-/* make e820 not cover the range */
-u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
- int checktype)
+/* Remove a range of memory from the E820 table: */
+u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
{
int i;
u64 end;
@@ -516,99 +549,89 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
- if (checktype)
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
+ if (check_type)
e820_print_type(old_type);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
u64 final_start, final_end;
- u64 ei_end;
+ u64 entry_end;
- if (checktype && ei->type != old_type)
+ if (check_type && entry->type != old_type)
continue;
- ei_end = ei->addr + ei->size;
- /* totally covered? */
- if (ei->addr >= start && ei_end <= end) {
- real_removed_size += ei->size;
- memset(ei, 0, sizeof(struct e820entry));
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered? */
+ if (entry->addr >= start && entry_end <= end) {
+ real_removed_size += entry->size;
+ memset(entry, 0, sizeof(*entry));
continue;
}
- /* new range is totally covered? */
- if (ei->addr < start && ei_end > end) {
- e820_add_region(end, ei_end - end, ei->type);
- ei->size = start - ei->addr;
+ /* Is the new range completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ e820__range_add(end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
real_removed_size += size;
continue;
}
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(end, ei_end);
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
if (final_start >= final_end)
continue;
+
real_removed_size += final_end - final_start;
/*
- * left range could be head or tail, so need to update
- * size at first.
+ * Left range could be head or tail, so need to update
+ * the size first:
*/
- ei->size -= final_end - final_start;
- if (ei->addr < final_start)
+ entry->size -= final_end - final_start;
+ if (entry->addr < final_start)
continue;
- ei->addr = final_end;
+
+ entry->addr = final_end;
}
return real_removed_size;
}
-void __init update_e820(void)
+void __init e820__update_table_print(void)
{
- u32 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ if (e820__update_table(e820_table))
return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- e820_print_map("modified");
+
+ pr_info("modified physical RAM map:\n");
+ e820__print_table("modified");
}
-static void __init update_e820_saved(void)
-{
- u32 nr_map;
- nr_map = e820_saved.nr_map;
- if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
- return;
- e820_saved.nr_map = nr_map;
+static void __init e820__update_table_kexec(void)
+{
+ e820__update_table(e820_table_kexec);
}
+
#define MAX_GAP_END 0x100000000ull
+
/*
- * Search for a gap in the e820 memory space from start_addr to end_addr.
+ * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
*/
-__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
- unsigned long start_addr, unsigned long long end_addr)
+static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
{
- unsigned long long last;
- int i = e820.nr_map;
+ unsigned long long last = MAX_GAP_END;
+ int i = e820_table->nr_entries;
int found = 0;
- last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
-
while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- if (end < start_addr)
- continue;
+ unsigned long long start = e820_table->entries[i].addr;
+ unsigned long long end = start + e820_table->entries[i].size;
/*
* Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
+ * fit in 32 bits if this condition is true:
*/
if (last > end) {
unsigned long gap = last - end;
@@ -626,216 +649,168 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
}
/*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space. We pass this space to PCI to assign MMIO resources
- * for hotplug or unconfigured devices in.
+ * Search for the biggest gap in the low 32 bits of the E820
+ * memory space. We pass this space to the PCI subsystem, so
+ * that it can assign MMIO resources to hotplug or
+ * unconfigured devices.
+ *
+ * Hopefully the BIOS left enough space.
*/
-__init void e820_setup_gap(void)
+__init void e820__setup_pci_gap(void)
{
unsigned long gapstart, gapsize;
int found;
- gapstart = 0x10000000;
gapsize = 0x400000;
- found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+ found = e820_search_gap(&gapstart, &gapsize);
-#ifdef CONFIG_X86_64
if (!found) {
+#ifdef CONFIG_X86_64
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
- printk(KERN_ERR
- "PCI: Warning: Cannot find a gap in the 32bit address range\n"
- "PCI: Unassigned devices with 32bit resource registers may break!\n");
- }
+ pr_err("Cannot find an available gap in the 32-bit address range\n");
+ pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
+#else
+ gapstart = 0x10000000;
#endif
+ }
/*
- * e820_reserve_resources_late protect stolen RAM already
+ * e820__reserve_resources_late() protects stolen RAM already:
*/
pci_mem_start = gapstart;
- printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-/**
- * Because of the size limitation of struct boot_params, only first
- * 128 E820 memory entries are passed to kernel via
- * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
- * linked list of struct setup_data, which is parsed here.
- */
-void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
-{
- u32 map_len;
- int entries;
- struct e820entry *extmap;
-
- entries = sdata->len / sizeof(struct e820entry);
- map_len = sdata->len + sizeof(struct setup_data);
- if (map_len > PAGE_SIZE)
- sdata = early_ioremap(pa_data, map_len);
- extmap = (struct e820entry *)(sdata->data);
- __append_e820_map(extmap, entries);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- if (map_len > PAGE_SIZE)
- early_iounmap(sdata, map_len);
- printk(KERN_INFO "extended physical RAM map:\n");
- e820_print_map("extended");
+ pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
}
-#if defined(CONFIG_X86_64) || \
- (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
-/**
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+/*
+ * Called late during init, in free_initmem().
*
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
+ * Initial e820_table and e820_table_kexec are largish __initdata arrays.
+ *
+ * Copy them to a (usually much smaller) dynamically allocated area that is
+ * sized precisely after the number of e820 entries.
+ *
+ * This is done after we've performed all the fixes and tweaks to the tables.
+ * All functions which modify them are __init functions, which won't exist
+ * after free_initmem().
*/
-void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+__init void e820__reallocate_tables(void)
{
- int i;
- unsigned long pfn;
-
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (pfn < PFN_UP(ei->addr))
- register_nosave_region(pfn, PFN_UP(ei->addr));
-
- pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
- register_nosave_region(PFN_UP(ei->addr), pfn);
-
- if (pfn >= limit_pfn)
- break;
- }
+ struct e820_table *n;
+ int size;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
+ n = kmemdup(e820_table, size, GFP_KERNEL);
+ BUG_ON(!n);
+ e820_table = n;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
+ n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
+ BUG_ON(!n);
+ e820_table_kexec = n;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
+ n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
+ BUG_ON(!n);
+ e820_table_firmware = n;
}
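
The sizing arithmetic — offsetof() up to the entries[] array plus only the populated entries — is an ordinary C pattern; a userspace sketch with a made-up table layout:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>
	#include <string.h>

	struct entry { unsigned long long addr, size; int type; };
	struct table { unsigned int nr_entries; struct entry entries[128]; };

	int main(void)
	{
		static struct table big = { .nr_entries = 3 };

		/* Copy only the populated prefix, as e820__reallocate_tables() does: */
		size_t sz = offsetof(struct table, entries) +
			    sizeof(struct entry) * big.nr_entries;
		struct table *n = malloc(sz);
		if (!n)
			return 1;
		memcpy(n, &big, sz);

		printf("full: %zu bytes, trimmed: %zu bytes\n", sizeof(big), sz);
		free(n);
		return 0;
	}
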
-#endif
-#ifdef CONFIG_HIBERNATION
-/**
- * Mark ACPI NVS memory region, so that we can save/restore it during
- * hibernation and the subsequent resume.
+/*
+ * Because of the small fixed size of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
+ * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
+ * struct setup_data, which is parsed here.
*/
-static int __init e820_mark_nvs_memory(void)
+void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
{
- int i;
+ int entries;
+ struct boot_e820_entry *extmap;
+ struct setup_data *sdata;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ sdata = early_memremap(phys_addr, data_len);
+ entries = sdata->len / sizeof(*extmap);
+ extmap = (struct boot_e820_entry *)(sdata->data);
- if (ei->type == E820_NVS)
- suspend_nvs_register(ei->addr, ei->size);
- }
+ __append_e820_table(extmap, entries);
+ e820__update_table(e820_table);
- return 0;
+ memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+ memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+ early_memunmap(sdata, data_len);
+ pr_info("extended physical RAM map:\n");
+ e820__print_table("extended");
}
-core_initcall(e820_mark_nvs_memory);
-#endif
/*
- * Find a free area with specified alignment in a specific range.
+ * Find the ranges of physical addresses that do not correspond to
+ * E820 RAM areas and register the corresponding pages as 'nosave' for
+ * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
+ *
+ * This function requires the E820 map to be sorted and without any
+ * overlapping entries.
*/
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+void __init e820__register_nosave_regions(unsigned long limit_pfn)
{
int i;
+ u64 last_addr = 0;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
- if (ei->type != E820_RAM)
+ if (entry->type != E820_TYPE_RAM)
continue;
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area(ei_start, ei_last, start, end,
- size, align);
+ if (last_addr < entry->addr)
+ register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr));
- if (addr != -1ULL)
- return addr;
+ last_addr = entry->addr + entry->size;
}
- return -1ULL;
-}
-u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
-{
- return find_e820_area(start, end, size, align);
+ register_nosave_region(PFN_DOWN(last_addr), limit_pfn);
}
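
Worked through on a hypothetical map (classic PC layout numbers, for illustration only), with a 4KB page size:

	RAM entries:  [0x00000000, 0x0009f000)  and  [0x00100000, 0x40000000)
	limit_pfn:    0x40000

	iter 1: last_addr(0) == entry.addr(0)    -> nothing registered
	        last_addr = 0x9f000
	iter 2: last_addr(0x9f000) < 0x100000    -> nosave pfns [0x9f, 0x100)
	        last_addr = 0x40000000
	final:  register_nosave_region(0x40000, limit_pfn)  -> empty if equal

Everything between the two RAM entries — the EBDA/ROM hole in this example — is marked nosave, regardless of what non-RAM entries sit there.
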
-u64 __init get_max_mapped(void)
-{
- u64 end = max_pfn_mapped;
-
- end <<= PAGE_SHIFT;
-
- return end;
-}
+#ifdef CONFIG_ACPI
/*
- * Find next free range after *start
+ * Register ACPI NVS memory regions, so that we can save/restore them during
+ * hibernation and the subsequent resume:
*/
-u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+static int __init e820__register_nvs_regions(void)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 addr;
- u64 ei_start, ei_last;
-
- if (ei->type != E820_RAM)
- continue;
-
- ei_last = ei->addr + ei->size;
- ei_start = ei->addr;
- addr = find_early_area_size(ei_start, ei_last, start,
- sizep, align);
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
- if (addr != -1ULL)
- return addr;
+ if (entry->type == E820_TYPE_NVS)
+ acpi_nvs_register(entry->addr, entry->size);
}
- return -1ULL;
+ return 0;
}
+core_initcall(e820__register_nvs_regions);
+#endif
/*
- * pre allocated 4k and reserved it in e820
+ * Allocate the requested number of bytes with the requested alignment
+ * and return (the physical address) to the caller. Also register this
+ * range in the 'kexec' E820 table as a reserved range.
+ *
+ * This allows kexec to fake a new mptable, as if it came from the real
+ * system.
*/
-u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
{
- u64 size = 0;
u64 addr;
- u64 start;
- for (start = startt; ; start += size) {
- start = find_e820_area_size(start, &size, align);
- if (!(start + 1))
- return 0;
- if (size >= sizet)
- break;
+ addr = memblock_phys_alloc(size, align);
+ if (addr) {
+ e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+ e820__update_table_kexec();
}
-#ifdef CONFIG_X86_32
- if (start >= MAXMEM)
- return 0;
- if (start + size > MAXMEM)
- size = MAXMEM - start;
-#endif
-
- addr = round_down(start + size - sizet, align);
- if (addr < start)
- return 0;
- e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
- e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820 for early_reserve_e820\n");
- update_e820();
- update_e820_saved();
-
return addr;
}
@@ -852,22 +827,23 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
unsigned long max_arch_pfn = MAX_ARCH_PFN;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
unsigned long start_pfn;
unsigned long end_pfn;
- if (ei->type != type)
+ if (entry->type != E820_TYPE_RAM &&
+ entry->type != E820_TYPE_ACPI)
continue;
- start_pfn = ei->addr >> PAGE_SHIFT;
- end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+ start_pfn = entry->addr >> PAGE_SHIFT;
+ end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
if (start_pfn >= limit_pfn)
continue;
@@ -882,89 +858,22 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
if (last_pfn > max_arch_pfn)
last_pfn = max_arch_pfn;
- printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
- last_pfn, max_arch_pfn);
+ pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
return last_pfn;
}
-unsigned long __init e820_end_of_ram_pfn(void)
-{
- return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
-}
-unsigned long __init e820_end_of_low_ram_pfn(void)
-{
- return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
-}
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
+unsigned long __init e820__end_of_ram_pfn(void)
{
- u64 align = PAGE_SIZE;
-
- *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= last_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > last_pfn)
- *ei_endpfn = last_pfn;
-
- return 1;
+ return e820__end_ram_pfn(MAX_ARCH_PFN);
}
-/* Walk the e820 map and register active regions within a node */
-void __init e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn)
+unsigned long __init e820__end_of_low_ram_pfn(void)
{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
+ return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init e820_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long last_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - ((u64)ram << PAGE_SHIFT);
-}
-
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
{
early_printk(msg);
panic(msg);
@@ -972,7 +881,7 @@ static void early_panic(char *msg)
static int userdef __initdata;
-/* "mem=nopentium" disables the 4MB page tables. */
+/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
static int __init parse_memopt(char *p)
{
u64 mem_size;
@@ -980,22 +889,34 @@ static int __init parse_memopt(char *p)
if (!p)
return -EINVAL;
-#ifdef CONFIG_X86_32
if (!strcmp(p, "nopentium")) {
+#ifdef CONFIG_X86_32
setup_clear_cpu_cap(X86_FEATURE_PSE);
return 0;
- }
+#else
+ pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
+ return -EINVAL;
#endif
+ }
userdef = 1;
mem_size = memparse(p, &p);
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+	/* Don't remove all memory when handling a "mem={invalid}" parameter: */
+ if (mem_size == 0)
+ return -EINVAL;
+
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ max_mem_size = mem_size;
+#endif
return 0;
}
early_param("mem", parse_memopt);
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
{
char *oldp;
u64 start_at, mem_size;
@@ -1004,15 +925,7 @@ static int __init parse_memmap_opt(char *p)
return -EINVAL;
if (!strncmp(p, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
- /*
- * If we are doing a crash dump, we still need to know
- * the real mem size before original memory map is
- * reset.
- */
- saved_max_pfn = e820_end_of_ram_pfn();
-#endif
- e820.nr_map = 0;
+ e820_table->nr_entries = 0;
userdef = 1;
return 0;
}
@@ -1025,92 +938,197 @@ static int __init parse_memmap_opt(char *p)
userdef = 1;
if (*p == '@') {
start_at = memparse(p+1, &p);
- e820_add_region(start_at, mem_size, E820_RAM);
+ e820__range_add(start_at, mem_size, E820_TYPE_RAM);
} else if (*p == '#') {
start_at = memparse(p+1, &p);
- e820_add_region(start_at, mem_size, E820_ACPI);
+ e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
} else if (*p == '$') {
start_at = memparse(p+1, &p);
- e820_add_region(start_at, mem_size, E820_RESERVED);
- } else
- e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+ e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
+ } else if (*p == '!') {
+ start_at = memparse(p+1, &p);
+ e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
+ } else if (*p == '%') {
+ enum e820_type from = 0, to = 0;
+
+ start_at = memparse(p + 1, &p);
+ if (*p == '-')
+ from = simple_strtoull(p + 1, &p, 0);
+ if (*p == '+')
+ to = simple_strtoull(p + 1, &p, 0);
+ if (*p != '\0')
+ return -EINVAL;
+ if (from && to)
+ e820__range_update(start_at, mem_size, from, to);
+ else if (to)
+ e820__range_add(start_at, mem_size, to);
+ else if (from)
+ e820__range_remove(start_at, mem_size, from, 1);
+ else
+ e820__range_remove(start_at, mem_size, 0, 0);
+ } else {
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+ }
return *p == '\0' ? 0 : -EINVAL;
}
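+
+/*
+ * Illustrative examples of the syntax handled above (all values made up);
+ * several comma-separated regions may be combined in a single option:
+ *
+ *   memmap=512M@0x10000000   - add 512M of RAM at 256M
+ *   memmap=64K#0x7f000000    - mark a range as ACPI data
+ *   memmap=1G$2G             - reserve 1G starting at 2G
+ *   memmap=1G!4G             - mark legacy persistent memory (PRAM)
+ *   memmap=256M%1G-1+2       - re-type a range from type 1 (RAM) to type 2 (reserved)
+ *   memmap=2G                - remove all RAM above 2G
+ */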
+
+static int __init parse_memmap_opt(char *str)
+{
+ while (str) {
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ parse_memmap_one(str);
+ str = k;
+ }
+
+ return 0;
+}
early_param("memmap", parse_memmap_opt);
-void __init finish_e820_parsing(void)
+/*
+ * Called after parse_early_param(), after early parameters (such as mem=)
+ * have been processed, in which case we already have an E820 table filled in
+ * via the parameter callback function(s), but it's not sorted and printed yet:
+ */
+void __init e820__finish_early_params(void)
{
if (userdef) {
- u32 nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ if (e820__update_table(e820_table) < 0)
early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
+ pr_info("user-defined physical RAM map:\n");
+ e820__print_table("user");
+ }
+}
+
+static const char *__init e820_type_to_string(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_RAM: return "System RAM";
+ case E820_TYPE_ACPI: return "ACPI Tables";
+ case E820_TYPE_NVS: return "ACPI Non-volatile Storage";
+ case E820_TYPE_UNUSABLE: return "Unusable memory";
+ case E820_TYPE_PRAM: return "Persistent Memory (legacy)";
+ case E820_TYPE_PMEM: return "Persistent Memory";
+ case E820_TYPE_RESERVED: return "Reserved";
+ case E820_TYPE_SOFT_RESERVED: return "Soft Reserved";
+ default: return "Unknown E820 type";
}
}
-static inline const char *e820_type_to_string(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
{
- switch (e820_type) {
- case E820_RESERVED_KERN:
- case E820_RAM: return "System RAM";
- case E820_ACPI: return "ACPI Tables";
- case E820_NVS: return "ACPI Non-volatile Storage";
- case E820_UNUSABLE: return "Unusable memory";
- default: return "reserved";
+ switch (entry->type) {
+ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
+ case E820_TYPE_ACPI: /* Fall-through: */
+ case E820_TYPE_NVS: /* Fall-through: */
+ case E820_TYPE_UNUSABLE: /* Fall-through: */
+ case E820_TYPE_PRAM: /* Fall-through: */
+ case E820_TYPE_PMEM: /* Fall-through: */
+ case E820_TYPE_RESERVED: /* Fall-through: */
+ case E820_TYPE_SOFT_RESERVED: /* Fall-through: */
+ default: return IORESOURCE_MEM;
+ }
+}
+
+static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES;
+ case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
+ case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
+ case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+ case E820_TYPE_RESERVED: return IORES_DESC_RESERVED;
+ case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED;
+ case E820_TYPE_RAM: /* Fall-through: */
+ case E820_TYPE_UNUSABLE: /* Fall-through: */
+ default: return IORES_DESC_NONE;
+ }
+}
+
+static bool __init do_mark_busy(enum e820_type type, struct resource *res)
+{
+ /* this is the legacy bios/dos rom-shadow + mmio region */
+ if (res->start < (1ULL<<20))
+ return true;
+
+ /*
+ * Treat persistent memory and other special memory ranges like
+ * device memory, i.e. reserve it for exclusive use of a driver
+ */
+ switch (type) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_SOFT_RESERVED:
+ case E820_TYPE_PRAM:
+ case E820_TYPE_PMEM:
+ return false;
+ case E820_TYPE_RAM:
+ case E820_TYPE_ACPI:
+ case E820_TYPE_NVS:
+ case E820_TYPE_UNUSABLE:
+ default:
+ return true;
}
}
/*
- * Mark e820 reserved areas as busy for the resource manager.
+ * Mark E820 reserved areas as busy for the resource manager:
*/
+
static struct resource __initdata *e820_res;
-void __init e820_reserve_resources(void)
+
+void __init e820__reserve_resources(void)
{
int i;
struct resource *res;
u64 end;
- res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
+ res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries,
+ SMP_CACHE_BYTES);
e820_res = res;
- for (i = 0; i < e820.nr_map; i++) {
- end = e820.map[i].addr + e820.map[i].size - 1;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = e820_table->entries + i;
+
+ end = entry->addr + entry->size - 1;
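+		/* Skip entries whose end does not fit in resource_size_t (32-bit overflow): */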
if (end != (resource_size_t)end) {
res++;
continue;
}
- res->name = e820_type_to_string(e820.map[i].type);
- res->start = e820.map[i].addr;
- res->end = end;
-
- res->flags = IORESOURCE_MEM;
+ res->start = entry->addr;
+ res->end = end;
+ res->name = e820_type_to_string(entry);
+ res->flags = e820_type_to_iomem_type(entry);
+ res->desc = e820_type_to_iores_desc(entry);
/*
- * don't register the region that could be conflicted with
- * pci device BAR resource and insert them later in
- * pcibios_resource_survey()
+ * Don't register the region that could be conflicted with
+ * PCI device BAR resources and insert them later in
+ * pcibios_resource_survey():
*/
- if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+ if (do_mark_busy(entry->type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
}
- for (i = 0; i < e820_saved.nr_map; i++) {
- struct e820entry *entry = &e820_saved.map[i];
- firmware_map_add_early(entry->addr,
- entry->addr + entry->size - 1,
- e820_type_to_string(entry->type));
+	/* Expose the kexec E820 table via sysfs: */
+ for (i = 0; i < e820_table_kexec->nr_entries; i++) {
+ struct e820_entry *entry = e820_table_kexec->entries + i;
+
+ firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
}
}
-/* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+/*
+ * How much should we pad the end of RAM, depending on where it is?
+ */
+static unsigned long __init ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
@@ -1128,63 +1146,59 @@ static unsigned long ram_alignment(resource_size_t pos)
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
-void __init e820_reserve_resources_late(void)
+void __init e820__reserve_resources_late(void)
{
int i;
struct resource *res;
res = e820_res;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820_table->nr_entries; i++) {
if (!res->parent && res->end)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
/*
- * Try to bump up RAM regions to reasonable boundaries to
+ * Try to bump up RAM regions to reasonable boundaries, to
* avoid stolen RAM:
*/
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *entry = &e820.map[i];
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
u64 start, end;
- if (entry->type != E820_RAM)
+ if (entry->type != E820_TYPE_RAM)
continue;
+
start = entry->addr + entry->size;
end = round_up(start, ram_alignment(start)) - 1;
if (end > MAX_RESOURCE_SIZE)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
- printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
- start, end);
- reserve_region_with_split(&iomem_resource, start, end,
- "RAM buffer");
+
+ printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
+ reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
}
}
-char *__init default_machine_specific_memory_setup(void)
+/*
+ * Pass the firmware (bootloader) E820 map to the kernel and process it:
+ */
+char *__init e820__memory_setup_default(void)
{
char *who = "BIOS-e820";
- u32 new_nr;
+
/*
* Try to copy the BIOS-supplied E820-map.
*
* Otherwise fake a memory map; one section from 0k->640k,
* the next section from 1mb->appropriate_mem_k
*/
- new_nr = boot_params.e820_entries;
- sanitize_e820_map(boot_params.e820_map,
- ARRAY_SIZE(boot_params.e820_map),
- &new_nr);
- boot_params.e820_entries = new_nr;
- if (append_e820_map(boot_params.e820_map, boot_params.e820_entries)
- < 0) {
+ if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
u64 mem_size;
- /* compare results from other methods and take the greater */
- if (boot_params.alt_mem_k
- < boot_params.screen_info.ext_mem_k) {
+ /* Compare results from other methods and take the one that gives more RAM: */
+ if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
mem_size = boot_params.screen_info.ext_mem_k;
who = "BIOS-88";
} else {
@@ -1192,21 +1206,128 @@ char *__init default_machine_specific_memory_setup(void)
who = "BIOS-e801";
}
- e820.nr_map = 0;
- e820_add_region(0, LOWMEMSIZE(), E820_RAM);
- e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ e820_table->nr_entries = 0;
+ e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
+ e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
}
- /* In case someone cares... */
+ /* We just appended a lot of ranges, sanitize the table: */
+ e820__update_table(e820_table);
+
return who;
}
-void __init setup_memory_map(void)
+/*
+ * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
+ * E820 map - with an optional platform quirk available for virtual platforms
+ * to override this method of boot environment processing:
+ */
+void __init e820__memory_setup(void)
{
char *who;
+ /* This is a firmware interface ABI - make sure we don't break it: */
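+	/* (A boot_e820_entry packs an 8-byte addr, an 8-byte size and a 4-byte type.) */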
+ BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
+
who = x86_init.resources.memory_setup();
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
+
+ memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+ memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+ pr_info("BIOS-provided physical RAM map:\n");
+ e820__print_table(who);
+}
+
+void __init e820__memblock_setup(void)
+{
+ int i;
+ u64 end;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+	 * The kernel image is loaded into memory very early. We cannot
+	 * prevent that in any case. So on a NUMA system, we mark any
+	 * node the kernel resides in as un-hotpluggable.
+ *
+	 * Since one node on a modern server can hold tens of gigabytes
+	 * of memory, we can assume the memory around the kernel image
+	 * is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to do our best to keep
+	 * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ /*
+	 * At this point only the first megabyte is mapped for sure; the
+	 * rest of the memory cannot yet be used for memblock resizing.
+ */
+ memblock_set_current_limit(ISA_END_ADDRESS);
+
+ /*
+ * The bootstrap memblock region count maximum is 128 entries
+ * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
+ * than that - so allow memblock resizing.
+ *
+ * This is safe, because this call happens pretty late during x86 setup,
+ * so we know about reserved memory regions already. (This is important
+	 * so that memblock resizing does not stomp over reserved areas.)
+ */
+ memblock_allow_resize();
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ end = entry->addr + entry->size;
+ if (end != (resource_size_t)end)
+ continue;
+
+ if (entry->type == E820_TYPE_SOFT_RESERVED)
+ memblock_reserve(entry->addr, entry->size);
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ memblock_add(entry->addr, entry->size);
+ }
+
+ /*
+ * At this point memblock is only allowed to allocate from memory
+ * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set
+ * up in init_mem_mapping().
+ *
+ * KHO kernels are special and use only scratch memory for memblock
+ * allocations, but memory below 1M is ignored by kernel after early
+ * boot and cannot be naturally marked as scratch.
+ *
+ * To allow allocation of the real-mode trampoline and a few (if any)
+ * other very early allocations from below 1M forcibly mark the memory
+ * below 1M as scratch.
+ *
+ * After real mode trampoline is allocated, we clear that scratch
+ * marking.
+ */
+ memblock_mark_kho_scratch(0, SZ_1M);
+
+ /*
+	 * 32-bit systems are limited to 4GB of memory even with HIGHMEM and
+ * to even less without it.
+ * Discard memory after max_pfn - the actual limit detected at runtime.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ memblock_remove(PFN_PHYS(max_pfn), -1);
+
+ /* Throw away partial pages: */
+ memblock_trim_memory(PAGE_SIZE);
+
+ memblock_dump_all();
}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index e5cc7e82e60d..6b6f32f40cbe 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/* Various workarounds for chipset bugs.
This code runs very early and can't use the regular PCI subsystem
The entries are keyed to PCI bridges which usually identify chipsets
@@ -11,14 +12,22 @@
#include <linux/pci.h>
#include <linux/acpi.h>
+#include <linux/delay.h>
#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
+#include <linux/platform_data/x86/apple.h>
+#include <drm/intel/i915_drm.h>
+#include <drm/intel/pciids.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
+#include <asm/hpet.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/hpet.h>
+#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -74,6 +83,13 @@ static void __init nvidia_bugs(int num, int slot, int func)
#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
/*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
* All timer overrides on Nvidia are
* wrong unless HPET is enabled.
* Unfortunately that's not true on many Asus boards.
@@ -98,7 +114,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -116,7 +131,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
-#endif
static void __init ati_bugs(int num, int slot, int func)
{
@@ -146,15 +160,10 @@ static void __init ati_bugs(int num, int slot, int func)
static u32 __init ati_sbx00_rev(int num, int slot, int func)
{
- u32 old, d;
+ u32 d;
- d = read_pci_config(num, slot, func, 0x70);
- old = d;
- d &= ~(1<<8);
- write_pci_config(num, slot, func, 0x70, d);
d = read_pci_config(num, slot, func, 0x8);
d &= 0xff;
- write_pci_config(num, slot, func, 0x70, old);
return d;
}
@@ -163,11 +172,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
{
u32 d, rev;
- if (acpi_use_timer_override)
+ rev = ati_sbx00_rev(num, slot, func);
+ if (rev >= 0x40)
+ acpi_fix_pin2_polarity = 1;
+
+ /*
+ * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
+ * SB700: revisions 0x39, 0x3a, ...
+ * SB800: revisions 0x40, 0x41, ...
+ */
+ if (rev >= 0x39)
return;
- rev = ati_sbx00_rev(num, slot, func);
- if (rev > 0x13)
+ if (acpi_use_timer_override)
return;
/* check for IRQ0 interrupt swap */
@@ -192,21 +209,481 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
+static void __init intel_remapping_check(int num, int slot, int func)
+{
+ u8 revision;
+ u16 device;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+ revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
+
+ /*
+	 * Revisions <= 0x13 of all device IDs triggering this quirk
+	 * have a problem draining interrupts when IRQ remapping is
+	 * enabled, and should be flagged as broken. Additionally,
+	 * revision 0x22 of device ID 0x3405 has this problem.
+ */
+ if (revision <= 0x13)
+ set_irq_remapping_broken();
+ else if (device == 0x3405 && revision == 0x22)
+ set_irq_remapping_broken();
+}
+
/*
- * Force the read back of the CMP register in hpet_next_event()
- * to work around the problem that the CMP register write seems to be
- * delayed. See hpet_next_event() for details.
- *
- * We do this on all SMBUS incarnations for now until we have more
- * information about the affected chipsets.
+ * Systems with Intel graphics controllers set aside memory exclusively
+ * for gfx driver use. This memory is not marked in the E820 as reserved
+ * or as RAM, and so is subject to overlap from E820 manipulation later
+ * in the boot process. On some systems, MMIO space is allocated on top,
+ * despite the efforts of the "RAM buffer" approach, which simply rounds
+ * memory boundaries up to 64M to try to catch space that may decode
+ * as RAM and so is not suitable for MMIO.
*/
-static void __init ati_hpet_bugs(int num, int slot, int func)
+
+#define KB(x) ((x) * 1024UL)
+#define MB(x) (KB (KB (x)))
+
+static resource_size_t __init i830_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ if (esmramc & I830_TSEG_SIZE_1M)
+ return MB(1);
+ else
+ return KB(512);
+}
+
+static resource_size_t __init i845_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+ u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK;
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ switch (tseg_size) {
+ case I845_TSEG_SIZE_512K: return KB(512);
+ case I845_TSEG_SIZE_1M: return MB(1);
+ default:
+ WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc);
+ }
+ return 0;
+}
+
+static resource_size_t __init i85x_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ return MB(1);
+}
+
+static resource_size_t __init i830_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
+}
+
+static resource_size_t __init i85x_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
+}
+
+/*
+ * On 830/845/85x the stolen memory base isn't available in any
+ * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
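+ * (Illustrative numbers: TOM == 256MB, TSEG == 1MB and an 8MB stolen
+ * size give a stolen base of 247MB.)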
+ */
+static resource_size_t __init i830_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i830_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i845_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i845_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i85x_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i865_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u16 toud = 0;
+
+ toud = read_pci_config_16(0, 0, 0, I865_TOUD);
+
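+	/* TOUD is in 64KB units: e.g. (illustrative) toud == 0x4000 puts it at 1GB. */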
+ return toud * KB(64) + i845_tseg_size();
+}
+
+static resource_size_t __init gen3_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u32 bsm;
+
+ /* Almost universally we can find the Graphics Base of Stolen Memory
+ * at register BSM (0x5c) in the igfx configuration space. On a few
+ * (desktop) machines this is also mirrored in the bridge device at
+ * different locations, or in the MCHBAR.
+ */
+ bsm = read_pci_config(num, slot, func, INTEL_BSM);
+
+ return bsm & INTEL_BSM_MASK;
+}
+
+static resource_size_t __init gen11_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u64 bsm;
+
+ bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0);
+ bsm &= INTEL_BSM_MASK;
+ bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32;
+
+ return bsm;
+}
+
+static resource_size_t __init i830_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+ gms = gmch_ctrl & I830_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I830_GMCH_GMS_STOLEN_512: return KB(512);
+ case I830_GMCH_GMS_STOLEN_1024: return MB(1);
+ case I830_GMCH_GMS_STOLEN_8192: return MB(8);
+ /* local memory isn't part of the normal address space */
+ case I830_GMCH_GMS_LOCAL: return 0;
+ default:
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+ }
+
+ return 0;
+}
+
+static resource_size_t __init gen3_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+ gms = gmch_ctrl & I855_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I855_GMCH_GMS_STOLEN_1M: return MB(1);
+ case I855_GMCH_GMS_STOLEN_4M: return MB(4);
+ case I855_GMCH_GMS_STOLEN_8M: return MB(8);
+ case I855_GMCH_GMS_STOLEN_16M: return MB(16);
+ case I855_GMCH_GMS_STOLEN_32M: return MB(32);
+ case I915_GMCH_GMS_STOLEN_48M: return MB(48);
+ case I915_GMCH_GMS_STOLEN_64M: return MB(64);
+ case G33_GMCH_GMS_STOLEN_128M: return MB(128);
+ case G33_GMCH_GMS_STOLEN_256M: return MB(256);
+ case INTEL_GMCH_GMS_STOLEN_96M: return MB(96);
+ case INTEL_GMCH_GMS_STOLEN_160M:return MB(160);
+ case INTEL_GMCH_GMS_STOLEN_224M:return MB(224);
+ case INTEL_GMCH_GMS_STOLEN_352M:return MB(352);
+ default:
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+ }
+
+ return 0;
+}
+
+static resource_size_t __init gen6_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+ return gms * MB(32);
+}
+
+static resource_size_t __init gen8_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ return gms * MB(32);
+}
+
+static resource_size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+	 * 0x17 to 0x1d: 4MB increments starting at 36MB
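+	 *
+	 * Worked example (illustrative): gms == 0x13 falls in the second
+	 * range, so the size is (0x13 - 0x11) * 4MB + 8MB == 16MB.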
+ */
+ if (gms < 0x11)
+ return gms * MB(32);
+ else if (gms < 0x17)
+ return (gms - 0x11) * MB(4) + MB(8);
+ else
+ return (gms - 0x17) * MB(4) + MB(36);
+}
+
+static resource_size_t __init gen9_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ /* 0x0 to 0xef: 32MB increments starting at 0MB */
+ /* 0xf0 to 0xfe: 4MB increments starting at 4MB */
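+	/* (Illustrative: gms == 0xf2 decodes to (0xf2 - 0xf0) * 4MB + 4MB == 12MB.) */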
+ if (gms < 0xf0)
+ return gms * MB(32);
+ else
+ return (gms - 0xf0) * MB(4) + MB(4);
+}
+
+struct intel_early_ops {
+ resource_size_t (*stolen_size)(int num, int slot, int func);
+ resource_size_t (*stolen_base)(int num, int slot, int func,
+ resource_size_t size);
+};
+
+static const struct intel_early_ops i830_early_ops __initconst = {
+ .stolen_base = i830_stolen_base,
+ .stolen_size = i830_stolen_size,
+};
+
+static const struct intel_early_ops i845_early_ops __initconst = {
+ .stolen_base = i845_stolen_base,
+ .stolen_size = i830_stolen_size,
+};
+
+static const struct intel_early_ops i85x_early_ops __initconst = {
+ .stolen_base = i85x_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops i865_early_ops __initconst = {
+ .stolen_base = i865_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops gen3_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops gen6_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen6_stolen_size,
+};
+
+static const struct intel_early_ops gen8_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen8_stolen_size,
+};
+
+static const struct intel_early_ops gen9_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen9_stolen_size,
+};
+
+static const struct intel_early_ops chv_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = chv_stolen_size,
+};
+
+static const struct intel_early_ops gen11_early_ops __initconst = {
+ .stolen_base = gen11_stolen_base,
+ .stolen_size = gen9_stolen_size,
+};
+
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
+static const struct pci_device_id intel_early_ids[] __initconst = {
+ INTEL_I830_IDS(INTEL_VGA_DEVICE, &i830_early_ops),
+ INTEL_I845G_IDS(INTEL_VGA_DEVICE, &i845_early_ops),
+ INTEL_I85X_IDS(INTEL_VGA_DEVICE, &i85x_early_ops),
+ INTEL_I865G_IDS(INTEL_VGA_DEVICE, &i865_early_ops),
+ INTEL_I915G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I915GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_VLV_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_PNV_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G33_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_GM45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_ILK_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_SNB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_IVB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_HSW_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_BDW_IDS(INTEL_VGA_DEVICE, &gen8_early_ops),
+ INTEL_CHV_IDS(INTEL_VGA_DEVICE, &chv_early_ops),
+ INTEL_SKL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_BXT_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_KBL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CFL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_WHL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CML_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_GLK_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CNL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_ICL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_EHL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_JSL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_TGL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RKL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLN_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLU_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+};
+
+struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
+EXPORT_SYMBOL(intel_graphics_stolen_res);
+
+static void __init
+intel_graphics_stolen(int num, int slot, int func,
+ const struct intel_early_ops *early_ops)
+{
+ resource_size_t base, size;
+ resource_size_t end;
+
+ size = early_ops->stolen_size(num, slot, func);
+ base = early_ops->stolen_base(num, slot, func, size);
+
+ if (!size || !base)
+ return;
+
+ end = base + size - 1;
+
+ intel_graphics_stolen_res.start = base;
+ intel_graphics_stolen_res.end = end;
+
+ printk(KERN_INFO "Reserving Intel graphics memory at %pR\n",
+ &intel_graphics_stolen_res);
+
+ /* Mark this space as reserved */
+ e820__range_add(base, size, E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+}
+
+static void __init intel_graphics_quirks(int num, int slot, int func)
+{
+ const struct intel_early_ops *early_ops;
+ u16 device;
+ int i;
+
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+
+ for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
+ kernel_ulong_t driver_data = intel_early_ids[i].driver_data;
+
+ if (intel_early_ids[i].device != device)
+ continue;
+
+ early_ops = (typeof(early_ops))driver_data;
+
+ intel_graphics_stolen(num, slot, func, early_ops);
+
+ return;
+ }
+}
+
+static void __init force_disable_hpet(int num, int slot, int func)
{
#ifdef CONFIG_HPET_TIMER
- hpet_readback_cmp = 1;
+ boot_hpet_disable = true;
+ pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
#endif
}
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!x86_apple_machine)
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
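+	/* Wait up to 300us (30 * 10us) for a pending core reset to finish: */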
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -219,12 +696,6 @@ struct chipset {
void (*f)(int num, int slot, int func);
};
-/*
- * Only works for devices on the root bus. If you add any devices
- * not on bus 0 readd another loop level in early_quirks(). But
- * be careful because at least the Nvidia quirk here relies on
- * only matching on bus 0.
- */
static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
@@ -236,11 +707,31 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
- { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
- PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
+ { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
+ 0, intel_graphics_quirks },
+ /*
+ * HPET on the current version of the Baytrail platform has accuracy
+	 * problems: it halts in deep idle states - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet:
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
+ */
+ { PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
{}
};
+static void __init early_pci_scan_bus(int bus);
+
/**
* check_dev_quirk - apply early quirks to a given PCI device
* @num: bus number
@@ -249,7 +740,7 @@ static struct chipset early_qrk[] __initdata = {
*
* Check the vendor & device ID against the early quirks table.
*
- * If the device is single function, let early_quirks() know so we don't
+ * If the device is single function, let early_pci_scan_bus() know so we don't
* poke at this device again.
*/
static int __init check_dev_quirk(int num, int slot, int func)
@@ -258,6 +749,7 @@ static int __init check_dev_quirk(int num, int slot, int func)
u16 vendor;
u16 device;
u8 type;
+ u8 sec;
int i;
class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
@@ -285,25 +777,36 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+
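+	/* Recurse into buses behind bridges; 'sec > num' avoids rescanning loops: */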
+ if ((type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
+ if (!(type & PCI_HEADER_TYPE_MFD))
return -1;
return 0;
}
-void __init early_quirks(void)
+static void __init early_pci_scan_bus(int bus)
{
int slot, func;
- if (!early_pci_allowed())
- return;
-
/* Poor man's PCI discovery */
- /* Only scan the root bus */
for (slot = 0; slot < 32; slot++)
for (func = 0; func < 8; func++) {
/* Only probe function 0 on single fn devices */
- if (check_dev_quirk(0, slot, func))
+ if (check_dev_quirk(bus, slot, func))
break;
}
}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..cba75306e5b6 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/console.h>
#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/screen_info.h>
@@ -7,6 +9,7 @@
#include <linux/pci_regs.h>
#include <linux/pci_ids.h>
#include <linux/errno.h>
+#include <linux/pgtable.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
@@ -14,8 +17,10 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
-#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <asm/pci_x86.h>
+#include <linux/static_call.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -73,7 +78,7 @@ static struct console early_vga_console = {
/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
-static int early_serial_base = 0x3f8; /* ttyS0 */
+static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define XMTRDY 0x20
@@ -91,13 +96,28 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
+static __noendbr unsigned int io_serial_in(unsigned long addr, int offset)
+{
+ return inb(addr + offset);
+}
+ANNOTATE_NOENDBR_SYM(io_serial_in);
+
+static __noendbr void io_serial_out(unsigned long addr, int offset, int value)
+{
+ outb(value, addr + offset);
+}
+ANNOTATE_NOENDBR_SYM(io_serial_out);
+
+DEFINE_STATIC_CALL(serial_in, io_serial_in);
+DEFINE_STATIC_CALL(serial_out, io_serial_out);
+
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
- while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
- outb(ch, early_serial_base + TXR);
+ static_call(serial_out)(early_serial_base, TXR, ch);
return timeout ? 0 : -1;
}
@@ -111,13 +131,33 @@ static void early_serial_write(struct console *con, const char *s, unsigned n)
}
}
+static __init void early_serial_hw_init(unsigned divisor)
+{
+ unsigned char c;
+
+ static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */
+ static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */
+ static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */
+ static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */
+
+ c = static_call(serial_in)(early_serial_base, LCR);
+ static_call(serial_out)(early_serial_base, LCR, c | DLAB);
+ static_call(serial_out)(early_serial_base, DLL, divisor & 0xff);
+ static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ static_call(serial_out)(early_serial_base, LCR, c & ~DLAB);
+
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ if (static_call_query(serial_in) == io_serial_in)
+ kexec_debug_8250_port = early_serial_base;
+#endif
+}
+
#define DEFAULT_BAUD 9600
static __init void early_serial_init(char *s)
{
- unsigned char c;
unsigned divisor;
- unsigned baud = DEFAULT_BAUD;
+ unsigned long baud = DEFAULT_BAUD;
char *e;
if (*s == ',')
@@ -142,24 +182,186 @@ static __init void early_serial_init(char *s)
s++;
}
- outb(0x3, early_serial_base + LCR); /* 8n1 */
- outb(0, early_serial_base + IER); /* no interrupt */
- outb(0, early_serial_base + FCR); /* no fifo */
- outb(0x3, early_serial_base + MCR); /* DTR + RTS */
-
if (*s) {
- baud = simple_strtoul(s, &e, 0);
+ baud = simple_strtoull(s, &e, 0);
+
if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
+ /* Convert from baud to divisor value */
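+	/* (e.g. baud == 9600 gives divisor == 12, baud == 115200 gives divisor == 1) */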
divisor = 115200 / baud;
- c = inb(early_serial_base + LCR);
- outb(c | DLAB, early_serial_base + LCR);
- outb(divisor & 0xff, early_serial_base + DLL);
- outb((divisor >> 8) & 0xff, early_serial_base + DLH);
- outb(c & ~DLAB, early_serial_base + LCR);
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+
+static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
}
+ANNOTATE_NOENDBR_SYM(mem32_serial_out);
+
+static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
+}
+ANNOTATE_NOENDBR_SYM(mem32_serial_in);
+
+/*
+ * early_mmio_serial_init() - Initialize MMIO-based early serial console.
+ * @s: MMIO-based serial specification.
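+ *
+ * An illustrative invocation (MMIO address made up):
+ *
+ *	earlyprintk=mmio32,0xfedc9000,115200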
+ */
+static __init void early_mmio_serial_init(char *s)
+{
+ unsigned long baudrate;
+ unsigned long membase;
+ char *e;
+
+ if (*s == ',')
+ s++;
+
+ if (!strncmp(s, "0x", 2)) {
+ /* NB: only 32-bit addresses are supported. */
+ membase = simple_strtoul(s, &e, 16);
+ early_serial_base = (unsigned long)early_ioremap(membase, PAGE_SIZE);
+
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
+
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
+ }
+
+ if (!strncmp(s, "nocfg", 5)) {
+ baudrate = 0;
+ } else {
+ baudrate = simple_strtoul(s, &e, 0);
+ if (baudrate == 0 || s == e)
+ baudrate = DEFAULT_BAUD;
+ }
+
+ if (baudrate)
+ early_serial_hw_init(115200 / baudrate);
+}
+
+#ifdef CONFIG_PCI
+/*
+ * early_pci_serial_init()
+ *
+ * This function is invoked when the early_printk param starts with "pciserial"
+ * The rest of the param should be "[force],B:D.F,baud", where B, D & F describe
+ * the location of a PCI device that must be a UART device. "force" is optional
+ * and forces the use of a UART device even if it has a wrong PCI class code.
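+ *
+ * An illustrative invocation (device address made up):
+ *
+ *	earlyprintk=pciserial,force,00:18.1,115200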
+ */
+static __init void early_pci_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ u8 bus, slot, func;
+ u32 classcode, bar0;
+ u16 cmdreg;
+ char *e;
+ int force = 0;
+
+ if (*s == ',')
+ ++s;
+
+ if (*s == 0)
+ return;
+
+	/* Force the use of a UART device with a wrong class code */
+ if (!strncmp(s, "force,", 6)) {
+ force = 1;
+ s += 6;
+ }
+
+ /*
+	 * Parse the param to get the BDF values
+ */
+ bus = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != ':')
+ return;
+ ++s;
+ slot = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != '.')
+ return;
+ ++s;
+ func = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+
+ /* A baud might be following */
+ if (*s == ',')
+ s++;
+
+ /*
+ * Find the device from the BDF
+ */
+ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+ classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+
+ /*
+ * Verify it is a 16550-UART type device
+ */
+ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+ (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+ (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) {
+ if (!force)
+ return;
+ }
+
+ /*
+ * Determine if it is IO or memory mapped
+ */
+ if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
+ /* it is IO mapped */
+ early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK;
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_IO);
+ } else {
+ /* It is memory mapped - assume 32-bit alignment */
+ static_call_update(serial_in, mem32_serial_in);
+ static_call_update(serial_out, mem32_serial_out);
+ /* WARNING! assuming the address is always in the first 4G */
+ early_serial_base =
+ (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
+#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64)
+ kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
+#endif
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_MEMORY);
+ }
+
+ /*
+ * Initialize the hardware
+ */
+ if (*s) {
+		/*
+		 * Sometimes, we want to leave the UART alone and assume
+		 * the BIOS has set it up correctly. "nocfg" tells us this
+		 * is the case, and we should do no more setup.
+		 */
+		if (strcmp(s, "nocfg") == 0)
+			return;
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+#endif
static struct console early_serial_console = {
.name = "earlyser",
@@ -168,25 +370,9 @@ static struct console early_serial_console = {
.index = -1,
};
-/* Direct interface for emergencies */
-static struct console *early_console = &early_vga_console;
-static int __initdata early_console_initialized;
-
-asmlinkage void early_printk(const char *fmt, ...)
+static void early_console_register(struct console *con, int keep_early)
{
- char buf[512];
- int n;
- va_list ap;
-
- va_start(ap, fmt);
- n = vscnprintf(buf, sizeof(buf), fmt, ap);
- early_console->write(early_console, buf, n);
- va_end(ap);
-}
-
-static inline void early_console_register(struct console *con, int keep_early)
-{
- if (early_console->index != -1) {
+ if (con->index != -1) {
printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
con->name);
return;
@@ -206,13 +392,17 @@ static int __init setup_early_printk(char *buf)
if (!buf)
return 0;
- if (early_console_initialized)
+ if (early_console)
return 0;
- early_console_initialized = 1;
keep = (strstr(buf, "keep") != NULL);
while (*buf != '\0') {
+ if (!strncmp(buf, "mmio32", 6)) {
+ buf += 6;
+ early_mmio_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ }
if (!strncmp(buf, "serial", 6)) {
buf += 6;
early_serial_init(buf);
@@ -224,6 +414,13 @@ static int __init setup_early_printk(char *buf)
early_serial_init(buf + 4);
early_console_register(&early_serial_console, keep);
}
+#ifdef CONFIG_PCI
+ if (!strncmp(buf, "pciserial", 9)) {
+		buf += 9; /* Keep the "serial" check below from matching "pciserial" */
+ early_pci_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ }
+#endif
if (!strncmp(buf, "vga", 3) &&
boot_params.screen_info.orig_video_isVGA == 1) {
max_xpos = boot_params.screen_info.orig_video_cols;
@@ -239,6 +436,11 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+ if (!strncmp(buf, "xdbc", 4))
+ early_xdbc_parse_parameter(buf + 4, keep);
+#endif
+
buf++;
}
return 0;
diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
new file mode 100644
index 000000000000..38e7d597b660
--- /dev/null
+++ b/arch/x86/kernel/ebda.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+/*
+ * This function reserves all conventional PC system BIOS related
+ * firmware memory areas (some of which are data, some of which
+ * are code), that must not be used by the kernel as available
+ * RAM.
+ *
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too.
+ *
+ * This means that as a first approximation on most systems we can
+ * guess the reserved BIOS area by looking at the low BIOS RAM size
+ * value and assume that everything above that value (up to 1MB) is
+ * reserved.
+ *
+ * But life in firmware country is not that simple:
+ *
+ * - This code also contains a quirk for Dell systems that neglect
+ * to reserve the EBDA area in the 'RAM size' value ...
+ *
+ * - The same quirk also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). (Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.)
+ *
+ * - Plus paravirt systems don't have a reliable value in the
+ * 'BIOS RAM size' pointer we can rely on, so we must quirk
+ * them too.
+ *
+ * Due to those various problems this function is deliberately
+ * very conservative and tries to err on the side of reserving
+ * too much, to not risk reserving too little.
+ *
+ * Losing a small amount of memory in the bottom megabyte is
+ * rarely a problem, as long as we have enough memory to install
+ * the SMP bootup trampoline which *must* be in this area.
+ *
+ * Using memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem to the kernel,
+ * obviously.
+ */
+
+#define BIOS_RAM_SIZE_KB_PTR 0x413
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+void __init reserve_bios_regions(void)
+{
+ unsigned int bios_start, ebda_start;
+
+ /*
+ * NOTE: In a paravirtual environment the BIOS reserved
+ * area is absent. We'll just have to assume that the
+ * paravirt case can handle memory setup correctly,
+ * without our help.
+ */
+ if (!x86_platform.legacy.reserve_bios_regions)
+ return;
+
+ /*
+ * BIOS RAM size is encoded in kilobytes, convert it
+ * to bytes to get a first guess at where the BIOS
+ * firmware area starts:
+ */
+ bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR);
+ bios_start <<= 10;
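+	/* (Illustrative: a reported 639 KB yields bios_start == 0x9fc00.) */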
+
+ /*
+ * If bios_start is less than 128K, assume it is bogus
+ * and bump it up to 640K. Similarly, if bios_start is above 640K,
+ * don't trust it.
+ */
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ /* Get the start address of the EBDA page: */
+ ebda_start = get_bios_ebda();
+
+ /*
+ * If the EBDA start address is sane and is below the BIOS region,
+ * then also reserve everything from the EBDA start address up to
+ * the BIOS region.
+ */
+ if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ /* Reserve all memory between bios_start and the 1MB mark: */
+ memblock_reserve(bios_start, 0x100000 - bios_start);
+}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
deleted file mode 100644
index c2fa9b8b497e..000000000000
--- a/arch/x86/kernel/efi.c
+++ /dev/null
@@ -1,612 +0,0 @@
-/*
- * Common EFI (Extensible Firmware Interface) support functions
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2002 Hewlett-Packard Co.
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2005-2008 Intel Co.
- * Fenghua Yu <fenghua.yu@intel.com>
- * Bibo Mao <bibo.mao@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- * Huang Ying <ying.huang@intel.com>
- *
- * Copied from efi_32.c to eliminate the duplicated code between EFI
- * 32/64 support code. --ying 2007-10-26
- *
- * All EFI Runtime Services are not implemented yet as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version. --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls. --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- * Skip non-WB memory and ignore empty memory ranges.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/efi.h>
-#include <linux/bootmem.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-#include <linux/time.h>
-#include <linux/io.h>
-#include <linux/reboot.h>
-#include <linux/bcd.h>
-
-#include <asm/setup.h>
-#include <asm/efi.h>
-#include <asm/time.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/x86_init.h>
-
-#define EFI_DEBUG 1
-#define PFX "EFI: "
-
-int efi_enabled;
-EXPORT_SYMBOL(efi_enabled);
-
-struct efi efi;
-EXPORT_SYMBOL(efi);
-
-struct efi_memory_map memmap;
-
-static struct efi efi_phys __initdata;
-static efi_system_table_t efi_systab __initdata;
-
-static int __init setup_noefi(char *arg)
-{
- efi_enabled = 0;
- return 0;
-}
-early_param("noefi", setup_noefi);
-
-int add_efi_memmap;
-EXPORT_SYMBOL(add_efi_memmap);
-
-static int __init setup_add_efi_memmap(char *arg)
-{
- add_efi_memmap = 1;
- return 0;
-}
-early_param("add_efi_memmap", setup_add_efi_memmap);
-
-
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
- return efi_call_virt2(get_time, tm, tc);
-}
-
-static efi_status_t virt_efi_set_time(efi_time_t *tm)
-{
- return efi_call_virt1(set_time, tm);
-}
-
-static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
- efi_bool_t *pending,
- efi_time_t *tm)
-{
- return efi_call_virt3(get_wakeup_time,
- enabled, pending, tm);
-}
-
-static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
-{
- return efi_call_virt2(set_wakeup_time,
- enabled, tm);
-}
-
-static efi_status_t virt_efi_get_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 *attr,
- unsigned long *data_size,
- void *data)
-{
- return efi_call_virt5(get_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
- efi_char16_t *name,
- efi_guid_t *vendor)
-{
- return efi_call_virt3(get_next_variable,
- name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- unsigned long attr,
- unsigned long data_size,
- void *data)
-{
- return efi_call_virt5(set_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
-{
- return efi_call_virt1(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system(int reset_type,
- efi_status_t status,
- unsigned long data_size,
- efi_char16_t *data)
-{
- efi_call_virt4(reset_system, reset_type, status,
- data_size, data);
-}
-
-static efi_status_t virt_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
-{
- return efi_call_virt4(set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
-}
-
-static efi_status_t __init phys_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
-{
- efi_status_t status;
-
- efi_call_phys_prelog();
- status = efi_call_phys4(efi_phys.set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
- efi_call_phys_epilog();
- return status;
-}
-
-static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
- efi_time_cap_t *tc)
-{
- efi_status_t status;
-
- efi_call_phys_prelog();
- status = efi_call_phys2(efi_phys.get_time, tm, tc);
- efi_call_phys_epilog();
- return status;
-}
-
-int efi_set_rtc_mmss(unsigned long nowtime)
-{
- int real_seconds, real_minutes;
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS) {
- printk(KERN_ERR "Oops: efitime: can't read time!\n");
- return -1;
- }
-
- real_seconds = nowtime % 60;
- real_minutes = nowtime / 60;
- if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
- real_minutes += 30;
- real_minutes %= 60;
- eft.minute = real_minutes;
- eft.second = real_seconds;
-
- status = efi.set_time(&eft);
- if (status != EFI_SUCCESS) {
- printk(KERN_ERR "Oops: efitime: can't write time!\n");
- return -1;
- }
- return 0;
-}
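
The rounding expression above is the classic set_rtc_mmss() idiom: (x + 15) / 30 is odd exactly when x mod 60 falls in [15, 45), so the test fires when the RTC minute is roughly half an hour away from the system minute, i.e. when the RTC appears to keep a half-hour-offset local time. A minimal stand-alone sketch of just that predicate (function and variable names are invented for illustration):

	#include <stdlib.h>
	#include <assert.h>

	/* Returns 1 when the RTC minute is ~30 min (mod 60) away from the
	 * system's minute count, i.e. a half-hour-offset clock. */
	static int wants_half_hour_shift(long sys_minutes, int rtc_minute)
	{
		return ((labs(sys_minutes - rtc_minute) + 15) / 30) & 1;
	}

	int main(void)
	{
		assert(wants_half_hour_shift(90, 0) == 1);	/* 30 min apart */
		assert(wants_half_hour_shift(62, 3) == 0);	/* ~1 min apart */
		return 0;
	}
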
-
-unsigned long efi_get_time(void)
-{
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS)
- printk(KERN_ERR "Oops: efitime: can't read time!\n");
-
- return mktime(eft.year, eft.month, eft.day, eft.hour,
- eft.minute, eft.second);
-}
-
-/*
- * Tell the kernel about the EFI memory map. This might include
- * more than the max 128 entries that can fit in the e820 legacy
- * (zeropage) memory map.
- */
-
-static void __init do_add_efi_memmap(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- unsigned long long start = md->phys_addr;
- unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
- int e820_type;
-
- switch (md->type) {
- case EFI_LOADER_CODE:
- case EFI_LOADER_DATA:
- case EFI_BOOT_SERVICES_CODE:
- case EFI_BOOT_SERVICES_DATA:
- case EFI_CONVENTIONAL_MEMORY:
- if (md->attribute & EFI_MEMORY_WB)
- e820_type = E820_RAM;
- else
- e820_type = E820_RESERVED;
- break;
- case EFI_ACPI_RECLAIM_MEMORY:
- e820_type = E820_ACPI;
- break;
- case EFI_ACPI_MEMORY_NVS:
- e820_type = E820_NVS;
- break;
- case EFI_UNUSABLE_MEMORY:
- e820_type = E820_UNUSABLE;
- break;
- default:
- /*
- * EFI_RESERVED_TYPE, EFI_RUNTIME_SERVICES_CODE,
- * EFI_RUNTIME_SERVICES_DATA, EFI_MEMORY_MAPPED_IO,
- * EFI_MEMORY_MAPPED_IO_PORT_SPACE, EFI_PAL_CODE
- */
- e820_type = E820_RESERVED;
- break;
- }
- e820_add_region(start, size, e820_type);
- }
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-}
-
-void __init efi_reserve_early(void)
-{
- unsigned long pmap;
-
-#ifdef CONFIG_X86_32
- pmap = boot_params.efi_info.efi_memmap;
-#else
- pmap = (boot_params.efi_info.efi_memmap |
- ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
-#endif
- memmap.phys_map = (void *)pmap;
- memmap.nr_map = boot_params.efi_info.efi_memmap_size /
- boot_params.efi_info.efi_memdesc_size;
- memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
- memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
- reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
- "EFI memmap");
-}
-
-#if EFI_DEBUG
-static void __init print_efi_memmap(void)
-{
- efi_memory_desc_t *md;
- void *p;
- int i;
-
- for (p = memmap.map, i = 0;
- p < memmap.map_end;
- p += memmap.desc_size, i++) {
- md = p;
- printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
- "range=[0x%016llx-0x%016llx) (%lluMB)\n",
- i, md->type, md->attribute, md->phys_addr,
- md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
- (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
- }
-}
-#endif /* EFI_DEBUG */
-
-void __init efi_init(void)
-{
- efi_config_table_t *config_tables;
- efi_runtime_services_t *runtime;
- efi_char16_t *c16;
- char vendor[100] = "unknown";
- int i = 0;
- void *tmp;
-
-#ifdef CONFIG_X86_32
- efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-#else
- efi_phys.systab = (efi_system_table_t *)
- (boot_params.efi_info.efi_systab |
- ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-#endif
-
- efi.systab = early_ioremap((unsigned long)efi_phys.systab,
- sizeof(efi_system_table_t));
- if (efi.systab == NULL)
- printk(KERN_ERR "Couldn't map the EFI system table!\n");
- memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
- early_iounmap(efi.systab, sizeof(efi_system_table_t));
- efi.systab = &efi_systab;
-
- /*
- * Verify the EFI Table
- */
- if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
- printk(KERN_ERR "EFI system table signature incorrect!\n");
- if ((efi.systab->hdr.revision >> 16) == 0)
- printk(KERN_ERR "Warning: EFI system table version "
- "%d.%02d, expected 1.00 or greater!\n",
- efi.systab->hdr.revision >> 16,
- efi.systab->hdr.revision & 0xffff);
-
- /*
- * Show what we know for posterity
- */
- c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
- if (c16) {
- for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
- vendor[i] = *c16++;
- vendor[i] = '\0';
- } else
- printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
- early_iounmap(tmp, 2);
-
- printk(KERN_INFO "EFI v%u.%.02u by %s\n",
- efi.systab->hdr.revision >> 16,
- efi.systab->hdr.revision & 0xffff, vendor);
-
- /*
- * Let's see what config tables the firmware passed to us.
- */
- config_tables = early_ioremap(
- efi.systab->tables,
- efi.systab->nr_tables * sizeof(efi_config_table_t));
- if (config_tables == NULL)
- printk(KERN_ERR "Could not map EFI Configuration Table!\n");
-
- printk(KERN_INFO);
- for (i = 0; i < efi.systab->nr_tables; i++) {
- if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
- efi.mps = config_tables[i].table;
- printk(" MPS=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- ACPI_20_TABLE_GUID)) {
- efi.acpi20 = config_tables[i].table;
- printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- ACPI_TABLE_GUID)) {
- efi.acpi = config_tables[i].table;
- printk(" ACPI=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- SMBIOS_TABLE_GUID)) {
- efi.smbios = config_tables[i].table;
- printk(" SMBIOS=0x%lx ", config_tables[i].table);
-#ifdef CONFIG_X86_UV
- } else if (!efi_guidcmp(config_tables[i].guid,
- UV_SYSTEM_TABLE_GUID)) {
- efi.uv_systab = config_tables[i].table;
- printk(" UVsystab=0x%lx ", config_tables[i].table);
-#endif
- } else if (!efi_guidcmp(config_tables[i].guid,
- HCDP_TABLE_GUID)) {
- efi.hcdp = config_tables[i].table;
- printk(" HCDP=0x%lx ", config_tables[i].table);
- } else if (!efi_guidcmp(config_tables[i].guid,
- UGA_IO_PROTOCOL_GUID)) {
- efi.uga = config_tables[i].table;
- printk(" UGA=0x%lx ", config_tables[i].table);
- }
- }
- printk("\n");
- early_iounmap(config_tables,
- efi.systab->nr_tables * sizeof(efi_config_table_t));
-
- /*
- * Check out the runtime services table. We need to map it so that
- * we can grab the physical addresses of the EFI runtime functions
- * needed to put the firmware into virtual mode.
- */
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
- sizeof(efi_runtime_services_t));
- if (runtime != NULL) {
- /*
- * We will only need *early* access to the following
- * two EFI runtime services before set_virtual_address_map
- * is invoked.
- */
- efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
- efi_phys.set_virtual_address_map =
- (efi_set_virtual_address_map_t *)
- runtime->set_virtual_address_map;
- /*
- * Make sure efi_get_time can be called before entering
- * virtual mode.
- */
- efi.get_time = phys_efi_get_time;
- } else
- printk(KERN_ERR "Could not map the EFI runtime service "
- "table!\n");
- early_iounmap(runtime, sizeof(efi_runtime_services_t));
-
- /* Map the EFI memory map */
- memmap.map = early_ioremap((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size);
- if (memmap.map == NULL)
- printk(KERN_ERR "Could not map the EFI memory map!\n");
- memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
-
- if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING
- "Kernel-defined memdesc doesn't match the one from EFI!\n");
-
- if (add_efi_memmap)
- do_add_efi_memmap();
-
-#ifdef CONFIG_X86_32
- x86_platform.get_wallclock = efi_get_time;
- x86_platform.set_wallclock = efi_set_rtc_mmss;
-#endif
-
- /* Setup for EFI runtime service */
- reboot_type = BOOT_EFI;
-
-#if EFI_DEBUG
- print_efi_memmap();
-#endif
-}
-
-static void __init runtime_code_page_mkexec(void)
-{
- efi_memory_desc_t *md;
- void *p;
- u64 addr, npages;
-
- /* Make EFI runtime service code area executable */
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
-
- if (md->type != EFI_RUNTIME_SERVICES_CODE)
- continue;
-
- addr = md->virt_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&addr, &npages);
- set_memory_x(addr, npages);
- }
-}
-
-/*
- * This function will switch the EFI runtime services to virtual mode.
- * Essentially, look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor and update
- * that memory descriptor with the virtual address obtained from ioremap().
- * This enables the runtime services to be called without having to
- * thunk back into physical mode for every invocation.
- */
-void __init efi_enter_virtual_mode(void)
-{
- efi_memory_desc_t *md;
- efi_status_t status;
- unsigned long size;
- u64 end, systab, addr, npages, end_pfn;
- void *p, *va;
-
- efi.systab = NULL;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME))
- continue;
-
- size = md->num_pages << EFI_PAGE_SHIFT;
- end = md->phys_addr + size;
-
- end_pfn = PFN_UP(end);
- if (end_pfn <= max_low_pfn_mapped
- || (end_pfn > (1UL << (32 - PAGE_SHIFT))
- && end_pfn <= max_pfn_mapped))
- va = __va(md->phys_addr);
- else
- va = efi_ioremap(md->phys_addr, size, md->type);
-
- md->virt_addr = (u64) (unsigned long) va;
-
- if (!va) {
- printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
- (unsigned long long)md->phys_addr);
- continue;
- }
-
- if (!(md->attribute & EFI_MEMORY_WB)) {
- addr = md->virt_addr;
- npages = md->num_pages;
- memrange_efi_to_native(&addr, &npages);
- set_memory_uc(addr, npages);
- }
-
- systab = (u64) (unsigned long) efi_phys.systab;
- if (md->phys_addr <= systab && systab < end) {
- systab += md->virt_addr - md->phys_addr;
- efi.systab = (efi_system_table_t *) (unsigned long) systab;
- }
- }
-
- BUG_ON(!efi.systab);
-
- status = phys_efi_set_virtual_address_map(
- memmap.desc_size * memmap.nr_map,
- memmap.desc_size,
- memmap.desc_version,
- memmap.phys_map);
-
- if (status != EFI_SUCCESS) {
- printk(KERN_ALERT "Unable to switch EFI into virtual mode "
- "(status=%lx)!\n", status);
- panic("EFI call to SetVirtualAddressMap() failed!");
- }
-
- /*
- * Now that EFI is in virtual mode, update the function
- * pointers in the runtime service table to the new virtual addresses.
- *
- * Call EFI services through wrapper functions.
- */
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
- efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
- if (__supported_pte_mask & _PAGE_NX)
- runtime_code_page_mkexec();
- early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
- memmap.map = NULL;
-}
-
-/*
- * Convenience functions to obtain memory types and attributes
- */
-u32 efi_mem_type(unsigned long phys_addr)
-{
- efi_memory_desc_t *md;
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if ((md->phys_addr <= phys_addr) &&
- (phys_addr < (md->phys_addr +
- (md->num_pages << EFI_PAGE_SHIFT))))
- return md->type;
- }
- return 0;
-}
-
-u64 efi_mem_attributes(unsigned long phys_addr)
-{
- efi_memory_desc_t *md;
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if ((md->phys_addr <= phys_addr) &&
- (phys_addr < (md->phys_addr +
- (md->num_pages << EFI_PAGE_SHIFT))))
- return md->attribute;
- }
- return 0;
-}
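
Throughout efi.c the memory map is walked by byte stride (p += memmap.desc_size) rather than by array index, because the firmware's descriptor size may be larger than the kernel's efi_memory_desc_t. A self-contained user-space sketch of that idiom, with simplified stand-in types (struct my_desc and walk_map are hypothetical, not kernel API):

	#include <stdint.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Simplified stand-in for efi_memory_desc_t. */
	struct my_desc {
		uint32_t type;
		uint64_t phys_addr;
		uint64_t num_pages;
		uint64_t attribute;
	};

	#define MY_PAGE_SHIFT 12

	/* Walk a packed descriptor buffer by desc_size, as the kernel does;
	 * desc_size may exceed sizeof(struct my_desc). */
	static void walk_map(void *map, void *map_end, size_t desc_size)
	{
		void *p;

		for (p = map; p < map_end; p = (char *)p + desc_size) {
			struct my_desc *md = p;
			uint64_t size = md->num_pages << MY_PAGE_SHIFT;

			printf("type=%u range=[%#llx-%#llx)\n", md->type,
			       (unsigned long long)md->phys_addr,
			       (unsigned long long)(md->phys_addr + size));
		}
	}

	int main(void)
	{
		struct my_desc d = { 7, 0x100000, 256, 0x8 };

		walk_map(&d, (char *)&d + sizeof(d), sizeof(d));
		return 0;
	}
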
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
deleted file mode 100644
index 5cab48ee61a4..000000000000
--- a/arch/x86/kernel/efi_32.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Extensible Firmware Interface
- *
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2002 Hewlett-Packard Co.
- * David Mosberger-Tang <davidm@hpl.hp.com>
- * Stephane Eranian <eranian@hpl.hp.com>
- *
- * Not all EFI Runtime Services are implemented yet, as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version. --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls. --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- * Skip non-WB memory and ignore empty memory ranges.
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/efi.h>
-
-#include <asm/io.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/efi.h>
-
-/*
- * To call an EFI runtime service in physical addressing mode we need a
- * prelog/epilog around the invocation to disable interrupts, to claim
- * the EFI runtime service handler exclusively, and to duplicate the
- * kernel mapping into the low virtual address space, say 0 - 3G.
- */
-
-static unsigned long efi_rt_eflags;
-static pgd_t efi_bak_pg_dir_pointer[2];
-
-void efi_call_phys_prelog(void)
-{
- unsigned long cr4;
- unsigned long temp;
- struct desc_ptr gdt_descr;
-
- local_irq_save(efi_rt_eflags);
-
- /*
- * Without PAE we have to duplicate two entries in the page
- * directory; with PAE, duplicating one entry is enough.
- */
- cr4 = read_cr4_safe();
-
- if (cr4 & X86_CR4_PAE) {
- efi_bak_pg_dir_pointer[0].pgd =
- swapper_pg_dir[pgd_index(0)].pgd;
- swapper_pg_dir[0].pgd =
- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
- } else {
- efi_bak_pg_dir_pointer[0].pgd =
- swapper_pg_dir[pgd_index(0)].pgd;
- efi_bak_pg_dir_pointer[1].pgd =
- swapper_pg_dir[pgd_index(0x400000)].pgd;
- swapper_pg_dir[pgd_index(0)].pgd =
- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
- temp = PAGE_OFFSET + 0x400000;
- swapper_pg_dir[pgd_index(0x400000)].pgd =
- swapper_pg_dir[pgd_index(temp)].pgd;
- }
-
- /*
- * Flush the TLB so the low-memory 1:1 mapping just installed
- * takes effect.
- */
- __flush_tlb_all();
-
- gdt_descr.address = __pa(get_cpu_gdt_table(0));
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-}
-
-void efi_call_phys_epilog(void)
-{
- unsigned long cr4;
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
-
- cr4 = read_cr4_safe();
-
- if (cr4 & X86_CR4_PAE) {
- swapper_pg_dir[pgd_index(0)].pgd =
- efi_bak_pg_dir_pointer[0].pgd;
- } else {
- swapper_pg_dir[pgd_index(0)].pgd =
- efi_bak_pg_dir_pointer[0].pgd;
- swapper_pg_dir[pgd_index(0x400000)].pgd =
- efi_bak_pg_dir_pointer[1].pgd;
- }
-
- /*
- * The original page table has been restored above; flush the
- * stale low-memory 1:1 mapping out of the TLB.
- */
- __flush_tlb_all();
-
- local_irq_restore(efi_rt_eflags);
-}
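
The prelog/epilog pair above is only ever used as a bracket around a single physical-mode firmware call, as phys_efi_get_time() in efi.c shows. A sketch of what one more such wrapper would look like (hypothetical: it assumes efi_phys.get_wakeup_time had been captured the way get_time is, and that an efi_call_phys3 macro exists alongside the efi_call_phys2/efi_call_phys4 used above):

	static efi_status_t __init phys_efi_get_wakeup_time(efi_bool_t *enabled,
							    efi_bool_t *pending,
							    efi_time_t *tm)
	{
		efi_status_t status;

		efi_call_phys_prelog();		/* install 1:1 mapping, switch GDT */
		status = efi_call_phys3(efi_phys.get_wakeup_time,
					enabled, pending, tm);
		efi_call_phys_epilog();		/* restore page table and GDT */
		return status;
	}
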
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
deleted file mode 100644
index ac0621a7ac3d..000000000000
--- a/arch/x86/kernel/efi_64.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * x86_64 specific EFI support functions
- * Based on Extensible Firmware Interface Specification version 1.0
- *
- * Copyright (C) 2005-2008 Intel Co.
- * Fenghua Yu <fenghua.yu@intel.com>
- * Bibo Mao <bibo.mao@intel.com>
- * Chandramouli Narayanan <mouli@linux.intel.com>
- * Huang Ying <ying.huang@intel.com>
- *
- * Code to convert the EFI memory map to an E820 map has been implemented
- * in the elilo bootloader, based on an EFI patch by Edgar Hucek. Based on
- * the E820 map, the page table is set up appropriately for EFI runtime code.
- * - mouli 06/14/2007.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-#include <linux/reboot.h>
-
-#include <asm/setup.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm/efi.h>
-#include <asm/cacheflush.h>
-#include <asm/fixmap.h>
-
-static pgd_t save_pgd __initdata;
-static unsigned long efi_flags __initdata;
-
-static void __init early_mapping_set_exec(unsigned long start,
- unsigned long end,
- int executable)
-{
- unsigned long num_pages;
-
- start &= PMD_MASK;
- end = (end + PMD_SIZE - 1) & PMD_MASK;
- num_pages = (end - start) >> PAGE_SHIFT;
- if (executable)
- set_memory_x((unsigned long)__va(start), num_pages);
- else
- set_memory_nx((unsigned long)__va(start), num_pages);
-}
-
-static void __init early_runtime_code_mapping_set_exec(int executable)
-{
- efi_memory_desc_t *md;
- void *p;
-
- if (!(__supported_pte_mask & _PAGE_NX))
- return;
-
- /* Make EFI runtime service code area executable */
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if (md->type == EFI_RUNTIME_SERVICES_CODE) {
- unsigned long end;
- end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
- early_mapping_set_exec(md->phys_addr, end, executable);
- }
- }
-}
-
-void __init efi_call_phys_prelog(void)
-{
- unsigned long vaddress;
-
- early_runtime_code_mapping_set_exec(1);
- local_irq_save(efi_flags);
- vaddress = (unsigned long)__va(0x0UL);
- save_pgd = *pgd_offset_k(0x0UL);
- set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
- __flush_tlb_all();
-}
-
-void __init efi_call_phys_epilog(void)
-{
- /*
- * Restore the original page table and flush the stale 1:1
- * mapping from the TLB.
- */
- set_pgd(pgd_offset_k(0x0UL), save_pgd);
- __flush_tlb_all();
- local_irq_restore(efi_flags);
- early_runtime_code_mapping_set_exec(0);
-}
-
-void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
- u32 type)
-{
- unsigned long last_map_pfn;
-
- if (type == EFI_MEMORY_MAPPED_IO)
- return ioremap(phys_addr, size);
-
- last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
- if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
- return NULL;
-
- return (void __iomem *)__va(phys_addr);
-}
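
early_mapping_set_exec() above widens the range to whole PMD entries before changing NX, since the early kernel mapping uses large pages and a narrower range could not be expressed. The rounding is plain align-down/align-up arithmetic; a stand-alone sketch with literal constants (the 2 MB PMD size is the usual x86_64 value, assumed here):

	#include <stdint.h>
	#include <assert.h>

	#define SK_PMD_SIZE	(2ULL << 20)		/* 2 MB large page */
	#define SK_PMD_MASK	(~(SK_PMD_SIZE - 1))
	#define SK_PAGE_SHIFT	12

	/* Round [start, end) out to whole PMDs; return the 4 KB page count. */
	static uint64_t pmd_rounded_pages(uint64_t start, uint64_t end)
	{
		start &= SK_PMD_MASK;
		end = (end + SK_PMD_SIZE - 1) & SK_PMD_MASK;
		return (end - start) >> SK_PAGE_SHIFT;
	}

	int main(void)
	{
		/* Even a single 4 KB page costs a whole 2 MB PMD: */
		assert(pmd_rounded_pages(0x200000, 0x201000) == 512);
		return 0;
	}
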
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
deleted file mode 100644
index fbe66e626c09..000000000000
--- a/arch/x86/kernel/efi_stub_32.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * EFI call stub for IA32.
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off.
- */
-
-#include <linux/linkage.h>
-#include <asm/page_types.h>
-
-/*
- * efi_call_phys(void *, ...) is a function with variable parameters.
- * All of its callers ensure that every parameter is 4 bytes wide.
- */
-
-/*
- * In the gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee
- * saved, so we save all of them at the beginning of this function and
- * restore them at the end no matter how many we use, because we cannot
- * assume that EFI runtime service functions comply with the gcc calling
- * convention.
- */
-
-.text
-ENTRY(efi_call_phys)
- /*
- * 0. This function can only be called from the Linux kernel, so CS is
- * 0x0010 and DS and SS are 0x0018. Under EFI the values of these
- * registers, and the corresponding GDT entries, turn out to be the
- * same, so nothing needs to be done about the segment registers or the
- * GDT entries; only the GDT base register is changed, in the prelog
- * and epilog.
- */
-
- /*
- * 1. We are currently running with EIP = <physical address> +
- * PAGE_OFFSET. To make the switch from virtual mode to flat mode
- * smooth, the mapping of lower virtual memory has been set up in the
- * prelog (and is torn down again in the epilog).
- */
- movl $1f, %edx
- subl $__PAGE_OFFSET, %edx
- jmp *%edx
-1:
-
- /*
- * 2. Now on the top of stack is the return
- * address in the caller of efi_call_phys(), then parameter 1,
- * parameter 2, ..., param n. To make things easy, we save the return
- * address of efi_call_phys in a global variable.
- */
- popl %edx
- movl %edx, saved_return_addr
- /* get the function pointer into ECX*/
- popl %ecx
- movl %ecx, efi_rt_function_ptr
- movl $2f, %edx
- subl $__PAGE_OFFSET, %edx
- pushl %edx
-
- /*
- * 3. Clear PG bit in %CR0.
- */
- movl %cr0, %edx
- andl $0x7fffffff, %edx
- movl %edx, %cr0
- jmp 1f
-1:
-
- /*
- * 4. Adjust stack pointer.
- */
- subl $__PAGE_OFFSET, %esp
-
- /*
- * 5. Call the physical function.
- */
- jmp *%ecx
-
-2:
- /*
- * 6. After EFI runtime service returns, control will return to
- * following instruction. We'd better readjust stack pointer first.
- */
- addl $__PAGE_OFFSET, %esp
-
- /*
- * 7. Restore PG bit
- */
- movl %cr0, %edx
- orl $0x80000000, %edx
- movl %edx, %cr0
- jmp 1f
-1:
- /*
- * 8. Now restore virtual mode from flat mode by adding
- * PAGE_OFFSET to EIP.
- */
- movl $1f, %edx
- jmp *%edx
-1:
-
- /*
- * 9. Balance the stack. Because EAX contains the return value,
- * we must not clobber it.
- */
- leal efi_rt_function_ptr, %edx
- movl (%edx), %ecx
- pushl %ecx
-
- /*
- * 10. Push the saved return address onto the stack and return.
- */
- leal saved_return_addr, %edx
- movl (%edx), %ecx
- pushl %ecx
- ret
-ENDPROC(efi_call_phys)
-.previous
-
-.data
-saved_return_addr:
- .long 0
-efi_rt_function_ptr:
- .long 0
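
The recurring movl $label, %edx / subl $__PAGE_OFFSET, %edx pairs in the stub above are the lowmem virtual-to-physical conversion done by hand, because paging is about to be (or has just been) turned off. In C the same arithmetic is the familiar __pa()/__va() pair; a trivial sketch assuming the common 3G/1G split (PAGE_OFFSET = 0xC0000000 is an assumption here):

	#include <stdint.h>

	#define SK_PAGE_OFFSET	0xC0000000UL	/* assumed 3G/1G split */

	/* What the stub's subl $__PAGE_OFFSET does to a code address. */
	static inline uintptr_t sk_pa(uintptr_t virt)
	{
		return virt - SK_PAGE_OFFSET;
	}

	/* The inverse, as in step 6's addl that readjusts %esp. */
	static inline uintptr_t sk_va(uintptr_t phys)
	{
		return phys + SK_PAGE_OFFSET;
	}
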
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
deleted file mode 100644
index 4c07ccab8146..000000000000
--- a/arch/x86/kernel/efi_stub_64.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Function calling ABI conversion from Linux to EFI for x86_64
- *
- * Copyright (C) 2007 Intel Corp
- * Bibo Mao <bibo.mao@intel.com>
- * Huang Ying <ying.huang@intel.com>
- */
-
-#include <linux/linkage.h>
-
-#define SAVE_XMM \
- mov %rsp, %rax; \
- subq $0x70, %rsp; \
- and $~0xf, %rsp; \
- mov %rax, (%rsp); \
- mov %cr0, %rax; \
- clts; \
- mov %rax, 0x8(%rsp); \
- movaps %xmm0, 0x60(%rsp); \
- movaps %xmm1, 0x50(%rsp); \
- movaps %xmm2, 0x40(%rsp); \
- movaps %xmm3, 0x30(%rsp); \
- movaps %xmm4, 0x20(%rsp); \
- movaps %xmm5, 0x10(%rsp)
-
-#define RESTORE_XMM \
- movaps 0x60(%rsp), %xmm0; \
- movaps 0x50(%rsp), %xmm1; \
- movaps 0x40(%rsp), %xmm2; \
- movaps 0x30(%rsp), %xmm3; \
- movaps 0x20(%rsp), %xmm4; \
- movaps 0x10(%rsp), %xmm5; \
- mov 0x8(%rsp), %rsi; \
- mov %rsi, %cr0; \
- mov (%rsp), %rsp
-
-ENTRY(efi_call0)
- SAVE_XMM
- subq $32, %rsp
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call0)
-
-ENTRY(efi_call1)
- SAVE_XMM
- subq $32, %rsp
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call1)
-
-ENTRY(efi_call2)
- SAVE_XMM
- subq $32, %rsp
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call2)
-
-ENTRY(efi_call3)
- SAVE_XMM
- subq $32, %rsp
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call3)
-
-ENTRY(efi_call4)
- SAVE_XMM
- subq $32, %rsp
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $32, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call4)
-
-ENTRY(efi_call5)
- SAVE_XMM
- subq $48, %rsp
- mov %r9, 32(%rsp)
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $48, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call5)
-
-ENTRY(efi_call6)
- SAVE_XMM
- mov (%rsp), %rax
- mov 8(%rax), %rax
- subq $48, %rsp
- mov %r9, 32(%rsp)
- mov %rax, 40(%rsp)
- mov %r8, %r9
- mov %rcx, %r8
- mov %rsi, %rcx
- call *%rdi
- addq $48, %rsp
- RESTORE_XMM
- ret
-ENDPROC(efi_call6)
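
Each efi_callN above shuffles the System V argument registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9) into the Microsoft x64 convention (%rcx, %rdx, %r8, %r9, then stack) and reserves the 32-byte shadow space the callee is entitled to. Later toolchains can generate this conversion from C; a minimal sketch using GCC's ms_abi function attribute (the type and function names here are invented, and this replaces the hand-written stubs only conceptually):

	/* Declare the firmware entry point with the Microsoft x64 calling
	 * convention; GCC then emits the register shuffle and the 32-byte
	 * shadow space that efi_call3 performs by hand above. */
	typedef unsigned long (__attribute__((ms_abi)) *sk_efi_fn3_t)(void *a,
								      void *b,
								      void *c);

	static unsigned long sk_efi_call3(void *fn, void *a, void *b, void *c)
	{
		return ((sk_efi_fn3_t)fn)(a, b, c);
	}
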
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000000..9535a6507db7
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * EISA specific code
+ */
+#include <linux/cc_platform.h>
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+#include <xen/xen.h>
+
+static __init int eisa_bus_probe(void)
+{
+ u32 *p;
+
+ if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return 0;
+
+ p = memremap(0x0FFFD9, 4, MEMREMAP_WB);
+ if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
+ EISA_bus = 1;
+ memunmap(p);
+ return 0;
+}
+subsys_initcall(eisa_bus_probe);
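
The probe above reads four bytes at physical 0x0FFFD9 and compares them with 'EISA' packed little-endian into a u32. A host-side sketch of just the signature arithmetic (no memremap(); a local buffer stands in for the BIOS area, and little-endian byte order is assumed):

	#include <stdint.h>
	#include <string.h>
	#include <assert.h>

	int main(void)
	{
		const char sig[4] = { 'E', 'I', 'S', 'A' };
		uint32_t v;

		memcpy(&v, sig, sizeof(v));
		/* On a little-endian machine this equals 0x41534945,
		 * the constant the kernel builds with shifts. */
		assert(v == ('E' + ('I' << 8) + ('S' << 16) + ('A' << 24)));
		return 0;
	}
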
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
deleted file mode 100644
index cd49141cf153..000000000000
--- a/arch/x86/kernel/entry_32.S
+++ /dev/null
@@ -1,1500 +0,0 @@
-/*
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- * ptrace needs to have all regs on the stack.
- * if the order here is changed, it needs to be
- * updated in fork.c:copy_process, signal.c:do_signal,
- * ptrace.c and ptrace.h
- *
- * 0(%esp) - %ebx
- * 4(%esp) - %ecx
- * 8(%esp) - %edx
- * C(%esp) - %esi
- * 10(%esp) - %edi
- * 14(%esp) - %ebp
- * 18(%esp) - %eax
- * 1C(%esp) - %ds
- * 20(%esp) - %es
- * 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
- * 2C(%esp) - orig_eax
- * 30(%esp) - %eip
- * 34(%esp) - %cs
- * 38(%esp) - %eflags
- * 3C(%esp) - %oldesp
- * 40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
- */
-
-#include <linux/linkage.h>
-#include <asm/thread_info.h>
-#include <asm/irqflags.h>
-#include <asm/errno.h>
-#include <asm/segment.h>
-#include <asm/smp.h>
-#include <asm/page_types.h>
-#include <asm/percpu.h>
-#include <asm/dwarf2.h>
-#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
-#include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit syscall_trace_entry
-#define sysexit_audit syscall_exit_work
-#endif
-
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#define nr_syscalls ((syscall_table_size)/4)
-
-#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-#define preempt_stop(clobbers)
-#define resume_kernel restore_all
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-#ifdef CONFIG_VM86
-#define resume_userspace_sig check_userspace
-#else
-#define resume_userspace_sig resume_userspace
-#endif
-
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS, and the kernel only uses it for the
- * stack canary, which gcc requires to be at %gs:20. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be a no-op */
-.macro PUSH_GS
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
- CFI_ADJUST_CFA_OFFSET -(4 + \pop)
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else /* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
- pushl %gs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET gs, 0*/
-.endm
-
-.macro POP_GS pop=0
-98: popl %gs
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE gs*/
- .if \pop <> 0
- add $\pop, %esp
- CFI_ADJUST_CFA_OFFSET -\pop
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
-.popsection
-.endm
-
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
-.popsection
-.endm
-
-.macro GS_TO_REG reg
- movl %gs, \reg
- /*CFI_REGISTER gs, \reg*/
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, PT_GS*/
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
-.endm
-
-#endif /* CONFIG_X86_32_LAZY_GS */
-
-.macro SAVE_ALL
- cld
- PUSH_GS
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0;*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0;*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0;*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- movl $(__USER_DS), %edx
- movl %edx, %ds
- movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
- SET_KERNEL_GS %edx
-.endm
-
-.macro RESTORE_INT_REGS
- popl %ebx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebx
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- popl %esi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE esi
- popl %edi
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edi
- popl %ebp
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ebp
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE eax
-.endm
-
-.macro RESTORE_REGS pop=0
- RESTORE_INT_REGS
-1: popl %ds
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE ds;*/
-2: popl %es
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE es;*/
-3: popl %fs
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_RESTORE fs;*/
- POP_GS \pop
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.section __ex_table, "a"
- .align 4
- .long 1b, 4b
- .long 2b, 5b
- .long 3b, 6b
-.popsection
- POP_GS_EX
-.endm
-
-.macro RING0_INT_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 3*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 4*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
- CFI_OFFSET eip, PT_EIP-PT_OLDESP
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
- CFI_OFFSET eax, PT_EAX-PT_OLDESP
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP
- CFI_OFFSET edi, PT_EDI-PT_OLDESP
- CFI_OFFSET esi, PT_ESI-PT_OLDESP
- CFI_OFFSET edx, PT_EDX-PT_OLDESP
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP
- CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
-ENTRY(ret_from_fork)
- CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- call schedule_tail
- GET_THREAD_INFO(%ebp)
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- pushl $0x0202 # Reset kernel eflags
- CFI_ADJUST_CFA_OFFSET 4
- popfl
- CFI_ADJUST_CFA_OFFSET -4
- jmp syscall_exit
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * Interrupt exit functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
- ALIGN
- RING0_PTREGS_FRAME
-ret_from_exception:
- preempt_stop(CLBR_ANY)
-ret_from_intr:
- GET_THREAD_INFO(%ebp)
-check_userspace:
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
- cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
-
-ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
- jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_all
-need_resched:
- movl TI_flags(%ebp), %ecx # need_resched set ?
- testb $_TIF_NEED_RESCHED, %cl
- jz restore_all
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
- call preempt_schedule_irq
- jmp need_resched
-END(resume_kernel)
-#endif
- CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
-
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
- the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
-
- # sysenter call handler stub
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 0
- CFI_REGISTER esp, ebp
- movl TSS_sysenter_sp0(%esp),%esp
-sysenter_past_esp:
- /*
- * Interrupts are disabled here, but we can't trace it until
- * enough kernel state to call TRACE_IRQS_OFF can be called - but
- * we immediately enable interrupts at that point anyway.
- */
- pushl $(__USER_DS)
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ss, 0*/
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esp, 0
- pushfl
- orl $X86_EFLAGS_IF, (%esp)
- CFI_ADJUST_CFA_OFFSET 4
- pushl $(__USER_CS)
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET cs, 0*/
- /*
- * Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
- * pushed above; +8 corresponds to copy_thread's esp0 setting.
- */
- pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eip, 0
-
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
-1: movl (%ebp),%ebp
- movl %ebp,PT_EBP(%esp)
-.section __ex_table,"a"
- .align 4
- .long 1b,syscall_fault
-.previous
-
- GET_THREAD_INFO(%ebp)
-
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz sysenter_audit
-sysenter_do_call:
- cmpl $(nr_syscalls), %eax
- jae syscall_badsys
- call *sys_call_table(,%eax,4)
- movl %eax,PT_EAX(%esp)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jne sysexit_audit
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp,%ebp
- TRACE_IRQS_ON
-1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
- ENABLE_INTERRUPTS_SYSEXIT
-
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
- jnz syscall_trace_entry
- addl $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
- /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
- /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
- /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
- movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
- movl %eax,%edx /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
- call audit_syscall_entry
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- movl PT_EAX(%esp),%eax /* reload syscall number */
- jmp sysenter_do_call
-
-sysexit_audit:
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- movl %eax,%edx /* second arg, syscall return value */
- cmpl $0,%eax /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
- movzbl %al,%eax /* zero-extend that */
- inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
- movl PT_EAX(%esp),%eax /* reload syscall return value */
- jmp sysenter_exit
-#endif
-
- CFI_ENDPROC
-.pushsection .fixup,"ax"
-2: movl $0,PT_FS(%esp)
- jmp 1b
-.section __ex_table,"a"
- .align 4
- .long 1b,2b
-.popsection
- PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
-
-/*
- * syscall stub including irq exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
- # system call handler stub
-ENTRY(system_call)
- RING0_INT_FRAME # can't unwind into user space anyway
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(nr_syscalls), %eax
- jae syscall_badsys
-syscall_call:
- call *sys_call_table(,%eax,4)
- movl %eax,PT_EAX(%esp) # store the return value
-syscall_exit:
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jne syscall_exit_work
-
-restore_all:
- TRACE_IRQS_IRET
-restore_all_notrace:
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
-restore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
- CFI_ADJUST_CFA_OFFSET -4
-irq_return:
- INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
-.previous
-.section __ex_table,"a"
- .align 4
- .long irq_return,iret_exc
-.previous
-
- CFI_RESTORE_STATE
-ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # all right, normal return
-
-#ifdef CONFIG_PARAVIRT
- /*
- * The kernel can't run on a non-flat stack if paravirt mode
- * is active. Rather than try to fix up the high bits of
- * ESP, bypass this code entirely. This may break DOSemu
- * and/or Wine support in a paravirt VM, although the option
- * is still available to implement the setting of the high
- * 16-bits in the INTERRUPT_RETURN paravirt-op.
- */
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
-#endif
-
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16 bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that matches for the difference.
- */
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- PER_CPU(gdt_page, %ebx)
- shr $16, %edx
- mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
- mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
- pushl $__ESPFIX_SS
- CFI_ADJUST_CFA_OFFSET 4
- push %eax /* new kernel esp */
- CFI_ADJUST_CFA_OFFSET 4
- /* Disable interrupts, but do not irqtrace this section: we
- * will soon execute iret and the tracer was already set to
- * the irqstate after the iret */
- DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
- CFI_ENDPROC
-ENDPROC(system_call)
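
The word-splicing in the ESPFIX setup above (mov %dx, %ax) is easier to read in C: the new stack pointer keeps the user stack's high word and takes the kernel stack's low word, and the difference, whose low word is zero by construction, becomes the segment base offset programmed into the GDT. A small sketch (names invented):

	#include <stdint.h>
	#include <assert.h>

	/* Mirror the asm: edx = kernel esp, eax = user esp, mov %dx,%ax. */
	static void espfix_split(uint32_t kernel_esp, uint32_t user_esp,
				 uint32_t *new_esp, uint32_t *seg_base)
	{
		*new_esp = (user_esp & 0xffff0000u) | (kernel_esp & 0xffffu);
		*seg_base = kernel_esp - *new_esp;	/* low word is 0 */
		assert((*seg_base & 0xffffu) == 0);
	}

	int main(void)
	{
		uint32_t esp, base;

		espfix_split(0xc1234567u, 0x00004567u, &esp, &base);
		assert(esp == 0x00004567u && base == 0xc1230000u);
		return 0;
	}
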
-
- # perform work that needs to be done immediately before resumption
- ALIGN
- RING0_PTREGS_FRAME # can't unwind into user space anyway
-work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
-work_resched:
- call schedule
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
-
-work_notifysig: # deal with pending signals and
- # notify-resume requests
-#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jne work_notifysig_v86 # returning to kernel-space or
- # vm86-space
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace_sig
-
- ALIGN
-work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- CFI_ADJUST_CFA_OFFSET 4
- call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- movl %eax, %esp
-#else
- movl %esp, %eax
-#endif
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace_sig
-END(work_pending)
-
- # perform syscall exit tracing
- ALIGN
-syscall_trace_entry:
- movl $-ENOSYS,PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
- /* What it returned is what we'll actually use. */
- cmpl $(nr_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
-END(syscall_trace_entry)
-
- # perform syscall exit tracing
- ALIGN
-syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
- movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
-END(syscall_exit_work)
- CFI_ENDPROC
-
- RING0_INT_FRAME # can't unwind into user space anyway
-syscall_fault:
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT,PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_badsys)
- CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
-
-/*
- * System calls that need a pt_regs pointer.
- */
-#define PTREGSCALL0(name) \
- ALIGN; \
-ptregs_##name: \
- leal 4(%esp),%eax; \
- jmp sys_##name;
-
-#define PTREGSCALL1(name) \
- ALIGN; \
-ptregs_##name: \
- leal 4(%esp),%edx; \
- movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name;
-
-#define PTREGSCALL2(name) \
- ALIGN; \
-ptregs_##name: \
- leal 4(%esp),%ecx; \
- movl (PT_ECX+4)(%esp),%edx; \
- movl (PT_EBX+4)(%esp),%eax; \
- jmp sys_##name;
-
-#define PTREGSCALL3(name) \
- ALIGN; \
-ptregs_##name: \
- leal 4(%esp),%eax; \
- pushl %eax; \
- movl PT_EDX(%eax),%ecx; \
- movl PT_ECX(%eax),%edx; \
- movl PT_EBX(%eax),%eax; \
- call sys_##name; \
- addl $4,%esp; \
- ret
-
-PTREGSCALL1(iopl)
-PTREGSCALL0(fork)
-PTREGSCALL0(vfork)
-PTREGSCALL3(execve)
-PTREGSCALL2(sigaltstack)
-PTREGSCALL0(sigreturn)
-PTREGSCALL0(rt_sigreturn)
-PTREGSCALL2(vm86)
-PTREGSCALL1(vm86old)
-
-/* Clone is an oddball. The 4th arg is in %edi */
- ALIGN;
-ptregs_clone:
- leal 4(%esp),%eax
- pushl %eax
- pushl PT_EDI(%eax)
- movl PT_EDX(%eax),%ecx
- movl PT_ECX(%eax),%edx
- movl PT_EBX(%eax),%eax
- call sys_clone
- addl $8,%esp
- ret
-
-.macro FIXUP_ESPFIX_STACK
-/*
- * Switch back from the ESPFIX stack to the normal zero-based stack.
- *
- * We can't call C functions using the ESPFIX stack. This code reads
- * the high word of the segment base from the GDT, switches to the
- * normal stack, and adjusts ESP with the matching offset.
- */
- /* fixup the stack */
- PER_CPU(gdt_page, %ebx)
- mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
- mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
- shl $16, %eax
- addl %esp, %eax /* the adjusted stack pointer */
- pushl $__KERNEL_DS
- CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
-.endm
-.macro UNWIND_ESPFIX_STACK
- movl %ss, %eax
- /* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- /* switch to normal stack */
- FIXUP_ESPFIX_STACK
-27:
-.endm
-
-/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
- */
-.section .init.rodata,"a"
-ENTRY(interrupt)
-.text
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-ENTRY(irq_entries_start)
- RING0_INT_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < NR_VECTORS
- .if vector <> FIRST_EXTERNAL_VECTOR
- CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 4
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .long 1b
- .text
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
-END(irq_entries_start)
-
-.previous
-END(interrupt)
-.previous
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
- SAVE_ALL
- TRACE_IRQS_OFF
- movl %esp,%eax
- call do_IRQ
- jmp ret_from_intr
-ENDPROC(common_interrupt)
- CFI_ENDPROC
-
-/*
- * Irq entries should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-#define BUILD_INTERRUPT3(name, nr, fn) \
-ENTRY(name) \
- RING0_INT_FRAME; \
- pushl $~(nr); \
- CFI_ADJUST_CFA_OFFSET 4; \
- SAVE_ALL; \
- TRACE_IRQS_OFF \
- movl %esp,%eax; \
- call fn; \
- jmp ret_from_intr; \
- CFI_ENDPROC; \
-ENDPROC(name)
-
-#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-ENTRY(coprocessor_error)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_error
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl $do_general_protection
-662:
-.section .altinstructions,"a"
- .balign 4
- .long 661b
- .long 663f
- .byte X86_FEATURE_XMM
- .byte 662b-661b
- .byte 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663: pushl $do_simd_coprocessor_error
-664:
-.previous
-#else
- pushl $do_simd_coprocessor_error
-#endif
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_device_not_available
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iret
-.section __ex_table,"a"
- .align 4
- .long native_iret, iret_exc
-.previous
-END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
-#endif
-
-ENTRY(overflow)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_overflow
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(overflow)
-
-ENTRY(bounds)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_bounds
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(bounds)
-
-ENTRY(invalid_op)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_invalid_op
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_coprocessor_segment_overrun
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
- RING0_EC_FRAME
- pushl $do_invalid_TSS
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- RING0_EC_FRAME
- pushl $do_segment_not_present
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(segment_not_present)
-
-ENTRY(stack_segment)
- RING0_EC_FRAME
- pushl $do_stack_segment
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(stack_segment)
-
-ENTRY(alignment_check)
- RING0_EC_FRAME
- pushl $do_alignment_check
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(alignment_check)
-
-ENTRY(divide_error)
- RING0_INT_FRAME
- pushl $0 # no error code
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_divide_error
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl machine_check_vector
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
- RING0_INT_FRAME
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- pushl $do_spurious_interrupt_bug
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(spurious_interrupt_bug)
-/*
- * End of kprobes section
- */
- .popsection
-
-ENTRY(kernel_thread_helper)
- pushl $0 # fake return address for unwinder
- CFI_STARTPROC
- movl %edi,%eax
- call *%esi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-ENDPROC(kernel_thread_helper)
-
-#ifdef CONFIG_XEN
-/* Xen doesn't set %esp to be precisely what the normal sysenter
- entrypoint expects, so fix it up before using the normal path. */
-ENTRY(xen_sysenter_target)
- RING0_INT_FRAME
- addl $5*4, %esp /* remove xen-provided frame */
- CFI_ADJUST_CFA_OFFSET -5*4
- jmp sysenter_past_esp
- CFI_ENDPROC
-
-ENTRY(xen_hypervisor_callback)
- CFI_STARTPROC
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
-
- /* Check to see if we got the event in the critical
- region in xen_iret_direct, after we've reenabled
- events and checked for pending events. This simulates the
- iret instruction's behaviour of delivering a pending
- interrupt when interrupts are enabled. */
- movl PT_EIP(%esp),%eax
- cmpl $xen_iret_start_crit,%eax
- jb 1f
- cmpl $xen_iret_end_crit,%eax
- jae 1f
-
- jmp xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1: mov %esp, %eax
- call xen_evtchn_do_upcall
- jmp ret_from_intr
- CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
- CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl $1,%eax
-1: mov 4(%esp),%ds
-2: mov 8(%esp),%es
-3: mov 12(%esp),%fs
-4: mov 16(%esp),%gs
- testl %eax,%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- lea 16(%esp),%esp
- CFI_ADJUST_CFA_OFFSET -16
- jz 5f
- addl $16,%esp
- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- jmp ret_from_exception
- CFI_ENDPROC
-
-.section .fixup,"ax"
-6: xorl %eax,%eax
- movl %eax,4(%esp)
- jmp 1b
-7: xorl %eax,%eax
- movl %eax,8(%esp)
- jmp 2b
-8: xorl %eax,%eax
- movl %eax,12(%esp)
- jmp 3b
-9: xorl %eax,%eax
- movl %eax,16(%esp)
- jmp 4b
-.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,6b
- .long 2b,7b
- .long 3b,8b
- .long 4b,9b
-.previous
-ENDPROC(xen_failsafe_callback)
-
-#endif /* CONFIG_XEN */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
- ret
-END(mcount)
-
-ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
- call ftrace_stub
-
- popl %edx
- popl %ecx
- popl %eax
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- jmp ftrace_stub
-#endif
-
-.globl ftrace_stub
-ftrace_stub:
- ret
-END(ftrace_caller)
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpl $ftrace_stub, ftrace_trace_function
- jnz trace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
- ret
-
- /* taken from glibc */
-trace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %edx
- lea 0x4(%ebp), %eax
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %edx
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
- ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
- pushl %eax
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, %ecx
- popl %edx
- popl %eax
- jmp *%ecx
-#endif
-
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
-
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
-ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
- ALIGN
-error_code:
- /* the function address is in %gs's slot on the stack */
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-END(page_fault)
-
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
- cmpw $__KERNEL_CS, 4(%esp)
- jne \ok
-\label:
- movl TSS_sysenter_sp0 + \offset(%esp), %esp
- CFI_DEF_CFA esp, 0
- CFI_UNDEFINED eip
- pushfl
- CFI_ADJUST_CFA_OFFSET 4
- pushl $__KERNEL_CS
- CFI_ADJUST_CFA_OFFSET 4
- pushl $sysenter_past_esp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eip, 0
-.endm
-
-ENTRY(debug)
- RING0_INT_FRAME
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- je nmi_espfix_stack
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %esp,%eax
-	/* Do not access memory above the end of our stack page;
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_all_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK 12, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the far pointer that lss will use to switch back
- */
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
- addl $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- .endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
- CFI_ENDPROC
-END(nmi)
-
-ENTRY(int3)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-END(int3)
-
-ENTRY(general_protection)
- RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-END(general_protection)
-
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
deleted file mode 100644
index 4db7c4d12ffa..000000000000
--- a/arch/x86/kernel/entry_64.S
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*
- * linux/arch/x86_64/entry.S
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
- * Normal syscalls and interrupts don't save a full stack frame; this is
- * only done for syscall tracing, signals or fork/exec et al.
- *
- * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
- *
- * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * are not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
- * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
- */
-
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/cache.h>
-#include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/msr.h>
-#include <asm/unistd.h>
-#include <asm/thread_info.h>
-#include <asm/hw_irq.h>
-#include <asm/page_types.h>
-#include <asm/irqflags.h>
-#include <asm/paravirt.h>
-#include <asm/ftrace.h>
-#include <asm/percpu.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE 0x40000000
-
- .code64
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
- retq
-END(mcount)
-
-ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- MCOUNT_SAVE_FRAME
-
- leaq 8(%rbp), %rdi
- movq 0x38(%rsp), %rsi
- movq (%rbp), %rdx
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $24, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $24, %rsp
- jmp *%rdi
-#endif
-
-
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret64)
- swapgs
- sysretq
-ENDPROC(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT */
-
-
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
-#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * C code is not supposed to know about the undefined top of stack. Every time
- * a C function with a pt_regs argument is called from the SYSCALL-based
- * fast path, FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp offset=0
- movq PER_CPU_VAR(old_rsp),\tmp
- movq \tmp,RSP+\offset(%rsp)
- movq $__USER_DS,SS+\offset(%rsp)
- movq $__USER_CS,CS+\offset(%rsp)
- movq $-1,RCX+\offset(%rsp)
- movq R11+\offset(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS+\offset(%rsp)
- .endm
-
- .macro RESTORE_TOP_OF_STACK tmp offset=0
- movq RSP+\offset(%rsp),\tmp
- movq \tmp,PER_CPU_VAR(old_rsp)
- movq EFLAGS+\offset(%rsp),\tmp
- movq \tmp,R11+\offset(%rsp)
- .endm
-
- .macro FAKE_STACK_FRAME child_rip
- /* push in order ss, rsp, eflags, cs, rip */
- xorl %eax, %eax
- pushq $__KERNEL_DS /* ss */
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET ss,0*/
- pushq %rax /* rsp */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rsp,0
- pushq $X86_EFLAGS_IF /* eflags - interrupts on */
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET rflags,0*/
- pushq $__KERNEL_CS /* cs */
- CFI_ADJUST_CFA_OFFSET 8
- /*CFI_REL_OFFSET cs,0*/
- pushq \child_rip /* rip */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip,0
- pushq %rax /* orig rax */
- CFI_ADJUST_CFA_OFFSET 8
- .endm
-
- .macro UNFAKE_STACK_FRAME
- addq $8*6, %rsp
- CFI_ADJUST_CFA_OFFSET -(6*8)
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro EMPTY_FRAME start=1 offset=0
- .if \start
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,8+\offset
- .else
- CFI_DEF_CFA_OFFSET 8+\offset
- .endif
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, SS+8+\offset-RIP
- /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
- CFI_REL_OFFSET rsp, RSP+\offset-RIP
- /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
- /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
- CFI_REL_OFFSET rip, RIP+\offset-RIP
- .endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
- .macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, RIP+\offset-ORIG_RAX
- /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
- .endm
-
-/*
- * frame that enables calling into C.
- */
- .macro PARTIAL_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
- CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
- CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
- CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
- CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
- CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
- CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
- CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
- .endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
- .macro DEFAULT_FRAME start=1 offset=0
- PARTIAL_FRAME \start, R11+\offset-R15
- CFI_REL_OFFSET rbx, RBX+\offset
- CFI_REL_OFFSET rbp, RBP+\offset
- CFI_REL_OFFSET r12, R12+\offset
- CFI_REL_OFFSET r13, R13+\offset
- CFI_REL_OFFSET r14, R14+\offset
- CFI_REL_OFFSET r15, R15+\offset
- .endm
-
-/* save partial stack frame */
-ENTRY(save_args)
- XCPT_FRAME
- cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
- movq_cfi rbp, 8 /* push %rbp */
- leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
- testl $3, CS(%rdi)
- je 1f
- SWAPGS
- /*
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
-1: incl PER_CPU_VAR(irq_count)
- jne 2f
- popq_cfi %rax /* move return address... */
- mov PER_CPU_VAR(irq_stack_ptr),%rsp
- EMPTY_FRAME 0
- pushq_cfi %rbp /* backlink for unwinder */
- pushq_cfi %rax /* ... to the new stack */
- /*
- * We entered an interrupt context - irqs are off:
- */
-2: TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-END(save_args)
-
-ENTRY(save_rest)
- PARTIAL_FRAME 1 REST_SKIP+8
- movq 5*8+16(%rsp), %r11 /* save return address */
- movq_cfi rbx, RBX+16
- movq_cfi rbp, RBP+16
- movq_cfi r12, R12+16
- movq_cfi r13, R13+16
- movq_cfi r14, R14+16
- movq_cfi r15, R15+16
- movq %r11, 8(%rsp) /* return address */
- FIXUP_TOP_OF_STACK %r11, 16
- ret
- CFI_ENDPROC
-END(save_rest)
-
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
- cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
- movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(save_paranoid)
- .popsection
-
-/*
- * A newly forked process directly context switches into this address.
- *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 8
- popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -8
-
- call schedule_tail # rdi: 'prev' task parameter
-
- GET_THREAD_INFO(%rcx)
-
- RESTORE_REST
-
- testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
-
- testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
- jnz int_ret_from_sys_call
-
- RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
- jmp ret_from_sys_call # go to the SYSRET fastpath
-
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
- *
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer.
- */
-
-/*
- * Register setup:
- * rax system call number
- * rdi arg0
- * rcx return address for syscall/sysret, C arg3
- * rsi arg1
- * rdx arg2
- * r10 arg3 (--> moved to rcx for C)
- * r8 arg4
- * r9 arg5
- * r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
- *
- * Interrupts are off on entry.
- * Only called from user space.
- *
- * XXX if we had a free scratch register we could save the RSP into the stack frame
- * and report it properly in ps. Unfortunately we have none.
- *
- * When the user can change the frames, always force IRET. That is because
- * IRET deals with non-canonical addresses better. SYSRET has trouble
- * with them due to bugs in both AMD and Intel CPUs.
- */
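For readers who do not have the ABI memorized, the register table above is the same convention user space follows when issuing a raw SYSCALL. A minimal user-space sketch of it (illustrative only, not part of this patch; assumes x86-64 Linux, where __NR_write is 1):

#include <stddef.h>

/* Raw write(2) via SYSCALL: nr in rax, args in rdi/rsi/rdx,
 * return value back in rax, exactly as documented above. */
static long raw_write(int fd, const void *buf, size_t len)
{
	long ret;

	__asm__ volatile("syscall"
			 : "=a" (ret)
			 : "a" (1L /* __NR_write */), "D" ((long)fd),
			   "S" (buf), "d" (len)
			 : "rcx", "r11", "memory");
	return ret;
}

The rcx/r11 clobbers correspond directly to the "return address for syscall/sysret" and "eflags for syscall/sysret" rows in the table: the CPU overwrites both, so the compiler must not keep anything live there.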
-
-ENTRY(system_call)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
- CFI_REGISTER rip,rcx
- /*CFI_REGISTER rflags,r11*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-ENTRY(system_call_after_swapgs)
-
- movq %rsp,PER_CPU_VAR(old_rsp)
- movq PER_CPU_VAR(kernel_stack),%rsp
- /*
- * No need to follow this irqs off/on section - it's straight
- * and short:
- */
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,1
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
- jnz tracesys
-system_call_fastpath:
- cmpq $__NR_syscall_max,%rax
- ja badsys
- movq %r10,%rcx
- call *sys_call_table(,%rax,8) # XXX: rip relative
- movq %rax,RAX-ARGOFFSET(%rsp)
-/*
- * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
- */
-ret_from_sys_call:
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: flagmask */
-sysret_check:
- LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz sysret_careful
- CFI_REMEMBER_STATE
- /*
- * sysretq will re-enable interrupts:
- */
- TRACE_IRQS_ON
- movq RIP-ARGOFFSET(%rsp),%rcx
- CFI_REGISTER rip,rcx
- RESTORE_ARGS 0,-ARG_SKIP,1
- /*CFI_REGISTER rflags,r11*/
- movq PER_CPU_VAR(old_rsp), %rsp
- USERGS_SYSRET64
-
- CFI_RESTORE_STATE
- /* Handle reschedules */
- /* edx: work, edi: workmask */
-sysret_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc sysret_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- jmp sysret_check
-
- /* Handle a signal */
-sysret_signal:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
- bt $TIF_SYSCALL_AUDIT,%edx
- jc sysret_audit
-#endif
- /*
- * We have a signal, or exit tracing or single-step.
- * These all wind up with the iret return path anyway,
- * so just join that path right now.
- */
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_check_syscall_exit_work
-
-badsys:
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp ret_from_sys_call
-
-#ifdef CONFIG_AUDITSYSCALL
- /*
- * Fast path for syscall audit without full syscall trace.
- * We just call audit_syscall_entry() directly, and then
- * jump back to the normal fast path.
- */
-auditsys:
- movq %r10,%r9 /* 6th arg: 4th syscall arg */
- movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
- movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
- movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
- movq %rax,%rsi /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call audit_syscall_entry
- LOAD_ARGS 0 /* reload call-clobbered registers */
- jmp system_call_fastpath
-
- /*
- * Return fast path for syscall audit. Call audit_syscall_exit()
- * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
- * masked off.
- */
-sysret_audit:
- movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $0,%rsi /* is it < 0? */
- setl %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
- call audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
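The setl/inc pair in sysret_audit is compact enough to misread; in C terms it computes the following (a sketch only, with the constant names taken from the comment in that block):

/* Map a syscall return value to the audit result code:
 * 0 -> 1 (AUDITSC_SUCCESS), 1 -> 2 (AUDITSC_FAILURE). */
static int audit_result(long ret)
{
	return (ret < 0) + 1;
}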
-
- /* Do syscall tracing */
-tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
- jz auditsys
-#endif
- SAVE_REST
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
- FIXUP_TOP_OF_STACK %rdi
- movq %rsp,%rdi
- call syscall_trace_enter
- /*
- * Reload arg registers from stack in case ptrace changed them.
- * We don't reload %rax because syscall_trace_enter() returned
- * the value it wants us to use in the table lookup.
- */
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
- cmpq $__NR_syscall_max,%rax
- ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
- movq %r10,%rcx /* fixup for C */
- call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- /* Use IRET because user could have changed frame */
-
-/*
- * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
- */
-GLOBAL(int_ret_from_sys_call)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_restore_args
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: mask to check */
-GLOBAL(int_with_check)
- LOCKDEP_SYS_EXIT_IRQ
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
-
- /* Either reschedule or signal or syscall exit tracking needed. */
- /* First do a reschedule test. */
- /* edx: work, edi: workmask */
-int_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc int_very_careful
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
- /* handle signals and tracing -- both require a full stack frame */
-int_very_careful:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
- SAVE_REST
- /* Check for syscall exit trace */
- testl $_TIF_WORK_SYSCALL_EXIT,%edx
- jz int_signal
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- leaq 8(%rsp),%rdi # &ptregs -> arg1
- call syscall_trace_leave
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
- jmp int_restore_rest
-
-int_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
- movq %rsp,%rdi # &ptregs -> arg1
- xorl %esi,%esi # oldset -> arg2
- call do_notify_resume
-1: movl $_TIF_WORK_MASK,%edi
-int_restore_rest:
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
- CFI_ENDPROC
-END(system_call)
-
-/*
- * Certain special system calls that need to save a complete full stack frame.
- */
- .macro PTREGSCALL label,func,arg
-ENTRY(\label)
- PARTIAL_FRAME 1 8 /* offset 8: return address */
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- call save_rest
- DEFAULT_FRAME 0 8 /* offset 8: return address */
- leaq 8(%rsp), \arg /* pt_regs pointer */
- call \func
- jmp ptregscall_common
- CFI_ENDPROC
-END(\label)
- .endm
-
- PTREGSCALL stub_clone, sys_clone, %r8
- PTREGSCALL stub_fork, sys_fork, %rdi
- PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
- PTREGSCALL stub_iopl, sys_iopl, %rsi
-
-ENTRY(ptregscall_common)
- DEFAULT_FRAME 1 8 /* offset 8: return address */
- RESTORE_TOP_OF_STACK %r11, 8
- movq_cfi_restore R15+8, r15
- movq_cfi_restore R14+8, r14
- movq_cfi_restore R13+8, r13
- movq_cfi_restore R12+8, r12
- movq_cfi_restore RBP+8, rbp
- movq_cfi_restore RBX+8, rbx
- ret $REST_SKIP /* pop extended registers */
- CFI_ENDPROC
-END(ptregscall_common)
-
-ENTRY(stub_execve)
- CFI_STARTPROC
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- movq %rsp, %rcx
- call sys_execve
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_execve)
-
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */
-ENTRY(stub_rt_sigreturn)
- CFI_STARTPROC
- addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
- SAVE_REST
- movq %rsp,%rdi
- FIXUP_TOP_OF_STACK %r11
- call sys_rt_sigreturn
- movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_rt_sigreturn)
-
-/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
- */
- .section .init.rodata,"a"
-ENTRY(interrupt)
- .text
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-ENTRY(irq_entries_start)
- INTR_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < NR_VECTORS
- .if vector <> FIRST_EXTERNAL_VECTOR
- CFI_ADJUST_CFA_OFFSET -8
- .endif
-1: pushq $(~vector+0x80) /* Note: always in signed byte range */
- CFI_ADJUST_CFA_OFFSET 8
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .quad 1b
- .text
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
- CFI_ENDPROC
-END(irq_entries_start)
-
-.previous
-END(interrupt)
-.previous
-
-/*
- * Interrupt entry/exit.
- *
- * Interrupt entry points save only callee-clobbered registers in the fast path.
- *
- * Entry runs with interrupts off.
- */
-
-/* 0(%rsp): ~(interrupt number) */
- .macro interrupt func
- subq $10*8, %rsp
- CFI_ADJUST_CFA_OFFSET 10*8
- call save_args
- PARTIAL_FRAME 0
- call \func
- .endm
-
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
- /*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_interrupt.
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- XCPT_FRAME
- addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
- interrupt do_IRQ
- /* 0(%rsp): old_rsp-ARGOFFSET */
-ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
- leaveq
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
-exit_intr:
- GET_THREAD_INFO(%rcx)
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_kernel
-
- /* Interrupt came from user space */
- /*
- * Has a correct top of stack, but a partial stack frame
- * %rcx: thread info. Interrupts off.
- */
-retint_with_reschedule:
- movl $_TIF_WORK_MASK,%edi
-retint_check:
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- CFI_REMEMBER_STATE
- jnz retint_careful
-
-retint_swapgs: /* return to user-space */
- /*
- * The iretq could re-enable interrupts:
- */
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_IRETQ
- SWAPGS
- jmp restore_args
-
-retint_restore_args: /* return to kernel space */
- DISABLE_INTERRUPTS(CLBR_ANY)
- /*
- * The iretq could re-enable interrupts:
- */
- TRACE_IRQS_IRETQ
-restore_args:
- RESTORE_ARGS 0,8,0
-
-irq_return:
- INTERRUPT_RETURN
-
- .section __ex_table, "a"
- .quad irq_return, bad_iret
- .previous
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iretq
-
- .section __ex_table,"a"
- .quad native_iret, bad_iret
- .previous
-#endif
-
- .section .fixup,"ax"
-bad_iret:
- /*
- * The iret traps when the %cs or %ss being restored is bogus.
- * We've lost the original trap vector and error code.
- * #GPF is the most likely one to get for an invalid selector.
- * So pretend we completed the iret and took the #GPF in user mode.
- *
- * We are now running with the kernel GS after exception recovery.
- * But error_entry expects us to have user GS to match the user %cs,
- * so swap back.
- */
- pushq $0
-
- SWAPGS
- jmp general_protection
-
- .previous
-
- /* edi: workmask, edx: work */
-retint_careful:
- CFI_RESTORE_STATE
- bt $TIF_NEED_RESCHED,%edx
- jnc retint_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- CFI_ADJUST_CFA_OFFSET 8
- call schedule
- popq %rdi
- CFI_ADJUST_CFA_OFFSET -8
- GET_THREAD_INFO(%rcx)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp retint_check
-
-retint_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz retint_swapgs
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
- movq $-1,ORIG_RAX(%rsp)
- xorl %esi,%esi # oldset
- movq %rsp,%rdi # &pt_regs
- call do_notify_resume
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- jmp retint_with_reschedule
-
-#ifdef CONFIG_PREEMPT
- /* Returning to kernel space. Check if we need preemption */
- /* rcx: threadinfo. interrupts off. */
-ENTRY(retint_kernel)
- cmpl $0,TI_preempt_count(%rcx)
- jnz retint_restore_args
- bt $TIF_NEED_RESCHED,TI_flags(%rcx)
- jnc retint_restore_args
- bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
- jnc retint_restore_args
- call preempt_schedule_irq
- jmp exit_intr
-#endif
-
- CFI_ENDPROC
-END(common_interrupt)
-/*
- * End of kprobes section
- */
- .popsection
-
-/*
- * APIC interrupts.
- */
-.macro apicinterrupt num sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- pushq $~(\num)
- CFI_ADJUST_CFA_OFFSET 8
- interrupt \do_sym
- jmp ret_from_intr
- CFI_ENDPROC
-END(\sym)
-.endm
-
-#ifdef CONFIG_SMP
-apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
- irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt REBOOT_VECTOR \
- reboot_interrupt smp_reboot_interrupt
-#endif
-
-#ifdef CONFIG_X86_UV
-apicinterrupt UV_BAU_MESSAGE \
- uv_bau_message_intr1 uv_bau_message_interrupt
-#endif
-apicinterrupt LOCAL_TIMER_VECTOR \
- apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR \
- x86_platform_ipi smp_x86_platform_ipi
-
-#ifdef CONFIG_SMP
-apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
- invalidate_interrupt0 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
- invalidate_interrupt1 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
- invalidate_interrupt2 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
- invalidate_interrupt3 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
- invalidate_interrupt4 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
- invalidate_interrupt5 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
- invalidate_interrupt6 smp_invalidate_interrupt
-apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
- invalidate_interrupt7 smp_invalidate_interrupt
-#endif
-
-apicinterrupt THRESHOLD_APIC_VECTOR \
- threshold_interrupt smp_threshold_interrupt
-apicinterrupt THERMAL_APIC_VECTOR \
- thermal_interrupt smp_thermal_interrupt
-
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
- mce_self_interrupt smp_mce_self_interrupt
-#endif
-
-#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
- call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR \
- call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR \
- reschedule_interrupt smp_reschedule_interrupt
-#endif
-
-apicinterrupt ERROR_APIC_VECTOR \
- error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR \
- spurious_interrupt smp_spurious_interrupt
-
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
- perf_pending_interrupt smp_perf_pending_interrupt
-#endif
-
-/*
- * Exception entry points.
- */
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-.macro paranoidzeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
- subq $15*8, %rsp
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %r12)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
- call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $15*8,%rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call save_paranoid
- DEFAULT_FRAME 0
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
-
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
-
- /* Reload gs selector with exception handling */
- /* edi: new selector */
-ENTRY(native_load_gs_index)
- CFI_STARTPROC
- pushf
- CFI_ADJUST_CFA_OFFSET 8
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
- SWAPGS
-gs_change:
- movl %edi,%gs
-2: mfence /* workaround */
- SWAPGS
- popf
- CFI_ADJUST_CFA_OFFSET -8
- ret
- CFI_ENDPROC
-END(native_load_gs_index)
-
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
- .section .fixup,"ax"
- /* running with kernelgs */
-bad_gs:
- SWAPGS /* switch back to user gs */
- xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
- .previous
-
-ENTRY(kernel_thread_helper)
- pushq $0 # fake return address
- CFI_STARTPROC
- /*
- * Here we are in the child and the registers are set as they were
- * at kernel_thread() invocation in the parent.
- */
- call *%rsi
- # exit
- mov %eax, %edi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-END(kernel_thread_helper)
-
-/*
- * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
- *
- * C extern interface:
- * extern long execve(char *name, char **argv, char **envp)
- *
- * asm input arguments:
- * rdi: name, rsi: argv, rdx: envp
- *
- * We want to fall back into:
- * extern long sys_execve(char *name, char **argv, char **envp, struct pt_regs *regs)
- *
- * do_sys_execve asm fallback arguments:
- * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
- */
-ENTRY(kernel_execve)
- CFI_STARTPROC
- FAKE_STACK_FRAME $0
- SAVE_ALL
- movq %rsp,%rcx
- call sys_execve
- movq %rax, RAX(%rsp)
- RESTORE_REST
- testq %rax,%rax
- je int_ret_from_sys_call
- RESTORE_ARGS
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_execve)
-
-/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
- CFI_STARTPROC
- push %rbp
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp,0
- mov %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr),%rsp
- push %rbp # backlink for old unwinder
- call __do_softirq
- leaveq
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
- decl PER_CPU_VAR(irq_count)
- ret
- CFI_ENDPROC
-END(call_softirq)
-
-#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
-
-/*
- * A note on the "critical region" in our callback handler.
- * We want to avoid stacking callback handlers due to events occurring
- * during handling of the last event. To do this, we keep events disabled
- * until we've done all processing. HOWEVER, we must enable events before
- * popping the stack frame (can't be done atomically) and so it would still
- * be possible to get enough handler activations to overflow the stack.
- * Although unlikely, bugs of that kind are hard to track down, so we'd
- * like to avoid the possibility.
- * So, on entry to the handler we detect whether we interrupted an
- * existing activation in its critical region -- if so, we pop the current
- * activation and restart the handler using the previous one.
- */
-ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct pt_regs *)
- CFI_STARTPROC
-/*
- * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
- * see the correct pointer to the pt_regs.
- */
- movq %rdi, %rsp # we don't return, adjust the stack frame
- CFI_ENDPROC
- DEFAULT_FRAME
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- pushq %rbp # backlink for old unwinder
- call xen_evtchn_do_upcall
- popq %rsp
- CFI_DEF_CFA_REGISTER rsp
- decl PER_CPU_VAR(irq_count)
- jmp error_exit
- CFI_ENDPROC
-END(xen_do_hypervisor_callback)
-
-/*
- * Hypervisor uses this for application faults while it executes.
- * We get here for two reasons:
- * 1. Fault while reloading DS, ES, FS or GS
- * 2. Fault while executing IRET
- * Category 1 we do not need to fix up as Xen has already reloaded all segment
- * registers that could be reloaded and zeroed the others.
- * Category 2 we fix up by killing the current process. We cannot use the
- * normal Linux return path in this case because if we use the IRET hypercall
- * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
- * We distinguish between categories by comparing each saved segment register
- * with its current contents: any discrepancy means we are in category 1.
- */
-ENTRY(xen_failsafe_callback)
- INTR_FRAME 1 (6*8)
- /*CFI_REL_OFFSET gs,GS*/
- /*CFI_REL_OFFSET fs,FS*/
- /*CFI_REL_OFFSET es,ES*/
- /*CFI_REL_OFFSET ds,DS*/
- CFI_REL_OFFSET r11,8
- CFI_REL_OFFSET rcx,0
- movw %ds,%cx
- cmpw %cx,0x10(%rsp)
- CFI_REMEMBER_STATE
- jne 1f
- movw %es,%cx
- cmpw %cx,0x18(%rsp)
- jne 1f
- movw %fs,%cx
- cmpw %cx,0x20(%rsp)
- jne 1f
- movw %gs,%cx
- cmpw %cx,0x28(%rsp)
- jne 1f
- /* All segments match their saved values => Category 2 (Bad IRET). */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0 /* RIP */
- pushq_cfi %r11
- pushq_cfi %rcx
- jmp general_protection
- CFI_RESTORE_STATE
-1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0
- SAVE_ALL
- jmp error_exit
- CFI_ENDPROC
-END(xen_failsafe_callback)
-
-#endif /* CONFIG_XEN */
-
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
-#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
-#endif
-errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
-#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
-#endif
-
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- *
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
- */
-
- /* ebx: no swapgs flag */
-ENTRY(paranoid_exit)
- INTR_FRAME
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
- TRACE_IRQS_IRETQ 0
- SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
- jmp irq_return
-paranoid_restore:
- TRACE_IRQS_IRETQ 0
- RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
-paranoid_schedule:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
- CFI_ENDPROC
-END(paranoid_exit)
-
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
- */
-ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
- cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
- movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
- xorl %ebx,%ebx
- testl $3,CS+8(%rsp)
- je error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B-stepping K8s
- * sometimes report a truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
- */
-error_kernelspace:
- incl %ebx
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- movl %ecx,%eax /* zero extend */
- cmpq %rax,RIP+8(%rsp)
- je bstep_iret
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
-
-bstep_iret:
- /* Fix truncated RIP */
- movq %rcx,RIP+8(%rsp)
- jmp error_swapgs
-END(error_entry)
-
-
-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
-ENTRY(error_exit)
- DEFAULT_FRAME
- movl %ebx,%eax
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jne retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
- CFI_ENDPROC
-END(error_exit)
-
-
- /* runs on exception stack */
-ENTRY(nmi)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call save_paranoid
- DEFAULT_FRAME 0
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp,%rdi
- movq $-1,%rsi
- call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
-nmi_swapgs:
- SWAPGS_UNSAFE_STACK
-nmi_restore:
- RESTORE_ALL 8
- jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
- CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
-END(nmi)
-
-ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
- sysret
- CFI_ENDPROC
-END(ignore_sysret)
-
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000000..6726e0473d0b
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * The kernel works around this by creating percpu "ministacks", each of
+ * which is mapped 2^16 times, 64K apart. When we detect that the return
+ * SS is on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET faults we promote #GP to #DF, which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more virtual address space for the ESPFIX hack"
+#endif
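As a quick sanity check on these limits, here is a stand-alone sketch of the arithmetic (the 39 and 12 are assumptions for 4-level paging, where P4D_SHIFT is 39 and PAGE_SHIFT is 12):

#include <stdio.h>

int main(void)
{
	unsigned long page_space = 1UL << (39 - 12 - 16); /* ESPFIX_PAGE_SPACE: 2048 pages */
	unsigned long per_page   = 4096UL / 64;           /* ESPFIX_STACKS_PER_PAGE: 64 stacks */

	printf("espfix capacity: %lu CPUs\n", page_space * per_page); /* 131072 */
	return 0;
}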
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
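The bit splice on the second-to-last line is the heart of the aliasing scheme: the low 16 bits of the ministack address stay in place (they become the low 16 bits IRET will expose), while everything above them is shifted up past the 64K alias window. A stand-alone sketch of the same math, with assumed constants (4 KiB pages, 64-byte stacks, randomization disabled):

#include <stdio.h>

#define SKETCH_PAGE_SHIFT      12
#define SKETCH_STACK_SIZE      64UL
#define SKETCH_STACKS_PER_PAGE (4096UL / SKETCH_STACK_SIZE)

static unsigned long sketch_base_addr(unsigned int cpu)
{
	unsigned long page = cpu / SKETCH_STACKS_PER_PAGE;
	unsigned long slot = cpu % SKETCH_STACKS_PER_PAGE;
	unsigned long addr = (page << SKETCH_PAGE_SHIFT) + slot * SKETCH_STACK_SIZE;

	/* keep the low 16 bits, push everything else above the 64K window */
	return (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
}

int main(void)
{
	unsigned int cpus[] = { 0, 1, 64, 1024 };
	unsigned int i;

	for (i = 0; i < 4; i++)	/* offsets relative to ESPFIX_BASE_ADDR */
		printf("cpu %4u -> +0x%010lx\n", cpus[i], sketch_base_addr(cpus[i]));
	return 0;
}

CPUs 0 through 1023 land inside the first 64K window (pages 0-15); CPU 1024 is the first to spill past it, and the splice pushes it 4 GiB up rather than into the next 64K, leaving the intermediate addresses free for the 2^16 aliases of each page.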
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand = get_random_long();
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ /* FRED systems always restore the full value of %rsp */
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+ p4d_populate(&init_mm, p4d, espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap(0);
+}
+
+void init_espfix_ap(int cpu)
+{
+ unsigned int page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n, node;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* FRED systems always restore the full value of %rsp */
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return;
+
+ /* We only have to do this once... */
+ if (likely(per_cpu(espfix_stack, cpu)))
+ return; /* Already initialized */
+
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ node = cpu_to_node(cpu);
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
+ /*
+ * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since
+ * this is mapped to userspace.
+ */
+ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ WRITE_ONCE(espfix_pages[page], stack_page);
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
+}
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile
new file mode 100644
index 000000000000..78c5621457d4
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Build rules for the FPU support code:
+#
+
+obj-y += init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
new file mode 100644
index 000000000000..edbafc5940e3
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 FPU bug checks:
+ */
+#include <linux/printk.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+void __init fpu__init_check_bugs(void)
+{
+ s32 fdiv_bug;
+
+ /* kernel_fpu_begin/end() relies on patched alternative instructions. */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ kernel_fpu_begin();
+
+ /*
+ * trap_init() enabled FXSR and company _before_ testing for FP
+ * problems here.
+ *
+ * Test for the FDIV bug: http://en.wikipedia.org/wiki/Fdiv_bug
+ */
+ __asm__("fninit\n\t"
+ "fldl %1\n\t"
+ "fdivl %2\n\t"
+ "fmull %2\n\t"
+ "fldl %1\n\t"
+ "fsubp %%st,%%st(1)\n\t"
+ "fistpl %0\n\t"
+ "fwait\n\t"
+ "fninit"
+ : "=m" (*&fdiv_bug)
+ : "m" (*&x), "m" (*&y));
+
+ kernel_fpu_end();
+
+ if (fdiv_bug) {
+ set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+ pr_warn("Hmm, FPU with FDIV bug\n");
+ }
+}
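For reference, the x87 sequence above is the in-kernel equivalent of this user-space sketch (illustrative only; the kernel uses inline assembly because it is testing the very FPU context that kernel_fpu_begin() handed it, and must leave the FPU reinitialized):

#include <stdio.h>

int main(void)
{
	volatile double x = 4195835.0, y = 3145727.0;
	/* Collapse ulp noise to an integer, much as fistpl does in the
	 * kernel version: 0 on a correct FPU, roughly -256 on a flawed
	 * Pentium. */
	long residue = (long)((x / y) * y - x);

	printf("fdiv residue: %ld -> %s\n",
	       residue, residue ? "FDIV bug present" : "ok");
	return 0;
}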
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
new file mode 100644
index 000000000000..10d0a720659c
--- /dev/null
+++ b/arch/x86/kernel/fpu/context.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_CONTEXT_H
+#define __X86_KERNEL_FPU_CONTEXT_H
+
+#include <asm/fpu/xstate.h>
+#include <asm/trace/fpu.h>
+
+/* Functions related to FPU context tracking */
+
+/*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Invalidate a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
+ */
+static inline void __cpu_invalidate_fpregs_state(void)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+ fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
+{
+ return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ trace_x86_fpu_regs_deactivated(fpu);
+}
+
+static inline void fpregs_activate(struct fpu *fpu)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+ trace_x86_fpu_regs_activated(fpu);
+}
+
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ int cpu = smp_processor_id();
+
+ if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ return;
+
+ if (!fpregs_state_valid(fpu, cpu)) {
+ /*
+ * This restores _all_ xstate which has not been
+ * established yet.
+ *
+ * If PKRU is enabled, then the PKRU value is already
+ * correct because it was either set in switch_to() or in
+ * flush_thread(). So it is excluded because it might be
+ * not up to date in current->thread.fpu->xsave state.
+ *
+ * XFD state is handled in restore_fpregs_from_fpstate().
+ */
+ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = cpu;
+ }
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
new file mode 100644
index 000000000000..da233f20ae6f
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,989 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/api.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/types.h>
+#include <asm/msr.h>
+#include <asm/traps.h>
+#include <asm/irq_regs.h>
+
+#include <uapi/asm/kvm.h>
+
+#include <linux/hardirq.h>
+#include <linux/kvm_types.h>
+#include <linux/pkeys.h>
+#include <linux/vmalloc.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/fpu.h>
+
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
+DEFINE_PER_CPU(u64, xfd_state);
+#endif
+
+/* The FPU state configuration data for kernel and user space */
+struct fpu_state_config fpu_kernel_cfg __ro_after_init;
+struct fpu_state_config fpu_user_cfg __ro_after_init;
+struct vcpu_fpu_config guest_default_cfg __ro_after_init;
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+struct fpstate init_fpstate __ro_after_init;
+
+/*
+ * Track FPU initialization and kernel-mode usage. 'true' means the FPU is
+ * initialized and is not currently being used by the kernel:
+ */
+DEFINE_PER_CPU(bool, kernel_fpu_allowed);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+#ifdef CONFIG_X86_DEBUG_FPU
+struct fpu *x86_task_fpu(struct task_struct *task)
+{
+ if (WARN_ON_ONCE(task->flags & PF_KTHREAD))
+ return NULL;
+
+ return (void *)task + sizeof(*task);
+}
+#endif
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ */
+bool irq_fpu_usable(void)
+{
+ if (WARN_ON_ONCE(in_nmi()))
+ return false;
+
+ /*
+ * Return false in the following cases:
+ *
+ * - FPU is not yet initialized. This can happen only when the call is
+ * coming from CPU onlining, for example for microcode checksumming.
+ * - The kernel is already using the FPU, either because of explicit
+ * nesting (which should never be done), or because of implicit
+ * nesting when a hardirq interrupted a kernel-mode FPU section.
+ *
+ * The single boolean check below handles both cases:
+ */
+ if (!this_cpu_read(kernel_fpu_allowed))
+ return false;
+
+ /*
+ * When not in NMI or hard interrupt context, FPU can be used in:
+ *
+ * - Task context except from within fpregs_lock()'ed critical
+ * regions.
+ *
+ * - Soft interrupt processing context which cannot happen
+ * while in a fpregs_lock()'ed critical region.
+ */
+ if (!in_hardirq())
+ return true;
+
+ /*
+ * In hard interrupt context it's safe when soft interrupts
+ * are enabled, which means the interrupt did not hit in
+ * a fpregs_lock()'ed critical region.
+ */
+ return !softirq_count();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+/*
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
+ */
+static void update_avx_timestamp(struct fpu *fpu)
+{
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
+}
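+
+/*
+ * Editor's note (hedged): the timestamp recorded above is consumed by the
+ * proc interface; mainline reports it as "AVX512_elapsed_ms" in
+ * /proc/<pid>/arch_status, allowing tools to spot recent AVX-512 usage.
+ */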
+
+/*
+ * Save the FPU register state in fpu->fpstate->regs. The register state is
+ * preserved.
+ *
+ * Must be called with fpregs_lock() held.
+ *
+ * The legacy FNSAVE instruction clears all FPU state unconditionally, so
+ * register state has to be reloaded. That might be a pointless exercise
+ * when the FPU is going to be used by another task right after that. But
+ * this only affects 20+ year old 32-bit systems and avoids conditionals all
+ * over the place.
+ *
+ * FXSAVE and all XSAVE variants preserve the FPU register state.
+ */
+void save_fpregs_to_fpstate(struct fpu *fpu)
+{
+ if (likely(use_xsave())) {
+ os_xsave(fpu->fpstate);
+ update_avx_timestamp(fpu);
+ return;
+ }
+
+ if (likely(use_fxsr())) {
+ fxsave(&fpu->fpstate->regs.fxsave);
+ return;
+ }
+
+ /*
+ * Legacy FPU register saving, FNSAVE always clears FPU registers,
+ * so we have to reload them from the memory state.
+ */
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
+ frstor(&fpu->fpstate->regs.fsave);
+}
+
+void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
+{
+ /*
+ * AMD K7/K8 and later CPUs up to Zen don't save/restore
+ * FDP/FIP/FOP unless an exception is pending. Clear the x87 state
+ * here by setting it to fixed values. "m" is an arbitrary memory
+ * operand that should already be resident in the L1 cache.
+ */
+ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %[addr]" /* set F?P to defined value */
+ : : [addr] "m" (*fpstate));
+ }
+
+ if (use_xsave()) {
+ /*
+ * Dynamically enabled features are enabled in XCR0, but
+ * usage requires also that the corresponding bits in XFD
+ * are cleared. If the bits are set then using a related
+ * instruction will raise #NM. This allows the larger FPU
+ * buffer to be allocated lazily from the #NM handler, or the
+ * task to be killed if it has no permission; disabling the
+ * feature in XCR0 instead would raise #UD.
+ *
+ * XFD state follows the same lifetime rules as XSTATE, and
+ * to restore state correctly XFD has to be updated before
+ * XRSTORS, otherwise the component would
+ * stay in or go into init state even if the bits are set
+ * in fpstate::regs::xsave::xfeatures.
+ */
+ xfd_update_state(fpstate);
+
+ /*
+ * Restoring state always needs to modify all features
+ * which are in @mask even if the current task cannot use
+ * extended features.
+ *
+ * So fpstate->xfeatures cannot be used here, because then
+ * a feature for which the task has no permission but was
+ * used by the previous task would not go into init state.
+ */
+ mask = fpu_kernel_cfg.max_features & mask;
+
+ os_xrstor(fpstate, mask);
+ } else {
+ if (use_fxsr())
+ fxrstor(&fpstate->regs.fxsave);
+ else
+ frstor(&fpstate->regs.fsave);
+ }
+}
+
+void fpu_reset_from_exception_fixup(void)
+{
+ restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+static void __fpstate_reset(struct fpstate *fpstate);
+
+static void fpu_lock_guest_permissions(void)
+{
+ struct fpu_state_perm *fpuperm;
+ u64 perm;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ spin_lock_irq(&current->sighand->siglock);
+ fpuperm = &x86_task_fpu(current->group_leader)->guest_perm;
+ perm = fpuperm->__state_perm;
+
+ /* First fpstate allocation locks down permissions. */
+ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED);
+
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate;
+ unsigned int size;
+
+ size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64);
+
+ fpstate = vzalloc(size);
+ if (!fpstate)
+ return false;
+
+ /* Initialize indicators to reflect properties of the fpstate */
+ fpstate->is_valloc = true;
+ fpstate->is_guest = true;
+
+ __fpstate_reset(fpstate);
+ fpstate_init_user(fpstate);
+
+ gfpu->fpstate = fpstate;
+ gfpu->xfeatures = guest_default_cfg.features;
+
+ /*
+ * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
+ * to userspace, even when XSAVE is unsupported, so that restoring FPU
+ * state on a different CPU that does support XSAVE can cleanly load
+ * the incoming state using its natural XSAVE. In other words, KVM's
+ * uABI size may be larger than this host's default size. Conversely,
+ * the default size should never be larger than KVM's base uABI size;
+ * all features that can expand the uABI size must be opt-in.
+ */
+ gfpu->uabi_size = sizeof(struct kvm_xsave);
+ if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
+ gfpu->uabi_size = fpu_user_cfg.default_size;
+
+ fpu_lock_guest_permissions();
+
+ return true;
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_alloc_guest_fpstate);
+
+void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
+{
+ struct fpstate *fpstate = gfpu->fpstate;
+
+ if (!fpstate)
+ return;
+
+ if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use))
+ return;
+
+ gfpu->fpstate = NULL;
+ vfree(fpstate);
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_free_guest_fpstate);
+
+/**
+ * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
+ * @guest_fpu: Pointer to the guest FPU container
+ * @xfeatures: Features requested by guest CPUID
+ *
+ * Enable all dynamic xfeatures according to guest perm and requested CPUID.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
+{
+ lockdep_assert_preemption_enabled();
+
+ /* Nothing to do if all requested features are already enabled. */
+ xfeatures &= ~guest_fpu->xfeatures;
+ if (!xfeatures)
+ return 0;
+
+ return __xfd_enable_feature(xfeatures, guest_fpu);
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
+
+#ifdef CONFIG_X86_64
+void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
+{
+ fpregs_lock();
+ guest_fpu->fpstate->xfd = xfd;
+ if (guest_fpu->fpstate->in_use)
+ xfd_update_state(guest_fpu->fpstate);
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
+
+/**
+ * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
+ *
+ * Must be invoked from KVM after a VMEXIT before enabling interrupts when
+ * XFD write emulation is disabled. This is required because the guest can
+ * freely modify XFD and the state at VMEXIT is not guaranteed to be the
+ * same as the state on VMENTER. So software state has to be updated before
+ * any operation which depends on it can take place.
+ *
+ * Note: It can be invoked unconditionally even when write emulation is
+ * enabled, at the price of a pointless MSR read in that case.
+ */
+void fpu_sync_guest_vmexit_xfd_state(void)
+{
+ struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
+
+ lockdep_assert_irqs_disabled();
+ if (fpu_state_size_dynamic()) {
+ rdmsrq(MSR_IA32_XFD, fpstate->xfd);
+ __this_cpu_write(xfd_state, fpstate->xfd);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_sync_guest_vmexit_xfd_state);
+#endif /* CONFIG_X86_64 */
+
+int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
+{
+ struct fpstate *guest_fps = guest_fpu->fpstate;
+ struct fpu *fpu = x86_task_fpu(current);
+ struct fpstate *cur_fps = fpu->fpstate;
+
+ fpregs_lock();
+ if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ /* Swap fpstate */
+ if (enter_guest) {
+ fpu->__task_fpstate = cur_fps;
+ fpu->fpstate = guest_fps;
+ guest_fps->in_use = true;
+ } else {
+ guest_fps->in_use = false;
+ fpu->fpstate = fpu->__task_fpstate;
+ fpu->__task_fpstate = NULL;
+ }
+
+ cur_fps = fpu->fpstate;
+
+ if (!cur_fps->is_confidential) {
+ /* Includes XFD update */
+ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
+ } else {
+ /*
+ * XSTATE is restored by firmware from encrypted
+ * memory. Make sure XFD state is correct while
+ * running with guest fpstate
+ */
+ xfd_update_state(cur_fps);
+ }
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_swap_kvm_fpstate);
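+
+/*
+ * Usage sketch (editor's addition, not part of this patch): a KVM-style
+ * caller brackets guest execution with fpstate swaps:
+ *
+ *     fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);   <- enter guest
+ *     ... run the vCPU ...
+ *     fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);  <- back to host
+ *
+ * 'vcpu->arch.guest_fpu' is KVM's container; any struct fpu_guest set up
+ * via fpu_alloc_guest_fpstate() is expected to work the same way.
+ */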
+
+void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
+ unsigned int size, u64 xfeatures, u32 pkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ union fpregs_state *ustate = buf;
+ struct membuf mb = { .p = buf, .left = size };
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru,
+ XSTATE_COPY_XSAVE);
+ } else {
+ memcpy(&ustate->fxsave, &kstate->regs.fxsave,
+ sizeof(ustate->fxsave));
+ /* Make it restorable on a XSAVE enabled host */
+ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_guest_fpstate_to_uabi);
+
+int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
+ u64 xcr0, u32 *vpkru)
+{
+ struct fpstate *kstate = gfpu->fpstate;
+ const union fpregs_state *ustate = buf;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
+ if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
+ return -EINVAL;
+ if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave));
+ return 0;
+ }
+
+ if (ustate->xsave.header.xfeatures & ~xcr0)
+ return -EINVAL;
+
+ /*
+ * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
+ * in the header. KVM's odd ABI is to leave PKRU untouched in this
+ * case (all other components are eventually re-initialized).
+ */
+ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
+ vpkru = NULL;
+
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
+}
+EXPORT_SYMBOL_FOR_KVM(fpu_copy_uabi_to_guest_fpstate);
+#endif /* CONFIG_KVM */
+
+void kernel_fpu_begin_mask(unsigned int kfpu_mask)
+{
+ if (!irqs_disabled())
+ fpregs_lock();
+
+ WARN_ON_FPU(!irq_fpu_usable());
+
+ /* Toggle kernel_fpu_allowed to false: */
+ WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed));
+ this_cpu_write(kernel_fpu_allowed, false);
+
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
+ !test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ save_fpregs_to_fpstate(x86_task_fpu(current));
+ }
+ __cpu_invalidate_fpregs_state();
+
+ /* Put sane initial values into the control registers. */
+ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
+ ldmxcsr(MXCSR_DEFAULT);
+
+ if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
+ asm volatile ("fninit");
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
+
+void kernel_fpu_end(void)
+{
+ /* Toggle kernel_fpu_allowed back to true: */
+ WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed));
+ this_cpu_write(kernel_fpu_allowed, true);
+
+ if (!irqs_disabled())
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
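+
+/*
+ * Usage sketch (editor's addition, hedged): kernel-mode FPU sections are
+ * bracketed by begin/end, and callers which may run in interrupt context
+ * check irq_fpu_usable() first:
+ *
+ *     if (irq_fpu_usable()) {
+ *             kernel_fpu_begin();
+ *             ... SSE/AVX code ...
+ *             kernel_fpu_end();
+ *     }
+ *
+ * kernel_fpu_begin() is expected to be a thin wrapper around
+ * kernel_fpu_begin_mask(); on 64-bit kernels it passes KFPU_MXCSR only.
+ */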
+
+/*
+ * Sync the FPU register state to current's memory register state when the
+ * current task owns the FPU. The hardware register state is preserved.
+ */
+void fpu_sync_fpstate(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != x86_task_fpu(current));
+
+ fpregs_lock();
+ trace_x86_fpu_before_save(fpu);
+
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
+
+ trace_x86_fpu_after_save(fpu);
+ fpregs_unlock();
+}
+
+static inline unsigned int init_fpstate_copy_size(void)
+{
+ if (!use_xsave())
+ return fpu_kernel_cfg.default_size;
+
+ /* XSAVE(S) just needs the legacy and the xstate header part */
+ return sizeof(init_fpstate.regs.xsave);
+}
+
+static inline void fpstate_init_fxstate(struct fpstate *fpstate)
+{
+ fpstate->regs.fxsave.cwd = 0x37f;
+ fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
+}
+
+/*
+ * Legacy x87 fpstate state init:
+ */
+static inline void fpstate_init_fstate(struct fpstate *fpstate)
+{
+ fpstate->regs.fsave.cwd = 0xffff037fu;
+ fpstate->regs.fsave.swd = 0xffff0000u;
+ fpstate->regs.fsave.twd = 0xffffffffu;
+ fpstate->regs.fsave.fos = 0xffff0000u;
+}
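+
+/*
+ * Editor's note on the init values above (hedged): a control word of
+ * 0x37f masks all x87 exceptions and selects 64-bit extended precision;
+ * an all-ones tag word marks every stack register as empty. The 0xffff
+ * upper bits reflect how these 16-bit registers read back through the
+ * 32-bit fsave layout.
+ */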
+
+/*
+ * Used in two places:
+ * 1) Early boot, to set up init_fpstate for non-XSAVE systems
+ * 2) fpu_alloc_guest_fpstate() which is invoked from KVM
+ */
+void fpstate_init_user(struct fpstate *fpstate)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpstate_init_soft(&fpstate->regs.soft);
+ return;
+ }
+
+ xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);
+
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ fpstate_init_fxstate(fpstate);
+ else
+ fpstate_init_fstate(fpstate);
+}
+
+static void __fpstate_reset(struct fpstate *fpstate)
+{
+ /*
+ * Supervisor features (and thus sizes) may diverge between guest
+ * FPUs and host FPUs, as some supervisor features are supported
+ * for guests despite not being utilized by the host. User
+ * features and sizes are always identical, which allows for
+ * common guest and userspace ABI.
+ *
+ * For the host, set XFD to the kernel's desired initialization
+ * value. For guests, set XFD to its architectural RESET value.
+ */
+ if (fpstate->is_guest) {
+ fpstate->size = guest_default_cfg.size;
+ fpstate->xfeatures = guest_default_cfg.features;
+ fpstate->xfd = 0;
+ } else {
+ fpstate->size = fpu_kernel_cfg.default_size;
+ fpstate->xfeatures = fpu_kernel_cfg.default_features;
+ fpstate->xfd = init_fpstate.xfd;
+ }
+
+ fpstate->user_size = fpu_user_cfg.default_size;
+ fpstate->user_xfeatures = fpu_user_cfg.default_features;
+}
+
+void fpstate_reset(struct fpu *fpu)
+{
+ /* Set the fpstate pointer to the default fpstate */
+ fpu->fpstate = &fpu->__fpstate;
+ __fpstate_reset(fpu->fpstate);
+
+ /* Initialize the permission related info in fpu */
+ fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
+ fpu->perm.__state_size = fpu_kernel_cfg.default_size;
+ fpu->perm.__user_state_size = fpu_user_cfg.default_size;
+
+ fpu->guest_perm.__state_perm = guest_default_cfg.features;
+ fpu->guest_perm.__state_size = guest_default_cfg.size;
+ /*
+ * User features and sizes are always identical between host and
+ * guest FPUs, which allows for common guest and userspace ABI.
+ */
+ fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size;
+}
+
+static inline void fpu_inherit_perms(struct fpu *dst_fpu)
+{
+ if (fpu_state_size_dynamic()) {
+ struct fpu *src_fpu = x86_task_fpu(current->group_leader);
+
+ spin_lock_irq(&current->sighand->siglock);
+ /* Fork also inherits the permissions of the parent */
+ dst_fpu->perm = src_fpu->perm;
+ dst_fpu->guest_perm = src_fpu->guest_perm;
+ spin_unlock_irq(&current->sighand->siglock);
+ }
+}
+
+/* A passed ssp of zero will not cause any update */
+static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
+{
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ struct cet_user_state *xstate;
+
+ /* Nothing to do if no ssp update is needed. */
+ if (!ssp)
+ return 0;
+
+ xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave,
+ XFEATURE_CET_USER);
+
+ /*
+ * If there is a non-zero ssp, then 'dst' must be configured with a shadow
+ * stack and the fpu state should be up to date since it was just copied
+ * from the parent in fpu_clone(). So there must be a valid non-init CET
+ * state location in the buffer.
+ */
+ if (WARN_ON_ONCE(!xstate))
+ return 1;
+
+ xstate->user_ssp = (u64)ssp;
+#endif
+ return 0;
+}
+
+/* Clone current's FPU state on fork */
+int fpu_clone(struct task_struct *dst, u64 clone_flags, bool minimal,
+ unsigned long ssp)
+{
+ /*
+ * We allocate the new FPU structure right after the end of the task struct.
+ * The task allocation size already takes this into account.
+ *
+ * This is safe because task_struct size is a multiple of cacheline size,
+ * thus x86_task_fpu() will always be cacheline aligned as well.
+ */
+ struct fpu *dst_fpu = (void *)dst + sizeof(*dst);
+
+ BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0);
+
+ /* The new task's FPU state cannot be valid in the hardware. */
+ dst_fpu->last_cpu = -1;
+
+ fpstate_reset(dst_fpu);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
+ return 0;
+
+ /*
+ * Enforce reload for user space tasks and prevent kernel threads
+ * from trying to save the FPU registers on context switch.
+ */
+ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
+
+ /*
+ * No FPU state inheritance for kernel threads and IO
+ * worker threads.
+ */
+ if (minimal) {
+ /* Clear out the minimal state */
+ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
+ init_fpstate_copy_size());
+ return 0;
+ }
+
+ /*
+ * If a new feature is added, ensure all dynamic features are
+ * caller-saved from here!
+ */
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ /*
+ * Save the default portion of the current FPU state into the
+ * clone. Assume all dynamic features to be defined as caller-
+ * saved, which enables skipping both the expansion of fpstate
+ * and the copying of any dynamic state.
+ *
+ * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
+ * copying is not valid when current uses non-default states.
+ */
+ fpregs_lock();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+ save_fpregs_to_fpstate(dst_fpu);
+ fpregs_unlock();
+ if (!(clone_flags & CLONE_THREAD))
+ fpu_inherit_perms(dst_fpu);
+
+ /*
+ * Children never inherit PASID state.
+ * Force it to have its init value:
+ */
+ if (use_xsave())
+ dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;
+
+ /*
+ * Update shadow stack pointer, in case it changed during clone.
+ */
+ if (update_fpu_shstk(dst, ssp))
+ return 1;
+
+ trace_x86_fpu_copy_dst(dst_fpu);
+
+ return 0;
+}
+
+/*
+ * While struct fpu is no longer part of struct thread_struct, it is still
+ * allocated after struct task_struct in the "task_struct" kmem cache. But
+ * since FPU is expected to be part of struct thread_struct, we have to
+ * adjust for it here.
+ */
+void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ /* The allocation follows struct task_struct. */
+ *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread);
+ *offset += offsetof(struct fpu, __fpstate.regs);
+ *size = fpu_kernel_cfg.default_size;
+}
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct task_struct *tsk)
+{
+ struct fpu *fpu;
+
+ if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD))
+ return;
+
+ fpu = x86_task_fpu(tsk);
+
+ preempt_disable();
+
+ if (fpu == x86_task_fpu(current)) {
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
+ fpregs_deactivate(fpu);
+ }
+
+ trace_x86_fpu_dropped(fpu);
+
+ preempt_enable();
+}
+
+/*
+ * Clear FPU registers by setting them up from the init fpstate.
+ * Caller must do fpregs_[un]lock() around it.
+ */
+static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
+{
+ if (use_xsave())
+ os_xrstor(&init_fpstate, features_mask);
+ else if (use_fxsr())
+ fxrstor(&init_fpstate.regs.fxsave);
+ else
+ frstor(&init_fpstate.regs.fsave);
+
+ pkru_write_default();
+}
+
+/*
+ * Reset current->fpu memory state to the init values.
+ */
+static void fpu_reset_fpstate_regs(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+
+ fpregs_lock();
+ __fpu_invalidate_fpregs_state(fpu);
+ /*
+ * This does not change the actual hardware registers. It just
+ * resets the memory image and sets TIF_NEED_FPU_LOAD so a
+ * subsequent return to usermode will reload the registers from the
+ * task's memory image.
+ *
+ * Do not use fpstate_init() here. Just copy init_fpstate which has
+ * the correct content already except for PKRU.
+ *
+ * PKRU handling does not rely on the xstate when restoring for
+ * user space as PKRU is eagerly written in switch_to() and
+ * flush_thread().
+ */
+ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ fpregs_unlock();
+}
+
+/*
+ * Reset current's user FPU states to the init states. current's
+ * supervisor states, if any, are not modified by this function. The
+ * caller guarantees that the XSTATE header in memory is intact.
+ */
+void fpu__clear_user_states(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != x86_task_fpu(current));
+
+ fpregs_lock();
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpu_reset_fpstate_regs();
+ fpregs_unlock();
+ return;
+ }
+
+ /*
+ * Ensure that current's supervisor states are loaded into their
+ * corresponding registers.
+ */
+ if (xfeatures_mask_supervisor() &&
+ !fpregs_state_valid(fpu, smp_processor_id()))
+ os_xrstor_supervisor(fpu->fpstate);
+
+ /* Ensure XFD state is in sync before reloading XSTATE */
+ xfd_update_state(fpu->fpstate);
+
+ /* Reset user states in registers. */
+ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);
+
+ /*
+ * Now all FPU registers have their desired values. Inform the FPU
+ * state machine that current's FPU registers are in the hardware
+ * registers. The memory image does not need to be updated because
+ * any operation relying on it has to save the registers first when
+ * current's FPU is marked active.
+ */
+ fpregs_mark_activate();
+ fpregs_unlock();
+}
+
+void fpu_flush_thread(void)
+{
+ fpstate_reset(x86_task_fpu(current));
+ fpu_reset_fpstate_regs();
+}
+
+/*
+ * Load FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ fpregs_restore_userregs();
+}
+EXPORT_SYMBOL_FOR_KVM(switch_fpu_return);
+
+void fpregs_lock_and_load(void)
+{
+ /*
+ * fpregs_lock() only disables preemption (mostly). So modifying state
+ * in an interrupt could screw up an in-progress fpregs operation.
+ * Warn about it.
+ */
+ WARN_ON_ONCE(!irq_fpu_usable());
+ WARN_ON_ONCE(current->flags & PF_KTHREAD);
+
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+}
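+
+/*
+ * Usage sketch (editor's addition): a typical caller which modifies the
+ * current task's FPU registers in place does:
+ *
+ *     fpregs_lock_and_load();
+ *     ... modify FPU registers ...
+ *     fpregs_unlock();
+ *
+ * After fpregs_lock_and_load() the hardware registers are guaranteed to
+ * hold current's state, so in-register modifications are not lost.
+ */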
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * If current FPU state according to its tracking (loaded FPU context on this
+ * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is
+ * loaded on return to userland.
+ */
+void fpregs_assert_state_consistent(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ return;
+
+ WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
+}
+EXPORT_SYMBOL_FOR_KVM(fpregs_assert_state_consistent);
+#endif
+
+void fpregs_mark_activate(void)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+
+ fpregs_activate(fpu);
+ fpu->last_cpu = smp_processor_id();
+ clear_thread_flag(TIF_NEED_FPU_LOAD);
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+ int err;
+
+ if (trap_nr == X86_TRAP_MF) {
+ unsigned short cwd, swd;
+ /*
+ * (~cwd & swd) masks out exceptions that are masked in the control
+ * word. 0x3f covers the exception bits in these regs, 0x200 is the
+ * C1 bit needed in case of a stack fault, and 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+ * then we have a bad program that isn't synchronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception.
+ */
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ cwd = fpu->fpstate->regs.fxsave.cwd;
+ swd = fpu->fpstate->regs.fxsave.swd;
+ } else {
+ cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
+ swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
+ }
+
+ err = swd & ~cwd;
+ } else {
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
+ unsigned short mxcsr = MXCSR_DEFAULT;
+
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ mxcsr = fpu->fpstate->regs.fxsave.mxcsr;
+
+ err = ~(mxcsr >> 7) & mxcsr;
+ }
+
+ if (err & 0x001) { /* Invalid op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ return FPE_FLTINV;
+ } else if (err & 0x004) { /* Divide by Zero */
+ return FPE_FLTDIV;
+ } else if (err & 0x008) { /* Overflow */
+ return FPE_FLTOVF;
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ return FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
+ return FPE_FLTRES;
+ }
+
+ /*
+ * If we're using IRQ 13, or supposedly even some trap
+ * X86_TRAP_MF implementations, it's possible
+ * we get a spurious trap, which is not an error.
+ */
+ return 0;
+}
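+
+/*
+ * Worked example (editor's addition): assume MXCSR = 0x1e01, i.e. IM
+ * (bit 7) clear so invalid-operation is unmasked, and IE (bit 0) set.
+ * Then mxcsr >> 7 = 0x3c and err = ~0x3c & 0x1e01 keeps bit 0 set, so
+ * fpu__exception_code() returns FPE_FLTINV.
+ */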
+
+/*
+ * Initialize register state that may prevent the CPU from entering low-power idle.
+ * This function will be invoked from the cpuidle driver only when needed.
+ */
+noinstr void fpu_idle_fpregs(void)
+{
+ /* Note: AMX_TILE being enabled implies XGETBV1 support */
+ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) &&
+ (xfeatures_in_use() & XFEATURE_MASK_XTILE)) {
+ tile_release();
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ }
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
new file mode 100644
index 000000000000..ff988b9ea39f
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/api.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/init.h>
+
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+/*
+ * Initialize the registers found in all CPUs, CR0 and CR4:
+ */
+static void fpu__init_cpu_generic(void)
+{
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask)
+ cr4_set_bits(cr4_mask);
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);
+
+ /* Flush out any pending x87 state: */
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ fpstate_init_soft(&x86_task_fpu(current)->fpstate->regs.soft);
+ else
+#endif
+ asm volatile ("fninit");
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{
+ fpu__init_cpu_generic();
+ fpu__init_cpu_xstate();
+
+ /* Start allowing kernel-mode FPU: */
+ this_cpu_write(kernel_fpu_allowed, true);
+}
+
+static bool __init fpu__probe_without_cpuid(void)
+{
+ unsigned long cr0;
+ u16 fsw, fcw;
+
+ fsw = fcw = 0xffff;
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+ write_cr0(cr0);
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+ pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw);
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
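+
+/*
+ * Editor's note (hedged): after FNINIT a real x87 reports FSW == 0 and
+ * FCW == 0x037f, so the check above accepts exactly the architectural
+ * reset values: 0x103f selects the relevant FCW bits, of which the six
+ * exception-mask bits (0x003f) must read back as set.
+ */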
+
+static void __init fpu__init_system_early_generic(void)
+{
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+
+ if (!boot_cpu_has(X86_FEATURE_CPUID) &&
+ !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+ if (fpu__probe_without_cpuid())
+ setup_force_cpu_cap(X86_FEATURE_FPU);
+ else
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+ }
+
+#ifndef CONFIG_MATH_EMULATION
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) {
+ pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+ for (;;)
+ asm volatile("hlt");
+ }
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu;
+
+static void __init fpu__init_system_mxcsr(void)
+{
+ unsigned int mask = 0;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ /* Static because GCC does not get 16-byte stack alignment right: */
+ static struct fxregs_state fxregs __initdata;
+
+ asm volatile("fxsave %0" : "+m" (fxregs));
+
+ mask = fxregs.mxcsr_mask;
+
+ /*
+ * If zero then use the default features mask,
+ * which has all features set, except the
+ * denormals-are-zero feature bit:
+ */
+ if (mask == 0)
+ mask = 0x0000ffbf;
+ }
+ mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once per bootup FPU initialization sequences that will run on most x86 CPUs:
+ */
+static void __init fpu__init_system_generic(void)
+{
+ /*
+ * Set up the legacy init FPU context. Will be updated when the
+ * CPU supports XSAVE[S].
+ */
+ fpstate_init_user(&init_fpstate);
+
+ fpu__init_system_mxcsr();
+}
+
+/*
+ * Enforce that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * Align the computed size with the alignment of the TYPE,
+ * because that's how C aligns structs.
+ */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != \
+ ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE)))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct);
+
+ task_size += sizeof(struct fpu);
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(union fpregs_state);
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += fpu_kernel_cfg.default_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * '__fpstate' be at the end of it:
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate);
+
+ arch_task_struct_size = task_size;
+}
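+
+/*
+ * Resulting layout sketch (editor's addition, sizes illustrative):
+ *
+ *     [ struct task_struct          ]
+ *     [ struct fpu (header fields)  ]
+ *     [ fpstate register buffer ... ]  <- fpu_kernel_cfg.default_size
+ *
+ * x86_task_fpu() relies on this placement: the FPU block starts at
+ * (void *)task + sizeof(*task).
+ */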
+
+/*
+ * Set up the user and kernel xstate sizes based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+ unsigned int size;
+
+ /*
+ * Note that the size configuration might be overwritten later
+ * during fpu__init_system_xstate().
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ size = sizeof(struct swregs_state);
+ } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ size = sizeof(struct fxregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE;
+ } else {
+ size = sizeof(struct fregs_state);
+ fpu_user_cfg.legacy_features = XFEATURE_MASK_FP;
+ }
+
+ fpu_kernel_cfg.max_size = size;
+ fpu_kernel_cfg.default_size = size;
+ fpu_user_cfg.max_size = size;
+ fpu_user_cfg.default_size = size;
+ guest_default_cfg.size = size;
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(void)
+{
+ fpu__init_system_early_generic();
+
+ /*
+ * The FPU has to be operational for some of the
+ * later FPU init activities:
+ */
+ fpu__init_cpu();
+
+ fpu__init_system_generic();
+ fpu__init_system_xstate_size_legacy();
+ fpu__init_system_xstate(fpu_kernel_cfg.max_size);
+ fpu__init_task_struct_size();
+}
diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h
new file mode 100644
index 000000000000..975de070c9c9
--- /dev/null
+++ b/arch/x86/kernel/fpu/internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_INTERNAL_H
+#define __X86_KERNEL_FPU_INTERNAL_H
+
+extern struct fpstate init_fpstate;
+
+/* CPU feature check wrappers */
+static __always_inline __pure bool use_xsave(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_FXSR);
+}
+
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; })
+#endif
+
+/* Used in init.c */
+extern void fpstate_init_user(struct fpstate *fpstate);
+extern void fpstate_reset(struct fpu *fpu);
+
+#endif
diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h
new file mode 100644
index 000000000000..098f367bb8a7
--- /dev/null
+++ b/arch/x86/kernel/fpu/legacy.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_LEGACY_H
+#define __X86_KERNEL_FPU_LEGACY_H
+
+#include <asm/fpu/types.h>
+
+extern unsigned int mxcsr_feature_mask;
+
+static inline void ldmxcsr(u32 mxcsr)
+{
+ asm volatile("ldmxcsr %0" :: "m" (mxcsr));
+}
+
+/*
+ * Returns 0 on success or the trap number when the operation raises an
+ * exception.
+ */
+#define user_insn(insn, output, input...) \
+({ \
+ int err; \
+ \
+ might_fault(); \
+ \
+ asm volatile(ASM_STAC "\n" \
+ "1: " #insn "\n" \
+ "2: " ASM_CLAC "\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn_err(insn, output, input...) \
+({ \
+ int err; \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn(insn, output, input...) \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \
+ : output : input)
+
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+ else
+ return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline void fxrstor(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_safe(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else
+ return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void frstor(struct fregs_state *fx)
+{
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_safe(struct fregs_state *fx)
+{
+ return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
+{
+ return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
+#endif
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
new file mode 100644
index 000000000000..0986c2200adc
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,468 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Regset abstraction for the FPU registers: ptrace, core dumps, etc.
+ */
+#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/prctl.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active()
+ * routine, as "regset->n" for the xstate regset is set up based on the
+ * feature capabilities supported by XSAVE.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ return regset->n;
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ return regset->n;
+ else
+ return 0;
+}
+
+/*
+ * The regset get() functions are invoked from:
+ *
+ * - coredump to dump the current task's fpstate. If the current task
+ * owns the FPU then the memory state has to be synchronized and the
+ * FPU register state preserved. Otherwise fpstate is already in sync.
+ *
+ * - ptrace to dump fpstate of a stopped task, in which case the registers
+ * have already been saved to fpstate on context switch.
+ */
+static void sync_fpstate(struct fpu *fpu)
+{
+ if (fpu == x86_task_fpu(current))
+ fpu_sync_fpstate(fpu);
+}
+
+/*
+ * Invalidate cached FPU registers before modifying the stopped target
+ * task's fpstate.
+ *
+ * This forces the target task on resume to restore the FPU registers from
+ * modified fpstate. Otherwise the task might skip the restore and operate
+ * with the cached FPU registers which discards the modifications.
+ */
+static void fpu_force_restore(struct fpu *fpu)
+{
+ /*
+ * Only stopped child tasks can be used to modify the FPU
+ * state in the fpstate buffer:
+ */
+ WARN_ON_FPU(fpu == x86_task_fpu(current));
+
+ __fpu_invalidate_fpregs_state(fpu);
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ sync_fpstate(fpu);
+
+ if (!use_xsave()) {
+ return membuf_write(&to, &fpu->fpstate->regs.fxsave,
+ sizeof(fpu->fpstate->regs.fxsave));
+ }
+
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
+ return 0;
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct fxregs_state newstate;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(newstate))
+ return -EINVAL;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
+ if (ret)
+ return ret;
+
+ /* Do not allow an invalid MXCSR value. */
+ if (newstate.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+
+ fpu_force_restore(fpu);
+
+ /* Copy the state */
+ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate));
+
+ /* Clear xmm8..15 for 32-bit callers */
+ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16);
+ if (in_ia32_syscall())
+ memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16);
+
+ /* Mark FP and SSE as in use when XSAVE is enabled */
+ if (use_xsave())
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+
+ return 0;
+}
+
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ sync_fpstate(x86_task_fpu(target));
+
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE);
+ return 0;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct xregs_state *tmpbuf = NULL;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ /*
+ * A whole standard-format XSAVE buffer is needed:
+ */
+ if (pos != 0 || count != fpu_user_cfg.max_size)
+ return -EFAULT;
+
+ if (!kbuf) {
+ tmpbuf = vmalloc(count);
+ if (!tmpbuf)
+ return -ENOMEM;
+
+ if (copy_from_user(tmpbuf, ubuf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ fpu_force_restore(fpu);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);
+
+out:
+ vfree(tmpbuf);
+ return ret;
+}
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+int ssp_active(struct task_struct *target, const struct user_regset *regset)
+{
+ if (target->thread.features & ARCH_SHSTK_SHSTK)
+ return regset->n;
+
+ return 0;
+}
+
+int ssp_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct cet_user_state *cetregs;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
+ return -ENODEV;
+
+ sync_fpstate(fpu);
+ cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ return membuf_write(&to, (unsigned long *)&cetregs->user_ssp,
+ sizeof(cetregs->user_ssp));
+}
+
+int ssp_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct xregs_state *xsave = &fpu->fpstate->regs.xsave;
+ struct cet_user_state *cetregs;
+ unsigned long user_ssp;
+ int r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
+ return -ENODEV;
+
+ if (pos != 0 || count != sizeof(user_ssp))
+ return -EINVAL;
+
+ r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1);
+ if (r)
+ return r;
+
+ /*
+ * Some kernel instructions (IRET, etc) can cause exceptions in the case
+ * of disallowed CET register values. Just prevent invalid values.
+ */
+ if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8))
+ return -EINVAL;
+
+ fpu_force_restore(fpu);
+
+ cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ cetregs->user_ssp = user_ssp;
+ return 0;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd;
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
+ return tmp;
+}
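+
+/*
+ * Worked example (editor's addition): twd = 0xfff2 encodes reg0 as
+ * "special" (10b), reg1 as "valid" (00b) and regs 2-7 as "empty" (11b).
+ * ~twd = 0x000d; folding each pair yields 0x0005, and the shift/mask
+ * compaction produces 0x03: an FXSR tag byte with bits 0 and 1 set for
+ * the two non-empty registers.
+ */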
+
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
+#define FP_EXP_TAG_VALID 0
+#define FP_EXP_TAG_ZERO 1
+#define FP_EXP_TAG_SPECIAL 2
+#define FP_EXP_TAG_EMPTY 3
+
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{
+ struct _fpxreg *st;
+ u32 tos = (fxsave->swd >> 11) & 7;
+ u32 twd = (unsigned long) fxsave->twd;
+ u32 tag;
+ u32 ret = 0xffff0000u;
+ int i;
+
+ for (i = 0; i < 8; i++, twd >>= 1) {
+ if (twd & 0x1) {
+ st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+ switch (st->exponent & 0x7fff) {
+ case 0x7fff:
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ case 0x0000:
+ if (!st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3])
+ tag = FP_EXP_TAG_ZERO;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ default:
+ if (st->significand[3] & 0x8000)
+ tag = FP_EXP_TAG_VALID;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ }
+ } else {
+ tag = FP_EXP_TAG_EMPTY;
+ }
+ ret |= tag << (2 * i);
+ }
+ return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
+ struct task_struct *tsk,
+ struct fxregs_state *fxsave)
+{
+ struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ env->cwd = fxsave->cwd | 0xffff0000u;
+ env->swd = fxsave->swd | 0xffff0000u;
+ env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+ env->fip = fxsave->rip;
+ env->foo = fxsave->rdp;
+ /*
+ * These should actually be ds/cs at FPU exception time, but
+ * that information is not available in 64-bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
+ if (tsk == current) {
+ savesegment(ds, env->fos);
+ } else {
+ env->fos = tsk->thread.ds;
+ }
+ env->fos |= 0xffff0000;
+#else
+ env->fip = fxsave->fip;
+ env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
+ env->foo = fxsave->foo;
+ env->fos = fxsave->fos;
+#endif
+
+ for (i = 0; i < 8; ++i)
+ memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+ __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave);
+}
+
+void convert_to_fxsr(struct fxregs_state *fxsave,
+ const struct user_i387_ia32_struct *env)
+{
+ struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ fxsave->cwd = env->cwd;
+ fxsave->swd = env->swd;
+ fxsave->twd = twd_i387_to_fxsr(env->twd);
+ fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+ fxsave->rip = env->fip;
+ fxsave->rdp = env->foo;
+ /* cs and ds ignored */
+#else
+ fxsave->fip = env->fip;
+ fxsave->fcs = (env->fcs & 0xffff);
+ fxsave->foo = env->foo;
+ fxsave->fos = env->fos;
+#endif
+
+ for (i = 0; i < 8; ++i)
+ memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct user_i387_ia32_struct env;
+ struct fxregs_state fxsave, *fx;
+
+ sync_fpstate(fpu);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
+ return fpregs_soft_get(target, regset, to);
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR)) {
+ return membuf_write(&to, &fpu->fpstate->regs.fsave,
+ sizeof(struct fregs_state));
+ }
+
+ if (use_xsave()) {
+ struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) };
+
+ /* Handle init state optimized xstate correctly */
+ copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP);
+ fx = &fxsave;
+ } else {
+ fx = &fpu->fpstate->regs.fxsave;
+ }
+
+ __convert_from_fxsr(&env, target, fx);
+ return membuf_write(&to, &env, sizeof(env));
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = x86_task_fpu(target);
+ struct user_i387_ia32_struct env;
+ int ret;
+
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(struct user_i387_ia32_struct))
+ return -EINVAL;
+
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
+ return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ if (ret)
+ return ret;
+
+ fpu_force_restore(fpu);
+
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env);
+ else
+ memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env));
+
+ /*
+ * Update the header bit in the xsave header, indicating the
+ * presence of FP.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+
+ return 0;
+}
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
new file mode 100644
index 000000000000..c3ec2512f2bb
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+#include <linux/pagemap.h>
+
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+
+#include <asm/sigframe.h>
+#include <asm/trapnr.h>
+#include <asm/trace/fpu.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
+{
+ void __user *fpstate = fxbuf;
+ unsigned int magic2;
+
+ if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
+ return false;
+
+ /* Check for the first magic field */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1)
+ goto setfx;
+
+ /*
+ * Check for the presence of second magic word at the end of memory
+ * layout. This detects the case where the user just copied the legacy
+ * fpstate layout without copying the extended state information
+ * in the memory layout.
+ */
+ if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size)))
+ return false;
+
+ if (likely(magic2 == FP_XSTATE_MAGIC2))
+ return true;
+setfx:
+ trace_x86_fpu_xstate_check_failed(x86_task_fpu(current));
+
+ /* Set the parameters for fx only state */
+ fx_sw->magic1 = 0;
+ fx_sw->xstate_size = sizeof(struct fxregs_state);
+ fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
+ return true;
+}
+
+/*
+ * Signal frame handlers.
+ */
+static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
+{
+ if (use_fxsr()) {
+ struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave;
+ struct user_i387_ia32_struct env;
+ struct _fpstate_32 __user *fp = buf;
+
+ fpregs_lock();
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+ fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave);
+ fpregs_unlock();
+
+ convert_from_fxsr(&env, tsk);
+
+ if (__copy_to_user(buf, &env, sizeof(env)) ||
+ __put_user(xsave->i387.swd, &fp->status) ||
+ __put_user(X86_FXSR_MAGIC, &fp->magic))
+ return false;
+ } else {
+ struct fregs_state __user *fp = buf;
+ u32 swd;
+
+ if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed to by the fpstate pointer in the sigcontext.
+ * This is saved whenever the FP and extended state context is
+ * saved on the user stack during signal delivery.
+ */
+static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
+ struct fpstate *fpstate)
+{
+ sw_bytes->magic1 = FP_XSTATE_MAGIC1;
+ sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;
+ sw_bytes->xfeatures = fpstate->user_xfeatures;
+ sw_bytes->xstate_size = fpstate->user_size;
+
+ if (ia32_frame)
+ sw_bytes->extended_size += sizeof(struct fregs_state);
+}
+
+static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
+ struct fpstate *fpstate)
+{
+ struct xregs_state __user *x = buf;
+ struct _fpx_sw_bytes sw_bytes = {};
+ int err;
+
+ /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+ save_sw_bytes(&sw_bytes, ia32_frame, fpstate);
+ err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes));
+
+ if (!use_xsave())
+ return !err;
+
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 __user *)(buf + fpstate->user_size));
+
+ /*
+ * For legacy compatibility, we always set the FP/SSE bits in the bit
+ * vector while saving the state to the user context. This enables
+ * us to capture any changes (during sigreturn) to the FP/SSE bits
+ * made by legacy applications which don't touch xfeatures in the
+ * xsave header.
+ *
+ * xsave aware apps can change the xfeatures in the xsave
+ * header as well as change any contents in the memory layout.
+ * xrestore as part of sigreturn will capture all the changes.
+ */
+ err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE);
+
+ return !err;
+}
+
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ if (use_xsave())
+ return xsave_to_user_sigframe(buf, pkru);
+
+ if (use_fxsr())
+ return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
+ else
+ return fnsave_to_user_sigframe((struct fregs_state __user *) buf);
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ * state is copied.
+ * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ * buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * Save it directly to the user frame with disabled page fault handler. If
+ * that faults, try to clear the frame which handles the page fault.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ */
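+/*
+ * Frame layout sketch (editor's addition, 32-bit fxstate case):
+ *
+ *     buf    -> [ legacy fsave header            ]
+ *     buf_fx -> [ 64-byte aligned [f]xsave image ]
+ *               [ FP_XSTATE_MAGIC2 at user_size  ]
+ *
+ * The _fpx_sw_bytes descriptor lives in the sw_reserved area of the
+ * fxsave image itself. For 64-bit frames buf == buf_fx and the fsave
+ * header is absent.
+ */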
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
+{
+ struct task_struct *tsk = current;
+ struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate;
+ bool ia32_fxstate = (buf != buf_fx);
+ int ret;
+
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ struct user_i387_ia32_struct fp;
+
+ fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
+ .left = sizeof(fp)});
+ return !copy_to_user(buf, &fp, sizeof(fp));
+ }
+
+ if (!access_ok(buf, size))
+ return false;
+
+ if (use_xsave()) {
+ struct xregs_state __user *xbuf = buf_fx;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
+ return false;
+ }
+retry:
+ /*
+ * Load the FPU registers if they are not valid for the current task.
+ * With a valid FPU state we can attempt to save the state directly to
+ * userland's stack frame which will likely succeed. If it does not,
+ * resolve the fault in the user memory and try again.
+ */
+ fpregs_lock();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ pagefault_disable();
+ ret = copy_fpregs_to_sigframe(buf_fx, pkru);
+ pagefault_enable();
+ fpregs_unlock();
+
+ if (ret) {
+ if (!__clear_user(buf_fx, fpstate->user_size))
+ goto retry;
+ return false;
+ }
+
+ /* Save the fsave header for the 32-bit frames. */
+ if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
+ return false;
+
+ if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
+ return false;
+
+ return true;
+}
+
+static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
+ u64 xrestore, bool fx_only)
+{
+ if (use_xsave()) {
+ u64 init_bv = ufeatures & ~xrestore;
+ int ret;
+
+ if (likely(!fx_only))
+ ret = xrstor_from_user_sigframe(buf, xrestore);
+ else
+ ret = fxrstor_from_user_sigframe(buf);
+
+ if (!ret && unlikely(init_bv))
+ os_xrstor(&init_fpstate, init_bv);
+ return ret;
+ } else if (use_fxsr()) {
+ return fxrstor_from_user_sigframe(buf);
+ } else {
+ return frstor_from_user_sigframe(buf);
+ }
+}
+
+/*
+ * Attempt to restore the FPU registers directly from user memory.
+ * Pagefaults are handled and any errors returned are fatal.
+ */
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ int ret;
+
+ /* Restore enabled features only. */
+ xrestore &= fpu->fpstate->user_xfeatures;
+retry:
+ fpregs_lock();
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpu->fpstate);
+ pagefault_disable();
+ ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
+ xrestore, fx_only);
+ pagefault_enable();
+
+ if (unlikely(ret)) {
+ /*
+ * The above did an FPU restore operation, restricted to
+ * the user portion of the registers, and failed, but the
+ * microcode might have modified the FPU registers
+ * nevertheless.
+ *
+ * If the FPU registers do not belong to current, then
+ * invalidate the FPU register state otherwise the task
+ * might preempt current and return to user space with
+ * corrupted FPU registers.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ __cpu_invalidate_fpregs_state();
+ fpregs_unlock();
+
+ /* Try to handle #PF, but anything else is fatal. */
+ if (ret != X86_TRAP_PF)
+ return false;
+
+ if (!fault_in_readable(buf, fpu->fpstate->user_size))
+ goto retry;
+ return false;
+ }
+
+ /*
+ * Restore supervisor states: the previous context switch etc. has done
+ * XSAVES and saved the supervisor states in the kernel buffer from
+ * which they can be restored now.
+ *
+ * It would be optimal to handle this with a single XRSTORS, but
+ * this does not work because the rest of the FPU registers have
+ * been restored from a user buffer directly.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
+ os_xrstor_supervisor(fpu->fpstate);
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return true;
+}
+
+static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
+ bool ia32_fxstate)
+{
+ struct task_struct *tsk = current;
+ struct fpu *fpu = x86_task_fpu(tsk);
+ struct user_i387_ia32_struct env;
+ bool success, fx_only = false;
+ union fpregs_state *fpregs;
+ u64 user_xfeatures = 0;
+
+ if (use_xsave()) {
+ struct _fpx_sw_bytes fx_sw_user;
+
+ if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
+ return false;
+
+ fx_only = !fx_sw_user.magic1;
+ user_xfeatures = fx_sw_user.xfeatures;
+ } else {
+ user_xfeatures = XFEATURE_MASK_FPSSE;
+ }
+
+ if (likely(!ia32_fxstate)) {
+ /* Restore the FPU registers directly from user memory. */
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
+ }
+
+ /*
+ * Copy the legacy state because the FP portion of the FX frame has
+ * to be ignored for historical reasons. The legacy state is folded
+ * in once the larger state has been copied.
+ */
+ if (__copy_from_user(&env, buf, sizeof(env)))
+ return false;
+
+ /*
+ * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
+ * not modified on context switch and that the xstate is considered
+ * to be loaded again on return to userland (overriding last_cpu avoids
+ * the optimisation).
+ */
+ fpregs_lock();
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ /*
+ * If supervisor states are available then save the
+ * hardware state in current's fpstate so that the
+ * supervisor state is preserved. Save the full state for
+ * simplicity. There is no point in optimizing this by only
+ * saving the supervisor states and then shuffle them to
+ * the right place in memory. It's ia32 mode. Shrug.
+ */
+ if (xfeatures_mask_supervisor())
+ os_xsave(fpu->fpstate);
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ }
+ __fpu_invalidate_fpregs_state(fpu);
+ __cpu_invalidate_fpregs_state();
+ fpregs_unlock();
+
+ fpregs = &fpu->fpstate->regs;
+ if (use_xsave() && !fx_only) {
+ if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
+ return false;
+ } else {
+ if (__copy_from_user(&fpregs->fxsave, buf_fx,
+ sizeof(fpregs->fxsave)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Reject invalid MXCSR values. */
+ if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
+ return false;
+ } else {
+ /* Mask invalid bits out for historical reasons (broken hardware). */
+ fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
+ }
+
+ /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
+ if (use_xsave())
+ fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ }
+
+ /* Fold the legacy FP storage */
+ convert_to_fxsr(&fpregs->fxsave, &env);
+
+ fpregs_lock();
+ if (use_xsave()) {
+ /*
+ * Remove all UABI feature bits not set in user_xfeatures
+ * from the memory xstate header which makes the full
+ * restore below bring them into init state. This works for
+ * fx_only mode as well because that has only FP and SSE
+ * set in user_xfeatures.
+ *
+ * Preserve supervisor states!
+ */
+ u64 mask = user_xfeatures | xfeatures_mask_supervisor();
+
+ fpregs->xsave.header.xfeatures &= mask;
+ success = !os_xrstor_safe(fpu->fpstate,
+ fpu_kernel_cfg.max_features);
+ } else {
+ success = !fxrstor_safe(&fpregs->fxsave);
+ }
+
+ if (likely(success))
+ fpregs_mark_activate();
+
+ fpregs_unlock();
+ return success;
+}
+
+static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
+{
+ unsigned int size = fpstate->user_size;
+
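+ /*
+ * XSAVE-based frames carry a trailing FP_XSTATE_MAGIC2 marker so
+ * that sigreturn can recognize a complete extended frame; legacy
+ * fsave/fxsave frames do not.
+ */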
+ return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size;
+}
+
+/*
+ * Restore FPU state from a sigframe:
+ */
+bool fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ void __user *buf_fx = buf;
+ bool ia32_fxstate = false;
+ bool success = false;
+ unsigned int size;
+
+ if (unlikely(!buf)) {
+ fpu__clear_user_states(fpu);
+ return true;
+ }
+
+ size = xstate_sigframe_size(fpu->fpstate);
+
+ ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ /*
+ * Only FXSR enabled systems need the FX state quirk.
+ * FRSTOR does not need it and can use the fast path.
+ */
+ if (ia32_frame && use_fxsr()) {
+ buf_fx = buf + sizeof(struct fregs_state);
+ size += sizeof(struct fregs_state);
+ ia32_fxstate = true;
+ }
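+ /* For the ia32 quirk, 'buf' now covers the fsave header plus the fx area at 'buf_fx'. */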
+
+ if (!access_ok(buf, size))
+ goto out;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
+ success = !fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
+ } else {
+ success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
+ }
+
+out:
+ if (unlikely(!success))
+ fpu__clear_user_states(fpu);
+ return success;
+}
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+ unsigned long *buf_fx, unsigned long *size)
+{
+ unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate);
+
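+ /*
+ * XSAVE areas must be 64-byte aligned; rounding the stack pointer
+ * down to 64 below also satisfies FXSAVE's 16-byte requirement.
+ */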
+ *buf_fx = sp = round_down(sp - frame_size, 64);
+ if (ia32_frame && use_fxsr()) {
+ frame_size += sizeof(struct fregs_state);
+ sp -= sizeof(struct fregs_state);
+ }
+
+ *size = frame_size;
+
+ return sp;
+}
+
+unsigned long __init fpu__get_fpstate_size(void)
+{
+ unsigned long ret = fpu_user_cfg.max_size;
+
+ if (use_xsave())
+ ret += FP_XSTATE_MAGIC2_SIZE;
+
+ /*
+ * This space is needed on (most) 32-bit kernels, or when a 32-bit
+ * app is running on a 64-bit kernel. To keep things simple, just
+ * assume the worst case and always include space for 'fregs_state',
+ * even for 64-bit apps on 64-bit kernels. This wastes a bit of
+ * space, but keeps the code simple.
+ */
+ if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+ ret += sizeof(struct fregs_state);
+
+ return ret;
+}
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
new file mode 100644
index 000000000000..48113c5193aa
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,2012 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/bitops.h>
+#include <linux/compat.h>
+#include <linux/cpu.h>
+#include <linux/mman.h>
+#include <linux/kvm_types.h>
+#include <linux/nospec.h>
+#include <linux/pkeys.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/coredump.h>
+#include <linux/sort.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xcr.h>
+
+#include <asm/cpuid/api.h>
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/prctl.h>
+#include <asm/elf.h>
+
+#include <uapi/asm/elf.h>
+
+#include "context.h"
+#include "internal.h"
+#include "legacy.h"
+#include "xstate.h"
+
+#define for_each_extended_xfeature(bit, mask) \
+ (bit) = FIRST_EXTENDED_XFEATURE; \
+ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
+
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ */
+static const char *xfeature_names[] =
+{
+ "x87 floating point registers",
+ "SSE registers",
+ "AVX registers",
+ "MPX bounds registers",
+ "MPX CSR",
+ "AVX-512 opmask",
+ "AVX-512 Hi256",
+ "AVX-512 ZMM_Hi256",
+ "Processor Trace (unused)",
+ "Protection Keys User registers",
+ "PASID state",
+ "Control-flow User registers",
+ "Control-flow Kernel registers (KVM only)",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "AMX Tile config",
+ "AMX Tile data",
+ "APX registers",
+ "unknown xstate feature",
+};
+
+static unsigned short xsave_cpuid_features[] __initdata = {
+ [XFEATURE_FP] = X86_FEATURE_FPU,
+ [XFEATURE_SSE] = X86_FEATURE_XMM,
+ [XFEATURE_YMM] = X86_FEATURE_AVX,
+ [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
+ [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
+ [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
+ [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
+ [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
+ [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
+ [XFEATURE_PKRU] = X86_FEATURE_OSPKE,
+ [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
+ [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK,
+ [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
+ [XFEATURE_APX] = X86_FEATURE_APX,
+};
+
+static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
+
+/*
+ * Ordering of xstate components in uncompacted format: The xfeature
+ * number does not necessarily indicate its position in the XSAVE buffer.
+ * This array defines the traversal order of xstate features.
+ */
+static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+
+static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
+{
+ for (; xfeature_uncompact_order[i] != -1; i++) {
+ if (mask & BIT_ULL(xfeature_uncompact_order[i]))
+ break;
+ }
+
+ return i;
+}
+
+/* Iterate xstate features in uncompacted order: */
+#define for_each_extended_xfeature_in_order(i, mask) \
+ for (i = 0; \
+ i = next_xfeature_order(i, mask), \
+ xfeature_uncompact_order[i] != -1; \
+ i++)
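+/*
+ * Illustrative: with mask = YMM|PKRU this visits both features in
+ * ascending user-buffer offset order rather than in bit order.
+ */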
+
+#define XSTATE_FLAG_SUPERVISOR BIT(0)
+#define XSTATE_FLAG_ALIGNED64 BIT(1)
+
+/*
+ * Return whether the system supports a given xfeature.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+ u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
+
+ if (unlikely(feature_name)) {
+ long xfeature_idx, max_idx;
+ u64 xfeatures_print;
+ /*
+ * We use fls64() here to be able to print the most advanced
+ * feature that was requested but is missing. So if a driver
+ * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
+ * missing AVX feature - this is the most informative message
+ * to users:
+ */
+ if (xfeatures_missing)
+ xfeatures_print = xfeatures_missing;
+ else
+ xfeatures_print = xfeatures_needed;
+
+ xfeature_idx = fls64(xfeatures_print)-1;
+ max_idx = ARRAY_SIZE(xfeature_names)-1;
+ xfeature_idx = min(xfeature_idx, max_idx);
+
+ *feature_name = xfeature_names[xfeature_idx];
+ }
+
+ if (xfeatures_missing)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+
+static bool xfeature_is_aligned64(int xfeature_nr)
+{
+ return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
+}
+
+static bool xfeature_is_supervisor(int xfeature_nr)
+{
+ return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
+}
+
+static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
+{
+ unsigned int offs, i;
+
+ /*
+ * Non-compacted format and legacy features use the cached fixed
+ * offsets.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
+ xfeature <= XFEATURE_SSE)
+ return xstate_offsets[xfeature];
+
+ /*
+ * Compacted format offsets depend on the actual content of the
+ * compacted xsave area which is determined by the xcomp_bv header
+ * field.
+ */
+ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for_each_extended_xfeature(i, xcomp_bv) {
+ if (xfeature_is_aligned64(i))
+ offs = ALIGN(offs, 64);
+ if (i == xfeature)
+ break;
+ offs += xstate_sizes[i];
+ }
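+ /*
+ * 'offs' has now accumulated the sizes (plus any 64-byte alignment
+ * gaps) of all enabled components preceding 'xfeature'.
+ */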
+ return offs;
+}
+
+/*
+ * Enable the extended processor state save/restore feature.
+ * Called once per CPU onlining.
+ */
+void fpu__init_cpu_xstate(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
+ return;
+
+ cr4_set_bits(X86_CR4_OSXSAVE);
+
+ /*
+ * Must happen after CR4 setup and before xsetbv() to allow KVM
+ * lazy passthrough. Write independent of the dynamic state static
+ * key as that does not work on the boot CPU. This also ensures
+ * that any stale state is wiped out from XFD. Reset the per CPU
+ * xfd cache too.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XFD))
+ xfd_set_state(init_fpstate.xfd);
+
+ /*
+ * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
+ * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
+ * states can be set here.
+ */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
+
+ /*
+ * MSR_IA32_XSS sets supervisor states managed by XSAVES.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_independent());
+ }
+}
+
+static bool xfeature_enabled(enum xfeature xfeature)
+{
+ return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
+}
+
+static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
+{
+ return xstate_offsets[*(unsigned int *)xfeature1] -
+ xstate_offsets[*(unsigned int *)xfeature2];
+}
+
+/*
+ * Record the offsets and sizes of various xstates contained
+ * in the XSAVE state memory layout. Also, create an ordered
+ * list of xfeatures for handling out-of-order offsets.
+ */
+static void __init setup_xstate_cache(void)
+{
+ u32 eax, ebx, ecx, edx, xfeature, i = 0;
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_offsets[XFEATURE_FP] = 0;
+ xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
+ xmm_space);
+
+ xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
+ xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
+ xmm_space);
+
+ for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
+ cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);
+
+ xstate_sizes[xfeature] = eax;
+ xstate_flags[xfeature] = ecx;
+
+ /*
+ * If an xfeature is a supervisor state, the offset in EBX is
+ * invalid; leave it at -1.
+ */
+ if (xfeature_is_supervisor(xfeature))
+ continue;
+
+ xstate_offsets[xfeature] = ebx;
+
+ /* Populate the list of xfeatures before sorting */
+ xfeature_uncompact_order[i++] = xfeature;
+ }
+
+ /*
+ * Sort xfeatures by their offsets to support out-of-order
+ * offsets in the uncompacted format.
+ */
+ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL);
+}
+
+/*
+ * Print out all the supported xstate features:
+ */
+static void __init print_xstate_features(void)
+{
+ int i;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = BIT_ULL(i);
+ const char *name;
+
+ if (cpu_has_xfeatures(mask, &name))
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
+ }
+}
+
+/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do { \
+ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \
+ WARN_ON(nr >= XFEATURE_MAX); \
+} while (0)
+
+/*
+ * Print out xstate component offsets and sizes
+ */
+static void __init print_xstate_offset_size(void)
+{
+ int i;
+
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
+ i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
+ i, xstate_sizes[i]);
+ }
+}
+
+/*
+ * This function is called only during boot, when x86 capabilities are not
+ * yet set up and alternatives cannot be used.
+ */
+static __init void os_xrstor_booting(struct xregs_state *xstate)
+{
+ u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu_buf() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER | \
+ XFEATURE_MASK_CET_KERNEL | \
+ XFEATURE_MASK_XTILE | \
+ XFEATURE_MASK_APX)
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_init_fpu_buf(void)
+{
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return;
+
+ print_xstate_features();
+
+ xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
+
+ /*
+ * Init all the features state with header.xfeatures being 0x0
+ */
+ os_xrstor_booting(&init_fpstate.regs.xsave);
+
+ /*
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVEC/S is available because XSAVEC/S uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires ensuring that their init
+ * state is all zeroes, or otherwise adding the necessary handling
+ * here.
+ */
+ fxsave(&init_fpstate.regs.fxsave);
+}
+
+int xfeature_size(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return eax;
+}
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+static int validate_user_xstate_header(const struct xstate_header *hdr,
+ struct fpstate *fpstate)
+{
+ /* No unknown or supervisor features may be set */
+ if (hdr->xfeatures & ~fpstate->user_xfeatures)
+ return -EINVAL;
+
+ /* Userspace must use the uncompacted format */
+ if (hdr->xcomp_bv)
+ return -EINVAL;
+
+ /*
+ * If 'reserved' is shrunk to add a new field, make sure to validate
+ * that new field here!
+ */
+ BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
+
+ /* No reserved bits may be set */
+ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __init __xstate_dump_leaves(void)
+{
+ int i;
+ u32 eax, ebx, ecx, edx;
+ static int should_dump = 1;
+
+ if (!should_dump)
+ return;
+ should_dump = 0;
+ /*
+ * Dump out a few leaves past the ones that we support
+ * just in case there are some goodies up there
+ */
+ for (i = 0; i < XFEATURE_MAX + 10; i++) {
+ cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
+ pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+ CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
+ }
+}
+
+#define XSTATE_WARN_ON(x, fmt, ...) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+#define XCHECK_SZ(sz, nr, __struct) ({ \
+ if (WARN_ONCE(sz != sizeof(__struct), \
+ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \
+ xfeature_names[nr], sizeof(__struct), sz)) { \
+ __xstate_dump_leaves(); \
+ } \
+ true; \
+})
+
+
+/**
+ * check_xtile_data_against_struct - Check tile data state size.
+ *
+ * Calculate the state size by multiplying the single tile size, which is
+ * recorded in a C struct, by the number of tiles that the CPU reports.
+ * Compare the provided size with the calculation.
+ *
+ * @size: The tile data state size
+ *
+ * Returns: 0 on success, -EINVAL on mismatch.
+ */
+static int __init check_xtile_data_against_struct(int size)
+{
+ u32 max_palid, palid, state_size;
+ u32 eax, ebx, ecx, edx;
+ u16 max_tile;
+
+ /*
+ * Check the maximum palette id:
+ * eax: the highest numbered palette subleaf.
+ */
+ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
+
+ /*
+ * Cross-check each tile size and find the maximum number of
+ * supported tiles.
+ */
+ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
+ u16 tile_size, max;
+
+ /*
+ * Check the tile size info:
+ * eax[31:16]: bytes per tile
+ * ebx[31:16]: the max names (or max number of tiles)
+ */
+ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &ecx, &edx);
+ tile_size = eax >> 16;
+ max = ebx >> 16;
+
+ if (tile_size != sizeof(struct xtile_data)) {
+ pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA),
+ sizeof(struct xtile_data), tile_size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+
+ if (max > max_tile)
+ max_tile = max;
+ }
+
+ state_size = sizeof(struct xtile_data) * max_tile;
+ if (size != state_size) {
+ pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
+ __stringify(XFEATURE_XTILE_DATA), state_size, size);
+ __xstate_dump_leaves();
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * We have a C struct for each 'xstate'. We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ */
+static bool __init check_xstate_against_struct(int nr)
+{
+ /*
+ * Ask the CPU for the size of the state.
+ */
+ int sz = xfeature_size(nr);
+
+ /*
+ * Match each CPU state with the corresponding software
+ * structure.
+ */
+ switch (nr) {
+ case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
+ case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
+ case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
+ case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
+ case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
+ case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
+ case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
+ case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
+ case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
+ case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
+ case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state);
+ case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
+ default:
+ XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
+ return false;
+ }
+
+ return true;
+}
+
+static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
+{
+ unsigned int topmost = fls64(xfeatures) - 1;
+ unsigned int offset, i;
+
+ if (topmost <= XFEATURE_SSE)
+ return sizeof(struct xregs_state);
+
+ if (compacted) {
+ offset = xfeature_get_offset(xfeatures, topmost);
+ } else {
+ /* Walk through the xfeature order to pick the last */
+ for_each_extended_xfeature_in_order(i, xfeatures)
+ topmost = xfeature_uncompact_order[i];
+ offset = xstate_offsets[topmost];
+ }
+
+ return offset + xstate_sizes[topmost];
+}
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be. We are recalculating
+ * it to be safe.
+ *
+ * Independent XSAVE features allocate their own buffers and are not
+ * covered by these checks. Only the size of the buffer for task->fpu
+ * is checked here.
+ */
+static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
+{
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
+ if (!check_xstate_against_struct(i))
+ return false;
+ /*
+ * Supervisor state components can be managed only by
+ * XSAVES.
+ */
+ if (!xsaves && xfeature_is_supervisor(i)) {
+ XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
+ return false;
+ }
+ }
+ size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
+ XSTATE_WARN_ON(size != kernel_size,
+ "size %u != kernel_size %u\n", size, kernel_size);
+ return size == kernel_size;
+}
+
+/*
+ * Get total size of enabled xstates in XCR0 | IA32_XSS.
+ *
+ * Note the SDM's wording here. "sub-function 0" only enumerates
+ * the size of the *user* states. If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * This also takes compaction into account. So this works for
+ * XSAVEC as well.
+ */
+static unsigned int __init get_compacted_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 1:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVES instruction for an XSAVE area
+ * containing all the state components
+ * corresponding to bits currently set in
+ * XCR0 | IA32_XSS.
+ *
+ * When XSAVES is not available but XSAVEC is (virt), then there
+ * are no supervisor states, but XSAVEC still uses compacted
+ * format.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+/*
+ * Get the total size of the enabled xstates without the independent supervisor
+ * features.
+ */
+static unsigned int __init get_xsave_compacted_size(void)
+{
+ u64 mask = xfeatures_mask_independent();
+ unsigned int size;
+
+ if (!mask)
+ return get_compacted_size();
+
+ /* Disable independent features. */
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());
+
+ /*
+ * Ask the hardware what size is required of the buffer.
+ * This is the size required for the task->fpu buffer.
+ */
+ size = get_compacted_size();
+
+ /* Re-enable independent features so XSAVES will work on them again. */
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
+
+ return size;
+}
+
+static unsigned int __init get_xsave_size_user(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 0:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVE instruction for an XSAVE area
+ * containing all the *user* state components
+ * corresponding to bits currently set in XCR0.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+static int __init init_xstate_size(void)
+{
+ /* Recompute the context size for enabled features: */
+ unsigned int user_size, kernel_size, kernel_default_size;
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+
+ /* Uncompacted user space size */
+ user_size = get_xsave_size_user();
+
+ /*
+ * XSAVES kernel size includes supervisor states and uses compacted
+ * format. XSAVEC uses compacted format, but does not save
+ * supervisor states.
+ *
+ * XSAVE[OPT] do not support supervisor states so kernel and user
+ * size is identical.
+ */
+ if (compacted)
+ kernel_size = get_xsave_compacted_size();
+ else
+ kernel_size = user_size;
+
+ kernel_default_size =
+ xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
+
+ if (!paranoid_xstate_size_valid(kernel_size))
+ return -EINVAL;
+
+ fpu_kernel_cfg.max_size = kernel_size;
+ fpu_user_cfg.max_size = user_size;
+
+ fpu_kernel_cfg.default_size = kernel_default_size;
+ fpu_user_cfg.default_size =
+ xstate_calculate_size(fpu_user_cfg.default_features, false);
+
+ guest_default_cfg.size =
+ xstate_calculate_size(guest_default_cfg.features, compacted);
+
+ return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we cannot use it. Disable it.
+ */
+static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
+{
+ pr_info("x86/fpu: XSAVE disabled\n");
+
+ fpu_kernel_cfg.max_features = 0;
+ cr4_clear_bits(X86_CR4_OSXSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ /* Restore the legacy size.*/
+ fpu_kernel_cfg.max_size = legacy_size;
+ fpu_kernel_cfg.default_size = legacy_size;
+ fpu_user_cfg.max_size = legacy_size;
+ fpu_user_cfg.default_size = legacy_size;
+ guest_default_cfg.size = legacy_size;
+
+ /*
+ * Prevent enabling the static branch which enables writes to the
+ * XFD MSR.
+ */
+ init_fpstate.xfd = 0;
+
+ fpstate_reset(x86_task_fpu(current));
+}
+
+static u64 __init host_default_mask(void)
+{
+ /*
+ * Exclude dynamic features (require userspace opt-in) and features
+ * that are supported only for KVM guests.
+ */
+ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
+}
+
+static u64 __init guest_default_mask(void)
+{
+ /*
+ * Exclude dynamic features, which require userspace opt-in even
+ * for KVM guests.
+ */
+ return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ * Called once per system bootup.
+ */
+void __init fpu__init_system_xstate(unsigned int legacy_size)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 xfeatures;
+ int err;
+ int i;
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ pr_info("x86/fpu: No FPU detected\n");
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ pr_info("x86/fpu: x87 FPU will use %s\n",
+ boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
+ return;
+ }
+
+ /*
+ * Find user xstates supported by the processor.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
+ fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
+
+ /*
+ * Find supervisor xstates supported by the processor.
+ */
+ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
+ fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
+
+ if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ /*
+ * This indicates that something really unexpected happened
+ * with the enumeration. Disable XSAVE and try to continue
+ * booting without it. This is too early to BUG().
+ */
+ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
+ fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
+ fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
+ /*
+ * This is a problematic CPU configuration where two
+ * conflicting state components are both enumerated.
+ */
+ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
+ fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ unsigned short cid = xsave_cpuid_features[i];
+
+ /* Careful: X86_FEATURE_FPU is 0! */
+ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
+ fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
+ }
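+ /*
+ * Illustrative: if CPUID lacks X86_FEATURE_AVX512F, the OPMASK,
+ * ZMM_Hi256 and Hi16_ZMM xstate bits are all cleared here.
+ */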
+
+ if (!cpu_feature_enabled(X86_FEATURE_XFD))
+ fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
+
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+ else
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+
+ fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
+ fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+
+ /*
+ * Now, given maximum feature set, determine default values by
+ * applying default masks.
+ */
+ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
+ fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask();
+ guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask();
+
+ /* Store it for paranoia check at the end */
+ xfeatures = fpu_kernel_cfg.max_features;
+
+ /*
+ * Initialize the default XFD state in init_fpstate and enable the
+ * dynamic sizing mechanism if dynamic states are available. The
+ * static key cannot be enabled here because this runs before
+ * jump_label_init(). This is delayed to an initcall.
+ */
+ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
+
+ /* Set up compaction feature bit */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
+ cpu_feature_enabled(X86_FEATURE_XSAVES))
+ setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
+
+ /* Enable xstate instructions to be able to continue with initialization: */
+ fpu__init_cpu_xstate();
+
+ /* Cache size, offset and flags for initialization */
+ setup_xstate_cache();
+
+ err = init_xstate_size();
+ if (err)
+ goto out_disable;
+
+ /*
+ * Update info used for ptrace frames; use standard-format size and no
+ * supervisor xstates:
+ */
+ update_regset_xstate_info(fpu_user_cfg.max_size,
+ fpu_user_cfg.max_features);
+
+ /*
+ * init_fpstate excludes dynamic states as they are large but init
+ * state is zero.
+ */
+ init_fpstate.size = fpu_kernel_cfg.default_size;
+ init_fpstate.xfeatures = fpu_kernel_cfg.default_features;
+
+ if (init_fpstate.size > sizeof(init_fpstate.regs)) {
+ pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
+ sizeof(init_fpstate.regs), init_fpstate.size);
+ goto out_disable;
+ }
+
+ setup_init_fpu_buf();
+
+ /*
+ * Paranoia check whether something in the setup modified the
+ * xfeatures mask.
+ */
+ if (xfeatures != fpu_kernel_cfg.max_features) {
+ pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
+ xfeatures, fpu_kernel_cfg.max_features);
+ goto out_disable;
+ }
+
+ /*
+ * CPU capabilities initialization runs before FPU init. So
+ * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
+ * functional, set the feature bit so depending code works.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
+
+ print_xstate_offset_size();
+ pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
+ fpu_kernel_cfg.max_features,
+ fpu_kernel_cfg.max_size,
+ boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
+ return;
+
+out_disable:
+ /* something went wrong, try to boot without any XSAVE support */
+ fpu__init_disable_system_xstate(legacy_size);
+}
+
+/*
+ * Restore minimal FPU state after suspend:
+ */
+void fpu__resume_cpu(void)
+{
+ /*
+ * Restore XCR0 on xsave capable CPUs:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
+
+ /*
+ * Restore IA32_XSS. The same CPUID bit enumerates support
+ * of XSAVES and MSR_IA32_XSS.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
+ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_independent());
+ }
+
+ if (fpu_state_size_dynamic())
+ wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
+}
+
+/*
+ * Given an xstate feature nr, calculate where in the xsave
+ * buffer the state is. Callers should ensure that the buffer
+ * is valid.
+ */
+static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
+{
+ u64 xcomp_bv = xsave->header.xcomp_bv;
+
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
+ if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
+ return NULL;
+ }
+
+ return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
+}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Note that if there is no data for the field in the xsave buffer
+ * this will return NULL.
+ *
+ * Inputs:
+ * xstate: the thread's storage area for all FPU data
+ * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
+ * XFEATURE_SSE, etc...)
+ * Output:
+ * address of the state in the xsave area, or NULL if the
+ * field is not present in the xsave buffer.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
+{
+ /*
+ * Do we even *have* xsave state?
+ */
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return NULL;
+
+ /*
+ * We should not ever be requesting features that we
+ * have not enabled.
+ */
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ /*
+ * This assumes that the last 'xsave*' instruction
+ * requested that 'xfeature_nr' be saved.
+ * If it did not, we might be seeing an old value
+ * of the field in the buffer.
+ *
+ * This can happen because the last 'xsave' did not
+ * request that this feature be saved (unlikely)
+ * or because the "init optimization" caused it
+ * to not be saved.
+ */
+ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
+ return NULL;
+
+ return __raw_xsave_addr(xsave, xfeature_nr);
+}
+EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
+
+/*
+ * Given an xstate feature nr, calculate where in the xsave buffer the state is.
+ * The xsave buffer should be in standard format, not compacted (e.g. user mode
+ * signal frames).
+ */
+void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
+{
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ return (void __user *)xsave + xstate_offsets[xfeature_nr];
+}
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
+/*
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ u32 old_pkru, new_pkru_bits = 0;
+ int pkey_shift;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return -EINVAL;
+
+ /*
+ * This code should only be called with valid 'pkey'
+ * values originating from in-kernel users. Complain
+ * if a bad value is observed.
+ */
+ if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+ return -EINVAL;
+
+ /* Set the bits we need in PKRU: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey: */
+ pkey_shift = pkey * PKRU_BITS_PER_PKEY;
+ new_pkru_bits <<= pkey_shift;
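+ /*
+ * Example: pkey = 1 with PKEY_DISABLE_WRITE yields
+ * PKRU_WD_BIT << 2, i.e. bit 3 of PKRU.
+ */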
+
+ /* Get old PKRU and mask off any old bits in place: */
+ old_pkru = read_pkru();
+ old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+
+ /* Write old part along with new part: */
+ write_pkru(old_pkru | new_pkru_bits);
+
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+
+static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
+ void *init_xstate, unsigned int size)
+{
+ membuf_write(to, from_xstate ? xstate : init_xstate, size);
+}
+
+/**
+ * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @fpstate: The fpstate buffer from which to copy
+ * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
+ * @pkru_val: The PKRU value to store in the PKRU component
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @copy_mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
+ */
+void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode)
+{
+ const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ struct xregs_state *xinit = &init_fpstate.regs.xsave;
+ struct xregs_state *xsave = &fpstate->regs.xsave;
+ unsigned int zerofrom, i, xfeature;
+ struct xstate_header header;
+ u64 mask;
+
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+
+ /* Mask out the feature bits depending on copy mode */
+ switch (copy_mode) {
+ case XSTATE_COPY_FP:
+ header.xfeatures &= XFEATURE_MASK_FP;
+ break;
+
+ case XSTATE_COPY_FX:
+ header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
+ break;
+
+ case XSTATE_COPY_XSAVE:
+ header.xfeatures &= fpstate->user_xfeatures & xfeatures;
+ break;
+ }
+
+ /* Copy FP state up to MXCSR */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
+ &xinit->i387, off_mxcsr);
+
+ /* Copy MXCSR when SSE or YMM are set in the feature mask */
+ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
+ &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
+ MXCSR_AND_FLAGS_SIZE);
+
+ /* Copy the remaining FP state */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP,
+ &to, &xsave->i387.st_space, &xinit->i387.st_space,
+ sizeof(xsave->i387.st_space));
+
+ /* Copy the SSE state - shared with YMM, but independently managed */
+ copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
+ &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
+ sizeof(xsave->i387.xmm_space));
+
+ if (copy_mode != XSTATE_COPY_XSAVE)
+ goto out;
+
+ /* Zero the padding area */
+ membuf_zero(&to, sizeof(xsave->i387.padding));
+
+ /* Copy xsave->i387.sw_reserved */
+ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
+
+ /* Copy the user space relevant state of @xsave->header */
+ membuf_write(&to, &header, sizeof(header));
+
+ zerofrom = offsetof(struct xregs_state, extended_state_area);
+
+ /*
+ * This 'mask' indicates which states to copy from fpstate.
+ * Those extended states that are not present in fpstate are
+ * either disabled or initialized:
+ *
+ * In non-compacted format, disabled features still occupy
+ * state space but there is no state to copy from in the
+ * compacted init_fpstate. The gap tracking will zero these
+ * states.
+ *
+ * The extended features have an all zeroes init state. Thus,
+ * remove them from 'mask' to zero those features in the user
+ * buffer instead of retrieving them from init_fpstate.
+ */
+ mask = header.xfeatures;
+
+ for_each_extended_xfeature_in_order(i, mask) {
+ xfeature = xfeature_uncompact_order[i];
+ /*
+ * If there was a feature or alignment gap, zero the space
+ * in the destination buffer.
+ */
+ if (zerofrom < xstate_offsets[xfeature])
+ membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);
+
+ if (xfeature == XFEATURE_PKRU) {
+ struct pkru_state pkru = {0};
+ /*
+ * PKRU is not necessarily up to date in the
+ * XSAVE buffer. Use the provided value.
+ */
+ pkru.pkru = pkru_val;
+ membuf_write(&to, &pkru, sizeof(pkru));
+ } else {
+ membuf_write(&to,
+ __raw_xsave_addr(xsave, xfeature),
+ xstate_sizes[xfeature]);
+ }
+ /*
+ * Keep track of the last copied state in the non-compacted
+ * target buffer for gap zeroing.
+ */
+ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
+ }
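+ /*
+ * Anything between the last copied component and the end of the
+ * buffer is zeroed below via the remaining 'to.left'.
+ */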
+
+out:
+ if (to.left)
+ membuf_zero(&to, to.left);
+}
+
+/**
+ * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @tsk: The task from which to copy the saved xstate
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @copy_mode. UABI XSTATE is always uncompacted!
+ *
+ * It supports partial copy but @to.pos always starts from zero.
+ */
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode copy_mode)
+{
+ __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
+ x86_task_fpu(tsk)->fpstate->user_xfeatures,
+ tsk->thread.pkru, copy_mode);
+}
+
+static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
+ const void *kbuf, const void __user *ubuf)
+{
+ if (kbuf) {
+ memcpy(dst, kbuf + offset, size);
+ } else {
+ if (copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+
+/**
+ * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
+ * @fpstate: The fpstate buffer to copy to
+ * @kbuf: The UABI format buffer, if it comes from the kernel
+ * @ubuf: The UABI format buffer, if it comes from userspace
+ * @pkru: The location to write the PKRU value to
+ *
+ * Converts from the UABI format into the kernel internal hardware
+ * dependent format.
+ *
+ * This function ultimately has three different callers with distinct PKRU
+ * behavior.
+ * 1. When called from sigreturn the PKRU register will be restored from
+ * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
+ * @fpstate is sufficient to cover this case, but the caller will also
+ * pass a pointer to the thread_struct's pkru field in @pkru and updating
+ * it is harmless.
+ * 2. When called from ptrace the PKRU register will be restored from the
+ * thread_struct's pkru field. A pointer to that is passed in @pkru.
+ * The kernel will restore it manually, so the XRSTOR behavior that resets
+ * the PKRU register to the hardware init value (0) if the corresponding
+ * xfeatures bit is not set is emulated here.
+ * 3. When called from KVM the PKRU register will be restored from the vcpu's
+ * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
+ * XRSTOR and hasn't had the PKRU resetting behavior described above. To
+ * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
+ * bit is not set.
+ */
+static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
+ const void __user *ubuf, u32 *pkru)
+{
+ struct xregs_state *xsave = &fpstate->regs.xsave;
+ unsigned int offset, size;
+ struct xstate_header hdr;
+ u64 mask;
+ int i;
+
+ offset = offsetof(struct xregs_state, header);
+ if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
+ return -EFAULT;
+
+ if (validate_user_xstate_header(&hdr, fpstate))
+ return -EINVAL;
+
+ /* Validate MXCSR when any of the related features is in use */
+ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
+ if (hdr.xfeatures & mask) {
+ u32 mxcsr[2];
+
+ offset = offsetof(struct fxregs_state, mxcsr);
+ if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
+ return -EFAULT;
+
+ /* Reserved bits in MXCSR must be zero. */
+ if (mxcsr[0] & ~mxcsr_feature_mask)
+ return -EINVAL;
+
+ /* SSE and YMM require MXCSR even when FP is not in use. */
+ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
+ xsave->i387.mxcsr = mxcsr[0];
+ xsave->i387.mxcsr_mask = mxcsr[1];
+ }
+ }
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ mask = BIT_ULL(i);
+
+ if (hdr.xfeatures & mask) {
+ void *dst = __raw_xsave_addr(xsave, i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
+ return -EFAULT;
+ }
+ }
+
+ if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
+ struct pkru_state *xpkru;
+
+ xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
+ *pkru = xpkru->pkru;
+ } else {
+ /*
+ * KVM may pass NULL here to indicate that it does not need
+ * PKRU updated.
+ */
+ if (pkru)
+ *pkru = 0;
+ }
+
+ /*
+ * The state that came in from userspace was user-state only.
+ * Mask all the user states out of 'xfeatures':
+ */
+ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
+
+ /*
+ * Add back in the features that came in from userspace:
+ */
+ xsave->header.xfeatures |= hdr.xfeatures;
+
+ return 0;
+}
+
+/*
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
+ * format and copy to the target thread. Used by ptrace and KVM.
+ */
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
+{
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
+}
+
+/*
+ * Convert from a sigreturn standard-format user-space buffer to kernel
+ * XSAVE[S] format and copy to the target thread. This is called from the
+ * sigreturn() and rt_sigreturn() system calls.
+ */
+int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
+ const void __user *ubuf)
+{
+ return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru);
+}
+
+static bool validate_independent_components(u64 mask)
+{
+ u64 xchk;
+
+ if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
+ return false;
+
+ xchk = ~xfeatures_mask_independent();
+
+ if (WARN_ON_ONCE(!mask || mask & xchk))
+ return false;
+
+ return true;
+}
+
+/**
+ * xsaves - Save selected components to a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to save
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized as
+ * XSAVES does not write the full xstate header. Before first use the
+ * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
+ * can #GP.
+ *
+ * The feature mask must be a subset of the independent features.
+ */
+void xsaves(struct xregs_state *xstate, u64 mask)
+{
+ int err;
+
+ if (!validate_independent_components(mask))
+ return;
+
+ XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
+/**
+ * xrstors - Restore selected components from a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to restore
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized
+ * otherwise XRSTORS from that buffer can #GP.
+ *
+ * Proper usage is to restore the state which was saved with
+ * xsaves() into @xstate.
+ *
+ * The feature mask must be a subset of the independent features.
+ */
+void xrstors(struct xregs_state *xstate, u64 mask)
+{
+ int err;
+
+ if (!validate_independent_components(mask))
+ return;
+
+ XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
+{
+ void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature);
+
+ if (addr)
+ memset(addr, 0, xstate_sizes[xfeature]);
+}
+EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
+#endif
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_X86_DEBUG_FPU
+/*
+ * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
+ * can safely operate on the @fpstate buffer.
+ */
+static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ u64 xfd = __this_cpu_read(xfd_state);
+
+ if (fpstate->xfd == xfd)
+ return true;
+
+ /*
+ * The XFD MSR does not match fpstate->xfd. That's invalid when
+ * the passed in fpstate is current's fpstate.
+ */
+ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
+ return false;
+
+ /*
+ * XRSTOR(S) from init_fpstate are always correct as it will just
+ * bring all components into init state and not read from the
+ * buffer. XSAVE(S) raises #PF after init.
+ */
+ if (fpstate == &init_fpstate)
+ return rstor;
+
+ /*
+ * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
+ * XRSTOR(S): fpu_swap_kvm_fpstate()
+ */
+
+ /*
+ * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
+ * the buffer area for XFD-disabled state components.
+ */
+ mask &= ~xfd;
+
+ /*
+ * Remove features which are valid in fpstate. They
+ * have space allocated in fpstate.
+ */
+ mask &= ~fpstate->xfeatures;
+
+ /*
+ * Any remaining state components in 'mask' might be written
+ * by XSAVE/XRSTOR. Fail validation if any are found.
+ */
+ return !mask;
+}
+
+void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
+{
+ WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
+}
+#endif /* CONFIG_X86_DEBUG_FPU */
+
+static int __init xfd_update_static_branch(void)
+{
+ /*
+ * If init_fpstate.xfd has bits set then dynamic features are
+ * available and the dynamic sizing must be enabled.
+ */
+ if (init_fpstate.xfd)
+ static_branch_enable(&__fpu_state_size_dynamic);
+ return 0;
+}
+arch_initcall(xfd_update_static_branch)
+
+void fpstate_free(struct fpu *fpu)
+{
+ if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
+ vfree(fpu->fpstate);
+}
+
+/**
+ * fpstate_realloc - Reallocate struct fpstate for the requested new features
+ *
+ * @xfeatures: A bitmap of xstate features which extend the enabled features
+ * of that task
+ * @ksize: The required size for the kernel buffer
+ * @usize: The required size for user space buffers
+ * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
+ *
+ * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
+ * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
+ * with large states are likely to live longer.
+ *
+ * Returns: 0 on success, -ENOMEM on allocation error.
+ */
+static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
+ unsigned int usize, struct fpu_guest *guest_fpu)
+{
+ struct fpu *fpu = x86_task_fpu(current);
+ struct fpstate *curfps, *newfps = NULL;
+ unsigned int fpsize;
+ bool in_use;
+
+ fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
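+ /*
+ * vzalloc() returns page-aligned memory, so placing 'regs' at
+ * ALIGN(offsetof(struct fpstate, regs), 64) keeps the XSAVE area
+ * 64-byte aligned.
+ */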
+
+ newfps = vzalloc(fpsize);
+ if (!newfps)
+ return -ENOMEM;
+ newfps->size = ksize;
+ newfps->user_size = usize;
+ newfps->is_valloc = true;
+
+ /*
+ * When a guest FPU is supplied, use @guest_fpu->fpstate
+ * as the reference, independent of whether it is in use or not.
+ */
+ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
+
+ /* Determine whether @curfps is the active fpstate */
+ in_use = fpu->fpstate == curfps;
+
+ if (guest_fpu) {
+ newfps->is_guest = true;
+ newfps->is_confidential = curfps->is_confidential;
+ newfps->in_use = curfps->in_use;
+ guest_fpu->xfeatures |= xfeatures;
+ guest_fpu->uabi_size = usize;
+ }
+
+ fpregs_lock();
+ /*
+ * If @curfps is in use, ensure that the current state is in the
+ * registers before swapping fpstate as that might invalidate it
+ * due to layout changes.
+ */
+ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+
+ newfps->xfeatures = curfps->xfeatures | xfeatures;
+ newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
+ newfps->xfd = curfps->xfd & ~xfeatures;
+
+ /* Do the final updates within the locked region */
+ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
+
+ if (guest_fpu) {
+ guest_fpu->fpstate = newfps;
+ /* If curfps is active, update the FPU fpstate pointer */
+ if (in_use)
+ fpu->fpstate = newfps;
+ } else {
+ fpu->fpstate = newfps;
+ }
+
+ if (in_use)
+ xfd_update_state(fpu->fpstate);
+ fpregs_unlock();
+
+ /* Only free valloc'ed state */
+ if (curfps && curfps->is_valloc)
+ vfree(curfps);
+
+ return 0;
+}
+
+static int validate_sigaltstack(unsigned int usize)
+{
+ struct task_struct *thread, *leader = current->group_leader;
+ unsigned long framesize = get_sigframe_size();
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ /* get_sigframe_size() is based on fpu_user_cfg.max_size */
+ framesize -= fpu_user_cfg.max_size;
+ framesize += usize;
+ for_each_thread(leader, thread) {
+ if (thread->sas_ss_size && thread->sas_ss_size < framesize)
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
+{
+ /*
+ * This deliberately does not exclude !XSAVES as we still might
+ * decide to optionally context switch XCR0 or talk the silicon
+ * vendors into extending XFD for the pre-AMX states, especially
+ * AVX512.
+ */
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ struct fpu *fpu = x86_task_fpu(current->group_leader);
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ u64 mask;
+ int ret = 0;
+
+ /* Check whether fully enabled */
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /*
+ * Calculate the resulting kernel state size. Note, @permitted also
+ * contains supervisor xfeatures even though supervisor are always
+ * permitted for kernel and guest FPUs, and never permitted for user
+ * FPUs.
+ */
+ mask = permitted | requested;
+ ksize = xstate_calculate_size(mask, compacted);
+
+ /*
+ * Calculate the resulting user state size. Take care not to clobber
+ * the supervisor xfeatures in the new mask!
+ */
+ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);
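+ /*
+ * Illustrative: a request for XFEATURE_MASK_XTILE_DATA grows both
+ * sizes by the AMX tile data state (8 KiB on current hardware).
+ */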
+
+ if (!guest) {
+ ret = validate_sigaltstack(usize);
+ if (ret)
+ return ret;
+ }
+
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
+ WRITE_ONCE(perm->__state_perm, mask);
+ /* Protected by sighand lock */
+ perm->__state_size = ksize;
+ perm->__user_state_size = usize;
+ return ret;
+}
+
+/*
+ * Permissions array to map facilities with more than one component
+ */
+static const u64 xstate_prctl_req[XFEATURE_MAX] = {
+ [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
+};
+
+static int xstate_request_perm(unsigned long idx, bool guest)
+{
+ u64 permitted, requested;
+ int ret;
+
+ if (idx >= XFEATURE_MAX)
+ return -EINVAL;
+
+ /*
+ * Look up the facility mask which can require more than
+ * one xstate component.
+ */
+ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
+ requested = xstate_prctl_req[idx];
+ if (!requested)
+ return -EOPNOTSUPP;
+
+ if ((fpu_user_cfg.max_features & requested) != requested)
+ return -EOPNOTSUPP;
+
+ /* Lockless quick check */
+ permitted = xstate_get_group_perm(guest);
+ if ((permitted & requested) == requested)
+ return 0;
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+ permitted = xstate_get_group_perm(guest);
+
+ /* First vCPU allocation locks the permissions. */
+ if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
+ ret = -EBUSY;
+ else
+ ret = __xstate_request_perm(permitted, requested, guest);
+ spin_unlock_irq(&current->sighand->siglock);
+ return ret;
+}
+
+int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
+{
+ u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
+ struct fpu_state_perm *perm;
+ unsigned int ksize, usize;
+ struct fpu *fpu;
+
+ if (!xfd_event) {
+ if (!guest_fpu)
+ pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
+ return 0;
+ }
+
+ /* Protect against concurrent modifications */
+ spin_lock_irq(&current->sighand->siglock);
+
+ /* If not permitted, let it die */
+ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
+ spin_unlock_irq(&current->sighand->siglock);
+ return -EPERM;
+ }
+
+ fpu = x86_task_fpu(current->group_leader);
+ perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
+ ksize = perm->__state_size;
+ usize = perm->__user_state_size;
+
+ /*
+ * The feature is permitted and the state size is sufficient. Dropping
+ * the lock is safe here: even if more features are added by another
+ * task, the retrieved buffer sizes remain valid for the currently
+ * requested feature(s).
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Try to allocate a new fpstate. If that fails there is no way
+ * out.
+ */
+ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
+ return -EFAULT;
+ return 0;
+}
+
+int xfd_enable_feature(u64 xfd_err)
+{
+ return __xfd_enable_feature(xfd_err, NULL);
+}
+
+#else /* CONFIG_X86_64 */
+static inline int xstate_request_perm(unsigned long idx, bool guest)
+{
+ return -EPERM;
+}
+#endif /* !CONFIG_X86_64 */
+
+u64 xstate_get_guest_group_perm(void)
+{
+ return xstate_get_group_perm(true);
+}
+EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
+
+/**
+ * fpu_xstate_prctl - xstate permission operations
+ * @option: A subfunction of arch_prctl()
+ * @arg2: option argument
+ * Return: 0 if successful; otherwise, an error code
+ *
+ * Option arguments:
+ *
+ * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
+ * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
+ * ARCH_REQ_XCOMP_PERM: Facility number requested
+ *
+ * For facilities which require more than one XSTATE component, the request
+ * must be the highest state component number related to that facility,
+ * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
+ * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
+ */
+long fpu_xstate_prctl(int option, unsigned long arg2)
+{
+ u64 __user *uptr = (u64 __user *)arg2;
+ u64 permitted, supported;
+ unsigned long idx = arg2;
+ bool guest = false;
+
+ switch (option) {
+ case ARCH_GET_XCOMP_SUPP:
+ supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
+ return put_user(supported, uptr);
+
+ case ARCH_GET_XCOMP_PERM:
+ /*
+ * Lockless snapshot as it can also change right after dropping
+ * the lock.
+ */
+ permitted = xstate_get_host_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ permitted = xstate_get_guest_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ guest = true;
+ fallthrough;
+
+ case ARCH_REQ_XCOMP_PERM:
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return -EOPNOTSUPP;
+
+ return xstate_request_perm(idx, guest);
+
+ default:
+ return -EINVAL;
+ }
+}
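
As a concrete illustration of the ABI documented above (not part of this patch), a minimal userspace sketch requesting AMX permission; the ARCH_* constant values are assumed from asm/prctl.h at the time of writing:

    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define ARCH_GET_XCOMP_SUPP 0x1021
    #define ARCH_GET_XCOMP_PERM 0x1022
    #define ARCH_REQ_XCOMP_PERM 0x1023
    #define XFEATURE_XTILE_DATA 18

    int main(void)
    {
            unsigned long long supp = 0, perm = 0;

            if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supp))
                    return 1;
            if (!(supp & (1ULL << XFEATURE_XTILE_DATA)))
                    return 1;       /* AMX not supported */

            /* Pass the highest component number of the facility, per above */
            if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA))
                    return 1;

            syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &perm);
            printf("permitted mask: %#llx\n", perm);
            return 0;
    }
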
+
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+/*
+ * Report the time in milliseconds elapsed since the task last used
+ * AVX-512. Report -1 if there was no AVX-512 usage.
+ */
+static void avx512_status(struct seq_file *m, struct task_struct *task)
+{
+ unsigned long timestamp;
+ long delta = -1;
+
+ /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */
+ if (task->flags & (PF_KTHREAD | PF_USER_WORKER))
+ return;
+
+ timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
+
+ if (timestamp) {
+ delta = (long)(jiffies - timestamp);
+ /*
+ * Cap to LONG_MAX if time difference > LONG_MAX
+ */
+ if (delta < 0)
+ delta = LONG_MAX;
+ delta = jiffies_to_msecs(delta);
+ }
+
+ seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Report architecture specific information
+ */
+int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ /*
+ * Report AVX-512 usage if both the processor and the kernel build support it.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_AVX512F))
+ avx512_status(m, task);
+
+ return 0;
+}
+#endif /* CONFIG_PROC_PID_ARCH_STATUS */
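
A hedged sketch (not part of this patch) of consuming the field emitted above from /proc/<pid>/arch_status, available when CONFIG_PROC_PID_ARCH_STATUS is set:

    #include <stdio.h>

    /* Returns the AVX512_elapsed_ms value for the current task, or -1
     * if AVX-512 was never used or the field is absent. */
    static long avx512_elapsed_ms(void)
    {
            char line[128];
            long ms = -1;
            FILE *f = fopen("/proc/self/arch_status", "r");

            if (!f)
                    return -1;
            while (fgets(line, sizeof(line), f)) {
                    if (sscanf(line, "AVX512_elapsed_ms: %ld", &ms) == 1)
                            break;
            }
            fclose(f);
            return ms;
    }
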
+
+#ifdef CONFIG_COREDUMP
+static const char owner_name[] = "LINUX";
+
+/*
+ * Dump type, size, offset and flag values for every xfeature that is present.
+ */
+static int dump_xsave_layout_desc(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ int i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
+ struct x86_xfeat_component xc = {
+ .type = i,
+ .size = xstate_sizes[i],
+ .offset = xstate_offsets[i],
+ /* reserved for future use */
+ .flags = 0,
+ };
+
+ if (!dump_emit(cprm, &xc, sizeof(xc)))
+ return 0;
+
+ num_records++;
+ }
+ return num_records;
+}
+
+static u32 get_xsave_desc_size(void)
+{
+ u32 cnt = 0;
+ u32 i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features)
+ cnt++;
+
+ return cnt * (sizeof(struct x86_xfeat_component));
+}
+
+int elf_coredump_extra_notes_write(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ struct elf_note en;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ en.n_namesz = sizeof(owner_name);
+ en.n_descsz = get_xsave_desc_size();
+ en.n_type = NT_X86_XSAVE_LAYOUT;
+
+ if (!dump_emit(cprm, &en, sizeof(en)))
+ return 1;
+ if (!dump_emit(cprm, owner_name, en.n_namesz))
+ return 1;
+ if (!dump_align(cprm, 4))
+ return 1;
+
+ num_records = dump_xsave_layout_desc(cprm);
+ if (!num_records)
+ return 1;
+
+ /* The size of the emitted records must match the advertised descriptor size */
+ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
+ return 1;
+
+ return 0;
+}
+
+int elf_coredump_extra_notes_size(void)
+{
+ int size;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ /* .note header */
+ size = sizeof(struct elf_note);
+ /* Name plus alignment to 4 bytes */
+ size += roundup(sizeof(owner_name), 4);
+ size += get_xsave_desc_size();
+
+ return size;
+}
+#endif /* CONFIG_COREDUMP */
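
A hedged sketch of a consumer for the note emitted above (not part of this patch): the descriptor is an array of x86_xfeat_component records mirroring the struct used by dump_xsave_layout_desc(); the field layout is assumed to match the uapi definition:

    #include <stdint.h>
    #include <stdio.h>

    struct x86_xfeat_component {
            uint32_t type;
            uint32_t size;
            uint32_t offset;
            uint32_t flags;
    };

    /* Walk the NT_X86_XSAVE_LAYOUT note descriptor of a core file */
    static void print_xsave_layout(const void *desc, uint32_t descsz)
    {
            const struct x86_xfeat_component *xc = desc;
            uint32_t i, n = descsz / sizeof(*xc);

            for (i = 0; i < n; i++)
                    printf("xfeature %u: size %u offset %u\n",
                           xc[i].type, xc[i].size, xc[i].offset);
    }
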
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
new file mode 100644
index 000000000000..52ce19289989
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_FPU_XSTATE_H
+#define __X86_KERNEL_FPU_XSTATE_H
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/xcr.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(u64, xfd_state);
+#endif
+
+static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
+ xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
+}
+
+static inline u64 xstate_get_group_perm(bool guest)
+{
+ struct fpu *fpu = x86_task_fpu(current->group_leader);
+ struct fpu_state_perm *perm;
+
+ /* Pairs with WRITE_ONCE() in xstate_request_perm() */
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ return READ_ONCE(perm->__state_perm);
+}
+
+static inline u64 xstate_get_host_group_perm(void)
+{
+ return xstate_get_group_perm(false);
+}
+
+enum xstate_copy_mode {
+ XSTATE_COPY_FP,
+ XSTATE_COPY_FX,
+ XSTATE_COPY_XSAVE,
+};
+
+struct membuf;
+extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
+ u64 xfeatures, u32 pkru_val,
+ enum xstate_copy_mode copy_mode);
+extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode mode);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
+extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);
+
+
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system_xstate(unsigned int legacy_size);
+
+extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
+
+static inline u64 xfeatures_mask_supervisor(void)
+{
+ return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+}
+
+static inline u64 xfeatures_mask_independent(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
+
+ return fpu_kernel_cfg.independent_features;
+}
+
+static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask)
+{
+ u64 xfeatures;
+ int err;
+
+ /* Read the xfeatures value already saved in the user buffer */
+ err = __get_user(xfeatures, &xbuf->header.xfeatures);
+ xfeatures |= mask;
+ err |= __put_user(xfeatures, &xbuf->header.xfeatures);
+
+ return err;
+}
+
+/*
+ * Update the value of PKRU register that was already pushed onto the signal frame.
+ */
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ int err;
+
+ if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
+ return 0;
+
+ /* Mark PKRU as in-use so that it is restored correctly. */
+ err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU);
+ if (err)
+ return err;
+
+ /* Update PKRU value in the userspace xsave buffer. */
+ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
+}
+
+/* XSAVE/XRSTOR wrapper functions */
+
+#ifdef CONFIG_X86_64
+#define REX_SUFFIX "64"
+#else
+#define REX_SUFFIX
+#endif
+
+#define XSAVE "xsave" REX_SUFFIX " %[xa]"
+#define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]"
+#define XSAVEC "xsavec" REX_SUFFIX " %[xa]"
+#define XSAVES "xsaves" REX_SUFFIX " %[xa]"
+#define XRSTOR "xrstor" REX_SUFFIX " %[xa]"
+#define XRSTORS "xrstors" REX_SUFFIX " %[xa]"
+
+/*
+ * After this @err contains 0 on success or the trap number when the
+ * operation raises an exception.
+ *
+ * The [xa] input parameter below represents the struct xregs_state pointer
+ * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns
+ * above.
+ */
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \
+ : [err] "=a" (err) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it is preferred over XSAVEC because it additionally
+ * supports supervisor states.
+ *
+ * Otherwise, if XSAVEC is enabled, it is preferred over XSAVEOPT because it
+ * additionally supports the compacted storage format.
+ *
+ * Otherwise, if XSAVEOPT is enabled, it is preferred over XSAVE because it
+ * supports the modified optimization which XSAVE does not.
+ *
+ * Use XSAVE as a fallback.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile("1: " ALTERNATIVE_3(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVEC, X86_FEATURE_XSAVEC, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n\t" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
+ : [err] "=r" (err) \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports the
+ * compacted XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask) \
+ asm volatile("1: " ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "3:\n" \
+ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \
+ : \
+ : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
+extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
+#else
+static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
+#endif
+
+#ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+ wrmsrq(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+}
+
+static inline void xfd_update_state(struct fpstate *fpstate)
+{
+ if (fpu_state_size_dynamic()) {
+ u64 xfd = fpstate->xfd;
+
+ if (__this_cpu_read(xfd_state) != xfd)
+ xfd_set_state(xfd);
+ }
+}
+
+extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
+#else
+static inline void xfd_set_state(u64 xfd) { }
+
+static inline void xfd_update_state(struct fpstate *fpstate) { }
+
+static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
+ return -EPERM;
+}
+#endif
+
+/*
+ * Save processor xstate to xsave area.
+ *
+ * Uses XSAVE, XSAVEOPT, XSAVEC or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
+ */
+static inline void os_xsave(struct fpstate *fpstate)
+{
+ u64 mask = fpstate->xfeatures;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON_FPU(!alternatives_patched);
+ xfd_validate_state(fpstate, mask, false);
+
+ XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err);
+
+ /* We should never fault when copying to a kernel buffer: */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
+ */
+static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ xfd_validate_state(fpstate, mask, true);
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/* Restore of supervisor state. Does not require XFD */
+static inline void os_xrstor_supervisor(struct fpstate *fpstate)
+{
+ u64 mask = xfeatures_mask_supervisor();
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
+}
+
+/*
+ * XSAVE itself always writes all requested xfeatures. Removing features
+ * from the request bitmap reduces the features which are written.
+ * Generate a mask of features which must be written to a sigframe. The
+ * unset features can be optimized away and not written.
+ *
+ * This optimization is user-visible. Only use for states where
+ * uninitialized sigframe contents are tolerable, like dynamic features.
+ *
+ * Users of buffers produced with this optimization must check XSTATE_BV
+ * to determine which features have been optimized out.
+ */
+static inline u64 xfeatures_need_sigframe_write(void)
+{
+ u64 xfeatures_to_write;
+
+ /* In-use features must be written: */
+ xfeatures_to_write = xfeatures_in_use();
+
+ /* Also write all non-optimizable sigframe features: */
+ xfeatures_to_write |= XFEATURE_MASK_USER_SUPPORTED &
+ ~XFEATURE_MASK_SIGFRAME_INITOPT;
+
+ return xfeatures_to_write;
+}
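
Because optimized-out features leave their sigframe slots uninitialized, a signal handler must check XSTATE_BV before consuming dynamic state, as the comment above requires. A hedged userspace sketch: it assumes the kernel sigframe layout in which uc_mcontext.fpregs points at the XSAVE area with the xstate header at byte offset 512, and is not verified against every libc's ucontext definitions:

    #include <signal.h>
    #include <stdint.h>
    #include <ucontext.h>

    #define XSAVE_HDR_OFFSET    512
    #define XFEATURE_XTILE_DATA 18

    /* Call from a SA_SIGINFO handler with the third (void *) argument */
    static int sigframe_has_tile_data(void *ucontext)
    {
            ucontext_t *uc = ucontext;
            uint8_t *xsave = (uint8_t *)uc->uc_mcontext.fpregs;
            uint64_t xstate_bv;

            /* xstate_bv is the first u64 of the xstate header */
            __builtin_memcpy(&xstate_bv, xsave + XSAVE_HDR_OFFSET,
                             sizeof(xstate_bv));
            return !!(xstate_bv & (1ULL << XFEATURE_XTILE_DATA));
    }
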
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for backward compatibility for
+ * old applications which don't understand the compacted format of the
+ * xsave area.
+ *
+ * The caller has to zero buf::header before calling this because XSAVE*
+ * does not touch the reserved fields in the header.
+ */
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ /*
+ * Include the features which are not xsaved/rstored by the kernel
+ * internally, e.g. PKRU. That's user space ABI and also required
+ * to allow the signal handler to modify PKRU.
+ */
+ struct fpstate *fpstate = x86_task_fpu(current)->fpstate;
+ u64 mask = fpstate->user_xfeatures;
+ u32 lmask;
+ u32 hmask;
+ int err;
+
+ /* Optimize away writing unnecessary xfeatures: */
+ if (fpu_state_size_dynamic())
+ mask &= xfeatures_need_sigframe_write();
+
+ lmask = mask;
+ hmask = mask >> 32;
+ xfd_validate_state(fpstate, mask, false);
+
+ stac();
+ XSTATE_OP(XSAVE, buf, lmask, hmask, err);
+ clac();
+
+ if (!err)
+ err = update_pkru_in_sigframe(buf, pkru);
+
+ return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
+{
+ struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true);
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from kernel space xsave area, return an error code instead of
+ * an exception.
+ */
+static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
+{
+ struct xregs_state *xstate = &fpstate->regs.xsave;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ /* Ensure that XFD is up to date */
+ xfd_update_state(fpstate);
+
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ return err;
+}
+
+
+#endif
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
new file mode 100644
index 000000000000..816187da3a47
--- /dev/null
+++ b/arch/x86/kernel/fred.c
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/traps.h>
+
+/* #DB in the kernel would imply the use of a kernel debugger. */
+#define FRED_DB_STACK_LEVEL 1UL
+#define FRED_NMI_STACK_LEVEL 2UL
+#define FRED_MC_STACK_LEVEL 2UL
+/*
+ * #DF is the highest level because a #DF means "something went wrong
+ * *while delivering an exception*." The number of cases for which that
+ * can happen with FRED is drastically reduced and basically amounts to
+ * "the stack you pointed me to is broken." Thus, always change stacks
+ * on #DF, which means it should be at the highest level.
+ */
+#define FRED_DF_STACK_LEVEL 3UL
+
+#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector)))
+
+DEFINE_PER_CPU(unsigned long, fred_rsp0);
+EXPORT_PER_CPU_SYMBOL(fred_rsp0);
+
+void cpu_init_fred_exceptions(void)
+{
+ /* When FRED is enabled by default, remove this log message */
+ pr_info("Initialize FRED on CPU%d\n", smp_processor_id());
+
+ /*
+ * If a kernel event is delivered before a CPU goes to user level for
+ * the first time, its SS is NULL thus NULL is pushed into the SS field
+ * of the FRED stack frame. But before ERETS is executed, the CPU may
+ * context switch to another task and go to user level. Then when the
+ * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later
+ * when ERETS is executed to return from the kernel event handler, a #GP
+ * fault is generated because SS doesn't match the SS saved in the FRED
+ * stack frame.
+ *
+ * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs.
+ */
+ loadsegment(ss, __KERNEL_DS);
+
+ wrmsrq(MSR_IA32_FRED_CONFIG,
+ /* Reserve for CALL emulation */
+ FRED_CONFIG_REDZONE |
+ FRED_CONFIG_INT_STKLVL(0) |
+ FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+
+ wrmsrq(MSR_IA32_FRED_STKLVLS, 0);
+
+ /*
+ * After a CPU offline/online cycle, the FRED RSP0 MSR should be
+ * resynchronized with its per-CPU cache.
+ */
+ wrmsrq(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0));
+
+ wrmsrq(MSR_IA32_FRED_RSP1, 0);
+ wrmsrq(MSR_IA32_FRED_RSP2, 0);
+ wrmsrq(MSR_IA32_FRED_RSP3, 0);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ /* Any further IDT use is a bug */
+ idt_invalidate();
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/* Must be called after setup_cpu_entry_areas() */
+void cpu_init_fred_rsps(void)
+{
+ /*
+ * The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
+ * (remember that user space faults are always taken on stack level 0)
+ * is to avoid overflowing the kernel stack.
+ */
+ wrmsrq(MSR_IA32_FRED_STKLVLS,
+ FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL));
+
+ /* The FRED equivalents to IST stacks... */
+ wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+ wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+ wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+}
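
A worked example (illustrative, not kernel code) of the FRED_STKLVL() packing used above, assuming the standard x86 vector numbers from asm/trapnr.h (#DB = 1, NMI = 2, #DF = 8, #MC = 18); each vector owns a 2-bit stack-level field at bit position 2 * vector:

    static const unsigned long long fred_stklvls_example =
            (1ULL << (2 * 1))  |    /* #DB  -> level 1: 0x4          */
            (2ULL << (2 * 2))  |    /* NMI  -> level 2: 0x20         */
            (3ULL << (2 * 8))  |    /* #DF  -> level 3: 0x30000      */
            (2ULL << (2 * 18));     /* #MC  -> level 2: 0x2000000000 */
    /* fred_stklvls_example == 0x2000030024 */
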
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54ee..0543b57f54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * Code for replacing ftrace calls with jumps.
+ * Dynamic function tracing support.
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
*
@@ -17,496 +18,652 @@
#include <linux/ftrace.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include <linux/execmem.h>
#include <trace/syscall.h>
-#include <asm/cacheflush.h>
+#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
-#include <asm/nmi.h>
-
+#include <asm/text-patching.h>
#ifdef CONFIG_DYNAMIC_FTRACE
-/*
- * modifying_code is set to notify NMIs that they need to use
- * memory barriers when entering or exiting. But we don't want
- * to burden NMIs with unnecessary memory barriers when code
- * modification is not being done (which is most of the time).
- *
- * A mutex is already held when ftrace_arch_code_modify_prepare
- * and post_process are called. No locks need to be taken here.
- *
- * Stop machine will make sure currently running NMIs are done
- * and new NMIs will see the updated variable before we need
- * to worry about NMIs doing memory barriers.
- */
-static int modifying_code __read_mostly;
-static DEFINE_PER_CPU(int, save_modifying_code);
+static int ftrace_poke_late = 0;
-int ftrace_arch_code_modify_prepare(void)
+void ftrace_arch_code_modify_prepare(void)
+ __acquires(&text_mutex)
{
- set_kernel_text_rw();
- modifying_code = 1;
- return 0;
+ /*
+ * Need to grab text_mutex to prevent a race where module loading or
+ * live kernel patching changes the text permissions while ftrace has
+ * them set to "read/write".
+ */
+ mutex_lock(&text_mutex);
+ ftrace_poke_late = 1;
}
-int ftrace_arch_code_modify_post_process(void)
+void ftrace_arch_code_modify_post_process(void)
+ __releases(&text_mutex)
{
- modifying_code = 0;
- set_kernel_text_ro();
- return 0;
+ /*
+ * ftrace_make_{call,nop}() may be called during module load, and we
+ * need to finish here the smp_text_poke_batch_add() requests that
+ * they queued.
+ */
+ smp_text_poke_batch_finish();
+ ftrace_poke_late = 0;
+ mutex_unlock(&text_mutex);
}
-union ftrace_code_union {
- char code[MCOUNT_INSN_SIZE];
- struct {
- char e8;
- int offset;
- } __attribute__((packed));
-};
-
-static int ftrace_calc_offset(long ip, long addr)
+static const char *ftrace_nop_replace(void)
{
- return (int)(addr - ip);
+ return x86_nops[5];
}
-static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- static union ftrace_code_union calc;
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting itself.
+ */
+ if (ftrace_is_jmp(addr)) {
+ addr = ftrace_jmp_get(addr);
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
+ } else {
+ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
+ }
+}
- calc.e8 = 0xe8;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
+{
+ char cur_code[MCOUNT_INSN_SIZE];
/*
- * No locking needed, this must be called via kstop_machine
- * which in essence is like running on a uniprocessor machine.
+ * Note:
+ * We are paranoid about modifying text, as if a bug were to happen, it
+ * could cause us to read or write to someplace that could cause harm.
+ * Carefully read the code with copy_from_kernel_nofault(), and make
+ * sure what we read is what we expected it to be before modifying it.
*/
- return calc.code;
+ /* read the text we want to modify */
+ if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ ftrace_expected = old_code;
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ return 0;
}
/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
- *
- * Two buffers are added: An IP buffer and a "code" buffer.
- *
- * 1) Put the instruction pointer into the IP buffer
- * and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- * we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
- *
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
- *
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
+ * Marked __ref because it calls text_poke_early() which is .init.text. That is
+ * ok because that call will happen early, during boot, when .init sections are
+ * still present.
*/
-
-#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status; /* holds return value of text write */
-static void *mod_code_ip; /* holds the IP to write to */
-static void *mod_code_newcode; /* holds the text to write to the IP */
-
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
-
-int ftrace_arch_read_dyn_info(char *buf, int size)
+static int __ref
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
{
- int r;
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;
- r = snprintf(buf, size, "%u %u",
- nmi_wait_count,
- atomic_read(&nmi_update_count));
- return r;
+ /* replace the text with the new text */
+ if (ftrace_poke_late)
+ smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ return 0;
}
-static void clear_mod_flag(void)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- int old = atomic_read(&nmi_running);
-
- for (;;) {
- int new = old & ~MOD_CODE_WRITE_FLAG;
-
- if (old == new)
- break;
+ unsigned long ip = rec->ip;
+ const char *new, *old;
- old = atomic_cmpxchg(&nmi_running, old, new);
- }
-}
+ old = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
-static void ftrace_mod_code(void)
-{
/*
- * Yes, more than one CPU process can be writing to mod_code_status.
- * (and the code itself)
- * But if one were to fail, then they all should, and if one were
- * to succeed, then they all should.
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
*/
- mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
- MCOUNT_INSN_SIZE);
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(ip, old, new);
- /* if we fail, then kill any new writers */
- if (mod_code_status)
- clear_mod_flag();
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
}
-void ftrace_nmi_enter(void)
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- __get_cpu_var(save_modifying_code) = modifying_code;
+ unsigned long ip = rec->ip;
+ const char *new, *old;
- if (!__get_cpu_var(save_modifying_code))
- return;
+ old = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
- if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
- smp_rmb();
- ftrace_mod_code();
- atomic_inc(&nmi_update_count);
- }
- /* Must have previous changes seen before executions */
- smp_mb();
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
}
-void ftrace_nmi_exit(void)
+/*
+ * Should never be called:
+ * It is only called by __ftrace_replace_code(), which is reached via
+ * ftrace_replace_code() (which x86 overrides) and ftrace_update_code().
+ * Those paths turn mcount sites into nops or nops into function calls,
+ * but never convert a function from not using regs to one that uses
+ * regs, which is what ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
{
- if (!__get_cpu_var(save_modifying_code))
- return;
-
- /* Finish all executions before clearing nmi_running */
- smp_mb();
- atomic_dec(&nmi_running);
+ WARN_ON(1);
+ return -EINVAL;
}
-static void wait_for_nmi_and_set_mod_flag(void)
+int ftrace_update_ftrace_func(ftrace_func_t func)
{
- if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
- return;
+ unsigned long ip;
+ const char *new;
- do {
- cpu_relax();
- } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
- nmi_wait_count++;
+ return 0;
}
-static void wait_for_nmi(void)
+void ftrace_replace_code(int enable)
{
- if (!atomic_read(&nmi_running))
- return;
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *new, *old;
+ int ret;
- do {
- cpu_relax();
- } while (atomic_read(&nmi_running));
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
- nmi_wait_count++;
-}
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
-}
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
-static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
-{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa(ip));
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ }
+
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_expected = old;
+ ftrace_bug(ret, rec);
+ ftrace_expected = NULL;
+ return;
+ }
+ }
- mod_code_ip = (void *)ip;
- mod_code_newcode = new_code;
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
- /* The buffers need to be visible before we let NMIs write them */
- smp_mb();
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- wait_for_nmi_and_set_mod_flag();
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
- /* Make sure all running NMIs have finished before we write the code */
- smp_mb();
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ }
- ftrace_mod_code();
+ smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
+ }
+ smp_text_poke_batch_finish();
+}
- /* Make sure the write happens before clearing the bit */
- smp_mb();
+void arch_ftrace_update_code(int command)
+{
+ ftrace_modify_all_code(command);
+}
- clear_mod_flag();
- wait_for_nmi();
+/* Currently only x86_64 supports dynamic trampolines */
+#ifdef CONFIG_X86_64
- return mod_code_status;
+static inline void *alloc_tramp(unsigned long size)
+{
+ return execmem_alloc_rw(EXECMEM_FTRACE, size);
+}
+static inline void tramp_free(void *tramp)
+{
+ execmem_free(tramp);
}
+/* Defined as markers to the end of the ftrace default trampolines */
+extern void ftrace_regs_caller_end(void);
+extern void ftrace_caller_end(void);
+extern void ftrace_caller_op_ptr(void);
+extern void ftrace_regs_caller_op_ptr(void);
+extern void ftrace_regs_caller_jmp(void);
+/* movq function_trace_op(%rip), %rdx */
+/* 0x48 0x8b 0x15 <offset-to-function_trace_op (4 bytes)> */
+#define OP_REF_SIZE 7
+/*
+ * The ftrace_ops is passed to the function callback. Since the
+ * trampoline only services a single ftrace_ops, we can pass in
+ * that ops directly.
+ *
+ * The ftrace_op_code_union is used to create a pointer to the
+ * ftrace_ops that will be passed to the callback function.
+ */
+union ftrace_op_code_union {
+ char code[OP_REF_SIZE];
+ struct {
+ char op[3];
+ int offset;
+ } __attribute__((packed));
+};
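
An illustrative sketch (not kernel code) of the displacement math applied further below in create_trampoline(): RIP-relative addressing is relative to the end of the instruction, so the rel32 stored in the 7-byte movq is the distance from the byte after the instruction to the ops pointer slot at the trampoline's tail:

    static int compute_op_disp(unsigned long insn_addr, unsigned long ops_slot)
    {
            const unsigned int op_ref_size = 7;  /* 48 8b 15 <rel32> */

            /* rel32 = target - address of the next instruction */
            return (int)(ops_slot - (insn_addr + op_ref_size));
    }
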
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#define RET_SIZE \
+ (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS))
-static unsigned char *ftrace_nop_replace(void)
+static unsigned long
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
{
- return ftrace_nop;
-}
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long op_offset;
+ unsigned long call_offset;
+ unsigned long jmp_offset;
+ unsigned long offset;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long *ptr;
+ void *trampoline;
+ void *ip, *dest;
+ /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
+ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
+ unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
+ union ftrace_op_code_union op_ptr;
+ int ret;
-static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
- unsigned char *new_code)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
+ jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_caller_end;
+ op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
+ jmp_offset = 0;
+ }
+
+ size = end_offset - start_offset;
/*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
+ * Allocate enough space to store the ftrace_caller code, the return
+ * instruction, as well as the address of the ftrace_ops this
+ * trampoline is used for.
*/
+ trampoline = alloc_tramp(size + RET_SIZE + sizeof(void *));
+ if (!trampoline)
+ return 0;
+
+ *tramp_size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
+
+ /* Copy ftrace_caller onto the trampoline memory */
+ ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0))
+ goto fail;
+
+ ip = trampoline + size;
+ if (cpu_wants_rethunk_at(ip))
+ __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
+ else
+ memcpy(ip, retq, sizeof(retq));
+
+ /* No need to test direct calls on created trampolines */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ /* NOP the jnz 1f; but make sure it's a 2 byte jnz */
+ ip = trampoline + (jmp_offset - start_offset);
+ if (WARN_ON(*(char *)ip != 0x75))
+ goto fail;
+ ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
+ if (ret < 0)
+ goto fail;
+ }
- /* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ /*
+ * The address of the ftrace_ops that is used for this trampoline
+ * is stored at the end of the trampoline. This will be used to
+ * load the third parameter for the callback. Basically, that
+ * location at the end of the trampoline takes the place of
+ * the global function_trace_op variable.
+ */
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
+ ptr = (unsigned long *)(trampoline + size + RET_SIZE);
+ *ptr = (unsigned long)ops;
- /* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
- return -EPERM;
+ op_offset -= start_offset;
+ memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
- sync_core();
+ /* Are we pointing to the reference? */
+ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0))
+ goto fail;
+ /* Load the contents of ptr into the callback parameter */
+ offset = (unsigned long)ptr;
+ offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE;
+
+ op_ptr.offset = offset;
+
+ /* put in the new offset to the ftrace_ops */
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting before the call already.
+ */
+ dest = ftrace_ops_get_func(ops);
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
+ /* The ALLOC_TRAMP flag lets us know we created it */
+ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+
+ set_memory_rox((unsigned long)trampoline, npages);
+ return (unsigned long)trampoline;
+fail:
+ tramp_free(trampoline);
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+void set_ftrace_ops_ro(void)
{
- unsigned char *new, *old;
- unsigned long ip = rec->ip;
+ struct ftrace_ops *ops;
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long npages;
+ unsigned long size;
+
+ do_for_each_ftrace_op(ops, ftrace_ops_list) {
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ continue;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_caller_end;
+ }
+ size = end_offset - start_offset;
+ size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(size, PAGE_SIZE);
+ set_memory_ro((unsigned long)ops->trampoline, npages);
+ } while_for_each_ftrace_op(ops);
+}
- old = ftrace_call_replace(ip, addr);
- new = ftrace_nop_replace();
+static unsigned long calc_trampoline_call_offset(bool save_regs)
+{
+ unsigned long start_offset;
+ unsigned long call_offset;
+
+ if (save_regs) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ call_offset = (unsigned long)ftrace_regs_call;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ call_offset = (unsigned long)ftrace_call;
+ }
- return ftrace_modify_code(rec->ip, old, new);
+ return call_offset - start_offset;
}
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
- unsigned char *new, *old;
- unsigned long ip = rec->ip;
+ ftrace_func_t func;
+ unsigned long offset;
+ unsigned long ip;
+ unsigned int size;
+ const char *new;
+
+ if (!ops->trampoline) {
+ ops->trampoline = create_trampoline(ops, &size);
+ if (!ops->trampoline)
+ return;
+ ops->trampoline_size = size;
+ return;
+ }
- old = ftrace_nop_replace();
- new = ftrace_call_replace(ip, addr);
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
- return ftrace_modify_code(rec->ip, old, new);
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ ip = ops->trampoline + offset;
+ func = ftrace_ops_get_func(ops);
+
+ mutex_lock(&text_mutex);
+ /* Do a safe modify in case the trampoline is executing */
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}
-int ftrace_update_ftrace_func(ftrace_func_t func)
+/* Return the address of the function the trampoline calls */
+static void *addr_from_call(void *ptr)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char old[MCOUNT_INSN_SIZE], *new;
+ union text_poke_insn call;
int ret;
- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = ftrace_modify_code(ip, old, new);
+ ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE);
+ if (WARN_ON_ONCE(ret < 0))
+ return NULL;
+
+ /* Make sure this is a call */
+ if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) {
+ pr_warn("Expected E8, got %x\n", call.opcode);
+ return NULL;
+ }
- return ret;
+ return ptr + CALL_INSN_SIZE + call.disp;
}
-int __init ftrace_dyn_arch_init(void *data)
+/*
+ * If the ops->trampoline was not allocated, then it probably
+ * has a static trampoline func, or is the ftrace caller itself.
+ */
+static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
{
- extern const unsigned char ftrace_test_p6nop[];
- extern const unsigned char ftrace_test_nop5[];
- extern const unsigned char ftrace_test_jmp[];
- int faulted = 0;
-
- /*
- * There is no good nop for all x86 archs.
- * We will default to using the P6_NOP5, but first we
- * will test to make sure that the nop will actually
- * work on this CPU. If it faults, we will then
- * go to a lesser efficient 5 byte nop. If that fails
- * we then just use a jmp as our nop. This isn't the most
- * efficient nop, but we can not use a multi part nop
- * since we would then risk being preempted in the middle
- * of that nop, and if we enabled tracing then, it might
- * cause a system crash.
- *
- * TODO: check the cpuid to determine the best nop.
- */
- asm volatile (
- "ftrace_test_jmp:"
- "jmp ftrace_test_p6nop\n"
- "nop\n"
- "nop\n"
- "nop\n" /* 2 byte jmp + 3 bytes */
- "ftrace_test_p6nop:"
- P6_NOP5
- "jmp 1f\n"
- "ftrace_test_nop5:"
- ".byte 0x66,0x66,0x66,0x66,0x90\n"
- "1:"
- ".section .fixup, \"ax\"\n"
- "2: movl $1, %0\n"
- " jmp ftrace_test_nop5\n"
- "3: movl $2, %0\n"
- " jmp 1b\n"
- ".previous\n"
- _ASM_EXTABLE(ftrace_test_p6nop, 2b)
- _ASM_EXTABLE(ftrace_test_nop5, 3b)
- : "=r"(faulted) : "0" (faulted));
-
- switch (faulted) {
- case 0:
- pr_info("converting mcount calls to 0f 1f 44 00 00\n");
- memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
- break;
- case 1:
- pr_info("converting mcount calls to 66 66 66 66 90\n");
- memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
- break;
- case 2:
- pr_info("converting mcount calls to jmp . + 5\n");
- memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
- break;
+ unsigned long offset;
+ bool save_regs = rec->flags & FTRACE_FL_REGS_EN;
+ void *ptr;
+
+ if (ops && ops->trampoline) {
+#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \
+ defined(CONFIG_FUNCTION_GRAPH_TRACER)
+ /*
+ * The function graph tracer is the only case we know of that sets
+ * a static trampoline.
+ */
+ if (ops->trampoline == FTRACE_GRAPH_ADDR)
+ return (void *)prepare_ftrace_return;
+#endif
+ return NULL;
}
- /* The return code is retured via data */
- *(unsigned long *)data = 0;
+ offset = calc_trampoline_call_offset(save_regs);
- return 0;
+ if (save_regs)
+ ptr = (void *)FTRACE_REGS_ADDR + offset;
+ else
+ ptr = (void *)FTRACE_ADDR + offset;
+
+ return addr_from_call(ptr);
}
-#endif
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
-#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_graph_call(void);
+ /* If we didn't allocate this trampoline, consider it static */
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return static_tramp_func(ops, rec);
-static int ftrace_mod_jmp(unsigned long ip,
- int old_offset, int new_offset)
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ return addr_from_call((void *)ops->trampoline + offset);
+}
+
+void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
- unsigned char code[MCOUNT_INSN_SIZE];
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
- if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ tramp_free((void *)ops->trampoline);
+ ops->trampoline = 0;
+}
- if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
- return -EINVAL;
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_DYNAMIC_FTRACE */
- *(int *)(&code[1]) = new_offset;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- if (do_ftrace_mod_code(ip, &code))
- return -EPERM;
+#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
+extern void ftrace_graph_call(void);
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+{
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
+}
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+ const char *new;
+
+ new = ftrace_jmp_replace(ip, (unsigned long)func);
+ smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
int ftrace_enable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
-
- old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_graph_caller);
}
int ftrace_disable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
-
- old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_stub);
}
+#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
-#endif /* !CONFIG_DYNAMIC_FTRACE */
+static inline bool skip_ftrace_return(void)
+{
+ /*
+ * When resuming from suspend-to-ram, this function can be indirectly
+ * called from early CPU startup code while the CPU is in real mode,
+ * which would fail miserably. Make sure the stack pointer is a
+ * virtual address.
+ *
+ * This check isn't as accurate as virt_addr_valid(), but it should be
+ * good enough for this purpose, and it's fast.
+ */
+ if ((long)__builtin_frame_address(0) >= 0)
+ return true;
+
+ if (ftrace_graph_is_dead())
+ return true;
+
+ if (atomic_read(&current->tracing_graph_pause))
+ return true;
+ return false;
+}
/*
* Hook the return address and push it in the stack of return addrs
* in current thread info.
*/
-void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
unsigned long frame_pointer)
{
- unsigned long old;
- int faulted;
- struct ftrace_graph_ent trace;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
- if (unlikely(atomic_read(&current->tracing_graph_pause)))
+ if (unlikely(skip_ftrace_return()))
return;
- /*
- * Protect against fault, even if it shouldn't
- * happen. This tool is too much intrusive to
- * ignore such a protection.
- */
- asm volatile(
- "1: " _ASM_MOV " (%[parent]), %[old]\n"
- "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
- " movl $0, %[faulted]\n"
- "3:\n"
-
- ".section .fixup, \"ax\"\n"
- "4: movl $1, %[faulted]\n"
- " jmp 3b\n"
- ".previous\n"
-
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 4b)
-
- : [old] "=&r" (old), [faulted] "=r" (faulted)
- : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
- : "memory"
- );
-
- if (unlikely(faulted)) {
- ftrace_graph_stop();
- WARN_ON(1);
- return;
- }
+ if (!function_graph_enter(*parent, ip, frame_pointer, parent))
+ *parent = return_hooker;
+}
- if (ftrace_push_return_trace(old, self_addr, &trace.depth,
- frame_pointer) == -EBUSY) {
- *parent = old;
+#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
+ unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
+ unsigned long *parent = (unsigned long *)stack;
+
+ if (unlikely(skip_ftrace_return()))
return;
- }
- trace.func = self_addr;
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace)) {
- current->curr_ret_stack--;
- *parent = old;
- }
+ if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs))
+ *parent = return_hooker;
}
+#endif
+
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
new file mode 100644
index 000000000000..f4e0c3361234
--- /dev/null
+++ b/arch/x86/kernel/ftrace_32.S
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2017 Steven Rostedt, VMware Inc.
+ */
+
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+#include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define MCOUNT_FRAME 1 /* using frame = true */
+#else
+# define MCOUNT_FRAME 0 /* using frame = false */
+#endif
+
+SYM_FUNC_START(__fentry__)
+ RET
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
+
+SYM_CODE_START(ftrace_caller)
+
+#ifdef CONFIG_FRAME_POINTER
+ /*
+ * Frame pointer entries consist of ip followed by bp.
+ * Since fentry is an immediate jump, we are left with
+ * parent-ip, function-ip. We need to add a frame with
+ * parent-ip followed by ebp.
+ */
+ pushl 4(%esp) /* parent ip */
+ pushl %ebp
+ movl %esp, %ebp
+ pushl 2*4(%esp) /* function ip */
+
+ /* For mcount, the function ip is directly above */
+ pushl %ebp
+ movl %esp, %ebp
+#endif
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+
+#ifdef CONFIG_FRAME_POINTER
+ /* Load parent ebp into edx */
+ movl 4*4(%esp), %edx
+#else
+ /* There's no frame pointer, load the appropriate stack addr instead */
+ lea 4*4(%esp), %edx
+#endif
+
+ movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */
+ /* Get the parent ip */
+ movl 4(%edx), %edx /* edx has ebp */
+
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+#ifdef CONFIG_FRAME_POINTER
+ popl %ebp
+ addl $4,%esp /* skip function ip */
+ popl %ebp /* this is the orig bp */
+ addl $4, %esp /* skip parent ip */
+#endif
+.Lftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
+ RET
+SYM_CODE_END(ftrace_caller)
+
+SYM_CODE_START(ftrace_regs_caller)
+ /*
+ * We're here from an mcount/fentry CALL, and the stack frame looks like:
+ *
+ * <previous context>
+ * RET-IP
+ *
+ * The purpose of this function is to call out in an emulated INT3
+ * environment with a stack frame like:
+ *
+ * <previous context>
+ * gap / RET-IP
+ * gap
+ * gap
+ * gap
+ * pt_regs
+ *
+ * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds
+ */
+ subl $3*4, %esp # RET-IP + 3 gaps
+ pushl %ss # ss
+ pushl %esp # points at ss
+ addl $5*4, (%esp) # make it point at <previous context>
+ pushfl # flags
+ pushl $__KERNEL_CS # cs
+ pushl 7*4(%esp) # ip <- RET-IP
+ pushl $0 # orig_eax
+
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ ENCODE_FRAME_POINTER
+
+ movl PT_EIP(%esp), %eax # 1st argument: IP
+ subl $MCOUNT_INSN_SIZE, %eax
+ movl 21*4(%esp), %edx # 2nd argument: parent ip
+ movl function_trace_op, %ecx # 3rd argument: ftrace_ops
+ pushl %esp # 4th argument: pt_regs
+
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
+ call ftrace_stub
+
+ addl $4, %esp # skip 4th argument
+
+ /* place IP below the new SP */
+ movl PT_OLDESP(%esp), %eax
+ movl PT_EIP(%esp), %ecx
+ movl %ecx, -4(%eax)
+
+ /* place EAX below that */
+ movl PT_EAX(%esp), %ecx
+ movl %ecx, -8(%eax)
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+
+ lea -8(%eax), %esp
+ popl %eax
+
+ jmp .Lftrace_ret
+SYM_CODE_END(ftrace_regs_caller)
+
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+SYM_CODE_START(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 3*4(%esp), %eax
+ /* Even with frame pointers, fentry doesn't have one here */
+ lea 4*4(%esp), %edx
+ movl $0, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+SYM_CODE_END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ subl $(PTREGS_SIZE), %esp
+ movl $0, PT_EBP(%esp)
+ movl %edx, PT_EDX(%esp)
+ movl %eax, PT_EAX(%esp)
+ movl %esp, %eax
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ movl PT_EAX(%esp), %eax
+ movl PT_EDX(%esp), %edx
+ addl $(PTREGS_SIZE), %esp
+ JMP_NOSPEC ecx
+#endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
new file mode 100644
index 000000000000..a132608265f6
--- /dev/null
+++ b/arch/x86/kernel/ftrace_64.S
@@ -0,0 +1,402 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/export.h>
+#include <linux/cfi_types.h>
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
+#include <asm/unwind_hints.h>
+#include <asm/frame.h>
+
+ .code64
+ .section .text, "ax"
+
+#ifdef CONFIG_FRAME_POINTER
+/* Save parent and function stack frames (rip and rbp) */
+# define MCOUNT_FRAME_SIZE (8+16*2)
+#else
+/* No need to save a stack frame */
+# define MCOUNT_FRAME_SIZE 0
+#endif /* CONFIG_FRAME_POINTER */
+
+/* Size of stack used to save mcount regs in save_mcount_regs */
+#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE)
+
+/*
+ * gcc -pg option adds a call to 'mcount' in most functions.
+ * When -mfentry is used, the call is to 'fentry' and not 'mcount'
+ * and is done before the function's stack frame is set up.
+ * They both require a set of regs to be saved before calling
+ * any C code and restored before returning back to the function.
+ *
+ * On boot up, all these calls are converted into nops. When tracing
+ * is enabled, the call can jump to either ftrace_caller or
+ * ftrace_regs_caller. Callbacks (tracing functions) that require
+ * ftrace_regs_caller (like kprobes) need to have pt_regs passed to
+ * them. For this reason, a pt_regs-sized frame will be
+ * allocated on the stack and the required mcount registers will
+ * be saved in the locations that pt_regs defines for them.
+ */
+
+/*
+ * @added: the amount of stack added before calling this
+ *
+ * After this is called, the following registers contain:
+ *
+ * %rdi - holds the address that called the trampoline
+ * %rsi - holds the parent function (traced function's return address)
+ * %rdx - holds the original %rbp
+ */
+.macro save_mcount_regs added=0
+
+#ifdef CONFIG_FRAME_POINTER
+ /* Save the original rbp */
+ pushq %rbp
+
+ /*
+ * Stack traces will stop at the ftrace trampoline if the frame pointer
+ * is not set up properly. If fentry is used, we need to save a frame
+ * pointer for the parent as well as the function traced, because the
+ * fentry is called before the stack frame is set up, whereas mcount
+ * is called afterward.
+ */
+
+ /* Save the parent pointer (skip orig rbp and our return address) */
+ pushq \added+8*2(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
+ /* Save the return address (now skip orig rbp, rbp and parent) */
+ pushq \added+8*3(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
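+
+ /*
+ * Stack layout at this point (fentry with frame pointers), matching
+ * MCOUNT_FRAME_SIZE = 8+16*2, from higher to lower addresses:
+ * orig rbp | parent RET-IP | rbp | function RET-IP | rbp <- %rbp
+ */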
+#endif /* CONFIG_FRAME_POINTER */
+
+ /*
+ * We add enough stack to save all regs.
+ */
+ subq $(FRAME_SIZE), %rsp
+ movq %rax, RAX(%rsp)
+ movq %rcx, RCX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rsi, RSI(%rsp)
+ movq %rdi, RDI(%rsp)
+ movq %r8, R8(%rsp)
+ movq %r9, R9(%rsp)
+ movq $0, ORIG_RAX(%rsp)
+ /*
+ * Save the original RBP. Even though the mcount ABI does not
+ * require this, it helps out callers.
+ */
+#ifdef CONFIG_FRAME_POINTER
+ movq MCOUNT_REG_SIZE-8(%rsp), %rdx
+#else
+ movq %rbp, %rdx
+#endif
+ movq %rdx, RBP(%rsp)
+
+ /* Copy the parent address into %rsi (second parameter) */
+ movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi
+
+ /* Move RIP to its proper location */
+ movq MCOUNT_REG_SIZE+\added(%rsp), %rdi
+ movq %rdi, RIP(%rsp)
+
+ /*
+ * Now %rdi (the first parameter) has the return address of
+ * where ftrace_call returns. But the callbacks expect the
+ * address of the call itself.
+ */
+ subq $MCOUNT_INSN_SIZE, %rdi
+ .endm
+
+.macro restore_mcount_regs save=0
+
+ /* ftrace_regs_caller or frame pointers require this */
+ movq RBP(%rsp), %rbp
+
+ movq R9(%rsp), %r9
+ movq R8(%rsp), %r8
+ movq RDI(%rsp), %rdi
+ movq RSI(%rsp), %rsi
+ movq RDX(%rsp), %rdx
+ movq RCX(%rsp), %rcx
+ movq RAX(%rsp), %rax
+
+ addq $MCOUNT_REG_SIZE-\save, %rsp
+
+ .endm
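+
+/*
+ * Note: \save bytes of the frame are left in place; the direct-call path in
+ * ftrace_regs_caller passes save=8 to leave one word on the stack for RET.
+ */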
+
+SYM_TYPED_FUNC_START(ftrace_stub)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+SYM_TYPED_FUNC_START(ftrace_stub_graph)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_graph)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
+
+SYM_FUNC_START(ftrace_caller)
+ ANNOTATE_NOENDBR
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+ CALL_DEPTH_ACCOUNT
+
+ /* Stack - skipping return address of ftrace_caller */
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+ /* Only ops with REGS flag set should have CS register set */
+ movq $0, CS(%rsp)
+
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
+SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ call ftrace_stub
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs
+
+ /*
+ * The code up to this label is copied into trampolines so
+ * think twice before adding any new code or changing the
+ * layout here.
+ */
+SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ RET
+SYM_FUNC_END(ftrace_caller);
+STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
+
+SYM_FUNC_START(ftrace_regs_caller)
+ ANNOTATE_NOENDBR
+ /* Save the current flags before any operations that can change them */
+ pushfq
+
+ /* added 8 bytes to save flags */
+ save_mcount_regs 8
+ /* save_mcount_regs fills in first two parameters */
+
+ CALL_DEPTH_ACCOUNT
+
+SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq MCOUNT_REG_SIZE(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address and flags */
+ leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ ENCODE_FRAME_POINTER
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
+SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBX(%rsp), %rbx
+
+ movq ORIG_RAX(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE-8(%rsp)
+
+ /*
+ * If ORIG_RAX is anything but zero, make this a call to that.
+ * See arch_ftrace_set_direct_caller().
+ */
+ testq %rax, %rax
+SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ jnz 1f
+
+ restore_mcount_regs
+ /* Restore flags */
+ popfq
+
+ /*
+ * The trampoline will add the return.
+ */
+SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ RET
+
+1:
+ testb $1, %al
+ jz 2f
+ andq $0xfffffffffffffffe, %rax
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
+ restore_mcount_regs
+ /* Restore flags */
+ popfq
+ RET
+
+ /* Swap the flags with orig_rax */
+2: movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
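+ /*
+ * After restore_mcount_regs 8 and the popfq below, the direct-call
+ * address is all that remains on top of the frame, with the original
+ * return address still beneath it.
+ */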
+
+ restore_mcount_regs 8
+ /* Restore flags */
+ popfq
+ UNWIND_HINT_FUNC
+
+ /*
+ * The above left an extra return value on the stack; effectively
+ * doing a tail-call without using a register. This PUSH;RET
+ * pattern unbalances the RSB; inject a pointless CALL to rebalance it.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ CALL .Ldo_rebalance
+ int3
+.Ldo_rebalance:
+ add $8, %rsp
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
+
+SYM_FUNC_END(ftrace_regs_caller)
+STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
+
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+ ANNOTATE_NOENDBR
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+SYM_FUNC_START(__fentry__)
+ ANNOTATE_NOENDBR
+ CALL_DEPTH_ACCOUNT
+
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+ RET
+
+trace:
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+ /*
+ * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
+ * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
+ * ip and parent ip are used and the list function is called when
+ * function tracing is enabled.
+ */
+ movq ftrace_trace_function, %r8
+ CALL_NOSPEC r8
+ restore_mcount_regs
+
+ jmp ftrace_stub
+SYM_FUNC_END(__fentry__)
+EXPORT_SYMBOL(__fentry__)
+STACK_FRAME_NON_STANDARD_FP(__fentry__)
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+SYM_CODE_START(return_to_handler)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+
+ /* Restore return_to_handler value that got eaten by previous ret instruction. */
+ subq $8, %rsp
+ UNWIND_HINT_FUNC
+
+ /* Save ftrace_regs for function exit context */
+ subq $(FRAME_SIZE), %rsp
+
+ movq %rax, RAX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rsp, RSP(%rsp)
+ movq %rsp, %rdi
+
+ call ftrace_return_to_handler
+
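+ /* %rax now holds the original return address; move it out of the way. */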
+ movq %rax, %rdi
+ movq RDX(%rsp), %rdx
+ movq RAX(%rsp), %rax
+
+ addq $(FRAME_SIZE) + 8, %rsp
+
+ /*
+ * Jump back to the old return address. This cannot be JMP_NOSPEC rdi
+ * since IBT would demand that it contain ENDBR, which simply isn't so for
+ * return addresses. Use a retpoline here to keep the RSB balanced.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call .Ldo_rop
+ int3
+.Ldo_rop:
+ mov %rdi, (%rsp)
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
+SYM_CODE_END(return_to_handler)
+#endif
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
deleted file mode 100644
index 3e66bd364a9d..000000000000
--- a/arch/x86/kernel/head.c
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-
-#include <asm/setup.h>
-#include <asm/bios_ebda.h>
-
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved");
-}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..375f2d7f1762 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/arch/i386/kernel/head32.c -- prepare to run common code
*
@@ -8,66 +9,162 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
#include <linux/mm.h>
+#include <linux/memblock.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/sections.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/page.h>
-#include <asm/trampoline.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
+#include <asm/microcode.h>
+#include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
static void __init i386_default_early_setup(void)
{
- /* Initilize 32bit specific setup functions */
- x86_init.resources.probe_roms = probe_roms;
+ /* Initialize 32bit specific setup functions */
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
-
- reserve_ebda_region();
}
-void __init i386_start_kernel(void)
+#ifdef CONFIG_MICROCODE_INITRD32
+unsigned long __initdata initrd_start_early;
+static pte_t __initdata *initrd_pl2p_start, *initrd_pl2p_end;
+
+static void zap_early_initrd_mapping(void)
{
-#ifdef CONFIG_X86_TRAMPOLINE
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
- "EX TRAMPOLINE");
-#endif
+ pte_t *pl2p = initrd_pl2p_start;
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ for (; pl2p < initrd_pl2p_end; pl2p++) {
+ *pl2p = (pte_t){ .pte = 0 };
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ if (!IS_ENABLED(CONFIG_X86_PAE))
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = (pte_t) {.pte = 0};
}
+}
+#else
+static inline void zap_early_initrd_mapping(void) { }
#endif
+asmlinkage __visible void __init __noreturn i386_start_kernel(void)
+{
+ /* Make sure IDT is set up before any exception happens */
+ idt_setup_early_handler();
+
+ load_ucode_bsp();
+ zap_early_initrd_mapping();
+
+ cr4_init_shadow();
+
+ sanitize_boot_params(&boot_params);
+
+ x86_early_init_platform_quirks();
+
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
- case X86_SUBARCH_MRST:
- x86_mrst_early_setup();
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ case X86_SUBARCH_CE4100:
+ x86_ce4100_early_setup();
break;
default:
i386_default_early_setup();
break;
}
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
start_kernel();
}
+
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD. Note the upper half of each PMD or PTE are
+ * always zero at this stage.
+ */
+#ifdef CONFIG_X86_PAE
+typedef pmd_t pl2_t;
+#define pl2_base initial_pg_pmd
+#define SET_PL2(val) { .pmd = (val), }
+#else
+typedef pgd_t pl2_t;
+#define pl2_base initial_page_table
+#define SET_PL2(val) { .pgd = (val), }
+#endif
+
+static __init __no_stack_protector pte_t init_map(pte_t pte, pte_t **ptep, pl2_t **pl2p,
+ const unsigned long limit)
+{
+ while ((pte.pte & PTE_PFN_MASK) < limit) {
+ pl2_t pl2 = SET_PL2((unsigned long)*ptep | PDE_IDENT_ATTR);
+ int i;
+
+ **pl2p = pl2;
+ if (!IS_ENABLED(CONFIG_X86_PAE)) {
+ /* Kernel PDE entry */
+ *(*pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+ }
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ **ptep = pte;
+ pte.pte += PAGE_SIZE;
+ (*ptep)++;
+ }
+ (*pl2p)++;
+ }
+ return pte;
+}
+
+void __init __no_stack_protector mk_early_pgtbl_32(void)
+{
+ /* Enough space to fit pagetables for the low memory linear map */
+ unsigned long limit = __pa_nodebug(_end) + (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+ pte_t pte, *ptep = (pte_t *)__pa_nodebug(__brk_base);
+ struct boot_params __maybe_unused *params;
+ pl2_t *pl2p = (pl2_t *)__pa_nodebug(pl2_base);
+ unsigned long *ptr;
+
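+ /*
+ * This runs before paging is enabled, so every global has to be
+ * accessed through its physical address (hence the __pa_nodebug()s).
+ */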
+ pte.pte = PTE_IDENT_ATTR;
+ pte = init_map(pte, &ptep, &pl2p, limit);
+
+ ptr = (unsigned long *)__pa_nodebug(&max_pfn_mapped);
+ /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
+ *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+
+ ptr = (unsigned long *)__pa_nodebug(&_brk_end);
+ *ptr = (unsigned long)ptep + PAGE_OFFSET;
+
+#ifdef CONFIG_MICROCODE_INITRD32
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image)
+ return;
+
+ /* Save the virtual start address */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_start_early);
+ *ptr = (pte.pte & PTE_PFN_MASK) + PAGE_OFFSET;
+ *ptr += ((unsigned long)params->hdr.ramdisk_image) & ~PAGE_MASK;
+
+ /* Save pl2p start for cleanup */
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_start);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+
+ limit = (unsigned long)params->hdr.ramdisk_image;
+ pte.pte = PTE_IDENT_ATTR | PFN_ALIGN(limit);
+ limit = (unsigned long)params->hdr.ramdisk_image + params->hdr.ramdisk_size;
+
+ init_map(pte, &ptep, &pl2p, limit);
+
+ ptr = (unsigned long *)__pa_nodebug(&initrd_pl2p_end);
+ *ptr = (unsigned long)pl2p + PAGE_OFFSET;
+#endif
+}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd614..fd28b53dbac5 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -1,9 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* prepare to run common code
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
@@ -12,112 +16,308 @@
#include <linux/percpu.h>
#include <linux/start_kernel.h>
#include <linux/io.h>
+#include <linux/memblock.h>
+#include <linux/cc_platform.h>
+#include <linux/pgtable.h>
+#include <asm/asm.h>
+#include <asm/page_64.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
#include <asm/desc.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
-#include <asm/e820.h>
-#include <asm/trampoline.h>
+#include <asm/e820/api.h>
#include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
+#include <asm/kasan.h>
+#include <asm/fixmap.h>
+#include <asm/realmode.h>
+#include <asm/extable.h>
+#include <asm/trapnr.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/init.h>
+
+/*
+ * Manage page tables very early on.
+ */
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+unsigned int __initdata next_early_pgt;
+SYM_PIC_ALIAS(next_early_pgt);
+pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+
+unsigned int __pgtable_l5_enabled __ro_after_init;
+SYM_PIC_ALIAS(__pgtable_l5_enabled);
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+SYM_PIC_ALIAS(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+SYM_PIC_ALIAS(ptrs_per_p4d);
-static void __init zap_identity_mappings(void)
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
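+ /* All slots except the last one, which maps the kernel image. */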
+ next_early_pgt = 0;
+ write_cr3(__sme_pa_nodebug(early_top_pgt));
+}
+
+/* Create a new PMD entry */
+bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
+ pudval_t pud, *pud_p;
+ pmdval_t *pmd_p;
+
+ /* Invalid address, or early pgt is done? */
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
+ return false;
+
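+ /*
+ * If the EARLY_DYNAMIC_PAGE_TABLES pool runs out below, the early
+ * page tables are reset wholesale and rebuilt on demand, one page
+ * fault at a time.
+ */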
+again:
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (!pgtable_l5_enabled())
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd_p[pmd_index(address)] = pmd;
+
+ return true;
+}
+
+static bool __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pmdval_t pmd;
+
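+ /* One large (2M) PMD entry covers the whole area around the fault. */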
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+ return __early_make_pgtable(address, pmd);
+}
+
+void __init do_early_exception(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr == X86_TRAP_PF &&
+ early_make_pgtable(native_read_cr2()))
+ return;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) &&
+ trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
+ return;
+
+ if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs))
+ return;
+
+ early_fixup_exception(regs, trapnr);
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
-static void __init clear_bss(void)
+void __init clear_bss(void)
{
memset(__bss_start, 0,
(unsigned long) __bss_stop - (unsigned long) __bss_start);
+ memset(__brk_base, 0,
+ (unsigned long) __brk_limit - (unsigned long) __brk_base);
+}
+
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
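+ /* The setup header holds only the low 32 bits; ext_cmd_line_ptr has the rest. */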
+ cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
}
static void __init copy_bootdata(char *real_mode_data)
{
char * command_line;
+ unsigned long cmd_line_ptr;
+
+ /*
+ * If SME is active, this will create decrypted mappings of the
+ * boot data in advance of the copy operations.
+ */
+ sme_map_bootdata(real_mode_data);
- memcpy(&boot_params, real_mode_data, sizeof boot_params);
- if (boot_params.hdr.cmd_line_ptr) {
- command_line = __va(boot_params.hdr.cmd_line_ptr);
+ memcpy(&boot_params, real_mode_data, sizeof(boot_params));
+ sanitize_boot_params(&boot_params);
+ cmd_line_ptr = get_cmd_line_ptr();
+ if (cmd_line_ptr) {
+ command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
+
+ /*
+ * The old boot data is no longer needed and won't be reserved,
+ * freeing up that memory for use by the system. If SME is active,
+ * we need to remove the mappings that were created so that the
+ * memory doesn't remain mapped as decrypted.
+ */
+ sme_unmap_bootdata(real_mode_data);
}
-void __init x86_64_start_kernel(char * real_mode_data)
+asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data)
{
- int i;
-
/*
* Build-time sanity checks on the kernel image and module
* area mappings. (these are purely build-time and produce no code)
*/
- BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
- BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
+ BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
- BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
- /* clear bss before set_intr_gate with early_idt_handler */
+ cr4_init_shadow();
+
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
+ if (pgtable_l5_enabled()) {
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+ }
+
clear_bss();
- /* Make NULL pointers segfault */
- zap_identity_mappings();
+ /*
+ * This needs to happen *before* kasan_early_init() because the latter maps stuff
+ * into that page.
+ */
+ clear_page(init_top_pgt);
- /* Cleanup the over mapped high alias */
- cleanup_highmap();
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
- for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
-#ifdef CONFIG_EARLY_PRINTK
- set_intr_gate(i, &early_idt_handlers[i]);
-#else
- set_intr_gate(i, early_idt_handler);
-#endif
- }
- load_idt((const struct desc_ptr *)&idt_descr);
+ kasan_early_init();
- if (console_loglevel == 10)
- early_printk("Kernel alive\n");
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
+ idt_setup_early_handler();
+
+ /* Needed before cc_platform_has() can be used for TDX */
+ tdx_early_init();
+
+ copy_bootdata(__va(real_mode_data));
+
+ /*
+ * Load microcode early on BSP.
+ */
+ load_ucode_bsp();
+
+ /* Set the init_top_pgt kernel high mapping */
+ init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data);
}
-void __init x86_64_start_reservations(char *real_mode_data)
+void __init __noreturn x86_64_start_reservations(char *real_mode_data)
{
- copy_bootdata(__va(real_mode_data));
+ /* version is never zero once boot_params has been copied */
+ if (!boot_params.hdr.version)
+ copy_bootdata(__va(real_mode_data));
- reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ x86_early_init_platform_quirks();
-#ifdef CONFIG_BLK_DEV_INITRD
- /* Reserve INITRD */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
- /* Assume only end is not page aligned */
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ default:
+ break;
}
-#endif
- reserve_ebda_region();
+ start_kernel();
+}
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
+void early_setup_idt(void)
+{
+ void *handler = NULL;
- start_kernel();
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ setup_ghcb();
+ handler = vc_boot_ghcb;
+ }
+
+ __pi_startup_64_load_idt(handler);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..80ef5d386b03 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
*
* Copyright (C) 1991, 1992 Linus Torvalds
@@ -7,6 +8,7 @@
*/
.text
+#include <linux/export.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <linux/linkage.h>
@@ -19,8 +21,12 @@
#include <asm/setup.h>
#include <asm/processor-flags.h>
#include <asm/msr-index.h>
-#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
+#include <asm/nospec-branch.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable_32.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -32,46 +38,20 @@
#define X86 new_cpu_data+CPUINFO_x86
#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
#define X86_MODEL new_cpu_data+CPUINFO_x86_model
-#define X86_MASK new_cpu_data+CPUINFO_x86_mask
+#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping
#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
/*
- * This is how much memory in addition to the memory covered up to
- * and including _end we need mapped initially.
- * We need:
- * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
- * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- *
- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
- * and small than max_low_pfn, otherwise will waste some page table entries
- */
-
-#if PTRS_PER_PMD > 1
-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
-#else
-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
-#endif
-
-/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = \
- PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
-
-/*
* Worst-case size of the kernel mapping we need to make:
- * the worst-case size of the kernel itself, plus the extra we need
- * to map for the linear map.
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
*/
-KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+KERNEL_PAGES = LOWMEM_PAGES
-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
RESERVE_BRK(pagetables, INIT_MAP_SIZE)
/*
@@ -81,13 +61,10 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
* any particular GDT layout, because we load our own as soon as we
* can.
*/
-__HEAD
-ENTRY(startup_32)
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 2f
-
+ __INIT
+SYM_CODE_START(startup_32)
+ movl pa(initial_stack),%ecx
+
/*
* Set segments to known values.
*/
@@ -97,7 +74,8 @@ ENTRY(startup_32)
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
-2:
+ movl %eax,%ss
+ leal -__PAGE_OFFSET(%ecx),%esp
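+ /* initial_stack holds a virtual address; rebase it while paging is still off. */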
/*
* Clear BSS first so that there are no surprises...
@@ -108,7 +86,7 @@ ENTRY(startup_32)
movl $pa(__bss_stop),%ecx
subl %edi,%ecx
shrl $2,%ecx
- rep ; stosl
+ rep stosl
/*
* Copy bootup parameters out of the way.
* Note: %esi still has the pointer to the real-mode data.
@@ -120,141 +98,36 @@ ENTRY(startup_32)
movl $pa(boot_params),%edi
movl $(PARAM_SIZE/4),%ecx
cld
- rep
- movsl
+ rep movsl
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
- jz 1f # No comand line
+ jz 1f # No command line
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
- rep
- movsl
+ rep movsl
1:
-#ifdef CONFIG_PARAVIRT
- /* This is can only trip for a broken bootloader... */
- cmpw $0x207, pa(boot_params + BP_version)
- jb default_entry
-
- /* Paravirt-compatible boot parameters. Look to see what architecture
- we're booting under. */
- movl pa(boot_params + BP_hardware_subarch), %eax
- cmpl $num_subarch_entries, %eax
- jae bad_subarch
-
- movl pa(subarch_entries)(,%eax,4), %eax
- subl $__PAGE_OFFSET, %eax
- jmp *%eax
-
-bad_subarch:
-WEAK(lguest_entry)
-WEAK(xen_entry)
- /* Unknown implementation; there's really
- nothing we can do at this point. */
- ud2a
-
- __INITDATA
-
-subarch_entries:
- .long default_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
- .long xen_entry /* Xen hypervisor */
- .long default_entry /* Moorestown MID */
-num_subarch_entries = (. - subarch_entries) / 4
-.previous
-#endif /* CONFIG_PARAVIRT */
-
-/*
- * Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond __brk_base. The variable
- * _brk_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end.
- *
- * Note that the stack is not yet set up!
- */
-default_entry:
-#ifdef CONFIG_X86_PAE
+#ifdef CONFIG_OLPC
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
- /*
- * In PAE mode swapper_pg_dir is statically defined to contain enough
- * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
- * entries). The identity mapping is handled by pointing two PGD
- * entries to the first kernel PMD.
- *
- * Note the upper half of each PMD or PTE are always zero at
- * this stage.
- */
+ /* Create early pagetables. */
+ call mk_early_pgtbl_32
+ /* Do early initialization of the fixmap area */
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#ifdef CONFIG_X86_PAE
#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
+#else
+ movl %eax,pa(initial_page_table+0xffc)
+#endif
- xorl %ebx,%ebx /* %ebx is kept at zero */
-
- movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_pmd), %edx
- movl $PTE_IDENT_ATTR, %eax
-10:
- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
- movl %ecx,(%edx) /* Store PMD entry */
- /* Upper half already zero */
- addl $8,%edx
- movl $512,%ecx
-11:
- stosl
- xchgl %eax,%ebx
- stosl
- xchgl %eax,%ebx
- addl $0x1000,%eax
- loop 11b
-
- /*
- * End condition: we must map up to the end + MAPPING_BEYOND_END.
- */
- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
- cmpl %ebp,%eax
- jb 10b
-1:
- addl $__PAGE_OFFSET, %edi
- movl %edi, pa(_brk_end)
- shrl $12, %eax
- movl %eax, pa(max_pfn_mapped)
-
- /* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
-#else /* Not PAE */
-
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
- movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_dir), %edx
- movl $PTE_IDENT_ATTR, %eax
-10:
- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
- movl %ecx,(%edx) /* Store identity PDE entry */
- movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
- addl $4,%edx
- movl $1024, %ecx
-11:
- stosl
- addl $0x1000,%eax
- loop 11b
- /*
- * End condition: we must map up to the end + MAPPING_BEYOND_END.
- */
- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
- cmpl %ebp,%eax
- jb 10b
- addl $__PAGE_OFFSET, %edi
- movl %edi, pa(_brk_end)
- shrl $12, %eax
- movl %eax, pa(max_pfn_mapped)
+ jmp .Ldefault_entry
+SYM_CODE_END(startup_32)
- /* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_dir+0xffc)
-#endif
- jmp 3f
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
@@ -263,44 +136,64 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-
-__CPUINIT
-
-#ifdef CONFIG_SMP
-ENTRY(startup_32_smp)
+#ifdef CONFIG_HOTPLUG_CPU
+ .text
+#endif
+SYM_FUNC_START(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
movl %eax,%ds
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
-#endif /* CONFIG_SMP */
-3:
+ movl pa(initial_stack),%ecx
+ movl %eax,%ss
+ leal -__PAGE_OFFSET(%ecx),%esp
+
+.Ldefault_entry:
+ movl $(CR0_STATE & ~X86_CR0_PG),%eax
+ movl %eax,%cr0
/*
- * New page tables may be in 4Mbyte page mode and may
- * be using the global pages.
- *
- * NOTE! If we are on a 486 we may have no cr4 at all!
- * So we do not try to touch it unless we really have
- * some bits in it to set. This won't work if the BSP
- * implements cr4 but this AP does not -- very unlikely
- * but be warned! The same applies to the pse feature
- * if not equally supported. --macro
+ * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
+ * bits like NT set. This would confuse the debugger if this code is traced. So
+ * initialize them properly now before switching to protected mode. That means
+ * DF in particular (even though we have cleared it earlier after copying the
+ * command line) because GCC expects it.
+ */
+ pushl $0
+ popfl
+
+/*
+ * New page tables may be in 4Mbyte page mode and may be using the global pages.
*
- * NOTE! We have to correct for the fact that we're
- * not yet offset PAGE_OFFSET..
+ * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
+ * if and only if CPUID exists and has flags other than the FPU flag set.
*/
-#define cr4_bits pa(mmu_cr4_features)
- movl cr4_bits,%edx
- andl %edx,%edx
- jz 6f
- movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
- orl %edx,%eax
+ movl $-1,pa(X86_CPUID) # preset CPUID level
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl # set EFLAGS=ID
+ pushfl
+ popl %eax # get EFLAGS
+ testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remain set?
+ jz .Lenable_paging # hw disallowed setting of ID bit
+ # which means no CPUID and no CR4
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,pa(X86_CPUID) # save largest std CPUID function
+
+ movl $1,%eax
+ cpuid
+ andl $~1,%edx # Ignore CPUID.FPU
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
+
+ movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
- jz 6f
+ jz .Lenable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
@@ -308,12 +201,16 @@ ENTRY(startup_32_smp)
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
- ja 6f
+ ja .Lenable_paging
+
+ /* Clear bogus XD_DISABLE bits */
+ call verify_cpu
+
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
btl $(X86_FEATURE_NX & 31), %edx
- jnc 6f
+ jnc .Lenable_paging
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
@@ -323,71 +220,26 @@ ENTRY(startup_32_smp)
/* Make changes effective */
wrmsr
-6:
+.Lenable_paging:
/*
* Enable paging
*/
- movl $pa(swapper_pg_dir),%eax
+ movl $pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
- movl %cr0,%eax
- orl $X86_CR0_PG,%eax
+ movl $CR0_STATE,%eax
movl %eax,%cr0 /* ..and set paging (PG) bit */
ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
1:
- /* Set up the stack pointer */
- lss stack_start,%esp
-
-/*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
- pushl $0
- popfl
-
-#ifdef CONFIG_SMP
- cmpb $0, ready
- jz 1f /* Initial CPU cleans BSS */
- jmp checkCPUtype
-1:
-#endif /* CONFIG_SMP */
-
-/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
- call setup_idt
-
-checkCPUtype:
-
- movl $-1,X86_CPUID # -1 for no CPUID initially
+ /* Shift the stack pointer to a virtual address */
+ addl $__PAGE_OFFSET, %esp
-/* check if it is 486 or 386. */
/*
- * XXX - this does a lot of unnecessary setup. Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
+ * Check if it is 486
*/
-
- movb $3,X86 # at least 386
- pushfl # push EFLAGS
- popl %eax # get EFLAGS
- movl %eax,%ecx # save original EFLAGS
- xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
- pushl %eax # copy to EFLAGS
- popfl # set EFLAGS
- pushfl # get new EFLAGS
- popl %eax # put it in eax
- xorl %ecx,%eax # change in flags
- pushl %ecx # restore original EFLAGS
- popfl
- testl $0x40000,%eax # check if AC bit changed
- je is386
-
- movb $4,X86 # at least 486
- testl $0x200000,%eax # check if ID bit changed
- je is486
+ movb $4,X86 # at least 486
+ cmpl $-1,X86_CPUID
+ je .Lis486
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
@@ -398,7 +250,7 @@ checkCPUtype:
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
orl %eax,%eax # do we have processor info as well?
- je is486
+ je .Lis486
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
@@ -409,21 +261,17 @@ checkCPUtype:
shrb $4,%al
movb %al,X86_MODEL
 andb $0x0f,%cl # mask revision
- movb %cl,X86_MASK
+ movb %cl,X86_STEPPING
movl %edx,X86_CAPABILITY
-is486: movl $0x50022,%ecx # set AM, WP, NE and MP
- jmp 2f
-
-is386: movl $2,%ecx # set MP
-2: movl %cr0,%eax
+.Lis486:
+ movl $0x50022,%ecx # set AM, WP, NE and MP
+ movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
orl %ecx,%eax
movl %eax,%cr0
- call check_x87
lgdt early_gdt_descr
- lidt idt_descr
ljmp $(__KERNEL_CS),$1f
1: movl $(__KERNEL_DS),%eax # reload all the segment registers
movl %eax,%ss # after changing gdt.
@@ -435,144 +283,91 @@ is386: movl $2,%ecx # set MP
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
-#ifdef CONFIG_CC_STACKPROTECTOR
- /*
- * The linker can't handle this by relocation. Manually set
- * base address in stack canary segment descriptor.
- */
- cmpb $0,ready
- jne 1f
- movl $gdt_page,%eax
- movl $stack_canary,%ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-1:
-#endif
- movl $(__KERNEL_STACK_CANARY),%eax
- movl %eax,%gs
+ xorl %eax,%eax
+ movl %eax,%gs # clear possible garbage in %gs
xorl %eax,%eax # Clear LDT
lldt %ax
- cld # gcc2 wants the direction flag cleared at all times
- pushl $0 # fake return address for unwinder
-#ifdef CONFIG_SMP
- movb ready, %cl
- movb $1, ready
- cmpb $0,%cl # the first CPU calls start_kernel
- je 1f
- movl (stack_start), %esp
-1:
-#endif /* CONFIG_SMP */
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
+SYM_FUNC_END(startup_32_smp)
+
+#include "verify_cpu.S"
+
+__INIT
+SYM_FUNC_START(early_idt_handler_array)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+ # 24(%esp) error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler_common
+ i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
+ .endr
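+ # Each stub is padded to EARLY_IDT_HANDLER_SIZE bytes, so entry i sits
+ # at early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE.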
+SYM_FUNC_END(early_idt_handler_array)
+
+SYM_CODE_START_LOCAL(early_idt_handler_common)
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
-/*
- * We depend on ET to be correct. This checks for 287/387.
- */
-check_x87:
- movb $0,X86_HARD_MATH
- clts
- fninit
- fstsw %ax
- cmpb $0,%al
- je 1f
- movl %cr0,%eax /* no coprocessor: have to set bits */
- xorl $4,%eax /* set EM */
- movl %eax,%cr0
- ret
- ALIGN
-1: movb $1,X86_HARD_MATH
- .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
- ret
+ incl %ss:early_recursion_flag
+
+ /* The vector number is in pt_regs->gs */
-/*
- * setup_idt
- *
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
- *
- * Warning: %esi is live across this function.
- */
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
-
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
- movl %eax,(%edi)
- movl %edx,4(%edi)
- addl $8,%edi
- dec %ecx
- jne rp_sidt
-
-.macro set_early_handler handler,trapno
- lea \handler,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
- lea idt_table,%edi
- movl %eax,8*\trapno(%edi)
- movl %edx,8*\trapno+4(%edi)
-.endm
-
- set_early_handler handler=early_divide_err,trapno=0
- set_early_handler handler=early_illegal_opcode,trapno=6
- set_early_handler handler=early_protection_fault,trapno=13
- set_early_handler handler=early_page_fault,trapno=14
-
- ret
-
-early_divide_err:
- xor %edx,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
-
-early_illegal_opcode:
- movl $6,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
-
-early_protection_fault:
- movl $13,%edx
- jmp early_fault
-
-early_page_fault:
- movl $14,%edx
- jmp early_fault
-
-early_fault:
cld
-#ifdef CONFIG_PRINTK
- pusha
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- cmpl $2,early_recursion_flag
- je hlt_loop
- incl early_recursion_flag
- movl %cr2,%eax
- pushl %eax
- pushl %edx /* trapno */
- pushl $fault_msg
- call printk
-#endif
- call dump_stack
-hlt_loop:
- hlt
- jmp hlt_loop
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
+ pushl %eax /* pt_regs->ax */
+ pushl %ebp /* pt_regs->bp */
+ pushl %edi /* pt_regs->di */
+ pushl %esi /* pt_regs->si */
+ pushl %edx /* pt_regs->dx */
+ pushl %ecx /* pt_regs->cx */
+ pushl %ebx /* pt_regs->bx */
+
+ /* Fix up DS and ES */
+ movl $(__KERNEL_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+
+ /* Load the vector number into EDX */
+ movl PT_GS(%esp), %edx
+
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
+ movw %gs, PT_GS(%esp)
+
+ movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
+ call early_fixup_exception
+
+ popl %ebx /* pt_regs->bx */
+ popl %ecx /* pt_regs->cx */
+ popl %edx /* pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+ popl %eax /* pt_regs->ax */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
+ decl %ss:early_recursion_flag
+ addl $4, %esp /* pop pt_regs->orig_ax */
+ iret
+SYM_CODE_END(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
- ALIGN
-ignore_int:
+SYM_FUNC_START(early_ignore_irq)
cld
#ifdef CONFIG_PRINTK
pushl %eax
@@ -591,7 +386,7 @@ ignore_int:
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
- call printk
+ call _printk
call dump_stack
@@ -604,27 +399,52 @@ ignore_int:
#endif
iret
- __REFDATA
-.align 4
-ENTRY(initial_code)
- .long i386_start_kernel
+hlt_loop:
+ hlt
+ jmp hlt_loop
+SYM_FUNC_END(early_ignore_irq)
+
+__INITDATA
+ .align 4
+SYM_DATA(early_recursion_flag, .long 0)
+
+__REFDATA
+ .align 4
+SYM_DATA(initial_code, .long i386_start_kernel)
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+#define PGD_ALIGN (2 * PAGE_SIZE)
+#define PTI_USER_PGD_FILL 1024
+#else
+#define PGD_ALIGN (PAGE_SIZE)
+#define PTI_USER_PGD_FILL 0
+#endif
/*
* BSS section
*/
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE_asm
+ .align PGD_ALIGN
#ifdef CONFIG_X86_PAE
-swapper_pg_pmd:
+.globl initial_pg_pmd
+initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
-ENTRY(swapper_pg_dir)
+.globl initial_page_table
+initial_page_table:
.fill 1024,4,0
#endif
-swapper_pg_fixmap:
+ .align PGD_ALIGN
+initial_pg_fixmap:
.fill 1024,4,0
-ENTRY(empty_zero_page)
+.globl swapper_pg_dir
+ .align PGD_ALIGN
+swapper_pg_dir:
+ .fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
+.globl empty_zero_page
+empty_zero_page:
.fill 4096,1,0
+EXPORT_SYMBOL(empty_zero_page)
/*
* This starts the data section.
@@ -632,53 +452,47 @@ ENTRY(empty_zero_page)
#ifdef CONFIG_X86_PAE
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
- .align PAGE_SIZE_asm
-ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+ .align PGD_ALIGN
+SYM_DATA_START(initial_page_table)
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
- .align PAGE_SIZE_asm /* needs to be page-sized too */
-#endif
+ .align PAGE_SIZE /* needs to be page-sized too */
-.data
-ENTRY(stack_start)
- .long init_thread_union+THREAD_SIZE
- .long __BOOT_DS
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ /*
+ * PTI needs another page so sync_initial_pagetable() works correctly
+ * and does not scribble over the data which is placed behind the
+ * actual initial_page_table. See clone_pgd_range().
+ */
+ .fill 1024, 4, 0
+#endif
-ready: .byte 0
+SYM_DATA_END(initial_page_table)
+#endif
-early_recursion_flag:
- .long 0
+.data
+.balign 4
+SYM_DATA(initial_stack, .long __top_init_kernel_stack)
+__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
-fault_msg:
-/* fault info: */
- .ascii "BUG: Int %d: CR2 %p\n"
-/* pusha regs: */
- .ascii " EDI %p ESI %p EBP %p ESP %p\n"
- .ascii " EBX %p EDX %p ECX %p EAX %p\n"
-/* fault frame: */
- .ascii " err %p EIP %p CS %p flg %p\n"
- .ascii "Stack: %p %p %p %p %p %p %p %p\n"
- .ascii " %p %p %p %p %p %p %p %p\n"
- .asciz " %p %p %p %p %p %p %p %p\n"
-
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
@@ -687,33 +501,29 @@ fault_msg:
* segment size, and 32-bit linear address value:
*/
-.globl boot_gdt_descr
-.globl idt_descr
-
+ .data
ALIGN
# early boot GDT descriptor (must use 1:1 address mapping)
.word 0 # 32 bit align gdt_desc.address
-boot_gdt_descr:
+SYM_DATA_START_LOCAL(boot_gdt_descr)
.word __BOOT_DS+7
.long boot_gdt - __PAGE_OFFSET
-
- .word 0 # 32-bit align idt_desc.address
-idt_descr:
- .word IDT_ENTRIES*8-1 # idt contains 256 entries
- .long idt_table
+SYM_DATA_END(boot_gdt_descr)
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
-ENTRY(early_gdt_descr)
+SYM_DATA_START(early_gdt_descr)
.word GDT_ENTRIES*8-1
.long gdt_page /* Overwritten for secondary CPUs */
+SYM_DATA_END(early_gdt_descr)
/*
* The boot_gdt must mirror the equivalent in setup.S and is
* used only for booting.
*/
.align L1_CACHE_BYTES
-ENTRY(boot_gdt)
+SYM_DATA_START(boot_gdt)
.fill GDT_ENTRY_BOOT_CS,8,0
.quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
.quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+SYM_DATA_END(boot_gdt)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..21816b48537c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
- * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -8,53 +9,46 @@
* Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
*/
-
+#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
+#include <linux/pgtable.h>
#include <asm/segment.h>
-#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/processor-flags.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
+#include "../entry/calling.h"
+#include <asm/nospec-branch.h>
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <asm/smp.h>
+#include <asm/thread_info.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/asm-offsets.h>
-#include <asm/paravirt.h>
-#else
-#define GET_CR2_INTO_RCX movq %cr2, %rcx
-#endif
-
-/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
- *
*/
-#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-
-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET)
-L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
-L3_START_KERNEL = pud_index(__START_KERNEL_map)
-
- .text
- __HEAD
+ __INIT
.code64
- .globl startup_64
-startup_64:
-
+SYM_CODE_START_NOALIGN(startup_64)
+ UNWIND_HINT_END_OF_STACK
/*
- * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %RSI holds the physical address of the boot_params structure
+ * provided by the bootloader. Preserve it in %R15 so C function calls
+ * will not clobber it.
*
* We come here either directly from a 64bit bootloader, or from
- * arch/x86_64/boot/compressed/head.S.
+ * arch/x86/boot/compressed/head_64.S.
*
 * We only come here initially at boot; nothing else comes here.
*
@@ -62,97 +56,101 @@ startup_64:
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
+ mov %rsi, %r15
- /* Compute the delta between the address I am compiled to run at and the
- * address I am actually running at.
- */
- leaq _text(%rip), %rbp
- subq $_text - __START_KERNEL_map, %rbp
-
- /* Is the address not 2M aligned? */
- movq %rbp, %rax
- andl $~PMD_PAGE_MASK, %eax
- testl %eax, %eax
- jnz bad_address
-
- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
- */
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ /* Set up the stack for verify_cpu() */
+ leaq __top_init_kernel_stack(%rip), %rsp
- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Set up GSBASE.
+ * Note that on SMP the boot CPU uses the init data section until
+ * the per-CPU areas are set up.
+ */
+ movl $MSR_GS_BASE, %ecx
+ xorl %eax, %eax
+ xorl %edx, %edx
+ wrmsr
- addq %rbp, level3_kernel_pgt + (510*8)(%rip)
- addq %rbp, level3_kernel_pgt + (511*8)(%rip)
+ call __pi_startup_64_setup_gdt_idt
- addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
+ /* Now switch to __KERNEL_CS so IRET works reliably */
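+ /* (CS cannot be loaded with MOV; the far return below reloads it.) */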
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
- /* Add an Identity mapping if I am above 1G */
- leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+.Lon_kernel_cs:
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_END_OF_STACK
- movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Activate SEV/SME memory encryption if supported/enabled. This needs to
+ * be done now, since this also includes setup of the SEV-SNP CPUID table,
+ * which needs to be done before any CPUID instructions are executed in
+ * subsequent code. Pass the boot_params pointer as the first argument.
+ */
+ movq %r15, %rdi
+ call __pi_sme_enable
+#endif
- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ /* Sanitize CPU configuration */
+ call verify_cpu
- movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ /*
+ * Derive the kernel's physical-to-virtual offset from the physical and
+ * virtual addresses of common_startup_64().
+ */
+ leaq common_startup_64(%rip), %rdi
+ subq .Lcommon_startup_64(%rip), %rdi
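
In C terms, the two instructions above compute the relocation delta: the RIP-relative address of common_startup_64 is its run-time physical location, while .Lcommon_startup_64 holds its link-time virtual address. A minimal sketch of the idea (the helper name is hypothetical, not kernel API):

#include <stdint.h>

/* Hypothetical illustration: runtime_pa comes from the RIP-relative
 * leaq, link_va from the .Lcommon_startup_64 literal. The difference
 * is the physical-to-virtual offset handed to __pi___startup_64().
 */
static inline uint64_t phys_virt_delta(uint64_t runtime_pa, uint64_t link_va)
{
	return runtime_pa - link_va;
}
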
/*
- * Fixup the kernel text+data virtual addresses. Note that
- * we might write invalid pmds, when the kernel is relocated
- * cleanup_highmap() fixes this up along with the mappings
- * beyond _end.
+ * Perform pagetable fixups. Additionally, if SME is active, encrypt
+ * the kernel and retrieve the modifier (SME encryption mask if SME
+ * is active) to be added to the initial pgdir entry that will be
+ * programmed into CR3.
*/
+ movq %r15, %rsi
+ call __pi___startup_64
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ leaq early_top_pgt(%rip), %rcx
+ addq %rcx, %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ mov %rax, %rdi
- leaq level2_kernel_pgt(%rip), %rdi
- leaq 4096(%rdi), %r8
- /* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
- jz 2f
- addq %rbp, 0(%rdi)
- /* Go to the next page */
-2: addq $8, %rdi
- cmp %r8, %rdi
- jne 1b
-
- /* Fixup phys_base */
- addq %rbp, phys_base(%rip)
-
-#ifdef CONFIG_X86_TRAMPOLINE
- addq %rbp, trampoline_level4_pgt + 0(%rip)
- addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ */
+ call sev_verify_cbit
#endif
- /* Due to ENTRY(), sometimes the empty space gets filled with
- * zeros. Better take a jmp than relying on empty space being
- * filled with 0x90 (nop)
+ /*
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
*/
- jmp secondary_startup_64
-ENTRY(secondary_startup_64)
+ movq %rax, %cr3
+
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *.Lcommon_startup_64(%rip)
+SYM_CODE_END(startup_64)
+
+ __INITRODATA
+SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64)
+
+ .text
+SYM_CODE_START(secondary_startup_64)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
/*
- * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+	 * At this point the CPU runs in 64bit mode (CS.L = 1, CS.D = 0),
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
- *
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
*
@@ -161,56 +159,194 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
- /* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
- movq %rax, %cr4
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
+ /*
+ * The secondary_startup_64_no_verify entry point is only used by
+ * SEV-ES guests. In those guests the call to verify_cpu() would cause
+	 * #VC exceptions which cannot be handled at this stage of secondary
+	 * CPU bringup.
+	 *
+	 * All non-SEV-ES systems, especially Intel systems, need to execute
+ * verify_cpu() above to make sure NX is enabled.
+ */
+SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+
+ /* Clear %R15 which holds the boot_params pointer on the boot CPU */
+ xorl %r15d, %r15d
+
+ /* Derive the runtime physical address of init_top_pgt[] */
+ movq phys_base(%rip), %rax
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
- /* Setup early boot stage 4 level pagetables. */
- movq $(init_level4_pgt - __START_KERNEL_map), %rax
- addq phys_base(%rip), %rax
+ /*
+ * Retrieve the modifier (SME encryption mask if SME is active) to be
+ * added to the initial pgdir entry that will be programmed into CR3.
+ */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ addq sme_me_mask(%rip), %rax
+#endif
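
Both boot paths compose CR3 the same way: the physical address of the top-level page table plus the SME mask when memory encryption is active. A hedged sketch (symbol names from the code above; the helper itself is illustrative):

#include <stdint.h>

/* Illustrative CR3 composition:
 *   pgt_phys = phys_base + (init_top_pgt - __START_KERNEL_map)
 *   cr3      = pgt_phys + sme_me_mask   (mask is 0 without SME)
 * The addition mirrors the addq above; the C-bit sits well above the
 * page-frame bits, so add and or are equivalent here.
 */
static inline uint64_t compose_cr3(uint64_t pgt_phys, uint64_t sme_me_mask)
{
	return pgt_phys + sme_me_mask;
}
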
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
movq %rax, %cr3
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- jmp *%rax
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
- /* Check if nx is implemented */
- movl $0x80000001, %eax
- cpuid
- movl %edx,%edi
+ /*
+ * Create a mask of CR4 bits to preserve. Omit PGE in order to flush
+ * global 1:1 translations from the TLBs.
+ *
+ * From the SDM:
+ * "If CR4.PGE is changing from 0 to 1, there were no global TLB
+ * entries before the execution; if CR4.PGE is changing from 1 to 0,
+ * there will be no global TLB entries after the execution."
+ */
+ movl $(X86_CR4_PAE | X86_CR4_LA57), %edx
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support.
+ * Clearing MCE may fault in some environments (that also force #MC
+ * support). Any machine check that occurs before #MC support is fully
+ * configured will crash the system regardless of the CR4.MCE value set
+ * here.
+ */
+ orl $X86_CR4_MCE, %edx
+#endif
+ movq %cr4, %rcx
+ andl %edx, %ecx
- /* Setup EFER (Extended Feature Enable Register) */
- movl $MSR_EFER, %ecx
+ /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */
+ btsl $X86_CR4_PSE_BIT, %ecx
+ movq %rcx, %cr4
+
+ /*
+ * Set CR4.PGE to re-enable global translations.
+ */
+ btsl $X86_CR4_PGE_BIT, %ecx
+ movq %rcx, %cr4
+
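
The double CR4 write is the actual TLB flush: dropping PGE invalidates all global 1:1 translations, and setting it again re-enables global pages, exactly as the SDM quote above states. A sketch of the sequence (write_cr4() stands in for the movq instructions):

#define X86_CR4_PGE	(1UL << 7)	/* Page Global Enable */

extern void write_cr4(unsigned long val);	/* stand-in for movq %rcx, %cr4 */

static void toggle_pge_to_flush_tlb(unsigned long cr4)
{
	write_cr4(cr4 & ~X86_CR4_PGE);	/* 1 -> 0: global entries dropped */
	write_cr4(cr4 |  X86_CR4_PGE);	/* 0 -> 1: global pages back on  */
}
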
+#ifdef CONFIG_SMP
+ /*
+ * For parallel boot, the APIC ID is read from the APIC, and then
+ * used to look up the CPU number. For booting a single CPU, the
+ * CPU number is encoded in smpboot_control.
+ *
+ * Bit 31 STARTUP_READ_APICID (Read APICID from APIC)
+ * Bit 0-23 CPU# if STARTUP_xx flags are not set
+ */
+ movl smpboot_control(%rip), %ecx
+ testl $STARTUP_READ_APICID, %ecx
+ jnz .Lread_apicid
+ /*
+ * No control bit set, single CPU bringup. CPU number is provided
+ * in bit 0-23. This is also the boot CPU case (CPU number 0).
+ */
+ andl $(~STARTUP_PARALLEL_MASK), %ecx
+ jmp .Lsetup_cpu
+
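
The smpboot_control word is decoded exactly as the comment above lays out: bit 31 requests an APIC ID read, otherwise the low 24 bits carry a literal CPU number. A small C rendering (the exact STARTUP_PARALLEL_MASK value is an assumption here, taken as the top control bits):

#include <stdint.h>
#include <stdbool.h>

#define STARTUP_READ_APICID	(1u << 31)
#define STARTUP_PARALLEL_MASK	0xff000000u	/* assumption: top 8 control bits */

static inline bool startup_reads_apicid(uint32_t ctrl)
{
	return ctrl & STARTUP_READ_APICID;
}

static inline uint32_t startup_cpu_number(uint32_t ctrl)
{
	return ctrl & ~STARTUP_PARALLEL_MASK;	/* bits 0-23: CPU# */
}
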
+.Lread_apicid:
+ /* Check whether X2APIC mode is already enabled */
+ mov $MSR_IA32_APICBASE, %ecx
rdmsr
- btsl $_EFER_SCE, %eax /* Enable System Call */
- btl $20,%edi /* No Execute supported? */
- jnc 1f
- btsl $_EFER_NX, %eax
-1: wrmsr /* Make changes effective */
+ testl $X2APIC_ENABLE, %eax
+ jnz .Lread_apicid_msr
- /* Setup cr0 */
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
- movl $CR0_STATE, %eax
- /* Make changes effective */
- movq %rax, %cr0
+#ifdef CONFIG_X86_X2APIC
+ /*
+ * If system is in X2APIC mode then MMIO base might not be
+ * mapped causing the MMIO read below to fault. Faults can't
+ * be handled at that point.
+ */
+ cmpl $0, x2apic_mode(%rip)
+ jz .Lread_apicid_mmio
- /* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ /* Force the AP into X2APIC mode. */
+ orl $X2APIC_ENABLE, %eax
+ wrmsr
+ jmp .Lread_apicid_msr
+#endif
- /* zero EFLAGS after setting rsp */
- pushq $0
- popfq
+.Lread_apicid_mmio:
+ /* Read the APIC ID from the fix-mapped MMIO space. */
+ movq apic_mmio_base(%rip), %rcx
+ addq $APIC_ID, %rcx
+ movl (%rcx), %eax
+ shr $24, %eax
+ jmp .Llookup_AP
+
+.Lread_apicid_msr:
+ mov $APIC_X2APIC_ID_MSR, %ecx
+ rdmsr
+
+.Llookup_AP:
+ /* EAX contains the APIC ID of the current CPU */
+ xorl %ecx, %ecx
+ leaq cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+ cmpl (%rbx,%rcx,4), %eax
+ jz .Lsetup_cpu
+ inc %ecx
+#ifdef CONFIG_FORCE_NR_CPUS
+ cmpl $NR_CPUS, %ecx
+#else
+ cmpl nr_cpu_ids(%rip), %ecx
+#endif
+ jb .Lfind_cpunr
+
+ /* APIC ID not found in the table. Drop the trampoline lock and bail. */
+ movq trampoline_lock(%rip), %rax
+ movl $0, (%rax)
+
+1: cli
+ hlt
+ jmp 1b
+
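
The .Lfind_cpunr loop is a plain linear search of cpuid_to_apicid[]. A hedged C equivalent (the -1 return corresponds to the halt path above, which first drops the trampoline lock):

#include <stdint.h>

static int lookup_cpu_number(const uint32_t *cpuid_to_apicid,
			     uint32_t nr_cpu_ids, uint32_t apicid)
{
	for (uint32_t cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpuid_to_apicid[cpu] == apicid)
			return (int)cpu;	/* .Lsetup_cpu */
	}
	return -1;	/* not found: release trampoline lock, cli; hlt */
}
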
+.Lsetup_cpu:
+ /* Get the per cpu offset for the given CPU# which is in ECX */
+ movq __per_cpu_offset(,%rcx,8), %rdx
+#else
+ xorl %edx, %edx /* zero-extended to clear all of RDX */
+#endif /* CONFIG_SMP */
+
+ /*
+ * Setup a boot time stack - Any secondary CPU will have lost its stack
+ * by now because the cr3-switch above unmaps the real-mode stack.
+ *
+ * RDX contains the per-cpu offset
+ */
+ movq current_task(%rdx), %rax
+ movq TASK_threadsp(%rax), %rsp
/*
+ * Now that this CPU is running on its own stack, drop the realmode
+ * protection. For the boot CPU the pointer is NULL!
+ */
+ movq trampoline_lock(%rip), %rax
+ testq %rax, %rax
+ jz .Lsetup_gdt
+ movl $0, (%rax)
+
+.Lsetup_gdt:
+ /*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
- lgdt early_gdt_descr(%rip)
+ subq $16, %rsp
+ movw $(GDT_SIZE-1), (%rsp)
+ leaq gdt_page(%rdx), %rax
+ movq %rax, 2(%rsp)
+ lgdt (%rsp)
+ addq $16, %rsp
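
The six instructions above assemble a 10-byte GDT descriptor in a 16-byte stack slot: a 16-bit limit at (%rsp) and the 64-bit per-CPU gdt_page address at 2(%rsp), then hand it to lgdt. The equivalent layout as a packed C struct (illustrative, mirroring the kernel's struct desc_ptr):

#include <stdint.h>

/* Layout built on the stack above:
 *   size    = GDT_SIZE - 1          -> movw to (%rsp)
 *   address = &gdt_page (per-CPU)   -> movq to 2(%rsp)
 */
struct gdt_descr {
	uint16_t size;
	uint64_t address;
} __attribute__((packed));
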
/* set up data segments */
xorl %eax,%eax
@@ -226,200 +362,360 @@ ENTRY(secondary_startup_64)
movl %eax,%fs
movl %eax,%gs
- /* Set up %gs.
- *
- * The base of %gs always points to the bottom of the irqstack
- * union. If the stack protector canary is enabled, it is
- * located at %gs:40. Note that, on SMP, the boot cpu uses
- * init data section till per cpu areas are set up.
+ /*
+ * Set up GSBASE.
+	 * Note that, on SMP, the boot CPU uses the init data section until
+	 * the per-CPU areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq initial_gs(%rip),%rax
- movq %rax,%rdx
- shrq $32,%rdx
- wrmsr
-
- /* esi is pointer to real mode structure with interesting info.
- pass it to C */
- movl %esi, %edi
-
- /* Finally jump to run C code and to be on real kernel address
- * Since we are running on identity-mapped space we have to jump
- * to the full 64bit address, this is only possible as indirect
- * jump. In addition we need to ensure %cs is set so we make this
- * a far return.
+ movl %edx, %eax
+ shrq $32, %rdx
+ wrmsr
+
+ /* Setup and Load IDT */
+ call early_setup_idt
+
+ /* Check if nx is implemented */
+ movl $0x80000001, %eax
+ cpuid
+ movl %edx,%edi
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /*
+ * Preserve current value of EFER for comparison and to skip
+ * EFER writes if no change was made (for TDX guest)
*/
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ movl %eax, %edx
+ btsl $_EFER_SCE, %eax /* Enable System Call */
+ btl $20,%edi /* No Execute supported? */
+ jnc 1f
+ btsl $_EFER_NX, %eax
+ btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
+
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
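
The EFER sequence is a read-modify-write that skips the wrmsr entirely when nothing changed, since a TDX guest would trap on the unnecessary MSR write. A hedged C sketch of the same control flow (rdmsr()/wrmsr() are stand-ins for the instructions):

#include <stdint.h>
#include <stdbool.h>

#define _EFER_SCE	0	/* SYSCALL enable    */
#define _EFER_NX	11	/* No-Execute enable */

extern uint64_t rdmsr(uint32_t msr);		/* stand-in */
extern void wrmsr(uint32_t msr, uint64_t val);	/* stand-in */

static void setup_efer(uint32_t msr_efer, bool nx_supported)
{
	uint64_t efer = rdmsr(msr_efer);
	uint64_t old  = efer;

	efer |= 1ull << _EFER_SCE;
	if (nx_supported)
		efer |= 1ull << _EFER_NX;

	if (efer != old)	/* avoid the write on TDX guests */
		wrmsr(msr_efer, efer);
}
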
+ /* Setup cr0 */
+ movl $CR0_STATE, %eax
+ /* Make changes effective */
+ movq %rax, %cr0
+
+ /* zero EFLAGS after setting rsp */
+ pushq $0
+ popfq
- /* SMP bootup changes these two */
+ /* Pass the boot_params pointer as first argument */
+ movq %r15, %rdi
+
+.Ljump_to_C_code:
+ xorl %ebp, %ebp # clear frame pointer
+ ANNOTATE_RETPOLINE_SAFE
+ callq *initial_code(%rip)
+ ud2
+SYM_CODE_END(secondary_startup_64)
+
+#include "verify_cpu.S"
+#include "sev_verify_cbit.S"
+
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_AMD_MEM_ENCRYPT)
+/*
+ * Entry point for soft restart of a CPU. Invoked from xxx_play_dead() for
+ * restarting the boot CPU or for restarting SEV guest CPUs after CPU hot
+ * unplug. Everything is set up already except the stack.
+ */
+SYM_CODE_START(soft_restart_cpu)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_END_OF_STACK
+
+ /* Find the idle task stack */
+ movq PER_CPU_VAR(current_task), %rcx
+ movq TASK_threadsp(%rcx), %rsp
+
+ jmp .Ljump_to_C_code
+SYM_CODE_END(soft_restart_cpu)
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during early boot when running on kernel
+ * addresses, but before the switch to the idt_table can be made.
+ * The early_idt_handler_array can't be used here because it calls into a lot
+ * of __init code and this handler is also used during CPU offlining/onlining.
+ * Therefore this handler ends up in the .text section so that it stays around
+ * when .init.text is freed.
+ */
+SYM_CODE_START_NOALIGN(vc_boot_ghcb)
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
+
+ /* Build pt_regs */
+ PUSH_AND_CLEAR_REGS
+
+ /* Call C handler */
+ movq %rsp, %rdi
+ movq ORIG_RAX(%rsp), %rsi
+ movq initial_vc_handler(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rax
+
+ /* Unwind pt_regs */
+ POP_REGS
+
+ /* Remove Error Code */
+ addq $8, %rsp
+
+ iretq
+SYM_CODE_END(vc_boot_ghcb)
+#endif
+
+ /* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
- .align 8
- ENTRY(initial_code)
- .quad x86_64_start_kernel
- ENTRY(initial_gs)
- .quad INIT_PER_CPU_VAR(irq_stack_union)
-
- ENTRY(stack_start)
- .quad init_thread_union+THREAD_SIZE-8
- .word 0
- __FINITDATA
+ .balign 8
+SYM_DATA(initial_code, .quad x86_64_start_kernel)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
+#endif
-bad_address:
- jmp bad_address
+SYM_DATA(trampoline_lock, .quad 0);
+ __FINITDATA
- .section ".init.text","ax"
-#ifdef CONFIG_EARLY_PRINTK
- .globl early_idt_handlers
-early_idt_handlers:
+ __INIT
+SYM_CODE_START(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- movl $i, %esi
- jmp early_idt_handler
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ ENDBR
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
+ .endif
+ pushq $i # 72(%rsp) Vector number
+ jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-#endif
+SYM_CODE_END(early_idt_handler_array)
+ ANNOTATE_NOENDBR // early_idt_handler_array[NUM_EXCEPTION_VECTORS]
+
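
Each stub in the array is padded with 0xcc to a fixed EARLY_IDT_HANDLER_SIZE so IDT setup can compute stub addresses by simple multiplication, and vectors without a hardware error code push a dummy zero to keep the frame uniform. What one stub does, in illustrative pseudo-C (push()/jump() stand in for pushq/jmp):

extern const unsigned long EXCEPTION_ERRCODE_MASK;	/* stand-in */
extern void push(unsigned long val), jump(void *target);
extern void early_idt_handler_common(void);

static void early_idt_stub(unsigned int vector)
{
	if (!((EXCEPTION_ERRCODE_MASK >> vector) & 1))
		push(0);	/* dummy error code: uniform stack frame */
	push(vector);		/* consumed by early_idt_handler_common */
	jump(early_idt_handler_common);
}
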
+SYM_CODE_START_LOCAL(early_idt_handler_common)
+ UNWIND_HINT_IRET_REGS offset=16
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
-ENTRY(early_idt_handler)
-#ifdef CONFIG_EARLY_PRINTK
- cmpl $2,early_recursion_flag(%rip)
- jz 1f
incl early_recursion_flag(%rip)
- GET_CR2_INTO_RCX
- movq %rcx,%r9
- xorl %r8d,%r8d # zero for error code
- movl %esi,%ecx # get vector number
- # Test %ecx against mask of vectors that push error code.
- cmpl $31,%ecx
- ja 0f
- movl $1,%eax
- salq %cl,%rax
- testl $0x27d00,%eax
- je 0f
- popq %r8 # get error code
-0: movq 0(%rsp),%rcx # get ip
- movq 8(%rsp),%rdx # get cs
- xorl %eax,%eax
- leaq early_idt_msg(%rip),%rdi
- call early_printk
- cmpl $2,early_recursion_flag(%rip)
- jz 1f
- call dump_stack
-#ifdef CONFIG_KALLSYMS
- leaq early_idt_ripmsg(%rip),%rdi
- movq 0(%rsp),%rsi # get rip again
- call __print_symbol
+
+ /* The vector number is currently in the pt_regs->di slot. */
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* RSI = vector number */
+ movq %rdi, 8(%rsp) /* pt_regs->di = RDI */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->bx */
+ pushq %rbp /* pt_regs->bp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
+
+ movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
+ call do_early_exception
+
+ decl early_recursion_flag(%rip)
+ jmp restore_regs_and_return_to_kernel
+SYM_CODE_END(early_idt_handler_common)
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during very early boot. The
+ * early_idt_handler_array can't be used because it returns via the
+ * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early.
+ *
+ * XXX it does, fix this.
+ *
+ * This handler will end up in the .init.text section and not be
+ * available to boot secondary CPUs.
+ */
+SYM_CODE_START_NOALIGN(vc_no_ghcb)
+ UNWIND_HINT_IRET_REGS offset=8
+ ENDBR
+
+ /* Build pt_regs */
+ PUSH_AND_CLEAR_REGS
+
+ /* Call C handler */
+ movq %rsp, %rdi
+ movq ORIG_RAX(%rsp), %rsi
+ call __pi_do_vc_no_ghcb
+
+ /* Unwind pt_regs */
+ POP_REGS
+
+ /* Remove Error Code */
+ addq $8, %rsp
+
+ /* Pure iret required here - don't use INTERRUPT_RETURN */
+ iretq
+SYM_CODE_END(vc_no_ghcb)
+SYM_PIC_ALIAS(vc_no_ghcb);
#endif
-#endif /* EARLY_PRINTK */
-1: hlt
- jmp 1b
-
-#ifdef CONFIG_EARLY_PRINTK
-early_recursion_flag:
- .long 0
-
-early_idt_msg:
- .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
-early_idt_ripmsg:
- .asciz "RIP %s\n"
-#endif /* CONFIG_EARLY_PRINTK */
- .previous
-
-#define NEXT_PAGE(name) \
- .balign PAGE_SIZE; \
-ENTRY(name)
-
-/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT) \
- i = 0 ; \
- .rept (COUNT) ; \
- .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
- i = i + 1 ; \
- .endr
- .data
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
-NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign 2 * PAGE_SIZE)
+#else
+#define SYM_DATA_START_PTI_ALIGNED(name) \
+ SYM_DATA_START_PAGE_ALIGNED(name)
+#define PTI_USER_PGD_FILL 0
+#endif
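
With page-table isolation each top-level PGD becomes an 8 KiB pair, kernel half first and user half one page later, which is what PTI_USER_PGD_FILL and the 2 * PAGE_SIZE alignment above arrange. A sketch of the resulting layout:

#include <stdint.h>

#define PTRS_PER_PGD	512

/* Illustration only: two 4 KiB tables, 8 KiB aligned, so the user half
 * is reachable from the kernel half by flipping one address bit.
 */
struct pti_pgd_pair {
	uint64_t kernel_half[PTRS_PER_PGD];	/* offset 0x0000 */
	uint64_t user_half[PTRS_PER_PGD];	/* the PTI_USER_PGD_FILL page */
} __attribute__((aligned(2 * 4096)));
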
+
+ __INITDATA
+ .balign 4
-NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
.fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(early_top_pgt)
+SYM_PIC_ALIAS(early_top_pgt)
-NEXT_PAGE(level3_kernel_pgt)
- .fill L3_START_KERNEL,8,0
- /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+SYM_DATA_END(early_dynamic_pgts)
+SYM_PIC_ALIAS(early_dynamic_pgts);
-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
+SYM_DATA(early_recursion_flag, .long 0)
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
+ .data
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_PVH)
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .fill 511, 8, 0
+SYM_DATA_END(level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_ident_pgt)
+ /*
+ * Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
+ *
+	 * Note: This sets _PAGE_GLOBAL regardless of whether
+	 * the CPU supports it or whether it is enabled. But
+	 * the CPU should ignore the bit.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(level2_ident_pgt)
+#else
+SYM_DATA_START_PTI_ALIGNED(init_top_pgt)
+ .fill 512,8,0
+ .fill PTI_USER_PGD_FILL,8,0
+SYM_DATA_END(init_top_pgt)
+#endif
+
+SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level4_kernel_pgt)
+SYM_PIC_ALIAS(level4_kernel_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(level3_kernel_pgt)
+SYM_PIC_ALIAS(level3_kernel_pgt)
-NEXT_PAGE(level2_kernel_pgt)
+SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
- * 512 MB kernel mapping. We spend a full page on this pagetable
- * anyway.
+ * Kernel high mapping.
+ *
+ * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in
+ * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled,
+ * 512 MiB otherwise.
*
- * The kernel code+data+bss must not be bigger than that.
+ * (NOTE: after that starts the module area, see MODULES_VADDR.)
*
- * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
- * If you want to increase this then increase MODULES_VADDR
- * too.)
+ * This table is eventually used by the kernel during normal runtime.
+ * Care must be taken to clear out undesired bits later, like _PAGE_RW
+ * or _PAGE_GLOBAL in some cases.
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
- KERNEL_IMAGE_SIZE/PMD_SIZE)
-
-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
+SYM_DATA_END(level2_kernel_pgt)
+SYM_PIC_ALIAS(level2_kernel_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
+SYM_DATA_END(level2_fixmap_pgt)
+SYM_PIC_ALIAS(level2_fixmap_pgt)
-#undef PMDS
-#undef NEXT_PAGE
+SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
+ .fill 512,8,0
+ .endr
+SYM_DATA_END(level1_fixmap_pgt)
.data
.align 16
- .globl early_gdt_descr
-early_gdt_descr:
- .word GDT_ENTRIES*8-1
-early_gdt_descr_base:
- .quad INIT_PER_CPU_VAR(gdt_page)
-
-ENTRY(phys_base)
- /* This must match the first entry in level2_kernel_pgt */
- .quad 0x0000000000000000
-
-#include "../../x86/xen/xen-head.S"
-
- .section .bss, "aw", @nobits
- .align L1_CACHE_BYTES
-ENTRY(idt_table)
- .skip IDT_ENTRIES * 16
+
+SYM_DATA(smpboot_control, .long 0)
+
+ .align 16
+/* This must match the first entry in level2_kernel_pgt */
+SYM_DATA(phys_base, .quad 0x0)
+SYM_PIC_ALIAS(phys_base);
+EXPORT_SYMBOL(phys_base)
+
+#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
.skip PAGE_SIZE
+SYM_DATA_END(empty_zero_page)
+EXPORT_SYMBOL(empty_zero_page)
+
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d731175..d6387dde3ff9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,56 +1,80 @@
-#include <linux/clocksource.h>
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/clockchips.h>
#include <linux/interrupt.h>
-#include <linux/sysdev.h>
+#include <linux/export.h>
#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
#include <linux/hpet.h>
-#include <linux/init.h>
#include <linux/cpu.h>
-#include <linux/pm.h>
-#include <linux/io.h>
+#include <linux/irq.h>
-#include <asm/fixmap.h>
-#include <asm/i8253.h>
+#include <asm/cpuid/api.h>
+#include <asm/irq_remapping.h>
#include <asm/hpet.h>
+#include <asm/time.h>
+#include <asm/mwait.h>
+#include <asm/msr.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "hpet: " fmt
+
+enum hpet_mode {
+ HPET_MODE_UNUSED,
+ HPET_MODE_LEGACY,
+ HPET_MODE_CLOCKEVT,
+ HPET_MODE_DEVICE,
+};
-#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
+struct hpet_channel {
+ struct clock_event_device evt;
+ unsigned int num;
+ unsigned int cpu;
+ unsigned int irq;
+ unsigned int in_use;
+ enum hpet_mode mode;
+ unsigned int boot_cfg;
+ char name[10];
+};
-/* FSEC = 10^-15
- NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
+struct hpet_base {
+ unsigned int nr_channels;
+ unsigned int nr_clockevents;
+ unsigned int boot_cfg;
+ struct hpet_channel *channels;
+};
-#define HPET_DEV_USED_BIT 2
-#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
-#define HPET_DEV_VALID 0x8
-#define HPET_DEV_FSB_CAP 0x1000
-#define HPET_DEV_PERI_CAP 0x2000
+#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
+#define HPET_MIN_CYCLES 128
+#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
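
For scale, the new minimum works out as follows (the frequency is an assumption; 14.318 MHz is merely a common HPET clock):

/* HPET_MIN_PROG_DELTA = 128 + 128/2 = 192 counter cycles.
 * At an assumed 14.318 MHz HPET clock (~69.8 ns per cycle):
 *   192 * 69.8 ns ~= 13.4 us minimum programming delta,
 * well clear of the comparator write-latency issues described in
 * hpet_clkevt_set_next_event() below.
 */
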
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
-u8 hpet_msi_disable;
-u8 hpet_readback_cmp;
+bool hpet_msi_disable;
-#ifdef CONFIG_PCI_MSI
-static unsigned long hpet_num_timers;
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
+static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
+static struct irq_domain *hpet_domain;
#endif
+
static void __iomem *hpet_virt_address;
-struct hpet_dev {
- struct clock_event_device evt;
- unsigned int num;
- int cpu;
- unsigned int irq;
- unsigned int flags;
- char name[10];
-};
+static struct hpet_base hpet_base;
+
+static bool hpet_legacy_int_enabled;
+static unsigned long hpet_freq;
+
+bool boot_hpet_disable;
+bool hpet_force_user;
+static bool hpet_verbose;
+
+static inline
+struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt)
+{
+ return container_of(evt, struct hpet_channel, evt);
+}
inline unsigned int hpet_readl(unsigned int a)
{
@@ -62,16 +86,9 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
writel(d, hpet_virt_address + a);
}
-#ifdef CONFIG_X86_64
-#include <asm/pgtable.h>
-#endif
-
static inline void hpet_set_mapping(void)
{
- hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
-#endif
+ hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE);
}
static inline void hpet_clear_mapping(void)
@@ -83,19 +100,20 @@ static inline void hpet_clear_mapping(void)
/*
* HPET command line enable / disable
*/
-static int boot_hpet_disable;
-int hpet_force_user;
-static int hpet_verbose;
-
static int __init hpet_setup(char *str)
{
- if (str) {
+ while (str) {
+ char *next = strchr(str, ',');
+
+ if (next)
+ *next++ = 0;
if (!strncmp("disable", str, 7))
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
if (!strncmp("force", str, 5))
- hpet_force_user = 1;
+ hpet_force_user = true;
if (!strncmp("verbose", str, 7))
- hpet_verbose = 1;
+ hpet_verbose = true;
+ str = next;
}
return 1;
}
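
With the comma-splitting loop above, several options can now be combined in a single parameter; an illustrative command line:

    hpet=force,verbose	# force-enable the HPET and log its configuration
    hpet=disable	# same effect as nohpet
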
@@ -103,7 +121,7 @@ __setup("hpet=", hpet_setup);
static int __init disable_hpet(char *str)
{
- boot_hpet_disable = 1;
+ boot_hpet_disable = true;
return 1;
}
__setup("nohpet", disable_hpet);
@@ -113,13 +131,8 @@ static inline int is_hpet_capable(void)
return !boot_hpet_disable && hpet_address;
}
-/*
- * HPET timer interrupt enable / disable
- */
-static int hpet_legacy_int_enabled;
-
/**
- * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled
*/
int is_hpet_enabled(void)
{
@@ -129,67 +142,60 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
static void _hpet_print_config(const char *function, int line)
{
- u32 i, timers, l, h;
- printk(KERN_INFO "hpet: %s(%d):\n", function, line);
- l = hpet_readl(HPET_ID);
- h = hpet_readl(HPET_PERIOD);
- timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
- l = hpet_readl(HPET_CFG);
- h = hpet_readl(HPET_STATUS);
- printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
+ u32 i, id, period, cfg, status, channels, l, h;
+
+ pr_info("%s(%d):\n", function, line);
+
+ id = hpet_readl(HPET_ID);
+ period = hpet_readl(HPET_PERIOD);
+ pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period);
+
+ cfg = hpet_readl(HPET_CFG);
+ status = hpet_readl(HPET_STATUS);
+ pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status);
+
l = hpet_readl(HPET_COUNTER);
h = hpet_readl(HPET_COUNTER+4);
- printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+ pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+
+ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- for (i = 0; i < timers; i++) {
+ for (i = 0; i < channels; i++) {
l = hpet_readl(HPET_Tn_CFG(i));
h = hpet_readl(HPET_Tn_CFG(i)+4);
- printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h);
+
l = hpet_readl(HPET_Tn_CMP(i));
h = hpet_readl(HPET_Tn_CMP(i)+4);
- printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h);
+
l = hpet_readl(HPET_Tn_ROUTE(i));
h = hpet_readl(HPET_Tn_ROUTE(i)+4);
- printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h);
}
}
#define hpet_print_config() \
do { \
if (hpet_verbose) \
- _hpet_print_config(__FUNCTION__, __LINE__); \
+ _hpet_print_config(__func__, __LINE__); \
} while (0)
/*
- * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * When the HPET driver (/dev/hpet) is enabled, we need to reserve
* timer 0 and timer 1 in case of RTC emulation.
*/
#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd);
-
-static void hpet_reserve_platform_timers(unsigned int id)
+static void __init hpet_reserve_platform_timers(void)
{
- struct hpet __iomem *hpet = hpet_virt_address;
- struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
- unsigned int nrtimers, i;
struct hpet_data hd;
-
- nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+ unsigned int i;
memset(&hd, 0, sizeof(hd));
hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet;
- hd.hd_nirqs = nrtimers;
- hpet_reserve_timer(&hd, 0);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
- hpet_reserve_timer(&hd, 1);
-#endif
+ hd.hd_address = hpet_virt_address;
+ hd.hd_nirqs = hpet_base.nr_channels;
/*
* NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
@@ -199,46 +205,52 @@ static void hpet_reserve_platform_timers(unsigned int id)
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
- for (i = 2; i < nrtimers; timer++, i++) {
- hd.hd_irq[i] = (readl(&timer->hpet_config) &
- Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
- }
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
- hpet_reserve_msi_timers(&hd);
+ if (i >= 2)
+ hd.hd_irq[i] = hc->irq;
- hpet_alloc(&hd);
+ switch (hc->mode) {
+ case HPET_MODE_UNUSED:
+ case HPET_MODE_DEVICE:
+ hc->mode = HPET_MODE_DEVICE;
+ break;
+ case HPET_MODE_CLOCKEVT:
+ case HPET_MODE_LEGACY:
+ hpet_reserve_timer(&hd, hc->num);
+ break;
+ }
+ }
+ hpet_alloc(&hd);
}
-#else
-static void hpet_reserve_platform_timers(unsigned int id) { }
-#endif
-/*
- * Common hpet info
- */
-static unsigned long hpet_period;
+static void __init hpet_select_device_channel(void)
+{
+ int i;
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt);
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
- .name = "hpet",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = hpet_legacy_set_mode,
- .set_next_event = hpet_legacy_next_event,
- .shift = 32,
- .irq = 0,
- .rating = 50,
-};
+ /* Associate the first unused channel to /dev/hpet */
+ if (hc->mode == HPET_MODE_UNUSED) {
+ hc->mode = HPET_MODE_DEVICE;
+ return;
+ }
+ }
+}
+#else
+static inline void hpet_reserve_platform_timers(void) { }
+static inline void hpet_select_device_channel(void) {}
+#endif
+
+/* Common HPET functions */
static void hpet_stop_counter(void)
{
- unsigned long cfg = hpet_readl(HPET_CFG);
+ u32 cfg = hpet_readl(HPET_CFG);
+
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -252,6 +264,7 @@ static void hpet_reset_counter(void)
static void hpet_start_counter(void)
{
unsigned int cfg = hpet_readl(HPET_CFG);
+
cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -280,505 +293,559 @@ static void hpet_enable_legacy_int(void)
cfg |= HPET_CFG_LEGACY;
hpet_writel(cfg, HPET_CFG);
- hpet_legacy_int_enabled = 1;
+ hpet_legacy_int_enabled = true;
}
-static void hpet_legacy_clockevent_register(void)
+static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt)
{
- /* Start HPET legacy interrupts */
- hpet_enable_legacy_int();
+ unsigned int channel = clockevent_to_channel(evt)->num;
+ unsigned int cfg, cmp, now;
+ uint64_t delta;
+ hpet_stop_counter();
+ delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+ delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned int)delta;
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+ hpet_writel(cmp, HPET_Tn_CMP(channel));
+ udelay(1);
/*
- * The mult factor is defined as (include/linux/clockchips.h)
- * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
- * hpet_period is in units of femtoseconds (per cycle), so
- * mult/2^shift = cyc/ns = 10^6/hpet_period
- * mult = (10^6 * 2^shift)/hpet_period
- * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
*/
- hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
- hpet_period, hpet_clockevent.shift);
- /* Calculate the min / max delta */
- hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
- &hpet_clockevent);
- /* 5 usec minimum reprogramming delta. */
- hpet_clockevent.min_delta_ns = 5000;
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel));
+ hpet_start_counter();
+ hpet_print_config();
- /*
- * Start hpet with the boot cpu mask and make it
- * global after the IO_APIC has been initialized.
- */
- hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
- clockevents_register_device(&hpet_clockevent);
- global_clock_event = &hpet_clockevent;
- printk(KERN_DEBUG "hpet clockevent registered\n");
+ return 0;
}
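
The comparator period is the tick length converted to HPET cycles through the clockevent mult/shift pair, i.e. delta = ((NSEC_PER_SEC / HZ) * mult) >> shift. A self-contained version of that conversion (the numbers in the comment are illustrative):

#include <stdint.h>

/* Example: HZ = 250 gives 4,000,000 ns per tick; mult/shift scale
 * nanoseconds to HPET cycles as the clockevents core configured them.
 */
static inline uint32_t tick_to_hpet_cycles(uint32_t nsec_per_tick,
					   uint32_t mult, uint32_t shift)
{
	return (uint32_t)(((uint64_t)nsec_per_tick * mult) >> shift);
}
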
-static int hpet_setup_msi_irq(unsigned int irq);
+static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt)
+{
+ unsigned int channel = clockevent_to_channel(evt)->num;
+ unsigned int cfg;
+
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+
+ return 0;
+}
-static void hpet_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt, int timer)
+static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt)
{
- unsigned int cfg, cmp, now;
- uint64_t delta;
+ unsigned int channel = clockevent_to_channel(evt)->num;
+ unsigned int cfg;
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- hpet_stop_counter();
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
- delta >>= evt->shift;
- now = hpet_readl(HPET_COUNTER);
- cmp = now + (unsigned int) delta;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- /* Make sure we use edge triggered interrupts */
- cfg &= ~HPET_TN_LEVEL;
- cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
- HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- hpet_writel(cmp, HPET_Tn_CMP(timer));
- udelay(1);
- /*
- * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
- * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
- * bit is automatically cleared after the first write.
- * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
- * Publication # 24674)
- */
- hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
- hpet_start_counter();
- hpet_print_config();
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_PERIODIC;
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- if (timer == 0) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_setup_msi_irq(hdev->irq);
- disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
- hpet_print_config();
- break;
- }
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+
+ return 0;
+}
+
+static int hpet_clkevt_legacy_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
+ hpet_print_config();
+ return 0;
}
-static int hpet_next_event(unsigned long delta,
- struct clock_event_device *evt, int timer)
+static int
+hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt)
{
+ unsigned int channel = clockevent_to_channel(evt)->num;
u32 cnt;
+ s32 res;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
- hpet_writel(cnt, HPET_Tn_CMP(timer));
+ hpet_writel(cnt, HPET_Tn_CMP(channel));
/*
- * We need to read back the CMP register on certain HPET
- * implementations (ATI chipsets) which seem to delay the
- * transfer of the compare register into the internal compare
- * logic. With small deltas this might actually be too late as
- * the counter could already be higher than the compare value
- * at that point and we would wait for the next hpet interrupt
- * forever. We found out that reading the CMP register back
- * forces the transfer so we can rely on the comparison with
- * the counter register below.
- *
- * That works fine on those ATI chipsets, but on newer Intel
- * chipsets (ICH9...) this triggers due to an erratum: Reading
- * the comparator immediately following a write is returning
- * the old value.
- *
- * We restrict the read back to the affected ATI chipsets (set
- * by quirks) and also run it with hpet=verbose for debugging
- * purposes.
+ * HPETs are a complete disaster. The compare register is
+	 * based on an equal comparison and provides neither a
+	 * less-than-or-equal function (which would require taking
+	 * the wraparound into account) nor a simple count-down event
+	 * mode. Further, the write to the comparator register is
+ * delayed internally up to two HPET clock cycles in certain
+ * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
+ * longer delays. We worked around that by reading back the
+ * compare register, but that required another workaround for
+ * ICH9,10 chips where the first readout after write can
+ * return the old stale value. We already had a minimum
+	 * programming delta of 5us enforced, but an NMI or SMI hitting
+ * between the counter readout and the comparator write can
+ * move us behind that point easily. Now instead of reading
+ * the compare register back several times, we make the ETIME
+ * decision based on the following: Return ETIME if the
+ * counter value after the write is less than HPET_MIN_CYCLES
+ * away from the event or if the counter is already ahead of
+ * the event. The minimum programming delta for the generic
+ * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
*/
- if (hpet_readback_cmp || hpet_verbose) {
- u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
+ res = (s32)(cnt - hpet_readl(HPET_COUNTER));
- if (cmp != cnt)
- printk_once(KERN_WARNING
- "hpet: compare register read back failed.\n");
- }
-
- return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+ return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
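
The signed 32-bit subtraction carries the whole ETIME decision: if an SMI/NMI delayed the CPU long enough that the counter already passed, or nearly reached, the freshly written comparator, the difference falls below HPET_MIN_CYCLES and the core gets -ETIME to retry. A standalone sketch:

#include <stdint.h>

#define HPET_MIN_CYCLES	128

/* cnt: comparator value just written; counter: HPET counter read back.
 * The signed difference handles 32-bit wraparound naturally.
 */
static inline int hpet_event_in_time(uint32_t cnt, uint32_t counter)
{
	int32_t res = (int32_t)(cnt - counter);

	return res < HPET_MIN_CYCLES ? -1 /* -ETIME */ : 0;
}
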
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating)
{
- hpet_set_mode(mode, evt, 0);
+ struct clock_event_device *evt = &hc->evt;
+
+ evt->rating = rating;
+ evt->irq = hc->irq;
+ evt->name = hc->name;
+ evt->cpumask = cpumask_of(hc->cpu);
+ evt->set_state_oneshot = hpet_clkevt_set_state_oneshot;
+ evt->set_next_event = hpet_clkevt_set_next_event;
+ evt->set_state_shutdown = hpet_clkevt_set_state_shutdown;
+
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hc->boot_cfg & HPET_TN_PERIODIC) {
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+ evt->set_state_periodic = hpet_clkevt_set_state_periodic;
+ }
}
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
{
- return hpet_next_event(delta, evt, 0);
+ /*
+ * Start HPET with the boot CPU's cpumask and make it global after
+ * the IO_APIC has been initialized.
+ */
+ hc->cpu = boot_cpu_data.cpu_index;
+ strscpy(hc->name, "hpet", sizeof(hc->name));
+ hpet_init_clockevent(hc, 50);
+
+ hc->evt.tick_resume = hpet_clkevt_legacy_resume;
+
+ /*
+ * Legacy horrors and sins from the past. HPET used periodic mode
+ * unconditionally forever on the legacy channel 0. Removing the
+ * below hack and using the conditional in hpet_init_clockevent()
+ * makes at least Qemu and one hardware machine fail to boot.
+ * There are two issues which cause the boot failure:
+ *
+ * #1 After the timer delivery test in IOAPIC and the IOAPIC setup
+ * the next interrupt is not delivered despite the HPET channel
+ * being programmed correctly. Reprogramming the HPET after
+ * switching to IOAPIC makes it work again. After fixing this,
+ * the next issue surfaces:
+ *
+ * #2 Due to the unconditional periodic mode availability the Local
+ * APIC timer calibration can hijack the global clockevents
+ * event handler without causing damage. Using oneshot at this
+	 * stage makes it hang because the HPET does not get
+ * reprogrammed due to the handler hijacking. Duh, stupid me!
+ *
+	 * Both issues require major surgery and especially kicking the HPET
+	 * again after enabling the IOAPIC results in really nasty hackery.
+ * This 'assume periodic works' magic has survived since HPET
+ * support got added, so it's questionable whether this should be
+ * fixed. Both Qemu and the failing hardware machine support
+ * periodic mode despite the fact that both don't advertise it in
+ * the configuration register and both need that extra kick after
+ * switching to IOAPIC. Seems to be a feature...
+ */
+ hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC;
+ hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic;
+
+ /* Start HPET legacy interrupts */
+ hpet_enable_legacy_int();
+
+ clockevents_config_and_register(&hc->evt, hpet_freq,
+ HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
+ global_clock_event = &hc->evt;
+ pr_debug("Clockevent registered\n");
}
/*
* HPET MSI Support
*/
-#ifdef CONFIG_PCI_MSI
-
-static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
-static struct hpet_dev *hpet_devs;
-
-void hpet_msi_unmask(unsigned int irq)
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_GENERIC_MSI_IRQ)
+static void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = get_irq_data(irq);
+ struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
- /* unmask it */
- cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg |= HPET_TN_FSB;
- hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+ cfg = hpet_readl(HPET_Tn_CFG(hc->num));
+ cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_mask(unsigned int irq)
+static void hpet_msi_mask(struct irq_data *data)
{
+ struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
- struct hpet_dev *hdev = get_irq_data(irq);
- /* mask it */
- cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg &= ~HPET_TN_FSB;
- hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+ cfg = hpet_readl(HPET_Tn_CFG(hc->num));
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
- hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
- hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4);
}
-void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct hpet_dev *hdev = get_irq_data(irq);
-
- msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
- msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
- msg->address_hi = 0;
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
-static void hpet_msi_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_set_mode(mode, evt, hdev->num);
-}
+static struct irq_chip hpet_msi_controller __ro_after_init = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
+};
-static int hpet_msi_next_event(unsigned long delta,
- struct clock_event_device *evt)
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- return hpet_next_event(delta, evt, hdev->num);
+ irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
+
+ return 0;
}
-static int hpet_setup_msi_irq(unsigned int irq)
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .msi_init = hpet_msi_init,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
+};
+
+static struct irq_domain *hpet_create_irq_domain(int hpet_id)
{
- if (arch_setup_hpet_msi(irq, hpet_blockid)) {
- destroy_irq(irq);
- return -EINVAL;
+ struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct fwnode_handle *fn;
+ struct irq_fwspec fwspec;
+
+ if (x86_vector_domain == NULL)
+ return NULL;
+
+ domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+ if (!domain_info)
+ return NULL;
+
+ *domain_info = hpet_msi_domain_info;
+ domain_info->data = (void *)(long)hpet_id;
+
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
}
- return 0;
+
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = hpet_id;
+
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
+ if (!parent) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ return NULL;
+ }
+ if (parent != x86_vector_domain)
+ hpet_msi_controller.name = "IR-HPET-MSI";
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
+ return d;
}
-static int hpet_assign_irq(struct hpet_dev *dev)
+static inline int hpet_dev_id(struct irq_domain *domain)
{
- unsigned int irq;
+ struct msi_domain_info *info = msi_get_domain_info(domain);
+
+ return (int)(long)info->data;
+}
- irq = create_irq();
- if (!irq)
- return -EINVAL;
+static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
+ int dev_num)
+{
+ struct irq_alloc_info info;
- set_irq_data(irq, dev);
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.data = hc;
+ info.devid = hpet_dev_id(domain);
+ info.hwirq = dev_num;
- if (hpet_setup_msi_irq(irq))
- return -EINVAL;
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+}
- dev->irq = irq;
+static int hpet_clkevt_msi_resume(struct clock_event_device *evt)
+{
+ struct hpet_channel *hc = clockevent_to_channel(evt);
+ struct irq_data *data = irq_get_irq_data(hc->irq);
+ struct msi_msg msg;
+
+ /* Restore the MSI msg and unmask the interrupt */
+ irq_chip_compose_msi_msg(data, &msg);
+ hpet_msi_write(hc, &msg);
+ hpet_msi_unmask(data);
return 0;
}
-static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data)
{
- struct hpet_dev *dev = (struct hpet_dev *)data;
- struct clock_event_device *hevt = &dev->evt;
+ struct hpet_channel *hc = data;
+ struct clock_event_device *evt = &hc->evt;
- if (!hevt->event_handler) {
- printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
- dev->num);
+ if (!evt->event_handler) {
+ pr_info("Spurious interrupt HPET channel %d\n", hc->num);
return IRQ_HANDLED;
}
- hevt->event_handler(hevt);
+ evt->event_handler(evt);
return IRQ_HANDLED;
}
-static int hpet_setup_irq(struct hpet_dev *dev)
+static int hpet_setup_msi_irq(struct hpet_channel *hc)
{
-
- if (request_irq(dev->irq, hpet_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- dev->name, dev))
+ if (request_irq(hc->irq, hpet_msi_interrupt_handler,
+ IRQF_TIMER | IRQF_NOBALANCING,
+ hc->name, hc))
return -1;
- disable_irq(dev->irq);
- irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
- enable_irq(dev->irq);
+ disable_irq(hc->irq);
+ irq_set_affinity(hc->irq, cpumask_of(hc->cpu));
+ enable_irq(hc->irq);
- printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
- dev->name, dev->irq);
+ pr_debug("%s irq %u for MSI\n", hc->name, hc->irq);
return 0;
}
-/* This should be called in specific @cpu */
-static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+/* Invoked from the hotplug callback on @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu)
{
- struct clock_event_device *evt = &hdev->evt;
- uint64_t hpet_freq;
+ struct clock_event_device *evt = &hc->evt;
- WARN_ON(cpu != smp_processor_id());
- if (!(hdev->flags & HPET_DEV_VALID))
- return;
+ hc->cpu = cpu;
+ per_cpu(cpu_hpet_channel, cpu) = hc;
+ hpet_setup_msi_irq(hc);
- if (hpet_setup_msi_irq(hdev->irq))
- return;
+ hpet_init_clockevent(hc, 110);
+ evt->tick_resume = hpet_clkevt_msi_resume;
- hdev->cpu = cpu;
- per_cpu(cpu_hpet_dev, cpu) = hdev;
- evt->name = hdev->name;
- hpet_setup_irq(hdev);
- evt->irq = hdev->irq;
+ clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+ 0x7FFFFFFF);
+}
- evt->rating = 110;
- evt->features = CLOCK_EVT_FEAT_ONESHOT;
- if (hdev->flags & HPET_DEV_PERI_CAP)
- evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+static struct hpet_channel *hpet_get_unused_clockevent(void)
+{
+ int i;
- evt->set_mode = hpet_msi_set_mode;
- evt->set_next_event = hpet_msi_next_event;
- evt->shift = 32;
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
- /*
- * The period is a femto seconds value. We need to calculate the
- * scaled math multiplication factor for nanosecond to hpet tick
- * conversion.
- */
- hpet_freq = 1000000000000000ULL;
- do_div(hpet_freq, hpet_period);
- evt->mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, evt->shift);
- /* Calculate the max delta */
- evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
- /* 5 usec minimum reprogramming delta. */
- evt->min_delta_ns = 5000;
-
- evt->cpumask = cpumask_of(hdev->cpu);
- clockevents_register_device(evt);
+ if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use)
+ continue;
+ hc->in_use = 1;
+ return hc;
+ }
+ return NULL;
}
-#ifdef CONFIG_HPET
-/* Reserve at least one timer for userspace (/dev/hpet) */
-#define RESERVE_TIMERS 1
-#else
-#define RESERVE_TIMERS 0
-#endif
+static int hpet_cpuhp_online(unsigned int cpu)
+{
+ struct hpet_channel *hc = hpet_get_unused_clockevent();
+
+ if (hc)
+ init_one_hpet_msi_clockevent(hc, cpu);
+ return 0;
+}
-static void hpet_msi_capability_lookup(unsigned int start_timer)
+static int hpet_cpuhp_dead(unsigned int cpu)
{
- unsigned int id;
- unsigned int num_timers;
- unsigned int num_timers_used = 0;
- int i;
+ struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu);
- if (hpet_msi_disable)
- return;
+ if (!hc)
+ return 0;
+ free_irq(hc->irq, hc);
+ hc->in_use = 0;
+ per_cpu(cpu_hpet_channel, cpu) = NULL;
+ return 0;
+}
+
+static void __init hpet_select_clockevents(void)
+{
+ unsigned int i;
- if (boot_cpu_has(X86_FEATURE_ARAT))
+ hpet_base.nr_clockevents = 0;
+
+ /* No point if MSI is disabled or CPU has an Always Running APIC Timer */
+ if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
return;
- id = hpet_readl(HPET_ID);
- num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
- num_timers++; /* Value read out starts from 0 */
hpet_print_config();
- hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
- if (!hpet_devs)
+ hpet_domain = hpet_create_irq_domain(hpet_blockid);
+ if (!hpet_domain)
return;
- hpet_num_timers = num_timers;
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
+ int irq;
- for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
- struct hpet_dev *hdev = &hpet_devs[num_timers_used];
- unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
+ if (hc->mode != HPET_MODE_UNUSED)
+ continue;
- /* Only consider HPET timer with MSI support */
- if (!(cfg & HPET_TN_FSB_CAP))
+ /* Only consider HPET channel with MSI support */
+ if (!(hc->boot_cfg & HPET_TN_FSB_CAP))
continue;
- hdev->flags = 0;
- if (cfg & HPET_TN_PERIODIC_CAP)
- hdev->flags |= HPET_DEV_PERI_CAP;
- hdev->num = i;
+ sprintf(hc->name, "hpet%d", i);
- sprintf(hdev->name, "hpet%d", i);
- if (hpet_assign_irq(hdev))
+ irq = hpet_assign_irq(hpet_domain, hc, hc->num);
+ if (irq <= 0)
continue;
- hdev->flags |= HPET_DEV_FSB_CAP;
- hdev->flags |= HPET_DEV_VALID;
- num_timers_used++;
- if (num_timers_used == num_possible_cpus())
+ hc->irq = irq;
+ hc->mode = HPET_MODE_CLOCKEVT;
+
+ if (++hpet_base.nr_clockevents == num_possible_cpus())
break;
}
- printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
- num_timers, num_timers_used);
+ pr_info("%d channels of %d reserved for per-cpu timers\n",
+		hpet_base.nr_clockevents, hpet_base.nr_channels);
}
-#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd)
-{
- int i;
-
- if (!hpet_devs)
- return;
+#else
- for (i = 0; i < hpet_num_timers; i++) {
- struct hpet_dev *hdev = &hpet_devs[i];
+static inline void hpet_select_clockevents(void) { }
- if (!(hdev->flags & HPET_DEV_VALID))
- continue;
+#define hpet_cpuhp_online NULL
+#define hpet_cpuhp_dead NULL
- hd->hd_irq[hdev->num] = hdev->irq;
- hpet_reserve_timer(hd, hdev->num);
- }
-}
#endif
-static struct hpet_dev *hpet_get_unused_timer(void)
-{
- int i;
-
- if (!hpet_devs)
- return NULL;
-
- for (i = 0; i < hpet_num_timers; i++) {
- struct hpet_dev *hdev = &hpet_devs[i];
-
- if (!(hdev->flags & HPET_DEV_VALID))
- continue;
- if (test_and_set_bit(HPET_DEV_USED_BIT,
- (unsigned long *)&hdev->flags))
- continue;
- return hdev;
- }
- return NULL;
-}
+/*
+ * Clock source related code
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delays and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it can actually crash the system because of an NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. We also need a 64-bit atomic read.
+ *
+ * The lock and the HPET value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
-struct hpet_work_struct {
- struct delayed_work work;
- struct completion complete;
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
};
-static void hpet_work(struct work_struct *w)
+static u64 read_hpet(struct clocksource *cs)
{
- struct hpet_dev *hdev;
- int cpu = smp_processor_id();
- struct hpet_work_struct *hpet_work;
+ unsigned long flags;
+ union hpet_lock old, new;
- hpet_work = container_of(w, struct hpet_work_struct, work.work);
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
- hdev = hpet_get_unused_timer();
- if (hdev)
- init_one_hpet_msi_clockevent(hdev, cpu);
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (u64)hpet_readl(HPET_COUNTER);
- complete(&hpet_work->complete);
-}
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
-static int hpet_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- unsigned long cpu = (unsigned long)hcpu;
- struct hpet_work_struct work;
- struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
-
- switch (action & 0xf) {
- case CPU_ONLINE:
- INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
- init_completion(&work.complete);
- /* FIXME: add schedule_work_on() */
- schedule_delayed_work_on(cpu, &work.work, 0);
- wait_for_completion(&work.complete);
- destroy_timer_on_stack(&work.work.timer);
- break;
- case CPU_DEAD:
- if (hdev) {
- free_irq(hdev->irq, hdev);
- hdev->flags &= ~HPET_DEV_USED;
- per_cpu(cpu_hpet_dev, cpu) = NULL;
- }
- break;
- }
- return NOTIFY_OK;
-}
-#else
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
-static int hpet_setup_msi_irq(unsigned int irq)
-{
- return 0;
-}
-static void hpet_msi_capability_lookup(unsigned int start_timer)
-{
- return;
-}
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (u64)new.value;
+ }
+ local_irq_restore(flags);
-#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd)
-{
- return;
-}
-#endif
+contended:
+ /*
+ * Contended case
+ * --------------
+ * Wait until the HPET value changes or the lock is freed, either of
+ * which indicates that its value is up-to-date.
+ *
+ * It is possible that old.value has already contained the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
-static int hpet_cpuhp_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- return NOTIFY_OK;
+ return (u64)new.value;
}
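The pattern above is self-contained enough to demonstrate outside the kernel. Below is a minimal user-space sketch of the same idea, one thread performing the slow hardware read while contending threads spin until either the published value changes or the lock clears, using C11 atomics in place of arch_spinlock_t. The bit layout differs from the kernel's (bit 0 serves as the lock here), and hw_read_counter() is a hypothetical stand-in for the slow MMIO read.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the slow MMIO counter read. */
    static uint32_t hw_read_counter(void)
    {
        static _Atomic uint32_t fake;
        return atomic_fetch_add(&fake, 1);
    }

    /* Bit 0: lock; high 32 bits: last published counter value. */
    static _Atomic uint64_t lockval;

    static uint32_t shared_counter_read(void)
    {
        uint64_t old = atomic_load(&lockval);

        if (!(old & 1)) {
            uint64_t expected = old;
            /* Try to become the one reader by setting the lock bit. */
            if (atomic_compare_exchange_strong(&lockval, &expected, old | 1)) {
                uint32_t v = hw_read_counter();
                /* Publish the new value and drop the lock in one store. */
                atomic_store(&lockval, (uint64_t)v << 32);
                return v;
            }
        }

        /* Contended: wait for a fresh value or for the lock to clear. */
        uint64_t cur;
        do {
            cur = atomic_load(&lockval);
        } while ((cur >> 32) == (old >> 32) && (cur & 1));

        return (uint32_t)(cur >> 32);
    }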
-
-#endif
-
+#else
/*
- * Clock source related code
+ * For UP or 32-bit.
*/
-static cycle_t read_hpet(struct clocksource *cs)
-{
- return (cycle_t)hpet_readl(HPET_COUNTER);
-}
-
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
+static u64 read_hpet(struct clocksource *cs)
{
- return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+ return (u64)hpet_readl(HPET_COUNTER);
}
#endif
@@ -787,25 +854,46 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
- .shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
-#ifdef CONFIG_X86_64
- .vread = vread_hpet,
-#endif
};
-static int hpet_clocksource_register(void)
+/*
+ * AMD SB700 based systems with spread spectrum enabled use an SMM-based
+ * HPET emulation to provide proper frequency setting.
+ *
+ * On such systems the SMM code is initialized with the first HPET register
+ * access and takes some time to complete. During this time the config
+ * register reads 0xffffffff. We check up to 1000 times whether the
+ * config register reads a non-0xffffffff value, to make sure that the
+ * HPET is up and running before we proceed any further.
+ *
+ * A counting loop is safe, as the HPET access takes thousands of CPU cycles.
+ *
+ * On non-SB700 based machines this check is only done once and has no
+ * side effects.
+ */
+static bool __init hpet_cfg_working(void)
+{
+ int i;
+
+ for (i = 0; i < 1000; i++) {
+ if (hpet_readl(HPET_CFG) != 0xFFFFFFFF)
+ return true;
+ }
+
+ pr_warn("Config register invalid. Disabling HPET\n");
+ return false;
+}
+
+static bool __init hpet_counting(void)
{
- u64 start, now;
- cycle_t t1;
+ u64 start, now, t1;
- /* Start the counter */
hpet_restart_counter();
- /* Verify whether hpet counter works */
t1 = hpet_readl(HPET_COUNTER);
- rdtscll(start);
+ start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
@@ -814,29 +902,87 @@ static int hpet_clocksource_register(void)
* 1 GHz == 200us
*/
do {
- rep_nop();
- rdtscll(now);
+ if (t1 != hpet_readl(HPET_COUNTER))
+ return true;
+ now = rdtsc();
} while ((now - start) < 200000UL);
- if (t1 == hpet_readl(HPET_COUNTER)) {
- printk(KERN_WARNING
- "HPET counter not counting. HPET disabled\n");
- return -ENODEV;
- }
+ pr_warn("Counter not counting. HPET disabled\n");
+ return false;
+}
- /*
- * The definition of mult is (include/linux/clocksource.h)
- * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
- * so we first need to convert hpet_period to ns/cyc units:
- * mult/2^shift = ns/cyc = hpet_period/10^6
- * mult = (hpet_period * 2^shift)/10^6
- * mult = (hpet_period << shift)/FSEC_PER_NSEC
- */
- clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
+static bool __init mwait_pc10_supported(void)
+{
+ unsigned int eax, ebx, ecx, mwait_substates;
- clocksource_register(&clocksource_hpet);
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
- return 0;
+ if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
+ return false;
+
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates);
+
+ return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
+ (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
+ (mwait_substates & (0xF << 28));
+}
+
+/*
+ * Check whether the system supports PC10. If so force disable HPET as that
+ * stops counting in PC10. This check is overbroad as it does not take any
+ * of the following into account:
+ *
+ * - ACPI tables
+ * - Enablement of intel_idle
+ * - Command line arguments which limit intel_idle C-state support
+ *
+ * That's perfectly fine. HPET is a piece of hardware designed by committee
+ * and the only reason it is still in use on modern systems is that
+ * it is impossible to reliably query the TSC and CPU frequency via
+ * CPUID or firmware.
+ *
+ * If HPET is functional it is useful for calibrating TSC, but this can be
+ * done via PMTIMER as well which seems to be the last remaining timer on
+ * X86/INTEL platforms that has not been completely wrecked by feature
+ * creep.
+ *
+ * In theory HPET support should be removed altogether, but there are older
+ * systems out there which depend on it because TSC and APIC timer are
+ * dysfunctional in deeper C-states.
+ *
+ * It's only 20 years now that hardware people have been asked to provide
+ * reliable and discoverable facilities which can be used for timekeeping
+ * and per-CPU timer interrupts.
+ *
+ * The probability that this problem is going to be solved in the
+ * foreseeable future is close to zero, so the kernel has to be cluttered
+ * with heuristics to keep up with the ever growing amount of hardware and
+ * firmware trainwrecks. Hopefully some day hardware people will understand
+ * that the approach of "This can be fixed in software" is not sustainable.
+ * Hope dies last...
+ */
+static bool __init hpet_is_pc10_damaged(void)
+{
+ unsigned long long pcfg;
+
+ /* Check whether PC10 substates are supported */
+ if (!mwait_pc10_supported())
+ return false;
+
+ /* Check whether PC10 is enabled in PKG C-state limit */
+ rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, pcfg);
+ if ((pcfg & 0xF) < 8)
+ return false;
+
+ if (hpet_force_user) {
+ pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n");
+ return false;
+ }
+
+ pr_info("HPET dysfunctional in PC10. Force disabled.\n");
+ boot_hpet_disable = true;
+ return true;
}
/**
@@ -844,44 +990,37 @@ static int hpet_clocksource_register(void)
*/
int __init hpet_enable(void)
{
- unsigned int id;
- int i;
+ u32 hpet_period, cfg, id, irq;
+ unsigned int i, channels;
+ struct hpet_channel *hc;
+ u64 freq;
if (!is_hpet_capable())
return 0;
+ if (hpet_is_pc10_damaged())
+ return 0;
+
hpet_set_mapping();
+ if (!hpet_virt_address)
+ return 0;
+
+ /* Validate that the config register is working */
+ if (!hpet_cfg_working())
+ goto out_nohpet;
/*
* Read the period and check for a sane value:
*/
hpet_period = hpet_readl(HPET_PERIOD);
-
- /*
- * AMD SB700 based systems with spread spectrum enabled use a
- * SMM based HPET emulation to provide proper frequency
- * setting. The SMM code is initialized with the first HPET
- * register access and takes some time to complete. During
- * this time the config register reads 0xffffffff. We check
- * for max. 1000 loops whether the config register reads a non
- * 0xffffffff value to make sure that HPET is up and running
- * before we go further. A counting loop is safe, as the HPET
- * access takes thousands of CPU cycles. On non SB700 based
- * machines this check is only done once and has no side
- * effects.
- */
- for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
- if (i == 1000) {
- printk(KERN_WARNING
- "HPET config register value = 0xFFFFFFFF. "
- "Disabling HPET\n");
- goto out_nohpet;
- }
- }
-
if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
goto out_nohpet;
+ /* The period is a femtoseconds value. Convert it to a frequency. */
+ freq = FSEC_PER_SEC;
+ do_div(freq, hpet_period);
+ hpet_freq = freq;
+
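As a worked example (typical value, assumed for illustration): a common HPET period is 69841279 fs, so freq = 10^15 / 69841279 ≈ 14318180 Hz, i.e. the classic 14.31818 MHz rate; the period itself must already have passed the HPET_MIN_PERIOD/HPET_MAX_PERIOD sanity check above.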
/*
* Read the HPET ID register to retrieve the IRQ routing
* information and the number of channels
@@ -889,42 +1028,99 @@ int __init hpet_enable(void)
id = hpet_readl(HPET_ID);
hpet_print_config();
-#ifdef CONFIG_HPET_EMULATE_RTC
+ /* The ID field holds the zero-based index of the last channel, hence the +1 */
+ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
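For example, an ID register whose HPET_ID_NUMBER field reads 2 exposes channels 0-2, i.e. channels = 3.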
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
- if (!(id & HPET_ID_NUMBER))
+ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2)
goto out_nohpet;
-#endif
- if (hpet_clocksource_register())
+ hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL);
+ if (!hc) {
+ pr_warn("Disabling HPET.\n");
+ goto out_nohpet;
+ }
+ hpet_base.channels = hc;
+ hpet_base.nr_channels = channels;
+
+ /* Read, store and sanitize the global configuration */
+ cfg = hpet_readl(HPET_CFG);
+ hpet_base.boot_cfg = cfg;
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ if (cfg)
+ pr_warn("Global config: Unknown bits %#x\n", cfg);
+
+ /* Read, store and sanitize the per channel configuration */
+ for (i = 0; i < channels; i++, hc++) {
+ hc->num = i;
+
+ cfg = hpet_readl(HPET_Tn_CFG(i));
+ hc->boot_cfg = cfg;
+ irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
+ hc->irq = irq;
+
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(i));
+
+ cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+ | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+ | HPET_TN_FSB | HPET_TN_FSB_CAP);
+ if (cfg)
+ pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg);
+ }
+ hpet_print_config();
+
+ /*
+ * Validate that the counter is counting. This needs to be done
+ * after sanitizing the config registers to properly deal with
+ * force enabled HPETs.
+ */
+ if (!hpet_counting())
goto out_nohpet;
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
+
if (id & HPET_ID_LEGSUP) {
- hpet_legacy_clockevent_register();
+ hpet_legacy_clockevent_register(&hpet_base.channels[0]);
+ hpet_base.channels[0].mode = HPET_MODE_LEGACY;
+ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC))
+ hpet_base.channels[1].mode = HPET_MODE_LEGACY;
return 1;
}
return 0;
out_nohpet:
+ kfree(hpet_base.channels);
+ hpet_base.channels = NULL;
+ hpet_base.nr_channels = 0;
hpet_clear_mapping();
hpet_address = 0;
return 0;
}
/*
- * Needs to be late, as the reserve_timer code calls kalloc !
+ * The late initialization runs after the PCI quirks have been invoked,
+ * which might have detected a system on which the HPET can be enforced.
*
- * Not a problem on i386 as hpet_enable is called from late_time_init,
- * but on x86_64 it is necessary !
+ * Also, the MSI machinery is not working yet when the HPET is initialized
+ * early.
+ *
+ * If the HPET is enabled, then:
+ *
+ * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y
+ * 2) Reserve up to num_possible_cpus() channels as per-CPU clockevents
+ * 3) Setup /dev/hpet if CONFIG_HPET=y
+ * 4) Register hotplug callbacks when clockevents are available
*/
static __init int hpet_late_init(void)
{
- int cpu;
-
- if (boot_hpet_disable)
- return -ENODEV;
+ int ret;
if (!hpet_address) {
if (!force_hpet_address)
@@ -937,64 +1133,76 @@ static __init int hpet_late_init(void)
if (!hpet_virt_address)
return -ENODEV;
- if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
- hpet_msi_capability_lookup(2);
- else
- hpet_msi_capability_lookup(0);
-
- hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ hpet_select_device_channel();
+ hpet_select_clockevents();
+ hpet_reserve_platform_timers();
hpet_print_config();
- if (hpet_msi_disable)
- return 0;
-
- if (boot_cpu_has(X86_FEATURE_ARAT))
+ if (!hpet_base.nr_clockevents)
return 0;
- for_each_online_cpu(cpu) {
- hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
- }
-
- /* This notifier should be called after workqueue is ready */
- hotcpu_notifier(hpet_cpuhp_notify, -20);
-
+ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
+ hpet_cpuhp_online, NULL);
+ if (ret)
+ return ret;
+ ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL,
+ hpet_cpuhp_dead);
+ if (ret)
+ goto err_cpuhp;
return 0;
+
+err_cpuhp:
+ cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE);
+ return ret;
}
fs_initcall(hpet_late_init);
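Note the hotplug registration pattern: cpuhp_setup_state() takes a startup and a teardown callback, and passing NULL for one of them registers the state for only that direction, so CPUHP_AP_X86_HPET_ONLINE runs hpet_cpuhp_online when a CPU comes up while CPUHP_X86_HPET_DEAD runs hpet_cpuhp_dead only after a CPU has died. cpuhp_setup_state() also invokes the startup callback for CPUs that are already online at registration time, which is why the removed for_each_online_cpu() loop is no longer needed.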
void hpet_disable(void)
{
- if (is_hpet_capable() && hpet_virt_address) {
- unsigned int cfg = hpet_readl(HPET_CFG);
+ unsigned int i;
+ u32 cfg;
- if (hpet_legacy_int_enabled) {
- cfg &= ~HPET_CFG_LEGACY;
- hpet_legacy_int_enabled = 0;
- }
- cfg &= ~HPET_CFG_ENABLE;
- hpet_writel(cfg, HPET_CFG);
- }
+ if (!is_hpet_capable() || !hpet_virt_address)
+ return;
+
+ /* Restore boot configuration with the enable bit cleared */
+ cfg = hpet_base.boot_cfg;
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+
+ /* Restore the channel boot configuration */
+ for (i = 0; i < hpet_base.nr_channels; i++)
+ hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i));
+
+ /* If the HPET was enabled at boot time, reenable it */
+ if (hpet_base.boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(hpet_base.boot_cfg, HPET_CFG);
}
#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+/*
+ * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
* is enabled, we support RTC interrupt functionality in software.
+ *
* RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- * is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
+ *
+ * 1) Update Interrupt - generate an interrupt every second, when the
+ * RTC clock is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2)
+ *
+ * (1) and (2) above are implemented using polling at a frequency of 64 Hz:
+ * DEFAULT_RTC_INT_FREQ.
+ *
+ * The exact frequency is a tradeoff between accuracy and interrupt overhead.
+ *
+ * For (3), we use interrupts at 64 Hz or the user-specified periodic
+ * frequency, whichever is higher.
*/
#include <linux/mc146818rtc.h>
#include <linux/rtc.h>
-#include <asm/rtc.h>
#define DEFAULT_RTC_INT_FREQ 64
#define DEFAULT_RTC_SHIFT 6
@@ -1012,7 +1220,7 @@ static unsigned long hpet_pie_limit;
static rtc_irq_handler irq_handler;
/*
- * Check that the hpet counter c1 is ahead of the c2
+ * Check that the HPET counter c1 is ahead of c2
*/
static inline int hpet_cnt_ahead(u32 c1, u32 c2)
{
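The body of hpet_cnt_ahead() is unchanged by this hunk and therefore elided from the diff. For reference, such a wraparound-safe comparison is conventionally written with the signed-difference idiom (a sketch, not necessarily the exact body):

    return (s32)(c2 - c1) < 0;

The cast makes the comparison correct even when the 32-bit counter has wrapped between the two samples.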
@@ -1050,8 +1258,8 @@ void hpet_unregister_irq_handler(rtc_irq_handler handler)
EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
/*
- * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
- * is not supported by all HPET implementations for timer 1.
+ * Channel 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for channel 1.
*
* hpet_rtc_timer_init() is called when the rtc is initialized.
*/
@@ -1064,10 +1272,11 @@ int hpet_rtc_timer_init(void)
return 0;
if (!hpet_default_delta) {
+ struct clock_event_device *evt = &hpet_base.channels[0].evt;
uint64_t clc;
- clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
- clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+ clc = (uint64_t) evt->mult * NSEC_PER_SEC;
+ clc >>= evt->shift + DEFAULT_RTC_SHIFT;
hpet_default_delta = clc;
}
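For the clockevent conversion above: delta_cycles = (delta_ns * mult) >> shift, so (mult * NSEC_PER_SEC) >> shift is one second's worth of HPET cycles, and the extra DEFAULT_RTC_SHIFT (6) divides that by 64 to yield the counter delta for one tick of the 64 Hz polling clock. At the typical 14.318 MHz HPET rate (assumed for illustration) that is about 14318180 / 64 ≈ 223722 counter ticks per poll.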
@@ -1093,6 +1302,14 @@ int hpet_rtc_timer_init(void)
}
EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
+static void hpet_disable_rtc_channel(void)
+{
+ u32 cfg = hpet_readl(HPET_T1_CFG);
+
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+}
+
/*
* The functions below are called from rtc driver.
* Return 0 if HPET is not being used.
@@ -1104,6 +1321,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
return 0;
hpet_rtc_flags &= ~bit_mask;
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
+
return 1;
}
EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
@@ -1127,8 +1347,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
}
EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
- unsigned char sec)
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
{
if (!is_hpet_enabled())
return 0;
@@ -1148,36 +1367,29 @@ int hpet_set_periodic_freq(unsigned long freq)
if (!is_hpet_enabled())
return 0;
- if (freq <= DEFAULT_RTC_INT_FREQ)
+ if (freq <= DEFAULT_RTC_INT_FREQ) {
hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
- else {
- clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ } else {
+ struct clock_event_device *evt = &hpet_base.channels[0].evt;
+
+ clc = (uint64_t) evt->mult * NSEC_PER_SEC;
do_div(clc, freq);
- clc >>= hpet_clockevent.shift;
+ clc >>= evt->shift;
hpet_pie_delta = clc;
hpet_pie_limit = 0;
}
+
return 1;
}
EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
-int hpet_rtc_dropped_irq(void)
-{
- return is_hpet_enabled();
-}
-EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
-
static void hpet_rtc_timer_reinit(void)
{
- unsigned int cfg, delta;
+ unsigned int delta;
int lost_ints = -1;
- if (unlikely(!hpet_rtc_flags)) {
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T1_CFG);
- return;
- }
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
delta = hpet_default_delta;
@@ -1198,8 +1410,7 @@ static void hpet_rtc_timer_reinit(void)
if (hpet_rtc_flags & RTC_PIE)
hpet_pie_count += lost_ints;
if (printk_ratelimit())
- printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
- lost_ints);
+ pr_warn("Lost %d RTC interrupts\n", lost_ints);
}
}
@@ -1211,8 +1422,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_rtc_timer_reinit();
memset(&curr_time, 0, sizeof(struct rtc_time));
- if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- get_rtc_time(&curr_time);
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
+ if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) {
+ pr_err_ratelimited("unable to read current time from RTC\n");
+ return IRQ_HANDLED;
+ }
+ }
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
@@ -1221,8 +1436,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_prev_update_sec = curr_time.tm_sec;
}
- if (hpet_rtc_flags & RTC_PIE &&
- ++hpet_pie_count >= hpet_pie_limit) {
+ if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) {
rtc_int_flag |= RTC_PF;
hpet_pie_count = 0;
}
@@ -1231,7 +1445,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
(curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
(curr_time.tm_min == hpet_alarm_time.tm_min) &&
(curr_time.tm_hour == hpet_alarm_time.tm_hour))
- rtc_int_flag |= RTC_AF;
+ rtc_int_flag |= RTC_AF;
if (rtc_int_flag) {
rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..f846c15f21ca 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -1,17 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) 2007 Alan Stern
* Copyright (C) 2009 IBM Corporation
@@ -36,14 +24,17 @@
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/kvm_types.h>
+#include <linux/export.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <asm/hw_breakpoint.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/user.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
@@ -109,8 +100,10 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
unsigned long *dr7;
int i;
+ lockdep_assert_irqs_disabled();
+
for (i = 0; i < HBP_NUM; i++) {
- struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
if (!*slot) {
*slot = bp;
@@ -122,12 +115,20 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
return -EBUSY;
set_debugreg(info->address, i);
- __get_cpu_var(cpu_debugreg[i]) = info->address;
+ __this_cpu_write(cpu_debugreg[i], info->address);
- dr7 = &__get_cpu_var(cpu_dr7);
+ dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
+ /*
+ * Ensure we first write cpu_dr7 before we set the DR7 register.
+ * This ensures an NMI never sees a zero cpu_dr7 while DR7 is non-zero.
+ */
+ barrier();
+
set_debugreg(*dr7, 7);
+ if (info->mask)
+ amd_set_dr_addr_mask(info->mask, i);
return 0;
}
@@ -144,11 +145,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
- unsigned long *dr7;
+ unsigned long dr7;
int i;
+ lockdep_assert_irqs_disabled();
+
for (i = 0; i < HBP_NUM; i++) {
- struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
if (*slot == bp) {
*slot = NULL;
@@ -159,78 +162,54 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return;
- dr7 = &__get_cpu_var(cpu_dr7);
- *dr7 &= ~__encode_dr7(i, info->len, info->type);
-
- set_debugreg(*dr7, 7);
-}
+ dr7 = this_cpu_read(cpu_dr7);
+ dr7 &= ~__encode_dr7(i, info->len, info->type);
-static int get_hbp_len(u8 hbp_len)
-{
- unsigned int len_in_bytes = 0;
-
- switch (hbp_len) {
- case X86_BREAKPOINT_LEN_1:
- len_in_bytes = 1;
- break;
- case X86_BREAKPOINT_LEN_2:
- len_in_bytes = 2;
- break;
- case X86_BREAKPOINT_LEN_4:
- len_in_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86_BREAKPOINT_LEN_8:
- len_in_bytes = 8;
- break;
-#endif
- }
- return len_in_bytes;
-}
-
-/*
- * Check for virtual address in kernel space.
- */
-int arch_check_bp_in_kernelspace(struct perf_event *bp)
-{
- unsigned int len;
- unsigned long va;
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ set_debugreg(dr7, 7);
+ if (info->mask)
+ amd_set_dr_addr_mask(0, i);
- va = info->address;
- len = get_hbp_len(info->len);
+ /*
+ * Ensure the write to cpu_dr7 is after we've set the DR7 register.
+ * This ensures an NMI never sees a zero cpu_dr7 while DR7 is non-zero.
+ */
+ barrier();
- return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+ this_cpu_write(cpu_dr7, dr7);
}
-int arch_bp_generic_fields(int x86_len, int x86_type,
- int *gen_len, int *gen_type)
+static int arch_bp_generic_len(int x86_len)
{
- /* Len */
switch (x86_len) {
case X86_BREAKPOINT_LEN_1:
- *gen_len = HW_BREAKPOINT_LEN_1;
- break;
+ return HW_BREAKPOINT_LEN_1;
case X86_BREAKPOINT_LEN_2:
- *gen_len = HW_BREAKPOINT_LEN_2;
- break;
+ return HW_BREAKPOINT_LEN_2;
case X86_BREAKPOINT_LEN_4:
- *gen_len = HW_BREAKPOINT_LEN_4;
- break;
+ return HW_BREAKPOINT_LEN_4;
#ifdef CONFIG_X86_64
case X86_BREAKPOINT_LEN_8:
- *gen_len = HW_BREAKPOINT_LEN_8;
- break;
+ return HW_BREAKPOINT_LEN_8;
#endif
default:
return -EINVAL;
}
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type)
+{
+ int len;
/* Type */
switch (x86_type) {
case X86_BREAKPOINT_EXECUTE:
+ if (x86_len != X86_BREAKPOINT_LEN_X)
+ return -EINVAL;
+
*gen_type = HW_BREAKPOINT_X;
- break;
+ *gen_len = sizeof(long);
+ return 0;
case X86_BREAKPOINT_WRITE:
*gen_type = HW_BREAKPOINT_W;
break;
@@ -241,72 +220,223 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
return -EINVAL;
}
+ /* Len */
+ len = arch_bp_generic_len(x86_len);
+ if (len < 0)
+ return -EINVAL;
+ *gen_len = len;
+
return 0;
}
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
+{
+ unsigned long va;
+ int len;
+
+ va = hw->address;
+ len = arch_bp_generic_len(hw->len);
+ WARN_ON_ONCE(len < 0);
+
+ /*
+ * We don't need to worry about va + len - 1 overflowing:
+ * we already require that va is aligned to a multiple of len.
+ */
+ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+}
-static int arch_build_bp_info(struct perf_event *bp)
+/*
+ * Checks whether the range [addr, end] overlaps the area [base, base + size).
+ */
+static inline bool within_area(unsigned long addr, unsigned long end,
+ unsigned long base, unsigned long size)
{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ return end >= base && addr < (base + size);
+}
+
+/*
+ * Checks whether the range from addr to end, inclusive, overlaps the fixed
+ * mapped CPU entry area range or other ranges used for CPU entry.
+ */
+static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
+{
+ int cpu;
+
+ /* CPU entry area is always used for CPU entry */
+ if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_MAP_SIZE))
+ return true;
+
+ /*
+ * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+ * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+ */
+#ifdef CONFIG_SMP
+ if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+ sizeof(unsigned long) * nr_cpu_ids))
+ return true;
+#else
+ if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+ sizeof(pcpu_unit_offsets)))
+ return true;
+#endif
+
+ for_each_possible_cpu(cpu) {
+ /* The original rw GDT is being used after load_direct_gdt() */
+ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
+ GDT_SIZE))
+ return true;
+
+ /*
+ * cpu_tss_rw is not directly referenced by hardware, but it
+ * is also used by the CPU entry code.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct)))
+ return true;
- info->address = bp->attr.bp_addr;
+ /*
+ * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
+ * A data breakpoint on it would cause an unwanted #DB.
+ * Protect the full cpu_tlbstate structure to be sure.
+ */
+ if (within_area(addr, end,
+ (unsigned long)&per_cpu(cpu_tlbstate, cpu),
+ sizeof(struct tlb_state)))
+ return true;
+
+ /*
+ * When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+ * will read the per-CPU cpu_dr7 before clearing the DR7 register.
+ */
+ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+ sizeof(cpu_dr7)))
+ return true;
+ }
+
+ return false;
+}
+
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ unsigned long bp_end;
+
+ bp_end = attr->bp_addr + attr->bp_len - 1;
+ if (bp_end < attr->bp_addr)
+ return -EINVAL;
+
+ /*
+ * Prevent any breakpoint of any type that overlaps the CPU
+ * entry area and data. This protects the IST stacks and also
+ * reduces the chance that we ever find out what happens if
+ * there's a data breakpoint on the GDT, IDT, or TSS.
+ */
+ if (within_cpu_entry(attr->bp_addr, bp_end))
+ return -EINVAL;
+
+ hw->address = attr->bp_addr;
+ hw->mask = 0;
+
+ /* Type */
+ switch (attr->bp_type) {
+ case HW_BREAKPOINT_W:
+ hw->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ hw->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ /*
+ * We don't allow kernel breakpoints in places that are not
+ * acceptable for kprobes. On non-kprobes kernels, we don't
+ * allow kernel breakpoints at all.
+ */
+ if (attr->bp_addr >= TASK_SIZE_MAX) {
+ if (within_kprobe_blacklist(attr->bp_addr))
+ return -EINVAL;
+ }
+
+ hw->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 inst breakpoints need to have a specific undefined len.
+ * But we still need to check that userspace is not trying to set up
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (attr->bp_len == sizeof(long)) {
+ hw->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
/* Len */
- switch (bp->attr.bp_len) {
+ switch (attr->bp_len) {
case HW_BREAKPOINT_LEN_1:
- info->len = X86_BREAKPOINT_LEN_1;
+ hw->len = X86_BREAKPOINT_LEN_1;
break;
case HW_BREAKPOINT_LEN_2:
- info->len = X86_BREAKPOINT_LEN_2;
+ hw->len = X86_BREAKPOINT_LEN_2;
break;
case HW_BREAKPOINT_LEN_4:
- info->len = X86_BREAKPOINT_LEN_4;
+ hw->len = X86_BREAKPOINT_LEN_4;
break;
#ifdef CONFIG_X86_64
case HW_BREAKPOINT_LEN_8:
- info->len = X86_BREAKPOINT_LEN_8;
+ hw->len = X86_BREAKPOINT_LEN_8;
break;
#endif
default:
- return -EINVAL;
- }
+ /* AMD range breakpoint */
+ if (!is_power_of_2(attr->bp_len))
+ return -EINVAL;
+ if (attr->bp_addr & (attr->bp_len - 1))
+ return -EINVAL;
- /* Type */
- switch (bp->attr.bp_type) {
- case HW_BREAKPOINT_W:
- info->type = X86_BREAKPOINT_WRITE;
- break;
- case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- info->type = X86_BREAKPOINT_RW;
- break;
- case HW_BREAKPOINT_X:
- info->type = X86_BREAKPOINT_EXECUTE;
- break;
- default:
- return -EINVAL;
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return -EOPNOTSUPP;
+
+ /*
+ * It's impossible to use a range breakpoint to fake out
+ * user vs kernel detection because bp_len - 1 can't
+ * have the high bit set. If we ever allow range instruction
+ * breakpoints, then we'll have to check for kprobe-blacklisted
+ * addresses anywhere in the range.
+ */
+ hw->mask = attr->bp_len - 1;
+ hw->len = X86_BREAKPOINT_LEN_1;
}
return 0;
}
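Worked example for the AMD range-breakpoint path (values assumed for illustration): attr->bp_len = 16 with a 16-byte aligned attr->bp_addr passes both checks, so hw->mask becomes 0xF while hw->len is reported as X86_BREAKPOINT_LEN_1. arch_install_hw_breakpoint() later programs that mask via amd_set_dr_addr_mask(), widening the nominal 1-byte breakpoint to cover the whole 16-byte aligned region.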
+
/*
* Validate the arch-specific HW Breakpoint register settings
*/
-int arch_validate_hwbkpt_settings(struct perf_event *bp)
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
{
- struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned int align;
int ret;
- ret = arch_build_bp_info(bp);
+ ret = arch_build_bp_info(bp, attr, hw);
if (ret)
return ret;
- ret = -EINVAL;
-
- switch (info->len) {
+ switch (hw->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
+ if (hw->mask)
+ align = hw->mask;
break;
case X86_BREAKPOINT_LEN_2:
align = 1;
@@ -320,56 +450,21 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
break;
#endif
default:
- return ret;
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
/*
* Check that the low-order bits of the address are appropriate
* for the alignment implied by len.
*/
- if (info->address & align)
+ if (hw->address & align)
return -EINVAL;
return 0;
}
/*
- * Dump the debug register contents to the user.
- * We can't dump our per cpu values because it
- * may contain cpu wide breakpoint, something that
- * doesn't belong to the current task.
- *
- * TODO: include non-ptrace user breakpoints (perf)
- */
-void aout_dump_debugregs(struct user *dump)
-{
- int i;
- int dr7 = 0;
- struct perf_event *bp;
- struct arch_hw_breakpoint *info;
- struct thread_struct *thread = &current->thread;
-
- for (i = 0; i < HBP_NUM; i++) {
- bp = thread->ptrace_bps[i];
-
- if (bp && !bp->attr.disabled) {
- dump->u_debugreg[i] = bp->attr.bp_addr;
- info = counter_arch_bp(bp);
- dr7 |= encode_dr7(i, info->len, info->type);
- } else {
- dump->u_debugreg[i] = 0;
- }
- }
-
- dump->u_debugreg[4] = 0;
- dump->u_debugreg[5] = 0;
- dump->u_debugreg[6] = current->thread.debugreg6;
-
- dump->u_debugreg[7] = dr7;
-}
-EXPORT_SYMBOL_GPL(aout_dump_debugregs);
-
-/*
* Release the user breakpoints used by ptrace
*/
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
@@ -381,18 +476,21 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
unregister_hw_breakpoint(t->ptrace_bps[i]);
t->ptrace_bps[i] = NULL;
}
+
+ t->virtual_dr6 = 0;
+ t->ptrace_dr7 = 0;
}
void hw_breakpoint_restore(void)
{
- set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
- set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
- set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
- set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
- set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(__get_cpu_var(cpu_dr7), 7);
+ set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
+ set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
+ set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
+ set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
+ set_debugreg(DR6_RESERVED, 6);
+ set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
-EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+EXPORT_SYMBOL_FOR_KVM(hw_breakpoint_restore);
/*
* Handle debug exception notifications.
@@ -410,12 +508,13 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
* NOTIFY_STOP returned for all other cases
*
*/
-static int __kprobes hw_breakpoint_handler(struct die_args *args)
+static int hw_breakpoint_handler(struct die_args *args)
{
- int i, cpu, rc = NOTIFY_STOP;
+ int i, rc = NOTIFY_STOP;
struct perf_event *bp;
- unsigned long dr7, dr6;
unsigned long *dr6_p;
+ unsigned long dr6;
+ bool bpx;
/* The DR6 value is pointed by args->err */
dr6_p = (unsigned long *)ERR_PTR(args->err);
@@ -425,68 +524,61 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
- get_debugreg(dr7, 7);
- /* Disable breakpoints during exception handling */
- set_debugreg(0UL, 7);
- /*
- * Assert that local interrupts are disabled
- * Reset the DRn bits in the virtualized register value.
- * The ptrace trigger routine will add in whatever is needed.
- */
- current->thread.debugreg6 &= ~DR_TRAP_BITS;
- cpu = get_cpu();
-
/* Handle all the breakpoints that were triggered */
for (i = 0; i < HBP_NUM; ++i) {
if (likely(!(dr6 & (DR_TRAP0 << i))))
continue;
+ bp = this_cpu_read(bp_per_reg[i]);
+ if (!bp)
+ continue;
+
+ bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;
+
/*
- * The counter may be concurrently released but that can only
- * occur from a call_rcu() path. We can then safely fetch
- * the breakpoint, use its callback, touch its counter
- * while we are in an rcu_read_lock() path.
+ * TF and data breakpoints are traps and can be merged; instruction
+ * breakpoints are faults and will be raised separately.
+ *
+ * However, DR6 can indicate both TF and instruction
+ * breakpoints. In that case take TF, as that has precedence, and
+ * delay the instruction breakpoint to the next exception.
*/
- rcu_read_lock();
+ if (bpx && (dr6 & DR_STEP))
+ continue;
- bp = per_cpu(bp_per_reg[i], cpu);
/*
* Reset the 'i'th TRAP bit in dr6 to denote completion of
* exception handling
*/
(*dr6_p) &= ~(DR_TRAP0 << i);
- /*
- * bp can be NULL due to lazy debug register switching
- * or due to concurrent perf counter removing.
- */
- if (!bp) {
- rcu_read_unlock();
- break;
- }
perf_bp_event(bp, args->regs);
- rcu_read_unlock();
+ /*
+ * Set up the resume flag to avoid breakpoint recursion when
+ * returning to the origin.
+ */
+ if (bpx)
+ args->regs->flags |= X86_EFLAGS_RF;
}
+
/*
* Further processing in do_debug() is needed for a) user-space
* breakpoints (to generate signals) and b) when the system has
* taken exception due to multiple causes
*/
- if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
+ if ((current->thread.virtual_dr6 & DR_TRAP_BITS) ||
(dr6 & (~DR_TRAP_BITS)))
rc = NOTIFY_DONE;
- set_debugreg(dr7, 7);
- put_cpu();
-
return rc;
}
/*
* Handle debug exception notifications.
*/
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
{
if (val != DIE_DEBUG)
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
deleted file mode 100644
index 9c3bd4a2050e..000000000000
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ /dev/null
@@ -1,38 +0,0 @@
-#include <linux/module.h>
-
-#include <asm/checksum.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/ftrace.h>
-
-#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
-EXPORT_SYMBOL(mcount);
-#endif
-
-/*
- * Note, this is a prototype to get at the symbol for
- * the export, but dont use it from C code, it is used
- * by assembly code and is not using C calling convention!
- */
-#ifndef CONFIG_X86_CMPXCHG64
-extern void cmpxchg8b_emu(void);
-EXPORT_SYMBOL(cmpxchg8b_emu);
-#endif
-
-/* Networking helper routines. */
-EXPORT_SYMBOL(csum_partial_copy_generic);
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(strstr);
-
-EXPORT_SYMBOL(csum_partial);
-EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
deleted file mode 100644
index 86cef6b32253..000000000000
--- a/arch/x86/kernel/i387.c
+++ /dev/null
@@ -1,730 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- * Gareth Hughes <gareth@valinux.com>, May 2000
- */
-#include <linux/module.h>
-#include <linux/regset.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include <asm/sigcontext.h>
-#include <asm/processor.h>
-#include <asm/math_emu.h>
-#include <asm/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/i387.h>
-#include <asm/user.h>
-
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-#else
-# define save_i387_xstate_ia32 save_i387_xstate
-# define restore_i387_xstate_ia32 restore_i387_xstate
-# define _fpstate_ia32 _fpstate
-# define _xstate_ia32 _xstate
-# define sig_xstate_ia32_size sig_xstate_size
-# define fx_sw_reserved_ia32 fx_sw_reserved
-# define user_i387_ia32_struct user_i387_struct
-# define user32_fxsr_struct user_fxsr_struct
-#endif
-
-#ifdef CONFIG_MATH_EMULATION
-# define HAVE_HWFP (boot_cpu_data.hard_math)
-#else
-# define HAVE_HWFP 1
-#endif
-
-static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
-unsigned int xstate_size;
-unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
-static struct i387_fxsave_struct fx_scratch __cpuinitdata;
-
-void __cpuinit mxcsr_feature_mask_init(void)
-{
- unsigned long mask = 0;
-
- clts();
- if (cpu_has_fxsr) {
- memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (fx_scratch));
- mask = fx_scratch.mxcsr_mask;
- if (mask == 0)
- mask = 0x0000ffbf;
- }
- mxcsr_feature_mask &= mask;
- stts();
-}
-
-void __cpuinit init_thread_xstate(void)
-{
- if (!HAVE_HWFP) {
- xstate_size = sizeof(struct i387_soft_struct);
- return;
- }
-
- if (cpu_has_xsave) {
- xsave_cntxt_init();
- return;
- }
-
- if (cpu_has_fxsr)
- xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
- else
- xstate_size = sizeof(struct i387_fsave_struct);
-#endif
-}
-
-#ifdef CONFIG_X86_64
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-void __cpuinit fpu_init(void)
-{
- unsigned long oldcr0 = read_cr0();
-
- set_in_cr4(X86_CR4_OSFXSR);
- set_in_cr4(X86_CR4_OSXMMEXCPT);
-
- write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
-
- /*
- * Boot processor to setup the FP and extended state context info.
- */
- if (!smp_processor_id())
- init_thread_xstate();
- xsave_init();
-
- mxcsr_feature_mask_init();
- /* clean state in init */
- current_thread_info()->status = 0;
- clear_used_math();
-}
-#endif /* CONFIG_X86_64 */
-
-static void fpu_finit(struct fpu *fpu)
-{
-#ifdef CONFIG_X86_32
- if (!HAVE_HWFP) {
- finit_soft_fpu(&fpu->state->soft);
- return;
- }
-#endif
-
- if (cpu_has_fxsr) {
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
- memset(fx, 0, xstate_size);
- fx->cwd = 0x37f;
- if (cpu_has_xmm)
- fx->mxcsr = MXCSR_DEFAULT;
- } else {
- struct i387_fsave_struct *fp = &fpu->state->fsave;
- memset(fp, 0, xstate_size);
- fp->cwd = 0xffff037fu;
- fp->swd = 0xffff0000u;
- fp->twd = 0xffffffffu;
- fp->fos = 0xffff0000u;
- }
-}
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
-{
- int ret;
-
- if (tsk_used_math(tsk)) {
- if (HAVE_HWFP && tsk == current)
- unlazy_fpu(tsk);
- return 0;
- }
-
- /*
- * Memory allocation at the first usage of the FPU and other state.
- */
- ret = fpu_alloc(&tsk->thread.fpu);
- if (ret)
- return ret;
-
- fpu_finit(&tsk->thread.fpu);
-
- set_stopped_child_used_math(tsk);
- return 0;
-}
-
-/*
- * The xstateregs_active() routine is the same as the fpregs_active() routine,
- * as the "regset->n" for the xstate regset will be updated based on the feature
- * capabilites supported by the xsave.
- */
-int fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
- return tsk_used_math(target) ? regset->n : 0;
-}
-
-int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
- return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
-}
-
-int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
-{
- int ret;
-
- if (!cpu_has_fxsr)
- return -ENODEV;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->fxsave, 0, -1);
-}
-
-int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
-{
- int ret;
-
- if (!cpu_has_fxsr)
- return -ENODEV;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->fxsave, 0, -1);
-
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
- /*
- * update the header bits in the xsave header, indicating the
- * presence of FP and SSE state.
- */
- if (cpu_has_xsave)
- target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
- return ret;
-}
-
-int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
-{
- int ret;
-
- if (!cpu_has_xsave)
- return -ENODEV;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- /*
- * Copy the 48bytes defined by the software first into the xstate
- * memory layout in the thread struct, so that we can copy the entire
- * xstateregs to the user using one user_regset_copyout().
- */
- memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
- xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
- /*
- * Copy the xstate memory layout.
- */
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->xsave, 0, -1);
- return ret;
-}
-
-int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
-{
- int ret;
- struct xsave_hdr_struct *xsave_hdr;
-
- if (!cpu_has_xsave)
- return -ENODEV;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->xsave, 0, -1);
-
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
- xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
- /*
- * These bits must be zero.
- */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
-
- return ret;
-}
-
-#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
-{
- unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-
- /* Transform each pair of bits into 01 (valid) or 00 (empty) */
- tmp = ~twd;
- tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
- /* and move the valid bits to the lower byte. */
- tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
- tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
- tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-
- return tmp;
-}
-
-#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
-#define FP_EXP_TAG_VALID 0
-#define FP_EXP_TAG_ZERO 1
-#define FP_EXP_TAG_SPECIAL 2
-#define FP_EXP_TAG_EMPTY 3
-
-static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
-{
- struct _fpxreg *st;
- u32 tos = (fxsave->swd >> 11) & 7;
- u32 twd = (unsigned long) fxsave->twd;
- u32 tag;
- u32 ret = 0xffff0000u;
- int i;
-
- for (i = 0; i < 8; i++, twd >>= 1) {
- if (twd & 0x1) {
- st = FPREG_ADDR(fxsave, (i - tos) & 7);
-
- switch (st->exponent & 0x7fff) {
- case 0x7fff:
- tag = FP_EXP_TAG_SPECIAL;
- break;
- case 0x0000:
- if (!st->significand[0] &&
- !st->significand[1] &&
- !st->significand[2] &&
- !st->significand[3])
- tag = FP_EXP_TAG_ZERO;
- else
- tag = FP_EXP_TAG_SPECIAL;
- break;
- default:
- if (st->significand[3] & 0x8000)
- tag = FP_EXP_TAG_VALID;
- else
- tag = FP_EXP_TAG_SPECIAL;
- break;
- }
- } else {
- tag = FP_EXP_TAG_EMPTY;
- }
- ret |= tag << (2 * i);
- }
- return ret;
-}
-
-/*
- * FXSR floating point environment conversions.
- */
-
-static void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
-{
- struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
- struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
- struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
- int i;
-
- env->cwd = fxsave->cwd | 0xffff0000u;
- env->swd = fxsave->swd | 0xffff0000u;
- env->twd = twd_fxsr_to_i387(fxsave);
-
-#ifdef CONFIG_X86_64
- env->fip = fxsave->rip;
- env->foo = fxsave->rdp;
- if (tsk == current) {
- /*
- * should be actually ds/cs at fpu exception time, but
- * that information is not available in 64bit mode.
- */
- asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
- asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
- } else {
- struct pt_regs *regs = task_pt_regs(tsk);
-
- env->fos = 0xffff0000 | tsk->thread.ds;
- env->fcs = regs->cs;
- }
-#else
- env->fip = fxsave->fip;
- env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
- env->foo = fxsave->foo;
- env->fos = fxsave->fos;
-#endif
-
- for (i = 0; i < 8; ++i)
- memcpy(&to[i], &from[i], sizeof(to[0]));
-}
-
-static void convert_to_fxsr(struct task_struct *tsk,
- const struct user_i387_ia32_struct *env)
-
-{
- struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
- struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
- struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
- int i;
-
- fxsave->cwd = env->cwd;
- fxsave->swd = env->swd;
- fxsave->twd = twd_i387_to_fxsr(env->twd);
- fxsave->fop = (u16) ((u32) env->fcs >> 16);
-#ifdef CONFIG_X86_64
- fxsave->rip = env->fip;
- fxsave->rdp = env->foo;
- /* cs and ds ignored */
-#else
- fxsave->fip = env->fip;
- fxsave->fcs = (env->fcs & 0xffff);
- fxsave->foo = env->foo;
- fxsave->fos = env->fos;
-#endif
-
- for (i = 0; i < 8; ++i)
- memcpy(&to[i], &from[i], sizeof(from[0]));
-}
-
-int fpregs_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
-{
- struct user_i387_ia32_struct env;
- int ret;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- if (!HAVE_HWFP)
- return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
-
- if (!cpu_has_fxsr) {
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->fsave, 0,
- -1);
- }
-
- if (kbuf && pos == 0 && count == sizeof(env)) {
- convert_from_fxsr(kbuf, target);
- return 0;
- }
-
- convert_from_fxsr(&env, target);
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-}
-
-int fpregs_set(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- const void *kbuf, const void __user *ubuf)
-{
- struct user_i387_ia32_struct env;
- int ret;
-
- ret = init_fpu(target);
- if (ret)
- return ret;
-
- if (!HAVE_HWFP)
- return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
-
- if (!cpu_has_fxsr) {
- return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->fsave, 0, -1);
- }
-
- if (pos > 0 || count < sizeof(env))
- convert_from_fxsr(&env, target);
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
- if (!ret)
- convert_to_fxsr(target, &env);
-
- /*
- * update the header bit in the xsave header, indicating the
- * presence of FP.
- */
- if (cpu_has_xsave)
- target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
- return ret;
-}
-
-/*
- * Signal frame handlers.
- */
-
-static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
-
- fp->status = fp->swd;
- if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
- return -1;
- return 1;
-}
-
-static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
- struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
- struct user_i387_ia32_struct env;
- int err = 0;
-
- convert_from_fxsr(&env, tsk);
- if (__copy_to_user(buf, &env, sizeof(env)))
- return -1;
-
- err |= __put_user(fx->swd, &buf->status);
- err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
- if (err)
- return -1;
-
- if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
- return -1;
- return 1;
-}
-
-static int save_i387_xsave(void __user *buf)
-{
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fx = buf;
- int err = 0;
-
- /*
- * For legacy compatible, we always set FP/SSE bits in the bit
- * vector while saving the state to the user context.
- * This will enable us capturing any changes(during sigreturn) to
- * the FP/SSE bits by the legacy applications which don't touch
- * xstate_bv in the xsave header.
- *
- * xsave aware applications can change the xstate_bv in the xsave
- * header as well as change any contents in the memory layout.
- * xrestore as part of sigreturn will capture all the changes.
- */
- tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
- if (save_i387_fxsave(fx) < 0)
- return -1;
-
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
- sizeof(struct _fpx_sw_bytes));
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_ia32_size
- - FP_XSTATE_MAGIC2_SIZE));
- if (err)
- return -1;
-
- return 1;
-}
-
-int save_i387_xstate_ia32(void __user *buf)
-{
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
- struct task_struct *tsk = current;
-
- if (!used_math())
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
- return -EACCES;
- /*
- * This will cause a "finit" to be triggered by the next
- * attempted FPU operation by the 'current' process.
- */
- clear_used_math();
-
- if (!HAVE_HWFP) {
- return fpregs_soft_get(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) ? -1 : 1;
- }
-
- unlazy_fpu(tsk);
-
- if (cpu_has_xsave)
- return save_i387_xsave(fp);
- if (cpu_has_fxsr)
- return save_i387_fxsave(fp);
- else
- return save_i387_fsave(fp);
-}
-
-static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
-{
- struct task_struct *tsk = current;
-
- return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
- sizeof(struct i387_fsave_struct));
-}
-
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
- unsigned int size)
-{
- struct task_struct *tsk = current;
- struct user_i387_ia32_struct env;
- int err;
-
- err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
- size);
- /* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
- if (err || __copy_from_user(&env, buf, sizeof(env)))
- return 1;
- convert_to_fxsr(tsk, &env);
-
- return 0;
-}
-
-static int restore_i387_xsave(void __user *buf)
-{
- struct _fpx_sw_bytes fx_sw_user;
- struct _fpstate_ia32 __user *fx_user =
- ((struct _fpstate_ia32 __user *) buf);
- struct i387_fxsave_struct __user *fx =
- (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
- struct xsave_hdr_struct *xsave_hdr =
- &current->thread.fpu.state->xsave.xsave_hdr;
- u64 mask;
- int err;
-
- if (check_for_xstate(fx, buf, &fx_sw_user))
- goto fx_only;
-
- mask = fx_sw_user.xstate_bv;
-
- err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
- /*
- * These bits must be zero.
- */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
-
- /*
- * Init the state that is not present in the memory layout
- * and enabled by the OS.
- */
- mask = ~(pcntxt_mask & ~mask);
- xsave_hdr->xstate_bv &= mask;
-
- return err;
-fx_only:
- /*
- * Couldn't find the extended state information in the memory
- * layout. Restore the FP/SSE and init the other extended state
- * enabled by the OS.
- */
- xsave_hdr->xstate_bv = XSTATE_FPSSE;
- return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
-}
-
-int restore_i387_xstate_ia32(void __user *buf)
-{
- int err;
- struct task_struct *tsk = current;
- struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
-
- if (HAVE_HWFP)
- clear_fpu(tsk);
-
- if (!buf) {
- if (used_math()) {
- clear_fpu(tsk);
- clear_used_math();
- }
-
- return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
- return -EACCES;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (HAVE_HWFP) {
- if (cpu_has_xsave)
- err = restore_i387_xsave(buf);
- else if (cpu_has_fxsr)
- err = restore_i387_fxsave(fp, sizeof(struct
- i387_fxsave_struct));
- else
- err = restore_i387_fsave(fp);
- } else {
- err = fpregs_soft_set(current, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- NULL, fp) != 0;
- }
- set_used_math();
-
- return err;
-}
-
-/*
- * FPU state for core dumps.
- * This is only used for a.out dumps now.
- * It is declared generically using elf_fpregset_t (which is
- * struct user_i387_struct) but is in fact only used for 32-bit
- * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
- */
-int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
-{
- struct task_struct *tsk = current;
- int fpvalid;
-
- fpvalid = !!used_math();
- if (fpvalid)
- fpvalid = !fpregs_get(tsk, NULL,
- 0, sizeof(struct user_i387_ia32_struct),
- fpu, NULL);
-
- return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
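The removed restore_i387_fxsave() above masks the user-supplied MXCSR against
mxcsr_feature_mask before it can ever reach FXRSTOR, since loading reserved
MXCSR bits raises #GP. A minimal user-space sketch of the same sanitization,
assuming the architectural default mask 0xffbf (used when the FXSAVE image
reports a zero mask field):

#include <stdint.h>
#include <stdio.h>

/* Architectural default when the FXSAVE image reports a zero MXCSR mask. */
#define MXCSR_DEFAULT_MASK	0x0000ffbfu

static uint32_t sanitize_mxcsr(uint32_t mxcsr)
{
	/* Reserved bits must be cleared; loading them would raise #GP. */
	return mxcsr & MXCSR_DEFAULT_MASK;
}

int main(void)
{
	uint32_t from_frame = 0xdead1f80u;	/* hypothetical signal-frame value */

	printf("sanitized mxcsr: %#x\n", sanitize_mxcsr(from_frame));
	return 0;
}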
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..896d46b44284 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 8237A DMA controller suspend functions.
*
* Written by Pierre Ossman, 2005.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
*/
+#include <linux/dmi.h>
#include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <asm/dma.h>
+#include <asm/x86_init.h>
/*
* This module just handles suspend/resume issues with the
@@ -21,7 +19,7 @@
* in asm/dma.h.
*/
-static int i8237A_resume(struct sys_device *dev)
+static void i8237A_resume(void)
{
unsigned long flags;
int i;
@@ -41,31 +39,42 @@ static int i8237A_resume(struct sys_device *dev)
enable_dma(4);
release_dma_lock(flags);
-
- return 0;
-}
-
-static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
-{
- return 0;
}
-static struct sysdev_class i8237_sysdev_class = {
- .name = "i8237",
- .suspend = i8237A_suspend,
+static struct syscore_ops i8237_syscore_ops = {
.resume = i8237A_resume,
};
-static struct sys_device device_i8237A = {
-	.id	= 0,
-	.cls	= &i8237_sysdev_class,
-};
-static int __init i8237A_init_sysfs(void)
+static int __init i8237A_init_ops(void)
{
- int error = sysdev_class_register(&i8237_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8237A);
- return error;
+ /*
+ * From SKL PCH onwards, the legacy DMA device is removed in which the
+ * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed
+	 * as well. All removed ports must return 0xff for an inb() request.
+ *
+ * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting
+ * the presence of DMA device since it may be used by BIOS to decode
+ * LPC traffic for POST codes. Original LPC only decodes one byte of
+ * port 0x80 but some BIOS may choose to enhance PCH LPC port 0x8x
+ * decoding.
+ */
+ if (dma_inb(DMA_PAGE_0) == 0xFF)
+ return -ENODEV;
+
+ /*
+	 * It is not required to load this driver as newer SoCs may not
+ * support 8237 DMA or bus mastering from LPC. Platform firmware
+ * must announce the support for such legacy devices via
+ * ACPI_FADT_LEGACY_DEVICES field in FADT table.
+ */
+ if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017)
+ return -ENODEV;
+
+	register_syscore_ops(&i8237_syscore_ops);
+ return 0;
}
-device_initcall(i8237A_init_sysfs);
+device_initcall(i8237A_init_ops);
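The probe in i8237A_init_ops() relies on removed legacy I/O ports reading back
0xff. A hedged user-space analogue of that detection (the port number matches
DMA_PAGE_0 from asm/dma.h; needs root for ioperm() and x86 glibc's sys/io.h):

#include <stdio.h>
#include <sys/io.h>

#define DMA_PAGE_0_PORT 0x87	/* page register for DMA channel 0 */

int main(void)
{
	if (ioperm(DMA_PAGE_0_PORT, 1, 1)) {
		perror("ioperm");
		return 1;
	}
	if (inb(DMA_PAGE_0_PORT) == 0xff)
		puts("port floats high: legacy DMA device likely absent");
	else
		puts("legacy 8237 DMA page register responds");
	return 0;
}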
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..cb9852ad6098 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -1,24 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* 8253/PIT functions
*
*/
#include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-#include <linux/timex.h>
-#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
-#include <asm/i8253.h>
+#include <asm/hypervisor.h>
+#include <asm/apic.h>
#include <asm/hpet.h>
+#include <asm/time.h>
#include <asm/smp.h>
-DEFINE_RAW_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
/*
* HPET replaces the PIT, when enabled. So we need to know, which of
* the two timers is used
@@ -26,172 +21,41 @@ EXPORT_SYMBOL(i8253_lock);
struct clock_event_device *global_clock_event;
/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
-
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* binary, mode 2, LSB/MSB, ch 0 */
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
- outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
- break;
-
- case CLOCK_EVT_MODE_SHUTDOWN:
- case CLOCK_EVT_MODE_UNUSED:
- if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
- evt->mode == CLOCK_EVT_MODE_ONESHOT) {
- outb_pit(0x30, PIT_MODE);
- outb_pit(0, PIT_CH0);
- outb_pit(0, PIT_CH0);
- }
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- /* One shot setup */
- outb_pit(0x38, PIT_MODE);
- break;
-
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
- raw_spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
- raw_spin_lock(&i8253_lock);
- outb_pit(delta & 0xff , PIT_CH0); /* LSB */
- outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- raw_spin_unlock(&i8253_lock);
-
- return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
+ * Modern chipsets can disable the PIT clock which makes it unusable. It
+ * would be possible to enable the clock but the registers are chipset
+ * specific and not discoverable. Avoid the whack-a-mole game.
*
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
- .name = "pit",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = init_pit_timer,
- .set_next_event = pit_next_event,
- .shift = 32,
- .irq = 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
+ * These platforms have discoverable TSC/CPU frequencies but this also
+ * requires knowing the local APIC timer frequency, as it is normally
+ * calibrated against the PIT interrupt.
*/
-void __init setup_pit_timer(void)
+static bool __init use_pit(void)
{
- /*
- * Start pit with the boot cpu mask and make it global after the
- * IO_APIC has been initialized.
- */
- pit_ce.cpumask = cpumask_of(smp_processor_id());
- pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
- pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
- pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
+ if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC))
+ return true;
- clockevents_register_device(&pit_ce);
- global_clock_event = &pit_ce;
+ /* This also returns true when APIC is disabled */
+ return apic_needs_pit();
}
-#ifndef CONFIG_X86_64
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(struct clocksource *cs)
+bool __init pit_timer_init(void)
{
- static int old_count;
- static u32 old_jifs;
- unsigned long flags;
- int count;
- u32 jifs;
-
- raw_spin_lock_irqsave(&i8253_lock, flags);
- /*
- * Although our caller may have the read side of xtime_lock,
- * this is now a seqlock, and we are cheating in this routine
- * by having side effects on state that we cannot undo if
- * there is a collision on the seqlock and our caller has to
- * retry. (Namely, old_jifs and old_count.) So we must treat
- * jiffies as volatile despite the lock. We read jiffies
- * before latching the timer count to guarantee that although
- * the jiffies value might be older than the count (that is,
- * the counter may underflow between the last point where
- * jiffies was incremented and the point where we latch the
- * count), it cannot be newer.
- */
- jifs = jiffies;
- outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
- count = inb_pit(PIT_CH0); /* read the latched count */
- count |= inb_pit(PIT_CH0) << 8;
-
- /* VIA686a test code... reset the latch if count > max + 1 */
- if (count > LATCH) {
- outb_pit(0x34, PIT_MODE);
- outb_pit(LATCH & 0xff, PIT_CH0);
- outb_pit(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
+ if (!use_pit()) {
+ /*
+ * Don't just ignore the PIT. Ensure it's stopped, because
+ * VMMs otherwise steal CPU time just to pointlessly waggle
+ * the (masked) IRQ.
+ */
+ scoped_guard(irq)
+ clockevent_i8253_disable();
+ return false;
}
-
- /*
- * It's possible for count to appear to go the wrong way for a
- * couple of reasons:
- *
- * 1. The timer counter underflows, but we haven't handled the
- * resulting interrupt and incremented jiffies yet.
- * 2. Hardware problem with the timer, not giving us continuous time,
- * the counter does small "jumps" upwards on some Pentium systems,
- * (see c't 95/10 page 335 for Neptun bug.)
- *
- * Previous attempts to handle these cases intelligently were
- * buggy, so we just do the simple thing now.
- */
- if (count > old_count && jifs == old_jifs)
- count = old_count;
-
- old_count = count;
- old_jifs = jifs;
-
- raw_spin_unlock_irqrestore(&i8253_lock, flags);
-
- count = (LATCH - 1) - count;
-
- return (cycle_t)(jifs * LATCH) + count;
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
+ return true;
}
-static struct clocksource pit_cs = {
- .name = "pit",
- .rating = 110,
- .read = pit_read,
- .mask = CLOCKSOURCE_MASK(32),
- .mult = 0,
- .shift = 20,
-};
-
+#ifndef CONFIG_X86_64
static int __init init_pit_clocksource(void)
{
/*
@@ -202,13 +66,10 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+ !clockevent_state_periodic(&i8253_clockevent))
return 0;
- pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
- return clocksource_register(&pit_cs);
+ return clocksource_i8253_init();
}
arch_initcall(init_pit_clocksource);
-
#endif /* !CONFIG_X86_64 */
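For reference, the periodic setup that the deleted init_pit_timer() performed,
and which the shared i8253 clockevent driver now carries, boils down to three
port writes. A user-space sketch with an assumed tick rate, illustration only
(actually poking the PIT would fight the running kernel for the hardware):

#include <stdio.h>
#include <sys/io.h>

#define PIT_MODE 0x43
#define PIT_CH0  0x40
#define PIT_FREQ 1193182u	/* PIT input clock in Hz */
#define HZ_DEMO  250u		/* assumed tick rate for the example */
#define LATCH    ((PIT_FREQ + HZ_DEMO / 2) / HZ_DEMO)

/* Mirrors the removed CLOCK_EVT_MODE_PERIODIC branch. */
static void pit_program_periodic(void)
{
	outb(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, channel 0 */
	outb(LATCH & 0xff, PIT_CH0);	/* LSB */
	outb(LATCH >> 8, PIT_CH0);	/* MSB */
}

int main(void)
{
	printf("mode 2 reload value for %u Hz: %u\n", HZ_DEMO, (unsigned)LATCH);
	(void)pit_program_periodic;	/* do not run against live hardware */
	return 0;
}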
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..f67063df6723 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,27 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
+#include <linux/pgtable.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
+#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>
+#include <asm/io_apic.h>
/*
* This is the 'legacy' 8259A Programmable Interrupt Controller,
@@ -29,24 +31,11 @@
* plus some generic x86 specific things if generic specifics makes
* any sense at all.
*/
+static void init_8259A(int auto_eoi);
+static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-static void mask_8259A(void);
-static void unmask_8259A(void);
-static void disable_8259A_irq(unsigned int irq);
-static void enable_8259A_irq(unsigned int irq);
-static void init_8259A(int auto_eoi);
-static int i8259A_irq_pending(unsigned int irq);
-
-struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
/*
* 8259A PIC functions to handle ISA devices:
@@ -68,7 +57,7 @@ unsigned int cached_irq_mask = 0xffff;
*/
unsigned long io_apic_irqs;
-static void disable_8259A_irq(unsigned int irq)
+static void mask_8259A_irq(unsigned int irq)
{
unsigned int mask = 1 << irq;
unsigned long flags;
@@ -82,7 +71,12 @@ static void disable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
-static void enable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(struct irq_data *data)
+{
+ mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
{
unsigned int mask = ~(1 << irq);
unsigned long flags;
@@ -96,6 +90,11 @@ static void enable_8259A_irq(unsigned int irq)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+static void enable_8259A_irq(struct irq_data *data)
+{
+ unmask_8259A_irq(data->irq);
+}
+
static int i8259A_irq_pending(unsigned int irq)
{
unsigned int mask = 1<<irq;
@@ -116,9 +115,10 @@ static void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
- set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- "XT");
+ irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
+ lapic_assign_legacy_vector(irq, true);
}
/*
@@ -150,8 +150,9 @@ static inline int i8259A_irq_real(unsigned int irq)
* first, _then_ send the EOI, and the order of EOI
* to the two 8259s is important!
*/
-static void mask_and_ack_8259A(unsigned int irq)
+static void mask_and_ack_8259A(struct irq_data *data)
{
+ unsigned int irq = data->irq;
unsigned int irqmask = 1 << irq;
unsigned long flags;
@@ -209,7 +210,7 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
+ printk_deferred(KERN_DEBUG
"spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
@@ -223,57 +224,59 @@ spurious_8259A_irq:
}
}
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
+/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */
static void restore_ELCR(char *trigger)
{
- outb(trigger[0], 0x4d0);
- outb(trigger[1], 0x4d1);
+ outb(trigger[0], PIC_ELCR1);
+ outb(trigger[1], PIC_ELCR2);
}
static void save_ELCR(char *trigger)
{
/* IRQ 0,1,2,8,13 are marked as reserved */
- trigger[0] = inb(0x4d0) & 0xF8;
- trigger[1] = inb(0x4d1) & 0xDE;
+ trigger[0] = inb(PIC_ELCR1) & 0xF8;
+ trigger[1] = inb(PIC_ELCR2) & 0xDE;
}
-static int i8259A_resume(struct sys_device *dev)
+static void i8259A_resume(void)
{
init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
- return 0;
}
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+static int i8259A_suspend(void)
{
save_ELCR(irq_trigger);
return 0;
}
-static int i8259A_shutdown(struct sys_device *dev)
+static void i8259A_shutdown(void)
{
/* Put the i8259A into a quiescent state that
* the kernel initialization code can get it
* out of.
*/
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
- return 0;
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
}
-static struct sysdev_class i8259_sysdev_class = {
- .name = "i8259",
+static struct syscore_ops i8259_syscore_ops = {
.suspend = i8259A_suspend,
.resume = i8259A_resume,
.shutdown = i8259A_shutdown,
};
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
static void mask_8259A(void)
@@ -300,6 +303,49 @@ static void unmask_8259A(void)
raw_spin_unlock_irqrestore(&i8259A_lock, flags);
}
+static int probe_8259A(void)
+{
+ unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned long flags;
+
+ /*
+ * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+ * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+ * fails.
+ *
+	 * Right now this causes problems as quite a bit of code depends on
+	 * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+	 * when the system has an IO/APIC because then the PIC is not required
+ * at all, except for really old machines where the timer interrupt
+ * must be routed through the PIC. So just pretend that the PIC is
+ * there and let legacy_pic->init() initialize it for nothing.
+ *
+ * Alternatively this could just try to initialize the PIC and
+ * repeat the probe, but for cases where there is no PIC that's
+ * just pointless.
+ */
+ if (pcat_compat)
+ return nr_legacy_irqs();
+
+ /*
+ * Check to see if we have a PIC. Mask all except the cascade and
+ * read back the value we just wrote. If we don't have a PIC, we
+ * will read 0xff as opposed to the value we wrote.
+ */
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ }
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return nr_legacy_irqs();
+}
+
static void init_8259A(int auto_eoi)
{
unsigned long flags;
@@ -309,16 +355,14 @@ static void init_8259A(int auto_eoi)
raw_spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
/*
* outb_pic - this has to work on a wide range of PC hardware.
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
- to 0x20-0x27 on i386 */
- outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
/* 8259A-1 (the master) has a slave on IR2 */
outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
@@ -330,8 +374,8 @@ static void init_8259A(int auto_eoi)
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
- outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
/* 8259A-2 is a slave on master's IR2 */
outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
/* (slave's support for AEOI in flat mode is to be investigated) */
@@ -342,9 +386,9 @@ static void init_8259A(int auto_eoi)
* In AEOI mode we just have to mask the interrupt
* when acking.
*/
- i8259A_chip.mask_ack = disable_8259A_irq;
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
udelay(100); /* wait for 8259A to initialize */
@@ -363,52 +407,54 @@ static void init_8259A(int auto_eoi)
static void legacy_pic_noop(void) { };
static void legacy_pic_uint_noop(unsigned int unused) { };
static void legacy_pic_int_noop(int unused) { };
-
-static struct irq_chip dummy_pic_chip = {
- .name = "dummy pic",
- .mask = legacy_pic_uint_noop,
- .unmask = legacy_pic_uint_noop,
- .disable = legacy_pic_uint_noop,
- .mask_ack = legacy_pic_uint_noop,
-};
static int legacy_pic_irq_pending_noop(unsigned int irq)
{
return 0;
}
+static int legacy_pic_probe(void)
+{
+ return 0;
+}
struct legacy_pic null_legacy_pic = {
.nr_legacy_irqs = 0,
- .chip = &dummy_pic_chip,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
.mask_all = legacy_pic_noop,
.restore_mask = legacy_pic_noop,
.init = legacy_pic_int_noop,
+ .probe = legacy_pic_probe,
.irq_pending = legacy_pic_irq_pending_noop,
.make_irq = legacy_pic_uint_noop,
};
-struct legacy_pic default_legacy_pic = {
+static struct legacy_pic default_legacy_pic = {
.nr_legacy_irqs = NR_IRQS_LEGACY,
.chip = &i8259A_chip,
- .mask_all = mask_8259A,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
.restore_mask = unmask_8259A,
.init = init_8259A,
+ .probe = probe_8259A,
.irq_pending = i8259A_irq_pending,
.make_irq = make_8259A_irq,
};
struct legacy_pic *legacy_pic = &default_legacy_pic;
+EXPORT_SYMBOL(legacy_pic);
-static int __init i8259A_init_sysfs(void)
+static int __init i8259A_init_ops(void)
{
- int error;
-
- if (legacy_pic != &default_legacy_pic)
- return 0;
+ if (legacy_pic == &default_legacy_pic)
+		register_syscore_ops(&i8259_syscore_ops);
- error = sysdev_class_register(&i8259_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8259A);
- return error;
+ return 0;
}
+device_initcall(i8259A_init_ops);
-device_initcall(i8259A_init_sysfs);
+void __init legacy_pic_pcat_compat(void)
+{
+ pcat_compat = true;
+}
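The write/read-back idea in probe_8259A() can also be demonstrated from user
space. A hedged sketch using the standard master PIC ports; this is intrusive,
so treat it as illustration only:

#include <stdio.h>
#include <sys/io.h>

#define PIC_MASTER_IMR 0x21
#define PIC_CASCADE_IR 2

int main(void)
{
	unsigned char probe_val = (unsigned char)~(1 << PIC_CASCADE_IR);
	unsigned char saved, new_val;

	if (ioperm(0x20, 2, 1))
		return 1;
	saved = inb(PIC_MASTER_IMR);
	outb(probe_val, PIC_MASTER_IMR);	/* mask all except the cascade */
	new_val = inb(PIC_MASTER_IMR);		/* 0xff if no PIC responds */
	outb(saved, PIC_MASTER_IMR);		/* restore the original mask */
	printf("PIC %s\n", new_val == probe_val ? "present" : "absent");
	return 0;
}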
diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S
new file mode 100644
index 000000000000..c43c4ed28a9c
--- /dev/null
+++ b/arch/x86/kernel/ibt_selftest.S
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+#include <asm/nospec-branch.h>
+
+SYM_CODE_START(ibt_selftest_noendbr)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ /* #CP handler sets %ax to 0 */
+ RET
+SYM_CODE_END(ibt_selftest_noendbr)
+
+SYM_FUNC_START(ibt_selftest)
+ lea ibt_selftest_noendbr(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rax
+SYM_FUNC_END(ibt_selftest)
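The selftest takes an indirect jump to a label that deliberately lacks ENDBR;
with IBT enforced, the #CP fixup zeroes %ax before execution resumes at
ibt_selftest_noendbr. A kernel-context sketch of how a caller might consume
that result (the prototype here is an assumption; the real declaration lives
in asm/ibt.h):

#include <linux/init.h>
#include <linux/printk.h>

extern unsigned long ibt_selftest(void);	/* assumed prototype */

static void __init ibt_check(void)
{
	/* Zero means #CP intercepted the jump; nonzero means it slipped through. */
	if (ibt_selftest())
		pr_err("IBT selftest: Failed!\n");
}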
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000000..f445bec516a0
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interrupt descriptor table related code
+ */
+#include <linux/interrupt.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+#include <asm/ia32.h>
+#include <asm/idtentry.h>
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+#ifdef CONFIG_X86_64
+/*
+ * Interrupt gate with interrupt stack. The _ist index is the index in
+ * the tss.ist[] array, but for the descriptor it needs to start at 1.
+ */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+#else
+#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr)
+#endif
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc))
+
+static bool idt_setup_done __initdata;
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, asm_exc_debug),
+ SYSG(X86_TRAP_BP, asm_exc_int3),
+
+#ifdef CONFIG_X86_32
+ /*
+ * Not possible on 64-bit. See idt_setup_early_pf() for details.
+ */
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initconst struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, asm_exc_divide_error),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
+ INTG(X86_TRAP_BR, asm_exc_bounds),
+ INTG(X86_TRAP_UD, asm_exc_invalid_op),
+ INTG(X86_TRAP_NM, asm_exc_device_not_available),
+ INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun),
+ INTG(X86_TRAP_TS, asm_exc_invalid_tss),
+ INTG(X86_TRAP_NP, asm_exc_segment_not_present),
+ INTG(X86_TRAP_SS, asm_exc_stack_segment),
+ INTG(X86_TRAP_GP, asm_exc_general_protection),
+ INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
+ INTG(X86_TRAP_AC, asm_exc_alignment_check),
+ INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
+#endif
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
+
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+
+#ifdef CONFIG_X86_CET
+ INTG(X86_TRAP_CP, asm_exc_control_protection),
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
+#endif
+
+ SYSG(X86_TRAP_OF, asm_exc_overflow),
+};
+
+static const struct idt_data ia32_idt[] __initconst = {
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initconst struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
+ INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
+ INTG(REBOOT_VECTOR, asm_sysvec_reboot),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
+# if IS_ENABLED(CONFIG_KVM)
+ INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work),
+# endif
+ INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
+ INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+ INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
+# endif
+#endif
+};
+
+/* Must be page-aligned because the real IDT is used in the cpu entry area */
+static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+static struct desc_ptr idt_descr __ro_after_init = {
+ .size = IDT_TABLE_SIZE - 1,
+ .address = (unsigned long) idt_table,
+};
+
+void load_current_idt(void)
+{
+ lockdep_assert_irqs_disabled();
+ load_idt(&idt_descr);
+}
+
+#ifdef CONFIG_X86_F00F_BUG
+bool idt_is_f00f_address(unsigned long address)
+{
+ return ((address - idt_descr.address) >> 3) == 6;
+}
+#endif
+
+static __init void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc;
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, system_vectors);
+ }
+}
+
+static __init void set_intr_gate(unsigned int n, const void *addr)
+{
+ struct idt_data data;
+
+ init_idt_data(&data, n, addr);
+
+ idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On x86_64 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true);
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+
+ if (ia32_enabled())
+ idt_setup_from_table(idt_table, ia32_idt, ARRAY_SIZE(ia32_idt), true);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, asm_exc_page_fault),
+};
+
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On x86_64 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * Note that x86_64 cannot install the real #PF handler in
+ * idt_setup_early_traps() because the memory initialization needs the #PF
+ * handler from the early_idt_handler_array to initialize the early page
+ * tables.
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true);
+}
+#endif
+
+static void __init idt_map_in_cea(void)
+{
+ /*
+ * Set the IDT descriptor to a fixed read-only location in the cpu
+ * entry area, so that the "sidt" instruction will not leak the
+ * location of the kernel, and to defend the IDT against arbitrary
+ * memory write vulnerabilities.
+ */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+}
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR;
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+ for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) {
+ entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
+ /*
+		 * Don't set the non-assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
+ entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR);
+ set_intr_gate(i, entry);
+ }
+#endif
+ /* Map IDT into CPU entry area and reload it. */
+ idt_map_in_cea();
+ load_idt(&idt_descr);
+
+ /* Make the IDT table read only */
+ set_memory_ro((unsigned long)&idt_table, 1);
+
+ idt_setup_done = true;
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++)
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ */
+void idt_invalidate(void)
+{
+ static const struct desc_ptr idt = { .address = 0, .size = 0 };
+
+ load_idt(&idt);
+}
+
+void __init idt_install_sysvec(unsigned int n, const void *function)
+{
+ if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
+ return;
+
+ if (WARN_ON(idt_setup_done))
+ return;
+
+ if (!WARN_ON(test_and_set_bit(n, system_vectors)))
+ set_intr_gate(n, function);
+}
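A hedged usage sketch of idt_install_sysvec(): a platform component claiming
its own system vector. The vector number and handler names below are
hypothetical; real users follow the DEFINE_IDTENTRY_SYSVEC / asm_sysvec_*
pattern visible in apic_idts[] above:

#include <linux/init.h>
#include <asm/idtentry.h>
#include <asm/apic.h>

#define MY_PLATFORM_VECTOR	0xf5	/* hypothetical, >= FIRST_SYSTEM_VECTOR */

/* Normally placed in asm/idtentry.h so the asm stub is generated too. */
DECLARE_IDTENTRY_SYSVEC(MY_PLATFORM_VECTOR, sysvec_my_platform_event);

DEFINE_IDTENTRY_SYSVEC(sysvec_my_platform_event)
{
	apic_eoi();			/* ack the APIC like the handlers above */
	/* platform-specific event handling would go here */
}

static void __init my_platform_setup(void)
{
	/* Must run before idt_setup_apic_and_irq_gates() marks setup done. */
	idt_install_sysvec(MY_PLATFORM_VECTOR, asm_sysvec_my_platform_event);
}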
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf44947..000000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data..cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
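The deleted comment alludes to the classic trick: because the stack and the
thread bookkeeping share one THREAD_SIZE-aligned block, the latter can be
recovered by masking any stack address. A small user-space illustration of
the idea (the size is an assumption for the demo):

#include <stdint.h>
#include <stdio.h>

#define THREAD_SIZE 8192u

union thread_union_demo {
	struct { int flags; } info;		/* stand-in for thread_info */
	unsigned char stack[THREAD_SIZE];
} __attribute__((aligned(THREAD_SIZE)));

static union thread_union_demo demo;

int main(void)
{
	uintptr_t sp = (uintptr_t)&demo.stack[THREAD_SIZE - 64];

	/* Masking the stack pointer recovers the base of the union. */
	printf("thread_info at %#lx\n",
	       (unsigned long)(sp & ~(uintptr_t)(THREAD_SIZE - 1)));
	return 0;
}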
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index a979b5bd2fc0..fdb6506ceaaa 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* I/O delay strategies for inb_p/outb_p
*
@@ -6,13 +7,28 @@
* outb_p/inb_p API uses.
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/dmi.h>
#include <linux/io.h>
-int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
+#define IO_DELAY_TYPE_0X80 0
+#define IO_DELAY_TYPE_0XED 1
+#define IO_DELAY_TYPE_UDELAY 2
+#define IO_DELAY_TYPE_NONE 3
+
+#if defined(CONFIG_IO_DELAY_0X80)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80
+#elif defined(CONFIG_IO_DELAY_0XED)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0XED
+#elif defined(CONFIG_IO_DELAY_UDELAY)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY
+#elif defined(CONFIG_IO_DELAY_NONE)
+#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE
+#endif
+
+int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE;
static int __initdata io_delay_override;
@@ -23,13 +39,13 @@ void native_io_delay(void)
{
switch (io_delay_type) {
default:
- case CONFIG_IO_DELAY_TYPE_0X80:
+ case IO_DELAY_TYPE_0X80:
asm volatile ("outb %al, $0x80");
break;
- case CONFIG_IO_DELAY_TYPE_0XED:
+ case IO_DELAY_TYPE_0XED:
asm volatile ("outb %al, $0xed");
break;
- case CONFIG_IO_DELAY_TYPE_UDELAY:
+ case IO_DELAY_TYPE_UDELAY:
/*
* 2 usecs is an upper-bound for the outb delay but
* note that udelay doesn't have the bus-level
@@ -38,7 +54,8 @@ void native_io_delay(void)
* are shorter until calibrated):
*/
udelay(2);
- case CONFIG_IO_DELAY_TYPE_NONE:
+ break;
+ case IO_DELAY_TYPE_NONE:
break;
}
}
@@ -46,9 +63,9 @@ EXPORT_SYMBOL(native_io_delay);
static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
{
- if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
+ if (io_delay_type == IO_DELAY_TYPE_0X80) {
pr_notice("%s: using 0xed I/O delay port\n", id->ident);
- io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ io_delay_type = IO_DELAY_TYPE_0XED;
}
return 0;
@@ -58,7 +75,7 @@ static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
* Quirk table for systems that misbehave (lock up, etc.) if port
* 0x80 is used:
*/
-static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
+static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = {
{
.callback = dmi_io_delay_0xed_port,
.ident = "Compaq Presario V6000",
@@ -114,13 +131,13 @@ static int __init io_delay_param(char *s)
return -EINVAL;
if (!strcmp(s, "0x80"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
+ io_delay_type = IO_DELAY_TYPE_0X80;
else if (!strcmp(s, "0xed"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ io_delay_type = IO_DELAY_TYPE_0XED;
else if (!strcmp(s, "udelay"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
+ io_delay_type = IO_DELAY_TYPE_UDELAY;
else if (!strcmp(s, "none"))
- io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
+ io_delay_type = IO_DELAY_TYPE_NONE;
else
return -EINVAL;
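The accepted strings come from the kernel command line (io_delay=0x80, 0xed,
udelay or none). A trivial user-space sketch of the same string-to-type
dispatch, for illustration:

#include <stdio.h>
#include <string.h>

enum { TYPE_0X80, TYPE_0XED, TYPE_UDELAY, TYPE_NONE, TYPE_INVALID };

static int parse_io_delay(const char *s)
{
	if (!strcmp(s, "0x80"))   return TYPE_0X80;
	if (!strcmp(s, "0xed"))   return TYPE_0XED;
	if (!strcmp(s, "udelay")) return TYPE_UDELAY;
	if (!strcmp(s, "none"))   return TYPE_NONE;
	return TYPE_INVALID;	/* mirrors the -EINVAL case above */
}

int main(void)
{
	printf("io_delay=0xed -> %d\n", parse_io_delay("0xed"));
	return 0;
}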
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..ff40f09ad911 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -1,47 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This contains the io-permission bitmap code - written by obz, with changes
* by Linus. 32/64 bits code unification by Miguel Botón.
*/
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/bitmap.h>
#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
+#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
+
+#include <asm/io_bitmap.h>
+#include <asm/desc.h>
#include <asm/syscalls.h>
-/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
-static void set_bitmap(unsigned long *bitmap, unsigned int base,
- unsigned int extent, int new_value)
+#ifdef CONFIG_X86_IOPL_IOPERM
+
+static atomic64_t io_bitmap_sequence;
+
+void io_bitmap_share(struct task_struct *tsk)
{
- unsigned int i;
+ /* Can be NULL when current->thread.iopl_emul == 3 */
+ if (current->thread.io_bitmap) {
+ /*
+ * Take a refcount on current's bitmap. It can be used by
+ * both tasks as long as none of them changes the bitmap.
+ */
+ refcount_inc(&current->thread.io_bitmap->refcnt);
+ tsk->thread.io_bitmap = current->thread.io_bitmap;
+ }
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+}
- for (i = base; i < base + extent; i++) {
- if (new_value)
- __set_bit(i, bitmap);
- else
- __clear_bit(i, bitmap);
+static void task_update_io_bitmap(void)
+{
+ struct task_struct *tsk = current;
+ struct thread_struct *t = &tsk->thread;
+
+ if (t->iopl_emul == 3 || t->io_bitmap) {
+ /* TSS update is handled on exit to user space */
+ set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+ } else {
+ clear_tsk_thread_flag(tsk, TIF_IO_BITMAP);
+ /* Invalidate TSS */
+ preempt_disable();
+ tss_update_io_bitmap();
+ preempt_enable();
}
}
+void io_bitmap_exit(struct task_struct *tsk)
+{
+ struct io_bitmap *iobm = tsk->thread.io_bitmap;
+
+ tsk->thread.io_bitmap = NULL;
+ /*
+ * Don't touch the TSS when invoked on a failed fork(). TSS
+ * reflects the state of @current and not the state of @tsk.
+ */
+ if (tsk == current)
+ task_update_io_bitmap();
+ if (iobm && refcount_dec_and_test(&iobm->refcnt))
+ kfree(iobm);
+}
+
/*
- * this changes the io permissions bitmap in the current task.
+ * This changes the io permissions bitmap in the current task.
*/
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
struct thread_struct *t = &current->thread;
- struct tss_struct *tss;
- unsigned int i, max_long, bytes, bytes_updated;
+ unsigned int i, max_long;
+ struct io_bitmap *iobm;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
- if (turn_on && !capable(CAP_SYS_RAWIO))
+ if (turn_on && (!capable(CAP_SYS_RAWIO) ||
+ security_locked_down(LOCKDOWN_IOPORT)))
return -EPERM;
/*
@@ -49,75 +85,136 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* IO bitmap up. ioperm() is much less timing critical than clone(),
* this is why we delay this operation until now:
*/
- if (!t->io_bitmap_ptr) {
- unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-
- if (!bitmap)
+ iobm = t->io_bitmap;
+ if (!iobm) {
+ /* No point to allocate a bitmap just to clear permissions */
+ if (!turn_on)
+ return 0;
+ iobm = kmalloc(sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
return -ENOMEM;
- memset(bitmap, 0xff, IO_BITMAP_BYTES);
- t->io_bitmap_ptr = bitmap;
- set_thread_flag(TIF_IO_BITMAP);
+ memset(iobm->bitmap, 0xff, sizeof(iobm->bitmap));
+ refcount_set(&iobm->refcnt, 1);
+ }
+
+ /*
+ * If the bitmap is not shared, then nothing can take a refcount as
+	 * current can obviously not fork at the same time. If it's shared,
+	 * duplicate it and drop the refcount on the original one.
+ */
+ if (refcount_read(&iobm->refcnt) > 1) {
+ iobm = kmemdup(iobm, sizeof(*iobm), GFP_KERNEL);
+ if (!iobm)
+ return -ENOMEM;
+ refcount_set(&iobm->refcnt, 1);
+ io_bitmap_exit(current);
}
/*
- * do it in the per-thread copy and in the TSS ...
- *
- * Disable preemption via get_cpu() - we must not switch away
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
+ * Store the bitmap pointer (might be the same if the task already
+	 * had one). Must be done here so freeing the bitmap when all
+ * permissions are dropped has the pointer set up.
*/
- tss = &per_cpu(init_tss, get_cpu());
+ t->io_bitmap = iobm;
+ /* Mark it active for context switching and exit to user mode */
+ set_thread_flag(TIF_IO_BITMAP);
- set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+ /*
+ * Update the tasks bitmap. The update of the TSS bitmap happens on
+ * exit to user mode. So this needs no protection.
+ */
+ if (turn_on)
+ bitmap_clear(iobm->bitmap, from, num);
+ else
+ bitmap_set(iobm->bitmap, from, num);
/*
* Search for a (possibly new) maximum. This is simple and stupid,
* to keep it obviously correct:
*/
- max_long = 0;
- for (i = 0; i < IO_BITMAP_LONGS; i++)
- if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = UINT_MAX;
+ for (i = 0; i < IO_BITMAP_LONGS; i++) {
+ if (iobm->bitmap[i] != ~0UL)
max_long = i;
+ }
+ /* All permissions dropped? */
+ if (max_long == UINT_MAX) {
+ io_bitmap_exit(current);
+ return 0;
+ }
- bytes = (max_long + 1) * sizeof(unsigned long);
- bytes_updated = max(bytes, t->io_bitmap_max);
-
- t->io_bitmap_max = bytes;
-
- /* Update the TSS: */
- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+ iobm->max = (max_long + 1) * sizeof(unsigned long);
- put_cpu();
+ /*
+ * Update the sequence number to force a TSS update on return to
+ * user mode.
+ */
+ iobm->sequence = atomic64_inc_return(&io_bitmap_sequence);
return 0;
}
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return ksys_ioperm(from, num, turn_on);
+}
+
/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ * The sys_iopl functionality depends on the level argument, which if
+ * granted for the task is used to enable access to all 65536 I/O ports.
+ *
+ * This does not use the IOPL mechanism provided by the CPU as that would
+ * also allow the user space task to use the CLI/STI instructions.
+ *
+ * Disabling interrupts in a user space task is dangerous as it might lock
+ * up the machine and the semantics vs. syscalls and exceptions is
+ * undefined.
+ *
+ * Setting IOPL to level 0-2 is disabling I/O permissions. Level 3
+ * enables them.
*
- * Here we just change the flags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
+ * IOPL is strictly per thread and inherited on fork.
*/
-long sys_iopl(unsigned int level, struct pt_regs *regs)
+SYSCALL_DEFINE1(iopl, unsigned int, level)
{
- unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
+ unsigned int old;
if (level > 3)
return -EINVAL;
+
+ old = t->iopl_emul;
+
+ /* No point in going further if nothing changes */
+ if (level == old)
+ return 0;
+
/* Trying to gain more privileges? */
if (level > old) {
- if (!capable(CAP_SYS_RAWIO))
+ if (!capable(CAP_SYS_RAWIO) ||
+ security_locked_down(LOCKDOWN_IOPORT))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
- t->iopl = level << 12;
- set_iopl_mask(t->iopl);
+ t->iopl_emul = level;
+ task_update_io_bitmap();
return 0;
}
+
+#else /* CONFIG_X86_IOPL_IOPERM */
+
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ return -ENOSYS;
+}
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return -ENOSYS;
+}
+
+SYSCALL_DEFINE1(iopl, unsigned int, level)
+{
+ return -ENOSYS;
+}
+#endif
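From user space these paths are reached through glibc's ioperm()/iopl()
wrappers. A minimal example exercising the bitmap path above: grant three
legacy parallel-port addresses, touch them, then drop the grant again (which,
per the max_long scan above, frees the bitmap when no bits remain). Needs
CAP_SYS_RAWIO and no ioport lockdown:

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	if (ioperm(0x378, 3, 1)) {	/* turn_on = 1: allow 0x378-0x37a */
		perror("ioperm");
		return 1;
	}
	outb(0x00, 0x378);			/* data register */
	printf("status: %#x\n", inb(0x379));	/* status register */
	ioperm(0x378, 3, 0);	/* drop: all permissions gone, bitmap freed */
	return 0;
}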
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18a..86f4e574de02 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -1,24 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Common interrupt code for 32 and 64 bit
*/
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/of.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/kvm_types.h>
+#include <asm/irq_stack.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
+#include <asm/desc.h>
+#include <asm/traps.h>
+#include <asm/thermal.h>
+#include <asm/posted_intr.h>
+#include <asm/irq_remapping.h>
+
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR)
+#define CREATE_TRACE_POINTS
+#include <asm/trace/irq_vectors.h>
+#endif
-atomic_t irq_err_count;
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
-/* Function pointer for generic interrupt vector handling */
-void (*x86_platform_ipi_callback)(void) = NULL;
+DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending);
+EXPORT_PER_CPU_SYMBOL(__softirq_pending);
+
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr);
+
+atomic_t irq_err_count;
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
@@ -38,139 +59,146 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- ack_APIC_irq();
+ apic_eoi();
}
#define irq_stats(x) (&per_cpu(irq_stat, x))
/*
- * /proc/interrupts printing:
+ * /proc/interrupts printing for arch specific interrupts
*/
-static int show_other_interrupts(struct seq_file *p, int prec)
+int arch_show_interrupts(struct seq_file *p, int prec)
{
int j;
seq_printf(p, "%*s: ", prec, "NMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
- seq_printf(p, " Non-maskable interrupts\n");
+ seq_puts(p, " Non-maskable interrupts\n");
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
- seq_printf(p, " Local timer interrupts\n");
+ seq_puts(p, " Local timer interrupts\n");
seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
- seq_printf(p, " Spurious interrupts\n");
+ seq_puts(p, " Spurious interrupts\n");
seq_printf(p, "%*s: ", prec, "PMI");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
- seq_printf(p, " Performance monitoring interrupts\n");
- seq_printf(p, "%*s: ", prec, "PND");
+ seq_puts(p, " Performance monitoring interrupts\n");
+ seq_printf(p, "%*s: ", prec, "IWI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
- seq_printf(p, " Performance pending work\n");
-#endif
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_puts(p, " IRQ work interrupts\n");
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_puts(p, " APIC ICR read retries\n");
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
- seq_printf(p, " Platform interrupts\n");
+ seq_puts(p, " Platform interrupts\n");
}
+#endif
#ifdef CONFIG_SMP
seq_printf(p, "%*s: ", prec, "RES");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
- seq_printf(p, " Rescheduling interrupts\n");
+ seq_puts(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
- seq_printf(p, " Function call interrupts\n");
+ seq_puts(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
- seq_printf(p, " TLB shootdowns\n");
+ seq_puts(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
seq_printf(p, "%*s: ", prec, "TRM");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
- seq_printf(p, " Thermal event interrupts\n");
+ seq_puts(p, " Thermal event interrupts\n");
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
seq_printf(p, "%*s: ", prec, "THR");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
- seq_printf(p, " Threshold APIC interrupts\n");
+ seq_puts(p, " Threshold APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
- seq_printf(p, " Machine check exceptions\n");
+ seq_puts(p, " Machine check exceptions\n");
seq_printf(p, "%*s: ", prec, "MCP");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
- seq_printf(p, " Machine check polls\n");
+ seq_puts(p, " Machine check polls\n");
+#endif
+#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
+ if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HYP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_callback_count);
+ seq_puts(p, " Hypervisor callback interrupts\n");
+ }
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HRE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_reenlightenment_count);
+ seq_puts(p, " Hyper-V reenlightenment interrupts\n");
+ }
+ if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HVS");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->hyperv_stimer0_count);
+ seq_puts(p, " Hyper-V stimer0 interrupts\n");
+ }
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
- return 0;
-}
-
-int show_interrupts(struct seq_file *p, void *v)
-{
- unsigned long flags, any_count = 0;
- int i = *(loff_t *) v, j, prec;
- struct irqaction *action;
- struct irq_desc *desc;
-
- if (i > nr_irqs)
- return 0;
-
- for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
- j *= 10;
-
- if (i == nr_irqs)
- return show_other_interrupts(p, prec);
-
- /* print header */
- if (i == 0) {
- seq_printf(p, "%*s", prec + 8, "");
- for_each_online_cpu(j)
- seq_printf(p, "CPU%-8d", j);
- seq_putc(p, '\n');
- }
-
- desc = irq_to_desc(i);
- if (!desc)
- return 0;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
+#if IS_ENABLED(CONFIG_KVM)
+ seq_printf(p, "%*s: ", prec, "PIN");
for_each_online_cpu(j)
- any_count |= kstat_irqs_cpu(i, j);
- action = desc->action;
- if (!action && !any_count)
- goto out;
+ seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+ seq_puts(p, " Posted-interrupt notification event\n");
- seq_printf(p, "%*d: ", prec, i);
+ seq_printf(p, "%*s: ", prec, "NPI");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
- seq_printf(p, " %8s", desc->chip->name);
- seq_printf(p, "-%-8s", desc->name);
-
- if (action) {
- seq_printf(p, " %s", action->name);
- while ((action = action->next) != NULL)
- seq_printf(p, ", %s", action->name);
- }
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_nested_ipis);
+ seq_puts(p, " Nested posted-interrupt event\n");
- seq_putc(p, '\n');
-out:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ seq_printf(p, "%*s: ", prec, "PIW");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+ seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ seq_printf(p, "%*s: ", prec, "PMN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->posted_msi_notification_count);
+ seq_puts(p, " Posted MSI notification event\n");
+#endif
return 0;
}
@@ -185,14 +213,14 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
- sum += irq_stats(cpu)->apic_pending_irqs;
-#endif
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
- sum += irq_stats(cpu)->irq_tlb_count;
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
@@ -200,6 +228,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_MCE_THRESHOLD
sum += irq_stats(cpu)->irq_threshold_count;
#endif
+#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
+ sum += irq_stats(cpu)->irq_hv_callback_count;
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ sum += irq_stats(cpu)->irq_hv_reenlightenment_count;
+ sum += irq_stats(cpu)->hyperv_stimer0_count;
+#endif
#ifdef CONFIG_X86_MCE
sum += per_cpu(mce_exception_count, cpu);
sum += per_cpu(mce_poll_count, cpu);
@@ -210,132 +245,252 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
u64 arch_irq_stat(void)
{
u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
- sum += atomic_read(&irq_mis_count);
-#endif
return sum;
}
-
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+static __always_inline void handle_irq(struct irq_desc *desc,
+ struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
+ if (IS_ENABLED(CONFIG_X86_64))
+ generic_handle_irq_desc(desc);
+ else
+ __handle_irq(desc, regs);
+}
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
- unsigned irq;
+static struct irq_desc *reevaluate_vector(int vector)
+{
+ struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);
- exit_idle();
- irq_enter();
+ if (!IS_ERR_OR_NULL(desc))
+ return desc;
- irq = __get_cpu_var(vector_irq)[vector];
+ if (desc == VECTOR_UNUSED)
+ pr_emerg_ratelimited("No irq handler for %d.%u\n", smp_processor_id(), vector);
+ else
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ return NULL;
+}
- if (!handle_irq(irq, regs)) {
- ack_APIC_irq();
+static __always_inline bool call_irq_handler(int vector, struct pt_regs *regs)
+{
+ struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);
- if (printk_ratelimit())
- pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ if (likely(!IS_ERR_OR_NULL(desc))) {
+ handle_irq(desc, regs);
+ return true;
}
- irq_exit();
+ /*
+ * Reevaluate with vector_lock held to prevent a race against
+ * request_irq() setting up the vector:
+ *
+ * CPU0 CPU1
+ * interrupt is raised in APIC IRR
+ * but not handled
+ * free_irq()
+ * per_cpu(vector_irq, CPU1)[vector] = VECTOR_SHUTDOWN;
+ *
+ * request_irq() common_interrupt()
+ * d = this_cpu_read(vector_irq[vector]);
+ *
+ * per_cpu(vector_irq, CPU1)[vector] = desc;
+ *
+ * if (d == VECTOR_SHUTDOWN)
+ * this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ *
+ * This requires that the same vector on the same target CPU is
+ * handed out or that a spurious interrupt hits that CPU/vector.
+ */
+ lock_vector_lock();
+ desc = reevaluate_vector(vector);
+ unlock_vector_lock();
- set_irq_regs(old_regs);
- return 1;
+ if (!desc)
+ return false;
+
+ handle_irq(desc, regs);
+ return true;
}
/*
- * Handler for X86_PLATFORM_IPI_VECTOR.
+ * common_interrupt() handles all normal device IRQ's (the special SMP
+ * cross-CPU interrupts have their own entry points).
*/
-void smp_x86_platform_ipi(struct pt_regs *regs)
+DEFINE_IDTENTRY_IRQ(common_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- ack_APIC_irq();
+ /* entry code tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
- exit_idle();
+ if (unlikely(!call_irq_handler(vector, regs)))
+ apic_eoi();
- irq_enter();
+ set_irq_regs(old_regs);
+}
- inc_irq_stat(x86_platform_ipis);
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
+/*
+ * Handler for X86_PLATFORM_IPI_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ apic_eoi();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
x86_platform_ipi_callback();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+ set_irq_regs(old_regs);
+}
+#endif
- irq_exit();
+#if IS_ENABLED(CONFIG_KVM)
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
- set_irq_regs(old_regs);
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+ if (handler)
+ kvm_posted_intr_wakeup_handler = handler;
+ else {
+ kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
}
+EXPORT_SYMBOL_FOR_KVM(kvm_set_posted_intr_wakeup_handler);
-EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
+{
+ apic_eoi();
+ inc_irq_stat(kvm_posted_intr_ipis);
+}
-#ifdef CONFIG_HOTPLUG_CPU
-/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
-void fixup_irqs(void)
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
{
- unsigned int irq, vector;
- static int warned;
- struct irq_desc *desc;
+ apic_eoi();
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+ kvm_posted_intr_wakeup_handler();
+}
- for_each_irq_desc(irq, desc) {
- int break_affinity = 0;
- int set_affinity = 1;
- const struct cpumask *affinity;
+/*
+ * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
+{
+ apic_eoi();
+ inc_irq_stat(kvm_posted_intr_nested_ipis);
+}
+#endif
- if (!desc)
- continue;
- if (irq == 2)
- continue;
+#ifdef CONFIG_X86_POSTED_MSI
- /* interrupt's are disabled at this point */
- raw_spin_lock(&desc->lock);
+/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
+DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
- affinity = desc->affinity;
- if (!irq_has_action(irq) ||
- cpumask_equal(affinity, cpu_online_mask)) {
- raw_spin_unlock(&desc->lock);
- continue;
- }
+void intel_posted_msi_init(void)
+{
+ u32 destination;
+ u32 apic_id;
- /*
- * Complete the irq move. This cpu is going down and for
- * non intr-remapping case, we can't wait till this interrupt
- * arrives at this cpu before completing the irq move.
- */
- irq_force_complete_move(irq);
+ this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
- if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
- break_affinity = 1;
- affinity = cpu_all_mask;
- }
+ /*
+	 * The APIC destination ID is stored in bits 8:15 while in xAPIC mode.
+	 * See VT-d spec. ch. 9.11.
+ */
+ apic_id = this_cpu_read(x86_cpu_to_apicid);
+ destination = x2apic_enabled() ? apic_id : apic_id << 8;
+ this_cpu_write(posted_msi_pi_desc.ndst, destination);
+}
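
The destination encoding above can be checked in isolation. A hedged sketch (plain C; encode_destination() is a hypothetical helper mirroring the xAPIC/x2APIC distinction, not a kernel function):

    #include <stdint.h>
    #include <stdio.h>

    /* In x2APIC mode the 32-bit APIC ID is used as-is; in xAPIC mode the
     * 8-bit ID sits in bits 8:15 of the destination field. */
    static uint32_t encode_destination(uint32_t apic_id, int x2apic)
    {
            return x2apic ? apic_id : apic_id << 8;
    }

    int main(void)
    {
            printf("xAPIC  id 3 -> 0x%03x\n", encode_destination(3, 0)); /* 0x300 */
            printf("x2APIC id 3 -> 0x%03x\n", encode_destination(3, 1)); /* 0x003 */
            return 0;
    }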
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
- desc->chip->mask(irq);
+static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
+{
+ unsigned long pir_copy[NR_PIR_WORDS];
+ int vec = FIRST_EXTERNAL_VECTOR;
- if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, affinity);
- else if (!(warned++))
- set_affinity = 0;
+ if (!pi_harvest_pir(pir, pir_copy))
+ return false;
- if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
- desc->chip->unmask(irq);
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
- raw_spin_unlock(&desc->lock);
+ return true;
+}
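
The snapshot-then-scan shape of handle_pending_pir() — copy the live bitmap once, then walk its set bits — can be sketched in user space as follows (plain C using the GCC/clang __builtin_ctzl builtin; scan_pending() and dispatch() are illustrative stand-ins):

    #include <stdio.h>
    #include <string.h>

    #define NR_WORDS      4
    #define BITS_PER_LONG ((int)(8 * sizeof(unsigned long)))

    static void dispatch(int vec)               /* call_irq_handler() stand-in */
    {
            printf("vector %d\n", vec);
    }

    static int scan_pending(const unsigned long *live)
    {
            unsigned long copy[NR_WORDS];
            int found = 0, i;

            memcpy(copy, live, sizeof(copy));   /* pi_harvest_pir() analogue:
                                                 * snapshot once, scan the copy */

            for (i = 0; i < NR_WORDS; i++) {
                    unsigned long w = copy[i];

                    while (w) {
                            int bit = __builtin_ctzl(w);    /* lowest set bit */

                            dispatch(i * BITS_PER_LONG + bit);
                            w &= w - 1;                     /* clear that bit */
                            found = 1;
                    }
            }
            return found;
    }

    int main(void)
    {
            unsigned long live[NR_WORDS] = { 0x30UL, 0, 0x1UL, 0 };

            return !scan_pending(live);
    }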
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ * on high-IRQ-rate workloads.
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
- if (break_affinity && set_affinity)
- printk("Broke affinity for irq %i\n", irq);
- else if (!set_affinity)
- printk("Cannot set affinity for irq %i\n", irq);
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct pi_desc *pid;
+ int i = 0;
+
+ pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ inc_irq_stat(posted_msi_notification_count);
+ irq_enter();
+
+ /*
+ * Max coalescing count includes the extra round of handle_pending_pir
+ * after clearing the outstanding notification bit. Hence, at most
+	 * MAX_POSTED_MSI_COALESCING_LOOP - 1 iterations run here.
+ */
+ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+ if (!handle_pending_pir(pid->pir, regs))
+ break;
}
/*
- * We can remove mdelay() and then send spuriuous interrupts to
+	 * Clear the outstanding notification bit to allow new IRQ notifications;
+	 * do this last to maximize the window of interrupt coalescing.
+ */
+ pi_clear_on(pid);
+
+ /*
+	 * There could be a race between the PI notification and the clearing of
+	 * the ON bit; process the PIR bits one last time so that handling of the
+	 * new interrupts is not delayed until the next IRQ.
+ */
+ handle_pending_pir(pid->pir, regs);
+
+ apic_eoi();
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+#endif /* X86_POSTED_MSI */
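
The control flow of the notification handler — a bounded number of drain passes, then clearing the notification bit, then one final drain to close the race — can be modelled like this (plain C; pending, notify_on and drain() are illustrative stand-ins for the PIR contents, the ON bit and handle_pending_pir()):

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_LOOPS 3

    static int  pending   = 5;     /* PIR contents stand-in */
    static bool notify_on = true;  /* ON bit stand-in */

    static bool drain(void)        /* handle_pending_pir() stand-in */
    {
            if (!pending)
                    return false;
            printf("drained %d events\n", pending);
            pending = 0;
            return true;
    }

    int main(void)
    {
            int i = 0;

            /* At most MAX_LOOPS - 1 passes here ... */
            while (++i < MAX_LOOPS) {
                    if (!drain())
                            break;
                    pending = (i == 1);     /* simulate one more burst */
            }

            notify_on = false;              /* pi_clear_on() analogue, done late
                                             * to widen the coalescing window */

            /* ... plus one final pass to catch events that slipped in just
             * before the ON bit was cleared. */
            drain();
            return notify_on;
    }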
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
+{
+ unsigned int vector;
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+
+ irq_migrate_all_off_this_cpu();
+
+ /*
+ * We can remove mdelay() and then send spurious interrupts to
* new cpu targets for all the irqs that were handled previously by
* this cpu. While it works, I have seen spurious interrupt messages
* (nothing wrong but still...).
@@ -345,22 +500,49 @@ void fixup_irqs(void)
*/
mdelay(1);
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irr;
-
- if (__get_cpu_var(vector_irq)[vector] < 0)
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- if (irr & (1 << (vector % 32))) {
- irq = __get_cpu_var(vector_irq)[vector];
+ if (is_vector_pending(vector)) {
+ desc = __this_cpu_read(vector_irq[vector]);
- desc = irq_to_desc(irq);
raw_spin_lock(&desc->lock);
- if (desc->chip->retrigger)
- desc->chip->retrigger(irq);
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_retrigger) {
+ chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
raw_spin_unlock(&desc->lock);
}
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+ if (x86_thermal_enabled())
+ intel_thermal_interrupt();
+ else
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ apic_eoi();
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d166..c7a5d2960d57 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*
@@ -8,32 +9,29 @@
* io_apic.c.)
*/
-#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/kernel_stat.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/percpu.h>
+#include <linux/mm.h>
#include <asm/apic.h>
+#include <asm/nospec-branch.h>
+#include <asm/softirq_stack.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
+int sysctl_panic_on_stackoverflow __read_mostly;
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
-static int check_stack_overflow(void)
+static bool check_stack_overflow(void)
{
- long sp;
-
- __asm__ __volatile__("andl %%esp,%0" :
- "=r" (sp) : "0" (THREAD_SIZE - 1));
+ unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1);
return sp < (sizeof(struct thread_info) + STACK_WARN);
}
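
The masking trick in check_stack_overflow() relies on the stack being THREAD_SIZE-aligned, so sp & (THREAD_SIZE - 1) is the offset of the stack pointer within its own stack. A standalone sketch (plain C; the THREAD_SIZE and STACK_WARN values are assumptions for illustration):

    #include <stdint.h>
    #include <stdio.h>

    #define THREAD_SIZE 8192        /* assumed power-of-two stack size */
    #define STACK_WARN  1024

    /* With a THREAD_SIZE-aligned stack, sp & (THREAD_SIZE - 1) is the offset
     * of sp inside its own stack, i.e. the bytes still free below it on a
     * downward-growing stack. */
    static int stack_low(uintptr_t sp)
    {
            return (sp & (THREAD_SIZE - 1)) < STACK_WARN;
    }

    int main(void)
    {
            printf("%d\n", stack_low(0x7000));      /* offset 0x1000: fine */
            printf("%d\n", stack_low(0x6100));      /* offset 0x100: low   */
            return 0;
    }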
@@ -42,47 +40,39 @@ static void print_stack_overflow(void)
{
printk(KERN_WARNING "low stack detected by irq handler\n");
dump_stack();
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
}
#else
-static inline int check_stack_overflow(void) { return 0; }
+static inline bool check_stack_overflow(void) { return false; }
static inline void print_stack_overflow(void) { }
#endif
-#ifdef CONFIG_4KSTACKS
-/*
- * per-CPU IRQ handling contexts (thread information and stack)
- */
-union irq_ctx {
- struct thread_info tinfo;
- u32 stack[THREAD_SIZE/sizeof(u32)];
-} __attribute__((aligned(PAGE_SIZE)));
-
-static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
-static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
-static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
+DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr);
static void call_on_stack(void *func, void *stack)
{
- asm volatile("xchgl %%ebx,%%esp \n"
- "call *%%edi \n"
- "movl %%ebx,%%esp \n"
- : "=b" (stack)
- : "0" (stack),
- "D"(func)
+ asm volatile("xchgl %[sp], %%esp\n"
+ CALL_NOSPEC
+ "movl %[sp], %%esp"
+ : [sp] "+b" (stack)
+ : [thunk_target] "D" (func)
: "memory", "cc", "edx", "ecx", "eax");
}
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+static inline void *current_stack(void)
+{
+ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc)
{
- union irq_ctx *curctx, *irqctx;
- u32 *isp, arg1, arg2;
+ struct irq_stack *curstk, *irqstk;
+ u32 *isp, *prev_esp;
- curctx = (union irq_ctx *) current_thread_info();
- irqctx = __get_cpu_var(hardirq_ctx);
+ curstk = (struct irq_stack *) current_stack();
+ irqstk = __this_cpu_read(hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -90,124 +80,78 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
- if (unlikely(curctx == irqctx))
- return 0;
+ if (unlikely(curstk == irqstk))
+ return false;
- /* build the stack frame on the IRQ stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- /*
- * Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context.
- */
- irqctx->tinfo.preempt_count =
- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+ /* Save the next esp at the bottom of the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
- asm volatile("xchgl %%ebx,%%esp \n"
- "call *%%edi \n"
- "movl %%ebx,%%esp \n"
- : "=a" (arg1), "=d" (arg2), "=b" (isp)
- : "0" (irq), "1" (desc), "2" (isp),
- "D" (desc->handle_irq)
- : "memory", "cc", "ecx");
- return 1;
+ asm volatile("xchgl %[sp], %%esp\n"
+ CALL_NOSPEC
+ "movl %[sp], %%esp"
+ : "+a" (desc), [sp] "+b" (isp)
+ : [thunk_target] "D" (desc->handle_irq)
+ : "memory", "cc", "edx", "ecx");
+ return true;
}
/*
- * allocate per-cpu stacks for hardirq and for softirq processing
+ * Allocate per-cpu stacks for hardirq and softirq processing
*/
-void __cpuinit irq_ctx_init(int cpu)
+int irq_init_percpu_irqstack(unsigned int cpu)
{
- union irq_ctx *irqctx;
-
- if (per_cpu(hardirq_ctx, cpu))
- return;
+ int node = cpu_to_node(cpu);
+ struct page *ph, *ps;
- irqctx = &per_cpu(hardirq_stack, cpu);
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
-
- per_cpu(hardirq_ctx, cpu) = irqctx;
-
- irqctx = &per_cpu(softirq_stack, cpu);
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+ if (per_cpu(hardirq_stack_ptr, cpu))
+ return 0;
- per_cpu(softirq_ctx, cpu) = irqctx;
+ ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ if (!ph)
+ return -ENOMEM;
+ ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ if (!ps) {
+ __free_pages(ph, THREAD_SIZE_ORDER);
+ return -ENOMEM;
+ }
- printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
+ per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
+ return 0;
}
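
The error handling in irq_init_percpu_irqstack() follows the usual all-or-nothing pattern for paired allocations: if the second allocation fails, the first is freed before returning. A user-space sketch of the same shape (plain C; alloc_pair() is an illustrative name):

    #include <errno.h>
    #include <stdlib.h>

    static int alloc_pair(void **hard, void **soft, size_t size)
    {
            *hard = malloc(size);
            if (!*hard)
                    return -ENOMEM;

            *soft = malloc(size);
            if (!*soft) {
                    free(*hard);            /* roll back the first allocation */
                    *hard = NULL;
                    return -ENOMEM;
            }
            return 0;
    }

    int main(void)
    {
            void *h, *s;

            if (alloc_pair(&h, &s, 4096))
                    return 1;
            free(h);
            free(s);
            return 0;
    }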
-void irq_ctx_exit(int cpu)
+#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK
+void do_softirq_own_stack(void)
{
- per_cpu(hardirq_ctx, cpu) = NULL;
-}
+ struct irq_stack *irqstk;
+ u32 *isp, *prev_esp;
-asmlinkage void do_softirq(void)
-{
- unsigned long flags;
- struct thread_info *curctx;
- union irq_ctx *irqctx;
- u32 *isp;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
-
- if (local_softirq_pending()) {
- curctx = current_thread_info();
- irqctx = __get_cpu_var(softirq_ctx);
- irqctx->tinfo.task = curctx->task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
-
- /* build the stack frame on the softirq stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
-
- call_on_stack(__do_softirq, isp);
- /*
- * Shouldnt happen, we returned above if in_interrupt():
- */
- WARN_ON_ONCE(softirq_count());
- }
+ irqstk = __this_cpu_read(softirq_stack_ptr);
- local_irq_restore(flags);
-}
+ /* build the stack frame on the softirq stack */
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
-#else
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
+ /* Push the previous esp onto the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
+
+ call_on_stack(__do_softirq, isp);
+}
#endif
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- struct irq_desc *desc;
- int overflow;
-
- overflow = check_stack_overflow();
+ bool overflow = check_stack_overflow();
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
- return false;
-
- if (!execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
if (unlikely(overflow))
print_stack_overflow();
- desc->handle_irq(irq, desc);
+ generic_handle_irq_desc(desc);
}
-
- return true;
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index acf8fbf8fbda..ca78dce39361 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
*
@@ -10,75 +11,67 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/seq_file.h>
-#include <linux/module.h>
#include <linux/delay.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/smp.h>
+#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/softirq_stack.h>
+#include <asm/irq_stack.h>
#include <asm/io_apic.h>
-#include <asm/idle.h>
#include <asm/apic.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
+DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse);
+DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
+#ifdef CONFIG_VMAP_STACK
/*
- * Probabilistic stack overflow check:
- *
- * Only check the stack in process context, because everything else
- * runs on the big interrupt stacks. Checking reliably is too expensive,
- * so we just check from interrupts.
+ * VMAP the backing store with guard pages
*/
-static inline void stack_overflow_check(struct pt_regs *regs)
+static int map_irq_stack(unsigned int cpu)
{
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
- u64 curbase = (u64)task_stack_page(current);
+ char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu);
+ struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE];
+ void *va;
+ int i;
- WARN_ONCE(regs->sp >= curbase &&
- regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + 128,
+ for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) {
+ phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT));
- "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
-#endif
-}
-
-bool handle_irq(unsigned irq, struct pt_regs *regs)
-{
- struct irq_desc *desc;
-
- stack_overflow_check(regs);
+ pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
+ }
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
- return false;
+ va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL);
+ if (!va)
+ return -ENOMEM;
- generic_handle_irq_desc(irq, desc);
- return true;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ return 0;
}
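
The point of the vmap() variant is that the remapped stack is surrounded by guard pages, so an overflow faults instead of silently corrupting the next per-cpu area. A user-space analogue using POSIX mmap()/mprotect() (illustrative only; the kernel uses vmap() as shown above):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGESIZE);
            size_t usable = 4 * psz;
            char *base;

            /* Reserve the usable region plus one PROT_NONE page per side. */
            base = mmap(NULL, usable + 2 * psz, PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (base == MAP_FAILED)
                    return 1;

            /* Open up only the middle; first and last page stay as guards. */
            if (mprotect(base + psz, usable, PROT_READ | PROT_WRITE))
                    return 1;

            base[psz] = 1;          /* fine: inside the usable region */
            printf("guarded region at %p\n", (void *)(base + psz));
            /* base[0] = 1; would fault - that is the guard page working */
            return 0;
    }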
-
-
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
+#else
+/*
+ * If VMAP stacks are disabled due to KASAN, just use the per cpu
+ * backing store without guard pages.
+ */
+static int map_irq_stack(unsigned int cpu)
{
- __u32 pending;
- unsigned long flags;
+ void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
- if (in_interrupt())
- return;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ return 0;
+}
+#endif
- local_irq_save(flags);
- pending = local_softirq_pending();
- /* Switch to interrupt stack */
- if (pending) {
- call_softirq();
- WARN_ON_ONCE(softirq_count());
- }
- local_irq_restore(flags);
+int irq_init_percpu_irqstack(unsigned int cpu)
+{
+ if (per_cpu(hardirq_stack_ptr, cpu))
+ return 0;
+ return map_irq_stack(cpu);
}
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..b0a24deab4a1
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+#include <asm/idtentry.h>
+#include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
+{
+ apic_eoi();
+ trace_irq_work_entry(IRQ_WORK_VECTOR);
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ trace_irq_work_exit(IRQ_WORK_VECTOR);
+}
+
+void arch_irq_work_raise(void)
+{
+ if (!arch_irq_work_has_interrupt())
+ return;
+
+ __apic_send_IPI_self(IRQ_WORK_VECTOR);
+ apic_wait_icr_idle();
+}
+#endif
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
new file mode 100644
index 000000000000..fdabd5dda154
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <linux/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+.pushsection .noinstr.text, "ax"
+SYM_FUNC_START(native_save_fl)
+ ENDBR
+ pushf
+ pop %_ASM_AX
+ RET
+SYM_FUNC_END(native_save_fl)
+.popsection
+EXPORT_SYMBOL(native_save_fl)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 990ae7cfc578..6ab9eac64670 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,30 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
+#include <linux/pgtable.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
+#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
#include <asm/desc.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
#include <asm/apic.h>
#include <asm/setup.h>
#include <asm/i8259.h>
#include <asm/traps.h>
+#include <asm/fred.h>
+#include <asm/prom.h>
/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -42,83 +47,28 @@
* (these are usually mapped into the 0x30-0xff vector range)
*/
-#ifdef CONFIG_X86_32
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
- outb(0, 0xF0);
- if (ignore_fpu_irq || !boot_cpu_data.hard_math)
- return IRQ_NONE;
- math_error(get_irq_regs(), 0, 16);
- return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
- .handler = math_error_irq,
- .name = "fpu",
-};
-#endif
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
- .handler = no_action,
- .name = "cascade",
-};
-
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = -1,
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
- return 1;
- }
-
- return 0;
-}
-
void __init init_ISA_irqs(void)
{
+ struct irq_chip *chip = legacy_pic->chip;
int i;
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
- init_bsp_APIC();
-#endif
- legacy_pic->init(0);
-
/*
- * 16 old-style INTA-cycle interrupts:
+ * Try to set up the through-local-APIC virtual wire mode earlier.
+ *
+	 * On some 32-bit UP machines whose APIC has been disabled by the BIOS
+	 * and then re-enabled by "lapic", boot hangs without this.
*/
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
- struct irq_desc *desc = irq_to_desc(i);
+ init_bsp_APIC();
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
+ legacy_pic->init(0);
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
}
}
@@ -127,142 +77,38 @@ void __init init_IRQ(void)
int i;
/*
- * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+ * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
* If these IRQ's are handled by legacy interrupt-controllers like PIC,
* then this configuration will likely be static after the boot. If
- * these IRQ's are handled by more mordern controllers like IO-APIC,
+ * these IRQs are handled by more modern controllers like IO-APIC,
* then this vector space can be freed and re-used dynamically as the
* irq's migrate etc.
*/
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
- per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
-
- x86_init.irqs.intr_init();
-}
-
-/*
- * Setup the vector to irq mappings.
- */
-void setup_vector_irq(int cpu)
-{
-#ifndef CONFIG_X86_IO_APIC
- int irq;
-
- /*
- * On most of the platforms, legacy PIC delivers the interrupts on the
- * boot cpu. But there are certain platforms where PIC interrupts are
- * delivered to multiple cpu's. If the legacy IRQ is handled by the
- * legacy PIC, for the new cpu that is coming online, setup the static
- * legacy vector to irq mapping:
- */
- for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
- per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
-#endif
-
- __setup_vector_irq(cpu);
-}
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-
- /* IPI used for rebooting/stopping */
- alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
-#endif
-#endif /* CONFIG_SMP */
-}
+ for (i = 0; i < nr_legacy_irqs(); i++)
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#ifdef CONFIG_X86_MCE_THRESHOLD
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
- alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI for X86 platform specific use */
- alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
+ BUG_ON(irq_init_percpu_irqstack(smp_processor_id()));
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
- alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
-# endif
-
-#endif
+ x86_init.irqs.intr_init();
}
void __init native_init_IRQ(void)
{
- int i;
-
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();
- apic_intr_init();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- if (!test_bit(i, used_vectors))
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
- }
+ /* FRED's IRQ path may be used even if FRED isn't fully enabled. */
+ if (IS_ENABLED(CONFIG_X86_FRED))
+ fred_complete_exception_setup();
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_apic_and_irq_gates();
-#ifdef CONFIG_X86_32
- /*
- * External FPU? Set up irq13 if so, for
- * original braindamaged IBM FERR coupling.
- */
- if (boot_cpu_data.hard_math && !cpu_has_fpu)
- setup_irq(FPU_IRQ, &fpu_irq);
+ lapic_assign_system_vectors();
- irq_ctx_init(smp_processor_id());
-#endif
+ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) {
+ /* IRQ2 is cascade interrupt to second interrupt controller */
+ if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL))
+ pr_err("%s: request_irq() failed\n", "cascade");
+ }
}
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..243a769fdd97
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores that can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/debugfs.h>
+#include <linux/mutex.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to CPUs capable
+ * of higher turbo frequency on systems supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled
+ */
+bool __read_mostly sysctl_sched_itmt_enabled;
+
+static ssize_t sched_itmt_enabled_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ ssize_t result;
+ bool orig;
+
+ guard(mutex)(&itmt_update_mutex);
+
+ orig = sysctl_sched_itmt_enabled;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sysctl_sched_itmt_enabled != orig) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ return result;
+}
+
+static int sched_core_priority_show(struct seq_file *s, void *unused)
+{
+ int cpu;
+
+ seq_puts(s, "CPU #\tPriority\n");
+ for_each_possible_cpu(cpu)
+ seq_printf(s, "%d\t%d\n", cpu, arch_asym_cpu_priority(cpu));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(sched_core_priority);
+
+static const struct file_operations dfs_sched_itmt_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_itmt_enabled_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static struct dentry *dfs_sched_itmt;
+static struct dentry *dfs_sched_core_prio;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to the scheduler that the
+ * platform is capable of supporting the ITMT feature.
+ *
+ * In the current scheme, the pstate driver detects whether the system
+ * is ITMT capable and calls sched_set_itmt_support().
+ *
+ * This must be done only after sched_set_itmt_core_prio()
+ * has been called to set the CPUs' priorities.
+ * It must not be called with the CPU hotplug lock
+ * held, as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+ guard(mutex)(&itmt_update_mutex);
+
+ if (sched_itmt_capable)
+ return 0;
+
+ dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled",
+ 0644,
+ arch_debugfs_dir,
+ &sysctl_sched_itmt_enabled,
+ &dfs_sched_itmt_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_itmt)) {
+ dfs_sched_itmt = NULL;
+ return -ENOMEM;
+ }
+
+ dfs_sched_core_prio = debugfs_create_file("sched_core_priority", 0644,
+ arch_debugfs_dir, NULL,
+ &sched_core_priority_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_core_prio)) {
+ dfs_sched_core_prio = NULL;
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ x86_topology_update = true;
+ rebuild_sched_domains();
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with the CPU hotplug lock
+ * held, as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+ guard(mutex)(&itmt_update_mutex);
+
+ if (!sched_itmt_capable)
+ return;
+
+ sched_itmt_capable = false;
+
+ debugfs_remove(dfs_sched_itmt);
+ dfs_sched_itmt = NULL;
+ debugfs_remove(dfs_sched_core_prio);
+ dfs_sched_core_prio = NULL;
+
+ if (sysctl_sched_itmt_enabled) {
+ /* disable sched_itmt if we are no longer ITMT capable */
+ sysctl_sched_itmt_enabled = 0;
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio: Priority of @cpu
+ * @cpu: The CPU number
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPUs with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int cpu)
+{
+ per_cpu(sched_core_priority, cpu) = prio;
+}
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
new file mode 100644
index 000000000000..9e9a591a5fec
--- /dev/null
+++ b/arch/x86/kernel/jailhouse.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Jailhouse paravirt_ops implementation
+ *
+ * Copyright (c) Siemens AG, 2015-2017
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ */
+
+#include <linux/acpi_pmtmr.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/serial_8250.h>
+#include <linux/acpi.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
+#include <asm/cpu.h>
+#include <asm/hypervisor.h>
+#include <asm/i8259.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/jailhouse_para.h>
+
+static struct jailhouse_setup_data setup_data;
+#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1))
+#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2))
+
+static unsigned int precalibrated_tsc_khz;
+
+static void jailhouse_setup_irq(unsigned int irq)
+{
+ struct mpc_intsrc mp_irq = {
+ .type = MP_INTSRC,
+ .irqtype = mp_INT,
+ .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
+ .srcbusirq = irq,
+ .dstirq = irq,
+ };
+ mp_save_irq(&mp_irq);
+}
+
+static uint32_t jailhouse_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0 ||
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ return cpuid_base_hypervisor("Jailhouse\0\0\0", 0);
+}
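
Hypervisor detection via CPUID, as used by jailhouse_cpuid_base(), can be reproduced in user space. A hedged sketch for x86 with GCC/clang's <cpuid.h> (the leaf numbers are architectural; the program itself is only an illustration):

    #include <cpuid.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx, sig[3];
            char name[13] = { 0 };

            /* First check CPUID.1:ECX[31], the "running under a hypervisor"
             * bit - the same check the detection routine above performs via
             * X86_FEATURE_HYPERVISOR. */
            __cpuid(1, eax, ebx, ecx, edx);
            if (!(ecx & (1u << 31))) {
                    puts("no hypervisor");
                    return 0;
            }

            /* The vendor signature lives at leaf 0x40000000 in ebx/ecx/edx. */
            __cpuid(0x40000000, eax, sig[0], sig[1], sig[2]);
            memcpy(name, sig, sizeof(sig));
            printf("hypervisor signature: \"%s\"\n", name);
            return 0;
    }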
+
+static uint32_t __init jailhouse_detect(void)
+{
+ return jailhouse_cpuid_base();
+}
+
+static void jailhouse_get_wallclock(struct timespec64 *now)
+{
+ memset(now, 0, sizeof(*now));
+}
+
+static void __init jailhouse_timer_init(void)
+{
+ lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ);
+}
+
+static unsigned long jailhouse_get_tsc(void)
+{
+ return precalibrated_tsc_khz;
+}
+
+static void __init jailhouse_x2apic_init(void)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (!x2apic_enabled())
+ return;
+ /*
+ * We do not have access to IR inside Jailhouse non-root cells. So
+ * we have to run in physical mode.
+ */
+ x2apic_phys = 1;
+ /*
+ * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs
+ * ensure that only this APIC driver picks up the call.
+ */
+ default_acpi_madt_oem_check("", "");
+#endif
+}
+
+static void __init jailhouse_parse_smp_config(void)
+{
+ struct ioapic_domain_cfg ioapic_cfg = {
+ .type = IOAPIC_DOMAIN_STRICT,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+ unsigned int cpu;
+
+ jailhouse_x2apic_init();
+
+ register_lapic_address(0xfee00000);
+
+ for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++)
+ topology_register_apic(setup_data.v1.cpu_ids[cpu], CPU_ACPIID_INVALID, true);
+
+ smp_found_config = 1;
+
+ if (setup_data.v1.standard_ioapic) {
+ mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
+
+ if (IS_ENABLED(CONFIG_SERIAL_8250) &&
+ setup_data.hdr.version < 2) {
+ /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
+ jailhouse_setup_irq(3);
+ jailhouse_setup_irq(4);
+ }
+ }
+}
+
+static void jailhouse_no_restart(void)
+{
+ pr_notice("Jailhouse: Restart not supported, halting\n");
+ machine_halt();
+}
+
+static int __init jailhouse_pci_arch_init(void)
+{
+ pci_direct_init(1);
+
+ /*
+ * There are no bridges on the virtual PCI root bus under Jailhouse,
+	 * so the only way to discover all devices is a full scan.
+ * Respect any overrides via the command line, though.
+ */
+ if (pcibios_last_bus < 0)
+ pcibios_last_bus = 0xff;
+
+#ifdef CONFIG_PCI_MMCONFIG
+ if (setup_data.v1.pci_mmconfig_base) {
+ pci_mmconfig_add(0, 0, pcibios_last_bus,
+ setup_data.v1.pci_mmconfig_base);
+ pci_mmcfg_arch_init();
+ }
+#endif
+
+ return 0;
+}
+
+#ifdef CONFIG_SERIAL_8250
+static inline bool jailhouse_uart_enabled(unsigned int uart_nr)
+{
+ return setup_data.v2.flags & BIT(uart_nr);
+}
+
+static void jailhouse_serial_fixup(int port, struct uart_port *up,
+ u32 *capabilities)
+{
+ static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8};
+ unsigned int n;
+
+ for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) {
+ if (pcuart_base[n] != up->iobase)
+ continue;
+
+ if (jailhouse_uart_enabled(n)) {
+ pr_info("Enabling UART%u (port 0x%lx)\n", n,
+ up->iobase);
+ jailhouse_setup_irq(up->irq);
+ } else {
+ /* Deactivate UART if access isn't allowed */
+ up->iobase = 0;
+ }
+ break;
+ }
+}
+
+static void __init jailhouse_serial_workaround(void)
+{
+ /*
+ * There are flags inside setup_data that indicate availability of
+ * platform UARTs since setup data version 2.
+ *
+	 * In case of version 1, we don't know which UARTs belong to Linux. In
+	 * this case, unconditionally register a 1:1 mapping for legacy UART IRQs
+ * 3 and 4.
+ */
+ if (setup_data.hdr.version > 1)
+ serial8250_set_isa_configurator(jailhouse_serial_fixup);
+}
+#else /* !CONFIG_SERIAL_8250 */
+static inline void jailhouse_serial_workaround(void)
+{
+}
+#endif /* CONFIG_SERIAL_8250 */
+
+static void __init jailhouse_init_platform(void)
+{
+ u64 pa_data = boot_params.hdr.setup_data;
+ unsigned long setup_data_len;
+ struct setup_data header;
+ void *mapping;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = jailhouse_timer_init;
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
+ x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config;
+ x86_init.pci.arch_init = jailhouse_pci_arch_init;
+
+ x86_platform.calibrate_cpu = jailhouse_get_tsc;
+ x86_platform.calibrate_tsc = jailhouse_get_tsc;
+ x86_platform.get_wallclock = jailhouse_get_wallclock;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+
+ legacy_pic = &null_legacy_pic;
+
+ machine_ops.emergency_restart = jailhouse_no_restart;
+
+ while (pa_data) {
+ mapping = early_memremap(pa_data, sizeof(header));
+ memcpy(&header, mapping, sizeof(header));
+ early_memunmap(mapping, sizeof(header));
+
+ if (header.type == SETUP_JAILHOUSE)
+ break;
+
+ pa_data = header.next;
+ }
+
+ if (!pa_data)
+ panic("Jailhouse: No valid setup data found");
+
+ /* setup data must at least contain the header */
+ if (header.len < sizeof(setup_data.hdr))
+ goto unsupported;
+
+ pa_data += offsetof(struct setup_data, data);
+ setup_data_len = min_t(unsigned long, sizeof(setup_data),
+ (unsigned long)header.len);
+ mapping = early_memremap(pa_data, setup_data_len);
+ memcpy(&setup_data, mapping, setup_data_len);
+ early_memunmap(mapping, setup_data_len);
+
+ if (setup_data.hdr.version == 0 ||
+ setup_data.hdr.compatible_version !=
+ JAILHOUSE_SETUP_REQUIRED_VERSION ||
+ (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) ||
+ (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN))
+ goto unsupported;
+
+ pmtmr_ioport = setup_data.v1.pm_timer_address;
+ pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
+
+ precalibrated_tsc_khz = setup_data.v1.tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ pci_probe = 0;
+
+ /*
+	 * Prevent the kernel from complaining about missing ACPI tables - there
+ * are none in a non-root cell.
+ */
+ disable_acpi();
+
+ jailhouse_serial_workaround();
+ return;
+
+unsupported:
+ panic("Jailhouse: Unsupported setup data structure");
+}
+
+bool jailhouse_paravirt(void)
+{
+ return jailhouse_cpuid_base() != 0;
+}
+
+static bool __init jailhouse_x2apic_available(void)
+{
+ /*
+ * The x2APIC is only available if the root cell enabled it. Jailhouse
+ * does not support switching between xAPIC and x2APIC.
+ */
+ return x2apic_enabled();
+}
+
+const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
+ .name = "Jailhouse",
+ .detect = jailhouse_detect,
+ .init.init_platform = jailhouse_init_platform,
+ .init.x2apic_available = jailhouse_x2apic_available,
+ .ignore_nopv = true,
+};
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..a7949a54a0ff
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+#include <asm/text-patching.h>
+#include <asm/insn.h>
+
+int arch_jump_entry_size(struct jump_entry *entry)
+{
+ struct insn insn = {};
+
+ insn_decode_kernel(&insn, (void *)jump_entry_code(entry));
+ BUG_ON(insn.length != 2 && insn.length != 5);
+
+ return insn.length;
+}
+
+struct jump_label_patch {
+ const void *code;
+ int size;
+};
+
+static struct jump_label_patch
+__jump_label_patch(struct jump_entry *entry, enum jump_label_type type)
+{
+ const void *expect, *code, *nop;
+ const void *addr, *dest;
+ int size;
+
+ addr = (void *)jump_entry_code(entry);
+ dest = (void *)jump_entry_target(entry);
+
+ size = arch_jump_entry_size(entry);
+ switch (size) {
+ case JMP8_INSN_SIZE:
+ code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ case JMP32_INSN_SIZE:
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ default: BUG();
+ }
+
+ if (type == JUMP_LABEL_JMP)
+ expect = nop;
+ else
+ expect = code;
+
+ if (memcmp(addr, expect, size)) {
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+		pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph) size:%d type:%d\n",
+ addr, addr, addr, expect, size, type);
+ BUG();
+ }
+
+ if (type == JUMP_LABEL_NOP)
+ code = nop;
+
+ return (struct jump_label_patch){.code = code, .size = size};
+}
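
The defensive core of __jump_label_patch() is the verify-before-patch step: compare the bytes at the patch site against the expected instruction and refuse to continue on a mismatch. A minimal sketch (plain C; the five NOP bytes are the conventional x86 5-byte NOP, the JMP rel32 offset is illustrative):

    #include <stdio.h>
    #include <string.h>

    static int patch_site(unsigned char *site, const unsigned char *expect,
                          const unsigned char *repl, size_t len)
    {
            if (memcmp(site, expect, len)) {
                    fprintf(stderr, "unexpected bytes at patch site, aborting\n");
                    return -1;
            }
            memcpy(site, repl, len);
            return 0;
    }

    int main(void)
    {
            unsigned char text[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* NOP5 */
            unsigned char nop[5]  = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
            unsigned char jmp[5]  = { 0xe9, 0x10, 0x00, 0x00, 0x00 };

            return patch_site(text, nop, jmp, sizeof(text)) ? 1 : 0;
    }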
+
+static __always_inline void
+__jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
+{
+ const struct jump_label_patch jlp = __jump_label_patch(entry, type);
+
+ /*
+ * As long as only a single processor is running and the code is still
+	 * not marked as RO, text_poke_early() can be used; checking that
+	 * system_state is SYSTEM_BOOTING guarantees it. It will be set to
+	 * SYSTEM_SCHEDULING before other cores are awakened and before the
+ * code is write-protected.
+ *
+ * At the time the change is being done, just ignore whether we
+ * are doing nop -> jump or jump -> nop transition, and assume
+ * always nop being the 'currently valid' instruction
+ */
+ if (init || system_state == SYSTEM_BOOTING) {
+ text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
+ return;
+ }
+
+ smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+}
+
+static void __ref jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
+{
+ mutex_lock(&text_mutex);
+ __jump_label_transform(entry, type, init);
+ mutex_unlock(&text_mutex);
+}
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ jump_label_transform(entry, type, 0);
+}
+
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ struct jump_label_patch jlp;
+
+ if (system_state == SYSTEM_BOOTING) {
+ /*
+ * Fallback to the non-batching mode.
+ */
+ arch_jump_label_transform(entry, type);
+ return true;
+ }
+
+ mutex_lock(&text_mutex);
+ jlp = __jump_label_patch(entry, type);
+ smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
+ mutex_unlock(&text_mutex);
+ return true;
+}
+
+void arch_jump_label_transform_apply(void)
+{
+ mutex_lock(&text_mutex);
+ smp_text_poke_batch_finish();
+ mutex_unlock(&text_mutex);
+}
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
deleted file mode 100644
index 0f7bc20cfcde..000000000000
--- a/arch/x86/kernel/k8.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Shared support code for AMD K8 northbridges and derivates.
- * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
- */
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
-
-static u32 *flush_words;
-
-struct pci_device_id k8_nb_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
- {}
-};
-EXPORT_SYMBOL(k8_nb_ids);
-
-struct pci_dev **k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
-
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
-{
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (!dev)
- break;
- } while (!pci_match_id(&k8_nb_ids[0], dev));
- return dev;
-}
-
-int cache_k8_northbridges(void)
-{
- int i;
- struct pci_dev *dev;
-
- if (num_k8_northbridges)
- return 0;
-
- dev = NULL;
- while ((dev = next_k8_northbridge(dev)) != NULL)
- num_k8_northbridges++;
-
- k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
- GFP_KERNEL);
- if (!k8_northbridges)
- return -ENOMEM;
-
- if (!num_k8_northbridges) {
- k8_northbridges[0] = NULL;
- return 0;
- }
-
- flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges);
- return -ENOMEM;
- }
-
- dev = NULL;
- i = 0;
- while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges[i] = dev;
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
- }
- k8_northbridges[i] = NULL;
- return 0;
-}
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
-
-/* Ignores subdevice/subvendor but as far as I can figure out
- they're useless anyways */
-int __init early_is_k8_nb(u32 device)
-{
- struct pci_device_id *id;
- u32 vendor = device & 0xffff;
- device >>= 16;
- for (id = k8_nb_ids; id->vendor; id++)
- if (vendor == id->vendor && device == id->device)
- return 1;
- return 0;
-}
-
-void k8_flush_garts(void)
-{
- int flushed, i;
- unsigned long flags;
- static DEFINE_SPINLOCK(gart_lock);
-
- /* Avoid races between AGP and IOMMU. In theory it's not needed
- but I'm not sure if the hardware won't lose flush requests
- when another is pending. This whole thing is so expensive anyways
- that it doesn't matter to serialize more. -AK */
- spin_lock_irqsave(&gart_lock, flags);
- flushed = 0;
- for (i = 0; i < num_k8_northbridges; i++) {
- pci_write_config_dword(k8_northbridges[i], 0x9c,
- flush_words[i]|1);
- flushed++;
- }
- for (i = 0; i < num_k8_northbridges; i++) {
- u32 w;
- /* Make sure the hardware actually executed the flush*/
- for (;;) {
- pci_read_config_dword(k8_northbridges[i],
- 0x9c, &w);
- if (!(w & 1))
- break;
- cpu_relax();
- }
- }
- spin_unlock_irqrestore(&gart_lock, flags);
- if (!flushed)
- printk("nothing to flush?\n");
-}
-EXPORT_SYMBOL_GPL(k8_flush_garts);
-
-static __init int init_k8_nbs(void)
-{
- int err = 0;
-
- err = cache_k8_northbridges();
-
- if (err < 0)
- printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
-
- return err;
-}
-
-/* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..e2e89bebcbc3 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -1,14 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Architecture specific debugfs files
*
* Copyright (C) 2007, Intel Corp.
* Huang Ying <ying.huang@intel.com>
- *
- * This file is released under the GPLv2.
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/stat.h>
@@ -33,7 +32,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
struct setup_data_node *node = file->private_data;
unsigned long remain;
loff_t pos = *ppos;
- struct page *pg;
void *p;
u64 pa;
@@ -46,19 +44,19 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
if (count > node->len - pos)
count = node->len - pos;
- pa = node->paddr + sizeof(struct setup_data) + pos;
- pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- p = ioremap_cache(pa, count);
- if (!p)
- return -ENXIO;
- } else
- p = __va(pa);
+ pa = node->paddr + pos;
+
+ /* Is it direct data or invalid indirect one? */
+ if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT)
+ pa += sizeof(struct setup_data);
+
+ p = memremap(pa, count, MEMREMAP_WB);
+ if (!p)
+ return -ENOMEM;
remain = copy_to_user(user_buf, p, count);
- if (PageHighMem(pg))
- iounmap(p);
+ memunmap(p);
if (remain)
return -EFAULT;
@@ -68,96 +66,94 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
return count;
}
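
The pos/count clamping in setup_data_read() is the standard bounded-read pattern: never copy past the end of the backing object and report the number of bytes actually copied. A standalone sketch (plain C; bounded_read() is an illustrative name):

    #include <stdio.h>
    #include <string.h>

    static long bounded_read(char *dst, size_t count, long pos,
                             const char *src, size_t src_len)
    {
            if (pos < 0)
                    return -1;
            if ((size_t)pos >= src_len)
                    return 0;                   /* EOF */
            if (count > src_len - pos)
                    count = src_len - pos;      /* clamp to what is left */
            memcpy(dst, src + pos, count);
            return (long)count;
    }

    int main(void)
    {
            const char data[] = "setup_data";
            char buf[32];
            long n = bounded_read(buf, sizeof(buf), 6, data, strlen(data));

            printf("read %ld bytes: %.*s\n", n, (int)n, buf);
            return 0;
    }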
-static int setup_data_open(struct inode *inode, struct file *file)
-{
- file->private_data = inode->i_private;
-
- return 0;
-}
-
static const struct file_operations fops_setup_data = {
.read = setup_data_read,
- .open = setup_data_open,
+ .open = simple_open,
+ .llseek = default_llseek,
};
-static int __init
+static void __init
create_setup_data_node(struct dentry *parent, int no,
struct setup_data_node *node)
{
- struct dentry *d, *type, *data;
+ struct dentry *d;
char buf[16];
sprintf(buf, "%d", no);
d = debugfs_create_dir(buf, parent);
- if (!d)
- return -ENOMEM;
-
- type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
- if (!type)
- goto err_dir;
- data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
- if (!data)
- goto err_type;
-
- return 0;
-
-err_type:
- debugfs_remove(type);
-err_dir:
- debugfs_remove(d);
- return -ENOMEM;
+ debugfs_create_x32("type", S_IRUGO, d, &node->type);
+ debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
}
static int __init create_setup_data_nodes(struct dentry *parent)
{
+ struct setup_indirect *indirect;
struct setup_data_node *node;
struct setup_data *data;
- int error = -ENOMEM;
+ u64 pa_data, pa_next;
struct dentry *d;
- struct page *pg;
- u64 pa_data;
+ int error;
+ u32 len;
int no = 0;
d = debugfs_create_dir("setup_data", parent);
- if (!d)
- return -ENOMEM;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
+ if (!node) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ kfree(node);
+ error = -ENOMEM;
goto err_dir;
+ }
+ pa_next = data->next;
- pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
- if (PageHighMem(pg)) {
- data = ioremap_cache(pa_data, sizeof(*data));
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(pa_data, len, MEMREMAP_WB);
if (!data) {
kfree(node);
- error = -ENXIO;
+ error = -ENOMEM;
goto err_dir;
}
- } else
- data = __va(pa_data);
-
- node->paddr = pa_data;
- node->type = data->type;
- node->len = data->len;
- error = create_setup_data_node(d, no, node);
- pa_data = data->next;
-
- if (PageHighMem(pg))
- iounmap(data);
- if (error)
- goto err_dir;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ node->paddr = indirect->addr;
+ node->type = indirect->type;
+ node->len = indirect->len;
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
+ } else {
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ }
+
+ create_setup_data_node(d, no, node);
+ pa_data = pa_next;
+
+ memunmap(data);
no++;
}
return 0;
err_dir:
- debugfs_remove(d);
+ debugfs_remove_recursive(d);
return error;
}
@@ -168,35 +164,18 @@ static struct debugfs_blob_wrapper boot_params_blob = {
static int __init boot_params_kdebugfs_init(void)
{
- struct dentry *dbp, *version, *data;
- int error = -ENOMEM;
-
- dbp = debugfs_create_dir("boot_params", NULL);
- if (!dbp)
- return -ENOMEM;
+ struct dentry *dbp;
+ int error;
- version = debugfs_create_x16("version", S_IRUGO, dbp,
- &boot_params.hdr.version);
- if (!version)
- goto err_dir;
+ dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
- data = debugfs_create_blob("data", S_IRUGO, dbp,
- &boot_params_blob);
- if (!data)
- goto err_version;
+ debugfs_create_x16("version", S_IRUGO, dbp, &boot_params.hdr.version);
+ debugfs_create_blob("data", S_IRUGO, dbp, &boot_params_blob);
error = create_setup_data_nodes(dbp);
if (error)
- goto err_data;
+ debugfs_remove_recursive(dbp);
- return 0;
-
-err_data:
- debugfs_remove(data);
-err_version:
- debugfs_remove(version);
-err_dir:
- debugfs_remove(dbp);
return error;
}
#endif /* CONFIG_DEBUG_BOOT_PARAMS */
@@ -206,8 +185,6 @@ static int __init arch_kdebugfs_init(void)
int error = 0;
arch_debugfs_dir = debugfs_create_dir("x86", NULL);
- if (!arch_debugfs_dir)
- return -ENOMEM;
#ifdef CONFIG_DEBUG_BOOT_PARAMS
error = boot_params_kdebugfs_init();
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000000..c3244ac680d1
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/libfdt.h>
+#include <linux/of_fdt.h>
+#include <linux/efi.h>
+#include <linux/random.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+#include <asm/e820/api.h>
+#include <asm/kexec-bzimage64.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */
+
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these, so I am retaining them. They can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a placeholder for all boot-loader-specific data structures, which
+ * get allocated in one call but freed much later during cleanup
+ * time. Right now there is only one field, but it can grow as needed.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
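
[Editorial aside, not part of the patch: setup_initrd() publishes a 64-bit initrd address through two 32-bit halves, the legacy ramdisk_image/ramdisk_size header fields plus the ext_* fields for the upper bits. A minimal sketch of the round trip, assuming nothing beyond standard C; the variable names are illustrative.]

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t addr = 0x123456789000ULL;       /* illustrative load address */
        uint32_t lo = addr & 0xffffffffUL;       /* -> hdr.ramdisk_image */
        uint32_t hi = addr >> 32;                /* -> ext_ramdisk_image */

        /* what a boot-protocol consumer reads back */
        uint64_t joined = ((uint64_t)hi << 32) | lo;
        assert(joined == addr);
        return 0;
    }
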
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len = 0;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr,
+ "elfcorehdr=0x%lx ", image->elf_load_addr);
+
+ if (image->dm_crypt_keys_addr != 0)
+ len += sprintf(cmdline_ptr + len,
+ "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr);
+ }
+ memcpy(cmdline_ptr + len, cmdline, cmdline_len);
+ cmdline_len += len;
+
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_table_kexec->nr_entries;
+
+ /* TODO: Pass entries beyond E820_MAX_ENTRIES_ZEROPAGE via bootparams setup_data */
+ if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE)
+ nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry));
+
+ return 0;
+}
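
[Editorial aside: the TODO above could be satisfied with a SETUP_E820_EXT setup_data node, which the boot protocol defines for exactly this overflow case. A hedged sketch only; the helper name is hypothetical, the node buffer sd is assumed to be reserved already, and this loader does not implement it.]

    static void fill_e820_ext(struct setup_data *sd)
    {
        unsigned int extra = e820_table_kexec->nr_entries -
                             E820_MAX_ENTRIES_ZEROPAGE;

        sd->type = SETUP_E820_EXT;
        sd->len  = extra * sizeof(struct e820_entry);
        memcpy(sd->data, &e820_table_kexec->entries[E820_MAX_ENTRIES_ZEROPAGE],
               sd->len);
        /* then chain sd into params->hdr.setup_data as the helpers below do */
    }
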
+
+enum { RNG_SEED_LENGTH = 32 };
+
+static void
+setup_rng_seed(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int rng_seed_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + rng_seed_setup_data_offset;
+ unsigned long setup_data_phys;
+
+ if (!rng_is_initialized())
+ return;
+
+ sd->type = SETUP_RNG_SEED;
+ sd->len = RNG_SEED_LENGTH;
+ get_random_bytes(sd->data, RNG_SEED_LENGTH);
+ setup_data_phys = params_load_addr + rng_seed_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
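
[Editorial aside: setup_rng_seed() and the helpers below all grow boot_params the same way, by prepending a node to the hdr.setup_data singly linked list of physical addresses. A minimal consumer-side sketch; identity-mapped pointers are assumed for brevity, whereas the real kernel walks this list with early ioremap in parse_setup_data().]

    static void walk_setup_data(struct boot_params *bp)
    {
        struct setup_data *sd = (void *)(unsigned long)bp->hdr.setup_data;

        while (sd) {
            pr_info("setup_data: type=%u len=%u\n", sd->type, sd->len);
            sd = (void *)(unsigned long)sd->next;   /* next physical node */
        }
    }
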
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi_fw_vendor;
+ esd->tables = efi_config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return 0;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ params->secure_boot = boot_params.secure_boot;
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+#ifdef CONFIG_OF_FLATTREE
+static void setup_dtb(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int dtb_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + dtb_setup_data_offset;
+ unsigned long setup_data_phys, dtb_len;
+
+ dtb_len = fdt_totalsize(initial_boot_params);
+ sd->type = SETUP_DTB;
+ sd->len = dtb_len;
+
+ /* Carry over current boot DTB with setup_data */
+ memcpy(sd->data, initial_boot_params, dtb_len);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + dtb_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
+#endif /* CONFIG_OF_FLATTREE */
+
+static void
+setup_ima_state(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int ima_setup_data_offset)
+{
+#ifdef CONFIG_IMA_KEXEC
+ struct setup_data *sd = (void *)params + ima_setup_data_offset;
+ unsigned long setup_data_phys;
+ struct ima_setup_data *ima;
+
+ if (!image->ima_buffer_size)
+ return;
+
+ sd->type = SETUP_IMA;
+ sd->len = sizeof(*ima);
+
+ ima = (void *)sd + sizeof(struct setup_data);
+ ima->addr = image->ima_buffer_addr;
+ ima->size = image->ima_buffer_size;
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + ima_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+#endif /* CONFIG_IMA_KEXEC */
+}
+
+static void setup_kho(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + setup_data_offset;
+ struct kho_data *kho = (void *)sd + sizeof(*sd);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return;
+
+ sd->type = SETUP_KEXEC_KHO;
+ sd->len = sizeof(struct kho_data);
+
+ /* Only add if we have all KHO images in place */
+ if (!image->kho.fdt || !image->kho.scratch)
+ return;
+
+ /* Add setup data */
+ kho->fdt_addr = image->kho.fdt;
+ kho->fdt_size = PAGE_SIZE;
+ kho->scratch_addr = image->kho.scratch->mem;
+ kho->scratch_size = image->kho.scratch->bufsz;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = params_load_addr + setup_data_offset;
+}
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Always fill in RSDP: it is either 0 or a valid value */
+ params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+#ifdef CONFIG_CRASH_DUMP
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+#endif
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ kexec_dprintk("E820 memmap:\n");
+ for (i = 0; i < nr_e820_entries; i++) {
+ kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + params->e820_table[i].size - 1,
+ params->e820_table[i].type);
+ if (params->e820_table[i].type != E820_TYPE_RAM)
+ continue;
+ start = params->e820_table[i].addr;
+ end = params->e820_table[i].addr + params->e820_table[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M */
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+#endif
+
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params) {
+ setup_dtb(params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+ } else {
+ pr_debug("Not carrying over DTB, force_dtb = %d\n",
+ image->force_dtb);
+ }
+#endif
+
+ if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
+ /* Setup IMA log buffer state */
+ setup_ima_state(image, params, params_load_addr,
+ setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct ima_setup_data);
+ }
+
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ /* Setup space to store preservation metadata */
+ setup_kho(image, params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+ }
+
+ /* Setup RNG seed */
+ setup_rng_seed(params, params_load_addr, setup_data_offset);
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
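
[Editorial worked example for the E820 loop above, with a single RAM range spanning 1 MiB..8 GiB, i.e. end = 0x1ffffffff:]

    mem_k     = (0x1ffffffff >> 10) - (0x100000 >> 10)
              = 0x7fffff - 0x400 = 8387583 KiB    (memory above 1 MiB)
    ext_mem_k = 0xfc00                            (legacy 16-bit field saturates
                                                   at 63 MiB above 1 MiB)
    alt_mem_k = 8387583                           (fits in 32 bits, no clamp)
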
+
+static int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be at least two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32-bit EFI, as it does not allow loading the kernel
+ * above 4G. This should be handled by a 32-bit bzImage loader.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) {
+ pr_err("bzImage cannot handle 5-level paging mode.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
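
[Editorial aside: the same header checks can be reproduced from userspace to sanity-check an image before handing it to kexec_file_load(). A hypothetical checker follows; the offsets come from the boot protocol (Documentation/arch/x86/boot.rst), and the function covers only a simplified subset of bzImage64_probe()'s tests.]

    #include <stdint.h>
    #include <string.h>

    static int looks_like_bzimage64(const uint8_t *buf, unsigned long len)
    {
        uint16_t version, xloadflags;

        if (len < 2 * 512)
            return 0;                        /* shorter than two sectors */
        if (memcmp(buf + 0x202, "HdrS", 4))
            return 0;                        /* no setup header magic */
        if (buf[0x1fe] != 0x55 || buf[0x1ff] != 0xaa)
            return 0;                        /* no boot sector signature */
        version = buf[0x206] | (buf[0x207] << 8);
        if (version < 0x020c)
            return 0;                        /* need protocol >= 2.12 */
        if (!(buf[0x211] & 0x01))
            return 0;                        /* LOADED_HIGH not set */
        xloadflags = buf[0x236] | (buf[0x237] << 8);
        /* need XLF_KERNEL_64 (bit 0) and XLF_CAN_BE_LOADED_ABOVE_4G (bit 1) */
        return (xloadflags & 0x01) && (xloadflags & 0x02);
    }
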
+
+static void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+ struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+ .top_down = true };
+ struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR,
+ .buf_max = ULONG_MAX, .top_down = true };
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of crash dump, we will append elfcorehdr=<addr> to
+ * command line. Make sure it does not overflow
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+#ifdef CONFIG_CRASH_DUMP
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = crash_load_dm_crypt_keys(image);
+ if (ret == -ENOENT) {
+ kexec_dprintk("No dm crypt key to load\n");
+ } else if (ret) {
+ pr_err("Failed to load dm crypt keys\n");
+ return ERR_PTR(ret);
+ }
+ if (image->dm_crypt_keys_addr &&
+ cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN >
+ header->cmdline_size) {
+ pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+#endif
+
+ /*
+ * Load purgatory. For a 64-bit entry point, the purgatory code can
+ * be anywhere.
+ */
+ ret = kexec_load_purgatory(image, &pbuf);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
+
+ /*
+ * Load bootparams, the cmdline, and space for the EFI data.
+ *
+ * Allocate memory for multiple data structures together so
+ * that they can all go in a single area/segment and we don't
+ * have to create a separate segment for each. Keeps things a
+ * little bit simpler.
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ if (image->dm_crypt_keys_addr)
+ params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data) +
+ sizeof(struct setup_data) +
+ RNG_SEED_LENGTH;
+
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params)
+ kbuf.bufsz += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+#endif
+
+ if (IS_ENABLED(CONFIG_IMA_KEXEC))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct ima_setup_data);
+
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+
+ params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
+
+ /*
+ * Copy setup header onto bootparams. The byte at offset 0x0201 holds
+ * the displacement of the jump at 0x0200, so the setup header ends at
+ * 0x0202 + that byte; see Documentation/arch/x86/boot.rst.
+ */
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ kbuf.buffer = params;
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = 16;
+ kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ bootparam_load_addr = kbuf.mem;
+ kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ /* Load kernel */
+ kbuf.buffer = kernel + kern16_size;
+ kbuf.bufsz = kernel_len - kern16_size;
+ kbuf.memsz = PAGE_ALIGN(header->init_size);
+ kbuf.buf_align = header->kernel_alignment;
+ if (header->pref_address < MIN_KERNEL_LOAD_ADDR)
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ else
+ kbuf.buf_min = header->pref_address;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ kernel_load_addr = kbuf.mem;
+
+ kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ kbuf.buffer = initrd;
+ kbuf.bufsz = kbuf.memsz = initrd_len;
+ kbuf.buf_align = PAGE_SIZE;
+ kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ initrd_load_addr = kbuf.mem;
+
+ kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store a pointer to params so that it can be freed once the params
+ * segment has been loaded and its contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kvfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+static int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kvfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+const struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = kexec_kernel_verify_pe_sig,
+#endif
+};
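
[Editorial note: kexec_file_load() discovers this loader through the per-architecture loader table rather than by a direct call. In current trees it is registered roughly like this, from arch/x86/kernel/machine_kexec_64.c; shown here for context only, not as part of this patch.]

    const struct kexec_file_ops * const kexec_file_loaders[] = {
            &kexec_bzImage64_ops,
            NULL
    };
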
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae7..8b1a9733d13e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -1,14 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
*/
/*
@@ -26,7 +17,7 @@
* Updated by: Tom Rini <trini@kernel.crashing.org>
* Updated by: Jason Wessel <jason.wessel@windriver.com>
* Modified for 386 by Jim Kingdon, Cygnus Support.
- * Origianl kgdb, compatibility with 2.1.xx kernel by
+ * Original kgdb, compatibility with 2.1.xx kernel by
* David Grothe <dave@gcom.com>
* Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
* X86_64 changes from Andi Kleen's patch merged by Jim Houston
@@ -39,65 +30,101 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/kgdb.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
+#include <asm/text-patching.h>
#include <asm/debugreg.h>
#include <asm/apicdef.h>
-#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/switch_to.h>
-/**
- * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
- * @gdb_regs: A pointer to hold the registers in the order GDB wants.
- * @regs: The &struct pt_regs of the current process.
- *
- * Convert the pt_regs in @regs into the format for registers that
- * GDB expects, stored in @gdb_regs.
- */
-void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, si) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
#endif
- gdb_regs[GDB_AX] = regs->ax;
- gdb_regs[GDB_BX] = regs->bx;
- gdb_regs[GDB_CX] = regs->cx;
- gdb_regs[GDB_DX] = regs->dx;
- gdb_regs[GDB_SI] = regs->si;
- gdb_regs[GDB_DI] = regs->di;
- gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PC] = regs->ip;
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (
#ifdef CONFIG_X86_32
- gdb_regs[GDB_PS] = regs->flags;
- gdb_regs[GDB_DS] = regs->ds;
- gdb_regs[GDB_ES] = regs->es;
- gdb_regs[GDB_CS] = regs->cs;
- gdb_regs[GDB_FS] = 0xFFFF;
- gdb_regs[GDB_GS] = 0xFFFF;
- if (user_mode_vm(regs)) {
- gdb_regs[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
- } else {
- gdb_regs[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+ regno == GDB_SP || regno == GDB_ORIG_AX)
+ return 0;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+ dbg_reg_def[regno].size);
+ return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (regno == GDB_ORIG_AX) {
+ memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+ return "orig_ax";
+ }
+ if (regno >= DBG_MAX_REG_NUM || regno < 0)
+ return NULL;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+ dbg_reg_def[regno].size);
+
+#ifdef CONFIG_X86_32
+ switch (regno) {
+ case GDB_GS:
+ case GDB_FS:
+ *(unsigned long *)mem = 0xFFFF;
+ break;
}
-#else
- gdb_regs[GDB_R8] = regs->r8;
- gdb_regs[GDB_R9] = regs->r9;
- gdb_regs[GDB_R10] = regs->r10;
- gdb_regs[GDB_R11] = regs->r11;
- gdb_regs[GDB_R12] = regs->r12;
- gdb_regs[GDB_R13] = regs->r13;
- gdb_regs[GDB_R14] = regs->r14;
- gdb_regs[GDB_R15] = regs->r15;
- gdb_regs32[GDB_PS] = regs->flags;
- gdb_regs32[GDB_CS] = regs->cs;
- gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
#endif
+ return dbg_reg_def[regno].name;
}
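
[Editorial aside: the dbg_reg_def[] table replaces the old hand-written converters, so the kgdb core can serialize all registers generically. A simplified sketch of such a consumer, modeled on kernel/debug/gdbstub.c; the function name and loop are illustrative, not the exact core code.]

    static void regs_to_gdb_buf(char *buf, struct pt_regs *regs)
    {
        int i;

        for (i = 0; i < DBG_MAX_REG_NUM; i++) {
            dbg_get_reg(i, buf, regs);   /* copies dbg_reg_def[i].size bytes */
            buf += dbg_reg_def[i].size;
        }
    }
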
/**
@@ -123,21 +150,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_DX] = 0;
gdb_regs[GDB_SI] = 0;
gdb_regs[GDB_DI] = 0;
- gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp;
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
#ifdef CONFIG_X86_32
gdb_regs[GDB_DS] = __KERNEL_DS;
gdb_regs[GDB_ES] = __KERNEL_DS;
gdb_regs[GDB_PS] = 0;
gdb_regs[GDB_CS] = __KERNEL_CS;
- gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_PS] = 0;
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -147,57 +172,17 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_R14] = 0;
gdb_regs[GDB_R15] = 0;
#endif
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_SP] = p->thread.sp;
}
-/**
- * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
- * @gdb_regs: A pointer to hold the registers we've received from GDB.
- * @regs: A pointer to a &struct pt_regs to hold these values in.
- *
- * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
- * in @regs.
- */
-void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
-{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
-#endif
- regs->ax = gdb_regs[GDB_AX];
- regs->bx = gdb_regs[GDB_BX];
- regs->cx = gdb_regs[GDB_CX];
- regs->dx = gdb_regs[GDB_DX];
- regs->si = gdb_regs[GDB_SI];
- regs->di = gdb_regs[GDB_DI];
- regs->bp = gdb_regs[GDB_BP];
- regs->ip = gdb_regs[GDB_PC];
-#ifdef CONFIG_X86_32
- regs->flags = gdb_regs[GDB_PS];
- regs->ds = gdb_regs[GDB_DS];
- regs->es = gdb_regs[GDB_ES];
- regs->cs = gdb_regs[GDB_CS];
-#else
- regs->r8 = gdb_regs[GDB_R8];
- regs->r9 = gdb_regs[GDB_R9];
- regs->r10 = gdb_regs[GDB_R10];
- regs->r11 = gdb_regs[GDB_R11];
- regs->r12 = gdb_regs[GDB_R12];
- regs->r13 = gdb_regs[GDB_R13];
- regs->r14 = gdb_regs[GDB_R14];
- regs->r15 = gdb_regs[GDB_R15];
- regs->flags = gdb_regs32[GDB_PS];
- regs->cs = gdb_regs32[GDB_CS];
- regs->ss = gdb_regs32[GDB_SS];
-#endif
-}
-
static struct hw_breakpoint {
unsigned enabled;
unsigned long addr;
int len;
int type;
- struct perf_event **pev;
-} breakinfo[4];
+ struct perf_event * __percpu *pev;
+} breakinfo[HBP_NUM];
static unsigned long early_dr7;
@@ -205,7 +190,7 @@ static void kgdb_correct_hw_break(void)
{
int breakno;
- for (breakno = 0; breakno < 4; breakno++) {
+ for (breakno = 0; breakno < HBP_NUM; breakno++) {
struct perf_event *bp;
struct arch_hw_breakpoint *info;
int val;
@@ -279,7 +264,7 @@ static int hw_break_release_slot(int breakno)
pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
if (dbg_release_bp_slot(*pevent))
/*
- * The debugger is responisble for handing the retry on
+ * The debugger is responsible for handling the retry on
* remove failure.
*/
return -1;
@@ -292,10 +277,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (breakinfo[i].addr == addr && breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
if (hw_break_release_slot(i)) {
@@ -313,18 +298,22 @@ static void kgdb_remove_all_hw_break(void)
int cpu = raw_smp_processor_id();
struct perf_event *bp;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
- if (bp->attr.disabled == 1)
+ if (!bp->attr.disabled) {
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
continue;
+ }
if (dbg_is_early)
early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
breakinfo[i].type);
- else
- arch_uninstall_hw_breakpoint(bp);
- bp->attr.disabled = 1;
+ else if (hw_break_release_slot(i))
+ printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
+ breakinfo[i].addr);
+ breakinfo[i].enabled = 0;
}
}
@@ -333,10 +322,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (!breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
switch (bptype) {
@@ -389,15 +378,15 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
* disable hardware debugging while it is processing gdb packets or
* handling exception.
*/
-void kgdb_disable_hw_debug(struct pt_regs *regs)
+static void kgdb_disable_hw_debug(struct pt_regs *regs)
{
int i;
int cpu = raw_smp_processor_id();
struct perf_event *bp;
/* Disable hardware debugging while we are in kgdb: */
- set_debugreg(0UL, 7);
- for (i = 0; i < 4; i++) {
+ set_debugreg(DR7_FIXED_1, 7);
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
if (dbg_is_early) {
@@ -416,34 +405,29 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
#ifdef CONFIG_SMP
/**
* kgdb_roundup_cpus - Get other CPUs into a holding pattern
- * @flags: Current IRQ state
*
* On SMP systems, we need to get the attention of the other CPUs
 * and get them into a known state. This should do what is needed
* to get the other CPUs to call kgdb_wait(). Note that on some arches,
* the NMI approach is not used for rounding up all the CPUs. For example,
- * in case of MIPS, smp_call_function() is used to roundup CPUs. In
- * this case, we have to make sure that interrupts are enabled before
- * calling smp_call_function(). The argument to this function is
- * the flags that will be used when restoring the interrupts. There is
- * local_irq_save() call before kgdb_roundup_cpus().
+ * in case of MIPS, smp_call_function() is used to roundup CPUs.
*
* On non-SMP systems, this is not called.
*/
-void kgdb_roundup_cpus(unsigned long flags)
+void kgdb_roundup_cpus(void)
{
- apic->send_IPI_allbutself(APIC_DM_NMI);
+ apic_send_IPI_allbutself(NMI_VECTOR);
}
#endif
/**
* kgdb_arch_handle_exception - Handle architecture specific GDB packets.
- * @vector: The error vector of the exception that happened.
+ * @e_vector: The error vector of the exception that happened.
* @signo: The signal number of the exception that happened.
* @err_code: The error code of the exception that happened.
- * @remcom_in_buffer: The buffer of the packet we have read.
- * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
- * @regs: The &struct pt_regs of the current process.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
*
* This function MUST handle the 'c' and 's' command packets,
 * as well as packets to set / remove a hardware breakpoint, if used.
@@ -458,7 +442,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
{
unsigned long addr;
char *ptr;
- int newPC;
switch (remcomInBuffer[0]) {
case 'c':
@@ -467,10 +450,9 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
ptr = &remcomInBuffer[1];
if (kgdb_hex2long(&ptr, &addr))
linux_regs->ip = addr;
+ fallthrough;
case 'D':
case 'k':
- newPC = linux_regs->ip;
-
/* clear the trace bit */
linux_regs->flags &= ~X86_EFLAGS_TF;
atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -482,8 +464,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
raw_smp_processor_id());
}
- kgdb_correct_hw_break();
-
return 0;
}
@@ -511,43 +491,44 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
return NOTIFY_STOP;
}
-static int was_in_debug_nmi[NR_CPUS];
+static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS);
-static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
- struct pt_regs *regs = args->regs;
+ int cpu;
switch (cmd) {
- case DIE_NMI:
+ case NMI_LOCAL:
if (atomic_read(&kgdb_active) != -1) {
/* KGDB CPU roundup */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- was_in_debug_nmi[raw_smp_processor_id()] = 1;
+ cpu = raw_smp_processor_id();
+ kgdb_nmicallback(cpu, regs);
+ set_bit(cpu, was_in_debug_nmi);
touch_nmi_watchdog();
- return NOTIFY_STOP;
+
+ return NMI_HANDLED;
}
- return NOTIFY_DONE;
+ break;
- case DIE_NMI_IPI:
- /* Just ignore, we will handle the roundup on DIE_NMI. */
- return NOTIFY_DONE;
+ case NMI_UNKNOWN:
+ cpu = raw_smp_processor_id();
- case DIE_NMIUNKNOWN:
- if (was_in_debug_nmi[raw_smp_processor_id()]) {
- was_in_debug_nmi[raw_smp_processor_id()] = 0;
- return NOTIFY_STOP;
- }
- return NOTIFY_DONE;
+ if (__test_and_clear_bit(cpu, was_in_debug_nmi))
+ return NMI_HANDLED;
- case DIE_NMIWATCHDOG:
- if (atomic_read(&kgdb_active) != -1) {
- /* KGDB CPU roundup: */
- kgdb_nmicallback(raw_smp_processor_id(), regs);
- return NOTIFY_STOP;
- }
- /* Enter debugger: */
break;
+ default:
+ /* do nothing */
+ break;
+ }
+ return NMI_DONE;
+}
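
[Editorial worked trace of the roundup logic above; CPU numbers are illustrative. The per-CPU bitmap exists because the roundup NMI may be re-delivered as an unknown NMI, and exactly one such event per CPU should be swallowed rather than reported.]

    CPU0: hits breakpoint             -> kgdb_active != -1
    CPU0: kgdb_roundup_cpus()         -> sends NMI_VECTOR to all but self
    CPU1: NMI_LOCAL                   -> kgdb_nmicallback(), set_bit(), NMI_HANDLED
    CPU1: stray NMI_UNKNOWN (if any)  -> __test_and_clear_bit() true, NMI_HANDLED
    CPU1: later unrelated NMI_UNKNOWN -> bit already clear, NMI_DONE
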
+
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+ struct pt_regs *regs = args->regs;
+ switch (cmd) {
case DIE_DEBUG:
if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
if (user_mode(regs))
@@ -558,7 +539,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
* a system call which should be ignored
*/
return NOTIFY_DONE;
- /* fall through */
+ fallthrough;
default:
if (user_mode(regs))
return NOTIFY_DONE;
@@ -605,28 +586,53 @@ kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
static struct notifier_block kgdb_notifier = {
.notifier_call = kgdb_notify,
-
- /*
- * Lowest-prio notifier priority, we want to be notified last:
- */
- .priority = -INT_MAX,
};
/**
- * kgdb_arch_init - Perform any architecture specific initalization.
+ * kgdb_arch_init - Perform any architecture specific initialization.
*
- * This function will handle the initalization of any architecture
+ * This function will handle the initialization of any architecture
* specific callbacks.
*/
int kgdb_arch_init(void)
{
- return register_die_notifier(&kgdb_notifier);
+ int retval;
+
+ retval = register_die_notifier(&kgdb_notifier);
+ if (retval)
+ goto out;
+
+ retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler,
+ 0, "kgdb");
+ if (retval)
+ goto out1;
+
+ retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler,
+ 0, "kgdb");
+
+ if (retval)
+ goto out2;
+
+ return retval;
+
+out2:
+ unregister_nmi_handler(NMI_LOCAL, "kgdb");
+out1:
+ unregister_die_notifier(&kgdb_notifier);
+out:
+ return retval;
}
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
struct perf_sample_data *data, struct pt_regs *regs)
{
- kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP);
+ struct task_struct *tsk = current;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (breakinfo[i].enabled)
+ tsk->thread.virtual_dr6 |= (DR_TRAP0 << i);
+ }
}
void kgdb_arch_late(void)
@@ -636,7 +642,7 @@ void kgdb_arch_late(void)
struct perf_event **pevent;
/*
- * Pre-allocate the hw breakpoint structions in the non-atomic
+ * Pre-allocate the hw breakpoint structures in the non-atomic
 * portion of kgdb because this operation requires mutexes to
* complete.
*/
@@ -645,11 +651,11 @@ void kgdb_arch_late(void)
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (breakinfo[i].pev)
continue;
- breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
- if (IS_ERR(breakinfo[i].pev)) {
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
+ if (IS_ERR_PCPU(breakinfo[i].pev)) {
printk(KERN_ERR "kgdb: Could not allocate hw"
"breakpoints\nDisabling the kernel debugger\n");
breakinfo[i].pev = NULL;
@@ -683,11 +689,12 @@ void kgdb_arch_exit(void)
breakinfo[i].pev = NULL;
}
}
+ unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
+ unregister_nmi_handler(NMI_LOCAL, "kgdb");
unregister_die_notifier(&kgdb_notifier);
}
/**
- *
* kgdb_skipexception - Bail out of KGDB when we've been triggered.
* @exception: Exception vector number
* @regs: Current &struct pt_regs.
@@ -720,12 +727,58 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
regs->ip = ip;
}
-struct kgdb_arch arch_kgdb_ops = {
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+
+ bpt->type = BP_BREAKPOINT;
+ err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = copy_to_kernel_nofault((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ if (!err)
+ return err;
+ /*
+ * It is safe to call text_poke_kgdb() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ return -EBUSY;
+ text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ bpt->type = BP_POKE_BREAKPOINT;
+
+ return 0;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ if (bpt->type != BP_POKE_BREAKPOINT)
+ goto knl_write;
+ /*
+ * It is safe to call text_poke_kgdb() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ goto knl_write;
+ text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
+ BREAK_INSTR_SIZE);
+ return 0;
+
+knl_write:
+ return copy_to_kernel_nofault((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+const struct kgdb_arch arch_kgdb_ops = {
/* Breakpoint instruction: */
.gdb_bpt_instr = { 0xcc },
.flags = KGDB_HW_BREAKPOINT,
.set_hw_breakpoint = kgdb_set_hw_break,
.remove_hw_breakpoint = kgdb_remove_hw_break,
+ .disable_hw_break = kgdb_disable_hw_debug,
.remove_all_hw_break = kgdb_remove_all_hw_break,
.correct_hw_break = kgdb_correct_hw_break,
};
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
deleted file mode 100644
index 675879b65ce6..000000000000
--- a/arch/x86/kernel/kprobes.c
+++ /dev/null
@@ -1,1450 +0,0 @@
-/*
- * Kernel Probes (KProbes)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- * Probes initial implementation ( includes contributions from
- * Rusty Russell).
- * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
- * interface to access function arguments.
- * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
- * <prasanna@in.ibm.com> adapted for x86_64 from i386.
- * 2005-Mar Roland McGrath <roland@redhat.com>
- * Fixed to handle %rip-relative addressing mode correctly.
- * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
- * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
- * <prasanna@in.ibm.com> added function-return probes.
- * 2005-May Rusty Lynch <rusty.lynch@intel.com>
- * Added function return probes functionality
- * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
- * kprobe-booster and kretprobe-booster for i386.
- * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
- * and kretprobe-booster for x86-64
- * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
- * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
- * unified x86 kprobes code.
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/hardirq.h>
-#include <linux/preempt.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/kallsyms.h>
-#include <linux/ftrace.h>
-
-#include <asm/cacheflush.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/alternative.h>
-#include <asm/insn.h>
-#include <asm/debugreg.h>
-
-void jprobe_return_end(void);
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
-
-#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
- (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
- (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
- (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
- (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
- << (row % 32))
- /*
- * Undefined/reserved opcodes, conditional jump, Opcode Extension
- * Groups, and some special opcodes can not boost.
- */
-static const u32 twobyte_is_boostable[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ---------------------------------------------- */
- W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
- W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
- W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
- W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
- W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
- W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
- W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
- W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
- W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
- W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
- W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
- /* ----------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-#undef W
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {
- {"__switch_to", }, /* This function switches only current task, but
- doesn't switch kernel stack.*/
- {NULL, NULL} /* Terminator */
-};
-const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-
-static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
-{
- struct __arch_relative_insn {
- u8 op;
- s32 raddr;
- } __attribute__((packed)) *insn;
-
- insn = (struct __arch_relative_insn *)from;
- insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
- insn->op = op;
-}
-
-/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-static void __kprobes synthesize_reljump(void *from, void *to)
-{
- __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
-}
-
-/*
- * Check for the REX prefix which can only exist on X86_64
- * X86_32 always returns 0
- */
-static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
-{
-#ifdef CONFIG_X86_64
- if ((*insn & 0xf0) == 0x40)
- return 1;
-#endif
- return 0;
-}
-
-/*
- * Returns non-zero if opcode is boostable.
- * RIP relative instructions are adjusted at copying time in 64 bits mode
- */
-static int __kprobes can_boost(kprobe_opcode_t *opcodes)
-{
- kprobe_opcode_t opcode;
- kprobe_opcode_t *orig_opcodes = opcodes;
-
- if (search_exception_tables((unsigned long)opcodes))
- return 0; /* Page fault may occur on this address. */
-
-retry:
- if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
- return 0;
- opcode = *(opcodes++);
-
- /* 2nd-byte opcode */
- if (opcode == 0x0f) {
- if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
- return 0;
- return test_bit(*opcodes,
- (unsigned long *)twobyte_is_boostable);
- }
-
- switch (opcode & 0xf0) {
-#ifdef CONFIG_X86_64
- case 0x40:
- goto retry; /* REX prefix is boostable */
-#endif
- case 0x60:
- if (0x63 < opcode && opcode < 0x67)
- goto retry; /* prefixes */
- /* can't boost Address-size override and bound */
- return (opcode != 0x62 && opcode != 0x67);
- case 0x70:
- return 0; /* can't boost conditional jump */
- case 0xc0:
- /* can't boost software-interruptions */
- return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
- case 0xd0:
- /* can boost AA* and XLAT */
- return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
- case 0xe0:
- /* can boost in/out and absolute jmps */
- return ((opcode & 0x04) || opcode == 0xea);
- case 0xf0:
- if ((opcode & 0x0c) == 0 && opcode != 0xf1)
- goto retry; /* lock/rep(ne) prefix */
- /* clear and set flags are boostable */
- return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
- default:
- /* segment override prefixes are boostable */
- if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
- goto retry; /* prefixes */
- /* CS override prefix and call are not boostable */
- return (opcode != 0x2e && opcode != 0x9a);
- }
-}
-
-/* Recover the probed instruction at addr for further analysis. */
-static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
-{
- struct kprobe *kp;
- kp = get_kprobe((void *)addr);
- if (!kp)
- return -EINVAL;
-
- /*
- * Basically, kp->ainsn.insn has an original instruction.
- * However, RIP-relative instruction can not do single-stepping
- * at different place, __copy_instruction() tweaks the displacement of
- * that instruction. In that case, we can't recover the instruction
- * from the kp->ainsn.insn.
- *
- * On the other hand, kp->opcode has a copy of the first byte of
- * the probed instruction, which is overwritten by int3. And
- * the instruction at kp->addr is not modified by kprobes except
- * for the first byte, we can recover the original instruction
- * from it and kp->opcode.
- */
- memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
- buf[0] = kp->opcode;
- return 0;
-}
-
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
-/* Check if paddr is at an instruction boundary */
-static int __kprobes can_probe(unsigned long paddr)
-{
- int ret;
- unsigned long addr, offset = 0;
- struct insn insn;
- kprobe_opcode_t buf[MAX_INSN_SIZE];
-
- if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
- return 0;
-
- /* Decode instructions */
- addr = paddr - offset;
- while (addr < paddr) {
- kernel_insn_init(&insn, (void *)addr);
- insn_get_opcode(&insn);
-
- /*
- * Check if the instruction has been modified by another
- * kprobe, in which case we replace the breakpoint by the
- * original instruction in our buffer.
- */
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf, addr);
- if (ret)
- /*
- * Another debugging subsystem might insert
- * this breakpoint. In that case, we can't
- * recover it.
- */
- return 0;
- kernel_insn_init(&insn, buf);
- }
- insn_get_length(&insn);
- addr += insn.length;
- }
-
- return (addr == paddr);
-}
-
-/*
- * Returns non-zero if opcode modifies the interrupt flag.
- */
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
-{
- switch (*insn) {
- case 0xfa: /* cli */
- case 0xfb: /* sti */
- case 0xcf: /* iret/iretd */
- case 0x9d: /* popf/popfd */
- return 1;
- }
-
- /*
- * on X86_64, 0x40-0x4f are REX prefixes so we need to look
- * at the next byte instead.. but of course not recurse infinitely
- */
- if (is_REX_prefix(insn))
- return is_IF_modifier(++insn);
-
- return 0;
-}
-
-/*
- * Copy an instruction and adjust the displacement if the instruction
- * uses the %rip-relative addressing mode.
- * If it does, Return the address of the 32-bit displacement word.
- * If not, return null.
- * Only applicable to 64-bit x86.
- */
-static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
-{
- struct insn insn;
- int ret;
- kprobe_opcode_t buf[MAX_INSN_SIZE];
-
- kernel_insn_init(&insn, src);
- if (recover) {
- insn_get_opcode(&insn);
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf,
- (unsigned long)src);
- if (ret)
- return 0;
- kernel_insn_init(&insn, buf);
- }
- }
- insn_get_length(&insn);
- memcpy(dest, insn.kaddr, insn.length);
-
-#ifdef CONFIG_X86_64
- if (insn_rip_relative(&insn)) {
- s64 newdisp;
- u8 *disp;
- kernel_insn_init(&insn, dest);
- insn_get_displacement(&insn);
- /*
- * The copied instruction uses the %rip-relative addressing
- * mode. Adjust the displacement for the difference between
- * the original location of this instruction and the location
- * of the copy that will actually be run. The tricky bit here
- * is making sure that the sign extension happens correctly in
- * this calculation, since we need a signed 32-bit result to
- * be sign-extended to 64 bits when it's added to the %rip
- * value and yield the same 64-bit result that the sign-
- * extension of the original signed 32-bit displacement would
- * have given.
- */
- newdisp = (u8 *) src + (s64) insn.displacement.value -
- (u8 *) dest;
- BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
- disp = (u8 *) dest + insn_offset_displacement(&insn);
- *(s32 *) disp = (s32) newdisp;
- }
-#endif
- return insn.length;
-}
-
-static void __kprobes arch_copy_kprobe(struct kprobe *p)
-{
- /*
- * Copy an instruction without recovering int3, because it will be
- * put by another subsystem.
- */
- __copy_instruction(p->ainsn.insn, p->addr, 0);
-
- if (can_boost(p->addr))
- p->ainsn.boostable = 0;
- else
- p->ainsn.boostable = -1;
-
- p->opcode = *p->addr;
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
- if (alternatives_text_reserved(p->addr, p->addr))
- return -EINVAL;
-
- if (!can_probe((unsigned long)p->addr))
- return -EILSEQ;
- /* insn: must be on special executable page on x86. */
- p->ainsn.insn = get_insn_slot();
- if (!p->ainsn.insn)
- return -ENOMEM;
- arch_copy_kprobe(p);
- return 0;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
- text_poke(p->addr, &p->opcode, 1);
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
- if (p->ainsn.insn) {
- free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
- p->ainsn.insn = NULL;
- }
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
- kcb->prev_kprobe.kp = kprobe_running();
- kcb->prev_kprobe.status = kcb->kprobe_status;
- kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
- kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
- __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
- kcb->kprobe_status = kcb->prev_kprobe.status;
- kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
- kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- __get_cpu_var(current_kprobe) = p;
- kcb->kprobe_saved_flags = kcb->kprobe_old_flags
- = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- if (is_IF_modifier(p->ainsn.insn))
- kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
-}
-
-static void __kprobes clear_btf(void)
-{
- if (test_thread_flag(TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- }
-}
-
-static void __kprobes restore_btf(void)
-{
- if (test_thread_flag(TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- }
-}
-
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
- struct pt_regs *regs)
-{
- unsigned long *sara = stack_addr(regs);
-
- ri->ret_addr = (kprobe_opcode_t *) *sara;
-
- /* Replace the return addr with trampoline addr */
- *sara = (unsigned long) &kretprobe_trampoline;
-}
-
-#ifdef CONFIG_OPTPROBES
-static int __kprobes setup_detour_execution(struct kprobe *p,
- struct pt_regs *regs,
- int reenter);
-#else
-#define setup_detour_execution(p, regs, reenter) (0)
-#endif
-
-static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb, int reenter)
-{
- if (setup_detour_execution(p, regs, reenter))
- return;
-
-#if !defined(CONFIG_PREEMPT)
- if (p->ainsn.boostable == 1 && !p->post_handler) {
- /* Boost up -- we can execute copied instructions directly */
- if (!reenter)
- reset_current_kprobe();
- /*
- * Reentering boosted probe doesn't reset current_kprobe,
- * nor set current_kprobe, because it doesn't use single
- * stepping.
- */
- regs->ip = (unsigned long)p->ainsn.insn;
- preempt_enable_no_resched();
- return;
- }
-#endif
- if (reenter) {
- save_previous_kprobe(kcb);
- set_current_kprobe(p, regs, kcb);
- kcb->kprobe_status = KPROBE_REENTER;
- } else
- kcb->kprobe_status = KPROBE_HIT_SS;
- /* Prepare real single stepping */
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
-}
-
-/*
- * We have reentered the kprobe_handler(), since another probe was hit while
- * within the handler. We save the original kprobes variables and just single
- * step on the instruction of the new probe without calling any user handlers.
- */
-static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- switch (kcb->kprobe_status) {
- case KPROBE_HIT_SSDONE:
- case KPROBE_HIT_ACTIVE:
- kprobes_inc_nmissed_count(p);
- setup_singlestep(p, regs, kcb, 1);
- break;
- case KPROBE_HIT_SS:
- /* A probe has been hit in the codepath leading up to, or just
- * after, single-stepping of a probed instruction. This entire
- * codepath should strictly reside in .kprobes.text section.
- * Raise a BUG or we'll continue in an endless reentering loop
- * and eventually a stack overflow.
- */
- printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
- p->addr);
- dump_kprobe(p);
- BUG();
- default:
- /* impossible cases */
- WARN_ON(1);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled throughout this function.
- */
-static int __kprobes kprobe_handler(struct pt_regs *regs)
-{
- kprobe_opcode_t *addr;
- struct kprobe *p;
- struct kprobe_ctlblk *kcb;
-
- addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
- /*
- * We don't want to be preempted for the entire
- * duration of kprobe processing. We conditionally
- * re-enable preemption at the end of this function,
- * and also in reenter_kprobe() and setup_singlestep().
- */
- preempt_disable();
-
- kcb = get_kprobe_ctlblk();
- p = get_kprobe(addr);
-
- if (p) {
- if (kprobe_running()) {
- if (reenter_kprobe(p, regs, kcb))
- return 1;
- } else {
- set_current_kprobe(p, regs, kcb);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
- /*
- * If we have no pre-handler or it returned 0, we
- * continue with normal processing. If we have a
- * pre-handler and it returned non-zero, it prepped
- * for calling the break_handler below on re-entry
- * for jprobe processing, so get out doing nothing
- * more here.
- */
- if (!p->pre_handler || !p->pre_handler(p, regs))
- setup_singlestep(p, regs, kcb, 0);
- return 1;
- }
- } else if (*addr != BREAKPOINT_INSTRUCTION) {
- /*
- * The breakpoint instruction was removed right
- * after we hit it. Another cpu has removed
- * either a probepoint or a debugger breakpoint
- * at this address. In either case, no further
- * handling of this interrupt is appropriate.
- * Back up over the (now missing) int3 and run
- * the original instruction.
- */
- regs->ip = (unsigned long)addr;
- preempt_enable_no_resched();
- return 1;
- } else if (kprobe_running()) {
- p = __get_cpu_var(current_kprobe);
- if (p->break_handler && p->break_handler(p, regs)) {
- setup_singlestep(p, regs, kcb, 0);
- return 1;
- }
- } /* else: not a kprobe fault; let the kernel handle it */
-
- preempt_enable_no_resched();
- return 0;
-}
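
As a rough illustration of how this handler is reached in practice, the sketch below registers a kprobe from a module; when the int3 at the probed address fires, kprobe_handler() above ends up invoking the pre-handler. This is a minimal sketch, not part of the patch: the probed symbol name is an arbitrary assumption (it differs across kernel versions) and error handling is elided.

#include <linux/module.h>
#include <linux/kprobes.h>

/* Probed symbol is an assumption; pick any non-blacklisted kernel symbol. */
static struct kprobe kp = {
	.symbol_name = "kernel_clone",
};

static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	/* Runs from the breakpoint handler with preemption disabled. */
	pr_info("kprobe pre-handler: ip = %lx\n", regs->ip);
	return 0;	/* 0: fall through to single-stepping */
}

static int __init kp_example_init(void)
{
	kp.pre_handler = handler_pre;
	return register_kprobe(&kp);
}

static void __exit kp_example_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_example_init);
module_exit(kp_example_exit);
MODULE_LICENSE("GPL");
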
-
-#ifdef CONFIG_X86_64
-#define SAVE_REGS_STRING \
- /* Skip cs, ip, orig_ax. */ \
- " subq $24, %rsp\n" \
- " pushq %rdi\n" \
- " pushq %rsi\n" \
- " pushq %rdx\n" \
- " pushq %rcx\n" \
- " pushq %rax\n" \
- " pushq %r8\n" \
- " pushq %r9\n" \
- " pushq %r10\n" \
- " pushq %r11\n" \
- " pushq %rbx\n" \
- " pushq %rbp\n" \
- " pushq %r12\n" \
- " pushq %r13\n" \
- " pushq %r14\n" \
- " pushq %r15\n"
-#define RESTORE_REGS_STRING \
- " popq %r15\n" \
- " popq %r14\n" \
- " popq %r13\n" \
- " popq %r12\n" \
- " popq %rbp\n" \
- " popq %rbx\n" \
- " popq %r11\n" \
- " popq %r10\n" \
- " popq %r9\n" \
- " popq %r8\n" \
- " popq %rax\n" \
- " popq %rcx\n" \
- " popq %rdx\n" \
- " popq %rsi\n" \
- " popq %rdi\n" \
- /* Skip orig_ax, ip, cs */ \
- " addq $24, %rsp\n"
-#else
-#define SAVE_REGS_STRING \
- /* Skip cs, ip, orig_ax and gs. */ \
- " subl $16, %esp\n" \
- " pushl %fs\n" \
- " pushl %es\n" \
- " pushl %ds\n" \
- " pushl %eax\n" \
- " pushl %ebp\n" \
- " pushl %edi\n" \
- " pushl %esi\n" \
- " pushl %edx\n" \
- " pushl %ecx\n" \
- " pushl %ebx\n"
-#define RESTORE_REGS_STRING \
- " popl %ebx\n" \
- " popl %ecx\n" \
- " popl %edx\n" \
- " popl %esi\n" \
- " popl %edi\n" \
- " popl %ebp\n" \
- " popl %eax\n" \
-	/* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here */\
- " addl $24, %esp\n"
-#endif
-
-/*
- * When a retprobed function returns, this code saves registers and
- * calls trampoline_handler(), which in turn calls the kretprobe's handler.
- */
-static void __used __kprobes kretprobe_trampoline_holder(void)
-{
- asm volatile (
- ".global kretprobe_trampoline\n"
- "kretprobe_trampoline: \n"
-#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
- " pushq %rsp\n"
- " pushfq\n"
- SAVE_REGS_STRING
- " movq %rsp, %rdi\n"
- " call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movq %rax, 152(%rsp)\n"
- RESTORE_REGS_STRING
- " popfq\n"
-#else
- " pushf\n"
- SAVE_REGS_STRING
- " movl %esp, %eax\n"
- " call trampoline_handler\n"
- /* Move flags to cs */
- " movl 56(%esp), %edx\n"
- " movl %edx, 52(%esp)\n"
- /* Replace saved flags with true return address. */
- " movl %eax, 56(%esp)\n"
- RESTORE_REGS_STRING
- " popf\n"
-#endif
- " ret\n");
-}
-
-/*
- * Called from kretprobe_trampoline
- */
-static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
-{
- struct kretprobe_instance *ri = NULL;
- struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
- unsigned long flags, orig_ret_address = 0;
- unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
-
- INIT_HLIST_HEAD(&empty_rp);
- kretprobe_hash_lock(current, &head, &flags);
- /* fixup registers */
-#ifdef CONFIG_X86_64
- regs->cs = __KERNEL_CS;
-#else
- regs->cs = __KERNEL_CS | get_kernel_rpl();
- regs->gs = 0;
-#endif
- regs->ip = trampoline_address;
- regs->orig_ax = ~0UL;
-
-	/*
-	 * It is possible to have multiple instances associated with a given
-	 * task, either because multiple functions in the call path have
-	 * return probes installed on them, or because more than one
-	 * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always pushed into the head of the list
- * - when multiple return probes are registered for the same
- * function, the (chronologically) first instance's ret_addr
- * will be the real return address, and all the rest will
- * point to kretprobe_trampoline.
- */
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- if (ri->rp && ri->rp->handler) {
- __get_cpu_var(current_kprobe) = &ri->rp->kp;
- get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
- ri->rp->handler(ri, regs);
- __get_cpu_var(current_kprobe) = NULL;
- }
-
- orig_ret_address = (unsigned long)ri->ret_addr;
- recycle_rp_inst(ri, &empty_rp);
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
- kretprobe_hash_unlock(current, &flags);
-
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
- return (void *)orig_ret_address;
-}
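
The trampoline above is what makes return probes work. The following minimal sketch (again not part of the patch, with an assumed example symbol name) registers a kretprobe whose handler is called from trampoline_handler() with the fixed-up registers, so regs_return_value() yields the probed function's return value.

#include <linux/module.h>
#include <linux/kprobes.h>

static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* Invoked from trampoline_handler() with the fixed-up registers. */
	pr_info("probed function returned %ld\n", regs_return_value(regs));
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	 = ret_handler,
	.kp.symbol_name	 = "kernel_clone",	/* assumed example symbol */
	.maxactive	 = 16,	/* instances to pre-allocate in the hash */
};

static int __init krp_example_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit krp_example_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(krp_example_init);
module_exit(krp_example_exit);
MODULE_LICENSE("GPL");
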
-
-/*
- * Called after single-stepping. p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction. To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction. The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt. We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new ip is relative to the copied instruction. We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed flags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- *
- * If this is the first time we've single-stepped the instruction at
- * this probepoint, and the instruction is boostable, boost it: add a
- * jump instruction after the copied instruction, that jumps to the next
- * instruction after the probepoint.
- */
-static void __kprobes resume_execution(struct kprobe *p,
- struct pt_regs *regs, struct kprobe_ctlblk *kcb)
-{
- unsigned long *tos = stack_addr(regs);
- unsigned long copy_ip = (unsigned long)p->ainsn.insn;
- unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
- /*skip the REX prefix*/
- if (is_REX_prefix(insn))
- insn++;
-
- regs->flags &= ~X86_EFLAGS_TF;
- switch (*insn) {
- case 0x9c: /* pushfl */
- *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
- *tos |= kcb->kprobe_old_flags;
- break;
- case 0xc2: /* iret/ret/lret */
- case 0xc3:
- case 0xca:
- case 0xcb:
- case 0xcf:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.boostable = 1;
- goto no_change;
- case 0xe8: /* call relative - Fix return addr */
- *tos = orig_ip + (*tos - copy_ip);
- break;
-#ifdef CONFIG_X86_32
-	case 0x9a: /* call absolute -- fix return addr, same as indirect call */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
-#endif
- case 0xff:
- if ((insn[1] & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
- } else if (((insn[1] & 0x31) == 0x20) ||
- ((insn[1] & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct. And this is boostable
- */
- p->ainsn.boostable = 1;
- goto no_change;
- }
- default:
- break;
- }
-
- if (p->ainsn.boostable == 0) {
- if ((regs->ip > copy_ip) &&
- (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
-			/*
-			 * These instructions can be executed directly,
-			 * provided we add a jump back to the correct address.
-			 */
- synthesize_reljump((void *)regs->ip,
- (void *)orig_ip + (regs->ip - copy_ip));
- p->ainsn.boostable = 1;
- } else {
- p->ainsn.boostable = -1;
- }
- }
-
- regs->ip += orig_ip - copy_ip;
-
-no_change:
- restore_btf();
-}
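
The stack and ip fix-ups above are plain offset translation between the copied out-of-line slot and the original text. A small stand-alone sketch of the same arithmetic, with made-up addresses:

#include <stdio.h>

/*
 * Sketch of the fix-up in resume_execution(): an address produced while
 * running the copied instruction (near copy_ip) is translated back into
 * the original function (near orig_ip).
 */
static unsigned long fix_addr(unsigned long addr,
			      unsigned long copy_ip, unsigned long orig_ip)
{
	return orig_ip + (addr - copy_ip);
}

int main(void)
{
	unsigned long copy_ip = 0x1000;	/* made-up out-of-line slot */
	unsigned long orig_ip = 0x5000;	/* made-up probed address */

	/* A call pushed copy_ip + 5; the caller must see orig_ip + 5. */
	printf("fixed return address: %#lx\n",
	       fix_addr(copy_ip + 5, copy_ip, orig_ip));
	return 0;
}
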
-
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled throughout this function.
- */
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
- struct kprobe *cur = kprobe_running();
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- if (!cur)
- return 0;
-
- resume_execution(cur, regs, kcb);
- regs->flags |= kcb->kprobe_saved_flags;
-
- if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- cur->post_handler(cur, regs, 0);
- }
-
- /* Restore back the original saved kprobes variables and continue. */
- if (kcb->kprobe_status == KPROBE_REENTER) {
- restore_previous_kprobe(kcb);
- goto out;
- }
- reset_current_kprobe();
-out:
- preempt_enable_no_resched();
-
- /*
- * if somebody else is singlestepping across a probe point, flags
- * will have TF set, in which case, continue the remaining processing
- * of do_debug, as if this is not a probe hit.
- */
- if (regs->flags & X86_EFLAGS_TF)
- return 0;
-
- return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
- struct kprobe *cur = kprobe_running();
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- switch (kcb->kprobe_status) {
- case KPROBE_HIT_SS:
- case KPROBE_REENTER:
- /*
- * We are here because the instruction being single
- * stepped caused a page fault. We reset the current
- * kprobe and the ip points back to the probe address
- * and allow the page fault handler to continue as a
- * normal page fault.
- */
- regs->ip = (unsigned long)cur->addr;
- regs->flags |= kcb->kprobe_old_flags;
- if (kcb->kprobe_status == KPROBE_REENTER)
- restore_previous_kprobe(kcb);
- else
- reset_current_kprobe();
- preempt_enable_no_resched();
- break;
- case KPROBE_HIT_ACTIVE:
- case KPROBE_HIT_SSDONE:
-		/*
-		 * We increment the nmissed count for accounting;
-		 * the npre/npostfault counts could also be used to
-		 * account for these specific fault cases.
-		 */
- kprobes_inc_nmissed_count(cur);
-
-		/*
-		 * We come here because instructions in the pre/post
-		 * handler caused the page fault. This could happen
-		 * if the handler tries to access user space via
-		 * copy_from_user(), get_user(), etc. Let the
-		 * user-specified handler try to fix it first.
-		 */
- if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
- return 1;
-
- /*
- * In case the user-specified fault handler returned
- * zero, try to fix up.
- */
- if (fixup_exception(regs))
- return 1;
-
- /*
- * fixup routine could not handle it,
- * Let do_page_fault() fix it.
- */
- break;
- default:
- break;
- }
- return 0;
-}
-
-/*
- * Wrapper routine for handling exceptions.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct die_args *args = data;
- int ret = NOTIFY_DONE;
-
- if (args->regs && user_mode_vm(args->regs))
- return ret;
-
- switch (val) {
- case DIE_INT3:
- if (kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_DEBUG:
- if (post_kprobe_handler(args->regs)) {
- /*
- * Reset the BS bit in dr6 (pointed by args->err) to
- * denote completion of processing
- */
- (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
- ret = NOTIFY_STOP;
- }
- break;
- case DIE_GPF:
-		/*
-		 * To be potentially processing a kprobe fault and to
-		 * trust the result from kprobe_running(), we have to
-		 * be non-preemptible.
-		 */
- if (!preemptible() && kprobe_running() &&
- kprobe_fault_handler(args->regs, args->trapnr))
- ret = NOTIFY_STOP;
- break;
- default:
- break;
- }
- return ret;
-}
-
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct jprobe *jp = container_of(p, struct jprobe, kp);
- unsigned long addr;
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- kcb->jprobe_saved_regs = *regs;
- kcb->jprobe_saved_sp = stack_addr(regs);
- addr = (unsigned long)(kcb->jprobe_saved_sp);
-
- /*
- * As Linus pointed out, gcc assumes that the callee
- * owns the argument space and could overwrite it, e.g.
- * tailcall optimization. So, to be absolutely safe
- * we also save and restore enough stack bytes to cover
- * the argument area.
- */
- memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
- MIN_STACK_SIZE(addr));
- regs->flags &= ~X86_EFLAGS_IF;
- trace_hardirqs_off();
- regs->ip = (unsigned long)(jp->entry);
- return 1;
-}
-
-void __kprobes jprobe_return(void)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- asm volatile (
-#ifdef CONFIG_X86_64
- " xchg %%rbx,%%rsp \n"
-#else
- " xchgl %%ebx,%%esp \n"
-#endif
- " int3 \n"
- " .globl jprobe_return_end\n"
- " jprobe_return_end: \n"
- " nop \n"::"b"
- (kcb->jprobe_saved_sp):"memory");
-}
-
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- u8 *addr = (u8 *) (regs->ip - 1);
- struct jprobe *jp = container_of(p, struct jprobe, kp);
-
- if ((addr > (u8 *) jprobe_return) &&
- (addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != kcb->jprobe_saved_sp) {
- struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
- printk(KERN_ERR
- "current sp %p does not match saved sp %p\n",
- stack_addr(regs), kcb->jprobe_saved_sp);
- printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_registers(saved_regs);
- printk(KERN_ERR "Current registers\n");
- show_registers(regs);
- BUG();
- }
- *regs = kcb->jprobe_saved_regs;
- memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
- kcb->jprobes_stack,
- MIN_STACK_SIZE(kcb->jprobe_saved_sp));
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-
-
-#ifdef CONFIG_OPTPROBES
-
-/* Insert a call instruction at address 'from', which calls address 'to'.*/
-static void __kprobes synthesize_relcall(void *from, void *to)
-{
- __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
-}
-
-/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
- unsigned long val)
-{
-#ifdef CONFIG_X86_64
- *addr++ = 0x48;
- *addr++ = 0xbf;
-#else
- *addr++ = 0xb8;
-#endif
- *(unsigned long *)addr = val;
-}
-
-void __kprobes kprobes_optinsn_template_holder(void)
-{
- asm volatile (
- ".global optprobe_template_entry\n"
- "optprobe_template_entry: \n"
-#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
- " pushq %rsp\n"
- " pushfq\n"
- SAVE_REGS_STRING
- " movq %rsp, %rsi\n"
- ".global optprobe_template_val\n"
- "optprobe_template_val: \n"
- ASM_NOP5
- ASM_NOP5
- ".global optprobe_template_call\n"
- "optprobe_template_call: \n"
- ASM_NOP5
- /* Move flags to rsp */
- " movq 144(%rsp), %rdx\n"
- " movq %rdx, 152(%rsp)\n"
- RESTORE_REGS_STRING
- /* Skip flags entry */
- " addq $8, %rsp\n"
- " popfq\n"
-#else /* CONFIG_X86_32 */
- " pushf\n"
- SAVE_REGS_STRING
- " movl %esp, %edx\n"
- ".global optprobe_template_val\n"
- "optprobe_template_val: \n"
- ASM_NOP5
- ".global optprobe_template_call\n"
- "optprobe_template_call: \n"
- ASM_NOP5
- RESTORE_REGS_STRING
- " addl $4, %esp\n" /* skip cs */
- " popf\n"
-#endif
- ".global optprobe_template_end\n"
- "optprobe_template_end: \n");
-}
-
-#define TMPL_MOVE_IDX \
- ((long)&optprobe_template_val - (long)&optprobe_template_entry)
-#define TMPL_CALL_IDX \
- ((long)&optprobe_template_call - (long)&optprobe_template_entry)
-#define TMPL_END_IDX \
- ((long)&optprobe_template_end - (long)&optprobe_template_entry)
-
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
-/* Optimized kprobe callback function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op,
- struct pt_regs *regs)
-{
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- preempt_disable();
- if (kprobe_running()) {
- kprobes_inc_nmissed_count(&op->kp);
- } else {
- /* Save skipped registers */
-#ifdef CONFIG_X86_64
- regs->cs = __KERNEL_CS;
-#else
- regs->cs = __KERNEL_CS | get_kernel_rpl();
- regs->gs = 0;
-#endif
- regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
- regs->orig_ax = ~0UL;
-
- __get_cpu_var(current_kprobe) = &op->kp;
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
- opt_pre_handler(&op->kp, regs);
- __get_cpu_var(current_kprobe) = NULL;
- }
- preempt_enable_no_resched();
-}
-
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
-{
- int len = 0, ret;
-
- while (len < RELATIVEJUMP_SIZE) {
- ret = __copy_instruction(dest + len, src + len, 1);
- if (!ret || !can_boost(dest + len))
- return -EINVAL;
- len += ret;
- }
- /* Check whether the address range is reserved */
- if (ftrace_text_reserved(src, src + len - 1) ||
- alternatives_text_reserved(src, src + len - 1))
- return -EBUSY;
-
- return len;
-}
-
-/* Check whether insn is an indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
-{
- return ((insn->opcode.bytes[0] == 0xff &&
- (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
- insn->opcode.bytes[0] == 0xea); /* Segment based jump */
-}
-
-/* Check whether insn jumps into specified address range */
-static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
-{
- unsigned long target = 0;
-
- switch (insn->opcode.bytes[0]) {
- case 0xe0: /* loopne */
- case 0xe1: /* loope */
- case 0xe2: /* loop */
- case 0xe3: /* jcxz */
- case 0xe9: /* near relative jump */
- case 0xeb: /* short relative jump */
- break;
- case 0x0f:
- if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
- break;
- return 0;
- default:
- if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
- break;
- return 0;
- }
- target = (unsigned long)insn->next_byte + insn->immediate.value;
-
- return (start <= target && target <= start + len);
-}
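
The target computation above boils down to the byte after the instruction plus its signed immediate. A stand-alone sketch of the same range test, assuming the immediate has already been sign-extended:

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of insn_jump_into_range(): a relative branch lands at the byte
 * following the instruction plus its (sign-extended) immediate.
 */
static bool jumps_into_range(unsigned long next_byte, long imm,
			     unsigned long start, int len)
{
	unsigned long target = next_byte + imm;

	return start <= target && target <= start + len;
}

int main(void)
{
	/* e.g. a 2-byte jmp ending at 0x102 with imm -4: target 0xfe */
	printf("%d\n", jumps_into_range(0x102, -4, 0xf0, 0x10));	/* 1 */
	return 0;
}
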
-
-/* Decode the whole function to ensure no instruction jumps into the target */
-static int __kprobes can_optimize(unsigned long paddr)
-{
- int ret;
- unsigned long addr, size = 0, offset = 0;
- struct insn insn;
- kprobe_opcode_t buf[MAX_INSN_SIZE];
- /* Dummy buffers for lookup_symbol_attrs */
- static char __dummy_buf[KSYM_NAME_LEN];
-
- /* Lookup symbol including addr */
- if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
- return 0;
-
- /* Check there is enough space for a relative jump. */
- if (size - offset < RELATIVEJUMP_SIZE)
- return 0;
-
- /* Decode instructions */
- addr = paddr - offset;
- while (addr < paddr - offset + size) { /* Decode until function end */
- if (search_exception_tables(addr))
-			/*
-			 * Since some fixup code may jump into this function,
-			 * we can't optimize a kprobe in this function.
-			 */
- return 0;
- kernel_insn_init(&insn, (void *)addr);
- insn_get_opcode(&insn);
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
- ret = recover_probed_instruction(buf, addr);
- if (ret)
- return 0;
- kernel_insn_init(&insn, buf);
- }
- insn_get_length(&insn);
- /* Recover address */
- insn.kaddr = (void *)addr;
- insn.next_byte = (void *)(addr + insn.length);
- /* Check any instructions don't jump into target */
- if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_SIZE,
- RELATIVE_ADDR_SIZE))
- return 0;
- addr += insn.length;
- }
-
- return 1;
-}
-
-/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
-{
- int i;
- struct kprobe *p;
-
- for (i = 1; i < op->optinsn.size; i++) {
- p = get_kprobe(op->kp.addr + i);
- if (p && !kprobe_disabled(p))
- return -EEXIST;
- }
-
- return 0;
-}
-
-/* Check the addr is within the optimized instructions. */
-int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
- unsigned long addr)
-{
- return ((unsigned long)op->kp.addr <= addr &&
- (unsigned long)op->kp.addr + op->optinsn.size > addr);
-}
-
-/* Free optimized instruction slot */
-static __kprobes
-void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
-{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
- op->optinsn.insn = NULL;
- op->optinsn.size = 0;
- }
-}
-
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
-{
- __arch_remove_optimized_kprobe(op, 1);
-}
-
-/*
- * Copy replacing target instructions
- * Target instructions MUST be relocatable (checked inside)
- */
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
-{
- u8 *buf;
- int ret;
- long rel;
-
- if (!can_optimize((unsigned long)op->kp.addr))
- return -EILSEQ;
-
- op->optinsn.insn = get_optinsn_slot();
- if (!op->optinsn.insn)
- return -ENOMEM;
-
-	/*
-	 * Verify that the address gap is within the +/-2GB range, because
-	 * this uses a relative jump.
-	 */
- rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
- if (abs(rel) > 0x7fffffff)
- return -ERANGE;
-
- buf = (u8 *)op->optinsn.insn;
-
- /* Copy instructions into the out-of-line buffer */
- ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
- if (ret < 0) {
- __arch_remove_optimized_kprobe(op, 0);
- return ret;
- }
- op->optinsn.size = ret;
-
- /* Copy arch-dep-instance from template */
- memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
-
- /* Set probe information */
- synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
-
- /* Set probe function call */
- synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
-
- /* Set returning jmp instruction at the tail of out-of-line buffer */
- synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
- (u8 *)op->kp.addr + op->optinsn.size);
-
- flush_icache_range((unsigned long) buf,
- (unsigned long) buf + TMPL_END_IDX +
- op->optinsn.size + RELATIVEJUMP_SIZE);
- return 0;
-}
-
-/* Replace a breakpoint (int3) with a relative jump. */
-int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
-{
- unsigned char jmp_code[RELATIVEJUMP_SIZE];
- s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
- /* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
-
- jmp_code[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&jmp_code[1]) = rel;
-
-	/*
-	 * text_poke_smp doesn't support modifying code that may run in
-	 * NMI/MCE context. However, since kprobes itself also doesn't
-	 * support probing NMI/MCE code, this is not a problem.
-	 */
- text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
- return 0;
-}
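
The jump written by arch_optimize_kprobe() is the usual 5-byte near jmp whose rel32 is measured from the end of the jump itself. A stand-alone sketch of the encoding, with made-up addresses standing in for kp.addr and the optinsn slot:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define RELJMP_OPCODE	0xe9	/* near jmp rel32, as synthesized above */
#define RELJMP_SIZE	5

/* Build the 5-byte jump that replaces the probe's int3. */
static void make_reljump(uint8_t buf[RELJMP_SIZE], uint64_t from, uint64_t to)
{
	int32_t rel = (int32_t)(to - (from + RELJMP_SIZE));

	buf[0] = RELJMP_OPCODE;
	memcpy(buf + 1, &rel, sizeof(rel));
}

int main(void)
{
	uint8_t jmp[RELJMP_SIZE];
	int i;

	/* made-up addresses; the gap must stay within +/-2GB */
	make_reljump(jmp, 0xffffffff81000100ULL, 0xffffffff81200000ULL);
	for (i = 0; i < RELJMP_SIZE; i++)
		printf("%02x ", jmp[i]);
	printf("\n");	/* e9 fb fe 1f 00 */
	return 0;
}
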
-
-/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
-{
- u8 buf[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- buf[0] = BREAKPOINT_INSTRUCTION;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
-}
-
-static int __kprobes setup_detour_execution(struct kprobe *p,
- struct pt_regs *regs,
- int reenter)
-{
- struct optimized_kprobe *op;
-
- if (p->flags & KPROBE_FLAG_OPTIMIZED) {
- /* This kprobe is really able to run optimized path. */
- op = container_of(p, struct optimized_kprobe, kp);
- /* Detour through copied instructions */
- regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
- if (!reenter)
- reset_current_kprobe();
- preempt_enable_no_resched();
- return 1;
- }
- return 0;
-}
-#endif
-
-int __init arch_init_kprobes(void)
-{
- return 0;
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
- return 0;
-}
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 000000000000..8a753432b2d4
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for kernel probes
+#
+
+obj-$(CONFIG_KPROBES) += core.o
+obj-$(CONFIG_OPTPROBES) += opt.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
new file mode 100644
index 000000000000..e772276f5aa9
--- /dev/null
+++ b/arch/x86/kernel/kprobes/common.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#include <asm/asm.h>
+#include <asm/frame.h>
+#include <asm/insn.h>
+
+#ifdef CONFIG_X86_64
+
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ " pushq %rbp\n" \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n" \
+ ENCODE_FRAME_POINTER
+
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $4*4, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %es\n" \
+ " pushl %ds\n" \
+ " pushl %eax\n" \
+ " pushl %ebp\n" \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n" \
+ ENCODE_FRAME_POINTER
+
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\
+ " addl $7*4, %esp\n"
+#endif
+
+/* Check whether the instruction can be boosted */
+extern bool can_boost(struct insn *insn, void *orig_addr);
+/* Recover instruction if given address is probed */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+ unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn);
+
+/* Generate a relative-jump/call instruction */
+extern void synthesize_reljump(void *dest, void *from, void *to);
+extern void synthesize_relcall(void *dest, void *from, void *to);
+
+#ifdef CONFIG_OPTPROBES
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else /* !CONFIG_OPTPROBES */
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ return 0;
+}
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ return addr;
+}
+#endif
+
+#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
new file mode 100644
index 000000000000..c1fac3a9fecc
--- /dev/null
+++ b/arch/x86/kernel/kprobes/core.c
@@ -0,0 +1,1081 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel Probes (KProbes)
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation ( includes contributions from
+ * Rusty Russell).
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> adapted for x86_64 from i386.
+ * 2005-Mar Roland McGrath <roland@redhat.com>
+ * Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com>
+ * Added function return probes functionality
+ * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
+ * kprobe-booster and kretprobe-booster for i386.
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
+ * and kretprobe-booster for x86-64
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
+ * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ * unified x86 kprobes code.
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/ftrace.h>
+#include <linux/kasan.h>
+#include <linux/objtool.h>
+#include <linux/vmalloc.h>
+#include <linux/pgtable.h>
+#include <linux/set_memory.h>
+#include <linux/cfi.h>
+#include <linux/execmem.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/ibt.h>
+
+#include "common.h"
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+	/*
+	 * Undefined/reserved opcodes, conditional jumps, opcode extension
+	 * groups, and some special opcodes can not be boosted.
+	 * This is non-const and volatile to keep gcc from statically
+	 * optimizing it out, as variable_test_bit makes gcc think only
+	 * *(unsigned long*) is used.
+	 */
+static volatile u32 twobyte_is_boostable[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
+ W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
+ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
+ W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
+ W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
+ /* ----------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+struct kretprobe_blackpoint kretprobe_blacklist[] = {
+ {"__switch_to", }, /* This function switches only current task, but
+ doesn't switch kernel stack.*/
+ {NULL, NULL} /* Terminator */
+};
+
+const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
+
+static nokprobe_inline void
+__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
+{
+ struct __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } __packed *insn;
+
+ insn = (struct __arch_relative_insn *)dest;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+void synthesize_reljump(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_reljump);
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+void synthesize_relcall(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_relcall);
+
+/*
+ * Returns true if INSN is boostable.
+ * RIP-relative instructions are adjusted at copy time in 64-bit mode.
+ */
+bool can_boost(struct insn *insn, void *addr)
+{
+ kprobe_opcode_t opcode;
+ insn_byte_t prefix;
+
+ if (search_exception_tables((unsigned long)addr))
+ return false; /* Page fault may occur on this address. */
+
+ /* 2nd-byte opcode */
+ if (insn->opcode.nbytes == 2)
+ return test_bit(insn->opcode.bytes[1],
+ (unsigned long *)twobyte_is_boostable);
+
+ if (insn->opcode.nbytes != 1)
+ return false;
+
+ for_each_insn_prefix(insn, prefix) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(prefix);
+		/* Can't boost address-size override or CS override prefixes */
+ if (prefix == 0x2e || inat_is_address_size_prefix(attr))
+ return false;
+ }
+
+ opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0x62: /* bound */
+ case 0x70 ... 0x7f: /* Conditional jumps */
+ case 0x9a: /* Call far */
+ case 0xcc ... 0xce: /* software exceptions */
+ case 0xd6: /* (UD) */
+ case 0xd8 ... 0xdf: /* ESC */
+ case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
+ case 0xe8 ... 0xe9: /* near Call, JMP */
+ case 0xeb: /* Short JMP */
+ case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
+ /* ... are not boostable */
+ return false;
+ case 0xc0 ... 0xc1: /* Grp2 */
+ case 0xd0 ... 0xd3: /* Grp2 */
+ /*
+ * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved.
+ */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110;
+ case 0xf6 ... 0xf7: /* Grp3 */
+ /* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001;
+ case 0xfe: /* Grp4 */
+ /* Only INC and DEC are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001;
+ case 0xff: /* Grp5 */
+ /* Only INC, DEC, and indirect JMP are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100;
+ default:
+ return true;
+ }
+}
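
The twobyte_is_boostable[] table consulted above packs one decision bit per second-byte opcode into eight 32-bit words; test_bit() simply indexes word opcode/32 and bit opcode%32. A stand-alone sketch of the same lookup:

#include <stdio.h>
#include <stdint.h>

/* One bit per second-byte opcode: 256 bits in eight 32-bit words. */
static int opcode_is_boostable(const uint32_t map[8], uint8_t opcode)
{
	return (map[opcode / 32] >> (opcode % 32)) & 1;
}

int main(void)
{
	uint32_t map[8] = { 0 };

	/* Mark 0x0f 0x90 (seto) as boostable, the way row 0x90 does. */
	map[0x90 / 32] |= UINT32_C(1) << (0x90 % 32);

	printf("0x90: %d\n", opcode_is_boostable(map, 0x90));	/* 1 */
	printf("0x80: %d\n", opcode_is_boostable(map, 0x80));	/* 0 */
	return 0;
}
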
+
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ bool faddr;
+
+ kp = get_kprobe((void *)addr);
+ faddr = ftrace_location(addr) == addr;
+ /*
+ * Use the current code if it is not modified by Kprobe
+ * and it cannot be modified by ftrace.
+ */
+ if (!kp && !faddr)
+ return addr;
+
+	/*
+	 * Basically, kp->ainsn.insn holds the original instruction.
+	 * However, a RIP-relative instruction can not be single-stepped
+	 * at a different place, so __copy_instruction() tweaks its
+	 * displacement. In that case, we can't recover the original
+	 * instruction from kp->ainsn.insn.
+	 *
+	 * On the other hand, for a normal kprobe, kp->opcode holds a copy
+	 * of the first byte of the probed instruction, which int3
+	 * overwrites. Since the instruction at kp->addr is not modified
+	 * by kprobes except for that first byte, we can recover the
+	 * original instruction from it and kp->opcode.
+	 *
+	 * In case of kprobes using ftrace, we do not have a copy of
+	 * the original instruction. In fact, the ftrace location might
+	 * be modified at any time and could even be in an inconsistent
+	 * state. Fortunately, we know that the original code is the
+	 * ideal 5-byte-long NOP.
+	 */
+ if (copy_from_kernel_nofault(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (faddr)
+ memcpy(buf, x86_nops[5], 5);
+ else
+ buf[0] = kp->opcode;
+ return (unsigned long)buf;
+}
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ * Caller must lock kprobes by kprobe_mutex, or disable preemption
+ * for preventing to release referencing kprobes.
+ * Returns zero if the instruction can not get recovered (or access failed).
+ */
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ unsigned long __addr;
+
+ __addr = __recover_optprobed_insn(buf, addr);
+ if (__addr != addr)
+ return __addr;
+
+ return __recover_probed_insn(buf, addr);
+}
+
+/* Check if insn is INT or UD */
+static inline bool is_exception_insn(struct insn *insn)
+{
+ /* UD uses 0f escape */
+ if (insn->opcode.bytes[0] == 0x0f) {
+ /* UD0 / UD1 / UD2 */
+ return insn->opcode.bytes[1] == 0xff ||
+ insn->opcode.bytes[1] == 0xb9 ||
+ insn->opcode.bytes[1] == 0x0b;
+ }
+
+ /* INT3 / INT n / INTO / INT1 */
+ return insn->opcode.bytes[0] == 0xcc ||
+ insn->opcode.bytes[0] == 0xcd ||
+ insn->opcode.bytes[0] == 0xce ||
+ insn->opcode.bytes[0] == 0xf1;
+}
+
+/*
+ * Check if paddr is at an instruction boundary and that instruction can
+ * be probed
+ */
+static bool can_probe(unsigned long paddr)
+{
+ unsigned long addr, __addr, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return false;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case we replace the breakpoint by the
+ * original instruction in our buffer.
+ * Also, jump optimization will change the breakpoint to
+ * relative-jump. Since the relative-jump itself is
+ * normally used, we just go through if there is no kprobe.
+ */
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return false;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
+
+#ifdef CONFIG_KGDB
+ /*
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
+ */
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return false;
+#endif
+ addr += insn.length;
+ }
+
+ /* Check if paddr is at an instruction boundary */
+ if (addr != paddr)
+ return false;
+
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return false;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
+
+ /* INT and UD are special and should not be kprobed */
+ if (is_exception_insn(&insn))
+ return false;
+
+ if (IS_ENABLED(CONFIG_CFI)) {
+ /*
+ * The compiler generates the following instruction sequence
+ * for indirect call checks and cfi.c decodes this;
+ *
+		 *   movl -<id>, %r10d ; 6 bytes
+ * addl -4(%reg), %r10d ; 4 bytes
+ * je .Ltmp1 ; 2 bytes
+ * ud2 ; <- regs->ip
+ * .Ltmp1:
+ *
+ * Also, these movl and addl are used for showing expected
+ * type. So those must not be touched.
+ */
+ if (insn.opcode.value == 0xBA)
+ offset = 12;
+ else if (insn.opcode.value == 0x3)
+ offset = 6;
+ else
+ goto out;
+
+ /* This movl/addl is used for decoding CFI. */
+ if (is_cfi_trap(addr + offset))
+ return false;
+ }
+
+out:
+ return true;
+}
+
+/* If x86 supports IBT (ENDBR) it must be skipped. */
+kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
+ bool *on_func_entry)
+{
+ if (is_endbr((u32 *)addr)) {
+ *on_func_entry = !offset || offset == 4;
+ if (*on_func_entry)
+ offset = 4;
+
+ } else {
+ *on_func_entry = !offset;
+ }
+
+ return (kprobe_opcode_t *)(addr + offset);
+}
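
On kernels built with IBT, functions begin with a 4-byte ENDBR that a probe must not overwrite, which is why entry probes are pushed to offset 4 above. A simplified stand-alone sketch of the adjustment, with the ENDBR detection reduced to a boolean:

#include <stdbool.h>
#include <stdio.h>

#define ENDBR_SIZE 4	/* endbr64 occupies 4 bytes at function entry */

/*
 * Simplified sketch of arch_adjust_kprobe_addr(): when the function
 * starts with ENDBR, an "entry" probe really lives at offset 4.
 */
static unsigned long adjust_probe_addr(unsigned long addr, unsigned long offset,
				       bool has_endbr, bool *on_func_entry)
{
	if (has_endbr) {
		*on_func_entry = !offset || offset == ENDBR_SIZE;
		if (*on_func_entry)
			offset = ENDBR_SIZE;
	} else {
		*on_func_entry = !offset;
	}
	return addr + offset;
}

int main(void)
{
	bool entry;

	printf("%#lx entry=%d\n",
	       adjust_probe_addr(0x1000, 0, true, &entry), entry);	/* 0x1004 */
	return 0;
}
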
+
+/*
+ * Copy an instruction with recovering modified instruction by kprobes
+ * and adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode. Note that since @real will be the final place of copied
+ * instruction, displacement must be adjust by @real, not @dest.
+ * This returns the length of copied instruction, or 0 if it has an error.
+ */
+int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
+{
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
+ int ret;
+
+ if (!recovered_insn || !insn)
+ return 0;
+
+ /* This can access kernel text if given address is not recovered */
+ if (copy_from_kernel_nofault(dest, (void *)recovered_insn,
+ MAX_INSN_SIZE))
+ return 0;
+
+ ret = insn_decode_kernel(insn, dest);
+ if (ret < 0)
+ return 0;
+
+	/* We can not probe an instruction with a forced-emulation prefix */
+ if (insn_has_emulate_prefix(insn))
+ return 0;
+
+	/* Another subsystem put a breakpoint there; we failed to recover it */
+ if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
+ return 0;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return 0;
+
+#ifdef CONFIG_X86_64
+ /* Only x86_64 has RIP relative instructions */
+ if (insn_rip_relative(insn)) {
+ s64 newdisp;
+ u8 *disp;
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) src + (s64) insn->displacement.value
+ - (u8 *) real;
+ if ((s64) (s32) newdisp != newdisp) {
+ pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
+ return 0;
+ }
+ disp = (u8 *) dest + insn_offset_displacement(insn);
+ *(s32 *) disp = (s32) newdisp;
+ }
+#endif
+ return insn->length;
+}
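
The displacement fix-up above preserves the effective address: src + insn_len + disp must equal real + insn_len + newdisp, hence newdisp = src + disp - real, which must still fit in a signed 32-bit field. A stand-alone sketch of that check, with made-up addresses:

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of the %rip-relative fix-up: the effective address
 * src + len + disp must not change when the copy runs at 'real',
 * so newdisp = src + disp - real (the lengths cancel out).
 */
static int retarget_disp(uint64_t src, uint64_t real, int32_t disp,
			 int32_t *newdisp)
{
	int64_t nd = (int64_t)(src - real) + disp;

	if ((int64_t)(int32_t)nd != nd)
		return -1;	/* does not fit in the rel32 field */
	*newdisp = (int32_t)nd;
	return 0;
}

int main(void)
{
	int32_t nd;

	if (!retarget_disp(0x401000, 0x7f1000, 0x100, &nd))
		printf("newdisp = %d\n", nd);	/* 0x100 - 0x3f0000 */
	return 0;
}
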
+
+/* Prepare reljump or int3 right after instruction */
+static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
+ struct insn *insn)
+{
+ int len = insn->length;
+
+ if (!IS_ENABLED(CONFIG_PREEMPTION) &&
+ !p->post_handler && can_boost(insn, p->addr) &&
+ MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
+		/*
+		 * These instructions can be executed directly, provided
+		 * we add a jump back to the correct address.
+		 */
+ synthesize_reljump(buf + len, p->ainsn.insn + len,
+ p->addr + insn->length);
+ len += JMP32_INSN_SIZE;
+ p->ainsn.boostable = 1;
+ } else {
+ /* Otherwise, put an int3 for trapping singlestep */
+ if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
+ return -ENOSPC;
+
+ buf[len] = INT3_INSN_OPCODE;
+ len += INT3_INSN_SIZE;
+ }
+
+ return len;
+}
+
+/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
+
+static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
+{
+ switch (p->ainsn.opcode) {
+ case 0xfa: /* cli */
+ regs->flags &= ~(X86_EFLAGS_IF);
+ break;
+ case 0xfb: /* sti */
+ regs->flags |= X86_EFLAGS_IF;
+ break;
+ case 0x9c: /* pushf */
+ int3_emulate_push(regs, regs->flags);
+ break;
+ case 0x9d: /* popf */
+ regs->flags = int3_emulate_pop(regs);
+ break;
+ }
+ regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
+
+static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
+{
+ int3_emulate_ret(regs);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ret);
+
+static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ func += p->ainsn.rel32;
+ int3_emulate_call(regs, func);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call);
+
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp);
+
+static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jcc);
+
+static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+ bool match;
+
+ if (p->ainsn.loop.type != 3) { /* LOOP* */
+ if (p->ainsn.loop.asize == 32)
+ match = ((*(u32 *)&regs->cx)--) != 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = ((*(u64 *)&regs->cx)--) != 0;
+#endif
+ else
+ match = ((*(u16 *)&regs->cx)--) != 0;
+ } else { /* JCXZ */
+ if (p->ainsn.loop.asize == 32)
+ match = *(u32 *)(&regs->cx) == 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = *(u64 *)(&regs->cx) == 0;
+#endif
+ else
+ match = *(u16 *)(&regs->cx) == 0;
+ }
+
+ if (p->ainsn.loop.type == 0) /* LOOPNE */
+ match = match && !(regs->flags & X86_EFLAGS_ZF);
+ else if (p->ainsn.loop.type == 1) /* LOOPE */
+ match = match && (regs->flags & X86_EFLAGS_ZF);
+
+ if (match)
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_loop);
+
+static const int addrmode_regoffs[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+};
+
+static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size);
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
+
+static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);
+
+static int prepare_emulation(struct kprobe *p, struct insn *insn)
+{
+ insn_byte_t opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0x9c: /* pushfl */
+ case 0x9d: /* popf/popfd */
+		/*
+		 * IF-modifying instructions must be emulated, since
+		 * single-stepping them over int3 could enable interrupts
+		 * during the step.
+		 */
+ p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
+ p->ainsn.opcode = opcode;
+ break;
+ case 0xc2: /* ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ p->ainsn.emulate_op = kprobe_emulate_ret;
+ break;
+ case 0x9a: /* far call absolute -- segment is not supported */
+ case 0xea: /* far jmp absolute -- segment is not supported */
+ case 0xcc: /* int3 */
+ case 0xcf: /* iret -- in-kernel IRET is not supported */
+ return -EOPNOTSUPP;
+ case 0xe8: /* near call relative */
+ p->ainsn.emulate_op = kprobe_emulate_call;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ break;
+ case 0xeb: /* short jump relative */
+ case 0xe9: /* near jump relative */
+ p->ainsn.emulate_op = kprobe_emulate_jmp;
+ if (insn->immediate.nbytes == 1)
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ else if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ break;
+ case 0x70 ... 0x7f:
+ /* 1 byte conditional jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ p->ainsn.rel32 = insn->immediate.value;
+ break;
+ case 0x0f:
+ opcode = insn->opcode.bytes[1];
+ if ((opcode & 0xf0) == 0x80) {
+ /* 2 bytes Conditional Jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ } else if (opcode == 0x01 &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
+ X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
+ /* VM extensions - not supported */
+ return -EOPNOTSUPP;
+ }
+ break;
+ case 0xe0: /* Loop NZ */
+ case 0xe1: /* Loop */
+ case 0xe2: /* Loop */
+ case 0xe3: /* J*CXZ */
+ p->ainsn.emulate_op = kprobe_emulate_loop;
+ p->ainsn.loop.type = opcode & 0x3;
+ p->ainsn.loop.asize = insn->addr_bytes * 8;
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ break;
+ case 0xff:
+		/*
+		 * Since 0xff is an extended group opcode, the actual
+		 * instruction is determined by the ModRM byte.
+		 */
+ opcode = insn->modrm.bytes[0];
+ switch (X86_MODRM_REG(opcode)) {
+ case 0b010: /* FF /2, call near, absolute indirect */
+ p->ainsn.emulate_op = kprobe_emulate_call_indirect;
+ break;
+ case 0b100: /* FF /4, jmp near, absolute indirect */
+ p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
+ break;
+ case 0b011: /* FF /3, call far, absolute indirect */
+ case 0b101: /* FF /5, jmp far, absolute indirect */
+ return -EOPNOTSUPP;
+ }
+
+ if (!p->ainsn.emulate_op)
+ break;
+
+ if (insn->addr_bytes != sizeof(unsigned long))
+ return -EOPNOTSUPP; /* Don't support different size */
+ if (X86_MODRM_MOD(opcode) != 3)
+ return -EOPNOTSUPP; /* TODO: support memory addressing */
+
+ p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
+#ifdef CONFIG_X86_64
+ if (X86_REX_B(insn->rex_prefix.value))
+ p->ainsn.indirect.reg += 8;
+#endif
+ break;
+ default:
+ break;
+ }
+ p->ainsn.size = insn->length;
+
+ return 0;
+}
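
The decoding of the 0xff group above hinges on the ModRM byte layout: mod in bits 7-6, reg in bits 5-3, rm in bits 2-0. A stand-alone sketch of the field extraction (the macro names here are local stand-ins for the kernel's X86_MODRM_* helpers):

#include <stdio.h>
#include <stdint.h>

/* ModRM byte layout: mod[7:6] reg[5:3] rm[2:0]. */
#define MODRM_MOD(b)	(((b) >> 6) & 0x3)
#define MODRM_REG(b)	(((b) >> 3) & 0x7)
#define MODRM_RM(b)	((b) & 0x7)

int main(void)
{
	uint8_t modrm = 0xd2;	/* e.g. ff d2 = call *%rdx */

	printf("mod=%d reg=%d rm=%d\n",
	       MODRM_MOD(modrm), MODRM_REG(modrm), MODRM_RM(modrm));
	/* reg == 2 selects "FF /2", the near indirect call handled above */
	return 0;
}
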
+
+static int arch_copy_kprobe(struct kprobe *p)
+{
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int ret, len;
+
+ /* Copy an instruction with recovering if other optprobe modifies it.*/
+ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
+ if (!len)
+ return -EINVAL;
+
+ /* Analyze the opcode and setup emulate functions */
+ ret = prepare_emulation(p, &insn);
+ if (ret < 0)
+ return ret;
+
+ /* Add int3 for single-step or booster jmp */
+ len = prepare_singlestep(buf, p, &insn);
+ if (len < 0)
+ return len;
+
+ /* Also, displacement change doesn't affect the first byte */
+ p->opcode = buf[0];
+
+ p->ainsn.tp_len = len;
+ perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
+ /* OK, write back the instruction(s) into ROX insn buffer */
+ text_poke(p->ainsn.insn, buf, len);
+
+ return 0;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+ int ret;
+
+ if (alternatives_text_reserved(p->addr, p->addr))
+ return -EINVAL;
+
+ if (!can_probe((unsigned long)p->addr))
+ return -EILSEQ;
+
+ memset(&p->ainsn, 0, sizeof(p->ainsn));
+
+ /* insn: must be on special executable page on x86. */
+ p->ainsn.insn = get_insn_slot();
+ if (!p->ainsn.insn)
+ return -ENOMEM;
+
+ ret = arch_copy_kprobe(p);
+ if (ret) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+
+ return ret;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ text_poke(p->addr, &int3, 1);
+ smp_text_poke_sync_each_cpu();
+ perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
+ text_poke(p->addr, &p->opcode, 1);
+ smp_text_poke_sync_each_cpu();
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+ if (p->ainsn.insn) {
+ /* Record the perf event before freeing the slot */
+ perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+ p->ainsn.tp_len, NULL, 0);
+ free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
+ p->ainsn.insn = NULL;
+ }
+}
+
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
+ kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
+}
+
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
+ kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
+}
+
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_saved_flags = kcb->kprobe_old_flags
+ = (regs->flags & X86_EFLAGS_IF);
+}
+
+static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ /* Restore back the original saved kprobes variables and continue. */
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ /* This will restore both kcb and current_kprobe */
+ restore_previous_kprobe(kcb);
+ } else {
+		/*
+		 * Always update the kcb status because
+		 * reset_current_kprobe() doesn't update kcb.
+		 */
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ if (cur->post_handler)
+ cur->post_handler(cur, regs, 0);
+ reset_current_kprobe();
+ }
+}
+NOKPROBE_SYMBOL(kprobe_post_process);
+
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
+{
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
+#if !defined(CONFIG_PREEMPTION)
+ if (p->ainsn.boostable) {
+ /* Boost up -- we can execute copied instructions directly */
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
+ regs->ip = (unsigned long)p->ainsn.insn;
+ return;
+ }
+#endif
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+
+ if (p->ainsn.emulate_op) {
+ p->ainsn.emulate_op(p, regs);
+ kprobe_post_process(p, regs, kcb);
+ return;
+ }
+
+ /* Disable interrupt, and set ip register on trampoline */
+ regs->flags &= ~X86_EFLAGS_IF;
+ regs->ip = (unsigned long)p->ainsn.insn;
+}
+NOKPROBE_SYMBOL(setup_singlestep);
+
+/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn. Instead of the trap flag, we use another
+ * "int3" placed right after the copied instruction.
+ * Unlike trap-based single-stepping, "int3" single-stepping can not
+ * handle instructions that change the ip register, e.g. jmp, call,
+ * and conditional jmp, nor instructions that change the IF flag,
+ * because interrupts must stay disabled around the single-stepping.
+ * Such instructions are emulated in software; the others are
+ * single-stepped using "int3".
+ *
+ * When the second "int3" is handled, regs->ip and regs->flags need
+ * to be adjusted so that we can resume execution at the correct code.
+ */
+static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_ip = (unsigned long)p->addr;
+
+ /* Restore saved interrupt flag and ip register */
+ regs->flags |= kcb->kprobe_saved_flags;
+	/* Note that regs->ip points past the executed int3, so step back */
+ regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
+}
+NOKPROBE_SYMBOL(resume_singlestep);
+
+/*
+ * We have reentered the kprobe_handler(), since another probe was hit while
+ * within the handler. We save the original kprobes variables and just single
+ * step on the instruction of the new probe without calling any user handlers.
+ */
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SSDONE:
+ case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
+ kprobes_inc_nmissed_count(p);
+ setup_singlestep(p, regs, kcb, 1);
+ break;
+ case KPROBE_REENTER:
+ /* A probe has been hit in the codepath leading up to, or just
+ * after, single-stepping of a probed instruction. This entire
+ * codepath should strictly reside in .kprobes.text section.
+ * Raise a BUG or we'll continue in an endless reentering loop
+ * and eventually a stack overflow.
+ */
+ pr_err("Unrecoverable kprobe detected.\n");
+ dump_kprobe(p);
+ BUG();
+ default:
+ /* impossible cases */
+ WARN_ON(1);
+ return 0;
+ }
+
+ return 1;
+}
+NOKPROBE_SYMBOL(reenter_kprobe);
+
+static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
+{
+ return (kcb->kprobe_status == KPROBE_HIT_SS ||
+ kcb->kprobe_status == KPROBE_REENTER);
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+int kprobe_int3_handler(struct pt_regs *regs)
+{
+ kprobe_opcode_t *addr;
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ if (user_mode(regs))
+ return 0;
+
+ addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
+	/*
+	 * We don't want to be preempted for the entire duration of kprobe
+	 * processing. Since int3 and the debug trap disable irqs and we
+	 * clear IF while single-stepping, this code must not be preemptible.
+	 */
+
+ kcb = get_kprobe_ctlblk();
+ p = get_kprobe(addr);
+
+ if (p) {
+ if (kprobe_running()) {
+ if (reenter_kprobe(p, regs, kcb))
+ return 1;
+ } else {
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ /*
+ * If we have no pre-handler or it returned 0, we
+ * continue with normal processing. If we have a
+ * pre-handler and it returned non-zero, the user
+ * handler set up the registers to exit to another
+ * instruction, so we must skip the single-stepping.
+ */
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ setup_singlestep(p, regs, kcb, 0);
+ else
+ reset_current_kprobe();
+ return 1;
+ }
+ } else if (kprobe_is_ss(kcb)) {
+ p = kprobe_running();
+ if ((unsigned long)p->ainsn.insn < regs->ip &&
+ (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
+ /* Most probably this is the second int3 of the single-step */
+ resume_singlestep(p, regs, kcb);
+ kprobe_post_process(p, regs, kcb);
+ return 1;
+ }
+ } /* else: not a kprobe fault; let the kernel handle it */
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
+
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
+ /* This must happen on single-stepping */
+ WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
+ kcb->kprobe_status != KPROBE_REENTER);
+ /*
+ * We are here because the instruction being single
+ * stepped caused a page fault. We reset the current
+ * kprobe and the ip points back to the probe address
+ * and allow the page fault handler to continue as a
+ * normal page fault.
+ */
+ regs->ip = (unsigned long)cur->addr;
+
+ /*
+ * If the IF flag was set before the kprobe hit,
+ * don't touch it:
+ */
+ regs->flags |= kcb->kprobe_old_flags;
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
+
+int __init arch_populate_kprobe_blacklist(void)
+{
+ return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
+ (unsigned long)__entry_text_end);
+}
+
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
+
+int arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 000000000000..2be55ec3f392
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+#include <asm/text-patching.h>
+
+#include "common.h"
+
+/* Ftrace callback handler for kprobes -- called with preemption disabled */
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ int bit;
+
+ if (unlikely(kprobe_ftrace_disabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto out;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ unsigned long orig_ip = instruction_pointer(regs);
+
+ /* Kprobe handlers expect regs->ip = ip + 1, as if a breakpoint had been hit */
+ instruction_pointer_set(regs, ip + INT3_INSN_SIZE);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ if (unlikely(p->post_handler)) {
+ /*
+ * Emulate single-stepping (and also recover regs->ip)
+ * as if there were a 5-byte NOP
+ */
+ instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ /* Recover IP address */
+ instruction_pointer_set(regs, orig_ip);
+ }
+ /*
+ * If the pre_handler returned non-zero, it changed regs->ip, so
+ * we have to skip emulating the post_handler.
+ */
+ __this_cpu_write(current_kprobe, NULL);
+ }
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = false;
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
new file mode 100644
index 000000000000..6f826a00eca2
--- /dev/null
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
+#include <linux/ftrace.h>
+#include <linux/objtool.h>
+#include <linux/pgtable.h>
+#include <linux/static_call.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/set_memory.h>
+#include <asm/sections.h>
+#include <asm/nospec-branch.h>
+
+#include "common.h"
+
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct optimized_kprobe *op;
+ struct kprobe *kp;
+ long offs;
+ int i;
+
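+ /*
+ * Scan back over a JMP32-sized window: addr may point into the middle
+ * of the 5-byte jump that an optprobe wrote over the original
+ * instructions, in which case the covering kprobe starts a few bytes
+ * earlier.
+ */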
+ for (i = 0; i < JMP32_INSN_SIZE; i++) {
+ kp = get_kprobe((void *)addr - i);
+ /* This function only handles jump-optimized kprobes */
+ if (kp && kprobe_optimized(kp)) {
+ op = container_of(kp, struct optimized_kprobe, kp);
+ /* If op is optimized or under unoptimizing */
+ if (list_empty(&op->list) || optprobe_queued_unopt(op))
+ goto found;
+ }
+ }
+
+ return addr;
+found:
+ /*
+ * If the kprobe is optimized, its original bytes may have been
+ * overwritten by the jump destination address. In this case, the
+ * original bytes must be recovered from the op->optinsn.copied_insn
+ * buffer.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (addr == (unsigned long)kp->addr) {
+ buf[0] = kp->opcode;
+ memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
+ } else {
+ offs = addr - (unsigned long)kp->addr - 1;
+ memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
+ }
+
+ return (unsigned long)buf;
+}
+
+static void synthesize_clac(kprobe_opcode_t *addr)
+{
+ /*
+ * Can't be static_cpu_has() due to how objtool treats this feature bit.
+ * This isn't a fast path anyway.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return;
+
+ /* Replace the NOP3 with CLAC */
+ addr[0] = 0x0f;
+ addr[1] = 0x01;
+ addr[2] = 0xca;
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
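+ /* movabs $val, %rdi: REX.W prefix (0x48) + opcode 0xbf, imm64 follows */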
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
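+ /* mov $val, %eax: opcode 0xb8, imm32 follows */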
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+ " pushq $" __stringify(__KERNEL_DS) "\n"
+ /* Save the 'sp - 8', this will be fixed later. */
+ " pushq %rsp\n"
+ " pushfq\n"
+ ".global optprobe_template_clac\n"
+ "optprobe_template_clac:\n"
+ ASM_NOP3
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Copy 'regs->flags' into 'regs->ss'. */
+ " movq 18*8(%rsp), %rdx\n"
+ " movq %rdx, 20*8(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip 'regs->flags' and 'regs->sp'. */
+ " addq $16, %rsp\n"
+ /* And pop flags register from 'regs->ss'. */
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushl %ss\n"
+ /* Save the 'sp - 4', this will be fixed later. */
+ " pushl %esp\n"
+ " pushfl\n"
+ ".global optprobe_template_clac\n"
+ "optprobe_template_clac:\n"
+ ASM_NOP3
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Copy 'regs->flags' into 'regs->ss'. */
+ " movl 14*4(%esp), %edx\n"
+ " movl %edx, 16*4(%esp)\n"
+ RESTORE_REGS_STRING
+ /* Skip 'regs->flags' and 'regs->sp'. */
+ " addl $8, %esp\n"
+ /* And pop flags register from 'regs->ss'. */
+ " popfl\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end:\n"
+ ".popsection\n");
+
+#define TMPL_CLAC_IDX \
+ ((long)optprobe_template_clac - (long)optprobe_template_entry)
+#define TMPL_MOVE_IDX \
+ ((long)optprobe_template_val - (long)optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)optprobe_template_call - (long)optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)optprobe_template_end - (long)optprobe_template_entry)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+ /* This is possible if op is under delayed unoptimization */
+ if (kprobe_disabled(&op->kp))
+ return;
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* Adjust stack pointer */
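+ /* (undo the 'sp - sizeof(long)' saved by optprobe_template_entry) */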
+ regs->sp += sizeof(long);
+ /* Save skipped registers */
+ regs->cs = __KERNEL_CS;
+#ifdef CONFIG_X86_32
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __this_cpu_write(current_kprobe, &op->kp);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
+{
+ struct insn insn;
+ int len = 0, ret;
+
+ while (len < JMP32_INSN_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, real + len, &insn);
+ if (!ret || !can_boost(&insn, src + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1) ||
+ static_call_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int insn_is_indirect_jump(struct insn *insn)
+{
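+ /* Opcode 0xff with ModRM reg field 4 (near) or 5 (far); '& 6' matches both. */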
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/* Decode the whole function to ensure no instruction jumps into the target */
+static int can_optimize(unsigned long paddr)
+{
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+ return 0;
+
+ /*
+ * Do not optimize in the entry code due to the unstable
+ * stack handling and register setup.
+ */
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < JMP32_INSN_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ unsigned long recovered_insn;
+ int ret;
+
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jump into this function,
+ * we can't optimize kprobes in this function.
+ */
+ return 0;
+ recovered_insn = recover_probed_instruction(buf, addr);
+ if (!recovered_insn)
+ return 0;
+
+ ret = insn_decode_kernel(&insn, (void *)recovered_insn);
+ if (ret < 0)
+ return 0;
+#ifdef CONFIG_KGDB
+ /*
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
+ */
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return 0;
+#endif
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /*
+ * Check that no instructions jump into the target, indirectly
+ * or directly.
+ *
+ * The indirect case is present to handle code with jump
+ * tables. When the kernel uses retpolines, the check should in
+ * theory additionally look for jumps to indirect thunks.
+ * However, the kernel built with retpolines or IBT has jump
+ * tables disabled so the check can be skipped altogether.
+ */
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
+ insn_is_indirect_jump(&insn))
+ return 0;
+ if (insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ DISP32_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check whether the optimized_kprobe can actually be optimized. */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disarmed(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check whether addr is within the optimized instructions. */
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ kprobe_opcode_t *addr)
+{
+ return (op->kp.addr <= addr &&
+ op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ u8 *slot = op->optinsn.insn;
+ if (slot) {
+ int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+ /* Record the perf event before freeing the slot */
+ if (dirty)
+ perf_event_text_poke(slot, slot, len, NULL, 0);
+
+ free_optinsn_slot(slot, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy the instructions that will be replaced by the jump.
+ * Target instructions MUST be relocatable (checked inside).
+ * This is called when a new aggr(opt)probe is allocated or reused.
+ */
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+ struct kprobe *__unused)
+{
+ u8 *buf = NULL, *slot;
+ int ret, len;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ op->optinsn.insn = slot = get_optinsn_slot();
+ if (!slot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Verify that the address gap is within the 2GB range, because
+ * this uses a relative jump.
+ */
+ rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
+ if (abs(rel) > 0x7fffffff) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, optprobe_template_entry, TMPL_END_IDX);
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
+ slot + TMPL_END_IDX);
+ if (ret < 0)
+ goto err;
+ op->optinsn.size = ret;
+ len = TMPL_END_IDX + op->optinsn.size;
+
+ synthesize_clac(buf + TMPL_CLAC_IDX);
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX,
+ slot + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + len, slot + len,
+ (u8 *)op->kp.addr + op->optinsn.size);
+ len += JMP32_INSN_SIZE;
+
+ /*
+ * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+ * used in __arch_remove_optimized_kprobe().
+ */
+
+ /* We have to use text_poke() for the instruction buffer because it is RO */
+ perf_event_text_poke(slot, NULL, 0, buf, len);
+ text_poke(slot, buf, len);
+
+ ret = 0;
+out:
+ kfree(buf);
+ return ret;
+
+err:
+ __arch_remove_optimized_kprobe(op, 0);
+ goto out;
+}
+
+/*
+ * Replace breakpoints (INT3) with relative jumps (JMP.d32).
+ * Caller must hold kprobe_mutex and text_mutex.
+ *
+ * The caller will have installed a regular kprobe and after that issued
+ * synchronize_rcu_tasks(); this ensures that the instruction(s) that live in
+ * the 4 bytes after the INT3 are unused and can now be overwritten.
+ */
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ u8 insn_buff[JMP32_INSN_SIZE];
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + JMP32_INSN_SIZE));
+
+ WARN_ON(kprobe_disabled(&op->kp));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
+ DISP32_SIZE);
+
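+ /* Assemble "jmp.d32": opcode 0xe9 followed by the rel32 displacement. */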
+ insn_buff[0] = JMP32_INSN_OPCODE;
+ *(s32 *)(&insn_buff[1]) = rel;
+
+ smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
+
+ list_del_init(&op->list);
+ }
+}
+
+/*
+ * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
+ *
+ * After that, we can restore the 4 bytes after the INT3 to undo what
+ * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
+ * unused once the INT3 lands.
+ */
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+ u8 old[JMP32_INSN_SIZE];
+ u8 *addr = op->kp.addr;
+
+ memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+ memcpy(new + INT3_INSN_SIZE,
+ op->optinsn.copied_insn,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
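+ /*
+ * Two-stage poke: first land the INT3 byte and serialize all CPUs on
+ * it, then restore the remaining four displacement bytes and serialize
+ * again, so that no CPU can observe a half-written instruction.
+ */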
+ text_poke(addr, new, INT3_INSN_SIZE);
+ smp_text_poke_sync_each_cpu();
+ text_poke(addr + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+ smp_text_poke_sync_each_cpu();
+
+ perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must hold kprobe_mutex.
+ */
+void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ arch_unoptimize_kprobe(op);
+ list_move(&op->list, done_list);
+ }
+}
+
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ return 1;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000000..d547de9b3ed8
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013 Red Hat, Inc.
+ * Dave Young <dyoung@redhat.com>
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+
+#include <asm/setup.h>
+
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+ const struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, (void *)&boot_params + off, count);
+ return count;
+}
+
+static const struct bin_attribute boot_params_data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = boot_params_data_read,
+ .size = sizeof(boot_params),
+};
+
+static struct attribute *boot_params_version_attrs[] = {
+ &boot_params_version_attr.attr,
+ NULL,
+};
+
+static const struct bin_attribute *const boot_params_data_attrs[] = {
+ &boot_params_data_attr,
+ NULL,
+};
+
+static const struct attribute_group boot_params_attr_group = {
+ .attrs = boot_params_version_attrs,
+ .bin_attrs = boot_params_data_attrs,
+};
+
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+ const char *name;
+
+ name = kobject_name(kobj);
+ return kstrtoint(name, 10, nr);
+}
+
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ if (nr == i) {
+ *paddr = pa_data;
+ return 0;
+ }
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ pa_data = data->next;
+ memunmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+ u64 pa_data = boot_params.hdr.setup_data, pa_next;
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ int i = 0;
+ u32 len;
+
+ while (pa_data) {
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+ pa_next = data->next;
+
+ if (nr == i) {
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(pa_data, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT)
+ *size = indirect->len;
+ else
+ *size = data->len;
+ } else {
+ *size = data->len;
+ }
+
+ memunmap(data);
+ return 0;
+ }
+
+ pa_data = pa_next;
+ memunmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ int nr, ret;
+ u64 paddr;
+ u32 len;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(paddr, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ ret = sprintf(buf, "0x%x\n", indirect->type);
+ } else {
+ ret = sprintf(buf, "0x%x\n", data->type);
+ }
+
+ memunmap(data);
+ return ret;
+}
+
+static ssize_t setup_data_data_read(struct file *fp,
+ struct kobject *kobj,
+ const struct bin_attribute *bin_attr,
+ char *buf,
+ loff_t off, size_t count)
+{
+ struct setup_indirect *indirect;
+ struct setup_data *data;
+ int nr, ret = 0;
+ u64 paddr, len;
+ void *p;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (data->type == SETUP_INDIRECT) {
+ len = sizeof(*data) + data->len;
+ memunmap(data);
+ data = memremap(paddr, len, MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ } else {
+ /*
+ * Even though this is technically undefined, return
+ * the data as though it is a normal setup_data struct.
+ * This will at least allow it to be inspected.
+ */
+ paddr += sizeof(*data);
+ len = data->len;
+ }
+ } else {
+ paddr += sizeof(*data);
+ len = data->len;
+ }
+
+ if (off > len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (count > len - off)
+ count = len - off;
+
+ if (!count)
+ goto out;
+
+ ret = count;
+ p = memremap(paddr, len, MEMREMAP_WB);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(buf, p + off, count);
+ memunmap(p);
+out:
+ memunmap(data);
+ return ret;
+}
+
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+static struct bin_attribute data_attr __ro_after_init = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = setup_data_data_read,
+};
+
+static struct attribute *setup_data_type_attrs[] = {
+ &type_attr.attr,
+ NULL,
+};
+
+static const struct bin_attribute *const setup_data_data_attrs[] = {
+ &data_attr,
+ NULL,
+};
+
+static const struct attribute_group setup_data_attr_group = {
+ .attrs = setup_data_type_attrs,
+ .bin_attrs = setup_data_data_attrs,
+};
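+
+/*
+ * Each node is exposed as /sys/kernel/boot_params/setup_data/<nr>/ with a
+ * "type" attribute and a raw, read-only "data" blob.
+ */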
+
+static int __init create_setup_data_node(struct kobject *parent,
+ struct kobject **kobjp, int nr)
+{
+ int ret = 0;
+ size_t size;
+ struct kobject *kobj;
+ char name[16]; /* should be enough for setup_data node numbers */
+
+ snprintf(name, 16, "%d", nr);
+
+ kobj = kobject_create_and_add(name, parent);
+ if (!kobj)
+ return -ENOMEM;
+
+ ret = get_setup_data_size(nr, &size);
+ if (ret)
+ goto out_kobj;
+
+ data_attr.size = size;
+ ret = sysfs_create_group(kobj, &setup_data_attr_group);
+ if (ret)
+ goto out_kobj;
+ *kobjp = kobj;
+
+ return 0;
+out_kobj:
+ kobject_put(kobj);
+ return ret;
+}
+
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+ sysfs_remove_group(kobj, &setup_data_attr_group);
+ kobject_put(kobj);
+}
+
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+ int ret = 0;
+ struct setup_data *data;
+
+ *nr = 0;
+ while (pa_data) {
+ *nr += 1;
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pa_data = data->next;
+ memunmap(data);
+ }
+
+out:
+ return ret;
+}
+
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+ struct kobject *setup_data_kobj, **kobjp;
+ u64 pa_data;
+ int i, j, nr, ret = 0;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return 0;
+
+ setup_data_kobj = kobject_create_and_add("setup_data", parent);
+ if (!setup_data_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_setup_data_total_num(pa_data, &nr);
+ if (ret)
+ goto out_setup_data_kobj;
+
+ kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL);
+ if (!kobjp) {
+ ret = -ENOMEM;
+ goto out_setup_data_kobj;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+ if (ret)
+ goto out_clean_nodes;
+ }
+
+ kfree(kobjp);
+ return 0;
+
+out_clean_nodes:
+ for (j = i - 1; j >= 0; j--)
+ cleanup_setup_data_node(*(kobjp + j));
+ kfree(kobjp);
+out_setup_data_kobj:
+ kobject_put(setup_data_kobj);
+out:
+ return ret;
+}
+
+static int __init boot_params_ksysfs_init(void)
+{
+ int ret;
+ struct kobject *boot_params_kobj;
+
+ boot_params_kobj = kobject_create_and_add("boot_params",
+ kernel_kobj);
+ if (!boot_params_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+ if (ret)
+ goto out_boot_params_kobj;
+
+ ret = create_setup_data_nodes(boot_params_kobj);
+ if (ret)
+ goto out_create_group;
+
+ return 0;
+out_create_group:
+ sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+ kobject_put(boot_params_kobj);
+out:
+ return ret;
+}
+
+arch_initcall(boot_params_ksysfs_init);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..df78ddee0abb 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -1,48 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* KVM paravirt_ops implementation
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
* Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright IBM Corporation, 2007
* Authors: Anthony Liguori <aliguori@us.ibm.com>
*/
-#include <linux/module.h>
+#define pr_fmt(fmt) "kvm-guest: " fmt
+
+#include <linux/context_tracking.h>
+#include <linux/init.h>
+#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/swait.h>
+#include <linux/syscore_ops.h>
+#include <linux/cc_platform.h>
+#include <linux/efi.h>
+#include <linux/kvm_types.h>
#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
+#include <asm/mtrr.h>
+#include <asm/tlb.h>
+#include <asm/cpuidle_haltpoll.h>
+#include <asm/msr.h>
+#include <asm/ptrace.h>
+#include <asm/reboot.h>
+#include <asm/svm.h>
+#include <asm/e820/api.h>
-#define MMU_QUEUE_SIZE 1024
+DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
-struct kvm_para_state {
- u8 mmu_queue[MMU_QUEUE_SIZE];
- int mmu_queue_len;
-};
+static int kvmapf = 1;
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static int __init parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
-static struct kvm_para_state *kvm_para_state(void)
+static int steal_acc = 1;
+static int __init parse_no_stealacc(char *arg)
{
- return &per_cpu(para_state, raw_smp_processor_id());
+ steal_acc = 0;
+ return 0;
}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
+static int has_steal_clock = 0;
+
+static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@@ -50,191 +81,1099 @@ static void kvm_io_delay(void)
{
}
-static void kvm_mmu_op(void *buffer, unsigned len)
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ struct swait_queue_head wq;
+ u32 token;
+ int cpu;
+};
+
+static struct kvm_task_sleep_head {
+ raw_spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
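+
+/* Tokens hash into one of KVM_TASK_SLEEP_HASHSIZE (256) buckets; each bucket is a raw-spinlock-protected hlist. */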
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u32 token)
{
- int r;
- unsigned long a1, a2;
+ struct hlist_node *p;
- do {
- a1 = __pa(buffer);
- a2 = 0; /* on i386 __pa() always returns <4G */
- r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
- buffer += r;
- len -= r;
- } while (len);
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
}
-static void mmu_queue_flush(struct kvm_para_state *state)
+static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
- if (state->mmu_queue_len) {
- kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
- state->mmu_queue_len = 0;
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *e;
+
+ raw_spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exists -> wakeup was delivered ahead of the #PF */
+ hlist_del(&e->link);
+ raw_spin_unlock(&b->lock);
+ kfree(e);
+ return false;
}
+
+ n->token = token;
+ n->cpu = smp_processor_id();
+ init_swait_queue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ raw_spin_unlock(&b->lock);
+ return true;
}
-static void kvm_deferred_mmu_op(void *buffer, int len)
+/*
+ * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
+ * @token: Token to identify the sleep node entry
+ *
+ * Invoked from the async pagefault handling code or from the VM exit page
+ * fault handler. In both cases RCU is watching.
+ */
+void kvm_async_pf_task_wait_schedule(u32 token)
{
- struct kvm_para_state *state = kvm_para_state();
+ struct kvm_task_sleep_node n;
+ DECLARE_SWAITQUEUE(wait);
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
- kvm_mmu_op(buffer, len);
+ lockdep_assert_irqs_disabled();
+
+ if (!kvm_async_pf_queue_task(token, &n))
return;
+
+ for (;;) {
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
}
- if (state->mmu_queue_len + len > sizeof state->mmu_queue)
- mmu_queue_flush(state);
- memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
- state->mmu_queue_len += len;
+ finish_swait(&n.wq, &wait);
}
+EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
-static void kvm_mmu_write(void *dest, u64 val)
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
- __u64 pte_phys;
- struct kvm_mmu_op_write_pte wpte;
+ hlist_del_init(&n->link);
+ if (swq_has_sleeper(&n->wq))
+ swake_up_one(&n->wq);
+}
-#ifdef CONFIG_HIGHPTE
- struct page *page;
- unsigned long dst = (unsigned long) dest;
+static void apf_task_wake_all(void)
+{
+ int i;
- page = kmap_atomic_to_page(dest);
- pte_phys = page_to_pfn(page);
- pte_phys <<= PAGE_SHIFT;
- pte_phys += (dst & ~(PAGE_MASK));
-#else
- pte_phys = (unsigned long)__pa(dest);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ struct kvm_task_sleep_node *n;
+ struct hlist_node *p, *next;
+
+ raw_spin_lock(&b->lock);
+ hlist_for_each_safe(p, next, &b->list) {
+ n = hlist_entry(p, typeof(*n), link);
+ if (n->cpu == smp_processor_id())
+ apf_task_wake_one(n);
+ }
+ raw_spin_unlock(&b->lock);
+ }
+}
+
+static void kvm_async_pf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n, *dummy = NULL;
+
+ if (token == ~0) {
+ apf_task_wake_all();
+ return;
+ }
+
+again:
+ raw_spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * Async #PF not yet handled, add a dummy entry for the token.
+ * Allocating the dummy entry must be done outside of the raw lock
+ * as the allocator is preemptible on PREEMPT_RT kernels.
+ */
+ if (!dummy) {
+ raw_spin_unlock(&b->lock);
+ dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
+
+ /*
+ * Continue looping on allocation failure, eventually
+ * the async #PF will be handled and allocating a new
+ * node will be unnecessary.
+ */
+ if (!dummy)
+ cpu_relax();
+
+ /*
+ * Recheck for async #PF completion before enqueueing
+ * the dummy token to avoid duplicate list entries.
+ */
+ goto again;
+ }
+ dummy->token = token;
+ dummy->cpu = smp_processor_id();
+ init_swait_queue_head(&dummy->wq);
+ hlist_add_head(&dummy->link, &b->list);
+ dummy = NULL;
+ } else {
+ apf_task_wake_one(n);
+ }
+ raw_spin_unlock(&b->lock);
+
+ /* A dummy token might be allocated and ultimately not used. */
+ kfree(dummy);
+}
+
+noinstr u32 kvm_read_and_reset_apf_flags(void)
+{
+ u32 flags = 0;
+
+ if (__this_cpu_read(async_pf_enabled)) {
+ flags = __this_cpu_read(apf_reason.flags);
+ __this_cpu_write(apf_reason.flags, 0);
+ }
+
+ return flags;
+}
+EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
+
+noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
+{
+ u32 flags = kvm_read_and_reset_apf_flags();
+ irqentry_state_t state;
+
+ if (!flags)
+ return false;
+
+ state = irqentry_enter(regs);
+ instrumentation_begin();
+
+ /*
+ * If the host managed to inject an async #PF into an interrupt
+ * disabled region, then die hard as this is not going to end well
+ * and the host side is seriously broken.
+ */
+ if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
+ panic("Host injected async #PF in interrupt disabled region\n");
+
+ if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ if (unlikely(!(user_mode(regs))))
+ panic("Host injected async #PF in kernel mode\n");
+ /* Page is swapped out by the host. */
+ kvm_async_pf_task_wait_schedule(token);
+ } else {
+ WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
+ }
+
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ return true;
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ u32 token;
+
+ apic_eoi();
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (__this_cpu_read(async_pf_enabled)) {
+ token = __this_cpu_read(apf_reason.token);
+ kvm_async_pf_task_wake(token);
+ __this_cpu_write(apf_reason.token, 0);
+ wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1);
+ }
+
+ set_irq_regs(old_regs);
+}
+
+static void __init paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_ops.cpu.io_delay = kvm_io_delay;
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
#endif
- wpte.header.op = KVM_MMU_OP_WRITE_PTE;
- wpte.pte_val = val;
- wpte.pte_phys = pte_phys;
+}
+
+static void kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
+ (unsigned long long) slow_virt_to_phys(st));
+}
+
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+
+static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
+{
+ /*
+ * This relies on __test_and_clear_bit to modify the memory
+ * in a way that is atomic with respect to the local CPU.
+ * The hypervisor only accesses this memory from the local CPU, so
+ * there's no need for locks or memory barriers.
+ * An optimization barrier is implied in the apic write.
+ */
+ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
+ return;
+ apic_native_eoi();
+}
- kvm_deferred_mmu_op(&wpte, sizeof wpte);
+static void kvm_guest_cpu_init(void)
+{
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ u64 pa;
+
+ WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
+
+ pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
+ pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+ wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
+
+ wrmsrq(MSR_KVM_ASYNC_PF_EN, pa);
+ __this_cpu_write(async_pf_enabled, true);
+ pr_debug("setup async PF for cpu %d\n", smp_processor_id());
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
+ unsigned long pa;
+
+ /* Size alignment is implied but just to make it explicit. */
+ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
+ __this_cpu_write(kvm_apic_eoi, 0);
+ pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
+ wrmsrq(MSR_KVM_PV_EOI_EN, pa);
+ }
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
+}
+
+static void kvm_pv_disable_apf(void)
+{
+ if (!__this_cpu_read(async_pf_enabled))
+ return;
+
+ wrmsrq(MSR_KVM_ASYNC_PF_EN, 0);
+ __this_cpu_write(async_pf_enabled, false);
+
+ pr_debug("disable async PF for cpu %d\n", smp_processor_id());
+}
+
+static void kvm_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ wrmsrq(MSR_KVM_STEAL_TIME, 0);
+}
+
+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
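+ /*
+ * Seqcount-like retry loop: the host increments 'version' before and
+ * after updating the record, so an odd value, or a change across the
+ * two reads, means we raced with an update and must retry.
+ */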
+ do {
+ version = src->version;
+ virt_rmb();
+ steal = src->steal;
+ virt_rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
+static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
}
/*
- * We only need to hook operations that are MMU writes. We hook these so that
- * we can use lazy MMU mode to batch these operations. We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
+ * Iterate through all possible CPUs and map the memory regions pointed
+ * to by apf_reason, steal_time and kvm_apic_eoi as decrypted, all at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that hotplugged
+ * CPUs will have their per-CPU variables already mapped as decrypted.
*/
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
+static void __init sev_map_percpu_data(void)
{
- kvm_mmu_write(ptep, pte_val(pte));
+ int cpu;
+
+ if (cc_vendor != CC_VENDOR_AMD ||
+ !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
}
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static void kvm_guest_cpu_offline(bool shutdown)
{
- kvm_mmu_write(ptep, pte_val(pte));
+ kvm_disable_steal_time();
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrq(MSR_KVM_PV_EOI_EN, 0);
+ if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);
+ kvm_pv_disable_apf();
+ if (!shutdown)
+ apf_task_wake_all();
+ kvmclock_disable();
}
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static int kvm_cpu_online(unsigned int cpu)
{
- kvm_mmu_write(pmdp, pmd_val(pmd));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_init();
+ local_irq_restore(flags);
+ return 0;
}
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
+static bool pv_tlb_flush_supported(void)
{
- kvm_mmu_write(ptep, pte_val(pte));
+ return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
+ !boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (num_possible_cpus() != 1));
}
-static void kvm_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static bool pv_ipi_supported(void)
{
- kvm_mmu_write(ptep, 0);
+ return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
+ (num_possible_cpus() != 1));
}
-static void kvm_pmd_clear(pmd_t *pmdp)
+static bool pv_sched_yield_supported(void)
{
- kvm_mmu_write(pmdp, 0);
+ return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
+ !boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (num_possible_cpus() != 1));
}
+
+#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
+
+static void __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ unsigned long flags;
+ int cpu, min = 0, max = 0;
+#ifdef CONFIG_X86_64
+ __uint128_t ipi_bitmap = 0;
+#else
+ u64 ipi_bitmap = 0;
#endif
+ u32 apic_id, icr;
+ long ret;
+
+ if (cpumask_empty(mask))
+ return;
+
+ local_irq_save(flags);
+
+ switch (vector) {
+ default:
+ icr = APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr = APIC_DM_NMI;
+ break;
+ }
+
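+ /*
+ * Pack APIC IDs into a sliding window bitmap of KVM_IPI_CLUSTER_SIZE
+ * bits starting at 'min'. When an ID falls outside the current window,
+ * flush the accumulated bitmap with a KVM_HC_SEND_IPI hypercall and
+ * start a new window at that ID.
+ */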
+ for_each_cpu(cpu, mask) {
+ apic_id = per_cpu(x86_cpu_to_apicid, cpu);
+ if (!ipi_bitmap) {
+ min = max = apic_id;
+ } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
+ ipi_bitmap <<= min - apic_id;
+ min = apic_id;
+ } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+ max = apic_id < max ? max : apic_id;
+ } else {
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
+ min = max = apic_id;
+ ipi_bitmap = 0;
+ }
+ __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
+ }
+
+ if (ipi_bitmap) {
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
+ ret);
+ }
+
+ local_irq_restore(flags);
+}
+
+static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ __send_ipi_mask(mask, vector);
+}
+
+static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+ const struct cpumask *local_mask;
+
+ cpumask_copy(new_mask, mask);
+ cpumask_clear_cpu(this_cpu, new_mask);
+ local_mask = new_mask;
+ __send_ipi_mask(local_mask, vector);
+}
+
+static int __init setup_efi_kvm_sev_migration(void)
+{
+ efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
+ efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
+ efi_status_t status;
+ unsigned long size;
+ bool enabled;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
+ !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
+ return 0;
+
+ if (!efi_enabled(EFI_BOOT))
+ return 0;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
+ pr_info("%s : EFI runtime services are not enabled\n", __func__);
+ return 0;
+ }
+
+ size = sizeof(enabled);
+
+ /* Get variable contents into buffer */
+ status = efi.get_variable(efi_sev_live_migration_enabled,
+ &efi_variable_guid, NULL, &size, &enabled);
+
+ if (status == EFI_NOT_FOUND) {
+ pr_info("%s : EFI live migration variable not found\n", __func__);
+ return 0;
+ }
+
+ if (status != EFI_SUCCESS) {
+ pr_info("%s : EFI variable retrieval failed\n", __func__);
+ return 0;
+ }
+
+ if (enabled == 0) {
+ pr_info("%s: live migration disabled in EFI\n", __func__);
+ return 0;
+ }
+
+ pr_info("%s : live migration enabled in EFI\n", __func__);
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
+
+ return 1;
+}
+
+late_initcall(setup_efi_kvm_sev_migration);
+
+/*
+ * Set the IPI entry points
+ */
+static __init void kvm_setup_pv_ipi(void)
+{
+ apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
+ apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
+ pr_info("setup PV IPIs\n");
+}
+
+static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
+{
+ int cpu;
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
+ native_send_call_func_ipi(mask);
+
+ /* Make sure other vCPUs get a chance to run if they need to. */
+ for_each_cpu(cpu, mask) {
+ if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
+ kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
+ break;
+ }
+ }
+}
+
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs, and
+ * queue flush_on_enter for preempted vCPUs.
+ */
+ for_each_cpu(cpu, flushmask) {
+ /*
+ * The local vCPU is never preempted, so we do not explicitly
+ * skip check for local vCPU - it will never be cleared from
+ * flushmask.
+ */
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_multi(flushmask, info);
+}
+
+static __init int kvm_alloc_cpumask(void)
+{
+ int cpu;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported() || pv_ipi_supported())
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+
+ return 0;
+}
+arch_initcall(kvm_alloc_cpumask);
+
+static void __init kvm_smp_prepare_boot_cpu(void)
{
- kvm_mmu_write(pudp, pud_val(pud));
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+ kvm_spinlock_init();
}
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+static int kvm_cpu_down_prepare(unsigned int cpu)
{
- kvm_mmu_write(pgdp, pgd_val(pgd));
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kvm_guest_cpu_offline(false);
+ local_irq_restore(flags);
+ return 0;
}
+
#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-static void kvm_flush_tlb(void)
+static int kvm_suspend(void *data)
{
- struct kvm_mmu_op_flush_tlb ftlb = {
- .header.op = KVM_MMU_OP_FLUSH_TLB,
- };
+ u64 val = 0;
- kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
+ kvm_guest_cpu_offline(false);
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ rdmsrq(MSR_KVM_POLL_CONTROL, val);
+ has_guest_poll = !(val & 1);
+#endif
+ return 0;
}
-static void kvm_release_pt(unsigned long pfn)
+static void kvm_resume(void *data)
{
- struct kvm_mmu_op_release_pt rpt = {
- .header.op = KVM_MMU_OP_RELEASE_PT,
- .pt_phys = (u64)pfn << PAGE_SHIFT,
- };
+ kvm_cpu_online(raw_smp_processor_id());
- kvm_mmu_op(&rpt, sizeof rpt);
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+ wrmsrq(MSR_KVM_POLL_CONTROL, 0);
+#endif
}
-static void kvm_enter_lazy_mmu(void)
+static const struct syscore_ops kvm_syscore_ops = {
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct syscore kvm_syscore = {
+ .ops = &kvm_syscore_ops,
+};
+
+static void kvm_pv_guest_cpu_reboot(void *unused)
{
- paravirt_enter_lazy_mmu();
+ kvm_guest_cpu_offline(true);
}
-static void kvm_leave_lazy_mmu(void)
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
{
- struct kvm_para_state *state = kvm_para_state();
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
- mmu_queue_flush(state);
- paravirt_leave_lazy_mmu();
+/*
+ * After a PV feature is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shut down, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel,
+ * this means a random memory location will keep being written to.
+ */
+#ifdef CONFIG_CRASH_DUMP
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ kvm_guest_cpu_offline(true);
+ native_machine_crash_shutdown(regs);
}
+#endif
-static void __init paravirt_ops_setup(void)
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
{
- pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
- if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
- pv_cpu_ops.io_delay = kvm_io_delay;
-
- if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
- pv_mmu_ops.set_pte = kvm_set_pte;
- pv_mmu_ops.set_pte_at = kvm_set_pte_at;
- pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.pte_clear = kvm_pte_clear;
- pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+#define PV_VCPU_PREEMPTED_ASM \
+ "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
+ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
+ "setne %al\n\t"
+
+DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
+#endif
+
+static void __init kvm_guest_init(void)
+{
+ int i;
+
+ paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ raw_spin_lock_init(&async_pf_sleepers[i].lock);
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ static_call_update(pv_steal_clock, kvm_steal_clock);
+
+ pv_ops.lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ apic_update_callback(eoi, kvm_guest_apic_eoi_write);
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ static_branch_enable(&kvm_async_pf_enabled);
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+ }
+
+#ifdef CONFIG_SMP
+ if (pv_tlb_flush_supported()) {
+ pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ if (pv_sched_yield_supported()) {
+ smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
+ pr_info("setup PV sched yield\n");
+ }
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("failed to install cpu hotplug callbacks\n");
+#else
+ sev_map_percpu_data();
+ kvm_guest_cpu_init();
#endif
- pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = kvm_set_pgd;
+
+#ifdef CONFIG_CRASH_DUMP
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
+
+ register_syscore(&kvm_syscore);
+
+ /*
+ * Hard lockup detection is enabled by default. Disable it, as guests
+ * can get false positives too easily, for example if the host is
+ * overcommitted.
+ */
+ hardlockup_detector_disable();
+}
+
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0)
+ return 0; /* So we don't blow up on old processors */
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return cpuid_base_hypervisor(KVM_SIGNATURE, 0);
+
+ return 0;
+}
+
+static inline uint32_t kvm_cpuid_base(void)
+{
+ static int kvm_cpuid_base = -1;
+
+ if (kvm_cpuid_base == -1)
+ kvm_cpuid_base = __kvm_cpuid_base();
+
+ return kvm_cpuid_base;
+}
+
+bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+unsigned int kvm_arch_para_hints(void)
+{
+ return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
+
+static uint32_t __init kvm_detect(void)
+{
+ return kvm_cpuid_base();
+}
+
+static void __init kvm_apic_init(void)
+{
+#ifdef CONFIG_SMP
+ if (pv_ipi_supported())
+ kvm_setup_pv_ipi();
#endif
- pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
- pv_mmu_ops.release_pte = kvm_release_pt;
- pv_mmu_ops.release_pmd = kvm_release_pt;
- pv_mmu_ops.release_pud = kvm_release_pt;
+}
+
+static bool __init kvm_msi_ext_dest_id(void)
+{
+ return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
+}
+
+static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
+{
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
+ KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+}
+
+static void __init kvm_init_platform(void)
+{
+ u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
+ /*
+ * Note, hardware requires variable MTRR ranges to be power-of-2 sized
+ * and naturally aligned. But when forcing guest MTRR state, Linux
+ * doesn't program the forced ranges into hardware. Don't bother doing
+ * the math to generate a technically-legal range.
+ */
+ struct mtrr_var_range pci_hole = {
+ .base_lo = tolud | X86_MEMTYPE_UC,
+ .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
+ .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
+ };
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+ kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
+ unsigned long nr_pages;
+ int i;
+
+ pv_ops.mmu.notify_page_enc_status_changed =
+ kvm_sev_hc_page_enc_status;
+
+ /*
+ * Reset the host's shared pages list for kernel-specific page
+ * encryption status settings before we load a new kernel via
+ * kexec. Reset the page encryption status during early boot
+ * instead of just before kexec to avoid SMP races during
+ * kvm_pv_guest_cpu_reboot().
+ * NOTE: We cannot reset the complete shared pages list here
+ * as we need to retain the UEFI/OVMF firmware-specific
+ * settings.
+ */
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
- pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+ kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
+ nr_pages,
+ KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
+ }
+
+ /*
+ * Ensure that the _bss_decrypted section is marked as decrypted in the
+ * shared pages list.
+ */
+ early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
+ __end_bss_decrypted - __start_bss_decrypted, 0);
+
+ /*
+ * If not booted using EFI, enable live migration support.
+ */
+ if (!efi_enabled(EFI_BOOT))
+ wrmsrq(MSR_KVM_MIGRATION_CONTROL,
+ KVM_MIGRATION_READY);
}
-#ifdef CONFIG_X86_IO_APIC
- no_timer_check = 1;
+ kvmclock_init();
+ x86_platform.apic_post_init = kvm_apic_init;
+
+ /*
+ * Set WB as the default cache mode for SEV-SNP and TDX, with a single
+ * UC range for the legacy PCI hole, e.g. so that devices that expect
+ * to get UC/WC mappings don't get surprised with WB.
+ */
+ guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
+}
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT)
+static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+}
+
+static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
+ .name = "KVM",
+ .detect = kvm_detect,
+ .type = X86_HYPER_KVM,
+ .init.guest_late_init = kvm_guest_init,
+ .init.x2apic_available = kvm_para_available,
+ .init.msi_ext_dest_id = kvm_msi_ext_dest_id,
+ .init.init_platform = kvm_init_platform,
+#if defined(CONFIG_AMD_MEM_ENCRYPT)
+ .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
#endif
+};
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+static void kvm_kick_cpu(int cpu)
+{
+ unsigned long flags = 0;
+ u32 apicid;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
-void __init kvm_guest_init(void)
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
{
- if (!kvm_para_available())
+ if (in_nmi())
return;
- paravirt_ops_setup();
+ /*
+ * Halt until it's our turn and we get kicked. Note that we do a safe
+ * halt for the IRQ-enabled case to avoid a hang when the lock info is
+ * overwritten in the IRQ spinlock slowpath and no spurious interrupt
+ * occurs to save us.
+ */
+ if (irqs_disabled()) {
+ if (READ_ONCE(*ptr) == val)
+ halt();
+ } else {
+ local_irq_disable();
+
+ /* safe_halt() will enable IRQ */
+ if (READ_ONCE(*ptr) == val)
+ safe_halt();
+ else
+ local_irq_enable();
+ }
}
+
+/*
+ * Set up pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+ /*
+ * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
+ * are available.
+ */
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
+ pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
+ goto out;
+ }
+
+ if (num_possible_cpus() == 1) {
+ pr_info("PV spinlocks disabled, single CPU\n");
+ goto out;
+ }
+
+ if (nopvspin) {
+ pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
+ goto out;
+ }
+
+ /*
+ * If the host doesn't support KVM_FEATURE_PV_UNHALT, there is still an
+ * advantage in keeping virt_spin_lock_key enabled: virt_spin_lock() is
+ * preferred over native qspinlock when the vCPU is preempted.
+ */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+ pr_info("PV spinlocks disabled, no host support\n");
+ return;
+ }
+
+ pr_info("PV spinlocks enabled\n");
+
+ __pv_init_lock_hash();
+ pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_ops.lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_ops.lock.wait = kvm_wait;
+ pv_ops.lock.kick = kvm_kick_cpu;
+
+ /*
+ * When PV spinlocks are enabled, they are preferred over
+ * virt_spin_lock(), so virt_spin_lock_key's value is meaningless.
+ * Just disable it anyway.
+ */
+out:
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+
+static void kvm_disable_host_haltpoll(void *i)
+{
+ wrmsrq(MSR_KVM_POLL_CONTROL, 0);
+}
+
+static void kvm_enable_host_haltpoll(void *i)
+{
+ wrmsrq(MSR_KVM_POLL_CONTROL, 1);
+}
+
+void arch_haltpoll_enable(unsigned int cpu)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
+ pr_err_once("host does not support poll control\n");
+ pr_err_once("host upgrade recommended\n");
+ return;
+ }
+
+ /* Enabling guest halt poll disables host halt poll */
+ smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
+
+void arch_haltpoll_disable(unsigned int cpu)
+{
+ if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ return;
+
+ /* Disabling guest halt poll re-enables host halt poll */
+ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
+#endif
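
The detection helpers above (__kvm_cpuid_base(), kvm_para_available(), kvm_arch_para_features()) all boil down to probing the hypervisor CPUID range. As a rough illustration, the same probe can be done from guest userspace; this is a minimal sketch assuming a GCC-style toolchain on an x86 guest, with 0x40000000/0x40000001 being the documented KVM CPUID leaves:

#include <stdio.h>
#include <string.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char sig[13];

	/* Hypervisor vendor leaf: the signature lives in EBX:ECX:EDX. */
	__cpuid(0x40000000, eax, ebx, ecx, edx);
	memcpy(sig + 0, &ebx, 4);
	memcpy(sig + 4, &ecx, 4);
	memcpy(sig + 8, &edx, 4);
	sig[12] = '\0';

	if (strcmp(sig, "KVMKVMKVM") != 0) {
		puts("not running on KVM");
		return 1;
	}

	/* Feature/hint words, as read by kvm_arch_para_features()/_hints(). */
	__cpuid(0x40000001, eax, ebx, ecx, edx);
	printf("KVM para features: %#x, hints: %#x\n", eax, edx);
	return 0;
}

The in-kernel variant additionally checks X86_FEATURE_HYPERVISOR first, since leaves in the 0x40000000 range are only defined when running under a hypervisor.
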
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..ca0a49eeac4a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* KVM paravirtual clock driver. A clocksource implementation
Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/clocksource.h>
@@ -22,71 +9,102 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
-
+#include <linux/hardirq.h>
+#include <linux/cpuhotplug.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/set_memory.h>
+#include <linux/cc_platform.h>
+
+#include <asm/hypervisor.h>
#include <asm/x86_init.h>
-#include <asm/reboot.h>
-
-#define KVM_SCALE 22
+#include <asm/kvmclock.h>
-static int kvmclock = 1;
-static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
-static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static int kvmclock __initdata = 1;
+static int kvmclock_vsyscall __initdata = 1;
+static int msr_kvm_system_time __ro_after_init;
+static int msr_kvm_wall_clock __ro_after_init;
+static u64 kvm_sched_clock_offset __ro_after_init;
-static int parse_no_kvmclock(char *arg)
+static int __init parse_no_kvmclock(char *arg)
{
kvmclock = 0;
return 0;
}
early_param("no-kvmclock", parse_no_kvmclock);
-/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
-static struct pvclock_wall_clock wall_clock;
+static int __init parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
+#define HVC_BOOT_ARRAY_SIZE \
+ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
+
+static struct pvclock_vsyscall_time_info
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
+static struct pvclock_vsyscall_time_info *hvclock_mem;
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
/*
 * The wallclock is the time of day when we booted. Some time may have
 * elapsed since the hypervisor wrote the data, so we try to account
 * for that with the system time.
*/
-static unsigned long kvm_get_wallclock(void)
+static void kvm_get_wallclock(struct timespec64 *now)
{
- struct pvclock_vcpu_time_info *vcpu_time;
- struct timespec ts;
- int low, high;
-
- low = (int)__pa_symbol(&wall_clock);
- high = ((u64)__pa_symbol(&wall_clock) >> 32);
-
- native_write_msr(msr_kvm_wall_clock, low, high);
-
- vcpu_time = &get_cpu_var(hv_clock);
- pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
- put_cpu_var(hv_clock);
-
- return ts.tv_sec;
+ wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
+ preempt_disable();
+ pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
+ preempt_enable();
}
-static int kvm_set_wallclock(unsigned long now)
+static int kvm_set_wallclock(const struct timespec64 *now)
{
- return -1;
+ return -ENODEV;
}
-static cycle_t kvm_clock_read(void)
+static u64 kvm_clock_read(void)
{
- struct pvclock_vcpu_time_info *src;
- cycle_t ret;
+ u64 ret;
- src = &get_cpu_var(hv_clock);
- ret = pvclock_clocksource_read(src);
- put_cpu_var(hv_clock);
+ preempt_disable_notrace();
+ ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
+ preempt_enable_notrace();
return ret;
}
-static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
return kvm_clock_read();
}
+static noinstr u64 kvm_sched_clock_read(void)
+{
+ return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable)
+ clear_sched_clock_stable();
+ kvm_sched_clock_offset = kvm_clock_read();
+ paravirt_set_sched_clock(kvm_sched_clock_read);
+
+ pr_info("kvm-clock: using sched offset of %llu cycles",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
/*
* If we don't do that, there is the possibility that the guest
* will calibrate under heavy load - thus, getting a lower lpj -
@@ -98,12 +116,11 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- struct pvclock_vcpu_time_info *src;
- src = &per_cpu(hv_clock, 0);
- return pvclock_tsc_khz(src);
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ return pvclock_tsc_khz(this_cpu_pvti());
}
-static void kvm_get_preset_lpj(void)
+static void __init kvm_get_preset_lpj(void)
{
unsigned long khz;
u64 lpj;
@@ -115,107 +132,218 @@ static void kvm_get_preset_lpj(void)
preset_lpj = lpj;
}
+bool kvm_check_and_clear_guest_paused(void)
+{
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ bool ret = false;
+
+ if (!src)
+ return ret;
+
+ if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ ret = true;
+ }
+ return ret;
+}
+
+static int kvm_cs_enable(struct clocksource *cs)
+{
+ vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
+ return 0;
+}
+
static struct clocksource kvm_clock = {
- .name = "kvm-clock",
- .read = kvm_clock_get_cycles,
- .rating = 400,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 1 << KVM_SCALE,
- .shift = KVM_SCALE,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .name = "kvm-clock",
+ .read = kvm_clock_get_cycles,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .id = CSID_X86_KVM_CLK,
+ .enable = kvm_cs_enable,
};
-static int kvm_register_clock(char *txt)
+static void kvm_register_clock(char *txt)
{
- int cpu = smp_processor_id();
- int low, high;
- low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
- high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
- printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
- cpu, high, low, txt);
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ u64 pa;
- return native_write_msr_safe(msr_kvm_system_time, low, high);
+ if (!src)
+ return;
+
+ pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+ wrmsrq(msr_kvm_system_time, pa);
+ pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+}
+
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
}
#ifdef CONFIG_X86_LOCAL_APIC
-static void __cpuinit kvm_setup_secondary_clock(void)
+static void kvm_setup_secondary_clock(void)
{
- /*
- * Now that the first cpu already had this clocksource initialized,
- * we shouldn't fail.
- */
- WARN_ON(kvm_register_clock("secondary cpu clock"));
- /* ok, done with our trickery, call native */
- setup_secondary_APIC_clock();
+ kvm_register_clock("secondary cpu clock");
}
#endif
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
+void kvmclock_disable(void)
{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
+ if (msr_kvm_system_time)
+ native_write_msr(msr_kvm_system_time, 0);
}
-#endif
-/*
- * After the clock is registered, the host will keep writing to the
- * registered memory location. If the guest happens to shutdown, this memory
- * won't be valid. In cases like kexec, in which you install a new kernel, this
- * means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writting anything
- * that does not have the 'enable' bit set in the msr
- */
-#ifdef CONFIG_KEXEC
-static void kvm_crash_shutdown(struct pt_regs *regs)
+static void __init kvmclock_init_mem(void)
{
- native_write_msr(msr_kvm_system_time, 0, 0);
- native_machine_crash_shutdown(regs);
+ unsigned long ncpus;
+ unsigned int order;
+ struct page *p;
+ int r;
+
+ if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+ return;
+
+ ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+ order = get_order(ncpus * sizeof(*hvclock_mem));
+
+ p = alloc_pages(GFP_KERNEL, order);
+ if (!p) {
+ pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
+ return;
+ }
+
+ hvclock_mem = page_address(p);
+
+ /*
+ * hvclock is shared between the guest and the hypervisor, so it
+ * must be mapped decrypted.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ r = set_memory_decrypted((unsigned long) hvclock_mem,
+ 1UL << order);
+ if (r) {
+ __free_pages(p, order);
+ hvclock_mem = NULL;
+ pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+ return;
+ }
+ }
+
+ memset(hvclock_mem, 0, PAGE_SIZE << order);
}
+
+static int __init kvm_setup_vsyscall_timeinfo(void)
+{
+ if (!kvm_para_available() || !kvmclock || nopv)
+ return 0;
+
+ kvmclock_init_mem();
+
+#ifdef CONFIG_X86_64
+ if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
+ u8 flags;
+
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 0;
+
+ kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
+ }
#endif
-static void kvm_shutdown(void)
+ return 0;
+}
+early_initcall(kvm_setup_vsyscall_timeinfo);
+
+static int kvmclock_setup_percpu(unsigned int cpu)
{
- native_write_msr(msr_kvm_system_time, 0, 0);
- native_machine_shutdown();
+ struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);
+
+ /*
+ * The per-CPU area setup replicates the CPU0 data to all CPU
+ * pointers, so check carefully. CPU0 has already been set up
+ * during init.
+ */
+ if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
+ return 0;
+
+ /* Use the static page for the first CPUs, allocate otherwise */
+ if (cpu < HVC_BOOT_ARRAY_SIZE)
+ p = &hv_clock_boot[cpu];
+ else if (hvclock_mem)
+ p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
+ else
+ return -ENOMEM;
+
+ per_cpu(hv_clock_per_cpu, cpu) = p;
+ return p ? 0 : -ENOMEM;
}
void __init kvmclock_init(void)
{
- if (!kvm_para_available())
+ u8 flags;
+
+ if (!kvm_para_available() || !kvmclock)
return;
- if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
- } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+ } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+ } else {
+ return;
+ }
+
+ if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
+ kvmclock_setup_percpu, NULL) < 0) {
return;
+ }
- printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ pr_info("kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
- if (kvm_register_clock("boot clock"))
- return;
- pv_time_ops.sched_clock = kvm_clock_read;
+ this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
+ kvm_register_clock("primary cpu clock");
+ pvclock_set_pvti_cpu0_va(hv_clock_boot);
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.calibrate_cpu = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.setup_percpu_clockev =
- kvm_setup_secondary_clock;
-#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
- machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC
- machine_ops.crash_shutdown = kvm_crash_shutdown;
+ x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
#endif
+ x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+ x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
- pv_info.paravirt_enabled = 1;
- pv_info.name = "KVM";
- if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
- pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+ /*
+ * X86_FEATURE_NONSTOP_TSC means the TSC runs at a constant rate
+ * across P/T states and does not stop in deep C-states.
+ *
+ * An invariant TSC exposed by the host means kvmclock is not
+ * necessary: the TSC can be used as the clocksource instead.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ !check_tsc_unstable())
+ kvm_clock.rating = 299;
+
+ clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
+ pv_info.name = "KVM";
}
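
For orientation, the read path that pvclock_clocksource_read_nowd() ultimately performs follows the pvclock version protocol: the host bumps the version field to an odd value while it updates the record, and the guest retries until it observes the same even version before and after reading the payload. A condensed sketch of that loop, with field names following struct pvclock_vcpu_time_info (the real code lives in arch/x86/kernel/pvclock.c):

/* Sketch only, not the exact kernel implementation. */
static u64 pvclock_read_sketch(const struct pvclock_vcpu_time_info *pvti)
{
	u64 delta, ns;
	u32 version;

	do {
		version = READ_ONCE(pvti->version);
		smp_rmb();		/* read the payload after the version */

		delta = rdtsc_ordered() - pvti->tsc_timestamp;
		if (pvti->tsc_shift >= 0)
			delta <<= pvti->tsc_shift;
		else
			delta >>= -pvti->tsc_shift;

		/* Scale the TSC delta to nanoseconds: (delta * mul) >> 32 */
		ns = pvti->system_time +
		     mul_u64_u32_shr(delta, pvti->tsc_to_system_mul, 32);

		smp_rmb();		/* re-read the version last */
	} while ((version & 1) || version != READ_ONCE(pvti->version));

	return ns;
}

kvm_sched_clock_read() then just subtracts kvm_sched_clock_offset from this value so that sched_clock() starts near zero at boot.
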
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea697263b373..0f19ef355f5f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -1,9 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
* Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2002 Andi Kleen
*
* This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ * context.ldt_usr_sem
+ * mmap_lock
+ * context.lock
*/
#include <linux/errno.h>
@@ -12,110 +18,468 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/system.h>
#include <asm/ldt.h>
+#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
-#include <asm/syscalls.h>
+#include <asm/pgtable_areas.h>
+
+#include <xen/xen.h>
+
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
-#ifdef CONFIG_SMP
-static void flush_ldt(void *current_mm)
+static inline void *ldt_slot_va(int slot)
{
- if (current->active_mm == current_mm)
- load_LDT(&current->active_mm->context);
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
}
-#endif
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+void load_mm_ldt(struct mm_struct *mm)
{
- void *oldldt, *newldt;
- int oldsize;
+ struct ldt_struct *ldt;
- if (mincount <= pc->size)
- return 0;
- oldsize = pc->size;
- mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
- (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
- if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
- else
- newldt = (void *)__get_free_page(GFP_KERNEL);
+ /* READ_ONCE synchronizes with smp_store_release */
+ ldt = READ_ONCE(mm->context.ldt);
- if (!newldt)
- return -ENOMEM;
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
- if (oldsize)
- memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
- oldldt = pc->ldt;
- memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
- (mincount - oldsize) * LDT_ENTRY_SIZE);
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
- paravirt_alloc_ldt(newldt, mincount);
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
+ clear_LDT();
+ }
+}
+
+void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
+static void refresh_ldt_segments(void)
+{
#ifdef CONFIG_X86_64
- /* CHECKME: Do we really need this ? */
- wmb();
-#endif
- pc->ldt = newldt;
- wmb();
- pc->size = mincount;
- wmb();
-
- if (reload) {
-#ifdef CONFIG_SMP
- preempt_disable();
- load_LDT(pc);
- if (!cpumask_equal(mm_cpumask(current->mm),
- cpumask_of(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
-#else
- load_LDT(pc);
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
#endif
+}
+
+/* context.lock is held by the task which issued the smp function call */
+static void flush_ldt(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
+ return;
+
+ load_mm_ldt(mm);
+
+ refresh_ldt_segments();
+}
+
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
+{
+ struct ldt_struct *new_ldt;
+ unsigned int alloc_size;
+
+ if (num_entries > LDT_ENTRIES)
+ return NULL;
+
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
+ if (!new_ldt)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
+
+ /*
+ * Xen is very picky: it requires a page-aligned LDT that has no
+ * trailing nonzero bytes in any page that contains LDT descriptors.
+ * Keep it simple: zero the whole allocation and never allocate less
+ * than PAGE_SIZE.
+ */
+ if (alloc_size > PAGE_SIZE)
+ new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ else
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+
+ if (!new_ldt->entries) {
+ kfree(new_ldt);
+ return NULL;
+ }
+
+ /* The new LDT isn't aliased for PTI yet. */
+ new_ldt->slot = -1;
+
+ new_ldt->nr_entries = num_entries;
+ return new_ldt;
+}
+
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+
+static void do_sanity_check(struct mm_struct *mm,
+ bool had_kernel_mapping,
+ bool had_user_mapping)
+{
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_kernel_mapping);
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!had_user_mapping);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_kernel_mapping);
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(had_user_mapping);
}
- if (oldsize) {
- paravirt_free_ldt(oldldt, oldsize);
- if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(oldldt);
- else
- put_page(virt_to_page(oldldt));
+}
+
+#ifdef CONFIG_X86_PAE
+
+static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+
+ if (pgd->pgd == 0)
+ return NULL;
+
+ p4d = p4d_offset(pgd, va);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, va);
+ if (pud_none(*pud))
+ return NULL;
+
+ return pmd_offset(pud, va);
+}
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+
+ if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pmd(u_pmd, *k_pmd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ bool had_kernel, had_user;
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+ had_kernel = (k_pmd->pmd != 0);
+ had_user = (u_pmd->pmd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#else /* !CONFIG_X86_PAE */
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ bool had_kernel = (pgd->pgd != 0);
+ bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#endif /* CONFIG_X86_PAE */
+
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ unsigned long va;
+ bool is_vmalloc;
+ spinlock_t *ptl;
+ int i, nr_pages;
+
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /* Check if the current mappings are sane */
+ sanity_check_ldt_mapping(mm);
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn;
+ pgprot_t pte_prot;
+ pte_t pte, *ptep;
+
+ va = (unsigned long)ldt_slot_va(slot) + offset;
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ /*
+ * Map it RO so the easy-to-find address is not a primary
+ * target via some kernel interface that misses a
+ * permission check.
+ */
+ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(pte_prot) &= __supported_pte_mask;
+ pte = pfn_pte(pfn, pte_prot);
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
}
+
+ /* Propagate LDT mapping to the user page-table */
+ map_ldt_struct_to_user(mm);
+
+ ldt->slot = slot;
return 0;
}
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
- int err = alloc_ldt(new, old->size, 0);
- int i;
+ unsigned long va;
+ int i, nr_pages;
+
+ if (!ldt)
+ return;
- if (err < 0)
- return err;
+ /* LDT map/unmap is only required for PTI */
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return;
- for (i = 0; i < old->size; i++)
- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!WARN_ON_ONCE(!ptep)) {
+ pte_clear(mm, va, ptep);
+ pte_unmap_unlock(ptep, ptl);
+ }
+ }
+
+ va = (unsigned long)ldt_slot_va(ldt->slot);
+ flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
+}
+
+#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
+
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
return 0;
}
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+}
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = LDT_END_ADDR;
+
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Although free_pgd_range() is intended for freeing user
+ * page-tables, it also works out for kernel mappings on x86.
+ * We use tlb_gather_mmu_fullmm() to avoid confusing the
+ * range-tracking logic in __tlb_adjust_range().
+ */
+ tlb_gather_mmu_fullmm(&tlb, mm);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb);
+#endif
+}
+
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
+}
+
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+ mutex_lock(&mm->context.lock);
+
+ /* Synchronizes with READ_ONCE in load_mm_ldt. */
+ smp_store_release(&mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using current's mm. */
+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+ mutex_unlock(&mm->context.lock);
+}
+
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (likely(!ldt))
+ return;
+
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree_atomic(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state;
+ * the new task is not running, so nothing can be installed.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
- struct mm_struct *old_mm;
+ struct ldt_struct *new_ldt;
int retval = 0;
- mutex_init(&mm->context.lock);
- mm->context.size = 0;
- old_mm = current->mm;
- if (old_mm && old_mm->context.size > 0) {
- mutex_lock(&old_mm->context.lock);
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
+ if (!old_mm)
+ return 0;
+
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt)
+ goto out_unlock;
+
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
}
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval) {
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
return retval;
}
@@ -124,55 +488,54 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
*
* 64bit: Don't touch the LDT register - we're already in the next thread.
*/
-void destroy_context(struct mm_struct *mm)
+void destroy_context_ldt(struct mm_struct *mm)
{
- if (mm->context.size) {
-#ifdef CONFIG_X86_32
- /* CHECKME: Can this ever happen ? */
- if (mm == current->active_mm)
- clear_LDT();
-#endif
- paravirt_free_ldt(mm->context.ldt, mm->context.size);
- if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(mm->context.ldt);
- else
- put_page(virt_to_page(mm->context.ldt));
- mm->context.size = 0;
- }
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
+}
+
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int err;
- unsigned long size;
struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
+
+ down_read(&mm->context.ldt_usr_sem);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
- if (!mm->context.size)
- return 0;
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- mutex_lock(&mm->context.lock);
- size = mm->context.size * LDT_ENTRY_SIZE;
- if (size > bytecount)
- size = bytecount;
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
- err = 0;
- if (copy_to_user(ptr, mm->context.ldt, size))
- err = -EFAULT;
- mutex_unlock(&mm->context.lock);
- if (err < 0)
- goto error_return;
- if (size != bytecount) {
- /* zero-fill the rest */
- if (clear_user(ptr + size, bytecount - size) != 0) {
- err = -EFAULT;
- goto error_return;
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entries_size != bytecount) {
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
+ retval = -EFAULT;
+ goto out_unlock;
}
}
- return bytecount;
-error_return:
- return err;
+ retval = bytecount;
+
+out_unlock:
+ up_read(&mm->context.ldt_usr_sem);
+ return retval;
}
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -190,12 +553,36 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
return bytecount;
}
+static bool allow_16bit_segments(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_16BIT))
+ return false;
+
+#ifdef CONFIG_XEN_PV
+ /*
+ * Xen PV does not implement ESPFIX64, which means that 16-bit
+ * segments will not work correctly. Until either Xen PV implements
+ * ESPFIX64 and can signal this fact to the guest or unless someone
+ * provides compelling evidence that allowing broken 16-bit segments
+ * is worthwhile, disallow 16-bit segments under Xen PV.
+ */
+ if (xen_pv_domain()) {
+ pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int old_nr_entries, new_nr_entries;
+ struct user_desc ldt_info;
struct desc_struct ldt;
int error;
- struct user_desc ldt_info;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -214,39 +601,71 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
goto out;
}
- mutex_lock(&mm->context.lock);
- if (ldt_info.entry_number >= mm->context.size) {
- error = alloc_ldt(&current->mm->context,
- ldt_info.entry_number + 1, 1);
- if (error < 0)
- goto out_unlock;
- }
-
- /* Allow LDTs to be cleared by the user. */
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
- if (oldmode || LDT_empty(&ldt_info)) {
- memset(&ldt, 0, sizeof(ldt));
- goto install;
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
+ error = -EINVAL;
+ goto out;
}
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
}
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+ if (down_write_killable(&mm->context.ldt_usr_sem))
+ return -EINTR;
+
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(new_nr_entries);
+ if (!new_ldt)
+ goto out_unlock;
- /* Install the new entry ... */
-install:
- write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
+
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ /*
+ * This can only fail for the first LDT setup. If an LDT is
+ * already installed then the PTE page is already
+ * populated. Mop up a half-populated page table.
+ */
+ if (!WARN_ON_ONCE(old_ldt))
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+
+ install_ldt(mm, new_ldt);
+ unmap_ldt_struct(mm, old_ldt);
+ free_ldt_struct(old_ldt);
error = 0;
out_unlock:
- mutex_unlock(&mm->context.lock);
+ up_write(&mm->context.ldt_usr_sem);
out:
return error;
}
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
- unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
int ret = -ENOSYS;
@@ -264,5 +683,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
ret = write_ldt(ptr, bytecount, 0);
break;
}
- return ret;
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but the ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
}
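
From userspace, the interface reworked above is still reached through the classic modify_ldt(2) call. A minimal sketch that installs one flat 32-bit data segment; func 0x11 is the user_desc-format write handled by write_ldt() above, and the entry number and descriptor contents here are purely illustrative:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>	/* struct user_desc, MODIFY_LDT_CONTENTS_* */

int main(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;
	desc.base_addr      = 0;	/* flat base */
	desc.limit          = 0xfffff;	/* 4 GiB with limit_in_pages */
	desc.seg_32bit      = 1;
	desc.contents       = MODIFY_LDT_CONTENTS_DATA;
	desc.limit_in_pages = 1;
	desc.useable        = 1;

	/* func 0x11 == write an LDT entry in user_desc format. */
	if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0) {
		perror("modify_ldt");
		return 1;
	}
	printf("installed LDT entry %u\n", desc.entry_number);
	return 0;
}

Each such call goes through the full alloc/copy/map/install sequence above, which is why the comments note that modify_ldt() is a slow path used mainly by legacy code and emulators.
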
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index a3fa43ba5d3b..1f325304c4a8 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -1,55 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
-#include <linux/init.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/gfp.h>
#include <linux/io.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
-#include <asm/system.h>
-#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
#include <asm/debugreg.h>
-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
-
static void load_segments(void)
{
#define __STR(X) #X
@@ -61,8 +34,6 @@ static void load_segments(void)
"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
: : : "eax", "memory");
#undef STR
@@ -71,18 +42,24 @@ static void load_segments(void)
static void machine_kexec_free_page_tables(struct kimage *image)
{
- free_page((unsigned long)image->arch.pgd);
+ free_pages((unsigned long)image->arch.pgd, pgd_allocation_order());
+ image->arch.pgd = NULL;
#ifdef CONFIG_X86_PAE
free_page((unsigned long)image->arch.pmd0);
+ image->arch.pmd0 = NULL;
free_page((unsigned long)image->arch.pmd1);
+ image->arch.pmd1 = NULL;
#endif
free_page((unsigned long)image->arch.pte0);
+ image->arch.pte0 = NULL;
free_page((unsigned long)image->arch.pte1);
+ image->arch.pte1 = NULL;
}
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
- image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ pgd_allocation_order());
#ifdef CONFIG_X86_PAE
image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
@@ -94,7 +71,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)
!image->arch.pmd0 || !image->arch.pmd1 ||
#endif
!image->arch.pte0 || !image->arch.pte1) {
- machine_kexec_free_page_tables(image);
return -ENOMEM;
}
return 0;
@@ -104,6 +80,7 @@ static void machine_kexec_page_table_set_one(
pgd_t *pgd, pmd_t *pmd, pte_t *pte,
unsigned long vaddr, unsigned long paddr)
{
+ p4d_t *p4d;
pud_t *pud;
pgd += pgd_index(vaddr);
@@ -111,7 +88,8 @@ static void machine_kexec_page_table_set_one(
if (!(pgd_val(*pgd) & _PAGE_PRESENT))
set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
#endif
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
if (!(pmd_val(*pmd) & _PAGE_PRESENT))
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
@@ -158,7 +136,7 @@ int machine_kexec_prepare(struct kimage *image)
{
int error;
- set_pages_x(image->control_code_page, 1);
+ set_memory_x((unsigned long)page_address(image->control_code_page), 1);
error = machine_kexec_alloc_page_tables(image);
if (error)
return error;
@@ -172,7 +150,7 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
- set_pages_nx(image->control_code_page, 1);
+ set_memory_nx((unsigned long)page_address(image->control_code_page), 1);
machine_kexec_free_page_tables(image);
}
@@ -182,15 +160,10 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
+ relocate_kernel_fn *relocate_kernel_ptr;
unsigned long page_list[PAGES_NR];
void *control_page;
int save_ftrace_enabled;
- asmlinkage unsigned long
- (*relocate_kernel_ptr)(unsigned long indirection_page,
- unsigned long control_page,
- unsigned long start_address,
- unsigned int has_pae,
- unsigned int preserve_context);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -208,11 +181,11 @@ void machine_kexec(struct kimage *image)
/*
* We need to put APICs in legacy mode so that we can
* get timer interrupts in second kernel. kexec/kdump
- * paths already have calls to disable_IO_APIC() in
- * one form or other. kexec jump path also need
- * one.
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or another. The kexec jump path also needs one.
*/
- disable_IO_APIC();
+ clear_IO_APIC();
+ restore_boot_irq_mode();
#endif
}
@@ -243,13 +216,14 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
+ native_idt_invalidate();
+ native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
(unsigned long)page_list,
- image->start, cpu_has_pae,
+ image->start,
+ boot_cpu_has(X86_FEATURE_PAE),
image->preserve_context);
#ifdef CONFIG_KEXEC_JUMP
@@ -259,15 +233,3 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
-
-void arch_crash_save_vmcoreinfo(void)
-{
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
-#ifdef CONFIG_X86_PAE
- VMCOREINFO_CONFIG(X86_PAE);
-#endif
-}
-
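
For context, machine_kexec() above sits at the end of a path that userspace enters with kexec_file_load(2) followed by reboot(RB_KEXEC). A hedged sketch of that driver side, assuming an x86-64 userspace where SYS_kexec_file_load is available; the /boot paths are placeholders, CAP_SYS_BOOT is required, and error handling is minimal:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/reboot.h>
#include <sys/syscall.h>

int main(void)
{
	const char *cmdline = "root=/dev/sda1 ro";	/* placeholder */
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* Stage the new kernel; the in-kernel loader builds the segments. */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0UL) != 0) {
		perror("kexec_file_load");
		return 1;
	}

	/* Jump into it; this is what ends up in machine_kexec(). */
	reboot(RB_KEXEC);
	return 0;
}

The cmdline length must include the terminating NUL, and kexec_file_load() is only available on configs that select CONFIG_KEXEC_FILE (cf. kexec_file_loaders in machine_kexec_64.c below).
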
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..201137b98fb8 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -1,11 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* handle transition of Linux booting another kernel
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
@@ -15,152 +15,186 @@
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
+#include <linux/vmalloc.h>
+#include <linux/efi.h>
+#include <linux/cc_platform.h>
-#include <asm/pgtable.h>
+#include <asm/init.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/io_apic.h>
#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+#include <asm/setup.h>
+#include <asm/set_memory.h>
+#include <asm/cpu.h>
+#include <asm/efi.h>
+#include <asm/processor.h>
+
+#ifdef CONFIG_ACPI
+/*
+ * Used while adding mappings for ACPI tables.
+ * Can be reused when other iomem regions need to be mapped.
+ */
+struct init_pgtable_data {
+ struct x86_mapping_info *info;
+ pgd_t *level4p;
+};
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
- unsigned long addr)
+static int mem_region_callback(struct resource *res, void *arg)
{
- pud_t *pud;
- pmd_t *pmd;
- struct page *page;
- int result = -ENOMEM;
+ struct init_pgtable_data *data = arg;
- addr &= PMD_MASK;
- pgd += pgd_index(addr);
- if (!pgd_present(*pgd)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pud = (pud_t *)page_address(page);
- memset(pud, 0, PAGE_SIZE);
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
- }
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud)) {
- page = kimage_alloc_control_pages(image, 0);
- if (!page)
- goto out;
- pmd = (pmd_t *)page_address(page);
- memset(pmd, 0, PAGE_SIZE);
- set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
- }
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- result = 0;
-out:
- return result;
+ return kernel_ident_mapping_init(data->info, data->level4p,
+ res->start, res->end + 1);
}
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
+static int
+map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
- unsigned long end_addr;
+ struct init_pgtable_data data;
+ unsigned long flags;
+ int ret;
- addr &= PAGE_MASK;
- end_addr = addr + PUD_SIZE;
- while (addr < end_addr) {
- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
- addr += PMD_SIZE;
- }
+ data.info = info;
+ data.level4p = level4p;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
+ &data, mem_region_callback);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ /* ACPI tables could be located in ACPI Non-volatile Storage region */
+ ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
+ &data, mem_region_callback);
+ if (ret && ret != -EINVAL)
+ return ret;
+
+ return 0;
}
+#else
+static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
+#endif
-static int init_level3_page(struct kimage *image, pud_t *level3p,
- unsigned long addr, unsigned long last_addr)
+static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p)
{
- unsigned long end_addr;
- int result;
+ unsigned long mstart, mend;
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + PGDIR_SIZE;
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pmd_t *level2p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level2p = (pmd_t *)page_address(page);
- init_level2_page(level2p, addr);
- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
- addr += PUD_SIZE;
- }
- /* clear the unused entries */
- while (addr < end_addr) {
- pud_clear(level3p++);
- addr += PUD_SIZE;
- }
-out:
- return result;
+ if (!kexec_debug_8250_mmio32)
+ return 0;
+
+ mstart = kexec_debug_8250_mmio32 & PAGE_MASK;
+ mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK;
+ pr_info("Map PCI serial at %lx - %lx\n", mstart, mend);
+ return kernel_ident_mapping_init(info, level4p, mstart, mend);
}
+#ifdef CONFIG_KEXEC_FILE
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+ NULL
+};
+#endif
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
- unsigned long addr, unsigned long last_addr)
+static int
+map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
- unsigned long end_addr;
- int result;
+#ifdef CONFIG_EFI
+ unsigned long mstart, mend;
+ void *kaddr;
+ int ret;
- result = 0;
- addr &= PAGE_MASK;
- end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
- while ((addr < last_addr) && (addr < end_addr)) {
- struct page *page;
- pud_t *level3p;
-
- page = kimage_alloc_control_pages(image, 0);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- level3p = (pud_t *)page_address(page);
- result = init_level3_page(image, level3p, addr, last_addr);
- if (result)
- goto out;
- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
- addr += PGDIR_SIZE;
+ if (!efi_enabled(EFI_BOOT))
+ return 0;
+
+ mstart = (boot_params.efi_info.efi_systab |
+ ((u64)boot_params.efi_info.efi_systab_hi<<32));
+
+ if (efi_enabled(EFI_64BIT))
+ mend = mstart + sizeof(efi_system_table_64_t);
+ else
+ mend = mstart + sizeof(efi_system_table_32_t);
+
+ if (!mstart)
+ return 0;
+
+ ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+ if (ret)
+ return ret;
+
+ kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+ if (!kaddr) {
+ pr_err("Could not map UEFI system table\n");
+ return -ENOMEM;
}
- /* clear the unused entries */
- while (addr < end_addr) {
- pgd_clear(level4p++);
- addr += PGDIR_SIZE;
+
+ mstart = efi_config_table;
+
+ if (efi_enabled(EFI_64BIT)) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+ } else {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
}
-out:
- return result;
+
+ memunmap(kaddr);
+
+ return kernel_ident_mapping_init(info, level4p, mstart, mend);
+#endif
+ return 0;
}
static void free_transition_pgtable(struct kimage *image)
{
+ free_page((unsigned long)image->arch.p4d);
+ image->arch.p4d = NULL;
free_page((unsigned long)image->arch.pud);
+ image->arch.pud = NULL;
free_page((unsigned long)image->arch.pmd);
+ image->arch.pmd = NULL;
free_page((unsigned long)image->arch.pte);
+ image->arch.pte = NULL;
}
-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
+ unsigned long control_page)
{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ int result = -ENOMEM;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- unsigned long vaddr, paddr;
- int result = -ENOMEM;
- vaddr = (unsigned long)relocate_kernel;
- paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ /*
+ * For the transition to the identity mapped page tables, the control
+ * code page also needs to be mapped at the virtual address it starts
+ * off running from.
+ */
+ vaddr = (unsigned long)__va(control_page);
+ paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+ if (!p4d)
+ goto err;
+ image->arch.p4d = p4d;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
goto err;
image->arch.pud = pud;
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
- pud = pud_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
@@ -177,60 +211,104 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
pte = pte_offset_kernel(pmd, vaddr);
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ prot = PAGE_KERNEL_EXEC;
+
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
err:
- free_transition_pgtable(image);
return result;
}
+static void *alloc_pgt_page(void *data)
+{
+ struct kimage *image = (struct kimage *)data;
+ struct page *page;
+ void *p = NULL;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (page) {
+ p = page_address(page);
+ clear_page(p);
+ }
-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+ return p;
+}
+
+static int init_pgtable(struct kimage *image, unsigned long control_page)
{
- pgd_t *level4p;
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ unsigned long mstart, mend;
int result;
- level4p = (pgd_t *)__va(start_pgtable);
- result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
- if (result)
- return result;
+ int i;
+
+ image->arch.pgd = alloc_pgt_page(image);
+ if (!image->arch.pgd)
+ return -ENOMEM;
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ info.page_flag |= _PAGE_ENC;
+ info.kernpg_flag |= _PAGE_ENC;
+ }
+
+ if (direct_gbpages)
+ info.direct_gbpages = true;
+
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
+ if (result)
+ return result;
+ }
+
/*
- * image->start may be outside 0 ~ max_pfn, for example when
- * jump back to original kernel from kexeced kernel
+ * The segments' memory ranges could be outside 0 ~ max_pfn, for
+ * example when jumping back to the original kernel from the kexeced
+ * kernel, or when the first kernel was booted with a user memory map
+ * and the second kernel is loaded outside that range.
*/
- result = init_one_level2_page(image, level4p, image->start);
- if (result)
- return result;
- return init_transition_pgtable(image, level4p);
-}
-
-static void set_idt(void *newidt, u16 limit)
-{
- struct desc_ptr curidt;
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
- /* x86-64 supports unaligned loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
- __asm__ __volatile__ (
- "lidtq %0\n"
- : : "m" (curidt)
- );
-};
+ if (result)
+ return result;
+ }
+ /*
+ * Prepare EFI systab and ACPI tables for kexec kernel since they are
+ * not covered by pfn_mapped.
+ */
+ result = map_efi_systab(&info, image->arch.pgd);
+ if (result)
+ return result;
-static void set_gdt(void *newgdt, u16 limit)
-{
- struct desc_ptr curgdt;
+ result = map_acpi_tables(&info, image->arch.pgd);
+ if (result)
+ return result;
- /* x86-64 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
+ result = map_mmio_serial(&info, image->arch.pgd);
+ if (result)
+ return result;
- __asm__ __volatile__ (
- "lgdtq %0\n"
- : : "m" (curgdt)
- );
-};
+ /*
+ * This must be last because the intermediate page table pages it
+ * allocates will not be control pages and may overlap the image.
+ */
+ return init_transition_pgtable(image, image->arch.pgd, control_page);
+}
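/*
 * Editor's illustrative sketch, not part of this patch: the
 * allocator-callback pattern used by x86_mapping_info above -- a function
 * pointer plus an opaque context that the callback casts back to its real
 * type, exactly as alloc_pgt_page() recovers the kimage. All names are
 * example-local; the userspace calloc() stands in for get_zeroed_page().
 */
#include <stdio.h>
#include <stdlib.h>

struct ex_image { int pages_allocated; };

struct ex_mapping_info {
	void *(*alloc_pgt_page)(void *context);
	void *context;
};

static void *ex_alloc_pgt_page(void *context)
{
	struct ex_image *image = context;	/* recover the typed context */

	image->pages_allocated++;
	return calloc(1, 4096);			/* a zeroed "page" */
}

int main(void)
{
	struct ex_image image = { 0 };
	struct ex_mapping_info info = {
		.alloc_pgt_page = ex_alloc_pgt_page,
		.context = &image,
	};
	void *pgt = info.alloc_pgt_page(info.context);

	printf("pages_allocated=%d pgt=%p\n", image.pages_allocated, pgt);
	free(pgt);
	return 0;
}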
static void load_segments(void)
{
@@ -244,24 +322,74 @@ static void load_segments(void)
);
}
+static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs)
+{
+ gate_desc idtentry = { 0 };
+ int i;
+
+ idtentry.bits.p = 1;
+ idtentry.bits.type = GATE_TRAP;
+ idtentry.segment = __KERNEL_CS;
+ idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs;
+ idtentry.offset_middle = (control_page >> 16) & 0xFFFF;
+ idtentry.offset_high = control_page >> 32;
+
+ for (i = 0; i < 16; i++) {
+ kexec_debug_idt[i] = idtentry;
+ idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE;
+ }
+}
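/*
 * Editor's illustrative sketch, not part of this patch: splitting a 64-bit
 * handler address into the three offset fields of an x86-64 gate
 * descriptor, as prepare_debug_idt() does. The struct is a simplified
 * stand-in for the kernel's gate_desc; the round trip verifies the split.
 */
#include <stdio.h>
#include <stdint.h>

struct ex_gate {
	uint16_t offset_low;
	uint16_t offset_middle;
	uint32_t offset_high;
};

static struct ex_gate ex_make_gate(uint64_t handler)
{
	struct ex_gate g = {
		.offset_low    = handler & 0xFFFF,
		.offset_middle = (handler >> 16) & 0xFFFF,
		.offset_high   = handler >> 32,
	};
	return g;
}

int main(void)
{
	uint64_t handler = 0xffffffff81234567ULL;
	struct ex_gate g = ex_make_gate(handler);
	uint64_t back = (uint64_t)g.offset_high << 32 |
			(uint64_t)g.offset_middle << 16 | g.offset_low;

	printf("handler=%#llx reassembled=%#llx\n",
	       (unsigned long long)handler, (unsigned long long)back);
	return 0;
}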
+
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable;
+ void *control_page = page_address(image->control_code_page);
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
int result;
- /* Calculate the offsets */
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ /*
+ * Some early TDX-capable platforms have an erratum: a partial
+ * write from the kernel (a write transaction of less than a
+ * cacheline arriving at the memory controller) to TDX private
+ * memory poisons that memory, and a subsequent read triggers a
+ * machine check.
+ *
+ * On those platforms the old kernel must reset TDX private
+ * memory before jumping to the new kernel, otherwise the new
+ * kernel may see unexpected machine checks. For simplicity,
+ * just fail kexec/kdump on those platforms.
+ */
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
+ pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
+ return -EOPNOTSUPP;
+ }
/* Setup the identity mapped 64bit page table */
- result = init_pgtable(image, start_pgtable);
+ result = init_pgtable(image, __pa(control_page));
if (result)
return result;
+ kexec_va_control_page = (unsigned long)control_page;
+ kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd);
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+
+ prepare_debug_idt((unsigned long)__pa(control_page),
+ (unsigned long)kexec_debug_exc_vectors - reloc_start);
+
+ __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
+
+ set_memory_rox((unsigned long)control_page, 1);
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
+ void *control_page = page_address(image->control_code_page);
+
+ set_memory_nx((unsigned long)control_page, 1);
+ set_memory_rw((unsigned long)control_page, 1);
+
free_transition_pgtable(image);
}
@@ -269,11 +397,13 @@ void machine_kexec_cleanup(struct kimage *image)
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-void machine_kexec(struct kimage *image)
+void __nocfi machine_kexec(struct kimage *image)
{
- unsigned long page_list[PAGES_NR];
- void *control_page;
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ relocate_kernel_fn *relocate_kernel_ptr;
+ unsigned int relocate_kernel_flags;
int save_ftrace_enabled;
+ void *control_page;
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -285,31 +415,39 @@ void machine_kexec(struct kimage *image)
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
hw_breakpoint_disable();
+ cet_disable();
if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
/*
* We need to put APICs in legacy mode so that we can
* get timer interrupts in the second kernel. kexec/kdump
- * paths already have calls to disable_IO_APIC() in
- * one form or other. kexec jump path also need
- * one.
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or another. The kexec jump path also needs one.
*/
- disable_IO_APIC();
+ clear_IO_APIC();
+ restore_boot_irq_mode();
#endif
}
- control_page = page_address(image->control_code_page) + PAGE_SIZE;
- memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ control_page = page_address(image->control_code_page);
- page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
- page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_TABLE_PAGE] =
- (unsigned long)__pa(page_address(image->control_code_page));
+ /*
+ * Allow for the possibility that relocate_kernel might not be at
+ * the very start of the page.
+ */
+ relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
- if (image->type == KEXEC_TYPE_DEFAULT)
- page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
- << PAGE_SHIFT);
+ relocate_kernel_flags = 0;
+ if (image->preserve_context)
+ relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
+
+ /*
+ * This must be done before load_segments(), which resets GS
+ * to 0; percpu data needs the correct GS to work.
+ */
+ if (this_cpu_read(cache_state_incoherent))
+ relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
/*
* The segment registers are funny things, they have both a
@@ -318,22 +456,21 @@ void machine_kexec(struct kimage *image)
* with from a table in memory. At no other time is the
* descriptor table in memory accessed.
*
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
+ * Take advantage of this here by force loading the segments,
+ * before the GDT is zapped with an invalid value.
+ *
+ * load_segments() resets GS to 0. Don't make any function call
+ * after here since call depth tracking uses percpu variables to
+ * operate (relocate_kernel() is explicitly ignored by call depth
+ * tracking).
*/
load_segments();
- /*
- * The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
/* now call it */
- image->start = relocate_kernel((unsigned long)image->head,
- (unsigned long)page_list,
- image->start,
- image->preserve_context);
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ virt_to_phys(control_page),
+ image->start,
+ relocate_kernel_flags);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -342,15 +479,250 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
+/*
+ * Handover to the next kernel, no CFI concern.
+ */
+ANNOTATE_NOCFI_SYM(machine_kexec);
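/*
 * Editor's illustrative sketch, not part of this patch: the offset
 * arithmetic machine_kexec() uses for relocate_kernel_ptr -- a function's
 * address inside a copied buffer is the copy's base plus the function's
 * offset within the original region. All names are example-local.
 */
#include <stdio.h>
#include <string.h>

static const char region[] = "....ENTRY....";	/* stands in for __relocate_kernel_start..end */

int main(void)
{
	char control_page[sizeof(region)];
	const char *entry = region + 4;		/* stands in for relocate_kernel */

	memcpy(control_page, region, sizeof(region));

	/* same offset from the start of the copy as from the original */
	const char *entry_in_copy = control_page + (entry - region);

	printf("entry in copy: %.5s\n", entry_in_copy);
	return 0;
}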
+
+/* arch-dependent functionality related to kexec file-based syscall */
+
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * Apply purgatory relocations.
+ *
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtabsec: Corresponding symtab.
+ *
+ * TODO: Some of this code belongs in generic code. Move it to kexec.c.
+ */
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section, const Elf_Shdr *relsec,
+ const Elf_Shdr *symtabsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
+
+ rel = (void *)pi->ehdr + relsec->sh_offset;
+
+ pr_debug("Applying relocate section %s to %u\n",
+ shstrtab + relsec->sh_name, relsec->sh_info);
+
+ for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains the byte offset from the beginning
+ * of the section to the storage unit affected.
+ *
+ * This is the location to update. It lies in the temporary
+ * buffer where the section is currently loaded; the section
+ * will finally be loaded at a different address later, pointed
+ * to by ->sh_addr. kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = pi->purgatory_buf;
+ location += section->sh_offset;
+ location += rel[i].r_offset;
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info encodes the symbol table index against which
+ * the relocation must be made and the type of relocation to
+ * apply. The ELF64_R_SYM() and ELF64_R_TYPE() macros extract
+ * these respectively.
+ */
+ sym = (void *)pi->ehdr + symtabsec->sh_offset;
+ sym += ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= pi->ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ case R_X86_64_PLT32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
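/*
 * Editor's illustrative sketch, not part of this patch: the arithmetic
 * behind two of the relocation cases above. In ELF terms the computed
 * value is S + A (symbol value plus addend); PC-relative types then
 * subtract P, the final address of the patched location.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t S = 0x1000;	/* st_value + section base */
	int64_t  A = 8;		/* rel->r_addend */
	uint64_t P = 0x2000;	/* section->sh_addr + rel->r_offset */

	uint64_t abs64 = S + A;				/* R_X86_64_64 */
	uint32_t pc32  = (uint32_t)(S + A - P);		/* R_X86_64_PC32 */

	printf("R_X86_64_64   = %#llx\n", (unsigned long long)abs64);
	printf("R_X86_64_PC32 = %#x\n", pc32);
	return 0;
}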
-void arch_crash_save_vmcoreinfo(void)
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- VMCOREINFO_SYMBOL(phys_base);
- VMCOREINFO_SYMBOL(init_level4_pgt);
+ vfree(image->elf_headers);
+ image->elf_headers = NULL;
+ image->elf_headers_sz = 0;
+
+ return kexec_image_post_load_cleanup_default(image);
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+#ifdef CONFIG_CRASH_DUMP
+
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ /*
+ * Operates on the inclusive physical range [start, end]. We must
+ * skip an unassigned crashk resource, whose "end" member is zero.
+ */
+ if (!end || start > end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
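/*
 * Editor's illustrative sketch, not part of this patch: the page-count
 * arithmetic kexec_mark_range() applies to an inclusive physical range
 * [start, end]. Example-local constants; PAGE_SHIFT is the usual 12.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT 12

int main(void)
{
	unsigned long start = 0x100000;		/* 1 MiB */
	unsigned long end   = 0x102fff;		/* inclusive last byte */
	unsigned long first_pfn = start >> EX_PAGE_SHIFT;
	unsigned long last_pfn  = end >> EX_PAGE_SHIFT;
	unsigned int nr_pages   = last_pfn - first_pfn + 1;

	printf("pfns %lu..%lu -> %u pages\n", first_pfn, last_pfn, nr_pages);
	return 0;
}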
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+ /* Don't touch the control code page used in crash_kexec(). */
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ kexec_mark_range(crashk_res.start, control - 1, protect);
+ control += KEXEC_CONTROL_PAGE_SIZE;
+ kexec_mark_range(control, crashk_res.end, protect);
+}
+
+/* Make the memory storing dm-crypt keys inaccessible or accessible */
+static void kexec_mark_dm_crypt_keys(bool protect)
+{
+ unsigned long start_paddr, end_paddr;
+ unsigned int nr_pages;
+
+ if (kexec_crash_image->dm_crypt_keys_addr) {
+ start_paddr = kexec_crash_image->dm_crypt_keys_addr;
+ end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1;
+ nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE;
+ if (protect)
+ set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages);
+ else
+ __set_memory_prot(
+ (unsigned long)phys_to_virt(start_paddr),
+ nr_pages,
+ __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW));
+ }
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+ kexec_mark_dm_crypt_keys(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_dm_crypt_keys(false);
+ kexec_mark_crashkres(false);
+}
#endif
+
+/*
+ * During a traditional boot under SME, SME will encrypt the kernel,
+ * so the SME kexec kernel also needs to be un-encrypted in order to
+ * replicate a normal SME boot.
+ *
+ * During a traditional boot under SEV, the kernel has already been
+ * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
+ * order to replicate a normal SEV boot.
+ */
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return 0;
+
+ /*
+ * If host memory encryption is active we need to be sure that kexec
+ * pages are not encrypted because when we boot to the new kernel the
+ * pages won't be accessed encrypted (initially).
+ */
+ return set_memory_decrypted((unsigned long)vaddr, pages);
}
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * If host memory encryption is active we need to reset the pages back
+ * to being an encrypted mapping before freeing them.
+ */
+ set_memory_encrypted((unsigned long)vaddr, pages);
+}
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 63eaf6596233..000000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- * Chris Beauregard July 28th, 1996
- * - Fixed up integrated SCSI detection
- *
- * Chris Beauregard August 3rd, 1996
- * - Made mca_info local
- * - Made integrated registers accessible through standard function calls
- * - Added name field
- * - More sanity checking
- *
- * Chris Beauregard August 9th, 1996
- * - Rewrote /proc/mca
- *
- * Chris Beauregard January 7th, 1997
- * - Added basic NMI-processing
- * - Added more information to mca_info structure
- *
- * David Weinehall October 12th, 1998
- * - Made a lot of cleaning up in the source
- * - Added use of save_flags / restore_flags
- * - Added the 'driver_loaded' flag in MCA_adapter
- * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- * David Weinehall March 24th, 1999
- * - Fixed the output of 'Driver Installed' in /proc/mca/pos
- * - Made the Integrated Video & SCSI show up even if they have id 0000
- *
- * Alexander Viro November 9th, 1999
- * - Switched to regular procfs methods
- *
- * Alfred Arnold & David Weinehall August 23rd, 2000
- * - Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <linux/slab.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-
-static unsigned char which_scsi;
-
-int MCA_bus;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev)
-{
- mca_dev->status = MCA_ADAPTER_NONE;
-
- mca_dev->pos_id = mca_dev->pos[0]
- + (mca_dev->pos[1] << 8);
-
- if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
- /*
- * id = 0x0000 usually indicates hardware failure,
- * however, ZP Gu <zpg@castle.net> reports that his 9556
- * has 0x0000 as id and everything still works. There
- * also seems to be an adapter with id = 0x0000; the
- * NCR Parallel Bus Memory Card. Until this is confirmed,
- * however, this code will stay.
- */
-
- mca_dev->status = MCA_ADAPTER_ERROR;
-
- return;
- } else if (mca_dev->pos_id != 0xffff) {
-
- /*
- * 0xffff usually indicates that there's no adapter,
- * however, some integrated adapters may have 0xffff as
- * their id and still be valid. Examples are on-board
- * VGA of the 55sx, the integrated SCSI of the 56 & 57,
- * and possibly also the 95 ULTIMEDIA.
- */
-
- mca_dev->status = MCA_ADAPTER_NORMAL;
- }
-
- if ((mca_dev->pos_id == 0xffff ||
- mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
- int j;
-
- for (j = 2; j < 8; j++) {
- if (mca_dev->pos[j] != 0xff) {
- mca_dev->status = MCA_ADAPTER_NORMAL;
- break;
- }
- }
- }
-
- if (!(mca_dev->pos[2] & MCA_ENABLED)) {
-
- /* enabled bit is in POS 2 */
-
- mca_dev->status = MCA_ADAPTER_DISABLED;
- }
-} /* mca_configure_adapter_status */
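/*
 * Editor's illustrative sketch, not part of the deleted code: how the
 * adapter ID is assembled from the first two POS bytes and classified,
 * mirroring mca_configure_adapter_status() above. Example-local names;
 * the sample bytes are arbitrary.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t pos[8] = { 0x5f, 0x8e, 0x01, 0, 0, 0, 0, 0 };
	uint16_t pos_id = pos[0] | (pos[1] << 8);	/* little-endian ID */

	if (pos_id == 0x0000)
		printf("id 0x0000: usually a hardware failure\n");
	else if (pos_id == 0xffff)
		printf("id 0xffff: usually no adapter present\n");
	else
		printf("adapter id %#06x\n", pos_id);
	return 0;
}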
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
- { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
- { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
- { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
- { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
- { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
- { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
- { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-
-/*
- * mca_read_and_store_pos - read the POS registers into a memory buffer
- * @pos: a char pointer to 8 bytes, contains the POS register value on
- * successful return
- *
- * Returns 1 if a card actually exists (i.e. the pos isn't
- * all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos)
-{
- int j;
- int found = 0;
-
- for (j = 0; j < 8; j++) {
- pos[j] = inb_p(MCA_POS_REG(j));
- if (pos[j] != 0xff) {
- /* 0xff all across means no device. 0x00 means
- * something's broken, but a device is
- * probably there. However, if you get 0x00
- * from a motherboard register it won't matter
- * what we find. For the record, on the
- * 57SLC, the integrated SCSI adapter has
- * 0xffff for the adapter ID, but nonzero for
- * other registers. */
-
- found = 1;
- }
- }
- return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
- unsigned char byte;
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return 0;
-
- spin_lock_irqsave(&mca_lock, flags);
- if (mca_dev->pos_register) {
- /* Disable adapter setup, enable motherboard setup */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
- } else {
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read the appropriate register */
-
- outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- }
- spin_unlock_irqrestore(&mca_lock, flags);
-
- mca_dev->pos[reg] = byte;
-
- return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
- unsigned char byte)
-{
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return;
-
- spin_lock_irqsave(&mca_lock, flags);
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read in the appropriate register */
-
- outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
- outb_p(byte, MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- spin_unlock_irqrestore(&mca_lock, flags);
-
- /* Update the global register list, while we have the byte */
-
- mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
-{
- return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
-{
- return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
-{
- return mem;
-}
-
-
-static int __init mca_init(void)
-{
- unsigned int i, j;
- struct mca_device *mca_dev;
- unsigned char pos[8];
- short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
- struct mca_bus *bus;
-
- /*
- * WARNING: Be careful when making changes here. Putting an adapter
- * and the motherboard simultaneously into setup mode may result in
- * damage to chips (according to The Indispensable PC Hardware Book
- * by Hans-Peter Messmer). Also, we disable system interrupts (so
- * that we are not disturbed in the middle of this).
- */
-
- /* Make sure the MCA bus is present */
-
- if (mca_system_init()) {
- printk(KERN_ERR "MCA bus system initialisation failed\n");
- return -ENODEV;
- }
-
- if (!MCA_bus)
- return -ENODEV;
-
- printk(KERN_INFO "Micro Channel bus detected.\n");
-
- /* All MCA systems have at least a primary bus */
- bus = mca_attach_bus(MCA_PRIMARY_BUS);
- if (!bus)
- goto out_nomem;
- bus->default_dma_mask = 0xffffffffLL;
- bus->f.mca_write_pos = mca_pc_write_pos;
- bus->f.mca_read_pos = mca_pc_read_pos;
- bus->f.mca_transform_irq = mca_dummy_transform_irq;
- bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
- bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
- /* get the motherboard device */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
- if (unlikely(!mca_dev))
- goto out_nomem;
-
- /*
- * We do not expect many MCA interrupts during initialization,
- * but let us be safe:
- */
- spin_lock_irq(&mca_lock);
-
- /* Make sure adapter setup is off */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Read motherboard POS registers */
-
- mca_dev->pos_register = 0x7f;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for a motherboard */
- mca_dev->pos_id = MCA_MOTHERBOARD_POS;
- mca_dev->slot = MCA_MOTHERBOARD;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- /* Put motherboard into video setup mode, read integrated video
- * POS registers, and turn motherboard setup off.
- */
-
- mca_dev->pos_register = 0xdf;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for the integrated video */
- mca_dev->pos_id = MCA_INTEGVIDEO_POS;
- mca_dev->slot = MCA_INTEGVIDEO;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- /*
- * Put motherboard into scsi setup mode, read integrated scsi
- * POS registers, and turn motherboard setup off.
- *
- * It seems there are two possible SCSI registers. Martin says that
- * for the 56,57, 0xf7 is the one, but fails on the 76.
- * Alfredo (apena@vnet.ibm.com) says
- * 0xfd works on his machine. We'll try both of them. I figure it's
- * a good bet that only one could be valid at a time. This could
- * screw up though if one is used for something else on the other
- * machine.
- */
-
- for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
- outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
- if (mca_read_and_store_pos(pos))
- break;
- }
- if (which_scsi) {
- /* found a scsi card */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for integrated SCSI controller */
- mca_dev->pos_id = MCA_INTEGSCSI_POS;
- mca_dev->slot = MCA_INTEGSCSI;
- mca_dev->pos_register = which_scsi;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
-
- /* Turn off motherboard setup */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /*
- * Now loop over MCA slots: put each adapter into setup mode, and
- * read its POS registers. Then put adapter setup off.
- */
-
- for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
- outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
- if (!mca_read_and_store_pos(pos))
- continue;
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_dev->driver_loaded = 0;
- mca_dev->slot = i;
- mca_dev->pos_register = 0;
- mca_configure_adapter_status(mca_dev);
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Enable interrupts and return memory start */
- spin_unlock_irq(&mca_lock);
-
- for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
- request_resource(&ioport_resource, mca_standard_resources + i);
-
- mca_do_proc_init();
-
- return 0;
-
- out_unlock_nomem:
- spin_unlock_irq(&mca_lock);
- out_nomem:
- printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
- return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
- int slot = mca_dev->slot;
-
- if (slot == MCA_INTEGSCSI) {
- printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_INTEGVIDEO) {
- printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_MOTHERBOARD) {
- printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
- mca_dev->name);
- }
-
- /* More info available in POS 6 and 7? */
-
- if (check_flag) {
- unsigned char pos6, pos7;
-
- pos6 = mca_device_read_pos(mca_dev, 6);
- pos7 = mca_device_read_pos(mca_dev, 7);
-
- printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
- }
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
- struct mca_device *mca_dev = to_mca_device(dev);
- unsigned char pos5;
-
- pos5 = mca_device_read_pos(mca_dev, 5);
-
- if (!(pos5 & 0x80)) {
- /*
- * Bit 7 of POS 5 is reset when this adapter has a hardware
- * error. Bit 6 is reset if there's error information
- * available in POS 6 and 7.
- */
- mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
- return 1;
- }
- return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
- /*
- * First try - scan the various adapters and see if a specific
- * adapter was responsible for the error.
- */
- bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
deleted file mode 100644
index e1af7c055c7d..000000000000
--- a/arch/x86/kernel/microcode_amd.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * AMD CPU Microcode Update Driver for Linux
- * Copyright (C) 2008 Advanced Micro Devices Inc.
- *
- * Author: Peter Oruba <peter.oruba@amd.com>
- *
- * Based on work by:
- * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *
- * This driver allows to upgrade microcode on AMD
- * family 0x10 and 0x11 processors.
- *
- * Licensed under the terms of the GNU General Public
- * License version 2. See file COPYING for details.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/firmware.h>
-#include <linux/pci_ids.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba");
-MODULE_LICENSE("GPL v2");
-
-#define UCODE_MAGIC 0x00414d44
-#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
-#define UCODE_UCODE_TYPE 0x00000001
-
-struct equiv_cpu_entry {
- u32 installed_cpu;
- u32 fixed_errata_mask;
- u32 fixed_errata_compare;
- u16 equiv_cpu;
- u16 res;
-} __attribute__((packed));
-
-struct microcode_header_amd {
- u32 data_code;
- u32 patch_id;
- u16 mc_patch_data_id;
- u8 mc_patch_data_len;
- u8 init_flag;
- u32 mc_patch_data_checksum;
- u32 nb_dev_id;
- u32 sb_dev_id;
- u16 processor_rev_id;
- u8 nb_rev_id;
- u8 sb_rev_id;
- u8 bios_api_rev;
- u8 reserved1[3];
- u32 match_reg[8];
-} __attribute__((packed));
-
-struct microcode_amd {
- struct microcode_header_amd hdr;
- unsigned int mpb[0];
-};
-
-#define UCODE_MAX_SIZE 2048
-#define UCODE_CONTAINER_SECTION_HDR 8
-#define UCODE_CONTAINER_HEADER_SIZE 12
-
-static struct equiv_cpu_entry *equiv_cpu_table;
-
-static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- u32 dummy;
-
- memset(csig, 0, sizeof(*csig));
- if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
- pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
- "supported\n", cpu, c->x86);
- return -1;
- }
- rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
- pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
- return 0;
-}
-
-static int get_matching_microcode(int cpu, void *mc, int rev)
-{
- struct microcode_header_amd *mc_header = mc;
- unsigned int current_cpu_id;
- u16 equiv_cpu_id = 0;
- unsigned int i = 0;
-
- BUG_ON(equiv_cpu_table == NULL);
- current_cpu_id = cpuid_eax(0x00000001);
-
- while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
- equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
- break;
- }
- i++;
- }
-
- if (!equiv_cpu_id)
- return 0;
-
- if (mc_header->processor_rev_id != equiv_cpu_id)
- return 0;
-
- /* ucode might be chipset specific -- currently we don't support this */
- if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
- pr_err("CPU%d: loading of chipset specific code not yet supported\n",
- cpu);
- return 0;
- }
-
- if (mc_header->patch_id <= rev)
- return 0;
-
- return 1;
-}
-
-static int apply_microcode_amd(int cpu)
-{
- u32 rev, dummy;
- int cpu_num = raw_smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- struct microcode_amd *mc_amd = uci->mc;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_amd == NULL)
- return 0;
-
- wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
- /* get patch id after patching */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- /* check current patch id and patch's id for match */
- if (rev != mc_amd->hdr.patch_id) {
- pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
- cpu, mc_amd->hdr.patch_id);
- return -1;
- }
-
- pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
- uci->cpu_sig.rev = rev;
-
- return 0;
-}
-
-static int get_ucode_data(void *to, const u8 *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
-static void *
-get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
-{
- unsigned int total_size;
- u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
- void *mc;
-
- if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
- return NULL;
-
- if (section_hdr[0] != UCODE_UCODE_TYPE) {
- pr_err("error: invalid type field in container file section header\n");
- return NULL;
- }
-
- total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
-
- if (total_size > size || total_size > UCODE_MAX_SIZE) {
- pr_err("error: size mismatch\n");
- return NULL;
- }
-
- mc = vmalloc(UCODE_MAX_SIZE);
- if (mc) {
- memset(mc, 0, UCODE_MAX_SIZE);
- if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
- total_size)) {
- vfree(mc);
- mc = NULL;
- } else
- *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
- }
- return mc;
-}
-
-static int install_equiv_cpu_table(const u8 *buf)
-{
- u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
- unsigned int *buf_pos = (unsigned int *)container_hdr;
- unsigned long size;
-
- if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
- return 0;
-
- size = buf_pos[2];
-
- if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
- pr_err("error: invalid type field in container file section header\n");
- return 0;
- }
-
- equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
- if (!equiv_cpu_table) {
- pr_err("failed to allocate equivalent CPU table\n");
- return 0;
- }
-
- buf += UCODE_CONTAINER_HEADER_SIZE;
- if (get_ucode_data(equiv_cpu_table, buf, size)) {
- vfree(equiv_cpu_table);
- return 0;
- }
-
- return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
-}
-
-static void free_equiv_cpu_table(void)
-{
- vfree(equiv_cpu_table);
- equiv_cpu_table = NULL;
-}
-
-static enum ucode_state
-generic_load_microcode(int cpu, const u8 *data, size_t size)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- const u8 *ucode_ptr = data;
- void *new_mc = NULL;
- void *mc;
- int new_rev = uci->cpu_sig.rev;
- unsigned int leftover;
- unsigned long offset;
- enum ucode_state state = UCODE_OK;
-
- offset = install_equiv_cpu_table(ucode_ptr);
- if (!offset) {
- pr_err("failed to create equivalent cpu table\n");
- return UCODE_ERROR;
- }
-
- ucode_ptr += offset;
- leftover = size - offset;
-
- while (leftover) {
- unsigned int uninitialized_var(mc_size);
- struct microcode_header_amd *mc_header;
-
- mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
- if (!mc)
- break;
-
- mc_header = (struct microcode_header_amd *)mc;
- if (get_matching_microcode(cpu, mc, new_rev)) {
- vfree(new_mc);
- new_rev = mc_header->patch_id;
- new_mc = mc;
- } else
- vfree(mc);
-
- ucode_ptr += mc_size;
- leftover -= mc_size;
- }
-
- if (new_mc) {
- if (!leftover) {
- vfree(uci->mc);
- uci->mc = new_mc;
- pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
- } else {
- vfree(new_mc);
- state = UCODE_ERROR;
- }
- } else
- state = UCODE_NFOUND;
-
- free_equiv_cpu_table();
-
- return state;
-}
-
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
-{
- const char *fw_name = "amd-ucode/microcode_amd.bin";
- const struct firmware *firmware;
- enum ucode_state ret;
-
- if (request_firmware(&firmware, fw_name, device)) {
- printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
- return UCODE_NFOUND;
- }
-
- if (*(u32 *)firmware->data != UCODE_MAGIC) {
- pr_err("invalid UCODE_MAGIC (0x%08x)\n",
- *(u32 *)firmware->data);
- return UCODE_ERROR;
- }
-
- ret = generic_load_microcode(cpu, firmware->data, firmware->size);
-
- release_firmware(firmware);
-
- return ret;
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
- return UCODE_ERROR;
-}
-
-static void microcode_fini_cpu_amd(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_amd_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info_amd,
- .apply_microcode = apply_microcode_amd,
- .microcode_fini_cpu = microcode_fini_cpu_amd,
-};
-
-struct microcode_ops * __init init_amd_microcode(void)
-{
- return &microcode_amd_ops;
-}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
deleted file mode 100644
index fa6551d36c10..000000000000
--- a/arch/x86/kernel/microcode_core.c
+++ /dev/null
@@ -1,571 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to
- * speculative nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/platform_device.h>
-#include <linux/miscdevice.h>
-#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION "2.00"
-
-static struct microcode_ops *microcode_ops;
-
-/*
- * Synchronization.
- *
- * All non cpu-hotplug-callback call sites use:
- *
- * - microcode_mutex to synchronize with each other;
- * - get/put_online_cpus() to synchronize with
- * the cpu-hotplug-callback call sites.
- *
- * We guarantee that only a single cpu is being
- * updated at any particular moment of time.
- */
-static DEFINE_MUTEX(microcode_mutex);
-
-struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-EXPORT_SYMBOL_GPL(ucode_cpu_info);
-
-/*
- * Operations that are run on a target cpu:
- */
-
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
-static void collect_cpu_info_local(void *arg)
-{
- struct cpu_info_ctx *ctx = arg;
-
- ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
- ctx->cpu_sig);
-}
-
-static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
-{
- struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
- int ret;
-
- ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
-
- return ret;
-}
-
-static int collect_cpu_info(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int ret;
-
- memset(uci, 0, sizeof(*uci));
-
- ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
- if (!ret)
- uci->valid = 1;
-
- return ret;
-}
-
-struct apply_microcode_ctx {
- int err;
-};
-
-static void apply_microcode_local(void *arg)
-{
- struct apply_microcode_ctx *ctx = arg;
-
- ctx->err = microcode_ops->apply_microcode(smp_processor_id());
-}
-
-static int apply_microcode_on_target(int cpu)
-{
- struct apply_microcode_ctx ctx = { .err = 0 };
- int ret;
-
- ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
-
- return ret;
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static int do_microcode_update(const void __user *buf, size_t size)
-{
- int error = 0;
- int cpu;
-
- for_each_online_cpu(cpu) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
-
- if (!uci->valid)
- continue;
-
- ustate = microcode_ops->request_microcode_user(cpu, buf, size);
- if (ustate == UCODE_ERROR) {
- error = -1;
- break;
- } else if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- }
-
- return error;
-}
-
-static int microcode_open(struct inode *inode, struct file *file)
-{
- return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
-}
-
-static ssize_t microcode_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
-{
- ssize_t ret = -EINVAL;
-
- if ((len >> PAGE_SHIFT) > totalram_pages) {
- pr_err("too much data (max %ld pages)\n", totalram_pages);
- return ret;
- }
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- if (do_microcode_update(buf, len) == 0)
- ret = (ssize_t)len;
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- return ret;
-}
-
-static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
-};
-
-static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .nodename = "cpu/microcode",
- .fops = &microcode_fops,
-};
-
-static int __init microcode_dev_init(void)
-{
- int error;
-
- error = misc_register(&microcode_dev);
- if (error) {
- pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
- return error;
- }
-
- return 0;
-}
-
-static void microcode_dev_exit(void)
-{
- misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-MODULE_ALIAS("devname:cpu/microcode");
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while (0)
-#endif
-
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
-
-static int reload_for_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int err = 0;
-
- mutex_lock(&microcode_mutex);
- if (uci->valid) {
- enum ucode_state ustate;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- else
- if (ustate == UCODE_ERROR)
- err = -EINVAL;
- }
- mutex_unlock(&microcode_mutex);
-
- return err;
-}
-
-static ssize_t reload_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf, size_t size)
-{
- unsigned long val;
- int cpu = dev->id;
- int ret = 0;
- char *end;
-
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
- return -EINVAL;
-
- if (val == 1) {
- get_online_cpus();
- if (cpu_online(cpu))
- ret = reload_for_cpu(cpu);
- put_online_cpus();
- }
-
- if (!ret)
- ret = size;
-
- return ret;
-}
-
-static ssize_t version_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
- return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
-}
-
-static ssize_t pf_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
- return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
-}
-
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
- &attr_reload.attr,
- &attr_version.attr,
- &attr_processor_flags.attr,
- NULL
-};
-
-static struct attribute_group mc_attr_group = {
- .attrs = mc_default_attrs,
- .name = "microcode",
-};
-
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- microcode_ops->microcode_fini_cpu(cpu);
- uci->valid = 0;
-}
-
-static enum ucode_state microcode_resume_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!uci->mc)
- return UCODE_NFOUND;
-
- pr_debug("CPU%d updated upon resume\n", cpu);
- apply_microcode_on_target(cpu);
-
- return UCODE_OK;
-}
-
-static enum ucode_state microcode_init_cpu(int cpu)
-{
- enum ucode_state ustate;
-
- if (collect_cpu_info(cpu))
- return UCODE_ERROR;
-
- /* --dimm. Trigger a delayed update? */
- if (system_state != SYSTEM_RUNNING)
- return UCODE_NFOUND;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
-
- if (ustate == UCODE_OK) {
- pr_debug("CPU%d updated upon init\n", cpu);
- apply_microcode_on_target(cpu);
- }
-
- return ustate;
-}
-
-static enum ucode_state microcode_update_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
-
- if (uci->valid)
- ustate = microcode_resume_cpu(cpu);
- else
- ustate = microcode_init_cpu(cpu);
-
- return ustate;
-}
-
-static int mc_sysdev_add(struct sys_device *sys_dev)
-{
- int err, cpu = sys_dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("CPU%d added\n", cpu);
-
- err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
- if (err)
- return err;
-
- if (microcode_init_cpu(cpu) == UCODE_ERROR)
- err = -EINVAL;
-
- return err;
-}
-
-static int mc_sysdev_remove(struct sys_device *sys_dev)
-{
- int cpu = sys_dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("CPU%d removed\n", cpu);
- microcode_fini_cpu(cpu);
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- return 0;
-}
-
-static int mc_sysdev_resume(struct sys_device *dev)
-{
- int cpu = dev->id;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!cpu_online(cpu))
- return 0;
-
- /*
- * All non-bootup cpus are still disabled,
- * so only CPU 0 will apply ucode here.
- *
- * Moreover, there can be no concurrent
- * updates from any other places at this point.
- */
- WARN_ON(cpu != 0);
-
- if (uci->valid && uci->mc)
- microcode_ops->apply_microcode(cpu);
-
- return 0;
-}
-
-static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
- .resume = mc_sysdev_resume,
-};
-
-static __cpuinit int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
-
- sys_dev = get_cpu_sysdev(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- microcode_update_cpu(cpu);
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- pr_debug("CPU%d added\n", cpu);
- if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- pr_err("Failed to create group for CPU%d\n", cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- /* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
- break;
- case CPU_DEAD:
- case CPU_UP_CANCELED_FROZEN:
- /* The CPU refused to come up during a system resume */
- microcode_fini_cpu(cpu);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __refdata mc_cpu_notifier = {
- .notifier_call = mc_cpu_callback,
-};
-
-static int __init microcode_init(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
- int error;
-
- if (c->x86_vendor == X86_VENDOR_INTEL)
- microcode_ops = init_intel_microcode();
- else if (c->x86_vendor == X86_VENDOR_AMD)
- microcode_ops = init_amd_microcode();
-
- if (!microcode_ops) {
- pr_err("no support for this CPU vendor\n");
- return -ENODEV;
- }
-
- microcode_pdev = platform_device_register_simple("microcode", -1,
- NULL, 0);
- if (IS_ERR(microcode_pdev)) {
- microcode_dev_exit();
- return PTR_ERR(microcode_pdev);
- }
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- if (error) {
- platform_device_unregister(microcode_pdev);
- return error;
- }
-
- error = microcode_dev_init();
- if (error)
- return error;
-
- register_hotcpu_notifier(&mc_cpu_notifier);
-
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
-
- return 0;
-}
-module_init(microcode_init);
-
-static void __exit microcode_exit(void)
-{
- microcode_dev_exit();
-
- unregister_hotcpu_notifier(&mc_cpu_notifier);
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- platform_device_unregister(microcode_pdev);
-
- microcode_ops = NULL;
-
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
-}
-module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
deleted file mode 100644
index 356170262a93..000000000000
--- a/arch/x86/kernel/microcode_intel.c
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to
- * speculative nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-
-#include <asm/microcode.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-struct microcode_header_intel {
- unsigned int hdrver;
- unsigned int rev;
- unsigned int date;
- unsigned int sig;
- unsigned int cksum;
- unsigned int ldrver;
- unsigned int pf;
- unsigned int datasize;
- unsigned int totalsize;
- unsigned int reserved[3];
-};
-
-struct microcode_intel {
- struct microcode_header_intel hdr;
- unsigned int bits[0];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
- unsigned int sig;
- unsigned int pf;
- unsigned int cksum;
-};
-
-struct extended_sigtable {
- unsigned int count;
- unsigned int cksum;
- unsigned int reserved[3];
- struct extended_signature sigs[0];
-};
-
-#define DEFAULT_UCODE_DATASIZE (2000)
-#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
-#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
-#define DWSIZE (sizeof(u32))
-
-#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.totalsize ? \
- ((struct microcode_intel *)mc)->hdr.totalsize : \
- DEFAULT_UCODE_TOTALSIZE)
-
-#define get_datasize(mc) \
- (((struct microcode_intel *)mc)->hdr.datasize ? \
- ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
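/*
 * Editor's illustrative sketch, not part of the deleted code: the
 * sigmatch() rule above as a plain function -- signatures must be equal,
 * and the processor-flag masks must intersect, except that two zero pf
 * values (old CPUs, per the 1.14 changelog) also match.
 */
#include <stdio.h>

static int ex_sigmatch(unsigned s1, unsigned s2, unsigned p1, unsigned p2)
{
	return s1 == s2 && ((p1 & p2) || (p1 == 0 && p2 == 0));
}

int main(void)
{
	printf("%d\n", ex_sigmatch(0x106a5, 0x106a5, 0x02, 0x12));	/* 1: flags overlap */
	printf("%d\n", ex_sigmatch(0x106a5, 0x106a5, 0x02, 0x10));	/* 0: no overlap */
	printf("%d\n", ex_sigmatch(0x106a5, 0x106a5, 0x00, 0x00));	/* 1: legacy pf == 0 */
	return 0;
}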
-static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned int val[2];
-
- memset(csig, 0, sizeof(*csig));
-
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- pr_err("CPU%d not a capable Intel processor\n", cpu_num);
- return -1;
- }
-
- csig->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig->pf = 1 << ((val[1] >> 18) & 7);
- }
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
-
- pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
- cpu_num, csig->sig, csig->pf, csig->rev);
-
- return 0;
-}
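
(Aside, not part of the patch: collect_cpu_info() turns bits 52:50 of MSR_IA32_PLATFORM_ID, which land in bits 20:18 of the high dword read by rdmsr, into a one-hot platform-flag mask. A sketch of just that derivation; the sample MSR value is invented:)

/* Editor's sketch: one-hot pf mask from the high half of MSR 0x17. */
#include <stdio.h>

static unsigned pf_from_platform_id_high(unsigned hi)
{
	/* MSR bits 52:50 are bits 20:18 of the high dword */
	return 1u << ((hi >> 18) & 7);
}

int main(void)
{
	unsigned hi = 0x00140000;	/* hypothetical: platform id = 5 */
	printf("pf = 0x%x\n", pf_from_platform_id_high(hi));	/* 0x20 */
	return 0;
}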
-
-static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
-{
- return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
-}
-
-static inline int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- int sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- pr_err("error! Bad data size in microcode data file\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- pr_err("error! Unknown microcode update format\n");
- return -EINVAL;
- }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- pr_err("error! Small exttable size in microcode data file\n");
- return -EINVAL;
- }
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- pr_err("error! Bad exttable size in microcode data file\n");
- return -EFAULT;
- }
- ext_sigcount = ext_header->count;
- }
-
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int *ext_tablep = (int *)ext_header;
-
- i = ext_table_size / DWSIZE;
- while (i--)
- ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- pr_warning("aborting, bad extended signature table checksum\n");
- return -EINVAL;
- }
- }
-
- /* calculate the checksum */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--)
- orig_sum += ((int *)mc)[i];
- if (orig_sum) {
- pr_err("aborting, bad checksum\n");
- return -EINVAL;
- }
- if (!ext_table_size)
- return 0;
- /* check extended signature checksum */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
- sum = orig_sum
- - (mc_header->sig + mc_header->pf + mc_header->cksum)
- + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- pr_err("aborting, bad checksum\n");
- return -EINVAL;
- }
- }
- return 0;
-}
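
(Aside, not part of the patch: the sanity check relies on updates being built so that all 32-bit words of the header-plus-data region sum to zero modulo 2^32; swapping in an extended-table signature only changes three dwords, which is why the per-entry check can reuse orig_sum with a small adjustment. A minimal user-space sketch of the zero-sum property on a toy buffer:)

/* Editor's sketch: the mod-2^32 zero-sum check, on a fabricated buffer
 * whose last word is chosen as the two's-complement fix-up. */
#include <stdio.h>
#include <stdint.h>

static uint32_t dword_sum(const uint32_t *p, size_t ndwords)
{
	uint32_t sum = 0;
	while (ndwords--)
		sum += p[ndwords];	/* wraps mod 2^32, like the int sum above */
	return sum;
}

int main(void)
{
	uint32_t buf[4] = { 0x11111111, 0x22222222, 0x33333333, 0 };

	buf[3] = -(buf[0] + buf[1] + buf[2]);	/* checksum fix-up word */
	printf("sum = 0x%x\n", (unsigned)dword_sum(buf, 4));	/* prints 0 */
	return 0;
}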
-
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int
-get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
-{
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
-
- if (!update_match_revision(mc_header, rev))
- return 0;
-
- if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
- return 0;
-
- ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-static int apply_microcode(int cpu)
-{
- struct microcode_intel *mc_intel;
- struct ucode_cpu_info *uci;
- unsigned int val[2];
- int cpu_num;
-
- cpu_num = raw_smp_processor_id();
- uci = ucode_cpu_info + cpu;
- mc_intel = uci->mc;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_intel == NULL)
- return 0;
-
- /* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) mc_intel->bits,
- (unsigned long) mc_intel->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* see notes above for revision 1.07. Apparent chip bug */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- if (val[1] != mc_intel->hdr.rev) {
- pr_err("CPU%d update to revision 0x%x failed\n",
- cpu_num, mc_intel->hdr.rev);
- return -1;
- }
- pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
- cpu_num, val[1],
- mc_intel->hdr.date & 0xffff,
- mc_intel->hdr.date >> 24,
- (mc_intel->hdr.date >> 16) & 0xff);
-
- uci->cpu_sig.rev = val[1];
-
- return 0;
-}
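
(Aside, not part of the patch: two details above are easy to miss. MSR_IA32_UCODE_WRITE takes the linear address of the update split across edx:eax, and the ">> 16 >> 16" double shift gives a well-defined 0 for the high half on a 32-bit build, where a single ">> 32" on a 32-bit long would be undefined behaviour. The hdr.date field packs hex digits as 0xMMDDYYYY, which is what the %04x-%02x-%02x printk above decodes. A sketch of both, with invented values:)

/* Editor's sketch: UB-free address split plus hdr.date decoding. */
#include <stdio.h>

int main(void)
{
	unsigned long addr = 0xc0123456UL;	/* hypothetical ucode address */

	/* 0 on 32-bit longs, the real high half on 64-bit */
	unsigned lo = (unsigned)addr;
	unsigned hi = (unsigned)(addr >> 16 >> 16);
	printf("edx:eax = %08x:%08x\n", hi, lo);

	/* 0xMMDDYYYY in hex digits: 0x06152004 -> 2004-06-15 */
	unsigned date = 0x06152004;
	printf("%04x-%02x-%02x\n", date & 0xffff, date >> 24, (date >> 16) & 0xff);
	return 0;
}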
-
-static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
- int new_rev = uci->cpu_sig.rev;
- unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
- unsigned int curr_mc_size = 0;
-
- while (leftover) {
- struct microcode_header_intel mc_header;
- unsigned int mc_size;
-
- if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
- break;
-
- mc_size = get_totalsize(&mc_header);
- if (!mc_size || mc_size > leftover) {
- pr_err("error! Bad data in microcode data file\n");
- break;
- }
-
- /* For performance reasons, reuse mc area when possible */
- if (!mc || mc_size > curr_mc_size) {
- if (mc)
- vfree(mc);
- mc = vmalloc(mc_size);
- if (!mc)
- break;
- curr_mc_size = mc_size;
- }
-
- if (get_ucode_data(mc, ucode_ptr, mc_size) ||
- microcode_sanity_check(mc) < 0) {
- vfree(mc);
- break;
- }
-
- if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
- if (new_mc)
- vfree(new_mc);
- new_rev = mc_header.rev;
- new_mc = mc;
- mc = NULL; /* trigger new vmalloc */
- }
-
- ucode_ptr += mc_size;
- leftover -= mc_size;
- }
-
- if (mc)
- vfree(mc);
-
- if (leftover) {
- if (new_mc)
- vfree(new_mc);
- state = UCODE_ERROR;
- goto out;
- }
-
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
- }
-
- if (uci->mc)
- vfree(uci->mc);
- uci->mc = (struct microcode_intel *)new_mc;
-
- pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
-out:
- return state;
-}
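
(Aside, not part of the patch: the loop above is a keep-the-best-candidate scan over a file that may concatenate several updates; each blob's totalsize advances the cursor, the scratch buffer is reused when big enough, and a better match steals the buffer by nulling mc. A sketch of the same cursor walk over length-prefixed records; the record layout here is invented and assumes little-endian:)

/* Editor's sketch: walking concatenated records, header-peek first. */
#include <stdio.h>
#include <string.h>

static void walk(const unsigned char *p, size_t left)
{
	while (left >= 4) {
		unsigned size;

		memcpy(&size, p, 4);		/* peek the header, like mc_header */
		if (!size || size > left)	/* truncated or bogus record */
			break;
		printf("record of %u bytes\n", size);
		p += size;			/* size covers header + payload */
		left -= size;
	}
}

int main(void)
{
	/* little-endian sizes: one 8-byte record, one 4-byte record */
	unsigned char buf[12] = { 8, 0, 0, 0, 'a', 'b', 'c', 'd', 4, 0, 0, 0 };
	walk(buf, sizeof(buf));
	return 0;
}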
-
-static int get_ucode_fw(void *to, const void *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
-{
- char name[30];
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- const struct firmware *firmware;
- enum ucode_state ret;
-
- sprintf(name, "intel-ucode/%02x-%02x-%02x",
- c->x86, c->x86_model, c->x86_mask);
-
- if (request_firmware(&firmware, name, device)) {
- pr_debug("data file %s load failed\n", name);
- return UCODE_NFOUND;
- }
-
- ret = generic_load_microcode(cpu, (void *)firmware->data,
- firmware->size, &get_ucode_fw);
-
- release_firmware(firmware);
-
- return ret;
-}
-
-static int get_ucode_user(void *to, const void *from, size_t n)
-{
- return copy_from_user(to, from, n);
-}
-
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
-static struct microcode_ops microcode_intel_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode,
- .microcode_fini_cpu = microcode_fini_cpu,
-};
-
-struct microcode_ops * __init init_intel_microcode(void)
-{
- return &microcode_intel_ops;
-}
-
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd44..ef6104e7cc72 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* AMD Family 10h mmconfig enablement
*/
@@ -8,6 +9,7 @@
#include <linux/pci.h>
#include <linux/dmi.h>
#include <linux/range.h>
+#include <linux/acpi.h>
#include <asm/pci-direct.h>
#include <linux/sort.h>
@@ -24,15 +26,14 @@ struct pci_hostbridge_probe {
u32 device;
};
-static u64 __cpuinitdata fam10h_pci_mmconf_base;
-static int __cpuinitdata fam10h_pci_mmconf_base_status;
+static u64 fam10h_pci_mmconf_base;
-static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
+static struct pci_hostbridge_probe pci_probes[] = {
{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-static int __cpuinit cmp_range(const void *x1, const void *x2)
+static int cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
const struct range *r2 = x2;
@@ -44,11 +45,13 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
return start1 - start2;
}
-/*[47:0] */
-/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */
+#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
+#define MMCONF_MASK (~(MMCONF_UNIT - 1))
+#define MMCONF_SIZE (MMCONF_UNIT << 8)
+/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
-#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32)))
-static void __cpuinit get_fam10h_pci_mmconf_base(void)
+#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
+static void get_fam10h_pci_mmconf_base(void)
{
int i;
unsigned bus;
@@ -64,12 +67,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
struct range range[8];
/* only try to get setting from BSP */
- /* -1 or 1 */
- if (fam10h_pci_mmconf_base_status)
+ if (fam10h_pci_mmconf_base)
return;
if (!early_pci_allowed())
- goto fail;
+ return;
found = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,24 +93,24 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
}
if (!found)
- goto fail;
+ return;
/* SYS_CFG */
- address = MSR_K8_SYSCFG;
- rdmsrl(address, val);
+ address = MSR_AMD64_SYSCFG;
+ rdmsrq(address, val);
/* TOP_MEM2 is not enabled? */
if (!(val & (1<<21))) {
- tom2 = 0;
+ tom2 = 1ULL << 32;
} else {
/* TOP_MEM2 */
address = MSR_K8_TOP_MEM2;
- rdmsrl(address, val);
- tom2 = val & (0xffffULL<<32);
+ rdmsrq(address, val);
+ tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
}
if (base <= tom2)
- base = tom2 + (1ULL<<32);
+ base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
/*
* need to check if the range is in the high mmio range that is
@@ -123,11 +125,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (!(reg & 3))
continue;
- start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
- end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
- if (!end)
+ if (end < tom2)
continue;
range[hi_mmio_num].start = start;
@@ -143,35 +145,30 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (range[hi_mmio_num - 1].end < base)
goto out;
- if (range[0].start > base)
+ if (range[0].start > base + MMCONF_SIZE)
goto out;
/* need to find one window */
- base = range[0].start - (1ULL << 32);
+ base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
if ((base > tom2) && BASE_VALID(base))
goto out;
- base = range[hi_mmio_num - 1].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
+ base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ if (BASE_VALID(base))
goto out;
/* need to find window between ranges */
- if (hi_mmio_num > 1)
- for (i = 0; i < hi_mmio_num - 1; i++) {
- if (range[i + 1].start > (range[i].end + (1ULL << 32))) {
- base = range[i].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
- goto out;
- }
+ for (i = 1; i < hi_mmio_num; i++) {
+ base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ val = range[i].start & MMCONF_MASK;
+ if (val >= base + MMCONF_SIZE && BASE_VALID(base))
+ goto out;
}
-
-fail:
- fam10h_pci_mmconf_base_status = -1;
return;
+
out:
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
}
-void __cpuinit fam10h_check_enable_mmcfg(void)
+void fam10h_check_enable_mmcfg(void)
{
u64 val;
u32 address;
@@ -180,7 +177,7 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
return;
address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, val);
+ rdmsrq(address, val);
/* try to make sure that AP's setting is identical to BSP setting */
if (val & FAM10H_MMIO_CONF_ENABLE) {
@@ -190,11 +187,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
/* only trust the one handle 256 buses, if acpi=off */
if (!acpi_pci_disabled || busnbits >= 8) {
- u64 base;
- base = val & (0xffffULL << 32);
- if (fam10h_pci_mmconf_base_status <= 0) {
+ u64 base = val & MMCONF_MASK;
+
+ if (!fam10h_pci_mmconf_base) {
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
return;
} else if (fam10h_pci_mmconf_base == base)
return;
@@ -206,24 +202,26 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
* with 256 buses
*/
get_fam10h_pci_mmconf_base();
- if (fam10h_pci_mmconf_base_status <= 0)
+ if (!fam10h_pci_mmconf_base) {
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
return;
+ }
printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
(FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
FAM10H_MMIO_CONF_ENABLE;
- wrmsrl(address, val);
+ wrmsrq(address, val);
}
-static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
+static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
{
pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
return 0;
}
-static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
+static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
@@ -234,7 +232,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{}
};
-void __cpuinit check_enable_amd_mmconf_dmi(void)
+/* Called from a non __init function, but only on the BSP. */
+void __ref check_enable_amd_mmconf_dmi(void)
{
dmi_check_system(mmconf_dmi_table);
}
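
(Aside, not part of the patch: the rework replaces ad-hoc 1ULL<<32 arithmetic with unit-based rounding. Assuming FAM10H_MMIO_CONF_BASE_SHIFT is 20, MMCONF_UNIT is 1 MiB and the 256-bus window MMCONF_SIZE is 256 MiB; candidate bases are produced with the usual round-up idiom (x + unit - 1) & ~(unit - 1). A standalone sketch with a made-up top-of-memory value:)

/* Editor's sketch: the round-up-to-unit idiom used for base selection. */
#include <stdio.h>
#include <stdint.h>

#define UNIT	(1ULL << 20)
#define MASK	(~(UNIT - 1))
#define SIZE	(UNIT << 8)

int main(void)
{
	uint64_t tom2 = 0x123456789ULL;	/* hypothetical TOP_MEM2 */

	/* round up and keep at least one full unit of slack above tom2 */
	uint64_t base = (tom2 + 2 * UNIT - 1) & MASK;
	printf("base = %#llx, end = %#llx\n",
	       (unsigned long long)base,
	       (unsigned long long)(base + SIZE));
	return 0;
}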
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e0bc186d7501..11c45ce42694 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -1,73 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Kernel module help for x86.
Copyright (C) 2001 Rusty Russell.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
+*/
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
#include <linux/moduleloader.h>
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kasan.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
+#include <linux/jump_label.h>
+#include <linux/random.h>
+#include <linux/memory.h>
+#include <linux/stackprotector.h>
-#include <asm/system.h>
+#include <asm/text-patching.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/unwind.h>
#if 0
-#define DEBUGP printk
+#define DEBUGP(fmt, ...) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__)
#else
-#define DEBUGP(fmt...)
+#define DEBUGP(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
#endif
-void *module_alloc(unsigned long size)
-{
- struct vm_struct *area;
-
- if (!size)
- return NULL;
- size = PAGE_ALIGN(size);
- if (size > MODULES_LEN)
- return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC);
-}
-
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
- vfree(module_region);
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
@@ -80,8 +49,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
Elf32_Sym *sym;
uint32_t *location;
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
/* This is where to make the change */
location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -97,44 +66,41 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
- /* Add the value, subtract its postition */
+ case R_386_PLT32:
+ /* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
default:
- printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+ pr_err("%s: Unknown relocation: %u\n",
me->name, ELF32_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
}
return 0;
}
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
-{
- printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
- me->name);
- return -ENOEXEC;
-}
#else /*X86_64*/
-int apply_relocate_add(Elf64_Shdr *sechdrs,
+static int __write_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
- struct module *me)
+ struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len),
+ bool apply)
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
u64 val;
+ u64 zero = 0ULL;
+
+ DEBUGP("%s relocate section %u to %u\n",
+ apply ? "Applying" : "Clearing",
+ relsec, sechdrs[relsec].sh_info);
- DEBUGP("Applying relocate section %u to %u\n", relsec,
- sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ size_t size;
+
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
@@ -145,60 +111,130 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
+ ELF64_R_SYM(rel[i].r_info);
DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info),
- sym->st_value, rel[i].r_addend, (u64)loc);
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
val = sym->st_value + rel[i].r_addend;
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
- break;
+ continue; /* nothing to write */
case R_X86_64_64:
- *(u64 *)loc = val;
+ size = 8;
break;
case R_X86_64_32:
- *(u32 *)loc = val;
- if (val != *(u32 *)loc)
+ if (val != *(u32 *)&val)
goto overflow;
+ size = 4;
break;
case R_X86_64_32S:
- *(s32 *)loc = val;
- if ((s64)val != *(s32 *)loc)
+ if ((s64)val != *(s32 *)&val)
goto overflow;
+ size = 4;
break;
+#if defined(CONFIG_STACKPROTECTOR) && \
+ defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
+ case R_X86_64_REX_GOTPCRELX: {
+ static unsigned long __percpu *const addr = &__stack_chk_guard;
+
+ if (sym->st_value != (u64)addr) {
+ pr_err("%s: Unsupported GOTPCREL relocation\n", me->name);
+ return -ENOEXEC;
+ }
+
+ val = (u64)&addr + rel[i].r_addend;
+ fallthrough;
+ }
+#endif
case R_X86_64_PC32:
+ case R_X86_64_PLT32:
val -= (u64)loc;
- *(u32 *)loc = val;
-#if 0
- if ((s64)val != *(s32 *)loc)
- goto overflow;
-#endif
+ size = 4;
+ break;
+ case R_X86_64_PC64:
+ val -= (u64)loc;
+ size = 8;
break;
default:
- printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
+ pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
+
+ if (apply) {
+ if (memcmp(loc, &zero, size)) {
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &val, size);
+ } else {
+ if (memcmp(loc, &val, size)) {
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n",
+ relsec, i, (int)ELF64_R_TYPE(rel[i].r_info),
+ (unsigned long)loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &zero, size);
+ }
}
return 0;
overflow:
- printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), val);
- printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+ pr_err("overflow in relocation type %d val %llx sec %u idx %d\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i);
+ pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
me->name);
return -ENOEXEC;
}
-int apply_relocate(Elf_Shdr *sechdrs,
+static int write_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me,
+ bool apply)
+{
+ int ret;
+ bool early = me->state == MODULE_STATE_UNFORMED;
+ void *(*write)(void *, const void *, size_t) = memcpy;
+
+ if (!early) {
+ write = text_poke;
+ mutex_lock(&text_mutex);
+ }
+
+ ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write, apply);
+
+ if (!early) {
+ smp_text_poke_sync_each_cpu();
+ mutex_unlock(&text_mutex);
+ }
+
+ return ret;
+}
+
+int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
- printk(KERN_ERR "non add relocation not supported\n");
- return -ENOSYS;
+ return write_relocate_add(sechdrs, strtab, symindex, relsec, me, true);
+}
+
+#ifdef CONFIG_LIVEPATCH
+void clear_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ write_relocate_add(sechdrs, strtab, symindex, relsec, me, false);
}
+#endif
#endif
@@ -206,44 +242,97 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
- *para = NULL;
+ const Elf_Shdr *s, *alt = NULL, *locks = NULL,
+ *orc = NULL, *orc_ip = NULL,
+ *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
+ *calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
+ if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+ orc = s;
+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+ orc_ip = s;
+ if (!strcmp(".retpoline_sites", secstrings + s->sh_name))
+ retpolines = s;
+ if (!strcmp(".return_sites", secstrings + s->sh_name))
+ returns = s;
+ if (!strcmp(".call_sites", secstrings + s->sh_name))
+ calls = s;
+ if (!strcmp(".cfi_sites", secstrings + s->sh_name))
+ cfi = s;
+ if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
+ ibt_endbr = s;
}
+ its_init_mod(me);
+
+ if (retpolines || cfi) {
+ void *rseg = NULL, *cseg = NULL;
+ unsigned int rsize = 0, csize = 0;
+
+ if (retpolines) {
+ rseg = (void *)retpolines->sh_addr;
+ rsize = retpolines->sh_size;
+ }
+
+ if (cfi) {
+ cseg = (void *)cfi->sh_addr;
+ csize = cfi->sh_size;
+ }
+
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
+ }
+ if (retpolines) {
+ void *rseg = (void *)retpolines->sh_addr;
+ apply_retpolines(rseg, rseg + retpolines->sh_size);
+ }
+
+ its_fini_mod(me);
+
+ if (returns) {
+ void *rseg = (void *)returns->sh_addr;
+ apply_returns(rseg, rseg + returns->sh_size);
+ }
+ if (calls) {
+ struct callthunk_sites cs = {};
+
+ cs.call_start = (void *)calls->sh_addr;
+ cs.call_end = (void *)calls->sh_addr + calls->sh_size;
+
+ callthunks_patch_module_calls(&cs, me);
+ }
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
- if (locks && text) {
+ if (ibt_endbr) {
+ void *iseg = (void *)ibt_endbr->sh_addr;
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
+ }
+ if (locks) {
void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
+ void *text = me->mem[MOD_TEXT].base;
+ void *text_end = text + me->mem[MOD_TEXT].size;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
+ text, text_end);
}
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
- return module_bug_finalize(hdr, sechdrs, me);
+ return 0;
}
void module_arch_cleanup(struct module *mod)
{
alternatives_smp_module_del(mod);
- module_bug_cleanup(mod);
+ its_free_mod(mod);
}
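
(Aside, not part of the patch: the module.c rewrite funnels every relocation type through one size/value computation: absolute types store sym->st_value + addend in 4 or 8 bytes, with truncation checks for the 32-bit forms, while PC-relative types subtract the patch location first; the memcmp guards then make apply/clear verifiable, and text_poke replaces memcpy once the module is live so the text mapping stays read-only. A sketch of the core arithmetic with the ELF plumbing stripped away; addresses are invented:)

/* Editor's sketch: relocation value computation, standalone. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sym = 0xffffffffa0001000ULL;	/* hypothetical symbol */
	int64_t addend = -4;			/* typical for call fixups */
	uint64_t loc = 0xffffffffa0100020ULL;	/* hypothetical patch site */

	uint64_t val = sym + addend;		/* R_X86_64_64 stores this */

	/* R_X86_64_32 check: the value must survive truncation */
	if (val != (uint32_t)val)
		printf("would overflow R_X86_64_32\n");

	/* R_X86_64_PC32 / PLT32: PC-relative, 4 bytes */
	int32_t rel = (int32_t)(val - loc);
	printf("pc32 = %d\n", rel);
	return 0;
}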
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..4a1b1b28abf9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Intel Multiprocessor Specification 1.1 and 1.4
* compliant MP-table parsing routines.
@@ -10,23 +11,23 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/delay.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
-#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
+#include <asm/acpi.h>
+#include <asm/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
#include <asm/proto.h>
#include <asm/bios_ebda.h>
-#include <asm/e820.h>
-#include <asm/trampoline.h>
+#include <asm/e820/api.h>
#include <asm/setup.h>
#include <asm/smp.h>
@@ -35,6 +36,8 @@
* Checksum an MP configuration block.
*/
+static unsigned int num_procs __initdata;
+
static int __init mpf_checksum(unsigned char *mp, int len)
{
int sum = 0;
@@ -45,175 +48,88 @@ static int __init mpf_checksum(unsigned char *mp, int len)
return sum & 0xFF;
}
-int __init default_mpc_apic_id(struct mpc_cpu *m)
-{
- return m->apicid;
-}
-
static void __init MP_processor_info(struct mpc_cpu *m)
{
- int apicid;
char *bootup_cpu = "";
- if (!(m->cpuflag & CPU_ENABLED)) {
- disabled_cpus++;
+ topology_register_apic(m->apicid, CPU_ACPIID_INVALID, m->cpuflag & CPU_ENABLED);
+ if (!(m->cpuflag & CPU_ENABLED))
return;
- }
- apicid = x86_init.mpparse.mpc_apic_id(m);
-
- if (m->cpuflag & CPU_BOOTPROCESSOR) {
+ if (m->cpuflag & CPU_BOOTPROCESSOR)
bootup_cpu = " (Bootup-CPU)";
- boot_cpu_physical_apicid = m->apicid;
- }
- printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
- generic_processor_info(apicid, m->apicver);
+ pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
+ num_procs++;
}
#ifdef CONFIG_X86_IO_APIC
-void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
+static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str)
{
memcpy(str, m->bustype, 6);
str[6] = 0;
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+ apic_pr_verbose("Bus #%d is %s\n", m->busid, str);
}
static void __init MP_bus_info(struct mpc_bus *m)
{
char str[7];
- x86_init.mpparse.mpc_oem_bus_info(m, str);
+ mpc_oem_bus_info(m, str);
#if MAX_MP_BUSSES < 256
if (m->busid >= MAX_MP_BUSSES) {
- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
- " is too large, max. supported is %d\n",
- m->busid, str, MAX_MP_BUSSES - 1);
+ pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+ m->busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
+ set_bit(m->busid, mp_bus_not_pci);
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
- if (x86_init.mpparse.mpc_oem_pci_bus)
- x86_init.mpparse.mpc_oem_pci_bus(m);
-
clear_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+ pr_warn("Unknown bustype %s - ignoring\n", str);
}
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
- if (!(m->flags & MPC_APIC_USABLE))
- return;
-
- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->apicid, m->apicver, m->apicaddr);
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_LEGACY,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
- mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
-}
-
-static void print_MP_intsrc_info(struct mpc_intsrc *m)
-{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
- m->srcbusirq, m->dstapic, m->dstirq);
+ if (m->flags & MPC_APIC_USABLE)
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
}
static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
mp_irq->irqtype, mp_irq->irqflag & 3,
(mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
-static void __init assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- mp_irq->dstapic = m->dstapic;
- mp_irq->type = m->type;
- mp_irq->irqtype = m->irqtype;
- mp_irq->irqflag = m->irqflag;
- mp_irq->srcbus = m->srcbus;
- mp_irq->srcbusirq = m->srcbusirq;
- mp_irq->dstirq = m->dstirq;
-}
-
-static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- m->dstapic = mp_irq->dstapic;
- m->type = mp_irq->type;
- m->irqtype = mp_irq->irqtype;
- m->irqflag = mp_irq->irqflag;
- m->srcbus = mp_irq->srcbus;
- m->srcbusirq = mp_irq->srcbusirq;
- m->dstirq = mp_irq->dstirq;
-}
-
-static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- if (mp_irq->dstapic != m->dstapic)
- return 1;
- if (mp_irq->type != m->type)
- return 2;
- if (mp_irq->irqtype != m->irqtype)
- return 3;
- if (mp_irq->irqflag != m->irqflag)
- return 4;
- if (mp_irq->srcbus != m->srcbus)
- return 5;
- if (mp_irq->srcbusirq != m->srcbusirq)
- return 6;
- if (mp_irq->dstirq != m->dstirq)
- return 7;
-
- return 0;
-}
-
-static void __init MP_intsrc_info(struct mpc_intsrc *m)
-{
- int i;
-
- print_MP_intsrc_info(m);
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
#else /* CONFIG_X86_IO_APIC */
static inline void __init MP_bus_info(struct mpc_bus *m) {}
static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
-static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
#endif /* CONFIG_X86_IO_APIC */
-
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
- apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
m->srcbusirq, m->destapic, m->destapiclint);
}
@@ -221,39 +137,37 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
/*
* Read/parse the MPC
*/
-
static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
- printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->signature[0], mpc->signature[1],
mpc->signature[2], mpc->signature[3]);
return 0;
}
if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
- printk(KERN_ERR "MPTABLE: checksum error!\n");
+ pr_err("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->spec != 0x01 && mpc->spec != 0x04) {
- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->spec);
+ pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
return 0;
}
if (!mpc->lapic) {
- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+ pr_err("MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(oem, mpc->oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+ pr_info("MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->productid, 12);
str[12] = 0;
- printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+ pr_info("MPTABLE: Product ID: %s\n", str);
- printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+ pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
return 1;
}
@@ -266,14 +180,12 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
{
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
- "type %x\n", *mpt);
+ pr_err("Your mptable is wrong, contact your HW vendor!\n");
+ pr_cont("type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
1, mpc, mpc->length, 1);
}
-void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
-
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -285,24 +197,14 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
-#ifdef CONFIG_X86_32
- generic_mps_oem_check(mpc, oem, str);
-#endif
- /* save the local APIC address, it might be non-default */
- if (!acpi_lapic)
- mp_lapic_addr = mpc->lapic;
-
- if (early)
+ if (early) {
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
return 1;
+ }
- if (mpc->oemptr)
- x86_init.mpparse.smp_read_mpc_oem(mpc);
-
- /*
- * Now process the configuration blocks.
- */
- x86_init.mpparse.mpc_record(0);
-
+ /* Now process the configuration blocks. */
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -320,7 +222,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
break;
case MP_INTSRC:
- MP_intsrc_info((struct mpc_intsrc *)mpt);
+ mp_save_irq((struct mpc_intsrc *)mpt);
skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
break;
case MP_LINTSRC:
@@ -333,12 +235,11 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
count = mpc->length;
break;
}
- x86_init.mpparse.mpc_record(1);
}
- if (!num_processors)
- printk(KERN_ERR "MPTABLE: no processors registered!\n");
- return num_processors;
+ if (!num_procs && !acpi_lapic)
+ pr_err("MPTABLE: no processors registered!\n");
+ return num_procs || acpi_lapic;
}
#ifdef CONFIG_X86_IO_APIC
@@ -347,7 +248,7 @@ static int __init ELCR_trigger(unsigned int irq)
{
unsigned int port;
- port = 0x4d0 + (irq >> 3);
+ port = PIC_ELCR1 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
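
(Aside, not part of the patch: ELCR_trigger() reads the edge/level control registers behind the legacy PICs; IRQs 0-7 live in the byte at PIC_ELCR1, historically port 0x4d0, IRQs 8-15 in the next port, one bit per line, 1 meaning level-triggered. A tiny worked example of the indexing:)

/* Editor's sketch: ELCR port/bit indexing for a few sample IRQs. */
#include <stdio.h>

int main(void)
{
	for (int irq = 0; irq < 16; irq += 5)
		printf("irq %2d -> port 0x%x bit %d\n",
		       irq, 0x4d0 + (irq >> 3), irq & 7);
	return 0;	/* e.g. irq 11 lands at port 0x4d1, bit 3 */
}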
@@ -358,9 +259,9 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
int ELCR_fallback = 0;
intsrc.type = MP_INTSRC;
- intsrc.irqflag = 0; /* conforming */
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
intsrc.srcbus = 0;
- intsrc.dstapic = mp_ioapics[0].apicid;
+ intsrc.dstapic = mpc_ioapic_id(0);
intsrc.irqtype = mp_INT;
@@ -373,16 +274,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* If it does, we assume it's valid.
*/
if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
- "falling back to ELCR\n");
+ pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
ELCR_trigger(13))
- printk(KERN_ERR "ELCR contains invalid data... "
- "not using ELCR\n");
+ pr_err("ELCR contains invalid data... not using ELCR\n");
else {
- printk(KERN_INFO
- "Using ELCR to identify PCI interrupts\n");
+ pr_info("Using ELCR to identify PCI interrupts\n");
ELCR_fallback = 1;
}
}
@@ -392,7 +290,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
case 2:
if (i == 0 || i == 13)
continue; /* IRQ0 & IRQ13 not connected */
- /* fall through */
+ fallthrough;
default:
if (i == 2)
continue; /* IRQ2 is never connected */
@@ -404,21 +302,24 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* copy that information over to the MP table in the
* irqflag field (level sensitive, active high polarity).
*/
- if (ELCR_trigger(i))
- intsrc.irqflag = 13;
- else
- intsrc.irqflag = 0;
+ if (ELCR_trigger(i)) {
+ intsrc.irqflag = MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_HIGH;
+ } else {
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT |
+ MP_IRQPOL_DEFAULT;
+ }
}
intsrc.srcbusirq = i;
intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
- MP_intsrc_info(&intsrc);
+ mp_save_irq(&intsrc);
}
intsrc.irqtype = mp_ExtINT;
intsrc.srcbusirq = 0;
intsrc.dstirq = 0; /* 8259A to INTIN0 */
- MP_intsrc_info(&intsrc);
+ mp_save_irq(&intsrc);
}
@@ -431,9 +332,9 @@ static void __init construct_ioapic_table(int mpc_default_type)
bus.busid = 0;
switch (mpc_default_type) {
default:
- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ pr_err("???\nUnknown standard configuration %d\n",
mpc_default_type);
- /* fall through */
+ fallthrough;
case 1:
case 5:
memcpy(bus.bustype, "ISA ", 6);
@@ -443,9 +344,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
case 3:
memcpy(bus.bustype, "EISA ", 6);
break;
- case 4:
- case 7:
- memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -478,11 +376,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
int i;
/*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
* 2 CPUs, numbered 0 & 1.
*/
processor.type = MP_PROCESSOR;
@@ -490,8 +383,8 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
processor.cpuflag = CPU_ENABLED;
processor.cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.featureflag = boot_cpu_data.x86_capability[0];
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping;
+ processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
processor.reserved[0] = 0;
processor.reserved[1] = 0;
for (i = 0; i < 2; i++) {
@@ -502,7 +395,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
construct_ioapic_table(mpc_default_type);
lintsrc.type = MP_LINTSRC;
- lintsrc.irqflag = 0; /* conforming */
+ lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
lintsrc.srcbusid = 0;
lintsrc.srcbusirq = 0;
lintsrc.destapic = MP_APIC_ALL;
@@ -513,17 +406,18 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
}
}
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
+static bool mpf_found;
static unsigned long __init get_mpc_size(unsigned long physptr)
{
struct mpc_table *mpc;
unsigned long size;
- mpc = early_ioremap(physptr, PAGE_SIZE);
+ mpc = early_memremap(physptr, PAGE_SIZE);
size = mpc->length;
- early_iounmap(mpc, PAGE_SIZE);
- apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+ early_memunmap(mpc, PAGE_SIZE);
+ apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size);
return size;
}
@@ -534,7 +428,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
unsigned long size;
size = get_mpc_size(mpf->physptr);
- mpc = early_ioremap(mpf->physptr, size);
+ mpc = early_memremap(mpf->physptr, size);
+
/*
* Read the physical hardware table. Anything here will
* override the defaults.
@@ -543,12 +438,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
- "... disabling SMP support. (tell your hw vendor)\n");
- early_iounmap(mpc, size);
+ pr_err("BIOS bug, MP table errors detected!...\n");
+ pr_cont("... disabling SMP support. (tell your hw vendor)\n");
+ early_memunmap(mpc, size);
return -1;
}
- early_iounmap(mpc, size);
+ early_memunmap(mpc, size);
if (early)
return -1;
@@ -562,8 +457,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
if (!mp_irq_entries) {
struct mpc_bus bus;
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. (tell your hw vendor)\n");
+ pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
bus.type = MP_BUS;
bus.busid = 0;
@@ -580,11 +474,14 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
/*
* Scan the memory blocks for an SMP configuration block.
*/
-void __init default_get_smp_config(unsigned int early)
+static __init void mpparse_get_smp_config(unsigned int early)
{
- struct mpf_intel *mpf = mpf_found;
+ struct mpf_intel *mpf;
+
+ if (!smp_found_config)
+ return;
- if (!mpf)
+ if (!mpf_found)
return;
if (acpi_lapic && early)
@@ -597,64 +494,77 @@ void __init default_get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->specification);
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: error mapping MP table\n");
+ return;
+ }
+
+ pr_info("Intel MultiProcessor Specification v1.%d\n",
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
if (mpf->feature2 & (1 << 7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pr_info(" IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pr_info(" Virtual Wire compatibility mode.\n");
pic_mode = 0;
}
#endif
/*
* Now see if we need to read further.
*/
- if (mpf->feature1 != 0) {
+ if (mpf->feature1) {
if (early) {
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- return;
+ /* Local APIC has default address */
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
+ goto out;
}
- printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->feature1);
+ pr_info("Default MP configuration #%d\n", mpf->feature1);
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
if (check_physptr(mpf, early))
- return;
+ goto out;
} else
BUG();
- if (!early)
- printk(KERN_INFO "Processors: %d\n", num_processors);
+ if (!early && !acpi_lapic)
+ pr_info("Processors: %d\n", num_procs);
/*
* Only use the first configuration found.
*/
+out:
+ early_memunmap(mpf, sizeof(*mpf));
}
-static void __init smp_reserve_memory(struct mpf_intel *mpf)
+void __init mpparse_parse_early_smp_config(void)
{
- unsigned long size = get_mpc_size(mpf->physptr);
+ mpparse_get_smp_config(true);
+}
- reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
+void __init mpparse_parse_smp_config(void)
+{
+ mpparse_get_smp_config(false);
+}
+
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
+{
+ memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
}
static int __init smp_scan_config(unsigned long base, unsigned long length)
{
- unsigned int *bp = phys_to_virt(base);
+ unsigned int *bp;
struct mpf_intel *mpf;
- unsigned long mem;
+ int ret = 0;
- apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
- bp, length);
+ apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
+ bp = early_memremap(base, length);
mpf = (struct mpf_intel *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
(mpf->length == 1) &&
@@ -664,25 +574,30 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
#endif
- mpf_found = mpf;
+ mpf_base = base;
+ mpf_found = true;
- printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
- mpf, (u64)virt_to_phys(mpf));
+ pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n",
+ base, base + sizeof(*mpf) - 1);
- mem = virt_to_phys(mpf);
- reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
+ memblock_reserve(base, sizeof(*mpf));
if (mpf->physptr)
smp_reserve_memory(mpf);
- return 1;
+ ret = 1;
}
- bp += 4;
+ early_memunmap(bp, length);
+
+ if (ret)
+ break;
+
+ base += 16;
length -= 16;
}
- return 0;
+ return ret;
}
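
(Aside, not part of the patch: the scan now remaps each candidate region with early_memremap() and walks it in 16-byte steps, because the MP floating pointer structure, whose "_MP_" signature is what SMP_MAGIC_IDENT encodes, is specified to sit on a 16-byte boundary with a whole-structure byte checksum of zero. A user-space sketch of the same walk over a fabricated buffer:)

/* Editor's sketch: 16-byte-step signature scan, fake memory. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char mem[64] = { 0 };

	memcpy(mem + 32, "_MP_", 4);	/* plant a 16-byte-aligned signature */

	for (size_t off = 0; off + 16 <= sizeof(mem); off += 16) {
		if (!memcmp(mem + off, "_MP_", 4)) {
			printf("found floating pointer at offset %zu\n", off);
			break;	/* the kernel would also verify the checksum */
		}
	}
	return 0;
}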
-void __init default_find_smp_config(void)
+void __init mpparse_find_mptable(void)
{
unsigned int address;
@@ -700,7 +615,7 @@ void __init default_find_smp_config(void)
return;
/*
* If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
+ * configuration is in an EISA bus machine with an
* extended bios data area.
*
* there is a real-mode segmented pointer pointing to the
@@ -730,7 +645,7 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
if (m->irqtype != mp_INT)
return 0;
- if (m->irqflag != 0x0f)
+ if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))
return 0;
/* not legacy */
@@ -739,7 +654,8 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
continue;
if (mp_irqs[i].srcbus != m->srcbus)
@@ -766,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
{
int i;
- apic_printk(APIC_VERBOSE, "OLD ");
- print_MP_intsrc_info(m);
+ apic_pr_verbose("OLD ");
+ print_mp_irq_info(m);
i = get_MP_intsrc_index(m);
if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
- apic_printk(APIC_VERBOSE, "NEW ");
+ memcpy(m, &mp_irqs[i], sizeof(*m));
+ apic_pr_verbose("NEW ");
print_mp_irq_info(&mp_irqs[i]);
return;
}
@@ -789,23 +705,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
*nr_m_spare += 1;
}
}
-#else /* CONFIG_X86_IO_APIC */
-static
-inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
-#endif /* CONFIG_X86_IO_APIC */
-static int
+static int __init
check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- int ret = 0;
-
if (!mpc_new_phys || count <= mpc_new_length) {
WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
return -1;
}
- return ret;
+ return 0;
}
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
static int __init replace_intsrc_all(struct mpc_table *mpc,
unsigned long mpc_new_phys,
@@ -818,7 +732,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
- printk(KERN_INFO "mpc_length %x\n", mpc->length);
+ pr_info("mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -852,20 +766,21 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
if (mp_irqs[i].irqtype != mp_INT)
continue;
- if (mp_irqs[i].irqflag != 0x0f)
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
continue;
if (nr_m_spare > 0) {
- apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ apic_pr_verbose("*NEW* found\n");
nr_m_spare--;
- assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
m_spare[nr_m_spare] = NULL;
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
goto out;
- assign_to_mpc_intsrc(&mp_irqs[i], m);
+ memcpy(m, &mp_irqs[i], sizeof(*m));
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
}
@@ -911,12 +826,10 @@ static int __init parse_alloc_mptable_opt(char *p)
}
early_param("alloc_mptable", parse_alloc_mptable_opt);
-void __init early_reserve_e820_mpc_new(void)
+void __init e820__memblock_alloc_reserved_mpc_new(void)
{
- if (enable_update_mptable && alloc_mptable) {
- u64 startt = 0;
- mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
- }
+ if (enable_update_mptable && alloc_mptable)
+ mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4);
}
static int __init update_mp_table(void)
@@ -925,67 +838,89 @@ static int __init update_mp_table(void)
char oem[10];
struct mpf_intel *mpf;
struct mpc_table *mpc, *mpc_new;
+ unsigned long size;
if (!enable_update_mptable)
return 0;
- mpf = mpf_found;
- if (!mpf)
+ if (!mpf_found)
+ return 0;
+
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: mpf early_memremap() failed\n");
return 0;
+ }
/*
* Now see if we need to go further.
*/
- if (mpf->feature1 != 0)
- return 0;
+ if (mpf->feature1)
+ goto do_unmap_mpf;
if (!mpf->physptr)
- return 0;
+ goto do_unmap_mpf;
- mpc = phys_to_virt(mpf->physptr);
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+ if (!mpc) {
+ pr_err("MPTABLE: mpc early_memremap() failed\n");
+ goto do_unmap_mpf;
+ }
if (!smp_check_mpc(mpc, oem, str))
- return 0;
+ goto do_unmap_mpc;
- printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
- printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+ pr_info("mpf: %llx\n", (u64)mpf_base);
+ pr_info("physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
- printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
- mpc_new_length);
+ pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
}
if (!mpc_new_phys) {
unsigned char old, new;
- /* check if we can change the postion */
+ /* check if we can change the position */
mpc->checksum = 0;
old = mpf_checksum((unsigned char *)mpc, mpc->length);
mpc->checksum = 0xff;
new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
- printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
- return 0;
+ pr_info("mpc is readonly, please try alloc_mptable instead\n");
+ goto do_unmap_mpc;
}
- printk(KERN_INFO "use in-positon replacing\n");
+ pr_info("use in-position replacing\n");
} else {
+ mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+ if (!mpc_new) {
+ pr_err("MPTABLE: new mpc early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
mpf->physptr = mpc_new_phys;
- mpc_new = phys_to_virt(mpc_new_phys);
memcpy(mpc_new, mpc, mpc->length);
+ early_memunmap(mpc, size);
mpc = mpc_new;
+ size = mpc_new_length;
/* check if we can modify that */
if (mpc_new_phys - mpf->physptr) {
struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
- printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
- mpf_new = phys_to_virt(0x400 - 16);
+ mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+ if (!mpf_new) {
+ pr_err("MPTABLE: new mpf early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
+ pr_info("mpf new: %x\n", 0x400 - 16);
memcpy(mpf_new, mpf, 16);
+ early_memunmap(mpf, sizeof(*mpf));
mpf = mpf_new;
mpf->physptr = mpc_new_phys;
}
mpf->checksum = 0;
mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+ pr_info("physptr new: %x\n", mpf->physptr);
}
/*
@@ -996,6 +931,12 @@ static int __init update_mp_table(void)
*/
replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+do_unmap_mpc:
+ early_memunmap(mpc, size);
+
+do_unmap_mpf:
+ early_memunmap(mpf, sizeof(*mpf));
+
return 0;
}
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
deleted file mode 100644
index 5915e0b33303..000000000000
--- a/arch/x86/kernel/mrst.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * mrst.c: Intel Moorestown platform specific setup code
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Jacob Pan (jacob.jun.pan@intel.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sfi.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-
-#include <asm/setup.h>
-#include <asm/mpspec_def.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-#include <asm/mrst.h>
-#include <asm/io.h>
-#include <asm/i8259.h>
-#include <asm/apb_timer.h>
-
-static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
-static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
-int sfi_mtimer_num;
-
-struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
-EXPORT_SYMBOL_GPL(sfi_mrtc_array);
-int sfi_mrtc_num;
-
-static inline void assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static void save_mp_irq(struct mpc_intsrc *m)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-/* parse all the mtimer info to a static mtimer array */
-static int __init sfi_parse_mtmr(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_timer_table_entry *pentry;
- struct mpc_intsrc mp_irq;
- int totallen;
-
- sb = (struct sfi_table_simple *)table;
- if (!sfi_mtimer_num) {
- sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
- struct sfi_timer_table_entry);
- pentry = (struct sfi_timer_table_entry *) sb->pentry;
- totallen = sfi_mtimer_num * sizeof(*pentry);
- memcpy(sfi_mtimer_array, pentry, totallen);
- }
-
- printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
- pentry = sfi_mtimer_array;
- for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
- printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
- " irq = %d\n", totallen, (u32)pentry->phys_addr,
- pentry->freq_hz, pentry->irq);
- if (!pentry->irq)
- continue;
- mp_irq.type = MP_IOAPIC;
- mp_irq.irqtype = mp_INT;
-/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
- mp_irq.irqflag = 5;
- mp_irq.srcbus = 0;
- mp_irq.srcbusirq = pentry->irq; /* IRQ */
- mp_irq.dstapic = MP_APIC_ALL;
- mp_irq.dstirq = pentry->irq;
- save_mp_irq(&mp_irq);
- }
-
- return 0;
-}
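
(Aside, not part of the removed file: the mp_irq.irqflag = 5 above packs the MP-spec interrupt flags, polarity in bits 1:0 and trigger mode in bits 3:2, so 5, binary 0101, means active-high and edge-triggered; the 0x0f value tested elsewhere in this series is active-low and level-triggered. A small decode sketch:)

/* Editor's sketch: decoding the MP-spec irqflag nibble. */
#include <stdio.h>

static const char *pol[] = { "conforming", "active-high", "reserved", "active-low" };
static const char *trig[] = { "conforming", "edge", "reserved", "level" };

int main(void)
{
	unsigned flags[] = { 0x5, 0xf, 0x0 };

	for (int i = 0; i < 3; i++)
		printf("irqflag %#x: %s, %s\n", flags[i],
		       pol[flags[i] & 3], trig[(flags[i] >> 2) & 3]);
	return 0;
}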
-
-struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
-{
- int i;
- if (hint < sfi_mtimer_num) {
- if (!sfi_mtimer_usage[hint]) {
- pr_debug("hint taken for timer %d irq %d\n",\
- hint, sfi_mtimer_array[hint].irq);
- sfi_mtimer_usage[hint] = 1;
- return &sfi_mtimer_array[hint];
- }
- }
- /* take the first timer available */
- for (i = 0; i < sfi_mtimer_num;) {
- if (!sfi_mtimer_usage[i]) {
- sfi_mtimer_usage[i] = 1;
- return &sfi_mtimer_array[i];
- }
- i++;
- }
- return NULL;
-}
-
-void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
-{
- int i;
- for (i = 0; i < sfi_mtimer_num;) {
- if (mtmr->irq == sfi_mtimer_array[i].irq) {
- sfi_mtimer_usage[i] = 0;
- return;
- }
- i++;
- }
-}
-
-/* parse all the mrtc info to a global mrtc array */
-int __init sfi_parse_mrtc(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_rtc_table_entry *pentry;
- struct mpc_intsrc mp_irq;
-
- int totallen;
-
- sb = (struct sfi_table_simple *)table;
- if (!sfi_mrtc_num) {
- sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
- struct sfi_rtc_table_entry);
- pentry = (struct sfi_rtc_table_entry *)sb->pentry;
- totallen = sfi_mrtc_num * sizeof(*pentry);
- memcpy(sfi_mrtc_array, pentry, totallen);
- }
-
- printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
- pentry = sfi_mrtc_array;
- for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
- printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
- totallen, (u32)pentry->phys_addr, pentry->irq);
- mp_irq.type = MP_IOAPIC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = 0;
- mp_irq.srcbus = 0;
- mp_irq.srcbusirq = pentry->irq; /* IRQ */
- mp_irq.dstapic = MP_APIC_ALL;
- mp_irq.dstirq = pentry->irq;
- save_mp_irq(&mp_irq);
- }
- return 0;
-}
-
-/*
- * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
- * APBT but cmdline option can also override it.
- */
-static void __cpuinit mrst_setup_secondary_clock(void)
-{
- /* restore default lapic clock if disabled by cmdline */
- if (disable_apbt_percpu)
- return setup_secondary_APIC_clock();
- apbt_setup_secondary_clock();
-}
-
-static unsigned long __init mrst_calibrate_tsc(void)
-{
- unsigned long flags, fast_calibrate;
-
- local_irq_save(flags);
- fast_calibrate = apbt_quick_calibrate();
- local_irq_restore(flags);
-
- if (fast_calibrate)
- return fast_calibrate;
-
- return 0;
-}
-
-void __init mrst_time_init(void)
-{
- sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
- pre_init_apic_IRQ0();
- apbt_time_init();
-}
-
-void __init mrst_rtc_init(void)
-{
- sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
-}
-
-/*
- * if we use per cpu apb timer, the bootclock already setup. if we use lapic
- * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
- */
-static void __init mrst_setup_boot_clock(void)
-{
- pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
- if (disable_apbt_percpu)
- setup_boot_APIC_clock();
-};
-
-/* MID systems don't have i8042 controller */
-static int mrst_i8042_detect(void)
-{
- return 0;
-}
-
-/*
- * Moorestown specific x86_init function overrides and early setup
- * calls.
- */
-void __init x86_mrst_early_setup(void)
-{
- x86_init.resources.probe_roms = x86_init_noop;
- x86_init.resources.reserve_resources = x86_init_noop;
-
- x86_init.timers.timer_init = mrst_time_init;
- x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
-
- x86_init.irqs.pre_vector_init = x86_init_noop;
-
- x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
-
- x86_platform.calibrate_tsc = mrst_calibrate_tsc;
- x86_platform.i8042_detect = mrst_i8042_detect;
- x86_init.pci.init = pci_mrst_init;
- x86_init.pci.fixup_irqs = x86_init_noop;
-
- legacy_pic = &null_legacy_pic;
-
- /* Avoid searching for BIOS MP tables */
- x86_init.mpparse.find_smp_config = x86_init_noop;
- x86_init.mpparse.get_smp_config = x86_init_uint_noop;
-
-}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..4469c784eaa0 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* ----------------------------------------------------------------------- *
*
* Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
* Copyright 2009 Intel Corporation; author: H. Peter Anvin
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- * USA; either version 2 of the License, or (at your option) any later
- * version; incorporated herein by reference.
- *
* ----------------------------------------------------------------------- */
/*
@@ -22,6 +17,8 @@
* an SMP box will direct the access to CPU %d.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/types.h>
@@ -30,7 +27,6 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
@@ -38,34 +34,20 @@
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
+#include <linux/security.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include <asm/msr.h>
-#include <asm/system.h>
-static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
+enum allow_write_msrs {
+ MSR_WRITES_ON,
+ MSR_WRITES_OFF,
+ MSR_WRITES_DEFAULT,
+};
+
+static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -73,7 +55,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
u32 __user *tmp = (u32 __user *) buf;
u32 data[2];
u32 reg = *ppos;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err = 0;
ssize_t bytes = 0;
@@ -95,16 +77,52 @@ static ssize_t msr_read(struct file *file, char __user *buf,
return bytes ? bytes : err;
}
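
The read path above selects the CPU via the device minor and uses the file
offset as the MSR index, transferring data in 8-byte chunks. A minimal
userspace sketch (illustrative, not part of this patch; it assumes the msr
module is loaded and the caller has CAP_SYS_RAWIO plus read permission on
the node) that reads IA32_TIME_STAMP_COUNTER (0x10) on CPU 0:

#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);	/* minor 0 -> CPU 0 */

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* The file offset is the MSR number; reads move in 8-byte chunks. */
	if (pread(fd, &val, sizeof(val), 0x10) != sizeof(val)) {
		perror("pread");
		return 1;
	}
	printf("TSC on CPU 0: 0x%" PRIx64 "\n", val);
	close(fd);
	return 0;
}
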
+static int filter_write(u32 reg)
+{
+ /*
+ * MSR writes usually happen all at once, and can easily saturate kmsg.
+ * Only allow one message every 30 seconds.
+ *
+ * It's possible to be smarter here and do it (for example) per-MSR, but
+ * it would certainly be more complex, and this is enough at least to
+ * avoid saturating the ring buffer.
+ */
+ static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1);
+
+ switch (allow_writes) {
+ case MSR_WRITES_ON: return 0;
+ case MSR_WRITES_OFF: return -EPERM;
+ default: break;
+ }
+
+ if (!__ratelimit(&fw_rs))
+ return 0;
+
+ pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d), tainting CPU_OUT_OF_SPEC.\n",
+ reg, current->comm, current->pid);
+ pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
+
+ return 0;
+}
+
static ssize_t msr_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
const u32 __user *tmp = (const u32 __user *)buf;
u32 data[2];
u32 reg = *ppos;
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err = 0;
ssize_t bytes = 0;
+ err = security_locked_down(LOCKDOWN_MSR);
+ if (err)
+ return err;
+
+ err = filter_write(reg);
+ if (err)
+ return err;
+
if (count % 8)
return -EINVAL; /* Invalid chunk size */
@@ -113,9 +131,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
err = -EFAULT;
break;
}
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
if (err)
break;
+
tmp += 2;
bytes += 8;
}
@@ -127,7 +149,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
{
u32 __user *uregs = (u32 __user *)arg;
u32 regs[8];
- int cpu = iminor(file->f_path.dentry->d_inode);
+ int cpu = iminor(file_inode(file));
int err;
switch (ioc) {
@@ -136,14 +158,14 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = -EBADF;
break;
}
- if (copy_from_user(&regs, uregs, sizeof regs)) {
+ if (copy_from_user(&regs, uregs, sizeof(regs))) {
err = -EFAULT;
break;
}
err = rdmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
- if (copy_to_user(uregs, &regs, sizeof regs))
+ if (copy_to_user(uregs, &regs, sizeof(regs)))
err = -EFAULT;
break;
@@ -152,14 +174,24 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = -EBADF;
break;
}
- if (copy_from_user(&regs, uregs, sizeof regs)) {
+ if (copy_from_user(&regs, uregs, sizeof(regs))) {
err = -EFAULT;
break;
}
+ err = security_locked_down(LOCKDOWN_MSR);
+ if (err)
+ break;
+
+ err = filter_write(regs[1]);
+ if (err)
+ return err;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
- if (copy_to_user(uregs, &regs, sizeof regs))
+ if (copy_to_user(uregs, &regs, sizeof(regs)))
err = -EFAULT;
break;
@@ -173,10 +205,12 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
static int msr_open(struct inode *inode, struct file *file)
{
- unsigned int cpu;
+ unsigned int cpu = iminor(file_inode(file));
struct cpuinfo_x86 *c;
- cpu = iminor(file->f_path.dentry->d_inode);
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
@@ -192,7 +226,7 @@ static int msr_open(struct inode *inode, struct file *file)
*/
static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
- .llseek = msr_seek,
+ .llseek = no_seek_end_llseek,
.read = msr_read,
.write = msr_write,
.open = msr_open,
@@ -200,99 +234,101 @@ static const struct file_operations msr_fops = {
.compat_ioctl = msr_ioctl,
};
-static int __cpuinit msr_device_create(int cpu)
+static char *msr_devnode(const struct device *dev, umode_t *mode)
{
- struct device *dev;
-
- dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
- "msr%d", cpu);
- return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+ return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
-static void msr_device_destroy(int cpu)
-{
- device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
-}
+static const struct class msr_class = {
+ .name = "msr",
+ .devnode = msr_devnode,
+};
-static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int msr_device_create(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
+ struct device *dev;
- switch (action) {
- case CPU_UP_PREPARE:
- err = msr_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- msr_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
+ dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
+ "msr%d", cpu);
+ return PTR_ERR_OR_ZERO(dev);
}
-static struct notifier_block __refdata msr_class_cpu_notifier = {
- .notifier_call = msr_class_cpu_callback,
-};
-
-static char *msr_devnode(struct device *dev, mode_t *mode)
+static int msr_device_destroy(unsigned int cpu)
{
- return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+ device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
}
static int __init msr_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
- printk(KERN_ERR "msr: unable to get major %d for msr\n",
- MSR_MAJOR);
- err = -EBUSY;
- goto out;
+ pr_err("unable to get major %d for msr\n", MSR_MAJOR);
+ return -EBUSY;
}
- msr_class = class_create(THIS_MODULE, "msr");
- if (IS_ERR(msr_class)) {
- err = PTR_ERR(msr_class);
+ err = class_register(&msr_class);
+ if (err)
goto out_chrdev;
- }
- msr_class->devnode = msr_devnode;
- for_each_online_cpu(i) {
- err = msr_device_create(i);
- if (err != 0)
- goto out_class;
- }
- register_hotcpu_notifier(&msr_class_cpu_notifier);
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ cpuhp_msr_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i)
- msr_device_destroy(i);
- class_destroy(msr_class);
+ class_unregister(&msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-out:
return err;
}
+module_init(msr_init);
static void __exit msr_exit(void)
{
- int cpu = 0;
- for_each_online_cpu(cpu)
- msr_device_destroy(cpu);
- class_destroy(msr_class);
+ cpuhp_remove_state(cpuhp_msr_state);
+ class_unregister(&msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
- unregister_hotcpu_notifier(&msr_class_cpu_notifier);
}
-
-module_init(msr_init);
module_exit(msr_exit)
+static int set_allow_writes(const char *val, const struct kernel_param *cp)
+{
+ /* val is NUL-terminated, see kernfs_fop_write() */
+ char *s = strstrip((char *)val);
+
+ if (!strcmp(s, "on"))
+ allow_writes = MSR_WRITES_ON;
+ else if (!strcmp(s, "off"))
+ allow_writes = MSR_WRITES_OFF;
+ else
+ allow_writes = MSR_WRITES_DEFAULT;
+
+ return 0;
+}
+
+static int get_allow_writes(char *buf, const struct kernel_param *kp)
+{
+ const char *res;
+
+ switch (allow_writes) {
+ case MSR_WRITES_ON: res = "on"; break;
+ case MSR_WRITES_OFF: res = "off"; break;
+ default: res = "default"; break;
+ }
+
+ return sprintf(buf, "%s\n", res);
+}
+
+static const struct kernel_param_ops allow_writes_ops = {
+ .set = set_allow_writes,
+ .get = get_allow_writes
+};
+
+module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600);
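
With the parameter callback registered above, the policy can be flipped at
runtime through the usual module-parameter sysfs file (or set at boot with
msr.allow_writes=). A small userspace sketch, assuming the standard
/sys/module/msr/parameters/allow_writes path:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* mode is "on", "off" or "default", matching set_allow_writes() above */
static int msr_set_allow_writes(const char *mode)
{
	int fd = open("/sys/module/msr/parameters/allow_writes", O_WRONLY);

	if (fd < 0) {
		perror("open allow_writes");
		return -1;
	}
	if (write(fd, mode, strlen(mode)) < 0) {
		perror("write allow_writes");
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	return msr_set_allow_writes("off") ? 1 : 0;
}
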
+
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
MODULE_DESCRIPTION("x86 generic MSR driver");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
new file mode 100644
index 000000000000..3d239ed12744
--- /dev/null
+++ b/arch/x86/kernel/nmi.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2011 Don Zickus Red Hat, Inc.
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/nmi.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/atomic.h>
+#include <linux/sched/clock.h>
+#include <linux/kvm_types.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/traps.h>
+#include <asm/mach_traps.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+#include <asm/cache.h>
+#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
+#include <asm/sev.h>
+#include <asm/fred.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/nmi.h>
+
+/*
+ * An emergency handler can be set in any context including NMI
+ */
+struct nmi_desc {
+ raw_spinlock_t lock;
+ nmi_handler_t emerg_handler;
+ struct list_head head;
+};
+
+#define NMI_DESC_INIT(type) { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[type].lock), \
+ .head = LIST_HEAD_INIT(nmi_desc[type].head), \
+}
+
+static struct nmi_desc nmi_desc[NMI_MAX] = {
+ NMI_DESC_INIT(NMI_LOCAL),
+ NMI_DESC_INIT(NMI_UNKNOWN),
+ NMI_DESC_INIT(NMI_SERR),
+ NMI_DESC_INIT(NMI_IO_CHECK),
+};
+
+#define nmi_to_desc(type) (&nmi_desc[type])
+
+struct nmi_stats {
+ unsigned int normal;
+ unsigned int unknown;
+ unsigned int external;
+ unsigned int swallow;
+ unsigned long recv_jiffies;
+ unsigned long idt_seq;
+ unsigned long idt_nmi_seq;
+ unsigned long idt_ignored;
+ atomic_long_t idt_calls;
+ unsigned long idt_seq_snap;
+ unsigned long idt_nmi_seq_snap;
+ unsigned long idt_ignored_snap;
+ long idt_calls_snap;
+};
+
+static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
+
+static int ignore_nmis __read_mostly;
+
+int unknown_nmi_panic;
+int panic_on_unrecovered_nmi;
+int panic_on_io_nmi;
+
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
+static int __init setup_unknown_nmi_panic(char *str)
+{
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
+static int __init nmi_warning_debugfs(void)
+{
+ debugfs_create_u64("nmi_longest_ns", 0644,
+ arch_debugfs_dir, &nmi_longest_ns);
+ return 0;
+}
+fs_initcall(nmi_warning_debugfs);
+
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
+{
+ int remainder_ns, decimal_msecs;
+
+ if (duration < nmi_longest_ns || duration < action->max_duration)
+ return;
+
+ action->max_duration = duration;
+
+ /* Convert duration from nsec to msec */
+ remainder_ns = do_div(duration, NSEC_PER_MSEC);
+ decimal_msecs = remainder_ns / NSEC_PER_USEC;
+
+ pr_info_ratelimited("INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ action->handler, duration, decimal_msecs);
+}
+
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ nmi_handler_t ehandler;
+ struct nmiaction *a;
+ int handled = 0;
+
+ /*
+ * Call the emergency handler, if set
+ *
+ * In the case of crash_nmi_callback() emergency handler, it will
+ * return in the case of the crashing CPU to enable it to complete
+ * other necessary crashing actions ASAP. Other handlers in the
+ * linked list won't need to be run.
+ */
+ ehandler = desc->emerg_handler;
+ if (ehandler)
+ return ehandler(type, regs);
+
+ rcu_read_lock();
+
+ /*
+ * NMIs are edge-triggered, which means if you have enough
+ * of them concurrently, you can lose some because only one
+ * can be latched at any given time. Walk the whole list
+ * to handle those situations.
+ */
+ list_for_each_entry_rcu(a, &desc->head, list) {
+ int thishandled;
+ u64 delta;
+
+ delta = sched_clock();
+ thishandled = a->handler(type, regs);
+ handled += thishandled;
+ delta = sched_clock() - delta;
+ trace_nmi_handler(a->handler, (int)delta, thishandled);
+
+ nmi_check_duration(a, delta);
+ }
+
+ rcu_read_unlock();
+
+ /* return total number of NMI events handled */
+ return handled;
+}
+NOKPROBE_SYMBOL(nmi_handle);
+
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /*
+ * Indicate if there are multiple registrations on the
+ * internal NMI handler call chains (SERR and IO_CHECK).
+ */
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
+
+ /*
+ * some handlers need to be executed first otherwise a fake
+ * event confuses some handlers (kdump uses this flag)
+ */
+ if (action->flags & NMI_FLAG_FIRST)
+ list_add_rcu(&action->list, &desc->head);
+ else
+ list_add_tail_rcu(&action->list, &desc->head);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(__register_nmi_handler);
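
For reference, a minimal in-kernel usage sketch of this API (illustrative
only; mydrv_hw_raised_nmi() is a hypothetical stand-in for a real device
check): a handler claims NMIs it recognizes with NMI_HANDLED and returns
NMI_DONE otherwise, so the chain walk continues to the next handler.

#include <linux/module.h>
#include <asm/nmi.h>

static bool mydrv_hw_raised_nmi(void)
{
	return false;	/* hypothetical: query the device's NMI status */
}

static int mydrv_nmi_handler(unsigned int type, struct pt_regs *regs)
{
	if (!mydrv_hw_raised_nmi())
		return NMI_DONE;	/* not ours: keep walking the list */
	return NMI_HANDLED;
}

static int __init mydrv_init(void)
{
	/* the name is the lookup key used by unregister_nmi_handler() */
	return register_nmi_handler(NMI_LOCAL, mydrv_nmi_handler, 0, "mydrv");
}

static void __exit mydrv_exit(void)
{
	unregister_nmi_handler(NMI_LOCAL, "mydrv");
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");
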
+
+void unregister_nmi_handler(unsigned int type, const char *name)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ struct nmiaction *n, *found = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ list_for_each_entry_rcu(n, &desc->head, list) {
+ /*
+ * the name passed in to describe the nmi handler
+ * is used as the lookup key
+ */
+ if (!strcmp(n->name, name)) {
+ WARN(in_nmi(),
+ "Trying to free NMI (%s) from NMI context!\n", n->name);
+ list_del_rcu(&n->list);
+ found = n;
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (found) {
+ synchronize_rcu();
+ INIT_LIST_HEAD(&found->list);
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+
+/**
+ * set_emergency_nmi_handler - Set emergency handler
+ * @type: NMI type
+ * @handler: the emergency handler to be stored
+ *
+ * Set an emergency NMI handler which, if set, will preempt all the other
+ * handlers in the linked list. If a NULL handler is passed in, it will clear
+ * it. It is expected that concurrent calls to this function will not happen
+ * or the system is screwed beyond repair.
+ */
+void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+
+ if (WARN_ON_ONCE(desc->emerg_handler == handler))
+ return;
+ desc->emerg_handler = handler;
+
+ /*
+ * Ensure the emergency handler is visible to other CPUs before
+ * function return
+ */
+ smp_wmb();
+}
+
+static void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs))
+ return;
+
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ if (panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(pci_serr_error);
+
+static void
+io_check_error(unsigned char reason, struct pt_regs *regs)
+{
+ unsigned long i;
+
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs))
+ return;
+
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ show_regs(regs);
+
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
+
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(io_check_error);
+
+static void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+ int handled;
+
+ /*
+ * As a last resort, let the "unknown" handlers make a
+ * best-effort attempt to figure out if they can claim
+ * responsibility for this Unknown NMI.
+ */
+ handled = nmi_handle(NMI_UNKNOWN, regs);
+ if (handled) {
+ __this_cpu_add(nmi_stats.unknown, handled);
+ return;
+ }
+
+ __this_cpu_add(nmi_stats.unknown, 1);
+
+ pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg_ratelimited("Dazed and confused, but trying to continue\n");
+}
+NOKPROBE_SYMBOL(unknown_nmi_error);
+
+static DEFINE_PER_CPU(bool, swallow_nmi);
+static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+
+static noinstr void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+ int handled;
+ bool b2b = false;
+
+ /*
+ * Back-to-back NMIs are detected by comparing the RIP of the
+ * current NMI with that of the previous NMI. If it is the same,
+ * it is assumed that the CPU did not have a chance to jump back
+ * into a non-NMI context and execute code in between the two
+ * NMIs.
+ *
+ * They are interesting because even if there are more than two,
+ * only a maximum of two can be detected (anything over two is
+ * dropped due to NMI being edge-triggered). If this is the
+ * second half of the back-to-back NMI, assume we dropped things
+ * and process more handlers. Otherwise, reset the 'swallow' NMI
+ * behavior.
+ */
+ if (regs->ip == __this_cpu_read(last_nmi_rip))
+ b2b = true;
+ else
+ __this_cpu_write(swallow_nmi, false);
+
+ __this_cpu_write(last_nmi_rip, regs->ip);
+
+ instrumentation_begin();
+
+ if (microcode_nmi_handler_enabled() && microcode_nmi_handler())
+ goto out;
+
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+ handled = nmi_handle(NMI_LOCAL, regs);
+ __this_cpu_add(nmi_stats.normal, handled);
+ if (handled) {
+ /*
+ * There are cases when an NMI handler handles multiple
+ * events in the current NMI. One of these events may
+ * be queued to fire as the next NMI. Because the event is
+ * already handled, the next NMI will result in an unknown
+ * NMI. Instead lets flag this for a potential NMI to
+ * swallow.
+ */
+ if (handled > 1)
+ __this_cpu_write(swallow_nmi, true);
+ goto out;
+ }
+
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
+ reason = x86_platform.get_nmi_reason();
+
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ reassert_nmi();
+
+ __this_cpu_add(nmi_stats.external, 1);
+ raw_spin_unlock(&nmi_reason_lock);
+ goto out;
+ }
+ raw_spin_unlock(&nmi_reason_lock);
+
+ /*
+ * Only one NMI can be latched at a time. To handle
+ * this we may process multiple nmi handlers at once to
+ * cover the case where an NMI is dropped. The downside
+ * to this approach is we may process an NMI prematurely,
+ * while its real NMI is sitting latched. This will cause
+ * an unknown NMI on the next run of the NMI processing.
+ *
+ * We tried to flag that condition above, by setting the
+ * swallow_nmi flag when we process more than one event.
+ * This condition is also only present on the second half
+ * of a back-to-back NMI, so we flag that condition too.
+ *
+ * If both are true, we assume we already processed this
+ * NMI previously and we swallow it. Otherwise we reset
+ * the logic.
+ *
+ * There are scenarios where we may accidentally swallow
+ * a 'real' unknown NMI. For example, while processing
+ * a perf NMI another perf NMI comes in along with a
+ * 'real' unknown NMI. These two NMIs get combined into
+ * one (as described above). When the next NMI gets
+ * processed, it will be flagged by perf as handled, but
+ * no one will know that there was a 'real' unknown NMI sent
+ * also. As a result it gets swallowed. Or if the first
+ * perf NMI returns two events handled then the second
+ * NMI will get eaten by the logic below, again losing a
+ * 'real' unknown NMI. But this is the best we can do
+ * for now.
+ */
+ if (b2b && __this_cpu_read(swallow_nmi))
+ __this_cpu_add(nmi_stats.swallow, 1);
+ else
+ unknown_nmi_error(reason, regs);
+
+out:
+ instrumentation_end();
+}
+
+/*
+ * NMIs can page fault or hit breakpoints, which will cause them to lose
+ * their NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupt
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they can not take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in
+ * after setting state to NOT_RUNNING but before updating the nmi_cr2.
+ */
+enum nmi_states {
+ NMI_NOT_RUNNING = 0,
+ NMI_EXECUTING,
+ NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+static DEFINE_PER_CPU(unsigned long, nmi_dr7);
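
The state machine described above can be modeled in a few lines of plain C
(a userspace toy, not kernel code): a nested NMI only flips the state to
LATCHED, and the outer invocation reruns the handler until its dec-and-test
reaches NOT_RUNNING. Note how several nested NMIs still produce only one
rerun, because the latch is binary.

#include <stdio.h>

enum { NOT_RUNNING, EXECUTING, LATCHED };

static int state = NOT_RUNNING;
static int pending;		/* NMIs injected while the handler runs */

static void nmi(void)
{
	if (state != NOT_RUNNING) {
		state = LATCHED;	/* remember at most one pending NMI */
		return;
	}
	state = EXECUTING;
restart:
	printf("handler run\n");
	while (pending) {		/* nested NMIs arrive mid-handler */
		pending--;
		nmi();
	}
	if (--state)			/* LATCHED -> EXECUTING: run again */
		goto restart;		/* EXECUTING -> NOT_RUNNING: done */
}

int main(void)
{
	pending = 3;	/* three nested NMIs cause just one rerun */
	nmi();
	return 0;
}
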
+
+DEFINE_IDTENTRY_RAW(exc_nmi)
+{
+ irqentry_state_t irq_state;
+ struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
+
+ /*
+ * Re-enable NMIs right here when running as an SEV-ES guest. This might
+ * cause nested NMIs, but those can be handled safely.
+ */
+ sev_es_nmi_complete();
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+ raw_atomic_long_inc(&nsp->idt_calls);
+
+ if (arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
+ return;
+ }
+
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+
+nmi_restart:
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
+
+ /*
+ * Needs to happen before DR7 is accessed, because the hypervisor can
+ * intercept DR7 reads/writes, turning those into #VC exceptions.
+ */
+ sev_es_ist_enter(regs);
+
+ this_cpu_write(nmi_dr7, local_db_save());
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ inc_irq_stat(__nmi_count);
+
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+ WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+ } else if (!ignore_nmis) {
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+ }
+ default_do_nmi(regs);
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+ }
+ }
+
+ irqentry_nmi_exit(regs, irq_state);
+
+ local_db_restore(this_cpu_read(nmi_dr7));
+
+ sev_es_ist_exit();
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(nsp->idt_seq & 0x1);
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
+}
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
+{
+ exc_nmi(regs);
+}
+EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx);
+#endif
+
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/* */
+/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */
+/* | +------ cpu_is_offline(cpu) */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
+/* | | | NMI handler has been invoked. */
+/* | | | */
+/* V V V */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+ int cpu;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+ nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+ nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+ nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+ }
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+ int cpu;
+ int idx;
+ unsigned long nmi_seq;
+ unsigned long j = jiffies;
+ char *modp;
+ char *msgp;
+ char *msghp;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ modp = "";
+ msghp = "";
+ nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+ msgp = "CPU entered NMI handler function, but has not exited";
+ } else if (nsp->idt_nmi_seq_snap == nmi_seq ||
+ nsp->idt_nmi_seq_snap + 1 == nmi_seq) {
+ idx = ((nmi_seq & 0x1) << 2) |
+ (cpu_is_offline(cpu) << 1) |
+ (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+ msgp = nmi_check_stall_msg[idx];
+ if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+ modp = ", but OK because ignore_nmis was set";
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+ msghp = " (CPU exited one NMI handler function)";
+ else if (nmi_seq & 0x1)
+ msghp = " (CPU currently in NMI handler function)";
+ else
+ msghp = " (CPU was never in an NMI handler function)";
+ } else {
+ msgp = "CPU is handling NMIs";
+ }
+ pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp);
+ pr_alert("%s: last activity: %lu jiffies ago.\n",
+ __func__, j - READ_ONCE(nsp->recv_jiffies));
+ }
+}
+
+#endif
+
+#ifdef CONFIG_X86_FRED
+/*
+ * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED
+ * event delivery, i.e., there is no problem of transient states.
+ * And NMI unblocking only happens when the stack frame indicates
+ * that it should happen.
+ *
+ * Thus, the NMI entry stub for FRED is really straightforward and
+ * as simple as most exception handlers. As such, #DB is allowed
+ * during NMI handling.
+ */
+DEFINE_FREDENTRY_NMI(exc_nmi)
+{
+ irqentry_state_t irq_state;
+
+ if (arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
+ return;
+ }
+
+ /*
+ * Save CR2 for eventual restore to cover the case where the NMI
+ * hits the VMENTER/VMEXIT region where guest CR2 is live. This
+ * prevents guest state corruption in case the NMI handler
+ * takes a page fault.
+ */
+ this_cpu_write(nmi_cr2, read_cr2());
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ inc_irq_stat(__nmi_count);
+ default_do_nmi(regs);
+
+ irqentry_nmi_exit(regs, irq_state);
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+}
+#endif
+
+void stop_nmi(void)
+{
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+}
+
+/* reset the back-to-back NMI logic */
+void local_touch_nmi(void)
+{
+ __this_cpu_write(last_nmi_rip, 0);
+}
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..a010e9d062bf
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS 0
+#define FAILURE 1
+#define TIMEOUT 2
+
+static int __initdata nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
+
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
+
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+ unexpected_testcase_unknowns++;
+ return NMI_HANDLED;
+}
+
+static void __init init_nmi_testsuite(void)
+{
+ /* trap all the unknown NMIs we may generate */
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
+ __initdata);
+}
+
+static void __init cleanup_nmi_testsuite(void)
+{
+ unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+
+static void __init test_nmi_ipi(struct cpumask *mask)
+{
+ unsigned long timeout;
+
+ if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
+ nmi_fail = FAILURE;
+ return;
+ }
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
+
+ /* Don't wait longer than a second */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(mask) && --timeout)
+ udelay(1);
+
+ /* Unregister even on timeout so the handler cannot outlive the test */
+ unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+ if (!timeout)
+ nmi_fail = TIMEOUT;
+ return;
+}
+
+static void __init remote_ipi(void)
+{
+ cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void __init local_ipi(void)
+{
+ cpumask_clear(to_cpumask(nmi_ipi_mask));
+ cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void __init reset_nmi(void)
+{
+ nmi_fail = 0;
+}
+
+static void __init dotest(void (*testcase_fn)(void), int expected)
+{
+ testcase_fn();
+ /*
+ * Filter out expected failures:
+ */
+ if (nmi_fail != expected) {
+ unexpected_testcase_failures++;
+
+ if (nmi_fail == FAILURE)
+ pr_cont("FAILED |");
+ else if (nmi_fail == TIMEOUT)
+ pr_cont("TIMEOUT|");
+ else
+ pr_cont("ERROR |");
+ dump_stack();
+ } else {
+ testcase_successes++;
+ pr_cont(" ok |");
+ }
+ pr_cont("\n");
+
+ testcase_total++;
+ reset_nmi();
+}
+
+void __init nmi_selftest(void)
+{
+ init_nmi_testsuite();
+
+ /*
+ * Run the testsuite:
+ */
+ pr_info("----------------\n");
+ pr_info("| NMI testsuite:\n");
+ pr_info("--------------------\n");
+
+ pr_info("%12s:", "remote IPI");
+ dotest(remote_ipi, SUCCESS);
+
+ pr_info("%12s:", "local IPI");
+ dotest(local_ipi, SUCCESS);
+
+ cleanup_nmi_testsuite();
+
+ pr_info("--------------------\n");
+ if (unexpected_testcase_failures) {
+ pr_info("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+ unexpected_testcase_failures, testcase_total);
+ } else {
+ pr_info("Good, all %3d testcases passed! |\n",
+ testcase_successes);
+ }
+ pr_info("-----------------------------------------------------------------\n");
+}
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 676b8c77a976..9e1ea99ad9df 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -1,28 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
#include <linux/spinlock.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
#include <asm/paravirt.h>
-static inline void
-default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
- arch_spin_lock(lock);
+ native_queued_spin_unlock(lock);
}
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
-struct pv_lock_ops pv_lock_ops = {
-#ifdef CONFIG_SMP
- .spin_is_locked = __ticket_spin_is_locked,
- .spin_is_contended = __ticket_spin_is_contended,
+bool pv_is_native_spin_unlock(void)
+{
+ return pv_ops.lock.queued_spin_unlock.func ==
+ __raw_callee_save___native_queued_spin_unlock;
+}
+
+__visible bool __native_vcpu_is_preempted(long cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
- .spin_lock = __ticket_spin_lock,
- .spin_lock_flags = default_spin_lock_flags,
- .spin_trylock = __ticket_spin_trylock,
- .spin_unlock = __ticket_spin_unlock,
-#endif
-};
-EXPORT_SYMBOL(pv_lock_ops);
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_ops.lock.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
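
Both helpers above use the same detection idiom: the ops table initially
points at the native implementation (wrapped in its callee-save thunk), a
hypervisor guest patches its own function in, and a pointer comparison
reveals whether that happened. A stripped-down userspace model of the
pattern (illustrative names, no callee-save thunks):

#include <stdbool.h>
#include <stdio.h>

struct lock_ops {
	bool (*vcpu_is_preempted)(long cpu);
};

static bool native_vcpu_is_preempted(long cpu)
{
	return false;	/* bare metal: a vCPU is never preempted */
}

/* default: native op; a "hypervisor" would overwrite this pointer */
static struct lock_ops ops = {
	.vcpu_is_preempted = native_vcpu_is_preempted,
};

static bool is_native_vcpu_is_preempted(void)
{
	return ops.vcpu_is_preempted == native_vcpu_is_preempted;
}

int main(void)
{
	printf("native: %d\n", is_native_vcpu_is_preempted());
	return 0;
}
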
+void __init paravirt_set_cap(void)
+{
+ if (!pv_is_native_spin_unlock())
+ setup_force_cpu_cap(X86_FEATURE_PVUNLOCK);
+
+ if (!pv_is_native_vcpu_is_preempted())
+ setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..ab3e172dcc69 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -1,34 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Paravirtualization interfaces
Copyright (C) 2006 Rusty Russell IBM Corporation
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
*/
#include <linux/errno.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>
+#include <linux/pgtable.h>
+#include <linux/static_call.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
+#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
-#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
@@ -37,22 +29,14 @@
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
+#include <asm/special_insns.h>
+#include <asm/tlb.h>
+#include <asm/io_bitmap.h>
+#include <asm/gsseg.h>
+#include <asm/msr.h>
-/* nop stub */
-void _paravirt_nop(void)
-{
-}
-
-/* identity function, which can be inlined */
-u32 _paravirt_ident_32(u32 x)
-{
- return x;
-}
-
-u64 _paravirt_ident_64(u64 x)
-{
- return x;
-}
+/* stub always returning 0. */
+DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
@@ -60,414 +44,212 @@ void __init default_banner(void)
pv_info.name);
}
-/* Simple instruction patching code. */
-#define DEF_NATIVE(ops, name, code) \
- extern const char start_##ops##_##name[], end_##ops##_##name[]; \
- asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
-
-/* Undefined instruction for dealing with missing ops pointers. */
-static const unsigned char ud2a[] = { 0x0f, 0x0b };
+#ifdef CONFIG_PARAVIRT_XXL
+DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
+#endif
-unsigned paravirt_patch_nop(void)
-{
- return 0;
-}
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
-unsigned paravirt_patch_ignore(unsigned len)
+void __init native_pv_lock_init(void)
{
- return len;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_enable(&virt_spin_lock_key);
}
-struct branch {
- unsigned char opcode;
- u32 delta;
-} __attribute__((packed));
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
-unsigned paravirt_patch_call(void *insnbuf,
- const void *target, u16 tgt_clobbers,
- unsigned long addr, u16 site_clobbers,
- unsigned len)
+static u64 native_steal_clock(int cpu)
{
- struct branch *b = insnbuf;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (tgt_clobbers & ~site_clobbers)
- return len; /* target would clobber too much for this site */
- if (len < 5)
- return len; /* call too long for patch site */
-
- b->opcode = 0xe8; /* call */
- b->delta = delta;
- BUILD_BUG_ON(sizeof(*b) != 5);
-
- return 5;
-}
-
-unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
- unsigned long addr, unsigned len)
-{
- struct branch *b = insnbuf;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (len < 5)
- return len; /* call too long for patch site */
-
- b->opcode = 0xe9; /* jmp */
- b->delta = delta;
-
- return 5;
+ return 0;
}
-/* Neat trick to map patch type back to the call within the
- * corresponding structure. */
-static void *get_call_destination(u8 type)
-{
- struct paravirt_patch_template tmpl = {
- .pv_init_ops = pv_init_ops,
- .pv_time_ops = pv_time_ops,
- .pv_cpu_ops = pv_cpu_ops,
- .pv_irq_ops = pv_irq_ops,
- .pv_apic_ops = pv_apic_ops,
- .pv_mmu_ops = pv_mmu_ops,
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- .pv_lock_ops = pv_lock_ops,
-#endif
- };
- return *((void **)&tmpl + type);
-}
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
-unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
- unsigned long addr, unsigned len)
+void paravirt_set_sched_clock(u64 (*func)(void))
{
- void *opfunc = get_call_destination(type);
- unsigned ret;
-
- if (opfunc == NULL)
- /* If there's no function, patch it with a ud2a (BUG) */
- ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
- else if (opfunc == _paravirt_nop)
- /* If the operation is a nop, then nop the callsite */
- ret = paravirt_patch_nop();
-
- /* identity functions just return their single argument */
- else if (opfunc == _paravirt_ident_32)
- ret = paravirt_patch_ident_32(insnbuf, len);
- else if (opfunc == _paravirt_ident_64)
- ret = paravirt_patch_ident_64(insnbuf, len);
-
- else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
- type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
- /* If operation requires a jmp, then jmp */
- ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
- else
- /* Otherwise call the function; assume target could
- clobber any caller-save reg */
- ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
- addr, clobbers, len);
-
- return ret;
+ static_call_update(pv_sched_clock, func);
}
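
The sched-clock hook is now a static call rather than an indirect function
pointer: callers go through static_call(pv_sched_clock)(), and retargeting
happens once via static_call_update(). A hedged sketch of how a guest would
use the setter above (myhv_* names are hypothetical):

#include <linux/types.h>
#include <asm/paravirt.h>

static u64 myhv_sched_clock(void)
{
	return 0;	/* hypothetical: read the hypervisor's clock here */
}

static void __init myhv_time_init(void)
{
	/* replaces native_sched_clock for every future caller */
	paravirt_set_sched_clock(myhv_sched_clock);
}
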
-unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
- const char *start, const char *end)
+static noinstr void pv_native_safe_halt(void)
{
- unsigned insn_len = end - start;
-
- if (insn_len > len || start == NULL)
- insn_len = len;
- else
- memcpy(insnbuf, start, insn_len);
-
- return insn_len;
+ native_safe_halt();
}
-static void native_flush_tlb(void)
+#ifdef CONFIG_PARAVIRT_XXL
+static noinstr void pv_native_write_cr2(unsigned long val)
{
- __native_flush_tlb();
+ native_write_cr2(val);
}
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-static void native_flush_tlb_global(void)
+static noinstr unsigned long pv_native_read_cr3(void)
{
- __native_flush_tlb_global();
+ return __native_read_cr3();
}
-static void native_flush_tlb_single(unsigned long addr)
+static noinstr void pv_native_write_cr3(unsigned long cr3)
{
- __native_flush_tlb_single(addr);
+ native_write_cr3(cr3);
}
-/* These are in entry.S */
-extern void native_iret(void);
-extern void native_irq_enable_sysexit(void);
-extern void native_usergs_sysret32(void);
-extern void native_usergs_sysret64(void);
-
-static struct resource reserve_ioports = {
- .start = 0,
- .end = IO_SPACE_LIMIT,
- .name = "paravirt-ioport",
- .flags = IORESOURCE_IO | IORESOURCE_BUSY,
-};
-
-/*
- * Reserve the whole legacy IO space to prevent any legacy drivers
- * from wasting time probing for their hardware. This is a fairly
- * brute-force approach to disabling all non-virtual drivers.
- *
- * Note that this must be called very early to have any effect.
- */
-int paravirt_disable_iospace(void)
+static noinstr unsigned long pv_native_get_debugreg(int regno)
{
- return request_resource(&ioport_resource, &reserve_ioports);
+ return native_get_debugreg(regno);
}
-static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
-
-static inline void enter_lazy(enum paravirt_lazy_mode mode)
+static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-
- percpu_write(paravirt_lazy_mode, mode);
+ native_set_debugreg(regno, val);
}
+#endif
-static void leave_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
-
- percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
-}
+struct pv_info pv_info = {
+ .name = "bare hardware",
+#ifdef CONFIG_PARAVIRT_XXL
+ .extra_user_64bit_cs = __USER_CS,
+#endif
+};
-void paravirt_enter_lazy_mmu(void)
-{
- enter_lazy(PARAVIRT_LAZY_MMU);
-}
+/* 64-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
-void paravirt_leave_lazy_mmu(void)
-{
- leave_lazy(PARAVIRT_LAZY_MMU);
-}
+struct paravirt_patch_template pv_ops = {
+ /* Cpu ops. */
+ .cpu.io_delay = native_io_delay,
+
+#ifdef CONFIG_PARAVIRT_XXL
+ .cpu.cpuid = native_cpuid,
+ .cpu.get_debugreg = pv_native_get_debugreg,
+ .cpu.set_debugreg = pv_native_set_debugreg,
+ .cpu.read_cr0 = native_read_cr0,
+ .cpu.write_cr0 = native_write_cr0,
+ .cpu.write_cr4 = native_write_cr4,
+ .cpu.read_msr = native_read_msr,
+ .cpu.write_msr = native_write_msr,
+ .cpu.read_msr_safe = native_read_msr_safe,
+ .cpu.write_msr_safe = native_write_msr_safe,
+ .cpu.read_pmc = native_read_pmc,
+ .cpu.load_tr_desc = native_load_tr_desc,
+ .cpu.set_ldt = native_set_ldt,
+ .cpu.load_gdt = native_load_gdt,
+ .cpu.load_idt = native_load_idt,
+ .cpu.store_tr = native_store_tr,
+ .cpu.load_tls = native_load_tls,
+ .cpu.load_gs_index = native_load_gs_index,
+ .cpu.write_ldt_entry = native_write_ldt_entry,
+ .cpu.write_gdt_entry = native_write_gdt_entry,
+ .cpu.write_idt_entry = native_write_idt_entry,
+
+ .cpu.alloc_ldt = paravirt_nop,
+ .cpu.free_ldt = paravirt_nop,
+
+ .cpu.load_sp0 = native_load_sp0,
+
+#ifdef CONFIG_X86_IOPL_IOPERM
+ .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
+ .cpu.update_io_bitmap = native_tss_update_io_bitmap,
+#endif
-void paravirt_start_context_switch(struct task_struct *prev)
-{
- BUG_ON(preemptible());
+ .cpu.start_context_switch = paravirt_nop,
+ .cpu.end_context_switch = paravirt_nop,
- if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
- }
- enter_lazy(PARAVIRT_LAZY_CPU);
-}
+ /* Irq ops. */
+ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
+ .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
+ .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
+#endif /* CONFIG_PARAVIRT_XXL */
-void paravirt_end_context_switch(struct task_struct *next)
-{
- BUG_ON(preemptible());
+ /* Irq HLT ops. */
+ .irq.safe_halt = pv_native_safe_halt,
+ .irq.halt = native_halt,
- leave_lazy(PARAVIRT_LAZY_CPU);
+ /* Mmu ops. */
+ .mmu.flush_tlb_user = native_flush_tlb_local,
+ .mmu.flush_tlb_kernel = native_flush_tlb_global,
+ .mmu.flush_tlb_one_user = native_flush_tlb_one_user,
+ .mmu.flush_tlb_multi = native_flush_tlb_multi,
- if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
- arch_enter_lazy_mmu_mode();
-}
+ .mmu.exit_mmap = paravirt_nop,
+ .mmu.notify_page_enc_status_changed = paravirt_nop,
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
-{
- if (in_interrupt())
- return PARAVIRT_LAZY_NONE;
+#ifdef CONFIG_PARAVIRT_XXL
+ .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
+ .mmu.write_cr2 = pv_native_write_cr2,
+ .mmu.read_cr3 = pv_native_read_cr3,
+ .mmu.write_cr3 = pv_native_write_cr3,
- return percpu_read(paravirt_lazy_mode);
-}
+ .mmu.pgd_alloc = __paravirt_pgd_alloc,
+ .mmu.pgd_free = paravirt_nop,
-void arch_flush_lazy_mmu_mode(void)
-{
- preempt_disable();
+ .mmu.alloc_pte = paravirt_nop,
+ .mmu.alloc_pmd = paravirt_nop,
+ .mmu.alloc_pud = paravirt_nop,
+ .mmu.alloc_p4d = paravirt_nop,
+ .mmu.release_pte = paravirt_nop,
+ .mmu.release_pmd = paravirt_nop,
+ .mmu.release_pud = paravirt_nop,
+ .mmu.release_p4d = paravirt_nop,
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
+ .mmu.set_pte = native_set_pte,
+ .mmu.set_pmd = native_set_pmd,
- preempt_enable();
-}
+ .mmu.ptep_modify_prot_start = __ptep_modify_prot_start,
+ .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-struct pv_info pv_info = {
- .name = "bare hardware",
- .paravirt_enabled = 0,
- .kernel_rpl = 0,
- .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
-};
+ .mmu.set_pud = native_set_pud,
-struct pv_init_ops pv_init_ops = {
- .patch = native_patch,
-};
+ .mmu.pmd_val = PTE_IDENT,
+ .mmu.make_pmd = PTE_IDENT,
-struct pv_time_ops pv_time_ops = {
- .sched_clock = native_sched_clock,
-};
+ .mmu.pud_val = PTE_IDENT,
+ .mmu.make_pud = PTE_IDENT,
-struct pv_irq_ops pv_irq_ops = {
- .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
- .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
- .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
- .safe_halt = native_safe_halt,
- .halt = native_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
-#endif
-};
+ .mmu.set_p4d = native_set_p4d,
-struct pv_cpu_ops pv_cpu_ops = {
- .cpuid = native_cpuid,
- .get_debugreg = native_get_debugreg,
- .set_debugreg = native_set_debugreg,
- .clts = native_clts,
- .read_cr0 = native_read_cr0,
- .write_cr0 = native_write_cr0,
- .read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
- .write_cr4 = native_write_cr4,
-#ifdef CONFIG_X86_64
- .read_cr8 = native_read_cr8,
- .write_cr8 = native_write_cr8,
-#endif
- .wbinvd = native_wbinvd,
- .read_msr = native_read_msr_safe,
- .rdmsr_regs = native_rdmsr_safe_regs,
- .write_msr = native_write_msr_safe,
- .wrmsr_regs = native_wrmsr_safe_regs,
- .read_tsc = native_read_tsc,
- .read_pmc = native_read_pmc,
- .read_tscp = native_read_tscp,
- .load_tr_desc = native_load_tr_desc,
- .set_ldt = native_set_ldt,
- .load_gdt = native_load_gdt,
- .load_idt = native_load_idt,
- .store_gdt = native_store_gdt,
- .store_idt = native_store_idt,
- .store_tr = native_store_tr,
- .load_tls = native_load_tls,
-#ifdef CONFIG_X86_64
- .load_gs_index = native_load_gs_index,
-#endif
- .write_ldt_entry = native_write_ldt_entry,
- .write_gdt_entry = native_write_gdt_entry,
- .write_idt_entry = native_write_idt_entry,
+ .mmu.p4d_val = PTE_IDENT,
+ .mmu.make_p4d = PTE_IDENT,
- .alloc_ldt = paravirt_nop,
- .free_ldt = paravirt_nop,
+ .mmu.set_pgd = native_set_pgd,
- .load_sp0 = native_load_sp0,
+ .mmu.pte_val = PTE_IDENT,
+ .mmu.pgd_val = PTE_IDENT,
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
- .irq_enable_sysexit = native_irq_enable_sysexit,
-#endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
- .usergs_sysret32 = native_usergs_sysret32,
-#endif
- .usergs_sysret64 = native_usergs_sysret64,
-#endif
- .iret = native_iret,
- .swapgs = native_swapgs,
+ .mmu.make_pte = PTE_IDENT,
+ .mmu.make_pgd = PTE_IDENT,
- .set_iopl_mask = native_set_iopl_mask,
- .io_delay = native_io_delay,
+ .mmu.enter_mmap = paravirt_nop,
- .start_context_switch = paravirt_nop,
- .end_context_switch = paravirt_nop,
-};
+ .mmu.lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ .flush = paravirt_nop,
+ },
-struct pv_apic_ops pv_apic_ops = {
-#ifdef CONFIG_X86_LOCAL_APIC
- .startup_ipi_hook = paravirt_nop,
+ .mmu.set_fixmap = native_set_fixmap,
+#endif /* CONFIG_PARAVIRT_XXL */
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ /* Lock ops. */
+#ifdef CONFIG_SMP
+ .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .lock.queued_spin_unlock =
+ PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .lock.wait = paravirt_nop,
+ .lock.kick = paravirt_nop,
+ .lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+#endif /* SMP */
#endif
};
-#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
-/* 32-bit pagetable entries */
-#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
-#else
-/* 64-bit pagetable entries */
-#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#ifdef CONFIG_PARAVIRT_XXL
+NOKPROBE_SYMBOL(native_load_idt);
#endif
-struct pv_mmu_ops pv_mmu_ops = {
-
- .read_cr2 = native_read_cr2,
- .write_cr2 = native_write_cr2,
- .read_cr3 = native_read_cr3,
- .write_cr3 = native_write_cr3,
-
- .flush_tlb_user = native_flush_tlb,
- .flush_tlb_kernel = native_flush_tlb_global,
- .flush_tlb_single = native_flush_tlb_single,
- .flush_tlb_others = native_flush_tlb_others,
-
- .pgd_alloc = __paravirt_pgd_alloc,
- .pgd_free = paravirt_nop,
-
- .alloc_pte = paravirt_nop,
- .alloc_pmd = paravirt_nop,
- .alloc_pmd_clone = paravirt_nop,
- .alloc_pud = paravirt_nop,
- .release_pte = paravirt_nop,
- .release_pmd = paravirt_nop,
- .release_pud = paravirt_nop,
-
- .set_pte = native_set_pte,
- .set_pte_at = native_set_pte_at,
- .set_pmd = native_set_pmd,
- .pte_update = paravirt_nop,
- .pte_update_defer = paravirt_nop,
-
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- .set_pte_atomic = native_set_pte_atomic,
- .pte_clear = native_pte_clear,
- .pmd_clear = native_pmd_clear,
-#endif
- .set_pud = native_set_pud,
-
- .pmd_val = PTE_IDENT,
- .make_pmd = PTE_IDENT,
-
-#if PAGETABLE_LEVELS == 4
- .pud_val = PTE_IDENT,
- .make_pud = PTE_IDENT,
-
- .set_pgd = native_set_pgd,
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-
- .pte_val = PTE_IDENT,
- .pgd_val = PTE_IDENT,
-
- .make_pte = PTE_IDENT,
- .make_pgd = PTE_IDENT,
-
- .dup_mmap = paravirt_nop,
- .exit_mmap = paravirt_nop,
- .activate_mm = paravirt_nop,
-
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
- },
-
- .set_fixmap = native_set_fixmap,
-};
-
-EXPORT_SYMBOL_GPL(pv_time_ops);
-EXPORT_SYMBOL (pv_cpu_ops);
-EXPORT_SYMBOL (pv_mmu_ops);
-EXPORT_SYMBOL_GPL(pv_apic_ops);
+EXPORT_SYMBOL(pv_ops);
EXPORT_SYMBOL_GPL(pv_info);
-EXPORT_SYMBOL (pv_irq_ops);
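The PTE_IDENT initializers above are identity callbacks: with native page tables a pte/pgd value needs no transformation, so the hook simply hands its argument back. A minimal sketch of what the underlying helper amounts to (the in-tree versions are notrace and wrapped via __PV_IS_CALLEE_SAVE, details not shown in this hunk):

/* Sketch, assuming the 64-bit variant; the 32-bit one is analogous.
 * u64 is the kernel's unsigned 64-bit typedef. */
u64 _paravirt_ident_64(u64 x)
{
	return x;	/* native page-table values pass through unchanged */
}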
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
deleted file mode 100644
index d9f32e6d6ab6..000000000000
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ /dev/null
@@ -1,61 +0,0 @@
-#include <asm/paravirt.h>
-
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
-DEF_NATIVE(pv_cpu_ops, iret, "iret");
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
-
-unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
-{
- /* arg in %eax, return in %eax */
- return 0;
-}
-
-unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
-{
- /* arg in %edx:%eax, return in %edx:%eax */
- return 0;
-}
-
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
-{
- const unsigned char *start, *end;
- unsigned ret;
-
-#define PATCH_SITE(ops, x) \
- case PARAVIRT_PATCH(ops.x): \
- start = start_##ops##_##x; \
- end = end_##ops##_##x; \
- goto patch_site
- switch (type) {
- PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_irq_ops, irq_enable);
- PATCH_SITE(pv_irq_ops, restore_fl);
- PATCH_SITE(pv_irq_ops, save_fl);
- PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
- PATCH_SITE(pv_mmu_ops, read_cr2);
- PATCH_SITE(pv_mmu_ops, read_cr3);
- PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_cpu_ops, read_tsc);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
-
- default:
- ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
- break;
- }
-#undef PATCH_SITE
- return ret;
-}
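For reference, the DEF_NATIVE() sites in this deleted file worked by emitting the native instruction bytes between two linker-visible labels, which native_patch() then copies over the paravirt indirect-call site. Roughly, as a hedged sketch of the macro from asm/paravirt_types.h of this era (the macro itself is not part of this patch):

/* Assumed simplification of the real DEF_NATIVE() definition. */
#define DEF_NATIVE(ops, name, code)					\
	extern const char start_##ops##_##name[], end_##ops##_##name[]; \
	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")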
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
deleted file mode 100644
index 3f08f34f93eb..000000000000
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <asm/paravirt.h>
-#include <asm/asm-offsets.h>
-#include <linux/stringify.h>
-
-DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
-DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
-DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
-DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
-DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
-DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
-DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
-
-DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
-DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
-DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
-
-DEF_NATIVE(, mov32, "mov %edi, %eax");
-DEF_NATIVE(, mov64, "mov %rdi, %rax");
-
-unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
-{
- return paravirt_patch_insns(insnbuf, len,
- start__mov32, end__mov32);
-}
-
-unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
-{
- return paravirt_patch_insns(insnbuf, len,
- start__mov64, end__mov64);
-}
-
-unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
-{
- const unsigned char *start, *end;
- unsigned ret;
-
-#define PATCH_SITE(ops, x) \
- case PARAVIRT_PATCH(ops.x): \
- start = start_##ops##_##x; \
- end = end_##ops##_##x; \
- goto patch_site
- switch(type) {
- PATCH_SITE(pv_irq_ops, restore_fl);
- PATCH_SITE(pv_irq_ops, save_fl);
- PATCH_SITE(pv_irq_ops, irq_enable);
- PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
- PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
- PATCH_SITE(pv_cpu_ops, usergs_sysret32);
- PATCH_SITE(pv_cpu_ops, usergs_sysret64);
- PATCH_SITE(pv_cpu_ops, swapgs);
- PATCH_SITE(pv_mmu_ops, read_cr2);
- PATCH_SITE(pv_mmu_ops, read_cr3);
- PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
- PATCH_SITE(pv_cpu_ops, wbinvd);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
-
- default:
- ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
- break;
- }
-#undef PATCH_SITE
- return ret;
-}
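The effect of these patches is that a former indirect call through the pv_*_ops tables collapses into one or two native instructions: paravirt_patch_insns() copies the template into the call site if it fits, and the remaining bytes are NOP-padded. A simplified, assumption-laden sketch of that copy step:

/*
 * Before patching:  call *pv_mmu_ops.pgd_val    (argument in %rdi)
 * After patching:   mov %rdi, %rax              (the mov64 template)
 */
#include <string.h>

static unsigned patch_insns(void *ibuf, unsigned len,
			    const char *start, const char *end)
{
	unsigned insn_len = end - start;

	if (insn_len > len)	/* template must fit within the call site */
		return 0;	/* fall back to the default patcher */
	memcpy(ibuf, start, insn_len);
	return insn_len;	/* caller NOP-fills the remaining bytes */
}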
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
deleted file mode 100644
index 078d4ec1a9d9..000000000000
--- a/arch/x86/kernel/pci-calgary_64.c
+++ /dev/null
@@ -1,1596 +0,0 @@
-/*
- * Derived from arch/powerpc/kernel/iommu.c
- *
- * Copyright IBM Corporation, 2006-2007
- * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
- *
- * Author: Jon Mason <jdmason@kudzu.us>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
-
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/crash_dump.h>
-#include <linux/dma-mapping.h>
-#include <linux/bitmap.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/scatterlist.h>
-#include <linux/iommu-helper.h>
-
-#include <asm/iommu.h>
-#include <asm/calgary.h>
-#include <asm/tce.h>
-#include <asm/pci-direct.h>
-#include <asm/system.h>
-#include <asm/dma.h>
-#include <asm/rio.h>
-#include <asm/bios_ebda.h>
-#include <asm/x86_init.h>
-
-#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
-int use_calgary __read_mostly = 1;
-#else
-int use_calgary __read_mostly = 0;
-#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */
-
-#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
-#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
-
-/* register offsets inside the host bridge space */
-#define CALGARY_CONFIG_REG 0x0108
-#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
-#define PHB_PLSSR_OFFSET 0x0120
-#define PHB_CONFIG_RW_OFFSET 0x0160
-#define PHB_IOBASE_BAR_LOW 0x0170
-#define PHB_IOBASE_BAR_HIGH 0x0180
-#define PHB_MEM_1_LOW 0x0190
-#define PHB_MEM_1_HIGH 0x01A0
-#define PHB_IO_ADDR_SIZE 0x01B0
-#define PHB_MEM_1_SIZE 0x01C0
-#define PHB_MEM_ST_OFFSET 0x01D0
-#define PHB_AER_OFFSET 0x0200
-#define PHB_CONFIG_0_HIGH 0x0220
-#define PHB_CONFIG_0_LOW 0x0230
-#define PHB_CONFIG_0_END 0x0240
-#define PHB_MEM_2_LOW 0x02B0
-#define PHB_MEM_2_HIGH 0x02C0
-#define PHB_MEM_2_SIZE_HIGH 0x02D0
-#define PHB_MEM_2_SIZE_LOW 0x02E0
-#define PHB_DOSHOLE_OFFSET 0x08E0
-
-/* CalIOC2 specific */
-#define PHB_SAVIOR_L2 0x0DB0
-#define PHB_PAGE_MIG_CTRL 0x0DA8
-#define PHB_PAGE_MIG_DEBUG 0x0DA0
-#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
-
-/* PHB_CONFIG_RW */
-#define PHB_TCE_ENABLE 0x20000000
-#define PHB_SLOT_DISABLE 0x1C000000
-#define PHB_DAC_DISABLE 0x01000000
-#define PHB_MEM2_ENABLE 0x00400000
-#define PHB_MCSR_ENABLE 0x00100000
-/* TAR (Table Address Register) */
-#define TAR_SW_BITS 0x0000ffffffff800fUL
-#define TAR_VALID 0x0000000000000008UL
-/* CSR (Channel/DMA Status Register) */
-#define CSR_AGENT_MASK 0xffe0ffff
-/* CCR (Calgary Configuration Register) */
-#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
-/* PMCR/PMDR (Page Migration Control/Debug Registers) */
-#define PMR_SOFTSTOP 0x80000000
-#define PMR_SOFTSTOPFAULT 0x40000000
-#define PMR_HARDSTOP 0x20000000
-
-/*
- * The maximum PHB bus number.
- * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
- * x3950M2: 4 chassis, 48 PHBs per chassis = 192
- * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
- * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
- */
-#define MAX_PHB_BUS_NUM 256
-
-#define PHBS_PER_CALGARY 4
-
-/* register offsets in Calgary's internal register space */
-static const unsigned long tar_offsets[] = {
- 0x0580 /* TAR0 */,
- 0x0588 /* TAR1 */,
- 0x0590 /* TAR2 */,
- 0x0598 /* TAR3 */
-};
-
-static const unsigned long split_queue_offsets[] = {
- 0x4870 /* SPLIT QUEUE 0 */,
- 0x5870 /* SPLIT QUEUE 1 */,
- 0x6870 /* SPLIT QUEUE 2 */,
- 0x7870 /* SPLIT QUEUE 3 */
-};
-
-static const unsigned long phb_offsets[] = {
- 0x8000 /* PHB0 */,
- 0x9000 /* PHB1 */,
- 0xA000 /* PHB2 */,
- 0xB000 /* PHB3 */
-};
-
-/* PHB debug registers */
-
-static const unsigned long phb_debug_offsets[] = {
- 0x4000 /* PHB 0 DEBUG */,
- 0x5000 /* PHB 1 DEBUG */,
- 0x6000 /* PHB 2 DEBUG */,
- 0x7000 /* PHB 3 DEBUG */
-};
-
-/*
- * STUFF register for each debug PHB,
- * byte 1 = start bus number, byte 2 = end bus number
- */
-
-#define PHB_DEBUG_STUFF_OFFSET 0x0020
-
-#define EMERGENCY_PAGES 32 /* = 128KB */
-
-unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
-static int translate_empty_slots __read_mostly = 0;
-static int calgary_detected __read_mostly = 0;
-
-static struct rio_table_hdr *rio_table_hdr __initdata;
-static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
-
-struct calgary_bus_info {
- void *tce_space;
- unsigned char translation_disabled;
- signed char phbid;
- void __iomem *bbar;
-};
-
-static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calgary_tce_cache_blast(struct iommu_table *tbl);
-static void calgary_dump_error_regs(struct iommu_table *tbl);
-static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calioc2_tce_cache_blast(struct iommu_table *tbl);
-static void calioc2_dump_error_regs(struct iommu_table *tbl);
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
-static void get_tce_space_from_tar(void);
-
-static struct cal_chipset_ops calgary_chip_ops = {
- .handle_quirks = calgary_handle_quirks,
- .tce_cache_blast = calgary_tce_cache_blast,
- .dump_error_regs = calgary_dump_error_regs
-};
-
-static struct cal_chipset_ops calioc2_chip_ops = {
- .handle_quirks = calioc2_handle_quirks,
- .tce_cache_blast = calioc2_tce_cache_blast,
- .dump_error_regs = calioc2_dump_error_regs
-};
-
-static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-
-static inline int translation_enabled(struct iommu_table *tbl)
-{
- /* only PHBs with translation enabled have an IOMMU table */
- return (tbl != NULL);
-}
-
-static void iommu_range_reserve(struct iommu_table *tbl,
- unsigned long start_addr, unsigned int npages)
-{
- unsigned long index;
- unsigned long end;
- unsigned long flags;
-
- index = start_addr >> PAGE_SHIFT;
-
- /* bail out if we're asked to reserve a region we don't cover */
- if (index >= tbl->it_size)
- return;
-
- end = index + npages;
- if (end > tbl->it_size) /* don't go off the table */
- end = tbl->it_size;
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- bitmap_set(tbl->it_map, index, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static unsigned long iommu_range_alloc(struct device *dev,
- struct iommu_table *tbl,
- unsigned int npages)
-{
- unsigned long flags;
- unsigned long offset;
- unsigned long boundary_size;
-
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
-
- BUG_ON(npages == 0);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- tbl->chip_ops->tce_cache_blast(tbl);
-
- offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
- npages, 0, boundary_size, 0);
- if (offset == ~0UL) {
- printk(KERN_WARNING "Calgary: IOMMU full.\n");
- spin_unlock_irqrestore(&tbl->it_lock, flags);
- if (panic_on_overflow)
- panic("Calgary: fix the allocator.\n");
- else
- return DMA_ERROR_CODE;
- }
- }
-
- tbl->it_hint = offset + npages;
- BUG_ON(tbl->it_hint > tbl->it_size);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-
- return offset;
-}
-
-static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
- void *vaddr, unsigned int npages, int direction)
-{
- unsigned long entry;
- dma_addr_t ret;
-
- entry = iommu_range_alloc(dev, tbl, npages);
-
- if (unlikely(entry == DMA_ERROR_CODE)) {
- printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
- "iommu %p\n", npages, tbl);
- return DMA_ERROR_CODE;
- }
-
- /* set the return dma address */
- ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
-
- /* put the TCEs in the HW table */
- tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
- direction);
- return ret;
-}
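The returned bus address packs the allocated TCE index into the page-frame bits and keeps the sub-page offset from the CPU virtual address. A worked example with hypothetical numbers:

/*
 * PAGE_SHIFT = 12, entry = 0x42, vaddr ending in 0x678:
 *
 *	ret = (0x42 << 12) | (vaddr & ~PAGE_MASK)
 *	    = 0x42000 | 0x678
 *	    = 0x42678
 *
 * The device uses bus address 0x42678, while TCE slot 0x42 maps the
 * page-aligned physical page behind vaddr.
 */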
-
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
- unsigned int npages)
-{
- unsigned long entry;
- unsigned long badend;
- unsigned long flags;
-
- /* were we called with bad_dma_address? */
- badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
- if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
- WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
- "address 0x%Lx\n", dma_addr);
- return;
- }
-
- entry = dma_addr >> PAGE_SHIFT;
-
- BUG_ON(entry + npages > tbl->it_size);
-
- tce_free(tbl, entry, npages);
-
- spin_lock_irqsave(&tbl->it_lock, flags);
-
- bitmap_clear(tbl->it_map, entry, npages);
-
- spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static inline struct iommu_table *find_iommu_table(struct device *dev)
-{
- struct pci_dev *pdev;
- struct pci_bus *pbus;
- struct iommu_table *tbl;
-
- pdev = to_pci_dev(dev);
-
- /* search up the device tree for an iommu */
- pbus = pdev->bus;
- do {
- tbl = pci_iommu(pbus);
- if (tbl && tbl->it_busno == pbus->number)
- break;
- tbl = NULL;
- pbus = pbus->parent;
- } while (pbus);
-
- BUG_ON(tbl && (tbl->it_busno != pbus->number));
-
- return tbl;
-}
-
-static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
- int nelems,enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- int i;
-
- if (!translation_enabled(tbl))
- return;
-
- for_each_sg(sglist, s, nelems, i) {
- unsigned int npages;
- dma_addr_t dma = s->dma_address;
- unsigned int dmalen = s->dma_length;
-
- if (dmalen == 0)
- break;
-
- npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
- iommu_free(tbl, dma, npages);
- }
-}
-
-static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
- int nelems, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- struct scatterlist *s;
- unsigned long vaddr;
- unsigned int npages;
- unsigned long entry;
- int i;
-
- for_each_sg(sg, s, nelems, i) {
- BUG_ON(!sg_page(s));
-
- vaddr = (unsigned long) sg_virt(s);
- npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
-
- entry = iommu_range_alloc(dev, tbl, npages);
- if (entry == DMA_ERROR_CODE) {
- /* makes sure unmap knows to stop */
- s->dma_length = 0;
- goto error;
- }
-
- s->dma_address = (entry << PAGE_SHIFT) | s->offset;
-
- /* insert into HW table */
- tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
-
- s->dma_length = s->length;
- }
-
- return nelems;
-error:
- calgary_unmap_sg(dev, sg, nelems, dir, NULL);
- for_each_sg(sg, s, nelems, i) {
- sg->dma_address = DMA_ERROR_CODE;
- sg->dma_length = 0;
- }
- return 0;
-}
-
-static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- void *vaddr = page_address(page) + offset;
- unsigned long uaddr;
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- uaddr = (unsigned long)vaddr;
- npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
-
- return iommu_alloc(dev, tbl, vaddr, npages, dir);
-}
-
-static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
- size_t size, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct iommu_table *tbl = find_iommu_table(dev);
- unsigned int npages;
-
- npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
- iommu_free(tbl, dma_addr, npages);
-}
-
-static void* calgary_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
-{
- void *ret = NULL;
- dma_addr_t mapping;
- unsigned int npages, order;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size); /* size rounded up to full pages */
- npages = size >> PAGE_SHIFT;
- order = get_order(size);
-
- flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- /* alloc enough pages (and possibly more) */
- ret = (void *)__get_free_pages(flag, order);
- if (!ret)
- goto error;
- memset(ret, 0, size);
-
- /* set up tces to cover the allocated range */
- mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == DMA_ERROR_CODE)
- goto free;
- *dma_handle = mapping;
- return ret;
-free:
- free_pages((unsigned long)ret, get_order(size));
- ret = NULL;
-error:
- return ret;
-}
-
-static void calgary_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
-{
- unsigned int npages;
- struct iommu_table *tbl = find_iommu_table(dev);
-
- size = PAGE_ALIGN(size);
- npages = size >> PAGE_SHIFT;
-
- iommu_free(tbl, dma_handle, npages);
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static struct dma_map_ops calgary_dma_ops = {
- .alloc_coherent = calgary_alloc_coherent,
- .free_coherent = calgary_free_coherent,
- .map_sg = calgary_map_sg,
- .unmap_sg = calgary_unmap_sg,
- .map_page = calgary_map_page,
- .unmap_page = calgary_unmap_page,
-};
-
-static inline void __iomem * busno_to_bbar(unsigned char num)
-{
- return bus_info[num].bbar;
-}
-
-static inline int busno_to_phbid(unsigned char num)
-{
- return bus_info[num].phbid;
-}
-
-static inline unsigned long split_queue_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return split_queue_offsets[idx];
-}
-
-static inline unsigned long tar_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return tar_offsets[idx];
-}
-
-static inline unsigned long phb_offset(unsigned char num)
-{
- size_t idx = busno_to_phbid(num);
-
- return phb_offsets[idx];
-}
-
-static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
-{
- unsigned long target = ((unsigned long)bar) | offset;
- return (void __iomem*)target;
-}
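calgary_reg() composes register addresses with a bitwise OR rather than an addition; that only works because the 1MB config-space mapping is naturally aligned, so the offset bits never carry into the base. A hypothetical use (bus number made up for illustration):

/* Read the channel status register of the PHB behind bus 0x12. */
void __iomem *target = calgary_reg(bbar,
				   phb_offset(0x12) | PHB_CSR_OFFSET);
u32 csr = be32_to_cpu(readl(target));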
-
-static inline int is_calioc2(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALIOC2);
-}
-
-static inline int is_calgary(unsigned short device)
-{
- return (device == PCI_DEVICE_ID_IBM_CALGARY);
-}
-
-static inline int is_cal_pci_dev(unsigned short device)
-{
- return (is_calgary(device) || is_calioc2(device));
-}
-
-static void calgary_tce_cache_blast(struct iommu_table *tbl)
-{
- u64 val;
- u32 aer;
- int i = 0;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
-
- /* disable arbitration on the bus */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- aer = readl(target);
- writel(0, target);
-
- /* read plssr to ensure it got there */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- val = readl(target);
-
- /* poll split queues until all DMA activity is done */
- target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
- do {
- val = readq(target);
- i++;
- } while ((val & 0xff) != 0xff && i < 100);
- if (i == 100)
- printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
- "continuing anyway\n");
-
- /* invalidate TCE cache */
- target = calgary_reg(bbar, tar_offset(tbl->it_busno));
- writeq(tbl->tar_val, target);
-
- /* enable arbitration */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
- writel(aer, target);
- (void)readl(target); /* flush */
-}
-
-static void calioc2_tce_cache_blast(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u64 val64;
- u32 val;
- int i = 0;
- int count = 1;
- unsigned char bus = tbl->it_busno;
-
-begin:
- printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
- "sequence - count %d\n", bus, count);
-
- /* 1. using the Page Migration Control reg set SoftStop */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
- val |= PMR_SOFTSTOP;
- printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
-
- /* 2. poll split queues until all DMA activity is done */
- printk(KERN_DEBUG "2a. starting to poll split queues\n");
- target = calgary_reg(bbar, split_queue_offset(bus));
- do {
- val64 = readq(target);
- i++;
- } while ((val64 & 0xff) != 0xff && i < 100);
- if (i == 100)
- printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
- "continuing anyway\n");
-
- /* 3. poll Page Migration DEBUG for SoftStopFault */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
-
- /* 4. if SoftStopFault - goto (1) */
- if (val & PMR_SOFTSTOPFAULT) {
- if (++count < 100)
- goto begin;
- else {
- printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
- "aborting TCE cache flush sequence!\n");
- return; /* pray for the best */
- }
- }
-
- /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
-
- /* 6. invalidate TCE cache */
- printk(KERN_DEBUG "6. invalidating TCE cache\n");
- target = calgary_reg(bbar, tar_offset(bus));
- writeq(tbl->tar_val, target);
-
- /* 7. Re-read PMCR */
- printk(KERN_DEBUG "7a. Re-reading PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
-
- /* 8. Remove HardStop */
- printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
- target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
- val = 0;
- printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
- writel(cpu_to_be32(val), target);
- val = be32_to_cpu(readl(target));
- printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
-}
-
-static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
- u64 limit)
-{
- unsigned int numpages;
-
- limit = limit | 0xfffff;
- limit++;
-
- numpages = ((limit - start) >> PAGE_SHIFT);
- iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
-}
-
-static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
-{
- void __iomem *target;
- u64 low, high, sizelow;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* peripheral MEM_1 region */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
- sizelow = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
-{
- void __iomem *target;
- u32 val32;
- u64 low, high, sizelow, sizehigh;
- u64 start, limit;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
-
- /* is it enabled? */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- if (!(val32 & PHB_MEM2_ENABLE))
- return;
-
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
- low = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
- high = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
- sizelow = be32_to_cpu(readl(target));
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
- sizehigh = be32_to_cpu(readl(target));
-
- start = (high << 32) | low;
- limit = (sizehigh << 32) | sizelow;
-
- calgary_reserve_mem_region(dev, start, limit);
-}
-
-/*
- * some regions of the IO address space do not get translated, so we
- * must not give devices IO addresses in those regions. The regions
- * are the 640KB-1MB region and the two PCI peripheral memory holes.
- * Reserve all of them in the IOMMU bitmap to avoid giving them out
- * later.
- */
-static void __init calgary_reserve_regions(struct pci_dev *dev)
-{
- unsigned int npages;
- u64 start;
- struct iommu_table *tbl = pci_iommu(dev->bus);
-
- /* reserve EMERGENCY_PAGES from bad_dma_address and up */
- iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
-
- /* avoid the BIOS/VGA first 640KB-1MB region */
- /* for CalIOC2 - avoid the entire first MB */
- if (is_calgary(dev->device)) {
- start = (640 * 1024);
- npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
- } else { /* calioc2 */
- start = 0;
- npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
- }
- iommu_range_reserve(tbl, start, npages);
-
- /* reserve the two PCI peripheral memory regions in IO space */
- calgary_reserve_peripheral_mem_1(dev);
- calgary_reserve_peripheral_mem_2(dev);
-}
-
-static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
-{
- u64 val64;
- u64 table_phys;
- void __iomem *target;
- int ret;
- struct iommu_table *tbl;
-
- /* build TCE tables for each PHB */
- ret = build_tce_table(dev, bbar);
- if (ret)
- return ret;
-
- tbl = pci_iommu(dev->bus);
- tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
-
- if (is_kdump_kernel())
- calgary_init_bitmap_from_tce_table(tbl);
- else
- tce_free(tbl, 0, tbl->it_size);
-
- if (is_calgary(dev->device))
- tbl->chip_ops = &calgary_chip_ops;
- else if (is_calioc2(dev->device))
- tbl->chip_ops = &calioc2_chip_ops;
- else
- BUG();
-
- calgary_reserve_regions(dev);
-
- /* set TARs for each PHB */
- target = calgary_reg(bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
-
- /* zero out all TAR bits under sw control */
- val64 &= ~TAR_SW_BITS;
- table_phys = (u64)__pa(tbl->it_base);
-
- val64 |= table_phys;
-
- BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
- val64 |= (u64) specified_table_size;
-
- tbl->tar_val = cpu_to_be64(val64);
-
- writeq(tbl->tar_val, target);
- readq(target); /* flush */
-
- return 0;
-}
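TAR_SW_BITS marks the bits of the Table Address Register that software owns: the table's physical address field plus the low nibble holding the size code and the valid bit. With hypothetical numbers the update above works out as:

/*
 * table_phys = 0x4000000, specified_table_size = TCE_TABLE_SIZE_8M (7):
 *
 *	val64 &= ~TAR_SW_BITS;	keep only the hardware-owned bits
 *	val64 |= 0x4000000;	install the table's physical address
 *	val64 |= 7;		size code in the software low nibble
 */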
-
-static void __init calgary_free_bus(struct pci_dev *dev)
-{
- u64 val64;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- void __iomem *target;
- unsigned int bitmapsz;
-
- target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
- val64 = be64_to_cpu(readq(target));
- val64 &= ~TAR_SW_BITS;
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
- tbl->it_map = NULL;
-
- kfree(tbl);
-
- set_pci_iommu(dev->bus, NULL);
-
- /* Can't free bootmem allocated memory after system is up :-( */
- bus_info[dev->bus->number].tce_space = NULL;
-}
-
-static void calgary_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 csr, plssr;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
- "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
-}
-
-static void calioc2_dump_error_regs(struct iommu_table *tbl)
-{
- void __iomem *bbar = tbl->bbar;
- u32 csr, csmr, plssr, mck, rcstat;
- void __iomem *target;
- unsigned long phboff = phb_offset(tbl->it_busno);
- unsigned long erroff;
- u32 errregs[7];
- int i;
-
- /* dump CSR */
- target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
- csr = be32_to_cpu(readl(target));
- /* dump PLSSR */
- target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
- plssr = be32_to_cpu(readl(target));
- /* dump CSMR */
- target = calgary_reg(bbar, phboff | 0x290);
- csmr = be32_to_cpu(readl(target));
- /* dump mck */
- target = calgary_reg(bbar, phboff | 0x800);
- mck = be32_to_cpu(readl(target));
-
- printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
- tbl->it_busno);
-
- printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
- csr, plssr, csmr, mck);
-
- /* dump rest of error regs */
- printk(KERN_EMERG "Calgary: ");
- for (i = 0; i < ARRAY_SIZE(errregs); i++) {
- /* err regs are at 0x810 - 0x870 */
- erroff = (0x810 + (i * 0x10));
- target = calgary_reg(bbar, phboff | erroff);
- errregs[i] = be32_to_cpu(readl(target));
- printk("0x%08x@0x%lx ", errregs[i], erroff);
- }
- printk("\n");
-
- /* root complex status */
- target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
- rcstat = be32_to_cpu(readl(target));
- printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
- PHB_ROOT_COMPLEX_STATUS);
-}
-
-static void calgary_watchdog(unsigned long data)
-{
- struct pci_dev *dev = (struct pci_dev *)data;
- struct iommu_table *tbl = pci_iommu(dev->bus);
- void __iomem *bbar = tbl->bbar;
- u32 val32;
- void __iomem *target;
-
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
- val32 = be32_to_cpu(readl(target));
-
- /* If no error, the agent ID in the CSR is not valid */
- if (val32 & CSR_AGENT_MASK) {
- tbl->chip_ops->dump_error_regs(tbl);
-
- /* reset error */
- writel(0, target);
-
- /* Disable bus that caused the error */
- target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
- PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_SLOT_DISABLE;
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
- } else {
- /* Reset the timer */
- mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
- }
-}
-
-static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
- unsigned char busnum, unsigned long timeout)
-{
- u64 val64;
- void __iomem *target;
- unsigned int phb_shift = ~0; /* silence gcc */
- u64 mask;
-
- switch (busno_to_phbid(busnum)) {
- case 0: phb_shift = (63 - 19);
- break;
- case 1: phb_shift = (63 - 23);
- break;
- case 2: phb_shift = (63 - 27);
- break;
- case 3: phb_shift = (63 - 35);
- break;
- default:
- BUG_ON(busno_to_phbid(busnum));
- }
-
- target = calgary_reg(bbar, CALGARY_CONFIG_REG);
- val64 = be64_to_cpu(readq(target));
-
- /* zero out this PHB's timer bits */
- mask = ~(0xFUL << phb_shift);
- val64 &= mask;
- val64 |= (timeout << phb_shift);
- writeq(cpu_to_be64(val64), target);
- readq(target); /* flush */
-}
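Each PHB owns a 4-bit timeout field inside the shared Calgary configuration register; the switch above just selects where that field sits. For PHB 0 the arithmetic is:

/*
 * phb_shift = 63 - 19 = 44
 * mask      = ~(0xFUL << 44)	clears bits 47:44
 * val64    |= timeout << 44	installs the new 4-bit timeout there
 */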
-
-static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
- void __iomem *bbar = tbl->bbar;
- void __iomem *target;
- u32 val;
-
- /*
- * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
- */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
- val = cpu_to_be32(readl(target));
- val |= 0x00800000;
- writel(cpu_to_be32(val), target);
-}
-
-static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
- unsigned char busnum = dev->bus->number;
-
- /*
- * Give split completion a longer timeout on bus 1 for aic94xx
- * http://bugzilla.kernel.org/show_bug.cgi?id=7180
- */
- if (is_calgary(dev->device) && (busnum == 1))
- calgary_set_split_completion_timeout(tbl->bbar, busnum,
- CCR_2SEC_TIMEOUT);
-}
-
-static void __init calgary_enable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* enable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
-
- printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
- (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
- "Calgary" : "CalIOC2", busnum);
- printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
- "bus.\n");
-
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- init_timer(&tbl->watchdog_timer);
- tbl->watchdog_timer.function = &calgary_watchdog;
- tbl->watchdog_timer.data = (unsigned long)dev;
- mod_timer(&tbl->watchdog_timer, jiffies);
-}
-
-static void __init calgary_disable_translation(struct pci_dev *dev)
-{
- u32 val32;
- unsigned char busnum;
- void __iomem *target;
- void __iomem *bbar;
- struct iommu_table *tbl;
-
- busnum = dev->bus->number;
- tbl = pci_iommu(dev->bus);
- bbar = tbl->bbar;
-
- /* disable TCE in PHB Config Register */
- target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
- val32 = be32_to_cpu(readl(target));
- val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
-
- printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
- writel(cpu_to_be32(val32), target);
- readl(target); /* flush */
-
- del_timer_sync(&tbl->watchdog_timer);
-}
-
-static void __init calgary_init_one_nontranslated(struct pci_dev *dev)
-{
- pci_dev_get(dev);
- set_pci_iommu(dev->bus, NULL);
-
- /* is the device behind a bridge? */
- if (dev->bus->parent)
- dev->bus->parent->self = dev;
- else
- dev->bus->self = dev;
-}
-
-static int __init calgary_init_one(struct pci_dev *dev)
-{
- void __iomem *bbar;
- struct iommu_table *tbl;
- int ret;
-
- bbar = busno_to_bbar(dev->bus->number);
- ret = calgary_setup_tar(dev, bbar);
- if (ret)
- goto done;
-
- pci_dev_get(dev);
-
- if (dev->bus->parent) {
- if (dev->bus->parent->self)
- printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
- "bus->parent->self!\n", dev);
- dev->bus->parent->self = dev;
- } else
- dev->bus->self = dev;
-
- tbl = pci_iommu(dev->bus);
- tbl->chip_ops->handle_quirks(tbl, dev);
-
- calgary_enable_translation(dev);
-
- return 0;
-
-done:
- return ret;
-}
-
-static int __init calgary_locate_bbars(void)
-{
- int ret;
- int rioidx, phb, bus;
- void __iomem *bbar;
- void __iomem *target;
- unsigned long offset;
- u8 start_bus, end_bus;
- u32 val;
-
- ret = -ENODATA;
- for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
- struct rio_detail *rio = rio_devs[rioidx];
-
- if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
- continue;
-
- /* map entire 1MB of Calgary config space */
- bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
- if (!bbar)
- goto error;
-
- for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
- offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
- target = calgary_reg(bbar, offset);
-
- val = be32_to_cpu(readl(target));
-
- start_bus = (u8)((val & 0x00FF0000) >> 16);
- end_bus = (u8)((val & 0x0000FF00) >> 8);
-
- if (end_bus) {
- for (bus = start_bus; bus <= end_bus; bus++) {
- bus_info[bus].bbar = bbar;
- bus_info[bus].phbid = phb;
- }
- } else {
- bus_info[start_bus].bbar = bbar;
- bus_info[start_bus].phbid = phb;
- }
- }
- }
-
- return 0;
-
-error:
- /* scan bus_info and iounmap any bbars we previously ioremap'd */
- for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
- if (bus_info[bus].bbar)
- iounmap(bus_info[bus].bbar);
-
- return ret;
-}
-
-static int __init calgary_init(void)
-{
- int ret;
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- ret = calgary_locate_bbars();
- if (ret)
- return ret;
-
- /* Purely for kdump kernel case */
- if (is_kdump_kernel())
- get_tce_space_from_tar();
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- calgary_init_one_nontranslated(dev);
- continue;
- }
-
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- ret = calgary_init_one(dev);
- if (ret)
- goto error;
- } while (1);
-
- dev = NULL;
- for_each_pci_dev(dev) {
- struct iommu_table *tbl;
-
- tbl = find_iommu_table(&dev->dev);
-
- if (translation_enabled(tbl))
- dev->dev.archdata.dma_ops = &calgary_dma_ops;
- }
-
- return ret;
-
-error:
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled) {
- pci_dev_put(dev);
- continue;
- }
- if (!info->tce_space && !translate_empty_slots)
- continue;
-
- calgary_disable_translation(dev);
- calgary_free_bus(dev);
- pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
- dev->dev.archdata.dma_ops = NULL;
- } while (1);
-
- return ret;
-}
-
-static inline int __init determine_tce_table_size(u64 ram)
-{
- int ret;
-
- if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
- return specified_table_size;
-
- /*
- * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
- * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
- * larger table size has twice as many entries, so shift the
- * max ram address by 13 to divide by 8K and then look at the
- * order of the result to choose between 0-7.
- */
- ret = get_order(ram >> 13);
- if (ret > TCE_TABLE_SIZE_8M)
- ret = TCE_TABLE_SIZE_8M;
-
- return ret;
-}
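The shift-by-13 comment is easiest to see with concrete numbers; for a 4GB machine:

/*
 * ram >> 13        = 2^32 >> 13 = 2^19 = 512KB
 * get_order(512KB) = 7		(512KB = 128 4KB pages = 2^7 pages)
 *
 * so 4GB of RAM selects table size 7, i.e. TCE_TABLE_SIZE_8M.
 */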
-
-static int __init build_detail_arrays(void)
-{
- unsigned long ptr;
- unsigned numnodes, i;
- int scal_detail_size, rio_detail_size;
-
- numnodes = rio_table_hdr->num_scal_dev;
- if (numnodes > MAX_NUMNODES){
- printk(KERN_WARNING
- "Calgary: MAX_NUMNODES too low! Defined as %d, "
- "but system has %d nodes.\n",
- MAX_NUMNODES, numnodes);
- return -ENODEV;
- }
-
- switch (rio_table_hdr->version){
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- default:
- printk(KERN_WARNING
- "Calgary: Invalid Rio Grande Table Version: %d\n",
- rio_table_hdr->version);
- return -EPROTO;
- }
-
- ptr = ((unsigned long)rio_table_hdr) + 3;
- for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev;
- i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 0;
-}
-
-static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
-{
- int dev;
- u32 val;
-
- if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
- /*
- * FIXME: properly scan for devices across the
- * PCI-to-PCI bridge on every CalIOC2 port.
- */
- return 1;
- }
-
- for (dev = 1; dev < 8; dev++) {
- val = read_pci_config(bus, dev, 0, 0);
- if (val != 0xffffffff)
- break;
- }
- return (val != 0xffffffff);
-}
-
-/*
- * calgary_init_bitmap_from_tce_table():
- * Function for the kdump case. In the second (kdump) kernel, initialize
- * the bitmap based on the TCE table entries obtained from the first kernel.
- */
-static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
-{
- u64 *tp;
- unsigned int index;
- tp = ((u64 *)tbl->it_base);
- for (index = 0 ; index < tbl->it_size; index++) {
- if (*tp != 0x0)
- set_bit(index, tbl->it_map);
- tp++;
- }
-}
-
-/*
- * get_tce_space_from_tar():
- * Function for the kdump case. Get the TCE tables from the first kernel
- * by reading the contents of the Calgary IOMMU's table address registers.
- */
-static void __init get_tce_space_from_tar(void)
-{
- int bus;
- void __iomem *target;
- unsigned long tce_space;
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- target = calgary_reg(bus_info[bus].bbar,
- tar_offset(bus));
- tce_space = be64_to_cpu(readq(target));
- tce_space = tce_space & TAR_SW_BITS;
-
- tce_space = tce_space & (~specified_table_size);
- info->tce_space = (u64 *)__va(tce_space);
- }
- }
- return;
-}
-
-static int __init calgary_iommu_init(void)
-{
- int ret;
-
- /* ok, we're trying to use Calgary - let's roll */
- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
- ret = calgary_init();
- if (ret) {
- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
- "falling back to no_iommu\n", ret);
- return ret;
- }
-
- return 0;
-}
-
-void __init detect_calgary(void)
-{
- int bus;
- void *tbl;
- int calgary_found = 0;
- unsigned long ptr;
- unsigned int offset, prev_offset;
- int ret;
-
- /*
- * if the user specified iommu=off or iommu=soft or we found
- * another HW IOMMU already, bail out.
- */
- if (no_iommu || iommu_detected)
- return;
-
- if (!use_calgary)
- return;
-
- if (!early_pci_allowed())
- return;
-
- printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
-
- ptr = (unsigned long)phys_to_virt(get_bios_ebda());
-
- rio_table_hdr = NULL;
- prev_offset = 0;
- offset = 0x180;
- /*
- * The next offset is stored in the 1st word.
- * Only parse up until the offset increases:
- */
- while (offset > prev_offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- prev_offset = offset;
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
- "in EBDA - bailing!\n");
- return;
- }
-
- ret = build_detail_arrays();
- if (ret) {
- printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
- return;
- }
-
- specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
- saved_max_pfn : max_pfn) * PAGE_SIZE);
-
- for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
- struct calgary_bus_info *info = &bus_info[bus];
- unsigned short pci_device;
- u32 val;
-
- val = read_pci_config(bus, 0, 0, 0);
- pci_device = (val & 0xFFFF0000) >> 16;
-
- if (!is_cal_pci_dev(pci_device))
- continue;
-
- if (info->translation_disabled)
- continue;
-
- if (calgary_bus_has_devices(bus, pci_device) ||
- translate_empty_slots) {
- /*
- * If it is kdump kernel, find and use tce tables
- * from first kernel, else allocate tce tables here
- */
- if (!is_kdump_kernel()) {
- tbl = alloc_tce_table();
- if (!tbl)
- goto cleanup;
- info->tce_space = tbl;
- }
- calgary_found = 1;
- }
- }
-
- printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
- calgary_found ? "found" : "not found");
-
- if (calgary_found) {
- iommu_detected = 1;
- calgary_detected = 1;
- printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
- specified_table_size);
-
- x86_init.iommu.iommu_init = calgary_iommu_init;
- }
- return;
-
-cleanup:
- for (--bus; bus >= 0; --bus) {
- struct calgary_bus_info *info = &bus_info[bus];
-
- if (info->tce_space)
- free_tce_table(info->tce_space);
- }
-}
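The EBDA scan above walks a chain of blocks where the first 16-bit word of each block is the offset of the next one and the second word is a block id; 0x4752 identifies the Rio Grande table. A standalone sketch of the same walk (hypothetical helper, not part of the patch):

#include <stddef.h>

static void *find_rio_table(unsigned char *ebda)
{
	unsigned short prev = 0, offset = 0x180;

	while (offset > prev) {		/* only parse while offsets grow */
		/* the block id lives in the second 16-bit word */
		if (*(unsigned short *)(ebda + offset + 2) == 0x4752)
			return ebda + offset + 4; /* past offset + id */
		prev = offset;
		offset = *(unsigned short *)(ebda + offset);
	}
	return NULL;			/* no Rio Grande table found */
}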
-
-static int __init calgary_parse_options(char *p)
-{
- unsigned int bridge;
- size_t len;
- char* endp;
-
- while (*p) {
- if (!strncmp(p, "64k", 3))
- specified_table_size = TCE_TABLE_SIZE_64K;
- else if (!strncmp(p, "128k", 4))
- specified_table_size = TCE_TABLE_SIZE_128K;
- else if (!strncmp(p, "256k", 4))
- specified_table_size = TCE_TABLE_SIZE_256K;
- else if (!strncmp(p, "512k", 4))
- specified_table_size = TCE_TABLE_SIZE_512K;
- else if (!strncmp(p, "1M", 2))
- specified_table_size = TCE_TABLE_SIZE_1M;
- else if (!strncmp(p, "2M", 2))
- specified_table_size = TCE_TABLE_SIZE_2M;
- else if (!strncmp(p, "4M", 2))
- specified_table_size = TCE_TABLE_SIZE_4M;
- else if (!strncmp(p, "8M", 2))
- specified_table_size = TCE_TABLE_SIZE_8M;
-
- len = strlen("translate_empty_slots");
- if (!strncmp(p, "translate_empty_slots", len))
- translate_empty_slots = 1;
-
- len = strlen("disable");
- if (!strncmp(p, "disable", len)) {
- p += len;
- if (*p == '=')
- ++p;
- if (*p == '\0')
- break;
- bridge = simple_strtoul(p, &endp, 0);
- if (p == endp)
- break;
-
- if (bridge < MAX_PHB_BUS_NUM) {
- printk(KERN_INFO "Calgary: disabling "
- "translation for PHB %#x\n", bridge);
- bus_info[bridge].translation_disabled = 1;
- }
- }
-
- p = strpbrk(p, ",");
- if (!p)
- break;
-
- p++; /* skip ',' */
- }
- return 1;
-}
-__setup("calgary=", calgary_parse_options);
-
-static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
-{
- struct iommu_table *tbl;
- unsigned int npages;
- int i;
-
- tbl = pci_iommu(dev->bus);
-
- for (i = 0; i < 4; i++) {
- struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
-
- /* Don't give out TCEs that map MEM resources */
- if (!(r->flags & IORESOURCE_MEM))
- continue;
-
- /* 0-based? we reserve the whole 1st MB anyway */
- if (!r->start)
- continue;
-
- /* cover the whole region */
- npages = (r->end - r->start) >> PAGE_SHIFT;
- npages++;
-
- iommu_range_reserve(tbl, r->start, npages);
- }
-}
-
-static int __init calgary_fixup_tce_spaces(void)
-{
- struct pci_dev *dev = NULL;
- struct calgary_bus_info *info;
-
- if (no_iommu || swiotlb || !calgary_detected)
- return -ENODEV;
-
- printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
-
- do {
- dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
- if (!dev)
- break;
- if (!is_cal_pci_dev(dev->device))
- continue;
-
- info = &bus_info[dev->bus->number];
- if (info->translation_disabled)
- continue;
-
- if (!info->tce_space)
- continue;
-
- calgary_fixup_one_tce_space(dev);
-
- } while (1);
-
- return 0;
-}
-
-/*
- * We need to be called after pcibios_assign_resources (fs_initcall level)
- * and before device_initcall.
- */
-rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01dd..6267363e0189 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,25 +1,27 @@
-#include <linux/dma-mapping.h>
-#include <linux/dma-debug.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-map-ops.h>
+#include <linux/dma-direct.h>
+#include <linux/iommu.h>
#include <linux/dmar.h>
-#include <linux/bootmem.h>
+#include <linux/export.h>
+#include <linux/memblock.h>
#include <linux/gfp.h>
#include <linux/pci.h>
-#include <linux/kmemleak.h>
+#include <linux/amd-iommu.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
#include <asm/gart.h>
-#include <asm/calgary.h>
-#include <asm/amd_iommu.h>
#include <asm/x86_init.h>
-static int forbid_dac __read_mostly;
-struct dma_map_ops *dma_ops = &nommu_dma_ops;
-EXPORT_SYMBOL(dma_ops);
-static int iommu_sac_force __read_mostly;
+#include <xen/xen.h>
+#include <xen/swiotlb-xen.h>
+static bool disable_dac_quirk __read_mostly;
+const struct dma_map_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
@@ -35,153 +37,77 @@ int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-/*
- * This variable becomes 1 if iommu=pt is passed on the kernel command line.
- * If this variable is 1, IOMMU implementations do no DMA translation for
- * devices and allow every device to access to whole physical memory. This is
- * useful if a user wants to use an IOMMU only for KVM device assignment to
- * guests and not for driver dma translation.
- */
-int iommu_pass_through __read_mostly;
-
-/* Dummy device used for NULL arguments (normally ISA). */
-struct device x86_dma_fallback_dev = {
- .init_name = "fallback device",
- .coherent_dma_mask = ISA_DMA_BIT_MASK,
- .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
-};
-EXPORT_SYMBOL(x86_dma_fallback_dev);
-
-/* Number of entries preallocated for DMA-API debugging */
-#define PREALLOC_DMA_DEBUG_ENTRIES 32768
-
-int dma_set_mask(struct device *dev, u64 mask)
-{
- if (!dev->dma_mask || !dma_supported(dev, mask))
- return -EIO;
-
- *dev->dma_mask = mask;
-
- return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
-static __initdata void *dma32_bootmem_ptr;
-static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-
-static int __init parse_dma32_size_opt(char *p)
-{
- if (!p)
- return -EINVAL;
- dma32_bootmem_size = memparse(p, &p);
- return 0;
-}
-early_param("dma32_size", parse_dma32_size_opt);
+#ifdef CONFIG_SWIOTLB
+bool x86_swiotlb_enable;
+static unsigned int x86_swiotlb_flags;
-void __init dma32_reserve_bootmem(void)
+static void __init pci_swiotlb_detect(void)
{
- unsigned long size, align;
- if (max_pfn <= MAX_DMA32_PFN)
- return;
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
+ x86_swiotlb_enable = true;
/*
- * check aperture_64.c allocate_aperture() for reason about
- * using 512M as goal
+ * Set swiotlb to 1 so that bounce buffers are allocated and used for
+ * devices that can't support DMA to encrypted memory.
*/
- align = 64ULL<<20;
- size = roundup(dma32_bootmem_size, align);
- dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- 512ULL<<20);
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ x86_swiotlb_enable = true;
+
/*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
+ * Guest with guest memory encryption currently perform all DMA through
+ * bounce buffers as the hypervisor can't access arbitrary VM memory
+ * that is not explicitly shared with it.
*/
- kmemleak_ignore(dma32_bootmem_ptr);
- if (dma32_bootmem_ptr)
- dma32_bootmem_size = size;
- else
- dma32_bootmem_size = 0;
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ x86_swiotlb_enable = true;
+ x86_swiotlb_flags |= SWIOTLB_FORCE;
+ }
}
-static void __init dma32_free_bootmem(void)
+#else
+static inline void __init pci_swiotlb_detect(void)
{
+}
+#define x86_swiotlb_flags 0
+#endif /* CONFIG_SWIOTLB */
- if (max_pfn <= MAX_DMA32_PFN)
- return;
+#ifdef CONFIG_SWIOTLB_XEN
+static bool xen_swiotlb_enabled(void)
+{
+ return xen_initial_domain() || x86_swiotlb_enable ||
+ (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible);
+}
- if (!dma32_bootmem_ptr)
+static void __init pci_xen_swiotlb_init(void)
+{
+ if (!xen_swiotlb_enabled())
return;
-
- free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
-
- dma32_bootmem_ptr = NULL;
- dma32_bootmem_size = 0;
+ x86_swiotlb_enable = true;
+ x86_swiotlb_flags |= SWIOTLB_ANY;
+ swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
+ dma_ops = &xen_swiotlb_dma_ops;
+ if (IS_ENABLED(CONFIG_PCI))
+ pci_request_acs();
}
#else
-void __init dma32_reserve_bootmem(void)
+static inline void __init pci_xen_swiotlb_init(void)
{
}
-static void __init dma32_free_bootmem(void)
-{
-}
-
-#endif
+#endif /* CONFIG_SWIOTLB_XEN */
void __init pci_iommu_alloc(void)
{
- /* free the range so iommu could get some range less than 4G */
- dma32_free_bootmem();
-
- if (pci_swiotlb_detect())
- goto out;
-
+ if (xen_pv_domain()) {
+ pci_xen_swiotlb_init();
+ return;
+ }
+ pci_swiotlb_detect();
gart_iommu_hole_init();
-
- detect_calgary();
-
- detect_intel_iommu();
-
- /* needs to be called after gart_iommu_hole_init */
amd_iommu_detect();
-out:
- pci_swiotlb_init();
-}
-
-void *dma_generic_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_addr, gfp_t flag)
-{
- unsigned long dma_mask;
- struct page *page;
- dma_addr_t addr;
-
- dma_mask = dma_alloc_coherent_mask(dev, flag);
-
- flag |= __GFP_ZERO;
-again:
- page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
- if (!page)
- return NULL;
-
- addr = page_to_phys(page);
- if (addr + size > dma_mask) {
- __free_pages(page, get_order(size));
-
- if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
- flag = (flag & ~GFP_DMA32) | GFP_DMA;
- goto again;
- }
-
- return NULL;
- }
-
- *dma_addr = addr;
- return page_address(page);
+ detect_intel_iommu();
+ swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}
-/*
- * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
- * documentation.
- */
static __init int iommu_setup(char *p)
{
iommu_merge = 1;
@@ -215,29 +141,26 @@ static __init int iommu_setup(char *p)
if (!strncmp(p, "nomerge", 7))
iommu_merge = 0;
if (!strncmp(p, "forcesac", 8))
- iommu_sac_force = 1;
+ pr_warn("forcesac option ignored.\n");
if (!strncmp(p, "allowdac", 8))
- forbid_dac = 0;
+ pr_warn("allowdac option ignored.\n");
if (!strncmp(p, "nodac", 5))
- forbid_dac = 1;
+ pr_warn("nodac option ignored.\n");
if (!strncmp(p, "usedac", 6)) {
- forbid_dac = -1;
+ disable_dac_quirk = true;
return 1;
}
#ifdef CONFIG_SWIOTLB
if (!strncmp(p, "soft", 4))
- swiotlb = 1;
+ x86_swiotlb_enable = true;
#endif
if (!strncmp(p, "pt", 2))
- iommu_pass_through = 1;
+ iommu_set_default_passthrough(true);
+ if (!strncmp(p, "nopt", 4))
+ iommu_set_default_translated(true);
gart_parse_options(p);
-#ifdef CONFIG_CALGARY_IOMMU
- if (!strncmp(p, "calgary", 7))
- use_calgary = 1;
-#endif /* CONFIG_CALGARY_IOMMU */
-
p += strcspn(p, ",");
if (*p == ',')
++p;
@@ -246,62 +169,19 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-int dma_supported(struct device *dev, u64 mask)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
-#ifdef CONFIG_PCI
- if (mask > 0xffffffff && forbid_dac > 0) {
- dev_info(dev, "PCI: Disallowing DAC for device\n");
- return 0;
- }
-#endif
-
- if (ops->dma_supported)
- return ops->dma_supported(dev, mask);
-
- /* Copied from i386. Doesn't make much sense, because it will
- only work for pci_alloc_coherent.
- The caller just has to use GFP_DMA in this case. */
- if (mask < DMA_BIT_MASK(24))
- return 0;
-
- /* Tell the device to use SAC when IOMMU force is on. This
- allows the driver to use cheaper accesses in some cases.
-
- Problem with this is that if we overflow the IOMMU area and
- return DAC as fallback address the device may not handle it
- correctly.
-
- As a special case some controllers have a 39bit address
- mode that is as efficient as 32bit (aic79xx). Don't force
- SAC for these. Assume all masks <= 40 bits are of this
- type. Normally this doesn't make any difference, but gives
- more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {
- dev_info(dev, "Force SAC with mask %Lx\n", mask);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL(dma_supported);
-
static int __init pci_iommu_init(void)
{
- dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
-
-#ifdef CONFIG_PCI
- dma_debug_add_bus(&pci_bus_type);
-#endif
x86_init.iommu.iommu_init();
- if (swiotlb) {
- printk(KERN_INFO "PCI-DMA: "
- "Using software bounce buffering for IO (SWIOTLB)\n");
+#ifdef CONFIG_SWIOTLB
+ /* An IOMMU turned us off. */
+ if (x86_swiotlb_enable) {
+ pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
swiotlb_print_info();
- } else
- swiotlb_free();
+ } else {
+ swiotlb_exit();
+ }
+#endif
return 0;
}
@@ -311,12 +191,19 @@ rootfs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-static __devinit void via_no_dac(struct pci_dev *dev)
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+ pdev->dev.bus_dma_limit = DMA_BIT_MASK(32);
+ return 0;
+}
+
+static void via_no_dac(struct pci_dev *dev)
{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+ if (!disable_dac_quirk) {
dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
- forbid_dac = 1;
+ pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
}
}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
#endif
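
The iommu_setup() parser above walks the iommu= string one comma-separated token at a time: strncmp() matches a keyword, then strcspn() skips to the next comma. Below is a minimal userspace sketch of that loop shape only; parse_opts() and the messages are invented for illustration and are not kernel interfaces.

#include <stdio.h>
#include <string.h>

static void parse_opts(char *p)
{
	while (*p) {
		if (!strncmp(p, "soft", 4))
			printf("enable swiotlb\n");
		if (!strncmp(p, "pt", 2))
			printf("default to passthrough\n");
		if (!strncmp(p, "nopt", 4))
			printf("default to translated\n");
		/* skip the rest of this token, then step past the comma */
		p += strcspn(p, ",");
		if (*p == ',')
			++p;
	}
}

int main(void)
{
	char opts[] = "soft,nopt";

	parse_opts(opts);	/* prints: enable swiotlb, default to translated */
	return 0;
}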
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
deleted file mode 100644
index 3af4af810c07..000000000000
--- a/arch/x86/kernel/pci-nommu.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Fallback functions when the main IOMMU code is not compiled in. This
- code is roughly equivalent to i386. */
-#include <linux/dma-mapping.h>
-#include <linux/scatterlist.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/gfp.h>
-#include <linux/pci.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/iommu.h>
-#include <asm/dma.h>
-
-static int
-check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
-{
- if (hwdev && !dma_capable(hwdev, bus, size)) {
- if (*hwdev->dma_mask >= DMA_BIT_MASK(32))
- printk(KERN_ERR
- "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
- name, (long long)bus, size,
- (long long)*hwdev->dma_mask);
- return 0;
- }
- return 1;
-}
-
-static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- dma_addr_t bus = page_to_phys(page) + offset;
- WARN_ON(size == 0);
- if (!check_addr("map_single", dev, bus, size))
- return DMA_ERROR_CODE;
- flush_write_buffers();
- return bus;
-}
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA. This is the scatter-gather version of the
- * above pci_map_single interface. Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length. They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- * DMA address/length pairs than there are SG table elements.
- * (for example via virtual mapping capabilities)
- * The routine returns the number of addr/length pairs actually
- * used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir,
- struct dma_attrs *attrs)
-{
- struct scatterlist *s;
- int i;
-
- WARN_ON(nents == 0 || sg[0].length == 0);
-
- for_each_sg(sg, s, nents, i) {
- BUG_ON(!sg_page(s));
- s->dma_address = sg_phys(s);
- if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
- return 0;
- s->dma_length = s->length;
- }
- flush_write_buffers();
- return nents;
-}
-
-static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr)
-{
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
-static void nommu_sync_single_for_device(struct device *dev,
- dma_addr_t addr, size_t size,
- enum dma_data_direction dir)
-{
- flush_write_buffers();
-}
-
-
-static void nommu_sync_sg_for_device(struct device *dev,
- struct scatterlist *sg, int nelems,
- enum dma_data_direction dir)
-{
- flush_write_buffers();
-}
-
-struct dma_map_ops nommu_dma_ops = {
- .alloc_coherent = dma_generic_alloc_coherent,
- .free_coherent = nommu_free_coherent,
- .map_sg = nommu_map_sg,
- .map_page = nommu_map_page,
- .sync_single_for_device = nommu_sync_single_for_device,
- .sync_sg_for_device = nommu_sync_sg_for_device,
- .is_phys = 1,
-};
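
The deleted check_addr() above boiled down to one range test: a buffer is usable for DMA only if its last byte still lies under the device's DMA mask. A standalone sketch of that test with made-up addresses; fits_dma_mask() is illustrative, not the kernel's dma_capable().

#include <stdio.h>
#include <stdint.h>

static int fits_dma_mask(uint64_t addr, size_t size, uint64_t dma_mask)
{
	/* addr + size is one past the end, so test the last byte */
	return addr + size - 1 <= dma_mask;
}

int main(void)
{
	uint64_t mask32 = (1ULL << 32) - 1;	/* DMA_BIT_MASK(32) */

	printf("%d\n", fits_dma_mask(0xfffff000, 0x1000, mask32)); /* 1: ends at 4G-1 */
	printf("%d\n", fits_dma_mask(0xfffff000, 0x2000, mask32)); /* 0: crosses 4G */
	return 0;
}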
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
deleted file mode 100644
index a5bc528d4328..000000000000
--- a/arch/x86/kernel/pci-swiotlb.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Glue code to lib/swiotlb.c */
-
-#include <linux/pci.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <linux/swiotlb.h>
-#include <linux/bootmem.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/iommu.h>
-#include <asm/swiotlb.h>
-#include <asm/dma.h>
-
-int swiotlb __read_mostly;
-
-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
- dma_addr_t *dma_handle, gfp_t flags)
-{
- void *vaddr;
-
- vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
- if (vaddr)
- return vaddr;
-
- return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
-}
-
-static struct dma_map_ops swiotlb_dma_ops = {
- .mapping_error = swiotlb_dma_mapping_error,
- .alloc_coherent = x86_swiotlb_alloc_coherent,
- .free_coherent = swiotlb_free_coherent,
- .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
- .sync_single_for_device = swiotlb_sync_single_for_device,
- .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
- .sync_sg_for_device = swiotlb_sync_sg_for_device,
- .map_sg = swiotlb_map_sg_attrs,
- .unmap_sg = swiotlb_unmap_sg_attrs,
- .map_page = swiotlb_map_page,
- .unmap_page = swiotlb_unmap_page,
- .dma_supported = NULL,
-};
-
-/*
- * pci_swiotlb_detect - set swiotlb to 1 if necessary
- *
- * This returns non-zero if we are forced to use swiotlb (by the boot
- * option).
- */
-int __init pci_swiotlb_detect(void)
-{
- int use_swiotlb = swiotlb | swiotlb_force;
-
- /* don't initialize swiotlb if iommu=off (no_iommu=1) */
-#ifdef CONFIG_X86_64
- if (!no_iommu && max_pfn > MAX_DMA32_PFN)
- swiotlb = 1;
-#endif
- if (swiotlb_force)
- swiotlb = 1;
-
- return use_swiotlb;
-}
-
-void __init pci_swiotlb_init(void)
-{
- if (swiotlb) {
- swiotlb_init(0);
- dma_ops = &swiotlb_dma_ops;
- }
-}
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index a311ffcaad16..4a710ffffd9a 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/platform_device.h>
#include <linux/err.h>
#include <linux/init.h>
@@ -8,6 +9,6 @@ static __init int add_pcspkr(void)
pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- return IS_ERR(pd) ? PTR_ERR(pd) : 0;
+ return PTR_ERR_OR_ZERO(pd);
}
device_initcall(add_pcspkr);
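
The one-line PTR_ERR_OR_ZERO() conversion above works because the kernel encodes errno values as pointers in the top 4095 bytes of the address space, so a single return value can carry either a valid pointer or an error code. A userspace re-creation of the idiom, for illustration only:

#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error)   { return (void *)error; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)   { return IS_ERR_VALUE(p); }

static inline int PTR_ERR_OR_ZERO(const void *p)
{
	return IS_ERR(p) ? (int)PTR_ERR(p) : 0;
}

int main(void)
{
	int x;
	void *ok = &x;				/* a normal pointer */
	void *bad = ERR_PTR(-12 /* -ENOMEM */);	/* an encoded error */

	printf("%d %d\n", PTR_ERR_OR_ZERO(ok), PTR_ERR_OR_ZERO(bad)); /* 0 -12 */
	return 0;
}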
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 000000000000..624703af80a1
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+ /*
+ * The pt_regs struct does not store
+ * ds, es, fs, gs in 64 bit mode.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ struct x86_perf_regs *perf_regs;
+
+ if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) {
+ perf_regs = container_of(regs, struct x86_perf_regs, regs);
+ if (!perf_regs->xmm_regs)
+ return 0;
+ return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0];
+ }
+
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+ return 0;
+
+ return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \
+ ~((1ULL << PERF_REG_X86_MAX) - 1))
+
+#ifdef CONFIG_X86_32
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \
+ (1ULL << PERF_REG_X86_R9) | \
+ (1ULL << PERF_REG_X86_R10) | \
+ (1ULL << PERF_REG_X86_R11) | \
+ (1ULL << PERF_REG_X86_R12) | \
+ (1ULL << PERF_REG_X86_R13) | \
+ (1ULL << PERF_REG_X86_R14) | \
+ (1ULL << PERF_REG_X86_R15))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs)
+{
+ regs_user->regs = task_pt_regs(current);
+ regs_user->abi = perf_reg_abi(current);
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ if (!user_64bit_mode(task_pt_regs(task)))
+ return PERF_SAMPLE_REGS_ABI_32;
+ else
+ return PERF_SAMPLE_REGS_ABI_64;
+}
+
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs)
+{
+ struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
+ struct pt_regs *user_regs = task_pt_regs(current);
+
+ if (!in_nmi()) {
+ regs_user->regs = user_regs;
+ regs_user->abi = perf_reg_abi(current);
+ return;
+ }
+
+ /*
+ * If we're in an NMI that interrupted task_pt_regs setup, then
+ * we can't sample user regs at all. This check isn't really
+ * sufficient, though, as we could be in an NMI inside an interrupt
+ * that happened during task_pt_regs setup.
+ */
+ if (regs->sp > (unsigned long)&user_regs->r11 &&
+ regs->sp <= (unsigned long)(user_regs + 1)) {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ return;
+ }
+
+ /*
+ * These registers are always saved on 64-bit syscall entry.
+ * On 32-bit entry points, they are saved too except r8..r11.
+ */
+ regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->ax = user_regs->ax;
+ regs_user_copy->cx = user_regs->cx;
+ regs_user_copy->dx = user_regs->dx;
+ regs_user_copy->si = user_regs->si;
+ regs_user_copy->di = user_regs->di;
+ regs_user_copy->r8 = user_regs->r8;
+ regs_user_copy->r9 = user_regs->r9;
+ regs_user_copy->r10 = user_regs->r10;
+ regs_user_copy->r11 = user_regs->r11;
+ regs_user_copy->orig_ax = user_regs->orig_ax;
+ regs_user_copy->flags = user_regs->flags;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ /*
+ * Store user space frame-pointer value on sample
+ * to facilitate stack unwinding for cases when
+ * user space executable code has such support
+ * enabled at compile time:
+ */
+ regs_user_copy->bp = user_regs->bp;
+
+ regs_user_copy->bx = -1;
+ regs_user_copy->r12 = -1;
+ regs_user_copy->r13 = -1;
+ regs_user_copy->r14 = -1;
+ regs_user_copy->r15 = -1;
+ /*
+ * For this to be at all useful, we need a reasonable guess for
+ * the ABI. Be careful: we're in NMI context, and we're
+ * considering current to be the current task, so we should
+ * be careful not to look at any other percpu variables that might
+ * change during context switches.
+ */
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
+
+ regs_user->regs = regs_user_copy;
+}
+#endif /* CONFIG_X86_32 */
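
For a concrete feel of the mask arithmetic in perf_reg_validate() above: with the 64-bit enum layout (AX=0 through R15=23, XMM0=32), the RESERVED expression carves out bits 24-31, and the segment registers land in REG_NOSUPPORT. A standalone check follows; the values mirror the uapi perf_regs layout but should be treated as assumptions of this sketch.

#include <stdio.h>
#include <stdint.h>

#define PERF_REG_X86_GS		15
#define PERF_REG_X86_64_MAX	24	/* R15 is bit 23 */
#define PERF_REG_X86_XMM0	32

#define RESERVED	(((1ULL << PERF_REG_X86_XMM0) - 1) & \
			 ~((1ULL << PERF_REG_X86_64_MAX) - 1))
#define NOSUPPORT	(0xfULL << 12)	/* DS, ES, FS, GS: bits 12-15 */

static int validate(uint64_t mask)
{
	if (!mask || (mask & (NOSUPPORT | RESERVED)))
		return -22;	/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("reserved=%#llx\n", (unsigned long long)RESERVED); /* 0xff000000 */
	printf("%d\n", validate(1ULL << 0));		   /* 0: sampling AX is fine */
	printf("%d\n", validate(1ULL << PERF_REG_X86_GS)); /* -22: no GS on 64-bit */
	return 0;
}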
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
new file mode 100644
index 000000000000..b525fe6d6657
--- /dev/null
+++ b/arch/x86/kernel/platform-quirks.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pnp.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+void __init x86_early_init_platform_quirks(void)
+{
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
+ x86_platform.legacy.rtc = 1;
+ x86_platform.legacy.warm_reset = 1;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.legacy.devices.pnpbios = 1;
+
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_PC:
+ x86_platform.legacy.reserve_bios_regions = 1;
+ break;
+ case X86_SUBARCH_XEN:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ break;
+ case X86_SUBARCH_INTEL_MID:
+ case X86_SUBARCH_CE4100:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ break;
+ }
+
+ if (x86_platform.set_legacy_features)
+ x86_platform.set_legacy_features();
+}
+
+bool __init x86_pnpbios_disabled(void)
+{
+ return x86_platform.legacy.devices.pnpbios == 0;
+}
+
+#if defined(CONFIG_PNPBIOS)
+bool __init arch_pnpbios_disabled(void)
+{
+ return x86_pnpbios_disabled();
+}
+#endif
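
x86_early_init_platform_quirks() above follows a defaults-then-override shape: assume full legacy PC hardware, then strip features per subarchitecture. A toy version of that flow; the enum, struct, and field names below are invented for the sketch.

#include <stdio.h>

enum subarch { SUBARCH_PC, SUBARCH_XEN, SUBARCH_INTEL_MID };

struct legacy { int rtc, pnpbios, reserve_bios_regions; };

static struct legacy pick_quirks(enum subarch sa)
{
	/* defaults first: assume full legacy PC hardware */
	struct legacy l = { .rtc = 1, .pnpbios = 1, .reserve_bios_regions = 0 };

	switch (sa) {	/* then adjust per subarchitecture */
	case SUBARCH_PC:
		l.reserve_bios_regions = 1;
		break;
	case SUBARCH_XEN:
	case SUBARCH_INTEL_MID:
		l.rtc = 0;
		l.pnpbios = 0;
		break;
	}
	return l;
}

int main(void)
{
	struct legacy l = pick_quirks(SUBARCH_XEN);

	printf("rtc=%d pnpbios=%d\n", l.rtc, l.pnpbios); /* rtc=0 pnpbios=0 */
	return 0;
}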
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000000..23154d24b117
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+
+static int found(struct resource *res, void *data)
+{
+ return 1;
+}
+
+static __init int register_e820_pmem(void)
+{
+ struct platform_device *pdev;
+ int rc;
+
+ rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+ IORESOURCE_MEM, 0, -1, NULL, found);
+ if (rc <= 0)
+ return 0;
+
+ /*
+ * See drivers/nvdimm/e820.c for the implementation; this is
+ * simply here to trigger the module to load on demand.
+ */
+ pdev = platform_device_alloc("e820_pmem", -1);
+
+ rc = platform_device_add(pdev);
+ if (rc)
+ platform_device_put(pdev);
+
+ return rc;
+}
+device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-static inline u32 cyc2us(u32 cycles)
-{
- /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
- * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
- *
- * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
- * easily be multiplied with 286 (=0x11E) without having to fear
- * u32 overflows.
- */
- cycles *= 286;
- return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
- u32 a, b;
- for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
- a == b;
- b = inl(pmtmr_ioport) & ACPI_PM_MASK)
- cpu_relax();
- return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
- u32 a, b;
- a = pmtimer_wait_tick();
- do {
- b = inl(pmtmr_ioport);
- cpu_relax();
- } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
- pmtmr_ioport = 0;
- return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
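
The deleted cyc2us() relied on a classic fixed-point trick: instead of dividing by the 3.579545 MHz PM timer frequency, multiply by 286 and shift right by 10, which stays within roughly 0.024% of the exact result. A standalone comparison:

#include <stdio.h>
#include <stdint.h>

#define PMTMR_HZ	3579545.0	/* ACPI PM timer frequency */

static uint32_t cyc2us(uint32_t cycles)
{
	return (cycles * 286) >> 10;	/* ~0.024% low, no divide needed */
}

int main(void)
{
	uint32_t ticks = 35796;		/* about 10 ms worth of PM timer ticks */

	printf("approx: %u us\n", cyc2us(ticks));	/* 9997 */
	printf("exact : %.1f us\n", ticks * 1e6 / PMTMR_HZ); /* 10000.2 */
	return 0;
}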
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..cc2c34ba7228 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
@@ -10,15 +11,17 @@
#include <linux/dmi.h>
#include <linux/pfn.h>
#include <linux/pci.h>
-#include <asm/pci-direct.h>
-
+#include <linux/export.h>
-#include <asm/e820.h>
+#include <asm/probe_roms.h>
+#include <asm/pci-direct.h>
+#include <asm/e820/api.h>
#include <asm/mmzone.h>
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/io.h>
#include <asm/setup_arch.h>
+#include <asm/sev.h>
static struct resource system_rom_resource = {
.name = "System ROM",
@@ -73,6 +76,107 @@ static struct resource video_rom_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+/* Does this oprom support the given PCI device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+ struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
+ const struct pci_device_id *id;
+
+ if (pdev->vendor == vendor && pdev->device == device)
+ return true;
+
+ for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+ if (id->vendor == vendor && id->device == device)
+ break;
+
+ return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+ const void *rom_list)
+{
+ unsigned short device;
+
+ do {
+ if (get_kernel_nofault(device, rom_list) != 0)
+ device = 0;
+
+ if (device && match_id(pdev, vendor, device))
+ break;
+
+ rom_list += 2;
+ } while (device);
+
+ return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const void *rom;
+
+ if (res->end == 0)
+ break;
+
+ rom = isa_bus_to_virt(res->start);
+ if (get_kernel_nofault(offset, rom + 0x18) != 0)
+ continue;
+
+ if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0)
+ continue;
+
+ if (get_kernel_nofault(device, rom + offset + 0x6) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+
+ if (get_kernel_nofault(list, rom + offset + 0x8) == 0 &&
+ get_kernel_nofault(rev, rom + offset + 0xc) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+void __iomem *pci_map_biosrom(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ if (!oprom)
+ return NULL;
+
+ return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
#define ROMSIGNATURE 0xaa55
static int __init romsignature(const unsigned char *rom)
@@ -80,22 +184,22 @@ static int __init romsignature(const unsigned char *rom)
const unsigned short * const ptr = (const unsigned short *)rom;
unsigned short sig;
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+ return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE;
}
static int __init romchecksum(const unsigned char *rom, unsigned long length)
{
unsigned char sum, c;
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--)
sum += c;
return !length && !sum;
}
void __init probe_roms(void)
{
- const unsigned char *rom;
unsigned long start, length, upper;
+ const unsigned char *rom;
unsigned char c;
int i;
@@ -108,7 +212,7 @@ void __init probe_roms(void)
video_rom_resource.start = start;
- if (probe_kernel_address(rom + 2, c) != 0)
+ if (get_kernel_nofault(c, rom + 2) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
@@ -133,7 +237,7 @@ void __init probe_roms(void)
/* check for extension rom (ignore length byte!) */
rom = isa_bus_to_virt(extension_rom_resource.start);
if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ length = resource_size(&extension_rom_resource);
if (romchecksum(rom, length)) {
request_resource(&iomem_resource, &extension_rom_resource);
upper = extension_rom_resource.start;
@@ -146,7 +250,7 @@ void __init probe_roms(void)
if (!romsignature(rom))
continue;
- if (probe_kernel_address(rom + 2, c) != 0)
+ if (get_kernel_nofault(c, rom + 2) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
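
The romsignature()/romchecksum() pair in this file implements the legacy option-ROM validity rules: the image starts with 0xAA55 and all of its bytes sum to zero modulo 256. A self-contained version for a little-endian host; the toy ROM contents are made up.

#include <stdio.h>
#include <string.h>

#define ROMSIGNATURE 0xaa55

static int romsignature(const unsigned char *rom)
{
	unsigned short sig;

	memcpy(&sig, rom, sizeof(sig));	/* stands in for get_kernel_nofault() */
	return sig == ROMSIGNATURE;	/* little-endian: 0x55 0xaa in memory */
}

static int romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char sum = 0;

	while (length--)
		sum += *rom++;
	return sum == 0;
}

int main(void)
{
	/* 4-byte toy "ROM": signature, payload, and a checksum byte */
	unsigned char rom[4] = { 0x55, 0xaa, 0x03, 0 };

	rom[3] = 0x100 - (0x55 + 0xaa + 0x03);	/* 0xfe zeroes the sum mod 256 */
	printf("sig=%d sum_ok=%d\n", romsignature(rom), romchecksum(rom, 4));
	return 0;
}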
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..4c718f8adc59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,116 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/module.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/pm.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/cpuidle.h>
+#include <linux/acpi.h>
+#include <linux/elf-randomize.h>
+#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
-#include <asm/system.h>
+#include <linux/entry-common.h>
+#include <asm/cpu.h>
+#include <asm/cpuid/api.h>
#include <asm/apic.h>
-#include <asm/syscalls.h>
-#include <asm/idle.h>
-#include <asm/uaccess.h>
-#include <asm/i387.h>
+#include <linux/uaccess.h>
+#include <asm/mwait.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/sched.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
+#include <asm/nmi.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
+#include <asm/io_bitmap.h>
+#include <asm/proto.h>
+#include <asm/frame.h>
+#include <asm/unwind.h>
+#include <asm/tdx.h>
+#include <asm/mmu_context.h>
+#include <asm/msr.h>
+#include <asm/shstk.h>
+
+#include "process.h"
-unsigned long idle_halt;
-EXPORT_SYMBOL(idle_halt);
-unsigned long idle_nomwait;
-EXPORT_SYMBOL(idle_nomwait);
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+ .x86_tss = {
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
-struct kmem_cache *task_xstate_cachep;
+#ifdef CONFIG_X86_32
+ .sp1 = TOP_OF_INIT_STACK,
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+#endif
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
+ },
+};
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
+
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
+
+/*
+ * The cache may be in an incoherent state and needs flushing during kexec.
+ * E.g., on SME/TDX platforms, dirty cacheline aliases with and without
+ * encryption bit(s) can coexist and the cache needs to be flushed before
+ * booting into the new kernel, to avoid silent memory corruption from
+ * dirty cachelines with different encryption properties being written
+ * back to memory.
+ */
+DEFINE_PER_CPU(bool, cache_state_incoherent);
+
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- int ret;
+ /* fpu_clone() will initialize the "dst_fpu" memory */
+ memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0);
+
+#ifdef CONFIG_VM86
+ dst->thread.vm86 = NULL;
+#endif
- *dst = *src;
- if (fpu_allocated(&src->thread.fpu)) {
- memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
- ret = fpu_alloc(&dst->thread.fpu);
- if (ret)
- return ret;
- fpu_copy(&dst->thread.fpu, &src->thread.fpu);
- }
return 0;
}
-void free_thread_xstate(struct task_struct *tsk)
+#ifdef CONFIG_X86_64
+void arch_release_task_struct(struct task_struct *tsk)
{
- fpu_free(&tsk->thread.fpu);
+ if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ fpstate_free(x86_task_fpu(tsk));
}
+#endif
-void free_thread_info(struct thread_info *ti)
+/*
+ * Free thread data structures etc..
+ */
+void exit_thread(struct task_struct *tsk)
{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+ struct thread_struct *t = &tsk->thread;
+
+ if (test_thread_flag(TIF_IO_BITMAP))
+ io_bitmap_exit(tsk);
+
+ free_vm86(t);
+
+ shstk_free(tsk);
+ fpu__drop(tsk);
}
-void arch_task_cache_init(void)
+static int set_new_tls(struct task_struct *p, unsigned long tls)
{
- task_xstate_cachep =
- kmem_cache_create("task_xstate", xstate_size,
- __alignof__(union thread_xstate),
- SLAB_PANIC | SLAB_NOTRACK, NULL);
+ struct user_desc __user *utls = (struct user_desc __user *)tls;
+
+ if (in_ia32_syscall())
+ return do_set_thread_area(p, -1, utls, 0);
+ else
+ return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
{
- struct task_struct *me = current;
- struct thread_struct *t = &me->thread;
- unsigned long *bp = t->io_bitmap_ptr;
-
- if (bp) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+ schedule_tail(prev);
- t->io_bitmap_ptr = NULL;
- clear_thread_flag(TIF_IO_BITMAP);
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
/*
- * Careful, clear this in the TSS too:
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
*/
- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
- t->io_bitmap_max = 0;
- put_cpu();
- kfree(bp);
+ regs->ax = 0;
}
-}
-void show_regs(struct pt_regs *regs)
-{
- show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
- regs->bp);
+ syscall_exit_to_user_mode(regs);
}
-void show_regs_common(void)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
- const char *board, *product;
+ u64 clone_flags = args->flags;
+ unsigned long sp = args->stack;
+ unsigned long tls = args->tls;
+ struct inactive_task_frame *frame;
+ struct fork_frame *fork_frame;
+ struct pt_regs *childregs;
+ unsigned long new_ssp;
+ int ret = 0;
+
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+
+ frame->bp = encode_frame_pointer(childregs);
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.io_bitmap = NULL;
+ clear_tsk_thread_flag(p, TIF_IO_BITMAP);
+ p->thread.iopl_warn = 0;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+#ifdef CONFIG_X86_64
+ current_save_fsgs();
+ p->thread.fsindex = current->thread.fsindex;
+ p->thread.fsbase = current->thread.fsbase;
+ p->thread.gsindex = current->thread.gsindex;
+ p->thread.gsbase = current->thread.gsbase;
+
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
+
+ if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
+ set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
+#else
+ p->thread.sp0 = (unsigned long) (childregs + 1);
+ savesegment(gs, p->thread.gs);
+ /*
+ * Clear all status flags including IF and set fixed bit. 64bit
+ * does not have this initialization as the frame does not contain
+ * flags. The flags consistency (especially vs. AC) is there
+ * ensured via objtool, which lacks 32bit support.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
+#endif
+
+ /*
+ * Allocate a new shadow stack for thread if needed. If shadow stack,
+ * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
+ * update it.
+ */
+ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+ if (IS_ERR_VALUE(new_ssp))
+ return PTR_ERR((void *)new_ssp);
+
+ fpu_clone(p, clone_flags, args->fn, new_ssp);
+
+ /* Kernel thread ? */
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ p->thread.pkru = pkru_get_init_value();
+ memset(childregs, 0, sizeof(struct pt_regs));
+ kthread_frame_init(frame, args->fn, args->fn_arg);
+ return 0;
+ }
- board = dmi_get_system_info(DMI_BOARD_NAME);
- if (!board)
- board = "";
- product = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (!product)
- product = "";
+ /*
+ * Clone current's PKRU value from hardware. tsk->thread.pkru
+ * is only valid when scheduled out.
+ */
+ p->thread.pkru = read_pkru();
+
+ frame->bx = 0;
+ *childregs = *current_pt_regs();
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
- printk(KERN_CONT "\n");
- printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
- current->pid, current->comm, print_tainted(),
- init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board, product);
+ if (unlikely(args->fn)) {
+ /*
+ * A user space thread, but it doesn't return to
+ * ret_after_fork().
+ *
+ * In order to indicate that to tools like gdb,
+ * we reset the stack and instruction pointers.
+ *
+ * It does the same kernel frame setup to return to a kernel
+ * function that a kernel thread does.
+ */
+ childregs->sp = 0;
+ childregs->ip = 0;
+ kthread_frame_init(frame, args->fn, args->fn_arg);
+ return 0;
+ }
+
+ /* Set a new TLS for the child thread? */
+ if (clone_flags & CLONE_SETTLS)
+ ret = set_new_tls(p, tls);
+
+ if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
+ io_bitmap_share(p);
+
+ return ret;
+}
+
+static void pkru_flush_thread(void)
+{
+ /*
+ * If PKRU is enabled the default PKRU value has to be loaded into
+ * the hardware right here (similar to context switch).
+ */
+ pkru_write_default();
}
void flush_thread(void)
@@ -119,17 +287,9 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
-}
-static void hard_disable_TSC(void)
-{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ fpu_flush_thread();
+ pkru_flush_thread();
}
void disable_TSC(void)
@@ -140,15 +300,10 @@ void disable_TSC(void)
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
- hard_disable_TSC();
+ cr4_set_bits(X86_CR4_TSD);
preempt_enable();
}
-static void hard_enable_TSC(void)
-{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
-}
-
static void enable_TSC(void)
{
preempt_disable();
@@ -157,7 +312,7 @@ static void enable_TSC(void)
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
- hard_enable_TSC();
+ cr4_clear_bits(X86_CR4_TSD);
preempt_enable();
}
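
The speculation-control code in the hunk below builds a single SPEC_CTRL MSR image out of per-task TIF bits by shifting each flag from its TIF position to its MSR position (ssbd_tif_to_spec_ctrl() and friends). A toy illustration of that shift; the bit positions here are stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <stdint.h>

#define TIF_SSBD		5		/* stand-in task-flag bit */
#define _TIF_SSBD		(1UL << TIF_SSBD)
#define SPEC_CTRL_SSBD_SHIFT	2		/* stand-in MSR bit for SSBD */

static uint64_t ssbd_tif_to_spec_ctrl(uint64_t tifn)
{
	/* move bit 5 of the flags word down to bit 2 of the MSR image */
	return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}

int main(void)
{
	uint64_t base = 0;	/* x86_spec_ctrl_base stand-in */
	uint64_t tifn = _TIF_SSBD;

	printf("msr=%#llx\n",
	       (unsigned long long)(base | ssbd_tif_to_spec_ctrl(tifn))); /* 0x4 */
	return 0;
}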
@@ -185,471 +340,657 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread;
- next = &next_p->thread;
-
- if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
- test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
- debugctl |= DEBUGCTLMSR_BTF;
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
- update_debugctlmsr(debugctl);
- }
+static void set_cpuid_faulting(bool on)
+{
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ u64 msrval;
+
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval);
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (on)
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
else
- hard_enable_TSC();
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT);
}
+}
- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+static void disable_cpuid(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOCPUID)) {
/*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
*/
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+ set_cpuid_faulting(true);
+ }
+ preempt_enable();
+}
+
+static void enable_cpuid(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOCPUID)) {
/*
- * Clear any possible leftover bits:
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
*/
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ set_cpuid_faulting(false);
}
- propagate_user_return_notify(prev_p, next_p);
+ preempt_enable();
}
-int sys_fork(struct pt_regs *regs)
+static int get_cpuid_mode(void)
{
- return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+ return !test_thread_flag(TIF_NOCPUID);
+}
+
+static int set_cpuid_mode(unsigned long cpuid_enabled)
+{
+ if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
+ return -ENODEV;
+
+ if (cpuid_enabled)
+ enable_cpuid();
+ else
+ disable_cpuid();
+
+ return 0;
}
/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
+ * Called immediately after a successful exec.
*/
-int sys_vfork(struct pt_regs *regs)
+void arch_setup_new_exec(void)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
- NULL, NULL);
+ /* If cpuid was previously disabled for this task, re-enable it. */
+ if (test_thread_flag(TIF_NOCPUID))
+ enable_cpuid();
+
+ /*
+ * Don't inherit TIF_SSBD across exec boundary when
+ * PR_SPEC_DISABLE_NOEXEC is used.
+ */
+ if (test_thread_flag(TIF_SSBD) &&
+ task_spec_ssb_noexec(current)) {
+ clear_thread_flag(TIF_SSBD);
+ task_clear_spec_ssb_disable(current);
+ task_clear_spec_ssb_noexec(current);
+ speculation_ctrl_update(read_thread_flags());
+ }
+
+ mm_reset_untag_mask(current->mm);
}
-long
-sys_clone(unsigned long clone_flags, unsigned long newsp,
- void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void switch_to_bitmap(unsigned long tifp)
{
- if (!newsp)
- newsp = regs->sp;
- return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+ /*
+ * Invalidate I/O bitmap if the previous task used it. This prevents
+ * any possible leakage of an active I/O bitmap.
+ *
+ * If the next task has an I/O bitmap it will handle it on exit to
+ * user mode.
+ */
+ if (tifp & _TIF_IO_BITMAP)
+ tss_invalidate_io_bitmap();
}
-/*
- * This gets run with %si containing the
- * function to call, and %di containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
+static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
+{
+ /*
+ * Copy at least the byte range of the incoming task's bitmap which
+ * covers the permitted I/O ports.
+ *
+ * If the previous task which used an I/O bitmap had more bits
+ * permitted, then the copy needs to cover those as well so they
+ * get turned off.
+ */
+ memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
+ max(tss->io_bitmap.prev_max, iobm->max));
-/*
- * Create a kernel thread
+ /*
+ * Store the new max and the sequence number of this bitmap
+ * and a pointer to the bitmap itself.
+ */
+ tss->io_bitmap.prev_max = iobm->max;
+ tss->io_bitmap.prev_sequence = iobm->sequence;
+}
+
+/**
+ * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
*/
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+void native_tss_update_io_bitmap(void)
{
- struct pt_regs regs;
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct thread_struct *t = &current->thread;
+ u16 *base = &tss->x86_tss.io_bitmap_base;
- memset(&regs, 0, sizeof(regs));
+ if (!test_thread_flag(TIF_IO_BITMAP)) {
+ native_tss_invalidate_io_bitmap();
+ return;
+ }
- regs.si = (unsigned long) fn;
- regs.di = (unsigned long) arg;
+ if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+ *base = IO_BITMAP_OFFSET_VALID_ALL;
+ } else {
+ struct io_bitmap *iobm = t->io_bitmap;
-#ifdef CONFIG_X86_32
- regs.ds = __USER_DS;
- regs.es = __USER_DS;
- regs.fs = __KERNEL_PERCPU;
- regs.gs = __KERNEL_STACK_CANARY;
-#else
- regs.ss = __KERNEL_DS;
+ if (WARN_ON_ONCE(!iobm)) {
+ clear_thread_flag(TIF_IO_BITMAP);
+ native_tss_invalidate_io_bitmap();
+ }
+
+ /*
+ * Only copy bitmap data when the sequence number differs. The
+ * update time is accounted to the incoming task.
+ */
+ if (tss->io_bitmap.prev_sequence != iobm->sequence)
+ tss_copy_io_bitmap(tss, iobm);
+
+ /* Enable the bitmap */
+ *base = IO_BITMAP_OFFSET_VALID_MAP;
+ }
+
+ /*
+ * Make sure that the TSS limit is covering the IO bitmap. It might have
+ * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
+ * access from user space to trigger a #GP because the bitmap is outside
+ * the TSS limit.
+ */
+ refresh_tss_limit();
+}
+#else /* CONFIG_X86_IOPL_IOPERM */
+static inline void switch_to_bitmap(unsigned long tifp) { }
#endif
- regs.orig_ax = -1;
- regs.ip = (unsigned long) kernel_thread_helper;
- regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | 0x2;
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+ struct ssb_state *shared_state;
+ raw_spinlock_t lock;
+ unsigned int disable_state;
+ unsigned long local_state;
+};
+
+#define LSTATE_SSB 0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ st->local_state = 0;
+
+ /*
+ * Shared state setup happens once on the first bringup
+ * of the CPU. It's not destroyed on CPU hotunplug.
+ */
+ if (st->shared_state)
+ return;
+
+ raw_spin_lock_init(&st->lock);
+
+ /*
+ * Go over HT siblings and check whether one of them has set up the
+ * shared state pointer already.
+ */
+ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+ if (cpu == this_cpu)
+ continue;
- /* Ok, create the new process.. */
- return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
+ if (!per_cpu(ssb_state, cpu).shared_state)
+ continue;
+
+ /* Link it to the state of the sibling: */
+ st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+ return;
+ }
+
+ /*
+ * First HT sibling to come up on the core. Link shared state of
+ * the first HT sibling to itself. The siblings on the same core
+ * which come up later will see the shared state pointer and link
+ * themselves to the state of this CPU.
+ */
+ st->shared_state = st;
}
-EXPORT_SYMBOL(kernel_thread);
/*
- * sys_execve() executes a new program.
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and the last sibling to disable it, disables it for the whole core. This is how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
*/
-long sys_execve(char __user *name, char __user * __user *argv,
- char __user * __user *envp, struct pt_regs *regs)
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
- long error;
- char *filename;
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ u64 msr = x86_amd_ls_cfg_base;
- filename = getname(name);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- return error;
- error = do_execve(filename, argv, envp, regs);
+ if (!static_cpu_has(X86_FEATURE_ZEN)) {
+ msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+ return;
+ }
-#ifdef CONFIG_X86_32
- if (error == 0) {
- /* Make sure we don't return using sysenter.. */
- set_thread_flag(TIF_IRET);
- }
-#endif
+ if (tifn & _TIF_SSBD) {
+ /*
+ * Since this can race with prctl(), block reentry on the
+ * same CPU.
+ */
+ if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+ return;
- putname(filename);
- return error;
+ msr |= x86_amd_ls_cfg_ssbd_mask;
+
+ raw_spin_lock(&st->shared_state->lock);
+ /* First sibling enables SSBD: */
+ if (!st->shared_state->disable_state)
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+ st->shared_state->disable_state++;
+ raw_spin_unlock(&st->shared_state->lock);
+ } else {
+ if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ raw_spin_lock(&st->shared_state->lock);
+ st->shared_state->disable_state--;
+ if (!st->shared_state->disable_state)
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+ raw_spin_unlock(&st->shared_state->lock);
+ }
}
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
-/*
- * Idle related variables and functions
- */
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
+ wrmsrq(MSR_AMD64_LS_CFG, msr);
+}
+#endif
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
+{
+ /*
+ * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
+ * so ssbd_tif_to_spec_ctrl() just works.
+ */
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
-#ifdef CONFIG_X86_32
/*
- * This halt magic was a workaround for ancient floppy DMA
- * wreckage. It should be safe to remove.
+ * Update the MSRs managing speculation control, during context switch.
+ *
+ * tifp: Previous task's thread flags
+ * tifn: Next task's thread flags
*/
-static int hlt_counter;
-void disable_hlt(void)
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+ unsigned long tifn)
{
- hlt_counter++;
+ unsigned long tif_diff = tifp ^ tifn;
+ u64 msr = x86_spec_ctrl_base;
+ bool updmsr = false;
+
+ lockdep_assert_irqs_disabled();
+
+ /* Handle change of TIF_SSBD depending on the mitigation method. */
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
+ amd_set_ssb_virt_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
+ amd_set_core_ssb_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ updmsr |= !!(tif_diff & _TIF_SSBD);
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
+ }
+
+ /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
+ if (IS_ENABLED(CONFIG_SMP) &&
+ static_branch_unlikely(&switch_to_cond_stibp)) {
+ updmsr |= !!(tif_diff & _TIF_SPEC_IB);
+ msr |= stibp_tif_to_spec_ctrl(tifn);
+ }
+
+ if (updmsr)
+ update_spec_ctrl_cond(msr);
}
-EXPORT_SYMBOL(disable_hlt);
-void enable_hlt(void)
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
- hlt_counter--;
+ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
+ if (task_spec_ssb_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SSBD);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SSBD);
+
+ if (task_spec_ib_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ }
+ /* Return the updated threadinfo flags */
+ return read_task_thread_flags(tsk);
}
-EXPORT_SYMBOL(enable_hlt);
-static inline int hlt_use_halt(void)
+void speculation_ctrl_update(unsigned long tif)
{
- return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+ unsigned long flags;
+
+ /* Forced update. Make sure all relevant TIF flags are different */
+ local_irq_save(flags);
+ __speculation_ctrl_update(~tif, tif);
+ local_irq_restore(flags);
}
-#else
-static inline int hlt_use_halt(void)
+
+/* Called from seccomp/prctl update */
+void speculation_ctrl_update_current(void)
{
- return 1;
+ preempt_disable();
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
+ preempt_enable();
}
-#endif
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
+static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
- if (hlt_use_halt()) {
- trace_power_start(POWER_CSTATE, 1);
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
+ unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- } else {
- local_irq_enable();
- /* loop is done by the caller */
- cpu_relax();
+ newval = cr4 ^ mask;
+ if (newval != cr4) {
+ this_cpu_write(cpu_tlbstate.cr4, newval);
+ __write_cr4(newval);
}
}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
-void stop_this_cpu(void *dummy)
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- set_cpu_online(smp_processor_id(), false);
- disable_local_APIC();
+ unsigned long tifp, tifn;
- for (;;) {
- if (hlt_works(smp_processor_id()))
- halt();
+ tifn = read_task_thread_flags(next_p);
+ tifp = read_task_thread_flags(prev_p);
+
+ switch_to_bitmap(tifp);
+
+ propagate_user_return_notify(prev_p, next_p);
+
+ if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+ arch_has_block_step()) {
+ unsigned long debugctl, msk;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ msk = tifn & _TIF_BLOCKSTEP;
+ debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
}
-}
-static void do_nothing(void *unused)
-{
+ if ((tifp ^ tifn) & _TIF_NOTSC)
+ cr4_toggle_bits_irqsoff(X86_CR4_TSD);
+
+ if ((tifp ^ tifn) & _TIF_NOCPUID)
+ set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
+ __speculation_ctrl_update(tifp, tifn);
+ } else {
+ speculation_ctrl_update_tif(prev_p);
+ tifn = speculation_ctrl_update_tif(next_p);
+
+ /* Enforce MSR update to ensure consistent state */
+ __speculation_ctrl_update(~tifn, tifn);
+ }
}
/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
+ * Idle related variables and functions
*/
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(boot_option_idle_override);
/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
+ * We use this if we don't have any better idle routine..
*/
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+void __cpuidle default_idle(void)
{
- trace_power_start(POWER_CSTATE, (ax>>4)+1);
- if (!need_resched()) {
- if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
+ raw_safe_halt();
+ raw_local_irq_disable();
+}
+#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
+EXPORT_SYMBOL(default_idle);
+#endif
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
+DEFINE_STATIC_CALL_NULL(x86_idle, default_idle);
+
+static bool x86_idle_set(void)
+{
+ return !!static_call_query(x86_idle);
}
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
+#ifndef CONFIG_SMP
+static inline void __noreturn play_dead(void)
{
- if (!need_resched()) {
- trace_power_start(POWER_CSTATE, 1);
- if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
+ BUG();
+}
+#endif
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
- } else
- local_irq_enable();
+void arch_cpu_idle_enter(void)
+{
+ tsc_verify_tsc_adjust(false);
+ local_touch_nmi();
}
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
+void __noreturn arch_cpu_idle_dead(void)
{
- trace_power_start(POWER_CSTATE, 0);
- local_irq_enable();
- while (!need_resched())
- cpu_relax();
- trace_power_end(0);
+ play_dead();
}
/*
- * mwait selection logic:
- *
- * It depends on the CPU. For AMD CPUs that support MWAIT this is
- * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
- * then depend on a clock divisor and current Pstate of the core. If
- * all cores of a processor are in halt state (C1) the processor can
- * enter the C1E (C1 enhanced) state. If mwait is used this will never
- * happen.
- *
- * idle=mwait overrides this decision and forces the usage of mwait.
+ * Called from the generic idle code.
*/
-static int __cpuinitdata force_mwait;
+void __cpuidle arch_cpu_idle(void)
+{
+ static_call(x86_idle)();
+}
+EXPORT_SYMBOL_GPL(arch_cpu_idle);
+
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
+{
+ bool ret = x86_idle_set();
+
+ static_call_update(x86_idle, default_idle);
+
+ return ret;
+}
+#endif
-#define MWAIT_INFO 0x05
-#define MWAIT_ECX_EXTENDED_INFO 0x01
-#define MWAIT_EDX_C1 0xf0
+struct cpumask cpus_stop_mask;
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+void __noreturn stop_this_cpu(void *dummy)
{
- u32 eax, ebx, ecx, edx;
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+ unsigned int cpu = smp_processor_id();
- if (force_mwait)
- return 1;
+ local_irq_disable();
- if (c->cpuid_level < MWAIT_INFO)
- return 0;
+ /*
+ * Remove this CPU from the online mask and disable it
+ * unconditionally. This might be redundant in case that the reboot
+ * vector was handled late and stop_other_cpus() sent an NMI.
+ *
+ * According to SDM and APM NMIs can be accepted even after soft
+ * disabling the local APIC.
+ */
+ set_cpu_online(cpu, false);
+ disable_local_APIC();
+ mcheck_cpu_clear(c);
- cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
- /* Check, whether EDX has extended info about MWAIT */
- if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
- return 1;
+ if (this_cpu_read(cache_state_incoherent))
+ wbinvd();
/*
- * edx enumeratios MONITOR/MWAIT extensions. Check, whether
- * C1 supports MWAIT
+ * This brings a cache line back and dirties it, but
+ * native_stop_other_cpus() will overwrite cpus_stop_mask after it
+ * observed that all CPUs reported stop. This write will invalidate
+ * the related cache line on this CPU.
*/
- return (edx & MWAIT_EDX_C1);
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
+#ifdef CONFIG_SMP
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ BUG();
+ }
+#endif
+
+ for (;;) {
+ /*
+ * Use native_halt() so that memory contents don't change
+ * (stack usage and variables) after possibly issuing the
+ * wbinvd() above.
+ */
+ native_halt();
+ }
}
/*
- * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
- * For more information see
- * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
- * - Erratum #365 for family 0x11 (not affected because C1e not in use)
+ * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf
+ * exists and whenever MONITOR/MWAIT extensions are present there is at
+ * least one C1 substate.
+ *
+ * Do not prefer MWAIT if the MONITOR instruction has a bug, or if
+ * idle=nomwait is passed on the kernel command line.
*/
-static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+static __init bool prefer_mwait_c1_over_halt(void)
{
- u64 val;
- if (c->x86_vendor != X86_VENDOR_AMD)
- goto no_c1e_idle;
+ const struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 eax, ebx, ecx, edx;
- /* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0F && c->x86_model >= 0x40)
- return 1;
+ /* If override is enforced on the command line, fall back to HALT. */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return false;
- if (c->x86 == 0x10) {
- /*
- * check OSVW bit for CPUs that are not affected
- * by erratum #400
- */
- if (cpu_has(c, X86_FEATURE_OSVW)) {
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
- if (val >= 2) {
- rdmsrl(MSR_AMD64_OSVW_STATUS, val);
- if (!(val & BIT(1)))
- goto no_c1e_idle;
- }
- }
- return 1;
- }
+ /* MWAIT is not supported on this platform. Fall back to HALT */
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ return false;
-no_c1e_idle:
- return 0;
-}
+ /* Monitor has a bug or the APIC stops in C1E. Fall back to HALT */
+ if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
+ return false;
-static cpumask_var_t c1e_mask;
-static int c1e_detected;
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
-void c1e_remove_cpu(int cpu)
-{
- if (c1e_mask != NULL)
- cpumask_clear_cpu(cpu, c1e_mask);
+ /*
+ * If MWAIT extensions are not available, it is safe to use MWAIT
+ * with EAX=0, ECX=0.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
+ return true;
+
+ /*
+ * If MWAIT extensions are available, there should be at least one
+ * MWAIT C1 substate present.
+ */
+ return !!(edx & MWAIT_C1_SUBSTATE_MASK);
}
/*
- * C1E aware idle routine. We check for C1E active in the interrupt
- * pending message MSR. If we detect C1E, then we handle it the same
- * way as C3 power states (local apic timer and TSC stop)
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
*/
-static void c1e_idle(void)
+static __cpuidle void mwait_idle(void)
{
if (need_resched())
return;
- if (!c1e_detected) {
- u32 lo, hi;
+ x86_idle_clear_cpu_buffers();
- rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
- if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = 1;
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halt in AMD C1E");
- printk(KERN_INFO "System has AMD C1E enabled\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
- }
- }
+ if (!current_set_polling_and_test()) {
+ const void *addr = &current_thread_info()->flags;
- if (c1e_detected) {
- int cpu = smp_processor_id();
-
- if (!cpumask_test_cpu(cpu, c1e_mask)) {
- cpumask_set_cpu(cpu, c1e_mask);
- /*
- * Force broadcast so ACPI can not interfere.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
- &cpu);
- printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
- cpu);
- }
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+ alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr));
+ __monitor(addr, 0, 0);
+ if (need_resched())
+ goto out;
- default_idle();
+ __sti_mwait(0, 0);
+ raw_local_irq_disable();
+ }
- /*
- * The switch back from broadcast mode needs to be
- * called with interrupts disabled.
- */
- local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
- local_irq_enable();
- } else
- default_idle();
+out:
+ __current_clr_polling();
}
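mwait_idle() follows a classic lost-wakeup discipline: arm the monitor, re-check need_resched(), and only then wait. A hedged user-space analogue of that ordering — a condition variable stands in for MONITOR/MWAIT, and all names are invented; the clflush workaround and CPU-buffer clearing have no counterpart here:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
	static atomic_bool need_resched_flag;

	static void idle_once(void)
	{
		if (atomic_load(&need_resched_flag))      /* if (need_resched()) */
			return;
		pthread_mutex_lock(&lock);                /* __monitor(): arm */
		if (!atomic_load(&need_resched_flag))     /* re-check, then wait */
			pthread_cond_wait(&kick, &lock);  /* __sti_mwait() */
		pthread_mutex_unlock(&lock);
	}

	static void *waker(void *arg)
	{
		pthread_mutex_lock(&lock);
		atomic_store(&need_resched_flag, 1);      /* set TIF_NEED_RESCHED */
		pthread_cond_signal(&kick);               /* the monitored write */
		pthread_mutex_unlock(&lock);
		return arg;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, waker, NULL);
		idle_once();                              /* returns once kicked */
		pthread_join(t, NULL);
		puts("woken");
		return 0;
	}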
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+void __init select_idle_routine(void)
{
-#ifdef CONFIG_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
+ if (boot_option_idle_override == IDLE_POLL) {
+ if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
+ return;
}
-#endif
- if (pm_idle)
+
+ /* Required to guard against xen_set_default_idle() */
+ if (x86_idle_set())
return;
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * One CPU supports mwait => All CPUs supports mwait
- */
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- } else if (check_c1e_idle(c)) {
- printk(KERN_INFO "using C1E aware idle routine\n");
- pm_idle = c1e_idle;
- } else
- pm_idle = default_idle;
+ if (prefer_mwait_c1_over_halt()) {
+ pr_info("using mwait in idle threads\n");
+ static_call_update(x86_idle, mwait_idle);
+ } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_info("using TDX aware idle routine\n");
+ static_call_update(x86_idle, tdx_halt);
+ } else {
+ static_call_update(x86_idle, default_idle);
+ }
}
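static_call_update() patches the call site in place; the closest portable analogue is a function pointer selected once at boot. A sketch of the selection shape only (a real static call avoids the indirect-branch cost that this pointer still pays):

	#include <stdbool.h>
	#include <stdio.h>

	static void default_idle_fn(void) { puts("default_idle()"); }
	static void mwait_idle_fn(void)   { puts("mwait_idle()"); }
	static void tdx_halt_fn(void)     { puts("tdx_halt()"); }

	static void (*x86_idle_fn)(void) = default_idle_fn;

	static void select_idle_routine_sketch(bool mwait_c1, bool tdx_guest)
	{
		if (mwait_c1)
			x86_idle_fn = mwait_idle_fn;  /* static_call_update() */
		else if (tdx_guest)
			x86_idle_fn = tdx_halt_fn;
	}

	int main(void)
	{
		select_idle_routine_sketch(true, false);
		x86_idle_fn();                        /* the idle loop's call site */
		return 0;
	}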
-void __init init_c1e_mask(void)
+void amd_e400_c1e_apic_setup(void)
{
- /* If we're using c1e_idle, we need to allocate c1e_mask. */
- if (pm_idle == c1e_idle)
- zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+ if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+ local_irq_disable();
+ tick_broadcast_force();
+ local_irq_enable();
+ }
+}
+
+void __init arch_post_acpi_subsys_init(void)
+{
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+ * AMD E400 detection needs to happen after ACPI has been enabled. If
+	 * the machine is affected, the K8_INTP_C1E_ACTIVE_MASK bits are set
+	 * in MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return;
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE))
+ static_branch_enable(&arch_needs_tick_broadcast);
+ pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
}
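With the msr driver loaded, the same E400 probe can be repeated from user space (root required). The constants below are copied from the kernel's msr-index.h; treat the sketch as illustrative, not a supported interface:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define MSR_K8_INT_PENDING_MSG  0xc0010055
	#define K8_INTP_C1E_ACTIVE_MASK 0x18000000u

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);  /* needs msr.ko + root */

		if (fd < 0 ||
		    pread(fd, &val, sizeof(val), MSR_K8_INT_PENDING_MSG) != sizeof(val)) {
			perror("rdmsr");
			return 1;
		}
		printf("C1E %sactive\n", (val & K8_INTP_C1E_ACTIVE_MASK) ? "" : "not ");
		close(fd);
		return 0;
	}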
static int __init idle_setup(char *str)
@@ -658,34 +999,19 @@ static int __init idle_setup(char *str)
return -EINVAL;
if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else if (!strcmp(str, "halt")) {
- /*
- * When the boot option of idle=halt is added, halt is
- * forced to be used for CPU idle. In such case CPU C2/C3
- * won't be used again.
- * To continue to load the CPU idle driver, don't touch
- * the boot_option_idle_override.
- */
- pm_idle = default_idle;
- idle_halt = 1;
- return 0;
+ pr_info("using polling idle threads\n");
+ boot_option_idle_override = IDLE_POLL;
+ cpu_idle_poll_ctrl(true);
+ } else if (!strcmp(str, "halt")) {
+		/* 'idle=halt': use HALT for idle. C-states are disabled. */
+ boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
- /*
- * If the boot option of "idle=nomwait" is added,
- * it means that mwait will be disabled for CPU C2/C3
- * states. In such case it won't touch the variable
- * of boot_option_idle_override.
- */
- idle_nomwait = 1;
- return 0;
- } else
- return -1;
+ /* 'idle=nomwait' disables MWAIT for idle */
+ boot_option_idle_override = IDLE_NOMWAIT;
+ } else {
+ return -EINVAL;
+ }
- boot_option_idle_override = 1;
return 0;
}
early_param("idle", idle_setup);
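Usage example (illustrative boot line): with this parser, a kernel command line such as "... root=/dev/sda1 idle=nomwait" simply records IDLE_NOMWAIT in boot_option_idle_override and lets prefer_mwait_c1_over_halt() veto MWAIT at selection time, while "idle=poll" additionally switches the generic idle loop to polling via cpu_idle_poll_ctrl(true).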
@@ -693,13 +1019,69 @@ early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
+ sp -= get_random_u32_below(8192);
return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ if (mmap_is_ia32())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
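A sketch of the underlying "random page-aligned offset below a bound" calculation, with rand() standing in for the kernel's CSPRNG (randomize_page() also clamps against address-space limits, which is omitted here):

	#include <stdio.h>
	#include <stdlib.h>

	#define PAGE_SHIFT 12
	#define SZ_1G      (1UL << 30)

	static unsigned long randomize_page_sketch(unsigned long start,
						   unsigned long range)
	{
		range >>= PAGE_SHIFT;                 /* bytes -> pages */
		return start + (((unsigned long)rand() % range) << PAGE_SHIFT);
	}

	int main(void)
	{
		unsigned long brk = 0x555555560000UL;

		srand(42);                            /* fixed seed: just a demo */
		printf("randomized brk: %#lx\n",
		       randomize_page_sketch(brk, SZ_1G));
		return 0;
	}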
+
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long __get_wchan(struct task_struct *p)
+{
+ struct unwind_state state;
+ unsigned long addr = 0;
+
+ if (!try_get_task_stack(p))
+ return 0;
+
+ for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+ if (in_sched_functions(addr))
+ continue;
+ break;
+ }
+
+ put_task_stack(p);
+
+ return addr;
+}
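The value computed here is what /proc/<pid>/wchan reports: for a sleeping task, `cat /proc/1/wchan` will typically print the symbol of the first non-scheduler function on the stack (for example something like ep_poll, depending on what init happens to be blocked in).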
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ switch (option) {
+ case ARCH_GET_CPUID:
+ return get_cpuid_mode();
+ case ARCH_SET_CPUID:
+ return set_cpuid_mode(arg2);
+ case ARCH_GET_XCOMP_SUPP:
+ case ARCH_GET_XCOMP_PERM:
+ case ARCH_REQ_XCOMP_PERM:
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ return fpu_xstate_prctl(option, arg2);
+ }
+
+ if (!in_ia32_syscall())
+ return do_arch_prctl_64(current, option, arg2);
+
+ return -EINVAL;
}
+SYSCALL_DEFINE0(ni_syscall)
+{
+ return -ENOSYS;
+}
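The new syscall surface can be exercised from user space; a minimal sketch for the CPUID-faulting pair (glibc has no arch_prctl wrapper, so the raw syscall is used; ARCH_GET_CPUID returns nonzero when the CPUID instruction is permitted, and ARCH_SET_CPUID(0) only succeeds on CPUs with CPUID faulting):

	#include <asm/prctl.h>       /* ARCH_GET_CPUID, ARCH_SET_CPUID */
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		long ret = syscall(SYS_arch_prctl, ARCH_GET_CPUID, 0);

		printf("ARCH_GET_CPUID -> %ld\n", ret);  /* 1: CPUID allowed */

		/* Arm CPUID faulting where the hardware supports it: */
		if (syscall(SYS_arch_prctl, ARCH_SET_CPUID, 0) == 0)
			puts("CPUID now faults in this task");
		return 0;
	}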
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
new file mode 100644
index 000000000000..76b547b83232
--- /dev/null
+++ b/arch/x86/kernel/process.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+//
+// Code shared between 32 and 64 bit
+
+#include <asm/spec-ctrl.h>
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
+
+/*
+ * This needs to be inline to optimize for the common case where no extra
+ * work needs to be done.
+ */
+static inline void switch_to_extra(struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long next_tif = read_task_thread_flags(next);
+ unsigned long prev_tif = read_task_thread_flags(prev);
+
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /*
+ * Avoid __switch_to_xtra() invocation when conditional
+ * STIBP is disabled and the only different bit is
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
+ * in the TIF_WORK_CTXSW masks.
+ */
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
+ prev_tif &= ~_TIF_SPEC_IB;
+ next_tif &= ~_TIF_SPEC_IB;
+ }
+ }
+
+ /*
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
+ * speculation mitigations etc.
+ */
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
+ prev_tif & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev, next);
+}
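In isolation, the STIBP filtering above is just "mask one bit out of both snapshots unless the static key says it matters". A self-contained sketch with invented bit positions:

	#include <stdbool.h>
	#include <stdio.h>

	#define TIF_SPEC_IB   (1UL << 9)   /* illustrative bit positions */
	#define TIF_IO_BITMAP (1UL << 22)
	#define WORK_MASK     (TIF_SPEC_IB | TIF_IO_BITMAP)

	static bool needs_switch_to_xtra(unsigned long prev_tif,
					 unsigned long next_tif,
					 bool cond_stibp)
	{
		if (!cond_stibp) {             /* conditional STIBP disabled */
			prev_tif &= ~TIF_SPEC_IB;
			next_tif &= ~TIF_SPEC_IB;
		}
		return (prev_tif | next_tif) & WORK_MASK;
	}

	int main(void)
	{
		/* Only TIF_SPEC_IB differs and conditional STIBP is off: skip. */
		printf("%d\n", needs_switch_to_xtra(TIF_SPEC_IB, 0, false)); /* 0 */
		printf("%d\n", needs_switch_to_xtra(TIF_SPEC_IB, 0, true));  /* 1 */
		return 0;
	}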
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..3ef15c2f152f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,10 +9,11 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -25,152 +26,80 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/reboot.h>
-#include <linux/init.h>
#include <linux/mc146818rtc.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/personality.h>
-#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/syscalls.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/fpu/sched.h>
#include <asm/desc.h>
-#ifdef CONFIG_MATH_EMULATION
-#include <asm/math_emu.h>
-#endif
#include <linux/err.h>
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/idle.h>
-#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/vm86.h>
+#include <asm/resctrl.h>
+#include <asm/proto.h>
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+#include "process.h"
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((unsigned long *)tsk->thread.sp)[3];
-}
-
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
-{
- BUG();
-}
-#endif
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle(void)
-{
- int cpu = smp_processor_id();
-
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- current_thread_info()->status |= TS_POLLING;
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_stop_sched_tick(1);
- while (!need_resched()) {
-
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(cpu))
- play_dead();
-
- local_irq_disable();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
- pm_idle();
- start_critical_timings();
- }
- tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
- }
-}
-
-void __show_regs(struct pt_regs *regs, int all)
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
+ const char *log_lvl)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
unsigned long d0, d1, d2, d3, d6, d7;
- unsigned long sp;
- unsigned short ss, gs;
-
- if (user_mode_vm(regs)) {
- sp = regs->sp;
- ss = regs->ss & 0xffff;
- gs = get_user_gs(regs);
- } else {
- sp = kernel_stack_pointer(regs);
- savesegment(ss, ss);
- savesegment(gs, gs);
- }
+ unsigned short gs;
- show_regs_common();
+ savesegment(gs, gs);
- printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
+ show_ip(regs, log_lvl);
- printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
- regs->ax, regs->bx, regs->cx, regs->dx);
- printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
- regs->si, regs->di, regs->bp, sp);
- printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
- (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+ printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ log_lvl, regs->ax, regs->bx, regs->cx, regs->dx);
+ printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ log_lvl, regs->si, regs->di, regs->bp, regs->sp);
+ printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+ log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags);
- if (!all)
+ if (mode != SHOW_REGS_ALL)
return;
cr0 = read_cr0();
cr2 = read_cr2();
- cr3 = read_cr3();
- cr4 = read_cr4_safe();
- printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
- cr0, cr2, cr3, cr4);
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+ printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ log_lvl, cr0, cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
get_debugreg(d3, 3);
- printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
- d0, d1, d2, d3);
-
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
- d6, d7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))
+ return;
+
+ printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ log_lvl, d0, d1, d2, d3);
+ printk("%sDR6: %08lx DR7: %08lx\n",
+ log_lvl, d6, d7);
}
void release_thread(struct task_struct *dead_task)
@@ -179,89 +108,24 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
-{
- struct pt_regs *childregs;
- struct task_struct *tsk;
- int err;
-
- childregs = task_pt_regs(p);
- *childregs = *regs;
- childregs->ax = 0;
- childregs->sp = sp;
-
- p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
-
- p->thread.ip = (unsigned long) ret_from_fork;
-
- task_user_gs(p) = get_user_gs(regs);
-
- p->thread.io_bitmap_ptr = NULL;
- tsk = current;
- err = -ENOMEM;
-
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
- if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
-
- err = 0;
-
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS)
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
-
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
- return err;
-}
-
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- set_user_gs(regs, 0);
+ loadsegment(gs, 0);
regs->fs = 0;
- set_fs(USER_DS);
regs->ds = __USER_DS;
regs->es = __USER_DS;
regs->ss = __USER_DS;
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
- /*
- * Free the old FP and other extended state
- */
- free_thread_xstate(current);
+ regs->flags = X86_EFLAGS_IF;
}
EXPORT_SYMBOL_GPL(start_thread);
/*
- * switch_to(x,yn) should switch tasks from x to y.
+ * switch_to(x,y) should switch tasks from x to y.
*
* We fsave/fwait so that an exception goes off at the right time
* (as a call from the fsave or fwait in effect) rather than to
@@ -287,34 +151,16 @@ EXPORT_SYMBOL_GPL(start_thread);
* the task-switch, and shows up in ret_from_fork in entry.S,
* for example.
*/
-__notrace_funcgraph struct task_struct *
+__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
- bool preload_fpu;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- /*
- * If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- */
- preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
-
- __unlazy_fpu(prev_p);
-
- /* we're going to use this soon, after a few expensive things */
- if (preload_fpu)
- prefetch(next->fpu.state);
-
- /*
- * Reload esp0.
- */
- load_sp0(tss, next);
+ switch_fpu(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -326,81 +172,43 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- lazy_save_gs(prev->gs);
+ savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
*/
load_TLS(next, cpu);
- /*
- * Restore IOPL if needed. In normal use, the flags restore
- * in the switch assembly will handle this. But if the kernel
- * is running virtualized at a non-zero CPL, the popf will
- * not restore flags, so it must be done in a separate step.
- */
- if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
- set_iopl_mask(next->iopl);
-
- /*
- * Now maybe handle debug registers and/or IO bitmaps
- */
- if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
- task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
- __switch_to_xtra(prev_p, next_p, tss);
-
- /* If we're going to preload the fpu context, make sure clts
- is run while we're batching the cpu state updates. */
- if (preload_fpu)
- clts();
+ switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.
* This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * the GDT and LDT are properly updated.
*/
arch_end_context_switch(next_p);
- if (preload_fpu)
- __math_state_restore();
+ /*
+ * Reload esp0 and cpu_current_top_of_stack. This changes
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
+ */
+ update_task_stack(next_p);
+ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
/*
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- lazy_load_gs(next->gs);
+ loadsegment(gs, next->gs);
- percpu_write(current_task, next_p);
-
- return prev_p;
-}
+ raw_cpu_write(current_task, next_p);
-#define top_esp (THREAD_SIZE - sizeof(unsigned long))
-#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
+ /* Load the Intel cache allocation PQR MSR. */
+ resctrl_arch_sched_in(next_p);
-unsigned long get_wchan(struct task_struct *p)
-{
- unsigned long bp, sp, ip;
- unsigned long stack_page;
- int count = 0;
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
- stack_page = (unsigned long)task_stack_page(p);
- sp = p->thread.sp;
- if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
- return 0;
- /* include/asm-i386/system.h:switch_to() pushes bp last. */
- bp = *(unsigned long *) sp;
- do {
- if (bp < stack_page || bp > top_ebp+stack_page)
- return 0;
- ip = *(unsigned long *) (bp+4);
- if (!in_sched_functions(ip))
- return ip;
- bp = *(unsigned long *) bp;
- } while (count++ < 16);
- return 0;
+ return prev_p;
}
-
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..432c0a004c60 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1995 Linus Torvalds
*
@@ -14,10 +15,11 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -27,321 +29,553 @@
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
-#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
+#include <linux/syscalls.h>
+#include <linux/iommu.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/pkru.h>
+#include <asm/fpu/sched.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
-#include <asm/idle.h>
-#include <asm/syscalls.h>
#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/vdso.h>
+#include <asm/resctrl.h>
+#include <asm/unistd.h>
+#include <asm/fsgsbase.h>
+#include <asm/fred.h>
+#include <asm/msr.h>
+#ifdef CONFIG_IA32_EMULATION
+/* Not included via unistd.h */
+#include <asm/unistd_32_ia32.h>
+#endif
-asmlinkage extern void ret_from_fork(void);
+#include "process.h"
-DEFINE_PER_CPU(unsigned long, old_rsp);
-static DEFINE_PER_CPU(unsigned char, is_idle);
+/* Also prints some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
+ const char *log_lvl)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned int fsindex, gsindex;
+ unsigned int ds, es;
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+ show_iret_regs(regs, log_lvl);
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
+ printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
+ log_lvl, regs->ax, regs->bx, regs->cx);
+ printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
+ log_lvl, regs->dx, regs->si, regs->di);
+ printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
+ log_lvl, regs->bp, regs->r8, regs->r9);
+ printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
+ log_lvl, regs->r10, regs->r11, regs->r12);
+ printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
+ log_lvl, regs->r13, regs->r14, regs->r15);
+
+ if (mode == SHOW_REGS_SHORT)
+ return;
+
+ if (mode == SHOW_REGS_USER) {
+ rdmsrq(MSR_FS_BASE, fs);
+ rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
+ printk("%sFS: %016lx GS: %016lx\n",
+ log_lvl, fs, shadowgs);
+ return;
+ }
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%es,%0" : "=r" (es));
+ asm("movl %%fs,%0" : "=r" (fsindex));
+ asm("movl %%gs,%0" : "=r" (gsindex));
+
+ rdmsrq(MSR_FS_BASE, fs);
+ rdmsrq(MSR_GS_BASE, gs);
+ rdmsrq(MSR_KERNEL_GS_BASE, shadowgs);
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+
+ printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ log_lvl, fs, fsindex, gs, gsindex, shadowgs);
+ printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n",
+ log_lvl, regs->cs, ds, es, cr0);
+ printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
+ log_lvl, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) {
+ printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
+ log_lvl, d0, d1, d2);
+ printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
+ log_lvl, d3, d6, d7);
+ }
+
+ if (cr4 & X86_CR4_PKE)
+ printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-void idle_notifier_unregister(struct notifier_block *n)
+void release_thread(struct task_struct *dead_task)
{
- atomic_notifier_chain_unregister(&idle_notifier, n);
+ WARN_ON(dead_task->mm);
}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-void enter_idle(void)
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Out of line to be protected from kprobes and tracing. If this were
+ * traced or probed, any access to a per-CPU variable would happen
+ * with the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with native_ prefix.
+ */
+static noinstr unsigned long __rdgsbase_inactive(void)
{
- percpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ unsigned long gsbase;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * SWAPGS is no longer needed thus NOT allowed with FRED because
+ * FRED transitions ensure that an operating system can _always_
+ * operate with its own GS base address:
+ * - For events that occur in ring 3, FRED event delivery swaps
+ * the GS base address with the IA32_KERNEL_GS_BASE MSR.
+ * - ERETU (the FRED transition that returns to ring 3) also swaps
+ * the GS base address with the IA32_KERNEL_GS_BASE MSR.
+ *
+ * And the operating system can still setup the GS segment for a
+ * user thread without the need of loading a user thread GS with:
+ * - Using LKGS, available with FRED, to modify other attributes
+ * of the GS segment without compromising its ability always to
+ * operate with its own GS base address.
+ * - Accessing the GS segment base address for a user thread as
+ * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
+ *
+ * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
+ * MSR instead of the GS segment's descriptor cache. As such, the
+ * operating system never changes its runtime GS base address.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ native_swapgs();
+ gsbase = rdgsbase();
+ native_swapgs();
+ } else {
+ instrumentation_begin();
+ rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ instrumentation_end();
+ }
+
+ return gsbase;
}
-static void __exit_idle(void)
+/*
+ * Out of line to be protected from kprobes and tracing. If this were
+ * traced or probed, any access to a per-CPU variable would happen
+ * with the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with native_ prefix.
+ */
+static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ lockdep_assert_irqs_disabled();
+
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ native_swapgs();
+ wrgsbase(gsbase);
+ native_swapgs();
+ } else {
+ instrumentation_begin();
+ wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ instrumentation_end();
+ }
}
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
}
-#ifndef CONFIG_SMP
-static inline void play_dead(void)
+static __always_inline void save_fsgs(struct task_struct *task)
{
- BUG();
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /*
+ * If FSGSBASE is enabled, we can't make any useful guesses
+ * about the base, and user code expects us to save the current
+ * value. Fortunately, reading the base directly is efficient.
+ */
+ task->thread.fsbase = rdfsbase();
+ task->thread.gsbase = __rdgsbase_inactive();
+ } else {
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+ }
}
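On kernels that enable CR4.FSGSBASE (v5.9+), user space can read its own FS/GS bases with the same instructions save_fsgs() relies on. A sketch, compiled with -mfsgsbase; it will #UD (crash) on older kernels or CPUs, so real code should probe for the "fsgsbase" flag in /proc/cpuinfo first:

	#include <immintrin.h>
	#include <stdio.h>

	int main(void)
	{
		/* RDFSBASE/RDGSBASE: the fast path taken when
		 * X86_FEATURE_FSGSBASE is set. */
		printf("fsbase=%#llx gsbase=%#llx\n",
		       (unsigned long long)_readfsbase_u64(),
		       (unsigned long long)_readgsbase_u64());
		return 0;
	}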
-#endif
/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
+ * While a process is running, current->thread.fsbase and
+ * current->thread.gsbase may not match the corresponding CPU
+ * registers (see save_base_legacy()).
*/
-void cpu_idle(void)
+void current_save_fsgs(void)
{
- current_thread_info()->status |= TS_POLLING;
+ unsigned long flags;
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. CPU0 already has it initialized but no harm in
- * doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-
- /* endless idle loop with no priority at all */
- while (1) {
- tick_nohz_stop_sched_tick(1);
- while (!need_resched()) {
+ /* Interrupts need to be off for FSGSBASE */
+ local_irq_save(flags);
+ save_fsgs(current);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_FOR_KVM(current_save_fsgs);
- rmb();
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
- if (cpu_is_offline(smp_processor_id()))
- play_dead();
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
/*
- * Idle routines should keep interrupts disabled
- * from here on, until they go to idle.
- * Otherwise, idle callbacks can misfire.
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
*/
- local_irq_disable();
- enter_idle();
- /* Don't trace irqs off for idle */
- stop_critical_timings();
- pm_idle();
- start_critical_timings();
- /* In many cases the interrupt that ended idle
- has already called exit_idle. But some idle
- loops can be woken up without interrupt. */
- __exit_idle();
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
}
-
- tick_nohz_restart_sched_tick();
- preempt_enable_no_resched();
- schedule();
- preempt_disable();
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
}
}
-/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs *regs, int all)
+/*
+ * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
+ * is not XSTATE managed on context switch because that would require a
+ * lookup in the task's FPU xsave buffer and require to keep that updated
+ * in various places.
+ */
+static __always_inline void x86_pkru_load(struct thread_struct *prev,
+ struct thread_struct *next)
{
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
- unsigned long d0, d1, d2, d3, d6, d7;
- unsigned int fsindex, gsindex;
- unsigned int ds, cs, es;
-
- show_regs_common();
- printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip, 1);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
- regs->sp, regs->flags);
- printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
- regs->ax, regs->bx, regs->cx);
- printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
- regs->dx, regs->si, regs->di);
- printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
- regs->bp, regs->r8, regs->r9);
- printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
- asm("movl %%fs,%0" : "=r" (fsindex));
- asm("movl %%gs,%0" : "=r" (gsindex));
-
- rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
-
- if (!all)
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return;
- cr0 = read_cr0();
- cr2 = read_cr2();
- cr3 = read_cr3();
- cr4 = read_cr4();
-
- printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
- es, cr0);
- printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
- cr4);
+ /* Stash the prev task's value: */
+ prev->pkru = rdpkru();
- get_debugreg(d0, 0);
- get_debugreg(d1, 1);
- get_debugreg(d2, 2);
- printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
- get_debugreg(d3, 3);
- get_debugreg(d6, 6);
- get_debugreg(d7, 7);
- printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ /*
+ * PKRU writes are slightly expensive. Avoid them when not
+ * strictly necessary:
+ */
+ if (prev->pkru != next->pkru)
+ wrpkru(next->pkru);
}
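The PKRU value being context-switched here is the one user space programs through the pkey API. An illustrative round trip (assumes glibc 2.27+ and an OSPKE-capable CPU; pkey_alloc() fails cleanly otherwise):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE); /* writes PKRU */
		char *p;

		if (pkey < 0) {
			perror("pkey_alloc");                 /* no OSPKE here */
			return 1;
		}
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
		printf("read ok: %d\n", p[0]);                /* reads allowed */
		/* p[0] = 'x'; would SIGSEGV: PKRU forbids writes for this key */
		pkey_free(pkey);
		return 0;
	}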
-void release_thread(struct task_struct *dead_task)
+static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
+ struct thread_struct *next)
{
- if (dead_task->mm) {
- if (dead_task->mm->context.size) {
- printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
- dead_task->comm,
- dead_task->mm->context.ldt,
- dead_task->mm->context.size);
- BUG();
- }
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /* Update the FS and GS selectors if they could have changed. */
+ if (unlikely(prev->fsindex || next->fsindex))
+ loadseg(FS, next->fsindex);
+ if (unlikely(prev->gsindex || next->gsindex))
+ loadseg(GS, next->gsindex);
+
+ /* Update the bases. */
+ wrfsbase(next->fsbase);
+ __wrgsbase_inactive(next->gsbase);
+ } else {
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
}
}
-static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+ unsigned short selector)
{
- struct user_desc ud = {
- .base_addr = addr,
- .limit = 0xfffff,
- .seg_32bit = 1,
- .limit_in_pages = 1,
- .useable = 1,
- };
- struct desc_struct *desc = t->thread.tls_array;
- desc += tls;
- fill_ldt(desc, &ud);
+ unsigned short idx = selector >> 3;
+ unsigned long base;
+
+ if (likely((selector & SEGMENT_TI_MASK) == 0)) {
+ if (unlikely(idx >= GDT_ENTRIES))
+ return 0;
+
+ /*
+ * There are no user segments in the GDT with nonzero bases
+ * other than the TLS segments.
+ */
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return 0;
+
+ idx -= GDT_ENTRY_TLS_MIN;
+ base = get_desc_base(&task->thread.tls_array[idx]);
+ } else {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If performance here mattered, we could protect the LDT
+ * with RCU. This is a slow path, though, so we can just
+ * take the mutex.
+ */
+ mutex_lock(&task->mm->context.lock);
+ ldt = task->mm->context.ldt;
+ if (unlikely(!ldt || idx >= ldt->nr_entries))
+ base = 0;
+ else
+ base = get_desc_base(ldt->entries + idx);
+ mutex_unlock(&task->mm->context.lock);
+#else
+ base = 0;
+#endif
+ }
+
+ return base;
}
-static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+unsigned long x86_gsbase_read_cpu_inactive(void)
{
- return get_desc_base(&t->thread.tls_array[tls]);
+ unsigned long gsbase;
+
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ gsbase = __rdgsbase_inactive();
+ local_irq_restore(flags);
+ } else {
+ rdmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ }
+
+ return gsbase;
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
- unlazy_fpu(tsk);
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __wrgsbase_inactive(gsbase);
+ local_irq_restore(flags);
+ } else {
+ wrmsrq(MSR_KERNEL_GS_BASE, gsbase);
+ }
}
-int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
- struct task_struct *p, struct pt_regs *regs)
+unsigned long x86_fsbase_read_task(struct task_struct *task)
{
- int err;
- struct pt_regs *childregs;
- struct task_struct *me = current;
+ unsigned long fsbase;
- childregs = ((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(p))) - 1;
- *childregs = *regs;
-
- childregs->ax = 0;
- if (user_mode(regs))
- childregs->sp = sp;
+ if (task == current)
+ fsbase = x86_fsbase_read_cpu();
+ else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.fsindex == 0))
+ fsbase = task->thread.fsbase;
else
- childregs->sp = (unsigned long)childregs;
+ fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
- p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
- p->thread.usersp = me->thread.usersp;
+ return fsbase;
+}
- set_tsk_thread_flag(p, TIF_FORK);
+unsigned long x86_gsbase_read_task(struct task_struct *task)
+{
+ unsigned long gsbase;
- p->thread.io_bitmap_ptr = NULL;
+ if (task == current)
+ gsbase = x86_gsbase_read_cpu_inactive();
+ else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.gsindex == 0))
+ gsbase = task->thread.gsbase;
+ else
+ gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
- savesegment(gs, p->thread.gsindex);
- p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
- savesegment(fs, p->thread.fsindex);
- p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
- savesegment(es, p->thread.es);
- savesegment(ds, p->thread.ds);
+ return gsbase;
+}
- err = -ENOMEM;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
+{
+ WARN_ON_ONCE(task == current);
- if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
- if (!p->thread.io_bitmap_ptr) {
- p->thread.io_bitmap_max = 0;
- return -ENOMEM;
- }
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES);
- set_tsk_thread_flag(p, TIF_IO_BITMAP);
- }
+ task->thread.fsbase = fsbase;
+}
- /*
- * Set a new TLS for the child thread?
- */
- if (clone_flags & CLONE_SETTLS) {
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- err = do_set_thread_area(p, -1,
- (struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
- goto out;
- }
- err = 0;
-out:
- if (err && p->thread.io_bitmap_ptr) {
- kfree(p->thread.io_bitmap_ptr);
- p->thread.io_bitmap_max = 0;
- }
+void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
+{
+ WARN_ON_ONCE(task == current);
- return err;
+ task->thread.gsbase = gsbase;
}
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
- unsigned int _cs, unsigned int _ss, unsigned int _ds)
+ u16 _cs, u16 _ss, u16 _ds)
{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
+ reset_thread_features();
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
- regs->ip = new_ip;
- regs->sp = new_sp;
- percpu_write(old_rsp, new_sp);
- regs->cs = _cs;
- regs->ss = _ss;
- regs->flags = X86_EFLAGS_IF;
- set_fs(USER_DS);
+
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->csx = _cs;
+ regs->ssx = _ss;
/*
- * Free the old FP and other extended state
+ * Allow single-step trap and NMI when starting a new task, thus
+ * once the new task enters user space, single-step trap and NMI
+ * are both enabled immediately.
+ *
+ * Entering a new task is logically speaking a return from a
+ * system call (exec, fork, clone, etc.). As such, if ptrace
+	 * enables single stepping, a single-step exception should be
+ * allowed to trigger immediately upon entering user space.
+ * This is not optional.
+ *
+ * NMI should *never* be disabled in user space. As such, this
+ * is an optional, opportunistic way to catch errors.
+ *
+ * Paranoia: High-order 48 bits above the lowest 16 bit SS are
+ * discarded by the legacy IRET instruction on all Intel, AMD,
+ * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
+ * even when FRED is not enabled. But we choose the safer side
+ * to use these bits only when FRED is enabled.
*/
- free_thread_xstate(current);
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ regs->fred_ss.swevent = true;
+ regs->fred_ss.nmi = true;
+ }
+
+ regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}
void
@@ -350,12 +584,14 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
start_thread_common(regs, new_ip, new_sp,
__USER_CS, __USER_DS, 0);
}
+EXPORT_SYMBOL_GPL(start_thread);
-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
start_thread_common(regs, new_ip, new_sp,
- __USER32_CS, __USER32_DS, __USER32_DS);
+ x32 ? __USER_CS : __USER32_CS,
+ __USER_DS, __USER_DS);
}
#endif
@@ -369,35 +605,52 @@ void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
* Kprobes not supported here. Set the probe on schedule instead.
* Function graph tracer not supported too.
*/
-__notrace_funcgraph struct task_struct *
+__no_kmsan_checks
+__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
- unsigned fsindex, gsindex;
- bool preload_fpu;
- /*
- * If the task has used fpu the last 5 timeslices, just do a full
- * restore of the math state immediately to avoid the trap; the
- * chances of needing FPU soon are obviously high now
- */
- preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(hardirq_stack_inuse));
- /* we're going to use this soon, after a few expensive things */
- if (preload_fpu)
- prefetch(next->fpu.state);
+ switch_fpu(prev_p, cpu);
+
+ /* We must save %fs and %gs before load_TLS() because
+ * %fs and %gs may be cleared by load_TLS().
+ *
+ * (e.g. xen_load_tls())
+ */
+ save_fsgs(prev_p);
/*
- * Reload esp0, LDT and the page table pointer:
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
*/
- load_sp0(tss, next);
+ load_TLS(next, cpu);
/*
- * Switch DS and ES.
- * This won't pick up thread selector changes, but I guess that is ok.
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them.
+ */
+ arch_end_context_switch(next_p);
+
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
@@ -407,88 +660,55 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
+ x86_fsgsbase_load(prev, next);
- /* We must save %fs and %gs before load_TLS() because
- * %fs and %gs may be cleared by load_TLS().
- *
- * (e.g. xen_load_tls())
- */
- savesegment(fs, fsindex);
- savesegment(gs, gsindex);
-
- load_TLS(next, cpu);
-
- /* Must be after DS reload */
- unlazy_fpu(prev_p);
-
- /* Make sure cpu is ready for new context */
- if (preload_fpu)
- clts();
+ x86_pkru_load(prev, next);
/*
- * Leave lazy mode, flushing any hypercalls made here.
- * This must be done before restoring TLS segments so
- * the GDT and LDT are properly updated, and must be
- * done before math_state_restore, so the TS bit is up
- * to date.
+ * Switch the PDA and FPU contexts.
*/
- arch_end_context_switch(next_p);
+ raw_cpu_write(current_task, next_p);
+ raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
- /*
- * Switch FS and GS.
- *
- * Segment register != 0 always requires a reload. Also
- * reload when it has changed. When prev process used 64bit
- * base always reload to avoid an information leak.
- */
- if (unlikely(fsindex | next->fsindex | prev->fs)) {
- loadsegment(fs, next->fsindex);
+ /* Reload sp0. */
+ update_task_stack(next_p);
+
+ switch_to_extra(prev_p, next_p);
+
+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
- * Check if the user used a selector != 0; if yes
- * clear 64bit base, since overloaded base is always
- * mapped to the Null selector
+ * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+ * does not update the cached descriptor. As a result, if we
+ * do SYSRET while SS is NULL, we'll end up in user mode with
+ * SS apparently equal to __USER_DS but actually unusable.
+ *
+ * The straightforward workaround would be to fix it up just
+ * before SYSRET, but that would slow down the system call
+ * fast paths. Instead, we ensure that SS is never NULL in
+ * system call context. We do this by replacing NULL SS
+ * selectors at every context switch. SYSCALL sets up a valid
+ * SS, so the only way to get NULL is to re-enter the kernel
+ * from CPL 3 through an interrupt. Since that can't happen
+ * in the same task as a running syscall, we are guaranteed to
+ * context switch between every interrupt vector entry and a
+ * subsequent SYSRET.
+ *
+ * We read SS first because SS reads are much faster than
+ * writes. Out of caution, we force SS to __KERNEL_DS even if
+ * it previously had a different non-NULL value.
*/
- if (fsindex)
- prev->fs = 0;
+ unsigned short ss_sel;
+ savesegment(ss, ss_sel);
+ if (ss_sel != __KERNEL_DS)
+ loadsegment(ss, __KERNEL_DS);
}
- /* when next process has a 64bit base use it */
- if (next->fs)
- wrmsrl(MSR_FS_BASE, next->fs);
- prev->fsindex = fsindex;
-
- if (unlikely(gsindex | next->gsindex | prev->gs)) {
- load_gs_index(next->gsindex);
- if (gsindex)
- prev->gs = 0;
- }
- if (next->gs)
- wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
- prev->gsindex = gsindex;
-
- /*
- * Switch the PDA and FPU contexts.
- */
- prev->usersp = percpu_read(old_rsp);
- percpu_write(old_rsp, next->usersp);
- percpu_write(current_task, next_p);
- percpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ /* Load the Intel cache allocation PQR MSR. */
+ resctrl_arch_sched_in(next_p);
- /*
- * Now maybe reload the debug registers and handle I/O bitmaps
- */
- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
- task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
- __switch_to_xtra(prev_p, next_p, tss);
-
- /*
- * Preload the FPU context, now that we've determined that the
- * task is likely to be using it.
- */
- if (preload_fpu)
- __math_state_restore();
+ /* Reset hw history on AMD CPUs */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
+ wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
return prev_p;
}
@@ -498,138 +718,261 @@ void set_personality_64bit(void)
/* inherit personality from parent */
/* Make sure to be in 64bit mode */
- clear_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ADDR32);
+ /* Pretend that this comes from a 64bit execve */
+ task_pt_regs(current)->orig_ax = __NR_execve;
+ current_thread_info()->status &= ~TS_COMPAT;
+ if (current->mm)
+ __set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
so it's not too bad. The main problem is just that
- 32bit childs are affected again. */
+ 32bit children are affected again. */
current->personality &= ~READ_IMPLIES_EXEC;
}
-void set_personality_ia32(void)
+static void __set_personality_x32(void)
{
- /* inherit personality from parent */
+#ifdef CONFIG_X86_X32_ABI
+ if (current->mm)
+ current->mm->context.flags = 0;
- /* Make sure to be in 32bit mode */
- set_thread_flag(TIF_IA32);
- current->personality |= force_personality32;
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /*
+ * in_32bit_syscall() uses the presence of the x32 syscall bit
+ * flag to determine compat status. The x86 mmap() code relies on
+ * the syscall bitness so set x32 syscall bit right here to make
+ * in_32bit_syscall() work during exec().
+ *
+ * Pretend to come from a x32 execve.
+ */
+ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
+ current_thread_info()->status &= ~TS_COMPAT;
+#endif
+}
+
+static void __set_personality_ia32(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (current->mm) {
+ /*
+ * uprobes applied to this MM need to know this and
+ * cannot use user_64bit_mode() at that time.
+ */
+ __set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
+ }
+ current->personality |= force_personality32;
/* Prepare the first "return" to user space */
+ task_pt_regs(current)->orig_ax = __NR_ia32_execve;
current_thread_info()->status |= TS_COMPAT;
+#endif
}
-unsigned long get_wchan(struct task_struct *p)
+void set_personality_ia32(bool x32)
{
- unsigned long stack;
- u64 fp, ip;
- int count = 0;
+ /* Make sure to be in 32bit mode */
+ set_thread_flag(TIF_ADDR32);
+
+ if (x32)
+ __set_personality_x32();
+ else
+ __set_personality_ia32();
+}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
+#ifdef CONFIG_ADDRESS_MASKING
+
+#define LAM_U57_BITS 6
+
+static void enable_lam_func(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+ unsigned long lam;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
+ lam = mm_lam_cr3_mask(mm);
+ write_cr3(__read_cr3() | lam);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
+ }
+}
+
+static void mm_enable_lam(struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+
+ /*
+ * Even though the process must still be single-threaded at this
+ * point, kernel threads may be using the mm. IPI those kernel
+ * threads if they exist.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+}
+
+static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return -ENODEV;
+
+ /* PTRACE_ARCH_PRCTL */
+ if (current->mm != mm)
+ return -EINVAL;
+
+ if (mm_valid_pasid(mm) &&
+ !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
+ return -EINVAL;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ /*
+ * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
+ * being enabled unless the process is single threaded:
+ */
+ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
+ mmap_write_unlock(mm);
+ return -EBUSY;
+ }
+
+ if (!nr_bits || nr_bits > LAM_U57_BITS) {
+ mmap_write_unlock(mm);
+ return -EINVAL;
+ }
+
+ mm_enable_lam(mm);
+
+ mmap_write_unlock(mm);
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
- stack = (unsigned long)task_stack_page(p);
- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
- return 0;
- fp = *(u64 *)(p->thread.sp);
- do {
- if (fp < (unsigned long)stack ||
- fp >= (unsigned long)stack+THREAD_SIZE)
- return 0;
- ip = *(u64 *)(fp+8);
- if (!in_sched_functions(ip))
- return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
return 0;
}
+#endif
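User space opts into LAM through arch_prctl(). A sketch assuming a LAM-capable CPU and kernel (v6.4+); the option values are copied from asm/prctl.h and guarded in case older headers lack them:

	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef ARCH_ENABLE_TAGGED_ADDR          /* values from asm/prctl.h */
	#define ARCH_ENABLE_TAGGED_ADDR 0x4002
	#define ARCH_GET_MAX_TAG_BITS   0x4003
	#endif

	int main(void)
	{
		unsigned long bits = 0;

		syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits);
		printf("max tag bits: %lu\n", bits); /* 6 with LAM_U57, else 0 */

		if (bits &&
		    syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, bits) == 0)
			puts("LAM_U57 on: pointer bits 62:57 are now ignored");
		return 0;
	}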
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
- int doit = task == current;
- int cpu;
- switch (code) {
- case ARCH_SET_GS:
- if (addr >= TASK_SIZE_OF(task))
+ switch (option) {
+ case ARCH_SET_GS: {
+ if (unlikely(arg2 >= TASK_SIZE_MAX))
return -EPERM;
- cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
- switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
- load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
- }
- task->thread.gsindex = GS_TLS_SEL;
- task->thread.gs = 0;
+
+ preempt_disable();
+ /*
+ * ARCH_SET_GS has always overwritten the index
+ * and the base. Zero is the most sensible value
+ * to put in the index, and is the only value that
+ * makes any sense if FSGSBASE is unavailable.
+ */
+ if (task == current) {
+ loadseg(GS, 0);
+ x86_gsbase_write_cpu_inactive(arg2);
+
+ /*
+ * On non-FSGSBASE systems, save_base_legacy() expects
+ * that we also fill in thread.gsbase.
+ */
+ task->thread.gsbase = arg2;
+
} else {
task->thread.gsindex = 0;
- task->thread.gs = addr;
- if (doit) {
- load_gs_index(0);
- ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ x86_gsbase_write_task(task, arg2);
}
- put_cpu();
+ preempt_enable();
break;
- case ARCH_SET_FS:
- /* Not strictly needed for fs, but do it for symmetry
- with gs */
- if (addr >= TASK_SIZE_OF(task))
+ }
+ case ARCH_SET_FS: {
+ /*
+ * Not strictly needed for %fs, but do it for symmetry
+ * with %gs
+ */
+ if (unlikely(arg2 >= TASK_SIZE_MAX))
return -EPERM;
- cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
- switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, FS_TLS, addr);
- if (doit) {
- load_TLS(&task->thread, cpu);
- loadsegment(fs, FS_TLS_SEL);
- }
- task->thread.fsindex = FS_TLS_SEL;
- task->thread.fs = 0;
+
+ preempt_disable();
+ /*
+ * Set the selector to 0 for the same reason
+ * as %gs above.
+ */
+ if (task == current) {
+ loadseg(FS, 0);
+ x86_fsbase_write_cpu(arg2);
+
+ /*
+ * On non-FSGSBASE systems, save_base_legacy() expects
+ * that we also fill in thread.fsbase.
+ */
+ task->thread.fsbase = arg2;
} else {
task->thread.fsindex = 0;
- task->thread.fs = addr;
- if (doit) {
- /* set the selector to 0 to not confuse
- __switch_to */
- loadsegment(fs, 0);
- ret = checking_wrmsrl(MSR_FS_BASE, addr);
- }
+ x86_fsbase_write_task(task, arg2);
}
- put_cpu();
+ preempt_enable();
break;
+ }
case ARCH_GET_FS: {
- unsigned long base;
- if (task->thread.fsindex == FS_TLS_SEL)
- base = read_32bit_tls(task, FS_TLS);
- else if (doit)
- rdmsrl(MSR_FS_BASE, base);
- else
- base = task->thread.fs;
- ret = put_user(base, (unsigned long __user *)addr);
+ unsigned long base = x86_fsbase_read_task(task);
+
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
- unsigned long base;
- unsigned gsindex;
- if (task->thread.gsindex == GS_TLS_SEL)
- base = read_32bit_tls(task, GS_TLS);
- else if (doit) {
- savesegment(gs, gsindex);
- if (gsindex)
- rdmsrl(MSR_KERNEL_GS_BASE, base);
- else
- base = task->thread.gs;
- } else
- base = task->thread.gs;
- ret = put_user(base, (unsigned long __user *)addr);
+ unsigned long base = x86_gsbase_read_task(task);
+
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, arg2);
+# endif
+# ifdef CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, arg2);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, arg2);
+#endif
+#ifdef CONFIG_ADDRESS_MASKING
+ case ARCH_GET_UNTAG_MASK:
+ return put_user(task->mm->context.untag_mask,
+ (unsigned long __user *)arg2);
+ case ARCH_ENABLE_TAGGED_ADDR:
+ return prctl_enable_tagged_addr(task->mm, arg2);
+ case ARCH_FORCE_TAGGED_SVA:
+ if (current != task)
+ return -EINVAL;
+ set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
+ return 0;
+ case ARCH_GET_MAX_TAG_BITS:
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return put_user(0, (unsigned long __user *)arg2);
+ else
+ return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
+#endif
+ case ARCH_SHSTK_ENABLE:
+ case ARCH_SHSTK_DISABLE:
+ case ARCH_SHSTK_LOCK:
+ case ARCH_SHSTK_UNLOCK:
+ case ARCH_SHSTK_STATUS:
+ return shstk_prctl(task, option, arg2);
default:
ret = -EINVAL;
break;
@@ -637,14 +980,3 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
return ret;
}
-
-long sys_arch_prctl(int code, unsigned long addr)
-{
- return do_arch_prctl(current, code, addr);
-}
-
-unsigned long KSTK_ESP(struct task_struct *task)
-{
- return (test_tsk_thread_flag(task, TIF_IA32)) ?
- (task_pt_regs(task)->sp) : ((task)->thread.usersp);
-}
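The LAM plumbing above is driven from userspace through arch_prctl(2). Below is a minimal sketch of the intended calling sequence; the ARCH_* constant values are assumed to match asm/prctl.h, and real code should include the uapi header instead of redefining them.

/*
 * Userspace sketch of the LAM enablement flow implemented by
 * prctl_enable_tagged_addr() above.  Must run single threaded, or
 * the kernel returns -EBUSY per the MM_CONTEXT_LOCK_LAM check.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_GET_UNTAG_MASK	0x4001	/* assumed value */
#define ARCH_ENABLE_TAGGED_ADDR	0x4002	/* assumed value */
#define ARCH_GET_MAX_TAG_BITS	0x4003	/* assumed value */

int main(void)
{
	unsigned long bits = 0, mask = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits) || !bits)
		return 1;	/* no LAM: kernel stored 0 or errored */

	if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, bits))
		return 1;	/* -EBUSY if multithreaded, -EINVAL if bits too large */

	syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &mask);
	printf("untag mask: %#lx\n", mask);	/* bits 62:57 clear under LAM_U57 */
	return 0;
}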
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8aa..3dcadc13f09a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* By Ross Biro 1/23/92 */
/*
* Pentium III FXSR, SSE support
@@ -6,13 +7,12 @@
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/ptrace.h>
-#include <linux/regset.h>
-#include <linux/tracehook.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
@@ -21,34 +21,59 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/nospec.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/proto.h>
#include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
+#include <asm/syscall.h>
+#include <asm/fsgsbase.h>
+#include <asm/io_bitmap.h>
#include "tls.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-enum x86_regset {
- REGSET_GENERAL,
- REGSET_FP,
- REGSET_XFP,
- REGSET_IOPERM64 = REGSET_XFP,
- REGSET_XSTATE,
- REGSET_TLS,
- REGSET_IOPERM32,
+enum x86_regset_32 {
+ REGSET32_GENERAL,
+ REGSET32_FP,
+ REGSET32_XFP,
+ REGSET32_XSTATE,
+ REGSET32_TLS,
+ REGSET32_IOPERM,
};
+enum x86_regset_64 {
+ REGSET64_GENERAL,
+ REGSET64_FP,
+ REGSET64_IOPERM,
+ REGSET64_XSTATE,
+ REGSET64_SSP,
+};
+
+#define REGSET_GENERAL \
+({ \
+ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \
+ REGSET32_GENERAL; \
+})
+
+#define REGSET_FP \
+({ \
+ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \
+ REGSET32_FP; \
+})
+
struct pt_regs_offset {
const char *name;
int offset;
@@ -122,21 +147,6 @@ const char *regs_query_register_name(unsigned int offset)
return NULL;
}
-static const int arg_offs_table[] = {
-#ifdef CONFIG_X86_32
- [0] = offsetof(struct pt_regs, ax),
- [1] = offsetof(struct pt_regs, dx),
- [2] = offsetof(struct pt_regs, cx)
-#else /* CONFIG_X86_64 */
- [0] = offsetof(struct pt_regs, di),
- [1] = offsetof(struct pt_regs, si),
- [2] = offsetof(struct pt_regs, dx),
- [3] = offsetof(struct pt_regs, cx),
- [4] = offsetof(struct pt_regs, r8),
- [5] = offsetof(struct pt_regs, r9)
-#endif
-};
-
/*
* does not yet catch signals sent when the child dies.
* in exit.c or in signal.c.
@@ -180,9 +190,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
if (task == current)
- retval = get_user_gs(task_pt_regs(task));
+ savesegment(gs, retval);
else
- retval = task_user_gs(task);
+ retval = task->thread.gs;
}
return retval;
}
@@ -190,6 +200,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
@@ -210,16 +223,14 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct, ss):
if (unlikely(value == 0))
return -EIO;
+ fallthrough;
default:
*pt_regs_access(task_pt_regs(task), offset) = value;
break;
case offsetof(struct user_regs_struct, gs):
- if (task == current)
- set_user_gs(task_pt_regs(task), value);
- else
- task_user_gs(task) = value;
+ task->thread.gs = value;
}
return 0;
@@ -279,50 +290,33 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
if (invalid_selector(value))
return -EIO;
+ /*
+ * Writes to FS and GS will change the stored selector. Whether
+ * this changes the segment base as well depends on whether
+ * FSGSBASE is enabled.
+ */
+
switch (offset) {
case offsetof(struct user_regs_struct,fs):
- /*
- * If this is setting fs as for normal 64-bit use but
- * setting fs_base has implicitly changed it, leave it.
- */
- if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
- task->thread.fs != 0) ||
- (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
- task->thread.fs == 0))
- break;
task->thread.fsindex = value;
- if (task == current)
- loadsegment(fs, task->thread.fsindex);
break;
case offsetof(struct user_regs_struct,gs):
- /*
- * If this is setting gs as for normal 64-bit use but
- * setting gs_base has implicitly changed it, leave it.
- */
- if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
- task->thread.gs != 0) ||
- (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
- task->thread.gs == 0))
- break;
task->thread.gsindex = value;
- if (task == current)
- load_gs_index(task->thread.gsindex);
break;
case offsetof(struct user_regs_struct,ds):
task->thread.ds = value;
- if (task == current)
- loadsegment(ds, task->thread.ds);
break;
case offsetof(struct user_regs_struct,es):
task->thread.es = value;
- if (task == current)
- loadsegment(es, task->thread.es);
break;
/*
@@ -331,18 +325,12 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct,cs):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->cs = value;
-#endif
+ task_pt_regs(task)->cs = value;
break;
case offsetof(struct user_regs_struct,ss):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->ss = value;
-#endif
+ task_pt_regs(task)->ss = value;
break;
}
@@ -400,24 +388,14 @@ static int putreg(struct task_struct *child,
#ifdef CONFIG_X86_64
case offsetof(struct user_regs_struct,fs_base):
- if (value >= TASK_SIZE_OF(child))
+ if (value >= TASK_SIZE_MAX)
return -EIO;
- /*
- * When changing the segment base, use do_arch_prctl
- * to set either thread.fs or thread.fsindex and the
- * corresponding GDT slot.
- */
- if (child->thread.fs != value)
- return do_arch_prctl(child, ARCH_SET_FS, value);
+ x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
- if (value >= TASK_SIZE_OF(child))
+ if (value >= TASK_SIZE_MAX)
return -EIO;
- if (child->thread.gs != value)
- return do_arch_prctl(child, ARCH_SET_GS, value);
+ x86_gsbase_write_task(child, value);
return 0;
#endif
}
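With putreg() now writing the bases via x86_fsbase_write_task()/x86_gsbase_write_task(), a 64-bit debugger can set a stopped tracee's FS base through the ordinary user-area offset. A sketch using only the standard ptrace(2) interface:

#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* Set a stopped tracee's FS base; the kernel rejects values at or
 * above TASK_SIZE_MAX with -EIO, matching the check in putreg(). */
static long set_fs_base(pid_t pid, unsigned long base)
{
	return ptrace(PTRACE_POKEUSER, pid,
		      (void *)offsetof(struct user_regs_struct, fs_base),
		      (void *)base);
}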
@@ -441,34 +419,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
return get_flags(task);
#ifdef CONFIG_X86_64
- case offsetof(struct user_regs_struct, fs_base): {
- /*
- * do_arch_prctl may have used a GDT slot instead of
- * the MSR. To userland, it appears the same either
- * way, except the %fs segment selector might not be 0.
- */
- unsigned int seg = task->thread.fsindex;
- if (task->thread.fs != 0)
- return task->thread.fs;
- if (task == current)
- asm("movl %%fs,%0" : "=r" (seg));
- if (seg != FS_TLS_SEL)
- return 0;
- return get_desc_base(&task->thread.tls_array[FS_TLS]);
- }
- case offsetof(struct user_regs_struct, gs_base): {
- /*
- * Exactly the same here as the %fs handling above.
- */
- unsigned int seg = task->thread.gsindex;
- if (task->thread.gs != 0)
- return task->thread.gs;
- if (task == current)
- asm("movl %%gs,%0" : "=r" (seg));
- if (seg != GS_TLS_SEL)
- return 0;
- return get_desc_base(&task->thread.tls_array[GS_TLS]);
- }
+ case offsetof(struct user_regs_struct, fs_base):
+ return x86_fsbase_read_task(task);
+ case offsetof(struct user_regs_struct, gs_base):
+ return x86_gsbase_read_task(task);
#endif
}
@@ -477,26 +431,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
static int genregs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (kbuf) {
- unsigned long *k = kbuf;
- while (count >= sizeof(*k)) {
- *k++ = getreg(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- unsigned long __user *u = ubuf;
- while (count >= sizeof(*u)) {
- if (__put_user(getreg(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ int reg;
+ for (reg = 0; to.left; reg++)
+ membuf_store(&to, getreg(target, reg * sizeof(unsigned long)));
return 0;
}
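genregs_get() here moves from the pos/count/kbuf/ubuf quadruple to the struct membuf cursor. Below is a self-contained userspace model of that contract, loosely following include/linux/regset.h (the return convention is an assumption of this sketch):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Model of the kernel's membuf cursor: a destination buffer plus the
 * space remaining.  Stores shrink ->left until the regset is full. */
struct membuf { void *p; size_t left; };

static int membuf_write(struct membuf *s, const void *v, size_t size)
{
	if (size > s->left)
		size = s->left;
	memcpy(s->p, v, size);
	s->p = (char *)s->p + size;
	s->left -= size;
	return s->left;		/* remaining space */
}

#define membuf_store(s, v) membuf_write((s), &(v), sizeof(v))

int main(void)
{
	unsigned long regs[4] = { 0 }, val = 0xdeadbeef;	/* LP64 assumed */
	struct membuf to = { regs, sizeof(regs) };

	while (to.left)			/* same shape as genregs_get() */
		membuf_store(&to, val);
	printf("%#lx\n", regs[3]);
	return 0;
}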
@@ -528,7 +468,7 @@ static int genregs_set(struct task_struct *target,
return ret;
}
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -544,7 +484,7 @@ static void ptrace_triggered(struct perf_event *bp, int nmi,
break;
}
- thread->debugreg6 |= (DR_TRAP0 << i);
+ thread->virtual_dr6 |= (DR_TRAP0 << i);
}
/*
@@ -568,30 +508,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[])
return dr7;
}
-static int
-ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
- struct task_struct *tsk, int disabled)
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+ int len, int type, bool disabled)
+{
+ int err, bp_len, bp_type;
+
+ err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+ if (!err) {
+ attr->bp_len = bp_len;
+ attr->bp_type = bp_type;
+ attr->disabled = disabled;
+ }
+
+ return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+ unsigned long addr, bool disabled)
{
- int err;
- int gen_len, gen_type;
struct perf_event_attr attr;
+ int err;
- /*
- * We should have at least an inactive breakpoint at this
- * slot. It means the user is writing dr7 without having
- * written the address register first
- */
- if (!bp)
- return -EINVAL;
+ ptrace_breakpoint_init(&attr);
+ attr.bp_addr = addr;
- err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
if (err)
- return err;
+ return ERR_PTR(err);
+
+ return register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
+}
- attr = bp->attr;
- attr.bp_len = gen_len;
- attr.bp_type = gen_type;
- attr.disabled = disabled;
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ int disabled)
+{
+ struct perf_event_attr attr = bp->attr;
+ int err;
+
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return err;
return modify_user_hw_breakpoint(bp, &attr);
}
@@ -601,61 +559,50 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
*/
static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
{
- struct thread_struct *thread = &(tsk->thread);
+ struct thread_struct *thread = &tsk->thread;
unsigned long old_dr7;
- int i, orig_ret = 0, rc = 0;
- int enabled, second_pass = 0;
- unsigned len, type;
- struct perf_event *bp;
+ bool second_pass = false;
+ int i, rc, ret = 0;
data &= ~DR_CONTROL_RESERVED;
old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
restore:
- /*
- * Loop through all the hardware breakpoints, making the
- * appropriate changes to each.
- */
+ rc = 0;
for (i = 0; i < HBP_NUM; i++) {
- enabled = decode_dr7(data, i, &len, &type);
- bp = thread->ptrace_bps[i];
-
- if (!enabled) {
- if (bp) {
- /*
- * Don't unregister the breakpoints right-away,
- * unless all register_user_hw_breakpoint()
- * requests have succeeded. This prevents
- * any window of opportunity for debug
- * register grabbing by other users.
- */
- if (!second_pass)
- continue;
-
- rc = ptrace_modify_breakpoint(bp, len, type,
- tsk, 1);
- if (rc)
- break;
+ unsigned len, type;
+ bool disabled = !decode_dr7(data, i, &len, &type);
+ struct perf_event *bp = thread->ptrace_bps[i];
+
+ if (!bp) {
+ if (disabled)
+ continue;
+
+ bp = ptrace_register_breakpoint(tsk,
+ len, type, 0, disabled);
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ break;
}
+
+ thread->ptrace_bps[i] = bp;
continue;
}
- rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+ rc = ptrace_modify_breakpoint(bp, len, type, disabled);
if (rc)
break;
}
- /*
- * Make a second pass to free the remaining unused breakpoints
- * or to restore the original breakpoints if an error occurred.
- */
- if (!second_pass) {
- second_pass = 1;
- if (rc < 0) {
- orig_ret = rc;
- data = old_dr7;
- }
+
+ /* Restore if the first pass failed, second_pass shouldn't fail. */
+ if (rc && !WARN_ON(second_pass)) {
+ ret = rc;
+ data = old_dr7;
+ second_pass = true;
goto restore;
}
- return ((orig_ret < 0) ? orig_ret : rc);
+
+ return ret;
}
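For reference, decode_dr7(), which the restore loop above relies on, unpacks two enable bits per slot at the bottom of DR7 plus one 4-bit type/len nibble per slot starting at bit 16. A self-contained model mirroring arch/x86/kernel/hw_breakpoint.c:

#define DR_CONTROL_SHIFT	16	/* first len/type nibble */
#define DR_CONTROL_SIZE		4	/* bits per len/type nibble */
#define DR_ENABLE_SIZE		2	/* enable bits per breakpoint */

/* Returns nonzero if breakpoint 'bpnum' is enabled in 'dr7' and fills
 * in its encoded length and type, as the loop above expects. */
static int decode_dr7_model(unsigned long dr7, int bpnum,
			    unsigned int *len, unsigned int *type)
{
	int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);

	*len  = (bp_info & 0xc) | 0x40;
	*type = (bp_info & 0x3) | 0x80;
	return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
}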
/*
@@ -663,18 +610,18 @@ restore:
*/
static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
{
- struct thread_struct *thread = &(tsk->thread);
+ struct thread_struct *thread = &tsk->thread;
unsigned long val = 0;
if (n < HBP_NUM) {
- struct perf_event *bp;
- bp = thread->ptrace_bps[n];
- if (!bp)
- return 0;
- val = bp->hw.info.address;
+ int index = array_index_nospec(n, HBP_NUM);
+ struct perf_event *bp = thread->ptrace_bps[index];
+
+ if (bp)
+ val = bp->hw.info.address;
} else if (n == 6) {
- val = thread->debugreg6;
- } else if (n == 7) {
+ val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */
+ } else if (n == 7) {
val = thread->ptrace_dr7;
}
return val;
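The XOR with DR6_RESERVED works because several DR6 bits architecturally read as 1 when idle; the kernel keeps thread->virtual_dr6 in uniform positive polarity and flips only at this ptrace boundary. A sketch of the round trip, assuming the 0xFFFF0FF0 value from asm/debugreg.h:

#define DR6_RESERVED	0xFFFF0FF0	/* bits that architecturally read as 1 */

/* With no debug events recorded, virtual_dr6 == 0 and the tracer reads
 * the architectural idle value 0xFFFF0FF0; a write of that value maps
 * back to 0 internally. */
static unsigned long dr6_to_user(unsigned long virtual_dr6)
{
	return virtual_dr6 ^ DR6_RESERVED;	/* positive -> architectural */
}

static unsigned long dr6_from_user(unsigned long user_dr6)
{
	return user_dr6 ^ DR6_RESERVED;		/* architectural -> positive */
}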
@@ -683,24 +630,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
unsigned long addr)
{
- struct perf_event *bp;
struct thread_struct *t = &tsk->thread;
- struct perf_event_attr attr;
-
- if (!t->ptrace_bps[nr]) {
- ptrace_breakpoint_init(&attr);
- /*
- * Put stub len and type to register (reserve) an inactive but
- * correct bp
- */
- attr.bp_addr = addr;
- attr.bp_len = HW_BREAKPOINT_LEN_1;
- attr.bp_type = HW_BREAKPOINT_W;
- attr.disabled = 1;
-
- bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+ struct perf_event *bp = t->ptrace_bps[nr];
+ int err = 0;
+ if (!bp) {
/*
+ * Put stub len and type to create an inactive but correct bp.
+ *
* CHECKME: the previous code returned -EIO if the addr wasn't
* a valid task virtual addr. The new one will return -EINVAL in
* this case.
@@ -709,55 +646,43 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
* writing for the user. And anyway this is the previous
* behaviour.
*/
+ bp = ptrace_register_breakpoint(tsk,
+ X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+ addr, true);
if (IS_ERR(bp))
- return PTR_ERR(bp);
-
- t->ptrace_bps[nr] = bp;
+ err = PTR_ERR(bp);
+ else
+ t->ptrace_bps[nr] = bp;
} else {
- int err;
-
- bp = t->ptrace_bps[nr];
+ struct perf_event_attr attr = bp->attr;
- attr = bp->attr;
attr.bp_addr = addr;
err = modify_user_hw_breakpoint(bp, &attr);
- if (err)
- return err;
}
-
- return 0;
+ return err;
}
/*
* Handle PTRACE_POKEUSR calls for the debug register area.
*/
-int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+ unsigned long val)
{
- struct thread_struct *thread = &(tsk->thread);
- int rc = 0;
-
+ struct thread_struct *thread = &tsk->thread;
/* There are no DR4 or DR5 registers */
- if (n == 4 || n == 5)
- return -EIO;
+ int rc = -EIO;
- if (n == 6) {
- thread->debugreg6 = val;
- goto ret_path;
- }
if (n < HBP_NUM) {
rc = ptrace_set_breakpoint_addr(tsk, n, val);
- if (rc)
- return rc;
- }
- /* All that's left is DR7 */
- if (n == 7) {
+ } else if (n == 6) {
+ thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */
+ rc = 0;
+ } else if (n == 7) {
rc = ptrace_write_dr7(tsk, val);
if (!rc)
thread->ptrace_dr7 = val;
}
-
-ret_path:
return rc;
}
@@ -768,20 +693,21 @@ ret_path:
static int ioperm_active(struct task_struct *target,
const struct user_regset *regset)
{
- return target->thread.io_bitmap_max / regset->size;
+ struct io_bitmap *iobm = target->thread.io_bitmap;
+
+ return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0;
}
static int ioperm_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (!target->thread.io_bitmap_ptr)
+ struct io_bitmap *iobm = target->thread.io_bitmap;
+
+ if (!iobm)
return -ENXIO;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- target->thread.io_bitmap_ptr,
- 0, IO_BITMAP_BYTES);
+ return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES);
}
/*
@@ -792,28 +718,36 @@ static int ioperm_get(struct task_struct *target,
void ptrace_disable(struct task_struct *child)
{
user_disable_single_step(child);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-#endif
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
{
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
+#ifdef CONFIG_X86_64
+ /* This is native 64-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+ /* This is native 32-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
unsigned long tmp;
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
tmp = 0; /* Default return condition */
@@ -830,8 +764,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
ret = -EIO;
- if ((addr & (sizeof(data) - 1)) || addr < 0 ||
- addr >= sizeof(struct user))
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
break;
if (addr < sizeof(struct user_regs_struct))
@@ -846,28 +779,28 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -875,30 +808,30 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
#ifdef CONFIG_X86_32
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case PTRACE_GET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_get_thread_area(child, addr,
- (struct user_desc __user *) data);
+ (struct user_desc __user *)data);
break;
case PTRACE_SET_THREAD_AREA:
- if (addr < 0)
+ if ((int) addr < 0)
return -EIO;
ret = do_set_thread_area(child, addr,
- (struct user_desc __user *) data, 0);
+ (struct user_desc __user *)data, 0);
break;
#endif
@@ -907,7 +840,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
Works just like arch_prctl, except that the arguments
are reversed. */
case PTRACE_ARCH_PRCTL:
- ret = do_arch_prctl(child, data, addr);
+ ret = do_arch_prctl_64(child, data, addr);
break;
#endif
@@ -940,14 +873,39 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
static int putreg32(struct task_struct *child, unsigned regno, u32 value)
{
struct pt_regs *regs = task_pt_regs(child);
+ int ret;
switch (regno) {
SEG32(cs);
SEG32(ds);
SEG32(es);
- SEG32(fs);
- SEG32(gs);
+
+ /*
+ * A 32-bit ptracer on a 64-bit kernel expects that writing
+ * FS or GS will also update the base. This is needed for
+ * operations like PTRACE_SETREGS to fully restore a saved
+ * CPU state.
+ */
+
+ case offsetof(struct user32, regs.fs):
+ ret = set_segment_reg(child,
+ offsetof(struct user_regs_struct, fs),
+ value);
+ if (ret == 0)
+ child->thread.fsbase =
+ x86_fsgsbase_read_task(child, value);
+ return ret;
+
+ case offsetof(struct user32, regs.gs):
+ ret = set_segment_reg(child,
+ offsetof(struct user_regs_struct, gs),
+ value);
+ if (ret == 0)
+ child->thread.gsbase =
+ x86_fsgsbase_read_task(child, value);
+ return ret;
+
SEG32(ss);
R32(ebx, bx);
@@ -962,15 +920,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
case offsetof(struct user32, regs.orig_eax):
/*
- * A 32-bit debugger setting orig_eax means to restore
- * the state of the task restarting a 32-bit syscall.
- * Make sure we interpret the -ERESTART* codes correctly
- * in case the task is not actually still sitting at the
- * exit from a 32-bit syscall with TS_COMPAT still set.
+ * Warning: bizarre corner case fixup here. A 32-bit
+ * debugger setting orig_eax to -1 wants to disable
+ * syscall restart. Make sure that the syscall
+ * restart code sign-extends orig_ax. Also make sure
+ * we interpret the -ERESTART* codes correctly if
+ * loaded into regs->ax in case the task is not
+ * actually still sitting at the exit from a 32-bit
+ * syscall with TS_COMPAT still set.
*/
regs->orig_ax = value;
- if (syscall_get_nr(child, regs) >= 0)
- task_thread_info(child)->status |= TS_COMPAT;
+ if (syscall_get_nr(child, regs) != -1)
+ child->thread_info.status |= TS_I386_REGS_POKED;
break;
case offsetof(struct user32, regs.eflags):
@@ -1060,28 +1021,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
static int genregs32_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count >= sizeof(*k)) {
- getreg32(target, pos, k++);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count >= sizeof(*u)) {
- compat_ulong_t word;
- getreg32(target, pos, &word);
- if (__put_user(word, u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ int reg;
+ for (reg = 0; to.left; reg++) {
+ u32 val;
+ getreg32(target, reg * 4, &val);
+ membuf_store(&to, val);
+ }
return 0;
}
@@ -1113,8 +1061,8 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
-long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
- compat_ulong_t caddr, compat_ulong_t cdata)
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
{
unsigned long addr = caddr;
unsigned long data = cdata;
@@ -1158,13 +1106,13 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
@@ -1178,36 +1126,159 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
return ret;
}
+#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_IA32_EMULATION */
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+ compat_long_t request, compat_ulong_t caddr,
+ compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+
+ switch (request) {
+ /* Read 32 bits at location addr in the USER area. Only the lower
+ 32 bits of segment and debug registers are returned. */
+ case PTRACE_PEEKUSR: {
+ u32 tmp;
+
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, (__u32 __user *)datap);
+ break;
+ }
+
+ /* Write the word at location addr in the USER area. Only segment
+ and debug registers may be updated, with the upper 32 bits
+ zero-extended. */
+ case PTRACE_POKEUSR:
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ &user_x86_64_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ &user_x86_64_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ &user_x86_64_view,
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ &user_x86_64_view,
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_COMPAT
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (!in_ia32_syscall())
+ return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+ return 0;
+#endif
+}
+#endif /* CONFIG_COMPAT */
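The dispatch above relies on x32 syscalls entering through the 64-bit path with the x32 marker bit set in the syscall number, which makes in_ia32_syscall() false; only real int 0x80/sysenter callers reach ia32_arch_ptrace(). A sketch of the distinguishing test, assuming the conventional __X32_SYSCALL_BIT value:

#define __X32_SYSCALL_BIT	0x40000000UL	/* x32 ABI marker bit */

/* True for a syscall number issued by an x32 process: every x32
 * syscall number carries this bit in addition to its table index. */
static int is_x32_syscall_nr(unsigned long nr)
{
	return (nr & __X32_SYSCALL_BIT) != 0;
}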
#ifdef CONFIG_X86_64
-static struct user_regset x86_64_regsets[] __read_mostly = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .get = genregs_get, .set = genregs_set
+static struct user_regset x86_64_regsets[] __ro_after_init = {
+ [REGSET64_GENERAL] = {
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
+ .n = sizeof(struct user_regs_struct) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .regset_get = genregs_get,
+ .set = genregs_set
+ },
+ [REGSET64_FP] = {
+ USER_REGSET_NOTE_TYPE(PRFPREG),
+ .n = sizeof(struct fxregs_state) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_struct) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ [REGSET64_XSTATE] = {
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
- .size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .get = xstateregs_get,
- .set = xstateregs_set
+ [REGSET64_IOPERM] = {
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
- [REGSET_IOPERM64] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_LONGS,
- .size = sizeof(long), .align = sizeof(long),
- .active = ioperm_active, .get = ioperm_get
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ [REGSET64_SSP] = {
+ USER_REGSET_NOTE_TYPE(X86_SHSTK),
+ .n = 1,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = ssp_active,
+ .regset_get = ssp_get,
+ .set = ssp_set
},
+#endif
};
static const struct user_regset_view user_x86_64_view = {
@@ -1221,50 +1292,61 @@ static const struct user_regset_view user_x86_64_view = {
#define genregs32_get genregs_get
#define genregs32_set genregs_set
-#define user_i387_ia32_struct user_i387_struct
-#define user32_fxsr_struct user_fxsr_struct
-
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static struct user_regset x86_32_regsets[] __read_mostly = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct32) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .get = genregs32_get, .set = genregs32_set
+static struct user_regset x86_32_regsets[] __ro_after_init = {
+ [REGSET32_GENERAL] = {
+ USER_REGSET_NOTE_TYPE(PRSTATUS),
+ .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .regset_get = genregs32_get,
+ .set = genregs32_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
+ [REGSET32_FP] = {
+ USER_REGSET_NOTE_TYPE(PRFPREG),
+ .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_fpregs_active,
+ .regset_get = fpregs_get,
+ .set = fpregs_set
},
- [REGSET_XFP] = {
- .core_note_type = NT_PRXFPREG,
- .n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ [REGSET32_XFP] = {
+ USER_REGSET_NOTE_TYPE(PRXFPREG),
+ .n = sizeof(struct fxregs_state) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
- .size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .get = xstateregs_get,
- .set = xstateregs_set
+ [REGSET32_XSTATE] = {
+ USER_REGSET_NOTE_TYPE(X86_XSTATE),
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_TLS] = {
- .core_note_type = NT_386_TLS,
- .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
- .size = sizeof(struct user_desc),
- .align = sizeof(struct user_desc),
- .active = regset_tls_active,
- .get = regset_tls_get, .set = regset_tls_set
+ [REGSET32_TLS] = {
+ USER_REGSET_NOTE_TYPE(386_TLS),
+ .n = GDT_ENTRY_TLS_ENTRIES,
+ .bias = GDT_ENTRY_TLS_MIN,
+ .size = sizeof(struct user_desc),
+ .align = sizeof(struct user_desc),
+ .active = regset_tls_active,
+ .regset_get = regset_tls_get,
+ .set = regset_tls_set
},
- [REGSET_IOPERM32] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_BYTES / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = ioperm_active, .get = ioperm_get
+ [REGSET32_IOPERM] = {
+ USER_REGSET_NOTE_TYPE(386_IOPERM),
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
};
@@ -1280,21 +1362,40 @@ static const struct user_regset_view user_x86_32_view = {
*/
u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
{
#ifdef CONFIG_X86_64
- x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+ x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64);
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
- x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+ x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64);
#endif
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
+/*
+ * This is used by the core dump code to decide which regset to dump. The
+ * core dump code writes out the resulting .e_machine and the corresponding
+ * regsets. This is suboptimal if the task is messing around with its CS.L
+ * field, but at worst the core dump will end up missing some information.
+ *
+ * Unfortunately, it is also used by the broken PTRACE_GETREGSET and
+ * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
+ * no way to make sure that the e_machine they use matches the caller's
+ * expectations. The result is that the data format returned by
+ * PTRACE_GETREGSET depends on the returned CS field (and even the offset
+ * of the returned CS field depends on its value!) and the data format
+ * accepted by PTRACE_SETREGSET is determined by the old CS value. The
+ * upshot is that it is basically impossible to use these APIs correctly.
+ *
+ * The best way to fix it in the long run would probably be to add new
+ * improved ptrace() APIs to read and write registers reliably, possibly by
+ * allowing userspace to select the ELF e_machine variant that they expect.
+ */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
return &user_x86_32_view;
@@ -1304,113 +1405,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
-static void fill_sigtrap_info(struct task_struct *tsk,
- struct pt_regs *regs,
- int error_code, int si_code,
- struct siginfo *info)
-{
- tsk->thread.trap_no = 1;
- tsk->thread.error_code = error_code;
-
- memset(info, 0, sizeof(*info));
- info->si_signo = SIGTRAP;
- info->si_code = si_code;
- info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
-}
-
-void user_single_step_siginfo(struct task_struct *tsk,
- struct pt_regs *regs,
- struct siginfo *info)
+void send_sigtrap(struct pt_regs *regs, int error_code, int si_code)
{
- fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
-}
+ struct task_struct *tsk = current;
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
- int error_code, int si_code)
-{
- struct siginfo info;
+ tsk->thread.trap_nr = X86_TRAP_DB;
+ tsk->thread.error_code = error_code;
- fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
/* Send us the fake SIGTRAP */
- force_sig_info(SIGTRAP, &info, tsk);
-}
-
-
-#ifdef CONFIG_X86_32
-# define IS_IA32 1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32 is_compat_task()
-#else
-# define IS_IA32 0
-#endif
-
-/*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
- */
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
-{
- long ret = 0;
-
- /*
- * If we stepped into a sysenter/syscall insn, it trapped in
- * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
- * If user-mode had set TF itself, then it's still clear from
- * do_debug() and we need to set it again to restore the user
- * state. If we entered on the slow path, TF was already set.
- */
- if (test_thread_flag(TIF_SINGLESTEP))
- regs->flags |= X86_EFLAGS_TF;
-
- /* do the secure computing check first */
- secure_computing(regs->orig_ax);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
- ret = -1L;
-
- if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
- tracehook_report_syscall_entry(regs))
- ret = -1L;
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_enter(regs, regs->orig_ax);
-
- if (unlikely(current->audit_context)) {
- if (IS_IA32)
- audit_syscall_entry(AUDIT_ARCH_I386,
- regs->orig_ax,
- regs->bx, regs->cx,
- regs->dx, regs->si);
-#ifdef CONFIG_X86_64
- else
- audit_syscall_entry(AUDIT_ARCH_X86_64,
- regs->orig_ax,
- regs->di, regs->si,
- regs->dx, regs->r10);
-#endif
- }
-
- return ret ?: regs->orig_ax;
+ force_sig_fault(SIGTRAP, si_code,
+ user_mode(regs) ? (void __user *)regs->ip : NULL);
}
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
+void user_single_step_report(struct pt_regs *regs)
{
- bool step;
-
- if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_exit(regs, regs->ax);
-
- /*
- * If TIF_SYSCALL_EMU is set, we only get here because of
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
- * We already reported this syscall instruction in
- * syscall_trace_enter().
- */
- step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
- !test_thread_flag(TIF_SYSCALL_EMU);
- if (step || test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, step);
+ send_sigtrap(regs, 0, TRAP_BRKPT);
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..b3f81379c2fc 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -1,148 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* paravirtual clock -- common code used by kvm/xen
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <linux/clocksource.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
-#include <asm/pvclock.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/nmi.h>
-/*
- * These are perodically updated
- * xen: magic shared_info page
- * kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
- u8 flags;
-};
+#include <asm/fixmap.h>
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
static u8 valid_flags __read_mostly = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
void pvclock_set_flags(u8 flags)
{
valid_flags = flags;
}
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
+ u64 pv_tsc_khz = 1000000ULL << 32;
- if (shift < 0)
- delta >>= -shift;
+ do_div(pv_tsc_khz, src->tsc_to_system_mul);
+ if (src->tsc_shift < 0)
+ pv_tsc_khz <<= -src->tsc_shift;
else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif defined(__x86_64__)
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
+ pv_tsc_khz >>= src->tsc_shift;
+ return pv_tsc_khz;
}
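pvclock_tsc_khz() (moved unchanged above) inverts the 32.32 fixed-point scale the hypervisor publishes, where ns = ((ticks << tsc_shift) * tsc_to_system_mul) >> 32 for non-negative shifts. A worked userspace model of the same arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint64_t model_tsc_khz(uint32_t mul, int8_t shift)
{
	uint64_t khz = 1000000ULL << 32;

	khz /= mul;
	return shift < 0 ? khz << -shift : khz >> shift;
}

int main(void)
{
	/* A 1 GHz TSC can be encoded as mul = 2^31, shift = 1:
	 * ns = ((ticks << 1) * 2^31) >> 32 = ticks, so 1 tick = 1 ns. */
	printf("%llu kHz\n",
	       (unsigned long long)model_tsc_khz(0x80000000u, 1));
	return 0;	/* prints 1000000, i.e. 1 GHz */
}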
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+void pvclock_touch_watchdogs(void)
{
- u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ reset_hung_task_detector();
}
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
- struct pvclock_vcpu_time_info *src)
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
+void pvclock_resume(void)
{
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- dst->flags = src->flags;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) || (dst->version != src->version));
-
- return dst->version;
+ atomic64_set(&last_value, 0);
}
-unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
{
- u64 pv_tsc_khz = 1000000ULL << 32;
+ unsigned version;
+ u8 flags;
- do_div(pv_tsc_khz, src->tsc_to_system_mul);
- if (src->tsc_shift < 0)
- pv_tsc_khz <<= -src->tsc_shift;
- else
- pv_tsc_khz >>= src->tsc_shift;
- return pv_tsc_khz;
-}
+ do {
+ version = pvclock_read_begin(src);
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
-static atomic64_t last_value = ATOMIC64_INIT(0);
+ return flags & valid_flags;
+}
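pvclock_read_begin()/pvclock_read_retry() form a seqcount-style handshake: the hypervisor makes the version field odd before rewriting the time fields and even afterwards. A simplified model follows; the kernel versions additionally issue virt_rmb() barriers between the version and data accesses.

#include <stdint.h>

/* Snapshot the version with the "update in progress" bit masked off;
 * any concurrent update makes the retry test below fail the compare. */
static uint32_t read_begin(const volatile uint32_t *version)
{
	return *version & ~1u;
}

static int read_retry(const volatile uint32_t *version, uint32_t v)
{
	return *version != v;	/* raced with an update: go around again */
}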
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+static __always_inline
+u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
{
- struct pvclock_shadow_time shadow;
unsigned version;
- cycle_t ret, offset;
+ u64 ret;
u64 last;
+ u8 flags;
do {
- version = pvclock_get_time_values(&shadow, src);
- barrier();
- offset = pvclock_get_nsec_offset(&shadow);
- ret = shadow.system_timestamp + offset;
- barrier();
- } while (version != src->version);
+ version = pvclock_read_begin(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ }
if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
- (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+ (flags & PVCLOCK_TSC_STABLE_BIT))
return ret;
/*
* Assumption here is that last_value, a global accumulator, always goes
* forward. If we are less than that, we should not be much smaller.
- * We assume there is an error marging we're inside, and then the correction
+ * We assume there is an error margin we're inside, and then the correction
* does not sacrifice accuracy.
*
* For reads: global may have changed between test and return,
@@ -153,38 +101,66 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
* updating at the same time, and one of them could be slightly behind,
* making the assumption that last_value always go forward fail to hold.
*/
- last = atomic64_read(&last_value);
+ last = raw_atomic64_read(&last_value);
do {
- if (ret < last)
+ if (ret <= last)
return last;
- last = atomic64_cmpxchg(&last_value, last, ret);
- } while (unlikely(last != ret));
+ } while (!raw_atomic64_try_cmpxchg(&last_value, &last, ret));
return ret;
}
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, true);
+}
+
+noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, false);
+}
+
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
- struct timespec *ts)
+ struct timespec64 *ts)
{
u32 version;
u64 delta;
- struct timespec now;
+ struct timespec64 now;
/* get wallclock at system boot */
do {
version = wall_clock->version;
rmb(); /* fetch version before time */
+ /*
+ * Note: wall_clock->sec is a u32 value, so it can
+ * only store dates between 1970 and 2106. To allow
+ * times beyond that, we need to create a new hypercall
+ * interface with an extended pvclock_wall_clock structure
+ * like ARM has.
+ */
now.tv_sec = wall_clock->sec;
now.tv_nsec = wall_clock->nsec;
rmb(); /* fetch time before checking version */
} while ((wall_clock->version & 1) || (version != wall_clock->version));
delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+ delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
now.tv_nsec = do_div(delta, NSEC_PER_SEC);
now.tv_sec = delta;
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+ set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
+}
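The u32 wall_clock->sec limit called out in the new comment is concrete: the field wraps 2^32 - 1 seconds after the 1970 epoch, in February 2106. A quick check, assuming a platform with 64-bit time_t:

#include <stdio.h>
#include <time.h>

int main(void)
{
	time_t t = 4294967295UL;		/* UINT32_MAX seconds */
	printf("%s", asctime(gmtime(&t)));	/* Sun Feb  7 06:28:15 2106 */
	return 0;
}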
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+ WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK));
+ pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return pvti_cpu0_va;
}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245f..a92f18db9610 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -1,16 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file contains work-arounds for x86 and x86_64 platform bugs.
*/
+#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/irq.h>
#include <asm/hpet.h>
+#include <asm/setup.h>
+#include <asm/mce.h>
+
+#include <linux/platform_data/x86/apple.h>
#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
-static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+static void quirk_intel_irqbalance(struct pci_dev *dev)
{
- u8 config, rev;
+ u8 config;
u16 word;
/* BIOS may enable hardware IRQ balancing for
@@ -18,8 +24,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
* based platforms.
* Disable SW irqbalance/affinity on those platforms.
*/
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev > 0x9)
+ if (dev->revision > 0x9)
return;
/* enable access to config space*/
@@ -88,14 +93,12 @@ static void ich_force_hpet_resume(void)
BUG();
else
printk(KERN_DEBUG "Force enabled HPET at resume\n");
-
- return;
}
static void ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(rcba);
+ u32 rcba;
int err = 0;
if (hpet_address || force_hpet_address)
@@ -110,7 +113,7 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
}
/* use bits 31:14, 16 kB aligned */
- rcba_base = ioremap_nocache(rcba, 0x4000);
+ rcba_base = ioremap(rcba, 0x4000);
if (rcba_base == NULL) {
dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
"cannot force enable HPET\n");
@@ -185,7 +188,7 @@ static void hpet_print_force_info(void)
static void old_ich_force_hpet_resume(void)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (!force_hpet_address || !cached_dev)
return;
@@ -207,7 +210,7 @@ static void old_ich_force_hpet_resume(void)
static void old_ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (hpet_address || force_hpet_address)
return;
@@ -298,7 +301,7 @@ static void vt8237_force_hpet_resume(void)
static void vt8237_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
@@ -344,6 +347,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
vt8237_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
+ vt8237_force_enable_hpet);
static void ati_force_hpet_resume(void)
{
@@ -353,18 +358,22 @@ static void ati_force_hpet_resume(void)
static u32 ati_ixp4x0_rev(struct pci_dev *dev)
{
- u32 d;
- u8 b;
+ int err = 0;
+ u32 d = 0;
+ u8 b = 0;
- pci_read_config_byte(dev, 0xac, &b);
+ err = pci_read_config_byte(dev, 0xac, &b);
b &= ~(1<<5);
- pci_write_config_byte(dev, 0xac, b);
- pci_read_config_dword(dev, 0x70, &d);
+ err |= pci_write_config_byte(dev, 0xac, b);
+ err |= pci_read_config_dword(dev, 0x70, &d);
d |= 1<<8;
- pci_write_config_dword(dev, 0x70, d);
- pci_read_config_dword(dev, 0x8, &d);
+ err |= pci_write_config_dword(dev, 0x70, d);
+ err |= pci_read_config_dword(dev, 0x8, &d);
d &= 0xff;
dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+
+ WARN_ON_ONCE(err);
+
return d;
}
@@ -423,7 +432,7 @@ static void nvidia_force_hpet_resume(void)
static void nvidia_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
@@ -440,7 +449,6 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
force_hpet_address);
cached_dev = dev;
- return;
}
/* ISA Bridges */
@@ -493,6 +501,23 @@ void force_hpet_resume(void)
}
/*
+ * According to the datasheet e6xx systems have the HPET hardwired to
+ * 0xfed00000
+ */
+static void e6xx_force_enable_hpet(struct pci_dev *dev)
+{
+ if (hpet_address || force_hpet_address)
+ return;
+
+ force_hpet_address = 0xFED00000;
+ force_hpet_resume_type = NONE_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
+ e6xx_force_enable_hpet);
+
+/*
* HPET MSI on some boards (ATI SB700/SB800) has side effect on
* floppy DMA. Disable HPET MSI on such platforms.
* See erratum #27 (Misinterpreted MSI Requests May Result in
@@ -501,7 +526,7 @@ void force_hpet_resume(void)
*/
static void force_disable_hpet_msi(struct pci_dev *unused)
{
- hpet_msi_disable = 1;
+ hpet_msi_disable = true;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
@@ -511,7 +536,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
/* Set correct numa_node information for AMD NB functions */
-static void __init quirk_amd_nb_node(struct pci_dev *dev)
+static void quirk_amd_nb_node(struct pci_dev *dev)
{
struct pci_dev *nb_ht;
unsigned int devfn;
@@ -524,7 +549,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
return;
pci_read_config_dword(nb_ht, 0x60, &val);
- node = val & 7;
+ node = pcibus_to_node(dev->bus) | (val & 7);
/*
* Some hardware may return an invalid node ID,
* so check it first:
@@ -552,4 +577,95 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
quirk_amd_nb_node);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
+ quirk_amd_nb_node);
+
#endif
+
+#ifdef CONFIG_PCI
+/*
+ * Processor does not ensure DRAM scrub read/write sequence
+ * is atomic wrt accesses to CC6 save state area. Therefore
+ * if a concurrent scrub read/write access is to same address
+ * the entry may appear as if it is not written. This quirk
+ * applies to Fam16h models 00h-0Fh
+ *
+ * See "Revision Guide" for AMD F16h models 00h-0fh,
+ * document 51810 rev. 3.04, Nov 2013
+ */
+static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
+{
+ u32 val;
+
+ /*
+ * Suggested workaround:
+ * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
+ */
+ pci_read_config_dword(dev, 0x58, &val);
+ if (val & 0x1F) {
+ val &= ~(0x1F);
+ pci_write_config_dword(dev, 0x58, val);
+ }
+
+ pci_read_config_dword(dev, 0x5C, &val);
+ if (val & BIT(0)) {
+ val &= ~BIT(0);
+ pci_write_config_dword(dev, 0x5c, val);
+ }
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
+ amd_disable_seq_and_redirect_scrub);
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ enable_copy_mc_fragile();
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0, capid5;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+ pci_read_config_dword(pdev, 0x98, &capid5);
+
+ /*
+ * CAPID0{7:6} indicate whether this is an advanced RAS SKU
+ * CAPID5{8:5} indicate that various NVDIMM usage modes are
+ * enabled, so memory machine check recovery is also enabled.
+ */
+ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
+ enable_copy_mc_fragile();
+
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
+
+bool x86_apple_machine;
+EXPORT_SYMBOL(x86_apple_machine);
+
+void __init early_platform_quirks(void)
+{
+ x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") ||
+ dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc.");
+}
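
The x86_apple_machine flag exported above is meant to be consumed elsewhere in the tree by code that needs Mac-specific behaviour. A minimal sketch of such a consumer, with a hypothetical helper name:

	/* sketch: gate a Mac-only workaround (apply_mac_workaround() is made up) */
	if (x86_apple_machine)
		apply_mac_workaround();
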
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..6032fa9ec753 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,4 +1,7 @@
-#include <linux/module.h>
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/pm.h>
@@ -6,25 +9,30 @@
#include <linux/dmi.h>
#include <linux/sched.h>
#include <linux/tboot.h>
+#include <linux/delay.h>
+#include <linux/objtool.h>
+#include <linux/pgtable.h>
+#include <linux/kexec.h>
+#include <linux/kvm_types.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
#include <asm/pci_x86.h>
-#include <asm/virtext.h>
#include <asm/cpu.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
-#ifdef CONFIG_X86_32
-# include <linux/ctype.h>
-# include <linux/mc146818rtc.h>
-#else
-# include <asm/x86_init.h>
-#endif
+#include <linux/ctype.h>
+#include <linux/mc146818rtc.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
+#include <asm/efi.h>
/*
* Power off function, if any
@@ -32,16 +40,8 @@
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
-static const struct desc_ptr no_idt = {};
-static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
-int reboot_force;
-
-#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
-static int reboot_cpu = -1;
-#endif
-
-/* This is set if we need to go through the 'emergency' path.
+/*
+ * This is set if we need to go through the 'emergency' path.
* When machine_emergency_restart() is called, we may be on
* an inconsistent state and won't be able to do a clean cleanup
*/
@@ -50,91 +50,242 @@ static int reboot_emergency;
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- warm Don't set the cold reboot flag
- cold Set the cold reboot flag
- bios Reboot by jumping through the BIOS (only for X86_32)
- smp Reboot by executing reset on BSP or other CPU (only for X86_32)
- triple Force a triple fault (init)
- kbd Use the keyboard controller. cold reset (default)
- acpi Use the RESET_REG in the FADT
- efi Use efi reset_system runtime service
- pci Use the so-called "PCI reset register", CF9
- force Avoid anything that could hang.
+/*
+ * Reboot options and system auto-detection code provided by
+ * Dell Inc. so their systems "just work". :-)
*/
-static int __init reboot_setup(char *str)
-{
- for (;;) {
- switch (*str) {
- case 'w':
- reboot_mode = 0x1234;
- break;
- case 'c':
- reboot_mode = 0;
- break;
-
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_SMP
- case 's':
- if (isdigit(*(str+1))) {
- reboot_cpu = (int) (*(str+1) - '0');
- if (isdigit(*(str+2)))
- reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
- }
- /* we will leave sorting out the final value
- when we are ready to reboot, since we might not
- have set up boot_cpu_id or smp_num_cpu */
- break;
-#endif /* CONFIG_SMP */
-
- case 'b':
-#endif
- case 'a':
- case 'k':
- case 't':
- case 'e':
- case 'p':
- reboot_type = *str;
- break;
+/*
+ * Some machines require the "reboot=a" commandline option.
+ */
+static int __init set_acpi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_ACPI) {
+ reboot_type = BOOT_ACPI;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "ACPI");
+ }
+ return 0;
+}
- case 'f':
- reboot_force = 1;
- break;
- }
+/*
+ * Some machines require the "reboot=b" or "reboot=k" commandline options;
+ * this quirk makes that automatic.
+ */
+static int __init set_bios_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_BIOS) {
+ reboot_type = BOOT_BIOS;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "BIOS");
+ }
+ return 0;
+}
- str = strchr(str, ',');
- if (str)
- str++;
- else
- break;
+/*
+ * Some machines don't handle the default ACPI reboot method and
+ * require the EFI reboot method:
+ */
+static int __init set_efi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) {
+ reboot_type = BOOT_EFI;
+ pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident);
}
- return 1;
+ return 0;
}
-__setup("reboot=", reboot_setup);
+void __noreturn machine_real_restart(unsigned int type)
+{
+ local_irq_disable();
+
+ /*
+ * Write zero to CMOS register number 0x0f, which the BIOS POST
+ * routine will recognize as telling it to do a proper reboot. (Well
+ * that's what this book in front of me says -- it may only apply to
+ * the Phoenix BIOS though, it's not clear). At the same time,
+ * disable NMIs by setting the top bit in the CMOS address register,
+ * as we're about to do peculiar things to the CPU. I'm not sure if
+ * `outb_p' is needed instead of just `outb'. Use it to be on the
+ * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+ /*
+ * Switch to the trampoline page table.
+ */
+ load_trampoline_pgtable();
+ /* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
-/*
- * Reboot options and system auto-detection code provided by
- * Dell Inc. so their systems "just work". :-)
- */
+ asm volatile("jmpl *%0" : :
+ "rm" (real_mode_header->machine_real_restart_asm),
+ "a" (type));
+#else
+ asm volatile("ljmpl *%0" : :
+ "m" (real_mode_header->machine_real_restart_asm),
+ "D" (type));
+#endif
+ unreachable();
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
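
For reference, the CMOS_WRITE(0x00, 0x8f) above boils down to the classic RTC index/data port sequence. A sketch assuming the usual 0x70/0x71 port pair (not the literal <asm/mc146818rtc.h> expansion):

	outb_p(0x8f, 0x70);	/* select CMOS reg 0x0f; bit 7 keeps NMIs masked */
	outb_p(0x00, 0x71);	/* shutdown status 0x00: full POST on next reset */
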
/*
- * Some machines require the "reboot=b" commandline option,
- * this quirk makes that automatic.
+ * Some Apple MacBooks and MacBookPros need reboot=p to be able to reboot.
*/
-static int __init set_bios_reboot(const struct dmi_system_id *d)
+static int __init set_pci_reboot(const struct dmi_system_id *d)
{
- if (reboot_type != BOOT_BIOS) {
- reboot_type = BOOT_BIOS;
- printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
+ if (reboot_type != BOOT_CF9_FORCE) {
+ reboot_type = BOOT_CF9_FORCE;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "PCI");
}
return 0;
}
-static struct dmi_system_id __initdata reboot_dmi_table[] = {
+static int __init set_kbd_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_KBD) {
+ reboot_type = BOOT_KBD;
+ pr_info("%s series board detected. Selecting %s-method for reboot.\n",
+ d->ident, "KBD");
+ }
+ return 0;
+}
+
+/*
+ * This is a single dmi_table handling all reboot quirks.
+ */
+static const struct dmi_system_id reboot_dmi_table[] __initconst = {
+
+ /* Acer */
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_kbd_reboot,
+ .ident = "Acer Aspire One A110",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ },
+ },
+ { /* Handle reboot issue on Acer TravelMate X514-51T */
+ .callback = set_efi_reboot,
+ .ident = "Acer TravelMate X514-51T",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"),
+ },
+ },
+
+ /* Apple */
+ { /* Handle problems with rebooting on Apple MacBook5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBook6,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook6,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBookPro5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBookPro5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
+
+ /* ASRock */
+ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */
+ .callback = set_pci_reboot,
+ .ident = "ASRock Q1900DC-ITX",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+ DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+ },
+ },
+
+ /* ASUS */
+ { /* Handle problems with rebooting on ASUS P4S800 */
+ .callback = set_bios_reboot,
+ .ident = "ASUS P4S800",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TA */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TA",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TAW */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TAW",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"),
+ },
+ },
+
+ /* Certec */
+ { /* Handle problems with rebooting on Certec BPC600 */
+ .callback = set_pci_reboot,
+ .ident = "Certec BPC600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Certec"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"),
+ },
+ },
+
+ /* Dell */
+ { /* Handle problems with rebooting on Dell DXP061 */
+ .callback = set_bios_reboot,
+ .ident = "Dell DXP061",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ },
+ },
{ /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
.ident = "Dell E520",
@@ -143,23 +294,57 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
},
},
- { /* Handle problems with rebooting on Dell 1300's */
+ { /* Handle problems with rebooting on the Latitude E5410. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5410",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
.callback = set_bios_reboot,
- .ident = "Dell PowerEdge 1300",
+ .ident = "Dell OptiPlex 330",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
- { /* Handle problems with rebooting on Dell 300's */
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
.callback = set_bios_reboot,
- .ident = "Dell PowerEdge 300",
+ .ident = "Dell OptiPlex 360",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -167,7 +352,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -176,7 +361,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -185,31 +370,38 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 330",
+ .ident = "Dell OptiPlex 760",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
- DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 360",
+ { /* Handle problems with rebooting on the OptiPlex 990. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990 BIOS A0x",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
- DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A0"),
},
},
- { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
+ { /* Handle problems with rebooting on Dell 300's */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 760",
+ .ident = "Dell PowerEdge 300",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
- DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 1300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 1300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
},
},
{ /* Handle problems with rebooting on Dell 2400's */
@@ -220,6 +412,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
},
},
+ { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
+ .callback = set_pci_reboot,
+ .ident = "Dell PowerEdge C6100",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ },
+ },
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Precision M6600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
{ /* Handle problems with rebooting on Dell T5400's */
.callback = set_bios_reboot,
.ident = "Dell Precision T5400",
@@ -236,14 +444,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
},
},
- { /* Handle problems with rebooting on HP laptops */
- .callback = set_bios_reboot,
- .ident = "HP Compaq Laptop",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
- },
- },
{ /* Handle problems with rebooting on Dell XPS710 */
.callback = set_bios_reboot,
.ident = "Dell XPS710",
@@ -252,240 +452,71 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
},
},
- { /* Handle problems with rebooting on Dell DXP061 */
- .callback = set_bios_reboot,
- .ident = "Dell DXP061",
+ { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */
+ .callback = set_acpi_reboot,
+ .ident = "Dell OptiPlex 7450 AIO",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"),
},
},
- { /* Handle problems with rebooting on Sony VGN-Z540N */
+
+ /* Hewlett-Packard */
+ { /* Handle problems with rebooting on HP laptops */
.callback = set_bios_reboot,
- .ident = "Sony VGN-Z540N",
+ .ident = "HP Compaq Laptop",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
- { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
- .callback = set_bios_reboot,
- .ident = "CompuLab SBC-FITPC2",
+
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
- DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
},
},
- { /* Handle problems with rebooting on ASUS P4S800 */
+
+ /* Sony */
+ { /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
- .ident = "ASUS P4S800",
+ .ident = "Sony VGN-Z540N",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
},
},
+
{ }
};
static int __init reboot_init(void)
{
- dmi_check_system(reboot_dmi_table);
- return 0;
-}
-core_initcall(reboot_init);
-
-/* The following code and data reboots the machine by switching to real
- mode and jumping to the BIOS reset entry point, as if the CPU has
- really been reset. The previous version asked the keyboard
- controller to pulse the CPU reset line, which is more thorough, but
- doesn't work with at least one type of 486 motherboard. It is easy
- to stop this code working; hence the copious comments. */
-static const unsigned long long
-real_mode_gdt_entries [3] =
-{
- 0x0000000000000000ULL, /* Null descriptor */
- 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
- 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
-};
-
-static const struct desc_ptr
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, 0 };
-
-/* This is 16-bit protected mode code to disable paging and the cache,
- switch to real mode and jump to the BIOS reset code.
-
- The instruction that switches to real mode by writing to CR0 must be
- followed immediately by a far jump instruction, which set CS to a
- valid value for real mode, and flushes the prefetch queue to avoid
- running instructions that have already been decoded in protected
- mode.
-
- Clears all the flags except ET, especially PG (paging), PE
- (protected-mode enable) and TS (task switch for coprocessor state
- save). Flushes the TLB after paging has been disabled. Sets CD and
- NW, to disable the cache on a 486, and invalidates the cache. This
- is more like the state of a 486 after reset. I don't know if
- something else should be done for other chips.
-
- More could be done here to set up the registers as if a CPU reset had
- occurred; hopefully real BIOSs don't assume much. */
-static const unsigned char real_mode_switch [] =
-{
- 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
- 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
- 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */
- 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */
- 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
- 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
- 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
- 0x74, 0x02, /* jz f */
- 0x0f, 0x09, /* wbinvd */
- 0x24, 0x10, /* f: andb $0x10,al */
- 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
-};
-static const unsigned char jump_to_bios [] =
-{
- 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
-};
-
-/*
- * Switch to real mode and then execute the code
- * specified by the code and length parameters.
- * We assume that length will aways be less that 100!
- */
-void machine_real_restart(const unsigned char *code, int length)
-{
- local_irq_disable();
+ int rv;
- /* Write zero to CMOS register number 0x0f, which the BIOS POST
- routine will recognize as telling it to do a proper reboot. (Well
- that's what this book in front of me says -- it may only apply to
- the Phoenix BIOS though, it's not clear). At the same time,
- disable NMIs by setting the top bit in the CMOS address register,
- as we're about to do peculiar things to the CPU. I'm not sure if
- `outb_p' is needed instead of just `outb'. Use it to be on the
- safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ /*
+ * Only do the DMI check if reboot_type hasn't been overridden
+ * on the command line
*/
- spin_lock(&rtc_lock);
- CMOS_WRITE(0x00, 0x8f);
- spin_unlock(&rtc_lock);
-
- /* Remap the kernel at virtual address zero, as well as offset zero
- from the kernel segment. This assumes the kernel segment starts at
- virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
+ if (!reboot_default)
+ return 0;
/*
- * Use `swapper_pg_dir' as our page directory.
+ * The DMI quirks table takes precedence. If no quirks entry
+ * matches and the ACPI Hardware Reduced bit is set and EFI
+ * runtime services are enabled, force EFI reboot.
*/
- load_cr3(swapper_pg_dir);
-
- /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
- this on booting to tell it to "Bypass memory test (also warm
- boot)". This seems like a fairly standard thing that gets set by
- REBOOT.COM programs, and the previous reset routine did this
- too. */
- *((unsigned short *)0x472) = reboot_mode;
-
- /* For the switch to real mode, copy some code to low memory. It has
- to be in the first 64k because it is running in 16-bit mode, and it
- has to have the same physical and virtual address, because it turns
- off paging. Copy it near the end of the first page, out of the way
- of BIOS variables. */
- memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
- real_mode_switch, sizeof (real_mode_switch));
- memcpy((void *)(0x1000 - 100), code, length);
-
- /* Set up the IDT for real mode. */
- load_idt(&real_mode_idt);
-
- /* Set up a GDT from which we can load segment descriptors for real
- mode. The GDT is not used in real mode; it is just needed here to
- prepare the descriptors. */
- load_gdt(&real_mode_gdt);
-
- /* Load the data segment registers, and thus the descriptors ready for
- real mode. The base address of each segment is 0x100, 16 times the
- selector value being loaded here. This is so that the segment
- registers don't have to be reloaded after switching to real mode:
- the values are consistent for real mode operation already. */
- __asm__ __volatile__ ("movl $0x0010,%%eax\n"
- "\tmovl %%eax,%%ds\n"
- "\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
- "\tmovl %%eax,%%ss" : : : "eax");
-
- /* Jump to the 16-bit code that we copied earlier. It disables paging
- and the cache, switches to real mode, and jumps to the BIOS reset
- entry point. */
- __asm__ __volatile__ ("ljmp $0x0008,%0"
- :
- : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
+ rv = dmi_check_system(reboot_dmi_table);
-#endif /* CONFIG_X86_32 */
+ if (!rv && efi_reboot_required() && !efi_runtime_disabled())
+ reboot_type = BOOT_EFI;
-/*
- * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
- */
-static int __init set_pci_reboot(const struct dmi_system_id *d)
-{
- if (reboot_type != BOOT_CF9) {
- reboot_type = BOOT_CF9;
- printk(KERN_INFO "%s series board detected. "
- "Selecting PCI-method for reboots.\n", d->ident);
- }
return 0;
}
-
-static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
- { /* Handle problems with rebooting on Apple MacBook5 */
- .callback = set_pci_reboot,
- .ident = "Apple MacBook5",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
- },
- },
- { /* Handle problems with rebooting on Apple MacBookPro5 */
- .callback = set_pci_reboot,
- .ident = "Apple MacBookPro5",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
- },
- },
- { /* Handle problems with rebooting on Apple Macmini3,1 */
- .callback = set_pci_reboot,
- .ident = "Apple Macmini3,1",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
- },
- },
- { /* Handle problems with rebooting on the iMac9,1. */
- .callback = set_pci_reboot,
- .ident = "Apple iMac9,1",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
- },
- },
- { }
-};
-
-static int __init pci_reboot_init(void)
-{
- dmi_check_system(pci_reboot_dmi_table);
- return 0;
-}
-core_initcall(pci_reboot_init);
+core_initcall(reboot_init);
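
dmi_check_system() returns the number of entries that matched, which is what lets reboot_init() fall back to EFI only when no DMI quirk fired. A sketch of the resulting precedence:

	/*
	 * Sketch of the decision order implemented above:
	 *   1. "reboot=" given on the command line -> reboot_default is false,
	 *      skip everything
	 *   2. a reboot_dmi_table entry matches    -> its callback picks the method
	 *   3. otherwise, BOOT_EFI if efi_reboot_required() and EFI runtime
	 *      services are enabled
	 */
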
static inline void kb_wait(void)
{
@@ -498,118 +529,187 @@ static inline void kb_wait(void)
}
}
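
kb_wait(), whose loop body falls between the two hunks above, spins until the i8042 controller is ready to accept a command, so the 0xFE pulse-reset writes below are not dropped. A minimal sketch assuming the usual status-port semantics (port 0x64, bit 1 = input buffer full):

	int i;

	for (i = 0; i < 0x10000; i++) {
		if ((inb(0x64) & 0x02) == 0)	/* input buffer empty? */
			break;
		udelay(2);
	}
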
-static void vmxoff_nmi(int cpu, struct die_args *args)
+static inline void nmi_shootdown_cpus_on_restart(void);
+
+#if IS_ENABLED(CONFIG_KVM_X86)
+/* RCU-protected callback to disable virtualization prior to reboot. */
+static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
{
- cpu_emergency_vmxoff();
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
}
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_register_virt_callback);
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_FOR_KVM(cpu_emergency_unregister_virt_callback);
+
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
*/
-static void emergency_vmx_disable_all(void)
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_virt_cb *callback;
+
+ /*
+ * IRQs must be disabled as KVM enables virtualization in hardware via
+ * function call IPIs, i.e. IRQs need to be disabled to guarantee
+ * virtualization stays disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ rcu_read_lock();
+ callback = rcu_dereference(cpu_emergency_virt_callback);
+ if (callback)
+ callback();
+ rcu_read_unlock();
+}
+
+static void emergency_reboot_disable_virtualization(void)
{
- /* Just make sure we won't change CPUs while doing this */
local_irq_disable();
- /* We need to disable VMX on all CPUs before rebooting, otherwise
- * we risk hanging up the machine, because the CPU ignore INIT
- * signals when VMX is enabled.
- *
- * We can't take any locks and we may be on an inconsistent
- * state, so we use NMIs as IPIs to tell the other CPUs to disable
- * VMX and halt.
+ /*
+ * Disable virtualization on all CPUs before rebooting to avoid hanging
+ * the system, as VMX and SVM block INIT when running in the host.
*
- * For safety, we will avoid running the nmi_shootdown_cpus()
- * stuff unnecessarily, but we don't have a way to check
- * if other CPUs have VMX enabled. So we will call it only if the
- * CPU we are running on has VMX enabled.
+ * We can't take any locks and we may be on an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
*
- * We will miss cases where VMX is not enabled on all CPUs. This
- * shouldn't do much harm because KVM always enable VMX on all
- * CPUs anyway. But we can miss it on the small window where KVM
- * is still enabling VMX.
+ * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+ * other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU.
- */
- cpu_vmxoff();
-
- /* Halt and disable VMX on the other CPUs */
- nmi_shootdown_cpus(vmxoff_nmi);
+ if (rcu_access_pointer(cpu_emergency_virt_callback)) {
+ /* Safely force _this_ CPU out of VMX/SVM operation. */
+ cpu_emergency_disable_virtualization();
+ /* Disable VMX/SVM and halt on other CPUs. */
+ nmi_shootdown_cpus_on_restart();
}
}
-
+#else
+static void emergency_reboot_disable_virtualization(void) { }
+#endif /* CONFIG_KVM_X86 */
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
+/*
+ * To the best of our knowledge, Windows-compatible x86 hardware expects
+ * the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
+ *
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
+ *
+ * This means that this function can never return; it can only
+ * misbehave by not rebooting properly and hanging.
+ */
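
The switch below encodes that list as a simple state machine: each case performs one reset attempt and then rewrites reboot_type to the next method to try. Roughly:

	/*
	 * Sketch of the fallback chain below:
	 *   ACPI -> KBD -> (ACPI once more, if ACPI was the original choice)
	 *        -> EFI -> BIOS -> CF9_SAFE -> TRIPLE -> KBD -> ...
	 */
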
static void native_machine_emergency_restart(void)
{
int i;
+ int attempt = 0;
+ int orig_reboot_type = reboot_type;
+ unsigned short mode;
if (reboot_emergency)
- emergency_vmx_disable_all();
+ emergency_reboot_disable_virtualization();
tboot_shutdown(TB_SHUTDOWN_REBOOT);
/* Tell the BIOS if we want cold or warm reboot */
- *((unsigned short *)__va(0x472)) = reboot_mode;
+ mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+ *((unsigned short *)__va(0x472)) = mode;
+
+ /*
+ * If an EFI capsule has been registered with the firmware then
+ * override the reboot= parameter.
+ */
+ if (efi_capsule_pending(NULL)) {
+ pr_info("EFI capsule is pending, forcing EFI reboot.\n");
+ reboot_type = BOOT_EFI;
+ }
for (;;) {
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
+ case BOOT_ACPI:
+ acpi_reboot();
+ reboot_type = BOOT_KBD;
+ break;
+
case BOOT_KBD:
- mach_reboot_fixups(); /* for board specific fixups */
+ mach_reboot_fixups(); /* For board specific fixups */
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
- outb(0xfe, 0x64); /* pulse reset low */
+ outb(0xfe, 0x64); /* Pulse reset low */
udelay(50);
}
-
- case BOOT_TRIPLE:
- load_idt(&no_idt);
- __asm__ __volatile__("int3");
-
- reboot_type = BOOT_KBD;
+ if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+ attempt = 1;
+ reboot_type = BOOT_ACPI;
+ } else {
+ reboot_type = BOOT_EFI;
+ }
break;
-#ifdef CONFIG_X86_32
- case BOOT_BIOS:
- machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
-
- reboot_type = BOOT_KBD;
+ case BOOT_EFI:
+ efi_reboot(reboot_mode, NULL);
+ reboot_type = BOOT_BIOS;
break;
-#endif
- case BOOT_ACPI:
- acpi_reboot();
- reboot_type = BOOT_KBD;
- break;
+ case BOOT_BIOS:
+ machine_real_restart(MRR_BIOS);
- case BOOT_EFI:
- if (efi_enabled)
- efi.reset_system(reboot_mode ?
- EFI_RESET_WARM :
- EFI_RESET_COLD,
- EFI_SUCCESS, 0, NULL);
- reboot_type = BOOT_KBD;
+ /* We're probably dead after this, but... */
+ reboot_type = BOOT_CF9_SAFE;
break;
- case BOOT_CF9:
+ case BOOT_CF9_FORCE:
port_cf9_safe = true;
- /* fall through */
+ fallthrough;
- case BOOT_CF9_COND:
+ case BOOT_CF9_SAFE:
if (port_cf9_safe) {
- u8 cf9 = inb(0xcf9) & ~6;
+ u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E;
+ u8 cf9 = inb(0xcf9) & ~reboot_code;
outb(cf9|2, 0xcf9); /* Request hard reset */
udelay(50);
- outb(cf9|6, 0xcf9); /* Actually do the reset */
+ /* Actually do the reset */
+ outb(cf9|reboot_code, 0xcf9);
udelay(50);
}
+ reboot_type = BOOT_TRIPLE;
+ break;
+
+ case BOOT_TRIPLE:
+ idt_invalidate();
+ __asm__ __volatile__("int3");
+
+ /* We're probably dead after this, but... */
reboot_type = BOOT_KBD;
break;
}
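
The 0x06/0x0E values in the CF9 case above follow the conventional "reset control register" layout; the bit meanings below are an assumption from that convention, not stated in the patch:

	/*
	 * Assumed CF9 (reset control) layout:
	 *   bit 1 (0x02) SYS_RST  - reset request
	 *   bit 2 (0x04) RST_CPU  - 0->1 edge triggers the reset
	 *   bit 3 (0x08) FULL_RST - cold (full) rather than warm reset
	 * hence reboot_code is 0x06 for REBOOT_WARM and 0x0E otherwise.
	 */
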
@@ -618,37 +718,42 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
- /* Stop the cpus and apics */
-#ifdef CONFIG_SMP
-
- /* The boot cpu is always logical cpu 0 */
- int reboot_cpu_id = 0;
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin();
-#ifdef CONFIG_X86_32
- /* See if there has been given a command line override */
- if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
- cpu_online(reboot_cpu))
- reboot_cpu_id = reboot_cpu;
+ /* Stop the cpus and apics */
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Disabling IO APIC before local APIC is a workaround for
+ * erratum AVR31 in "Intel Atom Processor C2000 Product Family
+ * Specification Update". In this situation, interrupts that target
+ * a Logical Processor whose Local APIC is either in the process of
+ * being hardware disabled or software disabled are neither delivered
+ * nor discarded. When this erratum occurs, the processor may hang.
+ *
+ * Even without the erratum, it still makes sense to quiet IO APIC
+ * before disabling Local APIC.
+ */
+ clear_IO_APIC();
#endif
- /* Make certain the cpu I'm about to reboot on is online */
- if (!cpu_online(reboot_cpu_id))
- reboot_cpu_id = smp_processor_id();
-
- /* Make certain I only run on the appropriate processor */
- set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
-
- /* O.K Now that I'm on the appropriate processor,
- * stop all of the others.
+#ifdef CONFIG_SMP
+ /*
+ * Stop all of the others. Also disable the local IRQ so we do
+ * not receive the per-CPU timer interrupt, which may trigger the
+ * scheduler's load balancing.
*/
- smp_send_stop();
+ local_irq_disable();
+ stop_other_cpus();
#endif
lapic_shutdown();
-
-#ifdef CONFIG_X86_IO_APIC
- disable_IO_APIC();
-#endif
+ restore_boot_irq_mode();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
@@ -657,6 +762,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}
static void __machine_emergency_restart(int emergency)
@@ -667,7 +775,7 @@ static void __machine_emergency_restart(int emergency)
static void native_machine_restart(char *__unused)
{
- printk("machine restart\n");
+ pr_notice("machine restart\n");
if (!reboot_force)
machine_shutdown();
@@ -676,33 +784,32 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
- /* stop other cpus and apics */
+ /* Stop other cpus and apics */
machine_shutdown();
tboot_shutdown(TB_SHUTDOWN_HALT);
- /* stop this cpu */
stop_this_cpu(NULL);
}
static void native_machine_power_off(void)
{
- if (pm_power_off) {
+ if (kernel_can_power_off()) {
if (!reboot_force)
machine_shutdown();
- pm_power_off();
+ do_kernel_power_off();
}
- /* a fallback in case there is no PM info available */
+ /* A fallback in case there is no PM info available */
tboot_shutdown(TB_SHUTDOWN_HALT);
}
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_CRASH_DUMP
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -732,86 +839,104 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
}
#endif
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
#if defined(CONFIG_SMP)
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
-static int crash_nmi_callback(struct notifier_block *self,
- unsigned long val, void *data)
+static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
{
int cpu;
- if (val != DIE_NMI_IPI)
- return NOTIFY_OK;
-
cpu = raw_smp_processor_id();
- /* Don't do anything if this handler is invoked on crashing cpu.
+ /*
+ * Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
- return NOTIFY_STOP;
+ return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, (struct die_args *)data);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
+
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ BUG();
+ }
+
/* Assume hlt works */
halt();
for (;;)
cpu_relax();
- return 1;
+ return NMI_HANDLED;
}
-static void smp_send_nmi_allbutself(void)
-{
- apic->send_IPI_allbutself(NMI_VECTOR);
-}
-
-static struct notifier_block crash_nmi_nb = {
- .notifier_call = crash_nmi_callback,
-};
-
-/* Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
+ *
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
- crashing_cpu = safe_smp_processor_id();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
+ /* Make a note of crashing cpu. Will be used in NMI callback. */
+ crashing_cpu = smp_processor_id();
shootdown_callback = callback;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- if (register_die_notifier(&crash_nmi_nb))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
- * out the NMI
+
+ /*
+ * Set emergency handler to preempt other handlers.
*/
- wmb();
+ set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback);
- smp_send_nmi_allbutself();
+ apic_send_IPI_allbutself(NMI_VECTOR);
+
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
msecs = 1000; /* Wait at most a second for the other cpus to stop */
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
@@ -819,11 +944,49 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
+}
+
+/*
+ * Check if the crash dumping IPI got issued and if so, call its callback
+ * directly. This function is used when we have already been in NMI handler.
+ * It doesn't return.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+ if (crash_ipi_issued)
+ crash_nmi_callback(0, regs);
+}
+
+/* Override the weak function in kernel/panic.c */
+void __noreturn nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /* If no CPU is preparing crash dump, we simply loop here. */
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
}
+
#else /* !CONFIG_SMP */
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
/* No other CPUs to shoot down */
}
+
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
#endif
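
For orientation, a hedged sketch of how a crash path is expected to drive nmi_shootdown_cpus(); the callback signature matches the shootdown_callback(cpu, regs) call above, while the function bodies here are hypothetical:

static void crash_save_cb(int cpu, struct pt_regs *regs)	/* hypothetical */
{
	/* e.g. save this CPU's register state for the dump */
}

static void my_crash_path(void)					/* hypothetical */
{
	nmi_shootdown_cpus(crash_save_cb);	/* NMIs the others, waits ~1s */
	/* from here on, only this CPU is still running */
}

Note the one-shot restriction documented above: a second caller would be caught by the WARN_ON_ONCE(crash_ipi_issued) check.
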
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index fda313ebbb03..4679ac0a03eb 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This is a good place to put board specific reboot fixups.
*
@@ -26,7 +27,7 @@ static void cs5530a_warm_reset(struct pci_dev *dev)
static void cs5536_warm_reset(struct pci_dev *dev)
{
/* writing 1 to the LSB of this MSR causes a hard reset */
- wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL);
+ wrmsrq(MSR_DIVIL_SOFT_RESET, 1ULL);
udelay(50); /* shouldn't get here but be safe and spin a while */
}
@@ -43,17 +44,33 @@ static void rdc321x_reset(struct pci_dev *dev)
outb(1, 0x92);
}
+static void ce4100_reset(struct pci_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ outb(0x2, 0xcf9);
+ udelay(50);
+ }
+}
+
struct device_fixup {
unsigned int vendor;
unsigned int device;
void (*reboot_fixup)(struct pci_dev *);
};
+/*
+ * PCI ids solely used for fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
+
static const struct device_fixup fixups_table[] = {
{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
};
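
The walker for this table, mach_reboot_fixups() in the same file, sits outside the hunk; a simplified sketch of how such a fixup table is typically consumed (the exact loop body is an assumption):

	const struct device_fixup *f;
	struct pci_dev *dev;
	int i;

	for (i = 0; i < ARRAY_SIZE(fixups_table); i++) {
		f = &fixups_table[i];
		dev = pci_get_device(f->vendor, f->device, NULL);
		if (dev) {
			f->reboot_fixup(dev);
			pci_dev_put(dev);
		}
	}
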
/*
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11c..57276f134d12 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -1,18 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
+#include <asm/nospec-branch.h>
#include <asm/processor-flags.h>
/*
- * Must be relocatable PIC code callable as a C function
+ * Must be relocatable PIC code callable as a C function, in particular
+ * there must be a plain RET and not jump to return thunk.
*/
#define PTR(x) (x << 2)
@@ -37,8 +37,7 @@
#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
.text
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
/* Save the CPU context, used for jumping back */
pushl %ebx
@@ -94,9 +93,14 @@ relocate_kernel:
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(relocate_kernel)
-identity_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
+ /* set return address to 0 if not preserving context */
+ pushl $0
/* store the start address on the stack */
pushl %edx
@@ -107,7 +111,7 @@ identity_mapped:
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movl %cr0, %eax
andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
@@ -159,12 +163,15 @@ identity_mapped:
xorl %edx, %edx
xorl %esi, %esi
xorl %ebp, %ebp
+ ANNOTATE_UNRET_SAFE
ret
+ int3
1:
popl %edx
movl CP_PA_SWAP_PAGE(%edi), %esp
addl $PAGE_SIZE, %esp
2:
+ ANNOTATE_RETPOLINE_SAFE
call *%edx
/* get the re-entry point of the peer system */
@@ -184,15 +191,18 @@ identity_mapped:
movl CP_PA_PGD(%ebx), %eax
movl %eax, %cr3
movl %cr0, %eax
- orl $(1<<31), %eax
+ orl $X86_CR0_PG, %eax
movl %eax, %cr0
lea PAGE_SIZE(%edi), %esp
movl %edi, %eax
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(identity_mapped)
-virtual_mapped:
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
movl CR4(%edi), %eax
movl %eax, %cr4
movl CR3(%edi), %eax
@@ -207,10 +217,13 @@ virtual_mapped:
popl %edi
popl %esi
popl %ebx
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movl 8(%esp), %edx
movl 4(%esp), %ecx
pushl %ebp
@@ -224,23 +237,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
@@ -250,17 +263,17 @@ swap_pages:
movl %edx, %edi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %ebp, %edi
movl %eax, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
movl %eax, %edi
movl %edx, %esi
movl $1024, %ecx
- rep ; movsl
+ rep movsl
lea PAGE_SIZE(%ebp), %esi
jmp 0b
@@ -269,7 +282,10 @@ swap_pages:
popl %edi
popl %ebx
popl %ebp
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d476..4ffba68dc57b 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -1,52 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* relocate_kernel.S - put the kernel image in place to boot
* Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/linkage.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
+#include <asm/nospec-branch.h>
+#include <asm/unwind_hints.h>
+#include <asm/asm-offsets.h>
/*
- * Must be relocatable PIC code callable as a C function
+ * Must be relocatable PIC code callable as a C function, in particular
+ * there must be a plain RET and not jump to return thunk.
*/
#define PTR(x) (x << 3)
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
- * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
- * ~ control_page + PAGE_SIZE are used as data storage and stack for
- * jumping back
+ * The .text..relocate_kernel and .data..relocate_kernel sections are copied
+ * into the control page, and the remainder of the page is used as the stack.
*/
-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+ .section .data..relocate_kernel,"a";
/* Minimal CPU state */
-#define RSP DATA(0x0)
-#define CR0 DATA(0x8)
-#define CR3 DATA(0x10)
-#define CR4 DATA(0x18)
-
-/* other data */
-#define CP_PA_TABLE_PAGE DATA(0x20)
-#define CP_PA_SWAP_PAGE DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
-
- .text
- .align PAGE_SIZE
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
+ /* other data */
+SYM_DATA(kexec_va_control_page, .quad 0)
+SYM_DATA(kexec_pa_table_page, .quad 0)
+SYM_DATA(kexec_pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+SYM_DATA(kexec_debug_8250_mmio32, .quad 0)
+SYM_DATA(kexec_debug_8250_port, .word 0)
+
+ .balign 16
+SYM_DATA_START_LOCAL(kexec_debug_gdt)
+ .word kexec_debug_gdt_end - kexec_debug_gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end)
+
+ .balign 8
+SYM_DATA_START(kexec_debug_idt)
+ .skip 0x100, 0x00
+SYM_DATA_END(kexec_debug_idt)
+
+ .section .text..relocate_kernel,"ax";
.code64
- .globl relocate_kernel
-relocate_kernel:
+SYM_CODE_START_NOALIGN(relocate_kernel)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
/*
* %rdi indirection_page
- * %rsi page_list
+ * %rsi pa_control_page
* %rdx start address
- * %rcx preserve_context
+ * %rcx flags: RELOC_KERNEL_*
*/
/* Save the CPU context, used for jumping back */
@@ -58,51 +78,97 @@ relocate_kernel:
pushq %r15
pushf
- movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
- movq %rsp, RSP(%r11)
- movq %cr0, %rax
- movq %rax, CR0(%r11)
- movq %cr3, %rax
- movq %rax, CR3(%r11)
- movq %cr4, %rax
- movq %rax, CR4(%r11)
+ /* Invalidate GDT/IDT, zero out flags */
+ pushq $0
+ pushq $0
- /* zero out flags, and disable interrupts */
- pushq $0
+ lidt (%rsp)
+ lgdt (%rsp)
+ addq $8, %rsp
popfq
- /*
- * get physical address of control page now
- * this is impossible after page table switch
- */
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ /* Switch to the identity mapped page tables */
+ movq %cr3, %rax
+ movq kexec_pa_table_page(%rip), %r9
+ movq %r9, %cr3
- /* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+ /* Leave CR4 in %r13 to enable the right paging mode later. */
+ movq %cr4, %r13
- /* get physical address of swap page now */
- movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+ /*
+ * Disable global pages immediately to ensure this mapping is RWX.
+ * Disable LASS before jumping to the identity mapped page.
+ */
+ movq %r13, %r12
+ andq $~(X86_CR4_PGE | X86_CR4_LASS), %r12
+ movq %r12, %cr4
+
+ /* Save %rsp and CRs. */
+ movq %r13, saved_cr4(%rip)
+ movq %rsp, saved_rsp(%rip)
+ movq %rax, saved_cr3(%rip)
+ movq %cr0, %rax
+ movq %rax, saved_cr0(%rip)
- /* save some information for jumping back */
- movq %r9, CP_PA_TABLE_PAGE(%r11)
- movq %r10, CP_PA_SWAP_PAGE(%r11)
- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ /* save indirection list for jumping back */
+ movq %rdi, pa_backup_pages_map(%rip)
- /* Switch to the identity mapped page tables */
- movq %r9, %cr3
+ /* Save the flags to %r11 as swap_pages clobbers %rcx. */
+ movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
- lea PAGE_SIZE(%r8), %rsp
+ lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
- addq $(identity_mapped - relocate_kernel), %r8
- pushq %r8
- ret
+0: addq $identity_mapped - 0b, %rsi
+ subq $__relocate_kernel_start - 0b, %rsi
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(relocate_kernel)
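
The "0:" arithmetic above is position-independent address math: the two label-relative constants cancel the 0b term, leaving only the offset of identity_mapped within the relocated blob. In effect:

	/*
	 * %rsi = pa_control_page
	 *      + (identity_mapped - __relocate_kernel_start)
	 * i.e. the physical address of identity_mapped inside the copied
	 * control page, computed without any absolute relocations.
	 */
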
+
+SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
+ UNWIND_HINT_END_OF_STACK
+ /*
+ * %rdi indirection page
+ * %rdx start address
+ * %r9 page table page
+ * %r11 flags: RELOC_KERNEL_*
+ * %r13 original CR4 when relocate_kernel() was invoked
+ */
-identity_mapped:
/* store the start address on the stack */
pushq %rdx
+ /* Create a GDTR (16 bits limit, 64 bits addr) on stack */
+ leaq kexec_debug_gdt(%rip), %rax
+ pushq %rax
+ pushw (%rax)
+
+ /* Load the GDT, put the stack back */
+ lgdt (%rsp)
+ addq $10, %rsp
+
+ /* Test that we can load segments */
+ movq %ds, %rax
+ movq %rax, %ds
+
+ /* Now an IDTR on the stack to load the IDT the kernel created */
+ leaq kexec_debug_idt(%rip), %rsi
+ pushq %rsi
+ pushw $0xff
+ lidt (%rsp)
+ addq $10, %rsp
+
+ //int3
+
+ /*
+ * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP
+ * below.
+ */
+ movq %cr4, %rax
+ andq $~(X86_CR4_CET), %rax
+ movq %rax, %cr4
+
/*
* Set cr0 to a known state:
* - Paging enabled
@@ -110,7 +176,7 @@ identity_mapped:
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movq %cr0, %rax
andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
@@ -120,17 +186,37 @@ identity_mapped:
/*
* Set cr4 to a known state:
* - physical address extension enabled
+ * - 5-level paging, if it was enabled before
+ * - Machine check exception on TDX guest, if it was enabled before.
+ * Clearing MCE might not be allowed in TDX guests, depending on setup.
+ *
+ * Use R13 that contains the original CR4 value, read in relocate_kernel().
+ * PAE is always set in the original CR4.
*/
- movq $X86_CR4_PAE, %rax
- movq %rax, %cr4
-
- jmp 1f
-1:
+ andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
+ ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+ movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
- movq %rcx, %r11
+ /*
+ * If the memory cache is in an incoherent state, e.g. due to
+ * memory encryption, do WBINVD to flush the cache.
+ *
+ * If SME is active, there could be old encrypted cache line
+ * entries that will conflict with the now unencrypted memory
+ * used by kexec. Flush the caches before copying the kernel.
+ *
+ * Note SME sets this flag to true when the platform supports
+ * SME, so the WBINVD is performed even if SME is not activated
+ * by the kernel. This does no harm.
+ */
+ testb $RELOC_KERNEL_CACHE_INCOHERENT, %r11b
+ jz .Lnowbinvd
+ wbinvd
+.Lnowbinvd:
+
call swap_pages
/*
@@ -142,60 +228,94 @@ identity_mapped:
movq %cr3, %rax
movq %rax, %cr3
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
+ jnz .Lrelocate
+
/*
* set all of the registers to known values
* leave %rsp alone
*/
- testq %r11, %r11
- jnz 1f
- xorq %rax, %rax
- xorq %rbx, %rbx
- xorq %rcx, %rcx
- xorq %rdx, %rdx
- xorq %rsi, %rsi
- xorq %rdi, %rdi
- xorq %rbp, %rbp
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r9
- xorq %r11, %r11
- xorq %r12, %r12
- xorq %r13, %r13
- xorq %r14, %r14
- xorq %r15, %r15
-
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ xorl %r11d, %r11d
+ xorl %r12d, %r12d
+ xorl %r13d, %r13d
+ xorl %r14d, %r14d
+ xorl %r15d, %r15d
+
+ ANNOTATE_UNRET_SAFE
ret
+ int3
-1:
+.Lrelocate:
popq %rdx
+
+ /* Use the swap page for the callee's stack */
+ movq kexec_pa_swap_page(%rip), %r10
leaq PAGE_SIZE(%r10), %rsp
+
+ /* push the existing entry point onto the callee's stack */
+ pushq %rdx
+
+ ANNOTATE_RETPOLINE_SAFE
call *%rdx
/* get the re-entry point of the peer system */
- movq 0(%rsp), %rbp
- call 1f
-1:
- popq %r8
- subq $(1b - relocate_kernel), %r8
- movq CP_PA_SWAP_PAGE(%r8), %r10
- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
- movq CP_PA_TABLE_PAGE(%r8), %rax
+ popq %rbp
+ movq kexec_pa_swap_page(%rip), %r10
+ movq pa_backup_pages_map(%rip), %rdi
+ movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
+
+ /* Find start (and end) of this physical mapping of control page */
+ leaq (%rip), %r8
+ ANNOTATE_NOENDBR
+ andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
+ /*
+ * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
+ * swap_pages() can swap pages correctly. Note all other
+ * RELOC_KERNEL_* flags passed to relocate_kernel() are not
+ * restored.
+ */
+ movl $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
call swap_pages
- movq $virtual_mapped, %rax
+ movq kexec_va_control_page(%rip), %rax
+0: addq $virtual_mapped - 0b, %rax
+ subq $__relocate_kernel_start - 0b, %rax
pushq %rax
+ ANNOTATE_UNRET_SAFE
ret
-
-virtual_mapped:
- movq RSP(%r8), %rsp
- movq CR4(%r8), %rax
+ int3
+SYM_CODE_END(identity_mapped)
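In C terms, the CR4 handling above masks the saved value down to the paging
bits and conditionally re-adds MCE for TDX. A minimal sketch of that
computation (bit positions per the x86 architecture; orig_cr4 and
is_tdx_guest are illustrative names, not kernel symbols):

#include <stdbool.h>
#include <stdint.h>

#define X86_CR4_PAE	(1UL << 5)	/* physical address extension */
#define X86_CR4_MCE	(1UL << 6)	/* machine check enable */
#define X86_CR4_LA57	(1UL << 12)	/* 5-level paging */

/* Mirrors the andl/ALTERNATIVE sequence in identity_mapped above. */
static uint64_t kexec_cr4(uint64_t orig_cr4, bool is_tdx_guest)
{
	/* Keep PAE (always set originally) and LA57 if it was enabled. */
	uint64_t cr4 = orig_cr4 & (X86_CR4_PAE | X86_CR4_LA57);

	/* TDX guests may not allow clearing MCE; keep it set there. */
	if (is_tdx_guest)
		cr4 |= X86_CR4_MCE;

	return cr4;
}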
+
+SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR // RET target, above
+ movq saved_rsp(%rip), %rsp
+ movq saved_cr4(%rip), %rax
movq %rax, %cr4
- movq CR3(%r8), %rax
- movq CR0(%r8), %r8
+ movq saved_cr3(%rip), %rax
+ movq saved_cr0(%rip), %r8
movq %rax, %cr3
movq %r8, %cr0
+
+#ifdef CONFIG_KEXEC_JUMP
+ /* Saved in save_processor_state. */
+ movq $saved_context, %rax
+ lgdt saved_context_gdt_desc(%rax)
+#endif
+
+ /* relocate_kernel() returns the re-entry point for next time */
movq %rbp, %rax
popf
@@ -205,62 +325,297 @@ virtual_mapped:
popq %r12
popq %rbp
popq %rbx
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(virtual_mapped)
/* Do the copies */
-swap_pages:
- movq %rdi, %rcx /* Put the page_list in %rcx */
- xorq %rdi, %rdi
- xorq %rsi, %rsi
- jmp 1f
+SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
+ UNWIND_HINT_END_OF_STACK
+ /*
+ * %rdi indirection page
+ * %r11 flags: RELOC_KERNEL_*
+ */
+ movq %rdi, %rcx /* Put the indirection_page in %rcx */
+ xorl %edi, %edi
+ xorl %esi, %esi
+ jmp .Lstart /* Should start with an indirection record */
-0: /* top, read another word for the indirection page */
+.Lloop: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
-1:
- testq $0x1, %rcx /* is it a destination page? */
- jz 2f
+.Lstart:
+ testb $0x1, %cl /* is it a destination page? */
+ jz .Lnotdest
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
- jmp 0b
-2:
- testq $0x2, %rcx /* is it an indirection page? */
- jz 2f
+ jmp .Lloop
+.Lnotdest:
+ testb $0x2, %cl /* is it an indirection page? */
+ jz .Lnotind
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
- jmp 0b
-2:
- testq $0x4, %rcx /* is it the done indicator? */
- jz 2f
- jmp 3f
-2:
- testq $0x8, %rcx /* is it the source indicator? */
- jz 0b /* Ignore it otherwise */
+ jmp .Lloop
+.Lnotind:
+ testb $0x4, %cl /* is it the done indicator? */
+ jz .Lnotdone
+ jmp .Ldone
+.Lnotdone:
+ testb $0x8, %cl /* is it the source indicator? */
+ jz .Lloop /* Ignore it otherwise */
movq %rcx, %rsi /* For every source page do a copy */
andq $0xfffffffffffff000, %rsi
- movq %rdi, %rdx
- movq %rsi, %rax
+ movq %rdi, %rdx /* Save destination page to %rdx */
+ movq %rsi, %rax /* Save source page to %rax */
+
+ /* Only actually swap for ::preserve_context */
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
+ jz .Lnoswap
- movq %r10, %rdi
- movq $512, %rcx
- rep ; movsq
+ /* copy source page to swap page */
+ movq kexec_pa_swap_page(%rip), %rdi
+ movl $512, %ecx
+ rep movsq
+ /* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
- movq $512, %rcx
- rep ; movsq
+ movl $512, %ecx
+ rep movsq
+ /* copy swap page to destination page */
movq %rdx, %rdi
- movq %r10, %rsi
- movq $512, %rcx
- rep ; movsq
+ movq kexec_pa_swap_page(%rip), %rsi
+.Lnoswap:
+ movl $512, %ecx
+ rep movsq
lea PAGE_SIZE(%rax), %rsi
- jmp 0b
-3:
+ jmp .Lloop
+.Ldone:
+ ANNOTATE_UNRET_SAFE
ret
+ int3
+SYM_CODE_END(swap_pages)
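For reference, the indirection encoding consumed by swap_pages() packs a flag
into the low bits of each page-aligned entry: 0x1 marks a destination, 0x2 an
indirection page, 0x4 the end of the list, and 0x8 a source page. A hedged C
walker over the same format, without the preserve-context three-way swap
(copy semantics only; memcpy() stands in for the rep movsq loops):

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define ENTRY_MASK	(~(PAGE_SIZE - 1))

#define IND_DESTINATION	0x1
#define IND_INDIRECTION	0x2
#define IND_DONE	0x4
#define IND_SOURCE	0x8

/*
 * Walk a kexec-style indirection list: each source page is copied to
 * the current destination, which then advances by one page, exactly
 * as %rdi does across the rep movsq in the assembly above.
 */
static void walk_indirection(const uintptr_t *page, char *dest)
{
	for (;;) {
		uintptr_t entry = *page++;

		if (entry & IND_DESTINATION) {
			dest = (char *)(entry & ENTRY_MASK);
		} else if (entry & IND_INDIRECTION) {
			page = (const uintptr_t *)(entry & ENTRY_MASK);
		} else if (entry & IND_DONE) {
			return;
		} else if (entry & IND_SOURCE) {
			memcpy(dest, (const void *)(entry & ENTRY_MASK),
			       PAGE_SIZE);
			dest += PAGE_SIZE;
		}
		/* Anything else is ignored, as in the assembly. */
	}
}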
+
+/*
+ * Generic 'print character' routine
+ * - %al: Character to be printed (may clobber %rax)
+ * - %rdx: MMIO address or port.
+ */
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ addw $LSR, %dx
+ xchg %al, %ah
+.Lxmtrdy_loop:
+ inb %dx, %al
+ testb $XMTRDY, %al
+ jnz .Lready
+ pause
+ jmp .Lxmtrdy_loop
+
+.Lready:
+ subw $LSR, %dx
+ xchg %al, %ah
+ outb %al, %dx
+pr_char_null:
+ ANNOTATE_NOENDBR
+
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32)
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+.Lxmtrdy_loop_mmio:
+ movb (LSR*4)(%rdx), %ah
+ testb $XMTRDY, %ah
+ jnz .Lready_mmio
+ pause
+ jmp .Lxmtrdy_loop_mmio
+
+.Lready_mmio:
+ movb %al, (%rdx)
+ ANNOTATE_UNRET_SAFE
+ ret
+SYM_CODE_END(pr_char_8250_mmio32)
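Both pr_char variants poll the 8250 line status register until the
transmit-holding bit (XMTRDY, 0x20) is set, then write the character to the
transmit register. A sketch of the MMIO variant in C, assuming the 4-byte
register stride the mmio32 code uses (mmio_read8/mmio_write8 are illustrative
accessors, not kernel APIs):

#include <stdint.h>

#define TXR	0	/* Transmit register (WRITE) */
#define LSR	5	/* Line Status */
#define XMTRDY	0x20

static inline uint8_t mmio_read8(const volatile uint8_t *a) { return *a; }
static inline void mmio_write8(volatile uint8_t *a, uint8_t v) { *a = v; }

/* 8250 with 32-bit register stride: register N lives at base + N * 4. */
static void putc_8250_mmio32(volatile uint8_t *base, char c)
{
	/* Spin until the transmitter can accept another byte. */
	while (!(mmio_read8(base + LSR * 4) & XMTRDY))
		;

	mmio_write8(base + TXR * 4, (uint8_t)c);
}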
+
+/*
+ * Load pr_char function pointer into %rsi and load %rdx with whatever
+ * that function wants to see there (typically port/MMIO address).
+ */
+.macro pr_setup
+ leaq pr_char_8250(%rip), %rsi
+ movw kexec_debug_8250_port(%rip), %dx
+ testw %dx, %dx
+ jnz 1f
+
+ leaq pr_char_8250_mmio32(%rip), %rsi
+ movq kexec_debug_8250_mmio32(%rip), %rdx
+ testq %rdx, %rdx
+ jnz 1f
+
+ leaq pr_char_null(%rip), %rsi
+1:
+.endm
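The macro's fallback order, restated as C for clarity (a sketch; the globals
mirror kexec_debug_8250_port/kexec_debug_8250_mmio32, and the stub bodies
abbreviate the output routines above):

#include <stdint.h>

typedef void (*pr_char_fn)(uint64_t dst, char c);

static uint16_t debug_8250_port;	/* port I/O base, 0 if unset */
static uint64_t debug_8250_mmio32;	/* MMIO base, 0 if unset */

static void pr_char_port(uint64_t port, char c) { (void)port; (void)c; /* outb path */ }
static void pr_char_mmio(uint64_t base, char c) { (void)base; (void)c; /* MMIO path */ }
static void pr_char_null(uint64_t dst, char c)  { (void)dst;  (void)c; /* sink */ }

/* Prefer port I/O, then MMIO, else discard output entirely. */
static pr_char_fn pr_setup(uint64_t *dst)
{
	if (debug_8250_port) {
		*dst = debug_8250_port;
		return pr_char_port;
	}
	if (debug_8250_mmio32) {
		*dst = debug_8250_mmio32;
		return pr_char_mmio;
	}
	*dst = 0;
	return pr_char_null;
}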
+
+/* Print the nybble in %bl, clobber %rax */
+SYM_CODE_START_LOCAL_NOALIGN(pr_nybble)
+ UNWIND_HINT_FUNC
+ movb %bl, %al
+ nop
+ andb $0x0f, %al
+ addb $0x30, %al
+ cmpb $0x3a, %al
+ jb 1f
+ addb $('a' - '0' - 10), %al
+ ANNOTATE_RETPOLINE_SAFE
+1: jmp *%rsi
+SYM_CODE_END(pr_nybble)
+
+SYM_CODE_START_LOCAL_NOALIGN(pr_qword)
+ UNWIND_HINT_FUNC
+ movq $16, %rcx
+1: rolq $4, %rbx
+ call pr_nybble
+ loop 1b
+ movb $'\n', %al
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
+SYM_CODE_END(pr_qword)
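pr_nybble and pr_qword together print a 64-bit value as 16 lowercase hex
digits, most significant nybble first, followed by a newline. The equivalent
in C (pr_char is whatever output routine pr_setup selected; the function
pointer form is illustrative):

#include <stdint.h>

static void pr_nybble(void (*pr_char)(char), uint8_t v)
{
	char c = '0' + (v & 0x0f);

	if (c > '9')
		c += 'a' - '0' - 10;	/* 0xa..0xf -> 'a'..'f' */
	pr_char(c);
}

/* Same digit order as the rolq $4 loop: high nybble first. */
static void pr_qword(void (*pr_char)(char), uint64_t v)
{
	for (int i = 15; i >= 0; i--)
		pr_nybble(pr_char, (uint8_t)(v >> (i * 4)));
	pr_char('\n');
}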
+
+.macro print_reg a, b, c, d, r
+ movb $\a, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\b, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\c, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movb $\d, %al
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rsi
+ movq \r, %rbx
+ call pr_qword
+.endm
+
+SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors)
+ /* Each of these is 6 bytes. */
+.macro vec_err exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ nop
+ nop
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+.macro vec_noerr exc
+ UNWIND_HINT_ENTRY
+ . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE)
+ pushq $0
+ pushq $\exc
+ jmp exc_handler
+.endm
+
+ ANNOTATE_NOENDBR
+ vec_noerr 0 // #DE
+ vec_noerr 1 // #DB
+ vec_noerr 2 // #NMI
+ vec_noerr 3 // #BP
+ vec_noerr 4 // #OF
+ vec_noerr 5 // #BR
+ vec_noerr 6 // #UD
+ vec_noerr 7 // #NM
+ vec_err 8 // #DF
+ vec_noerr 9
+ vec_err 10 // #TS
+ vec_err 11 // #NP
+ vec_err 12 // #SS
+ vec_err 13 // #GP
+ vec_err 14 // #PF
+ vec_noerr 15
+SYM_CODE_END(kexec_debug_exc_vectors)
+
+SYM_CODE_START_LOCAL_NOALIGN(exc_handler)
+ /* No need for RET mitigations during kexec */
+ VALIDATE_UNRET_END
+
+ pushq %rax
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+ pushq %rsi
+
+ /* Stack frame */
+#define EXC_SS 0x58 /* Architectural... */
+#define EXC_RSP 0x50
+#define EXC_EFLAGS 0x48
+#define EXC_CS 0x40
+#define EXC_RIP 0x38
+#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */
+#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */
+#define EXC_RAX 0x20 /* Pushed just above in exc_handler */
+#define EXC_RBX 0x18
+#define EXC_RCX 0x10
+#define EXC_RDX 0x08
+#define EXC_RSI 0x00
+
+ /* Set up %rdx/%rsi for debug output */
+ pr_setup
+
+ /* rip and exception info */
+ print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp)
+ print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp)
+ print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp)
+ print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp)
+
+ /* We spilled these to the stack */
+ print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp)
+ print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp)
+ print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp)
+ print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp)
+ print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp)
+
+ /* Other registers untouched */
+ print_reg 'r', 'd', 'i', ':', %rdi
+ print_reg 'r', '8', ' ', ':', %r8
+ print_reg 'r', '9', ' ', ':', %r9
+ print_reg 'r', '1', '0', ':', %r10
+ print_reg 'r', '1', '1', ':', %r11
+ print_reg 'r', '1', '2', ':', %r12
+ print_reg 'r', '1', '3', ':', %r13
+ print_reg 'r', '1', '4', ':', %r14
+ print_reg 'r', '1', '5', ':', %r15
+ print_reg 'c', 'r', '2', ':', %cr2
+
+ /* Only return from INT3 */
+ cmpq $3, EXC_EXCEPTION(%rsp)
+ jne .Ldie
+
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ popq %rax
+
+ addq $16, %rsp
+ iretq
+
+.Ldie:
+ hlt
+ jmp .Ldie
- .globl kexec_control_code_size
-.set kexec_control_code_size, . - relocate_kernel
+SYM_CODE_END(exc_handler)
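The EXC_* offsets above describe the stack at the point the registers are
printed: five spilled registers, then the vector and error code pushed by the
entry stubs, then the hardware exception frame. Written as a struct for
cross-checking (a sketch; the kernel only defines the offsets):

#include <stdint.h>

/* Lowest address first; field offsets match EXC_RSI..EXC_SS. */
struct exc_frame {
	uint64_t rsi;		/* 0x00: pushed last in exc_handler */
	uint64_t rdx;		/* 0x08 */
	uint64_t rcx;		/* 0x10 */
	uint64_t rbx;		/* 0x18 */
	uint64_t rax;		/* 0x20: pushed first in exc_handler */
	uint64_t exception;	/* 0x28: pushed by the vector stub */
	uint64_t errorcode;	/* 0x30: architectural, or 0 from vec_noerr */
	uint64_t rip;		/* 0x38: hardware frame starts here */
	uint64_t cs;		/* 0x40 */
	uint64_t eflags;	/* 0x48 */
	uint64_t rsp;		/* 0x50 */
	uint64_t ss;		/* 0x58 */
};

/* The addq $16, %rsp before iretq drops exception and errorcode. */
_Static_assert(sizeof(struct exc_frame) == 0x60, "matches EXC_* offsets");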
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..79bc8a97a083
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ioport.h>
+#include <linux/printk.h>
+#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
+
+static void resource_clip(struct resource *res, resource_size_t start,
+ resource_size_t end)
+{
+ resource_size_t low = 0, high = 0;
+
+ if (res->end < start || res->start > end)
+ return; /* no conflict */
+
+ if (res->start < start)
+ low = start - res->start;
+
+ if (res->end > end)
+ high = res->end - end;
+
+ /* Keep the area above or below the conflict, whichever is larger */
+ if (low > high)
+ res->end = start - 1;
+ else
+ res->start = end + 1;
+}
+
+static void remove_e820_regions(struct resource *avail)
+{
+ int i;
+ struct e820_entry *entry;
+ u64 e820_start, e820_end;
+ struct resource orig = *avail;
+
+ if (!pci_use_e820)
+ return;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ entry = &e820_table->entries[i];
+ e820_start = entry->addr;
+ e820_end = entry->addr + entry->size - 1;
+
+ resource_clip(avail, e820_start, e820_end);
+ if (orig.start != avail->start || orig.end != avail->end) {
+ pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n",
+ e820_start, e820_end);
+ if (avail->end > avail->start)
+ /*
+ * Use %pa instead of %pR because "avail"
+ * is typically IORESOURCE_UNSET, so %pR
+ * shows the size instead of addresses.
+ */
+ pr_info("resource: remaining [mem %pa-%pa] available\n",
+ &avail->start, &avail->end);
+ orig = *avail;
+ }
+ }
+}
+
+void arch_remove_reservations(struct resource *avail)
+{
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
+ if (avail->flags & IORESOURCE_MEM) {
+ resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+
+ remove_e820_regions(avail);
+ }
+}
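resource_clip() resolves an overlap by keeping whichever side of the conflict
window is larger. A self-contained illustration of the same arithmetic with a
worked example (standalone types, not the kernel's struct resource):

#include <assert.h>
#include <stdint.h>

typedef uint64_t resource_size_t;

struct res { resource_size_t start, end; };

/* Same logic as resource_clip(): trim 'r' so it no longer overlaps
 * [start, end], keeping whichever remaining side is larger. */
static void clip(struct res *r, resource_size_t start, resource_size_t end)
{
	resource_size_t low = 0, high = 0;

	if (r->end < start || r->start > end)
		return;			/* no conflict */
	if (r->start < start)
		low = start - r->start;
	if (r->end > end)
		high = r->end - end;

	if (low > high)
		r->end = start - 1;	/* keep the area below */
	else
		r->start = end + 1;	/* keep the area above */
}

int main(void)
{
	struct res r = { 0x1000, 0x9000 };

	/* The high side (0x3001..0x9000) is larger, so it survives. */
	clip(&r, 0x2000, 0x3000);
	assert(r.start == 0x3001 && r.end == 0x9000);
	return 0;
}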
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
new file mode 100644
index 000000000000..85e2f2d16a90
--- /dev/null
+++ b/arch/x86/kernel/rethook.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c.
+ */
+#include <linux/bug.h>
+#include <linux/rethook.h>
+#include <linux/kprobes.h>
+#include <linux/objtool.h>
+
+#include "kprobes/common.h"
+
+__visible void arch_rethook_trampoline_callback(struct pt_regs *regs);
+
+#ifndef ANNOTATE_NOENDBR
+#define ANNOTATE_NOENDBR
+#endif
+
+/*
+ * When a target function returns, this code saves registers and calls
+ * arch_rethook_trampoline_callback(), which calls the rethook handler.
+ */
+asm(
+ ".text\n"
+ ".global arch_rethook_trampoline\n"
+ ".type arch_rethook_trampoline, @function\n"
+ "arch_rethook_trampoline:\n"
+#ifdef CONFIG_X86_64
+ ANNOTATE_NOENDBR "\n" /* This is only jumped from ret instruction */
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushq $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ " pushq $" __stringify(__KERNEL_DS) "\n"
+ /* Save 'sp - 16'; this will be fixed later. */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rdi\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+ " addq $16, %rsp\n"
+ " popfq\n"
+#else
+ /* Push a fake return address to tell the unwinder it's a rethook. */
+ " pushl $arch_rethook_trampoline\n"
+ UNWIND_HINT_FUNC
+ " pushl %ss\n"
+ /* Save 'sp - 8'; this will be fixed later. */
+ " pushl %esp\n"
+ " pushfl\n"
+ SAVE_REGS_STRING
+ " movl %esp, %eax\n"
+ " call arch_rethook_trampoline_callback\n"
+ RESTORE_REGS_STRING
+ /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */
+ " addl $8, %esp\n"
+ " popfl\n"
+#endif
+ ASM_RET
+ ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n"
+);
+NOKPROBE_SYMBOL(arch_rethook_trampoline);
+
+/*
+ * Called from arch_rethook_trampoline
+ */
+__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs)
+{
+ unsigned long *frame_pointer;
+
+ /* fixup registers */
+ regs->cs = __KERNEL_CS;
+#ifdef CONFIG_X86_32
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)&arch_rethook_trampoline;
+ regs->orig_ax = ~0UL;
+ regs->sp += 2*sizeof(long);
+ frame_pointer = (long *)(regs + 1);
+
+ /*
+ * The return address at 'frame_pointer' is recovered by
+ * arch_rethook_fixup_return(), which is called from this
+ * rethook_trampoline_handler().
+ */
+ rethook_trampoline_handler(regs, (unsigned long)frame_pointer);
+
+ /*
+ * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trampoline()
+ * can do RET right after POPF.
+ */
+ *(unsigned long *)&regs->ss = regs->flags;
+}
+NOKPROBE_SYMBOL(arch_rethook_trampoline_callback);
+
+/*
+ * arch_rethook_trampoline() skips updating the frame pointer. The frame
+ * pointer saved in arch_rethook_trampoline_callback() points to the real
+ * caller function's frame pointer, so arch_rethook_trampoline() doesn't
+ * have a standard stack frame with CONFIG_FRAME_POINTER=y.
+ * Mark it as a non-standard function; the FP unwinder can still unwind
+ * correctly without the hint.
+ */
+STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline);
+
+/* This is called from rethook_trampoline_handler(). */
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ unsigned long *frame_pointer = (void *)(regs + 1);
+
+ /* Replace fake return address with real one. */
+ *frame_pointer = correct_ret_addr;
+}
+NOKPROBE_SYMBOL(arch_rethook_fixup_return);
+
+void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount)
+{
+ unsigned long *stack = (unsigned long *)regs->sp;
+
+ rh->ret_addr = stack[0];
+ rh->frame = regs->sp;
+
+ /* Replace the return addr with trampoline addr */
+ stack[0] = (unsigned long) arch_rethook_trampoline;
+}
+NOKPROBE_SYMBOL(arch_rethook_prepare);
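At its core the prepare/fixup pair swaps the return address at the top of the
stack for the trampoline and later restores it. A simplified user-space
illustration of that bookkeeping (fake_regs stands in for pt_regs; nothing
here is the kernel API):

#include <stdint.h>
#include <stdio.h>

struct fake_regs { uintptr_t sp; };
struct rethook_node { uintptr_t ret_addr; uintptr_t frame; };

static void trampoline(void)
{
	/* Would run the rethook handler, then return to the real address. */
}

/* Mirror of arch_rethook_prepare(): remember the real return address
 * found at *sp and replace it with the trampoline. */
static void prepare(struct rethook_node *rh, struct fake_regs *regs)
{
	uintptr_t *stack = (uintptr_t *)regs->sp;

	rh->ret_addr = stack[0];
	rh->frame = regs->sp;
	stack[0] = (uintptr_t)trampoline;
}

/* Mirror of arch_rethook_fixup_return(): put the real address back. */
static void fixup(uintptr_t *frame_slot, uintptr_t correct_ret_addr)
{
	*frame_slot = correct_ret_addr;
}

int main(void)
{
	uintptr_t stack_slot = 0xdeadbeef;	/* pretend return address */
	struct fake_regs regs = { (uintptr_t)&stack_slot };
	struct rethook_node rh;

	prepare(&rh, &regs);
	printf("hijacked to trampoline at %#lx\n", (unsigned long)stack_slot);
	fixup(&stack_slot, rh.ret_addr);
	printf("restored to %#lx\n", (unsigned long)stack_slot);
	return 0;
}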
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..51a849a79c98 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -1,146 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* RTC related functions
*/
#include <linux/platform_device.h>
#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/bcd.h>
+#include <linux/export.h>
#include <linux/pnp.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
+#include <asm/setup.h>
#ifdef CONFIG_X86_32
/*
* This is a special lock that is owned by the CPU and holds the index
* register we are working with. It is required for NMI access to the
- * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
+ * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details.
*/
volatile unsigned long cmos_lock;
EXPORT_SYMBOL(cmos_lock);
#endif /* CONFIG_X86_32 */
-/* For two digit years assume time is always after that */
-#define CMOS_YEARS_OFFS 2000
-
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * In order to set the CMOS clock precisely, mach_set_cmos_time has to be
* called 500 ms after the second nowtime has started, because when
* nowtime is written into the registers of the CMOS clock, it will
* jump to the next second precisely 500 ms later. Check the Motorola
* MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- * sets the minutes. Usually you'll only notice that after reboot!
*/
-int mach_set_rtc_mmss(unsigned long nowtime)
+int mach_set_cmos_time(const struct timespec64 *now)
{
- int real_seconds, real_minutes, cmos_minutes;
- unsigned char save_control, save_freq_select;
+ unsigned long long nowtime = now->tv_sec;
+ struct rtc_time tm;
int retval = 0;
- /* tell the clock it's being set */
- save_control = CMOS_READ(RTC_CONTROL);
- CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
- /* stop and reset prescaler */
- save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
- CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
- cmos_minutes = CMOS_READ(RTC_MINUTES);
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
- cmos_minutes = bcd2bin(cmos_minutes);
-
- /*
- * since we're only adjusting minutes and seconds,
- * don't interfere with hour overflow. This avoids
- * messing with unknown time zones but requires your
- * RTC not to be off by more than 15 minutes
- */
- real_seconds = nowtime % 60;
- real_minutes = nowtime / 60;
- /* correct for half hour time zone */
- if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
- real_minutes += 30;
- real_minutes %= 60;
-
- if (abs(real_minutes - cmos_minutes) < 30) {
- if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
- real_seconds = bin2bcd(real_seconds);
- real_minutes = bin2bcd(real_minutes);
- }
- CMOS_WRITE(real_seconds, RTC_SECONDS);
- CMOS_WRITE(real_minutes, RTC_MINUTES);
+ rtc_time64_to_tm(nowtime, &tm);
+ if (!rtc_valid_tm(&tm)) {
+ retval = mc146818_set_time(&tm);
+ if (retval)
+ printk(KERN_ERR "%s: RTC write failed with error %d\n",
+ __func__, retval);
} else {
- printk(KERN_WARNING
- "set_rtc_mmss: can't update from %d to %d\n",
- cmos_minutes, real_minutes);
- retval = -1;
+ printk(KERN_ERR
+ "%s: Invalid RTC value: write of %llx to RTC failed\n",
+ __func__, nowtime);
+ retval = -EINVAL;
}
-
- /* The following flags have to be released exactly in this order,
- * otherwise the DS12887 (popular MC146818A clone with integrated
- * battery and quartz) will not reset the oscillator and will not
- * update precisely 500 ms later. You won't find this mentioned in
- * the Dallas Semiconductor data sheets, but who believes data
- * sheets anyway ... -- Markus Kuhn
- */
- CMOS_WRITE(save_control, RTC_CONTROL);
- CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-
return retval;
}
-unsigned long mach_get_cmos_time(void)
+void mach_get_cmos_time(struct timespec64 *now)
{
- unsigned int status, year, mon, day, hour, min, sec, century = 0;
+ struct rtc_time tm;
/*
- * If UIP is clear, then we have >= 244 microseconds before
- * RTC registers will be updated. Spec sheet says that this
- * is the reliable way to read RTC - registers. If UIP is set
- * then the register access might be invalid.
+ * If pm_trace abused the RTC as storage, set the timespec to 0,
+ * which tells the caller that this RTC value is unusable.
*/
- while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
- cpu_relax();
-
- sec = CMOS_READ(RTC_SECONDS);
- min = CMOS_READ(RTC_MINUTES);
- hour = CMOS_READ(RTC_HOURS);
- day = CMOS_READ(RTC_DAY_OF_MONTH);
- mon = CMOS_READ(RTC_MONTH);
- year = CMOS_READ(RTC_YEAR);
-
-#ifdef CONFIG_ACPI
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- acpi_gbl_FADT.century)
- century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-
- status = CMOS_READ(RTC_CONTROL);
- WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
-
- if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
- sec = bcd2bin(sec);
- min = bcd2bin(min);
- hour = bcd2bin(hour);
- day = bcd2bin(day);
- mon = bcd2bin(mon);
- year = bcd2bin(year);
+ if (!pm_trace_rtc_valid()) {
+ now->tv_sec = now->tv_nsec = 0;
+ return;
}
- if (century) {
- century = bcd2bin(century);
- year += century * 100;
- printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
- } else
- year += CMOS_YEARS_OFFS;
+ if (mc146818_get_time(&tm, 1000)) {
+ pr_err("Unable to read current time from RTC\n");
+ now->tv_sec = now->tv_nsec = 0;
+ return;
+ }
- return mktime(year, mon, day, hour, min, sec);
+ now->tv_sec = rtc_tm_to_time64(&tm);
+ now->tv_nsec = 0;
}
/* Routines for accessing the CMOS RAM/RTC. */
@@ -166,36 +99,16 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
}
EXPORT_SYMBOL(rtc_cmos_write);
-int update_persistent_clock(struct timespec now)
+int update_persistent_clock64(struct timespec64 now)
{
- unsigned long flags;
- int retval;
-
- spin_lock_irqsave(&rtc_lock, flags);
- retval = x86_platform.set_wallclock(now.tv_sec);
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- return retval;
+ return x86_platform.set_wallclock(&now);
}
/* not static: needed by APM */
-void read_persistent_clock(struct timespec *ts)
-{
- unsigned long retval, flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
- retval = x86_platform.get_wallclock();
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- ts->tv_sec = retval;
- ts->tv_nsec = 0;
-}
-
-unsigned long long native_read_tsc(void)
+void read_persistent_clock64(struct timespec64 *ts)
{
- return __native_read_tsc();
+ x86_platform.get_wallclock(ts);
}
-EXPORT_SYMBOL(native_read_tsc);
static struct resource rtc_resources[] = {
@@ -221,21 +134,20 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- static const char *ids[] __initconst =
+ static const char * const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
- struct pnp_id *id;
int i;
pnp_for_each_dev(dev) {
- for (id = dev->id; id; id = id->next) {
- for (i = 0; i < ARRAY_SIZE(ids); i++) {
- if (compare_pnp_id(id, ids[i]) != 0)
- return 0;
- }
+ for (i = 0; i < ARRAY_SIZE(ids); i++) {
+ if (compare_pnp_id(dev->id, ids[i]) != 0)
+ return 0;
}
}
#endif
+ if (!x86_platform.legacy.rtc)
+ return -ENODEV;
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd031..1b2edd07a3e1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1,121 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 1995 Linus Torvalds
*
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
+ * This file contains the setup_arch() code, which handles the architecture-dependent
+ * parts of early kernel initialization.
*/
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
#include <linux/acpi.h>
-#include <linux/sfi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/bootmem.h>
-#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/mca.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/crash_dump.h>
+#include <linux/dma-map-ops.h>
#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
+#include <linux/hugetlb.h>
+#include <linux/ima.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
+#include <linux/memblock.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/delay.h>
-
-#include <linux/kallsyms.h>
-#include <linux/cpufreq.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/uaccess.h>
-
-#include <linux/percpu.h>
-#include <linux/crash_dump.h>
+#include <linux/random.h>
+#include <linux/root_dev.h>
+#include <linux/static_call.h>
+#include <linux/swiotlb.h>
#include <linux/tboot.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <linux/vmalloc.h>
-#include <video/edid.h>
+#include <uapi/linux/mount.h>
+
+#include <xen/xen.h>
-#include <asm/mtrr.h>
#include <asm/apic.h>
-#include <asm/trampoline.h>
-#include <asm/e820.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/efi.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/vmi.h>
-#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
#include <asm/bugs.h>
-
-#include <asm/system.h>
-#include <asm/vsyscall.h>
+#include <asm/cacheinfo.h>
+#include <asm/coco.h>
#include <asm/cpu.h>
-#include <asm/desc.h>
-#include <asm/dma.h>
-#include <asm/iommu.h>
+#include <asm/efi.h>
#include <asm/gart.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-
-#include <asm/paravirt.h>
#include <asm/hypervisor.h>
-
-#include <asm/percpu.h>
-#include <asm/topology.h>
-#include <asm/apicdef.h>
-#include <asm/k8.h>
-#ifdef CONFIG_X86_64
-#include <asm/numa_64.h>
-#endif
+#include <asm/io_apic.h>
+#include <asm/kasan.h>
+#include <asm/kaslr.h>
#include <asm/mce.h>
+#include <asm/memtype.h>
+#include <asm/mtrr.h>
+#include <asm/nmi.h>
+#include <asm/numa.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pci-direct.h>
+#include <asm/prom.h>
+#include <asm/proto.h>
+#include <asm/realmode.h>
+#include <asm/thermal.h>
+#include <asm/unwind.h>
+#include <asm/vsyscall.h>
/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
+ * max_pfn_mapped: highest directly mapped pfn > 4 GB
+ *
+ * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped[].
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -124,73 +71,50 @@ unsigned long max_pfn_mapped;
RESERVE_BRK(dmi_alloc, 65536);
#endif
-unsigned int boot_cpu_id __read_mostly;
-
-static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
-unsigned long _brk_end = (unsigned long)__brk_base;
-#ifdef CONFIG_X86_64
-int default_cpu_present_to_apicid(int mps_cpu)
-{
- return __default_cpu_present_to_apicid(mps_cpu);
-}
+unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
-int default_check_phys_apicid_present(int phys_apicid)
-{
- return __default_check_phys_apicid_present(phys_apicid);
-}
-#endif
-
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-struct boot_params __initdata boot_params;
-#else
struct boot_params boot_params;
-#endif
/*
- * Machine setup..
+ * These are the four main kernel memory regions; we put them into
+ * the resource tree so that kdump tools and other debugging tools
+ * can recover them:
*/
+
+static struct resource rodata_resource = {
+ .name = "Kernel rodata",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
static struct resource data_resource = {
.name = "Kernel data",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource code_resource = {
.name = "Kernel code",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
static struct resource bss_resource = {
.name = "Kernel bss",
.start = 0,
.end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
};
#ifdef CONFIG_X86_32
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
-/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
-EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
- MCA_bus = x;
-#endif
-}
-
-unsigned int def_to_bigsmp;
-
-/* for MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
+/* CPU data as detected by the assembly code in head_32.S */
+struct cpuinfo_x86 new_cpu_data;
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
@@ -203,30 +127,96 @@ EXPORT_SYMBOL(ist_info);
struct ist_info ist_info;
#endif
-#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
- .x86_phys_bits = MAX_PHYSMEM_BITS,
-};
-EXPORT_SYMBOL(boot_cpu_data);
#endif
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+SYM_PIC_ALIAS(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
#else
-unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
+#endif
+
+#ifdef CONFIG_IMA
+static phys_addr_t ima_kexec_buffer_phys;
+static size_t ima_kexec_buffer_size;
#endif
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
int bootloader_type, bootloader_version;
+static const struct ctl_table x86_sysctl_table[] = {
+ {
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_unrecovered_nmi",
+ .data = &panic_on_unrecovered_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_io_nmi",
+ .data = &panic_on_io_nmi,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_type",
+ .data = &bootloader_type,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "bootloader_version",
+ .data = &bootloader_version,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "io_delay_type",
+ .data = &io_delay_type,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#if defined(CONFIG_ACPI_SLEEP)
+ {
+ .procname = "acpi_video_flags",
+ .data = &acpi_realmode_flags,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+#endif
+};
+
+static int __init init_x86_sysctl(void)
+{
+ register_sysctl_init("kernel", x86_sysctl_table);
+ return 0;
+}
+arch_initcall(init_x86_sysctl);
+
/*
* Setup options
*/
struct screen_info screen_info;
EXPORT_SYMBOL(screen_info);
+#if defined(CONFIG_FIRMWARE_EDID)
struct edid_info edid_info;
EXPORT_SYMBOL_GPL(edid_info);
+#endif
extern int root_mountflags;
@@ -238,7 +228,8 @@ unsigned long saved_video_mode;
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+bool builtin_cmdline_added __ro_after_init;
#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -284,16 +275,8 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
+#ifdef CONFIG_X86_32
+static void __init cleanup_highmap(void)
{
}
#endif
@@ -301,7 +284,8 @@ static inline void init_gbpages(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_reserve_kern(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -310,101 +294,92 @@ static void __init reserve_brk(void)
#ifdef CONFIG_BLK_DEV_INITRD
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+static u64 __init get_ramdisk_image(void)
+{
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+ ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+ if (ramdisk_image == 0)
+ ramdisk_image = phys_initrd_start;
+
+ return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+ if (ramdisk_size == 0)
+ ramdisk_size = phys_initrd_size;
+
+ return ramdisk_size;
+}
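Both helpers splice a 64-bit value out of the legacy 32-bit header field and
its ext_* extension; the composition is simply (a restatement, with
illustrative parameter names):

#include <stdint.h>

static uint64_t compose64(uint32_t legacy_lo, uint32_t ext_hi)
{
	return (uint64_t)legacy_lo | ((uint64_t)ext_hi << 32);
}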
+
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
- u64 ramdisk_here;
- unsigned long slop, clen, mapaddr;
- char *p, *q;
+ int ret = 0;
- /* We need to move the initrd down into lowmem */
- ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
- PAGE_SIZE);
-
- if (ramdisk_here == -1ULL)
+ /* We need to move the initrd down into directly mapped mem */
+ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
+ if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
- ramdisk_size);
+ ramdisk_size);
- /* Note: this includes all the lowmem currently occupied by
- the initrd, we rely on that fact to keep the data intact. */
- reserve_early(ramdisk_here, ramdisk_here + area_size,
- "NEW RAMDISK");
- initrd_start = ramdisk_here + PAGE_OFFSET;
+ initrd_start = relocated_ramdisk + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
- printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
- ramdisk_here, ramdisk_here + ramdisk_size);
-
- q = (char *)initrd_start;
-
- /* Copy any lowmem portion of the initrd */
- if (ramdisk_image < end_of_lowmem) {
- clen = end_of_lowmem - ramdisk_image;
- p = (char *)__va(ramdisk_image);
- memcpy(q, p, clen);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
+ printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- /* Copy the highmem portion of the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_memremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
- /* high pages is not converted by early_res_to_bootmem */
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
- " %08llx - %08llx\n",
+ ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
+ if (ret)
+ panic("Copy RAMDISK failed\n");
+
+ printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+ " [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
- ramdisk_here, ramdisk_here + ramdisk_size - 1);
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
}
-static void __init reserve_initrd(void)
+static void __init early_reserve_initrd(void)
{
/* Assume only end is not page aligned */
- u64 ramdisk_image = boot_params.hdr.ramdisk_image;
- u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
- initrd_start = 0;
+ memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image);
+}
- if (ramdisk_size >= (end_of_lowmem>>1)) {
- free_early(ramdisk_image, ramdisk_end);
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
- }
+static void __init reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
- ramdisk_end);
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ initrd_start = 0;
+ printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+ ramdisk_end - 1);
- if (ramdisk_end <= end_of_lowmem) {
- /* All in lowmem, easy case */
- /*
- * don't need to reserve again, already reserved early
- * in i386_start_kernel
- */
+ if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ PFN_DOWN(ramdisk_end))) {
+ /* All are mapped, easy case */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -412,145 +387,243 @@ static void __init reserve_initrd(void)
relocate_initrd();
- free_early(ramdisk_image, ramdisk_end);
+ memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
+
#else
+static void __init early_reserve_initrd(void)
+{
+}
static void __init reserve_initrd(void)
{
}
#endif /* CONFIG_BLK_DEV_INITRD */
+static void __init add_early_ima_buffer(u64 phys_addr)
+{
+#ifdef CONFIG_IMA
+ struct ima_setup_data *data;
+
+ data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data));
+ if (!data) {
+ pr_warn("setup: failed to memremap ima_setup_data entry\n");
+ return;
+ }
+
+ if (data->size) {
+ memblock_reserve_kern(data->addr, data->size);
+ ima_kexec_buffer_phys = data->addr;
+ ima_kexec_buffer_size = data->size;
+ }
+
+ early_memunmap(data, sizeof(*data));
+#else
+ pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n");
+#endif
+}
+
+#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
+int __init ima_free_kexec_buffer(void)
+{
+ if (!ima_kexec_buffer_size)
+ return -ENOENT;
+
+ memblock_free_late(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
+
+ ima_kexec_buffer_phys = 0;
+ ima_kexec_buffer_size = 0;
+
+ return 0;
+}
+
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
+{
+ if (!ima_kexec_buffer_size)
+ return -ENOENT;
+
+ *addr = __va(ima_kexec_buffer_phys);
+ *size = ima_kexec_buffer_size;
+
+ return 0;
+}
+#endif
+
+static void __init add_kho(u64 phys_addr, u32 data_len)
+{
+ struct kho_data *kho;
+ u64 addr = phys_addr + sizeof(struct setup_data);
+ u64 size = data_len - sizeof(struct setup_data);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
+ return;
+ }
+
+ kho = early_memremap(addr, size);
+ if (!kho) {
+ pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
+ addr, size);
+ return;
+ }
+
+ kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
+
+ early_memunmap(kho, size);
+}
+
static void __init parse_setup_data(void)
{
struct setup_data *data;
- u64 pa_data;
+ u64 pa_data, pa_next;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- data = early_memremap(pa_data, PAGE_SIZE);
- switch (data->type) {
+ u32 data_len, data_type;
+
+ data = early_memremap(pa_data, sizeof(*data));
+ data_len = data->len + sizeof(struct setup_data);
+ data_type = data->type;
+ pa_next = data->next;
+ early_memunmap(data, sizeof(*data));
+
+ switch (data_type) {
case SETUP_E820_EXT:
- parse_e820_ext(data, pa_data);
+ e820__memory_setup_extended(pa_data, data_len);
+ break;
+ case SETUP_DTB:
+ add_dtb(pa_data);
+ break;
+ case SETUP_EFI:
+ parse_efi_setup(pa_data, data_len);
+ break;
+ case SETUP_IMA:
+ add_early_ima_buffer(pa_data);
+ break;
+ case SETUP_KEXEC_KHO:
+ add_kho(pa_data, data_len);
+ break;
+ case SETUP_RNG_SEED:
+ data = early_memremap(pa_data, data_len);
+ add_bootloader_randomness(data->data, data->len);
+ /* Zero seed for forward secrecy. */
+ memzero_explicit(data->data, data->len);
+ /* Zero length in case we find ourselves back here by accident. */
+ memzero_explicit(&data->len, sizeof(data->len));
+ early_memunmap(data, data_len);
break;
default:
break;
}
- pa_data = data->next;
- early_iounmap(data, PAGE_SIZE);
+ pa_data = pa_next;
}
}
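parse_setup_data() walks a singly linked list of physically addressed blobs;
each node's header is mapped just long enough to read its type, length, and
next pointer before the payload is dispatched. A sketch of the node layout and
traversal (struct fields per the x86 boot protocol; map/unmap stand in for
early_memremap/early_memunmap and pretend physical addresses are directly
usable, which only holds for this sketch):

#include <stddef.h>
#include <stdint.h>

/* Node header per the x86 boot protocol's setup_data chain. */
struct setup_data {
	uint64_t next;	/* physical address of the next node, 0 ends */
	uint32_t type;	/* SETUP_E820_EXT, SETUP_EFI, SETUP_IMA, ... */
	uint32_t len;	/* payload length, excluding this header */
	uint8_t data[];
};

static void *map(uint64_t phys, size_t len) { (void)len; return (void *)(uintptr_t)phys; }
static void unmap(void *virt, size_t len) { (void)virt; (void)len; }

/* Read each header, stash next/type before unmapping, then dispatch. */
static void walk_setup_data(uint64_t pa_data)
{
	while (pa_data) {
		struct setup_data *data = map(pa_data, sizeof(*data));
		uint64_t pa_next = data->next;
		uint32_t type = data->type;

		unmap(data, sizeof(*data));

		switch (type) {
		/* SETUP_E820_EXT, SETUP_EFI, SETUP_IMA, ... as above */
		default:
			break;
		}
		pa_data = pa_next;
	}
}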
-static void __init e820_reserve_setup_data(void)
+/*
+ * Translate the fields of 'struct boot_param' into global variables
+ * representing these parameters.
+ */
+static void __init parse_boot_params(void)
{
- struct setup_data *data;
- u64 pa_data;
- int found = 0;
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+#if defined(CONFIG_FIRMWARE_EDID)
+ edid_info = boot_params.edid_info;
+#endif
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_memremap(pa_data, sizeof(*data));
- e820_update_range(pa_data, sizeof(*data)+data->len,
- E820_RAM, E820_RESERVED_KERN);
- found = 1;
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
}
- if (!found)
- return;
+#endif
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "extended physical RAM map:\n");
- e820_print_map("reserve setup_data");
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
}
-static void __init reserve_early_setup_data(void)
+static void __init memblock_x86_reserve_range_setup_data(void)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
- u64 pa_data;
- char buf[32];
+ u64 pa_data, pa_next;
+ u32 len;
- if (boot_params.hdr.version < 0x0209)
- return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
- }
-}
+ if (!data) {
+ pr_warn("setup: failed to memremap setup_data entry\n");
+ return;
+ }
-/*
- * --------- Crashkernel reservation ------------------------------
- */
+ len = sizeof(*data);
+ pa_next = data->next;
-#ifdef CONFIG_KEXEC
+ memblock_reserve_kern(pa_data, sizeof(*data) + data->len);
-static inline unsigned long long get_total_mem(void)
-{
- unsigned long long total;
+ if (data->type == SETUP_INDIRECT) {
+ len += data->len;
+ early_memunmap(data, sizeof(*data));
+ data = early_memremap(pa_data, len);
+ if (!data) {
+ pr_warn("setup: failed to memremap indirect setup_data\n");
+ return;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
- total = max_pfn - min_low_pfn;
+ if (indirect->type != SETUP_INDIRECT)
+ memblock_reserve_kern(indirect->addr, indirect->len);
+ }
- return total << PAGE_SHIFT;
+ pa_data = pa_next;
+ early_memunmap(data, len);
+ }
}
-static void __init reserve_crashkernel(void)
+static void __init arch_reserve_crashkernel(void)
{
- unsigned long long total_mem;
- unsigned long long crash_size, crash_base;
+ unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0;
+ bool high = false;
int ret;
- total_mem = get_total_mem();
-
- ret = parse_crashkernel(boot_command_line, total_mem,
- &crash_size, &crash_base);
- if (ret != 0 || crash_size <= 0)
+ if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
- /* 0 means: find the address automatically */
- if (crash_base <= 0) {
- const unsigned long long alignment = 16<<20; /* 16M */
-
- crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
- alignment);
- if (crash_base == -1ULL) {
- pr_info("crashkernel reservation failed - No suitable area found.\n");
- return;
- }
- } else {
- unsigned long long start;
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
+ &crash_size, &crash_base,
+ &low_size, &cma_size, &high);
+ if (ret)
+ return;
- start = find_e820_area(crash_base, ULONG_MAX, crash_size,
- 1<<20);
- if (start != crash_base) {
- pr_info("crashkernel reservation failed - memory is in use.\n");
- return;
- }
+ if (xen_pv_domain()) {
+ pr_info("Ignoring crashkernel for a Xen PV domain\n");
+ return;
}
- reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
+ reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
+ reserve_crashkernel_cma(cma_size);
}
-#else
-static void __init reserve_crashkernel(void)
-{
-}
-#endif
static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
@@ -585,111 +658,92 @@ void __init reserve_standard_io_resources(void)
}
-/*
- * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
- * is_kdump_kernel() to determine if we are booting after a panic. Hence
- * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
- */
-
-#ifdef CONFIG_CRASH_DUMP
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-static int __init setup_elfcorehdr(char *arg)
+static void __init setup_kernel_resources(void)
{
- char *end;
- if (!arg)
- return -EINVAL;
- elfcorehdr_addr = memparse(arg, &end);
- return end > arg ? 0 : -EINVAL;
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ rodata_resource.start = __pa_symbol(__start_rodata);
+ rodata_resource.end = __pa_symbol(__end_rodata)-1;
+ data_resource.start = __pa_symbol(_sdata);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &rodata_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-static __init void reserve_ibft_region(void)
+static bool __init snb_gfx_workaround_needed(void)
{
- unsigned long addr, size = 0;
-
- addr = find_ibft_region(&size);
+#ifdef CONFIG_PCI
+ int i;
+ u16 vendor, devid;
+ static const __initconst u16 snb_ids[] = {
+ 0x0102,
+ 0x0112,
+ 0x0122,
+ 0x0106,
+ 0x0116,
+ 0x0126,
+ 0x010a,
+ };
+
+ /* Assume no if something weird is going on with PCI */
+ if (!early_pci_allowed())
+ return false;
+
+ vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+ if (vendor != 0x8086)
+ return false;
+
+ devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+ for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+ if (devid == snb_ids[i])
+ return true;
+#endif
- if (size)
- reserve_early_overlap_ok(addr, addr + size, "ibft");
+ return false;
}
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, exclude
+ * them from allocation.
+ */
+static void __init trim_snb_memory(void)
{
- printk(KERN_NOTICE
- "%s detected: BIOS may corrupt low RAM, working around it.\n",
- d->ident);
+ static const __initconst unsigned long bad_pages[] = {
+ 0x20050000,
+ 0x20110000,
+ 0x20130000,
+ 0x20138000,
+ 0x40004000,
+ };
+ int i;
- e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ if (!snb_gfx_workaround_needed())
+ return;
- return 0;
-}
-#endif
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix/MSC BIOS",
- .matches = {
- DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
- },
- },
/*
- * AMI BIOS with low memory corruption was found on Intel DG45ID and
- * DG45FC boards.
- * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
- * match only DMI_BOARD_NAME and see if there is more bad products
- * with this vendor.
+ * SandyBridge integrated graphics devices have a bug that prevents
+ * them from accessing certain memory ranges, namely anything below
+ * 1M and in the pages listed in bad_pages[] above.
+ *
+ * To avoid these pages ever being accessed by SNB gfx devices, reserve
+ * bad_pages that have not already been reserved at boot time.
+ * All memory below the 1 MB mark is anyway reserved later during
+ * setup_arch(), so there is no need to reserve it here.
*/
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
- },
- },
- {
- .callback = dmi_low_memory_corruption,
- .ident = "AMI BIOS",
- .matches = {
- DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
- },
- },
- /*
- * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
- * match on the product name.
- */
- {
- .callback = dmi_low_memory_corruption,
- .ident = "Phoenix BIOS",
- .matches = {
- DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
- },
- },
-#endif
- {}
-};
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+ bad_pages[i]);
+ }
+}
static void __init trim_bios_range(void)
{
@@ -697,15 +751,117 @@ static void __init trim_bios_range(void)
* A special case is the first 4Kb of memory;
* This is a BIOS owned area, not kernel ram, but generally
* not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
*/
- e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+ e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+
/*
- * special case: Some BIOSen report the PC BIOS
- * area (640->1Mb) as ram even though it is not.
+ * special case: Some BIOSes report the PC BIOS
+ * area (640Kb -> 1Mb) as RAM even though it is not.
* take them out.
*/
- e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
+
+ e820__update_table(e820_table);
+}
+
+/* called before trim_bios_range() to spare extra sanitize */
+static void __init e820_add_kernel_range(void)
+{
+ u64 start = __pa_symbol(_text);
+ u64 size = __pa_symbol(_end) - start;
+
+ /*
+ * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and
+ * attempt to fix it by adding the range. We may have a confused BIOS,
+ * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+ * exclude kernel range. If we really are running on top non-RAM,
+ * we will crash later anyways.
+ */
+ if (e820__mapped_all(start, start + size, E820_TYPE_RAM))
+ return;
+
+ pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
+ e820__range_remove(start, size, E820_TYPE_RAM, 0);
+ e820__range_add(start, size, E820_TYPE_RAM);
+}
+
+static void __init early_reserve_memory(void)
+{
+ /*
+ * Reserve the memory occupied by the kernel between _text and
+ * __end_of_kernel_reserve symbols. Any kernel sections after the
+ * __end_of_kernel_reserve symbol must be explicitly reserved with a
+ * separate memblock_reserve() or they will be discarded.
+ */
+ memblock_reserve_kern(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+
+ /*
+ * The first 4Kb of memory is a BIOS owned area, but generally it is
+ * not listed as such in the E820 table.
+ *
+ * Reserve the first 64K of memory since some BIOSes are known to
+ * corrupt low memory. After the real mode trampoline is allocated the
+ * rest of the memory below 640k is reserved.
+ *
+ * In addition, make sure page 0 is always reserved because on
+ * systems with L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, SZ_64K);
+
+ early_reserve_initrd();
+
+ memblock_x86_reserve_range_setup_data();
+
+ reserve_bios_regions();
+ trim_snb_memory();
+}
+
+/*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+ if (kaslr_enabled()) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+ kaslr_offset(),
+ __START_KERNEL,
+ __START_KERNEL_map,
+ MODULES_VADDR-1);
+ } else {
+ pr_emerg("Kernel Offset: disabled\n");
+ }
+
+ return 0;
+}
+
+void x86_configure_nx(void)
+{
+ if (boot_cpu_has(X86_FEATURE_NX))
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+static void __init x86_report_nx(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#else
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
+#endif
+ }
}
/*
@@ -723,163 +879,147 @@ static void __init trim_bios_range(void)
void __init setup_arch(char **cmdline_p)
{
- int acpi = 0;
- int k8 = 0;
-
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- visws_early_detect();
-#else
- printk(KERN_INFO "Command line: %s\n", boot_command_line);
-#endif
-
- /* VMI may relocate the fixmap; do this before touching ioremap area */
- vmi_init();
-
- early_trap_init();
- early_cpu_init();
- early_ioremap_init();
- ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
- screen_info = boot_params.screen_info;
- edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
- apm_info.bios = boot_params.apm_bios_info;
- ist_info = boot_params.ist_info;
- if (boot_params.sys_desc_table.length != 0) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
- machine_id = boot_params.sys_desc_table.table[0];
- machine_submodel_id = boot_params.sys_desc_table.table[1];
- BIOS_revision = boot_params.sys_desc_table.table[2];
- }
-#endif
- saved_video_mode = boot_params.hdr.vid_mode;
- bootloader_type = boot_params.hdr.type_of_loader;
- if ((bootloader_type >> 4) == 0xe) {
- bootloader_type &= 0xf;
- bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
- }
- bootloader_version = bootloader_type & 0xf;
- bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
-#endif
-#ifdef CONFIG_EFI
- if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
- "EL32",
+ load_cr3(swapper_pg_dir);
+ /*
+ * Note: Quark X1000 CPUs advertise PGE incorrectly and require
+ * a cr3 based tlb flush, so the following __flush_tlb_all()
+ * will not flush anything because the CPU quirk which clears
+ * X86_FEATURE_PGE has not been invoked yet. Though due to the
+ * load_cr3() above the TLB has been flushed already. The
+ * quirk is invoked before subsequent calls to __flush_tlb_all()
+ * so proper operation is guaranteed.
+ */
+ __flush_tlb_all();
#else
- "EL64",
-#endif
- 4)) {
- efi_enabled = 1;
- efi_reserve_early();
- }
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
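
clone_pgd_range() above is, in effect, a memcpy over top-level page-table
slots; a minimal sketch of its semantics (not necessarily the kernel's exact
definition):

    static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
    {
            /* copy 'count' PGD entries so dst maps the same kernel range */
            memcpy(dst, src, count * sizeof(pgd_t));
    }
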
- x86_init.oem.arch_setup();
-
- setup_memory_map();
- parse_setup_data();
- /* update the e820_saved too */
- e820_reserve_setup_data();
-
- copy_edd();
-
- if (!boot_params.hdr.root_flags)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = _brk_end;
-
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
- bss_resource.start = virt_to_phys(&__bss_start);
- bss_resource.end = virt_to_phys(&__bss_stop)-1;
-
#ifdef CONFIG_CMDLINE_BOOL
#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
#else
if (builtin_cmdline[0]) {
/* append boot loader cmdline to builtin */
strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
}
#endif
+ builtin_cmdline_added = true;
#endif
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
/*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
+ olpc_ofw_detect();
+
+ idt_setup_early_traps();
+ early_cpu_init();
+ jump_label_init();
+ static_call_init();
+ early_ioremap_init();
+
+ setup_olpc_ofw_pgd();
+
+ parse_boot_params();
+
+ x86_init.oem.arch_setup();
+
+ /*
+ * Do some memory reservations *before* memory is added to memblock, so
+ * memblock allocations won't overwrite it.
+ *
+ * After this point, everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not RAM in
+ * e820. All other memory is free game.
+ *
+ * This call needs to happen before e820__memory_setup() which calls the
+ * xen_memory_setup() on Xen dom0 which relies on the fact that those
+ * early reservations have happened already.
+ */
+ early_reserve_memory();
+
+ iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
+ e820__memory_setup();
+ parse_setup_data();
+
+ copy_edd();
+
+ setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
+
+ /*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
- * console setup can safely call set_fixmap()). It may then be called
- * again from within noexec_setup() during parsing early parameters
- * to honor the respective command line option.
+ * console setup can safely call set_fixmap()).
*/
x86_configure_nx();
parse_early_param();
- x86_report_nx();
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
- /* Must be before kernel pagetables are setup */
- vmi_activate();
+ x86_report_nx();
- /* after early param, so could get panic from serial */
- reserve_early_setup_data();
+ apic_setup_apic_calls();
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
- disable_apic = 1;
+ apic_is_disabled = true;
#endif
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
-#ifdef CONFIG_PCI
- if (pci_early_dump_regs)
- early_dump_pci_devices();
-#endif
-
- finish_e820_parsing();
+ e820__finish_early_params();
- if (efi_enabled)
+ if (efi_enabled(EFI_BOOT))
efi_init();
- dmi_scan_machine();
-
- dmi_check_system(bad_bios_dmi_table);
+ reserve_ibft_region();
+ x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
- * needs to be done after dmi_scan_machine, for the BP.
+ * needs to be done after dmi_setup(), for the boot CPU.
+ * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be
+ * called before cache_bp_init() for setting up MTRR state.
*/
init_hypervisor_platform();
+ tsc_early_init();
x86_init.resources.probe_roms();
- /* after parse_early_param, so could debug it */
- insert_resource(&iomem_resource, &code_resource);
- insert_resource(&iomem_resource, &data_resource);
- insert_resource(&iomem_resource, &bss_resource);
+ /*
+ * Add resources for kernel text and data to the iomem_resource.
+ * Do it after parse_early_param, so it can be debugged.
+ */
+ setup_kernel_resources();
+ e820_add_kernel_range();
trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
- e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
- E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
+ E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
printk(KERN_INFO "fixed physical RAM map:\n");
- e820_print_map("bad_ppro");
+ e820__print_table("bad_ppro");
}
#else
early_gart_iommu_check();
@@ -889,75 +1029,121 @@ void __init setup_arch(char **cmdline_p)
* partially used pages are not usable - thus
* we are rounding upwards:
*/
- max_pfn = e820_end_of_ram_pfn();
+ max_pfn = e820__end_of_ram_pfn();
- /* preallocate 4k for mptable mpc */
- early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
+ cache_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
- max_pfn = e820_end_of_ram_pfn();
+ max_pfn = e820__end_of_ram_pfn();
+
+ max_possible_pfn = max_pfn;
+
+ /*
+ * Define random base addresses for memory sections after max_pfn is
+ * defined and before each memory section base is used.
+ */
+ kernel_randomize_memory();
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
#else
- num_physpages = max_pfn;
-
check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
- max_low_pfn = e820_end_of_low_ram_pfn();
+ max_low_pfn = e820__end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
-
- high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
- max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
-#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
- setup_bios_corruption_check();
-#endif
+ /* Find and reserve MPTABLE area */
+ x86_init.mpparse.find_mptable();
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
+ early_alloc_pgt_buf();
+ /*
+ * Conclude brk before e820__memblock_setup(): the latter may use
+ * memblock_find_in_range(), which could otherwise hand out memory
+ * that overlaps the brk area.
+ */
reserve_brk();
+ cleanup_highmap();
+
+ e820__memblock_setup();
+
/*
- * Find and reserve possible boot-time SMP configuration:
+ * Needs to run after memblock setup because it needs the physical
+ * memory size.
*/
- find_smp_config();
-
- reserve_ibft_region();
+ mem_encrypt_setup_arch();
+ cc_random_init();
- reserve_trampoline_memory();
+ efi_find_mirror();
+ efi_esrt_init();
+ efi_mokvar_table_init();
-#ifdef CONFIG_ACPI_SLEEP
/*
- * Reserve low memory region for sleep support.
- * even before init_memory_mapping
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
*/
- acpi_reserve_wakeup_memory();
-#endif
- init_gbpages();
+ efi_reserve_boot_services();
- /* max_pfn_mapped is updated here */
- max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
- max_pfn_mapped = max_low_pfn_mapped;
+ /* preallocate 4k for mptable mpc */
+ e820__memblock_alloc_reserved_mpc_new();
-#ifdef CONFIG_X86_64
- if (max_pfn > max_low_pfn) {
- max_pfn_mapped = init_memory_mapping(1UL<<32,
- max_pfn<<PAGE_SHIFT);
- /* can we preseve max_low_pfn ?*/
- max_low_pfn = max_pfn;
- }
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+ setup_bios_corruption_check();
+#endif
+
+#ifdef CONFIG_X86_32
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif
/*
+ * Find free memory for the real mode trampoline and place it there. If
+ * there is not enough free memory under 1M, on EFI-enabled systems
+ * there will be an additional attempt to reclaim the memory for the
+ * real mode trampoline at efi_free_boot_services().
+ *
+ * Unconditionally reserve the entire first 1M of RAM because BIOSes
+ * are known to corrupt low memory, and a few hundred kilobytes are
+ * not worth the complexity of detecting exactly what gets clobbered.
+ * Windows does the same thing for very similar reasons.
+ *
+ * Moreover, on machines with SandyBridge graphics or in setups that use
+ * crashkernel, the entire 1M is reserved anyway.
+ *
+ * Note that TDX host kernels also require the first 1M to be reserved.
+ */
+ x86_platform.realmode_reserve();
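
A minimal sketch of the reservation a default realmode_reserve() hook performs
first, assuming the standard x86 implementation (helper name abbreviated):

    /* take the whole low 1M out of the free pool before memblock
     * allocations can land there; the real mode trampoline is then
     * carved out of this reserved region */
    static void __init reserve_low_1m_sketch(void)
    {
            memblock_reserve(0, SZ_1M);
    }
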
+
+ init_mem_mapping();
+
+ /*
+ * init_mem_mapping() relies on the early IDT page fault handling.
+ * Now either enable FRED or install the real page fault handler
+ * for 64-bit in the IDT.
+ */
+ cpu_init_replace_early_idt();
+
+ /*
+ * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+ * with the current CR4 value. This may not be necessary, but
+ * auditing all the early-boot CR4 manipulation would be needed to
+ * rule it out.
+ *
+ * Mask off features that don't work outside long mode (just
+ * PCIDE for now).
+ */
+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
+
+ memblock_set_current_limit(get_max_mapped());
+
+ /*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
*/
@@ -965,104 +1151,134 @@ void __init setup_arch(char **cmdline_p)
if (init_ohci1394_dma_early)
init_ohci1394_dma_on_all_controllers();
#endif
+ /* Allocate bigger log buffer */
+ setup_log_buf(1);
+
+ if (efi_enabled(EFI_BOOT)) {
+ switch (boot_params.secure_boot) {
+ case efi_secureboot_mode_disabled:
+ pr_info("Secure boot disabled\n");
+ break;
+ case efi_secureboot_mode_enabled:
+ pr_info("Secure boot enabled\n");
+ break;
+ default:
+ pr_info("Secure boot could not be determined\n");
+ break;
+ }
+ }
reserve_initrd();
- reserve_crashkernel();
+ acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
vsmp_init();
io_delay_init();
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
+ early_platform_quirks();
+ /* Some platforms need the APIC registered for NUMA configuration */
early_acpi_boot_init();
+ x86_init.mpparse.early_parse_smp_cfg();
+
+ x86_flattree_get_config();
+
+ initmem_init();
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
+
+ if (boot_cpu_has(X86_FEATURE_GBPAGES)) {
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+ hugetlb_bootmem_alloc();
+ }
-#ifdef CONFIG_ACPI_NUMA
/*
- * Parse SRAT to discover nodes.
+ * Reserve memory for crash kernel after SRAT is parsed so that it
+ * won't consume hotpluggable memory.
*/
- acpi = acpi_numa_init();
-#endif
+ arch_reserve_crashkernel();
-#ifdef CONFIG_K8_NUMA
- if (!acpi)
- k8 = !k8_numa_init(0, max_pfn);
-#endif
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
- initmem_init(0, max_pfn, acpi, k8);
-#ifndef CONFIG_NO_BOOTMEM
- early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-#endif
-
- dma32_reserve_bootmem();
+ x86_init.paging.pagetable_init();
-#ifdef CONFIG_KVM_CLOCK
- kvmclock_init();
-#endif
+ kasan_init();
- x86_init.paging.pagetable_setup_start(swapper_pg_dir);
- paging_init();
- x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ /*
+ * Sync back kernel address range.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
tboot_probe();
-#ifdef CONFIG_X86_64
map_vsyscall();
-#endif
- generic_apic_probe();
+ x86_32_probe_apic();
early_quirks();
+ topology_apply_cmdline_limits_early();
+
/*
- * Read APIC and some other early information from ACPI tables.
+ * Parse SMP configuration. Try ACPI first and then the platform
+ * specific parser.
*/
acpi_boot_init();
+ x86_init.mpparse.parse_smp_cfg();
- sfi_init();
-
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- get_smp_config();
+ /* Last opportunity to detect and map the local APIC */
+ init_apic_mappings();
- prefill_possible_map();
+ topology_init_possible_cpus();
-#ifdef CONFIG_X86_64
init_cpu_to_node();
-#endif
-
- init_apic_mappings();
- ioapic_init_mappings();
+ init_gi_nodes();
- /* need to wait for io_apic is mapped */
- probe_nr_irqs_gsi();
+ io_apic_init_mappings();
- kvm_guest_init();
+ x86_init.hyper.guest_late_init();
- e820_reserve_resources();
- e820_mark_nosave_regions(max_low_pfn);
+ e820__reserve_resources();
+ e820__register_nosave_regions(max_pfn);
x86_init.resources.reserve_resources();
- e820_setup_gap();
+ e820__setup_pci_gap();
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
+ if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ vgacon_register_screen(&screen_info);
#endif
#endif
x86_init.oem.banner();
+ x86_init.timers.wallclock_init();
+
+ /*
+ * This needs to run before setup_local_APIC() which soft-disables the
+ * local APIC temporarily and that masks the thermal LVT interrupt,
+ * leading to softlockups on machines which have configured SMI
+ * interrupt delivery.
+ */
+ therm_lvt_init();
+
mcheck_init();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
+
+#ifdef CONFIG_EFI
+ if (efi_enabled(EFI_BOOT))
+ efi_apply_memmap_quirks();
+#endif
+
+ unwind_init();
}
#ifdef CONFIG_X86_32
@@ -1081,3 +1297,22 @@ void __init i386_reserve_resources(void)
}
#endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
+
+#ifdef CONFIG_HOTPLUG_CPU
+bool arch_cpu_is_hotpluggable(int cpu)
+{
+ return cpu > 0;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..bfa48e7a32a2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,17 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
+#include <linux/stackprotector.h>
#include <asm/sections.h>
#include <asm/processor.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
@@ -19,23 +22,14 @@
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
-#include <asm/stackprotector.h>
-DEFINE_PER_CPU(int, cpu_number);
+DEFINE_PER_CPU_CACHE_HOT(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
-#ifdef CONFIG_X86_64
-#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
-#else
-#define BOOT_PERCPU_OFFSET 0
-#endif
-
-DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
- [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
-};
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
EXPORT_SYMBOL(__per_cpu_offset);
/*
@@ -64,7 +58,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
*/
static bool __init pcpu_need_numa(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
pg_data_t *last = NULL;
unsigned int cpu;
@@ -82,67 +76,9 @@ static bool __init pcpu_need_numa(void)
}
#endif
-/**
- * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
- * @cpu: cpu to allocate for
- * @size: size allocation in bytes
- * @align: alignment
- *
- * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
- * does the right thing for NUMA regardless of the current
- * configuration.
- *
- * RETURNS:
- * Pointer to the allocated area on success, NULL on failure.
- */
-static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
- unsigned long align)
-{
- const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
- int node = early_cpu_to_node(cpu);
- void *ptr;
-
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = __alloc_bootmem_nopanic(size, align, goal);
- pr_info("cpu %d has no node %d or node-local memory\n",
- cpu, node);
- pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
- cpu, size, __pa(ptr));
- } else {
- ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- size, align, goal);
- pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
- cpu, size, node, __pa(ptr));
- }
- return ptr;
-#else
- return __alloc_bootmem_nopanic(size, align, goal);
-#endif
-}
-
-/*
- * Helpers for first chunk memory allocation
- */
-static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
-{
- return pcpu_alloc_bootmem(cpu, size, align);
-}
-
-static void __init pcpu_fc_free(void *ptr, size_t size)
-{
-#ifdef CONFIG_NO_BOOTMEM
- u64 start = __pa(ptr);
- u64 end = start + size;
- free_early_partial(start, end);
-#else
- free_bootmem(__pa(ptr), size);
-#endif
-}
-
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
if (early_cpu_to_node(from) == early_cpu_to_node(to))
return LOCAL_DISTANCE;
else
@@ -152,7 +88,12 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
#endif
}
-static void __init pcpup_populate_pte(unsigned long addr)
+static int __init pcpu_cpu_to_node(int cpu)
+{
+ return early_cpu_to_node(cpu);
+}
+
+void __init pcpu_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
@@ -160,13 +101,10 @@ static void __init pcpup_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct gdt;
+ struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32,
+ per_cpu_offset(cpu), 0xFFFFF);
- pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
- gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_table(cpu),
- GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
}
@@ -176,7 +114,7 @@ void __init setup_per_cpu_areas(void)
unsigned long delta;
int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
@@ -191,22 +129,33 @@ void __init setup_per_cpu_areas(void)
#endif
rc = -EINVAL;
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
- const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
const size_t dyn_size = PERCPU_MODULE_RESERVE +
PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
- pcpu_fc_alloc, pcpu_fc_free);
+ pcpu_cpu_to_node);
if (rc < 0)
- pr_warning("%s allocator failed (%d), falling back to page size\n",
- pcpu_fc_names[pcpu_chosen_fc], rc);
+ pr_warn("%s allocator failed (%d), falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
- pcpu_fc_alloc, pcpu_fc_free,
- pcpup_populate_pte);
+ pcpu_cpu_to_node);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
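
Concretely, with 4K pages the embed path above uses atom_size = PMD_SIZE
= 2 MiB on x86-64 and atom_size = PAGE_SIZE = 4 KiB on 32-bit. If
pcpu_embed_first_chunk() fails (for example because NUMA nodes are too far
apart for the chosen atom size), pcpu_page_first_chunk() builds the first
chunk page by page through pcpu_populate_pte() instead.
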
@@ -217,7 +166,6 @@ void __init setup_per_cpu_areas(void)
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
- setup_stack_canary_segment(cpu);
/*
* Copy data used in early init routines from the
* initial arrays to the per cpu data areas. These
@@ -228,13 +176,9 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_LOCAL_APIC
per_cpu(x86_cpu_to_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_apicid, cpu);
- per_cpu(x86_bios_cpu_apicid, cpu) =
- early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+ per_cpu(x86_cpu_to_acpiid, cpu) =
+ early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
-#ifdef CONFIG_X86_64
- per_cpu(irq_stack_ptr, cpu) =
- per_cpu(irq_stack_union.irq_stack, cpu) +
- IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -248,21 +192,20 @@ void __init setup_per_cpu_areas(void)
*/
set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
-#endif
/*
* Up to this point, the boot CPU has been using .init.data
* area. Reload any changed state for the boot CPU.
*/
- if (cpu == boot_cpu_id)
- switch_to_new_gdt(cpu);
+ if (!cpu)
+ switch_gdt_and_percpu_base(cpu);
}
/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
- early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+ early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+#ifdef CONFIG_NUMA
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
@@ -271,4 +214,16 @@ void __init setup_per_cpu_areas(void)
/* Setup cpu initialized, callin, callout masks */
setup_cpu_local_masks();
+
+ /*
+ * Sync back kernel address range again. We already did this in
+ * setup_arch(), but percpu data also needs to be available in
+ * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to
+ * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available
+ * there too.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
}
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
new file mode 100644
index 000000000000..1ab65f6c6ae7
--- /dev/null
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sev_verify_cbit.S - Code for verification of the C-bit position reported
+ * by the Hypervisor when running with SEV enabled.
+ *
+ * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de)
+ *
+ * sev_verify_cbit() is called before switching to a new long-mode page-table
+ * at boot.
+ *
+ * Verify that the C-bit position is correct by writing a random value to
+ * an encrypted memory location while on the current page-table. Then it
+ * switches to the new page-table to verify the memory content is still the
+ * same. After that it switches back to the current page-table and when the
+ * check succeeded it returns. If the check failed the code invalidates the
+ * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to
+ * make sure no interrupt or exception can get the CPU out of the hlt loop.
+ *
+ * New page-table pointer is expected in %rdi (first parameter)
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .code64
+SYM_FUNC_START(sev_verify_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* First check if a C-bit was detected */
+ movq sme_me_mask(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */
+ movq sev_status(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* Save CR4 in %rsi */
+ movq %cr4, %rsi
+
+ /* Disable Global Pages */
+ movq %rsi, %rdx
+ andq $(~X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /*
+ * Verified that running under SEV - now get a random value using
+ * RDRAND. This instruction is mandatory when running as an SEV guest.
+ *
+ * Don't bail out of the loop if RDRAND returns errors. It is better to
+ * prevent forward progress than to work with a non-random value here.
+ */
+1: rdrand %rdx
+ jnc 1b
+
+ /* Store value to memory and keep it in %rdx */
+ movq %rdx, sev_check_data(%rip)
+
+ /* Backup current %cr3 value to restore it later */
+ movq %cr3, %rcx
+
+ /* Switch to new %cr3 - This might unmap the stack */
+ movq %rdi, %cr3
+
+ /*
+ * Compare value in %rdx with memory location. If C-bit is incorrect
+ * this would read the encrypted data and make the check fail.
+ */
+ cmpq %rdx, sev_check_data(%rip)
+
+ /* Restore old %cr3 */
+ movq %rcx, %cr3
+
+ /* Restore previous CR4 */
+ movq %rsi, %cr4
+
+ /* Check CMPQ result */
+ je 3f
+
+ /*
+ * The check failed. To block any forward progress (and with it any
+ * ROP attack surface), invalidate the stack and go into a hlt loop.
+ */
+ xorq %rsp, %rsp
+ subq $0x1000, %rsp
+2: hlt
+ jmp 2b
+3:
+#endif
+ /* Return page-table pointer */
+ movq %rdi, %rax
+ RET
+SYM_FUNC_END(sev_verify_cbit)
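
The control flow above, restated as C-like pseudocode (illustrative only;
rdrand64(), read_cr3(), write_cr3() and hlt_forever() are stand-in helpers,
and the CR4.PGE save/restore dance is omitted):

    unsigned long sev_verify_cbit_sketch(unsigned long new_pgtable)
    {
            unsigned long val, old_cr3;
            bool ok;

            if (!sme_me_mask || !sev_status)        /* SME-only or no encryption */
                    return new_pgtable;

            while (!rdrand64(&val))                 /* retry until RDRAND succeeds */
                    ;
            sev_check_data = val;                   /* write via the current page table */

            old_cr3 = read_cr3();
            write_cr3(new_pgtable);                 /* may unmap the stack */
            ok = (sev_check_data == val);           /* wrong C-bit => garbled read */
            write_cr3(old_cr3);

            if (!ok)
                    hlt_forever();                  /* stack invalidated, no return */
            return new_pgtable;
    }
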
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
deleted file mode 100644
index cb22acf3ed09..000000000000
--- a/arch/x86/kernel/sfi.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * sfi.c - x86 architecture SFI support.
- *
- * Copyright (c) 2009, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/io.h>
-
-#include <asm/io_apic.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-
-void __init mp_sfi_register_lapic_address(unsigned long address)
-{
- mp_lapic_addr = address;
-
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = read_apic_id();
-
- pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
-}
-
-/* All CPUs enumerated by SFI must be present and enabled */
-void __cpuinit mp_sfi_register_lapic(u8 id)
-{
- if (MAX_APICS - id <= 0) {
- pr_warning("Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
- return;
- }
-
- pr_info("registering lapic[%d]\n", id);
-
- generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
-}
-
-static int __init sfi_parse_cpus(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_cpu_table_entry *pentry;
- int i;
- int cpu_num;
-
- sb = (struct sfi_table_simple *)table;
- cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
- pentry = (struct sfi_cpu_table_entry *)sb->pentry;
-
- for (i = 0; i < cpu_num; i++) {
- mp_sfi_register_lapic(pentry->apic_id);
- pentry++;
- }
-
- smp_found_config = 1;
- return 0;
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-
-#ifdef CONFIG_X86_IO_APIC
-
-static int __init sfi_parse_ioapic(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_apic_table_entry *pentry;
- int i, num;
-
- sb = (struct sfi_table_simple *)table;
- num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
- pentry = (struct sfi_apic_table_entry *)sb->pentry;
-
- for (i = 0; i < num; i++) {
- mp_register_ioapic(i, pentry->phys_addr, gsi_top);
- pentry++;
- }
-
- WARN(pic_mode, KERN_WARNING
- "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
- pic_mode = 0;
- return 0;
-}
-#endif /* CONFIG_X86_IO_APIC */
-
-/*
- * sfi_platform_init(): register lapics & io-apics
- */
-int __init sfi_platform_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- mp_sfi_register_lapic_address(sfi_lapic_addr);
- sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
-#endif
-#ifdef CONFIG_X86_IO_APIC
- sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
-#endif
- return 0;
-}
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index 000000000000..978232b6d48d
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,635 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <linux/syscalls.h>
+#include <asm/msr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/shstk.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/api.h>
+#include <asm/prctl.h>
+
+#define SS_FRAME_SIZE 8
+
+static bool features_enabled(unsigned long features)
+{
+ return current->thread.features & features;
+}
+
+static void features_set(unsigned long features)
+{
+ current->thread.features |= features;
+}
+
+static void features_clr(unsigned long features)
+{
+ current->thread.features &= ~features;
+}
+
+/*
+ * Create a restore token on the shadow stack. A token is always 8-byte
+ * and aligned to 8.
+ */
+static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+ unsigned long addr;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(ssp, 8))
+ return -EINVAL;
+
+ addr = ssp - SS_FRAME_SIZE;
+
+ /*
+ * SSP is aligned, so the reserved bits and the mode bit are zero. Just
+ * mark the token 64-bit.
+ */
+ ssp |= BIT(0);
+
+ if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
+ return -EFAULT;
+
+ if (token_addr)
+ *token_addr = addr;
+
+ return 0;
+}
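
Decoding a token produced this way is straightforward; rstor_token_to_ssp()
is a hypothetical helper, shown only to make the layout explicit:

    static inline unsigned long rstor_token_to_ssp(unsigned long token)
    {
            /* bit 0 is the 64-bit mode flag; bits 1-2 are zero because
             * the encoded SSP was 8-byte aligned */
            return token & ~7UL;
    }
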
+
+/*
+ * VM_SHADOW_STACK will have a guard page. This helps userspace protect
+ * itself from attacks. The reasoning is as follows:
+ *
+ * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The
+ * INCSSP instruction can increment the shadow stack pointer. It is the
+ * shadow stack analog of an instruction like:
+ *
+ * addq $0x80, %rsp
+ *
+ * However, there is one important difference between an ADD on %rsp
+ * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
+ * memory of the first and last elements that were "popped". It can be
+ * thought of as acting like this:
+ *
+ * READ_ONCE(ssp); // read+discard top element on stack
+ * ssp += nr_to_pop * 8; // move the shadow stack
+ * READ_ONCE(ssp-8); // read+discard last popped stack element
+ *
+ * The maximum distance INCSSP can move the SSP is 2040 bytes, before
+ * it would read the memory. Therefore a single page gap will be enough
+ * to prevent any operation from shifting the SSP to an adjacent stack,
+ * since it would have to land in the gap at least once, causing a
+ * fault.
+ */
+static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
+ unsigned long token_offset, bool set_res_tok)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
+ struct mm_struct *mm = current->mm;
+ unsigned long mapped_addr, unused;
+
+ if (addr)
+ flags |= MAP_FIXED_NOREPLACE;
+
+ mmap_write_lock(mm);
+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
+ goto out;
+
+ if (create_rstor_token(mapped_addr + token_offset, NULL)) {
+ vm_munmap(mapped_addr, size);
+ return -EINVAL;
+ }
+
+out:
+ return mapped_addr;
+}
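
The arithmetic behind the one-page guard described above: INCSSP can move the
SSP by at most 2040 bytes, i.e. 255 entries of 8 bytes (the 255-entry limit is
assumed here from the 2040-byte figure), so a stray read must land inside a
4K gap. A compile-time restatement (illustrative, not kernel code):

    #define SHSTK_ENTRY_SIZE   8
    #define INCSSP_MAX_ENTRIES 255  /* assumed from the 2040-byte bound */

    _Static_assert(INCSSP_MAX_ENTRIES * SHSTK_ENTRY_SIZE == 2040,
                   "max INCSSP move");
    _Static_assert(INCSSP_MAX_ENTRIES * SHSTK_ENTRY_SIZE < 4096,
                   "one 4K guard page always catches the stray read");
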
+
+static unsigned long adjust_shstk_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+}
+
+static void unmap_shadow_stack(u64 base, u64 size)
+{
+ int r;
+
+ r = vm_munmap(base, size);
+
+ /*
+ * mmap_write_lock_killable() failed with -EINTR. This means
+ * the process is about to die and have its MM cleaned up.
+ * This task shouldn't ever make it back to userspace. In this
+ * case it is ok to leak a shadow stack, so just exit out.
+ */
+ if (r == -EINTR)
+ return;
+
+ /*
+ * For all other types of vm_munmap() failure, either the
+ * system is out of memory or there is a bug.
+ */
+ WARN_ON_ONCE(r);
+}
+
+static int shstk_setup(void)
+{
+ struct thread_shstk *shstk = &current->thread.shstk;
+ unsigned long addr, size;
+
+ /* Already enabled */
+ if (features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /* Also not supported for 32 bit */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
+ return -EOPNOTSUPP;
+
+ size = adjust_shstk_size(0);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ fpregs_lock_and_load();
+ wrmsrq(MSR_IA32_PL3_SSP, addr + size);
+ wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
+ fpregs_unlock();
+
+ shstk->base = addr;
+ shstk->size = size;
+ features_set(ARCH_SHSTK_SHSTK);
+
+ return 0;
+}
+
+void reset_thread_features(void)
+{
+ memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
+ current->thread.features = 0;
+ current->thread.features_locked = 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
+ unsigned long stack_size)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+ unsigned long addr, size;
+
+ /*
+ * If shadow stack is not enabled on the new thread, skip any
+ * switch to a new shadow stack.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /*
+ * For CLONE_VFORK the child will share the parent's shadow stack.
+ * Make sure to clear the internal tracking of the thread shadow
+ * stack so the freeing logic that runs for the child knows to leave
+ * it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return 0;
+ }
+
+ /*
+ * For !CLONE_VM the child will use a copy of the parent's shadow
+ * stack.
+ */
+ if (!(clone_flags & CLONE_VM))
+ return 0;
+
+ size = adjust_shstk_size(stack_size);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ shstk->base = addr;
+ shstk->size = size;
+
+ return addr + size;
+}
+
+static unsigned long get_user_shstk_addr(void)
+{
+ unsigned long long ssp;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+
+ fpregs_unlock();
+
+ return ssp;
+}
+
+int shstk_pop(u64 *val)
+{
+ int ret = 0;
+ u64 ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ if (val && get_user(*val, (__user u64 *)ssp))
+ ret = -EFAULT;
+ else
+ wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
+ fpregs_unlock();
+
+ return ret;
+}
+
+int shstk_push(u64 val)
+{
+ u64 ssp;
+ int ret;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -ENOTSUPP;
+
+ fpregs_lock_and_load();
+
+ rdmsrq(MSR_IA32_PL3_SSP, ssp);
+ ssp -= SS_FRAME_SIZE;
+ ret = write_user_shstk_64((__user void *)ssp, val);
+ if (!ret)
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return ret;
+}
+
+#define SHSTK_DATA_BIT BIT(63)
+
+static int put_shstk_data(u64 __user *addr, u64 data)
+{
+ if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ /*
+ * Mark the high bit so that the sigframe can't be processed as a
+ * return address.
+ */
+ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
+ return -EFAULT;
+ return 0;
+}
+
+static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
+{
+ unsigned long ldata;
+
+ if (unlikely(get_user(ldata, addr)))
+ return -EFAULT;
+
+ if (!(ldata & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ *data = ldata & ~SHSTK_DATA_BIT;
+
+ return 0;
+}
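
The two helpers above form a simple round trip on bit 63; a self-contained
sketch of the invariant (illustrative, with hypothetical helper names):

    #include <stdint.h>
    #define SHSTK_DATA_BIT (1ULL << 63)

    static inline uint64_t shstk_mark(uint64_t data)  { return data | SHSTK_DATA_BIT;  }
    static inline uint64_t shstk_unmark(uint64_t raw) { return raw & ~SHSTK_DATA_BIT; }
    /* a marked slot can never be consumed as a return address, since
     * canonical user-space code addresses never have bit 63 set */
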
+
+static int shstk_push_sigframe(unsigned long *ssp)
+{
+ unsigned long target_ssp = *ssp;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(target_ssp, 8))
+ return -EINVAL;
+
+ *ssp -= SS_FRAME_SIZE;
+ if (put_shstk_data((void __user *)*ssp, target_ssp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int shstk_pop_sigframe(unsigned long *ssp)
+{
+ struct vm_area_struct *vma;
+ unsigned long token_addr;
+ bool need_to_check_vma;
+ int err = 1;
+
+ /*
+ * It is possible for the SSP to be off the end of a shadow stack by 4
+ * or 8 bytes. If the SSP sits at the start of a page, or 4 bytes before
+ * one, this might be such a case, so check that the address being read
+ * actually is shadow stack.
+ */
+ if (!IS_ALIGNED(*ssp, 8))
+ return -EINVAL;
+
+ need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
+
+ if (need_to_check_vma)
+ mmap_read_lock_killable(current->mm);
+
+ err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
+ if (unlikely(err))
+ goto out_err;
+
+ if (need_to_check_vma) {
+ vma = find_vma(current->mm, *ssp);
+ if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
+ err = -EFAULT;
+ goto out_err;
+ }
+
+ mmap_read_unlock(current->mm);
+ }
+
+ /* Restore SSP aligned? */
+ if (unlikely(!IS_ALIGNED(token_addr, 8)))
+ return -EINVAL;
+
+ /* SSP in userspace? */
+ if (unlikely(token_addr >= TASK_SIZE_MAX))
+ return -EINVAL;
+
+ *ssp = token_addr;
+
+ return 0;
+out_err:
+ if (need_to_check_vma)
+ mmap_read_unlock(current->mm);
+ return err;
+}
+
+int setup_signal_shadow_stack(struct ksignal *ksig)
+{
+ void __user *restorer = ksig->ka.sa.sa_restorer;
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ if (!restorer)
+ return -EINVAL;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_push_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ /* Push restorer address */
+ ssp -= SS_FRAME_SIZE;
+ err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
+ if (unlikely(err))
+ return -EFAULT;
+
+ fpregs_lock_and_load();
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+int restore_signal_shadow_stack(void)
+{
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_pop_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ fpregs_lock_and_load();
+ wrmsrq(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return;
+
+ /*
+ * When fork() with CLONE_VM fails, the child (tsk) already has a
+ * shadow stack allocated, and exit_thread() calls this function to
+ * free it. In this case the parent (current) and the child share
+ * the same mm struct.
+ */
+ if (!tsk->mm || tsk->mm != current->mm)
+ return;
+
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!shstk->base)
+ return;
+
+ /*
+ * shstk->base being NULL is normal for CLONE_VFORK child tasks. But
+ * size == 0 with a non-NULL shstk->base is not normal and indicates
+ * an attempt to free the thread shadow stack twice.
+ * Warn about it.
+ */
+ if (WARN_ON(!shstk->size))
+ return;
+
+ unmap_shadow_stack(shstk->base, shstk->size);
+
+ shstk->size = 0;
+}
+
+static int wrss_control(bool enable)
+{
+ u64 msrval;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /*
+ * Only enable WRSS if shadow stack is enabled. If shadow stack is not
+ * enabled, WRSS will already be disabled, so don't bother clearing it
+ * when disabling.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -EPERM;
+
+ /* Already enabled/disabled? */
+ if (features_enabled(ARCH_SHSTK_WRSS) == enable)
+ return 0;
+
+ fpregs_lock_and_load();
+ rdmsrq(MSR_IA32_U_CET, msrval);
+
+ if (enable) {
+ features_set(ARCH_SHSTK_WRSS);
+ msrval |= CET_WRSS_EN;
+ } else {
+ features_clr(ARCH_SHSTK_WRSS);
+ if (!(msrval & CET_WRSS_EN))
+ goto unlock;
+
+ msrval &= ~CET_WRSS_EN;
+ }
+
+ wrmsrq(MSR_IA32_U_CET, msrval);
+
+unlock:
+ fpregs_unlock();
+
+ return 0;
+}
+
+static int shstk_disable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /* Already disabled? */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ fpregs_lock_and_load();
+ /* Disable WRSS too when disabling shadow stack */
+ wrmsrq(MSR_IA32_U_CET, 0);
+ wrmsrq(MSR_IA32_PL3_SSP, 0);
+ fpregs_unlock();
+
+ shstk_free(current);
+ features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+ bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+ unsigned long aligned_size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ if (flags & ~SHADOW_STACK_SET_TOKEN)
+ return -EINVAL;
+
+ /* If there isn't space for a token */
+ if (set_tok && size < 8)
+ return -ENOSPC;
+
+ if (addr && addr < SZ_4G)
+ return -ERANGE;
+
+ /*
+ * An overflow would result in attempting to write the restore token
+ * to the wrong location. Not catastrophic, but just return the right
+ * error code and block it.
+ */
+ aligned_size = PAGE_ALIGN(size);
+ if (aligned_size < size)
+ return -EOVERFLOW;
+
+ return alloc_shstk(addr, aligned_size, size, set_tok);
+}
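
From userspace the new syscall can be reached with a raw syscall(2); a hedged
sketch (the __NR_map_shadow_stack value of 453 and the flag value match the
x86-64 uapi headers at the time of writing, but verify against your headers):

    #include <unistd.h>
    #include <sys/syscall.h>
    #include <stddef.h>

    #ifndef __NR_map_shadow_stack
    #define __NR_map_shadow_stack 453           /* x86-64; assumed here */
    #endif
    #ifndef SHADOW_STACK_SET_TOKEN
    #define SHADOW_STACK_SET_TOKEN (1ULL << 0)
    #endif

    /* map a new shadow stack with a restore token at its top */
    static void *map_shstk(size_t size)
    {
            long ret = syscall(__NR_map_shadow_stack, 0, size,
                               SHADOW_STACK_SET_TOKEN);
            return ret == -1 ? NULL : (void *)ret;
    }
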
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
+{
+ unsigned long features = arg2;
+
+ if (option == ARCH_SHSTK_STATUS) {
+ return put_user(task->thread.features, (unsigned long __user *)arg2);
+ }
+
+ if (option == ARCH_SHSTK_LOCK) {
+ task->thread.features_locked |= features;
+ return 0;
+ }
+
+ /* Only allow via ptrace */
+ if (task != current) {
+ if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
+ task->thread.features_locked &= ~features;
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ /* Do not allow to change locked features */
+ if (features & task->thread.features_locked)
+ return -EPERM;
+
+ /* Only support enabling/disabling one feature at a time. */
+ if (hweight_long(features) > 1)
+ return -EINVAL;
+
+ if (option == ARCH_SHSTK_DISABLE) {
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(false);
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_disable();
+ return -EINVAL;
+ }
+
+ /* Handle ARCH_SHSTK_ENABLE */
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_setup();
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(true);
+ return -EINVAL;
+}
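
The prctl side is reached through arch_prctl(2); a hedged userspace sketch
(constant values as in asm/prctl.h at the time of writing; verify locally):

    #include <unistd.h>
    #include <sys/syscall.h>

    #define ARCH_SHSTK_ENABLE  0x5001           /* assumed uapi values */
    #define ARCH_SHSTK_STATUS  0x5005
    #define ARCH_SHSTK_SHSTK   (1ULL << 0)

    static int enable_shstk(void)
    {
            /* glibc has no wrapper for these arch_prctl codes; go raw */
            return syscall(SYS_arch_prctl, ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
    }
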
+
+int shstk_update_last_frame(unsigned long val)
+{
+ unsigned long ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ return write_user_shstk_64((u64 __user *)ssp, (u64)val);
+}
+
+bool shstk_is_enabled(void)
+{
+ return features_enabled(ARCH_SHSTK_SHSTK);
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e57..2404233336ab 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
@@ -6,686 +7,261 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
-#include <linux/signal.h>
+#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
-#include <linux/ptrace.h>
-#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
+#include <linux/syscalls.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/i387.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
-
-#ifdef CONFIG_X86_64
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#endif /* CONFIG_X86_64 */
+#include <asm/sighandling.h>
+#include <asm/vm86.h>
#include <asm/syscall.h>
-#include <asm/syscalls.h>
-
#include <asm/sigframe.h>
+#include <asm/signal.h>
+#include <asm/shstk.h>
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
- X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
- X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
- X86_EFLAGS_CF)
-
-#ifdef CONFIG_X86_32
-# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
-#else
-# define FIX_EFLAGS __FIX_EFLAGS
-#endif
-
-#define COPY(x) do { \
- get_user_ex(regs->x, &sc->x); \
-} while (0)
-
-#define GET_SEG(seg) ({ \
- unsigned short tmp; \
- get_user_ex(tmp, &sc->seg); \
- tmp; \
-})
-
-#define COPY_SEG(seg) do { \
- regs->seg = GET_SEG(seg); \
-} while (0)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
+}
-#define COPY_SEG_CPL3(seg) do { \
- regs->seg = GET_SEG(seg) | 3; \
-} while (0)
+static inline int is_ia32_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
+}
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
+static inline int is_x32_frame(struct ksignal *ksig)
{
- void __user *buf;
- unsigned int tmpflags;
- unsigned int err = 0;
-
- /* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
- get_user_try {
-
-#ifdef CONFIG_X86_32
- set_user_gs(regs, GET_SEG(gs));
- COPY_SEG(fs);
- COPY_SEG(es);
- COPY_SEG(ds);
-#endif /* CONFIG_X86_32 */
-
- COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
-
-#ifdef CONFIG_X86_64
- COPY(r8);
- COPY(r9);
- COPY(r10);
- COPY(r11);
- COPY(r12);
- COPY(r13);
- COPY(r14);
- COPY(r15);
-#endif /* CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_32
- COPY_SEG_CPL3(cs);
- COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
-
- get_user_ex(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
- regs->orig_ax = -1; /* disable syscall checks */
-
- get_user_ex(buf, &sc->fpstate);
- err |= restore_i387_xstate(buf);
-
- get_user_ex(*pax, &sc->ax);
- } get_user_catch(err);
-
- return err;
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
-static int
-setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
+/*
+ * Enable all pkeys temporarily, so as to ensure that both the current
+ * execution stack as well as the alternate signal stack are writeable.
+ * The application can use any of the available pkeys to protect the
+ * alternate signal stack, and we don't know which one it is, so enable
+ * all. The PKRU register will be reset to init_pkru later in the flow,
+ * in fpu__clear_user_states(), and it is the application's responsibility
+ * to enable the appropriate pkey as the first step in the signal handler
+ * so that the handler does not segfault.
+ */
+static inline u32 sig_prepare_pkru(void)
{
- int err = 0;
-
- put_user_try {
-
-#ifdef CONFIG_X86_32
- put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
- put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
- put_user_ex(regs->es, (unsigned int __user *)&sc->es);
- put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
-#endif /* CONFIG_X86_32 */
-
- put_user_ex(regs->di, &sc->di);
- put_user_ex(regs->si, &sc->si);
- put_user_ex(regs->bp, &sc->bp);
- put_user_ex(regs->sp, &sc->sp);
- put_user_ex(regs->bx, &sc->bx);
- put_user_ex(regs->dx, &sc->dx);
- put_user_ex(regs->cx, &sc->cx);
- put_user_ex(regs->ax, &sc->ax);
-#ifdef CONFIG_X86_64
- put_user_ex(regs->r8, &sc->r8);
- put_user_ex(regs->r9, &sc->r9);
- put_user_ex(regs->r10, &sc->r10);
- put_user_ex(regs->r11, &sc->r11);
- put_user_ex(regs->r12, &sc->r12);
- put_user_ex(regs->r13, &sc->r13);
- put_user_ex(regs->r14, &sc->r14);
- put_user_ex(regs->r15, &sc->r15);
-#endif /* CONFIG_X86_64 */
-
- put_user_ex(current->thread.trap_no, &sc->trapno);
- put_user_ex(current->thread.error_code, &sc->err);
- put_user_ex(regs->ip, &sc->ip);
-#ifdef CONFIG_X86_32
- put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
- put_user_ex(regs->flags, &sc->flags);
- put_user_ex(regs->sp, &sc->sp_at_signal);
- put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
-#else /* !CONFIG_X86_32 */
- put_user_ex(regs->flags, &sc->flags);
- put_user_ex(regs->cs, &sc->cs);
- put_user_ex(0, &sc->gs);
- put_user_ex(0, &sc->fs);
-#endif /* CONFIG_X86_32 */
-
- put_user_ex(fpstate, &sc->fpstate);
-
- /* non-iBCS2 extensions.. */
- put_user_ex(mask, &sc->oldmask);
- put_user_ex(current->thread.cr2, &sc->cr2);
- } put_user_catch(err);
-
- return err;
+ u32 orig_pkru = read_pkru();
+
+ write_pkru(0);
+ return orig_pkru;
}
/*
* Set up a signal frame.
*/
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT 16UL
+
+#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1)
+
/*
* Determine which stack to use.
*/
-static unsigned long align_sigframe(unsigned long sp)
-{
-#ifdef CONFIG_X86_32
- /*
- * Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0.
- */
- sp = ((sp + 4) & -16ul) - 4;
-#else /* !CONFIG_X86_32 */
- sp = round_down(sp, 16) - 8;
-#endif
- return sp;
-}
-
-static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
+ struct k_sigaction *ka = &ksig->ka;
+ int ia32_frame = is_ia32_frame(ksig);
/* Default to using normal stack */
+ bool nested_altstack = on_sig_stack(regs->sp);
+ bool entering_altstack = false;
+ unsigned long math_size = 0;
unsigned long sp = regs->sp;
- int onsigstack = on_sig_stack(sp);
+ unsigned long buf_fx = 0;
+ u32 pkru;
-#ifdef CONFIG_X86_64
/* redzone */
- sp -= 128;
-#endif /* CONFIG_X86_64 */
-
- if (!onsigstack) {
- /* This is the X/Open sanctioned signal stack switching. */
- if (ka->sa.sa_flags & SA_ONSTACK) {
- if (current->sas_ss_size)
- sp = current->sas_ss_sp + current->sas_ss_size;
- } else {
-#ifdef CONFIG_X86_32
- /* This is the legacy signal stack switching. */
- if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer)
- sp = (unsigned long) ka->sa.sa_restorer;
-#endif /* CONFIG_X86_32 */
+ if (!ia32_frame)
+ sp -= 128;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ /*
+ * This checks nested_altstack via sas_ss_flags(). Sensible
+ * programs use SS_AUTODISARM, which disables that check, and
+ * programs that don't use SS_AUTODISARM get the historically
+ * compatible behavior.
+ */
+ if (sas_ss_flags(sp) == 0) {
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ entering_altstack = true;
}
+ } else if (ia32_frame &&
+ !nested_altstack &&
+ regs->ss != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ /* This is the legacy signal stack switching. */
+ sp = (unsigned long) ka->sa.sa_restorer;
+ entering_altstack = true;
}
- if (used_math()) {
- sp -= sig_xstate_size;
-#ifdef CONFIG_X86_64
- sp = round_down(sp, 64);
-#endif /* CONFIG_X86_64 */
- *fpstate = (void __user *)sp;
- }
+ sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size);
+ *fpstate = (void __user *)sp;
- sp = align_sigframe(sp - frame_size);
+ sp -= frame_size;
+
+ if (ia32_frame)
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
+ else
+ sp = round_down(sp, FRAME_ALIGNMENT) - 8;
/*
* If we are on the alternate signal stack and would overflow it, don't.
* Return an always-bogus address instead so we will die with SIGSEGV.
*/
- if (onsigstack && !likely(on_sig_stack(sp)))
- return (void __user *)-1L;
+ if (unlikely((nested_altstack || entering_altstack) &&
+ !__on_sig_stack(sp))) {
- /* save i387 state */
- if (used_math() && save_i387_xstate(*fpstate) < 0)
- return (void __user *)-1L;
+ if (show_unhandled_signals && printk_ratelimit())
+ pr_info("%s[%d] overflowed sigaltstack\n",
+ current->comm, task_pid_nr(current));
- return (void __user *)sp;
-}
-
-#ifdef CONFIG_X86_32
-static const struct {
- u16 poplmovl;
- u32 val;
- u16 int80;
-} __attribute__((packed)) retcode = {
- 0xb858, /* popl %eax; movl $..., %eax */
- __NR_sigreturn,
- 0x80cd, /* int $0x80 */
-};
-
-static const struct {
- u8 movl;
- u32 val;
- u16 int80;
- u8 pad;
-} __attribute__((packed)) rt_retcode = {
- 0xb8, /* movl $..., %eax */
- __NR_rt_sigreturn,
- 0x80cd, /* int $0x80 */
- 0
-};
-
-static int
-__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
- struct pt_regs *regs)
-{
- struct sigframe __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (__put_user(sig, &frame->sig))
- return -EFAULT;
-
- if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
- return -EFAULT;
-
- if (_NSIG_WORDS > 1) {
- if (__copy_to_user(&frame->extramask, &set->sig[1],
- sizeof(frame->extramask)))
- return -EFAULT;
+ return (void __user *)-1L;
}
- if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
- else
- restorer = &frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
-
- /* Set up to return from userspace. */
- err |= __put_user(restorer, &frame->pretcode);
-
- /*
- * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = 0;
- regs->cx = 0;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-}
-
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *restorer;
- int err = 0;
- void __user *fpstate = NULL;
-
- frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(sig, &frame->sig);
- put_user_ex(&frame->info, &frame->pinfo);
- put_user_ex(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
-
- /* Create the ucontext. */
- if (cpu_has_xsave)
- put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- put_user_ex(0, &frame->uc.uc_flags);
- put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
- regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
- put_user_ex(restorer, &frame->pretcode);
-
+ /* Update PKRU to enable access to the alternate signal stack. */
+ pkru = sig_prepare_pkru();
+ /* save i387 and extended state */
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
/*
- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
+ * Restore PKRU to the original, user-defined value; disable
+ * extra pkeys enabled for the alternate signal stack, if any.
*/
- put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ka->sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = (unsigned long)&frame->info;
- regs->cx = (unsigned long)&frame->uc;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-}
-#else /* !CONFIG_X86_32 */
-static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *fp = NULL;
- int err = 0;
- struct task_struct *me = current;
-
- frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
-
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
- return -EFAULT;
-
- if (ka->sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, info))
- return -EFAULT;
+ write_pkru(pkru);
+ return (void __user *)-1L;
}
- put_user_try {
- /* Create the ucontext. */
- if (cpu_has_xsave)
- put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
- else
- put_user_ex(0, &frame->uc.uc_flags);
- put_user_ex(0, &frame->uc.uc_link);
- put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
- put_user_ex(sas_ss_flags(regs->sp),
- &frame->uc.uc_stack.ss_flags);
- put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
- err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- /* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER) {
- put_user_ex(ka->sa.sa_restorer, &frame->pretcode);
- } else {
- /* could use a vstub here */
- err |= -EFAULT;
- }
- } put_user_catch(err);
-
- if (err)
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->di = sig;
- /* In case the signal handler was declared without prototypes */
- regs->ax = 0;
-
- /* This also works for non SA_SIGINFO handlers because they expect the
- next argument after the signal number on the stack. */
- regs->si = (unsigned long)&frame->info;
- regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ka->sa.sa_handler;
-
- regs->sp = (unsigned long)frame;
-
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
- regs->cs = __USER_CS;
-
- return 0;
+ return (void __user *)sp;
}
-#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_X86_32
/*
- * Atomically swap in the new signal mask, and wait for a signal.
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
*/
-asmlinkage int
-sys_sigsuspend(int history0, int history1, old_sigset_t mask)
-{
- mask &= _BLOCKABLE;
- spin_lock_irq(&current->sighand->siglock);
- current->saved_sigmask = current->blocked;
- siginitset(&current->blocked, mask);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- set_restore_sigmask();
-
- return -ERESTARTNOHAND;
-}
-
-asmlinkage int
-sys_sigaction(int sig, const struct old_sigaction __user *act,
- struct old_sigaction __user *oact)
-{
- struct k_sigaction new_ka, old_ka;
- int ret = 0;
-
- if (act) {
- old_sigset_t mask;
-
- if (!access_ok(VERIFY_READ, act, sizeof(*act)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
- get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
- get_user_ex(mask, &act->sa_mask);
- get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
- } get_user_catch(ret);
-
- if (ret)
- return -EFAULT;
- siginitset(&new_ka.sa.sa_mask, mask);
- }
-
- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
- if (!ret && oact) {
- if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
- return -EFAULT;
-
- put_user_try {
- put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
- put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
- put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
- put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
- } put_user_catch(ret);
-
- if (ret)
- return -EFAULT;
- }
-
- return ret;
-}
-#endif /* CONFIG_X86_32 */
-
-long
-sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
- struct pt_regs *regs)
-{
- return do_sigaltstack(uss, uoss, regs->sp);
-}
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32)
+#else
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe)
+#endif
/*
- * Do a signal return; undo the signal stack.
+ * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
+ * If a signal frame starts at an unaligned address, extra space is required.
+ * This is the max alignment padding, conservatively.
*/
-#ifdef CONFIG_X86_32
-unsigned long sys_sigreturn(struct pt_regs *regs)
-{
- struct sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct sigframe __user *)(regs->sp - 8);
-
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
- && __copy_from_user(&set.sig[1], &frame->extramask,
- sizeof(frame->extramask))))
- goto badframe;
+#define MAX_XSAVE_PADDING 63UL
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- if (restore_sigcontext(regs, &frame->sc, &ax))
- goto badframe;
- return ax;
-
-badframe:
- signal_fault(regs, frame, "sigreturn");
+/*
+ * The frame data is composed of the following areas and laid out as:
+ *
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | (f)xsave frame |
+ * -------------------------
+ * | fsave header |
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | siginfo + ucontext |
+ * -------------------------
+ */
- return 0;
-}
-#endif /* CONFIG_X86_32 */
+/* max_frame_size tells userspace the worst case signal stack size. */
+static unsigned long __ro_after_init max_frame_size;
+static unsigned int __ro_after_init fpu_default_state_size;
-long sys_rt_sigreturn(struct pt_regs *regs)
+static int __init init_sigframe_size(void)
{
- struct rt_sigframe __user *frame;
- unsigned long ax;
- sigset_t set;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
- goto badframe;
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
- goto badframe;
-
- sigdelsetmask(&set, ~_BLOCKABLE);
- spin_lock_irq(&current->sighand->siglock);
- current->blocked = set;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
+ fpu_default_state_size = fpu__get_fpstate_size();
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
- goto badframe;
+ max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
- if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
- goto badframe;
+ max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING;
- return ax;
+ /* Userspace expects an aligned size. */
+ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
+ pr_info("max sigframe size: %lu\n", max_frame_size);
return 0;
}
+early_initcall(init_sigframe_size);
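/*
 * Illustrative arithmetic, not from this patch: assuming FRAME_ALIGNMENT is
 * 16 (so MAX_FRAME_PADDING is 15) and taking a hypothetical 776-byte
 * siginfo+ucontext worst case with a 4096-byte default XSAVE buffer:
 *
 *	max_frame_size = round_up(776 + 15 + 4096 + 63, 16)
 *	               = round_up(4950, 16) = 4960
 */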
-/*
- * OK, we're invoking a handler:
- */
-static int signr_convert(int sig)
+unsigned long get_sigframe_size(void)
{
-#ifdef CONFIG_X86_32
- struct thread_info *info = current_thread_info();
-
- if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
- return info->exec_domain->signal_invmap[sig];
-#endif /* CONFIG_X86_32 */
- return sig;
+ return max_frame_size;
}
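/*
 * Illustrative userspace sketch, not from this patch: the value returned by
 * get_sigframe_size() is what the kernel reports through the AT_MINSIGSTKSZ
 * auxiliary vector entry, so a program can size its sigaltstack against the
 * real worst case instead of the legacy MINSIGSTKSZ constant.
 */
#include <signal.h>
#include <stdlib.h>
#include <sys/auxv.h>

static int setup_altstack(void)
{
	unsigned long min = getauxval(AT_MINSIGSTKSZ);	/* 0 if not provided */
	stack_t ss = { 0 };

	/* Worst-case frame plus some headroom for the handler itself. */
	ss.ss_size = (min ? min : MINSIGSTKSZ) + 4096;
	ss.ss_sp = malloc(ss.ss_size);
	if (!ss.ss_sp)
		return -1;
	return sigaltstack(&ss, NULL);
}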
-#ifdef CONFIG_X86_32
-
-#define is_ia32 1
-#define ia32_setup_frame __setup_frame
-#define ia32_setup_rt_frame __setup_rt_frame
-
-#else /* !CONFIG_X86_32 */
-
-#ifdef CONFIG_IA32_EMULATION
-#define is_ia32 test_thread_flag(TIF_IA32)
-#else /* !CONFIG_IA32_EMULATION */
-#define is_ia32 0
-#endif /* CONFIG_IA32_EMULATION */
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs *regs);
-
-#endif /* CONFIG_X86_32 */
-
static int
-setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs *regs)
+setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = signr_convert(sig);
- int ret;
+ /* Perform fixup for the pre-signal frame. */
+ rseq_signal_deliver(ksig, regs);
/* Set up the stack frame */
- if (is_ia32) {
- if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ if (is_ia32_frame(ksig)) {
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+ return ia32_setup_rt_frame(ksig, regs);
else
- ret = ia32_setup_frame(usig, ka, set, regs);
- } else
- ret = __setup_rt_frame(sig, ka, info, set, regs);
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
+ return ia32_setup_frame(ksig, regs);
+ } else if (is_x32_frame(ksig)) {
+ return x32_setup_rt_frame(ksig, regs);
+ } else {
+ return x64_setup_rt_frame(ksig, regs);
}
-
- return ret;
}
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
+static void
+handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
- int ret;
+ bool stepping, failed;
+ struct fpu *fpu = x86_task_fpu(current);
+
+ if (v8086_mode(regs))
+ save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
/* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* If so, check system call restarting.. */
switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
@@ -694,11 +270,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
break;
case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
+ if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
regs->ax = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
regs->ax = regs->orig_ax;
regs->ip -= 2;
@@ -707,104 +283,65 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
}
/*
- * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
- * flag so that register information in the sigcontext is correct.
+ * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+ * so that register information in the sigcontext is correct and
+ * then notify the tracer before entering the signal handler.
*/
- if (unlikely(regs->flags & X86_EFLAGS_TF) &&
- likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
- regs->flags &= ~X86_EFLAGS_TF;
-
- ret = setup_rt_frame(sig, ka, info, oldset, regs);
+ stepping = test_thread_flag(TIF_SINGLESTEP);
+ if (stepping)
+ user_disable_single_step(current);
- if (ret)
- return ret;
+ failed = (setup_rt_frame(ksig, regs) < 0);
+ if (!failed) {
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ *
+		 * Clear RF when entering the signal handler, because
+		 * leaving it set could suppress a debug exception on the
+		 * handler's first instruction.
+		 *
+		 * Clear TF for the case when it wasn't set by the debugger,
+		 * to avoid a recursive send_sigtrap() in the SIGTRAP handler.
+ */
+ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ fpu__clear_user_states(fpu);
+ }
+ signal_setup_done(failed, ksig, stepping);
+}
-#ifdef CONFIG_X86_64
- /*
- * This has nothing to do with segment registers,
- * despite the name. This magic affects uaccess.h
- * macros' behavior. Reset it to the normal setting.
- */
- set_fs(USER_DS);
+static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (current->restart_block.arch_data & TS_COMPAT)
+ return __NR_ia32_restart_syscall;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+ return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#else
+ return __NR_restart_syscall;
#endif
-
- /*
- * Clear the direction flag as per the ABI for function entry.
- */
- regs->flags &= ~X86_EFLAGS_DF;
-
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~X86_EFLAGS_TF;
-
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked, sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
}
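/*
 * Illustrative arithmetic, not from this patch: x32 system call numbers
 * are the 64-bit numbers with __X32_SYSCALL_BIT (0x40000000) ORed in, so
 * preserving that bit from orig_ax keeps the restart in the caller's ABI.
 * With __NR_restart_syscall == 219 and a hypothetical x32 orig_ax of
 * 0x40000123:
 *
 *	219 | (0x40000123 & 0x40000000) == 0x400000db	(x32 caller)
 *	219 | (       219 & 0x40000000) == 219		(64-bit caller)
 */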
-#ifdef CONFIG_X86_32
-#define NR_restart_syscall __NR_restart_syscall
-#else /* !CONFIG_X86_32 */
-#define NR_restart_syscall \
- test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
-#endif /* CONFIG_X86_32 */
-
/*
* Note that 'init' is a special process: it doesn't get signals it doesn't
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-static void do_signal(struct pt_regs *regs)
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
- struct k_sigaction ka;
- siginfo_t info;
- int signr;
- sigset_t *oldset;
-
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
+ struct ksignal ksig;
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- oldset = &current->saved_sigmask;
- else
- oldset = &current->blocked;
-
- signr = get_signal_to_deliver(&info, &ka, regs, NULL);
- if (signr > 0) {
+ if (get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
- if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /*
- * A signal was successfully delivered; the saved
- * sigmask will have been stored in the signal frame,
- * and will be restored by sigreturn, so we can simply
- * clear the TS_RESTORE_SIGMASK flag.
- */
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- }
+ handle_signal(&ksig, regs);
return;
}
/* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* Restart the system call - no handlers present */
switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
@@ -815,7 +352,7 @@ static void do_signal(struct pt_regs *regs)
break;
case -ERESTART_RESTARTBLOCK:
- regs->ax = NR_restart_syscall;
+ regs->ax = get_nr_restart_syscall(regs);
regs->ip -= 2;
break;
}
@@ -825,41 +362,7 @@ static void do_signal(struct pt_regs *regs)
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
- }
-}
-
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-#ifdef CONFIG_X86_MCE
- /* notify userspace of pending MCEs */
- if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
- /* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- if (current->replacement_session_keyring)
- key_replace_session_keyring();
- }
- if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
- fire_user_return_notifiers();
-
-#ifdef CONFIG_X86_32
- clear_thread_flag(TIF_IRET);
-#endif /* CONFIG_X86_32 */
+ restore_saved_sigmask();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -872,9 +375,65 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
- printk(KERN_CONT "\n");
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
}
- force_sig(SIGSEGV, me);
+ force_sig(SIGSEGV);
+}
+
+#ifdef CONFIG_DYNAMIC_SIGFRAME
+#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
+static bool strict_sigaltstack_size __ro_after_init = true;
+#else
+static bool strict_sigaltstack_size __ro_after_init = false;
+#endif
+
+static int __init strict_sas_size(char *arg)
+{
+ return kstrtobool(arg, &strict_sigaltstack_size) == 0;
+}
+__setup("strict_sas_size", strict_sas_size);
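/*
 * Illustrative usage, not from this patch: booting with "strict_sas_size=1"
 * enforces the sigaltstack size check on any kernel, while
 * "strict_sas_size=0" relaxes it even when CONFIG_STRICT_SIGALTSTACK_SIZE
 * selected the strict default.
 */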
+
+/*
+ * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
+ * state alone already exceeds that size. Programs which allocate such an
+ * undersized sigaltstack but never actually take a signal on it just
+ * continued to work. While always checking against the real size would be
+ * correct, this might be considered a regression.
+ *
+ * Therefore avoid the sanity check, unless enforced by kernel
+ * configuration or command line option.
+ *
+ * When dynamic FPU features are supported, the check is also enforced when
+ * the task has permissions to use dynamic features. Tasks which have no
+ * permission are checked against the size of the non-dynamic feature set
+ * if strict checking is enabled. This avoids forcing all tasks on the
+ * system to allocate large sigaltstacks even if they are never going
+ * to use a dynamic feature. As this is serialized via sighand::siglock
+ * any permission request for a dynamic feature either happened already
+ * or will see the newly installed sigaltstack size in the permission checks.
+ */
+bool sigaltstack_size_valid(size_t ss_size)
+{
+ unsigned long fsize = max_frame_size - fpu_default_state_size;
+ u64 mask;
+
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
+ return true;
+
+ fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size;
+ if (likely(ss_size > fsize))
+ return true;
+
+ if (strict_sigaltstack_size)
+ return ss_size > fsize;
+
+ mask = x86_task_fpu(current->group_leader)->perm.__state_perm;
+ if (mask & XFEATURE_MASK_USER_DYNAMIC)
+ return ss_size > fsize;
+
+ return true;
}
+#endif /* CONFIG_DYNAMIC_SIGFRAME */
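/*
 * Illustrative userspace sketch, not from this patch: when the check above
 * rejects a stack, sigaltstack() fails with -ENOMEM, which a program can
 * use to probe whether its buffer covers the worst-case frame.
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	static char buf[2048];	/* MINSIGSTKSZ-sized: may be too small with AVX-512 */
	stack_t ss = { .ss_sp = buf, .ss_size = sizeof(buf) };

	if (sigaltstack(&ss, NULL) == -1 && errno == ENOMEM)
		printf("rejected; AT_MINSIGSTKSZ says %lu bytes are needed\n",
		       getauxval(AT_MINSIGSTKSZ));
	return 0;
}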
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
new file mode 100644
index 000000000000..42bbc42bd350
--- /dev/null
+++ b/arch/x86/kernel/signal_32.c
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compat.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <asm/ucontext.h>
+#include <linux/uaccess.h>
+#include <asm/fpu/signal.h>
+#include <asm/ptrace.h>
+#include <asm/user32.h>
+#include <uapi/asm/sigcontext.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/sigframe.h>
+#include <asm/sighandling.h>
+#include <asm/smap.h>
+#include <asm/gsseg.h>
+
+/*
+ * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0
+ * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index
+ * GDT, selector values 0~3 all point to the NULL descriptor, thus values
+ * 0, 1, 2 and 3 are all valid NULL selector values.
+ *
+ * However IRET zeros ES, FS, GS, and DS segment registers if any of them
+ * is found to have any nonzero NULL selector value, which can be used by
+ * userspace in pre-FRED systems to spot any interrupt/exception by loading
+ * a nonzero NULL selector and waiting for it to become zero. Before FRED
+ * there was nothing software could do to prevent such an information leak.
+ *
+ * ERETU, the only legitimate instruction for returning to userspace from
+ * the kernel under FRED, by design does NOT zero any segment register,
+ * avoiding this problematic behavior.
+ *
+ * As such, leave NULL selector values 0~3 unchanged.
+ */
+static inline u16 fixup_rpl(u16 sel)
+{
+ return sel <= 3 ? sel : sel | 3;
+}
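/*
 * Illustrative values, not from this patch:
 *
 *	fixup_rpl(0x00) == 0x00		(NULL selector, left alone)
 *	fixup_rpl(0x02) == 0x02		(still one of the four NULL values)
 *	fixup_rpl(0x28) == 0x2b		(GDT index 5, RPL forced to 3)
 */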
+
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/unistd_32_ia32.h>
+
+static inline void reload_segments(struct sigcontext_32 *sc)
+{
+ u16 cur;
+
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
+ savesegment(gs, cur);
+ if (fixup_rpl(sc->gs) != cur)
+ load_gs_index(fixup_rpl(sc->gs));
+ savesegment(fs, cur);
+ if (fixup_rpl(sc->fs) != cur)
+ loadsegment(fs, fixup_rpl(sc->fs));
+
+ savesegment(ds, cur);
+ if (fixup_rpl(sc->ds) != cur)
+ loadsegment(ds, fixup_rpl(sc->ds));
+ savesegment(es, cur);
+ if (fixup_rpl(sc->es) != cur)
+ loadsegment(es, fixup_rpl(sc->es));
+}
+
+#define sigset32_t compat_sigset_t
+#define siginfo32_t compat_siginfo_t
+#define restore_altstack32 compat_restore_altstack
+#define unsafe_save_altstack32 unsafe_compat_save_altstack
+
+#else
+
+#define sigset32_t sigset_t
+#define siginfo32_t siginfo_t
+#define __NR_ia32_sigreturn __NR_sigreturn
+#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn
+#define restore_altstack32 restore_altstack
+#define unsafe_save_altstack32 unsafe_save_altstack
+#define __copy_siginfo_to_user32 copy_siginfo_to_user
+
+#endif
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+static bool ia32_restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext_32 __user *usc)
+{
+ struct sigcontext_32 sc;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ if (unlikely(copy_from_user(&sc, usc, sizeof(sc))))
+ return false;
+
+ /* Get only the ia32 registers. */
+ regs->bx = sc.bx;
+ regs->cx = sc.cx;
+ regs->dx = sc.dx;
+ regs->si = sc.si;
+ regs->di = sc.di;
+ regs->bp = sc.bp;
+ regs->ax = sc.ax;
+ regs->sp = sc.sp;
+ regs->ip = sc.ip;
+
+ /* Get CS/SS and force CPL3 */
+ regs->cs = sc.cs | 0x03;
+ regs->ss = sc.ss | 0x03;
+
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+#ifdef CONFIG_IA32_EMULATION
+ reload_segments(&sc);
+#else
+ loadsegment(gs, fixup_rpl(sc.gs));
+ regs->fs = fixup_rpl(sc.fs);
+ regs->es = fixup_rpl(sc.es);
+ regs->ds = fixup_rpl(sc.ds);
+#endif
+
+ return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
+}
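/*
 * Illustrative arithmetic, not from this patch: ORing 0x03 into CS/SS
 * above forces RPL 3 on whatever selector userspace stored in the context:
 *
 *	0x23 | 0x03 == 0x23	(__USER32_CS, unchanged)
 *	0x20 | 0x03 == 0x23	(ring-0 RPL smuggled in, demoted to ring 3)
 */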
+
+SYSCALL32_DEFINE0(sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
+ sigset_t set;
+
+ prevent_single_step_upon_eretu(regs);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask)
+ || __get_user(((__u32 *)&set)[1], &frame->extramask[0]))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!ia32_restore_sigcontext(regs, &frame->sc))
+ goto badframe;
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "32bit sigreturn");
+ return 0;
+}
+
+SYSCALL32_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_ia32 __user *frame;
+ sigset_t set;
+
+ prevent_single_step_upon_eretu(regs);
+
+ frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
+ goto badframe;
+
+ if (restore_altstack32(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "32bit rt sigreturn");
+ return 0;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; })
+
+static __always_inline int
+__unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc,
+ void __user *fpstate,
+ struct pt_regs *regs, unsigned int mask)
+{
+ unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault);
+#ifdef CONFIG_IA32_EMULATION
+ unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault);
+ unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault);
+ unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault);
+#else
+ unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
+ unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
+ unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
+#endif
+
+ unsafe_put_user(regs->di, &sc->di, Efault);
+ unsafe_put_user(regs->si, &sc->si, Efault);
+ unsafe_put_user(regs->bp, &sc->bp, Efault);
+ unsafe_put_user(regs->sp, &sc->sp, Efault);
+ unsafe_put_user(regs->bx, &sc->bx, Efault);
+ unsafe_put_user(regs->dx, &sc->dx, Efault);
+ unsafe_put_user(regs->cx, &sc->cx, Efault);
+ unsafe_put_user(regs->ax, &sc->ax, Efault);
+ unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+ unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+ unsafe_put_user(regs->ip, &sc->ip, Efault);
+ unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault);
+ unsafe_put_user(regs->flags, &sc->flags, Efault);
+ unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault);
+ unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault);
+
+ unsafe_put_user(ptr_to_compat(fpstate), &sc->fpstate, Efault);
+
+ /* non-iBCS2 extensions.. */
+ unsafe_put_user(mask, &sc->oldmask, Efault);
+ unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+ return 0;
+
+Efault:
+ return -EFAULT;
+}
+
+#define unsafe_put_sigcontext32(sc, fp, regs, set, label) \
+do { \
+ if (__unsafe_setup_sigcontext32(sc, fp, regs, set->sig[0])) \
+ goto label; \
+} while(0)
+
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
+ struct sigframe_ia32 __user *frame;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ /* copy_to_user optimizes that into a single 8 byte store */
+ static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+ } __attribute__((packed)) code = {
+ 0xb858, /* popl %eax ; movl $...,%eax */
+ __NR_ia32_sigreturn,
+ 0x80cd, /* int $0x80 */
+ };
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
+ } else {
+ /* Return stub is in 32bit vsyscall page */
+ if (current->mm->context.vdso)
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_sigreturn;
+ else
+ restorer = &frame->retcode;
+ }
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
+ unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault);
+ unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
+ unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
+ /*
+ * These are actually not used anymore, but left because some
+ * gdb versions depend on them as a marker.
+ */
+ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault);
+ user_access_end();
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* Make -mregparm=3 work */
+ regs->ax = ksig->sig;
+ regs->dx = 0;
+ regs->cx = 0;
+
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
+ struct rt_sigframe_ia32 __user *frame;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ /* unsafe_put_user optimizes that into a single 8 byte store */
+ static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+ } __attribute__((packed)) code = {
+ 0xb8,
+ __NR_ia32_rt_sigreturn,
+ 0x80cd,
+ 0,
+ };
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
+ unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault);
+ unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault);
+
+ /* Create the ucontext. */
+ if (static_cpu_has(X86_FEATURE_XSAVE))
+ unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault);
+ else
+ unsafe_put_user(0, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+ else
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_rt_sigreturn;
+ unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
+
+ /*
+ * Not actually used anymore, but left because some gdb
+ * versions need it.
+ */
+ unsafe_put_user(*((u64 *)&code), (u64 __user *)frame->retcode, Efault);
+ unsafe_put_sigcontext32(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_user(*(__u64 *)set, (__u64 __user *)&frame->uc.uc_sigmask, Efault);
+ user_access_end();
+
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info))
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* Make -mregparm=3 work */
+ regs->ax = ksig->sig;
+ regs->dx = (unsigned long) &frame->info;
+ regs->cx = (unsigned long) &frame->uc;
+
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+/*
+ * The siginfo_t structure and handling code is very easy
+ * to break in several ways. It must always be updated when
+ * changes are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+
+/*
+ * If adding a new si_code, there is probably new data in
+ * the siginfo. Make sure folks bumping the si_code
+ * limits also have to look at this code. Make sure any
+ * new fields are handled in copy_siginfo_to_user32()!
+ */
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 10);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo32_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo32_t) == 4);
+
+/*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * moves and that they always stay at the beginning:
+ */
+static_assert(offsetof(siginfo32_t, _sifields) == 3 * sizeof(int));
+
+static_assert(offsetof(siginfo32_t, si_signo) == 0);
+static_assert(offsetof(siginfo32_t, si_errno) == 4);
+static_assert(offsetof(siginfo32_t, si_code) == 8);
+
+/*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to be updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo32_t, _sifields) == \
+ offsetof(siginfo32_t, _sifields.name))
+
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo32_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0xC);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+
+CHECK_SI_OFFSET(_timer);
+#ifdef CONFIG_COMPAT
+/* compat_siginfo_t doesn't have si_sys_private */
+CHECK_SI_SIZE (_timer, 3*sizeof(int));
+#else
+CHECK_SI_SIZE (_timer, 4*sizeof(int));
+#endif
+static_assert(offsetof(siginfo32_t, si_tid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_overrun) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 5*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_status) == 0x14);
+static_assert(offsetof(siginfo32_t, si_utime) == 0x18);
+static_assert(offsetof(siginfo32_t, si_stime) == 0x1C);
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 4*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_addr) == 0x0C);
+
+static_assert(offsetof(siginfo32_t, si_trapno) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_addr_lsb) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_lower) == 0x14);
+static_assert(offsetof(siginfo32_t, si_upper) == 0x18);
+
+static_assert(offsetof(siginfo32_t, si_pkey) == 0x14);
+
+static_assert(offsetof(siginfo32_t, si_perf_data) == 0x10);
+static_assert(offsetof(siginfo32_t, si_perf_type) == 0x14);
+static_assert(offsetof(siginfo32_t, si_perf_flags) == 0x18);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_band) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_fd) == 0x10);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_call_addr) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_syscall) == 0x10);
+static_assert(offsetof(siginfo32_t, si_arch) == 0x14);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..d483b585c6c6
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include <asm/ucontext.h>
+#include <asm/fpu/signal.h>
+#include <asm/sighandling.h>
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+
+/*
+ * If regs->ss will cause an IRET fault, change it. Otherwise leave it
+ * alone. Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+ u32 ar;
+ asm volatile ("lar %[old_ss], %[ar]\n\t"
+ "jz 1f\n\t" /* If invalid: */
+ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
+ "1:"
+ : [ar] "=r" (ar)
+ : [old_ss] "rm" ((u16)regs->ss));
+
+ /*
+ * For a valid 64-bit user context, we need DPL 3, type
+ * read-write data or read-write exp-down data, and S and P
+ * set. We can't use VERW because VERW doesn't check the
+ * P bit.
+ */
+ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+ if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+ ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+ regs->ss = __USER_DS;
+}
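/*
 * Illustrative arithmetic, not from this patch, assuming the usual LAR
 * access-rights encoding (P bit 15, DPL bits 13-14, S bit 12, type bits
 * 9-11): a flat read-write user data segment masks down to
 *
 *	AR_P | AR_DPL3 | AR_S | AR_TYPE_RWDATA
 *	  == 0x8000 | 0x6000 | 0x1000 | 0x200 == 0xf200
 *
 * and the expand-down variant to 0xf600; anything else gets __USER_DS.
 */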
+
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
+{
+ struct sigcontext sc;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1)))
+ return false;
+
+ regs->bx = sc.bx;
+ regs->cx = sc.cx;
+ regs->dx = sc.dx;
+ regs->si = sc.si;
+ regs->di = sc.di;
+ regs->bp = sc.bp;
+ regs->ax = sc.ax;
+ regs->sp = sc.sp;
+ regs->ip = sc.ip;
+ regs->r8 = sc.r8;
+ regs->r9 = sc.r9;
+ regs->r10 = sc.r10;
+ regs->r11 = sc.r11;
+ regs->r12 = sc.r12;
+ regs->r13 = sc.r13;
+ regs->r14 = sc.r14;
+ regs->r15 = sc.r15;
+
+ /* Get CS/SS and force CPL3 */
+ regs->cs = sc.cs | 0x03;
+ regs->ss = sc.ss | 0x03;
+
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
+ force_valid_ss(regs);
+
+ return fpu__restore_sig((void __user *)sc.fpstate, 0);
+}
+
+static __always_inline int
+__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{
+ unsafe_put_user(regs->di, &sc->di, Efault);
+ unsafe_put_user(regs->si, &sc->si, Efault);
+ unsafe_put_user(regs->bp, &sc->bp, Efault);
+ unsafe_put_user(regs->sp, &sc->sp, Efault);
+ unsafe_put_user(regs->bx, &sc->bx, Efault);
+ unsafe_put_user(regs->dx, &sc->dx, Efault);
+ unsafe_put_user(regs->cx, &sc->cx, Efault);
+ unsafe_put_user(regs->ax, &sc->ax, Efault);
+ unsafe_put_user(regs->r8, &sc->r8, Efault);
+ unsafe_put_user(regs->r9, &sc->r9, Efault);
+ unsafe_put_user(regs->r10, &sc->r10, Efault);
+ unsafe_put_user(regs->r11, &sc->r11, Efault);
+ unsafe_put_user(regs->r12, &sc->r12, Efault);
+ unsafe_put_user(regs->r13, &sc->r13, Efault);
+ unsafe_put_user(regs->r14, &sc->r14, Efault);
+ unsafe_put_user(regs->r15, &sc->r15, Efault);
+
+ unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+ unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+ unsafe_put_user(regs->ip, &sc->ip, Efault);
+ unsafe_put_user(regs->flags, &sc->flags, Efault);
+ unsafe_put_user(regs->cs, &sc->cs, Efault);
+ unsafe_put_user(0, &sc->gs, Efault);
+ unsafe_put_user(0, &sc->fs, Efault);
+ unsafe_put_user(regs->ss, &sc->ss, Efault);
+
+ unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
+
+ /* non-iBCS2 extensions.. */
+ unsafe_put_user(mask, &sc->oldmask, Efault);
+ unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+ return 0;
+Efault:
+ return -EFAULT;
+}
+
+#define unsafe_put_sigcontext(sc, fp, regs, set, label) \
+do { \
+ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \
+ goto label; \
+} while(0)
+
+#define unsafe_put_sigmask(set, frame, label) \
+ unsafe_put_user(*(__u64 *)(set), \
+ (__u64 __user *)&(frame)->uc.uc_sigmask, \
+ label)
+
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+ else
+ flags = UC_SIGCONTEXT_SS;
+
+ if (likely(user_64bit_mode(regs)))
+ flags |= UC_STRICT_RESTORE_SS;
+
+ return flags;
+}
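/*
 * Illustrative values, not from this patch, assuming the uapi encodings
 * UC_FP_XSTATE == 0x1, UC_SIGCONTEXT_SS == 0x2 and UC_STRICT_RESTORE_SS ==
 * 0x4: an XSAVE-capable CPU delivering to 64-bit code gets uc_flags ==
 * 0x1 | 0x2 | 0x4 == 0x7.
 */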
+
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset_t *set = sigmask_to_save();
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ unsigned long uc_flags;
+
+ /* x86-64 should always use SA_RESTORER. */
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
+ uc_flags = frame_uc_flags(regs);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->di = ksig->sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value. We also have a compatibility issue here: DOSEMU
+ * relies on the contents of the SS register indicating the
+ * SS value at the time of the signal, even though that code in
+ * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
+ * avoids relying on sigreturn to restore SS; instead it uses
+ * a trampoline.) So we do our best: if the old SS was valid,
+ * we keep it. Otherwise we replace it.
+ */
+ regs->cs = __USER_CS;
+
+ if (unlikely(regs->ss != __USER_DS))
+ force_valid_ss(regs);
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+SYSCALL_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ prevent_single_step_upon_eretu(regs);
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+#ifdef CONFIG_X86_X32_ABI
+static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ struct compat_siginfo new;
+
+ copy_siginfo_to_external32(&new, from);
+ if (from->si_signo == SIGCHLD) {
+ new._sifields._sigchld_x32._utime = from->si_utime;
+ new._sifields._sigchld_x32._stime = from->si_stime;
+ }
+ if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+ return 0;
+}
+
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ if (in_x32_syscall())
+ return x32_copy_siginfo_to_user(to, from);
+ return __copy_siginfo_to_user32(to, from);
+}
+
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save();
+ struct rt_sigframe_x32 __user *frame;
+ unsigned long uc_flags;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ uc_flags = frame_uc_flags(regs);
+
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+ unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
+ restorer = ksig->ka.sa.sa_restorer;
+ unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ prevent_single_step_upon_eretu(regs);
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif /* CONFIG_X86_X32_ABI */
+
+#ifdef CONFIG_COMPAT
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ if (!act)
+ return;
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+#endif /* CONFIG_COMPAT */
+
+/*
+ * If adding a new si_code, there is probably new data in
+ * the siginfo. Make sure folks bumping the si_code
+ * limits also have to look at this code. Make sure any
+ * new fields are handled in copy_siginfo_to_user32()!
+ */
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 10);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo_t) == 8);
+
+/*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * moves and that they always stay at the beginning:
+ */
+static_assert(offsetof(siginfo_t, si_signo) == 0);
+static_assert(offsetof(siginfo_t, si_errno) == 4);
+static_assert(offsetof(siginfo_t, si_code) == 8);
+
+/*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to be updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo_t, _sifields) == \
+ offsetof(siginfo_t, _sifields.name))
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+
+CHECK_SI_OFFSET(_timer);
+CHECK_SI_SIZE (_timer, 6*sizeof(int));
+static_assert(offsetof(siginfo_t, si_tid) == 0x10);
+static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_status) == 0x18);
+static_assert(offsetof(siginfo_t, si_utime) == 0x20);
+static_assert(offsetof(siginfo_t, si_stime) == 0x28);
+
+#ifdef CONFIG_X86_X32_ABI
+/* no _sigchld_x32 in the generic siginfo_t */
+static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
+ 7*sizeof(int));
+static_assert(offsetof(compat_siginfo_t, _sifields) ==
+ offsetof(compat_siginfo_t, _sifields._sigchld_x32));
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18);
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20);
+#endif
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_addr) == 0x10);
+
+static_assert(offsetof(siginfo_t, si_trapno) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_lower) == 0x20);
+static_assert(offsetof(siginfo_t, si_upper) == 0x28);
+
+static_assert(offsetof(siginfo_t, si_pkey) == 0x20);
+
+static_assert(offsetof(siginfo_t, si_perf_data) == 0x18);
+static_assert(offsetof(siginfo_t, si_perf_type) == 0x20);
+static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_band) == 0x10);
+static_assert(offsetof(siginfo_t, si_fd) == 0x18);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_call_addr) == 0x10);
+static_assert(offsetof(siginfo_t, si_syscall) == 0x18);
+static_assert(offsetof(siginfo_t, si_arch) == 0x1C);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d801210945d6..b014e6d229f9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Intel SMP support routines.
*
@@ -6,9 +7,6 @@
* (c) 2002,2003 Andi Kleen, SuSE Labs.
*
* i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
*/
#include <linux/init.h>
@@ -16,18 +14,28 @@
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
+#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/idtentry.h>
+#include <asm/nmi.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+#include <asm/kexec.h>
+#include <asm/reboot.h>
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -61,7 +69,7 @@
* 5AP. symmetric IO mode (normal Linux operation) not affected.
* 'noapic' mode has vector 0xf filled out properly.
* 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
* 8AP. We do not enable low power mode (deep sleep) during MP bootup
* 9AP. We do not use mixed mode
*
@@ -107,131 +115,183 @@
* about nothing of note with C stepping upwards.
*/
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static void native_smp_send_reschedule(int cpu)
-{
- if (unlikely(cpu_is_offline(cpu))) {
- WARN_ON(1);
- return;
- }
- apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
-}
-
-void native_send_call_func_single_ipi(int cpu)
-{
- apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
-}
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
-void native_send_call_func_ipi(const struct cpumask *mask)
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
- cpumask_var_t allbutself;
-
- if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
- return;
- }
-
- cpumask_copy(allbutself, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), allbutself);
+ /* We are registered on stopping cpu too, avoid spurious NMI */
+ if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+ return NMI_HANDLED;
- if (cpumask_equal(mask, allbutself) &&
- cpumask_equal(cpu_online_mask, cpu_callout_mask))
- apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+ cpu_emergency_disable_virtualization();
+ stop_this_cpu(NULL);
- free_cpumask_var(allbutself);
+ return NMI_HANDLED;
}
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
-
-asmlinkage void smp_reboot_interrupt(void)
+DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
- ack_APIC_irq();
- irq_enter();
+ apic_eoi();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
- irq_exit();
}
-static void native_smp_send_stop(void)
+static int register_stop_handler(void)
+{
+ return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop");
+}
+
+static void native_stop_other_cpus(int wait)
{
- unsigned long flags;
- unsigned long wait;
+ unsigned int old_cpu, this_cpu;
+ unsigned long flags, timeout;
if (reboot_force)
return;
+ /* Only proceed if this is the first CPU to reach this code */
+ old_cpu = -1;
+ this_cpu = smp_processor_id();
+ if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu))
+ return;
+
+ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
+ if (kexec_in_progress)
+ smp_kick_mwait_play_dead();
+
/*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- * On most systems we could also use an NMI here,
- * but there are a few systems around where NMI
- * is problematic so stay with an non NMI for now
- * (this implies we cannot stop CPUs spinning with irq off
- * currently)
+ * 1) Send an IPI on the reboot vector to all other CPUs.
+ *
+ * The other CPUs should react on it after leaving critical
+ * sections and re-enabling interrupts. They might still hold
+ * locks, but there is nothing which can be done about that.
+ *
+ * 2) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * 3) If #2 timed out send an NMI to the CPUs which did not
+ * yet report
+ *
+ * 4) Wait for all other CPUs to report that they reached the
+ * HLT loop in stop_this_cpu()
+ *
+ * #3 can obviously race against a CPU reaching the HLT loop late.
+ * That CPU will have reported already and the "have all CPUs
+ * reached HLT" condition will be true despite the fact that the
+ * other CPU is still handling the NMI. Again, there is no
+ * protection against that as "disabled" APICs still respond to
+ * NMIs.
*/
- if (num_online_cpus() > 1) {
- apic->send_IPI_allbutself(REBOOT_VECTOR);
+ cpumask_copy(&cpus_stop_mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, &cpus_stop_mask);
- /* Don't wait longer than a second */
- wait = USEC_PER_SEC;
- while (num_online_cpus() > 1 && wait--)
+ if (!cpumask_empty(&cpus_stop_mask)) {
+ apic_send_IPI_allbutself(REBOOT_VECTOR);
+
+ /*
+ * Don't wait longer than a second for IPI completion. The
+ * wait request is not checked here because that would
+ * prevent an NMI shutdown attempt in case that not all
+ * CPUs reach shutdown state.
+ */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(&cpus_stop_mask) && timeout--)
+ udelay(1);
+ }
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if (!cpumask_empty(&cpus_stop_mask)) {
+ /*
+ * If NMI IPI is enabled, try to register the stop handler
+ * and send the IPI. In any case try to wait for the other
+ * CPUs to stop.
+ */
+ if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ unsigned int cpu;
+
+ pr_emerg("Shutting down cpus with NMI\n");
+
+ for_each_cpu(cpu, &cpus_stop_mask)
+ __apic_send_IPI(cpu, NMI_VECTOR);
+ }
+ /*
+ * Don't wait longer than 10 ms if the caller didn't
+ * request it. If wait is true, the machine hangs here if
+ * one or more CPUs do not reach shutdown state.
+ */
+ timeout = USEC_PER_MSEC * 10;
+ while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
udelay(1);
}
local_irq_save(flags);
disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
local_irq_restore(flags);
+
+ /*
+ * Ensure that the cpus_stop_mask cache lines are invalidated on
+ * the other CPUs. See comment vs. SME in stop_this_cpu().
+ */
+ cpumask_clear(&cpus_stop_mask);
}
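
What the rewritten stop path boils down to: the first CPU to arrive claims ownership with a cmpxchg, every other CPU is tracked in a pending mask, and the owner polls that mask with a bounded timeout before escalating. A minimal userspace sketch of that protocol, assuming C11 atomics and pthreads stand in for CPUs and IPIs (all names below are illustrative, not kernel APIs):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>
    #include <unistd.h>

    #define NCPUS 4

    static atomic_int stopping_cpu = -1;   /* first claimant wins */
    static atomic_uint pending_mask;       /* one bit per CPU still running */

    static void *other_cpu(void *arg)
    {
        int cpu = (int)(long)arg;

        /* Simulate "react to the stop IPI": clear our pending bit */
        usleep(1000 * (cpu + 1));
        atomic_fetch_and(&pending_mask, ~(1u << cpu));
        return NULL;
    }

    static void stop_other_cpus(int this_cpu)
    {
        int old = -1;
        unsigned long timeout = 100000;

        /* Only the first CPU to get here runs the shutdown sequence */
        if (!atomic_compare_exchange_strong(&stopping_cpu, &old, this_cpu))
            return;

        /* Stage 1: bounded wait for the "IPI" to be acknowledged */
        while (atomic_load(&pending_mask) && timeout--)
            usleep(1);

        /* Stage 2: escalate (the kernel sends NMIs at this point) */
        if (atomic_load(&pending_mask))
            fprintf(stderr, "escalating, mask=%x\n", atomic_load(&pending_mask));
    }

    int main(void)
    {
        pthread_t t[NCPUS];

        atomic_store(&pending_mask, ((1u << NCPUS) - 1) & ~1u); /* all but CPU0 */
        for (int i = 1; i < NCPUS; i++)
            pthread_create(&t[i], NULL, other_cpu, (void *)(long)i);

        stop_other_cpus(0);
        for (int i = 1; i < NCPUS; i++)
            pthread_join(t[i], NULL);
        return 0;
    }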
/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back. KVM uses this interrupt to force a cpu out of
+ * guest mode.
*/
-void smp_reschedule_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
- ack_APIC_irq();
+ apic_eoi();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
inc_irq_stat(irq_resched_count);
- /*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
+ scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
}
-void smp_call_function_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
- ack_APIC_irq();
- irq_enter();
- generic_smp_call_function_interrupt();
+ apic_eoi();
+ trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
- irq_exit();
+ generic_smp_call_function_interrupt();
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
}
-void smp_call_function_single_interrupt(struct pt_regs *regs)
+DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
- ack_APIC_irq();
- irq_enter();
- generic_smp_call_function_single_interrupt();
+ apic_eoi();
+ trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
inc_irq_stat(irq_call_count);
- irq_exit();
+ generic_smp_call_function_single_interrupt();
+ trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
}
+static int __init nonmi_ipi_setup(char *str)
+{
+ smp_no_nmi_ipi = true;
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
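
For reference, nonmi_ipi is consumed straight from the kernel command line; disabling the NMI fallback would look like this in a GRUB config (shown only as an example of where the flag goes):

    GRUB_CMDLINE_LINUX_DEFAULT="quiet nonmi_ipi"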
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
+ .stop_other_cpus = native_stop_other_cpus,
+#if defined(CONFIG_CRASH_DUMP)
+ .crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
+#endif
.smp_send_reschedule = native_smp_send_reschedule,
- .cpu_up = native_cpu_up,
- .cpu_die = native_cpu_die,
+ .kick_ap_alive = native_kick_ap,
.cpu_disable = native_cpu_disable,
.play_dead = native_play_dead,
@@ -239,3 +299,27 @@ struct smp_ops smp_ops = {
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
+
+int arch_cpu_rescan_dead_smt_siblings(void)
+{
+ enum cpuhp_smt_control old = cpu_smt_control;
+ int ret;
+
+ /*
+ * If SMT has been disabled and SMT siblings are in HLT, bring them back
+ * online and offline them again so that they end up in MWAIT proper.
+ *
+ * Called with hotplug enabled.
+ */
+ if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED)
+ return 0;
+
+ ret = cpuhp_smt_enable();
+ if (ret)
+ return ret;
+
+ ret = cpuhp_smt_disable(old);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..5cd6950ab672 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,5 @@
-/*
+// SPDX-License-Identifier: GPL-2.0-or-later
+ /*
* x86 SMP booting functions
*
* (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -12,9 +13,6 @@
* Pentium Pro and Pentium-II/Xeon MP machines.
* Original development of Linux SMP code supported by Caldera.
*
- * This code is released under the GNU General Public License version 2 or
- * later.
- *
* Fixes
* Felix Koop : NR_CPUS used properly
* Jose Renau : Handle single CPU case.
@@ -39,392 +37,617 @@
* Glauber Costa : i386 and x86_64 integration
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <linux/smp.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
#include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
-#include <linux/stackprotector.h>
#include <linux/gfp.h>
+#include <linux/cpuidle.h>
+#include <linux/kexec.h>
+#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
+#include <linux/stackprotector.h>
+#include <linux/cpuhotplug.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
#include <asm/acpi.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpuid/api.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
-#include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/fpu/api.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
-#include <linux/mc146818rtc.h>
-
-#include <asm/smpboot_hooks.h>
+#include <asm/microcode.h>
#include <asm/i8259.h>
-
-#ifdef CONFIG_X86_32
-u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
-#endif
-
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
-#endif
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
+#include <asm/misc.h>
+#include <asm/qspinlock.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
+#include <asm/sev.h>
/* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
/* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
-/* Per CPU bogomips and other parameters */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
-EXPORT_PER_CPU_SYMBOL(cpu_info);
-
-atomic_t init_deasserted;
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
-#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
-/* which node each logical CPU is on */
-int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_to_node_map);
+/* Representing CPUs for which sibling maps can be computed */
+static cpumask_var_t cpu_sibling_setup_mask;
-/* set up a mapping between cpu and node. */
-static void map_cpu_to_node(int cpu, int node)
-{
- printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
- cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
- cpu_to_node_map[cpu] = node;
-}
+struct mwait_cpu_dead {
+ unsigned int control;
+ unsigned int status;
+};
-/* undo a mapping between cpu and node. */
-static void unmap_cpu_to_node(int cpu)
-{
- int node;
+#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
+#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
- printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
- for (node = 0; node < MAX_NUMNODES; node++)
- cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
- cpu_to_node_map[cpu] = 0;
-}
-#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
-#define map_cpu_to_node(cpu, node) ({})
-#define unmap_cpu_to_node(cpu) ({})
-#endif
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
-#ifdef CONFIG_X86_32
-static int boot_cpu_logical_apicid;
+/* Maximum number of SMT threads on any online core */
+int __read_mostly __max_smt_threads = 1;
-u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
- { [0 ... NR_CPUS-1] = BAD_APICID };
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
-static void map_cpu_to_logical_apicid(void)
+int arch_update_cpu_topology(void)
{
- int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
- int node = apic->apicid_to_node(apicid);
-
- if (!node_online(node))
- node = first_online_node;
+ int retval = x86_topology_update;
- cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, node);
+ x86_topology_update = false;
+ return retval;
}
-void numa_remove_cpu(int cpu)
+static unsigned int smpboot_warm_reset_vector_count;
+
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
- cpu_2_logical_apicid[cpu] = BAD_APICID;
- unmap_cpu_to_node(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ if (!smpboot_warm_reset_vector_count++) {
+ CMOS_WRITE(0xa, 0xf);
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
+ }
+ spin_unlock_irqrestore(&rtc_lock, flags);
}
-#else
-#define map_cpu_to_logical_apicid() do {} while (0)
-#endif
-/*
- * Report back to the Boot Processor.
- * Running on AP.
- */
-static void __cpuinit smp_callin(void)
+static inline void smpboot_restore_warm_reset_vector(void)
{
- int cpuid, phys_id;
- unsigned long timeout;
-
- /*
- * If waken up by an INIT in an 82489DX configuration
- * we may get here before an INIT-deassert IPI reaches
- * our local APIC. We have to wait for the IPI or we'll
- * lock up on an APIC access.
- */
- if (apic->wait_for_init_deassert)
- apic->wait_for_init_deassert(&init_deasserted);
+ unsigned long flags;
/*
- * (This works even if the APIC is not enabled.)
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
*/
- phys_id = read_apic_id();
- cpuid = smp_processor_id();
- if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
- panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
- phys_id, cpuid);
+ spin_lock_irqsave(&rtc_lock, flags);
+ if (!--smpboot_warm_reset_vector_count) {
+ CMOS_WRITE(0, 0xf);
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
- pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+ spin_unlock_irqrestore(&rtc_lock, flags);
- /*
- * STARTUP IPIs are fragile beasts as they might sometimes
- * trigger some glue motherboard logic. Complete APIC bus
- * silence for 1 second, this overestimates the time the
- * boot CPU is spending to send the up to 2 STARTUP IPIs
- * by a factor of two. This should be enough.
- */
+}
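
The counter turns the warm reset vector into a shared resource with "first user arms it, last user restores it" semantics. A minimal sketch of the same refcount pattern under an ordinary mutex (arm()/disarm() are placeholders, not kernel functions):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int users;

    static void arm(void)    { printf("write warm reset vector\n"); }
    static void disarm(void) { printf("restore warm reset vector\n"); }

    void resource_get(void)
    {
        pthread_mutex_lock(&lock);
        if (!users++)       /* first user performs the setup */
            arm();
        pthread_mutex_unlock(&lock);
    }

    void resource_put(void)
    {
        pthread_mutex_lock(&lock);
        if (!--users)       /* last user restores the defaults */
            disarm();
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        resource_get();     /* arms */
        resource_get();     /* no-op, count is now 2 */
        resource_put();     /* no-op, count back to 1 */
        resource_put();     /* disarms */
        return 0;
    }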
- /*
- * Waiting 2s total for startup (udelay is not yet working)
- */
- timeout = jiffies + 2*HZ;
- while (time_before(jiffies, timeout)) {
- /*
- * Has the boot CPU finished it's STARTUP sequence?
- */
- if (cpumask_test_cpu(cpuid, cpu_callout_mask))
- break;
- cpu_relax();
- }
+/* Run the next set of setup steps for the upcoming CPU */
+static void ap_starting(void)
+{
+ int cpuid = smp_processor_id();
- if (!time_before(jiffies, timeout)) {
- panic("%s: CPU%d started up but did not get a callout!\n",
- __func__, cpuid);
- }
+ /* Mop up eventual mwait_play_dead() wreckage */
+ this_cpu_write(mwait_cpu_dead.status, 0);
+ this_cpu_write(mwait_cpu_dead.control, 0);
/*
- * the boot CPU has finished the init stage and is spinning
- * on callin_map until we finish. We are free to set up this
- * CPU, first the APIC. (this is probably redundant on most
- * boards)
+ * If woken up by an INIT in an 82489DX configuration the alive
+ * synchronization guarantees that the CPU does not reach this
+ * point before an INIT_deassert IPI reaches the local APIC, so it
+ * is now safe to touch the local APIC.
+ *
+ * Set up this CPU, first the APIC, which is probably redundant on
+ * most boards.
*/
+ apic_ap_setup();
- pr_debug("CALLIN, before setup_local_APIC().\n");
- if (apic->smp_callin_clear_local_apic)
- apic->smp_callin_clear_local_apic();
- setup_local_APIC();
- end_local_APIC_setup();
- map_cpu_to_logical_apicid();
+ /* Save the processor parameters. */
+ identify_secondary_cpu(cpuid);
/*
- * Need to setup vector mappings before we enable interrupts.
+ * The topology information must be up to date before
+ * notify_cpu_starting().
*/
- setup_vector_irq(smp_processor_id());
- /*
- * Get our bogomips.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
- */
- local_irq_enable();
- calibrate_delay();
- local_irq_disable();
+ set_cpu_sibling_map(cpuid);
+
+ ap_init_aperfmperf();
+
pr_debug("Stack at about %p\n", &cpuid);
+ wmb();
+
/*
- * Save our processor parameters
+ * This runs the AP through all the cpuhp states to its target
+ * state CPUHP_ONLINE.
*/
- smp_store_cpu_info(cpuid);
-
notify_cpu_starting(cpuid);
+}
+static void ap_calibrate_delay(void)
+{
/*
- * Allow the master to continue.
+ * Calibrate the delay loop and update loops_per_jiffy in cpu_data.
+ * identify_secondary_cpu() stored a value that is close but not as
+ * accurate as the value just calculated.
+ *
+ * As this is invoked after the TSC synchronization check,
+ * calibrate_delay_is_known() will skip the calibration routine
+ * when TSC is synchronized across sockets.
*/
- cpumask_set_cpu(cpuid, cpu_callin_mask);
+ calibrate_delay();
+ cpu_data(smp_processor_id()).loops_per_jiffy = loops_per_jiffy;
}
/*
* Activate a secondary processor.
*/
-notrace static void __cpuinit start_secondary(void *unused)
+static void notrace __noendbr start_secondary(void *unused)
{
/*
- * Don't put *anything* before cpu_init(), SMP booting is too
- * fragile that we want to limit the things done here to the
- * most necessary things.
+ * Don't put *anything* except direct CPU state initialization
+ * before cpu_init(); SMP booting is fragile enough that we want
+ * to limit the things done here to the most necessary things.
*/
- vmi_bringup();
- cpu_init();
- preempt_disable();
- smp_callin();
+ cr4_init();
- /* otherwise gcc will move up smp_processor_id before the cpu_init */
- barrier();
/*
- * Check TSC synchronization with the BP:
+ * 32-bit specific. 64-bit reaches this code with the correct page
+ * table established. Yet another historical divergence.
*/
- check_tsc_sync_target();
-
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->chip->mask(0);
- enable_NMI_through_LVT0();
- legacy_pic->chip->unmask(0);
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
}
-#ifdef CONFIG_X86_32
- while (low_mappings)
- cpu_relax();
- __flush_tlb_all();
-#endif
-
- /* This must be done before setting cpu_online_mask */
- set_cpu_sibling_map(raw_smp_processor_id());
- wmb();
+ cpu_init_exception_handling(false);
/*
- * We need to hold call_lock, so there is no inconsistency
- * between the time smp_call_function() determines number of
- * IPI recipients, and the time when the determination is made
- * for which cpus receive the IPI. Holding this
- * lock helps us to not include this cpu in a currently in progress
- * smp_call_function().
+ * Load the microcode before reaching the AP alive synchronization
+ * point below so it is not part of the full per CPU serialized
+ * bringup part when "parallel" bringup is enabled.
+ *
+ * That's even safe when hyperthreading is enabled in the CPU as
+ * the core code starts the primary threads first and leaves the
+ * secondary threads waiting for SIPI. Loading microcode on
+ * physical cores concurrently is a safe operation.
*
- * We need to hold vector_lock so there the set of online cpus
- * does not change while we are assigning vectors to cpus. Holding
- * this lock ensures we don't half assign or remove an irq from a cpu.
+ * This covers both the Intel specific issue that concurrent
+ * microcode loading on SMT siblings must be prohibited and the
+ * vendor independent issue that microcode loading which changes
+ * CPUID, MSRs etc. must be strictly serialized to maintain
+ * software state correctness.
+ */
+ load_ucode_ap();
+
+ /*
+ * Synchronization point with the hotplug core. Sets this CPUs
+ * synchronization state to ALIVE and spin-waits for the control CPU to
+ * release this CPU for further bringup.
+ */
+ cpuhp_ap_sync_alive();
+
+ cpu_init();
+ fpu__init_cpu();
+ rcutree_report_cpu_starting(raw_smp_processor_id());
+ x86_cpuinit.early_percpu_clock_init();
+
+ ap_starting();
+
+ /* Check TSC synchronization with the control CPU. */
+ check_tsc_sync_target();
+
+ /*
+ * Calibrate the delay loop after the TSC synchronization check.
+ * This allows to skip the calibration when TSC is synchronized
+ * across sockets.
+ */
+ ap_calibrate_delay();
+
+ speculative_store_bypass_ht_init();
+
+ /*
+ * Lock vector_lock, set CPU online and bring the vector
+ * allocator online. Online must be set with vector_lock held
+ * to prevent a concurrent irq setup/teardown from seeing a
+ * half valid vector space.
*/
- ipi_call_lock();
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
+ lapic_online();
unlock_vector_lock();
- ipi_call_unlock();
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
x86_platform.nmi_init();
/* enable local interrupts */
local_irq_enable();
- /* to prevent fake stack check failure in clock setup */
- boot_init_stack_canary();
-
x86_cpuinit.setup_percpu_clockev();
wmb();
- cpu_idle();
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+ANNOTATE_NOENDBR_SYM(start_secondary);
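
cpuhp_ap_sync_alive() is a rendezvous: the AP publishes that it is alive and then spin-waits until the control CPU releases it for the rest of the bringup. A two-thread sketch of that handshake using C11 atomics (only the shape of the protocol, not the kernel implementation):

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    enum { STATE_DEAD, STATE_ALIVE, STATE_RELEASED };

    static atomic_int sync_state = STATE_DEAD;

    static void *ap_thread(void *unused)
    {
        (void)unused;
        /* AP side: publish ALIVE, then spin until released */
        atomic_store_explicit(&sync_state, STATE_ALIVE, memory_order_release);
        while (atomic_load_explicit(&sync_state, memory_order_acquire) != STATE_RELEASED)
            ;   /* the kernel relaxes the CPU in this loop */
        printf("AP: released, continuing bringup\n");
        return NULL;
    }

    int main(void)
    {
        pthread_t ap;

        pthread_create(&ap, NULL, ap_thread, NULL);

        /* Control side: wait for ALIVE, then release the AP */
        while (atomic_load_explicit(&sync_state, memory_order_acquire) != STATE_ALIVE)
            ;
        atomic_store_explicit(&sync_state, STATE_RELEASED, memory_order_release);

        pthread_join(ap, NULL);
        return 0;
    }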
+
+static bool
+topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
+}
+
+static bool
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(!topology_same_node(c, o),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+}
+
+#define link_mask(mfunc, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), mfunc(c2)); \
+ cpumask_set_cpu((c2), mfunc(c1)); \
+} while (0)
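
link_mask() sets both directions in one go so the sibling relation stays symmetric regardless of which CPU comes up first. The same idea on plain per-CPU bitmasks (illustrative; the kernel uses cpumask_t, not raw integers):

    #include <stdio.h>

    #define MAX_CPUS 8

    static unsigned int sibling_mask[MAX_CPUS];

    /* Symmetric link: cpu1 becomes cpu2's sibling and vice versa */
    static void link_siblings(int cpu1, int cpu2)
    {
        sibling_mask[cpu1] |= 1u << cpu2;
        sibling_mask[cpu2] |= 1u << cpu1;
    }

    int main(void)
    {
        link_siblings(0, 4);    /* e.g. an SMT pair on core 0 */
        link_siblings(1, 5);

        for (int cpu = 0; cpu < MAX_CPUS; cpu++)
            if (sibling_mask[cpu])
                printf("cpu%d siblings: %#x\n", cpu, sibling_mask[cpu]);
        return 0;
    }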
+
+static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ c->topo.amd_node_id == o->topo.amd_node_id &&
+ per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
+ if (c->topo.core_id == o->topo.core_id)
+ return topology_sane(c, o, "smt");
+
+ if ((c->topo.cu_id != 0xff) &&
+ (o->topo.cu_id != 0xff) &&
+ (c->topo.cu_id == o->topo.cu_id))
+ return topology_sane(c, o, "smt");
+ }
+
+ } else if (c->topo.pkg_id == o->topo.pkg_id &&
+ c->topo.die_id == o->topo.die_id &&
+ c->topo.core_id == o->topo.core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
-/* In this case, llc_shared_map is a pointer to a cpumask. */
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
- const struct cpuinfo_x86 *src)
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- struct cpumask *llc = dst->llc_shared_map;
- *dst = *src;
- dst->llc_shared_map = llc;
+ if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id)
+ return false;
+
+ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1)
+ return c->topo.amd_node_id == o->topo.amd_node_id;
+
+ return true;
}
-#else
-static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
- const struct cpuinfo_x86 *src)
+
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- *dst = *src;
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ /* If the arch didn't set up l2c_id, fall back to SMT */
+ if (per_cpu_l2c_id(cpu1) == BAD_APICID)
+ return match_smt(c, o);
+
+ /* Do not match if L2 cache id does not match: */
+ if (per_cpu_l2c_id(cpu1) != per_cpu_l2c_id(cpu2))
+ return false;
+
+ return topology_sane(c, o, "l2c");
}
-#endif /* CONFIG_CPUMASK_OFFSTACK */
/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->topo.pkg_id == o->topo.pkg_id)
+ return true;
+ return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
+ *
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
+ *
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
*/
-void __cpuinit smp_store_cpu_info(int id)
+static const struct x86_cpu_id intel_cod_cpu[] = {
+ X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
+ {}
+};
+
+static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- struct cpuinfo_x86 *c = &cpu_data(id);
+ const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+ bool intel_snc = id && id->driver_data;
+
+ /* Do not match if we do not have a valid APICID for cpu: */
+ if (per_cpu_llc_id(cpu1) == BAD_APICID)
+ return false;
+
+ /* Do not match if LLC id does not match: */
+ if (per_cpu_llc_id(cpu1) != per_cpu_llc_id(cpu2))
+ return false;
- copy_cpuinfo_x86(c, &boot_cpu_data);
- c->cpu_index = id;
- if (id != 0)
- identify_secondary_cpu(c);
+ /*
+ * Allow the SNC topology without warning. Return of false
+ * means 'c' does not share the LLC of 'o'. This will be
+ * reflected to userspace.
+ */
+ if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
+ return false;
+
+ return topology_sane(c, o, "llc");
}
-void __cpuinit set_cpu_sibling_map(int cpu)
+static inline int x86_sched_itmt_flags(void)
{
- int i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
- cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+ return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
- if (smp_num_siblings > 1) {
- for_each_cpu(i, cpu_sibling_setup_mask) {
- struct cpuinfo_x86 *o = &cpu_data(i);
-
- if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- cpumask_set_cpu(i, cpu_sibling_mask(cpu));
- cpumask_set_cpu(cpu, cpu_sibling_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, o->llc_shared_map);
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
+
+static struct sched_domain_topology_level x86_topology[] = {
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
+#ifdef CONFIG_SCHED_CLUSTER
+ SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS),
+#endif
+#ifdef CONFIG_SCHED_MC
+ SDTL_INIT(tl_mc_mask, x86_core_flags, MC),
+#endif
+ SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG),
+ { NULL },
+};
+
+static void __init build_sched_topology(void)
+{
+ struct sched_domain_topology_level *topology = x86_topology;
+
+ /*
+ * When there is NUMA topology inside the package invalidate the
+ * PKG domain since the NUMA domains will auto-magically create the
+ * right spanning domains based on the SLIT.
+ */
+ if (x86_has_numa_in_package) {
+ unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2;
+
+ memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom]));
+ }
+
+ /*
+ * Drop the SMT domains if there is only one thread per core
+ * since it'll get degenerated by the scheduler anyway.
+ */
+ if (cpu_smt_num_threads <= 1)
+ ++topology;
+
+ set_sched_topology(topology);
+}
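
build_sched_topology() edits the level table in place: zeroing an entry terminates the walk at that level, and advancing the base pointer drops SMT entirely. A sketch of the same table surgery with the SDTL entries reduced to a name (inputs assumed for illustration):

    #include <stdio.h>
    #include <string.h>

    struct tl { const char *name; };

    static struct tl topo[] = {
        { "SMT" }, { "CLS" }, { "MC" }, { "PKG" }, { NULL },
    };

    int main(void)
    {
        struct tl *t = topo;
        int numa_in_package = 1;    /* pretend SNC/COD is active */
        int smt_threads = 1;        /* pretend SMT is off */

        /* NUMA inside the package: invalidate PKG (second-to-last slot) */
        if (numa_in_package)
            memset(&topo[sizeof(topo) / sizeof(topo[0]) - 2], 0, sizeof(topo[0]));

        /* One thread per core: skip the SMT level entirely */
        if (smt_threads <= 1)
            t++;

        for (; t->name; t++)        /* prints CLS and MC only */
            printf("level: %s\n", t->name);
        return 0;
    }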
+
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
+
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
}
}
- } else {
- cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
}
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+ /*
+ * With SNC enabled, there could be too many levels of remote
+ * NUMA node distances, creating NUMA domain levels
+ * including local nodes and partial remote nodes.
+ *
+ * Trim finer distance tuning for NUMA nodes in remote package
+ * for the purpose of building sched domains. Group NUMA nodes
+ * in the remote package in the same sched group.
+ * Simplify NUMA domains and avoid extra NUMA levels including
+ * different remote NUMA nodes and local nodes.
+ *
+ * GNR and CWF don't expect systems with more than 2 packages
+ * and more than 2 hops between packages. Single average remote
+ * distance won't be appropriate if there are more than 2
+ * packages as average distance to different remote packages
+ * could be different.
+ */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
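
avg_remote_numa_distance() folds every distance at or above REMOTE_DISTANCE into a single mean. A self-contained check of that arithmetic against a sample 4-node SLIT with two SNC nodes per package (matrix values are invented for illustration):

    #include <stdio.h>

    #define NR_NODES        4
    #define REMOTE_DISTANCE 20

    /* Example SLIT: nodes 0/1 in one package, 2/3 in the other */
    static const int slit[NR_NODES][NR_NODES] = {
        { 10, 12, 21, 23 },
        { 12, 10, 23, 21 },
        { 21, 23, 10, 12 },
        { 23, 21, 12, 10 },
    };

    static int avg_remote_distance(void)
    {
        int nr_remote = 0, total = 0;

        for (int i = 0; i < NR_NODES; i++)
            for (int j = 0; j < NR_NODES; j++)
                if (slit[i][j] >= REMOTE_DISTANCE) {
                    nr_remote++;
                    total += slit[i][j];
                }

        return nr_remote ? total / nr_remote : REMOTE_DISTANCE;
    }

    int main(void)
    {
        /* Eight remote entries of 21 and 23 -> prints 22 */
        printf("avg remote distance: %d\n", avg_remote_distance());
        return 0;
    }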
- cpumask_set_cpu(cpu, c->llc_shared_map);
+void set_cpu_sibling_map(int cpu)
+{
+ bool has_smt = __max_threads_per_core > 1;
+ bool has_mp = has_smt || topology_num_cores_per_package() > 1;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i, threads;
- if (current_cpu_data.x86_max_cores == 1) {
- cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
+
+ if (!has_mp) {
+ cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
+ cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+ cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
c->booted_cores = 1;
return;
}
for_each_cpu(i, cpu_sibling_setup_mask) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpumask_set_cpu(i, c->llc_shared_map);
- cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
+ o = &cpu_data(i);
+
+ if (match_pkg(c, o) && !topology_same_node(c, o))
+ x86_has_numa_in_package = true;
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(topology_sibling_cpumask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_llc(c, o)))
+ link_mask(cpu_llc_shared_mask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_l2c(c, o)))
+ link_mask(cpu_l2c_shared_mask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_die(c, o)))
+ link_mask(topology_die_cpumask, cpu, i);
+ }
+
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+
+ for_each_cpu(i, topology_sibling_cpumask(cpu))
+ cpu_data(i).smt_active = threads > 1;
+
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * topology_sibling_cpumask links to be set-up.
+ */
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mp && match_pkg(c, o))) {
+ link_mask(topology_core_cpumask, cpu, i);
+
/*
* Does this new cpu bringup a new core?
*/
- if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
+ if (threads == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
*/
- if (cpumask_first(cpu_sibling_mask(i)) == i)
+ if (cpumask_first(topology_sibling_cpumask(i)) == i)
c->booted_cores++;
/*
* increment the core count for all
@@ -441,18 +664,15 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if ((sched_mc_power_savings || sched_smt_power_savings) &&
- !(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return c->llc_shared_map;
+ return cpu_llc_shared_mask(cpu);
}
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+ return cpu_l2c_shared_mask(cpu);
+}
+EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
+
static void impress_friends(void)
{
int cpu;
@@ -460,141 +680,95 @@ static void impress_friends(void)
/*
* Allow the user to impress friends.
*/
- pr_debug("Before bogomips.\n");
- for_each_possible_cpu(cpu)
- if (cpumask_test_cpu(cpu, cpu_callout_mask))
- bogosum += cpu_data(cpu).loops_per_jiffy;
- printk(KERN_INFO
- "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ pr_debug("Before bogomips\n");
+ for_each_online_cpu(cpu)
+ bogosum += cpu_data(cpu).loops_per_jiffy;
+
+ pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
num_online_cpus(),
bogosum/(500000/HZ),
(bogosum/(5000/HZ))%100);
- pr_debug("Before bogocount - setting activated=1.\n");
+ pr_debug("Before bogocount - setting activated=1\n");
}
-void __inquire_remote_apic(int apicid)
-{
- unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
- int timeout;
- u32 status;
+/*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "cpu_init_udelay=" is available to override this delay.
+ */
+#define UDELAY_10MS_LEGACY 10000
- printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
+static unsigned int init_udelay = UINT_MAX;
- for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
+static int __init cpu_init_udelay(char *str)
+{
+ get_option(&str, &init_udelay);
- /*
- * Wait for idle.
- */
- status = safe_apic_wait_icr_idle();
- if (status)
- printk(KERN_CONT
- "a previous APIC delivery may have failed\n");
-
- apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
-
- timeout = 0;
- do {
- udelay(100);
- status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
- } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
- switch (status) {
- case APIC_ICR_RR_VALID:
- status = apic_read(APIC_RRR);
- printk(KERN_CONT "%08x\n", status);
- break;
- default:
- printk(KERN_CONT "failed\n");
- }
- }
+ return 0;
}
+early_param("cpu_init_udelay", cpu_init_udelay);
-/*
- * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
- * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
- * won't ... remember to clear down the APIC, etc later.
- */
-int __cpuinit
-wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
+static void __init smp_set_init_udelay(void)
{
- unsigned long send_status, accept_status = 0;
- int maxlvt;
-
- /* Target chip */
- /* Boot on the stack */
- /* Kick the second */
- apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+ /* if cmdline changed it from default, leave it alone */
+ if (init_udelay != UINT_MAX)
+ return;
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- accept_status = (apic_read(APIC_ESR) & 0xEF);
+ /* if modern processor, use no delay */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) ||
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) {
+ init_udelay = 0;
+ return;
}
- pr_debug("NMI sent.\n");
-
- if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
- if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
-
- return (send_status | accept_status);
+ /* else, use legacy delay */
+ init_udelay = UDELAY_10MS_LEGACY;
}
-static int __cpuinit
-wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+static void send_init_sequence(u32 phys_apicid)
{
- unsigned long send_status, accept_status = 0;
- int maxlvt, num_starts, j;
+ int maxlvt = lapic_get_maxlvt();
- maxlvt = lapic_get_maxlvt();
-
- /*
- * Be paranoid about clearing APIC errors.
- */
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ /* Be paranoid about clearing APIC errors. */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
- pr_debug("Asserting INIT.\n");
+ /* Assert INIT on the target CPU */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
- /*
- * Turn INIT on target chip
- */
- /*
- * Send IPI
- */
- apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
- phys_apicid);
-
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- mdelay(10);
-
- pr_debug("Deasserting INIT.\n");
+ udelay(init_udelay);
- /* Target chip */
- /* Send IPI */
+ /* Deassert INIT on the target CPU */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+ safe_apic_wait_icr_idle();
+}
- pr_debug("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int num_starts, j, maxlvt;
+
+ preempt_disable();
+ maxlvt = lapic_get_maxlvt();
+ send_init_sequence(phys_apicid);
mb();
- atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
@@ -602,29 +776,22 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
num_starts = 2;
else
num_starts = 0;
/*
- * Paravirt / VMI wants a startup IPI hook here to set up the
- * target processor state.
- */
- startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- (unsigned long)stack_start.sp);
-
- /*
* Run STARTUP IPI loop.
*/
- pr_debug("#startup loops: %d.\n", num_starts);
+ pr_debug("#startup loops: %d\n", num_starts);
for (j = 1; j <= num_starts; j++) {
- pr_debug("Sending STARTUP #%d.\n", j);
+ pr_debug("Sending STARTUP #%d\n", j);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- pr_debug("After apic_write.\n");
+ pr_debug("After apic_write\n");
/*
* STARTUP IPI
@@ -639,9 +806,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Give the other CPU some time to accept the IPI.
*/
- udelay(300);
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(300);
- pr_debug("Startup point 1.\n");
+ pr_debug("Startup point 1\n");
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -649,124 +819,115 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Give the other CPU some time to accept the IPI.
*/
- udelay(200);
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(200);
+
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
accept_status = (apic_read(APIC_ESR) & 0xEF);
if (send_status || accept_status)
break;
}
- pr_debug("After Startup.\n");
+ pr_debug("After Startup\n");
if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
+ pr_err("APIC never delivered???\n");
if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+ pr_err("APIC delivery error (%lx)\n", accept_status);
+ preempt_enable();
return (send_status | accept_status);
}
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
-
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
-static void __cpuinit announce_cpu(int cpu, int apicid)
+static void announce_cpu(int cpu, int apicid)
{
- static int current_node = -1;
+ static int width, node_width, first = 1;
+ static int current_node = NUMA_NO_NODE;
int node = early_cpu_to_node(cpu);
- if (system_state == SYSTEM_BOOTING) {
+ if (!width)
+ width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+ if (!node_width)
+ node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+ if (system_state < SYSTEM_RUNNING) {
+ if (first)
+ pr_info("x86: Booting SMP configuration:\n");
+
if (node != current_node) {
if (current_node > (-1))
- pr_cont(" Ok.\n");
+ pr_cont("\n");
current_node = node;
- pr_info("Booting Node %3d, Processors ", node);
+
+ printk(KERN_INFO ".... node %*s#%d, CPUs: ",
+ node_width - num_digits(node), " ", node);
}
- pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
- return;
+
+ /* Add padding for the BSP */
+ if (first)
+ pr_cont("%*s", width + 1, " ");
+ first = 0;
+
+ pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
} else
pr_info("Booting Node %d Processor %d APIC 0x%x\n",
node, cpu, apicid);
}
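
The width bookkeeping above right-aligns CPU numbers so the per-node boot lines form tidy columns, with one extra slot of padding reserved for the BSP. A userspace rendering of the same padding math (num_digits() replaced by a local helper; node and CPU counts assumed):

    #include <stdio.h>

    static int num_digits(int n)
    {
        int d = 1;

        while (n >= 10) {
            n /= 10;
            d++;
        }
        return d;
    }

    int main(void)
    {
        int nr_possible = 16;
        int width = num_digits(nr_possible) + 1;    /* + '#' sign */

        printf(".... node  #0, CPUs: ");
        printf("%*s", width + 1, " ");              /* padding for the BSP */
        for (int cpu = 1; cpu < 8; cpu++)
            printf("%*s#%d", width - num_digits(cpu), " ", cpu);
        printf("\n");
        return 0;
    }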
-/*
- * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
- * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- * Returns zero if CPU booted OK, else error code from
- * ->wakeup_secondary_cpu.
- */
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
{
- unsigned long boot_error = 0;
- unsigned long start_ip;
- int timeout;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
+ int ret;
- INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
- alternatives_smp_switch(1);
+ per_cpu(current_task, cpu) = idle;
+ cpu_init_stack_canary(cpu, idle);
- c_idle.idle = get_idle_for_cpu(cpu);
+ /* Initialize the interrupt stack(s) */
+ ret = irq_init_percpu_irqstack(cpu);
+ if (ret)
+ return ret;
- /*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
- */
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
- }
-
- if (!keventd_up() || current_is_keventd())
- c_idle.work.func(&c_idle.work);
- else {
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
- }
-
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- destroy_work_on_stack(&c_idle.work);
- return PTR_ERR(c_idle.idle);
- }
-
- set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
- per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- irq_ctx_init(cpu);
-#else
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
- initial_gs = per_cpu_offset(cpu);
- per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(c_idle.idle) -
- KERNEL_STACK_OFFSET + THREAD_SIZE;
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
+#endif
+ return 0;
+}
+
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if startup was successfully sent, else error code from
+ * ->wakeup_secondary_cpu.
+ */
+static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
+{
+ unsigned long start_ip = real_mode_header->trampoline_start;
+ int ret;
+
+#ifdef CONFIG_X86_64
+ /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
+ if (apic->wakeup_secondary_cpu_64)
+ start_ip = real_mode_header->trampoline_start64;
#endif
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
initial_code = (unsigned long)start_secondary;
- stack_start.sp = (void *) c_idle.idle->thread.sp;
- /* start_ip had better be page-aligned! */
- start_ip = setup_trampoline();
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ initial_stack = idle->thread.sp;
+ } else if (!(smpboot_control & STARTUP_PARALLEL_MASK)) {
+ smpboot_control = cpu;
+ }
+
+ /* Enable the espfix hack for this CPU */
+ init_espfix_ap(cpu);
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -775,10 +936,7 @@ do_rest:
* This grunge runs the startup process for
* the targeted processor.
*/
-
- atomic_set(&init_deasserted, 0);
-
- if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+ if (x86_platform.legacy.warm_reset) {
pr_debug("Setting warm reset code and vector.\n");
@@ -786,104 +944,51 @@ do_rest:
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}
+ smp_mb();
+
/*
- * Kick the secondary CPU. Use the method in the APIC driver
- * if it's defined - or use an INIT boot APIC message otherwise:
+ * Wake up a CPU in different cases:
+ * - Use a method from the APIC driver if one is defined, with wakeup
+ * straight to 64-bit mode preferred over wakeup to RM.
+ * Otherwise,
+ * - Use an INIT boot APIC message
*/
- if (apic->wakeup_secondary_cpu)
- boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+ if (apic->wakeup_secondary_cpu_64)
+ ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
+ else if (apic->wakeup_secondary_cpu)
+ ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
else
- boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
-
- if (!boot_error) {
- /*
- * allow APs to start initializing.
- */
- pr_debug("Before Callout %d.\n", cpu);
- cpumask_set_cpu(cpu, cpu_callout_mask);
- pr_debug("After Callout %d.\n", cpu);
-
- /*
- * Wait 5s total for a response
- */
- for (timeout = 0; timeout < 50000; timeout++) {
- if (cpumask_test_cpu(cpu, cpu_callin_mask))
- break; /* It has booted */
- udelay(100);
- }
-
- if (cpumask_test_cpu(cpu, cpu_callin_mask))
- pr_debug("CPU%d: has booted.\n", cpu);
- else {
- boot_error = 1;
- if (*((volatile unsigned char *)trampoline_base)
- == 0xA5)
- /* trampoline started but...? */
- pr_err("CPU%d: Stuck ??\n", cpu);
- else
- /* trampoline code not run */
- pr_err("CPU%d: Not responding.\n", cpu);
- if (apic->inquire_remote_apic)
- apic->inquire_remote_apic(apicid);
- }
- }
-
- if (boot_error) {
- /* Try to put things back the way they were before ... */
- numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-
- /* was set by do_boot_cpu() */
- cpumask_clear_cpu(cpu, cpu_callout_mask);
-
- /* was set by cpu_init() */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
-
- set_cpu_present(cpu, false);
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
- }
-
- /* mark "stuck" area as not stuck */
- *((volatile unsigned long *)trampoline_base) = 0;
-
- if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
- }
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
- destroy_work_on_stack(&c_idle.work);
- return boot_error;
+ /* If the wakeup mechanism failed, cleanup the warm reset vector */
+ if (ret)
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
}
-int __cpuinit native_cpu_up(unsigned int cpu)
+int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
{
- int apicid = apic->cpu_present_to_apicid(cpu);
- unsigned long flags;
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
int err;
- WARN_ON(irqs_disabled());
+ lockdep_assert_irqs_enabled();
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
- if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
- !physid_isset(apicid, phys_cpu_present_map)) {
- printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
+ if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
+ pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
return -EINVAL;
}
- /*
- * Already booted CPU?
- */
- if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
- pr_debug("do_boot_cpu %d Already started\n", cpu);
- return -ENOSYS;
+ if (!test_bit(apicid, phys_cpu_present_map)) {
+ pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
+ return -EINVAL;
}
/*
@@ -892,269 +997,162 @@ int __cpuinit native_cpu_up(unsigned int cpu)
*/
mtrr_save_state();
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* the FPU context is blank, nobody can own it */
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
-#ifdef CONFIG_X86_32
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
- flush_tlb_all();
- low_mappings = 1;
-
- err = do_boot_cpu(apicid, cpu);
-
- zap_low_mappings(false);
- low_mappings = 0;
-#else
- err = do_boot_cpu(apicid, cpu);
-#endif
- if (err) {
- pr_debug("do_boot_cpu failed %d\n", err);
- return -EIO;
- }
-
- /*
- * Check TSC synchronization with the AP (keep irqs disabled
- * while doing so):
- */
- local_irq_save(flags);
- check_tsc_sync_source(cpu);
- local_irq_restore(flags);
+ err = common_cpu_up(cpu, tidle);
+ if (err)
+ return err;
- while (!cpu_online(cpu)) {
- cpu_relax();
- touch_nmi_watchdog();
- }
+ err = do_boot_cpu(apicid, cpu, tidle);
+ if (err)
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
- return 0;
+ return err;
}
-/*
- * Fall back to non SMP mode after errors.
- *
- * RED-PEN audit/test this more. I bet there is more state messed up here.
- */
-static __init void disable_smp(void)
+int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
{
- init_cpu_present(cpumask_of(0));
- init_cpu_possible(cpumask_of(0));
- smpboot_clear_io_apic_irqs();
-
- if (smp_found_config)
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- else
- physid_set_mask_of_physid(0, &phys_cpu_present_map);
- map_cpu_to_logical_apicid();
- cpumask_set_cpu(0, cpu_sibling_mask(0));
- cpumask_set_cpu(0, cpu_core_mask(0));
+ return smp_ops.kick_ap_alive(cpu, tidle);
}
-/*
- * Various sanity checks.
- */
-static int __init smp_sanity_check(unsigned max_cpus)
+void arch_cpuhp_cleanup_kick_cpu(unsigned int cpu)
{
- preempt_disable();
-
-#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
- if (def_to_bigsmp && nr_cpu_ids > 8) {
- unsigned int cpu;
- unsigned nr;
+ /* Cleanup possible dangling ends... */
+ if (smp_ops.kick_ap_alive == native_kick_ap && x86_platform.legacy.warm_reset)
+ smpboot_restore_warm_reset_vector();
+}
- printk(KERN_WARNING
- "More than 8 CPUs detected - skipping them.\n"
- "Use CONFIG_X86_BIGSMP.\n");
+void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
+{
+ if (smp_ops.cleanup_dead_cpu)
+ smp_ops.cleanup_dead_cpu(cpu);
- nr = 0;
- for_each_present_cpu(cpu) {
- if (nr >= 8)
- set_cpu_present(cpu, false);
- nr++;
- }
+ if (system_state == SYSTEM_RUNNING)
+ pr_info("CPU %u is now offline\n", cpu);
+}
- nr = 0;
- for_each_possible_cpu(cpu) {
- if (nr >= 8)
- set_cpu_possible(cpu, false);
- nr++;
- }
+void arch_cpuhp_sync_state_poll(void)
+{
+ if (smp_ops.poll_sync_state)
+ smp_ops.poll_sync_state();
+}
- nr_cpu_ids = 8;
- }
-#endif
+/**
+ * arch_disable_smp_support() - Disables SMP support for x86 at boottime
+ */
+void __init arch_disable_smp_support(void)
+{
+ disable_ioapic_support();
+}
- if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk(KERN_WARNING
- "weird, boot CPU (#%d) not listed by the BIOS.\n",
- hard_smp_processor_id());
+/*
+ * Fall back to non SMP mode after errors.
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
+ */
+static __init void disable_smp(void)
+{
+ pr_info("SMP disabled\n");
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
+ disable_ioapic_support();
+ topology_reset_possible_cpus_up();
- /*
- * If we couldn't find an SMP configuration at boot time,
- * get out of here now!
- */
- if (!smp_found_config && !acpi_lapic) {
- preempt_enable();
- printk(KERN_NOTICE "SMP motherboard not detected.\n");
- disable_smp();
- if (APIC_init_uniprocessor())
- printk(KERN_NOTICE "Local APIC not detected."
- " Using dummy APIC emulation.\n");
- return -1;
- }
+ cpumask_set_cpu(0, topology_sibling_cpumask(0));
+ cpumask_set_cpu(0, topology_core_cpumask(0));
+ cpumask_set_cpu(0, topology_die_cpumask(0));
+}
- /*
- * Should not be necessary because the MP table should list the boot
- * CPU too, but we do it for the sake of robustness anyway.
- */
- if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
- printk(KERN_NOTICE
- "weird, boot CPU (#%d) not listed by the BIOS.\n",
- boot_cpu_physical_apicid);
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
- preempt_enable();
+void __init smp_prepare_cpus_common(void)
+{
+ unsigned int cpu, node;
- /*
- * If we couldn't find a local APIC, then get out of here now!
- */
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
- !cpu_has_apic) {
- if (!disable_apic) {
- pr_err("BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- pr_err("... forcing use of dummy APIC emulation."
- "(tell your hw vendor)\n");
- }
- smpboot_clear_io_apic();
- arch_disable_smp_support();
- return -1;
+ /* Mark all except the boot CPU as hotpluggable */
+ for_each_possible_cpu(cpu) {
+ if (cpu)
+ per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
}
- verify_local_APIC();
-
- /*
- * If SMP should be disabled, then really disable it!
- */
- if (!max_cpus) {
- printk(KERN_INFO "SMP mode deactivated.\n");
- smpboot_clear_io_apic();
-
- localise_nmi_watchdog();
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
- connect_bsp_APIC();
- setup_local_APIC();
- end_local_APIC_setup();
- return -1;
+ zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
}
- return 0;
+ set_cpu_sibling_map(0);
}
-static void __init smp_cpu_index_default(void)
+void __init smp_prepare_boot_cpu(void)
{
- int i;
- struct cpuinfo_x86 *c;
+ smp_ops.smp_prepare_boot_cpu();
+}
- for_each_possible_cpu(i) {
- c = &cpu_data(i);
- /* mark all to hotplug */
- c->cpu_index = nr_cpu_ids;
+#ifdef CONFIG_X86_64
+/* Establish whether parallel bringup can be supported. */
+bool __init arch_cpuhp_init_parallel_bringup(void)
+{
+ if (!x86_cpuinit.parallel_bringup) {
+ pr_info("Parallel CPU startup disabled by the platform\n");
+ return false;
}
+
+ smpboot_control = STARTUP_READ_APICID;
+ pr_debug("Parallel CPU startup enabled: 0x%08x\n", smpboot_control);
+ return true;
}
+#endif
/*
- * Prepare for SMP bootup. The MP table or ACPI has been read
- * earlier. Just do some sanity checking here and enable APIC mode.
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs. It is a legacy parameter
+ * for common interface support.
*/
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
- unsigned int i;
+ smp_prepare_cpus_common();
- preempt_disable();
- smp_cpu_index_default();
- current_cpu_data = boot_cpu_data;
- cpumask_copy(cpu_callin_mask, cpumask_of(0));
- mb();
- /*
- * Setup boot CPU information
- */
- smp_store_cpu_info(0); /* Final full version of the data */
-#ifdef CONFIG_X86_32
- boot_cpu_logical_apicid = logical_smp_processor_id();
-#endif
- current_thread_info()->cpu = 0; /* needed? */
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
- }
- set_cpu_sibling_map(0);
-
- enable_IR_x2apic();
- default_setup_apic_routing();
-
- if (smp_sanity_check(max_cpus) < 0) {
- printk(KERN_INFO "SMP disabled\n");
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
disable_smp();
- goto out;
- }
-
- preempt_disable();
- if (read_apic_id() != boot_cpu_physical_apicid) {
- panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- read_apic_id(), boot_cpu_physical_apicid);
- /* Or can we switch back to PIC here? */
+ return;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ disable_smp();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+ return;
+ case APIC_VIRTUAL_WIRE:
+ case APIC_SYMMETRIC_IO:
+ break;
}
- preempt_enable();
-
- connect_bsp_APIC();
- /*
- * Switch from PIC to APIC mode.
- */
- setup_local_APIC();
-
- /*
- * Enable IO APIC before setting up error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-
- end_local_APIC_setup();
-
- map_cpu_to_logical_apicid();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
- if (apic->setup_portio_remap)
- apic->setup_portio_remap();
+ pr_info("CPU0: ");
+ print_cpu_info(&cpu_data(0));
- smpboot_setup_io_apic();
- /*
- * Set up local APIC timer on boot CPU.
- */
+ uv_system_init();
- printk(KERN_INFO "CPU%d: ", 0);
- print_cpu_info(&cpu_data(0));
- x86_init.timers.setup_percpu_clockev();
+ smp_set_init_udelay();
- if (is_uv_system())
- uv_system_init();
+ speculative_store_bypass_ht_init();
- set_mtrr_aps_delayed_init();
-out:
- preempt_enable();
+ snp_set_wakeup_secondary_cpu();
}
-void arch_enable_nonboot_cpus_begin(void)
+void arch_thaw_secondary_cpus_begin(void)
{
- set_mtrr_aps_delayed_init();
+ set_cache_aps_delayed_init(true);
}
-void arch_enable_nonboot_cpus_end(void)
+void arch_thaw_secondary_cpus_end(void)
{
- mtrr_aps_init();
+ cache_aps_init();
}
/*
@@ -1163,134 +1161,88 @@ void arch_enable_nonboot_cpus_end(void)
void __init native_smp_prepare_boot_cpu(void)
{
int me = smp_processor_id();
- switch_to_new_gdt(me);
- /* already set me in cpu_online_mask in boot_cpu_init() */
- cpumask_set_cpu(me, cpu_callout_mask);
- per_cpu(cpu_state, me) = CPU_ONLINE;
+
+ /* SMP handles this from setup_per_cpu_areas() */
+ if (!IS_ENABLED(CONFIG_SMP))
+ switch_gdt_and_percpu_base(me);
+
+ native_pv_lock_init();
}
void __init native_smp_cpus_done(unsigned int max_cpus)
{
- pr_debug("Boot done.\n");
+ pr_debug("Boot done\n");
+ build_sched_topology();
+ nmi_selftest();
impress_friends();
-#ifdef CONFIG_X86_IO_APIC
- setup_ioapic_dest();
-#endif
- check_nmi_watchdog();
- mtrr_aps_init();
+ cache_aps_init();
}
-static int __initdata setup_possible_cpus = -1;
-static int __init _setup_possible_cpus(char *str)
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
{
- get_option(&str, &setup_possible_cpus);
- return 0;
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}
-early_param("possible_cpus", _setup_possible_cpus);
-
-
-/*
- * cpu_possible_mask should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_mask on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with possible_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- * We do this because additional CPUs waste a lot of memory.
- * -AK
- */
-__init void prefill_possible_map(void)
-{
- int i, possible;
- /* no processor from mptable or madt */
- if (!num_processors)
- num_processors = 1;
-
- i = setup_max_cpus ?: 1;
- if (setup_possible_cpus == -1) {
- possible = num_processors;
#ifdef CONFIG_HOTPLUG_CPU
- if (setup_max_cpus)
- possible += disabled_cpus;
-#else
- if (possible > i)
- possible = i;
-#endif
- } else
- possible = setup_possible_cpus;
- total_cpus = max_t(int, possible, num_processors + disabled_cpus);
+/* Recompute SMT state for all CPUs on offline */
+static void recompute_smt_state(void)
+{
+ int max_threads, cpu;
- /* nr_cpu_ids could be reduced via nr_cpus= */
- if (possible > nr_cpu_ids) {
- printk(KERN_WARNING
- "%d Processors exceeds NR_CPUS limit of %d\n",
- possible, nr_cpu_ids);
- possible = nr_cpu_ids;
- }
+ max_threads = 0;
+ for_each_online_cpu(cpu) {
+ int threads = cpumask_weight(topology_sibling_cpumask(cpu));
-#ifdef CONFIG_HOTPLUG_CPU
- if (!setup_max_cpus)
-#endif
- if (possible > i) {
- printk(KERN_WARNING
- "%d Processors exceeds max_cpus limit of %u\n",
- possible, setup_max_cpus);
- possible = i;
+ if (threads > max_threads)
+ max_threads = threads;
}
-
- printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
- possible, max_t(int, possible - num_processors, 0));
-
- for (i = 0; i < possible; i++)
- set_cpu_possible(i, true);
- for (; i < NR_CPUS; i++)
- set_cpu_possible(i, false);
-
- nr_cpu_ids = possible;
+ __max_smt_threads = max_threads;
}
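
As an aside for readers of this hunk: the new recompute_smt_state() is just a max of popcounts over the online CPUs' sibling masks. A minimal userspace sketch of the same computation, with made-up masks and a GCC-style popcount builtin standing in for cpumask_weight():

#include <stdio.h>

int main(void)
{
	/* Hypothetical sibling masks: two full SMT-2 cores, plus one
	 * core whose second thread is offline. cpumask_weight()
	 * becomes a popcount here. */
	unsigned long sibling_mask[] = { 0x3, 0x3, 0x4 };
	int max_threads = 0;

	for (unsigned int i = 0; i < sizeof(sibling_mask) / sizeof(sibling_mask[0]); i++) {
		int threads = __builtin_popcountl(sibling_mask[i]);

		if (threads > max_threads)
			max_threads = threads;
	}
	printf("max SMT threads: %d\n", max_threads);	/* prints 2 */
	return 0;
}
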
-#ifdef CONFIG_HOTPLUG_CPU
-
static void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- for_each_cpu(sibling, cpu_core_mask(cpu)) {
- cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
+ for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+ cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
/*
* last thread sibling in this cpu core going down
*/
- if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
+ if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
cpu_data(sibling).booted_cores--;
}
- for_each_cpu(sibling, cpu_sibling_mask(cpu))
- cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
- cpumask_clear(cpu_sibling_mask(cpu));
- cpumask_clear(cpu_core_mask(cpu));
- c->phys_proc_id = 0;
- c->cpu_core_id = 0;
+ for_each_cpu(sibling, topology_die_cpumask(cpu))
+ cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
+
+ for_each_cpu(sibling, topology_sibling_cpumask(cpu)) {
+ cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+ if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1)
+ cpu_data(sibling).smt_active = false;
+ }
+
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(cpu_l2c_shared_mask(cpu));
+ cpumask_clear(topology_sibling_cpumask(cpu));
+ cpumask_clear(topology_core_cpumask(cpu));
+ cpumask_clear(topology_die_cpumask(cpu));
+ c->topo.core_id = 0;
+ c->booted_cores = 0;
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+ recompute_smt_state();
}
-static void __ref remove_cpu_from_maps(int cpu)
+static void remove_cpu_from_maps(int cpu)
{
set_cpu_online(cpu, false);
- cpumask_clear_cpu(cpu, cpu_callout_mask);
- cpumask_clear_cpu(cpu, cpu_callin_mask);
- /* was set by cpu_init() */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
numa_remove_cpu(cpu);
}
@@ -1300,78 +1252,160 @@ void cpu_disable_common(void)
remove_siblinginfo(cpu);
+ /*
+ * Stop allowing kernel-mode FPU. This is needed so that if the CPU is
+ * brought online again, it starts out with kernel-mode FPU disallowed:
+ */
+ this_cpu_write(kernel_fpu_allowed, false);
+
/* It's now safe to remove this processor from the online map */
lock_vector_lock();
remove_cpu_from_maps(cpu);
unlock_vector_lock();
fixup_irqs();
+ lapic_offline();
}
int native_cpu_disable(void)
{
- int cpu = smp_processor_id();
-
- /*
- * Perhaps use cpufreq to drop frequency, but that could go
- * into generic code.
- *
- * We won't take down the boot processor on i386 due to some
- * interrupts only being able to be serviced by the BSP.
- * Especially so if we're not using an IOAPIC -zwane
- */
- if (cpu == 0)
- return -EBUSY;
+ int ret;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
- clear_local_APIC();
+ ret = lapic_can_unplug_cpu();
+ if (ret)
+ return ret;
cpu_disable_common();
+
+ /*
+ * Disable the local APIC. Otherwise IPI broadcasts will reach
+ * it. It still responds normally to INIT, NMI, SMI, and SIPI
+ * messages.
+ *
+ * Disabling the APIC must happen after cpu_disable_common()
+ * which invokes fixup_irqs().
+ *
+ * Disabling the APIC preserves already set bits in IRR, but
+ * an interrupt arriving after disabling the local APIC does not
+ * set the corresponding IRR bit.
+ *
+ * fixup_irqs() scans IRR for set bits so it can raise a not
+ * yet handled interrupt on the new destination CPU via an IPI
+ * but obviously it can't do so for IRR bits which are not set.
+ * IOW, interrupts arriving after disabling the local APIC will
+ * be lost.
+ */
+ apic_soft_disable();
+
return 0;
}
-void native_cpu_die(unsigned int cpu)
+void play_dead_common(void)
{
- /* We don't do anything here: idle task is faking death itself. */
- unsigned int i;
-
- for (i = 0; i < 10; i++) {
- /* They ack this in play_dead by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
- if (system_state == SYSTEM_RUNNING)
- pr_info("CPU %u is now offline\n", cpu);
-
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
- return;
+ idle_task_exit();
+
+ cpuhp_ap_report_dead();
+
+ local_irq_disable();
+}
+
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+void __noreturn mwait_play_dead(unsigned int eax_hint)
+{
+ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+
+ /* Set up state for the kexec() hack below */
+ md->status = CPUDEAD_MWAIT_WAIT;
+ md->control = CPUDEAD_MWAIT_WAIT;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ mb();
+ clflush(md);
+ mb();
+ __monitor(md, 0, 0);
+ mb();
+ __mwait(eax_hint, 0);
+
+ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+ /*
+ * Kexec is about to happen. Don't go back into mwait() as
+ * the kexec kernel might overwrite text and data including
+ * page tables and stack. So mwait() would resume when the
+ * monitor cache line is written to and then the CPU goes
+ * south due to overwritten text, page tables and stack.
+ *
+ * Note: This does _NOT_ protect against a stray MCE, NMI,
+ * SMI. They will resume execution at the instruction
+ * following the HLT instruction and run into the problem
+ * which this is trying to prevent.
+ */
+ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+ while (1)
+ native_halt();
}
- msleep(100);
}
- pr_err("CPU %u didn't die...\n", cpu);
}
-void play_dead_common(void)
+/*
+ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+ * mwait_play_dead().
+ */
+void smp_kick_mwait_play_dead(void)
{
- idle_task_exit();
- reset_lazy_tlbstate();
- irq_ctx_exit(raw_smp_processor_id());
- c1e_remove_cpu(raw_smp_processor_id());
+ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+ struct mwait_cpu_dead *md;
+ unsigned int cpu, i;
+
+ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+
+ /* Does it sit in mwait_play_dead() ? */
+ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+ continue;
+
+ /* Wait up to 5ms */
+ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+ /* Bring it out of mwait */
+ WRITE_ONCE(md->control, newstate);
+ udelay(5);
+ }
- mb();
- /* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
+ if (READ_ONCE(md->status) != newstate)
+ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
+ }
+}
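
The control/status pair above forms a small handshake: the kexec path writes control to kick a parked CPU out of MWAIT, and the parked CPU acknowledges through status before halting. A rough userspace model of that protocol, using a spinning thread and C11 atomics in place of MONITOR/MWAIT (the constants mirror the kernel's names, but everything here is illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MWAIT_WAIT	0xdeadbeefU	/* parked, monitoring */
#define MWAIT_KEXEC_HLT	0x4a11deadU	/* kicked out, halted */

static _Atomic unsigned int control = MWAIT_WAIT;
static _Atomic unsigned int status  = MWAIT_WAIT;

static void *parked_cpu(void *arg)
{
	/* Stand-in for the mwait loop: wake when 'control' changes. */
	while (atomic_load(&control) != MWAIT_KEXEC_HLT)
		;
	/* Acknowledge, then "halt" (here: just return). */
	atomic_store(&status, MWAIT_KEXEC_HLT);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, parked_cpu, NULL);
	atomic_store(&control, MWAIT_KEXEC_HLT);	/* the kexec-side kick */
	while (atomic_load(&status) != MWAIT_KEXEC_HLT)
		;	/* the kernel bounds this wait to roughly 5ms */
	printf("parked CPU acknowledged the kick\n");
	pthread_join(t, NULL);
	return 0;
}
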
- /*
- * With physical CPU hotplug, we should halt the cpu
- */
- local_irq_disable();
+void __noreturn hlt_play_dead(void)
+{
+ if (__this_cpu_read(cpu_info.x86) >= 4)
+ wbinvd();
+
+ while (1)
+ native_halt();
}
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ __update_spec_ctrl(0);
+
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- wbinvd_halt();
+
+ /* Below returns only on error. */
+ cpuidle_play_dead();
+ hlt_play_dead();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
@@ -1380,13 +1414,7 @@ int native_cpu_disable(void)
return -ENOSYS;
}
-void native_cpu_die(unsigned int cpu)
-{
- /* We said "no" in __cpu_disable */
- BUG();
-}
-
-void native_play_dead(void)
+void __noreturn native_play_dead(void)
{
BUG();
}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..ee117fcf46ed 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -4,129 +4,115 @@
* Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-static void save_stack_warning(void *data, char *msg)
+void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
+ struct task_struct *task, struct pt_regs *regs)
{
-}
+ struct unwind_state state;
+ unsigned long addr;
-static void
-save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-}
-
-static int save_stack_stack(void *data, char *name)
-{
- return 0;
-}
-
-static void save_stack_address(void *data, unsigned long addr, int reliable)
-{
- struct stack_trace *trace = data;
- if (!reliable)
- return;
- if (trace->skip > 0) {
- trace->skip--;
+ if (regs && !consume_entry(cookie, regs->ip))
return;
+
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || !consume_entry(cookie, addr))
+ break;
}
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
}
-static void
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
+ void *cookie, struct task_struct *task)
{
- struct stack_trace *trace = (struct stack_trace *)data;
- if (!reliable)
- return;
- if (in_sched_functions(addr))
- return;
- if (trace->skip > 0) {
- trace->skip--;
- return;
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
-}
+ struct unwind_state state;
+ struct pt_regs *regs;
+ unsigned long addr;
+
+ for (unwind_start(&state, task, NULL, NULL);
+ !unwind_done(&state) && !unwind_error(&state);
+ unwind_next_frame(&state)) {
+
+ regs = unwind_get_entry_regs(&state, NULL);
+ if (regs) {
+ /* Success path for user tasks */
+ if (user_mode(regs))
+ return 0;
+
+ /*
+ * Kernel mode registers on the stack indicate an
+ * in-kernel interrupt or exception (e.g., preemption
+ * or a page fault), which can make frame pointers
+ * unreliable.
+ */
+ if (IS_ENABLED(CONFIG_FRAME_POINTER))
+ return -EINVAL;
+ }
-static const struct stacktrace_ops save_stack_ops = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
- .stack = save_stack_stack,
- .address = save_stack_address,
- .walk_stack = print_context_stack,
-};
+ addr = unwind_get_return_address(&state);
-static const struct stacktrace_ops save_stack_ops_nosched = {
- .warning = save_stack_warning,
- .warning_symbol = save_stack_warning_symbol,
- .stack = save_stack_stack,
- .address = save_stack_address_nosched,
- .walk_stack = print_context_stack,
-};
+ /*
+ * A NULL or invalid return address probably means there's some
+ * generated code which __kernel_text_address() doesn't know
+ * about.
+ */
+ if (!addr)
+ return -EINVAL;
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- */
-void save_stack_trace(struct stack_trace *trace)
-{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
-EXPORT_SYMBOL_GPL(save_stack_trace);
+ if (!consume_entry(cookie, addr))
+ return -EINVAL;
+ }
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
-{
- dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
+ /* Check for stack corruption */
+ if (unwind_error(&state))
+ return -EINVAL;
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
-{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ return 0;
}
-EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
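
The consume_entry callback is the whole contract of the new arch_stack_walk() API above: the walker feeds addresses to a caller-supplied function until it returns false. A self-contained sketch of that shape, with a fake three-entry unwinder (the addresses and the "budget" cookie are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

typedef bool (*consume_fn)(void *cookie, unsigned long addr);

/* Fake unwinder: hands out three canned addresses. */
static void walk(consume_fn consume, void *cookie)
{
	static const unsigned long trace[] = { 0x1000, 0x2000, 0x3000 };

	for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++)
		if (!consume(cookie, trace[i]))
			break;
}

/* Consumer that stops the walk once its budget is spent. */
static bool take_some(void *cookie, unsigned long addr)
{
	int *budget = cookie;

	printf("entry %#lx\n", addr);
	return --*budget > 0;
}

int main(void)
{
	int budget = 2;

	walk(take_some, &budget);	/* prints two of the three entries */
	return 0;
}
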
/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
-struct stack_frame {
+struct stack_frame_user {
const void __user *next_fp;
unsigned long ret_addr;
};
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const struct stack_frame_user __user *fp,
+ struct stack_frame_user *frame)
{
int ret;
- if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ if (!__access_ok(fp, sizeof(*frame)))
return 0;
ret = 1;
pagefault_disable();
- if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ if (__get_user(frame->next_fp, &fp->next_fp) ||
+ __get_user(frame->ret_addr, &fp->ret_addr))
ret = 0;
pagefault_enable();
return ret;
}
-static inline void __save_stack_trace_user(struct stack_trace *trace)
+void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
+ const struct pt_regs *regs)
{
- const struct pt_regs *regs = task_pt_regs(current);
const void __user *fp = (const void __user *)regs->bp;
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = regs->ip;
+ if (!consume_entry(cookie, regs->ip))
+ return;
- while (trace->nr_entries < trace->max_entries) {
- struct stack_frame frame;
+ while (1) {
+ struct stack_frame_user frame;
frame.next_fp = NULL;
frame.ret_addr = 0;
@@ -134,25 +120,11 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
break;
if ((unsigned long)fp < regs->sp)
break;
- if (frame.ret_addr) {
- trace->entries[trace->nr_entries++] =
- frame.ret_addr;
- }
- if (fp == frame.next_fp)
+ if (!frame.ret_addr)
+ break;
+ if (!consume_entry(cookie, frame.ret_addr))
break;
fp = frame.next_fp;
}
}
-void save_stack_trace_user(struct stack_trace *trace)
-{
- /*
- * Trace user stack if we are not a kernel thread
- */
- if (current->mm) {
- __save_stack_trace_user(trace);
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
-
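
The user-stack walk above is the classic frame-pointer chain: each frame begins with {caller's fp, return address}, and the loop follows next_fp while it keeps increasing. The same walk can be sketched in userspace against the program's own stack; this assumes x86-64 frame layout and should be built with -O0 -fno-omit-frame-pointer so the chain actually exists:

#include <stdio.h>

struct frame {
	struct frame *next_fp;
	unsigned long ret_addr;
};

static void show(void)
{
	struct frame *fp = __builtin_frame_address(0);

	/* Stay shallow: we only know the three frames below us are ours. */
	for (int depth = 0; fp && depth < 3; depth++) {
		printf("ret %#lx\n", fp->ret_addr);
		if (fp->next_fp <= fp)	/* same monotonicity rule as above */
			break;
		fp = fp->next_fp;
	}
}

static void a(void) { show(); }
static void b(void) { a(); }

int main(void)
{
	b();
	return 0;
}
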
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
new file mode 100644
index 000000000000..61592e41a6b1
--- /dev/null
+++ b/arch/x86/kernel/static_call.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/static_call.h>
+#include <linux/memory.h>
+#include <linux/bug.h>
+#include <asm/text-patching.h>
+
+enum insn_type {
+ CALL = 0, /* site call */
+ NOP = 1, /* site cond-call */
+ JMP = 2, /* tramp / site tail-call */
+ RET = 3, /* tramp / site cond-tail-call */
+ JCC = 4,
+};
+
+/*
+ * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such
+ * that there is no false-positive trampoline identification while also being a
+ * speculation stop.
+ */
+static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc };
+
+/*
+ * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax
+ */
+static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
+
+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+
+/*
+ * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug()
+ */
+static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a };
+
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+ u8 ret = 0;
+
+ if (insn[0] == 0x0f) {
+ u8 tmp = insn[1];
+ if ((tmp & 0xf0) == 0x80)
+ ret = tmp;
+ }
+
+ return ret;
+}
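
__is_Jcc() keys off the two-byte Jcc rel32 encoding: 0x0F followed by 0x80..0x8F. A standalone check of that decode rule (the byte sequences are ordinary x86 encodings; the function is a rewrite for illustration only):

#include <assert.h>

static unsigned char is_jcc(const unsigned char *insn)
{
	/* 0x0F 0x80..0x8F is the Jcc.d32 family; return the condition byte. */
	if (insn[0] == 0x0f && (insn[1] & 0xf0) == 0x80)
		return insn[1];
	return 0;
}

int main(void)
{
	const unsigned char je[]  = { 0x0f, 0x84, 0x00, 0x00, 0x00, 0x00 };	/* je rel32 */
	const unsigned char nop[] = { 0x0f, 0x1f, 0x00 };			/* nopl (%rax) */

	assert(is_jcc(je) == 0x84);
	assert(is_jcc(nop) == 0);
	return 0;
}
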
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+ ".type __static_call_return, @function\n\t"
+ ASM_FUNC_ALIGN "\n\t"
+ "__static_call_return:\n\t"
+ ANNOTATE_NOENDBR "\n\t"
+ ANNOTATE_RETPOLINE_SAFE "\n\t"
+ "ret; int3\n\t"
+ ".size __static_call_return, . - __static_call_return \n\t");
+
+static void __ref __static_call_transform(void *insn, enum insn_type type,
+ void *func, bool modinit)
+{
+ const void *emulate = NULL;
+ int size = CALL_INSN_SIZE;
+ const void *code;
+ u8 op, buf[6];
+
+ if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+ type = JCC;
+
+ switch (type) {
+ case CALL:
+ func = callthunks_translate_call_dest(func);
+ code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+ if (func == &__WARN_trap) {
+ emulate = code;
+ code = &warninsn;
+ }
+ break;
+
+ case NOP:
+ code = x86_nops[5];
+ break;
+
+ case JMP:
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, func);
+ break;
+
+ case RET:
+ if (cpu_wants_rethunk_at(insn))
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
+ else
+ code = &retinsn;
+ break;
+
+ case JCC:
+ if (!func) {
+ func = __static_call_return;
+ if (cpu_wants_rethunk())
+ func = x86_return_thunk;
+ }
+
+ buf[0] = 0x0f;
+ __text_gen_insn(buf+1, op, insn+1, func, 5);
+ code = buf;
+ size = 6;
+
+ break;
+ }
+
+ if (memcmp(insn, code, size) == 0)
+ return;
+
+ if (system_state == SYSTEM_BOOTING || modinit)
+ return text_poke_early(insn, code, size);
+
+ smp_text_poke_single(insn, code, size, emulate);
+}
+
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
+{
+ u8 opcode = insn[0];
+
+ if (tramp && memcmp(insn+5, tramp_ud, 3)) {
+ pr_err("trampoline signature fail");
+ BUG();
+ }
+
+ if (tail) {
+ if (opcode == JMP32_INSN_OPCODE ||
+ opcode == RET_INSN_OPCODE ||
+ __is_Jcc(insn))
+ return;
+ } else {
+ if (opcode == CALL_INSN_OPCODE ||
+ !memcmp(insn, x86_nops[5], 5) ||
+ !memcmp(insn, xor5rax, 5) ||
+ !memcmp(insn, warninsn, 5))
+ return;
+ }
+
+ /*
+ * If we ever trigger this, our text is corrupt and we'll probably not live long.
+ */
+ pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn);
+ BUG();
+}
+
+static inline enum insn_type __sc_insn(bool null, bool tail)
+{
+ /*
+ * Encode the following table without branches:
+ *
+ * tail null insn
+ * -----+-------+------
+ * 0 | 0 | CALL
+ * 0 | 1 | NOP
+ * 1 | 0 | JMP
+ * 1 | 1 | RET
+ */
+ return 2*tail + null;
+}
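
The branchless encoding is easy to sanity-check outside the kernel: 2*tail + null lands on exactly the enum values in the table. A minimal standalone version:

#include <assert.h>
#include <stdbool.h>

enum insn_type { CALL = 0, NOP = 1, JMP = 2, RET = 3 };

static enum insn_type sc_insn(bool null, bool tail)
{
	return 2*tail + null;
}

int main(void)
{
	assert(sc_insn(false, false) == CALL);	/* call site, real target */
	assert(sc_insn(true,  false) == NOP);	/* call site, NULL target */
	assert(sc_insn(false, true)  == JMP);	/* tail call, real target */
	assert(sc_insn(true,  true)  == RET);	/* tail call, NULL target */
	return 0;
}
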
+
+void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
+{
+ mutex_lock(&text_mutex);
+
+ if (tramp && !site) {
+ __static_call_validate(tramp, true, true);
+ __static_call_transform(tramp, __sc_insn(!func, true), func, false);
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) {
+ __static_call_validate(site, tail, false);
+ __static_call_transform(site, __sc_insn(!func, tail), func, false);
+ }
+
+ mutex_unlock(&text_mutex);
+}
+EXPORT_SYMBOL_GPL(arch_static_call_transform);
+
+noinstr void __static_call_update_early(void *tramp, void *func)
+{
+ BUG_ON(system_state != SYSTEM_BOOTING);
+ BUG_ON(static_call_initialized);
+ __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE);
+ sync_core();
+}
+
+#ifdef CONFIG_MITIGATION_RETHUNK
+/*
+ * This is called by apply_returns() to fix up static call trampolines,
+ * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as
+ * having a return trampoline.
+ *
+ * The problem is that static_call() is available before determining
+ * X86_FEATURE_RETHUNK and, by implication, running alternatives.
+ *
+ * This means that __static_call_transform() above can have overwritten the
+ * return trampoline and we now need to fix things up to be consistent.
+ */
+bool __static_call_fixup(void *tramp, u8 op, void *dest)
+{
+ unsigned long addr = (unsigned long)tramp;
+ /*
+ * Not all .return_sites are a static_call trampoline (most are not).
+ * Check if the 3 bytes after the return are still kernel text, if not,
+ * then this definitely is not a trampoline and we need not worry
+ * further.
+ *
+ * This avoids the memcmp() below tripping over pagefaults etc..
+ */
+ if (((addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT)) &&
+ !kernel_text_address(addr + 7))
+ return false;
+
+ if (memcmp(tramp+5, tramp_ud, 3)) {
+ /* Not a trampoline site, not our problem. */
+ return false;
+ }
+
+ mutex_lock(&text_mutex);
+ if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk)
+ __static_call_transform(tramp, RET, NULL, true);
+ mutex_unlock(&text_mutex);
+
+ return true;
+}
+#endif
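
The guard in __static_call_fixup() relies on a page-crossing observation: reading tramp+5..tramp+7 can only fault if those bytes sit on a different page than tramp itself. That test is portable enough to check standalone (PAGE_SHIFT hard-coded to 4K pages for this sketch):

#include <assert.h>

#define PAGE_SHIFT 12

/* True if any of the 8 bytes at addr spill onto the next page. */
static int crosses_page(unsigned long addr)
{
	return (addr >> PAGE_SHIFT) != ((addr + 7) >> PAGE_SHIFT);
}

int main(void)
{
	assert(!crosses_page(0x1000));	/* bytes 0x1000..0x1007, one page */
	assert(crosses_page(0x1ffa));	/* byte 0x2001 is on the next page */
	return 0;
}
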
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..3e2952679b88 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -1,22 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* x86 single-step support code, common to 32-bit and 64-bit.
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
+
#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/mmu_context.h>
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
unsigned long addr, seg;
addr = regs->ip;
- seg = regs->cs & 0xffff;
+ seg = regs->cs;
if (v8086_mode(regs)) {
addr = (addr & 0xffff) + (seg << 4);
return addr;
}
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* We'll assume that the code segments in the GDT
* are all zero-based. That is largely true: the
@@ -27,13 +33,14 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
struct desc_struct *desc;
unsigned long base;
- seg &= ~7UL;
+ seg >>= 3;
mutex_lock(&child->mm->context.lock);
- if (unlikely((seg >> 3) >= child->mm->context.size))
+ if (unlikely(!child->mm->context.ldt ||
+ seg >= child->mm->context.ldt->nr_entries))
addr = -1L; /* bogus selector, access would fault */
else {
- desc = child->mm->context.ldt + seg;
+ desc = &child->mm->context.ldt->entries[seg];
base = get_desc_base(desc);
/* 16-bit code segment? */
@@ -43,6 +50,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
}
mutex_unlock(&child->mm->context.lock);
}
+#endif
return addr;
}
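
For v8086 mode the linear-address math above is just real-mode segmentation: segment * 16 plus the 16-bit offset. A one-liner check (the reset-vector example is the textbook case):

#include <assert.h>

static unsigned long v8086_linear(unsigned long ip, unsigned long cs)
{
	return (ip & 0xffff) + (cs << 4);
}

int main(void)
{
	/* CS=0xF000, IP=0xFFF0 -> 0xFFFF0, the classic reset vector. */
	assert(v8086_linear(0xfff0, 0xf000) == 0xffff0);
	return 0;
}
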
@@ -53,7 +61,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
unsigned char opcode[15];
unsigned long addr = convert_ip_to_linear(child, regs);
- copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+ copied = access_process_vm(child, addr, opcode, sizeof(opcode),
+ FOLL_FORCE);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
/* popf and iret */
@@ -74,7 +83,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
#ifdef CONFIG_X86_64
case 0x40 ... 0x4f:
- if (regs->cs != __USER_CS)
+ if (!user_64bit_mode(regs))
/* 32-bit mode: register increment */
return 0;
/* 64-bit mode: REX prefix */
@@ -120,12 +129,17 @@ static int enable_single_step(struct task_struct *child)
regs->flags |= X86_EFLAGS_TF;
/*
- * Always set TIF_SINGLESTEP - this guarantees that
- * we single-step system calls etc.. This will also
+ * Always set TIF_SINGLESTEP. This will also
* cause us to set TF when returning to user mode.
*/
set_tsk_thread_flag(child, TIF_SINGLESTEP);
+ /*
+ * Ensure that a trap is triggered once stepping out of a system
+ * call prior to executing any user instruction.
+ */
+ set_task_syscall_work(child, SYSCALL_EXIT_TRAP);
+
oflags = regs->flags;
/* Set TF on the kernel stack.. */
@@ -157,6 +171,33 @@ static int enable_single_step(struct task_struct *child)
return 1;
}
+void set_task_blockstep(struct task_struct *task, bool on)
+{
+ unsigned long debugctl;
+
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Note also that both TIF_BLOCKSTEP and debugctl should
+ * be changed atomically wrt preemption.
+ *
+ * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
+ * task is current or it can't be running, otherwise we can race
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced().
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
+}
+
/*
* Enable single or block step.
*/
@@ -166,22 +207,13 @@ static void enable_step(struct task_struct *child, bool block)
* Make sure block stepping (BTF) is not enabled unless it should be.
* Note that we don't try to worry about any is_setting_trap_flag()
* instructions after the first when using block stepping.
- * So noone should try to use debugger block stepping in a program
+ * So no one should try to use debugger block stepping in a program
* that uses user-mode single stepping itself.
*/
- if (enable_single_step(child) && block) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- set_tsk_thread_flag(child, TIF_BLOCKSTEP);
- } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
}
void user_enable_single_step(struct task_struct *child)
@@ -199,16 +231,12 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+ clear_task_syscall_work(child, SYSCALL_EXIT_TRAP);
/* But touch TF only if it was set by us.. */
if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
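
From userspace, all of this machinery is driven by PTRACE_SINGLESTEP, which ends up in user_enable_single_step(). A minimal tracer that steps a child a few instructions (error handling elided; the step count is arbitrary):

#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {			/* child: volunteer for tracing */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		return 0;
	}

	waitpid(pid, NULL, 0);		/* wait for the SIGSTOP */
	for (int i = 0; i < 5; i++) {	/* five single steps */
		ptrace(PTRACE_SINGLESTEP, pid, NULL, NULL);
		waitpid(pid, NULL, 0);	/* each step stops the child again */
	}
	ptrace(PTRACE_CONT, pid, NULL, NULL);
	waitpid(pid, NULL, 0);
	printf("stepped and released child %d\n", pid);
	return 0;
}
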
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
deleted file mode 100644
index 196552bb412c..000000000000
--- a/arch/x86/kernel/sys_i386_32.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This file contains various random system calls that
- * have a non-standard calling sequence on the Linux/i386
- * platform.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/ipc.h>
-
-#include <linux/uaccess.h>
-#include <linux/unistd.h>
-
-#include <asm/syscalls.h>
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
-{
- long __res;
- asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
- : "=a" (__res)
- : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
- return __res;
-}
diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c
new file mode 100644
index 000000000000..6cf65397d225
--- /dev/null
+++ b/arch/x86/kernel/sys_ia32.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
+ * sys_sparc32
+ *
+ * Copyright (C) 2000 VA Linux Co
+ * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000 Hewlett-Packard Co.
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment. In 2.5 most of this should be moved to a generic directory.
+ *
+ * This file assumes that there is a hole at the end of user address space.
+ *
+ * Some of the functions are LE specific currently. These are
+ * hopefully all marked. This should be fixed.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/utsname.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/rwsem.h>
+#include <linux/compat.h>
+#include <linux/vfs.h>
+#include <linux/ptrace.h>
+#include <linux/highuid.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+#include <asm/mman.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+#include <asm/vgtod.h>
+#include <asm/ia32.h>
+
+#define AA(__x) ((unsigned long)(__x))
+
+SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename,
+ unsigned long, offset_low, unsigned long, offset_high)
+{
+ return ksys_truncate(filename,
+ ((loff_t) offset_high << 32) | offset_low);
+}
+
+SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd,
+ unsigned long, offset_low, unsigned long, offset_high)
+{
+ return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
+}
+
+/* warning: next two assume little endian */
+SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf,
+ u32, count, u32, poslo, u32, poshi)
+{
+ return ksys_pread64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf,
+ u32, count, u32, poslo, u32, poshi)
+{
+ return ksys_pwrite64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
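
Every wrapper in this file uses the same reassembly idiom: a 64-bit quantity arrives as two 32-bit halves (one register each in the ia32 ABI) and is stitched back together. Shown standalone:

#include <assert.h>
#include <stdint.h>

static int64_t join64(uint32_t lo, uint32_t hi)
{
	/* Widen the high half first so the shift doesn't overflow. */
	return ((int64_t)hi << 32) | lo;
}

int main(void)
{
	assert(join64(0x89abcdefu, 0x01234567u) == 0x0123456789abcdefLL);
	return 0;
}
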
+
+
+/*
+ * Some system calls need sign-extended arguments. This could be
+ * done by a generic wrapper.
+ */
+SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low,
+ __u32, offset_high, __u32, len_low, __u32, len_high,
+ int, advice)
+{
+ return ksys_fadvise64_64(fd,
+ (((u64)offset_high)<<32) | offset_low,
+ (((u64)len_high)<<32) | len_low,
+ advice);
+}
+
+SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo,
+ unsigned int, off_hi, size_t, count)
+{
+ return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
+}
+
+SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low,
+ unsigned int, off_hi, unsigned int, n_low,
+ unsigned int, n_hi, int, flags)
+{
+ return ksys_sync_file_range(fd,
+ ((u64)off_hi << 32) | off_low,
+ ((u64)n_hi << 32) | n_low, flags);
+}
+
+SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo,
+ unsigned int, offset_hi, size_t, len, int, advice)
+{
+ return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
+ len, advice);
+}
+
+SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode,
+ unsigned int, offset_lo, unsigned int, offset_hi,
+ unsigned int, len_lo, unsigned int, len_hi)
+{
+ return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
+ ((u64)len_hi << 32) | len_lo);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+/*
+ * Another set for IA32/LFS -- x86_64 struct stat is different due to
+ * support for 64bit inode numbers.
+ */
+static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
+{
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
+ if (!user_write_access_begin(ubuf, sizeof(struct stat64)))
+ return -EFAULT;
+ unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault);
+ unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault);
+ unsafe_put_user(stat->ino, &ubuf->st_ino, Efault);
+ unsafe_put_user(stat->mode, &ubuf->st_mode, Efault);
+ unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault);
+ unsafe_put_user(uid, &ubuf->st_uid, Efault);
+ unsafe_put_user(gid, &ubuf->st_gid, Efault);
+ unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault);
+ unsafe_put_user(stat->size, &ubuf->st_size, Efault);
+ unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault);
+ unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault);
+ unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault);
+ unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault);
+ unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault);
+ unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault);
+ unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault);
+ unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault);
+ user_access_end();
+ return 0;
+Efault:
+ user_write_access_end();
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_stat(filename, &stat);
+
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_lstat(filename, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_fstat(fd, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd,
+ const char __user *, filename,
+ struct stat64 __user *, statbuf, int, flag)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_fstatat(dfd, filename, &stat, flag);
+ if (error)
+ return error;
+ return cp_stat64(statbuf, &stat);
+}
+
+/*
+ * Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+struct mmap_arg_struct32 {
+ unsigned int addr;
+ unsigned int len;
+ unsigned int prot;
+ unsigned int flags;
+ unsigned int fd;
+ unsigned int offset;
+};
+
+COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg)
+{
+ struct mmap_arg_struct32 a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset>>PAGE_SHIFT);
+}
+
+/*
+ * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS
+ */
+COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags,
+ unsigned long, newsp, int __user *, parent_tidptr,
+ unsigned long, tls_val, int __user *, child_tidptr)
+{
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls_val,
+ };
+
+ return kernel_clone(&args);
+}
+#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index ff14a5044ce6..776ae6fa7f2d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -1,5 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -14,29 +17,82 @@
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/uaccess.h>
+#include <linux/elf.h>
+#include <linux/hugetlb.h>
+#include <asm/elf.h>
#include <asm/ia32.h>
-#include <asm/syscalls.h>
+
+/*
+ * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
+ */
+static unsigned long get_align_mask(struct file *filp)
+{
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
+ /* handle 32- and 64-bit case with a single conditional */
+ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
+ return 0;
+
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+
+ return va_align.mask;
+}
+
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+ return va_align.bits & get_align_mask(NULL);
+}
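
The interplay of mask and bits may be easier to see with concrete numbers: the search aligns the address to the mask, then the per-boot random bits are folded in, so the chosen slot is constant across one boot but differs between boots. A toy model (all values invented; the real mask covers the I$-aliasing bits on AMD F15h):

#include <stdio.h>

int main(void)
{
	unsigned long mask = 0x7fff;		/* align to 32K, say */
	unsigned long bits = 0x3a000 & mask;	/* per-boot random, fixed */
	unsigned long base = 0x7f0012345678UL;	/* candidate from the search */

	/* Align down, then apply the boot-constant offset. */
	unsigned long addr = (base & ~mask) | bits;

	printf("%#lx\n", addr);
	return 0;
}
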
+
+static int __init control_va_addr_alignment(char *str)
+{
+ /* guard against enabling this on other CPU families */
+ if (va_align.flags < 0)
+ return 1;
+
+ if (*str == 0)
+ return 1;
+
+ if (!strcmp(str, "32"))
+ va_align.flags = ALIGN_VA_32;
+ else if (!strcmp(str, "64"))
+ va_align.flags = ALIGN_VA_64;
+ else if (!strcmp(str, "off"))
+ va_align.flags = 0;
+ else if (!strcmp(str, "on"))
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+ else
+ pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
+
+ return 1;
+}
+__setup("align_va_addr=", control_va_addr_alignment);
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
- long error;
- error = -EINVAL;
if (off & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
- error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-out:
- return error;
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
-static void find_start_end(unsigned long flags, unsigned long *begin,
- unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+ unsigned long *begin, unsigned long *end)
{
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
- unsigned long new_begin;
+ if (!in_32bit_syscall() && (flags & MAP_32BIT)) {
/* This is usually needed to map code in the small
model, so it needs to be in the first 31 bits. Limit
it to that. This means we need to move the
@@ -47,29 +103,39 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
*begin = 0x40000000;
*end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
- new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
- if (new_begin)
- *begin = new_begin;
+ *begin = randomize_page(*begin, 0x02000000);
}
- } else {
- *begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ return;
}
+
+ *begin = get_mmap_base(1);
+ if (in_32bit_syscall())
+ *end = task_size_32bit();
+ else
+ *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+}
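
From userspace the window computed above is visible directly: a MAP_32BIT mapping lands below 2 GiB (between 0x40000000 and 0x80000000 on x86-64, modulo the randomized start). A quick probe:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

	if (p != MAP_FAILED)
		printf("%p\n", p);	/* expected in [0x40000000, 0x80000000) */
	return 0;
}
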
+
+static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
+{
+ if (vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
}
unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info = {};
unsigned long begin, end;
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(addr, flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -78,118 +144,90 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
- if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
- && len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = begin;
- }
- addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
- start_addr = addr;
-
-full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (end - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != begin) {
- start_addr = addr = begin;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
+ info.length = len;
+ info.low_limit = begin;
+ info.high_limit = end;
+ if (!(filp && is_file_hugepages(filp))) {
+ info.align_offset = pgoff << PAGE_SHIFT;
+ info.start_gap = stack_guard_placement(vm_flags);
+ }
+ if (filp) {
+ info.align_mask = get_align_mask(filp);
+ info.align_offset += get_align_bits();
}
-}
+ return vm_unmapped_area(&info);
+}
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
+ /* No address checking. See comment at mmap_address_hint_valid() */
if (flags & MAP_FIXED)
return addr;
- /* for MAP_32BIT mappings we force the legact mmap base */
- if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
+ /* for MAP_32BIT mappings we force the legacy mmap base */
+ if (!in_32bit_syscall() && (flags & MAP_32BIT))
goto bottomup;
/* requesting a specific address */
if (addr) {
- addr = PAGE_ALIGN(addr);
+ addr &= PAGE_MASK;
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ if (!vma || addr + len <= vm_start_gap(vma))
return addr;
}
-
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
+get_unmapped_area:
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ if (!in_32bit_syscall() && (flags & MAP_ABOVE4G))
+ info.low_limit = SZ_4G;
+ else
+ info.low_limit = PAGE_SIZE;
+
+ info.high_limit = get_mmap_base(0);
+ if (!(filp && is_file_hugepages(filp))) {
+ info.start_gap = stack_guard_placement(vm_flags);
+ info.align_offset = pgoff << PAGE_SHIFT;
}
- /* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ *
+ * !in_32bit_syscall() check to avoid high addresses for x32
+ * (and make it a no-op on native i386).
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = addr-len;
+ if (filp) {
+ info.align_mask = get_align_mask(filp);
+ info.align_offset += get_align_bits();
}
-
- if (mm->mmap_base < len)
- goto bottomup;
-
- addr = mm->mmap_base-len;
-
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return mm->free_area_cache = addr;
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
+ addr = vm_unmapped_area(&info);
+ if (!(addr & ~PAGE_MASK))
+ return addr;
+ VM_BUG_ON(addr != -ENOMEM);
bottomup:
/*
@@ -198,14 +236,5 @@ bottomup:
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
-
- return addr;
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0);
}
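
The hint rule in the topdown path has a user-visible consequence on 5-level-paging machines: passing an address hint above the default 47-bit window opts that one mapping in to the full address space. A sketch of the opt-in (on 4-level kernels this simply maps somewhere below the window):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	void *hint = (void *)(1UL << 48);	/* above DEFAULT_MAP_WINDOW */
	void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p != MAP_FAILED)
		printf("mapped at %p\n", p);	/* may exceed 47 bits */
	return 0;
}
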
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
deleted file mode 100644
index de87d6008295..000000000000
--- a/arch/x86/kernel/syscall_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* System call table for x86-64. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __NO_STUBS
-
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include <asm/unistd_64.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_UNISTD_64_H
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void sys_ni_syscall(void);
-
-const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/unistd_64.h>
-};
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 8b3729341216..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,339 +0,0 @@
-ENTRY(sys_call_table)
- .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
- .long sys_exit
- .long ptregs_fork
- .long sys_read
- .long sys_write
- .long sys_open /* 5 */
- .long sys_close
- .long sys_waitpid
- .long sys_creat
- .long sys_link
- .long sys_unlink /* 10 */
- .long ptregs_execve
- .long sys_chdir
- .long sys_time
- .long sys_mknod
- .long sys_chmod /* 15 */
- .long sys_lchown16
- .long sys_ni_syscall /* old break syscall holder */
- .long sys_stat
- .long sys_lseek
- .long sys_getpid /* 20 */
- .long sys_mount
- .long sys_oldumount
- .long sys_setuid16
- .long sys_getuid16
- .long sys_stime /* 25 */
- .long sys_ptrace
- .long sys_alarm
- .long sys_fstat
- .long sys_pause
- .long sys_utime /* 30 */
- .long sys_ni_syscall /* old stty syscall holder */
- .long sys_ni_syscall /* old gtty syscall holder */
- .long sys_access
- .long sys_nice
- .long sys_ni_syscall /* 35 - old ftime syscall holder */
- .long sys_sync
- .long sys_kill
- .long sys_rename
- .long sys_mkdir
- .long sys_rmdir /* 40 */
- .long sys_dup
- .long sys_pipe
- .long sys_times
- .long sys_ni_syscall /* old prof syscall holder */
- .long sys_brk /* 45 */
- .long sys_setgid16
- .long sys_getgid16
- .long sys_signal
- .long sys_geteuid16
- .long sys_getegid16 /* 50 */
- .long sys_acct
- .long sys_umount /* recycled never used phys() */
- .long sys_ni_syscall /* old lock syscall holder */
- .long sys_ioctl
- .long sys_fcntl /* 55 */
- .long sys_ni_syscall /* old mpx syscall holder */
- .long sys_setpgid
- .long sys_ni_syscall /* old ulimit syscall holder */
- .long sys_olduname
- .long sys_umask /* 60 */
- .long sys_chroot
- .long sys_ustat
- .long sys_dup2
- .long sys_getppid
- .long sys_getpgrp /* 65 */
- .long sys_setsid
- .long sys_sigaction
- .long sys_sgetmask
- .long sys_ssetmask
- .long sys_setreuid16 /* 70 */
- .long sys_setregid16
- .long sys_sigsuspend
- .long sys_sigpending
- .long sys_sethostname
- .long sys_setrlimit /* 75 */
- .long sys_old_getrlimit
- .long sys_getrusage
- .long sys_gettimeofday
- .long sys_settimeofday
- .long sys_getgroups16 /* 80 */
- .long sys_setgroups16
- .long sys_old_select
- .long sys_symlink
- .long sys_lstat
- .long sys_readlink /* 85 */
- .long sys_uselib
- .long sys_swapon
- .long sys_reboot
- .long sys_old_readdir
- .long sys_old_mmap /* 90 */
- .long sys_munmap
- .long sys_truncate
- .long sys_ftruncate
- .long sys_fchmod
- .long sys_fchown16 /* 95 */
- .long sys_getpriority
- .long sys_setpriority
- .long sys_ni_syscall /* old profil syscall holder */
- .long sys_statfs
- .long sys_fstatfs /* 100 */
- .long sys_ioperm
- .long sys_socketcall
- .long sys_syslog
- .long sys_setitimer
- .long sys_getitimer /* 105 */
- .long sys_newstat
- .long sys_newlstat
- .long sys_newfstat
- .long sys_uname
- .long ptregs_iopl /* 110 */
- .long sys_vhangup
- .long sys_ni_syscall /* old "idle" system call */
- .long ptregs_vm86old
- .long sys_wait4
- .long sys_swapoff /* 115 */
- .long sys_sysinfo
- .long sys_ipc
- .long sys_fsync
- .long ptregs_sigreturn
- .long ptregs_clone /* 120 */
- .long sys_setdomainname
- .long sys_newuname
- .long sys_modify_ldt
- .long sys_adjtimex
- .long sys_mprotect /* 125 */
- .long sys_sigprocmask
- .long sys_ni_syscall /* old "create_module" */
- .long sys_init_module
- .long sys_delete_module
- .long sys_ni_syscall /* 130: old "get_kernel_syms" */
- .long sys_quotactl
- .long sys_getpgid
- .long sys_fchdir
- .long sys_bdflush
- .long sys_sysfs /* 135 */
- .long sys_personality
- .long sys_ni_syscall /* reserved for afs_syscall */
- .long sys_setfsuid16
- .long sys_setfsgid16
- .long sys_llseek /* 140 */
- .long sys_getdents
- .long sys_select
- .long sys_flock
- .long sys_msync
- .long sys_readv /* 145 */
- .long sys_writev
- .long sys_getsid
- .long sys_fdatasync
- .long sys_sysctl
- .long sys_mlock /* 150 */
- .long sys_munlock
- .long sys_mlockall
- .long sys_munlockall
- .long sys_sched_setparam
- .long sys_sched_getparam /* 155 */
- .long sys_sched_setscheduler
- .long sys_sched_getscheduler
- .long sys_sched_yield
- .long sys_sched_get_priority_max
- .long sys_sched_get_priority_min /* 160 */
- .long sys_sched_rr_get_interval
- .long sys_nanosleep
- .long sys_mremap
- .long sys_setresuid16
- .long sys_getresuid16 /* 165 */
- .long ptregs_vm86
- .long sys_ni_syscall /* Old sys_query_module */
- .long sys_poll
- .long sys_nfsservctl
- .long sys_setresgid16 /* 170 */
- .long sys_getresgid16
- .long sys_prctl
- .long ptregs_rt_sigreturn
- .long sys_rt_sigaction
- .long sys_rt_sigprocmask /* 175 */
- .long sys_rt_sigpending
- .long sys_rt_sigtimedwait
- .long sys_rt_sigqueueinfo
- .long sys_rt_sigsuspend
- .long sys_pread64 /* 180 */
- .long sys_pwrite64
- .long sys_chown16
- .long sys_getcwd
- .long sys_capget
- .long sys_capset /* 185 */
- .long ptregs_sigaltstack
- .long sys_sendfile
- .long sys_ni_syscall /* reserved for streams1 */
- .long sys_ni_syscall /* reserved for streams2 */
- .long ptregs_vfork /* 190 */
- .long sys_getrlimit
- .long sys_mmap_pgoff
- .long sys_truncate64
- .long sys_ftruncate64
- .long sys_stat64 /* 195 */
- .long sys_lstat64
- .long sys_fstat64
- .long sys_lchown
- .long sys_getuid
- .long sys_getgid /* 200 */
- .long sys_geteuid
- .long sys_getegid
- .long sys_setreuid
- .long sys_setregid
- .long sys_getgroups /* 205 */
- .long sys_setgroups
- .long sys_fchown
- .long sys_setresuid
- .long sys_getresuid
- .long sys_setresgid /* 210 */
- .long sys_getresgid
- .long sys_chown
- .long sys_setuid
- .long sys_setgid
- .long sys_setfsuid /* 215 */
- .long sys_setfsgid
- .long sys_pivot_root
- .long sys_mincore
- .long sys_madvise
- .long sys_getdents64 /* 220 */
- .long sys_fcntl64
- .long sys_ni_syscall /* reserved for TUX */
- .long sys_ni_syscall
- .long sys_gettid
- .long sys_readahead /* 225 */
- .long sys_setxattr
- .long sys_lsetxattr
- .long sys_fsetxattr
- .long sys_getxattr
- .long sys_lgetxattr /* 230 */
- .long sys_fgetxattr
- .long sys_listxattr
- .long sys_llistxattr
- .long sys_flistxattr
- .long sys_removexattr /* 235 */
- .long sys_lremovexattr
- .long sys_fremovexattr
- .long sys_tkill
- .long sys_sendfile64
- .long sys_futex /* 240 */
- .long sys_sched_setaffinity
- .long sys_sched_getaffinity
- .long sys_set_thread_area
- .long sys_get_thread_area
- .long sys_io_setup /* 245 */
- .long sys_io_destroy
- .long sys_io_getevents
- .long sys_io_submit
- .long sys_io_cancel
- .long sys_fadvise64 /* 250 */
- .long sys_ni_syscall
- .long sys_exit_group
- .long sys_lookup_dcookie
- .long sys_epoll_create
- .long sys_epoll_ctl /* 255 */
- .long sys_epoll_wait
- .long sys_remap_file_pages
- .long sys_set_tid_address
- .long sys_timer_create
- .long sys_timer_settime /* 260 */
- .long sys_timer_gettime
- .long sys_timer_getoverrun
- .long sys_timer_delete
- .long sys_clock_settime
- .long sys_clock_gettime /* 265 */
- .long sys_clock_getres
- .long sys_clock_nanosleep
- .long sys_statfs64
- .long sys_fstatfs64
- .long sys_tgkill /* 270 */
- .long sys_utimes
- .long sys_fadvise64_64
- .long sys_ni_syscall /* sys_vserver */
- .long sys_mbind
- .long sys_get_mempolicy
- .long sys_set_mempolicy
- .long sys_mq_open
- .long sys_mq_unlink
- .long sys_mq_timedsend
- .long sys_mq_timedreceive /* 280 */
- .long sys_mq_notify
- .long sys_mq_getsetattr
- .long sys_kexec_load
- .long sys_waitid
- .long sys_ni_syscall /* 285 */ /* available */
- .long sys_add_key
- .long sys_request_key
- .long sys_keyctl
- .long sys_ioprio_set
- .long sys_ioprio_get /* 290 */
- .long sys_inotify_init
- .long sys_inotify_add_watch
- .long sys_inotify_rm_watch
- .long sys_migrate_pages
- .long sys_openat /* 295 */
- .long sys_mkdirat
- .long sys_mknodat
- .long sys_fchownat
- .long sys_futimesat
- .long sys_fstatat64 /* 300 */
- .long sys_unlinkat
- .long sys_renameat
- .long sys_linkat
- .long sys_symlinkat
- .long sys_readlinkat /* 305 */
- .long sys_fchmodat
- .long sys_faccessat
- .long sys_pselect6
- .long sys_ppoll
- .long sys_unshare /* 310 */
- .long sys_set_robust_list
- .long sys_get_robust_list
- .long sys_splice
- .long sys_sync_file_range
- .long sys_tee /* 315 */
- .long sys_vmsplice
- .long sys_move_pages
- .long sys_getcpu
- .long sys_epoll_pwait
- .long sys_utimensat /* 320 */
- .long sys_signalfd
- .long sys_timerfd_create
- .long sys_eventfd
- .long sys_fallocate
- .long sys_timerfd_settime /* 325 */
- .long sys_timerfd_gettime
- .long sys_signalfd4
- .long sys_eventfd2
- .long sys_epoll_create1
- .long sys_dup3 /* 330 */
- .long sys_pipe2
- .long sys_inotify_init1
- .long sys_preadv
- .long sys_pwritev
- .long sys_rt_tgsigqueueinfo /* 335 */
- .long sys_perf_event_open
- .long sys_recvmmsg
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..46b8f1f16676 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -1,27 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* tboot.c: main implementation of helper functions used by kernel for
* runtime support of Intel(R) Trusted Execution Technology
*
* Copyright (c) 2006-2009, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
*/
-#include <linux/dma_remapping.h>
#include <linux/init_task.h>
#include <linux/spinlock.h>
+#include <linux/export.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -30,23 +17,22 @@
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/tboot.h>
+#include <linux/debugfs.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/processor.h>
#include <asm/bootparam.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/io.h>
-#include "acpi/realmode/wakeup.h"
+#include "../realmode/rm/wakeup.h"
/* Global pointer to shared data; NULL means no measured launch. */
-struct tboot *tboot __read_mostly;
-EXPORT_SYMBOL(tboot);
+static struct tboot *tboot __read_mostly;
/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
#define AP_WAIT_TIMEOUT 1
@@ -56,6 +42,35 @@ EXPORT_SYMBOL(tboot);
static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
+bool tboot_enabled(void)
+{
+ return tboot != NULL;
+}
+
+/* noinline to prevent gcc from warning about dereferencing constant fixaddr */
+static noinline __init bool check_tboot_version(void)
+{
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ return false;
+ }
+
+ if (tboot->version < 5) {
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
+ return false;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+
+ return true;
+}
+
void __init tboot_probe(void)
{
/* Look for valid page-aligned address for shared page. */
@@ -65,52 +80,29 @@ void __init tboot_probe(void)
* also verify that it is mapped as we expect it before calling
* set_fixmap(), to reduce chance of garbage value causing crash
*/
- if (!e820_any_mapped(boot_params.tboot_addr,
- boot_params.tboot_addr, E820_RESERVED)) {
- pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
- return;
- }
-
- /* only a natively booted kernel should be using TXT */
- if (paravirt_enabled()) {
- pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
+ if (!e820__mapped_any(boot_params.tboot_addr,
+ boot_params.tboot_addr, E820_TYPE_RESERVED)) {
+ pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
return;
}
/* Map and check for tboot UUID. */
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
- tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
- if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
- pr_warning("tboot at 0x%llx is invalid\n",
- boot_params.tboot_addr);
+ tboot = (void *)fix_to_virt(FIX_TBOOT_BASE);
+ if (!check_tboot_version())
tboot = NULL;
- return;
- }
- if (tboot->version < 5) {
- pr_warning("tboot version is invalid: %u\n", tboot->version);
- tboot = NULL;
- return;
- }
-
- pr_info("found shared page at phys addr 0x%llx:\n",
- boot_params.tboot_addr);
- pr_debug("version: %d\n", tboot->version);
- pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
- pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
- pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
- pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
}
static pgd_t *tboot_pg_dir;
static struct mm_struct tboot_mm = {
- .mm_rb = RB_ROOT,
+ .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
- .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq),
+ MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
- .cpu_vm_mask = CPU_MASK_ALL,
};
static inline void switch_to_tboot_pt(void)
@@ -122,12 +114,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pgprot_t prot)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = pgd_offset(&tboot_mm, vaddr);
- pud = pud_alloc(&tboot_mm, pgd, vaddr);
+ p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
+ if (!p4d)
+ return -1;
+ pud = pud_alloc(&tboot_mm, p4d, vaddr);
if (!pud)
return -1;
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
@@ -138,6 +134,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
pte_unmap(pte);
+
+ /*
+ * PTI poisons low addresses in the kernel page tables in the
+ * name of making them unusable for userspace. To execute
+ * code at such a low address, the poison must be cleared.
+ *
+ * Note: 'pgd' actually gets set in p4d_alloc() _or_
+ * pud_alloc() depending on 4/5-level paging.
+ */
+ pgd->pgd &= ~_PAGE_NX;
+
return 0;
}
@@ -192,15 +199,15 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
- for (i = 0; i < e820.nr_map; i++) {
- if ((e820.map[i].type != E820_RAM)
- && (e820.map[i].type != E820_RESERVED_KERN))
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ if (e820_table->entries[i].type != E820_TYPE_RAM)
continue;
- add_mac_region(e820.map[i].addr, e820.map[i].size);
+ add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
}
- tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+ tboot->acpi_sinfo.kernel_s3_resume_vector =
+ real_mode_header->wakeup_start;
return 0;
}
@@ -271,7 +278,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
offsetof(struct acpi_table_facs, firmware_waking_vector);
}
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
{
static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
/* S0,1,2: */ -1, -1, -1,
@@ -280,7 +287,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
/* S5: */ TB_SHUTDOWN_S5 };
if (!tboot_enabled())
- return;
+ return 0;
tboot_copy_fadt(&acpi_gbl_FADT);
tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -290,11 +297,21 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
- pr_warning("unsupported sleep state 0x%x\n", sleep_state);
- return;
+ pr_warn("unsupported sleep state 0x%x\n", sleep_state);
+ return -1;
}
tboot_shutdown(acpi_shutdown_map[sleep_state]);
+ return 0;
+}
+
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ return -ENODEV;
}
static atomic_t ap_wfs_count;
@@ -311,30 +328,88 @@ static int tboot_wait_for_aps(int num_aps)
}
if (timeout)
- pr_warning("tboot wait for APs timeout\n");
+ pr_warn("tboot wait for APs timeout\n");
return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
}
-static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int tboot_dying_cpu(unsigned int cpu)
{
- switch (action) {
- case CPU_DYING:
- atomic_inc(&ap_wfs_count);
- if (num_online_cpus() == 1)
- if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
- return NOTIFY_BAD;
- break;
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1) {
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return -EBUSY;
}
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block tboot_cpu_notifier __cpuinitdata =
+#ifdef CONFIG_DEBUG_FS
+
+#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
+ 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 }
+
+#define TBOOT_SERIAL_LOG_ADDR 0x60000
+#define TBOOT_SERIAL_LOG_SIZE 0x08000
+#define LOG_MAX_SIZE_OFF 16
+#define LOG_BUF_OFF 24
+
+static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID;
+
+static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
{
- .notifier_call = tboot_cpu_callback,
+ void __iomem *log_base;
+ u8 log_uuid[16];
+ u32 max_size;
+ void *kbuf;
+ int ret = -EFAULT;
+
+ log_base = ioremap(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ if (!log_base)
+ return ret;
+
+ memcpy_fromio(log_uuid, log_base, sizeof(log_uuid));
+ if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid)))
+ goto err_iounmap;
+
+ max_size = readl(log_base + LOG_MAX_SIZE_OFF);
+ if (*ppos >= max_size) {
+ ret = 0;
+ goto err_iounmap;
+ }
+
+ if (*ppos + count > max_size)
+ count = max_size - *ppos;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf) {
+ ret = -ENOMEM;
+ goto err_iounmap;
+ }
+
+ memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count);
+ if (copy_to_user(user_buf, kbuf, count))
+ goto err_kfree;
+
+ *ppos += count;
+
+ ret = count;
+
+err_kfree:
+ kfree(kbuf);
+
+err_iounmap:
+ iounmap(log_base);
+
+ return ret;
+}
+
+static const struct file_operations tboot_log_fops = {
+ .read = tboot_log_read,
+ .llseek = default_llseek,
};
+#endif /* CONFIG_DEBUG_FS */
+
static __init int tboot_late_init(void)
{
if (!tboot_enabled())
@@ -343,7 +418,15 @@ static __init int tboot_late_init(void)
tboot_create_trampoline();
atomic_set(&ap_wfs_count, 0);
- register_hotcpu_notifier(&tboot_cpu_notifier);
+ cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL,
+ tboot_dying_cpu);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_create_file("tboot_log", S_IRUSR,
+ arch_debugfs_dir, NULL, &tboot_log_fops);
+#endif
+
+ acpi_os_set_prepare_sleep(&tboot_sleep);
+ acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
return 0;
}
@@ -431,20 +514,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb
return dmar_tbl;
}
-
-int tboot_force_iommu(void)
-{
- if (!tboot_enabled())
- return 0;
-
- if (no_iommu || swiotlb || dmar_disabled)
- pr_warning("Forcing Intel-IOMMU to enabled\n");
-
- dmar_disabled = 0;
-#ifdef CONFIG_SWIOTLB
- swiotlb = 0;
-#endif
- no_iommu = 0;
-
- return 1;
-}
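
The tboot.c hunks above swap the old __cpuinit notifier_block for a callback
registered on the CPU hotplug state machine, and hang the debugfs log reader
plus the ACPI sleep hooks off tboot_late_init(). A minimal sketch of the
registration pattern follows; demo_dying_cpu and "demo:dying" are hypothetical
placeholders, and a dynamic state is used here rather than the fixed
CPUHP_AP_X86_TBOOT_DYING slot (a fixed slot must be declared in enum
cpuhp_state and, for this driver, runs in the CPU-dying phase):

#include <linux/init.h>
#include <linux/cpuhotplug.h>

/* Teardown callback: invoked for each CPU as it goes offline.
 * Returning a negative errno aborts the offline operation. */
static int demo_dying_cpu(unsigned int cpu)
{
        return 0;
}

static int __init demo_init(void)
{
        int ret;

        /* CPUHP_AP_ONLINE_DYN allocates a state slot dynamically and
         * returns its (positive) number on success; startup is NULL,
         * so only the teardown callback is installed */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:dying",
                                NULL, demo_dying_cpu);
        return ret < 0 ? ret : 0;
}
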
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
deleted file mode 100644
index 9e540fee7009..000000000000
--- a/arch/x86/kernel/tce_64.c
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * This file manages the translation entries for the IBM Calgary IOMMU.
- *
- * Derived from arch/powerpc/platforms/pseries/iommu.c
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Jon Mason <jdmason@us.ibm.com>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/bootmem.h>
-#include <asm/tce.h>
-#include <asm/calgary.h>
-#include <asm/proto.h>
-
-/* flush a tce at 'tceaddr' to main memory */
-static inline void flush_tce(void* tceaddr)
-{
- /* a single tce can't cross a cache line */
- if (cpu_has_clflush)
- clflush(tceaddr);
- else
- wbinvd();
-}
-
-void tce_build(struct iommu_table *tbl, unsigned long index,
- unsigned int npages, unsigned long uaddr, int direction)
-{
- u64* tp;
- u64 t;
- u64 rpn;
-
- t = (1 << TCE_READ_SHIFT);
- if (direction != DMA_TO_DEVICE)
- t |= (1 << TCE_WRITE_SHIFT);
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
- t &= ~TCE_RPN_MASK;
- t |= (rpn << TCE_RPN_SHIFT);
-
- *tp = cpu_to_be64(t);
- flush_tce(tp);
-
- uaddr += PAGE_SIZE;
- tp++;
- }
-}
-
-void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
-{
- u64* tp;
-
- tp = ((u64*)tbl->it_base) + index;
-
- while (npages--) {
- *tp = cpu_to_be64(0);
- flush_tce(tp);
- tp++;
- }
-}
-
-static inline unsigned int table_size_to_number_of_entries(unsigned char size)
-{
- /*
- * size is the order of the table, 0-7
- * smallest table is 8K entries, so shift result by 13 to
- * multiply by 8K
- */
- return (1 << size) << 13;
-}
-
-static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
-{
- unsigned int bitmapsz;
- unsigned long bmppages;
- int ret;
-
- tbl->it_busno = dev->bus->number;
-
- /* set the tce table size - measured in entries */
- tbl->it_size = table_size_to_number_of_entries(specified_table_size);
-
- /*
- * number of bytes needed for the bitmap size in number of
- * entries; we need one bit per entry
- */
- bitmapsz = tbl->it_size / BITS_PER_BYTE;
- bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
- if (!bmppages) {
- printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
- ret = -ENOMEM;
- goto done;
- }
-
- tbl->it_map = (unsigned long*)bmppages;
-
- memset(tbl->it_map, 0, bitmapsz);
-
- tbl->it_hint = 0;
-
- spin_lock_init(&tbl->it_lock);
-
- return 0;
-
-done:
- return ret;
-}
-
-int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
-{
- struct iommu_table *tbl;
- int ret;
-
- if (pci_iommu(dev->bus)) {
- printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
- dev, pci_iommu(dev->bus));
- BUG();
- }
-
- tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
- if (!tbl) {
- printk(KERN_ERR "Calgary: error allocating iommu_table\n");
- ret = -ENOMEM;
- goto done;
- }
-
- ret = tce_table_setparms(dev, tbl);
- if (ret)
- goto free_tbl;
-
- tbl->bbar = bbar;
-
- set_pci_iommu(dev->bus, tbl);
-
- return 0;
-
-free_tbl:
- kfree(tbl);
-done:
- return ret;
-}
-
-void * __init alloc_tce_table(void)
-{
- unsigned int size;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- return __alloc_bootmem_low(size, size, 0);
-}
-
-void __init free_tce_table(void *tbl)
-{
- unsigned int size;
-
- if (!tbl)
- return;
-
- size = table_size_to_number_of_entries(specified_table_size);
- size *= TCE_ENTRY_SIZE;
-
- free_bootmem(__pa(tbl), size);
-}
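
The removed Calgary code sizes its TCE tables by an order value, as described
in the table_size_to_number_of_entries() comment above. A worked instance of
that arithmetic, with a hypothetical helper name, assuming the same 8K-entry
minimum:

/* order 0: (1 << 0) << 13 = 8192 entries (the 8K minimum)
 * order 7: (1 << 7) << 13 = 1048576 entries
 * bitmap bytes = entries / BITS_PER_BYTE (one bit per entry)
 * table bytes  = entries * TCE_ENTRY_SIZE */
static inline unsigned int entries_for_order(unsigned char order)
{
        return (1u << order) << 13;
}
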
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
deleted file mode 100644
index 787a5e499dd1..000000000000
--- a/arch/x86/kernel/test_nx.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * test_nx.c: functional test for NX functionality
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/asm.h>
-
-extern int rodata_test_data;
-
-/*
- * This file checks 4 things:
- * 1) Check if the stack is not executable
- * 2) Check if kmalloc memory is not executable
- * 3) Check if the .rodata section is not executable
- * 4) Check if the .data section of a module is not executable
- *
- * To do this, the test code tries to execute memory in stack/kmalloc/etc,
- * and then checks if the expected trap happens.
- *
- * Sadly, this implies having a dynamic exception handling table entry.
- * ... which can be done (and will make Rusty cry)... but it can only
- * be done in a stand-alone module with only 1 entry total.
- * (otherwise we'd have to sort and that's just too messy)
- */
-
-
-
-/*
- * We want to set up an exception handling point on our stack,
- * which means a variable value. This function is rather dirty
- * and walks the exception table of the module, looking for a magic
- * marker and replaces it with a specific function.
- */
-static void fudze_exception_table(void *marker, void *new)
-{
- struct module *mod = THIS_MODULE;
- struct exception_table_entry *extable;
-
- /*
- * Note: This module has only 1 exception table entry,
- * so searching and sorting is not needed. If that changes,
- * this would be the place to search and re-sort the exception
- * table.
- */
- if (mod->num_exentries > 1) {
- printk(KERN_ERR "test_nx: too many exception table entries!\n");
- printk(KERN_ERR "test_nx: test results are not reliable.\n");
- return;
- }
- extable = (struct exception_table_entry *)mod->extable;
- extable[0].insn = (unsigned long)new;
-}
-
-
-/*
- * exception tables get their symbols translated so we need
- * to use a fake function to put in there, which we can then
- * replace at runtime.
- */
-void foo_label(void);
-
-/*
- * returns 0 for not-executable, negative for executable
- *
- * Note: we cannot allow this function to be inlined, because
- * that would give us more than 1 exception table entry.
- * This in turn would break the assumptions above.
- */
-static noinline int test_address(void *address)
-{
- unsigned long result;
-
- /* Set up an exception table entry for our address */
- fudze_exception_table(&foo_label, address);
- result = 1;
- asm volatile(
- "foo_label:\n"
- "0: call *%[fake_code]\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: mov %[zero], %[rslt]\n"
- " ret\n"
- ".previous\n"
- _ASM_EXTABLE(0b,2b)
- : [rslt] "=r" (result)
- : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
- );
- /* change the exception table back for the next round */
- fudze_exception_table(address, &foo_label);
-
- if (result)
- return -ENODEV;
- return 0;
-}
-
-static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
-
-static int test_NX(void)
-{
- int ret = 0;
- /* 0xC3 is the opcode for "ret" */
- char stackcode[] = {0xC3, 0x90, 0 };
- char *heap;
-
- test_data = 0xC3;
-
- printk(KERN_INFO "Testing NX protection\n");
-
- /* Test 1: check if the stack is not executable */
- if (test_address(&stackcode)) {
- printk(KERN_ERR "test_nx: stack was executable\n");
- ret = -ENODEV;
- }
-
-
- /* Test 2: Check if the heap is executable */
- heap = kmalloc(64, GFP_KERNEL);
- if (!heap)
- return -ENOMEM;
- heap[0] = 0xC3; /* opcode for "ret" */
-
- if (test_address(heap)) {
- printk(KERN_ERR "test_nx: heap was executable\n");
- ret = -ENODEV;
- }
- kfree(heap);
-
- /*
- * The following 2 tests currently fail, this needs to get fixed
- * Until then, don't run them to avoid too many people getting scared
- * by the error message
- */
-
-#ifdef CONFIG_DEBUG_RODATA
- /* Test 3: Check if the .rodata section is executable */
- if (rodata_test_data != 0xC3) {
- printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
- ret = -ENODEV;
- } else if (test_address(&rodata_test_data)) {
- printk(KERN_ERR "test_nx: .rodata section is executable\n");
- ret = -ENODEV;
- }
-#endif
-
-#if 0
- /* Test 4: Check if the .data section of a module is executable */
- if (test_address(&test_data)) {
- printk(KERN_ERR "test_nx: .data section is executable\n");
- ret = -ENODEV;
- }
-
-#endif
- return 0;
-}
-
-static void test_exit(void)
-{
-}
-
-module_init(test_NX);
-module_exit(test_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the NX infrastructure");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
deleted file mode 100644
index c29e235792af..000000000000
--- a/arch/x86/kernel/test_rodata.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * test_rodata.c: functional test for mark_rodata_ro function
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/module.h>
-#include <asm/cacheflush.h>
-#include <asm/sections.h>
-
-int rodata_test(void)
-{
- unsigned long result;
- unsigned long start, end;
-
- /* test 1: read the value */
- /* If this test fails, some previous testrun has clobbered the state */
- if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
- return -ENODEV;
- }
-
- /* test 2: write to the variable; this should fault */
- /*
- * If this test fails, we managed to overwrite the data
- *
- * This is written in assembly to be able to catch the
- * exception that is supposed to happen in the correct
- * case
- */
-
- result = 1;
- asm volatile(
- "0: mov %[zero],(%[rodata_test])\n"
- " mov %[zero], %[rslt]\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: jmp 1b\n"
- ".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 16\n"
-#ifdef CONFIG_X86_32
- " .long 0b,2b\n"
-#else
- " .quad 0b,2b\n"
-#endif
- ".previous"
- : [rslt] "=r" (result)
- : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
- );
-
-
- if (!result) {
- printk(KERN_ERR "rodata_test: test data was not read only\n");
- return -ENODEV;
- }
-
- /* test 3: check the value hasn't changed */
- /* If this test fails, we managed to overwrite the data */
- if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
- return -ENODEV;
- }
- /* test 4: check if the rodata section is 4Kb aligned */
- start = (unsigned long)__start_rodata;
- end = (unsigned long)__end_rodata;
- if (start & (PAGE_SIZE - 1)) {
- printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
- return -ENODEV;
- }
- if (end & (PAGE_SIZE - 1)) {
- printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
- return -ENODEV;
- }
-
- return 0;
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..52e1f3f0b361 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 1991,1992,1995 Linus Torvalds
* Copyright (c) 1994 Alan Modra
@@ -9,49 +10,24 @@
*
*/
+#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/i8253.h>
#include <linux/time.h>
-#include <linux/mca.h>
+#include <linux/export.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/i8259.h>
-#include <asm/i8253.h>
#include <asm/timer.h>
#include <asm/hpet.h>
#include <asm/time.h>
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
-#ifdef CONFIG_X86_64
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-#endif
-
unsigned long profile_pc(struct pt_regs *regs)
{
- unsigned long pc = instruction_pointer(regs);
-
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp =
- (unsigned long *)kernel_stack_pointer(regs);
- /*
- * Return address is either directly at stack pointer
- * or above a saved flags. Eflags has bits 22-31 zero,
- * kernel addresses don't.
- */
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
- return pc;
+ return instruction_pointer(regs);
}
EXPORT_SYMBOL(profile_pc);
@@ -60,55 +36,54 @@ EXPORT_SYMBOL(profile_pc);
*/
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
- /* Keep nmi watchdog up to date */
- inc_irq_stat(irq0_irqs);
-
- /* Optimized out for !IO_APIC and x86_64 */
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- raw_spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- raw_spin_unlock(&i8259A_lock);
- }
-
global_clock_event->event_handler(global_clock_event);
-
- /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
- if (MCA_bus)
- outb_p(inb_p(0x61)| 0x80, 0x61);
-
return IRQ_HANDLED;
}
-static struct irqaction irq0 = {
- .handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
- .name = "timer"
-};
-
-void __init setup_default_timer_irq(void)
+static void __init setup_default_timer_irq(void)
{
- setup_irq(0, &irq0);
+ unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER;
+
+ /*
+ * Unconditionally register the legacy timer interrupt; even
+ * without legacy PIC/PIT we need this for the HPET0 in legacy
+ * replacement mode.
+ */
+ if (request_irq(0, timer_interrupt, flags, "timer", NULL))
+ pr_info("Failed to register legacy timer interrupt\n");
}
/* Default timer init function */
void __init hpet_time_init(void)
{
- if (!hpet_enable())
- setup_pit_timer();
+ if (!hpet_enable()) {
+ if (!pit_timer_init())
+ return;
+ }
+
setup_default_timer_irq();
}
static __init void x86_late_time_init(void)
{
+ /*
+ * Before PIT/HPET init, select the interrupt mode. This is required
+ * to make the decision whether PIT should be initialized correct.
+ */
+ x86_init.irqs.intr_mode_select();
+
+ /* Setup the legacy timers */
x86_init.timers.timer_init();
+
+ /*
+ * After PIT/HPET timers init, set up the final interrupt mode for
+ * delivering IRQs.
+ */
+ x86_init.irqs.intr_mode_init();
tsc_init();
+
+ if (static_cpu_has(X86_FEATURE_WAITPKG))
+ use_tpause_delay();
}
/*
@@ -119,3 +94,18 @@ void __init time_init(void)
{
late_time_init = x86_late_time_init;
}
+
+/*
+ * Sanity check the vdso related archdata content.
+ */
+void clocksource_arch_init(struct clocksource *cs)
+{
+ if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE)
+ return;
+
+ if (cs->mask != CLOCKSOURCE_MASK(64)) {
+ pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. Disabling VDSO support.\n",
+ cs->name, cs->mask);
+ cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
+ }
+}
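
The time.c hunk above drops the static struct irqaction and setup_irq() in
favor of request_irq(), which allocates and registers the irqaction itself. A
minimal sketch of the same conversion; demo_timer_isr and "demo-timer" are
placeholders, not names from this patch:

#include <linux/init.h>
#include <linux/interrupt.h>

static irqreturn_t demo_timer_isr(int irq, void *dev_id)
{
        /* per-tick work goes here */
        return IRQ_HANDLED;
}

static int __init demo_timer_setup(void)
{
        unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER;

        /* no static struct irqaction needed; dev_id may be NULL
         * because the line is not shared (no IRQF_SHARED) */
        return request_irq(0, demo_timer_isr, flags, "demo-timer", NULL);
}
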
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
deleted file mode 100644
index 7fea555929e2..000000000000
--- a/arch/x86/kernel/tlb_uv.c
+++ /dev/null
@@ -1,1397 +0,0 @@
-/*
- * SGI UltraViolet TLB flush routines.
- *
- * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-
-#include <asm/mmu_context.h>
-#include <asm/uv/uv.h>
-#include <asm/uv/uv_mmrs.h>
-#include <asm/uv/uv_hub.h>
-#include <asm/uv/uv_bau.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/tsc.h>
-#include <asm/irq_vectors.h>
-#include <asm/timer.h>
-
-struct msg_desc {
- struct bau_payload_queue_entry *msg;
- int msg_slot;
- int sw_ack_slot;
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
-};
-
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-
-static int uv_bau_max_concurrent __read_mostly;
-
-static int nobau;
-static int __init setup_nobau(char *arg)
-{
- nobau = 1;
- return 0;
-}
-early_param("nobau", setup_nobau);
-
-/* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
-/* position of pnode (which is nasid>>1): */
-static int uv_nshift __read_mostly;
-static unsigned long uv_mmask __read_mostly;
-
-static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
-static DEFINE_PER_CPU(struct bau_control, bau_control);
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
-struct reset_args {
- int sender;
-};
-
-/*
- * Determine the first node on a uvhub. 'Nodes' are used for kernel
- * memory allocation.
- */
-static int __init uvhub_to_first_node(int uvhub)
-{
- int node, b;
-
- for_each_online_node(node) {
- b = uv_node_to_blade_id(node);
- if (uvhub == b)
- return node;
- }
- return -1;
-}
-
-/*
- * Determine the apicid of the first cpu on a uvhub.
- */
-static int __init uvhub_to_first_apicid(int uvhub)
-{
- int cpu;
-
- for_each_present_cpu(cpu)
- if (uvhub == uv_cpu_to_blade_id(cpu))
- return per_cpu(x86_cpu_to_apicid, cpu);
- return -1;
-}
-
-/*
- * Free a software acknowledge hardware resource by clearing its Pending
- * bit. This will return a reply to the sender.
- * If the message has timed out, a reply has already been sent by the
- * hardware but the resource has not been released. In that case our
- * clear of the Timeout bit (as well) will free the resource. No reply will
- * be sent (the hardware will only do one reply per message).
- */
-static inline void uv_reply_to_message(struct msg_desc *mdp,
- struct bau_control *bcp)
-{
- unsigned long dw;
- struct bau_payload_queue_entry *msg;
-
- msg = mdp->msg;
- if (!msg->canceled) {
- dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
- msg->sw_ack_vector;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
- }
- msg->replied_to = 1;
- msg->sw_ack_vector = 0;
-}
-
-/*
- * Process the receipt of a RETRY message
- */
-static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
- struct bau_control *bcp)
-{
- int i;
- int cancel_count = 0;
- int slot2;
- unsigned long msg_res;
- unsigned long mmr = 0;
- struct bau_payload_queue_entry *msg;
- struct bau_payload_queue_entry *msg2;
- struct ptc_stats *stat;
-
- msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
- stat->d_retries++;
- /*
- * cancel any message from msg+1 to the retry itself
- */
- for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
- if (msg2 > mdp->va_queue_last)
- msg2 = mdp->va_queue_first;
- if (msg2 == msg)
- break;
-
- /* same conditions for cancellation as uv_do_reset */
- if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
- (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
- msg->sw_ack_vector) == 0) &&
- (msg2->sending_cpu == msg->sending_cpu) &&
- (msg2->msg_type != MSG_NOOP)) {
- slot2 = msg2 - mdp->va_queue_first;
- mmr = uv_read_local_mmr
- (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg2->sw_ack_vector << 8) |
- msg2->sw_ack_vector);
- /*
- * This is a message retry; clear the resources held
- * by the previous message only if they timed out.
- * If it has not timed out we have an unexpected
- * situation to report.
- */
- if (mmr & (msg_res << 8)) {
- /*
- * is the resource timed out?
- * make everyone ignore the cancelled message.
- */
- msg2->canceled = 1;
- stat->d_canceled++;
- cancel_count++;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- (msg_res << 8) | msg_res);
- } else
- printk(KERN_INFO "note bau retry: no effect\n");
- }
- }
- if (!cancel_count)
- stat->d_nocanceled++;
-}
-
-/*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
- */
-static void uv_bau_process_message(struct msg_desc *mdp,
- struct bau_control *bcp)
-{
- int msg_ack_count;
- short socket_ack_count = 0;
- struct ptc_stats *stat;
- struct bau_payload_queue_entry *msg;
- struct bau_control *smaster = bcp->socket_master;
-
- /*
- * This must be a normal message, or retry of a normal message
- */
- msg = mdp->msg;
- stat = &per_cpu(ptcstats, bcp->cpu);
- if (msg->address == TLB_FLUSH_ALL) {
- local_flush_tlb();
- stat->d_alltlb++;
- } else {
- __flush_tlb_one(msg->address);
- stat->d_onetlb++;
- }
- stat->d_requestee++;
-
- /*
- * One cpu on each uvhub has the additional job on a RETRY
- * of releasing the resource held by the message that is
- * being retried. That message is identified by sending
- * cpu number.
- */
- if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
- uv_bau_process_retry_msg(mdp, bcp);
-
- /*
- * This is a sw_ack message, so we have to reply to it.
- * Count each responding cpu on the socket. This avoids
- * pinging the count's cache line back and forth between
- * the sockets.
- */
- socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
- &smaster->socket_acknowledge_count[mdp->msg_slot]);
- if (socket_ack_count == bcp->cpus_in_socket) {
- /*
- * Both sockets dump their completed count total into
- * the message's count.
- */
- smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
- msg_ack_count = atomic_add_short_return(socket_ack_count,
- (struct atomic_short *)&msg->acknowledge_count);
-
- if (msg_ack_count == bcp->cpus_in_uvhub) {
- /*
- * All cpus in uvhub saw it; reply
- */
- uv_reply_to_message(mdp, bcp);
- }
- }
-
- return;
-}
-
-/*
- * Determine the first cpu on a uvhub.
- */
-static int uvhub_to_first_cpu(int uvhub)
-{
- int cpu;
- for_each_present_cpu(cpu)
- if (uvhub == uv_cpu_to_blade_id(cpu))
- return cpu;
- return -1;
-}
-
-/*
- * Last resort when we get a large number of destination timeouts is
- * to clear resources held by a given cpu.
- * Do this with IPI so that all messages in the BAU message queue
- * can be identified by their nonzero sw_ack_vector field.
- *
- * This is entered for a single cpu on the uvhub.
- * The sender wants this uvhub to free a specific message's
- * sw_ack resources.
- */
-static void
-uv_do_reset(void *ptr)
-{
- int i;
- int slot;
- int count = 0;
- unsigned long mmr;
- unsigned long msg_res;
- struct bau_control *bcp;
- struct reset_args *rap;
- struct bau_payload_queue_entry *msg;
- struct ptc_stats *stat;
-
- bcp = &per_cpu(bau_control, smp_processor_id());
- rap = (struct reset_args *)ptr;
- stat = &per_cpu(ptcstats, bcp->cpu);
- stat->d_resets++;
-
- /*
- * We're looking for the given sender, and
- * will free its sw_ack resource.
- * If all cpu's finally responded after the timeout, its
- * message 'replied_to' was set.
- */
- for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
- /* uv_do_reset: same conditions for cancellation as
- uv_bau_process_retry_msg() */
- if ((msg->replied_to == 0) &&
- (msg->canceled == 0) &&
- (msg->sending_cpu == rap->sender) &&
- (msg->sw_ack_vector) &&
- (msg->msg_type != MSG_NOOP)) {
- /*
- * make everyone else ignore this message
- */
- msg->canceled = 1;
- slot = msg - bcp->va_queue_first;
- count++;
- /*
- * only reset the resource if it is still pending
- */
- mmr = uv_read_local_mmr
- (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
- msg_res = ((msg->sw_ack_vector << 8) |
- msg->sw_ack_vector);
- if (mmr & msg_res) {
- stat->d_rcanceled++;
- uv_write_local_mmr(
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
- msg_res);
- }
- }
- }
- return;
-}
-
-/*
- * Use IPI to get all target uvhubs to release resources held by
- * a given sending cpu number.
- */
-static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
- int sender)
-{
- int uvhub;
- int cpu;
- cpumask_t mask;
- struct reset_args reset_args;
-
- reset_args.sender = sender;
-
- cpus_clear(mask);
- /* find a single cpu for each uvhub in this distribution mask */
- for (uvhub = 0;
- uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
- uvhub++) {
- if (!bau_uvhub_isset(uvhub, distribution))
- continue;
- /* find a cpu for this uvhub */
- cpu = uvhub_to_first_cpu(uvhub);
- cpu_set(cpu, mask);
- }
- /* IPI all cpus; Preemption is already disabled */
- smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
- return;
-}
-
-static inline unsigned long
-cycles_2_us(unsigned long long cyc)
-{
- unsigned long long ns;
- unsigned long us;
- ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
- >> CYC2NS_SCALE_FACTOR;
- us = ns / 1000;
- return us;
-}
-
-/*
- * wait for all cpus on this hub to finish their sends and go quiet
- * leaves uvhub_quiesce set so that no new broadcasts are started by
- * bau_flush_send_and_wait()
- */
-static inline void
-quiesce_local_uvhub(struct bau_control *hmaster)
-{
- atomic_add_short_return(1, (struct atomic_short *)
- &hmaster->uvhub_quiesce);
-}
-
-/*
- * mark this quiet-requestor as done
- */
-static inline void
-end_uvhub_quiesce(struct bau_control *hmaster)
-{
- atomic_add_short_return(-1, (struct atomic_short *)
- &hmaster->uvhub_quiesce);
-}
-
-/*
- * Wait for completion of a broadcast software ack message
- * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
- */
-static int uv_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift, int this_cpu,
- struct bau_control *bcp, struct bau_control *smaster, long try)
-{
- int relaxes = 0;
- unsigned long descriptor_status;
- unsigned long mmr;
- unsigned long mask;
- cycles_t ttime;
- cycles_t timeout_time;
- struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
- struct bau_control *hmaster;
-
- hmaster = bcp->uvhub_master;
- timeout_time = get_cycles() + bcp->timeout_interval;
-
- /* spin on the status MMR, waiting for it to go idle */
- while ((descriptor_status = (((unsigned long)
- uv_read_local_mmr(mmr_offset) >>
- right_shift) & UV_ACT_STATUS_MASK)) !=
- DESC_STATUS_IDLE) {
- /*
- * Our software ack messages may be blocked because there are
- * no swack resources available. As long as none of them
- * has timed out hardware will NACK our message and its
- * state will stay IDLE.
- */
- if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
- stat->s_stimeout++;
- return FLUSH_GIVEUP;
- } else if (descriptor_status ==
- DESC_STATUS_DESTINATION_TIMEOUT) {
- stat->s_dtimeout++;
- ttime = get_cycles();
-
- /*
- * Our retries may be blocked by all destination
- * swack resources being consumed, and a timeout
- * pending. In that case hardware returns the
- * ERROR that looks like a destination timeout.
- */
- if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
- bcp->conseccompletes = 0;
- return FLUSH_RETRY_PLUGGED;
- }
-
- bcp->conseccompletes = 0;
- return FLUSH_RETRY_TIMEOUT;
- } else {
- /*
- * descriptor_status is still BUSY
- */
- cpu_relax();
- relaxes++;
- if (relaxes >= 10000) {
- relaxes = 0;
- if (get_cycles() > timeout_time) {
- quiesce_local_uvhub(hmaster);
-
- /* single-thread the register change */
- spin_lock(&hmaster->masks_lock);
- mmr = uv_read_local_mmr(mmr_offset);
- mask = 0UL;
- mask |= (3UL < right_shift);
- mask = ~mask;
- mmr &= mask;
- uv_write_local_mmr(mmr_offset, mmr);
- spin_unlock(&hmaster->masks_lock);
- end_uvhub_quiesce(hmaster);
- stat->s_busy++;
- return FLUSH_GIVEUP;
- }
- }
- }
- }
- bcp->conseccompletes++;
- return FLUSH_COMPLETE;
-}
-
-static inline cycles_t
-sec_2_cycles(unsigned long sec)
-{
- unsigned long ns;
- cycles_t cyc;
-
- ns = sec * 1000000000;
- cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
- return cyc;
-}
-
-/*
- * conditionally add 1 to *v, unless *v is >= u
- * return 0 if we cannot add 1 to *v because it is >= u
- * return 1 if we can add 1 to *v because it is < u
- * the add is atomic
- *
- * This is close to atomic_add_unless(), but this allows the 'u' value
- * to be lowered below the current 'v'. atomic_add_unless can only stop
- * on equal.
- */
-static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
-{
- spin_lock(lock);
- if (atomic_read(v) >= u) {
- spin_unlock(lock);
- return 0;
- }
- atomic_inc(v);
- spin_unlock(lock);
- return 1;
-}
-
-/**
- * uv_flush_send_and_wait
- *
- * Send a broadcast and wait for it to complete.
- *
- * The flush_mask contains the cpus the broadcast is to be sent to, plus
- * cpus that are on the local uvhub.
- *
- * Returns NULL if all flushing represented in the mask was done. The mask
- * is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set, representing any cpus on the local
- * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
- */
-const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
- struct cpumask *flush_mask,
- struct bau_control *bcp)
-{
- int right_shift;
- int uvhub;
- int bit;
- int completion_status = 0;
- int seq_number = 0;
- long try = 0;
- int cpu = bcp->uvhub_cpu;
- int this_cpu = bcp->cpu;
- int this_uvhub = bcp->uvhub;
- unsigned long mmr_offset;
- unsigned long index;
- cycles_t time1;
- cycles_t time2;
- struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
- struct bau_control *smaster = bcp->socket_master;
- struct bau_control *hmaster = bcp->uvhub_master;
-
- /*
- * Spin here while there are hmaster->max_concurrent or more active
- * descriptors. This is the per-uvhub 'throttle'.
- */
- if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
- &hmaster->active_descriptor_count,
- hmaster->max_concurrent)) {
- stat->s_throttles++;
- do {
- cpu_relax();
- } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
- &hmaster->active_descriptor_count,
- hmaster->max_concurrent));
- }
-
- while (hmaster->uvhub_quiesce)
- cpu_relax();
-
- if (cpu < UV_CPUS_PER_ACT_STATUS) {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = cpu * UV_ACT_STATUS_SIZE;
- } else {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift =
- ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
- }
- time1 = get_cycles();
- do {
- /*
- * Every message from any given cpu gets a unique message
- * sequence number. But retries use that same number.
- * Our message may have timed out at the destination because
- * all sw-ack resources are in use and there is a timeout
- * pending there. In that case, our last send never got
- * placed into the queue and we need to persist until it
- * does.
- *
- * Make any retry a type MSG_RETRY so that the destination will
- * free any resource held by a previous message from this cpu.
- */
- if (try == 0) {
- /* use message type set by the caller the first time */
- seq_number = bcp->message_number++;
- } else {
- /* use RETRY type on all the rest; same sequence */
- bau_desc->header.msg_type = MSG_RETRY;
- stat->s_retry_messages++;
- }
- bau_desc->header.sequence = seq_number;
- index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
- bcp->uvhub_cpu;
- bcp->send_message = get_cycles();
-
- uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
- try++;
- completion_status = uv_wait_completion(bau_desc, mmr_offset,
- right_shift, this_cpu, bcp, smaster, try);
-
- if (completion_status == FLUSH_RETRY_PLUGGED) {
- /*
- * Our retries may be blocked by all destination swack
- * resources being consumed, and a timeout pending. In
- * that case hardware immediately returns the ERROR
- * that looks like a destination timeout.
- */
- udelay(TIMEOUT_DELAY);
- bcp->plugged_tries++;
- if (bcp->plugged_tries >= PLUGSB4RESET) {
- bcp->plugged_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_plug++;
- }
- } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
- hmaster->max_concurrent = 1;
- bcp->timeout_tries++;
- udelay(TIMEOUT_DELAY);
- if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
- bcp->timeout_tries = 0;
- quiesce_local_uvhub(hmaster);
- spin_lock(&hmaster->queue_lock);
- uv_reset_with_ipi(&bau_desc->distribution,
- this_cpu);
- spin_unlock(&hmaster->queue_lock);
- end_uvhub_quiesce(hmaster);
- bcp->ipi_attempts++;
- stat->s_resets_timeout++;
- }
- }
- if (bcp->ipi_attempts >= 3) {
- bcp->ipi_attempts = 0;
- completion_status = FLUSH_GIVEUP;
- break;
- }
- cpu_relax();
- } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
- (completion_status == FLUSH_RETRY_TIMEOUT));
- time2 = get_cycles();
-
- if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
- && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
- hmaster->max_concurrent++;
-
- /*
- * hold any cpu not timing out here; no other cpu currently held by
- * the 'throttle' should enter the activation code
- */
- while (hmaster->uvhub_quiesce)
- cpu_relax();
- atomic_dec(&hmaster->active_descriptor_count);
-
- /* guard against cycles wrap */
- if (time2 > time1)
- stat->s_time += (time2 - time1);
- else
- stat->s_requestor--; /* don't count this one */
- if (completion_status == FLUSH_COMPLETE && try > 1)
- stat->s_retriesok++;
- else if (completion_status == FLUSH_GIVEUP) {
- /*
- * Cause the caller to do an IPI-style TLB shootdown on
- * the target cpu's, all of which are still in the mask.
- */
- stat->s_giveup++;
- return flush_mask;
- }
-
- /*
- * Success, so clear the remote cpu's from the mask so we don't
- * use the IPI method of shootdown on them.
- */
- for_each_cpu(bit, flush_mask) {
- uvhub = uv_cpu_to_blade_id(bit);
- if (uvhub == this_uvhub)
- continue;
- cpumask_clear_cpu(bit, flush_mask);
- }
- if (!cpumask_empty(flush_mask))
- return flush_mask;
-
- return NULL;
-}
-
-/**
- * uv_flush_tlb_others - globally purge translation cache of a virtual
- * address or all TLB's
- * @cpumask: mask of all cpu's in which the address is to be removed
- * @mm: mm_struct containing virtual address range
- * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
- * @cpu: the current cpu
- *
- * This is the entry point for initiating any UV global TLB shootdown.
- *
- * Purges the translation caches of all specified processors of the given
- * virtual address, or purges all TLB's on specified processors.
- *
- * The caller has derived the cpumask from the mm_struct. This function
- * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
- *
- * The cpumask is converted into a uvhubmask of the uvhubs containing
- * those cpus.
- *
- * Note that this function should be called with preemption disabled.
- *
- * Returns NULL if all remote flushing was done.
- * Returns pointer to cpumask if some remote flushing remains to be
- * done. The returned pointer is valid till preemption is re-enabled.
- */
-const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va, unsigned int cpu)
-{
- int remotes;
- int tcpu;
- int uvhub;
- int locals = 0;
- struct bau_desc *bau_desc;
- struct cpumask *flush_mask;
- struct ptc_stats *stat;
- struct bau_control *bcp;
-
- if (nobau)
- return cpumask;
-
- bcp = &per_cpu(bau_control, cpu);
- /*
- * Each sending cpu has a per-cpu mask which it fills from the caller's
- * cpu mask. Only remote cpus are converted to uvhubs and copied.
- */
- flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
- /*
- * copy cpumask to flush_mask, removing current cpu
- * (current cpu should already have been flushed by the caller and
- * should never be returned if we return flush_mask)
- */
- cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
- if (cpu_isset(cpu, *cpumask))
- locals++; /* current cpu was targeted */
-
- bau_desc = bcp->descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
-
- bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
- remotes = 0;
- for_each_cpu(tcpu, flush_mask) {
- uvhub = uv_cpu_to_blade_id(tcpu);
- if (uvhub == bcp->uvhub) {
- locals++;
- continue;
- }
- bau_uvhub_set(uvhub, &bau_desc->distribution);
- remotes++;
- }
- if (remotes == 0) {
- /*
- * No off_hub flushing; return status for local hub.
- * Return the caller's mask if all were local (the current
- * cpu may be in that mask).
- */
- if (locals)
- return cpumask;
- else
- return NULL;
- }
- stat = &per_cpu(ptcstats, cpu);
- stat->s_requestor++;
- stat->s_ntargcpu += remotes;
- remotes = bau_uvhub_weight(&bau_desc->distribution);
- stat->s_ntarguvhub += remotes;
- if (remotes >= 16)
- stat->s_ntarguvhub16++;
- else if (remotes >= 8)
- stat->s_ntarguvhub8++;
- else if (remotes >= 4)
- stat->s_ntarguvhub4++;
- else if (remotes >= 2)
- stat->s_ntarguvhub2++;
- else
- stat->s_ntarguvhub1++;
-
- bau_desc->payload.address = va;
- bau_desc->payload.sending_cpu = cpu;
-
- /*
- * uv_flush_send_and_wait returns null if all cpu's were messaged, or
- * the adjusted flush_mask if any cpu's were not messaged.
- */
- return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
-}
-
-/*
- * The BAU message interrupt comes here. (registered by set_intr_gate)
- * See entry_64.S
- *
- * We received a broadcast assist message.
- *
- * Interrupts are disabled; this interrupt could represent
- * the receipt of several messages.
- *
- * All cores/threads on this hub get this interrupt.
- * The last one to see it does the software ack.
- * (the resource will not be freed until noninterruptable cpus see this
- * interrupt; hardware may timeout the s/w ack and reply ERROR)
- */
-void uv_bau_message_interrupt(struct pt_regs *regs)
-{
- int count = 0;
- cycles_t time_start;
- struct bau_payload_queue_entry *msg;
- struct bau_control *bcp;
- struct ptc_stats *stat;
- struct msg_desc msgdesc;
-
- time_start = get_cycles();
- bcp = &per_cpu(bau_control, smp_processor_id());
- stat = &per_cpu(ptcstats, smp_processor_id());
- msgdesc.va_queue_first = bcp->va_queue_first;
- msgdesc.va_queue_last = bcp->va_queue_last;
- msg = bcp->bau_msg_head;
- while (msg->sw_ack_vector) {
- count++;
- msgdesc.msg_slot = msg - msgdesc.va_queue_first;
- msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
- msgdesc.msg = msg;
- uv_bau_process_message(&msgdesc, bcp);
- msg++;
- if (msg > msgdesc.va_queue_last)
- msg = msgdesc.va_queue_first;
- bcp->bau_msg_head = msg;
- }
- stat->d_time += (get_cycles() - time_start);
- if (!count)
- stat->d_nomsg++;
- else if (count > 1)
- stat->d_multmsg++;
- ack_APIC_irq();
-}
-
-/*
- * uv_enable_timeouts
- *
- * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
- * shootdown message timeouts enabled. The timeout does not cause
- * an interrupt, but causes an error message to be returned to
- * the sender.
- */
-static void uv_enable_timeouts(void)
-{
- int uvhub;
- int nuvhubs;
- int pnode;
- unsigned long mmr_image;
-
- nuvhubs = uv_num_possible_blades();
-
- for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
- if (!uv_blade_nr_possible_cpus(uvhub))
- continue;
-
- pnode = uv_blade_to_pnode(uvhub);
- mmr_image =
- uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
- /*
- * Set the timeout period and then lock it in, in three
- * steps; captures and locks in the period.
- *
- * To program the period, the SOFT_ACK_MODE must be off.
- */
- mmr_image &= ~((unsigned long)1 <<
- UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
- /*
- * Set the 4-bit period.
- */
- mmr_image &= ~((unsigned long)0xf <<
- UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
- mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
- UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
- /*
- * Subsequent reversals of the timebase bit (3) cause an
- * immediate timeout of one or all INTD resources as
- * indicated in bits 2:0 (7 causes all of them to timeout).
- */
- mmr_image |= ((unsigned long)1 <<
- UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
- uv_write_global_mmr64
- (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
- }
-}
-
-static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
-{
- if (*offset < num_possible_cpus())
- return offset;
- return NULL;
-}
-
-static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
-{
- (*offset)++;
- if (*offset < num_possible_cpus())
- return offset;
- return NULL;
-}
-
-static void uv_ptc_seq_stop(struct seq_file *file, void *data)
-{
-}
-
-static inline unsigned long long
-millisec_2_cycles(unsigned long millisec)
-{
- unsigned long ns;
- unsigned long long cyc;
-
- ns = millisec * 1000000;
- cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
- return cyc;
-}
-
-/*
- * Display the statistics thru /proc.
- * 'data' points to the cpu number
- */
-static int uv_ptc_seq_show(struct seq_file *file, void *data)
-{
- struct ptc_stats *stat;
- int cpu;
-
- cpu = *(loff_t *)data;
-
- if (!cpu) {
- seq_printf(file,
- "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
- seq_printf(file,
- "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
- seq_printf(file,
- "retries rok resetp resett giveup sto bz throt ");
- seq_printf(file,
- "sw_ack recv rtime all ");
- seq_printf(file,
- "one mult none retry canc nocan reset rcan\n");
- }
- if (cpu < num_possible_cpus() && cpu_online(cpu)) {
- stat = &per_cpu(ptcstats, cpu);
- /* source side statistics */
- seq_printf(file,
- "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
- cpu, stat->s_requestor, cycles_2_us(stat->s_time),
- stat->s_ntarguvhub, stat->s_ntarguvhub16,
- stat->s_ntarguvhub8, stat->s_ntarguvhub4,
- stat->s_ntarguvhub2, stat->s_ntarguvhub1,
- stat->s_ntargcpu, stat->s_dtimeout);
- seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
- stat->s_retry_messages, stat->s_retriesok,
- stat->s_resets_plug, stat->s_resets_timeout,
- stat->s_giveup, stat->s_stimeout,
- stat->s_busy, stat->s_throttles);
- /* destination side statistics */
- seq_printf(file,
- "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
- uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
- UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
- stat->d_requestee, cycles_2_us(stat->d_time),
- stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
- stat->d_nomsg, stat->d_retries, stat->d_canceled,
- stat->d_nocanceled, stat->d_resets,
- stat->d_rcanceled);
- }
-
- return 0;
-}
-
-/*
- * -1: reset the statistics
- * 0: display meaning of the statistics
- * >0: maximum concurrent active descriptors per uvhub (throttle)
- */
-static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
- size_t count, loff_t *data)
-{
- int cpu;
- long input_arg;
- char optstr[64];
- struct ptc_stats *stat;
- struct bau_control *bcp;
-
- if (count == 0 || count > sizeof(optstr))
- return -EINVAL;
- if (copy_from_user(optstr, user, count))
- return -EFAULT;
- optstr[count - 1] = '\0';
- if (strict_strtol(optstr, 10, &input_arg) < 0) {
- printk(KERN_DEBUG "%s is invalid\n", optstr);
- return -EINVAL;
- }
-
- if (input_arg == 0) {
- printk(KERN_DEBUG "# cpu: cpu number\n");
- printk(KERN_DEBUG "Sender statistics:\n");
- printk(KERN_DEBUG
- "sent: number of shootdown messages sent\n");
- printk(KERN_DEBUG
- "stime: time spent sending messages\n");
- printk(KERN_DEBUG
- "numuvhubs: number of hubs targeted with shootdown\n");
- printk(KERN_DEBUG
- "numuvhubs16: number times 16 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs8: number times 8 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs4: number times 4 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs2: number times 2 or more hubs targeted\n");
- printk(KERN_DEBUG
- "numuvhubs1: number times 1 hub targeted\n");
- printk(KERN_DEBUG
- "numcpus: number of cpus targeted with shootdown\n");
- printk(KERN_DEBUG
- "dto: number of destination timeouts\n");
- printk(KERN_DEBUG
- "retries: destination timeout retries sent\n");
- printk(KERN_DEBUG
- "rok: : destination timeouts successfully retried\n");
- printk(KERN_DEBUG
- "resetp: ipi-style resource resets for plugs\n");
- printk(KERN_DEBUG
- "resett: ipi-style resource resets for timeouts\n");
- printk(KERN_DEBUG
- "giveup: fall-backs to ipi-style shootdowns\n");
- printk(KERN_DEBUG
- "sto: number of source timeouts\n");
- printk(KERN_DEBUG
- "bz: number of stay-busy's\n");
- printk(KERN_DEBUG
- "throt: number times spun in throttle\n");
- printk(KERN_DEBUG "Destination side statistics:\n");
- printk(KERN_DEBUG
- "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
- printk(KERN_DEBUG
- "recv: shootdown messages received\n");
- printk(KERN_DEBUG
- "rtime: time spent processing messages\n");
- printk(KERN_DEBUG
- "all: shootdown all-tlb messages\n");
- printk(KERN_DEBUG
- "one: shootdown one-tlb messages\n");
- printk(KERN_DEBUG
- "mult: interrupts that found multiple messages\n");
- printk(KERN_DEBUG
- "none: interrupts that found no messages\n");
- printk(KERN_DEBUG
- "retry: number of retry messages processed\n");
- printk(KERN_DEBUG
- "canc: number messages canceled by retries\n");
- printk(KERN_DEBUG
- "nocan: number retries that found nothing to cancel\n");
- printk(KERN_DEBUG
- "reset: number of ipi-style reset requests processed\n");
- printk(KERN_DEBUG
- "rcan: number messages canceled by reset requests\n");
- } else if (input_arg == -1) {
- for_each_present_cpu(cpu) {
- stat = &per_cpu(ptcstats, cpu);
- memset(stat, 0, sizeof(struct ptc_stats));
- }
- } else {
- uv_bau_max_concurrent = input_arg;
- bcp = &per_cpu(bau_control, smp_processor_id());
- if (uv_bau_max_concurrent < 1 ||
- uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
- printk(KERN_DEBUG
- "Error: BAU max concurrent %d; %d is invalid\n",
- bcp->max_concurrent, uv_bau_max_concurrent);
- return -EINVAL;
- }
- printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
- uv_bau_max_concurrent);
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- bcp->max_concurrent = uv_bau_max_concurrent;
- }
- }
-
- return count;
-}
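
The three write values documented above can be exercised from userspace. A minimal sketch, assuming the proc entry created under UV_PTC_BASENAME lands at /proc/sgi_uv/ptc_statistics (the path is an assumption for illustration, not stated in this hunk):

        #include <stdio.h>

        int main(void)
        {
                /* path is assumed; the kernel side names it UV_PTC_BASENAME */
                FILE *f = fopen("/proc/sgi_uv/ptc_statistics", "w");

                if (!f)
                        return 1;
                fputs("-1\n", f);       /* -1 zeroes every cpu's ptcstats */
                /* "0" prints the field legend; N > 0 sets the throttle */
                fclose(f);
                return 0;
        }
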
-
-static const struct seq_operations uv_ptc_seq_ops = {
- .start = uv_ptc_seq_start,
- .next = uv_ptc_seq_next,
- .stop = uv_ptc_seq_stop,
- .show = uv_ptc_seq_show
-};
-
-static int uv_ptc_proc_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &uv_ptc_seq_ops);
-}
-
-static const struct file_operations proc_uv_ptc_operations = {
- .open = uv_ptc_proc_open,
- .read = seq_read,
- .write = uv_ptc_proc_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init uv_ptc_init(void)
-{
- struct proc_dir_entry *proc_uv_ptc;
-
- if (!is_uv_system())
- return 0;
-
- proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
- &proc_uv_ptc_operations);
- if (!proc_uv_ptc) {
- printk(KERN_ERR "unable to create %s proc entry\n",
- UV_PTC_BASENAME);
- return -EINVAL;
- }
- return 0;
-}
-
-/*
- * initialize the sending side's sending buffers
- */
-static void
-uv_activation_descriptor_init(int node, int pnode)
-{
- int i;
- int cpu;
- unsigned long pa;
- unsigned long m;
- unsigned long n;
- struct bau_desc *bau_desc;
- struct bau_desc *bd2;
- struct bau_control *bcp;
-
- /*
- * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
- */
- bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
- UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
- BUG_ON(!bau_desc);
-
- pa = uv_gpa(bau_desc); /* need the real nasid*/
- n = pa >> uv_nshift;
- m = pa & uv_mmask;
-
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
- (n << UV_DESC_BASE_PNODE_SHIFT | m));
-
- /*
- * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
- * cpu even though we only use the first one; one descriptor can
- * describe a broadcast to 256 uv hubs.
- */
- for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
- i++, bd2++) {
- memset(bd2, 0, sizeof(struct bau_desc));
- bd2->header.sw_ack_flag = 1;
- /*
- * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
- * in the partition. The bit map will indicate uvhub numbers,
- * which are 0-N in a partition. Pnodes are unique system-wide.
- */
- bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
- bd2->header.dest_subnodeid = 0x10; /* the LB */
- bd2->header.command = UV_NET_ENDPOINT_INTD;
- bd2->header.int_both = 1;
- /*
- * all others need to be set to zero:
- * fairness chaining multilevel count replied_to
- */
- }
- for_each_present_cpu(cpu) {
- if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
- continue;
- bcp = &per_cpu(bau_control, cpu);
- bcp->descriptor_base = bau_desc;
- }
-}
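
The n/m pair written into UVH_LB_BAU_SB_DESCRIPTOR_BASE is just a shift-and-mask split of the global address, with uv_nshift and uv_mmask derived from m_val (see uv_bau_init below). A standalone sketch of the same decomposition:

        /* sketch: split a UV global address into node bits and offset bits */
        static unsigned long gpa_node(unsigned long pa, int m_val)
        {
                return pa >> m_val;                     /* pnode bits */
        }

        static unsigned long gpa_offset(unsigned long pa, int m_val)
        {
                return pa & ((1UL << m_val) - 1);       /* node-local offset */
        }
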
-
-/*
- * initialize the destination side's receiving buffers
- * entered for each uvhub in the partition
- * - node is first node (kernel memory notion) on the uvhub
- * - pnode is the uvhub's physical identifier
- */
-static void
-uv_payload_queue_init(int node, int pnode)
-{
- int pn;
- int cpu;
- char *cp;
- unsigned long pa;
- struct bau_payload_queue_entry *pqp;
- struct bau_payload_queue_entry *pqp_malloc;
- struct bau_control *bcp;
-
- pqp = (struct bau_payload_queue_entry *) kmalloc_node(
- (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
- GFP_KERNEL, node);
- BUG_ON(!pqp);
- pqp_malloc = pqp;
-
- cp = (char *)pqp + 31;
- pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
-
- for_each_present_cpu(cpu) {
- if (pnode != uv_cpu_to_pnode(cpu))
- continue;
- /* for every cpu on this pnode: */
- bcp = &per_cpu(bau_control, cpu);
- bcp->va_queue_first = pqp;
- bcp->bau_msg_head = pqp;
- bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
- }
- /*
- * need the pnode of where the memory was really allocated
- */
- pa = uv_gpa(pqp);
- pn = pa >> uv_nshift;
- uv_write_global_mmr64(pnode,
- UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
- ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
- uv_physnodeaddr(pqp));
- uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
- (unsigned long)
- uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
- /* in effect, all msg_type's are set to MSG_NOOP */
- memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-}
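
The cp + 31 / shift-down-shift-up dance above is the usual round-up-to-alignment idiom (and is why DEST_Q_SIZE + 1 entries are allocated: the aligned copy may start up to 31 bytes into the buffer). Equivalently, as a self-contained sketch:

        #include <stdint.h>

        /*
         * Round p up to the next 32-byte boundary.
         * ((x + 31) >> 5) << 5 and (x + 31) & ~31UL compute the same value.
         */
        static void *align32(void *p)
        {
                return (void *)(((uintptr_t)p + 31) & ~(uintptr_t)31);
        }
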
-
-/*
- * Initialization of each UV hub's structures
- */
-static void __init uv_init_uvhub(int uvhub, int vector)
-{
- int node;
- int pnode;
- unsigned long apicid;
-
- node = uvhub_to_first_node(uvhub);
- pnode = uv_blade_to_pnode(uvhub);
- uv_activation_descriptor_init(node, pnode);
- uv_payload_queue_init(node, pnode);
- /*
- * the below initialization can't be in firmware because the
- * messaging IRQ will be determined by the OS
- */
- apicid = uvhub_to_first_apicid(uvhub);
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
- ((apicid << 32) | vector));
-}
-
-/*
- * initialize the bau_control structure for each cpu
- */
-static void uv_init_per_cpu(int nuvhubs)
-{
- int i, j, k;
- int cpu;
- int pnode;
- int uvhub;
- short socket = 0;
- struct bau_control *bcp;
- struct uvhub_desc *bdp;
- struct socket_desc *sdp;
- struct bau_control *hmaster = NULL;
- struct bau_control *smaster = NULL;
- struct socket_desc {
- short num_cpus;
- short cpu_number[16];
- };
- struct uvhub_desc {
- short num_sockets;
- short num_cpus;
- short uvhub;
- short pnode;
- struct socket_desc socket[2];
- };
- struct uvhub_desc *uvhub_descs;
-
- uvhub_descs = (struct uvhub_desc *)
- kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
- memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
- for_each_present_cpu(cpu) {
- bcp = &per_cpu(bau_control, cpu);
- memset(bcp, 0, sizeof(struct bau_control));
- spin_lock_init(&bcp->masks_lock);
- bcp->max_concurrent = uv_bau_max_concurrent;
- pnode = uv_cpu_hub_info(cpu)->pnode;
- uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
- bdp = &uvhub_descs[uvhub];
- bdp->num_cpus++;
- bdp->uvhub = uvhub;
- bdp->pnode = pnode;
- /* time interval to catch a hardware stay-busy bug */
- bcp->timeout_interval = millisec_2_cycles(3);
- /* kludge: assume uv_hub.h is constant */
- socket = (cpu_physical_id(cpu)>>5)&1;
- if (socket >= bdp->num_sockets)
- bdp->num_sockets = socket+1;
- sdp = &bdp->socket[socket];
- sdp->cpu_number[sdp->num_cpus] = cpu;
- sdp->num_cpus++;
- }
- socket = 0;
- for_each_possible_blade(uvhub) {
- bdp = &uvhub_descs[uvhub];
- for (i = 0; i < bdp->num_sockets; i++) {
- sdp = &bdp->socket[i];
- for (j = 0; j < sdp->num_cpus; j++) {
- cpu = sdp->cpu_number[j];
- bcp = &per_cpu(bau_control, cpu);
- bcp->cpu = cpu;
- if (j == 0) {
- smaster = bcp;
- if (i == 0)
- hmaster = bcp;
- }
- bcp->cpus_in_uvhub = bdp->num_cpus;
- bcp->cpus_in_socket = sdp->num_cpus;
- bcp->socket_master = smaster;
- bcp->uvhub_master = hmaster;
- for (k = 0; k < DEST_Q_SIZE; k++)
- bcp->socket_acknowledge_count[k] = 0;
- bcp->uvhub_cpu =
- uv_cpu_hub_info(cpu)->blade_processor_id;
- }
- socket++;
- }
- }
- kfree(uvhub_descs);
-}
-
-/*
- * Initialization of BAU-related structures
- */
-static int __init uv_bau_init(void)
-{
- int uvhub;
- int pnode;
- int nuvhubs;
- int cur_cpu;
- int vector;
- unsigned long mmr;
-
- if (!is_uv_system())
- return 0;
-
- if (nobau)
- return 0;
-
- for_each_possible_cpu(cur_cpu)
- zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
- GFP_KERNEL, cpu_to_node(cur_cpu));
-
- uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
- uv_nshift = uv_hub_info->m_val;
- uv_mmask = (1UL << uv_hub_info->m_val) - 1;
- nuvhubs = uv_num_possible_blades();
-
- uv_init_per_cpu(nuvhubs);
-
- uv_partition_base_pnode = 0x7fffffff;
- for (uvhub = 0; uvhub < nuvhubs; uvhub++)
- if (uv_blade_nr_possible_cpus(uvhub) &&
- (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
- uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
-
- vector = UV_BAU_MESSAGE;
- for_each_possible_blade(uvhub)
- if (uv_blade_nr_possible_cpus(uvhub))
- uv_init_uvhub(uvhub, vector);
-
- uv_enable_timeouts();
- alloc_intr_gate(vector, uv_bau_message_intr1);
-
- for_each_possible_blade(uvhub) {
- pnode = uv_blade_to_pnode(uvhub);
- /* INIT the bau */
- uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
- ((unsigned long)1 << 63));
- mmr = 1; /* should be 1 to broadcast to both sockets */
- uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
- }
-
- return 0;
-}
-core_initcall(uv_bau_init);
-core_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b8579e70..3ffbab0081f4 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -1,16 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/user.h>
#include <linux/regset.h>
+#include <linux/syscalls.h>
+#include <linux/nospec.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
-#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
-#include <asm/syscalls.h>
+#include <asm/gsseg.h>
#include "tls.h"
@@ -28,6 +30,58 @@ static int get_free_idx(void)
return -ESRCH;
}
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+ return true;
+}
+
static void set_tls_desc(struct task_struct *p, int idx,
const struct user_desc *info, int n)
{
@@ -41,8 +95,8 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info))
- desc->a = desc->b = 0;
+ if (LDT_empty(info) || LDT_zero(info))
+ memset(desc, 0, sizeof(*desc));
else
fill_ldt(desc, info);
++info;
@@ -63,10 +117,14 @@ int do_set_thread_area(struct task_struct *p, int idx,
int can_allocate)
{
struct user_desc info;
+ unsigned short __maybe_unused sel, modified_sel;
if (copy_from_user(&info, u_info, sizeof(info)))
return -EFAULT;
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
if (idx == -1)
idx = info.entry_number;
@@ -87,14 +145,47 @@ int do_set_thread_area(struct task_struct *p, int idx,
set_tls_desc(p, idx, &info, 1);
+ /*
+ * If DS, ES, FS, or GS points to the modified segment, forcibly
+ * refresh it. Only needed on x86_64 because x86_32 reloads them
+ * on return to user mode.
+ */
+ modified_sel = (idx << 3) | 3;
+
+ if (p == current) {
+#ifdef CONFIG_X86_64
+ savesegment(ds, sel);
+ if (sel == modified_sel)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if (sel == modified_sel)
+ loadsegment(es, sel);
+
+ savesegment(fs, sel);
+ if (sel == modified_sel)
+ loadsegment(fs, sel);
+#endif
+
+ savesegment(gs, sel);
+ if (sel == modified_sel)
+ load_gs_index(sel);
+ } else {
+#ifdef CONFIG_X86_64
+ if (p->thread.fsindex == modified_sel)
+ p->thread.fsbase = info.base_addr;
+
+ if (p->thread.gsindex == modified_sel)
+ p->thread.gsbase = info.base_addr;
+#endif
+ }
+
return 0;
}
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info)
{
- int ret = do_set_thread_area(current, -1, u_info, 1);
- asmlinkage_protect(1, ret, u_info);
- return ret;
+ return do_set_thread_area(current, -1, u_info, 1);
}
@@ -125,6 +216,7 @@ int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *u_info)
{
struct user_desc info;
+ int index;
if (idx == -1 && get_user(idx, &u_info->entry_number))
return -EFAULT;
@@ -132,19 +224,20 @@ int do_get_thread_area(struct task_struct *p, int idx,
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
return -EINVAL;
- fill_user_desc(&info, idx,
- &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
+ index = idx - GDT_ENTRY_TLS_MIN;
+ index = array_index_nospec(index,
+ GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1);
+
+ fill_user_desc(&info, idx, &p->thread.tls_array[index]);
if (copy_to_user(u_info, &info, sizeof(info)))
return -EFAULT;
return 0;
}
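
array_index_nospec() re-clamps the already-checked index so a mispredicted bounds check cannot speculatively index tls_array out of range (Spectre v1). A C-level sketch of the idea; the kernel's real array_index_mask_nospec() is written in asm precisely so the compiler cannot turn the mask back into a branch:

        /* sketch only: branchless clamp of idx to [0, size) */
        static unsigned long index_nospec_sketch(unsigned long idx,
                                                 unsigned long size)
        {
                unsigned long mask = 0UL - (idx < size); /* ~0UL if in bounds, else 0 */

                return idx & mask;
        }
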
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info)
{
- int ret = do_get_thread_area(current, -1, u_info);
- asmlinkage_protect(1, ret, u_info);
- return ret;
+ return do_get_thread_area(current, -1, u_info);
}
int regset_tls_active(struct task_struct *target,
@@ -158,36 +251,16 @@ int regset_tls_active(struct task_struct *target,
}
int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
const struct desc_struct *tls;
+ struct user_desc v;
+ int pos;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
- (pos % sizeof(struct user_desc)) != 0 ||
- (count % sizeof(struct user_desc)) != 0)
- return -EINVAL;
-
- pos /= sizeof(struct user_desc);
- count /= sizeof(struct user_desc);
-
- tls = &target->thread.tls_array[pos];
-
- if (kbuf) {
- struct user_desc *info = kbuf;
- while (count-- > 0)
- fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
- tls++);
- } else {
- struct user_desc __user *u_info = ubuf;
- while (count-- > 0) {
- struct user_desc info;
- fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
- if (__copy_to_user(u_info++, &info, sizeof(info)))
- return -EFAULT;
- }
+ for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) {
+ fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls);
+ membuf_write(&to, &v, sizeof(v));
}
-
return 0;
}
@@ -197,8 +270,9 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
{
struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
const struct user_desc *info;
+ int i;
- if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
(pos % sizeof(struct user_desc)) != 0 ||
(count % sizeof(struct user_desc)) != 0)
return -EINVAL;
@@ -210,6 +284,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
else
info = infobuf;
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
set_tls_desc(target,
GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
info, count / sizeof(struct user_desc));
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
index 2f083a2fe216..fc39447a0c1a 100644
--- a/arch/x86/kernel/tls.h
+++ b/arch/x86/kernel/tls.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Internal declarations for x86 TLS implementation functions.
*
* Copyright (C) 2007 Red Hat, Inc. All rights reserved.
*
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License v.2.
- *
* Red Hat Author: Roland McGrath.
*/
@@ -15,7 +12,7 @@
#include <linux/regset.h>
extern user_regset_active_fn regset_tls_active;
-extern user_regset_get_fn regset_tls_get;
+extern user_regset_get2_fn regset_tls_get;
extern user_regset_set_fn regset_tls_set;
#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
deleted file mode 100644
index 7e4515957a1c..000000000000
--- a/arch/x86/kernel/topology.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/nodemask.h>
-#include <linux/mmzone.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <asm/cpu.h>
-
-static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
-
-#ifdef CONFIG_HOTPLUG_CPU
-int __ref arch_register_cpu(int num)
-{
- /*
- * CPU0 cannot be offlined due to several
- * restrictions and assumptions in kernel. This basically
- * doesn't add a control file, so one cannot attempt to offline
- * BSP.
- *
- * Also certain PCI quirks require not to enable hotplug control
- * for all CPU's.
- */
- if (num)
- per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
-
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void arch_unregister_cpu(int num)
-{
- unregister_cpu(&per_cpu(cpu_devices, num).cpu);
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else /* CONFIG_HOTPLUG_CPU */
-
-static int __init arch_register_cpu(int num)
-{
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-static int __init topology_init(void)
-{
- int i;
-
-#ifdef CONFIG_NUMA
- for_each_online_node(i)
- register_one_node(i);
-#endif
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
-
- return 0;
-}
-subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
new file mode 100644
index 000000000000..8322e8352777
--- /dev/null
+++ b/arch/x86/kernel/trace.c
@@ -0,0 +1,234 @@
+#include <asm/trace/irq_vectors.h>
+#include <linux/trace.h>
+
+#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
+/*
+ * trace_intel_irq_entry - record intel specific IRQ entry
+ */
+static void trace_intel_irq_entry(void *data, int vector)
+{
+ osnoise_trace_irq_entry(vector);
+}
+
+/*
+ * trace_intel_irq_exit - record intel specific IRQ exit
+ */
+static void trace_intel_irq_exit(void *data, int vector)
+{
+ char *vector_desc = (char *) data;
+
+ osnoise_trace_irq_exit(vector, vector_desc);
+}
+
+/*
+ * register_intel_irq_tp - Register intel specific IRQ entry tracepoints
+ */
+int osnoise_arch_register(void)
+{
+ int ret;
+
+ ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ if (ret)
+ goto out_timer_entry;
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_timer_exit;
+
+ ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ if (ret)
+ goto out_thermal_entry;
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+#ifdef CONFIG_X86_MCE_AMD
+ ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_thermal_exit;
+
+ ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ if (ret)
+ goto out_deferred_entry;
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_deferred_exit;
+
+ ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ if (ret)
+ goto out_threshold_entry;
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
+#ifdef CONFIG_SMP
+ ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_threshold_exit;
+
+ ret = register_trace_call_function_single_exit(trace_intel_irq_exit,
+ "call_function_single");
+ if (ret)
+ goto out_call_function_single_entry;
+
+ ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_single_exit;
+
+ ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ if (ret)
+ goto out_call_function_entry;
+
+ ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_exit;
+
+ ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ if (ret)
+ goto out_reschedule_entry;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_WORK
+ ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_reschedule_exit;
+
+ ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ if (ret)
+ goto out_irq_work_entry;
+#endif
+
+ ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_irq_work_exit;
+
+ ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ if (ret)
+ goto out_x86_ipi_entry;
+
+ ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_x86_ipi_exit;
+
+ ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ if (ret)
+ goto out_error_apic_entry;
+
+ ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_error_apic_exit;
+
+ ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ if (ret)
+ goto out_spurious_apic_entry;
+
+ return 0;
+
+out_spurious_apic_entry:
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+out_error_apic_exit:
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+out_error_apic_entry:
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+out_x86_ipi_exit:
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+out_x86_ipi_entry:
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+out_irq_work_exit:
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+out_irq_work_entry:
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+out_reschedule_exit:
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+out_reschedule_entry:
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+out_call_function_exit:
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+out_call_function_entry:
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+out_call_function_single_exit:
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+out_call_function_single_entry:
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+out_threshold_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+out_threshold_entry:
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+out_deferred_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+out_deferred_entry:
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+out_thermal_exit:
+#endif /* CONFIG_X86_MCE_AMD */
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+out_thermal_entry:
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+out_timer_exit:
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+out_timer_entry:
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+out_err:
+ return -EINVAL;
+}
+
+void osnoise_arch_unregister(void)
+{
+ unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+}
+#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */
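
The long label chain in osnoise_arch_register() is the standard kernel register/unwind pattern: hooks are registered in order and, on the first failure, execution falls through cascading labels that undo them in reverse. Reduced to two hypothetical hooks (register_a/register_b are illustrative names, not from this file):

        int register_both(void)
        {
                int ret;

                ret = register_a();
                if (ret)
                        goto out_err;

                ret = register_b();
                if (ret)
                        goto out_a;     /* b failed: undo a */

                return 0;

        out_a:
                unregister_a();
        out_err:
                return -EINVAL;
        }
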
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
new file mode 100644
index 000000000000..708d61743d15
--- /dev/null
+++ b/arch/x86/kernel/trace_clock.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * X86 trace clocks
+ */
+#include <asm/trace_clock.h>
+#include <asm/barrier.h>
+#include <asm/tsc.h>
+
+/*
+ * trace_clock_x86_tsc(): A clock that is just the cycle counter.
+ *
+ * Unlike the other clocks, this is not in nanoseconds.
+ */
+u64 notrace trace_clock_x86_tsc(void)
+{
+ return rdtsc_ordered();
+}
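
rdtsc_ordered() exists because a plain RDTSC may execute speculatively ahead of earlier instructions; the kernel pairs the read with an LFENCE or MFENCE selected via alternatives. A userspace sketch of the same pairing, assuming a compiler that provides <x86intrin.h>:

        #include <stdint.h>
        #include <x86intrin.h>

        static inline uint64_t tsc_ordered(void)
        {
                _mm_lfence();           /* order the read after earlier instructions */
                return __rdtsc();       /* raw cycle count; not nanoseconds */
        }
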
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index c652ef62742d..000000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <linux/io.h>
-
-#include <asm/trampoline.h>
-#include <asm/e820.h>
-
-#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
-#define __trampinit
-#define __trampinitdata
-#else
-#define __trampinit __cpuinit
-#define __trampinitdata __cpuinitdata
-#endif
-
-/* ready for x86_64 and x86 */
-unsigned char *__trampinitdata trampoline_base;
-
-void __init reserve_trampoline_memory(void)
-{
- unsigned long mem;
-
- /* Has to be in very low memory so we can execute real-mode AP code. */
- mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
- if (mem == -1L)
- panic("Cannot allocate trampoline\n");
-
- trampoline_base = __va(mem);
- reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
-}
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-unsigned long __trampinit setup_trampoline(void)
-{
- memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
- return virt_to_phys(trampoline_base);
-}
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 8508237e8e43..000000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- * This is only used for booting secondary CPUs in SMP machine
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * We jump into arch/x86/kernel/head_32.S.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * If you work on this file, check the object module with
- * objdump --reloc to make sure there are no relocation
- * entries except for:
- *
- * TYPE VALUE
- * R_386_32 startup_32_smp
- * R_386_32 boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-/* We can free up trampoline after bootup if cpu hotplug is not supported. */
-__CPUINITRODATA
-.code16
-
-ENTRY(trampoline_data)
-r_base = .
- wbinvd # Needed for NUMA-Q; should be harmless for others
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
-
- cli # We should be safe anyway
-
- movl $0xA5A5A5A5, trampoline_data - r_base
- # write marker so master knows we're running
-
- /* GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl boot_idt_descr - r_base # load idt with 0, 0
- lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
-
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
- # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
- ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
- # These need to be in the same 64K segment as the above;
- # hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
- .word __BOOT_DS + 7 # gdt limit
- .long boot_gdt - __PAGE_OFFSET # gdt base
-
-boot_idt_descr:
- .word 0 # idt limit = 0
- .long 0 # idt base = 0L
-
-.globl trampoline_end
-trampoline_end:
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
deleted file mode 100644
index 3af2dff58b21..000000000000
--- a/arch/x86/kernel/trampoline_64.S
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- * 15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * With the addition of trampoline_level4_pgt this code can
- * now enter a 64bit kernel that lives at arbitrary 64bit
- * physical addresses.
- *
- * If you work on this file, check the object module with objdump
- * --full-contents --reloc to make sure there are no relocation
- * entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-#include <asm/processor-flags.h>
-
-#ifdef CONFIG_ACPI_SLEEP
-.section .rodata, "a", @progbits
-#else
-/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
-__CPUINITRODATA
-#endif
-.code16
-
-ENTRY(trampoline_data)
-r_base = .
- cli # We should be safe anyway
- wbinvd
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
- mov %ax, %es
- mov %ax, %ss
-
-
- movl $0xA5A5A5A5, trampoline_data - r_base
- # write marker so master knows we're running
-
- # Setup stack
- movw $(trampoline_stack_end - r_base), %sp
-
- call verify_cpu # Verify the cpu supports long mode
- testl %eax, %eax # Check for return code
- jnz no_longmode
-
- mov %cs, %ax
- movzx %ax, %esi # Find the 32bit trampoline location
- shll $4, %esi
-
- # Fixup the vectors
- addl %esi, startup_32_vector - r_base
- addl %esi, startup_64_vector - r_base
- addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer
-
- /*
- * GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl tidt - r_base # load idt with 0, 0
- lgdtl tgdt - r_base # load gdt with whatever is appropriate
-
- mov $X86_CR0_PE, %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
-
- # flush prefetch and jump to startup_32
- ljmpl *(startup_32_vector - r_base)
-
- .code32
- .balign 4
-startup_32:
- movl $__KERNEL_DS, %eax # Initialize the %ds segment register
- movl %eax, %ds
-
- movl $X86_CR4_PAE, %eax
- movl %eax, %cr4 # Enable PAE mode
-
- # Setup trampoline 4 level pagetables
- leal (trampoline_level4_pgt - r_base)(%esi), %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- movl $(1 << _EFER_LME), %eax # Enable Long Mode
- xorl %edx, %edx
- wrmsr
-
- # Enable paging and in turn activate Long Mode
- # Enable protected mode
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
- movl %eax, %cr0
-
- /*
- * At this point we're in long mode but in 32bit compatibility mode
- * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
- * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
- * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
- */
- ljmp *(startup_64_vector - r_base)(%esi)
-
- .code64
- .balign 4
-startup_64:
- # Now jump into the kernel using virtual addresses
- movq $secondary_startup_64, %rax
- jmp *%rax
-
- .code16
-no_longmode:
- hlt
- jmp no_longmode
-#include "verify_cpu_64.S"
-
- # Careful these need to be in the same 64K segment as the above;
-tidt:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
- # Duplicate the global descriptor table
- # so the kernel can live anywhere
- .balign 4
-tgdt:
- .short tgdt_end - tgdt # gdt limit
- .long tgdt - r_base
- .short 0
- .quad 0x00cf9b000000ffff # __KERNEL32_CS
- .quad 0x00af9b000000ffff # __KERNEL_CS
- .quad 0x00cf93000000ffff # __KERNEL_DS
-tgdt_end:
-
- .balign 4
-startup_32_vector:
- .long startup_32 - r_base
- .word __KERNEL32_CS, 0
-
- .balign 4
-startup_64_vector:
- .long startup_64 - r_base
- .word __KERNEL_CS, 0
-
-trampoline_stack:
- .org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 510,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..bcf1dedc1d00 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,496 +9,1168 @@
/*
* Handle hardware traps and faults.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
+#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/ptrace.h>
+#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/static_call.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
+#include <linux/hardirq.h>
+#include <linux/atomic.h>
+#include <linux/iommu.h>
+#include <linux/ubsan.h>
-#ifdef CONFIG_EISA
-#include <linux/ioport.h>
-#include <linux/eisa.h>
-#endif
-
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
-#include <asm/kmemcheck.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
+#include <asm/realmode.h>
+#include <asm/text-patching.h>
+#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fred.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu.h>
+#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
-
+#include <asm/fixmap.h>
#include <asm/mach_traps.h>
+#include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/vm86.h>
+#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/vdso.h>
+#include <asm/tdx.h>
+#include <asm/cfi.h>
+#include <asm/msr.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
-#include <asm/pgalloc.h>
-#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
+#endif
-asmlinkage int system_call(void);
+#include <asm/proto.h>
-/* Do we ignore FPU interrupts ? */
-char ignore_fpu_irq;
+DECLARE_BITMAP(system_vectors, NR_VECTORS);
+
+__always_inline int is_valid_bugaddr(unsigned long addr)
+{
+ if (addr < TASK_SIZE_MAX)
+ return 0;
+
+ /*
+ * We got #UD, if the text isn't readable we'd have gotten
+ * a different exception.
+ */
+ return *(unsigned short *)addr == INSN_UD2;
+}
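
The unsigned-short compare works because UD2 encodes as the bytes 0f 0b; loaded little-endian that is 0x0b0f, which is what INSN_UD2 expands to. Byte-wise, the same test is:

        /* sketch: does the text at ip start with UD2 (0f 0b)? */
        static int starts_with_ud2(const unsigned char *ip)
        {
                return ip[0] == 0x0f && ip[1] == 0x0b;
        }
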
/*
- * The IDT has to be page-aligned to simplify the Pentium
- * F0 0F bug workaround.
+ * Check for UD1 or UD2, accounting for Address Size Override Prefixes.
+ * If it's a UD1, further decode to determine its use:
+ *
+ * FineIBT: d6 udb
+ * FineIBT: f0 75 f9 lock jne . - 6
+ * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax
+ * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax
+ * static_call: 0f b9 cc ud1 %esp,%ecx
+ * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg
+ *
+ * Notable, since __WARN_trap can use all registers, the distinction between
+ * UD1 users is through R/M.
*/
-gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
-#endif
+__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len)
+{
+ unsigned long start = addr;
+ u8 v, reg, rm, rex = 0;
+ int type = BUG_UD1;
+ bool lock = false;
+
+ if (addr < TASK_SIZE_MAX)
+ return BUG_NONE;
+
+ for (;;) {
+ v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ continue;
+
+ if (v == INSN_LOCK) {
+ lock = true;
+ continue;
+ }
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
+ if ((v & 0xf0) == 0x40) {
+ rex = v;
+ continue;
+ }
-static int ignore_nmis;
+ break;
+ }
-static inline void conditional_sti(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
+ switch (v) {
+ case 0x70 ... 0x7f: /* Jcc.d8 */
+ addr += 1; /* d8 */
+ *len = addr - start;
+ WARN_ON_ONCE(!lock);
+ return BUG_LOCK;
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
- inc_preempt_count();
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
+ case 0xd6:
+ *len = addr - start;
+ return BUG_UDB;
+
+ case OPCODE_ESCAPE:
+ break;
+
+ default:
+ return BUG_NONE;
+ }
+
+ v = *(u8 *)(addr++);
+ if (v == SECOND_BYTE_OPCODE_UD2) {
+ *len = addr - start;
+ return BUG_UD2;
+ }
+
+ if (v != SECOND_BYTE_OPCODE_UD1)
+ return BUG_NONE;
+
+ *imm = 0;
+ v = *(u8 *)(addr++); /* ModRM */
+
+ if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4)
+ addr++; /* SIB */
+
+ reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex);
+ rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex);
+
+ /* Decode immediate, if present */
+ switch (X86_MODRM_MOD(v)) {
+ case 0: if (X86_MODRM_RM(v) == 5)
+ addr += 4; /* RIP + disp32 */
+
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+
+ if (rm == 2) { /* (%edx) */
+ *imm = reg;
+ type = BUG_UD1_WARN;
+ }
+ break;
+
+ case 1: *imm = *(s8 *)addr;
+ addr += 1;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+ break;
+
+ case 2: *imm = *(s32 *)addr;
+ addr += 4;
+ if (rm == 0) /* (%eax) */
+ type = BUG_UD1_UBSAN;
+ break;
+
+ case 3: break;
+ }
+
+ /* record instruction length */
+ *len = addr - start;
+
+ return type;
}
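
For reference while reading decode_bug(): a ModRM byte is laid out mod[7:6] reg[5:3] rm[2:0], and REX.R/REX.B each contribute a fourth bit to reg and rm. A sketch of the extraction the X86_MODRM_* helpers perform:

        /* sketch of ModRM field extraction (layout per the SDM) */
        #define MOD(b)  (((b) >> 6) & 0x3)      /* addressing mode */
        #define REG(b)  (((b) >> 3) & 0x7)      /* register / opcode extension */
        #define RM(b)   ((b) & 0x7)             /* register-or-memory operand */
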
-static inline void conditional_cli(struct pt_regs *regs)
+static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
+ int offset = pt_regs_offset(regs, nr);
+ if (WARN_ON_ONCE(offset < 0))
+ return 0;
+ return *((unsigned long *)((void *)regs + offset));
}
-static inline void preempt_conditional_cli(struct pt_regs *regs)
+#ifdef HAVE_ARCH_BUG_FORMAT_ARGS
+DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
+EXPORT_STATIC_CALL_TRAMP(WARN_trap);
+
+/*
+ * Create a va_list from an exception context.
+ */
+void *__warn_args(struct arch_va_list *args, struct pt_regs *regs)
{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
- dec_preempt_count();
+ /*
+ * Register save area; populate with function call argument registers
+ */
+ args->regs[0] = regs->di;
+ args->regs[1] = regs->si;
+ args->regs[2] = regs->dx;
+ args->regs[3] = regs->cx;
+ args->regs[4] = regs->r8;
+ args->regs[5] = regs->r9;
+
+ /*
+ * From the ABI document:
+ *
+ * @gp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available general purpose
+ * argument register is saved. In case all argument registers have
+ * been exhausted, it is set to the value 48 (6*8).
+ *
+ * @fp_offset - the element holds the offset in bytes from
+ * reg_save_area to the place where the next available floating point
+ * argument is saved. In case all argument registers have been
+ * exhausted, it is set to the value 176 (6*8 + 8*16)
+ *
+ * @overflow_arg_area - this pointer is used to fetch arguments passed
+ * on the stack. It is initialized with the address of the first
+ * argument passed on the stack, if any, and then always updated to
+ * point to the start of the next argument on the stack.
+ *
+ * @reg_save_area - the element points to the start of the register
+ * save area.
+ *
+ * Notably the vararg starts with the second argument and there are no
+ * floating point arguments in the kernel.
+ */
+ args->args.gp_offset = 1*8;
+ args->args.fp_offset = 6*8 + 8*16;
+ args->args.reg_save_area = &args->regs;
+ args->args.overflow_arg_area = (void *)regs->sp;
+
+ /*
+ * If the exception came from __WARN_trap, there is a return
+ * address on the stack, skip that. This is why any __WARN_trap()
+ * caller must inhibit tail-call optimization.
+ */
+ if ((void *)regs->ip == &__WARN_trap)
+ args->args.overflow_arg_area += 8;
+
+ return &args->args;
}
+#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */
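
The four fields populated above mirror the SysV x86-64 va_list element (glibc's __va_list_tag). As a sketch, the record being faked up looks like:

        /* sketch: SysV x86-64 va_list element (cf. glibc __va_list_tag) */
        struct va_list_tag_sketch {
                unsigned int gp_offset;         /* bytes into reg_save_area; 48 once GP regs exhausted */
                unsigned int fp_offset;         /* 176 once FP regs exhausted */
                void *overflow_arg_area;        /* next stack-passed argument */
                void *reg_save_area;            /* rdi, rsi, rdx, rcx, r8, r9 (then XMMs) */
        };
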
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
+static nokprobe_inline int
+do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
+ struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
-
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
/*
- * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
- if (trapnr < 6)
- goto vm86_trap;
- goto trap_signal;
- }
-#endif
+ if (trapnr < X86_TRAP_UD) {
+ if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ return 0;
+ }
+ } else if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return 0;
- if (!user_mode(regs))
- goto kernel_trap;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
+ } else {
+ if (fixup_vdso_exception(regs, trapnr, error_code, 0))
+ return 0;
+ }
-#ifdef CONFIG_X86_32
-trap_signal:
-#endif
/*
- * We want error_code and trap_no set for userspace faults and
+ * We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
* kernelspace faults which are fixed up. die() gives the
* process no chance to handle the signal and notice the
* kernel fault information, so that won't result in polluting
* the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
+ * delivered, faults. See also exc_general_protection below.
*/
tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+ tsk->thread.trap_nr = trapnr;
-#ifdef CONFIG_X86_64
+ return -1;
+}
+
+static void show_signal(struct task_struct *tsk, int signr,
+ const char *type, const char *desc,
+ struct pt_regs *regs, long error_code)
+{
if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
- tsk->comm, tsk->pid, str,
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
+ pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk), type, desc,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
}
-#endif
+}
+
+static void
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, int sicode, void __user *addr)
+{
+ struct task_struct *tsk = current;
- if (info)
- force_sig_info(signr, info, tsk);
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
+
+ show_signal(tsk, signr, "trap ", str, regs, error_code);
+
+ if (!sicode)
+ force_sig(signr);
else
- force_sig(signr, tsk);
- return;
+ force_sig_fault(signr, sicode, addr);
+}
+NOKPROBE_SYMBOL(do_trap);
-kernel_trap:
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
- die(str, regs, error_code);
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr, int sicode, void __user *addr)
+{
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ cond_local_irq_enable(regs);
+ do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
+ cond_local_irq_disable(regs);
}
- return;
+}
-#ifdef CONFIG_X86_32
-vm86_trap:
- if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, trapnr))
- goto trap_signal;
- return;
-#endif
+/*
+ * Posix requires to provide the address of the faulting instruction for
+ * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
+ *
+ * This address is usually regs->ip, but when an uprobe moved the code out
+ * of line then regs->ip points to the XOL code which would confuse
+ * anything which analyzes the fault address vs. the unmodified binary. If
+ * a trap happened in XOL code then uprobe maps regs->ip back to the
+ * original instruction address.
+ */
+static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
+{
+ return (void __user *)uprobe_get_trap_addr(regs);
}
-#define DO_ERROR(trapnr, signr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
-}
-
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-#ifdef CONFIG_X86_32
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-#endif
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DEFINE_IDTENTRY(exc_divide_error)
+{
+ do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
+ FPE_INTDIV, error_get_trap_addr(regs));
+}
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
-dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_overflow)
{
- if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- 12, SIGBUS) == NOTIFY_STOP)
- return;
- preempt_conditional_sti(regs);
- do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+#ifdef CONFIG_X86_F00F_BUG
+void handle_invalid_op(struct pt_regs *regs)
+#else
+static inline void handle_invalid_op(struct pt_regs *regs)
+#endif
{
- static const char str[] = "double fault";
- struct task_struct *tsk = current;
+ do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
+ ILL_ILLOPN, error_get_trap_addr(regs));
+}
- /* Return not checked because a double fault cannot be ignored */
- notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+static noinstr bool handle_bug(struct pt_regs *regs)
+{
+ unsigned long addr = regs->ip;
+ bool handled = false;
+ int ud_type, ud_len;
+ s32 ud_imm;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 8;
+ ud_type = decode_bug(addr, &ud_imm, &ud_len);
+ if (ud_type == BUG_NONE)
+ return handled;
/*
- * This is always a kernel trap and never fixable (and thus must
- * never return).
+ * All lies, just get the WARN/BUG out.
*/
- for (;;)
- die(str, regs, error_code);
+ instrumentation_begin();
+ /*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
+ /*
+ * Since we're emulating a CALL with exceptions, restore the interrupt
+ * state to what it was at the exception site.
+ */
+ if (regs->flags & X86_EFLAGS_IF)
+ raw_local_irq_enable();
+
+ switch (ud_type) {
+ case BUG_UD1_WARN:
+ if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN)
+ handled = true;
+ break;
+
+ case BUG_UD2:
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ handled = true;
+ break;
+ }
+ fallthrough;
+
+ case BUG_UDB:
+ case BUG_LOCK:
+ if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
+ handled = true;
+ break;
+ }
+ break;
+
+ case BUG_UD1_UBSAN:
+ if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n",
+ report_ubsan_failure(ud_imm),
+ (void *)regs->ip);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * When continuing, and regs->ip hasn't changed, move it to the next
+ * instruction. When not continuing execution, restore the instruction
+ * pointer.
+ */
+ if (handled) {
+ if (regs->ip == addr)
+ regs->ip += ud_len;
+ } else {
+ regs->ip = addr;
+ }
+
+ if (regs->flags & X86_EFLAGS_IF)
+ raw_local_irq_disable();
+ instrumentation_end();
+
+ return handled;
}
-#endif
-dotraplinkage void __kprobes
-do_general_protection(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
{
- struct task_struct *tsk;
+ irqentry_state_t state;
- conditional_sti(regs);
+ /*
+ * We use UD2 as a short encoding for 'CALL __WARN', as such
+ * handle it before exception entry to avoid recursive WARN
+ * in case exception entry is the one triggering WARNs.
+ */
+ if (!user_mode(regs) && handle_bug(regs))
+ return;
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto gp_in_vm86;
-#endif
+ state = irqentry_enter(regs);
+ instrumentation_begin();
+ handle_invalid_op(regs);
+ instrumentation_end();
+ irqentry_exit(regs, state);
+}
- tsk = current;
- if (!user_mode(regs))
- goto gp_in_kernel;
+DEFINE_IDTENTRY(exc_coproc_segment_overrun)
+{
+ do_error_trap(regs, 0, "coprocessor segment overrun",
+ X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
+}
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
+DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
+{
+ do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
+ 0, NULL);
+}
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(KERN_INFO
- "%s[%d] general protection ip:%lx sp:%lx error:%lx",
- tsk->comm, task_pid_nr(tsk),
- regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
- printk("\n");
- }
+DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
+{
+ do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
+ SIGBUS, 0, NULL);
+}
- force_sig(SIGSEGV, tsk);
- return;
+DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
+{
+ do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
+ 0, NULL);
+}
-#ifdef CONFIG_X86_32
-gp_in_vm86:
- local_irq_enable();
- handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- return;
-#endif
+DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
+{
+ char *str = "alignment check";
-gp_in_kernel:
- if (fixup_exception(regs))
+ if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
return;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 13;
- if (notify_die(DIE_GPF, "general protection fault", regs,
- error_code, 13, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
+ if (!user_mode(regs))
+ die("Split lock detected\n", regs, error_code);
+
+ local_irq_enable();
+
+ if (handle_user_split_lock(regs, error_code))
+ goto out;
+
+ do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
+ error_code, BUS_ADRALN, NULL);
+
+out:
+ local_irq_disable();
}
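
With split lock detection armed, the #AC handled above can be provoked from userspace by a lock-prefixed read-modify-write that straddles a cache line. A deliberately misaligned sketch (undefined behavior per the C standard, shown only to illustrate what trips the trap):

        #include <stdint.h>

        _Alignas(64) static unsigned char buf[128];

        int main(void)
        {
                /* 4-byte atomic crossing the 64-byte line boundary at buf + 64 */
                volatile uint32_t *p = (volatile uint32_t *)(buf + 62);

                __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);     /* locked, split */
                return 0;
        }
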
-static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
+ unsigned long fault_address,
+ struct stack_info *info)
{
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ const char *name = stack_type_name(info->type);
+
+ printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
+ name, (void *)fault_address, info->begin, info->end);
- printk(KERN_EMERG
- "You have some hardware problem, likely on the PCI bus.\n");
+ die("stack guard page", regs, 0);
+
+ /* Be absolutely certain we don't return. */
+ panic("%s stack guard hit", name);
+}
+#endif
+
+/*
+ * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64
+ * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch
+ * between configs triggers objtool warnings.
+ *
+ * This is a temporary hack until we have compiler or plugin support for
+ * annotating noreturns.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+#define always_true() true
+#else
+bool always_true(void);
+bool __weak always_true(void) { return true; }
+#endif
+
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected. Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
+ * be lost. If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ *
+ * The 32-bit #DF shim already provides CR2 as an argument. On 64-bit it
+ * needs to be read before doing anything else.
+ */
+DEFINE_IDTENTRY_DF(exc_double_fault)
+{
+ static const char str[] = "double fault";
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_VMAP_STACK
+ unsigned long address = read_cr2();
+ struct stack_info info;
+#endif
+
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
+ *
+ * No need for nmi_enter() here because we don't use RCU.
+ */
+ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ unsigned long *p = (unsigned long *)regs->sp;
+
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ */
+ gpregs->ip = p[0];
+ gpregs->cs = p[1];
+ gpregs->flags = p[2];
+ gpregs->sp = p[3];
+ gpregs->ss = p[4];
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interrupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ *
+ * We will enter general_protection with kernel GSBASE,
+ * which is what the stub expects, given that the faulting
+ * RIP will be the IRET instruction.
+ */
+ regs->ip = (unsigned long)asm_exc_general_protection;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
-#if defined(CONFIG_EDAC)
- if (edac_handler_set()) {
- edac_atomic_assert_error();
return;
}
#endif
- if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ irqentry_nmi_enter(regs);
+ instrumentation_begin();
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_DF;
+
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF)"):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * delivered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ if (get_stack_guard_info((void *)address, &info))
+ handle_stack_overflow(regs, address, &info);
+#endif
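The "47 bytes" above is just one less than the size of a 64-bit exception frame: delivery pushes up to six 8-byte slots (SS, RSP, RFLAGS, CS, RIP and a possible error code). A standalone userspace sketch of that headroom arithmetic (illustrative only, not kernel code; the stack bottom value is made up):

#include <stdbool.h>
#include <stdio.h>

#define SLOT_BYTES	8UL
#define FRAME_SLOTS	6UL	/* SS, RSP, RFLAGS, CS, RIP, error code */

/* True if exception delivery at this RSP stays within the stack. */
static bool delivery_fits(unsigned long rsp, unsigned long stack_bottom)
{
	return rsp - stack_bottom >= FRAME_SLOTS * SLOT_BYTES;	/* >= 48 */
}

int main(void)
{
	unsigned long bottom = 0x10000;	/* hypothetical stack bottom */

	printf("rsp = bottom+48: %s\n",
	       delivery_fits(bottom + 48, bottom) ? "fits" : "would double fault");
	printf("rsp = bottom+47: %s\n",
	       delivery_fits(bottom + 47, bottom) ? "fits" : "would double fault");
	return 0;
}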
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ die("double fault", regs, error_code);
+ if (always_true())
+ panic("Machine halted.");
+ instrumentation_end();
}
-static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs *regs)
+DEFINE_IDTENTRY(exc_bounds)
{
- unsigned long i;
-
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- if (panic_on_io_nmi)
- panic("NMI IOCK error: Not continuing");
+ if (notify_die(DIE_TRAP, "bounds", regs, 0,
+ X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
+ return;
+ cond_local_irq_enable(regs);
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
+ if (!user_mode(regs))
+ die("bounds", regs, 0);
- i = 2000;
- while (--i)
- udelay(1000);
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
- reason &= ~8;
- outb(reason, 0x61);
+ cond_local_irq_disable(regs);
}
-static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+enum kernel_gp_hint {
+ GP_NO_HINT,
+ GP_NON_CANONICAL,
+ GP_CANONICAL,
+ GP_LASS_VIOLATION,
+ GP_NULL_POINTER,
+};
+
+static const char * const kernel_gp_hint_help[] = {
+ [GP_NON_CANONICAL] = "probably for non-canonical address",
+ [GP_CANONICAL] = "maybe for address",
+ [GP_LASS_VIOLATION] = "probably LASS violation for address",
+ [GP_NULL_POINTER] = "kernel NULL pointer dereference",
+};
+
+/*
+ * When an uncaught #GP occurs, try to determine the memory address accessed by
+ * the instruction and return that address to the caller. Also, try to figure
+ * out whether any part of the access to that address was non-canonical or
+ * across privilege levels.
+ */
+static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
+ unsigned long *addr)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
- NOTIFY_STOP)
- return;
-#ifdef CONFIG_MCA
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
+ int ret;
+
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
+ MAX_INSN_SIZE))
+ return GP_NO_HINT;
+
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return GP_NO_HINT;
+
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (*addr == -1UL)
+ return GP_NO_HINT;
+
+#ifdef CONFIG_X86_64
+ /* Operand is in the kernel half */
+ if (*addr >= ~__VIRTUAL_MASK)
+ return GP_CANONICAL;
+
+ /* The last byte of the operand is not in the user canonical half */
+ if (*addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
+ return GP_NON_CANONICAL;
+
/*
- * Might actually be able to figure out what the guilty party
- * is:
+ * A NULL pointer dereference usually causes a #PF. However, it
+ * can result in a #GP when LASS is active. Provide the same
+ * hint in the rare case that the condition is hit without LASS.
*/
- if (MCA_bus) {
- mca_handle_nmi();
- return;
- }
-#endif
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ if (*addr < PAGE_SIZE)
+ return GP_NULL_POINTER;
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
- if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ /*
+ * Assume that LASS caused the exception, because the address is
+ * canonical and in the user half.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_LASS))
+ return GP_LASS_VIOLATION;
+#endif
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ return GP_CANONICAL;
}
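The address classification above follows directly from the x86_64 canonical layout. A minimal userspace sketch of the same decision tree, under the assumption of 4-level paging where __VIRTUAL_MASK is (1UL << 47) - 1 and PAGE_SIZE is 4096 (constants hard-coded here for illustration, not pulled from kernel headers):

#include <stdio.h>

#define VIRTUAL_MASK	((1UL << 47) - 1)	/* assumed 4-level paging */
#define PAGE_SIZE	4096UL

static const char *classify(unsigned long addr, unsigned int opnd_bytes)
{
	if (addr >= ~VIRTUAL_MASK)
		return "kernel half, canonical";
	if (addr + opnd_bytes - 1 > VIRTUAL_MASK)
		return "non-canonical";
	if (addr < PAGE_SIZE)
		return "kernel NULL pointer dereference";
	return "user half, canonical (LASS candidate)";
}

int main(void)
{
	printf("%s\n", classify(0xffff888000000000UL, 8));
	printf("%s\n", classify(0x0000800000000000UL, 8));
	printf("%s\n", classify(0x10UL, 8));
	printf("%s\n", classify(0x00007f0000000000UL, 8));
	return 0;
}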
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+#define GPFSTR "general protection fault"
+
+static bool fixup_iopl_exception(struct pt_regs *regs)
{
- unsigned char reason = 0;
- int cpu;
+ struct thread_struct *t = &current->thread;
+ unsigned char byte;
+ unsigned long ip;
- cpu = smp_processor_id();
+ if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
+ return false;
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu)
- reason = get_nmi_reason();
+ if (insn_get_effective_ip(regs, &ip))
+ return false;
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
- unknown_nmi_error(reason, regs);
-#else
- unknown_nmi_error(reason, regs);
-#endif
+ if (get_user(byte, (const char __user *)ip))
+ return false;
- return;
+ if (byte != 0xfa && byte != 0xfb)
+ return false;
+
+ if (!t->iopl_warn && printk_ratelimit()) {
+ pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
+ current->comm, task_pid_nr(current), ip);
+ print_vma_addr(KERN_CONT " in ", ip);
+ pr_cont("\n");
+ t->iopl_warn = 1;
}
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
+ regs->ip += 1;
+ return true;
+}
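The opcode test above keys on the one-byte encodings of CLI (0xfa) and STI (0xfb); the fixup is simply to step over that byte. A hedged standalone sketch of the same emulation decision (userspace C, made-up addresses):

#include <stdbool.h>
#include <stdio.h>

/* Treat CLI (0xfa) / STI (0xfb) as a one-byte NOP by advancing IP. */
static bool emulate_cli_sti(unsigned char byte, unsigned long *ip)
{
	if (byte != 0xfa && byte != 0xfb)
		return false;
	*ip += 1;	/* both instructions are one byte long */
	return true;
}

int main(void)
{
	unsigned long ip = 0x401000;

	printf("0xfa handled: %d, ip now %#lx\n", emulate_cli_sti(0xfa, &ip), ip);
	printf("0x90 handled: %d, ip still %#lx\n", emulate_cli_sti(0x90, &ip), ip);
	return 0;
}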
+
+/*
+ * The unprivileged ENQCMD instruction generates #GPs if the
+ * IA32_PASID MSR has not been populated. If possible, populate
+ * the MSR from a PASID previously allocated to the mm.
+ */
+static bool try_fixup_enqcmd_gp(void)
+{
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
+ u32 pasid;
+
/*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered:
+ * MSR_IA32_PASID is managed using XSAVE. Directly
+ * writing to the MSR is only possible when fpregs
+ * are valid and the fpstate is not. This is
+ * guaranteed when handling a userspace exception
+ * before interrupts are re-enabled.
*/
- reassert_nmi();
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Hardware without ENQCMD will not generate
+ * #GPs that can be fixed up here.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ return false;
+
+ /*
+ * If the mm has not been allocated a
+ * PASID, the #GP can not be fixed up.
+ */
+ if (!mm_valid_pasid(current->mm))
+ return false;
+
+ pasid = mm_get_enqcmd_pasid(current->mm);
+
+ /*
+ * Did this thread already have its PASID activated?
+ * If so, the #GP must be from something else.
+ */
+ if (current->pasid_activated)
+ return false;
+
+ wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
+ current->pasid_activated = 1;
+
+ return true;
+#else
+ return false;
#endif
}
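The fixup above boils down to one MSR write composing the PASID with a valid flag. A tiny sketch of that composition, assuming the IA32_PASID layout where the valid flag is bit 31 (how MSR_IA32_PASID_VALID is defined at the time of writing):

#include <stdint.h>
#include <stdio.h>

#define PASID_VALID	(1ULL << 31)	/* assumed MSR_IA32_PASID_VALID */

int main(void)
{
	uint32_t pasid = 42;			/* hypothetical PASID from the mm */
	uint64_t msr_val = pasid | PASID_VALID;	/* value written to IA32_PASID */

	printf("IA32_PASID = %#llx\n", (unsigned long long)msr_val);
	return 0;
}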
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str,
+ unsigned long address)
{
- nmi_enter();
+ if (fixup_exception(regs, trapnr, error_code, address))
+ return true;
- inc_irq_stat(__nmi_count);
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
- if (!ignore_nmis)
- default_do_nmi(regs);
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
- nmi_exit();
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
}
-void stop_nmi(void)
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
{
- acpi_nmi_disable();
- ignore_nmis++;
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
}
-void restart_nmi(void)
+DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
- ignore_nmis--;
- acpi_nmi_enable();
+ char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
+ enum kernel_gp_hint hint = GP_NO_HINT;
+ unsigned long gp_addr;
+
+ if (user_mode(regs) && try_fixup_enqcmd_gp())
+ return;
+
+ cond_local_irq_enable(regs);
+
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ goto exit;
+ }
+
+ if (v8086_mode(regs)) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ local_irq_disable();
+ return;
+ }
+
+ if (user_mode(regs)) {
+ if (fixup_iopl_exception(regs))
+ goto exit;
+
+ if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
+ goto exit;
+
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
+ goto exit;
+ }
+
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0))
+ goto exit;
+
+ if (error_code)
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
+ else
+ hint = get_kernel_gp_address(regs, &gp_addr);
+
+ if (hint != GP_NO_HINT)
+ snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
+ kernel_gp_hint_help[hint], gp_addr);
+
+ /*
+ * KASAN is interested only in the non-canonical case, clear it
+ * otherwise.
+ */
+ if (hint != GP_NON_CANONICAL)
+ gp_addr = 0;
+
+ die_addr(desc, regs, error_code, gp_addr);
+
+exit:
+ cond_local_irq_disable(regs);
}
-/* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+static bool do_int3(struct pt_regs *regs)
{
+ int res;
+
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
#ifdef CONFIG_KPROBES
- if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
+ if (kprobe_int3_handler(regs))
+ return true;
+#endif
+ res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
+
+ return res == NOTIFY_STOP;
+}
+NOKPROBE_SYMBOL(do_int3);
+
+static void do_int3_user(struct pt_regs *regs)
+{
+ if (do_int3(regs))
return;
-#else
- if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
+
+ cond_local_irq_enable(regs);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
+ cond_local_irq_disable(regs);
+}
+
+DEFINE_IDTENTRY_RAW(exc_int3)
+{
+ /*
+ * smp_text_poke_int3_handler() is completely self-contained code; it
+ * does (and must) *NOT* call out to anything, lest it hit upon yet
+ * another INT3.
+ */
+ if (smp_text_poke_int3_handler(regs))
return;
-#endif
- preempt_conditional_sti(regs);
- do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ /*
+ * irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
+ * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must
+ * be done before. If the entry came from kernel mode, then use
+ * nmi_enter() because the INT3 could have been hit in any context
+ * including NMI.
+ */
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
+ do_int3_user(regs);
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
+ } else {
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+ if (!do_int3(regs))
+ die("int3", regs, 0);
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
+ }
}
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
*/
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
-{
- struct pt_regs *regs = eregs;
- /* Did already sync */
- if (eregs == (struct pt_regs *)eregs->sp)
- ;
- /* Exception from user space */
- else if (user_mode(eregs))
- regs = task_pt_regs(current);
- /*
- * Exception from kernel and interrupts are enabled. Move to
- * kernel process stack.
- */
- else if (eregs->flags & X86_EFLAGS_IF)
- regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
- if (eregs != regs)
+asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+ struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1;
+ if (regs != eregs)
*regs = *eregs;
return regs;
}
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
+{
+ unsigned long sp, *stack;
+ struct stack_info info;
+ struct pt_regs *regs_ret;
+
+ /*
+ * In the SYSCALL entry path the RSP value comes from user-space - don't
+ * trust it and switch to the current kernel stack
+ */
+ if (ip_within_syscall_gap(regs)) {
+ sp = current_top_of_stack();
+ goto sync;
+ }
+
+ /*
+ * From here on the RSP value is trusted. Now check whether entry
+ * happened from a safe stack. Not safe are the entry or unknown stacks,
+ * use the fall-back stack instead in this case.
+ */
+ sp = regs->sp;
+ stack = (unsigned long *)sp;
+
+ if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
+ info.type > STACK_TYPE_EXCEPTION_LAST)
+ sp = __this_cpu_ist_top_va(VC2);
+
+sync:
+ /*
+ * Found a safe stack - switch to it as if the entry didn't happen via
+ * IST stack. The code below only copies pt_regs, the real switch happens
+ * in assembly code.
+ */
+ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
+
+ regs_ret = (struct pt_regs *)sp;
+ *regs_ret = *regs;
+
+ return regs_ret;
+}
#endif
+asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
+{
+ struct pt_regs tmp, *new_stack;
+
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
+ */
+ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /* Copy the IRET target to the temporary storage. */
+ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip));
+
+ /* Update the entry stack */
+ __memcpy(new_stack, &tmp, sizeof(tmp));
+
+ BUG_ON(!user_mode(new_stack));
+ return new_stack;
+}
+#endif
+
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+ /*
+ * We don't try for precision here. If we're anywhere in the region of
+ * code that can be single-stepped in the SYSENTER entry path, then
+ * assume that this is a useless single-step trap due to SYSENTER
+ * being invoked with TF set. (We don't know in advance exactly
+ * which instructions will be hit because BTF could plausibly
+ * be set.)
+ */
+#ifdef CONFIG_X86_32
+ return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+ (unsigned long)__end_SYSENTER_singlestep_region -
+ (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+ return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+ (unsigned long)__end_entry_SYSENTER_compat -
+ (unsigned long)entry_SYSENTER_compat;
+#else
+ return false;
+#endif
+}
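Both branches above use the classic single-compare range check: because the subtraction is unsigned, (ip - begin) wraps to a huge value when ip < begin, so one comparison covers both ends of [begin, end). A standalone sketch of the idiom:

#include <stdbool.h>
#include <stdio.h>

static bool in_region(unsigned long x, unsigned long begin, unsigned long end)
{
	return (x - begin) < (end - begin);	/* unsigned wrap handles x < begin */
}

int main(void)
{
	unsigned long begin = 0x1000, end = 0x1040;

	printf("below:    %d\n", in_region(0x0fff, begin, end));	/* 0 */
	printf("first:    %d\n", in_region(0x1000, begin, end));	/* 1 */
	printf("past end: %d\n", in_region(0x1040, begin, end));	/* 0 */
	return 0;
}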
+
+static __always_inline unsigned long debug_read_reset_dr6(void)
+{
+ unsigned long dr6;
+
+ get_debugreg(dr6, 6);
+ dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
+
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3 of DR6.
+ *
+ * BLD induced #DB clears DR6.BLD and any other debug
+ * exception doesn't modify DR6.BLD.
+ *
+ * RTM induced #DB clears DR6.RTM and any other debug
+ * exception sets DR6.RTM.
+ *
+ * To avoid confusion in identifying debug exceptions,
+ * debug handlers should set DR6.BLD and DR6.RTM, and
+ * clear other DR6 bits before returning.
+ *
+ * Keep it simple: write DR6 with its architectural reset
+ * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately.
+ */
+ set_debugreg(DR6_RESERVED, 6);
+
+ return dr6;
+}
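The XOR above normalizes DR6 so every reported event reads as 1, whatever its hardware polarity: bits inside DR6_RESERVED (0xFFFF0FF0) read as 1 when inactive, while bits outside it, such as DR_STEP (bit 14), read as 0. A small sketch of the flip:

#include <stdio.h>

#define DR6_RESERVED	0xFFFF0FF0UL	/* reads as 1 when nothing happened */
#define DR_STEP		0x4000UL	/* BS, bit 14, positive polarity */

int main(void)
{
	/* Raw DR6 after a single-step #DB: reserved pattern plus BS. */
	unsigned long raw = DR6_RESERVED | DR_STEP;
	unsigned long dr6 = raw ^ DR6_RESERVED;	/* flip to positive polarity */

	printf("dr6 = %#lx, single-step = %s\n",
	       dr6, (dr6 & DR_STEP) ? "yes" : "no");
	return 0;
}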
+
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -523,377 +1195,502 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*
* May run on IST stack.
*/
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+
+static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
{
- struct task_struct *tsk = current;
- int user_icebp = 0;
- unsigned long dr6;
- int si_code;
+ /*
+ * Notifiers will clear bits in @dr6 to indicate the event has been
+ * consumed - hw_breakpoint_handler(), single_stop_cont().
+ *
+ * Notifiers will set bits in @virtual_dr6 to indicate the desire
+ * for signals - ptrace_triggered(), kgdb_hw_overflow_handler().
+ */
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP)
+ return true;
- get_debugreg(dr6, 6);
+ return false;
+}
- /* Filter out all the reserved bits which are preset to 1 */
- dr6 &= ~DR6_RESERVED;
+static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
+{
+ /*
+ * Disable breakpoints during exception handling; recursive exceptions
+ * are exceedingly 'fun'.
+ *
+ * Since this function is NOKPROBE, and that also applies to
+ * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+ * HW_BREAKPOINT_W on our stack)
+ *
+ * Entry text is excluded for HW_BP_X, and cpu_entry_area, which
+ * includes the entry stack, is excluded for everything.
+ *
+ * For FRED, nested #DB should just work fine. But when a watchpoint or
+ * breakpoint is set in the code path which is executed by #DB handler,
+ * it results in an endless recursion and stack overflow. Thus we stay
+ * with the IDT approach, i.e., save DR7 and disable #DB.
+ */
+ unsigned long dr7 = local_db_save();
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+ instrumentation_begin();
/*
- * If dr6 has no reason to give us about the origin of this trap,
- * then it's very likely the result of an icebp/int01 trap.
- * User wants a sigtrap for that.
+ * If something gets miswired and we end up here for a user mode
+ * #DB, we will malfunction.
*/
- if (!dr6 && user_mode(regs))
- user_icebp = 1;
+ WARN_ON_ONCE(user_mode(regs));
- /* Catch kmemcheck conditions first of all! */
- if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- return;
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." PTRACE_BLOCKSTEP requested
+ * it for userspace, but we just took a kernel #DB, so re-set
+ * BTF.
+ */
+ unsigned long debugctl;
- /* DR6 may or may not be cleared by the CPU */
- set_debugreg(0, 6);
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= DEBUGCTLMSR_BTF;
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
/*
- * The processor cleared BTF, so don't mark that we need it set.
+ * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
+ * watchpoint at the same time then that will still be handled.
*/
- clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ (dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+ dr6 &= ~DR_STEP;
- /* Store the virtualized DR6 value */
- tsk->thread.debugreg6 = dr6;
-
- if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
- SIGTRAP) == NOTIFY_STOP)
- return;
-
- /* It's safe to allow irq's after DR6 has been saved */
- preempt_conditional_sti(regs);
+ /*
+ * The kernel doesn't use INT1
+ */
+ if (!dr6)
+ goto out;
- if (regs->flags & X86_VM_MASK) {
- handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, 1);
- return;
- }
+ if (notify_debug(regs, &dr6))
+ goto out;
/*
- * Single-stepping through system calls: ignore any exceptions in
- * kernel space, but re-enable TF when returning to user mode.
+ * The kernel doesn't use TF single-step outside of:
+ *
+ * - Kprobes, consumed through kprobe_debug_handler()
+ * - KGDB, consumed through notify_debug()
*
- * We already checked v86 mode above, so we can check for kernel mode
- * by just checking the CPL of CS.
+ * So if we get here with DR_STEP set, something is wonky.
+ *
+ * A known way to trigger this is through QEMU's GDB stub,
+ * which leaks #DB into the guest and causes IST recursion.
*/
- if ((dr6 & DR_STEP) && !user_mode(regs)) {
- tsk->thread.debugreg6 &= ~DR_STEP;
- set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ if (WARN_ON_ONCE(dr6 & DR_STEP))
regs->flags &= ~X86_EFLAGS_TF;
- }
- si_code = get_si_code(tsk->thread.debugreg6);
- if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
- send_sigtrap(tsk, regs, error_code, si_code);
- preempt_conditional_cli(regs);
+out:
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
- return;
+ local_db_restore(dr7);
}
-/*
- * Note that we play around with the 'TS' bit in an attempt to get
- * the correct behaviour even in the presence of the asynchronous
- * IRQ13 behaviour
- */
-void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6)
{
- struct task_struct *task = current;
- siginfo_t info;
- unsigned short err;
- char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+ bool icebp;
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
- return;
- conditional_sti(regs);
+ /*
+ * If something gets miswired and we end up here for a kernel mode
+ * #DB, we will malfunction.
+ */
+ WARN_ON_ONCE(!user_mode(regs));
- if (!user_mode_vm(regs))
- {
- if (!fixup_exception(regs)) {
- task->thread.error_code = error_code;
- task->thread.trap_no = trapnr;
- die(str, regs, error_code);
- }
- return;
- }
+ /*
+ * NB: We can't easily clear DR7 here because
+ * irqentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * user memory, etc. This means that a recursive #DB is possible. If
+ * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+ * Since we're not on the IST stack right now, everything will be
+ * fine.
+ */
+
+ irqentry_enter_from_user_mode(regs);
+ instrumentation_begin();
/*
- * Save the info for the exception handler and clear the error.
+ * Start the virtual/ptrace DR6 value with just the DR_STEP mask
+ * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
+ *
+ * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
+ * even if it is not the result of PTRACE_SINGLESTEP.
*/
- save_init_fpu(task);
- task->thread.trap_no = trapnr;
- task->thread.error_code = error_code;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_addr = (void __user *)regs->ip;
- if (trapnr == 16) {
- unsigned short cwd, swd;
- /*
- * (~cwd & swd) will mask out exceptions that are not set to unmasked
- * status. 0x3f is the exception bits in these regs, 0x200 is the
- * C1 reg you need in case of a stack fault, 0x040 is the stack
- * fault bit. We should only be taking one exception at a time,
- * so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't synchronizing its FPU usage
- * and it will suffer the consequences since we won't be able to
- * fully reproduce the context of the exception
- */
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
+ current->thread.virtual_dr6 = (dr6 & DR_STEP);
- err = swd & ~cwd;
- } else {
- /*
- * The SIMD FPU exceptions are handled a little differently, as there
- * is only a single status/control register. Thus, to determine which
- * unmasked exception was caught we must mask the exception mask bits
- * at 0x1f80, and then use these to mask the exception bits at 0x3f.
- */
- unsigned short mxcsr = get_fpu_mxcsr(task);
- err = ~(mxcsr >> 7) & mxcsr;
- }
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_thread_flag(TIF_BLOCKSTEP);
- if (err & 0x001) { /* Invalid op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- } else if (err & 0x004) { /* Divide by Zero */
- info.si_code = FPE_FLTDIV;
- } else if (err & 0x008) { /* Overflow */
- info.si_code = FPE_FLTOVF;
- } else if (err & 0x012) { /* Denormal, Underflow */
- info.si_code = FPE_FLTUND;
- } else if (err & 0x020) { /* Precision */
- info.si_code = FPE_FLTRES;
- } else {
- /*
- * If we're using IRQ 13, or supposedly even some trap 16
- * implementations, it's possible we get a spurious trap...
- */
- return; /* Spurious trap, no error */
+ /*
+ * If dr6 gives us no clue about the origin of this trap, then it's
+ * very likely the result of an icebp/int01 trap. The user wants a
+ * SIGTRAP for that.
+ */
+ icebp = !dr6;
+
+ if (notify_debug(regs, &dr6))
+ goto out;
+
+ /* It's safe to allow irq's after DR6 has been saved */
+ local_irq_enable();
+
+ if (v8086_mode(regs)) {
+ handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB);
+ goto out_irq;
}
- force_sig_info(SIGFPE, &info, task);
-}
-dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
-{
-#ifdef CONFIG_X86_32
- ignore_fpu_irq = 1;
-#endif
+ /* #DB for bus lock can only be triggered from userspace. */
+ if (dr6 & DR_BUS_LOCK)
+ handle_bus_lock(regs);
- math_error(regs, error_code, 16);
+ /* Add the virtual_dr6 bits for signals. */
+ dr6 |= current->thread.virtual_dr6;
+ if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
+ send_sigtrap(regs, 0, get_si_code(dr6));
+
+out_irq:
+ local_irq_disable();
+out:
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
}
-dotraplinkage void
-do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+#ifdef CONFIG_X86_64
+/* IST stack entry */
+DEFINE_IDTENTRY_DEBUG(exc_debug)
{
- math_error(regs, error_code, 19);
+ exc_debug_kernel(regs, debug_read_reset_dr6());
}
-dotraplinkage void
-do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+/* User entry, runs on regular task stack */
+DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
- conditional_sti(regs);
-#if 0
- /* No need to warn about this any longer. */
- printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
+ exc_debug_user(regs, debug_read_reset_dr6());
}
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+#ifdef CONFIG_X86_FRED
+/*
+ * Depending on the ring level it occurred at, i.e., user or kernel
+ * context, #DB needs to be handled on a different stack: user #DB on
+ * the current task stack, kernel #DB on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #DB dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception
+ * entry stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_DEBUG(exc_debug)
{
+ /*
+ * FRED #DB stores DR6 on the stack in the format which
+ * debug_read_reset_dr6() returns for the IDT entry points.
+ */
+ unsigned long dr6 = fred_event_data(regs);
+
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
}
+#endif /* CONFIG_X86_FRED */
-asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
+#else
+/* 32 bit does not have separate entry points. */
+DEFINE_IDTENTRY_RAW(exc_debug)
{
+ unsigned long dr6 = debug_read_reset_dr6();
+
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
}
+#endif
/*
- * __math_state_restore assumes that cr0.TS is already clear and the
- * fpu state is all ready for use. Used during context switch.
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of asynchronous
+ * IRQ13 delivery.
*/
-void __math_state_restore(void)
+static void math_error(struct pt_regs *regs, int trapnr)
{
- struct thread_info *thread = current_thread_info();
- struct task_struct *tsk = thread->task;
+ struct task_struct *task = current;
+ struct fpu *fpu = x86_task_fpu(task);
+ int si_code;
+ char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+ "simd exception";
+
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr, 0, 0))
+ goto exit;
+
+ task->thread.error_code = 0;
+ task->thread.trap_nr = trapnr;
+
+ if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
+ SIGFPE) != NOTIFY_STOP)
+ die(str, regs, 0);
+ goto exit;
+ }
/*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ * Synchronize the FPU register state to the memory register state
+ * if necessary. This allows the exception handler to inspect it.
*/
- if (unlikely(restore_fpu_checking(tsk))) {
- stts();
- force_sig(SIGSEGV, tsk);
- return;
- }
+ fpu_sync_fpstate(fpu);
+
+ task->thread.trap_nr = trapnr;
+ task->thread.error_code = 0;
+
+ si_code = fpu__exception_code(fpu, trapnr);
+ /* Retry when we get spurious exceptions: */
+ if (!si_code)
+ goto exit;
- thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
- tsk->fpu_counter++;
+ if (fixup_vdso_exception(regs, trapnr, 0, 0))
+ goto exit;
+
+ force_sig_fault(SIGFPE, si_code,
+ (void __user *)uprobe_get_trap_addr(regs));
+exit:
+ cond_local_irq_disable(regs);
}
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
- */
-asmlinkage void math_state_restore(void)
+DEFINE_IDTENTRY(exc_coprocessor_error)
{
- struct thread_info *thread = current_thread_info();
- struct task_struct *tsk = thread->task;
+ math_error(regs, X86_TRAP_MF);
+}
- if (!tsk_used_math(tsk)) {
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(tsk)) {
- /*
- * ran out of memory!
- */
- do_group_exit(SIGKILL);
+DEFINE_IDTENTRY(exc_simd_coprocessor_error)
+{
+ if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
+ /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
+ if (!static_cpu_has(X86_FEATURE_XMM)) {
+ __exc_general_protection(regs, 0);
return;
}
- local_irq_disable();
}
+ math_error(regs, X86_TRAP_XF);
+}
- clts(); /* Allow maths ops (or we recurse) */
-
- __math_state_restore();
+DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
+{
+ /*
+ * This addresses a Pentium Pro Erratum:
+ *
+ * PROBLEM: If the APIC subsystem is configured in mixed mode with
+ * Virtual Wire mode implemented through the local APIC, an
+ * interrupt vector of 0Fh (Intel reserved encoding) may be
+ * generated by the local APIC (Int 15). This vector may be
+ * generated upon receipt of a spurious interrupt (an interrupt
+ * which is removed before the system receives the INTA sequence)
+ * instead of the programmed 8259 spurious interrupt vector.
+ *
+ * IMPLICATION: The spurious interrupt vector programmed in the
+ * 8259 is normally handled by an operating system's spurious
+ * interrupt handler. However, a vector of 0Fh is unknown to some
+ * operating systems, which would crash if this erratum occurred.
+ *
+ * In theory this could be limited to 32-bit, but the handler is
+ * harmless and who knows which other CPUs suffer from this.
+ */
}
-EXPORT_SYMBOL_GPL(math_state_restore);
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
+static bool handle_xfd_event(struct pt_regs *regs)
{
- printk(KERN_EMERG
- "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n", current->comm);
- force_sig(SIGFPE, current);
- schedule();
+ u64 xfd_err;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
+ return false;
+
+ rdmsrq(MSR_IA32_XFD_ERR, xfd_err);
+ if (!xfd_err)
+ return false;
+
+ wrmsrq(MSR_IA32_XFD_ERR, 0);
+
+ /* Die if that happens in kernel space */
+ if (WARN_ON(!user_mode(regs)))
+ return false;
+
+ local_irq_enable();
+
+ err = xfd_enable_feature(xfd_err);
+
+ switch (err) {
+ case -EPERM:
+ force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs));
+ break;
+ case -EFAULT:
+ force_sig(SIGSEGV);
+ break;
+ }
+
+ local_irq_disable();
+ return true;
}
-#endif /* CONFIG_MATH_EMULATION */
-dotraplinkage void __kprobes
-do_device_not_available(struct pt_regs *regs, long error_code)
+DEFINE_IDTENTRY(exc_device_not_available)
{
-#ifdef CONFIG_X86_32
- if (read_cr0() & X86_CR0_EM) {
+ unsigned long cr0 = read_cr0();
+
+ if (handle_xfd_event(regs))
+ return;
+
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
- conditional_sti(regs);
+ cond_local_irq_enable(regs);
info.regs = regs;
math_emulate(&info);
- } else {
- math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+
+ cond_local_irq_disable(regs);
+ return;
}
-#else
- math_state_restore();
#endif
+
+ /* This should not happen. */
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, 0);
+ }
}
-#ifdef CONFIG_X86_32
-dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code,
+ unsigned long address)
{
- siginfo_t info;
- local_irq_enable();
+ if (user_mode(regs)) {
+ gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+ return;
+ }
- info.si_signo = SIGILL;
- info.si_errno = 0;
- info.si_code = ILL_BADSTK;
- info.si_addr = NULL;
- if (notify_die(DIE_TRAP, "iret exception",
- regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code,
+ VE_FAULT_STR, address)) {
return;
- do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
-}
-#endif
+ }
-/* Set of traps needed for early debugging. */
-void __init early_trap_init(void)
-{
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
- /* int3 can be called from all */
- set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
- set_intr_gate(14, &page_fault);
- load_idt(&idt_descr);
+ die_addr(VE_FAULT_STR, regs, error_code, address);
}
-void __init trap_init(void)
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ * * Specific instructions (WBINVD, for example)
+ * * Specific MSR accesses
+ * * Specific CPUID leaf accesses
+ * * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted (by BIOS or with tdx_enc_status_changed()).
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard to debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues. Returning from the exception handler with
+ * IRET will re-enable NMIs, and a nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory,
+ * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
+ * that might generate #VE. VMM can remove memory from TD at any point,
+ * but access to unaccepted (or missing) private memory leads to VM
+ * termination, not to #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
+ * the VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (double
+ * fault) is delivered to the guest, which will result in an oops.
+ *
+ * The entry code has been audited carefully to follow these expectations.
+ * Changes to the entry code have to be audited for correctness in this
+ * respect. As with #PF, a #VE in these places would expose the kernel
+ * to privilege escalation or may lead to random crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
{
- int i;
+ struct ve_info ve;
-#ifdef CONFIG_EISA
- void __iomem *p = early_ioremap(0x0FFFD9, 4);
+ /*
+ * NMIs/Machine-checks/Interrupts will be in a disabled state
+ * till TDGETVEINFO TDCALL is executed. This ensures that VE
+ * info cannot be overwritten by a nested #VE.
+ */
+ tdx_get_ve_info(&ve);
- if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
- EISA_bus = 1;
- early_iounmap(p, 4);
-#endif
+ cond_local_irq_enable(regs);
- set_intr_gate(0, &divide_error);
- set_intr_gate_ist(2, &nmi, NMI_STACK);
- /* int4 can be called from all */
- set_system_intr_gate(4, &overflow);
- set_intr_gate(5, &bounds);
- set_intr_gate(6, &invalid_op);
- set_intr_gate(7, &device_not_available);
-#ifdef CONFIG_X86_32
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
-#else
- set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
-#endif
- set_intr_gate(9, &coprocessor_segment_overrun);
- set_intr_gate(10, &invalid_TSS);
- set_intr_gate(11, &segment_not_present);
- set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(13, &general_protection);
- set_intr_gate(15, &spurious_interrupt_bug);
- set_intr_gate(16, &coprocessor_error);
- set_intr_gate(17, &alignment_check);
-#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(18, &machine_check, MCE_STACK);
-#endif
- set_intr_gate(19, &simd_coprocessor_error);
+ /*
+ * If tdx_handle_virt_exception() could not process
+ * it successfully, treat it as #GP(0) and handle it.
+ */
+ if (!tdx_handle_virt_exception(regs, &ve))
+ ve_raise_fault(regs, 0, ve.gla);
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
+ cond_local_irq_disable(regs);
+}
-#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
+DEFINE_IDTENTRY_SW(iret_error)
+{
+ local_irq_enable();
+ if (notify_die(DIE_TRAP, "iret exception", regs, 0,
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
+ ILL_BADSTK, (void __user *)NULL);
}
-
- set_system_trap_gate(SYSCALL_VECTOR, &system_call);
- set_bit(SYSCALL_VECTOR, used_vectors);
+ local_irq_disable();
+}
#endif
- /*
- * Should be a barrier for any external CPU state:
- */
- cpu_init();
+void __init trap_init(void)
+{
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
- x86_init.irqs.trap_init();
+ /* Init GHCB memory pages when running as an SEV-ES guest */
+ sev_es_init_vc_handling();
+
+ /* Initialize TSS before setting up traps so ISTs work */
+ cpu_init_exception_handling(true);
+
+ /* Setup traps as cpu_init() might #GP */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_traps();
+
+ cpu_init();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..7d3e13e14eab 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,16 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
-#include <linux/dmi.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/timex.h>
+#include <linux/static_key.h>
+#include <linux/static_call.h>
+#include <asm/cpuid/api.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
@@ -19,6 +26,14 @@
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/geode.h>
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/i8259.h>
+#include <asm/msr.h>
+#include <asm/topology.h>
+#include <asm/uv/uv.h>
+#include <asm/sev.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -26,23 +41,207 @@ EXPORT_SYMBOL(cpu_khz);
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);
+#define KHZ 1000
+
/*
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
static int __read_mostly tsc_unstable;
+static unsigned int __initdata tsc_early_khz;
+
+static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);
+
+int tsc_clocksource_reliable;
+
+static int __read_mostly tsc_force_recalibrate;
+
+static struct clocksource_base art_base_clk = {
+ .id = CSID_X86_ART,
+};
+static bool have_art;
+
+struct cyc2ns {
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_latch_t seq; /* 32 + 4 = 36 */
+
+}; /* fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+static int __init tsc_early_khz_setup(char *buf)
+{
+ return kstrtouint(buf, 0, &tsc_early_khz);
+}
+early_param("tsc_early_khz", tsc_early_khz_setup);
+
+__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
+{
+ int seq, idx;
+
+ do {
+ seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
+ idx = seq & 1;
+
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
+
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
+}
+
+__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+ preempt_disable_notrace();
+ __cyc2ns_read(data);
+}
+
+__always_inline void cyc2ns_read_end(void)
+{
+ preempt_enable_notrace();
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles (64 bits) => nanoseconds (64 bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
+ *
+ * We can use a kHz divisor instead of MHz for better precision.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data data;
+ unsigned long long ns;
+
+ __cyc2ns_read(&data);
+
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ return ns;
+}
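The comment above compresses to: precompute mul = (10^6 << shift) / khz once, then every conversion is a multiply and a shift. A userspace sketch of that arithmetic with made-up numbers (the 128-bit intermediate stands in for the kernel's mul_u64_u32_shr(); __uint128_t is a GCC/Clang extension):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t khz = 3000000;		/* assume a 3.0 GHz TSC */
	unsigned int shift = 20;	/* illustrative; the kernel computes this */
	uint32_t mul = (uint32_t)(((uint64_t)1000000 << shift) / khz);

	uint64_t cycles = 3000000000ULL;	/* one second worth of cycles */
	uint64_t ns = (uint64_t)(((__uint128_t)cycles * mul) >> shift);

	/* Prints roughly 1e9 ns; truncating mul costs only a few ppm. */
	printf("mul = %u, %llu cycles -> %llu ns\n", mul,
	       (unsigned long long)cycles, (unsigned long long)ns);
	return 0;
}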
-/* native_sched_clock() is called before tsc_init(), so
- we must start with the TSC soft disabled to prevent
- erroneous rdtsc usage on !cpu_has_tsc processors */
-static int __read_mostly tsc_disabled = -1;
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ unsigned long long ns;
+ preempt_disable_notrace();
+ ns = __cycles_2_ns(cyc);
+ preempt_enable_notrace();
+ return ns;
+}
+
+static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
+
+ ns_now = cycles_2_ns(tsc_now);
+
+ /*
+ * Compute a new multiplier as per the above comment and ensure our
+ * time function is continuous; see the comment near struct
+ * cyc2ns_data.
+ */
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
+ NSEC_PER_MSEC, 0);
+
+ /*
+ * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
+ * not expected to be greater than 31 due to the original published
+ * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
+ * value) - see the perf_event_mmap_page documentation in perf_event.h.
+ */
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
+ }
+
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+
+ write_seqcount_latch_begin(&c2n->seq);
+ c2n->data[0] = data;
+ write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
+ write_seqcount_latch_end(&c2n->seq);
+}
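The offset computed above is what keeps sched_clock() continuous across a rescale: at the TSC value sampled during the switch, the old and the new conversion produce the same nanosecond reading. A sketch of that invariant (hypothetical mul/shift values; the 128-bit multiply again stands in for mul_u64_u32_shr()):

#include <stdint.h>
#include <stdio.h>

static uint64_t conv(uint64_t cyc, uint32_t mul, unsigned int shift, uint64_t off)
{
	return off + (uint64_t)(((__uint128_t)cyc * mul) >> shift);
}

int main(void)
{
	unsigned int shift = 20;
	uint32_t old_mul = 349525;	/* ~3.0 GHz */
	uint32_t new_mul = 419430;	/* ~2.5 GHz */
	uint64_t tsc_now = 1000000000ULL;

	uint64_t ns_now = conv(tsc_now, old_mul, shift, 0);
	uint64_t new_off = ns_now - conv(tsc_now, new_mul, shift, 0);

	/* Same reading at the switchover point, so no clock jump. */
	printf("old = %llu ns, new = %llu ns\n",
	       (unsigned long long)ns_now,
	       (unsigned long long)conv(tsc_now, new_mul, shift, new_off));
	return 0;
}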
+
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (khz)
+ __set_cyc2ns_scale(khz, cpu, tsc_now);
+
+ sched_clock_idle_wakeup_event();
+ local_irq_restore(flags);
+}
+
+/*
+ * Initialize cyc2ns for boot cpu
+ */
+static void __init cyc2ns_init_boot_cpu(void)
+{
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+
+ seqcount_latch_init(&c2n->seq);
+ __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
+}
+
+/*
+ * Secondary CPUs do not run through tsc_init(), so set up
+ * all the scale factors for all CPUs, assuming the same
+ * speed as the bootup CPU.
+ */
+static void __init cyc2ns_init_secondary_cpus(void)
+{
+ unsigned int cpu, this_cpu = smp_processor_id();
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+ struct cyc2ns_data *data = c2n->data;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != this_cpu) {
+ seqcount_latch_init(&c2n->seq);
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+ c2n->data[0] = data[0];
+ c2n->data[1] = data[1];
+ }
+ }
+}
-static int tsc_clocksource_reliable;
/*
* Scheduler clock - returns current time in nanosec units.
*/
-u64 native_sched_clock(void)
+noinstr u64 native_sched_clock(void)
{
- u64 this_offset;
+ if (static_branch_likely(&__use_tsc)) {
+ u64 tsc_now = rdtsc();
+
+ /* return the value in ns */
+ return __cycles_2_ns(tsc_now);
+ }
/*
* Fall back to jiffies if there's no TSC available:
@@ -52,30 +251,46 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
- if (unlikely(tsc_disabled)) {
- /* No locking but a rare wrong value is not a big deal: */
- return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
- }
- /* read the Time Stamp Counter: */
- rdtscll(this_offset);
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+}
- /* return the value in ns */
- return __cycles_2_ns(this_offset);
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
}
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
+noinstr u64 sched_clock_noinstr(void)
{
return paravirt_sched_clock();
}
+
+bool using_native_sched_clock(void)
+{
+ return static_call_query(pv_sched_clock) == native_sched_clock;
+}
#else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
+
+bool using_native_sched_clock(void) { return true; }
#endif
+notrace u64 sched_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = sched_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
+
int check_tsc_unstable(void)
{
return tsc_unstable;
@@ -85,9 +300,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
#ifdef CONFIG_X86_TSC
int __init notsc_setup(char *str)
{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC completely.\n");
- tsc_disabled = 1;
+ mark_tsc_unstable("boot parameter notsc");
return 1;
}
#else
@@ -104,24 +317,49 @@ int __init notsc_setup(char *str)
__setup("notsc", notsc_setup);
+static int no_sched_irq_time;
+static int no_tsc_watchdog;
+static int tsc_as_watchdog;
+
static int __init tsc_setup(char *str)
{
if (!strcmp(str, "reliable"))
tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
+ if (!strcmp(str, "nowatchdog")) {
+ no_tsc_watchdog = 1;
+ if (tsc_as_watchdog)
+ pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
+ __func__);
+ tsc_as_watchdog = 0;
+ }
+ if (!strcmp(str, "recalibrate"))
+ tsc_force_recalibrate = 1;
+ if (!strcmp(str, "watchdog")) {
+ if (no_tsc_watchdog)
+ pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
+ __func__);
+ else
+ tsc_as_watchdog = 1;
+ }
return 1;
}
__setup("tsc=", tsc_setup);
-#define MAX_RETRIES 5
-#define SMI_TRESHOLD 50000
+#define MAX_RETRIES 5
+#define TSC_DEFAULT_THRESHOLD 0x20000
/*
- * Read TSC and the reference counters. Take care of SMI disturbance
+ * Read TSC and the reference counters. Take care of any disturbances
*/
static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
+ u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
int i;
for (i = 0; i < MAX_RETRIES; i++) {
@@ -131,7 +369,7 @@ static u64 tsc_read_refs(u64 *p, int hpet)
else
*p = acpi_pm_read_early();
t2 = get_cycles();
- if ((t2 - t1) < SMI_TRESHOLD)
+ if ((t2 - t1) < thresh)
return t2;
}
return ULLONG_MAX;
@@ -149,7 +387,7 @@ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
hpet2 -= hpet1;
tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
do_div(tmp, 1000000);
- do_div(deltatsc, tmp);
+ deltatsc = div64_u64(deltatsc, tmp);
return (unsigned long) deltatsc;
}
@@ -175,11 +413,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
}
#define CAL_MS 10
-#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS 1000
#define CAL2_MS 50
-#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS 5000
@@ -196,6 +434,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
unsigned long tscmin, tscmax;
int pitcnt;
+ if (!has_legacy_pic()) {
+ /*
+ * Relies on tsc_early_delay_calibrate() to have given us a
+ * semi-usable udelay(); wait for the same 50ms we would have
+ * spent in the PIT loop below.
+ */
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ return ULONG_MAX;
+ }
+
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -275,7 +527,7 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
* transition from one expected value to another with a fairly
* high accuracy, and we didn't miss any events. We can thus
* use the TSC value at the transitions to calculate a pretty
- * good value for the TSC frequencty.
+ * good value for the TSC frequency.
*/
static inline int pit_verify_msb(unsigned char val)
{
@@ -287,14 +539,15 @@ static inline int pit_verify_msb(unsigned char val)
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
int count;
- u64 tsc = 0;
+ u64 tsc = 0, prev_tsc = 0;
for (count = 0; count < 50000; count++) {
if (!pit_verify_msb(val))
break;
+ prev_tsc = tsc;
tsc = get_cycles();
}
- *deltap = get_cycles() - tsc;
+ *deltap = get_cycles() - prev_tsc;
*tscp = tsc;
/*
@@ -308,9 +561,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
* How many MSB values do we want to see? We aim for
* a maximum error rate of 500ppm (in practice the
* real error is much smaller), but refuse to spend
- * more than 25ms on it.
+ * more than 50ms on it.
*/
-#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
static unsigned long quick_pit_calibrate(void)
@@ -319,6 +572,9 @@ static unsigned long quick_pit_calibrate(void)
u64 tsc, delta;
unsigned long d1, d2;
+ if (!has_legacy_pic())
+ return 0;
+
/* Set the Gate high, disable speaker */
outb((inb(0x61) & ~0x02) | 0x01, 0x61);
@@ -350,10 +606,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
@@ -369,7 +634,7 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
- printk("Fast TSC calibration failed\n");
+ pr_info("Fast TSC calibration failed\n");
return 0;
success:
@@ -380,37 +645,124 @@ success:
*
* As a result, we can depend on there not being
* any odd delays anywhere, and the TSC reads are
- * reliable (within the error). We also adjust the
- * delta to the middle of the error bars, just
- * because it looks nicer.
+ * reliable (within the error).
*
* kHz = ticks / time-in-seconds / 1000;
* kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
* kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
*/
- delta += (long)(d2 - d1)/2;
delta *= PIT_TICK_RATE;
do_div(delta, i*256*1000);
- printk("Fast TSC calibration using PIT\n");
+ pr_info("Fast TSC calibration using PIT\n");
return delta;
}
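/*
 * Editor's sketch (assumed numbers, not part of this patch): the kHz
 * formula above with concrete values. One PIT MSB step is 256 ticks of
 * the 1193182 Hz PIT clock (~214.6 us); after i = 30 clean steps
 * (~6.44 ms) a 3.1 GHz TSC accumulates delta ~= 19960000 cycles, and
 * 19960000 * 1193182 / (30 * 256 * 1000) ~= 3100000 kHz.
 */
static unsigned long example_pit_khz(u64 delta, int i)
{
	delta *= PIT_TICK_RATE;
	do_div(delta, i * 256 * 1000);
	return delta;
}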
/**
- * native_calibrate_tsc - calibrate the tsc on boot
+ * native_calibrate_tsc - determine TSC frequency
+ * Determine TSC frequency via CPUID, else return 0.
*/
unsigned long native_calibrate_tsc(void)
{
+ unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
+ unsigned int crystal_khz;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ return 0;
+
+ eax_denominator = ebx_numerator = ecx_hz = edx = 0;
+
+ /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+ cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+
+ if (ebx_numerator == 0 || eax_denominator == 0)
+ return 0;
+
+ crystal_khz = ecx_hz / 1000;
+
+ /*
+ * Denverton SoCs don't report crystal clock, and also don't support
+ * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz
+ * crystal clock.
+ */
+ if (crystal_khz == 0 &&
+ boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
+ crystal_khz = 25000;
+
+ /*
+ * TSC frequency reported directly by CPUID is a "hardware reported"
+ * frequency and is the most accurate one we have so far. This
+ * is considered a known frequency.
+ */
+ if (crystal_khz != 0)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Some Intel SoCs like Skylake and Kabylake don't report the crystal
+ * clock, but we can easily calculate it to a high degree of accuracy
+ * by considering the crystal ratio and the CPU speed.
+ */
+ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
+ unsigned int eax_base_mhz, ebx, ecx, edx;
+
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
+ crystal_khz = eax_base_mhz * 1000 *
+ eax_denominator / ebx_numerator;
+ }
+
+ if (crystal_khz == 0)
+ return 0;
+
+ /*
+ * For Atom SoCs TSC is the only reliable clocksource.
+ * Mark the TSC reliable so that no watchdog runs on it.
+ */
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * The local APIC appears to be fed by the core crystal clock
+ * (which sounds entirely sensible). We can set the global
+ * lapic_timer_period here to avoid having to calibrate the APIC
+ * timer later.
+ */
+ lapic_timer_period = crystal_khz * 1000 / HZ;
+#endif
+
+ return crystal_khz * ebx_numerator / eax_denominator;
+}
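/*
 * Editor's sketch (assumed example values, not part of this patch):
 * the CPUID 15H math above on a hypothetical part reporting
 * EAX (denominator) = 2 and EBX (numerator) = 216 with a 24000 kHz
 * crystal: TSC = 24000 * 216 / 2 = 2592000 kHz, i.e. 2.592 GHz.
 */
static unsigned long example_cpuid_tsc_khz(unsigned int crystal_khz,
					   unsigned int num, unsigned int den)
{
	return den ? (unsigned long)crystal_khz * num / den : 0;
}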
+
+static unsigned long cpu_khz_from_cpuid(void)
+{
+ unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
+ return 0;
+
+ eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
+
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+
+ return eax_base_mhz * 1000;
+}
+
+/*
+ * Calibrate the CPU using the PIT, HPET and PM timer methods. These are
+ * available later in boot, after ACPI has been initialized.
+ */
+static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
+{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate;
+ unsigned long flags, latch, ms;
int hpet = is_hpet_enabled(), i, loopmin;
- local_irq_save(flags);
- fast_calibrate = quick_pit_calibrate();
- local_irq_restore(flags);
- if (fast_calibrate)
- return fast_calibrate;
-
/*
* Run 5 calibration loops to get the lowest frequency value
* (the best estimate). We use two different calibration modes
@@ -422,15 +774,15 @@ unsigned long native_calibrate_tsc(void)
* zero. In each wait loop iteration we read the TSC and check
* the delta to the previous read. We keep track of the min
* and max values of that delta. The delta is mostly defined
- * by the IO time of the PIT access, so we can detect when a
- * SMI/SMM disturbance happend between the two reads. If the
+ * by the IO time of the PIT access, so we can detect when
+ * any disturbance happened between the two reads. If the
* maximum time is significantly larger than the minimum time,
* then we discard the result and have another try.
*
* 2) Reference counter. If available we use the HPET or the
* PMTIMER as a reference to check the sanity of that value.
* We use separate TSC readouts and check inside of the
- * reference read for a SMI/SMM disturbance. We dicard
+ * reference read for any possible disturbance. We discard
* disturbed values here as well. We do that around the PIT
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
@@ -460,10 +812,10 @@ unsigned long native_calibrate_tsc(void)
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !ref1 && !ref2)
+ if (ref1 == ref2)
continue;
- /* Check, whether the sampling was disturbed by an SMI */
+ /* Check, whether the sampling was disturbed */
if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
continue;
@@ -486,9 +838,8 @@ unsigned long native_calibrate_tsc(void)
* use the reference value, as it is more precise.
*/
if (delta >= 90 && delta <= 110) {
- printk(KERN_INFO
- "TSC: PIT calibration matches %s. %d loops\n",
- hpet ? "HPET" : "PMTIMER", i + 1);
+ pr_info("PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
return tsc_ref_min;
}
@@ -510,38 +861,36 @@ unsigned long native_calibrate_tsc(void)
*/
if (tsc_pit_min == ULONG_MAX) {
/* PIT gave no useful value */
- printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
+ pr_warn("Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
if (!hpet && !ref1 && !ref2) {
- printk("TSC: No reference (HPET/PMTIMER) available\n");
+ pr_notice("No reference (HPET/PMTIMER) available\n");
return 0;
}
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed.\n");
+ pr_warn("HPET/PMTIMER calibration failed\n");
return 0;
}
/* Use the alternative source */
- printk(KERN_INFO "TSC: using %s reference calibration\n",
- hpet ? "HPET" : "PMTIMER");
+ pr_info("using %s reference calibration\n",
+ hpet ? "HPET" : "PMTIMER");
return tsc_ref_min;
}
/* We don't have an alternative source, use the PIT calibration value */
if (!hpet && !ref1 && !ref2) {
- printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ pr_info("Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
- "Using PIT calibration\n");
+ pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
return tsc_pit_min;
}
@@ -550,90 +899,119 @@ unsigned long native_calibrate_tsc(void)
* the PIT value as we know that there are PMTIMERs around
* running at double speed. At least we let the user know:
*/
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
- printk(KERN_INFO "TSC: Using PIT calibration value\n");
+ pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+ pr_info("Using PIT calibration value\n");
return tsc_pit_min;
}
-int recalibrate_cpu_khz(void)
+/**
+ * native_calibrate_cpu_early - calibrate the CPU early in boot
+ */
+unsigned long native_calibrate_cpu_early(void)
+{
+ unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();
+
+ if (!fast_calibrate)
+ fast_calibrate = cpu_khz_from_msr();
+ if (!fast_calibrate) {
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ }
+ return fast_calibrate;
+}
+
+
+/**
+ * native_calibrate_cpu - calibrate the cpu
+ */
+static unsigned long native_calibrate_cpu(void)
+{
+ unsigned long tsc_freq = native_calibrate_cpu_early();
+
+ if (!tsc_freq)
+ tsc_freq = pit_hpet_ptimer_calibrate_cpu();
+
+ return tsc_freq;
+}
+
+void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
unsigned long cpu_khz_old = cpu_khz;
- if (cpu_has_tsc) {
- tsc_khz = x86_platform.calibrate_tsc();
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
cpu_khz = tsc_khz;
- cpu_data(0).loops_per_jiffy =
- cpufreq_scale(cpu_data(0).loops_per_jiffy,
- cpu_khz_old, cpu_khz);
- return 0;
- } else
- return -ENODEV;
-#else
- return -ENODEV;
+ cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
+ cpu_khz_old, cpu_khz);
#endif
}
+EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);
-EXPORT_SYMBOL(recalibrate_cpu_khz);
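/*
 * Editor's sketch (illustrative, not part of this patch): the
 * "abs(cpu_khz - tsc_khz) * 10 > tsc_khz" test above rejects cpu_khz
 * when it deviates from tsc_khz by more than 10%; with
 * tsc_khz = 2400000, anything outside 2160000..2640000 is replaced by
 * tsc_khz.
 */
static bool example_deviates_over_10pct(long cpu, long tsc)
{
	return abs(cpu - tsc) * 10 > tsc;
}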
+static unsigned long long cyc2ns_suspend;
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
+void tsc_save_sched_clock_state(void)
+{
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+ return;
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
+ cyc2ns_suspend = sched_clock();
+}
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+/*
+ * Even on processors with an invariant TSC, the TSC gets reset in some of
+ * the ACPI system sleep states. And in some systems the BIOS seems to
+ * reinit the TSC to an arbitrary value (still sync'd across CPUs) during
+ * resume from such sleep states. To cope with this, recompute the
+ * cyc2ns_offset for each CPU so that sched_clock() continues from the
+ * point where it left off during suspend.
+ */
+void tsc_restore_sched_clock_state(void)
{
- unsigned long long tsc_now, ns_now, *offset;
- unsigned long flags, *scale;
+ unsigned long long offset;
+ unsigned long flags;
+ int cpu;
+
+ if (!static_branch_likely(&__use_tsc) && !sched_clock_stable())
+ return;
local_irq_save(flags);
- sched_clock_idle_sleep_event();
- scale = &per_cpu(cyc2ns, cpu);
- offset = &per_cpu(cyc2ns_offset, cpu);
+ /*
+ * We're coming out of suspend, there's no concurrency yet; don't
+ * bother being nice about the RCU stuff, just write to both
+ * data fields.
+ */
- rdtscll(tsc_now);
- ns_now = __cycles_2_ns(tsc_now);
+ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+ this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
- if (cpu_khz) {
- *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
- *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+ offset = cyc2ns_suspend - sched_clock();
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+ per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
}
- sched_clock_idle_wakeup_event(0);
local_irq_restore(flags);
}
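/*
 * Editor's sketch (assumed numbers, not part of this patch): how the
 * offset above restores continuity. If sched_clock() read 100s right
 * before suspend (cyc2ns_suspend) and reads 2s after the TSC was reset
 * on resume, then offset = 100s - 2s = 98s is written into every CPU's
 * cyc2ns_offset, so the next sched_clock() read lands back at ~100s.
 */
static u64 example_resume_offset(u64 suspend_ns, u64 resume_ns)
{
	return suspend_ns - resume_ns;	/* applied via cyc2ns_offset */
}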
#ifdef CONFIG_CPU_FREQ
-
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+/*
+ * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
* changes.
*
- * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
+ * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
+ * as unstable and give up in those cases.
*
* Should fix up last_tsc too. Currently gettimeofday in the
* first tick after the change will be slightly wrong.
@@ -647,33 +1025,29 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
- unsigned long *lpj;
- if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+ if (num_online_cpus() > 1) {
+ mark_tsc_unstable("cpufreq changes on SMP");
return 0;
-
- lpj = &boot_cpu_data.loops_per_jiffy;
-#ifdef CONFIG_SMP
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#endif
+ }
if (!ref_freq) {
ref_freq = freq->old;
- loops_per_jiffy_ref = *lpj;
+ loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
tsc_khz_ref = tsc_khz;
}
+
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
- *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+ boot_cpu_data.loops_per_jiffy =
+ cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
+ }
return 0;
}
@@ -682,9 +1056,9 @@ static struct notifier_block time_cpufreq_notifier_block = {
.notifier_call = time_cpufreq_notifier
};
-static int __init cpufreq_tsc(void)
+static int __init cpufreq_register_tsc_scaling(void)
{
- if (!cpu_has_tsc)
+ if (!boot_cpu_has(X86_FEATURE_TSC))
return 0;
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
return 0;
@@ -693,134 +1067,212 @@ static int __init cpufreq_tsc(void)
return 0;
}
-core_initcall(cpufreq_tsc);
+core_initcall(cpufreq_register_tsc_scaling);
#endif /* CONFIG_CPU_FREQ */
+#define ART_MIN_DENOMINATOR (1)
+
+/*
+ * If ART is present detect the numerator:denominator to convert to TSC
+ */
+static void __init detect_art(void)
+{
+ unsigned int unused;
+
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
+ return;
+
+ /*
+ * Don't enable ART in a VM; non-stop TSC and TSC_ADJUST are
+ * required, and TSC counter resets must not occur asynchronously.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
+ tsc_async_resets)
+ return;
+
+ cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator,
+ &art_base_clk.numerator, &art_base_clk.freq_khz, &unused);
+
+ art_base_clk.freq_khz /= KHZ;
+ if (art_base_clk.denominator < ART_MIN_DENOMINATOR)
+ return;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset);
+
+ /* Make this sticky over multiple CPU init calls */
+ setup_force_cpu_cap(X86_FEATURE_ART);
+}
+
+
/* clocksource code */
-static struct clocksource clocksource_tsc;
+static void tsc_resume(struct clocksource *cs)
+{
+ tsc_verify_tsc_adjust(true);
+}
/*
- * We compare the TSC to the cycle_last value in the clocksource
+ * We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
* is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
+ * is slightly behind. This delta is nowhere else observable, but in
* that case it results in a forward time jump in the range of hours
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
* timer.
+ *
+ * This sanity check is now done in the core timekeeping code by
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
-static cycle_t read_tsc(struct clocksource *cs)
+static u64 read_tsc(struct clocksource *cs)
{
- cycle_t ret = (cycle_t)get_cycles();
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
+ return (u64)rdtsc_ordered();
}
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
+static void tsc_cs_mark_unstable(struct clocksource *cs)
{
- cycle_t ret;
+ if (tsc_unstable)
+ return;
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
- rdtsc_barrier();
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to clocksource watchdog\n");
+}
- return ret >= __vsyscall_gtod_data.clock.cycle_last ?
- ret : __vsyscall_gtod_data.clock.cycle_last;
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
}
-#endif
-static void resume_tsc(struct clocksource *cs)
+static int tsc_cs_enable(struct clocksource *cs)
{
- clocksource_tsc.cycle_last = 0;
+ vclocks_set_used(VDSO_CLOCKMODE_TSC);
+ return 0;
}
-static struct clocksource clocksource_tsc = {
- .name = "tsc",
- .rating = 300,
- .read = read_tsc,
- .resume = resume_tsc,
- .mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
+static struct clocksource clocksource_tsc_early = {
+ .name = "tsc-early",
+ .rating = 299,
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
- .vread = vread_tsc,
-#endif
+ .id = CSID_X86_TSC_EARLY,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TSC,
+ .enable = tsc_cs_enable,
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc_early.list),
+};
+
+/*
+ * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
+ * this one will immediately take over. We will only register if the TSC
+ * has been found good.
+ */
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_VALID_FOR_HRES |
+ CLOCK_SOURCE_MUST_VERIFY |
+ CLOCK_SOURCE_VERIFY_PERCPU,
+ .id = CSID_X86_TSC,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TSC,
+ .enable = tsc_cs_enable,
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc.list),
};
void mark_tsc_unstable(char *reason)
{
- if (!tsc_unstable) {
- tsc_unstable = 1;
- sched_clock_stable = 0;
- printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
- /* Change only the rating, when not registered */
- if (clocksource_tsc.mult)
- clocksource_mark_unstable(&clocksource_tsc);
- else {
- clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
- clocksource_tsc.rating = 0;
- }
- }
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to %s\n", reason);
+
+ clocksource_mark_unstable(&clocksource_tsc_early);
+ clocksource_mark_unstable(&clocksource_tsc);
}
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
+static void __init tsc_disable_clocksource_watchdog(void)
{
- printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
- d->ident);
- tsc_unstable = 1;
- return 0;
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
- {
- .callback = dmi_mark_tsc_unstable,
- .ident = "IBM Thinkpad 380XD",
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
- DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
- },
- },
- {}
-};
+bool tsc_clocksource_watchdog_disabled(void)
+{
+ return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
+ tsc_as_watchdog && !no_tsc_watchdog;
+}
static void __init check_system_tsc_reliable(void)
{
-#ifdef CONFIG_MGEODE_LX
- /* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+ if (is_geode_lx()) {
+ /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
- unsigned long res_low, res_high;
+ unsigned long res_low, res_high;
- rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- /* Geode_LX - the OLPC CPU has a very reliable TSC */
- if (res_low & RTSC_SUSP)
- tsc_clocksource_reliable = 1;
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
+ if (res_low & RTSC_SUSP)
+ tsc_clocksource_reliable = 1;
+ }
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows detecting even minimal
+ * modifications
+ * - not more than four packages
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ topology_max_packages() <= 4)
+ tsc_disable_clocksource_watchdog();
}
/*
* Make an educated guess if the TSC is trustworthy and synchronized
* over all CPUs.
*/
-__cpuinit int unsynchronized_tsc(void)
+int unsynchronized_tsc(void)
{
- if (!cpu_has_tsc || tsc_unstable)
+ if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
return 1;
#ifdef CONFIG_SMP
@@ -830,140 +1282,325 @@ __cpuinit int unsynchronized_tsc(void)
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
return 0;
+
+ if (tsc_clocksource_reliable)
+ return 0;
/*
* Intel systems are normally all synchronized.
* Exceptions must mark TSC as unstable:
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
- tsc_unstable = 1;
+ if (topology_max_packages() > 1)
+ return 1;
}
- return tsc_unstable;
+ return 0;
}
-static void __init init_tsc_clocksource(void)
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
+ * This function uses delayed work over a period of a
+ * second to further refine the TSC frequency value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc),
+ * or the refined calibration is off by more than 1% from the fast
+ * early calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
{
- clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
- clocksource_tsc.shift);
- if (tsc_clocksource_reliable)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
- /* lower the rating if we already know its unstable: */
- if (check_tsc_unstable()) {
- clocksource_tsc.rating = 0;
- clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
+ static u64 tsc_start = ULLONG_MAX, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+ int cpu;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (tsc_unstable)
+ goto unreg;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ */
+ if (tsc_start == ULLONG_MAX) {
+restart:
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ return;
}
- clocksource_register(&clocksource_tsc);
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (ref_start == ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed */
+ if (tsc_stop == ULLONG_MAX)
+ goto restart;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Will hit this only if tsc_force_recalibrate has been set */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+
+ /* Warn if the deviation exceeds 500 ppm */
+ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
+ pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
+ pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+ }
+
+ pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
+ hpet ? "HPET" : "PM_TIMER",
+ (unsigned long)freq / 1000,
+ (unsigned long)freq % 1000);
+
+ return;
+ }
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+ /* Inform the TSC deadline clockevent devices about the recalibration */
+ lapic_update_tsc_freq();
+
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
+out:
+ if (tsc_unstable)
+ goto unreg;
+
+ if (boot_cpu_has(X86_FEATURE_ART)) {
+ have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
+ clocksource_unregister(&clocksource_tsc_early);
}
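/*
 * Editor's sketch (illustrative, not part of this patch): the ">> 11"
 * bound used above is a cheap ~500 ppm check, since 1/2048 ~= 488 ppm.
 * For tsc_khz = 2000000 the allowed deviation is 2000000 >> 11 =
 * 976 kHz before the vendor warning fires.
 */
static bool example_over_500ppm(long freq_khz, long ref_khz)
{
	return abs(freq_khz - ref_khz) > (ref_khz >> 11);
}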
-#ifdef CONFIG_X86_64
+
+static int __init init_tsc_clocksource(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
+ return 0;
+
+ if (tsc_unstable) {
+ clocksource_unregister(&clocksource_tsc_early);
+ return 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
+ clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
+
+ /*
+ * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+ * the refined calibration and directly register it as a clocksource.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+ if (boot_cpu_has(X86_FEATURE_ART)) {
+ have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ clocksource_unregister(&clocksource_tsc_early);
+
+ if (!tsc_force_recalibrate)
+ return 0;
+ }
+
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
+}
/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
+ * We use device_initcall here to ensure we run after the HPET
+ * is fully initialized, which may occur at fs_initcall time.
*/
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
- int tsc_start, tsc_now;
- int i, no_ctr_free;
- unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
- unsigned long flags;
+device_initcall(init_tsc_clocksource);
- for (i = 0; i < 4; i++)
- if (avail_to_resrv_perfctr_nmi_bit(i))
- break;
- no_ctr_free = (i == 4);
- if (no_ctr_free) {
- WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
- "cpu_khz value may be incorrect.\n");
- i = 3;
- rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- rdmsrl(MSR_K7_PERFCTR3, pmc3);
+static bool __init determine_cpu_tsc_frequencies(bool early)
+{
+ /* Make sure that cpu and tsc are not already calibrated */
+ WARN_ON(cpu_khz || tsc_khz);
+
+ if (early) {
+ cpu_khz = x86_platform.calibrate_cpu();
+ if (tsc_early_khz) {
+ tsc_khz = tsc_early_khz;
+ } else {
+ tsc_khz = x86_platform.calibrate_tsc();
+ clocksource_tsc.freq_khz = tsc_khz;
+ }
} else {
- reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ /* We should not be here with non-native cpu calibration */
+ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
+ cpu_khz = pit_hpet_ptimer_calibrate_cpu();
}
- local_irq_save(flags);
- /* start measuring cycles, incrementing from 0 */
- wrmsrl(MSR_K7_PERFCTR0 + i, 0);
- wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
- rdtscl(tsc_start);
- do {
- rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
- tsc_now = get_cycles();
- } while ((tsc_now - tsc_start) < TICK_COUNT);
- local_irq_restore(flags);
- if (no_ctr_free) {
- wrmsrl(MSR_K7_EVNTSEL3, 0);
- wrmsrl(MSR_K7_PERFCTR3, pmc3);
- wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
- } else {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ /*
+ * Trust a non-zero tsc_khz as authoritative,
+ * and use it to sanity check cpu_khz,
+ * which will be off if the system timer is off.
+ */
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+ cpu_khz = tsc_khz;
+
+ if (tsc_khz == 0)
+ return false;
+
+ pr_info("Detected %lu.%03lu MHz processor\n",
+ (unsigned long)cpu_khz / KHZ,
+ (unsigned long)cpu_khz % KHZ);
+
+ if (cpu_khz != tsc_khz) {
+ pr_info("Detected %lu.%03lu MHz TSC",
+ (unsigned long)tsc_khz / KHZ,
+ (unsigned long)tsc_khz % KHZ);
}
+ return true;
+}
+
+static unsigned long __init get_loops_per_jiffy(void)
+{
+ u64 lpj = (u64)tsc_khz * KHZ;
- return pmc_now * tsc_khz / (tsc_now - tsc_start);
+ do_div(lpj, HZ);
+ return lpj;
}
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
-void __init tsc_init(void)
+static void __init tsc_enable_sched_clock(void)
{
- u64 lpj;
- int cpu;
+ loops_per_jiffy = get_loops_per_jiffy();
+ use_tsc_delay();
- x86_init.timers.tsc_pre_init();
+ /* Sanitize TSC ADJUST before cyc2ns gets initialized */
+ tsc_store_and_check_tsc_adjust(true);
+ cyc2ns_init_boot_cpu();
+ static_branch_enable(&__use_tsc);
+}
- if (!cpu_has_tsc)
+void __init tsc_early_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+ /* Don't change UV TSC multi-chassis synchronization */
+ if (is_early_uv_system())
return;
- tsc_khz = x86_platform.calibrate_tsc();
- cpu_khz = tsc_khz;
+ snp_secure_tsc_init();
- if (!tsc_khz) {
- mark_tsc_unstable("could not calculate TSC khz");
+ if (!determine_cpu_tsc_frequencies(true))
return;
- }
-
- if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
- cpu_khz = calibrate_cpu();
+ tsc_enable_sched_clock();
+}
- printk("Detected %lu.%03lu MHz processor.\n",
- (unsigned long)cpu_khz / 1000,
- (unsigned long)cpu_khz % 1000);
+void __init tsc_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
/*
- * Secondary CPUs do not run through tsc_init(), so set up
- * all the scale factors for all CPUs, assuming the same
- * speed as the bootup CPU. (cpufreq notifiers will fix this
- * up if their speed diverges)
+ * native_calibrate_cpu_early can only calibrate using methods that are
+ * available early in boot.
*/
- for_each_possible_cpu(cpu)
- set_cyc2ns_scale(cpu_khz, cpu);
+ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
+ x86_platform.calibrate_cpu = native_calibrate_cpu;
- if (tsc_disabled > 0)
- return;
+ if (!tsc_khz) {
+ /* We failed to determine frequencies earlier, try again */
+ if (!determine_cpu_tsc_frequencies(false)) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+ tsc_enable_sched_clock();
+ }
- /* now allow native_sched_clock() to use rdtsc */
- tsc_disabled = 0;
+ cyc2ns_init_secondary_cpus();
- lpj = ((u64)tsc_khz * 1000);
- do_div(lpj, HZ);
- lpj_fine = lpj;
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
- use_tsc_delay();
- /* Check and install the TSC clocksource */
- dmi_check_system(bad_tsc_dmi_table);
+ lpj_fine = get_loops_per_jiffy();
- if (unsynchronized_tsc())
+ check_system_tsc_reliable();
+
+ if (unsynchronized_tsc()) {
mark_tsc_unstable("TSCs unsynchronized");
+ return;
+ }
- check_system_tsc_reliable();
- init_tsc_clocksource();
+ if (tsc_clocksource_reliable || no_tsc_watchdog)
+ tsc_disable_clocksource_watchdog();
+
+ clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
+ detect_art();
}
+#ifdef CONFIG_SMP
+/*
+ * Check whether existing calibration data can be reused.
+ */
+unsigned long calibrate_delay_is_known(void)
+{
+ int sibling, cpu = smp_processor_id();
+ int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
+ const struct cpumask *mask = topology_core_cpumask(cpu);
+
+ /*
+ * If TSC has constant frequency and TSC is synchronized across
+ * sockets then reuse CPU0 calibration.
+ */
+ if (constant_tsc && !tsc_unstable)
+ return cpu_data(0).loops_per_jiffy;
+
+ /*
+ * If TSC has constant frequency and TSC is not synchronized across
+ * sockets and this is not the first CPU in the socket, then reuse
+ * the calibration value of an already online CPU on that socket.
+ *
+ * This assumes that CONSTANT_TSC is consistent for all CPUs in a
+ * socket.
+ */
+ if (!constant_tsc || !mask)
+ return 0;
+
+ sibling = cpumask_any_but(mask, cpu);
+ if (sibling < nr_cpu_ids)
+ return cpu_data(sibling).loops_per_jiffy;
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000000..48e6cc1cb017
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TSC frequency enumeration via MSR
+ *
+ * Copyright (C) 2013, 2018 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/thread_info.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include <asm/param.h>
+#include <asm/tsc.h>
+
+#define MAX_NUM_FREQS 16 /* 4 bits to select the frequency */
+
+/*
+ * The frequency numbers in the SDM are e.g. 83.3 MHz, which do not carry a
+ * lot of accuracy, and that leads to clock drift. As far as we know Bay Trail
+ * SoCs use a 25 MHz crystal and Cherry Trail uses a 19.2 MHz crystal; the
+ * crystal is the source clock for a root PLL which outputs 1600 and 100 MHz.
+ * It is unclear if the root PLL outputs are used directly by the CPU clock
+ * PLL or if there is another PLL in between.
+ * This does not matter though; we can model the chain of PLLs as a single PLL
+ * with a quotient equal to the quotients of all PLLs in the chain multiplied.
+ * So we can create a simplified model of the CPU clock setup using a reference
+ * clock of 100 MHz plus a quotient which gets us as close to the frequency
+ * from the SDM as possible.
+ * For the 83.3 MHz example from above this would give us 100 MHz * 5 / 6 =
+ * 83 and 1/3 MHz, which matches exactly what has been measured on actual hw.
+ */
+#define TSC_REFERENCE_KHZ 100000
+
+struct muldiv {
+ u32 multiplier;
+ u32 divider;
+};
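/*
 * Editor's sketch (illustrative, not part of this patch): the
 * simplified PLL model above turns an FSB table entry into a bus
 * clock, e.g. { 5, 6 } -> DIV_ROUND_CLOSEST(100000 * 5, 6) =
 * 83333 kHz, matching the 83.3333 MHz entries in the tables below.
 */
static inline u32 example_bus_khz(const struct muldiv *md)
{
	if (!md->divider)
		return 0;
	return DIV_ROUND_CLOSEST(TSC_REFERENCE_KHZ * md->multiplier,
				 md->divider);
}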
+
+/*
+ * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant with this,
+ * so we need to manually differentiate between SoC families. This is
+ * what the field use_msr_plat does.
+ */
+struct freq_desc {
+ bool use_msr_plat;
+ struct muldiv muldiv[MAX_NUM_FREQS];
+ /*
+ * Some CPU frequencies in the SDM do not map to known PLL freqs, in
+ * that case the muldiv array is empty and the freqs array is used.
+ */
+ u32 freqs[MAX_NUM_FREQS];
+ u32 mask;
+};
+
+/*
+ * Penwell and Clovertrail use a spread spectrum clock,
+ * so the frequency number is not exactly the same as the
+ * one reported by the MSR according to the SDM.
+ */
+static const struct freq_desc freq_desc_pnw = {
+ .use_msr_plat = false,
+ .freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 },
+ .mask = 0x07,
+};
+
+static const struct freq_desc freq_desc_clv = {
+ .use_msr_plat = false,
+ .freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 },
+ .mask = 0x07,
+};
+
+/*
+ * Bay Trail SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 000: 100 * 5 / 6 = 83.3333 MHz
+ * 001: 100 * 1 / 1 = 100.0000 MHz
+ * 010: 100 * 4 / 3 = 133.3333 MHz
+ * 011: 100 * 7 / 6 = 116.6667 MHz
+ * 100: 100 * 4 / 5 = 80.0000 MHz
+ */
+static const struct freq_desc freq_desc_byt = {
+ .use_msr_plat = true,
+ .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 },
+ { 4, 5 } },
+ .mask = 0x07,
+};
+
+/*
+ * Cherry Trail SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 0000: 100 * 5 / 6 = 83.3333 MHz
+ * 0001: 100 * 1 / 1 = 100.0000 MHz
+ * 0010: 100 * 4 / 3 = 133.3333 MHz
+ * 0011: 100 * 7 / 6 = 116.6667 MHz
+ * 0100: 100 * 4 / 5 = 80.0000 MHz
+ * 0101: 100 * 14 / 15 = 93.3333 MHz
+ * 0110: 100 * 9 / 10 = 90.0000 MHz
+ * 0111: 100 * 8 / 9 = 88.8889 MHz
+ * 1000: 100 * 7 / 8 = 87.5000 MHz
+ */
+static const struct freq_desc freq_desc_cht = {
+ .use_msr_plat = true,
+ .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 },
+ { 4, 5 }, { 14, 15 }, { 9, 10 }, { 8, 9 },
+ { 7, 8 } },
+ .mask = 0x0f,
+};
+
+/*
+ * Merriefield SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 0001: 100 * 1 / 1 = 100.0000 MHz
+ * 0010: 100 * 4 / 3 = 133.3333 MHz
+ */
+static const struct freq_desc freq_desc_tng = {
+ .use_msr_plat = true,
+ .muldiv = { { 0, 0 }, { 1, 1 }, { 4, 3 } },
+ .mask = 0x07,
+};
+
+/*
+ * Moorefield SDM MSR_FSB_FREQ frequencies simplified PLL model:
+ * 0000: 100 * 5 / 6 = 83.3333 MHz
+ * 0001: 100 * 1 / 1 = 100.0000 MHz
+ * 0010: 100 * 4 / 3 = 133.3333 MHz
+ * 0011: 100 * 1 / 1 = 100.0000 MHz
+ */
+static const struct freq_desc freq_desc_ann = {
+ .use_msr_plat = true,
+ .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 1, 1 } },
+ .mask = 0x0f,
+};
+
+/*
+ * 24 MHz crystal? : 24 * 13 / 4 = 78 MHz
+ * The frequency step for the Lightning Mountain SoC is fixed at 78 MHz,
+ * so all the frequency entries are 78000.
+ */
+static const struct freq_desc freq_desc_lgm = {
+ .use_msr_plat = true,
+ .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000,
+ 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 },
+ .mask = 0x0f,
+};
+
+static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
+ {}
+};
+
+/*
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
+ *
+ * Set the global "lapic_timer_period" to bus_clock_cycles/jiffy.
+ * Return the processor base frequency in kHz, or 0 on failure.
+ */
+unsigned long cpu_khz_from_msr(void)
+{
+ u32 lo, hi, ratio, freq, tscref;
+ const struct freq_desc *freq_desc;
+ const struct x86_cpu_id *id;
+ const struct muldiv *md;
+ unsigned long res;
+ int index;
+
+ id = x86_match_cpu(tsc_msr_cpu_ids);
+ if (!id)
+ return 0;
+
+ freq_desc = (struct freq_desc *)id->driver_data;
+ if (freq_desc->use_msr_plat) {
+ rdmsr(MSR_PLATFORM_INFO, lo, hi);
+ ratio = (lo >> 8) & 0xff;
+ } else {
+ rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+ ratio = (hi >> 8) & 0x1f;
+ }
+
+ /* Get FSB FREQ ID */
+ rdmsr(MSR_FSB_FREQ, lo, hi);
+ index = lo & freq_desc->mask;
+ md = &freq_desc->muldiv[index];
+
+ /*
+ * Note this also catches cases where the index points to an unpopulated
+ * part of muldiv; in that case the else branch sets freq and res to 0.
+ */
+ if (md->divider) {
+ tscref = TSC_REFERENCE_KHZ * md->multiplier;
+ freq = DIV_ROUND_CLOSEST(tscref, md->divider);
+ /*
+ * Multiplying by ratio before the division has better
+ * accuracy than just calculating freq * ratio.
+ */
+ res = DIV_ROUND_CLOSEST(tscref * ratio, md->divider);
+ } else {
+ freq = freq_desc->freqs[index];
+ res = freq * ratio;
+ }
+
+ if (freq == 0)
+ pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ lapic_timer_period = (freq * 1000) / HZ;
+#endif
+
+ /*
+ * TSC frequency determined by MSR is always considered "known"
+ * because it is reported by HW.
+ * Another fact is that on MSR-capable platforms the PIT/HPET is
+ * generally not available, so calibration won't work at all.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Unfortunately there is no way for hardware to tell whether the
+ * TSC is reliable. We were told by the silicon design team that the
+ * TSC on Atom SoCs is always "reliable". The TSC is also the only
+ * reliable clocksource on these SoCs (the HPET is either not present
+ * or not functional), so mark the TSC reliable, which removes the
+ * requirement for a watchdog clocksource.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+ return res;
+}
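/*
 * Editor's sketch (assumed values, not part of this patch): the math
 * above on a hypothetical Bay Trail part with FSB index 2
 * ({ 4, 3 } -> 133.333 MHz bus) and a maximum ratio of 16:
 * res = DIV_ROUND_CLOSEST(100000 * 4 * 16, 3) = 2133333 kHz,
 * i.e. roughly 2.133 GHz.
 */
static unsigned long example_msr_khz(u32 mult, u32 div, u32 ratio)
{
	u32 tscref = TSC_REFERENCE_KHZ * mult;

	return DIV_ROUND_CLOSEST(tscref * ratio, div);
}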
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0aa5fed8b9e6..ec3aa340d351 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* check TSC synchronization.
*
@@ -14,47 +15,264 @@
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
+#include <linux/workqueue.h>
+#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
+#include <asm/msr.h>
#include <asm/tsc.h>
+struct tsc_adjust {
+ s64 bootval;
+ s64 adjusted;
+ unsigned long nextcheck;
+ bool warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
+
+/*
+ * TSCs on different sockets may be reset asynchronously.
+ * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
+ */
+bool __read_mostly tsc_async_resets;
+
+void mark_tsc_async_resets(char *reason)
+{
+ if (tsc_async_resets)
+ return;
+ tsc_async_resets = true;
+ pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
+}
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+ struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+ s64 curval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return;
+
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return;
+
+ /* Rate limit the MSR check */
+ if (!resume && time_before(jiffies, adj->nextcheck))
+ return;
+
+ adj->nextcheck = jiffies + HZ;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, curval);
+ if (adj->adjusted == curval)
+ return;
+
+ /* Restore the original value */
+ wrmsrq(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+ if (!adj->warned || resume) {
+ pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+ smp_processor_id(), adj->adjusted, curval);
+ adj->warned = true;
+ }
+}
+
+/*
+ * Normally tsc_sync will be checked every time the system enters idle
+ * state, but there is still the caveat that a system may never enter
+ * idle, either because it's too busy or purposely configured not to
+ * enter idle.
+ *
+ * So set up a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+ unsigned int cpu, bool bootcpu)
+{
+ /*
+ * First online CPU in a package stores the boot value in the
+ * adjustment value. This value might change later via the sync
+ * mechanism. If that fails we can still yell about boot values not
+ * being consistent.
+ *
+ * On the boot CPU we just force-set the ADJUST value to 0 if it's
+ * non-zero. We don't do that on non-boot CPUs because physical
+ * hotplug should have set the ADJUST register to a value > 0 so
+ * the TSC is in sync with the already running CPUs.
+ *
+ * Also don't force the ADJUST value to zero if that is a valid value
+ * for socket 0 as determined by the system arch. This is required
+ * when multiple sockets are reset asynchronously with each other
+ * and socket 0 may not have a TSC ADJUST value of 0.
+ */
+ if (bootcpu && bootval != 0) {
+ if (likely(!tsc_async_resets)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
+ cpu, bootval);
+ wrmsrq(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ } else {
+ pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
+ cpu, bootval);
+ }
+ }
+ cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return false;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
+ return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int refcpu, cpu = smp_processor_id();
+ struct cpumask *mask;
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrq(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ cur->warned = false;
+
+ /*
+ * The default adjust value cannot be assumed to be zero on any socket.
+ */
+ cur->adjusted = bootval;
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different. When called on the
+ * boot CPU topology_core_cpumask() might not be available yet.
+ */
+ mask = topology_core_cpumask(cpu);
+ refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
+
+ if (refcpu >= nr_cpu_ids) {
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
+ bootcpu);
+ return false;
+ }
+
+ ref = per_cpu_ptr(&tsc_adjust, refcpu);
+ /*
+ * Compare the boot value and complain if it differs in the
+ * package.
+ */
+ if (bootval != ref->bootval)
+ printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
+
+ /*
+ * The TSC_ADJUST values in a package must be the same. If the boot
+ * value on this newly upcoming CPU differs from the adjustment
+ * value of the already online CPU in this package, set it to that
+ * adjusted value.
+ */
+ if (bootval != ref->adjusted) {
+ cur->adjusted = ref->adjusted;
+ wrmsrq(MSR_IA32_TSC_ADJUST, ref->adjusted);
+ }
+ /*
+ * We have the TSCs forced to be in sync on this package. Skip sync
+ * test:
+ */
+ return true;
+}
+
/*
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
-static __cpuinitdata atomic_t start_count;
-static __cpuinitdata atomic_t stop_count;
+static atomic_t start_count;
+static atomic_t stop_count;
+static atomic_t test_runs;
/*
* We use a raw spinlock in this exceptional case, because
* we want to have the fastest, inlined, non-debug version
* of a critical section, to be able to prove TSC time-warps:
*/
-static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-static __cpuinitdata cycles_t last_tsc;
-static __cpuinitdata cycles_t max_warp;
-static __cpuinitdata int nr_warps;
+static cycles_t last_tsc;
+static cycles_t max_warp;
+static int nr_warps;
+static int random_warps;
/*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs. This is not called
+ * if there is no TSC.
*/
-static __cpuinit void check_tsc_warp(void)
+static cycles_t check_tsc_warp(unsigned int timeout)
{
- cycles_t start, now, prev, end;
- int i;
+ cycles_t start, now, prev, end, cur_max_warp = 0;
+ int i, cur_warps = 0;
- rdtsc_barrier();
- start = get_cycles();
- rdtsc_barrier();
+ start = rdtsc_ordered();
/*
- * The measurement runs for 20 msecs:
+ * The measurement runs for 'timeout' msecs:
*/
- end = start + tsc_khz * 20ULL;
- now = start;
+ end = start + (cycles_t) tsc_khz * timeout;
for (i = 0; ; i++) {
/*
@@ -64,9 +282,7 @@ static __cpuinit void check_tsc_warp(void)
*/
arch_spin_lock(&sync_lock);
prev = last_tsc;
- rdtsc_barrier();
- now = get_cycles();
- rdtsc_barrier();
+ now = rdtsc_ordered();
last_tsc = now;
arch_spin_unlock(&sync_lock);
@@ -89,72 +305,111 @@ static __cpuinit void check_tsc_warp(void)
if (unlikely(prev > now)) {
arch_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
+ cur_max_warp = max_warp;
+ /*
+ * Check whether this bounces back and forth. Only
+ * one CPU should observe time going backwards.
+ */
+ if (cur_warps != nr_warps)
+ random_warps++;
nr_warps++;
+ cur_warps = nr_warps;
arch_spin_unlock(&sync_lock);
}
}
WARN(!(now-start),
"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
now-start, end-start);
+ return cur_max_warp;
}
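/*
 * Editor's sketch of the invariant checked above (illustrative, not
 * part of this patch): updates of last_tsc are serialized by
 * sync_lock, so with synchronized TSCs every locked reader must see
 * now >= prev; prev > now means time warped by (prev - now) cycles.
 */
static inline cycles_t example_warp(cycles_t prev, cycles_t now)
{
	return prev > now ? prev - now : 0;
}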
/*
- * Source CPU calls into this - it waits for the freshly booted
- * target CPU to arrive and then starts the measurement:
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the BIOS, a TSC sync test of smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket don't get reset at the same time.
*/
-void __cpuinit check_tsc_sync_source(int cpu)
+static inline unsigned int loop_timeout(int cpu)
{
- int cpus = 2;
+ return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
+}
- /*
- * No need to check if we already know that the TSC is not
- * synchronized:
- */
- if (unsynchronized_tsc())
- return;
+static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
+{
+ mark_tsc_unstable("check_tsc_sync_source failed");
+}
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
- pr_info(
- "Skipped synchronization checks as TSC is reliable.\n");
- return;
- }
+static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);
- /*
- * Reset it - in case this is a second bootup:
- */
- atomic_set(&stop_count, 0);
+/*
+ * The freshly booted CPU initiates this via an async SMP function call.
+ */
+static void check_tsc_sync_source(void *__cpu)
+{
+ unsigned int cpu = (unsigned long)__cpu;
+ int cpus = 2;
/*
- * Wait for the target to arrive:
+ * Set the maximum number of test runs to
+ * 1 if the CPU does not provide the TSC_ADJUST MSR
+ * 3 if the MSR is available, so the target can try to adjust
*/
- while (atomic_read(&start_count) != cpus-1)
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ atomic_set(&test_runs, 1);
+ else
+ atomic_set(&test_runs, 3);
+retry:
+ /* Wait for the target to start. */
+ while (atomic_read(&start_count) != cpus - 1)
cpu_relax();
+
/*
* Trigger the target to continue into the measurement too:
*/
atomic_inc(&start_count);
- check_tsc_warp();
+ check_tsc_warp(loop_timeout(cpu));
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
- if (nr_warps) {
- pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ /*
+ * If the test was successful set the number of runs to zero and
+ * stop. If not, decrement the number of runs and check if we can
+ * retry. In case of random warps no retry is attempted.
+ */
+ if (!nr_warps) {
+ atomic_set(&test_runs, 0);
+
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n",
smp_processor_id(), cpu);
- pr_warning("Measured %Ld cycles TSC warp between CPUs, "
- "turning off TSC clock.\n", max_warp);
- mark_tsc_unstable("check_tsc_sync_source failed");
- } else {
- pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+
+ } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ /* Force it to 0 if random warps brought us here */
+ atomic_set(&test_runs, 0);
+
+ pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n",
smp_processor_id(), cpu);
+ pr_warn("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
+ if (random_warps)
+ pr_warn("TSC warped randomly between CPUs\n");
+ schedule_work(&tsc_sync_work);
}
/*
* Reset it - just in case we boot another CPU later:
*/
atomic_set(&start_count, 0);
+ random_warps = 0;
nr_warps = 0;
max_warp = 0;
last_tsc = 0;
@@ -163,18 +418,44 @@ void __cpuinit check_tsc_sync_source(int cpu)
* Let the target continue with the bootup:
*/
atomic_inc(&stop_count);
+
+ /*
+ * Retry, if there is a chance to do so.
+ */
+ if (atomic_read(&test_runs) > 0)
+ goto retry;
}
/*
* Freshly booted CPUs call into this:
*/
-void __cpuinit check_tsc_sync_target(void)
+void check_tsc_sync_target(void)
{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int cpu = smp_processor_id();
+ cycles_t cur_max_warp, gbl_max_warp;
int cpus = 2;
- if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ /* Also aborts if there is no TSC. */
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Store, verify and sanitize the TSC adjust register. If
+ * successful skip the test.
+ *
+ * The test is also skipped when the TSC is marked reliable. This
+ * is true for SoCs which have no fallback clocksource. On these
+ * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
+ * register might have been wrecked by the BIOS.
+ */
+ if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
return;
+ /* Kick the control CPU into the TSC synchronization function */
+ smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source,
+ (unsigned long *)(unsigned long)cpu, 0);
+retry:
/*
* Register this CPU's participation and wait for the
* source CPU to start the measurement:
@@ -183,7 +464,12 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
- check_tsc_warp();
+ cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+ /*
+ * Store the maximum observed warp value for a potential retry:
+ */
+ gbl_max_warp = max_warp;
/*
* Ok, we are done:
@@ -195,4 +481,47 @@ void __cpuinit check_tsc_sync_target(void)
*/
while (atomic_read(&stop_count) != cpus)
cpu_relax();
+
+ /*
+ * Reset it for the next sync test:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Check the number of remaining test runs. If not zero, the test
+ * failed and a retry with adjusted TSC is possible. If zero the
+ * test was either successful or failed terminally.
+ */
+ if (!atomic_read(&test_runs))
+ return;
+
+ /*
+ * If the warp value of this CPU is 0, then the other CPU
+ * observed time going backwards so this TSC was ahead and
+ * needs to move backwards.
+ */
+ if (!cur_max_warp)
+ cur_max_warp = -gbl_max_warp;
+
+ /*
+ * Add the result to the previous adjustment value.
+ *
+ * The adjustment value is slightly off by the overhead of the
+ * sync mechanism (observed values are ~200 TSC cycles), but this
+ * really depends on CPU, node distance and frequency. So
+ * compensating for this is hard to get right. Experiments show
+ * that the warp is no longer detectable when the observed warp
+ * value is used. In the worst case the adjustment needs to go
+ * through a 3rd run for fine tuning.
+ */
+ cur->adjusted += cur_max_warp;
+
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
+
+ wrmsrq(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ goto retry;
+
}
+
+#endif /* CONFIG_SMP */
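
The sign convention in check_tsc_sync_target() above deserves a worked example: only the lagging CPU ever measures a positive warp itself, so a CPU that measured none must be the one running ahead and derives its correction by negating the globally observed maximum. A standalone sketch, with tsc_adjust_delta() as a hypothetical helper name:

#include <stdio.h>

typedef long long cycles_t;

/* Delta to add to this CPU's TSC_ADJUST value after a failed sync test. */
static cycles_t tsc_adjust_delta(cycles_t cur_max_warp, cycles_t gbl_max_warp)
{
        /* No warp seen locally: the other CPU saw one, so move backwards. */
        if (!cur_max_warp)
                cur_max_warp = -gbl_max_warp;
        return cur_max_warp;
}

int main(void)
{
        printf("%lld\n", tsc_adjust_delta(0, 250));     /* prints -250 */
        printf("%lld\n", tsc_adjust_delta(180, 180));   /* prints 180 */
        return 0;
}
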
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 000000000000..d432f3824f0c
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,422 @@
+/*
+ * umip.c Emulation for instructions protected by the User-Mode Instruction
+ * Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * User-Mode Instruction Prevention is a security feature present in recent
+ * x86 processors that, when enabled, prevents a group of instructions (SGDT,
+ * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general
+ * protection fault if the instruction is executed with CPL > 0.
+ *
+ * Rather than relaying to user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), the fault
+ * can be trapped and the result of such instructions emulated to provide dummy
+ * values. This both preserves the current kernel behavior and avoids revealing the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (SGDT and SIDT) and those which return a
+ * value (SLDT, STR and SMSW).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that lies close to the top of the kernel memory. The limits for the GDT
+ * and the IDT are set to zero.
+ *
+ * The instruction SMSW is emulated to return the value that the register CR0
+ * has at boot time, as set in head_32.S.
+ * SLDT and STR are emulated to return the values that the kernel programmatically
+ * assigns:
+ * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not.
+ * - STR returns (GDT_ENTRY_TSS * 8).
+ *
+ * Emulation is provided for both 32-bit and 64-bit processes.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed to by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
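
To make the dummy-value behavior concrete, here is a hypothetical user-space test, not part of this patch (x86-64 and GCC inline asm assumed). On a UMIP-enabled kernel with this emulation it prints the hard-coded base and a zero limit instead of dying with SIGSEGV:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* SGDT stores a 2-byte limit followed by the base (8 bytes on x86-64). */
struct __attribute__((packed)) dtr {
        uint16_t limit;
        uint64_t base;
};

int main(void)
{
        struct dtr gdt = { 0, 0 };

        __asm__ volatile ("sgdt %0" : "=m" (gdt));
        printf("GDT base=0x%016" PRIx64 " limit=0x%04x\n", gdt.base, gdt.limit);
        return 0;
}
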
+
+#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL
+#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes X
+ * has a value of 4, in 64-bit processes X has a value of 8.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8
+#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 2 /* 0F 01 /4 */
+#define UMIP_INST_SLDT 3 /* 0F 00 /0 */
+#define UMIP_INST_STR 4 /* 0F 00 /1 */
+
+static const char * const umip_insns[5] = {
+ [UMIP_INST_SGDT] = "SGDT",
+ [UMIP_INST_SIDT] = "SIDT",
+ [UMIP_INST_SMSW] = "SMSW",
+ [UMIP_INST_SLDT] = "SLDT",
+ [UMIP_INST_STR] = "STR",
+};
+
+#define umip_pr_err(regs, fmt, ...) \
+ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
+#define umip_pr_debug(regs, fmt, ...) \
+ umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__)
+
+/**
+ * umip_printk() - Print a rate-limited message
+ * @regs: Register set with the context in which the warning is printed
+ * @log_level: Kernel log level to print the message
+ * @fmt: The text string to print
+ *
+ * Print the text contained in @fmt. The print rate is limited to bursts of 5
+ * messages every two minutes. The purpose of this customized version of
+ * printk() is to print messages when user space processes use any of the
+ * UMIP-protected instructions. Thus, the printed text is prepended with the
+ * task name and process ID number of the current task as well as the
+ * instruction and stack pointers in @regs as seen when entering kernel mode.
+ *
+ * Returns:
+ *
+ * None.
+ */
+static __printf(3, 4)
+void umip_printk(const struct pt_regs *regs, const char *log_level,
+ const char *fmt, ...)
+{
+ /* Bursts of 5 messages every two minutes */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5);
+ struct task_struct *tsk = current;
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm,
+ task_pid_nr(tsk), regs->ip, regs->sp, &vaf);
+ va_end(args);
+}
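
The burst semantics are easy to model outside the kernel. A toy fixed-window limiter with the same parameters, as a sketch of the concept only (ratelimit_ok() is made up; the real __ratelimit() differs in detail):

#include <stdio.h>
#include <time.h>

#define INTERVAL 120    /* seconds */
#define BURST    5

static int ratelimit_ok(void)
{
        static time_t window_start;
        static int printed;
        time_t now = time(NULL);

        if (now - window_start >= INTERVAL) {
                window_start = now;
                printed = 0;
        }
        return printed++ < BURST;
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                if (ratelimit_ok())
                        printf("message %d\n", i);      /* only 0..4 print */
        return 0;
}
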
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when not an UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+ /* By getting modrm we also get the opcode. */
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */
+ if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf)
+ return -EINVAL;
+
+ if (insn->opcode.bytes[1] == 0x1) {
+ switch (X86_MODRM_REG(insn->modrm.value)) {
+ case 0:
+ /* The reg form of 0F 01 /0 encodes VMX instructions. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ return UMIP_INST_SGDT;
+ case 1:
+ /*
+ * The reg form of 0F 01 /1 encodes MONITOR/MWAIT,
+ * STAC/CLAC, and ENCLS.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ return UMIP_INST_SIDT;
+ case 4:
+ return UMIP_INST_SMSW;
+ default:
+ return -EINVAL;
+ }
+ } else if (insn->opcode.bytes[1] == 0x0) {
+ if (X86_MODRM_REG(insn->modrm.value) == 0)
+ return UMIP_INST_SLDT;
+ else if (X86_MODRM_REG(insn->modrm.value) == 1)
+ return UMIP_INST_STR;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+}
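
The opcode/ModRM dispatch above can be exercised in isolation. A self-contained sketch with a hypothetical classify() helper (the kernel uses X86_MODRM_REG()/X86_MODRM_MOD() from its insn headers; the bit positions are the same):

#include <stdio.h>

#define MODRM_REG(m) (((m) >> 3) & 0x7) /* bits 5:3 select the group member */
#define MODRM_MOD(m) (((m) >> 6) & 0x3) /* mod == 3 means register form */

static const char *classify(unsigned char op2, unsigned char modrm)
{
        if (op2 == 0x01) {
                switch (MODRM_REG(modrm)) {
                case 0: return MODRM_MOD(modrm) == 3 ? "not UMIP" : "SGDT";
                case 1: return MODRM_MOD(modrm) == 3 ? "not UMIP" : "SIDT";
                case 4: return "SMSW";
                }
        } else if (op2 == 0x00) {
                if (MODRM_REG(modrm) == 0) return "SLDT";
                if (MODRM_REG(modrm) == 1) return "STR";
        }
        return "not UMIP";
}

int main(void)
{
        printf("%s\n", classify(0x01, 0x06));   /* 0F 01 /0, memory form: SGDT */
        printf("%s\n", classify(0x01, 0xc8));   /* 0F 01 /1, mod == 3: not UMIP */
        return 0;
}
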
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ * @x86_64: true if process is 64-bit, false otherwise
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. Caller is responsible
+ * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+ unsigned char *data, int *data_size, bool x86_64)
+{
+ if (!data || !data_size || !insn)
+ return -EINVAL;
+ /*
+ * These two instructions return the base address and limit of the
+ * global and interrupt descriptor table, respectively. According to the
+ * Intel Software Developer's Manual, the base address can be 24-bit,
+ * 32-bit or 64-bit. The limit is always 16-bit. If the operand size is
+ * 16-bit, the returned value of the base address is supposed to be a
+ * zero-extended 24-bit number. However, it seems that a 32-bit number
+ * is always returned irrespective of the operand size.
+ */
+ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ u64 dummy_base_addr;
+ u16 dummy_limit = 0;
+
+ /* SGDT and SIDT do not use register operands. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ if (umip_inst == UMIP_INST_SGDT)
+ dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+ else
+ dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+ /*
+ * 64-bit processes use the entire dummy base address.
+ * 32-bit processes use the lower 32 bits of the base address.
+ * dummy_base_addr is always 64 bits, but we memcpy the correct
+ * number of bytes from it to the destination.
+ */
+ if (x86_64)
+ *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT;
+ else
+ *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT;
+
+ memcpy(data + 2, &dummy_base_addr, *data_size);
+
+ *data_size += UMIP_GDT_IDT_LIMIT_SIZE;
+ memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+ } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT ||
+ umip_inst == UMIP_INST_STR) {
+ unsigned long dummy_value;
+
+ if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+ } else if (umip_inst == UMIP_INST_STR) {
+ dummy_value = GDT_ENTRY_TSS * 8;
+ } else if (umip_inst == UMIP_INST_SLDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ down_read(&current->mm->context.ldt_usr_sem);
+ if (current->mm->context.ldt)
+ dummy_value = GDT_ENTRY_LDT * 8;
+ else
+ dummy_value = 0;
+ up_read(&current->mm->context.ldt_usr_sem);
+#else
+ dummy_value = 0;
+#endif
+ }
+
+ /*
+ * For these three instructions, the number of bytes to be copied
+ * into the result buffer is determined by whether the operand is
+ * a register or a memory location. If the operand is a register,
+ * return as many bytes as the operand size. If the operand is
+ * memory, return only the two least significant bytes.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ *data_size = insn->opnd_bytes;
+ else
+ *data_size = 2;
+
+ memcpy(data, &dummy_value, *data_size);
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
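
A quick worked example of the operand-size rule in that last branch, with result_size() as an illustrative stand-in:

#include <stdio.h>

static int result_size(int modrm_mod, int opnd_bytes)
{
        /* mod == 3 means a register operand; anything else is memory. */
        return modrm_mod == 3 ? opnd_bytes : 2;
}

int main(void)
{
        printf("SMSW %%eax copies %d bytes\n", result_size(3, 4));
        printf("SMSW m16  copies %d bytes\n", result_size(0, 4));
        return 0;
}
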
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+ struct task_struct *tsk = current;
+
+ tsk->thread.cr2 = (unsigned long)addr;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, addr);
+
+ if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+ return;
+
+ umip_pr_err(regs, "segfault in emulation. error%x\n",
+ X86_PF_USER | X86_PF_WRITE);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). This function fixes
+ * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR
+ * and SLDT are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed to by eIP, using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int nr_copied, reg_offset, dummy_data_size, umip_inst;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ unsigned long *reg_addr;
+ void __user *uaddr;
+ struct insn insn;
+
+ if (!regs)
+ return false;
+
+ /*
+ * Give up on emulation if fetching the instruction failed. Should a
+ * page fault or a #GP be issued?
+ */
+ nr_copied = insn_fetch_from_user(regs, buf);
+ if (nr_copied <= 0)
+ return false;
+
+ if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ umip_pr_debug(regs, "%s instruction cannot be used by applications.\n",
+ umip_insns[umip_inst]);
+
+ umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n");
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size,
+ user_64bit_mode(regs)))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 000000000000..d8ba93778ae3
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,419 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/interrupt.h>
+#include <asm/sections.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
+
+static void unwind_dump(struct unwind_state *state)
+{
+ static bool dumped_before = false;
+ bool prev_zero, zero = false;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
+
+ for (; sp < stack_info.end; sp++) {
+
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %0*x ...\n",
+ sp, BITS_PER_LONG/4, 0);
+ continue;
+ }
+
+ printk_deferred("%p: %0*lx (%pB)\n",
+ sp, BITS_PER_LONG/4, word, (void *)word);
+ }
+ }
+}
+
+static bool in_entry_code(unsigned long ip)
+{
+ char *addr = (char *)ip;
+
+ return addr >= __entry_text_start && addr < __entry_text_end;
+}
+
+static inline unsigned long *last_frame(struct unwind_state *state)
+{
+ return (unsigned long *)task_pt_regs(state->task) - 2;
+}
+
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
+#ifdef CONFIG_X86_32
+#define GCC_REALIGN_WORDS 3
+#else
+#define GCC_REALIGN_WORDS 1
+#endif
+
+static inline unsigned long *last_aligned_frame(struct unwind_state *state)
+{
+ return last_frame(state) - GCC_REALIGN_WORDS;
+}
+
+static bool is_last_aligned_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *aligned_bp = last_aligned_frame(state);
+
+ /*
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
+ *
+ * <start_secondary>:
+ * push %edi
+ * lea 0x8(%esp),%edi
+ * and $0xfffffff8,%esp
+ * pushl -0x4(%edi)
+ * push %ebp
+ * mov %esp,%ebp
+ *
+ * <x86_64_start_kernel>:
+ * lea 0x8(%rsp),%r10
+ * and $0xfffffffffffffff0,%rsp
+ * pushq -0x8(%r10)
+ * push %rbp
+ * mov %rsp,%rbp
+ *
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
+ */
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+#ifdef CONFIG_X86_64
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (regs & 0x80000000)
+ return NULL;
+
+ return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
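
The tagging scheme is small enough to demonstrate stand-alone. This sketch mirrors the 64-bit variant, where the low bit (always clear in a genuine frame pointer) marks an encoded pt_regs pointer; the helper names are made up:

#include <stdio.h>
#include <stdint.h>

static uintptr_t encode_regs_pointer(uintptr_t regs)
{
        return regs | 0x1;      /* cf. ENCODE_FRAME_POINTER */
}

static uintptr_t decode_bp(uintptr_t bp, int *is_regs)
{
        *is_regs = bp & 0x1;
        return bp & ~(uintptr_t)0x1;
}

int main(void)
{
        int is_regs;
        /* x86-64 style kernel address, illustrative value only */
        uintptr_t bp = encode_regs_pointer((uintptr_t)0xffff888000a0b000ULL);
        uintptr_t regs = decode_bp(bp, &is_regs);

        printf("decoded=%#lx is_regs=%d\n", (unsigned long)regs, is_regs);
        return 0;
}
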
+
+/*
+ * While walking the stack, KMSAN may stomp on stale locals from other
+ * functions that were marked as uninitialized upon function exit, and
+ * now hold the call frame information for the current function (e.g. the frame
+ * pointer). Because KMSAN does not specifically mark call frames as
+ * initialized, false positive reports are possible. To prevent such reports,
+ * we mark the functions scanning the stack (here and below) with
+ * __no_kmsan_checks.
+ */
+__no_kmsan_checks
+static bool update_stack_state(struct unwind_state *state,
+ unsigned long *next_bp)
+{
+ struct stack_info *info = &state->stack_info;
+ enum stack_type prev_type = info->type;
+ struct pt_regs *regs;
+ unsigned long *frame, *prev_frame_end, *addr_p, addr;
+ size_t len;
+
+ if (state->regs)
+ prev_frame_end = (void *)state->regs + sizeof(*state->regs);
+ else
+ prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
+
+ /* Is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ frame = (unsigned long *)regs;
+ len = sizeof(*regs);
+ state->got_irq = true;
+ } else {
+ frame = next_bp;
+ len = FRAME_HEADER_SIZE;
+ }
+
+ /*
+ * If the next bp isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the next bp
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, frame, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ /* Make sure it only unwinds up and doesn't overlap the prev frame: */
+ if (state->orig_sp && state->stack_info.type == prev_type &&
+ frame < prev_frame_end)
+ return false;
+
+ /* Move state to the next frame: */
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
+ /* Save the return address: */
+ if (state->regs && user_mode(state->regs))
+ state->ip = 0;
+ else {
+ addr_p = unwind_get_return_address_ptr(state);
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ state->ip = unwind_recover_ret_addr(state, addr, addr_p);
+ }
+
+ /* Save the original stack pointer for unwind_dump(): */
+ if (!state->orig_sp)
+ state->orig_sp = frame;
+
+ return true;
+}
+
+__no_kmsan_checks
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct pt_regs *regs;
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * kernel_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ state->ip = 0;
+ return true;
+ }
+
+ /* Get the next frame pointer: */
+ if (state->next_bp) {
+ next_bp = state->next_bp;
+ state->next_bp = NULL;
+ } else if (state->regs) {
+ next_bp = (unsigned long *)state->regs->bp;
+ } else {
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+ }
+
+ /* Move to the next frame if it's safe: */
+ if (!update_stack_state(state, next_bp))
+ goto bad_address;
+
+ return true;
+
+bad_address:
+ state->error = true;
+
+ /*
+ * When unwinding a non-current task, the task might actually be
+ * running on another CPU, in which case it could be modifying its
+ * stack while we're reading it. This is generally not a problem and
+ * can be ignored as long as the caller understands that unwinding
+ * another task will not always succeed.
+ */
+ if (state->task != current)
+ goto the_end;
+
+ /*
+ * Don't warn if the unwinder got lost due to an interrupt in entry
+ * code or in the C handler before the first frame pointer got set up:
+ */
+ if (state->got_irq && in_entry_code(state->ip))
+ goto the_end;
+ if (state->regs &&
+ state->regs->sp >= (unsigned long)last_aligned_frame(state) &&
+ state->regs->sp < (unsigned long)task_pt_regs(state->task))
+ goto the_end;
+
+ /*
+ * There are some known frame pointer issues on 32-bit. Disable
+ * unwinder warnings on 32-bit until it gets objtool support.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto the_end;
+
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ }
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ unsigned long *bp;
+
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+ state->got_irq = (regs);
+
+ /* Don't even attempt to start from user mode regs: */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ bp = get_frame_pointer(task, regs);
+
+ /*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer.
+ * That means that SP points into the middle of an incomplete frame:
+ * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+ * would have written a frame pointer if we hadn't crashed.
+ * Pretend that the frame is complete and that BP points to it, but save
+ * the real BP so that we can use it when looking for the next frame.
+ */
+ if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) {
+ state->next_bp = bp;
+ bp = ((unsigned long *)regs->sp) - 1;
+ }
+
+ /* Initialize stack info and make sure the frame data is accessible: */
+ get_stack_info(bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, bp);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ (state->next_bp == NULL && state->bp < first_frame)))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 000000000000..884d68a6e714
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ unsigned long addr;
+
+ if (unwind_done(state))
+ return 0;
+
+ addr = READ_ONCE_NOCHECK(*state->sp);
+
+ return unwind_recover_ret_addr(state, addr, state->sp);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct stack_info *info = &state->stack_info;
+
+ if (unwind_done(state))
+ return false;
+
+ do {
+ for (state->sp++; state->sp < info->end; state->sp++) {
+ unsigned long addr = READ_ONCE_NOCHECK(*state->sp);
+
+ if (__kernel_text_address(addr))
+ return true;
+ }
+
+ state->sp = PTR_ALIGN(info->next_sp, sizeof(long));
+
+ } while (!get_stack_info(state->sp, state->task, info,
+ &state->stack_mask));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
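
That scan is the entire algorithm: treat every stack word that happens to point into kernel text as a possible return address, which is why the guess unwinder's output may include stale entries. A toy version over an array, with a made-up text range and a hypothetical looks_like_text() helper:

#include <stdio.h>

#define TEXT_START 0x400000UL
#define TEXT_END   0x500000UL

static int looks_like_text(unsigned long addr)
{
        return addr >= TEXT_START && addr < TEXT_END;
}

int main(void)
{
        unsigned long stack[] = { 0x7fff0000UL, 0x401234UL, 0x0UL, 0x4ffff0UL };
        size_t i;

        for (i = 0; i < sizeof(stack) / sizeof(stack[0]); i++)
                if (looks_like_text(stack[i]))
                        printf("possible return address: %#lx\n", stack[i]);
        return 0;
}
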
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->task = task;
+ state->sp = PTR_ALIGN(first_frame, sizeof(long));
+
+ get_stack_info(first_frame, state->task, &state->stack_info,
+ &state->stack_mask);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ if (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ !__kernel_text_address(*first_frame)))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..977ee75e047c
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/objtool.h>
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/orc_header.h>
+
+ORC_HEADER;
+
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)
+
+#define orc_warn_current(args...) \
+({ \
+ static bool dumped_before; \
+ if (state->task == current && !state->error) { \
+ orc_warn(args); \
+ if (unwind_debug && !dumped_before) { \
+ dumped_before = true; \
+ unwind_dump(state); \
+ } \
+ } \
+})
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static bool orc_init __ro_after_init;
+static bool unwind_debug __ro_after_init;
+static unsigned int lookup_num_blocks __ro_after_init;
+
+static int __init unwind_debug_cmdline(char *str)
+{
+ unwind_debug = true;
+
+ return 0;
+}
+early_param("unwind_debug", unwind_debug_cmdline);
+
+static void unwind_dump(struct unwind_state *state)
+{
+ static bool dumped_before;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = __builtin_frame_address(0); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
+
+ for (; sp < stack_info.end; sp++) {
+
+ word = READ_ONCE_NOCHECK(*sp);
+
+ printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4,
+ (unsigned long)sp, BITS_PER_LONG/4,
+ word, (void *)word);
+ }
+ }
+}
+
+static inline unsigned long orc_ip(const int *ip)
+{
+ return (unsigned long)ip + *ip;
+}
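
orc_ip() decodes a self-relative entry: each slot in the ip table stores the target minus the slot's own address, which keeps entries 32-bit and position independent. A small demonstration (everything lives in one struct so the pointer arithmetic stays within a single object; the layout is invented):

#include <stdio.h>

static struct {
        int table[2];   /* self-relative "ip" entries */
        char text[64];  /* stand-in for kernel text */
} blob;

static unsigned long orc_ip(const int *ip)
{
        return (unsigned long)ip + *ip;
}

int main(void)
{
        blob.table[0] = (int)(&blob.text[16] - (char *)&blob.table[0]);
        blob.table[1] = (int)(&blob.text[48] - (char *)&blob.table[1]);

        printf("entry 0 ok: %d\n",
               orc_ip(&blob.table[0]) == (unsigned long)&blob.text[16]);
        printf("entry 1 ok: %d\n",
               orc_ip(&blob.table[1]) == (unsigned long)&blob.text[48]);
        return 0;
}
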
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+ unsigned int num_entries, unsigned long ip)
+{
+ int *first = ip_table;
+ int *last = ip_table + num_entries - 1;
+ int *mid, *found = first;
+
+ if (!num_entries)
+ return NULL;
+
+ /*
+ * Do a binary range search to find the rightmost duplicate of a given
+ * starting address. Some entries are section terminators which are
+ * "weak" entries for ensuring there are no gaps. They should be
+ * ignored when they conflict with a real entry.
+ */
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+
+ if (orc_ip(mid) <= ip) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ return u_table + (found - ip_table);
+}
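
The "rightmost duplicate" part is the only twist on a textbook binary search: on a match-or-smaller comparison the search keeps moving right, so a weak terminator that shares its start address with a real entry (and sorts before it, see orc_sort_cmp() below) is never the final answer. The same idea over plain integers, with a hypothetical find_rightmost():

#include <stdio.h>

static int find_rightmost(const int *starts, int n, int ip)
{
        int lo = 0, hi = n - 1, found = 0;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (starts[mid] <= ip) {
                        found = mid;    /* candidate; keep searching right */
                        lo = mid + 1;
                } else {
                        hi = mid - 1;
                }
        }
        return found;
}

int main(void)
{
        int starts[] = { 0, 16, 16, 32 };       /* duplicate start at 16 */

        printf("%d\n", find_rightmost(starts, 4, 20)); /* prints 2, not 1 */
        return 0;
}
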
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ struct module *mod;
+
+ mod = __module_address(ip);
+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+ return NULL;
+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+ mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static struct orc_entry *orc_find(unsigned long ip);
+
+/*
+ * Ftrace dynamic trampolines do not have orc entries of their own.
+ * But they are copies of the ftrace entries that are static and
+ * defined in ftrace_*.S, which do have orc entries.
+ *
+ * If the unwinder comes across a ftrace trampoline, then find the
+ * ftrace function that was used to create it, and use that ftrace
+ * function's orc entry, as the placement of the return code in
+ * the stack will be identical.
+ */
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+ struct ftrace_ops *ops;
+ unsigned long tramp_addr, offset;
+
+ ops = ftrace_ops_trampoline(ip);
+ if (!ops)
+ return NULL;
+
+ /* Set tramp_addr to the start of the code copied by the trampoline */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ tramp_addr = (unsigned long)ftrace_regs_caller;
+ else
+ tramp_addr = (unsigned long)ftrace_caller;
+
+ /* Now place tramp_addr to the location within the trampoline ip is at */
+ offset = ip - ops->trampoline;
+ tramp_addr += offset;
+
+ /* Prevent unlikely recursion */
+ if (ip == tramp_addr)
+ return NULL;
+
+ return orc_find(tramp_addr);
+}
+#else
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+ .sp_offset = sizeof(long),
+ .sp_reg = ORC_REG_SP,
+ .bp_reg = ORC_REG_UNDEFINED,
+ .type = ORC_TYPE_CALL
+};
+
+/* Fake frame pointer entry -- used as a fallback for generated code */
+static struct orc_entry orc_fp_entry = {
+ .type = ORC_TYPE_CALL,
+ .sp_reg = ORC_REG_BP,
+ .sp_offset = 16,
+ .bp_reg = ORC_REG_PREV_SP,
+ .bp_offset = -16,
+};
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+ static struct orc_entry *orc;
+
+ if (ip == 0)
+ return &null_orc_entry;
+
+ /* For non-init vmlinux addresses, use the fast lookup table: */
+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+ unsigned int idx, start, stop;
+
+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+ if (unlikely((idx >= lookup_num_blocks-1))) {
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
+ idx, lookup_num_blocks, (void *)ip);
+ return NULL;
+ }
+
+ start = orc_lookup[idx];
+ stop = orc_lookup[idx + 1] + 1;
+
+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+ (__start_orc_unwind + stop > __stop_orc_unwind))) {
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
+ idx, lookup_num_blocks, start, stop, (void *)ip);
+ return NULL;
+ }
+
+ return __orc_find(__start_orc_unwind_ip + start,
+ __start_orc_unwind + start, stop - start, ip);
+ }
+
+ /* vmlinux .init slow lookup: */
+ if (is_kernel_inittext(ip))
+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+ /* Module lookup: */
+ orc = orc_module_find(ip);
+ if (orc)
+ return orc;
+
+ return orc_ftrace_find(ip);
+}
+
+#ifdef CONFIG_MODULES
+
+static DEFINE_MUTEX(sort_mutex);
+static int *cur_orc_ip_table = __start_orc_unwind_ip;
+static struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ int *a = _a, *b = _b, tmp;
+ int delta = _b - _a;
+
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ swap(*orc_a, *orc_b);
+}
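
Swapping two self-relative entries is not a plain exchange: each value encodes the target minus the slot's own address, so a value moved to the other slot must be rebased, which is what the delta terms above do. A minimal sketch of the same adjustment (invented layout, hypothetical helper):

#include <stdio.h>

static void swap_self_relative(int *a, int *b)
{
        int delta = (int)((char *)b - (char *)a);
        int tmp = *a;

        *a = *b + delta;        /* b's target, now relative to a */
        *b = tmp - delta;       /* a's target, now relative to b */
}

int main(void)
{
        static struct { int slots[2]; char text[32]; } blob;
        int *s = blob.slots;

        s[0] = (int)(&blob.text[4] - (char *)&s[0]);
        s[1] = (int)(&blob.text[20] - (char *)&s[1]);
        swap_self_relative(&s[0], &s[1]);

        /* after the swap, slot 0 resolves to text[20] and slot 1 to text[4] */
        printf("%d %d\n",
               (char *)&s[0] + s[0] == &blob.text[20],
               (char *)&s[1] + s[1] == &blob.text[4]);
        return 0;
}
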
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+ struct orc_entry *orc_a;
+ const int *a = _a, *b = _b;
+ unsigned long a_val = orc_ip(a);
+ unsigned long b_val = orc_ip(b);
+
+ if (a_val > b_val)
+ return 1;
+ if (a_val < b_val)
+ return -1;
+
+ /*
+ * The "weak" section terminator entries need to always be first
+ * to ensure the lookup code skips them in favor of real entries.
+ * These terminator entries exist to handle any gaps created by
+ * whitelisted .o files which didn't get objtool generation.
+ */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
+}
+
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /*
+ * Note, the orc_unwind and orc_unwind_ip tables were already
+ * sorted at build time via the 'sorttable' tool.
+ * It's ready for binary search straight away, no need to sort it.
+ */
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
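
The two-level scheme built here first maps an address to a fixed-size block, which bounds the subsequent binary search to a handful of entries. A user-space sketch of the same idea with made-up sizes (build_lookup() is a hypothetical stand-in for what unwind_init() precomputes):

#include <stdio.h>

#define BLOCK_SIZE 256
#define START_IP   0x1000u
#define NUM_BLOCKS 8

/* sorted table of entry start addresses */
static const unsigned int starts[] = { 0x1000, 0x1040, 0x1180, 0x1300, 0x1500 };
#define NUM_ENTRIES (sizeof(starts) / sizeof(starts[0]))

/* lookup[i] = index of the last entry starting at or before block i */
static unsigned int lookup[NUM_BLOCKS];

static void build_lookup(void)
{
        unsigned int i, e = 0;

        for (i = 0; i < NUM_BLOCKS; i++) {
                unsigned int block_start = START_IP + i * BLOCK_SIZE;

                while (e + 1 < NUM_ENTRIES && starts[e + 1] <= block_start)
                        e++;
                lookup[i] = e;
        }
}

int main(void)
{
        unsigned int ip = 0x1340;
        unsigned int idx = (ip - START_IP) / BLOCK_SIZE;

        build_lookup();
        /* the binary search only has to scan [lookup[idx], lookup[idx + 1]] */
        printf("block %u: search entries %u..%u\n",
               idx, lookup[idx], lookup[idx + 1]);
        return 0;
}
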
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (state->regs)
+ return &state->regs->ip;
+
+ if (state->sp)
+ return (unsigned long *)state->sp - 1;
+
+ return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+ void *addr = (void *)_addr;
+
+ if (on_stack(info, addr, len))
+ return true;
+
+ return !get_stack_info(addr, state->task, info, &state->stack_mask) &&
+ on_stack(info, addr, len);
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+ unsigned long *val)
+{
+ if (!stack_access_ok(state, addr, sizeof(long)))
+ return false;
+
+ *val = READ_ONCE_NOCHECK(*(unsigned long *)addr);
+ return true;
+}
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (struct pt_regs *)addr;
+
+ /* x86-32 support will be more complicated due to the &regs->sp hack */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
+
+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
+ return false;
+
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
+ return true;
+}
+
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
+
+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+ return false;
+
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
+ return true;
+}
+
+/*
+ * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
+ * value from state->regs.
+ *
+ * Otherwise, if state->regs just points to IRET regs, and the previous frame
+ * had full regs, it's safe to get the value from the previous regs. This can
+ * happen when early/late IRQ entry code gets interrupted by an NMI.
+ */
+static bool get_reg(struct unwind_state *state, unsigned int reg_off,
+ unsigned long *val)
+{
+ unsigned int reg = reg_off/8;
+
+ if (!state->regs)
+ return false;
+
+ if (state->full_regs) {
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
+ return true;
+ }
+
+ if (state->prev_regs) {
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
+ return true;
+ }
+
+ return false;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+ bool indirect = false;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Don't let modules unload while we're reading their ORC data. */
+ guard(rcu)();
+
+ /* End-of-stack check for user tasks: */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ /*
+ * Find the orc_entry associated with the text address.
+ *
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
+ */
+ orc = orc_find(state->signal ? state->ip : state->ip - 1);
+ if (!orc) {
+ /*
+ * As a fallback, try to assume this code uses a frame pointer.
+ * This is useful for generated code, like BPF, which ORC
+ * doesn't know about. This is just a guess, so the rest of
+ * the unwind is no longer considered reliable.
+ */
+ orc = &orc_fp_entry;
+ state->error = true;
+ } else {
+ if (orc->type == ORC_TYPE_UNDEFINED)
+ goto err;
+
+ if (orc->type == ORC_TYPE_END_OF_STACK)
+ goto the_end;
+ }
+
+ state->signal = orc->signal;
+
+ /* Find the previous frame's stack: */
+ switch (orc->sp_reg) {
+ case ORC_REG_SP:
+ sp = state->sp + orc->sp_offset;
+ break;
+
+ case ORC_REG_BP:
+ sp = state->bp + orc->sp_offset;
+ break;
+
+ case ORC_REG_SP_INDIRECT:
+ sp = state->sp;
+ indirect = true;
+ break;
+
+ case ORC_REG_BP_INDIRECT:
+ sp = state->bp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_R10:
+ if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
+ orc_warn_current("missing R10 value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_R13:
+ if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
+ orc_warn_current("missing R13 value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_DI:
+ if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
+ orc_warn_current("missing RDI value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_DX:
+ if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
+ orc_warn_current("missing DX value at %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ default:
+ orc_warn("unknown SP base reg %d at %pB\n",
+ orc->sp_reg, (void *)state->ip);
+ goto err;
+ }
+
+ if (indirect) {
+ if (!deref_stack_reg(state, sp, &sp))
+ goto err;
+
+ if (orc->sp_reg == ORC_REG_SP_INDIRECT)
+ sp += orc->sp_offset;
+ }
+
+ /* Find IP, SP and possibly regs: */
+ switch (orc->type) {
+ case ORC_TYPE_CALL:
+ ip_p = sp - sizeof(long);
+
+ if (!deref_stack_reg(state, ip_p, &state->ip))
+ goto err;
+
+ state->ip = unwind_recover_ret_addr(state, state->ip,
+ (unsigned long *)ip_p);
+ state->sp = sp;
+ state->regs = NULL;
+ state->prev_regs = NULL;
+ break;
+
+ case ORC_TYPE_REGS:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn_current("can't access registers at %pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+ /*
+ * There is a small chance to interrupt at the entry of
+ * arch_rethook_trampoline() where the ORC info doesn't exist.
+ * That point is right after the RET to arch_rethook_trampoline(),
+ * which was the modified return address.
+ * At that point, the @addr_p of the unwind_recover_rethook()
+ * (this has to point to the address of the stack entry storing
+ * the modified return address) must be "SP - (a stack entry)"
+ * because SP is incremented by the RET.
+ */
+ state->ip = unwind_recover_rethook(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
+ state->regs = (struct pt_regs *)sp;
+ state->prev_regs = NULL;
+ state->full_regs = true;
+ break;
+
+ case ORC_TYPE_REGS_PARTIAL:
+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn_current("can't access iret registers at %pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+ /* See ORC_TYPE_REGS case comment. */
+ state->ip = unwind_recover_rethook(state, state->ip,
+ (unsigned long *)(state->sp - sizeof(long)));
+
+ if (state->full_regs)
+ state->prev_regs = state->regs;
+ state->regs = (void *)sp - IRET_FRAME_OFFSET;
+ state->full_regs = false;
+ break;
+
+ default:
+ orc_warn("unknown .orc_unwind entry type %d at %pB\n",
+ orc->type, (void *)orig_ip);
+ goto err;
+ }
+
+ /* Find BP: */
+ switch (orc->bp_reg) {
+ case ORC_REG_UNDEFINED:
+ if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
+ state->bp = tmp;
+ break;
+
+ case ORC_REG_PREV_SP:
+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+ goto err;
+ break;
+
+ case ORC_REG_BP:
+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+ goto err;
+ break;
+
+ default:
+ orc_warn("unknown BP base reg %d for ip %pB\n",
+ orc->bp_reg, (void *)orig_ip);
+ goto err;
+ }
+
+ /* Prevent a recursive loop due to bad ORC data: */
+ if (state->stack_info.type == prev_type &&
+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+ state->sp <= prev_sp) {
+ orc_warn_current("stack going in the wrong direction? at %pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+
+ return true;
+
+err:
+ state->error = true;
+
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ if (!orc_init)
+ goto err;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto err;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto the_end;
+
+ state->ip = regs->ip;
+ state->sp = regs->sp;
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp + sizeof(*frame);
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork_asm;
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ state->error = true;
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp <= (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+err:
+ state->error = true;
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000000..7be8e361ca55
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,1825 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include <linux/kdebug.h>
+#include <asm/processor.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/mmu_context.h>
+#include <asm/nops.h>
+
+/* Post-execution fixups. */
+
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBE_FIX_IP 0x01
+
+/* Adjust the return address of a call insn */
+#define UPROBE_FIX_CALL 0x02
+
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x04
+
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
+
+#define UPROBE_TRAP_NR UINT_MAX
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
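
Each W() row ORs sixteen one-bit flags into its slice of a 256-bit map stored as eight 32-bit words, so testing an opcode is a word index plus a bit shift. A sketch of the lookup with a hypothetical is_good_insn() (the kernel itself goes through test_bit()):

#include <stdio.h>

typedef unsigned int u32;

/* Bit (op % 32) of word (op / 32) says whether opcode 'op' is probeable. */
static int is_good_insn(const volatile u32 *table, unsigned char op)
{
        return (table[op >> 5] >> (op & 31)) & 1;
}

int main(void)
{
        static volatile u32 demo[256 / 32];

        /* mark NOP (0x90) as good; INT3 (0xcc) stays clear */
        demo[0x90 >> 5] |= 1u << (0x90 & 31);

        printf("0x90 -> %d, 0xcc -> %d\n",
               is_good_insn(demo, 0x90), is_good_insn(demo, 0xcc));
        return 0;
}
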
+
+/*
+ * Good-instruction tables for 32-bit apps. This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc think that only *(unsigned long*) is used.
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
+ * (why we support bound (62) then? it's similar, and similarly unused...)
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * 07,17,1f - pop es/ss/ds
+ * Normally not used in userspace, but would execute if used.
+ * Can cause GP or stack exception if tries to load wrong segment descriptor.
+ * We hesitate to run them under single step since kernel's handling
+ * of userspace single-stepping (TF flag) is fragile.
+ * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
+ * on the same grounds that they are never used.
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static volatile u32 good_insns_32[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_32 NULL
+#endif
+
+/* Good-instruction tables for 64-bit apps.
+ *
+ * Genuinely invalid opcodes:
+ * 06,07 - formerly push/pop es
+ * 0e - formerly push cs
+ * 16,17 - formerly push/pop ss
+ * 1e,1f - formerly push/pop ds
+ * 27,2f,37,3f - formerly daa/das/aaa/aas
+ * 60,61 - formerly pusha/popa
+ * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
+ * 82 - formerly redundant encoding of Group1
+ * 9a - formerly call seg:ofs
+ * ce - formerly into
+ * d4,d5 - formerly aam/aad
+ * d6 - formerly undocumented salc
+ * ea - formerly jmp seg:ofs
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since the kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_64)
+static volatile u32 good_insns_64[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_64 NULL
+#endif
+
+/*
+ * Using this for both 64-bit and 32-bit apps.
+ * Opcodes we don't support:
+ * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
+ * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
+ * Also encodes tons of other system insns if mod=11.
+ * Some are in fact non-system: xend, xtest, rdtscp, maybe more
+ * 0f 05 - syscall
+ * 0f 06 - clts (CPL0 insn)
+ * 0f 07 - sysret
+ * 0f 08 - invd (CPL0 insn)
+ * 0f 09 - wbinvd (CPL0 insn)
+ * 0f 0b - ud2
+ * 0f 30 - wrmsr (CPL0 insn) (then why is rdmsr allowed? it's also a CPL0 insn)
+ * 0f 34 - sysenter
+ * 0f 35 - sysexit
+ * 0f 37 - getsec
+ * 0f 78 - vmread (Intel VMX. CPL0 insn)
+ * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
+ * Note: with prefixes, these two opcodes are
+ * extrq/insertq/AVX512 convert vector ops.
+ * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
+ * {rd,wr}{fs,gs}base,{s,l,m}fence.
+ * Why? They are all user-executable.
+ */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+/*
+ * opcodes we may need to refine support for:
+ *
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field. On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ *
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes. These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ *
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ *
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+ insn_byte_t p;
+
+ for_each_insn_prefix(insn, p) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(p);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
+ return true;
+ }
+ }
+ return false;
+}
+
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
+{
+ enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
+ u32 volatile *good_insns;
+ int ret;
+
+ ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
+ if (ret < 0)
+ return -ENOEXEC;
+
+ if (is_prefix_bad(insn))
+ return -ENOTSUPP;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return -ENOTSUPP;
+
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
+ return 0;
+
+ if (insn->opcode.nbytes == 2) {
+ if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+ return 0;
+ }
+
+ return -ENOTSUPP;
+}
+
+#ifdef CONFIG_X86_64
+
+struct uretprobe_syscall_args {
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long ax;
+};
+
+asm (
+ ".pushsection .rodata\n"
+ ".global uretprobe_trampoline_entry\n"
+ "uretprobe_trampoline_entry:\n"
+ "push %rax\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "mov $" __stringify(__NR_uretprobe) ", %rax\n"
+ "syscall\n"
+ ".global uretprobe_syscall_check\n"
+ "uretprobe_syscall_check:\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ /*
+ * The uretprobe syscall replaces the stored %rax value with the
+ * final return address, so we don't restore %rax here and just
+ * execute ret.
+ */
+ "ret\n"
+ "int3\n"
+ ".global uretprobe_trampoline_end\n"
+ "uretprobe_trampoline_end:\n"
+ ".popsection\n"
+);
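+
+/*
+ * Resulting stack layout at the "syscall" above (lowest address first),
+ * which sys_uretprobe below reads back as struct uretprobe_syscall_args:
+ *	sp + 0x00: saved %r11
+ *	sp + 0x08: saved %rcx
+ *	sp + 0x10: saved %rax (later overwritten with the final return address)
+ */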
+
+extern u8 uretprobe_trampoline_entry[];
+extern u8 uretprobe_trampoline_end[];
+extern u8 uretprobe_syscall_check[];
+
+void *arch_uretprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /*
+ * At the moment the uretprobe syscall trampoline is supported
+ * only for native 64-bit processes; compat processes still use
+ * the standard breakpoint.
+ */
+ if (user_64bit_mode(regs)) {
+ *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry;
+ return uretprobe_trampoline_entry;
+ }
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
+static unsigned long trampoline_check_ip(unsigned long tramp)
+{
+ return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uretprobe_syscall_args args;
+ unsigned long err, ip, sp, tramp;
+
+ /* If there's no trampoline, we are called from the wrong place. */
+ tramp = uprobe_get_trampoline_vaddr();
+ if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR))
+ goto sigill;
+
+ /* Make sure the ip matches the only allowed sys_uretprobe caller. */
+ if (unlikely(regs->ip != trampoline_check_ip(tramp)))
+ goto sigill;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ax = args.ax;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ ip = regs->ip;
+ sp = regs->sp;
+
+ uprobe_handle_trampoline(regs);
+
+ /*
+ * If one of the uprobe consumers has changed sp, we can do nothing
+ * but return via iret.
+ * The same applies if shadow stack is enabled, in which case we
+ * need to skip the return through the user-space stack address.
+ */
+ if (regs->sp != sp || shstk_is_enabled())
+ return regs->ax;
+ regs->sp -= sizeof(args);
+
+ /* in case a uprobe_consumer has changed r11/cx */
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /*
+ * The ax register is passed through as the return value, so we can
+ * use its slot on the stack for the ip value and jump to it via the
+ * trampoline's ret instruction.
+ */
+ args.ax = regs->ip;
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+
+ return regs->ax;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+/*
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
+ * immediately. Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register. Set
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area. At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ *
+ * - There's always a modrm byte with bit layout "00 reg 101".
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ * - The REX.B bit in the REX prefix, which normally extends the r/m field,
+ *   has no effect in rip-relative mode. It doesn't make a modrm byte
+ *   with r/m=101 refer to register 1101 = R13.
+ */
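+/*
+ * Example: 8b 05 <disp32> is "mov disp32(%rip),%eax"; its modrm byte
+ * 0x05 decodes as mod=00 reg=000 r/m=101, the pattern described above.
+ */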
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 *cursor;
+ u8 reg;
+ u8 reg2;
+
+ if (!insn_rip_relative(insn))
+ return;
+
+ /*
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
+ * Clear REX.b bit (extension of MODRM.rm field):
+ * we want to encode low numbered reg, not r8+.
+ */
+ if (insn->rex_prefix.nbytes) {
+ cursor = auprobe->insn + insn_offset_rex_prefix(insn);
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes >= 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x60;
+ }
+
+ /*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below sees reg=1 (cx) and chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions are ever lifted,
+ * we'll need code to prevent selection of di as the scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
+ * Point cursor at the modrm byte. The next 4 bytes are the
+ * displacement. Beyond the displacement, for some instructions,
+ * is the immediate operand.
+ */
+ cursor = auprobe->insn + insn_offset_modrm(insn);
+ /*
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
+ */
+ *cursor = 0x80 | (reg << 3) | reg2;
+}
+
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
+}
+
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
+ }
+}
+
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ *sr = utask->autask.saved_scratch_register;
+ }
+}
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+ .name = "[uprobes-trampoline]",
+ .mremap = tramp_mremap,
+ .pages = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+ struct hlist_node node;
+ unsigned long vaddr;
+};
+
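+/*
+ * A rel32 call placed at vaddr occupies 5 bytes and ends at vaddr + 5;
+ * the trampoline is reachable iff the displacement to it fits in a
+ * signed 32-bit immediate, which is what the delta check verifies.
+ */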
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
+ long delta = (long)(vaddr + 5 - vtramp);
+
+ return delta >= INT_MIN && delta <= INT_MAX;
+}
+
+static unsigned long find_nearest_trampoline(unsigned long vaddr)
+{
+ struct vm_unmapped_area_info info = {
+ .length = PAGE_SIZE,
+ .align_mask = ~PAGE_MASK,
+ };
+ unsigned long low_limit, high_limit;
+ unsigned long low_tramp, high_tramp;
+ unsigned long call_end = vaddr + 5;
+
+ if (check_add_overflow(call_end, INT_MIN, &low_limit))
+ low_limit = PAGE_SIZE;
+
+ high_limit = call_end + INT_MAX;
+
+ /* Search up from the caller address. */
+ info.low_limit = call_end;
+ info.high_limit = min(high_limit, TASK_SIZE);
+ high_tramp = vm_unmapped_area(&info);
+
+ /* Search down from the caller address. */
+ info.low_limit = max(low_limit, PAGE_SIZE);
+ info.high_limit = call_end;
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ low_tramp = vm_unmapped_area(&info);
+
+ if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
+ return -ENOMEM;
+ if (IS_ERR_VALUE(high_tramp))
+ return low_tramp;
+ if (IS_ERR_VALUE(low_tramp))
+ return high_tramp;
+
+ /* Return address that's closest to the caller address. */
+ if (call_end - low_tramp < high_tramp - call_end)
+ return low_tramp;
+ return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct mm_struct *mm = current->mm;
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+
+ if (!user_64bit_mode(regs))
+ return NULL;
+
+ vaddr = find_nearest_trampoline(vaddr);
+ if (IS_ERR_VALUE(vaddr))
+ return NULL;
+
+ tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+ if (unlikely(!tramp))
+ return NULL;
+
+ tramp->vaddr = vaddr;
+ vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+ VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+ &tramp_mapping);
+ if (IS_ERR(vma)) {
+ kfree(tramp);
+ return NULL;
+ }
+ return tramp;
+}
+
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+ struct uprobes_state *state = &current->mm->uprobes_state;
+ struct uprobe_trampoline *tramp = NULL;
+
+ if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+ return NULL;
+
+ hlist_for_each_entry(tramp, &state->head_tramps, node) {
+ if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+ *new = false;
+ return tramp;
+ }
+ }
+
+ tramp = create_uprobe_trampoline(vaddr);
+ if (!tramp)
+ return NULL;
+
+ *new = true;
+ hlist_add_head(&tramp->node, &state->head_tramps);
+ return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+ /*
+ * We do not unmap and release the uprobe trampoline page itself,
+ * because there's no easy way to make sure none of the threads
+ * is still inside the trampoline.
+ */
+ hlist_del(&tramp->node);
+ kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+ INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+ struct uprobes_state *state = &mm->uprobes_state;
+ struct uprobe_trampoline *tramp;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+ destroy_uprobe_trampoline(tramp);
+}
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+ struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+ return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
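+/*
+ * Lockless fast path: try a speculative VMA lookup first, and fall back
+ * to taking mmap_read_lock only if the mmap sequence count changed
+ * under us.
+ */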
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ bool found, retry = true;
+ unsigned int seq;
+
+ rcu_read_lock();
+ if (mmap_lock_speculate_try_begin(mm, &seq)) {
+ found = __in_uprobe_trampoline(ip);
+ retry = mmap_lock_speculate_retry(mm, seq);
+ }
+ rcu_read_unlock();
+
+ if (retry) {
+ mmap_read_lock(mm);
+ found = __in_uprobe_trampoline(ip);
+ mmap_read_unlock(mm);
+ }
+ return found;
+}
+
+/*
+ * See the uprobe syscall trampoline; the call to the trampoline pushes
+ * the return address on the stack, and the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+ unsigned long ax;
+ unsigned long r11;
+ unsigned long cx;
+ unsigned long retaddr;
+};
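+
+/*
+ * At syscall entry the frame at %rsp thus reads, lowest address first:
+ * ax, r11, cx, retaddr - matching the field order above.
+ */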
+
+SYSCALL_DEFINE0(uprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ struct uprobe_syscall_args args;
+ unsigned long ip, sp, sret;
+ int err;
+
+ /* Allow execution only from uprobe trampolines. */
+ if (!in_uprobe_trampoline(regs->ip))
+ return -ENXIO;
+
+ err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+ if (err)
+ goto sigill;
+
+ ip = regs->ip;
+
+ /*
+ * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+ * - adjust ip to the probe address (the call saved the next instruction
+ *   address, so the probe sits 5 bytes earlier)
+ * - adjust sp to the probe's stack frame (check trampoline code)
+ */
+ regs->ax = args.ax;
+ regs->r11 = args.r11;
+ regs->cx = args.cx;
+ regs->ip = args.retaddr - 5;
+ regs->sp += sizeof(args);
+ regs->orig_ax = -1;
+
+ sp = regs->sp;
+
+ err = shstk_pop((u64 *)&sret);
+ if (err == -EFAULT || (!err && sret != args.retaddr))
+ goto sigill;
+
+ handle_syscall_uprobe(regs, regs->ip);
+
+ /*
+ * If one of the uprobe consumers has changed sp, we can do nothing
+ * but return via iret.
+ */
+ if (regs->sp != sp) {
+ /* skip the trampoline call */
+ if (args.retaddr - 5 == regs->ip)
+ regs->ip += 5;
+ return regs->ax;
+ }
+
+ regs->sp -= sizeof(args);
+
+ /* for the case uprobe_consumer has changed ax/r11/cx */
+ args.ax = regs->ax;
+ args.r11 = regs->r11;
+ args.cx = regs->cx;
+
+ /* keep return address unless we are instructed otherwise */
+ if (args.retaddr - 5 != regs->ip)
+ args.retaddr = regs->ip;
+
+ if (shstk_push(args.retaddr) == -EFAULT)
+ goto sigill;
+
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+ return 0;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
+asm (
+ ".pushsection .rodata\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ "uprobe_trampoline_entry:\n"
+ "push %rcx\n"
+ "push %r11\n"
+ "push %rax\n"
+ "mov $" __stringify(__NR_uprobe) ", %rax\n"
+ "syscall\n"
+ "pop %rax\n"
+ "pop %r11\n"
+ "pop %rcx\n"
+ "ret\n"
+ "int3\n"
+ ".balign " __stringify(PAGE_SIZE) "\n"
+ ".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+ tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+ return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
+enum {
+ EXPECT_SWBP,
+ EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+ unsigned long base;
+ int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+ return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by the uprobe_write() calls in int3_update()
+ * to make sure the underlying instruction is as expected - either int3
+ * or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+ int nbytes, void *data)
+{
+ struct write_opcode_ctx *ctx = data;
+ uprobe_opcode_t old_opcode[5];
+
+ uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+ switch (ctx->expect) {
+ case EXPECT_SWBP:
+ if (is_swbp_insn(&old_opcode[0]))
+ return 1;
+ break;
+ case EXPECT_CALL:
+ if (is_call_insn(&old_opcode[0]))
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ * - Add an INT3 trap to the address that will be patched
+ * - SMP sync all CPUs
+ * - Update all but the first byte of the patched range
+ * - SMP sync all CPUs
+ * - Replace the first byte (INT3) by the first byte of the replacing opcode
+ * - SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, char *insn, bool optimize)
+{
+ uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+ struct write_opcode_ctx ctx = {
+ .base = vaddr,
+ };
+ int err;
+
+ /*
+ * Write int3 trap.
+ *
+ * The swbp_optimize path comes with the breakpoint already installed,
+ * so we can skip this step for optimize == true.
+ */
+ if (!optimize) {
+ ctx.expect = EXPECT_CALL;
+ err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+ }
+
+ smp_text_poke_sync_each_cpu();
+
+ /* Write all but the first byte of the patched range. */
+ ctx.expect = EXPECT_SWBP;
+ err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+ true /* is_register */, false /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+
+ /*
+ * Write first byte.
+ *
+ * The swbp_unoptimize path needs to finish the uprobe removal together
+ * with the ref_ctr update, using uprobe_write() with the proper flags.
+ */
+ err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+ optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+ &ctx);
+ if (err)
+ return err;
+
+ smp_text_poke_sync_each_cpu();
+ return 0;
+}
+
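+/*
+ * Patch in a 5-byte "call rel32" (e8 <tramp - (vaddr + 5)>) pointing at
+ * the trampoline; __text_gen_insn() computes the relative displacement.
+ */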
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr, unsigned long tramp)
+{
+ u8 call[5];
+
+ __text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+ (const void *) tramp, CALL_INSN_SIZE);
+ return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+ unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ uprobe_copy_from_page(page, vaddr, dst, len);
+ put_page(page);
+ return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+ struct __packed __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } *call = (struct __arch_relative_insn *) insn;
+
+ if (!is_call_insn(insn))
+ return false;
+ return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
+{
+ uprobe_opcode_t insn[5];
+ int err;
+
+ err = copy_from_vaddr(mm, vaddr, &insn, 5);
+ if (err)
+ return err;
+ return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+ return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+ test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (should_optimize(auprobe)) {
+ /*
+ * We could race with another thread that already optimized the probe,
+ * so let's not overwrite it with int3 again in this case.
+ */
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+ true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
+{
+ if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+ int ret = is_optimized(vma->vm_mm, vaddr);
+ if (ret < 0)
+ return ret;
+ if (ret) {
+ ret = swbp_unoptimize(auprobe, vma, vaddr);
+ WARN_ON_ONCE(ret);
+ return ret;
+ }
+ }
+ return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+ false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+ unsigned long vaddr)
+{
+ struct uprobe_trampoline *tramp;
+ struct vm_area_struct *vma;
+ bool new = false;
+ int err = 0;
+
+ vma = find_vma(mm, vaddr);
+ if (!vma)
+ return -EINVAL;
+ tramp = get_uprobe_trampoline(vaddr, &new);
+ if (!tramp)
+ return -EINVAL;
+ err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+ if (WARN_ON_ONCE(err) && new)
+ destroy_uprobe_trampoline(tramp);
+ return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ uprobe_opcode_t insn[5];
+
+ if (!should_optimize(auprobe))
+ return;
+
+ mmap_write_lock(mm);
+
+ /*
+ * Check if some other thread already optimized the uprobe for us;
+ * if so, just return silently.
+ */
+ if (copy_from_vaddr(mm, vaddr, &insn, 5))
+ goto unlock;
+ if (!is_swbp_insn((uprobe_opcode_t*) &insn))
+ goto unlock;
+
+ /*
+ * If we fail to optimize the uprobe, set the fail bit so that
+ * should_optimize() above fails from now on.
+ */
+ if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+ set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+ mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ if (!insn->x86_64 || insn->length != 5)
+ return false;
+
+ if (!insn_is_nop(insn))
+ return false;
+
+ /* We can't do cross page atomic writes yet. */
+ return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
+#else /* 32-bit: */
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+ return false;
+}
+#endif /* CONFIG_X86_64 */
+
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(struct pt_regs *regs)
+{
+ /*
+ * Check registers for mode as in_xxx_syscall() does not apply here.
+ */
+ return user_64bit_mode(regs) ? 8 : 4;
+}
+
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs);
+ return 0;
+}
+
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
+{
+ unsigned long new_sp = regs->sp - sizeof_long(regs);
+
+ if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs)))
+ return -EFAULT;
+
+ regs->sp = new_sp;
+ return 0;
+}
+
+/*
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
+ */
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
+ regs->ip += correction;
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(regs); /* Pop incorrect return address */
+ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
+ return -ERESTART;
+ }
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
+
+ return 0;
+}
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_post_xol(auprobe, regs);
+}
+
+static const struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+ .abort = default_abort_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
+{
+ return auprobe->branch.opc1 == 0xe8;
+}
+
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
+
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
+
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
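+
+/*
+ * Example expansion: COND(74, 75, XF(ZF)) makes check_jmp_cond() below
+ * return whether ZF is set for opcode 0x74 (jz), and whether it is
+ * clear for 0x75 (jnz).
+ */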
+
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
+}
+
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long flags = regs->flags;
+
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
+
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
+
+#undef XF
+#undef COND
+#undef CASE_COND
+
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+ * signal; arch_uprobe_post_xol() won't even be called.
+ *
+ * But there is a corner case, see the comment in ->post_xol().
+ */
+ */
+ if (emulate_push_stack(regs, new_ip))
+ return false;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
+}
+
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+ if (emulate_push_stack(regs, *src_ptr))
+ return false;
+ regs->ip += auprobe->push.ilen;
+ return true;
+}
+
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long(regs);
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ /*
+ * Turn this insn into "call 1f; 1:"; this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(): in the unlikely case it can
+ * succeed, we need to ensure that the new ->ip cannot fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
+}
+
+static const struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+static const struct uprobe_xol_ops push_xol_ops = {
+ .emulate = push_emulate_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn);
+ insn_byte_t p;
+
+ if (insn_is_nop(insn))
+ goto setup;
+
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
+ /*
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
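+ * For example, 0f 84 (jz near) maps to 0x74 (jz short).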
+ */
+ opc1 = OPCODE2(insn) - 0x10;
+ fallthrough;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
+ }
+
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores the 66 prefix.
+ * Since no one uses these insns, reject any branch insn with such a prefix.
+ */
+ for_each_insn_prefix(insn, p) {
+ if (p == 0x66)
+ return -ENOTSUPP;
+ }
+
+setup:
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
+}
+
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+ if (opc1 < 0x50 || opc1 > 0x57)
+ return -ENOSYS;
+
+ if (insn->length > 2)
+ return -ENOSYS;
+ if (insn->length == 2) {
+ /* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+ if (insn->rex_prefix.nbytes != 1 ||
+ insn->rex_prefix.bytes[0] != 0x41)
+ return -ENOSYS;
+
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, r8);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, r9);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, r10);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, r11);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, r12);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, r13);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, r14);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, r15);
+ break;
+ }
+#else
+ return -ENOSYS;
+#endif
+ } else {
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, ax);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, cx);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, dx);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, bx);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, sp);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, bp);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, si);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, di);
+ break;
+ }
+ }
+
+ auprobe->push.reg_offset = reg_offset;
+ auprobe->push.ilen = insn->length;
+ auprobe->ops = &push_xol_ops;
+ return 0;
+}
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @auprobe: the probepoint information.
+ * @mm: the probed address space.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a negative errno on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+{
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
+ struct insn insn;
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ if (can_optimize(&insn, addr))
+ set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ ret = push_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ fallthrough;
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ return 0;
+}
+
+/*
+ * If the xol insn itself traps and generates a signal (say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a single-stepped
+ * instruction jumps back to its own address. It is assumed that anything
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+ if (t->thread.trap_nr != UPROBE_TRAP_NR)
+ return true;
+
+ return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
+ /*
+ * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
+ * so we can get an extra SIGTRAP if we do not clear TF. We need
+ * to examine the opcode to make it right.
+ */
+ if (send_sigtrap)
+ send_sig(SIGTRAP, current, 0);
+
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ return err;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+ int ret = NOTIFY_DONE;
+
+ /* We are only interested in userspace traps */
+ if (regs && !user_mode(regs))
+ return NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_INT3:
+ if (uprobe_pre_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ case DIE_DEBUG:
+ if (uprobe_post_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This function gets called when the XOL instruction either gets trapped or
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for a potential restart or for post-mortem analysis.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
+ /* clear TF if it was set by us in arch_uprobe_pre_xol() */
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+}
+
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
+ return false;
+}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs);
+ if (ret && (regs->flags & X86_EFLAGS_TF))
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+ int rasize = sizeof_long(regs), nleft;
+ unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+ return -1;
+
+ /* check whether the address has already been hijacked */
+ if (orig_ret_vaddr == trampoline_vaddr)
+ return orig_ret_vaddr;
+
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft)) {
+ if (shstk_update_last_frame(trampoline_vaddr)) {
+ force_sig(SIGSEGV);
+ return -1;
+ }
+ return orig_ret_vaddr;
+ }
+
+ if (nleft != rasize) {
+ pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+ current->pid, regs->sp, regs->ip);
+
+ force_sig(SIGSEGV);
+ }
+
+ return -1;
+}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+ return regs->sp < ret->stack;
+ else
+ return regs->sp <= ret->stack;
+}
+
+/*
+ * Heuristic-based check whether a uprobe is installed at a function entry.
+ *
+ * Under the assumption that user code is compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are at a function entry.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, the captured stack trace might have one extra bogus
+ * entry, but the rest of the stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
deleted file mode 100644
index 1132129db792..000000000000
--- a/arch/x86/kernel/uv_irq.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV IRQ functions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#include <linux/module.h>
-#include <linux/rbtree.h>
-#include <linux/slab.h>
-#include <linux/irq.h>
-
-#include <asm/apic.h>
-#include <asm/uv/uv_irq.h>
-#include <asm/uv/uv_hub.h>
-
-/* MMR offset and pnode of hub sourcing interrupts for a given irq */
-struct uv_irq_2_mmr_pnode{
- struct rb_node list;
- unsigned long offset;
- int pnode;
- int irq;
-};
-
-static spinlock_t uv_irq_lock;
-static struct rb_root uv_irq_root;
-
-static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
-
-static void uv_noop(unsigned int irq)
-{
-}
-
-static unsigned int uv_noop_ret(unsigned int irq)
-{
- return 0;
-}
-
-static void uv_ack_apic(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip uv_irq_chip = {
- .name = "UV-CORE",
- .startup = uv_noop_ret,
- .shutdown = uv_noop,
- .enable = uv_noop,
- .disable = uv_noop,
- .ack = uv_noop,
- .mask = uv_noop,
- .unmask = uv_noop,
- .eoi = uv_ack_apic,
- .end = uv_noop,
- .set_affinity = uv_set_irq_affinity,
-};
-
-/*
- * Add offset and pnode information of the hub sourcing interrupts to the
- * rb tree for a specific irq.
- */
-static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
-{
- struct rb_node **link = &uv_irq_root.rb_node;
- struct rb_node *parent = NULL;
- struct uv_irq_2_mmr_pnode *n;
- struct uv_irq_2_mmr_pnode *e;
- unsigned long irqflags;
-
- n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
- uv_blade_to_memory_nid(blade));
- if (!n)
- return -ENOMEM;
-
- n->irq = irq;
- n->offset = offset;
- n->pnode = uv_blade_to_pnode(blade);
- spin_lock_irqsave(&uv_irq_lock, irqflags);
- /* Find the right place in the rbtree: */
- while (*link) {
- parent = *link;
- e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
-
- if (unlikely(irq == e->irq)) {
- /* irq entry exists */
- e->pnode = uv_blade_to_pnode(blade);
- e->offset = offset;
- spin_unlock_irqrestore(&uv_irq_lock, irqflags);
- kfree(n);
- return 0;
- }
-
- if (irq < e->irq)
- link = &(*link)->rb_left;
- else
- link = &(*link)->rb_right;
- }
-
- /* Insert the node into the rbtree. */
- rb_link_node(&n->list, parent, link);
- rb_insert_color(&n->list, &uv_irq_root);
-
- spin_unlock_irqrestore(&uv_irq_lock, irqflags);
- return 0;
-}
-
-/* Retrieve offset and pnode information from the rb tree for a specific irq */
-int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
-{
- struct uv_irq_2_mmr_pnode *e;
- struct rb_node *n;
- unsigned long irqflags;
-
- spin_lock_irqsave(&uv_irq_lock, irqflags);
- n = uv_irq_root.rb_node;
- while (n) {
- e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-
- if (e->irq == irq) {
- *offset = e->offset;
- *pnode = e->pnode;
- spin_unlock_irqrestore(&uv_irq_lock, irqflags);
- return 0;
- }
-
- if (irq < e->irq)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
- spin_unlock_irqrestore(&uv_irq_lock, irqflags);
- return -1;
-}
-
-/*
- * Re-target the irq to the specified CPU and enable the specified MMR located
- * on the specified blade to allow the sending of MSIs to the specified CPU.
- */
-static int
-arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
- unsigned long mmr_offset, int limit)
-{
- const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
- int mmr_pnode;
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- int err;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
- sizeof(unsigned long));
-
- cfg = irq_cfg(irq);
-
- err = assign_irq_vector(irq, cfg, eligible_cpu);
- if (err != 0)
- return err;
-
- if (limit == UV_AFFINITY_CPU)
- desc->status |= IRQ_NO_BALANCING;
- else
- desc->status |= IRQ_MOVE_PCNTXT;
-
- set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
- irq_name);
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
-
- mmr_pnode = uv_blade_to_pnode(mmr_blade);
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- return irq;
-}
-
-/*
- * Disable the specified MMR located on the specified blade so that MSIs are
- * longer allowed to be sent.
- */
-static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
-{
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
-
- BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
- sizeof(unsigned long));
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- entry->mask = 1;
-
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-}
-
-static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg = desc->chip_data;
- unsigned int dest;
- unsigned long mmr_value;
- struct uv_IO_APIC_route_entry *entry;
- unsigned long mmr_offset;
- int mmr_pnode;
-
- if (set_desc_affinity(desc, mask, &dest))
- return -1;
-
- mmr_value = 0;
- entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = dest;
-
- /* Get previously stored MMR and pnode of hub sourcing interrupts */
- if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
- return -1;
-
- uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- return 0;
-}
-
-/*
- * Set up a mapping of an available irq and vector, and enable the specified
- * MMR that defines the MSI that is to be sent to the specified CPU when an
- * interrupt is raised.
- */
-int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
- unsigned long mmr_offset, int limit)
-{
- int irq, ret;
-
- irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
-
- if (irq <= 0)
- return -EBUSY;
-
- ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
- limit);
- if (ret == irq)
- uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
- else
- destroy_irq(irq);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(uv_setup_irq);
-
-/*
- * Tear down a mapping of an irq and vector, and disable the specified MMR that
- * defined the MSI that was to be sent to the specified CPU when an interrupt
- * was raised.
- *
- * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
- */
-void uv_teardown_irq(unsigned int irq)
-{
- struct uv_irq_2_mmr_pnode *e;
- struct rb_node *n;
- unsigned long irqflags;
-
- spin_lock_irqsave(&uv_irq_lock, irqflags);
- n = uv_irq_root.rb_node;
- while (n) {
- e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
- if (e->irq == irq) {
- arch_disable_uv_irq(e->pnode, e->offset);
- rb_erase(n, &uv_irq_root);
- kfree(e);
- break;
- }
- if (irq < e->irq)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
- spin_unlock_irqrestore(&uv_irq_lock, irqflags);
- destroy_irq(irq);
-}
-EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
deleted file mode 100644
index 309c70fb7759..000000000000
--- a/arch/x86/kernel/uv_sysfs.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
- * Copyright (c) Russ Anderson
- */
-
-#include <linux/sysdev.h>
-#include <asm/uv/bios.h>
-#include <asm/uv/uv.h>
-
-struct kobject *sgi_uv_kobj;
-
-static ssize_t partition_id_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
-}
-
-static ssize_t coherence_id_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
-}
-
-static struct kobj_attribute partition_id_attr =
- __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
-
-static struct kobj_attribute coherence_id_attr =
- __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
-
-
-static int __init sgi_uv_sysfs_init(void)
-{
- unsigned long ret;
-
- if (!is_uv_system())
- return -ENODEV;
-
- if (!sgi_uv_kobj)
- sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
- if (!sgi_uv_kobj) {
- printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
- return -EINVAL;
- }
-
- ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
- if (ret) {
- printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
- return ret;
- }
-
- ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
- if (ret) {
- printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
- return ret;
- }
-
- return 0;
-}
-
-device_initcall(sgi_uv_sysfs_init);
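
The file removed above follows the standard kobject-under-/sys/firmware pattern. As a side note, it declared "unsigned long ret" for sysfs_create_file() (which returns int) and leaked the kobject on failure; below is a minimal modern sketch of the same pattern with those two points tidied up. All names here are placeholders, not part of the patch.

#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static struct kobject *example_kobj;

static ssize_t value_show(struct kobject *kobj, struct kobj_attribute *attr,
			  char *buf)
{
	return sysfs_emit(buf, "%d\n", 42);	/* placeholder datum */
}
static struct kobj_attribute value_attr = __ATTR_RO(value);

static int __init example_sysfs_init(void)
{
	int ret;

	example_kobj = kobject_create_and_add("example", firmware_kobj);
	if (!example_kobj)
		return -ENOMEM;

	ret = sysfs_create_file(example_kobj, &value_attr.attr);
	if (ret)
		kobject_put(example_kobj);	/* don't leak on failure */
	return ret;
}
device_initcall(example_sysfs_init);
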
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
new file mode 100644
index 000000000000..37ad43792452
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu.S
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *
+ * verify_cpu.S - Code for cpu long mode and SSE verification. This
+ * code has been borrowed from boot/setup.S and was introduced by
+ * Andi Kleen.
+ *
+ * Copyright (c) 2007 Andi Kleen (ak@suse.de)
+ * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
+ *
+ * This is common code for verifying whether the CPU supports
+ * long mode and SSE. It is not called directly; instead, this
+ * file is included in various places and compiled in that context.
+ * This file is expected to run in 32-bit code. Currently:
+ *
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verification
+ * arch/x86/kernel/head_32.S: processor startup
+ *
+ * verify_cpu returns the status of long mode and SSE in register %eax.
+ * 0: Success 1: Failure
+ *
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
+ * The caller needs to check the error code and take appropriate
+ * action: either display a message or halt.
+ */
+
+#include <asm/cpufeatures.h>
+#include <asm/cpufeaturemasks.h>
+#include <asm/msr-index.h>
+
+#define SSE_MASK \
+ (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31))))
+
+SYM_FUNC_START_LOCAL(verify_cpu)
+ pushf # Save caller passed flags
+ push $0 # Kill any dangerous flags
+ popf
+
+#ifndef __x86_64__
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
+#endif
+
+ movl $0x0,%eax # See if cpuid 1 is implemented
+ cpuid
+ cmpl $0x1,%eax
+ jb .Lverify_cpu_no_longmode # no cpuid 1
+
+ xor %di,%di
+ cmpl $0x68747541,%ebx # AuthenticAMD
+ jnz .Lverify_cpu_noamd
+ cmpl $0x69746e65,%edx
+ jnz .Lverify_cpu_noamd
+ cmpl $0x444d4163,%ecx
+ jnz .Lverify_cpu_noamd
+ mov $1,%di # cpu is from AMD
+ jmp .Lverify_cpu_check
+
+.Lverify_cpu_noamd:
+ cmpl $0x756e6547,%ebx # GenuineIntel?
+ jnz .Lverify_cpu_check
+ cmpl $0x49656e69,%edx
+ jnz .Lverify_cpu_check
+ cmpl $0x6c65746e,%ecx
+ jnz .Lverify_cpu_check
+
+ # only call IA32_MISC_ENABLE when:
+ # family > 6 || (family == 6 && model >= 0xd)
+ movl $0x1, %eax # check CPU family and model
+ cpuid
+ movl %eax, %ecx
+
+ andl $0x0ff00f00, %eax # mask family and extended family
+ shrl $8, %eax
+ cmpl $6, %eax
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
+
+ andl $0x000f00f0, %ecx # mask model and extended model
+ shrl $4, %ecx
+ cmpl $0xd, %ecx
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
+
+.Lverify_cpu_clear_xd:
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ rdmsr
+ btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
+ wrmsr
+
+.Lverify_cpu_check:
+ movl $0x1,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK0,%edx
+ xorl $REQUIRED_MASK0,%edx
+ jnz .Lverify_cpu_no_longmode
+
+ movl $0x80000000,%eax # See if extended cpuid is implemented
+ cpuid
+ cmpl $0x80000001,%eax
+ jb .Lverify_cpu_no_longmode # no extended cpuid
+
+ movl $0x80000001,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz .Lverify_cpu_no_longmode
+
+.Lverify_cpu_sse_test:
+ movl $1,%eax
+ cpuid
+ andl $SSE_MASK,%edx
+ cmpl $SSE_MASK,%edx
+ je .Lverify_cpu_sse_ok
+ test %di,%di
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
+ movl $MSR_K7_HWCR,%ecx
+ rdmsr
+ btr $15,%eax # enable SSE
+ wrmsr
+ xor %di,%di # don't loop
+ jmp .Lverify_cpu_sse_test # try again
+
+.Lverify_cpu_no_longmode:
+ popf # Restore caller passed flags
+ movl $1,%eax
+ RET
+.Lverify_cpu_sse_ok:
+ popf # Restore caller passed flags
+ xorl %eax, %eax
+ RET
+SYM_FUNC_END(verify_cpu)
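
For readers who don't speak assembly: the new verify_cpu performs, in essence, the CPUID sequence below (plus the Intel XD_DISABLE and AMD HWCR side steps). This is a userspace C rendering for illustration only; verify_cpu itself runs long before any C environment exists.

#include <cpuid.h>	/* GCC/Clang helper; returns 0 if the leaf is absent */

static int cpu_has_long_mode_and_sse(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 1 must exist and advertise SSE (bit 25) and SSE2 (bit 26). */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 0;
	if ((edx & (3u << 25)) != (3u << 25))
		return 0;

	/* Extended leaf 0x80000001 must advertise Long Mode (LM, bit 29). */
	if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
		return 0;
	return !!(edx & (1u << 29));
}
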
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
deleted file mode 100644
index 45b6f8a975a1..000000000000
--- a/arch/x86/kernel/verify_cpu_64.S
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- *
- * verify_cpu.S - Code for cpu long mode and SSE verification. This
- * code has been borrowed from boot/setup.S and was introduced by
- * Andi Kleen.
- *
- * Copyright (c) 2007 Andi Kleen (ak@suse.de)
- * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
- * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
- *
- * This is common code for verifying whether the CPU supports
- * long mode and SSE. It is not called directly; instead, this
- * file is included in various places and compiled in that context.
- * The current usage is as follows.
- *
- * This file is included by both 16bit and 32bit code.
- *
- * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- * arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
- * arch/x86_64/kernel/acpi/wakeup.S: verification at resume (16bit)
- *
- * verify_cpu returns the status of the cpu check in register %eax.
- * 0: Success 1: Failure
- *
- * The caller needs to check the error code and take appropriate
- * action: either display a message or halt.
- */
-
-#include <asm/cpufeature.h>
-
-verify_cpu:
- pushfl # Save caller passed flags
- pushl $0 # Kill any dangerous flags
- popfl
-
- pushfl # standard way to check for cpuid
- popl %eax
- movl %eax,%ebx
- xorl $0x200000,%eax
- pushl %eax
- popfl
- pushfl
- popl %eax
- cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
-
- movl $0x0,%eax # See if cpuid 1 is implemented
- cpuid
- cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
-
- xor %di,%di
- cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
- cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
- cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
- mov $1,%di # cpu is from AMD
-
-verify_cpu_noamd:
- movl $0x1,%eax # Does the cpu have what it takes
- cpuid
- andl $REQUIRED_MASK0,%edx
- xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
-
- movl $0x80000000,%eax # See if extended cpuid is implemented
- cpuid
- cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
-
- movl $0x80000001,%eax # Does the cpu have what it takes
- cpuid
- andl $REQUIRED_MASK1,%edx
- xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
-
-verify_cpu_sse_test:
- movl $1,%eax
- cpuid
- andl $SSE_MASK,%edx
- cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
- test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
- movl $0xc0010015,%ecx # HWCR
- rdmsr
- btr $15,%eax # enable SSE
- wrmsr
- xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
-
-verify_cpu_no_longmode:
- popfl # Restore caller passed flags
- movl $1,%eax
- ret
-verify_cpu_sse_ok:
- popfl # Restore caller passed flags
- xorl %eax, %eax
- ret
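
Functionally the deleted file matches the new verify_cpu.S; the rewrite switches to .L-local labels, the RET macro, and the MSR_K7_HWCR symbol instead of the raw 0xc0010015. The "btr $15" in both versions clears the HWCR bit that hides SSE on early AMD parts; in kernel C the same operation would look roughly like this (a sketch only, not code from the patch):

#include <linux/bits.h>
#include <asm/msr.h>		/* rdmsrl()/wrmsrl(), MSR_K7_HWCR */

static void amd_force_sse_enable(void)
{
	u64 hwcr;

	rdmsrl(MSR_K7_HWCR, hwcr);
	if (hwcr & BIT_ULL(15)) {	/* SSE-disable bit set? */
		hwcr &= ~BIT_ULL(15);
		wrmsrl(MSR_K7_HWCR, hwcr);
	}
}
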
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
deleted file mode 100644
index e680ea52db9b..000000000000
--- a/arch/x86/kernel/visws_quirks.c
+++ /dev/null
@@ -1,666 +0,0 @@
-/*
- * SGI Visual Workstation support and quirks, unmaintained.
- *
- * Split out from setup.c by davej@suse.de
- *
- * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
- *
- * SGI Visual Workstation interrupt controller
- *
- * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
- * which serves as the main interrupt controller in the system. Non-legacy
- * hardware in the system uses this controller directly. Legacy devices
- * are connected to the PIIX4, which in turn has its 8259(s) connected to
- * one of the Cobalt APIC entries.
- *
- * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
- *
- * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
- */
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/visws/cobalt.h>
-#include <asm/visws/piix4.h>
-#include <asm/io_apic.h>
-#include <asm/fixmap.h>
-#include <asm/reboot.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/time.h>
-#include <asm/io.h>
-
-#include <linux/kernel_stat.h>
-
-#include <asm/i8259.h>
-#include <asm/irq_vectors.h>
-#include <asm/visws/lithium.h>
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-
-extern int no_broadcast;
-
-char visws_board_type = -1;
-char visws_board_rev = -1;
-
-static void __init visws_time_init(void)
-{
- printk(KERN_INFO "Starting Cobalt Timer system clock\n");
-
- /* Set the countdown value */
- co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
-
- /* Start the timer */
- co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
-
- /* Enable (unmask) the timer interrupt */
- co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
-
- setup_default_timer_irq();
-}
-
-/* Replaces the default init_ISA_irqs in the generic setup */
-static void __init visws_pre_intr_init(void)
-{
- init_VISWS_APIC_irqs();
-}
-
-/* Quirk for machine specific memory setup. */
-
-#define MB (1024 * 1024)
-
-unsigned long sgivwfb_mem_phys;
-unsigned long sgivwfb_mem_size;
-EXPORT_SYMBOL(sgivwfb_mem_phys);
-EXPORT_SYMBOL(sgivwfb_mem_size);
-
-long long mem_size __initdata = 0;
-
-static char * __init visws_memory_setup(void)
-{
- long long gfx_mem_size = 8 * MB;
-
- mem_size = boot_params.alt_mem_k;
-
- if (!mem_size) {
- printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
- mem_size = 128 * MB;
- }
-
- /*
- * this hardcodes the graphics memory to 8 MB
- * it really should be sized dynamically (or at least
- * set as a boot param)
- */
- if (!sgivwfb_mem_size) {
- printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
- sgivwfb_mem_size = 8 * MB;
- }
-
- /*
- * Trim to nearest MB
- */
- sgivwfb_mem_size &= ~((1 << 20) - 1);
- sgivwfb_mem_phys = mem_size - gfx_mem_size;
-
- e820_add_region(0, LOWMEMSIZE(), E820_RAM);
- e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
- e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
-
- return "PROM";
-}
-
-static void visws_machine_emergency_restart(void)
-{
- /*
- * Visual Workstations restart after this
- * register is poked on the PIIX4
- */
- outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
-}
-
-static void visws_machine_power_off(void)
-{
- unsigned short pm_status;
-/* extern unsigned int pci_bus0; */
-
- while ((pm_status = inw(PMSTS_PORT)) & 0x100)
- outw(pm_status, PMSTS_PORT);
-
- outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
-
- mdelay(10);
-
-#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
- (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
-
-/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
- outl(PIIX_SPECIAL_STOP, 0xCFC);
-}
-
-static void __init visws_get_smp_config(unsigned int early)
-{
-}
-
-/*
- * The Visual Workstation is Intel MP compliant in the hardware
- * sense, but it doesn't have a BIOS(-configuration table).
- * No problem for Linux.
- */
-
-static void __init MP_processor_info(struct mpc_cpu *m)
-{
- int ver, logical_apicid;
- physid_mask_t apic_cpus;
-
- if (!(m->cpuflag & CPU_ENABLED))
- return;
-
- logical_apicid = m->apicid;
- printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
- m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
-
- if (m->cpuflag & CPU_BOOTPROCESSOR)
- boot_cpu_physical_apicid = m->apicid;
-
- ver = m->apicver;
- if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
- printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->apicid, MAX_APICS);
- return;
- }
-
- apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
- physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- m->apicid);
- ver = 0x10;
- }
- apic_version[m->apicid] = ver;
-}
-
-static void __init visws_find_smp_config(void)
-{
- struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
- unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
-
- if (ncpus > CO_CPU_MAX) {
- printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
- ncpus, mp);
-
- ncpus = CO_CPU_MAX;
- }
-
- if (ncpus > setup_max_cpus)
- ncpus = setup_max_cpus;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- smp_found_config = 1;
-#endif
- while (ncpus--)
- MP_processor_info(mp++);
-
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-}
-
-static void visws_trap_init(void);
-
-void __init visws_early_detect(void)
-{
- int raw;
-
- visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
- >> PIIX_GPI_BD_SHIFT;
-
- if (visws_board_type < 0)
- return;
-
- /*
- * Override the default platform setup functions
- */
- x86_init.resources.memory_setup = visws_memory_setup;
- x86_init.mpparse.get_smp_config = visws_get_smp_config;
- x86_init.mpparse.find_smp_config = visws_find_smp_config;
- x86_init.irqs.pre_vector_init = visws_pre_intr_init;
- x86_init.irqs.trap_init = visws_trap_init;
- x86_init.timers.timer_init = visws_time_init;
- x86_init.pci.init = pci_visws_init;
- x86_init.pci.init_irq = x86_init_noop;
-
- /*
- * Install reboot quirks:
- */
- pm_power_off = visws_machine_power_off;
- machine_ops.emergency_restart = visws_machine_emergency_restart;
-
- /*
- * Do not use broadcast IPIs:
- */
- no_broadcast = 0;
-
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Turn off IO-APIC detection and initialization:
- */
- skip_ioapic_setup = 1;
-#endif
-
- /*
- * Get Board rev.
- * First, we have to initialize the 307 part to allow us access
- * to the GPIO registers. Let's map them at 0x0fc0 which is right
- * after the PIIX4 PM section.
- */
- outb_p(SIO_DEV_SEL, SIO_INDEX);
- outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
-
- outb_p(SIO_DEV_MSB, SIO_INDEX);
- outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
-
- outb_p(SIO_DEV_LSB, SIO_INDEX);
- outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
-
- outb_p(SIO_DEV_ENB, SIO_INDEX);
- outb_p(1, SIO_DATA); /* Enable GPIO registers. */
-
- /*
- * Now, we have to map the power management section to write
- * a bit which enables access to the GPIO registers.
- * What lunatic came up with this shit?
- */
- outb_p(SIO_DEV_SEL, SIO_INDEX);
- outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
-
- outb_p(SIO_DEV_MSB, SIO_INDEX);
- outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
-
- outb_p(SIO_DEV_LSB, SIO_INDEX);
- outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
-
- outb_p(SIO_DEV_ENB, SIO_INDEX);
- outb_p(1, SIO_DATA); /* Enable PM registers. */
-
- /*
- * Now, write the PM register which enables the GPIO registers.
- */
- outb_p(SIO_PM_FER2, SIO_PM_INDEX);
- outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
-
- /*
- * Now, initialize the GPIO registers.
- * We want them all to be inputs which is the
- * power on default, so let's leave them alone.
- * So, let's just read the board rev!
- */
- raw = inb_p(SIO_GP_DATA1);
- raw &= 0x7f; /* 7 bits of valid board revision ID. */
-
- if (visws_board_type == VISWS_320) {
- if (raw < 0x6) {
- visws_board_rev = 4;
- } else if (raw < 0xc) {
- visws_board_rev = 5;
- } else {
- visws_board_rev = 6;
- }
- } else if (visws_board_type == VISWS_540) {
- visws_board_rev = 2;
- } else {
- visws_board_rev = raw;
- }
-
- printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
- (visws_board_type == VISWS_320 ? "320" :
- (visws_board_type == VISWS_540 ? "540" :
- "unknown")), visws_board_rev);
-}
-
-#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
-#define BCD (LI_INTB | LI_INTC | LI_INTD)
-#define ALLDEVS (A01234 | BCD)
-
-static __init void lithium_init(void)
-{
- set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
- set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
-
- if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
- (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
- printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
-/* panic("This machine is not SGI Visual Workstation 320/540"); */
- }
-
- if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
- (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
- printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
-/* panic("This machine is not SGI Visual Workstation 320/540"); */
- }
-
- li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
- li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
-}
-
-static __init void cobalt_init(void)
-{
- /*
- * On normal SMP PC this is used only with SMP, but we have to
- * use it and set it up here to start the Cobalt clock
- */
- set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
- setup_local_APIC();
- printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
- (unsigned int)apic_read(APIC_LVR),
- (unsigned int)apic_read(APIC_ID));
-
- set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
- set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
- printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
- co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
-
- /* Enable Cobalt APIC being careful to NOT change the ID! */
- co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
-
- printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
- co_apic_read(CO_APIC_ID));
-}
-
-static void __init visws_trap_init(void)
-{
- lithium_init();
- cobalt_init();
-}
-
-/*
- * IRQ controller / APIC support:
- */
-
-static DEFINE_SPINLOCK(cobalt_lock);
-
-/*
- * Set the given Cobalt APIC Redirection Table entry to point
- * to the given IDT vector/index.
- */
-static inline void co_apic_set(int entry, int irq)
-{
- co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
- co_apic_write(CO_APIC_HI(entry), 0);
-}
-
-/*
- * Cobalt (IO)-APIC functions to handle PCI devices.
- */
-static inline int co_apic_ide0_hack(void)
-{
- extern char visws_board_type;
- extern char visws_board_rev;
-
- if (visws_board_type == VISWS_320 && visws_board_rev == 5)
- return 5;
- return CO_APIC_IDE0;
-}
-
-static int is_co_apic(unsigned int irq)
-{
- if (IS_CO_APIC(irq))
- return CO_APIC(irq);
-
- switch (irq) {
- case 0: return CO_APIC_CPU;
- case CO_IRQ_IDE0: return co_apic_ide0_hack();
- case CO_IRQ_IDE1: return CO_APIC_IDE1;
- default: return -1;
- }
-}
-
-
-/*
- * This is the SGI Cobalt (IO-)APIC:
- */
-
-static void enable_cobalt_irq(unsigned int irq)
-{
- co_apic_set(is_co_apic(irq), irq);
-}
-
-static void disable_cobalt_irq(unsigned int irq)
-{
- int entry = is_co_apic(irq);
-
- co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
- co_apic_read(CO_APIC_LO(entry));
-}
-
-/*
- * "irq" really just serves to identify the device. Here is where we
- * map this to the Cobalt APIC entry where it's physically wired.
- * This is called via request_irq -> setup_irq -> irq_desc->startup()
- */
-static unsigned int startup_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
- desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
- return 0;
-}
-
-static void ack_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cobalt_lock, flags);
- disable_cobalt_irq(irq);
- apic_write(APIC_EOI, APIC_EIO_ACK);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static void end_cobalt_irq(unsigned int irq)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_to_desc(irq);
-
- spin_lock_irqsave(&cobalt_lock, flags);
- if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip cobalt_irq_type = {
- .name = "Cobalt-APIC",
- .startup = startup_cobalt_irq,
- .shutdown = disable_cobalt_irq,
- .enable = enable_cobalt_irq,
- .disable = disable_cobalt_irq,
- .ack = ack_cobalt_irq,
- .end = end_cobalt_irq,
-};
-
-
-/*
- * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
- * -- not the manner expected by the code in i8259.c.
- *
- * there is a 'master' physical interrupt source that gets sent to
- * the CPU. But in the chipset there are various 'virtual' interrupts
- * waiting to be handled. We represent this to Linux through a 'master'
- * interrupt controller type, and through a special virtual interrupt-
- * controller. Device drivers only see the virtual interrupt sources.
- */
-static unsigned int startup_piix4_master_irq(unsigned int irq)
-{
- legacy_pic->init(0);
-
- return startup_cobalt_irq(irq);
-}
-
-static void end_piix4_master_irq(unsigned int irq)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cobalt_lock, flags);
- enable_cobalt_irq(irq);
- spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
-static struct irq_chip piix4_master_irq_type = {
- .name = "PIIX4-master",
- .startup = startup_piix4_master_irq,
- .ack = ack_cobalt_irq,
- .end = end_piix4_master_irq,
-};
-
-
-static struct irq_chip piix4_virtual_irq_type = {
- .name = "PIIX4-virtual",
-};
-
-
-/*
- * PIIX4-8259 master/virtual functions to handle interrupt requests
- * from legacy devices: floppy, parallel, serial, rtc.
- *
- * None of these get Cobalt APIC entries, neither do they have IDT
- * entries. These interrupts are purely virtual and distributed from
- * the 'master' interrupt source: CO_IRQ_8259.
- *
- * When the 8259 interrupts, its handler figures out which of these
- * devices is interrupting and dispatches to its handler.
- *
- * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
- * enable_irq gets the right irq. This 'master' irq is never directly
- * manipulated by any driver.
- */
-static irqreturn_t piix4_master_intr(int irq, void *dev_id)
-{
- int realirq;
- struct irq_desc *desc;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&i8259A_lock, flags);
-
- /* Find out what's interrupting in the PIIX4 master 8259 */
- outb(0x0c, 0x20); /* OCW3 Poll command */
- realirq = inb(0x20);
-
- /*
- * Bit 7 == 0 means invalid/spurious
- */
- if (unlikely(!(realirq & 0x80)))
- goto out_unlock;
-
- realirq &= 7;
-
- if (unlikely(realirq == 2)) {
- outb(0x0c, 0xa0);
- realirq = inb(0xa0);
-
- if (unlikely(!(realirq & 0x80)))
- goto out_unlock;
-
- realirq = (realirq & 7) + 8;
- }
-
- /* mask and ack interrupt */
- cached_irq_mask |= 1 << realirq;
- if (unlikely(realirq > 7)) {
- inb(0xa1);
- outb(cached_slave_mask, 0xa1);
- outb(0x60 + (realirq & 7), 0xa0);
- outb(0x60 + 2, 0x20);
- } else {
- inb(0x21);
- outb(cached_master_mask, 0x21);
- outb(0x60 + realirq, 0x20);
- }
-
- raw_spin_unlock_irqrestore(&i8259A_lock, flags);
-
- desc = irq_to_desc(realirq);
-
- /*
- * handle this 'virtual interrupt' as a Cobalt one now.
- */
- kstat_incr_irqs_this_cpu(realirq, desc);
-
- if (likely(desc->action != NULL))
- handle_IRQ_event(realirq, desc->action);
-
- if (!(desc->status & IRQ_DISABLED))
- legacy_pic->chip->unmask(realirq);
-
- return IRQ_HANDLED;
-
-out_unlock:
- raw_spin_unlock_irqrestore(&i8259A_lock, flags);
- return IRQ_NONE;
-}
-
-static struct irqaction master_action = {
- .handler = piix4_master_intr,
- .name = "PIIX4-8259",
-};
-
-static struct irqaction cascade_action = {
- .handler = no_action,
- .name = "cascade",
-};
-
-static inline void set_piix4_virtual_irq_type(void)
-{
- piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
- piix4_virtual_irq_type.enable = i8259A_chip.unmask;
- piix4_virtual_irq_type.disable = i8259A_chip.mask;
-}
-
-void init_VISWS_APIC_irqs(void)
-{
- int i;
-
- for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = 0;
- desc->depth = 1;
-
- if (i == 0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE0) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_IDE1) {
- desc->chip = &cobalt_irq_type;
- }
- else if (i == CO_IRQ_8259) {
- desc->chip = &piix4_master_irq_type;
- }
- else if (i < CO_IRQ_APIC0) {
- set_piix4_virtual_irq_type();
- desc->chip = &piix4_virtual_irq_type;
- }
- else if (IS_CO_APIC(i)) {
- desc->chip = &cobalt_irq_type;
- }
- }
-
- setup_irq(CO_IRQ_8259, &master_action);
- setup_irq(2, &cascade_action);
-}
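
One detail of the deleted piix4_master_intr() worth preserving: it used the i8259 OCW3 poll command rather than per-IRQ vectors, because the PIIX4's 8259 output is funneled through a single Cobalt APIC entry. The poll handshake itself is generic; here is a stand-alone sketch (standard PC port constants assumed):

#include <asm/io.h>

/*
 * Sketch: ask the master 8259 which IRQ is pending.
 * Returns 0-7, or -1 if the read was spurious (bit 7 clear).
 */
static int i8259_poll_master(void)
{
	int val;

	outb(0x0c, 0x20);	/* OCW3: poll command */
	val = inb(0x20);	/* bit 7 = IRQ pending, bits 2:0 = IRQ number */
	return (val & 0x80) ? (val & 7) : -1;
}
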
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f793..e6cc84143f3e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1994 Linus Torvalds
*
@@ -28,10 +29,14 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
+#include <linux/syscalls.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/string.h>
@@ -41,12 +46,16 @@
#include <linux/ptrace.h>
#include <linux/audit.h>
#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/security.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
-#include <asm/syscalls.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* Known problems:
@@ -64,10 +73,6 @@
*/
-#define KVM86 ((struct kernel_vm86_struct *)regs)
-#define VMPI KVM86->vm86plus
-
-
/*
* 8- and 16-bit register defines..
*/
@@ -79,8 +84,8 @@
/*
* virtual flags (16 and 32-bit versions)
*/
-#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS (current->thread.v86flags)
+#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS (current->thread.vm86->veflags)
#define set_flags(X, new, mask) \
((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -88,46 +93,11 @@
#define SAFE_MASK (0xDD5)
#define RETURN_MASK (0xDFF)
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
- const struct kernel_vm86_regs *regs)
-{
- int ret = 0;
-
- /*
- * kernel_vm86_regs is missing gs, so copy everything up to
- * (but not including) orig_eax, and then rest including orig_eax.
- */
- ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
- ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
- sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
- return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
- const struct vm86_regs __user *user,
- unsigned extra)
-{
- int ret = 0;
-
- /* copy ax-fs inclusive */
- ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
- /* copy orig_ax-__gsh+extra */
- ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
- sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.orig_ax) +
- extra);
- return ret;
-}
-
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
- struct pt_regs *ret;
- unsigned long tmp;
+ struct task_struct *tsk = current;
+ struct vm86plus_struct __user *user;
+ struct vm86 *vm86 = current->thread.vm86;
/*
* This gets called from entry.S with interrupts disabled, but
@@ -136,115 +106,81 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
*/
local_irq_enable();
- if (!current->thread.vm86_info) {
- printk("no vm86_info: BAD\n");
- do_exit(SIGSEGV);
- }
- set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
- tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
- tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
- if (tmp) {
- printk("vm86: could not access userspace vm86_info\n");
- do_exit(SIGSEGV);
- }
-
- tss = &per_cpu(init_tss, get_cpu());
- current->thread.sp0 = current->thread.saved_sp0;
- current->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &current->thread);
- current->thread.saved_sp0 = 0;
- put_cpu();
+ BUG_ON(!vm86);
+
+ set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+ user = vm86->user_vm86;
+
+ if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ?
+ sizeof(struct vm86plus_struct) :
+ sizeof(struct vm86_struct)))
+ goto Efault;
+
+ unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end);
+ unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end);
+ unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end);
+ unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end);
+ unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end);
+ unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end);
+ unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end);
+ unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end);
+ unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end);
+ unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end);
+ unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end);
+ unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end);
+ unsafe_put_user(regs->es, &user->regs.es, Efault_end);
+ unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
+ unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
+ unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
- ret = KVM86->regs32;
+ /*
+ * Don't write screen_bitmap in case some user had a value there
+ * and expected it to remain unchanged.
+ */
- ret->fs = current->thread.saved_fs;
- set_user_gs(ret, current->thread.saved_gs);
+ user_access_end();
- return ret;
-}
+exit_vm86:
+ preempt_disable();
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+ update_task_stack(tsk);
+ refresh_sysenter_cs(&tsk->thread);
+ vm86->saved_sp0 = 0;
+ preempt_enable();
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- spinlock_t *ptl;
- int i;
+ memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
- pgd = pgd_offset(mm, 0xA0000);
- if (pgd_none_or_clear_bad(pgd))
- goto out;
- pud = pud_offset(pgd, 0xA0000);
- if (pud_none_or_clear_bad(pud))
- goto out;
- pmd = pmd_offset(pud, 0xA0000);
- if (pmd_none_or_clear_bad(pmd))
- goto out;
- pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
- for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
- pte++;
- }
- pte_unmap_unlock(pte, ptl);
-out:
- flush_tlb();
-}
+ loadsegment(gs, vm86->regs32.gs);
+ regs->pt.ax = retval;
+ return;
+Efault_end:
+ user_access_end();
+Efault:
+ pr_alert("could not access userspace vm86 info\n");
+ force_exit_sig(SIGSEGV);
+ goto exit_vm86;
+}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
-int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
{
- struct kernel_vm86_struct info; /* declare this _on top_,
- * this avoids wasting of stack space.
- * This remains on the stack until we
- * return to 32 bit user space.
- */
- struct task_struct *tsk;
- int tmp, ret = -EPERM;
-
- tsk = current;
- if (tsk->thread.saved_sp0)
- goto out;
- tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
- offsetof(struct kernel_vm86_struct, vm86plus) -
- sizeof(info.regs));
- ret = -EFAULT;
- if (tmp)
- goto out;
- memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = regs;
- tsk->thread.vm86_info = v86;
- do_sys_vm86(&info, tsk);
- ret = 0; /* we never return here */
-out:
- return ret;
+ return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
}
-int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
+SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
{
- struct kernel_vm86_struct info; /* declare this _on top_,
- * this avoids wasting of stack space.
- * This remains on the stack until we
- * return to 32 bit user space.
- */
- struct task_struct *tsk;
- int tmp, ret;
- struct vm86plus_struct __user *v86;
-
- tsk = current;
switch (cmd) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
case VM86_GET_IRQ_BITS:
case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(cmd, (int)arg);
- goto out;
+ return do_vm86_irq_handling(cmd, (int)arg);
case VM86_PLUS_INSTALL_CHECK:
/*
* NOTE: on old vm86 stuff this will return the error
@@ -252,121 +188,163 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
* interpreted as (invalid) address to vm86_struct.
* So the installation check works.
*/
- ret = 0;
- goto out;
+ return 0;
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
- ret = -EPERM;
- if (tsk->thread.saved_sp0)
- goto out;
- v86 = (struct vm86plus_struct __user *)arg;
- tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
- offsetof(struct kernel_vm86_struct, regs32) -
- sizeof(info.regs));
- ret = -EFAULT;
- if (tmp)
- goto out;
- info.regs32 = regs;
- info.vm86plus.is_vm86pus = 1;
- tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
- do_sys_vm86(&info, tsk);
- ret = 0; /* we never return here */
-out:
- return ret;
+ return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
}
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
- info->regs.pt.ds = 0;
- info->regs.pt.es = 0;
- info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
- info->regs.pt.gs = 0;
-#endif
+ struct task_struct *tsk = current;
+ struct vm86 *vm86 = tsk->thread.vm86;
+ struct kernel_vm86_regs vm86regs;
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long err = 0;
+ struct vm86_struct v;
+
+ err = security_mmap_addr(0);
+ if (err) {
+ /*
+ * vm86 cannot virtualize the address space, so vm86 users
+ * need to manage the low 1MB themselves using mmap. Given
+ * that BIOS places important data in the first page, vm86
+ * is essentially useless if mmap_min_addr != 0. DOSEMU,
+ * for example, won't even bother trying to use vm86 if it
+ * can't map a page at virtual address 0.
+ *
+ * To reduce the available kernel attack surface, simply
+ * disallow vm86(old) for users who cannot mmap at va 0.
+ *
+ * The implementation of security_mmap_addr will allow
+ * suitably privileged users to map va 0 even if
+ * vm.mmap_min_addr is set above 0, and we want this
+ * behavior for vm86 as well, as it ensures that legacy
+ * tools like vbetool will not fail just because of
+ * vm.mmap_min_addr.
+ */
+ pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()));
+ return -EPERM;
+ }
+
+ if (!vm86) {
+ if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+ return -ENOMEM;
+ tsk->thread.vm86 = vm86;
+ }
+ if (vm86->saved_sp0)
+ return -EPERM;
+
+ if (copy_from_user(&v, user_vm86,
+ offsetof(struct vm86_struct, int_revectored)))
+ return -EFAULT;
+
+
+ /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+ if (v.flags & VM86_SCREEN_BITMAP) {
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n",
+ current->comm);
+ return -EINVAL;
+ }
+
+ memset(&vm86regs, 0, sizeof(vm86regs));
+
+ vm86regs.pt.bx = v.regs.ebx;
+ vm86regs.pt.cx = v.regs.ecx;
+ vm86regs.pt.dx = v.regs.edx;
+ vm86regs.pt.si = v.regs.esi;
+ vm86regs.pt.di = v.regs.edi;
+ vm86regs.pt.bp = v.regs.ebp;
+ vm86regs.pt.ax = v.regs.eax;
+ vm86regs.pt.ip = v.regs.eip;
+ vm86regs.pt.cs = v.regs.cs;
+ vm86regs.pt.flags = v.regs.eflags;
+ vm86regs.pt.sp = v.regs.esp;
+ vm86regs.pt.ss = v.regs.ss;
+ vm86regs.es = v.regs.es;
+ vm86regs.ds = v.regs.ds;
+ vm86regs.fs = v.regs.fs;
+ vm86regs.gs = v.regs.gs;
+
+ vm86->flags = v.flags;
+ vm86->cpu_type = v.cpu_type;
+
+ if (copy_from_user(&vm86->int_revectored,
+ &user_vm86->int_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (copy_from_user(&vm86->int21_revectored,
+ &user_vm86->int21_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (plus) {
+ if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+ sizeof(struct vm86plus_info_struct)))
+ return -EFAULT;
+ vm86->vm86plus.is_vm86pus = 1;
+ } else
+ memset(&vm86->vm86plus, 0,
+ sizeof(struct vm86plus_info_struct));
+
+ memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+ vm86->user_vm86 = user_vm86;
/*
* The flags register is also special: we cannot trust that the user
* has set it up safely, so this makes sure interrupt etc flags are
* inherited from protected mode.
*/
- VEFLAGS = info->regs.pt.flags;
- info->regs.pt.flags &= SAFE_MASK;
- info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
- info->regs.pt.flags |= X86_VM_MASK;
+ VEFLAGS = vm86regs.pt.flags;
+ vm86regs.pt.flags &= SAFE_MASK;
+ vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+ vm86regs.pt.flags |= X86_VM_MASK;
+
+ vm86regs.pt.orig_ax = regs->orig_ax;
- switch (info->cpu_type) {
+ switch (vm86->cpu_type) {
case CPU_286:
- tsk->thread.v86mask = 0;
+ vm86->veflags_mask = 0;
break;
case CPU_386:
- tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
case CPU_486:
- tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
default:
- tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
}
/*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
*/
- info->regs32->ax = VM86_SIGNAL;
- tsk->thread.saved_sp0 = tsk->thread.sp0;
- tsk->thread.saved_fs = info->regs32->fs;
- tsk->thread.saved_gs = get_user_gs(info->regs32);
-
- tss = &per_cpu(init_tss, get_cpu());
- tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
- if (cpu_has_sep)
+ vm86->saved_sp0 = tsk->thread.sp0;
+ savesegment(gs, vm86->regs32.gs);
+
+ /* make room for real-mode segments */
+ preempt_disable();
+ tsk->thread.sp0 += 16;
+
+ if (boot_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
- load_sp0(tss, &tsk->thread);
- put_cpu();
-
- tsk->thread.screen_bitmap = info->screen_bitmap;
- if (info->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk->mm);
-
- /*call audit_syscall_exit since we do not exit via the normal paths */
- if (unlikely(current->audit_context))
- audit_syscall_exit(AUDITSC_RESULT(0), 0);
-
- __asm__ __volatile__(
- "movl %0,%%esp\n\t"
- "movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
- "mov %2, %%gs\n\t"
-#endif
- "jmp resume_userspace"
- : /* no outputs */
- :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
- /* we never return here */
-}
+ refresh_sysenter_cs(&tsk->thread);
+ }
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
- struct pt_regs *regs32;
-
- regs32 = save_v86_state(regs16);
- regs32->ax = retval;
- __asm__ __volatile__("movl %0,%%esp\n\t"
- "movl %1,%%ebp\n\t"
- "jmp resume_userspace"
- : : "r" (regs32), "r" (current_thread_info()));
+ update_task_stack(tsk);
+ preempt_enable();
+
+ memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+ return regs->ax;
}
static inline void set_IF(struct kernel_vm86_regs *regs)
{
VEFLAGS |= X86_EFLAGS_VIF;
- if (VEFLAGS & X86_EFLAGS_VIP)
- return_to_32bit(regs, VM86_STI);
}
static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -398,7 +376,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
{
- set_flags(VEFLAGS, flags, current->thread.v86mask);
+ set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@@ -408,7 +386,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
{
- set_flags(VFLAGS, flags, current->thread.v86mask);
+ set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@@ -423,15 +401,12 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
if (VEFLAGS & X86_EFLAGS_VIF)
flags |= X86_EFLAGS_IF;
flags |= X86_EFLAGS_IOPL;
- return flags | (VEFLAGS & current->thread.v86mask);
+ return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
}
static inline int is_revectored(int nr, struct revectored_struct *bitmap)
{
- __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
- :"=r" (nr)
- :"m" (*bitmap), "r" (nr));
- return nr;
+ return test_bit(nr, bitmap->__map);
}
#define val_byte(val, n) (((__u8 *)&val)[n])
@@ -521,12 +496,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
{
unsigned long __user *intr_ptr;
unsigned long segoffs;
+ struct vm86 *vm86 = current->thread.vm86;
if (regs->pt.cs == BIOSSEG)
goto cannot_handle;
- if (is_revectored(i, &KVM86->int_revectored))
+ if (is_revectored(i, &vm86->int_revectored))
goto cannot_handle;
- if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+ if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
goto cannot_handle;
intr_ptr = (unsigned long __user *) (i << 2);
if (get_user(segoffs, intr_ptr))
@@ -545,22 +521,26 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
return;
cannot_handle:
- return_to_32bit(regs, VM86_INTx + (i << 8));
+ save_v86_state(regs, VM86_INTx + (i << 8));
}
int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
- if (VMPI.is_vm86pus) {
- if ((trapno == 3) || (trapno == 1))
- return_to_32bit(regs, VM86_TRAP + (trapno << 8));
+ struct vm86 *vm86 = current->thread.vm86;
+
+ if (vm86->vm86plus.is_vm86pus) {
+ if ((trapno == 3) || (trapno == 1)) {
+ save_v86_state(regs, VM86_TRAP + (trapno << 8));
+ return 0;
+ }
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
return 0;
}
if (trapno != 1)
 		return 1; /* we let this be handled by the calling routine */
- current->thread.trap_no = trapno;
+ current->thread.trap_nr = trapno;
current->thread.error_code = error_code;
- force_sig(SIGTRAP, current);
+ force_sig(SIGTRAP);
return 0;
}
@@ -571,16 +551,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
unsigned char __user *ssp;
unsigned short ip, sp, orig_flags;
int data32, pref_done;
+ struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
#define CHECK_IF_IN_TRAP \
- if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+ if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
- if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
- return_to_32bit(regs, VM86_PICRETURN); \
- if (orig_flags & X86_EFLAGS_TF) \
- handle_vm86_trap(regs, 0, 1); \
- return; } while (0)
orig_flags = *(unsigned short *)&regs->pt.flags;
@@ -619,7 +594,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
SP(regs) -= 2;
}
IP(regs) = ip;
- VM86_FAULT_RETURN;
+ goto vm86_fault_return;
/* popf */
case 0x9d:
@@ -639,16 +614,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
else
set_vflags_short(newflags, regs);
- VM86_FAULT_RETURN;
+ goto check_vip;
}
/* int xx */
case 0xcd: {
int intno = popb(csp, ip, simulate_sigsegv);
IP(regs) = ip;
- if (VMPI.vm86dbg_active) {
- if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
- return_to_32bit(regs, VM86_INTx + (intno << 8));
+ if (vmpi->vm86dbg_active) {
+ if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+ save_v86_state(regs, VM86_INTx + (intno << 8));
+ return;
+ }
}
do_int(regs, intno, ssp, sp);
return;
@@ -679,14 +656,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
} else {
set_vflags_short(newflags, regs);
}
- VM86_FAULT_RETURN;
+ goto check_vip;
}
/* cli */
case 0xfa:
IP(regs) = ip;
clear_IF(regs);
- VM86_FAULT_RETURN;
+ goto vm86_fault_return;
/* sti */
/*
@@ -698,14 +675,30 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
case 0xfb:
IP(regs) = ip;
set_IF(regs);
- VM86_FAULT_RETURN;
+ goto check_vip;
default:
- return_to_32bit(regs, VM86_UNKNOWN);
+ save_v86_state(regs, VM86_UNKNOWN);
}
return;
+check_vip:
+ if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) ==
+ (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) {
+ save_v86_state(regs, VM86_STI);
+ return;
+ }
+
+vm86_fault_return:
+ if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+ save_v86_state(regs, VM86_PICRETURN);
+ return;
+ }
+ if (orig_flags & X86_EFLAGS_TF)
+ handle_vm86_trap(regs, 0, X86_TRAP_DB);
+ return;
+
simulate_sigsegv:
/* FIXME: After a long discussion with Stas we finally
* agreed, that this is wrong. Here we should
@@ -717,7 +710,7 @@ simulate_sigsegv:
* should be a mixture of the two, but how do we
* get the information? [KD]
*/
- return_to_32bit(regs, VM86_UNKNOWN);
+ save_v86_state(regs, VM86_UNKNOWN);
}
/* ---------------- vm86 special IRQ passing stuff ----------------- */
diff --git a/arch/x86/kernel/vmcore_info_32.c b/arch/x86/kernel/vmcore_info_32.c
new file mode 100644
index 000000000000..5995a749288a
--- /dev/null
+++ b/arch/x86/kernel/vmcore_info_32.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/vmcore_info.h>
+#include <linux/pgtable.h>
+
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_X86_PAE
+ VMCOREINFO_CONFIG(X86_PAE);
+#endif
+}
diff --git a/arch/x86/kernel/vmcore_info_64.c b/arch/x86/kernel/vmcore_info_64.c
new file mode 100644
index 000000000000..0dec7d868754
--- /dev/null
+++ b/arch/x86/kernel/vmcore_info_64.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/vmcore_info.h>
+#include <linux/pgtable.h>
+
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+ u64 sme_mask = sme_me_mask;
+
+ VMCOREINFO_NUMBER(phys_base);
+ VMCOREINFO_SYMBOL(init_top_pgt);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
+
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_NUMBER(sme_mask);
+}
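
Both new vmcore_info files feed the vmcoreinfo ELF note that crash and makedumpfile parse. Each helper appends one "KEY(name)=value" text line, roughly as annotated below (my paraphrase of the macro behavior, not quoted from the headers):

VMCOREINFO_NUMBER(phys_base);	 /* appends "NUMBER(phys_base)=<decimal>\n"    */
VMCOREINFO_SYMBOL(init_top_pgt); /* appends "SYMBOL(init_top_pgt)=<hex addr>\n" */
VMCOREINFO_CONFIG(X86_PAE);	 /* appends "CONFIG_X86_PAE=y\n"                */
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
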
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/pgalloc.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
- (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
- (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
- void (*cpuid)(void /* non-c */);
- void (*_set_ldt)(u32 selector);
- void (*set_tr)(u32 selector);
- void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
- void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
- void (*set_kernel_stack)(u32 selector, u32 sp0);
- void (*allocate_page)(u32, u32, u32, u32, u32);
- void (*release_page)(u32, u32);
- void (*set_pte)(pte_t, pte_t *, unsigned);
- void (*update_pte)(pte_t *, unsigned);
- void (*set_linear_mapping)(int, void *, u32, u32);
- void (*_flush_tlb)(int);
- void (*set_initial_ap_state)(int, int);
- void (*halt)(void);
- void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP 0xe9
-#define MNEM_RET 0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE 5
-
-static inline void patch_offset(void *insnbuf,
- unsigned long ip, unsigned long dest)
-{
- *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
- unsigned long ip)
-{
- u64 reloc;
- struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
- switch(rel->type) {
- case VMI_RELOCATION_CALL_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_CALL;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_JUMP_REL:
- BUG_ON(len < 5);
- *(char *)insnbuf = MNEM_JMP;
- patch_offset(insnbuf, ip, (unsigned long)rel->eip);
- return 5;
-
- case VMI_RELOCATION_NOP:
- /* obliterate the whole thing */
- return 0;
-
- case VMI_RELOCATION_NONE:
- /* leave native code in place */
- break;
-
- default:
- BUG();
- }
- return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence. The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
- unsigned long ip, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- return patch_internal(VMI_CALL_DisableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- return patch_internal(VMI_CALL_EnableInterrupts, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return patch_internal(VMI_CALL_SetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- return patch_internal(VMI_CALL_GetInterruptMask, len,
- insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.iret):
- return patch_internal(VMI_CALL_IRET, len, insns, ip);
- case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
- return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
- default:
- break;
- }
- return len;
-}
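
patch_internal()/vmi_patch() above rewrite a paravirt call site in place with a 5-byte near CALL or JMP. The rel32 operand is computed as dest - ip - 5 because x86 relative branches are encoded relative to the address of the *next* instruction. A freestanding sketch of that encoding (hypothetical helper, same arithmetic as patch_offset()):

#include <stdint.h>
#include <string.h>

/* Emit "call dest" (0xE8 + rel32) into buf, as if placed at address ip. */
static unsigned int emit_call_rel32(uint8_t *buf, unsigned long ip,
				    unsigned long dest)
{
	int32_t rel = (int32_t)(dest - ip - 5);	/* relative to next insn */

	buf[0] = 0xE8;				/* MNEM_CALL */
	memcpy(buf + 1, &rel, sizeof(rel));
	return 5;				/* patched length */
}
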
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
- unsigned int *cx, unsigned int *dx)
-{
- int override = 0;
- if (*ax == 1)
- override = 1;
- asm volatile ("call *%6"
- : "=a" (*ax),
- "=b" (*bx),
- "=c" (*cx),
- "=d" (*dx)
- : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
- if (override) {
- if (disable_pse)
- *dx &= ~X86_FEATURE_PSE;
- if (disable_pge)
- *dx &= ~X86_FEATURE_PGE;
- if (disable_sep)
- *dx &= ~X86_FEATURE_SEP;
- if (disable_tsc)
- *dx &= ~X86_FEATURE_TSC;
- if (disable_mtrr)
- *dx &= ~X86_FEATURE_MTRR;
- }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
- if (gdt[nr].a != new->a || gdt[nr].b != new->b)
- write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
- vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
- unsigned cpu = smp_processor_id();
- struct desc_struct desc;
-
- pack_descriptor(&desc, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESC_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
- vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
- vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
- u32 *idt_entry = (u32 *)g;
- vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
- const void *desc, int type)
-{
- u32 *gdt_entry = (u32 *)desc;
- vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
- const void *desc)
-{
- u32 *ldt_entry = (u32 *)desc;
- vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
-{
- tss->x86_tss.sp0 = thread->sp0;
-
- /* This can only happen when SEP is enabled, no need to test "SEP"arately */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
- vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
- vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
- /*
- * This call comes in very early, before mem_map is setup.
- * It is called only for swapper_pg_dir, which already has
- * data on it.
- */
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
- vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
- vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags. We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush). We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs. We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
- (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user) \
- ((level) | (is_current_as(mm, user) ? \
- (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
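-/*
- * As a worked expansion of the macro above (the address is hypothetical):
- * updating a user PTE at 0x08048000 in the current address space gives
- *   vmi_flags_addr(mm, 0x08048000, VMI_PAGE_PT, 1)
- *     == VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (0x08048000 & VMI_PAGE_VA_MASK)
- * while the same update against a foreign mm collapses to plain
- * VMI_PAGE_PT, telling the hypervisor that no flush of the current
- * address space is implied.
- */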
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
- /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
- vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
- const pte_t pte = { .pte = pmdval.pmd };
-#else
- const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
- vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
- /*
- * XXX This is called from set_pmd_pte, but at both PT
- * and PD layers so the VMI_PAGE_PT flag is wrong. But
- * it is only called for large page mapping changes,
- * the Xen backend doesn't support large pages, and the
- * ESX backend doesn't depend on the flag.
- */
- set_64bit((unsigned long long *)ptep,pte_val(pteval));
- vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
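-/*
- * Under PAE a PTE is 64 bits wide on a 32-bit CPU, so the store above
- * must be a single atomic 8-byte write (set_64bit is built on
- * cmpxchg8b); a pair of 32-bit stores could expose a half-updated
- * entry to the hardware page-table walker.
- */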
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
- /* Um, eww */
- const pte_t pte = { .pte = pudval.pgd.pgd };
- vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
- const pte_t pte = { .pte = 0 };
- vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
- unsigned long start_esp)
-{
- struct vmi_ap_state ap;
-
- /* Default everything to zero. This is fine for most GPRs. */
- memset(&ap, 0, sizeof(struct vmi_ap_state));
-
- ap.gdtr_limit = GDT_SIZE - 1;
- ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
- ap.idtr_limit = IDT_ENTRIES * 8 - 1;
- ap.idtr_base = (unsigned long) idt_table;
-
- ap.ldtr = 0;
-
- ap.cs = __KERNEL_CS;
- ap.eip = (unsigned long) start_eip;
- ap.ss = __KERNEL_DS;
- ap.esp = (unsigned long) start_esp;
-
- ap.ds = __USER_DS;
- ap.es = __USER_DS;
- ap.fs = __KERNEL_PERCPU;
- ap.gs = __KERNEL_STACK_CANARY;
-
- ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
- /* efer should match BSP efer. */
- if (cpu_has_nx) {
- unsigned l, h;
- rdmsr(MSR_EFER, l, h);
- ap.efer = (unsigned long long) h << 32 | l;
- }
-#endif
-
- ap.cr3 = __pa(swapper_pg_dir);
- /* Protected mode, paging, AM, WP, NE, MP. */
- ap.cr0 = 0x80050023;
- ap.cr4 = mmu_cr4_features;
- vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
- paravirt_start_context_switch(prev);
- vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
- vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
- vmi_ops.set_lazy_mode(0);
- paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
- struct pci_header *pci;
- struct pnp_header *pnp;
- const char *manufacturer = "UNKNOWN";
- const char *product = "UNKNOWN";
- const char *license = "unspecified";
-
- if (rom->rom_signature != 0xaa55)
- return 0;
- if (rom->vrom_signature != VMI_SIGNATURE)
- return 0;
- if (rom->api_version_maj != VMI_API_REV_MAJOR ||
- rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
- printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
- rom->api_version_maj,
- rom->api_version_min);
- return 0;
- }
-
- /*
- * Relying on the VMI_SIGNATURE field is not 100% safe, so check
- * the PCI header and device type to make sure this is really a
- * VMI device.
- */
- if (!rom->pci_header_offs) {
- printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
- return 0;
- }
-
- pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
- if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
- pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
- /* Allow it to run anyway, but warn */
- printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
- }
-
- if (rom->pnp_header_offs) {
- pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
- if (pnp->manufacturer_offset)
- manufacturer = (const char *)rom+pnp->manufacturer_offset;
- if (pnp->product_offset)
- product = (const char *)rom+pnp->product_offset;
- }
-
- if (rom->license_offs)
- license = (char *)rom+rom->license_offs;
-
- printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
- manufacturer, product,
- rom->api_version_maj, rom->api_version_min,
- pci->rom_version_maj, pci->rom_version_min);
-
- /* Don't allow BSD/MIT here for now because we don't want to end up
- with any binary-only shim layers */
- if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
- printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
- license);
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
- unsigned long base;
-
- /* VMI ROM is in option ROM area, check signature */
- for (base = 0xC0000; base < 0xE0000; base += 2048) {
- struct vrom_header *romstart;
- romstart = (struct vrom_header *)isa_bus_to_virt(base);
- if (check_vmi_rom(romstart)) {
- vmi_rom = romstart;
- return 1;
- }
- }
- return 0;
-}
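-/*
- * The loop walks the legacy option-ROM window at the 2K granularity to
- * which expansion ROMs must be aligned; check_vmi_rom() then verifies
- * the 0xaa55 expansion-ROM signature and the VMI header before the
- * ROM is accepted.
- */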
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
- /* We must establish the lowmem mapping for MMU ops to work */
- if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
- reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
- if (rel->type == VMI_RELOCATION_CALL_REL)
- return (void *)rel->eip;
- else
- return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- if (rel->type == VMI_RELOCATION_CALL_REL) \
- opname = (void *)rel->eip; \
- else if (rel->type == VMI_RELOCATION_NOP) \
- opname = (void *)vmi_nop; \
- else if (rel->type != VMI_RELOCATION_NONE) \
- printk(KERN_WARNING "VMI: Unknown relocation " \
- "type %d for " #vmicall"\n",\
- rel->type); \
-} while (0)
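-/*
- * For instance, para_fill(pv_cpu_ops.clts, CLTS) looks up VMI_CALL_CLTS
- * in the ROM: a CALL_REL relocation installs the ROM's entry point as
- * the paravirt-op, a NOP relocation installs vmi_nop, and
- * VMI_RELOCATION_NONE leaves the native implementation in place.
- */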
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a translation stub. Ignore NOPs, since it is not clear
- * a NOP VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall) \
-do { \
- reloc = call_vrom_long_func(vmi_rom, get_reloc, \
- VMI_CALL_##vmicall); \
- BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
- if (rel->type == VMI_RELOCATION_CALL_REL) { \
- opname = wrapper; \
- vmi_ops.cache = (void *)rel->eip; \
- } \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
- short kernel_cs;
- u64 reloc;
- const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
- /*
- * Prevent page tables from being allocated in highmem, even if
- * CONFIG_HIGHPTE is enabled.
- */
- __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
- if (call_vrom_func(vmi_rom, vmi_init) != 0) {
- printk(KERN_ERR "VMI ROM failed to initialize!");
- return 0;
- }
- savesegment(cs, kernel_cs);
-
- pv_info.paravirt_enabled = 1;
- pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
- pv_info.name = "vmi [deprecated]";
-
- pv_init_ops.patch = vmi_patch;
-
- /*
- * Many of these operations are ABI compatible with VMI.
- * This means we can fill in the paravirt-ops with direct
- * pointers into the VMI ROM. If the calling convention for
- * these operations changes, this code needs to be updated.
- *
- * Exceptions
- * CPUID paravirt-op uses pointers, not the native ISA
- * halt has no VMI equivalent; all VMI halts are "safe"
- * no MSR support yet - just trap and emulate. VMI uses the
- * same ABI as the native ISA, but Linux wants exceptions
- * from bogus MSR read / write handled
- * rdpmc is not yet used in Linux
- */
-
- /* CPUID is special, so very special it gets wrapped like a present */
- para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
- para_fill(pv_cpu_ops.clts, CLTS);
- para_fill(pv_cpu_ops.get_debugreg, GetDR);
- para_fill(pv_cpu_ops.set_debugreg, SetDR);
- para_fill(pv_cpu_ops.read_cr0, GetCR0);
- para_fill(pv_mmu_ops.read_cr2, GetCR2);
- para_fill(pv_mmu_ops.read_cr3, GetCR3);
- para_fill(pv_cpu_ops.read_cr4, GetCR4);
- para_fill(pv_cpu_ops.write_cr0, SetCR0);
- para_fill(pv_mmu_ops.write_cr2, SetCR2);
- para_fill(pv_mmu_ops.write_cr3, SetCR3);
- para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
- para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
- para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
- para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
- para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
- para_fill(pv_cpu_ops.wbinvd, WBINVD);
- para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
- /* The following we emulate with trap and emulate for now */
- /* paravirt_ops.read_msr = vmi_rdmsr */
- /* paravirt_ops.write_msr = vmi_wrmsr */
- /* paravirt_ops.rdpmc = vmi_rdpmc */
-
- /* TR interface doesn't pass TR value, wrap */
- para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
- /* LDT is special, too */
- para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
- para_fill(pv_cpu_ops.load_gdt, SetGDT);
- para_fill(pv_cpu_ops.load_idt, SetIDT);
- para_fill(pv_cpu_ops.store_gdt, GetGDT);
- para_fill(pv_cpu_ops.store_idt, GetIDT);
- para_fill(pv_cpu_ops.store_tr, GetTR);
- pv_cpu_ops.load_tls = vmi_load_tls;
- para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
- write_ldt_entry, WriteLDTEntry);
- para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
- write_gdt_entry, WriteGDTEntry);
- para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
- write_idt_entry, WriteIDTEntry);
- para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
- para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
- para_fill(pv_cpu_ops.io_delay, IODelay);
-
- para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
- set_lazy_mode, SetLazyMode);
-
- para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
- set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
- set_lazy_mode, SetLazyMode);
-
- /* user and kernel flush are just handled with different flags to FlushTLB */
- para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
- para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
- para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
- /*
- * Until a standard flag format can be agreed on, we need to
- * implement these as wrappers in Linux. Get the VMI ROM
- * function pointers for the two backend calls.
- */
-#ifdef CONFIG_X86_PAE
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
- vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
- vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
- if (vmi_ops.set_pte) {
- pv_mmu_ops.set_pte = vmi_set_pte;
- pv_mmu_ops.set_pte_at = vmi_set_pte_at;
- pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
- pv_mmu_ops.set_pud = vmi_set_pud;
- pv_mmu_ops.pte_clear = vmi_pte_clear;
- pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
- }
-
- if (vmi_ops.update_pte) {
- pv_mmu_ops.pte_update = vmi_update_pte;
- pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
- }
-
- vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
- if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pte = vmi_allocate_pte;
- pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
- pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
- }
-
- vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
- if (vmi_ops.release_page) {
- pv_mmu_ops.release_pte = vmi_release_pte;
- pv_mmu_ops.release_pmd = vmi_release_pmd;
- pv_mmu_ops.pgd_free = vmi_pgd_free;
- }
-
- /* Set linear is needed in all cases */
- vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
- /*
- * These MUST always be patched. Don't support indirect jumps
- * through these operations, as the VMI interface may use either
- * a jump or a call to get to these operations, depending on
- * the backend. They are performance critical anyway, so requiring
- * a patch is not a big problem.
- */
- pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
- pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
- para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- para_fill(apic->read, APICRead);
- para_fill(apic->write, APICWrite);
-#endif
-
- /*
- * Check for VMI timer functionality by probing for a cycle frequency method
- */
- reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
- if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
- vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
- vmi_timer_ops.get_cycle_counter =
- vmi_get_function(VMI_CALL_GetCycleCounter);
- vmi_timer_ops.get_wallclock =
- vmi_get_function(VMI_CALL_GetWallclockTime);
- vmi_timer_ops.wallclock_updated =
- vmi_get_function(VMI_CALL_WallclockUpdated);
- vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
- vmi_timer_ops.cancel_alarm =
- vmi_get_function(VMI_CALL_CancelAlarm);
- x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
- x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
- x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
- pv_time_ops.sched_clock = vmi_sched_clock;
- x86_platform.calibrate_tsc = vmi_tsc_khz;
- x86_platform.get_wallclock = vmi_get_wallclock;
- x86_platform.set_wallclock = vmi_set_wallclock;
-
- /* We have true wallclock functions; disable CMOS clock sync */
- no_sync_cmos_clock = 1;
- } else {
- disable_noidle = 1;
- disable_vmi_timer = 1;
- }
-
- para_fill(pv_irq_ops.safe_halt, Halt);
-
- /*
- * Alternative instruction rewriting doesn't happen soon enough
- * to convert VMI_IRET to a call instead of a jump; so we have
- * to do this before IRQs get reenabled. Fortunately, it is
- * idempotent.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
- vmi_bringup();
-
- return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
- if (!vmi_rom)
- probe_vmi_rom();
- else
- check_vmi_rom(vmi_rom);
-
- /* In case probing for or validating the ROM failed, bail */
- if (!vmi_rom)
- return;
-
- reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
- /* This is virtual hardware; timer routing is wired correctly */
- no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
- unsigned long flags;
-
- if (!vmi_rom)
- return;
-
- local_irq_save(flags);
- activate_vmi();
- local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "disable_pge")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
- disable_pge = 1;
- } else if (!strcmp(arg, "disable_pse")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
- disable_pse = 1;
- } else if (!strcmp(arg, "disable_sep")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
- disable_sep = 1;
- } else if (!strcmp(arg, "disable_tsc")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
- disable_tsc = 1;
- } else if (!strcmp(arg, "disable_mtrr")) {
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
- disable_mtrr = 1;
- } else if (!strcmp(arg, "disable_timer")) {
- disable_vmi_timer = 1;
- disable_noidle = 1;
- } else if (!strcmp(arg, "disable_noidle"))
- disable_noidle = 1;
- return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
- /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
- * cycle counter. */
- return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
- unsigned long long wallclock;
- wallclock = vmi_timer_ops.get_wallclock(); // nsec
- (void)do_div(wallclock, 1000000000); // sec
-
- return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
- return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
- return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
- unsigned long long khz;
- khz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(khz, 1000);
- return khz;
-}
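-/*
- * Worked example (the frequency is assumed): a 2.4GHz cycle counter
- * returns 2,400,000,000 from get_cycle_frequency(); do_div() by 1000
- * leaves 2,400,000, the TSC rate in kHz that the caller expects.
- */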
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
- return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-
- return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
- unsigned long val = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
- ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
- .name = "VMI-LOCAL",
- .startup = startup_timer_irq,
- .mask = mask_timer_irq,
- .unmask = unmask_timer_irq,
- .ack = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0 0x00000000
-#define VMI_ALARM_WIRED_LVTT 0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
- return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
-{
- cycle_t now, cycles_per_hz;
- BUG_ON(!irqs_disabled());
-
- switch (mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_hz, HZ);
- now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
- vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- switch (evt->mode) {
- case CLOCK_EVT_MODE_ONESHOT:
- vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
- break;
- default:
- break;
- }
- break;
- default:
- break;
- }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- /* Unfortunately, the set_next_event interface only passes a relative
- * expiry, but we want an absolute expiry. It'd be better if we
- * were passed an absolute expiry, since a bunch of time may
- * have been stolen between the time the delta is computed and
- * when we set the alarm below. */
- cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
- BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
- vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
- return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
- .name = "vmi-timer",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .shift = 22,
- .set_mode = vmi_timer_set_mode,
- .set_next_event = vmi_timer_next_event,
- .rating = 1000,
- .irq = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
- struct clock_event_device *evt = &__get_cpu_var(local_events);
- evt->event_handler(evt);
- return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action = {
- .name = "vmi-timer",
- .handler = vmi_timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
- cycle_t cycles_per_msec;
- struct clock_event_device *evt;
-
- int cpu = smp_processor_id();
- evt = &__get_cpu_var(local_events);
-
- /* Use cycles_per_msec since div_sc params are 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- memcpy(evt, &vmi_clockevent, sizeof(*evt));
- /* Must pick .shift such that .mult fits in 32 bits. Choosing
- * .shift to be 22 allows up to 2^(32-22) cycles per nanosecond
- * before overflow. */
- evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
- /* Upper bound is clockevent's use of ulong for cycle deltas. */
- evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
- evt->min_delta_ns = clockevent_delta2ns(1, evt);
- evt->cpumask = cpumask_of(cpu);
-
- printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
- evt->name, evt->mult, evt->shift);
- clockevents_register_device(evt);
-}
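-/*
- * Sketch of the conversion (1GHz is an assumed frequency): with
- * cycles_per_msec = 1,000,000, div_sc(1000000, NSEC_PER_MSEC, 22)
- * yields mult = (1000000 << 22) / 1000000 = 1 << 22, so the
- * clockevents core converts cycles = (ns * mult) >> 22 = ns --
- * exactly one cycle per nanosecond, as expected at 1GHz.
- */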
-
-void __init vmi_time_init(void)
-{
- unsigned int cpu;
- /* Disable the PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
- outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
- vmi_time_init_clockevent();
- setup_irq(0, &vmi_clock_action);
- for_each_possible_cpu(cpu)
- per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
- /*
- * On APIC systems, we want local timers to fire on each cpu. We do
- * this by programming LVTT to deliver timer events to the IRQ handler
- * for IRQ-0, since we can't re-use the APIC local timer handler
- * without interfering with that code.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
- local_irq_disable();
-#ifdef CONFIG_SMP
- /*
- * XXX handle_percpu_irq only defined for SMP; we need to switch over
- * to using it, since this is a local interrupt, which each CPU must
- * handle individually without locking out or dropping simultaneous
- * local timers on other CPUs. We also don't want to trigger the
- * quirk workaround code for interrupts which gets invoked from
- * handle_percpu_irq via eoi, so we use our own IRQ chip.
- */
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
- set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
- vmi_wiring = VMI_ALARM_WIRED_LVTT;
- apic_write(APIC_LVTT, vmi_get_timer_vector());
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
- vmi_time_init_clockevent();
- apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
- cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
- return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
- .name = "vmi-timer",
- .rating = 450,
- .read = read_real_cycles,
- .mask = CLOCKSOURCE_MASK(64),
- .mult = 0, /* to be set */
- .shift = 22,
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
- cycle_t cycles_per_msec;
-
- if (!vmi_timer_ops.get_cycle_frequency)
- return 0;
- /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
- cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
- (void)do_div(cycles_per_msec, 1000);
-
- /* Note that clocksource.{mult, shift} converts in the opposite direction
- * from clockevents. */
- clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
- clocksource_vmi.shift);
-
- printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
- return clocksource_register(&clocksource_vmi);
-
-}
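-/*
- * Here mult converts the other way, ns = (cycles * mult) >> shift; for
- * the same assumed 1GHz counter, clocksource_khz2mult(1000000, 22) is
- * again 1 << 22 and the conversion is the identity.
- */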
-module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa3..d7af4a64c211 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* ld script for the x86 kernel
*
@@ -14,111 +15,158 @@
* put it inside the section definition.
*/
-#ifdef CONFIG_X86_32
-#define LOAD_OFFSET __PAGE_OFFSET
-#else
#define LOAD_OFFSET __START_KERNEL_map
-#endif
+
+#define RUNTIME_DISCARD_EXIT
+#define EMITS_PT_NOTE
+#define RO_EXCEPTION_TABLE_ALIGN 16
#include <asm-generic/vmlinux.lds.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
+#include <asm/kexec.h>
#undef i386 /* in case the preprocessor is a 32bit one */
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#ifdef CONFIG_X86_32
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
-jiffies = jiffies_64;
#else
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+jiffies = jiffies_64;
+const_current_task = current_task;
+const_cpu_current_top_of_stack = cpu_current_top_of_stack;
+
+#if defined(CONFIG_X86_64)
/*
- * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
- * we retain large page mappings for boundaries spanning kernel text, rodata
- * and data sections.
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
*
* However, kernel identity mappings will have different RWX permissions
* to the pages mapping to text and to the pages padding (which are freed) the
* text section. Hence kernel identity mappings will be broken to smaller
* pages. For 64-bit, kernel text and kernel identity mappings are different,
- * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
- * as well as retain 2MB large page mappings for kernel text.
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
*/
-#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
-#define X64_ALIGN_DEBUG_RODATA_END \
+#define X86_ALIGN_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
- __end_rodata_hpage_align = .;
+ __end_rodata_hpage_align = .; \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ __pi___start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+ __pi___end_bss_decrypted = .; \
#else
-#define X64_ALIGN_DEBUG_RODATA_BEGIN
-#define X64_ALIGN_DEBUG_RODATA_END
+#define X86_ALIGN_RODATA_BEGIN
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(PAGE_SIZE); \
+ __end_rodata_aligned = .;
-#endif
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
-#ifdef CONFIG_X86_64
- user PT_LOAD FLAGS(5); /* R_E */
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(6); /* RW_ */
#endif
- init PT_LOAD FLAGS(7); /* RWE */
+#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
+#define KEXEC_RELOCATE_KERNEL \
+ . = ALIGN(0x100); \
+ __relocate_kernel_start = .; \
+ *(.text..relocate_kernel); \
+ *(.data..relocate_kernel); \
+ __relocate_kernel_end = .;
+
+ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "relocate_kernel code too large!")
+#else
+#define KEXEC_RELOCATE_KERNEL
#endif
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(6); /* RW_ */
note PT_NOTE FLAGS(0); /* ___ */
}
SECTIONS
{
+ . = __START_KERNEL;
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
+ phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
+ phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
/* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
- /* bootstrapping code */
- HEAD_TEXT
-#ifdef CONFIG_X86_32
- . = ALIGN(PAGE_SIZE);
- *(.text..page_aligned)
-#endif
- . = ALIGN(8);
+ __pi__text = .;
_stext = .;
+ ALIGN_ENTRY_TEXT_BEGIN
+ *(.text..__x86.rethunk_untrain)
+ ENTRY_TEXT
+
+#ifdef CONFIG_MITIGATION_SRSO
+ /*
+ * See the comment above srso_alias_untrain_ret()'s
+ * definition.
+ */
+ . = srso_alias_untrain_ret | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20);
+ *(.text..__x86.rethunk_safe)
+#endif
+ ALIGN_ENTRY_TEXT_END
+
TEXT_TEXT
SCHED_TEXT
LOCK_TEXT
KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
+ SOFTIRQENTRY_TEXT
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ *(.text..__x86.indirect_thunk)
+ *(.text..__x86.return_thunk)
+#endif
+ STATIC_CALL_TEXT
*(.gnu.warning)
- /* End of text section */
- _etext = .;
- } :text = 0x9090
- NOTES :text :note
+ } :text = 0xcccccccc
- EXCEPTION_TABLE(16) :text = 0x9090
+ /* End of text section, which should occupy whole number of pages */
+ _etext = .;
+ . = ALIGN(PAGE_SIZE);
- X64_ALIGN_DEBUG_RODATA_BEGIN
+ X86_ALIGN_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
- X64_ALIGN_DEBUG_RODATA_END
+ X86_ALIGN_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -128,6 +176,9 @@ SECTIONS
/* init_task */
INIT_TASK_DATA(THREAD_SIZE)
+ /* equivalent to task_pt_regs(&init_task) */
+ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+
#ifdef CONFIG_X86_32
/* 32 bit has nosave before _edata */
NOSAVE_DATA
@@ -135,10 +186,13 @@ SECTIONS
PAGE_ALIGNED_DATA(PAGE_SIZE)
+ CACHE_HOT_DATA(L1_CACHE_BYTES)
+
CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
DATA_DATA
CONSTRUCTORS
+ KEXEC_RELOCATE_KERNEL
/* rarely changed data like cpu maps */
READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
@@ -147,108 +201,99 @@ SECTIONS
_edata = .;
} :data
-#ifdef CONFIG_X86_64
-
-#define VSYSCALL_ADDR (-10*1024*1024)
+ BUG_TABLE
-#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+ ORC_UNWIND_TABLE
-#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
- . = ALIGN(4096);
- __vsyscall_0 = .;
-
- . = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
- *(.vsyscall_0)
- } :user
-
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
- *(.vsyscall_fn)
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
}
- . = ALIGN(L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
- *(.vsyscall_gtod_data)
- }
+ INIT_TEXT_SECTION(PAGE_SIZE)
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
- *(.vsyscall_clock)
+ /*
+ * Section for code used exclusively before alternatives are run. All
+ * references to such code must be patched out by alternatives, normally
+ * by using X86_FEATURE_ALWAYS CPU feature bit.
+ *
+ * See static_cpu_has() for an example.
+ */
+ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
+ *(.altinstr_aux)
+ . = ALIGN(PAGE_SIZE);
+ __inittext_end = .;
}
- vsyscall_clock = VVIRT(.vsyscall_clock);
+ INIT_DATA_SECTION(16)
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
- *(.vsyscall_1)
- }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
- *(.vsyscall_2)
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
}
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
- *(.vgetcpu_mode)
+#ifdef CONFIG_X86_INTEL_MID
+ .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+ LOAD_OFFSET) {
+ __x86_intel_mid_dev_start = .;
+ *(.x86_intel_mid_dev.init)
+ __x86_intel_mid_dev_end = .;
}
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
+#endif
- . = ALIGN(L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) {
- *(.jiffies)
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ /*
+ * List of instructions that call/jmp/jcc to retpoline thunks
+ * __x86_indirect_thunk_*(). These instructions can be patched along
+ * with alternatives, after which the section can be freed.
+ */
+ . = ALIGN(8);
+ .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) {
+ __retpoline_sites = .;
+ *(.retpoline_sites)
+ __retpoline_sites_end = .;
}
- jiffies = VVIRT(.jiffies);
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
- *(.vsyscall_3)
+ . = ALIGN(8);
+ .return_sites : AT(ADDR(.return_sites) - LOAD_OFFSET) {
+ __return_sites = .;
+ *(.return_sites)
+ __return_sites_end = .;
}
- . = __vsyscall_0 + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
-#endif /* CONFIG_X86_64 */
-
- /* Init code and data - will be freed after init */
- . = ALIGN(PAGE_SIZE);
- .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
- __init_begin = .; /* paired with __init_end */
+ . = ALIGN(8);
+ .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) {
+ __call_sites = .;
+ *(.call_sites)
+ __call_sites_end = .;
}
-
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - .init.text - should
- * start another segment - init.
- */
- PERCPU_VADDR(0, :percpu)
-#endif
-
- INIT_TEXT_SECTION(PAGE_SIZE)
-#ifdef CONFIG_X86_64
- :init
#endif
- INIT_DATA_SECTION(16)
-
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
+#ifdef CONFIG_X86_KERNEL_IBT
+ . = ALIGN(8);
+ .ibt_endbr_seal : AT(ADDR(.ibt_endbr_seal) - LOAD_OFFSET) {
+ __ibt_endbr_seal = .;
+ *(.ibt_endbr_seal)
+ __ibt_endbr_seal_end = .;
}
+#endif
+#ifdef CONFIG_FINEIBT
. = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
+ .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
+ __cfi_sites = .;
+ *(.cfi_sites)
+ __cfi_sites_end = .;
}
+#endif
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ */
. = ALIGN(8);
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
__alt_instructions = .;
@@ -256,13 +301,26 @@ SECTIONS
__alt_instructions_end = .;
}
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ */
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
+ . = ALIGN(8);
+ .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+ __apicdrivers = .;
+ *(.apicdrivers);
+ __apicdrivers_end = .;
+ }
+
+ . = ALIGN(8);
/*
- * .exit.text is discard at runtime, not link time, to deal with
- * references from .altinstructions and .eh_frame
+ * .exit.text is discarded at runtime, not link time, to deal with
+ * references from .altinstructions
*/
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
EXIT_TEXT
@@ -272,9 +330,11 @@ SECTIONS
EXIT_DATA
}
-#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
- PERCPU(PAGE_SIZE)
-#endif
+ PERCPU_SECTION(L1_CACHE_BYTES)
+ ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large")
+
+ RUNTIME_CONST_VARIABLES
+ RUNTIME_CONST(ptr, USER_PTR_MAX)
. = ALIGN(PAGE_SIZE);
@@ -306,62 +366,171 @@ SECTIONS
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
*(.bss..page_aligned)
- *(.bss)
- . = ALIGN(4);
+ . = ALIGN(PAGE_SIZE);
+ *(BSS_MAIN)
+ BSS_DECRYPTED
+ . = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
+ /*
+ * The memory occupied from _text to here, __end_of_kernel_reserve, is
+ * automatically reserved in setup_arch(). Anything after here must be
+ * explicitly reserved using memblock_reserve() or it will be discarded
+ * and treated as available memory.
+ */
+ __end_of_kernel_reserve = .;
+
. = ALIGN(PAGE_SIZE);
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
+ *(.bss..brk) /* areas brk users have reserved */
__brk_limit = .;
}
+ . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
_end = .;
+ __pi__end = .;
- STABS_DEBUG
- DWARF_DEBUG
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Early scratch/workarea section: Lives outside of the kernel proper
+ * (_text - _end).
+ *
+ * Resides after _end because even though the .brk section is after
+ * __end_of_kernel_reserve, the .brk section is later reserved as a
+ * part of the kernel. Since it is located after __end_of_kernel_reserve
+ * it will be discarded and become part of the available memory. As
+ * such, it can only be used by very early boot code and must not be
+ * needed afterwards.
+ *
+ * Currently used by SME for performing in-place encryption of the
+ * kernel during boot. Resides on a 2MB boundary to simplify the
+ * pagetable setup used for SME in-place encryption.
+ */
+ . = ALIGN(HPAGE_SIZE);
+ .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) {
+ __init_scratch_begin = .;
+ *(.init.scratch)
+ . = ALIGN(HPAGE_SIZE);
+ __init_scratch_end = .;
+ }
+#endif
+
+ STABS_DEBUG
+ DWARF_DEBUG
+#ifdef CONFIG_PROPELLER_CLANG
+ .llvm_bb_addr_map : { *(.llvm_bb_addr_map) }
+#endif
+
+ ELF_DETAILS
- /* Sections to be discarded */
DISCARDS
- /DISCARD/ : { *(.eh_frame) }
-}
+ /*
+ * Make sure that the .got.plt is either completely empty or it
+ * contains only the lazy dispatch entries.
+ */
+ .got.plt (INFO) : { *(.got.plt) }
+ ASSERT(SIZEOF(.got.plt) == 0 ||
+#ifdef CONFIG_X86_64
+ SIZEOF(.got.plt) == 0x18,
+#else
+ SIZEOF(.got.plt) == 0xc,
+#endif
+ "Unexpected GOT/PLT entries detected!")
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .got : {
+ *(.got) *(.igot.*)
+ }
+ ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!")
+
+ .plt : {
+ *(.plt) *(.plt.*) *(.iplt)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+
+ .rel.dyn : {
+ *(.rel.*) *(.rel_*)
+ }
+ ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!")
+
+ .rela.dyn : {
+ *(.rela.*) *(.rela_*)
+ }
+ ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
+}
-#ifdef CONFIG_X86_32
/*
- * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
+ * this. Let's assume that nobody will be running a COMPILE_TEST kernel and
+ * let's assert that fuller build coverage is more valuable than being able to
+ * run a COMPILE_TEST kernel.
*/
-. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE");
-#else
+#ifndef CONFIG_COMPILE_TEST
/*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+#endif
+
+/* needed for Clang - see arch/x86/entry/entry.S */
+PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
+
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
+. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
+#endif
+
+#ifdef CONFIG_MITIGATION_SRSO
+. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
/*
- * Build-time check on the image size:
+ * GNU ld cannot do XOR until 2.41.
+ * https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=f6f78318fca803c4907fb8d7f6ded8295f1947b1
+ *
+ * LLVM lld cannot do XOR until lld-17.
+ * https://github.com/llvm/llvm-project/commit/fae96104d4378166cbe5c875ef8ed808a356f3fb
+ *
+ * Instead do: (A | B) - (A & B) in order to compute the XOR
+ * of the two function addresses:
*/
-. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE");
+. = ASSERT(((ABSOLUTE(srso_alias_untrain_ret) | srso_alias_safe_ret) -
+ (ABSOLUTE(srso_alias_untrain_ret) & srso_alias_safe_ret)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)),
+ "SRSO function pair won't alias");
+#endif
-#ifdef CONFIG_SMP
-. = ASSERT((irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline");
+. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart");
+. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array");
#endif
-#endif /* CONFIG_X86_32 */
+#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)
+. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline");
+#endif
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
+#endif /* CONFIG_X86_64 */
-. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big");
+/*
+ * The symbols below are referenced using relative relocations in the
+ * respective ELF notes. This produces build time constants that the
+ * linker will never mark as relocatable. (Using just ABSOLUTE() is not
+ * sufficient for that).
+ */
+#ifdef CONFIG_XEN_PV
+xen_elfnote_entry_value =
+ ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen);
+#endif
+#ifdef CONFIG_PVH
+xen_elfnote_phys32_entry_value =
+ ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET);
#endif
+#include "../boot/startup/exports.h"
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd483..73511332bb67 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* vSMPowered(tm) systems specific initialization
* Copyright (C) 2005 ScaleMP Inc.
*
- * Use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
* Ravikiran Thirumalai <kiran@scalemp.com>,
* Shai Fultheim <shai@scalemp.com>
* Paravirt ops integration: Glauber de Oliveira Costa <gcosta@redhat.com>,
@@ -15,6 +12,8 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/pci-direct.h>
@@ -22,65 +21,10 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
-#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
-/*
- * Interrupt control on vSMPowered systems:
- * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
- * and vice versa.
- */
-
-static unsigned long vsmp_save_fl(void)
-{
- unsigned long flags = native_save_fl();
-
- if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC))
- flags &= ~X86_EFLAGS_IF;
- return flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
+#define TOPOLOGY_REGISTER_OFFSET 0x10
-static void vsmp_restore_fl(unsigned long flags)
-{
- if (flags & X86_EFLAGS_IF)
- flags &= ~X86_EFLAGS_AC;
- else
- flags |= X86_EFLAGS_AC;
- native_restore_fl(flags);
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
-
-static void vsmp_irq_disable(void)
-{
- unsigned long flags = native_save_fl();
-
- native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
-
-static void vsmp_irq_enable(void)
-{
- unsigned long flags = native_save_fl();
-
- native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
-}
-PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
-
-static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
- unsigned long addr, unsigned len)
-{
- switch (type) {
- case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
- case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
- case PARAVIRT_PATCH(pv_irq_ops.save_fl):
- case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
- return paravirt_patch_default(type, clobbers, ibuf, addr, len);
- default:
- return native_patch(type, clobbers, ibuf, addr, len);
- }
-
-}
-
-static void __init set_vsmp_pv_ops(void)
+#ifdef CONFIG_PCI
+static void __init set_vsmp_ctl(void)
{
void __iomem *address;
unsigned int cap, ctl, cfg;
@@ -92,29 +36,25 @@ static void __init set_vsmp_pv_ops(void)
ctl = readl(address + 4);
printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
cap, ctl);
- if (cap & ctl & (1 << 4)) {
- /* Setup irq ops and turn on vSMP IRQ fastpath handling */
- pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
- pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
- pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
- pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
- pv_init_ops.patch = vsmp_patch;
-
- ctl &= ~(1 << 4);
- writel(ctl, address + 4);
- ctl = readl(address + 4);
- printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
+
+ /* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+ if (cap & ctl & BIT(8)) {
+ ctl &= ~BIT(8);
+
+#ifdef CONFIG_PROC_FS
+ /* Don't let users change irq affinity via procfs */
+ no_irq_affinity = 1;
+#endif
}
+#endif
+
+ writel(ctl, address + 4);
+ ctl = readl(address + 4);
+ pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
early_iounmap(address, 8);
}
-#else
-static void __init set_vsmp_pv_ops(void)
-{
-}
-#endif
-
-#ifdef CONFIG_PCI
static int is_vsmp = -1;
static void __init detect_vsmp_box(void)
@@ -130,7 +70,7 @@ static void __init detect_vsmp_box(void)
is_vsmp = 1;
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
if (is_vsmp != -1)
return is_vsmp;
@@ -144,17 +84,57 @@ int is_vsmp_box(void)
static void __init detect_vsmp_box(void)
{
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
return 0;
}
+static void __init set_vsmp_ctl(void)
+{
+}
#endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+ void __iomem *address;
+ unsigned int cfg, topology, node_shift, maxcpus;
+
+ /*
+ * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
+ * ones present in the first board, unless explicitly overridden by
+ * setup_max_cpus
+ */
+ if (setup_max_cpus != NR_CPUS)
+ return;
+
+ /* Read the vSMP Foundation topology register */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+ if (WARN_ON(!address))
+ return;
+
+ topology = readl(address);
+ node_shift = (topology >> 16) & 0x7;
+ if (!node_shift)
+ /* The value 0 should be decoded as 8 */
+ node_shift = 8;
+ maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+ pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+ maxcpus);
+ setup_max_cpus = maxcpus;
+ early_iounmap(address, 4);
+#endif
+}
+
void __init vsmp_init(void)
{
detect_vsmp_box();
if (!is_vsmp_box())
return;
- set_vsmp_pv_ops();
+ vsmp_cap_cpus();
+
+ set_vsmp_ctl();
return;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
deleted file mode 100644
index 1c0c6ab9c60f..000000000000
--- a/arch/x86/kernel/vsyscall_64.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Thanks to hpa@transmeta.com for some useful hints.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
- *
- * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
- * at virtual address -10Mbyte+1024bytes etc... There are at most 4
- * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
- * jumping out of line if necessary. We cannot add more with this
- * mechanism because older kernels won't return -ENOSYS.
- * If we want more than four we need a vDSO.
- *
- * Note: the concept clashes with user mode linux. If you use UML and
- * want per guest time just set the kernel.vsyscall64 sysctl to 0.
- */
-
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/seqlock.h>
-#include <linux/jiffies.h>
-#include <linux/sysctl.h>
-#include <linux/clocksource.h>
-#include <linux/getcpu.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/notifier.h>
-
-#include <asm/vsyscall.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/unistd.h>
-#include <asm/fixmap.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/segment.h>
-#include <asm/desc.h>
-#include <asm/topology.h>
-#include <asm/vgtod.h>
-
-#define __vsyscall(nr) \
- __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
-
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache-line ping-pong
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
-{
- .lock = SEQLOCK_UNLOCKED,
- .sysctl_enabled = 1,
-};
-
-void update_vsyscall_tz(void)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* sys_tz has changed */
- vsyscall_gtod_data.sys_tz = sys_tz;
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
- u32 mult)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
- /* copy vsyscall data */
- vsyscall_gtod_data.clock.vread = clock->vread;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
- write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
-/* RED-PEN may want to re-add seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
- *tz = __vsyscall_gtod_data.sys_tz;
-}
-
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
- int ret;
- asm volatile("syscall"
- : "=a" (ret)
- : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
- : __syscall_clobber );
- return ret;
-}
-
-static __always_inline long time_syscall(long *t)
-{
- long secs;
- asm volatile("syscall"
- : "=a" (secs)
- : "0" (__NR_time),"D" (t) : __syscall_clobber);
- return secs;
-}
-
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
- cycle_t now, base, mask, cycle_delta;
- unsigned seq;
- unsigned long mult, shift, nsec;
- cycle_t (*vread)(void);
- do {
- seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
- vread = __vsyscall_gtod_data.clock.vread;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
- gettimeofday(tv,NULL);
- return;
- }
-
- now = vread();
- base = __vsyscall_gtod_data.clock.cycle_last;
- mask = __vsyscall_gtod_data.clock.mask;
- mult = __vsyscall_gtod_data.clock.mult;
- shift = __vsyscall_gtod_data.clock.shift;
-
- tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
- nsec = __vsyscall_gtod_data.wall_time_nsec;
- } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
-
- /* calculate interval: */
- cycle_delta = (now - base) & mask;
- /* convert to nsecs: */
- nsec += (cycle_delta * mult) >> shift;
-
- while (nsec >= NSEC_PER_SEC) {
- tv->tv_sec += 1;
- nsec -= NSEC_PER_SEC;
- }
- tv->tv_usec = nsec / NSEC_PER_USEC;
-}
-
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
- if (tv)
- do_vgettimeofday(tv);
- if (tz)
- do_get_tz(tz);
- return 0;
-}
-
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
- struct timeval tv;
- time_t result;
- if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
- return time_syscall(t);
-
- vgettimeofday(&tv, NULL);
- result = tv.tv_sec;
- if (t)
- *t = result;
- return result;
-}
-
-/* Fast way to get the current CPU and node.
-   This helps to do per-node and per-CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
-
-   tcache must point to a two-element long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
- unsigned int p;
- unsigned long j = 0;
-
-	/* Fast cache - only recompute the value once per jiffy and avoid
-	   the relatively costly rdtscp/cpuid otherwise.
-	   This works because the scheduler usually keeps the process
-	   on the same CPU and this syscall doesn't guarantee its
-	   results anyway.
-	   We do this here because otherwise user space would do it on
-	   its own in a likely inferior way (no access to jiffies).
-	   If you don't like it, pass NULL. */
- if (tcache && tcache->blob[0] == (j = __jiffies)) {
- p = tcache->blob[1];
- } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
- /* Load per CPU data from RDTSCP */
- native_read_tscp(&p);
- } else {
- /* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
- }
- if (tcache) {
- tcache->blob[0] = j;
- tcache->blob[1] = p;
- }
- if (cpu)
- *cpu = p & 0xfff;
- if (node)
- *node = p >> 12;
- return 0;
-}
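
On the consuming side, userspace splits the packed word back into CPU and node exactly as the tail of vgetcpu() does. A hedged sketch; the struct mirrors the two-long blob described above, and its name is hypothetical:

#include <stdio.h>

/* Hypothetical mirror of the two-long cache described above:
 * blob[0] holds the jiffies stamp, blob[1] the packed cpu/node word. */
struct getcpu_cache_demo {
	unsigned long blob[2];
};

static void unpack(unsigned int p, unsigned int *cpu, unsigned int *node)
{
	*cpu  = p & 0xfff;	/* low 12 bits: CPU number  */
	*node = p >> 12;	/* remaining bits: NUMA node */
}

int main(void)
{
	struct getcpu_cache_demo cache = { .blob = { 42, (3u << 12) | 5u } };
	unsigned int cpu, node;

	unpack((unsigned int)cache.blob[1], &cpu, &node);
	printf("cpu=%u node=%u\n", cpu, node);	/* cpu=5 node=3 */
	return 0;
}
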
-
-static long __vsyscall(3) venosys_1(void)
-{
- return -ENOSYS;
-}
-
-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
- { .procname = "vsyscall64",
- .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec },
- {}
-};
-
-static ctl_table kernel_root_table2[] = {
- { .procname = "kernel", .mode = 0555,
- .child = kernel_table2 },
- {}
-};
-#endif
-
-/* Assume __initcall executes before all user space. Hopefully kmod
- doesn't violate that. We'll find out if it does. */
-static void __cpuinit vsyscall_set_cpu(int cpu)
-{
- unsigned long d;
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /* Store cpu number in limit so that it can be loaded quickly
- in user space in vgetcpu.
- 12 bits for the CPU and 8 bits for the node. */
- d = 0x0f40000000000ULL;
- d |= cpu;
- d |= (node & 0xf) << 12;
- d |= (node >> 4) << 48;
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
-}
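
The magic constant written into the GDT entry here hides the segment-limit encoding: a descriptor's 20-bit limit lives in bits 0-15 and 48-51 of the entry, and the lsl in vgetcpu() reassembles it. A round-trip sketch with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int cpu = 5, node = 19;	/* node > 15 exercises the split */
	uint64_t d = 0x0f40000000000ULL;	/* type/base bits of the entry   */
	unsigned int p;

	d |= cpu;				/* limit[11:0]  = cpu            */
	d |= (uint64_t)(node & 0xf) << 12;	/* limit[15:12] = node low bits  */
	d |= (uint64_t)(node >> 4) << 48;	/* limit[19:16] = node high bits */

	/* What LSL hands back: limit[15:0] | limit[19:16] << 16. */
	p = (unsigned int)(d & 0xffff) |
	    (unsigned int)((d >> 48) & 0xf) << 16;
	printf("cpu=%u node=%u\n", p & 0xfff, p >> 12);	/* cpu=5 node=19 */
	return 0;
}
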
-
-static void __cpuinit cpu_vsyscall_init(void *arg)
-{
- /* preemption should be already off */
- vsyscall_set_cpu(raw_smp_processor_id());
-}
-
-static int __cpuinit
-cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
-{
- long cpu = (long)arg;
- if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
- return NOTIFY_DONE;
-}
-
-void __init map_vsyscall(void)
-{
- extern char __vsyscall_0;
- unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
- /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
-}
-
-static int __init vsyscall_init(void)
-{
- BUG_ON(((unsigned long) &vgettimeofday !=
- VSYSCALL_ADDR(__NR_vgettimeofday)));
- BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
- BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
- BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
- register_sysctl_table(kernel_root_table2);
-#endif
- on_each_cpu(cpu_vsyscall_init, NULL, 1);
- /* notifier priority > KVM */
- hotcpu_notifier(cpu_vsyscall_notifier, 30);
- return 0;
-}
-
-__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
deleted file mode 100644
index 1b950d151e58..000000000000
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Exports for assembly files.
- All C exports should go in the respective C files. */
-
-#include <linux/module.h>
-#include <linux/smp.h>
-
-#include <net/checksum.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-#include <asm/ftrace.h>
-
-#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
-EXPORT_SYMBOL(mcount);
-#endif
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(copy_user_generic_string);
-EXPORT_SYMBOL(copy_user_generic_unrolled);
-EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(_copy_from_user);
-EXPORT_SYMBOL(_copy_to_user);
-
-EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * Export string functions. We normally rely on gcc builtins for most of these,
- * but gcc sometimes decides not to inline them.
- */
-#undef memcpy
-#undef memset
-#undef memmove
-
-extern void *memset(void *, int, __kernel_size_t);
-extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
-
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
-
-EXPORT_SYMBOL(empty_zero_page);
-#ifndef CONFIG_PARAVIRT
-EXPORT_SYMBOL(native_load_gs_index);
-#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index cd6da6bf3eca..0a2bbd674a6d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -3,28 +3,60 @@
*
 * For licensing details see kernel-base/COPYING
*/
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/ioport.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <asm/acpi.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
#include <asm/pci_x86.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
#include <asm/apic.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/time.h>
#include <asm/irq.h>
-#include <asm/pat.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+#include <asm/memtype.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
+#include <asm/mach_traps.h>
+#include <asm/irqdomain.h>
+#include <asm/realmode.h>
-void __cpuinit x86_init_noop(void) { }
+void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
-void __init x86_init_pgd_noop(pgd_t *unused) { }
-int __init iommu_init_noop(void) { return 0; }
-void iommu_shutdown_noop(void) { }
+static int __init iommu_init_noop(void) { return 0; }
+static void iommu_shutdown_noop(void) { }
+bool __init bool_x86_init_noop(void) { return false; }
+void x86_op_int_noop(int cpu) { }
+int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
+void get_rtc_noop(struct timespec64 *now) { }
+
+static __initconst const struct of_device_id of_cmos_match[] = {
+ { .compatible = "motorola,mc146818" },
+ {}
+};
+
+/*
+ * Allow devicetree configured systems to disable the RTC by setting the
+ * corresponding DT node's status property to disabled. Code is optimized
+ * out for CONFIG_OF=n builds.
+ */
+static __init void x86_wallclock_init(void)
+{
+ struct device_node *node = of_find_matching_node(NULL, of_cmos_match);
+
+ if (node && !of_device_is_available(node)) {
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_platform.set_wallclock = set_rtc_noop;
+ }
+}
/*
* The platform setup functions are preset with the default functions
@@ -33,25 +65,25 @@ void iommu_shutdown_noop(void) { }
struct x86_init_ops x86_init __initdata = {
.resources = {
- .probe_roms = x86_init_noop,
+ .probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
- .memory_setup = default_machine_specific_memory_setup,
+ .memory_setup = e820__memory_setup_default,
+ .dmi_setup = dmi_setup,
},
.mpparse = {
- .mpc_record = x86_init_uint_noop,
.setup_ioapic_ids = x86_init_noop,
- .mpc_apic_id = default_mpc_apic_id,
- .smp_read_mpc_oem = default_smp_read_mpc_oem,
- .mpc_oem_bus_info = default_mpc_oem_bus_info,
- .find_smp_config = default_find_smp_config,
- .get_smp_config = default_get_smp_config,
+ .find_mptable = mpparse_find_mptable,
+ .early_parse_smp_cfg = mpparse_parse_early_smp_config,
+ .parse_smp_cfg = mpparse_parse_smp_config,
},
.irqs = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
- .trap_init = x86_init_noop,
+ .intr_mode_select = apic_intr_mode_select,
+ .intr_mode_init = apic_intr_mode_init,
+ .create_pci_msi_domain = native_create_pci_msi_domain,
},
.oem = {
@@ -60,14 +92,13 @@ struct x86_init_ops x86_init __initdata = {
},
.paging = {
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
+ .pagetable_init = native_pagetable_init,
},
.timers = {
.setup_percpu_clockev = setup_boot_APIC_clock,
- .tsc_pre_init = x86_init_noop,
.timer_init = hpet_time_init,
+ .wallclock_init = x86_wallclock_init,
},
.iommu = {
@@ -79,23 +110,68 @@ struct x86_init_ops x86_init __initdata = {
.init_irq = x86_default_pci_init_irq,
.fixup_irqs = x86_default_pci_fixup_irqs,
},
+
+ .hyper = {
+ .init_platform = x86_init_noop,
+ .guest_late_init = x86_init_noop,
+ .x2apic_available = bool_x86_init_noop,
+ .msi_ext_dest_id = bool_x86_init_noop,
+ .init_mem_mapping = x86_init_noop,
+ .init_after_bootmem = x86_init_noop,
+ },
+
+ .acpi = {
+ .set_root_pointer = x86_default_set_root_pointer,
+ .get_root_pointer = x86_default_get_root_pointer,
+ .reduced_hw_early_init = acpi_generic_reduced_hw_init,
+ },
};
-struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+struct x86_cpuinit_ops x86_cpuinit = {
+ .early_percpu_clock_init = x86_init_noop,
.setup_percpu_clockev = setup_secondary_APIC_clock,
+ .parallel_bringup = true,
};
static void default_nmi_init(void) { };
-static int default_i8042_detect(void) { return 1; };
-struct x86_platform_ops x86_platform = {
+static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static bool enc_tlb_flush_required_noop(bool enc) { return false; }
+static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(void) {}
+static void enc_kexec_finish_noop(void) {}
+static bool is_private_mmio_noop(u64 addr) {return false; }
+
+struct x86_platform_ops x86_platform __ro_after_init = {
+ .calibrate_cpu = native_calibrate_cpu_early,
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
- .set_wallclock = mach_set_rtc_mmss,
+ .set_wallclock = mach_set_cmos_time,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
- .i8042_detect = default_i8042_detect
+ .get_nmi_reason = default_get_nmi_reason,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .realmode_reserve = reserve_real_mode,
+ .realmode_init = init_real_mode,
+ .hyper.pin_vcpu = x86_op_int_noop,
+ .hyper.is_private_mmio = is_private_mmio_noop,
+
+ .guest = {
+ .enc_status_change_prepare = enc_status_change_prepare_noop,
+ .enc_status_change_finish = enc_status_change_finish_noop,
+ .enc_tlb_flush_required = enc_tlb_flush_required_noop,
+ .enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
+ },
};
EXPORT_SYMBOL_GPL(x86_platform);
+
+struct x86_apic_ops x86_apic_ops __ro_after_init = {
+ .io_apic_read = native_io_apic_read,
+ .restore = native_restore_boot_irq_mode,
+};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
deleted file mode 100644
index 37e68fc5e24a..000000000000
--- a/arch/x86/kernel/xsave.c
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- * xsave/xrstor support.
- *
- * Author: Suresh Siddha <suresh.b.siddha@intel.com>
- */
-#include <linux/bootmem.h>
-#include <linux/compat.h>
-#include <asm/i387.h>
-#ifdef CONFIG_IA32_EMULATION
-#include <asm/sigcontext32.h>
-#endif
-#include <asm/xcr.h>
-
-/*
- * Supported feature mask by the CPU and the kernel.
- */
-u64 pcntxt_mask;
-
-struct _fpx_sw_bytes fx_sw_reserved;
-#ifdef CONFIG_IA32_EMULATION
-struct _fpx_sw_bytes fx_sw_reserved_ia32;
-#endif
-
-/*
- * Check for the presence of extended state information in the
- * user fpstate pointer in the sigcontext.
- */
-int check_for_xstate(struct i387_fxsave_struct __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *fx_sw_user)
-{
- int min_xstate_size = sizeof(struct i387_fxsave_struct) +
- sizeof(struct xsave_hdr_struct);
- unsigned int magic2;
- int err;
-
- err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
- sizeof(struct _fpx_sw_bytes));
-
- if (err)
- return err;
-
- /*
- * First Magic check failed.
- */
- if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -1;
-
- /*
- * Check for error scenarios.
- */
- if (fx_sw_user->xstate_size < min_xstate_size ||
- fx_sw_user->xstate_size > xstate_size ||
- fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -1;
-
- err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
- fx_sw_user->extended_size -
- FP_XSTATE_MAGIC2_SIZE));
- /*
- * Check for the presence of second magic word at the end of memory
- * layout. This detects the case where the user just copied the legacy
-	 * fpstate layout without copying the extended state information
- * in the memory layout.
- */
- if (err || magic2 != FP_XSTATE_MAGIC2)
- return -1;
-
- return 0;
-}
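
The two magic words bracket the extended region: magic1 sits in the sw_reserved bytes of the legacy fxsave area, magic2 at the very end of the extended layout. A sketch of the size arithmetic validated above, with illustrative rather than ABI-exact sizes:

#include <stdio.h>

#define FP_XSTATE_MAGIC2_SIZE 4u	/* trailing magic word */

int main(void)
{
	/* Hypothetical sizes standing in for the runtime values used
	 * by check_for_xstate() and prepare_fx_sw_frame(). */
	unsigned int fxsave_size = 512;		/* i387_fxsave_struct */
	unsigned int xsave_hdr_size = 64;	/* xsave_hdr_struct   */
	unsigned int xstate_size = 832;		/* e.g. FP+SSE+AVX    */

	unsigned int min_xstate_size = fxsave_size + xsave_hdr_size;
	/* extended_size covers everything up to and including magic2. */
	unsigned int extended_size = xstate_size + FP_XSTATE_MAGIC2_SIZE;

	printf("valid xstate_size range: [%u, %u], magic2 at offset %u\n",
	       min_xstate_size, xstate_size,
	       extended_size - FP_XSTATE_MAGIC2_SIZE);
	return 0;
}
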
-
-#ifdef CONFIG_X86_64
-/*
- * Signal frame handlers.
- */
-
-int save_i387_xstate(void __user *buf)
-{
- struct task_struct *tsk = current;
- int err = 0;
-
- if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
- return -EACCES;
-
- BUG_ON(sig_xstate_size < xstate_size);
-
- if ((unsigned long)buf % 64)
- printk("save_i387_xstate: bad fpstate %p\n", buf);
-
- if (!used_math())
- return 0;
-
- if (task_thread_info(tsk)->status & TS_USEDFPU) {
- /*
- * Start with clearing the user buffer. This will present a
- * clean context for the bytes not touched by the fxsave/xsave.
- */
- err = __clear_user(buf, sig_xstate_size);
- if (err)
- return err;
-
- if (use_xsave())
- err = xsave_user(buf);
- else
- err = fxsave_user(buf);
-
- if (err)
- return err;
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
- stts();
- } else {
- if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
- xstate_size))
- return -1;
- }
-
- clear_used_math(); /* trigger finit */
-
- if (use_xsave()) {
- struct _fpstate __user *fx = buf;
- struct _xstate __user *x = buf;
- u64 xstate_bv;
-
- err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
-
- err |= __put_user(FP_XSTATE_MAGIC2,
- (__u32 __user *) (buf + sig_xstate_size
- - FP_XSTATE_MAGIC2_SIZE));
-
- /*
- * Read the xstate_bv which we copied (directly from the cpu or
- * from the state in task struct) to the user buffers and
- * set the FP/SSE bits.
- */
- err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
-
- /*
-		 * For legacy compatibility, we always set the FP/SSE bits in
-		 * the bit vector while saving the state to the user context.
-		 * This lets us capture any changes (during sigreturn) to the
-		 * FP/SSE bits made by legacy applications which don't touch
-		 * xstate_bv in the xsave header.
-		 *
-		 * xsave-aware apps can change the xstate_bv in the xsave header
-		 * as well as change any contents in the memory layout. xrstor
-		 * as part of sigreturn will capture all the changes.
- */
- xstate_bv |= XSTATE_FPSSE;
-
- err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
-
- if (err)
- return err;
- }
-
- return 1;
-}
-
-/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE
- * state.
- */
-static int restore_user_xstate(void __user *buf)
-{
- struct _fpx_sw_bytes fx_sw_user;
- u64 mask;
- int err;
-
- if (((unsigned long)buf % 64) ||
- check_for_xstate(buf, buf, &fx_sw_user))
- goto fx_only;
-
- mask = fx_sw_user.xstate_bv;
-
- /*
- * restore the state passed by the user.
- */
- err = xrestore_user(buf, mask);
- if (err)
- return err;
-
- /*
- * init the state skipped by the user.
- */
- mask = pcntxt_mask & ~mask;
-
- xrstor_state(init_xstate_buf, mask);
-
- return 0;
-
-fx_only:
- /*
- * couldn't find the extended state information in the
- * memory layout. Restore just the FP/SSE and init all
- * the other extended state.
- */
- xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
- return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
-}
-
-/*
- * This restores directly out of user space. Exceptions are handled.
- */
-int restore_i387_xstate(void __user *buf)
-{
- struct task_struct *tsk = current;
- int err = 0;
-
- if (!buf) {
- if (used_math())
- goto clear;
- return 0;
- } else
- if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
- return -EACCES;
-
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
-
- if (!(task_thread_info(current)->status & TS_USEDFPU)) {
- clts();
- task_thread_info(current)->status |= TS_USEDFPU;
- }
- if (use_xsave())
- err = restore_user_xstate(buf);
- else
- err = fxrstor_checking((__force struct i387_fxsave_struct *)
- buf);
- if (unlikely(err)) {
- /*
- * Encountered an error while doing the restore from the
- * user buffer, clear the fpu state.
- */
-clear:
- clear_fpu(tsk);
- clear_used_math();
- }
- return err;
-}
-#endif
-
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed to by the fpstate pointer in the sigcontext.
- * This will be saved whenever the FP and extended state context is
- * saved on the user stack during signal handler delivery to the user.
- */
-static void prepare_fx_sw_frame(void)
-{
- int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
- FP_XSTATE_MAGIC2_SIZE;
-
- sig_xstate_size = sizeof(struct _fpstate) + size_extended;
-
-#ifdef CONFIG_IA32_EMULATION
- sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
-#endif
-
- memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
-
- fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
- fx_sw_reserved.extended_size = sig_xstate_size;
- fx_sw_reserved.xstate_bv = pcntxt_mask;
- fx_sw_reserved.xstate_size = xstate_size;
-#ifdef CONFIG_IA32_EMULATION
- memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
- sizeof(struct _fpx_sw_bytes));
- fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
-#endif
-}
-
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
-#ifdef CONFIG_X86_64
-unsigned int sig_xstate_size = sizeof(struct _fpstate);
-#endif
-
-/*
- * Enable the extended processor state save/restore feature
- */
-void __cpuinit xsave_init(void)
-{
- if (!cpu_has_xsave)
- return;
-
- set_in_cr4(X86_CR4_OSXSAVE);
-
- /*
- * Enable all the features that the HW is capable of
- * and the Linux kernel is aware of.
- */
- xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-}
-
-/*
- * setup the xstate image representing the init state
- */
-static void __init setup_xstate_init(void)
-{
- init_xstate_buf = alloc_bootmem(xstate_size);
- init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
-}
-
-/*
- * Enable and initialize the xsave feature.
- */
-void __ref xsave_cntxt_init(void)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
- pcntxt_mask = eax + ((u64)edx << 32);
-
- if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
- printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
- pcntxt_mask);
- BUG();
- }
-
- /*
- * Support only the state known to OS.
- */
- pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
- xsave_init();
-
- /*
- * Recompute the context size for enabled features
- */
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
-
- update_regset_xstate_info(xstate_size, pcntxt_mask);
- prepare_fx_sw_frame();
-
- setup_xstate_init();
-
- printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
- "cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
-}
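
The leaf 0xD enumeration done by xsave_cntxt_init() can be reproduced from userspace with the compiler's cpuid helper. A sketch, x86-only and assuming an XSAVE-capable CPU:

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	uint64_t xfeature_mask;

	/* Leaf 0xD, subleaf 0: EDX:EAX is the supported xfeature mask,
	 * EBX the XSAVE area size for the currently enabled features,
	 * ECX the maximum size across all supported features. */
	if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	xfeature_mask = ((uint64_t)edx << 32) | eax;
	printf("xfeatures=0x%llx cur_size=%u max_size=%u\n",
	       (unsigned long long)xfeature_mask, ebx, ecx);
	return 0;
}
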
diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore
new file mode 100644
index 000000000000..615d6ff35c00
--- /dev/null
+++ b/arch/x86/kvm/.gitignore
@@ -0,0 +1,2 @@
+/kvm-asm-offsets.s
+/kvm-asm-offsets.h
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..278f08194ec8 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# KVM configuration
#
@@ -6,9 +7,8 @@ source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
- depends on HAVE_KVM || X86
default y
- ---help---
+ help
Say Y here to get to see options for using your Linux host to run other
operating systems inside virtual machines (guests).
This option alone does not add any kernel code.
@@ -17,20 +17,42 @@ menuconfig VIRTUALIZATION
if VIRTUALIZATION
-config KVM
- tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
- # for device assignment:
- depends on PCI
- select PREEMPT_NOTIFIERS
- select MMU_NOTIFIER
- select ANON_INODES
+config KVM_X86
+ def_tristate KVM if (KVM_INTEL != n || KVM_AMD != n)
+ select KVM_COMMON
+ select KVM_GENERIC_MMU_NOTIFIER
+ select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+ select KVM_MMU_LOCKLESS_AGING
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_EVENTFD
- select KVM_APIC_ARCHITECTURE
+ select HAVE_KVM_PFNCACHE
+ select HAVE_KVM_DIRTY_RING_TSO
+ select HAVE_KVM_DIRTY_RING_ACQ_REL
+ select HAVE_KVM_IRQ_BYPASS
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_READONLY_MEM
+ select VHOST_TASK
+ select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
- ---help---
+ select SCHED_INFO
+ select PERF_EVENTS
+ select GUEST_PERF_EVENTS
+ select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select HAVE_KVM_NO_POLL
+ select VIRT_XFER_TO_GUEST_WORK
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_VFIO
+ select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_PRE_FAULT_MEMORY
+ select KVM_WERROR if WERROR
+ select KVM_GUEST_MEMFD if X86_64
+
+config KVM
+ tristate "Kernel-based Virtual Machine (KVM) support"
+ depends on X86_LOCAL_APIC
+ help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
processor equipped with virtualization extensions. You will also
@@ -44,30 +66,178 @@ config KVM
If unsure, say N.
+config KVM_WERROR
+ bool "Compile KVM with -Werror"
+ # Disallow KVM's -Werror if KASAN is enabled, e.g. to guard against
+ # randomized configs from selecting KVM_WERROR=y, which doesn't play
+	# nice with KASAN. KASAN builds generate warnings for the default
+ # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
+ # Building KVM with -Werror and KASAN is still doable via enabling
+ # the kernel-wide WERROR=y.
+ depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR)
+ help
+ Add -Werror to the build flags for KVM.
+
+ If in doubt, say "N".
+
+config KVM_SW_PROTECTED_VM
+ bool "Enable support for KVM software-protected VMs"
+ depends on EXPERT
+ depends on KVM_X86 && X86_64
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ help
+ Enable support for KVM software-protected VMs. Currently, software-
+ protected VMs are purely a development and testing vehicle for
+ KVM_CREATE_GUEST_MEMFD. Attempting to run a "real" VM workload as a
+ software-protected VM will fail miserably.
+
+ If unsure, say "N".
+
config KVM_INTEL
- tristate "KVM for Intel processors support"
- depends on KVM
- ---help---
- Provides support for KVM on Intel processors equipped with the VT
- extensions.
+ tristate "KVM for Intel (and compatible) processors support"
+ depends on KVM && IA32_FEAT_CTL
+ select X86_FRED if X86_64
+ help
+ Provides support for KVM on processors equipped with Intel's VT
+ extensions, a.k.a. Virtual Machine Extensions (VMX).
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_PROVE_VE
+ bool "Check that guests do not receive #VE exceptions"
+ depends on KVM_INTEL && EXPERT
+ help
+ Checks that KVM's page table management code will not incorrectly
+ let guests receive a virtualization exception. Virtualization
+ exceptions will be trapped by the hypervisor rather than injected
+ in the guest.
+
+ Note: some CPUs appear to generate spurious EPT Violations #VEs
+ that trigger KVM's WARN, in particular with eptad=0 and/or nested
+ virtualization.
+
+ If unsure, say N.
+
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+	help
+	  Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
+config KVM_INTEL_TDX
+ bool "Intel Trust Domain Extensions (TDX) support"
+ default y
+ depends on INTEL_TDX_HOST
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select HAVE_KVM_ARCH_GMEM_POPULATE
+ help
+ Provides support for launching Intel Trust Domain Extensions (TDX)
+ confidential VMs on Intel processors.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
- depends on KVM
- ---help---
+ depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)
+ help
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
To compile this as a module, choose M here: the module
will be called kvm-amd.
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
-source drivers/virtio/Kconfig
+config KVM_AMD_SEV
+ bool "AMD Secure Encrypted Virtualization (SEV) support"
+ default y
+ depends on KVM_AMD && X86_64
+ depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
+ select ARCH_HAS_CC_PLATFORM
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select HAVE_KVM_ARCH_GMEM_PREPARE
+ select HAVE_KVM_ARCH_GMEM_INVALIDATE
+ select HAVE_KVM_ARCH_GMEM_POPULATE
+ help
+ Provides support for launching encrypted VMs which use Secure
+ Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
+ Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
+ Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM_X86
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
+config KVM_SMM
+ bool "System Management Mode emulation"
+ default y
+ depends on KVM_X86
+ help
+ Provides support for KVM to emulate System Management Mode (SMM)
+ in virtual machines. This can be used by the virtual machine
+ firmware to implement UEFI secure boot.
+
+ If unsure, say Y.
+
+config KVM_HYPERV
+ bool "Support for Microsoft Hyper-V emulation"
+ depends on KVM_X86
+ default y
+ help
+ Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
+ to expose a subset of the paravirtualized interfaces defined in the
+ Hyper-V Hypervisor Top-Level Functional Specification (TLFS):
+ https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
+ These interfaces are required for the correct and performant functioning
+ of Windows and Hyper-V guests on KVM.
+
+ If unsure, say "Y".
+
+config KVM_XEN
+ bool "Support for Xen hypercall interface"
+ depends on KVM_X86
+ help
+	  Provides KVM support for hosting Xen HVM guests and
+ passing Xen hypercalls to userspace.
+
+ If in doubt, say "N".
+
+config KVM_PROVE_MMU
+ bool "Prove KVM MMU correctness"
+ depends on DEBUG_KERNEL
+ depends on KVM_X86
+ depends on EXPERT
+ help
+ Enables runtime assertions in KVM's MMU that are too costly to enable
+ in anything remotely resembling a production environment, e.g. this
+ gates code that verifies a to-be-freed page table doesn't have any
+ present SPTEs.
+
+ If in doubt, say "N".
+
+config KVM_EXTERNAL_WRITE_TRACKING
+ bool
+
+config KVM_MAX_NR_VCPUS
+ int "Maximum number of vCPUs per KVM guest"
+ depends on KVM_X86
+ range 1024 4096
+ default 4096 if MAXSMP
+ default 1024
+ help
+ Set the maximum number of vCPUs per KVM guest. Larger values will increase
+ the memory footprint of each KVM guest, regardless of how many vCPUs are
+ created for a given VM.
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,20 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -I $(srctree)/arch/x86/kvm
+ccflags-$(CONFIG_KVM_WERROR) += -Werror
-CFLAGS_x86.o := -I.
-CFLAGS_svm.o := -I.
-CFLAGS_vmx.o := -I.
+include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
- coalesced_mmio.o irq_comm.o eventfd.o \
- assigned-dev.o)
-kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
-kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
-kvm-intel-y += vmx.o
-kvm-amd-y += svm.o
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
+kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
+kvm-$(CONFIG_KVM_XEN) += xen.o
+kvm-$(CONFIG_KVM_SMM) += smm.o
-obj-$(CONFIG_KVM) += kvm.o
+kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
+ vmx/nested.o vmx/posted_intr.o vmx/main.o
+
+kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
+kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
+kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o
+
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o
+
+kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o
+kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
+
+ifdef CONFIG_HYPERV
+kvm-y += kvm_onhyperv.o
+kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o
+kvm-amd-y += svm/svm_onhyperv.o
+endif
+
+obj-$(CONFIG_KVM_X86) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
+
+AFLAGS_svm/vmenter.o := -iquote $(obj)
+$(obj)/svm/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+AFLAGS_vmx/vmenter.o := -iquote $(obj)
+$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h
+
+$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE
+ $(call filechk,offsets,__KVM_ASM_OFFSETS_H__)
+
+targets += kvm-asm-offsets.s
+clean-files += kvm-asm-offsets.h
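
kvm-asm-offsets.h is produced with the kernel's usual asm-offsets trick: compile a C file to assembly, emit each constant as a magic .ascii marker, and let the filechk,offsets rule scrape the markers into #defines. A minimal standalone rendition of the idea; the DEFINE macro mirrors include/linux/kbuild.h, and the struct is a made-up stand-in:

#include <stddef.h>

/* A stand-in structure; in KVM's case the offsets come from vcpu_svm
 * and vcpu_vmx so that vmenter.S can address fields directly. */
struct vcpu_demo {
	int mode;
	unsigned long regs[16];
};

/* Emits "->SYM $val" into the generated assembly, which the
 * filechk,offsets rule rewrites as "#define SYM val". */
#define DEFINE(sym, val) \
	asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

void common(void)
{
	DEFINE(VCPU_DEMO_REGS, offsetof(struct vcpu_demo, regs));
}

Compiling this with cc -S and grepping the .s output for "->" shows the scraped constant.
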
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..d563a948318b
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,2107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "linux/lockdep.h"
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/sched/stat.h>
+
+#include <asm/processor.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include <asm/sgx.h>
+#include <asm/cpuid/api.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+#include "pmu.h"
+#include "xen.h"
+
+/*
+ * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
+ * aligned to sizeof(unsigned long) because it's not accessed via bitops.
+ */
+u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps);
+
+struct cpuid_xstate_sizes {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+};
+
+static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init;
+
+void __init kvm_init_xstate_sizes(void)
+{
+ u32 ign;
+ int i;
+
+ for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) {
+ struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+
+ cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign);
+ }
+}
+
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
+{
+ u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+ int i;
+
+ xstate_bv &= XFEATURE_MASK_EXTEND;
+ for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) {
+ struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+ u32 offset;
+
+ if (!(xstate_bv & BIT_ULL(i)))
+ continue;
+
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = xs->ebx;
+ ret = max(ret, offset + xs->eax);
+ xstate_bv &= ~BIT_ULL(i);
+ }
+
+ return ret;
+}
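
In the compacted format each enabled component is packed immediately after the previous one (64-byte aligned when ECX[1] is set for that component), while the standard format uses fixed per-component offsets, so compaction usually yields a smaller buffer. A worked sketch of the loop above with made-up component sizes:

#include <stdio.h>

#define ALIGN64(x) (((x) + 63u) & ~63u)

int main(void)
{
	/* Hypothetical components past the 576-byte legacy+header region
	 * (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET): {size, fixed offset,
	 * 64-byte alignment flag for the compacted form}. The first row
	 * matches YMM state; the second is an invented later feature. */
	struct { unsigned int size, offset, align64; } comp[] = {
		{ 256,  576, 0 },
		{  64, 1088, 1 },
	};
	unsigned int std = 576, compacted = 576;
	unsigned int i;

	for (i = 0; i < 2; i++) {
		unsigned int off = comp[i].align64 ? ALIGN64(compacted)
						   : compacted;

		compacted = off + comp[i].size;
		if (comp[i].offset + comp[i].size > std)
			std = comp[i].offset + comp[i].size;
	}
	printf("standard=%u compacted=%u\n", std, compacted);
	return 0;	/* standard=1152 compacted=896 */
}
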
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(
+ struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
+{
+ struct kvm_cpuid_entry2 *e;
+ int i;
+
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+ * if this rule is violated, this assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
+ for (i = 0; i < nent; i++) {
+ e = &entries[i];
+
+ if (e->function != function)
+ continue;
+
+ /*
+ * If the index isn't significant, use the first entry with a
+		 * matching function. It's userspace's responsibility not to
+		 * provide "duplicate" entries in all cases.
+ */
+ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
+ return e;
+
+ /*
+ * Similarly, use the first matching entry if KVM is doing a
+ * lookup (as opposed to emulating CPUID) for a function that's
+ * architecturally defined as not having a significant index.
+ */
+ if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
+ /*
+ * Direct lookups from KVM should not diverge from what
+ * KVM defines internally (the architectural behavior).
+ */
+ WARN_ON_ONCE(cpuid_function_is_indexed(function));
+ return e;
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_cpuid_entry2);
+
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+ u64 xfeatures;
+
+ /*
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
+
+ /*
+ * Exposing dynamic xfeatures to the guest requires additional
+ * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+ */
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0);
+ if (!best)
+ return 0;
+
+ xfeatures = best->eax | ((u64)best->edx << 32);
+ xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+ if (!xfeatures)
+ return 0;
+
+ return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
+}
+
+static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu);
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
+
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *orig;
+ int i;
+
+ /*
+ * Apply runtime CPUID updates to the incoming CPUID entries to avoid
+	 * false positives due to mismatches on KVM-owned feature flags.
+ *
+ * Note! @e2 and @nent track the _old_ CPUID entries!
+ */
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_apply_cpuid_pv_features_quirk(vcpu);
+
+ if (nent != vcpu->arch.cpuid_nent)
+ return -EINVAL;
+
+ for (i = 0; i < nent; i++) {
+ orig = &vcpu->arch.cpuid_entries[i];
+ if (e2[i].function != orig->function ||
+ e2[i].index != orig->index ||
+ e2[i].flags != orig->flags ||
+ e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+ e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
+{
+ struct kvm_hypervisor_cpuid cpuid = {};
+ struct kvm_cpuid_entry2 *entry;
+ u32 base;
+
+ for_each_possible_cpuid_base_hypervisor(base) {
+ entry = kvm_find_cpuid_entry(vcpu, base);
+
+ if (entry) {
+ u32 signature[3];
+
+ signature[0] = entry->ebx;
+ signature[1] = entry->ecx;
+ signature[2] = entry->edx;
+
+ if (!memcmp(signature, sig, sizeof(signature))) {
+ cpuid.base = base;
+ cpuid.limit = entry->eax;
+ break;
+ }
+ }
+ }
+
+ return cpuid;
+}
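
A hypervisor signature is twelve bytes packed into EBX, ECX, and EDX of the base leaf; KVM's own KVM_SIGNATURE is "KVMKVMKVM\0\0\0". A sketch of the comparison performed above, assuming a little-endian host as on x86:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* The three registers carry four signature bytes each, in EBX,
	 * ECX, EDX order; the values below spell "KVMKVMKVM\0\0\0". */
	unsigned int sig[3] = {
		0x4b4d564b,	/* EBX: "KVMK"    */
		0x564b4d56,	/* ECX: "VMKV"    */
		0x0000004d,	/* EDX: "M\0\0\0" */
	};

	puts(memcmp(sig, "KVMKVMKVM\0\0\0", 12) ? "no match" : "match");
	return 0;
}
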
+
+static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hypervisor_cpuid kvm_cpuid;
+ struct kvm_cpuid_entry2 *best;
+
+ kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ if (!kvm_cpuid.base)
+ return 0;
+
+ best = kvm_find_cpuid_entry(vcpu, kvm_cpuid.base | KVM_CPUID_FEATURES);
+ if (!best)
+ return 0;
+
+ if (kvm_hlt_in_guest(vcpu->kvm))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
+ return best->eax;
+}
+
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0);
+ if (!best)
+ return 0;
+
+ return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
+}
+
+static u64 cpuid_get_supported_xss(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xd, 1);
+ if (!best)
+ return 0;
+
+ return (best->ecx | ((u64)best->edx << 32)) & kvm_caps.supported_xss;
+}
+
+static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool has_feature)
+{
+ cpuid_entry_change(entry, x86_feature, has_feature);
+ guest_cpu_cap_change(vcpu, x86_feature, has_feature);
+}
+
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ vcpu->arch.cpuid_dynamic_bits_dirty = false;
+
+ best = kvm_find_cpuid_entry(vcpu, 1);
+ if (best) {
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
+
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_APIC,
+ vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
+
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_MWAIT,
+ vcpu->arch.ia32_misc_enable_msr &
+ MSR_IA32_MISC_ENABLE_MWAIT);
+ }
+
+ best = kvm_find_cpuid_entry_index(vcpu, 7, 0);
+ if (best)
+ kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSPKE,
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xD, 0);
+ if (best)
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
+
+ best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1);
+ if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
+ cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0 |
+ vcpu->arch.ia32_xss, true);
+}
+
+static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_HYPERV
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
+ return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+#else
+ return false;
+#endif
+}
+
+static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0);
+ if (!entry)
+ return false;
+
+ return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) ||
+ is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
+}
+
+/*
+ * This isn't truly "unsafe", but except for the cpu_caps initialization code,
+ * all register lookups should use __cpuid_entry_get_reg(), which provides
+ * compile-time validation of the input.
+ */
+static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return entry->eax;
+ case CPUID_EBX:
+ return entry->ebx;
+ case CPUID_ECX:
+ return entry->ecx;
+ case CPUID_EDX:
+ return entry->edx;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
+ bool include_partially_emulated);
+
+void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *best;
+ struct kvm_cpuid_entry2 *entry;
+ bool allow_gbpages;
+ int i;
+
+ memset(vcpu->arch.cpu_caps, 0, sizeof(vcpu->arch.cpu_caps));
+ BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS);
+
+ /*
+ * Reset guest capabilities to userspace's guest CPUID definition, i.e.
+ * honor userspace's definition for features that don't require KVM or
+ * hardware management/support (or that KVM simply doesn't care about).
+ */
+ for (i = 0; i < NR_KVM_CPU_CAPS; i++) {
+ const struct cpuid_reg cpuid = reverse_cpuid[i];
+ struct kvm_cpuid_entry2 emulated;
+
+ if (!cpuid.function)
+ continue;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ continue;
+
+ cpuid_func_emulated(&emulated, cpuid.function, true);
+
+ /*
+ * A vCPU has a feature if it's supported by KVM and is enabled
+ * in guest CPUID. Note, this includes features that are
+ * supported by KVM but aren't advertised to userspace!
+ */
+ vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] |
+ cpuid_get_reg_unsafe(&emulated, cpuid.reg);
+ vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, cpuid.reg);
+ }
+
+ kvm_update_cpuid_runtime(vcpu);
+
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+ * and can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES);
+ guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages);
+
+ best = kvm_find_cpuid_entry(vcpu, 1);
+ if (best && apic) {
+ if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+
+ kvm_apic_set_version(vcpu);
+ }
+
+ vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu);
+ vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu);
+
+ vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu);
+
+ vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+
+ kvm_pmu_refresh(vcpu);
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) |
+ __cr4_reserved_bits(guest_cpu_cap_has, vcpu);
+#undef __kvm_cpu_cap_has
+
+ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu));
+
+ /* Invoke the vendor callback only after the above state is updated. */
+ kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
+
+ /*
+	 * Update the MMU last, as it needs to factor in any vendor-specific
+	 * adjustments to the reserved GPA bits.
+ */
+ kvm_mmu_after_set_cpuid(vcpu);
+
+ kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
+}
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
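
Leaf 0x80000008 packs the address widths into EAX: bits 7:0 give the physical width, bits 15:8 the linear (virtual) width, and bits 23:16 the guest physical width consulted by cpuid_query_maxguestphyaddr() below. A userspace sketch, x86-only:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) ||
	    eax < 0x80000008) {
		puts("no leaf 0x80000008: fall back to 36 phys bits, as above");
		return 0;
	}
	__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
	printf("phys=%u virt=%u guest_phys=%u\n",
	       eax & 0xff, (eax >> 8) & 0xff, (eax >> 16) & 0xff);
	return 0;
}
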
+
+int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008);
+ if (best)
+ return (best->eax >> 16) & 0xff;
+not_found:
+ return 0;
+}
+
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits. The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
+
+static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ u32 vcpu_caps[NR_KVM_CPU_CAPS];
+ int r;
+
+ /*
+ * Swap the existing (old) entries with the incoming (new) entries in
+ * order to massage the new entries, e.g. to account for dynamic bits
+ * that KVM controls, without clobbering the current guest CPUID, which
+ * KVM needs to preserve in order to unwind on failure.
+ *
+ * Similarly, save the vCPU's current cpu_caps so that the capabilities
+ * can be updated alongside the CPUID entries when performing runtime
+ * updates. Full initialization is done if and only if the vCPU hasn't
+ * run, i.e. only if userspace is potentially changing CPUID features.
+ */
+ swap(vcpu->arch.cpuid_entries, e2);
+ swap(vcpu->arch.cpuid_nent, nent);
+
+ memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps));
+ BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps));
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+	 * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly. It would've been better to forbid any
+ * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+ * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+ * whether the supplied CPUID data is equal to what's already set.
+ */
+ if (kvm_vcpu_has_run(vcpu)) {
+ r = kvm_cpuid_check_equal(vcpu, e2, nent);
+ if (r)
+ goto err;
+ goto success;
+ }
+
+#ifdef CONFIG_KVM_HYPERV
+ if (kvm_cpuid_has_hyperv(vcpu)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ goto err;
+ }
+#endif
+
+ r = kvm_check_cpuid(vcpu);
+ if (r)
+ goto err;
+
+#ifdef CONFIG_KVM_XEN
+ vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
+#endif
+ kvm_vcpu_after_set_cpuid(vcpu);
+
+success:
+ kvfree(e2);
+ return 0;
+
+err:
+ memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps));
+ swap(vcpu->arch.cpuid_entries, e2);
+ swap(vcpu->arch.cpuid_nent, nent);
+ return r;
+}
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *e = NULL;
+ struct kvm_cpuid_entry2 *e2 = NULL;
+
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ if (cpuid->nent) {
+ e = vmemdup_array_user(entries, cpuid->nent, sizeof(*e));
+ if (IS_ERR(e))
+ return PTR_ERR(e);
+
+ e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
+ if (!e2) {
+ r = -ENOMEM;
+ goto out_free_cpuid;
+ }
+ }
+ for (i = 0; i < cpuid->nent; i++) {
+ e2[i].function = e[i].function;
+ e2[i].eax = e[i].eax;
+ e2[i].ebx = e[i].ebx;
+ e2[i].ecx = e[i].ecx;
+ e2[i].edx = e[i].edx;
+ e2[i].index = 0;
+ e2[i].flags = 0;
+ e2[i].padding[0] = 0;
+ e2[i].padding[1] = 0;
+ e2[i].padding[2] = 0;
+ }
+
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
+ kvfree(e2);
+
+out_free_cpuid:
+ kvfree(e);
+
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ struct kvm_cpuid_entry2 *e2 = NULL;
+ int r;
+
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ return -E2BIG;
+
+ if (cpuid->nent) {
+ e2 = vmemdup_array_user(entries, cpuid->nent, sizeof(*e2));
+ if (IS_ERR(e2))
+ return PTR_ERR(e2);
+ }
+
+ r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
+ if (r)
+ kvfree(e2);
+
+ return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ return -E2BIG;
+
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
+ if (copy_to_user(entries, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ return -EFAULT;
+
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return 0;
+}
+
+static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid)
+{
+ struct kvm_cpuid_entry2 entry;
+ u32 base;
+
+ /*
+ * KVM only supports features defined by Intel (0x0), AMD (0x80000000),
+ * and Centaur (0xc0000000). WARN if a feature for new vendor base is
+ * defined, as this and other code would need to be updated.
+ */
+ base = cpuid.function & 0xffff0000;
+ if (WARN_ON_ONCE(base && base != 0x80000000 && base != 0xc0000000))
+ return 0;
+
+ if (cpuid_eax(base) < cpuid.function)
+ return 0;
+
+ cpuid_count(cpuid.function, cpuid.index,
+ &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);
+
+ return *__cpuid_entry_get_reg(&entry, cpuid.reg);
+}
+
+/*
+ * For kernel-defined leafs, mask KVM's supported feature set with the kernel's
+ * capabilities as well as raw CPUID. For KVM-defined leafs, consult only raw
+ * CPUID, as KVM is the one and only authority (in the kernel).
+ */
+#define kvm_cpu_cap_init(leaf, feature_initializers...) \
+do { \
+ const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); \
+ const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf; \
+ const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability; \
+ u32 kvm_cpu_cap_passthrough = 0; \
+ u32 kvm_cpu_cap_synthesized = 0; \
+ u32 kvm_cpu_cap_emulated = 0; \
+ u32 kvm_cpu_cap_features = 0; \
+ \
+ feature_initializers \
+ \
+ kvm_cpu_caps[leaf] = kvm_cpu_cap_features; \
+ \
+ if (leaf < NCAPINTS) \
+ kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf]; \
+ \
+ kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough; \
+ kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) | \
+ kvm_cpu_cap_synthesized); \
+ kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated; \
+} while (0)
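
Stripped of its helper macros, kvm_cpu_cap_init() reduces to a fixed precedence among the feature classes. A simplified model of the mask algebra; this is a sketch, not the kernel macro, and the kernel-caps mask applies only to kernel-defined leaves (leaf < NCAPINTS):

#include <stdio.h>

/* Order of operations mirrored from kvm_cpu_cap_init() above; all
 * arguments are 32-bit feature masks for one CPUID register. */
static unsigned int cap_init(unsigned int features, unsigned int kernel,
			     unsigned int raw, unsigned int passthrough,
			     unsigned int synthesized, unsigned int emulated)
{
	unsigned int caps = features & kernel;	/* kernel must also support it */

	caps |= passthrough;			/* ...unless KVM forces it on  */
	caps &= raw | synthesized;		/* needs HW (or synth) backing */
	caps |= emulated;			/* emulated bits need no HW    */
	return caps;
}

int main(void)
{
	/* Bit 0: a plain F() supported everywhere; bit 1: an EMULATED_F()
	 * absent from both the kernel caps and raw CPUID. */
	printf("caps=0x%x\n", cap_init(0x3, 0x1, 0x1, 0x0, 0x0, 0x2));
	return 0;	/* prints caps=0x3 */
}
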
+
+/*
+ * Assert that the feature bit being declared, e.g. via F(), is in the CPUID
+ * word that's being initialized. Exempt 0x8000_0001.EDX usage of 0x1.EDX
+ * features, as AMD duplicated many 0x1.EDX features into 0x8000_0001.EDX.
+ */
+#define KVM_VALIDATE_CPU_CAP_USAGE(name) \
+do { \
+ u32 __leaf = __feature_leaf(X86_FEATURE_##name); \
+ \
+ BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress); \
+} while (0)
+
+#define F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ kvm_cpu_cap_features |= feature_bit(name); \
+})
+
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SCATTERED_F(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ if (boot_cpu_has(X86_FEATURE_##name)) \
+ F(name); \
+})
+
+/* Features that KVM supports only on 64-bit kernels. */
+#define X86_64_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+ if (IS_ENABLED(CONFIG_X86_64)) \
+ F(name); \
+})
+
+/*
+ * Emulated Feature - For features that KVM emulates in software irrespective
+ * of host CPU/kernel support.
+ */
+#define EMULATED_F(name) \
+({ \
+ kvm_cpu_cap_emulated |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Synthesized Feature - For features that are synthesized into boot_cpu_data,
+ * i.e. may not be present in the raw CPUID, but can still be advertised to
+ * userspace. Primarily used for mitigation related feature flags.
+ */
+#define SYNTHESIZED_F(name) \
+({ \
+ kvm_cpu_cap_synthesized |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Passthrough Feature - For features that KVM supports based purely on raw
+ * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't
+ * use the feature. Simply force set the feature in KVM's capabilities, raw
+ * CPUID support will be factored in by kvm_cpu_cap_init().
+ */
+#define PASSTHROUGH_F(name) \
+({ \
+ kvm_cpu_cap_passthrough |= feature_bit(name); \
+ F(name); \
+})
+
+/*
+ * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of
+ * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001.
+ */
+#define ALIASED_1_EDX_F(name) \
+({ \
+ BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX); \
+ BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX); \
+ kvm_cpu_cap_features |= feature_bit(name); \
+})
+
+/*
+ * Vendor Features - For features that KVM supports, but are added in later
+ * because they require additional vendor enabling.
+ */
+#define VENDOR_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+})
+
+/*
+ * Runtime Features - For features that KVM dynamically sets/clears at runtime,
+ * e.g. when CR4 changes, but which are never advertised to userspace.
+ */
+#define RUNTIME_F(name) \
+({ \
+ KVM_VALIDATE_CPU_CAP_USAGE(name); \
+})
+
+/*
+ * Undefine the MSR bit macro to avoid token concatenation issues when
+ * processing X86_FEATURE_SPEC_CTRL_SSBD.
+ */
+#undef SPEC_CTRL_SSBD
+
+/* DS is defined by ptrace-abi.h on 32-bit builds. */
+#undef DS
+
+void kvm_set_cpu_caps(void)
+{
+ memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
+
+ BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
+ sizeof(boot_cpu_data.x86_capability));
+
+ kvm_cpu_cap_init(CPUID_1_ECX,
+ F(XMM3),
+ F(PCLMULQDQ),
+ VENDOR_F(DTES64),
+ /*
+ * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
+ * advertised to guests via CPUID! MWAIT is also technically a
+ * runtime flag thanks to IA32_MISC_ENABLES; mark it as such so
+ * that KVM is aware that it's a known, unadvertised flag.
+ */
+ RUNTIME_F(MWAIT),
+ /* DS-CPL */
+ VENDOR_F(VMX),
+ /* SMX, EST */
+ /* TM2 */
+ F(SSSE3),
+ /* CNXT-ID */
+ /* Reserved */
+ F(FMA),
+ F(CX16),
+ /* xTPR Update */
+ F(PDCM),
+ F(PCID),
+ /* Reserved, DCA */
+ F(XMM4_1),
+ F(XMM4_2),
+ EMULATED_F(X2APIC),
+ F(MOVBE),
+ F(POPCNT),
+ EMULATED_F(TSC_DEADLINE_TIMER),
+ F(AES),
+ F(XSAVE),
+ RUNTIME_F(OSXSAVE),
+ F(AVX),
+ F(F16C),
+ F(RDRAND),
+ EMULATED_F(HYPERVISOR),
+ );
+
+ kvm_cpu_cap_init(CPUID_1_EDX,
+ F(FPU),
+ F(VME),
+ F(DE),
+ F(PSE),
+ F(TSC),
+ F(MSR),
+ F(PAE),
+ F(MCE),
+ F(CX8),
+ F(APIC),
+ /* Reserved */
+ F(SEP),
+ F(MTRR),
+ F(PGE),
+ F(MCA),
+ F(CMOV),
+ F(PAT),
+ F(PSE36),
+ /* PSN */
+ F(CLFLUSH),
+ /* Reserved */
+ VENDOR_F(DS),
+ /* ACPI */
+ F(MMX),
+ F(FXSR),
+ F(XMM),
+ F(XMM2),
+ F(SELFSNOOP),
+ /* HTT, TM, Reserved, PBE */
+ );
+
+ kvm_cpu_cap_init(CPUID_7_0_EBX,
+ F(FSGSBASE),
+ EMULATED_F(TSC_ADJUST),
+ F(SGX),
+ F(BMI1),
+ F(HLE),
+ F(AVX2),
+ F(FDP_EXCPTN_ONLY),
+ F(SMEP),
+ F(BMI2),
+ F(ERMS),
+ F(INVPCID),
+ F(RTM),
+ F(ZERO_FCS_FDS),
+ VENDOR_F(MPX),
+ F(AVX512F),
+ F(AVX512DQ),
+ F(RDSEED),
+ F(ADX),
+ F(SMAP),
+ F(AVX512IFMA),
+ F(CLFLUSHOPT),
+ F(CLWB),
+ VENDOR_F(INTEL_PT),
+ F(AVX512PF),
+ F(AVX512ER),
+ F(AVX512CD),
+ F(SHA_NI),
+ F(AVX512BW),
+ F(AVX512VL),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_ECX,
+ F(AVX512VBMI),
+ PASSTHROUGH_F(LA57),
+ F(PKU),
+ RUNTIME_F(OSPKE),
+ F(RDPID),
+ F(AVX512_VPOPCNTDQ),
+ F(UMIP),
+ F(AVX512_VBMI2),
+ F(GFNI),
+ F(VAES),
+ F(VPCLMULQDQ),
+ F(AVX512_VNNI),
+ F(AVX512_BITALG),
+ F(CLDEMOTE),
+ F(MOVDIRI),
+ F(MOVDIR64B),
+ VENDOR_F(WAITPKG),
+ F(SGX_LC),
+ F(BUS_LOCK_DETECT),
+ X86_64_F(SHSTK),
+ );
+
+ /*
+	 * PKU is not yet implemented for shadow paging and requires OSPKE
+	 * to be set on the host. Clear it if that is not the case.
+ */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ kvm_cpu_cap_clear(X86_FEATURE_PKU);
+
+ /*
+ * Shadow Stacks aren't implemented in the Shadow MMU. Shadow Stack
+ * accesses require "magic" Writable=0,Dirty=1 protection, which KVM
+ * doesn't know how to emulate or map.
+ */
+ if (!tdp_enabled)
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+
+ kvm_cpu_cap_init(CPUID_7_EDX,
+ F(AVX512_4VNNIW),
+ F(AVX512_4FMAPS),
+ F(SPEC_CTRL),
+ F(SPEC_CTRL_SSBD),
+ EMULATED_F(ARCH_CAPABILITIES),
+ F(INTEL_STIBP),
+ F(MD_CLEAR),
+ F(AVX512_VP2INTERSECT),
+ F(FSRM),
+ F(SERIALIZE),
+ F(TSXLDTRK),
+ F(AVX512_FP16),
+ F(AMX_TILE),
+ F(AMX_INT8),
+ F(AMX_BF16),
+ F(FLUSH_L1D),
+ F(IBT),
+ );
+
+ /*
+ * Disable support for IBT and SHSTK if KVM is configured to emulate
+ * accesses to reserved GPAs, as KVM's emulator doesn't support IBT or
+ * SHSTK, nor does KVM handle Shadow Stack #PFs (see above).
+ */
+ if (allow_smaller_maxphyaddr) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
+
+ if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) &&
+ boot_cpu_has(X86_FEATURE_AMD_IBPB) &&
+ boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
+
+ kvm_cpu_cap_init(CPUID_7_1_EAX,
+ F(SHA512),
+ F(SM3),
+ F(SM4),
+ F(AVX_VNNI),
+ F(AVX512_BF16),
+ F(CMPCCXADD),
+ F(FZRM),
+ F(FSRS),
+ F(FSRC),
+ F(WRMSRNS),
+ X86_64_F(LKGS),
+ F(AMX_FP16),
+ F(AVX_IFMA),
+ F(LAM),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_1_ECX,
+ SCATTERED_F(MSR_IMM),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8),
+ F(AVX_NE_CONVERT),
+ F(AMX_COMPLEX),
+ F(AVX_VNNI_INT16),
+ F(PREFETCHITI),
+ F(AVX10),
+ );
+
+ kvm_cpu_cap_init(CPUID_7_2_EDX,
+ F(INTEL_PSFD),
+ F(IPRED_CTRL),
+ F(RRSBA_CTRL),
+ F(DDPD_U),
+ F(BHI_CTRL),
+ F(MCDT_NO),
+ );
+
+ kvm_cpu_cap_init(CPUID_D_1_EAX,
+ F(XSAVEOPT),
+ F(XSAVEC),
+ F(XGETBV1),
+ F(XSAVES),
+ X86_64_F(XFD),
+ );
+
+ kvm_cpu_cap_init(CPUID_12_EAX,
+ SCATTERED_F(SGX1),
+ SCATTERED_F(SGX2),
+ SCATTERED_F(SGX_EDECCSSA),
+ );
+
+ kvm_cpu_cap_init(CPUID_24_0_EBX,
+ F(AVX10_128),
+ F(AVX10_256),
+ F(AVX10_512),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0001_ECX,
+ F(LAHF_LM),
+ F(CMP_LEGACY),
+ VENDOR_F(SVM),
+ /* ExtApicSpace */
+ F(CR8_LEGACY),
+ F(ABM),
+ F(SSE4A),
+ F(MISALIGNSSE),
+ F(3DNOWPREFETCH),
+ F(OSVW),
+ /* IBS */
+ F(XOP),
+ /* SKINIT, WDT, LWP */
+ F(FMA4),
+ F(TBM),
+ F(TOPOEXT),
+ VENDOR_F(PERFCTR_CORE),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0001_EDX,
+ ALIASED_1_EDX_F(FPU),
+ ALIASED_1_EDX_F(VME),
+ ALIASED_1_EDX_F(DE),
+ ALIASED_1_EDX_F(PSE),
+ ALIASED_1_EDX_F(TSC),
+ ALIASED_1_EDX_F(MSR),
+ ALIASED_1_EDX_F(PAE),
+ ALIASED_1_EDX_F(MCE),
+ ALIASED_1_EDX_F(CX8),
+ ALIASED_1_EDX_F(APIC),
+ /* Reserved */
+ F(SYSCALL),
+ ALIASED_1_EDX_F(MTRR),
+ ALIASED_1_EDX_F(PGE),
+ ALIASED_1_EDX_F(MCA),
+ ALIASED_1_EDX_F(CMOV),
+ ALIASED_1_EDX_F(PAT),
+ ALIASED_1_EDX_F(PSE36),
+ /* Reserved */
+ F(NX),
+ /* Reserved */
+ F(MMXEXT),
+ ALIASED_1_EDX_F(MMX),
+ ALIASED_1_EDX_F(FXSR),
+ F(FXSR_OPT),
+ X86_64_F(GBPAGES),
+ F(RDTSCP),
+ /* Reserved */
+ X86_64_F(LM),
+ F(3DNOWEXT),
+ F(3DNOW),
+ );
+
+ if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
+ kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+
+ kvm_cpu_cap_init(CPUID_8000_0007_EDX,
+ SCATTERED_F(CONSTANT_TSC),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0008_EBX,
+ F(CLZERO),
+ F(XSAVEERPTR),
+ F(WBNOINVD),
+ F(AMD_IBPB),
+ F(AMD_IBRS),
+ F(AMD_SSBD),
+ F(VIRT_SSBD),
+ F(AMD_SSB_NO),
+ F(AMD_STIBP),
+ F(AMD_STIBP_ALWAYS_ON),
+ F(AMD_IBRS_SAME_MODE),
+ PASSTHROUGH_F(EFER_LMSLE_MBZ),
+ F(AMD_PSFD),
+ F(AMD_IBPB_RET),
+ );
+
+ /*
+ * AMD has separate bits for each SPEC_CTRL bit.
+ * arch/x86/kernel/cpu/bugs.c is kind enough to
+ * record that in cpufeatures, so use them.
+ */
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB_RET);
+ }
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
+ /*
+ * The preference is to use the SPEC_CTRL MSR instead of the
+ * VIRT_SPEC_CTRL MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* All SVM features require additional vendor module enabling. */
+ kvm_cpu_cap_init(CPUID_8000_000A_EDX,
+ VENDOR_F(NPT),
+ VENDOR_F(VMCBCLEAN),
+ VENDOR_F(FLUSHBYASID),
+ VENDOR_F(NRIPS),
+ VENDOR_F(TSCRATEMSR),
+ VENDOR_F(V_VMSAVE_VMLOAD),
+ VENDOR_F(LBRV),
+ VENDOR_F(PAUSEFILTER),
+ VENDOR_F(PFTHRESHOLD),
+ VENDOR_F(VGIF),
+ VENDOR_F(VNMI),
+ VENDOR_F(SVME_ADDR_CHK),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_001F_EAX,
+ VENDOR_F(SME),
+ VENDOR_F(SEV),
+ /* VM_PAGE_FLUSH */
+ VENDOR_F(SEV_ES),
+ F(SME_COHERENT),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0021_EAX,
+ F(NO_NESTED_DATA_BP),
+ F(WRMSR_XX_BASE_NS),
+ /*
+ * Synthesize "LFENCE is serializing" into the AMD-defined entry
+ * in KVM's supported CPUID, i.e. if the feature is reported as
+ * supported by the kernel. LFENCE_RDTSC was a Linux-defined
+ * synthetic feature long before AMD joined the bandwagon, e.g.
+ * LFENCE is serializing on most CPUs that support SSE2. On
+ * CPUs that don't support AMD's leaf, ANDing with the raw host
+ * CPUID will drop the flags, and reporting support in AMD's
+ * leaf can make it easier for userspace to detect the feature.
+ */
+ SYNTHESIZED_F(LFENCE_RDTSC),
+ /* SmmPgCfgLock */
+ /* 4: Resv */
+ SYNTHESIZED_F(VERW_CLEAR),
+ F(NULL_SEL_CLR_BASE),
+ /* UpperAddressIgnore */
+ F(AUTOIBRS),
+ F(PREFETCHI),
+ EMULATED_F(NO_SMM_CTL_MSR),
+ /* PrefetchCtlMsr */
+ /* GpOnUserCpuid */
+ /* EPSF */
+ SYNTHESIZED_F(SBPB),
+ SYNTHESIZED_F(IBPB_BRTYPE),
+ SYNTHESIZED_F(SRSO_NO),
+ F(SRSO_USER_KERNEL_NO),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0021_ECX,
+ SYNTHESIZED_F(TSA_SQ_NO),
+ SYNTHESIZED_F(TSA_L1_NO),
+ );
+
+ kvm_cpu_cap_init(CPUID_8000_0022_EAX,
+ F(PERFMON_V2),
+ );
+
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
+
+ kvm_cpu_cap_init(CPUID_C000_0001_EDX,
+ F(XSTORE),
+ F(XSTORE_EN),
+ F(XCRYPT),
+ F(XCRYPT_EN),
+ F(ACE2),
+ F(ACE2_EN),
+ F(PHE),
+ F(PHE_EN),
+ F(PMM),
+ F(PMM_EN),
+ );
+
+ /*
+ * Hide RDTSCP and RDPID if either feature is reported as supported but
+ * probing MSR_TSC_AUX failed. This is purely a sanity check and
+ * should never happen, but the guest will likely crash if RDTSCP or
+ * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
+ * the past. For example, the sanity check may fire if this instance of
+ * KVM is running as L1 on top of an older, broken KVM.
+ */
+ if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
+ kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
+ !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps);
+
+#undef F
+#undef SCATTERED_F
+#undef X86_64_F
+#undef EMULATED_F
+#undef SYNTHESIZED_F
+#undef PASSTHROUGH_F
+#undef ALIASED_1_EDX_F
+#undef VENDOR_F
+#undef RUNTIME_F
+
+struct kvm_cpuid_array {
+ struct kvm_cpuid_entry2 *entries;
+ int maxnent;
+ int nent;
+};
+
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
+static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
+
+ if (!entry)
+ return NULL;
+
+ memset(entry, 0, sizeof(*entry));
+ entry->function = function;
+ entry->index = index;
+ switch (function & 0xC0000000) {
+ case 0x40000000:
+ /* Hypervisor leaves are always synthesized by __do_cpuid_func. */
+ return entry;
+
+ case 0x80000000:
+ /*
+ * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+ * would result in out-of-bounds calls to do_host_cpuid.
+ */
+ {
+ static int max_cpuid_80000000;
+ if (!READ_ONCE(max_cpuid_80000000))
+ WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+ if (function > READ_ONCE(max_cpuid_80000000))
+ return entry;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+
+ if (cpuid_function_is_indexed(function))
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+
+ return entry;
+}
+
+static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func,
+ bool include_partially_emulated)
+{
+ memset(entry, 0, sizeof(*entry));
+
+ entry->function = func;
+ entry->index = 0;
+ entry->flags = 0;
+
+ switch (func) {
+ case 0:
+ entry->eax = 7;
+ return 1;
+ case 1:
+ entry->ecx = feature_bit(MOVBE);
+ /*
+ * KVM allows userspace to enumerate MONITOR+MWAIT support to
+ * the guest, but the MWAIT feature flag is never advertised
+ * to userspace because MONITOR+MWAIT aren't virtualized by
+ * hardware, can't be faithfully emulated in software (KVM
+ * emulates them as NOPs), and allowing the guest to execute
+ * them natively requires enabling a per-VM capability.
+ */
+ if (include_partially_emulated)
+ entry->ecx |= feature_bit(MWAIT);
+ return 1;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ entry->eax = 0;
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ entry->ecx = feature_bit(RDPID);
+ return 1;
+ default:
+ return 0;
+ }
+}
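+
+/*
+ * Illustrative sketch (not part of the patch): the effect of
+ * include_partially_emulated can be seen for leaf 0x1:
+ *
+ *	struct kvm_cpuid_entry2 e;
+ *
+ *	cpuid_func_emulated(&e, 1, false);	// e.ecx = MOVBE
+ *	cpuid_func_emulated(&e, 1, true);	// e.ecx = MOVBE | MWAIT
+ *
+ * i.e. MWAIT is enumerated only when the caller explicitly opts in to
+ * partially emulated features.
+ */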
+
+static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
+{
+ if (array->nent >= array->maxnent)
+ return -E2BIG;
+
+ array->nent += cpuid_func_emulated(&array->entries[array->nent], func, false);
+ return 0;
+}
+
+static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
+{
+ struct kvm_cpuid_entry2 *entry;
+ int r, i, max_idx;
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+ r = -E2BIG;
+
+ entry = do_host_cpuid(array, function, 0);
+ if (!entry)
+ goto out;
+
+ switch (function) {
+ case 0:
+ /* Limited to the highest leaf implemented in KVM. */
+ entry->eax = min(entry->eax, 0x24U);
+ break;
+ case 1:
+ cpuid_entry_override(entry, CPUID_1_EDX);
+ cpuid_entry_override(entry, CPUID_1_ECX);
+ break;
+ case 2:
+ /*
+ * On ancient CPUs, function 2 entries are STATEFUL. That is,
+ * CPUID(function=2, index=0) may return different results each
+ * time, with the least-significant byte in EAX enumerating the
+ * number of times software should do CPUID(2, 0).
+ *
+ * Modern CPUs, i.e. every CPU KVM has *ever* run on, are less
+ * idiotic. Intel's SDM states that EAX & 0xff "will always
+ * return 01H. Software should ignore this value and not
+ * interpret it as an informational descriptor", while AMD's
+ * APM states that CPUID(2) is reserved.
+ *
+ * WARN if a Frankenstein CPU that supports virtualization and
+ * a stateful CPUID.0x2 is encountered.
+ */
+ WARN_ON_ONCE((entry->eax & 0xff) > 1);
+ break;
+ /* Functions 4 and 0x8000001d have an additional index. */
+ case 4:
+ case 0x8000001d:
+ /*
+ * Read entries until the cache type in the previous entry is
+ * zero, i.e. indicates an invalid entry.
+ */
+ for (i = 1; entry->eax & 0x1f; ++i) {
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
+ goto out;
+ }
+ break;
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ /* Function 7 has an additional index. */
+ case 7:
+ max_idx = entry->eax = min(entry->eax, 2u);
+ cpuid_entry_override(entry, CPUID_7_0_EBX);
+ cpuid_entry_override(entry, CPUID_7_ECX);
+ cpuid_entry_override(entry, CPUID_7_EDX);
+
+ /* KVM only supports up to 0x7.2, capped above via min(). */
+ if (max_idx >= 1) {
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_1_EAX);
+ cpuid_entry_override(entry, CPUID_7_1_ECX);
+ cpuid_entry_override(entry, CPUID_7_1_EDX);
+ entry->ebx = 0;
+ }
+ if (max_idx >= 2) {
+ entry = do_host_cpuid(array, function, 2);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_7_2_EDX);
+ entry->ecx = 0;
+ entry->ebx = 0;
+ entry->eax = 0;
+ }
+ break;
+ case 0xa: { /* Architectural Performance Monitoring */
+ union cpuid10_eax eax = { };
+ union cpuid10_edx edx = { };
+
+ if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ eax.split.version_id = kvm_pmu_cap.version;
+ eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
+ eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
+ eax.split.mask_length = kvm_pmu_cap.events_mask_len;
+ edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
+ edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
+
+ if (kvm_pmu_cap.version)
+ edx.split.anythread_deprecated = 1;
+
+ entry->eax = eax.full;
+ entry->ebx = kvm_pmu_cap.events_mask;
+ entry->ecx = 0;
+ entry->edx = edx.full;
+ break;
+ }
+ case 0x1f:
+ case 0xb:
+ /*
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
+ */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0xd: {
+ u64 permitted_xcr0 = kvm_get_filtered_xcr0();
+ u64 permitted_xss = kvm_caps.supported_xss;
+
+ entry->eax &= permitted_xcr0;
+ entry->ebx = xstate_required_size(permitted_xcr0, false);
+ entry->ecx = entry->ebx;
+ entry->edx &= permitted_xcr0 >> 32;
+ if (!permitted_xcr0)
+ break;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ cpuid_entry_override(entry, CPUID_D_1_EAX);
+ if (entry->eax & (feature_bit(XSAVES) | feature_bit(XSAVEC)))
+ entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
+ true);
+ else {
+ WARN_ON_ONCE(permitted_xss != 0);
+ entry->ebx = 0;
+ }
+ entry->ecx &= permitted_xss;
+ entry->edx &= permitted_xss >> 32;
+
+ for (i = 2; i < 64; ++i) {
+ bool s_state;
+ if (permitted_xcr0 & BIT_ULL(i))
+ s_state = false;
+ else if (permitted_xss & BIT_ULL(i))
+ s_state = true;
+ else
+ continue;
+
+ entry = do_host_cpuid(array, function, i);
+ if (!entry)
+ goto out;
+
+ /*
+ * The supported check above should have filtered out
+ * invalid sub-leaves. Only valid sub-leaves should
+ * reach this point, and they should have a non-zero
+ * save state size. Furthermore, check whether the
+ * processor agrees with permitted_xcr0/permitted_xss
+ * on whether this is an XCR0- or IA32_XSS-managed area.
+ */
+ if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
+ --array->nent;
+ continue;
+ }
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ entry->ecx &= ~BIT_ULL(2);
+ entry->edx = 0;
+ }
+ break;
+ }
+ case 0x12:
+ /* Intel SGX */
+ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * Index 0: Sub-features, MISCSELECT (a.k.a. extended features)
+ * and max enclave sizes. The SGX sub-features and MISCSELECT
+ * are restricted by kernel and KVM capabilities (like most
+ * feature flags), while enclave size is unrestricted.
+ */
+ cpuid_entry_override(entry, CPUID_12_EAX);
+ entry->ebx &= SGX_MISC_EXINFO;
+
+ entry = do_host_cpuid(array, function, 1);
+ if (!entry)
+ goto out;
+
+ /*
+ * Index 1: SECS.ATTRIBUTES. ATTRIBUTES are restricted a la
+ * feature flags. Advertise all supported flags, including
+ * privileged attributes that require explicit opt-in from
+ * userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
+ * expected to derive it from supported XCR0.
+ */
+ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
+ entry->ebx &= 0;
+ break;
+ /* Intel PT */
+ case 0x14:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ /* Intel AMX TILE */
+ case 0x1d:
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ case 0x1e: /* TMUL information */
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ break;
+ case 0x24: {
+ u8 avx10_version;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * The AVX10 version is encoded in EBX[7:0] and is guaranteed to be
+ * >=1 if AVX10 is supported; KVM caps the advertised version at 1.
+ * Note, the version needs to be captured before overriding the EBX
+ * features!
+ */
+ avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+ cpuid_entry_override(entry, CPUID_24_0_EBX);
+ entry->ebx |= avx10_version;
+
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
+ case KVM_CPUID_SIGNATURE: {
+ const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
+ entry->eax = KVM_CPUID_FEATURES;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT) |
+ (1 << KVM_FEATURE_PV_TLB_FLUSH) |
+ (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+ (1 << KVM_FEATURE_PV_SEND_IPI) |
+ (1 << KVM_FEATURE_POLL_CONTROL) |
+ (1 << KVM_FEATURE_PV_SCHED_YIELD) |
+ (1 << KVM_FEATURE_ASYNC_PF_INT);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x80000022);
+ /*
+ * Serializing LFENCE is reported in a multitude of ways, and
+ * NullSegClearsBase is not reported in CPUID on Zen2; help
+ * userspace by providing the CPUID leaf ourselves.
+ *
+ * However, only do it if the host has CPUID leaf 0x8000001d.
+ * QEMU thinks that it can query the host blindly for that
+ * CPUID leaf if KVM reports that it supports 0x8000001d or
+ * above. The processor merrily returns values from the
+ * highest Intel leaf which QEMU tries to use as the guest's
+ * 0x8000001d. Even worse, this can result in an infinite
+ * loop if said highest leaf has no subleaves indexed by ECX.
+ */
+ if (entry->eax >= 0x8000001d &&
+ (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
+ entry->eax = max(entry->eax, 0x80000021);
+ break;
+ case 0x80000001:
+ entry->ebx &= ~GENMASK(27, 16);
+ cpuid_entry_override(entry, CPUID_8000_0001_EDX);
+ cpuid_entry_override(entry, CPUID_8000_0001_ECX);
+ break;
+ case 0x80000005:
+ /* Pass host L1 cache and TLB info. */
+ break;
+ case 0x80000006:
+ /* Drop reserved bits, pass host L2 cache and TLB info. */
+ entry->edx &= ~GENMASK(17, 16);
+ break;
+ case 0x80000007: /* Advanced power management */
+ cpuid_entry_override(entry, CPUID_8000_0007_EDX);
+
+ /* mask against host */
+ entry->edx &= boot_cpu_data.x86_power;
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0x80000008: {
+ /*
+ * GuestPhysAddrSize (EAX[23:16]) is intended for software
+ * use.
+ *
+ * KVM's ABI is to report the effective MAXPHYADDR for the
+ * guest in PhysAddrSize (phys_as), and the maximum
+ * *addressable* GPA in GuestPhysAddrSize (g_phys_as).
+ *
+ * GuestPhysAddrSize is valid if and only if TDP is enabled,
+ * in which case the max GPA that can be addressed by KVM may
+ * be less than the max GPA that can be legally generated by
+ * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
+ * support 5-level TDP.
+ */
+ unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned int phys_as, g_phys_as;
+
+ /*
+ * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
+ * the guest operates in the same PA space as the host, i.e.
+ * reductions in MAXPHYADDR for memory encryption affect shadow
+ * paging, too.
+ *
+ * If TDP is enabled, use the raw bare metal MAXPHYADDR as
+ * reductions to the HPAs do not affect GPAs. The max
+ * addressable GPA is the same as the max effective GPA, except
+ * that it's capped at 48 bits if 5-level TDP isn't supported
+ * (hardware processes bits 51:48 only when walking the fifth
+ * level page table).
+ */
+ if (!tdp_enabled) {
+ phys_as = boot_cpu_data.x86_phys_bits;
+ g_phys_as = 0;
+ } else {
+ phys_as = entry->eax & 0xff;
+ g_phys_as = phys_as;
+ if (kvm_mmu_get_max_tdp_level() < 5)
+ g_phys_as = min(g_phys_as, 48U);
+ }
+
+ entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
+ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
+ entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0008_EBX);
+ break;
+ }
+ case 0x8000000A:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ entry->eax = 1; /* SVM revision 1 */
+ entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
+ ASID emulation to nested SVM. */
+ entry->ecx = 0; /* Reserved */
+ cpuid_entry_override(entry, CPUID_8000_000A_EDX);
+ break;
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ entry->eax &= GENMASK(2, 0);
+ entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
+ break;
+ case 0x8000001F:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ } else {
+ cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+ /* Clear NumVMPL since KVM does not support VMPL. */
+ entry->ebx &= ~GENMASK(31, 12);
+ /*
+ * Enumerate '0' for "PA bits reduction", the adjusted
+ * MAXPHYADDR is enumerated directly (see 0x80000008).
+ */
+ entry->ebx &= ~GENMASK(11, 6);
+ }
+ break;
+ case 0x80000020:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ case 0x80000021:
+ entry->ebx = entry->edx = 0;
+ cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+ cpuid_entry_override(entry, CPUID_8000_0021_ECX);
+ break;
+ /* AMD Extended Performance Monitoring and Debug */
+ case 0x80000022: {
+ union cpuid_0x80000022_ebx ebx = { };
+
+ entry->ecx = entry->edx = 0;
+ if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
+ entry->eax = entry->ebx = 0;
+ break;
+ }
+
+ cpuid_entry_override(entry, CPUID_8000_0022_EAX);
+
+ ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
+ entry->ebx = ebx.full;
+ break;
+ }
+ /* Add support for Centaur's CPUID instruction. */
+ case 0xC0000000:
+ /* Just support up to 0xC0000004 for now. */
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ cpuid_entry_override(entry, CPUID_C000_0001_EDX);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ r = 0;
+
+out:
+ put_cpu();
+
+ return r;
+}
+
+static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+ unsigned int type)
+{
+ if (type == KVM_GET_EMULATED_CPUID)
+ return __do_cpuid_func_emulated(array, func);
+
+ return __do_cpuid_func(array, func);
+}
+
+#define CENTAUR_CPUID_SIGNATURE 0xC0000000
+
+static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
+ unsigned int type)
+{
+ u32 limit;
+ int r;
+
+ if (func == CENTAUR_CPUID_SIGNATURE &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return 0;
+
+ r = do_cpuid_func(array, func, type);
+ if (r)
+ return r;
+
+ limit = array->entries[array->nent - 1].eax;
+ for (func = func + 1; func <= limit; ++func) {
+ r = do_cpuid_func(array, func, type);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+ __u32 num_entries, unsigned int ioctl_type)
+{
+ int i;
+ __u32 pad[3];
+
+ if (ioctl_type != KVM_GET_EMULATED_CPUID)
+ return false;
+
+ /*
+ * We want to make sure that ->padding is being passed clean from
+ * userspace in case we want to use it for something in the future.
+ *
+ * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID, so we
+ * have to content ourselves with enforcing it only on the emulated
+ * side. /me sheds a tear.
+ */
+ for (i = 0; i < num_entries; i++) {
+ if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+ return true;
+
+ if (pad[0] || pad[1] || pad[2])
+ return true;
+ }
+ return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type)
+{
+ static const u32 funcs[] = {
+ 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
+ };
+
+ struct kvm_cpuid_array array = {
+ .nent = 0,
+ };
+ int r, i;
+
+ if (cpuid->nent < 1)
+ return -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+ if (sanity_check_entries(entries, cpuid->nent, type))
+ return -EINVAL;
+
+ array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
+ if (!array.entries)
+ return -ENOMEM;
+
+ array.maxnent = cpuid->nent;
+
+ for (i = 0; i < ARRAY_SIZE(funcs); i++) {
+ r = get_cpuid_func(&array, funcs[i], type);
+ if (r)
+ goto out_free;
+ }
+ cpuid->nent = array.nent;
+
+ if (copy_to_user(entries, array.entries,
+ array.nent * sizeof(struct kvm_cpuid_entry2)))
+ r = -EFAULT;
+
+out_free:
+ kvfree(array.entries);
+ return r;
+}
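+
+/*
+ * Illustrative userspace sketch (hypothetical buffer sizes, not part of
+ * the patch): KVM_GET_SUPPORTED_CPUID reaches this function through the
+ * /dev/kvm ioctl path, and userspace is expected to retry with a larger
+ * buffer when it gets -E2BIG:
+ *
+ *	int nent = 64;
+ *	struct kvm_cpuid2 *cpuid;
+ *
+ *	for (;;) {
+ *		cpuid = calloc(1, sizeof(*cpuid) +
+ *				  nent * sizeof(struct kvm_cpuid_entry2));
+ *		cpuid->nent = nent;
+ *		if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
+ *			break;		// cpuid->nent now holds the real count
+ *		free(cpuid);
+ *		if (errno != E2BIG)
+ *			return -1;	// hypothetical error handling
+ *		nent *= 2;
+ *	}
+ */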
+
+/*
+ * Intel CPUID semantics treats any query for an out-of-range leaf as if the
+ * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics
+ * returns all zeroes for any undefined leaf, whether or not the leaf is in
+ * range. Centaur/VIA follows Intel semantics.
+ *
+ * A leaf is considered out-of-range if its function is higher than the maximum
+ * supported leaf of its associated class or if its associated class does not
+ * exist.
+ *
+ * There are four primary classes to be considered, with their respective
+ * ranges described as "<base> - <top>[,<base2> - <top2>]", inclusive. A primary
+ * class exists if a guest CPUID entry for its <base> leaf exists. For a given
+ * class, CPUID.<base>.EAX contains the max supported leaf for the class.
+ *
+ * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
+ * - Hypervisor: 0x40000000 - 0x4fffffff
+ * - Extended: 0x80000000 - 0xbfffffff
+ * - Centaur: 0xc0000000 - 0xcfffffff
+ *
+ * The Hypervisor class is further subdivided into sub-classes that each act as
+ * their own independent class associated with a range of 0x100 functions. E.g.
+ * if QEMU is advertising support for both Hyper-V and KVM, the resulting
+ * Hypervisor CPUID sub-classes are:
+ *
+ * - Hyper-V: 0x40000000 - 0x400000ff
+ * - KVM: 0x40000100 - 0x400001ff
+ */
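+
+/*
+ * Worked example (hypothetical guest CPUID): if the guest's max basic leaf
+ * is 0xd and userspace defined no Extended class entries at all, then for
+ * an Intel-like guest a query of 0x80000008 is out-of-range and is serviced
+ * as CPUID.0xd (with the originally requested index), whereas an AMD-like
+ * guest is simply handed all zeroes by the caller.
+ */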
+static struct kvm_cpuid_entry2 *
+get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
+{
+ struct kvm_cpuid_entry2 *basic, *class;
+ u32 function = *fn_ptr;
+
+ basic = kvm_find_cpuid_entry(vcpu, 0);
+ if (!basic)
+ return NULL;
+
+ if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
+ is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
+ return NULL;
+
+ if (function >= 0x40000000 && function <= 0x4fffffff)
+ class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
+ else if (function >= 0xc0000000)
+ class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
+ else
+ class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
+
+ if (class && function <= class->eax)
+ return NULL;
+
+ /*
+ * Leaf specific adjustments are also applied when redirecting to the
+ * max basic entry, e.g. if the max basic leaf is 0xb but there is no
+ * entry for CPUID.0xb.index (see below), then the output value for EDX
+ * needs to be pulled from CPUID.0xb.1.
+ */
+ *fn_ptr = basic->eax;
+
+ /*
+ * The class does not exist or the requested function is out of range;
+ * the effective CPUID entry is the max basic leaf. Note, the index of
+ * the original requested leaf is observed!
+ */
+ return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
+}
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only)
+{
+ u32 orig_function = *eax, function = *eax, index = *ecx;
+ struct kvm_cpuid_entry2 *entry;
+ bool exact, used_max_basic = false;
+
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, index);
+ exact = !!entry;
+
+ if (!entry && !exact_only) {
+ entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
+ used_max_basic = !!entry;
+ }
+
+ if (entry) {
+ *eax = entry->eax;
+ *ebx = entry->ebx;
+ *ecx = entry->ecx;
+ *edx = entry->edx;
+ if (function == 7 && index == 0) {
+ u64 data;
+ if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) &&
+ !kvm_msr_read(vcpu, MSR_IA32_TSX_CTRL, &data) &&
+ (data & TSX_CTRL_CPUID_CLEAR))
+ *ebx &= ~(feature_bit(RTM) | feature_bit(HLE));
+ } else if (function == 0x80000007) {
+ if (kvm_hv_invtsc_suppressed(vcpu))
+ *edx &= ~feature_bit(CONSTANT_TSC);
+ } else if (IS_ENABLED(CONFIG_KVM_XEN) &&
+ kvm_xen_is_tsc_leaf(vcpu, function)) {
+ /*
+ * Update guest TSC frequency information if necessary.
+ * Ignore failures, there is no sane value that can be
+ * provided if KVM can't get the TSC frequency.
+ */
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu))
+ kvm_guest_time_update(vcpu);
+
+ if (index == 1) {
+ *ecx = vcpu->arch.pvclock_tsc_mul;
+ *edx = vcpu->arch.pvclock_tsc_shift;
+ } else if (index == 2) {
+ *eax = vcpu->arch.hw_tsc_khz;
+ }
+ }
+ } else {
+ *eax = *ebx = *ecx = *edx = 0;
+ /*
+ * When leaf 0BH or 1FH is defined, CL is pass-through
+ * and EDX is always the x2APIC ID, even for undefined
+ * subleaves. Index 1 will exist iff the leaf is
+ * implemented, so we pass through CL iff leaf 1
+ * exists. EDX can be copied from any existing index.
+ */
+ if (function == 0xb || function == 0x1f) {
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ *ecx = index & 0xff;
+ *edx = entry->edx;
+ }
+ }
+ }
+ trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
+ used_max_basic);
+ return exact;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpuid);
+
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+ return 1;
+
+ eax = kvm_rax_read(vcpu);
+ ecx = kvm_rcx_read(vcpu);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+ kvm_rax_write(vcpu, eax);
+ kvm_rbx_write(vcpu, ebx);
+ kvm_rcx_write(vcpu, ecx);
+ kvm_rdx_write(vcpu, edx);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..d3f5ae15a7ca
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "reverse_cpuid.h"
+#include <asm/cpu.h>
+#include <asm/processor.h>
+#include <uapi/asm/kvm_para.h>
+
+extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
+void kvm_set_cpu_caps(void);
+
+void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
+ int nent, u32 function, u64 index);
+/*
+ * Magic value used by KVM when querying userspace-provided CPUID entries for
+ * which KVM doesn't care about the CPUID index, i.e. for functions whose
+ * index is not significant. Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by
+ * kvm_find_cpuid_entry2() to avoid false positives when processing guest
+ * CPUID input: guest-provided indices are 32-bit values, so a value with
+ * bits[63:32] set can never collide with a real index.
+ *
+ * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of
+ * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry().
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
+
+static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, index);
+}
+
+static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function)
+{
+ return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+ function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
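+
+/*
+ * Usage sketch (illustrative): the _index variant is needed only for leaves
+ * where ECX selects a subleaf, e.g.
+ *
+ *	best = kvm_find_cpuid_entry(vcpu, 0x1);		  // index ignored
+ *	sub  = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);  // CPUID.0x7.1
+ */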
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only);
+
+void __init kvm_init_xstate_sizes(void);
+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
+
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t alignment)
+{
+ return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
+}
+
+static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry,
+ unsigned int leaf)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, leaf * 32);
+
+ BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps));
+ *reg = kvm_cpu_caps[leaf];
+}
+
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+ struct kvm_cpuid_entry2 *entry;
+ u32 *reg;
+
+ /*
+ * XSAVES is a special snowflake. Due to lack of a dedicated intercept
+ * on SVM, KVM must assume that XSAVES (and thus XRSTORS) is usable by
+ * the guest if the host supports XSAVES and *XSAVE* is exposed to the
+ * guest. Because the guest can execute XSAVES and XRSTORS, i.e. can
+ * indirectly consume XSS, KVM must ensure XSS is zeroed when running
+ * the guest, i.e. must set XSAVES in vCPU capabilities. But to reject
+ * direct XSS reads and writes (to minimize the virtualization hole and
+ * honor userspace's CPUID), KVM needs to check the raw guest CPUID,
+ * not KVM's view of guest capabilities.
+ *
+ * For all other features, guest capabilities are accurate. Expand
+ * this allowlist with extreme vigilance.
+ */
+ BUILD_BUG_ON(x86_feature != X86_FEATURE_XSAVES);
+
+ entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ return false;
+
+ reg = __cpuid_entry_get_reg(entry, cpuid.reg);
+ if (!reg)
+ return false;
+
+ return *reg & __feature_bit(x86_feature);
+}
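+
+/*
+ * Illustrative call-site sketch (assumed handler shape, not taken from this
+ * patch): the raw-CPUID check above is what allows an XSS access to be
+ * rejected even though KVM's own capability view forces XSAVES on:
+ *
+ *	case MSR_IA32_XSS:
+ *		if (!msr_info->host_initiated &&
+ *		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ *			return 1;	// reject: XSAVES not in guest CPUID
+ *		...
+ */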
+
+static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.is_amd_compatible;
+}
+
+static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu)
+{
+ return !guest_cpuid_is_amd_compatible(vcpu);
+}
+
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu)
+{
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
+static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
+}
+
+static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature);
+}
+
+static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature)
+{
+ return !!kvm_cpu_cap_get(x86_feature);
+}
+
+static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
+{
+ if (boot_cpu_has(x86_feature))
+ kvm_cpu_cap_set(x86_feature);
+}
+
+static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
+ unsigned int kvm_feature)
+{
+ if (!vcpu->arch.pv_cpuid.enforce)
+ return true;
+
+ return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
+}
+
+static __always_inline void guest_cpu_cap_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ vcpu->arch.cpu_caps[x86_leaf] |= __feature_bit(x86_feature);
+}
+
+static __always_inline void guest_cpu_cap_clear(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ vcpu->arch.cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void guest_cpu_cap_change(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature,
+ bool guest_has_cap)
+{
+ if (guest_has_cap)
+ guest_cpu_cap_set(vcpu, x86_feature);
+ else
+ guest_cpu_cap_clear(vcpu, x86_feature);
+}
+
+static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ /*
+ * Except for MWAIT, querying dynamic feature bits is disallowed, so
+ * that KVM can defer runtime updates until the next CPUID emulation.
+ */
+ BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC ||
+ x86_feature == X86_FEATURE_OSXSAVE ||
+ x86_feature == X86_FEATURE_OSPKE);
+
+ return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature);
+}
+
+static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LAM))
+ cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, cr3);
+}
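+
+/*
+ * Worked example (hypothetical value): with LAM enumerated to the guest, a
+ * CR3 value with bit 62 (X86_CR3_LAM_U48) set is legal because the LAM bits
+ * are stripped before the reserved-GPA check; the very same value is
+ * illegal for a guest without LAM.
+ */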
+
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB));
+}
+
+#endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000000..999227fc7c66
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+#include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
+
+static int vcpu_get_timer_advance_ns(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
+
+static int vcpu_get_guest_mode(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->stat.guest_mode;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n");
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_caps.tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
+{
+ debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu,
+ &vcpu_guest_mode_fops);
+ debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
+ &vcpu_tsc_offset_fops);
+
+ if (lapic_in_kernel(vcpu))
+ debugfs_create_file("lapic_timer_advance_ns", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_timer_advance_ns_fops);
+
+ if (kvm_caps.has_tsc_control) {
+ debugfs_create_file("tsc-scaling-ratio", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_fops);
+ debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ debugfs_dentry, vcpu,
+ &vcpu_tsc_scaling_frac_fops);
+ }
+}
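+
+/*
+ * Illustrative layout (path shape assumed from the common KVM debugfs code,
+ * not defined in this file): the files above land under the per-vCPU
+ * debugfs directory, e.g.
+ *
+ *	/sys/kernel/debug/kvm/<pid>-<vm_fd>/vcpu0/tsc-offset
+ *	/sys/kernel/debug/kvm/<pid>-<vm_fd>/vcpu0/guest_mode
+ */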
+
+/*
+ * This covers statistics for counts < 1024 (11 = log2(1024) + 1), which
+ * should be enough to cover RMAP_RECYCLE_THRESHOLD.
+ */
+#define RMAP_LOG_SIZE 11
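+
+/*
+ * Bucket layout (derived from the log2-style indexing below): bucket 0
+ * counts empty rmap lists, bucket 1 counts lists with exactly one entry,
+ * and bucket i (i >= 2) counts lists with 2^(i-1) .. 2^i - 1 entries,
+ * matching the "2-3", "4-7", ... column headers printed by
+ * kvm_mmu_rmaps_stat_show().
+ */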
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+ struct kvm_rmap_head *rmap;
+ struct kvm *kvm = m->private;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned int lpage_size, index;
+ /* Still small enough to be on the stack */
+ unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+ int i, j, k, l, ret;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
+ ret = -ENOMEM;
+ memset(log, 0, sizeof(log));
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+ if (!log[i])
+ goto out;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ int bkt;
+
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots)
+ for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+ rmap = slot->arch.rmap[k];
+ lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+ cur = log[k];
+ for (l = 0; l < lpage_size; l++) {
+ index = fls(pte_list_count(&rmap[l]));
+ if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+ index = RMAP_LOG_SIZE - 1;
+ cur[index]++;
+ }
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ /* index=0 counts no rmap; index=1 counts 1 rmap */
+ seq_printf(m, "Rmap_Count:\t0\t1\t");
+ for (i = 2; i < RMAP_LOG_SIZE; i++) {
+ j = 1 << (i - 1);
+ k = (1 << i) - 1;
+ seq_printf(m, "%d-%d\t", j, k);
+ }
+ seq_printf(m, "\n");
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+ cur = log[i];
+ for (j = 0; j < RMAP_LOG_SIZE; j++)
+ seq_printf(m, "%d\t", cur[j]);
+ seq_printf(m, "\n");
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+ kfree(log[i]);
+
+ return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+ int r;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ r = single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+ if (r < 0)
+ kvm_put_kvm(kvm);
+
+ return r;
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ kvm_put_kvm(kvm);
+
+ return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+ .owner = THIS_MODULE,
+ .open = kvm_mmu_rmaps_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_mmu_rmaps_stat_release,
+};
+
+void kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+ &mmu_rmaps_stat_fops);
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..c8e292e9a24d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
* emulate.c
*
@@ -9,31 +10,66 @@
* privileged instructions:
*
* Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
-#include <linux/module.h>
-#include <asm/kvm_emulate.h>
+#include "kvm_emulate.h"
+#include <linux/stringify.h>
+#include <asm/debugreg.h>
+#include <asm/nospec-branch.h>
+#include <asm/ibt.h>
+#include <asm/text-patching.h>
#include "x86.h"
#include "tss.h"
+#include "mmu.h"
+#include "pmu.h"
+
+/*
+ * Operand types
+ */
+#define OpNone 0ull
+#define OpImplicit 1ull /* No generic decode */
+#define OpReg 2ull /* Register */
+#define OpMem 3ull /* Memory */
+#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI 5ull /* ES:DI/EDI/RDI */
+#define OpMem64 6ull /* Memory, 64-bit */
+#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
+#define OpDX 8ull /* DX register */
+#define OpCL 9ull /* CL register (for shifts) */
+#define OpImmByte 10ull /* 8-bit sign extended immediate */
+#define OpOne 11ull /* Implied 1 */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
+#define OpMem16 13ull /* Memory operand (16-bit). */
+#define OpMem32 14ull /* Memory operand (32-bit). */
+#define OpImmU 15ull /* Immediate operand, zero extended */
+#define OpSI 16ull /* SI/ESI/RSI */
+#define OpImmFAddr 17ull /* Immediate far address */
+#define OpMemFAddr 18ull /* Far address in memory */
+#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
+#define OpES 20ull /* ES */
+#define OpCS 21ull /* CS */
+#define OpSS 22ull /* SS */
+#define OpDS 23ull /* DS */
+#define OpFS 24ull /* FS */
+#define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
+#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AL/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
+
+#define OpBits 5 /* Width of operand field */
+#define OpMask ((1ull << OpBits) - 1)
/*
* Opcode effective-address decode tables.
@@ -45,576 +81,443 @@
*/
/* Operand sizes: 8-bit operands or specified/overridden size. */
-#define ByteOp (1<<0) /* 8-bit operands. */
-/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstAcc (4<<1) /* Destination Accumulator */
-#define DstDI (5<<1) /* Destination is in ES:(E)DI */
-#define DstMem64 (6<<1) /* 64bit memory operand */
-#define DstMask (7<<1)
-/* Source operand type. */
-#define SrcNone (0<<4) /* No source operand. */
-#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
-#define SrcReg (1<<4) /* Register operand. */
-#define SrcMem (2<<4) /* Memory operand. */
-#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
-#define SrcImm (5<<4) /* Immediate operand. */
-#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
-#define SrcOne (7<<4) /* Implied '1' */
-#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
-#define SrcImmU (9<<4) /* Immediate operand, unsigned */
-#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
-#define SrcMask (0xf<<4)
-/* Generic ModRM decode. */
-#define ModRM (1<<8)
-/* Destination is only written; never read. */
-#define Mov (1<<9)
-#define BitOp (1<<10)
-#define MemAbs (1<<11) /* Memory operand is absolute displacement */
-#define String (1<<12) /* String instruction (rep capable) */
-#define Stack (1<<13) /* Stack instruction (push/pop) */
-#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
-#define GroupMask 0xff /* Group number stored in bits 0:7 */
-/* Misc flags */
+#define ByteOp (1<<0) /* 8-bit operands. */
+#define DstShift 1 /* Destination operand type at bits 1-5 */
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg (OpReg << DstShift)
+#define DstMem (OpMem << DstShift)
+#define DstAcc (OpAcc << DstShift)
+#define DstDI (OpDI << DstShift)
+#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
+#define DstMask (OpMask << DstShift)
+#define SrcShift 6 /* Source operand type at bits 6-10 */
+#define SrcNone (OpNone << SrcShift)
+#define SrcReg (OpReg << SrcShift)
+#define SrcMem (OpMem << SrcShift)
+#define SrcMem16 (OpMem16 << SrcShift)
+#define SrcMem32 (OpMem32 << SrcShift)
+#define SrcImm (OpImm << SrcShift)
+#define SrcImmByte (OpImmByte << SrcShift)
+#define SrcOne (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU (OpImmU << SrcShift)
+#define SrcSI (OpSI << SrcShift)
+#define SrcXLat (OpXLat << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc (OpAcc << SrcShift)
+#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
+#define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
+#define SrcMask (OpMask << SrcShift)
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
+#define Sse (1<<18) /* SSE Vector instruction */
+#define ModRM (1<<19) /* Generic ModRM decode. */
+#define Mov (1<<20) /* Destination is only written; never read. */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
-#define No64 (1<<28)
-/* Source 2 operand type */
-#define Src2None (0<<29)
-#define Src2CL (1<<29)
-#define Src2ImmByte (2<<29)
-#define Src2One (3<<29)
-#define Src2Imm16 (4<<29)
-#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
- in memory and second argument is located
- immediately after the first one in memory. */
-#define Src2Mask (7<<29)
-
-enum {
- Group1_80, Group1_81, Group1_82, Group1_83,
- Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
- Group8, Group9,
+#define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */
+#define PageTable (1 << 29) /* instruction used to write page table */
+#define NotImpl (1 << 30) /* instruction is not implemented */
+#define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */
+#define Src2Shift (32) /* Source 2 operand type at bits 32-36 */
+#define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
+#define Src2CL (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One (OpOne << Src2Shift)
+#define Src2Imm (OpImm << Src2Shift)
+#define Src2ES (OpES << Src2Shift)
+#define Src2CS (OpCS << Src2Shift)
+#define Src2SS (OpSS << Src2Shift)
+#define Src2DS (OpDS << Src2Shift)
+#define Src2FS (OpFS << Src2Shift)
+#define Src2GS (OpGS << Src2Shift)
+#define Src2Mask (OpMask << Src2Shift)
+/* free: 37-39 */
+#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */
+#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+/* free: 43-44 */
+#define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operands */
+#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */
+#define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
+
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+struct opcode {
+ u64 flags;
+ u8 intercept;
+ u8 pad[7];
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ const struct opcode *group;
+ const struct group_dual *gdual;
+ const struct gprefix *gprefix;
+ const struct escape *esc;
+ const struct instr_dual *idual;
+ const struct mode_dual *mdual;
+ } u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
};
-static u32 opcode_table[256] = {
- /* 0x00 - 0x07 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x08 - 0x0F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, 0,
- /* 0x10 - 0x17 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x18 - 0x1F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- /* 0x20 - 0x27 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
- /* 0x28 - 0x2F */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x30 - 0x37 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
- /* 0x38 - 0x3F */
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
- 0, 0,
- /* 0x40 - 0x47 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x48 - 0x4F */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x50 - 0x57 */
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
- /* 0x58 - 0x5F */
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
- /* 0x60 - 0x67 */
- ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
- 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
- 0, 0, 0, 0,
- /* 0x68 - 0x6F */
- SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
- DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
- SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
- /* 0x70 - 0x77 */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x78 - 0x7F */
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
- /* 0x80 - 0x87 */
- Group | Group1_80, Group | Group1_81,
- Group | Group1_82, Group | Group1_83,
- ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- /* 0x88 - 0x8F */
- ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
- ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
- DstReg | SrcMem | ModRM | Mov, Group | Group1A,
- /* 0x90 - 0x97 */
- DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
- /* 0x98 - 0x9F */
- 0, 0, SrcImm | Src2Imm16 | No64, 0,
- ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
- /* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
- ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
- ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
- ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
- /* 0xA8 - 0xAF */
- 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
- ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
- ByteOp | DstDI | String, DstDI | String,
- /* 0xB0 - 0xB7 */
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
- /* 0xB8 - 0xBF */
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
- /* 0xC0 - 0xC7 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
- 0, ImplicitOps | Stack, 0, 0,
- ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
- /* 0xC8 - 0xCF */
- 0, 0, 0, ImplicitOps | Stack,
- ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
- /* 0xD0 - 0xD7 */
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
- 0, 0, 0, 0,
- /* 0xD8 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xE7 */
- 0, 0, 0, 0,
- ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
- ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
- /* 0xE8 - 0xEF */
- SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
- SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
- SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
- /* 0xF0 - 0xF7 */
- 0, 0, 0, 0,
- ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
- /* 0xF8 - 0xFF */
- ImplicitOps, 0, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
};
-static u32 twobyte_table[256] = {
- /* 0x00 - 0x0F */
- 0, Group | GroupDual | Group7, 0, 0,
- 0, ImplicitOps, ImplicitOps | Priv, 0,
- ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
- 0, ImplicitOps | ModRM, 0, 0,
- /* 0x10 - 0x1F */
- 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
- /* 0x20 - 0x2F */
- ModRM | ImplicitOps | Priv, ModRM | Priv,
- ModRM | ImplicitOps | Priv, ModRM | Priv,
- 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x30 - 0x3F */
- ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
- ImplicitOps, ImplicitOps | Priv, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x48 - 0x4F */
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- /* 0x50 - 0x5F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x60 - 0x6F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x70 - 0x7F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0x80 - 0x8F */
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
- /* 0x90 - 0x9F */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xA0 - 0xA7 */
- ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM, 0, 0,
- /* 0xA8 - 0xAF */
- ImplicitOps | Stack, ImplicitOps | Stack,
- 0, DstMem | SrcReg | ModRM | BitOp | Lock,
- DstMem | SrcReg | Src2ImmByte | ModRM,
- DstMem | SrcReg | Src2CL | ModRM,
- ModRM, 0,
- /* 0xB0 - 0xB7 */
- ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
- 0, DstMem | SrcReg | ModRM | BitOp | Lock,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xB8 - 0xBF */
- 0, 0,
- Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
- 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
- DstReg | SrcMem16 | ModRM | Mov,
- /* 0xC0 - 0xCF */
- 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
- 0, 0, 0, Group | GroupDual | Group9,
- 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xD0 - 0xDF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xE0 - 0xEF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 0xF0 - 0xFF */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
};
-static u32 group_table[] = {
- [Group1_80*8] =
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM | Lock,
- ByteOp | DstMem | SrcImm | ModRM,
- [Group1_81*8] =
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM | Lock,
- DstMem | SrcImm | ModRM,
- [Group1_82*8] =
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
- ByteOp | DstMem | SrcImm | ModRM | No64,
- [Group1_83*8] =
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM,
- [Group1A*8] =
- DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
- [Group3_Byte*8] =
- ByteOp | SrcImm | DstMem | ModRM, 0,
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group3*8] =
- DstMem | SrcImm | ModRM, 0,
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- 0, 0, 0, 0,
- [Group4*8] =
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
- 0, 0, 0, 0, 0, 0,
- [Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
- SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
- SrcMem | ModRM | Stack, 0,
- [Group7*8] =
- 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
- [Group8*8] =
- 0, 0, 0, 0,
- DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
- DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
- [Group9*8] =
- 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
};
-static u32 group2_table[] = {
- [Group7*8] =
- SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
- SrcNone | ModRM | DstMem | Mov, 0,
- SrcMem16 | ModRM | Mov | Priv, 0,
- [Group9*8] =
- 0, 0, 0, 0, 0, 0, 0, 0,
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
};
-/* EFLAGS bit definitions. */
-#define EFLG_ID (1<<21)
-#define EFLG_VIP (1<<20)
-#define EFLG_VIF (1<<19)
-#define EFLG_AC (1<<18)
-#define EFLG_VM (1<<17)
-#define EFLG_RF (1<<16)
-#define EFLG_IOPL (3<<12)
-#define EFLG_NT (1<<14)
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_IF (1<<9)
-#define EFLG_TF (1<<8)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
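+
+/*
+ * Illustrative sketch (not from this patch): each dispatch structure
+ * above refines decoding by one more field of the instruction.  For
+ * example, a gprefix entry is selected by the mandatory 66/F2/F3
+ * prefix; a hypothetical entry implementing only the F3 form could be
+ * built with the N ("not implemented") and I() (flags plus handler)
+ * helpers this file's opcode tables use:
+ *
+ *	static const struct gprefix pfx_example = {
+ *		N, N, N, I(DstReg | SrcMem | ModRM, em_example),
+ *	};
+ *
+ * where em_example stands in for a real handler.
+ */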
-/*
- * Instruction emulation:
- * Most instructions are emulated directly via a fragment of inline assembly
- * code. This allows us to save/restore EFLAGS and thus very easily pick up
- * any modified flags.
- */
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#if defined(CONFIG_X86_64)
-#define _LO32 "k" /* force 32-bit operand */
-#define _STK "%%rsp" /* stack pointer */
-#elif defined(__i386__)
-#define _LO32 "" /* force 32-bit operand */
-#define _STK "%%esp" /* stack pointer */
-#endif
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
+enum rex_bits {
+ REX_B = 1,
+ REX_X = 2,
+ REX_R = 4,
+ REX_W = 8,
+};
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dirty = ctxt->regs_dirty;
+ unsigned reg;
+
+ for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS)
+ ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs_dirty = 0;
+ ctxt->regs_valid = 0;
+}
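+
+/*
+ * Usage sketch (assumed, matching the reg_read()/reg_rmw() helpers used
+ * below): fetching a register for read-modify-write marks it dirty, so
+ * only registers the emulator actually modified are flushed back:
+ *
+ *	*reg_rmw(ctxt, VCPU_REGS_RAX) += 1;	<- sets RAX's regs_dirty bit
+ *	writeback_registers(ctxt);		<- write_gpr() for RAX only
+ */
+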
/*
* These EFLAGS bits are restored from saved value during emulation, and
* any changes are written back to the saved value after emulation.
*/
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
-
-/* Before executing instruction: restore necessary bits in EFLAGS. */
-#define _PRE_EFLAGS(_sav, _msk, _tmp) \
- /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
- "movl %"_sav",%"_LO32 _tmp"; " \
- "push %"_tmp"; " \
- "push %"_tmp"; " \
- "movl %"_msk",%"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "pushf; " \
- "notl %"_LO32 _tmp"; " \
- "andl %"_LO32 _tmp",("_STK"); " \
- "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
- "pop %"_tmp"; " \
- "orl %"_LO32 _tmp",("_STK"); " \
- "popf; " \
- "pop %"_sav"; "
-
-/* After executing instruction: write-back necessary bits in EFLAGS. */
-#define _POST_EFLAGS(_sav, _msk, _tmp) \
- /* _sav |= EFLAGS & _msk; */ \
- "pushf; " \
- "pop %"_tmp"; " \
- "andl %"_msk",%"_LO32 _tmp"; " \
- "orl %"_LO32 _tmp",%"_sav"; "
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+ X86_EFLAGS_PF|X86_EFLAGS_CF)
#ifdef CONFIG_X86_64
-#define ON64(x) x
+#define ON64(x...) x
#else
-#define ON64(x)
+#define ON64(x...)
#endif
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op _suffix " %"_x"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _y ((_src).val), "i" (EFLAGS_MASK)); \
- } while (0)
-
-
-/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 2: \
- ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
- break; \
- case 4: \
- ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
- break; \
- case 8: \
- ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
- break; \
- } \
- } while (0)
-
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- switch ((_dst).bytes) { \
- case 1: \
- ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
- break; \
- default: \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- _wx, _wy, _lx, _ly, _qx, _qy); \
- break; \
- } \
- } while (0)
-
-/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "c", "b", "c", "b", "c", "b", "c")
-
-/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "q", "w", "r", _LO32, "r", "", "r")
-
-/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- "w", "r", _LO32, "r", "", "r")
-
-/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
- do { \
- unsigned long _tmp; \
- _type _clv = (_cl).val; \
- _type _srcv = (_src).val; \
- _type _dstv = (_dst).val; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "5", "2") \
- _op _suffix " %4,%1 \n" \
- _POST_EFLAGS("0", "5", "2") \
- : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
- : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
- ); \
- \
- (_cl).val = (unsigned long) _clv; \
- (_src).val = (unsigned long) _srcv; \
- (_dst).val = (unsigned long) _dstv; \
- } while (0)
-
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
- do { \
- switch ((_dst).bytes) { \
- case 2: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "w", unsigned short); \
- break; \
- case 4: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "l", unsigned int); \
- break; \
- case 8: \
- ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "q", unsigned long)); \
- break; \
- } \
- } while (0)
-
-#define __emulate_1op(_op, _dst, _eflags, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op _suffix " %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "+m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- } while (0)
-
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- do { \
- switch ((_dst).bytes) { \
- case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
- case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
- case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
- case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
- } \
- } while (0)
+#define EM_ASM_START(op) \
+static int em_##op(struct x86_emulate_ctxt *ctxt) \
+{ \
+ unsigned long flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; \
+ int bytes = 1, ok = 1; \
+ if (!(ctxt->d & ByteOp)) \
+ bytes = ctxt->dst.bytes; \
+ switch (bytes) {
+
+#define __EM_ASM(str) \
+ asm("push %[flags]; popf \n\t" \
+ "10: " str \
+ "pushf; pop %[flags] \n\t" \
+ "11: \n\t" \
+ : "+a" (ctxt->dst.val), \
+ "+d" (ctxt->src.val), \
+ [flags] "+D" (flags), \
+ "+S" (ok) \
+ : "c" (ctxt->src2.val))
+
+#define __EM_ASM_1(op, dst) \
+ __EM_ASM(#op " %%" #dst " \n\t")
+
+#define __EM_ASM_1_EX(op, dst) \
+ __EM_ASM(#op " %%" #dst " \n\t" \
+ _ASM_EXTABLE_TYPE_REG(10b, 11f, EX_TYPE_ZERO_REG, %%esi))
+
+#define __EM_ASM_2(op, dst, src) \
+ __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
+
+#define __EM_ASM_3(op, dst, src, src2) \
+ __EM_ASM(#op " %%" #src2 ", %%" #src ", %%" #dst " \n\t")
+
+#define EM_ASM_END \
+ } \
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); \
+ return !ok ? emulate_de(ctxt) : X86EMUL_CONTINUE; \
+}
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
-({ unsigned long _x; \
- rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
- if (rc != X86EMUL_CONTINUE) \
- goto done; \
- (_eip) += (_size); \
- (_type)_x; \
+/* 1-operand, using "a" (dst) */
+#define EM_ASM_1(op) \
+ EM_ASM_START(op) \
+ case 1: __EM_ASM_1(op##b, al); break; \
+ case 2: __EM_ASM_1(op##w, ax); break; \
+ case 4: __EM_ASM_1(op##l, eax); break; \
+ ON64(case 8: __EM_ASM_1(op##q, rax); break;) \
+ EM_ASM_END
+
+/* 1-operand, using "c" (src2) */
+#define EM_ASM_1SRC2(op, name) \
+ EM_ASM_START(name) \
+ case 1: __EM_ASM_1(op##b, cl); break; \
+ case 2: __EM_ASM_1(op##w, cx); break; \
+ case 4: __EM_ASM_1(op##l, ecx); break; \
+ ON64(case 8: __EM_ASM_1(op##q, rcx); break;) \
+ EM_ASM_END
+
+/* 1-operand, using "c" (src2) with exception */
+#define EM_ASM_1SRC2EX(op, name) \
+ EM_ASM_START(name) \
+ case 1: __EM_ASM_1_EX(op##b, cl); break; \
+ case 2: __EM_ASM_1_EX(op##w, cx); break; \
+ case 4: __EM_ASM_1_EX(op##l, ecx); break; \
+ ON64(case 8: __EM_ASM_1_EX(op##q, rcx); break;) \
+ EM_ASM_END
+
+/* 2-operand, using "a" (dst), "d" (src) */
+#define EM_ASM_2(op) \
+ EM_ASM_START(op) \
+ case 1: __EM_ASM_2(op##b, al, dl); break; \
+ case 2: __EM_ASM_2(op##w, ax, dx); break; \
+ case 4: __EM_ASM_2(op##l, eax, edx); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \
+ EM_ASM_END
+
+/* 2-operand, reversed */
+#define EM_ASM_2R(op, name) \
+ EM_ASM_START(name) \
+ case 1: __EM_ASM_2(op##b, dl, al); break; \
+ case 2: __EM_ASM_2(op##w, dx, ax); break; \
+ case 4: __EM_ASM_2(op##l, edx, eax); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rdx, rax); break;) \
+ EM_ASM_END
+
+/* 2-operand, word only (no byte op) */
+#define EM_ASM_2W(op) \
+ EM_ASM_START(op) \
+ case 1: break; \
+ case 2: __EM_ASM_2(op##w, ax, dx); break; \
+ case 4: __EM_ASM_2(op##l, eax, edx); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rax, rdx); break;) \
+ EM_ASM_END
+
+/* 2-operand, using "a" (dst) and CL (src2) */
+#define EM_ASM_2CL(op) \
+ EM_ASM_START(op) \
+ case 1: __EM_ASM_2(op##b, al, cl); break; \
+ case 2: __EM_ASM_2(op##w, ax, cl); break; \
+ case 4: __EM_ASM_2(op##l, eax, cl); break; \
+ ON64(case 8: __EM_ASM_2(op##q, rax, cl); break;) \
+ EM_ASM_END
+
+/* 3-operand, using "a" (dst), "d" (src) and CL (src2) */
+#define EM_ASM_3WCL(op) \
+ EM_ASM_START(op) \
+ case 1: break; \
+ case 2: __EM_ASM_3(op##w, ax, dx, cl); break; \
+ case 4: __EM_ASM_3(op##l, eax, edx, cl); break; \
+ ON64(case 8: __EM_ASM_3(op##q, rax, rdx, cl); break;) \
+ EM_ASM_END
+
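+/*
+ * A sketch of what the wrappers above generate (abridged, for
+ * reference): EM_ASM_2(add), instantiated later in this file, expands
+ * to roughly
+ *
+ *	static int em_add(struct x86_emulate_ctxt *ctxt)
+ *	{
+ *		unsigned long flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ *		...
+ *		switch (bytes) {
+ *		case 1: asm("push %[flags]; popf; addb %%dl, %%al; ..."); break;
+ *		...
+ *		}
+ *		ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ *		return X86EMUL_CONTINUE;
+ *	}
+ *
+ * i.e. the guest's arithmetic flags are loaded around a single host
+ * instruction operating on the cached operand values; X86_EFLAGS_IF is
+ * forced on so the popf never disables host interrupts.
+ */
+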
+static int em_salc(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Set AL to 0xFF if CF is set, or to 0x00 when it is clear.
+ */
+ ctxt->dst.val = 0xFF * !!(ctxt->eflags & X86_EFLAGS_CF);
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * XXX: the user of inoutclob must know where the asm argument will be expanded.
+ * Using asm goto would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \
+ : [_fault] "+r"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
})
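+
+/*
+ * For example (an assumed sketch of how the full file uses this):
+ *
+ *	rc = asm_safe("fwait");
+ *	rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+ *
+ * The leading ':' makes the operand an input; a leading ',' would
+ * append it to the outputs instead.  A faulting instruction resumes at
+ * label 2 with _fault set by the exception table, so the caller sees
+ * X86EMUL_UNHANDLEABLE instead of an unhandled host fault.
+ */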
-static inline unsigned long ad_mask(struct decode_cache *c)
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
{
- return (1UL << (c->ad_bytes << 3)) - 1;
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .src_type = ctxt->src.type,
+ .dst_type = ctxt->dst.type,
+ .ad_bytes = ctxt->ad_bytes,
+ .rip = ctxt->eip,
+ .next_rip = ctxt->_eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
+
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (bytes) {
+ case 1:
+ *(u8 *)reg = (u8)val;
+ break;
+ case 2:
+ *(u16 *)reg = (u16)val;
+ break;
+ case 4:
+ *reg = (u32)val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *reg = val;
+ break;
+ }
+}
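+
+/*
+ * Worked example: in 64-bit mode "mov $-1, %eax" must clear RAX's upper
+ * half, so assign_register(reg, 0xffffffffULL, 4) stores the full value
+ * 0x00000000ffffffff instead of preserving bits 63:32, while the 1- and
+ * 2-byte cases leave the upper bits untouched.
+ */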
+
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
+{
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
+}
+
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
}
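+
+/*
+ * Example: in 64-bit mode stack_mask() is ~0UL and stack_size() is 8;
+ * with a 32-bit SS (ss.d == 1) the mask is 0xffffffff and stack ops use
+ * ESP; with a 16-bit SS the mask is 0xffff and they use SP.
+ */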
/* Access/update address held in a register, based on addressing mode. */
static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
{
- if (c->ad_bytes == sizeof(unsigned long))
+ if (ctxt->ad_bytes == sizeof(unsigned long))
return reg;
else
- return reg & ad_mask(c);
+ return reg & ad_mask(ctxt);
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
{
- return base + address_mask(c, reg);
+ return address_mask(ctxt, reg_read(ctxt, reg));
+}
+
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+ assign_masked(reg, *reg + inc, mask);
}
static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
{
- if (c->ad_bytes == sizeof(unsigned long))
- *reg += inc;
- else
- *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+ ulong *preg = reg_rmw(ctxt, reg);
+
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
}
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
{
- register_address_increment(c, &c->eip, rel);
+ masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
}
-static void set_seg_override(struct decode_cache *c, int seg)
+static u32 desc_limit_scaled(struct desc_struct *desc)
{
- c->has_seg_override = true;
- c->seg_override = seg;
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
}
static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
@@ -622,85 +525,418 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
}
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
- struct decode_cache *c)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
{
- if (!c->has_seg_override)
- return 0;
+ if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt))
+ return X86EMUL_UNHANDLEABLE;
- return seg_base(ctxt, c->seg_override);
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
{
- return seg_base(ctxt, VCPU_SREG_ES);
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
}
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
- return seg_base(ctxt, VCPU_SREG_SS);
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long eip, u8 *dest)
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
{
- struct fetch_cache *fc = &ctxt->decode.fetch;
- int rc;
- int size, cur_size;
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
- if (eip == fc->end) {
- cur_size = fc->end - fc->start;
- size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
- rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, NULL);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- fc->end += size;
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
+}
+
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt,
+ unsigned int flags)
+{
+ return !ctxt->ops->is_canonical_addr(ctxt, la, flags);
+}
+
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B, which is not a vector instruction but is
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too, as
+ * their 512 bytes of data must be aligned to a 16-byte boundary.
+ */
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+ u64 alignment = ctxt->d & AlignMask;
+
+ if (likely(size < 16))
+ return 1;
+
+ switch (alignment) {
+ case Unaligned:
+ return 1;
+ case Aligned16:
+ return 16;
+ case Aligned:
+ default:
+ return size;
+ }
+}
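+
+/*
+ * Example: MOVDQA (Aligned) with a 16-byte operand requires the linear
+ * address to be 16-byte aligned and #GPs otherwise; MOVDQU (Unaligned)
+ * never faults on alignment; FXSAVE (Aligned16) needs only 16-byte
+ * alignment even though it touches 512 bytes.
+ */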
+
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ enum x86emul_mode mode, ulong *linear,
+ unsigned int flags)
+{
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ u8 va_bits;
+
+ la = seg_base(ctxt, addr.seg) + addr.ea;
+ *max_size = 0;
+ switch (mode) {
+ case X86EMUL_MODE_PROT64:
+ *linear = la = ctxt->ops->get_untagged_addr(ctxt, la, flags);
+ va_bits = ctxt_virt_addr_bits(ctxt);
+ if (!__is_canonical_address(la, va_bits))
+ goto bad;
+
+ *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
+ if (size > *max_size)
+ goto bad;
+ break;
+ default:
+ *linear = la = (u32)la;
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) || !(desc.type & 2)) &&
+ (flags & X86EMUL_F_WRITE))
+ goto bad;
+ /* unreadable code segment */
+ if (!(flags & X86EMUL_F_FETCH) && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if (!(desc.type & 8) && (desc.type & 4)) {
+ /* expand-down segment */
+ if (addr.ea <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ }
+ if (addr.ea > lim)
+ goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
+ break;
}
- *dest = fc->data[eip - fc->start];
+ if (la & (insn_alignment(ctxt, size) - 1))
+ return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, 0);
+ else
+ return emulate_gp(ctxt, 0);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ unsigned max_size;
+ return __linearize(ctxt, addr, &max_size, size, ctxt->mode, linear,
+ write ? X86EMUL_F_WRITE : 0);
}
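+
+/*
+ * A minimal caller sketch, mirroring segmented_read() further below:
+ *
+ *	ulong linear;
+ *
+ *	rc = linearize(ctxt, addr, size, false, &linear);
+ *	if (rc != X86EMUL_CONTINUE)
+ *		return rc;
+ *	... access 'linear', now canonical/limit- and alignment-checked ...
+ */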
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- unsigned long eip, void *dest, unsigned size)
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
{
+ ulong linear;
int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, ctxt->mode, &linear,
+ X86EMUL_F_FETCH);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+ struct desc_struct cs;
+ u16 selector;
+ u32 base3;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
+ /* Real mode; the CPU must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_REAL;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ /* Protected/VM86 mode; the CPU must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_VM86;
+ return X86EMUL_CONTINUE;
+ }
- /* x86 instructions are limited to 15 bytes. */
- if (eip + size - ctxt->eip > 15)
+ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
return X86EMUL_UNHANDLEABLE;
- while (size--) {
- rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+
+ if (efer & EFER_LMA) {
+ if (cs.l) {
+ /* Proper long mode */
+ ctxt->mode = X86EMUL_MODE_PROT64;
+ } else if (cs.d) {
+ /* 32-bit compatibility mode */
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT16;
+ }
+ } else {
+ /* Legacy 32-bit / 16-bit mode */
+ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
}
+
return X86EMUL_CONTINUE;
}
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst);
+}
+
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ int rc = emulator_recalc_and_set_mode(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip(ctxt, dst);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
+static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear,
+ void *data, unsigned size)
+{
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int linear_write_system(struct x86_emulate_ctxt *ctxt,
+ ulong linear, void *data,
+ unsigned int size)
+{
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned int size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+/*
+ * Prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+ */
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
+{
+ int rc;
+ unsigned size, max_size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ /*
+ * We do not know exactly how many bytes will be needed, and
+ * __linearize is expensive, so fetch as much as possible. We
+ * just have to avoid going beyond the 15-byte limit, the end
+ * of the segment, or the end of the page.
+ *
+ * __linearize is called with size 0 so that it does not do any
+ * boundary check itself. Instead, we use max_size to check
+ * against op_size.
+ */
+ rc = __linearize(ctxt, addr, &max_size, 0, ctxt->mode, &linear,
+ X86EMUL_F_FETCH);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+
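+ /* cur_size <= 15 here, so 15UL ^ cur_size == 15 - cur_size */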
+ size = min_t(unsigned, 15UL ^ cur_size, max_size);
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
+
+ /*
+ * One instruction can only straddle two pages, and one page has
+ * already been loaded at the beginning of x86_decode_insn.  So if
+ * there are still not enough bytes, we must have hit the 15-byte
+ * limit.
+ */
+ if (unlikely(size < op_size))
+ return emulate_gp(ctxt, 0);
+
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
+ return X86EMUL_CONTINUE;
+}
+
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr;
+
+ if (unlikely(done_size < size))
+ return __do_insn_fetch_bytes(ctxt, size - done_size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _ctxt) \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += sizeof(_type); \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _ctxt) \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
+})
+
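+/*
+ * Usage (as in the decode paths below): the macro bounds-checks through
+ * do_insn_fetch_bytes() and advances _eip in one step, e.g.
+ *
+ *	modrm_ea += insn_fetch(s8, ctxt);
+ *
+ * jumping to the enclosing function's "done" label on failure, which is
+ * why this is a macro rather than a function.
+ */
+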
/*
* Given the 'reg' portion of a ModRM byte, and a register block, return a
* pointer into the block that addresses the relevant register.
* @highbyte_regs specifies whether to decode AH,CH,DH,BH.
*/
-static void *decode_register(u8 modrm_reg, unsigned long *regs,
- int highbyte_regs)
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
+ int byteop)
{
void *p;
+ int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop;
- p = &regs[modrm_reg];
if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
- p = (unsigned char *)&regs[modrm_reg & 3] + 1;
+ p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+ else
+ p = reg_rmw(ctxt, modrm_reg);
return p;
}
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *ptr,
+ struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -708,743 +944,725 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
- ctxt->vcpu, NULL);
+ rc = segmented_read_std(ctxt, addr, size, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
- ctxt->vcpu, NULL);
+ addr.ea += 2;
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
return rc;
}
-static int test_cc(unsigned int condition, unsigned int flags)
+EM_ASM_2(add);
+EM_ASM_2(or);
+EM_ASM_2(adc);
+EM_ASM_2(sbb);
+EM_ASM_2(and);
+EM_ASM_2(sub);
+EM_ASM_2(xor);
+EM_ASM_2(cmp);
+EM_ASM_2(test);
+EM_ASM_2(xadd);
+
+EM_ASM_1SRC2(mul, mul_ex);
+EM_ASM_1SRC2(imul, imul_ex);
+EM_ASM_1SRC2EX(div, div_ex);
+EM_ASM_1SRC2EX(idiv, idiv_ex);
+
+EM_ASM_3WCL(shld);
+EM_ASM_3WCL(shrd);
+
+EM_ASM_2W(imul);
+
+EM_ASM_1(not);
+EM_ASM_1(neg);
+EM_ASM_1(inc);
+EM_ASM_1(dec);
+
+EM_ASM_2CL(rol);
+EM_ASM_2CL(ror);
+EM_ASM_2CL(rcl);
+EM_ASM_2CL(rcr);
+EM_ASM_2CL(shl);
+EM_ASM_2CL(shr);
+EM_ASM_2CL(sar);
+
+EM_ASM_2W(bsf);
+EM_ASM_2W(bsr);
+EM_ASM_2W(bt);
+EM_ASM_2W(bts);
+EM_ASM_2W(btr);
+EM_ASM_2W(btc);
+
+EM_ASM_2R(cmp, cmp_r);
+
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
{
- int rc = 0;
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return em_bsf(ctxt);
+}
- switch ((condition & 15) >> 1) {
- case 0: /* o */
- rc |= (flags & EFLG_OF);
- break;
- case 1: /* b/c/nae */
- rc |= (flags & EFLG_CF);
- break;
- case 2: /* z/e */
- rc |= (flags & EFLG_ZF);
- break;
- case 3: /* be/na */
- rc |= (flags & (EFLG_CF|EFLG_ZF));
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return em_bsr(ctxt);
+}
+
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ return __emulate_cc(flags, condition & 0xf);
+}
+
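+/*
+ * Example: condition code 4 (the "e"/"z" encoding used by JE and SETE)
+ * makes test_cc() return nonzero iff ZF is set; the low bit inverts the
+ * sense, so condition code 5 ("ne"/"nz") tests for ZF clear.
+ */
+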
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
break;
- case 4: /* s */
- rc |= (flags & EFLG_SF);
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
break;
- case 5: /* p/pe */
- rc |= (flags & EFLG_PF);
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
break;
- case 7: /* le/ng */
- rc |= (flags & EFLG_ZF);
- /* fall through */
- case 6: /* l/nge */
- rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
break;
}
+ op->orig_val = op->val;
+}
- /* Odd condition identifiers (lsb == 1) have inverted sense. */
- return (!!rc ^ (condition & 1));
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fninit");
+ kvm_fpu_put();
+ return X86EMUL_CONTINUE;
}
-static void decode_register_operand(struct operand *op,
- struct decode_cache *c,
- int inhibit_bytereg)
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
{
- unsigned reg = c->modrm_reg;
- int highbyte_regs = c->rex_prefix == 0;
+ u16 fcw;
- if (!(c->d & ModRM))
- reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
- op->type = OP_REG;
- if ((c->d & ByteOp) && !inhibit_bytereg) {
- op->ptr = decode_register(reg, c->regs, highbyte_regs);
- op->val = *(u8 *)op->ptr;
- op->bytes = 1;
- } else {
- op->ptr = decode_register(reg, c->regs, 0);
- op->bytes = c->op_bytes;
- switch (op->bytes) {
- case 2:
- op->val = *(u16 *)op->ptr;
- break;
- case 4:
- op->val = *(u32 *)op->ptr;
- break;
- case 8:
- op->val = *(u64 *) op->ptr;
- break;
- }
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fnstcw %0": "+m"(fcw));
+ kvm_fpu_put();
+
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ kvm_fpu_get();
+ asm volatile("fnstsw %0": "+m"(fsw));
+ kvm_fpu_put();
+
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static void __decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op, int reg)
+{
+ if ((ctxt->d & Avx) && ctxt->op_bytes == 32) {
+ op->type = OP_YMM;
+ op->bytes = 32;
+ op->addr.xmm = reg;
+ kvm_read_avx_reg(reg, &op->vec_val2);
+ return;
}
- op->orig_val = op->val;
+ if (ctxt->d & (Avx|Sse)) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ kvm_read_sse_reg(reg, &op->vec_val);
+ return;
+ }
+ if (ctxt->d & Mmx) {
+ reg &= 7;
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = reg;
+ return;
+ }
+
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+ fetch_register_operand(op);
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned int reg;
+
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
+ reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0);
+
+ __decode_register_operand(ctxt, op, reg);
+}
+
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
}
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct operand *op)
{
- struct decode_cache *c = &ctxt->decode;
u8 sib;
- int index_reg = 0, base_reg = 0, scale;
+ int index_reg, base_reg, scale;
int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
- if (c->rex_prefix) {
- c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
- c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
- }
+ ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0);
+ index_reg = (ctxt->rex_bits & REX_X ? 8 : 0);
+ base_reg = (ctxt->rex_bits & REX_B ? 8 : 0);
- c->modrm = insn_fetch(u8, 1, c->eip);
- c->modrm_mod |= (c->modrm & 0xc0) >> 6;
- c->modrm_reg |= (c->modrm & 0x38) >> 3;
- c->modrm_rm |= (c->modrm & 0x07);
- c->modrm_ea = 0;
- c->use_modrm_ea = 1;
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
+ ctxt->modrm_seg = VCPU_SREG_DS;
- if (c->modrm_mod == 3) {
- c->modrm_ptr = decode_register(c->modrm_rm,
- c->regs, c->d & ByteOp);
- c->modrm_val = *(unsigned long *)c->modrm_ptr;
+ if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
+ __decode_register_operand(ctxt, op, ctxt->modrm_rm);
return rc;
}
- if (c->ad_bytes == 2) {
- unsigned bx = c->regs[VCPU_REGS_RBX];
- unsigned bp = c->regs[VCPU_REGS_RBP];
- unsigned si = c->regs[VCPU_REGS_RSI];
- unsigned di = c->regs[VCPU_REGS_RDI];
+ op->type = OP_MEM;
+
+ if (ctxt->ad_bytes == 2) {
+ unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+ unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+ unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+ unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
/* 16-bit ModR/M decode. */
- switch (c->modrm_mod) {
+ switch (ctxt->modrm_mod) {
case 0:
- if (c->modrm_rm == 6)
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ if (ctxt->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, ctxt);
break;
case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, ctxt);
break;
case 2:
- c->modrm_ea += insn_fetch(u16, 2, c->eip);
+ modrm_ea += insn_fetch(u16, ctxt);
break;
}
- switch (c->modrm_rm) {
+ switch (ctxt->modrm_rm) {
case 0:
- c->modrm_ea += bx + si;
+ modrm_ea += bx + si;
break;
case 1:
- c->modrm_ea += bx + di;
+ modrm_ea += bx + di;
break;
case 2:
- c->modrm_ea += bp + si;
+ modrm_ea += bp + si;
break;
case 3:
- c->modrm_ea += bp + di;
+ modrm_ea += bp + di;
break;
case 4:
- c->modrm_ea += si;
+ modrm_ea += si;
break;
case 5:
- c->modrm_ea += di;
+ modrm_ea += di;
break;
case 6:
- if (c->modrm_mod != 0)
- c->modrm_ea += bp;
+ if (ctxt->modrm_mod != 0)
+ modrm_ea += bp;
break;
case 7:
- c->modrm_ea += bx;
+ modrm_ea += bx;
break;
}
- if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
- (c->modrm_rm == 6 && c->modrm_mod != 0))
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_SS);
- c->modrm_ea = (u16)c->modrm_ea;
+ if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+ (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+ ctxt->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
- if ((c->modrm_rm & 7) == 4) {
- sib = insn_fetch(u8, 1, c->eip);
+ if ((ctxt->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, ctxt);
index_reg |= (sib >> 3) & 7;
base_reg |= sib & 7;
scale = sib >> 6;
- if ((base_reg & 7) == 5 && c->modrm_mod == 0)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
- else
- c->modrm_ea += c->regs[base_reg];
+ if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, ctxt);
+ else {
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
+ }
if (index_reg != 4)
- c->modrm_ea += c->regs[index_reg] << scale;
- } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+ modrm_ea += reg_read(ctxt, index_reg) << scale;
+ } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
if (ctxt->mode == X86EMUL_MODE_PROT64)
- c->rip_relative = 1;
- } else
- c->modrm_ea += c->regs[c->modrm_rm];
- switch (c->modrm_mod) {
- case 0:
- if (c->modrm_rm == 5)
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
- break;
+ ctxt->rip_relative = 1;
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ }
+ switch (ctxt->modrm_mod) {
case 1:
- c->modrm_ea += insn_fetch(s8, 1, c->eip);
+ modrm_ea += insn_fetch(s8, ctxt);
break;
case 2:
- c->modrm_ea += insn_fetch(s32, 4, c->eip);
+ modrm_ea += insn_fetch(s32, ctxt);
break;
}
}
+ op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
done:
return rc;
}
static int decode_abs(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct operand *op)
{
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
- switch (c->ad_bytes) {
+ op->type = OP_MEM;
+ switch (ctxt->ad_bytes) {
case 2:
- c->modrm_ea = insn_fetch(u16, 2, c->eip);
+ op->addr.mem.ea = insn_fetch(u16, ctxt);
break;
case 4:
- c->modrm_ea = insn_fetch(u32, 4, c->eip);
+ op->addr.mem.ea = insn_fetch(u32, ctxt);
break;
case 8:
- c->modrm_ea = insn_fetch(u64, 8, c->eip);
+ op->addr.mem.ea = insn_fetch(u64, ctxt);
break;
}
done:
return rc;
}
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes, group;
-
-
- /* we cannot decode insn before we complete previous rep insn */
- WARN_ON(ctxt->restart);
-
- /* Shadow copy of register state. Committed on successful emulation. */
- memset(c, 0, sizeof(struct decode_cache));
- c->eip = ctxt->eip;
- c->fetch.start = c->fetch.end = c->eip;
- ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-
- switch (mode) {
- case X86EMUL_MODE_REAL:
- case X86EMUL_MODE_VM86:
- case X86EMUL_MODE_PROT16:
- def_op_bytes = def_ad_bytes = 2;
- break;
- case X86EMUL_MODE_PROT32:
- def_op_bytes = def_ad_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86EMUL_MODE_PROT64:
- def_op_bytes = 4;
- def_ad_bytes = 8;
- break;
-#endif
- default:
- return -1;
- }
-
- c->op_bytes = def_op_bytes;
- c->ad_bytes = def_ad_bytes;
-
- /* Legacy prefixes. */
- for (;;) {
- switch (c->b = insn_fetch(u8, 1, c->eip)) {
- case 0x66: /* operand-size override */
- /* switch between 2/4 bytes */
- c->op_bytes = def_op_bytes ^ 6;
- break;
- case 0x67: /* address-size override */
- if (mode == X86EMUL_MODE_PROT64)
- /* switch between 4/8 bytes */
- c->ad_bytes = def_ad_bytes ^ 12;
- else
- /* switch between 2/4 bytes */
- c->ad_bytes = def_ad_bytes ^ 6;
- break;
- case 0x26: /* ES override */
- case 0x2e: /* CS override */
- case 0x36: /* SS override */
- case 0x3e: /* DS override */
- set_seg_override(c, (c->b >> 3) & 3);
- break;
- case 0x64: /* FS override */
- case 0x65: /* GS override */
- set_seg_override(c, c->b & 7);
- break;
- case 0x40 ... 0x4f: /* REX */
- if (mode != X86EMUL_MODE_PROT64)
- goto done_prefixes;
- c->rex_prefix = c->b;
- continue;
- case 0xf0: /* LOCK */
- c->lock_prefix = 1;
- break;
- case 0xf2: /* REPNE/REPNZ */
- c->rep_prefix = REPNE_PREFIX;
- break;
- case 0xf3: /* REP/REPE/REPZ */
- c->rep_prefix = REPE_PREFIX;
- break;
- default:
- goto done_prefixes;
- }
-
- /* Any legacy prefix after a REX prefix nullifies its effect. */
-
- c->rex_prefix = 0;
- }
+ long sv = 0, mask;
-done_prefixes:
+ if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
- /* REX prefix. */
- if (c->rex_prefix)
- if (c->rex_prefix & 8)
- c->op_bytes = 8; /* REX.W */
+ if (ctxt->src.bytes == 2)
+ sv = (s16)ctxt->src.val & (s16)mask;
+ else if (ctxt->src.bytes == 4)
+ sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
- /* Opcode byte(s). */
- c->d = opcode_table[c->b];
- if (c->d == 0) {
- /* Two-byte opcode? */
- if (c->b == 0x0f) {
- c->twobyte = 1;
- c->b = insn_fetch(u8, 1, c->eip);
- c->d = twobyte_table[c->b];
- }
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
}
- if (c->d & Group) {
- group = c->d & GroupMask;
- c->modrm = insn_fetch(u8, 1, c->eip);
- --c->eip;
+ /* only subword offset */
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+}
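+
+/*
+ * Worked example: for "bt %eax, (mem)" with EAX == 100 and a 32-bit
+ * operand, mask is ~31L and sv is 96, so the effective address advances
+ * by 96 / 8 == 12 bytes and the remaining offset 100 & 31 == 4 selects
+ * bit 4 of that dword.  Immediate bit offsets skip the address
+ * adjustment and only get the final subword masking.
+ */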
- group = (group << 3) + ((c->modrm >> 3) & 7);
- if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
- c->d = group2_table[group];
- else
- c->d = group_table[group];
- }
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->mem_read;
- /* Unrecognised? */
- if (c->d == 0) {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
- }
+ if (mc->pos < mc->end)
+ goto read_cached;
- if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
- c->op_bytes = 8;
+ if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt))
+ return X86EMUL_UNHANDLEABLE;
- /* ModRM and SIB bytes. */
- if (c->d & ModRM)
- rc = decode_modrm(ctxt, ops);
- else if (c->d & MemAbs)
- rc = decode_abs(ctxt, ops);
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
- goto done;
-
- if (!c->has_seg_override)
- set_seg_override(c, VCPU_SREG_DS);
+ return rc;
- if (!(!c->twobyte && c->b == 0x8d))
- c->modrm_ea += seg_override_base(ctxt, c);
+ mc->end += size;
- if (c->ad_bytes != 8)
- c->modrm_ea = (u32)c->modrm_ea;
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
+ return X86EMUL_CONTINUE;
+}
- if (c->rip_relative)
- c->modrm_ea += c->eip;
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
- /*
- * Decode and fetch the source operand: register, memory
- * or immediate.
- */
- switch (c->d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- decode_register_operand(&c->src, c, 0);
- break;
- case SrcMem16:
- c->src.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- c->src.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- c->src.bytes = (c->d & ByteOp) ? 1 :
- c->op_bytes;
- /* Don't fetch the address for invlpg: it could be unmapped. */
- if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
- break;
- srcmem_common:
- /*
- * For instructions with a ModR/M byte, switch to register
- * access if Mod = 3.
- */
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->src.type = OP_REG;
- c->src.val = c->modrm_val;
- c->src.ptr = c->modrm_ptr;
- break;
- }
- c->src.type = OP_MEM;
- c->src.ptr = (unsigned long *)c->modrm_ea;
- c->src.val = 0;
- break;
- case SrcImm:
- case SrcImmU:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->src.bytes == 8)
- c->src.bytes = 4;
- /* NB. Immediates are sign-extended as necessary. */
- switch (c->src.bytes) {
- case 1:
- c->src.val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->src.val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- c->src.val = insn_fetch(s32, 4, c->eip);
- break;
- }
- if ((c->d & SrcMask) == SrcImmU) {
- switch (c->src.bytes) {
- case 1:
- c->src.val &= 0xff;
- break;
- case 2:
- c->src.val &= 0xffff;
- break;
- case 4:
- c->src.val &= 0xffffffff;
- break;
- }
- }
- break;
- case SrcImmByte:
- case SrcImmUByte:
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = 1;
- if ((c->d & SrcMask) == SrcImmByte)
- c->src.val = insn_fetch(s8, 1, c->eip);
- else
- c->src.val = insn_fetch(u8, 1, c->eip);
- break;
- case SrcOne:
- c->src.bytes = 1;
- c->src.val = 1;
- break;
- case SrcSI:
- c->src.type = OP_MEM;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)
- register_address(c, seg_override_base(ctxt, c),
- c->regs[VCPU_REGS_RSI]);
- c->src.val = 0;
- break;
- }
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, linear, data, size);
+}
- /*
- * Decode and fetch the second source operand: register, memory
- * or immediate.
- */
- switch (c->d & Src2Mask) {
- case Src2None:
- break;
- case Src2CL:
- c->src2.bytes = 1;
- c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
- break;
- case Src2ImmByte:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 1;
- c->src2.val = insn_fetch(u8, 1, c->eip);
- break;
- case Src2Imm16:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 2;
- c->src2.val = insn_fetch(u16, 2, c->eip);
- break;
- case Src2One:
- c->src2.bytes = 1;
- c->src2.val = 1;
- break;
- case Src2Mem16:
- c->src2.type = OP_MEM;
- c->src2.bytes = 2;
- c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
- c->src2.val = 0;
- break;
- }
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
- /* Decode and fetch the destination operand: register or memory. */
- switch (c->d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- return 0;
- case DstReg:
- decode_register_operand(&c->dst, c,
- c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
- break;
- case DstMem:
- case DstMem64:
- if ((c->d & ModRM) && c->modrm_mod == 3) {
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.type = OP_REG;
- c->dst.val = c->dst.orig_val = c->modrm_val;
- c->dst.ptr = c->modrm_ptr;
- break;
- }
- c->dst.type = OP_MEM;
- c->dst.ptr = (unsigned long *)c->modrm_ea;
- if ((c->d & DstMask) == DstMem64)
- c->dst.bytes = 8;
- else
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.val = 0;
- if (c->d & BitOp) {
- unsigned long mask = ~(c->dst.bytes * 8 - 1);
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
- c->dst.ptr = (void *)c->dst.ptr +
- (c->src.val & mask) / 8;
- }
- break;
- case DstAcc:
- c->dst.type = OP_REG;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = &c->regs[VCPU_REGS_RAX];
- switch (c->dst.bytes) {
- case 1:
- c->dst.val = *(u8 *)c->dst.ptr;
- break;
- case 2:
- c->dst.val = *(u16 *)c->dst.ptr;
- break;
- case 4:
- c->dst.val = *(u32 *)c->dst.ptr;
- break;
- case 8:
- c->dst.val = *(u64 *)c->dst.ptr;
- break;
- }
- c->dst.orig_val = c->dst.val;
- break;
- case DstDI:
- c->dst.type = OP_MEM;
- c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)
- register_address(c, es_base(ctxt),
- c->regs[VCPU_REGS_RDI]);
- c->dst.val = 0;
- break;
- }
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
-done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
}
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
unsigned int size, unsigned short port,
void *dest)
{
- struct read_cache *rc = &ctxt->decode.io_read;
+ struct read_cache *rc = &ctxt->io_read;
if (rc->pos == rc->end) { /* refill pio read ahead */
- struct decode_cache *c = &ctxt->decode;
unsigned int in_page, n;
- unsigned int count = c->rep_prefix ?
- address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
- in_page = (ctxt->eflags & EFLG_DF) ?
- offset_in_page(c->regs[VCPU_REGS_RDI]) :
- PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
- n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
- count);
+ unsigned int count = ctxt->rep_prefix ?
+ address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
+ in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
+ offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+ PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
- if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+ if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
return 0;
rc->end = n * size;
}
- memcpy(dest, rc->data + rc->pos, size);
- rc->pos += size;
+ if (ctxt->rep_prefix && (ctxt->d & String) &&
+ !(ctxt->eflags & X86_EFLAGS_DF)) {
+ ctxt->dst.data = rc->data + rc->pos;
+ ctxt->dst.type = OP_MEM_STR;
+ ctxt->dst.count = (rc->end - rc->pos) / size;
+ rc->pos = rc->end;
+ } else {
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ }
return 1;
}
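
The refill path above sizes the read-ahead batch with min3(): the room left in the current page, the cache capacity in elements, and the REP count. A standalone sketch of that sizing for the forward (DF=0) case, with made-up constants:

#include <stdio.h>

#define PAGE_SIZE   4096u
#define CACHE_BYTES 1024u	/* stand-in for sizeof(rc->data) */

static unsigned min3u(unsigned a, unsigned b, unsigned c)
{
	unsigned m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned size = 4;			/* 4-byte "ins" */
	unsigned long rdi = 0x1ff8;		/* near a page boundary */
	unsigned count = 100;			/* REP count from RCX */
	unsigned in_page = PAGE_SIZE - (rdi & (PAGE_SIZE - 1));
	unsigned n = min3u(in_page, CACHE_BYTES / size, count);

	printf("batch of %u\n", n ? n : 1);	/* prints 8 */
	return 0;
}
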
-static u32 desc_limit_scaled(struct desc_struct *desc)
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 index, struct desc_struct *desc)
{
- u32 limit = get_desc_limit(desc);
+ struct desc_ptr dt;
+ ulong addr;
- return desc->g ? (limit << 12) | 0xfff : limit;
+ ctxt->ops->get_idt(ctxt, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, index << 3 | 0x2);
+
+ addr = dt.address + index * 8;
+ return linear_read_system(ctxt, addr, desc, sizeof(*desc));
}
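
In protected mode each IDT gate is 8 bytes, so vector n lives at idt.base + n*8 and the table limit must cover bytes n*8 through n*8+7; on failure the error code is the vector shifted left by 3 with bit 1 set to flag the IDT, which is the `index << 3 | 0x2` above. A standalone sketch of that check (constants are made up):

#include <stdio.h>

int main(void)
{
	unsigned idt_limit = 255;	/* IDT covering vectors 0..31 only */
	unsigned index = 32;

	if (idt_limit < index * 8 + 7)
		printf("#GP(0x%x)\n", index << 3 | 0x2);	/* #GP(0x102) */
	return 0;
}
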
static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 selector, struct desc_ptr *dt)
{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
+
if (selector & 1 << 2) {
struct desc_struct desc;
- memset (dt, 0, sizeof *dt);
- if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+ u16 sel;
+
+ memset(dt, 0, sizeof(*dt));
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
- dt->address = get_desc_base(&desc);
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
} else
- ops->get_gdt(dt, ctxt->vcpu);
+ ops->get_gdt(ctxt, dt);
}
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 selector, struct desc_struct *desc)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
{
struct desc_ptr dt;
u16 index = selector >> 3;
- int ret;
- u32 err;
ulong addr;
- get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+ get_descriptor_table_ptr(ctxt, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
- if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
- return ret;
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8-byte segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc));
}
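
get_descriptor_ptr() decodes the selector the architectural way: bits 15:3 index the table, bit 2 (TI) chooses LDT over GDT, and bits 1:0 are the RPL. A quick standalone illustration (the sample selector is just an example value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t sel = 0x2b;	/* e.g. a user data selector: entry 5, RPL 3 */

	printf("index=%u table=%s rpl=%u\n",
	       sel >> 3, (sel & (1 << 2)) ? "LDT" : "GDT", sel & 3);
	/* prints: index=5 table=GDT rpl=3 */
	return 0;
}
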
/* allowed just for 8-byte segments */
static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 selector, struct desc_struct *desc)
{
- struct desc_ptr dt;
- u16 index = selector >> 3;
- u32 err;
+ int rc;
ulong addr;
- int ret;
- get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ return linear_write_system(ctxt, addr, desc, sizeof(*desc));
+}
- addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl)
+{
+ const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET;
+ u64 efer = 0, cet = 0, ssp = 0;
- return ret;
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET))
+ return false;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer))
+ return true;
+
+ /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */
+ if (!(efer & EFER_LMA))
+ return false;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet))
+ return true;
+
+ if (!(cet & CET_SHSTK_EN))
+ return false;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp))
+ return true;
+
+ /*
+ * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must
+ * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode.
+ */
+ return ssp >> 32;
}
-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 selector, int seg)
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl,
+ enum x86_transfer_type transfer,
+ struct desc_struct *desc)
{
- struct desc_struct seg_desc;
- u8 dpl, rpl, cpl;
+ struct desc_struct seg_desc, old_desc;
+ u8 dpl, rpl;
unsigned err_vec = GP_VECTOR;
u32 err_code = 0;
bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
int ret;
+ u16 dummy;
+ u32 base3 = 0;
- memset(&seg_desc, 0, sizeof seg_desc);
+ memset(&seg_desc, 0, sizeof(seg_desc));
- if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
- || ctxt->mode == X86EMUL_MODE_REAL) {
- /* set real mode segment descriptor */
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor (keep limit etc. for
+ * unreal mode) */
+ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
+ set_desc_base(&seg_desc, selector << 4);
+ goto load;
+ } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+ /* VM86 needs a clean new segment descriptor */
set_desc_base(&seg_desc, selector << 4);
set_desc_limit(&seg_desc, 0xffff);
seg_desc.type = 3;
seg_desc.p = 1;
seg_desc.s = 1;
+ seg_desc.dpl = 3;
goto load;
}
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
- && null_selector)
- goto exception;
+ rpl = selector & 3;
/* TR should be in GDT only */
if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
goto exception;
- if (null_selector) /* for NULL selector skip all following checks */
+ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
+ if (null_selector) {
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+ goto exception;
+
+ if (seg == VCPU_SREG_SS) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+ goto exception;
+
+ /*
+ * ctxt->ops->set_segment expects the CPL to be in
+ * SS.DPL, so fake an expand-up 32-bit data segment.
+ */
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = cpl;
+ seg_desc.d = 1;
+ seg_desc.g = 1;
+ }
+
+ /* Skip all following checks */
goto load;
+ }
- ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
err_code = selector & 0xfffc;
- err_vec = GP_VECTOR;
-
- /* can't load system descriptor into segment selecor */
- if (seg <= VCPU_SREG_GS && !seg_desc.s)
- goto exception;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ /* can't load system descriptor into segment selector */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
goto exception;
}
- rpl = selector & 3;
dpl = seg_desc.dpl;
- cpl = ops->cpl(ctxt->vcpu);
switch (seg) {
case VCPU_SREG_SS:
/*
* segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
+ * selector's RPL != CPL or DPL != CPL
*/
if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
goto exception;
break;
case VCPU_SREG_CS:
+ /*
+ * KVM uses "none" when loading CS as part of emulating Real
+ * Mode exceptions and IRET (handled above). In all other
+ * cases, loading CS without a control transfer is a KVM bug.
+ */
+ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE))
+ goto exception;
+
if (!(seg_desc.type & 8))
goto exception;
- if (seg_desc.type & 4) {
- /* conforming */
- if (dpl > cpl)
+ if (transfer == X86_TRANSFER_RET) {
+ /* RET can never return to an inner privilege level. */
+ if (rpl < cpl)
goto exception;
- } else {
- /* nonconforming */
- if (rpl > cpl || dpl != cpl)
+ /* Outer-privilege level return is not implemented */
+ if (rpl > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if (transfer == X86_TRANSFER_RET || transfer == X86_TRANSFER_TASK_SWITCH) {
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > rpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (dpl != rpl)
+ goto exception;
+ }
+ } else { /* X86_TRANSFER_CALL_JMP */
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ }
+ /* in long-mode d/b must be clear if l is set */
+ if (seg_desc.d && seg_desc.l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
goto exception;
}
+ if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) {
+ err_code = 0;
+ goto exception;
+ }
+
/* CS(RPL) <- CPL */
selector = (selector & 0xfffc) | cpl;
break;
@@ -1460,94 +1678,203 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/*
* segment is not a data or readable code segment or
* ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
+ * and ((RPL > DPL) or (CPL > DPL)))
*/
if ((seg_desc.type & 0xa) == 0x8 ||
(((seg_desc.type & 0xc) != 0xc) &&
- (rpl > dpl && cpl > dpl)))
+ (rpl > dpl || cpl > dpl)))
goto exception;
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
- seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt,
+ X86EMUL_F_DT_LOAD))
+ return emulate_gp(ctxt, err_code);
+ }
+
+ if (seg == VCPU_SREG_TR) {
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
}
load:
- ops->set_segment_selector(selector, seg, ctxt->vcpu);
- ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
+ if (desc)
+ *desc = seg_desc;
return X86EMUL_CONTINUE;
exception:
- kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_exception(ctxt, err_vec, err_code, true);
}
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg)
{
- struct decode_cache *c = &ctxt->decode;
+ u8 cpl = ctxt->ops->cpl(ctxt);
- c->dst.type = OP_MEM;
- c->dst.bytes = c->op_bytes;
- c->dst.val = c->src.val;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]);
+ /*
+ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+ * they can load it at CPL<3 (Intel's manual says only LSS can,
+ * but it's wrong).
+ *
+ * However, the Intel manual says that putting IST=1/DPL=3 in
+ * an interrupt gate will result in SS=3 (the AMD manual instead
+ * says it doesn't), so allow SS=3 in __load_segment_descriptor
+ * and only forbid it here.
+ */
+ if (seg == VCPU_SREG_SS && selector == 3 &&
+ ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
+}
+
+static void write_register_operand(struct operand *op)
+{
+ return assign_register(op->addr.reg, op->val, op->bytes);
+}
+
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
+{
+ switch (op->type) {
+ case OP_REG:
+ write_register_operand(op);
+ break;
+ case OP_MEM:
+ if (ctxt->lock_prefix)
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
+ op->addr.mem,
+ &op->val,
+ op->bytes);
+ case OP_MEM_STR:
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
+ case OP_XMM:
+ if (!(ctxt->d & Avx)) {
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
+ break;
+ }
+ /* full YMM write but with high bytes cleared */
+ memset(op->valptr + 16, 0, 16);
+ fallthrough;
+ case OP_YMM:
+ kvm_write_avx_reg(op->addr.xmm, &op->vec_val2);
+ break;
+ case OP_MM:
+ kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len)
+{
+ struct segmented_address addr;
+
+ rsp_increment(ctxt, -len);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+
+ return segmented_write(ctxt, addr, data, len);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes);
}
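
emulate_push() above is the whole PUSH data path: decrement RSP by the operand size, mask RSP to the stack-segment width, and store at SS:RSP. A user-space model; the flat byte array and the 16-bit mask are illustrative, standing in for a 16-bit stack segment:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t stack[0x10000];

static void push(uint64_t *rsp, uint64_t mask, const void *data, int len)
{
	*rsp -= len;				/* rsp_increment(ctxt, -len) */
	memcpy(&stack[*rsp & mask], data, len);	/* store at SS:RSP */
}

int main(void)
{
	uint64_t rsp = 0x8000;
	uint32_t val = 0xdeadbeef, top;

	push(&rsp, 0xffff, &val, sizeof(val));
	memcpy(&top, &stack[rsp & 0xffff], sizeof(top));
	printf("rsp=0x%llx top=0x%x\n", (unsigned long long)rsp, top);
	/* prints: rsp=0x7ffc top=0xdeadbeef */
	return 0;
}
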
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
void *dest, int len)
{
- struct decode_cache *c = &ctxt->decode;
int rc;
+ struct segmented_address addr;
- rc = ops->read_emulated(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- dest, len, ctxt->vcpu);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+ rc = segmented_read(ctxt, addr, dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+ rsp_increment(ctxt, len);
return rc;
}
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
static int emulate_popf(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- void *dest, int len)
+ void *dest, int len)
{
int rc;
- unsigned long val, change_mask;
- int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- int cpl = ops->cpl(ctxt->vcpu);
+ unsigned long val = 0;
+ unsigned long change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ int cpl = ctxt->ops->cpl(ctxt);
- rc = emulate_pop(ctxt, ops, &val, len);
+ rc = emulate_pop(ctxt, &val, len);
if (rc != X86EMUL_CONTINUE)
return rc;
- change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
- | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+ change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
switch(ctxt->mode) {
case X86EMUL_MODE_PROT64:
case X86EMUL_MODE_PROT32:
case X86EMUL_MODE_PROT16:
if (cpl == 0)
- change_mask |= EFLG_IOPL;
+ change_mask |= X86_EFLAGS_IOPL;
if (cpl <= iopl)
- change_mask |= EFLG_IF;
+ change_mask |= X86_EFLAGS_IF;
break;
case X86EMUL_MODE_VM86:
- if (iopl < 3) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
- change_mask |= EFLG_IF;
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
+ change_mask |= X86_EFLAGS_IF;
break;
default: /* real mode */
- change_mask |= (EFLG_IOPL | EFLG_IF);
+ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
break;
}
@@ -1557,498 +1884,686 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_popf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->eflags;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+ ulong rbp;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rbp = reg_read(ctxt, VCPU_REGS_RBP);
+ rc = emulate_push(ctxt, &rbp, stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
+ stack_mask(ctxt));
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+ reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
+}
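
With the nesting level forced to zero, em_enter()/em_leave() reduce to the classic frame idiom: push RBP, RBP = RSP, RSP -= frame_size, and the inverse for LEAVE. A register-only sketch; the actual store and reload of RBP through the stack is elided for brevity, so this models only the pointer arithmetic:

#include <stdint.h>
#include <stdio.h>

struct cpu { uint64_t rsp, rbp; };

static void enter0(struct cpu *c, unsigned frame_size)
{
	c->rsp -= 8;		/* push old rbp (memory write elided) */
	c->rbp = c->rsp;
	c->rsp -= frame_size;
}

static void leave(struct cpu *c)
{
	c->rsp = c->rbp;
	c->rsp += 8;		/* pop rbp (memory read elided) */
}

int main(void)
{
	struct cpu c = { .rsp = 0x8000 };

	enter0(&c, 0x20);
	printf("enter: rsp=0x%llx rbp=0x%llx\n",
	       (unsigned long long)c.rsp, (unsigned long long)c.rbp);
	leave(&c);
	printf("leave: rsp=0x%llx\n", (unsigned long long)c.rsp);
	return 0;
}
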
+
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- struct kvm_segment segment;
+ int seg = ctxt->src2.val;
- kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+ ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
- c->src.val = segment.selector;
- emulate_push(ctxt);
+ return em_push(ctxt);
}
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops, int seg)
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- unsigned long selector;
+ int seg = ctxt->src2.val;
+ unsigned long selector = 0;
int rc;
- rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
+ rc = emulate_pop(ctxt, &selector, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+ if (seg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
+
+ rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
}
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
+ int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
- (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+ (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
+
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- emulate_push(ctxt);
++reg;
}
+
+ return rc;
}
-static int emulate_popa(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
+ u32 val = 0;
while (reg >= VCPU_REGS_RAX) {
if (reg == VCPU_REGS_RSP) {
- register_address_increment(c, &c->regs[VCPU_REGS_RSP],
- c->op_bytes);
+ rsp_increment(ctxt, ctxt->op_bytes);
--reg;
}
- rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
+ rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
break;
+ assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
--reg;
}
return rc;
}
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
{
- struct decode_cache *c = &ctxt->decode;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+
+ /* TODO: Add limit checks */
+ ctxt->src.val = ctxt->eflags;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
+
+ ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->src.val = ctxt->_eip;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ops->get_idt(ctxt, &dt);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = linear_read_system(ctxt, cs_addr, &cs, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = linear_read_system(ctxt, eip_addr, &eip, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = eip;
- return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
+ return rc;
}
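
__emulate_int_real() fetches the handler from the real-mode IVT, where entry n is 4 bytes at idt.base + n*4: IP in the low word, CS in the high word, exactly the eip_addr/cs_addr arithmetic above. Standalone sketch with an invented handler address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t ivt[1024] = {0};
	unsigned irq = 8;

	/* Pretend the guest installed CS:IP = f000:fea5 for vector 8. */
	ivt[irq * 4 + 0] = 0xa5; ivt[irq * 4 + 1] = 0xfe;	/* IP */
	ivt[irq * 4 + 2] = 0x00; ivt[irq * 4 + 3] = 0xf0;	/* CS */

	unsigned ip = ivt[irq * 4 + 0] | ivt[irq * 4 + 1] << 8;
	unsigned cs = ivt[irq * 4 + 2] | ivt[irq * 4 + 3] << 8;
	printf("vector %u -> %04x:%04x\n", irq, cs, ip);
	return 0;
}
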
-static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
{
- struct decode_cache *c = &ctxt->decode;
- switch (c->modrm_reg) {
- case 0: /* rol */
- emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
- break;
- case 1: /* ror */
- emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
- break;
- case 2: /* rcl */
- emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
- break;
- case 3: /* rcr */
- emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
- break;
- case 4: /* sal/shl */
- case 6: /* sal/shl */
- emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
- break;
- case 5: /* shr */
- emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
- break;
- case 7: /* sar */
- emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
- break;
- }
+ int rc;
+
+ invalidate_registers(ctxt);
+ rc = __emulate_int_real(ctxt, irq);
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+ return rc;
}
-static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
{
- struct decode_cache *c = &ctxt->decode;
-
- switch (c->modrm_reg) {
- case 0 ... 1: /* test */
- emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
- break;
- case 2: /* not */
- c->dst.val = ~c->dst.val;
- break;
- case 3: /* neg */
- emulate_1op("neg", c->dst, ctxt->eflags);
- break;
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return __emulate_int_real(ctxt, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
default:
- return 0;
+ /* Protected mode interrupts not implemented yet */
+ return X86EMUL_UNHANDLEABLE;
}
- return 1;
}
-static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+ X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+ X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+ X86_EFLAGS_AC | X86_EFLAGS_ID |
+ X86_EFLAGS_FIXED;
+ unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+ X86_EFLAGS_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
- switch (c->modrm_reg) {
- case 0: /* inc */
- emulate_1op("inc", c->dst, ctxt->eflags);
- break;
- case 1: /* dec */
- emulate_1op("dec", c->dst, ctxt->eflags);
- break;
- case 2: /* call near abs */ {
- long int old_eip;
- old_eip = c->eip;
- c->eip = c->src.val;
- c->src.val = old_eip;
- emulate_push(ctxt);
- break;
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
+
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = temp_eip;
+
+ if (ctxt->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (ctxt->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
}
- case 4: /* jmp abs */
- c->eip = c->src.val;
- break;
- case 6: /* push */
- emulate_push(ctxt);
- break;
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ return rc;
+}
+
+static int em_iret(struct x86_emulate_ctxt *ctxt)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* iret from protected mode not implemented yet */
+ return X86EMUL_UNHANDLEABLE;
}
- return X86EMUL_CONTINUE;
}
-static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- u64 old = c->dst.orig_val;
+ int rc;
+ unsigned short sel;
+ struct desc_struct new_desc;
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
- ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
+{
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
- c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
- c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
- ctxt->eflags &= ~EFLG_ZF;
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ return rc;
+}
+
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+{
+ u64 old = ctxt->dst.orig_val64;
+
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
} else {
- c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
- (u32) c->regs[VCPU_REGS_RBX];
+ ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+ (u32) reg_read(ctxt, VCPU_REGS_RBX);
- ctxt->eflags |= EFLG_ZF;
+ ctxt->eflags |= X86_EFLAGS_ZF;
}
return X86EMUL_CONTINUE;
}
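
em_cmpxchg8b() implements the architectural semantics: compare EDX:EAX with the 8-byte memory operand; on a match store ECX:EBX and set ZF, otherwise load EDX:EAX from memory and clear ZF. A self-contained model:

#include <stdint.h>
#include <stdio.h>

static int cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
		     uint32_t ebx, uint32_t ecx)
{
	uint64_t expected = ((uint64_t)*edx << 32) | *eax;

	if (*mem == expected) {
		*mem = ((uint64_t)ecx << 32) | ebx;	/* store ECX:EBX */
		return 1;				/* ZF set */
	}
	*eax = (uint32_t)*mem;				/* load EDX:EAX */
	*edx = (uint32_t)(*mem >> 32);
	return 0;					/* ZF clear */
}

int main(void)
{
	uint64_t mem = 0x1111111122222222ull;
	uint32_t eax = 0x22222222, edx = 0x11111111;

	printf("zf=%d mem=%016llx\n",
	       cmpxchg8b(&mem, &eax, &edx, 0xbbbbbbbb, 0xaaaaaaaa),
	       (unsigned long long)mem);
	/* prints: zf=1 mem=aaaaaaaabbbbbbbb */
	return 0;
}
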
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_ret(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
int rc;
- unsigned long cs;
+ unsigned long eip = 0;
- rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- if (c->op_bytes == 4)
- c->eip = (u32)c->eip;
- rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+
+ return assign_eip_near(ctxt, eip);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip = 0;
+ unsigned long cs = 0;
+ int cpl = ctxt->ops->cpl(ctxt);
+ struct desc_struct new_desc;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_far(ctxt, eip);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
return rc;
}
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
{
- int rc;
- struct decode_cache *c = &ctxt->decode;
+ int rc;
- switch (c->dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct:
- * in 64-bit mode we zero-extend.
- */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *)c->dst.ptr = (u8)c->dst.val;
- break;
- case 2:
- *(u16 *)c->dst.ptr = (u16)c->dst.val;
- break;
- case 4:
- *c->dst.ptr = (u32)c->dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *c->dst.ptr = c->dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- break;
- case OP_NONE:
- /* no writeback */
- break;
- default:
- break;
+ rc = em_ret_far(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->dst.orig_val;
+ em_cmp(ctxt);
+
+ if (ctxt->eflags & X86_EFLAGS_ZF) {
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
+ ctxt->dst.val = ctxt->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
+ ctxt->dst.val = ctxt->dst.orig_val;
}
return X86EMUL_CONTINUE;
}
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
- /*
- * an sti; sti; sequence only disable interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction is an sti. We should not
- * leave the flag on in this case. The same goes for mov ss
- */
- if (!(int_shadow & mask))
- ctxt->interruptibility = mask;
+ int seg = ctxt->src2.val;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->dst.val = ctxt->src.val;
+ return rc;
}
-static inline void
-setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct kvm_segment *cs, struct kvm_segment *ss)
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
- memset(cs, 0, sizeof(struct kvm_segment));
- kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct kvm_segment));
+ if (!ctxt->ops->is_smm(ctxt))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->leave_smm(ctxt))
+ ctxt->ops->triple_fault(ctxt);
+ return emulator_recalc_and_set_mode(ctxt);
+}
+
+static void
+setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
+{
cs->l = 0; /* will be adjusted later */
- cs->base = 0; /* flat segment */
+ set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
- cs->limit = 0xffffffff; /* 4GB limit */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
cs->type = 0x0b; /* Read, Execute, Accessed */
cs->s = 1;
cs->dpl = 0; /* will be adjusted later */
- cs->present = 1;
- cs->db = 1;
+ cs->p = 1;
+ cs->d = 1;
+ cs->avl = 0;
- ss->unusable = 0;
- ss->base = 0; /* flat segment */
- ss->limit = 0xffffffff; /* 4GB limit */
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
ss->g = 1; /* 4kb granularity */
ss->s = 1;
ss->type = 0x03; /* Read/Write, Accessed */
- ss->db = 1; /* 32bit stack segment */
+ ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
- ss->present = 1;
+ ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
}
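
The flat 4 GiB segments built here rely on granularity scaling: with g=1 the 20-bit limit is shifted left by 12 and the low 12 bits are filled with ones, which is the desc_limit_scaled() computation seen near the top of this diff. Quick check:

#include <stdint.h>
#include <stdio.h>

static uint32_t desc_limit_scaled(uint32_t limit, int g)
{
	return g ? (limit << 12) | 0xfff : limit;
}

int main(void)
{
	/* 0xfffff with 4 KiB granularity covers the full 4 GiB. */
	printf("0x%x\n", desc_limit_scaled(0xfffff, 1));	/* 0xffffffff */
	return 0;
}
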
-static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
+
+ /*
+ * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas
+ * AMD allows SYSCALL in any flavor of protected mode. Note, it's
+ * infeasible to emulate Intel behavior when running on AMD hardware,
+ * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD
+ * for KVM to trap-and-emulate, unlike emulating AMD on Intel.
+ */
+ if (ctxt->mode != X86EMUL_MODE_PROT64 &&
+ ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
+ return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_SCE))
+ return emulate_ud(ctxt);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ setup_syscalls_segments(&cs, &ss);
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
- cs.selector = (u16)(msr_data & 0xfffc);
- ss.selector = (u16)(msr_data + 8);
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
- if (is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ if (efer & EFER_LMA) {
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- c->regs[VCPU_REGS_RCX] = c->eip;
- if (is_long_mode(ctxt->vcpu)) {
+ *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
+ if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
- c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
- kvm_x86_ops->get_msr(ctxt->vcpu,
- ctxt->mode == X86EMUL_MODE_PROT64 ?
- MSR_LSTAR : MSR_CSTAR, &msr_data);
- c->eip = msr_data;
+ ops->get_msr(ctxt,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ctxt->_eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
- ctxt->eflags &= ~(msr_data | EFLG_RF);
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
#endif
} else {
/* legacy mode */
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
- c->eip = (u32)msr_data;
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ ctxt->_eip = (u32)msr_data;
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
}
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
return X86EMUL_CONTINUE;
}
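
em_syscall() derives both target selectors from MSR_STAR: bits 47:32 supply the kernel CS (with the RPL bits masked off) and SS is architecturally defined as that value plus 8. Sketch with an invented STAR value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t star = 0x0023001000000000ull;	/* illustrative only */
	uint32_t hi = (uint32_t)(star >> 32);
	unsigned cs = hi & 0xfffc;
	unsigned ss = (hi + 8) & 0xffff;

	printf("cs=0x%x ss=0x%x\n", cs, ss);	/* cs=0x10 ss=0x18 */
	return 0;
}
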
-static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+ ops->get_msr(ctxt, MSR_EFER, &efer);
/* inject #GP if in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
- /* XXX sysenter/sysexit have not been tested in 64bit mode.
- * Therefore, we inject an #UD.
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ /*
+ * Intel's architecture allows SYSENTER in compatibility mode, but AMD
+ * does not. Note, AMD does allow SYSENTER in legacy protected mode.
+ */
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) &&
+ !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
+ return emulate_ud(ctxt);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ /* sysenter/sysexit have not been tested in 64bit mode. */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
- switch (ctxt->mode) {
- case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
- break;
- case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
- break;
- }
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- cs.selector = (u16)msr_data;
- cs.selector &= ~SELECTOR_RPL_MASK;
- ss.selector = cs.selector + 8;
- ss.selector &= ~SELECTOR_RPL_MASK;
- if (ctxt->mode == X86EMUL_MODE_PROT64
- || is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ setup_syscalls_segments(&cs, &ss);
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ if (efer & EFER_LMA) {
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
- c->eip = msr_data;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
- c->regs[VCPU_REGS_RSP] = msr_data;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
+ *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
+ (u32)msr_data;
+ if (efer & EFER_LMA)
+ ctxt->mode = X86EMUL_MODE_PROT64;
return X86EMUL_CONTINUE;
}
-static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
{
- struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
- u64 msr_data;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data, rcx, rdx;
int usermode;
+ u16 cs_sel = 0, ss_sel = 0;
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(&cs, &ss);
- if ((c->rex_prefix & 0x8) != 0x0)
+ if (ctxt->rex_bits & REX_W)
usermode = X86EMUL_MODE_PROT64;
else
usermode = X86EMUL_MODE_PROT32;
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
cs.dpl = 3;
ss.dpl = 3;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
- cs.selector = (u16)(msr_data + 16);
- if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
- ss.selector = (u16)(msr_data + 24);
+ cs_sel = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
break;
case X86EMUL_MODE_PROT64:
- cs.selector = (u16)(msr_data + 32);
- if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
- ss.selector = cs.selector + 8;
- cs.db = 0;
+ cs_sel = (u16)(msr_data + 32);
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
cs.l = 1;
+ if (emul_is_noncanonical_address(rcx, ctxt, 0) ||
+ emul_is_noncanonical_address(rdx, ctxt, 0))
+ return emulate_gp(ctxt, 0);
break;
}
- cs.selector |= SELECTOR_RPL_MASK;
- ss.selector |= SELECTOR_RPL_MASK;
+ cs_sel |= SEGMENT_RPL_MASK;
+ ss_sel |= SEGMENT_RPL_MASK;
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
- c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+ ctxt->_eip = rdx;
+ ctxt->mode = usermode;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
}
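
em_sysexit() computes the user selectors as fixed offsets from IA32_SYSENTER_CS: +16/+24 when returning to 32-bit mode, +32/+40 (i.e. CS+8) for 64-bit, with the RPL forced to 3 in both cases. Standalone sketch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t sysenter_cs = 0x10;	/* illustrative MSR value */
	int long_mode = 1;
	unsigned cs = sysenter_cs + (long_mode ? 32 : 16);
	unsigned ss = long_mode ? cs + 8 : sysenter_cs + 24u;

	cs |= 3;	/* SEGMENT_RPL_MASK */
	ss |= 3;
	printf("cs=0x%x ss=0x%x\n", cs, ss);	/* cs=0x33 ss=0x3b */
	return 0;
}
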
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
{
int iopl;
if (ctxt->mode == X86EMUL_MODE_REAL)
return false;
if (ctxt->mode == X86EMUL_MODE_VM86)
return true;
- iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
- return ops->cpl(ctxt->vcpu) > iopl;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ return ctxt->ops->cpl(ctxt) > iopl;
}
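
The IOPL field sits in EFLAGS bits 13:12; emulator_bad_iopl() simply compares it against the current privilege level. Worked example:

#include <stdio.h>

#define X86_EFLAGS_IOPL     0x3000u
#define X86_EFLAGS_IOPL_BIT 12

int main(void)
{
	unsigned eflags = 0x3202;	/* IF set, IOPL = 3 */
	int iopl = (eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
	int cpl = 3;

	printf("iopl=%d bad_iopl=%d\n", iopl, cpl > iopl);	/* 3, 0 */
	return 0;
}
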
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- struct kvm_segment tr_seg;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct tr_seg;
+ u32 base3;
int r;
- u16 io_bitmap_ptr;
- u8 perm, bit_idx = port & 0x7;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
+ unsigned long base;
- kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
- if (tr_seg.unusable)
+ /*
+ * VMware allows access to these ports even if denied
+ * by TSS I/O permission bitmap. Mimic behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
+ if (!tr_seg.p)
return false;
- if (tr_seg.limit < 103)
+ if (desc_limit_scaled(&tr_seg) < 103)
return false;
- r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
- NULL);
+ base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+ base |= ((u64)base3) << 32;
+#endif
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true);
if (r != X86EMUL_CONTINUE)
return false;
- if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
- ctxt->vcpu, NULL);
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -2056,321 +2571,352 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
return true;
}
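
The TSS I/O bitmap holds one bit per port, set meaning denied; a len-byte access is allowed only if all len consecutive bits are clear, which is why the code reads 16 bits and tests (perm >> bit_idx) & mask. A user-space model of just the bitmap test:

#include <stdint.h>
#include <stdio.h>

static int io_allowed(const uint8_t *bitmap, uint16_t port, uint16_t len)
{
	unsigned perm = bitmap[port / 8] | bitmap[port / 8 + 1] << 8;
	unsigned mask = (1u << len) - 1;

	return !((perm >> (port & 7)) & mask);
}

int main(void)
{
	static uint8_t bitmap[8193];		/* 65536 bits + pad byte */

	bitmap[0x61 / 8] = 1 << (0x61 & 7);	/* deny port 0x61 only */
	printf("%d %d %d\n",
	       io_allowed(bitmap, 0x60, 1),	/* 1: allowed */
	       io_allowed(bitmap, 0x61, 1),	/* 0: denied */
	       io_allowed(bitmap, 0x60, 2));	/* 0: straddles 0x61 */
	return 0;
}
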
-static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 port, u16 len)
+static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
{
- if (emulator_bad_iopl(ctxt, ops))
- if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+ if (ctxt->perm_ok)
+ return true;
+
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, port, len))
return false;
+
+ ctxt->perm_ok = true;
+
return true;
}
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- int seg)
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
{
- struct desc_struct desc;
- if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
- return get_desc_base(&desc);
- else
- return ~0;
+ /*
+ * Intel CPUs mask the counter and pointers in a rather strange
+ * manner when ECX is zero, due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ u32 eax, ebx, ecx, edx;
+
+ if (ctxt->ad_bytes != 4)
+ return;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ if (!is_guest_vendor_intel(ebx, ecx, edx))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ fallthrough;
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
}
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_16 *tss)
{
- struct decode_cache *c = &ctxt->decode;
-
- tss->ip = c->eip;
+ tss->ip = ctxt->_eip;
tss->flag = ctxt->eflags;
- tss->ax = c->regs[VCPU_REGS_RAX];
- tss->cx = c->regs[VCPU_REGS_RCX];
- tss->dx = c->regs[VCPU_REGS_RDX];
- tss->bx = c->regs[VCPU_REGS_RBX];
- tss->sp = c->regs[VCPU_REGS_RSP];
- tss->bp = c->regs[VCPU_REGS_RBP];
- tss->si = c->regs[VCPU_REGS_RSI];
- tss->di = c->regs[VCPU_REGS_RDI];
-
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+ tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
}
static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_16 *tss)
{
- struct decode_cache *c = &ctxt->decode;
int ret;
+ u8 cpl;
- c->eip = tss->ip;
+ ctxt->_eip = tss->ip;
ctxt->eflags = tss->flag | 2;
- c->regs[VCPU_REGS_RAX] = tss->ax;
- c->regs[VCPU_REGS_RCX] = tss->cx;
- c->regs[VCPU_REGS_RDX] = tss->dx;
- c->regs[VCPU_REGS_RBX] = tss->bx;
- c->regs[VCPU_REGS_RSP] = tss->sp;
- c->regs[VCPU_REGS_RBP] = tss->bp;
- c->regs[VCPU_REGS_RSI] = tss->si;
- c->regs[VCPU_REGS_RDI] = tss->di;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
/*
* SDM says that segment selectors are loaded before segment
* descriptors
*/
- ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+
+ cpl = tss->cs & 3;
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If a fault happens at this stage,
* it is handled in the context of the new task
*/
- ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
return X86EMUL_CONTINUE;
}
-static int task_switch_16(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_16(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_16 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
- save_state_to_tss16(ctxt, ops, &tss_seg);
+ save_state_to_tss16(ctxt, &tss_seg);
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof(tss_seg.prev_task_link));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
}
- return load_state_from_tss16(ctxt, ops, &tss_seg);
+ return load_state_from_tss16(ctxt, &tss_seg);
}
static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_32 *tss)
{
- struct decode_cache *c = &ctxt->decode;
-
- tss->cr3 = ops->get_cr(3, ctxt->vcpu);
- tss->eip = c->eip;
+ /* CR3 and the LDT selector are intentionally not saved */
+ tss->eip = ctxt->_eip;
tss->eflags = ctxt->eflags;
- tss->eax = c->regs[VCPU_REGS_RAX];
- tss->ecx = c->regs[VCPU_REGS_RCX];
- tss->edx = c->regs[VCPU_REGS_RDX];
- tss->ebx = c->regs[VCPU_REGS_RBX];
- tss->esp = c->regs[VCPU_REGS_RSP];
- tss->ebp = c->regs[VCPU_REGS_RBP];
- tss->esi = c->regs[VCPU_REGS_RSI];
- tss->edi = c->regs[VCPU_REGS_RDI];
-
- tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
- tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
- tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
- tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
- tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
- tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
- tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+ tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
}
static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
struct tss_segment_32 *tss)
{
- struct decode_cache *c = &ctxt->decode;
int ret;
+ u8 cpl;
- ops->set_cr(3, tss->cr3, ctxt->vcpu);
- c->eip = tss->eip;
+ if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
- c->regs[VCPU_REGS_RAX] = tss->eax;
- c->regs[VCPU_REGS_RCX] = tss->ecx;
- c->regs[VCPU_REGS_RDX] = tss->edx;
- c->regs[VCPU_REGS_RBX] = tss->ebx;
- c->regs[VCPU_REGS_RSP] = tss->esp;
- c->regs[VCPU_REGS_RBP] = tss->ebp;
- c->regs[VCPU_REGS_RSI] = tss->esi;
- c->regs[VCPU_REGS_RDI] = tss->edi;
+
+ /* General purpose registers */
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
/*
* SDM says that segment selectors are loaded before segment
- * descriptors
+ * descriptors. This is important because CPL checks will
+ * use CS.RPL.
*/
- ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
- ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
- ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
- ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
- ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
- ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
- ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ */
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ ctxt->mode = X86EMUL_MODE_VM86;
+ cpl = 3;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ cpl = tss->cs & 3;
+ }
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If a fault happens at this stage,
* it is handled in the context of the new task
*/
- ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
- if (ret != X86EMUL_CONTINUE)
- return ret;
- ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
- return X86EMUL_CONTINUE;
+ return ret;
}
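
For reference, a minimal standalone sketch (hypothetical helper name, not emulator code) of the CPL derivation above: VM86 forces CPL 3, otherwise the RPL bits of the incoming CS selector are used.

/* Illustrative only: CPL used for the descriptor checks during a task switch. */
#include <stdio.h>

#define X86_EFLAGS_VM (1u << 17)

static unsigned int cpl_for_task_switch(unsigned int eflags, unsigned short cs)
{
	/* In VM86 mode the CPL is always 3; otherwise it is CS.RPL (bits 1:0). */
	return (eflags & X86_EFLAGS_VM) ? 3 : (cs & 3);
}

int main(void)
{
	printf("%u\n", cpl_for_task_switch(X86_EFLAGS_VM, 0x0008)); /* 3 */
	printf("%u\n", cpl_for_task_switch(0, 0x001b));             /* 3: RPL of 0x1b */
	printf("%u\n", cpl_for_task_switch(0, 0x0008));             /* 0 */
	return 0;
}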
-static int task_switch_32(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, u16 old_tss_sel,
+static int task_switch_32(struct x86_emulate_ctxt *ctxt, u16 old_tss_sel,
ulong old_tss_base, struct desc_struct *new_desc)
{
struct tss_segment_32 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
+ u32 eip_offset = offsetof(struct tss_segment_32, eip);
+ u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
- ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
- save_state_to_tss32(ctxt, ops, &tss_seg);
+ save_state_to_tss32(ctxt, &tss_seg);
- ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ /* Only GP registers and segment selectors are saved */
+ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset);
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
- ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof(tss_seg));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
- ret = ops->write_std(new_tss_base,
- &tss_seg.prev_task_link,
- sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
- /* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof(tss_seg.prev_task_link));
+ if (ret != X86EMUL_CONTINUE)
return ret;
- }
}
- return load_state_from_tss32(ctxt, ops, &tss_seg);
+ return load_state_from_tss32(ctxt, &tss_seg);
}
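
A standalone illustration of the partial write-back above, assuming a demo struct that mirrors the architectural 32-bit TSS layout: only the eip..ldt_selector range, i.e. the fields save_state_to_tss32() actually fills, is written back to the old TSS.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_tss32 {            /* hypothetical mirror of tss_segment_32 */
	uint16_t prev_task_link, r0;
	uint32_t esp0; uint16_t ss0, r1;
	uint32_t esp1; uint16_t ss1, r2;
	uint32_t esp2; uint16_t ss2, r3;
	uint32_t cr3, eip, eflags;
	uint32_t eax, ecx, edx, ebx, esp, ebp, esi, edi;
	uint16_t es, r4, cs, r5, ss, r6, ds, r7, fs, r8, gs, r9;
	uint16_t ldt_selector, r10;
	uint16_t t, io_map;
};

int main(void)
{
	size_t eip_off = offsetof(struct demo_tss32, eip);          /* 32 (0x20) */
	size_t ldt_off = offsetof(struct demo_tss32, ldt_selector); /* 96 (0x60) */

	/* The byte range actually written back to the old TSS. */
	printf("write %zu bytes at offset %zu\n", ldt_off - eip_off, eip_off);
	return 0;
}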
static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, int reason,
+ u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code)
{
+ const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct curr_tss_desc, next_tss_desc;
int ret;
- u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
ulong old_tss_base =
- get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
+ ulong desc_addr, dr7;
/* FIXME: old_tss_base == ~0 ? */
- ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
/* FIXME: check that next_tss_desc is tss */
- if (reason != TASK_SWITCH_IRET) {
- if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- kvm_inject_gp(ctxt->vcpu, 0);
- return X86EMUL_PROPAGATE_FAULT;
+ /*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
+ */
+ if (reason == TASK_SWITCH_GATE) {
+ if (idt_index != -1) {
+ /* Software interrupts */
+ struct desc_struct task_gate_desc;
+ int dpl;
+
+ ret = read_interrupt_descriptor(ctxt, idt_index,
+ &task_gate_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ dpl = task_gate_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, (idt_index << 3) | 0x2);
}
}
@@ -2378,30 +2924,26 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
desc_limit < 0x2b)) {
- kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
- tss_selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_ts(ctxt, tss_selector & 0xfffc);
}
if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
- write_segment_descriptor(ctxt, ops, old_tss_sel,
- &curr_tss_desc);
+ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
}
if (reason == TASK_SWITCH_IRET)
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
+ note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
if (next_tss_desc.type & 8)
- ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
- old_tss_base, &next_tss_desc);
+ ret = task_switch_32(ctxt, old_tss_sel, old_tss_base, &next_tss_desc);
else
- ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+ ret = task_switch_16(ctxt, old_tss_sel,
old_tss_base, &next_tss_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2411,858 +2953,2693 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
next_tss_desc.type |= (1 << 1); /* set busy flag */
- write_segment_descriptor(ctxt, ops, tss_selector,
- &next_tss_desc);
+ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
}
- ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
- ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
- ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
if (has_error_code) {
- struct decode_cache *c = &ctxt->decode;
-
- c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
- c->lock_prefix = 0;
- c->src.val = (unsigned long) error_code;
- emulate_push(ctxt);
+ ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ ctxt->lock_prefix = 0;
+ ctxt->src.val = (unsigned long) error_code;
+ ret = em_push(ctxt);
}
+ dr7 = ops->get_dr(ctxt, 7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
return ret;
}
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- u16 tss_selector, int reason,
+ u16 tss_selector, int idt_index, int reason,
bool has_error_code, u32 error_code)
{
- struct decode_cache *c = &ctxt->decode;
int rc;
- memset(c, 0, sizeof(struct decode_cache));
- c->eip = ctxt->eip;
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
- c->dst.type = OP_NONE;
+ invalidate_registers(ctxt);
+ ctxt->_eip = ctxt->eip;
+ ctxt->dst.type = OP_NONE;
- rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+ rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
if (rc == X86EMUL_CONTINUE) {
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
- rc = writeback(ctxt, ops);
+ ctxt->eip = ctxt->_eip;
+ writeback_registers(ctxt);
}
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
}
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
- int reg, struct operand *op)
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+ struct operand *op)
{
- struct decode_cache *c = &ctxt->decode;
- int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+ int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
- register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
}
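
A quick sketch of the direction-flag arithmetic in string_addr_inc() (hypothetical helper, for illustration only): with DF clear the index register advances by op->bytes per iteration, with DF set it moves backwards; count folds in REP-batched iterations.

#include <stdio.h>

#define X86_EFLAGS_DF (1u << 10)

static long string_delta(unsigned int eflags, int count, int bytes)
{
	int df = (eflags & X86_EFLAGS_DF) ? -count : count;
	return (long)df * bytes;
}

int main(void)
{
	printf("%ld\n", string_delta(0, 4, 2));             /* +8: 4 words forward  */
	printf("%ld\n", string_delta(X86_EFLAGS_DF, 4, 2)); /* -8: 4 words backward */
	return 0;
}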
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_das(struct x86_emulate_ctxt *ctxt)
{
- u64 msr_data;
- struct decode_cache *c = &ctxt->decode;
- int rc = X86EMUL_CONTINUE;
- int saved_dst_type = c->dst.type;
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = ctxt->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
- ctxt->interruptibility = 0;
+ ctxt->dst.val = al;
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ em_or(ctxt);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
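
The same DAS algorithm as em_das(), lifted into a plain userspace function as a cross-check (illustrative only): decimal-adjust AL after a packed-BCD subtraction.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t das(uint8_t al, bool *cf, bool *af)
{
	uint8_t old_al = al;
	bool old_cf = *cf;

	*cf = false;
	if ((al & 0x0f) > 9 || *af) {
		al -= 6;
		*cf = old_cf || (al >= 250); /* borrow out of the low nibble */
		*af = true;
	} else {
		*af = false;
	}
	if (old_al > 0x99 || old_cf) {
		al -= 0x60;
		*cf = true;
	}
	return al;
}

int main(void)
{
	bool cf = false, af = true;              /* e.g. after 0x22 - 0x03 = 0x1f */
	printf("0x%02x\n", das(0x1f, &cf, &af)); /* 0x19: BCD 22 - 03 = 19 */
	return 0;
}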
- /* Shadow copy of register state. Committed on successful emulation.
- * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
- * modify them.
- */
+static int em_aam(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, ah;
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ if (ctxt->src.val == 0)
+ return emulate_de(ctxt);
- if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
- }
+ al = ctxt->dst.val & 0xff;
+ ah = al / ctxt->src.val;
+ al %= ctxt->src.val;
- /* LOCK prefix is allowed only with some instructions */
- if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8);
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ em_or(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ em_or(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
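
A quick illustration of the AAM/AAD arithmetic above (note that em_aam() rejects a zero divisor with #DE before this point). The imm8 operand, 0x0a in the classic encodings, is the base: AAM splits AL into AH:AL digits and AAD folds them back.

#include <stdint.h>
#include <stdio.h>

static void aam(uint8_t base, uint8_t *al, uint8_t *ah)
{
	*ah = *al / base;
	*al %= base;
}

static void aad(uint8_t base, uint8_t *al, uint8_t *ah)
{
	*al = (uint8_t)(*al + *ah * base);
	*ah = 0;
}

int main(void)
{
	uint8_t al = 73, ah = 0;

	aam(10, &al, &ah);          /* ah = 7, al = 3 */
	printf("%u %u\n", ah, al);
	aad(10, &al, &ah);          /* al = 73 again, ah = 0 */
	printf("%u %u\n", ah, al);
	return 0;
}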
+
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long rel = ctxt->src.val;
+
+ ctxt->src.val = (unsigned long)ctxt->_eip;
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return em_push(ctxt);
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
+
+ old_eip = ctxt->_eip;
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_cs;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ /* If we failed, we tainted the memory, but at the very least we
+    should restore CS */
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
+ goto fail;
}
+ return rc;
+fail:
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
+ return rc;
- /* Privileged instruction can be executed only in CPL=0 */
- if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip = 0;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Write back the register source. */
+ ctxt->src.val = ctxt->dst.val;
+ write_register_operand(&ctxt->src);
+
+ /* Write back the memory destination with implicit LOCK prefix. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ ctxt->lock_prefix = 1;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ctxt->src2.val;
+ return em_imul(ctxt);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = ctxt->src.bytes;
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
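
The expression in em_cwd() is a branchless sign fill: shift the sign bit down to bit 0; subtracting 1 gives 0 when the sign bit was set and all-ones when it was clear; inverting yields the replicated sign. Standalone check (illustrative only):

#include <stdint.h>
#include <stdio.h>

static uint64_t sign_fill(uint64_t val, unsigned int bytes)
{
	/* Same trick as em_cwd(): replicate the top bit of a bytes-wide value. */
	return ~((val >> (bytes * 8 - 1)) - 1);
}

int main(void)
{
	/* CWD: AX = 0x8000 -> DX = all ones; AX = 0x7fff -> DX = 0 */
	printf("%#llx\n", (unsigned long long)sign_fill(0x8000, 2));
	printf("%#llx\n", (unsigned long long)sign_fill(0x7fff, 2));
	return 0;
}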
+
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (!ctxt->ops->guest_has_rdpid(ctxt))
+ return emulate_ud(ctxt);
+
+ ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 pmc;
+
+ if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
+ return emulate_gp(ctxt, 0);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ u16 tmp;
+
+ if (!ctxt->ops->guest_has_movbe(ctxt))
+ return emulate_ud(ctxt);
+
+ switch (ctxt->op_bytes) {
+ case 2:
+ /*
+ * From MOVBE definition: "...When the operand size is 16 bits,
+ * the upper word of the destination register remains unchanged
+ * ..."
+ *
+ * Both casting ->valptr and ->val to u16 breaks strict aliasing
+ * rules, so we have to do the operation almost by hand.
+ */
+ tmp = (u16)ctxt->src.val;
+ ctxt->dst.val &= ~0xffffUL;
+ ctxt->dst.val |= (unsigned long)swab16(tmp);
+ break;
+ case 4:
+ ctxt->dst.val = swab32((u32)ctxt->src.val);
+ break;
+ case 8:
+ ctxt->dst.val = swab64(ctxt->src.val);
+ break;
+ default:
+ BUG();
}
+ return X86EMUL_CONTINUE;
+}
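
A userspace sketch of the 16-bit MOVBE case above: the byte swap is applied to the low word only and the destination's upper bits survive, matching the SDM wording quoted in the comment.

#include <stdint.h>
#include <stdio.h>

static uint16_t swab16_demo(uint16_t x)
{
	return (uint16_t)((x << 8) | (x >> 8));
}

int main(void)
{
	unsigned long dst = 0xdeadbeef;
	uint16_t src = 0x1234;

	dst &= ~0xffffUL;           /* keep the upper word of the destination */
	dst |= swab16_demo(src);    /* byte-swapped low word */
	printf("%#lx\n", dst);      /* 0xdead3412 */
	return 0;
}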
- if (c->rep_prefix && (c->d & String)) {
- ctxt->restart = true;
- /* All REP prefixes have the same first termination condition */
- if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
- string_done:
- ctxt->restart = false;
- kvm_rip_write(ctxt->vcpu, c->eip);
- goto done;
- }
- /* The second termination condition only applies for REPE
- * and REPNE. Test if the repeat string operation prefix is
- * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
- * corresponding termination condition according to:
- * - if REPE/REPZ and ZF = 0 then done
- * - if REPNE/REPNZ and ZF = 1 then done
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ int cr_num = ctxt->modrm_reg;
+ int r;
+
+ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+
+ if (cr_num == 0) {
+ /*
+ * CR0 write might have updated CR0.PE and/or CR0.PG
+ * which can affect the cpu's execution mode.
*/
- if ((c->b == 0xa6) || (c->b == 0xa7) ||
- (c->b == 0xae) || (c->b == 0xaf)) {
- if ((c->rep_prefix == REPE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == 0))
- goto string_done;
- if ((c->rep_prefix == REPNE_PREFIX) &&
- ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
- goto string_done;
- }
- c->eip = ctxt->eip;
+ r = emulator_recalc_and_set_mode(ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
}
- if (c->src.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- c->src.orig_val = c->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long val;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ val = ctxt->src.val & ~0ULL;
+ else
+ val = ctxt->src.val & ~0U;
+
+ /* #UD condition is already handled. */
+ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
+ u64 msr_data;
+ int r;
+
+ msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+ | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+ r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
+
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ return emulate_gp(ctxt, 0);
+
+ return r;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX);
+ u64 msr_data;
+ int r;
+
+ r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
+
+ if (r == X86EMUL_PROPAGATE_FAULT)
+ return emulate_gp(ctxt, 0);
+
+ if (r == X86EMUL_CONTINUE) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
}
+ return r;
+}
- if (c->src2.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src2.ptr,
- &c->src2.val,
- c->src2.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
+{
+ if (segment > VCPU_SREG_GS &&
+ (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->dst.val = get_segment_selector(ctxt, segment);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ ulong linear;
+ unsigned int max_size;
+
+ rc = __linearize(ctxt, ctxt->src.addr.mem, &max_size, 1, ctxt->mode,
+ &linear, X86EMUL_F_INVLPG);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = ctxt->ops->fix_hypercall(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ ctxt->_eip = ctxt->eip;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
}
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
- if ((c->d & DstMask) == ImplicitOps)
- goto special_insn;
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
- if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
- /* optimisation - avoid slow emulated read if Mov */
- rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ emul_is_noncanonical_address(desc_ptr.address, ctxt,
+ X86EMUL_F_DT_LOAD))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, false);
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (ctxt->src.val & 0x0f));
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
+ if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
+ (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
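
The "ctxt->b ^ 0x5" trick above leans on the Jcc condition encoding (low nibble 4 = ZF set, 5 = ZF clear): 0xe0 (LOOPNE) ^ 0x5 ends in 5 and 0xe1 (LOOPE) ^ 0x5 ends in 4, so a generic condition tester produces exactly the extra ZF test the two variants need, while plain LOOP (0xe2) skips the test. Toy model covering only the two ZF conditions:

#include <stdbool.h>
#include <stdio.h>

#define ZF (1u << 6)

static bool test_zf_cc(unsigned char cc, unsigned int eflags)
{
	bool zf = eflags & ZF;

	/* Toy test_cc(): condition nibble 4 = "ZF set", 5 = "ZF clear". */
	return ((cc & 0x0f) == 4) ? zf : !zf;
}

int main(void)
{
	unsigned char loopne = 0xe0, loope = 0xe1;

	printf("%d\n", test_zf_cc(loopne ^ 0x5, 0));  /* 1: ZF clear -> keep looping */
	printf("%d\n", test_zf_cc(loope  ^ 0x5, ZF)); /* 1: ZF set   -> keep looping */
	return 0;
}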
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+ u64 msr = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+ if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ ctxt->ops->cpl(ctxt)) {
+ return emulate_gp(ctxt, 0);
}
- c->dst.orig_val = c->dst.val;
-special_insn:
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+ *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = edx;
+ return X86EMUL_CONTINUE;
+}
- if (c->twobyte)
- goto twobyte_insn;
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+ u32 flags;
+
+ flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF;
+ flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+ ctxt->eflags &= ~0xffUL;
+ ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
- switch (c->b) {
- case 0x00 ... 0x05:
- add: /* add */
- emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
break;
- case 0x06: /* push es */
- emulate_push_sreg(ctxt, VCPU_SREG_ES);
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
break;
- case 0x07: /* pop es */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+ }
+ return X86EMUL_CONTINUE;
+}
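
For reference, the portable GCC/Clang equivalent of the inline-asm BSWAP above: 64-bit operands swap all eight bytes, everything else goes through a 32-bit swap of the low dword.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	printf("%#x\n", __builtin_bswap32(0x12345678u));   /* 0x78563412 */
	printf("%#llx\n", (unsigned long long)
	       __builtin_bswap64(0x1122334455667788ull)); /* 0x8877665544332211 */
	return 0;
}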
+
+static int em_clflush(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflush regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ if (!ctxt->ops->guest_has_fxsr(ctxt))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
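
Rough numbers for the sizes above, assuming the standard 512-byte FXSAVE image with the XMM area at byte offset 160 and 16 bytes per register: 64-bit guests save XMM0-15, 32-bit guests with CR4.OSFXSR save XMM0-7, and without OSFXSR only the legacy area is touched.

#include <stdio.h>

/* Assumption: matches offsetof(struct fxregs_state, xmm_space). */
#define XMM_AREA_OFFSET 160

static unsigned int fxstate_size_demo(int nregs)
{
	return XMM_AREA_OFFSET + nregs * 16;
}

int main(void)
{
	printf("%u\n", fxstate_size_demo(16)); /* 416: 64-bit mode */
	printf("%u\n", fxstate_size_demo(8));  /* 288: 32-bit, OSFXSR set */
	printf("%u\n", fxstate_size_demo(0));  /* 160: legacy area only */
	return 0;
}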
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ * - like (1), but FIP and FDP (the FPU instruction and data pointers)
+ *   are only 16 bit. At least Intel CPUs preserve whole 32-bit values,
+ *   though, so (1) and (2) are the same wrt. save and restore
+ * 3) 64-bit mode with REX.W prefix
+ * - like (2), but XMM 8-15 are being saved and restored
+ * 4) 64-bit mode without REX.W prefix
+ * - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ kvm_fpu_get();
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ kvm_fpu_put();
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
+}
+
+/*
+ * FXRSTOR might restore XMM registers not provided by the guest. Fill
+ * in the host registers (via FXSAVE) instead, so they won't be modified.
+ * (preemption has to stay disabled until FXRSTOR).
+ *
+ * Use noinline to keep the stack for other functions called by callers small.
+ */
+static noinline int fxregs_fixup(struct fxregs_state *fx_state,
+ const size_t used_size)
+{
+ struct fxregs_state fx_tmp;
+ int rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp));
+ memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size,
+ __fxstate_size(16) - used_size);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+ size_t size;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ size = fxstate_size(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ kvm_fpu_get();
+
+ if (size < __fxstate_size(16)) {
+ rc = fxregs_fixup(&fx_state, size);
if (rc != X86EMUL_CONTINUE)
- goto done;
- break;
- case 0x08 ... 0x0d:
- or: /* or */
- emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
- break;
- case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, VCPU_SREG_CS);
- break;
- case 0x10 ... 0x15:
- adc: /* adc */
- emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
- break;
- case 0x16: /* push ss */
- emulate_push_sreg(ctxt, VCPU_SREG_SS);
+ goto out;
+ }
+
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+out:
+ kvm_fpu_put();
+
+ return rc;
+}
+
+static int em_xsetbv(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ecx, edx;
+
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE))
+ return emulate_ud(ctxt);
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ edx = reg_read(ctxt, VCPU_REGS_RDX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_access(struct x86_emulate_ctxt *ctxt)
+{
+ if (!valid_cr(ctxt->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ int dr = ctxt->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
+ ulong dr6;
+
+ dr6 = ctxt->ops->get_dr(ctxt, 6);
+ dr6 &= ~DR_TRAP_BITS;
+ dr6 |= DR6_BD | DR6_ACTIVE_LOW;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
+ return emulate_db(ctxt);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int dr = ctxt->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ /*
+ * VMware allows access to these Pseudo-PMCs even when read via RDPMC
+ * in Ring3 when CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
+ /*
+ * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE
+ * check however is unnecessary because CPL is always 0 outside
+ * protected mode.
+ */
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ ctxt->ops->check_rdpmc_early(ctxt, rcx))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+ if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+ if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define N D(NotImpl)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
+
+#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+
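A toy expansion of the table DSL above (deliberately simplified; the real struct opcode carries a union plus intercept/check_perm fields): I(Lock, em_add) becomes an entry whose flags word is Lock and whose execute hook is em_add, and the 2bv/6ALU wrappers stamp out the byte/word variants.

#include <stdio.h>

enum { DEMO_LOCK = 1 << 0, DEMO_BYTEOP = 1 << 1 };

struct demo_opcode {
	unsigned int flags;
	int (*execute)(void);
};

static int demo_em_add(void) { return 0; }

#define DEMO_I(_f, _e)    { .flags = (_f), .execute = (_e) }
#define DEMO_I2bv(_f, _e) DEMO_I((_f) | DEMO_BYTEOP, _e), DEMO_I(_f, _e)

static const struct demo_opcode demo_table[] = {
	DEMO_I2bv(DEMO_LOCK, demo_em_add),	/* e.g. 0x00/0x01: add r/m, reg */
};

int main(void)
{
	printf("%zu entries, first flags %#x\n",
	       sizeof(demo_table) / sizeof(demo_table[0]), demo_table[0].flags);
	return 0;
}
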
+static const struct opcode ud = I(SrcNone, emulate_ud);
+
+static const struct opcode group7_rm0[] = {
+ N,
+ I(SrcNone | Priv | EmulateOnUD, em_hypercall),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm1[] = {
+ DI(SrcNone | Priv, monitor),
+ DI(SrcNone | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm2[] = {
+ N,
+ II(ImplicitOps | Priv, em_xsetbv, xsetbv),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm3[] = {
+ DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
+ DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | Prot | Priv, invlpga, check_svme),
+};
+
+static const struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group1[] = {
+ I(Lock, em_add),
+ I(Lock | PageTable, em_or),
+ I(Lock, em_adc),
+ I(Lock, em_sbb),
+ I(Lock | PageTable, em_and),
+ I(Lock, em_sub),
+ I(Lock, em_xor),
+ I(NoWrite, em_cmp),
+};
+
+static const struct opcode group1A[] = {
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
+};
+
+static const struct opcode group2[] = {
+ I(DstMem | ModRM, em_rol),
+ I(DstMem | ModRM, em_ror),
+ I(DstMem | ModRM, em_rcl),
+ I(DstMem | ModRM, em_rcr),
+ I(DstMem | ModRM, em_shl),
+ I(DstMem | ModRM, em_shr),
+ I(DstMem | ModRM, em_shl),
+ I(DstMem | ModRM, em_sar),
+};
+
+static const struct opcode group3[] = {
+ I(DstMem | SrcImm | NoWrite, em_test),
+ I(DstMem | SrcImm | NoWrite, em_test),
+ I(DstMem | SrcNone | Lock, em_not),
+ I(DstMem | SrcNone | Lock, em_neg),
+ I(DstXacc | Src2Mem, em_mul_ex),
+ I(DstXacc | Src2Mem, em_imul_ex),
+ I(DstXacc | Src2Mem, em_div_ex),
+ I(DstXacc | Src2Mem, em_idiv_ex),
+};
+
+static const struct opcode group4[] = {
+ I(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ I(ByteOp | DstMem | SrcNone | Lock, em_dec),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group5[] = {
+ I(DstMem | SrcNone | Lock, em_inc),
+ I(DstMem | SrcNone | Lock, em_dec),
+ I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far),
+ I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
+};
+
+static const struct opcode group6[] = {
+ II(Prot | DstMem, em_sldt, sldt),
+ II(Prot | DstMem, em_str, str),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
+ N, N, N, N,
+};
+
+static const struct group_dual group7 = { {
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
+ II(SrcMem | Priv, em_lgdt, lgdt),
+ II(SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+}, {
+ EXT(0, group7_rm0),
+ EXT(0, group7_rm1),
+ EXT(0, group7_rm2),
+ EXT(0, group7_rm3),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ EXT(0, group7_rm7),
+} };
+
+static const struct opcode group8[] = {
+ N, N, N, N,
+ I(DstMem | SrcImmByte | NoWrite, em_bt),
+ I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ I(DstMem | SrcImmByte | Lock, em_btr),
+ I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+};
+
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdpid),
+};
+
+
+static const struct group_dual group9 = { {
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
+} };
+
+static const struct opcode group11[] = {
+ I(DstMem | SrcImm | Mov | PageTable, em_mov),
+ X7(D(Undefined)),
+};
+
+static const struct gprefix pfx_0f_ae_7 = {
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
+};
+
+static const struct group_dual group15 = { {
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct gprefix pfx_0f_6f_0f_7f = {
+ I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov),
+};
+
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
+static const struct gprefix pfx_0f_2b = {
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
+};
+
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_28_0f_29 = {
+ I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_e7_0f_38_2a = {
+ N, I(Sse | Avx, em_mov), N, N,
+};
+
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xD7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
+static const struct instr_dual instr_dual_8d = {
+ D(DstReg | SrcMem | ModRM | NoAccess), N
+};
+
+static const struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ I6ALU(Lock, em_add),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
+ /* 0x08 - 0x0F */
+ I6ALU(Lock | PageTable, em_or),
+ I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+ N,
+ /* 0x10 - 0x17 */
+ I6ALU(Lock, em_adc),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
+ /* 0x18 - 0x1F */
+ I6ALU(Lock, em_sbb),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
+ /* 0x20 - 0x27 */
+ I6ALU(Lock | PageTable, em_and), N, N,
+ /* 0x28 - 0x2F */
+ I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ I6ALU(Lock, em_xor), N, N,
+ /* 0x38 - 0x3F */
+ I6ALU(NoWrite, em_cmp), N, N,
+ /* 0x40 - 0x4F */
+ X8(I(DstReg, em_inc)), X8(I(DstReg, em_dec)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(I(DstReg | Stack, em_pop)),
+ /* 0x60 - 0x67 */
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
+ N, MD(ModRM, &mode_dual_63),
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte | NearBranch | IsBranch)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm, group1),
+ G(DstMem | SrcImm, group1),
+ G(ByteOp | DstMem | SrcImm | No64, group1),
+ G(DstMem | SrcImmByte, group1),
+ I2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
+ ID(0, &instr_dual_8d),
+ I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+ G(0, group1A),
+ /* 0x90 - 0x97 */
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf),
+ I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ I2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
+ /* 0xA8 - 0xAF */
+ I2bv(DstAcc | SrcImm | NoWrite, em_test),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ I2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
+ I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm),
+ I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ I(Stack | SrcImmU16 | Src2ImmByte, em_enter),
+ I(Stack, em_leave),
+ I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm),
+ I(ImplicitOps | IsBranch | ShadowStack, em_ret_far),
+ D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn),
+ D(ImplicitOps | No64 | IsBranch),
+ II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret),
+ /* 0xD0 - 0xD7 */
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ I(DstAcc | SrcImmUByte | No64, em_aam),
+ I(DstAcc | SrcImmUByte | No64, em_aad),
+ I(DstAcc | ByteOp | No64, em_salc),
+ I(DstAcc | SrcXLat | ByteOp, em_mov),
+ /* 0xD8 - 0xDF */
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
+ /* 0xE0 - 0xE7 */
+ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)),
+ I(SrcImmByte | NearBranch | IsBranch, em_jcxz),
+ I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
+ /* 0xE8 - 0xEF */
+ I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call),
+ D(SrcImm | ImplicitOps | NearBranch | IsBranch),
+ I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
+ I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
+ /* 0xF0 - 0xF7 */
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps),
+ I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static const struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ G(0, group6), GD(0, &group7), N, N,
+ N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall),
+ II(ImplicitOps | Priv, em_clts, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
+ N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ /* 0x10 - 0x1F */
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 8 * reserved NOP */
+ D(ImplicitOps | ModRM | SrcMem | NoAccess), /* NOP + 7 * reserved NOP */
+ /* 0x20 - 0x2F */
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_access),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+ check_cr_access),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+ check_dr_write),
+ N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b),
+ N, N, N, N,
+ /* 0x30 - 0x3F */
+ II(ImplicitOps | Priv, em_wrmsr, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+ IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
+ I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit),
+ N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x70 - 0x7F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm | NearBranch | IsBranch)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
+ II(ImplicitOps, em_cpuid, cpuid),
+ I(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ I(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ I(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
+ /* 0xA8 - 0xAF */
+ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ I(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ I(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ GD(0, &group15), I(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ I(DstReg | SrcMem | ModRM, em_bsf_c),
+ I(DstReg | SrcMem | ModRM, em_bsr_c),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xC7 */
+ I2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
+ N, ID(0, &instr_dual_0f_c3),
+ N, N, N, GD(0, &group9),
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a),
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
+static const struct gprefix three_byte_0f_38_f0 = {
+ ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+ ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N
+};
+
+/*
+ * Insns below are selected by the prefix, which is indexed by the third
+ * opcode byte.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+ /* 0x00 - 0x1f */
+ X16(N), X16(N),
+ /* 0x20 - 0x2f */
+ X8(N),
+ X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N,
+ /* 0x30 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x80 - 0xef */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0xf0 - 0xf1 */
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
+ /* 0xf2 - 0xff */
+ N, N, X4(N), X8(N)
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+#undef GP
+#undef EXT
+#undef MD
+#undef ID
+
+#undef D2bv
+#undef D2bvIP
+#undef I2bv
+#undef I2bvIP
+#undef I6ALU
+
+static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & ShadowStack;
+}
+
+static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt)
+{
+ u64 flags = ctxt->d;
+
+ if (!(flags & IsBranch))
+ return false;
+
+ /*
+ * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are
+ * indirect and thus affect IBT state. All far RETs (including SYSEXIT
+ * and IRET) are protected via Shadow Stacks and thus don't affect IBT
+ * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is
+ * enabled, but that should be handled by IRET emulation (in the very
+ * unlikely scenario that KVM adds support for fully emulating IRET).
+ */
+ if (!(flags & NearBranch))
+ return ctxt->execute != em_iret &&
+ ctxt->execute != em_ret_far &&
+ ctxt->execute != em_ret_far_imm &&
+ ctxt->execute != em_sysexit;
+
+ switch (flags & SrcMask) {
+ case SrcReg:
+ case SrcMem:
+ case SrcMem16:
+ case SrcMem32:
+ return true;
+ case SrcMemFAddr:
+ case SrcImmFAddr:
+ /* Far branches should be handled above. */
+ WARN_ON_ONCE(1);
+ return true;
+ case SrcNone:
+ case SrcImm:
+ case SrcImmByte:
+ /*
+ * Note, ImmU16 is used only for the stack adjustment operand on ENTER
+ * and RET instructions. ENTER isn't a branch and RET FAR is handled
+ * by the NearBranch check above. RET itself isn't an indirect branch.
+ */
+ case SrcImmU16:
+ return false;
+ default:
+ WARN_ONCE(1, "Unexpected Src operand '%llx' on branch",
+ flags & SrcMask);
+ return false;
+ }
+}
+
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned size;
+
+ size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ if (size == 8)
+ size = 4;
+ return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem.ea = ctxt->_eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, ctxt);
break;
- case 0x17: /* pop ss */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ case 2:
+ op->val = insn_fetch(s16, ctxt);
break;
- case 0x18 ... 0x1d:
- sbb: /* sbb */
- emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+ case 4:
+ op->val = insn_fetch(s32, ctxt);
break;
- case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, VCPU_SREG_DS);
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
break;
- case 0x1f: /* pop ds */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
+done:
+ return rc;
+}
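
In isolation, the sign_extension flag above behaves like this: an s8 fetch of 0xf0 yields -16, and the follow-up masking turns it back into a zero-extended 240 for the unsigned operand forms (OpImmUByte, OpImmU16, ...). Illustrative only:

#include <stdint.h>
#include <stdio.h>

static long fetch_imm8(uint8_t byte, int sign_extension)
{
	long val = (int8_t)byte;	/* the fetch itself sign-extends */

	if (!sign_extension)
		val &= 0xff;		/* undo it for unsigned immediates */
	return val;
}

int main(void)
{
	printf("%ld\n", fetch_imm8(0xf0, 1));	/* -16 */
	printf("%ld\n", fetch_imm8(0xf0, 0));	/* 240 */
	return 0;
}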
+
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned d)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ switch (d) {
+ case OpReg:
+ decode_register_operand(ctxt, op);
+ break;
+ case OpImmUByte:
+ rc = decode_imm(ctxt, op, 1, false);
+ break;
+ case OpMem:
+ ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ mem_common:
+ *op = ctxt->memop;
+ ctxt->memopp = op;
+ if (ctxt->d & BitOp)
+ fetch_bit_operand(ctxt);
+ op->orig_val = op->val;
+ break;
+ case OpMem64:
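+		/* CMPXCHG8B/16B memory operand: twice the current operand size. */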
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
+ goto mem_common;
+ case OpAcc:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ break;
+ case OpAccLo:
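+		/* Low half of the rDX:rAX result pair (all of AX for byte ops). */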
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpDI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RDI);
+ op->addr.mem.seg = VCPU_SREG_ES;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpDX:
+ op->type = OP_REG;
+ op->bytes = 2;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpCL:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
break;
- case 0x20 ... 0x25:
- and: /* and */
- emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+ case OpImmByte:
+ rc = decode_imm(ctxt, op, 1, true);
break;
- case 0x28 ... 0x2d:
- sub: /* sub */
- emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+ case OpOne:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = 1;
break;
- case 0x30 ... 0x35:
- xor: /* xor */
- emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+ case OpImm:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), true);
break;
- case 0x38 ... 0x3d:
- cmp: /* cmp */
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
break;
- case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op("inc", c->dst, ctxt->eflags);
+ case OpMem8:
+ ctxt->memop.bytes = 1;
+ if (ctxt->memop.type == OP_REG) {
+ ctxt->memop.addr.reg = decode_register(ctxt,
+ ctxt->modrm_rm, true);
+ fetch_register_operand(&ctxt->memop);
+ }
+ goto mem_common;
+ case OpMem16:
+ ctxt->memop.bytes = 2;
+ goto mem_common;
+ case OpMem32:
+ ctxt->memop.bytes = 4;
+ goto mem_common;
+ case OpImmU16:
+ rc = decode_imm(ctxt, op, 2, false);
+ break;
+ case OpImmU:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+ break;
+ case OpSI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RSI);
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpXLat:
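+		/* XLAT: the operand is a byte at seg:[rBX + AL]. */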
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ address_mask(ctxt,
+ reg_read(ctxt, VCPU_REGS_RBX) +
+ (reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ break;
+ case OpImmFAddr:
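+		/* Far pointer: op_bytes of offset plus a 2-byte segment selector. */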
+ op->type = OP_IMM;
+ op->addr.mem.ea = ctxt->_eip;
+ op->bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(op->valptr, op->bytes, ctxt);
+ break;
+ case OpMemFAddr:
+ ctxt->memop.bytes = ctxt->op_bytes + 2;
+ goto mem_common;
+ case OpES:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_ES;
+ break;
+ case OpCS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_CS;
+ break;
+ case OpSS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_SS;
+ break;
+ case OpDS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_DS;
+ break;
+ case OpFS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_FS;
+ break;
+ case OpGS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_GS;
+ break;
+ case OpImplicit:
+ /* Special instructions do their own operand decoding. */
+ default:
+ op->type = OP_NONE; /* Disable writeback. */
break;
- case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op("dec", c->dst, ctxt->eflags);
+ }
+
+done:
+ return rc;
+}
+
+static int x86_decode_avx(struct x86_emulate_ctxt *ctxt,
+ u8 vex_1st, u8 vex_2nd, struct opcode *opcode)
+{
+ u8 vex_3rd, map, pp, l, v;
+ int rc = X86EMUL_CONTINUE;
+
+ if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix)
+ goto ud;
+
+ if (vex_1st == 0xc5) {
+ /* Expand RVVVVlpp to VEX3 format */
+ vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */
+ vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */
+ } else {
+ vex_3rd = insn_fetch(u8, ctxt);
+ }
+
+ /* vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */
+ vex_2nd ^= 0xE0; /* binary 11100000 */
+ vex_3rd ^= 0x78; /* binary 01111000 */
+
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */
+ ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */
+ if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64)
+ goto ud;
+
+ map = vex_2nd & 0x1f;
+ v = (vex_3rd >> 3) & 0xf;
+ l = vex_3rd & 0x4;
+ pp = vex_3rd & 0x3;
+
+ ctxt->b = insn_fetch(u8, ctxt);
+ switch (map) {
+ case 1:
+ ctxt->opcode_len = 2;
+ *opcode = twobyte_table[ctxt->b];
break;
- case 0x50 ... 0x57: /* push reg */
- emulate_push(ctxt);
+ case 2:
+ ctxt->opcode_len = 3;
+ *opcode = opcode_map_0f_38[ctxt->b];
break;
- case 0x58 ... 0x5f: /* pop reg */
- pop_instruction:
- rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
+ case 3:
+ /* no 0f 3a instructions are supported yet */
+ return X86EMUL_UNHANDLEABLE;
+ default:
+ goto ud;
+ }
+
+	/*
+	 * No three-operand instructions are supported yet; those that *are*
+	 * marked with the Avx flag treat VVVV as reserved, so it must be zero.
+	 */
+ if (v)
+ goto ud;
+
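+	/* VEX.L selects the vector length: 0 = 128-bit XMM, 1 = 256-bit YMM. */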
+ if (l)
+ ctxt->op_bytes = 32;
+ else
+ ctxt->op_bytes = 16;
+
+ switch (pp) {
+ case 0: break;
+ case 1: ctxt->op_prefix = true; break;
+ case 2: ctxt->rep_prefix = 0xf3; break;
+ case 3: ctxt->rep_prefix = 0xf2; break;
+ }
+
+done:
+ return rc;
+ud:
+ *opcode = ud;
+ return rc;
+}
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type)
+{
+ int rc = X86EMUL_CONTINUE;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool vex_prefix = false;
+ bool has_seg_override = false;
+ struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
+
+ ctxt->memop.type = OP_NONE;
+ ctxt->memopp = NULL;
+ ctxt->_eip = ctxt->eip;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
+ ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
+ if (insn_len > 0)
+ memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
if (rc != X86EMUL_CONTINUE)
goto done;
+ }
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
break;
- case 0x60: /* pusha */
- emulate_pusha(ctxt);
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
break;
- case 0x61: /* popa */
- rc = emulate_popa(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
break;
- case 0x63: /* movsxd */
- if (ctxt->mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- c->dst.val = (s32) c->src.val;
- break;
- case 0x68: /* push imm */
- case 0x6a: /* push imm8 */
- emulate_push(ctxt);
- break;
- case 0x6c: /* insb */
- case 0x6d: /* insw/insd */
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->op_bytes = def_op_bytes;
+ ctxt->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (ctxt->b = insn_fetch(u8, ctxt)) {
+ case 0x66: /* operand-size override */
+ ctxt->op_prefix = true;
+ /* switch between 2/4 bytes */
+ ctxt->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
+ case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
+ case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
+ case 0x3e: /* DS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_DS;
+ break;
+ case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
+ case 0x65: /* GS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_GS;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->rex_prefix = REX_PREFIX;
+ ctxt->rex_bits = ctxt->b & 0xf;
+ continue;
+ case 0xf0: /* LOCK */
+ ctxt->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP/REPE/REPZ */
+ ctxt->rep_prefix = ctxt->b;
+ break;
+ default:
+ goto done_prefixes;
}
- if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
- c->regs[VCPU_REGS_RDX], &c->dst.val))
- goto done; /* IO is needed, skip writeback */
- break;
- case 0x6e: /* outsb */
- case 0x6f: /* outsw/outsd */
- c->src.bytes = min(c->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
- c->src.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (ctxt->rex_bits & REX_W)
+ ctxt->op_bytes = 8;
+
+ /* Opcode byte(s). */
+ if (ctxt->b == 0xc4 || ctxt->b == 0xc5) {
+ /* VEX or LDS/LES */
+ u8 vex_2nd = insn_fetch(u8, ctxt);
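+		/*
+		 * Outside 64-bit mode, 0xc4/0xc5 encode LDS/LES unless the
+		 * next byte has mod == 11b, a register form that is invalid
+		 * for LDS/LES and therefore unambiguously marks a VEX prefix.
+		 */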
+ if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) {
+ opcode = opcode_table[ctxt->b];
+ ctxt->modrm = vex_2nd;
+ /* the Mod/RM byte has been fetched already! */
+ goto done_modrm;
}
- ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
- &c->src.val, 1, ctxt->vcpu);
- c->dst.type = OP_NONE; /* nothing to writeback */
- break;
- case 0x70 ... 0x7f: /* jcc (short) */
- if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, c->src.val);
- break;
- case 0x80 ... 0x83: /* Grp1 */
- switch (c->modrm_reg) {
- case 0:
- goto add;
- case 1:
- goto or;
- case 2:
- goto adc;
- case 3:
- goto sbb;
- case 4:
- goto and;
- case 5:
- goto sub;
- case 6:
- goto xor;
- case 7:
- goto cmp;
+ vex_prefix = true;
+ rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ } else if (ctxt->b == 0x0f) {
+ /* Two- or three-byte opcode */
+ ctxt->opcode_len = 2;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = twobyte_table[ctxt->b];
+
+ /* 0F_38 opcode map */
+ if (ctxt->b == 0x38) {
+ ctxt->opcode_len = 3;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = opcode_map_0f_38[ctxt->b];
}
- break;
- case 0x84 ... 0x85:
- emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
- break;
- case 0x86 ... 0x87: /* xchg */
- xchg:
- /* Write back the register source. */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *) c->src.ptr = (u8) c->dst.val;
+ } else {
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
+ }
+
+ if (opcode.flags & ModRM)
+ ctxt->modrm = insn_fetch(u8, ctxt);
+
+done_modrm:
+ ctxt->d = opcode.flags;
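+	/* Resolve group/dual/prefix/escape indirection to a leaf opcode. */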
+ while (ctxt->d & GroupMask) {
+ switch (ctxt->d & GroupMask) {
+ case Group:
+ goffset = (ctxt->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
break;
- case 2:
- *(u16 *) c->src.ptr = (u16) c->dst.val;
+ case GroupDual:
+ goffset = (ctxt->modrm >> 3) & 7;
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
break;
- case 4:
- *c->src.ptr = (u32) c->dst.val;
- break; /* 64b reg: zero-extend */
- case 8:
- *c->src.ptr = c->dst.val;
+ case RMExt:
+ goffset = ctxt->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
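+			/* A mandatory 66/F2/F3 prefix selects the SIMD opcode variant. */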
+ if (ctxt->rep_prefix && ctxt->op_prefix)
+ return EMULATION_FAILED;
+ simd_prefix = ctxt->op_prefix ? 0x66 : ctxt->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ case Escape:
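+			/*
+			 * x87 escape: modrm >= 0xc0 indexes the register-form
+			 * table (clamped against speculative out-of-bounds
+			 * access); otherwise /reg indexes the memory-form table.
+			 */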
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
+ default:
+ return EMULATION_FAILED;
}
+
+ ctxt->d &= ~(u64)GroupMask;
+ ctxt->d |= opcode.flags;
+ }
+
+ ctxt->is_branch = opcode.flags & IsBranch;
+
+ /* Unrecognised? */
+ if (ctxt->d == 0)
+ return EMULATION_FAILED;
+
+ if (unlikely(vex_prefix)) {
/*
- * Write back the memory destination with implicit LOCK
- * prefix.
+ * Only specifically marked instructions support VEX. Since many
+ * instructions support it but are not annotated, return not implemented
+ * rather than #UD.
*/
- c->dst.val = c->src.val;
- c->lock_prefix = 1;
- break;
- case 0x88 ... 0x8b: /* mov */
- goto mov;
- case 0x8c: { /* mov r/m, sreg */
- struct kvm_segment segreg;
+ if (!(ctxt->d & Avx))
+ return EMULATION_FAILED;
- if (c->modrm_reg <= VCPU_SREG_GS)
- kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
- else {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
+ if (!(ctxt->d & AlignMask))
+ ctxt->d |= Unaligned;
+ }
+
+ ctxt->execute = opcode.u.execute;
+
+ /*
+ * Reject emulation if KVM might need to emulate shadow stack updates
+ * and/or indirect branch tracking enforcement, which the emulator
+ * doesn't support.
+ */
+ if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) &&
+ ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) {
+ u64 u_cet = 0, s_cet = 0;
+
+ /*
+ * Check both User and Supervisor on far transfers as inter-
+ * privilege level transfers are impacted by CET at the target
+ * privilege level, and that is not known at this time. The
+ * expectation is that the guest will not require emulation of
+ * any CET-affected instructions at any privilege level.
+ */
+ if (!(ctxt->d & NearBranch))
+ u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+ else if (ctxt->ops->cpl(ctxt) == 3)
+ u_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+ else
+ s_cet = CET_SHSTK_EN | CET_ENDBR_EN;
+
+ if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) ||
+ (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet)))
+ return EMULATION_FAILED;
+
+ if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt))
+ return EMULATION_FAILED;
+
+ if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt))
+ return EMULATION_FAILED;
+ }
+
+ if (unlikely(emulation_type & EMULTYPE_TRAP_UD) &&
+ likely(!(ctxt->d & EmulateOnUD)))
+ return EMULATION_FAILED;
+
+ if (unlikely(ctxt->d &
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
}
- c->dst.val = segreg.selector;
- break;
+
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
+		if (!vex_prefix) {
+			if (ctxt->d & Sse) {
+				ctxt->op_bytes = 16;
+				ctxt->d &= ~Avx;
+			} else if (ctxt->d & Mmx) {
+				ctxt->op_bytes = 8;
+			}
+		}
}
- case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->modrm_ea;
- break;
- case 0x8e: { /* mov seg, r/m16 */
- uint16_t sel;
- sel = c->src.val;
+ /* ModRM and SIB bytes. */
+ if (ctxt->d & ModRM) {
+ rc = decode_modrm(ctxt, &ctxt->memop);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
+ } else if (ctxt->d & MemAbs)
+ rc = decode_abs(ctxt, &ctxt->memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
- if (c->modrm_reg == VCPU_SREG_CS ||
- c->modrm_reg > VCPU_SREG_GS) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
+
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
+
+ if (ctxt->rip_relative && likely(ctxt->memopp))
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
+ return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & PageTable;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+	/*
+	 * The second termination condition applies only to REPE/REPZ and
+	 * REPNE/REPNZ. If one of those prefixes is present, check the
+	 * corresponding termination condition:
+	 * - REPE/REPZ terminates when ZF = 0
+	 * - REPNE/REPNZ terminates when ZF = 1
+	 */
+ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+ (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+ && (((ctxt->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
+ || ((ctxt->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
+ return true;
+
+ return false;
+}
+
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
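+	/*
+	 * FWAIT forces delivery of any pending x87 exception; asm_safe()
+	 * catches the resulting fault so it can be reflected as #MF.
+	 */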
+ kvm_fpu_get();
+ rc = asm_safe("fwait");
+ kvm_fpu_put();
+
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+ return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct operand *op)
+{
+ if (op->type == OP_MM)
+ kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
+}
+
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ /* Clear fields that are set conditionally but read without a guard. */
+ ctxt->rip_relative = false;
+ ctxt->rex_prefix = REX_NONE;
+ ctxt->rex_bits = 0;
+ ctxt->lock_prefix = 0;
+ ctxt->op_prefix = false;
+ ctxt->rep_prefix = 0;
+ ctxt->regs_valid = 0;
+ ctxt->regs_dirty = 0;
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = ctxt->dst.type;
+
+ ctxt->mem_read.pos = 0;
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
goto done;
}
- if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+ if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
- rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
+ if (ctxt->d & Avx) {
+ u64 xcr = 0;
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE)
+ || ops->get_xcr(ctxt, 0, &xcr)
+ || !(xcr & XFEATURE_MASK_YMM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ } else if (ctxt->d & Sse) {
+ if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+ }
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
+ if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(&ctxt->src);
+ fetch_possible_mmx_operand(&ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(&ctxt->dst);
+ }
+
+ if (unlikely(check_intercepts) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(check_intercepts) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ goto done;
+ }
+ }
}
- case 0x8f: /* pop (sole member of Grp1a) */
- rc = emulate_grp1a(ctxt, ops);
+
+ if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+ rc = segmented_read(ctxt, ctxt->src.addr.mem,
+ ctxt->src.valptr, ctxt->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
- break;
- case 0x90: /* nop / xchg r8,rax */
- if (!(c->rex_prefix & 1)) { /* nop */
- c->dst.type = OP_NONE;
- break;
+ ctxt->src.orig_val64 = ctxt->src.val64;
+ }
+
+ if (ctxt->src2.type == OP_MEM) {
+ rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+ &ctxt->src2.val, ctxt->src2.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if ((ctxt->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+ if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+ &ctxt->dst.val, ctxt->dst.bytes);
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
+ goto done;
}
- case 0x91 ... 0x97: /* xchg reg,rax */
- c->src.type = c->dst.type = OP_REG;
- c->src.bytes = c->dst.bytes = c->op_bytes;
- c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
- c->src.val = *(c->src.ptr);
- goto xchg;
- case 0x9c: /* pushf */
- c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt);
- break;
- case 0x9d: /* popf */
- c->dst.type = OP_REG;
- c->dst.ptr = (unsigned long *) &ctxt->eflags;
- c->dst.bytes = c->op_bytes;
- rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
+ }
+ /* Copy full 64-bit value for CMPXCHG8B. */
+ ctxt->dst.orig_val64 = ctxt->dst.val64;
+
+special_insn:
+
+ if (unlikely(check_intercepts) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
goto done;
- break;
- case 0xa0 ... 0xa1: /* mov */
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- c->dst.val = c->src.val;
- break;
- case 0xa2 ... 0xa3: /* mov */
- c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
- break;
- case 0xa4 ... 0xa5: /* movs */
- goto mov;
- case 0xa6 ... 0xa7: /* cmps */
- c->dst.type = OP_NONE; /* Disable writeback. */
- DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
- goto cmp;
- case 0xaa ... 0xab: /* stos */
- c->dst.val = c->regs[VCPU_REGS_RAX];
- break;
- case 0xac ... 0xad: /* lods */
- goto mov;
- case 0xae ... 0xaf: /* scas */
- DPRINTF("Urk! I don't handle SCAS.\n");
- goto cannot_emulate;
- case 0xb0 ... 0xbf: /* mov r, imm */
- goto mov;
- case 0xc0 ... 0xc1:
- emulate_grp2(ctxt);
- break;
- case 0xc3: /* ret */
- c->dst.type = OP_REG;
- c->dst.ptr = &c->eip;
- c->dst.bytes = c->op_bytes;
- goto pop_instruction;
- case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
- mov:
- c->dst.val = c->src.val;
- break;
- case 0xcb: /* ret far */
- rc = emulate_ret_far(ctxt, ops);
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= X86_EFLAGS_RF;
+ else
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+
+ if (ctxt->execute) {
+ rc = ctxt->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
+ goto writeback;
+ }
+
+ if (ctxt->opcode_len == 2)
+ goto twobyte_insn;
+ else if (ctxt->opcode_len == 3)
+ goto threebyte_insn;
+
+ switch (ctxt->b) {
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
- case 0xd0 ... 0xd1: /* Grp2 */
- c->src.val = 1;
- emulate_grp2(ctxt);
+ case 0x8d: /* lea r16/r32, m */
+ ctxt->dst.val = ctxt->src.addr.mem.ea;
break;
- case 0xd2 ... 0xd3: /* Grp2 */
- c->src.val = c->regs[VCPU_REGS_RCX];
- emulate_grp2(ctxt);
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
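+		/* XCHG rAX with itself (0x90 without REX.B) is NOP; skip writeback. */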
+ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
break;
- case 0xe4: /* inb */
- case 0xe5: /* in */
- goto do_io_in;
- case 0xe6: /* outb */
- case 0xe7: /* out */
- goto do_io_out;
- case 0xe8: /* call (near) */ {
- long int rel = c->src.val;
- c->src.val = (unsigned long) c->eip;
- jmp_rel(c, rel);
- emulate_push(ctxt);
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (ctxt->op_bytes) {
+ case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+ case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+ case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
+ }
break;
- }
- case 0xe9: /* jmp rel */
- goto jmp;
- case 0xea: /* jmp far */
- jump_far:
- if (load_segment_descriptor(ctxt, ops, c->src2.val,
- VCPU_SREG_CS))
- goto done;
-
- c->eip = c->src.val;
+ case 0xcc: /* int3 */
+ rc = emulate_int(ctxt, 3);
break;
- case 0xeb:
- jmp: /* jmp rel short */
- jmp_rel(c, c->src.val);
- c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xcd: /* int n */
+ rc = emulate_int(ctxt, ctxt->src.val);
break;
- case 0xec: /* in al,dx */
- case 0xed: /* in (e/r)ax,dx */
- c->src.val = c->regs[VCPU_REGS_RDX];
- do_io_in:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
- }
- if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
- &c->dst.val))
- goto done; /* IO is needed */
- break;
- case 0xee: /* out al,dx */
- case 0xef: /* out (e/r)ax,dx */
- c->src.val = c->regs[VCPU_REGS_RDX];
- do_io_out:
- c->dst.bytes = min(c->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
- }
- ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
- ctxt->vcpu);
- c->dst.type = OP_NONE; /* Disable writeback. */
+ case 0xce: /* into */
+ if (ctxt->eflags & X86_EFLAGS_OF)
+ rc = emulate_int(ctxt, 4);
+ break;
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
- ctxt->vcpu->arch.halt_request = 1;
+ ctxt->ops->halt(ctxt);
break;
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
- ctxt->eflags ^= EFLG_CF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- break;
- case 0xf6 ... 0xf7: /* Grp3 */
- if (!emulate_grp3(ctxt, ops))
- goto cannot_emulate;
+ ctxt->eflags ^= X86_EFLAGS_CF;
break;
case 0xf8: /* clc */
- ctxt->eflags &= ~EFLG_CF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ctxt->eflags &= ~X86_EFLAGS_CF;
break;
- case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
- ctxt->eflags &= ~X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- }
- break;
- case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
- ctxt->eflags |= X86_EFLAGS_IF;
- c->dst.type = OP_NONE; /* Disable writeback. */
- }
+ case 0xf9: /* stc */
+ ctxt->eflags |= X86_EFLAGS_CF;
break;
case 0xfc: /* cld */
- ctxt->eflags &= ~EFLG_DF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ctxt->eflags &= ~X86_EFLAGS_DF;
break;
case 0xfd: /* std */
- ctxt->eflags |= EFLG_DF;
- c->dst.type = OP_NONE; /* Disable writeback. */
+ ctxt->eflags |= X86_EFLAGS_DF;
break;
- case 0xfe: /* Grp4 */
- grp45:
- rc = emulate_grp45(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- break;
- case 0xff: /* Grp5 */
- if (c->modrm_reg == 5)
- goto jump_far;
- goto grp45;
+ default:
+ goto cannot_emulate;
}
-writeback:
- rc = writeback(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
+writeback:
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
/*
* restore dst type in case the decoding will be reused
	 * (happens for string instructions)
*/
- c->dst.type = saved_dst_type;
+ ctxt->dst.type = saved_dst_type;
- if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
- &c->src);
+ if ((ctxt->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
- if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+ if ((ctxt->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
- if (c->rep_prefix && (c->d & String)) {
- struct read_cache *rc = &ctxt->decode.io_read;
- register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
- /*
- * Re-enter guest when pio read ahead buffer is empty or,
- * if it is not used, after each 1024 iteration.
- */
- if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
- (rc->end != 0 && rc->end == rc->pos))
- ctxt->restart = false;
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ unsigned int count;
+ struct read_cache *r = &ctxt->io_read;
+ if ((ctxt->d & SrcMask) == SrcSI)
+ count = ctxt->src.count;
+ else
+ count = ctxt->dst.count;
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+ * Re-enter guest when pio read ahead buffer is empty
+			 * or, if it is not used, after every 1024 iterations.
+ */
+ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+ * Reset read cache. Usually happens before
+				 * decode, but since the instruction is restarted
+ * we have to do it here.
+ */
+ ctxt->mem_read.end = 0;
+ writeback_registers(ctxt);
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
+ ctxt->eflags &= ~X86_EFLAGS_RF;
}
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
- ops->set_rflags(ctxt->vcpu, ctxt->eflags);
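+	/* Outside 64-bit mode the committed RIP is truncated to 32 bits. */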
+ ctxt->eip = ctxt->_eip;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
done:
- return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
-
-twobyte_insn:
- switch (c->b) {
- case 0x01: /* lgdt, lidt, lmsw */
- switch (c->modrm_reg) {
- u16 size;
- unsigned long address;
+ if (rc == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt))
+ return EMULATION_FAILED;
+ ctxt->have_exception = true;
+ }
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
- case 0: /* vmcall */
- if (c->modrm_mod != 3 || c->modrm_rm != 1)
- goto cannot_emulate;
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
- /* Let the processor re-execute the fixed hypercall */
- c->eip = ctxt->eip;
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 2: /* lgdt */
- rc = read_descriptor(ctxt, ops, c->src.ptr,
- &size, &address, c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- realmode_lgdt(ctxt->vcpu, size, address);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 3: /* lidt/vmmcall */
- if (c->modrm_mod == 3) {
- switch (c->modrm_rm) {
- case 1:
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- break;
- default:
- goto cannot_emulate;
- }
- } else {
- rc = read_descriptor(ctxt, ops, c->src.ptr,
- &size, &address,
- c->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- realmode_lidt(ctxt->vcpu, size, address);
- }
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- case 4: /* smsw */
- c->dst.bytes = 2;
- c->dst.val = ops->get_cr(0, ctxt->vcpu);
- break;
- case 6: /* lmsw */
- ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
- (c->src.val & 0x0f), ctxt->vcpu);
- c->dst.type = OP_NONE;
- break;
- case 5: /* not defined */
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
- case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, c->modrm_ea);
- /* Disable writeback. */
- c->dst.type = OP_NONE;
- break;
- default:
- goto cannot_emulate;
- }
- break;
- case 0x05: /* syscall */
- rc = emulate_syscall(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- else
- goto writeback;
- break;
- case 0x06:
- emulate_clts(ctxt->vcpu);
- c->dst.type = OP_NONE;
+twobyte_insn:
+ switch (ctxt->b) {
+ case 0x09: /* wbinvd */
+ (ctxt->ops->wbinvd)(ctxt);
break;
case 0x08: /* invd */
- case 0x09: /* wbinvd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
- c->dst.type = OP_NONE;
+ case 0x1f: /* nop */
break;
case 0x20: /* mov cr, reg */
- switch (c->modrm_reg) {
- case 1:
- case 5 ... 7:
- case 9 ... 15:
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
- }
- c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
- c->dst.type = OP_NONE; /* no writeback */
+ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
break;
case 0x21: /* mov from dr to reg */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
- }
- emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
- c->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x22: /* mov reg, cr */
- ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
- c->dst.type = OP_NONE;
- break;
- case 0x23: /* mov from reg to dr */
- if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
- (c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
- goto done;
- }
- emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
- c->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x30:
- /* wrmsr */
- msr_data = (u32)c->regs[VCPU_REGS_RAX]
- | ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
- }
- rc = X86EMUL_CONTINUE;
- c->dst.type = OP_NONE;
- break;
- case 0x32:
- /* rdmsr */
- if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
- goto done;
- } else {
- c->regs[VCPU_REGS_RAX] = (u32)msr_data;
- c->regs[VCPU_REGS_RDX] = msr_data >> 32;
- }
- rc = X86EMUL_CONTINUE;
- c->dst.type = OP_NONE;
- break;
- case 0x34: /* sysenter */
- rc = emulate_sysenter(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- else
- goto writeback;
- break;
- case 0x35: /* sysexit */
- rc = emulate_sysexit(ctxt);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- else
- goto writeback;
+ ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg);
break;
case 0x40 ... 0x4f: /* cmov */
- c->dst.val = c->dst.orig_val = c->src.val;
- if (!test_cc(c->b, ctxt->eflags))
- c->dst.type = OP_NONE; /* no writeback */
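+		/*
+		 * A false condition still zero-extends a 32-bit destination in
+		 * 64-bit mode, so skip writeback only for 16/64-bit operands.
+		 */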
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->op_bytes != 4)
+ ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
- if (test_cc(c->b, ctxt->eflags))
- jmp_rel(c, c->src.val);
- c->dst.type = OP_NONE;
- break;
- case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, VCPU_SREG_FS);
- break;
- case 0xa1: /* pop fs */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- break;
- case 0xa3:
- bt: /* bt */
- c->dst.type = OP_NONE;
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
- case 0xa4: /* shld imm8, r, r/m */
- case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
- break;
- case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, VCPU_SREG_GS);
- break;
- case 0xa9: /* pop gs */
- rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- break;
- case 0xab:
- bts: /* bts */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
- break;
- case 0xac: /* shrd imm8, r, r/m */
- case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
- break;
- case 0xae: /* clflush */
- break;
- case 0xb0 ... 0xb1: /* cmpxchg */
- /*
- * Save real source value, then compare EAX against
- * destination.
- */
- c->src.orig_val = c->src.val;
- c->src.val = c->regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
- if (ctxt->eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- c->dst.val = c->src.orig_val;
- } else {
- /* Failure: write the value we saw to EAX. */
- c->dst.type = OP_REG;
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- }
- break;
- case 0xb3:
- btr: /* btr */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
case 0xb6 ... 0xb7: /* movzx */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
- : (u16) c->src.val;
- break;
- case 0xba: /* Grp8 */
- switch (c->modrm_reg & 3) {
- case 0:
- goto bt;
- case 1:
- goto bts;
- case 2:
- goto btr;
- case 3:
- goto btc;
- }
- break;
- case 0xbb:
- btc: /* btc */
- /* only subword offset */
- c->src.val &= (c->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
+ : (u16) ctxt->src.val;
break;
case 0xbe ... 0xbf: /* movsx */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
- (s16) c->src.val;
- break;
- case 0xc3: /* movnti */
- c->dst.bytes = c->op_bytes;
- c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
- (u64) c->src.val;
- break;
- case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = emulate_grp9(ctxt, ops);
- if (rc != X86EMUL_CONTINUE)
- goto done;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
+ (s16) ctxt->src.val;
break;
+ default:
+ goto cannot_emulate;
}
+
+threebyte_insn:
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
goto writeback;
cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
+ return EMULATION_FAILED;
+}
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ writeback_registers(ctxt);
+}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
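+	/*
+	 * String and two-memory-operand instructions touch more than one
+	 * linear address, so a single cached GPA cannot cover the access.
+	 */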
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
}
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
new file mode 100644
index 000000000000..f898781b6a06
--- /dev/null
+++ b/arch/x86/kvm/fpu.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_FPU_H_
+#define __KVM_FPU_H_
+
+#include <asm/fpu/api.h>
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
+#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
+#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; })
+#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; })
+#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; })
+#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; })
+#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
+#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+
+typedef u32 __attribute__((vector_size(32))) avx256_t;
+
+static inline void _kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break;
+ case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break;
+ case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break;
+ case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break;
+ case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break;
+ case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break;
+ case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break;
+ case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break;
+ case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break;
+ case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break;
+ case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break;
+ case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break;
+ case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break;
+ case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break;
+ case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ switch (reg) {
+ case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break;
+ case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break;
+ case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break;
+ case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break;
+ case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break;
+ case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break;
+ case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break;
+ case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break;
+ case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break;
+ case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break;
+ case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break;
+ case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break;
+ case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break;
+ case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break;
+ case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_mmx_reg(int reg, u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void kvm_fpu_get(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static inline void kvm_fpu_put(void)
+{
+ fpregs_unlock();
+}
+
+static inline void kvm_read_avx_reg(int reg, avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_avx_reg(int reg, const avx256_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_avx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_mmx_reg(int reg, u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_read_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_write_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+#endif
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
new file mode 100644
index 000000000000..de92292eb1f5
--- /dev/null
+++ b/arch/x86/kvm/hyperv.c
@@ -0,0 +1,2925 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "x86.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "mmu.h"
+#include "xen.h"
+
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
+#include <linux/eventfd.h>
+
+#include <asm/apicdef.h>
+#include <asm/mshyperv.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+#include "irq.h"
+#include "fpu.h"
+
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
+
+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). The response to this hypercall is a 64-bit
+ * value where each bit indicates which extended hypercall is available
+ * besides HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick);
+
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ bool auto_eoi_old, auto_eoi_new;
+
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+
+ auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256);
+
+ if (auto_eoi_old == auto_eoi_new)
+ return;
+
+ if (!enable_apicv)
+ return;
+
+ down_write(&vcpu->kvm->arch.apicv_update_lock);
+
+ if (auto_eoi_new)
+ hv->synic_auto_eoi_used++;
+ else
+ hv->synic_auto_eoi_used--;
+
+ /*
+ * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on
+ * the hypervisor to manually inject IRQs.
+ */
+ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm,
+ APICV_INHIBIT_REASON_HYPERV,
+ !!hv->synic_auto_eoi_used);
+
+ up_write(&vcpu->kvm->arch.apicv_update_lock);
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector, old_vector;
+ bool masked;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+	 * Valid vectors are 16-255; however, nested Hyper-V attempts to write
+	 * the default '0x10000' value on boot, and this should not #GP. We
+	 * also need to allow zero-initializing the register from the host.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
+
+ atomic64_set(&synic->sint[sint], data);
+
+ synic_update_vector(synic, old_vector);
+
+ synic_update_vector(synic, vector);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ unsigned long i;
+
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
+ return vcpu;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_hv_get_vpindex(vcpu) == vpidx)
+ return vcpu;
+ return NULL;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ vcpu = get_vcpu_by_vpidx(kvm, vpidx);
+ if (!vcpu || !to_hv_vcpu(vcpu))
+ return NULL;
+ synic = to_hv_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+	/* Try to deliver pending Hyper-V SynIC timer messages */
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending && stimer->config.enable &&
+ !stimer->config.direct_mode &&
+ stimer->config.sintx == sint)
+ stimer_mark_pending(stimer, false);
+ }
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active && (!host || data))
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ if (!synic->active)
+ break;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu->cpuid_cache.syndbg_cap_eax &
+ HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+}
+
+static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
+ hv->hv_syndbg.control.status =
+ vcpu->run->hyperv.u.syndbg.status;
+ return 1;
+}
+
+static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
+ hv_vcpu->exit.u.syndbg.msr = msr;
+ hv_vcpu->exit.u.syndbg.control = syndbg->control.control;
+ hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page;
+ hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page;
+ hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_syndbg_complete_userspace;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+ return 1;
+
+ trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
+ to_hv_vcpu(vcpu)->vp_index, msr, data);
+ switch (msr) {
+ case HV_X64_MSR_SYNDBG_CONTROL:
+ syndbg->control.control = data;
+ if (!host)
+ syndbg_exit(vcpu, msr);
+ break;
+ case HV_X64_MSR_SYNDBG_STATUS:
+ syndbg->control.status = data;
+ break;
+ case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+ syndbg->control.send_page = data;
+ break;
+ case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+ syndbg->control.recv_page = data;
+ break;
+ case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ syndbg->control.pending_page = data;
+ if (!host)
+ syndbg_exit(vcpu, msr);
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ syndbg->options = data;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_SYNDBG_CONTROL:
+ *pdata = syndbg->control.control;
+ break;
+ case HV_X64_MSR_SYNDBG_STATUS:
+ *pdata = syndbg->control.status;
+ break;
+ case HV_X64_MSR_SYNDBG_SEND_BUFFER:
+ *pdata = syndbg->control.send_page;
+ break;
+ case HV_X64_MSR_SYNDBG_RECV_BUFFER:
+ *pdata = syndbg->control.recv_page;
+ break;
+ case HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ *pdata = syndbg->control.pending_page;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ *pdata = syndbg->options;
+ break;
+ default:
+ break;
+ }
+
+ trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
+
+ return 0;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
+{
+ int ret;
+
+ if (!synic->active && !host)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.shorthand = APIC_DEST_SELF;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, e->hv_sint.sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * Fall back to get_kvmclock_ns() when TSC page hasn't been set up,
+ * is broken, disabled or being updated.
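+ * The Hyper-V reference counter is in 100ns units, hence the division
+ * of nanoseconds by 100 below.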
+ */
+ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config.periodic) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
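+ /* exp_time and time_now are in 100ns units; ktime_add_ns() wants ns. */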
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire the timer according to the Hypervisor Top-Level Functional
+ * Specification v4 (15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ union hv_stimer_config new_config = {.as_uint64 = config},
+ old_config = {.as_uint64 = stimer->config.as_uint64};
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
+
+ if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_STIMER_DIRECT_MODE_AVAILABLE)))
+ return 1;
+
+ trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if (old_config.enable &&
+ !new_config.direct_mode && new_config.sintx == 0)
+ new_config.enable = 0;
+ stimer->config.as_uint64 = new_config.as_uint64;
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
+ trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (!host) {
+ if (stimer->count == 0)
+ stimer->config.enable = 0;
+ else if (stimer->config.auto_enable)
+ stimer->config.enable = 1;
+ }
+
+ if (stimer->config.enable)
+ stimer_mark_pending(stimer, false);
+
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config.as_uint64;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg, bool no_retry)
+{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
+ gfn_t msg_page_gfn;
+ struct hv_message_header hv_hdr;
+ int r;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ msg_page_gfn = synic->msg_page >> PAGE_SHIFT;
+
+ /*
+ * Strictly following the spec-mandated ordering would assume setting
+ * .msg_pending before checking .message_type. However, this function
+ * is only called in vcpu context so the entire update is atomic from
+ * guest POV and thus the exact order here doesn't matter.
+ */
+ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type,
+ msg_off + offsetof(struct hv_message,
+ header.message_type),
+ sizeof(hv_hdr.message_type));
+ if (r < 0)
+ return r;
+
+ if (hv_hdr.message_type != HVMSG_NONE) {
+ if (no_retry)
+ return 0;
+
+ hv_hdr.message_flags.msg_pending = 1;
+ r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn,
+ &hv_hdr.message_flags,
+ msg_off +
+ offsetof(struct hv_message,
+ header.message_flags),
+ sizeof(hv_hdr.message_flags));
+ if (r < 0)
+ return r;
+ return -EAGAIN;
+ }
+
+ r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off,
+ sizeof(src_msg->header) +
+ src_msg->header.payload_size);
+ if (r < 0)
+ return r;
+
+ r = synic_set_irq(synic, sint);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EFAULT;
+ return 0;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ /*
+ * To avoid piling up periodic ticks, don't retry message
+ * delivery for them (within "lazy" lost ticks policy).
+ */
+ bool no_retry = stimer->config.periodic;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(to_hv_synic(vcpu),
+ stimer->config.sintx, msg,
+ no_retry);
+}
+
+static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = stimer->config.apic_vector
+ };
+
+ if (lapic_in_kernel(vcpu))
+ return !kvm_apic_set_irq(vcpu, &irq, NULL);
+ return 0;
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r, direct = stimer->config.direct_mode;
+
+ stimer->msg_pending = true;
+ if (!direct)
+ r = stimer_send_msg(stimer);
+ else
+ r = stimer_notify_direct(stimer);
+ trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, direct, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config.periodic))
+ stimer->config.enable = 0;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ if (!hv_vcpu)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config.enable) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config.enable) &&
+ stimer->count) {
+ if (!stimer->msg_pending)
+ stimer_start(stimer);
+ } else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i;
+
+ if (!hv_vcpu)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+
+ kfree(hv_vcpu);
+ vcpu->arch.hyperv = NULL;
+}
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ return false;
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled);
+
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
+ return -EFAULT;
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page);
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer_prepare_msg(stimer);
+}
+
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i;
+
+ if (hv_vcpu)
+ return 0;
+
+ hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+ if (!hv_vcpu)
+ return -ENOMEM;
+
+ vcpu->arch.hyperv = hv_vcpu;
+ hv_vcpu->vcpu = vcpu;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+
+ hv_vcpu->vp_index = vcpu->vcpu_idx;
+
+ for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
+ INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
+ spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
+ }
+
+ return 0;
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
+{
+ struct kvm_vcpu_hv_synic *synic;
+ int r;
+
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+
+ synic = to_hv_synic(vcpu);
+
+ synic->active = true;
+ synic->dont_zero_synic_pages = dont_zero_synic_pages;
+ synic->control = HV_SYNIC_CONTROL_ENABLE;
+ return 0;
+}
+
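+/*
+ * Partition-wide MSRs live in struct kvm_hv and are shared by all vCPUs,
+ * so accesses to them are serialized with hv_lock (see
+ * kvm_hv_{get,set}_msr_common() below).
+ */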
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
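+ /* array_index_nospec() clamps 'index' under speculation (Spectre v1). */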
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+ return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ *pdata = hv->hv_crash_ctl;
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
+ return 0;
+}
+
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ struct ms_hyperv_tsc_page *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * Check if scale would overflow; if so, fall back to the time ref counter:
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
+
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications and TSC
+ * access emulation and Hyper-V is known to expect the values in TSC page to
+ * stay constant before TSC access emulation is disabled from guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC
+ * frequency and guest visible TSC value across migration (and prevent it when
+ * TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+ return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+ hv->hv_tsc_emulation_control;
+}
+
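+/*
+ * Rough sketch of the guest-side counterpart (cf. hv_read_tsc_page_tsc() in
+ * the Hyper-V clocksource driver), showing what the sequence writes below
+ * pair with:
+ *
+ *	do {
+ *		seq = READ_ONCE(tsc_page->tsc_sequence);
+ *		if (!seq)
+ *			break;		/* fall back to the ref counter MSR */
+ *		tsc = rdtsc();
+ *		scale = READ_ONCE(tsc_page->tsc_scale);
+ *		offset = READ_ONCE(tsc_page->tsc_offset);
+ *	} while (READ_ONCE(tsc_page->tsc_sequence) != seq);
+ *	time = mul_u64_u64_shr(tsc, scale, 64) + offset;
+ *
+ * A sequence of 0 tells the guest the page is being updated and it must
+ * read HV_X64_MSR_TIME_REF_COUNT instead.
+ */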
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
+
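+ /* Scope-based lock: hv_lock is dropped automatically on every return path. */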
+ guard(mutex)(&hv->hv_lock);
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_SET ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
+ return;
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ goto out_err;
+
+ if (tsc_seq && tsc_page_update_unsafe(hv)) {
+ if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ return;
+ }
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ goto out_err;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ return;
+
+out_err:
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+}
+
+void kvm_hv_request_tsc_page_update(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_lock(&hv->hv_lock);
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET &&
+ !tsc_page_update_unsafe(hv))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
+
+ mutex_unlock(&hv->hv_lock);
+}
+
+static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_HYPERCALL_AVAILABLE;
+ case HV_X64_MSR_VP_RUNTIME:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_RUNTIME_AVAILABLE;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ case HV_X64_MSR_VP_INDEX:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_INDEX_AVAILABLE;
+ case HV_X64_MSR_RESET:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_RESET_AVAILABLE;
+ case HV_X64_MSR_REFERENCE_TSC:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_REFERENCE_TSC_AVAILABLE;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNIC_AVAILABLE;
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG:
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNTIMER_AVAILABLE;
+ case HV_X64_MSR_EOI:
+ case HV_X64_MSR_ICR:
+ case HV_X64_MSR_TPR:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_APIC_ACCESS_AVAILABLE;
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_FREQUENCY_MSRS;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+#define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839
+#define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */
+
+/*
+ * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in an
+ * !XSAVES && XSAVEC configuration. Such a configuration can result,
+ * for example, from the AMD Erratum 1386 workaround.
+ *
+ * Print a notice so users aren't left wondering what's suddenly gone wrong.
+ */
+static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ /* Check again under the hv_lock. */
+ if (hv->xsaves_xsavec_checked)
+ return;
+
+ if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) !=
+ KVM_HV_WIN2016_GUEST_ID)
+ return;
+
+ hv->xsaves_xsavec_checked = true;
+
+ /* UP configurations aren't affected */
+ if (atomic_read(&kvm->online_vcpus) < 2)
+ return;
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC))
+ return;
+
+ pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. "
+ "If it fails to boot try disabling XSAVEC in the VM config.\n");
+}
+
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!vcpu->arch.hyperv_enabled ||
+ hv->xsaves_xsavec_checked)
+ return;
+
+ mutex_lock(&hv->hv_lock);
+ __kvm_hv_xsaves_xsavec_maybe_warn(vcpu);
+ mutex_unlock(&hv->hv_lock);
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ bool host)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ hv->hv_guest_os_id = data;
+ /* Setting the guest OS ID to zero disables the hypercall page. */
+ if (!hv->hv_guest_os_id)
+ hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u8 instructions[9];
+ int i = 0;
+ u64 addr;
+
+ /* If the guest OS ID is not set, the hypercall page must remain disabled. */
+ if (!hv->hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ hv->hv_hypercall = data;
+ break;
+ }
+
+ /*
+ * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+ * the same way Xen itself does, by setting bit 31 of EAX,
+ * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
+ * going to be clobbered on 64-bit.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ /* orl $0x80000000, %eax */
+ instructions[i++] = 0x0d;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x80;
+ }
+
+ /* vmcall/vmmcall */
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + i);
+ i += 3;
+
+ /* ret */
+ ((unsigned char *)instructions)[i++] = 0xc3;
+
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
+ return 1;
+ hv->hv_hypercall = data;
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ hv->hv_tsc_page = data;
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+ if (!host)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+ else
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ } else {
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+ }
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_set_crash_data(kvm,
+ msr - HV_X64_MSR_CRASH_P0,
+ data);
+ case HV_X64_MSR_CRASH_CTL:
+ if (host)
+ return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+ if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+ break;
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ if (data && !host)
+ return 1;
+
+ hv->hv_tsc_emulation_status = data;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* Read-only for the guest, but silently ignore host-initiated writes. */
+ if (!host)
+ return 1;
+ break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_set_msr(vcpu, msr, data, host);
+ default:
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate the CPU time spent by the current task, in 100ns units. */
+static u64 current_task_runtime_100ns(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+
+ return div_u64(utime + stime, 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ u32 new_vp_index = (u32)data;
+
+ if (!host || new_vp_index >= KVM_MAX_VCPUS)
+ return 1;
+
+ if (new_vp_index == hv_vcpu->vp_index)
+ return 0;
+
+ /*
+ * The VP index is initialized to vcpu_idx by
+ * kvm_hv_vcpu_init() so they initially match. Now the
+ * VP index is changing, adjust num_mismatched_vp_indexes if
+ * it now matches or no longer matches vcpu_idx.
+ */
+ if (hv_vcpu->vp_index == vcpu->vcpu_idx)
+ atomic_inc(&hv->num_mismatched_vp_indexes);
+ else if (new_vp_index == vcpu->vcpu_idx)
+ atomic_dec(&hv->num_mismatched_vp_indexes);
+
+ hv_vcpu->vp_index = new_vp_index;
+ break;
+ }
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
+ u64 gfn;
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
+ hv_vcpu->hv_vapic = data;
+ if (kvm_lapic_set_pv_eoi(vcpu, 0, 0))
+ return 1;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+
+ /*
+ * Clear only the apic_assist portion of struct hv_vp_assist_page;
+ * the rest may hold valuable data that needs to be preserved,
+ * e.g. on migration.
+ */
+ if (put_user(0, (u32 __user *)addr))
+ return 1;
+ hv_vcpu->hv_vapic = data;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ if (kvm_lapic_set_pv_eoi(vcpu,
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+ sizeof(struct hv_vp_assist_page)))
+ return 1;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
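+ /* CONFIG and COUNT MSRs interleave per timer, hence the divide by 2. */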
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(to_hv_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(to_hv_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* Read-only for the guest, but silently ignore host-initiated writes. */
+ if (!host)
+ return 1;
+ break;
+ default:
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = hv->hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = hv->hv_hypercall;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
+ break;
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = hv->hv_tsc_page;
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_get_crash_data(kvm,
+ msr - HV_X64_MSR_CRASH_P0,
+ pdata);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_get_crash_ctl(kvm, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return syndbg_get_msr(vcpu, msr, pdata, host);
+ default:
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX:
+ data = hv_vcpu->vp_index;
+ break;
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ data = hv_vcpu->hv_vapic;
+ break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(to_hv_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(to_hv_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
+ break;
+ case HV_X64_MSR_APIC_FREQUENCY:
+ data = div64_u64(1000000000ULL,
+ vcpu->kvm->arch.apic_bus_cycle_ns);
+ break;
+ default:
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&hv->hv_lock);
+ r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
+ mutex_unlock(&hv->hv_lock);
+ return r;
+ } else
+ return kvm_hv_set_msr(vcpu, msr, data, host);
+}
+
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&hv->hv_lock);
+ r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
+ mutex_unlock(&hv->hv_lock);
+ return r;
+ } else
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
+}
+
+static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
+ u64 valid_bank_mask, unsigned long *vcpu_mask)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes);
+ u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ struct kvm_vcpu *vcpu;
+ int bank, sbank = 0;
+ unsigned long i;
+ u64 *bitmap;
+
+ BUILD_BUG_ON(sizeof(vp_bitmap) >
+ sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS));
+
+ /*
+ * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else
+ * fill a temporary buffer and manually test each vCPU's VP index.
+ */
+ if (likely(!has_mismatch))
+ bitmap = (u64 *)vcpu_mask;
+ else
+ bitmap = vp_bitmap;
+
+ /*
+ * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask
+ * having a '1' for each bank that exists in sparse_banks. Sets must
+ * be in ascending order, i.e. bank0..bankN.
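+ * E.g. valid_bank_mask == 0b101 selects banks 0 and 2: sparse_banks[0]
+ * then covers VPs 0..63 and sparse_banks[1] covers VPs 128..191.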
+ */
+ memset(bitmap, 0, sizeof(vp_bitmap));
+ for_each_set_bit(bank, (unsigned long *)&valid_bank_mask,
+ KVM_HV_MAX_SPARSE_VCPU_SET_BITS)
+ bitmap[bank] = sparse_banks[sbank++];
+
+ if (likely(!has_mismatch))
+ return;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
+ __set_bit(i, vcpu_mask);
+ }
+}
+
+static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
+{
+ int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
+ unsigned long sbank;
+
+ if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
+ return false;
+
+ /*
+ * The index into the sparse bank is the number of preceding bits in
+ * the valid mask. Optimize for VMs with <64 vCPUs by skipping the
+ * fancy math if there can't possibly be preceding bits.
+ */
+ if (valid_bit_nr)
+ sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
+ else
+ sbank = 0;
+
+ return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
+ (unsigned long *)&sparse_banks[sbank]);
+}
+
+struct kvm_hv_hcall {
+ /* Hypercall input data */
+ u64 param;
+ u64 ingpa;
+ u64 outgpa;
+ u16 code;
+ u16 var_cnt;
+ u16 rep_cnt;
+ u16 rep_idx;
+ bool fast;
+ bool rep;
+ sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
+
+ /*
+ * Current read offset when KVM reads hypercall input data gradually:
+ * either the offset in bytes from 'ingpa' for regular hypercalls or the
+ * number of already consumed 'XMM halves' for 'fast' hypercalls.
+ */
+ union {
+ gpa_t data_offset;
+ int consumed_xmm_halves;
+ };
+};
+
+
+static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u16 orig_cnt, u16 cnt_cap, u64 *data)
+{
+ /*
+ * Preserve the original count when ignoring entries via a "cap"; KVM
+ * still needs to validate the guest input (though the non-XMM path
+ * punts on the checks).
+ */
+ u16 cnt = min(orig_cnt, cnt_cap);
+ int i, j;
+
+ if (hc->fast) {
+ /*
+ * Each XMM holds two sparse banks, but do not count halves that
+ * have already been consumed for hypercall parameters.
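+ * E.g. with consumed_xmm_halves == 1, data[0] comes from the high half
+ * of XMM0 and data[1] from the low half of XMM1.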
+ */
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ for (i = 0; i < cnt; i++) {
+ j = i + hc->consumed_xmm_halves;
+ if (j % 2)
+ data[i] = sse128_hi(hc->xmm[j / 2]);
+ else
+ data[i] = sse128_lo(hc->xmm[j / 2]);
+ }
+ return 0;
+ }
+
+ return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data,
+ cnt * sizeof(*data));
+}
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks)
+{
+ if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
+ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
+ sparse_banks);
+}
+
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[])
+{
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries);
+}
+
+static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
+ u64 *entries, int count)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
+
+ if (!hv_vcpu)
+ return;
+
+ spin_lock(&tlb_flush_fifo->write_lock);
+
+ /*
+ * All entries should fit in the fifo, leaving one slot free for a
+ * 'flush all' entry in case another request comes in. If there is not
+ * enough space, just put a 'flush all' entry there instead.
+ */
+ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
+ WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
+ goto out_unlock;
+ }
+
+ /*
+ * Note: a full fifo always contains a 'flush all' entry, so there is
+ * no need to check the return value.
+ */
+ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
+
+out_unlock:
+ spin_unlock(&tlb_flush_fifo->write_lock);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
+ int i, j, count;
+ gva_t gva;
+
+ if (!tdp_enabled || !hv_vcpu)
+ return -EINVAL;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+
+ for (i = 0; i < count; i++) {
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
+ if (is_noncanonical_invlpg_address(entries[i], vcpu))
+ continue;
+
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
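+ * E.g. an entry of 0x7f8003 flushes the 4 pages starting at gva 0x7f8000.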
+ */
+ gva = entries[i] & PAGE_MASK;
+ for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
+ kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+
+ ++vcpu->stat.tlb_flush;
+ }
+ return 0;
+
+out_flush_all:
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+
+ /* Fall back to full flush. */
+ return -ENOSPC;
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ unsigned long *vcpu_mask = hv_vcpu->vcpu_mask;
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
+ struct kvm *kvm = vcpu->kvm;
+ struct hv_tlb_flush_ex flush_ex;
+ struct hv_tlb_flush flush;
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ /*
+ * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
+ * entries on the TLB flush fifo. The last entry, however, always needs
+ * to be left free for the 'flush all' entry, which gets placed when
+ * there is not enough space for all the requested entries.
+ */
+ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
+ u64 *tlb_flush_entries;
+ u64 valid_bank_mask;
+ struct kvm_vcpu *v;
+ unsigned long i;
+ bool all_cpus;
+
+ /*
+ * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
+ * sparse banks. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) exceeds this limit.
+ */
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
+
+ /*
+ * A 'slow' hypercall's first parameter is the address in guest memory
+ * where the hypercall parameters are placed. This is either a GPA or a
+ * nested GPA when KVM is handling the call from L2 ('direct' TLB
+ * flush). Translate the address here so the memory can be uniformly
+ * read with kvm_read_guest().
+ */
+ if (!hc->fast && is_guest_mode(vcpu)) {
+ hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ if (unlikely(hc->ingpa == INVALID_GPA))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
+ if (hc->fast) {
+ flush.address_space = hc->ingpa;
+ flush.flags = hc->outgpa;
+ flush.processor_mask = sse128_lo(hc->xmm[0]);
+ hc->consumed_xmm_halves = 1;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa,
+ &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush);
+ }
+
+ trace_kvm_hv_flush_tlb(flush.processor_mask,
+ flush.address_space, flush.flags,
+ is_guest_mode(vcpu));
+
+ valid_bank_mask = BIT_ULL(0);
+ sparse_banks[0] = flush.processor_mask;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 the same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
+ } else {
+ if (hc->fast) {
+ flush_ex.address_space = hc->ingpa;
+ flush_ex.flags = hc->outgpa;
+ memcpy(&flush_ex.hv_vp_set,
+ &hc->xmm[0], sizeof(hc->xmm[0]));
+ hc->consumed_xmm_halves = 2;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush_ex);
+ }
+
+ trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
+ flush_ex.hv_vp_set.format,
+ flush_ex.address_space,
+ flush_ex.flags, is_guest_mode(vcpu));
+
+ valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
+ all_cpus = flush_ex.hv_vp_set.format !=
+ HV_GENERIC_SET_SPARSE_4K;
+
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (!all_cpus) {
+ if (!hc->var_cnt)
+ goto ret_success;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ /*
+ * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
+ * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
+ * case (HV_GENERIC_SET_ALL). Always adjust data_offset and
+ * consumed_xmm_halves to make sure TLB flush entries are read
+ * from the correct offset.
+ */
+ if (hc->fast)
+ hc->consumed_xmm_halves += hc->var_cnt;
+ else
+ hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+ }
+
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
+ tlb_flush_entries = NULL;
+ } else {
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ tlb_flush_entries = __tlb_flush_entries;
+ }
+
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
+ * analyze it here, flush TLB regardless of the specified address space.
+ */
+ if (all_cpus && !is_guest_mode(vcpu)) {
+ kvm_for_each_vcpu(i, v, kvm) {
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
+ } else if (!is_guest_mode(vcpu)) {
+ sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
+
+ for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+ v = kvm_get_vcpu(kvm, i);
+ if (!v)
+ continue;
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ } else {
+ struct kvm_vcpu_hv *hv_v;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+
+ kvm_for_each_vcpu(i, v, kvm) {
+ hv_v = to_hv_vcpu(v);
+
+ /*
+ * The following check races with nested vCPUs entering/exiting
+ * and/or migrating between L1's vCPUs, however the only case when
+ * KVM *must* flush the TLB is when the target L2 vCPU keeps
+ * running on the same L1 vCPU from the moment of the request until
+ * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
+ * cases, e.g. when the target L2 vCPU migrates to a different L1
+ * vCPU or when the corresponding L1 vCPU temporarily switches to a
+ * different L2 vCPU while the request is being processed.
+ */
+ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
+ continue;
+
+ if (!all_cpus &&
+ !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
+ sparse_banks))
+ continue;
+
+ __set_bit(i, vcpu_mask);
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ }
+
+ret_success:
+ /* We always do a full TLB flush, so set 'Reps completed' = 'Rep Count'. */
+ return (u64)HV_STATUS_SUCCESS |
+ ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+}
+
+static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ u64 *sparse_banks, u64 valid_bank_mask)
+{
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vector
+ };
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (sparse_banks &&
+ !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
+ valid_bank_mask, sparse_banks))
+ continue;
+
+ /* We fail only when APIC is disabled */
+ kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+}
+
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
+ struct kvm *kvm = vcpu->kvm;
+ struct hv_send_ipi_ex send_ipi_ex;
+ struct hv_send_ipi send_ipi;
+ u64 valid_bank_mask;
+ u32 vector;
+ bool all_cpus;
+
+ if (!lapic_in_kernel(vcpu))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (hc->code == HVCALL_SEND_IPI) {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
+ sizeof(send_ipi))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = send_ipi.cpu_mask;
+ vector = send_ipi.vector;
+ } else {
+ /* 'reserved' part of hv_send_ipi should be 0 */
+ if (unlikely(hc->ingpa >> 32 != 0))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ sparse_banks[0] = hc->outgpa;
+ vector = (u32)hc->ingpa;
+ }
+ all_cpus = false;
+ valid_bank_mask = BIT_ULL(0);
+
+ trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
+ } else {
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
+ sizeof(send_ipi_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ } else {
+ send_ipi_ex.vector = (u32)hc->ingpa;
+ send_ipi_ex.vp_set.format = hc->outgpa;
+ send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]);
+ }
+
+ trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector,
+ send_ipi_ex.vp_set.format,
+ send_ipi_ex.vp_set.valid_bank_mask);
+
+ vector = send_ipi_ex.vector;
+ valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
+ all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+
+ if (hc->var_cnt != hweight64(valid_bank_mask))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ goto check_and_send_ipi;
+
+ if (!hc->var_cnt)
+ goto ret_success;
+
+ if (!hc->fast)
+ hc->data_offset = offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents);
+ else
+ hc->consumed_xmm_halves = 1;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+check_and_send_ipi:
+ if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ if (all_cpus)
+ kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
+ else
+ kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
+
+ret_success:
+ return HV_STATUS_SUCCESS;
+}
+
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vcpu->arch.hyperv_enabled = hyperv_enabled;
+
+ if (!hv_vcpu) {
+ /*
+ * KVM should have already allocated kvm_vcpu_hv if Hyper-V is
+ * enabled in CPUID.
+ */
+ WARN_ON_ONCE(vcpu->arch.hyperv_enabled);
+ return;
+ }
+
+ memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache));
+
+ if (!vcpu->arch.hyperv_enabled)
+ return;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.features_eax = entry->eax;
+ hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
+ hv_vcpu->cpuid_cache.features_edx = entry->edx;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
+ if (entry) {
+ hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
+ if (entry)
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.nested_eax = entry->eax;
+ hv_vcpu->cpuid_cache.nested_ebx = entry->ebx;
+ }
+}
+
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+ int ret = 0;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (enforce) {
+ ret = kvm_hv_vcpu_init(vcpu);
+ if (ret)
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+ hv_vcpu->enforce_cpuid = enforce;
+
+ return ret;
+}
+
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
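+ /* Per the TLFS, the result goes in RAX (64-bit) or the EDX:EAX pair (32-bit). */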
+ longmode = is_64_bit_hypercall(vcpu);
+ if (longmode)
+ kvm_rax_write(vcpu, result);
+ else {
+ kvm_rdx_write(vcpu, result >> 32);
+ kvm_rax_write(vcpu, result & 0xffffffff);
+ }
+}
+
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
+{
+ u32 tlb_lock_count = 0;
+ int ret;
+
+ if (hv_result_success(result) && is_guest_mode(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu) &&
+ kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
+ &tlb_lock_count, sizeof(tlb_lock_count)))
+ result = HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_hypercall_done(result);
+ kvm_hv_hypercall_set_result(vcpu, result);
+ ++vcpu->stat.hypercalls;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (tlb_lock_count)
+ kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
+
+ return ret;
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
+}
+
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
+{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!hc->fast)) {
+ int ret;
+ gpa_t gpa = hc->ingpa;
+
+ if ((gpa & (__alignof__(hc->ingpa) - 1)) ||
+ offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa,
+ &hc->ingpa, sizeof(hc->ingpa));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+	 * have no use for it, and in all known use cases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (hc->ingpa & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+ rcu_read_lock();
+ eventfd = idr_find(&hv->conn_to_evt, hc->ingpa);
+ rcu_read_unlock();
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd);
+ return HV_STATUS_SUCCESS;
+}
+
+static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
+{
+ switch (hc->code) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ case HVCALL_SEND_IPI_EX:
+ return true;
+ }
+
+ return false;
+}
+
+static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc)
+{
+ int reg;
+
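+	/* XMM fast hypercalls pass input in XMM0-XMM5; snapshot the registers while the guest FPU state is loaded. */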
+ kvm_fpu_get();
+ for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++)
+ _kvm_read_sse_reg(reg, &hc->xmm[reg]);
+ kvm_fpu_put();
+}
+
+static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
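+		/*
+		 * EBX holds the recommended spinlock retry count; 0 means the
+		 * feature is unsupported and 0xFFFFFFFF means "never notify".
+		 */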
+ return hv_vcpu->cpuid_cache.enlightenments_ebx &&
+ hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX;
+ case HVCALL_POST_MESSAGE:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES;
+ case HVCALL_SIGNAL_EVENT:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ case HVCALL_RESET_DEBUG_SESSION:
+ /*
+ * Return 'true' when SynDBG is disabled so the resulting code
+ * will be HV_STATUS_INVALID_HYPERCALL_CODE.
+ */
+ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) ||
+ hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ case HVCALL_SEND_IPI_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_SEND_IPI:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_hv_hcall hc;
+ u64 ret = HV_STATUS_SUCCESS;
+
+ /*
+	 * Per the Hyper-V spec, the hypercall instruction generates #UD when
+	 * executed at non-zero CPL or in real mode.
+ */
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+#ifdef CONFIG_X86_64
+ if (is_64_bit_hypercall(vcpu)) {
+ hc.param = kvm_rcx_read(vcpu);
+ hc.ingpa = kvm_rdx_read(vcpu);
+ hc.outgpa = kvm_r8_read(vcpu);
+ } else
+#endif
+ {
+ hc.param = ((u64)kvm_rdx_read(vcpu) << 32) |
+ (kvm_rax_read(vcpu) & 0xffffffff);
+ hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
+ (kvm_rcx_read(vcpu) & 0xffffffff);
+ hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
+ (kvm_rsi_read(vcpu) & 0xffffffff);
+ }
+
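+	/* Decode the input value: call code in bits 15:0, plus variable header size, fast bit, and rep count/start index fields. */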
+ hc.code = hc.param & 0xffff;
+ hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET;
+ hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
+ hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ hc.rep = !!(hc.rep_cnt || hc.rep_idx);
+
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt,
+ hc.rep_idx, hc.ingpa, hc.outgpa);
+
+ if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
+ ret = HV_STATUS_ACCESS_DENIED;
+ goto hypercall_complete;
+ }
+
+ if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ goto hypercall_complete;
+ }
+
+ if (hc.fast && is_xmm_fast_hypercall(&hc)) {
+ if (unlikely(hv_vcpu->enforce_cpuid &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_hv_hypercall_read_xmm(&hc);
+ }
+
+ switch (hc.code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ if (unlikely(hc.rep || hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ kvm_vcpu_on_spin(vcpu, true);
+ break;
+ case HVCALL_SIGNAL_EVENT:
+ if (unlikely(hc.rep || hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hvcall_signal_event(vcpu, &hc);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ fallthrough; /* maybe userspace knows this conn_id */
+ case HVCALL_POST_MESSAGE:
+ /* don't bother userspace if it has no way to handle it */
+ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (unlikely(hc.rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, &hc);
+ break;
+ case HVCALL_SEND_IPI:
+ if (unlikely(hc.var_cnt)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ fallthrough;
+ case HVCALL_SEND_IPI_EX:
+ if (unlikely(hc.rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_send_ipi(vcpu, &hc);
+ break;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ fallthrough;
+ case HVCALL_RESET_DEBUG_SESSION: {
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+
+ if (!kvm_hv_is_syndbg_enabled(vcpu)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) {
+ ret = HV_STATUS_OPERATION_DENIED;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ }
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
+ default:
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+hypercall_complete:
+ return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
+}
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_init(&hv->hv_lock);
+ idr_init(&hv->conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&hv->conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL_ACCOUNT);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
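+	/* idr_alloc() returns -ENOSPC when conn_id is already in use; report that as -EEXIST. */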
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
+
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ uint16_t evmcs_ver = 0;
+ struct kvm_cpuid_entry2 cpuid_entries[] = {
+ { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS },
+ { .function = HYPERV_CPUID_INTERFACE },
+ { .function = HYPERV_CPUID_VERSION },
+ { .function = HYPERV_CPUID_FEATURES },
+ { .function = HYPERV_CPUID_ENLIGHTMENT_INFO },
+ { .function = HYPERV_CPUID_IMPLEMENT_LIMITS },
+ { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS },
+ { .function = HYPERV_CPUID_SYNDBG_INTERFACE },
+ { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES },
+ { .function = HYPERV_CPUID_NESTED_FEATURES },
+ };
+ int i, nent = ARRAY_SIZE(cpuid_entries);
+
+ if (kvm_x86_ops.nested_ops->get_evmcs_version)
+ evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu);
+
+ if (cpuid->nent < nent)
+ return -E2BIG;
+
+ if (cpuid->nent > nent)
+ cpuid->nent = nent;
+
+ for (i = 0; i < nent; i++) {
+ struct kvm_cpuid_entry2 *ent = &cpuid_entries[i];
+ u32 signature[3];
+
+ switch (ent->function) {
+ case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS:
+ memcpy(signature, "Linux KVM Hv", 12);
+
+ ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
+ ent->ebx = signature[0];
+ ent->ecx = signature[1];
+ ent->edx = signature[2];
+ break;
+
+ case HYPERV_CPUID_INTERFACE:
+ ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
+ break;
+
+ case HYPERV_CPUID_VERSION:
+ /*
+			 * We implement some Hyper-V 2016 functions, so let's use
+ * this version.
+ */
+ ent->eax = 0x00003839;
+ ent->ebx = 0x000A0000;
+ break;
+
+ case HYPERV_CPUID_FEATURES:
+ ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE;
+ ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ ent->eax |= HV_MSR_SYNIC_AVAILABLE;
+ ent->eax |= HV_MSR_SYNTIMER_AVAILABLE;
+ ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE;
+ ent->eax |= HV_MSR_HYPERCALL_AVAILABLE;
+ ent->eax |= HV_MSR_VP_INDEX_AVAILABLE;
+ ent->eax |= HV_MSR_RESET_AVAILABLE;
+ ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
+ ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
+ ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;
+
+ ent->ebx |= HV_POST_MESSAGES;
+ ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;
+
+ ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
+ ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ ent->ebx |= HV_DEBUGGING;
+ ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
+ ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
+
+ /*
+			 * Direct-mode synthetic timers only make sense with an
+			 * in-kernel LAPIC.
+ */
+ if (!vcpu || lapic_in_kernel(vcpu))
+ ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
+
+ break;
+
+ case HYPERV_CPUID_ENLIGHTMENT_INFO:
+ ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
+ ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
+ if (!vcpu || lapic_in_kernel(vcpu))
+ ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+ ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ if (!cpu_smt_possible())
+ ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+ ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
+ /*
+ * Default number of spinlock retry attempts, matches
+			 * Hyper-V 2016.
+ */
+ ent->ebx = 0x00000FFF;
+
+ break;
+
+ case HYPERV_CPUID_IMPLEMENT_LIMITS:
+ /* Maximum number of virtual processors */
+ ent->eax = KVM_MAX_VCPUS;
+ /*
+ * Maximum number of logical processors, matches
+			 * Hyper-V 2016.
+ */
+ ent->ebx = 64;
+
+ break;
+
+ case HYPERV_CPUID_NESTED_FEATURES:
+ ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
+ ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
+ break;
+
+ case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS:
+ memcpy(signature, "Linux KVM Hv", 12);
+
+ ent->eax = 0;
+ ent->ebx = signature[0];
+ ent->ecx = signature[1];
+ ent->edx = signature[2];
+ break;
+
+ case HYPERV_CPUID_SYNDBG_INTERFACE:
+ memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
+ ent->eax = signature[0];
+ break;
+
+ case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES:
+ ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
new file mode 100644
index 000000000000..6ce160ffa678
--- /dev/null
+++ b/arch/x86/kvm/hyperv.h
@@ -0,0 +1,327 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ */
+
+#ifndef __ARCH_X86_KVM_HYPERV_H__
+#define __ARCH_X86_KVM_HYPERV_H__
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+
+#ifdef CONFIG_KVM_HYPERV
+
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
+/*
+ * The #defines related to the synthetic debugger are required by KDNet, but
+ * they are not documented in the Hyper-V TLFS because the synthetic debugger
+ * functionality has been deprecated and is subject to removal in future
+ * versions of Windows.
+ */
+#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080
+#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081
+#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082
+
+/*
+ * Hyper-V synthetic debugger platform capabilities
+ * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
+ */
+#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1)
+
+/* Hyper-V Synthetic debug options MSR */
+#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1
+#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2
+#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3
+#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4
+#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5
+#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF
+
+/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
+#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2)
+
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
+{
+ return &kvm->arch.hyperv;
+}
+
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return &hv_vcpu->synic;
+}
+
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+ return hv_vcpu->vcpu;
+}
+
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->kvm->arch.hyperv.hv_syndbg;
+}
+
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
+
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+
+static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ return to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->vec_bitmap);
+}
+
+static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector)
+{
+ return to_hv_vcpu(vcpu) &&
+ test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap);
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
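+	/* stimer - stimer->index points at stimer[0], letting container_of() recover the enclosing kvm_vcpu_hv. */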
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu->vcpu;
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+/*
+ * With the HV_ACCESS_TSC_INVARIANT feature, the invariant TSC bit
+ * (CPUID.80000007H:EDX[8]) is only observed after
+ * HV_X64_MSR_TSC_INVARIANT_CONTROL has been written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * If Hyper-V's invariant TSC control is not exposed to the guest,
+	 * the invariant TSC CPUID flag is not suppressed; Windows guests were
+	 * observed to handle it correctly. Going forward, VMMs are encouraged
+	 * to enable Hyper-V's invariant TSC control when the invariant TSC
+	 * CPUID flag is set, to make KVM's behavior match genuine Hyper-V.
+ */
+ if (!hv_vcpu ||
+ !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT))
+ return false;
+
+ /*
+ * If Hyper-V's invariant TSC control is exposed to the guest, KVM is
+ * responsible for suppressing the invariant TSC CPUID flag if the
+ * Hyper-V control is not enabled.
+ */
+ return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_request_tsc_page_update(struct kvm *kvm);
+
+void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu);
+
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled);
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+
+static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
+ bool is_guest_mode)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
+ HV_L1_TLB_FLUSH_FIFO;
+
+ return &hv_vcpu->tlb_flush_fifo[i];
+}
+
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+
+ if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ return;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+}
+
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu &&
+ (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
+}
+
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u16 code;
+
+ if (!hv_vcpu)
+ return false;
+
+ code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
+ kvm_rax_read(vcpu);
+
+ return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
+}
+
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ if (!to_hv_vcpu(vcpu))
+ return 0;
+
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return 0;
+
+ return kvm_hv_get_assist_page(vcpu);
+}
+
+static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu,
+ bool tdp_enabled)
+{
+ /*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+	 * requests may be posted by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && tdp_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+#else /* CONFIG_KVM_HYPERV */
+static inline void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock) {}
+static inline void kvm_hv_request_tsc_page_update(struct kvm *kvm) {}
+static inline void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) {}
+static inline void kvm_hv_init_vm(struct kvm *kvm) {}
+static inline void kvm_hv_destroy_vm(struct kvm *kvm) {}
+static inline int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+static inline void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ return HV_STATUS_ACCESS_DENIED;
+}
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ return false;
+}
+static inline bool kvm_hv_synic_auto_eoi_set(struct kvm_vcpu *vcpu, int vector)
+{
+ return false;
+}
+static inline void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) {}
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) {}
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_idx;
+}
+static inline void kvm_hv_nested_transtion_tlb_flush(struct kvm_vcpu *vcpu, bool tdp_enabled) {}
+#endif /* CONFIG_KVM_HYPERV */
+
+#endif /* __ARCH_X86_KVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..850972deac8e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
* Copyright (c) 2006 Intel Corporation
* Copyright (c) 2007 Keir Fraser, XenSource Inc
* Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -29,13 +30,15 @@
* Based on QEMU and Xen.
*/
-#define pr_fmt(fmt) "pit: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
+#include "x86.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -48,32 +51,9 @@
#define RW_STATE_WORD0 3
#define RW_STATE_WORD1 4
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
{
- union {
- u64 ll;
- struct {
- u32 low, high;
- } l;
- } u, res;
- u64 rl, rh;
-
- u.ll = a;
- rl = (u64)u.l.low * (u64)b;
- rh = (u64)u.l.high * (u64)b;
- rh += (rl >> 32);
- res.l.high = div64_u64(rh, c);
- res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
- return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
-{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
switch (c->mode) {
default:
@@ -94,20 +74,18 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
c->gate = val;
}
-static int pit_get_gate(struct kvm *kvm, int channel)
+static int pit_get_gate(struct kvm_pit *pit, int channel)
{
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- return kvm->arch.vpit->pit_state.channels[channel].gate;
+ return pit->pit_state.channels[channel].gate;
}
-static s64 __kpit_elapsed(struct kvm *kvm)
+static s64 __kpit_elapsed(struct kvm_pit *pit)
{
s64 elapsed;
ktime_t remaining;
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ struct kvm_kpit_state *ps = &pit->pit_state;
- if (!ps->pit_timer.period)
+ if (!ps->period)
return 0;
/*
@@ -119,33 +97,29 @@ static s64 __kpit_elapsed(struct kvm *kvm)
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
return elapsed;
}
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
int channel)
{
if (channel == 0)
- return __kpit_elapsed(kvm);
+ return __kpit_elapsed(pit);
return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
}
-static int pit_get_count(struct kvm *kvm, int channel)
+static int pit_get_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int counter;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
case 0:
@@ -165,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel)
return counter;
}
-static int pit_get_out(struct kvm *kvm, int channel)
+static int pit_get_out(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
s64 d, t;
int out;
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
- t = kpit_elapsed(kvm, c, channel);
- d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) {
default:
@@ -200,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel)
return out;
}
-static void pit_latch_count(struct kvm *kvm, int channel)
+static void pit_latch_count(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->count_latched) {
- c->latched_count = pit_get_count(kvm, channel);
+ c->latched_count = pit_get_count(pit, channel);
c->count_latched = c->rw_mode;
}
}
-static void pit_latch_status(struct kvm *kvm, int channel)
+static void pit_latch_status(struct kvm_pit *pit, int channel)
{
- struct kvm_kpit_channel_state *c =
- &kvm->arch.vpit->pit_state.channels[channel];
-
- WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
if (!c->status_latched) {
/* TODO: Return NULL COUNT (bit 6). */
- c->status = ((pit_get_out(kvm, channel) << 7) |
+ c->status = ((pit_get_out(pit, channel) << 7) |
(c->rw_mode << 4) |
(c->mode << 1) |
c->bcd);
@@ -230,24 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
}
}
-int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-
- if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
- return atomic_read(&pit->pit_state.pit_timer.pending);
- return 0;
+ return container_of(ps, struct kvm_pit, pit_state);
}
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- raw_spin_lock(&ps->inject_lock);
- if (atomic_dec_return(&ps->pit_timer.pending) < 0)
- atomic_inc(&ps->pit_timer.pending);
- ps->irq_ack = 1;
- raw_spin_unlock(&ps->inject_lock);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ kthread_queue_work(pit->worker, &pit->expired);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -255,64 +220,153 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+ if (vcpu->vcpu_id || !pit)
return;
- timer = &pit->pit_state.pit_timer.timer;
+ timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
}
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
{
- pr_debug("execute del timer!\n");
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_flush_work(&pit->expired);
}
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
+static void pit_do_work(struct kthread_work *work)
{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
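+	/* Pulse IRQ0: raise then immediately lower the line to deliver an edge. */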
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+	 * the MP specification. We propagate the PIT interrupt to all
+	 * VCPUs, but only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
+ */
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
}
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = pit_state_to_pit(ps);
+
+ if (atomic_read(&ps->reinject))
+ atomic_inc(&ps->pending);
+
+ kthread_queue_work(pt->worker, &pt->expired);
+
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
-static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
{
- struct kvm_timer *pt = &ps->pit_timer;
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ /*
+	 * AMD SVM AVIC accelerates EOI writes and does not trap them.
+	 * This causes in-kernel PIT re-inject mode to fail, since it
+	 * checks ps->irq_ack before kvm_set_irq() and relies on the
+	 * ack notifier to promptly queue the pt->worker work item and
+	 * reinject the missed tick. So, deactivate APICv when the PIT
+	 * is in reinject mode.
+ */
+ if (reinject) {
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
s64 interval;
- interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ return;
+
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO: the new value only takes effect after the timer is retriggered */
- hrtimer_cancel(&pt->timer);
- pt->period = interval;
+ hrtimer_cancel(&ps->timer);
+ kthread_flush_work(&pit->expired);
+ ps->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = kvm_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
- pt->vcpu = pt->kvm->bsp_vcpu;
+ kvm_pit_reset_reinject(pit);
- atomic_set(&pt->pending, 0);
- ps->irq_ack = 1;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (ps->is_periodic) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (ps->period < min_period) {
+ pr_info_ratelimited(
+ "requested %lld ns "
+ "i8254 timer period limited to %lld ns\n",
+ ps->period, min_period);
+ ps->period = min_period;
+ }
+ }
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
{
- struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+ struct kvm_kpit_state *ps = &pit->pit_state;
- WARN_ON(!mutex_is_locked(&ps->lock));
-
- pr_debug("load_count val is %d, channel is %d\n", val, channel);
+ pr_debug("load_count val is %u, channel is %d\n", val, channel);
/*
* The largest possible initial count is 0; this is equivalent
@@ -335,32 +389,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
- create_pit_timer(ps, val, 0);
- }
+ create_pit_timer(pit, val, 0);
break;
case 2:
case 3:
- if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
- create_pit_timer(ps, val, 1);
- }
+ create_pit_timer(pit, val, 1);
break;
default:
- destroy_pit_timer(&ps->pit_timer);
+ destroy_pit_timer(pit);
}
}
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
- saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
- kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
- pit_load_count(kvm, channel, val);
- kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+ WARN_ON(channel != 0);
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
} else {
- pit_load_count(kvm, channel, val);
+ pit_load_count(pit, channel, val);
}
}
@@ -380,12 +435,12 @@ static inline int pit_in_range(gpa_t addr)
(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
}
-static int pit_ioport_write(struct kvm_io_device *this,
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
gpa_t addr, int len, const void *data)
{
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int channel, access;
struct kvm_kpit_channel_state *s;
u32 val = *(u32 *) data;
@@ -406,12 +461,11 @@ static int pit_ioport_write(struct kvm_io_device *this,
if (channel == 3) {
/* Read-Back Command. */
for (channel = 0; channel < 3; channel++) {
- s = &pit_state->channels[channel];
if (val & (2 << channel)) {
if (!(val & 0x20))
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
if (!(val & 0x10))
- pit_latch_status(kvm, channel);
+ pit_latch_status(pit, channel);
}
}
} else {
@@ -419,7 +473,7 @@ static int pit_ioport_write(struct kvm_io_device *this,
s = &pit_state->channels[channel];
access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
if (access == 0) {
- pit_latch_count(kvm, channel);
+ pit_latch_count(pit, channel);
} else {
s->rw_mode = access;
s->read_state = access;
@@ -436,17 +490,17 @@ static int pit_ioport_write(struct kvm_io_device *this,
switch (s->write_state) {
default:
case RW_STATE_LSB:
- pit_load_count(kvm, addr, val);
+ pit_load_count(pit, addr, val);
break;
case RW_STATE_MSB:
- pit_load_count(kvm, addr, val << 8);
+ pit_load_count(pit, addr, val << 8);
break;
case RW_STATE_WORD0:
s->write_latch = val;
s->write_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
s->write_state = RW_STATE_WORD0;
break;
}
@@ -456,12 +510,12 @@ static int pit_ioport_write(struct kvm_io_device *this,
return 0;
}
-static int pit_ioport_read(struct kvm_io_device *this,
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
gpa_t addr, int len, void *data)
{
struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
int ret, count;
struct kvm_kpit_channel_state *s;
if (!pit_in_range(addr))
@@ -498,20 +552,20 @@ static int pit_ioport_read(struct kvm_io_device *this,
switch (s->read_state) {
default:
case RW_STATE_LSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
break;
case RW_STATE_MSB:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
break;
case RW_STATE_WORD0:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = count & 0xff;
s->read_state = RW_STATE_WORD1;
break;
case RW_STATE_WORD1:
- count = pit_get_count(kvm, addr);
+ count = pit_get_count(pit, addr);
ret = (count >> 8) & 0xff;
s->read_state = RW_STATE_WORD0;
break;
@@ -526,29 +580,32 @@ static int pit_ioport_read(struct kvm_io_device *this,
return 0;
}
-static int speaker_ioport_write(struct kvm_io_device *this,
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
gpa_t addr, int len, const void *data)
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
u32 val = *(u32 *) data;
if (addr != KVM_SPEAKER_BASE_ADDRESS)
return -EOPNOTSUPP;
mutex_lock(&pit_state->lock);
- pit_state->speaker_data_on = (val >> 1) & 1;
- pit_set_gate(kvm, 2, val & 1);
+ if (val & (1 << 1))
+ pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ else
+ pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON;
+ pit_set_gate(pit, 2, val & 1);
mutex_unlock(&pit_state->lock);
return 0;
}
-static int speaker_ioport_read(struct kvm_io_device *this,
- gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
{
struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = &pit->pit_state;
- struct kvm *kvm = pit->kvm;
unsigned int refresh_clock;
int ret;
if (addr != KVM_SPEAKER_BASE_ADDRESS)
@@ -558,8 +615,9 @@ static int speaker_ioport_read(struct kvm_io_device *this,
refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
mutex_lock(&pit_state->lock);
- ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
- (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) |
+ pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) |
+ (refresh_clock << 4);
if (len > sizeof(ret))
len = sizeof(ret);
memcpy(data, (char *)&ret, len);
@@ -567,33 +625,101 @@ static int speaker_ioport_read(struct kvm_io_device *this,
return 0;
}
-void kvm_pit_reset(struct kvm_pit *pit)
+static void kvm_pit_reset(struct kvm_pit *pit)
{
int i;
struct kvm_kpit_channel_state *c;
- mutex_lock(&pit->pit_state.lock);
pit->pit_state.flags = 0;
for (i = 0; i < 3; i++) {
c = &pit->pit_state.channels[i];
c->mode = 0xff;
c->gate = (i != 2);
- pit_load_count(pit->kvm, i, 0);
+ pit_load_count(pit, i, 0);
}
- mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.irq_ack = 1;
+ kvm_pit_reset_reinject(pit);
}
-static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask)
{
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
- if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
- pit->pit_state.irq_ack = 1;
- }
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
+}
+
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
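+	/* Reload channel 0 in HPET legacy mode only on a 0 -> 1 transition of the flag. */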
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
}
static const struct kvm_io_device_ops pit_dev_ops = {
@@ -606,132 +732,85 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
-/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
+ struct pid *pid;
+ pid_t pid_nr;
int ret;
- pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0) {
- kfree(pit);
- return NULL;
- }
-
mutex_init(&pit->pit_state.lock);
- mutex_lock(&pit->pit_state.lock);
- raw_spin_lock_init(&pit->pit_state.inject_lock);
- kvm->arch.vpit = pit;
+ pid = get_pid(task_tgid(current));
+ pid_nr = pid_vnr(pid);
+ put_pid(pid);
+
+ pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker))
+ goto fail_kthread;
+
+ kthread_init_work(&pit->expired, pit_do_work);
+
pit->kvm = kvm;
pit_state = &pit->pit_state;
- pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
- kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
- mutex_unlock(&pit->pit_state.lock);
+ pit->mask_notifier.func = pit_mask_notifier;
kvm_pit_reset(pit);
- pit->mask_notifier.func = pit_mask_notifer;
- kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ kvm_pit_set_reinject(pit, true);
+ mutex_lock(&kvm->slots_lock);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+ KVM_PIT_MEM_LENGTH, &pit->dev);
if (ret < 0)
- goto fail;
+ goto fail_register_pit;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
- &pit->speaker_dev);
+ KVM_SPEAKER_BASE_ADDRESS, 4,
+ &pit->speaker_dev);
if (ret < 0)
- goto fail_unregister;
+ goto fail_register_speaker;
}
+ mutex_unlock(&kvm->slots_lock);
return pit;
-fail_unregister:
+fail_register_speaker:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
-
-fail:
- kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
-
+fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ kthread_destroy_worker(pit->worker);
+fail_kthread:
kfree(pit);
return NULL;
}
void kvm_free_pit(struct kvm *kvm)
{
- struct hrtimer *timer;
-
- if (kvm->arch.vpit) {
- kvm_unregister_irq_mask_notifier(kvm, 0,
- &kvm->arch.vpit->mask_notifier);
- kvm_unregister_irq_ack_notifier(kvm,
- &kvm->arch.vpit->pit_state.irq_ack_notifier);
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
- hrtimer_cancel(timer);
- kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- kfree(kvm->arch.vpit);
- }
-}
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int i;
-
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kpit_state *ps;
+ struct kvm_pit *pit = kvm->arch.vpit;
if (pit) {
- int inject = 0;
- ps = &pit->pit_state;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- raw_spin_lock(&ps->inject_lock);
- if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- raw_spin_unlock(&ps->inject_lock);
- if (inject)
- __inject_pit_timer_intr(kvm);
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_destroy_worker(pit->worker);
+ kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,8 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __I8254_H
#define __I8254_H
-#include "iodev.h"
+#include <linux/kthread.h>
+#include <kvm/iodev.h>
+
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -20,26 +28,28 @@ struct kvm_kpit_channel_state {
};
struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
struct kvm_kpit_channel_state channels[3];
u32 flags;
- struct kvm_timer pit_timer;
bool is_periodic;
- u32 speaker_data_on;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+
struct mutex lock;
- struct kvm_pit *pit;
- raw_spinlock_t inject_lock;
- unsigned long irq_ack;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
struct kvm_pit {
- unsigned long base_addresss;
struct kvm_io_device dev;
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
+ struct kthread_worker *worker;
+ struct kthread_work expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
@@ -49,10 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2003-2004 Fabrice Bellard
* Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -25,18 +26,27 @@
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
* Port from Qemu.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
#include "trace.h"
+#define pr_pic_unimpl(fmt, ...) \
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
+
+static void pic_irq_request(struct kvm *kvm, int level);
+
static void pic_lock(struct kvm_pic *s)
__acquires(&s->lock)
{
- raw_spin_lock(&s->lock);
+ spin_lock(&s->lock);
}
static void pic_unlock(struct kvm_pic *s)
@@ -44,22 +54,26 @@ static void pic_unlock(struct kvm_pic *s)
{
bool wakeup = s->wakeup_needed;
struct kvm_vcpu *vcpu;
+ unsigned long i;
s->wakeup_needed = false;
- raw_spin_unlock(&s->lock);
+ spin_unlock(&s->lock);
if (wakeup) {
- vcpu = s->kvm->bsp_vcpu;
- if (vcpu)
- kvm_vcpu_kick(vcpu);
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+ }
}
}
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
- s->isr_ack |= (1 << irq);
if (s != &s->pics_state->pics[0])
irq += 8;
/*
@@ -73,16 +87,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
pic_lock(s->pics_state);
}
-void kvm_pic_clear_isr_ack(struct kvm *kvm)
-{
- struct kvm_pic *s = pic_irqchip(kvm);
-
- pic_lock(s);
- s->pics[0].isr_ack = 0xff;
- s->pics[1].isr_ack = 0xff;
- pic_unlock(s);
-}
-
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
@@ -173,10 +177,7 @@ static void pic_update_irq(struct kvm_pic *s)
pic_set_irq1(&s->pics[0], 2, 0);
}
irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0)
- s->irq_request(s->irq_request_opaque, 1);
- else
- s->irq_request(s->irq_request_opaque, 0);
+ pic_irq_request(s->kvm, irq >= 0);
}
void kvm_pic_update_irq(struct kvm_pic *s)
@@ -186,18 +187,22 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
- struct kvm_pic *s = opaque;
- int ret = -1;
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
pic_lock(s);
- if (irq >= 0 && irq < PIC_NUM_PINS) {
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
- pic_update_irq(s);
- trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
- s->pics[irq >> 3].imr, ret == 0);
- }
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
pic_unlock(s);
return ret;
@@ -226,7 +231,9 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
- struct kvm_pic *s = pic_irqchip(kvm);
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ s->output = 0;
pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
@@ -242,7 +249,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/
irq2 = 7;
intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
} else
intno = s->pics[0].irq_base + irq;
} else {
@@ -258,35 +264,39 @@ int kvm_pic_read_irq(struct kvm *kvm)
return intno;
}
-void kvm_pic_reset(struct kvm_kpic_state *s)
+static void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq;
- struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
- u8 irr = s->irr, isr = s->imr;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ u8 edge_irr = s->irr & ~s->elcr;
+ bool found = false;
s->last_irr = 0;
- s->irr = 0;
+ s->irr &= s->elcr;
s->imr = 0;
- s->isr = 0;
- s->isr_ack = 0xff;
s->priority_add = 0;
- s->irq_base = 0;
- s->read_reg_select = 0;
- s->poll = 0;
s->special_mask = 0;
- s->init_state = 0;
- s->auto_eoi = 0;
- s->rotate_on_auto_eoi = 0;
- s->special_fully_nested_mode = 0;
- s->init4 = 0;
-
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
- if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
- if (irr & (1 << irq) || isr & (1 << irq)) {
- pic_clear_isr(s, irq);
- }
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
}
+ s->init_state = 1;
+
+ kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = true;
+ break;
+ }
+
+ if (!found)
+ return;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (edge_irr & (1 << irq))
+ pic_clear_isr(s, irq);
}
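
Note how the reworked reset models real 8259 behavior: ICW1 drops only the edge-triggered request bits (those not flagged as level in ELCR), while level-triggered lines stay pending because the device is still asserting them. The masking, in isolation:

    /* Sketch of the edge/level split on reset; elcr flags level-triggered pins. */
    #include <stdio.h>

    int main(void)
    {
            unsigned char irr = 0xa5, elcr = 0x0f;
            unsigned char edge_irr = irr & ~elcr;  /* cleared by the reset */
            unsigned char level_irr = irr & elcr;  /* survives the reset   */

            printf("edge_irr=%#x level_irr=%#x\n", edge_irr, level_irr); /* 0xa0, 0x5 */
            return 0;
    }
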
static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -297,19 +307,13 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
- kvm_pic_reset(s); /* init */
- /*
- * deassert a pending interrupt
- */
- s->pics_state->irq_request(s->pics_state->
- irq_request_opaque, 0);
- s->init_state = 1;
s->init4 = val & 1;
if (val & 0x02)
- printk(KERN_ERR "single mode not supported");
+ pr_pic_unimpl("single mode not supported");
if (val & 0x08)
- printk(KERN_ERR
- "level sensitive irq not supported");
+ pr_pic_unimpl(
+ "level sensitive irq not supported");
+ kvm_pic_reset(s);
} else if (val & 0x08) {
if (val & 0x04)
s->poll = 1;
@@ -356,10 +360,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
} else
switch (s->init_state) {
- case 0: /* normal mode */
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
pic_update_irq(s->pics_state);
break;
+ }
case 1:
s->irq_base = val & 0xf8;
s->init_state = 2;
@@ -392,7 +406,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
+ /* Bit 7 set means there's an interrupt */
+ ret |= 0x80;
} else {
+ /* Bit 7 clear means there's no interrupt */
ret = 0x07;
pic_update_irq(s->pics_state);
}
@@ -400,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
return ret;
}
-static u32 pic_ioport_read(void *opaque, u32 addr1)
+static u32 pic_ioport_read(void *opaque, u32 addr)
{
struct kvm_kpic_state *s = opaque;
- unsigned int addr;
int ret;
- addr = addr1;
- addr &= 1;
if (s->poll) {
- ret = pic_poll_read(s, addr1);
+ ret = pic_poll_read(s, addr);
s->poll = 0;
} else
- if (addr == 0)
+ if ((addr & 1) == 0)
if (s->read_reg_select)
ret = s->isr;
else
@@ -422,160 +436,220 @@ static u32 pic_ioport_read(void *opaque, u32 addr1)
return ret;
}
-static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+static void elcr_ioport_write(void *opaque, u32 val)
{
struct kvm_kpic_state *s = opaque;
s->elcr = val & s->elcr_mask;
}
-static u32 elcr_ioport_read(void *opaque, u32 addr1)
+static u32 elcr_ioport_read(void *opaque)
{
struct kvm_kpic_state *s = opaque;
return s->elcr;
}
-static int picdev_in_range(gpa_t addr)
-{
- switch (addr) {
- case 0x20:
- case 0x21:
- case 0xa0:
- case 0xa1:
- case 0x4d0:
- case 0x4d1:
- return 1;
- default:
- return 0;
- }
-}
-
-static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
-{
- return container_of(dev, struct kvm_pic, dev);
-}
-
-static int picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_pic *s,
gpa_t addr, int len, const void *val)
{
- struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
- if (!picdev_in_range(addr))
- return -EOPNOTSUPP;
if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte write\n");
+ pr_pic_unimpl("non byte write\n");
return 0;
}
- pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
case 0xa0:
case 0xa1:
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ pic_lock(s);
+ pic_ioport_write(&s->pics[1], addr, data);
+ pic_unlock(s);
break;
case 0x4d0:
case 0x4d1:
- elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ pic_lock(s);
+ elcr_ioport_write(&s->pics[addr & 1], data);
+ pic_unlock(s);
break;
+ default:
+ return -EOPNOTSUPP;
}
- pic_unlock(s);
return 0;
}
-static int picdev_read(struct kvm_io_device *this,
+static int picdev_read(struct kvm_pic *s,
gpa_t addr, int len, void *val)
{
- struct kvm_pic *s = to_pic(this);
- unsigned char data = 0;
- if (!picdev_in_range(addr))
- return -EOPNOTSUPP;
+ unsigned char *data = (unsigned char *)val;
if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte read\n");
+ memset(val, 0, len);
+ pr_pic_unimpl("non byte read\n");
return 0;
}
- pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
case 0xa0:
case 0xa1:
- data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_lock(s);
+ *data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_unlock(s);
break;
case 0x4d0:
case 0x4d1:
- data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ pic_lock(s);
+ *data = elcr_ioport_read(&s->pics[addr & 1]);
+ pic_unlock(s);
break;
+ default:
+ return -EOPNOTSUPP;
}
- *(unsigned char *)val = data;
- pic_unlock(s);
return 0;
}
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_elcr),
+ addr, len, val);
+}
+
+static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_elcr),
+ addr, len, val);
+}
+
/*
* callback when PIC0 irq status changed
*/
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm *kvm = opaque;
- struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
- struct kvm_pic *s = pic_irqchip(kvm);
- int irq = pic_get_irq(&s->pics[0]);
+ struct kvm_pic *s = kvm->arch.vpic;
- s->output = level;
- if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
- s->pics[0].isr_ack &= ~(1 << irq);
+ if (!s->output && level)
s->wakeup_needed = true;
- }
+ s->output = level;
}
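
The rewritten pic_irq_request() requests a wakeup only on a 0 -> 1 transition of the master PIC output, instead of consulting the removed isr_ack state. A tiny sketch of the edge detection:

    /* Sketch of the 0 -> 1 edge detection on the PIC output line. */
    #include <stdio.h>

    int main(void)
    {
            int output = 0;
            int levels[] = { 1, 1, 0, 1 };

            for (int i = 0; i < 4; i++) {
                    int wakeup = !output && levels[i];

                    printf("level=%d wakeup=%d\n", levels[i], wakeup);
                    output = levels[i];
            }
            return 0; /* prints wakeup = 1, 0, 0, 1 */
    }
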
-static const struct kvm_io_device_ops picdev_ops = {
- .read = picdev_read,
- .write = picdev_write,
+static const struct kvm_io_device_ops picdev_master_ops = {
+ .read = picdev_master_read,
+ .write = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+ .read = picdev_slave_read,
+ .write = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_elcr_ops = {
+ .read = picdev_elcr_read,
+ .write = picdev_elcr_write,
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+int kvm_pic_init(struct kvm *kvm)
{
struct kvm_pic *s;
int ret;
- s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
if (!s)
- return NULL;
- raw_spin_lock_init(&s->lock);
+ return -ENOMEM;
+ spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
- s->irq_request = pic_irq_request;
- s->irq_request_opaque = kvm;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
/*
* Initialize PIO device
*/
- kvm_iodevice_init(&s->dev, &picdev_ops);
+ kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+ kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+ kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops);
mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+ &s->dev_master);
+ if (ret < 0)
+ goto fail_unlock;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+ if (ret < 0)
+ goto fail_unreg_2;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr);
+ if (ret < 0)
+ goto fail_unreg_1;
+
mutex_unlock(&kvm->slots_lock);
- if (ret < 0) {
- kfree(s);
- return NULL;
- }
- return s;
+ kvm->arch.vpic = s;
+
+ return 0;
+
+fail_unreg_1:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(s);
+
+ return ret;
}
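
With the single picdev gone, each legacy port pair is registered as its own kvm_io_device, so the PIO bus can dispatch by address without a per-device range scan. A quick reference for the mapping established above (the port values are architectural, the helper is illustrative):

    #include <stdio.h>

    static const char *pic_port_owner(unsigned short port)
    {
            switch (port) {
            case 0x20: case 0x21:   return "dev_master";
            case 0xa0: case 0xa1:   return "dev_slave";
            case 0x4d0: case 0x4d1: return "dev_elcr";
            default:                return "unclaimed";
            }
    }

    int main(void)
    {
            unsigned short ports[] = { 0x20, 0xa1, 0x4d0, 0x60 };

            for (int i = 0; i < 4; i++)
                    printf("%#5x -> %s\n", ports[i], pic_port_owner(ports[i]));
            return 0;
    }
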
-void kvm_destroy_pic(struct kvm *kvm)
+void kvm_pic_destroy(struct kvm *kvm)
{
struct kvm_pic *vpic = kvm->arch.vpic;
- if (vpic) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
- kvm->arch.vpic = NULL;
- kfree(vpic);
- }
+ if (!vpic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000000..2c2783296aed
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: LGPL-2.1-or-later
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nospec.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+#include "trace.h"
+
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
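
ioapic_read_indirect() implements the classic IOREGSEL/IOWIN indirection: redirection entries are 64 bits wide, addressed as register 0x10 + 2*pin, with the low bit of IOREGSEL selecting which 32-bit half the window returns. A standalone decode of that addressing (the entry value is illustrative):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t entry = 0xff00000000000931ULL; /* illustrative: dest 0xff, vector 0x31 */
            uint32_t ioregsel = 0x13;               /* 0x10 + 2*1 + 1: pin 1, high half */
            uint32_t pin  = (ioregsel - 0x10) >> 1;
            uint32_t half = (ioregsel & 1) ? (uint32_t)(entry >> 32)
                                           : (uint32_t)entry;

            printf("pin=%u half=%#x\n", pin, half); /* pin=1 half=0xff000000 */
            return 0;
    }
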
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id,
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode)))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, dest_map->map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = e->fields.vector;
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, dest_map->map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ (vector == dest_map->vectors[vcpu->vcpu_id]) &&
+ (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map))) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ entry->fields.dest_id,
+ entry->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, entry->fields.vector))
+ continue;
+
+ /*
+ * If no longer has pending EOI in LAPICs, update
+ * EOI for this vector.
+ */
+ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector);
+ break;
+ }
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * AMD SVM AVIC accelerates EOI writes iff the interrupt is
+ * edge-triggered, in which case the in-kernel IOAPIC will not be
+ * able to receive the EOI. In this case, we do a lazy update of
+ * the pending EOI when trying to set the IOAPIC irq.
+ */
+ if (edge && kvm_apicv_activated(ioapic->kvm))
+ ioapic_lazy_update_eoi(ioapic, irq);
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * to masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if (edge) {
+ ioapic->irr_delivered &= ~mask;
+ if (old_irr == ioapic->irr) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
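
For edge-triggered pins, the coalescing test above boils down to "was the IRR bit already set when the new edge arrived". In isolation:

    /* Sketch: an edge IRQ is coalesced when its IRR bit was already pending. */
    #include <stdio.h>

    int main(void)
    {
            unsigned irr = 0, mask = 1u << 4;
            unsigned old;

            old = irr; irr |= mask;
            printf("first edge coalesced?  %s\n", old == irr ? "yes" : "no"); /* no  */
            old = irr; irr |= mask;
            printf("second edge coalesced? %s\n", old == irr ? "yes" : "no"); /* yes */
            return 0;
    }
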
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+
+ kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm,
+ e->fields.vector, ioapic_handled_vectors);
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
+{
+ if (!ioapic_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
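
A hypothetical consumer of the mask-notifier API, mirroring how the in-kernel PIT gates timer reinjection when the guest masks IRQ0; the demo_* names are invented for illustration, and this is a kernel-side fragment rather than a standalone program:

    /* Hypothetical kernel-side consumer (names demo_* are invented). */
    static void demo_mask_cb(struct kvm_irq_mask_notifier *kimn, bool masked)
    {
            pr_info("GSI %d is now %s\n", kimn->irq,
                    masked ? "masked" : "unmasked");
    }

    static struct kvm_irq_mask_notifier demo_kimn = { .func = demo_mask_cb };

    /* During device setup, with a valid struct kvm *kvm:        */
    /*         kvm_register_irq_mask_notifier(kvm, 0, &demo_kimn);   */
    /* and symmetrically on teardown:                             */
    /*         kvm_unregister_irq_mask_notifier(kvm, 0, &demo_kimn); */
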
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ union kvm_ioapic_redirect_entry *e;
+ int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
+ old_dest_id = e->fields.dest_id;
+ old_dest_mode = e->fields.dest_mode;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ }
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+ * Some OSes (Linux, Xen) assume that Remote IRR bit will
+ * be cleared by IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
+ /*
+ * Pending status in irr may be outdated: the IRQ line may have
+ * already been deasserted by a device while the IRQ was masked.
+ * This occurs, for instance, if the interrupt is handled in a
+ * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
+ * case the guest acknowledges the interrupt to the device in
+ * its threaded irq handler, i.e. after the EOI but before
+ * unmasking, so at the time of unmasking the IRQ line is
+ * already down but our pending irr bit is still set. In such
+ * cases, injecting this pending interrupt to the guest is
+ * buggy: the guest will receive an extra unwanted interrupt.
+ *
+ * So we need to check here if the IRQ is actually still pending.
+ * As we are generally not able to probe the IRQ line status
+ * directly, we do it through irqfd resampler. Namely, we clear
+ * the pending status and notify the resampler that this interrupt
+ * is done, without actually injecting it into the guest. If the
+ * IRQ line is actually already deasserted, we are done. If it is
+ * still asserted, a new interrupt will be shortly triggered
+ * through irqfd and injected into the guest.
+ *
+ * If, however, it's not possible to resample (no irqfd resampler
+ * registered for this irq), then unconditionally inject this
+ * pending interrupt into the guest, so that the guest will not
+ * miss an interrupt, although it may get an extra unwanted one.
+ */
+ if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
+ ioapic->irr &= ~(1 << index);
+ else
+ ioapic_service(ioapic, index, false);
+ }
+ if (e->fields.delivery_mode == APIC_DM_FIXED) {
+ struct kvm_lapic_irq irq;
+
+ irq.vector = e->fields.vector;
+ irq.delivery_mode = e->fields.delivery_mode << 8;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+ irq.level = false;
+ irq.trig_mode = e->fields.trig_mode;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.dest_id = e->fields.dest_id;
+ irq.msi_redir_hint = false;
+ bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ vcpu_bitmap);
+ if (old_dest_mode != e->fields.dest_mode ||
+ old_dest_id != e->fields.dest_id) {
+ /*
+ * Update vcpu_bitmap with vcpus specified in
+ * the previous request as well. This is done to
+ * keep ioapic_handled_vectors synchronized.
+ */
+ irq.dest_id = old_dest_id;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(
+ !!e->fields.dest_mode);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ vcpu_bitmap);
+ }
+ kvm_make_scan_ioapic_request_mask(ioapic->kvm,
+ vcpu_bitmap);
+ } else {
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ }
+ break;
+ }
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask ||
+ (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ entry->fields.remote_irr))
+ return -1;
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode);
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = APIC_DEST_NOSHORT;
+ irqe.msi_redir_hint = false;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr_delivered |= 1 << irq;
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+ * ensures that it is only called if it is >= zero (namely,
+ * if rtc_irq_check_coalesced returns false).
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ &ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin];
+
+ /*
+ * We are dropping the lock while calling the ack notifiers because
+ * ack notifier callbacks for assigned devices call into the IOAPIC
+ * recursively. Since remote_irr is cleared only after the call to
+ * the notifiers, if the same vector is delivered while the lock is
+ * dropped it will be put into irr and will be delivered after the
+ * ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ return;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << pin))) {
+ ++ioapic->irq_eoi[pin];
+ if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[pin] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, pin, false);
+ }
+ } else {
+ ioapic->irq_eoi[pin] = 0;
+ }
+}
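
The storm throttle above kicks in after IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT (10000) back-to-back EOIs on the same still-asserted pin, deferring redelivery by HZ/100 jiffies, i.e. 10 ms regardless of the configured tick rate:

    #include <stdio.h>

    int main(void)
    {
            unsigned hz = 1000;        /* illustrative CONFIG_HZ        */
            unsigned delay = hz / 100; /* matches the HZ / 100 above    */

            printf("defer %u jiffies = %u ms\n", delay, delay * 1000 / hz);
            return 0;                  /* 10 ms for any HZ >= 100       */
    }
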
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ int i;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ rtc_irq_eoi(ioapic, vcpu, vector);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+ kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return (addr >= ioapic->base_address &&
+ addr < ioapic->base_address + IOAPIC_MEM_LENGTH);
+}
+
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
+ rtc_irq_eoi_tracking_reset(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+}
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ state->irr &= ~ioapic->irr_delivered;
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ kvm_make_scan_ioapic_request(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000000..bf28dbc11ff6
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include "irq.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/* ioapic delivery mode */
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#define RTC_GSI 8
+
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPU_IDS];
+};
+
+struct rtc_status {
+ int pending_eoi;
+ struct dest_map dest_map;
+};
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
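
A user-space mirror of the redirection-entry layout above, handy for experimenting with the encoding; bitfield ordering is compiler-defined, so this matches the kernel union only on little-endian GCC/Clang targets:

    #include <stdio.h>
    #include <stdint.h>

    union redirect_entry {
            uint64_t bits;
            struct {
                    uint64_t vector:8;
                    uint64_t delivery_mode:3;
                    uint64_t dest_mode:1;
                    uint64_t delivery_status:1;
                    uint64_t polarity:1;
                    uint64_t remote_irr:1;
                    uint64_t trig_mode:1;
                    uint64_t mask:1;
                    uint64_t reserved:39;
                    uint64_t dest_id:8;
            } fields;
    };

    int main(void)
    {
            union redirect_entry e = { .bits = 0 };

            e.fields.vector = 0x30;
            e.fields.trig_mode = 1;  /* level-triggered */
            e.fields.dest_id = 1;
            printf("bits=%#llx\n", (unsigned long long)e.bits);
            return 0;                /* 0x100000000008030 on x86-64 */
    }
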
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ spinlock_t lock;
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+ u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline int ioapic_in_kernel(struct kvm *kvm)
+{
+ return irqchip_full(kvm);
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors);
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..7cc8950005b6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,30 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* irq.c: API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
+#include "xen.h"
/*
* check if there are pending timer events
@@ -32,14 +26,84 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- int ret;
+ int r = 0;
- ret = pit_has_pending_timer(vcpu);
- ret |= apic_has_pending_timer(vcpu);
+ if (lapic_in_kernel(vcpu))
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
- return ret;
+ return r;
}
-EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
+/*
+ * check if there is a pending interrupt from a
+ * non-APIC source, without intack.
+ */
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ /*
+ * FIXME: interrupt.injected represents an interrupt whose
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not being able to distinguish
+ * whether it should exit from L2 to L1 on EXTERNAL_INTERRUPT
+ * because of a pending interrupt or re-inject an already
+ * injected interrupt.
+ */
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.injected;
+
+ if (kvm_xen_has_interrupt(v))
+ return 1;
+
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
+ return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
+}
+
+/*
+ * check if there is an injectable interrupt:
+ * when virtual interrupt delivery is enabled,
+ * interrupts from the APIC are handled by hardware,
+ * so we don't need to check them here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_injectable_intr);
/*
* check if there is pending interrupt without
@@ -47,55 +111,523 @@ EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
+ if (kvm_cpu_has_extint(v))
+ return 1;
- if (!irqchip_in_kernel(v->kvm))
- return v->arch.interrupt.pending;
+ if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected)
+ return kvm_x86_call(protected_apic_has_interrupt)(v);
- if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm); /* PIC */
- return s->output;
- } else
- return 0;
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_has_interrupt);
+
+/*
+ * Read the pending interrupt vector (from a non-APIC
+ * source) and intack.
+ */
+int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
+ return -1;
}
- return 1;
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+#ifdef CONFIG_KVM_XEN
+ if (kvm_xen_has_interrupt(v))
+ return v->kvm->arch.xen.upcall_vector;
+#endif
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
-EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_get_extint);
/*
* Read pending interrupt vector and intack.
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- struct kvm_pic *s;
- int vector;
+ int vector = kvm_cpu_get_extint(v);
+ if (vector != -1)
+ return vector; /* PIC */
- if (!irqchip_in_kernel(v->kvm))
- return v->arch.interrupt.nr;
+ vector = kvm_apic_has_interrupt(v); /* APIC */
+ if (vector != -1)
+ kvm_apic_ack_interrupt(v, vector);
- vector = kvm_get_apic_interrupt(v); /* APIC */
- if (vector == -1) {
- if (kvm_apic_accept_pic_intr(v)) {
- s = pic_irqchip(v->kvm);
- s->output = 0; /* PIC */
- vector = kvm_pic_read_irq(v->kvm);
- }
- }
return vector;
}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
- kvm_inject_apic_timer_irqs(vcpu);
- kvm_inject_pit_timer_irqs(vcpu);
- /* TODO: PIT, RTC etc. */
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
+ kvm_x86_call(migrate_timers)(vcpu);
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+ bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
+
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
+}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
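
kvm_msi_to_lapic_irq() leans on the arch msi_msg accessors; underneath, the classic xAPIC MSI layout puts the destination id in address bits 19:12, the vector in data bits 7:0 and the trigger mode in data bit 15. Decoded by hand on illustrative values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t address_lo = 0xfee01000; /* illustrative: dest id 0x01   */
            uint32_t data = 0x8041;           /* level-triggered, vector 0x41 */

            printf("dest=%u vector=%#x level=%u\n",
                   (address_lo >> 12) & 0xff, data & 0xff, (data >> 15) & 1);
            return 0;
    }
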
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or has already
+ * received a level-triggered IRQ. The EOI needs to be intercepted
+ * and forwarded to I/O APIC emulation so that the IRQ can be
+ * de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret)
+ kvm->arch.nr_possible_bypass_irqs--;
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
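
The table dual-routes GSIs 0-15 through both the IOAPIC and the 8259 pair, while GSIs 16-23 exist on the IOAPIC only. How a low GSI lands on a PIC pin, per SELECT_PIC and the (irq) % 8 in PIC_ROUTING_ENTRY:

    #include <stdio.h>

    int main(void)
    {
            for (int gsi = 0; gsi < 16; gsi += 5)
                    printf("gsi %2d -> %s pin %d\n", gsi,
                           gsi < 8 ? "PIC master" : "PIC slave", gsi % 8);
            return 0; /* 0->master 0, 5->master 5, 10->slave 2, 15->slave 7 */
    }
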
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..34f4a78a7a01 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* irq.h: in kernel interrupt controller related definitions
* Copyright (c) 2007, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
*/
#ifndef __IRQ_H
@@ -27,10 +15,11 @@
#include <linux/kvm_host.h>
#include <linux/spinlock.h>
-#include "iodev.h"
-#include "ioapic.h"
+#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -38,14 +27,11 @@
struct kvm;
struct kvm_vcpu;
-typedef void irq_request_func(void *opaque, int level);
-
struct kvm_kpic_state {
u8 last_irr; /* edge detection */
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
- u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
@@ -58,44 +44,73 @@ struct kvm_kpic_state {
u8 init4; /* true if 4 byte init */
u8 elcr; /* PIIX edge/trigger selection */
u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
struct kvm_pic *pics_state;
};
struct kvm_pic {
- raw_spinlock_t lock;
+ spinlock_t lock;
bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
int output; /* intr from master PIC */
- struct kvm_io_device dev;
- void (*ack_notifier)(void *opaque, int irq);
- unsigned long irq_states[16];
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_elcr;
+ unsigned long irq_states[PIC_NUM_PINS];
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_destroy_pic(struct kvm *kvm);
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
-void kvm_pic_clear_isr_ack(struct kvm *kvm);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+
+static inline int irqchip_full(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+ return false;
+}
+#endif
-static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+static inline int pic_in_kernel(struct kvm *kvm)
{
- return kvm->arch.vpic;
+ return irqchip_full(kvm);
}
-static inline int irqchip_in_kernel(struct kvm *kvm)
+
+static inline int irqchip_split(struct kvm *kvm)
{
- int ret;
+ int mode = kvm->arch.irqchip_mode;
- ret = (pic_irqchip(kvm) != NULL);
+ /* Matches smp_wmb() when setting irqchip_mode */
smp_rmb();
- return ret;
+ return mode == KVM_IRQCHIP_SPLIT;
}
-void kvm_pic_reset(struct kvm_kpic_state *s);
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode != KVM_IRQCHIP_NONE;
+}
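/*
 * Illustrative sketch, not part of this patch: the smp_rmb() readers above
 * pair with an smp_wmb() on whatever path sets irqchip_mode. The helper name
 * below is hypothetical; it only shows the required store ordering.
 */
static inline void example_publish_irqchip_mode(struct kvm *kvm,
						enum kvm_irqchip_mode mode)
{
	/* All irqchip setup stores must be visible before the mode ... */
	smp_wmb();
	/* ... so irqchip_in_kernel() readers never see a half-built chip. */
	kvm->arch.irqchip_mode = mode;
}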
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
@@ -104,7 +119,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
-int pit_has_pending_timer(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c
new file mode 100644
index 000000000000..24a710d37323
--- /dev/null
+++ b/arch/x86/kvm/kvm-asm-offsets.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include "vmx/vmx.h"
+#include "svm/svm.h"
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_KVM_AMD)) {
+ BLANK();
+ OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs);
+ OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb);
+ OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl);
+ OFFSET(SVM_vmcb01, vcpu_svm, vmcb01);
+ OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa);
+ OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa);
+ }
+
+ if (IS_ENABLED(CONFIG_KVM_INTEL)) {
+ BLANK();
+ OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl);
+ }
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..8ddb01191d6f 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,56 +1,188 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
-#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#include <linux/kvm_host.h>
+
+#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \
+ | X86_CR4_CET)
+
+#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
+#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP)
+
+static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+
+#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
+static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
+{ \
+ return vcpu->arch.regs[VCPU_REGS_##uname]; \
+} \
+static __always_inline void kvm_##lname##_write(struct kvm_vcpu *vcpu, \
+ unsigned long val) \
+{ \
+ vcpu->arch.regs[VCPU_REGS_##uname] = val; \
+}
+BUILD_KVM_GPR_ACCESSORS(rax, RAX)
+BUILD_KVM_GPR_ACCESSORS(rbx, RBX)
+BUILD_KVM_GPR_ACCESSORS(rcx, RCX)
+BUILD_KVM_GPR_ACCESSORS(rdx, RDX)
+BUILD_KVM_GPR_ACCESSORS(rbp, RBP)
+BUILD_KVM_GPR_ACCESSORS(rsi, RSI)
+BUILD_KVM_GPR_ACCESSORS(rdi, RDI)
+#ifdef CONFIG_X86_64
+BUILD_KVM_GPR_ACCESSORS(r8, R8)
+BUILD_KVM_GPR_ACCESSORS(r9, R9)
+BUILD_KVM_GPR_ACCESSORS(r10, R10)
+BUILD_KVM_GPR_ACCESSORS(r11, R11)
+BUILD_KVM_GPR_ACCESSORS(r12, R12)
+BUILD_KVM_GPR_ACCESSORS(r13, R13)
+BUILD_KVM_GPR_ACCESSORS(r14, R14)
+BUILD_KVM_GPR_ACCESSORS(r15, R15)
+#endif
+
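/*
 * Usage sketch (not part of this patch) for the accessors generated above:
 * each BUILD_KVM_GPR_ACCESSORS() invocation yields a kvm_<reg>_read() and
 * kvm_<reg>_write() pair that accesses vcpu->arch.regs[] directly.
 */
static inline void example_double_rcx(struct kvm_vcpu *vcpu)
{
	unsigned long rcx = kvm_rcx_read(vcpu);

	kvm_rcx_write(vcpu, rcx * 2);
}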
+/*
+ * Using the register cache from interrupt context is generally not allowed, as
+ * caching a register and marking it available/dirty can't be done atomically,
+ * i.e. accesses from interrupt context may clobber state or read stale data if
+ * the vCPU task is in the process of updating the cache. The exception is if
+ * KVM is handling a PMI IRQ/NMI VM-Exit, as that bound code sequence doesn't
+ * touch the cache, it runs after the cache is reset (post VM-Exit), and PMIs
+ * need to access several registers that are cacheable.
+ */
+#define kvm_assert_register_caching_allowed(vcpu) \
+ lockdep_assert_once(in_task() || kvm_arch_pmi_in_guest(vcpu))
+
+/*
+ * avail dirty
+ * 0 0 register in VMCS/VMCB
+ * 0 1 *INVALID*
+ * 1 0 register in vcpu->arch
+ * 1 1 register in vcpu->arch, needs to be stored back
+ */
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+/*
+ * kvm_register_test_and_mark_available() is a special snowflake that uses an
+ * arch bitop directly to avoid the explicit instrumentation that comes with
+ * the generic bitops. This allows code that cannot be instrumented (noinstr
+ * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
+ */
+static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ kvm_assert_register_caching_allowed(vcpu);
+ return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
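/*
 * Read-through sketch (illustrative, mirrors kvm_register_read_raw() below)
 * for the avail/dirty table above: if the register still lives in the
 * VMCS/VMCB (avail=0), have vendor code cache it, then read vcpu->arch.regs.
 */
static inline unsigned long example_cached_read(struct kvm_vcpu *vcpu,
						enum kvm_reg reg)
{
	if (!kvm_register_is_available(vcpu, reg))
		kvm_x86_call(cache_reg)(vcpu, reg);	/* avail: 0 -> 1 */

	return vcpu->arch.regs[reg];
}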
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+/*
+ * The "raw" register helpers are only for cases where the full 64 bits of a
+ * register are read/written irrespective of current vCPU mode. In other words,
+ * odds are good you shouldn't be using the raw variants.
+ */
+static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg)
{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, reg);
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return 0;
+
+ if (!kvm_register_is_available(vcpu, reg))
+ kvm_x86_call(cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
-static inline void kvm_register_write(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
- unsigned long val)
+static inline void kvm_register_write_raw(struct kvm_vcpu *vcpu, int reg,
+ unsigned long val)
{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return;
+
vcpu->arch.regs[reg] = val;
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ kvm_register_mark_dirty(vcpu, reg);
}
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
- return kvm_register_read(vcpu, VCPU_REGS_RIP);
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RIP);
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
- kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+ kvm_register_write_raw(vcpu, VCPU_REGS_RIP, val);
+}
+
+static inline unsigned long kvm_rsp_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read_raw(vcpu, VCPU_REGS_RSP);
+}
+
+static inline void kvm_rsp_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write_raw(vcpu, VCPU_REGS_RSP, val);
}
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
+ might_sleep(); /* on svm */
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR);
- return vcpu->arch.pdptrs[index];
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
+{
+ vcpu->arch.walk_mmu->pdptrs[index] = value;
}
static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
- if (tmask & vcpu->arch.cr0_guest_owned_bits)
- kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
+ !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
+static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr0_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr0_bit));
+
+ return !!kvm_read_cr0_bits(vcpu, cr0_bit);
+}
+
static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, ~0UL);
@@ -59,14 +191,59 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
- if (tmask & vcpu->arch.cr4_guest_owned_bits)
- kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
+ !kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
+static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr4_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr4_bit));
+
+ return !!kvm_read_cr4_bits(vcpu, cr4_bit);
+}
+
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3);
+ return vcpu->arch.cr3;
+}
+
static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, ~0UL);
}
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_rax_read(vcpu) & -1u)
+ | ((u64)(kvm_rdx_read(vcpu) & -1u) << 32);
+}
+
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+ vcpu->stat.guest_mode = 1;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
+
+ vcpu->stat.guest_mode = 0;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
new file mode 100644
index 000000000000..fb3dab4b5a53
--- /dev/null
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+#include <asm/desc_defs.h>
+#include "fpu.h"
+
+struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
+
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+ u8 async_page_fault;
+ unsigned long exit_qualification;
+};
+
+/*
+ * This struct is used to carry enough information from the instruction
+ * decoder to main KVM so that a decision can be made whether the
+ * instruction needs to be intercepted or not.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 src_type; /* type of source operand */
+ u8 dst_type; /* type of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 rip; /* rip of the instruction */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
+/* Emulation during event vectoring is unhandleable. */
+#define X86EMUL_UNHANDLEABLE_VECTORING 7
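/*
 * Dispatch sketch for the X86EMUL_* codes above (illustrative, hypothetical
 * helper): only CONTINUE and RETRY_INSTR keep the emulation loop alive; the
 * other codes terminate this emulation attempt one way or another.
 */
static inline bool example_emul_rc_keeps_going(int rc)
{
	return rc == X86EMUL_CONTINUE || rc == X86EMUL_RETRY_INSTR;
}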
+
+/* x86-specific emulation flags */
+#define X86EMUL_F_WRITE BIT(0)
+#define X86EMUL_F_FETCH BIT(1)
+#define X86EMUL_F_IMPLICIT BIT(2)
+#define X86EMUL_F_INVLPG BIT(3)
+#define X86EMUL_F_MSR BIT(4)
+#define X86EMUL_F_DT_LOAD BIT(5)
+
+struct x86_emulate_ops {
+ void (*vm_bugged)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * read_gpr: read a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ */
+ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
+ /*
+ * write_gpr: write a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ * @val: value to write.
+ */
+ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*read_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *fault, bool system);
+
+ /*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*write_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault, bool system);
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, const void *val,
+ unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *fault);
+ void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
+
+ int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count);
+
+ int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, const void *val,
+ unsigned int count);
+
+ bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3, int seg);
+ void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3, int seg);
+ unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
+ int seg);
+ void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
+ int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
+ int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+ int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
+ void (*halt)(struct x86_emulate_ctxt *ctxt);
+ void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+ int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
+ int (*intercept)(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
+ bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool exact_only);
+ bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt);
+
+ void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
+
+ bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
+ void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
+ int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr);
+ int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
+
+ gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
+ unsigned int flags);
+
+ bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr,
+ unsigned int flags);
+};
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned int count;
+ union {
+ unsigned long orig_val;
+ u64 orig_val64;
+ };
+ union {
+ unsigned long *reg;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
+ unsigned xmm;
+ unsigned mm;
+ } addr;
+ union {
+ unsigned long val;
+ u64 val64;
+ char valptr[sizeof(avx256_t)];
+ sse128_t vec_val;
+ avx256_t vec_val2;
+ u64 mm_val;
+ void *data;
+ } __aligned(32);
+};
+
+#define X86_MAX_INSTRUCTION_LENGTH 15
+
+struct fetch_cache {
+ u8 data[X86_MAX_INSTRUCTION_LENGTH];
+ u8 *ptr;
+ u8 *end;
+};
+
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
+/* Execution mode, passed to the emulator. */
+enum x86emul_mode {
+ X86EMUL_MODE_REAL, /* Real mode. */
+ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
+ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
+ X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
+ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
+};
+
+/*
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+struct fastop;
+
+typedef void (*fastop_t)(struct fastop *);
+
+/*
+ * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is
+ * tracked/accessed via _eip, and except for RIP relative addressing, which
+ * also uses _eip, RIP cannot be a register operand nor can it be an operand in
+ * a ModRM or SIB byte.
+ */
+#ifdef CONFIG_X86_64
+#define NR_EMULATOR_GPRS 16
+#else
+#define NR_EMULATOR_GPRS 8
+#endif
+
+/*
+ * Distinguish between no prefix and REX (and, in the future, REX2).
+ */
+enum rex_type {
+ REX_NONE,
+ REX_PREFIX,
+};
+
+struct x86_emulate_ctxt {
+ void *vcpu;
+ const struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ enum x86emul_mode mode;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool perm_ok; /* do not check permissions if true */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /* GPA available */
+ bool gpa_available;
+ gpa_t gpa_val;
+
+ /*
+ * decode cache
+ */
+
+ /* current opcode length in bytes */
+ u8 opcode_len;
+ u8 b;
+ u8 intercept;
+ bool op_prefix;
+ u8 op_bytes;
+ u8 ad_bytes;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ fastop_t fop;
+ };
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+
+ bool rip_relative;
+ enum rex_type rex_prefix;
+ u8 rex_bits;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u16 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u16 regs_dirty;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 modrm_seg;
+ u8 seg_override;
+ u64 d;
+ unsigned long _eip;
+
+ /* Here begins the usercopy section. */
+ struct operand src;
+ struct operand src2;
+ struct operand dst;
+ struct operand memop;
+ unsigned long _regs[NR_EMULATOR_GPRS];
+ struct operand *memopp;
+ struct fetch_cache fetch;
+ struct read_cache io_read;
+ struct read_cache mem_read;
+ bool is_branch;
+};
+
+#define KVM_EMULATOR_BUG_ON(cond, ctxt) \
+({ \
+ int __ret = (cond); \
+ \
+ if (WARN_ON_ONCE(__ret)) \
+ ctxt->ops->vm_bugged(ctxt); \
+ unlikely(__ret); \
+})
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
+
+/* CPUID vendors */
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273
+
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975
+#define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e
+
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
+
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561
+#define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561
+
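/*
 * The vendor constants above are the 12-byte CPUID vendor strings split into
 * little-endian register words (ebx, edx, ecx order). Decoding one by hand
 * (illustrative):
 *
 *   "GenuineIntel" = "Genu" | "ineI" | "ntel"
 *     ebx = 'u'<<24 | 'n'<<16 | 'e'<<8 | 'G' = 0x756e6547
 *     edx = 'I'<<24 | 'e'<<16 | 'n'<<8 | 'i' = 0x49656e69
 *     ecx = 'l'<<24 | 'e'<<16 | 't'<<8 | 'n' = 0x6c65746e
 */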
+static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx)
+{
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
+static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx)
+{
+ return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) ||
+ (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx);
+}
+
+static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx)
+{
+ return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx;
+}
+
+enum x86_intercept_stage {
+ X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_rdpid,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+ x86_intercept_xsetbv,
+
+ nr_x86_intercepts
+};
+
+/* Host execution mode. */
+#if defined(CONFIG_X86_32)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+
+static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
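/*
 * Usage sketch for the lazy GPR cache above (illustrative, hypothetical
 * helper): reg_rmw() faults the GPR into _regs[] on first use (regs_valid)
 * and marks it for writeback (regs_dirty); dirty registers are flushed back
 * through ->write_gpr() when the emulator writes back its register cache.
 */
static inline void example_increment_gpr(struct x86_emulate_ctxt *ctxt,
					 unsigned nr)
{
	ulong *reg = reg_rmw(ctxt, nr);	/* cached and dirtied */

	*reg += 1;			/* written back later */
}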
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
new file mode 100644
index 000000000000..ee53e75a60cb
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <asm/mshyperv.h>
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+
+struct kvm_hv_tlb_range {
+ u64 start_gfn;
+ u64 pages;
+};
+
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+ void *data)
+{
+ struct kvm_hv_tlb_range *range = data;
+
+ return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
+ range->pages);
+}
+
+static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
+ struct kvm_hv_tlb_range *range)
+{
+ if (range)
+ return hyperv_flush_guest_mapping_range(root_tdp,
+ kvm_fill_hv_flush_list_func, (void *)range);
+ else
+ return hyperv_flush_guest_mapping(root_tdp);
+}
+
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+ struct kvm_hv_tlb_range *range)
+{
+ struct kvm_arch *kvm_arch = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int ret = 0, nr_unique_valid_roots;
+ unsigned long i;
+ hpa_t root;
+
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+
+ if (!VALID_PAGE(kvm_arch->hv_root_tdp)) {
+ nr_unique_valid_roots = 0;
+
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ root = vcpu->arch.hv_root_tdp;
+ if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_arch->hv_root_tdp = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_tdp(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
+ }
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ } else {
+ ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range);
+ }
+
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ return ret;
+}
+
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
+{
+ struct kvm_hv_tlb_range range = {
+ .start_gfn = start_gfn,
+ .pages = nr_pages,
+ };
+
+ return __hv_flush_remote_tlbs_range(kvm, &range);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return __hv_flush_remote_tlbs_range(kvm, NULL);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_flush_remote_tlbs);
+
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(hv_track_root_tdp);
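/*
 * Hookup sketch (illustrative, not part of this patch; assumes kvm_x86_ops
 * exposes flush_remote_tlbs/flush_remote_tlbs_range hooks and that
 * hv_is_hyperv_initialized() is available from asm/mshyperv.h): vendor code
 * opts in to Hyper-V assisted remote flushes by pointing kvm_x86_ops at the
 * helpers above, which is also the condition hv_track_root_tdp() checks.
 */
static void example_enable_hv_remote_flush(void)
{
	if (hv_is_hyperv_initialized()) {
		kvm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
		kvm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
	}
}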
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
new file mode 100644
index 000000000000..eefab3dc8498
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
+void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
+static inline hpa_t hv_get_partition_assist_page(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Partition assist page is something which Hyper-V running in L0
+ * requires from KVM running in L1 before direct TLB flush for L2
+ * guests can be enabled. KVM doesn't currently use the page but to
+ * comply with TLFS it still needs to be allocated. For now, this
+ * is a single page shared among all vCPUs.
+ */
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &vcpu->kvm->arch.hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+
+ if (!*p_hv_pa_pg)
+ return INVALID_PAGE;
+
+ return __pa(*p_hv_pa_pg);
+}
+#else /* !CONFIG_HYPERV */
+static inline int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+}
+#endif /* !CONFIG_HYPERV */
+
+#endif
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
deleted file mode 100644
index 64bc6ea78d90..000000000000
--- a/arch/x86/kvm/kvm_timer.h
+++ /dev/null
@@ -1,16 +0,0 @@
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..1597dd0b0cc6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Local APIC virtualization
@@ -5,6 +6,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2007 Novell
* Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Dor Laor <dor.laor@qumranet.com>
@@ -12,10 +14,8 @@
* Yaozu (Eddie) Dong <eddie.dong@intel.com>
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -24,19 +24,28 @@
#include <linux/smp.h>
#include <linux/hrtimer.h>
#include <linux/io.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <asm/apic.h>
#include <asm/processor.h>
+#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <asm/delay.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
#include "irq.h"
+#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
+#include "cpuid.h"
+#include "hyperv.h"
+#include "smm.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -44,99 +53,480 @@
#define mod_64(x, y) ((x) % (y))
#endif
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
+/* 14 is the version for Xeon and Pentium 8.4.8 */
+#define APIC_VERSION 0x14UL
+#define LAPIC_MMIO_LENGTH (1 << 12)
-#define APIC_BUS_CYCLE_NS 1
+/*
+ * Enable local APIC timer advancement (tscdeadline mode only) with adaptive
+ * tuning. When enabled, KVM programs the host timer event to fire early, i.e.
+ * before the deadline expires, to account for the delay between taking the
+ * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume
+ * the guest, i.e. so that the interrupt arrives in the guest with minimal
+ * latency relative to the deadline programmed by the guest.
+ */
+static bool lapic_timer_advance __read_mostly = true;
+module_param(lapic_timer_advance, bool, 0444);
-/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
+#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
+/* step-by-step approximation to mitigate fluctuation */
+#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
-#define APIC_LVT_NUM 6
-/* 14 is the version for Xeon and Pentium 8.4.8*/
-#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
-#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK 0xc0000
-#define APIC_DEST_NOSHORT 0x0
-#define APIC_DEST_MASK 0x800
-#define MAX_APIC_VECTOR 256
+static bool __read_mostly vector_hashing_enabled = true;
+module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444);
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
-static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- return *((u32 *) (apic->regs + reg_off));
+ apic_set_reg(apic->regs, reg_off, val);
}
-static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
{
- *((u32 *) (apic->regs + reg_off)) = val;
+ return apic_get_reg64(apic->regs, reg);
}
-static inline int apic_test_and_set_vector(int vec, void *bitmap)
+static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
+ int reg, u64 val)
{
- return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ apic_set_reg64(apic->regs, reg, val);
}
-static inline int apic_test_and_clear_vector(int vec, void *bitmap)
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
{
- return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+ apic_test_vector(vector, apic->regs + APIC_IRR);
}
-static inline void apic_set_vector(int vec, void *bitmap)
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_has_noapic_vcpu);
+
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
+}
+
+#define LVT_MASK \
+ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ return apic->vcpu->vcpu_id;
}
-static inline void apic_clear_vector(int vec, void *bitmap)
+static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
-static inline int apic_hw_enabled(struct kvm_lapic *apic)
+static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
{
- return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return kvm_x86_ops.set_hv_timer
+ && !(kvm_mwait_in_guest(vcpu->kvm) ||
+ kvm_can_post_timer_interrupt(vcpu));
}
-static inline int apic_sw_enabled(struct kvm_lapic *apic)
+static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
+ return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
}
-static inline int apic_enabled(struct kvm_lapic *apic)
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
{
- return apic_sw_enabled(apic) && apic_hw_enabled(apic);
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
}
-#define LVT_MASK \
- (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+ u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
-#define LINT_MASK \
- (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
- APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ offset = array_index_nospec(offset, map->max_apic_id + 1);
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
+ *mask = dest_id & 0xf;
+ return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+}
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
{
- return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * For simplicity, KVM always allocates enough space for all possible
+ * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on
+ * without the optimized map.
+ */
+ if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
+ return -EINVAL;
+
+ /*
+ * Bail if a vCPU was added and/or enabled its APIC between allocating
+ * the map and doing the actual calculations for the map. Note, KVM
+ * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
+ * the compiler decides to reload x2apic_id after this check.
+ */
+ if (x2apic_id > new->max_apic_id)
+ return -E2BIG;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+ * 32-bit value. Any unwanted aliasing due to truncation results will
+ * be detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+ * Apply KVM's hotplug hack if userspace has enabled 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+}
+
+static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+ * reversing the LDR calculation to get the cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic))
+ return;
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+}
+
+/*
+ * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
+ *
+ * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with
+ * apic_map_lock_held.
+ */
+enum {
+ CLEAN,
+ UPDATE_IN_PROGRESS,
+ DIRTY
+};
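/*
 * Transition sketch for the states above (illustrative): writers dirty the
 * map locklessly, the recalculation path claims and completes it under
 * apic_map_lock, and a concurrent dirtying leaves it DIRTY so recalc retries.
 *
 *   vCPU changes LDR/ID:  atomic_set_release(&map_dirty, DIRTY)
 *   recalc (locked):      atomic_cmpxchg_acquire(&map_dirty, DIRTY, UPDATE_IN_PROGRESS)
 *   recalc finished:      atomic_cmpxchg_release(&map_dirty, UPDATE_IN_PROGRESS, CLEAN)
 */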
+
+static void kvm_recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch;
+ int r;
+
+ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
+ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
+ return;
+
+ WARN_ONCE(!irqchip_in_kernel(kvm),
+ "Dirty APIC map without an in-kernel local APIC");
+
+ mutex_lock(&kvm->arch.apic_map_lock);
+
+retry:
+ /*
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+ * or the APIC registers (if dirty). Note, on retry the map may have
+ * not yet been marked dirty by whatever task changed a vCPU's x2APIC
+ * ID, i.e. the map may still show up as in-progress. In that case
+ * this task still needs to retry and complete its calculation.
+ */
+ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
+ DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
+ /* Someone else has updated the map. */
+ mutex_unlock(&kvm->arch.apic_map_lock);
+ return;
+ }
+
+ /*
+ * Reset the mismatch flag between attempts so that KVM does the right
+ * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+ * keep max_id strictly increasing. Disallowing max_id from shrinking
+ * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
+ * with the highest x2APIC ID is toggling its APIC on and off.
+ */
+ xapic_id_mismatch = false;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
+
+ new = kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+ GFP_KERNEL_ACCOUNT);
+
+ if (!new)
+ goto out;
+
+ new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+ if (r) {
+ kvfree(new);
+ new = NULL;
+ if (r == -E2BIG) {
+ cond_resched();
+ goto retry;
+ }
+
+ goto out;
+ }
+
+ kvm_recalculate_logical_map(new, vcpu);
+ }
+out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
+ old = rcu_dereference_protected(kvm->arch.apic_map,
+ lockdep_is_held(&kvm->arch.apic_map_lock));
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ /*
+ * Write kvm->arch.apic_map before clearing apic->apic_map_dirty.
+ * If another update has come in, leave it DIRTY.
+ */
+ atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
+ UPDATE_IN_PROGRESS, CLEAN);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ kvfree_rcu(old, rcu);
+
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
+{
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
+
+ kvm_lapic_set_reg(apic, APIC_SPIV, val);
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled)
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
+ else
+ static_branch_inc(&apic_sw_disabled.key);
+
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ }
+
+ /* Check if there are APF page ready requests pending */
+ if (enabled) {
+ kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
+}
+
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
+{
+ kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ kvm_lapic_set_reg(apic, APIC_LDR, id);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
+{
+ kvm_lapic_set_reg(apic, APIC_DFR, val);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+}
+
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
+{
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
+
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
+ kvm_lapic_set_reg(apic, APIC_ID, id);
+ kvm_lapic_set_reg(apic, APIC_LDR, ldr);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
- return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+ return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
}
-static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
}
static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -144,63 +534,144 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
}
+static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index)
+{
+ return apic->nr_lvt_entries > lvt_index;
+}
+
+static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu)
+{
+ return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P);
+}
+
void kvm_apic_set_version(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_cpuid_entry2 *feat;
- u32 v = APIC_VERSION;
+ u32 v = 0;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
return;
- feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
- if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
+ v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
+
+ /*
+ * KVM emulates the 82093AA datasheet (with its in-kernel IOAPIC
+ * implementation), which doesn't have an EOI register. Some buggy OSes
+ * (e.g. Windows with the Hyper-V role) disable EOI broadcast in the
+ * lapic without checking the IOAPIC version first, so level-triggered
+ * interrupts never get EOIed in the IOAPIC.
+ */
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
+ !ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
- apic_set_reg(apic, APIC_LVR, v);
+ kvm_lapic_set_reg(apic, APIC_LVR, v);
}
-static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
{
- return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+ int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries)
+ return;
+
+ /* Initialize/mask any "new" LVT entries. */
+ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+
+ apic->nr_lvt_entries = nr_lvt_entries;
+
+ /* The number of LVT entries is reflected in the version register. */
+ kvm_apic_set_version(vcpu);
}
-static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
- LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
- LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
- LVT_MASK | APIC_MODE_MASK, /* LVTPC */
- LINT_MASK, LINT_MASK, /* LVT0-1 */
- LVT_MASK /* LVTERR */
+static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
+ [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */
+ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK,
+ [LVT_LINT0] = LINT_MASK,
+ [LVT_LINT1] = LINT_MASK,
+ [LVT_ERROR] = LVT_MASK,
+ [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
-static int find_highest_vector(void *bitmap)
+static u8 count_vectors(void *bitmap)
{
- u32 *word = bitmap;
- int word_offset = MAX_APIC_VECTOR >> 5;
+ int vec;
+ u32 *reg;
+ u8 count = 0;
- while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
- continue;
+ for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
+ count += hweight32(*reg);
+ }
- if (likely(!word_offset && !word[0]))
- return -1;
- else
- return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
+ return count;
}
-static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
{
- apic->irr_pending = true;
- return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
+ unsigned long pir_vals[NR_PIR_WORDS];
+ u32 *__pir = (void *)pir_vals;
+ u32 i, vec;
+ u32 irr_val, prev_irr_val;
+ int max_updated_irr;
+
+ max_updated_irr = -1;
+ *max_irr = -1;
+
+ if (!pi_harvest_pir(pir, pir_vals))
+ return false;
+
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
+ u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
+
+ irr_val = READ_ONCE(*p_irr);
+
+ if (__pir[i]) {
+ prev_irr_val = irr_val;
+ do {
+ irr_val = prev_irr_val | __pir[i];
+ } while (prev_irr_val != irr_val &&
+ !try_cmpxchg(p_irr, &prev_irr_val, irr_val));
+
+ if (prev_irr_val != irr_val)
+ max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec;
+ }
+ if (irr_val)
+ *max_irr = __fls(irr_val) + vec;
+ }
+
+ return ((max_updated_irr != -1) &&
+ (max_updated_irr == *max_irr));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr);
+
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
+
+ if (unlikely(!apic->apicv_active && irr_updated))
+ apic->irr_pending = true;
+ return irr_updated;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr);
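The PIR-to-IRR merge above has to tolerate the CPU setting IRR bits concurrently, so it only retries the compare-exchange when another agent changed the word. A reduced sketch of that pattern with C11 atomics (names are illustrative, not KVM's):

#include <stdatomic.h>
#include <stdint.h>

/* OR one 32-bit posted-interrupt word into the matching IRR word
 * without losing bits set concurrently by another agent. */
static void merge_pir_word(_Atomic uint32_t *irr, uint32_t pir)
{
        uint32_t old = atomic_load(irr);
        uint32_t merged;

        do {
                merged = old | pir;
                /* Nothing new to set: every pir bit is already in irr. */
                if (merged == old)
                        return;
        } while (!atomic_compare_exchange_weak(irr, &old, merged));
}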
static inline int apic_search_irr(struct kvm_lapic *apic)
{
- return find_highest_vector(apic->regs + APIC_IRR);
+ return apic_find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
int result;
+ /*
+ * Note that irr_pending is just a hint. It will always be
+ * true with virtual interrupt delivery enabled.
+ */
if (!apic->irr_pending)
return -1;
@@ -212,56 +683,249 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = false;
- apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
+ if (unlikely(apic->apicv_active)) {
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ } else {
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+ }
}
-int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
+{
+ apic_clear_irr(vec, vcpu->arch.apic);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_clear_irr);
+
+static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
+{
+ return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
+}
+
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+ if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
+ return;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(apic->apicv_active))
+ kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * The ISR (in-service register) bit is set when an interrupt is
+ * injected. Only the highest-priority vector is ever injected, so
+ * the most recently set bit is also the highest bit in the ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
+ */
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = apic_find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+ if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
+ return;
+
+ /*
+ * We do get here with APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(apic->apicv_active))
+ kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
+}
+
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
+ if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active)
+ return;
+
+ kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_hwapic_isr);
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
/* This may race with the setting of irr in __apic_accept_irq(), and
* the value returned may be wrong, but kvm_vcpu_kick() in
* __apic_accept_irq() will cause an immediate vmexit and the value
* will be recalculated on the next vmentry.
*/
- if (!apic)
- return 0;
- highest_irr = apic_find_highest_irr(apic);
-
- return highest_irr;
+ return apic_find_highest_irr(vcpu->arch.apic);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode);
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map)
{
struct kvm_lapic *apic = vcpu->arch.apic;
return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
- irq->level, irq->trig_mode);
+ irq->level, irq->trig_mode, dest_map);
}
-static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
+ struct kvm_lapic_irq *irq, u32 min)
{
- int result;
+ int i, count = 0;
+ struct kvm_vcpu *vcpu;
- result = find_highest_vector(apic->regs + APIC_ISR);
- ASSERT(result == -1 || result >= 16);
+ if (min > map->max_apic_id)
+ return 0;
- return result;
+ min = array_index_nospec(min, map->max_apic_id + 1);
+
+ for_each_set_bit(i, ipi_bitmap,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, irq, NULL);
+ }
+ }
+
+ return count;
}
-static void apic_update_ppr(struct kvm_lapic *apic)
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit)
+{
+ struct kvm_apic_map *map;
+ struct kvm_lapic_irq irq = {0};
+ int cluster_size = op_64_bit ? 64 : 32;
+ int count;
+
+ if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
+ return -KVM_EINVAL;
+
+ irq.vector = icr & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr & APIC_MODE_MASK;
+ irq.level = (icr & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ count = -EOPNOTSUPP;
+ if (likely(map)) {
+ count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
+ min += cluster_size;
+ count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
+ }
+
+ rcu_read_unlock();
+ return count;
+}
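The two hypercall bitmaps address 2 x BITS_PER_LONG consecutive APIC IDs starting at min, with the high bitmap offset by the cluster size. A tiny helper makes the addressing explicit (illustrative, not a KVM function):

#include <stdint.h>

/* APIC ID targeted by bit 'bit' of the high IPI bitmap. */
static uint32_t pv_ipi_high_target(uint32_t min, int op_64_bit, int bit)
{
        int cluster_size = op_64_bit ? 64 : 32;

        return min + cluster_size + bit;
}
/* e.g. for a 64-bit guest, bit 3 of ipi_bitmap_high hits APIC ID min + 67 */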
+
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{
- u32 tpr, isrv, ppr;
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
+ return;
+
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ return false;
+
+ val &= KVM_PV_EOI_ENABLED;
+
+ if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
+ return false;
+
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from a performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+
+ return val;
+}
+
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr;
+ if (kvm_x86_ops.sync_pir_to_irr)
+ highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
+{
+ u32 tpr, isrv, ppr, old_ppr;
int isr;
- tpr = apic_get_reg(apic, APIC_TASKPRI);
+ old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
isr = apic_find_highest_isr(apic);
isrv = (isr != -1) ? isr : 0;
@@ -270,90 +934,421 @@ static void apic_update_ppr(struct kvm_lapic *apic)
else
ppr = isrv & 0xf0;
- apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
- apic, ppr, isr, isrv);
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
+ kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
- apic_set_reg(apic, APIC_PROCPRI, ppr);
+ return ppr < old_ppr;
}
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
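The PPR arithmetic folded into the hunk above follows the SDM rule: PPR is TPR when TPR's priority class is at least the class of the highest in-service vector, otherwise it is the in-service class with a zero sub-class. A self-contained rendering of just that rule:

#include <stdint.h>

static uint32_t compute_ppr(uint32_t tpr, int highest_isr)
{
        uint32_t isrv = (highest_isr != -1) ? (uint32_t)highest_isr : 0;

        if ((tpr & 0xf0) >= (isrv & 0xf0))
                return tpr;             /* task priority dominates */

        return isrv & 0xf0;             /* in-service class, sub-class 0 */
}
/* compute_ppr(0x20, 0x35) == 0x30; compute_ppr(0x40, 0x35) == 0x40 */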
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_ppr);
+
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
{
- apic_set_reg(apic, APIC_TASKPRI, tpr);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
apic_update_ppr(apic);
}
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
{
- return dest == 0xff || kvm_apic_id(apic) == dest;
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
}
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ /*
+ * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they
+ * were in x2APIC mode if the target APIC ID can't be encoded as an
+ * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which
+ * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC
+ * mode. Match the x2APIC ID if and only if the target APIC ID can't
+ * be encoded in xAPIC to avoid spurious matches against a vCPU that
+ * changed its (addressable) xAPIC ID (which is writable).
+ */
+ if (apic_x2apic_mode(apic) || mda > 0xff)
+ return mda == kvm_x2apic_id(apic);
+
+ return mda == kvm_xapic_id(apic);
+}
+
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
{
- int result = 0;
u32 logical_id;
- if (apic_x2apic_mode(apic)) {
- logical_id = apic_get_reg(apic, APIC_LDR);
- return logical_id & mda;
- }
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
- logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
+ logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
- switch (apic_get_reg(apic, APIC_DFR)) {
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
+
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
+
+ switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
- if (logical_id & mda)
- result = 1;
- break;
+ return (logical_id & mda) != 0;
case APIC_DFR_CLUSTER:
- if (((logical_id >> 4) == (mda >> 0x4))
- && (logical_id & mda & 0xf))
- result = 1;
- break;
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
default:
- printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
- break;
+ return false;
}
+}
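For the two xAPIC DFR modes in the switch above, matching reduces to a byte AND (flat) or a cluster compare plus nibble AND (cluster). The following free-standing helpers show both on 8-bit logical IDs:

#include <stdbool.h>
#include <stdint.h>

/* Flat model: each of the 8 LDR bits names one CPU group. */
static bool match_flat(uint8_t ldr, uint8_t mda)
{
        return (ldr & mda) != 0;
}

/* Cluster model: the high nibble selects the cluster, the low
 * nibble is a 4-way member bitmap within it. */
static bool match_cluster(uint8_t ldr, uint8_t mda)
{
        return (ldr >> 4) == (mda >> 4) && (ldr & mda & 0xf) != 0;
}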
- return result;
+/* The KVM local APIC implementation has two quirks:
+ *
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
+ */
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
+{
+ bool ipi = source != NULL;
+
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
+ return X2APIC_BROADCAST;
+
+ return dest_id;
}
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, int dest, int dest_mode)
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int shorthand, unsigned int dest, int dest_mode)
{
- int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
- apic_debug("target %p, source %p, dest 0x%x, "
- "dest_mode 0x%x, short_hand 0x%x\n",
- target, source, dest, dest_mode, short_hand);
-
- ASSERT(!target);
- switch (short_hand) {
+ ASSERT(target);
+ switch (shorthand) {
case APIC_DEST_NOSHORT:
- if (dest_mode == 0)
- /* Physical mode. */
- result = kvm_apic_match_physical_addr(target, dest);
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, mda);
else
- /* Logical mode. */
- result = kvm_apic_match_logical_addr(target, dest);
- break;
+ return kvm_apic_match_logical_addr(target, mda);
case APIC_DEST_SELF:
- result = (target == source);
- break;
+ return target == source;
case APIC_DEST_ALLINC:
- result = 1;
- break;
+ return true;
case APIC_DEST_ALLBUT:
- result = (target != source);
- break;
+ return target != source;
default:
- printk(KERN_WARNING "Bad dest shorthand value %x\n",
- short_hand);
- break;
+ return false;
}
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_match_dest);
- return result;
+static int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus);
+
+ BUG_ON(idx >= bitmap_size);
+ return idx;
+}
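kvm_vector_to_index() realizes the vector-hashing rule quoted later in this file ("guest vector % max number of destination vCPUs" picks the nth set destination bit). A loop-based equivalent of find_nth_bit over a 16-bit map, as a sketch:

#include <stdint.h>

static int pick_hashed_dest(uint16_t bitmap, uint32_t vector, int dest_vcpus)
{
        int n = vector % dest_vcpus;    /* which set bit we want */
        int i;

        for (i = 0; i < 16; i++) {
                if ((bitmap & (1u << i)) && n-- == 0)
                        return i;
        }
        return -1;      /* unreachable if bitmap has dest_vcpus bits set */
}
/* pick_hashed_dest(0xa, 33, 2) == 3: 33 % 2 == 1 -> the second set bit */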
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ pr_info("Disabled LAPIC found during irq injection\n");
+ }
+}
+
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
+{
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint);
+}
+
+static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+}
+
+/* Return true if the interrupt can be handled by using *bitmap as an index
+ * mask for valid destinations in the *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
+
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
+ return false;
+
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
+ return false;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
+ *dst = &map->phys_map[dest_id];
+ *bitmap = 1;
+ }
+ return true;
+ }
+
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
+
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
+
+ if (!vector_hashing_enabled) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
+ }
+ } else {
+ if (!*bitmap)
+ return true;
+
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
+
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
+
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
+
+ return true;
+}
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
+
+ *r = -1;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret) {
+ *r = 0;
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * This routine tries to handle interrupts in posted mode; here is how
+ * it handles the different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destination vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
+static bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm,
+ struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ bool ret = false;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
+
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_intr_is_single_vcpu);
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!vector_hashing_enabled) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
}
/*
@@ -361,44 +1356,56 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
* Return 1 if successfully added and 0 if discarded.
*/
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
- int vector, int level, int trig_mode)
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map)
{
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
+ fallthrough;
case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
/* FIXME add logic for vcpu on reset */
if (unlikely(!apic_enabled(apic)))
break;
- if (trig_mode) {
- apic_debug("level trig mode for vector %d", vector);
- apic_set_vector(vector, apic->regs + APIC_TMR);
- } else
- apic_clear_vector(vector, apic->regs + APIC_TMR);
+ result = 1;
+
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
- result = !apic_test_and_set_irr(vector, apic);
- trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, !result);
- if (!result) {
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- apic_debug("level trig mode repeatedly for "
- "vector %d", vector);
- break;
+ apic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
}
- kvm_vcpu_kick(vcpu);
+ kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
- printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_SMI:
- printk(KERN_DEBUG "Ignoring guest SMI\n");
+ if (!kvm_inject_smi(vcpu)) {
+ kvm_vcpu_kick(vcpu);
+ result = 1;
+ }
break;
case APIC_DM_NMI:
@@ -408,29 +1415,23 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_INIT:
- if (level) {
+ if (!trig_mode || level) {
result = 1;
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
- printk(KERN_DEBUG
- "INIT on a runnable vcpu %d\n",
- vcpu->vcpu_id);
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ /* assumes that there are only KVM_APIC_INIT/SIPI */
+ apic->pending_events = (1UL << KVM_APIC_INIT);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
- } else {
- apic_debug("Ignoring de-assert INIT to vcpu %d\n",
- vcpu->vcpu_id);
}
break;
case APIC_DM_STARTUP:
- apic_debug("SIPI to vcpu %d vector 0x%02x\n",
- vcpu->vcpu_id, vector);
- if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
- result = 1;
- vcpu->arch.sipi_vector = vector;
- vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
- kvm_vcpu_kick(vcpu);
- }
+ result = 1;
+ apic->sipi_vector = vector;
+ /* make sure sipi_vector is visible for the receiver */
+ smp_wmb();
+ set_bit(KVM_APIC_SIPI, &apic->pending_events);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_EXTINT:
@@ -449,83 +1450,178 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
-int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+/*
+ * This routine identifies the destination vcpus mask meant to receive the
+ * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find
+ * out the destination vcpus array and set the bitmap or it traverses to
+ * each available vcpu to identify the same.
+ */
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap)
+{
+ struct kvm_lapic **dest_vcpu = NULL;
+ struct kvm_lapic *src = NULL;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ unsigned long bitmap, i;
+ int vcpu_idx;
+ bool ret;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
+ &bitmap);
+ if (ret) {
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dest_vcpu[i])
+ continue;
+ vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
+ __set_bit(vcpu_idx, vcpu_bitmap);
+ }
+ } else {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+ if (!kvm_apic_match_dest(vcpu, NULL,
+ irq->shorthand,
+ irq->dest_id,
+ irq->dest_mode))
+ continue;
+ __set_bit(i, vcpu_bitmap);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
{
- return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
+}
+
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ int __maybe_unused trigger_mode;
+
+ /* EOI the ioapic only if the ioapic actually handles the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /*
+ * If the intercepted EOI is for an IRQ that was pending from previous
+ * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
+ * no longer need to be intercepted.
+ */
+ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
+ }
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
-static void apic_set_eoi(struct kvm_lapic *apic)
+static int apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
- int trigger_mode;
+
+ trace_kvm_eoi(apic, vector);
+
/*
* Not every EOI write has a corresponding ISR bit; one example
* is when the kernel checks the timer during setup_IO_APIC.
*/
if (vector == -1)
- return;
+ return vector;
- apic_clear_vector(vector, apic->regs + APIC_ISR);
+ apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
- kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ if (kvm_hv_synic_has_vector(apic->vcpu, vector))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
}
-static void apic_send_ipi(struct kvm_lapic *apic)
+/*
+ * This interface assumes a trap-like exit, which has already finished
+ * the desired side effects, including the vISR and vPPR updates.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
{
- u32 icr_low = apic_get_reg(apic, APIC_ICR);
- u32 icr_high = apic_get_reg(apic, APIC_ICR2);
- struct kvm_lapic_irq irq;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
- irq.vector = icr_low & APIC_VECTOR_MASK;
- irq.delivery_mode = icr_low & APIC_MODE_MASK;
- irq.dest_mode = icr_low & APIC_DEST_MASK;
- irq.level = icr_low & APIC_INT_ASSERT;
- irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
- irq.shorthand = icr_low & APIC_SHORT_MASK;
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_eoi_accelerated);
+
+static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low,
+ u32 icr_high, struct kvm_lapic_irq *irq)
+{
+ /* KVM has no delay and should always clear the BUSY/PENDING flag. */
+ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY);
+
+ irq->vector = icr_low & APIC_VECTOR_MASK;
+ irq->delivery_mode = icr_low & APIC_MODE_MASK;
+ irq->dest_mode = icr_low & APIC_DEST_MASK;
+ irq->level = (icr_low & APIC_INT_ASSERT) != 0;
+ irq->trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq->shorthand = icr_low & APIC_SHORT_MASK;
+ irq->msi_redir_hint = false;
if (apic_x2apic_mode(apic))
- irq.dest_id = icr_high;
+ irq->dest_id = icr_high;
else
- irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+ irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high);
+}
- trace_kvm_apic_ipi(icr_low, irq.dest_id);
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
+{
+ struct kvm_lapic_irq irq;
+
+ kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq);
- apic_debug("icr_high 0x%x, icr_low 0x%x, "
- "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
- icr_high, icr_low, irq.shorthand, irq.dest_id,
- irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
- irq.vector);
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
- kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_send_ipi);
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
- ktime_t remaining;
+ ktime_t remaining, now;
s64 ns;
- u32 tmcct;
ASSERT(apic != NULL);
/* if initial count is 0, current count should also be 0 */
- if (apic_get_reg(apic, APIC_TMICT) == 0)
+ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
+ apic->lapic_timer.period == 0)
return 0;
- remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
if (ktime_to_ns(remaining) < 0)
- remaining = ktime_set(0, 0);
+ remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- tmcct = div64_u64(ns,
- (APIC_BUS_CYCLE_NS * apic->divide_count));
-
- return tmcct;
+ return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -533,7 +1629,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
- set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -552,27 +1648,24 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
return 0;
switch (offset) {
- case APIC_ID:
- if (apic_x2apic_mode(apic))
- val = kvm_apic_id(apic);
- else
- val = kvm_apic_id(apic) << 24;
- break;
case APIC_ARBPRI:
- printk(KERN_WARNING "Access APIC ARBPRI register "
- "which is for P6\n");
break;
case APIC_TMCCT: /* Timer CCR */
+ if (apic_lvtt_tscdeadline(apic))
+ return 0;
+
val = apic_get_tmcct(apic);
break;
-
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
- /* fall thru */
+ fallthrough;
default:
- apic_update_ppr(apic);
- val = apic_get_reg(apic, offset);
+ val = kvm_lapic_get_reg(apic, offset);
break;
}
@@ -584,25 +1677,66 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
return container_of(dev, struct kvm_lapic, dev);
}
-static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+#define APIC_REG_MASK(reg) (1ull << ((reg) >> 4))
+#define APIC_REGS_MASK(first, count) \
+ (APIC_REG_MASK(first) * ((1ull << (count)) - 1))
+
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
+{
+ /* Leave bits '0' for reserved and write-only registers. */
+ u64 valid_reg_mask =
+ APIC_REG_MASK(APIC_ID) |
+ APIC_REG_MASK(APIC_LVR) |
+ APIC_REG_MASK(APIC_TASKPRI) |
+ APIC_REG_MASK(APIC_PROCPRI) |
+ APIC_REG_MASK(APIC_LDR) |
+ APIC_REG_MASK(APIC_SPIV) |
+ APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
+ APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) |
+ APIC_REG_MASK(APIC_ESR) |
+ APIC_REG_MASK(APIC_ICR) |
+ APIC_REG_MASK(APIC_LVTT) |
+ APIC_REG_MASK(APIC_LVTTHMR) |
+ APIC_REG_MASK(APIC_LVTPC) |
+ APIC_REG_MASK(APIC_LVT0) |
+ APIC_REG_MASK(APIC_LVT1) |
+ APIC_REG_MASK(APIC_LVTERR) |
+ APIC_REG_MASK(APIC_TMICT) |
+ APIC_REG_MASK(APIC_TMCCT) |
+ APIC_REG_MASK(APIC_TDCR);
+
+ if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
+ valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
+
+ /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
+ if (!apic_x2apic_mode(apic))
+ valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_DFR) |
+ APIC_REG_MASK(APIC_ICR2);
+
+ return valid_reg_mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask);
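Since every APIC register sits on a 16-byte boundary, APIC_REG_MASK maps offset >> 4 to a bit position, and APIC_REGS_MASK turns a run of registers into a run of bits. Two compile-time checks using SDM offsets (0x20 for APIC_ID, 0x100 for the first ISR register) illustrate the math:

#define REG_MASK(reg)           (1ull << ((reg) >> 4))
#define REGS_MASK(first, count) (REG_MASK(first) * ((1ull << (count)) - 1))

_Static_assert(REG_MASK(0x20) == (1ull << 2), "APIC_ID -> bit 2");
_Static_assert(REGS_MASK(0x100, 8) == (0xffull << 16),
               "8 ISR registers -> bits 16..23");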
+
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
{
unsigned char alignment = offset & 0xf;
u32 result;
- /* this bitmask has a bit cleared for each reserver register */
- static const u64 rmask = 0x43ff01ffffffe70cULL;
- if ((alignment + len) > 4) {
- apic_debug("KVM_APIC_READ: alignment error %x %d\n",
- offset, len);
+ /*
+ * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
+ * x2APIC and needs to be manually handled by the caller.
+ */
+ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
+
+ if (alignment + len > 4)
return 1;
- }
- if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
- apic_debug("KVM_APIC_READ: read reserved register %x\n",
- offset);
+ if (offset > 0x3f0 ||
+ !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
return 1;
- }
result = __apic_read(apic, offset & ~0xf);
@@ -624,12 +1758,11 @@ static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
{
- return apic_hw_enabled(apic) &&
- addr >= apic->base_address &&
- addr < apic->base_address + LAPIC_MMIO_LENGTH;
+ return addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
}
-static int apic_mmio_read(struct kvm_io_device *this,
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, void *data)
{
struct kvm_lapic *apic = to_lapic(this);
@@ -638,7 +1771,16 @@ static int apic_mmio_read(struct kvm_io_device *this,
if (!apic_mmio_in_range(apic, address))
return -EOPNOTSUPP;
- apic_reg_read(apic, offset, len, data);
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ memset(data, 0xff, len);
+ return 0;
+ }
+
+ kvm_lapic_reg_read(apic, offset, len, data);
return 0;
}
@@ -647,65 +1789,573 @@ static void update_divide_count(struct kvm_lapic *apic)
{
u32 tmp1, tmp2, tdcr;
- tdcr = apic_get_reg(apic, APIC_TDCR);
+ tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
tmp1 = tdcr & 0xf;
tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
apic->divide_count = 0x1 << (tmp2 & 0x7);
-
- apic_debug("timer divide count is 0x%x\n",
- apic->divide_count);
}
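The TDCR decode above packs the divide value into bits 0:1 and 3; running the same bit twiddling outside the kernel reproduces the full SDM table, including the 0b1011 divide-by-1 wrap:

#include <stdint.h>
#include <stdio.h>

static uint32_t divide_count(uint32_t tdcr)
{
        uint32_t tmp1 = tdcr & 0xf;
        uint32_t tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;

        return 1u << (tmp2 & 0x7);
}

int main(void)
{
        /* 0x0 -> 2, 0x1 -> 4, 0x2 -> 8, 0x3 -> 16,
         * 0x8 -> 32, 0x9 -> 64, 0xa -> 128, 0xb -> 1 */
        printf("%u %u %u\n", divide_count(0x0), divide_count(0x3),
               divide_count(0xb));
        return 0;
}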
-static void start_apic_timer(struct kvm_lapic *apic)
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
{
- ktime_t now = apic->lapic_timer.timer.base->get_time();
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_once(
+ "vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
- apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
- APIC_BUS_CYCLE_NS * apic->divide_count;
+static void cancel_hv_timer(struct kvm_lapic *apic);
+
+static void cancel_apic_timer(struct kvm_lapic *apic)
+{
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ preempt_disable();
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ preempt_enable();
atomic_set(&apic->lapic_timer.pending, 0);
+}
- if (!apic->lapic_timer.period)
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+ u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ cancel_apic_timer(apic);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
+ apic->lapic_timer.timer_mode = timer_mode;
+ limit_periodic_timer_frequency(apic);
+ }
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg;
+
+ /*
+ * Assume a timer IRQ was "injected" if the APIC is protected. KVM's
+ * copy of the vIRR is bogus; it's the responsibility of the caller to
+ * precisely check whether or not a timer IRQ is pending.
+ */
+ if (apic->guest_apic_protected)
+ return true;
+
+ reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (apic->apicv_active)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles)
+{
+ u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns;
+
+ /*
+ * If the guest TSC is running at a different ratio than the host, then
+ * convert the delay to nanoseconds to achieve an accurate delay. Note
+ * that __delay() uses delay_tsc whenever the hardware has TSC, thus
+ * always for VMX enabled hardware.
+ */
+ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
+ __delay(min(guest_cycles,
+ nsec_to_cycles(vcpu, timer_advance_ns)));
+ } else {
+ u64 delay_ns = guest_cycles * 1000000ULL;
+ do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
+ ndelay(min_t(u32, delay_ns, timer_advance_ns));
+ }
+}
+
+static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
+ s64 advance_expire_delta)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
+ u64 ns;
+
+ /* Do not adjust for tiny fluctuations or large random spikes. */
+ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
+ abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
return;
+
+ /* too early */
+ if (advance_expire_delta < 0) {
+ ns = -advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+ } else {
+ /* too late */
+ ns = advance_expire_delta * 1000000ULL;
+ do_div(ns, vcpu->arch.virtual_tsc_khz);
+ timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
+ }
+
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
+ apic->lapic_timer.timer_advance_ns = timer_advance_ns;
+}
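The adjustment above is a simple proportional controller: convert the early/late error from guest TSC cycles to nanoseconds, then move the advance by a fraction of it. A sketch with signed arithmetic; the STEP divisor of 8 mirrors LAPIC_TIMER_ADVANCE_ADJUST_STEP but should be treated as illustrative:

#include <stdint.h>
#include <stdlib.h>

#define ADJUST_STEP     8

static int64_t adjust_advance_ns(int64_t advance_ns, int64_t delta_tsc,
                                 uint32_t tsc_khz)
{
        /* cycles -> ns at the guest's TSC frequency */
        int64_t err_ns = llabs(delta_tsc) * 1000000 / tsc_khz;

        if (delta_tsc < 0)
                advance_ns -= err_ns / ADJUST_STEP;     /* fired early */
        else
                advance_ns += err_ns / ADJUST_STEP;     /* fired late */

        return advance_ns;
}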
+
+static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
+
/*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
+ * If the timer fired early, reread the TSC to account for the overhead
+ * of the above adjustment to avoid waiting longer than is necessary.
*/
- if (apic_lvtt_period(apic)) {
- if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
- apic->lapic_timer.period = NSEC_PER_MSEC/2;
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ if (guest_tsc < tsc_deadline)
+ __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
+}
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns &&
+ lapic_timer_int_injected(vcpu))
+ __kvm_wait_lapic_expire(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_wait_lapic_expire);
+
+static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ if (apic_lvtt_tscdeadline(apic)) {
+ ktimer->tscdeadline = 0;
+ } else if (apic_lvtt_oneshot(apic)) {
+ ktimer->tscdeadline = 0;
+ ktimer->target_expiration = 0;
+ }
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+
+ if (!from_timer_fn && apic->apicv_active) {
+ WARN_ON(kvm_get_running_vcpu() != vcpu);
+ kvm_apic_inject_pending_timer_irqs(apic);
+ return;
+ }
+
+ if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
+ /*
+ * Ensure the guest's timer has truly expired before posting an
+ * interrupt. Open code the relevant checks to avoid querying
+ * lapic_timer_int_injected(), which will be false since the
+ * interrupt isn't yet injected. Waiting until after injecting
+ * is not an option since that won't help a posted interrupt.
+ */
+ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ __kvm_wait_lapic_expire(vcpu);
+ kvm_apic_inject_pending_timer_irqs(apic);
+ return;
+ }
+
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ if (from_timer_fn)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ u64 guest_tsc, tscdeadline = ktimer->tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = ktime_get();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+
+ if (likely(tscdeadline > guest_tsc) &&
+ likely(ns > apic->lapic_timer.timer_advance_ns)) {
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
+ hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
+ } else
+ apic_timer_expired(apic, false);
+
+ local_irq_restore(flags);
+}
+
+static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
+{
+ return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ (u64)apic->divide_count;
+}
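Putting numbers on tmict_to_ns(): assuming KVM's default bus cycle of 1 ns (the apic_bus_cycle_ns default) and divide-by-16, an initial count of one million yields a 16 ms period:

#include <stdint.h>

static uint64_t tmict_to_ns(uint32_t tmict, uint64_t bus_cycle_ns,
                            uint32_t divide_count)
{
        return (uint64_t)tmict * bus_cycle_ns * divide_count;
}
/* tmict_to_ns(1000000, 1, 16) == 16000000 ns == 16 ms */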
+
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
+static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
+{
+ ktime_t now;
+ u64 tscl = rdtsc();
+ s64 deadline;
+
+ now = ktime_get();
+ apic->lapic_timer.period =
+ tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT));
+
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
+ return false;
+ }
+
+ limit_periodic_timer_frequency(apic);
+ deadline = apic->lapic_timer.period;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+ if (unlikely(count_reg != APIC_TMICT)) {
+ deadline = tmict_to_ns(apic,
+ kvm_lapic_get_reg(apic, count_reg));
+ if (unlikely(deadline <= 0)) {
+ if (apic_lvtt_period(apic))
+ deadline = apic->lapic_timer.period;
+ else
+ deadline = 0;
+ }
+ else if (unlikely(deadline > apic->lapic_timer.period)) {
+ pr_info_ratelimited(
+ "vcpu %i: requested lapic timer restore with "
+ "starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
+ "Using initial count to start timer.\n",
+ apic->vcpu->vcpu_id,
+ count_reg,
+ kvm_lapic_get_reg(apic, count_reg),
+ deadline, apic->lapic_timer.period);
+ kvm_lapic_set_reg(apic, count_reg, 0);
+ deadline = apic->lapic_timer.period;
+ }
+ }
+ }
+
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, deadline);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline);
+
+ return true;
+}
+
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ ktime_t now = ktime_get();
+ u64 tscl = rdtsc();
+ ktime_t delta;
+
+ /*
+ * Use kernel time as the time source for both the hrtimer deadline and
+ * TSC-based deadline so that they stay synchronized. Computing each
+ * deadline independently will cause the two deadlines to drift apart
+ * over time as differences in the periods accumulate, e.g. due to
+ * differences in the underlying clocks or numerical approximation errors.
+ */
+ ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration,
+ ktimer->period);
+
+ /*
+ * If the new expiration is in the past, e.g. because userspace stopped
+ * running the VM for an extended duration, then force the expiration
+ * to "now" and don't try to play catch-up with the missed events. KVM
+ * will only deliver a single interrupt regardless of how many events
+ * are pending, i.e. restarting the timer with an expiration in the
+ * past will do nothing more than waste host cycles, and can even lead
+ * to a hard lockup in extreme cases.
+ */
+ if (ktime_before(ktimer->target_expiration, now))
+ ktimer->target_expiration = now;
+
+ /*
+ * Note, ensuring the expiration isn't in the past also prevents delta
+ * from going negative, which could cause the TSC deadline to become
+ * excessively large due to it being an unsigned value.
+ */
+ delta = ktime_sub(ktimer->target_expiration, now);
+ ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
+ return;
+
+ if (ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic, false);
+
+ if (apic_lvtt_oneshot(apic))
+ return;
+
+ advance_periodic_target_expiration(apic);
}
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, apic->lapic_timer.period),
- HRTIMER_MODE_ABS);
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_HARD);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return false;
- apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
- PRIx64 ", "
- "timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __func__,
- APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- apic_get_reg(apic, APIC_TMICT),
- apic->lapic_timer.period,
- ktime_to_ns(ktime_add_ns(now,
- apic->lapic_timer.period)));
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
}
-static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+static void cancel_hv_timer(struct kvm_lapic *apic)
{
- int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+ WARN_ON(preemptible());
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ kvm_x86_call(cancel_hv_timer)(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
+
+static bool start_hv_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ bool expired;
- if (apic_lvt_nmi_mode(lvt0_val)) {
- if (!nmi_wd_enabled) {
- apic_debug("Receive NMI setting on APIC_LVT0 "
- "for cpu %d\n", apic->vcpu->vcpu_id);
- apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+ WARN_ON(preemptible());
+ if (!kvm_can_use_hv_timer(vcpu))
+ return false;
+
+ if (!ktimer->tscdeadline)
+ return false;
+
+ if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * To simplify handling the periodic timer, leave the hv timer running
+ * even if the deadline timer has expired, i.e. rely on the resulting
+ * VM-Exit to recompute the periodic timer's target expiration.
+ */
+ if (!apic_lvtt_period(apic)) {
+ /*
+ * Cancel the hv timer if the sw timer fired while the hv timer
+ * was being programmed, or if the hv timer itself expired.
+ */
+ if (atomic_read(&ktimer->pending)) {
+ cancel_hv_timer(apic);
+ } else if (expired) {
+ apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
}
- } else if (nmi_wd_enabled)
- apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+ }
+
+ trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use);
+
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ WARN_ON(preemptible());
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ preempt_disable();
+
+ if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending))
+ goto out;
+
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
+out:
+ preempt_enable();
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* If the preempt notifier has already run, it also called apic_timer_expired */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ goto out;
+ WARN_ON(kvm_vcpu_is_blocking(vcpu));
+ apic_timer_expired(apic, false);
+ cancel_hv_timer(apic);
+
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ advance_periodic_target_expiration(apic);
+ restart_apic_timer(apic);
+ }
+out:
+ preempt_enable();
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_expired_hv_timer);
-static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ restart_apic_timer(vcpu->arch.apic);
+}
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+ preempt_enable();
+}
+
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
+}
+
+static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg)
+{
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic, count_reg))
+ return;
+
+ restart_apic_timer(apic);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ __start_apic_timer(apic, APIC_TMICT);
+}
+
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
+
+ if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
+ apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
+ if (lvt0_in_nmi_mode) {
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ } else
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ }
+}
+
+static int get_lvt_index(u32 reg)
+{
+ if (reg == APIC_LVTCMCI)
+ return LVT_CMCI;
+ if (reg < APIC_LVTT || reg > APIC_LVTERR)
+ return -1;
+ return array_index_nospec(
+ (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES);
+}
+
+static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -713,10 +2363,11 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
- if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_ID, val);
- else
+ if (!apic_x2apic_mode(apic)) {
+ kvm_apic_set_xapic_id(apic, val >> 24);
+ } else {
ret = 1;
+ }
break;
case APIC_TASKPRI:
@@ -730,102 +2381,132 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LDR:
if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
else
ret = 1;
break;
case APIC_DFR:
if (!apic_x2apic_mode(apic))
- apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ kvm_apic_set_dfr(apic, val | 0x0FFFFFFF);
else
ret = 1;
break;
case APIC_SPIV: {
u32 mask = 0x3ff;
- if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
mask |= APIC_SPIV_DIRECTED_EOI;
- apic_set_reg(apic, APIC_SPIV, val & mask);
+ apic_set_spiv(apic, val & mask);
if (!(val & APIC_SPIV_APIC_ENABLED)) {
int i;
- u32 lvt_val;
- for (i = 0; i < APIC_LVT_NUM; i++) {
- lvt_val = apic_get_reg(apic,
- APIC_LVTT + 0x10 * i);
- apic_set_reg(apic, APIC_LVTT + 0x10 * i,
- lvt_val | APIC_LVT_MASKED);
+ for (i = 0; i < apic->nr_lvt_entries; i++) {
+ kvm_lapic_set_reg(apic, APIC_LVTx(i),
+ kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
}
+ apic_update_lvtt(apic);
atomic_set(&apic->lapic_timer.pending, 0);
}
break;
}
case APIC_ICR:
+ WARN_ON_ONCE(apic_x2apic_mode(apic));
+
/* No delay here, so we always clear the pending bit */
- apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
- apic_send_ipi(apic);
+ val &= ~APIC_ICR_BUSY;
+ kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
+ kvm_lapic_set_reg(apic, APIC_ICR, val);
break;
-
case APIC_ICR2:
- if (!apic_x2apic_mode(apic))
- val &= 0xff000000;
- apic_set_reg(apic, APIC_ICR2, val);
+ if (apic_x2apic_mode(apic))
+ ret = 1;
+ else
+ kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
case APIC_LVT0:
apic_manage_nmi_watchdog(apic, val);
- case APIC_LVTT:
+ fallthrough;
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
case APIC_LVTERR:
- /* TODO: Check vector */
- if (!apic_sw_enabled(apic))
+ case APIC_LVTCMCI: {
+ u32 index = get_lvt_index(reg);
+ if (!kvm_lapic_lvt_supported(apic, index)) {
+ ret = 1;
+ break;
+ }
+ if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
+ val &= apic_lvt_mask[index];
+ kvm_lapic_set_reg(apic, reg, val);
+ break;
+ }
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
- apic_set_reg(apic, reg, val);
-
+ case APIC_LVTT:
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask);
+ kvm_lapic_set_reg(apic, APIC_LVTT, val);
+ apic_update_lvtt(apic);
break;
case APIC_TMICT:
- hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_TMICT, val);
+ if (apic_lvtt_tscdeadline(apic))
+ break;
+
+ cancel_apic_timer(apic);
+ kvm_lapic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
break;
- case APIC_TDCR:
- if (val & 4)
- printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
- apic_set_reg(apic, APIC_TDCR, val);
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
+ kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
break;
-
+ }
case APIC_ESR:
- if (apic_x2apic_mode(apic) && val != 0) {
- printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+ if (apic_x2apic_mode(apic) && val != 0)
ret = 1;
- }
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic)) {
- apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
- } else
+ /*
+ * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
+ * the vector, everything else is reserved.
+ */
+ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
ret = 1;
+ else
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
break;
default:
ret = 1;
break;
}
- if (ret)
- apic_debug("Local APIC Write to read-only register %x\n", reg);
+
+ /*
+ * Recalculate APIC maps if necessary, e.g. if the software enable bit
+ * was toggled, the APIC ID changed, etc... The maps are marked dirty
+ * on relevant changes, i.e. this is a nop for most writes.
+ */
+ kvm_recalculate_apic_map(apic->vcpu->kvm);
+
return ret;
}
-static int apic_mmio_write(struct kvm_io_device *this,
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
gpa_t address, int len, const void *data)
{
struct kvm_lapic *apic = to_lapic(this);
@@ -835,40 +2516,139 @@ static int apic_mmio_write(struct kvm_io_device *this,
if (!apic_mmio_in_range(apic, address))
return -EOPNOTSUPP;
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
/*
 * APIC registers must be aligned on a 128-bit boundary.
 * 32/64/128-bit registers must be accessed through 32-bit reads/writes.
 * Refer to SDM 8.4.1.
*/
- if (len != 4 || (offset & 0xf)) {
- /* Don't shout loud, $infamous_os would cause only noise. */
- apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+ if (len != 4 || (offset & 0xf))
return 0;
- }
val = *(u32*)data;
- /* too common printing */
- if (offset != APIC_EOI)
- apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __func__, offset, len, val);
+ kvm_lapic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
+}
+
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_set_eoi);
- apic_reg_write(apic, offset & 0xff0, val);
+#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
+static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast)
+{
+ if (data & X2APIC_ICR_RESERVED_BITS)
+ return 1;
+
+ /*
+ * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
+ * only AMD requires it to be zero; Intel essentially just ignores the
+ * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
+ * the CPU performs the reserved bits checks, i.e. the underlying CPU
+ * behavior will "win". Arbitrarily clear the BUSY bit, as there is no
+ * sane way to provide consistent behavior with respect to hardware.
+ */
+ data &= ~APIC_ICR_BUSY;
+
+ if (fast) {
+ struct kvm_lapic_irq irq;
+ int ignored;
+
+ kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq);
+
+ if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq,
+ &ignored, NULL))
+ return -EWOULDBLOCK;
+
+ trace_kvm_apic_ipi((u32)data, irq.dest_id);
+ } else {
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ }
+ if (kvm_x86_ops.x2apic_icr_is_split) {
+ kvm_lapic_set_reg(apic, APIC_ICR, data);
+ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ }
+ trace_kvm_apic_write(APIC_ICR, data);
return 0;
}
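+
+/*
+ * For reference, the x2APIC ICR layout assumed above (bits 13, 17:16 and
+ * 31:20 are the reserved bits rejected via X2APIC_ICR_RESERVED_BITS):
+ *
+ *   [7:0]   vector
+ *   [10:8]  delivery mode
+ *   [11]    destination mode
+ *   [12]    BUSY (reserved in x2APIC mode, cleared above)
+ *   [14]    level
+ *   [15]    trigger mode
+ *   [19:18] destination shorthand
+ *   [63:32] destination
+ */
+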
+static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+ return __kvm_x2apic_icr_write(apic, data, false);
+}
+
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data)
+{
+ return __kvm_x2apic_icr_write(apic, data, true);
+}
+
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+ if (kvm_x86_ops.x2apic_icr_is_split)
+ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+ return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /*
+ * ICR is a single 64-bit register when x2APIC is enabled; all other
+ * registers hold 32-bit values. For legacy xAPIC, ICR writes need to
+ * go down the common path to get the upper half from ICR2.
+ *
+ * Note, using the write helpers may incur an unnecessary write to the
+ * virtual APIC state, but KVM needs to conditionally modify the value
+ * in certain cases, e.g. to clear the ICR busy bit. The cost of extra
+ * conditional branches is likely a wash relative to the cost of the
+ * maybe-unnecessary write, and both are in the noise anyway.
+ */
+ if (apic_x2apic_mode(apic) && offset == APIC_ICR)
+ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
+ else
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_write_nodecode);
+
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.apic)
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!vcpu->arch.apic) {
+ static_branch_dec(&kvm_has_noapic_vcpu);
return;
+ }
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
- hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
- if (vcpu->arch.apic->regs_page)
- __free_page(vcpu->arch.apic->regs_page);
+ if (!apic->sw_enabled)
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
- kfree(vcpu->arch.apic);
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
+
+ kfree(apic);
}
/*
@@ -876,116 +2656,275 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
* LAPIC interface
*----------------------------------------------------------------------
*/
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
-void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
+ if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic))
return;
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (apic_get_reg(apic, APIC_TASKPRI) & 4));
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.tscdeadline = data;
+ start_apic_timer(apic);
+}
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
- if (!apic)
- return 0;
- tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
+ tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
return (tpr & 0xf0) >> 4;
}
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
+ u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic) {
- value |= MSR_IA32_APICBASE_BSP;
- vcpu->arch.apic_base = value;
+ vcpu->arch.apic_base = value;
+
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+
+ if (!apic)
return;
+
+ /* update jump label if enable bit changes */
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
+ /* Check if there are APF page ready requests pending */
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ } else {
+ static_branch_inc(&apic_hw_disabled.key);
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ }
}
- if (!kvm_vcpu_is_bsp(apic->vcpu))
- value &= ~MSR_IA32_APICBASE_BSP;
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE)
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ else if (value & MSR_IA32_APICBASE_ENABLE)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
- vcpu->arch.apic_base = value;
- if (apic_x2apic_mode(apic)) {
- u32 id = kvm_apic_id(apic);
- u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
- apic_set_reg(apic, APIC_LDR, ldr);
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ kvm_x86_call(set_virtual_apic_mode)(vcpu);
}
+
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
- /* with FSB delivery interrupt, we can restart APIC functionality */
- apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
- "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
+}
+int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated)
+{
+ enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
+ enum lapic_mode new_mode = kvm_apic_mode(value);
+
+ if (vcpu->arch.apic_base == value)
+ return 0;
+
+ u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
+ (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+
+ if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
+ return 1;
+ if (!host_initiated) {
+ if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
+ return 1;
+ if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
+ return 1;
+ }
+
+ __kvm_apic_set_base(vcpu, value);
+ kvm_recalculate_apic_map(vcpu->kvm);
+ return 0;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_set_base);
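+
+/*
+ * A sketch of the transitions enforced above for guest-initiated writes
+ * (host-initiated writes may move between any two valid modes):
+ *
+ *   DISABLED -> XAPIC -> X2APIC    allowed
+ *   X2APIC -> DISABLED             allowed
+ *   X2APIC -> XAPIC                rejected
+ *   DISABLED -> X2APIC             rejected
+ *
+ * Any write with reserved bits set, or one that yields LAPIC_MODE_INVALID
+ * (X2APIC_ENABLE without MSR_IA32_APICBASE_ENABLE), is also rejected.
+ */
+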
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /*
+ * When APICv is enabled, KVM must always search the IRR for a pending
+ * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
+ * isn't running. If APICv is disabled, KVM _should_ search the IRR
+ * for a pending IRQ. But KVM currently doesn't ensure *all* hardware,
+ * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
+ * the IRR at this time could race with IRQ delivery from hardware that
+ * still sees APICv as being enabled.
+ *
+ * FIXME: Ensure other vCPUs and devices observe the change in APICv
+ * state prior to updating KVM's metadata caches, so that KVM
+ * can safely search the IRR and set irr_pending accordingly.
+ */
+ apic->irr_pending = true;
+
+ if (apic->apicv_active)
+ apic->isr_count = 1;
+ else
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+
+ apic->highest_isr_cache = -1;
+}
+
+int kvm_alloc_apic_access_page(struct kvm *kvm)
+{
+ void __user *hva;
+
+ guard(mutex)(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ return 0;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva))
+ return PTR_ERR(hva);
+
+ kvm->arch.apic_access_memslot_enabled = true;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_alloc_apic_access_page);
+
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag outside of slots_lock. No additional memory barrier is
+ * needed as modifying memslots requires waiting for other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 msr_val;
int i;
- apic_debug("%s\n", __func__);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
- ASSERT(vcpu);
- apic = vcpu->arch.apic;
- ASSERT(apic != NULL);
+ if (!init_event) {
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ msr_val |= MSR_IA32_APICBASE_BSP;
+
+ /*
+ * Use the inner helper to avoid an extra recalculation of the
+ * optimized APIC map if some other task has dirtied the map.
+ * The recalculation needed for this vCPU will be done after
+ * all APIC state has been initialized (see below).
+ */
+ __kvm_apic_set_base(vcpu, msr_val);
+ }
+
+ if (!apic)
+ return;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
+ /* The xAPIC ID is set at RESET even if the APIC was already enabled. */
+ if (!init_event)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
- for (i = 0; i < APIC_LVT_NUM; i++)
- apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
-
- apic_set_reg(apic, APIC_DFR, 0xffffffffU);
- apic_set_reg(apic, APIC_SPIV, 0xff);
- apic_set_reg(apic, APIC_TASKPRI, 0);
- apic_set_reg(apic, APIC_LDR, 0);
- apic_set_reg(apic, APIC_ESR, 0);
- apic_set_reg(apic, APIC_ICR, 0);
- apic_set_reg(apic, APIC_ICR2, 0);
- apic_set_reg(apic, APIC_TDCR, 0);
- apic_set_reg(apic, APIC_TMICT, 0);
+ for (i = 0; i < apic->nr_lvt_entries; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
+ apic_update_lvtt(apic);
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ kvm_lapic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+
+ kvm_apic_set_dfr(apic, 0xffffffffU);
+ apic_set_spiv(apic, 0xff);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
+ kvm_lapic_set_reg(apic, APIC_ESR, 0);
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, 0);
+ }
+ kvm_lapic_set_reg(apic, APIC_TDCR, 0);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
for (i = 0; i < 8; i++) {
- apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
- apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = false;
+ kvm_apic_update_apicv(vcpu);
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (kvm_vcpu_is_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+
+ vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
+ if (apic->apicv_active) {
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_isr_update)(vcpu, -1);
+ }
vcpu->arch.apic_arb_prio = 0;
+ vcpu->arch.apic_attention = 0;
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
- "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
- vcpu, kvm_apic_id(apic),
- vcpu->arch.apic_base, apic->base_address);
-}
-
-bool kvm_apic_present(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
-}
-
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
-{
- return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
+ kvm_recalculate_apic_map(vcpu->kvm);
}
/*
@@ -994,33 +2933,37 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
*----------------------------------------------------------------------
*/
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
return apic_lvtt_period(apic);
}
int apic_has_pending_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *lapic = vcpu->arch.apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
- return atomic_read(&lapic->lapic_timer.pending);
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
return 0;
}
-static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
- u32 reg = apic_get_reg(apic, lvt_type);
+ u32 reg = kvm_lapic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
+ int r;
- if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
- return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+
+ r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
+ if (r && lvt_type == APIC_LVTPC &&
+ guest_cpuid_is_intel_compatible(apic->vcpu))
+ kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
+ return r;
}
return 0;
}
@@ -1033,54 +2976,88 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
-};
-
static const struct kvm_io_device_ops apic_mmio_ops = {
.read = apic_mmio_read,
.write = apic_mmio_write,
};
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+
+ apic_timer_expired(apic, true);
+
+ if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) {
+ advance_periodic_target_expiration(apic);
+ hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
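+
+/*
+ * Note: pushing the expiry forward and returning HRTIMER_RESTART is the
+ * standard hrtimer idiom for re-arming a periodic timer from its callback;
+ * HRTIMER_NORESTART lets a one-shot or TSC-deadline timer simply expire.
+ */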
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
ASSERT(vcpu != NULL);
- apic_debug("apic_init %d\n", vcpu->vcpu_id);
- apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ static_branch_inc(&kvm_has_noapic_vcpu);
+ return 0;
+ }
+
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
vcpu->arch.apic = apic;
- apic->regs_page = alloc_page(GFP_KERNEL);
- if (apic->regs_page == NULL) {
+ if (kvm_x86_ops.alloc_apic_backing_page)
+ apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
+ else
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem_free_apic;
}
- apic->regs = page_address(apic->regs_page);
- memset(apic->regs, 0, PAGE_SIZE);
apic->vcpu = vcpu;
- hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu = vcpu;
+ apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu);
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
+ hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ if (lapic_timer_advance)
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
- kvm_lapic_reset(vcpu);
+ /*
+ * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
+ * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as this vCPU
+ * will not get notified of any changes until this vCPU is visible to
+ * other vCPUs (marked online and added to the set of vCPUs).
+ *
+ * Opportunistically mark APICv active, as VMX in particular is highly
+ * unlikely to have inhibits. Ignore the current per-VM APICv state so
+ * that vCPU creation is guaranteed to run with a deterministic value;
+ * the request will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
+ apic->apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
return 0;
nomem_free_apic:
kfree(apic);
+ vcpu->arch.apic = NULL;
nomem:
return -ENOMEM;
}
@@ -1088,113 +3065,287 @@ nomem:
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
+ u32 ppr;
- if (!apic || !apic_enabled(apic))
+ if (!kvm_apic_present(vcpu))
return -1;
- apic_update_ppr(apic);
- highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
+ if (apic->guest_apic_protected)
return -1;
- return highest_irr;
+
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_has_interrupt);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
- u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
- int r = 0;
+ u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
- if (kvm_vcpu_is_bsp(vcpu)) {
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- }
- return r;
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ return 1;
+ return 0;
}
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
- if (kvm_apic_local_deliver(apic, APIC_LVTT))
- atomic_dec(&apic->lapic_timer.pending);
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
+ kvm_apic_inject_pending_timer_irqs(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
}
}
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
- if (vector == -1)
- return -1;
+ if (WARN_ON_ONCE(vector < 0 || !apic))
+ return;
+
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode, in which case we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
- apic_set_vector(vector, apic->regs + APIC_ISR);
- apic_update_ppr(apic);
apic_clear_irr(vector, apic);
- return vector;
+ if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) {
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
+ apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
+ }
+
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_ack_interrupt);
+
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+ u64 icr;
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != x2apic_id)
+ return -EINVAL;
+ } else {
+ /*
+ * Ignore the userspace value when setting APIC state.
+ * KVM's model is that the x2APIC ID is readonly, e.g.
+ * KVM only supports delivering interrupts to KVM's
+ * version of the x2APIC ID. However, for backwards
+ * compatibility, don't reject attempts to set a
+ * mismatched ID for userspace that hasn't opted into
+ * x2apic_format.
+ */
+ if (set)
+ *id = x2apic_id;
+ else
+ *id = x2apic_id << 24;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is fixed and based on the id. And
+ * if the ICR is _not_ split, ICR is internally a single 64-bit
+ * register, but needs to be split to ICR+ICR2 in userspace for
+ * backwards compatibility.
+ */
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
+
+ if (!kvm_x86_ops.x2apic_icr_is_split) {
+ if (set) {
+ icr = apic_get_reg(s->regs, APIC_ICR) |
+ (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
+ apic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = apic_get_reg64(s->regs, APIC_ICR);
+ apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
+ }
+ }
+
+ return 0;
}
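+
+/*
+ * Worked example of the fixup above: for a vCPU whose x2APIC ID is 5, a get
+ * without x2apic_format converts APIC_ID to the xAPIC-style 5 << 24, a set
+ * without x2apic_format silently overwrites whatever ID userspace supplied
+ * with 5, and with x2apic_format a mismatched ID is rejected with -EINVAL.
+ */
+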
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+
+ /*
+ * Get the calculated current timer count for the remaining timer period (if
+ * any) and store it in the returned register set.
+ */
+ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
+
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
- apic->base_address = vcpu->arch.apic_base &
- MSR_IA32_APICBASE_BASE;
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
+
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r) {
+ kvm_recalculate_apic_map(vcpu->kvm);
+ return r;
+ }
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
+
+ atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
+ kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
+ apic->lapic_timer.expired_tscdeadline = 0;
+ apic_update_lvtt(apic);
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
- start_apic_timer(apic);
- apic->irr_pending = true;
+ __start_apic_timer(apic, APIC_TMCCT);
+ kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
+ kvm_apic_update_apicv(vcpu);
+ if (apic->apicv_active) {
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
+
+ vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
- if (!apic)
+ if (!lapic_in_kernel(vcpu) ||
+ kvm_can_post_timer_interrupt(vcpu))
return;
- timer = &apic->lapic_timer.timer;
+ timer = &vcpu->arch.apic->lapic_timer.timer;
if (hrtimer_cancel(timer))
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
+}
+
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether the guest triggered PV EOI since the
+ * last entry. If yes, set EOI on the guest's behalf.
+ * Clear PV EOI in guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ int vector;
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ BUG_ON(!pv_eoi_enabled(vcpu));
+
+ if (pv_eoi_test_and_clr_pending(vcpu))
+ return;
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
}
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
{
u32 data;
- void *vapic;
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
- data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
- kunmap_atomic(vapic, KM_USER0);
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
+ return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ if (!pv_eoi_enabled(vcpu) ||
+ /* IRR set or many bits in ISR: could be nested. */
+ apic->irr_pending ||
+ /* Cache not set: could be safe but we don't bother. */
+ apic->highest_isr_cache == -1 ||
+ /* Need EOI to update ioapic. */
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
+ /*
+ * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+ * so we need not do anything here.
+ */
+ return;
+ }
+
+ pv_eoi_set_pending(apic->vcpu);
+}
+
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
{
u32 data, tpr;
int max_irr, max_isr;
- struct kvm_lapic *apic;
- void *vapic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ apic_sync_pv_eoi_to_guest(vcpu, apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- apic = vcpu->arch.apic;
- tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
max_irr = 0;
@@ -1203,17 +3354,58 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
- *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
- kunmap_atomic(vapic, KM_USER0);
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
+ if (vapic_addr) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
+ return -EINVAL;
+ __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ } else {
+ __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ }
vcpu->arch.apic->vapic_addr = vapic_addr;
+ return 0;
+}
+
+static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
+{
+ u32 low;
+
+ if (reg == APIC_ICR) {
+ *data = kvm_x2apic_icr_read(apic);
+ return 0;
+ }
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+
+ *data = low;
+
+ return 0;
+}
+
+static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
+{
+ /*
+ * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
+ * can be written as such; all other registers remain accessible only
+ * through 32-bit reads/writes.
+ */
+ if (reg == APIC_ICR)
+ return kvm_x2apic_icr_write(apic, data);
+
+ /* Bits 63:32 are reserved in all other registers. */
+ if (data >> 32)
+ return 1;
+
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
}
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
@@ -1221,60 +3413,120 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 reg = (msr - APIC_BASE_MSR) << 4;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- /* if this is ICR write vector before command */
- if (msr == 0x830)
- apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return apic_reg_write(apic, reg, (u32)data);
+ return kvm_lapic_msr_write(apic, reg, data);
}
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
- if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (apic_reg_read(apic, reg, 4, &low))
+ return kvm_lapic_msr_read(apic, reg, data);
+}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ if (!lapic_in_kernel(vcpu))
return 1;
- if (msr == 0x830)
- apic_reg_read(apic, APIC_ICR2, 4, &high);
- *data = (((u64)high) << 32) | low;
+ return kvm_lapic_msr_write(vcpu->arch.apic, reg, data);
+}
- return 0;
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ return kvm_lapic_msr_read(vcpu->arch.apic, reg, data);
}
-int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+ unsigned long new_len;
+ int ret;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!IS_ALIGNED(addr, 4))
return 1;
- /* if this is ICR write vector before command */
- if (reg == APIC_ICR)
- apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
- return apic_reg_write(apic, reg, (u32)data);
+ if (data & KVM_MSR_ENABLED) {
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
+
+ ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+ if (ret)
+ return ret;
+ }
+
+ vcpu->arch.pv_eoi.msr_val = data;
+
+ return 0;
}
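+
+/*
+ * For reference, the MSR_KVM_PV_EOI_EN format handled above: bit 0 is
+ * KVM_MSR_ENABLED and the remaining bits carry the 4-byte aligned guest
+ * address of the PV EOI flag word, i.e. a guest enables the feature with
+ * something like:
+ *
+ *   wrmsrl(MSR_KVM_PV_EOI_EN, eoi_flag_gpa | KVM_MSR_ENABLED);
+ */
+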
-int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- u32 low, high = 0;
+ u8 sipi_vector;
+ int r;
- if (!irqchip_in_kernel(vcpu->kvm))
- return 1;
+ if (!kvm_apic_has_pending_init_or_sipi(vcpu))
+ return 0;
- if (apic_reg_read(apic, reg, 4, &low))
- return 1;
- if (reg == APIC_ICR)
- apic_reg_read(apic, APIC_ICR2, 4, &high);
+ if (is_guest_mode(vcpu)) {
+ r = kvm_check_nested_events(vcpu);
+ if (r < 0)
+ return r == -EBUSY ? 0 : r;
+ /*
+ * Continue processing INIT/SIPI even if a nested VM-Exit
+ * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
+ * are blocked as a result of transitioning to VMX root mode.
+ */
+ }
- *data = (((u64)high) << 32) | low;
+ /*
+ * INITs are blocked while CPU is in specific states (SMM, VMX root
+ * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
+ * wait-for-SIPI (WFS).
+ */
+ if (!kvm_apic_init_sipi_allowed(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return 0;
+ }
+ if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (kvm_vcpu_is_bsp(apic->vcpu))
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ else
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ }
+ if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
+ sipi_vector);
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+ }
+ }
return 0;
}
+
+void kvm_lapic_exit(void)
+{
+ static_key_deferred_flush(&apic_hw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
+ static_key_deferred_flush(&apic_sw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..282b9b7da98c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -1,59 +1,262 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_LAPIC_H
#define __KVM_X86_LAPIC_H
-#include "iodev.h"
-#include "kvm_timer.h"
+#include <kvm/iodev.h>
+
+#include <asm/apic.h>
#include <linux/kvm_host.h>
+#include "hyperv.h"
+#include "smm.h"
+
+#define KVM_APIC_INIT 0
+#define KVM_APIC_SIPI 1
+
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+
+#define APIC_BUS_CYCLE_NS_DEFAULT 1
+
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
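+
+/*
+ * e.g. X2APIC_MSR(APIC_ICR) == 0x800 + (0x300 >> 4) == 0x830, the inverse
+ * of the reg = (msr - APIC_BASE_MSR) << 4 conversion in the MSR handlers.
+ */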
+
+enum lapic_mode {
+ LAPIC_MODE_DISABLED = 0,
+ LAPIC_MODE_INVALID = X2APIC_ENABLE,
+ LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE,
+ LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
+};
+
+enum lapic_lvt_entry {
+ LVT_TIMER,
+ LVT_THERMAL_MONITOR,
+ LVT_PERFORMANCE_COUNTER,
+ LVT_LINT0,
+ LVT_LINT1,
+ LVT_ERROR,
+ LVT_CMCI,
+
+ KVM_APIC_MAX_NR_LVT_ENTRIES,
+};
+
+#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x))
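+
+/*
+ * e.g. APIC_LVTx(LVT_ERROR) == APIC_LVTT + 0x50 == APIC_LVTERR, while
+ * APIC_LVTx(LVT_CMCI) maps to the non-contiguous APIC_LVTCMCI register.
+ */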
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ ktime_t target_expiration;
+ u32 timer_mode;
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ u64 expired_tscdeadline;
+ u32 timer_advance_ns;
+ atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool apicv_active;
+ bool sw_enabled;
bool irr_pending;
- struct page *regs_page;
+ bool lvt0_in_nmi_mode;
+ /* Select registers in the vAPIC cannot be read/written. */
+ bool guest_apic_protected;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+ /* The highest vector set in ISR; -1 means invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
void *regs;
gpa_t vapic_addr;
- struct page *vapic_page;
+ struct gfn_to_hva_cache vapic_cache;
+ unsigned long pending_events;
+ unsigned int sipi_vector;
+ int nr_lvt_entries;
};
+
+struct dest_map;
+
int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
-void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int shorthand, unsigned int dest, int dest_mode);
+void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
+int kvm_alloc_apic_access_page(struct kvm *kvm);
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high);
-u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
-void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
+int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
-void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data);
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+void kvm_lapic_exit(void);
+
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+
+static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
+{
+ apic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+ * APIC_IRR to avoid a race with apic_clear_irr()
+ */
+ apic->irr_pending = true;
+}
+
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return apic_get_reg(apic->regs, reg_off);
+}
+
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_has_noapic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_false_deferred apic_hw_disabled;
+
+static inline bool kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_branch_unlikely(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return true;
+}
+
+extern struct static_key_false_deferred apic_sw_disabled;
+
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
{
- return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+ if (static_branch_unlikely(&apic_sw_disabled.key))
+ return apic->sw_enabled;
+ return true;
}
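+
+/*
+ * Design note: both helpers above rely on a deferred static key so that in
+ * the common case, where no vAPIC is hardware- or software-disabled, the
+ * checks compile down to an unconditional "return true" with no load from
+ * the kvm_lapic structure.
+ */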
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active;
+}
+
+static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
+{
+ return !is_smm(vcpu) &&
+ !kvm_x86_call(apic_init_signal_blocked)(vcpu);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap);
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+
+static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
+{
+ return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+}
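+
+/*
+ * e.g. kvm_apic_mode(APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE) is
+ * LAPIC_MODE_XAPIC, while a base with X2APIC_ENABLE set but
+ * MSR_IA32_APICBASE_ENABLE clear decodes to LAPIC_MODE_INVALID.
+ */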
+
+static inline enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_mode(vcpu->arch.apic_base);
+}
+
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
deleted file mode 100644
index b1ed0a1a5913..000000000000
--- a/arch/x86/kvm/mmu.c
+++ /dev/null
@@ -1,3442 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include "mmu.h"
-#include "x86.h"
-#include "kvm_cache_regs.h"
-
-#include <linux/kvm_host.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/swap.h>
-#include <linux/hugetlb.h>
-#include <linux/compiler.h>
-#include <linux/srcu.h>
-#include <linux/slab.h>
-
-#include <asm/page.h>
-#include <asm/cmpxchg.h>
-#include <asm/io.h>
-#include <asm/vmx.h>
-
-/*
- * When setting this variable to true it enables Two-Dimensional-Paging
- * where the hardware walks 2 page tables:
- * 1. the guest-virtual to guest-physical
- * 2. while doing 1. it walks guest-physical to host-physical
- * If the hardware supports that we don't need to do shadow paging.
- */
-bool tdp_enabled = false;
-
-#undef MMU_DEBUG
-
-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
-#ifdef MMU_DEBUG
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
-#else
-
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#if defined(MMU_DEBUG) || defined(AUDIT)
-static int dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
-#endif
-
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_LEVEL_MASK(level) \
- (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
-#define PT32_LEVEL_BITS 10
-
-#define PT32_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
-
-#define PT32_LEVEL_MASK(level) \
- (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
-#define PT32_LVL_OFFSET_MASK(level) \
- (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT32_LEVEL_BITS))) - 1))
-
-#define PT32_INDEX(address, level)\
- (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-
-
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
-#define PT64_LVL_ADDR_MASK(level) \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
- (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-
-#define PT32_BASE_ADDR_MASK PAGE_MASK
-#define PT32_DIR_BASE_ADDR_MASK \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
-#define PT32_LVL_ADDR_MASK(level) \
- (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT32_LEVEL_BITS))) - 1))
-
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
- | PT64_NX_MASK)
-
-#define RMAP_EXT 4
-
-#define ACC_EXEC_MASK 1
-#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
-#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-#include <trace/events/kvm.h>
-
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-
-struct kvm_rmap_desc {
- u64 *sptes[RMAP_EXT];
- struct kvm_rmap_desc *more;
-};
-
-struct kvm_shadow_walk_iterator {
- u64 addr;
- hpa_t shadow_addr;
- int level;
- u64 *sptep;
- unsigned index;
-};
-
-#define for_each_shadow_entry(_vcpu, _addr, _walker) \
- for (shadow_walk_init(&(_walker), _vcpu, _addr); \
- shadow_walk_okay(&(_walker)); \
- shadow_walk_next(&(_walker)))
-
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
-
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
-
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
-static u64 __read_mostly shadow_nx_mask;
-static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
-static u64 __read_mostly shadow_user_mask;
-static u64 __read_mostly shadow_accessed_mask;
-static u64 __read_mostly shadow_dirty_mask;
-
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
- shadow_trap_nonpresent_pte = trap_pte;
- shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
- shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
-{
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
-static int is_cpuid_PSE36(void)
-{
- return 1;
-}
-
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.efer & EFER_NX;
-}
-
-static int is_shadow_present_pte(u64 pte)
-{
- return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
-}
-
-static int is_large_pte(u64 pte)
-{
- return pte & PT_PAGE_SIZE_MASK;
-}
-
-static int is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
-static int is_dirty_gpte(unsigned long pte)
-{
- return pte & PT_DIRTY_MASK;
-}
-
-static int is_rmap_spte(u64 pte)
-{
- return is_shadow_present_pte(pte);
-}
-
-static int is_last_spte(u64 pte, int level)
-{
- if (level == PT_PAGE_TABLE_LEVEL)
- return 1;
- if (is_large_pte(pte))
- return 1;
- return 0;
-}
-
-static pfn_t spte_to_pfn(u64 pte)
-{
- return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
-static gfn_t pse36_gfn_delta(u32 gpte)
-{
- int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
-
- return (gpte & PT32_DIR_PSE36_MASK) << shift;
-}
-
-static void __set_spte(u64 *sptep, u64 spte)
-{
-#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
-#else
- set_64bit((unsigned long long *)sptep, spte);
-#endif
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- struct page *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- cache->objects[cache->nobjs++] = page_address(page);
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
- pte_chain_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4);
- if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
- if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
-}
-
-static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
- mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
- size_t size)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
-}
-
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
- sizeof(struct kvm_pte_chain));
-}
-
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
-{
- kfree(pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
- sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
- kfree(rd);
-}
-
-/*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
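- *
- * Worked example (hypothetical values; assumes 4K pages, so that
- * KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) == 512): for a slot with
- * base_gfn == 1600 and gfn == 2200, idx = 2200/512 - 1600/512 =
- * 4 - 3 = 1; the separate divisions keep the result correct even
- * though the slot does not start on a 512-page boundary.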
- */
-static int *slot_largepage_idx(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
-{
- unsigned long idx;
-
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
- return &slot->lpage_info[level - 2][idx].write_count;
-}
-
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- int *write_count;
- int i;
-
- gfn = unalias_gfn(kvm, gfn);
-
- slot = gfn_to_memslot_unaliased(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count += 1;
- }
-}
-
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- int *write_count;
- int i;
-
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count -= 1;
- WARN_ON(*write_count < 0);
- }
-}
-
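-/*
- * Added commentary: a non-zero write count means at least one shadow
- * page accounted by account_shadowed() lies inside this large frame,
- * so mapping the frame with a large spte would be unsafe.  Returning
- * 1 when no slot is found errs on the safe side.
- */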
-static int has_wrprotected_page(struct kvm *kvm,
- gfn_t gfn,
- int level)
-{
- struct kvm_memory_slot *slot;
- int *largepage_idx;
-
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
- if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot, level);
- return *largepage_idx;
- }
-
- return 1;
-}
-
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
-{
- unsigned long page_size;
- int i, ret = 0;
-
- page_size = kvm_host_page_size(kvm, gfn);
-
- for (i = PT_PAGE_TABLE_LEVEL;
- i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
- if (page_size >= KVM_HPAGE_SIZE(i))
- ret = i;
- else
- break;
- }
-
- return ret;
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
- struct kvm_memory_slot *slot;
- int host_level, level, max_level;
-
- slot = gfn_to_memslot(vcpu->kvm, large_gfn);
- if (slot && slot->dirty_bitmap)
- return PT_PAGE_TABLE_LEVEL;
-
- host_level = host_mapping_level(vcpu->kvm, large_gfn);
-
- if (host_level == PT_PAGE_TABLE_LEVEL)
- return host_level;
-
- max_level = kvm_x86_ops->get_lpage_level() < host_level ?
- kvm_x86_ops->get_lpage_level() : host_level;
-
- for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
- break;
-
- return level - 1;
-}
-
-/*
- * Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function is called
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
-{
- struct kvm_memory_slot *slot;
- unsigned long idx;
-
- slot = gfn_to_memslot(kvm, gfn);
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
-
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
-
- return &slot->lpage_info[level - 2][idx].rmap_pde;
-}
-
-/*
- * Reverse mapping data structures:
- *
- * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
- * that points to page_address(page).
- *
- * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
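- *
- * Example (hypothetical addresses): with a single spte at
- * 0xffff880001234560, *rmapp == 0xffff880001234560; bit zero is clear
- * because spte pointers are at least 8-byte aligned.  Once a second
- * spte is added, *rmapp == (unsigned long)desc | 1 and the two spte
- * pointers live in desc->sptes[0] and desc->sptes[1].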
- *
- * Returns the number of rmap entries before the spte was added or zero if
- * the spte was not added.
- *
- */
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
- struct kvm_mmu_page *sp;
- struct kvm_rmap_desc *desc;
- unsigned long *rmapp;
- int i, count = 0;
-
- if (!is_rmap_spte(*spte))
- return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
- sp = page_header(__pa(spte));
- sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- if (!*rmapp) {
- rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
- *rmapp = (unsigned long)spte;
- } else if (!(*rmapp & 1)) {
- rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = mmu_alloc_rmap_desc(vcpu);
- desc->sptes[0] = (u64 *)*rmapp;
- desc->sptes[1] = spte;
- *rmapp = (unsigned long)desc | 1;
- } else {
- rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (desc->sptes[RMAP_EXT-1] && desc->more) {
- desc = desc->more;
- count += RMAP_EXT;
- }
- if (desc->sptes[RMAP_EXT-1]) {
- desc->more = mmu_alloc_rmap_desc(vcpu);
- desc = desc->more;
- }
- for (i = 0; desc->sptes[i]; ++i)
- ;
- desc->sptes[i] = spte;
- }
- return count;
-}
-
-static void rmap_desc_remove_entry(unsigned long *rmapp,
- struct kvm_rmap_desc *desc,
- int i,
- struct kvm_rmap_desc *prev_desc)
-{
- int j;
-
- for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
- ;
- desc->sptes[i] = desc->sptes[j];
- desc->sptes[j] = NULL;
- if (j != 0)
- return;
- if (!prev_desc && !desc->more)
- *rmapp = (unsigned long)desc->sptes[0];
- else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- *rmapp = (unsigned long)desc->more | 1;
- mmu_free_rmap_desc(desc);
-}
-
-static void rmap_remove(struct kvm *kvm, u64 *spte)
-{
- struct kvm_rmap_desc *desc;
- struct kvm_rmap_desc *prev_desc;
- struct kvm_mmu_page *sp;
- pfn_t pfn;
- unsigned long *rmapp;
- int i;
-
- if (!is_rmap_spte(*spte))
- return;
- sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
- if (!*rmapp) {
- printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
- BUG();
- } else if (!(*rmapp & 1)) {
- rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
- if ((u64 *)*rmapp != spte) {
- printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
- spte, *spte);
- BUG();
- }
- *rmapp = 0;
- } else {
- rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- prev_desc = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
- if (desc->sptes[i] == spte) {
- rmap_desc_remove_entry(rmapp,
- desc, i,
- prev_desc);
- return;
- }
- prev_desc = desc;
- desc = desc->more;
- }
- pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
- BUG();
- }
-}
-
-static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-{
- struct kvm_rmap_desc *desc;
- u64 *prev_spte;
- int i;
-
- if (!*rmapp)
- return NULL;
- else if (!(*rmapp & 1)) {
- if (!spte)
- return (u64 *)*rmapp;
- return NULL;
- }
- desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- prev_spte = NULL;
- while (desc) {
- for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
- if (prev_spte == spte)
- return desc->sptes[i];
- prev_spte = desc->sptes[i];
- }
- desc = desc->more;
- }
- return NULL;
-}
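-
-/*
- * Illustrative sketch (added; not part of the original file): the
- * canonical iteration pattern over an rmap chain, as used by
- * rmap_write_protect() and kvm_age_rmapp() below.
- */
-static void __maybe_unused rmap_walk_example(struct kvm *kvm,
- unsigned long *rmapp)
-{
- u64 *spte = rmap_next(kvm, rmapp, NULL);
-
- while (spte) {
- /* inspect or modify *spte here */
- spte = rmap_next(kvm, rmapp, spte);
- }
-}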
-
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
-{
- unsigned long *rmapp;
- u64 *spte;
- int i, write_protected = 0;
-
- gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
-
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!spte);
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
- write_protected = 1;
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
- if (write_protected) {
- pfn_t pfn;
-
- spte = rmap_next(kvm, rmapp, NULL);
- pfn = spte_to_pfn(*spte);
- kvm_set_pfn_dirty(pfn);
- }
-
- /* check for huge page mappings */
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- rmapp = gfn_to_rmap(kvm, gfn, i);
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!spte);
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
- pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
- --kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
- spte = NULL;
- write_protected = 1;
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
- }
-
- return write_protected;
-}
-
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- u64 *spte;
- int need_tlb_flush = 0;
-
- while ((spte = rmap_next(kvm, rmapp, NULL))) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
- need_tlb_flush = 1;
- }
- return need_tlb_flush;
-}
-
-static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- int need_flush = 0;
- u64 *spte, new_spte;
- pte_t *ptep = (pte_t *)data;
- pfn_t new_pfn;
-
- WARN_ON(pte_huge(*ptep));
- new_pfn = pte_pfn(*ptep);
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- BUG_ON(!is_shadow_present_pte(*spte));
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
- need_flush = 1;
- if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
- spte = rmap_next(kvm, rmapp, NULL);
- } else {
- new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
- spte = rmap_next(kvm, rmapp, spte);
- }
- }
- if (need_flush)
- kvm_flush_remote_tlbs(kvm);
-
- return 0;
-}
-
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data))
-{
- int i, j;
- int ret;
- int retval = 0;
- struct kvm_memslots *slots;
-
- slots = kvm_memslots(kvm);
-
- for (i = 0; i < slots->nmemslots; i++) {
- struct kvm_memory_slot *memslot = &slots->memslots[i];
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
-
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-
- ret = handler(kvm, &memslot->rmap[gfn_offset], data);
-
- for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- int idx = gfn_offset;
- idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm,
- &memslot->lpage_info[j][idx].rmap_pde,
- data);
- }
- trace_kvm_age_page(hva, memslot, ret);
- retval |= ret;
- }
- }
-
- return retval;
-}
-
-int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
-}
-
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
- kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
-}
-
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
-{
- u64 *spte;
- int young = 0;
-
- /*
- * Emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask)
- return kvm_unmap_rmapp(kvm, rmapp, data);
-
- spte = rmap_next(kvm, rmapp, NULL);
- while (spte) {
- int _young;
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- _young = _spte & PT_ACCESSED_MASK;
- if (_young) {
- young = 1;
- clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
- }
- spte = rmap_next(kvm, rmapp, spte);
- }
- return young;
-}
-
-#define RMAP_RECYCLE_THRESHOLD 1000
-
-static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *sp;
-
- sp = page_header(__pa(spte));
-
- gfn = unalias_gfn(vcpu->kvm, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-
- kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
- kvm_flush_remote_tlbs(vcpu->kvm);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
-{
- return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
-}
-
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
-{
- u64 *pos;
- u64 *end;
-
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (is_shadow_present_pte(*pos)) {
- printk(KERN_ERR "%s: %p %llx\n", __func__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
-#endif
-
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- ASSERT(is_empty_shadow_page(sp->spt));
- list_del(&sp->link);
- __free_page(virt_to_page(sp->spt));
- __free_page(virt_to_page(sp->gfns));
- kfree(sp);
- ++kvm->arch.n_free_mmu_pages;
-}
-
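-/*
- * Added commentary: a simple power-of-two hash over the low gfn bits.
- * E.g. assuming KVM_MMU_HASH_SHIFT == 10, gfn 0x12345 hashes to
- * 0x12345 & 0x3ff == 0x345.
- */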
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
-{
- return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
-}
-
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
-{
- struct kvm_mmu_page *sp;
-
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
- list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- sp->multimapped = 0;
- sp->parent_pte = parent_pte;
- --vcpu->kvm->arch.n_free_mmu_pages;
- return sp;
-}
-
-static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!parent_pte)
- return;
- if (!sp->multimapped) {
- u64 *old = sp->parent_pte;
-
- if (!old) {
- sp->parent_pte = parent_pte;
- return;
- }
- sp->multimapped = 1;
- pte_chain = mmu_alloc_pte_chain(vcpu);
- INIT_HLIST_HEAD(&sp->parent_ptes);
- hlist_add_head(&pte_chain->link, &sp->parent_ptes);
- pte_chain->parent_ptes[0] = old;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
- if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
- continue;
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
- if (!pte_chain->parent_ptes[i]) {
- pte_chain->parent_ptes[i] = parent_pte;
- return;
- }
- }
- pte_chain = mmu_alloc_pte_chain(vcpu);
- BUG_ON(!pte_chain);
- hlist_add_head(&pte_chain->link, &sp->parent_ptes);
- pte_chain->parent_ptes[0] = parent_pte;
-}
-
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
- u64 *parent_pte)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!sp->multimapped) {
- BUG_ON(sp->parent_pte != parent_pte);
- sp->parent_pte = NULL;
- return;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- if (pte_chain->parent_ptes[i] != parent_pte)
- continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES
- && pte_chain->parent_ptes[i + 1]) {
- pte_chain->parent_ptes[i]
- = pte_chain->parent_ptes[i + 1];
- ++i;
- }
- pte_chain->parent_ptes[i] = NULL;
- if (i == 0) {
- hlist_del(&pte_chain->link);
- mmu_free_pte_chain(pte_chain);
- if (hlist_empty(&sp->parent_ptes)) {
- sp->multimapped = 0;
- sp->parent_pte = NULL;
- }
- }
- return;
- }
- BUG();
-}
-
-
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- struct kvm_mmu_page *parent_sp;
- int i;
-
- if (!sp->multimapped && sp->parent_pte) {
- parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
- return;
- }
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
- }
-}
-
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
-{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
-}
-
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!sp->parent_pte)
- return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
- return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
-}
-
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- return 1;
-}
-
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
-{
-}
-
-#define KVM_PAGE_ARRAY_NR 16
-
-struct kvm_mmu_pages {
- struct mmu_page_and_offset {
- struct kvm_mmu_page *sp;
- unsigned int idx;
- } page[KVM_PAGE_ARRAY_NR];
- unsigned int nr;
-};
-
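-/*
- * Added commentary: the bitmap carries one bit per spte slot of a
- * shadow page; 512 == PT64_ENT_PER_PAGE (PAGE_SIZE / sizeof(u64)).
- */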
-#define for_each_unsync_children(bitmap, idx) \
- for (idx = find_first_bit(bitmap, 512); \
- idx < 512; \
- idx = find_next_bit(bitmap, 512, idx+1))
-
-static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
- int idx)
-{
- int i;
-
- if (sp->unsync)
- for (i=0; i < pvec->nr; i++)
- if (pvec->page[i].sp == sp)
- return 0;
-
- pvec->page[pvec->nr].sp = sp;
- pvec->page[pvec->nr].idx = idx;
- pvec->nr++;
- return (pvec->nr == KVM_PAGE_ARRAY_NR);
-}
-
-static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_mmu_pages *pvec)
-{
- int i, ret, nr_unsync_leaf = 0;
-
- for_each_unsync_children(sp->unsync_child_bitmap, i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- __clear_bit(i, sp->unsync_child_bitmap);
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- }
-
- if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- }
- }
- }
-
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
- sp->unsync_children = 0;
-
- return nr_unsync_leaf;
-}
-
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_mmu_pages *pvec)
-{
- if (!sp->unsync_children)
- return 0;
-
- mmu_pages_add(pvec, sp, 0);
- return __mmu_unsync_walk(sp, pvec);
-}
-
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: found role %x\n",
- __func__, sp->role.word);
- return sp;
- }
- return NULL;
-}
-
-static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- WARN_ON(!sp->unsync);
- trace_kvm_mmu_sync_page(sp);
- sp->unsync = 0;
- --kvm->stat.mmu_unsync;
-}
-
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
- return 1;
- }
-
- if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
- return 1;
- }
-
- kvm_mmu_flush_tlb(vcpu);
- return 0;
-}
-
-struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- unsigned int idx[PT64_ROOT_LEVEL-1];
-};
-
-#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_next(&pvec, &parents, -1), \
- sp = pvec.page[i].sp; \
- i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
- i = mmu_pages_next(&pvec, &parents, i))
-
-static int mmu_pages_next(struct kvm_mmu_pages *pvec,
- struct mmu_page_path *parents,
- int i)
-{
- int n;
-
- for (n = i+1; n < pvec->nr; n++) {
- struct kvm_mmu_page *sp = pvec->page[n].sp;
-
- if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- parents->idx[0] = pvec->page[n].idx;
- return n;
- }
-
- parents->parent[sp->role.level-2] = sp;
- parents->idx[sp->role.level-1] = pvec->page[n].idx;
- }
-
- return n;
-}
-
-static void mmu_pages_clear_parents(struct mmu_page_path *parents)
-{
- struct kvm_mmu_page *sp;
- unsigned int level = 0;
-
- do {
- unsigned int idx = parents->idx[level];
-
- sp = parents->parent[level];
- if (!sp)
- return;
-
- --sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
- __clear_bit(idx, sp->unsync_child_bitmap);
- level++;
- } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- struct mmu_page_path *parents,
- struct kvm_mmu_pages *pvec)
-{
- parents->parent[parent->role.level-1] = NULL;
- pvec->nr = 0;
-}
-
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *parent)
-{
- int i;
- struct kvm_mmu_page *sp;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
-
- for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
-
- if (protected)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
- mmu_pages_clear_parents(&parents);
- }
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
- }
-}
-
-static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
- gfn_t gfn,
- gva_t gaddr,
- unsigned level,
- int direct,
- unsigned access,
- u64 *parent_pte)
-{
- union kvm_mmu_page_role role;
- unsigned index;
- unsigned quadrant;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *tmp;
-
- role = vcpu->arch.mmu.base_role;
- role.level = level;
- role.direct = direct;
- if (role.direct)
- role.cr4_pae = 0;
- role.access = access;
- if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
- quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
- quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
- role.quadrant = quadrant;
- }
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
- if (sp->gfn == gfn) {
- if (sp->unsync)
- if (kvm_sync_page(vcpu, sp))
- continue;
-
- if (sp->role.word != role.word)
- continue;
-
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(sp);
- }
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
- ++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte);
- if (!sp)
- return sp;
- sp->gfn = gfn;
- sp->role = role;
- hlist_add_head(&sp->hash_link, bucket);
- if (!direct) {
- if (rmap_write_protect(vcpu->kvm, gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- account_shadowed(vcpu->kvm, gfn);
- }
- if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
- else
- nonpaging_prefetch_page(vcpu, sp);
- trace_kvm_mmu_get_page(sp, true);
- return sp;
-}
-
-static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
- struct kvm_vcpu *vcpu, u64 addr)
-{
- iterator->addr = addr;
- iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
- iterator->level = vcpu->arch.mmu.shadow_root_level;
- if (iterator->level == PT32E_ROOT_LEVEL) {
- iterator->shadow_addr
- = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
- iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
- --iterator->level;
- if (!iterator->shadow_addr)
- iterator->level = 0;
- }
-}
-
-static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
-{
- if (iterator->level < PT_PAGE_TABLE_LEVEL)
- return false;
-
- if (iterator->level == PT_PAGE_TABLE_LEVEL)
- if (is_large_pte(*iterator->sptep))
- return false;
-
- iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
- iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
- return true;
-}
-
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
-{
- iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
- --iterator->level;
-}
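-
-/*
- * Added commentary: shadow_walk_init/okay/next back the
- * for_each_shadow_entry() loop used by __direct_map() below,
- * descending one paging level per iteration from the shadow root.
- */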
-
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *sp)
-{
- unsigned i;
- u64 *pt;
- u64 ent;
-
- pt = sp->spt;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- ent = pt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent),
- &pt[i]);
- } else {
- if (is_large_pte(ent))
- --kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
- }
- }
- pt[i] = shadow_trap_nonpresent_pte;
- }
-}
-
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
- int i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- vcpu->arch.last_pte_updated = NULL;
-}
-
-static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- u64 *parent_pte;
-
- while (sp->multimapped || sp->parent_pte) {
- if (!sp->multimapped)
- parent_pte = sp->parent_pte;
- else {
- struct kvm_pte_chain *chain;
-
- chain = container_of(sp->parent_ptes.first,
- struct kvm_pte_chain, link);
- parent_pte = chain->parent_ptes[0];
- }
- BUG_ON(!parent_pte);
- kvm_mmu_put_page(sp, parent_pte);
- __set_spte(parent_pte, shadow_trap_nonpresent_pte);
- }
-}
-
-static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent)
-{
- int i, zapped = 0;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
-
- if (parent->role.level == PT_PAGE_TABLE_LEVEL)
- return 0;
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- struct kvm_mmu_page *sp;
-
- for_each_sp(pages, sp, parents, i) {
- kvm_mmu_zap_page(kvm, sp);
- mmu_pages_clear_parents(&parents);
- zapped++;
- }
- kvm_mmu_pages_init(parent, &parents, &pages);
- }
-
- return zapped;
-}
-
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- int ret;
-
- trace_kvm_mmu_zap_page(sp);
- ++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp);
- kvm_mmu_page_unlink_children(kvm, sp);
- kvm_mmu_unlink_parents(kvm, sp);
- kvm_flush_remote_tlbs(kvm);
- if (!sp->role.invalid && !sp->role.direct)
- unaccount_shadowed(kvm, sp->gfn);
- if (sp->unsync)
- kvm_unlink_unsync_page(kvm, sp);
- if (!sp->root_count) {
- hlist_del(&sp->hash_link);
- kvm_mmu_free_page(kvm, sp);
- } else {
- sp->role.invalid = 1;
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_reload_remote_mmus(kvm);
- }
- kvm_mmu_reset_last_pte_updated(kvm);
- return ret;
-}
-
-/*
- * Change the number of mmu pages allocated to the vm.
- * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
- */
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
-{
- int used_pages;
-
- used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
- used_pages = max(0, used_pages);
-
- /*
- * If we set the number of mmu pages to be smaller than the
- * number of active pages, we must free some mmu pages before we
- * change the value.
- */
-
- if (used_pages > kvm_nr_mmu_pages) {
- while (used_pages > kvm_nr_mmu_pages &&
- !list_empty(&kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *page;
-
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- used_pages -= kvm_mmu_zap_page(kvm, page);
- used_pages--;
- }
- kvm_nr_mmu_pages = used_pages;
- kvm->arch.n_free_mmu_pages = 0;
- }
- else
- kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
- - kvm->arch.n_alloc_mmu_pages;
-
- kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
-}
-
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- int r;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- r = 0;
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
- return r;
-}
-
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *nn;
-
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: zap %lx %x\n",
- __func__, gfn, sp->role.word);
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
- }
-}
-
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
- int slot = memslot_id(kvm, gfn);
- struct kvm_mmu_page *sp = page_header(__pa(pte));
-
- __set_bit(slot, sp->slot_bitmap);
-}
-
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
- int i;
- u64 *pt = sp->spt;
-
- if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] == shadow_notrap_nonpresent_pte)
- __set_spte(&pt[i], shadow_trap_nonpresent_pte);
- }
-}
-
-/*
- * The function is based on mtrr_type_lookup() in
- * arch/x86/kernel/cpu/mtrr/generic.c
- */
-static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
- u64 start, u64 end)
-{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
- int num_var_ranges = KVM_NR_VAR_MTRR;
-
- if (!mtrr_state->enabled)
- return 0xFF;
-
- /* Make end inclusive instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
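- /*
- * Added commentary on the fixed-range layout (per the x86 MTRR
- * architecture): idx 0-7 cover 0-0x7ffff in 64K chunks, idx 8-23
- * cover 0x80000-0xbffff in 16K chunks, and idx 24-87 cover
- * 0xc0000-0xfffff in 4K chunks.  E.g. start == 0xa0000 gives
- * idx = 8 + (0x20000 >> 14) == 16.
- */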
- if (mtrr_state->have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state->fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges.  Look for multiple ranges matching
- * this address and pick the type as per MTRR precedence.
- */
- if (!(mtrr_state->enabled & 2))
- return mtrr_state->def_type;
-
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
-
- if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
- continue;
-
- base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
- (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
- (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
-
- if ((start & mask) != (base & mask))
- continue;
-
- curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
- }
-
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if (prev_match != 0xFF)
- return prev_match;
-
- return mtrr_state->def_type;
-}
-
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u8 mtrr;
-
- mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
- (gfn << PAGE_SHIFT) + PAGE_SIZE);
- if (mtrr == 0xfe || mtrr == 0xff)
- mtrr = MTRR_TYPE_WRBACK;
- return mtrr;
-}
-EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *s;
- struct hlist_node *node, *n;
-
- index = kvm_page_table_hashfn(sp->gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- /* don't unsync if pagetable is shadowed with multiple roles */
- hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.direct)
- continue;
- if (s->role.word != sp->role.word)
- return 1;
- }
- trace_kvm_mmu_unsync_page(sp);
- ++vcpu->kvm->stat.mmu_unsync;
- sp->unsync = 1;
-
- kvm_mmu_mark_parents_unsync(sp);
-
- mmu_convert_notrap(sp);
- return 0;
-}
-
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
-{
- struct kvm_mmu_page *shadow;
-
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
- if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
- return 1;
- if (shadow->unsync)
- return 0;
- if (can_unsync && oos_shadow)
- return kvm_unsync_page(vcpu, shadow);
- return 1;
- }
- return 0;
-}
-
-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int user_fault,
- int write_fault, int dirty, int level,
- gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
-{
- u64 spte;
- int ret = 0;
-
- /*
- * We don't set the accessed bit, since we sometimes want to see
- * whether the guest actually used the pte (in order to detect
- * demand paging).
- */
- spte = shadow_base_present_pte | shadow_dirty_mask;
- if (!speculative)
- spte |= shadow_accessed_mask;
- if (!dirty)
- pte_access &= ~ACC_WRITE_MASK;
- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
- if (level > PT_PAGE_TABLE_LEVEL)
- spte |= PT_PAGE_SIZE_MASK;
- if (tdp_enabled)
- spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
-
- if (reset_host_protection)
- spte |= SPTE_HOST_WRITEABLE;
-
- spte |= (u64)pfn << PAGE_SHIFT;
-
- if ((pte_access & ACC_WRITE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-
- if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu->kvm, gfn, level)) {
- ret = 1;
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
- }
-
- spte |= PT_WRITABLE_MASK;
-
- if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
- spte &= ~PT_USER_MASK;
-
- /*
- * Optimization: for pte sync, if spte was writable the hash
- * lookup is unnecessary (and expensive). Write protection
- * is the responsibility of mmu_get_page / kvm_sync_page.
- * Same reasoning can be applied to dirty page accounting.
- */
- if (!can_unsync && is_writable_pte(*sptep))
- goto set_pte;
-
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %lx, marking ro\n",
- __func__, gfn);
- ret = 1;
- pte_access &= ~ACC_WRITE_MASK;
- if (is_writable_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
- }
- }
-
- if (pte_access & ACC_WRITE_MASK)
- mark_page_dirty(vcpu->kvm, gfn);
-
-set_pte:
- __set_spte(sptep, spte);
- return ret;
-}
-
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int level, gfn_t gfn,
- pfn_t pfn, bool speculative,
- bool reset_host_protection)
-{
- int was_rmapped = 0;
- int was_writable = is_writable_pte(*sptep);
- int rmap_count;
-
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
- __func__, *sptep, pt_access,
- write_fault, user_fault, gfn);
-
- if (is_rmap_spte(*sptep)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (level > PT_PAGE_TABLE_LEVEL &&
- !is_large_pte(*sptep)) {
- struct kvm_mmu_page *child;
- u64 pte = *sptep;
-
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- } else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- } else
- was_rmapped = 1;
- }
-
- if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
- if (write_fault)
- *ptwrite = 1;
- kvm_x86_ops->tlb_flush(vcpu);
- }
-
- pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
- *sptep, sptep);
- if (!was_rmapped && is_large_pte(*sptep))
- ++vcpu->kvm->stat.lpages;
-
- page_header_update_slot(vcpu->kvm, sptep, gfn);
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- kvm_release_pfn_clean(pfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
- } else {
- if (was_writable)
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
- }
- if (speculative) {
- vcpu->arch.last_pte_updated = sptep;
- vcpu->arch.last_pte_gfn = gfn;
- }
-}
-
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int level, gfn_t gfn, pfn_t pfn)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- int pt_write = 0;
- gfn_t pseudo_gfn;
-
- for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
- if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write,
- level, gfn, pfn, false, true);
- ++vcpu->stat.pf_fixed;
- break;
- }
-
- if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1,
- 1, ACC_ALL, iterator.sptep);
- if (!sp) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
-
- __set_spte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
- }
- }
- return pt_write;
-}
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
-{
- int r;
- int level;
- pfn_t pfn;
- unsigned long mmu_seq;
-
- level = mapping_level(vcpu, gfn);
-
- /*
- * This path builds a PAE pagetable - so we can map 2mb pages at
- * maximum. Therefore check if the level is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
-
- /* mmio */
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, level, gfn, pfn);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
-
- return r;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-
-static void mmu_free_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- spin_unlock(&vcpu->kvm->mmu_lock);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
- }
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
- }
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-}
-
-static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
-{
- int ret = 0;
-
- if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
- ret = 1;
- }
-
- return ret;
-}
-
-static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- gfn_t root_gfn;
- struct kvm_mmu_page *sp;
- int direct = 0;
- u64 pdptr;
-
- root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
-
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
-
- ASSERT(!VALID_PAGE(root));
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- if (tdp_enabled) {
- direct = 1;
- root_gfn = 0;
- }
- spin_lock(&vcpu->kvm->mmu_lock);
- sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, direct,
- ACC_ALL, NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
- vcpu->arch.mmu.root_hpa = root;
- return 0;
- }
- direct = !is_paging(vcpu);
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- ASSERT(!VALID_PAGE(root));
- if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = kvm_pdptr_read(vcpu, i);
- if (!is_present_gpte(pdptr)) {
- vcpu->arch.mmu.pae_root[i] = 0;
- continue;
- }
- root_gfn = pdptr >> PAGE_SHIFT;
- } else if (vcpu->arch.mmu.root_level == 0)
- root_gfn = 0;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
- if (tdp_enabled) {
- direct = 1;
- root_gfn = i << 30;
- }
- spin_lock(&vcpu->kvm->mmu_lock);
- sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, direct,
- ACC_ALL, NULL);
- root = __pa(sp->spt);
- ++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
- }
- vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
- return 0;
-}
-
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- }
- }
-}
-
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
-{
- if (error)
- *error = 0;
- return vaddr;
-}
-
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
-{
- gfn_t gfn;
- int r;
-
- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- gfn = gva >> PAGE_SHIFT;
-
- return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn);
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
- u32 error_code)
-{
- pfn_t pfn;
- int r;
- int level;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- unsigned long mmu_seq;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- level = mapping_level(vcpu, gfn);
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- level, gfn, pfn);
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return r;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = nonpaging_page_fault;
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
-}
-
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
-{
- pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
- mmu_free_roots(vcpu);
-}
-
-static void inject_page_fault(struct kvm_vcpu *vcpu,
- u64 addr,
- u32 err_code)
-{
- kvm_inject_page_fault(vcpu, addr, err_code);
-}
-
-static void paging_free(struct kvm_vcpu *vcpu)
-{
- nonpaging_free(vcpu);
-}
-
-static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
-}
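-
-/*
- * Added commentary: bit 7 (PS) selects the large-page row of the
- * rsvd_bits_mask table.  For level-1 PTEs, where bit 7 is PAT rather
- * than PS, reset_rsvds_bits_mask() sets rows [0][0] and [1][0]
- * identically, so the indexing stays correct either way.
- */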
-
-#define PTTYPE 64
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-#define PTTYPE 32
-#include "paging_tmpl.h"
-#undef PTTYPE
-
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
- u64 exb_bit_rsvd = 0;
-
- if (!is_nx(vcpu))
- exb_bit_rsvd = rsvd_bits(63, 63);
- switch (level) {
- case PT32_ROOT_LEVEL:
- /* no rsvd bits for 2 level 4K page table entries */
- context->rsvd_bits_mask[0][1] = 0;
- context->rsvd_bits_mask[0][0] = 0;
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
-
- if (!is_pse(vcpu)) {
- context->rsvd_bits_mask[1][1] = 0;
- break;
- }
-
- if (is_cpuid_PSE36())
- /* 36bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
- else
- /* 32 bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
- break;
- case PT32E_ROOT_LEVEL:
- context->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PDE */
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PTE */
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62) |
- rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
- break;
- case PT64_ROOT_LEVEL:
- context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
- context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 29);
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
- break;
- }
-}
-
-static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- ASSERT(is_pae(vcpu));
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging64_page_fault;
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->prefetch_page = paging64_prefetch_page;
- context->sync_page = paging64_sync_page;
- context->invlpg = paging64_invlpg;
- context->free = paging_free;
- context->root_level = level;
- context->shadow_root_level = level;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging64_init_context(struct kvm_vcpu *vcpu)
-{
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
-}
-
-static int paging32_init_context(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
- context->new_cr3 = paging_new_cr3;
- context->page_fault = paging32_page_fault;
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->free = paging_free;
- context->prefetch_page = paging32_prefetch_page;
- context->sync_page = paging32_sync_page;
- context->invlpg = paging32_invlpg;
- context->root_level = PT32_ROOT_LEVEL;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = INVALID_PAGE;
- return 0;
-}
-
-static int paging32E_init_context(struct kvm_vcpu *vcpu)
-{
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
- return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
-}
-
-static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu *context = &vcpu->arch.mmu;
-
- context->new_cr3 = nonpaging_new_cr3;
- context->page_fault = tdp_page_fault;
- context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = nonpaging_invlpg;
- context->shadow_root_level = kvm_x86_ops->get_tdp_level();
- context->root_hpa = INVALID_PAGE;
-
- if (!is_paging(vcpu)) {
- context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->root_level = 0;
- } else if (is_long_mode(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->root_level = PT64_ROOT_LEVEL;
- } else if (is_pae(vcpu)) {
- reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
- context->gva_to_gpa = paging64_gva_to_gpa;
- context->root_level = PT32E_ROOT_LEVEL;
- } else {
- reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
- context->gva_to_gpa = paging32_gva_to_gpa;
- context->root_level = PT32_ROOT_LEVEL;
- }
-
- return 0;
-}
-
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
-{
- int r;
-
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- if (!is_paging(vcpu))
- r = nonpaging_init_context(vcpu);
- else if (is_long_mode(vcpu))
- r = paging64_init_context(vcpu);
- else if (is_pae(vcpu))
- r = paging32E_init_context(vcpu);
- else
- r = paging32_init_context(vcpu);
-
- vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
-
- return r;
-}
-
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.update_pte.pfn = bad_pfn;
-
- if (tdp_enabled)
- return init_kvm_tdp_mmu(vcpu);
- else
- return init_kvm_softmmu(vcpu);
-}
-
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
- vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
-}
-
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
- destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
-
-int kvm_mmu_load(struct kvm_vcpu *vcpu)
-{
- int r;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
- r = mmu_alloc_roots(vcpu);
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_sync_roots(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
- if (r)
- goto out;
- /* set_cr3() should ensure TLB has been flushed */
- kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_load);
-
-void kvm_mmu_unload(struct kvm_vcpu *vcpu)
-{
- mmu_free_roots(vcpu);
-}
-
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte)
-{
- u64 pte;
- struct kvm_mmu_page *child;
-
- pte = *spte;
- if (is_shadow_present_pte(pte)) {
- if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
- else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, spte);
- }
- }
- __set_spte(spte, shadow_trap_nonpresent_pte);
- if (is_large_pte(pte))
- --vcpu->kvm->stat.lpages;
-}
-
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp,
- u64 *spte,
- const void *new)
-{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- if (!sp->role.cr4_pae)
- paging32_update_pte(vcpu, sp, spte, new);
- else
- paging64_update_pte(vcpu, sp, spte, new);
-}
-
-static bool need_remote_flush(u64 old, u64 new)
-{
- if (!is_shadow_present_pte(old))
- return false;
- if (!is_shadow_present_pte(new))
- return true;
- if ((old ^ new) & PT64_BASE_ADDR_MASK)
- return true;
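- /*
- * Added commentary: NX is inverted (set means less permission), so
- * flip it in both values; the final test then uniformly asks whether
- * the old spte granted a permission the new one lacks.
- */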
- old ^= PT64_NX_MASK;
- new ^= PT64_NX_MASK;
- return (old & ~new & PT64_PERM_MASK) != 0;
-}
-
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
-{
- if (need_remote_flush(old, new))
- kvm_flush_remote_tlbs(vcpu->kvm);
- else
- kvm_mmu_flush_tlb(vcpu);
-}
-
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
- u64 *spte = vcpu->arch.last_pte_updated;
-
- return !!(spte && (*spte & shadow_accessed_mask));
-}
-
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- u64 gpte)
-{
- gfn_t gfn;
- pfn_t pfn;
-
- if (!is_present_gpte(gpte))
- return;
- gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-
- vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
- vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.pfn = pfn;
-}
-
-static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u64 *spte = vcpu->arch.last_pte_updated;
-
- if (spte
- && vcpu->arch.last_pte_gfn == gfn
- && shadow_accessed_mask
- && !(*spte & shadow_accessed_mask)
- && is_shadow_present_pte(*spte))
- set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
-}
-
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- bool guest_initiated)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
- u64 entry, gentry;
- u64 *spte;
- unsigned offset = offset_in_page(gpa);
- unsigned pte_size;
- unsigned page_offset;
- unsigned misaligned;
- unsigned quadrant;
- int level;
- int flooded = 0;
- int npte;
- int r;
- int invlpg_counter;
-
- pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-
- invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
-
- /*
- * Assume that the pte write is on a page table of the same type
- * as the current vcpu paging mode. This is nearly always true
- * (might be false while changing modes). Note it is verified later
- * by update_pte().
- */
- if ((is_pae(vcpu) && bytes == 4) || !new) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if (is_pae(vcpu)) {
- gpa &= ~(gpa_t)7;
- bytes = 8;
- }
- r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
- if (r)
- gentry = 0;
- new = (const u8 *)&gentry;
- }
-
- switch (bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
- }
-
- mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
- spin_lock(&vcpu->kvm->mmu_lock);
- if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
- gentry = 0;
- kvm_mmu_access_page(vcpu, gfn);
- kvm_mmu_free_some_pages(vcpu);
- ++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, "pre pte write");
- if (guest_initiated) {
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
- }
- }
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
- continue;
- pte_size = sp->role.cr4_pae ? 8 : 4;
- misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- misaligned |= bytes < 4;
- if (misaligned || flooded) {
- /*
- * Misaligned accesses are too much trouble to fix
- * up; also, they usually indicate a page is not used
- * as a page table.
- *
- * If we're seeing too many writes to a page,
- * it may no longer be a page table, or we may be
- * forking, in which case it is better to unmap the
- * page.
- */
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, sp->role.word);
- if (kvm_mmu_zap_page(vcpu->kvm, sp))
- goto restart;
- ++vcpu->kvm->stat.mmu_flooded;
- continue;
- }
- page_offset = offset;
- level = sp->role.level;
- npte = 1;
- if (!sp->role.cr4_pae) {
- page_offset <<= 1; /* 32->64 */
- /*
- * A 32-bit pde maps 4MB while the shadow pdes map
- * only 2MB. So we need to double the offset again
- * and zap two pdes instead of one.
- */
- if (level == PT32_ROOT_LEVEL) {
- page_offset &= ~7; /* kill rounding error */
- page_offset <<= 1;
- npte = 2;
- }
- quadrant = page_offset >> PAGE_SHIFT;
- page_offset &= ~PAGE_MASK;
- if (quadrant != sp->role.quadrant)
- continue;
- }
- spte = &sp->spt[page_offset / sizeof(*spte)];
- while (npte--) {
- entry = *spte;
- mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (gentry)
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- mmu_pte_write_flush_tlb(vcpu, entry, *spte);
- ++spte;
- }
- }
- kvm_mmu_audit(vcpu, "post pte write");
- spin_unlock(&vcpu->kvm->mmu_lock);
- if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
- kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
- vcpu->arch.update_pte.pfn = bad_pfn;
- }
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (tdp_enabled)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- spin_lock(&vcpu->kvm->mmu_lock);
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- spin_unlock(&vcpu->kvm->mmu_lock);
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
- !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
- struct kvm_mmu_page *sp;
-
- sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
- ++vcpu->kvm->stat.mmu_recycled;
- }
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
-{
- int r;
- enum emulation_result er;
-
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
- if (r < 0)
- goto out;
-
- if (!r) {
- r = 1;
- goto out;
- }
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
-
- er = emulate_instruction(vcpu, cr2, error_code, 0);
-
- switch (er) {
- case EMULATE_DONE:
- return 1;
- case EMULATE_DO_MMIO:
- ++vcpu->stat.mmio_exits;
- return 0;
- case EMULATE_FAIL:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- default:
- BUG();
- }
-out:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-
-void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
-{
- vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_mmu_flush_tlb(vcpu);
- ++vcpu->stat.invlpg;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
-
-void kvm_enable_tdp(void)
-{
- tdp_enabled = true;
-}
-EXPORT_SYMBOL_GPL(kvm_enable_tdp);
-
-void kvm_disable_tdp(void)
-{
- tdp_enabled = false;
-}
-EXPORT_SYMBOL_GPL(kvm_disable_tdp);
-
-static void free_mmu_pages(struct kvm_vcpu *vcpu)
-{
- free_page((unsigned long)vcpu->arch.mmu.pae_root);
-}
-
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int i;
-
- ASSERT(vcpu);
-
- /*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
- * Therefore we need to allocate shadow page tables in the first
- * 4GB of memory, which happens to fit the DMA32 zone.
- */
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
- if (!page)
- return -ENOMEM;
-
- vcpu->arch.mmu.pae_root = page_address(page);
- for (i = 0; i < 4; ++i)
- vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
-
- return 0;
-}
-
-int kvm_mmu_create(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- return alloc_mmu_pages(vcpu);
-}
-
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
-
- return init_kvm_mmu(vcpu);
-}
-
-void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
-{
- ASSERT(vcpu);
-
- destroy_kvm_mmu(vcpu);
- free_mmu_pages(vcpu);
- mmu_free_memory_caches(vcpu);
-}
-
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
- int i;
- u64 *pt;
-
- if (!test_bit(slot, sp->slot_bitmap))
- continue;
-
- pt = sp->spt;
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- /* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
- pt[i] &= ~PT_WRITABLE_MASK;
- }
- kvm_flush_remote_tlbs(kvm);
-}
-
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
- struct kvm_mmu_page *sp, *node;
-
- spin_lock(&kvm->mmu_lock);
-restart:
- list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
-
- spin_unlock(&kvm->mmu_lock);
-
- kvm_flush_remote_tlbs(kvm);
-}
-
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
-{
- struct kvm_mmu_page *page;
-
- page = container_of(kvm->arch.active_mmu_pages.prev,
- struct kvm_mmu_page, link);
- return kvm_mmu_zap_page(kvm, page) + 1;
-}
-
-static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
-{
- struct kvm *kvm;
- struct kvm *kvm_freed = NULL;
- int cache_count = 0;
-
- spin_lock(&kvm_lock);
-
- list_for_each_entry(kvm, &vm_list, vm_list) {
- int npages, idx, freed_pages;
-
- idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
- npages = kvm->arch.n_alloc_mmu_pages -
- kvm->arch.n_free_mmu_pages;
- cache_count += npages;
- if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
- cache_count -= freed_pages;
- kvm_freed = kvm;
- }
- nr_to_scan--;
-
- spin_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
- }
- if (kvm_freed)
- list_move_tail(&kvm_freed->vm_list, &vm_list);
-
- spin_unlock(&kvm_lock);
-
- return cache_count;
-}
-
-static struct shrinker mmu_shrinker = {
- .shrink = mmu_shrink,
- .seeks = DEFAULT_SEEKS * 10,
-};
-
-static void mmu_destroy_caches(void)
-{
- if (pte_chain_cache)
- kmem_cache_destroy(pte_chain_cache);
- if (rmap_desc_cache)
- kmem_cache_destroy(rmap_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
-}
-
-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- unregister_shrinker(&mmu_shrinker);
-}
-
-int kvm_mmu_module_init(void)
-{
- pte_chain_cache = kmem_cache_create("kvm_pte_chain",
- sizeof(struct kvm_pte_chain),
- 0, 0, NULL);
- if (!pte_chain_cache)
- goto nomem;
- rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
- sizeof(struct kvm_rmap_desc),
- 0, 0, NULL);
- if (!rmap_desc_cache)
- goto nomem;
-
- mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
- sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
- if (!mmu_page_header_cache)
- goto nomem;
-
- register_shrinker(&mmu_shrinker);
-
- return 0;
-
-nomem:
- mmu_destroy_caches();
- return -ENOMEM;
-}
-
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
-{
- int i;
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
- struct kvm_memslots *slots;
-
- slots = kvm_memslots(kvm);
-
- for (i = 0; i < slots->nmemslots; i++)
- nr_pages += slots->memslots[i].npages;
-
- nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
-
- return nr_mmu_pages;
-}
-
-static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
- unsigned len)
-{
- if (len > buffer->len)
- return NULL;
- return buffer->ptr;
-}
-
-static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
- unsigned len)
-{
- void *ret;
-
- ret = pv_mmu_peek_buffer(buffer, len);
- if (!ret)
- return ret;
- buffer->ptr += len;
- buffer->len -= len;
- buffer->processed += len;
- return ret;
-}
-
-static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
- gpa_t addr, gpa_t value)
-{
- int bytes = 8;
- int r;
-
- if (!is_long_mode(vcpu) && !is_pae(vcpu))
- bytes = 4;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- if (!emulator_write_phys(vcpu, addr, &value, bytes))
- return -EFAULT;
-
- return 1;
-}
-
-static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- kvm_set_cr3(vcpu, vcpu->arch.cr3);
- return 1;
-}
-
-static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
- spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
-}
-
-static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
- struct kvm_pv_mmu_op_buffer *buffer)
-{
- struct kvm_mmu_op_header *header;
-
- header = pv_mmu_peek_buffer(buffer, sizeof *header);
- if (!header)
- return 0;
- switch (header->op) {
- case KVM_MMU_OP_WRITE_PTE: {
- struct kvm_mmu_op_write_pte *wpte;
-
- wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
- if (!wpte)
- return 0;
- return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
- wpte->pte_val);
- }
- case KVM_MMU_OP_FLUSH_TLB: {
- struct kvm_mmu_op_flush_tlb *ftlb;
-
- ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
- if (!ftlb)
- return 0;
- return kvm_pv_mmu_flush_tlb(vcpu);
- }
- case KVM_MMU_OP_RELEASE_PT: {
- struct kvm_mmu_op_release_pt *rpt;
-
- rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
- if (!rpt)
- return 0;
- return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
- }
- default: return 0;
- }
-}
-
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
- gpa_t addr, unsigned long *ret)
-{
- int r;
- struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
-
- buffer->ptr = buffer->buf;
- buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
- buffer->processed = 0;
-
- r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
- if (r)
- goto out;
-
- while (buffer->len) {
- r = kvm_pv_mmu_op_one(vcpu, buffer);
- if (r < 0)
- goto out;
- if (r == 0)
- break;
- }
-
- r = 1;
-out:
- *ret = buffer->processed;
- return r;
-}
-
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
-{
- struct kvm_shadow_walk_iterator iterator;
- int nr_sptes = 0;
-
- spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, addr, iterator) {
- sptes[iterator.level-1] = *iterator.sptep;
- nr_sptes++;
- if (!is_shadow_present_pte(*iterator.sptep))
- break;
- }
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return nr_sptes;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-
-#ifdef AUDIT
-
-static const char *audit_msg;
-
-static gva_t canonicalize(gva_t gva)
-{
-#ifdef CONFIG_X86_64
- gva = (long long)(gva << 16) >> 16;
-#endif
- return gva;
-}
-
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- }
- }
- return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 ent = pt[i];
-
- if (ent == shadow_trap_nonpresent_pte)
- continue;
-
- va = canonicalize(va);
- if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
- audit_mappings_page(vcpu, ent, va, level - 1);
- else {
- gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
- gfn_t gfn = gpa >> PAGE_SHIFT;
- pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
- hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- continue;
- }
-
- if (is_shadow_present_pte(ent)
- && (ent & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx gpa %llx hpa %llx ent %llx %d\n",
- audit_msg, vcpu->arch.mmu.root_level,
- va, gpa, hpa, ent,
- is_shadow_present_pte(ent));
- else if (ent == shadow_notrap_nonpresent_pte
- && !is_error_hpa(hpa))
- printk(KERN_ERR "audit: (%s) notrap shadow,"
- " valid guest gva %lx\n", audit_msg, va);
- kvm_release_pfn_clean(pfn);
-
- }
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-static int count_rmaps(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_memslots *slots;
- int nmaps = 0;
- int i, j, k, idx;
-
- idx = srcu_read_lock(&kvm->srcu);
- slots = kvm_memslots(kvm);
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
- struct kvm_memory_slot *m = &slots->memslots[i];
- struct kvm_rmap_desc *d;
-
- for (j = 0; j < m->npages; ++j) {
- unsigned long *rmapp = &m->rmap[j];
-
- if (!*rmapp)
- continue;
- if (!(*rmapp & 1)) {
- ++nmaps;
- continue;
- }
- d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
- while (d) {
- for (k = 0; k < RMAP_EXT; ++k)
- if (d->sptes[k])
- ++nmaps;
- else
- break;
- d = d->more;
- }
- }
- }
- srcu_read_unlock(&kvm->srcu, idx);
- return nmaps;
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *rev_sp;
- gfn_t gfn;
-
- if (*sptep & PT_WRITABLE_MASK) {
- rev_sp = page_header(__pa(sptep));
- gfn = rev_sp->gfns[sptep - rev_sp->spt];
-
- if (!gfn_to_memslot(kvm, gfn)) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no memslot for gfn %ld\n",
- audit_msg, gfn);
- printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
- audit_msg, (long int)(sptep - rev_sp->spt),
- rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- rev_sp->role.level);
- if (!*rmapp) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
- audit_msg, *sptep);
- dump_stack();
- }
- }
-
-}
-
-void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- int i;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = pt[i];
-
- if (!(ent & PT_PRESENT_MASK))
- continue;
- if (!(ent & PT_WRITABLE_MASK))
- continue;
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
- }
- return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- check_writable_mappings_rmap(vcpu);
- count_rmaps(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- u64 *spte;
- gfn_t gfn;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
- if (sp->unsync)
- continue;
-
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[gfn - slot->base_gfn];
-
- spte = rmap_next(vcpu->kvm, rmapp, NULL);
- while (spte) {
- if (*spte & PT_WRITABLE_MASK)
- printk(KERN_ERR "%s: (%s) shadow page has "
- "writable mappings: gfn %lx role %x\n",
- __func__, audit_msg, sp->gfn,
- sp->role.word);
- spte = rmap_next(vcpu->kvm, rmapp, spte);
- }
- }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
- int olddbg = dbg;
-
- dbg = 0;
- audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- if (strcmp("pre pte write", audit_msg) != 0)
- audit_mappings(vcpu);
- audit_writable_sptes_have_rmaps(vcpu);
- dbg = olddbg;
-}
-
-#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..830f46145692 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -1,25 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_MMU_H
#define __KVM_X86_MMU_H
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
-#define PT64_PT_BITS 9
-#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
-#define PT32_PT_BITS 10
-#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+extern bool __read_mostly enable_mmio_caching;
#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
-#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
@@ -29,44 +32,294 @@
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
-#define PT32_DIR_PSE36_SIZE 4
-#define PT32_DIR_PSE36_SHIFT 13
-#define PT32_DIR_PSE36_MASK \
- (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
-
-#define PT64_ROOT_LEVEL 4
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
-#define PT_PDPE_LEVEL 3
-#define PT_DIRECTORY_LEVEL 2
-#define PT_PAGE_TABLE_LEVEL 1
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
+
+#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
+
+static __always_inline u64 rsvd_bits(int s, int e)
+{
+ BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
+
+ if (__builtin_constant_p(e))
+ BUILD_BUG_ON(e > 63);
+ else
+ e &= 63;
-#define PFERR_PRESENT_MASK (1U << 0)
-#define PFERR_WRITE_MASK (1U << 1)
-#define PFERR_USER_MASK (1U << 2)
-#define PFERR_RSVD_MASK (1U << 3)
-#define PFERR_FETCH_MASK (1U << 4)
+ if (e < s)
+ return 0;
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+ return ((2ULL << (e - s)) - 1) << s;
+}
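For illustration, a minimal standalone sketch of the mask formula (rsvd_bits_demo is a hypothetical copy for demonstration, not a kernel symbol):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical standalone copy of the rsvd_bits() mask formula. */
static uint64_t rsvd_bits_demo(int s, int e)
{
	if (e < s)
		return 0;
	return ((2ULL << (e - s)) - 1) << s;
}

int main(void)
{
	/* Bits 63:52 set: 0xfff0000000000000, the classic reserved-GPA mask. */
	printf("%#llx\n", (unsigned long long)rsvd_bits_demo(52, 63));
	return 0;
}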
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+static inline gfn_t kvm_mmu_max_gfn(void)
{
- if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
- __kvm_mmu_free_some_pages(vcpu);
+ /*
+ * Note that this uses the host MAXPHYADDR, not the guest's.
+ * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+ * assuming KVM is running on bare metal, guest accesses beyond
+ * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+ * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+ * install a SPTE for such addresses. If KVM is running as a VM
+ * itself, on the other hand, it might see a MAXPHYADDR that is less
+ * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
+ * disallows such SPTEs entirely and simplifies the TDP MMU.
+ */
+ int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
+
+ return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
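For intuition, a hypothetical example (the MAXPHYADDR value of 46 is assumed for illustration, not taken from the patch): with tdp_enabled and kvm_host.maxphyaddr == 46, the computation reduces to:

/* Assumed: host MAXPHYADDR = 46, PAGE_SHIFT = 12. */
unsigned long long demo_max_gfn = (1ULL << (46 - 12)) - 1;	/* 0x3ffffffff */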
+u8 kvm_mmu_get_max_tdp_level(void);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len);
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
- if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ kvm_mmu_free_obsolete_roots(vcpu);
+
+ /*
+	 * Checking root.hpa is sufficient even when KVM has a mirror root.
+ * We can have either:
+ * (1) mirror_root_hpa = INVALID_PAGE, root.hpa = INVALID_PAGE
+ * (2) mirror_root_hpa = root, root.hpa = INVALID_PAGE
+ * (3) mirror_root_hpa = root1, root.hpa = root2
+ * We don't ever have:
+ * mirror_root_hpa = INVALID_PAGE, root.hpa = root
+ */
+ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
}
-static inline int is_present_gpte(unsigned long pte)
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)
+ ? cr3 & X86_CR3_PCID_MASK
+ : 0;
+}
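A small worked example (CR3 value assumed for illustration); architecturally the PCID occupies CR3 bits 11:0, which is what X86_CR3_PCID_MASK selects:

/* With CR4.PCIDE = 1 and CR3 = 0x1234005, the active PCID is 5;
 * with CR4.PCIDE = 0, kvm_get_pcid() returns 0 regardless of CR3.
 */
unsigned long demo_pcid = 0x1234005ul & 0xffful;	/* == 0x5 */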
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LAM))
+ return 0;
+
+ return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57);
+}
+
+static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
+{
+ u64 root_hpa = vcpu->arch.mmu->root.hpa;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
+}
+
+static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ /*
+ * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
+ * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * be stale. Refresh CR0.WP and the metadata on-demand when checking
+ * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
+ * need to refresh nested_mmu, a.k.a. the walker used to translate L2
+ * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ */
+ if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ return;
+
+ __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+}
+
+/*
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
+ */
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ u64 access)
+{
+ /* strip nested paging fault error codes */
+ unsigned int pfec = access;
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
+
+ /*
+ * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
+ * For implicit supervisor accesses, SMAP cannot be overridden.
+ *
+	 * SMAP affects supervisor accesses only; for user accesses, not_smap
+	 * may be either set or clear, with no bearing on the result.
+ *
+ * We put the SMAP checking bit in place of the PFERR_RSVD_MASK bit;
+ * this bit will always be zero in pfec, but it will be one in index
+ * if SMAP checks are being disabled.
+ */
+ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
+ bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
+ int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
+ u32 errcode = PFERR_PRESENT_MASK;
+ bool fault;
+
+ kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+
+ fault = (mmu->permissions[index] >> pte_access) & 1;
+
+ WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK));
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits, there are 16 domains and 2
+ * attribute bits per domain in pkru. pte_pkey is the
+ * index of the protection domain, so pte_pkey * 2 is
+	 * the index of the first bit for the domain.
+ */
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
+
+ pkru_bits &= mmu->pkru_mask >> offset;
+ errcode |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
+
+ return -(u32)fault & errcode;
+}
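To make the index trick concrete, a hypothetical standalone sketch (the demo_* names are not kernel symbols; the bit positions match the W=1/U=2/RSVD=3/F=4 PFERR layout assumed by the code above): pfec is shifted down by one so that W, U, "SMAP disabled", and F form a 4-bit index into the 16-entry permissions table.

#define DEMO_PFERR_WRITE	(1u << 1)
#define DEMO_PFERR_USER		(1u << 2)
#define DEMO_PFERR_RSVD		(1u << 3)	/* repurposed: "SMAP checks disabled" */
#define DEMO_PFERR_FETCH	(1u << 4)

static unsigned demo_perm_index(unsigned pfec, int not_smap)
{
	return (pfec | (not_smap ? DEMO_PFERR_RSVD : 0)) >> 1;
}

/* A user-mode write (pfec = W|U = 0x6) with SMAP active indexes
 * permissions[3]; the same access with SMAP checks disabled would
 * index permissions[7].
 */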
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
+{
+ /*
+ * Read shadow_root_allocated before related pointers. Hence, threads
+ * reading shadow_root_allocated in any lock context are guaranteed to
+ * see the pointers. Pairs with smp_store_release in
+ * mmu_first_shadow_root_alloc.
+ */
+ return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+}
+
+#ifdef CONFIG_X86_64
+extern bool tdp_mmu_enabled;
+#else
+#define tdp_mmu_enabled false
+#endif
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
+
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
+}
+
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+ int level)
+{
+ return gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+}
+
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
+{
+ return __kvm_mmu_slot_lpages(slot, slot->npages, level);
+}
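A worked example with assumed numbers: at the 2M level, KVM_HPAGE_GFN_SHIFT is 9 (512 4K pages per 2M page), so a slot of 0x1000 pages at base_gfn 0x800 spans eight 2M-aligned ranges. The demo_* helper below is illustrative only:

/* Assumed: KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) == 9. */
static unsigned long demo_slot_lpages_2m(unsigned long base_gfn,
					 unsigned long npages)
{
	return ((base_gfn + npages - 1) >> 9) - (base_gfn >> 9) + 1;
}
/* demo_slot_lpages_2m(0x800, 0x1000) == 8 */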
+
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+ atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
+ struct x86_exception *exception);
+
+static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ gpa_t gpa, u64 access,
+ struct x86_exception *exception)
+{
+ if (mmu != &vcpu->arch.nested_mmu)
+ return gpa;
+ return translate_nested_gpa(vcpu, gpa, access, exception);
+}
+
+static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static inline gfn_t kvm_gfn_direct_bits(const struct kvm *kvm)
+{
+ return kvm->arch.gfn_direct_bits;
+}
+
+static inline bool kvm_is_addr_direct(struct kvm *kvm, gpa_t gpa)
{
- return pte & PT_PRESENT_MASK;
+ gpa_t gpa_direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(kvm));
+
+ return !gpa_direct_bits || (gpa & gpa_direct_bits);
}
+static inline bool kvm_is_gfn_alias(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn & kvm_gfn_direct_bits(kvm);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
new file mode 100644
index 000000000000..02c450686b4a
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -0,0 +1,8095 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "irq.h"
+#include "ioapic.h"
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "tdp_mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "smm.h"
+#include "kvm_emulate.h"
+#include "page_track.h"
+#include "cpuid.h"
+#include "spte.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
+#include <linux/kstrtox.h>
+#include <linux/kthread.h>
+#include <linux/wordpart.h>
+
+#include <asm/page.h>
+#include <asm/memtype.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <asm/set_memory.h>
+#include <asm/spec-ctrl.h>
+#include <asm/vmx.h>
+
+#include "trace.h"
+
+static bool nx_hugepage_mitigation_hard_disabled;
+
+int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_period_ms;
+#ifdef CONFIG_PREEMPT_RT
+/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
+static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
+#else
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+#endif
+
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
+
+static const struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = get_nx_huge_pages,
+};
+
+static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
+ .set = set_nx_huge_pages_recovery_param,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_period_ms, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
+
+static bool __read_mostly force_flush_and_sync_on_reuse;
+module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
+
+/*
+ * Setting this variable to true enables Two-Dimensional Paging, where the
+ * hardware walks two page tables:
+ * 1. the guest-virtual to guest-physical translation
+ * 2. while doing 1., the guest-physical to host-physical translation
+ * If the hardware supports that, we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
+static bool __ro_after_init tdp_mmu_allowed;
+
+#ifdef CONFIG_X86_64
+bool __read_mostly tdp_mmu_enabled = true;
+module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(tdp_mmu_enabled);
+#endif
+
+static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
+static int max_tdp_level __read_mostly;
+
+#define PTE_PREFETCH_NUM 8
+
+#include <trace/events/kvm.h>
+
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
+
+/*
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTE needs to be added, a new descriptor is allocated and
+ * becomes the head of the list. This means that, by definition, all tail
+ * descriptors are full.
+ *
+ * Note, the metadata fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count <= 6 (or if only the
+ * descriptor's metadata is accessed).
+ */
+struct pte_list_desc {
+ struct pte_list_desc *more;
+ /* The number of PTEs stored in _this_ descriptor. */
+ u32 spte_count;
+ /* The number of PTEs stored in all tails of this descriptor. */
+ u32 tail_count;
+ u64 *sptes[PTE_LIST_EXT];
+};
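Because all tail descriptors are kept full, the total chain length is recoverable from the head alone; a minimal sketch (demo_count_sptes is illustrative, not a kernel helper):

/* Count the SPTEs in a pte_list chain from its head descriptor. */
static u32 demo_count_sptes(struct pte_list_desc *head)
{
	/* tail_count already sums the (always-full) tail descriptors. */
	return head ? head->spte_count + head->tail_count : 0;
}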
+
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ u64 *sptep;
+ int level;
+ unsigned index;
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
+static struct kmem_cache *pte_list_desc_cache;
+struct kmem_cache *mmu_page_header_cache;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+struct kvm_mmu_role_regs {
+ const unsigned long cr0;
+ const unsigned long cr4;
+ const u64 efer;
+};
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+/*
+ * Yes, lots of underscores. They're a hint that you probably shouldn't be
+ * reading from the role_regs. Once the root_role is constructed, it becomes
+ * the single source of truth for the MMU's state.
+ */
+#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
+static inline bool __maybe_unused \
+____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
+{ \
+ return !!(regs->reg & flag); \
+}
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
+
+/*
+ * The MMU itself (with a valid role) is the single source of truth for the
+ * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
+ * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
+ * and the vCPU may be incorrect/irrelevant.
+ */
+#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
+{ \
+ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
+}
+BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.base.level > 0;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+ return !mmu->cpu_role.base.has_4_byte_gpte;
+}
+
+static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
+ .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
+ .efer = vcpu->arch.efer,
+ };
+
+ return regs;
+}
+
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr3(vcpu);
+}
+
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ return kvm_read_cr3(vcpu);
+
+ return mmu->get_guest_pgd(vcpu);
+}
+
+static inline bool kvm_available_flush_remote_tlbs_range(void)
+{
+#if IS_ENABLED(CONFIG_HYPERV)
+ return kvm_x86_ops.flush_remote_tlbs_range;
+#else
+ return false;
+#endif
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
+
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
+
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+}
+
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned int access)
+{
+ u64 spte = make_mmio_spte(vcpu, gfn, access);
+
+ trace_mark_mmio_spte(sptep, gfn, spte);
+ mmu_spte_set(sptep, spte);
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+ gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return spte & shadow_mmio_access_mask;
+}
+
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
+{
+ u64 kvm_gen, spte_gen, gen;
+
+ gen = kvm_vcpu_memslots(vcpu)->generation;
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return false;
+
+ kvm_gen = gen & MMIO_SPTE_GEN_MASK;
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+#ifdef CONFIG_X86_64
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
+ WRITE_ONCE(*sptep, spte);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
+ WRITE_ONCE(*sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return READ_ONCE(*sptep);
+}
+#else
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
+
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+	 * If we map the spte from nonpresent to present, we should store
+	 * the high bits first, then set the present bit, so the CPU cannot
+	 * fetch this spte while we are setting it.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+	 * the present bit first to prevent the vCPU from fetching the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high;
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea of using this lightweight scheme to read the spte on x86_32 hosts
+ * comes from gup_get_pte (mm/gup.c).
+ *
+ * An spte TLB flush may be pending, because flushes are coalesced and
+ * we are running outside of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
+#endif
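The 32-bit path above is essentially a seqcount-style retry loop keyed on clear_spte_count; a minimal user-space analog (an assumption-laden sketch with the memory barriers elided, not kernel code):

struct demo_split {
	volatile unsigned int gen;	/* stands in for sp->clear_spte_count */
	volatile unsigned int lo, hi;	/* stands in for spte_low/spte_high */
};

static unsigned long long demo_read_split(struct demo_split *s)
{
	unsigned int gen, lo, hi;

	do {
		gen = s->gen;	/* the kernel inserts smp_rmb() between reads */
		lo = s->lo;
		hi = s->hi;
	} while (lo != s->lo || gen != s->gen);	/* retry if a clear raced */

	return ((unsigned long long)hi << 32) | lo;
}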
+
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON_ONCE(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits; this implies that the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte = *sptep;
+
+ WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
+ check_spte_writable_invariants(new_spte);
+
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return false;
+ }
+
+ if (!spte_needs_atomic_update(old_spte))
+ __update_clear_spte_fast(sptep, new_spte);
+ else
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
+
+ WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+ spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
+}
+
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent while tracking the
+ * state bits; it is used to clear the last-level sptep.
+ * Returns the old PTE.
+ */
+static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
+{
+ u64 old_spte = *sptep;
+ int level = sptep_to_sp(sptep)->role.level;
+
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_needs_atomic_update(old_spte))
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
+ else
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
+
+ if (!is_shadow_present_pte(old_spte))
+ return old_spte;
+
+ kvm_update_page_stats(kvm, level, -1);
+ return old_spte;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear the spte without caring about the state bits of the sptep;
+ * it is used to set the upper-level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
+{
+ return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ if (is_tdp_mmu_active(vcpu)) {
+ kvm_tdp_mmu_walk_lockless_begin();
+ } else {
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
+
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ }
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ if (is_tdp_mmu_active(vcpu)) {
+ kvm_tdp_mmu_walk_lockless_end();
+ } else {
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+		 * reads to sptes. If it is, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+ }
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
+{
+ int r;
+
+ /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
+ if (r)
+ return r;
+ if (kvm_has_mirrored_tdp(vcpu->kvm)) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ if (maybe_indirect) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ PT64_ROOT_MAX_LEVEL);
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
+{
+ kmem_cache_free(pte_list_desc_cache, pte_list_desc);
+}
+
+static bool sp_has_gptes(struct kvm_mmu_page *sp);
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (sp->role.passthrough)
+ return sp->gfn;
+
+ if (sp->shadowed_translation)
+ return sp->shadowed_translation[index] >> PAGE_SHIFT;
+
+ return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
+}
+
+/*
+ * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
+ * that the SPTE itself may have more constrained access permissions than
+ * what the guest enforces. For example, a guest may create an executable
+ * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
+ */
+static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
+{
+ if (sp->shadowed_translation)
+ return sp->shadowed_translation[index] & ACC_ALL;
+
+ /*
+ * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
+ * KVM is not shadowing any guest page tables, so the "guest access
+ * permissions" are just ACC_ALL.
+ *
+ * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
+ * is shadowing a guest huge page with small pages, the guest access
+ * permissions being shadowed are the access permissions of the huge
+ * page.
+ *
+ * In both cases, sp->role.access contains the correct access bits.
+ */
+ return sp->role.access;
+}
+
+static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
+ gfn_t gfn, unsigned int access)
+{
+ if (sp->shadowed_translation) {
+ sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
+ return;
+ }
+
+ WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
+ "access mismatch under %s page %llx (expected %u, got %u)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_access(sp, index), access);
+
+ WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
+ "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
+ sp->role.passthrough ? "passthrough" : "direct",
+ sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
+}
+
+static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
+ unsigned int access)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ kvm_mmu_page_set_translation(sp, index, gfn, access);
+}
+
+/*
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
+ */
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ const struct kvm_memory_slot *slot, int level)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.lpage_info[level - 2][idx];
+}
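A worked example with assumed numbers: for a slot at base_gfn 0x10000 and level PG_LEVEL_2M (gfn shift 9), gfn 0x12345 maps to index 0x11 of the 2M metadata array:

/* Assumed: KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) == 9, and PG_LEVEL_2M == 2,
 * so the 2M metadata lives in lpage_info[2 - 2] == lpage_info[0].
 */
unsigned long demo_idx = (0x12345ul >> 9) - (0x10000ul >> 9);	/* == 0x11 */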
+
+/*
+ * The most significant bit in disallow_lpage tracks whether or not memory
+ * attributes are mixed, i.e. not identical for all gfns at the current level.
+ * The lower order bits are used to refcount other cases where a hugepage is
+ * disallowed, e.g. if KVM is shadowing a page table at the gfn.
+ */
+#define KVM_LPAGE_MIXED_FLAG BIT(31)
+
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int old, i;
+
+ for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+
+ old = linfo->disallow_lpage;
+ linfo->disallow_lpage += count;
+ WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages++;
+ /*
+ * Ensure indirect_shadow_pages is elevated prior to re-reading guest
+ * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
+ * emulated writes are visible before re-reading guest PTEs, or that
+ * an emulated write will see the elevated count and acquire mmu_lock
+ * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
+ */
+ smp_mb();
+
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+
+	/* Non-leaf shadow pages are kept read-only. */
+ if (sp->role.level > PG_LEVEL_4K)
+ return __kvm_write_track_add_gfn(kvm, slot, gfn);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
+}
+
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type)
+{
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ ++kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages[mmu_type].pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages--;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (sp->role.level > PG_LEVEL_4K)
+ return __kvm_write_track_remove_gfn(kvm, slot, gfn);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type)
+{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
+ --kvm->stat.nx_lpage_splits;
+ --kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages;
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp, KVM_SHADOW_MMU);
+}
+
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return NULL;
+ if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
+ return NULL;
+
+ return slot;
+}
+
+/*
+ * About rmap_head encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
+ * pte_list_desc containing more mappings.
+ */
+#define KVM_RMAP_MANY BIT(0)
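A sketch of decoding an unlocked rmap value under this encoding; demo_first_spte is illustrative only and assumes the value was read with the lock bit already clear (e.g. via kvm_rmap_get() below):

static u64 *demo_first_spte(unsigned long rmap_val)
{
	struct pte_list_desc *desc;

	if (!rmap_val)
		return NULL;			/* empty rmap */
	if (!(rmap_val & KVM_RMAP_MANY))
		return (u64 *)rmap_val;		/* a single spte, stored inline */
	desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
	return desc->sptes[0];			/* head of a descriptor chain */
}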
+
+/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write. This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected. E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED BIT(1)
+
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Elide the lock if the rmap is empty, as lockless walkers (read-only
+ * mode) don't need to (and can't) walk an empty rmap, nor can they add
+ * entries to the rmap. I.e. the only paths that process empty rmaps
+ * do so while holding mmu_lock for write, and are mutually exclusive.
+ */
+ old_val = atomic_long_read(&rmap_head->val);
+ if (!old_val)
+ return 0;
+
+ do {
+ /*
+ * If the rmap is locked, wait for it to be unlocked before
+		 * trying to acquire the lock, e.g. to avoid bouncing the cache
+ * line.
+ */
+ while (old_val & KVM_RMAP_LOCKED) {
+ cpu_relax();
+ old_val = atomic_long_read(&rmap_head->val);
+ }
+
+ /*
+ * Recheck for an empty rmap, it may have been purged by the
+ * task that held the lock.
+ */
+ if (!old_val)
+ return 0;
+
+ new_val = old_val | KVM_RMAP_LOCKED;
+ /*
+ * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+ * from being reordered outside of the critical section created by
+ * __kvm_rmap_lock().
+ *
+ * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+ *
+ * For the !old_val case, no ordering is needed, as there is no rmap
+ * to walk.
+ */
+ } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+ /*
+ * Return the old value, i.e. _without_ the LOCKED bit set. It's
+ * impossible for the return value to be 0 (see above), i.e. the read-
+ * only unlock flow can't get a false positive and fail to unlock.
+ */
+ return old_val;
+}
+
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+ unsigned long val)
+{
+ KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
+ /*
+ * Ensure that all accesses to the rmap have completed before unlocking
+ * the rmap.
+ *
+ * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
+ */
+ atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ unsigned long new_val)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ __kvm_rmap_unlock(rmap_head, new_val);
+}
+
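+/*
+ * Read the rmap's value with KVM_RMAP_LOCKED masked off, e.g. so that flows
+ * that only need a snapshot of the rmap, such as counting entries or
+ * starting a walk, observe the same value whether or not the rmap is locked.
+ */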
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+ return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
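+ *
+ * Note, preemption must stay disabled for the duration of a read-only lock,
+ * as __kvm_rmap_lock() spin-waits on KVM_RMAP_LOCKED; preempting a lock
+ * holder would stall all other walkers until the holder runs again.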
+ */
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val;
+
+ preempt_disable();
+ rmap_val = __kvm_rmap_lock(rmap_head);
+
+ if (!rmap_val)
+ preempt_enable();
+
+ return rmap_val;
+}
+
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+ unsigned long old_val)
+{
+ if (!old_val)
+ return;
+
+ KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+ __kvm_rmap_unlock(rmap_head, old_val);
+ preempt_enable();
+}
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
+ */
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ u64 *spte, struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+ struct pte_list_desc *desc;
+ int count = 0;
+
+ old_val = kvm_rmap_lock(kvm, rmap_head);
+
+ if (!old_val) {
+ new_val = (unsigned long)spte;
+ } else if (!(old_val & KVM_RMAP_MANY)) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->sptes[0] = (u64 *)old_val;
+ desc->sptes[1] = spte;
+ desc->spte_count = 2;
+ desc->tail_count = 0;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ ++count;
+ } else {
+ desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
+ count = desc->tail_count + desc->spte_count;
+
+ /*
+ * If the previous head is full, allocate a new head descriptor
+ * as tail descriptors are always kept full.
+ */
+ if (desc->spte_count == PTE_LIST_EXT) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
+ desc->spte_count = 0;
+ desc->tail_count = count;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ } else {
+ new_val = old_val;
+ }
+ desc->sptes[desc->spte_count++] = spte;
+ }
+
+ kvm_rmap_unlock(kvm, rmap_head, new_val);
+
+ return count;
+}
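+
+/*
+ * For example, if PTE_LIST_EXT were 14, an rmap holding 30 SPTEs would be
+ * laid out as head(2 used) -> tail(14 used) -> tail(14 used): only the head
+ * descriptor can be partially filled, and the head's tail_count (28 here) is
+ * the total number of SPTEs stored in the tail descriptors.
+ */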
+
+static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
+ struct pte_list_desc *desc, int i)
+{
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
+ int j = head_desc->spte_count - 1;
+
+ /*
+ * The head descriptor should never be empty. A new head is added only
+ * when adding an entry and the previous head is full, and heads are
+ * removed (this flow) when they become empty.
+ */
+ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
+
+ /*
+ * Replace the to-be-freed SPTE with the last valid entry from the head
+ * descriptor to ensure that tail descriptors are full at all times.
+ * Note, this also means that tail_count is stable for each descriptor.
+ */
+ desc->sptes[i] = head_desc->sptes[j];
+ head_desc->sptes[j] = NULL;
+ head_desc->spte_count--;
+ if (head_desc->spte_count)
+ return;
+
+ /*
+ * The head descriptor is empty. If there are no tail descriptors,
+ * nullify the rmap head to mark the list as empty, else point the rmap
+ * head at the next descriptor, i.e. the new head.
+ */
+ if (!head_desc->more)
+ *rmap_val = 0;
+ else
+ *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
+ mmu_free_pte_list_desc(head_desc);
+}
+
+static void pte_list_remove(struct kvm *kvm, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ unsigned long rmap_val;
+ int i;
+
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
+ goto out;
+
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
+ goto out;
+
+ rmap_val = 0;
+ } else {
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+ while (desc) {
+ for (i = 0; i < desc->spte_count; ++i) {
+ if (desc->sptes[i] == spte) {
+ pte_list_desc_remove_entry(kvm, &rmap_val,
+ desc, i);
+ goto out;
+ }
+ }
+ desc = desc->more;
+ }
+
+ KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
+ }
+
+out:
+ kvm_rmap_unlock(kvm, rmap_head, rmap_val);
+}
+
+static void kvm_zap_one_rmap_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, u64 *sptep)
+{
+ mmu_spte_clear_track_bits(kvm, sptep);
+ pte_list_remove(kvm, sptep, rmap_head);
+}
+
+/* Return true if at least one SPTE was zapped, false otherwise */
+static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc, *next;
+ unsigned long rmap_val;
+ int i;
+
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (!rmap_val)
+ return false;
+
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
+ goto out;
+ }
+
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+
+ for (; desc; desc = next) {
+ for (i = 0; i < desc->spte_count; i++)
+ mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+ next = desc->more;
+ mmu_free_pte_list_desc(desc);
+ }
+out:
+ /* The rmap is now empty; unlocking with '0' resets rmap_head. */
+ kvm_rmap_unlock(kvm, rmap_head, 0);
+ return true;
+}
+
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
+ struct pte_list_desc *desc;
+
+ if (!rmap_val)
+ return 0;
+ else if (!(rmap_val & KVM_RMAP_MANY))
+ return 1;
+
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+ return desc->tail_count + desc->spte_count;
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+ const struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = sptep_to_sp(spte);
+ gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
+
+ /*
+ * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
+ * so we have to determine which memslots to use based on context
+ * information in sp->role.
+ */
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+ slot = __gfn_to_memslot(slots, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
+ pte_list_remove(kvm, spte, rmap_head);
+}
+
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap. All fields are private and must not be accessed outside these functions.
+ */
+struct rmap_iterator {
+ /* private fields */
+ struct kvm_rmap_head *head;
+ struct pte_list_desc *desc; /* holds the sptep if not NULL */
+ int pos; /* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function. This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the iterator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
+{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
+
+ if (!rmap_val)
+ return NULL;
+
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ iter->desc = NULL;
+ return (u64 *)rmap_val;
+ }
+
+ iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
+ iter->pos = 0;
+ return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+ if (iter->desc) {
+ if (iter->pos < PTE_LIST_EXT - 1) {
+ ++iter->pos;
+ if (iter->desc->sptes[iter->pos])
+ return iter->desc->sptes[iter->pos];
+ }
+
+ iter->desc = iter->desc->more;
+
+ if (iter->desc) {
+ iter->pos = 0;
+ /* desc->sptes[0] cannot be NULL */
+ return iter->desc->sptes[iter->pos];
+ }
+ }
+
+ return NULL;
+}
+
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
+ _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(_sptep_)))
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+ u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+ if (is_shadow_present_pte(old_spte))
+ rmap_remove(kvm, sptep);
+}
+
+static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(sptep);
+ WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
+
+ drop_spte(kvm, sptep);
+
+ if (flush)
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
+}
+
+/*
+ * Write-protect the specified @sptep; @pt_protect indicates whether the
+ * write-protection is being done to protect the shadow page tables.
+ *
+ * Note: write protection differs between dirty logging and spte protection:
+ * - for dirty logging, the spte can be made writable again at any time if
+ * its dirty bitmap is properly set.
+ * - for spte protection, the spte can be made writable only after the
+ * shadow page has been unsynced.
+ *
+ * Return true if the TLB needs to be flushed.
+ */
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && is_mmu_writable_spte(spte)))
+ return false;
+
+ if (pt_protect)
+ spte &= ~shadow_mmu_writable_mask;
+ spte &= ~PT_WRITABLE_MASK;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ flush |= spte_write_protect(sptep, pt_protect);
+
+ return flush;
+}
+
+static bool spte_clear_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
+ spte &= ~shadow_dirty_mask;
+ return mmu_spte_update(sptep, spte);
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ if (spte_ad_need_write_protect(*sptep))
+ flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ else
+ flush |= spte_clear_dirty(sptep);
+ }
+
+ return flush;
+}
+
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, true);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
+ while (mask) {
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ rmap_write_protect(rmap_head, false);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, false);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
+ while (mask) {
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
+ __rmap_clear_dirty(kvm, rmap_head, slot);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ /*
+ * If the slot was assumed to be "initially all dirty", write-protect
+ * huge pages to ensure they are split to 4KiB on the first write (KVM
+ * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+ * immediately try to split huge pages, e.g. so that vCPUs don't get
+ * saddled with the cost of splitting.
+ *
+ * The gfn_offset is guaranteed to be aligned to 64, but the memslot's
+ * base_gfn has no such restriction, so the range can cross two large
+ * pages.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
+ gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+
+ if (READ_ONCE(eager_page_split))
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
+
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
+
+ /* Cross two large pages? */
+ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
+ ALIGN(end << PAGE_SHIFT, PMD_SIZE))
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
+ PG_LEVEL_2M);
+ }
+
+ /*
+ * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+ * mask. If PML is enabled and the GFN doesn't need to be write-
+ * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+ * Otherwise clear the Writable bit.
+ *
+ * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+ * enabled, but it chooses between clearing the Dirty bit and the
+ * Writable bit based on the context.
+ */
+ if (kvm->arch.cpu_dirty_log_size)
+ kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+int kvm_cpu_dirty_log_size(struct kvm *kvm)
+{
+ return kvm->arch.cpu_dirty_log_size;
+}
+
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level)
+{
+ struct kvm_rmap_head *rmap_head;
+ int i;
+ bool write_protected = false;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = gfn_to_rmap(gfn, i, slot);
+ write_protected |= rmap_write_protect(rmap_head, true);
+ }
+ }
+
+ if (tdp_mmu_enabled)
+ write_protected |=
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
+
+ return write_protected;
+}
+
+static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
+}
+
+static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
+}
+
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ const struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap;
+ int level;
+
+ /* private field. */
+ struct kvm_rmap_head *end_rmap;
+};
+
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+ int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
+}
+
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ const struct kvm_memory_slot *slot,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ while (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
+
+ if (atomic_long_read(&iterator->rmap->val))
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
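+/*
+ * Iterate over every rmap in [_start_gfn, _end_gfn] one level at a time,
+ * visiting all rmaps at a given level before moving up to the next level;
+ * slot_rmap_walk_next() skips rmaps that are empty.
+ */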
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
+/* The return value indicates whether a TLB flush on all vCPUs is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
+
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool can_yield, bool flush_on_yield,
+ bool flush)
+{
+ struct slot_rmap_walk_iterator iterator;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
+
+ if (!can_yield)
+ continue;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+ return flush;
+}
+
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
+{
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ true, flush_on_yield, false);
+}
+
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
+
+static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end, bool can_yield,
+ bool flush)
+{
+ return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, can_yield, true, flush);
+}
+
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool flush = false;
+
+ /*
+ * To prevent races with vCPUs faulting in a gfn using stale data,
+ * zapping a gfn range must be protected by mmu_invalidate_in_progress
+ * (and mmu_invalidate_seq). The only exception is memslot deletion;
+ * in that case, SRCU synchronization ensures that SPTEs are zapped
+ * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the
+ * invalid slot.
+ */
+ lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
+ lockdep_is_held(&kvm->slots_lock));
+
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+ range->start, range->end,
+ range->may_block, flush);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+
+ if (kvm_x86_ops.set_apic_access_page_addr &&
+ range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
+ return flush;
+}
+
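+/*
+ * If a single gfn accumulates more than this many SPTEs in its rmap, zap the
+ * entire chain on the next add rather than letting a heavily aliased gfn
+ * grow its rmap without bound.
+ */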
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void __rmap_add(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+ int rmap_count;
+
+ sp = sptep_to_sp(spte);
+ kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
+ kvm_update_page_stats(kvm, sp->role.level, 1);
+
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+ rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
+
+ if (rmap_count > kvm->stat.max_mmu_rmap_size)
+ kvm->stat.max_mmu_rmap_size = rmap_count;
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
+ kvm_zap_all_rmap_sptes(kvm, rmap_head);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+ }
+}
+
+static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn, unsigned int access)
+{
+ struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
+
+ __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
+}
+
+static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ bool test_only)
+{
+ struct kvm_rmap_head *rmap_head;
+ struct rmap_iterator iter;
+ unsigned long rmap_val;
+ bool young = false;
+ u64 *sptep;
+ gfn_t gfn;
+ int level;
+ u64 spte;
+
+ for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ for (gfn = range->start; gfn < range->end;
+ gfn += KVM_PAGES_PER_HPAGE(level)) {
+ rmap_head = gfn_to_rmap(gfn, level, range->slot);
+ rmap_val = kvm_rmap_lock_readonly(rmap_head);
+
+ for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+ if (!is_accessed_spte(spte))
+ continue;
+
+ if (test_only) {
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ return true;
+ }
+
+ if (spte_ad_enabled(spte))
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ else
+ /*
+ * If the following cmpxchg fails, the
+ * spte is being concurrently modified
+ * and should most likely stay young.
+ */
+ cmpxchg64(sptep, spte,
+ mark_spte_for_access_track(spte));
+ young = true;
+ }
+
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ }
+ }
+ return young;
+}
+
+static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
+}
+
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool young = false;
+
+ if (tdp_mmu_enabled)
+ young = kvm_tdp_mmu_age_gfn_range(kvm, range);
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, false);
+
+ return young;
+}
+
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ bool young = false;
+
+ if (tdp_mmu_enabled)
+ young = kvm_tdp_mmu_test_age_gfn(kvm, range);
+
+ if (young)
+ return young;
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, true);
+
+ return young;
+}
+
+static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
+{
+#ifdef CONFIG_KVM_PROVE_MMU
+ int i;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
+ pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
+ sp->spt[i], &sp->spt[i],
+ kvm_mmu_page_get_gfn(sp, i));
+ }
+#endif
+}
+
+static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm->arch.n_used_mmu_pages++;
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm->arch.n_used_mmu_pages--;
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
+static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
+{
+ kvm_mmu_check_sptes_at_free(sp);
+
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
+ free_page((unsigned long)sp->shadowed_translation);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
+}
+
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ if (!parent_pte)
+ return;
+
+ pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
+}
+
+static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
+}
+
+static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(kvm, sp, parent_pte);
+ mmu_spte_clear_no_track(parent_pte);
+}
+
+static void mark_unsync(u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
+}
+
+static void mark_unsync(u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(spte);
+ if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
+ return;
+ if (sp->unsync_children++)
+ return;
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
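+/*
+ * Add @sp to @pvec, skipping duplicates for unsync pages (which can be
+ * reached via multiple parents). Returns a non-zero value iff the array is
+ * now full, which makes __mmu_unsync_walk() bail with -ENOSPC so the caller
+ * processes the pages gathered so far and then restarts the walk.
+ */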
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
+{
+ int i;
+
+ if (sp->unsync)
+ for (i = 0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON_ONCE((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
+
+ for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
+ struct kvm_mmu_page *child;
+ u64 ent = sp->spt[i];
+
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
+
+ child = spte_to_child_sp(ent);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
+ nr_unsync_leaf += ret;
+ } else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ clear_unsync_child_bit(sp, i);
+ }
+
+ return nr_unsync_leaf;
+}
+
+#define INVALID_INDEX (-1)
+
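+/*
+ * Gather @sp's unsync descendants into @pvec, adding @sp itself first at
+ * INVALID_INDEX. Returns the number of unsync leaf pages gathered, zero if
+ * @sp has no unsync children, or -ENOSPC if @pvec filled up mid-walk.
+ */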
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ pvec->nr = 0;
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
+ return __mmu_unsync_walk(sp, pvec);
+}
+
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON_ONCE(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+static bool sp_has_gptes(struct kvm_mmu_page *sp)
+{
+ if (sp->role.direct)
+ return false;
+
+ if (sp->role.passthrough)
+ return false;
+
+ return true;
+}
+
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * Ensure the load of the hash table pointer itself is ordered before
+ * loads to walk the table. The pointer is set at runtime outside of
+ * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+ * shadow pages becomes necessary only when KVM needs to shadow L1's
+ * TDP for an L2 guest. Pairs with the smp_store_release() in
+ * kvm_mmu_alloc_page_hash().
+ */
+ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (!page_hash)
+ return &empty_page_hash;
+
+ return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
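+/*
+ * Note, the empty "if { } else" in for_each_valid_sp() filters out obsolete
+ * shadow pages while keeping the macro usable as a prefix to a single
+ * statement or block, without introducing a dangling-else hazard.
+ */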
+#define for_each_valid_sp(_kvm, _sp, _list) \
+ hlist_for_each_entry(_sp, _list, hash_link) \
+ if (is_obsolete_sp((_kvm), (_sp))) { \
+ } else
+
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
+ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
+
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return false;
+
+ return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ /* sp->spt[i] still has the initial value from shadow page allocation. */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
+ return 0;
+
+ return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int flush = 0;
+ int i;
+
+ if (!kvm_sync_page_check(vcpu, sp))
+ return -1;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ int ret = kvm_sync_spte(vcpu, sp, i);
+
+ if (ret < -1)
+ return -1;
+ flush |= ret;
+ }
+
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
+ return flush;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int ret = __kvm_sync_page(vcpu, sp);
+
+ if (ret < 0)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return ret;
+}
+
+static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+ struct list_head *invalid_list,
+ bool remote_flush)
+{
+ if (!remote_flush && list_empty(invalid_list))
+ return false;
+
+ if (!list_empty(invalid_list))
+ kvm_mmu_commit_zap_page(kvm, invalid_list);
+ else
+ kvm_flush_remote_tlbs(kvm);
+ return true;
+}
+
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages do not use the MMU generation. */
+ return !is_tdp_mmu_page(sp) &&
+ unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
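+/*
+ * Tracks the chain of parents above the page currently being processed in a
+ * pvec walk: parent[n] is a shadow page on the path from the root, and
+ * idx[n] is the index in parent[n]'s unsync_child_bitmap that leads to the
+ * next lower page, so the bit can be cleared once the child is synced.
+ */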
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
+};
+
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_first(&pvec, &parents); \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
+{
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
+
+ parents->idx[level-1] = idx;
+ if (level == PG_LEVEL_4K)
+ break;
+
+ parents->parent[level-2] = sp;
+ }
+
+ return n;
+}
+
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON_ONCE(level == PG_LEVEL_4K);
+
+ parents->parent[level-2] = sp;
+
+ /*
+ * Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ WARN_ON_ONCE(idx == INVALID_INDEX);
+ clear_unsync_child_bit(sp, idx);
+ level++;
+ } while (!sp->unsync_children);
+}
+
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent, bool can_yield)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ bool protected = false;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
+
+ if (protected) {
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
+ flush = false;
+ }
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
+ mmu_pages_clear_parents(&parents);
+ }
+ if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ if (!can_yield) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ return -EINTR;
+ }
+
+ cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
+ }
+
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ return 0;
+}
+
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ atomic_set(&sp->write_flooding_count, 0);
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ __clear_sp_write_flooding_count(sptep_to_sp(spte));
+}
+
+/*
+ * The vCPU is required when finding indirect shadow pages; the shadow
+ * page may already exist and syncing it needs the vCPU pointer in
+ * order to read guest page tables. Direct shadow pages are never
+ * unsync, thus @vcpu can be NULL if @role.direct is true.
+ */
+static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+ int ret;
+ int collisions = 0;
+ LIST_HEAD(invalid_list);
+
+ for_each_valid_sp(kvm, sp, sp_list) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
+ if (sp->role.word != role.word) {
+ /*
+ * If the guest is creating an upper-level page, zap
+ * unsync pages for the same gfn. While it's possible
+ * the guest is using recursive page tables, in all
+ * likelihood the guest has stopped using the unsync
+ * page and is installing a completely unrelated page.
+ * Unsync pages must not be left as is, because the new
+ * upper-level page will be write-protected.
+ */
+ if (role.level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(kvm, sp,
+ &invalid_list);
+ continue;
+ }
+
+ /* unsync and write-flooding only apply to indirect SPs. */
+ if (sp->role.direct)
+ goto out;
+
+ if (sp->unsync) {
+ if (KVM_BUG_ON(!vcpu, kvm))
+ break;
+
+ /*
+ * The page is good, but is stale. kvm_sync_page does
+ * get the latest guest state, but (unlike mmu_unsync_children)
+ * it doesn't write-protect the page or mark it synchronized!
+ * This way the validity of the mapping is ensured, but the
+ * overhead of write protection is not incurred until the
+ * guest invalidates the TLB mapping. This allows multiple
+ * SPs for a single gfn to be unsync.
+ *
+ * If the sync fails, the page is zapped. If so, break
+ * in order to rebuild it.
+ */
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
+ break;
+
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ if (ret > 0)
+ kvm_flush_remote_tlbs(kvm);
+ }
+
+ __clear_sp_write_flooding_count(sp);
+
+ goto out;
+ }
+
+ sp = NULL;
+ ++kvm->stat.mmu_cache_miss;
+
+out:
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (collisions > kvm->stat.max_mmu_page_hash_collisions)
+ kvm->stat.max_mmu_page_hash_collisions = collisions;
+ return sp;
+}
+
+/* Caches used when allocating a new shadow page. */
+struct shadow_page_caches {
+ struct kvm_mmu_memory_cache *page_header_cache;
+ struct kvm_mmu_memory_cache *shadow_page_cache;
+ struct kvm_mmu_memory_cache *shadowed_info_cache;
+};
+
+static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ struct hlist_head *sp_list,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
+ if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
+ sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
+ /*
+ * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+ * depends on valid pages being added to the head of the list. See
+ * comments in kvm_zap_obsolete_pages().
+ */
+ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ list_add(&sp->link, &kvm->arch.active_mmu_pages);
+ kvm_account_mmu_page(kvm, sp);
+
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link, sp_list);
+ if (sp_has_gptes(sp))
+ account_shadowed(kvm, sp);
+
+ return sp;
+}
+
+/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
+static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
+ struct kvm_vcpu *vcpu,
+ struct shadow_page_caches *caches,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct hlist_head *sp_list;
+ struct kvm_mmu_page *sp;
+ bool created = false;
+
+ /*
+ * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+ * mmu_page_hash must be set prior to creating the first shadow root,
+ * i.e. reaching this point is fully serialized by slots_arch_lock.
+ */
+ BUG_ON(!kvm->arch.mmu_page_hash);
+ sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+
+ sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
+ if (!sp) {
+ created = true;
+ sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
+ }
+
+ trace_kvm_mmu_get_page(sp, created);
+ return sp;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ union kvm_mmu_page_role role)
+{
+ struct shadow_page_caches caches = {
+ .page_header_cache = &vcpu->arch.mmu_page_header_cache,
+ .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
+ .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
+ };
+
+ return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
+}
+
+static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
+ unsigned int access)
+{
+ struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
+ union kvm_mmu_page_role role;
+
+ role = parent_sp->role;
+ role.level--;
+ role.access = access;
+ role.direct = direct;
+ role.passthrough = 0;
+
+ /*
+ * If the guest has 4-byte PTEs then that means it's using 32-bit,
+ * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
+ * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
+ * shadow each guest page table with multiple shadow page tables, which
+ * requires extra bookkeeping in the role.
+ *
+ * Specifically, to shadow the guest's page directory (which covers a
+ * 4GiB address space), KVM uses 4 PAE page directories, each mapping
+ * 1GiB of the address space. @role.quadrant encodes which quarter of
+ * the address space each maps.
+ *
+ * To shadow the guest's page tables (which each map a 4MiB region), KVM
+ * uses 2 PAE page tables, each mapping a 2MiB region. For these,
+ * @role.quadrant encodes which half of the region they map.
+ *
+ * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
+ * consumes bits 29:21. To consume bits 31:30, KVM uses 4 shadow
+ * PDPTEs; those 4 PAE page directories are pre-allocated and their
+ * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes
+ * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume
+ * bit 21 in the PTE (the child here), KVM propagates that bit to the
+ * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE
+ * covers bit 21 (see above), thus the quadrant is calculated from the
+ * _least_ significant bit of the PDE index.
+ */
+ if (role.has_4_byte_gpte) {
+ WARN_ON_ONCE(role.level != PG_LEVEL_4K);
+ role.quadrant = spte_index(sptep) & 1;
+ }
+
+ return role;
+}
+
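+/*
+ * Returns ERR_PTR(-EEXIST) if @sptep already points at a (non-large) child
+ * shadow page, in which case there is nothing for the caller to link.
+ */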
+static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
+ u64 *sptep, gfn_t gfn,
+ bool direct, unsigned int access)
+{
+ union kvm_mmu_page_role role;
+
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ return ERR_PTR(-EEXIST);
+
+ role = kvm_mmu_child_role(sptep, direct, access);
+ return kvm_mmu_get_shadow_page(vcpu, gfn, role);
+}
+
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = root;
+ iterator->level = vcpu->arch.mmu->root_role.level;
+
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu->root_role.direct)
+ iterator->level = PT32E_ROOT_LEVEL;
+
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu->root.hpa);
+
+ iterator->shadow_addr
+ = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
+ }
+}
+
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
+ addr);
+}
+
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PG_LEVEL_4K)
+ return false;
+
+ iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
+{
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
+ iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
+ --iterator->level;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ __shadow_walk_next(iterator, *iterator->sptep);
+}
+
+static void __link_shadow_page(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache, u64 *sptep,
+ struct kvm_mmu_page *sp, bool flush)
+{
+ u64 spte;
+
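+ /*
+ * The shadow MMU applies PT_WRITABLE_MASK to SPTEs regardless of whether
+ * they are EPT entries, which is valid only because the EPT and legacy
+ * writable bits are identical; assert that at build time.
+ */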
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
+ /*
+ * If an SPTE is present already, it must be a leaf and therefore
+ * a large one. Drop it, and flush the TLB if needed, before
+ * installing sp.
+ */
+ if (is_shadow_present_pte(*sptep))
+ drop_large_spte(kvm, sptep, flush);
+
+ spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
+
+ mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(kvm, cache, sp, sptep);
+
+ /*
+ * The non-direct sub-pagetable must be updated before linking. For
+ * L1 sp, the pagetable is updated via kvm_sync_page() in
+ * kvm_mmu_find_shadow_page() without write-protecting the gfn,
+ * so sp->unsync can be true or false. For higher level non-direct
+ * sp, the pagetable is updated/synced via mmu_sync_children() in
+ * FNAME(fetch)(), so sp->unsync_children can only be false.
+ * WARN_ON_ONCE() if anything happens unexpectedly.
+ */
+ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
+ mark_unsync(sptep);
+}
+
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
+{
+ __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For a direct sp, if the guest pte's dirty bit changed
+ * from clean to dirty, it would corrupt the sp's access:
+ * writes would be allowed through a read-only sp. Update
+ * the spte at this point to get a new sp with the correct
+ * access.
+ */
+ child = spte_to_child_sp(*sptep);
+ if (child->role.access == direct_access)
+ return;
+
+ drop_parent_pte(vcpu->kvm, child, sptep);
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
+ }
+}
+
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte, struct list_head *invalid_list)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level)) {
+ drop_spte(kvm, spte);
+ } else {
+ child = spte_to_child_sp(pte);
+ drop_parent_pte(kvm, child, spte);
+
+ /*
+ * Recursively zap nested TDP SPs; parentless SPs are
+ * unlikely to be used again in the near future. This
+ * avoids retaining a large number of stale nested SPs.
+ */
+ if (tdp_enabled && invalid_list &&
+ child->role.guest_mode &&
+ !atomic_long_read(&child->parent_ptes.val))
+ return kvm_mmu_prepare_zap_page(kvm, child,
+ invalid_list);
+ }
+ } else if (is_mmio_spte(kvm, pte)) {
+ mmu_spte_clear_no_track(spte);
+ }
+ return 0;
+}
+
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int zapped = 0;
+ unsigned i;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
+ zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+ return zapped;
+}
+
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
+ drop_parent_pte(kvm, sp, sptep);
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
+{
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ if (parent->role.level == PG_LEVEL_4K)
+ return 0;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ mmu_pages_clear_parents(&parents);
+ zapped++;
+ }
+ }
+
+ return zapped;
+}
+
+static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list,
+ int *nr_zapped)
+{
+ bool list_unstable, zapped_root = false;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
+ kvm_mmu_unlink_parents(kvm, sp);
+
+ /* Zapping children means active_mmu_pages has become unstable. */
+ list_unstable = *nr_zapped;
+
+ if (!sp->role.invalid && sp_has_gptes(sp))
+ unaccount_shadowed(kvm, sp);
+
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
+ if (!sp->root_count) {
+ /* Count self */
+ (*nr_zapped)++;
+
+ /*
+ * Already invalid pages (previously active roots) are not on
+ * the active page list. See list_del() in the "else" case of
+ * !sp->root_count.
+ */
+ if (sp->role.invalid)
+ list_add(&sp->link, invalid_list);
+ else
+ list_move(&sp->link, invalid_list);
+ kvm_unaccount_mmu_page(kvm, sp);
+ } else {
+ /*
+ * Remove the active root from the active page list, the root
+ * will be explicitly freed when the root_count hits zero.
+ */
+ list_del(&sp->link);
+
+ /*
+ * Obsolete pages cannot be used on any vCPUs, see the comment
+ * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
+ * treats invalid shadow pages as being obsolete.
+ */
+ zapped_root = !is_obsolete_sp(kvm, sp);
+ }
+
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
+
+ sp->role.invalid = 1;
+
+ /*
+ * Make the request to free obsolete roots after marking the root
+ * invalid, otherwise other vCPUs may not see it as invalid.
+ */
+ if (zapped_root)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
+ return list_unstable;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int nr_zapped;
+
+ __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
+ return nr_zapped;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp, *nsp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ /*
+ * We need to make sure everyone sees our modifications to
+ * the page tables and sees changes to vcpu->mode here. The barrier
+ * in the kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
+ WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_shadow_page(sp);
+ }
+}
+
+static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+ unsigned long nr_to_zap)
+{
+ unsigned long total_zapped = 0;
+ struct kvm_mmu_page *sp, *tmp;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+ int nr_zapped;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return 0;
+
+restart:
+ list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ /*
+ * Don't zap active root pages, the page itself can't be freed
+ * and zapping it will just force vCPUs to realloc and reload.
+ */
+ if (sp->root_count)
+ continue;
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+ &nr_zapped);
+ total_zapped += nr_zapped;
+ if (total_zapped >= nr_to_zap)
+ break;
+
+ if (unstable)
+ goto restart;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ kvm->stat.mmu_recycled += total_zapped;
+ return total_zapped;
+}
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
+}
+
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
+
+ if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
+
+ /*
+ * Note, this check is intentionally soft, it only guarantees that one
+ * page is available, while the caller may end up allocating as many as
+ * four pages, e.g. for PAE roots or for 5-level paging. Temporarily
+ * exceeding the (arbitrary by default) limit will not harm the host,
+ * being too aggressive may unnecessarily kill the guest, and getting an
+ * exact count is far more trouble than it's worth, especially in the
+ * page fault paths.
+ */
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
+
+/*
+ * Change the number of mmu pages allocated to the VM.
+ * Note: if goal_nr_mmu_pages is too small, a deadlock can occur.
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
+{
+ write_lock(&kvm->mmu_lock);
+
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+ goal_nr_mmu_pages);
+
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ }
+
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry)
+{
+ struct kvm *kvm = vcpu->kvm;
+ LIST_HEAD(invalid_list);
+ struct kvm_mmu_page *sp;
+ gpa_t gpa = cr2_or_gpa;
+ bool r = false;
+
+ /*
+ * Bail early if there aren't any write-protected shadow pages to avoid
+ * unnecessarily taking mmu_lock, e.g. if the gfn is write-tracked
+ * by a third party. Reading indirect_shadow_pages without holding
+ * mmu_lock is safe, as this is purely an optimization, i.e. a false
+ * positive is benign, and a false negative will simply result in KVM
+ * skipping the unprotect+retry path, which is also an optimization.
+ */
+ if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+ goto out;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+ if (gpa == INVALID_GPA)
+ goto out;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+
+ /*
+ * Snapshot the result before zapping, as zapping will remove all list
+ * entries, i.e. checking the list later would yield a false negative.
+ */
+ r = !list_empty(&invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+
+out:
+ if (r || always_retry) {
+ vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+ }
+ return r;
+}
+
+static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ trace_kvm_mmu_unsync_page(sp);
+ ++kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+/*
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn,
+ * for which KVM is creating a writable mapping. Returns 0 if all pages
+ * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
+ * be write-protected.
+ */
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool synchronizing, bool prefetch)
+{
+ struct kvm_mmu_page *sp;
+ bool locked = false;
+
+ /*
+ * Force write-protection if the page is being tracked. Note, the page
+ * track machinery is used to write-protect upper-level shadow pages,
+ * i.e. this guards the role.level == 4K assertion below!
+ */
+ if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
+ return -EPERM;
+
+ /*
+ * The page is not write-tracked, mark existing shadow pages unsync
+ * unless KVM is synchronizing an unsync SP. In that case, KVM must
+ * complete emulation of the guest TLB flush before allowing shadow
+ * pages to become unsync (writable by the guest).
+ */
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
+ if (synchronizing)
+ return -EPERM;
+
+ if (sp->unsync)
+ continue;
+
+ if (prefetch)
+ return -EEXIST;
+
+ /*
+ * TDP MMU page faults require an additional spinlock as they
+ * run with mmu_lock held for read, not write, and the unsync
+ * logic is not thread safe. Take the spinlock regardless of
+ * the MMU type to avoid extra conditionals/parameters, there's
+ * no meaningful penalty if mmu_lock is held for write.
+ */
+ if (!locked) {
+ locked = true;
+ spin_lock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * Recheck after taking the spinlock, a different vCPU
+ * may have since marked the page unsync. A false
+ * negative on the unprotected check above is not
+ * possible as clearing sp->unsync _must_ hold mmu_lock
+ * for write, i.e. unsync cannot transition from 1->0
+ * while this CPU holds mmu_lock for read (or write).
+ */
+ if (READ_ONCE(sp->unsync))
+ continue;
+ }
+
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
+ kvm_unsync_page(kvm, sp);
+ }
+ if (locked)
+ spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 Walking of unsync pages sees sp->unsync is
+ * false and skips the page.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. It pairs with the read barrier
+ * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
+ */
+ smp_wmb();
+
+ return 0;
+}
+
+static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned int pte_access, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ int level = sp->role.level;
+ int was_rmapped = 0;
+ int ret = RET_PF_FIXED;
+ bool flush = false;
+ bool wrprot;
+ u64 spte;
+
+ /* Prefetching always gets a writable pfn. */
+ bool host_writable = !fault || fault->map_writable;
+ bool prefetch = !fault || fault->prefetch;
+ bool write_fault = fault && fault->write;
+
+ if (unlikely(is_noslot_pfn(pfn))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ mark_mmio_spte(vcpu, sptep, gfn, pte_access);
+ return RET_PF_EMULATE;
+ }
+
+ if (is_shadow_present_pte(*sptep)) {
+ if (prefetch && is_last_spte(*sptep, level) &&
+ pfn == spte_to_pfn(*sptep))
+ return RET_PF_SPURIOUS;
+
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *sptep;
+
+ child = spte_to_child_sp(pte);
+ drop_parent_pte(vcpu->kvm, child, sptep);
+ flush = true;
+ } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
+ drop_spte(vcpu->kvm, sptep);
+ flush = true;
+ } else
+ was_rmapped = 1;
+ }
+
+ wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
+ false, host_writable, &spte);
+
+ if (*sptep == spte) {
+ ret = RET_PF_SPURIOUS;
+ } else {
+ flush |= mmu_spte_update(sptep, spte);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ }
+
+ if (wrprot && write_fault)
+ ret = RET_PF_WRITE_PROTECTED;
+
+ if (flush)
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
+
+ if (!was_rmapped) {
+ WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
+ rmap_add(vcpu, slot, sptep, gfn, pte_access);
+ } else {
+ /* Already rmapped but the pte_access bits may have changed. */
+ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
+ }
+
+ return ret;
+}
+
+static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep,
+ int nr_pages, unsigned int access)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
+ int i;
+
+ if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM))
+ return false;
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
+ return false;
+
+ nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages);
+ if (nr_pages <= 0)
+ return false;
+
+ for (i = 0; i < nr_pages; i++, gfn++, sptep++) {
+ mmu_set_spte(vcpu, slot, sptep, access, gfn,
+ page_to_pfn(pages[i]), NULL);
+
+ /*
+ * KVM always prefetches writable pages from the primary MMU,
+ * and KVM can make its SPTE writable in the fast page fault handler,
+ * without notifying the primary MMU. Mark pages/folios dirty
+ * now to ensure file data is written back if it ends up being
+ * written by the guest. Because KVM's prefetching GUPs
+ * writable PTEs, the probability of unnecessary writeback is
+ * extremely low.
+ */
+ kvm_release_page_dirty(pages[i]);
+ }
+
+ return true;
+}
+
+static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
+ unsigned int access = sp->role.access;
+
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access);
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON_ONCE(!sp->role.direct);
+
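+	/*
+	 * Align the start of the window down to PTE_PREFETCH_NUM (8), e.g. a
+	 * faulting SPTE at index 13 yields the window of indices 8..15.
+	 */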
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
+ if (!start)
+ continue;
+ if (!direct_pte_prefetch_many(vcpu, sp, start, spte))
+ return;
+
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+ if (start)
+ direct_pte_prefetch_many(vcpu, sp, start, spte);
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = sptep_to_sp(sptep);
+
+ /*
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched ones, so disable pte
+ * prefetch if accessed bits aren't available.
+ */
+ if (sp_ad_disabled(sp))
+ return;
+
+ if (sp->role.level > PG_LEVEL_4K)
+ return;
+
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
+/*
+ * Lookup the mapping level for @gfn in the current mm.
+ *
+ * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
+ * consumer to be tied into KVM's handlers for MMU notifier events!
+ *
+ * There are several ways to safely use this helper:
+ *
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
+ * consuming it. In this case, mmu_lock doesn't need to be held during the
+ * lookup, but it does need to be held while checking the MMU notifier.
+ *
+ * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
+ * event for the hva. This can be done by explicitly checking the MMU notifier
+ * or by ensuring that KVM already has a valid mapping that covers the hva.
+ *
+ * - Do not use the result to install new mappings, e.g. use the host mapping
+ * level only to decide whether or not to zap an entry. In this case, it's
+ * not required to hold mmu_lock (though it's highly likely the caller will
+ * want to hold mmu_lock anyway, e.g. to modify SPTEs).
+ *
+ * Note! The lookup can still race with modifications to host page tables, but
+ * the above "rules" ensure KVM will not _consume_ the result of the walk if a
+ * race with the primary MMU occurs.
+ */
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
+ const struct kvm_memory_slot *slot)
+{
+ int level = PG_LEVEL_4K;
+ unsigned long hva;
+ unsigned long flags;
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
+
+ /*
+ * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
+ * is not solely for performance, it's also necessary to avoid the
+ * "writable" check in __gfn_to_hva_many(), which will always fail on
+ * read-only memslots due to gfn_to_hva() assuming writes. Earlier
+ * page fault steps have already verified the guest isn't writing a
+ * read-only memslot.
+ */
+ hva = __gfn_to_hva_memslot(slot, gfn);
+
+ /*
+ * Disable IRQs to prevent concurrent tear down of host page tables,
+ * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+ * the original page table.
+ */
+ local_irq_save(flags);
+
+ /*
+ * Read each entry once. As above, a non-leaf entry can be promoted to
+ * a huge page _during_ this walk. Re-reading the entry could send the
+ * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
+ * value) and then p*d_offset() walks into the target huge page instead
+ * of the old page table (sees the new value).
+ */
+ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
+ if (pgd_none(pgd))
+ goto out;
+
+ p4d = READ_ONCE(*p4d_offset(&pgd, hva));
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ pud = READ_ONCE(*pud_offset(&p4d, hva));
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+
+ if (pud_leaf(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmd = READ_ONCE(*pmd_offset(&pud, hva));
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_leaf(pmd))
+ level = PG_LEVEL_2M;
+
+out:
+ local_irq_restore(flags);
+ return level;
+}
+
+static u8 kvm_max_level_for_order(int order)
+{
+ BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+ KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
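+	/*
+	 * On x86, KVM_HPAGE_GFN_SHIFT() is 9 for PG_LEVEL_2M and 18 for
+	 * PG_LEVEL_1G, so e.g. an order-9 (2MiB) allocation maps at 2M while
+	 * anything smaller falls back to 4K.
+	 */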
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+ return PG_LEVEL_1G;
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool is_private)
+{
+ u8 max_level, coco_level;
+ kvm_pfn_t pfn;
+
+ /* For faults, use the gmem information that was resolved earlier. */
+ if (fault) {
+ pfn = fault->pfn;
+ max_level = fault->max_level;
+ } else {
+ /* TODO: Call into guest_memfd once hugepages are supported. */
+ WARN_ONCE(1, "Get pfn+order from guest_memfd");
+ pfn = KVM_PFN_ERR_FAULT;
+ max_level = PG_LEVEL_4K;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return max_level;
+
+ /*
+ * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
+ * restrictions. A return of '0' means "no additional restrictions", to
+ * allow for using an optional "ret0" static call.
+ */
+ coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private);
+ if (coco_level)
+ max_level = min(max_level, coco_level);
+
+ return max_level;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ struct kvm_lpage_info *linfo;
+ int host_level, max_level;
+ bool is_private;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (fault) {
+ max_level = fault->max_level;
+ is_private = fault->is_private;
+ } else {
+ max_level = PG_LEVEL_NUM;
+ is_private = kvm_mem_is_private(kvm, gfn);
+ }
+
+ max_level = min(max_level, max_huge_page_level);
+ for ( ; max_level > PG_LEVEL_4K; max_level--) {
+ linfo = lpage_info_slot(gfn, slot, max_level);
+ if (!linfo->disallow_lpage)
+ break;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ if (is_private || kvm_memslot_is_gmem_only(slot))
+ host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn,
+ is_private);
+ else
+ host_level = host_pfn_mapping_level(kvm, gfn, slot);
+ return min(host_level, max_level);
+}
+
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ kvm_pfn_t mask;
+
+ fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
+
+ if (unlikely(fault->max_level == PG_LEVEL_4K))
+ return;
+
+ if (is_error_noslot_pfn(fault->pfn))
+ return;
+
+ if (kvm_slot_dirty_track_enabled(slot))
+ return;
+
+ /*
+ * Enforce the iTLB multihit workaround after capturing the requested
+ * level, which will be used to do precise, accurate accounting.
+ */
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
+ fault->slot, fault->gfn);
+ if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
+ return;
+
+ /*
+ * mmu_invalidate_retry() was successful and mmu_lock is held, so
+ * the pmd can't be split from under us.
+ */
+ fault->goal_level = fault->req_level;
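+	/*
+	 * E.g. for a 2M goal level, KVM_PAGES_PER_HPAGE() is 512, so the mask
+	 * below is 511 and clearing those pfn bits aligns the pfn to a 2M
+	 * frame.
+	 */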
+ mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
+ VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
+ fault->pfn &= ~mask;
+}
+
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
+{
+ if (cur_level > PG_LEVEL_4K &&
+ cur_level == fault->goal_level &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch),
+ * direct_map(), or kvm_tdp_mmu_map() would like to create a
+ * large PTE instead: just force them to go down another level,
+		 * patching the next 9 bits of the address back into the
+		 * pfn.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
+ KVM_PAGES_PER_HPAGE(cur_level - 1);
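+		/*
+		 * E.g. when dropping from 1G to 2M, page_mask covers gfn bits
+		 * 17:9, exactly the 9 bits that select a 2M region within the
+		 * 1G region.
+		 */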
+ fault->pfn |= fault->gfn & page_mask;
+ fault->goal_level--;
+ }
+}
+
+static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_shadow_walk_iterator it;
+ struct kvm_mmu_page *sp;
+ int ret;
+ gfn_t base_gfn = fault->gfn;
+
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+ for_each_shadow_entry(vcpu, fault->addr, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
+
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
+ if (it.level == fault->goal_level)
+ break;
+
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
+ }
+
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ base_gfn, fault->pfn, fault);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
+ direct_pte_prefetch(vcpu, it.sptep);
+ return ret;
+}
+
+static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ unsigned long hva = gfn_to_hva_memslot(slot, gfn);
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
+}
+
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ if (is_sigpending_pfn(fault->pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
+ /*
+	 * Do not cache the MMIO info caused by writing a read-only gfn
+	 * into the SPTE, otherwise a read access to the read-only gfn
+	 * would also cause an MMIO page fault and be treated as MMIO.
+ */
+ if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
+ return RET_PF_EMULATE;
+
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->slot, fault->gfn);
+ return RET_PF_RETRY;
+ }
+
+ return -EFAULT;
+}
+
+static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
+
+ if (fault->is_private) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
+ access & shadow_mmio_access_mask);
+
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!enable_mmio_caching))
+ return RET_PF_EMULATE;
+
+ /*
+ * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
+ * any guest that generates such gfns is running nested and is being
+ * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
+ * only if L1's MAXPHYADDR is inaccurate with respect to the
+ * hardware's).
+ */
+ if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
+
+ return RET_PF_CONTINUE;
+}
+
+static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
+{
+ /*
+ * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
+ * reach the common page fault handler if the SPTE has an invalid MMIO
+ * generation number. Refreshing the MMIO generation needs to go down
+ * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
+ */
+ if (fault->rsvd)
+ return false;
+
+ /*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ * in this case.
+ *
+ * Note that the kvm_mem_is_private() check might race with an
+ * attribute update, but this will either result in the guest spinning
+ * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ * case might go down the slow path. Either case will resolve itself.
+ */
+ if (kvm->arch.has_private_mem &&
+ fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ return false;
+
+ /*
+ * #PF can be fast if:
+ *
+ * 1. The shadow page table entry is not present and A/D bits are
+ * disabled _by KVM_, which could mean that the fault is potentially
+ * caused by access tracking (if enabled). If A/D bits are enabled
+ * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
+ * bits for L2 and employ access tracking, but the fast page fault
+ * mechanism only supports direct MMUs.
+ * 2. The shadow page table entry is present, the access is a write,
+ * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
+ * the fault was caused by a write-protection violation. If the
+ * SPTE is MMU-writable (determined later), the fault can be fixed
+ * by setting the Writable bit, which can be done out of mmu_lock.
+ */
+ if (!fault->present)
+ return !kvm_ad_enabled;
+
+ /*
+ * Note, instruction fetches and writes are mutually exclusive, ignore
+ * the "exec" flag.
+ */
+ return fault->write;
+}
+
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, u64 new_spte)
+{
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+	 * enabled, so we do not do this. This might result in the same GPA
+	 * being logged in the PML buffer again when the write really happens,
+	 * and mark_page_dirty() eventually being called twice for it, but that
+	 * does no harm. This also avoids the TLB flush needed after setting
+	 * the dirty bit, so non-PML cases won't be impacted.
+ *
+ * Compare with make_spte() where instead shadow_dirty_mask is set.
+ */
+ if (!try_cmpxchg64(sptep, &old_spte, new_spte))
+ return false;
+
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
+ mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
+
+ return true;
+}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between walk_shadow_page_lockless_{begin,end}.
+ * - The returned sptep must not be used after walk_shadow_page_lockless_end.
+ */
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 old_spte;
+ u64 *sptep = NULL;
+
+ for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+ sptep = iterator.sptep;
+ *spte = old_spte;
+ }
+
+ return sptep;
+}
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp;
+ int ret = RET_PF_INVALID;
+ u64 spte;
+ u64 *sptep;
+ uint retry_count = 0;
+
+ if (!page_fault_can_be_fast(vcpu->kvm, fault))
+ return ret;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ do {
+ u64 new_spte;
+
+ if (tdp_mmu_enabled)
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
+ else
+ sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+
+ /*
+ * It's entirely possible for the mapping to have been zapped
+ * by a different task, but the root page should always be
+ * available as the vCPU holds a reference to its root(s).
+ */
+ if (WARN_ON_ONCE(!sptep))
+ spte = FROZEN_SPTE;
+
+ if (!is_shadow_present_pte(spte))
+ break;
+
+ sp = sptep_to_sp(sptep);
+ if (!is_last_spte(spte, sp->role.level))
+ break;
+
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+	 * then this is a spurious fault caused by a lazily flushed TLB,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_access_allowed(fault, spte)) {
+ ret = RET_PF_SPURIOUS;
+ break;
+ }
+
+ new_spte = spte;
+
+ /*
+ * KVM only supports fixing page faults outside of MMU lock for
+ * direct MMUs, nested MMUs are always indirect, and KVM always
+ * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
+ * enabled, the SPTE can't be an access-tracked SPTE.
+ */
+ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte) |
+ shadow_accessed_mask;
+
+ /*
+ * To keep things simple, only SPTEs that are MMU-writable can
+ * be made fully writable outside of mmu_lock, e.g. only SPTEs
+ * that were write-protected for dirty-logging or access
+ * tracking are handled here. Don't bother checking if the
+ * SPTE is writable to prioritize running with A/D bits enabled.
+ * The is_access_allowed() check above handles the common case
+ * of the fault being spurious, and the SPTE is known to be
+ * shadow-present, i.e. except for access tracking restoration
+ * making the new SPTE writable, the check is wasteful.
+ */
+ if (fault->write && is_mmu_writable_spte(spte)) {
+ new_spte |= PT_WRITABLE_MASK;
+
+ /*
+ * Do not fix write-permission on the large spte when
+			 * dirty logging is enabled. Since only the first
+			 * page is marked in the dirty bitmap by
+			 * fast_pf_fix_direct_spte(), the other pages would
+			 * be missed if the slot has dirty logging enabled.
+ *
+ * Instead, we let the slow page fault path create a
+ * normal spte to fix the access.
+ */
+ if (sp->role.level > PG_LEVEL_4K &&
+ kvm_slot_dirty_track_enabled(fault->slot))
+ break;
+ }
+
+ /* Verify that the fault can be handled in the fast path */
+ if (new_spte == spte ||
+ !is_access_allowed(fault, new_spte))
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virt/kvm/locking.rst to get more detail.
+ */
+ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
+ ret = RET_PF_FIXED;
+ break;
+ }
+
+ if (++retry_count > 4) {
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ } while (true);
+
+ trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
+ walk_shadow_page_lockless_end(vcpu);
+
+ if (ret != RET_PF_INVALID)
+ vcpu->stat.pf_fast++;
+
+ return ret;
+}
+
+static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(*root_hpa))
+ return;
+
+ sp = root_to_sp(*root_hpa);
+ if (WARN_ON_ONCE(!sp))
+ return;
+
+ if (is_tdp_mmu_page(sp)) {
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ kvm_tdp_mmu_put_root(kvm, sp);
+ } else {
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
+
+ *root_hpa = INVALID_PAGE;
+}
+
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
+ ulong roots_to_free)
+{
+ bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
+ int i;
+ LIST_HEAD(invalid_list);
+ bool free_active_root;
+
+ WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
+ && VALID_PAGE(mmu->root.hpa);
+
+ if (!free_active_root) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
+
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
+ /* Nothing to cleanup for dummy roots. */
+ } else if (root_to_sp(mmu->root.hpa)) {
+ mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
+ } else if (mmu->pae_root) {
+ for (i = 0; i < 4; ++i) {
+ if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
+ continue;
+
+ mmu_free_root_page(kvm, &mmu->pae_root[i],
+ &invalid_list);
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ }
+ }
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
+ }
+
+ if (is_tdp_mmu) {
+ read_unlock(&kvm->mmu_lock);
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ } else {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_roots);
+
+void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ struct kvm_mmu_page *sp;
+ hpa_t root_hpa;
+ int i;
+
+ /*
+	 * This should not be called while L2 is active; L2 can't invalidate
+ * _only_ its own roots, e.g. INVVPID unconditionally exits.
+ */
+ WARN_ON_ONCE(mmu->root_role.guest_mode);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ root_hpa = mmu->prev_roots[i].hpa;
+ if (!VALID_PAGE(root_hpa))
+ continue;
+
+ sp = root_to_sp(root_hpa);
+ if (!sp || sp->role.guest_mode)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_guest_mode_roots);
+
+static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
+ u8 level)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu_page *sp;
+
+ role.level = level;
+ role.quadrant = quadrant;
+
+ WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
+ WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
+
+ sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
+ ++sp->root_count;
+
+ return __pa(sp->spt);
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u8 shadow_root_level = mmu->root_role.level;
+ hpa_t root;
+ unsigned i;
+ int r;
+
+ if (tdp_mmu_enabled) {
+ if (kvm_has_mirrored_tdp(vcpu->kvm) &&
+ !VALID_PAGE(mmu->mirror_root_hpa))
+ kvm_tdp_mmu_alloc_root(vcpu, true);
+ kvm_tdp_mmu_alloc_root(vcpu, false);
+ return 0;
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
+ mmu->root.hpa = root;
+ } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
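+			/*
+			 * Each PAE page directory covers 1GiB of guest
+			 * physical address space, so the base gfn for entry i
+			 * is i << (30 - PAGE_SHIFT), e.g. gfn 0x40000 for
+			 * i == 1 with 4K pages.
+			 */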
+ root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
+ PT32_ROOT_LEVEL);
+ mmu->pae_root[i] = root | PT_PRESENT_MASK |
+ shadow_me_value;
+ }
+ mmu->root.hpa = __pa(mmu->pae_root);
+ } else {
+ WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /* root.pgd is ignored for direct MMUs. */
+ mmu->root.pgd = 0;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ struct hlist_head *h;
+
+ if (kvm->arch.mmu_page_hash)
+ return 0;
+
+ h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ /*
+ * Ensure the hash table pointer is set only after all stores to zero
+ * the memory are retired. Pairs with the smp_load_acquire() in
+ * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
+ * add (or remove) shadow pages, and so readers are guaranteed to see
+ * an empty list for their current mmu_lock critical section.
+ */
+ smp_store_release(&kvm->arch.mmu_page_hash, h);
+ return 0;
+}
+
+static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ /*
+ * Check if this is the first shadow root being allocated before
+ * taking the lock.
+ */
+ if (kvm_shadow_root_allocated(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /* Recheck, under the lock, whether this is the first shadow root. */
+ if (kvm_shadow_root_allocated(kvm))
+ goto out_unlock;
+
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ goto out_unlock;
+
+ /*
+ * Check if memslot metadata actually needs to be allocated, e.g. all
+ * metadata will be allocated upfront if TDP is disabled.
+ */
+ if (kvm_memslots_have_rmaps(kvm) &&
+ kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Both of these functions are no-ops if the target is
+ * already allocated, so unconditionally calling both
+ * is safe. Intentionally do NOT free allocations on
+ * failure to avoid having to track which allocations
+ * were made now versus when the memslot was created.
+ * The metadata is guaranteed to be freed when the slot
+ * is freed, and will be kept/used if userspace retries
+ * KVM_RUN instead of killing the VM.
+ */
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r)
+ goto out_unlock;
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Ensure that shadow_root_allocated becomes true strictly after
+ * all the related pointers are set.
+ */
+out_success:
+ smp_store_release(&kvm->arch.shadow_root_allocated, true);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 pdptrs[4], pm_mask;
+ gfn_t root_gfn, root_pgd;
+ int quadrant, i, r;
+ hpa_t root;
+
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
+ mmu->root.hpa = kvm_mmu_get_dummy_root();
+ return 0;
+ }
+
+ /*
+ * On SVM, reading PDPTRs might access guest memory, which might fault
+ * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
+ */
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ pdptrs[i] = mmu->get_pdptr(vcpu, i);
+ if (!(pdptrs[i] & PT_PRESENT_MASK))
+ continue;
+
+ if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ pdptrs[i] = 0;
+ }
+ }
+
+ r = mmu_first_shadow_root_alloc(vcpu->kvm);
+ if (r)
+ return r;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ r = make_mmu_pages_available(vcpu);
+ if (r < 0)
+ goto out_unlock;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+	 * write-protect the guest's page table root.
+ */
+ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, root_gfn, 0,
+ mmu->root_role.level);
+ mmu->root.hpa = root;
+ goto set_root_pgd;
+ }
+
+ if (WARN_ON_ONCE(!mmu->pae_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK | shadow_me_value;
+ if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
+ }
+
+ for (i = 0; i < 4; ++i) {
+ WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
+
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
+ if (!(pdptrs[i] & PT_PRESENT_MASK)) {
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+ continue;
+ }
+ root_gfn = pdptrs[i] >> PAGE_SHIFT;
+ }
+
+ /*
+ * If shadowing 32-bit non-PAE page tables, each PAE page
+ * directory maps one quarter of the guest's non-PAE page
+		 * directory. Otherwise each PAE page directory shadows one
+		 * guest PAE page directory, so quadrant should be 0.
+ */
+ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
+
+ root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
+ mmu->pae_root[i] = root | pm_mask;
+ }
+
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL)
+ mmu->root.hpa = __pa(mmu->pml5_root);
+ else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
+ mmu->root.hpa = __pa(mmu->pml4_root);
+ else
+ mmu->root.hpa = __pa(mmu->pae_root);
+
+set_root_pgd:
+ mmu->root.pgd = root_pgd;
+out_unlock:
+ write_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+}
+
+static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
+ u64 *pml5_root = NULL;
+ u64 *pml4_root = NULL;
+ u64 *pae_root;
+
+ /*
+ * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
+ * tables are allocated and initialized at root creation as there is no
+ * equivalent level in the guest's NPT to shadow. Allocate the tables
+ * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
+ */
+ if (mmu->root_role.direct ||
+ mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+ mmu->root_role.level < PT64_ROOT_4LEVEL)
+ return 0;
+
+ /*
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+	 * all MMUs are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
+ */
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
+ return 0;
+
+ /*
+ * The special roots should always be allocated in concert. Yell and
+ * bail if KVM ends up in a state where only one of the roots is valid.
+ */
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ (need_pml5 && mmu->pml5_root)))
+ return -EIO;
+
+ /*
+ * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
+ * doesn't need to be decrypted.
+ */
+ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pae_root)
+ return -ENOMEM;
+
+#ifdef CONFIG_X86_64
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (need_pml5) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
+ }
+#endif
+
+ mmu->pae_root = pae_root;
+ mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;
+
+ return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
+#endif
+}
+
+static bool is_unsync_root(hpa_t root)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
+ return false;
+
+ /*
+ * The read barrier orders the CPU's read of SPTE.W during the page table
+ * walk before the reads of sp->unsync/sp->unsync_children here.
+ *
+ * Even if another CPU was marking the SP as unsync-ed simultaneously,
+ * any guest page table changes are not guaranteed to be visible anyway
+ * until this VCPU issues a TLB flush strictly after those changes are
+ * made. We only need to ensure that the other CPU sets these flags
+ * before any actual changes to the page tables are made. The comments
+ * in mmu_try_to_unsync_pages() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ smp_rmb();
+ sp = root_to_sp(root);
+
+ /*
+	 * PAE roots (somewhat arbitrarily) aren't backed by shadow pages; the
+ * PDPTEs for a given PAE root need to be synchronized individually.
+ */
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ if (sp->unsync || sp->unsync_children)
+ return true;
+
+ return false;
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (vcpu->arch.mmu->root_role.direct)
+ return;
+
+ if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
+ return;
+
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu->root.hpa;
+
+ if (!is_unsync_root(root))
+ return;
+
+ sp = root_to_sp(root);
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_children(vcpu, sp, true);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu->pae_root[i];
+
+ if (IS_VALID_PAE_ROOT(root)) {
+ sp = spte_to_child_sp(root);
+ mmu_sync_children(vcpu, sp, true);
+ }
+ }
+
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ /* sync prev_roots by simply freeing them */
+ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
+}
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t vaddr, u64 access,
+ struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
+}
+
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ /*
+ * A nested guest cannot use the MMIO cache if it is using nested
+ * page tables, because cr2 is a nGPA while the cache stores GPAs.
+ */
+ if (mmu_is_nested(vcpu))
+ return false;
+
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
+ */
+static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int leaf = -1;
+ u64 spte;
+
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ *root_level = iterator.level;
+ shadow_walk_okay(&iterator);
+ __shadow_walk_next(&iterator, spte)) {
+ leaf = iterator.level;
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ sptes[leaf] = spte;
+ }
+
+ return leaf;
+}
+
+static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ int leaf;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ if (is_tdp_mmu_active(vcpu))
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level);
+ else
+ leaf = get_walk(vcpu, addr, sptes, root_level);
+
+ walk_shadow_page_lockless_end(vcpu);
+ return leaf;
+}
+
+/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
+static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ struct rsvd_bits_validate *rsvd_check;
+ int root, leaf, level;
+ bool reserved = false;
+
+ leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
+ if (unlikely(leaf < 0)) {
+ *sptep = 0ull;
+ return reserved;
+ }
+
+ *sptep = sptes[leaf];
+
+ /*
+ * Skip reserved bits checks on the terminal leaf if it's not a valid
+ * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
+ * design, always have reserved bits set. The purpose of the checks is
+	 * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
+ */
+ if (!is_shadow_present_pte(sptes[leaf]))
+ leaf++;
+
+ rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
+
+ for (level = root; level >= leaf; level--)
+ reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
+
+ if (reserved) {
+ pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
+ __func__, addr);
+ for (level = root; level >= leaf; level--)
+ pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
+ sptes[level], level,
+ get_rsvd_bits(rsvd_check, sptes[level], level));
+ }
+
+ return reserved;
+}
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+ bool reserved;
+
+ if (mmio_info_in_cache(vcpu, addr, direct))
+ return RET_PF_EMULATE;
+
+ reserved = get_mmio_spte(vcpu, addr, &spte);
+ if (WARN_ON_ONCE(reserved))
+ return -EINVAL;
+
+ if (is_mmio_spte(vcpu->kvm, spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned int access = get_mmio_spte_access(spte);
+
+ if (!check_mmio_spte(vcpu, spte))
+ return RET_PF_INVALID;
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return RET_PF_EMULATE;
+ }
+
+ /*
+	 * If the page table is zapped by another CPU, let the vCPU fault again on
+ * the address.
+ */
+ return RET_PF_RETRY;
+}
+
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(fault->rsvd))
+ return false;
+
+ if (!fault->present || !fault->write)
+ return false;
+
+ /*
+	 * The guest is writing a page that is write-tracked, which cannot
+	 * be fixed by the page fault handler.
+ */
+ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ clear_sp_write_flooding_count(iterator.sptep);
+ walk_shadow_page_lockless_end(vcpu);
+}
+
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+ /* make sure the token value is not 0 */
+ u32 id = vcpu->arch.apf.id;
+
+ if (id << 12 == 0)
+ vcpu->arch.apf.id = 1;
+
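+	/*
+	 * Token layout: bits 31:12 hold the per-vCPU allocation counter and
+	 * bits 11:0 hold the vcpu_id, which keeps tokens unique across vCPUs
+	 * assuming vcpu_id fits in 12 bits.
+	 */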
+ return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = alloc_apf_token(vcpu);
+ arch.gfn = fault->gfn;
+ arch.error_code = fault->error_code;
+ arch.direct_map = vcpu->arch.mmu->root_role.direct;
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
+
+ return kvm_setup_async_pf(vcpu, fault->addr,
+ kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
+}
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
+ if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu->root_role.direct &&
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
+ return;
+
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+	 * ignore stats for all other return codes. Page-ready "faults" aren't
+	 * truly spurious and never trigger emulation.
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+}
+
+static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int r)
+{
+ kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page,
+ r == RET_PF_RETRY, fault->map_writable);
+}
+
+static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int max_order, r;
+
+ if (!kvm_slot_has_gmem(fault->slot)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+ &fault->refcounted_page, &max_order);
+ if (r) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return r;
+ }
+
+ fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ fault->max_level = kvm_max_level_for_order(max_order);
+
+ return RET_PF_CONTINUE;
+}
+
+static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ unsigned int foll = fault->write ? FOLL_WRITE : 0;
+
+ if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot))
+ return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
+
+ foll |= FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
+ /*
+ * If resolving the page failed because I/O is needed to fault-in the
+ * page, then either set up an asynchronous #PF to do the I/O, or if
+ * doing an async #PF isn't possible, retry with I/O allowed. All
+ * other failures are terminal, i.e. retrying won't help.
+ */
+ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
+ return RET_PF_CONTINUE;
+
+ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(fault->addr, fault->gfn);
+ if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
+ trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return RET_PF_RETRY;
+ } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
+ return RET_PF_RETRY;
+ }
+ }
+
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ foll |= FOLL_INTERRUPTIBLE;
+ foll &= ~FOLL_NOWAIT;
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
+ &fault->map_writable, &fault->refcounted_page);
+
+ return RET_PF_CONTINUE;
+}
+
+static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, unsigned int access)
+{
+ struct kvm_memory_slot *slot = fault->slot;
+ struct kvm *kvm = vcpu->kvm;
+ int ret;
+
+ if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm))
+ return -EFAULT;
+
+ /*
+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ * change in attributes. is_page_fault_stale() will detect an
+	 * invalidation related to fault->gfn and resume the guest without
+ * installing a mapping in the page tables.
+ */
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ * private vs. shared mismatch.
+ */
+ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (unlikely(!slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn. Punt the
+ * error to userspace if this is a prefault, as KVM's prefaulting ABI
+ * doesn't provide the same forward progress guarantees as KVM_RUN.
+ */
+ if (slot->flags & KVM_MEMSLOT_INVALID) {
+ if (fault->prefetch)
+ return -EAGAIN;
+
+ return RET_PF_RETRY;
+ }
+
+ if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ /*
+ * Don't map L1's APIC access page into L2, KVM doesn't support
+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ * i.e. the access needs to be emulated. Emulating access to
+ * L1's APIC is also correct if L1 is accelerating L2's own
+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
+ * uses different roots for L1 vs. L2, i.e. there is no danger
+ * of breaking APICv/AVIC for L1.
+ */
+ if (is_guest_mode(vcpu))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (!kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
+ /*
+ * Check for a relevant mmu_notifier invalidation event before getting
+ * the pfn from the primary MMU, and before acquiring mmu_lock.
+ *
+ * For mmu_lock, if there is an in-progress invalidation and the kernel
+ * allows preemption, the invalidation task may drop mmu_lock and yield
+ * in response to mmu_lock being contended, which is *very* counter-
+ * productive as this vCPU can't actually make forward progress until
+ * the invalidation completes.
+ *
+	 * Retrying now can also avoid unnecessary lock contention in the primary
+ * MMU, as the primary MMU doesn't necessarily hold a single lock for
+ * the duration of the invalidation, i.e. faulting in a conflicting pfn
+ * can cause the invalidation to take longer by holding locks that are
+ * needed to complete the invalidation.
+ *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+ * will never yield mmu_lock in response to contention, as this vCPU is
+ * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+ * to detect retry guarantees the worst case latency for the vCPU.
+ */
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn))
+ return RET_PF_RETRY;
+
+ ret = __kvm_mmu_faultin_pfn(vcpu, fault);
+ if (ret != RET_PF_CONTINUE)
+ return ret;
+
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_error_pfn(vcpu, fault);
+
+ if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Check again for a relevant mmu_notifier invalidation event purely to
+ * avoid contending mmu_lock. Most invalidations will be detected by
+ * the previous check, but checking is extremely cheap relative to the
+ * overall cost of failing to detect the invalidation until after
+ * mmu_lock is acquired.
+ */
+ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
+ kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
+ return RET_PF_RETRY;
+ }
+
+ return RET_PF_CONTINUE;
+}
+
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
+ return true;
+
+ /*
+ * Check for a relevant mmu_notifier invalidation event one last time
+ * now that mmu_lock is held, as the "unsafe" checks performed without
+ * holding mmu_lock can get false negatives.
+ */
+ return fault->slot &&
+ mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
+}
+
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ int r;
+
+ /* Dummy roots are used only for shadowing bad guest roots. */
+ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_WRITE_PROTECTED;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
+
+ r = direct_map(vcpu, fault);
+
+out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+	/* This path builds a PAE page table; 2MB pages are the maximum mappable size. */
+ fault->max_level = PG_LEVEL_2M;
+ return direct_page_fault(vcpu, fault);
+}
+
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len)
+{
+ int r = 1;
+ u32 flags = vcpu->arch.apf.host_apf_flags;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+ /*
+	 * Legacy #PF exceptions only have a 32-bit error code. Simply drop the
+ * upper bits as KVM doesn't use them for #PF (because they are never
+ * set), and to ensure there are no collisions with KVM-defined bits.
+ */
+ if (WARN_ON_ONCE(error_code >> 32))
+ error_code = lower_32_bits(error_code);
+
+ /*
+ * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ * them to conflict with #PF error codes, which are limited to 32 bits.
+ */
+ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
+
+ kvm_request_l1tf_flush_l1d();
+ if (!flags) {
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+
+ r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
+ insn_len);
+ } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
+ vcpu->arch.apf.host_apf_flags = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait_schedule(fault_address);
+ local_irq_enable();
+ } else {
+ WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
+ }
+
+ return r;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_page_fault);
+
+#ifdef CONFIG_X86_64
+static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_WRITE_PROTECTED;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ read_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = kvm_tdp_mmu_map(vcpu, fault);
+
+out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+#endif
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+#ifdef CONFIG_X86_64
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+
+ return direct_page_fault(vcpu, fault);
+}
+
+static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 error_code, u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ } while (r == RET_PF_RETRY);
+
+ if (r < 0)
+ return r;
+
+ switch (r) {
+ case RET_PF_FIXED:
+ case RET_PF_SPURIOUS:
+ case RET_PF_WRITE_PROTECTED:
+ return 0;
+
+ case RET_PF_EMULATE:
+ return -ENOENT;
+
+ case RET_PF_RETRY:
+ case RET_PF_CONTINUE:
+ case RET_PF_INVALID:
+ default:
+ WARN_ONCE(1, "could not fix page fault during prefault");
+ return -EIO;
+ }
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK;
+ u8 level = PG_LEVEL_4K;
+ u64 direct_bits;
+ u64 end;
+ int r;
+
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
+ if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ return -EINVAL;
+
+ /*
+ * reload is efficient when called repeatedly, so we can do it on
+ * every iteration.
+ */
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ direct_bits = 0;
+ if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+ else
+ direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+
+ /*
+	 * Shadow paging uses GVAs for KVM page faults, so restrict to
+ * two-dimensional paging.
+ */
+ r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level);
+ if (r < 0)
+ return r;
+
+ /*
+	 * If the mapping that covers range->gpa can use a huge page, it may
+	 * start below range->gpa or end beyond range->gpa + range->size.
+ */
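+	/* E.g. for a 2M mapping, gpa = 0x12345000 yields end = 0x12400000. */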
+ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ return min(range->size, end - range->gpa);
+}
+
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+ WARN_ON_ONCE(!slot->gmem.file) ||
+ WARN_ON_ONCE(!file_count(slot->gmem.file)))
+ return;
+
+ lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct kvm_page_fault fault = {
+ .addr = gfn_to_gpa(gfn),
+ .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+ .prefetch = true,
+ .is_tdp = true,
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = PG_LEVEL_4K,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = true,
+
+ .gfn = gfn,
+ .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ .pfn = pfn,
+ .map_writable = true,
+ };
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Mapping a pre-determined private pfn is intended only for use when
+ * populating a guest_memfd instance. Assert that the slot is backed
+ * by guest_memfd and that the gmem instance's invalidate_lock is held.
+ */
+ kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
+ if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+ return -EIO;
+
+ if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+ return -EPERM;
+
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+
+ if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
+
+ cond_resched();
+
+ guard(read_lock)(&kvm->mmu_lock);
+
+ r = kvm_tdp_mmu_map(vcpu, &fault);
+ } while (r == RET_PF_RETRY);
+
+ if (r != RET_PF_FIXED)
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+#endif
+
+static void nonpaging_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->sync_spte = NULL;
+}
+
+static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
+ union kvm_mmu_page_role role)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root->hpa))
+ return false;
+
+ if (!role.direct && pgd != root->pgd)
+ return false;
+
+ sp = root_to_sp(root->hpa);
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ return role.word == sp->role.word;
+}
+
+/*
+ * Find out if a previously cached root matching the new pgd/role is available,
+ * and insert the current root as the MRU in the cache.
+ * If a matching root is found, it is assigned to kvm_mmu->root and
+ * true is returned.
+ * If no match is found, kvm_mmu->root is left invalid, the LRU root is
+ * evicted to make room for the current root, and false is returned.
+ */
+static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ /*
+ * The swaps end up rotating the cache like this:
+ * C 0 1 2 3 (on entry to the function)
+ * 0 C 1 2 3
+ * 1 C 0 2 3
+ * 2 C 0 1 3
+ * 3 C 0 1 2 (on exit from the loop)
+ */
+ swap(mmu->root, mmu->prev_roots[i]);
+ if (is_root_usable(&mmu->root, new_pgd, new_role))
+ return true;
+ }
+
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+ return false;
+}
+
+/*
+ * Find out if a previously cached root matching the new pgd/role is available.
+ * On entry, mmu->root is invalid.
+ * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
+ * of the cache becomes invalid, and true is returned.
+ * If no match is found, kvm_mmu->root is left invalid and false is returned.
+ */
+static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
+ goto hit;
+
+ return false;
+
+hit:
+ swap(mmu->root, mmu->prev_roots[i]);
+ /* Bubble up the remaining roots. */
+ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
+ mmu->prev_roots[i] = mmu->prev_roots[i + 1];
+ mmu->prev_roots[i].hpa = INVALID_PAGE;
+ return true;
+}
+
+static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
+ gpa_t new_pgd, union kvm_mmu_page_role new_role)
+{
+ /*
+ * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
+ * avoid having to deal with PDPTEs and other complexities.
+ */
+ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
+ kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
+
+ if (VALID_PAGE(mmu->root.hpa))
+ return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
+ else
+ return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
+}
+
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role new_role = mmu->root_role;
+
+ /*
+ * Return immediately if no usable root was found, kvm_mmu_reload()
+ * will establish a valid root prior to the next VM-Enter.
+ */
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
+ return;
+
+ /*
+ * It's possible that the cached previous root page is obsolete because
+ * of a change in the MMU generation number. However, changing the
+ * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
+ * which will free the root set here and allocate a new one.
+ */
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
+
+ if (force_flush_and_sync_on_reuse) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the VCPU. When
+ * switching to a new CR3, that GVA->GPA mapping may no longer be
+ * valid. So clear any cached MMIO info even when we don't need to sync
+ * the shadow page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ /*
+ * If this is a direct root page, it doesn't have a write flooding
+ * count. Otherwise, clear the write flooding count.
+ */
+ if (!new_role.direct) {
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (!WARN_ON_ONCE(!sp))
+ __clear_sp_write_flooding_count(sp);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_new_pgd);
+
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ unsigned int access)
+{
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
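+/*
+ * paging_tmpl.h acts as a C "template": each inclusion below generates a
+ * complete set of guest-walker and page-fault helpers specialized for one
+ * guest PTE format, with FNAME() expanding to the ept_*, paging64_* and
+ * paging32_* functions referenced elsewhere in this file.
+ */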
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, int level, bool nx,
+ bool gbpages, bool pse, bool amd)
+{
+ u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
+ u64 high_bits_rsvd;
+
+ rsvd_check->bad_mt_xwr = 0;
+
+ if (!gbpages)
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ if (level == PT32E_ROOT_LEVEL)
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+ else
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+ /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+ if (!nx)
+ high_bits_rsvd |= rsvd_bits(63, 63);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (amd)
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
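+	/*
+	 * rsvd_bits_mask[i][j] holds the reserved bits for a level j+1 entry,
+	 * where i is the value of the entry's bit 7 (the PS bit).  For 4K PTEs
+	 * bit 7 is PAT rather than PS, which is why [1][0] simply mirrors
+	 * [0][0] in every case below.
+	 */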
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+		/* No rsvd bits for 2-level 4K page table entries. */
+ rsvd_check->rsvd_bits_mask[0][1] = 0;
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+
+ if (!pse) {
+ rsvd_check->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
+		if (is_cpuid_PSE36())
+			/* 36-bit PSE 4MB page */
+			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+		else
+			/* 32-bit PSE 4MB page */
+			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ break;
+ case PT32E_ROOT_LEVEL:
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ high_bits_rsvd |
+ rsvd_bits(5, 8) |
+ rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ fallthrough;
+ case PT64_ROOT_4LEVEL:
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ }
+}
+
+static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ __reset_rsvds_bits_mask(&context->guest_rsvd_check,
+ vcpu->arch.reserved_gpa_bits,
+ context->cpu_role.base.level, is_efer_nx(context),
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
+ is_cr4_pse(context),
+ guest_cpuid_is_amd_compatible(vcpu));
+}
+
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, bool execonly,
+ int huge_page_level)
+{
+ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+ u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
+ u64 bad_mt_xwr;
+
+ if (huge_page_level < PG_LEVEL_1G)
+ large_1g_rsvd = rsvd_bits(7, 7);
+ if (huge_page_level < PG_LEVEL_2M)
+ large_2m_rsvd = rsvd_bits(7, 7);
+
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
+ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+
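+	/*
+	 * bad_mt_xwr is a 64-bit bitmap indexed by the low 6 bits of an EPT
+	 * PTE (bits 2:0 = XWR permissions, bits 5:3 = memory type); a set bit
+	 * marks that combination as illegal.  Each 0xFF << (mt * 8) below
+	 * flags all eight XWR values for one memory type, and each
+	 * REPEAT_BYTE(1 << xwr) flags one XWR value for every memory type.
+	 */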
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
+ }
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
+}
+
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly, int huge_page_level)
+{
+ __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ vcpu->arch.reserved_gpa_bits, execonly,
+ huge_page_level);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
+}
+
+/*
+ * The page table on the host is the shadow page table for the page table in
+ * the guest or an AMD nested guest; its MMU features completely follow the
+ * features in the guest.
+ */
+static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+	/* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyway. */
+ bool is_amd = true;
+ /* KVM doesn't use 2-level page tables for the shadow MMU. */
+ bool is_pse = false;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
+
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level,
+ context->root_role.efer_nx,
+ guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->root_role.level; --i >= 0;) {
+ /*
+ * So far shadow_me_value is a constant during KVM's life
+ * time. Bits in shadow_me_value are allowed to be set.
+ * Bits in shadow_me_mask but not in shadow_me_value are
+ * not allowed to be set.
+ */
+ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
+ }
+}
+
+static inline bool boot_cpu_is_amd(void)
+{
+ WARN_ON_ONCE(!tdp_enabled);
+ return shadow_x_mask == 0;
+}
+
+/*
+ * The direct page table on the host uses as many MMU features as possible;
+ * however, KVM currently does not do execution-protection.
+ */
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
+ if (boot_cpu_is_amd())
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+ context->root_role.level, true,
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ false, true);
+ else
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
+ reserved_hpa_bits(), false,
+ max_huge_page_level);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->root_role.level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+}
+
+/*
+ * Same as the comments in reset_shadow_zero_bits_mask(), except this is the
+ * shadow page table for an Intel nested guest.
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ reserved_hpa_bits(), execonly,
+ max_huge_page_level);
+}
+
+#define BYTE_MASK(access) \
+ ((1 & (access) ? 2 : 0) | \
+ (2 & (access) ? 4 : 0) | \
+ (3 & (access) ? 8 : 0) | \
+ (4 & (access) ? 16 : 0) | \
+ (5 & (access) ? 32 : 0) | \
+ (6 & (access) ? 64 : 0) | \
+ (7 & (access) ? 128 : 0))
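+
+/*
+ * BYTE_MASK(a) sets bit i (for i = 1..7) iff UWX combination i includes any
+ * bit of @a; bit 0 (combination 000) is never set.  E.g. assuming
+ * ACC_EXEC_MASK == 1, ACC_WRITE_MASK == 2 and ACC_USER_MASK == 4, the masks
+ * computed below are x = 0xaa, w = 0xcc and u = 0xf0, i.e. one bit per UWX
+ * combination that includes the given right.
+ */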
+
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
+{
+ unsigned byte;
+
+ const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+ const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+ const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+ bool cr4_smep = is_cr4_smep(mmu);
+ bool cr4_smap = is_cr4_smap(mmu);
+ bool cr0_wp = is_cr0_wp(mmu);
+ bool efer_nx = is_efer_nx(mmu);
+
+ for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+ unsigned pfec = byte << 1;
+
+ /*
+ * Each "*f" variable has a 1 bit for each UWX value
+ * that causes a fault with the given PFEC.
+ */
+
+ /* Faults from writes to non-writable pages */
+ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ /* Faults from user mode accesses to supervisor pages */
+ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
+		/* Faults from fetches of non-executable pages */
+ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
+ /* Faults from kernel mode fetches of user pages */
+ u8 smepf = 0;
+ /* Faults from kernel mode accesses of user pages */
+ u8 smapf = 0;
+
+ if (!ept) {
+ /* Faults from kernel mode accesses to user pages */
+ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ if (!efer_nx)
+ ff = 0;
+
+ /* Allow supervisor writes if !cr0.wp */
+ if (!cr0_wp)
+ wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ if (cr4_smep)
+ smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are true:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - The access is not a fetch
+ * - The access is supervisor mode
+ * - If implicit supervisor access or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first four conditions.
+ * The fifth is computed dynamically in permission_fault();
+ * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+ * *not* subject to SMAP restrictions.
+ */
+ if (cr4_smap)
+ smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
+ }
+
+ mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ }
+}
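+
+/*
+ * permission_fault() consumes the bitmap built above: roughly, it indexes
+ * permissions[] with the fault's PFEC shifted right by one (folding in
+ * PFERR_RSVD_MASK for accesses that are exempt from SMAP) and tests the bit
+ * selected by the pte's UWX access bits.
+ */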
+
+/*
+ * PKU is an additional mechanism by which the paging controls access to
+ * user-mode addresses based on the value in the PKRU register. Protection
+ * key violations are reported through a bit in the page fault error code.
+ * Unlike other bits of the error code, the PK bit is not known at the
+ * call site of e.g. gva_to_gpa; it must be computed directly in
+ * permission_fault based on two bits of PKRU, on some machine state (CR4,
+ * CR0, EFER, CPL), and on other bits of the error code and the page tables.
+ *
+ * In particular the following conditions come from the error code, the
+ * page tables and the machine state:
+ * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+ * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+ * - PK is always zero if U=0 in the page tables
+ * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+ *
+ * The PKRU bitmask caches the result of these four conditions. The error
+ * code (minus the P bit) and the page table's U bit form an index into the
+ * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+ * with the two bits of the PKRU register corresponding to the protection key.
+ * For the first three conditions above the bits will be 00, thus masking
+ * away both AD and WD. For all reads or if the last condition holds, WD
+ * only will be masked away.
+ */
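+/*
+ * Worked example: for a user-mode write (PFEC.U = PFEC.W = 1) to a user page
+ * (i.e. the index bit that replaces PFEC.RSVD is set), check_pkey and
+ * check_write below are both true, so pkey_bits is 3 and both PKRU.AD and
+ * PKRU.WD are honored; for the same access with PFEC.F = 1 (a fetch),
+ * check_pkey is false and the protection key is ignored entirely.
+ */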
+static void update_pkru_bitmask(struct kvm_mmu *mmu)
+{
+ unsigned bit;
+ bool wp;
+
+ mmu->pkru_mask = 0;
+
+ if (!is_cr4_pke(mmu))
+ return;
+
+ wp = is_cr0_wp(mmu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+ * Only need to check the access which is not an
+ * instruction fetch and is to a user page.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
+
+static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (!is_cr0_pg(mmu))
+ return;
+
+ reset_guest_rsvds_bits_mask(vcpu, mmu);
+ update_permission_bitmask(mmu, false);
+ update_pkru_bitmask(mmu);
+}
+
+static void paging64_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->sync_spte = paging64_sync_spte;
+}
+
+static void paging32_init_context(struct kvm_mmu *context)
+{
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->sync_spte = paging32_sync_spte;
+}
+
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+ const struct kvm_mmu_role_regs *regs)
+{
+ union kvm_cpu_role role = {0};
+
+ role.base.access = ACC_ALL;
+ role.base.smm = is_smm(vcpu);
+ role.base.guest_mode = is_guest_mode(vcpu);
+ role.ext.valid = 1;
+
+ if (!____is_cr0_pg(regs)) {
+ role.base.direct = 1;
+ return role;
+ }
+
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+ role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
+
+ if (____is_efer_lma(regs))
+ role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
+ : PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ role.base.level = PT32E_ROOT_LEVEL;
+ else
+ role.base.level = PT32_ROOT_LEVEL;
+
+ role.ext.cr4_smep = ____is_cr4_smep(regs);
+ role.ext.cr4_smap = ____is_cr4_smap(regs);
+ role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+ /* PKEY and LA57 are active iff long mode is active. */
+ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ role.ext.efer_lma = ____is_efer_lma(regs);
+ return role;
+}
+
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
+
+ BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
+ BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
+
+ if (is_cr0_wp(mmu) == cr0_wp)
+ return;
+
+ mmu->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, mmu);
+}
+
+static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+{
+ int maxpa;
+
+ if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM)
+ maxpa = cpuid_query_maxguestphyaddr(vcpu);
+ else
+ maxpa = cpuid_maxphyaddr(vcpu);
+
+	/* tdp_root_level is the architecture-forced level; use it if nonzero. */
+ if (tdp_root_level)
+ return tdp_root_level;
+
+ /* Use 5-level TDP if and only if it's useful/necessary. */
+ if (max_tdp_level == 5 && maxpa <= 48)
+ return 4;
+
+ return max_tdp_level;
+}
+
+u8 kvm_mmu_get_max_tdp_level(void)
+{
+ return tdp_root_level ? tdp_root_level : max_tdp_level;
+}
+
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.access = ACC_ALL;
+ role.cr0_wp = true;
+ role.efer_nx = true;
+ role.smm = cpu_role.base.smm;
+ role.guest_mode = cpu_role.base.guest_mode;
+ role.ad_disabled = !kvm_ad_enabled;
+ role.level = kvm_mmu_get_tdp_level(vcpu);
+ role.direct = true;
+ role.has_4_byte_gpte = false;
+
+ return role;
+}
+
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
+
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
+
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
+ context->page_fault = kvm_tdp_page_fault;
+ context->sync_spte = NULL;
+ context->get_guest_pgd = get_guest_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+
+ if (!is_cr0_pg(context))
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_cr4_pae(context))
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ context->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(context);
+}
+
+static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ union kvm_cpu_role cpu_role,
+ union kvm_mmu_page_role root_role)
+{
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
+ return;
+
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
+
+ if (!is_cr0_pg(context))
+ nonpaging_init_context(context);
+ else if (is_cr4_pae(context))
+ paging64_init_context(context);
+ else
+ paging32_init_context(context);
+
+ reset_guest_paging_metadata(vcpu, context);
+ reset_shadow_zero_bits_mask(vcpu, context);
+}
+
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_page_role root_role;
+
+ root_role = cpu_role.base;
+
+ /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
+ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
+
+ /*
+ * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts.
+ */
+ root_role.efer_nx = true;
+
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+}
+
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = cr0,
+ .cr4 = cr4 & ~X86_CR4_PKE,
+ .efer = efer,
+ };
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+ union kvm_mmu_page_role root_role;
+
+ /* NPT requires CR0.PG=1. */
+ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
+
+ root_role = cpu_role.base;
+ root_role.level = kvm_mmu_get_tdp_level(vcpu);
+ if (root_role.level == PT64_ROOT_5LEVEL &&
+ cpu_role.base.level == PT64_ROOT_4LEVEL)
+ root_role.passthrough = 1;
+
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+ kvm_mmu_new_pgd(vcpu, nested_cr3);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
+
+static union kvm_cpu_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
+ bool execonly, u8 level)
+{
+ union kvm_cpu_role role = {0};
+
+ /*
+ * KVM does not support SMM transfer monitors, and consequently does not
+ * support the "entry to SMM" control either. role.base.smm is always 0.
+ */
+ WARN_ON_ONCE(is_smm(vcpu));
+ role.base.level = level;
+ role.base.has_4_byte_gpte = false;
+ role.base.direct = false;
+ role.base.ad_disabled = !accessed_dirty;
+ role.base.guest_mode = true;
+ role.base.access = ACC_ALL;
+
+ role.ext.word = 0;
+ role.ext.execonly = execonly;
+ role.ext.valid = 1;
+
+ return role;
+}
+
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ u8 level = vmx_eptp_page_walk_level(new_eptp);
+ union kvm_cpu_role new_mode =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
+ execonly, level);
+
+ if (new_mode.as_u64 != context->cpu_role.as_u64) {
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ context->cpu_role.as_u64 = new_mode.as_u64;
+ context->root_role.word = new_mode.base.word;
+
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_spte = ept_sync_spte;
+
+ update_permission_bitmask(context, true);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+ reset_ept_shadow_zero_bits_mask(context, execonly);
+ }
+
+ kvm_mmu_new_pgd(vcpu, new_eptp);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu);
+
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+
+ kvm_init_shadow_mmu(vcpu, cpu_role);
+
+ context->get_guest_pgd = get_guest_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+}
+
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role new_mode)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ if (new_mode.as_u64 == g_context->cpu_role.as_u64)
+ return;
+
+ g_context->cpu_role.as_u64 = new_mode.as_u64;
+ g_context->get_guest_pgd = get_guest_cr3;
+ g_context->get_pdptr = kvm_pdptr_read;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * L2 page tables are never shadowed, so there is no need to sync
+ * SPTEs.
+ */
+ g_context->sync_spte = NULL;
+
+ /*
+ * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu))
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa;
+ else if (is_long_mode(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else if (is_pae(vcpu))
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
+ else
+ g_context->gva_to_gpa = paging32_gva_to_gpa;
+
+ reset_guest_paging_metadata(vcpu, g_context);
+}
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+
+ if (mmu_is_nested(vcpu))
+ init_kvm_nested_mmu(vcpu, cpu_role);
+ else if (tdp_enabled)
+ init_kvm_tdp_mmu(vcpu, cpu_role);
+ else
+ init_kvm_softmmu(vcpu, cpu_role);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu);
+
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Invalidate all MMU roles to force them to reinitialize as CPUID
+ * information is factored into reserved bit calculations.
+ *
+	 * Correctly handling multiple vCPU models (with respect to paging and
+	 * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_write_track (see struct kvm_mmu_page_role comments). For now
+ * that problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
+ */
+ vcpu->arch.root_mmu.root_role.invalid = 1;
+ vcpu->arch.guest_mmu.root_role.invalid = 1;
+ vcpu->arch.nested_mmu.root_role.invalid = 1;
+ vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
+ */
+ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
+}
+
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ kvm_init_mmu(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
+ if (r)
+ goto out;
+ r = mmu_alloc_special_roots(vcpu);
+ if (r)
+ goto out;
+ if (vcpu->arch.mmu->root_role.direct)
+ r = mmu_alloc_direct_roots(vcpu);
+ else
+ r = mmu_alloc_shadow_roots(vcpu);
+ if (r)
+ goto out;
+
+ kvm_mmu_sync_roots(vcpu);
+
+ kvm_mmu_load_pgd(vcpu);
+
+ /*
+ * Flush any TLB entries for the new root, the provenance of the root
+ * is unknown. Even if KVM ensures there are no stale TLB entries
+ * for a freed root, in theory another hypervisor could have left
+ * stale entries. Flushing on alloc also allows KVM to skip the TLB
+ * flush when freeing a root (see kvm_tdp_mmu_put_root()).
+ */
+ kvm_x86_call(flush_tlb_current)(vcpu);
+out:
+ return r;
+}
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+}
+
+static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root_hpa))
+ return false;
+
+ /*
+ * When freeing obsolete roots, treat roots as obsolete if they don't
+ * have an associated shadow page, as it's impossible to determine if
+ * such roots are fresh or stale. This does mean KVM will get false
+ * positives and free roots that don't strictly need to be freed, but
+ * such false positives are relatively rare:
+ *
+ * (a) only PAE paging and nested NPT have roots without shadow pages
+ * (or any shadow paging flavor with a dummy root, see note below)
+ * (b) remote reloads due to a memslot update obsoletes _all_ roots
+ * (c) KVM doesn't track previous roots for PAE paging, and the guest
+ * is unlikely to zap an in-use PGD.
+ *
+ * Note! Dummy roots are unique in that they are obsoleted by memslot
+ * _creation_! See also FNAME(fetch).
+ */
+ sp = root_to_sp(root_hpa);
+ return !sp || is_obsolete_sp(kvm, sp);
+}
+
+static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ if (is_obsolete_root(kvm, mmu->root.hpa))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots_to_free)
+ kvm_mmu_free_roots(kvm, mmu, roots_to_free);
+}
+
+void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
+{
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
+ __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_free_obsolete_roots);
+
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ int *bytes)
+{
+ u64 gentry = 0;
+ int r;
+
+ /*
+	 * Assume that the pte write is on a page table of the same type as the
+	 * current vcpu paging mode, since we update the sptes only when they
+	 * have the same mode.
+ */
+ if (is_pae(vcpu) && *bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
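+		/* E.g. a 4-byte write at gpa 0x1004 becomes an 8-byte read at 0x1000. */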
+ *gpa &= ~(gpa_t)7;
+ *bytes = 8;
+ }
+
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
+ }
+
+ return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
+{
+ /*
+	 * Skip write-flooding detection for an sp whose level is PG_LEVEL_4K,
+	 * because it can become unsync, in which case the guest page is no
+	 * longer write-protected.
+ */
+ if (sp->role.level == PG_LEVEL_4K)
+ return false;
+
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+ int bytes)
+{
+ unsigned offset, pte_size, misaligned;
+
+ offset = offset_in_page(gpa);
+ pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
+
+ /*
+	 * Sometimes the OS writes only the last byte to update status bits,
+	 * e.g. Linux uses the andb instruction in clear_bit().
+ */
+ if (!(offset & (pte_size - 1)) && bytes == 1)
+ return false;
+
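+	/*
+	 * E.g. with 8-byte gptes, a 4-byte write at offset 6 spans two ptes
+	 * ((6 ^ 9) & ~7 == 8) and is treated as misaligned, whereas 4-byte
+	 * writes at offsets 0 and 4 are not.
+	 */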
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+
+ return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+ unsigned page_offset, quadrant;
+ u64 *spte;
+ int level;
+
+ page_offset = offset_in_page(gpa);
+ level = sp->role.level;
+ *nspte = 1;
+ if (sp->role.has_4_byte_gpte) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ *nspte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ return NULL;
+ }
+
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ return spte;
+}
+
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ u64 entry, gentry, *spte;
+ int npte;
+ bool flush = false;
+
+ /*
+ * When emulating guest writes, ensure the written value is visible to
+ * any task that is handling page faults before checking whether or not
+ * KVM is shadowing a guest PTE. This ensures either KVM will create
+ * the correct SPTE in the page fault handler, or this task will see
+ * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in
+ * account_shadowed().
+ */
+ smp_mb();
+ if (!vcpu->kvm->arch.indirect_shadow_pages)
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
+ ++vcpu->kvm->stat.mmu_pte_write;
+
+ for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+
+ spte = get_written_sptes(sp, gpa, &npte);
+ if (!spte)
+ continue;
+
+ while (npte--) {
+ entry = *spte;
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ if (is_shadow_present_pte(entry))
+ flush = true;
+ ++spte;
+ }
+ }
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+static bool is_write_to_guest_page_table(u64 error_code)
+{
+ const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+
+ return (error_code & mask) == mask;
+}
+
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 error_code, int *emulation_type)
+{
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ /*
+ * Do not try to unprotect and retry if the vCPU re-faulted on the same
+ * RIP with the same address that was previously unprotected, as doing
+	 * so will likely put the vCPU into an infinite loop. E.g. if the vCPU uses
+ * a non-page-table modifying instruction on the PDE that points to the
+ * instruction, then unprotecting the gfn will unmap the instruction's
+ * code, i.e. make it impossible for the instruction to ever complete.
+ */
+ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+ vcpu->arch.last_retry_addr == cr2_or_gpa)
+ return RET_PF_EMULATE;
+
+ /*
+ * Reset the unprotect+retry values that guard against infinite loops.
+ * The values will be refreshed if KVM explicitly unprotects a gfn and
+ * retries, in all other cases it's safe to retry in the future even if
+ * the next page fault happens on the same RIP+address.
+ */
+ vcpu->arch.last_retry_eip = 0;
+ vcpu->arch.last_retry_addr = 0;
+
+ /*
+ * It should be impossible to reach this point with an MMIO cache hit,
+ * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+ * writable memslot, and creating a memslot should invalidate the MMIO
+ * cache by way of changing the memslot generation. WARN and disallow
+ * retry if MMIO is detected, as retrying MMIO emulation is pointless
+ * and could put the vCPU into an infinite loop because the processor
+ * will keep faulting on the non-existent MMIO address.
+ */
+ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+ return RET_PF_EMULATE;
+
+ /*
+ * Before emulating the instruction, check to see if the access was due
+ * to a read-only violation while the CPU was walking non-nested NPT
+ * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+ * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+ * having nCR3 share lower level page tables with hCR3, then when KVM
+ * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+ * unknowingly write-protecting L1's guest page tables, which KVM isn't
+ * shadowing.
+ *
+ * Because the CPU (by default) walks NPT page tables using a write
+ * access (to ensure the CPU can do A/D updates), page walks in L1 can
+ * trigger write faults for the above case even when L1 isn't modifying
+ * PTEs. As a result, KVM will unnecessarily emulate (or at least, try
+ * to emulate) an excessive number of L1 instructions; because L1's MMU
+ * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+ * and thus no need to emulate in order to guarantee forward progress.
+ *
+ * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+ * proceed without triggering emulation. If one or more shadow pages
+ * was zapped, skip emulation and resume L1 to let it natively execute
+ * the instruction. If no shadow pages were zapped, then the write-
+ * fault is due to something else entirely, i.e. KVM needs to emulate,
+ * as resuming the guest will put it into an infinite loop.
+ *
+ * Note, this code also applies to Intel CPUs, even though it is *very*
+ * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+ * format) with L2's page tables (EPT format).
+ *
+ * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+ * unprotect the gfn and retry if an event is awaiting reinjection. If
+ * KVM emulates multiple instructions before completing event injection,
+ * the event could be delayed beyond what is architecturally allowed,
+ * e.g. KVM could inject an IRQ after the TPR has been raised.
+ */
+ if (((direct && is_write_to_guest_page_table(error_code)) ||
+ (!direct && kvm_event_needs_reinjection(vcpu))) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+ return RET_PF_RETRY;
+
+ /*
+	 * The gfn is write-protected, but if KVM detects it's emulating an
+ * instruction that is unlikely to be used to modify page tables, or if
+ * emulation fails, KVM can try to unprotect the gfn and let the CPU
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying an instruction from a nested guest as KVM is only explicitly
+ * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+ * going to magically fix whatever issue caused L2 to fail.
+ */
+ if (!is_guest_mode(vcpu))
+ *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+ return RET_PF_EMULATE;
+}
+
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = EMULTYPE_PF;
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ /*
+ * Except for reserved faults (emulated MMIO is shared-only), set the
+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ * current attributes, which are the source of truth for such VMs. Note,
+	 * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+	 * currently support nested virtualization (among many other things)
+ * for software-protected VMs.
+ */
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ !(error_code & PFERR_RSVD_MASK) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ r = RET_PF_INVALID;
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ return -EFAULT;
+
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
+ if (r == RET_PF_EMULATE)
+ goto emulate;
+ }
+
+ if (r == RET_PF_INVALID) {
+ vcpu->stat.pf_taken++;
+
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
+ &emulation_type, NULL);
+ if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
+ return -EIO;
+ }
+
+ if (r < 0)
+ return r;
+
+ if (r == RET_PF_WRITE_PROTECTED)
+ r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+ &emulation_type);
+
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+
+ /*
+ * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or
+ * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE.
+ * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to
+	 * indicate continuing the page fault handling until the final
+ * page table mapping phase.
+ */
+ WARN_ON_ONCE(r == RET_PF_CONTINUE);
+ if (r != RET_PF_EMULATE)
+ return r;
+
+emulate:
+ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
+ insn_len);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_page_fault);
+
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ int root_level, leaf, level;
+
+ leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level);
+ if (unlikely(leaf < 0))
+ return;
+
+ pr_err("%s %llx", msg, gpa);
+ for (level = root_level; level >= leaf; level--)
+ pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_print_sptes);
+
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+
+ vcpu_clear_mmio_info(vcpu, addr);
+
+ /*
+ * Walking and synchronizing SPTEs both assume they are operating in
+ * the context of the current MMU, and would need to be reworked if
+ * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
+ */
+ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
+ return;
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+ struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+ if (sp->unsync) {
+ int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+ if (ret < 0)
+ mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+ if (ret)
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+ }
+
+ if (!sp->unsync_children)
+ break;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots)
+{
+ int i;
+
+ WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
+ /* It's actually a GPA for vcpu->arch.guest_mmu. */
+ if (mmu != &vcpu->arch.guest_mmu) {
+ /* INVLPG on a non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_invlpg_address(addr, vcpu))
+ return;
+
+ kvm_x86_call(flush_tlb_gva)(vcpu, addr);
+ }
+
+ if (!mmu->sync_spte)
+ return;
+
+ if (roots & KVM_MMU_ROOT_CURRENT)
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invalidate_addr);
+
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Blindly sync all roots as it would take
+ * roughly the same amount of work/time to determine whether any of the
+ * previous roots have a global mapping.
+ *
+ * Mappings not reachable via the current or previous cached roots will
+ * be synced when switching to that new cr3, so nothing needs to be
+ * done here for them.
+ */
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
+
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots = 0;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu))
+ roots |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
+{
+ tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
+ max_tdp_level = tdp_max_root_level;
+
+#ifdef CONFIG_X86_64
+ tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
+#endif
+ /*
+ * max_huge_page_level reflects KVM's MMU capabilities irrespective
+ * of kernel support, e.g. KVM may be capable of using 1GB pages when
+ * the kernel is not. But, KVM never creates a page size greater than
+ * what is used by the kernel for any given HVA, i.e. the kernel's
+ * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
+ */
+ if (tdp_enabled)
+ max_huge_page_level = tdp_huge_page_level;
+ else if (boot_cpu_has(X86_FEATURE_GBPAGES))
+ max_huge_page_level = PG_LEVEL_1G;
+ else
+ max_huge_page_level = PG_LEVEL_2M;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_configure_mmu);
+
+static void free_mmu_pages(struct kvm_mmu *mmu)
+{
+ if (!tdp_enabled && mmu->pae_root)
+ set_memory_encrypted((unsigned long)mmu->pae_root, 1);
+ free_page((unsigned long)mmu->pae_root);
+ free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
+}
+
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ struct page *page;
+ int i;
+
+ mmu->root.hpa = INVALID_PAGE;
+ mmu->root.pgd = 0;
+ mmu->mirror_root_hpa = INVALID_PAGE;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
+ /*
+ * When using PAE paging, the four PDPTEs are treated as 'root' pages,
+ * while the PDP table is a per-vCPU construct that's allocated at MMU
+ * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate the PDP table in the first
+ * 4GB of memory, which happens to fit the DMA32 zone. TDP paging
+ * generally doesn't use PAE paging and can skip allocating the PDP
+ * table. The main exception, handled here, is SVM's 32-bit NPT. The
+ * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
+ */
+ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+ return 0;
+
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+
+ mmu->pae_root = page_address(page);
+
+ /*
+ * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
+ * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
+ * that KVM's writes and the CPU's reads get along. Note, this is
+ * only necessary when using shadow paging, as 64-bit NPT can get at
+ * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
+ * by 32-bit kernels (when KVM itself uses 32-bit NPT).
+ */
+ if (!tdp_enabled)
+ set_memory_decrypted((unsigned long)mmu->pae_root, 1);
+ else
+ WARN_ON_ONCE(shadow_me_value);
+
+ for (i = 0; i < 4; ++i)
+ mmu->pae_root[i] = INVALID_PAE_ROOT;
+
+ return 0;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+ vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+ vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
+ if (ret)
+ return ret;
+
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
+ if (ret)
+ goto fail_allocate_root;
+
+ return ret;
+ fail_allocate_root:
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ return ret;
+}
+
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ int nr_zapped, batch = 0;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ /*
+ * No obsolete valid page exists before a newly created page
+ * since active_mmu_pages is a FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Invalid pages should never land back on the list of active
+ * pages. Skip the bogus page, otherwise we'll get stuck in an
+ * infinite loop if the page gets put back on the list (again).
+ */
+ if (WARN_ON_ONCE(sp->role.invalid))
+ continue;
+
+ /*
+ * No need to flush the TLB since we're only zapping shadow
+ * pages with an obsolete generation number and all vCPUS have
+ * loaded a new root, i.e. the shadow pages being zapped cannot
+ * be in active use by the guest.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_rwlock_write(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
+ }
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
+ &invalid_list, &nr_zapped);
+ batch += nr_zapped;
+
+ if (unstable)
+ goto restart;
+ }
+
+ /*
+ * Kick all vCPUs (via remote TLB flush) before freeing the page tables
+ * to ensure KVM is not in the middle of a lockless shadow page table
+ * walk, which may reference the pages. The remote TLB flush itself is
+ * not required and is simply a convenient way to kick vCPUs as needed.
+ * KVM performs a local TLB flush when allocating a new root (see
+	 * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
+ * running with an obsolete MMU.
+ */
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast-invalidate all shadow pages, using a lock-break technique to zap
+ * obsolete pages.
+ *
+ * This is required when a memslot is being deleted or the VM is being
+ * destroyed; in these cases, we must ensure that the KVM MMU does not use
+ * any resource of the slot being deleted (or of any slot, when the VM is
+ * being destroyed) after the function returns.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->slots_lock);
+
+ write_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_zap_all_fast(kvm);
+
+ /*
+ * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
+ * held for the entire duration of zapping obsolete pages, it's
+ * impossible for there to be multiple invalid generations associated
+ * with *valid* shadow pages at any given time, i.e. there is exactly
+ * one valid generation and (at most) one invalid generation.
+ */
+ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
+
+ /*
+ * In order to ensure all vCPUs drop their soon-to-be invalid roots,
+ * invalidating TDP MMU roots must be done while holding mmu_lock for
+ * write and in the same critical section as making the reload request,
+ * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
+ */
+ if (tdp_mmu_enabled) {
+ /*
+ * External page tables don't support fast zapping, therefore
+ * their mirrors must be invalidated separately by the caller.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
+ }
+
+ /*
+	 * Notify all vcpus to reload their shadow page tables and flush TLBs.
+	 * Then all vcpus will switch to the new shadow page tables with the new
+	 * mmu_valid_gen.
+ * mmu_valid_gen.
+ *
+ * Note: we need to do this under the protection of mmu_lock,
+ * otherwise, vcpu would purge shadow page but miss tlb flush.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
+
+ kvm_zap_obsolete_pages(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+
+ /*
+ * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
+ * returning to the caller, e.g. if the zap is in response to a memslot
+ * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
+	 * associated with the deleted memslot once the update completes.
+	 * Deferring the zap until the final reference to the root is put would
+ * lead to use-after-free.
+ */
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
+}
+
+int kvm_mmu_init_vm(struct kvm *kvm)
+{
+ int r, i;
+
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages[i].pages);
+ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+
+ if (tdp_mmu_enabled) {
+ kvm_mmu_init_tdp_mmu(kvm);
+ } else {
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+ }
+
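+ /*
+ * Caches backing eager huge page splitting for dirty logging; they are
+ * topped up under slots_lock and consumed while holding mmu_lock.
+ */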
+ kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
+ kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
+ kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
+ kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
+}
+
+static void mmu_free_vm_memory_caches(struct kvm *kvm)
+{
+ kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
+ kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ kvfree(kvm->arch.mmu_page_hash);
+
+ if (tdp_mmu_enabled)
+ kvm_mmu_uninit_tdp_mmu(kvm);
+
+ mmu_free_vm_memory_caches(kvm);
+}
+
+static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ const struct kvm_memory_slot *memslot;
+ struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
+ bool flush = false;
+ gfn_t start, end;
+ int i;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return flush;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ memslot = iter.slot;
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (WARN_ON_ONCE(start >= end))
+ continue;
+
+ flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+ end, true, flush);
+ }
+ }
+
+ return flush;
+}
+
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start up to, but not
+ * including, gfn_end.
+ */
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ bool flush;
+
+ if (WARN_ON_ONCE(gfn_end <= gfn_start))
+ return;
+
+ write_lock(&kvm->mmu_lock);
+
+ kvm_mmu_invalidate_begin(kvm);
+
+ kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
+
+ flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
+
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
+
+ if (flush)
+ kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
+
+ kvm_mmu_invalidate_end(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range);
+
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ return rmap_write_protect(rmap_head, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int start_level)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
+ read_unlock(&kvm->mmu_lock);
+ }
+}
+
+static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
+{
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static bool need_topup_split_caches_or_resched(struct kvm *kvm)
+{
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ /*
+ * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
+ * to split a single huge page. Calculating how many are actually needed
+ * is possible but not worth the complexity.
+ */
+ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
+ need_topup(&kvm->arch.split_page_header_cache, 1) ||
+ need_topup(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static int topup_split_caches(struct kvm *kvm)
+{
+ /*
+ * Allocating rmap list entries when splitting huge pages for nested
+ * MMUs is uncommon as KVM needs to use a list if and only if there is
+ * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
+ * aliased by multiple L2 gfns and/or from multiple nested roots with
+ * different roles. Aliasing gfns when using TDP is atypical for VMMs;
+ * a few gfns are often aliased during boot, e.g. when remapping BIOS,
+ * but aliasing rarely occurs post-boot or for many gfns. If there is
+ * only one rmap entry, rmap->val points directly at that one entry and
+ * doesn't need to allocate a list. Buffer the cache by the default
+ * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
+ * encounters an aliased gfn or two.
+ */
+ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
+ KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
+ int r;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
+ SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
+ if (r)
+ return r;
+
+ r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
+ if (r)
+ return r;
+
+ return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
+}
+
+static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ struct shadow_page_caches caches = {};
+ union kvm_mmu_page_role role;
+ unsigned int access;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
+
+ /*
+ * Note, huge page splitting always uses direct shadow pages, regardless
+ * of whether the huge page itself is mapped by a direct or indirect
+ * shadow page, since the huge page region itself is being directly
+ * mapped with smaller pages.
+ */
+ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
+
+ /* Direct SPs do not require a shadowed_info_cache. */
+ caches.page_header_cache = &kvm->arch.split_page_header_cache;
+ caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
+
+ /* Safe to pass NULL for vCPU since requesting a direct SP. */
+ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
+}
+
+static void shadow_mmu_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
+ u64 huge_spte = READ_ONCE(*huge_sptep);
+ struct kvm_mmu_page *sp;
+ bool flush = false;
+ u64 *sptep, spte;
+ gfn_t gfn;
+ int index;
+
+ sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
+
+ for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
+ sptep = &sp->spt[index];
+ gfn = kvm_mmu_page_get_gfn(sp, index);
+
+ /*
+ * The SP may already have populated SPTEs, e.g. if this huge
+ * page is aliased by multiple sptes with the same access
+ * permissions. These entries are guaranteed to map the same
+ * gfn-to-pfn translation since the SP is direct, so no need to
+ * modify them.
+ *
+ * However, if a given SPTE points to a lower level page table,
+ * that lower level page table may only be partially populated.
+ * Installing such SPTEs would effectively unmap a portion of the
+ * huge page. Unmapping guest memory always requires a TLB flush
+ * since a subsequent operation on the unmapped regions would
+ * fail to detect the need to flush.
+ */
+ if (is_shadow_present_pte(*sptep)) {
+ flush |= !is_last_spte(*sptep, sp->role.level);
+ continue;
+ }
+
+ spte = make_small_spte(kvm, huge_spte, sp->role, index);
+ mmu_spte_set(sptep, spte);
+ __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
+ }
+
+ __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
+}
+
+static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ u64 *huge_sptep)
+{
+ struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+ int level, r = 0;
+ gfn_t gfn;
+ u64 spte;
+
+ /* Grab information for the tracepoint before dropping the MMU lock. */
+ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
+ level = huge_sp->role.level;
+ spte = *huge_sptep;
+
+ if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
+ r = -ENOSPC;
+ goto out;
+ }
+
+ if (need_topup_split_caches_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /*
+ * If the topup succeeds, return -EAGAIN to indicate that the
+ * rmap iterator should be restarted because the MMU lock was
+ * dropped.
+ */
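+ /* "a ?: b" is the GNU ?: extension: evaluates to a if non-zero, else b. */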
+ r = topup_split_caches(kvm) ?: -EAGAIN;
+ write_lock(&kvm->mmu_lock);
+ goto out;
+ }
+
+ shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
+
+out:
+ trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+ return r;
+}
+
+static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ struct rmap_iterator iter;
+ struct kvm_mmu_page *sp;
+ u64 *huge_sptep;
+ int r;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
+ sp = sptep_to_sp(huge_sptep);
+
+ /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
+ if (WARN_ON_ONCE(!sp->role.guest_mode))
+ continue;
+
+ /* The rmaps should never contain non-leaf SPTEs. */
+ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
+ continue;
+
+ /* SPs with level > PG_LEVEL_4K should never be unsync. */
+ if (WARN_ON_ONCE(sp->unsync))
+ continue;
+
+ /* Don't bother splitting huge pages on invalid SPs. */
+ if (sp->role.invalid)
+ continue;
+
+ r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
+
+ /*
+ * The split succeeded or needs to be retried because the MMU
+ * lock was dropped. Either way, restart the iterator to get it
+ * back into a consistent state.
+ */
+ if (!r || r == -EAGAIN)
+ goto restart;
+
+ /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
+ break;
+ }
+
+ return false;
+}
+
+static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level)
+{
+ int level;
+
+ /*
+ * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
+ * down to the target level. This ensures pages are recursively split
+ * all the way to the target level. There's no need to split pages
+ * already at the target level.
+ */
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+ __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, true, false);
+}
+
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ u64 start, u64 end,
+ int target_level)
+{
+ if (!tdp_mmu_enabled)
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm))
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
+
+ /*
+ * A TLB flush is unnecessary at this point for the same reasons as in
+ * kvm_mmu_slot_try_split_huge_pages().
+ */
+}
+
+void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ int target_level)
+{
+ u64 start = memslot->base_gfn;
+ u64 end = start + memslot->npages;
+
+ if (!tdp_mmu_enabled)
+ return;
+
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
+ read_unlock(&kvm->mmu_lock);
+
+ /*
+ * No TLB flush is necessary here. KVM will flush TLBs after
+ * write-protecting and/or clearing dirty on the newly split SPTEs to
+ * ensure that guest writes are reflected in the dirty log before the
+ * ioctl to enable dirty logging on this memslot completes. Since the
+ * split SPTEs retain the write and dirty bits of the huge SPTE, it is
+ * safe for KVM to decide if a TLB flush is necessary based on the split
+ * SPTEs.
+ */
+}
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_tlb_flush = 0;
+ struct kvm_mmu_page *sp;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ sp = sptep_to_sp(sptep);
+
+ /*
+ * Huge page mappings cannot be created for indirect shadow
+ * pages, which are found on the last rmap (level = 1) when not
+ * using TDP; such shadow pages are kept in sync with the guest
+ * page table, and the guest page table uses a 4K page size
+ * mapping if the indirect sp has level = 1.
+ */
+ if (sp->role.direct &&
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
+ kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
+
+ if (kvm_available_flush_remote_tlbs_range())
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
+ else
+ need_tlb_flush = 1;
+
+ goto restart;
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ /*
+ * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
+ * pages that are already mapped at the maximum hugepage level.
+ */
+ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+}
+
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ kvm_rmap_zap_collapsible_sptes(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_recover_huge_pages(kvm, slot);
+ read_unlock(&kvm->mmu_lock);
+ }
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ /*
+ * Clear dirty bits only on 4k SPTEs since the legacy MMU only
+ * supports dirty logging at a 4k granularity.
+ */
+ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
+ write_unlock(&kvm->mmu_lock);
+ }
+
+ if (tdp_mmu_enabled) {
+ read_lock(&kvm->mmu_lock);
+ kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ /*
+ * The caller will flush the TLBs after this function returns.
+ *
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
+}
+
+static void kvm_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
+ int ign;
+
+ write_lock(&kvm->mmu_lock);
+restart:
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+ if (WARN_ON_ONCE(sp->role.invalid))
+ continue;
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+ goto restart;
+ if (cond_resched_rwlock_write(&kvm->mmu_lock))
+ goto restart;
+ }
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_all(kvm);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_zap_all(kvm);
+}
+
+static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ bool flush)
+{
+ LIST_HEAD(invalid_list);
+ unsigned long i;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ goto out_flush;
+
+ /*
+ * Since accounting information is stored in struct kvm_arch_memory_slot,
+ * all MMU pages that are shadowing guest PTEs must be zapped before the
+ * memslot is deleted, as freeing such pages after the memslot is freed
+ * will result in use-after-free, e.g. in unaccount_shadowed().
+ */
+ for (i = 0; i < slot->npages; i++) {
+ struct kvm_mmu_page *sp;
+ gfn_t gfn = slot->base_gfn + i;
+
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn)
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ flush = false;
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+out_flush:
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+}
+
+static void kvm_mmu_zap_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED,
+ };
+ bool flush;
+
+ write_lock(&kvm->mmu_lock);
+ flush = kvm_unmap_gfn_range(kvm, &range);
+ kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush);
+ write_unlock(&kvm->mmu_lock);
+}
+
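+/*
+ * Historically KVM zapped all shadow pages on any memslot deletion. Keep
+ * that behavior only for KVM_X86_DEFAULT_VM VMs with the (default-on)
+ * KVM_X86_QUIRK_SLOT_ZAP_ALL quirk; otherwise zap only the deleted slot.
+ */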
+static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ if (kvm_memslot_flush_zap_all(kvm))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot(kvm, slot);
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
+{
+ WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+
+ if (!enable_mmio_caching)
+ return;
+
+ gen &= MMIO_SPTE_GEN_MASK;
+
+ /*
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
+ * zap all shadow pages.
+ */
+ if (unlikely(gen == 0)) {
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_zap_all_fast(kvm);
+ }
+}
+
+static void mmu_destroy_caches(void)
+{
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+static void kvm_wake_nx_recovery_thread(struct kvm *kvm)
+{
+ /*
+ * The NX recovery thread is spawned on-demand at the first KVM_RUN and
+ * may not be valid even though the VM is globally visible. Do nothing,
+ * as such a VM can't have any possible NX huge pages.
+ */
+ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
+
+ if (nx_thread)
+ vhost_task_wake(nx_thread);
+}
+
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return sysfs_emit(buffer, "never\n");
+
+ return param_get_bool(buffer, kp);
+}
+
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off")) {
+ new_val = 0;
+ } else if (sysfs_streq(val, "force")) {
+ new_val = 1;
+ } else if (sysfs_streq(val, "auto")) {
+ new_val = get_nx_auto_mode();
+ } else if (sysfs_streq(val, "never")) {
+ new_val = 0;
+
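+ /*
+ * Hard-disabling is only allowed while no VMs exist; the recovery
+ * task is conditionally created per-VM, so flipping to "never" with
+ * live VMs would be racy.
+ */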
+ mutex_lock(&kvm_lock);
+ if (!list_empty(&vm_list)) {
+ mutex_unlock(&kvm_lock);
+ return -EBUSY;
+ }
+ nx_hugepage_mitigation_hard_disabled = true;
+ mutex_unlock(&kvm_lock);
+ } else if (kstrtobool(val, &new_val) < 0) {
+ return -EINVAL;
+ }
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_mmu_zap_all_fast(kvm);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_wake_nx_recovery_thread(kvm);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ * Forward the module init call to SPTE code so that it too can handle module
+ * params that need to be resolved/snapshot.
+ */
+void __init kvm_mmu_x86_module_init(void)
+{
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
+ /*
+ * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
+ * TDP MMU is actually enabled is determined in kvm_configure_mmu()
+ * when the vendor module is loaded.
+ */
+ tdp_mmu_allowed = tdp_mmu_enabled;
+
+ kvm_mmu_spte_module_init();
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
+
+ /*
+ * MMU roles use union aliasing, which is, generally speaking,
+ * undefined behavior. However, we supposedly know how compilers behave
+ * and the current status quo is unlikely to change. The build-time
+ * assertions below are meant to let us know if that assumption ever
+ * becomes false.
+ */
+ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
+ BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
+
+ kvm_mmu_reset_all_pte_masks();
+
+ pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
+ if (!pte_list_desc_cache)
+ goto out;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!mmu_page_header_cache)
+ goto out;
+
+ return 0;
+
+out:
+ mmu_destroy_caches();
+ return ret;
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ if (tdp_mmu_enabled) {
+ read_lock(&vcpu->kvm->mmu_lock);
+ mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa,
+ NULL);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ }
+ free_mmu_pages(&vcpu->arch.root_mmu);
+ free_mmu_pages(&vcpu->arch.guest_mmu);
+ mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_vendor_module_exit(void)
+{
+ mmu_destroy_caches();
+}
+
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /*
+ * Cap the ratio at 3600 so the computed period is never less
+ * than one second (60 * 60 * 1000 / 3600 == 1000ms).
+ */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
+{
+ bool was_recovery_enabled, is_recovery_enabled;
+ uint old_period, new_period;
+ int err;
+
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
+
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
+
+ if (is_recovery_enabled &&
+ (!was_recovery_enabled || old_period > new_period)) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_wake_nx_recovery_thread(kvm);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
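+/*
+ * Zap 1/nx_huge_pages_recovery_ratio of the possible NX huge pages per
+ * recovery pass; a ratio of 0 disables recovery entirely.
+ */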
+static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
+ enum kvm_mmu_type mmu_type)
+{
+ unsigned long pages = READ_ONCE(kvm->arch.possible_nx_huge_pages[mmu_type].nr_pages);
+ unsigned int ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
+}
+
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ struct kvm_memory_slot *slot;
+
+ /*
+ * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+ * as memslot lookups are relatively expensive.
+ *
+ * If a memslot update is in progress, reading an incorrect value of
+ * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+ * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+ * nonzero, the page will be zapped unnecessarily. Either way, this
+ * only affects efficiency in racy situations, and not correctness.
+ */
+ if (!atomic_read(&kvm->nr_memslots_dirty_logging))
+ return false;
+
+ slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
+ if (WARN_ON_ONCE(!slot))
+ return false;
+
+ return kvm_slot_dirty_track_enabled(slot);
+}
+
+static void kvm_recover_nx_huge_pages(struct kvm *kvm,
+ const enum kvm_mmu_type mmu_type)
+{
+#ifdef CONFIG_X86_64
+ const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
+ spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
+#else
+ const bool is_tdp_mmu = false;
+ spinlock_t *tdp_mmu_pages_lock = NULL;
+#endif
+ unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
+ struct list_head *nx_huge_pages;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+ int rcu_idx;
+
+ nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ /*
+ * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+ * be done under RCU protection, because the pages are freed via RCU
+ * callback.
+ */
+ rcu_read_lock();
+
+ for ( ; to_zap; --to_zap) {
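+ /*
+ * The TDP MMU updates the possible_nx_huge_pages list while
+ * holding mmu_lock only for read, so the list must additionally
+ * be protected by tdp_mmu_pages_lock.
+ */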
+ if (is_tdp_mmu)
+ spin_lock(tdp_mmu_pages_lock);
+
+ if (list_empty(nx_huge_pages)) {
+ if (is_tdp_mmu)
+ spin_unlock(tdp_mmu_pages_lock);
+ break;
+ }
+
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of shadow pages that can be replaced with an
+ * NX huge page is expected to be relatively small compared to
+ * the total number of shadow pages, and because the TDP MMU
+ * doesn't use active_mmu_pages.
+ */
+ sp = list_first_entry(nx_huge_pages,
+ struct kvm_mmu_page,
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ unaccount_nx_huge_page(kvm, sp);
+
+ if (is_tdp_mmu)
+ spin_unlock(tdp_mmu_pages_lock);
+
+ /*
+ * Do not attempt to recover any NX Huge Pages that are being
+ * dirty tracked, as they would just be faulted back in as 4KiB
+ * pages. The NX Huge Pages in this slot will be recovered,
+ * along with all the other huge pages in the slot, when dirty
+ * logging is disabled.
+ */
+ if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+ if (is_tdp_mmu)
+ flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+ else
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ rcu_read_unlock();
+
+ if (is_tdp_mmu)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
+ flush = false;
+ rcu_read_lock();
+ }
+ }
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+
+ rcu_read_unlock();
+
+ if (is_tdp_mmu)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
+{
+}
+
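+/*
+ * Body of the NX recovery vhost task: returning true asks the vhost_task
+ * core to invoke the worker again; returning false lets the task sleep
+ * until it is explicitly woken, e.g. via kvm_wake_nx_recovery_thread().
+ */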
+static bool kvm_nx_huge_page_recovery_worker(void *data)
+{
+ struct kvm *kvm = data;
+ long remaining_time;
+ bool enabled;
+ uint period;
+ int i;
+
+ enabled = calc_nx_huge_pages_recovery_period(&period);
+ if (!enabled)
+ return false;
+
+ remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+ - get_jiffies_64();
+ if (remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ /* check for signals and come back */
+ return true;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ for (i = 0; i < KVM_NR_MMU_TYPES; ++i)
+ kvm_recover_nx_huge_pages(kvm, i);
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ return true;
+}
+
+static int kvm_mmu_start_lpage_recovery(struct once *once)
+{
+ struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct vhost_task *nx_thread;
+
+ kvm->arch.nx_huge_page_last = get_jiffies_64();
+ nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker,
+ kvm_nx_huge_page_recovery_worker_kill,
+ kvm, "kvm-nx-lpage-recovery");
+
+ if (IS_ERR(nx_thread))
+ return PTR_ERR(nx_thread);
+
+ vhost_task_start(nx_thread);
+
+ /* Make the task visible only once it is fully started. */
+ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+ return 0;
+}
+
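+/* Invoked at the first KVM_RUN; call_once() starts recovery once per VM. */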
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
+}
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
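+/*
+ * KVM_LPAGE_MIXED_FLAG is set in disallow_lpage when a hugepage-sized range
+ * covers a mix of private and shared attributes and thus can't be mapped by
+ * a single hugepage.
+ */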
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ /*
+ * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
+ * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+ * can simply ignore such slots. But if userspace is making memory
+ * PRIVATE, then KVM must prevent the guest from accessing the memory
+ * as shared. And if userspace is making memory SHARED and this point
+ * is reached, then at least one page within the range was previously
+ * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+ * Zapping SPTEs in this case ensures KVM will reassess whether or not
+ * a hugepage can be used for affected ranges.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ if (WARN_ON_ONCE(range->end <= range->start))
+ return false;
+
+ /*
+ * If the head and tail pages of the range currently allow a hugepage,
+ * i.e. reside fully in the slot and don't have mixed attributes, then
+ * add each corresponding hugepage range to the ongoing invalidation,
+ * e.g. to prevent KVM from creating a hugepage in response to a fault
+ * for a gfn whose attributes aren't changing. Note, only the range
+ * of gfns whose attributes are being modified needs to be explicitly
+ * unmapped, as that will unmap any existing hugepages.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t start = gfn_round_for_level(range->start, level);
+ gfn_t end = gfn_round_for_level(range->end - 1, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+
+ if ((start != range->start || start + nr_pages > range->end) &&
+ start >= slot->base_gfn &&
+ start + nr_pages <= slot->base_gfn + slot->npages &&
+ !hugepage_test_mixed(slot, start, level))
+ kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages);
+
+ if (end == start)
+ continue;
+
+ if ((end + nr_pages) > range->end &&
+ (end + nr_pages) <= (slot->base_gfn + slot->npages) &&
+ !hugepage_test_mixed(slot, end, level))
+ kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages);
+ }
+
+ /* Unmap the old attribute page. */
+ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)
+ range->attr_filter = KVM_FILTER_SHARED;
+ else
+ range->attr_filter = KVM_FILTER_PRIVATE;
+
+ return kvm_unmap_gfn_range(kvm, range);
+}
+
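+/*
+ * Check whether every gfn in the hugepage at @gfn/@level has exactly @attrs.
+ * Levels above 2M reuse the mixed tracking already computed for the next
+ * lower level rather than rescanning individual 4KiB pages.
+ */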
+static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, int level, unsigned long attrs)
+{
+ const unsigned long start = gfn;
+ const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
+
+ if (level == PG_LEVEL_2M)
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
+
+ for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
+ if (hugepage_test_mixed(slot, gfn, level - 1) ||
+ attrs != kvm_get_memory_attributes(kvm, gfn))
+ return false;
+ }
+ return true;
+}
+
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ unsigned long attrs = range->arg.attributes;
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Calculate which ranges can be mapped with hugepages even if the slot
+ * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over
+ * a range that has PRIVATE GFNs, and conversely converting a range to
+ * SHARED may now allow hugepages.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ /*
+ * The sequence matters here: upper levels consume the result of lower
+ * level's scanning.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn = gfn_round_for_level(range->start, level);
+
+ /* Process the head page if it straddles the range. */
+ if (gfn != range->start || gfn + nr_pages > range->end) {
+ /*
+ * Skip mixed tracking if the aligned gfn isn't covered
+ * by the memslot, KVM can't use a hugepage due to the
+ * misaligned address regardless of memory attributes.
+ */
+ if (gfn >= slot->base_gfn &&
+ gfn + nr_pages <= slot->base_gfn + slot->npages) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ gfn += nr_pages;
+ }
+
+ /*
+ * Pages entirely covered by the range are guaranteed to have
+ * only the attributes which were just set.
+ */
+ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
+ hugepage_clear_mixed(slot, gfn, level);
+
+ /*
+ * Process the last tail page if it straddles the range and is
+ * contained by the memslot. Like the head page, KVM can't
+ * create a hugepage if the slot size is misaligned.
+ */
+ if (gfn < range->end &&
+ (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+ return false;
+}
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ int level;
+
+ if (!kvm_arch_has_private_mem(kvm))
+ return;
+
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ /*
+ * Don't bother tracking mixed attributes for pages that can't
+ * be huge due to alignment, i.e. process only pages that are
+ * entirely contained by the memslot.
+ */
+ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
+ gfn_t start = gfn_round_for_level(slot->base_gfn, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn;
+
+ if (start < slot->base_gfn)
+ start += nr_pages;
+
+ /*
+ * Unlike setting attributes, every potential hugepage needs to
+ * be manually checked as the attributes may already be mixed.
+ */
+ for (gfn = start; gfn < end; gfn += nr_pages) {
+ unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
new file mode 100644
index 000000000000..73cdcbccc89e
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -0,0 +1,414 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_INTERNAL_H
+#define __KVM_X86_MMU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+
+#include "mmu.h"
+
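+/*
+ * KVM_MMU_WARN_ON() is a full WARN_ON_ONCE() only with CONFIG_KVM_PROVE_MMU;
+ * otherwise BUILD_BUG_ON_INVALID() type-checks the expression without
+ * generating any code.
+ */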
+#ifdef CONFIG_KVM_PROVE_MMU
+#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
+#else
+#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
+#endif
+
+/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
+#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12)
+#define __PT_LEVEL_SHIFT(level, bits_per_level) \
+ (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
+#define __PT_INDEX(address, level, bits_per_level) \
+ (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))
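+/* e.g. with 9 bits per level, __PT_INDEX(addr, 1, 9) extracts addr bits 20:12. */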
+
+#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
+ ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1))
+
+#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level))
+
+/*
+ * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
+ * bit, and thus are guaranteed to be non-zero when valid. And, when a guest
+ * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
+ * as the CPU would treat that as PRESENT PDPTR with reserved bits set. Use
+ * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
+ */
+#define INVALID_PAE_ROOT 0
+#define IS_VALID_PAE_ROOT(x) (!!(x))
+
+typedef u64 __rcu *tdp_ptep_t;
+
+struct kvm_mmu_page {
+ /*
+ * Note, "link" through "spt" fit in a single 64 byte cache line on
+ * 64-bit kernels, keep it that way unless there's a reason not to.
+ */
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ bool tdp_mmu_page;
+ bool unsync;
+ union {
+ u8 mmu_valid_gen;
+
+ /* Only accessed under slots_lock. */
+ bool tdp_mmu_scheduled_root_to_zap;
+ };
+
+ /*
+ * The shadow page can't be replaced by an equivalent huge page
+ * because it is being used to map an executable page in the guest
+ * and the NX huge page mitigation is enabled.
+ */
+ bool nx_huge_page_disallowed;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ union kvm_mmu_page_role role;
+ gfn_t gfn;
+
+ u64 *spt;
+
+ /*
+ * Stores the result of the guest translation being shadowed by each
+ * SPTE. KVM shadows two types of guest translations: nGPA -> GPA
+ * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
+ * cases the result of the translation is a GPA and a set of access
+ * constraints.
+ *
+ * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
+ * access permissions are stored in the lower bits. Note, for
+ * convenience and uniformity across guests, the access permissions are
+ * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
+ */
+ u64 *shadowed_translation;
+
+ /* Currently serving as active root */
+ union {
+ int root_count;
+ refcount_t tdp_mmu_root_count;
+ };
+
+ bool has_mapped_host_mmio;
+
+ union {
+ /* These two members aren't used for TDP MMU */
+ struct {
+ unsigned int unsync_children;
+ /*
+ * Number of writes since the last time traversal
+ * visited this page.
+ */
+ atomic_t write_flooding_count;
+ };
+ /*
+ * Page table page of external PT.
+ * Passed to TDX module, not accessed by KVM.
+ */
+ void *external_spt;
+ };
+ union {
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+ tdp_ptep_t ptep;
+ };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+ /*
+ * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+ * huge page. A shadow page will have nx_huge_page_disallowed set but
+ * not be on the list if a huge page is disallowed for other reasons,
+ * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
+ * isn't properly aligned, etc...
+ */
+ struct list_head possible_nx_huge_page_link;
+#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
+ int clear_spte_count;
+#endif
+
+#ifdef CONFIG_X86_64
+ /* Used for freeing the page asynchronously if it is a TDP MMU page. */
+ struct rcu_head rcu_head;
+#endif
+};
+
+extern struct kmem_cache *mmu_page_header_cache;
+
+static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
+{
+ return role.smm ? 1 : 0;
+}
+
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+ return kvm_mmu_role_as_id(sp->role);
+}
+
+static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
+{
+ return sp->role.is_mirror;
+}
+
+static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ /*
+ * external_spt is allocated for the TDX module to hold private EPT
+ * mappings; the TDX module will initialize the page by itself.
+ * Therefore, KVM does not need to initialize or access external_spt.
+ * KVM only interacts with sp->spt for private EPT operations.
+ */
+ sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
+}
+
+static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root)
+{
+ /*
+ * Since mirror SPs are used only for TDX, which maps private memory
+ * at its "natural" GFN, no mask needs to be applied to them - and, dually,
+ * we expect that the bits are only used for the shared PT.
+ */
+ if (is_mirror_sp(root))
+ return 0;
+ return kvm_gfn_direct_bits(kvm);
+}
+
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ /*
+ * When using the EPT page-modification log, the GPAs in the CPU dirty
+ * log would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages, which bypasses PML, since
+ * writes now result in a vmexit. Note, the check on CPU dirty logging
+ * being enabled is mandatory as the bits used to denote WP-only SPTEs
+ * are reserved for PAE paging (32-bit KVM).
+ */
+ return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode;
+}
+
+static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
+{
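+ /*
+ * KVM_PAGES_PER_HPAGE() is a power of two; its negation is a mask that
+ * aligns the gfn down to the start of the containing hugepage.
+ */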
+ return gfn & -KVM_PAGES_PER_HPAGE(level);
+}
+
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
+ gfn_t gfn, bool synchronizing, bool prefetch);
+
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level);
+
+/* Flush the given page (huge or not) of guest memory. */
+static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
+{
+ kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
+}
+
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
+
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
+{
+ return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
+}
+
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u64 error_code;
+ const bool prefetch;
+
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool is_private;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /*
+ * Shifted addr, or result of guest page table walk if addr is a gva. In
+ * the case of a VM where memslots can be mapped at multiple GPA aliases
+ * (i.e. TDX), the gfn field does not contain the bit that selects between
+ * the aliases (i.e. the shared bit for TDX).
+ */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_mmu_faultin_pfn(). */
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ struct page *refcounted_page;
+ bool map_writable;
+
+ /*
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables.
+ */
+ bool write_fault_to_shadow_pgtable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+/*
+ * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
+ * and of course kvm_mmu_do_page_fault().
+ *
+ * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
+ * gfn and retry, or emulate the instruction directly.
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
+ *
+ * Note, all values must be greater than or equal to zero so as not to encroach
+ * on -errno return values.
+ */
+enum {
+ RET_PF_CONTINUE = 0,
+ RET_PF_RETRY,
+ RET_PF_EMULATE,
+ RET_PF_WRITE_PROTECTED,
+ RET_PF_INVALID,
+ RET_PF_FIXED,
+ RET_PF_SPURIOUS,
+};
+
+/*
+ * Define RET_PF_CONTINUE as 0 to allow for
+ * - efficient machine code when checking for CONTINUE, e.g.
+ * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
+ * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
+ */
+static_assert(RET_PF_CONTINUE == 0);
+
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 err, bool prefetch,
+ int *emulation_type, u8 *level)
+{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled =
+ is_nx_huge_page_enabled(vcpu->kvm),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
+ };
+ int r;
+
+ if (vcpu->arch.mmu->root_role.direct) {
+ /*
+ * Things like memslots don't understand the concept of a shared
+ * bit. Strip it so that the GFN can be used like normal, and the
+ * fault.addr can be used when the shared bit is needed.
+ */
+ fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
+ /*
+ * With retpoline being active an indirect call is rather expensive,
+ * so do a direct call in the most common case.
+ */
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
+ r = kvm_tdp_page_fault(vcpu, &fault);
+ else
+ r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
+ if (fault.write_fault_to_shadow_pgtable && emulation_type)
+ *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
+
+ return r;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
+
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ enum kvm_mmu_type mmu_type);
+
+#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
new file mode 100644
index 000000000000..764e3015d021
--- /dev/null
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u8, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(bool, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *saved_ptr = trace_seq_buffer_ptr(p); \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
+ __entry->gfn, role.level, \
+ role.has_4_byte_gpte ? 4 : 8, \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.efer_nx ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ saved_ptr; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_PK_MASK, "PK" }, \
+ { PFERR_SS_MASK, "SS" }, \
+ { PFERR_SGX_MASK, "SGX" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte),
+ TP_ARGS(sptep, gfn, spte),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ __field(unsigned int, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = spte & ACC_ALL;
+ __entry->gen = get_mmio_spte_generation(spte);
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, int ret),
+ TP_ARGS(vcpu, fault, sptep, old_spte, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gpa_t, cr2_or_gpa)
+ __field(u64, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->cr2_or_gpa = fault->addr;
+ __entry->error_code = fault->error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_zap_all_fast,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(__u8, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %u used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(signed char, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(struct kvm_page_fault *fault),
+ TP_ARGS(fault),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = fault->gfn;
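+		/* fault->pfn is aligned to goal_level; fold the gfn offset back in to log the exact 4KiB pfn. */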
+ __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
+ __entry->level = fault->goal_level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
+TRACE_EVENT(
+ kvm_tdp_mmu_spte_changed,
+ TP_PROTO(int as_id, gfn_t gfn, int level, u64 old_spte, u64 new_spte),
+ TP_ARGS(as_id, gfn, level, old_spte, new_spte),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ /* Level cannot be larger than 5 on x86, so it fits in a u8. */
+ __field(u8, level)
+	/* as_id can only be 0 or 1 on x86, so it fits in a u8. */
+ __field(u8, as_id)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = new_spte;
+ __entry->level = level;
+ __entry->as_id = as_id;
+ ),
+
+ TP_printk("as id %d gfn %llx level %d old_spte %llx new_spte %llx",
+ __entry->as_id, __entry->gfn, __entry->level,
+ __entry->old_spte, __entry->new_spte
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_split_huge_page,
+ TP_PROTO(u64 gfn, u64 spte, int level, int errno),
+ TP_ARGS(gfn, spte, level, errno),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(int, level)
+ __field(int, errno)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = spte;
+ __entry->level = level;
+ __entry->errno = errno;
+ ),
+
+ TP_printk("gfn %llx spte %llx level %d errno %d",
+ __entry->gfn, __entry->spte, __entry->level, __entry->errno)
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH mmu
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
new file mode 100644
index 000000000000..1b17b12393a8
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support KVM guest page tracking
+ *
+ * This feature allows us to track page accesses in the guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/lockdep.h>
+#include <linux/kvm_host.h>
+#include <linux/rculist.h>
+
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "page_track.h"
+
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * Read external_write_tracking_enabled before related pointers. Pairs
+	 * with the smp_store_release in kvm_enable_external_write_tracking().
+ */
+ return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+ return false;
+#endif
+}
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
+{
+ return kvm_external_write_tracking_enabled(kvm) ||
+ kvm_shadow_root_allocated(kvm) || !tdp_enabled;
+}
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
+{
+ vfree(slot->arch.gfn_write_track);
+ slot->arch.gfn_write_track = NULL;
+}
+
+static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ const size_t size = sizeof(*slot->arch.gfn_write_track);
+
+ if (!slot->arch.gfn_write_track)
+ slot->arch.gfn_write_track = __vcalloc(npages, size,
+ GFP_KERNEL_ACCOUNT);
+
+ return slot->arch.gfn_write_track ? 0 : -ENOMEM;
+}
+
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return 0;
+
+ return __kvm_page_track_write_tracking_alloc(slot, npages);
+}
+
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
+{
+ return __kvm_page_track_write_tracking_alloc(slot, slot->npages);
+}
+
+static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ short count)
+{
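+	/*
+	 * Each gfn has a write-track count: incremented for every tracker
+	 * added, decremented for every tracker removed.  The bounds check
+	 * below catches underflow/overflow, which would indicate a KVM bug.
+	 */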
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+
+ val = slot->arch.gfn_write_track[index];
+
+ if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_write_track[index] += count;
+}
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
+ return;
+
+ update_gfn_write_track(slot, gfn, 1);
+
+ /*
+	 * A new tracker disallows large page mappings for the
+	 * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
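+	/* Write-protect existing 4KiB SPTEs for the gfn; flush TLBs if any were changed. */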
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
+ return;
+
+ update_gfn_write_track(slot, gfn, -1);
+
+ /*
+	 * Allow large page mappings for the tracked page
+	 * again now that the tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * Check whether write accesses to the specified guest page are tracked.
+ */
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ int index;
+
+ if (!slot)
+ return false;
+
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+ return !!READ_ONCE(slot->arch.gfn_write_track[index]);
+}
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
+int kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+ return init_srcu_struct(&head->track_srcu);
+}
+
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ if (kvm->arch.vm_type == KVM_X86_TDX_VM)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Check for *any* write tracking user (not just external users) under
+ * lock. This avoids unnecessary work, e.g. if KVM itself is using
+ * write tracking, or if two external users raced when registering.
+ */
+ if (kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Intentionally do NOT free allocations on failure to
+ * avoid having to track which allocations were made
+ * now versus when the memslot was created. The
+ * metadata is guaranteed to be freed when the slot is
+ * freed, and will be kept/used if userspace retries
+ * the failed ioctl() instead of killing the VM.
+ */
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Ensure that external_write_tracking_enabled becomes true strictly
+ * after all the related pointers are set.
+ */
+ smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
+/*
+ * Register the notifier so that events for write-tracked guest
+ * pages are delivered to it.
+ */
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+ int r;
+
+ if (!kvm || kvm->mm != current->mm)
+ return -ESRCH;
+
+ if (!kvm_external_write_tracking_enabled(kvm)) {
+ r = kvm_enable_external_write_tracking(kvm);
+ if (r)
+ return r;
+ }
+
+ kvm_get_kvm(kvm);
+
+ head = &kvm->arch.track_notifier_head;
+
+ write_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ write_unlock(&kvm->mmu_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
+
+/*
+ * Stop receiving tracking events. This is the inverse operation of
+ * kvm_page_track_register_notifier().
+ */
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ write_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ write_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+
+ kvm_put_kvm(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
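+
+/*
+ * Hypothetical external-user flow (e.g. a device-emulation driver shadowing
+ * a guest-managed table):
+ *
+ *	kvm_page_track_register_notifier(kvm, &node);
+ *	kvm_write_track_add_gfn(kvm, gfn);
+ *	... node->track_write() is invoked on emulated writes to gfn ...
+ *	kvm_write_track_remove_gfn(kvm, gfn);
+ *	kvm_page_track_unregister_notifier(kvm, &node);
+ */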
+
+/*
+ * Notify registered nodes that write access was intercepted and that write
+ * emulation has finished.
+ *
+ * Each node must determine for itself whether the written page is one it is
+ * interested in.
+ */
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_write)
+ n->track_write(gpa, new, bytes, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Notify external page track nodes that a memory region is being removed from
+ * the VM, e.g. so that users can free any associated metadata.
+ */
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_remove_region)
+ n->track_remove_region(slot->base_gfn, slot->npages, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Add a guest page to the write-tracking pool so that writes to that
+ * page are intercepted.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_add_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn);
+
+/*
+ * Remove a guest page from the write-tracking pool, which stops the
+ * interception of writes to that page.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_remove_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn);
+#endif
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
new file mode 100644
index 000000000000..d4d72ed999b1
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PAGE_TRACK_H
+#define __KVM_X86_PAGE_TRACK_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_page_track.h>
+
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn);
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+int kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
+{
+ return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
+}
+#else
+static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
+static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
+
+static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa,
+ const u8 *new, int bytes) { }
+static inline void kvm_page_track_delete_slot(struct kvm *kvm,
+ struct kvm_memory_slot *slot) { }
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
+
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
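+	/* Notify external write-tracking users, then KVM's own shadow MMU. */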
+ __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);
+
+ kvm_mmu_track_write(vcpu, gpa, new, bytes);
+}
+
+#endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
new file mode 100644
index 000000000000..901cd2bd40b8
--- /dev/null
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -0,0 +1,983 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+/*
+ * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables,
+ * as well as guest EPT tables, so the code in this file is compiled thrice,
+ * once per guest PTE type. The per-type defines are #undef'd at the end.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_LEVEL_BITS 9
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+ #else
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_LEVEL_BITS 10
+ #define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+
+ #define PT32_DIR_PSE36_SIZE 4
+ #define PT32_DIR_PSE36_SHIFT 13
+ #define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_LEVEL_BITS 9
+ #define PT_GUEST_DIRTY_SHIFT 9
+ #define PT_GUEST_ACCESSED_SHIFT 8
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+#else
+ #error Invalid PTTYPE value
+#endif
+
+/* Common logic, but per-type values. These also need to be undefined. */
+#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK)
+#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS)
+#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS)
+
+#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ unsigned max_level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
+ gfn_t gfn;
+ struct x86_exception fault;
+};
+
+#if PTTYPE == 32
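+/*
+ * PSE-36: bits 16:13 of a 4MiB PDE hold bits 35:32 of the physical address;
+ * recover that part of the gfn here.
+ */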
+static inline gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+#endif
+
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
+{
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
+}
+
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+ unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
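+	/* i.e. copy the gpte's dirty bit down into the writable bit position. */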
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return pte & PT_PRESENT_MASK;
+#else
+ return pte & 7;
+#endif
+}
+
+static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return false;
+#else
+ return __is_bad_mt_xwr(rsvd_check, gpte);
+#endif
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
+ FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+}
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* Prefetch only accessed entries (unless A/D bits are disabled). */
+ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
+ !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
+static inline unsigned FNAME(gpte_access)(u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+#else
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
+ access ^= (gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ struct guest_walker *walker,
+ gpa_t addr, int write_fault)
+{
+ unsigned level, index;
+ pt_element_t pte, orig_pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ int ret;
+
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return 0;
+
+ for (level = walker->max_level; level >= walker->level; --level) {
+ pte = orig_pte = walker->ptes[level - 1];
+ table_gfn = walker->table_gfn[level - 1];
+ ptep_user = walker->ptep_user[level - 1];
+ index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_GUEST_ACCESSED_MASK;
+ }
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr))
+ return -EINVAL;
+#endif
+ pte |= PT_GUEST_DIRTY_MASK;
+ }
+ if (pte == orig_pte)
+ continue;
+
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
+ ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
+ if (ret)
+ return ret;
+
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
+ walker->ptes[level - 1] = pte;
+ }
+ return 0;
+}
+
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+	 * all levels or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
+/*
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
+ */
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access)
+{
+ int ret;
+ pt_element_t pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
+ u64 nested_access;
+ gpa_t pte_gpa;
+ bool have_ad;
+ int offset;
+ u64 walk_nx_mask = 0;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
+ gpa_t real_gpa;
+ gfn_t gfn;
+
+ trace_kvm_mmu_pagetable_walk(addr, access);
+retry_walk:
+ walker->level = mmu->cpu_role.base.level;
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
+
+#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!FNAME(is_present_gpte)(pte))
+ goto error;
+ --walker->level;
+ }
+#endif
+ walker->max_level = walker->level;
+
+ /*
+ * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+ * by the MOV to CR instruction are treated as reads and do not cause the
+ * processor to set the dirty flag in any EPT paging-structure entry.
+ */
+ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
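+	/*
+	 * The effective permissions are the AND of each level's permission
+	 * bits, so start with everything allowed and narrow during the walk.
+	 */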
+ pte_access = ~0;
+
+ /*
+ * Queue a page fault for injection if this assertion fails, as callers
+ * assume that walker.fault contains sane info on a walk failure. I.e.
+ * avoid making the situation worse by inducing even worse badness
+ * between when the assertion fails and when KVM kicks the vCPU out to
+ * userspace (because the VM is bugged).
+ */
+ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
+ goto error;
+
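+	/* Pre-increment to counter the --walker->level at the top of the loop. */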
+ ++walker->level;
+
+ do {
+ struct kvm_memory_slot *slot;
+ unsigned long host_addr;
+
+ pt_access = pte_access;
+ --walker->level;
+
+ index = PT_INDEX(addr, walker->level);
+ table_gfn = gpte_to_gfn(pte);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ nested_access, &walker->fault);
+
+ /*
+		 * FIXME: This can happen if emulation (e.g. of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
+ if (unlikely(real_gpa == INVALID_GPA))
+ return 0;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
+ if (!kvm_is_visible_memslot(slot))
+ goto error;
+
+ host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
+ &walker->pte_writable[walker->level - 1]);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(get_user(pte, ptep_user)))
+ goto error;
+ walker->ptep_user[walker->level - 1] = ptep_user;
+
+ trace_kvm_mmu_paging_element(pte, walker->level);
+
+ /*
+		 * Inverting the NX bit lets us AND it like the other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
+ goto error;
+
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
+ errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
+ }
+
+ walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
+
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
+ if (unlikely(errcode))
+ goto error;
+
+ gfn = gpte_to_gfn_lvl(pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+#if PTTYPE == 32
+ if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
+#endif
+
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
+ if (real_gpa == INVALID_GPA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ if (!write_fault)
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty.
+		 * For modes without A/D bit support, accessed_dirty will
+		 * always be clear.
+ */
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
+
+ if (unlikely(!accessed_dirty)) {
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+ }
+
+ return 1;
+
+error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
+ errcode |= PFERR_FETCH_MASK;
+
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+	 * Use PFERR_RSVD_MASK in error_code to tell if an EPT
+	 * misconfiguration needs to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+	 * [2:0] - Derived from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+	 * [8:7] - Derived from bits [8:7] of the real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ walker->fault.exit_qualification = 0;
+
+ if (write_fault)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+
+ /*
+ * Note, pte_access holds the raw RWX bits from the EPTE, not
+ * ACC_*_MASK flags!
+ */
+ walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+ }
+#endif
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+ walker->fault.async_page_fault = false;
+
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
+ return 0;
+}
+
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
+ access);
+}
+
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte)
+{
+ unsigned pte_access;
+ gfn_t gfn;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ return false;
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+
+ return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access);
+}
+
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PG_LEVEL_4K) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = sptep_to_sp(sptep);
+
+ if (sp->role.level > PG_LEVEL_4K)
+ return;
+
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (spte == sptep)
+ continue;
+
+ if (is_shadow_present_pte(*spte))
+ continue;
+
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
+ break;
+ }
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation and return 1 to indicate this case.
+ */
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ struct guest_walker *gw)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct kvm_shadow_walk_iterator it;
+ unsigned int direct_access, access;
+ int top_level, ret;
+ gfn_t base_gfn = fault->gfn;
+
+ WARN_ON_ONCE(gw->gfn != base_gfn);
+ direct_access = gw->pte_access;
+
+ top_level = vcpu->arch.mmu->cpu_role.base.level;
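+	/*
+	 * For PAE paging the PDPTEs are loaded from the register cache rather
+	 * than walked in memory, so the topmost gpte that can change is at
+	 * the PT32 root level.
+	 */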
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ return RET_PF_RETRY;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
+ /*
+ * Load a new root and retry the faulting instruction in the extremely
+ * unlikely scenario that the guest root gfn became visible between
+ * loading a dummy root and handling the resulting page fault, e.g. if
+	 * userspace creates a memslot in the interim.
+ */
+ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
+ kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
+ return RET_PF_RETRY;
+ }
+
+ for_each_shadow_entry(vcpu, fault->addr, it) {
+ gfn_t table_gfn;
+
+ clear_sp_write_flooding_count(it.sptep);
+ if (it.level == gw->level)
+ break;
+
+ table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
+ false, access);
+
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+		 * For higher level pages, which cannot themselves be unsync
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
+
+ /*
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ return RET_PF_RETRY;
+
+ if (sp != ERR_PTR(-EEXIST))
+ link_shadow_page(vcpu, it.sptep, sp);
+
+ if (fault->write && table_gfn == fault->gfn)
+ fault->write_fault_to_shadow_pgtable = true;
+ }
+
+ /*
+ * Adjust the hugepage size _after_ resolving indirect shadow pages.
+ * KVM doesn't support mapping hugepages into the guest for gfns that
+ * are being shadowed by KVM, i.e. allocating a new shadow page may
+ * affect the allowed hugepage size.
+ */
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
+
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
+ if (it.level == fault->goal_level)
+ break;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
+ true, direct_access);
+ if (sp == ERR_PTR(-EEXIST))
+ continue;
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
+ }
+
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
+ base_gfn, fault->pfn, fault);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+ return ret;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct guest_walker walker;
+ int r;
+
+ WARN_ON_ONCE(fault->is_tdp);
+
+ /*
+ * Look up the guest pte for the faulting address.
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
+ fault->error_code & ~PFERR_RSVD_MASK);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ if (!fault->prefetch)
+ kvm_inject_emulated_page_fault(vcpu, &walker.fault);
+
+ return RET_PF_RETRY;
+ }
+
+ fault->gfn = walker.gfn;
+ fault->max_level = walker.level;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault)) {
+ shadow_page_table_clear_flood(vcpu, fault->addr);
+ return RET_PF_WRITE_PROTECTED;
+ }
+
+ r = mmu_topup_memory_caches(vcpu, true);
+ if (r)
+ return r;
+
+ r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+#if PTTYPE != PTTYPE_EPT
+ /*
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
+ */
+ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (is_cr4_smep(vcpu->arch.mmu))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+#endif
+
+ r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = make_mmu_pages_available(vcpu);
+ if (r)
+ goto out_unlock;
+ r = FNAME(fetch)(vcpu, fault, &walker);
+
+out_unlock:
+ kvm_mmu_finish_page_fault(vcpu, fault, r);
+ write_unlock(&vcpu->kvm->mmu_lock);
+ return r;
+}
+
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+ int offset = 0;
+
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << SPTE_LEVEL_BITS;
+
+ return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u64 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = INVALID_GPA;
+ int r;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
+#endif
+
+ r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= addr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+/*
+ * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is
+ * safe because SPTEs are protected by mmu_notifiers and memslot generations, so
+ * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are
+ * nuked first.
+ *
+ * Returns
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
+ */
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ bool host_writable;
+ gpa_t first_pte_gpa;
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
+ return 0;
+
+ first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
+
+ /*
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
+ */
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
+ }
+ /*
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+	 * still valid, and prefetch_invalid_gpte() has verified that the A/D bits
+ * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
+ * update due to A/D bits being set in the SPTE but not the gPTE.
+ */
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, true,
+ host_writable, &spte);
+
+ /*
+ * There is no need to mark the pfn dirty, as the new protections must
+ * be a subset of the old protections, i.e. synchronizing a SPTE cannot
+ * change the SPTE from read-only to writable.
+ */
+ return mmu_spte_update(sptep, spte);
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_lvl
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
new file mode 100644
index 000000000000..85a0473809b0
--- /dev/null
+++ b/arch/x86/kvm/mmu/spte.c
@@ -0,0 +1,576 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Macros and functions to access KVM PTEs (also known as SPTEs)
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2020 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "x86.h"
+#include "spte.h"
+
+#include <asm/e820/api.h>
+#include <asm/memtype.h>
+#include <asm/vmx.h>
+
+bool __read_mostly enable_mmio_caching = true;
+static bool __ro_after_init allow_mmio_caching;
+module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mmio_caching);
+
+bool __read_mostly kvm_ad_enabled;
+
+u64 __read_mostly shadow_host_writable_mask;
+u64 __read_mostly shadow_mmu_writable_mask;
+u64 __read_mostly shadow_nx_mask;
+u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
+u64 __read_mostly shadow_user_mask;
+u64 __read_mostly shadow_accessed_mask;
+u64 __read_mostly shadow_dirty_mask;
+u64 __read_mostly shadow_mmio_value;
+u64 __read_mostly shadow_mmio_mask;
+u64 __read_mostly shadow_mmio_access_mask;
+u64 __read_mostly shadow_present_mask;
+u64 __read_mostly shadow_me_value;
+u64 __read_mostly shadow_me_mask;
+u64 __read_mostly shadow_acc_track_mask;
+
+u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
+
+void __init kvm_mmu_spte_module_init(void)
+{
+ /*
+ * Snapshot userspace's desire to allow MMIO caching. Whether or not
+ * KVM can actually enable MMIO caching depends on vendor-specific
+ * hardware capabilities and other module params that can't be resolved
+ * until the vendor module is loaded, i.e. enable_mmio_caching can and
+ * will change when the vendor module is (re)loaded.
+ */
+ allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
+}
+
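+/*
+ * The MMIO generation occupies two disjoint bit ranges in the SPTE; scatter
+ * the generation value into those ranges.
+ */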
+static u64 generation_mmio_spte_mask(u64 gen)
+{
+ u64 mask;
+
+ WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
+
+ mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
+ return mask;
+}
+
+u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
+{
+ u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
+ u64 spte = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
+
+ access &= shadow_mmio_access_mask;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
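+	/*
+	 * Stash the gpa in the SPTE, relocating the bits covered by
+	 * shadow_nonpresent_or_rsvd_mask into high bits so that an MMIO SPTE
+	 * can't be abused to speculatively access host memory (L1TF).
+	 */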
+ spte |= gpa | shadow_nonpresent_or_rsvd_mask;
+ spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
+
+ return spte;
+}
+
+static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+ /*
+ * Some reserved pages, such as those from NVDIMM
+ * DAX devices, are not for MMIO, and can be mapped
+ * with cached memory type for better performance.
+ * However, the above check misconceives those pages
+ * as MMIO, and results in KVM mapping them with UC
+ * memory type, which would hurt the performance.
+ * Therefore, we check the host memory type in addition
+ * and only treat UC/UC-/WC pages as MMIO.
+ */
+ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
+
+ return !e820__mapped_raw_any(pfn_to_hpa(pfn),
+ pfn_to_hpa(pfn + 1) - 1,
+ E820_TYPE_RAM);
+}
+
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio)
+{
+ /*
+	 * Determining if a PFN is host MMIO is relatively expensive. Cache the
+ * result locally (in the sole caller) to avoid doing the full query
+ * multiple times when creating a single SPTE.
+ */
+ if (*is_host_mmio < 0)
+ *is_host_mmio = __kvm_is_mmio_pfn(pfn);
+
+ return *is_host_mmio;
+}
+
+static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ WRITE_ONCE(root->has_mapped_host_mmio, true);
+ else
+ WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true);
+
+ /*
+ * Force vCPUs to exit and flush CPU buffers if the vCPU is using the
+ * affected root(s).
+ */
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+/*
+ * Returns true if the SPTE needs to be updated atomically due to having bits
+ * that may be changed without holding mmu_lock, and for which KVM must not
+ * lose information. E.g. KVM must not drop Dirty bit information. The caller
+ * is responsible for checking if the SPTE is shadow-present, and for
+ * determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_needs_atomic_update(u64 spte)
+{
+	/* The Writable bit can be set by KVM's fast page fault handler. */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
+
+ /*
+ * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
+ * SPTEs can be restored by KVM's fast page fault handler.
+ */
+ if (!spte_ad_enabled(spte))
+ return true;
+
+ /*
+ * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
+ * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
+ * invalidate TLBs when aging SPTEs, and so it's safe to clobber the
+ * Accessed bit (and rare in practice).
+ */
+ return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
+}
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool synchronizing,
+ bool host_writable, u64 *new_spte)
+{
+ int level = sp->role.level;
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+ int is_host_mmio = -1;
+ bool wrprot = false;
+
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
+
+ if (sp->role.ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED;
+ else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp))
+ spte |= SPTE_TDP_AD_WRPROT_ONLY;
+
+ spte |= shadow_present_mask;
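+	/*
+	 * Only mark the SPTE Accessed if the guest actually touched the page,
+	 * i.e. not for prefetches (unless synchronizing an existing mapping).
+	 */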
+ if (!prefetch || synchronizing)
+ spte |= shadow_accessed_mask;
+
+ /*
+ * For simplicity, enforce the NX huge page mitigation even if not
+ * strictly necessary. KVM could ignore the mitigation if paging is
+ * disabled in the guest, as the guest doesn't have any page tables to
+ * abuse. But to safely ignore the mitigation, KVM would have to
+ * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
+ * is toggled on, and that's a net negative for performance when TDP is
+ * enabled. When TDP is disabled, KVM will always switch to a new MMU
+ * when CR0.PG is toggled, but leveraging that to ignore the mitigation
+ * would tie make_spte() further to vCPU/MMU state, and add complexity
+ * just to optimize a mode that is anything but performance critical.
+ */
+ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled(vcpu->kvm)) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+
+ if (level > PG_LEVEL_4K)
+ spte |= PT_PAGE_SIZE_MASK;
+
+ if (kvm_x86_ops.get_mt_mask)
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn, &is_host_mmio));
+ if (host_writable)
+ spte |= shadow_host_writable_mask;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ spte |= shadow_me_value;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if (pte_access & ACC_WRITE_MASK) {
+ /*
+ * Unsync shadow pages that are reachable by the new, writable
+ * SPTE. Write-protect the SPTE if the page can't be unsync'd,
+ * e.g. it's write-tracked (upper-level SPs) or has one or more
+ * shadow pages and unsync'ing pages is not allowed.
+ *
+ * When overwriting an existing leaf SPTE, and the old SPTE was
+ * writable, skip trying to unsync shadow pages as any relevant
+ * shadow pages must already be unsync, i.e. the hash lookup is
+ * unnecessary (and expensive). Note, this relies on KVM not
+ * changing PFNs without first zapping the old SPTE, which is
+ * guaranteed by both the shadow MMU and the TDP MMU.
+ */
+ if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) &&
+ mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch))
+ wrprot = true;
+ else
+ spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask |
+ shadow_dirty_mask;
+ }
+
+ if (prefetch && !synchronizing)
+ spte = mark_spte_for_access_track(spte);
+
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
+ get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+
+ /*
+ * Mark the memslot dirty *after* modifying it for access tracking.
+ * Unlike folios, memslots can be safely marked dirty out of mmu_lock,
+ * i.e. in the fast page fault handler.
+ */
+ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON_ONCE(level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
+ !kvm_vcpu_can_access_host_mmio(vcpu) &&
+ kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ kvm_track_host_mmio_mapping(vcpu);
+
+ *new_spte = spte;
+ return wrprot;
+}
+
+static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
+{
+ bool is_access_track = is_access_track_spte(spte);
+
+ if (is_access_track)
+ spte = restore_acc_track_spte(spte);
+
+ KVM_MMU_WARN_ON(set & clear);
+ spte = (spte | set) & ~clear;
+
+ if (is_access_track)
+ spte = mark_spte_for_access_track(spte);
+
+ return spte;
+}
+
+static u64 make_spte_executable(u64 spte)
+{
+ return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
+}
+
+static u64 make_spte_nonexecutable(u64 spte)
+{
+ return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
+}
+
+/*
+ * Construct an SPTE that maps a sub-page of the given huge page SPTE where
+ * `index` identifies which sub-page.
+ *
+ * This is used during huge page splitting to build the SPTEs that make up the
+ * new page table.
+ */
+u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index)
+{
+ u64 child_spte = huge_spte;
+
+ KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm);
+
+ /*
+ * The child_spte already has the base address of the huge page being
+ * split. So we just have to OR in the offset to the page at the next
+ * lower level for the given index.
+ */
+ child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT;
+
+ if (role.level == PG_LEVEL_4K) {
+ child_spte &= ~PT_PAGE_SIZE_MASK;
+
+ /*
+ * When splitting to a 4K page where execution is allowed, mark
+ * the page executable as the NX hugepage mitigation no longer
+ * applies.
+ */
+ if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
+ child_spte = make_spte_executable(child_spte);
+ }
+
+ return child_spte;
+}
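+
+/*
+ * Illustrative example (editorial note): when splitting a 2MiB SPTE into a
+ * table of 4KiB SPTEs, role.level is PG_LEVEL_4K, KVM_PAGES_PER_HPAGE()
+ * evaluates to 1, and child i thus maps huge_pfn + i. Each child also drops
+ * PT_PAGE_SIZE_MASK and, if exec is allowed and the NX hugepage mitigation
+ * is enabled, regains execute permission via make_spte_executable().
+ */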
+
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
+{
+ u64 huge_spte;
+
+ KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm);
+
+ huge_spte = small_spte | PT_PAGE_SIZE_MASK;
+
+ /*
+ * huge_spte already has the address of the sub-page being collapsed
+ * from small_spte, so just clear the lower address bits to create the
+ * huge page address.
+ */
+ huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
+
+ if (is_nx_huge_page_enabled(kvm))
+ huge_spte = make_spte_nonexecutable(huge_spte);
+
+ return huge_spte;
+}
+
+u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
+{
+ u64 spte = SPTE_MMU_PRESENT_MASK;
+
+ spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_value;
+
+ if (ad_disabled)
+ spte |= SPTE_TDP_AD_DISABLED;
+ else
+ spte |= shadow_accessed_mask;
+
+ return spte;
+}
+
+u64 mark_spte_for_access_track(u64 spte)
+{
+ if (spte_ad_enabled(spte))
+ return spte & ~shadow_accessed_mask;
+
+ if (is_access_track_spte(spte))
+ return spte;
+
+ check_spte_writable_invariants(spte);
+
+ WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
+ "Access Tracking saved bit locations are not zero\n");
+
+ spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
+ spte &= ~(shadow_acc_track_mask | shadow_accessed_mask);
+
+ return spte;
+}
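+
+/*
+ * Illustrative example (editorial note): for an A/D-disabled EPT SPTE with
+ * RWX = 111b, the R and X bits (mask 0x5) are copied up to bits 54 and 56
+ * and bits 2:0 are then cleared, so the next guest access faults. W is
+ * deliberately not saved; restore_acc_track_spte() brings back only R and X,
+ * and W is restored only when a write is attempted, preserving dirty
+ * tracking.
+ */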
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
+{
+ BUG_ON((u64)(unsigned)access_mask != access_mask);
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
+
+ /*
+ * Reset to the original module param value to honor userspace's desire
+ * to (dis)allow MMIO caching. Update the param itself so that
+ * userspace can see whether or not KVM is actually using MMIO caching.
+ */
+ enable_mmio_caching = allow_mmio_caching;
+ if (!enable_mmio_caching)
+ mmio_value = 0;
+
+ /*
+ * The mask must contain only bits that are carved out specifically for
+ * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO
+ * generation.
+ */
+ if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK))
+ mmio_value = 0;
+
+ /*
+ * Disable MMIO caching if the MMIO value collides with the bits that
+ * are used to hold the relocated GFN when the L1TF mitigation is
+ * enabled. This should never fire as there is no known hardware that
+ * can trigger this condition, e.g. SME/SEV CPUs that require a custom
+ * MMIO value are not susceptible to L1TF.
+ */
+ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask <<
+ SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)))
+ mmio_value = 0;
+
+ /*
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
+ * not set any RWX bits.
+ */
+ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
+ WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value))
+ mmio_value = 0;
+
+ if (!mmio_value)
+ enable_mmio_caching = false;
+
+ shadow_mmio_value = mmio_value;
+ shadow_mmio_mask = mmio_mask;
+ shadow_mmio_access_mask = access_mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_mask);
+
+void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value)
+{
+ kvm->arch.shadow_mmio_value = mmio_value;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_mmio_spte_value);
+
+void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
+{
+ /* shadow_me_value must be a subset of shadow_me_mask */
+ if (WARN_ON(me_value & ~me_mask))
+ me_value = me_mask = 0;
+
+ shadow_me_value = me_value;
+ shadow_me_mask = me_mask;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);
+
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+{
+ kvm_ad_enabled = has_ad_bits;
+
+ shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
+ shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
+ shadow_nx_mask = 0ull;
+ shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
+
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
+
+ /*
+ * EPT Misconfigurations are generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks);
+
+void kvm_mmu_reset_all_pte_masks(void)
+{
+ u8 low_phys_bits;
+ u64 mask;
+
+ kvm_ad_enabled = true;
+
+ /*
+	 * If the CPU has 46 or fewer physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
+ */
+ shadow_nonpresent_or_rsvd_mask = 0;
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
+
+ shadow_nonpresent_or_rsvd_lower_gfn_mask =
+ GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+
+ shadow_user_mask = PT_USER_MASK;
+ shadow_accessed_mask = PT_ACCESSED_MASK;
+ shadow_dirty_mask = PT_DIRTY_MASK;
+ shadow_nx_mask = PT64_NX_MASK;
+ shadow_x_mask = 0;
+ shadow_present_mask = PT_PRESENT_MASK;
+
+ shadow_acc_track_mask = 0;
+ shadow_me_mask = 0;
+ shadow_me_value = 0;
+
+ shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE;
+ shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (kvm_host.maxphyaddr < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
+}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
new file mode 100644
index 000000000000..91ce29fd6f1b
--- /dev/null
+++ b/arch/x86/kvm/mmu/spte.h
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef KVM_X86_MMU_SPTE_H
+#define KVM_X86_MMU_SPTE_H
+
+#include <asm/vmx.h>
+
+#include "mmu.h"
+#include "mmu_internal.h"
+
+/*
+ * An MMU-present SPTE is backed by actual memory and may or may not be present
+ * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
+ * is ignored by all flavors of SPTEs and checking a low bit often generates
+ * better code than for a high bit, e.g. 56+. MMU present checks are pervasive
+ * enough that the improved code generation is noticeable in KVM's footprint.
+ */
+#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
+
+/*
+ * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
+ * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
+ * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * must be employed for a given TDP SPTE.
+ *
+ * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
+ * paging, including NPT PAE. This scheme works because legacy shadow paging
+ * is guaranteed to have A/D bits and write-protection is forced only for
+ * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
+ * must be restricted to 64-bit KVM.
+ */
+#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED == 0);
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
+
+#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+/* The mask for the R/X bits in EPT PTEs */
+#define SPTE_EPT_READABLE_MASK 0x1ull
+#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
+
+#define SPTE_LEVEL_BITS 9
+#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
+#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
+#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page. This mask obviously
+ * must not overlap the A/D type mask.
+ */
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
+ SPTE_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/*
+ * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
+ * SPTE is write-protected. See is_writable_pte() for details.
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10)
+
+/*
+ * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
+ * to not overlap the A/D type mask or the saved access bits of access-tracked
+ * SPTEs when A/D bits are disabled.
+ */
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
+
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
+static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
+
+/* Defined only to keep the above static asserts readable. */
+#undef SHADOW_ACC_TRACK_SAVED_MASK
+
+/*
+ * Due to limited space in PTEs, the MMIO generation is a 19-bit subset of
+ * the memslots generation and is derived as follows:
+ *
+ * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
+ * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ *
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap). The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
+ */
+
+#define MMIO_SPTE_GEN_LOW_START 3
+#define MMIO_SPTE_GEN_LOW_END 10
+
+#define MMIO_SPTE_GEN_HIGH_START 52
+#define MMIO_SPTE_GEN_HIGH_END 62
+
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+ MMIO_SPTE_GEN_LOW_START)
+#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+ MMIO_SPTE_GEN_HIGH_START)
+static_assert(!(SPTE_MMU_PRESENT_MASK &
+ (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
+
+/*
+ * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the
+ * MMU-present bit. The generation obviously co-exists with the magic MMIO
+ * mask/value, and MMIO SPTEs are considered !MMU-present.
+ *
+ * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT
+ * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO
+ * and so they're off-limits for generation; additional checks ensure the mask
+ * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
+ */
+#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
+static_assert(!(SPTE_MMIO_ALLOWED_MASK &
+ (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
+
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+
+#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
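+
+/*
+ * Illustrative example (editorial note): for generation 0x1ff, the low eight
+ * bits (0xff) are stored in SPTE bits 10:3 and the remaining high bit lands
+ * at SPTE bit 52, i.e. the encoded value is (0xffULL << 3) | (1ULL << 52).
+ * get_mmio_spte_generation() undoes the two shifts and ORs the halves back
+ * together.
+ */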
+
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
+#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
+
+
+/*
+ * True if A/D bits are supported in hardware and are enabled by KVM. When
+ * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable
+ * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario
+ * where KVM is using A/D bits for L1, but not L2.
+ */
+extern bool __read_mostly kvm_ad_enabled;
+
+extern u64 __read_mostly shadow_host_writable_mask;
+extern u64 __read_mostly shadow_mmu_writable_mask;
+extern u64 __read_mostly shadow_nx_mask;
+extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
+extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_accessed_mask;
+extern u64 __read_mostly shadow_dirty_mask;
+extern u64 __read_mostly shadow_mmio_value;
+extern u64 __read_mostly shadow_mmio_mask;
+extern u64 __read_mostly shadow_mmio_access_mask;
+extern u64 __read_mostly shadow_present_mask;
+extern u64 __read_mostly shadow_me_value;
+extern u64 __read_mostly shadow_me_mask;
+
+/*
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
+ * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
+ * pages.
+ */
+extern u64 __read_mostly shadow_acc_track_mask;
+
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
+
+/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * vulnerability.
+ *
+ * Only used by the TDP MMU.
+ */
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
+
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
+
+static inline bool is_frozen_spte(u64 spte)
+{
+ return spte == FROZEN_SPTE;
+}
+
+/* Get an SPTE's index into its parent's page table (and the spt array). */
+static inline int spte_index(u64 *sptep)
+{
+ return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
+}
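+
+/*
+ * Illustrative example (editorial note): a 4KiB page table holds 512
+ * eight-byte SPTEs, so for an sptep at byte offset 0x38 within its page this
+ * evaluates to 0x38 / 8 = 7.
+ */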
+
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts. This mask covers the lower bits of the GFN.
+ */
+extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
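+
+/*
+ * Illustrative example (editorial note): on an L1TF-affected CPU with 46
+ * cache/physical address bits, bits 45:41 are usurped, so this mask is
+ * GENMASK_ULL(40, 12) and the colliding GFN bits are parked
+ * SHADOW_NONPRESENT_OR_RSVD_MASK_LEN (5) positions higher in the SPTE.
+ */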
+
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
+{
+ return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
+static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
+{
+ if (kvm_mmu_is_dummy_root(root))
+ return NULL;
+
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ return spte_to_child_sp(root);
+}
+
+static inline bool is_mirror_sptep(tdp_ptep_t sptep)
+{
+ return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
+}
+
+static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ return READ_ONCE(root->has_mapped_host_mmio);
+
+ return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio);
+}
+
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
+{
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
+ likely(enable_mmio_caching);
+}
+
+static inline bool is_shadow_present_pte(u64 pte)
+{
+ return !!(pte & SPTE_MMU_PRESENT_MASK);
+}
+
+static inline bool is_ept_ve_possible(u64 spte)
+{
+ return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) &&
+ !(spte & VMX_EPT_SUPPRESS_VE_BIT) &&
+ (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
+}
+
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
+ /*
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
+ * and non-TDP SPTEs will never set these bits. Optimize for 64-bit
+ * TDP and do the A/D type check unconditionally.
+ */
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
+}
+
+static inline bool is_access_track_spte(u64 spte)
+{
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
+}
+
+static inline bool is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
+static inline bool is_last_spte(u64 pte, int level)
+{
+ return (level == PG_LEVEL_4K) || is_large_pte(pte);
+}
+
+static inline bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
+static inline kvm_pfn_t spte_to_pfn(u64 pte)
+{
+ return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static inline bool is_accessed_spte(u64 spte)
+{
+ return spte & shadow_accessed_mask;
+}
+
+static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+ int level)
+{
+ int bit7 = (pte >> 7) & 1;
+
+ return rsvd_check->rsvd_bits_mask[bit7][level-1];
+}
+
+static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+ u64 pte, int level)
+{
+ return pte & get_rsvd_bits(rsvd_check, pte, level);
+}
+
+static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+ u64 pte)
+{
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+}
+
+static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+ u64 spte, int level)
+{
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
+ __is_rsvd_bits_set(rsvd_check, spte, level);
+}
+
+/*
+ * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
+ *
+ * 1. To intercept writes for dirty logging. KVM write-protects huge pages
+ * so that they can be split down into the dirty logging
+ * granularity (4KiB) whenever the guest writes to them. KVM also
+ * write-protects 4KiB pages so that writes can be recorded in the dirty log
+ * (e.g. if not using PML). SPTEs are write-protected for dirty logging
+ *    during the VM-ioctls that enable dirty logging.
+ *
+ * 2. To intercept writes to guest page tables that KVM is shadowing. When a
+ * guest writes to its page table the corresponding shadow page table will
+ * be marked "unsync". That way KVM knows which shadow page tables need to
+ * be updated on the next TLB flush, INVLPG, etc. and which do not.
+ *
+ * 3. To prevent guest writes to read-only memory, such as for memory in a
+ * read-only memslot or guest memory backed by a read-only VMA. Writes to
+ * such pages are disallowed entirely.
+ *
+ * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this
+ * case, the SPTE is access-protected, not just write-protected!
+ *
+ * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
+ * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it.
+ * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
+ * in the SPTE:
+ *
+ * shadow_mmu_writable_mask, aka MMU-writable -
+ * Cleared on SPTEs that KVM is currently write-protecting for shadow paging
+ * purposes (case 2 above).
+ *
+ * shadow_host_writable_mask, aka Host-writable -
+ *  Cleared on SPTEs that are not host-writable (case 3 above).
+ *
+ * Note, not all possible combinations of PT_WRITABLE_MASK,
+ * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given
+ * SPTE can be in only one of the following states, which map to cases 1-3
+ * above:
+ *
+ * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
+ * ------------------------- | ------------------------ | ----------------
+ * 1 | 1 | 1 (writable)
+ * 1 | 1 | 0 (case 1)
+ * 1 | 0 | 0 (case 2)
+ * 0 | 0 | 0 (case 3)
+ *
+ * The valid combinations of these bits are checked by
+ * check_spte_writable_invariants() whenever an SPTE is modified.
+ *
+ * Clearing the MMU-writable bit is always done under the MMU lock and always
+ * accompanied by a TLB flush before dropping the lock to avoid corrupting the
+ * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging
+ * (which does not clear the MMU-writable bit), does not flush TLBs before
+ * dropping the lock, as it only needs to synchronize guest writes with the
+ * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for
+ * access-tracking via the clear_young() MMU notifier also does not flush TLBs.
+ *
+ * So, there is the problem: clearing the MMU-writable bit can encounter a
+ * write-protected SPTE while CPUs still have writable mappings for that SPTE
+ * cached in their TLB. To address this, KVM always flushes TLBs when
+ * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
+ *
+ * The Host-writable bit is not modified on present SPTEs; it is only set or
+ * cleared when an SPTE is first faulted in from non-present and then remains
+ * immutable.
+ */
+static inline bool is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+/* Note: spte must be a shadow-present leaf SPTE. */
+static inline void check_spte_writable_invariants(u64 spte)
+{
+ if (spte & shadow_mmu_writable_mask)
+ WARN_ONCE(!(spte & shadow_host_writable_mask),
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
+ spte);
+ else
+ WARN_ONCE(is_writable_pte(spte),
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
+}
+
+static inline bool is_mmu_writable_spte(u64 spte)
+{
+ return spte & shadow_mmu_writable_mask;
+}
+
+/*
+ * Returns true if the access indicated by @fault is allowed by the existing
+ * SPTE protections. Note, the caller is responsible for checking that the
+ * SPTE is a shadow-present, leaf SPTE (either before or after).
+ */
+static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+{
+ if (fault->exec)
+ return is_executable_pte(spte);
+
+ if (fault->write)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
+/*
+ * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for
+ * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only,
+ * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM
+ * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped,
+ * not whether or not SPTEs were modified, i.e. only the write-tracking case
+ * needs to flush at the time the SPTE is modified, before dropping mmu_lock.
+ *
+ * Don't flush if the Accessed bit is cleared, as access tracking tolerates
+ * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a
+ * mmu_notifier.clear_flush_young() event.
+ *
+ * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally
+ * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and
+ * when clearing dirty logs, KVM flushes based on whether or not dirty entries
+ * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found.
+ *
+ * Note, this logic only applies to shadow-present leaf SPTEs. The caller is
+ * responsible for checking that the old SPTE is shadow-present, and is also
+ * responsible for determining whether or not a TLB flush is required when
+ * modifying a shadow-present non-leaf SPTE.
+ */
+static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte)
+{
+ return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte);
+}
+
+static inline u64 get_mmio_spte_generation(u64 spte)
+{
+ u64 gen;
+
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
+ return gen;
+}
+
+bool spte_needs_atomic_update(u64 spte);
+
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ const struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool synchronizing,
+ bool host_writable, u64 *new_spte);
+u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index);
+u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level);
+u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
+u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
+u64 mark_spte_for_access_track(u64 spte);
+
+/* Restore an acc-track PTE back to a regular PTE */
+static inline u64 restore_acc_track_spte(u64 spte)
+{
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
+
+ spte &= ~shadow_acc_track_mask;
+ spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
+ spte |= saved_bits;
+
+ return spte;
+}
+
+void __init kvm_mmu_spte_module_init(void);
+void kvm_mmu_reset_all_pte_masks(void);
+
+#endif
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
new file mode 100644
index 000000000000..9e17bfa80901
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "mmu_internal.h"
+#include "tdp_iter.h"
+#include "spte.h"
+
+/*
+ * Recalculates the pointer to the SPTE for the current GFN and level and
+ * rereads the SPTE.
+ */
+static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
+{
+ iter->sptep = iter->pt_path[iter->level - 1] +
+ SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level);
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+}
+
+/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+ iter->yielded = false;
+ iter->yielded_gfn = iter->next_last_level_gfn;
+ iter->level = iter->root_level;
+
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ iter->valid = true;
+}
+
+/*
+ * Sets a TDP iterator to walk a pre-order traversal of the paging structure
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
+ */
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits)
+{
+ if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
+ (root->role.level > PT64_ROOT_MAX_LEVEL) ||
+ (gfn_bits && next_last_level_gfn >= gfn_bits))) {
+ iter->valid = false;
+ return;
+ }
+
+ iter->next_last_level_gfn = next_last_level_gfn;
+ iter->gfn_bits = gfn_bits;
+ iter->root_level = root->role.level;
+ iter->min_level = min_level;
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
+ iter->as_id = kvm_mmu_page_as_id(root);
+
+ tdp_iter_restart(iter);
+}
+
+/*
+ * Given an SPTE and its level, returns a pointer containing the host virtual
+ * address of the child page table referenced by the SPTE. Returns null if
+ * there is no such entry.
+ */
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
+{
+ /*
+ * There's no child entry if this entry isn't present or is a
+ * last-level entry.
+ */
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
+ return NULL;
+
+ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
+}
+
+/*
+ * Steps down one level in the paging structure towards the goal GFN. Returns
+ * true if the iterator was able to step down a level, false otherwise.
+ */
+static bool try_step_down(struct tdp_iter *iter)
+{
+ tdp_ptep_t child_pt;
+
+ if (iter->level == iter->min_level)
+ return false;
+
+ /*
+ * Reread the SPTE before stepping down to avoid traversing into page
+ * tables that are no longer linked from this entry.
+ */
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+
+ child_pt = spte_to_child_pt(iter->old_spte, iter->level);
+ if (!child_pt)
+ return false;
+
+ iter->level--;
+ iter->pt_path[iter->level - 1] = child_pt;
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ return true;
+}
+
+/*
+ * Steps to the next entry in the current page table, at the current page table
+ * level. The next entry could point to a page backing guest memory or another
+ * page table, or it could be non-present. Returns true if the iterator was
+ * able to step to the next entry in the page table, false if the iterator was
+ * already at the end of the current page table.
+ */
+static bool try_step_side(struct tdp_iter *iter)
+{
+ /*
+ * Check if the iterator is already at the end of the current page
+ * table.
+ */
+ if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) ==
+ (SPTE_ENT_PER_PAGE - 1))
+ return false;
+
+ iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
+ iter->next_last_level_gfn = iter->gfn;
+ iter->sptep++;
+ iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
+
+ return true;
+}
+
+/*
+ * Tries to traverse back up a level in the paging structure so that the walk
+ * can continue from the next entry in the parent page table. Returns true on a
+ * successful step up, false if already in the root page.
+ */
+static bool try_step_up(struct tdp_iter *iter)
+{
+ if (iter->level == iter->root_level)
+ return false;
+
+ iter->level++;
+ iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ return true;
+}
+
+/*
+ * Step to the next SPTE in a pre-order traversal of the paging structure.
+ * To get to the next SPTE, the iterator either steps down towards the goal
+ * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
+ * higher GFN.
+ *
+ * The basic algorithm is as follows:
+ * 1. If the current SPTE is a non-last-level SPTE, step down into the page
+ * table it points to.
+ * 2. If the iterator cannot step down, it will try to step to the next SPTE
+ * in the current page of the paging structure.
+ * 3. If the iterator cannot step to the next entry in the current page, it will
+ * try to step up to the parent paging structure page. In this case, that
+ * SPTE will have already been visited, and so the iterator must also step
+ * to the side again.
+ */
+void tdp_iter_next(struct tdp_iter *iter)
+{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
+ if (try_step_down(iter))
+ return;
+
+ do {
+ if (try_step_side(iter))
+ return;
+ } while (try_step_up(iter));
+ iter->valid = false;
+}
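+
+/*
+ * Illustrative walk (editorial note): for a two-level structure in which
+ * root entry 0 points at a fully-populated leaf table, the visit order is
+ * root[0], leaf[0] .. leaf[511], then root[1]: one step down, 511 side
+ * steps, then one step up followed by a side step.
+ */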
+
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
new file mode 100644
index 000000000000..364c5da6c499
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef __KVM_X86_MMU_TDP_ITER_H
+#define __KVM_X86_MMU_TDP_ITER_H
+
+#include <linux/kvm_host.h>
+
+#include "mmu.h"
+#include "spte.h"
+
+/*
+ * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
+ */
+static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
+{
+ return READ_ONCE(*rcu_dereference(sptep));
+}
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
+{
+ atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
+ * and have volatile bits (bits that can be set outside of mmu_lock) that
+ * must not be clobbered.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
+{
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_needs_atomic_update(old_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
+{
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
+}
+
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+ u64 mask, int level)
+{
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
+
+ __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+ return old_spte;
+}
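+
+/*
+ * Illustrative usage (editorial sketch): an aging pass clears only the
+ * Accessed bit, taking the atomic path when the SPTE has volatile bits:
+ *
+ *	u64 old;
+ *
+ *	old = tdp_mmu_clear_spte_bits(iter->sptep, iter->old_spte,
+ *				      shadow_accessed_mask, iter->level);
+ */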
+
+/*
+ * A TDP iterator performs a pre-order walk over a TDP paging structure.
+ */
+struct tdp_iter {
+ /*
+ * The iterator will traverse the paging structure towards the mapping
+ * for this GFN.
+ */
+ gfn_t next_last_level_gfn;
+ /*
+ * The next_last_level_gfn at the time when the thread last
+ * yielded. Only yielding when the next_last_level_gfn !=
+ * yielded_gfn helps ensure forward progress.
+ */
+ gfn_t yielded_gfn;
+ /* Pointers to the page tables traversed to reach the current SPTE */
+ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
+ /* A pointer to the current SPTE */
+ tdp_ptep_t sptep;
+ /* The lowest GFN (mask bits excluded) mapped by the current SPTE */
+ gfn_t gfn;
+ /* Mask applied to convert the GFN to the mapping GPA */
+ gfn_t gfn_bits;
+ /* The level of the root page given to the iterator */
+ int root_level;
+ /* The lowest level the iterator should traverse to */
+ int min_level;
+ /* The iterator's current level within the paging structure */
+ int level;
+ /* The address space ID, i.e. SMM vs. regular. */
+ int as_id;
+ /* A snapshot of the value at sptep */
+ u64 old_spte;
+ /*
+ * Whether the iterator has a valid state. This will be false if the
+ * iterator walks off the end of the paging structure.
+ */
+ bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
+};
+
+/*
+ * Iterates over every SPTE mapping the GFN range [start, end) in a
+ * preorder traversal.
+ */
+#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \
+ for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \
+ iter.valid && iter.gfn < end; \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte_min_level_all(iter, root, min_level) \
+ for (tdp_iter_start(&iter, root, min_level, 0, 0); \
+ iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \
+ tdp_iter_next(&iter))
+
+#define for_each_tdp_pte(iter, kvm, root, start, end) \
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end)
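+
+/*
+ * Illustrative usage (editorial sketch): walk the leaf SPTEs in [start, end)
+ * under RCU protection, as the zap and dirty-logging paths do; handle_leaf()
+ * below is a hypothetical helper:
+ *
+ *	struct tdp_iter iter;
+ *
+ *	rcu_read_lock();
+ *	for_each_tdp_pte(iter, kvm, root, start, end) {
+ *		if (!is_shadow_present_pte(iter.old_spte) ||
+ *		    !is_last_spte(iter.old_spte, iter.level))
+ *			continue;
+ *		handle_leaf(kvm, &iter);	// hypothetical helper
+ *	}
+ *	rcu_read_unlock();
+ */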
+
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
+
+void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
+ int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits);
+void tdp_iter_next(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
+
+#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
new file mode 100644
index 000000000000..9c26038f6b77
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -0,0 +1,1992 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "mmu.h"
+#include "mmu_internal.h"
+#include "mmutrace.h"
+#include "tdp_iter.h"
+#include "tdp_mmu.h"
+#include "spte.h"
+
+#include <asm/cmpxchg.h>
+#include <trace/events/kvm.h>
+
+/* Initializes the TDP MMU for the VM, if enabled. */
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+{
+ INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/* Arbitrarily returns true so that this may be used in if statements. */
+static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+ bool shared)
+{
+ if (shared)
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return true;
+}
+
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
+{
+ /*
+	 * Invalidate all roots, which, besides the obvious, schedules all roots
+ * for zapping and thus puts the TDP MMU's reference to each root, i.e.
+ * ultimately frees all roots.
+ */
+ kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
+
+#ifdef CONFIG_KVM_PROVE_MMU
+ KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
+#endif
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+ * Ensure that all the outstanding RCU callbacks to free shadow pages
+ * can run before the VM is torn down. Putting the last reference to
+ * zapped roots will create new callbacks.
+ */
+ rcu_barrier();
+}
+
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->external_spt);
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
+{
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+
+ tdp_mmu_free_sp(sp);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+ if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
+ return;
+
+ /*
+ * The TDP MMU itself holds a reference to each root until the root is
+	 * explicitly invalidated, i.e. the final reference should never be
+ * put for a valid root.
+ */
+ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ list_del_rcu(&root->link);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
+ return false;
+
+ if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
+ return false;
+
+ if (likely(!is_mirror_sp(root)))
+ return types & KVM_DIRECT_ROOTS;
+ return types & KVM_MIRROR_ROOTS;
+}
+
+/*
+ * Returns the next root after @prev_root (or the first root if @prev_root is
+ * NULL) that matches @types. A reference to the returned root is
+ * acquired, and the reference to @prev_root is released (the caller obviously
+ * must hold a reference to @prev_root if it's non-NULL).
+ *
+ * Roots that don't match @types are skipped.
+ *
+ * Returns NULL if the end of tdp_mmu_roots was reached.
+ */
+static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
+ struct kvm_mmu_page *prev_root,
+ enum kvm_tdp_mmu_root_types types)
+{
+ struct kvm_mmu_page *next_root;
+
+ /*
+ * While the roots themselves are RCU-protected, fields such as
+ * role.invalid are protected by mmu_lock.
+ */
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ if (prev_root)
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &prev_root->link,
+ typeof(*prev_root), link);
+ else
+ next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ typeof(*next_root), link);
+
+ while (next_root) {
+ if (tdp_mmu_root_match(next_root, types) &&
+ kvm_tdp_mmu_get_root(next_root))
+ break;
+
+ next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
+ &next_root->link, typeof(*next_root), link);
+ }
+
+ rcu_read_unlock();
+
+ if (prev_root)
+ kvm_tdp_mmu_put_root(kvm, prev_root);
+
+ return next_root;
+}
+
+/*
+ * Note: this iterator gets and puts references to the roots it iterates over.
+ * This makes it safe to release the MMU lock and yield within the loop, but
+ * if exiting the loop early, the caller must drop the reference to the most
+ * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode.
+ */
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _types)) \
+ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
+ } else
+
+#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
+
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
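+
+/*
+ * Illustrative usage (editorial sketch): with mmu_lock held, operate on
+ * every valid root of an address space, yielding as needed; zap_helper() is
+ * a hypothetical helper:
+ *
+ *	struct kvm_mmu_page *root;
+ *	bool flush = false;
+ *
+ *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id)
+ *		flush = zap_helper(kvm, root, start, end, flush);
+ */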
+
+/*
+ * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
+ * the implication being that any flow that holds mmu_lock for read is
+ * inherently yield-friendly and should use the yield-safe variant above.
+ * Holding mmu_lock for write obviates the need for RCU protection as the list
+ * is guaranteed to be stable.
+ */
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types)))) { \
+ } else
+
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_update().
+ */
+#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types))) { \
+ } else
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+ return sp;
+}
+
+static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
+ gfn_t gfn, union kvm_mmu_page_role role)
+{
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ sp->role = role;
+ sp->gfn = gfn;
+ sp->ptep = sptep;
+ sp->tdp_mmu_page = true;
+
+ trace_kvm_mmu_get_page(sp, true);
+}
+
+static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *parent_sp;
+ union kvm_mmu_page_role role;
+
+ parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
+
+ role = parent_sp->role;
+ role.level--;
+
+ tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
+}
+
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role role = mmu->root_role;
+ int as_id = kvm_mmu_role_as_id(role);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page *root;
+
+ if (mirror)
+ role.is_mirror = true;
+
+ /*
+ * Check for an existing root before acquiring the pages lock to avoid
+ * unnecessary serialization if multiple vCPUs are loading a new root.
+ * E.g. when bringing up secondary vCPUs, KVM will already have created
+ * a valid root on behalf of the primary vCPU.
+ */
+ read_lock(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+ if (root->role.word == role.word)
+ goto out_read_unlock;
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+
+ /*
+ * Recheck for an existing root after acquiring the pages lock, another
+ * vCPU may have raced ahead and created a new usable root. Manually
+ * walk the list of roots as the standard macros assume that the pages
+ * lock is *not* held. WARN if grabbing a reference to a usable root
+ * fails, as the last reference to a root can only be put *after* the
+ * root has been invalidated, which requires holding mmu_lock for write.
+ */
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (root->role.word == role.word &&
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+ goto out_spin_unlock;
+ }
+
+ root = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_sp(root, NULL, 0, role);
+
+ /*
+ * TDP MMU roots are kept until they are explicitly invalidated, either
+ * by a memslot update or by the destruction of the VM. Initialize the
+ * refcount to two; one reference for the vCPU, and one reference for
+ * the TDP MMU itself, which is held until the root is invalidated and
+ * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
+ */
+ refcount_set(&root->tdp_mmu_root_count, 2);
+ list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
+
+out_spin_unlock:
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+ read_unlock(&kvm->mmu_lock);
+ /*
+ * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+ * and actually consuming the root if it's invalidated after dropping
+ * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+ */
+ if (mirror) {
+ mmu->mirror_root_hpa = __pa(root->spt);
+ } else {
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ }
+}
+
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared);
+
+static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+#ifdef CONFIG_KVM_PROVE_MMU
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
+#endif
+}
+
+static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+#ifdef CONFIG_KVM_PROVE_MMU
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
+#endif
+}
+
+/**
+ * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ */
+static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ tdp_unaccount_mmu_page(kvm, sp);
+
+ if (!sp->nx_huge_page_disallowed)
+ return;
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ sp->nx_huge_page_disallowed = false;
+ untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
+ int level)
+{
+ /*
+ * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
+ * PTs are removed in a special order, involving free_external_spt().
+ * But remove_external_spte() will be called on non-leaf PTEs via
+ * __tdp_mmu_zap_root(), so avoid the error the former would return
+ * in this case.
+ */
+ if (!is_last_spte(old_spte, level))
+ return;
+
+ /* Zapping leaf spte is allowed only when write lock is held. */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
+}
+
+/**
+ * handle_removed_pt() - handle a page table removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ * of the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
+ */
+static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
+ int level = sp->role.level;
+ gfn_t base_gfn = sp->gfn;
+ int i;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+
+ tdp_mmu_unlink_sp(kvm, sp);
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ tdp_ptep_t sptep = pt + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_spte;
+
+ if (shared) {
+ /*
+ * Set the SPTE to a nonpresent value that other
+ * threads will not overwrite. If the SPTE was
+ * already marked as frozen then another thread
+			 * handling a page fault could overwrite it, so
+			 * keep writing FROZEN_SPTE until the replaced value
+			 * is something other than the frozen SPTE value.
+ */
+ for (;;) {
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
+ if (!is_frozen_spte(old_spte))
+ break;
+ cpu_relax();
+ }
+ } else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
+ continue;
+
+ /*
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a frozen SPTE is not
+ * strictly necessary for the same reason, but using
+ * the frozen SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to FROZEN_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
+ */
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ FROZEN_SPTE, level);
+ }
+ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+ old_spte, FROZEN_SPTE, level, shared);
+
+ if (is_mirror_sp(sp)) {
+ KVM_BUG_ON(shared, kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+ }
+
+ if (is_mirror_sp(sp) &&
+ WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
+ /*
+		 * Failed to free the page table page in the mirror page
+		 * table; there is nothing further to do.
+ * Intentionally leak the page to prevent the kernel from
+ * accessing the encrypted page.
+ */
+ sp->external_spt = NULL;
+ }
+
+ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
+{
+ if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
+ struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
+
+ WARN_ON_ONCE(sp->role.level + 1 != level);
+ WARN_ON_ONCE(sp->gfn != gfn);
+ return sp->external_spt;
+ }
+
+ return NULL;
+}
+
+static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
+ gfn_t gfn, u64 old_spte,
+ u64 new_spte, int level)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ int ret = 0;
+
+ KVM_BUG_ON(was_present, kvm);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+ /*
+ * We need to lock out other updates to the SPTE until the external
+ * page table has been modified. Use FROZEN_SPTE similar to
+ * the zapping case.
+ */
+ if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
+ return -EBUSY;
+
+ /*
+	 * Use a different hook depending on whether a middle-level
+	 * external page table or a leaf entry is being set up.
+ */
+ if (is_leaf) {
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
+ } else {
+ void *external_spt = get_external_spt(gfn, new_spte, level);
+
+ KVM_BUG_ON(!external_spt, kvm);
+ ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
+ }
+ if (ret)
+ __kvm_tdp_mmu_write_spte(sptep, old_spte);
+ else
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return ret;
+}
+
+/**
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * @kvm: kvm instance
+ * @as_id: the address space of the paging structure the SPTE was a part of
+ * @gfn: the base GFN that was mapped by the SPTE
+ * @old_spte: The value of the SPTE before the change
+ * @new_spte: The value of the SPTE after the change
+ * @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Handle bookkeeping that might result from the modification of a SPTE. Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
+ */
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
+{
+ bool was_present = is_shadow_present_pte(old_spte);
+ bool is_present = is_shadow_present_pte(new_spte);
+ bool was_leaf = was_present && is_last_spte(old_spte, level);
+ bool is_leaf = is_present && is_last_spte(new_spte, level);
+ bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
+
+ WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
+ WARN_ON_ONCE(level < PG_LEVEL_4K);
+ WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+
+ /*
+ * If this warning were to trigger it would indicate that there was a
+ * missing MMU notifier or a race with some notifier handler.
+ * A present, leaf SPTE should never be directly replaced with another
+ * present leaf SPTE pointing to a different PFN. A notifier handler
+ * should be zapping the SPTE before the main MM's page table is
+ * changed, or the SPTE should be zeroed, and the TLBs flushed by the
+ * thread before replacement.
+ */
+ if (was_leaf && is_leaf && pfn_changed) {
+ pr_err("Invalid SPTE change: cannot replace a present leaf\n"
+ "SPTE with another present leaf SPTE mapping a\n"
+ "different PFN!\n"
+ "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
+ as_id, gfn, old_spte, new_spte, level);
+
+ /*
+ * Crash the host to prevent error propagation and guest data
+ * corruption.
+ */
+ BUG();
+ }
+
+ if (old_spte == new_spte)
+ return;
+
+ trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+
+ if (is_leaf)
+ check_spte_writable_invariants(new_spte);
+
+ /*
+	 * The only time a SPTE should be changed from one non-present
+	 * state to another is when an MMIO entry is installed/modified/
+ * removed. In that case, there is nothing to do here.
+ */
+ if (!was_present && !is_present) {
+ /*
+ * If this change does not involve a MMIO SPTE or frozen SPTE,
+ * it is unexpected. Log the change, though it should not
+ * impact the guest since both the former and current SPTEs
+ * are nonpresent.
+ */
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
+ !is_frozen_spte(new_spte)))
+ pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
+ "should not be replaced with another,\n"
+ "different nonpresent SPTE, unless one or both\n"
+ "are MMIO SPTEs, or the new SPTE is\n"
+ "a temporary frozen SPTE.\n"
+ "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
+ as_id, gfn, old_spte, new_spte, level);
+ return;
+ }
+
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
+
+ /*
+ * Recursively handle child PTs if the change removed a subtree from
+ * the paging structure. Note the WARN on the PFN changing without the
+ * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
+ * pages are kernel allocations and should never be migrated.
+ */
+ if (was_present && !was_leaf &&
+ (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
+ handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
+}
+
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ /*
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
+ */
+ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
+
+ if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
+ int ret;
+
+ /*
+ * Users of atomic zapping don't operate on mirror roots,
+ * so don't handle it and bug the VM if it's seen.
+ */
+ if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
+ return -EBUSY;
+
+ ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
+ iter->old_spte, new_spte, iter->level);
+ if (ret)
+ return ret;
+ } else {
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
+ * and does not hold the mmu_lock. On failure, i.e. if a
+ * different logical CPU modified the SPTE, try_cmpxchg64()
+ * updates iter->old_spte with the current value, so the caller
+ * operates on fresh data, e.g. if it retries
+		 * tdp_mmu_set_spte_atomic().
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping. Do not mark the page dirty
+ * in KVM's dirty bitmaps.
+ *
+ * If setting the SPTE fails because it has changed, iter->old_spte will be
+ * refreshed to the current value of the spte.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Return:
+ * * 0 - If the SPTE was set.
+ * * -EBUSY - If the SPTE cannot be set. In this case this function will have
+ * no side-effects other than setting iter->old_spte to the last
+ * known value of the spte.
+ */
+static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ int ret;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
+ if (ret)
+ return ret;
+
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
+
+ return 0;
+}
+
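+/*
+ * Callers typically retry on -EBUSY by restarting the current iteration,
+ * relying on iter->old_spte having been refreshed by the failed cmpxchg.
+ * A minimal sketch of the idiom used throughout this file (loop body
+ * abbreviated; not a new API):
+ *
+ *	for_each_tdp_pte(iter, kvm, root, start, end) {
+ * retry:
+ *		...
+ *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ *			goto retry;
+ *	}
+ */
+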
+/*
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: KVM instance
+ * @as_id: Address space ID, i.e. regular vs. SMM
+ * @sptep: Pointer to the SPTE
+ * @old_spte: The current value of the SPTE
+ * @new_spte: The new value that will be set for the SPTE
+ * @gfn: The base GFN that was (or will be) mapped by the SPTE
+ * @level: The level _containing_ the SPTE (its parent PT's level)
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
+ */
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * No thread should be using this function to set SPTEs to or from the
+ * temporary frozen SPTE value.
+ * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+ * should be used. If operating under the MMU lock in write mode, the
+ * use of the frozen SPTE should not be necessary.
+ */
+ WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
+
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
+
+ handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
+
+ /*
+ * Users that do non-atomic setting of PTEs don't operate on mirror
+ * roots, so don't handle it and bug the VM if it's seen.
+ */
+ if (is_mirror_sptep(sptep)) {
+ KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
+ remove_external_spte(kvm, gfn, old_spte, level);
+ }
+
+ return old_spte;
+}
+
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte)
+{
+ WARN_ON_ONCE(iter->yielded);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level);
+}
+
+#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
+
+#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \
+ tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \
+ if (!is_shadow_present_pte(_iter.old_spte) || \
+ !is_last_spte(_iter.old_spte, _iter.level)) \
+ continue; \
+ else
+
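+/*
+ * Example (sketch, hypothetical counter): the leaf-only iterator filters out
+ * non-present and non-leaf entries via the embedded "continue", so a walk
+ * that counts present leaf SPTEs in [start, end) is just:
+ *
+ *	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end)
+ *		nr_leafs++;
+ */
+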
+static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
+ struct tdp_iter *iter)
+{
+ if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
+ return false;
+
+ /* Ensure forward progress has been made before yielding. */
+ return iter->next_last_level_gfn != iter->yielded_gfn;
+}
+
+/*
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
+ *
+ * Returns true if this function yielded.
+ */
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
+{
+ KVM_MMU_WARN_ON(iter->yielded);
+
+ if (!tdp_mmu_iter_need_resched(kvm, iter))
+ return false;
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ rcu_read_unlock();
+
+ if (shared)
+ cond_resched_rwlock_read(&kvm->mmu_lock);
+ else
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
+
+ iter->yielded = true;
+ return true;
+}
+
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
+{
+ /*
+ * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
+ * a gpa range that would exceed the max gfn, and KVM does not create
+ * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+ * the slow emulation path every time.
+ */
+ return kvm_mmu_max_gfn() + 1;
+}
+
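+/*
+ * For example, assuming host.MAXPHYADDR == 52 and 4KiB pages,
+ * kvm_mmu_max_gfn() is (1ull << (52 - 12)) - 1, so the exclusive bound
+ * returned here is 1ull << 40, i.e. walks cover gfns [0, 1ull << 40).
+ */
+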
+static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared, int zap_level)
+{
+ struct tdp_iter iter;
+
+ for_each_tdp_pte_min_level_all(iter, root, zap_level) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ if (iter.level > zap_level)
+ continue;
+
+ if (!shared)
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
+ goto retry;
+ }
+}
+
+static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
+ bool shared)
+{
+ /*
+ * The root must have an elevated refcount so that it's reachable via
+ * mmu_notifier callbacks, which allows this path to yield and drop
+ * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
+ * must drop all references to relevant pages prior to completing the
+ * callback. Dropping mmu_lock with an unreachable root would result
+ * in zapping SPTEs after a relevant mmu_notifier callback completes
+ * and lead to use-after-free as zapping a SPTE triggers "writeback" of
+ * dirty accessed bits to the SPTE's associated struct page.
+ */
+ WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
+ rcu_read_lock();
+
+ /*
+ * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+ * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+ * preempt models) or mmu_lock contention (full or real-time models).
+ * Zapping at finer granularity marginally increases the total time of
+ * the zap, but in most cases the zap itself isn't latency sensitive.
+ *
+ * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+ * in order to mimic the page fault path, which can replace a 1GiB page
+ * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+ * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+ * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+ * inducing RCU stalls, without relying on a relatively rare event
+ * (zapping roots is orders of magnitude more common). Note, because
+ * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+ * in the iterator itself is unnecessary.
+ */
+ if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+ }
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
+ __tdp_mmu_zap_root(kvm, root, shared, root->role.level);
+
+ rcu_read_unlock();
+}
+
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ struct tdp_iter iter = {
+ .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
+ .sptep = sp->ptep,
+ .level = sp->role.level + 1,
+ .gfn = sp->gfn,
+ .as_id = kvm_mmu_page_as_id(sp),
+ };
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
+ return false;
+
+ /*
+ * Root shadow pages don't have a parent page table and thus no
+ * associated entry, but they can never be possible NX huge pages.
+ */
+ if (WARN_ON_ONCE(!sp->ptep))
+ return false;
+
+ /*
+ * Since mmu_lock is held in read mode, it's possible another task has
+ * already modified the SPTE. Zap the SPTE if and only if the SPTE
+ * points at the SP's page table, as checking shadow-present isn't
+ * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
+ * another SP. Note, spte_to_child_pt() also checks that the SPTE is
+ * shadow-present, i.e. guards against zapping a frozen SPTE.
+ */
+ if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
+ return false;
+
+ /*
+ * If a different task modified the SPTE, then it should be impossible
+ * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
+ * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
+ * creating non-leaf SPTEs, and all other bits are immutable for non-
+ * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
+ * zapping and replacement.
+ */
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
+ WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * If can_yield is true, will release the MMU lock and reschedule if the
+ * scheduler needs the CPU or there is contention on the MMU lock. If this
+ * function cannot yield, it will not release the MMU lock or reschedule and
+ * the caller must ensure it does not supply too large a GFN range, or the
+ * operation can cause a soft lockup.
+ */
+static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
+{
+ struct tdp_iter iter;
+
+ end = min(end, tdp_mmu_max_gfn_exclusive());
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
+ flush = false;
+ continue;
+ }
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+
+ /*
+		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
+ * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+ */
+ if (!root->role.invalid)
+ flush = true;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
+ * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
+ */
+ return flush;
+}
+
+/*
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
+ */
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
+
+ return flush;
+}
+
+void kvm_tdp_mmu_zap_all(struct kvm *kvm)
+{
+ struct kvm_mmu_page *root;
+
+ /*
+ * Zap all direct roots, including invalid direct roots, as all direct
+ * SPTEs must be dropped before returning to the caller. For TDX, mirror
+ * roots don't need handling in response to the mmu notifier (the caller).
+ *
+ * Zap directly even if the root is also being zapped by a concurrent
+ * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
+ * and mmu_lock is already held, which means the other thread has yielded.
+ *
+	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
+ * is being destroyed or the userspace VMM has exited. In both cases,
+ * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
+ */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
+ KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
+ tdp_mmu_zap_root(kvm, root, false);
+}
+
+/*
+ * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
+ * zap" completes.
+ */
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
+{
+ struct kvm_mmu_page *root;
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root) {
+ if (!root->tdp_mmu_scheduled_root_to_zap)
+ continue;
+
+ root->tdp_mmu_scheduled_root_to_zap = false;
+ KVM_BUG_ON(!root->role.invalid, kvm);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB
+ * flush when allocating a new root (see kvm_mmu_load()), and
+ * when migrating a vCPU to a different pCPU. Note, the local
+ * TLB flush on reuse also invalidates paging-structure-cache
+ * entries, i.e. TLB entries for intermediate paging structures,
+ * that may be zapped, as such entries are associated with the
+ * ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, shared);
+
+ /*
+		 * The reference needs to be put *after* zapping the root, as
+		 * the root must be reachable by mmu_notifiers while it's being
+		 * zapped.
+ */
+ kvm_tdp_mmu_put_root(kvm, root);
+ }
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
+ * is about to be zapped, e.g. in response to a memslots update. The actual
+ * zapping is done separately so that it happens with mmu_lock held for read,
+ * whereas invalidating roots must be done with mmu_lock held for write (unless
+ * the VM is being destroyed).
+ *
+ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
+ * See kvm_tdp_mmu_alloc_root().
+ */
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types)
+{
+ struct kvm_mmu_page *root;
+
+ /*
+	 * Invalidating invalid roots doesn't make sense; prevent developers from
+ * having to think about it.
+ */
+ if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
+ root_types &= ~KVM_INVALID_ROOTS;
+
+ /*
+ * mmu_lock must be held for write to ensure that a root doesn't become
+ * invalid while there are active readers (invalidating a root while
+ * there are active readers may or may not be problematic in practice,
+ * but it's uncharted territory and not supported).
+ *
+ * Waive the assertion if there are no users of @kvm, i.e. the VM is
+ * being destroyed after all references have been put, or if no vCPUs
+ * have been created (which means there are no roots), i.e. the VM is
+ * being destroyed in an error path of KVM_CREATE_VM.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ refcount_read(&kvm->users_count) && kvm->created_vcpus)
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * As above, mmu_lock isn't held when destroying the VM! There can't
+ * be other references to @kvm, i.e. nothing else can invalidate roots
+ * or get/put references to roots.
+ */
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!tdp_mmu_root_match(root, root_types))
+ continue;
+
+ /*
+ * Note, invalid roots can outlive a memslot update! Invalid
+ * roots must be *zapped* before the memslot update completes,
+ * but a different task can acquire a reference and keep the
+		 * root alive after it's been zapped.
+ */
+ if (!root->role.invalid) {
+ root->tdp_mmu_scheduled_root_to_zap = true;
+ root->role.invalid = true;
+ }
+ }
+}
+
+/*
+ * Installs a last-level SPTE to handle a TDP page fault.
+ * (NPT/EPT violation/misconfiguration)
+ */
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
+ u64 new_spte;
+ int ret = RET_PF_FIXED;
+ bool wrprot = false;
+
+ if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
+ return RET_PF_RETRY;
+
+ if (is_shadow_present_pte(iter->old_spte) &&
+ (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ is_last_spte(iter->old_spte, iter->level)) {
+ WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
+ return RET_PF_SPURIOUS;
+ }
+
+ if (unlikely(!fault->slot))
+ new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+ else
+ wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ fault->pfn, iter->old_spte, fault->prefetch,
+ false, fault->map_writable, &new_spte);
+
+ if (new_spte == iter->old_spte)
+ ret = RET_PF_SPURIOUS;
+ else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ return RET_PF_RETRY;
+ else if (is_shadow_present_pte(iter->old_spte) &&
+ (!is_last_spte(iter->old_spte, iter->level) ||
+ WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
+
+ /*
+ * If the page fault was caused by a write but the page is write
+ * protected, emulation is needed. If the emulation was skipped,
+ * the vCPU would have the same fault again.
+ */
+ if (wrprot && fault->write)
+ ret = RET_PF_WRITE_PROTECTED;
+
+ /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
+ vcpu->stat.pf_mmio_spte_created++;
+ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+ new_spte);
+ ret = RET_PF_EMULATE;
+ } else {
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
+ }
+
+ return ret;
+}
+
+/*
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @sp: The new TDP page table to install.
+ * @shared: This operation is running under the MMU lock in read mode.
+ *
+ * Returns: 0 if the new page table was installed. Non-0 if the page table
+ * could not be installed (e.g. the atomic compare-exchange failed).
+ */
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
+ int ret = 0;
+
+ if (shared) {
+ ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+ if (ret)
+ return ret;
+ } else {
+ tdp_mmu_iter_set_spte(kvm, iter, spte);
+ }
+
+ tdp_account_mmu_page(kvm, sp);
+
+ return 0;
+}
+
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
+
+/*
+ * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
+ * page tables and SPTEs to translate the faulting guest physical address.
+ */
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+{
+ struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
+ struct kvm *kvm = vcpu->kvm;
+ struct tdp_iter iter;
+ struct kvm_mmu_page *sp;
+ int ret = RET_PF_RETRY;
+
+ KVM_MMU_WARN_ON(!root || root->role.invalid);
+
+ kvm_mmu_hugepage_adjust(vcpu, fault);
+
+ trace_kvm_mmu_spte_requested(fault);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
+ int r;
+
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
+
+ /*
+ * If SPTE has been frozen by another thread, just give up and
+		 * retry, avoiding unnecessary page table allocation and freeing.
+ */
+ if (is_frozen_spte(iter.old_spte))
+ goto retry;
+
+ if (iter.level == fault->goal_level)
+ goto map_target_level;
+
+ /* Step down into the lower level page table if it exists. */
+ if (is_shadow_present_pte(iter.old_spte) &&
+ !is_large_pte(iter.old_spte))
+ continue;
+
+ /*
+ * The SPTE is either non-present or points to a huge page that
+ * needs to be split.
+ */
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
+ if (is_mirror_sp(sp))
+ kvm_mmu_alloc_external_spt(vcpu, sp);
+
+ sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
+
+ if (is_shadow_present_pte(iter.old_spte)) {
+			/* Don't support large pages for mirrored roots (TDX) */
+ KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
+ r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+ } else {
+ r = tdp_mmu_link_sp(kvm, &iter, sp, true);
+ }
+
+ /*
+ * Force the guest to retry if installing an upper level SPTE
+ * failed, e.g. because a different task modified the SPTE.
+ */
+ if (r) {
+ tdp_mmu_free_sp(sp);
+ goto retry;
+ }
+
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= iter.level) {
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ if (sp->nx_huge_page_disallowed)
+ track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ }
+ }
+
+ /*
+ * The walk aborted before reaching the target level, e.g. because the
+ * iterator detected an upper level SPTE was frozen during traversal.
+ */
+ WARN_ON_ONCE(iter.level == fault->goal_level);
+ goto retry;
+
+map_target_level:
+ ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
+
+retry:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Used by mmu notifier via kvm_unmap_gfn_range() */
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush)
+{
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
+ flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
+ range->may_block, flush);
+
+ return flush;
+}
+
+/*
+ * Mark the SPTEs mapping the range of GFNs [start, end) unaccessed and return
+ * non-zero if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
+ */
+static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
+{
+ u64 new_spte;
+
+ if (spte_ad_enabled(iter->old_spte)) {
+ iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+ shadow_accessed_mask);
+ new_spte = iter->old_spte & ~shadow_accessed_mask;
+ } else {
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ /*
+ * It is safe for the following cmpxchg to fail. Leave the
+ * Accessed bit set, as the spte is most likely young anyway.
+ */
+ if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+ return;
+ }
+
+ trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+ iter->old_spte, new_spte);
+}
+
+static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range,
+ bool test_only)
+{
+ enum kvm_tdp_mmu_root_types types;
+ struct kvm_mmu_page *root;
+ struct tdp_iter iter;
+ bool ret = false;
+
+ types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
+
+ /*
+	 * Don't support rescheduling; none of the MMU notifiers that funnel
+ * into this helper allow blocking; it'd be dead, wasteful code. Note,
+ * this helper must NOT be used to unmap GFNs, as it processes only
+ * valid roots!
+ */
+ WARN_ON(types & ~KVM_VALID_ROOTS);
+
+ guard(rcu)();
+ for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
+ tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
+ if (!is_accessed_spte(iter.old_spte))
+ continue;
+
+ if (test_only)
+ return true;
+
+ ret = true;
+ kvm_tdp_mmu_age_spte(kvm, &iter);
+ }
+ }
+
+ return ret;
+}
+
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
+}
+
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
+}
+
+/*
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
+ */
+static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end, int min_level)
+{
+ struct tdp_iter iter;
+ u64 new_spte;
+ bool spte_set = false;
+
+ rcu_read_lock();
+
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !(iter.old_spte & PT_WRITABLE_MASK))
+ continue;
+
+ new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ goto retry;
+
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+ return spte_set;
+}
+
+/*
+ * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
+ * only affect leaf SPTEs down to min_level.
+ * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ */
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+ spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages, min_level);
+
+ return spte_set;
+}
+
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
+ if (!sp)
+ return NULL;
+
+ sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!sp->spt) {
+ kmem_cache_free(mmu_page_header_cache, sp);
+ return NULL;
+ }
+
+ return sp;
+}
+
+/* Note, the caller is responsible for initializing @sp. */
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared)
+{
+ const u64 huge_spte = iter->old_spte;
+ const int level = iter->level;
+ int ret, i;
+
+ /*
+ * No need for atomics when writing to sp->spt since the page table has
+ * not been linked in yet and thus is not reachable from any other CPU.
+ */
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
+ sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
+
+ /*
+ * Replace the huge spte with a pointer to the populated lower level
+	 * page table. Since we are making this change without a TLB flush, vCPUs
+ * will see a mix of the split mappings and the original huge mapping,
+ * depending on what's currently in their TLB. This is fine from a
+ * correctness standpoint since the translation will be the same either
+ * way.
+ */
+ ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
+ if (ret)
+ goto out;
+
+ /*
+	 * tdp_mmu_link_sp() will handle subtracting the huge page we
+ * are overwriting from the page stats. But we have to manually update
+ * the page stats with the new present child pages.
+ */
+ kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
+
+out:
+ trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
+ return ret;
+}
+
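+/*
+ * Concretely, splitting a 1GiB SPTE installs SPTE_ENT_PER_PAGE == 512 2MiB
+ * SPTEs: the stats update above adds 512 pages at level - 1, while the 1GiB
+ * page itself was already subtracted when handle_changed_spte() observed the
+ * leaf SPTE being replaced by a non-leaf SPTE.
+ */
+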
+static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct tdp_iter iter;
+
+ rcu_read_lock();
+
+ /*
+ * Traverse the page table splitting all huge pages above the target
+ * level into one lower level. For example, if we encounter a 1GB page
+ * we split it into 512 2MB pages.
+ *
+ * Since the TDP iterator uses a pre-order traversal, we are guaranteed
+ * to visit an SPTE before ever visiting its children, which means we
+ * will correctly recursively split huge pages that are more than one
+	 * level above the target level (e.g. splitting a 1GB page into 512 2MB pages,
+ * and then splitting each of those to 512 4KB pages).
+ */
+ for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
+ continue;
+
+ if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
+ continue;
+
+ if (!sp) {
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split();
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
+ if (!sp) {
+ trace_kvm_mmu_split_huge_page(iter.gfn,
+ iter.old_spte,
+ iter.level, -ENOMEM);
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
+ }
+
+ tdp_mmu_init_child_sp(sp, &iter);
+
+ if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
+ goto retry;
+
+ sp = NULL;
+ }
+
+ rcu_read_unlock();
+
+ /*
+ * It's possible to exit the loop having never used the last sp if, for
+ * example, a vCPU doing HugePage NX splitting wins the race and
+ * installs its own sp in place of the last sp we tried to split.
+ */
+ if (sp)
+ tdp_mmu_free_sp(sp);
+
+ return 0;
+}
+
+
+/*
+ * Try to split all huge pages mapped by the TDP MMU down to the target level.
+ */
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared)
+{
+ struct kvm_mmu_page *root;
+ int r = 0;
+
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
+ r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
+ if (r) {
+ kvm_tdp_mmu_put_root(kvm, root);
+ break;
+ }
+ }
+}
+
+static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ /*
+ * All TDP MMU shadow pages share the same role as their root, aside
+ * from level, so it is valid to key off any shadow page to determine if
+ * write protection is needed for an entire tree.
+ */
+ return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
+}
+
+static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t start, gfn_t end)
+{
+ const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
+ struct tdp_iter iter;
+
+ rcu_read_lock();
+
+ tdp_root_for_each_pte(iter, kvm, root, start, end) {
+retry:
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+ continue;
+
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
+ spte_ad_need_write_protect(iter.old_spte));
+
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
+ goto retry;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
+ * memslot.
+ */
+void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+ clear_dirty_gfn_range(kvm, root, slot->base_gfn,
+ slot->base_gfn + slot->npages);
+}
+
+static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t gfn, unsigned long mask, bool wrprot)
+{
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
+ PT_WRITABLE_MASK : shadow_dirty_mask;
+ struct tdp_iter iter;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ rcu_read_lock();
+
+ tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
+ gfn + BITS_PER_LONG) {
+ if (!mask)
+ break;
+
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
+ spte_ad_need_write_protect(iter.old_spte));
+
+ if (iter.level > PG_LEVEL_4K ||
+ !(mask & (1UL << (iter.gfn - gfn))))
+ continue;
+
+ mask &= ~(1UL << (iter.gfn - gfn));
+
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+ iter.old_spte, dbit,
+ iter.level);
+
+ trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+ iter.old_spte,
+ iter.old_spte & ~dbit);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
+ * which a bit is set in mask, starting at gfn. The given memslot is expected to
+ * contain all the GFNs represented by set bits in the mask.
+ */
+void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long mask,
+ bool wrprot)
+{
+ struct kvm_mmu_page *root;
+
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
+ clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
+}
+
+static int tdp_mmu_make_huge_spte(struct kvm *kvm,
+ struct tdp_iter *parent,
+ u64 *huge_spte)
+{
+ struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
+ gfn_t start = parent->gfn;
+ gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
+ struct tdp_iter iter;
+
+ tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
+ /*
+ * Use the parent iterator when checking for forward progress so
+ * that KVM doesn't get stuck continuously trying to yield (i.e.
+ * returning -EAGAIN here and then failing the forward progress
+ * check in the caller ad nauseam).
+ */
+ if (tdp_mmu_iter_need_resched(kvm, parent))
+ return -EAGAIN;
+
+ *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static void recover_huge_pages_range(struct kvm *kvm,
+ struct kvm_mmu_page *root,
+ const struct kvm_memory_slot *slot)
+{
+ gfn_t start = slot->base_gfn;
+ gfn_t end = start + slot->npages;
+ struct tdp_iter iter;
+ int max_mapping_level;
+ bool flush = false;
+ u64 huge_spte;
+ int r;
+
+ if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
+ return;
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
+retry:
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
+ flush = false;
+ continue;
+ }
+
+ if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
+ !is_shadow_present_pte(iter.old_spte))
+ continue;
+
+ /*
+		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
+		 * a large page, then its parent would have been zapped
+ * instead of stepping down.
+ */
+ if (is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ /*
+ * If iter.gfn resides outside of the slot, i.e. the page for
+ * the current level overlaps but is not contained by the slot,
+ * then the SPTE can't be made huge. More importantly, trying
+ * to query that info from slot->arch.lpage_info will cause an
+ * out-of-bounds access.
+ */
+ if (iter.gfn < start || iter.gfn >= end)
+ continue;
+
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
+ if (max_mapping_level < iter.level)
+ continue;
+
+ r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
+ if (r == -EAGAIN)
+ goto retry;
+ else if (r)
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
+ goto retry;
+
+ flush = true;
+ }
+
+ if (flush)
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+
+ rcu_read_unlock();
+}
+
+/*
+ * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
+ * huge SPTEs where possible.
+ */
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ struct kvm_mmu_page *root;
+
+ lockdep_assert_held_read(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+ recover_huge_pages_range(kvm, root, slot);
+}
+
+/*
+ * Removes write access on the last level SPTE mapping this GFN and unsets the
+ * MMU-writable bit to ensure future writes continue to be intercepted.
+ * Returns true if an SPTE was set and a TLB flush is needed.
+ */
+static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
+ gfn_t gfn, int min_level)
+{
+ struct tdp_iter iter;
+ u64 new_spte;
+ bool spte_set = false;
+
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
+ rcu_read_lock();
+
+ for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
+ new_spte = iter.old_spte &
+ ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+
+ if (new_spte == iter.old_spte)
+ break;
+
+ tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
+ spte_set = true;
+ }
+
+ rcu_read_unlock();
+
+ return spte_set;
+}
+
+/*
+ * Removes write access on the last level SPTE mapping this GFN and unsets the
+ * MMU-writable bit to ensure future writes continue to be intercepted.
+ * Returns true if an SPTE was set and a TLB flush is needed.
+ */
+bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level)
+{
+ struct kvm_mmu_page *root;
+ bool spte_set = false;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
+ spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
+
+ return spte_set;
+}
+
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ */
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+ struct tdp_iter iter;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ int leaf = -1;
+
+ *root_level = vcpu->arch.mmu->root_role.level;
+
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ leaf = iter.level;
+ sptes[leaf] = iter.old_spte;
+ }
+
+ return leaf;
+}
+
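+/*
+ * A hypothetical caller (sketch) dumping the walk; sptes[] is assumed to hold
+ * PT64_ROOT_MAX_LEVEL + 1 entries, as levels index the array directly:
+ *
+ *	kvm_tdp_mmu_walk_lockless_begin();
+ *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
+ *	kvm_tdp_mmu_walk_lockless_end();
+ *	for (level = root_level; leaf >= 1 && level >= leaf; level--)
+ *		pr_info("level %d spte 0x%llx\n", level, sptes[level]);
+ */
+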
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-present. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
+ u64 *spte)
+{
+ /* Fast pf is not supported for mirrored roots */
+ struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
+ struct tdp_iter iter;
+ tdp_ptep_t sptep = NULL;
+
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ *spte = iter.old_spte;
+ sptep = iter.sptep;
+ }
+
+ /*
+ * Perform the rcu_dereference to get the raw spte pointer value since
+ * we are passing it up to fast_page_fault, which is shared with the
+ * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+ * annotation.
+ *
+ * This is safe since fast_page_fault obeys the contracts of this
+ * function as well as all TDP MMU contracts around modifying SPTEs
+ * outside of mmu_lock.
+ */
+ return rcu_dereference(sptep);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
new file mode 100644
index 000000000000..bd62977c9199
--- /dev/null
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_X86_MMU_TDP_MMU_H
+#define __KVM_X86_MMU_TDP_MMU_H
+
+#include <linux/kvm_host.h>
+
+#include "spte.h"
+
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
+void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private);
+
+__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
+{
+ return refcount_inc_not_zero(&root->tdp_mmu_root_count);
+}
+
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+
+enum kvm_tdp_mmu_root_types {
+ KVM_INVALID_ROOTS = BIT(0),
+ KVM_DIRECT_ROOTS = BIT(1),
+ KVM_MIRROR_ROOTS = BIT(2),
+ KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS,
+ KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS,
+};
+
+static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm,
+ enum kvm_gfn_range_filter process)
+{
+ enum kvm_tdp_mmu_root_types ret = 0;
+
+ if (!kvm_has_mirrored_tdp(kvm))
+ return KVM_DIRECT_ROOTS;
+
+ if (process & KVM_FILTER_PRIVATE)
+ ret |= KVM_MIRROR_ROOTS;
+ if (process & KVM_FILTER_SHARED)
+ ret |= KVM_DIRECT_ROOTS;
+
+ WARN_ON_ONCE(!ret);
+
+ return ret;
+}
+
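+/*
+ * Example (sketch): a range operation that targets only private memory passes
+ * KVM_FILTER_PRIVATE and, on a VM with mirrored TDP, gets KVM_MIRROR_ROOTS
+ * back; without mirrored TDP the helper always returns KVM_DIRECT_ROOTS:
+ *
+ *	types = kvm_gfn_range_filter_to_root_types(kvm, KVM_FILTER_PRIVATE);
+ */
+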
+static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu,
+ enum kvm_tdp_mmu_root_types type)
+{
+ if (unlikely(type == KVM_MIRROR_ROOTS))
+ return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
+
+ return root_to_sp(vcpu->arch.mmu->root.hpa);
+}
+
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
+bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp);
+void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
+ enum kvm_tdp_mmu_root_types root_types);
+void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
+
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
+ bool flush);
+bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
+bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
+
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level);
+void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
+void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, unsigned long mask,
+ bool wrprot);
+void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
+
+bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level);
+
+void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ int target_level, bool shared);
+
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+ rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+ rcu_read_unlock();
+}
+
+int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
+ u64 *spte);
+
+#ifdef CONFIG_X86_64
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
deleted file mode 100644
index 42f07b1bfbc9..000000000000
--- a/arch/x86/kvm/mmutrace.h
+++ /dev/null
@@ -1,206 +0,0 @@
-#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KVMMMU_H
-
-#include <linux/tracepoint.h>
-#include <linux/ftrace_event.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvmmmu
-
-#define KVM_MMU_PAGE_FIELDS \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
- __field(bool, unsync)
-
-#define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
- __entry->unsync = sp->unsync;
-
-#define KVM_MMU_PAGE_PRINTK() ({ \
- const char *ret = p->buffer + p->len; \
- static const char *access_str[] = { \
- "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
- }; \
- union kvm_mmu_page_role role; \
- \
- role.word = __entry->role; \
- \
- trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
- " %snxe root %u %s%c", \
- __entry->gfn, role.level, \
- role.cr4_pae ? " pae" : "", \
- role.quadrant, \
- role.direct ? " direct" : "", \
- access_str[role.access], \
- role.invalid ? " invalid" : "", \
- role.nxe ? "" : "!", \
- __entry->root_count, \
- __entry->unsync ? "unsync" : "sync", 0); \
- ret; \
- })
-
-#define kvm_mmu_trace_pferr_flags \
- { PFERR_PRESENT_MASK, "P" }, \
- { PFERR_WRITE_MASK, "W" }, \
- { PFERR_USER_MASK, "U" }, \
- { PFERR_RSVD_MASK, "RSVD" }, \
- { PFERR_FETCH_MASK, "F" }
-
-/*
- * A pagetable walk has started
- */
-TRACE_EVENT(
- kvm_mmu_pagetable_walk,
- TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
- TP_ARGS(addr, write_fault, user_fault, fetch_fault),
-
- TP_STRUCT__entry(
- __field(__u64, addr)
- __field(__u32, pferr)
- ),
-
- TP_fast_assign(
- __entry->addr = addr;
- __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
- | (!!fetch_fault << 4);
- ),
-
- TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
- __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
-);
-
-
-/* We just walked a paging element */
-TRACE_EVENT(
- kvm_mmu_paging_element,
- TP_PROTO(u64 pte, int level),
- TP_ARGS(pte, level),
-
- TP_STRUCT__entry(
- __field(__u64, pte)
- __field(__u32, level)
- ),
-
- TP_fast_assign(
- __entry->pte = pte;
- __entry->level = level;
- ),
-
- TP_printk("pte %llx level %u", __entry->pte, __entry->level)
-);
-
-DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
-
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
- TP_ARGS(table_gfn, index, size),
-
- TP_STRUCT__entry(
- __field(__u64, gpa)
- ),
-
- TP_fast_assign(
- __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
- + index * size;
- ),
-
- TP_printk("gpa %llx", __entry->gpa)
-);
-
-/* We set a pte accessed bit */
-DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
-
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
- TP_ARGS(table_gfn, index, size)
-);
-
-/* We set a pte dirty bit */
-DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
-
- TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-
- TP_ARGS(table_gfn, index, size)
-);
-
-TRACE_EVENT(
- kvm_mmu_walker_error,
- TP_PROTO(u32 pferr),
- TP_ARGS(pferr),
-
- TP_STRUCT__entry(
- __field(__u32, pferr)
- ),
-
- TP_fast_assign(
- __entry->pferr = pferr;
- ),
-
- TP_printk("pferr %x %s", __entry->pferr,
- __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
-);
-
-TRACE_EVENT(
- kvm_mmu_get_page,
- TP_PROTO(struct kvm_mmu_page *sp, bool created),
- TP_ARGS(sp, created),
-
- TP_STRUCT__entry(
- KVM_MMU_PAGE_FIELDS
- __field(bool, created)
- ),
-
- TP_fast_assign(
- KVM_MMU_PAGE_ASSIGN(sp)
- __entry->created = created;
- ),
-
- TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
- __entry->created ? "new" : "existing")
-);
-
-DECLARE_EVENT_CLASS(kvm_mmu_page_class,
-
- TP_PROTO(struct kvm_mmu_page *sp),
- TP_ARGS(sp),
-
- TP_STRUCT__entry(
- KVM_MMU_PAGE_FIELDS
- ),
-
- TP_fast_assign(
- KVM_MMU_PAGE_ASSIGN(sp)
- ),
-
- TP_printk("%s", KVM_MMU_PAGE_PRINTK())
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
- TP_PROTO(struct kvm_mmu_page *sp),
-
- TP_ARGS(sp)
-);
-#endif /* _TRACE_KVMMMU_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE mmutrace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000000..6f74e2b27c1e
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "x86.h"
+
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
+{
+ int index;
+
+ switch (msr) {
+ case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ index = msr - MTRRphysBase_MSR(0);
+ return &vcpu->arch.mtrr_state.var[index];
+ case MSR_MTRRfix64K_00000:
+ return &vcpu->arch.mtrr_state.fixed_64k;
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ index = msr - MSR_MTRRfix16K_80000;
+ return &vcpu->arch.mtrr_state.fixed_16k[index];
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ return &vcpu->arch.mtrr_state.fixed_4k[index];
+ case MSR_MTRRdefType:
+ return &vcpu->arch.mtrr_state.deftype;
+ default:
+ break;
+ }
+ return NULL;
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
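+/*
+ * 0x73 is binary 0111_0011, i.e. bits 0 (UC), 1 (WC), 4 (WT), 5 (WP) and
+ * 6 (WB) are set. For example, valid_mtrr_type(6) evaluates
+ * ((1 << 6) & 0x73) == 0x40 and succeeds, while the reserved type 2 yields
+ * ((1 << 2) & 0x73) == 0 and is rejected.
+ */
+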
+static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
+ return false;
+
+ mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else {
+ /* MTRR mask */
+ mask |= 0x7ff;
+ }
+
+ return (data & mask) == 0;
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ u64 *mtrr;
+
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
+ return 1;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ *mtrr = data;
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 *mtrr;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
+ return 1;
+
+ *pdata = *mtrr;
+ return 0;
+}
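+
+/*
+ * For example, with KVM_NR_VAR_MTRR == 8 the MTRRcap value reported above is
+ * 0x500 | 8 == 0x508: FIX (bit 8) and WC (bit 10) set, SMRR (bit 11) clear,
+ * and VCNT == 8 variable-range MTRRs in bits 7:0.
+ */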
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
deleted file mode 100644
index 2331bdc2b549..000000000000
--- a/arch/x86/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * MMU support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-
-/*
- * We need the mmu code to access both 32-bit and 64-bit guest ptes,
- * so the code in this file is compiled twice, once per pte size.
- */
-
-#if PTTYPE == 64
- #define pt_element_t u64
- #define guest_walker guest_walker64
- #define FNAME(name) paging##64_##name
- #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
- #define PT_LEVEL_BITS PT64_LEVEL_BITS
- #ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
- #define CMPXCHG cmpxchg
- #else
- #define CMPXCHG cmpxchg64
- #define PT_MAX_FULL_LEVELS 2
- #endif
-#elif PTTYPE == 32
- #define pt_element_t u32
- #define guest_walker guest_walker32
- #define FNAME(name) paging##32_##name
- #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
- #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
- #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
- #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
- #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
- #define PT_LEVEL_BITS PT32_LEVEL_BITS
- #define PT_MAX_FULL_LEVELS 2
- #define CMPXCHG cmpxchg
-#else
- #error Invalid PTTYPE value
-#endif
-
-#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
-#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
-
-/*
- * The guest_walker structure emulates the behavior of the hardware page
- * table walker.
- */
-struct guest_walker {
- int level;
- gfn_t table_gfn[PT_MAX_FULL_LEVELS];
- pt_element_t ptes[PT_MAX_FULL_LEVELS];
- gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
- gfn_t gfn;
- u32 error_code;
-};
-
-static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
-{
- return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
-}
-
-static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
- gfn_t table_gfn, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
-{
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
-
- page = gfn_to_page(kvm, table_gfn);
-
- table = kmap_atomic(page, KM_USER0);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table, KM_USER0);
-
- kvm_release_page_dirty(page);
-
- return (ret != orig_pte);
-}
-
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
-{
- unsigned access;
-
- access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-#if PTTYPE == 64
- if (is_nx(vcpu))
- access &= ~(gpte >> PT64_NX_SHIFT);
-#endif
- return access;
-}
-
-/*
- * Fetch a guest pte for a guest virtual address
- */
-static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- int write_fault, int user_fault, int fetch_fault)
-{
- pt_element_t pte;
- gfn_t table_gfn;
- unsigned index, pt_access, pte_access;
- gpa_t pte_gpa;
- int rsvd_fault = 0;
-
- trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
- fetch_fault);
-walk:
- walker->level = vcpu->arch.mmu.root_level;
- pte = vcpu->arch.cr3;
-#if PTTYPE == 64
- if (!is_long_mode(vcpu)) {
- pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
- trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
- --walker->level;
- }
-#endif
- ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
-
- pt_access = ACC_ALL;
-
- for (;;) {
- index = PT_INDEX(addr, walker->level);
-
- table_gfn = gpte_to_gfn(pte);
- pte_gpa = gfn_to_gpa(table_gfn);
- pte_gpa += index * sizeof(pt_element_t);
- walker->table_gfn[walker->level - 1] = table_gfn;
- walker->pte_gpa[walker->level - 1] = pte_gpa;
-
- if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
- goto not_present;
-
- trace_kvm_mmu_paging_element(pte, walker->level);
-
- if (!is_present_gpte(pte))
- goto not_present;
-
- rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
- if (rsvd_fault)
- goto access_error;
-
- if (write_fault && !is_writable_pte(pte))
- if (user_fault || is_write_protection(vcpu))
- goto access_error;
-
- if (user_fault && !(pte & PT_USER_MASK))
- goto access_error;
-
-#if PTTYPE == 64
- if (fetch_fault && (pte & PT64_NX_MASK))
- goto access_error;
-#endif
-
- if (!(pte & PT_ACCESSED_MASK)) {
- trace_kvm_mmu_set_accessed_bit(table_gfn, index,
- sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
- if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
- index, pte, pte|PT_ACCESSED_MASK))
- goto walk;
- pte |= PT_ACCESSED_MASK;
- }
-
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
- walker->ptes[walker->level - 1] = pte;
-
- if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
- ((walker->level == PT_DIRECTORY_LEVEL) &&
- is_large_pte(pte) &&
- (PTTYPE == 64 || is_pse(vcpu))) ||
- ((walker->level == PT_PDPE_LEVEL) &&
- is_large_pte(pte) &&
- is_long_mode(vcpu))) {
- int lvl = walker->level;
-
- walker->gfn = gpte_to_gfn_lvl(pte, lvl);
- walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
- >> PAGE_SHIFT;
-
- if (PTTYPE == 32 &&
- walker->level == PT_DIRECTORY_LEVEL &&
- is_cpuid_PSE36())
- walker->gfn += pse36_gfn_delta(pte);
-
- break;
- }
-
- pt_access = pte_access;
- --walker->level;
- }
-
- if (write_fault && !is_dirty_gpte(pte)) {
- bool ret;
-
- trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
- ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
- pte|PT_DIRTY_MASK);
- if (ret)
- goto walk;
- pte |= PT_DIRTY_MASK;
- walker->ptes[walker->level - 1] = pte;
- }
-
- walker->pt_access = pt_access;
- walker->pte_access = pte_access;
- pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pt_access, pte_access);
- return 1;
-
-not_present:
- walker->error_code = 0;
- goto err;
-
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
-
-err:
- if (write_fault)
- walker->error_code |= PFERR_WRITE_MASK;
- if (user_fault)
- walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
- walker->error_code |= PFERR_FETCH_MASK;
- if (rsvd_fault)
- walker->error_code |= PFERR_RSVD_MASK;
- trace_kvm_mmu_walker_error(walker->error_code);
- return 0;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte;
- unsigned pte_access;
- pfn_t pfn;
- u64 new_spte;
-
- gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (page->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
- return;
- }
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
- return;
- pfn = vcpu->arch.update_pte.pfn;
- if (is_error_pfn(pfn))
- return;
- if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
- return;
- kvm_get_pfn(pfn);
- /*
- * We call mmu_set_spte() with reset_host_protection = true because
- * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
- */
- mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
- gpte_to_gfn(gpte), pfn, true, true);
-}
-
-/*
- * Fetch a shadow pte for a specific level in the paging hierarchy.
- */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *gw,
- int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn)
-{
- unsigned access = gw->pt_access;
- struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep = NULL;
- int direct;
- gfn_t table_gfn;
- int r;
- int level;
- pt_element_t curr_pte;
- struct kvm_shadow_walk_iterator iterator;
-
- if (!is_present_gpte(gw->ptes[gw->level - 1]))
- return NULL;
-
- for_each_shadow_entry(vcpu, addr, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
- if (iterator.level == hlevel) {
- mmu_set_spte(vcpu, sptep, access,
- gw->pte_access & access,
- user_fault, write_fault,
- gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, level,
- gw->gfn, pfn, false, true);
- break;
- }
-
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
- continue;
-
- if (is_large_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-
- if (level <= gw->level) {
- int delta = level - gw->level + 1;
- direct = 1;
- if (!is_dirty_gpte(gw->ptes[level - delta]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
- /* advance table_gfn when emulating 1gb pages with 4k */
- if (delta == 0)
- table_gfn += PT_INDEX(addr, level);
- access &= gw->pte_access;
- } else {
- direct = 0;
- table_gfn = gw->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- direct, access, sptep);
- if (!direct) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(shadow_page, sptep);
- kvm_release_pfn_clean(pfn);
- sptep = NULL;
- break;
- }
- }
-
- spte = __pa(shadow_page->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
- }
-
- return sptep;
-}
-
-/*
- * Page fault handler. There are several causes for a page fault:
- * - there is no shadow pte for the guest pte
- * - write access through a shadow pte marked read only so that we can set
- * the dirty bit
- * - write access to a shadow pte marked read only so we can update the page
- * dirty bitmap, when userspace requests it
- * - mmio access; in this case we will never install a present shadow pte
- * - normal guest page fault due to the guest pte marked not present, not
- * writable, or not executable
- *
- * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
- * a negative value on error.
- */
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
-{
- int write_fault = error_code & PFERR_WRITE_MASK;
- int user_fault = error_code & PFERR_USER_MASK;
- int fetch_fault = error_code & PFERR_FETCH_MASK;
- struct guest_walker walker;
- u64 *sptep;
- int write_pt = 0;
- int r;
- pfn_t pfn;
- int level = PT_PAGE_TABLE_LEVEL;
- unsigned long mmu_seq;
-
- pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
- kvm_mmu_audit(vcpu, "pre page fault");
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- /*
- * Look up the guest pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
- fetch_fault);
-
- /*
- * The page is not mapped by the guest. Let the guest handle it.
- */
- if (!r) {
- pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu, addr, walker.error_code);
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- return 0;
- }
-
- if (walker.level >= PT_DIRECTORY_LEVEL) {
- level = min(walker.level, mapping_level(vcpu, walker.gfn));
- walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
-
- /* mmio */
- if (is_error_pfn(pfn)) {
- pgprintk("gfn %lx is mmio\n", walker.gfn);
- kvm_release_pfn_clean(pfn);
- return 1;
- }
-
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
- goto out_unlock;
- kvm_mmu_free_some_pages(vcpu);
- sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn);
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- sptep, *sptep, write_pt);
-
- if (!write_pt)
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
- ++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return write_pt;
-
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return 0;
-}
-
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
-{
- struct kvm_shadow_walk_iterator iterator;
- gpa_t pte_gpa = -1;
- int level;
- u64 *sptep;
- int need_flush = 0;
-
- spin_lock(&vcpu->kvm->mmu_lock);
-
- for_each_shadow_entry(vcpu, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- if (is_last_spte(*sptep, level)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
- int offset, shift;
-
- shift = PAGE_SHIFT -
- (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
- offset = sp->role.quadrant << shift;
-
- pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
- pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
-
- if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- if (is_large_pte(*sptep))
- --vcpu->kvm->stat.lpages;
- need_flush = 1;
- }
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- break;
- }
-
- if (!is_shadow_present_pte(*sptep))
- break;
- }
-
- if (need_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- atomic_inc(&vcpu->kvm->arch.invlpg_counter);
-
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- if (pte_gpa == -1)
- return;
-
- if (mmu_topup_memory_caches(vcpu))
- return;
- kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
-}
-
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
- u32 *error)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
- r = FNAME(walk_addr)(&walker, vcpu, vaddr,
- !!(access & PFERR_WRITE_MASK),
- !!(access & PFERR_USER_MASK),
- !!(access & PFERR_FETCH_MASK));
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
-
- return gpa;
-}
-
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i, j, offset, r;
- pt_element_t pt[256 / sizeof(pt_element_t)];
- gpa_t pte_gpa;
-
- if (sp->role.direct
- || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
- nonpaging_prefetch_page(vcpu, sp);
- return;
- }
-
- pte_gpa = gfn_to_gpa(sp->gfn);
- if (PTTYPE == 32) {
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
- pte_gpa += offset * sizeof(pt_element_t);
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
- pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
- for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_gpte(pt[j]))
- sp->spt[i+j] = shadow_trap_nonpresent_pte;
- else
- sp->spt[i+j] = shadow_notrap_nonpresent_pte;
- }
-}
-
-/*
- * Using the cached information from sp->gfns is safe because:
- * - The spte has a reference to the struct page, so the pfn for a given gfn
- * can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
- */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- int i, offset, nr_present;
- bool reset_host_protection;
- gpa_t first_pte_gpa;
-
- offset = nr_present = 0;
-
- if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
- first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn = sp->gfns[i];
-
- if (!is_shadow_present_pte(sp->spt[i]))
- continue;
-
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -EINVAL;
-
- if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
- !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
-
- rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_gpte(gpte))
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- __set_spte(&sp->spt[i], nonpresent);
- continue;
- }
-
- nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
- pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
- } else {
- reset_host_protection = 1;
- }
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
- spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
- }
-
- return !nr_present;
-}
-
-#undef pt_element_t
-#undef guest_walker
-#undef FNAME
-#undef PT_BASE_ADDR_MASK
-#undef PT_INDEX
-#undef PT_LEVEL_MASK
-#undef PT_LVL_ADDR_MASK
-#undef PT_LVL_OFFSET_MASK
-#undef PT_LEVEL_BITS
-#undef PT_MAX_FULL_LEVELS
-#undef gpte_to_gfn
-#undef gpte_to_gfn_lvl
-#undef CMPXCHG
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000000..487ad19a236e
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,1150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
+ *
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <asm/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* This is enough to filter the vast majority of currently defined events. */
+#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
+
+/* Unadulterated PMU capabilities of the host, i.e. of hardware. */
+static struct x86_pmu_capability __read_mostly kvm_host_pmu;
+
+/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */
+struct x86_pmu_capability __read_mostly kvm_pmu_cap;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap);
+
+struct kvm_pmu_emulated_event_selectors {
+ u64 INSTRUCTIONS_RETIRED;
+ u64 BRANCH_INSTRUCTIONS_RETIRED;
+};
+static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
+
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
+ X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
+ /* Instruction-Accurate PDIR (PDIR++) */
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ * However, AMD doesn't support fixed counters;
+ * - There are three types of index to access perf counters (PMC):
+ * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ * has MSR_K7_PERFCTRn and, for families 15H and later,
+ * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
+ * aliased to MSR_K7_PERFCTRn.
+ * 2. MSR Index (named idx): This is normally used by the RDPMC
+ * instruction. For instance, AMD's RDPMC instruction uses 0000_0003h
+ * in ECX to access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar
+ * mechanism, except that it also supports fixed counters. idx can be
+ * used as an index into the gp and fixed counters.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
+ * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
+ * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
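+ * For example, on Intel, fixed counter 1 is tracked with a global PMC
+ * index of KVM_FIXED_PMC_BASE_IDX + 1 == 33, while the RDPMC idx a guest
+ * uses to read it is (1u << 30) | 1, bit 30 in ECX selecting the
+ * fixed-counter range.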
+ */
+
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
+ *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+ memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+ static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
+{
+ bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+ int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
+
+ /*
+ * Hybrid PMUs don't play nice with virtualization without careful
+ * configuration by userspace, and KVM's APIs for reporting supported
+ * vPMU features do not account for hybrid PMUs. Disable vPMU support
+ * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ enable_pmu = false;
+ memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu));
+ } else {
+ perf_get_x86_pmu_capability(&kvm_host_pmu);
+ }
+
+ if (enable_pmu) {
+ /*
+ * WARN if perf did NOT disable the hardware PMU even though the
+ * architecturally required number of GP counters isn't present,
+ * i.e. if there is a non-zero number of counters, but fewer than
+ * what is architecturally required.
+ */
+ if (!kvm_host_pmu.num_counters_gp ||
+ WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs))
+ enable_pmu = false;
+ else if (is_intel && !kvm_host_pmu.version)
+ enable_pmu = false;
+ }
+
+ if (!enable_pmu) {
+ memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
+ return;
+ }
+
+ memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu));
+ kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
+ pmu_ops->MAX_NR_GP_COUNTERS);
+ kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
+ KVM_MAX_NR_FIXED_COUNTERS);
+
+ kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+}
+
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ bool skip_pmi = false;
+
+ if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
+ if (!in_pmi) {
+ /*
+ * TODO: KVM is currently _choosing_ to not generate records
+ * for emulated instructions, avoiding BUFFER_OVF PMI when
+ * there are no records. Strictly speaking, records should also
+ * be generated in the right context to improve sampling accuracy.
+ */
+ skip_pmi = true;
+ } else {
+ /* Indicate PEBS overflow PMI to guest. */
+ skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
+ (unsigned long *)&pmu->global_status);
+ }
+ } else {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ }
+
+ if (pmc->intr && !skip_pmi)
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+
+ /*
+ * Ignore asynchronous overflow events for counters that are scheduled
+ * to be reprogrammed, e.g. if a PMI for the previous event races with
+ * KVM's handling of a related guest WRMSR.
+ */
+ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+ return;
+
+ __kvm_perf_overflow(pmc, true);
+
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+ /*
+ * For some model-specific PEBS counters with special capabilities
+ * (PDIR, PDIR++, PDIST), KVM needs to raise the event's precise
+ * level to the maximum value (currently 3, backwards compatible)
+ * so that the perf subsystem assigns a hardware counter with that
+ * capability to the vPMC.
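+ * Note, pmc->idx 0 is GP counter 0 (the PDIST-capable counter on SPR)
+ * and idx 32 is KVM_FIXED_PMC_BASE_IDX, i.e. fixed counter 0 (the
+ * PDIR-capable counter on the CPUs matched above).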
+ */
+ if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+ (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+ return 3;
+
+ /*
+ * A non-zero precision level turns an ordinary guest event into a
+ * guest PEBS event, and triggers the host PEBS PMI handler to
+ * determine whether the PEBS overflow PMI comes from the host
+ * counters or the guest.
+ */
+ return 1;
+}
+
+static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
+{
+ u64 sample_period = (-counter_value) & pmc_bitmask(pmc);
+
+ if (!sample_period)
+ sample_period = pmc_bitmask(pmc) + 1;
+ return sample_period;
+}
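+
+/*
+ * E.g. for a 48-bit counter holding 0xffffffffff00, the sample period is
+ * (-0xffffffffff00) & 0xffffffffffff = 0x100, so the perf event fires
+ * exactly when the guest's counter would roll over; a counter value of 0
+ * yields a full period of pmc_bitmask(pmc) + 1.
+ */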
+
+static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
+ bool exclude_user, bool exclude_kernel,
+ bool intr)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ struct perf_event *event;
+ struct perf_event_attr attr = {
+ .type = type,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_idle = true,
+ .exclude_host = 1,
+ .exclude_user = exclude_user,
+ .exclude_kernel = exclude_kernel,
+ .config = config,
+ };
+ bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);
+
+ attr.sample_period = get_sample_period(pmc, pmc->counter);
+
+ if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
+ (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
+ /*
+ * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+ * period. Just clear the sample period so at least
+ * allocating the counter doesn't fail.
+ */
+ attr.sample_period = 0;
+ }
+ if (pebs) {
+ /*
+ * For most PEBS hardware events, the difference in the software
+ * precision levels of guest and host PEBS events will not affect
+ * the accuracy of the PEBS profiling result, because the "event IP"
+ * in the PEBS record is calibrated on the guest side.
+ */
+ attr.precise_ip = pmc_get_pebs_precise_level(pmc);
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_perf_overflow, pmc);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
+ return PTR_ERR(event);
+ }
+
+ pmc->perf_event = event;
+ pmc_to_pmu(pmc)->event_count++;
+ pmc->is_paused = false;
+ pmc->intr = intr || pebs;
+ return 0;
+}
+
+static bool pmc_pause_counter(struct kvm_pmc *pmc)
+{
+ u64 counter = pmc->counter;
+ u64 prev_counter;
+
+ /* update counter, reset event value to avoid redundant accumulation */
+ if (pmc->perf_event && !pmc->is_paused)
+ counter += perf_event_pause(pmc->perf_event, true);
+
+ /*
+ * Snapshot the previous counter *after* accumulating state from perf.
+ * If overflow already happened, hardware (via perf) is responsible for
+ * generating a PMI. KVM just needs to detect overflow on emulated
+ * counter events that haven't yet been processed.
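+ * E.g. a 48-bit counter at 0xffffffffffff plus one emulated count wraps
+ * to 0, i.e. below the snapshot, and the caller then injects the
+ * overflow that hardware never saw.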
+ */
+ prev_counter = counter & pmc_bitmask(pmc);
+
+ counter += pmc->emulated_counter;
+ pmc->counter = counter & pmc_bitmask(pmc);
+
+ pmc->emulated_counter = 0;
+ pmc->is_paused = true;
+
+ return pmc->counter < prev_counter;
+}
+
+static bool pmc_resume_counter(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event)
+ return false;
+
+ /* recalibrate sample period and check if it's accepted by perf core */
+ if (is_sampling_event(pmc->perf_event) &&
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter)))
+ return false;
+
+ if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
+ (!!pmc->perf_event->attr.precise_ip))
+ return false;
+
+ /* Reuse the perf_event, as pmc_reprogram_counter() would. */
+ perf_event_enable(pmc->perf_event);
+ pmc->is_paused = false;
+
+ return true;
+}
+
+static void pmc_release_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ pmc->current_config = 0;
+ pmc_to_pmu(pmc)->event_count--;
+ }
+}
+
+static void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ pmc_release_perf_event(pmc);
+ }
+}
+
+static void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused ||
+ !is_sampling_event(pmc->perf_event))
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
+void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
+{
+ /*
+ * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
+ * read-modify-write. Adjust the counter value so that its value is
+ * relative to the current count, as reading the current count from
+ * perf is faster than pausing and reprogramming the event in order to
+ * reset it to '0'. Note, this very sneakily offsets the accumulated
+ * emulated count too, by using pmc_read_counter()!
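+ * E.g. after a guest WRMSR of 0, pmc_read_counter() returns 0 (modulo
+ * the counter width) until the perf event counts further, even though
+ * the event may have accumulated ticks never folded into pmc->counter.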
+ */
+ pmc->emulated_counter = 0;
+ pmc->counter += val - pmc_read_counter(pmc);
+ pmc->counter &= pmc_bitmask(pmc);
+ pmc_update_sample_period(pmc);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter);
+
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
+{
+ u64 a = *(u64 *)pa & mask;
+ u64 b = *(u64 *)pb & mask;
+
+ return (a > b) - (a < b);
+}
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both). As a result, the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+ u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+ filter_event_cmp);
+
+ if (!fe)
+ return -1;
+
+ return fe - events;
+}
+
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+ u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+ u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+ BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+ (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+ ARCH_PERFMON_EVENTSEL_UMASK);
+
+ return (umask & mask) == match;
+}
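+
+/*
+ * E.g. in is_filter_entry_match(), a masked entry with umask_mask == 0xff
+ * and umask_match == 0xc0 matches only a guest unit mask of 0xc0, while
+ * umask_mask == 0 and umask_match == 0 match any unit mask; the
+ * BUILD_BUG_ON() verifies the shift lines the mask up with the unit mask
+ * field (bits 15:8) of the guest's event select.
+ */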
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+ u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+ u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+ int i, index;
+
+ index = find_filter_index(events, nevents, event_select);
+ if (index < 0)
+ return false;
+
+ /*
+ * Entries are sorted by the event select. Walk the list in both
+ * directions to process all entries with the targeted event select.
+ */
+ for (i = index; i < nevents; i++) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ for (i = index - 1; i >= 0; i--) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+ u64 eventsel)
+{
+ if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+ !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+ return f->action == KVM_PMU_EVENT_ALLOW;
+
+ return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+ int idx)
+{
+ int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
+
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+
+ return true;
+}
+
+static bool pmc_is_event_allowed(struct kvm_pmc *pmc)
+{
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm *kvm = pmc->vcpu->kvm;
+
+ filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
+ if (!filter)
+ return true;
+
+ if (pmc_is_gp(pmc))
+ return is_gp_event_allowed(filter, pmc->eventsel);
+
+ return is_fixed_event_allowed(filter, pmc->idx);
+}
+
+static int reprogram_counter(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 eventsel = pmc->eventsel;
+ u64 new_config = eventsel;
+ bool emulate_overflow;
+ u8 fixed_ctr_ctrl;
+
+ emulate_overflow = pmc_pause_counter(pmc);
+
+ if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) ||
+ !pmc_is_event_allowed(pmc))
+ return 0;
+
+ if (emulate_overflow)
+ __kvm_perf_overflow(pmc, false);
+
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
+
+ if (pmc_is_fixed(pmc)) {
+ fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
+ eventsel |= ARCH_PERFMON_EVENTSEL_OS;
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
+ eventsel |= ARCH_PERFMON_EVENTSEL_USR;
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
+ eventsel |= ARCH_PERFMON_EVENTSEL_INT;
+ new_config = (u64)fixed_ctr_ctrl;
+ }
+
+ if (pmc->current_config == new_config && pmc_resume_counter(pmc))
+ return 0;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = new_config;
+
+ return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
+}
+
+static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel)
+{
+ /*
+ * Ignore checks for edge detect (all events currently emulated by KVM
+ * are always rising edges), pin control (unsupported by modern CPUs),
+ * and counter mask and its invert flag (KVM doesn't emulate multiple
+ * events in a single clock cycle).
+ *
+ * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit
+ * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34).
+ * Checking the "in HLE/RTM transaction" flags is correct as the vCPU
+ * can't be in a transaction if KVM is emulating an instruction.
+ *
+ * Checking the reserved bits might be wrong if they are defined in the
+ * future, but so could ignoring them, so do the simple thing for now.
+ */
+ return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB);
+}
+
+void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
+{
+ bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1);
+ bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1);
+
+ /*
+ * Do NOT consult the PMU event filters, as the filters must be checked
+ * at the time of emulation to ensure KVM uses fresh information, e.g.
+ * omitting a PMC from a bitmap could result in a missed event if the
+ * filter is changed to allow counting the event.
+ */
+ if (!pmc_is_locally_enabled(pmc))
+ return;
+
+ if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED))
+ bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1);
+
+ if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED))
+ bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation);
+
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
+{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int bit;
+
+ bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
+
+ /*
+ * The reprogramming bitmap can be written asynchronously by something
+ * other than the task that holds vcpu->mutex; take care to clear only
+ * the bits that will actually be processed.
+ */
+ BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
+ atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
+
+ kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
+ /*
+ * If reprogramming fails, e.g. due to contention, re-set the
+ * reprogram bit, i.e. opportunistically try again on the
+ * next PMU refresh. Don't make a new request as doing so can
+ * stall the guest if reprogramming repeatedly fails.
+ */
+ if (reprogram_counter(pmc))
+ set_bit(pmc->idx, pmu->reprogram_pmi);
+ }
+
+ /*
+ * Release unused perf_events if the corresponding guest MSRs weren't
+ * accessed during the last vCPU time slice (need_cleanup is set when
+ * the vCPU is scheduled back in).
+ */
+ if (unlikely(pmu->need_cleanup))
+ kvm_pmu_cleanup(vcpu);
+
+ kvm_for_each_pmc(pmu, pmc, bit, bitmap)
+ kvm_pmu_recalc_pmc_emulation(pmu, pmc);
+}
+
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ /*
+ * On Intel, VMX interception has priority over RDPMC exceptions that
+ * aren't already handled by the emulator, i.e. no additional
+ * checks are needed for Intel PMUs.
+ *
+ * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
+ * i.e. an invalid PMC results in a #GP, not #VMEXIT.
+ */
+ if (!kvm_pmu_ops.check_rdpmc_early)
+ return 0;
+
+ return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
+}
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boottime_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boottime_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u64 mask = ~0ull;
+
+ if (!pmu->version)
+ return 1;
+
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
+ pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
+ if (!pmc)
+ return 1;
+
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
+ (kvm_x86_call(get_cpl)(vcpu) != 0) &&
+ kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
+ return 1;
+
+ *data = pmc_read_counter(pmc) & mask;
+ return 0;
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu)) {
+ kvm_pmu_call(deliver_pmi)(vcpu);
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ }
+}
+
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
+ default:
+ break;
+ }
+ return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
+ kvm_pmu_call(is_valid_msr)(vcpu, msr);
+}
+
+static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
+
+ if (pmc)
+ __set_bit(pmc->idx, pmu->pmc_in_use);
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ msr_info->data = pmu->global_status;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ msr_info->data = pmu->global_ctrl;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ msr_info->data = 0;
+ break;
+ default:
+ return kvm_pmu_call(get_msr)(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u64 diff;
+
+ /*
+ * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
+ * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
+ */
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ /* Per PPR, Read-only MSR. Writes are ignored. */
+ if (!msr_info->host_initiated)
+ break;
+
+ if (data & pmu->global_status_rsvd)
+ return 1;
+
+ pmu->global_status = data;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ data &= ~pmu->global_ctrl_rsvd;
+ fallthrough;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
+ diff = pmu->global_ctrl ^ data;
+ pmu->global_ctrl = data;
+ reprogram_counters(pmu, diff);
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ /*
+ * GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, clears bits in
+ * GLOBAL_STATUS, and so the set of reserved bits is the same.
+ */
+ if (data & pmu->global_status_rsvd)
+ return 1;
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ if (!msr_info->host_initiated)
+ pmu->global_status |= data & ~pmu->global_status_rsvd;
+ break;
+ default:
+ kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
+ return kvm_pmu_call(set_msr)(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
+
+ kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
+ pmc_stop_counter(pmc);
+ pmc->counter = 0;
+ pmc->emulated_counter = 0;
+
+ if (pmc_is_gp(pmc))
+ pmc->eventsel = 0;
+ }
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
+
+ kvm_pmu_call(reset)(vcpu);
+}
+
+/*
+ * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
+ * and/or PERF_CAPABILITIES.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ return;
+
+ /*
+ * Stop/release all existing counters/events before realizing the new
+ * vPMU model.
+ */
+ kvm_pmu_reset(vcpu);
+
+ pmu->version = 0;
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_rsvd = ~0ull;
+ pmu->global_status_rsvd = ~0ull;
+ pmu->fixed_ctr_ctrl_rsvd = ~0ull;
+ pmu->pebs_enable_rsvd = ~0ull;
+ pmu->pebs_data_cfg_rsvd = ~0ull;
+ bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+ if (!vcpu->kvm->arch.enable_pmu)
+ return;
+
+ kvm_pmu_call(refresh)(vcpu);
+
+ /*
+ * At RESET, both Intel and AMD CPUs set all enable bits for general
+ * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
+ * was written for v1 PMUs doesn't unknowingly leave GP counters disabled
+ * in the global controls). Emulate that behavior when refreshing the
+ * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
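+ * E.g. with 8 GP counters, this initializes global_ctrl to 0xff.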
+ */
+ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
+ pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
+
+ bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
+ bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX,
+ pmu->nr_arch_fixed_counters);
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ memset(pmu, 0, sizeof(*pmu));
+ kvm_pmu_call(init)(vcpu);
+}
+
+/* Release perf_events for vPMCs that have been unused for a full time slice. */
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
+ pmu->pmc_in_use, X86_PMC_IDX_MAX);
+
+ kvm_for_each_pmc(pmu, pmc, i, bitmask) {
+ if (pmc->perf_event && !pmc_is_locally_enabled(pmc))
+ pmc_stop_counter(pmc);
+ }
+
+ kvm_pmu_call(cleanup)(vcpu);
+
+ bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_reset(vcpu);
+}
+
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
+{
+ pmc->emulated_counter++;
+ kvm_pmu_request_counter_reprogram(pmc);
+}
+
+static inline bool cpl_is_matched(struct kvm_pmc *pmc)
+{
+ bool select_os, select_user;
+ u64 config;
+
+ if (pmc_is_gp(pmc)) {
+ config = pmc->eventsel;
+ select_os = config & ARCH_PERFMON_EVENTSEL_OS;
+ select_user = config & ARCH_PERFMON_EVENTSEL_USR;
+ } else {
+ config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
+ select_os = config & INTEL_FIXED_0_KERNEL;
+ select_user = config & INTEL_FIXED_0_USER;
+ }
+
+ /*
+ * Skip the CPL lookup, which isn't free on Intel, if the result will
+ * be the same regardless of the CPL.
+ */
+ if (select_os == select_user)
+ return select_os;
+
+ return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
+ select_user;
+}
+
+static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu,
+ const unsigned long *event_pmcs)
+{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i, idx;
+
+ BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
+
+ if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX))
+ return;
+
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX);
+ else if (!bitmap_and(bitmap, event_pmcs,
+ (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
+ return;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+ if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc))
+ continue;
+
+ kvm_pmu_incr_counter(pmc);
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+}
+
+void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired);
+
+void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired);
+
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+ u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+ KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+ KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE;
+ int i;
+
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & ~mask)
+ return false;
+ }
+
+ return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < filter->nevents; i++) {
+ /*
+ * Skip events that are impossible to match against a guest
+ * event. When filtering, only the event select + unit mask
+ * of the guest event is used. To maintain backwards
+ * compatibility, impossible filters can't be rejected :-(
+ */
+ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+ ARCH_PERFMON_EVENTSEL_UMASK))
+ continue;
+ /*
+ * Convert userspace events to a common in-kernel event so
+ * only one code path is needed to support both events. For
+ * the in-kernel events use masked events because they are
+ * flexible enough to handle both cases. To convert to masked
+ * events, all that's needed is to add an "all ones" umask_mask
+ * (unmasked filter events don't support EXCLUDE).
+ */
+ filter->events[j++] = filter->events[i] |
+ (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+ }
+
+ filter->nevents = j;
+}
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i;
+
+ if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+ convert_to_masked_filter(filter);
+ else if (!is_masked_filter_valid(filter))
+ return -EINVAL;
+
+ /*
+ * Sort entries by event select and includes vs. excludes so that all
+ * entries for a given event select can be processed efficiently during
+ * filtering. The EXCLUDE flag uses a more significant bit than the
+ * event select, and so the sorted list is also effectively split into
+ * includes and excludes sub-lists.
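+ * E.g. a masked filter sorted to { include A, include B, exclude C }
+ * yields nr_includes == 2 and nr_excludes == 1, with "includes"
+ * pointing at events[0] and "excludes" at events[2]; non-masked
+ * filters always end up with nr_excludes == 0.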
+ */
+ sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+ filter_sort_cmp, NULL);
+
+ i = filter->nevents;
+ /* Find the first EXCLUDE event (only supported for masked events). */
+ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+ break;
+ }
+ }
+
+ filter->nr_includes = i;
+ filter->nr_excludes = filter->nevents - filter->nr_includes;
+ filter->includes = filter->events;
+ filter->excludes = filter->events + filter->nr_includes;
+
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_pmu_event_filter __user *user_filter = argp;
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm_pmu_event_filter tmp;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ size_t size;
+ int r;
+
+ if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
+ return -EFAULT;
+
+ if (tmp.action != KVM_PMU_EVENT_ALLOW &&
+ tmp.action != KVM_PMU_EVENT_DENY)
+ return -EINVAL;
+
+ if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
+ return -EINVAL;
+
+ if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
+ return -E2BIG;
+
+ size = struct_size(filter, events, tmp.nevents);
+ filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!filter)
+ return -ENOMEM;
+
+ filter->action = tmp.action;
+ filter->nevents = tmp.nevents;
+ filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+ filter->flags = tmp.flags;
+
+ r = -EFAULT;
+ if (copy_from_user(filter->events, user_filter->events,
+ sizeof(filter->events[0]) * filter->nevents))
+ goto cleanup;
+
+ r = prepare_filter_lists(filter);
+ if (r)
+ goto cleanup;
+
+ mutex_lock(&kvm->lock);
+ filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
+ sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
+
+ r = 0;
+cleanup:
+ kfree(filter);
+ return r;
+}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000000..5c3939e91f1d
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#include <linux/nospec.h>
+
+#include <asm/kvm_host.h>
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
+ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
+
+/* Retrieve the control bits for a fixed counter out of IA32_FIXED_CTR_CTRL. */
+#define fixed_ctrl_field(ctrl_reg, idx) \
+ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
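+
+/*
+ * E.g. fixed_ctrl_field(ctrl, 1) extracts bits 7:4 of the control value,
+ * i.e. the enable and PMI bits for fixed counter 1 (4 bits per counter).
+ */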
+
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
+#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED
+
+struct kvm_pmu_ops {
+ struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+ void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+ void (*cleanup)(struct kvm_vcpu *vcpu);
+
+ const u64 EVENTSEL_EVENT;
+ const int MAX_NR_GP_COUNTERS;
+ const int MIN_NR_GP_COUNTERS;
+};
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
+static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
+{
+ /*
+ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
+ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
+ * greater than zero. However, KVM only exposes and emulates the MSR
+ * to/for the guest if the guest PMU supports at least "Architectural
+ * Performance Monitoring Version 2".
+ *
+ * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2.
+ */
+ return pmu->version > 1;
+}
+
+/*
+ * KVM tracks all counters in 64-bit bitmaps, with general purpose counters
+ * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0
+ * is tracked internally via index 32. On Intel (AMD doesn't support fixed
+ * counters), this mirrors how fixed counters are mapped to PERF_GLOBAL_CTRL
+ * and similar MSRs, i.e. tracking fixed counters at base index 32 reduces the
+ * amount of boilerplate needed to iterate over PMCs *and* simplifies common
+ * enable/disable/reset operations.
+ *
+ * WARNING! This helper is only for lookups that are initiated by KVM, it is
+ * NOT safe for guest lookups, e.g. will do the wrong thing if passed a raw
+ * ECX value from RDPMC (fixed counters are accessed by setting bit 30 in ECX
+ * for RDPMC, not by adding 32 to the fixed counter index).
+ */
+static inline struct kvm_pmc *kvm_pmc_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+ if (idx < pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[idx];
+
+ idx -= KVM_FIXED_PMC_BASE_IDX;
+ if (idx >= 0 && idx < pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[idx];
+
+ return NULL;
+}
+
+#define kvm_for_each_pmc(pmu, pmc, i, bitmap) \
+ for_each_set_bit(i, bitmap, X86_PMC_IDX_MAX) \
+ if (!(pmc = kvm_pmc_idx_to_pmc(pmu, i))) \
+ continue; \
+ else \
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter + pmc->emulated_counter;
+
+ if (pmc->perf_event && !pmc->is_paused)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+void pmc_write_counter(struct kvm_pmc *pmc, u64 val);
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
+ u64 data)
+{
+ return !(pmu->global_ctrl_rsvd & data);
+}
+
+/*
+ * Returns the general purpose PMC with the specified MSR. Note that it can be
+ * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * parameter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
+
+ return NULL;
+}
+
+/* Returns the fixed PMC with the specified MSR. */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
+
+ return NULL;
+}
+
+static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (pmc_is_fixed(pmc))
+ return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) &
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
+
+ return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+}
+
+extern struct x86_pmu_capability kvm_pmu_cap;
+
+void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops);
+
+void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc);
+
+static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
+{
+ kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc);
+
+ set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
+{
+ int bit;
+
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
+}
+
+/*
+ * Check if a PMC is enabled by comparing it against global_ctrl bits.
+ *
+ * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled.
+ */
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ return true;
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu);
+void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu);
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
new file mode 100644
index 000000000000..81b4a7acf72e
--- /dev/null
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_REVERSE_CPUID_H
+#define ARCH_X86_KVM_REVERSE_CPUID_H
+
+#include <uapi/asm/kvm.h>
+#include <asm/cpufeature.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Define a KVM-only feature flag.
+ *
+ * For features that are scattered by cpufeatures.h, __feature_translate() also
+ * needs to be updated to translate the kernel-defined feature into the
+ * KVM-defined feature.
+ *
+ * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+ * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+ * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is
+ * needed in this case.
+ */
+#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
+
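+/*
+ * For example, KVM_X86_FEATURE(CPUID_12_EAX, 11) encodes word CPUID_12_EAX
+ * and bit 11 as "word * 32 + bit", the same layout cpufeatures.h uses, so
+ * the helpers below can recover the word with /32 and the bit with &31.
+ */
+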
+/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
+#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
+#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (ECX) */
+#define KVM_X86_FEATURE_MSR_IMM KVM_X86_FEATURE(CPUID_7_1_ECX, 5)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
+#define X86_FEATURE_AVX_VNNI_INT16 KVM_X86_FEATURE(CPUID_7_1_EDX, 10)
+#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
+#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
+#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1)
+#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
+#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3)
+#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
+#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+
+/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
+#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
+#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
+#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+
+/* CPUID level 0x80000007 (EDX). */
+#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
+
+/* CPUID level 0x80000022 (EAX) */
+#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+
+/* CPUID level 0x80000021 (ECX) */
+#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1)
+#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2)
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
+ [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
+ [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+ [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
+ [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
+ [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
+ [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
+ [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
+ [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
+ [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
+ [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX},
+};
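+
+/*
+ * Example lookup (illustrative): x86_feature_cpuid(X86_FEATURE_SGX1)
+ * translates the scattered kernel flag to KVM_X86_FEATURE_SGX1, indexes this
+ * table at CPUID_12_EAX and yields { 0x12, 0, CPUID_EAX }; __feature_bit()
+ * then selects BIT(0) within that register.
+ */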
+
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
+{
+ BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Translate feature bits that are scattered in the kernel's cpufeatures word
+ * into KVM feature words that align with hardware's definitions.
+ */
+static __always_inline u32 __feature_translate(int x86_feature)
+{
+#define KVM_X86_TRANSLATE_FEATURE(f) \
+ case X86_FEATURE_##f: return KVM_X86_FEATURE_##f
+
+ switch (x86_feature) {
+ KVM_X86_TRANSLATE_FEATURE(SGX1);
+ KVM_X86_TRANSLATE_FEATURE(SGX2);
+ KVM_X86_TRANSLATE_FEATURE(SGX_EDECCSSA);
+ KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC);
+ KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
+ KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO);
+ KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO);
+ KVM_X86_TRANSLATE_FEATURE(MSR_IMM);
+ default:
+ return x86_feature;
+ }
+}
+
+static __always_inline u32 __feature_leaf(int x86_feature)
+{
+ u32 x86_leaf = __feature_translate(x86_feature) / 32;
+
+ reverse_cpuid_check(x86_leaf);
+ return x86_leaf;
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ x86_feature = __feature_translate(x86_feature);
+
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
+
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature)
+{
+ unsigned int x86_leaf = __feature_leaf(x86_feature);
+
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ u32 reg)
+{
+ switch (reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ return __cpuid_entry_get_reg(entry, cpuid.reg);
+}
+
+static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ return *reg & __feature_bit(x86_feature);
+}
+
+static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ return cpuid_entry_get(entry, x86_feature);
+}
+
+static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg &= ~__feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ *reg |= __feature_bit(x86_feature);
+}
+
+static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry,
+ unsigned int x86_feature,
+ bool set)
+{
+ u32 *reg = cpuid_entry_get_reg(entry, x86_feature);
+
+ /*
+ * Open coded instead of using cpuid_entry_{clear,set}() to coerce the
+ * compiler into using CMOV instead of Jcc when possible.
+ */
+ if (set)
+ *reg |= __feature_bit(x86_feature);
+ else
+ *reg &= ~__feature_bit(x86_feature);
+}
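+
+/*
+ * Illustrative use (assumed caller, not part of this patch): mirroring a
+ * host capability into a guest CPUID entry for leaf 0x7, index 0:
+ *
+ *	cpuid_entry_change(entry, X86_FEATURE_SMEP,
+ *			   boot_cpu_has(X86_FEATURE_SMEP));
+ */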
+
+#endif /* ARCH_X86_KVM_REVERSE_CPUID_H */
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
new file mode 100644
index 000000000000..f623c5986119
--- /dev/null
+++ b/arch/x86/kvm/smm.c
@@ -0,0 +1,663 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define CHECK_SMRAM32_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)
+
+#define CHECK_SMRAM64_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
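+
+/*
+ * Expansion sketch (assuming ASSERT_STRUCT_OFFSET() wraps a compile-time
+ * offsetof() check): CHECK_SMRAM32_OFFSET(smbase, 0xFEF8) becomes
+ *
+ *	BUILD_BUG_ON(offsetof(struct kvm_smram_state_32, smbase) != 0xF8);
+ *
+ * validating the layout relative to the 0xFE00 state-save base.
+ */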
+
+static void check_smram_offsets(void)
+{
+ /* 32 bit SMRAM image */
+ CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
+ CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
+ CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
+ CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
+ CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
+ CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
+ CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
+ CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
+ CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
+ CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
+ CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
+ CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
+ CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
+ CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
+ CHECK_SMRAM32_OFFSET(fs, 0xFF38);
+ CHECK_SMRAM32_OFFSET(gs, 0xFF44);
+ CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
+ CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
+ CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
+ CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
+ CHECK_SMRAM32_OFFSET(es, 0xFF84);
+ CHECK_SMRAM32_OFFSET(cs, 0xFF90);
+ CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
+ CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
+ CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
+ CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
+ CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
+ CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
+ CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
+ CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
+ CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
+ CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
+ CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
+ CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
+ CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
+ CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
+ CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
+ CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);
+
+ /* 64 bit SMRAM image */
+ CHECK_SMRAM64_OFFSET(es, 0xFE00);
+ CHECK_SMRAM64_OFFSET(cs, 0xFE10);
+ CHECK_SMRAM64_OFFSET(ss, 0xFE20);
+ CHECK_SMRAM64_OFFSET(ds, 0xFE30);
+ CHECK_SMRAM64_OFFSET(fs, 0xFE40);
+ CHECK_SMRAM64_OFFSET(gs, 0xFE50);
+ CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
+ CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
+ CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
+ CHECK_SMRAM64_OFFSET(tr, 0xFE90);
+ CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
+ CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
+ CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
+ CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
+ CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
+ CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
+ CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
+ CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
+ CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
+ CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
+ CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
+ CHECK_SMRAM64_OFFSET(efer, 0xFED0);
+ CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
+ CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
+ CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
+ CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
+	CHECK_SMRAM64_OFFSET(smm_revision,		0xFEFC);
+ CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
+ CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
+ CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
+ CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
+ CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
+ CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
+ CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
+ CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
+ CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
+ CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
+ CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
+ CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
+ CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
+ CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
+ CHECK_SMRAM64_OFFSET(rip, 0xFF78);
+ CHECK_SMRAM64_OFFSET(gprs, 0xFF80);
+
+ BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
+}
+
+#undef CHECK_SMRAM64_OFFSET
+#undef CHECK_SMRAM32_OFFSET
+
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
+{
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_smm_changed);
+
+void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
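+
+/*
+ * For example, a flat 32-bit code segment (g=1, db=1, present=1, s=1,
+ * type=0xb) packs to 0x00c09b00, i.e. bits 8..23 of the descriptor's
+ * attribute dword as stored in the SMRAM image.
+ */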
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_32 *state,
+ u32 *selector, int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ *selector = seg.selector;
+ state->base = seg.base;
+ state->limit = seg.limit;
+ state->flags = enter_smm_get_segment_flags(&seg);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ state->selector = seg.selector;
+ state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
+ state->limit = seg.limit;
+ state->base = seg.base;
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_32 *smram)
+{
+ struct desc_ptr dt;
+ int i;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->eflags = kvm_get_rflags(vcpu);
+ smram->eip = kvm_rip_read(vcpu);
+
+ for (i = 0; i < 8; i++)
+ smram->gprs[i] = kvm_register_read_raw(vcpu, i);
+
+ smram->dr6 = (u32)vcpu->arch.dr6;
+ smram->dr7 = (u32)vcpu->arch.dr7;
+
+ enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
+ enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
+
+ kvm_x86_call(get_gdt)(vcpu, &dt);
+ smram->gdtr.base = dt.address;
+ smram->gdtr.limit = dt.size;
+
+ kvm_x86_call(get_idt)(vcpu, &dt);
+ smram->idtr.base = dt.address;
+ smram->idtr.limit = dt.size;
+
+ enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
+ enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
+ enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);
+
+ enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
+ enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
+ enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);
+
+ smram->cr4 = kvm_read_cr4(vcpu);
+ smram->smm_revision = 0x00020000;
+ smram->smbase = vcpu->arch.smbase;
+
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_64 *smram)
+{
+ struct desc_ptr dt;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);
+
+ smram->rip = kvm_rip_read(vcpu);
+ smram->rflags = kvm_get_rflags(vcpu);
+
+ smram->dr6 = vcpu->arch.dr6;
+ smram->dr7 = vcpu->arch.dr7;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->cr4 = kvm_read_cr4(vcpu);
+
+ smram->smbase = vcpu->arch.smbase;
+	smram->smm_revision = 0x00020064;
+
+ smram->efer = vcpu->arch.efer;
+
+ enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
+
+ kvm_x86_call(get_idt)(vcpu, &dt);
+ smram->idtr.limit = dt.size;
+ smram->idtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
+
+ kvm_x86_call(get_gdt)(vcpu, &dt);
+ smram->gdtr.limit = dt.size;
+ smram->gdtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
+ enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
+ enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
+ enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
+ enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
+ enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
+
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ kvm_msr_read(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, &smram->ssp))
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+}
+#endif
+
+void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ unsigned long cr0;
+ union kvm_smram smram;
+
+ check_smram_offsets();
+
+ memset(smram.bytes, 0, sizeof(smram.bytes));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, &smram.smram64);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, &smram.smram32);
+
+	/*
+	 * Give the vendor's ->enter_smm() callback a chance to make
+	 * ISA-specific changes to the vCPU state (e.g. leave guest mode)
+	 * after we've saved the state into the SMM state-save area.
+	 *
+	 * Kill the VM in the unlikely case of failure, because the VM
+	 * can be left in an undefined state at this point.
+	 */
+ if (kvm_x86_call(enter_smm)(vcpu, &smram))
+ goto error;
+
+ kvm_smm_changed(vcpu, true);
+
+ if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
+ goto error;
+
+ if (kvm_x86_call(get_nmi_mask)(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_call(set_nmi_mask)(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
+
+ kvm_x86_call(set_cr4)(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+
+ if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
+ goto error;
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ if (kvm_x86_call(set_efer)(vcpu, 0))
+ goto error;
+#endif
+
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+ kvm_mmu_reset_context(vcpu);
+ return;
+error:
+ kvm_vm_dead(vcpu->kvm);
+}
+
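+/*
+ * Inverse of enter_smm_get_segment_flags(): unpack the descriptor attributes
+ * saved in the SMRAM image back into a struct kvm_segment.
+ */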
+static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->db = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->present = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+
+ desc->unusable = !desc->present;
+ desc->padding = 0;
+}
+
+static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_32 *state,
+ u16 selector, int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = selector;
+ desc.base = state->base;
+ desc.limit = state->limit;
+ rsm_set_desc_flags(&desc, state->flags);
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+
+static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = state->selector;
+ rsm_set_desc_flags(&desc, state->attributes << 8);
+ desc.limit = state->limit;
+ desc.base = state->base;
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = kvm_set_cr3(vcpu, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = kvm_set_cr0(vcpu, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = kvm_set_cr4(vcpu, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = kvm_set_cr3(vcpu, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_32 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
+ ctxt->_eip = smstate->eip;
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = smstate->gprs[i];
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
+ rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);
+
+ dt.address = smstate->gdtr.base;
+ dt.size = smstate->gdtr.limit;
+ kvm_x86_call(set_gdt)(vcpu, &dt);
+
+ dt.address = smstate->idtr.base;
+ dt.size = smstate->idtr.limit;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+
+ rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
+ rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
+ rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);
+
+ rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
+ rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
+ rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0,
+ smstate->cr3, smstate->cr4);
+
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return r;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_64 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = smstate->gprs[15 - i];
+
+ ctxt->_eip = smstate->rip;
+ ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ if (__kvm_emulate_msr_write(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);
+
+ dt.size = smstate->idtr.limit;
+ dt.address = smstate->idtr.base;
+ kvm_x86_call(set_idt)(vcpu, &dt);
+
+ rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
+
+ dt.size = smstate->gdtr.limit;
+ dt.address = smstate->gdtr.base;
+ kvm_x86_call(set_gdt)(vcpu, &dt);
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
+ rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
+ rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
+ rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
+ rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
+ rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
+
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ kvm_msr_write(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, smstate->ssp))
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ unsigned long cr0;
+ union kvm_smram smram;
+ u64 smbase;
+ int ret;
+
+ smbase = vcpu->arch.smbase;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
+ if (ret < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
+ kvm_x86_call(set_nmi_mask)(vcpu, false);
+
+ kvm_smm_changed(vcpu, false);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
+ struct kvm_segment cs_desc;
+ unsigned long cr4;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PCIDE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.present = 1;
+ kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
+ }
+#endif
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = kvm_read_cr0(vcpu);
+ if (cr0 & X86_CR0_PE)
+ kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) {
+ unsigned long cr4, efer;
+
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PAE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ __kvm_emulate_msr_write(vcpu, MSR_EFER, efer);
+ }
+#endif
+
+ /*
+ * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
+ * mode should happen _after_ loading state from SMRAM. However, KVM
+ * piggybacks the nested VM-Enter flows (which is wrong for many other
+ * reasons), and so nSVM/nVMX would clobber state that is loaded from
+ * SMRAM and from the VMCS/VMCB.
+ */
+ if (kvm_x86_call(leave_smm)(vcpu, &smram))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ ret = rsm_load_state_64(ctxt, &smram.smram64);
+ else
+#endif
+ ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+ /*
+ * If RSM fails and triggers shutdown, architecturally the shutdown
+ * occurs *before* the transition to guest mode. But due to KVM's
+ * flawed handling of RSM to L2 (see above), the vCPU may already be
+ * in_guest_mode(). Force the vCPU out of guest mode before delivering
+ * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
+ * that architecturally shouldn't be possible.
+ */
+ if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+ return ret;
+}
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
new file mode 100644
index 000000000000..db3c88f16138
--- /dev/null
+++ b/arch/x86/kvm/smm.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_SMM_H
+#define ASM_KVM_SMM_H
+
+#include <linux/build_bug.h>
+
+#ifdef CONFIG_KVM_SMM
+
+
+/*
+ * KVM's emulated 32-bit SMM layout, based on the Intel P6 layout
+ * (https://www.sandpile.org/x86/smm.htm).
+ */
+
+struct kvm_smm_seg_state_32 {
+ u32 flags;
+ u32 limit;
+ u32 base;
+} __packed;
+
+struct kvm_smram_state_32 {
+ u32 reserved1[62];
+ u32 smbase;
+ u32 smm_revision;
+ u16 io_inst_restart;
+ u16 auto_hlt_restart;
+ u32 io_restart_rdi;
+ u32 io_restart_rcx;
+ u32 io_restart_rsi;
+ u32 io_restart_rip;
+ u32 cr4;
+
+ /* A20M#, CPL, shutdown and other reserved/undocumented fields */
+ u16 reserved2;
+ u8 int_shadow; /* KVM extension */
+ u8 reserved3[17];
+
+ struct kvm_smm_seg_state_32 ds;
+ struct kvm_smm_seg_state_32 fs;
+ struct kvm_smm_seg_state_32 gs;
+ struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_32 tr;
+ u32 reserved;
+ struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_32 ldtr;
+ struct kvm_smm_seg_state_32 es;
+ struct kvm_smm_seg_state_32 cs;
+ struct kvm_smm_seg_state_32 ss;
+
+ u32 es_sel;
+ u32 cs_sel;
+ u32 ss_sel;
+ u32 ds_sel;
+ u32 fs_sel;
+ u32 gs_sel;
+ u32 ldtr_sel;
+ u32 tr_sel;
+
+ u32 dr7;
+ u32 dr6;
+ u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */
+ u32 eip;
+ u32 eflags;
+ u32 cr3;
+ u32 cr0;
+} __packed;
+
+
+/* KVM's emulated 64-bit SMM layout, based on the AMD64 layout. */
+
+struct kvm_smm_seg_state_64 {
+ u16 selector;
+ u16 attributes;
+ u32 limit;
+ u64 base;
+};
+
+struct kvm_smram_state_64 {
+
+ struct kvm_smm_seg_state_64 es;
+ struct kvm_smm_seg_state_64 cs;
+ struct kvm_smm_seg_state_64 ss;
+ struct kvm_smm_seg_state_64 ds;
+ struct kvm_smm_seg_state_64 fs;
+ struct kvm_smm_seg_state_64 gs;
+	struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit */
+	struct kvm_smm_seg_state_64 ldtr;
+	struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_64 tr;
+
+ /* I/O restart and auto halt restart are not implemented by KVM */
+ u64 io_restart_rip;
+ u64 io_restart_rcx;
+ u64 io_restart_rsi;
+ u64 io_restart_rdi;
+ u32 io_restart_dword;
+ u32 reserved1;
+ u8 io_inst_restart;
+ u8 auto_hlt_restart;
+ u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */
+ u8 int_shadow;
+ u32 reserved2;
+
+ u64 efer;
+
+	/*
+	 * The two fields below are implemented on AMD only; they store the
+	 * SVM guest's VMCB address if the #SMI was received while in guest
+	 * mode.
+	 */
+ u64 svm_guest_flag;
+ u64 svm_guest_vmcb_gpa;
+ u64 svm_guest_virtual_int; /* unknown purpose, not implemented */
+
+ u32 reserved3[3];
+	u32 smm_revision;
+ u32 smbase;
+ u32 reserved4[5];
+
+ u64 ssp;
+ /* svm_* fields below are not implemented by KVM */
+ u64 svm_guest_pat;
+ u64 svm_host_efer;
+ u64 svm_host_cr4;
+ u64 svm_host_cr3;
+ u64 svm_host_cr0;
+
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+	u64 gprs[16]; /* GPRS in the reversed "natural" X86 order (R15/R14/.../RCX/RAX) */
+};
+
+union kvm_smram {
+ struct kvm_smram_state_64 smram64;
+ struct kvm_smram_state_32 smram32;
+ u8 bytes[512];
+};
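+
+/*
+ * Access sketch (mirroring enter_smm() and emulator_leave_smm() in smm.c):
+ * the entire 512-byte image lives at smbase + 0xfe00 in guest memory:
+ *
+ *	union kvm_smram smram;
+ *
+ *	kvm_vcpu_read_guest(vcpu, vcpu->arch.smbase + 0xfe00,
+ *			    smram.bytes, sizeof(smram));
+ */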
+
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE))
+ return -ENOTTY;
+
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ return 0;
+}
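+
+/*
+ * Illustrative caller (assumed, matching the KVM_SMI vCPU ioctl path in
+ * x86.c):
+ *
+ *	case KVM_SMI:
+ *		r = kvm_inject_smi(vcpu);
+ *		break;
+ */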
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm);
+void enter_smm(struct kvm_vcpu *vcpu);
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt);
+void process_smi(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; }
+static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; }
+
+/*
+ * emulator_leave_smm is used as a function pointer, so the
+ * stub is defined in x86.c.
+ */
+#endif /* CONFIG_KVM_SMM */
+
+#endif /* ASM_KVM_SMM_H */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
deleted file mode 100644
index ce438e0fdd26..000000000000
--- a/arch/x86/kvm/svm.c
+++ /dev/null
@@ -1,3428 +0,0 @@
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * AMD SVM support
- *
- * Copyright (C) 2006 Qumranet, Inc.
- *
- * Authors:
- * Yaniv Kamay <yaniv@qumranet.com>
- * Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
-#include <linux/kvm_host.h>
-
-#include "irq.h"
-#include "mmu.h"
-#include "kvm_cache_regs.h"
-#include "x86.h"
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/ftrace_event.h>
-#include <linux/slab.h>
-
-#include <asm/tlbflush.h>
-#include <asm/desc.h>
-
-#include <asm/virtext.h>
-#include "trace.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-#define IOPM_ALLOC_ORDER 2
-#define MSRPM_ALLOC_ORDER 1
-
-#define SEG_TYPE_LDT 2
-#define SEG_TYPE_BUSY_TSS16 3
-
-#define SVM_FEATURE_NPT (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
-
-#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
-#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
-#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
-
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
-static bool erratum_383_found __read_mostly;
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
- MSR_FS_BASE,
-#endif
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct nested_state {
- struct vmcb *hsave;
- u64 hsave_msr;
- u64 vm_cr_msr;
- u64 vmcb;
-
- /* These are the merged vectors */
- u32 *msrpm;
-
- /* gpa pointers to the real vectors */
- u64 vmcb_msrpm;
- u64 vmcb_iopm;
-
- /* A VMEXIT is required but not yet emulated */
- bool exit_required;
-
- /* cache for intercepts of the guest */
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
- u32 intercept_exceptions;
- u64 intercept;
-
-};
-
-#define MSRPM_OFFSETS 16
-static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
-struct vcpu_svm {
- struct kvm_vcpu vcpu;
- struct vmcb *vmcb;
- unsigned long vmcb_pa;
- struct svm_cpu_data *svm_data;
- uint64_t asid_generation;
- uint64_t sysenter_esp;
- uint64_t sysenter_eip;
-
- u64 next_rip;
-
- u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
-
- u32 *msrpm;
-
- struct nested_state nested;
-
- bool nmi_singlestep;
-
- unsigned int3_injected;
- unsigned long int3_rip;
-};
-
-#define MSR_INVALID 0xffffffffU
-
-static struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is always on */
-} direct_access_msrs[] = {
- { .index = MSR_K6_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_INVALID, .always = false },
-};
-
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-static bool npt_enabled = true;
-#else
-static bool npt_enabled;
-#endif
-static int npt = 1;
-
-module_param(npt, int, S_IRUGO);
-
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu);
-static void svm_complete_interrupts(struct vcpu_svm *svm);
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm);
-static int nested_svm_intercept(struct vcpu_svm *svm);
-static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code);
-
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
-{
- return container_of(vcpu, struct vcpu_svm, vcpu);
-}
-
-static inline bool is_nested(struct vcpu_svm *svm)
-{
- return svm->nested.vmcb;
-}
-
-static inline void enable_gif(struct vcpu_svm *svm)
-{
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
-}
-
-static inline void disable_gif(struct vcpu_svm *svm)
-{
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
-}
-
-static inline bool gif_set(struct vcpu_svm *svm)
-{
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
-
-static unsigned long iopm_base;
-
-struct kvm_ldttss_desc {
- u16 limit0;
- u16 base0;
- unsigned base1:8, type:5, dpl:2, p:1;
- unsigned limit1:4, zero0:3, g:1, base2:8;
- u32 base3;
- u32 zero1;
-} __attribute__((packed));
-
-struct svm_cpu_data {
- int cpu;
-
- u64 asid_generation;
- u32 max_asid;
- u32 next_asid;
- struct kvm_ldttss_desc *tss_desc;
-
- struct page *save_area;
-};
-
-static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-static uint32_t svm_features;
-
-struct svm_init_data {
- int cpu;
- int r;
-};
-
-static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-static u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
-
- /* MSR not in any range */
- return MSR_INVALID;
-}
-
-#define MAX_INST_SIZE 15
-
-static inline u32 svm_has(u32 feat)
-{
- return svm_features & feat;
-}
-
-static inline void clgi(void)
-{
- asm volatile (__ex(SVM_CLGI));
-}
-
-static inline void stgi(void)
-{
- asm volatile (__ex(SVM_STGI));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
-}
-
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
-static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
-
- to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.efer = efer;
-}
-
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
-static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 ret = 0;
-
- if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
-}
-
-static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (mask == 0)
- svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
- else
- svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
-
-}
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (svm->vmcb->control.next_rip != 0)
- svm->next_rip = svm->vmcb->control.next_rip;
-
- if (!svm->next_rip) {
- if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
- EMULATE_DONE)
- printk(KERN_DEBUG "%s: NOP\n", __func__);
- return;
- }
- if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
- __func__, kvm_rip_read(vcpu), svm->next_rip);
-
- kvm_rip_write(vcpu, svm->next_rip);
- svm_set_interrupt_shadow(vcpu, 0);
-}
-
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
- bool has_error_code, u32 error_code,
- bool reinject)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /*
- * If we are within a nested VM we'd better #VMEXIT and let the guest
- * handle the exception
- */
- if (!reinject &&
- nested_svm_check_exception(svm, nr, has_error_code, error_code))
- return;
-
- if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
- unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
-
- /*
- * For guest debugging where we have to reinject #BP if some
- * INT3 is guest-owned:
- * Emulate nRIP by moving RIP forward. Will fail if injection
- * raises a fault that is not intercepted. Still better than
- * failing in all cases.
- */
- skip_emulated_instruction(&svm->vcpu);
- rip = kvm_rip_read(&svm->vcpu);
- svm->int3_rip = rip + svm->vmcb->save.cs.base;
- svm->int3_injected = rip - old_rip;
- }
-
- svm->vmcb->control.event_inj = nr
- | SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
- | SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
-}
-
-static void svm_init_erratum_383(void)
-{
- u32 low, high;
- int err;
- u64 val;
-
- /* Only Fam10h is affected */
- if (boot_cpu_data.x86 != 0x10)
- return;
-
- /* Use _safe variants to not break nested virtualization */
- val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
- if (err)
- return;
-
- val |= (1ULL << 47);
-
- low = lower_32_bits(val);
- high = upper_32_bits(val);
-
- native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
-
- erratum_383_found = true;
-}
-
-static int has_svm(void)
-{
- const char *msg;
-
- if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
- }
-
- return 1;
-}
-
-static void svm_hardware_disable(void *garbage)
-{
- cpu_svm_disable();
-}
-
-static int svm_hardware_enable(void *garbage)
-{
-
- struct svm_cpu_data *sd;
- uint64_t efer;
- struct desc_ptr gdt_descr;
- struct desc_struct *gdt;
- int me = raw_smp_processor_id();
-
- rdmsrl(MSR_EFER, efer);
- if (efer & EFER_SVME)
- return -EBUSY;
-
- if (!has_svm()) {
- printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
- me);
- return -EINVAL;
- }
- sd = per_cpu(svm_data, me);
-
- if (!sd) {
- printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
- me);
- return -EINVAL;
- }
-
- sd->asid_generation = 1;
- sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
- sd->next_asid = sd->max_asid + 1;
-
- native_store_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.address;
- sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
- wrmsrl(MSR_EFER, efer | EFER_SVME);
-
- wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
-
- svm_init_erratum_383();
-
- return 0;
-}
-
-static void svm_cpu_uninit(int cpu)
-{
- struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
-
- if (!sd)
- return;
-
- per_cpu(svm_data, raw_smp_processor_id()) = NULL;
- __free_page(sd->save_area);
- kfree(sd);
-}
-
-static int svm_cpu_init(int cpu)
-{
- struct svm_cpu_data *sd;
- int r;
-
- sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
- if (!sd)
- return -ENOMEM;
- sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
- r = -ENOMEM;
- if (!sd->save_area)
- goto err_1;
-
- per_cpu(svm_data, cpu) = sd;
-
- return 0;
-
-err_1:
- kfree(sd);
- return r;
-
-}
-
-static bool valid_msr_intercept(u32 index)
-{
- int i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == index)
- return true;
-
- return false;
-}
-
-static void set_msr_interception(u32 *msrpm, unsigned msr,
- int read, int write)
-{
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
-
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
-
- msrpm[offset] = tmp;
-}
-
-static void svm_vcpu_init_msrpm(u32 *msrpm)
-{
- int i;
-
- memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
-
- set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
-
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
- return;
-
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
-
- /* Add offset to list */
- msrpm_offsets[i] = offset;
-
- return;
- }
-
- /*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
- */
- BUG();
-}
-
-static void init_msrpm_offsets(void)
-{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
-
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
-
- add_msr_offset(offset);
- }
-}
-
-static void svm_enable_lbrv(struct vcpu_svm *svm)
-{
- u32 *msrpm = svm->msrpm;
-
- svm->vmcb->control.lbr_ctl = 1;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-}
-
-static void svm_disable_lbrv(struct vcpu_svm *svm)
-{
- u32 *msrpm = svm->msrpm;
-
- svm->vmcb->control.lbr_ctl = 0;
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- void *iopm_va;
- int r;
-
- iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
- init_msrpm_offsets();
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
- kvm_enable_efer_bits(EFER_FFXSR);
-
- if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME);
- }
-
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
-
- if (!svm_has(SVM_FEATURE_NPT))
- npt_enabled = false;
-
- if (npt_enabled && !npt) {
- printk(KERN_INFO "kvm: Nested Paging disabled\n");
- npt_enabled = false;
- }
-
- if (npt_enabled) {
- printk(KERN_INFO "kvm: Nested Paging enabled\n");
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- return 0;
-
-err:
- __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
- iopm_base = 0;
- return r;
-}
-
-static __exit void svm_hardware_unsetup(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- svm_cpu_uninit(cpu);
-
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = 0;
-}
-
-static void init_seg(struct vmcb_seg *seg)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
- SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
-{
- seg->selector = 0;
- seg->attrib = SVM_SELECTOR_P_MASK | type;
- seg->limit = 0xffff;
- seg->base = 0;
-}
-
-static void init_vmcb(struct vcpu_svm *svm)
-{
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
-
- svm->vcpu.fpu_active = 1;
-
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK |
- INTERCEPT_CR8_MASK;
-
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_exceptions = (1 << PF_VECTOR) |
- (1 << UD_VECTOR) |
- (1 << MC_VECTOR);
-
-
- control->intercept = (1ULL << INTERCEPT_INTR) |
- (1ULL << INTERCEPT_NMI) |
- (1ULL << INTERCEPT_SMI) |
- (1ULL << INTERCEPT_SELECTIVE_CR0) |
- (1ULL << INTERCEPT_CPUID) |
- (1ULL << INTERCEPT_INVD) |
- (1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPG) |
- (1ULL << INTERCEPT_INVLPGA) |
- (1ULL << INTERCEPT_IOIO_PROT) |
- (1ULL << INTERCEPT_MSR_PROT) |
- (1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_SHUTDOWN) |
- (1ULL << INTERCEPT_VMRUN) |
- (1ULL << INTERCEPT_VMMCALL) |
- (1ULL << INTERCEPT_VMLOAD) |
- (1ULL << INTERCEPT_VMSAVE) |
- (1ULL << INTERCEPT_STGI) |
- (1ULL << INTERCEPT_CLGI) |
- (1ULL << INTERCEPT_SKINIT) |
- (1ULL << INTERCEPT_WBINVD) |
- (1ULL << INTERCEPT_MONITOR) |
- (1ULL << INTERCEPT_MWAIT);
-
- control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = __pa(svm->msrpm);
- control->tsc_offset = 0;
- control->int_ctl = V_INTR_MASKING_MASK;
-
- init_seg(&save->es);
- init_seg(&save->ss);
- init_seg(&save->ds);
- init_seg(&save->fs);
- init_seg(&save->gs);
-
- save->cs.selector = 0xf000;
- /* Executable/Readable Code Segment */
- save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
- SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
- save->cs.limit = 0xffff;
- /*
- * cs.base should really be 0xffff0000, but vmx can't handle that, so
- * be consistent with it.
- *
- * Replace when we have real mode working for vmx.
- */
- save->cs.base = 0xf0000;
-
- save->gdtr.limit = 0xffff;
- save->idtr.limit = 0xffff;
-
- init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
- init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
-
- save->efer = EFER_SVME;
- save->dr6 = 0xffff0ff0;
- save->dr7 = 0x400;
- save->rflags = 2;
- save->rip = 0x0000fff0;
- svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
-
- /*
- * This is the guest-visible cr0 value.
- * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
- */
- svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
-
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-
- if (npt_enabled) {
- /* Setup VMCB for Nested Paging */
- control->nested_ctl = 1;
- control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_INVLPG));
- control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
- control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
- save->g_pat = 0x0007040600070406ULL;
- save->cr3 = 0;
- save->cr4 = 0;
- }
- force_new_asid(&svm->vcpu);
-
- svm->nested.vmcb = 0;
- svm->vcpu.arch.hflags = 0;
-
- if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
- control->pause_filter_count = 3000;
- control->intercept |= (1ULL << INTERCEPT_PAUSE);
- }
-
- enable_gif(svm);
-}
-
-static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- init_vmcb(svm);
-
- if (!kvm_vcpu_is_bsp(vcpu)) {
- kvm_rip_write(vcpu, 0);
- svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
- svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
- }
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
-
- return 0;
-}
-
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
-{
- struct vcpu_svm *svm;
- struct page *page;
- struct page *msrpm_pages;
- struct page *hsave_page;
- struct page *nested_msrpm_pages;
- int err;
-
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- if (!svm) {
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_vcpu_init(&svm->vcpu, kvm, id);
- if (err)
- goto free_svm;
-
- err = -ENOMEM;
- page = alloc_page(GFP_KERNEL);
- if (!page)
- goto uninit;
-
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
- if (!msrpm_pages)
- goto free_page1;
-
- nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
- if (!nested_msrpm_pages)
- goto free_page2;
-
- hsave_page = alloc_page(GFP_KERNEL);
- if (!hsave_page)
- goto free_page3;
-
- svm->nested.hsave = page_address(hsave_page);
-
- svm->msrpm = page_address(msrpm_pages);
- svm_vcpu_init_msrpm(svm->msrpm);
-
- svm->nested.msrpm = page_address(nested_msrpm_pages);
- svm_vcpu_init_msrpm(svm->nested.msrpm);
-
- svm->vmcb = page_address(page);
- clear_page(svm->vmcb);
- svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
- svm->asid_generation = 0;
- init_vmcb(svm);
-
- fx_init(&svm->vcpu);
- svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
-
- return &svm->vcpu;
-
-free_page3:
- __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
-free_page1:
- __free_page(page);
-uninit:
- kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
- kmem_cache_free(kvm_vcpu_cache, svm);
-out:
- return ERR_PTR(err);
-}
-
-static void svm_free_vcpu(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
- __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
- __free_page(virt_to_page(svm->nested.hsave));
- __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(kvm_vcpu_cache, svm);
-}
-
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- if (unlikely(cpu != vcpu->cpu)) {
- u64 delta;
-
- if (check_tsc_unstable()) {
- /*
- * Make sure that the guest sees a monotonically
- * increasing TSC.
- */
- delta = vcpu->arch.host_tsc - native_read_tsc();
- svm->vmcb->control.tsc_offset += delta;
- if (is_nested(svm))
- svm->nested.hsave->control.tsc_offset += delta;
- }
- vcpu->cpu = cpu;
- kvm_migrate_timers(vcpu);
- svm->asid_generation = 0;
- }
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-}
-
-static void svm_vcpu_put(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
- ++vcpu->stat.host_state_reload;
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
- vcpu->arch.host_tsc = native_read_tsc();
-}
-
-static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
-{
- return to_svm(vcpu)->vmcb->save.rflags;
-}
-
-static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- to_svm(vcpu)->vmcb->save.rflags = rflags;
-}
-
-static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
-{
- switch (reg) {
- case VCPU_EXREG_PDPTR:
- BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.cr3);
- break;
- default:
- BUG();
- }
-}
-
-static void svm_set_vintr(struct vcpu_svm *svm)
-{
- svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
-}
-
-static void svm_clear_vintr(struct vcpu_svm *svm)
-{
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
-}
-
-static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- switch (seg) {
- case VCPU_SREG_CS: return &save->cs;
- case VCPU_SREG_DS: return &save->ds;
- case VCPU_SREG_ES: return &save->es;
- case VCPU_SREG_FS: return &save->fs;
- case VCPU_SREG_GS: return &save->gs;
- case VCPU_SREG_SS: return &save->ss;
- case VCPU_SREG_TR: return &save->tr;
- case VCPU_SREG_LDTR: return &save->ldtr;
- }
- BUG();
- return NULL;
-}
-
-static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- return s->base;
-}
-
-static void svm_get_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- var->base = s->base;
- var->limit = s->limit;
- var->selector = s->selector;
- var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
- var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
- var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
- var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
- var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
- var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
- var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
-
- /*
- * AMD's VMCB does not have an explicit unusable field, so emulate it
- * for cross-vendor migration purposes by deriving it from "not present"
- */
- var->unusable = !var->present || (var->type == 0);
-
- switch (seg) {
- case VCPU_SREG_CS:
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- var->g = s->limit > 0xfffff;
- break;
- case VCPU_SREG_TR:
- /*
- * Work around a bug where the busy flag in the tr selector
- * isn't exposed
- */
- var->type |= 0x2;
- break;
- case VCPU_SREG_DS:
- case VCPU_SREG_ES:
- case VCPU_SREG_FS:
- case VCPU_SREG_GS:
- /*
- * The accessed bit must always be set in the segment
- * descriptor cache: even if it is cleared in the in-memory
- * descriptor, the cached copy remains 1. Since Intel
- * checks this bit on VM entry, set it here to support
- * cross-vendor migration.
- */
- if (!var->unusable)
- var->type |= 0x1;
- break;
- case VCPU_SREG_SS:
- /*
- * On AMD CPUs sometimes the DB bit in the segment
- * descriptor is left as 1, although the whole segment has
- * been made unusable. Clear it here to pass an Intel VMX
- * entry check when cross vendor migrating.
- */
- if (var->unusable)
- var->db = 0;
- break;
- }
-}
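
/*
 * Illustrative sketch (not part of the patch): how the attribute bits that
 * svm_get_segment() above unpacks are laid out in the VMCB attrib field
 * (type at bits 0-3, S at 4, DPL at 5-6, P at 7, AVL at 8, L at 9, DB at
 * 10, G at 11, matching the SVM_SELECTOR_* shifts). Names here are
 * hypothetical.
 */
#include <stdint.h>

static uint16_t pack_seg_attrib(uint8_t type, int s, int dpl, int present,
                                int avl, int l, int db, int g)
{
        return (type & 0xf) |
               ((s & 1) << 4) | ((dpl & 3) << 5) | ((present & 1) << 7) |
               ((avl & 1) << 8) | ((l & 1) << 9) |
               ((db & 1) << 10) | ((g & 1) << 11);
}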
-
-static int svm_get_cpl(struct kvm_vcpu *vcpu)
-{
- struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
-
- return save->cpl;
-}
-
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->size = svm->vmcb->save.idtr.limit;
- dt->address = svm->vmcb->save.idtr.base;
-}
-
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.idtr.limit = dt->size;
- svm->vmcb->save.idtr.base = dt->address;
-}
-
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- dt->size = svm->vmcb->save.gdtr.limit;
- dt->address = svm->vmcb->save.gdtr.base;
-}
-
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.gdtr.limit = dt->size;
- svm->vmcb->save.gdtr.base = dt->address;
-}
-
-static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-}
-
-static void update_cr0_intercept(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = svm->vmcb;
- ulong gcr0 = svm->vcpu.arch.cr0;
- u64 *hcr0 = &svm->vmcb->save.cr0;
-
- if (!svm->vcpu.fpu_active)
- *hcr0 |= SVM_CR0_SELECTIVE_MASK;
- else
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
-
- if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
- vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
- }
- } else {
- svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- }
- }
-}
-
-static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (is_nested(svm)) {
- /*
- * We are here because we run in nested mode: the host kvm
- * intercepts all cr0 writes, but the L1 hypervisor does not
- * necessarily intercept them. The L1 hypervisor may, however,
- * intercept selective cr0 writes, which needs to be checked here.
- */
- unsigned long old, new;
-
- /* Remove bits that would trigger a real cr0 write intercept */
- old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
- new = cr0 & SVM_CR0_SELECTIVE_MASK;
-
- if (old == new) {
- /* cr0 write with ts and mp unchanged */
- svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
- if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
- return;
- }
- }
-
-#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
- vcpu->arch.efer |= EFER_LMA;
- svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
- }
-
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
- vcpu->arch.efer &= ~EFER_LMA;
- svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
- }
- }
-#endif
- vcpu->arch.cr0 = cr0;
-
- if (!npt_enabled)
- cr0 |= X86_CR0_PG | X86_CR0_WP;
-
- if (!vcpu->fpu_active)
- cr0 |= X86_CR0_TS;
- /*
- * Re-enable caching here because the QEMU BIOS does not do
- * it; leaving caching disabled causes a noticeable delay at
- * reboot.
- */
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
- svm->vmcb->save.cr0 = cr0;
- update_cr0_intercept(svm);
-}
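
/*
 * Illustrative sketch (not part of the patch): the "selective CR0 write"
 * test used in svm_set_cr0() above. SVM_CR0_SELECTIVE_MASK covers CR0.TS
 * (bit 3) and CR0.MP (bit 1); a write that leaves both unchanged is the
 * candidate for reflecting SVM_EXIT_CR0_SEL_WRITE to the L1 hypervisor.
 * Names here are hypothetical.
 */
#include <stdbool.h>

#define X86_CR0_MP_BIT (1ul << 1)
#define X86_CR0_TS_BIT (1ul << 3)
#define CR0_SELECTIVE  (X86_CR0_MP_BIT | X86_CR0_TS_BIT)

static bool ts_mp_unchanged(unsigned long old_cr0, unsigned long new_cr0)
{
        return (old_cr0 & CR0_SELECTIVE) == (new_cr0 & CR0_SELECTIVE);
}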
-
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
- unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
-
- if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- force_new_asid(vcpu);
-
- vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
- cr4 |= X86_CR4_PAE;
- cr4 |= host_cr4_mce;
- to_svm(vcpu)->vmcb->save.cr4 = cr4;
-}
-
-static void svm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_seg *s = svm_seg(vcpu, seg);
-
- s->base = var->base;
- s->limit = var->limit;
- s->selector = var->selector;
- if (var->unusable)
- s->attrib = 0;
- else {
- s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
- s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
- s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
- s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
- s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
- s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
- s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
- s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
- }
- if (seg == VCPU_SREG_CS)
- svm->vmcb->save.cpl
- = (svm->vmcb->save.cs.attrib
- >> SVM_SELECTOR_DPL_SHIFT) & 3;
-}
-
-static void update_db_intercept(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.intercept_exceptions &=
- ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
-
- if (svm->nmi_singlestep)
- svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- svm->vmcb->control.intercept_exceptions |=
- 1 << DB_VECTOR;
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- svm->vmcb->control.intercept_exceptions |=
- 1 << BP_VECTOR;
- } else
- vcpu->guest_debug = 0;
-}
-
-static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
- else
- svm->vmcb->save.dr7 = vcpu->arch.dr7;
-
- update_db_intercept(vcpu);
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
-
-static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
-{
- if (sd->next_asid > sd->max_asid) {
- ++sd->asid_generation;
- sd->next_asid = 1;
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
- }
-
- svm->asid_generation = sd->asid_generation;
- svm->vmcb->control.asid = sd->next_asid++;
-}
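
/*
 * Illustrative sketch (not part of the patch): the per-cpu ASID allocator
 * implemented by new_asid() above. Exhausting the ASID space bumps a
 * generation counter and requests a full TLB flush; a vcpu whose cached
 * generation no longer matches simply takes a fresh ASID on its next run.
 * Types and names here are hypothetical.
 */
#include <stdint.h>
#include <stdbool.h>

struct asid_state {
        uint32_t next_asid;
        uint32_t max_asid;
        uint64_t generation;
};

static uint32_t alloc_asid(struct asid_state *sd, uint64_t *vcpu_generation,
                           bool *flush_tlb)
{
        *flush_tlb = false;
        if (sd->next_asid > sd->max_asid) {
                ++sd->generation;
                sd->next_asid = 1;      /* ASID 0 stays reserved for the host */
                *flush_tlb = true;
        }
        *vcpu_generation = sd->generation;
        return sd->next_asid++;
}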
-
-static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->save.dr7 = value;
-}
-
-static int pf_interception(struct vcpu_svm *svm)
-{
- u64 fault_address;
- u32 error_code;
-
- fault_address = svm->vmcb->control.exit_info_2;
- error_code = svm->vmcb->control.exit_info_1;
-
- trace_kvm_page_fault(fault_address, error_code);
- if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
-}
-
-static int db_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- if (!(svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
- !svm->nmi_singlestep) {
- kvm_queue_exception(&svm->vcpu, DB_VECTOR);
- return 1;
- }
-
- if (svm->nmi_singlestep) {
- svm->nmi_singlestep = false;
- if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
- svm->vmcb->save.rflags &=
- ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(&svm->vcpu);
- }
-
- if (svm->vcpu.guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc =
- svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = DB_VECTOR;
- return 0;
- }
-
- return 1;
-}
-
-static int bp_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- kvm_run->exit_reason = KVM_EXIT_DEBUG;
- kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
- kvm_run->debug.arch.exception = BP_VECTOR;
- return 0;
-}
-
-static int ud_interception(struct vcpu_svm *svm)
-{
- int er;
-
- er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
- if (er != EMULATE_DONE)
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static void svm_fpu_activate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 excp;
-
- if (is_nested(svm)) {
- u32 h_excp, n_excp;
-
- h_excp = svm->nested.hsave->control.intercept_exceptions;
- n_excp = svm->nested.intercept_exceptions;
- h_excp &= ~(1 << NM_VECTOR);
- excp = h_excp | n_excp;
- } else {
- excp = svm->vmcb->control.intercept_exceptions;
- excp &= ~(1 << NM_VECTOR);
- }
-
- svm->vmcb->control.intercept_exceptions = excp;
-
- svm->vcpu.fpu_active = 1;
- update_cr0_intercept(svm);
-}
-
-static int nm_interception(struct vcpu_svm *svm)
-{
- svm_fpu_activate(&svm->vcpu);
- return 1;
-}
-
-static bool is_erratum_383(void)
-{
- int err, i;
- u64 value;
-
- if (!erratum_383_found)
- return false;
-
- value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
- if (err)
- return false;
-
- /* Bit 62 may or may not be set for this MCE */
- value &= ~(1ULL << 62);
-
- if (value != 0xb600000000010015ULL)
- return false;
-
- /* Clear MCi_STATUS registers */
- for (i = 0; i < 6; ++i)
- native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
-
- value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
- if (!err) {
- u32 low, high;
-
- value &= ~(1ULL << 2);
- low = lower_32_bits(value);
- high = upper_32_bits(value);
-
- native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
- }
-
- /* Flush tlb to evict multi-match entries */
- __flush_tlb_all();
-
- return true;
-}
-
-static void svm_handle_mce(struct vcpu_svm *svm)
-{
- if (is_erratum_383()) {
- /*
- * Erratum 383 triggered. Guest state is corrupt so kill the
- * guest.
- */
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
-
- set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
-
- return;
- }
-
- /*
- * On an #MC intercept the MCE handler is not called automatically in
- * the host. So do it by hand here.
- */
- asm volatile (
- "int $0x12\n");
- /* not sure if we ever come back to this point */
-
- return;
-}
-
-static int mc_interception(struct vcpu_svm *svm)
-{
- return 1;
-}
-
-static int shutdown_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- /*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
- */
- clear_page(svm->vmcb);
- init_vmcb(svm);
-
- kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
- return 0;
-}
-
-static int io_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
- int size, in, string;
- unsigned port;
-
- ++svm->vcpu.stat.io_exits;
- string = (io_info & SVM_IOIO_STR_MASK) != 0;
- in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
-
- port = io_info >> 16;
- size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
- svm->next_rip = svm->vmcb->control.exit_info_2;
- skip_emulated_instruction(&svm->vcpu);
-
- return kvm_fast_pio_out(vcpu, size, port);
-}
-
-static int nmi_interception(struct vcpu_svm *svm)
-{
- return 1;
-}
-
-static int intr_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.irq_exits;
- return 1;
-}
-
-static int nop_on_interception(struct vcpu_svm *svm)
-{
- return 1;
-}
-
-static int halt_interception(struct vcpu_svm *svm)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
- skip_emulated_instruction(&svm->vcpu);
- return kvm_emulate_halt(&svm->vcpu);
-}
-
-static int vmmcall_interception(struct vcpu_svm *svm)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- kvm_emulate_hypercall(&svm->vcpu);
- return 1;
-}
-
-static int nested_svm_check_permissions(struct vcpu_svm *svm)
-{
- if (!(svm->vcpu.arch.efer & EFER_SVME)
- || !is_paging(&svm->vcpu)) {
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
- }
-
- if (svm->vmcb->save.cpl) {
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- }
-
- return 0;
-}
-
-static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
- bool has_error_code, u32 error_code)
-{
- int vmexit;
-
- if (!is_nested(svm))
- return 0;
-
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = error_code;
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-
- vmexit = nested_svm_intercept(svm);
- if (vmexit == NESTED_EXIT_DONE)
- svm->nested.exit_required = true;
-
- return vmexit;
-}
-
-/* This function returns true if it is safe to enable the IRQ window */
-static inline bool nested_svm_intr(struct vcpu_svm *svm)
-{
- if (!is_nested(svm))
- return true;
-
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- return true;
-
- if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
- return false;
-
- svm->vmcb->control.exit_code = SVM_EXIT_INTR;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- if (svm->nested.intercept & 1ULL) {
- /*
- * The #vmexit can't be emulated here directly because this
- * code path runs with IRQs and preemption disabled. A
- * #vmexit emulation might sleep. Only signal the request for
- * the #vmexit here.
- */
- svm->nested.exit_required = true;
- trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
- return false;
- }
-
- return true;
-}
-
-/* This function returns true if it is safe to enable the NMI window */
-static inline bool nested_svm_nmi(struct vcpu_svm *svm)
-{
- if (!is_nested(svm))
- return true;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
- return true;
-
- svm->vmcb->control.exit_code = SVM_EXIT_NMI;
- svm->nested.exit_required = true;
-
- return false;
-}
-
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
- struct page *page;
-
- might_sleep();
-
- page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
- if (is_error_page(page))
- goto error;
-
- *_page = page;
-
- return kmap(page);
-
-error:
- kvm_release_page_clean(page);
- kvm_inject_gp(&svm->vcpu, 0);
-
- return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
- kunmap(page);
- kvm_release_page_dirty(page);
-}
-
-static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
-{
- unsigned port;
- u8 val, bit;
- u64 gpa;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
- return NESTED_EXIT_HOST;
-
- port = svm->vmcb->control.exit_info_1 >> 16;
- gpa = svm->nested.vmcb_iopm + (port / 8);
- bit = port % 8;
- val = 0;
-
- /* kvm_read_guest() returns non-zero on failure; reflect a failed read to L1 */
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
- return NESTED_EXIT_DONE;
-
- val &= (1 << bit);
-
- return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
-}
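
/*
 * Illustrative sketch (not part of the patch): the IO permission bitmap
 * lookup performed by nested_svm_intercept_ioio() above. One bit per port:
 * byte index port / 8, bit index port % 8, so port 0x3f8 (COM1) is byte
 * 127, bit 0. Names here are hypothetical.
 */
#include <stdint.h>
#include <stdbool.h>

static bool iopm_port_intercepted(const uint8_t *iopm, uint16_t port)
{
        return iopm[port / 8] & (1 << (port % 8));
}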
-
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
-{
- u32 offset, msr, value;
- int write, mask;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return NESTED_EXIT_HOST;
-
- msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
- write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
-
- if (offset == MSR_INVALID)
- return NESTED_EXIT_DONE;
-
- /* The offset is in 32-bit units, but we need it in byte units */
- offset *= 4;
-
- if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
- return NESTED_EXIT_DONE;
-
- return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
-}
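
/*
 * Illustrative sketch (not part of the patch): the bit math used by
 * nested_svm_exit_handled_msr() above. The MSR permission bitmap stores two
 * bits per MSR (read intercept, then write intercept), sixteen MSRs per
 * 32-bit word, so within the word selected by svm_msrpm_offset() the bit is
 * 2 * (msr & 0xf) + write. Names here are hypothetical.
 */
#include <stdint.h>
#include <stdbool.h>

static bool msrpm_word_intercepted(uint32_t msrpm_word, uint32_t msr, bool write)
{
        uint32_t bit = 2 * (msr & 0xf) + (write ? 1 : 0);

        return msrpm_word & (1u << bit);
}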
-
-static int nested_svm_exit_special(struct vcpu_svm *svm)
-{
- u32 exit_code = svm->vmcb->control.exit_code;
-
- switch (exit_code) {
- case SVM_EXIT_INTR:
- case SVM_EXIT_NMI:
- case SVM_EXIT_EXCP_BASE + MC_VECTOR:
- return NESTED_EXIT_HOST;
- case SVM_EXIT_NPF:
- /* For now we are always handling NPFs when using them */
- if (npt_enabled)
- return NESTED_EXIT_HOST;
- break;
- case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs */
- if (!npt_enabled)
- return NESTED_EXIT_HOST;
- break;
- case SVM_EXIT_EXCP_BASE + NM_VECTOR:
- nm_interception(svm);
- break;
- default:
- break;
- }
-
- return NESTED_EXIT_CONTINUE;
-}
-
-/*
- * Returns NESTED_EXIT_DONE if this #vmexit belongs to the nested
- * guest, NESTED_EXIT_HOST if the host has to handle it
- */
-static int nested_svm_intercept(struct vcpu_svm *svm)
-{
- u32 exit_code = svm->vmcb->control.exit_code;
- int vmexit = NESTED_EXIT_HOST;
-
- switch (exit_code) {
- case SVM_EXIT_MSR:
- vmexit = nested_svm_exit_handled_msr(svm);
- break;
- case SVM_EXIT_IOIO:
- vmexit = nested_svm_intercept_ioio(svm);
- break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr_read & cr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (svm->nested.intercept_cr_write & cr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr_read & dr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (svm->nested.intercept_dr_write & dr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
- u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
- if (svm->nested.intercept_exceptions & excp_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_ERR: {
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- default: {
- u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
- if (svm->nested.intercept & exit_bits)
- vmexit = NESTED_EXIT_DONE;
- }
- }
-
- return vmexit;
-}
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
-{
- int vmexit;
-
- vmexit = nested_svm_intercept(svm);
-
- if (vmexit == NESTED_EXIT_DONE)
- nested_svm_vmexit(svm);
-
- return vmexit;
-}
-
-static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
-{
- struct vmcb_control_area *dst = &dst_vmcb->control;
- struct vmcb_control_area *from = &from_vmcb->control;
-
- dst->intercept_cr_read = from->intercept_cr_read;
- dst->intercept_cr_write = from->intercept_cr_write;
- dst->intercept_dr_read = from->intercept_dr_read;
- dst->intercept_dr_write = from->intercept_dr_write;
- dst->intercept_exceptions = from->intercept_exceptions;
- dst->intercept = from->intercept;
- dst->iopm_base_pa = from->iopm_base_pa;
- dst->msrpm_base_pa = from->msrpm_base_pa;
- dst->tsc_offset = from->tsc_offset;
- dst->asid = from->asid;
- dst->tlb_ctl = from->tlb_ctl;
- dst->int_ctl = from->int_ctl;
- dst->int_vector = from->int_vector;
- dst->int_state = from->int_state;
- dst->exit_code = from->exit_code;
- dst->exit_code_hi = from->exit_code_hi;
- dst->exit_info_1 = from->exit_info_1;
- dst->exit_info_2 = from->exit_info_2;
- dst->exit_int_info = from->exit_int_info;
- dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
- dst->event_inj = from->event_inj;
- dst->event_inj_err = from->event_inj_err;
- dst->nested_cr3 = from->nested_cr3;
- dst->lbr_ctl = from->lbr_ctl;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
-
- trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
- vmcb->control.exit_info_1,
- vmcb->control.exit_info_2,
- vmcb->control.exit_int_info,
- vmcb->control.exit_int_info_err);
-
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
- if (!nested_vmcb)
- return 1;
-
- /* Exit nested SVM mode */
- svm->nested.vmcb = 0;
-
- /* Give the current vmcb to the guest */
- disable_gif(svm);
-
- nested_vmcb->save.es = vmcb->save.es;
- nested_vmcb->save.cs = vmcb->save.cs;
- nested_vmcb->save.ss = vmcb->save.ss;
- nested_vmcb->save.ds = vmcb->save.ds;
- nested_vmcb->save.gdtr = vmcb->save.gdtr;
- nested_vmcb->save.idtr = vmcb->save.idtr;
- nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
- nested_vmcb->save.cr2 = vmcb->save.cr2;
- nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
- nested_vmcb->save.rflags = vmcb->save.rflags;
- nested_vmcb->save.rip = vmcb->save.rip;
- nested_vmcb->save.rsp = vmcb->save.rsp;
- nested_vmcb->save.rax = vmcb->save.rax;
- nested_vmcb->save.dr7 = vmcb->save.dr7;
- nested_vmcb->save.dr6 = vmcb->save.dr6;
- nested_vmcb->save.cpl = vmcb->save.cpl;
-
- nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
- nested_vmcb->control.int_vector = vmcb->control.int_vector;
- nested_vmcb->control.int_state = vmcb->control.int_state;
- nested_vmcb->control.exit_code = vmcb->control.exit_code;
- nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
- nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
- nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
- nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
- nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
-
- /*
- * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
- * to make sure that we do not lose injected events. So check event_inj
- * here and copy it to exit_int_info if it is valid.
- * exit_int_info and event_inj can't both be valid, because the case
- * below only happens on a VMRUN instruction intercept, which has
- * no valid exit_int_info set.
- */
- if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
- struct vmcb_control_area *nc = &nested_vmcb->control;
-
- nc->exit_int_info = vmcb->control.event_inj;
- nc->exit_int_info_err = vmcb->control.event_inj_err;
- }
-
- nested_vmcb->control.tlb_ctl = 0;
- nested_vmcb->control.event_inj = 0;
- nested_vmcb->control.event_inj_err = 0;
-
- /* We always set V_INTR_MASKING and remember the old value in hflags */
- if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
- nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
-
- /* Restore the original control entries */
- copy_vmcb_control_area(vmcb, hsave);
-
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /* Restore selected save entries */
- svm->vmcb->save.es = hsave->save.es;
- svm->vmcb->save.cs = hsave->save.cs;
- svm->vmcb->save.ss = hsave->save.ss;
- svm->vmcb->save.ds = hsave->save.ds;
- svm->vmcb->save.gdtr = hsave->save.gdtr;
- svm->vmcb->save.idtr = hsave->save.idtr;
- svm->vmcb->save.rflags = hsave->save.rflags;
- svm_set_efer(&svm->vcpu, hsave->save.efer);
- svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
- svm_set_cr4(&svm->vcpu, hsave->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = hsave->save.cr3;
- svm->vcpu.arch.cr3 = hsave->save.cr3;
- } else {
- kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
- }
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
- svm->vmcb->save.dr7 = 0;
- svm->vmcb->save.cpl = 0;
- svm->vmcb->control.exit_int_info = 0;
-
- nested_svm_unmap(page);
-
- kvm_mmu_reset_context(&svm->vcpu);
- kvm_mmu_load(&svm->vcpu);
-
- return 0;
-}
-
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
-{
- /*
- * This function merges the MSR permission bitmaps of kvm and the
- * nested vmcb. It is optimized in that it only merges the parts where
- * the kvm MSR permission bitmap may contain zero bits.
- */
- int i;
-
- if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
- return true;
-
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
-
- if (msrpm_offsets[i] == 0xffffffff)
- break;
-
- p = msrpm_offsets[i];
- offset = svm->nested.vmcb_msrpm + (p * 4);
-
- if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
- return false;
-
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
- }
-
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
-
- return true;
-}
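
/*
 * Illustrative sketch (not part of the patch): the merge semantics of
 * nested_svm_vmrun_msrpm() above. A set bit means "intercept", so ORing the
 * host and nested-guest words lets the stricter side win for every MSR.
 * Names here are hypothetical.
 */
#include <stdint.h>
#include <stddef.h>

static void merge_msrpm(uint32_t *merged, const uint32_t *host,
                        const uint32_t *nested, size_t words)
{
        size_t i;

        for (i = 0; i < words; i++)
                merged[i] = host[i] | nested[i];
}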
-
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the current vmcb so we don't need to pick and choose what to
- * save; we can simply restore everything when a #VMEXIT occurs.
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = vmcb->save.rflags;
- hsave->save.rip = svm->next_rip;
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = svm->vcpu.arch.cr3;
-
- copy_vmcb_control_area(hsave, vmcb);
-
- if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
- svm->vcpu.arch.hflags |= HF_HIF_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
-
- /* Load the nested guest state */
- svm->vmcb->save.es = nested_vmcb->save.es;
- svm->vmcb->save.cs = nested_vmcb->save.cs;
- svm->vmcb->save.ss = nested_vmcb->save.ss;
- svm->vmcb->save.ds = nested_vmcb->save.ds;
- svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
- svm->vmcb->save.idtr = nested_vmcb->save.idtr;
- svm->vmcb->save.rflags = nested_vmcb->save.rflags;
- svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
- svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
- svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
- if (npt_enabled) {
- svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
- svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
- } else
- kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-
- /* Guest paging mode is active - reset mmu */
- kvm_mmu_reset_context(&svm->vcpu);
-
- svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
-
- /* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = nested_vmcb->save.rax;
- svm->vmcb->save.rsp = nested_vmcb->save.rsp;
- svm->vmcb->save.rip = nested_vmcb->save.rip;
- svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
- svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
- svm->vmcb->save.cpl = nested_vmcb->save.cpl;
-
- svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
- svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
-
- /* cache intercepts */
- svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
- svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
- svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
- svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
- svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
- svm->nested.intercept = nested_vmcb->control.intercept;
-
- force_new_asid(&svm->vcpu);
- svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
- if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
- svm->vcpu.arch.hflags |= HF_VINTR_MASK;
- else
- svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
-
- if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
- /* We only want the cr8 intercept bits of the guest */
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
- }
-
- /* We don't want to see VMMCALLs from a nested guest */
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
- /*
- * We don't want a nested guest to be able to bypass any intercept that
- * applies to its L1 guest, so the intercept bits of both are ORed.
- */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
- svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
- svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
- svm->vmcb->control.int_state = nested_vmcb->control.int_state;
- svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
- svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
- svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
-
- nested_svm_unmap(page);
-
- /* svm->nested.vmcb is our indicator of whether nested SVM is activated */
- svm->nested.vmcb = vmcb_gpa;
-
- enable_gif(svm);
-
- return true;
-}
-
-static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
-{
- to_vmcb->save.fs = from_vmcb->save.fs;
- to_vmcb->save.gs = from_vmcb->save.gs;
- to_vmcb->save.tr = from_vmcb->save.tr;
- to_vmcb->save.ldtr = from_vmcb->save.ldtr;
- to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
- to_vmcb->save.star = from_vmcb->save.star;
- to_vmcb->save.lstar = from_vmcb->save.lstar;
- to_vmcb->save.cstar = from_vmcb->save.cstar;
- to_vmcb->save.sfmask = from_vmcb->save.sfmask;
- to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
- to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
- to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
-}
-
-static int vmload_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct page *page;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return 1;
-
- nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(page);
-
- return 1;
-}
-
-static int vmsave_interception(struct vcpu_svm *svm)
-{
- struct vmcb *nested_vmcb;
- struct page *page;
-
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return 1;
-
- nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(page);
-
- return 1;
-}
-
-static int vmrun_interception(struct vcpu_svm *svm)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- if (!nested_svm_vmrun(svm))
- return 1;
-
- if (!nested_svm_vmrun_msrpm(svm))
- goto failed;
-
- return 1;
-
-failed:
-
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-
- return 1;
-}
-
-static int stgi_interception(struct vcpu_svm *svm)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- enable_gif(svm);
-
- return 1;
-}
-
-static int clgi_interception(struct vcpu_svm *svm)
-{
- if (nested_svm_check_permissions(svm))
- return 1;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
-
- disable_gif(svm);
-
- /* After a CLGI no interrupts should come */
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
-
- return 1;
-}
-
-static int invlpga_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
-
- trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
- vcpu->arch.regs[VCPU_REGS_RAX]);
-
- /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
- skip_emulated_instruction(&svm->vcpu);
- return 1;
-}
-
-static int skinit_interception(struct vcpu_svm *svm)
-{
- trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
-
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int invalid_op_interception(struct vcpu_svm *svm)
-{
- kvm_queue_exception(&svm->vcpu, UD_VECTOR);
- return 1;
-}
-
-static int task_switch_interception(struct vcpu_svm *svm)
-{
- u16 tss_selector;
- int reason;
- int int_type = svm->vmcb->control.exit_int_info &
- SVM_EXITINTINFO_TYPE_MASK;
- int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
- uint32_t type =
- svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
- uint32_t idt_v =
- svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
- bool has_error_code = false;
- u32 error_code = 0;
-
- tss_selector = (u16)svm->vmcb->control.exit_info_1;
-
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
- reason = TASK_SWITCH_IRET;
- else if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
- reason = TASK_SWITCH_JMP;
- else if (idt_v)
- reason = TASK_SWITCH_GATE;
- else
- reason = TASK_SWITCH_CALL;
-
- if (reason == TASK_SWITCH_GATE) {
- switch (type) {
- case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = false;
- break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
- if (svm->vmcb->control.exit_info_2 &
- (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
- has_error_code = true;
- error_code =
- (u32)svm->vmcb->control.exit_info_2;
- }
- kvm_clear_exception_queue(&svm->vcpu);
- break;
- case SVM_EXITINTINFO_TYPE_INTR:
- kvm_clear_interrupt_queue(&svm->vcpu);
- break;
- default:
- break;
- }
- }
-
- if (reason != TASK_SWITCH_GATE ||
- int_type == SVM_EXITINTINFO_TYPE_SOFT ||
- (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
- (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
- skip_emulated_instruction(&svm->vcpu);
-
- if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- svm->vcpu.run->internal.ndata = 0;
- return 0;
- }
- return 1;
-}
-
-static int cpuid_interception(struct vcpu_svm *svm)
-{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- kvm_emulate_cpuid(&svm->vcpu);
- return 1;
-}
-
-static int iret_interception(struct vcpu_svm *svm)
-{
- ++svm->vcpu.stat.nmi_window_exits;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
- svm->vcpu.arch.hflags |= HF_IRET_MASK;
- return 1;
-}
-
-static int invlpg_interception(struct vcpu_svm *svm)
-{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
-}
-
-static int emulate_on_interception(struct vcpu_svm *svm)
-{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
-}
-
-static int cr8_write_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
- /* instruction emulation calls kvm_set_cr8() */
- emulate_instruction(&svm->vcpu, 0, 0, 0);
- if (irqchip_in_kernel(svm->vcpu.kvm)) {
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
- return 1;
- }
- if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return 1;
- kvm_run->exit_reason = KVM_EXIT_SET_TPR;
- return 0;
-}
-
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TSC: {
- u64 tsc_offset;
-
- if (is_nested(svm))
- tsc_offset = svm->nested.hsave->control.tsc_offset;
- else
- tsc_offset = svm->vmcb->control.tsc_offset;
-
- *data = tsc_offset + native_read_tsc();
- break;
- }
- case MSR_K6_STAR:
- *data = svm->vmcb->save.star;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
- break;
- case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
- break;
- case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
- break;
- case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
- break;
- case MSR_IA32_SYSENTER_EIP:
- *data = svm->sysenter_eip;
- break;
- case MSR_IA32_SYSENTER_ESP:
- *data = svm->sysenter_esp;
- break;
- /*
- * Nobody will change the following 5 values in the VMCB so we can
- * safely return them on rdmsr. They will always be 0 until LBRV is
- * implemented.
- */
- case MSR_IA32_DEBUGCTLMSR:
- *data = svm->vmcb->save.dbgctl;
- break;
- case MSR_IA32_LASTBRANCHFROMIP:
- *data = svm->vmcb->save.br_from;
- break;
- case MSR_IA32_LASTBRANCHTOIP:
- *data = svm->vmcb->save.br_to;
- break;
- case MSR_IA32_LASTINTFROMIP:
- *data = svm->vmcb->save.last_excp_from;
- break;
- case MSR_IA32_LASTINTTOIP:
- *data = svm->vmcb->save.last_excp_to;
- break;
- case MSR_VM_HSAVE_PA:
- *data = svm->nested.hsave_msr;
- break;
- case MSR_VM_CR:
- *data = svm->nested.vm_cr_msr;
- break;
- case MSR_IA32_UCODE_REV:
- *data = 0x01000065;
- break;
- default:
- return kvm_get_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int rdmsr_interception(struct vcpu_svm *svm)
-{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data;
-
- if (svm_get_msr(&svm->vcpu, ecx, &data)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(&svm->vcpu, 0);
- } else {
- trace_kvm_msr_read(ecx, data);
-
- svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
- svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- skip_emulated_instruction(&svm->vcpu);
- }
- return 1;
-}
-
-static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int svm_dis, chg_mask;
-
- if (data & ~SVM_VM_CR_VALID_MASK)
- return 1;
-
- chg_mask = SVM_VM_CR_VALID_MASK;
-
- if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
- chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
-
- svm->nested.vm_cr_msr &= ~chg_mask;
- svm->nested.vm_cr_msr |= (data & chg_mask);
-
- svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
-
- /* reject setting SVM_DIS while EFER.SVME is set */
- if (svm_dis && (vcpu->arch.efer & EFER_SVME))
- return 1;
-
- return 0;
-}
-
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- switch (ecx) {
- case MSR_IA32_TSC: {
- u64 tsc_offset = data - native_read_tsc();
- u64 g_tsc_offset = 0;
-
- if (is_nested(svm)) {
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->nested.hsave->control.tsc_offset;
- svm->nested.hsave->control.tsc_offset = tsc_offset;
- }
-
- svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
-
- break;
- }
- case MSR_K6_STAR:
- svm->vmcb->save.star = data;
- break;
-#ifdef CONFIG_X86_64
- case MSR_LSTAR:
- svm->vmcb->save.lstar = data;
- break;
- case MSR_CSTAR:
- svm->vmcb->save.cstar = data;
- break;
- case MSR_KERNEL_GS_BASE:
- svm->vmcb->save.kernel_gs_base = data;
- break;
- case MSR_SYSCALL_MASK:
- svm->vmcb->save.sfmask = data;
- break;
-#endif
- case MSR_IA32_SYSENTER_CS:
- svm->vmcb->save.sysenter_cs = data;
- break;
- case MSR_IA32_SYSENTER_EIP:
- svm->sysenter_eip = data;
- svm->vmcb->save.sysenter_eip = data;
- break;
- case MSR_IA32_SYSENTER_ESP:
- svm->sysenter_esp = data;
- svm->vmcb->save.sysenter_esp = data;
- break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!svm_has(SVM_FEATURE_LBRV)) {
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
- break;
- }
- if (data & DEBUGCTL_RESERVED_BITS)
- return 1;
-
- svm->vmcb->save.dbgctl = data;
- if (data & (1ULL<<0))
- svm_enable_lbrv(svm);
- else
- svm_disable_lbrv(svm);
- break;
- case MSR_VM_HSAVE_PA:
- svm->nested.hsave_msr = data;
- break;
- case MSR_VM_CR:
- return svm_set_vm_cr(vcpu, data);
- case MSR_VM_IGNNE:
- pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
- break;
- default:
- return kvm_set_msr_common(vcpu, ecx, data);
- }
- return 0;
-}
-
-static int wrmsr_interception(struct vcpu_svm *svm)
-{
- u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data)) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(&svm->vcpu, 0);
- } else {
- trace_kvm_msr_write(ecx, data);
- skip_emulated_instruction(&svm->vcpu);
- }
- return 1;
-}
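
/*
 * Illustrative sketch (not part of the patch): WRMSR passes its 64-bit value
 * in EDX:EAX, and wrmsr_interception() above reassembles it exactly like
 * this. Names here are hypothetical.
 */
#include <stdint.h>

static uint64_t msr_value_from_edx_eax(uint32_t eax, uint32_t edx)
{
        return (uint64_t)eax | ((uint64_t)edx << 32);
}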
-
-static int msr_interception(struct vcpu_svm *svm)
-{
- if (svm->vmcb->control.exit_info_1)
- return wrmsr_interception(svm);
- else
- return rdmsr_interception(svm);
-}
-
-static int interrupt_window_interception(struct vcpu_svm *svm)
-{
- struct kvm_run *kvm_run = svm->vcpu.run;
-
- svm_clear_vintr(svm);
- svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
- /*
- * If user space is waiting to inject interrupts, exit as soon as
- * possible.
- */
- if (!irqchip_in_kernel(svm->vcpu.kvm) &&
- kvm_run->request_interrupt_window &&
- !kvm_cpu_has_interrupt(&svm->vcpu)) {
- ++svm->vcpu.stat.irq_window_exits;
- kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- return 0;
- }
-
- return 1;
-}
-
-static int pause_interception(struct vcpu_svm *svm)
-{
- kvm_vcpu_on_spin(&(svm->vcpu));
- return 1;
-}
-
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
- [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
- [SVM_EXIT_READ_DR1] = emulate_on_interception,
- [SVM_EXIT_READ_DR2] = emulate_on_interception,
- [SVM_EXIT_READ_DR3] = emulate_on_interception,
- [SVM_EXIT_READ_DR4] = emulate_on_interception,
- [SVM_EXIT_READ_DR5] = emulate_on_interception,
- [SVM_EXIT_READ_DR6] = emulate_on_interception,
- [SVM_EXIT_READ_DR7] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
- [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
- [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
- [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
- [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
- [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
- [SVM_EXIT_INTR] = intr_interception,
- [SVM_EXIT_NMI] = nmi_interception,
- [SVM_EXIT_SMI] = nop_on_interception,
- [SVM_EXIT_INIT] = nop_on_interception,
- [SVM_EXIT_VINTR] = interrupt_window_interception,
- [SVM_EXIT_CPUID] = cpuid_interception,
- [SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = emulate_on_interception,
- [SVM_EXIT_PAUSE] = pause_interception,
- [SVM_EXIT_HLT] = halt_interception,
- [SVM_EXIT_INVLPG] = invlpg_interception,
- [SVM_EXIT_INVLPGA] = invlpga_interception,
- [SVM_EXIT_IOIO] = io_interception,
- [SVM_EXIT_MSR] = msr_interception,
- [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
- [SVM_EXIT_SHUTDOWN] = shutdown_interception,
- [SVM_EXIT_VMRUN] = vmrun_interception,
- [SVM_EXIT_VMMCALL] = vmmcall_interception,
- [SVM_EXIT_VMLOAD] = vmload_interception,
- [SVM_EXIT_VMSAVE] = vmsave_interception,
- [SVM_EXIT_STGI] = stgi_interception,
- [SVM_EXIT_CLGI] = clgi_interception,
- [SVM_EXIT_SKINIT] = skinit_interception,
- [SVM_EXIT_WBINVD] = emulate_on_interception,
- [SVM_EXIT_MONITOR] = invalid_op_interception,
- [SVM_EXIT_MWAIT] = invalid_op_interception,
- [SVM_EXIT_NPF] = pf_interception,
-};
-
-static int handle_exit(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_run *kvm_run = vcpu->run;
- u32 exit_code = svm->vmcb->control.exit_code;
-
- trace_kvm_exit(exit_code, vcpu);
-
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
- vcpu->arch.cr0 = svm->vmcb->save.cr0;
- if (npt_enabled)
- vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
- if (unlikely(svm->nested.exit_required)) {
- nested_svm_vmexit(svm);
- svm->nested.exit_required = false;
-
- return 1;
- }
-
- if (is_nested(svm)) {
- int vmexit;
-
- trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
- svm->vmcb->control.exit_info_1,
- svm->vmcb->control.exit_info_2,
- svm->vmcb->control.exit_int_info,
- svm->vmcb->control.exit_int_info_err);
-
- vmexit = nested_svm_exit_special(svm);
-
- if (vmexit == NESTED_EXIT_CONTINUE)
- vmexit = nested_svm_exit_handled(svm);
-
- if (vmexit == NESTED_EXIT_DONE)
- return 1;
- }
-
- svm_complete_interrupts(svm);
-
- if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
- kvm_run->fail_entry.hardware_entry_failure_reason
- = svm->vmcb->control.exit_code;
- return 0;
- }
-
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
- || !svm_exit_handlers[exit_code]) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
- }
-
- return svm_exit_handlers[exit_code](svm);
-}
-
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- sd->tss_desc->type = 9; /* available 32/64-bit TSS */
- load_TR_desc();
-}
-
-static void pre_svm_run(struct vcpu_svm *svm)
-{
- int cpu = raw_smp_processor_id();
-
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
-
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
- /* FIXME: handle wraparound of asid_generation */
- if (svm->asid_generation != sd->asid_generation)
- new_asid(svm, sd);
-}
-
-static void svm_inject_nmi(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
- vcpu->arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
- ++vcpu->stat.nmi_injections;
-}
-
-static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
-{
- struct vmcb_control_area *control;
-
- trace_kvm_inj_virq(irq);
-
- ++svm->vcpu.stat.irq_injections;
- control = &svm->vmcb->control;
- control->int_vector = irq;
- control->int_ctl &= ~V_INTR_PRIO_MASK;
- control->int_ctl |= V_IRQ_MASK |
- ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
-}
-
-static void svm_set_irq(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- BUG_ON(!(gif_set(svm)));
-
- svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
- SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- if (irr == -1)
- return;
-
- if (tpr >= irr)
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
-}
-
-static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int ret;
- ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- !(svm->vcpu.arch.hflags & HF_NMI_MASK);
- ret = ret && gif_set(svm) && nested_svm_nmi(svm);
-
- return ret;
-}
-
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (masked) {
- svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
- } else {
- svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
- }
-}
-
-static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
- int ret;
-
- if (!gif_set(svm) ||
- (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
- return 0;
-
- ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
-
- if (is_nested(svm))
- return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
-
- return ret;
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /*
- * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
- * 1, because that's a separate STGI/VMRUN intercept. The next time we
- * get that intercept, this function will be called again and we'll
- * request the VINTR intercept then.
- */
- if (gif_set(svm) && nested_svm_intr(svm)) {
- svm_set_vintr(svm);
- svm_inject_irq(svm, 0x0);
- }
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
- == HF_NMI_MASK)
- return; /* IRET will cause a vm exit */
-
- /*
- * Something prevents the NMI from being injected. Single-step over
- * the possible problem (IRET, exception injection, or interrupt shadow).
- */
- svm->nmi_singlestep = true;
- svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
- update_db_intercept(vcpu);
-}
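
/*
 * Illustrative sketch (not part of the patch): the single-step trick used by
 * enable_nmi_window() above. Setting RFLAGS.TF (bit 8) raises #DB after the
 * next instruction; RFLAGS.RF (bit 16) suppresses a spurious instruction
 * breakpoint on that step. db_interception() clears both again. Names here
 * are hypothetical.
 */
#define RFLAGS_TF (1ul << 8)
#define RFLAGS_RF (1ul << 16)

static unsigned long start_nmi_singlestep(unsigned long rflags)
{
        return rflags | RFLAGS_TF | RFLAGS_RF;
}

static unsigned long stop_nmi_singlestep(unsigned long rflags)
{
        return rflags & ~(RFLAGS_TF | RFLAGS_RF);
}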
-
-static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
- return 0;
-}
-
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
- int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
- kvm_set_cr8(vcpu, cr8);
- }
-}
-
-static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 cr8;
-
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
- return;
-
- cr8 = kvm_get_cr8(vcpu);
- svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
- svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-}
-
-static void svm_complete_interrupts(struct vcpu_svm *svm)
-{
- u8 vector;
- int type;
- u32 exitintinfo = svm->vmcb->control.exit_int_info;
- unsigned int3_injected = svm->int3_injected;
-
- svm->int3_injected = 0;
-
- if (svm->vcpu.arch.hflags & HF_IRET_MASK)
- svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
-
- svm->vcpu.arch.nmi_injected = false;
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- if (!(exitintinfo & SVM_EXITINTINFO_VALID))
- return;
-
- vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
- type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
-
- switch (type) {
- case SVM_EXITINTINFO_TYPE_NMI:
- svm->vcpu.arch.nmi_injected = true;
- break;
- case SVM_EXITINTINFO_TYPE_EXEPT:
- /*
- * In case of software exceptions, do not reinject the vector,
- * but re-execute the instruction instead. Rewind RIP first
- * if we emulated INT3 before.
- */
- if (kvm_exception_is_soft(vector)) {
- if (vector == BP_VECTOR && int3_injected &&
- kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
- kvm_rip_write(&svm->vcpu,
- kvm_rip_read(&svm->vcpu) -
- int3_injected);
- break;
- }
- if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
- u32 err = svm->vmcb->control.exit_int_info_err;
- kvm_requeue_exception_e(&svm->vcpu, vector, err);
-
- } else
- kvm_requeue_exception(&svm->vcpu, vector);
- break;
- case SVM_EXITINTINFO_TYPE_INTR:
- kvm_queue_interrupt(&svm->vcpu, vector, false);
- break;
- default:
- break;
- }
-}
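
/*
 * Illustrative sketch (not part of the patch): the EXITINTINFO decode done by
 * svm_complete_interrupts() above. Per the SVM event format: bits 0-7 =
 * vector, bits 8-10 = type (0 = INTR, 2 = NMI, 3 = exception, 4 = soft
 * interrupt), bit 11 = error code valid, bit 31 = info valid. Names here are
 * hypothetical.
 */
#include <stdint.h>

struct exitintinfo {
        int valid;
        int has_err;
        uint8_t vector;
        uint8_t type;
};

static struct exitintinfo decode_exitintinfo(uint32_t info)
{
        struct exitintinfo e;

        e.valid   = (int)((info >> 31) & 1);
        e.has_err = (int)((info >> 11) & 1);
        e.vector  = (uint8_t)(info & 0xff);
        e.type    = (uint8_t)((info >> 8) & 7);
        return e;
}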
-
-#ifdef CONFIG_X86_64
-#define R "r"
-#else
-#define R "e"
-#endif
-
-static void svm_vcpu_run(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u16 fs_selector;
- u16 gs_selector;
- u16 ldt_selector;
-
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
- /*
- * A vmexit emulation is required before the vcpu can be executed
- * again.
- */
- if (unlikely(svm->nested.exit_required))
- return;
-
- pre_svm_run(svm);
-
- sync_lapic_to_cr8(vcpu);
-
- save_host_msrs(vcpu);
- fs_selector = kvm_read_fs();
- gs_selector = kvm_read_gs();
- ldt_selector = kvm_read_ldt();
- svm->vmcb->save.cr2 = vcpu->arch.cr2;
- /* required for live migration with NPT */
- if (npt_enabled)
- svm->vmcb->save.cr3 = vcpu->arch.cr3;
-
- clgi();
-
- local_irq_enable();
-
- asm volatile (
- "push %%"R"bp; \n\t"
- "mov %c[rbx](%[svm]), %%"R"bx \n\t"
- "mov %c[rcx](%[svm]), %%"R"cx \n\t"
- "mov %c[rdx](%[svm]), %%"R"dx \n\t"
- "mov %c[rsi](%[svm]), %%"R"si \n\t"
- "mov %c[rdi](%[svm]), %%"R"di \n\t"
- "mov %c[rbp](%[svm]), %%"R"bp \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%[svm]), %%r8 \n\t"
- "mov %c[r9](%[svm]), %%r9 \n\t"
- "mov %c[r10](%[svm]), %%r10 \n\t"
- "mov %c[r11](%[svm]), %%r11 \n\t"
- "mov %c[r12](%[svm]), %%r12 \n\t"
- "mov %c[r13](%[svm]), %%r13 \n\t"
- "mov %c[r14](%[svm]), %%r14 \n\t"
- "mov %c[r15](%[svm]), %%r15 \n\t"
-#endif
-
- /* Enter guest mode */
- "push %%"R"ax \n\t"
- "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
- __ex(SVM_VMLOAD) "\n\t"
- __ex(SVM_VMRUN) "\n\t"
- __ex(SVM_VMSAVE) "\n\t"
- "pop %%"R"ax \n\t"
-
- /* Save guest registers, load host registers */
- "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
- "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
- "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
- "mov %%"R"si, %c[rsi](%[svm]) \n\t"
- "mov %%"R"di, %c[rdi](%[svm]) \n\t"
- "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%[svm]) \n\t"
- "mov %%r9, %c[r9](%[svm]) \n\t"
- "mov %%r10, %c[r10](%[svm]) \n\t"
- "mov %%r11, %c[r11](%[svm]) \n\t"
- "mov %%r12, %c[r12](%[svm]) \n\t"
- "mov %%r13, %c[r13](%[svm]) \n\t"
- "mov %%r14, %c[r14](%[svm]) \n\t"
- "mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
- "pop %%"R"bp"
- :
- : [svm]"a"(svm),
- [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
- [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
-#ifdef CONFIG_X86_64
- , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
-#endif
- : "cc", "memory"
- , R"bx", R"cx", R"dx", R"si", R"di"
-#ifdef CONFIG_X86_64
- , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#endif
- );
-
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
- kvm_load_fs(fs_selector);
- kvm_load_gs(gs_selector);
- kvm_load_ldt(ldt_selector);
- load_host_msrs(vcpu);
-
- reload_tss(vcpu);
-
- local_irq_disable();
-
- stgi();
-
- sync_cr8_to_lapic(vcpu);
-
- svm->next_rip = 0;
-
- if (npt_enabled) {
- vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
- vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
- }
-
- /*
- * We need to handle MC intercepts here before the vcpu has a chance to
- * change the physical cpu.
- */
- if (unlikely(svm->vmcb->control.exit_code ==
- SVM_EXIT_EXCP_BASE + MC_VECTOR))
- svm_handle_mce(svm);
-}
-
-#undef R
-
-static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (npt_enabled) {
- svm->vmcb->control.nested_cr3 = root;
- force_new_asid(vcpu);
- return;
- }
-
- svm->vmcb->save.cr3 = root;
- force_new_asid(vcpu);
-}
-
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
-static void
-svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
- /*
- * Patch in the VMMCALL instruction:
- */
- hypercall[0] = 0x0f;
- hypercall[1] = 0x01;
- hypercall[2] = 0xd9;
-}
-
-static void svm_check_processor_compat(void *rtn)
-{
- *(int *)rtn = 0;
-}
-
-static bool svm_cpu_has_accelerated_tpr(void)
-{
- return false;
-}
-
-static int get_npt_level(void)
-{
-#ifdef CONFIG_X86_64
- return PT64_ROOT_LEVEL;
-#else
- return PT32E_ROOT_LEVEL;
-#endif
-}
-
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
- return 0;
-}
-
-static void svm_cpuid_update(struct kvm_vcpu *vcpu)
-{
-}
-
-static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-{
- switch (func) {
- case 0x8000000A:
- entry->eax = 1; /* SVM revision 1 */
- entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
- ASID emulation to nested SVM */
- entry->ecx = 0; /* Reserved */
- entry->edx = 0; /* Do not support any additional features */
-
- break;
- }
-}
-
-static const struct trace_print_flags svm_exit_reasons_str[] = {
- { SVM_EXIT_READ_CR0, "read_cr0" },
- { SVM_EXIT_READ_CR3, "read_cr3" },
- { SVM_EXIT_READ_CR4, "read_cr4" },
- { SVM_EXIT_READ_CR8, "read_cr8" },
- { SVM_EXIT_WRITE_CR0, "write_cr0" },
- { SVM_EXIT_WRITE_CR3, "write_cr3" },
- { SVM_EXIT_WRITE_CR4, "write_cr4" },
- { SVM_EXIT_WRITE_CR8, "write_cr8" },
- { SVM_EXIT_READ_DR0, "read_dr0" },
- { SVM_EXIT_READ_DR1, "read_dr1" },
- { SVM_EXIT_READ_DR2, "read_dr2" },
- { SVM_EXIT_READ_DR3, "read_dr3" },
- { SVM_EXIT_WRITE_DR0, "write_dr0" },
- { SVM_EXIT_WRITE_DR1, "write_dr1" },
- { SVM_EXIT_WRITE_DR2, "write_dr2" },
- { SVM_EXIT_WRITE_DR3, "write_dr3" },
- { SVM_EXIT_WRITE_DR5, "write_dr5" },
- { SVM_EXIT_WRITE_DR7, "write_dr7" },
- { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
- { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
- { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
- { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
- { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
- { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
- { SVM_EXIT_INTR, "interrupt" },
- { SVM_EXIT_NMI, "nmi" },
- { SVM_EXIT_SMI, "smi" },
- { SVM_EXIT_INIT, "init" },
- { SVM_EXIT_VINTR, "vintr" },
- { SVM_EXIT_CPUID, "cpuid" },
- { SVM_EXIT_INVD, "invd" },
- { SVM_EXIT_HLT, "hlt" },
- { SVM_EXIT_INVLPG, "invlpg" },
- { SVM_EXIT_INVLPGA, "invlpga" },
- { SVM_EXIT_IOIO, "io" },
- { SVM_EXIT_MSR, "msr" },
- { SVM_EXIT_TASK_SWITCH, "task_switch" },
- { SVM_EXIT_SHUTDOWN, "shutdown" },
- { SVM_EXIT_VMRUN, "vmrun" },
- { SVM_EXIT_VMMCALL, "hypercall" },
- { SVM_EXIT_VMLOAD, "vmload" },
- { SVM_EXIT_VMSAVE, "vmsave" },
- { SVM_EXIT_STGI, "stgi" },
- { SVM_EXIT_CLGI, "clgi" },
- { SVM_EXIT_SKINIT, "skinit" },
- { SVM_EXIT_WBINVD, "wbinvd" },
- { SVM_EXIT_MONITOR, "monitor" },
- { SVM_EXIT_MWAIT, "mwait" },
- { SVM_EXIT_NPF, "npf" },
- { -1, NULL }
-};
-
-static int svm_get_lpage_level(void)
-{
- return PT_PDPE_LEVEL;
-}
-
-static bool svm_rdtscp_supported(void)
-{
- return false;
-}
-
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
- if (is_nested(svm))
- svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
- update_cr0_intercept(svm);
-}
-
-static struct kvm_x86_ops svm_x86_ops = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
- .hardware_setup = svm_hardware_setup,
- .hardware_unsetup = svm_hardware_unsetup,
- .check_processor_compatibility = svm_check_processor_compat,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
- .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
-
- .vcpu_create = svm_create_vcpu,
- .vcpu_free = svm_free_vcpu,
- .vcpu_reset = svm_vcpu_reset,
-
- .prepare_guest_switch = svm_prepare_guest_switch,
- .vcpu_load = svm_vcpu_load,
- .vcpu_put = svm_vcpu_put,
-
- .set_guest_debug = svm_guest_debug,
- .get_msr = svm_get_msr,
- .set_msr = svm_set_msr,
- .get_segment_base = svm_get_segment_base,
- .get_segment = svm_get_segment,
- .set_segment = svm_set_segment,
- .get_cpl = svm_get_cpl,
- .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
- .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
- .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
- .set_cr0 = svm_set_cr0,
- .set_cr3 = svm_set_cr3,
- .set_cr4 = svm_set_cr4,
- .set_efer = svm_set_efer,
- .get_idt = svm_get_idt,
- .set_idt = svm_set_idt,
- .get_gdt = svm_get_gdt,
- .set_gdt = svm_set_gdt,
- .set_dr7 = svm_set_dr7,
- .cache_reg = svm_cache_reg,
- .get_rflags = svm_get_rflags,
- .set_rflags = svm_set_rflags,
- .fpu_activate = svm_fpu_activate,
- .fpu_deactivate = svm_fpu_deactivate,
-
- .tlb_flush = svm_flush_tlb,
-
- .run = svm_vcpu_run,
- .handle_exit = handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = svm_set_interrupt_shadow,
- .get_interrupt_shadow = svm_get_interrupt_shadow,
- .patch_hypercall = svm_patch_hypercall,
- .set_irq = svm_set_irq,
- .set_nmi = svm_inject_nmi,
- .queue_exception = svm_queue_exception,
- .interrupt_allowed = svm_interrupt_allowed,
- .nmi_allowed = svm_nmi_allowed,
- .get_nmi_mask = svm_get_nmi_mask,
- .set_nmi_mask = svm_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
-
- .set_tss_addr = svm_set_tss_addr,
- .get_tdp_level = get_npt_level,
- .get_mt_mask = svm_get_mt_mask,
-
- .exit_reasons_str = svm_exit_reasons_str,
- .get_lpage_level = svm_get_lpage_level,
-
- .cpuid_update = svm_cpuid_update,
-
- .rdtscp_supported = svm_rdtscp_supported,
-
- .set_supported_cpuid = svm_set_supported_cpuid,
-};
-
-static int __init svm_init(void)
-{
- return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
-}
-
-static void __exit svm_exit(void)
-{
- kvm_exit();
-}
-
-module_init(svm_init)
-module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
new file mode 100644
index 000000000000..6b77b2033208
--- /dev/null
+++ b/arch/x86/kvm/svm/avic.c
@@ -0,0 +1,1305 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/hashtable.h>
+#include <linux/amd-iommu.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/msr.h>
+
+#include "trace.h"
+#include "lapic.h"
+#include "x86.h"
+#include "irq.h"
+#include "svm.h"
+
+/*
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, where as vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
+ *
+ * For the vCPU index, use however many bits are currently allowed for the max
+ * guest physical APIC ID (limited by the size of the physical ID table), and
+ * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
+ * size of the GATag is defined by hardware (32 bits), but is an opaque value
+ * as far as hardware is concerned.
+ */
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+
+#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
+#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
+
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
+
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
+({ \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
+ \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
+ ga_tag; \
+})
+
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
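+
+/*
+ * For example, assuming a 9-bit physical-ID index mask (bits 8:0),
+ * AVIC_GATAG(0x2, 0x5) encodes as (0x2 << 9) | 0x5 = 0x405, i.e. the vCPU
+ * index lands in GATag[8:0] and the VM ID in GATag[31:9].
+ */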
+
+#define AVIC_AUTO_MODE -1
+
+static int avic_param_set(const char *val, const struct kernel_param *kp)
+{
+ if (val && sysfs_streq(val, "auto")) {
+ *(int *)kp->arg = AVIC_AUTO_MODE;
+ return 0;
+ }
+
+ return param_set_bint(val, kp);
+}
+
+static const struct kernel_param_ops avic_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = avic_param_set,
+ .get = param_get_bool,
+};
+
+/*
+ * Enable / disable AVIC. In "auto" mode (default behavior), AVIC is enabled
+ * for Zen4+ CPUs with x2AVIC, provided all other enablement criteria are met.
+ */
+static int avic = AVIC_AUTO_MODE;
+module_param_cb(avic, &avic_ops, &avic, 0444);
+__MODULE_PARM_TYPE(avic, "bool");
+
+module_param(enable_ipiv, bool, 0444);
+
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
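+
+/*
+ * Usage sketch (illustrative, not exhaustive): loading kvm-amd with "avic"
+ * or "avic=1" enables AVIC, "avic=0" disables it, and "avic=auto" (the
+ * default) defers to the checks in avic_want_avic_enabled(). Setting
+ * force_avic taints the kernel, as module_param_unsafe() marks the
+ * parameter unsafe.
+ */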
+
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_svm,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = false;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+static bool x2avic_enabled;
+static u32 x2avic_max_physical_id;
+
+static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
+ bool intercept)
+{
+ static const u32 x2avic_passthrough_msrs[] = {
+ X2APIC_MSR(APIC_ID),
+ X2APIC_MSR(APIC_LVR),
+ X2APIC_MSR(APIC_TASKPRI),
+ X2APIC_MSR(APIC_ARBPRI),
+ X2APIC_MSR(APIC_PROCPRI),
+ X2APIC_MSR(APIC_EOI),
+ X2APIC_MSR(APIC_RRR),
+ X2APIC_MSR(APIC_LDR),
+ X2APIC_MSR(APIC_DFR),
+ X2APIC_MSR(APIC_SPIV),
+ X2APIC_MSR(APIC_ISR),
+ X2APIC_MSR(APIC_TMR),
+ X2APIC_MSR(APIC_IRR),
+ X2APIC_MSR(APIC_ESR),
+ X2APIC_MSR(APIC_ICR),
+ X2APIC_MSR(APIC_ICR2),
+
+ /*
+ * Note! Always intercept LVTT, as TSC-deadline timer mode
+ * isn't virtualized by hardware, and the CPU will generate a
+ * #GP instead of a #VMEXIT.
+ */
+ X2APIC_MSR(APIC_LVTTHMR),
+ X2APIC_MSR(APIC_LVTPC),
+ X2APIC_MSR(APIC_LVT0),
+ X2APIC_MSR(APIC_LVT1),
+ X2APIC_MSR(APIC_LVTERR),
+ X2APIC_MSR(APIC_TMICT),
+ X2APIC_MSR(APIC_TMCCT),
+ X2APIC_MSR(APIC_TDCR),
+ };
+ int i;
+
+ if (intercept == svm->x2avic_msrs_intercepted)
+ return;
+
+ if (!x2avic_enabled)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
+ svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
+ MSR_TYPE_RW, intercept);
+
+ svm->x2avic_msrs_intercepted = intercept;
+}
+
+static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ u32 arch_max;
+
+ /*
+ * Return the largest size (x2APIC) when querying without a vCPU, e.g.
+ * to allocate the per-VM table.
+ */
+ if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic)))
+ arch_max = x2avic_max_physical_id;
+ else
+ arch_max = AVIC_MAX_PHYSICAL_ID;
+
+ /*
+ * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
+ * plus one, so the max possible APIC ID is one less than that.
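+ * For example, a VM created with KVM_CAP_MAX_VCPU_ID = 256 has a max
+ * possible APIC ID of 255, which is then clamped to the architectural limit.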
+ */
+ return min(kvm->arch.max_vcpu_ids - 1, arch_max);
+}
+
+static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
+{
+ return __avic_get_max_physical_id(vcpu->kvm, vcpu);
+}
+
+static void avic_activate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+ vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
+
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
+ */
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
+ vmcb->control.int_ctl |= X2APIC_MODE_MASK;
+
+ /* Disabling MSR intercept for x2APIC registers */
+ avic_set_x2apic_msr_interception(svm, false);
+ } else {
+ /*
+ * Flush the TLB, as the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /* Enabling MSR intercept for x2APIC registers */
+ avic_set_x2apic_msr_interception(svm, true);
+ }
+}
+
+static void avic_deactivate_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
+ vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+
+ /*
+ * If running nested and the guest uses its own MSR bitmap, there
+ * is no need to update L0's msr bitmap
+ */
+ if (is_guest_mode(&svm->vcpu) &&
+ vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))
+ return;
+
+ /* Enabling MSR intercept for x2APIC registers */
+ avic_set_x2apic_msr_interception(svm, true);
+}
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+static int avic_get_physical_id_table_order(struct kvm *kvm)
+{
+ /* Provision for the maximum physical ID supported in x2avic mode */
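+ /*
+ * E.g. assuming a max physical ID of 0x1ff (512 eight-byte entries), the
+ * table occupies exactly 4KiB and get_order() returns 0.
+ */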
+ return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
+}
+
+int avic_alloc_physical_id_table(struct kvm *kvm)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_apicv)
+ return 0;
+
+ if (kvm_svm->avic_physical_id_table)
+ return 0;
+
+ kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ avic_get_physical_id_table_order(kvm));
+ if (!kvm_svm->avic_physical_id_table)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!enable_apicv)
+ return;
+
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_pages((unsigned long)kvm_svm->avic_physical_id_table,
+ avic_get_physical_id_table_order(kvm));
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ u32 vm_id;
+
+ if (!enable_apicv)
+ return 0;
+
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
+ goto free_avic;
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
+
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ avic_activate_vmcb(svm);
+ else
+ avic_deactivate_vmcb(svm);
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > x2avic_max_physical_id)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
+
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
+ return -EINVAL;
+
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
+ if (ret)
+ return ret;
+ }
+
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
+
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
+
+ return 0;
+}
+
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu()) {
+ wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu));
+ }
+ put_cpu();
+}
+
+static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+{
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+}
+
+static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+{
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+}
+
+static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+{
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+ * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+}
+
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ int dest_mode = icrl & APIC_DEST_MASK;
+ int shorthand = icrl & APIC_SHORT_MASK;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
+
+ if (shorthand != APIC_DEST_NOSHORT)
+ return -EINVAL;
+
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_XAPIC_DEST_FIELD(icrh);
+
+ if (dest_mode == APIC_DEST_PHYSICAL) {
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(dest != index))
+ return -EINVAL;
+
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
+ } else {
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask */
+ bitmap = dest;
+ cluster = 0;
+ } else {
+ /* 4 bit dest mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
+ }
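+
+ /*
+ * E.g. in xAPIC cluster mode, dest = 0x34 selects cluster 3 (cluster =
+ * 12 after the shift) with dest mask 0x4, which resolves to logical ID
+ * table index 12 + 2 = 14 below.
+ */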
+
+ /* Nothing to do if there are no destinations in the cluster. */
+ if (unlikely(!bitmap))
+ return 0;
+
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
+
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
+ }
+
+ return 0;
+}
+
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+ return;
+
+ trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
+
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
+ }
+}
+
+int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * Emulate IPIs that are not handled by AVIC hardware, which
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
+ * order to clear the BUSY flag.
+ */
+ if (icrl & APIC_ICR_BUSY)
+ kvm_apic_write_nodecode(vcpu, APIC_ICR);
+ else
+ kvm_apic_send_ipi(apic, icrl, icrh);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR:
+ /* Invalid IPI with vector < 16 */
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n");
+ }
+
+ return 1;
+}
+
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ u32 cluster, index;
+
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
+ return NULL;
+ ldr &= 0xf;
+ }
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
+ index += (cluster << 2);
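+
+ /*
+ * E.g. a flat-mode logical ID of 0x20 (bit 5) yields index 5, while a
+ * cluster-mode logical ID of 0x32 (cluster 3, bit 1) yields index
+ * (3 << 2) + 1 = 13.
+ */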
+
+ return &kvm_svm->avic_logical_id_table[index];
+}
+
+static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+}
+
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry;
+
+ /* Note: x2AVIC does not use logical APIC ID table */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+ if (entry)
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
+static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
+
+ /* AVIC does not support LDR update for x2APIC */
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ return;
+
+ if (ldr == svm->ldr_reg)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
+}
+
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+
+ if (svm->dfr_reg == dfr)
+ return;
+
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
+}
+
+static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
+{
+ u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_LDR:
+ avic_handle_ldr_update(vcpu);
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(vcpu);
+ break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
+ default:
+ break;
+ }
+
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(vcpu);
+ } else {
+ /* Handling Fault */
+ ret = kvm_emulate_instruction(vcpu, 0);
+ }
+
+ return ret;
+}
+
+int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ raw_spin_lock_init(&svm->ir_list_lock);
+
+ if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
+ return 0;
+
+ ret = avic_init_backing_page(vcpu);
+ if (ret)
+ return ret;
+
+ svm->dfr_reg = APIC_DFR_FLAT;
+
+ return ret;
+}
+
+void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ avic_handle_dfr_update(vcpu);
+ avic_handle_ldr_update(vcpu);
+}
+
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
+{
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
+ unsigned long flags;
+
+ if (!vcpu)
+ return;
+
+ raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+}
+
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
+{
+ /*
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
+ */
+ svm_ir_list_del(irqfd);
+
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
+
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
+
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
+ */
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+ }
+
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
+
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
+ }
+
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
+
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
+
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
+
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
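+
+/*
+ * E.g. avic_vcpu_blocking() passes AVIC_START_BLOCKING so that IsRunning is
+ * cleared and GALogIntr is armed in one pass, whereas preemption of a
+ * still-runnable vCPU passes AVIC_STOP_RUNNING (0) and leaves GALogIntr
+ * untouched.
+ */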
+
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
+{
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
+
+ lockdep_assert_held(&svm->ir_list_lock);
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entries targeting this vcpu.
+ */
+ if (list_empty(&svm->ir_list))
+ return;
+
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
+ }
+}
+
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+ u64 entry;
+
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
+ return;
+
+ /*
+ * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+ * _currently_ have assigned devices, as that can change. Holding
+ * ir_list_lock ensures that either svm_ir_list_add() will consume
+ * up-to-date entry information, or that this task will wait until
+ * svm_ir_list_add() completes to set the new target pCPU.
+ */
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ entry = svm->avic_physical_id_entry;
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
+
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
+
+ lockdep_assert_preemption_disabled();
+
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >=
+ PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm)))
+ return;
+
+ /*
+ * Take and hold the per-vCPU interrupt remapping lock while updating
+ * the Physical ID entry even though the lock doesn't protect against
+ * multiple writers (see above). Holding ir_list_lock ensures that
+ * either svm_ir_list_add() will consume up-to-date entry information,
+ * or that this task will wait until svm_ir_list_add() completes to
+ * mark the vCPU as not running.
+ */
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
+
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+ * vCPU is preempted while it's in the process of blocking. WARN if the
+ * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+ * the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
+}
+
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
+ return;
+
+ if (kvm_vcpu_apicv_active(vcpu)) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_apicv_post_state_restore(vcpu);
+ avic_activate_vmcb(svm);
+ } else {
+ avic_deactivate_vmcb(svm);
+ }
+ vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ if (!enable_apicv)
+ return;
+
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
+ avic_refresh_virtual_apic_mode(vcpu);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
+ else
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
+}
+
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+ * IsRunning (the lack of this barrier might be the cause of errata #1235).
+ *
+ * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
+}
+
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ avic_vcpu_load(vcpu, vcpu->cpu);
+}
+
+static bool __init avic_want_avic_enabled(void)
+{
+ /*
+ * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is
+ * supported (to avoid enabling partial support by default, and because
+ * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for
+ * family 0x1a and later (Zen5+), as the kernel's synthetic ZenX flags
+ * aren't inclusive of previous generations, i.e. the kernel will set
+ * at most one ZenX feature flag.
+ */
+ if (avic == AVIC_AUTO_MODE)
+ avic = boot_cpu_has(X86_FEATURE_X2AVIC) &&
+ (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4));
+
+ if (!avic || !npt_enabled)
+ return false;
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC))
+ pr_warn(FW_BUG "Cannot enable x2AVIC, AVIC is unsupported\n");
+ return false;
+ }
+
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) &&
+ !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) {
+ pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n");
+ return false;
+ }
+
+ /*
+ * Print a scary message if AVIC is force enabled to make it abundantly
+	 * clear that ignoring CPUID could have repercussions. See the Revision
+	 * Guide for the specific AMD processor for more details.
+ */
+ if (!boot_cpu_has(X86_FEATURE_AVIC))
+ pr_warn("AVIC unsupported in CPUID but force enabled, your system might crash and burn\n");
+
+ return true;
+}
+
+/*
+ * Note:
+ * - The module param avic enables both xAPIC and x2APIC modes.
+ * - Hypervisor can support both xAVIC and x2AVIC in the same guest.
+ * - The mode can be switched at run-time.
+ */
+bool __init avic_hardware_setup(void)
+{
+ avic = avic_want_avic_enabled();
+ if (!avic)
+ return false;
+
+ pr_info("AVIC enabled\n");
+
+ /* AVIC is a prerequisite for x2AVIC. */
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled) {
+ if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT))
+ x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID;
+ else
+ x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID;
+ pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1);
+ } else {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
+ }
+
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+
+ return true;
+}
+
+void avic_hardware_unsetup(void)
+{
+ if (avic)
+ amd_iommu_register_ga_log_notifier(NULL);
+}
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
new file mode 100644
index 000000000000..088f6429b24c
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM specific code for Hyper-V on KVM.
+ *
+ * Copyright 2022 Red Hat, Inc. and/or its affiliates.
+ */
+#include "hyperv.h"
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
+ svm->vmcb->control.exit_info_2 = 0;
+ nested_svm_vmexit(svm);
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
new file mode 100644
index 000000000000..d3f8bfc05832
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM).
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__
+#define __ARCH_X86_KVM_SVM_HYPERV_H__
+
+#include <asm/mshyperv.h>
+
+#include "../hyperv.h"
+#include "svm.h"
+
+#ifdef CONFIG_KVM_HYPERV
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return;
+
+ hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page;
+ hv_vcpu->nested.vm_id = hve->hv_vm_id;
+ hv_vcpu->nested.vp_id = hve->hv_vp_id;
+}
+
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
+#else /* CONFIG_KVM_HYPERV */
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {}
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+static inline void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) {}
+#endif /* CONFIG_KVM_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
new file mode 100644
index 000000000000..c81005b24522
--- /dev/null
+++ b/arch/x86/kvm/svm/nested.c
@@ -0,0 +1,1928 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+
+#include <asm/msr-index.h>
+#include <asm/debugreg.h>
+
+#include "kvm_emulate.h"
+#include "trace.h"
+#include "mmu.h"
+#include "x86.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "svm.h"
+#include "hyperv.h"
+
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
+ }
+
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
+
+ nested_svm_vmexit(svm);
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.ctl.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ /*
+ * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores
+ * nCR3[4:0] when loading PDPTEs from memory.
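+ * E.g. with nCR3 = 0x12340e3, PDPTE[1] is read from guest physical
+ * address 0x12340e0 + 8 = 0x12340e8.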
+ */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ (cr3 & GENMASK(11, 5)) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.nested_cr3;
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+
+ /*
+ * The NPT format depends on L1's CR4 and EFER, which are in vmcb01. Note,
+ * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
+ * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
+ */
+ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ svm->vmcb01.ptr->save.efer,
+ svm->nested.ctl.nested_cr3);
+ vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+}
+
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
+void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
+ unsigned int i;
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->vmcb01.ptr->control;
+ g = &svm->nested.ctl;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] = h->intercepts[i];
+
+ if (g->int_ctl & V_INTR_MASKING_MASK) {
+ /*
+ * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+ * disable intercept of CR8 writes as L2's CR8 does not affect
+ * any interrupt KVM may want to inject.
+ *
+ * Similarly, disable intercept of virtual interrupts (used to
+ * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+ * the effective RFLAGS.IF for L1 interrupts will never be set
+ * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
+ */
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
+ if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ }
+
+ /*
+ * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
+ * flush feature is enabled.
+ */
+ if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ c->intercepts[i] |= g->intercepts[i];
+
+ /* If SMI is not intercepted, ignore guest SMI intercept as well */
+ if (!intercept_smi)
+ vmcb_clr_intercept(c, INTERCEPT_SMI);
+
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
+}
+
+/*
+ * This array (and its actual size) holds the set of offsets (in chunk-sized
+ * units) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+ static const u32 merge_msrs[] __initconst = {
+ MSR_STAR,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_IA32_SYSENTER_ESP,
+ #ifdef CONFIG_X86_64
+ MSR_GS_BASE,
+ MSR_FS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_LSTAR,
+ MSR_CSTAR,
+ MSR_SYSCALL_MASK,
+ #endif
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_APERF,
+ MSR_IA32_MPERF,
+ MSR_IA32_LASTBRANCHFROMIP,
+ MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP,
+ MSR_IA32_LASTINTTOIP,
+ };
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+ int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+ u32 offset;
+
+ if (WARN_ON(bit_nr < 0))
+ return -EIO;
+
+ /*
+ * Merging is done in chunks to reduce the number of accesses
+ * to L1's bitmap.
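+ * On 64-bit kernels each chunk is an 8-byte unsigned long, covering 64
+ * bitmap bits (32 MSRs at two bits apiece), so neighboring MSRs collapse
+ * to a single offset.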
+ */
+ offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+ for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+ if (nested_svm_msrpm_merge_offsets[j] == offset)
+ break;
+ }
+
+ if (j < nested_svm_nr_msrpm_merge_offsets)
+ continue;
+
+ if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+ return -EIO;
+
+ nested_svm_msrpm_merge_offsets[j] = offset;
+ nested_svm_nr_msrpm_merge_offsets++;
+ }
+
+ return 0;
+}
+
+/*
+ * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
+ * is optimized in that it only merges the parts where KVM MSR permission bitmap
+ * may contain zero bits.
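+ * Since a set bit in an MSRPM means "intercept", OR-ing the two bitmaps
+ * gives L2 direct access to an MSR only when both L0 and L1 allow it.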
+ */
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+ nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
+ int i;
+
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) is using Hyper-V emulation interface and
+ * tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+#ifdef CONFIG_KVM_HYPERV
+ if (!svm->nested.force_msr_bitmap_recalc) {
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+
+ if (kvm_hv_hypercall_enabled(vcpu) &&
+ hve->hv_enlightenments_control.msr_bitmap &&
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
+ goto set_msrpm_base_pa;
+ }
+#endif
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+ const int p = nested_svm_msrpm_merge_offsets[i];
+ nsvm_msrpm_merge_t l1_val;
+ gpa_t gpa;
+
+ gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
+
+ if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
+ return false;
+
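+ /* A set bit intercepts the MSR, so OR yields the stricter of L0's and L1's settings. */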
+ msrpm02[p] = msrpm01[p] | l1_val;
+ }
+
+ svm->nested.force_msr_bitmap_recalc = false;
+
+#ifdef CONFIG_KVM_HYPERV
+set_msrpm_base_pa:
+#endif
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+/*
+ * Bits 11:0 of bitmap address are ignored by hardware
+ */
+static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
+{
+ u64 addr = PAGE_ALIGN(pa);
+
+ return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
+ kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
+}
+
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
+{
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
+ return false;
+
+ if (CC(control->asid == 0))
+ return false;
+
+ if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
+ return false;
+
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
+ MSRPM_SIZE)))
+ return false;
+ if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
+ IOPM_SIZE)))
+ return false;
+
+ if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+ !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
+{
+ if (CC(!(save->efer & EFER_SVME)))
+ return false;
+
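+ /* CR0.NW requires CR0.CD, and bits 63:32 of CR0 are reserved (MBZ). */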
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
+ return false;
+
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+ return false;
+
+ /*
+ * These checks are also performed by KVM_SET_SREGS,
+ * except that EFER.LMA is not checked by SVM against
+ * CR0.PG && EFER.LME.
+ */
+ if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
+ if (CC(!(save->cr4 & X86_CR4_PAE)) ||
+ CC(!(save->cr0 & X86_CR0_PE)) ||
+ CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
+ return false;
+ }
+
+ /* Note, SVM doesn't have any additional restrictions on CR4. */
+ if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
+ return false;
+
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
+ return false;
+
+ return true;
+}
+
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
+
+ return __nested_vmcb_check_save(vcpu, save);
+}
+
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
+
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
+
+static void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->next_rip = from->next_rip;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
+
+#ifdef CONFIG_KVM_HYPERV
+ /* Hyper-V extensions (Enlightened VMCB) */
+ if (kvm_hv_hypercall_enabled(vcpu)) {
+ to->clean = from->clean;
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
+ }
+#endif
+}
+
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
+{
+ __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
+}
+
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only the fields that are validated; caching them avoids
+ * TOCTOU (time-of-check/time-of-use) races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
+}
+
+/*
+ * Synchronize fields that are written by the processor, so that
+ * they can be copied back into the vmcb12.
+ */
+void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
+{
+ u32 mask;
+
+ svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
+ svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;
+
+ /* Only a few fields of int_ctl are written by the processor. */
+ mask = V_IRQ_MASK | V_TPR_MASK;
+ /*
+ * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+ * virtual interrupts in order to request an interrupt window, as KVM
+ * has usurped vmcb02's int_ctl. If an interrupt window opens before
+ * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+ * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+ * int_ctl (because it was never recognized while L2 was running).
+ */
+ if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+ !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
+ mask &= ~V_IRQ_MASK;
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
+ if (nested_vnmi_enabled(svm))
+ mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
+ svm->nested.ctl.int_ctl &= ~mask;
+ svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
+}
+
+/*
+ * Transfer any event that L0 or L1 wanted to inject into L2 to
+ * EXIT_INT_INFO.
+ */
+static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
+ struct vmcb *vmcb12)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 exit_int_info = 0;
+ unsigned int nr;
+
+ if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.vector;
+ exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
+
+ if (vcpu->arch.exception.has_error_code) {
+ exit_int_info |= SVM_EVTINJ_VALID_ERR;
+ vmcb12->control.exit_int_info_err =
+ vcpu->arch.exception.error_code;
+ }
+
+ } else if (vcpu->arch.nmi_injected) {
+ exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ exit_int_info = nr | SVM_EVTINJ_VALID;
+
+ if (vcpu->arch.interrupt.soft)
+ exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
+ else
+ exit_int_info |= SVM_EVTINJ_TYPE_INTR;
+ }
+
+ vmcb12->control.exit_int_info = exit_int_info;
+}
+
+static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ /* Handle pending Hyper-V TLB flush requests */
+ kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled);
+
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync. A partial list of
+ * things to fix before this can be conditional:
+ *
+ * - Flush TLBs for both L1 and L2 remote TLB flush
+ * - Honor L1's request to flush an ASID on nested VMRUN
+ * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
+ * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
+ * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
+ *
+ * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
+ * NPT guest-physical mappings on VMRUN.
+ */
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
+/*
+ * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
+ * if we are emulating VM-Entry into a guest with NPT enabled.
+ */
+static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_npt, bool reload_pdptrs)
+{
+ if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3)))
+ return -EINVAL;
+
+ if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, cr3)))
+ return -EINVAL;
+
+ vcpu->arch.cr3 = cr3;
+
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
+
+ if (!nested_npt)
+ kvm_mmu_new_pgd(vcpu, cr3);
+
+ return 0;
+}
+
+void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
+{
+ if (!svm->nested.vmcb02.ptr)
+ return;
+
+ /* FIXME: merge g_pat from vmcb01 and vmcb12. */
+ svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
+}
+
+static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
+{
+ bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(vmcb02, VMCB_NPT);
+
+ /* Load the nested guest state */
+ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
+ new_vmcb12 = true;
+ svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
+ svm->nested.force_msr_bitmap_recalc = true;
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
+ }
+
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
+ }
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) {
+ vmcb02->save.s_cet = vmcb12->save.s_cet;
+ vmcb02->save.isst_addr = vmcb12->save.isst_addr;
+ vmcb02->save.ssp = vmcb12->save.ssp;
+ vmcb_mark_dirty(vmcb02, VMCB_CET);
+ }
+
+ kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+
+ svm_set_efer(vcpu, svm->nested.save.efer);
+
+ svm_set_cr0(vcpu, svm->nested.save.cr0);
+ svm_set_cr4(vcpu, svm->nested.save.cr4);
+
+ svm->vcpu.arch.cr2 = vmcb12->save.cr2;
+
+ kvm_rax_write(vcpu, vmcb12->save.rax);
+ kvm_rsp_write(vcpu, vmcb12->save.rsp);
+ kvm_rip_write(vcpu, vmcb12->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
+
+ /* These bits will be set properly on the first execution when new_vmcb12 is true */
+ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ } else {
+ svm_copy_lbrs(vmcb02, vmcb01);
+ }
+ svm_update_lbrv(&svm->vcpu);
+}
+
+static inline bool is_evtinj_soft(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+ u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ if (type == SVM_EVTINJ_TYPE_SOFT)
+ return true;
+
+ return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
+}
+
+static bool is_evtinj_nmi(u32 evtinj)
+{
+ u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
+
+ if (!(evtinj & SVM_EVTINJ_VALID))
+ return false;
+
+ return type == SVM_EVTINJ_TYPE_NMI;
+}
+
+static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
+ unsigned long vmcb12_rip,
+ unsigned long vmcb12_csbase)
+{
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(vcpu);
+
+ /*
+ * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
+ * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+ */
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+
+ if (vnmi) {
+ if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+ svm->vcpu.arch.nmi_pending++;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+ if (nested_vnmi_enabled(svm))
+ int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+ V_NMI_ENABLE_MASK |
+ V_NMI_BLOCKING_MASK);
+ }
+
+ /* Copied from vmcb01. msrpm_base can be overwritten later. */
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
+
+ /*
+ * Stash vmcb02's counter if the guest hasn't moved past the guilty
+ * instruction; otherwise, reset the counter to '0'.
+ *
+ * To detect whether L2 has made forward progress, track the RIP at
+ * which a bus lock occurred on a per-vmcb12 basis. If RIP has changed,
+ * the guest has clearly made forward progress, so reset
+ * bus_lock_counter to '0' even if it is still '1'. E.g. if a bus lock
+ * happened in L1 before VMRUN, the bus lock firmly happened on an
+ * instruction in the past. Even if vmcb01's counter is still '1'
+ * (because the guilty instruction got patched), the vCPU has clearly
+ * made forward progress and so KVM should reset vmcb02's counter to
+ * '0'.
+ *
+ * If RIP hasn't changed, stash the bus lock counter at nested VMRUN to
+ * prevent the same guilty instruction from triggering a VM-Exit. E.g.
+ * if userspace rate-limits the vCPU, it's entirely possible that L1's
+ * tick interrupt is pending by the time userspace re-runs the vCPU. If
+ * KVM unconditionally cleared the counter on VMRUN, then when L1
+ * re-enters L2, the same instruction would trigger a VM-Exit and the
+ * entire cycle would start over.
+ */
+ if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+ vmcb02->control.bus_lock_counter = 1;
+ else
+ vmcb02->control.bus_lock_counter = 0;
+
+ /* Done at vmrun: asid. */
+
+ /* Also overwritten later if necessary. */
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* nested_cr3. */
+ if (nested_npt_enabled(svm))
+ nested_svm_init_mmu_context(vcpu);
+
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
+
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
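+ /* Combine the L2-owned int_ctl bits from vmcb12 with the L1-owned bits from vmcb01. */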
+ vmcb02->control.int_ctl =
+ (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ /*
+ * next_rip is consumed on VMRUN as the return address pushed on the
+ * stack for injected soft exceptions/interrupts. If nrips is exposed
+ * to L1, take it verbatim from vmcb12. If nrips is supported in
+ * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
+ * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
+ * prior to injecting the event).
+ */
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = svm->nested.ctl.next_rip;
+ else if (boot_cpu_has(X86_FEATURE_NRIPS))
+ vmcb02->control.next_rip = vmcb12_rip;
+
+ svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
+ if (is_evtinj_soft(vmcb02->control.event_inj)) {
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = vmcb12_csbase;
+ svm->soft_int_old_rip = vmcb12_rip;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ svm->soft_int_next_rip = svm->nested.ctl.next_rip;
+ else
+ svm->soft_int_next_rip = vmcb12_rip;
+ }
+
+ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
+ pause_count12 = svm->nested.ctl.pause_filter_count;
+ else
+ pause_count12 = 0;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
+ pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
+ else
+ pause_thresh12 = 0;
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
+ } else {
+ /* start from host values otherwise */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
+ }
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take effect.
+ */
+ recalc_intercepts(svm);
+}
+
+static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ /*
+ * Some VMCB state is shared between L1 and L2 and thus has to be
+ * moved at the time of nested vmrun and vmexit.
+ *
+ * VMLOAD/VMSAVE state would also belong in this category, but KVM
+ * always performs VMLOAD and VMSAVE from the VMCB01.
+ */
+ to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
+}
+
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
+ struct vmcb *vmcb12, bool from_vmrun)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ trace_kvm_nested_vmenter(svm->vmcb->save.rip,
+ vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl,
+ vmcb12->control.nested_cr3,
+ vmcb12->save.cr3,
+ KVM_ISA_SVM);
+
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
+
+ svm->nested.vmcb12_gpa = vmcb12_gpa;
+
+ WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
+
+ nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
+ nested_vmcb02_prepare_save(svm, vmcb12);
+
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
+ nested_npt_enabled(svm), from_vmrun);
+ if (ret)
+ return ret;
+
+ if (!from_vmrun)
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ svm_set_gif(svm, true);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
+ nested_svm_hv_update_vm_vp_ids(vcpu);
+
+ return 0;
+}
+
+int nested_svm_vmrun(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+
+ if (!svm->nested.hsave_msr) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (is_smm(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* This fails when VP assist page is enabled but the supplied GPA is bogus */
+ ret = kvm_hv_verify_vp_assist(vcpu);
+ if (ret) {
+ kvm_inject_gp(vcpu, 0);
+ return ret;
+ }
+
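+ /* VMRUN takes the guest-physical address of vmcb12 in RAX. */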
+ vmcb12_gpa = svm->vmcb->save.rax;
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
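+ /* Advance RIP past VMRUN; this is also the value returned on success. */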
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ vmcb12 = map.hva;
+
+ if (WARN_ON_ONCE(!svm->nested.initialized))
+ return -EINVAL;
+
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
+
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
+ vmcb12->control.exit_code = SVM_EXIT_ERR;
+ vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_info_1 = 0;
+ vmcb12->control.exit_info_2 = 0;
+ goto out;
+ }
+
+ /*
+ * Since vmcb01 is not in use, we can use it to store some of the L1
+ * state.
+ */
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
+
+ if (!npt_enabled)
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
+
+ svm->nested.nested_run_pending = 1;
+
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
+ goto out_exit_err;
+
+ if (nested_svm_merge_msrpm(vcpu))
+ goto out;
+
+out_exit_err:
+ svm->nested.nested_run_pending = 0;
+ svm->nmi_l1_to_l2 = false;
+ svm->soft_int_injected = false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+out:
+ kvm_vcpu_unmap(vcpu, &map);
+
+ return ret;
+}
+
+/* Copy state save area fields which are handled by VMRUN */
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save)
+{
+ to_save->es = from_save->es;
+ to_save->cs = from_save->cs;
+ to_save->ss = from_save->ss;
+ to_save->ds = from_save->ds;
+ to_save->gdtr = from_save->gdtr;
+ to_save->idtr = from_save->idtr;
+ to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
+ to_save->efer = from_save->efer;
+ to_save->cr0 = from_save->cr0;
+ to_save->cr3 = from_save->cr3;
+ to_save->cr4 = from_save->cr4;
+ to_save->rax = from_save->rax;
+ to_save->rsp = from_save->rsp;
+ to_save->rip = from_save->rip;
+ to_save->cpl = 0;
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ to_save->s_cet = from_save->s_cet;
+ to_save->isst_addr = from_save->isst_addr;
+ to_save->ssp = from_save->ssp;
+ }
+}
+
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int rc;
+
+ rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(vcpu);
+ svm->nested.vmcb12_gpa = 0;
+ WARN_ON_ONCE(svm->nested.nested_run_pending);
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ /* in case we halted in L2 */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ /* Give the current vmcb to the guest */
+
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
+ vmcb12->save.efer = svm->vcpu.arch.efer;
+ vmcb12->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb12->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb12->save.cr2 = vmcb02->save.cr2;
+ vmcb12->save.cr4 = svm->vcpu.arch.cr4;
+ vmcb12->save.rflags = kvm_get_rflags(vcpu);
+ vmcb12->save.rip = kvm_rip_read(vcpu);
+ vmcb12->save.rsp = kvm_rsp_read(vcpu);
+ vmcb12->save.rax = kvm_rax_read(vcpu);
+ vmcb12->save.dr7 = vmcb02->save.dr7;
+ vmcb12->save.dr6 = svm->vcpu.arch.dr6;
+ vmcb12->save.cpl = vmcb02->save.cpl;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
+ vmcb12->save.s_cet = vmcb02->save.s_cet;
+ vmcb12->save.isst_addr = vmcb02->save.isst_addr;
+ vmcb12->save.ssp = vmcb02->save.ssp;
+ }
+
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
+
+ if (vmcb12->control.exit_code != SVM_EXIT_ERR)
+ nested_save_pending_event_to_vmcb12(svm, vmcb12);
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
+
+ vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
+ vmcb12->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+ }
+
+ /*
+ * Invalidate bus_lock_rip unless KVM is still waiting for the guest
+ * to make forward progress before re-enabling bus lock detection.
+ */
+ if (!vmcb02->control.bus_lock_counter)
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
+ nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+
+ kvm_nested_vmexit_handle_ibrs(vcpu);
+
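+ /* Switch back to vmcb01 now that L2's state has been saved to vmcb12. */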
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ /*
+ * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+ *
+ * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
+ * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+ * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+ * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
+ * KVM re-requests an interrupt window if necessary, which implicitly
+ * copies these bits from vmcb02 to vmcb01.
+ *
+ * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+ * is stored in vmcb02, but its value doesn't need to be copied from/to
+ * vmcb01 because it is copied from/to the virtual APIC's TPR register
+ * on each VM entry/exit.
+ *
+ * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+ * V_GIF. However, GIF is architecturally cleared on each VM exit, thus
+ * there is no need to copy V_GIF from vmcb02 to vmcb01.
+ */
+ if (!nested_exit_on_intr(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
+ svm_copy_lbrs(vmcb12, vmcb02);
+ else
+ svm_copy_lbrs(vmcb01, vmcb02);
+
+ svm_update_lbrv(vcpu);
+
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+ vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ if (vcpu->arch.nmi_pending) {
+ vcpu->arch.nmi_pending--;
+ vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+ } else {
+ vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+ }
+ }
+
+ /*
+ * On vmexit, GIF is cleared, so no event can be
+ * injected into L1.
+ */
+ svm_set_gif(svm, false);
+ vmcb01->control.exit_int_info = 0;
+
+ svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+ }
+
+ if (kvm_caps.has_tsc_control &&
+ vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ svm_write_tsc_multiplier(vcpu);
+ }
+
+ svm->nested.ctl.nested_cr3 = 0;
+
+ /*
+ * Restore processor state that had been saved in vmcb01
+ */
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
+
+ svm->vcpu.arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(&svm->vcpu);
+
+ trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
+ vmcb12->control.exit_info_1,
+ vmcb12->control.exit_info_2,
+ vmcb12->control.exit_int_info,
+ vmcb12->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ kvm_vcpu_unmap(vcpu, &map);
+
+ nested_svm_transition_tlb_flush(vcpu);
+
+ nested_svm_uninit_mmu_context(vcpu);
+
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
+ if (rc)
+ return 1;
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ /*
+ * If we are here following the completion of a VMRUN that
+ * is being single-stepped, queue the pending #DB intercept
+ * right now so that it can be accounted for before we execute
+ * L1's next instruction.
+ */
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
+ kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+
+ /*
+ * Un-inhibit the AVIC right away, so that other vCPUs can start
+ * to benefit from it right away.
+ */
+ if (kvm_apicv_activated(vcpu->kvm))
+ __kvm_vcpu_update_apicv(vcpu);
+
+ return 0;
+}
+
+static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
+}
+
+int svm_allocate_nested(struct vcpu_svm *svm)
+{
+ struct page *vmcb02_page;
+
+ if (svm->nested.initialized)
+ return 0;
+
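+ /* The VMCB backing page must come from an SNP-safe allocation; see snp_safe_alloc_page(). */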
+ vmcb02_page = snp_safe_alloc_page();
+ if (!vmcb02_page)
+ return -ENOMEM;
+ svm->nested.vmcb02.ptr = page_address(vmcb02_page);
+ svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
+
+ svm->nested.msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->nested.msrpm)
+ goto err_free_vmcb02;
+
+ svm->nested.initialized = true;
+ return 0;
+
+err_free_vmcb02:
+ __free_page(vmcb02_page);
+ return -ENOMEM;
+}
+
+void svm_free_nested(struct vcpu_svm *svm)
+{
+ if (!svm->nested.initialized)
+ return;
+
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ svm_vcpu_free_msrpm(svm->nested.msrpm);
+ svm->nested.msrpm = NULL;
+
+ __free_page(virt_to_page(svm->nested.vmcb02.ptr));
+ svm->nested.vmcb02.ptr = NULL;
+
+ /*
+ * When last_vmcb12_gpa matches the current vmcb12 gpa,
+ * some vmcb12 fields are not loaded if they are marked clean
+ * in the vmcb12, since in this case they are up to date already.
+ *
+ * When the vmcb02 is freed, this optimization becomes invalid.
+ */
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ svm->nested.initialized = false;
+}
+
+void svm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu)) {
+ svm->nested.nested_run_pending = 0;
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+
+ leave_guest_mode(vcpu);
+
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ nested_svm_uninit_mmu_context(vcpu);
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ gpa_t base = svm->nested.ctl.msrpm_base_pa;
+ int write, bit_nr;
+ u8 value, mask;
+ u32 msr;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ bit_nr = svm_msrpm_bit_nr(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+
+ if (bit_nr < 0)
+ return NESTED_EXIT_DONE;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+ &value, sizeof(value)))
+ return NESTED_EXIT_DONE;
+
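+ /* Each MSR has a two-bit pair in the MSRPM: the low bit covers reads, the high bit writes. */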
+ mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.ctl.iopm_base_pa + (port / 8);
+ start_bit = port % 8;
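+ /* An access that straddles a byte boundary in the IOPM requires reading two bytes. */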
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ /*
+ * Host-intercepted exceptions have been checked already in
+ * nested_svm_exit_special. There is nothing to do here;
+ * the vmexit is injected by svm_check_nested_events.
+ */
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (to_svm(vcpu)->vmcb->save.cpl) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
+}
+
+static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
+ vmcb->control.exit_code_hi = 0;
+
+ if (ex->has_error_code)
+ vmcb->control.exit_info_1 = ex->error_code;
+
+ /*
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
+ */
+ if (ex->vector == PF_VECTOR) {
+ if (ex->has_payload)
+ vmcb->control.exit_info_2 = ex->payload;
+ else
+ vmcb->control.exit_info_2 = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ /* See kvm_check_and_inject_events(). */
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+ } else {
+ WARN_ON(ex->has_payload);
+ }
+
+ nested_svm_vmexit(svm);
+}
+
+static inline bool nested_exit_on_init(struct vcpu_svm *svm)
+{
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
+}
+
+static int svm_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = svm->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
+
+ if (lapic_in_kernel(vcpu) &&
+ test_bit(KVM_APIC_INIT, &apic->pending_events)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_init(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
+ return 0;
+ }
+
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ nested_svm_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ return 0;
+ }
+
+#ifdef CONFIG_KVM_SMM
+ if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_smi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
+ return 0;
+ }
+#endif
+
+ if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(svm))
+ return 0;
+ nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_intr(svm))
+ return 0;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
+ return 0;
+ }
+
+ return 0;
+}
+
+int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_NPF:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+
+ if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
+ excp_bits)
+ return NESTED_EXIT_HOST;
+ else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ svm->vcpu.arch.apf.host_apf_flags)
+ /* Trap async PF even if not shadowing */
+ return NESTED_EXIT_HOST;
+ break;
+ }
+ case SVM_EXIT_VMMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu))
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ svm_write_tsc_multiplier(vcpu);
+}
+
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->next_rip = from->next_rip;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
+}
+
+static int svm_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = KVM_STATE_NESTED_FORMAT_SVM,
+ .size = sizeof(kvm_state),
+ };
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+
+ if (!vcpu)
+ return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
+
+ svm = to_svm(vcpu);
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ /* First fill in the header and copy it out. */
+ if (is_guest_mode(vcpu)) {
+ kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
+ kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (svm->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+
+ if (gif_set(svm))
+ kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (!is_guest_mode(vcpu))
+ goto out;
+
+ /*
+ * Copy over the full size of the VMCB rather than just the size
+ * of the structs.
+ */
+ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
+ return -EFAULT;
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
+ return -EFAULT;
+
+ if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
+ sizeof(user_vmcb->save)))
+ return -EFAULT;
+out:
+ return kvm_state.size;
+}
+
+static int svm_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb __user *user_vmcb = (struct vmcb __user *)
+ &user_kvm_nested_state->data.svm[0];
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
+ unsigned long cr0;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
+ if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
+ return -EINVAL;
+
+ if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING |
+ KVM_STATE_NESTED_GIF_SET))
+ return -EINVAL;
+
+ /*
+ * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
+ * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
+ */
+ if (!(vcpu->arch.efer & EFER_SVME)) {
+ /* GIF=1 and no guest mode are required if SVME=0. */
+ if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
+ return -EINVAL;
+ }
+
+ /* SMM temporarily disables SVM, so we cannot be in guest mode. */
+ if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
+ return -EINVAL;
+ if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
+ return -EINVAL;
+
+ ctl = memdup_user(&user_vmcb->control, sizeof(*ctl));
+ if (IS_ERR(ctl))
+ return PTR_ERR(ctl);
+
+ save = memdup_user(&user_vmcb->save, sizeof(*save));
+ if (IS_ERR(save)) {
+ kfree(ctl);
+ return PTR_ERR(save);
+ }
+
+ ret = -EINVAL;
+ __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
+ goto out_free;
+
+ /*
+ * Processor state contains L2 state. Check that it is
+ * valid for guest mode (see nested_vmcb_check_save).
+ */
+ cr0 = kvm_read_cr0(vcpu);
+ if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
+ goto out_free;
+
+ /*
+ * Validate host state saved from before VMRUN (see
+ * nested_svm_check_permissions).
+ */
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
+ if (!(save->cr0 & X86_CR0_PG) ||
+ !(save->cr0 & X86_CR0_PE) ||
+ (save->rflags & X86_EFLAGS_VM) ||
+ !__nested_vmcb_check_save(vcpu, &save_cached))
+ goto out_free;
+
+ /*
+ * All checks done, we can enter guest mode. Userspace provides
+ * vmcb12.control, which will be combined with L1 and stored into
+ * vmcb02, and the L1 save state which we store in vmcb01.
+ * If needed, L2 registers are moved from the current VMCB to vmcb02.
+ */
+
+ if (is_guest_mode(vcpu))
+ svm_leave_nested(vcpu);
+ else
+ svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
+
+ svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
+
+ svm->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
+
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
+
+ svm_switch_vmcb(svm, &svm->nested.vmcb02);
+ nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set before the nested state was loaded,
+ * thus the MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+ svm->nested.force_msr_bitmap_recalc = true;
+
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
+}
+
+static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ if (WARN_ON(!is_guest_mode(vcpu)))
+ return true;
+
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state, which can lead to loading the wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
+ return false;
+
+ if (!nested_svm_merge_msrpm(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
+ }
+
+ if (kvm_hv_verify_vp_assist(vcpu))
+ return false;
+
+ return true;
+}
+
+struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
+ .is_exception_vmexit = nested_svm_is_exception_vmexit,
+ .check_events = svm_check_nested_events,
+ .triple_fault = nested_svm_triple_fault,
+ .get_nested_state_pages = svm_get_nested_state_pages,
+ .get_state = svm_get_nested_state,
+ .set_state = svm_set_nested_state,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
+};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
new file mode 100644
index 000000000000..bc062285fbf5
--- /dev/null
+++ b/arch/x86/kvm/svm/pmu.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+#include "svm.h"
+
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
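+/* Bounds-check the counter index and clamp speculation before indexing the GP counter array. */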
+static struct kvm_pmc *amd_pmu_get_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ unsigned int num_counters = pmu->nr_arch_gp_counters;
+
+ if (pmc_idx >= num_counters)
+ return NULL;
+
+ return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)];
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ unsigned int idx;
+
+ if (!pmu->version)
+ return NULL;
+
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ return NULL;
+ /*
+ * Each PMU counter has a pair of CTL and CTR MSRs. CTLn
+ * MSRs (accessed via EVNTSEL) are even, CTRn MSRs are odd.
+ */
+ idx = (unsigned int)((msr - MSR_F15H_PERF_CTL0) / 2);
+ if (!(msr & 0x1) != (type == PMU_TYPE_EVNTSEL))
+ return NULL;
+ break;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ idx = msr - MSR_K7_EVNTSEL0;
+ break;
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ idx = msr - MSR_K7_PERFCTR0;
+ break;
+ default:
+ return NULL;
+ }
+
+ return amd_pmu_get_pmc(pmu, idx);
+}
+
+static int amd_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (idx >= pmu->nr_arch_gp_counters)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
+{
+ return amd_pmu_get_pmc(vcpu_to_pmu(vcpu), idx);
+}
+
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+
+ return pmc;
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ switch (msr) {
+ case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
+ return pmu->version > 0;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE);
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
+ return pmu->version > 1;
+ default:
+ if (msr > MSR_F15H_PERF_CTR5 &&
+ msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters)
+ return pmu->version > 1;
+ break;
+ }
+
+ return amd_msr_idx_to_pmc(vcpu, msr);
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ msr_info->data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ msr_info->data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ pmc_write_counter(pmc, data);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel) {
+ pmc->eventsel = data;
+ kvm_pmu_request_counter_reprogram(pmc);
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ union cpuid_0x80000022_ebx ebx;
+
+ pmu->version = 1;
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+ pmu->version = 2;
+ /*
+ * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
+ * CPUID entry is guaranteed to be non-NULL.
+ */
+ BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
+ x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
+ ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
+ pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
+ } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ } else {
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ }
+
+ pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters,
+ kvm_pmu_cap.num_counters_gp);
+
+ if (pmu->version > 1) {
+ pmu->global_ctrl_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd;
+ }
+
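+ /* AMD general-purpose counters are 48 bits wide. */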
+ pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1;
+ pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
+ /* Not applicable to AMD; clear them to prevent any fallout. */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE);
+
+ for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
+ .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .check_rdpmc_early = amd_check_rdpmc_early,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
+ .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
+};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
new file mode 100644
index 000000000000..f59c65abe3cf
--- /dev/null
+++ b/arch/x86/kvm/svm/sev.c
@@ -0,0 +1,5169 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM-SEV support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/psp.h>
+#include <linux/psp-sev.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/misc_cgroup.h>
+#include <linux/processor.h>
+#include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
+
+#include <asm/pkru.h>
+#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/sev.h>
+
+#include "mmu.h"
+#include "x86.h"
+#include "svm.h"
+#include "svm_ops.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define GHCB_VERSION_MAX 2ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
+
+/* enable/disable SEV support */
+static bool sev_enabled = true;
+module_param_named(sev, sev_enabled, bool, 0444);
+
+/* enable/disable SEV-ES support */
+static bool sev_es_enabled = true;
+module_param_named(sev_es, sev_es_enabled, bool, 0444);
+
+/* enable/disable SEV-SNP support */
+static bool sev_snp_enabled = true;
+module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
+
+/* enable/disable SEV-ES DebugSwap support */
+static bool sev_es_debug_swap_enabled = true;
+module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
+static u64 sev_supported_vmsa_features;
+
+static unsigned int nr_ciphertext_hiding_asids;
+module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444);
+
+#define AP_RESET_HOLD_NONE 0
+#define AP_RESET_HOLD_NAE_EVENT 1
+#define AP_RESET_HOLD_MSR_PROTO 2
+
+/*
+ * SEV-SNP policy bits that can be supported by KVM. These are either policy
+ * bits for which KVM implements support, or bits that require no KVM-side
+ * support in order to be enforced.
+ */
+#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET | \
+ SNP_POLICY_MASK_CXL_ALLOW | \
+ SNP_POLICY_MASK_MEM_AES_256_XTS | \
+ SNP_POLICY_MASK_RAPL_DIS | \
+ SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
+ SNP_POLICY_MASK_PAGE_SWAP_DISABLE)
+
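+/* Advertised to userspace via the KVM_X86_SNP_POLICY_BITS device attribute. */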
+static u64 snp_supported_policy_bits __ro_after_init;
+
+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
+
+static u8 sev_enc_bit;
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
+unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned int max_sev_es_asid;
+static unsigned int min_sev_es_asid;
+static unsigned int max_snp_asid;
+static unsigned int min_snp_asid;
+static unsigned long sev_me_mask;
+static unsigned int nr_asids;
+static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
+
+static int snp_decommission_context(struct kvm *kvm);
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+/* Called with the sev_bitmap_lock held, or on shutdown */
+static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
+{
+ int ret, error = 0;
+ unsigned int asid;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
+ if (asid > max_asid)
+ return -EBUSY;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ /* SNP firmware requires use of WBINVD for ASID recycling. */
+ wbinvd_on_all_cpus();
+
+ if (sev_snp_enabled)
+ ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+ else
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+ sev_snp_enabled ? "-SNP" : "", ret, error);
+
+ return ret;
+}
+
+static inline bool is_mirroring_enc_context(struct kvm *kvm)
+{
+ return !!to_kvm_sev_info(kvm)->enc_context_owner;
+}
+
+static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+
+ return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
+}
+
+static bool snp_is_secure_tsc_enabled(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
+ !WARN_ON_ONCE(!sev_snp_guest(kvm));
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
+{
+ if (sev_flush_asids(min_asid, max_asid))
+ return false;
+
+ /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
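+	/* Reclaimed ASIDs remain set in sev_asid_bitmap, so the XOR clears them. */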
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ nr_asids);
+ bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
+
+ return true;
+}
+
+static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ return misc_cg_try_charge(type, sev->misc_cg, 1);
+}
+
+static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
+{
+ enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+}
+
+static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type)
+{
+ /*
+	 * SEV-enabled guests must use ASIDs from min_sev_asid to max_sev_asid.
+	 * SEV-ES-enabled guests can use ASIDs from 1 to min_sev_asid - 1.
+ */
+ unsigned int min_asid, max_asid, asid;
+ bool retry = true;
+ int ret;
+
+ if (vm_type == KVM_X86_SNP_VM) {
+ min_asid = min_snp_asid;
+ max_asid = max_snp_asid;
+ } else if (sev->es_active) {
+ min_asid = min_sev_es_asid;
+ max_asid = max_sev_es_asid;
+ } else {
+ min_asid = min_sev_asid;
+ max_asid = max_sev_asid;
+ }
+
+ /*
+ * The min ASID can end up larger than the max if basic SEV support is
+ * effectively disabled by disallowing use of ASIDs for SEV guests.
+ * Similarly for SEV-ES guests the min ASID can end up larger than the
+ * max when ciphertext hiding is enabled, effectively disabling SEV-ES
+ * support.
+ */
+ if (min_asid > max_asid)
+ return -ENOTTY;
+
+ WARN_ON(sev->misc_cg);
+ sev->misc_cg = get_current_misc_cg();
+ ret = sev_misc_cg_try_charge(sev);
+ if (ret) {
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+ }
+
+ mutex_lock(&sev_bitmap_lock);
+
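+	/* If the range is exhausted, try once to recycle reclaimed ASIDs. */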
+again:
+ asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
+ if (asid > max_asid) {
+ if (retry && __sev_recycle_asids(min_asid, max_asid)) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
+ ret = -EBUSY;
+ goto e_uncharge;
+ }
+
+ __set_bit(asid, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ sev->asid = asid;
+ return 0;
+e_uncharge:
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+}
+
+static unsigned int sev_get_asid(struct kvm *kvm)
+{
+ return to_kvm_sev_info(kvm)->asid;
+}
+
+static void sev_asid_free(struct kvm_sev_info *sev)
+{
+ struct svm_cpu_data *sd;
+ int cpu;
+
+ mutex_lock(&sev_bitmap_lock);
+
+ __set_bit(sev->asid, sev_reclaim_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu_ptr(&svm_data, cpu);
+ sd->sev_vmcbs[sev->asid] = NULL;
+ }
+
+ mutex_unlock(&sev_bitmap_lock);
+
+ sev_misc_cg_uncharge(sev);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+}
+
+static void sev_decommission(unsigned int handle)
+{
+ struct sev_data_decommission decommission;
+
+ if (!handle)
+ return;
+
+ decommission.handle = handle;
+ sev_guest_decommission(&decommission, NULL);
+}
+
+/*
+ * Transition a page to hypervisor-owned/shared state in the RMP table. This
+ * should not fail under normal conditions, but if it does, leak the page,
+ * since it will no longer be usable by the host due to RMP protections.
+ */
+static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
+{
+ if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
+ snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Certain page-states, such as Pre-Guest and Firmware pages (as documented
+ * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
+ * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
+ * unless they are reclaimed first.
+ *
+ * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
+ * might not be usable by the host due to being set as immutable or still
+ * being associated with a guest ASID.
+ *
+ * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
+ * converted back to shared, as the page is no longer usable due to RMP
+ * protections, and it's infeasible for the guest to continue on.
+ */
+static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
+{
+ struct sev_data_snp_page_reclaim data = {0};
+ int fw_err, rc;
+
+ data.paddr = __sme_set(pfn << PAGE_SHIFT);
+ rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
+ if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
+ snp_leak_pages(pfn, 1);
+ return -EIO;
+ }
+
+ if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
+ return -EIO;
+
+ return rc;
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate deactivate;
+
+ if (!handle)
+ return;
+
+ deactivate.handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
+ sev_guest_deactivate(&deactivate, NULL);
+ up_read(&sev_deactivate_lock);
+
+ sev_decommission(handle);
+}
+
+/*
+ * This sets up bounce buffers/firmware pages to handle SNP Guest Request
+ * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
+ * 2.0 specification for more details.
+ *
+ * Technically, when an SNP Guest Request is issued, the guest will provide its
+ * own request/response pages, which could in theory be passed along directly
+ * to firmware rather than using bounce pages. However, these pages would need
+ * special care:
+ *
+ * - Both pages are from shared guest memory, so they need to be protected
+ * from migration/etc. occurring while firmware reads/writes to them. At a
+ * minimum, this requires elevating the ref counts and potentially needing
+ * an explicit pinning of the memory. This places additional restrictions
+ * on what type of memory backends userspace can use for shared guest
+ * memory since there is some reliance on using refcounted pages.
+ *
+ * - The response page needs to be switched to Firmware-owned[1] state
+ * before the firmware can write to it, which can lead to potential
+ * host RMP #PFs if the guest is misbehaved and hands the host a
+ * guest page that KVM might write to for other reasons (e.g. virtio
+ * buffers/etc.).
+ *
+ * Both of these issues can be avoided completely by using separately-allocated
+ * bounce pages for both the request/response pages and passing those to
+ * firmware instead. So that's what is being set up here.
+ *
+ * Guest requests rely on message sequence numbers to ensure requests are
+ * issued to firmware in the order the guest issues them, so concurrent guest
+ * requests generally shouldn't happen. But a misbehaved guest could issue
+ * concurrent guest requests in theory, so a mutex is used to serialize
+ * access to the bounce buffers.
+ *
+ * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
+ * details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
+ * in the APM for details on the related RMP restrictions.
+ */
+static int snp_guest_req_init(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct page *req_page;
+
+ req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!req_page)
+ return -ENOMEM;
+
+ sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sev->guest_resp_buf) {
+ __free_page(req_page);
+ return -EIO;
+ }
+
+ sev->guest_req_buf = page_address(req_page);
+ mutex_init(&sev->guest_req_mutex);
+
+ return 0;
+}
+
+static void snp_guest_req_cleanup(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ if (sev->guest_resp_buf)
+ snp_free_firmware_page(sev->guest_resp_buf);
+
+ if (sev->guest_req_buf)
+ __free_page(virt_to_page(sev->guest_req_buf));
+
+ sev->guest_req_buf = NULL;
+ sev->guest_resp_buf = NULL;
+}
+
+static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_init *data,
+ unsigned long vm_type)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_platform_init_args init_args = {0};
+ bool es_active = vm_type != KVM_X86_SEV_VM;
+ bool snp_active = vm_type == KVM_X86_SNP_VM;
+ u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
+ int ret;
+
+ if (kvm->created_vcpus)
+ return -EINVAL;
+
+ if (data->flags)
+ return -EINVAL;
+
+ if (!snp_active)
+ valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC;
+
+ if (data->vmsa_features & ~valid_vmsa_features)
+ return -EINVAL;
+
+ if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
+ return -EINVAL;
+
+ /*
+ * KVM supports the full range of mandatory features defined by version
+ * 2 of the GHCB protocol, so default to that for SEV-ES guests created
+ * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1).
+ */
+ if (es_active && !data->ghcb_version)
+ data->ghcb_version = 2;
+
+ if (snp_active && data->ghcb_version < 2)
+ return -EINVAL;
+
+ if (unlikely(sev->active))
+ return -EINVAL;
+
+ sev->active = true;
+ sev->es_active = es_active;
+ sev->vmsa_features = data->vmsa_features;
+ sev->ghcb_version = data->ghcb_version;
+
+ if (snp_active)
+ sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+
+ ret = sev_asid_new(sev, vm_type);
+ if (ret)
+ goto e_no_asid;
+
+ init_args.probe = false;
+ ret = sev_platform_init(&init_args);
+ if (ret)
+ goto e_free_asid;
+
+ if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_free_asid;
+ }
+
+ /* This needs to happen after SEV/SNP firmware initialization. */
+ if (snp_active) {
+ ret = snp_guest_req_init(kvm);
+ if (ret)
+ goto e_free;
+ }
+
+ INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->mirror_vms);
+ sev->need_init = false;
+
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
+
+ return 0;
+
+e_free:
+ free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
+ argp->error = init_args.error;
+ sev_asid_free(sev);
+ sev->asid = 0;
+e_no_asid:
+ sev->vmsa_features = 0;
+ sev->es_active = false;
+ sev->active = false;
+ return ret;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data = {
+ .vmsa_features = 0,
+ .ghcb_version = 0,
+ };
+ unsigned long vm_type;
+
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return -EINVAL;
+
+ vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
+
+ /*
+ * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
+ * continue to only ever support the minimal GHCB protocol version.
+ */
+ if (vm_type == KVM_X86_SEV_ES_VM)
+ data.ghcb_version = GHCB_VERSION_MIN;
+
+ return __sev_guest_init(kvm, argp, &data, vm_type);
+}
+
+static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data;
+
+ if (!to_kvm_sev_info(kvm)->need_init)
+ return -EINVAL;
+
+ if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
+ kvm->arch.vm_type != KVM_X86_SNP_VM)
+ return -EINVAL;
+
+ if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
+ return -EFAULT;
+
+ return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ unsigned int asid = sev_get_asid(kvm);
+ struct sev_data_activate activate;
+ int ret;
+
+ /* activate ASID on the given handle */
+ activate.handle = handle;
+ activate.asid = asid;
+ ret = sev_guest_activate(&activate, error);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
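+	/* CLASS(fd) pins the struct file and auto-releases it on scope exit. */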
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ return sev_issue_cmd_external_user(fd_file(f), id, data, error);
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_launch_start start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ memset(&start, 0, sizeof(start));
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob))
+ return PTR_ERR(dh_blob);
+
+ start.dh_cert_address = __sme_set(__pa(dh_blob));
+ start.dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start.session_address = __sme_set(__pa(session_blob));
+ start.session_len = params.session_len;
+ }
+
+ start.handle = params.handle;
+ start.policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ /* return handle to userspace */
+ params.handle = start.handle;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start.handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->policy = params.policy;
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+ return ret;
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ unsigned int flags)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ unsigned long npages, size;
+ int npinned;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+ int ret;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return ERR_PTR(-EINVAL);
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
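+	/* e.g. a 2-byte buffer starting at the last byte of a page spans 2 pages. */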
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (WARN_ON_ONCE(npages > INT_MAX))
+ return ERR_PTR(-EINVAL);
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
+ else
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
+
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ /* Pin the user virtual address. */
+ npinned = pin_user_pages_fast(uaddr, npages, flags, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ unpin_user_pages(pages, npinned);
+
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ unpin_user_pages(pages, npages);
+ kvfree(pages);
+ to_kvm_sev_info(kvm)->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 ||
+ pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_local_page(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_local(page_virtual);
+ cond_resched();
+ }
+}
+
+static void sev_writeback_caches(struct kvm *kvm)
+{
+ /*
+ * Ensure that all dirty guest tagged cache entries are written back
+ * before releasing the pages back to the system for use. CLFLUSH will
+ * not do this without SME_COHERENT, and flushing many cache lines
+ * individually is slower than blasting WBINVD for large VMs, so issue
+ * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+ * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+ * VM's ASID.
+ *
+ * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+ * would clear the mask when flushing caches, but doing so requires
+ * serializing multiple calls and having responding CPUs (to the IPI)
+ * mark themselves as still running if they are running (or about to
+ * run) a vCPU for the VM.
+ *
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
+ */
+ wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE);
+ if (IS_ERR(inpages))
+ return PTR_ERR(inpages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ data.reserved = 0;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data.len = len;
+ data.address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+ return ret;
+}
+
+static int sev_es_sync_vmsa(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+ struct sev_es_save_area *save = svm->sev_es.vmsa;
+ struct xregs_state *xsave;
+ const u8 *s;
+ u8 *d;
+ int i;
+
+ /* Check some debug related fields before encrypting the VMSA */
+ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
+ return -EINVAL;
+
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
+
+	/* Sync registers */
+ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
+ save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
+ save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX];
+ save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP];
+ save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP];
+ save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI];
+ save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI];
+#ifdef CONFIG_X86_64
+ save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8];
+ save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9];
+ save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10];
+ save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11];
+ save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12];
+ save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13];
+ save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14];
+ save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15];
+#endif
+ save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP];
+
+ /* Sync some non-GPR registers before encrypting */
+ save->xcr0 = svm->vcpu.arch.xcr0;
+ save->pkru = svm->vcpu.arch.pkru;
+ save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
+
+ save->sev_features = sev->vmsa_features;
+
+ /*
+ * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
+ * breaking older measurements.
+ */
+ if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
+ xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
+ save->x87_dp = xsave->i387.rdp;
+ save->mxcsr = xsave->i387.mxcsr;
+ save->x87_ftw = xsave->i387.twd;
+ save->x87_fsw = xsave->i387.swd;
+ save->x87_fcw = xsave->i387.cwd;
+ save->x87_fop = xsave->i387.fop;
+ save->x87_ds = 0;
+ save->x87_cs = 0;
+ save->x87_rip = xsave->i387.rip;
+
+ for (i = 0; i < 8; i++) {
+ /*
+ * The format of the x87 save area is undocumented and
+ * definitely not what you would expect. It consists of
+ * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
+ * area with bytes 8-9 of each register.
+ */
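+			/* Bytes 0-7 of ST(i) -> fpreg_x87[i*8]; bytes 8-9 -> fpreg_x87[64 + i*2]. */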
+ d = save->fpreg_x87 + i * 8;
+ s = ((u8 *)xsave->i387.st_space) + i * 16;
+ memcpy(d, s, 8);
+ save->fpreg_x87[64 + i * 2] = s[8];
+ save->fpreg_x87[64 + i * 2 + 1] = s[9];
+ }
+ memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
+
+ s = get_xsave_addr(xsave, XFEATURE_YMM);
+ if (s)
+ memcpy(save->fpreg_ymm, s, 256);
+ else
+ memset(save->fpreg_ymm, 0, 256);
+ }
+
+ pr_debug("Virtual Machine Save Area (VMSA):\n");
+ print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
+
+ return 0;
+}
+
+static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ int *error)
+{
+ struct sev_data_launch_update_vmsa vmsa;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (vcpu->guest_debug) {
+ pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
+ return -EINVAL;
+ }
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
+	 * the VMSA memory content (i.e. it will write the same memory region
+ * with the guest's key), so invalidate it first.
+ */
+ clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
+
+ vmsa.reserved = 0;
+ vmsa.handle = to_kvm_sev_info(kvm)->handle;
+ vmsa.address = __sme_pa(svm->sev_es.vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ vcpu->arch.guest_state_protected = true;
+
+ /*
+	 * SEV-ES guests mandate LBR Virtualization to be _always_ ON. Enable it
+	 * only after setting guest_state_protected, because KVM_SET_MSRS allows
+	 * dynamic toggling of LBRV (for performance reasons) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ if (!sev_es_guest(kvm))
+ return -ENOTTY;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = mutex_lock_killable(&vcpu->mutex);
+ if (ret)
+ return ret;
+
+ ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
+
+ mutex_unlock(&vcpu->mutex);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = u64_to_user_ptr(argp->data);
+ struct sev_data_launch_measure data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = u64_to_user_ptr(params.uaddr);
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ }
+
+cmd:
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error);
+
+ /*
+	 * If we only queried the blob length, the FW responded with the expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_launch_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error);
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ memset(&data, 0, sizeof(data));
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error);
+ if (ret)
+ return ret;
+
+ params.policy = data.policy;
+ params.state = data.state;
+ params.handle = data.handle;
+
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct sev_data_dbg data;
+
+ data.reserved = 0;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ data.dst_addr = dst;
+ data.src_addr = src;
+ data.len = size;
+
+ return sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ &data, error);
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+	 * It's safe to read more than was asked; the caller should ensure that
+	 * the destination has enough space.
+ */
+ offset = src_paddr & 15;
+ src_paddr = round_down(src_paddr, 16);
+ sz = round_up(sz + offset, 16);
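+	/* e.g. src_paddr = 0x1005, sz = 20: offset = 5, read 32 bytes from 0x1000. */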
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+	/* If the inputs are not 16-byte aligned, use an intermediate buffer. */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ void __user *vaddr,
+ unsigned long dst_paddr,
+ void __user *dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED((unsigned long)vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage), vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+		 * If the source is a kernel buffer, use memcpy(); otherwise use
+		 * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (IS_ERR(src_p))
+ return PTR_ERR(src_p);
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE);
+ if (IS_ERR(dst_p)) {
+ sev_unpin_memory(kvm, src_p, n);
+ return PTR_ERR(dst_p);
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify
+ * the pages; flush the destination too so that future accesses do not
+ * see stale data.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since user buffer may not be page aligned, calculate the
+ * offset within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ (void __user *)vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ (void __user *)dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_launch_secret data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n, i;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /*
+ * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in
+ * place; the cache may contain the data that was written unencrypted.
+ */
+ sev_clflush_pages(pages, n);
+
+ /*
+	 * The secret must be copied into a contiguous memory region; verify
+	 * that the userspace memory pages are contiguous before issuing the command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ memset(&data, 0, sizeof(data));
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data.guest_address = __sme_page_pa(pages[0]) + offset;
+ data.guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_unpin_memory;
+ }
+
+ data.trans_address = __psp_pa(blob);
+ data.trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_unpin_memory:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < n; i++) {
+ set_page_dirty_lock(pages[i]);
+ mark_page_accessed(pages[i]);
+ }
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = u64_to_user_ptr(argp->data);
+ struct sev_data_attestation_report data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ memset(&data, 0, sizeof(data));
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = u64_to_user_ptr(params.uaddr);
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
+ if (!blob)
+ return -ENOMEM;
+
+ data.address = __psp_pa(blob);
+ data.len = params.len;
+ memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error);
+ /*
+	 * If we only queried the blob length, the FW responded with the expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data.len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+ return ret;
+}
+
+/* Userspace wants to query session length. */
+static int
+__sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_start *params)
+{
+ struct sev_data_send_start data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ params->session_len = data.session_len;
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_start data;
+ struct kvm_sev_send_start params;
+ void *amd_certs, *session_data;
+ void *pdh_cert, *plat_certs;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_send_start)))
+ return -EFAULT;
+
+ /* if session_len is zero, userspace wants to query the session length */
+ if (!params.session_len)
+ return __sev_send_start_query_session_length(kvm, argp,
+ &params);
+
+ /* some sanity checks */
+ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
+ !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE)
+ return -EINVAL;
+
+ /* allocate the memory to hold the session data blob */
+ session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT);
+ if (!session_data)
+ return -ENOMEM;
+
+ /* copy the certificate blobs from userspace */
+ pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
+ params.pdh_cert_len);
+ if (IS_ERR(pdh_cert)) {
+ ret = PTR_ERR(pdh_cert);
+ goto e_free_session;
+ }
+
+ plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
+ params.plat_certs_len);
+ if (IS_ERR(plat_certs)) {
+ ret = PTR_ERR(plat_certs);
+ goto e_free_pdh;
+ }
+
+ amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
+ params.amd_certs_len);
+ if (IS_ERR(amd_certs)) {
+ ret = PTR_ERR(amd_certs);
+ goto e_free_plat_cert;
+ }
+
+ /* populate the FW SEND_START field with system physical address */
+ memset(&data, 0, sizeof(data));
+ data.pdh_cert_address = __psp_pa(pdh_cert);
+ data.pdh_cert_len = params.pdh_cert_len;
+ data.plat_certs_address = __psp_pa(plat_certs);
+ data.plat_certs_len = params.plat_certs_len;
+ data.amd_certs_address = __psp_pa(amd_certs);
+ data.amd_certs_len = params.amd_certs_len;
+ data.session_address = __psp_pa(session_data);
+ data.session_len = params.session_len;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
+
+ if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
+ session_data, params.session_len)) {
+ ret = -EFAULT;
+ goto e_free_amd_cert;
+ }
+
+ params.policy = data.policy;
+ params.session_len = data.session_len;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params,
+ sizeof(struct kvm_sev_send_start)))
+ ret = -EFAULT;
+
+e_free_amd_cert:
+ kfree(amd_certs);
+e_free_plat_cert:
+ kfree(plat_certs);
+e_free_pdh:
+ kfree(pdh_cert);
+e_free_session:
+ kfree(session_data);
+ return ret;
+}
+
+/* Userspace wants to query either header or trans length. */
+static int
+__sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_send_update_data *params)
+{
+ struct sev_data_send_update_data data;
+ int ret;
+
+ memset(&data, 0, sizeof(data));
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ params->hdr_len = data.hdr_len;
+ params->trans_len = data.trans_len;
+
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
+ sizeof(struct kvm_sev_send_update_data)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_update_data data;
+ struct kvm_sev_send_update_data params;
+ void *hdr, *trans_data;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_send_update_data)))
+ return -EFAULT;
+
+ /* userspace wants to query either header or trans length */
+ if (!params.trans_len || !params.hdr_len)
+ return __sev_send_update_data_query_lengths(kvm, argp, &params);
+
+ if (!params.trans_uaddr || !params.guest_uaddr ||
+ !params.guest_len || !params.hdr_uaddr)
+ return -EINVAL;
+
+	/* Check if we are crossing a page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, 0);
+ if (IS_ERR(guest_page))
+ return PTR_ERR(guest_page);
+
+ /* allocate memory for header and transport buffer */
+ ret = -ENOMEM;
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL);
+ if (!hdr)
+ goto e_unpin;
+
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL);
+ if (!trans_data)
+ goto e_free_hdr;
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans_data);
+ data.trans_len = params.trans_len;
+
+ /* The SEND_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
+
+ if (ret)
+ goto e_free_trans_data;
+
+ /* copy transport buffer to user space */
+ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
+ trans_data, params.trans_len)) {
+ ret = -EFAULT;
+ goto e_free_trans_data;
+ }
+
+ /* Copy packet header to userspace. */
+ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
+ params.hdr_len))
+ ret = -EFAULT;
+
+e_free_trans_data:
+ kfree(trans_data);
+e_free_hdr:
+ kfree(hdr);
+e_unpin:
+ sev_unpin_memory(kvm, guest_page, n);
+
+ return ret;
+}
+
+static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error);
+}
+
+static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_send_cancel data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error);
+}
+
+static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_receive_start start;
+ struct kvm_sev_receive_start params;
+ int *error = &argp->error;
+ void *session_data;
+ void *pdh_data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+	/* Get the parameters from userspace */
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_receive_start)))
+ return -EFAULT;
+
+ /* some sanity checks */
+ if (!params.pdh_uaddr || !params.pdh_len ||
+ !params.session_uaddr || !params.session_len)
+ return -EINVAL;
+
+ pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len);
+ if (IS_ERR(pdh_data))
+ return PTR_ERR(pdh_data);
+
+ session_data = psp_copy_user_blob(params.session_uaddr,
+ params.session_len);
+ if (IS_ERR(session_data)) {
+ ret = PTR_ERR(session_data);
+ goto e_free_pdh;
+ }
+
+ memset(&start, 0, sizeof(start));
+ start.handle = params.handle;
+ start.policy = params.policy;
+ start.pdh_cert_address = __psp_pa(pdh_data);
+ start.pdh_cert_len = params.pdh_len;
+ start.session_address = __psp_pa(session_data);
+ start.session_len = params.session_len;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start,
+ error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start.handle, error);
+ if (ret) {
+ sev_decommission(start.handle);
+ goto e_free_session;
+ }
+
+ params.handle = start.handle;
+ if (copy_to_user(u64_to_user_ptr(argp->data),
+ &params, sizeof(struct kvm_sev_receive_start))) {
+ ret = -EFAULT;
+ sev_unbind_asid(kvm, start.handle);
+ goto e_free_session;
+ }
+
+ sev->handle = start.handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_data);
+e_free_pdh:
+ kfree(pdh_data);
+
+ return ret;
+}
+
+static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_receive_update_data params;
+ struct sev_data_receive_update_data data;
+ void *hdr = NULL, *trans = NULL;
+ struct page **guest_page;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
+ sizeof(struct kvm_sev_receive_update_data)))
+ return -EFAULT;
+
+ if (!params.hdr_uaddr || !params.hdr_len ||
+ !params.guest_uaddr || !params.guest_len ||
+ !params.trans_uaddr || !params.trans_len)
+ return -EINVAL;
+
+	/* Check if we are crossing a page boundary */
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
+ return -EINVAL;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr))
+ return PTR_ERR(hdr);
+
+ trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto e_free_hdr;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.hdr_address = __psp_pa(hdr);
+ data.hdr_len = params.hdr_len;
+ data.trans_address = __psp_pa(trans);
+ data.trans_len = params.trans_len;
+
+ /* Pin guest memory */
+ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
+ PAGE_SIZE, &n, FOLL_WRITE);
+ if (IS_ERR(guest_page)) {
+ ret = PTR_ERR(guest_page);
+ goto e_free_trans;
+ }
+
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
+ /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
+ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
+ data.guest_address |= sev_me_mask;
+ data.guest_len = params.guest_len;
+ data.handle = to_kvm_sev_info(kvm)->handle;
+
+ ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
+ &argp->error);
+
+ sev_unpin_memory(kvm, guest_page, n);
+
+e_free_trans:
+ kfree(trans);
+e_free_hdr:
+ kfree(hdr);
+
+ return ret;
+}
+
+static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_receive_finish data;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data.handle = to_kvm_sev_info(kvm)->handle;
+ return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
+}
+
+static bool is_cmd_allowed_from_mirror(u32 cmd_id)
+{
+ /*
+	 * Allow mirror VMs to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
+	 * on active mirror VMs. Also allow the debugging and status commands.
+ */
+ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
+ cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
+ cmd_id == KVM_SEV_DBG_ENCRYPT)
+ return true;
+
+ return false;
+}
+
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
+
+ /*
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
+ */
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
+ return -EBUSY;
+
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
+ return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
+}
+
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm);
+
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+}
+
+static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
+{
+ struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
+ struct kvm_sev_info *src = to_kvm_sev_info(src_kvm);
+ struct kvm_vcpu *dst_vcpu, *src_vcpu;
+ struct vcpu_svm *dst_svm, *src_svm;
+ struct kvm_sev_info *mirror;
+ unsigned long i;
+
+ dst->active = true;
+ dst->asid = src->asid;
+ dst->handle = src->handle;
+ dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
+ dst->es_active = src->es_active;
+ dst->vmsa_features = src->vmsa_features;
+
+ src->asid = 0;
+ src->active = false;
+ src->handle = 0;
+ src->pages_locked = 0;
+ src->enc_context_owner = NULL;
+ src->es_active = false;
+
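+	/* Move the entire regions_list from the source to the destination. */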
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+
+ /*
+ * If this VM has mirrors, "transfer" each mirror's refcount of the
+ * source to the destination (this KVM). The caller holds a reference
+ * to the source, so there's no danger of use-after-free.
+ */
+ list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
+ list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
+ kvm_get_kvm(dst_kvm);
+ kvm_put_kvm(src_kvm);
+ mirror->enc_context_owner = dst_kvm;
+ }
+
+ /*
+ * If this VM is a mirror, remove the old mirror from the owners list
+ * and add the new mirror to the list.
+ */
+ if (is_mirroring_enc_context(dst_kvm)) {
+ struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
+
+ list_del(&src->mirror_entry);
+ list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms);
+ }
+
+ kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) {
+ dst_svm = to_svm(dst_vcpu);
+
+ sev_init_vmcb(dst_svm, false);
+
+ if (!dst->es_active)
+ continue;
+
+ /*
+ * Note, the source is not required to have the same number of
+ * vCPUs as the destination when migrating a vanilla SEV VM.
+ */
+ src_vcpu = kvm_get_vcpu(src_kvm, i);
+ src_svm = to_svm(src_vcpu);
+
+ /*
+ * Transfer VMSA and GHCB state to the destination. Nullify and
+ * clear source fields as appropriate, the state now belongs to
+ * the destination.
+ */
+ memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
+ dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
+ dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
+ dst_vcpu->arch.guest_state_protected = true;
+
+ memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es));
+ src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE;
+ src_svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ src_vcpu->arch.guest_state_protected = false;
+ }
+}
+
+static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
+{
+ struct kvm_vcpu *src_vcpu;
+ unsigned long i;
+
+ if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
+ dst->created_vcpus != atomic_read(&dst->online_vcpus))
+ return -EBUSY;
+
+ if (!sev_es_guest(src))
+ return 0;
+
+ if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus))
+ return -EINVAL;
+
+ kvm_for_each_vcpu(i, src_vcpu, src) {
+ if (!src_vcpu->arch.guest_state_protected)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm);
+ struct kvm_sev_info *src_sev, *cg_cleanup_sev;
+ CLASS(fd, f)(source_fd);
+ struct kvm *source_kvm;
+ bool charged = false;
+ int ret;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (!file_is_kvm(fd_file(f)))
+ return -EBADF;
+
+ source_kvm = fd_file(f)->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ return ret;
+
+ if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
+ sev_guest(kvm) || !sev_guest(source_kvm)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ src_sev = to_kvm_sev_info(source_kvm);
+
+ dst_sev->misc_cg = get_current_misc_cg();
+ cg_cleanup_sev = dst_sev;
+ if (dst_sev->misc_cg != src_sev->misc_cg) {
+ ret = sev_misc_cg_try_charge(dst_sev);
+ if (ret)
+ goto out_dst_cgroup;
+ charged = true;
+ }
+
+ ret = kvm_lock_all_vcpus(kvm);
+ if (ret)
+ goto out_dst_cgroup;
+ ret = kvm_lock_all_vcpus(source_kvm);
+ if (ret)
+ goto out_dst_vcpu;
+
+ ret = sev_check_source_vcpus(kvm, source_kvm);
+ if (ret)
+ goto out_source_vcpu;
+
+ /*
+ * Allocate a new have_run_cpus for the destination, i.e. don't copy
+ * the set of CPUs from the source. If a CPU was used to run a vCPU in
+ * the source VM but is never used for the destination VM, then the CPU
+ * can only have cached memory that was accessible to the source VM.
+ */
+ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto out_source_vcpu;
+ }
+
+ sev_migrate_from(kvm, source_kvm);
+ kvm_vm_dead(source_kvm);
+ cg_cleanup_sev = src_sev;
+ ret = 0;
+
+out_source_vcpu:
+ kvm_unlock_all_vcpus(source_kvm);
+out_dst_vcpu:
+ kvm_unlock_all_vcpus(kvm);
+out_dst_cgroup:
+ /* Operates on the source on success, on the destination on failure. */
+ if (charged)
+ sev_misc_cg_uncharge(cg_cleanup_sev);
+ put_misc_cg(cg_cleanup_sev->misc_cg);
+ cg_cleanup_sev->misc_cg = NULL;
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+ return ret;
+}
+
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
+{
+ if (group != KVM_X86_GRP_SEV)
+ return -ENXIO;
+
+ switch (attr) {
+ case KVM_X86_SEV_VMSA_FEATURES:
+ *val = sev_supported_vmsa_features;
+ return 0;
+
+ case KVM_X86_SNP_POLICY_BITS:
+ *val = snp_supported_policy_bits;
+ return 0;
+
+ default:
+ return -ENXIO;
+ }
+}
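+
+/*
+ * Illustrative sketch (not part of this patch): userspace is expected to
+ * query these attributes with the KVM_GET_DEVICE_ATTR ioctl on the /dev/kvm
+ * fd, along the lines of:
+ *
+ *	__u64 bits;
+ *	struct kvm_device_attr attr = {
+ *		.group = KVM_X86_GRP_SEV,
+ *		.attr  = KVM_X86_SNP_POLICY_BITS,
+ *		.addr  = (__u64)&bits,
+ *	};
+ *	ioctl(dev_kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
+ */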
+
+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in a
+ * hypervisor-provided page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_snp_addr data = {};
+ void *context;
+ int rc;
+
+ /* Allocate memory for context page */
+ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+ if (!context)
+ return NULL;
+
+ data.address = __psp_pa(context);
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+ if (rc) {
+ pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
+ rc, argp->error);
+ snp_free_firmware_page(context);
+ return NULL;
+ }
+
+ return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_activate data = {0};
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.asid = sev_get_asid(kvm);
+ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_launch_start start = {0};
+ struct kvm_sev_snp_launch_start params;
+ int rc;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ /* Don't allow userspace to allocate memory for more than 1 SNP context. */
+ if (sev->snp_context)
+ return -EINVAL;
+
+ if (params.flags)
+ return -EINVAL;
+
+ if (params.policy & ~snp_supported_policy_bits)
+ return -EINVAL;
+
+ /* Check for policy bits that must be set */
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
+ return -EINVAL;
+
+ if (snp_is_secure_tsc_enabled(kvm)) {
+ if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz))
+ return -EINVAL;
+
+ start.desired_tsc_khz = kvm->arch.default_tsc_khz;
+ }
+
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
+ start.gctx_paddr = __psp_pa(sev->snp_context);
+ start.policy = params.policy;
+
+ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+ if (rc) {
+ pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ sev->policy = params.policy;
+ sev->fd = argp->sev_fd;
+ rc = snp_bind_asid(kvm, &argp->error);
+ if (rc) {
+ pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ return 0;
+
+e_free_context:
+ snp_decommission_context(kvm);
+
+ return rc;
+}
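+
+/*
+ * Usage sketch (illustrative, not part of this patch): userspace invokes
+ * KVM_SEV_SNP_LAUNCH_START through the KVM_MEMORY_ENCRYPT_OP VM ioctl.
+ * The policy below is only an example; the mandatory reserved bit must be
+ * set per the SNP_POLICY_MASK_RSVD_MBO check above. `vm_fd' and `sev_fd'
+ * are hypothetical fds and error handling is elided.
+ *
+ *	struct kvm_sev_snp_launch_start start = {
+ *		.policy = SNP_POLICY_MASK_RSVD_MBO,
+ *	};
+ *	struct kvm_sev_cmd cmd = {
+ *		.id = KVM_SEV_SNP_LAUNCH_START,
+ *		.data = (__u64)(unsigned long)&start,
+ *		.sev_fd = sev_fd,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
+ */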
+
+struct sev_gmem_populate_args {
+ __u8 type;
+ int sev_fd;
+ int fw_error;
+};
+
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque)
+{
+ struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ int n_private = 0, ret, i;
+ int npages = (1 << order);
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ return -EINVAL;
+
+ for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
+ struct sev_data_snp_launch_update fw_args = {0};
+ bool assigned = false;
+ int level;
+
+ ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = ret ? -EINVAL : -EEXIST;
+ goto err;
+ }
+
+ if (src) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ kunmap_local(vaddr);
+ }
+
+ ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto err;
+
+ n_private++;
+
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
+ if (ret)
+ goto fw_err;
+ }
+
+ return 0;
+
+fw_err:
+ /*
+ * If the firmware command failed, handle the reclaim and cleanup of that
+ * PFN specially vs. prior pages, which can be cleaned up below without
+ * needing to be reclaimed in advance.
+ *
+ * Additionally, when invalid CPUID function entries are detected,
+ * firmware writes the expected values into the page and leaves it
+ * unencrypted so it can be used for debugging and error-reporting.
+ *
+ * Copy this page back into the source buffer so userspace can use it
+ * to report which CPUID leaves/fields failed CPUID validation.
+ */
+ if (!snp_page_reclaim(kvm, pfn + i) &&
+ sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+ sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
+ pr_debug("Failed to write CPUID page back to userspace\n");
+
+ kunmap_local(vaddr);
+ }
+
+ /* pfn + i is hypervisor-owned now, so skip the cleanup below for it. */
+ n_private--;
+
+err:
+ pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
+ __func__, ret, sev_populate_args->fw_error, n_private);
+ for (i = 0; i < n_private; i++)
+ kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+
+ return ret;
+}
+
+static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_gmem_populate_args sev_populate_args = {0};
+ struct kvm_sev_snp_launch_update params;
+ struct kvm_memory_slot *memslot;
+ long npages, count;
+ void __user *src;
+ int ret = 0;
+
+ if (!sev_snp_guest(kvm) || !sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
+ params.gfn_start, params.len, params.type, params.flags);
+
+ if (!params.len || !PAGE_ALIGNED(params.len) || params.flags ||
+ (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
+ return -EINVAL;
+
+ npages = params.len / PAGE_SIZE;
+
+ /*
+ * For each GFN that's being prepared as part of the initial guest
+ * state, the following pre-conditions are verified:
+ *
+ * 1) The backing memslot is a valid private memslot.
+ * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
+ * beforehand.
+ * 3) The PFN of the guest_memfd has not already been set to private
+ * in the RMP table.
+ *
+ * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
+ * faults if there's a race between a fault and an attribute update via
+ * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
+ * here. However, kvm->slots_lock guards against both this as well as
+ * concurrent memslot updates occurring while these checks are being
+ * performed, so use that here to make it easier to reason about the
+ * initial expected state and better guard against unexpected
+ * situations.
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ memslot = gfn_to_memslot(kvm, params.gfn_start);
+ if (!kvm_slot_has_gmem(memslot)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sev_populate_args.sev_fd = argp->sev_fd;
+ sev_populate_args.type = params.type;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ sev_gmem_post_populate, &sev_populate_args);
+ if (count < 0) {
+ argp->error = sev_populate_args.fw_error;
+ pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
+ __func__, count, argp->error);
+ ret = -EIO;
+ } else {
+ params.gfn_start += count;
+ params.len -= count * PAGE_SIZE;
+ if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
+ params.uaddr += count * PAGE_SIZE;
+
+ ret = 0;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
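+
+/*
+ * Usage sketch (illustrative, not part of this patch): because the params
+ * struct is written back with gfn_start/uaddr/len advanced by the count of
+ * pages actually processed, userspace can retry KVM_SEV_SNP_LAUNCH_UPDATE
+ * in a loop until the whole range has been encrypted. Names below are
+ * hypothetical and error handling is elided.
+ *
+ *	struct kvm_sev_snp_launch_update update = {
+ *		.gfn_start = gfn,
+ *		.uaddr = (__u64)(unsigned long)src_buf,
+ *		.len = size,
+ *		.type = KVM_SEV_SNP_PAGE_TYPE_NORMAL,
+ *	};
+ *	struct kvm_sev_cmd cmd = {
+ *		.id = KVM_SEV_SNP_LAUNCH_UPDATE,
+ *		.data = (__u64)(unsigned long)&update,
+ *		.sev_fd = sev_fd,
+ *	};
+ *
+ *	do {
+ *		ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
+ *	} while (!ret && update.len);
+ */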
+
+static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_launch_update data = {};
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.page_type = SNP_PAGE_TYPE_VMSA;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /* Transition the VMSA page to a firmware state. */
+ ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
+ if (ret)
+ return ret;
+
+ /* Issue the SNP command to encrypt the VMSA */
+ data.address = __sme_pa(svm->sev_es.vmsa);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &data, &argp->error);
+ if (ret) {
+ snp_page_reclaim(kvm, pfn);
+
+ return ret;
+ }
+
+ svm->vcpu.arch.guest_state_protected = true;
+ /*
+ * SEV-ES (and thus SNP) guests mandate that LBR Virtualization
+ * be _always_ ON. Enable it only after setting
+ * guest_state_protected, because KVM_SET_MSRS allows dynamic
+ * toggling of LBRV (for performance reasons) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ }
+
+ return 0;
+}
+
+static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct kvm_sev_snp_launch_finish params;
+ struct sev_data_snp_launch_finish *data;
+ void *id_block = NULL, *id_auth = NULL;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (!sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags)
+ return -EINVAL;
+
+ /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
+ ret = snp_launch_update_vmsa(kvm, argp);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ if (params.id_block_en) {
+ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+ if (IS_ERR(id_block)) {
+ ret = PTR_ERR(id_block);
+ goto e_free;
+ }
+
+ data->id_block_en = 1;
+ data->id_block_paddr = __sme_pa(id_block);
+
+ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+ if (IS_ERR(id_auth)) {
+ ret = PTR_ERR(id_auth);
+ goto e_free_id_block;
+ }
+
+ data->id_auth_paddr = __sme_pa(id_auth);
+
+ if (params.auth_key_en)
+ data->auth_key_en = 1;
+ }
+
+ data->vcek_disabled = params.vcek_disabled;
+
+ memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ data->gctx_paddr = __psp_pa(sev->snp_context);
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
+ kfree(id_auth);
+
+e_free_id_block:
+ kfree(id_block);
+
+e_free:
+ kfree(data);
+
+ return ret;
+}
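+
+/*
+ * Usage sketch (illustrative, not part of this patch): finalizing the
+ * launch measures all vCPU VMSAs and locks down the launch measurement;
+ * host_data lands in subsequent attestation reports. Names below are
+ * hypothetical and error handling is elided.
+ *
+ *	struct kvm_sev_snp_launch_finish finish = {};
+ *
+ *	memcpy(finish.host_data, my_host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ *
+ *	struct kvm_sev_cmd cmd = {
+ *		.id = KVM_SEV_SNP_LAUNCH_FINISH,
+ *		.data = (__u64)(unsigned long)&finish,
+ *		.sev_fd = sev_fd,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
+ */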
+
+int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!sev_enabled)
+ return -ENOTTY;
+
+ if (!argp)
+ return 0;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ /* Some memory encryption operations may only be handled by the enc_context_owner. */
+ if (is_mirroring_enc_context(kvm) &&
+ !is_cmd_allowed_from_mirror(sev_cmd.id)) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
+ * allow the use of SNP-specific commands.
+ */
+ if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
+ r = -EPERM;
+ goto out;
+ }
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_ES_INIT:
+ if (!sev_es_enabled) {
+ r = -ENOTTY;
+ goto out;
+ }
+ fallthrough;
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_INIT2:
+ r = sev_guest_init2(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ r = sev_launch_update_vmsa(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_START:
+ r = sev_send_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_UPDATE_DATA:
+ r = sev_send_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_FINISH:
+ r = sev_send_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SEND_CANCEL:
+ r = sev_send_cancel(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_START:
+ r = sev_receive_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_UPDATE_DATA:
+ r = sev_receive_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_RECEIVE_FINISH:
+ r = sev_receive_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_START:
+ r = snp_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_UPDATE:
+ r = snp_launch_update(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_FINISH:
+ r = snp_launch_finish(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int sev_mem_enc_register_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+ if (!region)
+ return -ENOMEM;
+
+ mutex_lock(&kvm->lock);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages,
+ FOLL_WRITE | FOLL_LONGTERM);
+ if (IS_ERR(region->pages)) {
+ ret = PTR_ERR(region->pages);
+ mutex_unlock(&kvm->lock);
+ goto e_free;
+ }
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Let's make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit. Note, this must be done before dropping kvm->lock,
+ * as region and its array of pages can be freed by a different task
+ * once kvm->lock is released.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
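+
+/*
+ * Usage sketch (illustrative, not part of this patch): userspace registers
+ * the memory that may hold guest-encrypted data via the
+ * KVM_MEMORY_ENCRYPT_REG_REGION VM ioctl so that KVM can pin it. Names
+ * below are hypothetical and error handling is elided.
+ *
+ *	struct kvm_enc_region region = {
+ *		.addr = (__u64)(unsigned long)guest_ram,
+ *		.size = guest_ram_size,
+ *	};
+ *
+ *	ioctl(vm_fd, KVM_MEMORY_ENCRYPT_REG_REGION, &region);
+ */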
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+int sev_mem_enc_unregister_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ /* If kvm is mirroring encryption context it isn't responsible for it */
+ if (is_mirroring_enc_context(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ sev_writeback_caches(kvm);
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
+{
+ CLASS(fd, f)(source_fd);
+ struct kvm *source_kvm;
+ struct kvm_sev_info *source_sev, *mirror_sev;
+ int ret;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (!file_is_kvm(fd_file(f)))
+ return -EBADF;
+
+ source_kvm = fd_file(f)->private_data;
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ return ret;
+
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
+ ret = -EINVAL;
+ goto e_unlock;
+ }
+
+ mirror_sev = to_kvm_sev_info(kvm);
+ if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_unlock;
+ }
+
+ /*
+ * The mirror kvm holds an enc_context_owner ref so its asid can't
+ * disappear until we're done with it
+ */
+ source_sev = to_kvm_sev_info(source_kvm);
+ kvm_get_kvm(source_kvm);
+ list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
+
+ /* Set enc_context_owner and copy its encryption context over */
+ mirror_sev->enc_context_owner = source_kvm;
+ mirror_sev->active = true;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->need_init = false;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->mirror_vms);
+ ret = 0;
+
+ /*
+ * Do not copy ap_jump_table: the mirror does not share the same
+ * KVM context as the original, and they may have different
+ * memory views.
+ */
+
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+ return ret;
+}
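+
+/*
+ * Usage sketch (illustrative, not part of this patch): a mirror VM is
+ * created by enabling KVM_CAP_VM_COPY_ENC_CONTEXT_FROM on a freshly
+ * created VM (before any vCPUs exist), passing the fd of the SEV VM to
+ * mirror. Names below are hypothetical and error handling is elided.
+ *
+ *	struct kvm_enable_cap cap = {
+ *		.cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM,
+ *		.args[0] = sev_vm_fd,
+ *	};
+ *
+ *	ioctl(mirror_vm_fd, KVM_ENABLE_CAP, &cap);
+ */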
+
+static int snp_decommission_context(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct sev_data_snp_addr data = {};
+ int ret;
+
+ /* If context is not created then do nothing */
+ if (!sev->snp_context)
+ return 0;
+
+ /* Do the decommission, which will unbind the ASID from the SNP context */
+ data.address = __sme_pa(sev->snp_context);
+ down_write(&sev_deactivate_lock);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ up_write(&sev_deactivate_lock);
+
+ if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
+ return ret;
+
+ snp_free_firmware_page(sev->snp_context);
+ sev->snp_context = NULL;
+
+ return 0;
+}
+
+void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ WARN_ON(!list_empty(&sev->mirror_vms));
+
+ free_cpumask_var(sev->have_run_cpus);
+
+ /*
+ * If this is a mirror VM, remove it from the owner's list of mirrors
+ * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+ * Note, mirror VMs don't support registering encrypted regions.
+ */
+ if (is_mirroring_enc_context(kvm)) {
+ struct kvm *owner_kvm = sev->enc_context_owner;
+
+ mutex_lock(&owner_kvm->lock);
+ list_del(&sev->mirror_entry);
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
+ return;
+ }
+
+ /*
+ * If userspace was terminated before unregistering the memory regions,
+ * then unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ cond_resched();
+ }
+ }
+
+ if (sev_snp_guest(kvm)) {
+ snp_guest_req_cleanup(kvm);
+
+ /*
+ * Decommission handles unbinding of the ASID. If it fails for
+ * some unexpected reason, just leak the ASID.
+ */
+ if (snp_decommission_context(kvm))
+ return;
+ } else {
+ sev_unbind_asid(kvm, sev->handle);
+ }
+
+ sev_asid_free(sev);
+}
+
+void __init sev_set_cpu_caps(void)
+{
+ if (sev_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
+ }
+ if (sev_es_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ }
+ if (sev_snp_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
+ }
+}
+
+static bool is_sev_snp_initialized(void)
+{
+ struct sev_user_data_snp_status *status;
+ struct sev_data_snp_addr buf;
+ bool initialized = false;
+ int ret, error = 0;
+
+ status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
+ if (!status)
+ return false;
+
+ buf.address = __psp_pa(status);
+ ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
+ if (ret) {
+ pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ goto out;
+ }
+
+ initialized = !!status->state;
+
+out:
+ snp_free_firmware_page(status);
+
+ return initialized;
+}
+
+void __init sev_hardware_setup(void)
+{
+ unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ struct sev_platform_init_args init_args = {0};
+ bool sev_snp_supported = false;
+ bool sev_es_supported = false;
+ bool sev_supported = false;
+
+ if (!sev_enabled || !npt_enabled || !nrips)
+ goto out;
+
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation. Ditto for flushing by ASID, as SEV
+ * guests are bound to a single ASID, i.e. KVM can't rotate to a new
+ * ASID to effect a TLB flush.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
+ goto out;
+
+ /*
+ * The kernel's initcall infrastructure lacks the ability to express
+ * dependencies between initcalls, whereas the modules infrastructure
+ * automatically handles dependencies via symbol loading. Ensure the
+ * PSP SEV driver is initialized before proceeding if KVM is built-in,
+ * as the dependency isn't handled by the initcall infrastructure.
+ */
+ if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
+ goto out;
+
+ /* Retrieve SEV CPUID information */
+ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
+
+ /* Set encryption bit location for SEV-ES guests */
+ sev_enc_bit = ebx & 0x3f;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = ecx;
+ if (!max_sev_asid)
+ goto out;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = edx;
+ sev_me_mask = 1UL << (ebx & 0x3f);
+
+ /*
+ * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
+ * even though it's never used, so that the bitmap is indexed by the
+ * actual ASID.
+ */
+ nr_asids = max_sev_asid + 1;
+ sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ goto out;
+
+ sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap) {
+ bitmap_free(sev_asid_bitmap);
+ sev_asid_bitmap = NULL;
+ goto out;
+ }
+
+ if (min_sev_asid <= max_sev_asid) {
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ }
+ sev_supported = true;
+
+ /* SEV-ES support requested? */
+ if (!sev_es_enabled)
+ goto out;
+
+ /*
+ * SEV-ES requires MMIO caching as KVM doesn't have access to the guest
+ * instruction stream, i.e. can't emulate in response to a #NPF and
+ * instead relies on #NPF(RSVD) being reflected into the guest as #VC
+ * (the guest can then do a #VMGEXIT to request MMIO emulation).
+ */
+ if (!enable_mmio_caching)
+ goto out;
+
+ /* Does the CPU support SEV-ES? */
+ if (!boot_cpu_has(X86_FEATURE_SEV_ES))
+ goto out;
+
+ if (!lbrv) {
+ WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
+ "LBRV must be present for SEV-ES support");
+ goto out;
+ }
+
+ /* Has the system been allocated ASIDs for SEV-ES? */
+ if (min_sev_asid == 1)
+ goto out;
+
+ min_sev_es_asid = min_snp_asid = 1;
+ max_sev_es_asid = max_snp_asid = min_sev_asid - 1;
+
+ sev_es_asid_count = min_sev_asid - 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
+ sev_es_supported = true;
+ sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
+
+out:
+ if (sev_enabled) {
+ init_args.probe = true;
+
+ if (sev_is_snp_ciphertext_hiding_supported())
+ init_args.max_snp_asid = min(nr_ciphertext_hiding_asids,
+ min_sev_asid - 1);
+
+ if (sev_platform_init(&init_args))
+ sev_supported = sev_es_supported = sev_snp_supported = false;
+ else if (sev_snp_supported)
+ sev_snp_supported = is_sev_snp_initialized();
+
+ if (sev_snp_supported) {
+ snp_supported_policy_bits = sev_get_snp_policy_bits() &
+ KVM_SNP_POLICY_MASK_VALID;
+ nr_ciphertext_hiding_asids = init_args.max_snp_asid;
+ }
+
+ /*
+ * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP
+ * ASID range is partitioned into separate SEV-ES and SEV-SNP
+ * ASID ranges, with the SEV-SNP range being [1..max_snp_asid]
+ * and the SEV-ES range being (max_snp_asid..max_sev_es_asid].
+ * Note, SEV-ES may effectively be disabled if all ASIDs from
+ * the joint range are assigned to SEV-SNP.
+ */
+ if (nr_ciphertext_hiding_asids) {
+ max_snp_asid = nr_ciphertext_hiding_asids;
+ min_sev_es_asid = max_snp_asid + 1;
+ pr_info("SEV-SNP ciphertext hiding enabled\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ pr_info("SEV %s (ASIDs %u - %u)\n",
+ sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
+ "unusable" :
+ "disabled",
+ min_sev_asid, max_sev_asid);
+ if (boot_cpu_has(X86_FEATURE_SEV_ES))
+ pr_info("SEV-ES %s (ASIDs %u - %u)\n",
+ sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" :
+ "unusable" :
+ "disabled",
+ min_sev_es_asid, max_sev_es_asid);
+ if (boot_cpu_has(X86_FEATURE_SEV_SNP))
+ pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
+ str_enabled_disabled(sev_snp_supported),
+ min_snp_asid, max_snp_asid);
+
+ sev_enabled = sev_supported;
+ sev_es_enabled = sev_es_supported;
+ sev_snp_enabled = sev_snp_supported;
+
+ if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
+ !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
+ sev_es_debug_swap_enabled = false;
+
+ sev_supported_vmsa_features = 0;
+ if (sev_es_debug_swap_enabled)
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+
+ if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC))
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC;
+}
+
+void sev_hardware_unsetup(void)
+{
+ if (!sev_enabled)
+ return;
+
+ /* No need to take sev_bitmap_lock, all VMs have been destroyed. */
+ sev_flush_asids(1, max_sev_asid);
+
+ bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+ misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
+
+ sev_platform_shutdown();
+}
+
+int sev_cpu_init(struct svm_cpu_data *sd)
+{
+ if (!sev_enabled)
+ return 0;
+
+ sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Pages used by hardware to hold guest encrypted state must be flushed before
+ * returning them to the system.
+ */
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
+{
+ unsigned int asid = sev_get_asid(vcpu->kvm);
+
+ /*
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
+ */
+ unsigned long addr = (unsigned long)va;
+
+ /*
+ * If the CPU enforces cache coherency for encrypted mappings of the
+ * same physical page, use CLFLUSHOPT instead. NOTE: a cache flush is
+ * still needed in order to work properly with DMA devices.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
+
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to full writeback of caches if this faults so as not to make
+ * any problems worse by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_sev_writeback_caches;
+
+ return;
+
+do_sev_writeback_caches:
+ sev_writeback_caches(vcpu->kvm);
+}
+
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ /*
+ * With SNP+gmem, private/encrypted memory is unreachable via the
+ * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+ * shared pages, where there's no need to flush caches.
+ */
+ if (!sev_guest(kvm) || sev_snp_guest(kvm))
+ return;
+
+ sev_writeback_caches(kvm);
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return;
+
+ svm = to_svm(vcpu);
+
+ /*
+ * If it's an SNP guest, then the VMSA was marked in the RMP table as
+ * a guest-owned page. Transition the page to hypervisor state before
+ * releasing it back to the system.
+ */
+ if (sev_snp_guest(vcpu->kvm)) {
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
+ goto skip_vmsa_free;
+ }
+
+ if (vcpu->arch.guest_state_protected)
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
+ __free_page(virt_to_page(svm->sev_es.vmsa));
+
+skip_vmsa_free:
+ if (svm->sev_es.ghcb_sa_free)
+ kvfree(svm->sev_es.ghcb_sa);
+}
+
+static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control)
+{
+ return (((u64)control->exit_code_hi) << 32) | control->exit_code;
+}
+
+static void dump_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ unsigned int nbits;
+
+ /* Re-use the dump_invalid_vmcb module parameter */
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
+
+ /*
+ * Print KVM's snapshot of the GHCB values that were (unsuccessfully)
+ * used to handle the exit. If the guest has since modified the GHCB
+ * itself, dumping the raw GHCB won't help debug why KVM was unable to
+ * handle the VMGEXIT that KVM observed.
+ */
+ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
+ kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
+ control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
+ control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
+ pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
+ svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
+ pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
+}
+
+static void sev_es_sync_to_ghcb(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be returned:
+ * GPRs RAX, RBX, RCX, RDX
+ *
+ * Copy their values, even if they may not have been written during the
+ * VM-Exit. It's the guest's responsibility to not consume random data.
+ */
+ ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
+ ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
+ ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
+ ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
+}
+
+static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct ghcb *ghcb = svm->sev_es.ghcb;
+ u64 exit_code;
+
+ /*
+ * The GHCB protocol so far allows for the following data
+ * to be supplied:
+ * GPRs RAX, RBX, RCX, RDX
+ * XCR0
+ * CPL
+ *
+ * VMMCALL allows the guest to provide extra registers. KVM also
+ * expects RSI for hypercalls, so include that, too.
+ *
+ * Copy their values to the appropriate location if supplied.
+ */
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+
+ BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap));
+ memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap));
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm);
+ vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm);
+
+ svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm);
+
+ if (kvm_ghcb_xcr0_is_valid(svm))
+ __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm));
+
+ if (kvm_ghcb_xss_is_valid(svm))
+ __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm));
+
+ /* Copy the GHCB exit information into the VMCB fields */
+ exit_code = kvm_ghcb_get_sw_exit_code(svm);
+ control->exit_code = lower_32_bits(exit_code);
+ control->exit_code_hi = upper_32_bits(exit_code);
+ control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm);
+ control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm);
+ svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm);
+
+ /* Clear the valid-bitmap entries */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 exit_code;
+ u64 reason;
+
+ /*
+ * Retrieve the exit code now even though it may not be marked valid
+ * as it could help with debugging.
+ */
+ exit_code = kvm_get_cached_sw_exit_code(control);
+
+ /* Only GHCB Usage code 0 is supported */
+ if (svm->sev_es.ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
+ if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
+ !kvm_ghcb_sw_exit_info_2_is_valid(svm))
+ goto vmgexit_err;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSC:
+ break;
+ case SVM_EXIT_RDPMC:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_CPUID:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd)
+ if (!kvm_ghcb_xcr0_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_INVD:
+ break;
+ case SVM_EXIT_IOIO:
+ if (control->exit_info_1 & SVM_IOIO_STR_MASK) {
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ } else {
+ if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK))
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_MSR:
+ if (!kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ if (control->exit_info_1) {
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ }
+ break;
+ case SVM_EXIT_VMMCALL:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_cpl_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_RDTSCP:
+ break;
+ case SVM_EXIT_WBINVD:
+ break;
+ case SVM_EXIT_MONITOR:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm) ||
+ !kvm_ghcb_rdx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_EXIT_MWAIT:
+ if (!kvm_ghcb_rax_is_valid(svm) ||
+ !kvm_ghcb_rcx_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_MMIO_READ:
+ case SVM_VMGEXIT_MMIO_WRITE:
+ if (!kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto vmgexit_err;
+ if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ case SVM_VMGEXIT_AP_JUMP_TABLE:
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ case SVM_VMGEXIT_HV_FEATURES:
+ case SVM_VMGEXIT_TERM_REQUEST:
+ break;
+ case SVM_VMGEXIT_PSC:
+ if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ if (!sev_snp_guest(vcpu->kvm) ||
+ !PAGE_ALIGNED(control->exit_info_1) ||
+ !PAGE_ALIGNED(control->exit_info_2) ||
+ control->exit_info_1 == control->exit_info_2)
+ goto vmgexit_err;
+ break;
+ default:
+ reason = GHCB_ERR_INVALID_EVENT;
+ goto vmgexit_err;
+ }
+
+ return 0;
+
+vmgexit_err:
+ if (reason == GHCB_ERR_INVALID_USAGE) {
+ vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
+ svm->sev_es.ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
+ } else {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
+ exit_code);
+ dump_ghcb(svm);
+ }
+
+ svm_vmgexit_bad_input(svm, reason);
+
+ /* Resume the guest to "return" the error code. */
+ return 1;
+}
+
+void sev_es_unmap_ghcb(struct vcpu_svm *svm)
+{
+ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
+
+ if (!svm->sev_es.ghcb)
+ return;
+
+ if (svm->sev_es.ghcb_sa_free) {
+ /*
+ * The scratch area lives outside the GHCB, so there is a
+ * buffer that, depending on the operation performed, may
+ * need to be synced, then freed.
+ */
+ if (svm->sev_es.ghcb_sa_sync) {
+ kvm_write_guest(svm->vcpu.kvm,
+ svm->sev_es.sw_scratch,
+ svm->sev_es.ghcb_sa,
+ svm->sev_es.ghcb_sa_len);
+ svm->sev_es.ghcb_sa_sync = false;
+ }
+
+ kvfree(svm->sev_es.ghcb_sa);
+ svm->sev_es.ghcb_sa = NULL;
+ svm->sev_es.ghcb_sa_free = false;
+ }
+
+ trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_to_ghcb(svm);
+
+ kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map);
+ svm->sev_es.ghcb = NULL;
+}
+
+int pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ struct kvm *kvm = svm->vcpu.kvm;
+ unsigned int asid = sev_get_asid(kvm);
+
+ /*
+ * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid
+ * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP
+ * AP Destroy event.
+ */
+ if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
+ return -EINVAL;
+
+ /*
+ * To optimize cache flushes when memory is reclaimed from an SEV VM,
+ * track physical CPUs that enter the guest for SEV VMs and thus can
+ * have encrypted, dirty data in the cache, and flush caches only for
+ * CPUs that have entered the guest.
+ */
+ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+ cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
+ /* Assign the asid allocated with this SEV guest */
+ svm->asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when a different VMCB for the same ASID is to be run on the same host CPU, or
+ * 2) when this VMCB was executed on a different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->vcpu.arch.last_vmentry_cpu == cpu)
+ return 0;
+
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ return 0;
+}
+
+#define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE)
+static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_scratch_beg, ghcb_scratch_end;
+ u64 scratch_gpa_beg, scratch_gpa_end;
+ void *scratch_va;
+
+ scratch_gpa_beg = svm->sev_es.sw_scratch;
+ if (!scratch_gpa_beg) {
+ pr_err("vmgexit: scratch gpa not provided\n");
+ goto e_scratch;
+ }
+
+ scratch_gpa_end = scratch_gpa_beg + len;
+ if (scratch_gpa_end < scratch_gpa_beg) {
+ pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
+ len, scratch_gpa_beg);
+ goto e_scratch;
+ }
+
+ if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
+ /* Scratch area begins within GHCB */
+ ghcb_scratch_beg = control->ghcb_gpa +
+ offsetof(struct ghcb, shared_buffer);
+ ghcb_scratch_end = control->ghcb_gpa +
+ offsetof(struct ghcb, reserved_0xff0);
+
+ /*
+ * If the scratch area begins within the GHCB, it must be
+ * completely contained in the GHCB shared buffer area.
+ */
+ if (scratch_gpa_beg < ghcb_scratch_beg ||
+ scratch_gpa_end > ghcb_scratch_end) {
+ pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
+ scratch_gpa_beg, scratch_gpa_end);
+ goto e_scratch;
+ }
+
+ scratch_va = (void *)svm->sev_es.ghcb;
+ scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
+ } else {
+ /*
+ * The guest memory must be read into a kernel buffer, so
+ * limit the size.
+ */
+ if (len > GHCB_SCRATCH_AREA_LIMIT) {
+ pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
+ len, GHCB_SCRATCH_AREA_LIMIT);
+ goto e_scratch;
+ }
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
+ if (!scratch_va)
+ return -ENOMEM;
+
+ if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
+ /* Unable to copy scratch area from guest */
+ pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
+
+ kvfree(scratch_va);
+ return -EFAULT;
+ }
+
+ /*
+ * The scratch area is outside the GHCB. The operation will
+ * dictate whether the buffer needs to be synced before running
+ * the vCPU next time (i.e. a read was requested so the data
+ * must be written back to the guest memory).
+ */
+ svm->sev_es.ghcb_sa_sync = sync;
+ svm->sev_es.ghcb_sa_free = true;
+ }
+
+ svm->sev_es.ghcb_sa = scratch_va;
+ svm->sev_es.ghcb_sa_len = len;
+
+ return 0;
+
+e_scratch:
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return 1;
+}
+
+static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
+ unsigned int pos)
+{
+ svm->vmcb->control.ghcb_gpa &= ~(mask << pos);
+ svm->vmcb->control.ghcb_gpa |= (value & mask) << pos;
+}
+
+static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos)
+{
+ return (svm->vmcb->control.ghcb_gpa >> pos) & mask;
+}
+
+static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
+{
+ svm->vmcb->control.ghcb_gpa = value;
+}
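+
+/*
+ * Worked example (illustrative, not part of this patch): per the GHCB MSR
+ * protocol, bits [11:0] of the GHCB MSR carry the request/response code
+ * and the upper bits carry operation-specific data. E.g. a CPUID response
+ * placing a hypothetical value 0x8000001f in the requested register would
+ * be composed with the helpers above as:
+ *
+ *	set_ghcb_msr_bits(svm, 0x8000001f, GHCB_MSR_CPUID_VALUE_MASK,
+ *			  GHCB_MSR_CPUID_VALUE_POS);
+ *	set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP, GHCB_MSR_INFO_MASK,
+ *			  GHCB_MSR_INFO_POS);
+ */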
+
+static int snp_rmptable_psmash(kvm_pfn_t pfn)
+{
+ int ret;
+
+ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+ /*
+ * PSMASH_FAIL_INUSE indicates another processor is modifying the
+ * entry, so retry until that's no longer the case.
+ */
+ do {
+ ret = psmash(pfn);
+ } while (ret == PSMASH_FAIL_INUSE);
+
+ return ret;
+}
+
+static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->run->hypercall.ret)
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ else
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
+
+ return 1; /* resume guest */
+}
+
+static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
+{
+ u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
+ u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = 1;
+ vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+
+ vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
+
+ return 0; /* forward request to userspace */
+}
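+
+/*
+ * Usage sketch (illustrative, not part of this patch): when the PSC
+ * request is forwarded, userspace sees a KVM_EXIT_HYPERCALL for
+ * KVM_HC_MAP_GPA_RANGE, flips the memory attributes accordingly, and
+ * resumes the vCPU with hypercall.ret indicating success/failure. Names
+ * below are hypothetical and error handling is elided.
+ *
+ *	struct kvm_run *run = ...;  // mmap'd vCPU run struct
+ *
+ *	if (run->exit_reason == KVM_EXIT_HYPERCALL &&
+ *	    run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) {
+ *		struct kvm_memory_attributes attrs = {
+ *			.address = run->hypercall.args[0],
+ *			.size = run->hypercall.args[1] * 4096,
+ *			.attributes = (run->hypercall.args[2] & KVM_MAP_GPA_RANGE_ENCRYPTED)
+ *				      ? KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
+ *		};
+ *
+ *		run->hypercall.ret = ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
+ *	}
+ */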
+
+struct psc_buffer {
+ struct psc_hdr hdr;
+ struct psc_entry entries[];
+} __packed;
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
+
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = false;
+
+ /*
+ * PSC requests always get a "no action" response in SW_EXITINFO1, with
+ * a PSC-specific return code in SW_EXITINFO2 that provides the "real"
+ * return code. E.g. if the PSC request was interrupted, the need to
+ * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
+ */
+ svm_vmgexit_no_action(svm, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
+{
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+ struct psc_entry *entries = psc->entries;
+ struct psc_hdr *hdr = &psc->hdr;
+ __u16 idx;
+
+ /*
+ * Everything in-flight has been processed successfully. Update the
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
+ */
+ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
+ svm->sev_es.psc_inflight--, idx++) {
+ struct psc_entry *entry = &entries[idx];
+
+ entry->cur_page = entry->pagesize ? 512 : 1;
+ }
+
+ hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
+}
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
+{
+ struct psc_entry *entries = psc->entries;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct psc_hdr *hdr = &psc->hdr;
+ struct psc_entry entry_start;
+ u16 idx, idx_start, idx_end;
+ int npages;
+ bool huge;
+ u64 gfn;
+
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+ /*
+ * The PSC descriptor buffer can be modified by a misbehaving guest after
+ * validation, so take care to only use validated copies of values used
+ * for things like array indexing.
+ */
+ idx_start = hdr->cur_entry;
+ idx_end = hdr->end_entry;
+
+ if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
+ }
+
+ /* Find the start of the next range which needs processing. */
+ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
+ entry_start = entries[idx];
+
+ gfn = entry_start.gfn;
+ huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
+
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
+ }
+
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K handling
+ * for the remaining pages since they're effectively split at
+ * this point. Subsequent code should ensure this doesn't get
+ * combined with adjacent PSC entries where 2M handling is still
+ * possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+
+ if (npages)
+ break;
+ }
+
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
+ /*
+ * Find all subsequent PSC entries that contain adjacent GPA
+ * ranges/operations and can be combined into a single
+ * KVM_HC_MAP_GPA_RANGE exit.
+ */
+ while (++idx <= idx_end) {
+ struct psc_entry entry = entries[idx];
+
+ if (entry.operation != entry_start.operation ||
+ entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page || !!entry.pagesize != huge)
+ break;
+
+ svm->sev_es.psc_inflight++;
+ npages += huge ? 512 : 1;
+ }
+
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
+ * vcpu->run->hypercall.ret, explicitly set it to zero to not break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= entry_start.pagesize
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
+ return 0; /* forward request to userspace */
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
+
+ BUG();
+}
+
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_memory_slot *slot;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ guard(mutex)(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ return;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
+
+ /* Mark the vCPU as offline and not runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
+
+ /* Clear use of the VMSA */
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+
+ /*
+ * When replacing the VMSA during SEV-SNP AP creation,
+ * mark the VMCB dirty so that full state is always reloaded.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
+ return;
+
+ gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ return;
+
+ /*
+ * The new VMSA will be private guest memory, so retrieve the
+ * PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
+ return;
+
+ /*
+ * From this point forward, the VMSA will always be a guest-mapped page
+ * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
+ * theory, svm->sev_es.vmsa could be freed and cleaned up here, but
+ * that involves cleanups like flushing caches, which would ideally be
+ * handled during teardown rather than guest boot. Deferring that also
+ * allows the existing logic for SEV-ES VMSAs to be re-used with
+ * minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
+
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
+
+ /* Mark the vCPU as runnable */
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
+
+ /*
+ * gmem pages aren't currently migratable, but if this ever changes
+ * then care should be taken to ensure svm->sev_es.vmsa is pinned
+ * through some other means.
+ */
+ kvm_release_page_clean(page);
+}
+
+static int sev_snp_ap_creation(struct vcpu_svm *svm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_vcpu *target_vcpu;
+ struct vcpu_svm *target_svm;
+ unsigned int request;
+ unsigned int apic_id;
+
+ request = lower_32_bits(svm->vmcb->control.exit_info_1);
+ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
+
+ /* Validate the APIC ID */
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
+ if (!target_vcpu) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
+ apic_id);
+ return -EINVAL;
+ }
+
+ target_svm = to_svm(target_vcpu);
+
+ guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
+
+ switch (request) {
+ case SVM_VMGEXIT_AP_CREATE_ON_INIT:
+ case SVM_VMGEXIT_AP_CREATE:
+ if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
+ vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features);
+ return -EINVAL;
+ }
+
+ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
+ svm->vmcb->control.exit_info_2);
+ return -EINVAL;
+ }
+
+ /*
+ * A malicious guest can RMPADJUST a large page into a VMSA, which
+ * will hit the SNP erratum where the CPU incorrectly signals an
+ * RMP violation #PF if a hugepage collides with the RMP entry of
+ * the VMSA page. Reject the AP CREATE request if the VMSA address
+ * from the guest is 2M-aligned.
+ */
+ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
+ vcpu_unimpl(vcpu,
+ "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
+ svm->vmcb->control.exit_info_2);
+ return -EINVAL;
+ }
+
+ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
+ break;
+ case SVM_VMGEXIT_AP_DESTROY:
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
+ request);
+ return -EINVAL;
+ }
+
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /*
+ * Unless creation is deferred until INIT, signal the vCPU to update
+ * its state.
+ */
+ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
+ kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
+
+ return 0;
+}
+
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct sev_data_snp_guest_request data = {0};
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ sev_ret_code fw_err = 0;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ mutex_lock(&sev->guest_req_mutex);
+
+ if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.req_paddr = __psp_pa(sev->guest_req_buf);
+ data.res_paddr = __psp_pa(sev->guest_resp_buf);
+
+ /*
+ * Firmware failures are propagated on to the guest, but any other
+ * failure condition along the way should be reported to userspace,
+ * e.g. if the PSP is dead and commands are timing out.
+ */
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+ if (ret && !fw_err)
+ goto out_unlock;
+
+ if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ /* No action is requested *from KVM* if there was a firmware error. */
+ svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
+
+ ret = 1; /* resume guest */
+
+out_unlock:
+ mutex_unlock(&sev->guest_req_mutex);
+ return ret;
+}
+
+static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+ u8 msg_type;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
+ &msg_type, 1))
+ return -EIO;
+
+ /*
+ * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
+ * additional certificate data to be provided alongside the attestation
+ * report via the guest-provided data pages indicated by RAX/RBX. The
+ * certificate data is optional and requires additional KVM enablement
+ * to provide an interface for userspace to provide it, but KVM still
+ * needs to be able to handle extended guest requests either way. So
+ * provide a stub implementation that will always return an empty
+ * certificate table in the guest-provided data pages.
+ */
+ if (msg_type == SNP_MSG_REPORT_REQ) {
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 data_npages;
+ gpa_t data_gpa;
+
+ if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
+ goto request_invalid;
+
+ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
+ data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
+
+ if (!PAGE_ALIGNED(data_gpa))
+ goto request_invalid;
+
+ /*
+ * As per GHCB spec (see "SNP Extended Guest Request"), the
+ * certificate table is terminated by 24-bytes of zeroes.
+ */
+ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
+ return -EIO;
+ }
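+
+	/*
+	 * For reference, the 24 bytes cleared above amount to one zeroed
+	 * certificate table entry, which the GHCB spec lays out roughly as
+	 * (illustrative):
+	 *
+	 *	struct cert_table_entry {
+	 *		u8  guid[16];	(certificate type)
+	 *		u32 offset;	(offset into the data pages)
+	 *		u32 length;	(certificate length)
+	 *	};
+	 */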
+
+ return snp_handle_guest_req(svm, req_gpa, resp_gpa);
+
+request_invalid:
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
+ return 1; /* resume guest */
+}
+
+static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+ u64 ghcb_info;
+ int ret = 1;
+
+ ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;
+
+ trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
+ control->ghcb_gpa);
+
+ switch (ghcb_info) {
+ case GHCB_MSR_SEV_INFO_REQ:
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+ break;
+ case GHCB_MSR_CPUID_REQ: {
+ u64 cpuid_fn, cpuid_reg, cpuid_value;
+
+ cpuid_fn = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_FUNC_MASK,
+ GHCB_MSR_CPUID_FUNC_POS);
+
+ /* Initialize the registers needed by the CPUID intercept */
+ vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
+ vcpu->arch.regs[VCPU_REGS_RCX] = 0;
+
+ ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
+ if (!ret) {
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ cpuid_reg = get_ghcb_msr_bits(svm,
+ GHCB_MSR_CPUID_REG_MASK,
+ GHCB_MSR_CPUID_REG_POS);
+ if (cpuid_reg == 0)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
+ else if (cpuid_reg == 1)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
+ else if (cpuid_reg == 2)
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
+ else
+ cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];
+
+ set_ghcb_msr_bits(svm, cpuid_value,
+ GHCB_MSR_CPUID_VALUE_MASK,
+ GHCB_MSR_CPUID_VALUE_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
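+	/*
+	 * E.g. for the CPUID exchange above (illustrative, encoding per the
+	 * GHCB MSR protocol): to read CPUID 0x8000001F EAX, the guest writes
+	 * the GHCB MSR with (0x8000001fULL << 32) | (0 << 30) |
+	 * GHCB_MSR_CPUID_REQ, issues a VMGEXIT, and reads back
+	 * (EAX value << 32) | GHCB_MSR_CPUID_RESP.
+	 */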
+ case GHCB_MSR_AP_RESET_HOLD_REQ:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+
+ /*
+ * Preset the result to a non-SIPI return and then only set
+ * the result to non-zero when delivering a SIPI.
+ */
+ set_ghcb_msr_bits(svm, 0,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_HV_FT_REQ:
+ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
+ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
+ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_PREF_GPA_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_REG_GPA_REQ: {
+ u64 gfn;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+
+ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
+
+ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_PSC_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
+ break;
+ case GHCB_MSR_TERM_REQ: {
+ u64 reason_set, reason_code;
+
+ reason_set = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_SET_MASK,
+ GHCB_MSR_TERM_REASON_SET_POS);
+ reason_code = get_ghcb_msr_bits(svm,
+ GHCB_MSR_TERM_REASON_MASK,
+ GHCB_MSR_TERM_REASON_POS);
+ pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
+ reason_set, reason_code);
+
+ goto out_terminate;
+ }
+ default:
+ /* Error, keep GHCB MSR value as-is */
+ break;
+ }
+
+ trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
+ control->ghcb_gpa, ret);
+
+ return ret;
+
+out_terminate:
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
+}
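+
+/*
+ * Note on the encoding used by the MSR protocol handler above: the low 12
+ * bits of the GHCB MSR carry GHCBInfo (the request/response code covered by
+ * GHCB_MSR_INFO_MASK), and the remaining upper bits carry GHCBData, which is
+ * extracted from and inserted into the MSR via the get_ghcb_msr_bits() and
+ * set_ghcb_msr_bits() helpers.
+ */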
+
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ u64 ghcb_gpa, exit_code;
+ int ret;
+
+ /* Validate the GHCB */
+ ghcb_gpa = control->ghcb_gpa;
+ if (ghcb_gpa & GHCB_MSR_INFO_MASK)
+ return sev_handle_vmgexit_msr_protocol(svm);
+
+ if (!ghcb_gpa) {
+ vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
+ /* Unable to map GHCB from guest */
+ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
+ ghcb_gpa);
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
+ }
+
+ svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
+
+ trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
+
+ sev_es_sync_from_ghcb(svm);
+
+	/* An SEV-SNP guest requires that the GHCB GPA be registered */
+ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
+ return -EINVAL;
+ }
+
+ ret = sev_es_validate_vmgexit(svm);
+ if (ret)
+ return ret;
+
+ svm_vmgexit_success(svm, 0);
+
+ exit_code = kvm_get_cached_sw_exit_code(control);
+ switch (exit_code) {
+ case SVM_VMGEXIT_MMIO_READ:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_read(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_MMIO_WRITE:
+ ret = setup_vmgexit_scratch(svm, false, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = kvm_sev_es_mmio_write(vcpu,
+ control->exit_info_1,
+ control->exit_info_2,
+ svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_NMI_COMPLETE:
+ ++vcpu->stat.nmi_window_exits;
+ svm->nmi_masked = false;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_AP_HLT_LOOP:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
+ ret = kvm_emulate_ap_reset_hold(vcpu);
+ break;
+ case SVM_VMGEXIT_AP_JUMP_TABLE: {
+ struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
+
+ switch (control->exit_info_1) {
+ case 0:
+ /* Set AP jump table address */
+ sev->ap_jump_table = control->exit_info_2;
+ break;
+ case 1:
+ /* Get AP jump table address */
+ svm_vmgexit_success(svm, sev->ap_jump_table);
+ break;
+ default:
+ pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
+ control->exit_info_1);
+ svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ }
+ case SVM_VMGEXIT_HV_FEATURES:
+ svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_TERM_REQUEST:
+ pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+ break;
+ case SVM_VMGEXIT_PSC:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ ret = sev_snp_ap_creation(svm);
+		if (ret)
+			svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ vcpu_unimpl(vcpu,
+ "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
+ break;
+ default:
+ ret = svm_invoke_exit_handler(vcpu, exit_code);
+ }
+
+ return ret;
+}
+
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
+{
+ int count;
+ int bytes;
+ int r;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
+ return -EINVAL;
+
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ r = setup_vmgexit_scratch(svm, in, bytes);
+ if (r)
+ return r;
+
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
+ count, in);
+}
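+
+/*
+ * Worked example for the arithmetic above (illustrative): a "rep outsb" with
+ * a rep count of 512 arrives with exit_info_2 = 512 and size = 1, so 512
+ * bytes of scratch area are mapped and the entire string operation is then
+ * emulated in one shot via kvm_sev_es_string_io().
+ */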
+
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ /* Clear intercepts on MSRs that are context switched by hardware. */
+ svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
+ svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
+
+ svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R,
+ !snp_is_secure_tsc_enabled(vcpu->kvm));
+
+ /*
+ * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
+ * the host/guest supports its use.
+ *
+ * KVM treats the guest as being capable of using XSAVES even if XSAVES
+ * isn't enabled in guest CPUID as there is no intercept for XSAVES,
+ * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
+ * exposed to the guest and XSAVES is supported in hardware. Condition
+ * full XSS passthrough on the guest being able to use XSAVES *and*
+ * XSAVES being exposed to the guest so that KVM can at least honor
+ * guest CPUID for RDMSR and WRMSR.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
+}
+
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_cpuid_entry2 *best;
+
+	/* For SEV guests, the memory encryption bit is not reserved in CR3. */
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
+ if (best)
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
+}
+
+static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
+
+ /*
+	 * An SEV-ES guest requires a VMSA area that is separate from the
+ * VMCB page. Do not include the encryption mask on the VMSA physical
+ * address since hardware will access it using the guest key. Note,
+ * the VMSA will be NULL if this vCPU is the destination for intrahost
+ * migration, and will be copied later.
+ */
+ if (!svm->sev_es.snp_has_guest_vmsa) {
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ else
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
+ svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
+ VMCB_ALLOWED_SEV_FEATURES_VALID;
+
+ /* Can't intercept CR register access, HV can't modify CR registers */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR4_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR8_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+
+ /* Track EFER/CR register changes */
+ svm_set_intercept(svm, TRAP_EFER_WRITE);
+ svm_set_intercept(svm, TRAP_CR0_WRITE);
+ svm_set_intercept(svm, TRAP_CR4_WRITE);
+ svm_set_intercept(svm, TRAP_CR8_WRITE);
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ if (!sev_vcpu_has_debug_swap(svm)) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ recalc_intercepts(svm);
+ } else {
+ /*
+ * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
+ * allow debugging SEV-ES guests, and enables DebugSwap iff
+ * NO_NESTED_DATA_BP is supported, so there's no reason to
+ * intercept #DB when DebugSwap is enabled. For simplicity
+ * with respect to guest debug, intercept #DB for other VMs
+ * even if NO_NESTED_DATA_BP is supported, i.e. even if the
+ * guest can't DoS the CPU with infinite #DB vectoring.
+ */
+ clr_exception_intercept(svm, DB_VECTOR);
+ }
+
+ /* Can't intercept XSETBV, HV can't modify XCR0 directly */
+ svm_clr_intercept(svm, INTERCEPT_XSETBV);
+
+ /*
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
+ */
+ if (!init_event)
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
+ GHCB_VERSION_MIN,
+ sev_enc_bit));
+}
+
+void sev_init_vmcb(struct vcpu_svm *svm, bool init_event)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+
+ /*
+ * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
+ * KVM can't decrypt guest memory to decode the faulting instruction.
+ */
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ if (init_event && sev_snp_guest(vcpu->kvm))
+ sev_snp_init_protected_guest_state(vcpu);
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_init_vmcb(svm, init_event);
+}
+
+int sev_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct page *vmsa_page;
+
+ mutex_init(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!sev_es_guest(vcpu->kvm))
+ return 0;
+
+ /*
+ * SEV-ES guests require a separate (from the VMCB) VMSA page used to
+ * contain the encrypted register state of the guest.
+ */
+ vmsa_page = snp_safe_alloc_page();
+ if (!vmsa_page)
+ return -ENOMEM;
+
+ svm->sev_es.vmsa = page_address(vmsa_page);
+
+ vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm);
+
+ return 0;
+}
+
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+
+ /*
+ * All host state for SEV-ES guests is categorized into three swap types
+ * based on how it is handled by hardware during a world switch:
+ *
+ * A: VMRUN: Host state saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * B: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * C: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state initialized to default(reset) values
+ *
+ * Manually save type-B state, i.e. state that is loaded by VMEXIT but
+ * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
+ * by common SVM code).
+ */
+ hostsa->xcr0 = kvm_host.xcr0;
+ hostsa->pkru = read_pkru();
+ hostsa->xss = kvm_host.xss;
+
+ /*
+ * If DebugSwap is enabled, debug registers are loaded but NOT saved by
+ * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+ * not save or load debug registers. Sadly, KVM can't prevent SNP
+ * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+ * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+ * the guest has actually enabled (or not!) in the VMSA.
+ *
+ * If DebugSwap is *possible*, save the masks so that they're restored
+ * if the guest enables DebugSwap. But for the DRs themselves, do NOT
+ * rely on the CPU to restore the host values; KVM will restore them as
+ * needed in common code, via hw_breakpoint_restore(). Note, KVM does
+ * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+ * don't need to be restored per se, KVM just needs to ensure they are
+ * loaded with the correct values *if* the CPU writes the MSRs.
+ */
+ if (sev_vcpu_has_debug_swap(svm) ||
+ (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
+ hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
+ hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
+ hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+ hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
+ }
+
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the feature is
+ * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area.
+ * Set the save area to the current hardware value, i.e. the current
+ * user return value, so that the correct value is restored on #VMEXIT.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) &&
+ !WARN_ON_ONCE(tsc_aux_uret_slot < 0))
+ hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot);
+}
+
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* First SIPI: Use the values as initially set by the VMM */
+ if (!svm->sev_es.received_first_sipi) {
+ svm->sev_es.received_first_sipi = true;
+ return;
+ }
+
+ /* Subsequent SIPI */
+ switch (svm->sev_es.ap_reset_hold_type) {
+ case AP_RESET_HOLD_NAE_EVENT:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+ */
+ svm_vmgexit_success(svm, 1);
+ break;
+ case AP_RESET_HOLD_MSR_PROTO:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set GHCB data field to a non-zero value.
+ */
+ set_ghcb_msr_bits(svm, 1,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ default:
+ break;
+ }
+}
+
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
+{
+ unsigned long pfn;
+ struct page *p;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+
+ /*
+	 * Allocate an SNP-safe page to work around the SNP erratum where
+ * the CPU will incorrectly signal an RMP violation #PF if a
+ * hugepage (2MB or 1GB) collides with the RMP entry of a
+ * 2MB-aligned VMCB, VMSA, or AVIC backing page.
+ *
+ * Allocate one extra page, choose a page which is not
+ * 2MB-aligned, and free the other.
+ */
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
+ if (!p)
+ return NULL;
+
+ split_page(p, 1);
+
+ pfn = page_to_pfn(p);
+ if (IS_ALIGNED(pfn, PTRS_PER_PMD))
+ __free_page(p++);
+ else
+ __free_page(p + 1);
+
+ return p;
+}
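+
+/*
+ * Worked example for the above (illustrative PFNs): an order-1 allocation
+ * returns consecutive PFNs {N, N + 1}, at most one of which can be
+ * 2MB-aligned (N % 512 == 0).  If N is aligned, N is freed and N + 1 is
+ * returned; otherwise N + 1 is freed and N is returned.  Either way the
+ * caller receives a page that is not 2MB-aligned and so can't collide with
+ * the RMP entry of a hugepage.
+ */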
+
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = vcpu->kvm;
+ int order, rmp_level, ret;
+ struct page *page;
+ bool assigned;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ gfn = gpa >> PAGE_SHIFT;
+
+ /*
+ * The only time RMP faults occur for shared pages is when the guest is
+ * triggering an RMP fault for an implicit page-state change from
+ * shared->private. Implicit page-state changes are forwarded to
+ * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
+ * for shared pages should not end up here.
+ */
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!kvm_slot_has_gmem(slot)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret || !assigned) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
+ gpa, pfn, ret);
+ goto out_no_trace;
+ }
+
+ /*
+ * There are 2 cases where a PSMASH may be needed to resolve an #NPF
+ * with PFERR_GUEST_RMP_BIT set:
+ *
+ * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
+ * bit set if the guest issues them with a smaller granularity than
+ * what is indicated by the page-size bit in the 2MB RMP entry for
+ * the PFN that backs the GPA.
+ *
+ * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
+ * smaller than what is indicated by the 2MB RMP entry for the PFN
+ * that backs the GPA.
+ *
+ * In both these cases, the corresponding 2M RMP entry needs to
+ * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
+ * split into 4K RMP entries, then this is likely a spurious case which
+ * can occur when there are concurrent accesses by the guest to a 2MB
+	 * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
+	 * the process of being PSMASH'd into 4K entries. These cases should
+ * resolve automatically on subsequent accesses, so just ignore them
+ * here.
+ */
+ if (rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ ret = snp_rmptable_psmash(pfn);
+ if (ret) {
+ /*
+ * Look it up again. If it's 4K now then the PSMASH may have
+ * raced with another process and the issue has already resolved
+ * itself.
+ */
+ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
+ assigned && rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
+ gpa, pfn, ret);
+ }
+
+ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
+out:
+ trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
+out_no_trace:
+ kvm_release_page_unused(page);
+}
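+
+/*
+ * Illustrative walk-through of the handler above: a guest PVALIDATEs one 4K
+ * sub-page of a GPA range whose backing PFN has a 2M RMP entry.  The
+ * resulting size-mismatch #NPF is resolved by PSMASH'ing the single 2M RMP
+ * entry into 512 4K entries and zapping the NPT range so that it is rebuilt
+ * at 4K granularity.
+ */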
+
+static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn = start;
+
+ while (pfn < end) {
+ int ret, rmp_level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
+ pfn, start, end, rmp_level, ret);
+ return false;
+ }
+
+ if (assigned) {
+ pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
+ __func__, pfn, start, end, rmp_level);
+ return false;
+ }
+
+ pfn++;
+ }
+
+ return true;
+}
+
+static u8 max_level_for_order(int order)
+{
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
+{
+ kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+
+ /*
+ * If this is a large folio, and the entire 2M range containing the
+ * PFN is currently shared, then the entire 2M-aligned range can be
+ * set to private via a single 2M RMP entry.
+ */
+ if (max_level_for_order(order) > PG_LEVEL_4K &&
+ is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
+ return true;
+
+ return false;
+}
+
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ kvm_pfn_t pfn_aligned;
+ gfn_t gfn_aligned;
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
+ gfn, pfn, rc);
+ return -ENOENT;
+ }
+
+ if (assigned) {
+ pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
+ __func__, gfn, pfn, max_order, level);
+ return 0;
+ }
+
+ if (is_large_rmp_possible(kvm, pfn, max_order)) {
+ level = PG_LEVEL_2M;
+ pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
+ } else {
+ level = PG_LEVEL_4K;
+ pfn_aligned = pfn;
+ gfn_aligned = gfn;
+ }
+
+ rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
+ gfn, pfn, level, rc);
+ return -EINVAL;
+ }
+
+ pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
+ __func__, gfn, pfn, pfn_aligned, max_order, level);
+
+ return 0;
+}
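+
+/*
+ * E.g. (illustrative): preparing a page from a 2MB gmem folio whose entire
+ * 2MB-aligned PFN range is still shared results in a single 2M RMP entry;
+ * if any PFN in that range is already assigned, only the faulting PFN gets
+ * a 4K RMP entry.
+ */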
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+ for (pfn = start; pfn < end;) {
+ bool use_2m_update = false;
+ int rc, rmp_level;
+ bool assigned;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (rc || !assigned)
+ goto next_pfn;
+
+ use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+ end >= (pfn + PTRS_PER_PMD) &&
+ rmp_level > PG_LEVEL_4K;
+
+ /*
+ * If an unaligned PFN corresponds to a 2M region assigned as a
+ * large page in the RMP table, PSMASH the region into individual
+ * 4K RMP entries before attempting to convert a 4K sub-page.
+ */
+ if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+ /*
+			 * This shouldn't fail, but if it does, report it, then
+			 * still try to update the RMP entry to shared and pray
+			 * this was a spurious error that can be addressed later.
+ */
+ rc = snp_rmptable_psmash(pfn);
+ WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc);
+ }
+
+ rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+ if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc))
+ goto next_pfn;
+
+ /*
+ * SEV-ES avoids host/guest cache coherency issues through
+ * WBNOINVD hooks issued via MMU notifiers during run-time, and
+ * KVM's VM destroy path at shutdown. Those MMU notifier events
+ * don't cover gmem since there is no requirement to map pages
+ * to a HVA in order to use them for a running guest. While the
+ * shutdown path would still likely cover things for SNP guests,
+ * userspace may also free gmem pages during run-time via
+ * hole-punching operations on the guest_memfd, so flush the
+	 * cache entries for these pages before freeing them back to
+ * the host.
+ */
+ clflush_cache_range(__va(pfn_to_hpa(pfn)),
+ use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+ pfn += use_2m_update ? PTRS_PER_PMD : 1;
+ cond_resched();
+ }
+}
+
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
+{
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc || !assigned)
+ return PG_LEVEL_4K;
+
+ return level;
+}
+
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area *vmsa;
+ struct kvm_sev_info *sev;
+ int error = 0;
+ int ret;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return NULL;
+
+ /*
+ * If the VMSA has not yet been encrypted, return a pointer to the
+ * current un-encrypted VMSA.
+ */
+ if (!vcpu->arch.guest_state_protected)
+ return (struct vmcb_save_area *)svm->sev_es.vmsa;
+
+ sev = to_kvm_sev_info(vcpu->kvm);
+
+ /* Check if the SEV policy allows debugging */
+ if (sev_snp_guest(vcpu->kvm)) {
+ if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
+ return NULL;
+ } else {
+ if (sev->policy & SEV_POLICY_MASK_NODBG)
+ return NULL;
+ }
+
+ if (sev_snp_guest(vcpu->kvm)) {
+ struct sev_data_snp_dbg dbg = {0};
+
+ vmsa = snp_alloc_firmware_page(__GFP_ZERO);
+ if (!vmsa)
+ return NULL;
+
+ dbg.gctx_paddr = __psp_pa(sev->snp_context);
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+
+ ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
+
+		/*
+		 * Transition the target page back to a hypervisor-owned page
+		 * no matter what.  If that fails, the page can't be used, so
+		 * leak it and don't try to use it.
+		 */
+ if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
+ return NULL;
+
+ if (ret) {
+ pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ free_page((unsigned long)vmsa);
+
+ return NULL;
+ }
+ } else {
+ struct sev_data_dbg dbg = {0};
+ struct page *vmsa_page;
+
+ vmsa_page = alloc_page(GFP_KERNEL);
+ if (!vmsa_page)
+ return NULL;
+
+ vmsa = page_address(vmsa_page);
+
+ dbg.handle = sev->handle;
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+ dbg.len = PAGE_SIZE;
+
+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
+ if (ret) {
+ pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
+ ret, error, error);
+ __free_page(vmsa_page);
+
+ return NULL;
+ }
+ }
+
+ return vmsa;
+}
+
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
+{
+ /* If the VMSA has not yet been encrypted, nothing was allocated */
+ if (!vcpu->arch.guest_state_protected || !vmsa)
+ return;
+
+ free_page((unsigned long)vmsa);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
new file mode 100644
index 000000000000..f56c2d895011
--- /dev/null
+++ b/arch/x86/kvm/svm/svm.c
@@ -0,0 +1,5510 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "pmu.h"
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/amd-iommu.h>
+#include <linux/sched.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/hashtable.h>
+#include <linux/objtool.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
+#include <linux/smp.h>
+#include <linux/string_choices.h>
+#include <linux/mutex.h>
+
+#include <asm/apic.h>
+#include <asm/msr.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
+#include <asm/spec-ctrl.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/reboot.h>
+#include <asm/fpu/api.h>
+
+#include <trace/events/ipi.h>
+
+#include "trace.h"
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
+MODULE_LICENSE("GPL");
+
+#ifdef MODULE
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+#endif
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+static bool erratum_383_found __read_mostly;
+
+/*
+ * Set osvw_len to a higher value when updated Revision Guides
+ * are published and we know what the new status bits are.
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+
+/*
+ * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering (indicated
+ *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ *	count value. On VMRUN this value is loaded into an internal counter.
+ *	Each time a pause instruction is executed, this counter is decremented
+ *	until it reaches zero, at which time a #VMEXIT is generated if pause
+ *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ *	Intercept Filtering for more details.
+ *	This also indicates whether PLE logic is enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
+ *	upper bound on the amount of time a guest is allowed to execute in a
+ *	pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
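+
+/*
+ * E.g. with the module defaults below (illustrative numbers): the window
+ * starts at pause_filter_count (3000 at the time of writing); each PAUSE
+ * #VMEXIT grows it by pause_filter_count_grow (a factor of 2), capped at
+ * pause_filter_count_max, while a shrink factor of 0 means the window is
+ * reset back to pause_filter_count rather than scaled down.
+ */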
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
+
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
+module_param(nested, int, 0444);
+
+/* enable/disable Next RIP Save */
+int nrips = true;
+module_param(nrips, int, 0444);
+
+/* enable/disable Virtual VMLOAD VMSAVE */
+static int vls = true;
+module_param(vls, int, 0444);
+
+/* enable/disable Virtual GIF */
+int vgif = true;
+module_param(vgif, int, 0444);
+
+/* enable/disable LBR virtualization */
+int lbrv = true;
+module_param(lbrv, int, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
+
+bool __read_mostly dump_invalid_vmcb;
+module_param(dump_invalid_vmcb, bool, 0644);
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
+
+static bool svm_gp_erratum_intercept = true;
+
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
+static unsigned long iopm_base;
+
+DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+
+static DEFINE_MUTEX(vmcb_dump_mutex);
+
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+int tsc_aux_uret_slot __ro_after_init = -1;
+
+static int get_npt_level(void)
+{
+#ifdef CONFIG_X86_64
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 old_efer = vcpu->arch.efer;
+ vcpu->arch.efer = efer;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
+
+ if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
+ if (!(efer & EFER_SVME)) {
+ svm_leave_nested(vcpu);
+ svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
+
+ /*
+ * Free the nested guest state, unless we are in SMM.
+ * In this case we will return to the nested guest
+ * as soon as we leave SMM.
+ */
+ if (!is_smm(vcpu))
+ svm_free_nested(svm);
+
+ } else {
+ int ret = svm_allocate_nested(svm);
+
+ if (ret) {
+ vcpu->arch.efer = old_efer;
+ return ret;
+ }
+
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+			 * decrypt guest memory to work around the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
+ set_exception_intercept(svm, GP_VECTOR);
+ }
+ }
+
+ svm->vmcb->save.efer = efer | EFER_SVME;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ return 0;
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+}
+
+static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
+ int emul_type,
+ bool commit_side_effects)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_rflags;
+
+ /*
+ * SEV-ES does not expose the next RIP. The RIP update is controlled by
+ * the type of exit and the #VC handler in the guest.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ goto done;
+
+ if (nrips && svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
+ if (!svm->next_rip) {
+ if (unlikely(!commit_side_effects))
+ old_rflags = svm->vmcb->save.rflags;
+
+ if (!kvm_emulate_instruction(vcpu, emul_type))
+ return 0;
+
+ if (unlikely(!commit_side_effects))
+ svm->vmcb->save.rflags = old_rflags;
+ } else {
+ kvm_rip_write(vcpu, svm->next_rip);
+ }
+
+done:
+ if (likely(commit_side_effects))
+ svm_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
+}
+
+static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
+{
+ const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
+ EMULTYPE_SET_SOFT_INT_VECTOR(vector);
+ unsigned long rip, old_rip = kvm_rip_read(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Due to architectural shortcomings, the CPU doesn't always provide
+ * NextRIP, e.g. if KVM intercepted an exception that occurred while
+ * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
+ * the instruction even if NextRIP is supported to acquire the next
+ * RIP so that it can be shoved into the NextRIP field, otherwise
+ * hardware will fail to advance guest RIP during event injection.
+ * Drop the exception/interrupt if emulation fails and effectively
+ * retry the instruction, it's the least awful option. If NRIPS is
+ * in use, the skip must not commit any side effects such as clearing
+ * the interrupt shadow or RFLAGS.RF.
+ */
+ if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
+ return -EIO;
+
+ rip = kvm_rip_read(vcpu);
+
+ /*
+ * Save the injection information, even when using next_rip, as the
+ * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
+ * doesn't complete due to a VM-Exit occurring while the CPU is
+ * vectoring the event. Decoding the instruction isn't guaranteed to
+ * work as there may be no backing instruction, e.g. if the event is
+ * being injected by L1 for L2, or if the guest is patching INT3 into
+ * a different instruction.
+ */
+ svm->soft_int_injected = true;
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = old_rip;
+ svm->soft_int_next_rip = rip;
+
+ if (nrips)
+ kvm_rip_write(vcpu, old_rip);
+
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ svm->vmcb->control.next_rip = rip;
+
+ return 0;
+}
+
+static void svm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (kvm_exception_is_soft(ex->vector) &&
+ svm_update_soft_interrupt_rip(vcpu, ex->vector))
+ return;
+
+ svm->vmcb->control.event_inj = ex->vector
+ | SVM_EVTINJ_VALID
+ | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = ex->error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+ u64 val;
+
+ if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val))
+ return;
+
+ val |= (1ULL << 47);
+
+ native_write_msr_safe(MSR_AMD64_DC_CFG, val);
+
+ erratum_383_found = true;
+}
+
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
+static bool __kvm_is_svm_supported(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON) {
+ pr_err("CPU %d isn't AMD or Hygon\n", cpu);
+ return false;
+ }
+
+ if (!cpu_has(c, X86_FEATURE_SVM)) {
+ pr_err("SVM not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_svm_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_svm_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!__kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
+}
+
+static void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ return;
+
+ wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+}
+
+static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
+{
+ return &sd->save_area->host_sev_es_save;
+}
+
+static inline void kvm_cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrq(MSR_VM_HSAVE_PA, 0);
+ rdmsrq(MSR_EFER, efer);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+ * NMI aren't blocked.
+ */
+ stgi();
+ wrmsrq(MSR_EFER, efer & ~EFER_SVME);
+ }
+}
+
+static void svm_emergency_disable_virtualization_cpu(void)
+{
+ kvm_rebooting = true;
+
+ kvm_cpu_svm_disable();
+}
+
+static void svm_disable_virtualization_cpu(void)
+{
+ /* Make sure we clean up behind us */
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+
+ kvm_cpu_svm_disable();
+
+ amd_pmu_disable_virt();
+}
+
+static int svm_enable_virtualization_cpu(void)
+{
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ int me = raw_smp_processor_id();
+
+ rdmsrq(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ sd = per_cpu_ptr(&svm_data, me);
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
+
+ wrmsrq(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+		/*
+		 * Set the default value, even if we don't use TSC scaling,
+		 * to avoid leaving a stale value in the MSR.
+		 */
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
+ }
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ u64 len, status = 0;
+ int err;
+
+ err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
+ if (!err)
+ err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);
+
+		if (err) {
+			osvw_status = osvw_len = 0;
+		} else {
+			if (len < osvw_len)
+				osvw_len = len;
+			osvw_status |= status;
+			osvw_status &= (1ULL << osvw_len) - 1;
+		}
+	} else {
+		osvw_status = osvw_len = 0;
+	}
+
+ svm_init_erratum_383();
+
+ amd_pmu_enable_virt();
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+
+ if (!sd->save_area)
+ return;
+
+ kfree(sd->sev_vmcbs);
+ __free_page(__sme_pa_to_page(sd->save_area_pa));
+ sd->save_area_pa = 0;
+ sd->save_area = NULL;
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ struct page *save_area_page;
+ int ret = -ENOMEM;
+
+ memset(sd, 0, sizeof(struct svm_cpu_data));
+ save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
+ if (!save_area_page)
+ return ret;
+
+ ret = sev_cpu_init(sd);
+ if (ret)
+ goto free_save_area;
+
+ sd->save_area = page_address(save_area_page);
+ sd->save_area_pa = __sme_page_pa(save_area_page);
+ return 0;
+
+free_save_area:
+ __free_page(save_area_page);
+ return ret;
+}
+
+static void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+
+ recalc_intercepts(svm);
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ /*
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
+
+ return svm_test_msr_bitmap_write(msrpm, msr);
+}
+
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ void *msrpm = svm->msrpm;
+
+ /* Don't disable interception for MSRs userspace wants to handle. */
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ svm_clear_msr_bitmap_read(msrpm, msr);
+ else
+ svm_set_msr_bitmap_read(msrpm, msr);
+ }
+
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ svm_clear_msr_bitmap_write(msrpm, msr);
+ else
+ svm_set_msr_bitmap_write(msrpm, msr);
+ }
+
+ svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+ svm->nested.force_msr_bitmap_recalc = true;
+}
+
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
+{
+ unsigned int order = get_order(size);
+ struct page *pages = alloc_pages(gfp_mask, order);
+ void *pm;
+
+ if (!pages)
+ return NULL;
+
+ /*
+ * Set all bits in the permissions map so that all MSR and I/O accesses
+ * are intercepted by default.
+ */
+ pm = page_address(pages);
+ memset(pm, 0xff, PAGE_SIZE * (1 << order));
+
+ return pm;
+}
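+
+/*
+ * Convention note for the helper above: in both the MSR and I/O permission
+ * maps a set bit means "intercept", so the 0xff fill yields an
+ * intercept-everything default from which individual MSRs are then opted
+ * out (see the recalc helpers below).
+ */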
+
+static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (intercept == svm->lbr_msrs_intercepted)
+ return;
+
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
+
+ svm->lbr_msrs_intercepted = intercept;
+}
+
+void svm_vcpu_free_msrpm(void *msrpm)
+{
+ __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
+}
+
+static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+
+#ifdef CONFIG_X86_64
+ svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
+#endif
+
+ if (lbrv)
+ svm_recalc_lbr_msr_intercepts(vcpu);
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ /*
+ * Disable interception of SPEC_CTRL if KVM doesn't need to manually
+ * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
+ * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
+ * using SPEC_CTRL.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !guest_has_spec_ctrl_msr(vcpu));
+ else
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !svm->spec_ctrl);
+
+ /*
+ * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
+	 * as AMD hardware only stores 32 bits, whereas Intel CPUs track 64 bits.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled);
+ }
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_recalc_msr_intercepts(vcpu);
+
+ /*
+ * x2APIC intercepts are modified on-demand and cannot be filtered by
+ * userspace.
+ */
+}
+
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+}
+
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ __svm_enable_lbrv(vcpu);
+ svm_recalc_lbr_msr_intercepts(vcpu);
+}
+
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
+{
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
+
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
+
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
+}
+
+void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ if (kvm_pause_in_guest(vcpu->kvm))
+ return;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old) {
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
+}
+
+static void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ avic_hardware_unsetup();
+
+ sev_hardware_unsetup();
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.ctl.tsc_offset;
+}
+
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->tsc_ratio_msr;
+}
+
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+}
+
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ if (to_svm(vcpu)->guest_state_loaded)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ preempt_enable();
+}
+
+/* Evaluate instruction intercepts that depend on guest CPUID features. */
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * Intercept INVPCID if shadow paging is enabled to sync/free shadow
+ * roots, or if INVPCID is disabled in the guest to inject #UD.
+ */
+ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
+ if (!npt_enabled ||
+ !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
+ svm_set_intercept(svm, INTERCEPT_INVPCID);
+ else
+ svm_clr_intercept(svm, INTERCEPT_INVPCID);
+ }
+
+ if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ else
+ svm_set_intercept(svm, INTERCEPT_RDTSCP);
+ }
+
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD/VMSAVE, enable it in the
+ * VMCB and clear the intercepts to avoid #VMEXITs.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ }
+}
+
+static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ svm_recalc_instruction_intercepts(vcpu);
+ svm_recalc_msr_intercepts(vcpu);
+}
+
+static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
+
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR3_READ);
+ svm_set_intercept(svm, INTERCEPT_CR4_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercepts(svm);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest accesses to VMware backdoor ports can legitimately trigger
+ * #GP because of the TSS I/O permission bitmap. Intercept those #GPs
+ * and allow the accesses anyway, as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
+
+ svm_set_intercept(svm, INTERCEPT_INTR);
+ svm_set_intercept(svm, INTERCEPT_NMI);
+
+ if (intercept_smi)
+ svm_set_intercept(svm, INTERCEPT_SMI);
+
+ svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ svm_set_intercept(svm, INTERCEPT_RDPMC);
+ svm_set_intercept(svm, INTERCEPT_CPUID);
+ svm_set_intercept(svm, INTERCEPT_INVD);
+ svm_set_intercept(svm, INTERCEPT_INVLPG);
+ svm_set_intercept(svm, INTERCEPT_INVLPGA);
+ svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
+ svm_set_intercept(svm, INTERCEPT_MSR_PROT);
+ svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
+ svm_set_intercept(svm, INTERCEPT_VMRUN);
+ svm_set_intercept(svm, INTERCEPT_VMMCALL);
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm_set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_CLGI);
+ svm_set_intercept(svm, INTERCEPT_SKINIT);
+ svm_set_intercept(svm, INTERCEPT_WBINVD);
+ svm_set_intercept(svm, INTERCEPT_XSETBV);
+ svm_set_intercept(svm, INTERCEPT_RDPRU);
+ svm_set_intercept(svm, INTERCEPT_RSM);
+
+ if (!kvm_mwait_in_guest(vcpu->kvm)) {
+ svm_set_intercept(svm, INTERCEPT_MONITOR);
+ svm_set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (!kvm_hlt_in_guest(vcpu->kvm)) {
+ if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
+ svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
+ else
+ svm_set_intercept(svm, INTERCEPT_HLT);
+ }
+
+ control->iopm_base_pa = iopm_base;
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ save->cs.base = 0xffff0000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+
+ save->gdtr.base = 0;
+ save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ svm_clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ svm_clr_intercept(svm, INTERCEPT_CR3_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = vcpu->arch.pat;
+ save->cr3 = 0;
+ }
+ svm->current_vmcb->asid_generation = 0;
+ svm->asid = 0;
+
+ svm->nested.vmcb12_gpa = INVALID_GPA;
+ svm->nested.last_vmcb12_gpa = INVALID_GPA;
+
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ svm_set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ svm_clr_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_init_vmcb(svm, vmcb);
+
+ if (vnmi)
+ svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
+ if (vgif) {
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ svm_clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
+ if (vcpu->kvm->arch.bus_lock_detection_enabled)
+ svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+
+ if (sev_guest(vcpu->kvm))
+ sev_init_vmcb(svm, init_event);
+
+ svm_hv_init_vmcb(vmcb);
+
+ kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);
+
+ vmcb_mark_all_dirty(vmcb);
+
+ enable_gif(svm);
+}
+
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm_init_osvw(vcpu);
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+
+ svm->nmi_masked = false;
+ svm->awaiting_iret_completion = false;
+}
+
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
+
+ init_vmcb(vcpu, init_event);
+
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
+}
+
+void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
+{
+ svm->current_vmcb = target_vmcb;
+ svm->vmcb = target_vmcb->ptr;
+}
+
+static int svm_vcpu_precreate(struct kvm *kvm)
+{
+ return avic_alloc_physical_id_table(kvm);
+}
+
+static int svm_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm;
+ struct page *vmcb01_page;
+ int err;
+
+ BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
+ svm = to_svm(vcpu);
+
+ err = -ENOMEM;
+ vmcb01_page = snp_safe_alloc_page();
+ if (!vmcb01_page)
+ goto out;
+
+ err = sev_vcpu_create(vcpu);
+ if (err)
+ goto error_free_vmcb_page;
+
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto error_free_sev;
+
+ svm->msrpm = svm_vcpu_alloc_msrpm();
+ if (!svm->msrpm) {
+ err = -ENOMEM;
+ goto error_free_sev;
+ }
+
+ svm->x2avic_msrs_intercepted = true;
+ svm->lbr_msrs_intercepted = true;
+
+ svm->vmcb01.ptr = page_address(vmcb01_page);
+ svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
+ svm->guest_state_loaded = false;
+
+ return 0;
+
+error_free_sev:
+ sev_free_vcpu(vcpu);
+error_free_vmcb_page:
+ __free_page(vmcb01_page);
+out:
+ return err;
+}
+
+static void svm_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
+
+ svm_leave_nested(vcpu);
+ svm_free_nested(svm);
+
+ sev_free_vcpu(vcpu);
+
+ __free_page(__sme_pa_to_page(svm->vmcb01.pa));
+ svm_vcpu_free_msrpm(svm->msrpm);
+}
+
+#ifdef CONFIG_CPU_MITIGATIONS
+static DEFINE_SPINLOCK(srso_lock);
+static atomic_t srso_nr_vms;
+
+static void svm_srso_clear_bp_spec_reduce(void *ign)
+{
+ struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);
+
+ if (!sd->bp_spec_reduce_set)
+ return;
+
+ msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ sd->bp_spec_reduce_set = false;
+}
+
+static void svm_srso_vm_destroy(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ if (atomic_dec_return(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ /*
+ * Re-check now that the lock is held: a new VM may have come along,
+ * acquired the lock, and incremented the count before this task
+ * acquired the lock.
+ */
+ if (atomic_read(&srso_nr_vms))
+ return;
+
+ on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
+}
+
+static void svm_srso_vm_init(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
+ return;
+
+ /*
+ * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
+ * transition, i.e. destroying the last VM, is fully complete, e.g. so
+ * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
+ */
+ if (atomic_inc_not_zero(&srso_nr_vms))
+ return;
+
+ guard(spinlock)(&srso_lock);
+
+ atomic_inc(&srso_nr_vms);
+}
+#else
+static void svm_srso_vm_init(void) { }
+static void svm_srso_vm_destroy(void) { }
+#endif
+
+static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_unmap_ghcb(svm);
+
+ if (svm->guest_state_loaded)
+ return;
+
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+ * or subsequent vmload of host save area.
+ */
+ vmsave(sd->save_area_pa);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));
+
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+
+ /*
+ * TSC_AUX is always virtualized (context switched by hardware) for
+ * SEV-ES guests when the feature is available. For non-SEV-ES guests,
+ * context switch TSC_AUX via the user_return MSR infrastructure (not
+ * all CPUs support TSC_AUX virtualization).
+ */
+ if (likely(tsc_aux_uret_slot >= 0) &&
+ (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
+ kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
+
+ if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
+ !sd->bp_spec_reduce_set) {
+ sd->bp_spec_reduce_set = true;
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
+ }
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ to_svm(vcpu)->guest_state_loaded = false;
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
+ svm_prepare_host_switch(vcpu);
+
+ ++vcpu->stat.host_state_reload;
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ /*
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
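+/*
+ * For SEV-ES guests, RFLAGS lives in the encrypted VMSA, so interrupt
+ * blocking must be read from the GUEST_INTERRUPT_MASK bit that hardware
+ * reflects into int_state instead of from RFLAGS.IF.
+ */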
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ kvm_register_mark_available(vcpu, reg);
+
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already available since
+ * it is always updated per SDM when moving to CRs.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control;
+
+ /*
+ * The following fields are ignored when AVIC is enabled
+ */
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
+
+ svm_set_intercept(svm, INTERCEPT_VINTR);
+
+ /*
+ * Recalculating intercepts may have cleared the VINTR intercept. If
+ * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+ * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+ * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+ * interrupts will never be unblocked while L2 is running.
+ */
+ if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+ return;
+
+ /*
+ * This is just a dummy VINTR to actually cause a vmexit to happen.
+ * Actual injection of virtual interrupts happens through EVENTINJ.
+ */
+ control = &svm->vmcb->control;
+ control->int_vector = 0x0;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ svm_clr_intercept(svm, INTERCEPT_VINTR);
+
+ /* Drop int_ctl fields related to VINTR injection. */
+ svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+ if (is_guest_mode(&svm->vcpu)) {
+ svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
+
+ WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
+ (svm->nested.ctl.int_ctl & V_TPR_MASK));
+
+ svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
+ V_IRQ_INJECTION_BITS_MASK;
+
+ svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
+ }
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+ struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save01->fs;
+ case VCPU_SREG_GS: return &save01->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save01->tr;
+ case VCPU_SREG_LDTR: return &save01->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments; this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+ * for cross-vendor migration purposes by treating "not present" as
+ * unusable.
+ */
+ var->unusable = !var->present;
+
+ switch (seg) {
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment descriptor
+ * cache: although it can be cleared in the descriptor itself, the
+ * cached bit always remains 1. Since Intel checks this on VM-entry,
+ * set it here to support cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ /* This is symmetric with svm_set_segment() */
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
+ */
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ }
+}
+
+static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ return true;
+}
+
+void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled) {
+ hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ /*
+ * Re-enable caching here because the QEMU BIOS does not do it;
+ * leaving caching disabled results in a noticeable delay at reboot.
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+
+ svm->vmcb->save.cr0 = hcr0;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+
+ /*
+ * SEV-ES guests must always keep the CR intercepts cleared. CR
+ * tracking is done using the CR write traps.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
+ if (hcr0 == cr0) {
+ /* Selective CR0 write remains on. */
+ svm_clr_intercept(svm, INTERCEPT_CR0_READ);
+ svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ svm_set_intercept(svm, INTERCEPT_CR0_READ);
+ svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return true;
+}
+
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = vcpu->arch.cr4;
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled) {
+ cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
+
+ vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ }
+}
+
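+/*
+ * Hand out the next hardware ASID; when the pool is exhausted, start a
+ * new generation at min_asid and flush all ASIDs to avoid reusing stale
+ * TLB entries.
+ */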
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = sd->min_asid;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
+ }
+
+ svm->current_vmcb->asid_generation = sd->asid_generation;
+ svm->asid = sd->next_asid++;
+}
+
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (unlikely(value != vmcb->save.dr6)) {
+ vmcb->save.dr6 = value;
+ vmcb_mark_dirty(vmcb, VMCB_DR);
+ }
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
+ return;
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ /*
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
+ * because db_interception might need it. We can do it before vmentry.
+ */
+ vcpu->arch.dr6 = svm->vmcb->save.dr6;
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ svm->vmcb->save.dr7 = value;
+ vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_page_fault(vcpu, error_code, fault_address,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int rc;
+
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ /*
+ * WARN if hardware generates a fault with an error code that collides
+ * with KVM-defined synthetic flags. Clear the flags and continue on,
+ * i.e. don't terminate the VM, as KVM can't possibly be relying on a
+ * flag that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
+ error_code &= ~PFERR_SYNTHETIC_MASK;
+
+ if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
+ rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+
+ if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
+ sev_handle_rmp_fault(vcpu, fault_address, error_code);
+
+ return rc;
+}
+
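+/*
+ * #DB handling: reflect the exception into the guest if neither userspace
+ * debugging nor NMI single-stepping is active; otherwise finish NMI
+ * single-stepping and/or exit to userspace with KVM_EXIT_DEBUG.
+ */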
+static int db_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
+ kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
+ kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct kvm_vcpu *vcpu)
+{
+ return handle_ud(vcpu);
+}
+
+static int ac_interception(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value))
+ return false;
+
+ /* Bit 62 may or may not be set for this mce */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0);
+
+ if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) {
+ value &= ~(1ULL << 2);
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, value);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ kvm_machine_check();
+}
+
+static int mc_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+ * the VMCB in a known good state. Unfortunately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+ * userspace. From a platform perspective, INIT is acceptable behavior,
+ * as there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
+ *
+ * The VM save area for SEV-ES guests has already been encrypted so it
+ * cannot be reinitialized, i.e. synthesizing INIT is futile.
+ */
+ if (!sev_es_guest(vcpu->kvm)) {
+ clear_page(svm->vmcb);
+#ifdef CONFIG_KVM_SMM
+ if (is_smm(vcpu))
+ kvm_smm_changed(vcpu, false);
+#endif
+ kvm_vcpu_reset(vcpu, true);
+ }
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
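+/*
+ * IN/OUT #VMEXIT: decode port, size, direction, and string-ness from
+ * exit_info_1. String I/O is emulated (via the GHCB buffer for SEV-ES);
+ * everything else goes through the fast PIO path.
+ */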
+static int io_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++vcpu->stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+
+ if (string) {
+ if (sev_es_guest(vcpu->kvm))
+ return sev_es_string_io(svm, size, port, in);
+ else
+ return kvm_emulate_instruction(vcpu, 0);
+ }
+
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static int nmi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int intr_interception(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb12;
+ struct kvm_host_map map;
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmcb12 = map.hva;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (vmload) {
+ svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
+ svm->sysenter_eip_hi = 0;
+ svm->sysenter_esp_hi = 0;
+ } else {
+ svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+ }
+
+ kvm_vcpu_unmap(vcpu, &map);
+
+ return ret;
+}
+
+static int vmload_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, true);
+}
+
+static int vmsave_interception(struct kvm_vcpu *vcpu)
+{
+ return vmload_vmsave_interception(vcpu, false);
+}
+
+static int vmrun_interception(struct kvm_vcpu *vcpu)
+{
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ return nested_svm_vmrun(vcpu);
+}
+
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
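+/*
+ * If L2 executed the SVM instruction, synthesize the corresponding nested
+ * #VMEXIT to L1; otherwise emulate the instruction on L1's behalf.
+ */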
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* Returns '1' or -errno on failure, '0' on success. */
+ ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
+ if (ret)
+ return ret;
+ return 1;
+ }
+ return svm_instr_handlers[opcode](vcpu);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following
+ * two cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP
+ * on some AMD CPUs when the EAX operand of these instructions points
+ * into reserved memory regions (e.g. SMM memory on the host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
+ return emulate_svm_instr(vcpu, opcode);
+ }
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
+void svm_set_gif(struct vcpu_svm *svm, bool value)
+{
+ if (value) {
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the SMI/NMI window; remove it now.
+ * Likewise, clear the VINTR intercept, we will set it
+ * again while processing KVM_REQ_EVENT if needed.
+ */
+ if (vgif)
+ svm_clr_intercept(svm, INTERCEPT_STGI);
+ if (svm_is_intercept(svm, INTERCEPT_VINTR))
+ svm_clear_vintr(svm);
+
+ enable_gif(svm);
+ if (svm->vcpu.arch.smi_pending ||
+ svm->vcpu.arch.nmi_pending ||
+ kvm_cpu_has_injectable_intr(&svm->vcpu) ||
+ kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ } else {
+ disable_gif(svm);
+
+ /*
+ * After a CLGI no interrupts should come. But if vGIF is
+ * in use, we still rely on the VINTR intercept (rather than
+ * STGI) to detect an open interrupt window.
+ */
+ if (!vgif)
+ svm_clear_vintr(svm);
+ }
+}
+
+static int stgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), true);
+ return ret;
+}
+
+static int clgi_interception(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(vcpu))
+ return 1;
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+ svm_set_gif(to_svm(vcpu), false);
+ return ret;
+}
+
+static int invlpga_interception(struct kvm_vcpu *vcpu)
+{
+ gva_t gva = kvm_rax_read(vcpu);
+ u32 asid = kvm_rcx_read(vcpu);
+
+ /* FIXME: Handle an address size prefix. */
+ if (!is_long_mode(vcpu))
+ gva = (u32)gva;
+
+ trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, gva);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int skinit_interception(struct kvm_vcpu *vcpu)
+{
+ trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int task_switch_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ vcpu->arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ case SVM_EXITINTINFO_TYPE_SOFT:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+ if (!svm_skip_emulated_instruction(vcpu))
+ return 0;
+ }
+
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code);
+}
+
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
+static int iret_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+
+ ++vcpu->stat.nmi_window_exits;
+ svm->awaiting_iret_completion = true;
+
+ svm_clr_iret_intercept(svm);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 1;
+}
+
+static int invlpg_interception(struct kvm_vcpu *vcpu)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int emulate_on_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, 0);
+}
+
+static int rsm_interception(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
+}
+
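+/*
+ * Check whether a CR0 write changes bits other than TS/MP while L1 has
+ * the selective CR0 intercept enabled; if so, the write is reflected to
+ * L1 as SVM_EXIT_CR0_SEL_WRITE.
+ */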
+static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long cr0 = vcpu->arch.cr0;
+ bool ret = false;
+
+ if (!is_guest_mode(vcpu) ||
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
+#define CR_VALID (1ULL << 63)
+
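+/*
+ * CR access #VMEXITs are numbered SVM_EXIT_READ_CR0..CR15 followed by
+ * SVM_EXIT_WRITE_CR0..CR15, so an index >= 16 after subtracting
+ * SVM_EXIT_READ_CR0 denotes a write. The accessed GPR is provided by
+ * decode assists in exit_info_1 when CR_VALID is set.
+ */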
+static int cr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ if (!check_selective_cr0_intercepted(vcpu, val))
+ err = kvm_set_cr0(vcpu, val);
+ else
+ return 1;
+
+ break;
+ case 3:
+ err = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ val = vcpu->arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ }
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr_trap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long old_value, new_value;
+ unsigned int cr;
+ int ret = 0;
+
+ new_value = (unsigned long)svm->vmcb->control.exit_info_1;
+
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
+ switch (cr) {
+ case 0:
+ old_value = kvm_read_cr0(vcpu);
+ svm_set_cr0(vcpu, new_value);
+
+ kvm_post_set_cr0(vcpu, old_value, new_value);
+ break;
+ case 4:
+ old_value = kvm_read_cr4(vcpu);
+ svm_set_cr4(vcpu, new_value);
+
+ kvm_post_set_cr4(vcpu, old_value, new_value);
+ break;
+ case 8:
+ ret = kvm_set_cr8(vcpu, new_value);
+ break;
+ default:
+ WARN(1, "unhandled CR%d write trap", cr);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int dr_interception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int reg, dr;
+ int err = 0;
+
+ /*
+ * SEV-ES intercepts DR7 only to disable guest debugging, and the guest
+ * issues a VMGEXIT for DR7 writes only. KVM cannot change DR7 (it is
+ * always swapped as type 'A'), so return early.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return 1;
+
+ if (vcpu->guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(vcpu);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
+ } else {
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
+ }
+
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int cr8_write_interception(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(vcpu);
+ if (lapic_in_kernel(vcpu))
+ return r;
+ if (cr8_prev <= kvm_get_cr8(vcpu))
+ return r;
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int efer_trap(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr_info;
+ int ret;
+
+ /*
+ * Clear the EFER_SVME bit from EFER. The SVM code always sets this
+ * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
+ * whether the guest has X86_FEATURE_SVM - this avoids a failure if
+ * the guest doesn't have X86_FEATURE_SVM.
+ */
+ msr_info.host_initiated = false;
+ msr_info.index = MSR_EFER;
+ msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
+ ret = kvm_set_msr_common(vcpu, &msr_info);
+
+ return kvm_complete_insn_gp(vcpu, ret);
+}
+
+static int svm_get_feature_msr(u32 msr, u64 *data)
+{
+ *data = 0;
+
+ switch (msr) {
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ break;
+ default:
+ return KVM_MSR_RET_UNSUPPORTED;
+ }
+
+ return 0;
+}
+
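+/*
+ * For SEV-ES guests with protected state, MSRs that are passed through to
+ * the guest live in the encrypted VMSA and thus can't be accessed by KVM;
+ * MSR_IA32_XSS is the one exception.
+ */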
+static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
+{
+ return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected &&
+ msr_info->index != MSR_IA32_XSS &&
+ !msr_write_intercepted(vcpu, msr_info->index);
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (sev_es_prevent_msr_access(vcpu, msr_info)) {
+ msr_info->data = 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+ }
+
+ switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
+ case MSR_STAR:
+ msr_info->data = svm->vmcb01.ptr->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.lstar;
+ break;
+ case MSR_CSTAR:
+ msr_info->data = svm->vmcb01.ptr->save.cstar;
+ break;
+ case MSR_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.gs.base;
+ break;
+ case MSR_FS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.fs.base;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ msr_info->data = svm->vmcb01.ptr->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
+ if (guest_cpuid_is_intel_compatible(vcpu))
+ msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
+ break;
+ case MSR_IA32_S_CET:
+ msr_info->data = svm->vmcb->save.s_cet;
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ msr_info->data = svm->vmcb->save.isst_addr;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ msr_info->data = svm->vmcb->save.ssp;
+ break;
+ case MSR_TSC_AUX:
+ msr_info->data = svm->tsc_aux;
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ msr_info->data = svm->vmcb->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ msr_info->data = svm->vmcb->save.spec_ctrl;
+ else
+ msr_info->data = svm->spec_ctrl;
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
+ case MSR_F15H_IC_CFG: {
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
+ case MSR_AMD64_DE_CFG:
+ msr_info->data = svm->msr_decfg;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+ return 0;
+}
+
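+/*
+ * For SEV-ES guests, a failed MSR access can't be completed with a normal
+ * injected #GP; propagate the fault to the guest via the GHCB instead.
+ */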
+static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
+ return kvm_complete_insn_gp(vcpu, err);
+
+ svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
+ return 1;
+}
+
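+/*
+ * Emulate writes to the VM_CR MSR: once SVMDIS is set, the LOCK and DIS
+ * bits become read-only, and SVM can't be disabled while EFER.SVME=1.
+ */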
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret = 0;
+
+ u32 ecx = msr->index;
+ u64 data = msr->data;
+
+ if (sev_es_prevent_msr_access(vcpu, msr))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
+ switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * If TSC scaling is not enabled, always leave this MSR at its
+ * default value.
+ *
+ * Due to a bug, QEMU 6.2.0 tries to set this MSR to 0 even when
+ * TSC scaling is not enabled; ignore that value as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
+
+ if (data & SVM_TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
+ case MSR_IA32_CR_PAT:
+ ret = kvm_set_msr_common(vcpu, msr);
+ if (ret)
+ break;
+
+ svm->vmcb01.ptr->save.g_pat = data;
+ if (is_guest_mode(vcpu))
+ nested_vmcb02_compute_g_pat(svm);
+ vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ if (kvm_spec_ctrl_test_value(data))
+ return 1;
+
+ if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
+ svm->vmcb->save.spec_ctrl = data;
+ else
+ svm->spec_ctrl = data;
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested: